path: root/drivers/md
author    Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org>  2005-04-16 18:20:36 -0400
commit    1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree      0bba044c4ce775e45a88a51686b5d9f90697ea9d /drivers/md
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 240
-rw-r--r--  drivers/md/Makefile | 107
-rw-r--r--  drivers/md/dm-bio-list.h | 68
-rw-r--r--  drivers/md/dm-bio-record.h | 45
-rw-r--r--  drivers/md/dm-crypt.c | 977
-rw-r--r--  drivers/md/dm-emc.c | 359
-rw-r--r--  drivers/md/dm-exception-store.c | 648
-rw-r--r--  drivers/md/dm-hw-handler.c | 216
-rw-r--r--  drivers/md/dm-hw-handler.h | 61
-rw-r--r--  drivers/md/dm-io.c | 426
-rw-r--r--  drivers/md/dm-io.h | 77
-rw-r--r--  drivers/md/dm-ioctl.c | 1416
-rw-r--r--  drivers/md/dm-linear.c | 123
-rw-r--r--  drivers/md/dm-log.c | 711
-rw-r--r--  drivers/md/dm-log.h | 130
-rw-r--r--  drivers/md/dm-mpath.c | 1302
-rw-r--r--  drivers/md/dm-mpath.h | 25
-rw-r--r--  drivers/md/dm-path-selector.c | 156
-rw-r--r--  drivers/md/dm-path-selector.h | 93
-rw-r--r--  drivers/md/dm-raid1.c | 1269
-rw-r--r--  drivers/md/dm-round-robin.c | 214
-rw-r--r--  drivers/md/dm-snap.c | 1208
-rw-r--r--  drivers/md/dm-snap.h | 161
-rw-r--r--  drivers/md/dm-stripe.c | 234
-rw-r--r--  drivers/md/dm-table.c | 950
-rw-r--r--  drivers/md/dm-target.c | 196
-rw-r--r--  drivers/md/dm-zero.c | 81
-rw-r--r--  drivers/md/dm.c | 1194
-rw-r--r--  drivers/md/dm.h | 195
-rw-r--r--  drivers/md/faulty.c | 343
-rw-r--r--  drivers/md/kcopyd.c | 687
-rw-r--r--  drivers/md/kcopyd.h | 42
-rw-r--r--  drivers/md/linear.c | 343
-rw-r--r--  drivers/md/md.c | 3766
-rw-r--r--  drivers/md/mktables.c | 125
-rw-r--r--  drivers/md/multipath.c | 584
-rw-r--r--  drivers/md/raid0.c | 539
-rw-r--r--  drivers/md/raid1.c | 1449
-rw-r--r--  drivers/md/raid10.c | 1787
-rw-r--r--  drivers/md/raid5.c | 1965
-rw-r--r--  drivers/md/raid6.h | 135
-rw-r--r--  drivers/md/raid6algos.c | 153
-rw-r--r--  drivers/md/raid6altivec.uc | 122
-rw-r--r--  drivers/md/raid6int.uc | 117
-rw-r--r--  drivers/md/raid6main.c | 2136
-rw-r--r--  drivers/md/raid6mmx.c | 150
-rw-r--r--  drivers/md/raid6recov.c | 133
-rw-r--r--  drivers/md/raid6sse1.c | 171
-rw-r--r--  drivers/md/raid6sse2.c | 270
-rw-r--r--  drivers/md/raid6test/Makefile | 58
-rw-r--r--  drivers/md/raid6test/test.c | 103
-rw-r--r--  drivers/md/raid6x86.h | 245
-rw-r--r--  drivers/md/unroll.pl | 24
-rw-r--r--  drivers/md/xor.c | 154
54 files changed, 28483 insertions, 0 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
new file mode 100644
index 000000000000..ac43f98062fd
--- /dev/null
+++ b/drivers/md/Kconfig
@@ -0,0 +1,240 @@
1#
2# Block device driver configuration
3#
4
5menu "Multi-device support (RAID and LVM)"
6
7config MD
8 bool "Multiple devices driver support (RAID and LVM)"
9 help
10 Support multiple physical spindles through a single logical device.
11 Required for RAID and logical volume management.
12
13config BLK_DEV_MD
14 tristate "RAID support"
15 depends on MD
16 ---help---
17 This driver lets you combine several hard disk partitions into one
18 logical block device. This can be used to simply append one
19 partition to another one or to combine several redundant hard disks
20 into a RAID1/4/5 device so as to provide protection against hard
21 disk failures. This is called "Software RAID" since the combining of
22 the partitions is done by the kernel. "Hardware RAID" means that the
23 combining is done by a dedicated controller; if you have such a
24 controller, you do not need to say Y here.
25
26 More information about Software RAID on Linux is contained in the
27 Software RAID mini-HOWTO, available from
28 <http://www.tldp.org/docs.html#howto>. There you will also learn
29 where to get the supporting user space utilities raidtools.
30
31 If unsure, say N.
32
33config MD_LINEAR
34 tristate "Linear (append) mode"
35 depends on BLK_DEV_MD
36 ---help---
37 If you say Y here, then your multiple devices driver will be able to
38 use the so-called linear mode, i.e. it will combine the hard disk
39 partitions by simply appending one to the other.
40
41 To compile this as a module, choose M here: the module
42 will be called linear.
43
44 If unsure, say Y.
45
46config MD_RAID0
47 tristate "RAID-0 (striping) mode"
48 depends on BLK_DEV_MD
49 ---help---
50 If you say Y here, then your multiple devices driver will be able to
51 use the so-called raid0 mode, i.e. it will combine the hard disk
52 partitions into one logical device in such a fashion as to fill them
53 up evenly, one chunk here and one chunk there. This will increase
54 the throughput rate if the partitions reside on distinct disks.
55
56 Information about Software RAID on Linux is contained in the
57 Software-RAID mini-HOWTO, available from
58 <http://www.tldp.org/docs.html#howto>. There you will also
59 learn where to get the supporting user space utilities raidtools.
60
61 To compile this as a module, choose M here: the module
62 will be called raid0.
63
64 If unsure, say Y.
65
66config MD_RAID1
67 tristate "RAID-1 (mirroring) mode"
68 depends on BLK_DEV_MD
69 ---help---
70 A RAID-1 set consists of several disk drives which are exact copies
71 of each other. In the event of a mirror failure, the RAID driver
72 will continue to use the operational mirrors in the set, providing
73 an error free MD (multiple device) to the higher levels of the
74 kernel. In a set with N drives, the available space is the capacity
75 of a single drive, and the set protects against a failure of (N - 1)
76 drives.
77
78 Information about Software RAID on Linux is contained in the
79 Software-RAID mini-HOWTO, available from
80 <http://www.tldp.org/docs.html#howto>. There you will also
81 learn where to get the supporting user space utilities raidtools.
82
83 If you want to use such a RAID-1 set, say Y. To compile this code
84 as a module, choose M here: the module will be called raid1.
85
86 If unsure, say Y.
87
88config MD_RAID10
89 tristate "RAID-10 (mirrored striping) mode (EXPERIMENTAL)"
90 depends on BLK_DEV_MD && EXPERIMENTAL
91 ---help---
92 RAID-10 provides a combination of striping (RAID-0) and
93 mirroring (RAID-1) with easier configuration and more flexible
94 layout.
95 Unlike RAID-0, but like RAID-1, RAID-10 requires all devices to
96 be the same size (or, at least, only as much space as the smallest
97 device provides will be used).
98 RAID-10 provides a variety of layouts that provide different levels
99 of redundancy and performance.
100
101 RAID-10 requires mdadm-1.7.0 or later, available at:
102
103 ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/
104
105 If unsure, say Y.
106
107config MD_RAID5
108 tristate "RAID-4/RAID-5 mode"
109 depends on BLK_DEV_MD
110 ---help---
111 A RAID-5 set of N drives with a capacity of C MB per drive provides
112 the capacity of C * (N - 1) MB, and protects against a failure
113 of a single drive. For a given sector (row) number, (N - 1) drives
114 contain data sectors, and one drive contains the parity protection.
115 For a RAID-4 set, the parity blocks are present on a single drive,
116 while a RAID-5 set distributes the parity across the drives in one
117 of the available parity distribution methods.
118
119 Information about Software RAID on Linux is contained in the
120 Software-RAID mini-HOWTO, available from
121 <http://www.tldp.org/docs.html#howto>. There you will also
122 learn where to get the supporting user space utilities raidtools.
123
124 If you want to use such a RAID-4/RAID-5 set, say Y. To
125 compile this code as a module, choose M here: the module
126 will be called raid5.
127
128 If unsure, say Y.
129
130config MD_RAID6
131 tristate "RAID-6 mode"
132 depends on BLK_DEV_MD
133 ---help---
134 A RAID-6 set of N drives with a capacity of C MB per drive
135 provides the capacity of C * (N - 2) MB, and protects
136 against a failure of any two drives. For a given sector
137 (row) number, (N - 2) drives contain data sectors, and two
138 drives contain two independent redundancy syndromes. Like
139 RAID-5, RAID-6 distributes the syndromes across the drives
140 in one of the available parity distribution methods.
141
142 RAID-6 requires mdadm-1.5.0 or later, available at:
143
144 ftp://ftp.kernel.org/pub/linux/utils/raid/mdadm/
145
146 If you want to use such a RAID-6 set, say Y. To compile
147 this code as a module, choose M here: the module will be
148 called raid6.
149
150 If unsure, say Y.
151
152config MD_MULTIPATH
153 tristate "Multipath I/O support"
154 depends on BLK_DEV_MD
155 help
156 Multipath-IO is the ability of certain devices to address the same
157 physical disk over multiple 'IO paths'. The code ensures that such
158 paths can be defined and handled at runtime, and ensures that a
159 transparent failover to the backup path(s) happens if an IO error
160 arrives on the primary path.
161
162 If unsure, say N.
163
164config MD_FAULTY
165 tristate "Faulty test module for MD"
166 depends on BLK_DEV_MD
167 help
168 The "faulty" module allows for a block device that occasionally returns
169 read or write errors. It is useful for testing.
170
171 If unsure, say N.
172
173config BLK_DEV_DM
174 tristate "Device mapper support"
175 depends on MD
176 ---help---
177 Device-mapper is a low level volume manager. It works by allowing
178 people to specify mappings for ranges of logical sectors. Various
179 mapping types are available; in addition, people may write their own
180 modules containing custom mappings if they wish.
181
182 Higher level volume managers such as LVM2 use this driver.
183
184 To compile this as a module, choose M here: the module will be
185 called dm-mod.
186
187 If unsure, say N.
188
189config DM_CRYPT
190 tristate "Crypt target support"
191 depends on BLK_DEV_DM && EXPERIMENTAL
192 select CRYPTO
193 ---help---
194 This device-mapper target allows you to create a device that
195 transparently encrypts the data on it. You'll need to activate
196 the ciphers you're going to use in the cryptoapi configuration.
197
198 Information on how to use dm-crypt can be found on
199
200 <http://www.saout.de/misc/dm-crypt/>
201
202 To compile this code as a module, choose M here: the module will
203 be called dm-crypt.
204
205 If unsure, say N.
206
207config DM_SNAPSHOT
208 tristate "Snapshot target (EXPERIMENTAL)"
209 depends on BLK_DEV_DM && EXPERIMENTAL
210 ---help---
211 Allow volume managers to take writeable snapshots of a device.
212
213config DM_MIRROR
214 tristate "Mirror target (EXPERIMENTAL)"
215 depends on BLK_DEV_DM && EXPERIMENTAL
216 ---help---
217 Allow volume managers to mirror logical volumes, also
218 needed for live data migration tools such as 'pvmove'.
219
220config DM_ZERO
221 tristate "Zero target (EXPERIMENTAL)"
222 depends on BLK_DEV_DM && EXPERIMENTAL
223 ---help---
224 A target that discards writes, and returns all zeroes for
225 reads. Useful in some recovery situations.
226
227config DM_MULTIPATH
228 tristate "Multipath target (EXPERIMENTAL)"
229 depends on BLK_DEV_DM && EXPERIMENTAL
230 ---help---
231 Allow volume managers to support multipath hardware.
232
233config DM_MULTIPATH_EMC
234 tristate "EMC CX/AX multipath support (EXPERIMENTAL)"
235 depends on DM_MULTIPATH && BLK_DEV_DM && EXPERIMENTAL
236 ---help---
237 Multipath support for EMC CX/AX series hardware.
238
239endmenu
240
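A quick worked example of the capacity formulas quoted in the RAID-5 and RAID-6 help texts above (drive count and size are hypothetical, chosen only for illustration):

    6 drives, capacity C = 250 MB each (N = 6)
    RAID-5: usable capacity = C * (N - 1) = 250 * 5 = 1250 MB, survives any single drive failure
    RAID-6: usable capacity = C * (N - 2) = 250 * 4 = 1000 MB, survives any two drive failures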
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
new file mode 100644
index 000000000000..90de9c146a5f
--- /dev/null
+++ b/drivers/md/Makefile
@@ -0,0 +1,107 @@
1#
2# Makefile for the kernel software RAID and LVM drivers.
3#
4
5dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
6 dm-ioctl.o dm-io.o kcopyd.o
7dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o
8dm-snapshot-objs := dm-snap.o dm-exception-store.o
9dm-mirror-objs := dm-log.o dm-raid1.o
10raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \
11 raid6int1.o raid6int2.o raid6int4.o \
12 raid6int8.o raid6int16.o raid6int32.o \
13 raid6altivec1.o raid6altivec2.o raid6altivec4.o \
14 raid6altivec8.o \
15 raid6mmx.o raid6sse1.o raid6sse2.o
16hostprogs-y := mktables
17
18# Note: link order is important. All raid personalities
19# and xor.o must come before md.o, as they each initialise
20# themselves, and md.o may use the personalities when it
21 # auto-initialises.
22
23obj-$(CONFIG_MD_LINEAR) += linear.o
24obj-$(CONFIG_MD_RAID0) += raid0.o
25obj-$(CONFIG_MD_RAID1) += raid1.o
26obj-$(CONFIG_MD_RAID10) += raid10.o
27obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
28obj-$(CONFIG_MD_RAID6) += raid6.o xor.o
29obj-$(CONFIG_MD_MULTIPATH) += multipath.o
30obj-$(CONFIG_MD_FAULTY) += faulty.o
31obj-$(CONFIG_BLK_DEV_MD) += md.o
32obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
33obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
34obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
35obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o
36obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
37obj-$(CONFIG_DM_MIRROR) += dm-mirror.o
38obj-$(CONFIG_DM_ZERO) += dm-zero.o
39
40quiet_cmd_unroll = UNROLL $@
41 cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
42 < $< > $@ || ( rm -f $@ && exit 1 )
43
44ifeq ($(CONFIG_ALTIVEC),y)
45altivec_flags := -maltivec -mabi=altivec
46endif
47
48targets += raid6int1.c
49$(obj)/raid6int1.c: UNROLL := 1
50$(obj)/raid6int1.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE
51 $(call if_changed,unroll)
52
53targets += raid6int2.c
54$(obj)/raid6int2.c: UNROLL := 2
55$(obj)/raid6int2.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE
56 $(call if_changed,unroll)
57
58targets += raid6int4.c
59$(obj)/raid6int4.c: UNROLL := 4
60$(obj)/raid6int4.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE
61 $(call if_changed,unroll)
62
63targets += raid6int8.c
64$(obj)/raid6int8.c: UNROLL := 8
65$(obj)/raid6int8.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE
66 $(call if_changed,unroll)
67
68targets += raid6int16.c
69$(obj)/raid6int16.c: UNROLL := 16
70$(obj)/raid6int16.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE
71 $(call if_changed,unroll)
72
73targets += raid6int32.c
74$(obj)/raid6int32.c: UNROLL := 32
75$(obj)/raid6int32.c: $(src)/raid6int.uc $(src)/unroll.pl FORCE
76 $(call if_changed,unroll)
77
78CFLAGS_raid6altivec1.o += $(altivec_flags)
79targets += raid6altivec1.c
80$(obj)/raid6altivec1.c: UNROLL := 1
81$(obj)/raid6altivec1.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
82 $(call if_changed,unroll)
83
84CFLAGS_raid6altivec2.o += $(altivec_flags)
85targets += raid6altivec2.c
86$(obj)/raid6altivec2.c: UNROLL := 2
87$(obj)/raid6altivec2.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
88 $(call if_changed,unroll)
89
90CFLAGS_raid6altivec4.o += $(altivec_flags)
91targets += raid6altivec4.c
92$(obj)/raid6altivec4.c: UNROLL := 4
93$(obj)/raid6altivec4.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
94 $(call if_changed,unroll)
95
96CFLAGS_raid6altivec8.o += $(altivec_flags)
97targets += raid6altivec8.c
98$(obj)/raid6altivec8.c: UNROLL := 8
99$(obj)/raid6altivec8.c: $(src)/raid6altivec.uc $(src)/unroll.pl FORCE
100 $(call if_changed,unroll)
101
102quiet_cmd_mktable = TABLE $@
103 cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
104
105targets += raid6tables.c
106$(obj)/raid6tables.c: $(obj)/mktables FORCE
107 $(call if_changed,mktable)
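For reference, the cmd_unroll rule above simply pipes each .uc template through unroll.pl with the per-target UNROLL factor. For the raid6int4.c target it expands to roughly the following command (the $(srctree)/$(src) path prefixes are dropped here purely for readability):

    perl unroll.pl 4 < raid6int.uc > raid6int4.c || ( rm -f raid6int4.c && exit 1 )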
diff --git a/drivers/md/dm-bio-list.h b/drivers/md/dm-bio-list.h
new file mode 100644
index 000000000000..bc021e1fd4d1
--- /dev/null
+++ b/drivers/md/dm-bio-list.h
@@ -0,0 +1,68 @@
1/*
2 * Copyright (C) 2004 Red Hat UK Ltd.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_BIO_LIST_H
8#define DM_BIO_LIST_H
9
10#include <linux/bio.h>
11
12struct bio_list {
13 struct bio *head;
14 struct bio *tail;
15};
16
17static inline void bio_list_init(struct bio_list *bl)
18{
19 bl->head = bl->tail = NULL;
20}
21
22static inline void bio_list_add(struct bio_list *bl, struct bio *bio)
23{
24 bio->bi_next = NULL;
25
26 if (bl->tail)
27 bl->tail->bi_next = bio;
28 else
29 bl->head = bio;
30
31 bl->tail = bio;
32}
33
34static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
35{
36 if (bl->tail)
37 bl->tail->bi_next = bl2->head;
38 else
39 bl->head = bl2->head;
40
41 bl->tail = bl2->tail;
42}
43
44static inline struct bio *bio_list_pop(struct bio_list *bl)
45{
46 struct bio *bio = bl->head;
47
48 if (bio) {
49 bl->head = bl->head->bi_next;
50 if (!bl->head)
51 bl->tail = NULL;
52
53 bio->bi_next = NULL;
54 }
55
56 return bio;
57}
58
59static inline struct bio *bio_list_get(struct bio_list *bl)
60{
61 struct bio *bio = bl->head;
62
63 bl->head = bl->tail = NULL;
64
65 return bio;
66}
67
68#endif
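The bio_list helpers above implement a simple head/tail singly linked list of bios. A minimal sketch of how a device-mapper target might use it to defer bios and replay them later; function and variable names are illustrative only, not part of this commit, and locking is omitted:

    #include <linux/bio.h>
    #include <linux/blkdev.h>
    #include "dm-bio-list.h"

    static struct bio_list deferred;        /* bio_list_init(&deferred) runs once at setup */

    static void defer_bio(struct bio *bio)
    {
            bio_list_add(&deferred, bio);   /* append at the tail */
    }

    static void flush_deferred_bios(void)
    {
            struct bio *bio;

            /* pop from the head until the list is empty, resubmitting each bio */
            while ((bio = bio_list_pop(&deferred)))
                    generic_make_request(bio);
    }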
diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h
new file mode 100644
index 000000000000..d3ec217847d6
--- /dev/null
+++ b/drivers/md/dm-bio-record.h
@@ -0,0 +1,45 @@
1/*
2 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef DM_BIO_RECORD_H
8#define DM_BIO_RECORD_H
9
10#include <linux/bio.h>
11
12/*
13 * There are lots of mutable fields in the bio struct that get
14 * changed by the lower levels of the block layer. Some targets,
15 * such as multipath, may wish to resubmit a bio on error. The
16 * functions in this file help the target record and restore the
17 * original bio state.
18 */
19struct dm_bio_details {
20 sector_t bi_sector;
21 struct block_device *bi_bdev;
22 unsigned int bi_size;
23 unsigned short bi_idx;
24 unsigned long bi_flags;
25};
26
27static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio)
28{
29 bd->bi_sector = bio->bi_sector;
30 bd->bi_bdev = bio->bi_bdev;
31 bd->bi_size = bio->bi_size;
32 bd->bi_idx = bio->bi_idx;
33 bd->bi_flags = bio->bi_flags;
34}
35
36static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio)
37{
38 bio->bi_sector = bd->bi_sector;
39 bio->bi_bdev = bd->bi_bdev;
40 bio->bi_size = bd->bi_size;
41 bio->bi_idx = bd->bi_idx;
42 bio->bi_flags = bd->bi_flags;
43}
44
45#endif
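As the comment in dm-bio-record.h explains, a target such as multipath can save a bio's mutable fields before mapping it and put them back if the bio has to be resubmitted after an error. A hedged sketch of that pattern; the structure and function names below are illustrative, not taken from this commit:

    #include <linux/blkdev.h>
    #include "dm-bio-record.h"

    struct mpio_state {                             /* hypothetical per-bio bookkeeping */
            struct dm_bio_details details;
    };

    static void save_bio_state(struct mpio_state *s, struct bio *bio)
    {
            dm_bio_record(&s->details, bio);        /* snapshot bi_sector, bi_bdev, bi_size, ... */
    }

    static void retry_bio(struct mpio_state *s, struct bio *bio)
    {
            /* undo whatever the lower layers changed, then resubmit the bio */
            dm_bio_restore(&s->details, bio);
            generic_make_request(bio);
    }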
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
new file mode 100644
index 000000000000..77619a56e2bf
--- /dev/null
+++ b/drivers/md/dm-crypt.c
@@ -0,0 +1,977 @@
1/*
2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
3 * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/module.h>
9#include <linux/init.h>
10#include <linux/kernel.h>
11#include <linux/bio.h>
12#include <linux/blkdev.h>
13#include <linux/mempool.h>
14#include <linux/slab.h>
15#include <linux/crypto.h>
16#include <linux/workqueue.h>
17#include <asm/atomic.h>
18#include <asm/scatterlist.h>
19#include <asm/page.h>
20
21#include "dm.h"
22
23#define PFX "crypt: "
24
25/*
26 * per bio private data
27 */
28struct crypt_io {
29 struct dm_target *target;
30 struct bio *bio;
31 struct bio *first_clone;
32 struct work_struct work;
33 atomic_t pending;
34 int error;
35};
36
37/*
38 * context holding the current state of a multi-part conversion
39 */
40struct convert_context {
41 struct bio *bio_in;
42 struct bio *bio_out;
43 unsigned int offset_in;
44 unsigned int offset_out;
45 unsigned int idx_in;
46 unsigned int idx_out;
47 sector_t sector;
48 int write;
49};
50
51struct crypt_config;
52
53struct crypt_iv_operations {
54 int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
55 const char *opts);
56 void (*dtr)(struct crypt_config *cc);
57 const char *(*status)(struct crypt_config *cc);
58 int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
59};
60
61/*
62 * Crypt: maps a linear range of a block device
63 * and encrypts / decrypts at the same time.
64 */
65struct crypt_config {
66 struct dm_dev *dev;
67 sector_t start;
68
69 /*
70 * pool for per bio private data and
71 * for encryption buffer pages
72 */
73 mempool_t *io_pool;
74 mempool_t *page_pool;
75
76 /*
77 * crypto related data
78 */
79 struct crypt_iv_operations *iv_gen_ops;
80 char *iv_mode;
81 void *iv_gen_private;
82 sector_t iv_offset;
83 unsigned int iv_size;
84
85 struct crypto_tfm *tfm;
86 unsigned int key_size;
87 u8 key[0];
88};
89
90#define MIN_IOS 256
91#define MIN_POOL_PAGES 32
92#define MIN_BIO_PAGES 8
93
94static kmem_cache_t *_crypt_io_pool;
95
96/*
97 * Mempool alloc and free functions for the page
98 */
99static void *mempool_alloc_page(unsigned int __nocast gfp_mask, void *data)
100{
101 return alloc_page(gfp_mask);
102}
103
104static void mempool_free_page(void *page, void *data)
105{
106 __free_page(page);
107}
108
109
110/*
111 * Different IV generation algorithms:
112 *
113 * plain: the initial vector is the 32-bit little-endian version of the sector
114 * number, padded with zeros if necessary.
115 *
116 * ess_iv: "encrypted sector|salt initial vector", the sector number is
117 * encrypted with the bulk cipher using a salt as key. The salt
118 * should be derived from the bulk cipher's key via hashing.
119 *
120 * plumb: unimplemented, see:
121 * http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
122 */
123
124static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
125{
126 memset(iv, 0, cc->iv_size);
127 *(u32 *)iv = cpu_to_le32(sector & 0xffffffff);
128
129 return 0;
130}
131
132static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
133 const char *opts)
134{
135 struct crypto_tfm *essiv_tfm;
136 struct crypto_tfm *hash_tfm;
137 struct scatterlist sg;
138 unsigned int saltsize;
139 u8 *salt;
140
141 if (opts == NULL) {
142 ti->error = PFX "Digest algorithm missing for ESSIV mode";
143 return -EINVAL;
144 }
145
146 /* Hash the cipher key with the given hash algorithm */
147 hash_tfm = crypto_alloc_tfm(opts, 0);
148 if (hash_tfm == NULL) {
149 ti->error = PFX "Error initializing ESSIV hash";
150 return -EINVAL;
151 }
152
153 if (crypto_tfm_alg_type(hash_tfm) != CRYPTO_ALG_TYPE_DIGEST) {
154 ti->error = PFX "Expected digest algorithm for ESSIV hash";
155 crypto_free_tfm(hash_tfm);
156 return -EINVAL;
157 }
158
159 saltsize = crypto_tfm_alg_digestsize(hash_tfm);
160 salt = kmalloc(saltsize, GFP_KERNEL);
161 if (salt == NULL) {
162 ti->error = PFX "Error kmallocing salt storage in ESSIV";
163 crypto_free_tfm(hash_tfm);
164 return -ENOMEM;
165 }
166
167 sg.page = virt_to_page(cc->key);
168 sg.offset = offset_in_page(cc->key);
169 sg.length = cc->key_size;
170 crypto_digest_digest(hash_tfm, &sg, 1, salt);
171 crypto_free_tfm(hash_tfm);
172
173 /* Setup the essiv_tfm with the given salt */
174 essiv_tfm = crypto_alloc_tfm(crypto_tfm_alg_name(cc->tfm),
175 CRYPTO_TFM_MODE_ECB);
176 if (essiv_tfm == NULL) {
177 ti->error = PFX "Error allocating crypto tfm for ESSIV";
178 kfree(salt);
179 return -EINVAL;
180 }
181 if (crypto_tfm_alg_blocksize(essiv_tfm)
182 != crypto_tfm_alg_ivsize(cc->tfm)) {
183 ti->error = PFX "Block size of ESSIV cipher does "
184 "not match IV size of block cipher";
185 crypto_free_tfm(essiv_tfm);
186 kfree(salt);
187 return -EINVAL;
188 }
189 if (crypto_cipher_setkey(essiv_tfm, salt, saltsize) < 0) {
190 ti->error = PFX "Failed to set key for ESSIV cipher";
191 crypto_free_tfm(essiv_tfm);
192 kfree(salt);
193 return -EINVAL;
194 }
195 kfree(salt);
196
197 cc->iv_gen_private = (void *)essiv_tfm;
198 return 0;
199}
200
201static void crypt_iv_essiv_dtr(struct crypt_config *cc)
202{
203 crypto_free_tfm((struct crypto_tfm *)cc->iv_gen_private);
204 cc->iv_gen_private = NULL;
205}
206
207static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
208{
209 struct scatterlist sg = { NULL, };
210
211 memset(iv, 0, cc->iv_size);
212 *(u64 *)iv = cpu_to_le64(sector);
213
214 sg.page = virt_to_page(iv);
215 sg.offset = offset_in_page(iv);
216 sg.length = cc->iv_size;
217 crypto_cipher_encrypt((struct crypto_tfm *)cc->iv_gen_private,
218 &sg, &sg, cc->iv_size);
219
220 return 0;
221}
222
223static struct crypt_iv_operations crypt_iv_plain_ops = {
224 .generator = crypt_iv_plain_gen
225};
226
227static struct crypt_iv_operations crypt_iv_essiv_ops = {
228 .ctr = crypt_iv_essiv_ctr,
229 .dtr = crypt_iv_essiv_dtr,
230 .generator = crypt_iv_essiv_gen
231};
232
233
234static inline int
235crypt_convert_scatterlist(struct crypt_config *cc, struct scatterlist *out,
236 struct scatterlist *in, unsigned int length,
237 int write, sector_t sector)
238{
239 u8 iv[cc->iv_size];
240 int r;
241
242 if (cc->iv_gen_ops) {
243 r = cc->iv_gen_ops->generator(cc, iv, sector);
244 if (r < 0)
245 return r;
246
247 if (write)
248 r = crypto_cipher_encrypt_iv(cc->tfm, out, in, length, iv);
249 else
250 r = crypto_cipher_decrypt_iv(cc->tfm, out, in, length, iv);
251 } else {
252 if (write)
253 r = crypto_cipher_encrypt(cc->tfm, out, in, length);
254 else
255 r = crypto_cipher_decrypt(cc->tfm, out, in, length);
256 }
257
258 return r;
259}
260
261static void
262crypt_convert_init(struct crypt_config *cc, struct convert_context *ctx,
263 struct bio *bio_out, struct bio *bio_in,
264 sector_t sector, int write)
265{
266 ctx->bio_in = bio_in;
267 ctx->bio_out = bio_out;
268 ctx->offset_in = 0;
269 ctx->offset_out = 0;
270 ctx->idx_in = bio_in ? bio_in->bi_idx : 0;
271 ctx->idx_out = bio_out ? bio_out->bi_idx : 0;
272 ctx->sector = sector + cc->iv_offset;
273 ctx->write = write;
274}
275
276/*
277 * Encrypt / decrypt data from one bio to another one (can be the same one)
278 */
279static int crypt_convert(struct crypt_config *cc,
280 struct convert_context *ctx)
281{
282 int r = 0;
283
284 while(ctx->idx_in < ctx->bio_in->bi_vcnt &&
285 ctx->idx_out < ctx->bio_out->bi_vcnt) {
286 struct bio_vec *bv_in = bio_iovec_idx(ctx->bio_in, ctx->idx_in);
287 struct bio_vec *bv_out = bio_iovec_idx(ctx->bio_out, ctx->idx_out);
288 struct scatterlist sg_in = {
289 .page = bv_in->bv_page,
290 .offset = bv_in->bv_offset + ctx->offset_in,
291 .length = 1 << SECTOR_SHIFT
292 };
293 struct scatterlist sg_out = {
294 .page = bv_out->bv_page,
295 .offset = bv_out->bv_offset + ctx->offset_out,
296 .length = 1 << SECTOR_SHIFT
297 };
298
299 ctx->offset_in += sg_in.length;
300 if (ctx->offset_in >= bv_in->bv_len) {
301 ctx->offset_in = 0;
302 ctx->idx_in++;
303 }
304
305 ctx->offset_out += sg_out.length;
306 if (ctx->offset_out >= bv_out->bv_len) {
307 ctx->offset_out = 0;
308 ctx->idx_out++;
309 }
310
311 r = crypt_convert_scatterlist(cc, &sg_out, &sg_in, sg_in.length,
312 ctx->write, ctx->sector);
313 if (r < 0)
314 break;
315
316 ctx->sector++;
317 }
318
319 return r;
320}
321
322/*
323 * Generate a new unfragmented bio with the given size
324 * This should never violate the device limitations
325 * May return a smaller bio when running out of pages
326 */
327static struct bio *
328crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
329 struct bio *base_bio, unsigned int *bio_vec_idx)
330{
331 struct bio *bio;
332 unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
333 int gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
334 unsigned long flags = current->flags;
335 unsigned int i;
336
337 /*
338 * Tell VM to act less aggressively and fail earlier.
339 * This is not necessary but increases throughput.
340 * FIXME: Is this really intelligent?
341 */
342 current->flags &= ~PF_MEMALLOC;
343
344 if (base_bio)
345 bio = bio_clone(base_bio, GFP_NOIO);
346 else
347 bio = bio_alloc(GFP_NOIO, nr_iovecs);
348 if (!bio) {
349 if (flags & PF_MEMALLOC)
350 current->flags |= PF_MEMALLOC;
351 return NULL;
352 }
353
354 /* if the last bio was not complete, continue where that one ended */
355 bio->bi_idx = *bio_vec_idx;
356 bio->bi_vcnt = *bio_vec_idx;
357 bio->bi_size = 0;
358 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
359
360 /* bio->bi_idx pages have already been allocated */
361 size -= bio->bi_idx * PAGE_SIZE;
362
363 for(i = bio->bi_idx; i < nr_iovecs; i++) {
364 struct bio_vec *bv = bio_iovec_idx(bio, i);
365
366 bv->bv_page = mempool_alloc(cc->page_pool, gfp_mask);
367 if (!bv->bv_page)
368 break;
369
370 /*
371 * if additional pages cannot be allocated without waiting,
372 * return a partially allocated bio, the caller will then try
373 * to allocate additional bios while submitting this partial bio
374 */
375 if ((i - bio->bi_idx) == (MIN_BIO_PAGES - 1))
376 gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
377
378 bv->bv_offset = 0;
379 if (size > PAGE_SIZE)
380 bv->bv_len = PAGE_SIZE;
381 else
382 bv->bv_len = size;
383
384 bio->bi_size += bv->bv_len;
385 bio->bi_vcnt++;
386 size -= bv->bv_len;
387 }
388
389 if (flags & PF_MEMALLOC)
390 current->flags |= PF_MEMALLOC;
391
392 if (!bio->bi_size) {
393 bio_put(bio);
394 return NULL;
395 }
396
397 /*
398 * Remember the last bio_vec allocated to be able
399 * to correctly continue after the splitting.
400 */
401 *bio_vec_idx = bio->bi_vcnt;
402
403 return bio;
404}
405
406static void crypt_free_buffer_pages(struct crypt_config *cc,
407 struct bio *bio, unsigned int bytes)
408{
409 unsigned int i, start, end;
410 struct bio_vec *bv;
411
412 /*
413 * This is ugly, but Jens Axboe thinks that using bi_idx in the
414 * endio function is too dangerous at the moment, so I calculate the
415 * correct position using bi_vcnt and bi_size.
416 * The bv_offset and bv_len fields might already be modified but we
417 * know that we always allocated whole pages.
418 * A fix to the bi_idx issue in the kernel is in the works, so
419 * we will hopefully be able to revert to the cleaner solution soon.
420 */
421 i = bio->bi_vcnt - 1;
422 bv = bio_iovec_idx(bio, i);
423 end = (i << PAGE_SHIFT) + (bv->bv_offset + bv->bv_len) - bio->bi_size;
424 start = end - bytes;
425
426 start >>= PAGE_SHIFT;
427 if (!bio->bi_size)
428 end = bio->bi_vcnt;
429 else
430 end >>= PAGE_SHIFT;
431
432 for(i = start; i < end; i++) {
433 bv = bio_iovec_idx(bio, i);
434 BUG_ON(!bv->bv_page);
435 mempool_free(bv->bv_page, cc->page_pool);
436 bv->bv_page = NULL;
437 }
438}
439
440/*
441 * One of the bios was finished. Check for completion of
442 * the whole request and correctly clean up the buffer.
443 */
444static void dec_pending(struct crypt_io *io, int error)
445{
446 struct crypt_config *cc = (struct crypt_config *) io->target->private;
447
448 if (error < 0)
449 io->error = error;
450
451 if (!atomic_dec_and_test(&io->pending))
452 return;
453
454 if (io->first_clone)
455 bio_put(io->first_clone);
456
457 bio_endio(io->bio, io->bio->bi_size, io->error);
458
459 mempool_free(io, cc->io_pool);
460}
461
462/*
463 * kcryptd:
464 *
465 * Needed because it would be very unwise to do decryption in an
466 * interrupt context, so bios returning from read requests get
467 * queued here.
468 */
469static struct workqueue_struct *_kcryptd_workqueue;
470
471static void kcryptd_do_work(void *data)
472{
473 struct crypt_io *io = (struct crypt_io *) data;
474 struct crypt_config *cc = (struct crypt_config *) io->target->private;
475 struct convert_context ctx;
476 int r;
477
478 crypt_convert_init(cc, &ctx, io->bio, io->bio,
479 io->bio->bi_sector - io->target->begin, 0);
480 r = crypt_convert(cc, &ctx);
481
482 dec_pending(io, r);
483}
484
485static void kcryptd_queue_io(struct crypt_io *io)
486{
487 INIT_WORK(&io->work, kcryptd_do_work, io);
488 queue_work(_kcryptd_workqueue, &io->work);
489}
490
491/*
492 * Decode key from its hex representation
493 */
494static int crypt_decode_key(u8 *key, char *hex, unsigned int size)
495{
496 char buffer[3];
497 char *endp;
498 unsigned int i;
499
500 buffer[2] = '\0';
501
502 for(i = 0; i < size; i++) {
503 buffer[0] = *hex++;
504 buffer[1] = *hex++;
505
506 key[i] = (u8)simple_strtoul(buffer, &endp, 16);
507
508 if (endp != &buffer[2])
509 return -EINVAL;
510 }
511
512 if (*hex != '\0')
513 return -EINVAL;
514
515 return 0;
516}
517
518/*
519 * Encode key into its hex representation
520 */
521static void crypt_encode_key(char *hex, u8 *key, unsigned int size)
522{
523 unsigned int i;
524
525 for(i = 0; i < size; i++) {
526 sprintf(hex, "%02x", *key);
527 hex += 2;
528 key++;
529 }
530}
531
532/*
533 * Construct an encryption mapping:
534 * <cipher> <key> <iv_offset> <dev_path> <start>
535 */
536static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
537{
538 struct crypt_config *cc;
539 struct crypto_tfm *tfm;
540 char *tmp;
541 char *cipher;
542 char *chainmode;
543 char *ivmode;
544 char *ivopts;
545 unsigned int crypto_flags;
546 unsigned int key_size;
547
548 if (argc != 5) {
549 ti->error = PFX "Not enough arguments";
550 return -EINVAL;
551 }
552
553 tmp = argv[0];
554 cipher = strsep(&tmp, "-");
555 chainmode = strsep(&tmp, "-");
556 ivopts = strsep(&tmp, "-");
557 ivmode = strsep(&ivopts, ":");
558
559 if (tmp)
560 DMWARN(PFX "Unexpected additional cipher options");
561
562 key_size = strlen(argv[1]) >> 1;
563
564 cc = kmalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
565 if (cc == NULL) {
566 ti->error =
567 PFX "Cannot allocate transparent encryption context";
568 return -ENOMEM;
569 }
570
571 cc->key_size = key_size;
572 if ((!key_size && strcmp(argv[1], "-") != 0) ||
573 (key_size && crypt_decode_key(cc->key, argv[1], key_size) < 0)) {
574 ti->error = PFX "Error decoding key";
575 goto bad1;
576 }
577
578 /* Compatibility mode for old dm-crypt cipher strings */
579 if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
580 chainmode = "cbc";
581 ivmode = "plain";
582 }
583
584 /* Choose crypto_flags according to chainmode */
585 if (strcmp(chainmode, "cbc") == 0)
586 crypto_flags = CRYPTO_TFM_MODE_CBC;
587 else if (strcmp(chainmode, "ecb") == 0)
588 crypto_flags = CRYPTO_TFM_MODE_ECB;
589 else {
590 ti->error = PFX "Unknown chaining mode";
591 goto bad1;
592 }
593
594 if (crypto_flags != CRYPTO_TFM_MODE_ECB && !ivmode) {
595 ti->error = PFX "This chaining mode requires an IV mechanism";
596 goto bad1;
597 }
598
599 tfm = crypto_alloc_tfm(cipher, crypto_flags);
600 if (!tfm) {
601 ti->error = PFX "Error allocating crypto tfm";
602 goto bad1;
603 }
604 if (crypto_tfm_alg_type(tfm) != CRYPTO_ALG_TYPE_CIPHER) {
605 ti->error = PFX "Expected cipher algorithm";
606 goto bad2;
607 }
608
609 cc->tfm = tfm;
610
611 /*
612 * Choose ivmode. Valid modes: "plain", "essiv:<esshash>".
613 * See the IV generation comments above.
614 */
615
616 if (ivmode == NULL)
617 cc->iv_gen_ops = NULL;
618 else if (strcmp(ivmode, "plain") == 0)
619 cc->iv_gen_ops = &crypt_iv_plain_ops;
620 else if (strcmp(ivmode, "essiv") == 0)
621 cc->iv_gen_ops = &crypt_iv_essiv_ops;
622 else {
623 ti->error = PFX "Invalid IV mode";
624 goto bad2;
625 }
626
627 if (cc->iv_gen_ops && cc->iv_gen_ops->ctr &&
628 cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
629 goto bad2;
630
631 if (tfm->crt_cipher.cit_decrypt_iv && tfm->crt_cipher.cit_encrypt_iv)
632 /* at least a 64 bit sector number should fit in our buffer */
633 cc->iv_size = max(crypto_tfm_alg_ivsize(tfm),
634 (unsigned int)(sizeof(u64) / sizeof(u8)));
635 else {
636 cc->iv_size = 0;
637 if (cc->iv_gen_ops) {
638 DMWARN(PFX "Selected cipher does not support IVs");
639 if (cc->iv_gen_ops->dtr)
640 cc->iv_gen_ops->dtr(cc);
641 cc->iv_gen_ops = NULL;
642 }
643 }
644
645 cc->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
646 mempool_free_slab, _crypt_io_pool);
647 if (!cc->io_pool) {
648 ti->error = PFX "Cannot allocate crypt io mempool";
649 goto bad3;
650 }
651
652 cc->page_pool = mempool_create(MIN_POOL_PAGES, mempool_alloc_page,
653 mempool_free_page, NULL);
654 if (!cc->page_pool) {
655 ti->error = PFX "Cannot allocate page mempool";
656 goto bad4;
657 }
658
659 if (tfm->crt_cipher.cit_setkey(tfm, cc->key, key_size) < 0) {
660 ti->error = PFX "Error setting key";
661 goto bad5;
662 }
663
664 if (sscanf(argv[2], SECTOR_FORMAT, &cc->iv_offset) != 1) {
665 ti->error = PFX "Invalid iv_offset sector";
666 goto bad5;
667 }
668
669 if (sscanf(argv[4], SECTOR_FORMAT, &cc->start) != 1) {
670 ti->error = PFX "Invalid device sector";
671 goto bad5;
672 }
673
674 if (dm_get_device(ti, argv[3], cc->start, ti->len,
675 dm_table_get_mode(ti->table), &cc->dev)) {
676 ti->error = PFX "Device lookup failed";
677 goto bad5;
678 }
679
680 if (ivmode && cc->iv_gen_ops) {
681 if (ivopts)
682 *(ivopts - 1) = ':';
683 cc->iv_mode = kmalloc(strlen(ivmode) + 1, GFP_KERNEL);
684 if (!cc->iv_mode) {
685 ti->error = PFX "Error kmallocing iv_mode string";
686 goto bad5;
687 }
688 strcpy(cc->iv_mode, ivmode);
689 } else
690 cc->iv_mode = NULL;
691
692 ti->private = cc;
693 return 0;
694
695bad5:
696 mempool_destroy(cc->page_pool);
697bad4:
698 mempool_destroy(cc->io_pool);
699bad3:
700 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
701 cc->iv_gen_ops->dtr(cc);
702bad2:
703 crypto_free_tfm(tfm);
704bad1:
705 kfree(cc);
706 return -EINVAL;
707}
708
709static void crypt_dtr(struct dm_target *ti)
710{
711 struct crypt_config *cc = (struct crypt_config *) ti->private;
712
713 mempool_destroy(cc->page_pool);
714 mempool_destroy(cc->io_pool);
715
716 if (cc->iv_mode)
717 kfree(cc->iv_mode);
718 if (cc->iv_gen_ops && cc->iv_gen_ops->dtr)
719 cc->iv_gen_ops->dtr(cc);
720 crypto_free_tfm(cc->tfm);
721 dm_put_device(ti, cc->dev);
722 kfree(cc);
723}
724
725static int crypt_endio(struct bio *bio, unsigned int done, int error)
726{
727 struct crypt_io *io = (struct crypt_io *) bio->bi_private;
728 struct crypt_config *cc = (struct crypt_config *) io->target->private;
729
730 if (bio_data_dir(bio) == WRITE) {
731 /*
732 * free the processed pages, even if
733 * it's only a partially completed write
734 */
735 crypt_free_buffer_pages(cc, bio, done);
736 }
737
738 if (bio->bi_size)
739 return 1;
740
741 bio_put(bio);
742
743 /*
744 * successful reads are decrypted by the worker thread
745 */
746 if ((bio_data_dir(bio) == READ)
747 && bio_flagged(bio, BIO_UPTODATE)) {
748 kcryptd_queue_io(io);
749 return 0;
750 }
751
752 dec_pending(io, error);
753 return error;
754}
755
756static inline struct bio *
757crypt_clone(struct crypt_config *cc, struct crypt_io *io, struct bio *bio,
758 sector_t sector, unsigned int *bvec_idx,
759 struct convert_context *ctx)
760{
761 struct bio *clone;
762
763 if (bio_data_dir(bio) == WRITE) {
764 clone = crypt_alloc_buffer(cc, bio->bi_size,
765 io->first_clone, bvec_idx);
766 if (clone) {
767 ctx->bio_out = clone;
768 if (crypt_convert(cc, ctx) < 0) {
769 crypt_free_buffer_pages(cc, clone,
770 clone->bi_size);
771 bio_put(clone);
772 return NULL;
773 }
774 }
775 } else {
776 /*
777 * The block layer might modify the bvec array, so always
778 * copy the required bvecs because we need the original
779 * one in order to decrypt the whole bio data *afterwards*.
780 */
781 clone = bio_alloc(GFP_NOIO, bio_segments(bio));
782 if (clone) {
783 clone->bi_idx = 0;
784 clone->bi_vcnt = bio_segments(bio);
785 clone->bi_size = bio->bi_size;
786 memcpy(clone->bi_io_vec, bio_iovec(bio),
787 sizeof(struct bio_vec) * clone->bi_vcnt);
788 }
789 }
790
791 if (!clone)
792 return NULL;
793
794 clone->bi_private = io;
795 clone->bi_end_io = crypt_endio;
796 clone->bi_bdev = cc->dev->bdev;
797 clone->bi_sector = cc->start + sector;
798 clone->bi_rw = bio->bi_rw;
799
800 return clone;
801}
802
803static int crypt_map(struct dm_target *ti, struct bio *bio,
804 union map_info *map_context)
805{
806 struct crypt_config *cc = (struct crypt_config *) ti->private;
807 struct crypt_io *io = mempool_alloc(cc->io_pool, GFP_NOIO);
808 struct convert_context ctx;
809 struct bio *clone;
810 unsigned int remaining = bio->bi_size;
811 sector_t sector = bio->bi_sector - ti->begin;
812 unsigned int bvec_idx = 0;
813
814 io->target = ti;
815 io->bio = bio;
816 io->first_clone = NULL;
817 io->error = 0;
818 atomic_set(&io->pending, 1); /* hold a reference */
819
820 if (bio_data_dir(bio) == WRITE)
821 crypt_convert_init(cc, &ctx, NULL, bio, sector, 1);
822
823 /*
824 * The allocated buffers can be smaller than the whole bio,
825 * so repeat the whole process until all the data can be handled.
826 */
827 while (remaining) {
828 clone = crypt_clone(cc, io, bio, sector, &bvec_idx, &ctx);
829 if (!clone)
830 goto cleanup;
831
832 if (!io->first_clone) {
833 /*
834 * hold a reference to the first clone, because it
835 * holds the bio_vec array and that can't be freed
836 * before all other clones are released
837 */
838 bio_get(clone);
839 io->first_clone = clone;
840 }
841 atomic_inc(&io->pending);
842
843 remaining -= clone->bi_size;
844 sector += bio_sectors(clone);
845
846 generic_make_request(clone);
847
848 /* out of memory -> run queues */
849 if (remaining)
850 blk_congestion_wait(bio_data_dir(clone), HZ/100);
851 }
852
853 /* drop reference, clones could have returned before we reach this */
854 dec_pending(io, 0);
855 return 0;
856
857cleanup:
858 if (io->first_clone) {
859 dec_pending(io, -ENOMEM);
860 return 0;
861 }
862
863 /* if no bio has been dispatched yet, we can directly return the error */
864 mempool_free(io, cc->io_pool);
865 return -ENOMEM;
866}
867
868static int crypt_status(struct dm_target *ti, status_type_t type,
869 char *result, unsigned int maxlen)
870{
871 struct crypt_config *cc = (struct crypt_config *) ti->private;
872 const char *cipher;
873 const char *chainmode = NULL;
874 unsigned int sz = 0;
875
876 switch (type) {
877 case STATUSTYPE_INFO:
878 result[0] = '\0';
879 break;
880
881 case STATUSTYPE_TABLE:
882 cipher = crypto_tfm_alg_name(cc->tfm);
883
884 switch(cc->tfm->crt_cipher.cit_mode) {
885 case CRYPTO_TFM_MODE_CBC:
886 chainmode = "cbc";
887 break;
888 case CRYPTO_TFM_MODE_ECB:
889 chainmode = "ecb";
890 break;
891 default:
892 BUG();
893 }
894
895 if (cc->iv_mode)
896 DMEMIT("%s-%s-%s ", cipher, chainmode, cc->iv_mode);
897 else
898 DMEMIT("%s-%s ", cipher, chainmode);
899
900 if (cc->key_size > 0) {
901 if ((maxlen - sz) < ((cc->key_size << 1) + 1))
902 return -ENOMEM;
903
904 crypt_encode_key(result + sz, cc->key, cc->key_size);
905 sz += cc->key_size << 1;
906 } else {
907 if (sz >= maxlen)
908 return -ENOMEM;
909 result[sz++] = '-';
910 }
911
912 DMEMIT(" " SECTOR_FORMAT " %s " SECTOR_FORMAT,
913 cc->iv_offset, cc->dev->name, cc->start);
914 break;
915 }
916 return 0;
917}
918
919static struct target_type crypt_target = {
920 .name = "crypt",
921 .version= {1, 1, 0},
922 .module = THIS_MODULE,
923 .ctr = crypt_ctr,
924 .dtr = crypt_dtr,
925 .map = crypt_map,
926 .status = crypt_status,
927};
928
929static int __init dm_crypt_init(void)
930{
931 int r;
932
933 _crypt_io_pool = kmem_cache_create("dm-crypt_io",
934 sizeof(struct crypt_io),
935 0, 0, NULL, NULL);
936 if (!_crypt_io_pool)
937 return -ENOMEM;
938
939 _kcryptd_workqueue = create_workqueue("kcryptd");
940 if (!_kcryptd_workqueue) {
941 r = -ENOMEM;
942 DMERR(PFX "couldn't create kcryptd");
943 goto bad1;
944 }
945
946 r = dm_register_target(&crypt_target);
947 if (r < 0) {
948 DMERR(PFX "register failed %d", r);
949 goto bad2;
950 }
951
952 return 0;
953
954bad2:
955 destroy_workqueue(_kcryptd_workqueue);
956bad1:
957 kmem_cache_destroy(_crypt_io_pool);
958 return r;
959}
960
961static void __exit dm_crypt_exit(void)
962{
963 int r = dm_unregister_target(&crypt_target);
964
965 if (r < 0)
966 DMERR(PFX "unregister failed %d", r);
967
968 destroy_workqueue(_kcryptd_workqueue);
969 kmem_cache_destroy(_crypt_io_pool);
970}
971
972module_init(dm_crypt_init);
973module_exit(dm_crypt_exit);
974
975MODULE_AUTHOR("Christophe Saout <christophe@saout.de>");
976MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption");
977MODULE_LICENSE("GPL");
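For orientation, crypt_ctr() above documents its parameters as "<cipher> <key> <iv_offset> <dev_path> <start>", with the cipher argument parsed as cipher-chainmode-ivmode[:ivopts]. A hypothetical parameter string for an AES-CBC mapping using ESSIV over SHA-256 could therefore look like this (the key and device path are made-up placeholders; the cipher and digest must be enabled in the crypto API configuration, and the digest size must be a valid key size for the ESSIV cipher):

    aes-cbc-essiv:sha256 00112233445566778899aabbccddeeff 0 /dev/sdb1 0

Here the 32 hex digits encode a 16-byte key, 0 is the IV offset, /dev/sdb1 is the backing device, and the final 0 is the start sector on that device.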
diff --git a/drivers/md/dm-emc.c b/drivers/md/dm-emc.c
new file mode 100644
index 000000000000..700658664594
--- /dev/null
+++ b/drivers/md/dm-emc.c
@@ -0,0 +1,359 @@
1/*
2 * Copyright (C) 2004 SUSE LINUX Products GmbH. All rights reserved.
3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 *
7 * Multipath support for EMC CLARiiON AX/CX-series hardware.
8 */
9
10#include "dm.h"
11#include "dm-hw-handler.h"
12#include <scsi/scsi.h>
13#include <scsi/scsi_cmnd.h>
14
15struct emc_handler {
16 spinlock_t lock;
17
18 /* Whether we should send the short trespass command (FC-series)
19 * or the long version (default for AX/CX CLARiiON arrays). */
20 unsigned short_trespass;
21 /* Whether or not to honor SCSI reservations when initiating a
22 * switch-over. Default: Don't. */
23 unsigned hr;
24
25 unsigned char sense[SCSI_SENSE_BUFFERSIZE];
26};
27
28#define TRESPASS_PAGE 0x22
29#define EMC_FAILOVER_TIMEOUT (60 * HZ)
30
31/* Code borrowed from dm-lsi-rdac by Mike Christie */
32
33static inline void free_bio(struct bio *bio)
34{
35 __free_page(bio->bi_io_vec[0].bv_page);
36 bio_put(bio);
37}
38
39static int emc_endio(struct bio *bio, unsigned int bytes_done, int error)
40{
41 struct path *path = bio->bi_private;
42
43 if (bio->bi_size)
44 return 1;
45
46 /* We also need to look at the sense keys here to decide whether or not to
47 * switch to the next PG etc.
48 *
49 * For now simple logic: either it works or it doesn't.
50 */
51 if (error)
52 dm_pg_init_complete(path, MP_FAIL_PATH);
53 else
54 dm_pg_init_complete(path, 0);
55
56 /* request is freed in block layer */
57 free_bio(bio);
58
59 return 0;
60}
61
62static struct bio *get_failover_bio(struct path *path, unsigned data_size)
63{
64 struct bio *bio;
65 struct page *page;
66
67 bio = bio_alloc(GFP_ATOMIC, 1);
68 if (!bio) {
69 DMERR("dm-emc: get_failover_bio: bio_alloc() failed.");
70 return NULL;
71 }
72
73 bio->bi_rw |= (1 << BIO_RW);
74 bio->bi_bdev = path->dev->bdev;
75 bio->bi_sector = 0;
76 bio->bi_private = path;
77 bio->bi_end_io = emc_endio;
78
79 page = alloc_page(GFP_ATOMIC);
80 if (!page) {
81 DMERR("dm-emc: get_failover_bio: alloc_page() failed.");
82 bio_put(bio);
83 return NULL;
84 }
85
86 if (bio_add_page(bio, page, data_size, 0) != data_size) {
87 DMERR("dm-emc: get_failover_bio: bio_add_page() failed.");
88 __free_page(page);
89 bio_put(bio);
90 return NULL;
91 }
92
93 return bio;
94}
95
96static struct request *get_failover_req(struct emc_handler *h,
97 struct bio *bio, struct path *path)
98{
99 struct request *rq;
100 struct block_device *bdev = bio->bi_bdev;
101 struct request_queue *q = bdev_get_queue(bdev);
102
103 /* FIXME: Figure out why it fails with GFP_ATOMIC. */
104 rq = blk_get_request(q, WRITE, __GFP_WAIT);
105 if (!rq) {
106 DMERR("dm-emc: get_failover_req: blk_get_request failed");
107 return NULL;
108 }
109
110 rq->bio = rq->biotail = bio;
111 blk_rq_bio_prep(q, rq, bio);
112
113 rq->rq_disk = bdev->bd_contains->bd_disk;
114
115 /* bio backed, so don't set data */
116 rq->buffer = rq->data = NULL;
117 /* rq data_len used for pc cmd's request_bufflen */
118 rq->data_len = bio->bi_size;
119
120 rq->sense = h->sense;
121 memset(rq->sense, 0, SCSI_SENSE_BUFFERSIZE);
122 rq->sense_len = 0;
123
124 memset(&rq->cmd, 0, BLK_MAX_CDB);
125
126 rq->timeout = EMC_FAILOVER_TIMEOUT;
127 rq->flags |= (REQ_BLOCK_PC | REQ_FAILFAST | REQ_NOMERGE);
128
129 return rq;
130}
131
132static struct request *emc_trespass_get(struct emc_handler *h,
133 struct path *path)
134{
135 struct bio *bio;
136 struct request *rq;
137 unsigned char *page22;
138 unsigned char long_trespass_pg[] = {
139 0, 0, 0, 0,
140 TRESPASS_PAGE, /* Page code */
141 0x09, /* Page length - 2 */
142 h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */
143 0xff, 0xff, /* Trespass target */
144 0, 0, 0, 0, 0, 0 /* Reserved bytes / unknown */
145 };
146 unsigned char short_trespass_pg[] = {
147 0, 0, 0, 0,
148 TRESPASS_PAGE, /* Page code */
149 0x02, /* Page length - 2 */
150 h->hr ? 0x01 : 0x81, /* Trespass code + Honor reservation bit */
151 0xff, /* Trespass target */
152 };
153 unsigned data_size = h->short_trespass ? sizeof(short_trespass_pg) :
154 sizeof(long_trespass_pg);
155
156 /* get bio backing */
157 if (data_size > PAGE_SIZE)
158 /* this should never happen */
159 return NULL;
160
161 bio = get_failover_bio(path, data_size);
162 if (!bio) {
163 DMERR("dm-emc: emc_trespass_get: no bio");
164 return NULL;
165 }
166
167 page22 = (unsigned char *)bio_data(bio);
168 memset(page22, 0, data_size);
169
170 memcpy(page22, h->short_trespass ?
171 short_trespass_pg : long_trespass_pg, data_size);
172
173 /* get request for block layer packet command */
174 rq = get_failover_req(h, bio, path);
175 if (!rq) {
176 DMERR("dm-emc: emc_trespass_get: no rq");
177 free_bio(bio);
178 return NULL;
179 }
180
181 /* Prepare the command. */
182 rq->cmd[0] = MODE_SELECT;
183 rq->cmd[1] = 0x10;
184 rq->cmd[4] = data_size;
185 rq->cmd_len = COMMAND_SIZE(rq->cmd[0]);
186
187 return rq;
188}
189
190static void emc_pg_init(struct hw_handler *hwh, unsigned bypassed,
191 struct path *path)
192{
193 struct request *rq;
194 struct request_queue *q = bdev_get_queue(path->dev->bdev);
195
196 /*
197 * We can either blindly init the pg (then look at the sense),
198 * or we can send some commands to get the state here (then
199 * possibly send the fo cmnd), or we can also have the
200 * initial state passed into us and then get an update here.
201 */
202 if (!q) {
203 DMINFO("dm-emc: emc_pg_init: no queue");
204 goto fail_path;
205 }
206
207 /* FIXME: The request should be pre-allocated. */
208 rq = emc_trespass_get(hwh->context, path);
209 if (!rq) {
210 DMERR("dm-emc: emc_pg_init: no rq");
211 goto fail_path;
212 }
213
214 DMINFO("dm-emc: emc_pg_init: sending switch-over command");
215 elv_add_request(q, rq, ELEVATOR_INSERT_FRONT, 1);
216 return;
217
218fail_path:
219 dm_pg_init_complete(path, MP_FAIL_PATH);
220}
221
222static struct emc_handler *alloc_emc_handler(void)
223{
224 struct emc_handler *h = kmalloc(sizeof(*h), GFP_KERNEL);
225
226 if (h)
227 spin_lock_init(&h->lock);
228
229 return h;
230}
231
232static int emc_create(struct hw_handler *hwh, unsigned argc, char **argv)
233{
234 struct emc_handler *h;
235 unsigned hr, short_trespass;
236
237 if (argc == 0) {
238 /* No arguments: use defaults */
239 hr = 0;
240 short_trespass = 0;
241 } else if (argc != 2) {
242 DMWARN("dm-emc hwhandler: incorrect number of arguments");
243 return -EINVAL;
244 } else {
245 if ((sscanf(argv[0], "%u", &short_trespass) != 1)
246 || (short_trespass > 1)) {
247 DMWARN("dm-emc: invalid trespass mode selected");
248 return -EINVAL;
249 }
250
251 if ((sscanf(argv[1], "%u", &hr) != 1)
252 || (hr > 1)) {
253 DMWARN("dm-emc: invalid honor reservation flag selected");
254 return -EINVAL;
255 }
256 }
257
258 h = alloc_emc_handler();
259 if (!h)
260 return -ENOMEM;
261
262 memset(h, 0, sizeof(*h));
263
264 hwh->context = h;
265
266 if ((h->short_trespass = short_trespass))
267 DMWARN("dm-emc: short trespass command will be sent");
268 else
269 DMWARN("dm-emc: long trespass command will be sent");
270
271 if ((h->hr = hr))
272 DMWARN("dm-emc: honor reservation bit will be set");
273 else
274 DMWARN("dm-emc: honor reservation bit will not be set (default)");
275
276 return 0;
277}
278
279static void emc_destroy(struct hw_handler *hwh)
280{
281 struct emc_handler *h = (struct emc_handler *) hwh->context;
282
283 kfree(h);
284 hwh->context = NULL;
285}
286
287static unsigned emc_error(struct hw_handler *hwh, struct bio *bio)
288{
289 /* FIXME: Patch from axboe still missing */
290#if 0
291 int sense;
292
293 if (bio->bi_error & BIO_SENSE) {
294 sense = bio->bi_error & 0xffffff; /* sense key / asc / ascq */
295
296 if (sense == 0x020403) {
297 /* LUN Not Ready - Manual Intervention Required
298 * indicates this is a passive path.
299 *
300 * FIXME: However, if this is seen and EVPD C0
301 * indicates that this is due to a NDU in
302 * progress, we should set FAIL_PATH too.
303 * This indicates we might have to do a SCSI
304 * inquiry in the end_io path. Ugh. */
305 return MP_BYPASS_PG | MP_RETRY_IO;
306 } else if (sense == 0x052501) {
307 /* An array based copy is in progress. Do not
308 * fail the path, do not bypass to another PG,
309 * do not retry. Fail the IO immediately.
310 * (Actually this is the same conclusion as in
311 * the default handler, but lets make sure.) */
312 return 0;
313 } else if (sense == 0x062900) {
314 /* Unit Attention Code. This is the first IO
315 * to the new path, so just retry. */
316 return MP_RETRY_IO;
317 }
318 }
319#endif
320
321 /* Try default handler */
322 return dm_scsi_err_handler(hwh, bio);
323}
324
325static struct hw_handler_type emc_hwh = {
326 .name = "emc",
327 .module = THIS_MODULE,
328 .create = emc_create,
329 .destroy = emc_destroy,
330 .pg_init = emc_pg_init,
331 .error = emc_error,
332};
333
334static int __init dm_emc_init(void)
335{
336 int r = dm_register_hw_handler(&emc_hwh);
337
338 if (r < 0)
339 DMERR("emc: register failed %d", r);
340
341 DMINFO("dm-emc version 0.0.3 loaded");
342
343 return r;
344}
345
346static void __exit dm_emc_exit(void)
347{
348 int r = dm_unregister_hw_handler(&emc_hwh);
349
350 if (r < 0)
351 DMERR("emc: unregister failed %d", r);
352}
353
354module_init(dm_emc_init);
355module_exit(dm_emc_exit);
356
357MODULE_DESCRIPTION(DM_NAME " EMC CX/AX/FC-family multipath");
358MODULE_AUTHOR("Lars Marowsky-Bree <lmb@suse.de>");
359MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
new file mode 100644
index 000000000000..17212b4201a1
--- /dev/null
+++ b/drivers/md/dm-exception-store.c
@@ -0,0 +1,648 @@
1/*
2 * dm-snapshot.c
3 *
4 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5 *
6 * This file is released under the GPL.
7 */
8
9#include "dm.h"
10#include "dm-snap.h"
11#include "dm-io.h"
12#include "kcopyd.h"
13
14#include <linux/mm.h>
15#include <linux/pagemap.h>
16#include <linux/vmalloc.h>
17#include <linux/slab.h>
18
19/*-----------------------------------------------------------------
20 * Persistent snapshots; by persistent we mean that the snapshot
21 * will survive a reboot.
22 *---------------------------------------------------------------*/
23
24/*
25 * We need to store a record of which parts of the origin have
26 * been copied to the snapshot device. The snapshot code
27 * requires that we copy exception chunks to chunk aligned areas
28 * of the COW store. It makes sense therefore, to store the
29 * metadata in chunk size blocks.
30 *
31 * There is no backward or forward compatibility implemented;
32 * snapshots with different disk versions than the kernel will
33 * not be usable. It is expected that "lvcreate" will blank out
34 * the start of a fresh COW device before calling the snapshot
35 * constructor.
36 *
37 * The first chunk of the COW device just contains the header.
38 * After this there is a chunk filled with exception metadata,
39 * followed by as many exception chunks as can fit in the
40 * metadata areas.
41 *
42 * All on disk structures are in little-endian format. The end
43 * of the exceptions info is indicated by an exception with a
44 * new_chunk of 0, which is invalid since it would point to the
45 * header chunk.
46 */
47
48/*
49 * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
50 */
51#define SNAP_MAGIC 0x70416e53
52
53/*
54 * The on-disk version of the metadata.
55 */
56#define SNAPSHOT_DISK_VERSION 1
57
58struct disk_header {
59 uint32_t magic;
60
61 /*
62 * Is this snapshot valid? There is no way of recovering
63 * an invalid snapshot.
64 */
65 uint32_t valid;
66
67 /*
68 * Simple, incrementing version. No backward
69 * compatibility.
70 */
71 uint32_t version;
72
73 /* In sectors */
74 uint32_t chunk_size;
75};
76
77struct disk_exception {
78 uint64_t old_chunk;
79 uint64_t new_chunk;
80};
81
82struct commit_callback {
83 void (*callback)(void *, int success);
84 void *context;
85};
86
87/*
88 * The top level structure for a persistent exception store.
89 */
90struct pstore {
91 struct dm_snapshot *snap; /* up pointer to my snapshot */
92 int version;
93 int valid;
94 uint32_t chunk_size;
95 uint32_t exceptions_per_area;
96
97 /*
98 * Now that we have an asynchronous kcopyd there is no
99 * need for large chunk sizes, so it won't hurt to have a
100 * whole chunk's worth of metadata in memory at once.
101 */
102 void *area;
103
104 /*
105 * Used to keep track of which metadata area the data in
106 * 'chunk' refers to.
107 */
108 uint32_t current_area;
109
110 /*
111 * The next free chunk for an exception.
112 */
113 uint32_t next_free;
114
115 /*
116 * The index of next free exception in the current
117 * metadata area.
118 */
119 uint32_t current_committed;
120
121 atomic_t pending_count;
122 uint32_t callback_count;
123 struct commit_callback *callbacks;
124};
125
126static inline unsigned int sectors_to_pages(unsigned int sectors)
127{
128 return sectors / (PAGE_SIZE >> 9);
129}
130
131static int alloc_area(struct pstore *ps)
132{
133 int r = -ENOMEM;
134 size_t len;
135
136 len = ps->chunk_size << SECTOR_SHIFT;
137
138 /*
139 * Allocate the chunk_size block of memory that will hold
140 * a single metadata area.
141 */
142 ps->area = vmalloc(len);
143 if (!ps->area)
144 return r;
145
146 return 0;
147}
148
149static void free_area(struct pstore *ps)
150{
151 vfree(ps->area);
152}
153
154/*
155 * Read or write a chunk aligned and sized block of data from a device.
156 */
157static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
158{
159 struct io_region where;
160 unsigned long bits;
161
162 where.bdev = ps->snap->cow->bdev;
163 where.sector = ps->chunk_size * chunk;
164 where.count = ps->chunk_size;
165
166 return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
167}
168
169/*
170 * Read or write a metadata area, remembering to skip the first
171 * chunk, which holds the header.
172 */
173static int area_io(struct pstore *ps, uint32_t area, int rw)
174{
175 int r;
176 uint32_t chunk;
177
178 /* convert a metadata area index to a chunk index */
179 chunk = 1 + ((ps->exceptions_per_area + 1) * area);
180
181 r = chunk_io(ps, chunk, rw);
182 if (r)
183 return r;
184
185 ps->current_area = area;
186 return 0;
187}
188
189static int zero_area(struct pstore *ps, uint32_t area)
190{
191 memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
192 return area_io(ps, area, WRITE);
193}
194
195static int read_header(struct pstore *ps, int *new_snapshot)
196{
197 int r;
198 struct disk_header *dh;
199
200 r = chunk_io(ps, 0, READ);
201 if (r)
202 return r;
203
204 dh = (struct disk_header *) ps->area;
205
206 if (le32_to_cpu(dh->magic) == 0) {
207 *new_snapshot = 1;
208
209 } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
210 *new_snapshot = 0;
211 ps->valid = le32_to_cpu(dh->valid);
212 ps->version = le32_to_cpu(dh->version);
213 ps->chunk_size = le32_to_cpu(dh->chunk_size);
214
215 } else {
216 DMWARN("Invalid/corrupt snapshot");
217 r = -ENXIO;
218 }
219
220 return r;
221}
222
223static int write_header(struct pstore *ps)
224{
225 struct disk_header *dh;
226
227 memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
228
229 dh = (struct disk_header *) ps->area;
230 dh->magic = cpu_to_le32(SNAP_MAGIC);
231 dh->valid = cpu_to_le32(ps->valid);
232 dh->version = cpu_to_le32(ps->version);
233 dh->chunk_size = cpu_to_le32(ps->chunk_size);
234
235 return chunk_io(ps, 0, WRITE);
236}
237
238/*
239 * Access functions for the disk exceptions, these do the endian conversions.
240 */
241static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
242{
243 if (index >= ps->exceptions_per_area)
244 return NULL;
245
246 return ((struct disk_exception *) ps->area) + index;
247}
248
249static int read_exception(struct pstore *ps,
250 uint32_t index, struct disk_exception *result)
251{
252 struct disk_exception *e;
253
254 e = get_exception(ps, index);
255 if (!e)
256 return -EINVAL;
257
258 /* copy it */
259 result->old_chunk = le64_to_cpu(e->old_chunk);
260 result->new_chunk = le64_to_cpu(e->new_chunk);
261
262 return 0;
263}
264
265static int write_exception(struct pstore *ps,
266 uint32_t index, struct disk_exception *de)
267{
268 struct disk_exception *e;
269
270 e = get_exception(ps, index);
271 if (!e)
272 return -EINVAL;
273
274 /* copy it */
275 e->old_chunk = cpu_to_le64(de->old_chunk);
276 e->new_chunk = cpu_to_le64(de->new_chunk);
277
278 return 0;
279}
280
281/*
282 * Registers the exceptions that are present in the current area.
283 * 'full' is filled in to indicate if the area has been
284 * filled.
285 */
286static int insert_exceptions(struct pstore *ps, int *full)
287{
288 int r;
289 unsigned int i;
290 struct disk_exception de;
291
292 /* presume the area is full */
293 *full = 1;
294
295 for (i = 0; i < ps->exceptions_per_area; i++) {
296 r = read_exception(ps, i, &de);
297
298 if (r)
299 return r;
300
301 /*
302 * If the new_chunk is pointing at the start of
303 * the COW device, where the first metadata area
304 * is, we know that we've hit the end of the
305 * exceptions. Therefore the area is not full.
306 */
307 if (de.new_chunk == 0LL) {
308 ps->current_committed = i;
309 *full = 0;
310 break;
311 }
312
313 /*
314 * Keep track of the start of the free chunks.
315 */
316 if (ps->next_free <= de.new_chunk)
317 ps->next_free = de.new_chunk + 1;
318
319 /*
320 * Otherwise we add the exception to the snapshot.
321 */
322 r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
323 if (r)
324 return r;
325 }
326
327 return 0;
328}
329
330static int read_exceptions(struct pstore *ps)
331{
332 uint32_t area;
333 int r, full = 1;
334
335 /*
336 * Keep reading chunks and inserting exceptions until
337 * we find a partially full area.
338 */
339 for (area = 0; full; area++) {
340 r = area_io(ps, area, READ);
341 if (r)
342 return r;
343
344 r = insert_exceptions(ps, &full);
345 if (r)
346 return r;
347 }
348
349 return 0;
350}
351
352static inline struct pstore *get_info(struct exception_store *store)
353{
354 return (struct pstore *) store->context;
355}
356
357static void persistent_fraction_full(struct exception_store *store,
358 sector_t *numerator, sector_t *denominator)
359{
360 *numerator = get_info(store)->next_free * store->snap->chunk_size;
361 *denominator = get_dev_size(store->snap->cow->bdev);
362}
363
364static void persistent_destroy(struct exception_store *store)
365{
366 struct pstore *ps = get_info(store);
367
368 dm_io_put(sectors_to_pages(ps->chunk_size));
369 vfree(ps->callbacks);
370 free_area(ps);
371 kfree(ps);
372}
373
374static int persistent_read_metadata(struct exception_store *store)
375{
376 int r, new_snapshot;
377 struct pstore *ps = get_info(store);
378
379 /*
380 * Read the snapshot header.
381 */
382 r = read_header(ps, &new_snapshot);
383 if (r)
384 return r;
385
386 /*
387 * Do we need to set up a new snapshot?
388 */
389 if (new_snapshot) {
390 r = write_header(ps);
391 if (r) {
392 DMWARN("write_header failed");
393 return r;
394 }
395
396 r = zero_area(ps, 0);
397 if (r) {
398 DMWARN("zero_area(0) failed");
399 return r;
400 }
401
402 } else {
403 /*
404 * Sanity checks.
405 */
406 if (!ps->valid) {
407 DMWARN("snapshot is marked invalid");
408 return -EINVAL;
409 }
410
411 if (ps->version != SNAPSHOT_DISK_VERSION) {
412 DMWARN("unable to handle snapshot disk version %d",
413 ps->version);
414 return -EINVAL;
415 }
416
417 /*
418 * Read the metadata.
419 */
420 r = read_exceptions(ps);
421 if (r)
422 return r;
423 }
424
425 return 0;
426}
427
428static int persistent_prepare(struct exception_store *store,
429 struct exception *e)
430{
431 struct pstore *ps = get_info(store);
432 uint32_t stride;
433 sector_t size = get_dev_size(store->snap->cow->bdev);
434
435 /* Is there enough room ? */
436 if (size < ((ps->next_free + 1) * store->snap->chunk_size))
437 return -ENOSPC;
438
439 e->new_chunk = ps->next_free;
440
441 /*
442 * Move on to the next free chunk, making sure to take
443 * into account the location of the metadata chunks.
444 */
445 stride = (ps->exceptions_per_area + 1);
446 if ((++ps->next_free % stride) == 1)
447 ps->next_free++;
448
449 atomic_inc(&ps->pending_count);
450 return 0;
451}
452
453static void persistent_commit(struct exception_store *store,
454 struct exception *e,
455 void (*callback) (void *, int success),
456 void *callback_context)
457{
458 int r;
459 unsigned int i;
460 struct pstore *ps = get_info(store);
461 struct disk_exception de;
462 struct commit_callback *cb;
463
464 de.old_chunk = e->old_chunk;
465 de.new_chunk = e->new_chunk;
466 write_exception(ps, ps->current_committed++, &de);
467
468 /*
469 * Add the callback to the back of the array. This code
470 * is the only place where the callback array is
471 * manipulated, and we know that it will never be called
472 * multiple times concurrently.
473 */
474 cb = ps->callbacks + ps->callback_count++;
475 cb->callback = callback;
476 cb->context = callback_context;
477
478 /*
479 * If there are no more exceptions in flight, or we have
480 * filled this metadata area, we commit the exceptions to
481 * disk.
482 */
483 if (atomic_dec_and_test(&ps->pending_count) ||
484 (ps->current_committed == ps->exceptions_per_area)) {
485 r = area_io(ps, ps->current_area, WRITE);
486 if (r)
487 ps->valid = 0;
488
489 for (i = 0; i < ps->callback_count; i++) {
490 cb = ps->callbacks + i;
491 cb->callback(cb->context, r == 0 ? 1 : 0);
492 }
493
494 ps->callback_count = 0;
495 }
496
497 /*
498 * Have we completely filled the current area?
499 */
500 if (ps->current_committed == ps->exceptions_per_area) {
501 ps->current_committed = 0;
502 r = zero_area(ps, ps->current_area + 1);
503 if (r)
504 ps->valid = 0;
505 }
506}
507
508static void persistent_drop(struct exception_store *store)
509{
510 struct pstore *ps = get_info(store);
511
512 ps->valid = 0;
513 if (write_header(ps))
514 DMWARN("write header failed");
515}
516
517int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
518{
519 int r;
520 struct pstore *ps;
521
522 r = dm_io_get(sectors_to_pages(chunk_size));
523 if (r)
524 return r;
525
526 /* allocate the pstore */
527 ps = kmalloc(sizeof(*ps), GFP_KERNEL);
528 if (!ps) {
529 r = -ENOMEM;
530 goto bad;
531 }
532
533 ps->snap = store->snap;
534 ps->valid = 1;
535 ps->version = SNAPSHOT_DISK_VERSION;
536 ps->chunk_size = chunk_size;
537 ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
538 sizeof(struct disk_exception);
539 ps->next_free = 2; /* skipping the header and first area */
540 ps->current_committed = 0;
541
542 r = alloc_area(ps);
543 if (r)
544 goto bad;
545
546 /*
547 * Allocate space for all the callbacks.
548 */
549 ps->callback_count = 0;
550 atomic_set(&ps->pending_count, 0);
551 ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
552 sizeof(*ps->callbacks));
553
554 if (!ps->callbacks) {
555 r = -ENOMEM;
556 goto bad;
557 }
558
559 store->destroy = persistent_destroy;
560 store->read_metadata = persistent_read_metadata;
561 store->prepare_exception = persistent_prepare;
562 store->commit_exception = persistent_commit;
563 store->drop_snapshot = persistent_drop;
564 store->fraction_full = persistent_fraction_full;
565 store->context = ps;
566
567 return 0;
568
569 bad:
570 dm_io_put(sectors_to_pages(chunk_size));
571 if (ps) {
572 if (ps->area)
573 free_area(ps);
574
575 kfree(ps);
576 }
577 return r;
578}
579
580/*-----------------------------------------------------------------
581 * Implementation of the store for non-persistent snapshots.
582 *---------------------------------------------------------------*/
583struct transient_c {
584 sector_t next_free;
585};
586
587static void transient_destroy(struct exception_store *store)
588{
589 kfree(store->context);
590}
591
592static int transient_read_metadata(struct exception_store *store)
593{
594 return 0;
595}
596
597static int transient_prepare(struct exception_store *store, struct exception *e)
598{
599 struct transient_c *tc = (struct transient_c *) store->context;
600 sector_t size = get_dev_size(store->snap->cow->bdev);
601
602 if (size < (tc->next_free + store->snap->chunk_size))
603 return -1;
604
605 e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
606 tc->next_free += store->snap->chunk_size;
607
608 return 0;
609}
610
611static void transient_commit(struct exception_store *store,
612 struct exception *e,
613 void (*callback) (void *, int success),
614 void *callback_context)
615{
616 /* Just succeed */
617 callback(callback_context, 1);
618}
619
620static void transient_fraction_full(struct exception_store *store,
621 sector_t *numerator, sector_t *denominator)
622{
623 *numerator = ((struct transient_c *) store->context)->next_free;
624 *denominator = get_dev_size(store->snap->cow->bdev);
625}
626
627int dm_create_transient(struct exception_store *store,
628 struct dm_snapshot *s, int blocksize)
629{
630 struct transient_c *tc;
631
632 memset(store, 0, sizeof(*store));
633 store->destroy = transient_destroy;
634 store->read_metadata = transient_read_metadata;
635 store->prepare_exception = transient_prepare;
636 store->commit_exception = transient_commit;
637 store->fraction_full = transient_fraction_full;
638 store->snap = s;
639
640 tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
641 if (!tc)
642 return -ENOMEM;
643
644 tc->next_free = 0;
645 store->context = tc;
646
647 return 0;
648}
diff --git a/drivers/md/dm-hw-handler.c b/drivers/md/dm-hw-handler.c
new file mode 100644
index 000000000000..ae63772e44c9
--- /dev/null
+++ b/drivers/md/dm-hw-handler.c
@@ -0,0 +1,216 @@
1/*
2 * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3 *
4 * This file is released under the GPL.
5 *
6 * Multipath hardware handler registration.
7 */
8
9#include "dm.h"
10#include "dm-hw-handler.h"
11
12#include <linux/slab.h>
13
14struct hwh_internal {
15 struct hw_handler_type hwht;
16
17 struct list_head list;
18 long use;
19};
20
21#define hwht_to_hwhi(__hwht) container_of((__hwht), struct hwh_internal, hwht)
22
23static LIST_HEAD(_hw_handlers);
24static DECLARE_RWSEM(_hwh_lock);
25
26static struct hwh_internal *__find_hw_handler_type(const char *name)
27{
28 struct hwh_internal *hwhi;
29
30 list_for_each_entry(hwhi, &_hw_handlers, list) {
31 if (!strcmp(name, hwhi->hwht.name))
32 return hwhi;
33 }
34
35 return NULL;
36}
37
38static struct hwh_internal *get_hw_handler(const char *name)
39{
40 struct hwh_internal *hwhi;
41
42 down_read(&_hwh_lock);
43 hwhi = __find_hw_handler_type(name);
44 if (hwhi) {
45 if ((hwhi->use == 0) && !try_module_get(hwhi->hwht.module))
46 hwhi = NULL;
47 else
48 hwhi->use++;
49 }
50 up_read(&_hwh_lock);
51
52 return hwhi;
53}
54
55struct hw_handler_type *dm_get_hw_handler(const char *name)
56{
57 struct hwh_internal *hwhi;
58
59 if (!name)
60 return NULL;
61
62 hwhi = get_hw_handler(name);
63 if (!hwhi) {
64 request_module("dm-%s", name);
65 hwhi = get_hw_handler(name);
66 }
67
68 return hwhi ? &hwhi->hwht : NULL;
69}
70
71void dm_put_hw_handler(struct hw_handler_type *hwht)
72{
73 struct hwh_internal *hwhi;
74
75 if (!hwht)
76 return;
77
78 down_read(&_hwh_lock);
79 hwhi = __find_hw_handler_type(hwht->name);
80 if (!hwhi)
81 goto out;
82
83 if (--hwhi->use == 0)
84 module_put(hwhi->hwht.module);
85
86 if (hwhi->use < 0)
87 BUG();
88
89 out:
90 up_read(&_hwh_lock);
91}
92
93static struct hwh_internal *_alloc_hw_handler(struct hw_handler_type *hwht)
94{
95 struct hwh_internal *hwhi = kmalloc(sizeof(*hwhi), GFP_KERNEL);
96
97 if (hwhi) {
98 memset(hwhi, 0, sizeof(*hwhi));
99 hwhi->hwht = *hwht;
100 }
101
102 return hwhi;
103}
104
105int dm_register_hw_handler(struct hw_handler_type *hwht)
106{
107 int r = 0;
108 struct hwh_internal *hwhi = _alloc_hw_handler(hwht);
109
110 if (!hwhi)
111 return -ENOMEM;
112
113 down_write(&_hwh_lock);
114
115 if (__find_hw_handler_type(hwht->name)) {
116 kfree(hwhi);
117 r = -EEXIST;
118 } else
119 list_add(&hwhi->list, &_hw_handlers);
120
121 up_write(&_hwh_lock);
122
123 return r;
124}
125
126int dm_unregister_hw_handler(struct hw_handler_type *hwht)
127{
128 struct hwh_internal *hwhi;
129
130 down_write(&_hwh_lock);
131
132 hwhi = __find_hw_handler_type(hwht->name);
133 if (!hwhi) {
134 up_write(&_hwh_lock);
135 return -EINVAL;
136 }
137
138 if (hwhi->use) {
139 up_write(&_hwh_lock);
140 return -ETXTBSY;
141 }
142
143 list_del(&hwhi->list);
144
145 up_write(&_hwh_lock);
146
147 kfree(hwhi);
148
149 return 0;
150}
151
152unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio)
153{
154#if 0
155 int sense_key, asc, ascq;
156
157 if (bio->bi_error & BIO_SENSE) {
158 /* FIXME: This is just an initial guess. */
159 /* key / asc / ascq */
160 sense_key = (bio->bi_error >> 16) & 0xff;
161 asc = (bio->bi_error >> 8) & 0xff;
162 ascq = bio->bi_error & 0xff;
163
164 switch (sense_key) {
165 /* These errors come from the device itself, so there
166 * is no point retrying on another path. */
167 case 0x03: /* Medium error */
168 case 0x05: /* Illegal request */
169 case 0x07: /* Data protect */
170 case 0x08: /* Blank check */
171 case 0x0a: /* copy aborted */
172 case 0x0c: /* obsolete - no clue ;-) */
173 case 0x0d: /* volume overflow */
174 case 0x0e: /* data miscompare */
175 case 0x0f: /* reserved - no idea either. */
176 return MP_ERROR_IO;
177
178 /* For these errors it's unclear whether they
179 * come from the device or the controller.
180 * So let's just try a different path, and if
181 * it eventually succeeds, user-space will clear
182 * the paths again... */
183 case 0x02: /* Not ready */
184 case 0x04: /* Hardware error */
185 case 0x09: /* vendor specific */
186 case 0x0b: /* Aborted command */
187 return MP_FAIL_PATH;
188
189 case 0x06: /* Unit attention - might want to decode */
190 if (asc == 0x04 && ascq == 0x01)
191 /* "Unit in the process of
192 * becoming ready" */
193 return 0;
194 return MP_FAIL_PATH;
195
196 /* FIXME: For Unit Not Ready we may want
197 * to have a generic pg activation
198 * feature (START_UNIT). */
199
200 /* Should these two ever end up in the
201 * error path? I don't think so. */
202 case 0x00: /* No sense */
203 case 0x01: /* Recovered error */
204 return 0;
205 }
206 }
207#endif
208
209 /* We have no idea how to decode the other kinds of errors, so
210 * assume a generic error condition. */
211 return MP_FAIL_PATH;
212}
213
214EXPORT_SYMBOL_GPL(dm_register_hw_handler);
215EXPORT_SYMBOL_GPL(dm_unregister_hw_handler);
216EXPORT_SYMBOL_GPL(dm_scsi_err_handler);
diff --git a/drivers/md/dm-hw-handler.h b/drivers/md/dm-hw-handler.h
new file mode 100644
index 000000000000..15f5629e231a
--- /dev/null
+++ b/drivers/md/dm-hw-handler.h
@@ -0,0 +1,61 @@
1/*
2 * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3 *
4 * This file is released under the GPL.
5 *
6 * Multipath hardware handler registration.
7 */
8
9#ifndef DM_HW_HANDLER_H
10#define DM_HW_HANDLER_H
11
12#include <linux/device-mapper.h>
13
14#include "dm-mpath.h"
15
16struct hw_handler_type;
17struct hw_handler {
18 struct hw_handler_type *type;
19 void *context;
20};
21
22/*
23 * Information about a hardware handler type.  The create method
24 * constructs a hardware handler object and takes custom arguments.
25 */
26struct hw_handler_type {
27 char *name;
28 struct module *module;
29
30 int (*create) (struct hw_handler *handler, unsigned int argc,
31 char **argv);
32 void (*destroy) (struct hw_handler *hwh);
33
34 void (*pg_init) (struct hw_handler *hwh, unsigned bypassed,
35 struct path *path);
36 unsigned (*error) (struct hw_handler *hwh, struct bio *bio);
37 int (*status) (struct hw_handler *hwh, status_type_t type,
38 char *result, unsigned int maxlen);
39};
40
41/* Register a hardware handler */
42int dm_register_hw_handler(struct hw_handler_type *type);
43
44/* Unregister a hardware handler */
45int dm_unregister_hw_handler(struct hw_handler_type *type);
46
47/* Returns a registered hardware handler type */
48struct hw_handler_type *dm_get_hw_handler(const char *name);
49
50/* Releases a hardware handler */
51void dm_put_hw_handler(struct hw_handler_type *hwht);
52
53/* Default err function */
54unsigned dm_scsi_err_handler(struct hw_handler *hwh, struct bio *bio);
55
56/* Error flags for err and dm_pg_init_complete */
57#define MP_FAIL_PATH 1
58#define MP_BYPASS_PG 2
59#define MP_ERROR_IO 4 /* Don't retry this I/O */
60
61#endif
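For illustration, here is a minimal sketch of a handler that plugs into the registration interface above. The "example" name and the empty create/destroy bodies are hypothetical placeholders and are not part of this patch.

/*
 * Sketch of a do-nothing hardware handler built on the API above.
 * Everything named "example" is hypothetical.
 */
#include <linux/module.h>
#include <linux/init.h>
#include "dm-hw-handler.h"

static int example_create(struct hw_handler *hwh, unsigned int argc,
			  char **argv)
{
	hwh->context = NULL;	/* no per-handler state in this sketch */
	return 0;
}

static void example_destroy(struct hw_handler *hwh)
{
}

static struct hw_handler_type example_hwh = {
	.name    = "example",
	.module  = THIS_MODULE,
	.create  = example_create,
	.destroy = example_destroy,
	.error   = dm_scsi_err_handler,	/* fall back to the default decoder */
};

static int __init example_init(void)
{
	return dm_register_hw_handler(&example_hwh);
}

static void __exit example_exit(void)
{
	dm_unregister_hw_handler(&example_hwh);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
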
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
new file mode 100644
index 000000000000..45754bb6a799
--- /dev/null
+++ b/drivers/md/dm-io.c
@@ -0,0 +1,426 @@
1/*
2 * Copyright (C) 2003 Sistina Software
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm-io.h"
8
9#include <linux/bio.h>
10#include <linux/mempool.h>
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/slab.h>
14
15static struct bio_set *_bios;
16
17/* FIXME: can we shrink this ? */
18struct io {
19 unsigned long error;
20 atomic_t count;
21 struct task_struct *sleeper;
22 io_notify_fn callback;
23 void *context;
24};
25
26/*
27 * io contexts are only dynamically allocated for asynchronous
28 * io. Since async io is likely to be the majority of io, we'll
29 * have the same number of io contexts as buffer heads! (FIXME:
30 * must reduce this).
31 */
32static unsigned _num_ios;
33static mempool_t *_io_pool;
34
35static void *alloc_io(unsigned int __nocast gfp_mask, void *pool_data)
36{
37 return kmalloc(sizeof(struct io), gfp_mask);
38}
39
40static void free_io(void *element, void *pool_data)
41{
42 kfree(element);
43}
44
45static unsigned int pages_to_ios(unsigned int pages)
46{
47 return 4 * pages; /* too many ? */
48}
49
50static int resize_pool(unsigned int new_ios)
51{
52 int r = 0;
53
54 if (_io_pool) {
55 if (new_ios == 0) {
56 /* free off the pool */
57 mempool_destroy(_io_pool);
58 _io_pool = NULL;
59 bioset_free(_bios);
60
61 } else {
62 /* resize the pool */
63 r = mempool_resize(_io_pool, new_ios, GFP_KERNEL);
64 }
65
66 } else {
67 /* create new pool */
68 _io_pool = mempool_create(new_ios, alloc_io, free_io, NULL);
69 if (!_io_pool)
70 return -ENOMEM;
71
72 _bios = bioset_create(16, 16, 4);
73 if (!_bios) {
74 mempool_destroy(_io_pool);
75 _io_pool = NULL;
76 return -ENOMEM;
77 }
78 }
79
80 if (!r)
81 _num_ios = new_ios;
82
83 return r;
84}
85
86int dm_io_get(unsigned int num_pages)
87{
88 return resize_pool(_num_ios + pages_to_ios(num_pages));
89}
90
91void dm_io_put(unsigned int num_pages)
92{
93 resize_pool(_num_ios - pages_to_ios(num_pages));
94}
95
96/*-----------------------------------------------------------------
97 * We need to keep track of which region a bio is doing io for.
98 * In order to save a memory allocation we store this in the last
99 * bvec, which we know is unused (blech).
100 * XXX This is ugly and can OOPS with some configs... find another way.
101 *---------------------------------------------------------------*/
102static inline void bio_set_region(struct bio *bio, unsigned region)
103{
104 bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len = region;
105}
106
107static inline unsigned bio_get_region(struct bio *bio)
108{
109 return bio->bi_io_vec[bio->bi_max_vecs - 1].bv_len;
110}
111
112/*-----------------------------------------------------------------
113 * We need an io object to keep track of the number of bios that
114 * have been dispatched for a particular io.
115 *---------------------------------------------------------------*/
116static void dec_count(struct io *io, unsigned int region, int error)
117{
118 if (error)
119 set_bit(region, &io->error);
120
121 if (atomic_dec_and_test(&io->count)) {
122 if (io->sleeper)
123 wake_up_process(io->sleeper);
124
125 else {
126 int r = io->error;
127 io_notify_fn fn = io->callback;
128 void *context = io->context;
129
130 mempool_free(io, _io_pool);
131 fn(r, context);
132 }
133 }
134}
135
136static int endio(struct bio *bio, unsigned int done, int error)
137{
138 struct io *io = (struct io *) bio->bi_private;
139
140 /* keep going until we've finished */
141 if (bio->bi_size)
142 return 1;
143
144 if (error && bio_data_dir(bio) == READ)
145 zero_fill_bio(bio);
146
147 dec_count(io, bio_get_region(bio), error);
148 bio_put(bio);
149
150 return 0;
151}
152
153/*-----------------------------------------------------------------
154 * These little objects provide an abstraction for getting a new
155 * destination page for io.
156 *---------------------------------------------------------------*/
157struct dpages {
158 void (*get_page)(struct dpages *dp,
159 struct page **p, unsigned long *len, unsigned *offset);
160 void (*next_page)(struct dpages *dp);
161
162 unsigned context_u;
163 void *context_ptr;
164};
165
166/*
167 * Functions for getting the pages from a list.
168 */
169static void list_get_page(struct dpages *dp,
170 struct page **p, unsigned long *len, unsigned *offset)
171{
172 unsigned o = dp->context_u;
173 struct page_list *pl = (struct page_list *) dp->context_ptr;
174
175 *p = pl->page;
176 *len = PAGE_SIZE - o;
177 *offset = o;
178}
179
180static void list_next_page(struct dpages *dp)
181{
182 struct page_list *pl = (struct page_list *) dp->context_ptr;
183 dp->context_ptr = pl->next;
184 dp->context_u = 0;
185}
186
187static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offset)
188{
189 dp->get_page = list_get_page;
190 dp->next_page = list_next_page;
191 dp->context_u = offset;
192 dp->context_ptr = pl;
193}
194
195/*
196 * Functions for getting the pages from a bvec.
197 */
198static void bvec_get_page(struct dpages *dp,
199 struct page **p, unsigned long *len, unsigned *offset)
200{
201 struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
202 *p = bvec->bv_page;
203 *len = bvec->bv_len;
204 *offset = bvec->bv_offset;
205}
206
207static void bvec_next_page(struct dpages *dp)
208{
209 struct bio_vec *bvec = (struct bio_vec *) dp->context_ptr;
210 dp->context_ptr = bvec + 1;
211}
212
213static void bvec_dp_init(struct dpages *dp, struct bio_vec *bvec)
214{
215 dp->get_page = bvec_get_page;
216 dp->next_page = bvec_next_page;
217 dp->context_ptr = bvec;
218}
219
220static void vm_get_page(struct dpages *dp,
221 struct page **p, unsigned long *len, unsigned *offset)
222{
223 *p = vmalloc_to_page(dp->context_ptr);
224 *offset = dp->context_u;
225 *len = PAGE_SIZE - dp->context_u;
226}
227
228static void vm_next_page(struct dpages *dp)
229{
230 dp->context_ptr += PAGE_SIZE - dp->context_u;
231 dp->context_u = 0;
232}
233
234static void vm_dp_init(struct dpages *dp, void *data)
235{
236 dp->get_page = vm_get_page;
237 dp->next_page = vm_next_page;
238 dp->context_u = ((unsigned long) data) & (PAGE_SIZE - 1);
239 dp->context_ptr = data;
240}
241
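The three page sources above all follow the same shape. As an illustration only (not part of this patch), another source could be added the same way, for example one that hands out a single preallocated page over and over:

/*
 * Illustrative sketch: a dpages source that returns one fixed page
 * repeatedly, following the same pattern as the list/bvec/vm sources
 * above.  Unused here; names are hypothetical.
 */
static void single_page_get_page(struct dpages *dp, struct page **p,
				 unsigned long *len, unsigned *offset)
{
	*p = (struct page *) dp->context_ptr;
	*len = PAGE_SIZE;
	*offset = 0;
}

static void single_page_next_page(struct dpages *dp)
{
	/* same page every time; nothing to advance */
}

static void single_page_dp_init(struct dpages *dp, struct page *page)
{
	dp->get_page = single_page_get_page;
	dp->next_page = single_page_next_page;
	dp->context_u = 0;
	dp->context_ptr = page;
}
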
242/*-----------------------------------------------------------------
243 * IO routines that accept a list of pages.
244 *---------------------------------------------------------------*/
245static void do_region(int rw, unsigned int region, struct io_region *where,
246 struct dpages *dp, struct io *io)
247{
248 struct bio *bio;
249 struct page *page;
250 unsigned long len;
251 unsigned offset;
252 unsigned num_bvecs;
253 sector_t remaining = where->count;
254
255 while (remaining) {
256 /*
257 * Allocate a suitably sized bio; we add an extra
258 * bvec for bio_get/set_region().
259 */
260 num_bvecs = (remaining / (PAGE_SIZE >> 9)) + 2;
261 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, _bios);
262 bio->bi_sector = where->sector + (where->count - remaining);
263 bio->bi_bdev = where->bdev;
264 bio->bi_end_io = endio;
265 bio->bi_private = io;
266 bio_set_region(bio, region);
267
268 /*
269 * Try to add as many pages as possible.
270 */
271 while (remaining) {
272 dp->get_page(dp, &page, &len, &offset);
273 len = min(len, to_bytes(remaining));
274 if (!bio_add_page(bio, page, len, offset))
275 break;
276
277 offset = 0;
278 remaining -= to_sector(len);
279 dp->next_page(dp);
280 }
281
282 atomic_inc(&io->count);
283 submit_bio(rw, bio);
284 }
285}
286
287static void dispatch_io(int rw, unsigned int num_regions,
288 struct io_region *where, struct dpages *dp,
289 struct io *io, int sync)
290{
291 int i;
292 struct dpages old_pages = *dp;
293
294 if (sync)
295 rw |= (1 << BIO_RW_SYNC);
296
297 /*
298 * For multiple regions we need to be careful to rewind
299 * the dp object for each call to do_region.
300 */
301 for (i = 0; i < num_regions; i++) {
302 *dp = old_pages;
303 if (where[i].count)
304 do_region(rw, i, where + i, dp, io);
305 }
306
307 /*
308 * Drop the extra reference that we were holding to avoid
309 * the io being completed too early.
310 */
311 dec_count(io, 0, 0);
312}
313
314static int sync_io(unsigned int num_regions, struct io_region *where,
315 int rw, struct dpages *dp, unsigned long *error_bits)
316{
317 struct io io;
318
319 if (num_regions > 1 && rw != WRITE) {
320 WARN_ON(1);
321 return -EIO;
322 }
323
324 io.error = 0;
325 atomic_set(&io.count, 1); /* see dispatch_io() */
326 io.sleeper = current;
327
328 dispatch_io(rw, num_regions, where, dp, &io, 1);
329
330 while (1) {
331 set_current_state(TASK_UNINTERRUPTIBLE);
332
333 if (!atomic_read(&io.count) || signal_pending(current))
334 break;
335
336 io_schedule();
337 }
338 set_current_state(TASK_RUNNING);
339
340 if (atomic_read(&io.count))
341 return -EINTR;
342
343 *error_bits = io.error;
344 return io.error ? -EIO : 0;
345}
346
347static int async_io(unsigned int num_regions, struct io_region *where, int rw,
348 struct dpages *dp, io_notify_fn fn, void *context)
349{
350 struct io *io;
351
352 if (num_regions > 1 && rw != WRITE) {
353 WARN_ON(1);
354 fn(1, context);
355 return -EIO;
356 }
357
358 io = mempool_alloc(_io_pool, GFP_NOIO);
359 io->error = 0;
360 atomic_set(&io->count, 1); /* see dispatch_io() */
361 io->sleeper = NULL;
362 io->callback = fn;
363 io->context = context;
364
365 dispatch_io(rw, num_regions, where, dp, io, 0);
366 return 0;
367}
368
369int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
370 struct page_list *pl, unsigned int offset,
371 unsigned long *error_bits)
372{
373 struct dpages dp;
374 list_dp_init(&dp, pl, offset);
375 return sync_io(num_regions, where, rw, &dp, error_bits);
376}
377
378int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
379 struct bio_vec *bvec, unsigned long *error_bits)
380{
381 struct dpages dp;
382 bvec_dp_init(&dp, bvec);
383 return sync_io(num_regions, where, rw, &dp, error_bits);
384}
385
386int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
387 void *data, unsigned long *error_bits)
388{
389 struct dpages dp;
390 vm_dp_init(&dp, data);
391 return sync_io(num_regions, where, rw, &dp, error_bits);
392}
393
394int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
395 struct page_list *pl, unsigned int offset,
396 io_notify_fn fn, void *context)
397{
398 struct dpages dp;
399 list_dp_init(&dp, pl, offset);
400 return async_io(num_regions, where, rw, &dp, fn, context);
401}
402
403int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
404 struct bio_vec *bvec, io_notify_fn fn, void *context)
405{
406 struct dpages dp;
407 bvec_dp_init(&dp, bvec);
408 return async_io(num_regions, where, rw, &dp, fn, context);
409}
410
411int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
412 void *data, io_notify_fn fn, void *context)
413{
414 struct dpages dp;
415 vm_dp_init(&dp, data);
416 return async_io(num_regions, where, rw, &dp, fn, context);
417}
418
419EXPORT_SYMBOL(dm_io_get);
420EXPORT_SYMBOL(dm_io_put);
421EXPORT_SYMBOL(dm_io_sync);
422EXPORT_SYMBOL(dm_io_async);
423EXPORT_SYMBOL(dm_io_sync_bvec);
424EXPORT_SYMBOL(dm_io_async_bvec);
425EXPORT_SYMBOL(dm_io_sync_vm);
426EXPORT_SYMBOL(dm_io_async_vm);
diff --git a/drivers/md/dm-io.h b/drivers/md/dm-io.h
new file mode 100644
index 000000000000..1a77f3265706
--- /dev/null
+++ b/drivers/md/dm-io.h
@@ -0,0 +1,77 @@
1/*
2 * Copyright (C) 2003 Sistina Software
3 *
4 * This file is released under the GPL.
5 */
6
7#ifndef _DM_IO_H
8#define _DM_IO_H
9
10#include "dm.h"
11
12/* FIXME make this configurable */
13#define DM_MAX_IO_REGIONS 8
14
15struct io_region {
16 struct block_device *bdev;
17 sector_t sector;
18 sector_t count;
19};
20
21struct page_list {
22 struct page_list *next;
23 struct page *page;
24};
25
26
27/*
28 * 'error' is a bitset, with each bit indicating whether an error
29 * occurred doing io to the corresponding region.
30 */
31typedef void (*io_notify_fn)(unsigned long error, void *context);
32
33
34/*
35 * Before anyone uses the IO interface they should call
36 * dm_io_get(), specifying roughly how many pages they are
37 * expecting to perform io on concurrently.
38 *
39 * This function may block.
40 */
41int dm_io_get(unsigned int num_pages);
42void dm_io_put(unsigned int num_pages);
43
44/*
45 * Synchronous IO.
46 *
47 * Please ensure that the rw flag in the following functions is
48 * either READ or WRITE, i.e. we don't take READA. Any
49 * regions with a zero count field will be ignored.
50 */
51int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
52 struct page_list *pl, unsigned int offset,
53 unsigned long *error_bits);
54
55int dm_io_sync_bvec(unsigned int num_regions, struct io_region *where, int rw,
56 struct bio_vec *bvec, unsigned long *error_bits);
57
58int dm_io_sync_vm(unsigned int num_regions, struct io_region *where, int rw,
59 void *data, unsigned long *error_bits);
60
61/*
62 * Asynchronous IO.
63 *
64 * The 'where' array may be safely allocated on the stack since
65 * the function takes a copy.
66 */
67int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
68 struct page_list *pl, unsigned int offset,
69 io_notify_fn fn, void *context);
70
71int dm_io_async_bvec(unsigned int num_regions, struct io_region *where, int rw,
72 struct bio_vec *bvec, io_notify_fn fn, void *context);
73
74int dm_io_async_vm(unsigned int num_regions, struct io_region *where, int rw,
75 void *data, io_notify_fn fn, void *context);
76
77#endif
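As a usage sketch (not part of this patch), a caller might read a block into a vmalloc'd buffer through this interface roughly as below; the function name and its arguments are placeholders.

/*
 * Sketch of a synchronous read through dm-io.  'bdev' and 'data' are
 * placeholders supplied by the caller, and 'data' is assumed to be a
 * vmalloc'd buffer, since dm_io_sync_vm() uses vmalloc_to_page().
 */
static int example_read_block(struct block_device *bdev, void *data,
			      sector_t start, sector_t nr_sectors)
{
	struct io_region where;
	unsigned long error_bits;
	int r;

	/* reserve mempool entries for roughly this much concurrent io */
	r = dm_io_get(1);
	if (r)
		return r;

	where.bdev = bdev;
	where.sector = start;
	where.count = nr_sectors;

	/* one region, synchronous READ; each bit of error_bits maps to a region */
	r = dm_io_sync_vm(1, &where, READ, data, &error_bits);

	dm_io_put(1);
	return r;
}
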
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
new file mode 100644
index 000000000000..ee3c869d9701
--- /dev/null
+++ b/drivers/md/dm-ioctl.c
@@ -0,0 +1,1416 @@
1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm.h"
9
10#include <linux/module.h>
11#include <linux/vmalloc.h>
12#include <linux/miscdevice.h>
13#include <linux/init.h>
14#include <linux/wait.h>
15#include <linux/slab.h>
16#include <linux/devfs_fs_kernel.h>
17#include <linux/dm-ioctl.h>
18
19#include <asm/uaccess.h>
20
21#define DM_DRIVER_EMAIL "dm-devel@redhat.com"
22
23/*-----------------------------------------------------------------
24 * The ioctl interface needs to be able to look up devices by
25 * name or uuid.
26 *---------------------------------------------------------------*/
27struct hash_cell {
28 struct list_head name_list;
29 struct list_head uuid_list;
30
31 char *name;
32 char *uuid;
33 struct mapped_device *md;
34 struct dm_table *new_map;
35};
36
37struct vers_iter {
38 size_t param_size;
39 struct dm_target_versions *vers, *old_vers;
40 char *end;
41 uint32_t flags;
42};
43
44
45#define NUM_BUCKETS 64
46#define MASK_BUCKETS (NUM_BUCKETS - 1)
47static struct list_head _name_buckets[NUM_BUCKETS];
48static struct list_head _uuid_buckets[NUM_BUCKETS];
49
50static void dm_hash_remove_all(void);
51
52/*
53 * Guards access to both hash tables.
54 */
55static DECLARE_RWSEM(_hash_lock);
56
57static void init_buckets(struct list_head *buckets)
58{
59 unsigned int i;
60
61 for (i = 0; i < NUM_BUCKETS; i++)
62 INIT_LIST_HEAD(buckets + i);
63}
64
65static int dm_hash_init(void)
66{
67 init_buckets(_name_buckets);
68 init_buckets(_uuid_buckets);
69 devfs_mk_dir(DM_DIR);
70 return 0;
71}
72
73static void dm_hash_exit(void)
74{
75 dm_hash_remove_all();
76 devfs_remove(DM_DIR);
77}
78
79/*-----------------------------------------------------------------
80 * Hash function:
81 * We're not really concerned with the str hash function being
82 * fast since it's only used by the ioctl interface.
83 *---------------------------------------------------------------*/
84static unsigned int hash_str(const char *str)
85{
86 const unsigned int hash_mult = 2654435387U;
87 unsigned int h = 0;
88
89 while (*str)
90 h = (h + (unsigned int) *str++) * hash_mult;
91
92 return h & MASK_BUCKETS;
93}
94
95/*-----------------------------------------------------------------
96 * Code for looking up a device by name
97 *---------------------------------------------------------------*/
98static struct hash_cell *__get_name_cell(const char *str)
99{
100 struct hash_cell *hc;
101 unsigned int h = hash_str(str);
102
103 list_for_each_entry (hc, _name_buckets + h, name_list)
104 if (!strcmp(hc->name, str))
105 return hc;
106
107 return NULL;
108}
109
110static struct hash_cell *__get_uuid_cell(const char *str)
111{
112 struct hash_cell *hc;
113 unsigned int h = hash_str(str);
114
115 list_for_each_entry (hc, _uuid_buckets + h, uuid_list)
116 if (!strcmp(hc->uuid, str))
117 return hc;
118
119 return NULL;
120}
121
122/*-----------------------------------------------------------------
123 * Inserting, removing and renaming a device.
124 *---------------------------------------------------------------*/
125static inline char *kstrdup(const char *str)
126{
127 char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
128 if (r)
129 strcpy(r, str);
130 return r;
131}
132
133static struct hash_cell *alloc_cell(const char *name, const char *uuid,
134 struct mapped_device *md)
135{
136 struct hash_cell *hc;
137
138 hc = kmalloc(sizeof(*hc), GFP_KERNEL);
139 if (!hc)
140 return NULL;
141
142 hc->name = kstrdup(name);
143 if (!hc->name) {
144 kfree(hc);
145 return NULL;
146 }
147
148 if (!uuid)
149 hc->uuid = NULL;
150
151 else {
152 hc->uuid = kstrdup(uuid);
153 if (!hc->uuid) {
154 kfree(hc->name);
155 kfree(hc);
156 return NULL;
157 }
158 }
159
160 INIT_LIST_HEAD(&hc->name_list);
161 INIT_LIST_HEAD(&hc->uuid_list);
162 hc->md = md;
163 hc->new_map = NULL;
164 return hc;
165}
166
167static void free_cell(struct hash_cell *hc)
168{
169 if (hc) {
170 kfree(hc->name);
171 kfree(hc->uuid);
172 kfree(hc);
173 }
174}
175
176/*
177 * devfs stuff.
178 */
179static int register_with_devfs(struct hash_cell *hc)
180{
181 struct gendisk *disk = dm_disk(hc->md);
182
183 devfs_mk_bdev(MKDEV(disk->major, disk->first_minor),
184 S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
185 DM_DIR "/%s", hc->name);
186 return 0;
187}
188
189static int unregister_with_devfs(struct hash_cell *hc)
190{
191 devfs_remove(DM_DIR"/%s", hc->name);
192 return 0;
193}
194
195/*
196 * The kdev_t and uuid of a device can never change once it is
197 * initially inserted.
198 */
199static int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
200{
201 struct hash_cell *cell;
202
203 /*
204 * Allocate the new cells.
205 */
206 cell = alloc_cell(name, uuid, md);
207 if (!cell)
208 return -ENOMEM;
209
210 /*
211 * Insert the cell into both hash tables.
212 */
213 down_write(&_hash_lock);
214 if (__get_name_cell(name))
215 goto bad;
216
217 list_add(&cell->name_list, _name_buckets + hash_str(name));
218
219 if (uuid) {
220 if (__get_uuid_cell(uuid)) {
221 list_del(&cell->name_list);
222 goto bad;
223 }
224 list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
225 }
226 register_with_devfs(cell);
227 dm_get(md);
228 dm_set_mdptr(md, cell);
229 up_write(&_hash_lock);
230
231 return 0;
232
233 bad:
234 up_write(&_hash_lock);
235 free_cell(cell);
236 return -EBUSY;
237}
238
239static void __hash_remove(struct hash_cell *hc)
240{
241 /* remove from the dev hash */
242 list_del(&hc->uuid_list);
243 list_del(&hc->name_list);
244 unregister_with_devfs(hc);
245 dm_set_mdptr(hc->md, NULL);
246 dm_put(hc->md);
247 if (hc->new_map)
248 dm_table_put(hc->new_map);
249 free_cell(hc);
250}
251
252static void dm_hash_remove_all(void)
253{
254 int i;
255 struct hash_cell *hc;
256 struct list_head *tmp, *n;
257
258 down_write(&_hash_lock);
259 for (i = 0; i < NUM_BUCKETS; i++) {
260 list_for_each_safe (tmp, n, _name_buckets + i) {
261 hc = list_entry(tmp, struct hash_cell, name_list);
262 __hash_remove(hc);
263 }
264 }
265 up_write(&_hash_lock);
266}
267
268static int dm_hash_rename(const char *old, const char *new)
269{
270 char *new_name, *old_name;
271 struct hash_cell *hc;
272
273 /*
274 * duplicate new.
275 */
276 new_name = kstrdup(new);
277 if (!new_name)
278 return -ENOMEM;
279
280 down_write(&_hash_lock);
281
282 /*
283 * Is new free ?
284 */
285 hc = __get_name_cell(new);
286 if (hc) {
287 DMWARN("asked to rename to an already existing name %s -> %s",
288 old, new);
289 up_write(&_hash_lock);
290 kfree(new_name);
291 return -EBUSY;
292 }
293
294 /*
295 * Is there such a device as 'old' ?
296 */
297 hc = __get_name_cell(old);
298 if (!hc) {
299 DMWARN("asked to rename a non-existent device %s -> %s",
300 old, new);
301 up_write(&_hash_lock);
302 kfree(new_name);
303 return -ENXIO;
304 }
305
306 /*
307 * rename and move the name cell.
308 */
309 unregister_with_devfs(hc);
310
311 list_del(&hc->name_list);
312 old_name = hc->name;
313 hc->name = new_name;
314 list_add(&hc->name_list, _name_buckets + hash_str(new_name));
315
316 /* rename the device node in devfs */
317 register_with_devfs(hc);
318
319 up_write(&_hash_lock);
320 kfree(old_name);
321 return 0;
322}
323
324/*-----------------------------------------------------------------
325 * Implementation of the ioctl commands
326 *---------------------------------------------------------------*/
327/*
328 * All the ioctl commands get dispatched to functions with this
329 * prototype.
330 */
331typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
332
333static int remove_all(struct dm_ioctl *param, size_t param_size)
334{
335 dm_hash_remove_all();
336 param->data_size = 0;
337 return 0;
338}
339
340/*
341 * Round up the ptr to an 8-byte boundary.
342 */
343#define ALIGN_MASK 7
344static inline void *align_ptr(void *ptr)
345{
346 return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK);
347}
348
349/*
350 * Retrieves the data payload buffer from an already allocated
351 * struct dm_ioctl.
352 */
353static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
354 size_t *len)
355{
356 param->data_start = align_ptr(param + 1) - (void *) param;
357
358 if (param->data_start < param_size)
359 *len = param_size - param->data_start;
360 else
361 *len = 0;
362
363 return ((void *) param) + param->data_start;
364}
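In other words, the variable-length results are packed into the same buffer directly after the fixed-size struct dm_ioctl header, starting at the first 8-byte boundary. A tiny sketch of that arithmetic follows; the header size used here is made up, and the real value is sizeof(struct dm_ioctl).

/* Sketch of the payload layout produced by get_result_buffer(). */
#include <stdio.h>
#include <stddef.h>

#define ALIGN8(x) (((x) + 7) & ~(size_t)7)

int main(void)
{
	size_t header = 305;		/* hypothetical sizeof(struct dm_ioctl) */
	size_t param_size = 16384;	/* total buffer supplied by userspace */
	size_t data_start = ALIGN8(header);
	size_t payload_len = param_size > data_start ? param_size - data_start : 0;

	printf("data_start=%zu payload_len=%zu\n", data_start, payload_len);	/* 312, 16072 */
	return 0;
}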
365
366static int list_devices(struct dm_ioctl *param, size_t param_size)
367{
368 unsigned int i;
369 struct hash_cell *hc;
370 size_t len, needed = 0;
371 struct gendisk *disk;
372 struct dm_name_list *nl, *old_nl = NULL;
373
374 down_write(&_hash_lock);
375
376 /*
377 * Loop through all the devices working out how much
378 * space we need.
379 */
380 for (i = 0; i < NUM_BUCKETS; i++) {
381 list_for_each_entry (hc, _name_buckets + i, name_list) {
382 needed += sizeof(struct dm_name_list);
383 needed += strlen(hc->name) + 1;
384 needed += ALIGN_MASK;
385 }
386 }
387
388 /*
389 * Grab our output buffer.
390 */
391 nl = get_result_buffer(param, param_size, &len);
392 if (len < needed) {
393 param->flags |= DM_BUFFER_FULL_FLAG;
394 goto out;
395 }
396 param->data_size = param->data_start + needed;
397
398 nl->dev = 0; /* Flags no data */
399
400 /*
401 * Now loop through filling out the names.
402 */
403 for (i = 0; i < NUM_BUCKETS; i++) {
404 list_for_each_entry (hc, _name_buckets + i, name_list) {
405 if (old_nl)
406 old_nl->next = (uint32_t) ((void *) nl -
407 (void *) old_nl);
408 disk = dm_disk(hc->md);
409 nl->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor));
410 nl->next = 0;
411 strcpy(nl->name, hc->name);
412
413 old_nl = nl;
414 nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1);
415 }
416 }
417
418 out:
419 up_write(&_hash_lock);
420 return 0;
421}
422
423static void list_version_get_needed(struct target_type *tt, void *needed_param)
424{
425 size_t *needed = needed_param;
426
427 *needed += strlen(tt->name);
428 *needed += sizeof(tt->version);
429 *needed += ALIGN_MASK;
430}
431
432static void list_version_get_info(struct target_type *tt, void *param)
433{
434 struct vers_iter *info = param;
435
436 /* Check space - it might have changed since the first iteration */
437 if ((char *)info->vers + sizeof(tt->version) + strlen(tt->name) + 1 >
438 info->end) {
439
440 info->flags = DM_BUFFER_FULL_FLAG;
441 return;
442 }
443
444 if (info->old_vers)
445 info->old_vers->next = (uint32_t) ((void *)info->vers -
446 (void *)info->old_vers);
447 info->vers->version[0] = tt->version[0];
448 info->vers->version[1] = tt->version[1];
449 info->vers->version[2] = tt->version[2];
450 info->vers->next = 0;
451 strcpy(info->vers->name, tt->name);
452
453 info->old_vers = info->vers;
454 info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1);
455}
456
457static int list_versions(struct dm_ioctl *param, size_t param_size)
458{
459 size_t len, needed = 0;
460 struct dm_target_versions *vers;
461 struct vers_iter iter_info;
462
463 /*
464 * Loop through all the registered target types, working out
465 * how much space we need.
466 */
467 dm_target_iterate(list_version_get_needed, &needed);
468
469 /*
470 * Grab our output buffer.
471 */
472 vers = get_result_buffer(param, param_size, &len);
473 if (len < needed) {
474 param->flags |= DM_BUFFER_FULL_FLAG;
475 goto out;
476 }
477 param->data_size = param->data_start + needed;
478
479 iter_info.param_size = param_size;
480 iter_info.old_vers = NULL;
481 iter_info.vers = vers;
482 iter_info.flags = 0;
483 iter_info.end = (char *)vers+len;
484
485 /*
486 * Now loop through filling out the names & versions.
487 */
488 dm_target_iterate(list_version_get_info, &iter_info);
489 param->flags |= iter_info.flags;
490
491 out:
492 return 0;
493}
494
495
496
497static int check_name(const char *name)
498{
499 if (strchr(name, '/')) {
500 DMWARN("invalid device name");
501 return -EINVAL;
502 }
503
504 return 0;
505}
506
507/*
508 * Fills in a dm_ioctl structure, ready for sending back to
509 * userland.
510 */
511static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
512{
513 struct gendisk *disk = dm_disk(md);
514 struct dm_table *table;
515 struct block_device *bdev;
516
517 param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
518 DM_ACTIVE_PRESENT_FLAG);
519
520 if (dm_suspended(md))
521 param->flags |= DM_SUSPEND_FLAG;
522
523 param->dev = huge_encode_dev(MKDEV(disk->major, disk->first_minor));
524
525 if (!(param->flags & DM_SKIP_BDGET_FLAG)) {
526 bdev = bdget_disk(disk, 0);
527 if (!bdev)
528 return -ENXIO;
529
530 /*
531 * Yes, this will be out of date by the time it gets back
532 * to userland, but it is still very useful for
533 * debugging.
534 */
535 param->open_count = bdev->bd_openers;
536 bdput(bdev);
537 } else
538 param->open_count = -1;
539
540 if (disk->policy)
541 param->flags |= DM_READONLY_FLAG;
542
543 param->event_nr = dm_get_event_nr(md);
544
545 table = dm_get_table(md);
546 if (table) {
547 param->flags |= DM_ACTIVE_PRESENT_FLAG;
548 param->target_count = dm_table_get_num_targets(table);
549 dm_table_put(table);
550 } else
551 param->target_count = 0;
552
553 return 0;
554}
555
556static int dev_create(struct dm_ioctl *param, size_t param_size)
557{
558 int r;
559 struct mapped_device *md;
560
561 r = check_name(param->name);
562 if (r)
563 return r;
564
565 if (param->flags & DM_PERSISTENT_DEV_FLAG)
566 r = dm_create_with_minor(MINOR(huge_decode_dev(param->dev)), &md);
567 else
568 r = dm_create(&md);
569
570 if (r)
571 return r;
572
573 r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
574 if (r) {
575 dm_put(md);
576 return r;
577 }
578
579 param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
580
581 r = __dev_status(md, param);
582 dm_put(md);
583
584 return r;
585}
586
587/*
588 * Always use UUID for lookups if it's present, otherwise use name or dev.
589 */
590static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
591{
592 if (*param->uuid)
593 return __get_uuid_cell(param->uuid);
594 else if (*param->name)
595 return __get_name_cell(param->name);
596 else
597 return dm_get_mdptr(huge_decode_dev(param->dev));
598}
599
600static inline struct mapped_device *find_device(struct dm_ioctl *param)
601{
602 struct hash_cell *hc;
603 struct mapped_device *md = NULL;
604
605 down_read(&_hash_lock);
606 hc = __find_device_hash_cell(param);
607 if (hc) {
608 md = hc->md;
609 dm_get(md);
610
611 /*
612 * Sneakily write in both the name and the uuid
613 * while we have the cell.
614 */
615 strncpy(param->name, hc->name, sizeof(param->name));
616 if (hc->uuid)
617 strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1);
618 else
619 param->uuid[0] = '\0';
620
621 if (hc->new_map)
622 param->flags |= DM_INACTIVE_PRESENT_FLAG;
623 else
624 param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
625 }
626 up_read(&_hash_lock);
627
628 return md;
629}
630
631static int dev_remove(struct dm_ioctl *param, size_t param_size)
632{
633 struct hash_cell *hc;
634
635 down_write(&_hash_lock);
636 hc = __find_device_hash_cell(param);
637
638 if (!hc) {
639 DMWARN("device doesn't appear to be in the dev hash table.");
640 up_write(&_hash_lock);
641 return -ENXIO;
642 }
643
644 __hash_remove(hc);
645 up_write(&_hash_lock);
646 param->data_size = 0;
647 return 0;
648}
649
650/*
651 * Check a string doesn't overrun the chunk of
652 * memory we copied from userland.
653 */
654static int invalid_str(char *str, void *end)
655{
656 while ((void *) str < end)
657 if (!*str++)
658 return 0;
659
660 return -EINVAL;
661}
662
663static int dev_rename(struct dm_ioctl *param, size_t param_size)
664{
665 int r;
666 char *new_name = (char *) param + param->data_start;
667
668 if (new_name < (char *) (param + 1) ||
669 invalid_str(new_name, (void *) param + param_size)) {
670 DMWARN("Invalid new logical volume name supplied.");
671 return -EINVAL;
672 }
673
674 r = check_name(new_name);
675 if (r)
676 return r;
677
678 param->data_size = 0;
679 return dm_hash_rename(param->name, new_name);
680}
681
682static int do_suspend(struct dm_ioctl *param)
683{
684 int r = 0;
685 struct mapped_device *md;
686
687 md = find_device(param);
688 if (!md)
689 return -ENXIO;
690
691 if (!dm_suspended(md))
692 r = dm_suspend(md);
693
694 if (!r)
695 r = __dev_status(md, param);
696
697 dm_put(md);
698 return r;
699}
700
701static int do_resume(struct dm_ioctl *param)
702{
703 int r = 0;
704 struct hash_cell *hc;
705 struct mapped_device *md;
706 struct dm_table *new_map;
707
708 down_write(&_hash_lock);
709
710 hc = __find_device_hash_cell(param);
711 if (!hc) {
712 DMWARN("device doesn't appear to be in the dev hash table.");
713 up_write(&_hash_lock);
714 return -ENXIO;
715 }
716
717 md = hc->md;
718 dm_get(md);
719
720 new_map = hc->new_map;
721 hc->new_map = NULL;
722 param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
723
724 up_write(&_hash_lock);
725
726 /* Do we need to load a new map ? */
727 if (new_map) {
728 /* Suspend if it isn't already suspended */
729 if (!dm_suspended(md))
730 dm_suspend(md);
731
732 r = dm_swap_table(md, new_map);
733 if (r) {
734 dm_put(md);
735 dm_table_put(new_map);
736 return r;
737 }
738
739 if (dm_table_get_mode(new_map) & FMODE_WRITE)
740 set_disk_ro(dm_disk(md), 0);
741 else
742 set_disk_ro(dm_disk(md), 1);
743
744 dm_table_put(new_map);
745 }
746
747 if (dm_suspended(md))
748 r = dm_resume(md);
749
750 if (!r)
751 r = __dev_status(md, param);
752
753 dm_put(md);
754 return r;
755}
756
757/*
758 * Set or unset the suspension state of a device.
759 * If the device already is in the requested state we just return its status.
760 */
761static int dev_suspend(struct dm_ioctl *param, size_t param_size)
762{
763 if (param->flags & DM_SUSPEND_FLAG)
764 return do_suspend(param);
765
766 return do_resume(param);
767}
768
769/*
770 * Copies device info back to user space, used by
771 * the create and info ioctls.
772 */
773static int dev_status(struct dm_ioctl *param, size_t param_size)
774{
775 int r;
776 struct mapped_device *md;
777
778 md = find_device(param);
779 if (!md)
780 return -ENXIO;
781
782 r = __dev_status(md, param);
783 dm_put(md);
784 return r;
785}
786
787/*
788 * Build up the status struct for each target
789 */
790static void retrieve_status(struct dm_table *table,
791 struct dm_ioctl *param, size_t param_size)
792{
793 unsigned int i, num_targets;
794 struct dm_target_spec *spec;
795 char *outbuf, *outptr;
796 status_type_t type;
797 size_t remaining, len, used = 0;
798
799 outptr = outbuf = get_result_buffer(param, param_size, &len);
800
801 if (param->flags & DM_STATUS_TABLE_FLAG)
802 type = STATUSTYPE_TABLE;
803 else
804 type = STATUSTYPE_INFO;
805
806 /* Get all the target info */
807 num_targets = dm_table_get_num_targets(table);
808 for (i = 0; i < num_targets; i++) {
809 struct dm_target *ti = dm_table_get_target(table, i);
810
811 remaining = len - (outptr - outbuf);
812 if (remaining <= sizeof(struct dm_target_spec)) {
813 param->flags |= DM_BUFFER_FULL_FLAG;
814 break;
815 }
816
817 spec = (struct dm_target_spec *) outptr;
818
819 spec->status = 0;
820 spec->sector_start = ti->begin;
821 spec->length = ti->len;
822 strncpy(spec->target_type, ti->type->name,
823 sizeof(spec->target_type));
824
825 outptr += sizeof(struct dm_target_spec);
826 remaining = len - (outptr - outbuf);
827 if (remaining <= 0) {
828 param->flags |= DM_BUFFER_FULL_FLAG;
829 break;
830 }
831
832 /* Get the status/table string from the target driver */
833 if (ti->type->status) {
834 if (ti->type->status(ti, type, outptr, remaining)) {
835 param->flags |= DM_BUFFER_FULL_FLAG;
836 break;
837 }
838 } else
839 outptr[0] = '\0';
840
841 outptr += strlen(outptr) + 1;
842 used = param->data_start + (outptr - outbuf);
843
844 outptr = align_ptr(outptr);
845 spec->next = outptr - outbuf;
846 }
847
848 if (used)
849 param->data_size = used;
850
851 param->target_count = num_targets;
852}
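For reference, here is a hedged sketch of how a consumer of this buffer (for instance the userspace library) might walk the records that retrieve_status() packs, following the spec->next byte offsets. The function name is illustrative only, and it assumes DM_BUFFER_FULL_FLAG was not set.

/*
 * Illustrative walk over the output of retrieve_status(): each
 * dm_target_spec is followed immediately by its status string, and
 * spec->next holds the offset of the next record from the start of
 * the data area.  Not part of this patch.
 */
static void example_dump_status(struct dm_ioctl *param)
{
	char *outbuf = (char *) param + param->data_start;
	struct dm_target_spec *spec;
	uint32_t i, offset = 0;

	for (i = 0; i < param->target_count; i++) {
		spec = (struct dm_target_spec *) (outbuf + offset);
		printk("%s: start=%llu len=%llu status \"%s\"\n",
		       spec->target_type,
		       (unsigned long long) spec->sector_start,
		       (unsigned long long) spec->length,
		       (char *) (spec + 1));
		offset = spec->next;
	}
}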
853
854/*
855 * Wait for a device to report an event
856 */
857static int dev_wait(struct dm_ioctl *param, size_t param_size)
858{
859 int r;
860 struct mapped_device *md;
861 struct dm_table *table;
862
863 md = find_device(param);
864 if (!md)
865 return -ENXIO;
866
867 /*
868 * Wait for a notification event
869 */
870 if (dm_wait_event(md, param->event_nr)) {
871 r = -ERESTARTSYS;
872 goto out;
873 }
874
875 /*
876 * The userland program is going to want to know what
877 * changed to trigger the event, so we may as well tell
878 * it and save an ioctl.
879 */
880 r = __dev_status(md, param);
881 if (r)
882 goto out;
883
884 table = dm_get_table(md);
885 if (table) {
886 retrieve_status(table, param, param_size);
887 dm_table_put(table);
888 }
889
890 out:
891 dm_put(md);
892 return r;
893}
894
895static inline int get_mode(struct dm_ioctl *param)
896{
897 int mode = FMODE_READ | FMODE_WRITE;
898
899 if (param->flags & DM_READONLY_FLAG)
900 mode = FMODE_READ;
901
902 return mode;
903}
904
905static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
906 struct dm_target_spec **spec, char **target_params)
907{
908 *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
909 *target_params = (char *) (*spec + 1);
910
911 if (*spec < (last + 1))
912 return -EINVAL;
913
914 return invalid_str(*target_params, end);
915}
916
917static int populate_table(struct dm_table *table,
918 struct dm_ioctl *param, size_t param_size)
919{
920 int r;
921 unsigned int i = 0;
922 struct dm_target_spec *spec = (struct dm_target_spec *) param;
923 uint32_t next = param->data_start;
924 void *end = (void *) param + param_size;
925 char *target_params;
926
927 if (!param->target_count) {
928 DMWARN("populate_table: no targets specified");
929 return -EINVAL;
930 }
931
932 for (i = 0; i < param->target_count; i++) {
933
934 r = next_target(spec, next, end, &spec, &target_params);
935 if (r) {
936 DMWARN("unable to find target");
937 return r;
938 }
939
940 r = dm_table_add_target(table, spec->target_type,
941 (sector_t) spec->sector_start,
942 (sector_t) spec->length,
943 target_params);
944 if (r) {
945 DMWARN("error adding target to table");
946 return r;
947 }
948
949 next = spec->next;
950 }
951
952 return dm_table_complete(table);
953}
954
955static int table_load(struct dm_ioctl *param, size_t param_size)
956{
957 int r;
958 struct hash_cell *hc;
959 struct dm_table *t;
960
961 r = dm_table_create(&t, get_mode(param), param->target_count);
962 if (r)
963 return r;
964
965 r = populate_table(t, param, param_size);
966 if (r) {
967 dm_table_put(t);
968 return r;
969 }
970
971 down_write(&_hash_lock);
972 hc = __find_device_hash_cell(param);
973 if (!hc) {
974 DMWARN("device doesn't appear to be in the dev hash table.");
975 up_write(&_hash_lock);
976 return -ENXIO;
977 }
978
979 if (hc->new_map)
980 dm_table_put(hc->new_map);
981 hc->new_map = t;
982 param->flags |= DM_INACTIVE_PRESENT_FLAG;
983
984 r = __dev_status(hc->md, param);
985 up_write(&_hash_lock);
986 return r;
987}
988
989static int table_clear(struct dm_ioctl *param, size_t param_size)
990{
991 int r;
992 struct hash_cell *hc;
993
994 down_write(&_hash_lock);
995
996 hc = __find_device_hash_cell(param);
997 if (!hc) {
998 DMWARN("device doesn't appear to be in the dev hash table.");
999 up_write(&_hash_lock);
1000 return -ENXIO;
1001 }
1002
1003 if (hc->new_map) {
1004 dm_table_put(hc->new_map);
1005 hc->new_map = NULL;
1006 }
1007
1008 param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
1009
1010 r = __dev_status(hc->md, param);
1011 up_write(&_hash_lock);
1012 return r;
1013}
1014
1015/*
1016 * Retrieves a list of devices used by a particular dm device.
1017 */
1018static void retrieve_deps(struct dm_table *table,
1019 struct dm_ioctl *param, size_t param_size)
1020{
1021 unsigned int count = 0;
1022 struct list_head *tmp;
1023 size_t len, needed;
1024 struct dm_dev *dd;
1025 struct dm_target_deps *deps;
1026
1027 deps = get_result_buffer(param, param_size, &len);
1028
1029 /*
1030 * Count the devices.
1031 */
1032 list_for_each (tmp, dm_table_get_devices(table))
1033 count++;
1034
1035 /*
1036 * Check we have enough space.
1037 */
1038 needed = sizeof(*deps) + (sizeof(*deps->dev) * count);
1039 if (len < needed) {
1040 param->flags |= DM_BUFFER_FULL_FLAG;
1041 return;
1042 }
1043
1044 /*
1045 * Fill in the devices.
1046 */
1047 deps->count = count;
1048 count = 0;
1049 list_for_each_entry (dd, dm_table_get_devices(table), list)
1050 deps->dev[count++] = huge_encode_dev(dd->bdev->bd_dev);
1051
1052 param->data_size = param->data_start + needed;
1053}
1054
1055static int table_deps(struct dm_ioctl *param, size_t param_size)
1056{
1057 int r = 0;
1058 struct mapped_device *md;
1059 struct dm_table *table;
1060
1061 md = find_device(param);
1062 if (!md)
1063 return -ENXIO;
1064
1065 r = __dev_status(md, param);
1066 if (r)
1067 goto out;
1068
1069 table = dm_get_table(md);
1070 if (table) {
1071 retrieve_deps(table, param, param_size);
1072 dm_table_put(table);
1073 }
1074
1075 out:
1076 dm_put(md);
1077 return r;
1078}
1079
1080/*
1081 * Return the status of a device as a text string for each
1082 * target.
1083 */
1084static int table_status(struct dm_ioctl *param, size_t param_size)
1085{
1086 int r;
1087 struct mapped_device *md;
1088 struct dm_table *table;
1089
1090 md = find_device(param);
1091 if (!md)
1092 return -ENXIO;
1093
1094 r = __dev_status(md, param);
1095 if (r)
1096 goto out;
1097
1098 table = dm_get_table(md);
1099 if (table) {
1100 retrieve_status(table, param, param_size);
1101 dm_table_put(table);
1102 }
1103
1104 out:
1105 dm_put(md);
1106 return r;
1107}
1108
1109/*
1110 * Pass a message to the target that's at the supplied device offset.
1111 */
1112static int target_message(struct dm_ioctl *param, size_t param_size)
1113{
1114 int r, argc;
1115 char **argv;
1116 struct mapped_device *md;
1117 struct dm_table *table;
1118 struct dm_target *ti;
1119 struct dm_target_msg *tmsg = (void *) param + param->data_start;
1120
1121 md = find_device(param);
1122 if (!md)
1123 return -ENXIO;
1124
1125 r = __dev_status(md, param);
1126 if (r)
1127 goto out;
1128
1129 if (tmsg < (struct dm_target_msg *) (param + 1) ||
1130 invalid_str(tmsg->message, (void *) param + param_size)) {
1131 DMWARN("Invalid target message parameters.");
1132 r = -EINVAL;
1133 goto out;
1134 }
1135
1136 r = dm_split_args(&argc, &argv, tmsg->message);
1137 if (r) {
1138 DMWARN("Failed to split target message parameters");
1139 goto out;
1140 }
1141
1142 table = dm_get_table(md);
1143 if (!table)
1144 goto out_argv;
1145
1146 if (tmsg->sector >= dm_table_get_size(table)) {
1147 DMWARN("Target message sector outside device.");
1148 r = -EINVAL;
1149 goto out_table;
1150 }
1151
1152 ti = dm_table_find_target(table, tmsg->sector);
1153 if (ti->type->message)
1154 r = ti->type->message(ti, argc, argv);
1155 else {
1156 DMWARN("Target type does not support messages");
1157 r = -EINVAL;
1158 }
1159
1160 out_table:
1161 dm_table_put(table);
1162 out_argv:
1163 kfree(argv);
1164 out:
1165 param->data_size = 0;
1166 dm_put(md);
1167 return r;
1168}
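
/*
 * Illustrative note (editor's addition, not in the original source): the
 * payload at param->data_start is a struct dm_target_msg, i.e. a sector
 * offset into the dm device (used to pick the target via
 * dm_table_find_target()) followed by a NUL-terminated text string;
 * dm_split_args() turns that string into the argc/argv pair handed to
 * the target's message hook.  Userspace typically generates this via
 * "dmsetup message <device> <sector> <text...>".
 */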
1169
1170/*-----------------------------------------------------------------
1171 * Implementation of open/close/ioctl on the special char
1172 * device.
1173 *---------------------------------------------------------------*/
1174static ioctl_fn lookup_ioctl(unsigned int cmd)
1175{
1176 static struct {
1177 int cmd;
1178 ioctl_fn fn;
1179 } _ioctls[] = {
1180 {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */
1181 {DM_REMOVE_ALL_CMD, remove_all},
1182 {DM_LIST_DEVICES_CMD, list_devices},
1183
1184 {DM_DEV_CREATE_CMD, dev_create},
1185 {DM_DEV_REMOVE_CMD, dev_remove},
1186 {DM_DEV_RENAME_CMD, dev_rename},
1187 {DM_DEV_SUSPEND_CMD, dev_suspend},
1188 {DM_DEV_STATUS_CMD, dev_status},
1189 {DM_DEV_WAIT_CMD, dev_wait},
1190
1191 {DM_TABLE_LOAD_CMD, table_load},
1192 {DM_TABLE_CLEAR_CMD, table_clear},
1193 {DM_TABLE_DEPS_CMD, table_deps},
1194 {DM_TABLE_STATUS_CMD, table_status},
1195
1196 {DM_LIST_VERSIONS_CMD, list_versions},
1197
1198 {DM_TARGET_MSG_CMD, target_message}
1199 };
1200
1201 return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
1202}
1203
1204/*
1205 * As well as checking the version compatibility this always
1206 * copies the kernel interface version out.
1207 */
1208static int check_version(unsigned int cmd, struct dm_ioctl __user *user)
1209{
1210 uint32_t version[3];
1211 int r = 0;
1212
1213 if (copy_from_user(version, user->version, sizeof(version)))
1214 return -EFAULT;
1215
1216 if ((DM_VERSION_MAJOR != version[0]) ||
1217 (DM_VERSION_MINOR < version[1])) {
1218 DMWARN("ioctl interface mismatch: "
1219 "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
1220 DM_VERSION_MAJOR, DM_VERSION_MINOR,
1221 DM_VERSION_PATCHLEVEL,
1222 version[0], version[1], version[2], cmd);
1223 r = -EINVAL;
1224 }
1225
1226 /*
1227 * Fill in the kernel version.
1228 */
1229 version[0] = DM_VERSION_MAJOR;
1230 version[1] = DM_VERSION_MINOR;
1231 version[2] = DM_VERSION_PATCHLEVEL;
1232 if (copy_to_user(user->version, version, sizeof(version)))
1233 return -EFAULT;
1234
1235 return r;
1236}
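
A hedged sketch (editor's addition) of the userspace side of this handshake: before any DM ioctl the caller fills in its own interface version and a data_size covering the whole buffer, exactly as check_version() above and copy_params() below expect. init_dm_header() is a hypothetical helper, not a libdevmapper function.

#include <string.h>
#include <linux/dm-ioctl.h>

void init_dm_header(struct dm_ioctl *dmi, size_t total_size)
{
	memset(dmi, 0, total_size);
	dmi->version[0] = DM_VERSION_MAJOR;	/* major must match exactly */
	dmi->version[1] = DM_VERSION_MINOR;	/* minor must not exceed the kernel's */
	dmi->version[2] = 0;
	dmi->data_size = total_size;		/* copy_params() rejects < sizeof(*dmi) */
}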
1237
1238static void free_params(struct dm_ioctl *param)
1239{
1240 vfree(param);
1241}
1242
1243static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl **param)
1244{
1245 struct dm_ioctl tmp, *dmi;
1246
1247 if (copy_from_user(&tmp, user, sizeof(tmp)))
1248 return -EFAULT;
1249
1250 if (tmp.data_size < sizeof(tmp))
1251 return -EINVAL;
1252
1253 dmi = (struct dm_ioctl *) vmalloc(tmp.data_size);
1254 if (!dmi)
1255 return -ENOMEM;
1256
1257 if (copy_from_user(dmi, user, tmp.data_size)) {
1258 vfree(dmi);
1259 return -EFAULT;
1260 }
1261
1262 *param = dmi;
1263 return 0;
1264}
1265
1266static int validate_params(uint cmd, struct dm_ioctl *param)
1267{
1268 /* Always clear this flag */
1269 param->flags &= ~DM_BUFFER_FULL_FLAG;
1270
1271 /* Ignores parameters */
1272 if (cmd == DM_REMOVE_ALL_CMD ||
1273 cmd == DM_LIST_DEVICES_CMD ||
1274 cmd == DM_LIST_VERSIONS_CMD)
1275 return 0;
1276
1277 if ((cmd == DM_DEV_CREATE_CMD)) {
1278 if (!*param->name) {
1279 DMWARN("name not supplied when creating device");
1280 return -EINVAL;
1281 }
1282 } else if ((*param->uuid && *param->name)) {
1283 DMWARN("only supply one of name or uuid, cmd(%u)", cmd);
1284 return -EINVAL;
1285 }
1286
1287 /* Ensure strings are terminated */
1288 param->name[DM_NAME_LEN - 1] = '\0';
1289 param->uuid[DM_UUID_LEN - 1] = '\0';
1290
1291 return 0;
1292}
1293
1294static int ctl_ioctl(struct inode *inode, struct file *file,
1295 uint command, ulong u)
1296{
1297 int r = 0;
1298 unsigned int cmd;
1299 struct dm_ioctl *param;
1300 struct dm_ioctl __user *user = (struct dm_ioctl __user *) u;
1301 ioctl_fn fn = NULL;
1302 size_t param_size;
1303
1304 /* only root can play with this */
1305 if (!capable(CAP_SYS_ADMIN))
1306 return -EACCES;
1307
1308 if (_IOC_TYPE(command) != DM_IOCTL)
1309 return -ENOTTY;
1310
1311 cmd = _IOC_NR(command);
1312
1313 /*
1314 * Check the interface version passed in. This also
1315 * writes out the kernel's interface version.
1316 */
1317 r = check_version(cmd, user);
1318 if (r)
1319 return r;
1320
1321 /*
1322 * Nothing more to do for the version command.
1323 */
1324 if (cmd == DM_VERSION_CMD)
1325 return 0;
1326
1327 fn = lookup_ioctl(cmd);
1328 if (!fn) {
1329 DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
1330 return -ENOTTY;
1331 }
1332
1333 /*
1334 * Trying to avoid low memory issues when a device is
1335 * suspended.
1336 */
1337 current->flags |= PF_MEMALLOC;
1338
1339 /*
1340 * Copy the parameters into kernel space.
1341 */
1342 r = copy_params(user, &param);
1343 if (r) {
1344 current->flags &= ~PF_MEMALLOC;
1345 return r;
1346 }
1347
1348 /*
1349 * FIXME: eventually we will remove the PF_MEMALLOC flag
1350 * here. However the tools still do nasty things like
1351 * 'load' while a device is suspended.
1352 */
1353
1354 r = validate_params(cmd, param);
1355 if (r)
1356 goto out;
1357
1358 param_size = param->data_size;
1359 param->data_size = sizeof(*param);
1360 r = fn(param, param_size);
1361
1362 /*
1363 * Copy the results back to userland.
1364 */
1365 if (!r && copy_to_user(user, param, param->data_size))
1366 r = -EFAULT;
1367
1368 out:
1369 free_params(param);
1370 current->flags &= ~PF_MEMALLOC;
1371 return r;
1372}
1373
1374static struct file_operations _ctl_fops = {
1375 .ioctl = ctl_ioctl,
1376 .owner = THIS_MODULE,
1377};
1378
1379static struct miscdevice _dm_misc = {
1380 .minor = MISC_DYNAMIC_MINOR,
1381 .name = DM_NAME,
1382 .devfs_name = "mapper/control",
1383 .fops = &_ctl_fops
1384};
1385
1386/*
1387 * Create misc character device and link to DM_DIR/control.
1388 */
1389int __init dm_interface_init(void)
1390{
1391 int r;
1392
1393 r = dm_hash_init();
1394 if (r)
1395 return r;
1396
1397 r = misc_register(&_dm_misc);
1398 if (r) {
1399 DMERR("misc_register failed for control device");
1400 dm_hash_exit();
1401 return r;
1402 }
1403
1404 DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR,
1405 DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA,
1406 DM_DRIVER_EMAIL);
1407 return 0;
1408}
1409
1410void dm_interface_exit(void)
1411{
1412 if (misc_deregister(&_dm_misc) < 0)
1413 DMERR("misc_deregister failed for control device");
1414
1415 dm_hash_exit();
1416}
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
new file mode 100644
index 000000000000..6a2cd5dc8a63
--- /dev/null
+++ b/drivers/md/dm-linear.c
@@ -0,0 +1,123 @@
1/*
2 * Copyright (C) 2001-2003 Sistina Software (UK) Limited.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8
9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/blkdev.h>
12#include <linux/bio.h>
13#include <linux/slab.h>
14
15/*
16 * Linear: maps a linear range of a device.
17 */
18struct linear_c {
19 struct dm_dev *dev;
20 sector_t start;
21};
22
23/*
24 * Construct a linear mapping: <dev_path> <offset>
25 */
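/*
 * Illustrative example (editor's addition, not in the original source):
 * the table line
 *
 *     0 2048 linear /dev/sdb 0
 *
 * maps sectors 0..2047 of the new device onto /dev/sdb starting at
 * sector 0; "/dev/sdb" arrives here as argv[0] and "0" as argv[1].
 * Such a line is typically loaded with something like
 * "dmsetup create example --table '0 2048 linear /dev/sdb 0'".
 */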
26static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
27{
28 struct linear_c *lc;
29
30 if (argc != 2) {
31 ti->error = "dm-linear: Invalid argument count";
32 return -EINVAL;
33 }
34
35 lc = kmalloc(sizeof(*lc), GFP_KERNEL);
36 if (lc == NULL) {
37 ti->error = "dm-linear: Cannot allocate linear context";
38 return -ENOMEM;
39 }
40
41 if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) {
42 ti->error = "dm-linear: Invalid device sector";
43 goto bad;
44 }
45
46 if (dm_get_device(ti, argv[0], lc->start, ti->len,
47 dm_table_get_mode(ti->table), &lc->dev)) {
48 ti->error = "dm-linear: Device lookup failed";
49 goto bad;
50 }
51
52 ti->private = lc;
53 return 0;
54
55 bad:
56 kfree(lc);
57 return -EINVAL;
58}
59
60static void linear_dtr(struct dm_target *ti)
61{
62 struct linear_c *lc = (struct linear_c *) ti->private;
63
64 dm_put_device(ti, lc->dev);
65 kfree(lc);
66}
67
68static int linear_map(struct dm_target *ti, struct bio *bio,
69 union map_info *map_context)
70{
71 struct linear_c *lc = (struct linear_c *) ti->private;
72
73 bio->bi_bdev = lc->dev->bdev;
74 bio->bi_sector = lc->start + (bio->bi_sector - ti->begin);
75
76 return 1;
77}
78
79static int linear_status(struct dm_target *ti, status_type_t type,
80 char *result, unsigned int maxlen)
81{
82 struct linear_c *lc = (struct linear_c *) ti->private;
83
84 switch (type) {
85 case STATUSTYPE_INFO:
86 result[0] = '\0';
87 break;
88
89 case STATUSTYPE_TABLE:
90 snprintf(result, maxlen, "%s " SECTOR_FORMAT, lc->dev->name,
91 lc->start);
92 break;
93 }
94 return 0;
95}
96
97static struct target_type linear_target = {
98 .name = "linear",
99 .version= {1, 0, 1},
100 .module = THIS_MODULE,
101 .ctr = linear_ctr,
102 .dtr = linear_dtr,
103 .map = linear_map,
104 .status = linear_status,
105};
106
107int __init dm_linear_init(void)
108{
109 int r = dm_register_target(&linear_target);
110
111 if (r < 0)
112 DMERR("linear: register failed %d", r);
113
114 return r;
115}
116
117void dm_linear_exit(void)
118{
119 int r = dm_unregister_target(&linear_target);
120
121 if (r < 0)
122 DMERR("linear: unregister failed %d", r);
123}
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
new file mode 100644
index 000000000000..e110655eabdb
--- /dev/null
+++ b/drivers/md/dm-log.c
@@ -0,0 +1,711 @@
1/*
2 * Copyright (C) 2003 Sistina Software
3 *
4 * This file is released under the LGPL.
5 */
6
7#include <linux/init.h>
8#include <linux/slab.h>
9#include <linux/module.h>
10#include <linux/vmalloc.h>
11
12#include "dm-log.h"
13#include "dm-io.h"
14
15static LIST_HEAD(_log_types);
16static DEFINE_SPINLOCK(_lock);
17
18int dm_register_dirty_log_type(struct dirty_log_type *type)
19{
20 spin_lock(&_lock);
21 type->use_count = 0;
22 list_add(&type->list, &_log_types);
23 spin_unlock(&_lock);
24
25 return 0;
26}
27
28int dm_unregister_dirty_log_type(struct dirty_log_type *type)
29{
30 spin_lock(&_lock);
31
32 if (type->use_count)
33 DMWARN("Attempt to unregister a log type that is still in use");
34 else
35 list_del(&type->list);
36
37 spin_unlock(&_lock);
38
39 return 0;
40}
41
42static struct dirty_log_type *get_type(const char *type_name)
43{
44 struct dirty_log_type *type;
45
46 spin_lock(&_lock);
47 list_for_each_entry (type, &_log_types, list)
48 if (!strcmp(type_name, type->name)) {
49 if (!type->use_count && !try_module_get(type->module)){
50 spin_unlock(&_lock);
51 return NULL;
52 }
53 type->use_count++;
54 spin_unlock(&_lock);
55 return type;
56 }
57
58 spin_unlock(&_lock);
59 return NULL;
60}
61
62static void put_type(struct dirty_log_type *type)
63{
64 spin_lock(&_lock);
65 if (!--type->use_count)
66 module_put(type->module);
67 spin_unlock(&_lock);
68}
69
70struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti,
71 unsigned int argc, char **argv)
72{
73 struct dirty_log_type *type;
74 struct dirty_log *log;
75
76 log = kmalloc(sizeof(*log), GFP_KERNEL);
77 if (!log)
78 return NULL;
79
80 type = get_type(type_name);
81 if (!type) {
82 kfree(log);
83 return NULL;
84 }
85
86 log->type = type;
87 if (type->ctr(log, ti, argc, argv)) {
88 kfree(log);
89 put_type(type);
90 return NULL;
91 }
92
93 return log;
94}
95
96void dm_destroy_dirty_log(struct dirty_log *log)
97{
98 log->type->dtr(log);
99 put_type(log->type);
100 kfree(log);
101}
102
103/*-----------------------------------------------------------------
104 * Persistent and core logs share a lot of their implementation.
105 * FIXME: need a reload method to be called from a resume
106 *---------------------------------------------------------------*/
107/*
108 * Magic for persistent mirrors: "MiRr"
109 */
110#define MIRROR_MAGIC 0x4D695272
111
112/*
113 * The on-disk version of the metadata.
114 */
115#define MIRROR_DISK_VERSION 1
116#define LOG_OFFSET 2
117
118struct log_header {
119 uint32_t magic;
120
121 /*
122	 * Simple, incrementing version.  No backward
123 * compatibility.
124 */
125 uint32_t version;
126 sector_t nr_regions;
127};
128
129struct log_c {
130 struct dm_target *ti;
131 int touched;
132 uint32_t region_size;
133 unsigned int region_count;
134 region_t sync_count;
135
136 unsigned bitset_uint32_count;
137 uint32_t *clean_bits;
138 uint32_t *sync_bits;
139 uint32_t *recovering_bits; /* FIXME: this seems excessive */
140
141 int sync_search;
142
143 /* Resync flag */
144 enum sync {
145 DEFAULTSYNC, /* Synchronize if necessary */
146 NOSYNC, /* Devices known to be already in sync */
147 FORCESYNC, /* Force a sync to happen */
148 } sync;
149
150 /*
151 * Disk log fields
152 */
153 struct dm_dev *log_dev;
154 struct log_header header;
155
156 struct io_region header_location;
157 struct log_header *disk_header;
158
159 struct io_region bits_location;
160 uint32_t *disk_bits;
161};
162
163/*
164 * The touched member needs to be updated every time we access
165 * one of the bitsets.
166 */
167static inline int log_test_bit(uint32_t *bs, unsigned bit)
168{
169 return test_bit(bit, (unsigned long *) bs) ? 1 : 0;
170}
171
172static inline void log_set_bit(struct log_c *l,
173 uint32_t *bs, unsigned bit)
174{
175 set_bit(bit, (unsigned long *) bs);
176 l->touched = 1;
177}
178
179static inline void log_clear_bit(struct log_c *l,
180 uint32_t *bs, unsigned bit)
181{
182 clear_bit(bit, (unsigned long *) bs);
183 l->touched = 1;
184}
185
186/*----------------------------------------------------------------
187 * Header IO
188 *--------------------------------------------------------------*/
189static void header_to_disk(struct log_header *core, struct log_header *disk)
190{
191 disk->magic = cpu_to_le32(core->magic);
192 disk->version = cpu_to_le32(core->version);
193 disk->nr_regions = cpu_to_le64(core->nr_regions);
194}
195
196static void header_from_disk(struct log_header *core, struct log_header *disk)
197{
198 core->magic = le32_to_cpu(disk->magic);
199 core->version = le32_to_cpu(disk->version);
200 core->nr_regions = le64_to_cpu(disk->nr_regions);
201}
202
203static int read_header(struct log_c *log)
204{
205 int r;
206 unsigned long ebits;
207
208 r = dm_io_sync_vm(1, &log->header_location, READ,
209 log->disk_header, &ebits);
210 if (r)
211 return r;
212
213 header_from_disk(&log->header, log->disk_header);
214
215 /* New log required? */
216 if (log->sync != DEFAULTSYNC || log->header.magic != MIRROR_MAGIC) {
217 log->header.magic = MIRROR_MAGIC;
218 log->header.version = MIRROR_DISK_VERSION;
219 log->header.nr_regions = 0;
220 }
221
222 if (log->header.version != MIRROR_DISK_VERSION) {
223 DMWARN("incompatible disk log version");
224 return -EINVAL;
225 }
226
227 return 0;
228}
229
230static inline int write_header(struct log_c *log)
231{
232 unsigned long ebits;
233
234 header_to_disk(&log->header, log->disk_header);
235 return dm_io_sync_vm(1, &log->header_location, WRITE,
236 log->disk_header, &ebits);
237}
238
239/*----------------------------------------------------------------
240 * Bits IO
241 *--------------------------------------------------------------*/
242static inline void bits_to_core(uint32_t *core, uint32_t *disk, unsigned count)
243{
244 unsigned i;
245
246 for (i = 0; i < count; i++)
247 core[i] = le32_to_cpu(disk[i]);
248}
249
250static inline void bits_to_disk(uint32_t *core, uint32_t *disk, unsigned count)
251{
252 unsigned i;
253
254 /* copy across the clean/dirty bitset */
255 for (i = 0; i < count; i++)
256 disk[i] = cpu_to_le32(core[i]);
257}
258
259static int read_bits(struct log_c *log)
260{
261 int r;
262 unsigned long ebits;
263
264 r = dm_io_sync_vm(1, &log->bits_location, READ,
265 log->disk_bits, &ebits);
266 if (r)
267 return r;
268
269 bits_to_core(log->clean_bits, log->disk_bits,
270 log->bitset_uint32_count);
271 return 0;
272}
273
274static int write_bits(struct log_c *log)
275{
276 unsigned long ebits;
277 bits_to_disk(log->clean_bits, log->disk_bits,
278 log->bitset_uint32_count);
279 return dm_io_sync_vm(1, &log->bits_location, WRITE,
280 log->disk_bits, &ebits);
281}
282
283/*----------------------------------------------------------------
284 * core log constructor/destructor
285 *
286 * argv contains region_size followed optionally by [no]sync
287 *--------------------------------------------------------------*/
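/*
 * Illustrative example (editor's addition): for core_ctr() below,
 * argv = { "1024" } requests 1024-sector regions with the default
 * resync behaviour, while argv = { "1024", "nosync" } additionally
 * declares the mirror legs to be in sync already.
 */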
288#define BYTE_SHIFT 3
289static int core_ctr(struct dirty_log *log, struct dm_target *ti,
290 unsigned int argc, char **argv)
291{
292 enum sync sync = DEFAULTSYNC;
293
294 struct log_c *lc;
295 uint32_t region_size;
296 unsigned int region_count;
297 size_t bitset_size;
298
299 if (argc < 1 || argc > 2) {
300 DMWARN("wrong number of arguments to mirror log");
301 return -EINVAL;
302 }
303
304 if (argc > 1) {
305 if (!strcmp(argv[1], "sync"))
306 sync = FORCESYNC;
307 else if (!strcmp(argv[1], "nosync"))
308 sync = NOSYNC;
309 else {
310 DMWARN("unrecognised sync argument to mirror log: %s",
311 argv[1]);
312 return -EINVAL;
313 }
314 }
315
316 if (sscanf(argv[0], "%u", &region_size) != 1) {
317 DMWARN("invalid region size string");
318 return -EINVAL;
319 }
320
321 region_count = dm_sector_div_up(ti->len, region_size);
322
323 lc = kmalloc(sizeof(*lc), GFP_KERNEL);
324 if (!lc) {
325 DMWARN("couldn't allocate core log");
326 return -ENOMEM;
327 }
328
329 lc->ti = ti;
330 lc->touched = 0;
331 lc->region_size = region_size;
332 lc->region_count = region_count;
333 lc->sync = sync;
334
335 /*
336 * Work out how many words we need to hold the bitset.
337 */
338 bitset_size = dm_round_up(region_count,
339 sizeof(*lc->clean_bits) << BYTE_SHIFT);
340 bitset_size >>= BYTE_SHIFT;
341
342 lc->bitset_uint32_count = bitset_size / 4;
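	/*
	 * Worked example (editor's addition): with region_count = 1000 and
	 * 32-bit words, dm_round_up(1000, 32) = 1024 bits; shifting right
	 * by BYTE_SHIFT gives 128 bytes, so bitset_uint32_count = 32.
	 */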
343 lc->clean_bits = vmalloc(bitset_size);
344 if (!lc->clean_bits) {
345 DMWARN("couldn't allocate clean bitset");
346 kfree(lc);
347 return -ENOMEM;
348 }
349 memset(lc->clean_bits, -1, bitset_size);
350
351 lc->sync_bits = vmalloc(bitset_size);
352 if (!lc->sync_bits) {
353 DMWARN("couldn't allocate sync bitset");
354 vfree(lc->clean_bits);
355 kfree(lc);
356 return -ENOMEM;
357 }
358 memset(lc->sync_bits, (sync == NOSYNC) ? -1 : 0, bitset_size);
359 lc->sync_count = (sync == NOSYNC) ? region_count : 0;
360
361 lc->recovering_bits = vmalloc(bitset_size);
362 if (!lc->recovering_bits) {
363		DMWARN("couldn't allocate recovering bitset");
364 vfree(lc->sync_bits);
365 vfree(lc->clean_bits);
366 kfree(lc);
367 return -ENOMEM;
368 }
369 memset(lc->recovering_bits, 0, bitset_size);
370 lc->sync_search = 0;
371 log->context = lc;
372 return 0;
373}
374
375static void core_dtr(struct dirty_log *log)
376{
377 struct log_c *lc = (struct log_c *) log->context;
378 vfree(lc->clean_bits);
379 vfree(lc->sync_bits);
380 vfree(lc->recovering_bits);
381 kfree(lc);
382}
383
384/*----------------------------------------------------------------
385 * disk log constructor/destructor
386 *
387 * argv contains log_device region_size followed optionally by [no]sync
388 *--------------------------------------------------------------*/
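/*
 * Illustrative example (editor's addition): argv = { "/dev/sdc1", "1024" }
 * places the log on /dev/sdc1 with 1024-sector regions; everything after
 * the device argument is handed straight on to core_ctr() above.
 */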
389static int disk_ctr(struct dirty_log *log, struct dm_target *ti,
390 unsigned int argc, char **argv)
391{
392 int r;
393 size_t size;
394 struct log_c *lc;
395 struct dm_dev *dev;
396
397 if (argc < 2 || argc > 3) {
398 DMWARN("wrong number of arguments to disk mirror log");
399 return -EINVAL;
400 }
401
402 r = dm_get_device(ti, argv[0], 0, 0 /* FIXME */,
403 FMODE_READ | FMODE_WRITE, &dev);
404 if (r)
405 return r;
406
407 r = core_ctr(log, ti, argc - 1, argv + 1);
408 if (r) {
409 dm_put_device(ti, dev);
410 return r;
411 }
412
413 lc = (struct log_c *) log->context;
414 lc->log_dev = dev;
415
416 /* setup the disk header fields */
417 lc->header_location.bdev = lc->log_dev->bdev;
418 lc->header_location.sector = 0;
419 lc->header_location.count = 1;
420
421 /*
422 * We can't read less than this amount, even though we'll
423 * not be using most of this space.
424 */
425 lc->disk_header = vmalloc(1 << SECTOR_SHIFT);
426 if (!lc->disk_header)
427 goto bad;
428
429 /* setup the disk bitset fields */
430 lc->bits_location.bdev = lc->log_dev->bdev;
431 lc->bits_location.sector = LOG_OFFSET;
432
433 size = dm_round_up(lc->bitset_uint32_count * sizeof(uint32_t),
434 1 << SECTOR_SHIFT);
435 lc->bits_location.count = size >> SECTOR_SHIFT;
436 lc->disk_bits = vmalloc(size);
437 if (!lc->disk_bits) {
438 vfree(lc->disk_header);
439 goto bad;
440 }
441 return 0;
442
443 bad:
444 dm_put_device(ti, lc->log_dev);
445 core_dtr(log);
446 return -ENOMEM;
447}
448
449static void disk_dtr(struct dirty_log *log)
450{
451 struct log_c *lc = (struct log_c *) log->context;
452 dm_put_device(lc->ti, lc->log_dev);
453 vfree(lc->disk_header);
454 vfree(lc->disk_bits);
455 core_dtr(log);
456}
457
458static int count_bits32(uint32_t *addr, unsigned size)
459{
460 int count = 0, i;
461
462 for (i = 0; i < size; i++) {
463 count += hweight32(*(addr+i));
464 }
465 return count;
466}
467
468static int disk_resume(struct dirty_log *log)
469{
470 int r;
471 unsigned i;
472 struct log_c *lc = (struct log_c *) log->context;
473 size_t size = lc->bitset_uint32_count * sizeof(uint32_t);
474
475 /* read the disk header */
476 r = read_header(lc);
477 if (r)
478 return r;
479
480 /* read the bits */
481 r = read_bits(lc);
482 if (r)
483 return r;
484
485 /* set or clear any new bits */
486 if (lc->sync == NOSYNC)
487 for (i = lc->header.nr_regions; i < lc->region_count; i++)
488 /* FIXME: amazingly inefficient */
489 log_set_bit(lc, lc->clean_bits, i);
490 else
491 for (i = lc->header.nr_regions; i < lc->region_count; i++)
492 /* FIXME: amazingly inefficient */
493 log_clear_bit(lc, lc->clean_bits, i);
494
495 /* copy clean across to sync */
496 memcpy(lc->sync_bits, lc->clean_bits, size);
497 lc->sync_count = count_bits32(lc->clean_bits, lc->bitset_uint32_count);
498
499 /* write the bits */
500 r = write_bits(lc);
501 if (r)
502 return r;
503
504 /* set the correct number of regions in the header */
505 lc->header.nr_regions = lc->region_count;
506
507 /* write the new header */
508 return write_header(lc);
509}
510
511static uint32_t core_get_region_size(struct dirty_log *log)
512{
513 struct log_c *lc = (struct log_c *) log->context;
514 return lc->region_size;
515}
516
517static int core_is_clean(struct dirty_log *log, region_t region)
518{
519 struct log_c *lc = (struct log_c *) log->context;
520 return log_test_bit(lc->clean_bits, region);
521}
522
523static int core_in_sync(struct dirty_log *log, region_t region, int block)
524{
525 struct log_c *lc = (struct log_c *) log->context;
526 return log_test_bit(lc->sync_bits, region);
527}
528
529static int core_flush(struct dirty_log *log)
530{
531 /* no op */
532 return 0;
533}
534
535static int disk_flush(struct dirty_log *log)
536{
537 int r;
538 struct log_c *lc = (struct log_c *) log->context;
539
540 /* only write if the log has changed */
541 if (!lc->touched)
542 return 0;
543
544 r = write_bits(lc);
545 if (!r)
546 lc->touched = 0;
547
548 return r;
549}
550
551static void core_mark_region(struct dirty_log *log, region_t region)
552{
553 struct log_c *lc = (struct log_c *) log->context;
554 log_clear_bit(lc, lc->clean_bits, region);
555}
556
557static void core_clear_region(struct dirty_log *log, region_t region)
558{
559 struct log_c *lc = (struct log_c *) log->context;
560 log_set_bit(lc, lc->clean_bits, region);
561}
562
563static int core_get_resync_work(struct dirty_log *log, region_t *region)
564{
565 struct log_c *lc = (struct log_c *) log->context;
566
567 if (lc->sync_search >= lc->region_count)
568 return 0;
569
570 do {
571 *region = find_next_zero_bit((unsigned long *) lc->sync_bits,
572 lc->region_count,
573 lc->sync_search);
574 lc->sync_search = *region + 1;
575
576 if (*region == lc->region_count)
577 return 0;
578
579 } while (log_test_bit(lc->recovering_bits, *region));
580
581 log_set_bit(lc, lc->recovering_bits, *region);
582 return 1;
583}
584
585static void core_complete_resync_work(struct dirty_log *log, region_t region,
586 int success)
587{
588 struct log_c *lc = (struct log_c *) log->context;
589
590 log_clear_bit(lc, lc->recovering_bits, region);
591 if (success) {
592 log_set_bit(lc, lc->sync_bits, region);
593 lc->sync_count++;
594 }
595}
596
597static region_t core_get_sync_count(struct dirty_log *log)
598{
599 struct log_c *lc = (struct log_c *) log->context;
600
601 return lc->sync_count;
602}
603
604#define DMEMIT_SYNC \
605 if (lc->sync != DEFAULTSYNC) \
606 DMEMIT("%ssync ", lc->sync == NOSYNC ? "no" : "")
607
608static int core_status(struct dirty_log *log, status_type_t status,
609 char *result, unsigned int maxlen)
610{
611 int sz = 0;
612 struct log_c *lc = log->context;
613
614 switch(status) {
615 case STATUSTYPE_INFO:
616 break;
617
618 case STATUSTYPE_TABLE:
619 DMEMIT("%s %u %u ", log->type->name,
620 lc->sync == DEFAULTSYNC ? 1 : 2, lc->region_size);
621 DMEMIT_SYNC;
622 }
623
624 return sz;
625}
626
627static int disk_status(struct dirty_log *log, status_type_t status,
628 char *result, unsigned int maxlen)
629{
630 int sz = 0;
631 char buffer[16];
632 struct log_c *lc = log->context;
633
634 switch(status) {
635 case STATUSTYPE_INFO:
636 break;
637
638 case STATUSTYPE_TABLE:
639 format_dev_t(buffer, lc->log_dev->bdev->bd_dev);
640 DMEMIT("%s %u %s %u ", log->type->name,
641 lc->sync == DEFAULTSYNC ? 2 : 3, buffer,
642 lc->region_size);
643 DMEMIT_SYNC;
644 }
645
646 return sz;
647}
648
649static struct dirty_log_type _core_type = {
650 .name = "core",
651 .module = THIS_MODULE,
652 .ctr = core_ctr,
653 .dtr = core_dtr,
654 .get_region_size = core_get_region_size,
655 .is_clean = core_is_clean,
656 .in_sync = core_in_sync,
657 .flush = core_flush,
658 .mark_region = core_mark_region,
659 .clear_region = core_clear_region,
660 .get_resync_work = core_get_resync_work,
661 .complete_resync_work = core_complete_resync_work,
662 .get_sync_count = core_get_sync_count,
663 .status = core_status,
664};
665
666static struct dirty_log_type _disk_type = {
667 .name = "disk",
668 .module = THIS_MODULE,
669 .ctr = disk_ctr,
670 .dtr = disk_dtr,
671 .suspend = disk_flush,
672 .resume = disk_resume,
673 .get_region_size = core_get_region_size,
674 .is_clean = core_is_clean,
675 .in_sync = core_in_sync,
676 .flush = disk_flush,
677 .mark_region = core_mark_region,
678 .clear_region = core_clear_region,
679 .get_resync_work = core_get_resync_work,
680 .complete_resync_work = core_complete_resync_work,
681 .get_sync_count = core_get_sync_count,
682 .status = disk_status,
683};
684
685int __init dm_dirty_log_init(void)
686{
687 int r;
688
689 r = dm_register_dirty_log_type(&_core_type);
690 if (r)
691 DMWARN("couldn't register core log");
692
693 r = dm_register_dirty_log_type(&_disk_type);
694 if (r) {
695		DMWARN("couldn't register disk log type");
696 dm_unregister_dirty_log_type(&_core_type);
697 }
698
699 return r;
700}
701
702void dm_dirty_log_exit(void)
703{
704 dm_unregister_dirty_log_type(&_disk_type);
705 dm_unregister_dirty_log_type(&_core_type);
706}
707
708EXPORT_SYMBOL(dm_register_dirty_log_type);
709EXPORT_SYMBOL(dm_unregister_dirty_log_type);
710EXPORT_SYMBOL(dm_create_dirty_log);
711EXPORT_SYMBOL(dm_destroy_dirty_log);
diff --git a/drivers/md/dm-log.h b/drivers/md/dm-log.h
new file mode 100644
index 000000000000..5ae5309ebf28
--- /dev/null
+++ b/drivers/md/dm-log.h
@@ -0,0 +1,130 @@
1/*
2 * Copyright (C) 2003 Sistina Software
3 *
4 * This file is released under the LGPL.
5 */
6
7#ifndef DM_DIRTY_LOG
8#define DM_DIRTY_LOG
9
10#include "dm.h"
11
12typedef sector_t region_t;
13
14struct dirty_log_type;
15
16struct dirty_log {
17 struct dirty_log_type *type;
18 void *context;
19};
20
21struct dirty_log_type {
22 struct list_head list;
23 const char *name;
24 struct module *module;
25 unsigned int use_count;
26
27 int (*ctr)(struct dirty_log *log, struct dm_target *ti,
28 unsigned int argc, char **argv);
29 void (*dtr)(struct dirty_log *log);
30
31 /*
32 * There are times when we don't want the log to touch
33 * the disk.
34 */
35 int (*suspend)(struct dirty_log *log);
36 int (*resume)(struct dirty_log *log);
37
38 /*
39 * Retrieves the smallest size of region that the log can
40 * deal with.
41 */
42 uint32_t (*get_region_size)(struct dirty_log *log);
43
44 /*
45 * A predicate to say whether a region is clean or not.
46 * May block.
47 */
48 int (*is_clean)(struct dirty_log *log, region_t region);
49
50 /*
51 * Returns: 0, 1, -EWOULDBLOCK, < 0
52 *
53 * A predicate function to check the area given by
54 * [sector, sector + len) is in sync.
55 *
56 * If -EWOULDBLOCK is returned the state of the region is
57 * unknown, typically this will result in a read being
58 * passed to a daemon to deal with, since a daemon is
59 * allowed to block.
60 */
61 int (*in_sync)(struct dirty_log *log, region_t region, int can_block);
62
63 /*
64 * Flush the current log state (eg, to disk). This
65 * function may block.
66 */
67 int (*flush)(struct dirty_log *log);
68
69 /*
70 * Mark an area as clean or dirty. These functions may
71 * block, though for performance reasons blocking should
72 * be extremely rare (eg, allocating another chunk of
73 * memory for some reason).
74 */
75 void (*mark_region)(struct dirty_log *log, region_t region);
76 void (*clear_region)(struct dirty_log *log, region_t region);
77
78 /*
79 * Returns: <0 (error), 0 (no region), 1 (region)
80 *
81	 * The mirrord will need to perform recovery on regions of
82	 * the mirror that are in the NOSYNC state.  This
83	 * function asks the log to tell the caller about the
84	 * next region that this machine should recover.
85	 *
86	 * Do not confuse this function with 'in_sync()': one
87	 * tells you if an area is synchronised, the other
88	 * assigns recovery work.
89 */
90 int (*get_resync_work)(struct dirty_log *log, region_t *region);
91
92 /*
93 * This notifies the log that the resync of an area has
94 * been completed. The log should then mark this region
95 * as CLEAN.
96 */
97 void (*complete_resync_work)(struct dirty_log *log,
98 region_t region, int success);
99
100 /*
101 * Returns the number of regions that are in sync.
102 */
103 region_t (*get_sync_count)(struct dirty_log *log);
104
105 /*
106 * Support function for mirror status requests.
107 */
108 int (*status)(struct dirty_log *log, status_type_t status_type,
109 char *result, unsigned int maxlen);
110};
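
A rough sketch (editor's addition, illustrative only) of what an out-of-tree log type might look like against this interface; real state would live in log->context, and a usable type must supply every op that dm-raid1 calls. The "core" and "disk" types in dm-log.c are the authoritative examples.

#if 0	/* sketch only, never built */
static int example_ctr(struct dirty_log *log, struct dm_target *ti,
		       unsigned int argc, char **argv)
{
	log->context = NULL;		/* private per-log state goes here */
	return 0;
}

static void example_dtr(struct dirty_log *log)
{
	/* free whatever example_ctr() allocated */
}

static struct dirty_log_type example_log_type = {
	.name	= "example",
	.module	= THIS_MODULE,
	.ctr	= example_ctr,
	.dtr	= example_dtr,
	/* .get_region_size, .is_clean, .in_sync, ... as documented above */
};

/* registered/unregistered with dm_register_dirty_log_type() and
 * dm_unregister_dirty_log_type() from the module's init/exit hooks */
#endif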
111
112int dm_register_dirty_log_type(struct dirty_log_type *type);
113int dm_unregister_dirty_log_type(struct dirty_log_type *type);
114
115
116/*
117 * Make sure you use these two functions, rather than calling
118 * type->constructor/destructor() directly.
119 */
120struct dirty_log *dm_create_dirty_log(const char *type_name, struct dm_target *ti,
121 unsigned int argc, char **argv);
122void dm_destroy_dirty_log(struct dirty_log *log);
123
124/*
125 * init/exit functions.
126 */
127int dm_dirty_log_init(void);
128void dm_dirty_log_exit(void);
129
130#endif
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
new file mode 100644
index 000000000000..43763a0bd096
--- /dev/null
+++ b/drivers/md/dm-mpath.c
@@ -0,0 +1,1302 @@
1/*
2 * Copyright (C) 2003 Sistina Software Limited.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm.h"
9#include "dm-path-selector.h"
10#include "dm-hw-handler.h"
11#include "dm-bio-list.h"
12#include "dm-bio-record.h"
13
14#include <linux/ctype.h>
15#include <linux/init.h>
16#include <linux/mempool.h>
17#include <linux/module.h>
18#include <linux/pagemap.h>
19#include <linux/slab.h>
20#include <linux/time.h>
21#include <linux/workqueue.h>
22#include <asm/atomic.h>
23
24#define MESG_STR(x) x, sizeof(x)
25
26/* Path properties */
27struct pgpath {
28 struct list_head list;
29
30 struct priority_group *pg; /* Owning PG */
31 unsigned fail_count; /* Cumulative failure count */
32
33 struct path path;
34};
35
36#define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
37
38/*
39 * Paths are grouped into Priority Groups and numbered from 1 upwards.
40 * Each has a path selector which controls which path gets used.
41 */
42struct priority_group {
43 struct list_head list;
44
45 struct multipath *m; /* Owning multipath instance */
46 struct path_selector ps;
47
48 unsigned pg_num; /* Reference number */
49 unsigned bypassed; /* Temporarily bypass this PG? */
50
51 unsigned nr_pgpaths; /* Number of paths in PG */
52 struct list_head pgpaths;
53};
54
55/* Multipath context */
56struct multipath {
57 struct list_head list;
58 struct dm_target *ti;
59
60 spinlock_t lock;
61
62 struct hw_handler hw_handler;
63 unsigned nr_priority_groups;
64 struct list_head priority_groups;
65 unsigned pg_init_required; /* pg_init needs calling? */
66
67 unsigned nr_valid_paths; /* Total number of usable paths */
68 struct pgpath *current_pgpath;
69 struct priority_group *current_pg;
70 struct priority_group *next_pg; /* Switch to this PG if set */
71 unsigned repeat_count; /* I/Os left before calling PS again */
72
73 unsigned queue_io; /* Must we queue all I/O? */
74 unsigned queue_if_no_path; /* Queue I/O if last path fails? */
75 unsigned suspended; /* Has dm core suspended our I/O? */
76
77 struct work_struct process_queued_ios;
78 struct bio_list queued_ios;
79 unsigned queue_size;
80
81 struct work_struct trigger_event;
82
83 /*
84 * We must use a mempool of mpath_io structs so that we
85 * can resubmit bios on error.
86 */
87 mempool_t *mpio_pool;
88};
89
90/*
91 * Context information attached to each bio we process.
92 */
93struct mpath_io {
94 struct pgpath *pgpath;
95 struct dm_bio_details details;
96};
97
98typedef int (*action_fn) (struct pgpath *pgpath);
99
100#define MIN_IOS 256 /* Mempool size */
101
102static kmem_cache_t *_mpio_cache;
103
104static void process_queued_ios(void *data);
105static void trigger_event(void *data);
106
107
108/*-----------------------------------------------
109 * Allocation routines
110 *-----------------------------------------------*/
111
112static struct pgpath *alloc_pgpath(void)
113{
114 struct pgpath *pgpath = kmalloc(sizeof(*pgpath), GFP_KERNEL);
115
116 if (pgpath) {
117 memset(pgpath, 0, sizeof(*pgpath));
118 pgpath->path.is_active = 1;
119 }
120
121 return pgpath;
122}
123
124static inline void free_pgpath(struct pgpath *pgpath)
125{
126 kfree(pgpath);
127}
128
129static struct priority_group *alloc_priority_group(void)
130{
131 struct priority_group *pg;
132
133 pg = kmalloc(sizeof(*pg), GFP_KERNEL);
134 if (!pg)
135 return NULL;
136
137 memset(pg, 0, sizeof(*pg));
138 INIT_LIST_HEAD(&pg->pgpaths);
139
140 return pg;
141}
142
143static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
144{
145 struct pgpath *pgpath, *tmp;
146
147 list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
148 list_del(&pgpath->list);
149 dm_put_device(ti, pgpath->path.dev);
150 free_pgpath(pgpath);
151 }
152}
153
154static void free_priority_group(struct priority_group *pg,
155 struct dm_target *ti)
156{
157 struct path_selector *ps = &pg->ps;
158
159 if (ps->type) {
160 ps->type->destroy(ps);
161 dm_put_path_selector(ps->type);
162 }
163
164 free_pgpaths(&pg->pgpaths, ti);
165 kfree(pg);
166}
167
168static struct multipath *alloc_multipath(void)
169{
170 struct multipath *m;
171
172 m = kmalloc(sizeof(*m), GFP_KERNEL);
173 if (m) {
174 memset(m, 0, sizeof(*m));
175 INIT_LIST_HEAD(&m->priority_groups);
176 spin_lock_init(&m->lock);
177 m->queue_io = 1;
178 INIT_WORK(&m->process_queued_ios, process_queued_ios, m);
179 INIT_WORK(&m->trigger_event, trigger_event, m);
180 m->mpio_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
181 mempool_free_slab, _mpio_cache);
182 if (!m->mpio_pool) {
183 kfree(m);
184 return NULL;
185 }
186 }
187
188 return m;
189}
190
191static void free_multipath(struct multipath *m)
192{
193 struct priority_group *pg, *tmp;
194 struct hw_handler *hwh = &m->hw_handler;
195
196 list_for_each_entry_safe(pg, tmp, &m->priority_groups, list) {
197 list_del(&pg->list);
198 free_priority_group(pg, m->ti);
199 }
200
201 if (hwh->type) {
202 hwh->type->destroy(hwh);
203 dm_put_hw_handler(hwh->type);
204 }
205
206 mempool_destroy(m->mpio_pool);
207 kfree(m);
208}
209
210
211/*-----------------------------------------------
212 * Path selection
213 *-----------------------------------------------*/
214
215static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
216{
217 struct hw_handler *hwh = &m->hw_handler;
218
219 m->current_pg = pgpath->pg;
220
221 /* Must we initialise the PG first, and queue I/O till it's ready? */
222 if (hwh->type && hwh->type->pg_init) {
223 m->pg_init_required = 1;
224 m->queue_io = 1;
225 } else {
226 m->pg_init_required = 0;
227 m->queue_io = 0;
228 }
229}
230
231static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
232{
233 struct path *path;
234
235 path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
236 if (!path)
237 return -ENXIO;
238
239 m->current_pgpath = path_to_pgpath(path);
240
241 if (m->current_pg != pg)
242 __switch_pg(m, m->current_pgpath);
243
244 return 0;
245}
246
247static void __choose_pgpath(struct multipath *m)
248{
249 struct priority_group *pg;
250 unsigned bypassed = 1;
251
252 if (!m->nr_valid_paths)
253 goto failed;
254
255 /* Were we instructed to switch PG? */
256 if (m->next_pg) {
257 pg = m->next_pg;
258 m->next_pg = NULL;
259 if (!__choose_path_in_pg(m, pg))
260 return;
261 }
262
263 /* Don't change PG until it has no remaining paths */
264 if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
265 return;
266
267 /*
268 * Loop through priority groups until we find a valid path.
269 * First time we skip PGs marked 'bypassed'.
270 * Second time we only try the ones we skipped.
271 */
272 do {
273 list_for_each_entry(pg, &m->priority_groups, list) {
274 if (pg->bypassed == bypassed)
275 continue;
276 if (!__choose_path_in_pg(m, pg))
277 return;
278 }
279 } while (bypassed--);
280
281failed:
282 m->current_pgpath = NULL;
283 m->current_pg = NULL;
284}
285
286static int map_io(struct multipath *m, struct bio *bio, struct mpath_io *mpio,
287 unsigned was_queued)
288{
289 int r = 1;
290 unsigned long flags;
291 struct pgpath *pgpath;
292
293 spin_lock_irqsave(&m->lock, flags);
294
295 /* Do we need to select a new pgpath? */
296 if (!m->current_pgpath ||
297 (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
298 __choose_pgpath(m);
299
300 pgpath = m->current_pgpath;
301
302 if (was_queued)
303 m->queue_size--;
304
305 if ((pgpath && m->queue_io) ||
306 (!pgpath && m->queue_if_no_path && !m->suspended)) {
307 /* Queue for the daemon to resubmit */
308 bio_list_add(&m->queued_ios, bio);
309 m->queue_size++;
310 if (m->pg_init_required || !m->queue_io)
311 schedule_work(&m->process_queued_ios);
312 pgpath = NULL;
313 r = 0;
314 } else if (!pgpath)
315 r = -EIO; /* Failed */
316 else
317 bio->bi_bdev = pgpath->path.dev->bdev;
318
319 mpio->pgpath = pgpath;
320
321 spin_unlock_irqrestore(&m->lock, flags);
322
323 return r;
324}
325
326/*
327 * If we run out of usable paths, should we queue I/O or error it?
328 */
329static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path)
330{
331 unsigned long flags;
332
333 spin_lock_irqsave(&m->lock, flags);
334
335 m->queue_if_no_path = queue_if_no_path;
336 if (!m->queue_if_no_path)
337 schedule_work(&m->process_queued_ios);
338
339 spin_unlock_irqrestore(&m->lock, flags);
340
341 return 0;
342}
343
344/*-----------------------------------------------------------------
345 * The multipath daemon is responsible for resubmitting queued ios.
346 *---------------------------------------------------------------*/
347
348static void dispatch_queued_ios(struct multipath *m)
349{
350 int r;
351 unsigned long flags;
352 struct bio *bio = NULL, *next;
353 struct mpath_io *mpio;
354 union map_info *info;
355
356 spin_lock_irqsave(&m->lock, flags);
357 bio = bio_list_get(&m->queued_ios);
358 spin_unlock_irqrestore(&m->lock, flags);
359
360 while (bio) {
361 next = bio->bi_next;
362 bio->bi_next = NULL;
363
364 info = dm_get_mapinfo(bio);
365 mpio = info->ptr;
366
367 r = map_io(m, bio, mpio, 1);
368 if (r < 0)
369 bio_endio(bio, bio->bi_size, r);
370 else if (r == 1)
371 generic_make_request(bio);
372
373 bio = next;
374 }
375}
376
377static void process_queued_ios(void *data)
378{
379 struct multipath *m = (struct multipath *) data;
380 struct hw_handler *hwh = &m->hw_handler;
381 struct pgpath *pgpath;
382 unsigned init_required, must_queue = 0;
383 unsigned long flags;
384
385 spin_lock_irqsave(&m->lock, flags);
386
387 if (!m->current_pgpath)
388 __choose_pgpath(m);
389
390 pgpath = m->current_pgpath;
391
392 if ((pgpath && m->queue_io) ||
393 (!pgpath && m->queue_if_no_path && !m->suspended))
394 must_queue = 1;
395
396 init_required = m->pg_init_required;
397 if (init_required)
398 m->pg_init_required = 0;
399
400 spin_unlock_irqrestore(&m->lock, flags);
401
402 if (init_required)
403 hwh->type->pg_init(hwh, pgpath->pg->bypassed, &pgpath->path);
404
405 if (!must_queue)
406 dispatch_queued_ios(m);
407}
408
409/*
410 * An event is triggered whenever a path is taken out of use.
411 * Includes path failure and PG bypass.
412 */
413static void trigger_event(void *data)
414{
415 struct multipath *m = (struct multipath *) data;
416
417 dm_table_event(m->ti->table);
418}
419
420/*-----------------------------------------------------------------
421 * Constructor/argument parsing:
422 * <#multipath feature args> [<arg>]*
423 * <#hw_handler args> [hw_handler [<arg>]*]
424 * <#priority groups>
425 * <initial priority group>
426 * [<selector> <#selector args> [<arg>]*
427 * <#paths> <#per-path selector args>
428 * [<path> [<arg>]* ]+ ]+
429 *---------------------------------------------------------------*/
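/*
 * Illustrative example (editor's addition, not in the original source):
 *
 *     0 1024 multipath 0 0 2 1 round-robin 0 2 1 8:16 1000 8:32 1000
 *                              round-robin 0 1 1 8:48 1000
 *
 * i.e. no feature args, no hardware handler, two priority groups with
 * group 1 tried first; each group uses the round-robin selector with no
 * selector args, and every path ("8:16", "8:32", "8:48") is followed by
 * one per-path selector arg (here "1000", a per-path repeat count for
 * the round-robin selector).  The leading "0 1024 multipath" - start,
 * length, target name - is consumed before multipath_ctr() sees its
 * arguments.
 */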
430struct param {
431 unsigned min;
432 unsigned max;
433 char *error;
434};
435
436#define ESTR(s) ("dm-multipath: " s)
437
438static int read_param(struct param *param, char *str, unsigned *v, char **error)
439{
440 if (!str ||
441 (sscanf(str, "%u", v) != 1) ||
442 (*v < param->min) ||
443 (*v > param->max)) {
444 *error = param->error;
445 return -EINVAL;
446 }
447
448 return 0;
449}
450
451struct arg_set {
452 unsigned argc;
453 char **argv;
454};
455
456static char *shift(struct arg_set *as)
457{
458 char *r;
459
460 if (as->argc) {
461 as->argc--;
462 r = *as->argv;
463 as->argv++;
464 return r;
465 }
466
467 return NULL;
468}
469
470static void consume(struct arg_set *as, unsigned n)
471{
472 BUG_ON (as->argc < n);
473 as->argc -= n;
474 as->argv += n;
475}
476
477static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
478 struct dm_target *ti)
479{
480 int r;
481 struct path_selector_type *pst;
482 unsigned ps_argc;
483
484 static struct param _params[] = {
485 {0, 1024, ESTR("invalid number of path selector args")},
486 };
487
488 pst = dm_get_path_selector(shift(as));
489 if (!pst) {
490 ti->error = ESTR("unknown path selector type");
491 return -EINVAL;
492 }
493
494 r = read_param(_params, shift(as), &ps_argc, &ti->error);
495 if (r)
496 return -EINVAL;
497
498 r = pst->create(&pg->ps, ps_argc, as->argv);
499 if (r) {
500 dm_put_path_selector(pst);
501 ti->error = ESTR("path selector constructor failed");
502 return r;
503 }
504
505 pg->ps.type = pst;
506 consume(as, ps_argc);
507
508 return 0;
509}
510
511static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
512 struct dm_target *ti)
513{
514 int r;
515 struct pgpath *p;
516
517 /* we need at least a path arg */
518 if (as->argc < 1) {
519 ti->error = ESTR("no device given");
520 return NULL;
521 }
522
523 p = alloc_pgpath();
524 if (!p)
525 return NULL;
526
527 r = dm_get_device(ti, shift(as), ti->begin, ti->len,
528 dm_table_get_mode(ti->table), &p->path.dev);
529 if (r) {
530 ti->error = ESTR("error getting device");
531 goto bad;
532 }
533
534 r = ps->type->add_path(ps, &p->path, as->argc, as->argv, &ti->error);
535 if (r) {
536 dm_put_device(ti, p->path.dev);
537 goto bad;
538 }
539
540 return p;
541
542 bad:
543 free_pgpath(p);
544 return NULL;
545}
546
547static struct priority_group *parse_priority_group(struct arg_set *as,
548 struct multipath *m,
549 struct dm_target *ti)
550{
551 static struct param _params[] = {
552 {1, 1024, ESTR("invalid number of paths")},
553 {0, 1024, ESTR("invalid number of selector args")}
554 };
555
556 int r;
557 unsigned i, nr_selector_args, nr_params;
558 struct priority_group *pg;
559
560 if (as->argc < 2) {
561 as->argc = 0;
562		ti->error = ESTR("not enough priority group arguments");
563 return NULL;
564 }
565
566 pg = alloc_priority_group();
567 if (!pg) {
568 ti->error = ESTR("couldn't allocate priority group");
569 return NULL;
570 }
571 pg->m = m;
572
573 r = parse_path_selector(as, pg, ti);
574 if (r)
575 goto bad;
576
577 /*
578 * read the paths
579 */
580 r = read_param(_params, shift(as), &pg->nr_pgpaths, &ti->error);
581 if (r)
582 goto bad;
583
584 r = read_param(_params + 1, shift(as), &nr_selector_args, &ti->error);
585 if (r)
586 goto bad;
587
588 nr_params = 1 + nr_selector_args;
589 for (i = 0; i < pg->nr_pgpaths; i++) {
590 struct pgpath *pgpath;
591 struct arg_set path_args;
592
593 if (as->argc < nr_params)
594 goto bad;
595
596 path_args.argc = nr_params;
597 path_args.argv = as->argv;
598
599 pgpath = parse_path(&path_args, &pg->ps, ti);
600 if (!pgpath)
601 goto bad;
602
603 pgpath->pg = pg;
604 list_add_tail(&pgpath->list, &pg->pgpaths);
605 consume(as, nr_params);
606 }
607
608 return pg;
609
610 bad:
611 free_priority_group(pg, ti);
612 return NULL;
613}
614
615static int parse_hw_handler(struct arg_set *as, struct multipath *m,
616 struct dm_target *ti)
617{
618 int r;
619 struct hw_handler_type *hwht;
620 unsigned hw_argc;
621
622 static struct param _params[] = {
623 {0, 1024, ESTR("invalid number of hardware handler args")},
624 };
625
626 r = read_param(_params, shift(as), &hw_argc, &ti->error);
627 if (r)
628 return -EINVAL;
629
630 if (!hw_argc)
631 return 0;
632
633 hwht = dm_get_hw_handler(shift(as));
634 if (!hwht) {
635 ti->error = ESTR("unknown hardware handler type");
636 return -EINVAL;
637 }
638
639 r = hwht->create(&m->hw_handler, hw_argc - 1, as->argv);
640 if (r) {
641 dm_put_hw_handler(hwht);
642 ti->error = ESTR("hardware handler constructor failed");
643 return r;
644 }
645
646 m->hw_handler.type = hwht;
647 consume(as, hw_argc - 1);
648
649 return 0;
650}
651
652static int parse_features(struct arg_set *as, struct multipath *m,
653 struct dm_target *ti)
654{
655 int r;
656 unsigned argc;
657
658 static struct param _params[] = {
659 {0, 1, ESTR("invalid number of feature args")},
660 };
661
662 r = read_param(_params, shift(as), &argc, &ti->error);
663 if (r)
664 return -EINVAL;
665
666 if (!argc)
667 return 0;
668
669 if (!strnicmp(shift(as), MESG_STR("queue_if_no_path")))
670 return queue_if_no_path(m, 1);
671 else {
672 ti->error = "Unrecognised multipath feature request";
673 return -EINVAL;
674 }
675}
676
677static int multipath_ctr(struct dm_target *ti, unsigned int argc,
678 char **argv)
679{
680 /* target parameters */
681 static struct param _params[] = {
682 {1, 1024, ESTR("invalid number of priority groups")},
683 {1, 1024, ESTR("invalid initial priority group number")},
684 };
685
686 int r;
687 struct multipath *m;
688 struct arg_set as;
689 unsigned pg_count = 0;
690 unsigned next_pg_num;
691
692 as.argc = argc;
693 as.argv = argv;
694
695 m = alloc_multipath();
696 if (!m) {
697 ti->error = ESTR("can't allocate multipath");
698 return -EINVAL;
699 }
700
701 r = parse_features(&as, m, ti);
702 if (r)
703 goto bad;
704
705 r = parse_hw_handler(&as, m, ti);
706 if (r)
707 goto bad;
708
709 r = read_param(_params, shift(&as), &m->nr_priority_groups, &ti->error);
710 if (r)
711 goto bad;
712
713 r = read_param(_params + 1, shift(&as), &next_pg_num, &ti->error);
714 if (r)
715 goto bad;
716
717 /* parse the priority groups */
718 while (as.argc) {
719 struct priority_group *pg;
720
721 pg = parse_priority_group(&as, m, ti);
722 if (!pg) {
723 r = -EINVAL;
724 goto bad;
725 }
726
727 m->nr_valid_paths += pg->nr_pgpaths;
728 list_add_tail(&pg->list, &m->priority_groups);
729 pg_count++;
730 pg->pg_num = pg_count;
731 if (!--next_pg_num)
732 m->next_pg = pg;
733 }
734
735 if (pg_count != m->nr_priority_groups) {
736 ti->error = ESTR("priority group count mismatch");
737 r = -EINVAL;
738 goto bad;
739 }
740
741 ti->private = m;
742 m->ti = ti;
743
744 return 0;
745
746 bad:
747 free_multipath(m);
748 return r;
749}
750
751static void multipath_dtr(struct dm_target *ti)
752{
753 struct multipath *m = (struct multipath *) ti->private;
754 free_multipath(m);
755}
756
757/*
758 * Map bios, recording original fields for later in case we have to resubmit
759 */
760static int multipath_map(struct dm_target *ti, struct bio *bio,
761 union map_info *map_context)
762{
763 int r;
764 struct mpath_io *mpio;
765 struct multipath *m = (struct multipath *) ti->private;
766
767 mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
768 dm_bio_record(&mpio->details, bio);
769
770 map_context->ptr = mpio;
771 bio->bi_rw |= (1 << BIO_RW_FAILFAST);
772 r = map_io(m, bio, mpio, 0);
773 if (r < 0)
774 mempool_free(mpio, m->mpio_pool);
775
776 return r;
777}
778
779/*
780 * Take a path out of use.
781 */
782static int fail_path(struct pgpath *pgpath)
783{
784 unsigned long flags;
785 struct multipath *m = pgpath->pg->m;
786
787 spin_lock_irqsave(&m->lock, flags);
788
789 if (!pgpath->path.is_active)
790 goto out;
791
792 DMWARN("dm-multipath: Failing path %s.", pgpath->path.dev->name);
793
794 pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
795 pgpath->path.is_active = 0;
796 pgpath->fail_count++;
797
798 m->nr_valid_paths--;
799
800 if (pgpath == m->current_pgpath)
801 m->current_pgpath = NULL;
802
803 schedule_work(&m->trigger_event);
804
805out:
806 spin_unlock_irqrestore(&m->lock, flags);
807
808 return 0;
809}
810
811/*
812 * Reinstate a previously-failed path
813 */
814static int reinstate_path(struct pgpath *pgpath)
815{
816 int r = 0;
817 unsigned long flags;
818 struct multipath *m = pgpath->pg->m;
819
820 spin_lock_irqsave(&m->lock, flags);
821
822 if (pgpath->path.is_active)
823 goto out;
824
825 if (!pgpath->pg->ps.type) {
826		DMWARN("Reinstate path not supported: "
827		       "path group has no path selector");
828 r = -EINVAL;
829 goto out;
830 }
831
832 r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path);
833 if (r)
834 goto out;
835
836 pgpath->path.is_active = 1;
837
838 m->current_pgpath = NULL;
839 if (!m->nr_valid_paths++)
840 schedule_work(&m->process_queued_ios);
841
842 schedule_work(&m->trigger_event);
843
844out:
845 spin_unlock_irqrestore(&m->lock, flags);
846
847 return r;
848}
849
850/*
851 * Fail or reinstate all paths that match the provided struct dm_dev.
852 */
853static int action_dev(struct multipath *m, struct dm_dev *dev,
854 action_fn action)
855{
856 int r = 0;
857 struct pgpath *pgpath;
858 struct priority_group *pg;
859
860 list_for_each_entry(pg, &m->priority_groups, list) {
861 list_for_each_entry(pgpath, &pg->pgpaths, list) {
862 if (pgpath->path.dev == dev)
863 r = action(pgpath);
864 }
865 }
866
867 return r;
868}
869
870/*
871 * Temporarily try to avoid having to use the specified PG
872 */
873static void bypass_pg(struct multipath *m, struct priority_group *pg,
874 int bypassed)
875{
876 unsigned long flags;
877
878 spin_lock_irqsave(&m->lock, flags);
879
880 pg->bypassed = bypassed;
881 m->current_pgpath = NULL;
882 m->current_pg = NULL;
883
884 spin_unlock_irqrestore(&m->lock, flags);
885
886 schedule_work(&m->trigger_event);
887}
888
889/*
890 * Switch to using the specified PG from the next I/O that gets mapped
891 */
892static int switch_pg_num(struct multipath *m, const char *pgstr)
893{
894 struct priority_group *pg;
895 unsigned pgnum;
896 unsigned long flags;
897
898 if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
899 (pgnum > m->nr_priority_groups)) {
900 DMWARN("invalid PG number supplied to switch_pg_num");
901 return -EINVAL;
902 }
903
904 spin_lock_irqsave(&m->lock, flags);
905 list_for_each_entry(pg, &m->priority_groups, list) {
906 pg->bypassed = 0;
907 if (--pgnum)
908 continue;
909
910 m->current_pgpath = NULL;
911 m->current_pg = NULL;
912 m->next_pg = pg;
913 }
914 spin_unlock_irqrestore(&m->lock, flags);
915
916 schedule_work(&m->trigger_event);
917 return 0;
918}
919
920/*
921 * Set/clear bypassed status of a PG.
922 * PGs are numbered upwards from 1 in the order they were declared.
923 */
924static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
925{
926 struct priority_group *pg;
927 unsigned pgnum;
928
929 if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
930 (pgnum > m->nr_priority_groups)) {
931 DMWARN("invalid PG number supplied to bypass_pg");
932 return -EINVAL;
933 }
934
935 list_for_each_entry(pg, &m->priority_groups, list) {
936 if (!--pgnum)
937 break;
938 }
939
940 bypass_pg(m, pg, bypassed);
941 return 0;
942}
943
944/*
945 * pg_init must call this when it has completed its initialisation
946 */
947void dm_pg_init_complete(struct path *path, unsigned err_flags)
948{
949 struct pgpath *pgpath = path_to_pgpath(path);
950 struct priority_group *pg = pgpath->pg;
951 struct multipath *m = pg->m;
952 unsigned long flags;
953
954 /* We insist on failing the path if the PG is already bypassed. */
955 if (err_flags && pg->bypassed)
956 err_flags |= MP_FAIL_PATH;
957
958 if (err_flags & MP_FAIL_PATH)
959 fail_path(pgpath);
960
961 if (err_flags & MP_BYPASS_PG)
962 bypass_pg(m, pg, 1);
963
964 spin_lock_irqsave(&m->lock, flags);
965 if (!err_flags)
966 m->queue_io = 0;
967 else {
968 m->current_pgpath = NULL;
969 m->current_pg = NULL;
970 }
971 schedule_work(&m->process_queued_ios);
972 spin_unlock_irqrestore(&m->lock, flags);
973}
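
A minimal sketch (editor's addition) of the other side of this contract, assuming the pg_init signature visible at its call site in process_queued_ios() above: a hardware handler that needs no device-specific setup would simply report success.

#if 0	/* illustrative only */
static void example_pg_init(struct hw_handler *hwh, unsigned bypassed,
			    struct path *path)
{
	/* nothing to initialise: tell dm-mpath the PG is ready for I/O */
	dm_pg_init_complete(path, 0);
}
#endif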
974
975/*
976 * end_io handling
977 */
978static int do_end_io(struct multipath *m, struct bio *bio,
979 int error, struct mpath_io *mpio)
980{
981 struct hw_handler *hwh = &m->hw_handler;
982 unsigned err_flags = MP_FAIL_PATH; /* Default behavior */
983
984 if (!error)
985 return 0; /* I/O complete */
986
987 spin_lock(&m->lock);
988 if (!m->nr_valid_paths) {
989 if (!m->queue_if_no_path || m->suspended) {
990 spin_unlock(&m->lock);
991 return -EIO;
992 } else {
993 spin_unlock(&m->lock);
994 goto requeue;
995 }
996 }
997 spin_unlock(&m->lock);
998
999 if (hwh->type && hwh->type->error)
1000 err_flags = hwh->type->error(hwh, bio);
1001
1002 if (mpio->pgpath) {
1003 if (err_flags & MP_FAIL_PATH)
1004 fail_path(mpio->pgpath);
1005
1006 if (err_flags & MP_BYPASS_PG)
1007 bypass_pg(m, mpio->pgpath->pg, 1);
1008 }
1009
1010 if (err_flags & MP_ERROR_IO)
1011 return -EIO;
1012
1013 requeue:
1014 dm_bio_restore(&mpio->details, bio);
1015
1016 /* queue for the daemon to resubmit or fail */
1017 spin_lock(&m->lock);
1018 bio_list_add(&m->queued_ios, bio);
1019 m->queue_size++;
1020 if (!m->queue_io)
1021 schedule_work(&m->process_queued_ios);
1022 spin_unlock(&m->lock);
1023
1024 return 1; /* io not complete */
1025}
1026
1027static int multipath_end_io(struct dm_target *ti, struct bio *bio,
1028 int error, union map_info *map_context)
1029{
1030 struct multipath *m = (struct multipath *) ti->private;
1031 struct mpath_io *mpio = (struct mpath_io *) map_context->ptr;
1032 struct pgpath *pgpath = mpio->pgpath;
1033 struct path_selector *ps;
1034 int r;
1035
1036 r = do_end_io(m, bio, error, mpio);
1037 if (pgpath) {
1038 ps = &pgpath->pg->ps;
1039 if (ps->type->end_io)
1040 ps->type->end_io(ps, &pgpath->path);
1041 }
1042 if (r <= 0)
1043 mempool_free(mpio, m->mpio_pool);
1044
1045 return r;
1046}
1047
1048/*
1049 * Suspend can't complete until all the I/O is processed so if
1050 * the last path failed we will now error any queued I/O.
1051 */
1052static void multipath_presuspend(struct dm_target *ti)
1053{
1054 struct multipath *m = (struct multipath *) ti->private;
1055 unsigned long flags;
1056
1057 spin_lock_irqsave(&m->lock, flags);
1058 m->suspended = 1;
1059 if (m->queue_if_no_path)
1060 schedule_work(&m->process_queued_ios);
1061 spin_unlock_irqrestore(&m->lock, flags);
1062}
1063
1064static void multipath_resume(struct dm_target *ti)
1065{
1066 struct multipath *m = (struct multipath *) ti->private;
1067 unsigned long flags;
1068
1069 spin_lock_irqsave(&m->lock, flags);
1070 m->suspended = 0;
1071 spin_unlock_irqrestore(&m->lock, flags);
1072}
1073
1074/*
1075 * Info output has the following format:
1076 * num_multipath_feature_args [multipath_feature_args]*
1077 * num_handler_status_args [handler_status_args]*
1078 * num_groups init_group_number
1079 * [A|D|E num_ps_status_args [ps_status_args]*
1080 * num_paths num_selector_args
1081 * [path_dev A|F fail_count [selector_args]* ]+ ]+
1082 *
1083 * Table output has the following format (identical to the constructor string):
1084 * num_feature_args [features_args]*
1085 * num_handler_args hw_handler [hw_handler_args]*
1086 * num_groups init_group_number
1087 * [priority selector-name num_ps_args [ps_args]*
1088 * num_paths num_selector_args [path_dev [selector_args]* ]+ ]+
1089 */
1090static int multipath_status(struct dm_target *ti, status_type_t type,
1091 char *result, unsigned int maxlen)
1092{
1093 int sz = 0;
1094 unsigned long flags;
1095 struct multipath *m = (struct multipath *) ti->private;
1096 struct hw_handler *hwh = &m->hw_handler;
1097 struct priority_group *pg;
1098 struct pgpath *p;
1099 unsigned pg_num;
1100 char state;
1101
1102 spin_lock_irqsave(&m->lock, flags);
1103
1104 /* Features */
1105 if (type == STATUSTYPE_INFO)
1106 DMEMIT("1 %u ", m->queue_size);
1107 else if (m->queue_if_no_path)
1108 DMEMIT("1 queue_if_no_path ");
1109 else
1110 DMEMIT("0 ");
1111
1112 if (hwh->type && hwh->type->status)
1113 sz += hwh->type->status(hwh, type, result + sz, maxlen - sz);
1114 else if (!hwh->type || type == STATUSTYPE_INFO)
1115 DMEMIT("0 ");
1116 else
1117 DMEMIT("1 %s ", hwh->type->name);
1118
1119 DMEMIT("%u ", m->nr_priority_groups);
1120
1121 if (m->next_pg)
1122 pg_num = m->next_pg->pg_num;
1123 else if (m->current_pg)
1124 pg_num = m->current_pg->pg_num;
1125 else
1126 pg_num = 1;
1127
1128 DMEMIT("%u ", pg_num);
1129
1130 switch (type) {
1131 case STATUSTYPE_INFO:
1132 list_for_each_entry(pg, &m->priority_groups, list) {
1133 if (pg->bypassed)
1134 state = 'D'; /* Disabled */
1135 else if (pg == m->current_pg)
1136 state = 'A'; /* Currently Active */
1137 else
1138 state = 'E'; /* Enabled */
1139
1140 DMEMIT("%c ", state);
1141
1142 if (pg->ps.type->status)
1143 sz += pg->ps.type->status(&pg->ps, NULL, type,
1144 result + sz,
1145 maxlen - sz);
1146 else
1147 DMEMIT("0 ");
1148
1149 DMEMIT("%u %u ", pg->nr_pgpaths,
1150 pg->ps.type->info_args);
1151
1152 list_for_each_entry(p, &pg->pgpaths, list) {
1153 DMEMIT("%s %s %u ", p->path.dev->name,
1154 p->path.is_active ? "A" : "F",
1155 p->fail_count);
1156 if (pg->ps.type->status)
1157 sz += pg->ps.type->status(&pg->ps,
1158 &p->path, type, result + sz,
1159 maxlen - sz);
1160 }
1161 }
1162 break;
1163
1164 case STATUSTYPE_TABLE:
1165 list_for_each_entry(pg, &m->priority_groups, list) {
1166 DMEMIT("%s ", pg->ps.type->name);
1167
1168 if (pg->ps.type->status)
1169 sz += pg->ps.type->status(&pg->ps, NULL, type,
1170 result + sz,
1171 maxlen - sz);
1172 else
1173 DMEMIT("0 ");
1174
1175 DMEMIT("%u %u ", pg->nr_pgpaths,
1176 pg->ps.type->table_args);
1177
1178 list_for_each_entry(p, &pg->pgpaths, list) {
1179 DMEMIT("%s ", p->path.dev->name);
1180 if (pg->ps.type->status)
1181 sz += pg->ps.type->status(&pg->ps,
1182 &p->path, type, result + sz,
1183 maxlen - sz);
1184 }
1185 }
1186 break;
1187 }
1188
1189 spin_unlock_irqrestore(&m->lock, flags);
1190
1191 return 0;
1192}
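To make the INFO layout documented above concrete: for a map with one round-robin priority group of two healthy paths, no hardware handler, queue_if_no_path disabled and an empty queue, the emitted status line could look like the following (a hypothetical rendering; device names are assumed to print as major:minor and all counts are invented):

    1 0 0 1 1 A 0 2 0 8:16 A 0 8:32 A 0

    1 0        one feature argument: queue_size (no bios currently queued)
    0          no hardware handler status
    1 1        one priority group; group 1 serves the next I/O
    A 0        group 1 is active; its selector emits no INFO arguments
    2 0        two paths, zero per-path selector arguments
    8:16 A 0   first path: active, fail_count 0
    8:32 A 0   second path: active, fail_count 0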
1193
1194static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
1195{
1196 int r;
1197 struct dm_dev *dev;
1198 struct multipath *m = (struct multipath *) ti->private;
1199 action_fn action;
1200
1201 if (argc == 1) {
1202 if (!strnicmp(argv[0], MESG_STR("queue_if_no_path")))
1203 return queue_if_no_path(m, 1);
1204 else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path")))
1205 return queue_if_no_path(m, 0);
1206 }
1207
1208 if (argc != 2)
1209 goto error;
1210
1211 if (!strnicmp(argv[0], MESG_STR("disable_group")))
1212 return bypass_pg_num(m, argv[1], 1);
1213 else if (!strnicmp(argv[0], MESG_STR("enable_group")))
1214 return bypass_pg_num(m, argv[1], 0);
1215 else if (!strnicmp(argv[0], MESG_STR("switch_group")))
1216 return switch_pg_num(m, argv[1]);
1217 else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
1218 action = reinstate_path;
1219 else if (!strnicmp(argv[0], MESG_STR("fail_path")))
1220 action = fail_path;
1221 else
1222 goto error;
1223
1224 r = dm_get_device(ti, argv[1], ti->begin, ti->len,
1225 dm_table_get_mode(ti->table), &dev);
1226 if (r) {
1227 DMWARN("dm-multipath message: error getting device %s",
1228 argv[1]);
1229 return -EINVAL;
1230 }
1231
1232 r = action_dev(m, dev, action);
1233
1234 dm_put_device(ti, dev);
1235
1236 return r;
1237
1238error:
1239 DMWARN("Unrecognised multipath message received.");
1240 return -EINVAL;
1241}
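For context, these message strings arrive through the device-mapper message interface and are normally issued with dmsetup. Hypothetical invocations against a map named mpath0 with a path device /dev/sdb (all names invented; sector 0 simply addresses the single multipath target):

    dmsetup message mpath0 0 queue_if_no_path
    dmsetup message mpath0 0 fail_path /dev/sdb
    dmsetup message mpath0 0 reinstate_path /dev/sdb
    dmsetup message mpath0 0 switch_group 2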
1242
1243/*-----------------------------------------------------------------
1244 * Module setup
1245 *---------------------------------------------------------------*/
1246static struct target_type multipath_target = {
1247 .name = "multipath",
1248 .version = {1, 0, 4},
1249 .module = THIS_MODULE,
1250 .ctr = multipath_ctr,
1251 .dtr = multipath_dtr,
1252 .map = multipath_map,
1253 .end_io = multipath_end_io,
1254 .presuspend = multipath_presuspend,
1255 .resume = multipath_resume,
1256 .status = multipath_status,
1257 .message = multipath_message,
1258};
1259
1260static int __init dm_multipath_init(void)
1261{
1262 int r;
1263
1264 /* allocate a slab for the dm_ios */
1265 _mpio_cache = kmem_cache_create("dm_mpath", sizeof(struct mpath_io),
1266 0, 0, NULL, NULL);
1267 if (!_mpio_cache)
1268 return -ENOMEM;
1269
1270 r = dm_register_target(&multipath_target);
1271 if (r < 0) {
1272 DMERR("%s: register failed %d", multipath_target.name, r);
1273 kmem_cache_destroy(_mpio_cache);
1274 return -EINVAL;
1275 }
1276
1277 DMINFO("dm-multipath version %u.%u.%u loaded",
1278 multipath_target.version[0], multipath_target.version[1],
1279 multipath_target.version[2]);
1280
1281 return r;
1282}
1283
1284static void __exit dm_multipath_exit(void)
1285{
1286 int r;
1287
1288 r = dm_unregister_target(&multipath_target);
1289 if (r < 0)
1290 DMERR("%s: target unregister failed %d",
1291 multipath_target.name, r);
1292 kmem_cache_destroy(_mpio_cache);
1293}
1294
1295EXPORT_SYMBOL_GPL(dm_pg_init_complete);
1296
1297module_init(dm_multipath_init);
1298module_exit(dm_multipath_exit);
1299
1300MODULE_DESCRIPTION(DM_NAME " multipath target");
1301MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
1302MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h
new file mode 100644
index 000000000000..8a4bf2b6d52e
--- /dev/null
+++ b/drivers/md/dm-mpath.h
@@ -0,0 +1,25 @@
1/*
2 * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
3 *
4 * This file is released under the GPL.
5 *
6 * Multipath.
7 */
8
9#ifndef DM_MPATH_H
10#define DM_MPATH_H
11
12struct dm_dev;
13
14struct path {
15 struct dm_dev *dev; /* Read-only */
16 unsigned is_active; /* Read-only */
17
18 void *pscontext; /* For path-selector use */
19 void *hwhcontext; /* For hw-handler use */
20};
21
22/* Callback for hwh_pg_init_fn to use when complete */
23void dm_pg_init_complete(struct path *path, unsigned err_flags);
24
25#endif
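dm_pg_init_complete() is the completion half of a hardware handler's pg_init hook: once the handler has done whatever is needed to activate the priority group, it reports the outcome here, either 0 or a combination of the MP_* flags that dm-mpath.c checks above. A minimal hedged sketch of a handler with nothing to do; the hook signature is assumed to match dm-hw-handler.h from this same patch and the handler itself is invented:

	static void example_pg_init(struct hw_handler *hwh, unsigned bypassed,
				    struct path *path)
	{
		/* nothing to activate on this hypothetical array: report success */
		dm_pg_init_complete(path, 0);
	}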
diff --git a/drivers/md/dm-path-selector.c b/drivers/md/dm-path-selector.c
new file mode 100644
index 000000000000..ac5c4bbec6c1
--- /dev/null
+++ b/drivers/md/dm-path-selector.c
@@ -0,0 +1,156 @@
1/*
2 * Copyright (C) 2003 Sistina Software.
3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4 *
5 * Module Author: Heinz Mauelshagen
6 *
7 * This file is released under the GPL.
8 *
9 * Path selector registration.
10 */
11
12#include "dm.h"
13#include "dm-path-selector.h"
14
15#include <linux/slab.h>
16
17struct ps_internal {
18 struct path_selector_type pst;
19
20 struct list_head list;
21 long use;
22};
23
24#define pst_to_psi(__pst) container_of((__pst), struct ps_internal, pst)
25
26static LIST_HEAD(_path_selectors);
27static DECLARE_RWSEM(_ps_lock);
28
29static struct ps_internal *__find_path_selector_type(const char *name)
30{
31 struct ps_internal *psi;
32
33 list_for_each_entry(psi, &_path_selectors, list) {
34 if (!strcmp(name, psi->pst.name))
35 return psi;
36 }
37
38 return NULL;
39}
40
41static struct ps_internal *get_path_selector(const char *name)
42{
43 struct ps_internal *psi;
44
45 down_read(&_ps_lock);
46 psi = __find_path_selector_type(name);
47 if (psi) {
48 if ((psi->use == 0) && !try_module_get(psi->pst.module))
49 psi = NULL;
50 else
51 psi->use++;
52 }
53 up_read(&_ps_lock);
54
55 return psi;
56}
57
58struct path_selector_type *dm_get_path_selector(const char *name)
59{
60 struct ps_internal *psi;
61
62 if (!name)
63 return NULL;
64
65 psi = get_path_selector(name);
66 if (!psi) {
67 request_module("dm-%s", name);
68 psi = get_path_selector(name);
69 }
70
71 return psi ? &psi->pst : NULL;
72}
73
74void dm_put_path_selector(struct path_selector_type *pst)
75{
76 struct ps_internal *psi;
77
78 if (!pst)
79 return;
80
81 down_read(&_ps_lock);
82 psi = __find_path_selector_type(pst->name);
83 if (!psi)
84 goto out;
85
86 if (--psi->use == 0)
87 module_put(psi->pst.module);
88
89 if (psi->use < 0)
90 BUG();
91
92out:
93 up_read(&_ps_lock);
94}
95
96static struct ps_internal *_alloc_path_selector(struct path_selector_type *pst)
97{
98 struct ps_internal *psi = kmalloc(sizeof(*psi), GFP_KERNEL);
99
100 if (psi) {
101 memset(psi, 0, sizeof(*psi));
102 psi->pst = *pst;
103 }
104
105 return psi;
106}
107
108int dm_register_path_selector(struct path_selector_type *pst)
109{
110 int r = 0;
111 struct ps_internal *psi = _alloc_path_selector(pst);
112
113 if (!psi)
114 return -ENOMEM;
115
116 down_write(&_ps_lock);
117
118 if (__find_path_selector_type(pst->name)) {
119 kfree(psi);
120 r = -EEXIST;
121 } else
122 list_add(&psi->list, &_path_selectors);
123
124 up_write(&_ps_lock);
125
126 return r;
127}
128
129int dm_unregister_path_selector(struct path_selector_type *pst)
130{
131 struct ps_internal *psi;
132
133 down_write(&_ps_lock);
134
135 psi = __find_path_selector_type(pst->name);
136 if (!psi) {
137 up_write(&_ps_lock);
138 return -EINVAL;
139 }
140
141 if (psi->use) {
142 up_write(&_ps_lock);
143 return -ETXTBSY;
144 }
145
146 list_del(&psi->list);
147
148 up_write(&_ps_lock);
149
150 kfree(psi);
151
152 return 0;
153}
154
155EXPORT_SYMBOL_GPL(dm_register_path_selector);
156EXPORT_SYMBOL_GPL(dm_unregister_path_selector);
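A sketch of how a caller such as the multipath constructor might use this registry; the surrounding control flow is an assumption, only the entry points and the create/destroy hooks come from dm-path-selector.h:

	struct path_selector ps;
	struct path_selector_type *pst;

	pst = dm_get_path_selector("round-robin");  /* may request_module("dm-round-robin") */
	if (!pst)
		return -EINVAL;

	ps.type = pst;
	if (pst->create(&ps, 0, NULL)) {            /* no selector arguments */
		dm_put_path_selector(pst);
		return -EINVAL;
	}

	/* ... add_path(), select_path(), etc. ... */

	ps.type->destroy(&ps);
	dm_put_path_selector(pst);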
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
new file mode 100644
index 000000000000..732d06a84f85
--- /dev/null
+++ b/drivers/md/dm-path-selector.h
@@ -0,0 +1,93 @@
1/*
2 * Copyright (C) 2003 Sistina Software.
3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4 *
5 * Module Author: Heinz Mauelshagen
6 *
7 * This file is released under the GPL.
8 *
9 * Path-Selector registration.
10 */
11
12#ifndef DM_PATH_SELECTOR_H
13#define DM_PATH_SELECTOR_H
14
15#include <linux/device-mapper.h>
16
17#include "dm-mpath.h"
18
19/*
20 * We provide an abstraction for the code that chooses which path
21 * to send some io down.
22 */
23struct path_selector_type;
24struct path_selector {
25 struct path_selector_type *type;
26 void *context;
27};
28
29/* Information about a path selector type */
30struct path_selector_type {
31 char *name;
32 struct module *module;
33
34 unsigned int table_args;
35 unsigned int info_args;
36
37 /*
38 * Constructs a path selector object, takes custom arguments
39 */
40 int (*create) (struct path_selector *ps, unsigned argc, char **argv);
41 void (*destroy) (struct path_selector *ps);
42
43 /*
44 * Add an opaque path object, along with some selector specific
45 * path args (eg, path priority).
46 */
47 int (*add_path) (struct path_selector *ps, struct path *path,
48 int argc, char **argv, char **error);
49
50 /*
51 * Chooses a path for this io, if no paths are available then
52 * NULL will be returned.
53 *
54 * repeat_count is the number of times to use the path before
55 * calling the function again. 0 means don't call it again unless
56 * the path fails.
57 */
58 struct path *(*select_path) (struct path_selector *ps,
59 unsigned *repeat_count);
60
61 /*
62 * Notify the selector that a path has failed.
63 */
64 void (*fail_path) (struct path_selector *ps, struct path *p);
65
66 /*
67 * Ask selector to reinstate a path.
68 */
69 int (*reinstate_path) (struct path_selector *ps, struct path *p);
70
71 /*
72 * Table content based on parameters added in ps_add_path_fn
73 * or path selector status
74 */
75 int (*status) (struct path_selector *ps, struct path *path,
76 status_type_t type, char *result, unsigned int maxlen);
77
78 int (*end_io) (struct path_selector *ps, struct path *path);
79};
80
81/* Register a path selector */
82int dm_register_path_selector(struct path_selector_type *type);
83
84/* Unregister a path selector */
85int dm_unregister_path_selector(struct path_selector_type *type);
86
87/* Returns a registered path selector type */
88struct path_selector_type *dm_get_path_selector(const char *name);
89
90/* Releases a path selector */
91void dm_put_path_selector(struct path_selector_type *pst);
92
93#endif
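The repeat_count contract implies that the caller caches the chosen path and only re-invokes select_path() once that budget is spent. A hedged sketch of such a caller (the helper and its arguments are invented for illustration):

	static struct path *example_choose_path(struct path_selector *ps,
						struct path **current,
						unsigned *count_left)
	{
		/*
		 * Reselect when no path is cached or its repeat_count budget
		 * is used up.  A repeat_count of 0 means "keep this path until
		 * it fails"; on failure the caller resets *current to NULL.
		 */
		if (!*current || (*count_left && !--*count_left))
			*current = ps->type->select_path(ps, count_left);

		return *current;
	}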
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
new file mode 100644
index 000000000000..6e3cf7e13451
--- /dev/null
+++ b/drivers/md/dm-raid1.c
@@ -0,0 +1,1269 @@
1/*
2 * Copyright (C) 2003 Sistina Software Limited.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8#include "dm-bio-list.h"
9#include "dm-io.h"
10#include "dm-log.h"
11#include "kcopyd.h"
12
13#include <linux/ctype.h>
14#include <linux/init.h>
15#include <linux/mempool.h>
16#include <linux/module.h>
17#include <linux/pagemap.h>
18#include <linux/slab.h>
19#include <linux/time.h>
20#include <linux/vmalloc.h>
21#include <linux/workqueue.h>
22
23static struct workqueue_struct *_kmirrord_wq;
24static struct work_struct _kmirrord_work;
25
26static inline void wake(void)
27{
28 queue_work(_kmirrord_wq, &_kmirrord_work);
29}
30
31/*-----------------------------------------------------------------
32 * Region hash
33 *
34 * The mirror splits itself up into discrete regions. Each
35 * region can be in one of three states: clean, dirty,
36 * nosync. There is no need to put clean regions in the hash.
37 *
38 * In addition to being present in the hash table a region _may_
39 * be present on one of three lists.
40 *
41 * clean_regions: Regions on this list have no io pending to
42 * them, they are in sync, we are no longer interested in them,
43 * they are dull. rh_update_states() will remove them from the
44 * hash table.
45 *
46 * quiesced_regions: These regions have been spun down, ready
47 * for recovery. rh_recovery_start() will remove regions from
48 * this list and hand them to kmirrord, which will schedule the
49 * recovery io with kcopyd.
50 *
51 * recovered_regions: Regions that kcopyd has successfully
52 * recovered. rh_update_states() will now schedule any delayed
53 * io, up the recovery_count, and remove the region from the
54 * hash.
55 *
56 * There are 2 locks:
57 * A rw spin lock 'hash_lock' protects just the hash table,
58 * this is never held in write mode from interrupt context,
59 * which I believe means that we only have to disable irqs when
60 * doing a write lock.
61 *
62 * An ordinary spin lock 'region_lock' that protects the three
63 * lists in the region_hash, with the 'state', 'list' and
64 * 'bhs_delayed' fields of the regions. This is used from irq
65 * context, so all other uses will have to suspend local irqs.
66 *---------------------------------------------------------------*/
67struct mirror_set;
68struct region_hash {
69 struct mirror_set *ms;
70 uint32_t region_size;
71 unsigned region_shift;
72
73 /* holds persistent region state */
74 struct dirty_log *log;
75
76 /* hash table */
77 rwlock_t hash_lock;
78 mempool_t *region_pool;
79 unsigned int mask;
80 unsigned int nr_buckets;
81 struct list_head *buckets;
82
83 spinlock_t region_lock;
84 struct semaphore recovery_count;
85 struct list_head clean_regions;
86 struct list_head quiesced_regions;
87 struct list_head recovered_regions;
88};
89
90enum {
91 RH_CLEAN,
92 RH_DIRTY,
93 RH_NOSYNC,
94 RH_RECOVERING
95};
96
97struct region {
98 struct region_hash *rh; /* FIXME: can we get rid of this ? */
99 region_t key;
100 int state;
101
102 struct list_head hash_list;
103 struct list_head list;
104
105 atomic_t pending;
106 struct bio_list delayed_bios;
107};
108
109/*
110 * Conversion fns
111 */
112static inline region_t bio_to_region(struct region_hash *rh, struct bio *bio)
113{
114 return bio->bi_sector >> rh->region_shift;
115}
116
117static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
118{
119 return region << rh->region_shift;
120}
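Since region_size must be a power of two (see _check_region_size() further down), both conversions are plain shifts, with region_shift derived as ffs(region_size) - 1 in rh_init() below. A small stand-alone illustration of the arithmetic, outside kernel context and with invented values:

	#include <stdio.h>
	#include <strings.h>	/* ffs() */

	int main(void)
	{
		unsigned region_size = 1024;			/* sectors; power of two */
		unsigned region_shift = ffs(region_size) - 1;	/* 10 */
		unsigned long long sector = 5000;
		unsigned long long region = sector >> region_shift;

		printf("region_shift = %u\n", region_shift);
		printf("sector %llu lies in region %llu\n", sector, region);
		printf("region %llu starts at sector %llu\n",
		       region, region << region_shift);
		return 0;
	}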
121
122/* FIXME move this */
123static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
124
125static void *region_alloc(unsigned int __nocast gfp_mask, void *pool_data)
126{
127 return kmalloc(sizeof(struct region), gfp_mask);
128}
129
130static void region_free(void *element, void *pool_data)
131{
132 kfree(element);
133}
134
135#define MIN_REGIONS 64
136#define MAX_RECOVERY 1
137static int rh_init(struct region_hash *rh, struct mirror_set *ms,
138 struct dirty_log *log, uint32_t region_size,
139 region_t nr_regions)
140{
141 unsigned int nr_buckets, max_buckets;
142 size_t i;
143
144 /*
145 * Calculate a suitable number of buckets for our hash
146 * table.
147 */
148 max_buckets = nr_regions >> 6;
149 for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
150 ;
151 nr_buckets >>= 1;
152
153 rh->ms = ms;
154 rh->log = log;
155 rh->region_size = region_size;
156 rh->region_shift = ffs(region_size) - 1;
157 rwlock_init(&rh->hash_lock);
158 rh->mask = nr_buckets - 1;
159 rh->nr_buckets = nr_buckets;
160
161 rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
162 if (!rh->buckets) {
163 DMERR("unable to allocate region hash memory");
164 return -ENOMEM;
165 }
166
167 for (i = 0; i < nr_buckets; i++)
168 INIT_LIST_HEAD(rh->buckets + i);
169
170 spin_lock_init(&rh->region_lock);
171 sema_init(&rh->recovery_count, 0);
172 INIT_LIST_HEAD(&rh->clean_regions);
173 INIT_LIST_HEAD(&rh->quiesced_regions);
174 INIT_LIST_HEAD(&rh->recovered_regions);
175
176 rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
177 region_free, NULL);
178 if (!rh->region_pool) {
179 vfree(rh->buckets);
180 rh->buckets = NULL;
181 return -ENOMEM;
182 }
183
184 return 0;
185}
186
187static void rh_exit(struct region_hash *rh)
188{
189 unsigned int h;
190 struct region *reg, *nreg;
191
192 BUG_ON(!list_empty(&rh->quiesced_regions));
193 for (h = 0; h < rh->nr_buckets; h++) {
194 list_for_each_entry_safe(reg, nreg, rh->buckets + h, hash_list) {
195 BUG_ON(atomic_read(&reg->pending));
196 mempool_free(reg, rh->region_pool);
197 }
198 }
199
200 if (rh->log)
201 dm_destroy_dirty_log(rh->log);
202 if (rh->region_pool)
203 mempool_destroy(rh->region_pool);
204 vfree(rh->buckets);
205}
206
207#define RH_HASH_MULT 2654435387U
208
209static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
210{
211 return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
212}
213
214static struct region *__rh_lookup(struct region_hash *rh, region_t region)
215{
216 struct region *reg;
217
218 list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
219 if (reg->key == region)
220 return reg;
221
222 return NULL;
223}
224
225static void __rh_insert(struct region_hash *rh, struct region *reg)
226{
227 unsigned int h = rh_hash(rh, reg->key);
228 list_add(&reg->hash_list, rh->buckets + h);
229}
230
231static struct region *__rh_alloc(struct region_hash *rh, region_t region)
232{
233 struct region *reg, *nreg;
234
235 read_unlock(&rh->hash_lock);
236 nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
237 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
238 RH_CLEAN : RH_NOSYNC;
239 nreg->rh = rh;
240 nreg->key = region;
241
242 INIT_LIST_HEAD(&nreg->list);
243
244 atomic_set(&nreg->pending, 0);
245 bio_list_init(&nreg->delayed_bios);
246 write_lock_irq(&rh->hash_lock);
247
248 reg = __rh_lookup(rh, region);
249 if (reg)
250 /* we lost the race */
251 mempool_free(nreg, rh->region_pool);
252
253 else {
254 __rh_insert(rh, nreg);
255 if (nreg->state == RH_CLEAN) {
256 spin_lock(&rh->region_lock);
257 list_add(&nreg->list, &rh->clean_regions);
258 spin_unlock(&rh->region_lock);
259 }
260 reg = nreg;
261 }
262 write_unlock_irq(&rh->hash_lock);
263 read_lock(&rh->hash_lock);
264
265 return reg;
266}
267
268static inline struct region *__rh_find(struct region_hash *rh, region_t region)
269{
270 struct region *reg;
271
272 reg = __rh_lookup(rh, region);
273 if (!reg)
274 reg = __rh_alloc(rh, region);
275
276 return reg;
277}
278
279static int rh_state(struct region_hash *rh, region_t region, int may_block)
280{
281 int r;
282 struct region *reg;
283
284 read_lock(&rh->hash_lock);
285 reg = __rh_lookup(rh, region);
286 read_unlock(&rh->hash_lock);
287
288 if (reg)
289 return reg->state;
290
291 /*
292 * The region wasn't in the hash, so we fall back to the
293 * dirty log.
294 */
295 r = rh->log->type->in_sync(rh->log, region, may_block);
296
297 /*
298 * Any error from the dirty log (eg. -EWOULDBLOCK) gets
299 * taken as a RH_NOSYNC
300 */
301 return r == 1 ? RH_CLEAN : RH_NOSYNC;
302}
303
304static inline int rh_in_sync(struct region_hash *rh,
305 region_t region, int may_block)
306{
307 int state = rh_state(rh, region, may_block);
308 return state == RH_CLEAN || state == RH_DIRTY;
309}
310
311static void dispatch_bios(struct mirror_set *ms, struct bio_list *bio_list)
312{
313 struct bio *bio;
314
315 while ((bio = bio_list_pop(bio_list))) {
316 queue_bio(ms, bio, WRITE);
317 }
318}
319
320static void rh_update_states(struct region_hash *rh)
321{
322 struct region *reg, *next;
323
324 LIST_HEAD(clean);
325 LIST_HEAD(recovered);
326
327 /*
328 * Quickly grab the lists.
329 */
330 write_lock_irq(&rh->hash_lock);
331 spin_lock(&rh->region_lock);
332 if (!list_empty(&rh->clean_regions)) {
333 list_splice(&rh->clean_regions, &clean);
334 INIT_LIST_HEAD(&rh->clean_regions);
335
336 list_for_each_entry (reg, &clean, list) {
337 rh->log->type->clear_region(rh->log, reg->key);
338 list_del(&reg->hash_list);
339 }
340 }
341
342 if (!list_empty(&rh->recovered_regions)) {
343 list_splice(&rh->recovered_regions, &recovered);
344 INIT_LIST_HEAD(&rh->recovered_regions);
345
346 list_for_each_entry (reg, &recovered, list)
347 list_del(&reg->hash_list);
348 }
349 spin_unlock(&rh->region_lock);
350 write_unlock_irq(&rh->hash_lock);
351
352 /*
353 * All the regions on the recovered and clean lists have
354 * now been pulled out of the system, so no need to do
355 * any more locking.
356 */
357 list_for_each_entry_safe (reg, next, &recovered, list) {
358 rh->log->type->clear_region(rh->log, reg->key);
359 rh->log->type->complete_resync_work(rh->log, reg->key, 1);
360 dispatch_bios(rh->ms, &reg->delayed_bios);
361 up(&rh->recovery_count);
362 mempool_free(reg, rh->region_pool);
363 }
364
365 if (!list_empty(&recovered))
366 rh->log->type->flush(rh->log);
367
368 list_for_each_entry_safe (reg, next, &clean, list)
369 mempool_free(reg, rh->region_pool);
370}
371
372static void rh_inc(struct region_hash *rh, region_t region)
373{
374 struct region *reg;
375
376 read_lock(&rh->hash_lock);
377 reg = __rh_find(rh, region);
378 if (reg->state == RH_CLEAN) {
379 rh->log->type->mark_region(rh->log, reg->key);
380
381 spin_lock_irq(&rh->region_lock);
382 reg->state = RH_DIRTY;
383 list_del_init(&reg->list); /* take off the clean list */
384 spin_unlock_irq(&rh->region_lock);
385 }
386
387 atomic_inc(&reg->pending);
388 read_unlock(&rh->hash_lock);
389}
390
391static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
392{
393 struct bio *bio;
394
395 for (bio = bios->head; bio; bio = bio->bi_next)
396 rh_inc(rh, bio_to_region(rh, bio));
397}
398
399static void rh_dec(struct region_hash *rh, region_t region)
400{
401 unsigned long flags;
402 struct region *reg;
403 int should_wake = 0;
404
405 read_lock(&rh->hash_lock);
406 reg = __rh_lookup(rh, region);
407 read_unlock(&rh->hash_lock);
408
409 if (atomic_dec_and_test(&reg->pending)) {
410 spin_lock_irqsave(&rh->region_lock, flags);
411 if (reg->state == RH_RECOVERING) {
412 list_add_tail(&reg->list, &rh->quiesced_regions);
413 } else {
414 reg->state = RH_CLEAN;
415 list_add(&reg->list, &rh->clean_regions);
416 }
417 spin_unlock_irqrestore(&rh->region_lock, flags);
418 should_wake = 1;
419 }
420
421 if (should_wake)
422 wake();
423}
424
425/*
426 * Starts quiescing a region in preparation for recovery.
427 */
428static int __rh_recovery_prepare(struct region_hash *rh)
429{
430 int r;
431 struct region *reg;
432 region_t region;
433
434 /*
435 * Ask the dirty log what's next.
436 */
437 r = rh->log->type->get_resync_work(rh->log, &region);
438 if (r <= 0)
439 return r;
440
441 /*
442 * Get this region, and start it quiescing by setting the
443 * recovering flag.
444 */
445 read_lock(&rh->hash_lock);
446 reg = __rh_find(rh, region);
447 read_unlock(&rh->hash_lock);
448
449 spin_lock_irq(&rh->region_lock);
450 reg->state = RH_RECOVERING;
451
452 /* Already quiesced ? */
453 if (atomic_read(&reg->pending))
454 list_del_init(&reg->list);
455
456 else {
457 list_del_init(&reg->list);
458 list_add(&reg->list, &rh->quiesced_regions);
459 }
460 spin_unlock_irq(&rh->region_lock);
461
462 return 1;
463}
464
465static void rh_recovery_prepare(struct region_hash *rh)
466{
467 while (!down_trylock(&rh->recovery_count))
468 if (__rh_recovery_prepare(rh) <= 0) {
469 up(&rh->recovery_count);
470 break;
471 }
472}
473
474/*
475 * Returns any quiesced regions.
476 */
477static struct region *rh_recovery_start(struct region_hash *rh)
478{
479 struct region *reg = NULL;
480
481 spin_lock_irq(&rh->region_lock);
482 if (!list_empty(&rh->quiesced_regions)) {
483 reg = list_entry(rh->quiesced_regions.next,
484 struct region, list);
485 list_del_init(&reg->list); /* remove from the quiesced list */
486 }
487 spin_unlock_irq(&rh->region_lock);
488
489 return reg;
490}
491
492/* FIXME: success ignored for now */
493static void rh_recovery_end(struct region *reg, int success)
494{
495 struct region_hash *rh = reg->rh;
496
497 spin_lock_irq(&rh->region_lock);
498 list_add(&reg->list, &reg->rh->recovered_regions);
499 spin_unlock_irq(&rh->region_lock);
500
501 wake();
502}
503
504static void rh_flush(struct region_hash *rh)
505{
506 rh->log->type->flush(rh->log);
507}
508
509static void rh_delay(struct region_hash *rh, struct bio *bio)
510{
511 struct region *reg;
512
513 read_lock(&rh->hash_lock);
514 reg = __rh_find(rh, bio_to_region(rh, bio));
515 bio_list_add(&reg->delayed_bios, bio);
516 read_unlock(&rh->hash_lock);
517}
518
519static void rh_stop_recovery(struct region_hash *rh)
520{
521 int i;
522
523 /* wait for any recovering regions */
524 for (i = 0; i < MAX_RECOVERY; i++)
525 down(&rh->recovery_count);
526}
527
528static void rh_start_recovery(struct region_hash *rh)
529{
530 int i;
531
532 for (i = 0; i < MAX_RECOVERY; i++)
533 up(&rh->recovery_count);
534
535 wake();
536}
537
538/*-----------------------------------------------------------------
539 * Mirror set structures.
540 *---------------------------------------------------------------*/
541struct mirror {
542 atomic_t error_count;
543 struct dm_dev *dev;
544 sector_t offset;
545};
546
547struct mirror_set {
548 struct dm_target *ti;
549 struct list_head list;
550 struct region_hash rh;
551 struct kcopyd_client *kcopyd_client;
552
553 spinlock_t lock; /* protects the next two lists */
554 struct bio_list reads;
555 struct bio_list writes;
556
557 /* recovery */
558 region_t nr_regions;
559 int in_sync;
560
561 unsigned int nr_mirrors;
562 struct mirror mirror[0];
563};
564
565/*
566 * Every mirror should look like this one.
567 */
568#define DEFAULT_MIRROR 0
569
570/*
571 * This is yucky. We squirrel the mirror_set struct away inside
572 * bi_next for write buffers. This is safe since the bh
573 * doesn't get submitted to the lower levels of block layer.
574 */
575static struct mirror_set *bio_get_ms(struct bio *bio)
576{
577 return (struct mirror_set *) bio->bi_next;
578}
579
580static void bio_set_ms(struct bio *bio, struct mirror_set *ms)
581{
582 bio->bi_next = (struct bio *) ms;
583}
584
585/*-----------------------------------------------------------------
586 * Recovery.
587 *
588 * When a mirror is first activated we may find that some regions
589 * are in the no-sync state. We have to recover these by
590 * recopying from the default mirror to all the others.
591 *---------------------------------------------------------------*/
592static void recovery_complete(int read_err, unsigned int write_err,
593 void *context)
594{
595 struct region *reg = (struct region *) context;
596
597 /* FIXME: better error handling */
598 rh_recovery_end(reg, read_err || write_err);
599}
600
601static int recover(struct mirror_set *ms, struct region *reg)
602{
603 int r;
604 unsigned int i;
605 struct io_region from, to[KCOPYD_MAX_REGIONS], *dest;
606 struct mirror *m;
607 unsigned long flags = 0;
608
609 /* fill in the source */
610 m = ms->mirror + DEFAULT_MIRROR;
611 from.bdev = m->dev->bdev;
612 from.sector = m->offset + region_to_sector(reg->rh, reg->key);
613 if (reg->key == (ms->nr_regions - 1)) {
614 /*
615 * The final region may be smaller than
616 * region_size.
617 */
618 from.count = ms->ti->len & (reg->rh->region_size - 1);
619 if (!from.count)
620 from.count = reg->rh->region_size;
621 } else
622 from.count = reg->rh->region_size;
623
624 /* fill in the destinations */
625 for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
626 if (i == DEFAULT_MIRROR)
627 continue;
628
629 m = ms->mirror + i;
630 dest->bdev = m->dev->bdev;
631 dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
632 dest->count = from.count;
633 dest++;
634 }
635
636 /* hand to kcopyd */
637 set_bit(KCOPYD_IGNORE_ERROR, &flags);
638 r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
639 recovery_complete, reg);
640
641 return r;
642}
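A worked example of the final-region computation above, with invented numbers: for a 10000-sector target and a 1024-sector region size the mask gives 10000 & 1023 = 784, so the last region copies only 784 sectors; had the length been an exact multiple of the region size the mask would give 0 and the full region size is used instead.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long len = 10000;		/* target length in sectors */
		unsigned long long region_size = 1024;	/* power of two */
		unsigned long long count = len & (region_size - 1);

		if (!count)				/* exact multiple of region_size */
			count = region_size;

		printf("final region covers %llu sectors\n", count);	/* 784 */
		return 0;
	}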
643
644static void do_recovery(struct mirror_set *ms)
645{
646 int r;
647 struct region *reg;
648 struct dirty_log *log = ms->rh.log;
649
650 /*
651 * Start quiescing some regions.
652 */
653 rh_recovery_prepare(&ms->rh);
654
655 /*
656 * Copy any already quiesced regions.
657 */
658 while ((reg = rh_recovery_start(&ms->rh))) {
659 r = recover(ms, reg);
660 if (r)
661 rh_recovery_end(reg, 0);
662 }
663
664 /*
665 * Update the in sync flag.
666 */
667 if (!ms->in_sync &&
668 (log->type->get_sync_count(log) == ms->nr_regions)) {
669 /* the sync is complete */
670 dm_table_event(ms->ti->table);
671 ms->in_sync = 1;
672 }
673}
674
675/*-----------------------------------------------------------------
676 * Reads
677 *---------------------------------------------------------------*/
678static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
679{
680 /* FIXME: add read balancing */
681 return ms->mirror + DEFAULT_MIRROR;
682}
683
684/*
685 * remap a buffer to a particular mirror.
686 */
687static void map_bio(struct mirror_set *ms, struct mirror *m, struct bio *bio)
688{
689 bio->bi_bdev = m->dev->bdev;
690 bio->bi_sector = m->offset + (bio->bi_sector - ms->ti->begin);
691}
692
693static void do_reads(struct mirror_set *ms, struct bio_list *reads)
694{
695 region_t region;
696 struct bio *bio;
697 struct mirror *m;
698
699 while ((bio = bio_list_pop(reads))) {
700 region = bio_to_region(&ms->rh, bio);
701
702 /*
703 * We can only read balance if the region is in sync.
704 */
705 if (rh_in_sync(&ms->rh, region, 0))
706 m = choose_mirror(ms, bio->bi_sector);
707 else
708 m = ms->mirror + DEFAULT_MIRROR;
709
710 map_bio(ms, m, bio);
711 generic_make_request(bio);
712 }
713}
714
715/*-----------------------------------------------------------------
716 * Writes.
717 *
718 * We do different things with the write io depending on the
719 * state of the region that it's in:
720 *
721 * SYNC: increment pending, use kcopyd to write to *all* mirrors
722 * RECOVERING: delay the io until recovery completes
723 * NOSYNC: increment pending, just write to the default mirror
724 *---------------------------------------------------------------*/
725static void write_callback(unsigned long error, void *context)
726{
727 unsigned int i;
728 int uptodate = 1;
729 struct bio *bio = (struct bio *) context;
730 struct mirror_set *ms;
731
732 ms = bio_get_ms(bio);
733 bio_set_ms(bio, NULL);
734
735 /*
736 * NOTE: We don't decrement the pending count here,
737 * instead it is done by the targets endio function.
738 * This way we handle both writes to SYNC and NOSYNC
739 * regions with the same code.
740 */
741
742 if (error) {
743 /*
744 * only error the io if all mirrors failed.
745 * FIXME: bogus
746 */
747 uptodate = 0;
748 for (i = 0; i < ms->nr_mirrors; i++)
749 if (!test_bit(i, &error)) {
750 uptodate = 1;
751 break;
752 }
753 }
754 bio_endio(bio, bio->bi_size, 0);
755}
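The error argument is treated as a bitmask with one bit per mirror destination, so the loop above declares the write good as long as any bit is clear. A user-space restatement of that check, with test_bit() replaced by plain shifts and an invented mask:

	#include <stdio.h>

	int main(void)
	{
		unsigned long error = 0x2;	/* one bit per mirror: only mirror 1 failed */
		unsigned int nr_mirrors = 3, i;
		int uptodate = 0;

		for (i = 0; i < nr_mirrors; i++)
			if (!((error >> i) & 1)) {	/* clear bit: that copy succeeded */
				uptodate = 1;
				break;
			}

		printf("uptodate = %d\n", uptodate);	/* 1: at least one good copy */
		return 0;
	}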
756
757static void do_write(struct mirror_set *ms, struct bio *bio)
758{
759 unsigned int i;
760 struct io_region io[KCOPYD_MAX_REGIONS+1];
761 struct mirror *m;
762
763 for (i = 0; i < ms->nr_mirrors; i++) {
764 m = ms->mirror + i;
765
766 io[i].bdev = m->dev->bdev;
767 io[i].sector = m->offset + (bio->bi_sector - ms->ti->begin);
768 io[i].count = bio->bi_size >> 9;
769 }
770
771 bio_set_ms(bio, ms);
772 dm_io_async_bvec(ms->nr_mirrors, io, WRITE,
773 bio->bi_io_vec + bio->bi_idx,
774 write_callback, bio);
775}
776
777static void do_writes(struct mirror_set *ms, struct bio_list *writes)
778{
779 int state;
780 struct bio *bio;
781 struct bio_list sync, nosync, recover, *this_list = NULL;
782
783 if (!writes->head)
784 return;
785
786 /*
787 * Classify each write.
788 */
789 bio_list_init(&sync);
790 bio_list_init(&nosync);
791 bio_list_init(&recover);
792
793 while ((bio = bio_list_pop(writes))) {
794 state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
795 switch (state) {
796 case RH_CLEAN:
797 case RH_DIRTY:
798 this_list = &sync;
799 break;
800
801 case RH_NOSYNC:
802 this_list = &nosync;
803 break;
804
805 case RH_RECOVERING:
806 this_list = &recover;
807 break;
808 }
809
810 bio_list_add(this_list, bio);
811 }
812
813 /*
814 * Increment the pending counts for any regions that will
815 * be written to (writes to recover regions are going to
816 * be delayed).
817 */
818 rh_inc_pending(&ms->rh, &sync);
819 rh_inc_pending(&ms->rh, &nosync);
820 rh_flush(&ms->rh);
821
822 /*
823 * Dispatch io.
824 */
825 while ((bio = bio_list_pop(&sync)))
826 do_write(ms, bio);
827
828 while ((bio = bio_list_pop(&recover)))
829 rh_delay(&ms->rh, bio);
830
831 while ((bio = bio_list_pop(&nosync))) {
832 map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
833 generic_make_request(bio);
834 }
835}
836
837/*-----------------------------------------------------------------
838 * kmirrord
839 *---------------------------------------------------------------*/
840static LIST_HEAD(_mirror_sets);
841static DECLARE_RWSEM(_mirror_sets_lock);
842
843static void do_mirror(struct mirror_set *ms)
844{
845 struct bio_list reads, writes;
846
847 spin_lock(&ms->lock);
848 reads = ms->reads;
849 writes = ms->writes;
850 bio_list_init(&ms->reads);
851 bio_list_init(&ms->writes);
852 spin_unlock(&ms->lock);
853
854 rh_update_states(&ms->rh);
855 do_recovery(ms);
856 do_reads(ms, &reads);
857 do_writes(ms, &writes);
858}
859
860static void do_work(void *ignored)
861{
862 struct mirror_set *ms;
863
864 down_read(&_mirror_sets_lock);
865 list_for_each_entry (ms, &_mirror_sets, list)
866 do_mirror(ms);
867 up_read(&_mirror_sets_lock);
868}
869
870/*-----------------------------------------------------------------
871 * Target functions
872 *---------------------------------------------------------------*/
873static struct mirror_set *alloc_context(unsigned int nr_mirrors,
874 uint32_t region_size,
875 struct dm_target *ti,
876 struct dirty_log *dl)
877{
878 size_t len;
879 struct mirror_set *ms = NULL;
880
881 if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
882 return NULL;
883
884 len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
885
886 ms = kmalloc(len, GFP_KERNEL);
887 if (!ms) {
888 ti->error = "dm-mirror: Cannot allocate mirror context";
889 return NULL;
890 }
891
892 memset(ms, 0, len);
893 spin_lock_init(&ms->lock);
894
895 ms->ti = ti;
896 ms->nr_mirrors = nr_mirrors;
897 ms->nr_regions = dm_sector_div_up(ti->len, region_size);
898 ms->in_sync = 0;
899
900 if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
901 ti->error = "dm-mirror: Error creating dirty region hash";
902 kfree(ms);
903 return NULL;
904 }
905
906 return ms;
907}
908
909static void free_context(struct mirror_set *ms, struct dm_target *ti,
910 unsigned int m)
911{
912 while (m--)
913 dm_put_device(ti, ms->mirror[m].dev);
914
915 rh_exit(&ms->rh);
916 kfree(ms);
917}
918
919static inline int _check_region_size(struct dm_target *ti, uint32_t size)
920{
921 return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
922 size > ti->len);
923}
924
925static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
926 unsigned int mirror, char **argv)
927{
928 sector_t offset;
929
930 if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) {
931 ti->error = "dm-mirror: Invalid offset";
932 return -EINVAL;
933 }
934
935 if (dm_get_device(ti, argv[0], offset, ti->len,
936 dm_table_get_mode(ti->table),
937 &ms->mirror[mirror].dev)) {
938 ti->error = "dm-mirror: Device lookup failure";
939 return -ENXIO;
940 }
941
942 ms->mirror[mirror].offset = offset;
943
944 return 0;
945}
946
947static int add_mirror_set(struct mirror_set *ms)
948{
949 down_write(&_mirror_sets_lock);
950 list_add_tail(&ms->list, &_mirror_sets);
951 up_write(&_mirror_sets_lock);
952 wake();
953
954 return 0;
955}
956
957static void del_mirror_set(struct mirror_set *ms)
958{
959 down_write(&_mirror_sets_lock);
960 list_del(&ms->list);
961 up_write(&_mirror_sets_lock);
962}
963
964/*
965 * Create dirty log: log_type #log_params <log_params>
966 */
967static struct dirty_log *create_dirty_log(struct dm_target *ti,
968 unsigned int argc, char **argv,
969 unsigned int *args_used)
970{
971 unsigned int param_count;
972 struct dirty_log *dl;
973
974 if (argc < 2) {
975 ti->error = "dm-mirror: Insufficient mirror log arguments";
976 return NULL;
977 }
978
979 if (sscanf(argv[1], "%u", &param_count) != 1) {
980 ti->error = "dm-mirror: Invalid mirror log argument count";
981 return NULL;
982 }
983
984 *args_used = 2 + param_count;
985
986 if (argc < *args_used) {
987 ti->error = "dm-mirror: Insufficient mirror log arguments";
988 return NULL;
989 }
990
991 dl = dm_create_dirty_log(argv[0], ti, param_count, argv + 2);
992 if (!dl) {
993 ti->error = "dm-mirror: Error creating mirror dirty log";
994 return NULL;
995 }
996
997 if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
998 ti->error = "dm-mirror: Invalid region size";
999 dm_destroy_dirty_log(dl);
1000 return NULL;
1001 }
1002
1003 return dl;
1004}
1005
1006/*
1007 * Construct a mirror mapping:
1008 *
1009 * log_type #log_params <log_params>
1010 * #mirrors [mirror_path offset]{2,}
1011 *
1012 * log_type is "core" or "disk"
1013 * #log_params is between 1 and 3
1014 */
1015#define DM_IO_PAGES 64
1016static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1017{
1018 int r;
1019 unsigned int nr_mirrors, m, args_used;
1020 struct mirror_set *ms;
1021 struct dirty_log *dl;
1022
1023 dl = create_dirty_log(ti, argc, argv, &args_used);
1024 if (!dl)
1025 return -EINVAL;
1026
1027 argv += args_used;
1028 argc -= args_used;
1029
1030 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
1031 nr_mirrors < 2 || nr_mirrors > KCOPYD_MAX_REGIONS + 1) {
1032 ti->error = "dm-mirror: Invalid number of mirrors";
1033 dm_destroy_dirty_log(dl);
1034 return -EINVAL;
1035 }
1036
1037 argv++, argc--;
1038
1039 if (argc != nr_mirrors * 2) {
1040 ti->error = "dm-mirror: Wrong number of mirror arguments";
1041 dm_destroy_dirty_log(dl);
1042 return -EINVAL;
1043 }
1044
1045 ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
1046 if (!ms) {
1047 dm_destroy_dirty_log(dl);
1048 return -ENOMEM;
1049 }
1050
1051 /* Get the mirror parameter sets */
1052 for (m = 0; m < nr_mirrors; m++) {
1053 r = get_mirror(ms, ti, m, argv);
1054 if (r) {
1055 free_context(ms, ti, m);
1056 return r;
1057 }
1058 argv += 2;
1059 argc -= 2;
1060 }
1061
1062 ti->private = ms;
1063
1064 r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
1065 if (r) {
1066 free_context(ms, ti, ms->nr_mirrors);
1067 return r;
1068 }
1069
1070 add_mirror_set(ms);
1071 return 0;
1072}
1073
1074static void mirror_dtr(struct dm_target *ti)
1075{
1076 struct mirror_set *ms = (struct mirror_set *) ti->private;
1077
1078 del_mirror_set(ms);
1079 kcopyd_client_destroy(ms->kcopyd_client);
1080 free_context(ms, ti, ms->nr_mirrors);
1081}
1082
1083static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
1084{
1085 int should_wake = 0;
1086 struct bio_list *bl;
1087
1088 bl = (rw == WRITE) ? &ms->writes : &ms->reads;
1089 spin_lock(&ms->lock);
1090 should_wake = !(bl->head);
1091 bio_list_add(bl, bio);
1092 spin_unlock(&ms->lock);
1093
1094 if (should_wake)
1095 wake();
1096}
1097
1098/*
1099 * Mirror mapping function
1100 */
1101static int mirror_map(struct dm_target *ti, struct bio *bio,
1102 union map_info *map_context)
1103{
1104 int r, rw = bio_rw(bio);
1105 struct mirror *m;
1106 struct mirror_set *ms = ti->private;
1107
1108 map_context->ll = bio->bi_sector >> ms->rh.region_shift;
1109
1110 if (rw == WRITE) {
1111 queue_bio(ms, bio, rw);
1112 return 0;
1113 }
1114
1115 r = ms->rh.log->type->in_sync(ms->rh.log,
1116 bio_to_region(&ms->rh, bio), 0);
1117 if (r < 0 && r != -EWOULDBLOCK)
1118 return r;
1119
1120 if (r == -EWOULDBLOCK) /* FIXME: ugly */
1121 r = 0;
1122
1123 /*
1124 * We don't want to fast track a recovery just for a read
1125 * ahead. So we just let it silently fail.
1126 * FIXME: get rid of this.
1127 */
1128 if (!r && rw == READA)
1129 return -EIO;
1130
1131 if (!r) {
1132 /* Pass this io over to the daemon */
1133 queue_bio(ms, bio, rw);
1134 return 0;
1135 }
1136
1137 m = choose_mirror(ms, bio->bi_sector);
1138 if (!m)
1139 return -EIO;
1140
1141 map_bio(ms, m, bio);
1142 return 1;
1143}
1144
1145static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1146 int error, union map_info *map_context)
1147{
1148 int rw = bio_rw(bio);
1149 struct mirror_set *ms = (struct mirror_set *) ti->private;
1150 region_t region = map_context->ll;
1151
1152 /*
1153 * We need to dec pending if this was a write.
1154 */
1155 if (rw == WRITE)
1156 rh_dec(&ms->rh, region);
1157
1158 return 0;
1159}
1160
1161static void mirror_postsuspend(struct dm_target *ti)
1162{
1163 struct mirror_set *ms = (struct mirror_set *) ti->private;
1164 struct dirty_log *log = ms->rh.log;
1165
1166 rh_stop_recovery(&ms->rh);
1167 if (log->type->suspend && log->type->suspend(log))
1168 /* FIXME: need better error handling */
1169 DMWARN("log suspend failed");
1170}
1171
1172static void mirror_resume(struct dm_target *ti)
1173{
1174 struct mirror_set *ms = (struct mirror_set *) ti->private;
1175 struct dirty_log *log = ms->rh.log;
1176 if (log->type->resume && log->type->resume(log))
1177 /* FIXME: need better error handling */
1178 DMWARN("log resume failed");
1179 rh_start_recovery(&ms->rh);
1180}
1181
1182static int mirror_status(struct dm_target *ti, status_type_t type,
1183 char *result, unsigned int maxlen)
1184{
1185 unsigned int m, sz;
1186 struct mirror_set *ms = (struct mirror_set *) ti->private;
1187
1188 sz = ms->rh.log->type->status(ms->rh.log, type, result, maxlen);
1189
1190 switch (type) {
1191 case STATUSTYPE_INFO:
1192 DMEMIT("%d ", ms->nr_mirrors);
1193 for (m = 0; m < ms->nr_mirrors; m++)
1194 DMEMIT("%s ", ms->mirror[m].dev->name);
1195
1196 DMEMIT(SECTOR_FORMAT "/" SECTOR_FORMAT,
1197 ms->rh.log->type->get_sync_count(ms->rh.log),
1198 ms->nr_regions);
1199 break;
1200
1201 case STATUSTYPE_TABLE:
1202 DMEMIT("%d ", ms->nr_mirrors);
1203 for (m = 0; m < ms->nr_mirrors; m++)
1204 DMEMIT("%s " SECTOR_FORMAT " ",
1205 ms->mirror[m].dev->name, ms->mirror[m].offset);
1206 }
1207
1208 return 0;
1209}
1210
1211static struct target_type mirror_target = {
1212 .name = "mirror",
1213 .version = {1, 0, 1},
1214 .module = THIS_MODULE,
1215 .ctr = mirror_ctr,
1216 .dtr = mirror_dtr,
1217 .map = mirror_map,
1218 .end_io = mirror_end_io,
1219 .postsuspend = mirror_postsuspend,
1220 .resume = mirror_resume,
1221 .status = mirror_status,
1222};
1223
1224static int __init dm_mirror_init(void)
1225{
1226 int r;
1227
1228 r = dm_dirty_log_init();
1229 if (r)
1230 return r;
1231
1232 _kmirrord_wq = create_workqueue("kmirrord");
1233 if (!_kmirrord_wq) {
1234 DMERR("couldn't start kmirrord");
1235 dm_dirty_log_exit();
1236 return r;
1237 }
1238 INIT_WORK(&_kmirrord_work, do_work, NULL);
1239
1240 r = dm_register_target(&mirror_target);
1241 if (r < 0) {
1242 DMERR("%s: Failed to register mirror target",
1243 mirror_target.name);
1244 dm_dirty_log_exit();
1245 destroy_workqueue(_kmirrord_wq);
1246 }
1247
1248 return r;
1249}
1250
1251static void __exit dm_mirror_exit(void)
1252{
1253 int r;
1254
1255 r = dm_unregister_target(&mirror_target);
1256 if (r < 0)
1257 DMERR("%s: unregister failed %d", mirror_target.name, r);
1258
1259 destroy_workqueue(_kmirrord_wq);
1260 dm_dirty_log_exit();
1261}
1262
1263/* Module hooks */
1264module_init(dm_mirror_init);
1265module_exit(dm_mirror_exit);
1266
1267MODULE_DESCRIPTION(DM_NAME " mirror target");
1268MODULE_AUTHOR("Joe Thornber");
1269MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
new file mode 100644
index 000000000000..d0024865a789
--- /dev/null
+++ b/drivers/md/dm-round-robin.c
@@ -0,0 +1,214 @@
1/*
2 * Copyright (C) 2003 Sistina Software.
3 * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
4 *
5 * Module Author: Heinz Mauelshagen
6 *
7 * This file is released under the GPL.
8 *
9 * Round-robin path selector.
10 */
11
12#include "dm.h"
13#include "dm-path-selector.h"
14
15#include <linux/slab.h>
16
17/*-----------------------------------------------------------------
18 * Path-handling code, paths are held in lists
19 *---------------------------------------------------------------*/
20struct path_info {
21 struct list_head list;
22 struct path *path;
23 unsigned repeat_count;
24};
25
26static void free_paths(struct list_head *paths)
27{
28 struct path_info *pi, *next;
29
30 list_for_each_entry_safe(pi, next, paths, list) {
31 list_del(&pi->list);
32 kfree(pi);
33 }
34}
35
36/*-----------------------------------------------------------------
37 * Round-robin selector
38 *---------------------------------------------------------------*/
39
40#define RR_MIN_IO 1000
41
42struct selector {
43 struct list_head valid_paths;
44 struct list_head invalid_paths;
45};
46
47static struct selector *alloc_selector(void)
48{
49 struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
50
51 if (s) {
52 INIT_LIST_HEAD(&s->valid_paths);
53 INIT_LIST_HEAD(&s->invalid_paths);
54 }
55
56 return s;
57}
58
59static int rr_create(struct path_selector *ps, unsigned argc, char **argv)
60{
61 struct selector *s;
62
63 s = alloc_selector();
64 if (!s)
65 return -ENOMEM;
66
67 ps->context = s;
68 return 0;
69}
70
71static void rr_destroy(struct path_selector *ps)
72{
73 struct selector *s = (struct selector *) ps->context;
74
75 free_paths(&s->valid_paths);
76 free_paths(&s->invalid_paths);
77 kfree(s);
78 ps->context = NULL;
79}
80
81static int rr_status(struct path_selector *ps, struct path *path,
82 status_type_t type, char *result, unsigned int maxlen)
83{
84 struct path_info *pi;
85 int sz = 0;
86
87 if (!path)
88 DMEMIT("0 ");
89 else {
90 switch(type) {
91 case STATUSTYPE_INFO:
92 break;
93 case STATUSTYPE_TABLE:
94 pi = path->pscontext;
95 DMEMIT("%u ", pi->repeat_count);
96 break;
97 }
98 }
99
100 return sz;
101}
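Taken together with the TABLE branch of multipath_status() earlier in this patch, a priority group using this selector contributes a fragment along these lines to the table output (device names and repeat counts invented):

    round-robin 0 2 1 8:16 1000 8:32 1000

    round-robin   selector name
    0             no selector-level arguments
    2 1           two paths, one per-path selector argument
    8:16 1000     first path and its repeat_count
    8:32 1000     second path and its repeat_count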
102
103/*
104 * Called during initialisation to register each path with an
105 * optional repeat_count.
106 */
107static int rr_add_path(struct path_selector *ps, struct path *path,
108 int argc, char **argv, char **error)
109{
110 struct selector *s = (struct selector *) ps->context;
111 struct path_info *pi;
112 unsigned repeat_count = RR_MIN_IO;
113
114 if (argc > 1) {
115 *error = "round-robin ps: incorrect number of arguments";
116 return -EINVAL;
117 }
118
119 /* First path argument is number of I/Os before switching path */
120 if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
121 *error = "round-robin ps: invalid repeat count";
122 return -EINVAL;
123 }
124
125 /* allocate the path */
126 pi = kmalloc(sizeof(*pi), GFP_KERNEL);
127 if (!pi) {
128 *error = "round-robin ps: Error allocating path context";
129 return -ENOMEM;
130 }
131
132 pi->path = path;
133 pi->repeat_count = repeat_count;
134
135 path->pscontext = pi;
136
137 list_add(&pi->list, &s->valid_paths);
138
139 return 0;
140}
141
142static void rr_fail_path(struct path_selector *ps, struct path *p)
143{
144 struct selector *s = (struct selector *) ps->context;
145 struct path_info *pi = p->pscontext;
146
147 list_move(&pi->list, &s->invalid_paths);
148}
149
150static int rr_reinstate_path(struct path_selector *ps, struct path *p)
151{
152 struct selector *s = (struct selector *) ps->context;
153 struct path_info *pi = p->pscontext;
154
155 list_move(&pi->list, &s->valid_paths);
156
157 return 0;
158}
159
160static struct path *rr_select_path(struct path_selector *ps,
161 unsigned *repeat_count)
162{
163 struct selector *s = (struct selector *) ps->context;
164 struct path_info *pi = NULL;
165
166 if (!list_empty(&s->valid_paths)) {
167 pi = list_entry(s->valid_paths.next, struct path_info, list);
168 list_move_tail(&pi->list, &s->valid_paths);
169 *repeat_count = pi->repeat_count;
170 }
171
172 return pi ? pi->path : NULL;
173}
174
175static struct path_selector_type rr_ps = {
176 .name = "round-robin",
177 .module = THIS_MODULE,
178 .table_args = 1,
179 .info_args = 0,
180 .create = rr_create,
181 .destroy = rr_destroy,
182 .status = rr_status,
183 .add_path = rr_add_path,
184 .fail_path = rr_fail_path,
185 .reinstate_path = rr_reinstate_path,
186 .select_path = rr_select_path,
187};
188
189static int __init dm_rr_init(void)
190{
191 int r = dm_register_path_selector(&rr_ps);
192
193 if (r < 0)
194 DMERR("round-robin: register failed %d", r);
195
196 DMINFO("dm-round-robin version 1.0.0 loaded");
197
198 return r;
199}
200
201static void __exit dm_rr_exit(void)
202{
203 int r = dm_unregister_path_selector(&rr_ps);
204
205 if (r < 0)
206 DMERR("round-robin: unregister failed %d", r);
207}
208
209module_init(dm_rr_init);
210module_exit(dm_rr_exit);
211
212MODULE_DESCRIPTION(DM_NAME " round-robin multipath path selector");
213MODULE_AUTHOR("Sistina Software <dm-devel@redhat.com>");
214MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
new file mode 100644
index 000000000000..7e691ab9a748
--- /dev/null
+++ b/drivers/md/dm-snap.c
@@ -0,0 +1,1208 @@
1/*
2 * dm-snapshot.c
3 *
4 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5 *
6 * This file is released under the GPL.
7 */
8
9#include <linux/blkdev.h>
10#include <linux/config.h>
11#include <linux/ctype.h>
12#include <linux/device-mapper.h>
13#include <linux/fs.h>
14#include <linux/init.h>
15#include <linux/kdev_t.h>
16#include <linux/list.h>
17#include <linux/mempool.h>
18#include <linux/module.h>
19#include <linux/slab.h>
20#include <linux/vmalloc.h>
21
22#include "dm-snap.h"
23#include "dm-bio-list.h"
24#include "kcopyd.h"
25
26/*
27 * The percentage increment we will wake up users at
28 */
29#define WAKE_UP_PERCENT 5
30
31/*
32 * kcopyd priority of snapshot operations
33 */
34#define SNAPSHOT_COPY_PRIORITY 2
35
36/*
37 * Each snapshot reserves this many pages for io
38 */
39#define SNAPSHOT_PAGES 256
40
41struct pending_exception {
42 struct exception e;
43
44 /*
45 * Origin buffers waiting for this to complete are held
46 * in a bio list
47 */
48 struct bio_list origin_bios;
49 struct bio_list snapshot_bios;
50
51 /*
52 * Other pending_exceptions that are processing this
53 * chunk. When this list is empty, we know we can
54 * complete the origins.
55 */
56 struct list_head siblings;
57
58 /* Pointer back to snapshot context */
59 struct dm_snapshot *snap;
60
61 /*
62 * 1 indicates the exception has already been sent to
63 * kcopyd.
64 */
65 int started;
66};
67
68/*
69 * Hash table mapping origin volumes to lists of snapshots and
70 * a lock to protect it
71 */
72static kmem_cache_t *exception_cache;
73static kmem_cache_t *pending_cache;
74static mempool_t *pending_pool;
75
76/*
77 * One of these per registered origin, held in the snapshot_origins hash
78 */
79struct origin {
80 /* The origin device */
81 struct block_device *bdev;
82
83 struct list_head hash_list;
84
85 /* List of snapshots for this origin */
86 struct list_head snapshots;
87};
88
89/*
90 * Size of the hash table for origin volumes. If we make this
91 * the size of the minors list then it should be nearly perfect
92 */
93#define ORIGIN_HASH_SIZE 256
94#define ORIGIN_MASK 0xFF
95static struct list_head *_origins;
96static struct rw_semaphore _origins_lock;
97
98static int init_origin_hash(void)
99{
100 int i;
101
102 _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
103 GFP_KERNEL);
104 if (!_origins) {
105 DMERR("Device mapper: Snapshot: unable to allocate memory");
106 return -ENOMEM;
107 }
108
109 for (i = 0; i < ORIGIN_HASH_SIZE; i++)
110 INIT_LIST_HEAD(_origins + i);
111 init_rwsem(&_origins_lock);
112
113 return 0;
114}
115
116static void exit_origin_hash(void)
117{
118 kfree(_origins);
119}
120
121static inline unsigned int origin_hash(struct block_device *bdev)
122{
123 return bdev->bd_dev & ORIGIN_MASK;
124}
125
126static struct origin *__lookup_origin(struct block_device *origin)
127{
128 struct list_head *ol;
129 struct origin *o;
130
131 ol = &_origins[origin_hash(origin)];
132 list_for_each_entry (o, ol, hash_list)
133 if (bdev_equal(o->bdev, origin))
134 return o;
135
136 return NULL;
137}
138
139static void __insert_origin(struct origin *o)
140{
141 struct list_head *sl = &_origins[origin_hash(o->bdev)];
142 list_add_tail(&o->hash_list, sl);
143}
144
145/*
146 * Make a note of the snapshot and its origin so we can look it
147 * up when the origin has a write on it.
148 */
149static int register_snapshot(struct dm_snapshot *snap)
150{
151 struct origin *o;
152 struct block_device *bdev = snap->origin->bdev;
153
154 down_write(&_origins_lock);
155 o = __lookup_origin(bdev);
156
157 if (!o) {
158 /* New origin */
159 o = kmalloc(sizeof(*o), GFP_KERNEL);
160 if (!o) {
161 up_write(&_origins_lock);
162 return -ENOMEM;
163 }
164
165 /* Initialise the struct */
166 INIT_LIST_HEAD(&o->snapshots);
167 o->bdev = bdev;
168
169 __insert_origin(o);
170 }
171
172 list_add_tail(&snap->list, &o->snapshots);
173
174 up_write(&_origins_lock);
175 return 0;
176}
177
178static void unregister_snapshot(struct dm_snapshot *s)
179{
180 struct origin *o;
181
182 down_write(&_origins_lock);
183 o = __lookup_origin(s->origin->bdev);
184
185 list_del(&s->list);
186 if (list_empty(&o->snapshots)) {
187 list_del(&o->hash_list);
188 kfree(o);
189 }
190
191 up_write(&_origins_lock);
192}
193
194/*
195 * Implementation of the exception hash tables.
196 */
197static int init_exception_table(struct exception_table *et, uint32_t size)
198{
199 unsigned int i;
200
201 et->hash_mask = size - 1;
202 et->table = dm_vcalloc(size, sizeof(struct list_head));
203 if (!et->table)
204 return -ENOMEM;
205
206 for (i = 0; i < size; i++)
207 INIT_LIST_HEAD(et->table + i);
208
209 return 0;
210}
211
212static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
213{
214 struct list_head *slot;
215 struct exception *ex, *next;
216 int i, size;
217
218 size = et->hash_mask + 1;
219 for (i = 0; i < size; i++) {
220 slot = et->table + i;
221
222 list_for_each_entry_safe (ex, next, slot, hash_list)
223 kmem_cache_free(mem, ex);
224 }
225
226 vfree(et->table);
227}
228
229static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
230{
231 return chunk & et->hash_mask;
232}
233
234static void insert_exception(struct exception_table *eh, struct exception *e)
235{
236 struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
237 list_add(&e->hash_list, l);
238}
239
240static inline void remove_exception(struct exception *e)
241{
242 list_del(&e->hash_list);
243}
244
245/*
246 * Return the exception data for a chunk, or NULL if it has not
247 * been remapped.
248 */
249static struct exception *lookup_exception(struct exception_table *et,
250 chunk_t chunk)
251{
252 struct list_head *slot;
253 struct exception *e;
254
255 slot = &et->table[exception_hash(et, chunk)];
256 list_for_each_entry (e, slot, hash_list)
257 if (e->old_chunk == chunk)
258 return e;
259
260 return NULL;
261}
262
263static inline struct exception *alloc_exception(void)
264{
265 struct exception *e;
266
267 e = kmem_cache_alloc(exception_cache, GFP_NOIO);
268 if (!e)
269 e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
270
271 return e;
272}
273
274static inline void free_exception(struct exception *e)
275{
276 kmem_cache_free(exception_cache, e);
277}
278
279static inline struct pending_exception *alloc_pending_exception(void)
280{
281 return mempool_alloc(pending_pool, GFP_NOIO);
282}
283
284static inline void free_pending_exception(struct pending_exception *pe)
285{
286 mempool_free(pe, pending_pool);
287}
288
289int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
290{
291 struct exception *e;
292
293 e = alloc_exception();
294 if (!e)
295 return -ENOMEM;
296
297 e->old_chunk = old;
298 e->new_chunk = new;
299 insert_exception(&s->complete, e);
300 return 0;
301}
302
303/*
304 * Hard coded magic.
305 */
306static int calc_max_buckets(void)
307{
308 /* use a fixed size of 2MB */
309 unsigned long mem = 2 * 1024 * 1024;
310 mem /= sizeof(struct list_head);
311
312 return mem;
313}
314
315/*
316 * Rounds a number down to a power of 2.
317 */
318static inline uint32_t round_down(uint32_t n)
319{
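	/*
	 * n &= (n - 1) clears the lowest set bit, so looping until only
	 * one bit is left yields the largest power of 2 <= n.
	 */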
320 while (n & (n - 1))
321 n &= (n - 1);
322 return n;
323}
324
325/*
326 * Allocate room for a suitable hash table.
327 */
328static int init_hash_tables(struct dm_snapshot *s)
329{
330 sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
331
332 /*
333 * Calculate based on the size of the original volume or
334 * the COW volume...
335 */
336 cow_dev_size = get_dev_size(s->cow->bdev);
337 origin_dev_size = get_dev_size(s->origin->bdev);
338 max_buckets = calc_max_buckets();
339
340 hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
341 hash_size = min(hash_size, max_buckets);
342
343 /* Round it down to a power of 2 */
344 hash_size = round_down(hash_size);
345 if (init_exception_table(&s->complete, hash_size))
346 return -ENOMEM;
347
348 /*
349 * Allocate hash table for in-flight exceptions
350 * Make this smaller than the real hash table
351 */
352 hash_size >>= 3;
353 if (hash_size < 64)
354 hash_size = 64;
355
356 if (init_exception_table(&s->pending, hash_size)) {
357 exit_exception_table(&s->complete, exception_cache);
358 return -ENOMEM;
359 }
360
361 return 0;
362}
363
364/*
365 * Round a number up to the nearest 'size' boundary. size must
366 * be a power of 2.
367 */
368static inline ulong round_up(ulong n, ulong size)
369{
370 size--;
371 return (n + size) & ~size;
372}
373
374/*
375 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
376 */
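/*
 * Illustrative example only (device names are hypothetical): a table
 * line of "0 2097152 snapshot /dev/vg/base /dev/vg/cow P 16" would map
 * a 1GiB snapshot with a persistent store and 16-sector (8KiB) chunks.
 */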
377static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
378{
379 struct dm_snapshot *s;
380 unsigned long chunk_size;
381 int r = -EINVAL;
382 char persistent;
383 char *origin_path;
384 char *cow_path;
385 char *value;
386 int blocksize;
387
388	if (argc != 4) {
389 ti->error = "dm-snapshot: requires exactly 4 arguments";
390 r = -EINVAL;
391 goto bad1;
392 }
393
394 origin_path = argv[0];
395 cow_path = argv[1];
396 persistent = toupper(*argv[2]);
397
398 if (persistent != 'P' && persistent != 'N') {
399 ti->error = "Persistent flag is not P or N";
400 r = -EINVAL;
401 goto bad1;
402 }
403
404 chunk_size = simple_strtoul(argv[3], &value, 10);
405 if (chunk_size == 0 || value == NULL) {
406 ti->error = "Invalid chunk size";
407 r = -EINVAL;
408 goto bad1;
409 }
410
411 s = kmalloc(sizeof(*s), GFP_KERNEL);
412 if (s == NULL) {
413 ti->error = "Cannot allocate snapshot context private "
414 "structure";
415 r = -ENOMEM;
416 goto bad1;
417 }
418
419 r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
420 if (r) {
421 ti->error = "Cannot get origin device";
422 goto bad2;
423 }
424
425 r = dm_get_device(ti, cow_path, 0, 0,
426 FMODE_READ | FMODE_WRITE, &s->cow);
427 if (r) {
428 dm_put_device(ti, s->origin);
429 ti->error = "Cannot get COW device";
430 goto bad2;
431 }
432
433 /*
434	 * Chunk size must be a multiple of the page size. Silently
435 * round up if it's not.
436 */
437 chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);
438
439 /* Validate the chunk size against the device block size */
440 blocksize = s->cow->bdev->bd_disk->queue->hardsect_size;
441 if (chunk_size % (blocksize >> 9)) {
442 ti->error = "Chunk size is not a multiple of device blocksize";
443 r = -EINVAL;
444 goto bad3;
445 }
446
447 /* Check chunk_size is a power of 2 */
448 if (chunk_size & (chunk_size - 1)) {
449 ti->error = "Chunk size is not a power of 2";
450 r = -EINVAL;
451 goto bad3;
452 }
453
454 s->chunk_size = chunk_size;
455 s->chunk_mask = chunk_size - 1;
456 s->type = persistent;
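	/* chunk_size is a power of 2, so ffs() - 1 is its log2 */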
457 s->chunk_shift = ffs(chunk_size) - 1;
458
459 s->valid = 1;
460 s->have_metadata = 0;
461 s->last_percent = 0;
462 init_rwsem(&s->lock);
463 s->table = ti->table;
464
465 /* Allocate hash table for COW data */
466 if (init_hash_tables(s)) {
467 ti->error = "Unable to allocate hash table space";
468 r = -ENOMEM;
469 goto bad3;
470 }
471
472 /*
473 * Check the persistent flag - done here because we need the iobuf
474 * to check the LV header
475 */
476 s->store.snap = s;
477
478 if (persistent == 'P')
479 r = dm_create_persistent(&s->store, chunk_size);
480 else
481 r = dm_create_transient(&s->store, s, blocksize);
482
483 if (r) {
484 ti->error = "Couldn't create exception store";
485 r = -EINVAL;
486 goto bad4;
487 }
488
489 r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
490 if (r) {
491 ti->error = "Could not create kcopyd client";
492 goto bad5;
493 }
494
495 /* Add snapshot to the list of snapshots for this origin */
496 if (register_snapshot(s)) {
497 r = -EINVAL;
498 ti->error = "Cannot register snapshot origin";
499 goto bad6;
500 }
501
502 ti->private = s;
503 ti->split_io = chunk_size;
504
505 return 0;
506
507 bad6:
508 kcopyd_client_destroy(s->kcopyd_client);
509
510 bad5:
511 s->store.destroy(&s->store);
512
513 bad4:
514 exit_exception_table(&s->pending, pending_cache);
515 exit_exception_table(&s->complete, exception_cache);
516
517 bad3:
518 dm_put_device(ti, s->cow);
519 dm_put_device(ti, s->origin);
520
521 bad2:
522 kfree(s);
523
524 bad1:
525 return r;
526}
527
528static void snapshot_dtr(struct dm_target *ti)
529{
530 struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
531
532 unregister_snapshot(s);
533
534 exit_exception_table(&s->pending, pending_cache);
535 exit_exception_table(&s->complete, exception_cache);
536
537 /* Deallocate memory used */
538 s->store.destroy(&s->store);
539
540 dm_put_device(ti, s->origin);
541 dm_put_device(ti, s->cow);
542 kcopyd_client_destroy(s->kcopyd_client);
543 kfree(s);
544}
545
546/*
547 * Flush a list of buffers.
548 */
549static void flush_bios(struct bio *bio)
550{
551 struct bio *n;
552
553 while (bio) {
554 n = bio->bi_next;
555 bio->bi_next = NULL;
556 generic_make_request(bio);
557 bio = n;
558 }
559}
560
561/*
562 * Error a list of buffers.
563 */
564static void error_bios(struct bio *bio)
565{
566 struct bio *n;
567
568 while (bio) {
569 n = bio->bi_next;
570 bio->bi_next = NULL;
571 bio_io_error(bio, bio->bi_size);
572 bio = n;
573 }
574}
575
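/*
 * If this pending exception is the last of its siblings, hand back the
 * queued origin bios for submission; otherwise pass them on to a
 * sibling that has not completed yet.
 */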
576static struct bio *__flush_bios(struct pending_exception *pe)
577{
578 struct pending_exception *sibling;
579
580 if (list_empty(&pe->siblings))
581 return bio_list_get(&pe->origin_bios);
582
583 sibling = list_entry(pe->siblings.next,
584 struct pending_exception, siblings);
585
586 list_del(&pe->siblings);
587
588 /* This is fine as long as kcopyd is single-threaded. If kcopyd
589 * becomes multi-threaded, we'll need some locking here.
590 */
591 bio_list_merge(&sibling->origin_bios, &pe->origin_bios);
592
593 return NULL;
594}
595
596static void pending_complete(struct pending_exception *pe, int success)
597{
598 struct exception *e;
599 struct dm_snapshot *s = pe->snap;
600 struct bio *flush = NULL;
601
602 if (success) {
603 e = alloc_exception();
604 if (!e) {
605 DMWARN("Unable to allocate exception.");
606 down_write(&s->lock);
607 s->store.drop_snapshot(&s->store);
608 s->valid = 0;
609 flush = __flush_bios(pe);
610 up_write(&s->lock);
611
612 error_bios(bio_list_get(&pe->snapshot_bios));
613 goto out;
614 }
615 *e = pe->e;
616
617 /*
618 * Add a proper exception, and remove the
619 * in-flight exception from the list.
620 */
621 down_write(&s->lock);
622 insert_exception(&s->complete, e);
623 remove_exception(&pe->e);
624 flush = __flush_bios(pe);
625
626 /* Submit any pending write bios */
627 up_write(&s->lock);
628
629 flush_bios(bio_list_get(&pe->snapshot_bios));
630 } else {
631 /* Read/write error - snapshot is unusable */
632 down_write(&s->lock);
633 if (s->valid)
634 DMERR("Error reading/writing snapshot");
635 s->store.drop_snapshot(&s->store);
636 s->valid = 0;
637 remove_exception(&pe->e);
638 flush = __flush_bios(pe);
639 up_write(&s->lock);
640
641 error_bios(bio_list_get(&pe->snapshot_bios));
642
643 dm_table_event(s->table);
644 }
645
646 out:
647 free_pending_exception(pe);
648
649 if (flush)
650 flush_bios(flush);
651}
652
653static void commit_callback(void *context, int success)
654{
655 struct pending_exception *pe = (struct pending_exception *) context;
656 pending_complete(pe, success);
657}
658
659/*
660 * Called when the copy I/O has finished. kcopyd actually runs
661 * this code so don't block.
662 */
663static void copy_callback(int read_err, unsigned int write_err, void *context)
664{
665 struct pending_exception *pe = (struct pending_exception *) context;
666 struct dm_snapshot *s = pe->snap;
667
668 if (read_err || write_err)
669 pending_complete(pe, 0);
670
671 else
672 /* Update the metadata if we are persistent */
673 s->store.commit_exception(&s->store, &pe->e, commit_callback,
674 pe);
675}
676
677/*
678 * Dispatches the copy operation to kcopyd.
679 */
680static inline void start_copy(struct pending_exception *pe)
681{
682 struct dm_snapshot *s = pe->snap;
683 struct io_region src, dest;
684 struct block_device *bdev = s->origin->bdev;
685 sector_t dev_size;
686
687 dev_size = get_dev_size(bdev);
688
689 src.bdev = bdev;
690 src.sector = chunk_to_sector(s, pe->e.old_chunk);
691 src.count = min(s->chunk_size, dev_size - src.sector);
692
693 dest.bdev = s->cow->bdev;
694 dest.sector = chunk_to_sector(s, pe->e.new_chunk);
695 dest.count = src.count;
696
697 /* Hand over to kcopyd */
698 kcopyd_copy(s->kcopyd_client,
699 &src, 1, &dest, 0, copy_callback, pe);
700}
701
702/*
703 * Looks to see if this snapshot already has a pending exception
704 * for this chunk, otherwise it allocates a new one and inserts
705 * it into the pending table.
706 *
707 * NOTE: a write lock must be held on snap->lock before calling
708 * this.
709 */
710static struct pending_exception *
711__find_pending_exception(struct dm_snapshot *s, struct bio *bio)
712{
713 struct exception *e;
714 struct pending_exception *pe;
715 chunk_t chunk = sector_to_chunk(s, bio->bi_sector);
716
717 /*
718 * Is there a pending exception for this already ?
719 */
720 e = lookup_exception(&s->pending, chunk);
721 if (e) {
722 /* cast the exception to a pending exception */
723 pe = container_of(e, struct pending_exception, e);
724
725 } else {
726 /*
727 * Create a new pending exception, we don't want
728 * to hold the lock while we do this.
729 */
730 up_write(&s->lock);
731 pe = alloc_pending_exception();
732 down_write(&s->lock);
733
734 e = lookup_exception(&s->pending, chunk);
735 if (e) {
736 free_pending_exception(pe);
737 pe = container_of(e, struct pending_exception, e);
738 } else {
739 pe->e.old_chunk = chunk;
740 bio_list_init(&pe->origin_bios);
741 bio_list_init(&pe->snapshot_bios);
742 INIT_LIST_HEAD(&pe->siblings);
743 pe->snap = s;
744 pe->started = 0;
745
746 if (s->store.prepare_exception(&s->store, &pe->e)) {
747 free_pending_exception(pe);
748 s->valid = 0;
749 return NULL;
750 }
751
752 insert_exception(&s->pending, &pe->e);
753 }
754 }
755
756 return pe;
757}
758
759static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
760 struct bio *bio)
761{
762 bio->bi_bdev = s->cow->bdev;
763 bio->bi_sector = chunk_to_sector(s, e->new_chunk) +
764 (bio->bi_sector & s->chunk_mask);
765}
766
767static int snapshot_map(struct dm_target *ti, struct bio *bio,
768 union map_info *map_context)
769{
770 struct exception *e;
771 struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
772 int r = 1;
773 chunk_t chunk;
774 struct pending_exception *pe;
775
776 chunk = sector_to_chunk(s, bio->bi_sector);
777
778 /* Full snapshots are not usable */
779 if (!s->valid)
780 return -1;
781
782 /*
783 * Write to snapshot - higher level takes care of RW/RO
784 * flags so we should only get this if we are
785 * writeable.
786 */
787 if (bio_rw(bio) == WRITE) {
788
789 /* FIXME: should only take write lock if we need
790 * to copy an exception */
791 down_write(&s->lock);
792
793 /* If the block is already remapped - use that, else remap it */
794 e = lookup_exception(&s->complete, chunk);
795 if (e) {
796 remap_exception(s, e, bio);
797 up_write(&s->lock);
798
799 } else {
800 pe = __find_pending_exception(s, bio);
801
802 if (!pe) {
803 if (s->store.drop_snapshot)
804 s->store.drop_snapshot(&s->store);
805 s->valid = 0;
806 r = -EIO;
807 up_write(&s->lock);
808 } else {
809 remap_exception(s, &pe->e, bio);
810 bio_list_add(&pe->snapshot_bios, bio);
811
812 if (!pe->started) {
813 /* this is protected by snap->lock */
814 pe->started = 1;
815 up_write(&s->lock);
816 start_copy(pe);
817 } else
818 up_write(&s->lock);
819 r = 0;
820 }
821 }
822
823 } else {
824 /*
825 * FIXME: this read path scares me because we
826 * always use the origin when we have a pending
827 * exception. However I can't think of a
828 * situation where this is wrong - ejt.
829 */
830
831 /* Do reads */
832 down_read(&s->lock);
833
834		/* See if it has been remapped */
835 e = lookup_exception(&s->complete, chunk);
836 if (e)
837 remap_exception(s, e, bio);
838 else
839 bio->bi_bdev = s->origin->bdev;
840
841 up_read(&s->lock);
842 }
843
844 return r;
845}
846
847static void snapshot_resume(struct dm_target *ti)
848{
849 struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
850
851 if (s->have_metadata)
852 return;
853
854 if (s->store.read_metadata(&s->store)) {
855 down_write(&s->lock);
856 s->valid = 0;
857 up_write(&s->lock);
858 }
859
860 s->have_metadata = 1;
861}
862
863static int snapshot_status(struct dm_target *ti, status_type_t type,
864 char *result, unsigned int maxlen)
865{
866 struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;
867
868 switch (type) {
869 case STATUSTYPE_INFO:
870 if (!snap->valid)
871 snprintf(result, maxlen, "Invalid");
872 else {
873 if (snap->store.fraction_full) {
874 sector_t numerator, denominator;
875 snap->store.fraction_full(&snap->store,
876 &numerator,
877 &denominator);
878 snprintf(result, maxlen,
879 SECTOR_FORMAT "/" SECTOR_FORMAT,
880 numerator, denominator);
881 }
882 else
883 snprintf(result, maxlen, "Unknown");
884 }
885 break;
886
887 case STATUSTYPE_TABLE:
888 /*
889		 * Report the origin and COW device names, the persistence
890		 * flag and the chunk size that make up this snapshot's
891		 * table line.
892 */
893 snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT,
894 snap->origin->name, snap->cow->name,
895 snap->type, snap->chunk_size);
896 break;
897 }
898
899 return 0;
900}
901
902/*-----------------------------------------------------------------
903 * Origin methods
904 *---------------------------------------------------------------*/
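/*
 * Splice two circular sibling lists into a single ring.
 */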
905static void list_merge(struct list_head *l1, struct list_head *l2)
906{
907 struct list_head *l1_n, *l2_p;
908
909 l1_n = l1->next;
910 l2_p = l2->prev;
911
912 l1->next = l2;
913 l2->prev = l1;
914
915 l2_p->next = l1_n;
916 l1_n->prev = l2_p;
917}
918
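/*
 * Trigger an exception in every valid snapshot on this origin that has
 * not yet copied the chunk being written.  Returns 0 if the bio must
 * wait behind the pending copies, 1 if it can be submitted straight
 * away.
 */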
919static int __origin_write(struct list_head *snapshots, struct bio *bio)
920{
921 int r = 1, first = 1;
922 struct dm_snapshot *snap;
923 struct exception *e;
924 struct pending_exception *pe, *last = NULL;
925 chunk_t chunk;
926
927 /* Do all the snapshots on this origin */
928 list_for_each_entry (snap, snapshots, list) {
929
930 /* Only deal with valid snapshots */
931 if (!snap->valid)
932 continue;
933
934 down_write(&snap->lock);
935
936 /*
937 * Remember, different snapshots can have
938 * different chunk sizes.
939 */
940 chunk = sector_to_chunk(snap, bio->bi_sector);
941
942 /*
943 * Check exception table to see if block
944 * is already remapped in this snapshot
945 * and trigger an exception if not.
946 */
947 e = lookup_exception(&snap->complete, chunk);
948 if (!e) {
949 pe = __find_pending_exception(snap, bio);
950 if (!pe) {
951 snap->store.drop_snapshot(&snap->store);
952 snap->valid = 0;
953
954 } else {
955 if (last)
956 list_merge(&pe->siblings,
957 &last->siblings);
958
959 last = pe;
960 r = 0;
961 }
962 }
963
964 up_write(&snap->lock);
965 }
966
967 /*
968 * Now that we have a complete pe list we can start the copying.
969 */
970 if (last) {
971 pe = last;
972 do {
973 down_write(&pe->snap->lock);
974 if (first)
975 bio_list_add(&pe->origin_bios, bio);
976 if (!pe->started) {
977 pe->started = 1;
978 up_write(&pe->snap->lock);
979 start_copy(pe);
980 } else
981 up_write(&pe->snap->lock);
982 first = 0;
983 pe = list_entry(pe->siblings.next,
984 struct pending_exception, siblings);
985
986 } while (pe != last);
987 }
988
989 return r;
990}
991
992/*
993 * Called on a write from the origin driver.
994 */
995static int do_origin(struct dm_dev *origin, struct bio *bio)
996{
997 struct origin *o;
998 int r = 1;
999
1000 down_read(&_origins_lock);
1001 o = __lookup_origin(origin->bdev);
1002 if (o)
1003 r = __origin_write(&o->snapshots, bio);
1004 up_read(&_origins_lock);
1005
1006 return r;
1007}
1008
1009/*
1010 * Origin: maps a linear range of a device, with hooks for snapshotting.
1011 */
1012
1013/*
1014 * Construct an origin mapping: <dev_path>
1015 * The context for an origin is merely a 'struct dm_dev *'
1016 * pointing to the real device.
1017 */
1018static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1019{
1020 int r;
1021 struct dm_dev *dev;
1022
1023 if (argc != 1) {
1024 ti->error = "dm-origin: incorrect number of arguments";
1025 return -EINVAL;
1026 }
1027
1028 r = dm_get_device(ti, argv[0], 0, ti->len,
1029 dm_table_get_mode(ti->table), &dev);
1030 if (r) {
1031 ti->error = "Cannot get target device";
1032 return r;
1033 }
1034
1035 ti->private = dev;
1036 return 0;
1037}
1038
1039static void origin_dtr(struct dm_target *ti)
1040{
1041 struct dm_dev *dev = (struct dm_dev *) ti->private;
1042 dm_put_device(ti, dev);
1043}
1044
1045static int origin_map(struct dm_target *ti, struct bio *bio,
1046 union map_info *map_context)
1047{
1048 struct dm_dev *dev = (struct dm_dev *) ti->private;
1049 bio->bi_bdev = dev->bdev;
1050
1051 /* Only tell snapshots if this is a write */
1052 return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1;
1053}
1054
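/*
 * Returns the minimum that is _not_ zero, unless both are zero
 * (same helper as the one defined in dm-table.c).
 */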
1055#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
1056
1057/*
1058 * Set the target "split_io" field to the minimum of all the snapshots'
1059 * chunk sizes.
1060 */
1061static void origin_resume(struct dm_target *ti)
1062{
1063 struct dm_dev *dev = (struct dm_dev *) ti->private;
1064 struct dm_snapshot *snap;
1065 struct origin *o;
1066 chunk_t chunk_size = 0;
1067
1068 down_read(&_origins_lock);
1069 o = __lookup_origin(dev->bdev);
1070 if (o)
1071 list_for_each_entry (snap, &o->snapshots, list)
1072 chunk_size = min_not_zero(chunk_size, snap->chunk_size);
1073 up_read(&_origins_lock);
1074
1075 ti->split_io = chunk_size;
1076}
1077
1078static int origin_status(struct dm_target *ti, status_type_t type, char *result,
1079 unsigned int maxlen)
1080{
1081 struct dm_dev *dev = (struct dm_dev *) ti->private;
1082
1083 switch (type) {
1084 case STATUSTYPE_INFO:
1085 result[0] = '\0';
1086 break;
1087
1088 case STATUSTYPE_TABLE:
1089 snprintf(result, maxlen, "%s", dev->name);
1090 break;
1091 }
1092
1093 return 0;
1094}
1095
1096static struct target_type origin_target = {
1097 .name = "snapshot-origin",
1098 .version = {1, 0, 1},
1099 .module = THIS_MODULE,
1100 .ctr = origin_ctr,
1101 .dtr = origin_dtr,
1102 .map = origin_map,
1103 .resume = origin_resume,
1104 .status = origin_status,
1105};
1106
1107static struct target_type snapshot_target = {
1108 .name = "snapshot",
1109 .version = {1, 0, 1},
1110 .module = THIS_MODULE,
1111 .ctr = snapshot_ctr,
1112 .dtr = snapshot_dtr,
1113 .map = snapshot_map,
1114 .resume = snapshot_resume,
1115 .status = snapshot_status,
1116};
1117
1118static int __init dm_snapshot_init(void)
1119{
1120 int r;
1121
1122 r = dm_register_target(&snapshot_target);
1123 if (r) {
1124 DMERR("snapshot target register failed %d", r);
1125 return r;
1126 }
1127
1128 r = dm_register_target(&origin_target);
1129 if (r < 0) {
1130		DMERR("Device mapper: Origin: register failed %d", r);
1131 goto bad1;
1132 }
1133
1134 r = init_origin_hash();
1135 if (r) {
1136 DMERR("init_origin_hash failed.");
1137 goto bad2;
1138 }
1139
1140 exception_cache = kmem_cache_create("dm-snapshot-ex",
1141 sizeof(struct exception),
1142 __alignof__(struct exception),
1143 0, NULL, NULL);
1144 if (!exception_cache) {
1145 DMERR("Couldn't create exception cache.");
1146 r = -ENOMEM;
1147 goto bad3;
1148 }
1149
1150 pending_cache =
1151 kmem_cache_create("dm-snapshot-in",
1152 sizeof(struct pending_exception),
1153 __alignof__(struct pending_exception),
1154 0, NULL, NULL);
1155 if (!pending_cache) {
1156 DMERR("Couldn't create pending cache.");
1157 r = -ENOMEM;
1158 goto bad4;
1159 }
1160
1161 pending_pool = mempool_create(128, mempool_alloc_slab,
1162 mempool_free_slab, pending_cache);
1163 if (!pending_pool) {
1164 DMERR("Couldn't create pending pool.");
1165 r = -ENOMEM;
1166 goto bad5;
1167 }
1168
1169 return 0;
1170
1171 bad5:
1172 kmem_cache_destroy(pending_cache);
1173 bad4:
1174 kmem_cache_destroy(exception_cache);
1175 bad3:
1176 exit_origin_hash();
1177 bad2:
1178 dm_unregister_target(&origin_target);
1179 bad1:
1180 dm_unregister_target(&snapshot_target);
1181 return r;
1182}
1183
1184static void __exit dm_snapshot_exit(void)
1185{
1186 int r;
1187
1188 r = dm_unregister_target(&snapshot_target);
1189 if (r)
1190 DMERR("snapshot unregister failed %d", r);
1191
1192 r = dm_unregister_target(&origin_target);
1193 if (r)
1194 DMERR("origin unregister failed %d", r);
1195
1196 exit_origin_hash();
1197 mempool_destroy(pending_pool);
1198 kmem_cache_destroy(pending_cache);
1199 kmem_cache_destroy(exception_cache);
1200}
1201
1202/* Module hooks */
1203module_init(dm_snapshot_init);
1204module_exit(dm_snapshot_exit);
1205
1206MODULE_DESCRIPTION(DM_NAME " snapshot target");
1207MODULE_AUTHOR("Joe Thornber");
1208MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-snap.h b/drivers/md/dm-snap.h
new file mode 100644
index 000000000000..375aa24d4d7d
--- /dev/null
+++ b/drivers/md/dm-snap.h
@@ -0,0 +1,161 @@
1/*
2 * dm-snapshot.c
3 *
4 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5 *
6 * This file is released under the GPL.
7 */
8
9#ifndef DM_SNAPSHOT_H
10#define DM_SNAPSHOT_H
11
12#include "dm.h"
13#include <linux/blkdev.h>
14
15struct exception_table {
16 uint32_t hash_mask;
17 struct list_head *table;
18};
19
20/*
21 * The snapshot code deals with largish chunks of the disk at a
22 * time. Typically 64k - 256k.
23 */
24/* FIXME: can we get away with limiting these to a uint32_t ? */
25typedef sector_t chunk_t;
26
27/*
28 * An exception is used where an old chunk of data has been
29 * replaced by a new one.
30 */
31struct exception {
32 struct list_head hash_list;
33
34 chunk_t old_chunk;
35 chunk_t new_chunk;
36};
37
38/*
39 * Abstraction to handle the meta/layout of exception stores (the
40 * COW device).
41 */
42struct exception_store {
43
44 /*
45 * Destroys this object when you've finished with it.
46 */
47 void (*destroy) (struct exception_store *store);
48
49 /*
50 * The target shouldn't read the COW device until this is
51 * called.
52 */
53 int (*read_metadata) (struct exception_store *store);
54
55 /*
56 * Find somewhere to store the next exception.
57 */
58 int (*prepare_exception) (struct exception_store *store,
59 struct exception *e);
60
61 /*
62 * Update the metadata with this exception.
63 */
64 void (*commit_exception) (struct exception_store *store,
65 struct exception *e,
66 void (*callback) (void *, int success),
67 void *callback_context);
68
69 /*
70 * The snapshot is invalid, note this in the metadata.
71 */
72 void (*drop_snapshot) (struct exception_store *store);
73
74 /*
75 * Return how full the snapshot is.
76 */
77 void (*fraction_full) (struct exception_store *store,
78 sector_t *numerator,
79 sector_t *denominator);
80
81 struct dm_snapshot *snap;
82 void *context;
83};
84
85struct dm_snapshot {
86 struct rw_semaphore lock;
87 struct dm_table *table;
88
89 struct dm_dev *origin;
90 struct dm_dev *cow;
91
92 /* List of snapshots per Origin */
93 struct list_head list;
94
95 /* Size of data blocks saved - must be a power of 2 */
96 chunk_t chunk_size;
97 chunk_t chunk_mask;
98 chunk_t chunk_shift;
99
100 /* You can't use a snapshot if this is 0 (e.g. if full) */
101 int valid;
102 int have_metadata;
103
104 /* Used for display of table */
105 char type;
106
107 /* The last percentage we notified */
108 int last_percent;
109
110 struct exception_table pending;
111 struct exception_table complete;
112
113 /* The on disk metadata handler */
114 struct exception_store store;
115
116 struct kcopyd_client *kcopyd_client;
117};
118
119/*
120 * Used by the exception stores to load exceptions when
121 * initialising.
122 */
123int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
124
125/*
126 * Constructor and destructor for the default persistent
127 * store.
128 */
129int dm_create_persistent(struct exception_store *store, uint32_t chunk_size);
130
131int dm_create_transient(struct exception_store *store,
132 struct dm_snapshot *s, int blocksize);
133
134/*
135 * Return the number of sectors in the device.
136 */
137static inline sector_t get_dev_size(struct block_device *bdev)
138{
139 return bdev->bd_inode->i_size >> SECTOR_SHIFT;
140}
141
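/*
 * chunk_shift is log2 of the chunk size in sectors, so these
 * conversions are simple shifts; sector_to_chunk first masks off
 * the offset within the chunk.
 */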
142static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
143{
144 return (sector & ~s->chunk_mask) >> s->chunk_shift;
145}
146
147static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
148{
149 return chunk << s->chunk_shift;
150}
151
152static inline int bdev_equal(struct block_device *lhs, struct block_device *rhs)
153{
154 /*
155 * There is only ever one instance of a particular block
156 * device so we can compare pointers safely.
157 */
158 return lhs == rhs;
159}
160
161#endif
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
new file mode 100644
index 000000000000..ab89278a56bf
--- /dev/null
+++ b/drivers/md/dm-stripe.c
@@ -0,0 +1,234 @@
1/*
2 * Copyright (C) 2001-2003 Sistina Software (UK) Limited.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8
9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/blkdev.h>
12#include <linux/bio.h>
13#include <linux/slab.h>
14
15struct stripe {
16 struct dm_dev *dev;
17 sector_t physical_start;
18};
19
20struct stripe_c {
21 uint32_t stripes;
22
23 /* The size of this target / num. stripes */
24 sector_t stripe_width;
25
26 /* stripe chunk size */
27 uint32_t chunk_shift;
28 sector_t chunk_mask;
29
30 struct stripe stripe[0];
31};
32
33static inline struct stripe_c *alloc_context(unsigned int stripes)
34{
35 size_t len;
36
37 if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
38 stripes))
39 return NULL;
40
41 len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
42
43 return kmalloc(len, GFP_KERNEL);
44}
45
46/*
47 * Parse a single <dev> <sector> pair
48 */
49static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
50 unsigned int stripe, char **argv)
51{
52 sector_t start;
53
54 if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1)
55 return -EINVAL;
56
57 if (dm_get_device(ti, argv[0], start, sc->stripe_width,
58 dm_table_get_mode(ti->table),
59 &sc->stripe[stripe].dev))
60 return -ENXIO;
61
62 sc->stripe[stripe].physical_start = start;
63 return 0;
64}
65
66/*
67 * Construct a striped mapping.
68 * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
69 */
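/*
 * Illustrative example only (device names are hypothetical): the
 * arguments "2 64 /dev/sda1 0 /dev/sdb1 0" stripe the target across
 * two devices in 64-sector (32KiB) chunks, each starting at offset 0.
 */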
70static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
71{
72 struct stripe_c *sc;
73 sector_t width;
74 uint32_t stripes;
75 uint32_t chunk_size;
76 char *end;
77 int r;
78 unsigned int i;
79
80 if (argc < 2) {
81 ti->error = "dm-stripe: Not enough arguments";
82 return -EINVAL;
83 }
84
85 stripes = simple_strtoul(argv[0], &end, 10);
86 if (*end) {
87 ti->error = "dm-stripe: Invalid stripe count";
88 return -EINVAL;
89 }
90
91 chunk_size = simple_strtoul(argv[1], &end, 10);
92 if (*end) {
93 ti->error = "dm-stripe: Invalid chunk_size";
94 return -EINVAL;
95 }
96
97 /*
98	 * chunk_size (in sectors) must be a power of two and at least one page
99 */
100 if (!chunk_size || (chunk_size & (chunk_size - 1)) ||
101 (chunk_size < (PAGE_SIZE >> SECTOR_SHIFT))) {
102 ti->error = "dm-stripe: Invalid chunk size";
103 return -EINVAL;
104 }
105
106 width = ti->len;
107 if (sector_div(width, stripes)) {
108		ti->error = "dm-stripe: Target length not divisible by "
109 "number of stripes";
110 return -EINVAL;
111 }
112
113 /*
114 * Do we have enough arguments for that many stripes ?
115 */
116 if (argc != (2 + 2 * stripes)) {
117 ti->error = "dm-stripe: Not enough destinations "
118 "specified";
119 return -EINVAL;
120 }
121
122 sc = alloc_context(stripes);
123 if (!sc) {
124 ti->error = "dm-stripe: Memory allocation for striped context "
125 "failed";
126 return -ENOMEM;
127 }
128
129 sc->stripes = stripes;
130 sc->stripe_width = width;
131 ti->split_io = chunk_size;
132
133 sc->chunk_mask = ((sector_t) chunk_size) - 1;
134 for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
135 chunk_size >>= 1;
136 sc->chunk_shift--;
137
138 /*
139 * Get the stripe destinations.
140 */
141 for (i = 0; i < stripes; i++) {
142 argv += 2;
143
144 r = get_stripe(ti, sc, i, argv);
145 if (r < 0) {
146 ti->error = "dm-stripe: Couldn't parse stripe "
147 "destination";
148 while (i--)
149 dm_put_device(ti, sc->stripe[i].dev);
150 kfree(sc);
151 return r;
152 }
153 }
154
155 ti->private = sc;
156 return 0;
157}
158
159static void stripe_dtr(struct dm_target *ti)
160{
161 unsigned int i;
162 struct stripe_c *sc = (struct stripe_c *) ti->private;
163
164 for (i = 0; i < sc->stripes; i++)
165 dm_put_device(ti, sc->stripe[i].dev);
166
167 kfree(sc);
168}
169
170static int stripe_map(struct dm_target *ti, struct bio *bio,
171 union map_info *map_context)
172{
173 struct stripe_c *sc = (struct stripe_c *) ti->private;
174
175 sector_t offset = bio->bi_sector - ti->begin;
176 sector_t chunk = offset >> sc->chunk_shift;
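	/*
	 * sector_div() divides 'chunk' by the stripe count in place and
	 * returns the remainder, i.e. the stripe this chunk belongs to.
	 */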
177 uint32_t stripe = sector_div(chunk, sc->stripes);
178
179 bio->bi_bdev = sc->stripe[stripe].dev->bdev;
180 bio->bi_sector = sc->stripe[stripe].physical_start +
181 (chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
182 return 1;
183}
184
185static int stripe_status(struct dm_target *ti,
186 status_type_t type, char *result, unsigned int maxlen)
187{
188 struct stripe_c *sc = (struct stripe_c *) ti->private;
189 unsigned int sz = 0;
190 unsigned int i;
191
192 switch (type) {
193 case STATUSTYPE_INFO:
194 result[0] = '\0';
195 break;
196
197 case STATUSTYPE_TABLE:
198 DMEMIT("%d " SECTOR_FORMAT, sc->stripes, sc->chunk_mask + 1);
199 for (i = 0; i < sc->stripes; i++)
200 DMEMIT(" %s " SECTOR_FORMAT, sc->stripe[i].dev->name,
201 sc->stripe[i].physical_start);
202 break;
203 }
204 return 0;
205}
206
207static struct target_type stripe_target = {
208 .name = "striped",
209 .version= {1, 0, 2},
210 .module = THIS_MODULE,
211 .ctr = stripe_ctr,
212 .dtr = stripe_dtr,
213 .map = stripe_map,
214 .status = stripe_status,
215};
216
217int __init dm_stripe_init(void)
218{
219 int r;
220
221 r = dm_register_target(&stripe_target);
222 if (r < 0)
223 DMWARN("striped target registration failed");
224
225 return r;
226}
227
228void dm_stripe_exit(void)
229{
230 if (dm_unregister_target(&stripe_target))
231 DMWARN("striped target unregistration failed");
232
233 return;
234}
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
new file mode 100644
index 000000000000..ee175d4906c4
--- /dev/null
+++ b/drivers/md/dm-table.c
@@ -0,0 +1,950 @@
1/*
2 * Copyright (C) 2001 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm.h"
9
10#include <linux/module.h>
11#include <linux/vmalloc.h>
12#include <linux/blkdev.h>
13#include <linux/namei.h>
14#include <linux/ctype.h>
15#include <linux/slab.h>
16#include <linux/interrupt.h>
17#include <asm/atomic.h>
18
19#define MAX_DEPTH 16
20#define NODE_SIZE L1_CACHE_BYTES
21#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
22#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
23
24struct dm_table {
25 atomic_t holders;
26
27 /* btree table */
28 unsigned int depth;
29 unsigned int counts[MAX_DEPTH]; /* in nodes */
30 sector_t *index[MAX_DEPTH];
31
32 unsigned int num_targets;
33 unsigned int num_allocated;
34 sector_t *highs;
35 struct dm_target *targets;
36
37 /*
38 * Indicates the rw permissions for the new logical
39 * device. This should be a combination of FMODE_READ
40 * and FMODE_WRITE.
41 */
42 int mode;
43
44 /* a list of devices used by this table */
45 struct list_head devices;
46
47 /*
48 * These are optimistic limits taken from all the
49 * targets, some targets will need smaller limits.
50 */
51 struct io_restrictions limits;
52
53 /* events get handed up using this callback */
54 void (*event_fn)(void *);
55 void *event_context;
56};
57
58/*
59 * Similar to ceiling(log_base(n))
60 */
61static unsigned int int_log(unsigned int n, unsigned int base)
62{
63 int result = 0;
64
65 while (n > 1) {
66 n = dm_div_up(n, base);
67 result++;
68 }
69
70 return result;
71}
72
73/*
74 * Returns the minimum that is _not_ zero, unless both are zero.
75 */
76#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
77
78/*
79 * Combine two io_restrictions, always taking the lower value.
80 */
81static void combine_restrictions_low(struct io_restrictions *lhs,
82 struct io_restrictions *rhs)
83{
84 lhs->max_sectors =
85 min_not_zero(lhs->max_sectors, rhs->max_sectors);
86
87 lhs->max_phys_segments =
88 min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments);
89
90 lhs->max_hw_segments =
91 min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments);
92
93 lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size);
94
95 lhs->max_segment_size =
96 min_not_zero(lhs->max_segment_size, rhs->max_segment_size);
97
98 lhs->seg_boundary_mask =
99 min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask);
100}
101
102/*
103 * Calculate the index of the child node of the n'th node k'th key.
104 */
105static inline unsigned int get_child(unsigned int n, unsigned int k)
106{
107 return (n * CHILDREN_PER_NODE) + k;
108}
109
110/*
111 * Return the n'th node of level l from table t.
112 */
113static inline sector_t *get_node(struct dm_table *t,
114 unsigned int l, unsigned int n)
115{
116 return t->index[l] + (n * KEYS_PER_NODE);
117}
118
119/*
120 * Return the highest key that you could lookup from the n'th
121 * node on level l of the btree.
122 */
123static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
124{
125 for (; l < t->depth - 1; l++)
126 n = get_child(n, CHILDREN_PER_NODE - 1);
127
128 if (n >= t->counts[l])
129 return (sector_t) - 1;
130
131 return get_node(t, l, n)[KEYS_PER_NODE - 1];
132}
133
134/*
135 * Fills in a level of the btree based on the highs of the level
136 * below it.
137 */
138static int setup_btree_index(unsigned int l, struct dm_table *t)
139{
140 unsigned int n, k;
141 sector_t *node;
142
143 for (n = 0U; n < t->counts[l]; n++) {
144 node = get_node(t, l, n);
145
146 for (k = 0U; k < KEYS_PER_NODE; k++)
147 node[k] = high(t, l + 1, get_child(n, k));
148 }
149
150 return 0;
151}
152
153void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
154{
155 unsigned long size;
156 void *addr;
157
158 /*
159 * Check that we're not going to overflow.
160 */
161 if (nmemb > (ULONG_MAX / elem_size))
162 return NULL;
163
164 size = nmemb * elem_size;
165 addr = vmalloc(size);
166 if (addr)
167 memset(addr, 0, size);
168
169 return addr;
170}
171
172/*
173 * highs, and targets are managed as dynamic arrays during a
174 * table load.
175 */
176static int alloc_targets(struct dm_table *t, unsigned int num)
177{
178 sector_t *n_highs;
179 struct dm_target *n_targets;
180 int n = t->num_targets;
181
182 /*
183 * Allocate both the target array and offset array at once.
184 */
185 n_highs = (sector_t *) dm_vcalloc(num, sizeof(struct dm_target) +
186 sizeof(sector_t));
187 if (!n_highs)
188 return -ENOMEM;
189
190 n_targets = (struct dm_target *) (n_highs + num);
191
192 if (n) {
193 memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
194 memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
195 }
196
197 memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
198 vfree(t->highs);
199
200 t->num_allocated = num;
201 t->highs = n_highs;
202 t->targets = n_targets;
203
204 return 0;
205}
206
207int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
208{
209 struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
210
211 if (!t)
212 return -ENOMEM;
213
214 memset(t, 0, sizeof(*t));
215 INIT_LIST_HEAD(&t->devices);
216 atomic_set(&t->holders, 1);
217
218 if (!num_targets)
219 num_targets = KEYS_PER_NODE;
220
221 num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
222
223 if (alloc_targets(t, num_targets)) {
224 kfree(t);
225 t = NULL;
226 return -ENOMEM;
227 }
228
229 t->mode = mode;
230 *result = t;
231 return 0;
232}
233
234static void free_devices(struct list_head *devices)
235{
236 struct list_head *tmp, *next;
237
238 for (tmp = devices->next; tmp != devices; tmp = next) {
239 struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
240 next = tmp->next;
241 kfree(dd);
242 }
243}
244
245void table_destroy(struct dm_table *t)
246{
247 unsigned int i;
248
249 /* free the indexes (see dm_table_complete) */
250 if (t->depth >= 2)
251 vfree(t->index[t->depth - 2]);
252
253 /* free the targets */
254 for (i = 0; i < t->num_targets; i++) {
255 struct dm_target *tgt = t->targets + i;
256
257 if (tgt->type->dtr)
258 tgt->type->dtr(tgt);
259
260 dm_put_target_type(tgt->type);
261 }
262
263 vfree(t->highs);
264
265 /* free the device list */
266 if (t->devices.next != &t->devices) {
267 DMWARN("devices still present during destroy: "
268		       "dm_put_device calls missing");
269
270 free_devices(&t->devices);
271 }
272
273 kfree(t);
274}
275
276void dm_table_get(struct dm_table *t)
277{
278 atomic_inc(&t->holders);
279}
280
281void dm_table_put(struct dm_table *t)
282{
283 if (!t)
284 return;
285
286 if (atomic_dec_and_test(&t->holders))
287 table_destroy(t);
288}
289
290/*
291 * Checks to see if we need to extend highs or targets.
292 */
293static inline int check_space(struct dm_table *t)
294{
295 if (t->num_targets >= t->num_allocated)
296 return alloc_targets(t, t->num_allocated * 2);
297
298 return 0;
299}
300
301/*
302 * Convert a device path to a dev_t.
303 */
304static int lookup_device(const char *path, dev_t *dev)
305{
306 int r;
307 struct nameidata nd;
308 struct inode *inode;
309
310 if ((r = path_lookup(path, LOOKUP_FOLLOW, &nd)))
311 return r;
312
313 inode = nd.dentry->d_inode;
314 if (!inode) {
315 r = -ENOENT;
316 goto out;
317 }
318
319 if (!S_ISBLK(inode->i_mode)) {
320 r = -ENOTBLK;
321 goto out;
322 }
323
324 *dev = inode->i_rdev;
325
326 out:
327 path_release(&nd);
328 return r;
329}
330
331/*
332 * See if we've already got a device in the list.
333 */
334static struct dm_dev *find_device(struct list_head *l, dev_t dev)
335{
336 struct dm_dev *dd;
337
338 list_for_each_entry (dd, l, list)
339 if (dd->bdev->bd_dev == dev)
340 return dd;
341
342 return NULL;
343}
344
345/*
346 * Open a device so we can use it as a map destination.
347 */
348static int open_dev(struct dm_dev *d, dev_t dev)
349{
350 static char *_claim_ptr = "I belong to device-mapper";
351 struct block_device *bdev;
352
353 int r;
354
355 if (d->bdev)
356 BUG();
357
358 bdev = open_by_devnum(dev, d->mode);
359 if (IS_ERR(bdev))
360 return PTR_ERR(bdev);
361 r = bd_claim(bdev, _claim_ptr);
362 if (r)
363 blkdev_put(bdev);
364 else
365 d->bdev = bdev;
366 return r;
367}
368
369/*
370 * Close a device that we've been using.
371 */
372static void close_dev(struct dm_dev *d)
373{
374 if (!d->bdev)
375 return;
376
377 bd_release(d->bdev);
378 blkdev_put(d->bdev);
379 d->bdev = NULL;
380}
381
382/*
383 * If possible (i.e. blk_size[major] is set), this checks that an
384 * area of a destination device is valid.
385 */
386static int check_device_area(struct dm_dev *dd, sector_t start, sector_t len)
387{
388 sector_t dev_size;
389 dev_size = dd->bdev->bd_inode->i_size >> SECTOR_SHIFT;
390 return ((start < dev_size) && (len <= (dev_size - start)));
391}
392
393/*
394 * This upgrades the mode on an already-open dm_dev, taking care
395 * to leave things as they were if we fail to reopen the
396 * device.
397 */
398static int upgrade_mode(struct dm_dev *dd, int new_mode)
399{
400 int r;
401 struct dm_dev dd_copy;
402 dev_t dev = dd->bdev->bd_dev;
403
404 dd_copy = *dd;
405
406 dd->mode |= new_mode;
407 dd->bdev = NULL;
408 r = open_dev(dd, dev);
409 if (!r)
410 close_dev(&dd_copy);
411 else
412 *dd = dd_copy;
413
414 return r;
415}
416
417/*
418 * Add a device to the list, or just increment the usage count if
419 * it's already present.
420 */
421static int __table_get_device(struct dm_table *t, struct dm_target *ti,
422 const char *path, sector_t start, sector_t len,
423 int mode, struct dm_dev **result)
424{
425 int r;
426 dev_t dev;
427 struct dm_dev *dd;
428 unsigned int major, minor;
429
430 if (!t)
431 BUG();
432
433 if (sscanf(path, "%u:%u", &major, &minor) == 2) {
434 /* Extract the major/minor numbers */
435 dev = MKDEV(major, minor);
436 if (MAJOR(dev) != major || MINOR(dev) != minor)
437 return -EOVERFLOW;
438 } else {
439 /* convert the path to a device */
440 if ((r = lookup_device(path, &dev)))
441 return r;
442 }
443
444 dd = find_device(&t->devices, dev);
445 if (!dd) {
446 dd = kmalloc(sizeof(*dd), GFP_KERNEL);
447 if (!dd)
448 return -ENOMEM;
449
450 dd->mode = mode;
451 dd->bdev = NULL;
452
453 if ((r = open_dev(dd, dev))) {
454 kfree(dd);
455 return r;
456 }
457
458 format_dev_t(dd->name, dev);
459
460 atomic_set(&dd->count, 0);
461 list_add(&dd->list, &t->devices);
462
463 } else if (dd->mode != (mode | dd->mode)) {
464 r = upgrade_mode(dd, mode);
465 if (r)
466 return r;
467 }
468 atomic_inc(&dd->count);
469
470 if (!check_device_area(dd, start, len)) {
471 DMWARN("device %s too small for target", path);
472 dm_put_device(ti, dd);
473 return -EINVAL;
474 }
475
476 *result = dd;
477
478 return 0;
479}
480
481
482int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
483 sector_t len, int mode, struct dm_dev **result)
484{
485 int r = __table_get_device(ti->table, ti, path,
486 start, len, mode, result);
487 if (!r) {
488 request_queue_t *q = bdev_get_queue((*result)->bdev);
489 struct io_restrictions *rs = &ti->limits;
490
491 /*
492 * Combine the device limits low.
493 *
494 * FIXME: if we move an io_restriction struct
495 * into q this would just be a call to
496 * combine_restrictions_low()
497 */
498 rs->max_sectors =
499 min_not_zero(rs->max_sectors, q->max_sectors);
500
501 /* FIXME: Device-Mapper on top of RAID-0 breaks because DM
502 * currently doesn't honor MD's merge_bvec_fn routine.
503 * In this case, we'll force DM to use PAGE_SIZE or
504 * smaller I/O, just to be safe. A better fix is in the
505 * works, but add this for the time being so it will at
506 * least operate correctly.
507 */
508 if (q->merge_bvec_fn)
509 rs->max_sectors =
510 min_not_zero(rs->max_sectors,
511 (unsigned short)(PAGE_SIZE >> 9));
512
513 rs->max_phys_segments =
514 min_not_zero(rs->max_phys_segments,
515 q->max_phys_segments);
516
517 rs->max_hw_segments =
518 min_not_zero(rs->max_hw_segments, q->max_hw_segments);
519
520 rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size);
521
522 rs->max_segment_size =
523 min_not_zero(rs->max_segment_size, q->max_segment_size);
524
525 rs->seg_boundary_mask =
526 min_not_zero(rs->seg_boundary_mask,
527 q->seg_boundary_mask);
528 }
529
530 return r;
531}
532
533/*
534 * Decrement a device's use count and remove it if necessary.
535 */
536void dm_put_device(struct dm_target *ti, struct dm_dev *dd)
537{
538 if (atomic_dec_and_test(&dd->count)) {
539 close_dev(dd);
540 list_del(&dd->list);
541 kfree(dd);
542 }
543}
544
545/*
546 * Checks to see if the target joins onto the end of the table.
547 */
548static int adjoin(struct dm_table *table, struct dm_target *ti)
549{
550 struct dm_target *prev;
551
552 if (!table->num_targets)
553 return !ti->begin;
554
555 prev = &table->targets[table->num_targets - 1];
556 return (ti->begin == (prev->begin + prev->len));
557}
558
559/*
560 * Used to dynamically allocate the arg array.
561 */
562static char **realloc_argv(unsigned *array_size, char **old_argv)
563{
564 char **argv;
565 unsigned new_size;
566
567 new_size = *array_size ? *array_size * 2 : 64;
568 argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL);
569 if (argv) {
570 memcpy(argv, old_argv, *array_size * sizeof(*argv));
571 *array_size = new_size;
572 }
573
574 kfree(old_argv);
575 return argv;
576}
577
578/*
579 * Destructively splits up the argument list to pass to ctr.
580 */
581int dm_split_args(int *argc, char ***argvp, char *input)
582{
583 char *start, *end = input, *out, **argv = NULL;
584 unsigned array_size = 0;
585
586 *argc = 0;
587 argv = realloc_argv(&array_size, argv);
588 if (!argv)
589 return -ENOMEM;
590
591 while (1) {
592 start = end;
593
594 /* Skip whitespace */
595 while (*start && isspace(*start))
596 start++;
597
598 if (!*start)
599 break; /* success, we hit the end */
600
601		/* 'out' is used to strip the backslash escapes */
602 end = out = start;
603 while (*end) {
604 /* Everything apart from '\0' can be quoted */
605 if (*end == '\\' && *(end + 1)) {
606 *out++ = *(end + 1);
607 end += 2;
608 continue;
609 }
610
611 if (isspace(*end))
612 break; /* end of token */
613
614 *out++ = *end++;
615 }
616
617 /* have we already filled the array ? */
618 if ((*argc + 1) > array_size) {
619 argv = realloc_argv(&array_size, argv);
620 if (!argv)
621 return -ENOMEM;
622 }
623
624 /* we know this is whitespace */
625 if (*end)
626 end++;
627
628 /* terminate the string and put it in the array */
629 *out = '\0';
630 argv[*argc] = start;
631 (*argc)++;
632 }
633
634 *argvp = argv;
635 return 0;
636}
637
638static void check_for_valid_limits(struct io_restrictions *rs)
639{
640 if (!rs->max_sectors)
641 rs->max_sectors = MAX_SECTORS;
642 if (!rs->max_phys_segments)
643 rs->max_phys_segments = MAX_PHYS_SEGMENTS;
644 if (!rs->max_hw_segments)
645 rs->max_hw_segments = MAX_HW_SEGMENTS;
646 if (!rs->hardsect_size)
647 rs->hardsect_size = 1 << SECTOR_SHIFT;
648 if (!rs->max_segment_size)
649 rs->max_segment_size = MAX_SEGMENT_SIZE;
650 if (!rs->seg_boundary_mask)
651 rs->seg_boundary_mask = -1;
652}
653
654int dm_table_add_target(struct dm_table *t, const char *type,
655 sector_t start, sector_t len, char *params)
656{
657 int r = -EINVAL, argc;
658 char **argv;
659 struct dm_target *tgt;
660
661 if ((r = check_space(t)))
662 return r;
663
664 tgt = t->targets + t->num_targets;
665 memset(tgt, 0, sizeof(*tgt));
666
667 if (!len) {
668 tgt->error = "zero-length target";
669 DMERR("%s", tgt->error);
670 return -EINVAL;
671 }
672
673 tgt->type = dm_get_target_type(type);
674 if (!tgt->type) {
675 tgt->error = "unknown target type";
676 DMERR("%s", tgt->error);
677 return -EINVAL;
678 }
679
680 tgt->table = t;
681 tgt->begin = start;
682 tgt->len = len;
683 tgt->error = "Unknown error";
684
685 /*
686 * Does this target adjoin the previous one ?
687 */
688 if (!adjoin(t, tgt)) {
689 tgt->error = "Gap in table";
690 r = -EINVAL;
691 goto bad;
692 }
693
694 r = dm_split_args(&argc, &argv, params);
695 if (r) {
696 tgt->error = "couldn't split parameters (insufficient memory)";
697 goto bad;
698 }
699
700 r = tgt->type->ctr(tgt, argc, argv);
701 kfree(argv);
702 if (r)
703 goto bad;
704
705 t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
706
707 /* FIXME: the plan is to combine high here and then have
708 * the merge fn apply the target level restrictions. */
709 combine_restrictions_low(&t->limits, &tgt->limits);
710 return 0;
711
712 bad:
713 DMERR("%s", tgt->error);
714 dm_put_target_type(tgt->type);
715 return r;
716}
717
718static int setup_indexes(struct dm_table *t)
719{
720 int i;
721 unsigned int total = 0;
722 sector_t *indexes;
723
724 /* allocate the space for *all* the indexes */
725 for (i = t->depth - 2; i >= 0; i--) {
726 t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
727 total += t->counts[i];
728 }
729
730 indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE);
731 if (!indexes)
732 return -ENOMEM;
733
734 /* set up internal nodes, bottom-up */
735 for (i = t->depth - 2, total = 0; i >= 0; i--) {
736 t->index[i] = indexes;
737 indexes += (KEYS_PER_NODE * t->counts[i]);
738 setup_btree_index(i, t);
739 }
740
741 return 0;
742}
743
744/*
745 * Builds the btree to index the map.
746 */
747int dm_table_complete(struct dm_table *t)
748{
749 int r = 0;
750 unsigned int leaf_nodes;
751
752 check_for_valid_limits(&t->limits);
753
754 /* how many indexes will the btree have ? */
755 leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
756 t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
757
758 /* leaf layer has already been set up */
759 t->counts[t->depth - 1] = leaf_nodes;
760 t->index[t->depth - 1] = t->highs;
761
762 if (t->depth >= 2)
763 r = setup_indexes(t);
764
765 return r;
766}
767
768static DECLARE_MUTEX(_event_lock);
769void dm_table_event_callback(struct dm_table *t,
770 void (*fn)(void *), void *context)
771{
772 down(&_event_lock);
773 t->event_fn = fn;
774 t->event_context = context;
775 up(&_event_lock);
776}
777
778void dm_table_event(struct dm_table *t)
779{
780 /*
781 * You can no longer call dm_table_event() from interrupt
782 * context, use a bottom half instead.
783 */
784 BUG_ON(in_interrupt());
785
786 down(&_event_lock);
787 if (t->event_fn)
788 t->event_fn(t->event_context);
789 up(&_event_lock);
790}
791
792sector_t dm_table_get_size(struct dm_table *t)
793{
794 return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
795}
796
797struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
798{
799 if (index > t->num_targets)
800 return NULL;
801
802 return t->targets + index;
803}
804
805/*
806 * Search the btree for the correct target.
807 */
808struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
809{
810 unsigned int l, n = 0, k = 0;
811 sector_t *node;
812
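	/*
	 * Walk down the index: at each level pick the first key that is
	 * >= sector and descend into that child.  Leaf keys are each
	 * target's last sector ('highs'), so the final slot identifies
	 * the target covering this sector.
	 */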
813 for (l = 0; l < t->depth; l++) {
814 n = get_child(n, k);
815 node = get_node(t, l, n);
816
817 for (k = 0; k < KEYS_PER_NODE; k++)
818 if (node[k] >= sector)
819 break;
820 }
821
822 return &t->targets[(KEYS_PER_NODE * n) + k];
823}
824
825void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
826{
827 /*
828 * Make sure we obey the optimistic sub devices
829 * restrictions.
830 */
831 blk_queue_max_sectors(q, t->limits.max_sectors);
832 q->max_phys_segments = t->limits.max_phys_segments;
833 q->max_hw_segments = t->limits.max_hw_segments;
834 q->hardsect_size = t->limits.hardsect_size;
835 q->max_segment_size = t->limits.max_segment_size;
836 q->seg_boundary_mask = t->limits.seg_boundary_mask;
837}
838
839unsigned int dm_table_get_num_targets(struct dm_table *t)
840{
841 return t->num_targets;
842}
843
844struct list_head *dm_table_get_devices(struct dm_table *t)
845{
846 return &t->devices;
847}
848
849int dm_table_get_mode(struct dm_table *t)
850{
851 return t->mode;
852}
853
854static void suspend_targets(struct dm_table *t, unsigned postsuspend)
855{
856 int i = t->num_targets;
857 struct dm_target *ti = t->targets;
858
859 while (i--) {
860 if (postsuspend) {
861 if (ti->type->postsuspend)
862 ti->type->postsuspend(ti);
863 } else if (ti->type->presuspend)
864 ti->type->presuspend(ti);
865
866 ti++;
867 }
868}
869
870void dm_table_presuspend_targets(struct dm_table *t)
871{
872 return suspend_targets(t, 0);
873}
874
875void dm_table_postsuspend_targets(struct dm_table *t)
876{
877 return suspend_targets(t, 1);
878}
879
880void dm_table_resume_targets(struct dm_table *t)
881{
882 int i;
883
884 for (i = 0; i < t->num_targets; i++) {
885 struct dm_target *ti = t->targets + i;
886
887 if (ti->type->resume)
888 ti->type->resume(ti);
889 }
890}
891
892int dm_table_any_congested(struct dm_table *t, int bdi_bits)
893{
894 struct list_head *d, *devices;
895 int r = 0;
896
897 devices = dm_table_get_devices(t);
898 for (d = devices->next; d != devices; d = d->next) {
899 struct dm_dev *dd = list_entry(d, struct dm_dev, list);
900 request_queue_t *q = bdev_get_queue(dd->bdev);
901 r |= bdi_congested(&q->backing_dev_info, bdi_bits);
902 }
903
904 return r;
905}
906
907void dm_table_unplug_all(struct dm_table *t)
908{
909 struct list_head *d, *devices = dm_table_get_devices(t);
910
911 for (d = devices->next; d != devices; d = d->next) {
912 struct dm_dev *dd = list_entry(d, struct dm_dev, list);
913 request_queue_t *q = bdev_get_queue(dd->bdev);
914
915 if (q->unplug_fn)
916 q->unplug_fn(q);
917 }
918}
919
920int dm_table_flush_all(struct dm_table *t)
921{
922 struct list_head *d, *devices = dm_table_get_devices(t);
923 int ret = 0;
924
925 for (d = devices->next; d != devices; d = d->next) {
926 struct dm_dev *dd = list_entry(d, struct dm_dev, list);
927 request_queue_t *q = bdev_get_queue(dd->bdev);
928 int err;
929
930 if (!q->issue_flush_fn)
931 err = -EOPNOTSUPP;
932 else
933 err = q->issue_flush_fn(q, dd->bdev->bd_disk, NULL);
934
935 if (!ret)
936 ret = err;
937 }
938
939 return ret;
940}
941
942EXPORT_SYMBOL(dm_vcalloc);
943EXPORT_SYMBOL(dm_get_device);
944EXPORT_SYMBOL(dm_put_device);
945EXPORT_SYMBOL(dm_table_event);
946EXPORT_SYMBOL(dm_table_get_mode);
947EXPORT_SYMBOL(dm_table_put);
948EXPORT_SYMBOL(dm_table_get);
949EXPORT_SYMBOL(dm_table_unplug_all);
950EXPORT_SYMBOL(dm_table_flush_all);
diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c
new file mode 100644
index 000000000000..aecd9e0c2616
--- /dev/null
+++ b/drivers/md/dm-target.c
@@ -0,0 +1,196 @@
1/*
2 * Copyright (C) 2001 Sistina Software (UK) Limited
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8
9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/kmod.h>
12#include <linux/bio.h>
13#include <linux/slab.h>
14
15struct tt_internal {
16 struct target_type tt;
17
18 struct list_head list;
19 long use;
20};
21
22static LIST_HEAD(_targets);
23static DECLARE_RWSEM(_lock);
24
25#define DM_MOD_NAME_SIZE 32
26
27static inline struct tt_internal *__find_target_type(const char *name)
28{
29 struct tt_internal *ti;
30
31 list_for_each_entry (ti, &_targets, list)
32 if (!strcmp(name, ti->tt.name))
33 return ti;
34
35 return NULL;
36}
37
38static struct tt_internal *get_target_type(const char *name)
39{
40 struct tt_internal *ti;
41
42 down_read(&_lock);
43
44 ti = __find_target_type(name);
45 if (ti) {
46 if ((ti->use == 0) && !try_module_get(ti->tt.module))
47 ti = NULL;
48 else
49 ti->use++;
50 }
51
52 up_read(&_lock);
53 return ti;
54}
55
56static void load_module(const char *name)
57{
58 request_module("dm-%s", name);
59}
60
61struct target_type *dm_get_target_type(const char *name)
62{
63 struct tt_internal *ti = get_target_type(name);
64
65 if (!ti) {
66 load_module(name);
67 ti = get_target_type(name);
68 }
69
70 return ti ? &ti->tt : NULL;
71}
72
73void dm_put_target_type(struct target_type *t)
74{
75 struct tt_internal *ti = (struct tt_internal *) t;
76
77 down_read(&_lock);
78 if (--ti->use == 0)
79 module_put(ti->tt.module);
80
81 if (ti->use < 0)
82 BUG();
83 up_read(&_lock);
84
85 return;
86}
87
88static struct tt_internal *alloc_target(struct target_type *t)
89{
90 struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
91
92 if (ti) {
93 memset(ti, 0, sizeof(*ti));
94 ti->tt = *t;
95 }
96
97 return ti;
98}
99
100
101int dm_target_iterate(void (*iter_func)(struct target_type *tt,
102 void *param), void *param)
103{
104 struct tt_internal *ti;
105
106 down_read(&_lock);
107 list_for_each_entry (ti, &_targets, list)
108 iter_func(&ti->tt, param);
109 up_read(&_lock);
110
111 return 0;
112}
113
114int dm_register_target(struct target_type *t)
115{
116 int rv = 0;
117 struct tt_internal *ti = alloc_target(t);
118
119 if (!ti)
120 return -ENOMEM;
121
122 down_write(&_lock);
123 if (__find_target_type(t->name))
124 rv = -EEXIST;
125 else
126 list_add(&ti->list, &_targets);
127
128 up_write(&_lock);
129 if (rv)
130 kfree(ti);
131 return rv;
132}
133
134int dm_unregister_target(struct target_type *t)
135{
136 struct tt_internal *ti;
137
138 down_write(&_lock);
139 if (!(ti = __find_target_type(t->name))) {
140 up_write(&_lock);
141 return -EINVAL;
142 }
143
144 if (ti->use) {
145 up_write(&_lock);
146 return -ETXTBSY;
147 }
148
149 list_del(&ti->list);
150 kfree(ti);
151
152 up_write(&_lock);
153 return 0;
154}
155
156/*
157 * io-err: always fails an io, useful for bringing
158 * up LVs that have holes in them.
159 */
160static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
161{
162 return 0;
163}
164
165static void io_err_dtr(struct dm_target *ti)
166{
167 /* empty */
168}
169
170static int io_err_map(struct dm_target *ti, struct bio *bio,
171 union map_info *map_context)
172{
173 return -EIO;
174}
175
176static struct target_type error_target = {
177 .name = "error",
178 .version = {1, 0, 1},
179 .ctr = io_err_ctr,
180 .dtr = io_err_dtr,
181 .map = io_err_map,
182};
183
184int __init dm_target_init(void)
185{
186 return dm_register_target(&error_target);
187}
188
189void dm_target_exit(void)
190{
191 if (dm_unregister_target(&error_target))
192 DMWARN("error target unregistration failed");
193}
194
195EXPORT_SYMBOL(dm_register_target);
196EXPORT_SYMBOL(dm_unregister_target);
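dm_get_target_type() above resolves a target type by name, pulling in the matching "dm-<name>" module on demand, and dm_put_target_type() drops the reference again. A minimal sketch of how a caller might use the pair -- the wrapper function and its warning text are illustrative assumptions, not part of the patch:

    /* Illustrative sketch only: look up and release a target type. */
    static int use_target_type_sketch(const char *name)
    {
            struct target_type *tt = dm_get_target_type(name);

            if (!tt) {
                    DMWARN("unknown target type %s", name);
                    return -EINVAL;
            }

            /* ... table code would now drive tt->ctr/tt->map via a dm_target ... */

            dm_put_target_type(tt);
            return 0;
    }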
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
new file mode 100644
index 000000000000..7febc2cac73d
--- /dev/null
+++ b/drivers/md/dm-zero.c
@@ -0,0 +1,81 @@
1/*
2 * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8
9#include <linux/module.h>
10#include <linux/init.h>
11#include <linux/bio.h>
12
13/*
14 * Construct a dummy mapping that only returns zeros
15 */
16static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
17{
18 if (argc != 0) {
19 ti->error = "dm-zero: No arguments required";
20 return -EINVAL;
21 }
22
23 return 0;
24}
25
26/*
27 * Return zeros only on reads
28 */
29static int zero_map(struct dm_target *ti, struct bio *bio,
30 union map_info *map_context)
31{
32 switch(bio_rw(bio)) {
33 case READ:
34 zero_fill_bio(bio);
35 break;
36 case READA:
37 /* readahead of null bytes only wastes buffer cache */
38 return -EIO;
39 case WRITE:
40 /* writes get silently dropped */
41 break;
42 }
43
44 bio_endio(bio, bio->bi_size, 0);
45
46 /* accepted bio, don't make new request */
47 return 0;
48}
49
50static struct target_type zero_target = {
51 .name = "zero",
52 .version = {1, 0, 0},
53 .module = THIS_MODULE,
54 .ctr = zero_ctr,
55 .map = zero_map,
56};
57
58int __init dm_zero_init(void)
59{
60 int r = dm_register_target(&zero_target);
61
62 if (r < 0)
63 DMERR("zero: register failed %d", r);
64
65 return r;
66}
67
68void __exit dm_zero_exit(void)
69{
70 int r = dm_unregister_target(&zero_target);
71
72 if (r < 0)
73 DMERR("zero: unregister failed %d", r);
74}
75
76module_init(dm_zero_init)
77module_exit(dm_zero_exit)
78
79MODULE_AUTHOR("Christophe Saout <christophe@saout.de>");
80MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros");
81MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
new file mode 100644
index 000000000000..243ff6884e83
--- /dev/null
+++ b/drivers/md/dm.c
@@ -0,0 +1,1194 @@
1/*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include "dm.h"
9#include "dm-bio-list.h"
10
11#include <linux/init.h>
12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/blkpg.h>
15#include <linux/bio.h>
16#include <linux/buffer_head.h>
17#include <linux/mempool.h>
18#include <linux/slab.h>
19#include <linux/idr.h>
20
21static const char *_name = DM_NAME;
22
23static unsigned int major = 0;
24static unsigned int _major = 0;
25
26/*
27 * One of these is allocated per bio.
28 */
29struct dm_io {
30 struct mapped_device *md;
31 int error;
32 struct bio *bio;
33 atomic_t io_count;
34};
35
36/*
37 * One of these is allocated per target within a bio. Hopefully
38 * this will be simplified out one day.
39 */
40struct target_io {
41 struct dm_io *io;
42 struct dm_target *ti;
43 union map_info info;
44};
45
46union map_info *dm_get_mapinfo(struct bio *bio)
47{
48 if (bio && bio->bi_private)
49 return &((struct target_io *)bio->bi_private)->info;
50 return NULL;
51}
52
53/*
54 * Bits for the md->flags field.
55 */
56#define DMF_BLOCK_IO 0
57#define DMF_SUSPENDED 1
58#define DMF_FS_LOCKED 2
59
60struct mapped_device {
61 struct rw_semaphore lock;
62 rwlock_t map_lock;
63 atomic_t holders;
64
65 unsigned long flags;
66
67 request_queue_t *queue;
68 struct gendisk *disk;
69
70 void *interface_ptr;
71
72 /*
73 * A list of ios that arrived while we were suspended.
74 */
75 atomic_t pending;
76 wait_queue_head_t wait;
77 struct bio_list deferred;
78
79 /*
80 * The current mapping.
81 */
82 struct dm_table *map;
83
84 /*
85 * io objects are allocated from here.
86 */
87 mempool_t *io_pool;
88 mempool_t *tio_pool;
89
90 /*
91 * Event handling.
92 */
93 atomic_t event_nr;
94 wait_queue_head_t eventq;
95
96 /*
97 * freeze/thaw support requires holding onto a super block
98 */
99 struct super_block *frozen_sb;
100};
101
102#define MIN_IOS 256
103static kmem_cache_t *_io_cache;
104static kmem_cache_t *_tio_cache;
105
106static struct bio_set *dm_set;
107
108static int __init local_init(void)
109{
110 int r;
111
112 dm_set = bioset_create(16, 16, 4);
113 if (!dm_set)
114 return -ENOMEM;
115
116 /* allocate a slab for the dm_ios */
117 _io_cache = kmem_cache_create("dm_io",
118 sizeof(struct dm_io), 0, 0, NULL, NULL);
119 if (!_io_cache)
120 return -ENOMEM;
121
122 /* allocate a slab for the target ios */
123 _tio_cache = kmem_cache_create("dm_tio", sizeof(struct target_io),
124 0, 0, NULL, NULL);
125 if (!_tio_cache) {
126 kmem_cache_destroy(_io_cache);
127 return -ENOMEM;
128 }
129
130 _major = major;
131 r = register_blkdev(_major, _name);
132 if (r < 0) {
133 kmem_cache_destroy(_tio_cache);
134 kmem_cache_destroy(_io_cache);
135 return r;
136 }
137
138 if (!_major)
139 _major = r;
140
141 return 0;
142}
143
144static void local_exit(void)
145{
146 kmem_cache_destroy(_tio_cache);
147 kmem_cache_destroy(_io_cache);
148
149 bioset_free(dm_set);
150
151 if (unregister_blkdev(_major, _name) < 0)
152 DMERR("unregister_blkdev failed");
153
154 _major = 0;
155
156 DMINFO("cleaned up");
157}
158
159int (*_inits[])(void) __initdata = {
160 local_init,
161 dm_target_init,
162 dm_linear_init,
163 dm_stripe_init,
164 dm_interface_init,
165};
166
167void (*_exits[])(void) = {
168 local_exit,
169 dm_target_exit,
170 dm_linear_exit,
171 dm_stripe_exit,
172 dm_interface_exit,
173};
174
175static int __init dm_init(void)
176{
177 const int count = ARRAY_SIZE(_inits);
178
179 int r, i;
180
181 for (i = 0; i < count; i++) {
182 r = _inits[i]();
183 if (r)
184 goto bad;
185 }
186
187 return 0;
188
189 bad:
190 while (i--)
191 _exits[i]();
192
193 return r;
194}
195
196static void __exit dm_exit(void)
197{
198 int i = ARRAY_SIZE(_exits);
199
200 while (i--)
201 _exits[i]();
202}
203
204/*
205 * Block device functions
206 */
207static int dm_blk_open(struct inode *inode, struct file *file)
208{
209 struct mapped_device *md;
210
211 md = inode->i_bdev->bd_disk->private_data;
212 dm_get(md);
213 return 0;
214}
215
216static int dm_blk_close(struct inode *inode, struct file *file)
217{
218 struct mapped_device *md;
219
220 md = inode->i_bdev->bd_disk->private_data;
221 dm_put(md);
222 return 0;
223}
224
225static inline struct dm_io *alloc_io(struct mapped_device *md)
226{
227 return mempool_alloc(md->io_pool, GFP_NOIO);
228}
229
230static inline void free_io(struct mapped_device *md, struct dm_io *io)
231{
232 mempool_free(io, md->io_pool);
233}
234
235static inline struct target_io *alloc_tio(struct mapped_device *md)
236{
237 return mempool_alloc(md->tio_pool, GFP_NOIO);
238}
239
240static inline void free_tio(struct mapped_device *md, struct target_io *tio)
241{
242 mempool_free(tio, md->tio_pool);
243}
244
245/*
246 * Add the bio to the list of deferred io.
247 */
248static int queue_io(struct mapped_device *md, struct bio *bio)
249{
250 down_write(&md->lock);
251
252 if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
253 up_write(&md->lock);
254 return 1;
255 }
256
257 bio_list_add(&md->deferred, bio);
258
259 up_write(&md->lock);
260 return 0; /* deferred successfully */
261}
262
263/*
264 * Everyone (including functions in this file) should use this
265 * function to access the md->map field, and make sure they call
266 * dm_table_put() when finished.
267 */
268struct dm_table *dm_get_table(struct mapped_device *md)
269{
270 struct dm_table *t;
271
272 read_lock(&md->map_lock);
273 t = md->map;
274 if (t)
275 dm_table_get(t);
276 read_unlock(&md->map_lock);
277
278 return t;
279}
280
281/*-----------------------------------------------------------------
282 * CRUD START:
283 * A more elegant solution that uses the queue merge fn is in
284 * the works; unfortunately there are a couple of changes to
285 * the block layer that I want to make for it first. So, in the
286 * interests of getting something for people to use, I give
287 * you this clearly demarcated crap.
288 *---------------------------------------------------------------*/
289
290/*
291 * Decrements the number of outstanding ios that a bio has been
292 * cloned into, completing the original io if necessary.
293 */
294static inline void dec_pending(struct dm_io *io, int error)
295{
296 if (error)
297 io->error = error;
298
299 if (atomic_dec_and_test(&io->io_count)) {
300 if (atomic_dec_and_test(&io->md->pending))
301 /* nudge anyone waiting on suspend queue */
302 wake_up(&io->md->wait);
303
304 bio_endio(io->bio, io->bio->bi_size, io->error);
305 free_io(io->md, io);
306 }
307}
308
309static int clone_endio(struct bio *bio, unsigned int done, int error)
310{
311 int r = 0;
312 struct target_io *tio = bio->bi_private;
313 struct dm_io *io = tio->io;
314 dm_endio_fn endio = tio->ti->type->end_io;
315
316 if (bio->bi_size)
317 return 1;
318
319 if (!bio_flagged(bio, BIO_UPTODATE) && !error)
320 error = -EIO;
321
322 if (endio) {
323 r = endio(tio->ti, bio, error, &tio->info);
324 if (r < 0)
325 error = r;
326
327 else if (r > 0)
328 /* the target wants another shot at the io */
329 return 1;
330 }
331
332 free_tio(io->md, tio);
333 dec_pending(io, error);
334 bio_put(bio);
335 return r;
336}
337
338static sector_t max_io_len(struct mapped_device *md,
339 sector_t sector, struct dm_target *ti)
340{
341 sector_t offset = sector - ti->begin;
342 sector_t len = ti->len - offset;
343
344 /*
345 * Does the target need to split even further ?
346 */
347 if (ti->split_io) {
348 sector_t boundary;
349 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
350 - offset;
351 if (len > boundary)
352 len = boundary;
353 }
354
355 return len;
356}
357
358static void __map_bio(struct dm_target *ti, struct bio *clone,
359 struct target_io *tio)
360{
361 int r;
362
363 /*
364 * Sanity checks.
365 */
366 BUG_ON(!clone->bi_size);
367
368 clone->bi_end_io = clone_endio;
369 clone->bi_private = tio;
370
371 /*
372 * Map the clone. If r == 0 we don't need to do
373 * anything, the target has assumed ownership of
374 * this io.
375 */
376 atomic_inc(&tio->io->io_count);
377 r = ti->type->map(ti, clone, &tio->info);
378 if (r > 0)
379 /* the bio has been remapped so dispatch it */
380 generic_make_request(clone);
381
382 else if (r < 0) {
383 /* error the io and bail out */
384 struct dm_io *io = tio->io;
385 free_tio(tio->io->md, tio);
386 dec_pending(io, -EIO);
387 bio_put(clone);
388 }
389}
390
391struct clone_info {
392 struct mapped_device *md;
393 struct dm_table *map;
394 struct bio *bio;
395 struct dm_io *io;
396 sector_t sector;
397 sector_t sector_count;
398 unsigned short idx;
399};
400
401/*
402 * Creates a little bio that just does part of a bvec.
403 */
404static struct bio *split_bvec(struct bio *bio, sector_t sector,
405 unsigned short idx, unsigned int offset,
406 unsigned int len)
407{
408 struct bio *clone;
409 struct bio_vec *bv = bio->bi_io_vec + idx;
410
411 clone = bio_alloc_bioset(GFP_NOIO, 1, dm_set);
412 *clone->bi_io_vec = *bv;
413
414 clone->bi_sector = sector;
415 clone->bi_bdev = bio->bi_bdev;
416 clone->bi_rw = bio->bi_rw;
417 clone->bi_vcnt = 1;
418 clone->bi_size = to_bytes(len);
419 clone->bi_io_vec->bv_offset = offset;
420 clone->bi_io_vec->bv_len = clone->bi_size;
421
422 return clone;
423}
424
425/*
426 * Creates a bio that consists of a range of complete bvecs.
427 */
428static struct bio *clone_bio(struct bio *bio, sector_t sector,
429 unsigned short idx, unsigned short bv_count,
430 unsigned int len)
431{
432 struct bio *clone;
433
434 clone = bio_clone(bio, GFP_NOIO);
435 clone->bi_sector = sector;
436 clone->bi_idx = idx;
437 clone->bi_vcnt = idx + bv_count;
438 clone->bi_size = to_bytes(len);
439 clone->bi_flags &= ~(1 << BIO_SEG_VALID);
440
441 return clone;
442}
443
444static void __clone_and_map(struct clone_info *ci)
445{
446 struct bio *clone, *bio = ci->bio;
447 struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
448 sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
449 struct target_io *tio;
450
451 /*
452 * Allocate a target io object.
453 */
454 tio = alloc_tio(ci->md);
455 tio->io = ci->io;
456 tio->ti = ti;
457 memset(&tio->info, 0, sizeof(tio->info));
458
459 if (ci->sector_count <= max) {
460 /*
461 * Optimise for the simple case where we can do all of
462 * the remaining io with a single clone.
463 */
464 clone = clone_bio(bio, ci->sector, ci->idx,
465 bio->bi_vcnt - ci->idx, ci->sector_count);
466 __map_bio(ti, clone, tio);
467 ci->sector_count = 0;
468
469 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
470 /*
471 * There are some bvecs that don't span targets.
472 * Do as many of these as possible.
473 */
474 int i;
475 sector_t remaining = max;
476 sector_t bv_len;
477
478 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
479 bv_len = to_sector(bio->bi_io_vec[i].bv_len);
480
481 if (bv_len > remaining)
482 break;
483
484 remaining -= bv_len;
485 len += bv_len;
486 }
487
488 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
489 __map_bio(ti, clone, tio);
490
491 ci->sector += len;
492 ci->sector_count -= len;
493 ci->idx = i;
494
495 } else {
496 /*
497 * Create two copy bios to deal with io that has
498 * been split across a target.
499 */
500 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
501
502 clone = split_bvec(bio, ci->sector, ci->idx,
503 bv->bv_offset, max);
504 __map_bio(ti, clone, tio);
505
506 ci->sector += max;
507 ci->sector_count -= max;
508 ti = dm_table_find_target(ci->map, ci->sector);
509
510 len = to_sector(bv->bv_len) - max;
511 clone = split_bvec(bio, ci->sector, ci->idx,
512 bv->bv_offset + to_bytes(max), len);
513 tio = alloc_tio(ci->md);
514 tio->io = ci->io;
515 tio->ti = ti;
516 memset(&tio->info, 0, sizeof(tio->info));
517 __map_bio(ti, clone, tio);
518
519 ci->sector += len;
520 ci->sector_count -= len;
521 ci->idx++;
522 }
523}
524
525/*
526 * Split the bio into several clones.
527 */
528static void __split_bio(struct mapped_device *md, struct bio *bio)
529{
530 struct clone_info ci;
531
532 ci.map = dm_get_table(md);
533 if (!ci.map) {
534 bio_io_error(bio, bio->bi_size);
535 return;
536 }
537
538 ci.md = md;
539 ci.bio = bio;
540 ci.io = alloc_io(md);
541 ci.io->error = 0;
542 atomic_set(&ci.io->io_count, 1);
543 ci.io->bio = bio;
544 ci.io->md = md;
545 ci.sector = bio->bi_sector;
546 ci.sector_count = bio_sectors(bio);
547 ci.idx = bio->bi_idx;
548
549 atomic_inc(&md->pending);
550 while (ci.sector_count)
551 __clone_and_map(&ci);
552
553 /* drop the extra reference count */
554 dec_pending(ci.io, 0);
555 dm_table_put(ci.map);
556}
557/*-----------------------------------------------------------------
558 * CRUD END
559 *---------------------------------------------------------------*/
560
561/*
562 * The request function that just remaps the bio built up by
563 * dm_merge_bvec.
564 */
565static int dm_request(request_queue_t *q, struct bio *bio)
566{
567 int r;
568 struct mapped_device *md = q->queuedata;
569
570 down_read(&md->lock);
571
572 /*
573 * If we're suspended we have to queue
574 * this io for later.
575 */
576 while (test_bit(DMF_BLOCK_IO, &md->flags)) {
577 up_read(&md->lock);
578
579 if (bio_rw(bio) == READA) {
580 bio_io_error(bio, bio->bi_size);
581 return 0;
582 }
583
584 r = queue_io(md, bio);
585 if (r < 0) {
586 bio_io_error(bio, bio->bi_size);
587 return 0;
588
589 } else if (r == 0)
590 return 0; /* deferred successfully */
591
592 /*
593 * We're in a while loop, because someone could suspend
594 * before we get to the following read lock.
595 */
596 down_read(&md->lock);
597 }
598
599 __split_bio(md, bio);
600 up_read(&md->lock);
601 return 0;
602}
603
604static int dm_flush_all(request_queue_t *q, struct gendisk *disk,
605 sector_t *error_sector)
606{
607 struct mapped_device *md = q->queuedata;
608 struct dm_table *map = dm_get_table(md);
609 int ret = -ENXIO;
610
611 if (map) {
612 ret = dm_table_flush_all(map);
613 dm_table_put(map);
614 }
615
616 return ret;
617}
618
619static void dm_unplug_all(request_queue_t *q)
620{
621 struct mapped_device *md = q->queuedata;
622 struct dm_table *map = dm_get_table(md);
623
624 if (map) {
625 dm_table_unplug_all(map);
626 dm_table_put(map);
627 }
628}
629
630static int dm_any_congested(void *congested_data, int bdi_bits)
631{
632 int r;
633 struct mapped_device *md = (struct mapped_device *) congested_data;
634 struct dm_table *map = dm_get_table(md);
635
636 if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
637 r = bdi_bits;
638 else
639 r = dm_table_any_congested(map, bdi_bits);
640
641 dm_table_put(map);
642 return r;
643}
644
645/*-----------------------------------------------------------------
646 * An IDR is used to keep track of allocated minor numbers.
647 *---------------------------------------------------------------*/
648static DECLARE_MUTEX(_minor_lock);
649static DEFINE_IDR(_minor_idr);
650
651static void free_minor(unsigned int minor)
652{
653 down(&_minor_lock);
654 idr_remove(&_minor_idr, minor);
655 up(&_minor_lock);
656}
657
658/*
659 * See if the device with a specific minor # is free.
660 */
661static int specific_minor(struct mapped_device *md, unsigned int minor)
662{
663 int r, m;
664
665 if (minor >= (1 << MINORBITS))
666 return -EINVAL;
667
668 down(&_minor_lock);
669
670 if (idr_find(&_minor_idr, minor)) {
671 r = -EBUSY;
672 goto out;
673 }
674
675 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
676 if (!r) {
677 r = -ENOMEM;
678 goto out;
679 }
680
681 r = idr_get_new_above(&_minor_idr, md, minor, &m);
682 if (r) {
683 goto out;
684 }
685
686 if (m != minor) {
687 idr_remove(&_minor_idr, m);
688 r = -EBUSY;
689 goto out;
690 }
691
692out:
693 up(&_minor_lock);
694 return r;
695}
696
697static int next_free_minor(struct mapped_device *md, unsigned int *minor)
698{
699 int r;
700 unsigned int m;
701
702 down(&_minor_lock);
703
704 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
705 if (!r) {
706 r = -ENOMEM;
707 goto out;
708 }
709
710 r = idr_get_new(&_minor_idr, md, &m);
711 if (r) {
712 goto out;
713 }
714
715 if (m >= (1 << MINORBITS)) {
716 idr_remove(&_minor_idr, m);
717 r = -ENOSPC;
718 goto out;
719 }
720
721 *minor = m;
722
723out:
724 up(&_minor_lock);
725 return r;
726}
727
728static struct block_device_operations dm_blk_dops;
729
730/*
731 * Allocate and initialise a blank device with a given minor.
732 */
733static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
734{
735 int r;
736 struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
737
738 if (!md) {
739 DMWARN("unable to allocate device, out of memory.");
740 return NULL;
741 }
742
743 /* get a minor number for the dev */
744 r = persistent ? specific_minor(md, minor) : next_free_minor(md, &minor);
745 if (r < 0)
746 goto bad1;
747
748 memset(md, 0, sizeof(*md));
749 init_rwsem(&md->lock);
750 rwlock_init(&md->map_lock);
751 atomic_set(&md->holders, 1);
752 atomic_set(&md->event_nr, 0);
753
754 md->queue = blk_alloc_queue(GFP_KERNEL);
755 if (!md->queue)
756 goto bad1;
757
758 md->queue->queuedata = md;
759 md->queue->backing_dev_info.congested_fn = dm_any_congested;
760 md->queue->backing_dev_info.congested_data = md;
761 blk_queue_make_request(md->queue, dm_request);
762 md->queue->unplug_fn = dm_unplug_all;
763 md->queue->issue_flush_fn = dm_flush_all;
764
765 md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
766 mempool_free_slab, _io_cache);
767 if (!md->io_pool)
768 goto bad2;
769
770 md->tio_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
771 mempool_free_slab, _tio_cache);
772 if (!md->tio_pool)
773 goto bad3;
774
775 md->disk = alloc_disk(1);
776 if (!md->disk)
777 goto bad4;
778
779 md->disk->major = _major;
780 md->disk->first_minor = minor;
781 md->disk->fops = &dm_blk_dops;
782 md->disk->queue = md->queue;
783 md->disk->private_data = md;
784 sprintf(md->disk->disk_name, "dm-%d", minor);
785 add_disk(md->disk);
786
787 atomic_set(&md->pending, 0);
788 init_waitqueue_head(&md->wait);
789 init_waitqueue_head(&md->eventq);
790
791 return md;
792
793 bad4:
794 mempool_destroy(md->tio_pool);
795 bad3:
796 mempool_destroy(md->io_pool);
797 bad2:
798 blk_put_queue(md->queue);
799 free_minor(minor);
800 bad1:
801 kfree(md);
802 return NULL;
803}
804
805static void free_dev(struct mapped_device *md)
806{
807 free_minor(md->disk->first_minor);
808 mempool_destroy(md->tio_pool);
809 mempool_destroy(md->io_pool);
810 del_gendisk(md->disk);
811 put_disk(md->disk);
812 blk_put_queue(md->queue);
813 kfree(md);
814}
815
816/*
817 * Bind a table to the device.
818 */
819static void event_callback(void *context)
820{
821 struct mapped_device *md = (struct mapped_device *) context;
822
823 atomic_inc(&md->event_nr);
824 wake_up(&md->eventq);
825}
826
827static void __set_size(struct gendisk *disk, sector_t size)
828{
829 struct block_device *bdev;
830
831 set_capacity(disk, size);
832 bdev = bdget_disk(disk, 0);
833 if (bdev) {
834 down(&bdev->bd_inode->i_sem);
835 i_size_write(bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
836 up(&bdev->bd_inode->i_sem);
837 bdput(bdev);
838 }
839}
840
841static int __bind(struct mapped_device *md, struct dm_table *t)
842{
843 request_queue_t *q = md->queue;
844 sector_t size;
845
846 size = dm_table_get_size(t);
847 __set_size(md->disk, size);
848 if (size == 0)
849 return 0;
850
851 write_lock(&md->map_lock);
852 md->map = t;
853 write_unlock(&md->map_lock);
854
855 dm_table_get(t);
856 dm_table_event_callback(md->map, event_callback, md);
857 dm_table_set_restrictions(t, q);
858 return 0;
859}
860
861static void __unbind(struct mapped_device *md)
862{
863 struct dm_table *map = md->map;
864
865 if (!map)
866 return;
867
868 dm_table_event_callback(map, NULL, NULL);
869 write_lock(&md->map_lock);
870 md->map = NULL;
871 write_unlock(&md->map_lock);
872 dm_table_put(map);
873}
874
875/*
876 * Constructor for a new device.
877 */
878static int create_aux(unsigned int minor, int persistent,
879 struct mapped_device **result)
880{
881 struct mapped_device *md;
882
883 md = alloc_dev(minor, persistent);
884 if (!md)
885 return -ENXIO;
886
887 *result = md;
888 return 0;
889}
890
891int dm_create(struct mapped_device **result)
892{
893 return create_aux(0, 0, result);
894}
895
896int dm_create_with_minor(unsigned int minor, struct mapped_device **result)
897{
898 return create_aux(minor, 1, result);
899}
900
901void *dm_get_mdptr(dev_t dev)
902{
903 struct mapped_device *md;
904 void *mdptr = NULL;
905 unsigned minor = MINOR(dev);
906
907 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
908 return NULL;
909
910 down(&_minor_lock);
911
912 md = idr_find(&_minor_idr, minor);
913
914 if (md && (dm_disk(md)->first_minor == minor))
915 mdptr = md->interface_ptr;
916
917 up(&_minor_lock);
918
919 return mdptr;
920}
921
922void dm_set_mdptr(struct mapped_device *md, void *ptr)
923{
924 md->interface_ptr = ptr;
925}
926
927void dm_get(struct mapped_device *md)
928{
929 atomic_inc(&md->holders);
930}
931
932void dm_put(struct mapped_device *md)
933{
934 struct dm_table *map = dm_get_table(md);
935
936 if (atomic_dec_and_test(&md->holders)) {
937 if (!test_bit(DMF_SUSPENDED, &md->flags) && map) {
938 dm_table_presuspend_targets(map);
939 dm_table_postsuspend_targets(map);
940 }
941 __unbind(md);
942 free_dev(md);
943 }
944
945 dm_table_put(map);
946}
947
948/*
949 * Process the deferred bios
950 */
951static void __flush_deferred_io(struct mapped_device *md, struct bio *c)
952{
953 struct bio *n;
954
955 while (c) {
956 n = c->bi_next;
957 c->bi_next = NULL;
958 __split_bio(md, c);
959 c = n;
960 }
961}
962
963/*
964 * Swap in a new table (destroying old one).
965 */
966int dm_swap_table(struct mapped_device *md, struct dm_table *table)
967{
968 int r;
969
970 down_write(&md->lock);
971
972 /* device must be suspended */
973 if (!test_bit(DMF_SUSPENDED, &md->flags)) {
974 up_write(&md->lock);
975 return -EPERM;
976 }
977
978 __unbind(md);
979 r = __bind(md, table);
980 if (r)
981 return r;
982
983 up_write(&md->lock);
984 return 0;
985}
986
987/*
988 * Functions to lock and unlock any filesystem running on the
989 * device.
990 */
991static int __lock_fs(struct mapped_device *md)
992{
993 struct block_device *bdev;
994
995 if (test_and_set_bit(DMF_FS_LOCKED, &md->flags))
996 return 0;
997
998 bdev = bdget_disk(md->disk, 0);
999 if (!bdev) {
1000 DMWARN("bdget failed in __lock_fs");
1001 return -ENOMEM;
1002 }
1003
1004 WARN_ON(md->frozen_sb);
1005 md->frozen_sb = freeze_bdev(bdev);
1006 /* don't bdput right now, we don't want the bdev
1007 * to go away while it is locked. We'll bdput
1008 * in __unlock_fs
1009 */
1010 return 0;
1011}
1012
1013static int __unlock_fs(struct mapped_device *md)
1014{
1015 struct block_device *bdev;
1016
1017 if (!test_and_clear_bit(DMF_FS_LOCKED, &md->flags))
1018 return 0;
1019
1020 bdev = bdget_disk(md->disk, 0);
1021 if (!bdev) {
1022 DMWARN("bdget failed in __unlock_fs");
1023 return -ENOMEM;
1024 }
1025
1026 thaw_bdev(bdev, md->frozen_sb);
1027 md->frozen_sb = NULL;
1028 bdput(bdev); /* drop the reference taken by bdget_disk() above */
1029 bdput(bdev); /* drop the reference left held by __lock_fs() */
1030 return 0;
1031}
1032
1033/*
1034 * We need to be able to change a mapping table under a mounted
1035 * filesystem. For example we might want to move some data in
1036 * the background. Before the table can be swapped with
1037 * dm_bind_table, dm_suspend must be called to flush any in
1038 * flight bios and ensure that any further io gets deferred.
1039 */
1040int dm_suspend(struct mapped_device *md)
1041{
1042 struct dm_table *map;
1043 DECLARE_WAITQUEUE(wait, current);
1044
1045 /* Flush I/O to the device. */
1046 down_read(&md->lock);
1047 if (test_bit(DMF_BLOCK_IO, &md->flags)) {
1048 up_read(&md->lock);
1049 return -EINVAL;
1050 }
1051
1052 map = dm_get_table(md);
1053 if (map)
1054 dm_table_presuspend_targets(map);
1055 __lock_fs(md);
1056
1057 up_read(&md->lock);
1058
1059 /*
1060 * First we set the BLOCK_IO flag so no more ios will be
1061 * mapped.
1062 */
1063 down_write(&md->lock);
1064 if (test_bit(DMF_BLOCK_IO, &md->flags)) {
1065 /*
1066 * If we get here we know another thread is
1067 * trying to suspend as well, so we leave the fs
1068 * locked for this thread.
1069 */
1070 up_write(&md->lock);
1071 return -EINVAL;
1072 }
1073
1074 set_bit(DMF_BLOCK_IO, &md->flags);
1075 add_wait_queue(&md->wait, &wait);
1076 up_write(&md->lock);
1077
1078 /* unplug */
1079 if (map) {
1080 dm_table_unplug_all(map);
1081 dm_table_put(map);
1082 }
1083
1084 /*
1085 * Then we wait for the already mapped ios to
1086 * complete.
1087 */
1088 while (1) {
1089 set_current_state(TASK_INTERRUPTIBLE);
1090
1091 if (!atomic_read(&md->pending) || signal_pending(current))
1092 break;
1093
1094 io_schedule();
1095 }
1096 set_current_state(TASK_RUNNING);
1097
1098 down_write(&md->lock);
1099 remove_wait_queue(&md->wait, &wait);
1100
1101 /* were we interrupted ? */
1102 if (atomic_read(&md->pending)) {
1103 __unlock_fs(md);
1104 clear_bit(DMF_BLOCK_IO, &md->flags);
1105 up_write(&md->lock);
1106 return -EINTR;
1107 }
1108
1109 set_bit(DMF_SUSPENDED, &md->flags);
1110
1111 map = dm_get_table(md);
1112 if (map)
1113 dm_table_postsuspend_targets(map);
1114 dm_table_put(map);
1115 up_write(&md->lock);
1116
1117 return 0;
1118}
1119
1120int dm_resume(struct mapped_device *md)
1121{
1122 struct bio *def;
1123 struct dm_table *map = dm_get_table(md);
1124
1125 down_write(&md->lock);
1126 if (!map ||
1127 !test_bit(DMF_SUSPENDED, &md->flags) ||
1128 !dm_table_get_size(map)) {
1129 up_write(&md->lock);
1130 dm_table_put(map);
1131 return -EINVAL;
1132 }
1133
1134 dm_table_resume_targets(map);
1135 clear_bit(DMF_SUSPENDED, &md->flags);
1136 clear_bit(DMF_BLOCK_IO, &md->flags);
1137
1138 def = bio_list_get(&md->deferred);
1139 __flush_deferred_io(md, def);
1140 up_write(&md->lock);
1141 __unlock_fs(md);
1142 dm_table_unplug_all(map);
1143 dm_table_put(map);
1144
1145 return 0;
1146}
1147
1148/*-----------------------------------------------------------------
1149 * Event notification.
1150 *---------------------------------------------------------------*/
1151uint32_t dm_get_event_nr(struct mapped_device *md)
1152{
1153 return atomic_read(&md->event_nr);
1154}
1155
1156int dm_wait_event(struct mapped_device *md, int event_nr)
1157{
1158 return wait_event_interruptible(md->eventq,
1159 (event_nr != atomic_read(&md->event_nr)));
1160}
1161
1162/*
1163 * The gendisk is only valid as long as you have a reference
1164 * count on 'md'.
1165 */
1166struct gendisk *dm_disk(struct mapped_device *md)
1167{
1168 return md->disk;
1169}
1170
1171int dm_suspended(struct mapped_device *md)
1172{
1173 return test_bit(DMF_SUSPENDED, &md->flags);
1174}
1175
1176static struct block_device_operations dm_blk_dops = {
1177 .open = dm_blk_open,
1178 .release = dm_blk_close,
1179 .owner = THIS_MODULE
1180};
1181
1182EXPORT_SYMBOL(dm_get_mapinfo);
1183
1184/*
1185 * module hooks
1186 */
1187module_init(dm_init);
1188module_exit(dm_exit);
1189
1190module_param(major, uint, 0);
1191MODULE_PARM_DESC(major, "The major number of the device mapper");
1192MODULE_DESCRIPTION(DM_NAME " driver");
1193MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
1194MODULE_LICENSE("GPL");
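dm_suspend() and dm_resume() above implement the sequence described in the comment before dm_suspend(): quiesce in-flight io, swap the table while suspended, then resume and replay whatever was deferred. A compressed sketch of that calling sequence, using the internal API declared in dm.h below (the wrapper itself is hypothetical and error handling is trimmed):

    /* Illustrative sketch only: suspend -> swap table -> resume. */
    static int reload_table_sketch(struct mapped_device *md,
                                   struct dm_table *new_table)
    {
            int r;

            r = dm_suspend(md);             /* defer new io, wait for pending io */
            if (r)
                    return r;

            r = dm_swap_table(md, new_table);   /* only legal while suspended */
            if (r)
                    return r;

            return dm_resume(md);           /* replay the deferred bios */
    }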
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
new file mode 100644
index 000000000000..e38c3fc1a1db
--- /dev/null
+++ b/drivers/md/dm.h
@@ -0,0 +1,195 @@
1/*
2 * Internal header file for device mapper
3 *
4 * Copyright (C) 2001, 2002 Sistina Software
5 * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
6 *
7 * This file is released under the LGPL.
8 */
9
10#ifndef DM_INTERNAL_H
11#define DM_INTERNAL_H
12
13#include <linux/fs.h>
14#include <linux/device-mapper.h>
15#include <linux/list.h>
16#include <linux/blkdev.h>
17
18#define DM_NAME "device-mapper"
19#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
20#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
21#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
22
23#define DMEMIT(x...) sz += ((sz >= maxlen) ? \
24 0 : scnprintf(result + sz, maxlen - sz, x))
25
26/*
27 * FIXME: I think this should be with the definition of sector_t
28 * in types.h.
29 */
30#ifdef CONFIG_LBD
31#define SECTOR_FORMAT "%Lu"
32#else
33#define SECTOR_FORMAT "%lu"
34#endif
35
36#define SECTOR_SHIFT 9
37
38/*
39 * List of devices that a metadevice uses and should open/close.
40 */
41struct dm_dev {
42 struct list_head list;
43
44 atomic_t count;
45 int mode;
46 struct block_device *bdev;
47 char name[16];
48};
49
50struct dm_table;
51struct mapped_device;
52
53/*-----------------------------------------------------------------
54 * Functions for manipulating a struct mapped_device.
55 * Drop the reference with dm_put when you finish with the object.
56 *---------------------------------------------------------------*/
57int dm_create(struct mapped_device **md);
58int dm_create_with_minor(unsigned int minor, struct mapped_device **md);
59void dm_set_mdptr(struct mapped_device *md, void *ptr);
60void *dm_get_mdptr(dev_t dev);
61
62/*
63 * Reference counting for md.
64 */
65void dm_get(struct mapped_device *md);
66void dm_put(struct mapped_device *md);
67
68/*
69 * A device can still be used while suspended, but I/O is deferred.
70 */
71int dm_suspend(struct mapped_device *md);
72int dm_resume(struct mapped_device *md);
73
74/*
75 * The device must be suspended before calling this method.
76 */
77int dm_swap_table(struct mapped_device *md, struct dm_table *t);
78
79/*
80 * Drop a reference on the table when you've finished with the
81 * result.
82 */
83struct dm_table *dm_get_table(struct mapped_device *md);
84
85/*
86 * Event functions.
87 */
88uint32_t dm_get_event_nr(struct mapped_device *md);
89int dm_wait_event(struct mapped_device *md, int event_nr);
90
91/*
92 * Info functions.
93 */
94struct gendisk *dm_disk(struct mapped_device *md);
95int dm_suspended(struct mapped_device *md);
96
97/*-----------------------------------------------------------------
98 * Functions for manipulating a table. Tables are also reference
99 * counted.
100 *---------------------------------------------------------------*/
101int dm_table_create(struct dm_table **result, int mode, unsigned num_targets);
102
103void dm_table_get(struct dm_table *t);
104void dm_table_put(struct dm_table *t);
105
106int dm_table_add_target(struct dm_table *t, const char *type,
107 sector_t start, sector_t len, char *params);
108int dm_table_complete(struct dm_table *t);
109void dm_table_event_callback(struct dm_table *t,
110 void (*fn)(void *), void *context);
111void dm_table_event(struct dm_table *t);
112sector_t dm_table_get_size(struct dm_table *t);
113struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
114struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
115void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
116unsigned int dm_table_get_num_targets(struct dm_table *t);
117struct list_head *dm_table_get_devices(struct dm_table *t);
118int dm_table_get_mode(struct dm_table *t);
119void dm_table_presuspend_targets(struct dm_table *t);
120void dm_table_postsuspend_targets(struct dm_table *t);
121void dm_table_resume_targets(struct dm_table *t);
122int dm_table_any_congested(struct dm_table *t, int bdi_bits);
123void dm_table_unplug_all(struct dm_table *t);
124int dm_table_flush_all(struct dm_table *t);
125
126/*-----------------------------------------------------------------
127 * A registry of target types.
128 *---------------------------------------------------------------*/
129int dm_target_init(void);
130void dm_target_exit(void);
131struct target_type *dm_get_target_type(const char *name);
132void dm_put_target_type(struct target_type *t);
133int dm_target_iterate(void (*iter_func)(struct target_type *tt,
134 void *param), void *param);
135
136
137/*-----------------------------------------------------------------
138 * Useful inlines.
139 *---------------------------------------------------------------*/
140static inline int array_too_big(unsigned long fixed, unsigned long obj,
141 unsigned long num)
142{
143 return (num > (ULONG_MAX - fixed) / obj);
144}
145
146/*
147 * Ceiling(n / sz)
148 */
149#define dm_div_up(n, sz) (((n) + (sz) - 1) / (sz))
150
151#define dm_sector_div_up(n, sz) ( \
152{ \
153 sector_t _r = ((n) + (sz) - 1); \
154 sector_div(_r, (sz)); \
155 _r; \
156} \
157)
158
159/*
160 * ceiling(n / size) * size
161 */
162#define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz))
163
164static inline sector_t to_sector(unsigned long n)
165{
166 return (n >> 9);
167}
168
169static inline unsigned long to_bytes(sector_t n)
170{
171 return (n << 9);
172}
173
174int dm_split_args(int *argc, char ***argvp, char *input);
175
176/*
177 * The device-mapper can be driven through one of two interfaces;
178 * ioctl or filesystem, depending which patch you have applied.
179 */
180int dm_interface_init(void);
181void dm_interface_exit(void);
182
183/*
184 * Targets for linear and striped mappings
185 */
186int dm_linear_init(void);
187void dm_linear_exit(void);
188
189int dm_stripe_init(void);
190void dm_stripe_exit(void);
191
192void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
193union map_info *dm_get_mapinfo(struct bio *bio);
194
195#endif
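A few worked values for the arithmetic helpers near the end of the header, assuming the 512-byte sectors implied by SECTOR_SHIFT (illustrative only, not part of the patch):

    /*
     * dm_div_up(1023, 512)   == 2       ceiling division
     * dm_round_up(1023, 512) == 1024    round up to a multiple of sz
     * to_sector(4096)        == 8       bytes   -> 512-byte sectors
     * to_bytes(8)            == 4096    sectors -> bytes
     */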
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
new file mode 100644
index 000000000000..0248f8e7eac0
--- /dev/null
+++ b/drivers/md/faulty.c
@@ -0,0 +1,343 @@
1/*
2 * faulty.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 2004 Neil Brown
5 *
6 * faulty-device-simulator personality for md
7 *
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2, or (at your option)
12 * any later version.
13 *
14 * You should have received a copy of the GNU General Public License
15 * (for example /usr/src/linux/COPYING); if not, write to the Free
16 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19
20/*
21 * The "faulty" personality causes some requests to fail.
22 *
23 * Possible failure modes are:
24 * reads fail "randomly" but succeed on retry
25 * writes fail "randomly" but succeed on retry
26 * reads for some address fail and then persist until a write
27 * reads for some address fail and then persist irrespective of write
28 * writes for some address fail and persist
29 * all writes fail
30 *
31 * Different modes can be active at the same time, but only
32 * one can be set at array creation. Others can be added later.
33 * A mode can be one-shot or recurrent, with the recurrence being
34 * once in every N requests.
35 * The bottom 5 bits of the "layout" indicate the mode. The
36 * remainder indicate a period, or 0 for one-shot.
37 *
38 * There is an implementation limit on the number of concurrently
39 * persisting-faulty blocks. When a new fault is requested that would
40 * exceed the limit, it is ignored.
41 * All current faults can be cleared using a layout of "0".
42 *
43 * Requests are always sent to the device. If they are to fail,
44 * we clone the bio and insert a new b_end_io into the chain.
45 */
46
47#define WriteTransient 0
48#define ReadTransient 1
49#define WritePersistent 2
50#define ReadPersistent 3
51#define WriteAll 4 /* doesn't go to device */
52#define ReadFixable 5
53#define Modes 6
54
55#define ClearErrors 31
56#define ClearFaults 30
57
58#define AllPersist 100 /* internal use only */
59#define NoPersist 101
60
61#define ModeMask 0x1f
62#define ModeShift 5
63
64#define MaxFault 50
65#include <linux/raid/md.h>
66
67
68static int faulty_fail(struct bio *bio, unsigned int bytes_done, int error)
69{
70 struct bio *b = bio->bi_private;
71
72 b->bi_size = bio->bi_size;
73 b->bi_sector = bio->bi_sector;
74
75 if (bio->bi_size == 0)
76 bio_put(bio);
77
78 clear_bit(BIO_UPTODATE, &b->bi_flags);
79 return (b->bi_end_io)(b, bytes_done, -EIO);
80}
81
82typedef struct faulty_conf {
83 int period[Modes];
84 atomic_t counters[Modes];
85 sector_t faults[MaxFault];
86 int modes[MaxFault];
87 int nfaults;
88 mdk_rdev_t *rdev;
89} conf_t;
90
91static int check_mode(conf_t *conf, int mode)
92{
93 if (conf->period[mode] == 0 &&
94 atomic_read(&conf->counters[mode]) <= 0)
95 return 0; /* no failure, no decrement */
96
97
98 if (atomic_dec_and_test(&conf->counters[mode])) {
99 if (conf->period[mode])
100 atomic_set(&conf->counters[mode], conf->period[mode]);
101 return 1;
102 }
103 return 0;
104}
105
106static int check_sector(conf_t *conf, sector_t start, sector_t end, int dir)
107{
108 /* If we find a ReadFixable sector, we fix it ... */
109 int i;
110 for (i=0; i<conf->nfaults; i++)
111 if (conf->faults[i] >= start &&
112 conf->faults[i] < end) {
113 /* found it ... */
114 switch (conf->modes[i] * 2 + dir) {
115 case WritePersistent*2+WRITE: return 1;
116 case ReadPersistent*2+READ: return 1;
117 case ReadFixable*2+READ: return 1;
118 case ReadFixable*2+WRITE:
119 conf->modes[i] = NoPersist;
120 return 0;
121 case AllPersist*2+READ:
122 case AllPersist*2+WRITE: return 1;
123 default:
124 return 0;
125 }
126 }
127 return 0;
128}
129
130static void add_sector(conf_t *conf, sector_t start, int mode)
131{
132 int i;
133 int n = conf->nfaults;
134 for (i=0; i<conf->nfaults; i++)
135 if (conf->faults[i] == start) {
136 switch(mode) {
137 case NoPersist: conf->modes[i] = mode; return;
138 case WritePersistent:
139 if (conf->modes[i] == ReadPersistent ||
140 conf->modes[i] == ReadFixable)
141 conf->modes[i] = AllPersist;
142 else
143 conf->modes[i] = WritePersistent;
144 return;
145 case ReadPersistent:
146 if (conf->modes[i] == WritePersistent)
147 conf->modes[i] = AllPersist;
148 else
149 conf->modes[i] = ReadPersistent;
150 return;
151 case ReadFixable:
152 if (conf->modes[i] == WritePersistent ||
153 conf->modes[i] == ReadPersistent)
154 conf->modes[i] = AllPersist;
155 else
156 conf->modes[i] = ReadFixable;
157 return;
158 }
159 } else if (conf->modes[i] == NoPersist)
160 n = i;
161
162 if (n >= MaxFault)
163 return;
164 conf->faults[n] = start;
165 conf->modes[n] = mode;
166 if (conf->nfaults == n)
167 conf->nfaults = n+1;
168}
169
170static int make_request(request_queue_t *q, struct bio *bio)
171{
172 mddev_t *mddev = q->queuedata;
173 conf_t *conf = (conf_t*)mddev->private;
174 int failit = 0;
175
176 if (bio->bi_rw & 1) {
177 /* write request */
178 if (atomic_read(&conf->counters[WriteAll])) {
179 /* special case - don't decrement, don't generic_make_request,
180 * just fail immediately
181 */
182 bio_endio(bio, bio->bi_size, -EIO);
183 return 0;
184 }
185
186 if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
187 WRITE))
188 failit = 1;
189 if (check_mode(conf, WritePersistent)) {
190 add_sector(conf, bio->bi_sector, WritePersistent);
191 failit = 1;
192 }
193 if (check_mode(conf, WriteTransient))
194 failit = 1;
195 } else {
196 /* read request */
197 if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9),
198 READ))
199 failit = 1;
200 if (check_mode(conf, ReadTransient))
201 failit = 1;
202 if (check_mode(conf, ReadPersistent)) {
203 add_sector(conf, bio->bi_sector, ReadPersistent);
204 failit = 1;
205 }
206 if (check_mode(conf, ReadFixable)) {
207 add_sector(conf, bio->bi_sector, ReadFixable);
208 failit = 1;
209 }
210 }
211 if (failit) {
212 struct bio *b = bio_clone(bio, GFP_NOIO);
213 b->bi_bdev = conf->rdev->bdev;
214 b->bi_private = bio;
215 b->bi_end_io = faulty_fail;
216 generic_make_request(b);
217 return 0;
218 } else {
219 bio->bi_bdev = conf->rdev->bdev;
220 return 1;
221 }
222}
223
224static void status(struct seq_file *seq, mddev_t *mddev)
225{
226 conf_t *conf = (conf_t*)mddev->private;
227 int n;
228
229 if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
230 seq_printf(seq, " WriteTransient=%d(%d)",
231 n, conf->period[WriteTransient]);
232
233 if ((n=atomic_read(&conf->counters[ReadTransient])) != 0)
234 seq_printf(seq, " ReadTransient=%d(%d)",
235 n, conf->period[ReadTransient]);
236
237 if ((n=atomic_read(&conf->counters[WritePersistent])) != 0)
238 seq_printf(seq, " WritePersistent=%d(%d)",
239 n, conf->period[WritePersistent]);
240
241 if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0)
242 seq_printf(seq, " ReadPersistent=%d(%d)",
243 n, conf->period[ReadPersistent]);
244
245
246 if ((n=atomic_read(&conf->counters[ReadFixable])) != 0)
247 seq_printf(seq, " ReadFixable=%d(%d)",
248 n, conf->period[ReadFixable]);
249
250 if ((n=atomic_read(&conf->counters[WriteAll])) != 0)
251 seq_printf(seq, " WriteAll");
252
253 seq_printf(seq, " nfaults=%d", conf->nfaults);
254}
255
256
257static int reconfig(mddev_t *mddev, int layout, int chunk_size)
258{
259 int mode = layout & ModeMask;
260 int count = layout >> ModeShift;
261 conf_t *conf = mddev->private;
262
263 if (chunk_size != -1)
264 return -EINVAL;
265
266 /* new layout */
267 if (mode == ClearFaults)
268 conf->nfaults = 0;
269 else if (mode == ClearErrors) {
270 int i;
271 for (i=0 ; i < Modes ; i++) {
272 conf->period[i] = 0;
273 atomic_set(&conf->counters[i], 0);
274 }
275 } else if (mode < Modes) {
276 conf->period[mode] = count;
277 if (!count) count++;
278 atomic_set(&conf->counters[mode], count);
279 } else
280 return -EINVAL;
281 mddev->layout = -1; /* makes sure further changes come through */
282 return 0;
283}
284
285static int run(mddev_t *mddev)
286{
287 mdk_rdev_t *rdev;
288 struct list_head *tmp;
289 int i;
290
291 conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL);
292
293 for (i=0; i<Modes; i++) {
294 atomic_set(&conf->counters[i], 0);
295 conf->period[i] = 0;
296 }
297 conf->nfaults = 0;
298
299 ITERATE_RDEV(mddev, rdev, tmp)
300 conf->rdev = rdev;
301
302 mddev->array_size = mddev->size;
303 mddev->private = conf;
304
305 reconfig(mddev, mddev->layout, -1);
306
307 return 0;
308}
309
310static int stop(mddev_t *mddev)
311{
312 conf_t *conf = (conf_t *)mddev->private;
313
314 kfree(conf);
315 mddev->private = NULL;
316 return 0;
317}
318
319static mdk_personality_t faulty_personality =
320{
321 .name = "faulty",
322 .owner = THIS_MODULE,
323 .make_request = make_request,
324 .run = run,
325 .stop = stop,
326 .status = status,
327 .reconfig = reconfig,
328};
329
330static int __init raid_init(void)
331{
332 return register_md_personality(FAULTY, &faulty_personality);
333}
334
335static void raid_exit(void)
336{
337 unregister_md_personality(FAULTY);
338}
339
340module_init(raid_init);
341module_exit(raid_exit);
342MODULE_LICENSE("GPL");
343MODULE_ALIAS("md-personality-10"); /* faulty */
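The "layout" value decoded by reconfig() packs the mode into the bottom ModeShift (5) bits and the period into the remaining bits. A small sketch of how such a value would be composed -- the helper is hypothetical; only the encoding comes from the code above:

    /* Illustrative sketch only: compose a faulty "layout" value. */
    static int make_faulty_layout_sketch(int mode, int period)
    {
            return (period << ModeShift) | (mode & ModeMask);
    }

    /*
     * e.g. ReadTransient recurring once every 10 requests:
     *   make_faulty_layout_sketch(ReadTransient, 10) == (10 << 5) | 1 == 321
     */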
diff --git a/drivers/md/kcopyd.c b/drivers/md/kcopyd.c
new file mode 100644
index 000000000000..eb7036485975
--- /dev/null
+++ b/drivers/md/kcopyd.c
@@ -0,0 +1,687 @@
1/*
2 * Copyright (C) 2002 Sistina Software (UK) Limited.
3 *
4 * This file is released under the GPL.
5 *
6 * Kcopyd provides a simple interface for copying an area of one
7 * block-device to one or more other block-devices, with an asynchronous
8 * completion notification.
9 */
10
11#include <asm/atomic.h>
12
13#include <linux/blkdev.h>
14#include <linux/config.h>
15#include <linux/fs.h>
16#include <linux/init.h>
17#include <linux/list.h>
18#include <linux/mempool.h>
19#include <linux/module.h>
20#include <linux/pagemap.h>
21#include <linux/slab.h>
22#include <linux/vmalloc.h>
23#include <linux/workqueue.h>
24
25#include "kcopyd.h"
26
27static struct workqueue_struct *_kcopyd_wq;
28static struct work_struct _kcopyd_work;
29
30static inline void wake(void)
31{
32 queue_work(_kcopyd_wq, &_kcopyd_work);
33}
34
35/*-----------------------------------------------------------------
36 * Each kcopyd client has its own little pool of preallocated
37 * pages for kcopyd io.
38 *---------------------------------------------------------------*/
39struct kcopyd_client {
40 struct list_head list;
41
42 spinlock_t lock;
43 struct page_list *pages;
44 unsigned int nr_pages;
45 unsigned int nr_free_pages;
46};
47
48static struct page_list *alloc_pl(void)
49{
50 struct page_list *pl;
51
52 pl = kmalloc(sizeof(*pl), GFP_KERNEL);
53 if (!pl)
54 return NULL;
55
56 pl->page = alloc_page(GFP_KERNEL);
57 if (!pl->page) {
58 kfree(pl);
59 return NULL;
60 }
61
62 return pl;
63}
64
65static void free_pl(struct page_list *pl)
66{
67 __free_page(pl->page);
68 kfree(pl);
69}
70
71static int kcopyd_get_pages(struct kcopyd_client *kc,
72 unsigned int nr, struct page_list **pages)
73{
74 struct page_list *pl;
75
76 spin_lock(&kc->lock);
77 if (kc->nr_free_pages < nr) {
78 spin_unlock(&kc->lock);
79 return -ENOMEM;
80 }
81
82 kc->nr_free_pages -= nr;
83 for (*pages = pl = kc->pages; --nr; pl = pl->next)
84 ;
85
86 kc->pages = pl->next;
87 pl->next = NULL;
88
89 spin_unlock(&kc->lock);
90
91 return 0;
92}
93
94static void kcopyd_put_pages(struct kcopyd_client *kc, struct page_list *pl)
95{
96 struct page_list *cursor;
97
98 spin_lock(&kc->lock);
99 for (cursor = pl; cursor->next; cursor = cursor->next)
100 kc->nr_free_pages++;
101
102 kc->nr_free_pages++;
103 cursor->next = kc->pages;
104 kc->pages = pl;
105 spin_unlock(&kc->lock);
106}
107
108/*
109 * These three functions resize the page pool.
110 */
111static void drop_pages(struct page_list *pl)
112{
113 struct page_list *next;
114
115 while (pl) {
116 next = pl->next;
117 free_pl(pl);
118 pl = next;
119 }
120}
121
122static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
123{
124 unsigned int i;
125 struct page_list *pl = NULL, *next;
126
127 for (i = 0; i < nr; i++) {
128 next = alloc_pl();
129 if (!next) {
130 if (pl)
131 drop_pages(pl);
132 return -ENOMEM;
133 }
134 next->next = pl;
135 pl = next;
136 }
137
138 kcopyd_put_pages(kc, pl);
139 kc->nr_pages += nr;
140 return 0;
141}
142
143static void client_free_pages(struct kcopyd_client *kc)
144{
145 BUG_ON(kc->nr_free_pages != kc->nr_pages);
146 drop_pages(kc->pages);
147 kc->pages = NULL;
148 kc->nr_free_pages = kc->nr_pages = 0;
149}
150
151/*-----------------------------------------------------------------
152 * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
153 * for this reason we use a mempool to prevent the client from
154 * ever having to do io (which could cause a deadlock).
155 *---------------------------------------------------------------*/
156struct kcopyd_job {
157 struct kcopyd_client *kc;
158 struct list_head list;
159 unsigned long flags;
160
161 /*
162 * Error state of the job.
163 */
164 int read_err;
165 unsigned int write_err;
166
167 /*
168 * Either READ or WRITE
169 */
170 int rw;
171 struct io_region source;
172
173 /*
174 * The destinations for the transfer.
175 */
176 unsigned int num_dests;
177 struct io_region dests[KCOPYD_MAX_REGIONS];
178
179 sector_t offset;
180 unsigned int nr_pages;
181 struct page_list *pages;
182
183 /*
184 * Set this to ensure you are notified when the job has
185 * completed. 'context' is for callback to use.
186 */
187 kcopyd_notify_fn fn;
188 void *context;
189
190 /*
191 * These fields are only used if the job has been split
192 * into more manageable parts.
193 */
194 struct semaphore lock;
195 atomic_t sub_jobs;
196 sector_t progress;
197};
198
199/* FIXME: this should scale with the number of pages */
200#define MIN_JOBS 512
201
202static kmem_cache_t *_job_cache;
203static mempool_t *_job_pool;
204
205/*
206 * We maintain three lists of jobs:
207 *
208 * i) jobs waiting for pages
209 * ii) jobs that have pages, and are waiting for the io to be issued.
210 * iii) jobs that have completed.
211 *
212 * All three of these are protected by job_lock.
213 */
214static DEFINE_SPINLOCK(_job_lock);
215
216static LIST_HEAD(_complete_jobs);
217static LIST_HEAD(_io_jobs);
218static LIST_HEAD(_pages_jobs);
219
220static int jobs_init(void)
221{
222 _job_cache = kmem_cache_create("kcopyd-jobs",
223 sizeof(struct kcopyd_job),
224 __alignof__(struct kcopyd_job),
225 0, NULL, NULL);
226 if (!_job_cache)
227 return -ENOMEM;
228
229 _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
230 mempool_free_slab, _job_cache);
231 if (!_job_pool) {
232 kmem_cache_destroy(_job_cache);
233 return -ENOMEM;
234 }
235
236 return 0;
237}
238
239static void jobs_exit(void)
240{
241 BUG_ON(!list_empty(&_complete_jobs));
242 BUG_ON(!list_empty(&_io_jobs));
243 BUG_ON(!list_empty(&_pages_jobs));
244
245 mempool_destroy(_job_pool);
246 kmem_cache_destroy(_job_cache);
247 _job_pool = NULL;
248 _job_cache = NULL;
249}
250
251/*
252 * Functions to push and pop a job onto the head of a given job
253 * list.
254 */
255static inline struct kcopyd_job *pop(struct list_head *jobs)
256{
257 struct kcopyd_job *job = NULL;
258 unsigned long flags;
259
260 spin_lock_irqsave(&_job_lock, flags);
261
262 if (!list_empty(jobs)) {
263 job = list_entry(jobs->next, struct kcopyd_job, list);
264 list_del(&job->list);
265 }
266 spin_unlock_irqrestore(&_job_lock, flags);
267
268 return job;
269}
270
271static inline void push(struct list_head *jobs, struct kcopyd_job *job)
272{
273 unsigned long flags;
274
275 spin_lock_irqsave(&_job_lock, flags);
276 list_add_tail(&job->list, jobs);
277 spin_unlock_irqrestore(&_job_lock, flags);
278}
279
280/*
281 * These three functions process 1 item from the corresponding
282 * job list.
283 *
284 * They return:
285 * < 0: error
286 * 0: success
287 * > 0: can't process yet.
288 */
289static int run_complete_job(struct kcopyd_job *job)
290{
291 void *context = job->context;
292 int read_err = job->read_err;
293 unsigned int write_err = job->write_err;
294 kcopyd_notify_fn fn = job->fn;
295
296 kcopyd_put_pages(job->kc, job->pages);
297 mempool_free(job, _job_pool);
298 fn(read_err, write_err, context);
299 return 0;
300}
301
302static void complete_io(unsigned long error, void *context)
303{
304 struct kcopyd_job *job = (struct kcopyd_job *) context;
305
306 if (error) {
307 if (job->rw == WRITE)
308 job->write_err |= error; /* accumulate failing destinations */
309 else
310 job->read_err = 1;
311
312 if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
313 push(&_complete_jobs, job);
314 wake();
315 return;
316 }
317 }
318
319 if (job->rw == WRITE)
320 push(&_complete_jobs, job);
321
322 else {
323 job->rw = WRITE;
324 push(&_io_jobs, job);
325 }
326
327 wake();
328}
329
330/*
331 * Request io on as many pages as we can currently get for
332 * a particular job.
333 */
334static int run_io_job(struct kcopyd_job *job)
335{
336 int r;
337
338 if (job->rw == READ)
339 r = dm_io_async(1, &job->source, job->rw,
340 job->pages,
341 job->offset, complete_io, job);
342
343 else
344 r = dm_io_async(job->num_dests, job->dests, job->rw,
345 job->pages,
346 job->offset, complete_io, job);
347
348 return r;
349}
350
351static int run_pages_job(struct kcopyd_job *job)
352{
353 int r;
354
355 job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
356 PAGE_SIZE >> 9);
357 r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
358 if (!r) {
359 /* this job is ready for io */
360 push(&_io_jobs, job);
361 return 0;
362 }
363
364 if (r == -ENOMEM)
365 /* can't complete now */
366 return 1;
367
368 return r;
369}
370
371/*
372 * Run through a list for as long as possible. Returns the count
373 * of successful jobs.
374 */
375static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
376{
377 struct kcopyd_job *job;
378 int r, count = 0;
379
380 while ((job = pop(jobs))) {
381
382 r = fn(job);
383
384 if (r < 0) {
385 /* error this rogue job */
386 if (job->rw == WRITE)
387 job->write_err = (unsigned int) -1;
388 else
389 job->read_err = 1;
390 push(&_complete_jobs, job);
391 break;
392 }
393
394 if (r > 0) {
395 /*
396 * We couldn't service this job ATM, so
397 * push this job back onto the list.
398 */
399 push(jobs, job);
400 break;
401 }
402
403 count++;
404 }
405
406 return count;
407}
408
409/*
410 * kcopyd does this every time it's woken up.
411 */
412static void do_work(void *ignored)
413{
414 /*
415 * The order that these are called is *very* important.
416 * complete jobs can free some pages for pages jobs.
417 * Pages jobs when successful will jump onto the io jobs
418 * list. io jobs call wake when they complete and it all
419 * starts again.
420 */
421 process_jobs(&_complete_jobs, run_complete_job);
422 process_jobs(&_pages_jobs, run_pages_job);
423 process_jobs(&_io_jobs, run_io_job);
424}
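
Taken together, the tri-state return convention (< 0 error, 0 processed, > 0 cannot run yet) and the complete -> pages -> io ordering form a small pipeline. The sketch below is a userspace analogy only; the handlers and job values are invented to show how a deferred or failed job stops its list while the other lists keep draining:

#include <stdio.h>

/* tri-state handler: < 0 error, 0 processed, > 0 cannot run yet */
typedef int (*stage_fn)(int job);

static int run_complete(int job) { (void)job; return 0; }        /* always succeeds    */
static int run_pages(int job)    { return (job % 2) ? 1 : 0; }   /* odd jobs deferred  */
static int run_io(int job)       { return (job == 7) ? -1 : 0; } /* pretend 7 hits EIO */

/* drain one pseudo-list: stop on error or on "cannot run yet";
 * the real process_jobs() pushes a deferred job back onto its list */
static int process(const char *name, stage_fn fn, const int *jobs, int n)
{
	int i, count = 0;

	for (i = 0; i < n; i++) {
		int r = fn(jobs[i]);
		if (r < 0) { printf("%s: job %d failed\n", name, jobs[i]); break; }
		if (r > 0) { printf("%s: job %d deferred\n", name, jobs[i]); break; }
		count++;
	}
	return count;
}

int main(void)
{
	int jobs[] = { 2, 4, 3, 7 };

	/* same ordering as do_work(): completions can free pages that
	 * pages-jobs are waiting for, and pages-jobs feed the io list */
	printf("complete: %d done\n", process("complete", run_complete, jobs, 4));
	printf("pages:    %d done\n", process("pages",    run_pages,    jobs, 4));
	printf("io:       %d done\n", process("io",       run_io,       jobs, 4));
	return 0;
}
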
425
426/*
427 * If we are copying a small region we just dispatch a single job
428 * to do the copy, otherwise the io has to be split up into many
429 * jobs.
430 */
431static void dispatch_job(struct kcopyd_job *job)
432{
433 push(&_pages_jobs, job);
434 wake();
435}
436
437#define SUB_JOB_SIZE 128
438static void segment_complete(int read_err,
439 unsigned int write_err, void *context)
440{
441 /* FIXME: tidy this function */
442 sector_t progress = 0;
443 sector_t count = 0;
444 struct kcopyd_job *job = (struct kcopyd_job *) context;
445
446 down(&job->lock);
447
448 /* update the error */
449 if (read_err)
450 job->read_err = 1;
451
452 if (write_err)
453 job->write_err |= write_err;
454
455 /*
456 * Only dispatch more work if there hasn't been an error.
457 */
458 if ((!job->read_err && !job->write_err) ||
459 test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
460 /* get the next chunk of work */
461 progress = job->progress;
462 count = job->source.count - progress;
463 if (count) {
464 if (count > SUB_JOB_SIZE)
465 count = SUB_JOB_SIZE;
466
467 job->progress += count;
468 }
469 }
470 up(&job->lock);
471
472 if (count) {
473 int i;
474 struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);
475
476 *sub_job = *job;
477 sub_job->source.sector += progress;
478 sub_job->source.count = count;
479
480 for (i = 0; i < job->num_dests; i++) {
481 sub_job->dests[i].sector += progress;
482 sub_job->dests[i].count = count;
483 }
484
485 sub_job->fn = segment_complete;
486 sub_job->context = job;
487 dispatch_job(sub_job);
488
489 } else if (atomic_dec_and_test(&job->sub_jobs)) {
490
491 /*
492 * To avoid a race we must keep the job around
493 * until after the notify function has completed.
494 * Otherwise the client may try and stop the job
495 * after we've completed.
496 */
497 job->fn(read_err, write_err, job->context);
498 mempool_free(job, _job_pool);
499 }
500}
501
502/*
503 * Create some little jobs that, between them, will do
504 * the whole move.
505 */
506#define SPLIT_COUNT 8
507static void split_job(struct kcopyd_job *job)
508{
509 int i;
510
511 atomic_set(&job->sub_jobs, SPLIT_COUNT);
512 for (i = 0; i < SPLIT_COUNT; i++)
513 segment_complete(0, 0u, job);
514}
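
So a large request is carved into SUB_JOB_SIZE-sector (64 KiB) sub-jobs, with SPLIT_COUNT of them in flight at once; each completion claims the next chunk via job->progress until the whole source region is covered. A rough standalone calculation of the chunking (the region size is made up):

#include <stdio.h>

#define SUB_JOB_SIZE 128	/* sectors per sub-job: 128 * 512 B = 64 KiB */
#define SPLIT_COUNT  8		/* sub-jobs kept in flight at once           */

int main(void)
{
	unsigned long long count = 4096;	/* a 2 MiB region, in sectors */
	unsigned long long chunks = (count + SUB_JOB_SIZE - 1) / SUB_JOB_SIZE;

	printf("%llu sectors -> %llu sub-jobs of up to %d sectors, "
	       "%d dispatched at a time\n",
	       count, chunks, SUB_JOB_SIZE, SPLIT_COUNT);
	return 0;
}
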
515
516int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
517 unsigned int num_dests, struct io_region *dests,
518 unsigned int flags, kcopyd_notify_fn fn, void *context)
519{
520 struct kcopyd_job *job;
521
522 /*
523 * Allocate a new job.
524 */
525 job = mempool_alloc(_job_pool, GFP_NOIO);
526
527 /*
528 * set up for the read.
529 */
530 job->kc = kc;
531 job->flags = flags;
532 job->read_err = 0;
533 job->write_err = 0;
534 job->rw = READ;
535
536 job->source = *from;
537
538 job->num_dests = num_dests;
539 memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
540
541 job->offset = 0;
542 job->nr_pages = 0;
543 job->pages = NULL;
544
545 job->fn = fn;
546 job->context = context;
547
548 if (job->source.count < SUB_JOB_SIZE)
549 dispatch_job(job);
550
551 else {
552 init_MUTEX(&job->lock);
553 job->progress = 0;
554 split_job(job);
555 }
556
557 return 0;
558}
559
560/*
561 * Cancels a kcopyd job, e.g. someone might be deactivating a
562 * mirror.
563 */
564int kcopyd_cancel(struct kcopyd_job *job, int block)
565{
566 /* FIXME: finish */
567 return -1;
568}
569
570/*-----------------------------------------------------------------
571 * Unit setup
572 *---------------------------------------------------------------*/
573static DECLARE_MUTEX(_client_lock);
574static LIST_HEAD(_clients);
575
576static void client_add(struct kcopyd_client *kc)
577{
578 down(&_client_lock);
579 list_add(&kc->list, &_clients);
580 up(&_client_lock);
581}
582
583static void client_del(struct kcopyd_client *kc)
584{
585 down(&_client_lock);
586 list_del(&kc->list);
587 up(&_client_lock);
588}
589
590static DECLARE_MUTEX(kcopyd_init_lock);
591static int kcopyd_clients = 0;
592
593static int kcopyd_init(void)
594{
595 int r;
596
597 down(&kcopyd_init_lock);
598
599 if (kcopyd_clients) {
600 /* Already initialized. */
601 kcopyd_clients++;
602 up(&kcopyd_init_lock);
603 return 0;
604 }
605
606 r = jobs_init();
607 if (r) {
608 up(&kcopyd_init_lock);
609 return r;
610 }
611
612 _kcopyd_wq = create_singlethread_workqueue("kcopyd");
613 if (!_kcopyd_wq) {
614 jobs_exit();
615 up(&kcopyd_init_lock);
616 return -ENOMEM;
617 }
618
619 kcopyd_clients++;
620 INIT_WORK(&_kcopyd_work, do_work, NULL);
621 up(&kcopyd_init_lock);
622 return 0;
623}
624
625static void kcopyd_exit(void)
626{
627 down(&kcopyd_init_lock);
628 kcopyd_clients--;
629 if (!kcopyd_clients) {
630 jobs_exit();
631 destroy_workqueue(_kcopyd_wq);
632 _kcopyd_wq = NULL;
633 }
634 up(&kcopyd_init_lock);
635}
636
637int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
638{
639 int r = 0;
640 struct kcopyd_client *kc;
641
642 r = kcopyd_init();
643 if (r)
644 return r;
645
646 kc = kmalloc(sizeof(*kc), GFP_KERNEL);
647 if (!kc) {
648 kcopyd_exit();
649 return -ENOMEM;
650 }
651
652 spin_lock_init(&kc->lock);
653 kc->pages = NULL;
654 kc->nr_pages = kc->nr_free_pages = 0;
655 r = client_alloc_pages(kc, nr_pages);
656 if (r) {
657 kfree(kc);
658 kcopyd_exit();
659 return r;
660 }
661
662 r = dm_io_get(nr_pages);
663 if (r) {
664 client_free_pages(kc);
665 kfree(kc);
666 kcopyd_exit();
667 return r;
668 }
669
670 client_add(kc);
671 *result = kc;
672 return 0;
673}
674
675void kcopyd_client_destroy(struct kcopyd_client *kc)
676{
677 dm_io_put(kc->nr_pages);
678 client_free_pages(kc);
679 client_del(kc);
680 kfree(kc);
681 kcopyd_exit();
682}
683
684EXPORT_SYMBOL(kcopyd_client_create);
685EXPORT_SYMBOL(kcopyd_client_destroy);
686EXPORT_SYMBOL(kcopyd_copy);
687EXPORT_SYMBOL(kcopyd_cancel);
diff --git a/drivers/md/kcopyd.h b/drivers/md/kcopyd.h
new file mode 100644
index 000000000000..4621ea055c0e
--- /dev/null
+++ b/drivers/md/kcopyd.h
@@ -0,0 +1,42 @@
1/*
2 * Copyright (C) 2001 Sistina Software
3 *
4 * This file is released under the GPL.
5 *
6 * Kcopyd provides a simple interface for copying an area of one
7 * block-device to one or more other block-devices, with an asynchronous
8 * completion notification.
9 */
10
11#ifndef DM_KCOPYD_H
12#define DM_KCOPYD_H
13
14#include "dm-io.h"
15
16/* FIXME: make this configurable */
17#define KCOPYD_MAX_REGIONS 8
18
19#define KCOPYD_IGNORE_ERROR 1
20
21/*
22 * To use kcopyd you must first create a kcopyd client object.
23 */
24struct kcopyd_client;
25int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result);
26void kcopyd_client_destroy(struct kcopyd_client *kc);
27
28/*
29 * Submit a copy job to kcopyd. This is built on top of the
30 * client interface above.
31 *
32 * read_err is a boolean,
33 * write_err is a bitset, with 1 bit for each destination region
34 */
35typedef void (*kcopyd_notify_fn)(int read_err,
36 unsigned int write_err, void *context);
37
38int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
39 unsigned int num_dests, struct io_region *dests,
40 unsigned int flags, kcopyd_notify_fn fn, void *context);
41
42#endif
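
A hedged sketch of how a caller (a mirror or snapshot target, say) might drive this interface. The helper name copy_region, the 32-page client size, and the completion-based wait are illustrative only, and the io_region fields (bdev, sector, count) are assumed from dm-io.h rather than shown here:

#include <linux/blkdev.h>
#include <linux/completion.h>
#include "kcopyd.h"

/* illustrative completion callback: report errors, wake the submitter */
static void copy_done(int read_err, unsigned int write_err, void *context)
{
	if (read_err || write_err)
		printk(KERN_ERR "copy failed (read %d, write 0x%x)\n",
		       read_err, write_err);
	complete((struct completion *) context);
}

static int copy_region(struct block_device *src, struct block_device *dst,
		       sector_t sector, sector_t count)
{
	struct kcopyd_client *kc;
	struct io_region from, to;
	struct completion done;
	int r;

	r = kcopyd_client_create(32, &kc);	/* 32 pages of copy buffer */
	if (r)
		return r;

	from.bdev = src;  from.sector = sector;  from.count = count;
	to.bdev   = dst;  to.sector   = sector;  to.count   = count;

	init_completion(&done);
	r = kcopyd_copy(kc, &from, 1, &to, 0, copy_done, &done);
	if (!r)
		wait_for_completion(&done);

	kcopyd_client_destroy(kc);
	return r;
}
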
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
new file mode 100644
index 000000000000..161e9aa87291
--- /dev/null
+++ b/drivers/md/linear.c
@@ -0,0 +1,343 @@
1/*
2 linear.c : Multiple Devices driver for Linux
3 Copyright (C) 1994-96 Marc ZYNGIER
4 <zyngier@ufr-info-p7.ibp.fr> or
5 <maz@gloups.fdn.fr>
6
7 Linear mode management functions.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2, or (at your option)
12 any later version.
13
14 You should have received a copy of the GNU General Public License
15 (for example /usr/src/linux/COPYING); if not, write to the Free
16 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17*/
18
19#include <linux/module.h>
20
21#include <linux/raid/md.h>
22#include <linux/slab.h>
23#include <linux/raid/linear.h>
24
25#define MAJOR_NR MD_MAJOR
26#define MD_DRIVER
27#define MD_PERSONALITY
28
29/*
30 * find which device holds a particular offset
31 */
32static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
33{
34 dev_info_t *hash;
35 linear_conf_t *conf = mddev_to_conf(mddev);
36 sector_t block = sector >> 1;
37
38 /*
39 * sector_div(a,b) returns the remainder and sets a to a/b
40 */
41 (void)sector_div(block, conf->smallest->size);
42 hash = conf->hash_table[block];
43
44 while ((sector>>1) >= (hash->size + hash->offset))
45 hash++;
46 return hash;
47}
48
49/**
50 * linear_mergeable_bvec -- tell bio layer if two requests can be merged
51 * @q: request queue
52 * @bio: the bio that's been built up so far
53 * @biovec: the request that could be merged to it.
54 *
55 * Return amount of bytes we can take at this offset
56 */
57static int linear_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec)
58{
59 mddev_t *mddev = q->queuedata;
60 dev_info_t *dev0;
61 unsigned long maxsectors, bio_sectors = bio->bi_size >> 9;
62 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
63
64 dev0 = which_dev(mddev, sector);
65 maxsectors = (dev0->size << 1) - (sector - (dev0->offset<<1));
66
67 if (maxsectors < bio_sectors)
68 maxsectors = 0;
69 else
70 maxsectors -= bio_sectors;
71
72 if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
73 return biovec->bv_len;
74 /* The bytes available at this offset could be really big,
75 * so we cap at 2^31 to avoid overflow */
76 if (maxsectors > (1 << (31-9)))
77 return 1<<31;
78 return maxsectors << 9;
79}
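
In short, the hook reports how many more bytes can be merged before the bio would spill off the member device it currently maps to. A toy calculation with made-up numbers:

#include <stdio.h>

int main(void)
{
	unsigned long dev_end = 1000;                 /* device covers sectors 0..999 */
	unsigned long sector = 990, bio_sectors = 6;  /* bio built up so far          */

	unsigned long maxsectors = dev_end - sector;  /* 10 sectors to the boundary   */

	maxsectors = (maxsectors < bio_sectors) ? 0 : maxsectors - bio_sectors;

	/* 10 - 6 already queued = 4 sectors, i.e. 2048 more bytes can be merged */
	printf("can still merge %lu bytes\n", maxsectors << 9);
	return 0;
}
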
80
81static void linear_unplug(request_queue_t *q)
82{
83 mddev_t *mddev = q->queuedata;
84 linear_conf_t *conf = mddev_to_conf(mddev);
85 int i;
86
87 for (i=0; i < mddev->raid_disks; i++) {
88 request_queue_t *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
89 if (r_queue->unplug_fn)
90 r_queue->unplug_fn(r_queue);
91 }
92}
93
94static int linear_issue_flush(request_queue_t *q, struct gendisk *disk,
95 sector_t *error_sector)
96{
97 mddev_t *mddev = q->queuedata;
98 linear_conf_t *conf = mddev_to_conf(mddev);
99 int i, ret = 0;
100
101 for (i=0; i < mddev->raid_disks && ret == 0; i++) {
102 struct block_device *bdev = conf->disks[i].rdev->bdev;
103 request_queue_t *r_queue = bdev_get_queue(bdev);
104
105 if (!r_queue->issue_flush_fn)
106 ret = -EOPNOTSUPP;
107 else
108 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
109 }
110 return ret;
111}
112
113static int linear_run (mddev_t *mddev)
114{
115 linear_conf_t *conf;
116 dev_info_t **table;
117 mdk_rdev_t *rdev;
118 int i, nb_zone, cnt;
119 sector_t start;
120 sector_t curr_offset;
121 struct list_head *tmp;
122
123 conf = kmalloc (sizeof (*conf) + mddev->raid_disks*sizeof(dev_info_t),
124 GFP_KERNEL);
125 if (!conf)
126 goto out;
127 memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t));
128 mddev->private = conf;
129
130 /*
131 * Find the smallest device.
132 */
133
134 conf->smallest = NULL;
135 cnt = 0;
136 mddev->array_size = 0;
137
138 ITERATE_RDEV(mddev,rdev,tmp) {
139 int j = rdev->raid_disk;
140 dev_info_t *disk = conf->disks + j;
141
142 if (j < 0 || j >= mddev->raid_disks || disk->rdev) {
143 printk("linear: disk numbering problem. Aborting!\n");
144 goto out;
145 }
146
147 disk->rdev = rdev;
148
149 blk_queue_stack_limits(mddev->queue,
150 rdev->bdev->bd_disk->queue);
151 /* as we don't honour merge_bvec_fn, we must never risk
152 * violating it, so limit ->max_sector to one PAGE, as
153 * a one page request is never in violation.
154 */
155 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
156 mddev->queue->max_sectors > (PAGE_SIZE>>9))
157 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
158
159 disk->size = rdev->size;
160 mddev->array_size += rdev->size;
161
162 if (!conf->smallest || (disk->size < conf->smallest->size))
163 conf->smallest = disk;
164 cnt++;
165 }
166 if (cnt != mddev->raid_disks) {
167 printk("linear: not enough drives present. Aborting!\n");
168 goto out;
169 }
170
171 /*
172 * This code was restructured to work around a gcc-2.95.3 internal
173 * compiler error. Alter it with care.
174 */
175 {
176 sector_t sz;
177 unsigned round;
178 unsigned long base;
179
180 sz = mddev->array_size;
181 base = conf->smallest->size;
182 round = sector_div(sz, base);
183 nb_zone = conf->nr_zones = sz + (round ? 1 : 0);
184 }
185
186 conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone,
187 GFP_KERNEL);
188 if (!conf->hash_table)
189 goto out;
190
191 /*
192 * Here we generate the linear hash table
193 */
194 table = conf->hash_table;
195 start = 0;
196 curr_offset = 0;
197 for (i = 0; i < cnt; i++) {
198 dev_info_t *disk = conf->disks + i;
199
200 disk->offset = curr_offset;
201 curr_offset += disk->size;
202
203 /* 'curr_offset' is the end of this disk
204 * 'start' is the start of table
205 */
206 while (start < curr_offset) {
207 *table++ = disk;
208 start += conf->smallest->size;
209 }
210 }
211 if (table-conf->hash_table != nb_zone)
212 BUG();
213
214 blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
215 mddev->queue->unplug_fn = linear_unplug;
216 mddev->queue->issue_flush_fn = linear_issue_flush;
217 return 0;
218
219out:
220 if (conf)
221 kfree(conf);
222 return 1;
223}
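
The hash table built above trades a little memory for fast lookups: every zone is the width of the smallest member, so which_dev() can divide, index, and then walk forward at most past the tail of a larger member. A standalone sketch of both the construction loop and a lookup, with invented member sizes (300, 500 and 200 blocks):

#include <stdio.h>

struct dev { unsigned long offset, size; };	/* sizes in blocks */

int main(void)
{
	struct dev disks[] = { { 0, 300 }, { 300, 500 }, { 800, 200 } };
	unsigned long smallest = 200, total = 1000, block = 350, start = 0;
	struct dev *table[5], **t = table, *hash;
	int i, nb_zone = (total + smallest - 1) / smallest;	/* 5 zones */

	/* build: each zone points at the first member covering its start */
	for (i = 0; i < 3; i++)
		while (start < disks[i].offset + disks[i].size) {
			*t++ = &disks[i];
			start += smallest;
		}

	/* lookup: index by zone, then step forward past smaller members */
	hash = table[block / smallest];
	while (block >= hash->offset + hash->size)
		hash++;

	printf("%d zones; block %lu lands on the member at offset %lu\n",
	       nb_zone, block, hash->offset);
	return 0;
}
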
224
225static int linear_stop (mddev_t *mddev)
226{
227 linear_conf_t *conf = mddev_to_conf(mddev);
228
229 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
230 kfree(conf->hash_table);
231 kfree(conf);
232
233 return 0;
234}
235
236static int linear_make_request (request_queue_t *q, struct bio *bio)
237{
238 mddev_t *mddev = q->queuedata;
239 dev_info_t *tmp_dev;
240 sector_t block;
241
242 if (bio_data_dir(bio)==WRITE) {
243 disk_stat_inc(mddev->gendisk, writes);
244 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
245 } else {
246 disk_stat_inc(mddev->gendisk, reads);
247 disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio));
248 }
249
250 tmp_dev = which_dev(mddev, bio->bi_sector);
251 block = bio->bi_sector >> 1;
252
253 if (unlikely(block >= (tmp_dev->size + tmp_dev->offset)
254 || block < tmp_dev->offset)) {
255 char b[BDEVNAME_SIZE];
256
257 printk("linear_make_request: Block %llu out of bounds on "
258 "dev %s size %llu offset %llu\n",
259 (unsigned long long)block,
260 bdevname(tmp_dev->rdev->bdev, b),
261 (unsigned long long)tmp_dev->size,
262 (unsigned long long)tmp_dev->offset);
263 bio_io_error(bio, bio->bi_size);
264 return 0;
265 }
266 if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
267 (tmp_dev->offset + tmp_dev->size)<<1)) {
268 /* This bio crosses a device boundary, so we have to
269 * split it.
270 */
271 struct bio_pair *bp;
272 bp = bio_split(bio, bio_split_pool,
273 (bio->bi_sector + (bio->bi_size >> 9) -
274 (tmp_dev->offset + tmp_dev->size))<<1);
275 if (linear_make_request(q, &bp->bio1))
276 generic_make_request(&bp->bio1);
277 if (linear_make_request(q, &bp->bio2))
278 generic_make_request(&bp->bio2);
279 bio_pair_release(bp);
280 return 0;
281 }
282
283 bio->bi_bdev = tmp_dev->rdev->bdev;
284 bio->bi_sector = bio->bi_sector - (tmp_dev->offset << 1) + tmp_dev->rdev->data_offset;
285
286 return 1;
287}
288
289static void linear_status (struct seq_file *seq, mddev_t *mddev)
290{
291
292#undef MD_DEBUG
293#ifdef MD_DEBUG
294 int j;
295 linear_conf_t *conf = mddev_to_conf(mddev);
296 sector_t s = 0;
297
298 seq_printf(seq, " ");
299 for (j = 0; j < conf->nr_zones; j++)
300 {
301 char b[BDEVNAME_SIZE];
302 s += conf->smallest_size;
303 seq_printf(seq, "[%s",
304 bdevname(conf->hash_table[j][0].rdev->bdev,b));
305
306 while (s > conf->hash_table[j][0].offset +
307 conf->hash_table[j][0].size)
308 seq_printf(seq, "/%s] ",
309 bdevname(conf->hash_table[j][1].rdev->bdev,b));
310 else
311 seq_printf(seq, "] ");
312 }
313 seq_printf(seq, "\n");
314#endif
315 seq_printf(seq, " %dk rounding", mddev->chunk_size/1024);
316}
317
318
319static mdk_personality_t linear_personality=
320{
321 .name = "linear",
322 .owner = THIS_MODULE,
323 .make_request = linear_make_request,
324 .run = linear_run,
325 .stop = linear_stop,
326 .status = linear_status,
327};
328
329static int __init linear_init (void)
330{
331 return register_md_personality (LINEAR, &linear_personality);
332}
333
334static void linear_exit (void)
335{
336 unregister_md_personality (LINEAR);
337}
338
339
340module_init(linear_init);
341module_exit(linear_exit);
342MODULE_LICENSE("GPL");
343MODULE_ALIAS("md-personality-1"); /* LINEAR */
diff --git a/drivers/md/md.c b/drivers/md/md.c
new file mode 100644
index 000000000000..04562add1920
--- /dev/null
+++ b/drivers/md/md.c
@@ -0,0 +1,3766 @@
1/*
2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5 completely rewritten, based on the MD driver code from Marc Zyngier
6
7 Changes:
8
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13 - kmod support by: Cyrus Durgin
14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17 - lots of fixes and improvements to the RAID1/RAID5 and generic
18 RAID code (such as request based resynchronization):
19
20 Neil Brown <neilb@cse.unsw.edu.au>.
21
22 This program is free software; you can redistribute it and/or modify
23 it under the terms of the GNU General Public License as published by
24 the Free Software Foundation; either version 2, or (at your option)
25 any later version.
26
27 You should have received a copy of the GNU General Public License
28 (for example /usr/src/linux/COPYING); if not, write to the Free
29 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30*/
31
32#include <linux/module.h>
33#include <linux/config.h>
34#include <linux/linkage.h>
35#include <linux/raid/md.h>
36#include <linux/sysctl.h>
37#include <linux/devfs_fs_kernel.h>
38#include <linux/buffer_head.h> /* for invalidate_bdev */
39#include <linux/suspend.h>
40
41#include <linux/init.h>
42
43#ifdef CONFIG_KMOD
44#include <linux/kmod.h>
45#endif
46
47#include <asm/unaligned.h>
48
49#define MAJOR_NR MD_MAJOR
50#define MD_DRIVER
51
52/* 63 partitions with the alternate major number (mdp) */
53#define MdpMinorShift 6
54
55#define DEBUG 0
56#define dprintk(x...) ((void)(DEBUG && printk(x)))
57
58
59#ifndef MODULE
60static void autostart_arrays (int part);
61#endif
62
63static mdk_personality_t *pers[MAX_PERSONALITY];
64static DEFINE_SPINLOCK(pers_lock);
65
66/*
67 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
68 * is 1000 KB/sec, so the extra system load does not show up that much.
69 * Increase it if you want to have more _guaranteed_ speed. Note that
70 * the RAID driver will use the maximum available bandwidth if the IO
71 * subsystem is idle. There is also an 'absolute maximum' reconstruction
72 * speed limit - in case reconstruction slows down your system despite
73 * idle IO detection.
74 *
75 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
76 */
77
78static int sysctl_speed_limit_min = 1000;
79static int sysctl_speed_limit_max = 200000;
80
81static struct ctl_table_header *raid_table_header;
82
83static ctl_table raid_table[] = {
84 {
85 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
86 .procname = "speed_limit_min",
87 .data = &sysctl_speed_limit_min,
88 .maxlen = sizeof(int),
89 .mode = 0644,
90 .proc_handler = &proc_dointvec,
91 },
92 {
93 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
94 .procname = "speed_limit_max",
95 .data = &sysctl_speed_limit_max,
96 .maxlen = sizeof(int),
97 .mode = 0644,
98 .proc_handler = &proc_dointvec,
99 },
100 { .ctl_name = 0 }
101};
102
103static ctl_table raid_dir_table[] = {
104 {
105 .ctl_name = DEV_RAID,
106 .procname = "raid",
107 .maxlen = 0,
108 .mode = 0555,
109 .child = raid_table,
110 },
111 { .ctl_name = 0 }
112};
113
114static ctl_table raid_root_table[] = {
115 {
116 .ctl_name = CTL_DEV,
117 .procname = "dev",
118 .maxlen = 0,
119 .mode = 0555,
120 .child = raid_dir_table,
121 },
122 { .ctl_name = 0 }
123};
124
125static struct block_device_operations md_fops;
126
127/*
128 * Enables to iterate over all existing md arrays
129 * all_mddevs_lock protects this list.
130 */
131static LIST_HEAD(all_mddevs);
132static DEFINE_SPINLOCK(all_mddevs_lock);
133
134
135/*
136 * iterates through all used mddevs in the system.
137 * We take care to grab the all_mddevs_lock whenever navigating
138 * the list, and to always hold a refcount when unlocked.
139 * Any code which breaks out of this loop will own
140 * a reference to the current mddev and must mddev_put it.
141 */
142#define ITERATE_MDDEV(mddev,tmp) \
143 \
144 for (({ spin_lock(&all_mddevs_lock); \
145 tmp = all_mddevs.next; \
146 mddev = NULL;}); \
147 ({ if (tmp != &all_mddevs) \
148 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
149 spin_unlock(&all_mddevs_lock); \
150 if (mddev) mddev_put(mddev); \
151 mddev = list_entry(tmp, mddev_t, all_mddevs); \
152 tmp != &all_mddevs;}); \
153 ({ spin_lock(&all_mddevs_lock); \
154 tmp = tmp->next;}) \
155 )
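
A sketch of the intended usage. Each iteration re-takes all_mddevs_lock just long enough to advance, grabs a reference on the mddev it hands out and drops the previous one, so the loop body runs unlocked with the current array pinned (the body below is illustrative):

	mddev_t *mddev;
	struct list_head *tmp;

	ITERATE_MDDEV(mddev, tmp) {
		/* safe to sleep here: no spinlock held, mddev referenced */
		printk(KERN_INFO "%s has %d raid disks\n",
		       mdname(mddev), mddev->raid_disks);
	}
	/* code that breaks out early still owns a reference and
	 * must drop it with mddev_put(mddev) */
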
156
157
158static int md_fail_request (request_queue_t *q, struct bio *bio)
159{
160 bio_io_error(bio, bio->bi_size);
161 return 0;
162}
163
164static inline mddev_t *mddev_get(mddev_t *mddev)
165{
166 atomic_inc(&mddev->active);
167 return mddev;
168}
169
170static void mddev_put(mddev_t *mddev)
171{
172 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
173 return;
174 if (!mddev->raid_disks && list_empty(&mddev->disks)) {
175 list_del(&mddev->all_mddevs);
176 blk_put_queue(mddev->queue);
177 kfree(mddev);
178 }
179 spin_unlock(&all_mddevs_lock);
180}
181
182static mddev_t * mddev_find(dev_t unit)
183{
184 mddev_t *mddev, *new = NULL;
185
186 retry:
187 spin_lock(&all_mddevs_lock);
188 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
189 if (mddev->unit == unit) {
190 mddev_get(mddev);
191 spin_unlock(&all_mddevs_lock);
192 if (new)
193 kfree(new);
194 return mddev;
195 }
196
197 if (new) {
198 list_add(&new->all_mddevs, &all_mddevs);
199 spin_unlock(&all_mddevs_lock);
200 return new;
201 }
202 spin_unlock(&all_mddevs_lock);
203
204 new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
205 if (!new)
206 return NULL;
207
208 memset(new, 0, sizeof(*new));
209
210 new->unit = unit;
211 if (MAJOR(unit) == MD_MAJOR)
212 new->md_minor = MINOR(unit);
213 else
214 new->md_minor = MINOR(unit) >> MdpMinorShift;
215
216 init_MUTEX(&new->reconfig_sem);
217 INIT_LIST_HEAD(&new->disks);
218 INIT_LIST_HEAD(&new->all_mddevs);
219 init_timer(&new->safemode_timer);
220 atomic_set(&new->active, 1);
221
222 new->queue = blk_alloc_queue(GFP_KERNEL);
223 if (!new->queue) {
224 kfree(new);
225 return NULL;
226 }
227
228 blk_queue_make_request(new->queue, md_fail_request);
229
230 goto retry;
231}
232
233static inline int mddev_lock(mddev_t * mddev)
234{
235 return down_interruptible(&mddev->reconfig_sem);
236}
237
238static inline void mddev_lock_uninterruptible(mddev_t * mddev)
239{
240 down(&mddev->reconfig_sem);
241}
242
243static inline int mddev_trylock(mddev_t * mddev)
244{
245 return down_trylock(&mddev->reconfig_sem);
246}
247
248static inline void mddev_unlock(mddev_t * mddev)
249{
250 up(&mddev->reconfig_sem);
251
252 if (mddev->thread)
253 md_wakeup_thread(mddev->thread);
254}
255
256mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
257{
258 mdk_rdev_t * rdev;
259 struct list_head *tmp;
260
261 ITERATE_RDEV(mddev,rdev,tmp) {
262 if (rdev->desc_nr == nr)
263 return rdev;
264 }
265 return NULL;
266}
267
268static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
269{
270 struct list_head *tmp;
271 mdk_rdev_t *rdev;
272
273 ITERATE_RDEV(mddev,rdev,tmp) {
274 if (rdev->bdev->bd_dev == dev)
275 return rdev;
276 }
277 return NULL;
278}
279
280inline static sector_t calc_dev_sboffset(struct block_device *bdev)
281{
282 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
283 return MD_NEW_SIZE_BLOCKS(size);
284}
285
286static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
287{
288 sector_t size;
289
290 size = rdev->sb_offset;
291
292 if (chunk_size)
293 size &= ~((sector_t)chunk_size/1024 - 1);
294 return size;
295}
296
297static int alloc_disk_sb(mdk_rdev_t * rdev)
298{
299 if (rdev->sb_page)
300 MD_BUG();
301
302 rdev->sb_page = alloc_page(GFP_KERNEL);
303 if (!rdev->sb_page) {
304 printk(KERN_ALERT "md: out of memory.\n");
305 return -EINVAL;
306 }
307
308 return 0;
309}
310
311static void free_disk_sb(mdk_rdev_t * rdev)
312{
313 if (rdev->sb_page) {
314 page_cache_release(rdev->sb_page);
315 rdev->sb_loaded = 0;
316 rdev->sb_page = NULL;
317 rdev->sb_offset = 0;
318 rdev->size = 0;
319 }
320}
321
322
323static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
324{
325 if (bio->bi_size)
326 return 1;
327
328 complete((struct completion*)bio->bi_private);
329 return 0;
330}
331
332static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
333 struct page *page, int rw)
334{
335 struct bio *bio = bio_alloc(GFP_KERNEL, 1);
336 struct completion event;
337 int ret;
338
339 rw |= (1 << BIO_RW_SYNC);
340
341 bio->bi_bdev = bdev;
342 bio->bi_sector = sector;
343 bio_add_page(bio, page, size, 0);
344 init_completion(&event);
345 bio->bi_private = &event;
346 bio->bi_end_io = bi_complete;
347 submit_bio(rw, bio);
348 wait_for_completion(&event);
349
350 ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
351 bio_put(bio);
352 return ret;
353}
354
355static int read_disk_sb(mdk_rdev_t * rdev)
356{
357 char b[BDEVNAME_SIZE];
358 if (!rdev->sb_page) {
359 MD_BUG();
360 return -EINVAL;
361 }
362 if (rdev->sb_loaded)
363 return 0;
364
365
366 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
367 goto fail;
368 rdev->sb_loaded = 1;
369 return 0;
370
371fail:
372 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
373 bdevname(rdev->bdev,b));
374 return -EINVAL;
375}
376
377static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
378{
379 if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
380 (sb1->set_uuid1 == sb2->set_uuid1) &&
381 (sb1->set_uuid2 == sb2->set_uuid2) &&
382 (sb1->set_uuid3 == sb2->set_uuid3))
383
384 return 1;
385
386 return 0;
387}
388
389
390static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
391{
392 int ret;
393 mdp_super_t *tmp1, *tmp2;
394
395 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
396 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
397
398 if (!tmp1 || !tmp2) {
399 ret = 0;
400 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
401 goto abort;
402 }
403
404 *tmp1 = *sb1;
405 *tmp2 = *sb2;
406
407 /*
408 * nr_disks is not constant
409 */
410 tmp1->nr_disks = 0;
411 tmp2->nr_disks = 0;
412
413 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
414 ret = 0;
415 else
416 ret = 1;
417
418abort:
419 if (tmp1)
420 kfree(tmp1);
421 if (tmp2)
422 kfree(tmp2);
423
424 return ret;
425}
426
427static unsigned int calc_sb_csum(mdp_super_t * sb)
428{
429 unsigned int disk_csum, csum;
430
431 disk_csum = sb->sb_csum;
432 sb->sb_csum = 0;
433 csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
434 sb->sb_csum = disk_csum;
435 return csum;
436}
437
438
439/*
440 * Handle superblock details.
441 * We want to be able to handle multiple superblock formats
442 * so we have a common interface to them all, and an array of
443 * different handlers.
444 * We rely on user-space to write the initial superblock, and support
445 * reading and updating of superblocks.
446 * Interface methods are:
447 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
448 * loads and validates a superblock on dev.
449 * if refdev != NULL, compare superblocks on both devices
450 * Return:
451 * 0 - dev has a superblock that is compatible with refdev
452 * 1 - dev has a superblock that is compatible and newer than refdev
453 * so dev should be used as the refdev in future
454 * -EINVAL superblock incompatible or invalid
455 * -othererror e.g. -EIO
456 *
457 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
458 * Verify that dev is acceptable into mddev.
459 * The first time, mddev->raid_disks will be 0, and data from
460 * dev should be merged in. Subsequent calls check that dev
461 * is new enough. Return 0 or -EINVAL
462 *
463 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
464 * Update the superblock for rdev with data in mddev
465 * This does not write to disc.
466 *
467 */
468
469struct super_type {
470 char *name;
471 struct module *owner;
472 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
473 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
474 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
475};
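
A sketch of how the load_super() return convention is consumed when assembling an array: whichever device reports itself newer becomes the reference superblock, and devices that fail outright are kicked. This mirrors what analyze_sbs() further down does; the loop here is a simplified illustration, not a replacement for it:

	mdk_rdev_t *rdev, *freshest = NULL;
	struct list_head *tmp;

	ITERATE_RDEV(mddev, rdev, tmp) {
		switch (super_types[mddev->major_version].
			load_super(rdev, freshest, mddev->minor_version)) {
		case 1:			/* newer than the current reference */
			freshest = rdev;
			break;
		case 0:			/* compatible, but not newer */
			break;
		default:		/* -EINVAL or another error */
			kick_rdev_from_array(rdev);
		}
	}
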
476
477/*
478 * load_super for 0.90.0
479 */
480static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
481{
482 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
483 mdp_super_t *sb;
484 int ret;
485 sector_t sb_offset;
486
487 /*
488 * Calculate the position of the superblock,
489 * it's at the end of the disk.
490 *
491 * It also happens to be a multiple of 4Kb.
492 */
493 sb_offset = calc_dev_sboffset(rdev->bdev);
494 rdev->sb_offset = sb_offset;
495
496 ret = read_disk_sb(rdev);
497 if (ret) return ret;
498
499 ret = -EINVAL;
500
501 bdevname(rdev->bdev, b);
502 sb = (mdp_super_t*)page_address(rdev->sb_page);
503
504 if (sb->md_magic != MD_SB_MAGIC) {
505 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
506 b);
507 goto abort;
508 }
509
510 if (sb->major_version != 0 ||
511 sb->minor_version != 90) {
512 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
513 sb->major_version, sb->minor_version,
514 b);
515 goto abort;
516 }
517
518 if (sb->raid_disks <= 0)
519 goto abort;
520
521 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
522 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
523 b);
524 goto abort;
525 }
526
527 rdev->preferred_minor = sb->md_minor;
528 rdev->data_offset = 0;
529
530 if (sb->level == LEVEL_MULTIPATH)
531 rdev->desc_nr = -1;
532 else
533 rdev->desc_nr = sb->this_disk.number;
534
535 if (refdev == 0)
536 ret = 1;
537 else {
538 __u64 ev1, ev2;
539 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
540 if (!uuid_equal(refsb, sb)) {
541 printk(KERN_WARNING "md: %s has different UUID to %s\n",
542 b, bdevname(refdev->bdev,b2));
543 goto abort;
544 }
545 if (!sb_equal(refsb, sb)) {
546 printk(KERN_WARNING "md: %s has same UUID"
547 " but different superblock to %s\n",
548 b, bdevname(refdev->bdev, b2));
549 goto abort;
550 }
551 ev1 = md_event(sb);
552 ev2 = md_event(refsb);
553 if (ev1 > ev2)
554 ret = 1;
555 else
556 ret = 0;
557 }
558 rdev->size = calc_dev_size(rdev, sb->chunk_size);
559
560 abort:
561 return ret;
562}
563
564/*
565 * validate_super for 0.90.0
566 */
567static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
568{
569 mdp_disk_t *desc;
570 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
571
572 if (mddev->raid_disks == 0) {
573 mddev->major_version = 0;
574 mddev->minor_version = sb->minor_version;
575 mddev->patch_version = sb->patch_version;
576 mddev->persistent = ! sb->not_persistent;
577 mddev->chunk_size = sb->chunk_size;
578 mddev->ctime = sb->ctime;
579 mddev->utime = sb->utime;
580 mddev->level = sb->level;
581 mddev->layout = sb->layout;
582 mddev->raid_disks = sb->raid_disks;
583 mddev->size = sb->size;
584 mddev->events = md_event(sb);
585
586 if (sb->state & (1<<MD_SB_CLEAN))
587 mddev->recovery_cp = MaxSector;
588 else {
589 if (sb->events_hi == sb->cp_events_hi &&
590 sb->events_lo == sb->cp_events_lo) {
591 mddev->recovery_cp = sb->recovery_cp;
592 } else
593 mddev->recovery_cp = 0;
594 }
595
596 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
597 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
598 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
599 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
600
601 mddev->max_disks = MD_SB_DISKS;
602 } else {
603 __u64 ev1;
604 ev1 = md_event(sb);
605 ++ev1;
606 if (ev1 < mddev->events)
607 return -EINVAL;
608 }
609 if (mddev->level != LEVEL_MULTIPATH) {
610 rdev->raid_disk = -1;
611 rdev->in_sync = rdev->faulty = 0;
612 desc = sb->disks + rdev->desc_nr;
613
614 if (desc->state & (1<<MD_DISK_FAULTY))
615 rdev->faulty = 1;
616 else if (desc->state & (1<<MD_DISK_SYNC) &&
617 desc->raid_disk < mddev->raid_disks) {
618 rdev->in_sync = 1;
619 rdev->raid_disk = desc->raid_disk;
620 }
621 }
622 return 0;
623}
624
625/*
626 * sync_super for 0.90.0
627 */
628static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
629{
630 mdp_super_t *sb;
631 struct list_head *tmp;
632 mdk_rdev_t *rdev2;
633 int next_spare = mddev->raid_disks;
634
635 /* make rdev->sb match mddev data..
636 *
637 * 1/ zero out disks
638 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
639 * 3/ any empty disks < next_spare become removed
640 *
641 * disks[0] gets initialised to REMOVED because
642 * we cannot be sure from other fields if it has
643 * been initialised or not.
644 */
645 int i;
646 int active=0, working=0,failed=0,spare=0,nr_disks=0;
647
648 sb = (mdp_super_t*)page_address(rdev->sb_page);
649
650 memset(sb, 0, sizeof(*sb));
651
652 sb->md_magic = MD_SB_MAGIC;
653 sb->major_version = mddev->major_version;
654 sb->minor_version = mddev->minor_version;
655 sb->patch_version = mddev->patch_version;
656 sb->gvalid_words = 0; /* ignored */
657 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
658 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
659 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
660 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
661
662 sb->ctime = mddev->ctime;
663 sb->level = mddev->level;
664 sb->size = mddev->size;
665 sb->raid_disks = mddev->raid_disks;
666 sb->md_minor = mddev->md_minor;
667 sb->not_persistent = !mddev->persistent;
668 sb->utime = mddev->utime;
669 sb->state = 0;
670 sb->events_hi = (mddev->events>>32);
671 sb->events_lo = (u32)mddev->events;
672
673 if (mddev->in_sync)
674 {
675 sb->recovery_cp = mddev->recovery_cp;
676 sb->cp_events_hi = (mddev->events>>32);
677 sb->cp_events_lo = (u32)mddev->events;
678 if (mddev->recovery_cp == MaxSector)
679 sb->state = (1<< MD_SB_CLEAN);
680 } else
681 sb->recovery_cp = 0;
682
683 sb->layout = mddev->layout;
684 sb->chunk_size = mddev->chunk_size;
685
686 sb->disks[0].state = (1<<MD_DISK_REMOVED);
687 ITERATE_RDEV(mddev,rdev2,tmp) {
688 mdp_disk_t *d;
689 if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
690 rdev2->desc_nr = rdev2->raid_disk;
691 else
692 rdev2->desc_nr = next_spare++;
693 d = &sb->disks[rdev2->desc_nr];
694 nr_disks++;
695 d->number = rdev2->desc_nr;
696 d->major = MAJOR(rdev2->bdev->bd_dev);
697 d->minor = MINOR(rdev2->bdev->bd_dev);
698 if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
699 d->raid_disk = rdev2->raid_disk;
700 else
701 d->raid_disk = rdev2->desc_nr; /* compatibility */
702 if (rdev2->faulty) {
703 d->state = (1<<MD_DISK_FAULTY);
704 failed++;
705 } else if (rdev2->in_sync) {
706 d->state = (1<<MD_DISK_ACTIVE);
707 d->state |= (1<<MD_DISK_SYNC);
708 active++;
709 working++;
710 } else {
711 d->state = 0;
712 spare++;
713 working++;
714 }
715 }
716
717 /* now set the "removed" and "faulty" bits on any missing devices */
718 for (i=0 ; i < mddev->raid_disks ; i++) {
719 mdp_disk_t *d = &sb->disks[i];
720 if (d->state == 0 && d->number == 0) {
721 d->number = i;
722 d->raid_disk = i;
723 d->state = (1<<MD_DISK_REMOVED);
724 d->state |= (1<<MD_DISK_FAULTY);
725 failed++;
726 }
727 }
728 sb->nr_disks = nr_disks;
729 sb->active_disks = active;
730 sb->working_disks = working;
731 sb->failed_disks = failed;
732 sb->spare_disks = spare;
733
734 sb->this_disk = sb->disks[rdev->desc_nr];
735 sb->sb_csum = calc_sb_csum(sb);
736}
737
738/*
739 * version 1 superblock
740 */
741
742static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
743{
744 unsigned int disk_csum, csum;
745 unsigned long long newcsum;
746 int size = 256 + le32_to_cpu(sb->max_dev)*2;
747 unsigned int *isuper = (unsigned int*)sb;
748 int i;
749
750 disk_csum = sb->sb_csum;
751 sb->sb_csum = 0;
752 newcsum = 0;
753 for (i=0; size>=4; size -= 4 )
754 newcsum += le32_to_cpu(*isuper++);
755
756 if (size == 2)
757 newcsum += le16_to_cpu(*(unsigned short*) isuper);
758
759 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
760 sb->sb_csum = disk_csum;
761 return cpu_to_le32(csum);
762}
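
The last step folds the 64-bit running sum into 32 bits by adding the high word back into the low word, so carries out of bit 31 still influence the result. A tiny standalone check of that fold (the input value is arbitrary):

#include <stdio.h>

int main(void)
{
	unsigned long long newcsum = 0x1234567899aabbccULL;	/* pretend running sum */
	unsigned int csum = (newcsum & 0xffffffff) + (newcsum >> 32);

	/* 0x99aabbcc + 0x12345678 = 0xabdf1244 */
	printf("folded csum = 0x%08x\n", csum);
	return 0;
}
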
763
764static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
765{
766 struct mdp_superblock_1 *sb;
767 int ret;
768 sector_t sb_offset;
769 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
770
771 /*
772 * Calculate the position of the superblock.
773 * It is always aligned to a 4K boundary and
774 * depending on minor_version, it can be:
775 * 0: At least 8K, but less than 12K, from end of device
776 * 1: At start of device
777 * 2: 4K from start of device.
778 */
779 switch(minor_version) {
780 case 0:
781 sb_offset = rdev->bdev->bd_inode->i_size >> 9;
782 sb_offset -= 8*2;
783 sb_offset &= ~(4*2-1);
784 /* convert from sectors to K */
785 sb_offset /= 2;
786 break;
787 case 1:
788 sb_offset = 0;
789 break;
790 case 2:
791 sb_offset = 4;
792 break;
793 default:
794 return -EINVAL;
795 }
796 rdev->sb_offset = sb_offset;
797
798 ret = read_disk_sb(rdev);
799 if (ret) return ret;
800
801
802 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
803
804 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
805 sb->major_version != cpu_to_le32(1) ||
806 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
807 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
808 sb->feature_map != 0)
809 return -EINVAL;
810
811 if (calc_sb_1_csum(sb) != sb->sb_csum) {
812 printk("md: invalid superblock checksum on %s\n",
813 bdevname(rdev->bdev,b));
814 return -EINVAL;
815 }
816 if (le64_to_cpu(sb->data_size) < 10) {
817 printk("md: data_size too small on %s\n",
818 bdevname(rdev->bdev,b));
819 return -EINVAL;
820 }
821 rdev->preferred_minor = 0xffff;
822 rdev->data_offset = le64_to_cpu(sb->data_offset);
823
824 if (refdev == 0)
825 return 1;
826 else {
827 __u64 ev1, ev2;
828 struct mdp_superblock_1 *refsb =
829 (struct mdp_superblock_1*)page_address(refdev->sb_page);
830
831 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
832 sb->level != refsb->level ||
833 sb->layout != refsb->layout ||
834 sb->chunksize != refsb->chunksize) {
835 printk(KERN_WARNING "md: %s has strangely different"
836 " superblock to %s\n",
837 bdevname(rdev->bdev,b),
838 bdevname(refdev->bdev,b2));
839 return -EINVAL;
840 }
841 ev1 = le64_to_cpu(sb->events);
842 ev2 = le64_to_cpu(refsb->events);
843
844 if (ev1 > ev2)
845 return 1;
846 }
847 if (minor_version)
848 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
849 else
850 rdev->size = rdev->sb_offset;
851 if (rdev->size < le64_to_cpu(sb->data_size)/2)
852 return -EINVAL;
853 rdev->size = le64_to_cpu(sb->data_size)/2;
854 if (le32_to_cpu(sb->chunksize))
855 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
856 return 0;
857}
858
859static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
860{
861 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
862
863 if (mddev->raid_disks == 0) {
864 mddev->major_version = 1;
865 mddev->patch_version = 0;
866 mddev->persistent = 1;
867 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
868 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
869 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
870 mddev->level = le32_to_cpu(sb->level);
871 mddev->layout = le32_to_cpu(sb->layout);
872 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
873 mddev->size = le64_to_cpu(sb->size)/2;
874 mddev->events = le64_to_cpu(sb->events);
875
876 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
877 memcpy(mddev->uuid, sb->set_uuid, 16);
878
879 mddev->max_disks = (4096-256)/2;
880 } else {
881 __u64 ev1;
882 ev1 = le64_to_cpu(sb->events);
883 ++ev1;
884 if (ev1 < mddev->events)
885 return -EINVAL;
886 }
887
888 if (mddev->level != LEVEL_MULTIPATH) {
889 int role;
890 rdev->desc_nr = le32_to_cpu(sb->dev_number);
891 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
892 switch(role) {
893 case 0xffff: /* spare */
894 rdev->in_sync = 0;
895 rdev->faulty = 0;
896 rdev->raid_disk = -1;
897 break;
898 case 0xfffe: /* faulty */
899 rdev->in_sync = 0;
900 rdev->faulty = 1;
901 rdev->raid_disk = -1;
902 break;
903 default:
904 rdev->in_sync = 1;
905 rdev->faulty = 0;
906 rdev->raid_disk = role;
907 break;
908 }
909 }
910 return 0;
911}
912
913static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
914{
915 struct mdp_superblock_1 *sb;
916 struct list_head *tmp;
917 mdk_rdev_t *rdev2;
918 int max_dev, i;
919 /* make rdev->sb match mddev and rdev data. */
920
921 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
922
923 sb->feature_map = 0;
924 sb->pad0 = 0;
925 memset(sb->pad1, 0, sizeof(sb->pad1));
926 memset(sb->pad2, 0, sizeof(sb->pad2));
927 memset(sb->pad3, 0, sizeof(sb->pad3));
928
929 sb->utime = cpu_to_le64((__u64)mddev->utime);
930 sb->events = cpu_to_le64(mddev->events);
931 if (mddev->in_sync)
932 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
933 else
934 sb->resync_offset = cpu_to_le64(0);
935
936 max_dev = 0;
937 ITERATE_RDEV(mddev,rdev2,tmp)
938 if (rdev2->desc_nr+1 > max_dev)
939 max_dev = rdev2->desc_nr+1;
940
941 sb->max_dev = cpu_to_le32(max_dev);
942 for (i=0; i<max_dev;i++)
943 sb->dev_roles[i] = cpu_to_le16(0xfffe);
944
945 ITERATE_RDEV(mddev,rdev2,tmp) {
946 i = rdev2->desc_nr;
947 if (rdev2->faulty)
948 sb->dev_roles[i] = cpu_to_le16(0xfffe);
949 else if (rdev2->in_sync)
950 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
951 else
952 sb->dev_roles[i] = cpu_to_le16(0xffff);
953 }
954
955 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
956 sb->sb_csum = calc_sb_1_csum(sb);
957}
958
959
960struct super_type super_types[] = {
961 [0] = {
962 .name = "0.90.0",
963 .owner = THIS_MODULE,
964 .load_super = super_90_load,
965 .validate_super = super_90_validate,
966 .sync_super = super_90_sync,
967 },
968 [1] = {
969 .name = "md-1",
970 .owner = THIS_MODULE,
971 .load_super = super_1_load,
972 .validate_super = super_1_validate,
973 .sync_super = super_1_sync,
974 },
975};
976
977static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
978{
979 struct list_head *tmp;
980 mdk_rdev_t *rdev;
981
982 ITERATE_RDEV(mddev,rdev,tmp)
983 if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
984 return rdev;
985
986 return NULL;
987}
988
989static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
990{
991 struct list_head *tmp;
992 mdk_rdev_t *rdev;
993
994 ITERATE_RDEV(mddev1,rdev,tmp)
995 if (match_dev_unit(mddev2, rdev))
996 return 1;
997
998 return 0;
999}
1000
1001static LIST_HEAD(pending_raid_disks);
1002
1003static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1004{
1005 mdk_rdev_t *same_pdev;
1006 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1007
1008 if (rdev->mddev) {
1009 MD_BUG();
1010 return -EINVAL;
1011 }
1012 same_pdev = match_dev_unit(mddev, rdev);
1013 if (same_pdev)
1014 printk(KERN_WARNING
1015 "%s: WARNING: %s appears to be on the same physical"
1016 " disk as %s.\nTrue protection against single-disk"
1017 " failure might be compromised.\n",
1018 mdname(mddev), bdevname(rdev->bdev,b),
1019 bdevname(same_pdev->bdev,b2));
1020
1021 /* Verify rdev->desc_nr is unique.
1022 * If it is -1, assign a free number, else
1023 * check number is not in use
1024 */
1025 if (rdev->desc_nr < 0) {
1026 int choice = 0;
1027 if (mddev->pers) choice = mddev->raid_disks;
1028 while (find_rdev_nr(mddev, choice))
1029 choice++;
1030 rdev->desc_nr = choice;
1031 } else {
1032 if (find_rdev_nr(mddev, rdev->desc_nr))
1033 return -EBUSY;
1034 }
1035
1036 list_add(&rdev->same_set, &mddev->disks);
1037 rdev->mddev = mddev;
1038 printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b));
1039 return 0;
1040}
1041
1042static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1043{
1044 char b[BDEVNAME_SIZE];
1045 if (!rdev->mddev) {
1046 MD_BUG();
1047 return;
1048 }
1049 list_del_init(&rdev->same_set);
1050 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1051 rdev->mddev = NULL;
1052}
1053
1054/*
1055 * prevent the device from being mounted, repartitioned or
1056 * otherwise reused by a RAID array (or any other kernel
1057 * subsystem), by bd_claiming the device.
1058 */
1059static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
1060{
1061 int err = 0;
1062 struct block_device *bdev;
1063 char b[BDEVNAME_SIZE];
1064
1065 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1066 if (IS_ERR(bdev)) {
1067 printk(KERN_ERR "md: could not open %s.\n",
1068 __bdevname(dev, b));
1069 return PTR_ERR(bdev);
1070 }
1071 err = bd_claim(bdev, rdev);
1072 if (err) {
1073 printk(KERN_ERR "md: could not bd_claim %s.\n",
1074 bdevname(bdev, b));
1075 blkdev_put(bdev);
1076 return err;
1077 }
1078 rdev->bdev = bdev;
1079 return err;
1080}
1081
1082static void unlock_rdev(mdk_rdev_t *rdev)
1083{
1084 struct block_device *bdev = rdev->bdev;
1085 rdev->bdev = NULL;
1086 if (!bdev)
1087 MD_BUG();
1088 bd_release(bdev);
1089 blkdev_put(bdev);
1090}
1091
1092void md_autodetect_dev(dev_t dev);
1093
1094static void export_rdev(mdk_rdev_t * rdev)
1095{
1096 char b[BDEVNAME_SIZE];
1097 printk(KERN_INFO "md: export_rdev(%s)\n",
1098 bdevname(rdev->bdev,b));
1099 if (rdev->mddev)
1100 MD_BUG();
1101 free_disk_sb(rdev);
1102 list_del_init(&rdev->same_set);
1103#ifndef MODULE
1104 md_autodetect_dev(rdev->bdev->bd_dev);
1105#endif
1106 unlock_rdev(rdev);
1107 kfree(rdev);
1108}
1109
1110static void kick_rdev_from_array(mdk_rdev_t * rdev)
1111{
1112 unbind_rdev_from_array(rdev);
1113 export_rdev(rdev);
1114}
1115
1116static void export_array(mddev_t *mddev)
1117{
1118 struct list_head *tmp;
1119 mdk_rdev_t *rdev;
1120
1121 ITERATE_RDEV(mddev,rdev,tmp) {
1122 if (!rdev->mddev) {
1123 MD_BUG();
1124 continue;
1125 }
1126 kick_rdev_from_array(rdev);
1127 }
1128 if (!list_empty(&mddev->disks))
1129 MD_BUG();
1130 mddev->raid_disks = 0;
1131 mddev->major_version = 0;
1132}
1133
1134static void print_desc(mdp_disk_t *desc)
1135{
1136 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1137 desc->major,desc->minor,desc->raid_disk,desc->state);
1138}
1139
1140static void print_sb(mdp_super_t *sb)
1141{
1142 int i;
1143
1144 printk(KERN_INFO
1145 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1146 sb->major_version, sb->minor_version, sb->patch_version,
1147 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1148 sb->ctime);
1149 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1150 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1151 sb->md_minor, sb->layout, sb->chunk_size);
1152 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
1153 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1154 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1155 sb->failed_disks, sb->spare_disks,
1156 sb->sb_csum, (unsigned long)sb->events_lo);
1157
1158 printk(KERN_INFO);
1159 for (i = 0; i < MD_SB_DISKS; i++) {
1160 mdp_disk_t *desc;
1161
1162 desc = sb->disks + i;
1163 if (desc->number || desc->major || desc->minor ||
1164 desc->raid_disk || (desc->state && (desc->state != 4))) {
1165 printk(" D %2d: ", i);
1166 print_desc(desc);
1167 }
1168 }
1169 printk(KERN_INFO "md: THIS: ");
1170 print_desc(&sb->this_disk);
1171
1172}
1173
1174static void print_rdev(mdk_rdev_t *rdev)
1175{
1176 char b[BDEVNAME_SIZE];
1177 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
1178 bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
1179 rdev->faulty, rdev->in_sync, rdev->desc_nr);
1180 if (rdev->sb_loaded) {
1181 printk(KERN_INFO "md: rdev superblock:\n");
1182 print_sb((mdp_super_t*)page_address(rdev->sb_page));
1183 } else
1184 printk(KERN_INFO "md: no rdev superblock!\n");
1185}
1186
1187void md_print_devices(void)
1188{
1189 struct list_head *tmp, *tmp2;
1190 mdk_rdev_t *rdev;
1191 mddev_t *mddev;
1192 char b[BDEVNAME_SIZE];
1193
1194 printk("\n");
1195 printk("md: **********************************\n");
1196 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
1197 printk("md: **********************************\n");
1198 ITERATE_MDDEV(mddev,tmp) {
1199 printk("%s: ", mdname(mddev));
1200
1201 ITERATE_RDEV(mddev,rdev,tmp2)
1202 printk("<%s>", bdevname(rdev->bdev,b));
1203 printk("\n");
1204
1205 ITERATE_RDEV(mddev,rdev,tmp2)
1206 print_rdev(rdev);
1207 }
1208 printk("md: **********************************\n");
1209 printk("\n");
1210}
1211
1212
1213static int write_disk_sb(mdk_rdev_t * rdev)
1214{
1215 char b[BDEVNAME_SIZE];
1216 if (!rdev->sb_loaded) {
1217 MD_BUG();
1218 return 1;
1219 }
1220 if (rdev->faulty) {
1221 MD_BUG();
1222 return 1;
1223 }
1224
1225 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1226 bdevname(rdev->bdev,b),
1227 (unsigned long long)rdev->sb_offset);
1228
1229 if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
1230 return 0;
1231
1232 printk("md: write_disk_sb failed for device %s\n",
1233 bdevname(rdev->bdev,b));
1234 return 1;
1235}
1236
1237static void sync_sbs(mddev_t * mddev)
1238{
1239 mdk_rdev_t *rdev;
1240 struct list_head *tmp;
1241
1242 ITERATE_RDEV(mddev,rdev,tmp) {
1243 super_types[mddev->major_version].
1244 sync_super(mddev, rdev);
1245 rdev->sb_loaded = 1;
1246 }
1247}
1248
1249static void md_update_sb(mddev_t * mddev)
1250{
1251 int err, count = 100;
1252 struct list_head *tmp;
1253 mdk_rdev_t *rdev;
1254
1255 mddev->sb_dirty = 0;
1256repeat:
1257 mddev->utime = get_seconds();
1258 mddev->events ++;
1259
1260 if (!mddev->events) {
1261 /*
1262 * oops, this 64-bit counter should never wrap.
1263 * Either we are in around ~1 trillion A.D., assuming
1264 * 1 reboot per second, or we have a bug:
1265 */
1266 MD_BUG();
1267 mddev->events --;
1268 }
1269 sync_sbs(mddev);
1270
1271 /*
1272 * do not write anything to disk if using
1273 * nonpersistent superblocks
1274 */
1275 if (!mddev->persistent)
1276 return;
1277
1278 dprintk(KERN_INFO
1279 "md: updating %s RAID superblock on device (in sync %d)\n",
1280 mdname(mddev),mddev->in_sync);
1281
1282 err = 0;
1283 ITERATE_RDEV(mddev,rdev,tmp) {
1284 char b[BDEVNAME_SIZE];
1285 dprintk(KERN_INFO "md: ");
1286 if (rdev->faulty)
1287 dprintk("(skipping faulty ");
1288
1289 dprintk("%s ", bdevname(rdev->bdev,b));
1290 if (!rdev->faulty) {
1291 err += write_disk_sb(rdev);
1292 } else
1293 dprintk(")\n");
1294 if (!err && mddev->level == LEVEL_MULTIPATH)
1295 /* only need to write one superblock... */
1296 break;
1297 }
1298 if (err) {
1299 if (--count) {
1300 printk(KERN_ERR "md: errors occurred during superblock"
1301 " update, repeating\n");
1302 goto repeat;
1303 }
1304 printk(KERN_ERR \
1305 "md: excessive errors occurred during superblock update, exiting\n");
1306 }
1307}
1308
1309/*
1310 * Import a device. If 'super_format' >= 0, then sanity check the superblock
1311 *
1312 * mark the device faulty if:
1313 *
1314 * - the device is nonexistent (zero size)
1315 * - the device has no valid superblock
1316 *
1317 * a faulty rdev _never_ has rdev->sb set.
1318 */
1319static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
1320{
1321 char b[BDEVNAME_SIZE];
1322 int err;
1323 mdk_rdev_t *rdev;
1324 sector_t size;
1325
1326 rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
1327 if (!rdev) {
1328 printk(KERN_ERR "md: could not alloc mem for new device!\n");
1329 return ERR_PTR(-ENOMEM);
1330 }
1331 memset(rdev, 0, sizeof(*rdev));
1332
1333 if ((err = alloc_disk_sb(rdev)))
1334 goto abort_free;
1335
1336 err = lock_rdev(rdev, newdev);
1337 if (err)
1338 goto abort_free;
1339
1340 rdev->desc_nr = -1;
1341 rdev->faulty = 0;
1342 rdev->in_sync = 0;
1343 rdev->data_offset = 0;
1344 atomic_set(&rdev->nr_pending, 0);
1345
1346 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
1347 if (!size) {
1348 printk(KERN_WARNING
1349 "md: %s has zero or unknown size, marking faulty!\n",
1350 bdevname(rdev->bdev,b));
1351 err = -EINVAL;
1352 goto abort_free;
1353 }
1354
1355 if (super_format >= 0) {
1356 err = super_types[super_format].
1357 load_super(rdev, NULL, super_minor);
1358 if (err == -EINVAL) {
1359 printk(KERN_WARNING
1360 "md: %s has invalid sb, not importing!\n",
1361 bdevname(rdev->bdev,b));
1362 goto abort_free;
1363 }
1364 if (err < 0) {
1365 printk(KERN_WARNING
1366 "md: could not read %s's sb, not importing!\n",
1367 bdevname(rdev->bdev,b));
1368 goto abort_free;
1369 }
1370 }
1371 INIT_LIST_HEAD(&rdev->same_set);
1372
1373 return rdev;
1374
1375abort_free:
1376 if (rdev->sb_page) {
1377 if (rdev->bdev)
1378 unlock_rdev(rdev);
1379 free_disk_sb(rdev);
1380 }
1381 kfree(rdev);
1382 return ERR_PTR(err);
1383}
1384
1385/*
1386 * Check a full RAID array for plausibility
1387 */
1388
1389
1390static int analyze_sbs(mddev_t * mddev)
1391{
1392 int i;
1393 struct list_head *tmp;
1394 mdk_rdev_t *rdev, *freshest;
1395 char b[BDEVNAME_SIZE];
1396
1397 freshest = NULL;
1398 ITERATE_RDEV(mddev,rdev,tmp)
1399 switch (super_types[mddev->major_version].
1400 load_super(rdev, freshest, mddev->minor_version)) {
1401 case 1:
1402 freshest = rdev;
1403 break;
1404 case 0:
1405 break;
1406 default:
1407 printk( KERN_ERR \
1408 "md: fatal superblock inconsistency in %s"
1409 " -- removing from array\n",
1410 bdevname(rdev->bdev,b));
1411 kick_rdev_from_array(rdev);
1412 }
1413
1414
1415 super_types[mddev->major_version].
1416 validate_super(mddev, freshest);
1417
1418 i = 0;
1419 ITERATE_RDEV(mddev,rdev,tmp) {
1420 if (rdev != freshest)
1421 if (super_types[mddev->major_version].
1422 validate_super(mddev, rdev)) {
1423 printk(KERN_WARNING "md: kicking non-fresh %s"
1424 " from array!\n",
1425 bdevname(rdev->bdev,b));
1426 kick_rdev_from_array(rdev);
1427 continue;
1428 }
1429 if (mddev->level == LEVEL_MULTIPATH) {
1430 rdev->desc_nr = i++;
1431 rdev->raid_disk = rdev->desc_nr;
1432 rdev->in_sync = 1;
1433 }
1434 }
1435
1436
1437
1438 if (mddev->recovery_cp != MaxSector &&
1439 mddev->level >= 1)
1440 printk(KERN_ERR "md: %s: raid array is not clean"
1441 " -- starting background reconstruction\n",
1442 mdname(mddev));
1443
1444 return 0;
1445}
1446
1447int mdp_major = 0;
1448
1449static struct kobject *md_probe(dev_t dev, int *part, void *data)
1450{
1451 static DECLARE_MUTEX(disks_sem);
1452 mddev_t *mddev = mddev_find(dev);
1453 struct gendisk *disk;
1454 int partitioned = (MAJOR(dev) != MD_MAJOR);
1455 int shift = partitioned ? MdpMinorShift : 0;
1456 int unit = MINOR(dev) >> shift;
1457
1458 if (!mddev)
1459 return NULL;
1460
1461 down(&disks_sem);
1462 if (mddev->gendisk) {
1463 up(&disks_sem);
1464 mddev_put(mddev);
1465 return NULL;
1466 }
1467 disk = alloc_disk(1 << shift);
1468 if (!disk) {
1469 up(&disks_sem);
1470 mddev_put(mddev);
1471 return NULL;
1472 }
1473 disk->major = MAJOR(dev);
1474 disk->first_minor = unit << shift;
1475 if (partitioned) {
1476 sprintf(disk->disk_name, "md_d%d", unit);
1477 sprintf(disk->devfs_name, "md/d%d", unit);
1478 } else {
1479 sprintf(disk->disk_name, "md%d", unit);
1480 sprintf(disk->devfs_name, "md/%d", unit);
1481 }
1482 disk->fops = &md_fops;
1483 disk->private_data = mddev;
1484 disk->queue = mddev->queue;
1485 add_disk(disk);
1486 mddev->gendisk = disk;
1487 up(&disks_sem);
1488 return NULL;
1489}
1490
1491void md_wakeup_thread(mdk_thread_t *thread);
1492
1493static void md_safemode_timeout(unsigned long data)
1494{
1495 mddev_t *mddev = (mddev_t *) data;
1496
1497 mddev->safemode = 1;
1498 md_wakeup_thread(mddev->thread);
1499}
1500
1501
1502static int do_md_run(mddev_t * mddev)
1503{
1504 int pnum, err;
1505 int chunk_size;
1506 struct list_head *tmp;
1507 mdk_rdev_t *rdev;
1508 struct gendisk *disk;
1509 char b[BDEVNAME_SIZE];
1510
1511 if (list_empty(&mddev->disks)) {
1512 MD_BUG();
1513 return -EINVAL;
1514 }
1515
1516 if (mddev->pers)
1517 return -EBUSY;
1518
1519 /*
1520 * Analyze all RAID superblock(s)
1521 */
1522 if (!mddev->raid_disks && analyze_sbs(mddev)) {
1523 MD_BUG();
1524 return -EINVAL;
1525 }
1526
1527 chunk_size = mddev->chunk_size;
1528 pnum = level_to_pers(mddev->level);
1529
1530 if ((pnum != MULTIPATH) && (pnum != RAID1)) {
1531 if (!chunk_size) {
1532 /*
1533 * 'default chunksize' in the old md code used to
1534 * be PAGE_SIZE, baaad.
1535 * we abort here to be on the safe side. We don't
1536 * want to continue the bad practice.
1537 */
1538 printk(KERN_ERR
1539 "no chunksize specified, see 'man raidtab'\n");
1540 return -EINVAL;
1541 }
1542 if (chunk_size > MAX_CHUNK_SIZE) {
1543 printk(KERN_ERR "too big chunk_size: %d > %d\n",
1544 chunk_size, MAX_CHUNK_SIZE);
1545 return -EINVAL;
1546 }
1547 /*
1548 * chunk-size has to be a power of 2 and a multiple of PAGE_SIZE
1549 */
1550 if ( (1 << ffz(~chunk_size)) != chunk_size) {
1551 MD_BUG();
1552 return -EINVAL;
1553 }
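		/*
		 * ffz(~chunk_size) is the index of the lowest set bit of
		 * chunk_size, so the shift recovers just that bit.
		 * e.g. 65536 (64K) passes because 1 << 16 == 65536, while
		 * 69632 (0x11000) fails because 1 << 12 == 4096 != 69632.
		 */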
1554 if (chunk_size < PAGE_SIZE) {
1555 printk(KERN_ERR "too small chunk_size: %d < %ld\n",
1556 chunk_size, PAGE_SIZE);
1557 return -EINVAL;
1558 }
1559
1560 /* devices must have a minimum size of one chunk */
1561 ITERATE_RDEV(mddev,rdev,tmp) {
1562 if (rdev->faulty)
1563 continue;
1564 if (rdev->size < chunk_size / 1024) {
1565 printk(KERN_WARNING
1566 "md: Dev %s smaller than chunk_size:"
1567 " %lluk < %dk\n",
1568 bdevname(rdev->bdev,b),
1569 (unsigned long long)rdev->size,
1570 chunk_size / 1024);
1571 return -EINVAL;
1572 }
1573 }
1574 }
1575
1576 if (pnum >= MAX_PERSONALITY) {
1577 MD_BUG();
1578 return -EINVAL;
1579 }
1580
1581#ifdef CONFIG_KMOD
1582 if (!pers[pnum])
1583 {
1584 request_module("md-personality-%d", pnum);
1585 }
1586#endif
1587
1588 /*
1589 * Drop all container device buffers, from now on
1590 * the only valid external interface is through the md
1591 * device.
1592 * Also find largest hardsector size
1593 */
1594 ITERATE_RDEV(mddev,rdev,tmp) {
1595 if (rdev->faulty)
1596 continue;
1597 sync_blockdev(rdev->bdev);
1598 invalidate_bdev(rdev->bdev, 0);
1599 }
1600
1601 md_probe(mddev->unit, NULL, NULL);
1602 disk = mddev->gendisk;
1603 if (!disk)
1604 return -ENOMEM;
1605
1606 spin_lock(&pers_lock);
1607 if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
1608 spin_unlock(&pers_lock);
1609 printk(KERN_WARNING "md: personality %d is not loaded!\n",
1610 pnum);
1611 return -EINVAL;
1612 }
1613
1614 mddev->pers = pers[pnum];
1615 spin_unlock(&pers_lock);
1616
1617 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
1618
1619 err = mddev->pers->run(mddev);
1620 if (err) {
1621 printk(KERN_ERR "md: pers->run() failed ...\n");
1622 module_put(mddev->pers->owner);
1623 mddev->pers = NULL;
1624 return -EINVAL;
1625 }
1626 atomic_set(&mddev->writes_pending,0);
1627 mddev->safemode = 0;
1628 mddev->safemode_timer.function = md_safemode_timeout;
1629 mddev->safemode_timer.data = (unsigned long) mddev;
1630 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
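	/*
	 * e.g. HZ == 1000 gives 21 jiffies (~21ms); with HZ == 100 the
	 * division truncates to 2, so the +1 still leaves a non-zero
	 * 3-jiffy (30ms) delay.
	 */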
1631 mddev->in_sync = 1;
1632
1633 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1634
1635 if (mddev->sb_dirty)
1636 md_update_sb(mddev);
1637
1638 set_capacity(disk, mddev->array_size<<1);
1639
1640 /* If we call blk_queue_make_request here, it will
1641 * re-initialise max_sectors etc which may have been
1642 * refined inside -> run. So just set the bits we need to set.
1643 * Most initialisation happened when we called
1644 * blk_queue_make_request(..., md_fail_request)
1645 * earlier.
1646 */
1647 mddev->queue->queuedata = mddev;
1648 mddev->queue->make_request_fn = mddev->pers->make_request;
1649
1650 mddev->changed = 1;
1651 return 0;
1652}
1653
1654static int restart_array(mddev_t *mddev)
1655{
1656 struct gendisk *disk = mddev->gendisk;
1657 int err;
1658
1659 /*
1660 * Complain if it has no devices
1661 */
1662 err = -ENXIO;
1663 if (list_empty(&mddev->disks))
1664 goto out;
1665
1666 if (mddev->pers) {
1667 err = -EBUSY;
1668 if (!mddev->ro)
1669 goto out;
1670
1671 mddev->safemode = 0;
1672 mddev->ro = 0;
1673 set_disk_ro(disk, 0);
1674
1675 printk(KERN_INFO "md: %s switched to read-write mode.\n",
1676 mdname(mddev));
1677 /*
1678 * Kick recovery or resync if necessary
1679 */
1680 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1681 md_wakeup_thread(mddev->thread);
1682 err = 0;
1683 } else {
1684 printk(KERN_ERR "md: %s has no personality assigned.\n",
1685 mdname(mddev));
1686 err = -EINVAL;
1687 }
1688
1689out:
1690 return err;
1691}
1692
1693static int do_md_stop(mddev_t * mddev, int ro)
1694{
1695 int err = 0;
1696 struct gendisk *disk = mddev->gendisk;
1697
1698 if (mddev->pers) {
1699 if (atomic_read(&mddev->active)>2) {
1700 printk("md: %s still in use.\n",mdname(mddev));
1701 return -EBUSY;
1702 }
1703
1704 if (mddev->sync_thread) {
1705 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1706 md_unregister_thread(mddev->sync_thread);
1707 mddev->sync_thread = NULL;
1708 }
1709
1710 del_timer_sync(&mddev->safemode_timer);
1711
1712 invalidate_partition(disk, 0);
1713
1714 if (ro) {
1715 err = -ENXIO;
1716 if (mddev->ro)
1717 goto out;
1718 mddev->ro = 1;
1719 } else {
1720 if (mddev->ro)
1721 set_disk_ro(disk, 0);
1722 blk_queue_make_request(mddev->queue, md_fail_request);
1723 mddev->pers->stop(mddev);
1724 module_put(mddev->pers->owner);
1725 mddev->pers = NULL;
1726 if (mddev->ro)
1727 mddev->ro = 0;
1728 }
1729 if (!mddev->in_sync) {
1730 /* mark array as shutdown cleanly */
1731 mddev->in_sync = 1;
1732 md_update_sb(mddev);
1733 }
1734 if (ro)
1735 set_disk_ro(disk, 1);
1736 }
1737 /*
1738 * Free resources if final stop
1739 */
1740 if (!ro) {
1741 struct gendisk *disk;
1742 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
1743
1744 export_array(mddev);
1745
1746 mddev->array_size = 0;
1747 disk = mddev->gendisk;
1748 if (disk)
1749 set_capacity(disk, 0);
1750 mddev->changed = 1;
1751 } else
1752 printk(KERN_INFO "md: %s switched to read-only mode.\n",
1753 mdname(mddev));
1754 err = 0;
1755out:
1756 return err;
1757}
1758
1759static void autorun_array(mddev_t *mddev)
1760{
1761 mdk_rdev_t *rdev;
1762 struct list_head *tmp;
1763 int err;
1764
1765 if (list_empty(&mddev->disks)) {
1766 MD_BUG();
1767 return;
1768 }
1769
1770 printk(KERN_INFO "md: running: ");
1771
1772 ITERATE_RDEV(mddev,rdev,tmp) {
1773 char b[BDEVNAME_SIZE];
1774 printk("<%s>", bdevname(rdev->bdev,b));
1775 }
1776 printk("\n");
1777
1778 err = do_md_run (mddev);
1779 if (err) {
1780 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
1781 do_md_stop (mddev, 0);
1782 }
1783}
1784
1785/*
1786 * let's try to run arrays based on all disks that have arrived
1787 * until now. (those are in pending_raid_disks)
1788 *
1789 * the method: pick the first pending disk, collect all disks with
1790 * the same UUID, remove all from the pending list and put them into
1791 * the 'same_array' list. Then order this list based on superblock
1792 * update time (freshest comes first), kick out 'old' disks and
1793 * compare superblocks. If everything's fine then run it.
1794 *
1795 * If "unit" is allocated, then bump its reference count
1796 */
1797static void autorun_devices(int part)
1798{
1799 struct list_head candidates;
1800 struct list_head *tmp;
1801 mdk_rdev_t *rdev0, *rdev;
1802 mddev_t *mddev;
1803 char b[BDEVNAME_SIZE];
1804
1805 printk(KERN_INFO "md: autorun ...\n");
1806 while (!list_empty(&pending_raid_disks)) {
1807 dev_t dev;
1808 rdev0 = list_entry(pending_raid_disks.next,
1809 mdk_rdev_t, same_set);
1810
1811 printk(KERN_INFO "md: considering %s ...\n",
1812 bdevname(rdev0->bdev,b));
1813 INIT_LIST_HEAD(&candidates);
1814 ITERATE_RDEV_PENDING(rdev,tmp)
1815 if (super_90_load(rdev, rdev0, 0) >= 0) {
1816 printk(KERN_INFO "md: adding %s ...\n",
1817 bdevname(rdev->bdev,b));
1818 list_move(&rdev->same_set, &candidates);
1819 }
1820 /*
1821 * now we have a set of devices, with all of them having
1822 * mostly sane superblocks. It's time to allocate the
1823 * mddev.
1824 */
1825 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
1826 printk(KERN_INFO "md: unit number in %s is bad: %d\n",
1827 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
1828 break;
1829 }
1830 if (part)
1831 dev = MKDEV(mdp_major,
1832 rdev0->preferred_minor << MdpMinorShift);
1833 else
1834 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
1835
1836 md_probe(dev, NULL, NULL);
1837 mddev = mddev_find(dev);
1838 if (!mddev) {
1839 printk(KERN_ERR
1840 "md: cannot allocate memory for md drive.\n");
1841 break;
1842 }
1843 if (mddev_lock(mddev))
1844 printk(KERN_WARNING "md: %s locked, cannot run\n",
1845 mdname(mddev));
1846 else if (mddev->raid_disks || mddev->major_version
1847 || !list_empty(&mddev->disks)) {
1848 printk(KERN_WARNING
1849 "md: %s already running, cannot run %s\n",
1850 mdname(mddev), bdevname(rdev0->bdev,b));
1851 mddev_unlock(mddev);
1852 } else {
1853 printk(KERN_INFO "md: created %s\n", mdname(mddev));
1854 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
1855 list_del_init(&rdev->same_set);
1856 if (bind_rdev_to_array(rdev, mddev))
1857 export_rdev(rdev);
1858 }
1859 autorun_array(mddev);
1860 mddev_unlock(mddev);
1861 }
1862 /* on success, candidates will be empty; on error
1863 * it won't be...
1864 */
1865 ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
1866 export_rdev(rdev);
1867 mddev_put(mddev);
1868 }
1869 printk(KERN_INFO "md: ... autorun DONE.\n");
1870}
1871
1872/*
1873 * import RAID devices based on one partition
1874 * if possible, the array gets run as well.
1875 */
1876
1877static int autostart_array(dev_t startdev)
1878{
1879 char b[BDEVNAME_SIZE];
1880 int err = -EINVAL, i;
1881 mdp_super_t *sb = NULL;
1882 mdk_rdev_t *start_rdev = NULL, *rdev;
1883
1884 start_rdev = md_import_device(startdev, 0, 0);
1885 if (IS_ERR(start_rdev))
1886 return err;
1887
1888
1889 /* NOTE: this can only work for 0.90.0 superblocks */
1890 sb = (mdp_super_t*)page_address(start_rdev->sb_page);
1891 if (sb->major_version != 0 ||
1892 sb->minor_version != 90 ) {
1893 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
1894 export_rdev(start_rdev);
1895 return err;
1896 }
1897
1898 if (start_rdev->faulty) {
1899 printk(KERN_WARNING
1900 "md: can not autostart based on faulty %s!\n",
1901 bdevname(start_rdev->bdev,b));
1902 export_rdev(start_rdev);
1903 return err;
1904 }
1905 list_add(&start_rdev->same_set, &pending_raid_disks);
1906
1907 for (i = 0; i < MD_SB_DISKS; i++) {
1908 mdp_disk_t *desc = sb->disks + i;
1909 dev_t dev = MKDEV(desc->major, desc->minor);
1910
1911 if (!dev)
1912 continue;
1913 if (dev == startdev)
1914 continue;
1915 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
1916 continue;
1917 rdev = md_import_device(dev, 0, 0);
1918 if (IS_ERR(rdev))
1919 continue;
1920
1921 list_add(&rdev->same_set, &pending_raid_disks);
1922 }
1923
1924 /*
1925 * possibly return error codes from autorun_devices() here
1926 */
1927 autorun_devices(0);
1928 return 0;
1929
1930}
1931
1932
1933static int get_version(void __user * arg)
1934{
1935 mdu_version_t ver;
1936
1937 ver.major = MD_MAJOR_VERSION;
1938 ver.minor = MD_MINOR_VERSION;
1939 ver.patchlevel = MD_PATCHLEVEL_VERSION;
1940
1941 if (copy_to_user(arg, &ver, sizeof(ver)))
1942 return -EFAULT;
1943
1944 return 0;
1945}
1946
1947static int get_array_info(mddev_t * mddev, void __user * arg)
1948{
1949 mdu_array_info_t info;
1950 int nr,working,active,failed,spare;
1951 mdk_rdev_t *rdev;
1952 struct list_head *tmp;
1953
1954 nr=working=active=failed=spare=0;
1955 ITERATE_RDEV(mddev,rdev,tmp) {
1956 nr++;
1957 if (rdev->faulty)
1958 failed++;
1959 else {
1960 working++;
1961 if (rdev->in_sync)
1962 active++;
1963 else
1964 spare++;
1965 }
1966 }
1967
1968 info.major_version = mddev->major_version;
1969 info.minor_version = mddev->minor_version;
1970 info.patch_version = MD_PATCHLEVEL_VERSION;
1971 info.ctime = mddev->ctime;
1972 info.level = mddev->level;
1973 info.size = mddev->size;
1974 info.nr_disks = nr;
1975 info.raid_disks = mddev->raid_disks;
1976 info.md_minor = mddev->md_minor;
1977 info.not_persistent= !mddev->persistent;
1978
1979 info.utime = mddev->utime;
1980 info.state = 0;
1981 if (mddev->in_sync)
1982 info.state = (1<<MD_SB_CLEAN);
1983 info.active_disks = active;
1984 info.working_disks = working;
1985 info.failed_disks = failed;
1986 info.spare_disks = spare;
1987
1988 info.layout = mddev->layout;
1989 info.chunk_size = mddev->chunk_size;
1990
1991 if (copy_to_user(arg, &info, sizeof(info)))
1992 return -EFAULT;
1993
1994 return 0;
1995}
1996
1997static int get_disk_info(mddev_t * mddev, void __user * arg)
1998{
1999 mdu_disk_info_t info;
2000 unsigned int nr;
2001 mdk_rdev_t *rdev;
2002
2003 if (copy_from_user(&info, arg, sizeof(info)))
2004 return -EFAULT;
2005
2006 nr = info.number;
2007
2008 rdev = find_rdev_nr(mddev, nr);
2009 if (rdev) {
2010 info.major = MAJOR(rdev->bdev->bd_dev);
2011 info.minor = MINOR(rdev->bdev->bd_dev);
2012 info.raid_disk = rdev->raid_disk;
2013 info.state = 0;
2014 if (rdev->faulty)
2015 info.state |= (1<<MD_DISK_FAULTY);
2016 else if (rdev->in_sync) {
2017 info.state |= (1<<MD_DISK_ACTIVE);
2018 info.state |= (1<<MD_DISK_SYNC);
2019 }
2020 } else {
2021 info.major = info.minor = 0;
2022 info.raid_disk = -1;
2023 info.state = (1<<MD_DISK_REMOVED);
2024 }
2025
2026 if (copy_to_user(arg, &info, sizeof(info)))
2027 return -EFAULT;
2028
2029 return 0;
2030}
2031
2032static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2033{
2034 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
2035 mdk_rdev_t *rdev;
2036 dev_t dev = MKDEV(info->major,info->minor);
2037
2038 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
2039 return -EOVERFLOW;
2040
2041 if (!mddev->raid_disks) {
2042 int err;
2043 /* expecting a device which has a superblock */
2044 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
2045 if (IS_ERR(rdev)) {
2046 printk(KERN_WARNING
2047 "md: md_import_device returned %ld\n",
2048 PTR_ERR(rdev));
2049 return PTR_ERR(rdev);
2050 }
2051 if (!list_empty(&mddev->disks)) {
2052 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2053 mdk_rdev_t, same_set);
2054 int err = super_types[mddev->major_version]
2055 .load_super(rdev, rdev0, mddev->minor_version);
2056 if (err < 0) {
2057 printk(KERN_WARNING
2058 "md: %s has different UUID to %s\n",
2059 bdevname(rdev->bdev,b),
2060 bdevname(rdev0->bdev,b2));
2061 export_rdev(rdev);
2062 return -EINVAL;
2063 }
2064 }
2065 err = bind_rdev_to_array(rdev, mddev);
2066 if (err)
2067 export_rdev(rdev);
2068 return err;
2069 }
2070
2071 /*
2072 * add_new_disk can be used once the array is assembled
2073 * to add "hot spares". They must already have a superblock
2074 * written
2075 */
2076 if (mddev->pers) {
2077 int err;
2078 if (!mddev->pers->hot_add_disk) {
2079 printk(KERN_WARNING
2080 "%s: personality does not support diskops!\n",
2081 mdname(mddev));
2082 return -EINVAL;
2083 }
2084 rdev = md_import_device(dev, mddev->major_version,
2085 mddev->minor_version);
2086 if (IS_ERR(rdev)) {
2087 printk(KERN_WARNING
2088 "md: md_import_device returned %ld\n",
2089 PTR_ERR(rdev));
2090 return PTR_ERR(rdev);
2091 }
2092 rdev->in_sync = 0; /* just to be sure */
2093 rdev->raid_disk = -1;
2094 err = bind_rdev_to_array(rdev, mddev);
2095 if (err)
2096 export_rdev(rdev);
2097 if (mddev->thread)
2098 md_wakeup_thread(mddev->thread);
2099 return err;
2100 }
2101
2102 /* otherwise, add_new_disk is only allowed
2103 * for major_version==0 superblocks
2104 */
2105 if (mddev->major_version != 0) {
2106 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
2107 mdname(mddev));
2108 return -EINVAL;
2109 }
2110
2111 if (!(info->state & (1<<MD_DISK_FAULTY))) {
2112 int err;
2113 rdev = md_import_device (dev, -1, 0);
2114 if (IS_ERR(rdev)) {
2115 printk(KERN_WARNING
2116 "md: error, md_import_device() returned %ld\n",
2117 PTR_ERR(rdev));
2118 return PTR_ERR(rdev);
2119 }
2120 rdev->desc_nr = info->number;
2121 if (info->raid_disk < mddev->raid_disks)
2122 rdev->raid_disk = info->raid_disk;
2123 else
2124 rdev->raid_disk = -1;
2125
2126 rdev->faulty = 0;
2127 if (rdev->raid_disk < mddev->raid_disks)
2128 rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
2129 else
2130 rdev->in_sync = 0;
2131
2132 err = bind_rdev_to_array(rdev, mddev);
2133 if (err) {
2134 export_rdev(rdev);
2135 return err;
2136 }
2137
2138 if (!mddev->persistent) {
2139 printk(KERN_INFO "md: nonpersistent superblock ...\n");
2140 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2141 } else
2142 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
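	/*
	 * Either way sb_offset ends up in 1K blocks: i_size is in bytes and
	 * BLOCK_SIZE_BITS is 10, so the shift converts bytes to 1K units;
	 * calc_dev_sboffset() (defined earlier in this file) is assumed to
	 * return the 0.90 superblock offset in the same units.
	 */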
2143 rdev->size = calc_dev_size(rdev, mddev->chunk_size);
2144
2145 if (!mddev->size || (mddev->size > rdev->size))
2146 mddev->size = rdev->size;
2147 }
2148
2149 return 0;
2150}
2151
2152static int hot_remove_disk(mddev_t * mddev, dev_t dev)
2153{
2154 char b[BDEVNAME_SIZE];
2155 mdk_rdev_t *rdev;
2156
2157 if (!mddev->pers)
2158 return -ENODEV;
2159
2160 rdev = find_rdev(mddev, dev);
2161 if (!rdev)
2162 return -ENXIO;
2163
2164 if (rdev->raid_disk >= 0)
2165 goto busy;
2166
2167 kick_rdev_from_array(rdev);
2168 md_update_sb(mddev);
2169
2170 return 0;
2171busy:
2172 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
2173 bdevname(rdev->bdev,b), mdname(mddev));
2174 return -EBUSY;
2175}
2176
2177static int hot_add_disk(mddev_t * mddev, dev_t dev)
2178{
2179 char b[BDEVNAME_SIZE];
2180 int err;
2181 unsigned int size;
2182 mdk_rdev_t *rdev;
2183
2184 if (!mddev->pers)
2185 return -ENODEV;
2186
2187 if (mddev->major_version != 0) {
2188 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
2189 " version-0 superblocks.\n",
2190 mdname(mddev));
2191 return -EINVAL;
2192 }
2193 if (!mddev->pers->hot_add_disk) {
2194 printk(KERN_WARNING
2195 "%s: personality does not support diskops!\n",
2196 mdname(mddev));
2197 return -EINVAL;
2198 }
2199
2200 rdev = md_import_device (dev, -1, 0);
2201 if (IS_ERR(rdev)) {
2202 printk(KERN_WARNING
2203 "md: error, md_import_device() returned %ld\n",
2204 PTR_ERR(rdev));
2205 return -EINVAL;
2206 }
2207
2208 if (mddev->persistent)
2209 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
2210 else
2211 rdev->sb_offset =
2212 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2213
2214 size = calc_dev_size(rdev, mddev->chunk_size);
2215 rdev->size = size;
2216
2217 if (size < mddev->size) {
2218 printk(KERN_WARNING
2219 "%s: disk size %llu blocks < array size %llu\n",
2220 mdname(mddev), (unsigned long long)size,
2221 (unsigned long long)mddev->size);
2222 err = -ENOSPC;
2223 goto abort_export;
2224 }
2225
2226 if (rdev->faulty) {
2227 printk(KERN_WARNING
2228 "md: can not hot-add faulty %s disk to %s!\n",
2229 bdevname(rdev->bdev,b), mdname(mddev));
2230 err = -EINVAL;
2231 goto abort_export;
2232 }
2233 rdev->in_sync = 0;
2234 rdev->desc_nr = -1;
2235 bind_rdev_to_array(rdev, mddev);
2236
2237 /*
2238 * The rest had better be atomic, because disk failures can be
2239 * noticed in interrupt contexts ...
2240 */
2241
2242 if (rdev->desc_nr == mddev->max_disks) {
2243 printk(KERN_WARNING "%s: can not hot-add to full array!\n",
2244 mdname(mddev));
2245 err = -EBUSY;
2246 goto abort_unbind_export;
2247 }
2248
2249 rdev->raid_disk = -1;
2250
2251 md_update_sb(mddev);
2252
2253 /*
2254 * Kick recovery, maybe this spare has to be added to the
2255 * array immediately.
2256 */
2257 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2258 md_wakeup_thread(mddev->thread);
2259
2260 return 0;
2261
2262abort_unbind_export:
2263 unbind_rdev_from_array(rdev);
2264
2265abort_export:
2266 export_rdev(rdev);
2267 return err;
2268}
2269
2270/*
2271 * set_array_info is used two different ways
2272 * The original usage is when creating a new array.
2273 * In this usage, raid_disks is > 0 and it together with
2274 * level, size, not_persistent, layout, chunksize determine the
2275 * shape of the array.
2276 * This will always create an array with a type-0.90.0 superblock.
2277 * The newer usage is when assembling an array.
2278 * In this case raid_disks will be 0, and the major_version field is
2279 * used to determine which style super-blocks are to be found on the devices.
2280 * The minor and patch _version numbers are also kept in case the
2281 * super_block handler wishes to interpret them.
2282 */
2283static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
2284{
2285
2286 if (info->raid_disks == 0) {
2287 /* just setting version number for superblock loading */
2288 if (info->major_version < 0 ||
2289 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
2290 super_types[info->major_version].name == NULL) {
2291 /* maybe try to auto-load a module? */
2292 printk(KERN_INFO
2293 "md: superblock version %d not known\n",
2294 info->major_version);
2295 return -EINVAL;
2296 }
2297 mddev->major_version = info->major_version;
2298 mddev->minor_version = info->minor_version;
2299 mddev->patch_version = info->patch_version;
2300 return 0;
2301 }
2302 mddev->major_version = MD_MAJOR_VERSION;
2303 mddev->minor_version = MD_MINOR_VERSION;
2304 mddev->patch_version = MD_PATCHLEVEL_VERSION;
2305 mddev->ctime = get_seconds();
2306
2307 mddev->level = info->level;
2308 mddev->size = info->size;
2309 mddev->raid_disks = info->raid_disks;
2310 /* don't set md_minor, it is determined by which /dev/md* was
2311 * opened
2312 */
2313 if (info->state & (1<<MD_SB_CLEAN))
2314 mddev->recovery_cp = MaxSector;
2315 else
2316 mddev->recovery_cp = 0;
2317 mddev->persistent = ! info->not_persistent;
2318
2319 mddev->layout = info->layout;
2320 mddev->chunk_size = info->chunk_size;
2321
2322 mddev->max_disks = MD_SB_DISKS;
2323
2324 mddev->sb_dirty = 1;
2325
2326 /*
2327 * Generate a 128 bit UUID
2328 */
2329 get_random_bytes(mddev->uuid, 16);
2330
2331 return 0;
2332}
2333
2334/*
2335 * update_array_info is used to change the configuration of an
2336 * on-line array.
2337 * The version, ctime, level, size, raid_disks, not_persistent, layout, chunk_size
2338 * fields in the info are checked against the array.
2339 * Any differences that cannot be handled will cause an error.
2340 * Normally, only one change can be managed at a time.
2341 */
2342static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
2343{
2344 int rv = 0;
2345 int cnt = 0;
2346
2347 if (mddev->major_version != info->major_version ||
2348 mddev->minor_version != info->minor_version ||
2349/* mddev->patch_version != info->patch_version || */
2350 mddev->ctime != info->ctime ||
2351 mddev->level != info->level ||
2352/* mddev->layout != info->layout || */
2353 !mddev->persistent != info->not_persistent||
2354 mddev->chunk_size != info->chunk_size )
2355 return -EINVAL;
2356 /* Check there is only one change */
2357 if (mddev->size != info->size) cnt++;
2358 if (mddev->raid_disks != info->raid_disks) cnt++;
2359 if (mddev->layout != info->layout) cnt++;
2360 if (cnt == 0) return 0;
2361 if (cnt > 1) return -EINVAL;
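	/*
	 * e.g. a SET_ARRAY_INFO call that changes both size and raid_disks
	 * counts two changes and is rejected with -EINVAL; each change has
	 * to come in its own request.
	 */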
2362
2363 if (mddev->layout != info->layout) {
2364 /* Change layout
2365 * we don't need to do anything at the md level, the
2366 * personality will take care of it all.
2367 */
2368 if (mddev->pers->reconfig == NULL)
2369 return -EINVAL;
2370 else
2371 return mddev->pers->reconfig(mddev, info->layout, -1);
2372 }
2373 if (mddev->size != info->size) {
2374 mdk_rdev_t * rdev;
2375 struct list_head *tmp;
2376 if (mddev->pers->resize == NULL)
2377 return -EINVAL;
2378 /* The "size" is the amount of each device that is used.
2379 * This can only make sense for arrays with redundancy.
2380 * linear and raid0 always use whatever space is available
2381 * We can only consider changing the size if no resync
2382 * or reconstruction is happening, and if the new size
2383 * is acceptable. It must fit before the sb_offset or,
2384 * if that is <data_offset, it must fit before the
2385 * size of each device.
2386 * If size is zero, we find the largest size that fits.
2387 */
2388 if (mddev->sync_thread)
2389 return -EBUSY;
2390 ITERATE_RDEV(mddev,rdev,tmp) {
2391 sector_t avail;
2392 int fit = (info->size == 0);
2393 if (rdev->sb_offset > rdev->data_offset)
2394 avail = (rdev->sb_offset*2) - rdev->data_offset;
2395 else
2396 avail = get_capacity(rdev->bdev->bd_disk)
2397 - rdev->data_offset;
2398 if (fit && (info->size == 0 || info->size > avail/2))
2399 info->size = avail/2;
2400 if (avail < ((sector_t)info->size << 1))
2401 return -ENOSPC;
2402 }
2403 rv = mddev->pers->resize(mddev, (sector_t)info->size *2);
2404 if (!rv) {
2405 struct block_device *bdev;
2406
2407 bdev = bdget_disk(mddev->gendisk, 0);
2408 if (bdev) {
2409 down(&bdev->bd_inode->i_sem);
2410 i_size_write(bdev->bd_inode, mddev->array_size << 10);
2411 up(&bdev->bd_inode->i_sem);
2412 bdput(bdev);
2413 }
2414 }
2415 }
2416 if (mddev->raid_disks != info->raid_disks) {
2417 /* change the number of raid disks */
2418 if (mddev->pers->reshape == NULL)
2419 return -EINVAL;
2420 if (info->raid_disks <= 0 ||
2421 info->raid_disks >= mddev->max_disks)
2422 return -EINVAL;
2423 if (mddev->sync_thread)
2424 return -EBUSY;
2425 rv = mddev->pers->reshape(mddev, info->raid_disks);
2426 if (!rv) {
2427 struct block_device *bdev;
2428
2429 bdev = bdget_disk(mddev->gendisk, 0);
2430 if (bdev) {
2431 down(&bdev->bd_inode->i_sem);
2432 i_size_write(bdev->bd_inode, mddev->array_size << 10);
2433 up(&bdev->bd_inode->i_sem);
2434 bdput(bdev);
2435 }
2436 }
2437 }
2438 md_update_sb(mddev);
2439 return rv;
2440}
2441
2442static int set_disk_faulty(mddev_t *mddev, dev_t dev)
2443{
2444 mdk_rdev_t *rdev;
2445
2446 if (mddev->pers == NULL)
2447 return -ENODEV;
2448
2449 rdev = find_rdev(mddev, dev);
2450 if (!rdev)
2451 return -ENODEV;
2452
2453 md_error(mddev, rdev);
2454 return 0;
2455}
2456
2457static int md_ioctl(struct inode *inode, struct file *file,
2458 unsigned int cmd, unsigned long arg)
2459{
2460 int err = 0;
2461 void __user *argp = (void __user *)arg;
2462 struct hd_geometry __user *loc = argp;
2463 mddev_t *mddev = NULL;
2464
2465 if (!capable(CAP_SYS_ADMIN))
2466 return -EACCES;
2467
2468 /*
2469 * Commands dealing with the RAID driver but not any
2470 * particular array:
2471 */
2472 switch (cmd)
2473 {
2474 case RAID_VERSION:
2475 err = get_version(argp);
2476 goto done;
2477
2478 case PRINT_RAID_DEBUG:
2479 err = 0;
2480 md_print_devices();
2481 goto done;
2482
2483#ifndef MODULE
2484 case RAID_AUTORUN:
2485 err = 0;
2486 autostart_arrays(arg);
2487 goto done;
2488#endif
2489 default:;
2490 }
2491
2492 /*
2493 * Commands creating/starting a new array:
2494 */
2495
2496 mddev = inode->i_bdev->bd_disk->private_data;
2497
2498 if (!mddev) {
2499 BUG();
2500 goto abort;
2501 }
2502
2503
2504 if (cmd == START_ARRAY) {
2505 /* START_ARRAY doesn't need to lock the array as autostart_array
2506 * does the locking, and it could even be a different array
2507 */
2508 static int cnt = 3;
2509 if (cnt > 0 ) {
2510 printk(KERN_WARNING
2511 "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
2512 "This will not be supported beyond 2.6\n",
2513 current->comm, current->pid);
2514 cnt--;
2515 }
2516 err = autostart_array(new_decode_dev(arg));
2517 if (err) {
2518 printk(KERN_WARNING "md: autostart failed!\n");
2519 goto abort;
2520 }
2521 goto done;
2522 }
2523
2524 err = mddev_lock(mddev);
2525 if (err) {
2526 printk(KERN_INFO
2527 "md: ioctl lock interrupted, reason %d, cmd %d\n",
2528 err, cmd);
2529 goto abort;
2530 }
2531
2532 switch (cmd)
2533 {
2534 case SET_ARRAY_INFO:
2535 {
2536 mdu_array_info_t info;
2537 if (!arg)
2538 memset(&info, 0, sizeof(info));
2539 else if (copy_from_user(&info, argp, sizeof(info))) {
2540 err = -EFAULT;
2541 goto abort_unlock;
2542 }
2543 if (mddev->pers) {
2544 err = update_array_info(mddev, &info);
2545 if (err) {
2546 printk(KERN_WARNING "md: couldn't update"
2547 " array info. %d\n", err);
2548 goto abort_unlock;
2549 }
2550 goto done_unlock;
2551 }
2552 if (!list_empty(&mddev->disks)) {
2553 printk(KERN_WARNING
2554 "md: array %s already has disks!\n",
2555 mdname(mddev));
2556 err = -EBUSY;
2557 goto abort_unlock;
2558 }
2559 if (mddev->raid_disks) {
2560 printk(KERN_WARNING
2561 "md: array %s already initialised!\n",
2562 mdname(mddev));
2563 err = -EBUSY;
2564 goto abort_unlock;
2565 }
2566 err = set_array_info(mddev, &info);
2567 if (err) {
2568 printk(KERN_WARNING "md: couldn't set"
2569 " array info. %d\n", err);
2570 goto abort_unlock;
2571 }
2572 }
2573 goto done_unlock;
2574
2575 default:;
2576 }
2577
2578 /*
2579 * Commands querying/configuring an existing array:
2580 */
2581 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */
2582 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
2583 err = -ENODEV;
2584 goto abort_unlock;
2585 }
2586
2587 /*
2588 * Commands even a read-only array can execute:
2589 */
2590 switch (cmd)
2591 {
2592 case GET_ARRAY_INFO:
2593 err = get_array_info(mddev, argp);
2594 goto done_unlock;
2595
2596 case GET_DISK_INFO:
2597 err = get_disk_info(mddev, argp);
2598 goto done_unlock;
2599
2600 case RESTART_ARRAY_RW:
2601 err = restart_array(mddev);
2602 goto done_unlock;
2603
2604 case STOP_ARRAY:
2605 err = do_md_stop (mddev, 0);
2606 goto done_unlock;
2607
2608 case STOP_ARRAY_RO:
2609 err = do_md_stop (mddev, 1);
2610 goto done_unlock;
2611
2612 /*
2613 * We have a problem here : there is no easy way to give a CHS
2614 * virtual geometry. We currently pretend that we have 2 heads and
2615 * 4 sectors per track (with a BIG number of cylinders...). This drives
2616 * dosfs just mad... ;-)
2617 */
2618 case HDIO_GETGEO:
2619 if (!loc) {
2620 err = -EINVAL;
2621 goto abort_unlock;
2622 }
2623 err = put_user (2, (char __user *) &loc->heads);
2624 if (err)
2625 goto abort_unlock;
2626 err = put_user (4, (char __user *) &loc->sectors);
2627 if (err)
2628 goto abort_unlock;
2629 err = put_user(get_capacity(mddev->gendisk)/8,
2630 (short __user *) &loc->cylinders);
2631 if (err)
2632 goto abort_unlock;
2633 err = put_user (get_start_sect(inode->i_bdev),
2634 (long __user *) &loc->start);
2635 goto done_unlock;
2636 }
2637
2638 /*
2639 * The remaining ioctls are changing the state of the
2640 * superblock, so we do not allow read-only arrays
2641 * here:
2642 */
2643 if (mddev->ro) {
2644 err = -EROFS;
2645 goto abort_unlock;
2646 }
2647
2648 switch (cmd)
2649 {
2650 case ADD_NEW_DISK:
2651 {
2652 mdu_disk_info_t info;
2653 if (copy_from_user(&info, argp, sizeof(info)))
2654 err = -EFAULT;
2655 else
2656 err = add_new_disk(mddev, &info);
2657 goto done_unlock;
2658 }
2659
2660 case HOT_REMOVE_DISK:
2661 err = hot_remove_disk(mddev, new_decode_dev(arg));
2662 goto done_unlock;
2663
2664 case HOT_ADD_DISK:
2665 err = hot_add_disk(mddev, new_decode_dev(arg));
2666 goto done_unlock;
2667
2668 case SET_DISK_FAULTY:
2669 err = set_disk_faulty(mddev, new_decode_dev(arg));
2670 goto done_unlock;
2671
2672 case RUN_ARRAY:
2673 err = do_md_run (mddev);
2674 goto done_unlock;
2675
2676 default:
2677 if (_IOC_TYPE(cmd) == MD_MAJOR)
2678 printk(KERN_WARNING "md: %s(pid %d) used"
2679 " obsolete MD ioctl, upgrade your"
2680 " software to use new ictls.\n",
2681 current->comm, current->pid);
2682 err = -EINVAL;
2683 goto abort_unlock;
2684 }
2685
2686done_unlock:
2687abort_unlock:
2688 mddev_unlock(mddev);
2689
2690 return err;
2691done:
2692 if (err)
2693 MD_BUG();
2694abort:
2695 return err;
2696}
2697
2698static int md_open(struct inode *inode, struct file *file)
2699{
2700 /*
2701 * Succeed if we can lock the mddev, which confirms that
2702 * it isn't being stopped right now.
2703 */
2704 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
2705 int err;
2706
2707 if ((err = mddev_lock(mddev)))
2708 goto out;
2709
2710 err = 0;
2711 mddev_get(mddev);
2712 mddev_unlock(mddev);
2713
2714 check_disk_change(inode->i_bdev);
2715 out:
2716 return err;
2717}
2718
2719static int md_release(struct inode *inode, struct file * file)
2720{
2721 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
2722
2723 if (!mddev)
2724 BUG();
2725 mddev_put(mddev);
2726
2727 return 0;
2728}
2729
2730static int md_media_changed(struct gendisk *disk)
2731{
2732 mddev_t *mddev = disk->private_data;
2733
2734 return mddev->changed;
2735}
2736
2737static int md_revalidate(struct gendisk *disk)
2738{
2739 mddev_t *mddev = disk->private_data;
2740
2741 mddev->changed = 0;
2742 return 0;
2743}
2744static struct block_device_operations md_fops =
2745{
2746 .owner = THIS_MODULE,
2747 .open = md_open,
2748 .release = md_release,
2749 .ioctl = md_ioctl,
2750 .media_changed = md_media_changed,
2751 .revalidate_disk= md_revalidate,
2752};
2753
2754int md_thread(void * arg)
2755{
2756 mdk_thread_t *thread = arg;
2757
2758 lock_kernel();
2759
2760 /*
2761 * Detach thread
2762 */
2763
2764 daemonize(thread->name, mdname(thread->mddev));
2765
2766 current->exit_signal = SIGCHLD;
2767 allow_signal(SIGKILL);
2768 thread->tsk = current;
2769
2770 /*
2771 * md_thread is a 'system-thread', its priority should be very
2772 * high. We avoid resource deadlocks individually in each
2773 * raid personality. (RAID5 does preallocation) We also use RR and
2774 * the very same RT priority as kswapd, thus we will never get
2775 * into a priority inversion deadlock.
2776 *
2777 * we definitely have to have equal or higher priority than
2778 * bdflush, otherwise bdflush will deadlock if there are too
2779 * many dirty RAID5 blocks.
2780 */
2781 unlock_kernel();
2782
2783 complete(thread->event);
2784 while (thread->run) {
2785 void (*run)(mddev_t *);
2786
2787 wait_event_interruptible(thread->wqueue,
2788 test_bit(THREAD_WAKEUP, &thread->flags));
2789 if (current->flags & PF_FREEZE)
2790 refrigerator(PF_FREEZE);
2791
2792 clear_bit(THREAD_WAKEUP, &thread->flags);
2793
2794 run = thread->run;
2795 if (run)
2796 run(thread->mddev);
2797
2798 if (signal_pending(current))
2799 flush_signals(current);
2800 }
2801 complete(thread->event);
2802 return 0;
2803}
2804
2805void md_wakeup_thread(mdk_thread_t *thread)
2806{
2807 if (thread) {
2808 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
2809 set_bit(THREAD_WAKEUP, &thread->flags);
2810 wake_up(&thread->wqueue);
2811 }
2812}
2813
2814mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
2815 const char *name)
2816{
2817 mdk_thread_t *thread;
2818 int ret;
2819 struct completion event;
2820
2821 thread = (mdk_thread_t *) kmalloc
2822 (sizeof(mdk_thread_t), GFP_KERNEL);
2823 if (!thread)
2824 return NULL;
2825
2826 memset(thread, 0, sizeof(mdk_thread_t));
2827 init_waitqueue_head(&thread->wqueue);
2828
2829 init_completion(&event);
2830 thread->event = &event;
2831 thread->run = run;
2832 thread->mddev = mddev;
2833 thread->name = name;
2834 ret = kernel_thread(md_thread, thread, 0);
2835 if (ret < 0) {
2836 kfree(thread);
2837 return NULL;
2838 }
2839 wait_for_completion(&event);
2840 return thread;
2841}
2842
2843static void md_interrupt_thread(mdk_thread_t *thread)
2844{
2845 if (!thread->tsk) {
2846 MD_BUG();
2847 return;
2848 }
2849 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
2850 send_sig(SIGKILL, thread->tsk, 1);
2851}
2852
2853void md_unregister_thread(mdk_thread_t *thread)
2854{
2855 struct completion event;
2856
2857 init_completion(&event);
2858
2859 thread->event = &event;
2860 thread->run = NULL;
2861 thread->name = NULL;
2862 md_interrupt_thread(thread);
2863 wait_for_completion(&event);
2864 kfree(thread);
2865}
2866
2867void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
2868{
2869 if (!mddev) {
2870 MD_BUG();
2871 return;
2872 }
2873
2874 if (!rdev || rdev->faulty)
2875 return;
2876
2877 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
2878 mdname(mddev),
2879 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
2880 __builtin_return_address(0),__builtin_return_address(1),
2881 __builtin_return_address(2),__builtin_return_address(3));
2882
2883 if (!mddev->pers->error_handler)
2884 return;
2885 mddev->pers->error_handler(mddev,rdev);
2886 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2887 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2888 md_wakeup_thread(mddev->thread);
2889}
2890
2891/* seq_file implementation /proc/mdstat */
2892
2893static void status_unused(struct seq_file *seq)
2894{
2895 int i = 0;
2896 mdk_rdev_t *rdev;
2897 struct list_head *tmp;
2898
2899 seq_printf(seq, "unused devices: ");
2900
2901 ITERATE_RDEV_PENDING(rdev,tmp) {
2902 char b[BDEVNAME_SIZE];
2903 i++;
2904 seq_printf(seq, "%s ",
2905 bdevname(rdev->bdev,b));
2906 }
2907 if (!i)
2908 seq_printf(seq, "<none>");
2909
2910 seq_printf(seq, "\n");
2911}
2912
2913
2914static void status_resync(struct seq_file *seq, mddev_t * mddev)
2915{
2916 unsigned long max_blocks, resync, res, dt, db, rt;
2917
2918 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
2919
2920 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2921 max_blocks = mddev->resync_max_sectors >> 1;
2922 else
2923 max_blocks = mddev->size;
2924
2925 /*
2926 * Should not happen.
2927 */
2928 if (!max_blocks) {
2929 MD_BUG();
2930 return;
2931 }
2932 res = (resync/1024)*1000/(max_blocks/1024 + 1);
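	/*
	 * res is progress in tenths of a percent: e.g. resync == 250000 and
	 * max_blocks == 1000000 (1K blocks) gives (244*1000)/977 == 249,
	 * printed below as "24.9%" with 4 of the 20 bar slots filled.
	 */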
2933 {
2934 int i, x = res/50, y = 20-x;
2935 seq_printf(seq, "[");
2936 for (i = 0; i < x; i++)
2937 seq_printf(seq, "=");
2938 seq_printf(seq, ">");
2939 for (i = 0; i < y; i++)
2940 seq_printf(seq, ".");
2941 seq_printf(seq, "] ");
2942 }
2943 seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
2944 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
2945 "resync" : "recovery"),
2946 res/10, res % 10, resync, max_blocks);
2947
2948 /*
2949 * We do not want to overflow, so the order of operands and
2950 * the * 100 / 100 trick are important. We do a +1 to be
2951 * safe against division by zero. We only estimate anyway.
2952 *
2953 * dt: time from mark until now
2954 * db: blocks written from mark until now
2955 * rt: remaining time
2956 */
2957 dt = ((jiffies - mddev->resync_mark) / HZ);
2958 if (!dt) dt++;
2959 db = resync - (mddev->resync_mark_cnt/2);
2960 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
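	/*
	 * Rough example: dt == 60s since the mark, db == 30000 blocks in
	 * that window and 1000000 blocks still to go gives
	 * rt = 60 * (1000000/301) / 100 == 1993 seconds, shown as "33.2min".
	 */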
2961
2962 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
2963
2964 seq_printf(seq, " speed=%ldK/sec", db/dt);
2965}
2966
2967static void *md_seq_start(struct seq_file *seq, loff_t *pos)
2968{
2969 struct list_head *tmp;
2970 loff_t l = *pos;
2971 mddev_t *mddev;
2972
2973 if (l >= 0x10000)
2974 return NULL;
2975 if (!l--)
2976 /* header */
2977 return (void*)1;
2978
2979 spin_lock(&all_mddevs_lock);
2980 list_for_each(tmp,&all_mddevs)
2981 if (!l--) {
2982 mddev = list_entry(tmp, mddev_t, all_mddevs);
2983 mddev_get(mddev);
2984 spin_unlock(&all_mddevs_lock);
2985 return mddev;
2986 }
2987 spin_unlock(&all_mddevs_lock);
2988 if (!l--)
2989 return (void*)2;/* tail */
2990 return NULL;
2991}
2992
2993static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2994{
2995 struct list_head *tmp;
2996 mddev_t *next_mddev, *mddev = v;
2997
2998 ++*pos;
2999 if (v == (void*)2)
3000 return NULL;
3001
3002 spin_lock(&all_mddevs_lock);
3003 if (v == (void*)1)
3004 tmp = all_mddevs.next;
3005 else
3006 tmp = mddev->all_mddevs.next;
3007 if (tmp != &all_mddevs)
3008 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
3009 else {
3010 next_mddev = (void*)2;
3011 *pos = 0x10000;
3012 }
3013 spin_unlock(&all_mddevs_lock);
3014
3015 if (v != (void*)1)
3016 mddev_put(mddev);
3017 return next_mddev;
3018
3019}
3020
3021static void md_seq_stop(struct seq_file *seq, void *v)
3022{
3023 mddev_t *mddev = v;
3024
3025 if (mddev && v != (void*)1 && v != (void*)2)
3026 mddev_put(mddev);
3027}
3028
3029static int md_seq_show(struct seq_file *seq, void *v)
3030{
3031 mddev_t *mddev = v;
3032 sector_t size;
3033 struct list_head *tmp2;
3034 mdk_rdev_t *rdev;
3035 int i;
3036
3037 if (v == (void*)1) {
3038 seq_printf(seq, "Personalities : ");
3039 spin_lock(&pers_lock);
3040 for (i = 0; i < MAX_PERSONALITY; i++)
3041 if (pers[i])
3042 seq_printf(seq, "[%s] ", pers[i]->name);
3043
3044 spin_unlock(&pers_lock);
3045 seq_printf(seq, "\n");
3046 return 0;
3047 }
3048 if (v == (void*)2) {
3049 status_unused(seq);
3050 return 0;
3051 }
3052
3053 if (mddev_lock(mddev)!=0)
3054 return -EINTR;
3055 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
3056 seq_printf(seq, "%s : %sactive", mdname(mddev),
3057 mddev->pers ? "" : "in");
3058 if (mddev->pers) {
3059 if (mddev->ro)
3060 seq_printf(seq, " (read-only)");
3061 seq_printf(seq, " %s", mddev->pers->name);
3062 }
3063
3064 size = 0;
3065 ITERATE_RDEV(mddev,rdev,tmp2) {
3066 char b[BDEVNAME_SIZE];
3067 seq_printf(seq, " %s[%d]",
3068 bdevname(rdev->bdev,b), rdev->desc_nr);
3069 if (rdev->faulty) {
3070 seq_printf(seq, "(F)");
3071 continue;
3072 }
3073 size += rdev->size;
3074 }
3075
3076 if (!list_empty(&mddev->disks)) {
3077 if (mddev->pers)
3078 seq_printf(seq, "\n %llu blocks",
3079 (unsigned long long)mddev->array_size);
3080 else
3081 seq_printf(seq, "\n %llu blocks",
3082 (unsigned long long)size);
3083 }
3084
3085 if (mddev->pers) {
3086 mddev->pers->status (seq, mddev);
3087 seq_printf(seq, "\n ");
3088 if (mddev->curr_resync > 2)
3089 status_resync (seq, mddev);
3090 else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
3091 seq_printf(seq, " resync=DELAYED");
3092 }
3093
3094 seq_printf(seq, "\n");
3095 }
3096 mddev_unlock(mddev);
3097
3098 return 0;
3099}
3100
3101static struct seq_operations md_seq_ops = {
3102 .start = md_seq_start,
3103 .next = md_seq_next,
3104 .stop = md_seq_stop,
3105 .show = md_seq_show,
3106};
3107
3108static int md_seq_open(struct inode *inode, struct file *file)
3109{
3110 int error;
3111
3112 error = seq_open(file, &md_seq_ops);
3113 return error;
3114}
3115
3116static struct file_operations md_seq_fops = {
3117 .open = md_seq_open,
3118 .read = seq_read,
3119 .llseek = seq_lseek,
3120 .release = seq_release,
3121};
3122
3123int register_md_personality(int pnum, mdk_personality_t *p)
3124{
3125 if (pnum >= MAX_PERSONALITY) {
3126 printk(KERN_ERR
3127 "md: tried to install personality %s as nr %d, but max is %lu\n",
3128 p->name, pnum, MAX_PERSONALITY-1);
3129 return -EINVAL;
3130 }
3131
3132 spin_lock(&pers_lock);
3133 if (pers[pnum]) {
3134 spin_unlock(&pers_lock);
3135 MD_BUG();
3136 return -EBUSY;
3137 }
3138
3139 pers[pnum] = p;
3140 printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
3141 spin_unlock(&pers_lock);
3142 return 0;
3143}
3144
3145int unregister_md_personality(int pnum)
3146{
3147 if (pnum >= MAX_PERSONALITY) {
3148 MD_BUG();
3149 return -EINVAL;
3150 }
3151
3152 printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
3153 spin_lock(&pers_lock);
3154 pers[pnum] = NULL;
3155 spin_unlock(&pers_lock);
3156 return 0;
3157}
3158
3159static int is_mddev_idle(mddev_t *mddev)
3160{
3161 mdk_rdev_t * rdev;
3162 struct list_head *tmp;
3163 int idle;
3164 unsigned long curr_events;
3165
3166 idle = 1;
3167 ITERATE_RDEV(mddev,rdev,tmp) {
3168 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
3169 curr_events = disk_stat_read(disk, read_sectors) +
3170 disk_stat_read(disk, write_sectors) -
3171 atomic_read(&disk->sync_io);
3172 /* Allow some slack between the values of curr_events and last_events,
3173 * as there are some uninteresting races.
3174 * Note: the following is an unsigned comparison.
3175 */
3176 if ((curr_events - rdev->last_events + 32) > 64) {
3177 rdev->last_events = curr_events;
3178 idle = 0;
3179 }
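		/*
		 * i.e. the disk only counts as idle while curr_events stays
		 * within 32 sectors either side of last_events; thanks to the
		 * unsigned arithmetic a drop of more than 32 below
		 * last_events wraps to a huge value and is treated as
		 * activity too, resetting the baseline.
		 */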
3180 }
3181 return idle;
3182}
3183
3184void md_done_sync(mddev_t *mddev, int blocks, int ok)
3185{
3186 /* another "blocks" (512byte) blocks have been synced */
3187 atomic_sub(blocks, &mddev->recovery_active);
3188 wake_up(&mddev->recovery_wait);
3189 if (!ok) {
3190 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
3191 md_wakeup_thread(mddev->thread);
3192 // stop recovery, signal do_sync ....
3193 }
3194}
3195
3196
3197void md_write_start(mddev_t *mddev)
3198{
3199 if (!atomic_read(&mddev->writes_pending)) {
3200 mddev_lock_uninterruptible(mddev);
3201 if (mddev->in_sync) {
3202 mddev->in_sync = 0;
3203 del_timer(&mddev->safemode_timer);
3204 md_update_sb(mddev);
3205 }
3206 atomic_inc(&mddev->writes_pending);
3207 mddev_unlock(mddev);
3208 } else
3209 atomic_inc(&mddev->writes_pending);
3210}
3211
3212void md_write_end(mddev_t *mddev)
3213{
3214 if (atomic_dec_and_test(&mddev->writes_pending)) {
3215 if (mddev->safemode == 2)
3216 md_wakeup_thread(mddev->thread);
3217 else
3218 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
3219 }
3220}
3221
3222static inline void md_enter_safemode(mddev_t *mddev)
3223{
3224 if (!mddev->safemode) return;
3225 if (mddev->safemode == 2 &&
3226 (atomic_read(&mddev->writes_pending) || mddev->in_sync ||
3227 mddev->recovery_cp != MaxSector))
3228 return; /* avoid the lock */
3229 mddev_lock_uninterruptible(mddev);
3230 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
3231 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
3232 mddev->in_sync = 1;
3233 md_update_sb(mddev);
3234 }
3235 mddev_unlock(mddev);
3236
3237 if (mddev->safemode == 1)
3238 mddev->safemode = 0;
3239}
3240
3241void md_handle_safemode(mddev_t *mddev)
3242{
3243 if (signal_pending(current)) {
3244 printk(KERN_INFO "md: %s in immediate safe mode\n",
3245 mdname(mddev));
3246 mddev->safemode = 2;
3247 flush_signals(current);
3248 }
3249 md_enter_safemode(mddev);
3250}
3251
3252
3253DECLARE_WAIT_QUEUE_HEAD(resync_wait);
3254
3255#define SYNC_MARKS 10
3256#define SYNC_MARK_STEP (3*HZ)
3257static void md_do_sync(mddev_t *mddev)
3258{
3259 mddev_t *mddev2;
3260 unsigned int currspeed = 0,
3261 window;
3262 sector_t max_sectors,j;
3263 unsigned long mark[SYNC_MARKS];
3264 sector_t mark_cnt[SYNC_MARKS];
3265 int last_mark,m;
3266 struct list_head *tmp;
3267 sector_t last_check;
3268
3269 /* just in case the thread restarts... */
3270 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
3271 return;
3272
3273 /* we overload curr_resync somewhat here.
3274 * 0 == not engaged in resync at all
3275 * 2 == checking that there is no conflict with another sync
3276 * 1 == like 2, but have yielded to allow conflicting resync to
3277 * commence
3278 * other == active in resync - this many blocks
3279 *
3280 * Before starting a resync we must have set curr_resync to
3281 * 2, and then checked that every "conflicting" array has curr_resync
3282 * less than ours. When we find one that is the same or higher
3283 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
3284 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
3285 * This will mean we have to start checking from the beginning again.
3286 *
3287 */
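	/*
	 * Concrete scenario: md0 and md1 share a physical unit and both
	 * reach here with curr_resync == 2. The mddev with the lower
	 * address yields by dropping to 1; the other then sees a peer with
	 * curr_resync < 2, skips the wait and resyncs first, while the
	 * yielding array sleeps on resync_wait and retries from try_again
	 * once it is woken.
	 */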
3288
3289 do {
3290 mddev->curr_resync = 2;
3291
3292 try_again:
3293 if (signal_pending(current)) {
3294 flush_signals(current);
3295 goto skip;
3296 }
3297 ITERATE_MDDEV(mddev2,tmp) {
3298 printk(".");
3299 if (mddev2 == mddev)
3300 continue;
3301 if (mddev2->curr_resync &&
3302 match_mddev_units(mddev,mddev2)) {
3303 DEFINE_WAIT(wq);
3304 if (mddev < mddev2 && mddev->curr_resync == 2) {
3305 /* arbitrarily yield */
3306 mddev->curr_resync = 1;
3307 wake_up(&resync_wait);
3308 }
3309 if (mddev > mddev2 && mddev->curr_resync == 1)
3310 /* no need to wait here, we can wait the next
3311 * time 'round when curr_resync == 2
3312 */
3313 continue;
3314 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
3315 if (!signal_pending(current)
3316 && mddev2->curr_resync >= mddev->curr_resync) {
3317 printk(KERN_INFO "md: delaying resync of %s"
3318 " until %s has finished resync (they"
3319 " share one or more physical units)\n",
3320 mdname(mddev), mdname(mddev2));
3321 mddev_put(mddev2);
3322 schedule();
3323 finish_wait(&resync_wait, &wq);
3324 goto try_again;
3325 }
3326 finish_wait(&resync_wait, &wq);
3327 }
3328 }
3329 } while (mddev->curr_resync < 2);
3330
3331 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3332 /* resync follows the size requested by the personality,
3333 * which defaults to the physical size, but can be a virtual size
3334 */
3335 max_sectors = mddev->resync_max_sectors;
3336 else
3337 /* recovery follows the physical size of devices */
3338 max_sectors = mddev->size << 1;
3339
3340 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
3341 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
3342 " %d KB/sec/disc.\n", sysctl_speed_limit_min);
3343 printk(KERN_INFO "md: using maximum available idle IO bandwith "
3344 "(but not more than %d KB/sec) for reconstruction.\n",
3345 sysctl_speed_limit_max);
3346
3347 is_mddev_idle(mddev); /* this also initializes IO event counters */
3348 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3349 j = mddev->recovery_cp;
3350 else
3351 j = 0;
3352 for (m = 0; m < SYNC_MARKS; m++) {
3353 mark[m] = jiffies;
3354 mark_cnt[m] = j;
3355 }
3356 last_mark = 0;
3357 mddev->resync_mark = mark[last_mark];
3358 mddev->resync_mark_cnt = mark_cnt[last_mark];
3359
3360 /*
3361 * Tune reconstruction:
3362 */
3363 window = 32*(PAGE_SIZE/512);
3364 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
3365 window/2,(unsigned long long) max_sectors/2);
3366
3367 atomic_set(&mddev->recovery_active, 0);
3368 init_waitqueue_head(&mddev->recovery_wait);
3369 last_check = 0;
3370
3371 if (j>2) {
3372 printk(KERN_INFO
3373 "md: resuming recovery of %s from checkpoint.\n",
3374 mdname(mddev));
3375 mddev->curr_resync = j;
3376 }
3377
3378 while (j < max_sectors) {
3379 int sectors;
3380
3381 sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min);
3382 if (sectors < 0) {
3383 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
3384 goto out;
3385 }
3386 atomic_add(sectors, &mddev->recovery_active);
3387 j += sectors;
3388 if (j>1) mddev->curr_resync = j;
3389
3390 if (last_check + window > j || j == max_sectors)
3391 continue;
3392
3393 last_check = j;
3394
3395 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
3396 test_bit(MD_RECOVERY_ERR, &mddev->recovery))
3397 break;
3398
3399 repeat:
3400 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
3401 /* step marks */
3402 int next = (last_mark+1) % SYNC_MARKS;
3403
3404 mddev->resync_mark = mark[next];
3405 mddev->resync_mark_cnt = mark_cnt[next];
3406 mark[next] = jiffies;
3407 mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
3408 last_mark = next;
3409 }
3410
3411
3412 if (signal_pending(current)) {
3413 /*
3414 * got a signal, exit.
3415 */
3416 printk(KERN_INFO
3417 "md: md_do_sync() got signal ... exiting\n");
3418 flush_signals(current);
3419 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3420 goto out;
3421 }
3422
3423 /*
3424 * this loop exits only when either we are slower than
3425 * the 'hard' speed limit, or the system was IO-idle for
3426 * a jiffy.
3427 * the system might be non-idle CPU-wise, but we only care
3428 * about not overloading the IO subsystem. (things like an
3429 * e2fsck being done on the RAID array should execute fast)
3430 */
3431 mddev->queue->unplug_fn(mddev->queue);
3432 cond_resched();
3433
3434 currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
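		/*
		 * currspeed is in KB/sec: e.g. 200000 sectors synced since the
		 * last mark over 10 seconds gives 100000/(10+1) + 1, roughly
		 * 9 MB/sec, which the check below throttles against
		 * sysctl_speed_limit_min and sysctl_speed_limit_max.
		 */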
3435
3436 if (currspeed > sysctl_speed_limit_min) {
3437 if ((currspeed > sysctl_speed_limit_max) ||
3438 !is_mddev_idle(mddev)) {
3439 msleep_interruptible(250);
3440 goto repeat;
3441 }
3442 }
3443 }
3444 printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
3445 /*
3446 * this also signals 'finished resyncing' to md_stop
3447 */
3448 out:
3449 mddev->queue->unplug_fn(mddev->queue);
3450
3451 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
3452
3453 /* tell personality that we are finished */
3454 mddev->pers->sync_request(mddev, max_sectors, 1);
3455
3456 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
3457 mddev->curr_resync > 2 &&
3458 mddev->curr_resync >= mddev->recovery_cp) {
3459 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
3460 printk(KERN_INFO
3461 "md: checkpointing recovery of %s.\n",
3462 mdname(mddev));
3463 mddev->recovery_cp = mddev->curr_resync;
3464 } else
3465 mddev->recovery_cp = MaxSector;
3466 }
3467
3468 md_enter_safemode(mddev);
3469 skip:
3470 mddev->curr_resync = 0;
3471 wake_up(&resync_wait);
3472 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
3473 md_wakeup_thread(mddev->thread);
3474}
3475
3476
3477/*
3478 * This routine is regularly called by all per-raid-array threads to
3479 * deal with generic issues like resync and super-block update.
3480 * Raid personalities that don't have a thread (linear/raid0) do not
3481 * need this as they never do any recovery or update the superblock.
3482 *
3483 * It does not do any resync itself, but rather "forks" off other threads
3484 * to do that as needed.
3485 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
3486 * "->recovery" and create a thread at ->sync_thread.
3487 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
3488 * and wakes up this thread, which will reap the sync thread and finish up.
3489 * This thread also removes any faulty devices (with nr_pending == 0).
3490 *
3491 * The overall approach is:
3492 * 1/ if the superblock needs updating, update it.
3493 * 2/ If a recovery thread is running, don't do anything else.
3494 * 3/ If recovery has finished, clean up, possibly marking spares active.
3495 * 4/ If there are any faulty devices, remove them.
3496 * 5/ If the array is degraded, try to add spare devices
3497 * 6/ If array has spares or is not in-sync, start a resync thread.
3498 */
3499void md_check_recovery(mddev_t *mddev)
3500{
3501 mdk_rdev_t *rdev;
3502 struct list_head *rtmp;
3503
3504
3505 dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
3506
3507 if (mddev->ro)
3508 return;
3509 if ( ! (
3510 mddev->sb_dirty ||
3511 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
3512 test_bit(MD_RECOVERY_DONE, &mddev->recovery)
3513 ))
3514 return;
3515 if (mddev_trylock(mddev)==0) {
3516 int spares =0;
3517 if (mddev->sb_dirty)
3518 md_update_sb(mddev);
3519 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
3520 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
3521 /* resync/recovery still happening */
3522 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3523 goto unlock;
3524 }
3525 if (mddev->sync_thread) {
3526 /* resync has finished, collect result */
3527 md_unregister_thread(mddev->sync_thread);
3528 mddev->sync_thread = NULL;
3529 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
3530 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
3531 /* success...*/
3532 /* activate any spares */
3533 mddev->pers->spare_active(mddev);
3534 }
3535 md_update_sb(mddev);
3536 mddev->recovery = 0;
3537 /* flag recovery needed just to double check */
3538 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3539 goto unlock;
3540 }
3541 if (mddev->recovery)
3542 /* probably just the RECOVERY_NEEDED flag */
3543 mddev->recovery = 0;
3544
3545 /* no recovery is running.
3546 * remove any failed drives, then
3547 * add spares if possible.
3548 * Spares are also removed and re-added, to allow
3549 * the personality to fail the re-add.
3550 */
3551 ITERATE_RDEV(mddev,rdev,rtmp)
3552 if (rdev->raid_disk >= 0 &&
3553 (rdev->faulty || ! rdev->in_sync) &&
3554 atomic_read(&rdev->nr_pending)==0) {
3555 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0)
3556 rdev->raid_disk = -1;
3557 }
3558
3559 if (mddev->degraded) {
3560 ITERATE_RDEV(mddev,rdev,rtmp)
3561 if (rdev->raid_disk < 0
3562 && !rdev->faulty) {
3563 if (mddev->pers->hot_add_disk(mddev,rdev))
3564 spares++;
3565 else
3566 break;
3567 }
3568 }
3569
3570 if (!spares && (mddev->recovery_cp == MaxSector )) {
3571 /* nothing we can do ... */
3572 goto unlock;
3573 }
3574 if (mddev->pers->sync_request) {
3575 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3576 if (!spares)
3577 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3578 mddev->sync_thread = md_register_thread(md_do_sync,
3579 mddev,
3580 "%s_resync");
3581 if (!mddev->sync_thread) {
3582 printk(KERN_ERR "%s: could not start resync"
3583 " thread...\n",
3584 mdname(mddev));
3585 /* leave the spares where they are, it shouldn't hurt */
3586 mddev->recovery = 0;
3587 } else {
3588 md_wakeup_thread(mddev->sync_thread);
3589 }
3590 }
3591 unlock:
3592 mddev_unlock(mddev);
3593 }
3594}
3595
3596int md_notify_reboot(struct notifier_block *this,
3597 unsigned long code, void *x)
3598{
3599 struct list_head *tmp;
3600 mddev_t *mddev;
3601
3602 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
3603
3604 printk(KERN_INFO "md: stopping all md devices.\n");
3605
3606 ITERATE_MDDEV(mddev,tmp)
3607 if (mddev_trylock(mddev)==0)
3608 do_md_stop (mddev, 1);
3609 /*
3610 * certain more exotic SCSI devices are known to be
3611 * volatile wrt too early system reboots. While the
3612 * right place to handle this issue is the given
3613 * driver, we do want to have a safe RAID driver ...
3614 */
3615 mdelay(1000*1);
3616 }
3617 return NOTIFY_DONE;
3618}
3619
3620struct notifier_block md_notifier = {
3621 .notifier_call = md_notify_reboot,
3622 .next = NULL,
3623 .priority = INT_MAX, /* before any real devices */
3624};
3625
3626static void md_geninit(void)
3627{
3628 struct proc_dir_entry *p;
3629
3630 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
3631
3632 p = create_proc_entry("mdstat", S_IRUGO, NULL);
3633 if (p)
3634 p->proc_fops = &md_seq_fops;
3635}
3636
3637int __init md_init(void)
3638{
3639 int minor;
3640
3641 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
3642 " MD_SB_DISKS=%d\n",
3643 MD_MAJOR_VERSION, MD_MINOR_VERSION,
3644 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
3645
3646 if (register_blkdev(MAJOR_NR, "md"))
3647 return -1;
3648 if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
3649 unregister_blkdev(MAJOR_NR, "md");
3650 return -1;
3651 }
3652 devfs_mk_dir("md");
3653 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
3654 md_probe, NULL, NULL);
3655 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
3656 md_probe, NULL, NULL);
3657
3658 for (minor=0; minor < MAX_MD_DEVS; ++minor)
3659 devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
3660 S_IFBLK|S_IRUSR|S_IWUSR,
3661 "md/%d", minor);
3662
3663 for (minor=0; minor < MAX_MD_DEVS; ++minor)
3664 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
3665 S_IFBLK|S_IRUSR|S_IWUSR,
3666 "md/mdp%d", minor);
3667
3668
3669 register_reboot_notifier(&md_notifier);
3670 raid_table_header = register_sysctl_table(raid_root_table, 1);
3671
3672 md_geninit();
3673 return (0);
3674}
3675
3676
3677#ifndef MODULE
3678
3679/*
3680 * Searches all registered partitions for autorun RAID arrays
3681 * at boot time.
3682 */
3683static dev_t detected_devices[128];
3684static int dev_cnt;
3685
3686void md_autodetect_dev(dev_t dev)
3687{
3688 if (dev_cnt >= 0 && dev_cnt < 127)
3689 detected_devices[dev_cnt++] = dev;
3690}
3691
3692
3693static void autostart_arrays(int part)
3694{
3695 mdk_rdev_t *rdev;
3696 int i;
3697
3698 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
3699
3700 for (i = 0; i < dev_cnt; i++) {
3701 dev_t dev = detected_devices[i];
3702
3703 rdev = md_import_device(dev,0, 0);
3704 if (IS_ERR(rdev))
3705 continue;
3706
3707 if (rdev->faulty) {
3708 MD_BUG();
3709 continue;
3710 }
3711 list_add(&rdev->same_set, &pending_raid_disks);
3712 }
3713 dev_cnt = 0;
3714
3715 autorun_devices(part);
3716}
3717
3718#endif
3719
3720static __exit void md_exit(void)
3721{
3722 mddev_t *mddev;
3723 struct list_head *tmp;
3724 int i;
3725 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
3726 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
3727 for (i=0; i < MAX_MD_DEVS; i++)
3728 devfs_remove("md/%d", i);
3729 for (i=0; i < MAX_MD_DEVS; i++)
3730 devfs_remove("md/d%d", i);
3731
3732 devfs_remove("md");
3733
3734 unregister_blkdev(MAJOR_NR,"md");
3735 unregister_blkdev(mdp_major, "mdp");
3736 unregister_reboot_notifier(&md_notifier);
3737 unregister_sysctl_table(raid_table_header);
3738 remove_proc_entry("mdstat", NULL);
3739 ITERATE_MDDEV(mddev,tmp) {
3740 struct gendisk *disk = mddev->gendisk;
3741 if (!disk)
3742 continue;
3743 export_array(mddev);
3744 del_gendisk(disk);
3745 put_disk(disk);
3746 mddev->gendisk = NULL;
3747 mddev_put(mddev);
3748 }
3749}
3750
3751module_init(md_init)
3752module_exit(md_exit)
3753
3754EXPORT_SYMBOL(register_md_personality);
3755EXPORT_SYMBOL(unregister_md_personality);
3756EXPORT_SYMBOL(md_error);
3757EXPORT_SYMBOL(md_done_sync);
3758EXPORT_SYMBOL(md_write_start);
3759EXPORT_SYMBOL(md_write_end);
3760EXPORT_SYMBOL(md_handle_safemode);
3761EXPORT_SYMBOL(md_register_thread);
3762EXPORT_SYMBOL(md_unregister_thread);
3763EXPORT_SYMBOL(md_wakeup_thread);
3764EXPORT_SYMBOL(md_print_devices);
3765EXPORT_SYMBOL(md_check_recovery);
3766MODULE_LICENSE("GPL");
diff --git a/drivers/md/mktables.c b/drivers/md/mktables.c
new file mode 100644
index 000000000000..adef299908cf
--- /dev/null
+++ b/drivers/md/mktables.c
@@ -0,0 +1,125 @@
1#ident "$Id: mktables.c,v 1.2 2002/12/12 22:41:27 hpa Exp $"
2/* ----------------------------------------------------------------------- *
3 *
4 * Copyright 2002 H. Peter Anvin - All Rights Reserved
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, Inc., 59 Temple Place Ste 330,
9 * Boston MA 02111-1307, USA; either version 2 of the License, or
10 * (at your option) any later version; incorporated herein by reference.
11 *
12 * ----------------------------------------------------------------------- */
13
14/*
15 * mktables.c
16 *
17 * Make RAID-6 tables. This is a host user space program to be run at
18 * compile time.
19 */
20
21#include <stdio.h>
22#include <string.h>
23#include <inttypes.h>
24#include <stdlib.h>
25#include <time.h>
26
27static uint8_t gfmul(uint8_t a, uint8_t b)
28{
29 uint8_t v = 0;
30
31 while ( b ) {
32 if ( b & 1 ) v ^= a;
33 a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
34 b >>= 1;
35 }
36 return v;
37}
38
39static uint8_t gfpow(uint8_t a, int b)
40{
41 uint8_t v = 1;
42
43 b %= 255;
44 if ( b < 0 )
45 b += 255;
46
47 while ( b ) {
48 if ( b & 1 ) v = gfmul(v,a);
49 a = gfmul(a,a);
50 b >>= 1;
51 }
52 return v;
53}
54
55int main(int argc, char *argv[])
56{
57 int i, j, k;
58 uint8_t v;
59 uint8_t exptbl[256], invtbl[256];
60
61 printf("#include \"raid6.h\"\n");
62
63 /* Compute multiplication table */
64 printf("\nconst u8 __attribute__((aligned(256)))\n"
65 "raid6_gfmul[256][256] =\n"
66 "{\n");
67 for ( i = 0 ; i < 256 ; i++ ) {
68 printf("\t{\n");
69 for ( j = 0 ; j < 256 ; j += 8 ) {
70 printf("\t\t");
71 for ( k = 0 ; k < 8 ; k++ ) {
72 printf("0x%02x, ", gfmul(i,j+k));
73 }
74 printf("\n");
75 }
76 printf("\t},\n");
77 }
78 printf("};\n");
79
80 /* Compute power-of-2 table (exponent) */
81 v = 1;
82 printf("\nconst u8 __attribute__((aligned(256)))\n"
83 "raid6_gfexp[256] =\n"
84 "{\n");
85 for ( i = 0 ; i < 256 ; i += 8 ) {
86 printf("\t");
87 for ( j = 0 ; j < 8 ; j++ ) {
88 exptbl[i+j] = v;
89 printf("0x%02x, ", v);
90 v = gfmul(v,2);
91 if ( v == 1 ) v = 0; /* For entry 255, not a real entry */
92 }
93 printf("\n");
94 }
95 printf("};\n");
96
97 /* Compute inverse table x^-1 == x^254 */
98 printf("\nconst u8 __attribute__((aligned(256)))\n"
99 "raid6_gfinv[256] =\n"
100 "{\n");
101 for ( i = 0 ; i < 256 ; i += 8 ) {
102 printf("\t");
103 for ( j = 0 ; j < 8 ; j++ ) {
104 invtbl[i+j] = v = gfpow(i+j,254);
105 printf("0x%02x, ", v);
106 }
107 printf("\n");
108 }
109 printf("};\n");
110
111 /* Compute inv(2^x + 1) (exponent-xor-inverse) table */
112 printf("\nconst u8 __attribute__((aligned(256)))\n"
113 "raid6_gfexi[256] =\n"
114 "{\n");
115 for ( i = 0 ; i < 256 ; i += 8 ) {
116 printf("\t");
117 for ( j = 0 ; j < 8 ; j++ ) {
118 printf("0x%02x, ", invtbl[exptbl[i+j]^1]);
119 }
120 printf("\n");
121 }
122 printf("};\n\n");
123
124 return 0;
125}
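The generator above works in GF(2^8) with the RAID-6 reduction polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d): gfmul() is shift-and-add multiplication with that reduction, and the inverse table uses a^-1 = a^254. As a minimal, hypothetical user-space sketch (not part of this commit or the kernel build), the following program reuses the same gfmul() logic to check the identity that raid6_gfinv encodes:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same shift-and-add multiply as above, reduced mod x^8+x^4+x^3+x^2+1. */
static uint8_t gfmul(uint8_t a, uint8_t b)
{
        uint8_t v = 0;

        while (b) {
                if (b & 1)
                        v ^= a;
                a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
                b >>= 1;
        }
        return v;
}

int main(void)
{
        int i, k;

        for (i = 1; i < 256; i++) {
                uint8_t inv = 1;

                for (k = 0; k < 254; k++)       /* inv = i^254, the multiplicative inverse */
                        inv = gfmul(inv, (uint8_t)i);
                assert(gfmul((uint8_t)i, inv) == 1);
        }
        printf("a * a^254 == 1 holds for all 255 non-zero elements of GF(2^8)\n");
        return 0;
}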
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
new file mode 100644
index 000000000000..c9b134cd1532
--- /dev/null
+++ b/drivers/md/multipath.c
@@ -0,0 +1,584 @@
1/*
2 * multipath.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
5 *
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7 *
8 * MULTIPATH management functions.
9 *
10 * derived from raid1.c.
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2, or (at your option)
15 * any later version.
16 *
17 * You should have received a copy of the GNU General Public License
18 * (for example /usr/src/linux/COPYING); if not, write to the Free
19 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 */
21
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/spinlock.h>
25#include <linux/raid/multipath.h>
26#include <linux/buffer_head.h>
27#include <asm/atomic.h>
28
29#define MAJOR_NR MD_MAJOR
30#define MD_DRIVER
31#define MD_PERSONALITY
32
33#define MAX_WORK_PER_DISK 128
34
35#define NR_RESERVED_BUFS 32
36
37
38static mdk_personality_t multipath_personality;
39
40
41static void *mp_pool_alloc(unsigned int __nocast gfp_flags, void *data)
42{
43 struct multipath_bh *mpb;
44 mpb = kmalloc(sizeof(*mpb), gfp_flags);
45 if (mpb)
46 memset(mpb, 0, sizeof(*mpb));
47 return mpb;
48}
49
50static void mp_pool_free(void *mpb, void *data)
51{
52 kfree(mpb);
53}
54
55static int multipath_map (multipath_conf_t *conf)
56{
57 int i, disks = conf->raid_disks;
58
59 /*
60 * Later we will do read balancing on the read side;
61 * for now we use the first available disk.
62 */
63
64 rcu_read_lock();
65 for (i = 0; i < disks; i++) {
66 mdk_rdev_t *rdev = conf->multipaths[i].rdev;
67 if (rdev && rdev->in_sync) {
68 atomic_inc(&rdev->nr_pending);
69 rcu_read_unlock();
70 return i;
71 }
72 }
73 rcu_read_unlock();
74
75 printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
76 return (-1);
77}
78
79static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
80{
81 unsigned long flags;
82 mddev_t *mddev = mp_bh->mddev;
83 multipath_conf_t *conf = mddev_to_conf(mddev);
84
85 spin_lock_irqsave(&conf->device_lock, flags);
86 list_add(&mp_bh->retry_list, &conf->retry_list);
87 spin_unlock_irqrestore(&conf->device_lock, flags);
88 md_wakeup_thread(mddev->thread);
89}
90
91
92/*
93 * multipath_end_bh_io() is called when we have finished servicing a multipathed
94 * operation and are ready to return a success/failure code to the buffer
95 * cache layer.
96 */
97static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
98{
99 struct bio *bio = mp_bh->master_bio;
100 multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
101
102 bio_endio(bio, bio->bi_size, err);
103 mempool_free(mp_bh, conf->pool);
104}
105
106int multipath_end_request(struct bio *bio, unsigned int bytes_done, int error)
107{
108 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
109 struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private);
110 multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
111 mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
112
113 if (bio->bi_size)
114 return 1;
115
116 if (uptodate)
117 multipath_end_bh_io(mp_bh, 0);
118 else if (!bio_rw_ahead(bio)) {
119 /*
120 * oops, IO error:
121 */
122 char b[BDEVNAME_SIZE];
123 md_error (mp_bh->mddev, rdev);
124 printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",
125 bdevname(rdev->bdev,b),
126 (unsigned long long)bio->bi_sector);
127 multipath_reschedule_retry(mp_bh);
128 } else
129 multipath_end_bh_io(mp_bh, error);
130 rdev_dec_pending(rdev, conf->mddev);
131 return 0;
132}
133
134static void unplug_slaves(mddev_t *mddev)
135{
136 multipath_conf_t *conf = mddev_to_conf(mddev);
137 int i;
138
139 rcu_read_lock();
140 for (i=0; i<mddev->raid_disks; i++) {
141 mdk_rdev_t *rdev = conf->multipaths[i].rdev;
142 if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) {
143 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
144
145 atomic_inc(&rdev->nr_pending);
146 rcu_read_unlock();
147
148 if (r_queue->unplug_fn)
149 r_queue->unplug_fn(r_queue);
150
151 rdev_dec_pending(rdev, mddev);
152 rcu_read_lock();
153 }
154 }
155 rcu_read_unlock();
156}
157
158static void multipath_unplug(request_queue_t *q)
159{
160 unplug_slaves(q->queuedata);
161}
162
163
164static int multipath_make_request (request_queue_t *q, struct bio * bio)
165{
166 mddev_t *mddev = q->queuedata;
167 multipath_conf_t *conf = mddev_to_conf(mddev);
168 struct multipath_bh * mp_bh;
169 struct multipath_info *multipath;
170
171 mp_bh = mempool_alloc(conf->pool, GFP_NOIO);
172
173 mp_bh->master_bio = bio;
174 mp_bh->mddev = mddev;
175
176 if (bio_data_dir(bio)==WRITE) {
177 disk_stat_inc(mddev->gendisk, writes);
178 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
179 } else {
180 disk_stat_inc(mddev->gendisk, reads);
181 disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio));
182 }
183
184 mp_bh->path = multipath_map(conf);
185 if (mp_bh->path < 0) {
186 bio_endio(bio, bio->bi_size, -EIO);
187 mempool_free(mp_bh, conf->pool);
188 return 0;
189 }
190 multipath = conf->multipaths + mp_bh->path;
191
192 mp_bh->bio = *bio;
193 mp_bh->bio.bi_sector += multipath->rdev->data_offset;
194 mp_bh->bio.bi_bdev = multipath->rdev->bdev;
195 mp_bh->bio.bi_rw |= (1 << BIO_RW_FAILFAST);
196 mp_bh->bio.bi_end_io = multipath_end_request;
197 mp_bh->bio.bi_private = mp_bh;
198 generic_make_request(&mp_bh->bio);
199 return 0;
200}
201
202static void multipath_status (struct seq_file *seq, mddev_t *mddev)
203{
204 multipath_conf_t *conf = mddev_to_conf(mddev);
205 int i;
206
207 seq_printf (seq, " [%d/%d] [", conf->raid_disks,
208 conf->working_disks);
209 for (i = 0; i < conf->raid_disks; i++)
210 seq_printf (seq, "%s",
211 conf->multipaths[i].rdev &&
212 conf->multipaths[i].rdev->in_sync ? "U" : "_");
213 seq_printf (seq, "]");
214}
215
216static int multipath_issue_flush(request_queue_t *q, struct gendisk *disk,
217 sector_t *error_sector)
218{
219 mddev_t *mddev = q->queuedata;
220 multipath_conf_t *conf = mddev_to_conf(mddev);
221 int i, ret = 0;
222
223 rcu_read_lock();
224 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
225 mdk_rdev_t *rdev = conf->multipaths[i].rdev;
226 if (rdev && !rdev->faulty) {
227 struct block_device *bdev = rdev->bdev;
228 request_queue_t *r_queue = bdev_get_queue(bdev);
229
230 if (!r_queue->issue_flush_fn)
231 ret = -EOPNOTSUPP;
232 else {
233 atomic_inc(&rdev->nr_pending);
234 rcu_read_unlock();
235 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
236 error_sector);
237 rdev_dec_pending(rdev, mddev);
238 rcu_read_lock();
239 }
240 }
241 }
242 rcu_read_unlock();
243 return ret;
244}
245
246/*
247 * Careful, this can execute in IRQ contexts as well!
248 */
249static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
250{
251 multipath_conf_t *conf = mddev_to_conf(mddev);
252
253 if (conf->working_disks <= 1) {
254 /*
255 * Uh oh, we can do nothing if this is our last path, but
256 * first check if this is a queued request for a device
257 * which has just failed.
258 */
259 printk(KERN_ALERT
260 "multipath: only one IO path left and IO error.\n");
261 /* leave it active... it's all we have */
262 } else {
263 /*
264 * Mark disk as unusable
265 */
266 if (!rdev->faulty) {
267 char b[BDEVNAME_SIZE];
268 rdev->in_sync = 0;
269 rdev->faulty = 1;
270 mddev->sb_dirty = 1;
271 conf->working_disks--;
272 printk(KERN_ALERT "multipath: IO failure on %s,"
273 " disabling IO path. \n Operation continuing"
274 " on %d IO paths.\n",
275 bdevname (rdev->bdev,b),
276 conf->working_disks);
277 }
278 }
279}
280
281static void print_multipath_conf (multipath_conf_t *conf)
282{
283 int i;
284 struct multipath_info *tmp;
285
286 printk("MULTIPATH conf printout:\n");
287 if (!conf) {
288 printk("(conf==NULL)\n");
289 return;
290 }
291 printk(" --- wd:%d rd:%d\n", conf->working_disks,
292 conf->raid_disks);
293
294 for (i = 0; i < conf->raid_disks; i++) {
295 char b[BDEVNAME_SIZE];
296 tmp = conf->multipaths + i;
297 if (tmp->rdev)
298 printk(" disk%d, o:%d, dev:%s\n",
299 i,!tmp->rdev->faulty,
300 bdevname(tmp->rdev->bdev,b));
301 }
302}
303
304
305static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
306{
307 multipath_conf_t *conf = mddev->private;
308 int found = 0;
309 int path;
310 struct multipath_info *p;
311
312 print_multipath_conf(conf);
313
314 for (path=0; path<mddev->raid_disks; path++)
315 if ((p=conf->multipaths+path)->rdev == NULL) {
316 blk_queue_stack_limits(mddev->queue,
317 rdev->bdev->bd_disk->queue);
318
319 /* as we don't honour merge_bvec_fn, we must never risk
320 * violating it, so limit ->max_sectors to one PAGE, as
321 * a one page request is never in violation.
322 * (Note: it is very unlikely that a device with
323 * merge_bvec_fn will be involved in multipath.)
324 */
325 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
326 mddev->queue->max_sectors > (PAGE_SIZE>>9))
327 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
328
329 conf->working_disks++;
330 rdev->raid_disk = path;
331 rdev->in_sync = 1;
332 p->rdev = rdev;
333 found = 1;
334 }
335
336 print_multipath_conf(conf);
337 return found;
338}
339
340static int multipath_remove_disk(mddev_t *mddev, int number)
341{
342 multipath_conf_t *conf = mddev->private;
343 int err = 0;
344 mdk_rdev_t *rdev;
345 struct multipath_info *p = conf->multipaths + number;
346
347 print_multipath_conf(conf);
348
349 rdev = p->rdev;
350 if (rdev) {
351 if (rdev->in_sync ||
352 atomic_read(&rdev->nr_pending)) {
353 printk(KERN_ERR "hot-remove-disk, slot %d is identified" " but is still operational!\n", number);
354 err = -EBUSY;
355 goto abort;
356 }
357 p->rdev = NULL;
358 synchronize_kernel();
359 if (atomic_read(&rdev->nr_pending)) {
360 /* lost the race, try later */
361 err = -EBUSY;
362 p->rdev = rdev;
363 }
364 }
365abort:
366
367 print_multipath_conf(conf);
368 return err;
369}
370
371
372
373/*
374 * This is a kernel thread which:
375 *
376 * 1. Retries failed read operations on working multipaths.
377 * 2. Updates the raid superblock when problems are encountered.
378 * 3. Performs writes following reads for array synchronising.
379 */
380
381static void multipathd (mddev_t *mddev)
382{
383 struct multipath_bh *mp_bh;
384 struct bio *bio;
385 unsigned long flags;
386 multipath_conf_t *conf = mddev_to_conf(mddev);
387 struct list_head *head = &conf->retry_list;
388
389 md_check_recovery(mddev);
390 for (;;) {
391 char b[BDEVNAME_SIZE];
392 spin_lock_irqsave(&conf->device_lock, flags);
393 if (list_empty(head))
394 break;
395 mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
396 list_del(head->prev);
397 spin_unlock_irqrestore(&conf->device_lock, flags);
398
399 bio = &mp_bh->bio;
400 bio->bi_sector = mp_bh->master_bio->bi_sector;
401
402 if ((mp_bh->path = multipath_map (conf))<0) {
403 printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
404 " error for block %llu\n",
405 bdevname(bio->bi_bdev,b),
406 (unsigned long long)bio->bi_sector);
407 multipath_end_bh_io(mp_bh, -EIO);
408 } else {
409 printk(KERN_ERR "multipath: %s: redirecting sector %llu"
410 " to another IO path\n",
411 bdevname(bio->bi_bdev,b),
412 (unsigned long long)bio->bi_sector);
413 *bio = *(mp_bh->master_bio);
414 bio->bi_sector += conf->multipaths[mp_bh->path].rdev->data_offset;
415 bio->bi_bdev = conf->multipaths[mp_bh->path].rdev->bdev;
416 bio->bi_rw |= (1 << BIO_RW_FAILFAST);
417 bio->bi_end_io = multipath_end_request;
418 bio->bi_private = mp_bh;
419 generic_make_request(bio);
420 }
421 }
422 spin_unlock_irqrestore(&conf->device_lock, flags);
423}
424
425static int multipath_run (mddev_t *mddev)
426{
427 multipath_conf_t *conf;
428 int disk_idx;
429 struct multipath_info *disk;
430 mdk_rdev_t *rdev;
431 struct list_head *tmp;
432
433 if (mddev->level != LEVEL_MULTIPATH) {
434 printk("multipath: %s: raid level not set to multipath IO (%d)\n",
435 mdname(mddev), mddev->level);
436 goto out;
437 }
438 /*
439 * copy the already verified devices into our private MULTIPATH
440 * bookkeeping area. [whatever we allocate in multipath_run(),
441 * should be freed in multipath_stop()]
442 */
443
444 conf = kmalloc(sizeof(multipath_conf_t), GFP_KERNEL);
445 mddev->private = conf;
446 if (!conf) {
447 printk(KERN_ERR
448 "multipath: couldn't allocate memory for %s\n",
449 mdname(mddev));
450 goto out;
451 }
452 memset(conf, 0, sizeof(*conf));
453
454 conf->multipaths = kmalloc(sizeof(struct multipath_info)*mddev->raid_disks,
455 GFP_KERNEL);
456 if (!conf->multipaths) {
457 printk(KERN_ERR
458 "multipath: couldn't allocate memory for %s\n",
459 mdname(mddev));
460 goto out_free_conf;
461 }
462 memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks);
463
464 mddev->queue->unplug_fn = multipath_unplug;
465
466 mddev->queue->issue_flush_fn = multipath_issue_flush;
467
468 conf->working_disks = 0;
469 ITERATE_RDEV(mddev,rdev,tmp) {
470 disk_idx = rdev->raid_disk;
471 if (disk_idx < 0 ||
472 disk_idx >= mddev->raid_disks)
473 continue;
474
475 disk = conf->multipaths + disk_idx;
476 disk->rdev = rdev;
477
478 blk_queue_stack_limits(mddev->queue,
479 rdev->bdev->bd_disk->queue);
480 /* as we don't honour merge_bvec_fn, we must never risk
481 * violating it, not that we ever expect a device with
482 * a merge_bvec_fn to be involved in multipath */
483 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
484 mddev->queue->max_sectors > (PAGE_SIZE>>9))
485 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
486
487 if (!rdev->faulty)
488 conf->working_disks++;
489 }
490
491 conf->raid_disks = mddev->raid_disks;
492 mddev->sb_dirty = 1;
493 conf->mddev = mddev;
494 spin_lock_init(&conf->device_lock);
495 INIT_LIST_HEAD(&conf->retry_list);
496
497 if (!conf->working_disks) {
498 printk(KERN_ERR "multipath: no operational IO paths for %s\n",
499 mdname(mddev));
500 goto out_free_conf;
501 }
502 mddev->degraded = conf->raid_disks - conf->working_disks;
503
504 conf->pool = mempool_create(NR_RESERVED_BUFS,
505 mp_pool_alloc, mp_pool_free,
506 NULL);
507 if (conf->pool == NULL) {
508 printk(KERN_ERR
509 "multipath: couldn't allocate memory for %s\n",
510 mdname(mddev));
511 goto out_free_conf;
512 }
513
514 {
515 mddev->thread = md_register_thread(multipathd, mddev, "%s_multipath");
516 if (!mddev->thread) {
517 printk(KERN_ERR "multipath: couldn't allocate thread"
518 " for %s\n", mdname(mddev));
519 goto out_free_conf;
520 }
521 }
522
523 printk(KERN_INFO
524 "multipath: array %s active with %d out of %d IO paths\n",
525 mdname(mddev), conf->working_disks, mddev->raid_disks);
526 /*
527 * Ok, everything is just fine now
528 */
529 mddev->array_size = mddev->size;
530 return 0;
531
532out_free_conf:
533 if (conf->pool)
534 mempool_destroy(conf->pool);
535 if (conf->multipaths)
536 kfree(conf->multipaths);
537 kfree(conf);
538 mddev->private = NULL;
539out:
540 return -EIO;
541}
542
543
544static int multipath_stop (mddev_t *mddev)
545{
546 multipath_conf_t *conf = mddev_to_conf(mddev);
547
548 md_unregister_thread(mddev->thread);
549 mddev->thread = NULL;
550 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
551 mempool_destroy(conf->pool);
552 kfree(conf->multipaths);
553 kfree(conf);
554 mddev->private = NULL;
555 return 0;
556}
557
558static mdk_personality_t multipath_personality=
559{
560 .name = "multipath",
561 .owner = THIS_MODULE,
562 .make_request = multipath_make_request,
563 .run = multipath_run,
564 .stop = multipath_stop,
565 .status = multipath_status,
566 .error_handler = multipath_error,
567 .hot_add_disk = multipath_add_disk,
568 .hot_remove_disk= multipath_remove_disk,
569};
570
571static int __init multipath_init (void)
572{
573 return register_md_personality (MULTIPATH, &multipath_personality);
574}
575
576static void __exit multipath_exit (void)
577{
578 unregister_md_personality (MULTIPATH);
579}
580
581module_init(multipath_init);
582module_exit(multipath_exit);
583MODULE_LICENSE("GPL");
584MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
new file mode 100644
index 000000000000..e7d934eca06f
--- /dev/null
+++ b/drivers/md/raid0.c
@@ -0,0 +1,539 @@
1/*
2 raid0.c : Multiple Devices driver for Linux
3 Copyright (C) 1994-96 Marc ZYNGIER
4 <zyngier@ufr-info-p7.ibp.fr> or
5 <maz@gloups.fdn.fr>
6 Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
7
8
9 RAID-0 management functions.
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 You should have received a copy of the GNU General Public License
17 (for example /usr/src/linux/COPYING); if not, write to the Free
18 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19*/
20
21#include <linux/module.h>
22#include <linux/raid/raid0.h>
23
24#define MAJOR_NR MD_MAJOR
25#define MD_DRIVER
26#define MD_PERSONALITY
27
28static void raid0_unplug(request_queue_t *q)
29{
30 mddev_t *mddev = q->queuedata;
31 raid0_conf_t *conf = mddev_to_conf(mddev);
32 mdk_rdev_t **devlist = conf->strip_zone[0].dev;
33 int i;
34
35 for (i=0; i<mddev->raid_disks; i++) {
36 request_queue_t *r_queue = bdev_get_queue(devlist[i]->bdev);
37
38 if (r_queue->unplug_fn)
39 r_queue->unplug_fn(r_queue);
40 }
41}
42
43static int raid0_issue_flush(request_queue_t *q, struct gendisk *disk,
44 sector_t *error_sector)
45{
46 mddev_t *mddev = q->queuedata;
47 raid0_conf_t *conf = mddev_to_conf(mddev);
48 mdk_rdev_t **devlist = conf->strip_zone[0].dev;
49 int i, ret = 0;
50
51 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
52 struct block_device *bdev = devlist[i]->bdev;
53 request_queue_t *r_queue = bdev_get_queue(bdev);
54
55 if (!r_queue->issue_flush_fn)
56 ret = -EOPNOTSUPP;
57 else
58 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, error_sector);
59 }
60 return ret;
61}
62
63
64static int create_strip_zones (mddev_t *mddev)
65{
66 int i, c, j;
67 sector_t current_offset, curr_zone_offset;
68 sector_t min_spacing;
69 raid0_conf_t *conf = mddev_to_conf(mddev);
70 mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
71 struct list_head *tmp1, *tmp2;
72 struct strip_zone *zone;
73 int cnt;
74 char b[BDEVNAME_SIZE];
75
76 /*
77 * The number of 'same size groups'
78 */
79 conf->nr_strip_zones = 0;
80
81 ITERATE_RDEV(mddev,rdev1,tmp1) {
82 printk("raid0: looking at %s\n",
83 bdevname(rdev1->bdev,b));
84 c = 0;
85 ITERATE_RDEV(mddev,rdev2,tmp2) {
86 printk("raid0: comparing %s(%llu)",
87 bdevname(rdev1->bdev,b),
88 (unsigned long long)rdev1->size);
89 printk(" with %s(%llu)\n",
90 bdevname(rdev2->bdev,b),
91 (unsigned long long)rdev2->size);
92 if (rdev2 == rdev1) {
93 printk("raid0: END\n");
94 break;
95 }
96 if (rdev2->size == rdev1->size)
97 {
98 /*
99 * Not unique, don't count it as a new
100 * group
101 */
102 printk("raid0: EQUAL\n");
103 c = 1;
104 break;
105 }
106 printk("raid0: NOT EQUAL\n");
107 }
108 if (!c) {
109 printk("raid0: ==> UNIQUE\n");
110 conf->nr_strip_zones++;
111 printk("raid0: %d zones\n", conf->nr_strip_zones);
112 }
113 }
114 printk("raid0: FINAL %d zones\n", conf->nr_strip_zones);
115
116 conf->strip_zone = kmalloc(sizeof(struct strip_zone)*
117 conf->nr_strip_zones, GFP_KERNEL);
118 if (!conf->strip_zone)
119 return 1;
120 conf->devlist = kmalloc(sizeof(mdk_rdev_t*)*
121 conf->nr_strip_zones*mddev->raid_disks,
122 GFP_KERNEL);
123 if (!conf->devlist)
124 return 1;
125
126 memset(conf->strip_zone, 0,sizeof(struct strip_zone)*
127 conf->nr_strip_zones);
128 memset(conf->devlist, 0,
129 sizeof(mdk_rdev_t*) * conf->nr_strip_zones * mddev->raid_disks);
130
131 /* The first zone must contain all devices, so here we check that
132 * there is a proper alignment of slots to devices and find them all
133 */
134 zone = &conf->strip_zone[0];
135 cnt = 0;
136 smallest = NULL;
137 zone->dev = conf->devlist;
138 ITERATE_RDEV(mddev, rdev1, tmp1) {
139 int j = rdev1->raid_disk;
140
141 if (j < 0 || j >= mddev->raid_disks) {
142 printk("raid0: bad disk number %d - aborting!\n", j);
143 goto abort;
144 }
145 if (zone->dev[j]) {
146 printk("raid0: multiple devices for %d - aborting!\n",
147 j);
148 goto abort;
149 }
150 zone->dev[j] = rdev1;
151
152 blk_queue_stack_limits(mddev->queue,
153 rdev1->bdev->bd_disk->queue);
154 /* as we don't honour merge_bvec_fn, we must never risk
155 * violating it, so limit ->max_sectors to one PAGE, as
156 * a one page request is never in violation.
157 */
158
159 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
160 mddev->queue->max_sectors > (PAGE_SIZE>>9))
161 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
162
163 if (!smallest || (rdev1->size <smallest->size))
164 smallest = rdev1;
165 cnt++;
166 }
167 if (cnt != mddev->raid_disks) {
168 printk("raid0: too few disks (%d of %d) - aborting!\n",
169 cnt, mddev->raid_disks);
170 goto abort;
171 }
172 zone->nb_dev = cnt;
173 zone->size = smallest->size * cnt;
174 zone->zone_offset = 0;
175
176 current_offset = smallest->size;
177 curr_zone_offset = zone->size;
178
179 /* now do the other zones */
180 for (i = 1; i < conf->nr_strip_zones; i++)
181 {
182 zone = conf->strip_zone + i;
183 zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks;
184
185 printk("raid0: zone %d\n", i);
186 zone->dev_offset = current_offset;
187 smallest = NULL;
188 c = 0;
189
190 for (j=0; j<cnt; j++) {
191 char b[BDEVNAME_SIZE];
192 rdev = conf->strip_zone[0].dev[j];
193 printk("raid0: checking %s ...", bdevname(rdev->bdev,b));
194 if (rdev->size > current_offset)
195 {
196 printk(" contained as device %d\n", c);
197 zone->dev[c] = rdev;
198 c++;
199 if (!smallest || (rdev->size <smallest->size)) {
200 smallest = rdev;
201 printk(" (%llu) is smallest!.\n",
202 (unsigned long long)rdev->size);
203 }
204 } else
205 printk(" nope.\n");
206 }
207
208 zone->nb_dev = c;
209 zone->size = (smallest->size - current_offset) * c;
210 printk("raid0: zone->nb_dev: %d, size: %llu\n",
211 zone->nb_dev, (unsigned long long)zone->size);
212
213 zone->zone_offset = curr_zone_offset;
214 curr_zone_offset += zone->size;
215
216 current_offset = smallest->size;
217 printk("raid0: current zone offset: %llu\n",
218 (unsigned long long)current_offset);
219 }
220
221 /* Now find appropriate hash spacing.
222 * We want a number which causes most hash entries to cover
223 * at most two strips, but the hash table must be at most
224 * 1 PAGE. We choose the smallest strip, or contiguous collection
225 * of strips, that has big enough size. We never consider the last
226 * strip though, as its size has no bearing on the efficacy of the hash
227 * table.
228 */
229 conf->hash_spacing = curr_zone_offset;
230 min_spacing = curr_zone_offset;
231 sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*));
232 for (i=0; i < conf->nr_strip_zones-1; i++) {
233 sector_t sz = 0;
234 for (j=i; j<conf->nr_strip_zones-1 &&
235 sz < min_spacing ; j++)
236 sz += conf->strip_zone[j].size;
237 if (sz >= min_spacing && sz < conf->hash_spacing)
238 conf->hash_spacing = sz;
239 }
240
241 mddev->queue->unplug_fn = raid0_unplug;
242
243 mddev->queue->issue_flush_fn = raid0_issue_flush;
244
245 printk("raid0: done.\n");
246 return 0;
247 abort:
248 return 1;
249}
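The hash-spacing reasoning in the comment above is easier to follow with numbers. This is a toy user-space sketch (hypothetical zone sizes; the names and values are illustrative, not kernel code) that mirrors the spacing loop in create_strip_zones():

#include <stdio.h>

int main(void)
{
        /* Hypothetical zone sizes in 1K blocks; the kernel takes these from strip_zone[].size. */
        unsigned long long zone_size[] = { 1000, 600, 200 };
        int nr_zones = 3, i, j;
        unsigned long long total = 0, spacing, min_spacing;

        for (i = 0; i < nr_zones; i++)
                total += zone_size[i];

        /* The spacing may not drop below total / (pointers that fit in one 4K page),
         * so the hash table itself never outgrows a page. */
        spacing = total;
        min_spacing = total / (4096 / sizeof(void *));
        for (i = 0; i < nr_zones - 1; i++) {
                unsigned long long sz = 0;

                for (j = i; j < nr_zones - 1 && sz < min_spacing; j++)
                        sz += zone_size[j];
                if (sz >= min_spacing && sz < spacing)
                        spacing = sz;
        }
        /* With these numbers: total = 1800, min_spacing = 3, spacing ends up 600,
         * and raid0_run() will size the hash table at roughly 1800/600 = 3 entries. */
        printf("hash_spacing = %llu of %llu total blocks\n", spacing, total);
        return 0;
}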
250
251/**
252 * raid0_mergeable_bvec -- tell the bio layer if two requests can be merged
253 * @q: request queue
254 * @bio: the buffer head that's been built up so far
255 * @biovec: the request that could be merged to it.
256 *
257 * Return amount of bytes we can accept at this offset
258 */
259static int raid0_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio_vec *biovec)
260{
261 mddev_t *mddev = q->queuedata;
262 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
263 int max;
264 unsigned int chunk_sectors = mddev->chunk_size >> 9;
265 unsigned int bio_sectors = bio->bi_size >> 9;
266
267 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
268 if (max < 0) max = 0; /* bio_add cannot handle a negative return */
269 if (max <= biovec->bv_len && bio_sectors == 0)
270 return biovec->bv_len;
271 else
272 return max;
273}
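A worked example with hypothetical numbers: for a 64 KiB chunk size, chunk_sectors is 128. A bio that starts on a chunk boundary and already covers 120 sectors is offered max = (128 - 120) << 9 = 4096 bytes, i.e. one more page before the chunk boundary; an empty bio whose single biovec would cross the boundary is accepted anyway (the bv_len return) and raid0_make_request() splits it later.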
274
275static int raid0_run (mddev_t *mddev)
276{
277 unsigned cur=0, i=0, nb_zone;
278 s64 size;
279 raid0_conf_t *conf;
280 mdk_rdev_t *rdev;
281 struct list_head *tmp;
282
283 printk("%s: setting max_sectors to %d, segment boundary to %d\n",
284 mdname(mddev),
285 mddev->chunk_size >> 9,
286 (mddev->chunk_size>>1)-1);
287 blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
288 blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);
289
290 conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL);
291 if (!conf)
292 goto out;
293 mddev->private = (void *)conf;
294
295 conf->strip_zone = NULL;
296 conf->devlist = NULL;
297 if (create_strip_zones (mddev))
298 goto out_free_conf;
299
300 /* calculate array device size */
301 mddev->array_size = 0;
302 ITERATE_RDEV(mddev,rdev,tmp)
303 mddev->array_size += rdev->size;
304
305 printk("raid0 : md_size is %llu blocks.\n",
306 (unsigned long long)mddev->array_size);
307 printk("raid0 : conf->hash_spacing is %llu blocks.\n",
308 (unsigned long long)conf->hash_spacing);
309 {
310#if __GNUC__ < 3
311 volatile
312#endif
313 sector_t s = mddev->array_size;
314 sector_t space = conf->hash_spacing;
315 int round;
316 conf->preshift = 0;
317 if (sizeof(sector_t) > sizeof(unsigned long)) {
318 /*shift down space and s so that sector_div will work */
319 while (space > (sector_t) (~(unsigned long)0)) {
320 s >>= 1;
321 space >>= 1;
322 s += 1; /* force round-up */
323 conf->preshift++;
324 }
325 }
326 round = sector_div(s, (unsigned long)space) ? 1 : 0;
327 nb_zone = s + round;
328 }
329 printk("raid0 : nb_zone is %d.\n", nb_zone);
330
331 printk("raid0 : Allocating %Zd bytes for hash.\n",
332 nb_zone*sizeof(struct strip_zone*));
333 conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL);
334 if (!conf->hash_table)
335 goto out_free_conf;
336 size = conf->strip_zone[cur].size;
337
338 for (i=0; i< nb_zone; i++) {
339 conf->hash_table[i] = conf->strip_zone + cur;
340 while (size <= conf->hash_spacing) {
341 cur++;
342 size += conf->strip_zone[cur].size;
343 }
344 size -= conf->hash_spacing;
345 }
346 if (conf->preshift) {
347 conf->hash_spacing >>= conf->preshift;
348 /* round hash_spacing up so when we divide by it, we
349 * err on the side of too-low, which is safest
350 */
351 conf->hash_spacing++;
352 }
353
354 /* calculate the max read-ahead size.
355 * For read-ahead of large files to be effective, we need to
356 * readahead at least twice a whole stripe. i.e. number of devices
357 * multiplied by chunk size times 2.
358 * If an individual device has an ra_pages greater than the
359 * chunk size, then we will not drive that device as hard as it
360 * wants. We consider this a configuration error: a larger
361 * chunksize should be used in that case.
362 */
363 {
364 int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE;
365 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
366 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
367 }
368
369
370 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
371 return 0;
372
373out_free_conf:
374 if (conf->strip_zone)
375 kfree(conf->strip_zone);
376 if (conf->devlist)
377 kfree (conf->devlist);
378 kfree(conf);
379 mddev->private = NULL;
380out:
381 return 1;
382}
383
384static int raid0_stop (mddev_t *mddev)
385{
386 raid0_conf_t *conf = mddev_to_conf(mddev);
387
388 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
389 kfree (conf->hash_table);
390 conf->hash_table = NULL;
391 kfree (conf->strip_zone);
392 conf->strip_zone = NULL;
393 kfree (conf);
394 mddev->private = NULL;
395
396 return 0;
397}
398
399static int raid0_make_request (request_queue_t *q, struct bio *bio)
400{
401 mddev_t *mddev = q->queuedata;
402 unsigned int sect_in_chunk, chunksize_bits, chunk_size, chunk_sects;
403 raid0_conf_t *conf = mddev_to_conf(mddev);
404 struct strip_zone *zone;
405 mdk_rdev_t *tmp_dev;
406 unsigned long chunk;
407 sector_t block, rsect;
408
409 if (bio_data_dir(bio)==WRITE) {
410 disk_stat_inc(mddev->gendisk, writes);
411 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
412 } else {
413 disk_stat_inc(mddev->gendisk, reads);
414 disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio));
415 }
416
417 chunk_size = mddev->chunk_size >> 10;
418 chunk_sects = mddev->chunk_size >> 9;
419 chunksize_bits = ffz(~chunk_size);
420 block = bio->bi_sector >> 1;
421
422
423 if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) {
424 struct bio_pair *bp;
425 /* Sanity check -- queue functions should prevent this happening */
426 if (bio->bi_vcnt != 1 ||
427 bio->bi_idx != 0)
428 goto bad_map;
429 /* This is a one page bio that upper layers
430 * refuse to split for us, so we need to split it.
431 */
432 bp = bio_split(bio, bio_split_pool, chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
433 if (raid0_make_request(q, &bp->bio1))
434 generic_make_request(&bp->bio1);
435 if (raid0_make_request(q, &bp->bio2))
436 generic_make_request(&bp->bio2);
437
438 bio_pair_release(bp);
439 return 0;
440 }
441
442
443 {
444#if __GNUC__ < 3
445 volatile
446#endif
447 sector_t x = block >> conf->preshift;
448 sector_div(x, (unsigned long)conf->hash_spacing);
449 zone = conf->hash_table[x];
450 }
451
452 while (block >= (zone->zone_offset + zone->size))
453 zone++;
454
455 sect_in_chunk = bio->bi_sector & ((chunk_size<<1) -1);
456
457
458 {
459 sector_t x = (block - zone->zone_offset) >> chunksize_bits;
460
461 sector_div(x, zone->nb_dev);
462 chunk = x;
463 BUG_ON(x != (sector_t)chunk);
464
465 x = block >> chunksize_bits;
466 tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
467 }
468 rsect = (((chunk << chunksize_bits) + zone->dev_offset)<<1)
469 + sect_in_chunk;
470
471 bio->bi_bdev = tmp_dev->bdev;
472 bio->bi_sector = rsect + tmp_dev->data_offset;
473
474 /*
475 * Let the main block layer submit the IO and resolve recursion:
476 */
477 return 1;
478
479bad_map:
480 printk("raid0_make_request bug: can't convert block across chunks"
481 " or bigger than %dk %llu %d\n", chunk_size,
482 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
483
484 bio_io_error(bio, bio->bi_size);
485 return 0;
486}
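To make the mapping arithmetic concrete (hypothetical values): with 64 KiB chunks (chunksize_bits = 6, chunk_sects = 128) and a three-disk zone whose zone_offset and dev_offset are 0, array sector 1000 is 1K block 500, which lies in stripe chunk 7. The second sector_div() selects member disk 7 % 3 = 1, the first gives per-disk chunk 7 / 3 = 2, sect_in_chunk is 1000 & 127 = 104, and rsect = (((2 << 6) + 0) << 1) + 104 = 256 + 104 = 360, the sector on that member before data_offset is added.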
487
488static void raid0_status (struct seq_file *seq, mddev_t *mddev)
489{
490#undef MD_DEBUG
491#ifdef MD_DEBUG
492 int j, k, h;
493 char b[BDEVNAME_SIZE];
494 raid0_conf_t *conf = mddev_to_conf(mddev);
495
496 h = 0;
497 for (j = 0; j < conf->nr_strip_zones; j++) {
498 seq_printf(seq, " z%d", j);
499 if (conf->hash_table[h] == conf->strip_zone+j)
500 seq_printf(seq, "(h%d)", h++);
501 seq_printf(seq, "=[");
502 for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
503 seq_printf (seq, "%s/", bdevname(
504 conf->strip_zone[j].dev[k]->bdev,b));
505
506 seq_printf (seq, "] zo=%d do=%d s=%d\n",
507 conf->strip_zone[j].zone_offset,
508 conf->strip_zone[j].dev_offset,
509 conf->strip_zone[j].size);
510 }
511#endif
512 seq_printf(seq, " %dk chunks", mddev->chunk_size/1024);
513 return;
514}
515
516static mdk_personality_t raid0_personality=
517{
518 .name = "raid0",
519 .owner = THIS_MODULE,
520 .make_request = raid0_make_request,
521 .run = raid0_run,
522 .stop = raid0_stop,
523 .status = raid0_status,
524};
525
526static int __init raid0_init (void)
527{
528 return register_md_personality (RAID0, &raid0_personality);
529}
530
531static void raid0_exit (void)
532{
533 unregister_md_personality (RAID0);
534}
535
536module_init(raid0_init);
537module_exit(raid0_exit);
538MODULE_LICENSE("GPL");
539MODULE_ALIAS("md-personality-2"); /* RAID0 */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
new file mode 100644
index 000000000000..a389394b52f6
--- /dev/null
+++ b/drivers/md/raid1.c
@@ -0,0 +1,1449 @@
1/*
2 * raid1.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
5 *
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7 *
8 * RAID-1 management functions.
9 *
10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11 *
12 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2, or (at your option)
18 * any later version.
19 *
20 * You should have received a copy of the GNU General Public License
21 * (for example /usr/src/linux/COPYING); if not, write to the Free
22 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25#include <linux/raid/raid1.h>
26
27/*
28 * Number of guaranteed r1bios in case of extreme VM load:
29 */
30#define NR_RAID1_BIOS 256
31
32static mdk_personality_t raid1_personality;
33
34static void unplug_slaves(mddev_t *mddev);
35
36
37static void * r1bio_pool_alloc(unsigned int __nocast gfp_flags, void *data)
38{
39 struct pool_info *pi = data;
40 r1bio_t *r1_bio;
41 int size = offsetof(r1bio_t, bios[pi->raid_disks]);
42
43 /* allocate a r1bio with room for raid_disks entries in the bios array */
44 r1_bio = kmalloc(size, gfp_flags);
45 if (r1_bio)
46 memset(r1_bio, 0, size);
47 else
48 unplug_slaves(pi->mddev);
49
50 return r1_bio;
51}
52
53static void r1bio_pool_free(void *r1_bio, void *data)
54{
55 kfree(r1_bio);
56}
57
58#define RESYNC_BLOCK_SIZE (64*1024)
59//#define RESYNC_BLOCK_SIZE PAGE_SIZE
60#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
61#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
62#define RESYNC_WINDOW (2048*1024)
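A quick check of the arithmetic, assuming 4 KiB pages: RESYNC_SECTORS works out to 65536 >> 9 = 128 and RESYNC_PAGES to 16, so each resync buffer allocated below carries 64 KiB of data spread over sixteen pages.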
63
64static void * r1buf_pool_alloc(unsigned int __nocast gfp_flags, void *data)
65{
66 struct pool_info *pi = data;
67 struct page *page;
68 r1bio_t *r1_bio;
69 struct bio *bio;
70 int i, j;
71
72 r1_bio = r1bio_pool_alloc(gfp_flags, pi);
73 if (!r1_bio) {
74 unplug_slaves(pi->mddev);
75 return NULL;
76 }
77
78 /*
79 * Allocate bios : 1 for reading, n-1 for writing
80 */
81 for (j = pi->raid_disks ; j-- ; ) {
82 bio = bio_alloc(gfp_flags, RESYNC_PAGES);
83 if (!bio)
84 goto out_free_bio;
85 r1_bio->bios[j] = bio;
86 }
87 /*
88 * Allocate RESYNC_PAGES data pages and attach them to
89 * the first bio;
90 */
91 bio = r1_bio->bios[0];
92 for (i = 0; i < RESYNC_PAGES; i++) {
93 page = alloc_page(gfp_flags);
94 if (unlikely(!page))
95 goto out_free_pages;
96
97 bio->bi_io_vec[i].bv_page = page;
98 }
99
100 r1_bio->master_bio = NULL;
101
102 return r1_bio;
103
104out_free_pages:
105 for ( ; i > 0 ; i--)
106 __free_page(bio->bi_io_vec[i-1].bv_page);
107out_free_bio:
108 while ( ++j < pi->raid_disks )
109 bio_put(r1_bio->bios[j]);
110 r1bio_pool_free(r1_bio, data);
111 return NULL;
112}
113
114static void r1buf_pool_free(void *__r1_bio, void *data)
115{
116 struct pool_info *pi = data;
117 int i;
118 r1bio_t *r1bio = __r1_bio;
119 struct bio *bio = r1bio->bios[0];
120
121 for (i = 0; i < RESYNC_PAGES; i++) {
122 __free_page(bio->bi_io_vec[i].bv_page);
123 bio->bi_io_vec[i].bv_page = NULL;
124 }
125 for (i=0 ; i < pi->raid_disks; i++)
126 bio_put(r1bio->bios[i]);
127
128 r1bio_pool_free(r1bio, data);
129}
130
131static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
132{
133 int i;
134
135 for (i = 0; i < conf->raid_disks; i++) {
136 struct bio **bio = r1_bio->bios + i;
137 if (*bio)
138 bio_put(*bio);
139 *bio = NULL;
140 }
141}
142
143static inline void free_r1bio(r1bio_t *r1_bio)
144{
145 unsigned long flags;
146
147 conf_t *conf = mddev_to_conf(r1_bio->mddev);
148
149 /*
150 * Wake up any possible resync thread that waits for the device
151 * to go idle.
152 */
153 spin_lock_irqsave(&conf->resync_lock, flags);
154 if (!--conf->nr_pending) {
155 wake_up(&conf->wait_idle);
156 wake_up(&conf->wait_resume);
157 }
158 spin_unlock_irqrestore(&conf->resync_lock, flags);
159
160 put_all_bios(conf, r1_bio);
161 mempool_free(r1_bio, conf->r1bio_pool);
162}
163
164static inline void put_buf(r1bio_t *r1_bio)
165{
166 conf_t *conf = mddev_to_conf(r1_bio->mddev);
167 unsigned long flags;
168
169 mempool_free(r1_bio, conf->r1buf_pool);
170
171 spin_lock_irqsave(&conf->resync_lock, flags);
172 if (!conf->barrier)
173 BUG();
174 --conf->barrier;
175 wake_up(&conf->wait_resume);
176 wake_up(&conf->wait_idle);
177
178 if (!--conf->nr_pending) {
179 wake_up(&conf->wait_idle);
180 wake_up(&conf->wait_resume);
181 }
182 spin_unlock_irqrestore(&conf->resync_lock, flags);
183}
184
185static void reschedule_retry(r1bio_t *r1_bio)
186{
187 unsigned long flags;
188 mddev_t *mddev = r1_bio->mddev;
189 conf_t *conf = mddev_to_conf(mddev);
190
191 spin_lock_irqsave(&conf->device_lock, flags);
192 list_add(&r1_bio->retry_list, &conf->retry_list);
193 spin_unlock_irqrestore(&conf->device_lock, flags);
194
195 md_wakeup_thread(mddev->thread);
196}
197
198/*
199 * raid_end_bio_io() is called when we have finished servicing a mirrored
200 * operation and are ready to return a success/failure code to the buffer
201 * cache layer.
202 */
203static void raid_end_bio_io(r1bio_t *r1_bio)
204{
205 struct bio *bio = r1_bio->master_bio;
206
207 bio_endio(bio, bio->bi_size,
208 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
209 free_r1bio(r1_bio);
210}
211
212/*
213 * Update disk head position estimator based on IRQ completion info.
214 */
215static inline void update_head_pos(int disk, r1bio_t *r1_bio)
216{
217 conf_t *conf = mddev_to_conf(r1_bio->mddev);
218
219 conf->mirrors[disk].head_position =
220 r1_bio->sector + (r1_bio->sectors);
221}
222
223static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
224{
225 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
226 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
227 int mirror;
228 conf_t *conf = mddev_to_conf(r1_bio->mddev);
229
230 if (bio->bi_size)
231 return 1;
232
233 mirror = r1_bio->read_disk;
234 /*
235 * this branch is our 'one mirror IO has finished' event handler:
236 */
237 if (!uptodate)
238 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
239 else
240 /*
241 * Set R1BIO_Uptodate in our master bio, so that
242 * we will return a good error code to the higher
243 * levels even if IO on some other mirrored buffer fails.
244 *
245 * The 'master' represents the composite IO operation to
246 * user-side. So if something waits for IO, then it will
247 * wait for the 'master' bio.
248 */
249 set_bit(R1BIO_Uptodate, &r1_bio->state);
250
251 update_head_pos(mirror, r1_bio);
252
253 /*
254 * we have only one bio on the read side
255 */
256 if (uptodate)
257 raid_end_bio_io(r1_bio);
258 else {
259 /*
260 * oops, read error:
261 */
262 char b[BDEVNAME_SIZE];
263 if (printk_ratelimit())
264 printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
265 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
266 reschedule_retry(r1_bio);
267 }
268
269 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
270 return 0;
271}
272
273static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
274{
275 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
276 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
277 int mirror;
278 conf_t *conf = mddev_to_conf(r1_bio->mddev);
279
280 if (bio->bi_size)
281 return 1;
282
283 for (mirror = 0; mirror < conf->raid_disks; mirror++)
284 if (r1_bio->bios[mirror] == bio)
285 break;
286
287 /*
288 * this branch is our 'one mirror IO has finished' event handler:
289 */
290 if (!uptodate)
291 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
292 else
293 /*
294 * Set R1BIO_Uptodate in our master bio, so that
295 * we will return a good error code to the higher
296 * levels even if IO on some other mirrored buffer fails.
297 *
298 * The 'master' represents the composite IO operation to
299 * user-side. So if something waits for IO, then it will
300 * wait for the 'master' bio.
301 */
302 set_bit(R1BIO_Uptodate, &r1_bio->state);
303
304 update_head_pos(mirror, r1_bio);
305
306 /*
307 *
308 * Let's see if all mirrored write operations have finished
309 * already.
310 */
311 if (atomic_dec_and_test(&r1_bio->remaining)) {
312 md_write_end(r1_bio->mddev);
313 raid_end_bio_io(r1_bio);
314 }
315
316 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
317 return 0;
318}
319
320
321/*
322 * This routine returns the disk from which the requested read should
323 * be done. There is a per-array 'next expected sequential IO' sector
324 * number - if this matches on the next IO then we use the last disk.
325 * There is also a per-disk 'last known head position' sector that is
326 * maintained from IRQ contexts; both the normal and the resync IO
327 * completion handlers update this position correctly. If there is no
328 * perfect sequential match then we pick the disk whose head is closest.
329 *
330 * If there are 2 mirrors in the same 2 devices, performance degrades
331 * because the head position is tracked per mirror, not per device.
332 *
333 * The rdev for the device selected will have nr_pending incremented.
334 */
335static int read_balance(conf_t *conf, r1bio_t *r1_bio)
336{
337 const unsigned long this_sector = r1_bio->sector;
338 int new_disk = conf->last_used, disk = new_disk;
339 const int sectors = r1_bio->sectors;
340 sector_t new_distance, current_distance;
341 mdk_rdev_t *new_rdev, *rdev;
342
343 rcu_read_lock();
344 /*
345 * Check if we can balance. We can balance on the whole
346 * device if no resync is going on, or below the resync window.
347 * We take the first readable disk when above the resync window.
348 */
349 retry:
350 if (conf->mddev->recovery_cp < MaxSector &&
351 (this_sector + sectors >= conf->next_resync)) {
352 /* Choose the first operational device, for consistency */
353 new_disk = 0;
354
355 while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
356 !new_rdev->in_sync) {
357 new_disk++;
358 if (new_disk == conf->raid_disks) {
359 new_disk = -1;
360 break;
361 }
362 }
363 goto rb_out;
364 }
365
366
367 /* make sure the disk is operational */
368 while ((new_rdev=conf->mirrors[new_disk].rdev) == NULL ||
369 !new_rdev->in_sync) {
370 if (new_disk <= 0)
371 new_disk = conf->raid_disks;
372 new_disk--;
373 if (new_disk == disk) {
374 new_disk = -1;
375 goto rb_out;
376 }
377 }
378 disk = new_disk;
379 /* now disk == new_disk == starting point for search */
380
381 /*
382 * Don't change to another disk for sequential reads:
383 */
384 if (conf->next_seq_sect == this_sector)
385 goto rb_out;
386 if (this_sector == conf->mirrors[new_disk].head_position)
387 goto rb_out;
388
389 current_distance = abs(this_sector - conf->mirrors[disk].head_position);
390
391 /* Find the disk whose head is closest */
392
393 do {
394 if (disk <= 0)
395 disk = conf->raid_disks;
396 disk--;
397
398 if ((rdev=conf->mirrors[disk].rdev) == NULL ||
399 !rdev->in_sync)
400 continue;
401
402 if (!atomic_read(&rdev->nr_pending)) {
403 new_disk = disk;
404 new_rdev = rdev;
405 break;
406 }
407 new_distance = abs(this_sector - conf->mirrors[disk].head_position);
408 if (new_distance < current_distance) {
409 current_distance = new_distance;
410 new_disk = disk;
411 new_rdev = rdev;
412 }
413 } while (disk != conf->last_used);
414
415rb_out:
416
417
418 if (new_disk >= 0) {
419 conf->next_seq_sect = this_sector + sectors;
420 conf->last_used = new_disk;
421 atomic_inc(&new_rdev->nr_pending);
422 if (!new_rdev->in_sync) {
423 /* cannot risk returning a device that failed
424 * before we inc'ed nr_pending
425 */
426 atomic_dec(&new_rdev->nr_pending);
427 goto retry;
428 }
429 }
430 rcu_read_unlock();
431
432 return new_disk;
433}
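The policy described in the comment above (prefer an idle disk, otherwise the one whose head is nearest) can be sketched in isolation. This is a simplified, hypothetical user-space model only; it ignores the resync window, the sequential-read shortcut and the last_used starting point:

#include <stdio.h>
#include <stdlib.h>

struct toy_mirror {
        long long head_position;        /* last known head position, in sectors */
        int in_sync;
        int nr_pending;                 /* outstanding requests on this member */
};

/* Return the mirror index to read 'sector' from, or -1 if none is usable. */
static int toy_read_balance(const struct toy_mirror *m, int n, long long sector)
{
        long long best_dist = 0;
        int best = -1, i;

        for (i = 0; i < n; i++) {
                long long dist;

                if (!m[i].in_sync)
                        continue;
                if (m[i].nr_pending == 0)
                        return i;       /* an idle member wins outright */
                dist = llabs(sector - m[i].head_position);
                if (best < 0 || dist < best_dist) {
                        best = i;
                        best_dist = dist;
                }
        }
        return best;
}

int main(void)
{
        struct toy_mirror m[2] = {
                { .head_position = 1024, .in_sync = 1, .nr_pending = 3 },
                { .head_position = 9000, .in_sync = 1, .nr_pending = 2 },
        };

        /* Both members are busy, so the head nearest sector 1100 (mirror 0) is chosen. */
        printf("read from mirror %d\n", toy_read_balance(m, 2, 1100));
        return 0;
}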
434
435static void unplug_slaves(mddev_t *mddev)
436{
437 conf_t *conf = mddev_to_conf(mddev);
438 int i;
439
440 rcu_read_lock();
441 for (i=0; i<mddev->raid_disks; i++) {
442 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
443 if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) {
444 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
445
446 atomic_inc(&rdev->nr_pending);
447 rcu_read_unlock();
448
449 if (r_queue->unplug_fn)
450 r_queue->unplug_fn(r_queue);
451
452 rdev_dec_pending(rdev, mddev);
453 rcu_read_lock();
454 }
455 }
456 rcu_read_unlock();
457}
458
459static void raid1_unplug(request_queue_t *q)
460{
461 unplug_slaves(q->queuedata);
462}
463
464static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
465 sector_t *error_sector)
466{
467 mddev_t *mddev = q->queuedata;
468 conf_t *conf = mddev_to_conf(mddev);
469 int i, ret = 0;
470
471 rcu_read_lock();
472 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
473 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
474 if (rdev && !rdev->faulty) {
475 struct block_device *bdev = rdev->bdev;
476 request_queue_t *r_queue = bdev_get_queue(bdev);
477
478 if (!r_queue->issue_flush_fn)
479 ret = -EOPNOTSUPP;
480 else {
481 atomic_inc(&rdev->nr_pending);
482 rcu_read_unlock();
483 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
484 error_sector);
485 rdev_dec_pending(rdev, mddev);
486 rcu_read_lock();
487 }
488 }
489 }
490 rcu_read_unlock();
491 return ret;
492}
493
494/*
495 * Throttle resync depth, so that we can both get proper overlapping of
496 * requests, but are still able to handle normal requests quickly.
497 */
498#define RESYNC_DEPTH 32
499
500static void device_barrier(conf_t *conf, sector_t sect)
501{
502 spin_lock_irq(&conf->resync_lock);
503 wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
504 conf->resync_lock, unplug_slaves(conf->mddev));
505
506 if (!conf->barrier++) {
507 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
508 conf->resync_lock, unplug_slaves(conf->mddev));
509 if (conf->nr_pending)
510 BUG();
511 }
512 wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
513 conf->resync_lock, unplug_slaves(conf->mddev));
514 conf->next_resync = sect;
515 spin_unlock_irq(&conf->resync_lock);
516}
517
518static int make_request(request_queue_t *q, struct bio * bio)
519{
520 mddev_t *mddev = q->queuedata;
521 conf_t *conf = mddev_to_conf(mddev);
522 mirror_info_t *mirror;
523 r1bio_t *r1_bio;
524 struct bio *read_bio;
525 int i, disks;
526 mdk_rdev_t *rdev;
527
528 /*
529 * Register the new request and wait if the reconstruction
530 * thread has put up a bar for new requests.
531 * Continue immediately if no resync is active currently.
532 */
533 spin_lock_irq(&conf->resync_lock);
534 wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
535 conf->nr_pending++;
536 spin_unlock_irq(&conf->resync_lock);
537
538 if (bio_data_dir(bio)==WRITE) {
539 disk_stat_inc(mddev->gendisk, writes);
540 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
541 } else {
542 disk_stat_inc(mddev->gendisk, reads);
543 disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio));
544 }
545
546 /*
547 * make_request() can abort the operation when READA is being
548 * used and no empty request is available.
549 *
550 */
551 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
552
553 r1_bio->master_bio = bio;
554 r1_bio->sectors = bio->bi_size >> 9;
555
556 r1_bio->mddev = mddev;
557 r1_bio->sector = bio->bi_sector;
558
559 r1_bio->state = 0;
560
561 if (bio_data_dir(bio) == READ) {
562 /*
563 * read balancing logic:
564 */
565 int rdisk = read_balance(conf, r1_bio);
566
567 if (rdisk < 0) {
568 /* couldn't find anywhere to read from */
569 raid_end_bio_io(r1_bio);
570 return 0;
571 }
572 mirror = conf->mirrors + rdisk;
573
574 r1_bio->read_disk = rdisk;
575
576 read_bio = bio_clone(bio, GFP_NOIO);
577
578 r1_bio->bios[rdisk] = read_bio;
579
580 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
581 read_bio->bi_bdev = mirror->rdev->bdev;
582 read_bio->bi_end_io = raid1_end_read_request;
583 read_bio->bi_rw = READ;
584 read_bio->bi_private = r1_bio;
585
586 generic_make_request(read_bio);
587 return 0;
588 }
589
590 /*
591 * WRITE:
592 */
593 /* first select target devices under spinlock and
594 * inc refcount on their rdev. Record them by setting
595 * bios[x] to bio
596 */
597 disks = conf->raid_disks;
598 rcu_read_lock();
599 for (i = 0; i < disks; i++) {
600 if ((rdev=conf->mirrors[i].rdev) != NULL &&
601 !rdev->faulty) {
602 atomic_inc(&rdev->nr_pending);
603 if (rdev->faulty) {
604 atomic_dec(&rdev->nr_pending);
605 r1_bio->bios[i] = NULL;
606 } else
607 r1_bio->bios[i] = bio;
608 } else
609 r1_bio->bios[i] = NULL;
610 }
611 rcu_read_unlock();
612
613 atomic_set(&r1_bio->remaining, 1);
614 md_write_start(mddev);
615 for (i = 0; i < disks; i++) {
616 struct bio *mbio;
617 if (!r1_bio->bios[i])
618 continue;
619
620 mbio = bio_clone(bio, GFP_NOIO);
621 r1_bio->bios[i] = mbio;
622
623 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
624 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
625 mbio->bi_end_io = raid1_end_write_request;
626 mbio->bi_rw = WRITE;
627 mbio->bi_private = r1_bio;
628
629 atomic_inc(&r1_bio->remaining);
630 generic_make_request(mbio);
631 }
632
633 if (atomic_dec_and_test(&r1_bio->remaining)) {
634 md_write_end(mddev);
635 raid_end_bio_io(r1_bio);
636 }
637
638 return 0;
639}
640
641static void status(struct seq_file *seq, mddev_t *mddev)
642{
643 conf_t *conf = mddev_to_conf(mddev);
644 int i;
645
646 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
647 conf->working_disks);
648 for (i = 0; i < conf->raid_disks; i++)
649 seq_printf(seq, "%s",
650 conf->mirrors[i].rdev &&
651 conf->mirrors[i].rdev->in_sync ? "U" : "_");
652 seq_printf(seq, "]");
653}
654
655
656static void error(mddev_t *mddev, mdk_rdev_t *rdev)
657{
658 char b[BDEVNAME_SIZE];
659 conf_t *conf = mddev_to_conf(mddev);
660
661 /*
662 * If it is not operational, then we have already marked it as dead
663 * else if it is the last working disks, ignore the error, let the
664 * next level up know.
665 * else mark the drive as failed
666 */
667 if (rdev->in_sync
668 && conf->working_disks == 1)
669 /*
670 * Don't fail the drive, act as though we were just a
671 * normal single drive
672 */
673 return;
674 if (rdev->in_sync) {
675 mddev->degraded++;
676 conf->working_disks--;
677 /*
678 * if recovery is running, make sure it aborts.
679 */
680 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
681 }
682 rdev->in_sync = 0;
683 rdev->faulty = 1;
684 mddev->sb_dirty = 1;
685	printk(KERN_ALERT "raid1: Disk failure on %s, disabling device.\n"
686 " Operation continuing on %d devices\n",
687 bdevname(rdev->bdev,b), conf->working_disks);
688}
689
690static void print_conf(conf_t *conf)
691{
692 int i;
693 mirror_info_t *tmp;
694
695 printk("RAID1 conf printout:\n");
696 if (!conf) {
697 printk("(!conf)\n");
698 return;
699 }
700 printk(" --- wd:%d rd:%d\n", conf->working_disks,
701 conf->raid_disks);
702
703 for (i = 0; i < conf->raid_disks; i++) {
704 char b[BDEVNAME_SIZE];
705 tmp = conf->mirrors + i;
706 if (tmp->rdev)
707 printk(" disk %d, wo:%d, o:%d, dev:%s\n",
708 i, !tmp->rdev->in_sync, !tmp->rdev->faulty,
709 bdevname(tmp->rdev->bdev,b));
710 }
711}
712
713static void close_sync(conf_t *conf)
714{
715 spin_lock_irq(&conf->resync_lock);
716 wait_event_lock_irq(conf->wait_resume, !conf->barrier,
717 conf->resync_lock, unplug_slaves(conf->mddev));
718 spin_unlock_irq(&conf->resync_lock);
719
720 if (conf->barrier) BUG();
721 if (waitqueue_active(&conf->wait_idle)) BUG();
722
723 mempool_destroy(conf->r1buf_pool);
724 conf->r1buf_pool = NULL;
725}
726
727static int raid1_spare_active(mddev_t *mddev)
728{
729 int i;
730 conf_t *conf = mddev->private;
731 mirror_info_t *tmp;
732
733 /*
734	 * Find all non-faulty disks within the RAID1 configuration that are
735	 * not yet in_sync and mark them in_sync
736 */
737 for (i = 0; i < conf->raid_disks; i++) {
738 tmp = conf->mirrors + i;
739 if (tmp->rdev
740 && !tmp->rdev->faulty
741 && !tmp->rdev->in_sync) {
742 conf->working_disks++;
743 mddev->degraded--;
744 tmp->rdev->in_sync = 1;
745 }
746 }
747
748 print_conf(conf);
749 return 0;
750}
751
752
753static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
754{
755 conf_t *conf = mddev->private;
756 int found = 0;
757 int mirror;
758 mirror_info_t *p;
759
760 for (mirror=0; mirror < mddev->raid_disks; mirror++)
761 if ( !(p=conf->mirrors+mirror)->rdev) {
762
763 blk_queue_stack_limits(mddev->queue,
764 rdev->bdev->bd_disk->queue);
765 /* as we don't honour merge_bvec_fn, we must never risk
766	 * violating it, so limit ->max_sectors to one PAGE, as
767 * a one page request is never in violation.
768 */
769 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
770 mddev->queue->max_sectors > (PAGE_SIZE>>9))
771 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
772
773 p->head_position = 0;
774 rdev->raid_disk = mirror;
775 found = 1;
776 p->rdev = rdev;
777 break;
778 }
779
780 print_conf(conf);
781 return found;
782}
783
784static int raid1_remove_disk(mddev_t *mddev, int number)
785{
786 conf_t *conf = mddev->private;
787 int err = 0;
788 mdk_rdev_t *rdev;
789 mirror_info_t *p = conf->mirrors+ number;
790
791 print_conf(conf);
792 rdev = p->rdev;
793 if (rdev) {
794 if (rdev->in_sync ||
795 atomic_read(&rdev->nr_pending)) {
796 err = -EBUSY;
797 goto abort;
798 }
799 p->rdev = NULL;
800 synchronize_kernel();
801 if (atomic_read(&rdev->nr_pending)) {
802 /* lost the race, try later */
803 err = -EBUSY;
804 p->rdev = rdev;
805 }
806 }
807abort:
808
809 print_conf(conf);
810 return err;
811}
812
813
814static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
815{
816 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
817 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
818 conf_t *conf = mddev_to_conf(r1_bio->mddev);
819
820 if (bio->bi_size)
821 return 1;
822
823 if (r1_bio->bios[r1_bio->read_disk] != bio)
824 BUG();
825 update_head_pos(r1_bio->read_disk, r1_bio);
826 /*
827 * we have read a block, now it needs to be re-written,
828 * or re-read if the read failed.
829 * We don't do much here, just schedule handling by raid1d
830 */
831 if (!uptodate)
832 md_error(r1_bio->mddev,
833 conf->mirrors[r1_bio->read_disk].rdev);
834 else
835 set_bit(R1BIO_Uptodate, &r1_bio->state);
836 rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
837 reschedule_retry(r1_bio);
838 return 0;
839}
840
841static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
842{
843 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
844 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
845 mddev_t *mddev = r1_bio->mddev;
846 conf_t *conf = mddev_to_conf(mddev);
847 int i;
848 int mirror=0;
849
850 if (bio->bi_size)
851 return 1;
852
853 for (i = 0; i < conf->raid_disks; i++)
854 if (r1_bio->bios[i] == bio) {
855 mirror = i;
856 break;
857 }
858 if (!uptodate)
859 md_error(mddev, conf->mirrors[mirror].rdev);
860 update_head_pos(mirror, r1_bio);
861
862 if (atomic_dec_and_test(&r1_bio->remaining)) {
863 md_done_sync(mddev, r1_bio->sectors, uptodate);
864 put_buf(r1_bio);
865 }
866 rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
867 return 0;
868}
869
870static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
871{
872 conf_t *conf = mddev_to_conf(mddev);
873 int i;
874 int disks = conf->raid_disks;
875 struct bio *bio, *wbio;
876
877 bio = r1_bio->bios[r1_bio->read_disk];
878
879 /*
880 * schedule writes
881 */
882 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
883 /*
884 * There is no point trying a read-for-reconstruct as
885 * reconstruct is about to be aborted
886 */
887 char b[BDEVNAME_SIZE];
888 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
889 " for block %llu\n",
890 bdevname(bio->bi_bdev,b),
891 (unsigned long long)r1_bio->sector);
892 md_done_sync(mddev, r1_bio->sectors, 0);
893 put_buf(r1_bio);
894 return;
895 }
896
897 atomic_set(&r1_bio->remaining, 1);
898 for (i = 0; i < disks ; i++) {
899 wbio = r1_bio->bios[i];
900 if (wbio->bi_end_io != end_sync_write)
901 continue;
902
903 atomic_inc(&conf->mirrors[i].rdev->nr_pending);
904 atomic_inc(&r1_bio->remaining);
905 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
906 generic_make_request(wbio);
907 }
908
909 if (atomic_dec_and_test(&r1_bio->remaining)) {
910 md_done_sync(mddev, r1_bio->sectors, 1);
911 put_buf(r1_bio);
912 }
913}
914
915/*
916 * This is a kernel thread which:
917 *
918 * 1. Retries failed read operations on working mirrors.
919 * 2. Updates the raid superblock when problems are encountered.
920 * 3. Performs writes following reads for array synchronising.
921 */
922
923static void raid1d(mddev_t *mddev)
924{
925 r1bio_t *r1_bio;
926 struct bio *bio;
927 unsigned long flags;
928 conf_t *conf = mddev_to_conf(mddev);
929 struct list_head *head = &conf->retry_list;
930 int unplug=0;
931 mdk_rdev_t *rdev;
932
933 md_check_recovery(mddev);
934 md_handle_safemode(mddev);
935
936 for (;;) {
937 char b[BDEVNAME_SIZE];
938 spin_lock_irqsave(&conf->device_lock, flags);
939 if (list_empty(head))
940 break;
941 r1_bio = list_entry(head->prev, r1bio_t, retry_list);
942 list_del(head->prev);
943 spin_unlock_irqrestore(&conf->device_lock, flags);
944
945 mddev = r1_bio->mddev;
946 conf = mddev_to_conf(mddev);
947 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
948 sync_request_write(mddev, r1_bio);
949 unplug = 1;
950 } else {
951 int disk;
952 bio = r1_bio->bios[r1_bio->read_disk];
953 if ((disk=read_balance(conf, r1_bio)) == -1) {
954 printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
955 " read error for block %llu\n",
956 bdevname(bio->bi_bdev,b),
957 (unsigned long long)r1_bio->sector);
958 raid_end_bio_io(r1_bio);
959 } else {
960 r1_bio->bios[r1_bio->read_disk] = NULL;
961 r1_bio->read_disk = disk;
962 bio_put(bio);
963 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
964 r1_bio->bios[r1_bio->read_disk] = bio;
965 rdev = conf->mirrors[disk].rdev;
966 if (printk_ratelimit())
967 printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
968 " another mirror\n",
969 bdevname(rdev->bdev,b),
970 (unsigned long long)r1_bio->sector);
971 bio->bi_sector = r1_bio->sector + rdev->data_offset;
972 bio->bi_bdev = rdev->bdev;
973 bio->bi_end_io = raid1_end_read_request;
974 bio->bi_rw = READ;
975 bio->bi_private = r1_bio;
976 unplug = 1;
977 generic_make_request(bio);
978 }
979 }
980 }
981 spin_unlock_irqrestore(&conf->device_lock, flags);
982 if (unplug)
983 unplug_slaves(mddev);
984}
985
986
987static int init_resync(conf_t *conf)
988{
989 int buffs;
990
991 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
992 if (conf->r1buf_pool)
993 BUG();
994 conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
995 conf->poolinfo);
996 if (!conf->r1buf_pool)
997 return -ENOMEM;
998 conf->next_resync = 0;
999 return 0;
1000}
1001
1002/*
1003 * perform a "sync" on one "block"
1004 *
1005 * We need to make sure that no normal I/O request - particularly write
1006 * requests - conflict with active sync requests.
1007 *
1008 * This is achieved by tracking pending requests and a 'barrier' concept
1009 * that can be installed to exclude normal IO requests.
1010 */
1011
1012static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1013{
1014 conf_t *conf = mddev_to_conf(mddev);
1015 mirror_info_t *mirror;
1016 r1bio_t *r1_bio;
1017 struct bio *bio;
1018 sector_t max_sector, nr_sectors;
1019 int disk;
1020 int i;
1021 int write_targets = 0;
1022
1023 if (!conf->r1buf_pool)
1024 if (init_resync(conf))
1025 return -ENOMEM;
1026
1027 max_sector = mddev->size << 1;
1028 if (sector_nr >= max_sector) {
1029 close_sync(conf);
1030 return 0;
1031 }
1032
1033 /*
1034 * If there is non-resync activity waiting for us then
1035 * put in a delay to throttle resync.
1036 */
1037 if (!go_faster && waitqueue_active(&conf->wait_resume))
1038 msleep_interruptible(1000);
1039 device_barrier(conf, sector_nr + RESYNC_SECTORS);
1040
1041 /*
1042	 * If reconstructing, and there is more than one working disk, we
1043	 * could dedicate one disk to the rebuild and the others to
1044	 * servicing read requests.
1045 */
1046 disk = conf->last_used;
1047 /* make sure disk is operational */
1048
1049 while (conf->mirrors[disk].rdev == NULL ||
1050 !conf->mirrors[disk].rdev->in_sync) {
1051 if (disk <= 0)
1052 disk = conf->raid_disks;
1053 disk--;
1054 if (disk == conf->last_used)
1055 break;
1056 }
1057 conf->last_used = disk;
1058 atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
1059
1060
1061 mirror = conf->mirrors + disk;
1062
1063 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
1064
1065 spin_lock_irq(&conf->resync_lock);
1066 conf->nr_pending++;
1067 spin_unlock_irq(&conf->resync_lock);
1068
1069 r1_bio->mddev = mddev;
1070 r1_bio->sector = sector_nr;
1071 set_bit(R1BIO_IsSync, &r1_bio->state);
1072 r1_bio->read_disk = disk;
1073
1074 for (i=0; i < conf->raid_disks; i++) {
1075 bio = r1_bio->bios[i];
1076
1077 /* take from bio_init */
1078 bio->bi_next = NULL;
1079 bio->bi_flags |= 1 << BIO_UPTODATE;
1080 bio->bi_rw = 0;
1081 bio->bi_vcnt = 0;
1082 bio->bi_idx = 0;
1083 bio->bi_phys_segments = 0;
1084 bio->bi_hw_segments = 0;
1085 bio->bi_size = 0;
1086 bio->bi_end_io = NULL;
1087 bio->bi_private = NULL;
1088
1089 if (i == disk) {
1090 bio->bi_rw = READ;
1091 bio->bi_end_io = end_sync_read;
1092 } else if (conf->mirrors[i].rdev &&
1093 !conf->mirrors[i].rdev->faulty &&
1094 (!conf->mirrors[i].rdev->in_sync ||
1095 sector_nr + RESYNC_SECTORS > mddev->recovery_cp)) {
1096 bio->bi_rw = WRITE;
1097 bio->bi_end_io = end_sync_write;
1098 write_targets ++;
1099 } else
1100 continue;
1101 bio->bi_sector = sector_nr + conf->mirrors[i].rdev->data_offset;
1102 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1103 bio->bi_private = r1_bio;
1104 }
1105 if (write_targets == 0) {
1106 /* There is nowhere to write, so all non-sync
1107 * drives must be failed - so we are finished
1108 */
1109 int rv = max_sector - sector_nr;
1110 md_done_sync(mddev, rv, 1);
1111 put_buf(r1_bio);
1112 rdev_dec_pending(conf->mirrors[disk].rdev, mddev);
1113 return rv;
1114 }
1115
1116 nr_sectors = 0;
1117 do {
1118 struct page *page;
1119 int len = PAGE_SIZE;
1120 if (sector_nr + (len>>9) > max_sector)
1121 len = (max_sector - sector_nr) << 9;
1122 if (len == 0)
1123 break;
1124 for (i=0 ; i < conf->raid_disks; i++) {
1125 bio = r1_bio->bios[i];
1126 if (bio->bi_end_io) {
1127 page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page;
1128 if (bio_add_page(bio, page, len, 0) == 0) {
1129 /* stop here */
1130 r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page;
1131 while (i > 0) {
1132 i--;
1133 bio = r1_bio->bios[i];
1134 if (bio->bi_end_io==NULL) continue;
1135 /* remove last page from this bio */
1136 bio->bi_vcnt--;
1137 bio->bi_size -= len;
1138 bio->bi_flags &= ~(1<< BIO_SEG_VALID);
1139 }
1140 goto bio_full;
1141 }
1142 }
1143 }
1144 nr_sectors += len>>9;
1145 sector_nr += len>>9;
1146 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
1147 bio_full:
1148 bio = r1_bio->bios[disk];
1149 r1_bio->sectors = nr_sectors;
1150
1151 md_sync_acct(mirror->rdev->bdev, nr_sectors);
1152
1153 generic_make_request(bio);
1154
1155 return nr_sectors;
1156}
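/*
 * Illustration only (not part of the driver): a minimal userspace sketch
 * of the barrier/nr_pending exclusion described in the comment above
 * sync_request().  Normal I/O may only raise nr_pending while no barrier
 * is up; resync raises the barrier and then waits for pending I/O to
 * drain.  The names fake_conf, io_start(), io_end(), barrier_raise() and
 * barrier_lower() are hypothetical; the kernel code uses spinlocks and
 * wait_event_lock_irq() on conf->resync_lock rather than pthreads.
 */
#include <pthread.h>

struct fake_conf {
	pthread_mutex_t lock;		/* cf. conf->resync_lock */
	pthread_cond_t wait_idle;	/* signalled when nr_pending hits 0 */
	pthread_cond_t wait_resume;	/* signalled when the barrier drops */
	int nr_pending;			/* normal I/O in flight */
	int barrier;			/* resync exclusion raised? */
};

#define FAKE_CONF_INIT { PTHREAD_MUTEX_INITIALIZER, \
			 PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0 }

void io_start(struct fake_conf *c)		/* cf. make_request() entry */
{
	pthread_mutex_lock(&c->lock);
	while (c->barrier)			/* wait until resync lowers it */
		pthread_cond_wait(&c->wait_resume, &c->lock);
	c->nr_pending++;
	pthread_mutex_unlock(&c->lock);
}

void io_end(struct fake_conf *c)		/* cf. free_r1bio() */
{
	pthread_mutex_lock(&c->lock);
	if (--c->nr_pending == 0)
		pthread_cond_broadcast(&c->wait_idle);
	pthread_mutex_unlock(&c->lock);
}

void barrier_raise(struct fake_conf *c)		/* cf. device_barrier() */
{
	pthread_mutex_lock(&c->lock);
	c->barrier++;
	while (c->nr_pending)			/* wait for normal I/O to drain */
		pthread_cond_wait(&c->wait_idle, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

void barrier_lower(struct fake_conf *c)		/* cf. put_buf()/close_sync() */
{
	pthread_mutex_lock(&c->lock);
	c->barrier--;
	pthread_cond_broadcast(&c->wait_resume);
	pthread_mutex_unlock(&c->lock);
}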
1157
1158static int run(mddev_t *mddev)
1159{
1160 conf_t *conf;
1161 int i, j, disk_idx;
1162 mirror_info_t *disk;
1163 mdk_rdev_t *rdev;
1164 struct list_head *tmp;
1165
1166 if (mddev->level != 1) {
1167 printk("raid1: %s: raid level not set to mirroring (%d)\n",
1168 mdname(mddev), mddev->level);
1169 goto out;
1170 }
1171 /*
1172 * copy the already verified devices into our private RAID1
1173 * bookkeeping area. [whatever we allocate in run(),
1174 * should be freed in stop()]
1175 */
1176 conf = kmalloc(sizeof(conf_t), GFP_KERNEL);
1177 mddev->private = conf;
1178 if (!conf)
1179 goto out_no_mem;
1180
1181 memset(conf, 0, sizeof(*conf));
1182 conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1183 GFP_KERNEL);
1184 if (!conf->mirrors)
1185 goto out_no_mem;
1186
1187 memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
1188
1189 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1190 if (!conf->poolinfo)
1191 goto out_no_mem;
1192 conf->poolinfo->mddev = mddev;
1193 conf->poolinfo->raid_disks = mddev->raid_disks;
1194 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
1195 r1bio_pool_free,
1196 conf->poolinfo);
1197 if (!conf->r1bio_pool)
1198 goto out_no_mem;
1199
1200 mddev->queue->unplug_fn = raid1_unplug;
1201
1202 mddev->queue->issue_flush_fn = raid1_issue_flush;
1203
1204 ITERATE_RDEV(mddev, rdev, tmp) {
1205 disk_idx = rdev->raid_disk;
1206 if (disk_idx >= mddev->raid_disks
1207 || disk_idx < 0)
1208 continue;
1209 disk = conf->mirrors + disk_idx;
1210
1211 disk->rdev = rdev;
1212
1213 blk_queue_stack_limits(mddev->queue,
1214 rdev->bdev->bd_disk->queue);
1215 /* as we don't honour merge_bvec_fn, we must never risk
1216		 * violating it, so limit ->max_sectors to one PAGE, as
1217 * a one page request is never in violation.
1218 */
1219 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1220 mddev->queue->max_sectors > (PAGE_SIZE>>9))
1221 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
1222
1223 disk->head_position = 0;
1224 if (!rdev->faulty && rdev->in_sync)
1225 conf->working_disks++;
1226 }
1227 conf->raid_disks = mddev->raid_disks;
1228 conf->mddev = mddev;
1229 spin_lock_init(&conf->device_lock);
1230 INIT_LIST_HEAD(&conf->retry_list);
1231 if (conf->working_disks == 1)
1232 mddev->recovery_cp = MaxSector;
1233
1234 spin_lock_init(&conf->resync_lock);
1235 init_waitqueue_head(&conf->wait_idle);
1236 init_waitqueue_head(&conf->wait_resume);
1237
1238 if (!conf->working_disks) {
1239 printk(KERN_ERR "raid1: no operational mirrors for %s\n",
1240 mdname(mddev));
1241 goto out_free_conf;
1242 }
1243
1244 mddev->degraded = 0;
1245 for (i = 0; i < conf->raid_disks; i++) {
1246
1247 disk = conf->mirrors + i;
1248
1249 if (!disk->rdev) {
1250 disk->head_position = 0;
1251 mddev->degraded++;
1252 }
1253 }
1254
1255 /*
1256 * find the first working one and use it as a starting point
1257	 * for read balancing.
1258 */
1259 for (j = 0; j < conf->raid_disks &&
1260 (!conf->mirrors[j].rdev ||
1261 !conf->mirrors[j].rdev->in_sync) ; j++)
1262 /* nothing */;
1263 conf->last_used = j;
1264
1265
1266
1267 {
1268 mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
1269 if (!mddev->thread) {
1270 printk(KERN_ERR
1271 "raid1: couldn't allocate thread for %s\n",
1272 mdname(mddev));
1273 goto out_free_conf;
1274 }
1275 }
1276 printk(KERN_INFO
1277 "raid1: raid set %s active with %d out of %d mirrors\n",
1278 mdname(mddev), mddev->raid_disks - mddev->degraded,
1279 mddev->raid_disks);
1280 /*
1281 * Ok, everything is just fine now
1282 */
1283 mddev->array_size = mddev->size;
1284
1285 return 0;
1286
1287out_no_mem:
1288 printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
1289 mdname(mddev));
1290
1291out_free_conf:
1292 if (conf) {
1293 if (conf->r1bio_pool)
1294 mempool_destroy(conf->r1bio_pool);
1295 if (conf->mirrors)
1296 kfree(conf->mirrors);
1297 if (conf->poolinfo)
1298 kfree(conf->poolinfo);
1299 kfree(conf);
1300 mddev->private = NULL;
1301 }
1302out:
1303 return -EIO;
1304}
1305
1306static int stop(mddev_t *mddev)
1307{
1308 conf_t *conf = mddev_to_conf(mddev);
1309
1310 md_unregister_thread(mddev->thread);
1311 mddev->thread = NULL;
1312 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
1313 if (conf->r1bio_pool)
1314 mempool_destroy(conf->r1bio_pool);
1315 if (conf->mirrors)
1316 kfree(conf->mirrors);
1317 if (conf->poolinfo)
1318 kfree(conf->poolinfo);
1319 kfree(conf);
1320 mddev->private = NULL;
1321 return 0;
1322}
1323
1324static int raid1_resize(mddev_t *mddev, sector_t sectors)
1325{
1326 /* no resync is happening, and there is enough space
1327 * on all devices, so we can resize.
1328 * We need to make sure resync covers any new space.
1329 * If the array is shrinking we should possibly wait until
1330 * any io in the removed space completes, but it hardly seems
1331 * worth it.
1332 */
1333 mddev->array_size = sectors>>1;
1334 set_capacity(mddev->gendisk, mddev->array_size << 1);
1335 mddev->changed = 1;
1336 if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
1337 mddev->recovery_cp = mddev->size << 1;
1338 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1339 }
1340 mddev->size = mddev->array_size;
1341 return 0;
1342}
1343
1344static int raid1_reshape(mddev_t *mddev, int raid_disks)
1345{
1346 /* We need to:
1347 * 1/ resize the r1bio_pool
1348 * 2/ resize conf->mirrors
1349 *
1350 * We allocate a new r1bio_pool if we can.
1351 * Then raise a device barrier and wait until all IO stops.
1352 * Then resize conf->mirrors and swap in the new r1bio pool.
1353 */
1354 mempool_t *newpool, *oldpool;
1355 struct pool_info *newpoolinfo;
1356 mirror_info_t *newmirrors;
1357 conf_t *conf = mddev_to_conf(mddev);
1358
1359 int d;
1360
1361 for (d= raid_disks; d < conf->raid_disks; d++)
1362 if (conf->mirrors[d].rdev)
1363 return -EBUSY;
1364
1365 newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
1366 if (!newpoolinfo)
1367 return -ENOMEM;
1368 newpoolinfo->mddev = mddev;
1369 newpoolinfo->raid_disks = raid_disks;
1370
1371 newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
1372 r1bio_pool_free, newpoolinfo);
1373 if (!newpool) {
1374 kfree(newpoolinfo);
1375 return -ENOMEM;
1376 }
1377 newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
1378 if (!newmirrors) {
1379 kfree(newpoolinfo);
1380 mempool_destroy(newpool);
1381 return -ENOMEM;
1382 }
1383 memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks);
1384
1385 spin_lock_irq(&conf->resync_lock);
1386 conf->barrier++;
1387 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
1388 conf->resync_lock, unplug_slaves(mddev));
1389 spin_unlock_irq(&conf->resync_lock);
1390
1391 /* ok, everything is stopped */
1392 oldpool = conf->r1bio_pool;
1393 conf->r1bio_pool = newpool;
1394 for (d=0; d < raid_disks && d < conf->raid_disks; d++)
1395 newmirrors[d] = conf->mirrors[d];
1396 kfree(conf->mirrors);
1397 conf->mirrors = newmirrors;
1398 kfree(conf->poolinfo);
1399 conf->poolinfo = newpoolinfo;
1400
1401 mddev->degraded += (raid_disks - conf->raid_disks);
1402 conf->raid_disks = mddev->raid_disks = raid_disks;
1403
1404 spin_lock_irq(&conf->resync_lock);
1405 conf->barrier--;
1406 spin_unlock_irq(&conf->resync_lock);
1407 wake_up(&conf->wait_resume);
1408 wake_up(&conf->wait_idle);
1409
1410
1411 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1412 md_wakeup_thread(mddev->thread);
1413
1414 mempool_destroy(oldpool);
1415 return 0;
1416}
1417
1418
1419static mdk_personality_t raid1_personality =
1420{
1421 .name = "raid1",
1422 .owner = THIS_MODULE,
1423 .make_request = make_request,
1424 .run = run,
1425 .stop = stop,
1426 .status = status,
1427 .error_handler = error,
1428 .hot_add_disk = raid1_add_disk,
1429 .hot_remove_disk= raid1_remove_disk,
1430 .spare_active = raid1_spare_active,
1431 .sync_request = sync_request,
1432 .resize = raid1_resize,
1433 .reshape = raid1_reshape,
1434};
1435
1436static int __init raid_init(void)
1437{
1438 return register_md_personality(RAID1, &raid1_personality);
1439}
1440
1441static void raid_exit(void)
1442{
1443 unregister_md_personality(RAID1);
1444}
1445
1446module_init(raid_init);
1447module_exit(raid_exit);
1448MODULE_LICENSE("GPL");
1449MODULE_ALIAS("md-personality-3"); /* RAID1 */
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
new file mode 100644
index 000000000000..b100bfe4fdca
--- /dev/null
+++ b/drivers/md/raid10.c
@@ -0,0 +1,1787 @@
1/*
2 * raid10.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 2000-2004 Neil Brown
5 *
6 * RAID-10 support for md.
7 *
8 * Based on code in raid1.c. See raid1.c for further copyright information.
9 *
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option)
14 * any later version.
15 *
16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */
20
21#include <linux/raid/raid10.h>
22
23/*
24 * RAID10 provides a combination of RAID0 and RAID1 functionality.
25 * The layout of data is defined by
26 * chunk_size
27 * raid_disks
28 * near_copies (stored in low byte of layout)
29 * far_copies (stored in second byte of layout)
30 *
31 * The data to be stored is divided into chunks using chunksize.
32 * Each device is divided into far_copies sections.
33 * In each section, chunks are laid out in a style similar to raid0, but
34 * near_copies copies of each chunk are stored (each on a different drive).
35 * The starting device for each section is offset near_copies from the starting
36 * device of the previous section.
37 * Thus there are (near_copies*far_copies) copies of each chunk, and each is on a
38 * different drive.
39 * near_copies and far_copies must be at least one, and their product is at most
40 * raid_disks.
41 */
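/*
 * Illustration only (not part of the driver): a small userspace sketch of
 * the placement rule described above.  For one virtual sector it prints
 * every (device, device-sector) pair that holds a copy, mirroring the
 * arithmetic of raid10_find_phys() further down.  chunk_shift is
 * log2(chunk size in sectors), and 'stride' (the length in sectors of one
 * far-copy section of a device) is simply passed in here rather than
 * derived from the device size.
 */
#include <stdio.h>

void show_copies(unsigned long long vsect, int raid_disks,
		 int near_copies, int far_copies,
		 int chunk_shift, unsigned long long stride)
{
	unsigned long long chunk_mask = (1ULL << chunk_shift) - 1;
	unsigned long long chunk = vsect >> chunk_shift;
	unsigned long long sector = vsect & chunk_mask;
	unsigned long long stripe;
	int dev, n, f;

	chunk *= near_copies;			/* one stripe unit per near copy */
	stripe = chunk / raid_disks;
	dev = (int)(chunk % raid_disks);
	sector += stripe << chunk_shift;	/* sector within one device section */

	for (n = 0; n < near_copies; n++) {
		int d = dev;
		unsigned long long s = sector;

		printf("dev %d, sector %llu\n", d, s);
		for (f = 1; f < far_copies; f++) {
			d += near_copies;
			if (d >= raid_disks)
				d -= raid_disks;
			s += stride;		/* same data, one section further down */
			printf("dev %d, sector %llu\n", d, s);
		}
		dev++;
		if (dev >= raid_disks) {
			dev = 0;
			sector += chunk_mask + 1;
		}
	}
}

int main(void)
{
	/* 4 disks, 2 near copies, 1 far copy, 64k (128-sector) chunks */
	show_copies(1000, 4, 2, 1, 7, 0);
	return 0;
}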
42
43/*
44 * Number of guaranteed r10bios in case of extreme VM load:
45 */
46#define NR_RAID10_BIOS 256
47
48static void unplug_slaves(mddev_t *mddev);
49
50static void * r10bio_pool_alloc(unsigned int __nocast gfp_flags, void *data)
51{
52 conf_t *conf = data;
53 r10bio_t *r10_bio;
54 int size = offsetof(struct r10bio_s, devs[conf->copies]);
55
56 /* allocate a r10bio with room for raid_disks entries in the bios array */
57 r10_bio = kmalloc(size, gfp_flags);
58 if (r10_bio)
59 memset(r10_bio, 0, size);
60 else
61 unplug_slaves(conf->mddev);
62
63 return r10_bio;
64}
65
66static void r10bio_pool_free(void *r10_bio, void *data)
67{
68 kfree(r10_bio);
69}
70
71#define RESYNC_BLOCK_SIZE (64*1024)
72//#define RESYNC_BLOCK_SIZE PAGE_SIZE
73#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
74#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
75#define RESYNC_WINDOW (2048*1024)
76
77/*
78 * When performing a resync, we need to read and compare, so
79 * we need as many pages as there are copies.
80 * When performing a recovery, we need 2 bios, one for read,
81 * one for write (we recover only one drive per r10buf)
82 *
83 */
84static void * r10buf_pool_alloc(unsigned int __nocast gfp_flags, void *data)
85{
86 conf_t *conf = data;
87 struct page *page;
88 r10bio_t *r10_bio;
89 struct bio *bio;
90 int i, j;
91 int nalloc;
92
93 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
94 if (!r10_bio) {
95 unplug_slaves(conf->mddev);
96 return NULL;
97 }
98
99 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
100 nalloc = conf->copies; /* resync */
101 else
102 nalloc = 2; /* recovery */
103
104 /*
105 * Allocate bios.
106 */
107 for (j = nalloc ; j-- ; ) {
108 bio = bio_alloc(gfp_flags, RESYNC_PAGES);
109 if (!bio)
110 goto out_free_bio;
111 r10_bio->devs[j].bio = bio;
112 }
113 /*
114 * Allocate RESYNC_PAGES data pages and attach them
115 * where needed.
116 */
117 for (j = 0 ; j < nalloc; j++) {
118 bio = r10_bio->devs[j].bio;
119 for (i = 0; i < RESYNC_PAGES; i++) {
120 page = alloc_page(gfp_flags);
121 if (unlikely(!page))
122 goto out_free_pages;
123
124 bio->bi_io_vec[i].bv_page = page;
125 }
126 }
127
128 return r10_bio;
129
130out_free_pages:
131 for ( ; i > 0 ; i--)
132 __free_page(bio->bi_io_vec[i-1].bv_page);
133 while (j--)
134 for (i = 0; i < RESYNC_PAGES ; i++)
135 __free_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
136 j = -1;
137out_free_bio:
138 while ( ++j < nalloc )
139 bio_put(r10_bio->devs[j].bio);
140 r10bio_pool_free(r10_bio, conf);
141 return NULL;
142}
143
144static void r10buf_pool_free(void *__r10_bio, void *data)
145{
146 int i;
147 conf_t *conf = data;
148 r10bio_t *r10bio = __r10_bio;
149 int j;
150
151 for (j=0; j < conf->copies; j++) {
152 struct bio *bio = r10bio->devs[j].bio;
153 if (bio) {
154 for (i = 0; i < RESYNC_PAGES; i++) {
155 __free_page(bio->bi_io_vec[i].bv_page);
156 bio->bi_io_vec[i].bv_page = NULL;
157 }
158 bio_put(bio);
159 }
160 }
161 r10bio_pool_free(r10bio, conf);
162}
163
164static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
165{
166 int i;
167
168 for (i = 0; i < conf->copies; i++) {
169 struct bio **bio = & r10_bio->devs[i].bio;
170 if (*bio)
171 bio_put(*bio);
172 *bio = NULL;
173 }
174}
175
176static inline void free_r10bio(r10bio_t *r10_bio)
177{
178 unsigned long flags;
179
180 conf_t *conf = mddev_to_conf(r10_bio->mddev);
181
182 /*
183 * Wake up any possible resync thread that waits for the device
184 * to go idle.
185 */
186 spin_lock_irqsave(&conf->resync_lock, flags);
187 if (!--conf->nr_pending) {
188 wake_up(&conf->wait_idle);
189 wake_up(&conf->wait_resume);
190 }
191 spin_unlock_irqrestore(&conf->resync_lock, flags);
192
193 put_all_bios(conf, r10_bio);
194 mempool_free(r10_bio, conf->r10bio_pool);
195}
196
197static inline void put_buf(r10bio_t *r10_bio)
198{
199 conf_t *conf = mddev_to_conf(r10_bio->mddev);
200 unsigned long flags;
201
202 mempool_free(r10_bio, conf->r10buf_pool);
203
204 spin_lock_irqsave(&conf->resync_lock, flags);
205 if (!conf->barrier)
206 BUG();
207 --conf->barrier;
208 wake_up(&conf->wait_resume);
209 wake_up(&conf->wait_idle);
210
211 if (!--conf->nr_pending) {
212 wake_up(&conf->wait_idle);
213 wake_up(&conf->wait_resume);
214 }
215 spin_unlock_irqrestore(&conf->resync_lock, flags);
216}
217
218static void reschedule_retry(r10bio_t *r10_bio)
219{
220 unsigned long flags;
221 mddev_t *mddev = r10_bio->mddev;
222 conf_t *conf = mddev_to_conf(mddev);
223
224 spin_lock_irqsave(&conf->device_lock, flags);
225 list_add(&r10_bio->retry_list, &conf->retry_list);
226 spin_unlock_irqrestore(&conf->device_lock, flags);
227
228 md_wakeup_thread(mddev->thread);
229}
230
231/*
232 * raid_end_bio_io() is called when we have finished servicing a mirrored
233 * operation and are ready to return a success/failure code to the buffer
234 * cache layer.
235 */
236static void raid_end_bio_io(r10bio_t *r10_bio)
237{
238 struct bio *bio = r10_bio->master_bio;
239
240 bio_endio(bio, bio->bi_size,
241 test_bit(R10BIO_Uptodate, &r10_bio->state) ? 0 : -EIO);
242 free_r10bio(r10_bio);
243}
244
245/*
246 * Update disk head position estimator based on IRQ completion info.
247 */
248static inline void update_head_pos(int slot, r10bio_t *r10_bio)
249{
250 conf_t *conf = mddev_to_conf(r10_bio->mddev);
251
252 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
253 r10_bio->devs[slot].addr + (r10_bio->sectors);
254}
255
256static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
257{
258 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
259 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
260 int slot, dev;
261 conf_t *conf = mddev_to_conf(r10_bio->mddev);
262
263 if (bio->bi_size)
264 return 1;
265
266 slot = r10_bio->read_slot;
267 dev = r10_bio->devs[slot].devnum;
268 /*
269 * this branch is our 'one mirror IO has finished' event handler:
270 */
271 if (!uptodate)
272 md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
273 else
274 /*
275 * Set R10BIO_Uptodate in our master bio, so that
276 * we will return a good error code to the higher
277 * levels even if IO on some other mirrored buffer fails.
278 *
279 * The 'master' represents the composite IO operation to
280 * user-side. So if something waits for IO, then it will
281 * wait for the 'master' bio.
282 */
283 set_bit(R10BIO_Uptodate, &r10_bio->state);
284
285 update_head_pos(slot, r10_bio);
286
287 /*
288 * we have only one bio on the read side
289 */
290 if (uptodate)
291 raid_end_bio_io(r10_bio);
292 else {
293 /*
294 * oops, read error:
295 */
296 char b[BDEVNAME_SIZE];
297 if (printk_ratelimit())
298 printk(KERN_ERR "raid10: %s: rescheduling sector %llu\n",
299 bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
300 reschedule_retry(r10_bio);
301 }
302
303 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
304 return 0;
305}
306
307static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
308{
309 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
310 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
311 int slot, dev;
312 conf_t *conf = mddev_to_conf(r10_bio->mddev);
313
314 if (bio->bi_size)
315 return 1;
316
317 for (slot = 0; slot < conf->copies; slot++)
318 if (r10_bio->devs[slot].bio == bio)
319 break;
320 dev = r10_bio->devs[slot].devnum;
321
322 /*
323 * this branch is our 'one mirror IO has finished' event handler:
324 */
325 if (!uptodate)
326 md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
327 else
328 /*
329 * Set R10BIO_Uptodate in our master bio, so that
330		 * we will return a good error code to the higher
331 * levels even if IO on some other mirrored buffer fails.
332 *
333 * The 'master' represents the composite IO operation to
334 * user-side. So if something waits for IO, then it will
335 * wait for the 'master' bio.
336 */
337 set_bit(R10BIO_Uptodate, &r10_bio->state);
338
339 update_head_pos(slot, r10_bio);
340
341 /*
342 *
343 * Let's see if all mirrored write operations have finished
344 * already.
345 */
346 if (atomic_dec_and_test(&r10_bio->remaining)) {
347 md_write_end(r10_bio->mddev);
348 raid_end_bio_io(r10_bio);
349 }
350
351 rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
352 return 0;
353}
354
355
356/*
357 * RAID10 layout manager
358 * As well as the chunk size and raid_disks count, there are two
359 * parameters: near_copies and far_copies.
360 * near_copies * far_copies must be <= raid_disks.
361 * Normally one of these will be 1.
362 * If both are 1, we get raid0.
363 * If near_copies == raid_disks, we get raid1.
364 *
365 * Chunks are laid out in raid0 style with near_copies copies of the
366 * first chunk, followed by near_copies copies of the next chunk and
367 * so on.
368 * If far_copies > 1, then after 1/far_copies of the array has been assigned
369 * as described above, we start again with a device offset of near_copies.
370 * So we effectively have another copy of the whole array further down all
371 * the drives, but with blocks on different drives.
372 * With this layout, a block is never stored twice on the same device.
373 *
374 * raid10_find_phys finds the sector offset of a given virtual sector
375 * on each device that it is on. If a block isn't on a device,
376 * that entry in the array is set to MaxSector.
377 *
378 * raid10_find_virt does the reverse mapping, from a device and a
379 * sector offset to a virtual address
380 */
381
382static void raid10_find_phys(conf_t *conf, r10bio_t *r10bio)
383{
384 int n,f;
385 sector_t sector;
386 sector_t chunk;
387 sector_t stripe;
388 int dev;
389
390 int slot = 0;
391
392 /* now calculate first sector/dev */
393 chunk = r10bio->sector >> conf->chunk_shift;
394 sector = r10bio->sector & conf->chunk_mask;
395
396 chunk *= conf->near_copies;
397 stripe = chunk;
398 dev = sector_div(stripe, conf->raid_disks);
399
400 sector += stripe << conf->chunk_shift;
401
402 /* and calculate all the others */
403 for (n=0; n < conf->near_copies; n++) {
404 int d = dev;
405 sector_t s = sector;
406 r10bio->devs[slot].addr = sector;
407 r10bio->devs[slot].devnum = d;
408 slot++;
409
410 for (f = 1; f < conf->far_copies; f++) {
411 d += conf->near_copies;
412 if (d >= conf->raid_disks)
413 d -= conf->raid_disks;
414 s += conf->stride;
415 r10bio->devs[slot].devnum = d;
416 r10bio->devs[slot].addr = s;
417 slot++;
418 }
419 dev++;
420 if (dev >= conf->raid_disks) {
421 dev = 0;
422 sector += (conf->chunk_mask + 1);
423 }
424 }
425 BUG_ON(slot != conf->copies);
426}
427
428static sector_t raid10_find_virt(conf_t *conf, sector_t sector, int dev)
429{
430 sector_t offset, chunk, vchunk;
431
432 while (sector > conf->stride) {
433 sector -= conf->stride;
434 if (dev < conf->near_copies)
435 dev += conf->raid_disks - conf->near_copies;
436 else
437 dev -= conf->near_copies;
438 }
439
440 offset = sector & conf->chunk_mask;
441 chunk = sector >> conf->chunk_shift;
442 vchunk = chunk * conf->raid_disks + dev;
443 sector_div(vchunk, conf->near_copies);
444 return (vchunk << conf->chunk_shift) + offset;
445}
446
447/**
448 * raid10_mergeable_bvec -- tell the bio layer if two requests can be merged
449 * @q: request queue
450 * @bio: the bio that has been built up so far
451 * @biovec: the request that could be merged to it.
452 *
453 * Return amount of bytes we can accept at this offset
454 * If near_copies == raid_disks, there are no striping issues,
455 * but in that case, the function isn't called at all.
456 */
457static int raid10_mergeable_bvec(request_queue_t *q, struct bio *bio,
458 struct bio_vec *bio_vec)
459{
460 mddev_t *mddev = q->queuedata;
461 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
462 int max;
463 unsigned int chunk_sectors = mddev->chunk_size >> 9;
464 unsigned int bio_sectors = bio->bi_size >> 9;
465
466 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
467 if (max < 0) max = 0; /* bio_add cannot handle a negative return */
468 if (max <= bio_vec->bv_len && bio_sectors == 0)
469 return bio_vec->bv_len;
470 else
471 return max;
472}
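/*
 * Illustration only (not part of the driver): the clamping arithmetic used
 * above, in standalone form.  Given a start sector, the sectors already in
 * the bio and the chunk size (a power of two, in sectors), it returns how
 * many more bytes fit before the request would cross a chunk boundary.
 * The helper name bytes_to_chunk_boundary() is made up; note that the
 * driver additionally lets an empty bio accept a full bvec even across the
 * boundary, relying on the split in make_request() below.
 */
#include <stdio.h>

long long bytes_to_chunk_boundary(unsigned long long sector,
				  unsigned int bio_sectors,
				  unsigned int chunk_sectors)
{
	/* sectors already used in this chunk: offset of the bio start plus
	 * what the bio already holds */
	long long used = (long long)(sector & (chunk_sectors - 1)) + bio_sectors;
	long long remaining = (long long)chunk_sectors - used;

	if (remaining < 0)
		remaining = 0;
	return remaining << 9;			/* sectors -> bytes */
}

int main(void)
{
	/* 64k chunks (128 sectors): a bio starting at offset 120 that already
	 * holds 4 sectors may grow by another (128 - 124) * 512 = 2048 bytes */
	printf("%lld\n", bytes_to_chunk_boundary(120, 4, 128));
	return 0;
}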
473
474/*
475 * This routine returns the disk from which the requested read should
476 * be done. There is a per-array 'next expected sequential IO' sector
477 * number - if this matches on the next IO then we use the last disk.
478 * There is also a per-disk 'last known head position' sector that is
479 * maintained from IRQ contexts, both the normal and the resync IO
480 * completion handlers update this position correctly. If there is no
481 * perfect sequential match then we pick the disk whose head is closest.
482 *
483 * If there are 2 mirrors in the same 2 devices, performance degrades
484 * because the position is tracked per mirror, not per device.
485 *
486 * The rdev for the device selected will have nr_pending incremented.
487 */
488
489/*
490 * FIXME: possibly should rethink readbalancing and do it differently
491 * depending on near_copies / far_copies geometry.
492 */
493static int read_balance(conf_t *conf, r10bio_t *r10_bio)
494{
495 const unsigned long this_sector = r10_bio->sector;
496 int disk, slot, nslot;
497 const int sectors = r10_bio->sectors;
498 sector_t new_distance, current_distance;
499
500 raid10_find_phys(conf, r10_bio);
501 rcu_read_lock();
502 /*
503 * Check if we can balance. We can balance on the whole
504 * device if no resync is going on, or below the resync window.
505 * We take the first readable disk when above the resync window.
506 */
507 if (conf->mddev->recovery_cp < MaxSector
508 && (this_sector + sectors >= conf->next_resync)) {
509 /* make sure that disk is operational */
510 slot = 0;
511 disk = r10_bio->devs[slot].devnum;
512
513 while (!conf->mirrors[disk].rdev ||
514 !conf->mirrors[disk].rdev->in_sync) {
515 slot++;
516 if (slot == conf->copies) {
517 slot = 0;
518 disk = -1;
519 break;
520 }
521 disk = r10_bio->devs[slot].devnum;
522 }
523 goto rb_out;
524 }
525
526
527 /* make sure the disk is operational */
528 slot = 0;
529 disk = r10_bio->devs[slot].devnum;
530 while (!conf->mirrors[disk].rdev ||
531 !conf->mirrors[disk].rdev->in_sync) {
532 slot ++;
533 if (slot == conf->copies) {
534 disk = -1;
535 goto rb_out;
536 }
537 disk = r10_bio->devs[slot].devnum;
538 }
539
540
541 current_distance = abs(this_sector - conf->mirrors[disk].head_position);
542
543 /* Find the disk whose head is closest */
544
545 for (nslot = slot; nslot < conf->copies; nslot++) {
546 int ndisk = r10_bio->devs[nslot].devnum;
547
548
549 if (!conf->mirrors[ndisk].rdev ||
550 !conf->mirrors[ndisk].rdev->in_sync)
551 continue;
552
553 if (!atomic_read(&conf->mirrors[ndisk].rdev->nr_pending)) {
554 disk = ndisk;
555 slot = nslot;
556 break;
557 }
558 new_distance = abs(r10_bio->devs[nslot].addr -
559 conf->mirrors[ndisk].head_position);
560 if (new_distance < current_distance) {
561 current_distance = new_distance;
562 disk = ndisk;
563 slot = nslot;
564 }
565 }
566
567rb_out:
568 r10_bio->read_slot = slot;
569/* conf->next_seq_sect = this_sector + sectors;*/
570
571 if (disk >= 0 && conf->mirrors[disk].rdev)
572 atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
573 rcu_read_unlock();
574
575 return disk;
576}
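/*
 * Illustration only (not part of the driver): a condensed userspace sketch
 * of the closest-head heuristic used by read_balance() above.  Among
 * in-sync copies, an idle disk is taken at once; otherwise the disk whose
 * last known head position is nearest the target sector wins.  The
 * mirror_state struct and pick_read_copy() are illustrative stand-ins for
 * conf->mirrors[] and the real function, which also handles the resync
 * window and rdev reference counting.
 */
#include <stdio.h>
#include <stdlib.h>

struct mirror_state {
	int in_sync;			/* copy readable? */
	int nr_pending;			/* requests in flight on this disk */
	long long head_position;	/* last known head sector */
};

int pick_read_copy(const struct mirror_state *m, int copies,
		   const long long *addr)	/* per-copy target sector */
{
	long long best_dist = 0;
	int best = -1;
	int i;

	for (i = 0; i < copies; i++) {
		long long dist;

		if (!m[i].in_sync)
			continue;
		if (m[i].nr_pending == 0)
			return i;		/* idle disk: use it immediately */
		dist = llabs(addr[i] - m[i].head_position);
		if (best < 0 || dist < best_dist) {
			best = i;
			best_dist = dist;
		}
	}
	return best;				/* -1: no readable copy at all */
}

int main(void)
{
	struct mirror_state m[2] = {
		{ 1, 3, 1000 },			/* busy, head near sector 1000 */
		{ 1, 2, 5000 },			/* busy, head near sector 5000 */
	};
	long long addr[2] = { 1200, 1200 };

	printf("read from copy %d\n", pick_read_copy(m, 2, addr));
	return 0;
}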
577
578static void unplug_slaves(mddev_t *mddev)
579{
580 conf_t *conf = mddev_to_conf(mddev);
581 int i;
582
583 rcu_read_lock();
584 for (i=0; i<mddev->raid_disks; i++) {
585 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
586 if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) {
587 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
588
589 atomic_inc(&rdev->nr_pending);
590 rcu_read_unlock();
591
592 if (r_queue->unplug_fn)
593 r_queue->unplug_fn(r_queue);
594
595 rdev_dec_pending(rdev, mddev);
596 rcu_read_lock();
597 }
598 }
599 rcu_read_unlock();
600}
601
602static void raid10_unplug(request_queue_t *q)
603{
604 unplug_slaves(q->queuedata);
605}
606
607static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
608 sector_t *error_sector)
609{
610 mddev_t *mddev = q->queuedata;
611 conf_t *conf = mddev_to_conf(mddev);
612 int i, ret = 0;
613
614 rcu_read_lock();
615 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
616 mdk_rdev_t *rdev = conf->mirrors[i].rdev;
617 if (rdev && !rdev->faulty) {
618 struct block_device *bdev = rdev->bdev;
619 request_queue_t *r_queue = bdev_get_queue(bdev);
620
621 if (!r_queue->issue_flush_fn)
622 ret = -EOPNOTSUPP;
623 else {
624 atomic_inc(&rdev->nr_pending);
625 rcu_read_unlock();
626 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
627 error_sector);
628 rdev_dec_pending(rdev, mddev);
629 rcu_read_lock();
630 }
631 }
632 }
633 rcu_read_unlock();
634 return ret;
635}
636
637/*
638 * Throttle resync depth, so that we can both get proper overlapping of
639 * requests, but are still able to handle normal requests quickly.
640 */
641#define RESYNC_DEPTH 32
642
643static void device_barrier(conf_t *conf, sector_t sect)
644{
645 spin_lock_irq(&conf->resync_lock);
646 wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
647 conf->resync_lock, unplug_slaves(conf->mddev));
648
649 if (!conf->barrier++) {
650 wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
651 conf->resync_lock, unplug_slaves(conf->mddev));
652 if (conf->nr_pending)
653 BUG();
654 }
655 wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
656 conf->resync_lock, unplug_slaves(conf->mddev));
657 conf->next_resync = sect;
658 spin_unlock_irq(&conf->resync_lock);
659}
660
661static int make_request(request_queue_t *q, struct bio * bio)
662{
663 mddev_t *mddev = q->queuedata;
664 conf_t *conf = mddev_to_conf(mddev);
665 mirror_info_t *mirror;
666 r10bio_t *r10_bio;
667 struct bio *read_bio;
668 int i;
669 int chunk_sects = conf->chunk_mask + 1;
670
671 /* If this request crosses a chunk boundary, we need to
672 * split it. This will only happen for 1 PAGE (or less) requests.
673 */
674 if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
675 > chunk_sects &&
676 conf->near_copies < conf->raid_disks)) {
677 struct bio_pair *bp;
678 /* Sanity check -- queue functions should prevent this happening */
679 if (bio->bi_vcnt != 1 ||
680 bio->bi_idx != 0)
681 goto bad_map;
682 /* This is a one page bio that upper layers
683 * refuse to split for us, so we need to split it.
684 */
685 bp = bio_split(bio, bio_split_pool,
686 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
687 if (make_request(q, &bp->bio1))
688 generic_make_request(&bp->bio1);
689 if (make_request(q, &bp->bio2))
690 generic_make_request(&bp->bio2);
691
692 bio_pair_release(bp);
693 return 0;
694 bad_map:
695 printk("raid10_make_request bug: can't convert block across chunks"
696 " or bigger than %dk %llu %d\n", chunk_sects/2,
697 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
698
699 bio_io_error(bio, bio->bi_size);
700 return 0;
701 }
702
703 /*
704 * Register the new request and wait if the reconstruction
705 * thread has put up a bar for new requests.
706 * Continue immediately if no resync is active currently.
707 */
708 spin_lock_irq(&conf->resync_lock);
709 wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
710 conf->nr_pending++;
711 spin_unlock_irq(&conf->resync_lock);
712
713 if (bio_data_dir(bio)==WRITE) {
714 disk_stat_inc(mddev->gendisk, writes);
715 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bio));
716 } else {
717 disk_stat_inc(mddev->gendisk, reads);
718 disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bio));
719 }
720
721 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
722
723 r10_bio->master_bio = bio;
724 r10_bio->sectors = bio->bi_size >> 9;
725
726 r10_bio->mddev = mddev;
727 r10_bio->sector = bio->bi_sector;
728
729 if (bio_data_dir(bio) == READ) {
730 /*
731 * read balancing logic:
732 */
733 int disk = read_balance(conf, r10_bio);
734 int slot = r10_bio->read_slot;
735 if (disk < 0) {
736 raid_end_bio_io(r10_bio);
737 return 0;
738 }
739 mirror = conf->mirrors + disk;
740
741 read_bio = bio_clone(bio, GFP_NOIO);
742
743 r10_bio->devs[slot].bio = read_bio;
744
745 read_bio->bi_sector = r10_bio->devs[slot].addr +
746 mirror->rdev->data_offset;
747 read_bio->bi_bdev = mirror->rdev->bdev;
748 read_bio->bi_end_io = raid10_end_read_request;
749 read_bio->bi_rw = READ;
750 read_bio->bi_private = r10_bio;
751
752 generic_make_request(read_bio);
753 return 0;
754 }
755
756 /*
757 * WRITE:
758 */
759 /* first select target devices under spinlock and
760 * inc refcount on their rdev. Record them by setting
761 * bios[x] to bio
762 */
763 raid10_find_phys(conf, r10_bio);
764 rcu_read_lock();
765 for (i = 0; i < conf->copies; i++) {
766 int d = r10_bio->devs[i].devnum;
767 if (conf->mirrors[d].rdev &&
768 !conf->mirrors[d].rdev->faulty) {
769 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
770 r10_bio->devs[i].bio = bio;
771 } else
772 r10_bio->devs[i].bio = NULL;
773 }
774 rcu_read_unlock();
775
776 atomic_set(&r10_bio->remaining, 1);
777 md_write_start(mddev);
778 for (i = 0; i < conf->copies; i++) {
779 struct bio *mbio;
780 int d = r10_bio->devs[i].devnum;
781 if (!r10_bio->devs[i].bio)
782 continue;
783
784 mbio = bio_clone(bio, GFP_NOIO);
785 r10_bio->devs[i].bio = mbio;
786
787 mbio->bi_sector = r10_bio->devs[i].addr+
788 conf->mirrors[d].rdev->data_offset;
789 mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
790 mbio->bi_end_io = raid10_end_write_request;
791 mbio->bi_rw = WRITE;
792 mbio->bi_private = r10_bio;
793
794 atomic_inc(&r10_bio->remaining);
795 generic_make_request(mbio);
796 }
797
798 if (atomic_dec_and_test(&r10_bio->remaining)) {
799 md_write_end(mddev);
800 raid_end_bio_io(r10_bio);
801 }
802
803 return 0;
804}
805
806static void status(struct seq_file *seq, mddev_t *mddev)
807{
808 conf_t *conf = mddev_to_conf(mddev);
809 int i;
810
811 if (conf->near_copies < conf->raid_disks)
812 seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
813 if (conf->near_copies > 1)
814 seq_printf(seq, " %d near-copies", conf->near_copies);
815 if (conf->far_copies > 1)
816 seq_printf(seq, " %d far-copies", conf->far_copies);
817
818 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
819 conf->working_disks);
820 for (i = 0; i < conf->raid_disks; i++)
821 seq_printf(seq, "%s",
822 conf->mirrors[i].rdev &&
823 conf->mirrors[i].rdev->in_sync ? "U" : "_");
824 seq_printf(seq, "]");
825}
826
827static void error(mddev_t *mddev, mdk_rdev_t *rdev)
828{
829 char b[BDEVNAME_SIZE];
830 conf_t *conf = mddev_to_conf(mddev);
831
832 /*
833 * If it is not operational, then we have already marked it as dead
834 * else if it is the last working disks, ignore the error, let the
835 * next level up know.
836 * else mark the drive as failed
837 */
838 if (rdev->in_sync
839 && conf->working_disks == 1)
840 /*
841 * Don't fail the drive, just return an IO error.
842 * The test should really be more sophisticated than
843 * "working_disks == 1", but it isn't critical, and
844 * can wait until we do more sophisticated "is the drive
845 * really dead" tests...
846 */
847 return;
848 if (rdev->in_sync) {
849 mddev->degraded++;
850 conf->working_disks--;
851 /*
852 * if recovery is running, make sure it aborts.
853 */
854 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
855 }
856 rdev->in_sync = 0;
857 rdev->faulty = 1;
858 mddev->sb_dirty = 1;
859	printk(KERN_ALERT "raid10: Disk failure on %s, disabling device.\n"
860 " Operation continuing on %d devices\n",
861 bdevname(rdev->bdev,b), conf->working_disks);
862}
863
864static void print_conf(conf_t *conf)
865{
866 int i;
867 mirror_info_t *tmp;
868
869 printk("RAID10 conf printout:\n");
870 if (!conf) {
871 printk("(!conf)\n");
872 return;
873 }
874 printk(" --- wd:%d rd:%d\n", conf->working_disks,
875 conf->raid_disks);
876
877 for (i = 0; i < conf->raid_disks; i++) {
878 char b[BDEVNAME_SIZE];
879 tmp = conf->mirrors + i;
880 if (tmp->rdev)
881 printk(" disk %d, wo:%d, o:%d, dev:%s\n",
882 i, !tmp->rdev->in_sync, !tmp->rdev->faulty,
883 bdevname(tmp->rdev->bdev,b));
884 }
885}
886
887static void close_sync(conf_t *conf)
888{
889 spin_lock_irq(&conf->resync_lock);
890 wait_event_lock_irq(conf->wait_resume, !conf->barrier,
891 conf->resync_lock, unplug_slaves(conf->mddev));
892 spin_unlock_irq(&conf->resync_lock);
893
894 if (conf->barrier) BUG();
895 if (waitqueue_active(&conf->wait_idle)) BUG();
896
897 mempool_destroy(conf->r10buf_pool);
898 conf->r10buf_pool = NULL;
899}
900
901static int raid10_spare_active(mddev_t *mddev)
902{
903 int i;
904 conf_t *conf = mddev->private;
905 mirror_info_t *tmp;
906
907 /*
908 * Find all non-in_sync disks within the RAID10 configuration
909 * and mark them in_sync
910 */
911 for (i = 0; i < conf->raid_disks; i++) {
912 tmp = conf->mirrors + i;
913 if (tmp->rdev
914 && !tmp->rdev->faulty
915 && !tmp->rdev->in_sync) {
916 conf->working_disks++;
917 mddev->degraded--;
918 tmp->rdev->in_sync = 1;
919 }
920 }
921
922 print_conf(conf);
923 return 0;
924}
925
926
927static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
928{
929 conf_t *conf = mddev->private;
930 int found = 0;
931 int mirror;
932 mirror_info_t *p;
933
934 if (mddev->recovery_cp < MaxSector)
935 /* only hot-add to in-sync arrays, as recovery is
936 * very different from resync
937 */
938 return 0;
939
940 for (mirror=0; mirror < mddev->raid_disks; mirror++)
941 if ( !(p=conf->mirrors+mirror)->rdev) {
942
943 blk_queue_stack_limits(mddev->queue,
944 rdev->bdev->bd_disk->queue);
945 /* as we don't honour merge_bvec_fn, we must never risk
946			 * violating it, so limit ->max_sectors to one PAGE, as
947 * a one page request is never in violation.
948 */
949 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
950 mddev->queue->max_sectors > (PAGE_SIZE>>9))
951 mddev->queue->max_sectors = (PAGE_SIZE>>9);
952
953 p->head_position = 0;
954 rdev->raid_disk = mirror;
955 found = 1;
956 p->rdev = rdev;
957 break;
958 }
959
960 print_conf(conf);
961 return found;
962}
963
964static int raid10_remove_disk(mddev_t *mddev, int number)
965{
966 conf_t *conf = mddev->private;
967 int err = 0;
968 mdk_rdev_t *rdev;
969 mirror_info_t *p = conf->mirrors+ number;
970
971 print_conf(conf);
972 rdev = p->rdev;
973 if (rdev) {
974 if (rdev->in_sync ||
975 atomic_read(&rdev->nr_pending)) {
976 err = -EBUSY;
977 goto abort;
978 }
979 p->rdev = NULL;
980 synchronize_kernel();
981 if (atomic_read(&rdev->nr_pending)) {
982 /* lost the race, try later */
983 err = -EBUSY;
984 p->rdev = rdev;
985 }
986 }
987abort:
988
989 print_conf(conf);
990 return err;
991}
992
993
994static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
995{
996 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
997 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
998 conf_t *conf = mddev_to_conf(r10_bio->mddev);
999 int i,d;
1000
1001 if (bio->bi_size)
1002 return 1;
1003
1004 for (i=0; i<conf->copies; i++)
1005 if (r10_bio->devs[i].bio == bio)
1006 break;
1007 if (i == conf->copies)
1008 BUG();
1009 update_head_pos(i, r10_bio);
1010 d = r10_bio->devs[i].devnum;
1011 if (!uptodate)
1012 md_error(r10_bio->mddev,
1013 conf->mirrors[d].rdev);
1014
1015 /* for reconstruct, we always reschedule after a read.
1016 * for resync, only after all reads
1017 */
1018 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1019 atomic_dec_and_test(&r10_bio->remaining)) {
1020 /* we have read all the blocks,
1021 * do the comparison in process context in raid10d
1022 */
1023 reschedule_retry(r10_bio);
1024 }
1025 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1026 return 0;
1027}
1028
1029static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
1030{
1031 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1032 r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
1033 mddev_t *mddev = r10_bio->mddev;
1034 conf_t *conf = mddev_to_conf(mddev);
1035 int i,d;
1036
1037 if (bio->bi_size)
1038 return 1;
1039
1040 for (i = 0; i < conf->copies; i++)
1041 if (r10_bio->devs[i].bio == bio)
1042 break;
1043 d = r10_bio->devs[i].devnum;
1044
1045 if (!uptodate)
1046 md_error(mddev, conf->mirrors[d].rdev);
1047 update_head_pos(i, r10_bio);
1048
1049 while (atomic_dec_and_test(&r10_bio->remaining)) {
1050 if (r10_bio->master_bio == NULL) {
1051 /* the primary of several recovery bios */
1052 md_done_sync(mddev, r10_bio->sectors, 1);
1053 put_buf(r10_bio);
1054 break;
1055 } else {
1056 r10bio_t *r10_bio2 = (r10bio_t *)r10_bio->master_bio;
1057 put_buf(r10_bio);
1058 r10_bio = r10_bio2;
1059 }
1060 }
1061 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1062 return 0;
1063}
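/*
 * Illustration only (not part of the driver): the chained-completion
 * pattern used by end_sync_write() above during recovery.  Several r10_bio
 * structures are linked through their master_bio pointers, each with a
 * 'remaining' count of outstanding sub-requests; when a count reaches zero
 * the link is released and the decrement moves on to the next link, until
 * the one whose master is NULL completes the whole operation.  The
 * rec_link struct and whole_operation_done flag are hypothetical stand-ins
 * for r10_bio and md_done_sync().
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct rec_link {
	struct rec_link *master;	/* next link; NULL terminates the chain */
	atomic_int remaining;		/* outstanding sub-requests on this link */
};

static int whole_operation_done;

/* Called once per completed sub-request of 'link'. */
void rec_link_put(struct rec_link *link)
{
	while (atomic_fetch_sub(&link->remaining, 1) == 1) {
		struct rec_link *master = link->master;

		free(link);				/* cf. put_buf() */
		if (!master) {
			whole_operation_done = 1;	/* cf. md_done_sync() */
			break;
		}
		link = master;			/* drop our count on the next link */
	}
}

int main(void)
{
	/* head link (master == NULL) holding one count for the chained link,
	 * plus one chained link with a single sub-request of its own */
	struct rec_link *head = malloc(sizeof(*head));
	struct rec_link *sub = malloc(sizeof(*sub));

	head->master = NULL;
	atomic_init(&head->remaining, 1);
	sub->master = head;
	atomic_init(&sub->remaining, 1);

	rec_link_put(sub);			/* completes sub, then head */
	printf("done=%d\n", whole_operation_done);
	return 0;
}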
1064
1065/*
1066 * Note: sync and recovery are handled very differently for raid10
1067 * This code is for resync.
1068 * For resync, we read through virtual addresses and read all blocks.
1069 * If there is any error, we schedule a write. The lowest numbered
1070 * drive is authoritative.
1071 * However, requests come in by physical address, so we need to map.
1072 * For every physical address there are raid_disks/copies virtual addresses,
1073 * which is always at least one, but is not necessarily an integer.
1074 * This means that a physical address can span multiple chunks, so we may
1075 * have to submit multiple io requests for a single sync request.
1076 */
1077/*
1078 * We check if all blocks are in-sync and only write to blocks that
1079 * aren't in sync
1080 */
1081static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1082{
1083 conf_t *conf = mddev_to_conf(mddev);
1084 int i, first;
1085 struct bio *tbio, *fbio;
1086
1087 atomic_set(&r10_bio->remaining, 1);
1088
1089 /* find the first device with a block */
1090 for (i=0; i<conf->copies; i++)
1091 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1092 break;
1093
1094 if (i == conf->copies)
1095 goto done;
1096
1097 first = i;
1098 fbio = r10_bio->devs[i].bio;
1099
1100 /* now find blocks with errors */
1101 for (i=first+1 ; i < conf->copies ; i++) {
1102 int vcnt, j, d;
1103
1104 if (!test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1105 continue;
1106 /* We know that the bi_io_vec layout is the same for
1107 * both 'first' and 'i', so we just compare them.
1108 * All vec entries are PAGE_SIZE;
1109 */
1110 tbio = r10_bio->devs[i].bio;
1111 vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
1112 for (j = 0; j < vcnt; j++)
1113 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
1114 page_address(tbio->bi_io_vec[j].bv_page),
1115 PAGE_SIZE))
1116 break;
1117 if (j == vcnt)
1118 continue;
1119		/* Ok, we need to write this bio.
1120		 * First we need to fix up bv_offset, bv_len and
1121		 * bi_vecs, as the read request might have corrupted these.
1122 */
1123 tbio->bi_vcnt = vcnt;
1124 tbio->bi_size = r10_bio->sectors << 9;
1125 tbio->bi_idx = 0;
1126 tbio->bi_phys_segments = 0;
1127 tbio->bi_hw_segments = 0;
1128 tbio->bi_hw_front_size = 0;
1129 tbio->bi_hw_back_size = 0;
1130 tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1131 tbio->bi_flags |= 1 << BIO_UPTODATE;
1132 tbio->bi_next = NULL;
1133 tbio->bi_rw = WRITE;
1134 tbio->bi_private = r10_bio;
1135 tbio->bi_sector = r10_bio->devs[i].addr;
1136
1137 for (j=0; j < vcnt ; j++) {
1138 tbio->bi_io_vec[j].bv_offset = 0;
1139 tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
1140
1141 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
1142 page_address(fbio->bi_io_vec[j].bv_page),
1143 PAGE_SIZE);
1144 }
1145 tbio->bi_end_io = end_sync_write;
1146
1147 d = r10_bio->devs[i].devnum;
1148 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1149 atomic_inc(&r10_bio->remaining);
1150 md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
1151
1152 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
1153 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
1154 generic_make_request(tbio);
1155 }
1156
1157done:
1158 if (atomic_dec_and_test(&r10_bio->remaining)) {
1159 md_done_sync(mddev, r10_bio->sectors, 1);
1160 put_buf(r10_bio);
1161 }
1162}
1163
1164/*
1165 * Now for the recovery code.
1166 * Recovery happens across physical sectors.
1167 * We recover all non-in_sync drives by finding the virtual address of
1168 * each, and then choose a working drive that also has that virt address.
1169 * There is a separate r10_bio for each non-in_sync drive.
1170 * Only the first two slots are in use, the first for reading,
1171 * the second for writing.
1172 *
1173 */
1174
1175static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1176{
1177 conf_t *conf = mddev_to_conf(mddev);
1178 int i, d;
1179 struct bio *bio, *wbio;
1180
1181
1182 /* move the pages across to the second bio
1183 * and submit the write request
1184 */
1185 bio = r10_bio->devs[0].bio;
1186 wbio = r10_bio->devs[1].bio;
1187 for (i=0; i < wbio->bi_vcnt; i++) {
1188 struct page *p = bio->bi_io_vec[i].bv_page;
1189 bio->bi_io_vec[i].bv_page = wbio->bi_io_vec[i].bv_page;
1190 wbio->bi_io_vec[i].bv_page = p;
1191 }
1192 d = r10_bio->devs[1].devnum;
1193
1194 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1195 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
1196 generic_make_request(wbio);
1197}
1198
1199
1200/*
1201 * This is a kernel thread which:
1202 *
1203 * 1. Retries failed read operations on working mirrors.
1204 * 2. Updates the raid superblock when problems are encountered.
1205 * 3. Performs writes following reads for array synchronising.
1206 */
1207
1208static void raid10d(mddev_t *mddev)
1209{
1210 r10bio_t *r10_bio;
1211 struct bio *bio;
1212 unsigned long flags;
1213 conf_t *conf = mddev_to_conf(mddev);
1214 struct list_head *head = &conf->retry_list;
1215 int unplug=0;
1216 mdk_rdev_t *rdev;
1217
1218 md_check_recovery(mddev);
1219 md_handle_safemode(mddev);
1220
1221 for (;;) {
1222 char b[BDEVNAME_SIZE];
1223 spin_lock_irqsave(&conf->device_lock, flags);
1224 if (list_empty(head))
1225 break;
1226 r10_bio = list_entry(head->prev, r10bio_t, retry_list);
1227 list_del(head->prev);
1228 spin_unlock_irqrestore(&conf->device_lock, flags);
1229
1230 mddev = r10_bio->mddev;
1231 conf = mddev_to_conf(mddev);
1232 if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
1233 sync_request_write(mddev, r10_bio);
1234 unplug = 1;
1235 } else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) {
1236 recovery_request_write(mddev, r10_bio);
1237 unplug = 1;
1238 } else {
1239 int mirror;
1240 bio = r10_bio->devs[r10_bio->read_slot].bio;
1241 r10_bio->devs[r10_bio->read_slot].bio = NULL;
1242 bdevname(bio->bi_bdev, b); /* grab the name before dropping the bio */
1243 bio_put(bio);
1244 mirror = read_balance(conf, r10_bio);
1245 if (mirror == -1) {
1246 printk(KERN_ALERT "raid10: %s: unrecoverable I/O"
1247 " read error for block %llu\n", b,
1248 (unsigned long long)r10_bio->sector);
1249 raid_end_bio_io(r10_bio);
1250 } else {
1251 rdev = conf->mirrors[mirror].rdev;
1252 if (printk_ratelimit())
1253 printk(KERN_ERR "raid10: %s: redirecting sector %llu to"
1254 " another mirror\n",
1255 bdevname(rdev->bdev,b),
1256 (unsigned long long)r10_bio->sector);
1257 bio = bio_clone(r10_bio->master_bio, GFP_NOIO);
1258 r10_bio->devs[r10_bio->read_slot].bio = bio;
1259 bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr
1260 + rdev->data_offset;
1261 bio->bi_bdev = rdev->bdev;
1262 bio->bi_rw = READ;
1263 bio->bi_private = r10_bio;
1264 bio->bi_end_io = raid10_end_read_request;
1265 unplug = 1;
1266 generic_make_request(bio);
1267 }
1268 }
1269 }
1270 spin_unlock_irqrestore(&conf->device_lock, flags);
1271 if (unplug)
1272 unplug_slaves(mddev);
1273}
1274
1275
1276static int init_resync(conf_t *conf)
1277{
1278 int buffs;
1279
1280 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
1281 if (conf->r10buf_pool)
1282 BUG();
1283 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
1284 if (!conf->r10buf_pool)
1285 return -ENOMEM;
1286 conf->next_resync = 0;
1287 return 0;
1288}
1289
1290/*
1291 * perform a "sync" on one "block"
1292 *
1293 * We need to make sure that no normal I/O requests - particularly write
1294 * requests - conflict with active sync requests.
1295 *
1296 * This is achieved by tracking pending requests and a 'barrier' concept
1297 * that can be installed to exclude normal IO requests.
1298 *
1299 * Resync and recovery are handled very differently.
1300 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
1301 *
1302 * For resync, we iterate over virtual addresses, read all copies,
1303 * and update if there are differences. If only one copy is live,
1304 * skip it.
1305 * For recovery, we iterate over physical addresses, read a good
1306 * value for each non-in_sync drive, and over-write.
1307 *
1308 * So, for recovery we may have several outstanding complex requests for a
1309 * given address, one for each out-of-sync device. We model this by allocating
1310 * a number of r10_bio structures, one for each out-of-sync device.
1311 * As we setup these structures, we collect all bio's together into a list
1312 * which we then process collectively to add pages, and then process again
1313 * to pass to generic_make_request.
1314 *
1315 * The r10_bio structures are linked using a borrowed master_bio pointer.
1316 * This link is counted in ->remaining. When the r10_bio that points to NULL
1317 * has its remaining count decremented to 0, the whole complex operation
1318 * is complete.
1319 *
1320 */
1321
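/*
 * A minimal sketch (not part of the driver) of how the borrowed
 * master_bio chain described above can be torn down: each recovery
 * r10_bio stores the previous one, cast to a struct bio pointer, in
 * ->master_bio, and the chain ends at NULL.  put_buf() is the driver's
 * own release helper; the function name here is hypothetical.
 */
static void walk_recovery_chain_sketch(r10bio_t *last)
{
	while (last) {
		r10bio_t *prev = (r10bio_t *)last->master_bio;
		last->master_bio = NULL;	/* drop the borrowed link */
		put_buf(last);			/* release this r10_bio */
		last = prev;
	}
}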
1322static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster)
1323{
1324 conf_t *conf = mddev_to_conf(mddev);
1325 r10bio_t *r10_bio;
1326 struct bio *biolist = NULL, *bio;
1327 sector_t max_sector, nr_sectors;
1328 int disk;
1329 int i;
1330
1331 sector_t sectors_skipped = 0;
1332 int chunks_skipped = 0;
1333
1334 if (!conf->r10buf_pool)
1335 if (init_resync(conf))
1336 return -ENOMEM;
1337
1338 skipped:
1339 max_sector = mddev->size << 1;
1340 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
1341 max_sector = mddev->resync_max_sectors;
1342 if (sector_nr >= max_sector) {
1343 close_sync(conf);
1344 return sectors_skipped;
1345 }
1346 if (chunks_skipped >= conf->raid_disks) {
1347 /* if there has been nothing to do on any drive,
1348 * then there is nothing to do at all..
1349 */
1350 sector_t sec = max_sector - sector_nr;
1351 md_done_sync(mddev, sec, 1);
1352 return sec + sectors_skipped;
1353 }
1354
1355 /* make sure whole request will fit in a chunk - if chunks
1356 * are meaningful
1357 */
1358 if (conf->near_copies < conf->raid_disks &&
1359 max_sector > (sector_nr | conf->chunk_mask))
1360 max_sector = (sector_nr | conf->chunk_mask) + 1;
1361 /*
1362 * If there is non-resync activity waiting for us then
1363 * put in a delay to throttle resync.
1364 */
1365 if (!go_faster && waitqueue_active(&conf->wait_resume))
1366 msleep_interruptible(1000);
1367 device_barrier(conf, sector_nr + RESYNC_SECTORS);
1368
1369 /* Again, very different code for resync and recovery.
1370 * Both must result in an r10bio with a list of bios that
1371 * have bi_end_io, bi_sector, bi_bdev set,
1372 * and bi_private set to the r10bio.
1373 * For recovery, we may actually create several r10bios
1374 * with 2 bios in each, that correspond to the bios in the main one.
1375 * In this case, the subordinate r10bios link back through a
1376 * borrowed master_bio pointer, and the counter in the master
1377 * includes a ref from each subordinate.
1378 */
1379 /* First, we decide what to do and set ->bi_end_io
1380 * To end_sync_read if we want to read, and
1381 * end_sync_write if we will want to write.
1382 */
1383
1384 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1385 /* recovery... the complicated one */
1386 int i, j, k;
1387 r10_bio = NULL;
1388
1389 for (i=0 ; i<conf->raid_disks; i++)
1390 if (conf->mirrors[i].rdev &&
1391 !conf->mirrors[i].rdev->in_sync) {
1392 /* want to reconstruct this device */
1393 r10bio_t *rb2 = r10_bio;
1394
1395 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1396 spin_lock_irq(&conf->resync_lock);
1397 conf->nr_pending++;
1398 if (rb2) conf->barrier++;
1399 spin_unlock_irq(&conf->resync_lock);
1400 atomic_set(&r10_bio->remaining, 0);
1401
1402 r10_bio->master_bio = (struct bio*)rb2;
1403 if (rb2)
1404 atomic_inc(&rb2->remaining);
1405 r10_bio->mddev = mddev;
1406 set_bit(R10BIO_IsRecover, &r10_bio->state);
1407 r10_bio->sector = raid10_find_virt(conf, sector_nr, i);
1408 raid10_find_phys(conf, r10_bio);
1409 for (j=0; j<conf->copies;j++) {
1410 int d = r10_bio->devs[j].devnum;
1411 if (conf->mirrors[d].rdev &&
1412 conf->mirrors[d].rdev->in_sync) {
1413 /* This is where we read from */
1414 bio = r10_bio->devs[0].bio;
1415 bio->bi_next = biolist;
1416 biolist = bio;
1417 bio->bi_private = r10_bio;
1418 bio->bi_end_io = end_sync_read;
1419 bio->bi_rw = 0;
1420 bio->bi_sector = r10_bio->devs[j].addr +
1421 conf->mirrors[d].rdev->data_offset;
1422 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1423 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1424 atomic_inc(&r10_bio->remaining);
1425 /* and we write to 'i' */
1426
1427 for (k=0; k<conf->copies; k++)
1428 if (r10_bio->devs[k].devnum == i)
1429 break;
1430 bio = r10_bio->devs[1].bio;
1431 bio->bi_next = biolist;
1432 biolist = bio;
1433 bio->bi_private = r10_bio;
1434 bio->bi_end_io = end_sync_write;
1435 bio->bi_rw = 1;
1436 bio->bi_sector = r10_bio->devs[k].addr +
1437 conf->mirrors[i].rdev->data_offset;
1438 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1439
1440 r10_bio->devs[0].devnum = d;
1441 r10_bio->devs[1].devnum = i;
1442
1443 break;
1444 }
1445 }
1446 if (j == conf->copies) {
1447 BUG();
1448 }
1449 }
1450 if (biolist == NULL) {
1451 while (r10_bio) {
1452 r10bio_t *rb2 = r10_bio;
1453 r10_bio = (r10bio_t*) rb2->master_bio;
1454 rb2->master_bio = NULL;
1455 put_buf(rb2);
1456 }
1457 goto giveup;
1458 }
1459 } else {
1460 /* resync. Schedule a read for every block at this virt offset */
1461 int count = 0;
1462 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
1463
1464 spin_lock_irq(&conf->resync_lock);
1465 conf->nr_pending++;
1466 spin_unlock_irq(&conf->resync_lock);
1467
1468 r10_bio->mddev = mddev;
1469 atomic_set(&r10_bio->remaining, 0);
1470
1471 r10_bio->master_bio = NULL;
1472 r10_bio->sector = sector_nr;
1473 set_bit(R10BIO_IsSync, &r10_bio->state);
1474 raid10_find_phys(conf, r10_bio);
1475 r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
1476
1477 for (i=0; i<conf->copies; i++) {
1478 int d = r10_bio->devs[i].devnum;
1479 bio = r10_bio->devs[i].bio;
1480 bio->bi_end_io = NULL;
1481 if (conf->mirrors[d].rdev == NULL ||
1482 conf->mirrors[d].rdev->faulty)
1483 continue;
1484 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
1485 atomic_inc(&r10_bio->remaining);
1486 bio->bi_next = biolist;
1487 biolist = bio;
1488 bio->bi_private = r10_bio;
1489 bio->bi_end_io = end_sync_read;
1490 bio->bi_rw = 0;
1491 bio->bi_sector = r10_bio->devs[i].addr +
1492 conf->mirrors[d].rdev->data_offset;
1493 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
1494 count++;
1495 }
1496
1497 if (count < 2) {
1498 for (i=0; i<conf->copies; i++) {
1499 int d = r10_bio->devs[i].devnum;
1500 if (r10_bio->devs[i].bio->bi_end_io)
1501 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1502 }
1503 put_buf(r10_bio);
1504 biolist = NULL;
1505 goto giveup;
1506 }
1507 }
1508
1509 for (bio = biolist; bio ; bio=bio->bi_next) {
1510
1511 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
1512 if (bio->bi_end_io)
1513 bio->bi_flags |= 1 << BIO_UPTODATE;
1514 bio->bi_vcnt = 0;
1515 bio->bi_idx = 0;
1516 bio->bi_phys_segments = 0;
1517 bio->bi_hw_segments = 0;
1518 bio->bi_size = 0;
1519 }
1520
1521 nr_sectors = 0;
1522 do {
1523 struct page *page;
1524 int len = PAGE_SIZE;
1525 disk = 0;
1526 if (sector_nr + (len>>9) > max_sector)
1527 len = (max_sector - sector_nr) << 9;
1528 if (len == 0)
1529 break;
1530 for (bio= biolist ; bio ; bio=bio->bi_next) {
1531 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1532 if (bio_add_page(bio, page, len, 0) == 0) {
1533 /* stop here */
1534 struct bio *bio2;
1535 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1536 for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) {
1537 /* remove last page from this bio */
1538 bio2->bi_vcnt--;
1539 bio2->bi_size -= len;
1540 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
1541 }
1542 goto bio_full;
1543 }
1544 disk = i;
1545 }
1546 nr_sectors += len>>9;
1547 sector_nr += len>>9;
1548 } while (biolist->bi_vcnt < RESYNC_PAGES);
1549 bio_full:
1550 r10_bio->sectors = nr_sectors;
1551
1552 while (biolist) {
1553 bio = biolist;
1554 biolist = biolist->bi_next;
1555
1556 bio->bi_next = NULL;
1557 r10_bio = bio->bi_private;
1558 r10_bio->sectors = nr_sectors;
1559
1560 if (bio->bi_end_io == end_sync_read) {
1561 md_sync_acct(bio->bi_bdev, nr_sectors);
1562 generic_make_request(bio);
1563 }
1564 }
1565
1566 return sectors_skipped + nr_sectors;
1567 giveup:
1568 /* There is nowhere to write, so all non-sync
1569 * drives must have failed; try the next chunk...
1570 */
1571 {
1572 int sec = max_sector - sector_nr;
1573 sectors_skipped += sec;
1574 chunks_skipped ++;
1575 sector_nr = max_sector;
1576 md_done_sync(mddev, sec, 1);
1577 goto skipped;
1578 }
1579}
1580
1581static int run(mddev_t *mddev)
1582{
1583 conf_t *conf;
1584 int i, disk_idx;
1585 mirror_info_t *disk;
1586 mdk_rdev_t *rdev;
1587 struct list_head *tmp;
1588 int nc, fc;
1589 sector_t stride, size;
1590
1591 if (mddev->level != 10) {
1592 printk(KERN_ERR "raid10: %s: raid level not set correctly... (%d)\n",
1593 mdname(mddev), mddev->level);
1594 goto out;
1595 }
1596 nc = mddev->layout & 255;
1597 fc = (mddev->layout >> 8) & 255;
1598 if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
1599 (mddev->layout >> 16)) {
1600 printk(KERN_ERR "raid10: %s: unsupported raid10 layout: 0x%8x\n",
1601 mdname(mddev), mddev->layout);
1602 goto out;
1603 }
1604 /*
1605 * copy the already verified devices into our private RAID10
1606 * bookkeeping area. [whatever we allocate in run(),
1607 * should be freed in stop()]
1608 */
1609 conf = kmalloc(sizeof(conf_t), GFP_KERNEL);
1610 mddev->private = conf;
1611 if (!conf) {
1612 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
1613 mdname(mddev));
1614 goto out;
1615 }
1616 memset(conf, 0, sizeof(*conf));
1617 conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1618 GFP_KERNEL);
1619 if (!conf->mirrors) {
1620 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
1621 mdname(mddev));
1622 goto out_free_conf;
1623 }
1624 memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
1625
1626 conf->near_copies = nc;
1627 conf->far_copies = fc;
1628 conf->copies = nc*fc;
1629 conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
1630 conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
1631 stride = mddev->size >> (conf->chunk_shift-1);
1632 sector_div(stride, fc);
1633 conf->stride = stride << conf->chunk_shift;
1634
1635 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
1636 r10bio_pool_free, conf);
1637 if (!conf->r10bio_pool) {
1638 printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
1639 mdname(mddev));
1640 goto out_free_conf;
1641 }
1642 mddev->queue->unplug_fn = raid10_unplug;
1643
1644 mddev->queue->issue_flush_fn = raid10_issue_flush;
1645
1646 ITERATE_RDEV(mddev, rdev, tmp) {
1647 disk_idx = rdev->raid_disk;
1648 if (disk_idx >= mddev->raid_disks
1649 || disk_idx < 0)
1650 continue;
1651 disk = conf->mirrors + disk_idx;
1652
1653 disk->rdev = rdev;
1654
1655 blk_queue_stack_limits(mddev->queue,
1656 rdev->bdev->bd_disk->queue);
1657 /* as we don't honour merge_bvec_fn, we must never risk
1658 * violating it, so limit ->max_sectors to one PAGE, as
1659 * a one page request is never in violation.
1660 */
1661 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1662 mddev->queue->max_sectors > (PAGE_SIZE>>9))
1663 mddev->queue->max_sectors = (PAGE_SIZE>>9);
1664
1665 disk->head_position = 0;
1666 if (!rdev->faulty && rdev->in_sync)
1667 conf->working_disks++;
1668 }
1669 conf->raid_disks = mddev->raid_disks;
1670 conf->mddev = mddev;
1671 spin_lock_init(&conf->device_lock);
1672 INIT_LIST_HEAD(&conf->retry_list);
1673
1674 spin_lock_init(&conf->resync_lock);
1675 init_waitqueue_head(&conf->wait_idle);
1676 init_waitqueue_head(&conf->wait_resume);
1677
1678 if (!conf->working_disks) {
1679 printk(KERN_ERR "raid10: no operational mirrors for %s\n",
1680 mdname(mddev));
1681 goto out_free_conf;
1682 }
1683
1684 mddev->degraded = 0;
1685 for (i = 0; i < conf->raid_disks; i++) {
1686
1687 disk = conf->mirrors + i;
1688
1689 if (!disk->rdev) {
1690 disk->head_position = 0;
1691 mddev->degraded++;
1692 }
1693 }
1694
1695
1696 mddev->thread = md_register_thread(raid10d, mddev, "%s_raid10");
1697 if (!mddev->thread) {
1698 printk(KERN_ERR
1699 "raid10: couldn't allocate thread for %s\n",
1700 mdname(mddev));
1701 goto out_free_conf;
1702 }
1703
1704 printk(KERN_INFO
1705 "raid10: raid set %s active with %d out of %d devices\n",
1706 mdname(mddev), mddev->raid_disks - mddev->degraded,
1707 mddev->raid_disks);
1708 /*
1709 * Ok, everything is just fine now
1710 */
1711 size = conf->stride * conf->raid_disks;
1712 sector_div(size, conf->near_copies);
1713 mddev->array_size = size/2;
1714 mddev->resync_max_sectors = size;
1715
1716 /* Calculate max read-ahead size.
1717 * We need to readahead at least twice a whole stripe....
1718 * maybe...
1719 */
1720 {
1721 int stripe = conf->raid_disks * mddev->chunk_size / PAGE_CACHE_SIZE;
1722 stripe /= conf->near_copies;
1723 if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
1724 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
1725 }
1726
1727 if (conf->near_copies < mddev->raid_disks)
1728 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
1729 return 0;
1730
1731out_free_conf:
1732 if (conf->r10bio_pool)
1733 mempool_destroy(conf->r10bio_pool);
1734 if (conf->mirrors)
1735 kfree(conf->mirrors);
1736 kfree(conf);
1737 mddev->private = NULL;
1738out:
1739 return -EIO;
1740}
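/*
 * A small standalone sketch (not from the driver) of the layout-word
 * decoding done near the top of run() above: the low byte of
 * mddev->layout gives the number of "near" copies and the next byte
 * the number of "far" copies, so the common two-way near layout is
 * 0x0102.  The value used below is only an illustration.
 */
#include <stdio.h>

int main(void)
{
	unsigned int layout = 0x0102;		/* assumed example value */
	unsigned int nc = layout & 255;		/* near copies -> 2 */
	unsigned int fc = (layout >> 8) & 255;	/* far copies  -> 1 */

	printf("near=%u far=%u total copies=%u\n", nc, fc, nc * fc);
	return 0;
}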
1741
1742static int stop(mddev_t *mddev)
1743{
1744 conf_t *conf = mddev_to_conf(mddev);
1745
1746 md_unregister_thread(mddev->thread);
1747 mddev->thread = NULL;
1748 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
1749 if (conf->r10bio_pool)
1750 mempool_destroy(conf->r10bio_pool);
1751 if (conf->mirrors)
1752 kfree(conf->mirrors);
1753 kfree(conf);
1754 mddev->private = NULL;
1755 return 0;
1756}
1757
1758
1759static mdk_personality_t raid10_personality =
1760{
1761 .name = "raid10",
1762 .owner = THIS_MODULE,
1763 .make_request = make_request,
1764 .run = run,
1765 .stop = stop,
1766 .status = status,
1767 .error_handler = error,
1768 .hot_add_disk = raid10_add_disk,
1769 .hot_remove_disk= raid10_remove_disk,
1770 .spare_active = raid10_spare_active,
1771 .sync_request = sync_request,
1772};
1773
1774static int __init raid_init(void)
1775{
1776 return register_md_personality(RAID10, &raid10_personality);
1777}
1778
1779static void raid_exit(void)
1780{
1781 unregister_md_personality(RAID10);
1782}
1783
1784module_init(raid_init);
1785module_exit(raid_exit);
1786MODULE_LICENSE("GPL");
1787MODULE_ALIAS("md-personality-9"); /* RAID10 */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
new file mode 100644
index 000000000000..52c3a81c4aa7
--- /dev/null
+++ b/drivers/md/raid5.c
@@ -0,0 +1,1965 @@
1/*
2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 *
6 * RAID-5 management functions.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2, or (at your option)
11 * any later version.
12 *
13 * You should have received a copy of the GNU General Public License
14 * (for example /usr/src/linux/COPYING); if not, write to the Free
15 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16 */
17
18
19#include <linux/config.h>
20#include <linux/module.h>
21#include <linux/slab.h>
22#include <linux/raid/raid5.h>
23#include <linux/highmem.h>
24#include <linux/bitops.h>
25#include <asm/atomic.h>
26
27/*
28 * Stripe cache
29 */
30
31#define NR_STRIPES 256
32#define STRIPE_SIZE PAGE_SIZE
33#define STRIPE_SHIFT (PAGE_SHIFT - 9)
34#define STRIPE_SECTORS (STRIPE_SIZE>>9)
35#define IO_THRESHOLD 1
36#define HASH_PAGES 1
37#define HASH_PAGES_ORDER 0
38#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
39#define HASH_MASK (NR_HASH - 1)
40
41#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])
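/*
 * A tiny worked example (not from the driver) of the hashing above,
 * assuming 4 KiB pages and 8-byte pointers: NR_HASH is 512 buckets and
 * STRIPE_SHIFT is 3, so a stripe at sector 5000 lands in bucket
 * (5000 >> 3) & 511 == 625 & 511 == 113.
 */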
42
43/* bio's attached to a stripe+device for I/O are linked together in bi_sector
44 * order without overlap. There may be several bio's per stripe+device, and
45 * a bio could span several devices.
46 * When walking this list for a particular stripe+device, we must never proceed
47 * beyond a bio that extends past this device, as the next bio might no longer
48 * be valid.
49 * This macro is used to determine the 'next' bio in the list, given the sector
50 * of the current stripe+device
51 */
52#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
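/*
 * A short illustrative sketch (not part of the driver) of how
 * r5_next_bio() above is meant to be used: walk the bios queued
 * against one stripe+device, stopping as soon as a bio extends past
 * that device's STRIPE_SECTORS window.  The function name is
 * hypothetical; 'dev' is a struct r5dev with ->towrite and ->sector
 * filled in.
 */
static inline void walk_stripe_dev_bios_sketch(struct r5dev *dev)
{
	struct bio *b = dev->towrite;

	while (b && b->bi_sector < dev->sector + STRIPE_SECTORS)
		b = r5_next_bio(b, dev->sector);	/* NULL past the window */
}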
53/*
54 * The following can be used to debug the driver
55 */
56#define RAID5_DEBUG 0
57#define RAID5_PARANOIA 1
58#if RAID5_PARANOIA && defined(CONFIG_SMP)
59# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
60#else
61# define CHECK_DEVLOCK()
62#endif
63
64#define PRINTK(x...) ((void)(RAID5_DEBUG && printk(x)))
65#if RAID5_DEBUG
66#define inline
67#define __inline__
68#endif
69
70static void print_raid5_conf (raid5_conf_t *conf);
71
72static inline void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
73{
74 if (atomic_dec_and_test(&sh->count)) {
75 if (!list_empty(&sh->lru))
76 BUG();
77 if (atomic_read(&conf->active_stripes)==0)
78 BUG();
79 if (test_bit(STRIPE_HANDLE, &sh->state)) {
80 if (test_bit(STRIPE_DELAYED, &sh->state))
81 list_add_tail(&sh->lru, &conf->delayed_list);
82 else
83 list_add_tail(&sh->lru, &conf->handle_list);
84 md_wakeup_thread(conf->mddev->thread);
85 } else {
86 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
87 atomic_dec(&conf->preread_active_stripes);
88 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
89 md_wakeup_thread(conf->mddev->thread);
90 }
91 list_add_tail(&sh->lru, &conf->inactive_list);
92 atomic_dec(&conf->active_stripes);
93 if (!conf->inactive_blocked ||
94 atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
95 wake_up(&conf->wait_for_stripe);
96 }
97 }
98}
99static void release_stripe(struct stripe_head *sh)
100{
101 raid5_conf_t *conf = sh->raid_conf;
102 unsigned long flags;
103
104 spin_lock_irqsave(&conf->device_lock, flags);
105 __release_stripe(conf, sh);
106 spin_unlock_irqrestore(&conf->device_lock, flags);
107}
108
109static void remove_hash(struct stripe_head *sh)
110{
111 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
112
113 if (sh->hash_pprev) {
114 if (sh->hash_next)
115 sh->hash_next->hash_pprev = sh->hash_pprev;
116 *sh->hash_pprev = sh->hash_next;
117 sh->hash_pprev = NULL;
118 }
119}
120
121static __inline__ void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
122{
123 struct stripe_head **shp = &stripe_hash(conf, sh->sector);
124
125 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
126
127 CHECK_DEVLOCK();
128 if ((sh->hash_next = *shp) != NULL)
129 (*shp)->hash_pprev = &sh->hash_next;
130 *shp = sh;
131 sh->hash_pprev = shp;
132}
133
134
135/* find an idle stripe, make sure it is unhashed, and return it. */
136static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
137{
138 struct stripe_head *sh = NULL;
139 struct list_head *first;
140
141 CHECK_DEVLOCK();
142 if (list_empty(&conf->inactive_list))
143 goto out;
144 first = conf->inactive_list.next;
145 sh = list_entry(first, struct stripe_head, lru);
146 list_del_init(first);
147 remove_hash(sh);
148 atomic_inc(&conf->active_stripes);
149out:
150 return sh;
151}
152
153static void shrink_buffers(struct stripe_head *sh, int num)
154{
155 struct page *p;
156 int i;
157
158 for (i=0; i<num ; i++) {
159 p = sh->dev[i].page;
160 if (!p)
161 continue;
162 sh->dev[i].page = NULL;
163 page_cache_release(p);
164 }
165}
166
167static int grow_buffers(struct stripe_head *sh, int num)
168{
169 int i;
170
171 for (i=0; i<num; i++) {
172 struct page *page;
173
174 if (!(page = alloc_page(GFP_KERNEL))) {
175 return 1;
176 }
177 sh->dev[i].page = page;
178 }
179 return 0;
180}
181
182static void raid5_build_block (struct stripe_head *sh, int i);
183
184static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
185{
186 raid5_conf_t *conf = sh->raid_conf;
187 int disks = conf->raid_disks, i;
188
189 if (atomic_read(&sh->count) != 0)
190 BUG();
191 if (test_bit(STRIPE_HANDLE, &sh->state))
192 BUG();
193
194 CHECK_DEVLOCK();
195 PRINTK("init_stripe called, stripe %llu\n",
196 (unsigned long long)sh->sector);
197
198 remove_hash(sh);
199
200 sh->sector = sector;
201 sh->pd_idx = pd_idx;
202 sh->state = 0;
203
204 for (i=disks; i--; ) {
205 struct r5dev *dev = &sh->dev[i];
206
207 if (dev->toread || dev->towrite || dev->written ||
208 test_bit(R5_LOCKED, &dev->flags)) {
209 printk("sector=%llx i=%d %p %p %p %d\n",
210 (unsigned long long)sh->sector, i, dev->toread,
211 dev->towrite, dev->written,
212 test_bit(R5_LOCKED, &dev->flags));
213 BUG();
214 }
215 dev->flags = 0;
216 raid5_build_block(sh, i);
217 }
218 insert_hash(conf, sh);
219}
220
221static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
222{
223 struct stripe_head *sh;
224
225 CHECK_DEVLOCK();
226 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
227 for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
228 if (sh->sector == sector)
229 return sh;
230 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
231 return NULL;
232}
233
234static void unplug_slaves(mddev_t *mddev);
235static void raid5_unplug_device(request_queue_t *q);
236
237static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector,
238 int pd_idx, int noblock)
239{
240 struct stripe_head *sh;
241
242 PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector);
243
244 spin_lock_irq(&conf->device_lock);
245
246 do {
247 sh = __find_stripe(conf, sector);
248 if (!sh) {
249 if (!conf->inactive_blocked)
250 sh = get_free_stripe(conf);
251 if (noblock && sh == NULL)
252 break;
253 if (!sh) {
254 conf->inactive_blocked = 1;
255 wait_event_lock_irq(conf->wait_for_stripe,
256 !list_empty(&conf->inactive_list) &&
257 (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
258 || !conf->inactive_blocked),
259 conf->device_lock,
260 unplug_slaves(conf->mddev);
261 );
262 conf->inactive_blocked = 0;
263 } else
264 init_stripe(sh, sector, pd_idx);
265 } else {
266 if (atomic_read(&sh->count)) {
267 if (!list_empty(&sh->lru))
268 BUG();
269 } else {
270 if (!test_bit(STRIPE_HANDLE, &sh->state))
271 atomic_inc(&conf->active_stripes);
272 if (list_empty(&sh->lru))
273 BUG();
274 list_del_init(&sh->lru);
275 }
276 }
277 } while (sh == NULL);
278
279 if (sh)
280 atomic_inc(&sh->count);
281
282 spin_unlock_irq(&conf->device_lock);
283 return sh;
284}
285
286static int grow_stripes(raid5_conf_t *conf, int num)
287{
288 struct stripe_head *sh;
289 kmem_cache_t *sc;
290 int devs = conf->raid_disks;
291
292 sprintf(conf->cache_name, "raid5/%s", mdname(conf->mddev));
293
294 sc = kmem_cache_create(conf->cache_name,
295 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
296 0, 0, NULL, NULL);
297 if (!sc)
298 return 1;
299 conf->slab_cache = sc;
300 while (num--) {
301 sh = kmem_cache_alloc(sc, GFP_KERNEL);
302 if (!sh)
303 return 1;
304 memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
305 sh->raid_conf = conf;
306 spin_lock_init(&sh->lock);
307
308 if (grow_buffers(sh, conf->raid_disks)) {
309 shrink_buffers(sh, conf->raid_disks);
310 kmem_cache_free(sc, sh);
311 return 1;
312 }
313 /* we just created an active stripe so... */
314 atomic_set(&sh->count, 1);
315 atomic_inc(&conf->active_stripes);
316 INIT_LIST_HEAD(&sh->lru);
317 release_stripe(sh);
318 }
319 return 0;
320}
321
322static void shrink_stripes(raid5_conf_t *conf)
323{
324 struct stripe_head *sh;
325
326 while (1) {
327 spin_lock_irq(&conf->device_lock);
328 sh = get_free_stripe(conf);
329 spin_unlock_irq(&conf->device_lock);
330 if (!sh)
331 break;
332 if (atomic_read(&sh->count))
333 BUG();
334 shrink_buffers(sh, conf->raid_disks);
335 kmem_cache_free(conf->slab_cache, sh);
336 atomic_dec(&conf->active_stripes);
337 }
338 kmem_cache_destroy(conf->slab_cache);
339 conf->slab_cache = NULL;
340}
341
342static int raid5_end_read_request (struct bio * bi, unsigned int bytes_done,
343 int error)
344{
345 struct stripe_head *sh = bi->bi_private;
346 raid5_conf_t *conf = sh->raid_conf;
347 int disks = conf->raid_disks, i;
348 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
349
350 if (bi->bi_size)
351 return 1;
352
353 for (i=0 ; i<disks; i++)
354 if (bi == &sh->dev[i].req)
355 break;
356
357 PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
358 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
359 uptodate);
360 if (i == disks) {
361 BUG();
362 return 0;
363 }
364
365 if (uptodate) {
366#if 0
367 struct bio *bio;
368 unsigned long flags;
369 spin_lock_irqsave(&conf->device_lock, flags);
370 /* we can return a buffer if we bypassed the cache or
371 * if the top buffer is not in highmem. If there are
372 * multiple buffers, leave the extra work to
373 * handle_stripe
374 */
375 buffer = sh->bh_read[i];
376 if (buffer &&
377 (!PageHighMem(buffer->b_page)
378 || buffer->b_page == bh->b_page )
379 ) {
380 sh->bh_read[i] = buffer->b_reqnext;
381 buffer->b_reqnext = NULL;
382 } else
383 buffer = NULL;
384 spin_unlock_irqrestore(&conf->device_lock, flags);
385 if (sh->bh_page[i]==bh->b_page)
386 set_buffer_uptodate(bh);
387 if (buffer) {
388 if (buffer->b_page != bh->b_page)
389 memcpy(buffer->b_data, bh->b_data, bh->b_size);
390 buffer->b_end_io(buffer, 1);
391 }
392#else
393 set_bit(R5_UPTODATE, &sh->dev[i].flags);
394#endif
395 } else {
396 md_error(conf->mddev, conf->disks[i].rdev);
397 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
398 }
399 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
400#if 0
401 /* must restore b_page before unlocking buffer... */
402 if (sh->bh_page[i] != bh->b_page) {
403 bh->b_page = sh->bh_page[i];
404 bh->b_data = page_address(bh->b_page);
405 clear_buffer_uptodate(bh);
406 }
407#endif
408 clear_bit(R5_LOCKED, &sh->dev[i].flags);
409 set_bit(STRIPE_HANDLE, &sh->state);
410 release_stripe(sh);
411 return 0;
412}
413
414static int raid5_end_write_request (struct bio *bi, unsigned int bytes_done,
415 int error)
416{
417 struct stripe_head *sh = bi->bi_private;
418 raid5_conf_t *conf = sh->raid_conf;
419 int disks = conf->raid_disks, i;
420 unsigned long flags;
421 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
422
423 if (bi->bi_size)
424 return 1;
425
426 for (i=0 ; i<disks; i++)
427 if (bi == &sh->dev[i].req)
428 break;
429
430 PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
431 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
432 uptodate);
433 if (i == disks) {
434 BUG();
435 return 0;
436 }
437
438 spin_lock_irqsave(&conf->device_lock, flags);
439 if (!uptodate)
440 md_error(conf->mddev, conf->disks[i].rdev);
441
442 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
443
444 clear_bit(R5_LOCKED, &sh->dev[i].flags);
445 set_bit(STRIPE_HANDLE, &sh->state);
446 __release_stripe(conf, sh);
447 spin_unlock_irqrestore(&conf->device_lock, flags);
448 return 0;
449}
450
451
452static sector_t compute_blocknr(struct stripe_head *sh, int i);
453
454static void raid5_build_block (struct stripe_head *sh, int i)
455{
456 struct r5dev *dev = &sh->dev[i];
457
458 bio_init(&dev->req);
459 dev->req.bi_io_vec = &dev->vec;
460 dev->req.bi_vcnt++;
461 dev->req.bi_max_vecs++;
462 dev->vec.bv_page = dev->page;
463 dev->vec.bv_len = STRIPE_SIZE;
464 dev->vec.bv_offset = 0;
465
466 dev->req.bi_sector = sh->sector;
467 dev->req.bi_private = sh;
468
469 dev->flags = 0;
470 if (i != sh->pd_idx)
471 dev->sector = compute_blocknr(sh, i);
472}
473
474static void error(mddev_t *mddev, mdk_rdev_t *rdev)
475{
476 char b[BDEVNAME_SIZE];
477 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
478 PRINTK("raid5: error called\n");
479
480 if (!rdev->faulty) {
481 mddev->sb_dirty = 1;
482 if (rdev->in_sync) {
483 conf->working_disks--;
484 mddev->degraded++;
485 conf->failed_disks++;
486 rdev->in_sync = 0;
487 /*
488 * if recovery was running, make sure it aborts.
489 */
490 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
491 }
492 rdev->faulty = 1;
493 printk (KERN_ALERT
494 "raid5: Disk failure on %s, disabling device."
495 " Operation continuing on %d devices\n",
496 bdevname(rdev->bdev,b), conf->working_disks);
497 }
498}
499
500/*
501 * Input: a 'big' sector number,
502 * Output: index of the data and parity disk, and the sector # in them.
503 */
504static sector_t raid5_compute_sector(sector_t r_sector, unsigned int raid_disks,
505 unsigned int data_disks, unsigned int * dd_idx,
506 unsigned int * pd_idx, raid5_conf_t *conf)
507{
508 long stripe;
509 unsigned long chunk_number;
510 unsigned int chunk_offset;
511 sector_t new_sector;
512 int sectors_per_chunk = conf->chunk_size >> 9;
513
514 /* First compute the information on this sector */
515
516 /*
517 * Compute the chunk number and the sector offset inside the chunk
518 */
519 chunk_offset = sector_div(r_sector, sectors_per_chunk);
520 chunk_number = r_sector;
521 BUG_ON(r_sector != chunk_number);
522
523 /*
524 * Compute the stripe number
525 */
526 stripe = chunk_number / data_disks;
527
528 /*
529 * Compute the data disk and parity disk indexes inside the stripe
530 */
531 *dd_idx = chunk_number % data_disks;
532
533 /*
534 * Select the parity disk based on the user selected algorithm.
535 */
536 if (conf->level == 4)
537 *pd_idx = data_disks;
538 else switch (conf->algorithm) {
539 case ALGORITHM_LEFT_ASYMMETRIC:
540 *pd_idx = data_disks - stripe % raid_disks;
541 if (*dd_idx >= *pd_idx)
542 (*dd_idx)++;
543 break;
544 case ALGORITHM_RIGHT_ASYMMETRIC:
545 *pd_idx = stripe % raid_disks;
546 if (*dd_idx >= *pd_idx)
547 (*dd_idx)++;
548 break;
549 case ALGORITHM_LEFT_SYMMETRIC:
550 *pd_idx = data_disks - stripe % raid_disks;
551 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
552 break;
553 case ALGORITHM_RIGHT_SYMMETRIC:
554 *pd_idx = stripe % raid_disks;
555 *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
556 break;
557 default:
558 printk("raid5: unsupported algorithm %d\n",
559 conf->algorithm);
560 }
561
562 /*
563 * Finally, compute the new sector number
564 */
565 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
566 return new_sector;
567}
568
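/*
 * A worked example (not from the driver) of the mapping implemented by
 * raid5_compute_sector() above, restricted to ALGORITHM_LEFT_SYMMETRIC
 * and assuming 4 member disks with 64 KiB (128-sector) chunks.  Array
 * sector 1000 then maps to data disk 3 at device sector 360, with the
 * parity for that stripe on disk 1.
 */
#include <stdio.h>

int main(void)
{
	unsigned long r_sector = 1000;		/* assumed array sector */
	unsigned int raid_disks = 4, data_disks = 3;
	unsigned int sectors_per_chunk = 128;	/* 64 KiB chunks */

	unsigned int chunk_offset = r_sector % sectors_per_chunk;	/* 104 */
	unsigned long chunk_number = r_sector / sectors_per_chunk;	/* 7 */
	unsigned long stripe = chunk_number / data_disks;		/* 2 */
	unsigned int dd_idx = chunk_number % data_disks;		/* 1 */

	/* left-symmetric: parity rotates, data indexes follow the parity disk */
	unsigned int pd_idx = data_disks - stripe % raid_disks;		/* 1 */
	dd_idx = (pd_idx + 1 + dd_idx) % raid_disks;			/* 3 */

	printf("data disk %u, parity disk %u, device sector %lu\n",
	       dd_idx, pd_idx, stripe * sectors_per_chunk + chunk_offset);
	return 0;
}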
569
570static sector_t compute_blocknr(struct stripe_head *sh, int i)
571{
572 raid5_conf_t *conf = sh->raid_conf;
573 int raid_disks = conf->raid_disks, data_disks = raid_disks - 1;
574 sector_t new_sector = sh->sector, check;
575 int sectors_per_chunk = conf->chunk_size >> 9;
576 sector_t stripe;
577 int chunk_offset;
578 int chunk_number, dummy1, dummy2, dd_idx = i;
579 sector_t r_sector;
580
581 chunk_offset = sector_div(new_sector, sectors_per_chunk);
582 stripe = new_sector;
583 BUG_ON(new_sector != stripe);
584
585
586 switch (conf->algorithm) {
587 case ALGORITHM_LEFT_ASYMMETRIC:
588 case ALGORITHM_RIGHT_ASYMMETRIC:
589 if (i > sh->pd_idx)
590 i--;
591 break;
592 case ALGORITHM_LEFT_SYMMETRIC:
593 case ALGORITHM_RIGHT_SYMMETRIC:
594 if (i < sh->pd_idx)
595 i += raid_disks;
596 i -= (sh->pd_idx + 1);
597 break;
598 default:
599 printk("raid5: unsupported algorithm %d\n",
600 conf->algorithm);
601 }
602
603 chunk_number = stripe * data_disks + i;
604 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
605
606 check = raid5_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
607 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
608 printk("compute_blocknr: map not correct\n");
609 return 0;
610 }
611 return r_sector;
612}
613
614
615
616/*
617 * Copy data between a page in the stripe cache, and a bio.
618 * There are no alignment or size guarantees between the page or the
619 * bio except that there is some overlap.
620 * All iovecs in the bio must be considered.
621 */
622static void copy_data(int frombio, struct bio *bio,
623 struct page *page,
624 sector_t sector)
625{
626 char *pa = page_address(page);
627 struct bio_vec *bvl;
628 int i;
629 int page_offset;
630
631 if (bio->bi_sector >= sector)
632 page_offset = (signed)(bio->bi_sector - sector) * 512;
633 else
634 page_offset = (signed)(sector - bio->bi_sector) * -512;
635 bio_for_each_segment(bvl, bio, i) {
636 int len = bio_iovec_idx(bio,i)->bv_len;
637 int clen;
638 int b_offset = 0;
639
640 if (page_offset < 0) {
641 b_offset = -page_offset;
642 page_offset += b_offset;
643 len -= b_offset;
644 }
645
646 if (len > 0 && page_offset + len > STRIPE_SIZE)
647 clen = STRIPE_SIZE - page_offset;
648 else clen = len;
649
650 if (clen > 0) {
651 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
652 if (frombio)
653 memcpy(pa+page_offset, ba+b_offset, clen);
654 else
655 memcpy(ba+b_offset, pa+page_offset, clen);
656 __bio_kunmap_atomic(ba, KM_USER0);
657 }
658 if (clen < len) /* hit end of page */
659 break;
660 page_offset += len;
661 }
662}
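/*
 * A worked example (not from the driver) of the offset arithmetic in
 * copy_data() above: if the bio starts 3 sectors after the stripe page
 * (bi_sector == sector + 3), page_offset is 3 * 512 = 1536 and copying
 * begins 1536 bytes into the cache page; if it starts 2 sectors before
 * the page, page_offset is -1024 and the first 1024 bytes of the bio
 * are skipped (b_offset) before any bytes are copied.
 */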
663
664#define check_xor() do { \
665 if (count == MAX_XOR_BLOCKS) { \
666 xor_block(count, STRIPE_SIZE, ptr); \
667 count = 1; \
668 } \
669 } while(0)
670
671
672static void compute_block(struct stripe_head *sh, int dd_idx)
673{
674 raid5_conf_t *conf = sh->raid_conf;
675 int i, count, disks = conf->raid_disks;
676 void *ptr[MAX_XOR_BLOCKS], *p;
677
678 PRINTK("compute_block, stripe %llu, idx %d\n",
679 (unsigned long long)sh->sector, dd_idx);
680
681 ptr[0] = page_address(sh->dev[dd_idx].page);
682 memset(ptr[0], 0, STRIPE_SIZE);
683 count = 1;
684 for (i = disks ; i--; ) {
685 if (i == dd_idx)
686 continue;
687 p = page_address(sh->dev[i].page);
688 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
689 ptr[count++] = p;
690 else
691 printk("compute_block() %d, stripe %llu, %d"
692 " not present\n", dd_idx,
693 (unsigned long long)sh->sector, i);
694
695 check_xor();
696 }
697 if (count != 1)
698 xor_block(count, STRIPE_SIZE, ptr);
699 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
700}
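/*
 * A minimal sketch (not part of the driver) of the idea behind
 * compute_block()/check_xor() above: a missing block is simply the XOR
 * of every other block in the stripe.  xor_into() is a hypothetical
 * helper; the real code batches source pages through xor_block() in
 * groups of at most MAX_XOR_BLOCKS.
 */
static void xor_into(unsigned char *dst, const unsigned char *src, size_t len)
{
	size_t k;

	for (k = 0; k < len; k++)
		dst[k] ^= src[k];	/* accumulate one source into the target */
}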
701
702static void compute_parity(struct stripe_head *sh, int method)
703{
704 raid5_conf_t *conf = sh->raid_conf;
705 int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
706 void *ptr[MAX_XOR_BLOCKS];
707 struct bio *chosen;
708
709 PRINTK("compute_parity, stripe %llu, method %d\n",
710 (unsigned long long)sh->sector, method);
711
712 count = 1;
713 ptr[0] = page_address(sh->dev[pd_idx].page);
714 switch(method) {
715 case READ_MODIFY_WRITE:
716 if (!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags))
717 BUG();
718 for (i=disks ; i-- ;) {
719 if (i==pd_idx)
720 continue;
721 if (sh->dev[i].towrite &&
722 test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
723 ptr[count++] = page_address(sh->dev[i].page);
724 chosen = sh->dev[i].towrite;
725 sh->dev[i].towrite = NULL;
726
727 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
728 wake_up(&conf->wait_for_overlap);
729
730 if (sh->dev[i].written) BUG();
731 sh->dev[i].written = chosen;
732 check_xor();
733 }
734 }
735 break;
736 case RECONSTRUCT_WRITE:
737 memset(ptr[0], 0, STRIPE_SIZE);
738 for (i= disks; i-- ;)
739 if (i!=pd_idx && sh->dev[i].towrite) {
740 chosen = sh->dev[i].towrite;
741 sh->dev[i].towrite = NULL;
742
743 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
744 wake_up(&conf->wait_for_overlap);
745
746 if (sh->dev[i].written) BUG();
747 sh->dev[i].written = chosen;
748 }
749 break;
750 case CHECK_PARITY:
751 break;
752 }
753 if (count>1) {
754 xor_block(count, STRIPE_SIZE, ptr);
755 count = 1;
756 }
757
758 for (i = disks; i--;)
759 if (sh->dev[i].written) {
760 sector_t sector = sh->dev[i].sector;
761 struct bio *wbi = sh->dev[i].written;
762 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
763 copy_data(1, wbi, sh->dev[i].page, sector);
764 wbi = r5_next_bio(wbi, sector);
765 }
766
767 set_bit(R5_LOCKED, &sh->dev[i].flags);
768 set_bit(R5_UPTODATE, &sh->dev[i].flags);
769 }
770
771 switch(method) {
772 case RECONSTRUCT_WRITE:
773 case CHECK_PARITY:
774 for (i=disks; i--;)
775 if (i != pd_idx) {
776 ptr[count++] = page_address(sh->dev[i].page);
777 check_xor();
778 }
779 break;
780 case READ_MODIFY_WRITE:
781 for (i = disks; i--;)
782 if (sh->dev[i].written) {
783 ptr[count++] = page_address(sh->dev[i].page);
784 check_xor();
785 }
786 }
787 if (count != 1)
788 xor_block(count, STRIPE_SIZE, ptr);
789
790 if (method != CHECK_PARITY) {
791 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
792 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
793 } else
794 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
795}
796
797/*
798 * Each stripe/dev can have one or more bion attached.
799 * toread/towrite point to the first in a chain.
800 * The bi_next chain must be in order.
801 */
802static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
803{
804 struct bio **bip;
805 raid5_conf_t *conf = sh->raid_conf;
806
807 PRINTK("adding bh b#%llu to stripe s#%llu\n",
808 (unsigned long long)bi->bi_sector,
809 (unsigned long long)sh->sector);
810
811
812 spin_lock(&sh->lock);
813 spin_lock_irq(&conf->device_lock);
814 if (forwrite)
815 bip = &sh->dev[dd_idx].towrite;
816 else
817 bip = &sh->dev[dd_idx].toread;
818 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
819 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
820 goto overlap;
821 bip = & (*bip)->bi_next;
822 }
823 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
824 goto overlap;
825
826 if (*bip && bi->bi_next && (*bip) != bi->bi_next)
827 BUG();
828 if (*bip)
829 bi->bi_next = *bip;
830 *bip = bi;
831 bi->bi_phys_segments ++;
832 spin_unlock_irq(&conf->device_lock);
833 spin_unlock(&sh->lock);
834
835 PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n",
836 (unsigned long long)bi->bi_sector,
837 (unsigned long long)sh->sector, dd_idx);
838
839 if (forwrite) {
840 /* check if page is covered */
841 sector_t sector = sh->dev[dd_idx].sector;
842 for (bi=sh->dev[dd_idx].towrite;
843 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
844 bi && bi->bi_sector <= sector;
845 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
846 if (bi->bi_sector + (bi->bi_size>>9) >= sector)
847 sector = bi->bi_sector + (bi->bi_size>>9);
848 }
849 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
850 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
851 }
852 return 1;
853
854 overlap:
855 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
856 spin_unlock_irq(&conf->device_lock);
857 spin_unlock(&sh->lock);
858 return 0;
859}
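/*
 * A compact standalone sketch (not from the driver) of the "is the
 * whole page covered" walk at the end of add_stripe_bio() above:
 * advance a cursor across write bios sorted by start sector and report
 * whether they cover the full STRIPE_SECTORS window.  The function and
 * its array parameters are hypothetical.
 */
static int covers_stripe_sketch(const unsigned long long start[],
				const unsigned long long len[], int n,
				unsigned long long dev_sector,
				unsigned long long stripe_sectors)
{
	unsigned long long cursor = dev_sector;
	int i;

	for (i = 0; i < n && start[i] <= cursor; i++)
		if (start[i] + len[i] >= cursor)
			cursor = start[i] + len[i];	/* extend coverage */

	return cursor >= dev_sector + stripe_sectors;	/* fully overwritten? */
}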
860
861
862/*
863 * handle_stripe - do things to a stripe.
864 *
865 * We lock the stripe and then examine the state of various bits
866 * to see what needs to be done.
867 * Possible results:
868 * return some read requests which now have data
869 * return some write requests which are safely on disc
870 * schedule a read on some buffers
871 * schedule a write of some buffers
872 * return confirmation of parity correctness
873 *
874 * Parity calculations are done inside the stripe lock.
875 * Buffers are taken off read_list or write_list, and bh_cache buffers
876 * get BH_Lock set before the stripe lock is released.
877 *
878 */
879
880static void handle_stripe(struct stripe_head *sh)
881{
882 raid5_conf_t *conf = sh->raid_conf;
883 int disks = conf->raid_disks;
884 struct bio *return_bi= NULL;
885 struct bio *bi;
886 int i;
887 int syncing;
888 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
889 int non_overwrite = 0;
890 int failed_num=0;
891 struct r5dev *dev;
892
893 PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
894 (unsigned long long)sh->sector, atomic_read(&sh->count),
895 sh->pd_idx);
896
897 spin_lock(&sh->lock);
898 clear_bit(STRIPE_HANDLE, &sh->state);
899 clear_bit(STRIPE_DELAYED, &sh->state);
900
901 syncing = test_bit(STRIPE_SYNCING, &sh->state);
902 /* Now to look around and see what can be done */
903
904 for (i=disks; i--; ) {
905 mdk_rdev_t *rdev;
906 dev = &sh->dev[i];
907 clear_bit(R5_Insync, &dev->flags);
908 clear_bit(R5_Syncio, &dev->flags);
909
910 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
911 i, dev->flags, dev->toread, dev->towrite, dev->written);
912 /* maybe we can reply to a read */
913 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
914 struct bio *rbi, *rbi2;
915 PRINTK("Return read for disc %d\n", i);
916 spin_lock_irq(&conf->device_lock);
917 rbi = dev->toread;
918 dev->toread = NULL;
919 if (test_and_clear_bit(R5_Overlap, &dev->flags))
920 wake_up(&conf->wait_for_overlap);
921 spin_unlock_irq(&conf->device_lock);
922 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
923 copy_data(0, rbi, dev->page, dev->sector);
924 rbi2 = r5_next_bio(rbi, dev->sector);
925 spin_lock_irq(&conf->device_lock);
926 if (--rbi->bi_phys_segments == 0) {
927 rbi->bi_next = return_bi;
928 return_bi = rbi;
929 }
930 spin_unlock_irq(&conf->device_lock);
931 rbi = rbi2;
932 }
933 }
934
935 /* now count some things */
936 if (test_bit(R5_LOCKED, &dev->flags)) locked++;
937 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
938
939
940 if (dev->toread) to_read++;
941 if (dev->towrite) {
942 to_write++;
943 if (!test_bit(R5_OVERWRITE, &dev->flags))
944 non_overwrite++;
945 }
946 if (dev->written) written++;
947 rdev = conf->disks[i].rdev; /* FIXME, should I be looking at rdev? */
948 if (!rdev || !rdev->in_sync) {
949 failed++;
950 failed_num = i;
951 } else
952 set_bit(R5_Insync, &dev->flags);
953 }
954 PRINTK("locked=%d uptodate=%d to_read=%d"
955 " to_write=%d failed=%d failed_num=%d\n",
956 locked, uptodate, to_read, to_write, failed, failed_num);
957 /* check if the array has lost two devices and, if so, some requests might
958 * need to be failed
959 */
960 if (failed > 1 && to_read+to_write+written) {
961 spin_lock_irq(&conf->device_lock);
962 for (i=disks; i--; ) {
963 /* fail all writes first */
964 bi = sh->dev[i].towrite;
965 sh->dev[i].towrite = NULL;
966 if (bi) to_write--;
967
968 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
969 wake_up(&conf->wait_for_overlap);
970
971 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
972 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
973 clear_bit(BIO_UPTODATE, &bi->bi_flags);
974 if (--bi->bi_phys_segments == 0) {
975 md_write_end(conf->mddev);
976 bi->bi_next = return_bi;
977 return_bi = bi;
978 }
979 bi = nextbi;
980 }
981 /* and fail all 'written' */
982 bi = sh->dev[i].written;
983 sh->dev[i].written = NULL;
984 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
985 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
986 clear_bit(BIO_UPTODATE, &bi->bi_flags);
987 if (--bi->bi_phys_segments == 0) {
988 md_write_end(conf->mddev);
989 bi->bi_next = return_bi;
990 return_bi = bi;
991 }
992 bi = bi2;
993 }
994
995 /* fail any reads if this device is non-operational */
996 if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
997 bi = sh->dev[i].toread;
998 sh->dev[i].toread = NULL;
999 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1000 wake_up(&conf->wait_for_overlap);
1001 if (bi) to_read--;
1002 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1003 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1004 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1005 if (--bi->bi_phys_segments == 0) {
1006 bi->bi_next = return_bi;
1007 return_bi = bi;
1008 }
1009 bi = nextbi;
1010 }
1011 }
1012 }
1013 spin_unlock_irq(&conf->device_lock);
1014 }
1015 if (failed > 1 && syncing) {
1016 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
1017 clear_bit(STRIPE_SYNCING, &sh->state);
1018 syncing = 0;
1019 }
1020
1021 /* might be able to return some write requests if the parity block
1022 * is safe, or on a failed drive
1023 */
1024 dev = &sh->dev[sh->pd_idx];
1025 if ( written &&
1026 ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
1027 test_bit(R5_UPTODATE, &dev->flags))
1028 || (failed == 1 && failed_num == sh->pd_idx))
1029 ) {
1030 /* any written block on an uptodate or failed drive can be returned.
1031 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
1032 * never LOCKED, so we don't need to test 'failed' directly.
1033 */
1034 for (i=disks; i--; )
1035 if (sh->dev[i].written) {
1036 dev = &sh->dev[i];
1037 if (!test_bit(R5_LOCKED, &dev->flags) &&
1038 test_bit(R5_UPTODATE, &dev->flags) ) {
1039 /* We can return any write requests */
1040 struct bio *wbi, *wbi2;
1041 PRINTK("Return write for disc %d\n", i);
1042 spin_lock_irq(&conf->device_lock);
1043 wbi = dev->written;
1044 dev->written = NULL;
1045 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1046 wbi2 = r5_next_bio(wbi, dev->sector);
1047 if (--wbi->bi_phys_segments == 0) {
1048 md_write_end(conf->mddev);
1049 wbi->bi_next = return_bi;
1050 return_bi = wbi;
1051 }
1052 wbi = wbi2;
1053 }
1054 spin_unlock_irq(&conf->device_lock);
1055 }
1056 }
1057 }
1058
1059 /* Now we might consider reading some blocks, either to check/generate
1060 * parity, or to satisfy requests
1061 * or to load a block that is being partially written.
1062 */
1063 if (to_read || non_overwrite || (syncing && (uptodate < disks))) {
1064 for (i=disks; i--;) {
1065 dev = &sh->dev[i];
1066 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1067 (dev->toread ||
1068 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1069 syncing ||
1070 (failed && (sh->dev[failed_num].toread ||
1071 (sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
1072 )
1073 ) {
1074 /* we would like to get this block, possibly
1075 * by computing it, but we might not be able to
1076 */
1077 if (uptodate == disks-1) {
1078 PRINTK("Computing block %d\n", i);
1079 compute_block(sh, i);
1080 uptodate++;
1081 } else if (test_bit(R5_Insync, &dev->flags)) {
1082 set_bit(R5_LOCKED, &dev->flags);
1083 set_bit(R5_Wantread, &dev->flags);
1084#if 0
1085 /* if I am just reading this block and we don't have
1086 a failed drive, or any pending writes then sidestep the cache */
1087 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
1088 ! syncing && !failed && !to_write) {
1089 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
1090 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
1091 }
1092#endif
1093 locked++;
1094 PRINTK("Reading block %d (sync=%d)\n",
1095 i, syncing);
1096 if (syncing)
1097 md_sync_acct(conf->disks[i].rdev->bdev,
1098 STRIPE_SECTORS);
1099 }
1100 }
1101 }
1102 set_bit(STRIPE_HANDLE, &sh->state);
1103 }
1104
1105 /* now to consider writing and what else, if anything should be read */
1106 if (to_write) {
1107 int rmw=0, rcw=0;
1108 for (i=disks ; i--;) {
1109 /* would I have to read this buffer for read_modify_write */
1110 dev = &sh->dev[i];
1111 if ((dev->towrite || i == sh->pd_idx) &&
1112 (!test_bit(R5_LOCKED, &dev->flags)
1113#if 0
1114|| sh->bh_page[i]!=bh->b_page
1115#endif
1116 ) &&
1117 !test_bit(R5_UPTODATE, &dev->flags)) {
1118 if (test_bit(R5_Insync, &dev->flags)
1119/* && !(!mddev->insync && i == sh->pd_idx) */
1120 )
1121 rmw++;
1122 else rmw += 2*disks; /* cannot read it */
1123 }
1124 /* Would I have to read this buffer for reconstruct_write */
1125 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1126 (!test_bit(R5_LOCKED, &dev->flags)
1127#if 0
1128|| sh->bh_page[i] != bh->b_page
1129#endif
1130 ) &&
1131 !test_bit(R5_UPTODATE, &dev->flags)) {
1132 if (test_bit(R5_Insync, &dev->flags)) rcw++;
1133 else rcw += 2*disks;
1134 }
1135 }
1136 PRINTK("for sector %llu, rmw=%d rcw=%d\n",
1137 (unsigned long long)sh->sector, rmw, rcw);
1138 set_bit(STRIPE_HANDLE, &sh->state);
1139 if (rmw < rcw && rmw > 0)
1140 /* prefer read-modify-write, but need to get some data */
1141 for (i=disks; i--;) {
1142 dev = &sh->dev[i];
1143 if ((dev->towrite || i == sh->pd_idx) &&
1144 !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1145 test_bit(R5_Insync, &dev->flags)) {
1146 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1147 {
1148 PRINTK("Read_old block %d for r-m-w\n", i);
1149 set_bit(R5_LOCKED, &dev->flags);
1150 set_bit(R5_Wantread, &dev->flags);
1151 locked++;
1152 } else {
1153 set_bit(STRIPE_DELAYED, &sh->state);
1154 set_bit(STRIPE_HANDLE, &sh->state);
1155 }
1156 }
1157 }
1158 if (rcw <= rmw && rcw > 0)
1159 /* want reconstruct write, but need to get some data */
1160 for (i=disks; i--;) {
1161 dev = &sh->dev[i];
1162 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
1163 !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1164 test_bit(R5_Insync, &dev->flags)) {
1165 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1166 {
1167 PRINTK("Read_old block %d for Reconstruct\n", i);
1168 set_bit(R5_LOCKED, &dev->flags);
1169 set_bit(R5_Wantread, &dev->flags);
1170 locked++;
1171 } else {
1172 set_bit(STRIPE_DELAYED, &sh->state);
1173 set_bit(STRIPE_HANDLE, &sh->state);
1174 }
1175 }
1176 }
1177 /* now if nothing is locked, and if we have enough data, we can start a write request */
1178 if (locked == 0 && (rcw == 0 ||rmw == 0)) {
1179 PRINTK("Computing parity...\n");
1180 compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
1181 /* now every locked buffer is ready to be written */
1182 for (i=disks; i--;)
1183 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1184 PRINTK("Writing block %d\n", i);
1185 locked++;
1186 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1187 if (!test_bit(R5_Insync, &sh->dev[i].flags)
1188 || (i==sh->pd_idx && failed == 0))
1189 set_bit(STRIPE_INSYNC, &sh->state);
1190 }
1191 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1192 atomic_dec(&conf->preread_active_stripes);
1193 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1194 md_wakeup_thread(conf->mddev->thread);
1195 }
1196 }
1197 }
1198
1199 /* maybe we need to check and possibly fix the parity for this stripe
1200 * Any reads will already have been scheduled, so we just see if enough data
1201 * is available
1202 */
1203 if (syncing && locked == 0 &&
1204 !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
1205 set_bit(STRIPE_HANDLE, &sh->state);
1206 if (failed == 0) {
1207 char *pagea;
1208 if (uptodate != disks)
1209 BUG();
1210 compute_parity(sh, CHECK_PARITY);
1211 uptodate--;
1212 pagea = page_address(sh->dev[sh->pd_idx].page);
1213 if ((*(u32*)pagea) == 0 &&
1214 !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
1215 /* parity is correct (on disc, not in buffer any more) */
1216 set_bit(STRIPE_INSYNC, &sh->state);
1217 }
1218 }
1219 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1220 if (failed==0)
1221 failed_num = sh->pd_idx;
1222 /* should be able to compute the missing block and write it to spare */
1223 if (!test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)) {
1224 if (uptodate+1 != disks)
1225 BUG();
1226 compute_block(sh, failed_num);
1227 uptodate++;
1228 }
1229 if (uptodate != disks)
1230 BUG();
1231 dev = &sh->dev[failed_num];
1232 set_bit(R5_LOCKED, &dev->flags);
1233 set_bit(R5_Wantwrite, &dev->flags);
1234 locked++;
1235 set_bit(STRIPE_INSYNC, &sh->state);
1236 set_bit(R5_Syncio, &dev->flags);
1237 }
1238 }
1239 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1240 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1241 clear_bit(STRIPE_SYNCING, &sh->state);
1242 }
1243
1244 spin_unlock(&sh->lock);
1245
1246 while ((bi=return_bi)) {
1247 int bytes = bi->bi_size;
1248
1249 return_bi = bi->bi_next;
1250 bi->bi_next = NULL;
1251 bi->bi_size = 0;
1252 bi->bi_end_io(bi, bytes, 0);
1253 }
1254 for (i=disks; i-- ;) {
1255 int rw;
1256 struct bio *bi;
1257 mdk_rdev_t *rdev;
1258 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
1259 rw = 1;
1260 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1261 rw = 0;
1262 else
1263 continue;
1264
1265 bi = &sh->dev[i].req;
1266
1267 bi->bi_rw = rw;
1268 if (rw)
1269 bi->bi_end_io = raid5_end_write_request;
1270 else
1271 bi->bi_end_io = raid5_end_read_request;
1272
1273 rcu_read_lock();
1274 rdev = conf->disks[i].rdev;
1275 if (rdev && rdev->faulty)
1276 rdev = NULL;
1277 if (rdev)
1278 atomic_inc(&rdev->nr_pending);
1279 rcu_read_unlock();
1280
1281 if (rdev) {
1282 if (test_bit(R5_Syncio, &sh->dev[i].flags))
1283 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1284
1285 bi->bi_bdev = rdev->bdev;
1286 PRINTK("for %llu schedule op %ld on disc %d\n",
1287 (unsigned long long)sh->sector, bi->bi_rw, i);
1288 atomic_inc(&sh->count);
1289 bi->bi_sector = sh->sector + rdev->data_offset;
1290 bi->bi_flags = 1 << BIO_UPTODATE;
1291 bi->bi_vcnt = 1;
1292 bi->bi_max_vecs = 1;
1293 bi->bi_idx = 0;
1294 bi->bi_io_vec = &sh->dev[i].vec;
1295 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1296 bi->bi_io_vec[0].bv_offset = 0;
1297 bi->bi_size = STRIPE_SIZE;
1298 bi->bi_next = NULL;
1299 generic_make_request(bi);
1300 } else {
1301 PRINTK("skip op %ld on disc %d for sector %llu\n",
1302 bi->bi_rw, i, (unsigned long long)sh->sector);
1303 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1304 set_bit(STRIPE_HANDLE, &sh->state);
1305 }
1306 }
1307}
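/*
 * A simplified standalone sketch (not from the driver) of the
 * read-modify-write versus reconstruct-write choice made in
 * handle_stripe() above: rmw counts the reads needed to update parity
 * in place (old copies of the written blocks plus old parity), rcw the
 * reads needed to rebuild parity from the blocks that will not be
 * overwritten.  The locked/in-sync penalties are omitted and all names
 * here are hypothetical.
 */
static int prefer_rmw_sketch(const int towrite[], const int overwrite[],
			     const int uptodate[], int disks, int pd_idx)
{
	int i, rmw = 0, rcw = 0;

	for (i = 0; i < disks; i++) {
		/* r-m-w must read old data of written blocks and old parity */
		if ((towrite[i] || i == pd_idx) && !uptodate[i])
			rmw++;
		/* reconstruct-write must read every block it won't overwrite */
		if (!overwrite[i] && i != pd_idx && !uptodate[i])
			rcw++;
	}
	return rmw > 0 && rmw < rcw;	/* non-zero: r-m-w reads fewer blocks */
}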
1308
1309static inline void raid5_activate_delayed(raid5_conf_t *conf)
1310{
1311 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
1312 while (!list_empty(&conf->delayed_list)) {
1313 struct list_head *l = conf->delayed_list.next;
1314 struct stripe_head *sh;
1315 sh = list_entry(l, struct stripe_head, lru);
1316 list_del_init(l);
1317 clear_bit(STRIPE_DELAYED, &sh->state);
1318 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1319 atomic_inc(&conf->preread_active_stripes);
1320 list_add_tail(&sh->lru, &conf->handle_list);
1321 }
1322 }
1323}
1324
1325static void unplug_slaves(mddev_t *mddev)
1326{
1327 raid5_conf_t *conf = mddev_to_conf(mddev);
1328 int i;
1329
1330 rcu_read_lock();
1331 for (i=0; i<mddev->raid_disks; i++) {
1332 mdk_rdev_t *rdev = conf->disks[i].rdev;
1333 if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) {
1334 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
1335
1336 atomic_inc(&rdev->nr_pending);
1337 rcu_read_unlock();
1338
1339 if (r_queue->unplug_fn)
1340 r_queue->unplug_fn(r_queue);
1341
1342 rdev_dec_pending(rdev, mddev);
1343 rcu_read_lock();
1344 }
1345 }
1346 rcu_read_unlock();
1347}
1348
1349static void raid5_unplug_device(request_queue_t *q)
1350{
1351 mddev_t *mddev = q->queuedata;
1352 raid5_conf_t *conf = mddev_to_conf(mddev);
1353 unsigned long flags;
1354
1355 spin_lock_irqsave(&conf->device_lock, flags);
1356
1357 if (blk_remove_plug(q))
1358 raid5_activate_delayed(conf);
1359 md_wakeup_thread(mddev->thread);
1360
1361 spin_unlock_irqrestore(&conf->device_lock, flags);
1362
1363 unplug_slaves(mddev);
1364}
1365
1366static int raid5_issue_flush(request_queue_t *q, struct gendisk *disk,
1367 sector_t *error_sector)
1368{
1369 mddev_t *mddev = q->queuedata;
1370 raid5_conf_t *conf = mddev_to_conf(mddev);
1371 int i, ret = 0;
1372
1373 rcu_read_lock();
1374 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
1375 mdk_rdev_t *rdev = conf->disks[i].rdev;
1376 if (rdev && !rdev->faulty) {
1377 struct block_device *bdev = rdev->bdev;
1378 request_queue_t *r_queue = bdev_get_queue(bdev);
1379
1380 if (!r_queue->issue_flush_fn)
1381 ret = -EOPNOTSUPP;
1382 else {
1383 atomic_inc(&rdev->nr_pending);
1384 rcu_read_unlock();
1385 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
1386 error_sector);
1387 rdev_dec_pending(rdev, mddev);
1388 rcu_read_lock();
1389 }
1390 }
1391 }
1392 rcu_read_unlock();
1393 return ret;
1394}
1395
1396static inline void raid5_plug_device(raid5_conf_t *conf)
1397{
1398 spin_lock_irq(&conf->device_lock);
1399 blk_plug_device(conf->mddev->queue);
1400 spin_unlock_irq(&conf->device_lock);
1401}
1402
1403static int make_request (request_queue_t *q, struct bio * bi)
1404{
1405 mddev_t *mddev = q->queuedata;
1406 raid5_conf_t *conf = mddev_to_conf(mddev);
1407 const unsigned int raid_disks = conf->raid_disks;
1408 const unsigned int data_disks = raid_disks - 1;
1409 unsigned int dd_idx, pd_idx;
1410 sector_t new_sector;
1411 sector_t logical_sector, last_sector;
1412 struct stripe_head *sh;
1413
1414 if (bio_data_dir(bi)==WRITE) {
1415 disk_stat_inc(mddev->gendisk, writes);
1416 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
1417 } else {
1418 disk_stat_inc(mddev->gendisk, reads);
1419 disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi));
1420 }
1421
1422 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
1423 last_sector = bi->bi_sector + (bi->bi_size>>9);
1424 bi->bi_next = NULL;
1425 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
1426 if ( bio_data_dir(bi) == WRITE )
1427 md_write_start(mddev);
1428 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1429 DEFINE_WAIT(w);
1430
1431 new_sector = raid5_compute_sector(logical_sector,
1432 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1433
1434 PRINTK("raid5: make_request, sector %llu logical %llu\n",
1435 (unsigned long long)new_sector,
1436 (unsigned long long)logical_sector);
1437
1438 retry:
1439 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
1440 sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
1441 if (sh) {
1442 if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
1443 /* Add failed due to overlap. Flush everything
1444 * and wait a while
1445 */
1446 raid5_unplug_device(mddev->queue);
1447 release_stripe(sh);
1448 schedule();
1449 goto retry;
1450 }
1451 finish_wait(&conf->wait_for_overlap, &w);
1452 raid5_plug_device(conf);
1453 handle_stripe(sh);
1454 release_stripe(sh);
1455
1456 } else {
1457 /* cannot get stripe for read-ahead, just give up */
1458 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1459 finish_wait(&conf->wait_for_overlap, &w);
1460 break;
1461 }
1462
1463 }
1464 spin_lock_irq(&conf->device_lock);
1465 if (--bi->bi_phys_segments == 0) {
1466 int bytes = bi->bi_size;
1467
1468 if ( bio_data_dir(bi) == WRITE )
1469 md_write_end(mddev);
1470 bi->bi_size = 0;
1471 bi->bi_end_io(bi, bytes, 0);
1472 }
1473 spin_unlock_irq(&conf->device_lock);
1474 return 0;
1475}
1476
1477/* FIXME go_faster isn't used */
1478static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
1479{
1480 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1481 struct stripe_head *sh;
1482 int sectors_per_chunk = conf->chunk_size >> 9;
1483 sector_t x;
1484 unsigned long stripe;
1485 int chunk_offset;
1486 int dd_idx, pd_idx;
1487 sector_t first_sector;
1488 int raid_disks = conf->raid_disks;
1489 int data_disks = raid_disks-1;
1490
1491 if (sector_nr >= mddev->size <<1) {
1492 /* just being told to finish up .. nothing much to do */
1493 unplug_slaves(mddev);
1494 return 0;
1495 }
1496 /* if one or more drives have failed and we are trying
1497 * to resync, then assert that we are finished, because there is
1498 * nothing we can do.
1499 */
1500 if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1501 int rv = (mddev->size << 1) - sector_nr;
1502 md_done_sync(mddev, rv, 1);
1503 return rv;
1504 }
1505
1506 x = sector_nr;
1507 chunk_offset = sector_div(x, sectors_per_chunk);
1508 stripe = x;
1509 BUG_ON(x != stripe);
1510
1511 first_sector = raid5_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
1512 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1513 sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
1514 if (sh == NULL) {
1515 sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
1516 /* make sure we don't swamp the stripe cache if someone else
1517 * is trying to get access
1518 */
1519 set_current_state(TASK_UNINTERRUPTIBLE);
1520 schedule_timeout(1);
1521 }
1522 spin_lock(&sh->lock);
1523 set_bit(STRIPE_SYNCING, &sh->state);
1524 clear_bit(STRIPE_INSYNC, &sh->state);
1525 spin_unlock(&sh->lock);
1526
1527 handle_stripe(sh);
1528 release_stripe(sh);
1529
1530 return STRIPE_SECTORS;
1531}
1532
1533/*
1534 * This is our raid5 kernel thread.
1535 *
1536 * We scan the hash table for stripes which can be handled now.
1537 * During the scan, completed stripes are saved for us by the interrupt
1538 * handler, so that they will not have to wait for our next wakeup.
1539 */
1540static void raid5d (mddev_t *mddev)
1541{
1542 struct stripe_head *sh;
1543 raid5_conf_t *conf = mddev_to_conf(mddev);
1544 int handled;
1545
1546 PRINTK("+++ raid5d active\n");
1547
1548 md_check_recovery(mddev);
1549 md_handle_safemode(mddev);
1550
1551 handled = 0;
1552 spin_lock_irq(&conf->device_lock);
1553 while (1) {
1554 struct list_head *first;
1555
1556 if (list_empty(&conf->handle_list) &&
1557 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1558 !blk_queue_plugged(mddev->queue) &&
1559 !list_empty(&conf->delayed_list))
1560 raid5_activate_delayed(conf);
1561
1562 if (list_empty(&conf->handle_list))
1563 break;
1564
1565 first = conf->handle_list.next;
1566 sh = list_entry(first, struct stripe_head, lru);
1567
1568 list_del_init(first);
1569 atomic_inc(&sh->count);
1570 if (atomic_read(&sh->count)!= 1)
1571 BUG();
1572 spin_unlock_irq(&conf->device_lock);
1573
1574 handled++;
1575 handle_stripe(sh);
1576 release_stripe(sh);
1577
1578 spin_lock_irq(&conf->device_lock);
1579 }
1580 PRINTK("%d stripes handled\n", handled);
1581
1582 spin_unlock_irq(&conf->device_lock);
1583
1584 unplug_slaves(mddev);
1585
1586 PRINTK("--- raid5d inactive\n");
1587}
1588
1589static int run (mddev_t *mddev)
1590{
1591 raid5_conf_t *conf;
1592 int raid_disk, memory;
1593 mdk_rdev_t *rdev;
1594 struct disk_info *disk;
1595 struct list_head *tmp;
1596
1597 if (mddev->level != 5 && mddev->level != 4) {
1598 printk("raid5: %s: raid level not set to 4/5 (%d)\n", mdname(mddev), mddev->level);
1599 return -EIO;
1600 }
1601
1602 mddev->private = kmalloc (sizeof (raid5_conf_t)
1603 + mddev->raid_disks * sizeof(struct disk_info),
1604 GFP_KERNEL);
1605 if ((conf = mddev->private) == NULL)
1606 goto abort;
1607 memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) );
1608 conf->mddev = mddev;
1609
1610 if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
1611 goto abort;
1612 memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1613
1614 spin_lock_init(&conf->device_lock);
1615 init_waitqueue_head(&conf->wait_for_stripe);
1616 init_waitqueue_head(&conf->wait_for_overlap);
1617 INIT_LIST_HEAD(&conf->handle_list);
1618 INIT_LIST_HEAD(&conf->delayed_list);
1619 INIT_LIST_HEAD(&conf->inactive_list);
1620 atomic_set(&conf->active_stripes, 0);
1621 atomic_set(&conf->preread_active_stripes, 0);
1622
1623 mddev->queue->unplug_fn = raid5_unplug_device;
1624 mddev->queue->issue_flush_fn = raid5_issue_flush;
1625
1626 PRINTK("raid5: run(%s) called.\n", mdname(mddev));
1627
1628 ITERATE_RDEV(mddev,rdev,tmp) {
1629 raid_disk = rdev->raid_disk;
1630 if (raid_disk >= mddev->raid_disks
1631 || raid_disk < 0)
1632 continue;
1633 disk = conf->disks + raid_disk;
1634
1635 disk->rdev = rdev;
1636
1637 if (rdev->in_sync) {
1638 char b[BDEVNAME_SIZE];
1639 printk(KERN_INFO "raid5: device %s operational as raid"
1640 " disk %d\n", bdevname(rdev->bdev,b),
1641 raid_disk);
1642 conf->working_disks++;
1643 }
1644 }
1645
1646 conf->raid_disks = mddev->raid_disks;
1647 /*
1648 * 0 for a fully functional array, 1 for a degraded array.
1649 */
1650 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
1651 conf->mddev = mddev;
1652 conf->chunk_size = mddev->chunk_size;
1653 conf->level = mddev->level;
1654 conf->algorithm = mddev->layout;
1655 conf->max_nr_stripes = NR_STRIPES;
1656
1657 /* device size must be a multiple of chunk size */
1658 mddev->size &= ~(mddev->chunk_size/1024 -1);
1659
1660 if (!conf->chunk_size || conf->chunk_size % 4) {
1661 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
1662 conf->chunk_size, mdname(mddev));
1663 goto abort;
1664 }
1665 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
1666 printk(KERN_ERR
1667 "raid5: unsupported parity algorithm %d for %s\n",
1668 conf->algorithm, mdname(mddev));
1669 goto abort;
1670 }
1671 if (mddev->degraded > 1) {
1672 printk(KERN_ERR "raid5: not enough operational devices for %s"
1673 " (%d/%d failed)\n",
1674 mdname(mddev), conf->failed_disks, conf->raid_disks);
1675 goto abort;
1676 }
1677
1678 if (mddev->degraded == 1 &&
1679 mddev->recovery_cp != MaxSector) {
1680 printk(KERN_ERR
1681 "raid5: cannot start dirty degraded array for %s\n",
1682 mdname(mddev));
1683 goto abort;
1684 }
1685
1686 {
1687 mddev->thread = md_register_thread(raid5d, mddev, "%s_raid5");
1688 if (!mddev->thread) {
1689 printk(KERN_ERR
1690 "raid5: couldn't allocate thread for %s\n",
1691 mdname(mddev));
1692 goto abort;
1693 }
1694 }
1695 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1696 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
1697 if (grow_stripes(conf, conf->max_nr_stripes)) {
1698 printk(KERN_ERR
1699 "raid5: couldn't allocate %dkB for buffers\n", memory);
1700 shrink_stripes(conf);
1701 md_unregister_thread(mddev->thread);
1702 goto abort;
1703 } else
1704 printk(KERN_INFO "raid5: allocated %dkB for %s\n",
1705 memory, mdname(mddev));
1706
1707 if (mddev->degraded == 0)
1708 printk("raid5: raid level %d set %s active with %d out of %d"
1709 " devices, algorithm %d\n", conf->level, mdname(mddev),
1710 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
1711 conf->algorithm);
1712 else
1713 printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
1714 " out of %d devices, algorithm %d\n", conf->level,
1715 mdname(mddev), mddev->raid_disks - mddev->degraded,
1716 mddev->raid_disks, conf->algorithm);
1717
1718 print_raid5_conf(conf);
1719
1720 /* read-ahead size must cover two whole stripes, which is
1721 * 2 * (n-1) * chunksize where 'n' is the number of raid devices
1722 */
1723 {
1724 int stripe = (mddev->raid_disks-1) * mddev->chunk_size
1725 / PAGE_CACHE_SIZE;
1726 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
1727 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
1728 }
1729
1730 /* Ok, everything is just fine now */
1731 mddev->array_size = mddev->size * (mddev->raid_disks - 1);
1732 return 0;
1733abort:
1734 if (conf) {
1735 print_raid5_conf(conf);
1736 if (conf->stripe_hashtbl)
1737 free_pages((unsigned long) conf->stripe_hashtbl,
1738 HASH_PAGES_ORDER);
1739 kfree(conf);
1740 }
1741 mddev->private = NULL;
1742 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
1743 return -EIO;
1744}
1745
1746
1747
1748static int stop (mddev_t *mddev)
1749{
1750 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1751
1752 md_unregister_thread(mddev->thread);
1753 mddev->thread = NULL;
1754 shrink_stripes(conf);
1755 free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
1756 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
1757 kfree(conf);
1758 mddev->private = NULL;
1759 return 0;
1760}
1761
1762#if RAID5_DEBUG
1763static void print_sh (struct stripe_head *sh)
1764{
1765 int i;
1766
1767 printk("sh %llu, pd_idx %d, state %ld.\n",
1768 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
1769 printk("sh %llu, count %d.\n",
1770 (unsigned long long)sh->sector, atomic_read(&sh->count));
1771 printk("sh %llu, ", (unsigned long long)sh->sector);
1772 for (i = 0; i < sh->raid_conf->raid_disks; i++) {
1773 printk("(cache%d: %p %ld) ",
1774 i, sh->dev[i].page, sh->dev[i].flags);
1775 }
1776 printk("\n");
1777}
1778
1779static void printall (raid5_conf_t *conf)
1780{
1781 struct stripe_head *sh;
1782 int i;
1783
1784 spin_lock_irq(&conf->device_lock);
1785 for (i = 0; i < NR_HASH; i++) {
1786 sh = conf->stripe_hashtbl[i];
1787 for (; sh; sh = sh->hash_next) {
1788 if (sh->raid_conf != conf)
1789 continue;
1790 print_sh(sh);
1791 }
1792 }
1793 spin_unlock_irq(&conf->device_lock);
1794}
1795#endif
1796
1797static void status (struct seq_file *seq, mddev_t *mddev)
1798{
1799 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1800 int i;
1801
1802 seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
1803 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
1804 for (i = 0; i < conf->raid_disks; i++)
1805 seq_printf (seq, "%s",
1806 conf->disks[i].rdev &&
1807 conf->disks[i].rdev->in_sync ? "U" : "_");
1808 seq_printf (seq, "]");
1809#if RAID5_DEBUG
1810#define D(x) \
1811 seq_printf (seq, "<"#x":%d>", atomic_read(&conf->x))
1812 printall(conf);
1813#endif
1814}
1815
1816static void print_raid5_conf (raid5_conf_t *conf)
1817{
1818 int i;
1819 struct disk_info *tmp;
1820
1821 printk("RAID5 conf printout:\n");
1822 if (!conf) {
1823 printk("(conf==NULL)\n");
1824 return;
1825 }
1826 printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
1827 conf->working_disks, conf->failed_disks);
1828
1829 for (i = 0; i < conf->raid_disks; i++) {
1830 char b[BDEVNAME_SIZE];
1831 tmp = conf->disks + i;
1832 if (tmp->rdev)
1833 printk(" disk %d, o:%d, dev:%s\n",
1834 i, !tmp->rdev->faulty,
1835 bdevname(tmp->rdev->bdev,b));
1836 }
1837}
1838
1839static int raid5_spare_active(mddev_t *mddev)
1840{
1841 int i;
1842 raid5_conf_t *conf = mddev->private;
1843 struct disk_info *tmp;
1844
1845 for (i = 0; i < conf->raid_disks; i++) {
1846 tmp = conf->disks + i;
1847 if (tmp->rdev
1848 && !tmp->rdev->faulty
1849 && !tmp->rdev->in_sync) {
1850 mddev->degraded--;
1851 conf->failed_disks--;
1852 conf->working_disks++;
1853 tmp->rdev->in_sync = 1;
1854 }
1855 }
1856 print_raid5_conf(conf);
1857 return 0;
1858}
1859
1860static int raid5_remove_disk(mddev_t *mddev, int number)
1861{
1862 raid5_conf_t *conf = mddev->private;
1863 int err = 0;
1864 mdk_rdev_t *rdev;
1865 struct disk_info *p = conf->disks + number;
1866
1867 print_raid5_conf(conf);
1868 rdev = p->rdev;
1869 if (rdev) {
1870 if (rdev->in_sync ||
1871 atomic_read(&rdev->nr_pending)) {
1872 err = -EBUSY;
1873 goto abort;
1874 }
1875 p->rdev = NULL;
1876 synchronize_kernel();
1877 if (atomic_read(&rdev->nr_pending)) {
1878 /* lost the race, try later */
1879 err = -EBUSY;
1880 p->rdev = rdev;
1881 }
1882 }
1883abort:
1884
1885 print_raid5_conf(conf);
1886 return err;
1887}
1888
1889static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1890{
1891 raid5_conf_t *conf = mddev->private;
1892 int found = 0;
1893 int disk;
1894 struct disk_info *p;
1895
1896 if (mddev->degraded > 1)
1897 /* no point adding a device */
1898 return 0;
1899
1900 /*
1901 * find the disk ...
1902 */
1903 for (disk=0; disk < mddev->raid_disks; disk++)
1904 if ((p=conf->disks + disk)->rdev == NULL) {
1905 rdev->in_sync = 0;
1906 rdev->raid_disk = disk;
1907 found = 1;
1908 p->rdev = rdev;
1909 break;
1910 }
1911 print_raid5_conf(conf);
1912 return found;
1913}
1914
1915static int raid5_resize(mddev_t *mddev, sector_t sectors)
1916{
1917 /* no resync is happening, and there is enough space
1918 * on all devices, so we can resize.
1919 * We need to make sure resync covers any new space.
1920 * If the array is shrinking we should possibly wait until
1921 * any io in the removed space completes, but it hardly seems
1922 * worth it.
1923 */
1924 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
1925 mddev->array_size = (sectors * (mddev->raid_disks-1))>>1;
1926 set_capacity(mddev->gendisk, mddev->array_size << 1);
1927 mddev->changed = 1;
1928 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
1929 mddev->recovery_cp = mddev->size << 1;
1930 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1931 }
1932 mddev->size = sectors /2;
1933 return 0;
1934}
1935
1936static mdk_personality_t raid5_personality=
1937{
1938 .name = "raid5",
1939 .owner = THIS_MODULE,
1940 .make_request = make_request,
1941 .run = run,
1942 .stop = stop,
1943 .status = status,
1944 .error_handler = error,
1945 .hot_add_disk = raid5_add_disk,
1946 .hot_remove_disk= raid5_remove_disk,
1947 .spare_active = raid5_spare_active,
1948 .sync_request = sync_request,
1949 .resize = raid5_resize,
1950};
1951
1952static int __init raid5_init (void)
1953{
1954 return register_md_personality (RAID5, &raid5_personality);
1955}
1956
1957static void raid5_exit (void)
1958{
1959 unregister_md_personality (RAID5);
1960}
1961
1962module_init(raid5_init);
1963module_exit(raid5_exit);
1964MODULE_LICENSE("GPL");
1965MODULE_ALIAS("md-personality-4"); /* RAID5 */
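
The make_request() path above rounds the bio's start sector down to a STRIPE_SECTORS boundary and takes one stripe_head per STRIPE_SECTORS chunk, with bi_phys_segments overloaded as the per-stripe reference count until every stripe has been handled. The following stand-alone user-space sketch (not kernel code; walk_bio() is a made-up helper, and STRIPE_SECTORS is hard-coded to 8, i.e. a 4 KiB stripe unit as in this file) shows the same arithmetic.

/* Minimal user-space sketch (not kernel code) of the arithmetic used by
 * make_request() above: the start sector is rounded down to a
 * STRIPE_SECTORS boundary and one stripe_head is taken per STRIPE_SECTORS
 * chunk, with bi_phys_segments acting as the per-stripe reference count.
 * walk_bio() is a made-up helper; STRIPE_SECTORS is 8 (4096 >> 9) as in
 * this file.
 */
#include <stdio.h>
#include <stdint.h>

#define STRIPE_SECTORS 8ULL	/* STRIPE_SIZE (PAGE_SIZE) >> 9 */

static void walk_bio(uint64_t bi_sector, unsigned int bi_size)
{
	uint64_t logical_sector = bi_sector & ~(STRIPE_SECTORS - 1);
	uint64_t last_sector = bi_sector + (bi_size >> 9);
	unsigned int refs = 1;		/* plays the role of bi_phys_segments */

	for (; logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
		refs++;			/* add_stripe_bio() does bi_phys_segments++ */
		printf("stripe unit at sector %llu\n",
		       (unsigned long long)logical_sector);
	}
	/* each handled stripe later drops one reference; the bio is completed
	 * when the count falls back to zero, as at the end of make_request() */
	printf("stripe references taken: %u\n", refs - 1);
}

int main(void)
{
	walk_bio(13, 3 * 4096);		/* a 12 KiB request starting mid-stripe */
	return 0;
}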
diff --git a/drivers/md/raid6.h b/drivers/md/raid6.h
new file mode 100644
index 000000000000..f80ee6350edf
--- /dev/null
+++ b/drivers/md/raid6.h
@@ -0,0 +1,135 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright 2003 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 59 Temple Place Ste 330,
8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13#ifndef LINUX_RAID_RAID6_H
14#define LINUX_RAID_RAID6_H
15
16#ifdef __KERNEL__
17
18/* Set to 1 to use kernel-wide empty_zero_page */
19#define RAID6_USE_EMPTY_ZERO_PAGE 0
20
21#include <linux/module.h>
22#include <linux/stddef.h>
23#include <linux/compiler.h>
24#include <linux/types.h>
25#include <linux/kernel.h>
26#include <linux/errno.h>
27#include <linux/mempool.h>
28#include <linux/list.h>
29#include <linux/vmalloc.h>
30#include <linux/raid/md.h>
31#include <linux/raid/raid5.h>
32
33typedef raid5_conf_t raid6_conf_t; /* Same configuration */
34
35/* Additional compute_parity mode -- updates the parity w/o LOCKING */
36#define UPDATE_PARITY 4
37
38/* We need a pre-zeroed page... if we don't want to use the kernel-provided
39 one define it here */
40#if RAID6_USE_EMPTY_ZERO_PAGE
41# define raid6_empty_zero_page empty_zero_page
42#else
43extern const char raid6_empty_zero_page[PAGE_SIZE];
44#endif
45
46#else /* ! __KERNEL__ */
47/* Used for testing in user space */
48
49#include <errno.h>
50#include <inttypes.h>
51#include <limits.h>
52#include <stddef.h>
53#include <sys/mman.h>
54#include <sys/types.h>
55
56/* Not standard, but glibc defines it */
57#define BITS_PER_LONG __WORDSIZE
58
59typedef uint8_t u8;
60typedef uint16_t u16;
61typedef uint32_t u32;
62typedef uint64_t u64;
63
64#ifndef PAGE_SIZE
65# define PAGE_SIZE 4096
66#endif
67extern const char raid6_empty_zero_page[PAGE_SIZE];
68
69#define __init
70#define __exit
71#define __attribute_const__ __attribute__((const))
72
73#define preempt_enable()
74#define preempt_disable()
75
76#endif /* __KERNEL__ */
77
78/* Routine choices */
79struct raid6_calls {
80 void (*gen_syndrome)(int, size_t, void **);
81 int (*valid)(void); /* Returns 1 if this routine set is usable */
82 const char *name; /* Name of this routine set */
83 int prefer; /* Has special performance attribute */
84};
85
86/* Selected algorithm */
87extern struct raid6_calls raid6_call;
88
89/* Algorithm list */
90extern const struct raid6_calls * const raid6_algos[];
91int raid6_select_algo(void);
92
93/* Return values from chk_syndrome */
94#define RAID6_OK 0
95#define RAID6_P_BAD 1
96#define RAID6_Q_BAD 2
97#define RAID6_PQ_BAD 3
98
99/* Galois field tables */
100extern const u8 raid6_gfmul[256][256] __attribute__((aligned(256)));
101extern const u8 raid6_gfexp[256] __attribute__((aligned(256)));
102extern const u8 raid6_gfinv[256] __attribute__((aligned(256)));
103extern const u8 raid6_gfexi[256] __attribute__((aligned(256)));
104
105/* Recovery routines */
106void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
107void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs);
108void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs);
109
110/* Some definitions to allow code to be compiled for testing in userspace */
111#ifndef __KERNEL__
112
113# define jiffies raid6_jiffies()
114# define printk printf
115# define GFP_KERNEL 0
116# define __get_free_pages(x,y) ((unsigned long)mmap(NULL, PAGE_SIZE << (y), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0))
117# define free_pages(x,y) munmap((void *)(x), (y)*PAGE_SIZE)
118
119static inline void cpu_relax(void)
120{
121 /* Nothing */
122}
123
124#undef HZ
125#define HZ 1000
126static inline uint32_t raid6_jiffies(void)
127{
128 struct timeval tv;
129 gettimeofday(&tv, NULL);
130 return tv.tv_sec*1000 + tv.tv_usec/1000;
131}
132
133#endif /* ! __KERNEL__ */
134
135#endif /* LINUX_RAID_RAID6_H */
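
raid6.h above declares the Galois-field lookup tables (raid6_gfmul, raid6_gfexp, raid6_gfinv, raid6_gfexi) consumed by the recovery routines. As a rough illustration of the arithmetic those tables encode, here is a user-space sketch assuming the usual RAID-6 field, GF(2^8) reduced by x^8 + x^4 + x^3 + x^2 + 1 (0x11d); the 0x1d constant used by the gen_syndrome routines further down is the low byte of that polynomial. gf_mul() is a hypothetical helper for illustration, not part of the kernel API.

/* User-space sketch of the arithmetic behind the raid6_gfmul/gfexp tables
 * declared above, assuming the usual RAID-6 field GF(2^8) with reduction
 * polynomial 0x11d.
 */
#include <stdio.h>
#include <stdint.h>

static uint8_t gf_mul(uint8_t a, uint8_t b)
{
	uint8_t r = 0;

	while (b) {
		if (b & 1)
			r ^= a;
		/* multiply a by x, folding 0x1d back in when the top bit falls out */
		a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
		b >>= 1;
	}
	return r;
}

int main(void)
{
	/* gfexp[n] is 2^n in this field; check a few values by repeated doubling */
	uint8_t x = 1;
	int n;

	for (n = 0; n < 9; n++) {
		printf("2^%d = 0x%02x\n", n, x);
		x = gf_mul(x, 2);
	}
	return 0;
}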
diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c
new file mode 100644
index 000000000000..acf386fc4b4f
--- /dev/null
+++ b/drivers/md/raid6algos.c
@@ -0,0 +1,153 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright 2002 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 59 Temple Place Ste 330,
8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * raid6algos.c
15 *
16 * Algorithm list and algorithm selection for RAID-6
17 */
18
19#include "raid6.h"
20#ifndef __KERNEL__
21#include <sys/mman.h>
22#endif
23
24struct raid6_calls raid6_call;
25
26/* Various routine sets */
27extern const struct raid6_calls raid6_intx1;
28extern const struct raid6_calls raid6_intx2;
29extern const struct raid6_calls raid6_intx4;
30extern const struct raid6_calls raid6_intx8;
31extern const struct raid6_calls raid6_intx16;
32extern const struct raid6_calls raid6_intx32;
33extern const struct raid6_calls raid6_mmxx1;
34extern const struct raid6_calls raid6_mmxx2;
35extern const struct raid6_calls raid6_sse1x1;
36extern const struct raid6_calls raid6_sse1x2;
37extern const struct raid6_calls raid6_sse2x1;
38extern const struct raid6_calls raid6_sse2x2;
39extern const struct raid6_calls raid6_sse2x4;
40extern const struct raid6_calls raid6_altivec1;
41extern const struct raid6_calls raid6_altivec2;
42extern const struct raid6_calls raid6_altivec4;
43extern const struct raid6_calls raid6_altivec8;
44
45const struct raid6_calls * const raid6_algos[] = {
46 &raid6_intx1,
47 &raid6_intx2,
48 &raid6_intx4,
49 &raid6_intx8,
50#if defined(__ia64__)
51 &raid6_intx16,
52 &raid6_intx32,
53#endif
54#if defined(__i386__)
55 &raid6_mmxx1,
56 &raid6_mmxx2,
57 &raid6_sse1x1,
58 &raid6_sse1x2,
59 &raid6_sse2x1,
60 &raid6_sse2x2,
61#endif
62#if defined(__x86_64__)
63 &raid6_sse2x1,
64 &raid6_sse2x2,
65 &raid6_sse2x4,
66#endif
67#ifdef CONFIG_ALTIVEC
68 &raid6_altivec1,
69 &raid6_altivec2,
70 &raid6_altivec4,
71 &raid6_altivec8,
72#endif
73 NULL
74};
75
76#ifdef __KERNEL__
77#define RAID6_TIME_JIFFIES_LG2 4
78#else
79/* Need more time to be stable in userspace */
80#define RAID6_TIME_JIFFIES_LG2 9
81#endif
82
83/* Try to pick the best algorithm */
84/* This code uses the gfmul table as a convenient data set to abuse */
85
86int __init raid6_select_algo(void)
87{
88 const struct raid6_calls * const * algo;
89 const struct raid6_calls * best;
90 char *syndromes;
91 void *dptrs[(65536/PAGE_SIZE)+2];
92 int i, disks;
93 unsigned long perf, bestperf;
94 int bestprefer;
95 unsigned long j0, j1;
96
97 disks = (65536/PAGE_SIZE)+2;
98 for ( i = 0 ; i < disks-2 ; i++ ) {
99 dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i;
100 }
101
102 /* Normal code - use a 2-page allocation to avoid D$ conflict */
103 syndromes = (void *) __get_free_pages(GFP_KERNEL, 1);
104
105 if ( !syndromes ) {
106 printk("raid6: Yikes! No memory available.\n");
107 return -ENOMEM;
108 }
109
110 dptrs[disks-2] = syndromes;
111 dptrs[disks-1] = syndromes + PAGE_SIZE;
112
113 bestperf = 0; bestprefer = 0; best = NULL;
114
115 for ( algo = raid6_algos ; *algo ; algo++ ) {
116 if ( !(*algo)->valid || (*algo)->valid() ) {
117 perf = 0;
118
119 preempt_disable();
120 j0 = jiffies;
121 while ( (j1 = jiffies) == j0 )
122 cpu_relax();
123 while ( (jiffies-j1) < (1 << RAID6_TIME_JIFFIES_LG2) ) {
124 (*algo)->gen_syndrome(disks, PAGE_SIZE, dptrs);
125 perf++;
126 }
127 preempt_enable();
128
129 if ( (*algo)->prefer > bestprefer ||
130 ((*algo)->prefer == bestprefer &&
131 perf > bestperf) ) {
132 best = *algo;
133 bestprefer = best->prefer;
134 bestperf = perf;
135 }
136 printk("raid6: %-8s %5ld MB/s\n", (*algo)->name,
137 (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
138 }
139 }
140
141 if ( best )
142 printk("raid6: using algorithm %s (%ld MB/s)\n",
143 best->name,
144 (bestperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2));
145 else
146 printk("raid6: Yikes! No algorithm found!\n");
147
148 if (best) raid6_call = *best; /* avoid dereferencing NULL when no routine is usable */
149
150 free_pages((unsigned long)syndromes, 1);
151
152 return best ? 0 : -EINVAL;
153}
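
The MB/s figure printed by raid6_select_algo() falls straight out of the benchmark setup: each timed gen_syndrome() call walks the 64 KiB gfmul table (disks-2 data pages), and the timed window is 1 << RAID6_TIME_JIFFIES_LG2 jiffies, so throughput is perf * 2^16 * HZ / (2^LG2 * 2^20) bytes per second, which is exactly the (perf*HZ) >> (20-16+LG2) shift in the printk. A quick user-space check of that identity follows; HZ is taken as 1000 for illustration, LG2 is the kernel-side value of 4, and the perf count is made up.

/* Quick user-space check of the MB/s expression printed by
 * raid6_select_algo() above.
 */
#include <stdio.h>

#define HZ 1000
#define RAID6_TIME_JIFFIES_LG2 4

int main(void)
{
	unsigned long perf = 40000;	/* hypothetical gen_syndrome() calls in the window */
	unsigned long shifted = (perf * HZ) >> (20 - 16 + RAID6_TIME_JIFFIES_LG2);
	double exact = perf * 65536.0 * HZ /
		       ((1 << RAID6_TIME_JIFFIES_LG2) * 1048576.0);

	/* both print 156250 for this sample count */
	printf("shift form: %lu MB/s, exact: %.1f MB/s\n", shifted, exact);
	return 0;
}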
diff --git a/drivers/md/raid6altivec.uc b/drivers/md/raid6altivec.uc
new file mode 100644
index 000000000000..1de8f030eee0
--- /dev/null
+++ b/drivers/md/raid6altivec.uc
@@ -0,0 +1,122 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 59 Temple Place Ste 330,
8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * raid6altivec$#.c
15 *
16 * $#-way unrolled Altivec-optimized RAID-6 syndrome code
17 *
18 * This file is postprocessed using unroll.pl
19 *
20 * <benh> hpa: in process,
21 * you can just "steal" the vec unit with enable_kernel_altivec() (but
22 * bracket this with preempt_disable/enable or in a lock)
23 */
24
25#include "raid6.h"
26
27#ifdef CONFIG_ALTIVEC
28
29#include <altivec.h>
30#include <asm/system.h>
31#include <asm/cputable.h>
32
33/*
34 * This is the C data type to use
35 */
36
37typedef vector unsigned char unative_t;
38
39#define NBYTES(x) ((vector unsigned char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})
40#define NSIZE sizeof(unative_t)
41
42/*
43 * The SHLBYTE() operation shifts each byte left by 1, *not*
44 * rolling over into the next byte
45 */
46static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
47{
48 return vec_add(v,v);
49}
50
51/*
52 * The MASK() operation returns 0xFF in any byte for which the high
53 * bit is 1, 0x00 for any byte for which the high bit is 0.
54 */
55static inline __attribute_const__ unative_t MASK(unative_t v)
56{
57 unative_t zv = NBYTES(0);
58
59 /* vec_cmpgt returns a vector bool char; thus the need for the cast */
60 return (unative_t)vec_cmpgt(zv, v);
61}
62
63
64/* This is noinline to make damned sure that gcc doesn't move any of the
65 Altivec code around the enable/disable code */
66static void noinline
67raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs)
68{
69 u8 **dptr = (u8 **)ptrs;
70 u8 *p, *q;
71 int d, z, z0;
72
73 unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
74 unative_t x1d = NBYTES(0x1d);
75
76 z0 = disks - 3; /* Highest data disk */
77 p = dptr[z0+1]; /* XOR parity */
78 q = dptr[z0+2]; /* RS syndrome */
79
80 for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
81 wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
82 for ( z = z0-1 ; z >= 0 ; z-- ) {
83 wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
84 wp$$ = vec_xor(wp$$, wd$$);
85 w2$$ = MASK(wq$$);
86 w1$$ = SHLBYTE(wq$$);
87 w2$$ = vec_and(w2$$, x1d);
88 w1$$ = vec_xor(w1$$, w2$$);
89 wq$$ = vec_xor(w1$$, wd$$);
90 }
91 *(unative_t *)&p[d+NSIZE*$$] = wp$$;
92 *(unative_t *)&q[d+NSIZE*$$] = wq$$;
93 }
94}
95
96static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
97{
98 preempt_disable();
99 enable_kernel_altivec();
100
101 raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs);
102
103 preempt_enable();
104}
105
106int raid6_have_altivec(void);
107#if $# == 1
108int raid6_have_altivec(void)
109{
110 /* This assumes either all CPUs have Altivec or none does */
111 return cpu_has_feature(CPU_FTR_ALTIVEC);
112}
113#endif
114
115const struct raid6_calls raid6_altivec$# = {
116 raid6_altivec$#_gen_syndrome,
117 raid6_have_altivec,
118 "altivecx$#",
119 0
120};
121
122#endif /* CONFIG_ALTIVEC */
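
The MASK()/SHLBYTE() pair above is how both the Altivec and the integer code multiply a whole vector of bytes by 2 in GF(2^8) without a table lookup: shift every byte left by one, then XOR 0x1d into each byte whose top bit fell out. A scalar, single-byte user-space sketch of the same identity follows; gf2_times2() is a made-up name for illustration.

/* Scalar illustration of the MASK()/SHLBYTE() trick used by the Altivec
 * (and integer) gen_syndrome code above, applied to one byte.
 */
#include <stdio.h>
#include <stdint.h>

static uint8_t gf2_times2(uint8_t q)
{
	uint8_t mask = (q & 0x80) ? 0xff : 0x00;	/* MASK() per byte */
	uint8_t shifted = (uint8_t)(q << 1);		/* SHLBYTE() per byte */

	return shifted ^ (mask & 0x1d);			/* reduce modulo 0x11d */
}

int main(void)
{
	/* 0x80 * 2 wraps around to 0x1d; 0x53 * 2 is a plain shift to 0xa6 */
	printf("0x80 -> 0x%02x, 0x53 -> 0x%02x\n",
	       gf2_times2(0x80), gf2_times2(0x53));
	return 0;
}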
diff --git a/drivers/md/raid6int.uc b/drivers/md/raid6int.uc
new file mode 100644
index 000000000000..ad004cee0e26
--- /dev/null
+++ b/drivers/md/raid6int.uc
@@ -0,0 +1,117 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 59 Temple Place Ste 330,
8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * raid6int$#.c
15 *
16 * $#-way unrolled portable integer math RAID-6 instruction set
17 *
18 * This file is postprocessed using unroll.pl
19 */
20
21#include "raid6.h"
22
23/*
24 * This is the C data type to use
25 */
26
27/* Change this from BITS_PER_LONG if there is something better... */
28#if BITS_PER_LONG == 64
29# define NBYTES(x) ((x) * 0x0101010101010101UL)
30# define NSIZE 8
31# define NSHIFT 3
32# define NSTRING "64"
33typedef u64 unative_t;
34#else
35# define NBYTES(x) ((x) * 0x01010101U)
36# define NSIZE 4
37# define NSHIFT 2
38# define NSTRING "32"
39typedef u32 unative_t;
40#endif
41
42
43
44/*
45 * IA-64 wants insane amounts of unrolling. On other architectures that
46 * is just a waste of space.
47 */
48#if ($# <= 8) || defined(__ia64__)
49
50
51/*
52 * These sub-operations are separate inlines since they can sometimes be
53 * specially optimized using architecture-specific hacks.
54 */
55
56/*
57 * The SHLBYTE() operation shifts each byte left by 1, *not*
58 * rolling over into the next byte
59 */
60static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
61{
62 unative_t vv;
63
64 vv = (v << 1) & NBYTES(0xfe);
65 return vv;
66}
67
68/*
69 * The MASK() operation returns 0xFF in any byte for which the high
70 * bit is 1, 0x00 for any byte for which the high bit is 0.
71 */
72static inline __attribute_const__ unative_t MASK(unative_t v)
73{
74 unative_t vv;
75
76 vv = v & NBYTES(0x80);
77 vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */
78 return vv;
79}
80
81
82static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
83{
84 u8 **dptr = (u8 **)ptrs;
85 u8 *p, *q;
86 int d, z, z0;
87
88 unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
89
90 z0 = disks - 3; /* Highest data disk */
91 p = dptr[z0+1]; /* XOR parity */
92 q = dptr[z0+2]; /* RS syndrome */
93
94 for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
95 wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
96 for ( z = z0-1 ; z >= 0 ; z-- ) {
97 wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
98 wp$$ ^= wd$$;
99 w2$$ = MASK(wq$$);
100 w1$$ = SHLBYTE(wq$$);
101 w2$$ &= NBYTES(0x1d);
102 w1$$ ^= w2$$;
103 wq$$ = w1$$ ^ wd$$;
104 }
105 *(unative_t *)&p[d+NSIZE*$$] = wp$$;
106 *(unative_t *)&q[d+NSIZE*$$] = wq$$;
107 }
108}
109
110const struct raid6_calls raid6_intx$# = {
111 raid6_int$#_gen_syndrome,
112 NULL, /* always valid */
113 "int" NSTRING "x$#",
114 0
115};
116
117#endif
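
raid6int.uc is a template: unroll.pl replaces $# with the unroll factor and emits every line containing $$ once per lane, producing raid6int1.c, raid6int2.c and so on. Below is a self-contained user-space sketch of roughly what a 2-way expansion looks like, with the 64-bit NBYTES/SHLBYTE/MASK definitions copied from the template and a made-up smoke test in main(); it is an illustration of the unrolling, not generated output.

/* Hand-expanded 2-way sketch (lanes 0 and 1) of the template above. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

typedef uint64_t unative_t;
typedef uint8_t u8;

#define NBYTES(x) ((x) * 0x0101010101010101UL)
#define NSIZE 8

static inline unative_t SHLBYTE(unative_t v)
{
	return (v << 1) & NBYTES(0xfe);
}

static inline unative_t MASK(unative_t v)
{
	unative_t vv = v & NBYTES(0x80);
	return (vv << 1) - (vv >> 7);	/* 0xff in each byte whose high bit was set */
}

static void raid6_int2_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;
	unative_t wd0, wq0, wp0, w10, w20;
	unative_t wd1, wq1, wp1, w11, w21;

	z0 = disks - 3;		/* highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	for (d = 0; d < (int)bytes; d += NSIZE*2) {
		wq0 = wp0 = *(unative_t *)&dptr[z0][d+0*NSIZE];
		wq1 = wp1 = *(unative_t *)&dptr[z0][d+1*NSIZE];
		for (z = z0-1; z >= 0; z--) {
			wd0 = *(unative_t *)&dptr[z][d+0*NSIZE];
			wd1 = *(unative_t *)&dptr[z][d+1*NSIZE];
			wp0 ^= wd0;		wp1 ^= wd1;
			w20 = MASK(wq0);	w21 = MASK(wq1);
			w10 = SHLBYTE(wq0);	w11 = SHLBYTE(wq1);
			w20 &= NBYTES(0x1d);	w21 &= NBYTES(0x1d);
			w10 ^= w20;		w11 ^= w21;
			wq0 = w10 ^ wd0;	wq1 = w11 ^ wd1;
		}
		*(unative_t *)&p[d+NSIZE*0] = wp0;
		*(unative_t *)&p[d+NSIZE*1] = wp1;
		*(unative_t *)&q[d+NSIZE*0] = wq0;
		*(unative_t *)&q[d+NSIZE*1] = wq1;
	}
}

int main(void)
{
	uint64_t buf[5][2];		/* 3 data disks + P + Q, 16 bytes each */
	void *ptrs[5] = { buf[0], buf[1], buf[2], buf[3], buf[4] };
	u8 *d0 = (u8 *)buf[0], *d1 = (u8 *)buf[1], *d2 = (u8 *)buf[2];
	u8 *p = (u8 *)buf[3], *q = (u8 *)buf[4];
	int i;

	for (i = 0; i < 16; i++) {
		d0[i] = (u8)i;
		d1[i] = 0xa5;
		d2[i] = (u8)(16 - i);
	}
	raid6_int2_gen_syndrome(5, 16, ptrs);
	printf("P[0]=0x%02x (expect d0^d1^d2 = 0x%02x), Q[0]=0x%02x\n",
	       p[0], (u8)(d0[0] ^ d1[0] ^ d2[0]), q[0]);
	return 0;
}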
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
new file mode 100644
index 000000000000..7e30ab29691a
--- /dev/null
+++ b/drivers/md/raid6main.c
@@ -0,0 +1,2136 @@
1/*
2 * raid6main.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin
6 *
7 * RAID-6 management functions. This code is derived from raid5.c.
8 * Last merge from raid5.c bkcvs version 1.79 (kernel 2.6.1).
9 *
10 * Thanks to Penguin Computing for making the RAID-6 development possible
11 * by donating a test server!
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2, or (at your option)
16 * any later version.
17 *
18 * You should have received a copy of the GNU General Public License
19 * (for example /usr/src/linux/COPYING); if not, write to the Free
20 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 */
22
23
24#include <linux/config.h>
25#include <linux/module.h>
26#include <linux/slab.h>
27#include <linux/highmem.h>
28#include <linux/bitops.h>
29#include <asm/atomic.h>
30#include "raid6.h"
31
32/*
33 * Stripe cache
34 */
35
36#define NR_STRIPES 256
37#define STRIPE_SIZE PAGE_SIZE
38#define STRIPE_SHIFT (PAGE_SHIFT - 9)
39#define STRIPE_SECTORS (STRIPE_SIZE>>9)
40#define IO_THRESHOLD 1
41#define HASH_PAGES 1
42#define HASH_PAGES_ORDER 0
43#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
44#define HASH_MASK (NR_HASH - 1)
45
46#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])
47
48/* bio's attached to a stripe+device for I/O are linked together in bi_sector
49 * order without overlap. There may be several bio's per stripe+device, and
50 * a bio could span several devices.
51 * When walking this list for a particular stripe+device, we must never proceed
52 * beyond a bio that extends past this device, as the next bio might no longer
53 * be valid.
54 * This macro is used to determine the 'next' bio in the list, given the sector
55 * of the current stripe+device
56 */
57#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
58/*
59 * The following can be used to debug the driver
60 */
61#define RAID6_DEBUG 0 /* Extremely verbose printk */
62#define RAID6_PARANOIA 1 /* Check spinlocks */
63#define RAID6_DUMPSTATE 0 /* Include stripe cache state in /proc/mdstat */
64#if RAID6_PARANOIA && defined(CONFIG_SMP)
65# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
66#else
67# define CHECK_DEVLOCK()
68#endif
69
70#define PRINTK(x...) ((void)(RAID6_DEBUG && printk(KERN_DEBUG x)))
71#if RAID6_DEBUG
72#undef inline
73#undef __inline__
74#define inline
75#define __inline__
76#endif
77
78#if !RAID6_USE_EMPTY_ZERO_PAGE
79/* In .bss so it's zeroed */
80const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
81#endif
82
83static inline int raid6_next_disk(int disk, int raid_disks)
84{
85 disk++;
86 return (disk < raid_disks) ? disk : 0;
87}
88
89static void print_raid6_conf (raid6_conf_t *conf);
90
91static inline void __release_stripe(raid6_conf_t *conf, struct stripe_head *sh)
92{
93 if (atomic_dec_and_test(&sh->count)) {
94 if (!list_empty(&sh->lru))
95 BUG();
96 if (atomic_read(&conf->active_stripes)==0)
97 BUG();
98 if (test_bit(STRIPE_HANDLE, &sh->state)) {
99 if (test_bit(STRIPE_DELAYED, &sh->state))
100 list_add_tail(&sh->lru, &conf->delayed_list);
101 else
102 list_add_tail(&sh->lru, &conf->handle_list);
103 md_wakeup_thread(conf->mddev->thread);
104 } else {
105 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
106 atomic_dec(&conf->preread_active_stripes);
107 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
108 md_wakeup_thread(conf->mddev->thread);
109 }
110 list_add_tail(&sh->lru, &conf->inactive_list);
111 atomic_dec(&conf->active_stripes);
112 if (!conf->inactive_blocked ||
113 atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
114 wake_up(&conf->wait_for_stripe);
115 }
116 }
117}
118static void release_stripe(struct stripe_head *sh)
119{
120 raid6_conf_t *conf = sh->raid_conf;
121 unsigned long flags;
122
123 spin_lock_irqsave(&conf->device_lock, flags);
124 __release_stripe(conf, sh);
125 spin_unlock_irqrestore(&conf->device_lock, flags);
126}
127
128static void remove_hash(struct stripe_head *sh)
129{
130 PRINTK("remove_hash(), stripe %llu\n", (unsigned long long)sh->sector);
131
132 if (sh->hash_pprev) {
133 if (sh->hash_next)
134 sh->hash_next->hash_pprev = sh->hash_pprev;
135 *sh->hash_pprev = sh->hash_next;
136 sh->hash_pprev = NULL;
137 }
138}
139
140static __inline__ void insert_hash(raid6_conf_t *conf, struct stripe_head *sh)
141{
142 struct stripe_head **shp = &stripe_hash(conf, sh->sector);
143
144 PRINTK("insert_hash(), stripe %llu\n", (unsigned long long)sh->sector);
145
146 CHECK_DEVLOCK();
147 if ((sh->hash_next = *shp) != NULL)
148 (*shp)->hash_pprev = &sh->hash_next;
149 *shp = sh;
150 sh->hash_pprev = shp;
151}
152
153
154/* find an idle stripe, make sure it is unhashed, and return it. */
155static struct stripe_head *get_free_stripe(raid6_conf_t *conf)
156{
157 struct stripe_head *sh = NULL;
158 struct list_head *first;
159
160 CHECK_DEVLOCK();
161 if (list_empty(&conf->inactive_list))
162 goto out;
163 first = conf->inactive_list.next;
164 sh = list_entry(first, struct stripe_head, lru);
165 list_del_init(first);
166 remove_hash(sh);
167 atomic_inc(&conf->active_stripes);
168out:
169 return sh;
170}
171
172static void shrink_buffers(struct stripe_head *sh, int num)
173{
174 struct page *p;
175 int i;
176
177 for (i=0; i<num ; i++) {
178 p = sh->dev[i].page;
179 if (!p)
180 continue;
181 sh->dev[i].page = NULL;
182 page_cache_release(p);
183 }
184}
185
186static int grow_buffers(struct stripe_head *sh, int num)
187{
188 int i;
189
190 for (i=0; i<num; i++) {
191 struct page *page;
192
193 if (!(page = alloc_page(GFP_KERNEL))) {
194 return 1;
195 }
196 sh->dev[i].page = page;
197 }
198 return 0;
199}
200
201static void raid6_build_block (struct stripe_head *sh, int i);
202
203static inline void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx)
204{
205 raid6_conf_t *conf = sh->raid_conf;
206 int disks = conf->raid_disks, i;
207
208 if (atomic_read(&sh->count) != 0)
209 BUG();
210 if (test_bit(STRIPE_HANDLE, &sh->state))
211 BUG();
212
213 CHECK_DEVLOCK();
214 PRINTK("init_stripe called, stripe %llu\n",
215 (unsigned long long)sh->sector);
216
217 remove_hash(sh);
218
219 sh->sector = sector;
220 sh->pd_idx = pd_idx;
221 sh->state = 0;
222
223 for (i=disks; i--; ) {
224 struct r5dev *dev = &sh->dev[i];
225
226 if (dev->toread || dev->towrite || dev->written ||
227 test_bit(R5_LOCKED, &dev->flags)) {
228 PRINTK("sector=%llx i=%d %p %p %p %d\n",
229 (unsigned long long)sh->sector, i, dev->toread,
230 dev->towrite, dev->written,
231 test_bit(R5_LOCKED, &dev->flags));
232 BUG();
233 }
234 dev->flags = 0;
235 raid6_build_block(sh, i);
236 }
237 insert_hash(conf, sh);
238}
239
240static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector)
241{
242 struct stripe_head *sh;
243
244 CHECK_DEVLOCK();
245 PRINTK("__find_stripe, sector %llu\n", (unsigned long long)sector);
246 for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
247 if (sh->sector == sector)
248 return sh;
249 PRINTK("__stripe %llu not in cache\n", (unsigned long long)sector);
250 return NULL;
251}
252
253static void unplug_slaves(mddev_t *mddev);
254
255static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector,
256 int pd_idx, int noblock)
257{
258 struct stripe_head *sh;
259
260 PRINTK("get_stripe, sector %llu\n", (unsigned long long)sector);
261
262 spin_lock_irq(&conf->device_lock);
263
264 do {
265 sh = __find_stripe(conf, sector);
266 if (!sh) {
267 if (!conf->inactive_blocked)
268 sh = get_free_stripe(conf);
269 if (noblock && sh == NULL)
270 break;
271 if (!sh) {
272 conf->inactive_blocked = 1;
273 wait_event_lock_irq(conf->wait_for_stripe,
274 !list_empty(&conf->inactive_list) &&
275 (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
276 || !conf->inactive_blocked),
277 conf->device_lock,
278 unplug_slaves(conf->mddev);
279 );
280 conf->inactive_blocked = 0;
281 } else
282 init_stripe(sh, sector, pd_idx);
283 } else {
284 if (atomic_read(&sh->count)) {
285 if (!list_empty(&sh->lru))
286 BUG();
287 } else {
288 if (!test_bit(STRIPE_HANDLE, &sh->state))
289 atomic_inc(&conf->active_stripes);
290 if (list_empty(&sh->lru))
291 BUG();
292 list_del_init(&sh->lru);
293 }
294 }
295 } while (sh == NULL);
296
297 if (sh)
298 atomic_inc(&sh->count);
299
300 spin_unlock_irq(&conf->device_lock);
301 return sh;
302}
303
304static int grow_stripes(raid6_conf_t *conf, int num)
305{
306 struct stripe_head *sh;
307 kmem_cache_t *sc;
308 int devs = conf->raid_disks;
309
310 sprintf(conf->cache_name, "raid6/%s", mdname(conf->mddev));
311
312 sc = kmem_cache_create(conf->cache_name,
313 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
314 0, 0, NULL, NULL);
315 if (!sc)
316 return 1;
317 conf->slab_cache = sc;
318 while (num--) {
319 sh = kmem_cache_alloc(sc, GFP_KERNEL);
320 if (!sh)
321 return 1;
322 memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
323 sh->raid_conf = conf;
324 spin_lock_init(&sh->lock);
325
326 if (grow_buffers(sh, conf->raid_disks)) {
327 shrink_buffers(sh, conf->raid_disks);
328 kmem_cache_free(sc, sh);
329 return 1;
330 }
331 /* we just created an active stripe so... */
332 atomic_set(&sh->count, 1);
333 atomic_inc(&conf->active_stripes);
334 INIT_LIST_HEAD(&sh->lru);
335 release_stripe(sh);
336 }
337 return 0;
338}
339
340static void shrink_stripes(raid6_conf_t *conf)
341{
342 struct stripe_head *sh;
343
344 while (1) {
345 spin_lock_irq(&conf->device_lock);
346 sh = get_free_stripe(conf);
347 spin_unlock_irq(&conf->device_lock);
348 if (!sh)
349 break;
350 if (atomic_read(&sh->count))
351 BUG();
352 shrink_buffers(sh, conf->raid_disks);
353 kmem_cache_free(conf->slab_cache, sh);
354 atomic_dec(&conf->active_stripes);
355 }
356 kmem_cache_destroy(conf->slab_cache);
357 conf->slab_cache = NULL;
358}
359
360static int raid6_end_read_request (struct bio * bi, unsigned int bytes_done,
361 int error)
362{
363 struct stripe_head *sh = bi->bi_private;
364 raid6_conf_t *conf = sh->raid_conf;
365 int disks = conf->raid_disks, i;
366 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
367
368 if (bi->bi_size)
369 return 1;
370
371 for (i=0 ; i<disks; i++)
372 if (bi == &sh->dev[i].req)
373 break;
374
375 PRINTK("end_read_request %llu/%d, count: %d, uptodate %d.\n",
376 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
377 uptodate);
378 if (i == disks) {
379 BUG();
380 return 0;
381 }
382
383 if (uptodate) {
384#if 0
385 struct bio *bio;
386 unsigned long flags;
387 spin_lock_irqsave(&conf->device_lock, flags);
388 /* we can return a buffer if we bypassed the cache or
389 * if the top buffer is not in highmem. If there are
390 * multiple buffers, leave the extra work to
391 * handle_stripe
392 */
393 buffer = sh->bh_read[i];
394 if (buffer &&
395 (!PageHighMem(buffer->b_page)
396 || buffer->b_page == bh->b_page )
397 ) {
398 sh->bh_read[i] = buffer->b_reqnext;
399 buffer->b_reqnext = NULL;
400 } else
401 buffer = NULL;
402 spin_unlock_irqrestore(&conf->device_lock, flags);
403 if (sh->bh_page[i]==bh->b_page)
404 set_buffer_uptodate(bh);
405 if (buffer) {
406 if (buffer->b_page != bh->b_page)
407 memcpy(buffer->b_data, bh->b_data, bh->b_size);
408 buffer->b_end_io(buffer, 1);
409 }
410#else
411 set_bit(R5_UPTODATE, &sh->dev[i].flags);
412#endif
413 } else {
414 md_error(conf->mddev, conf->disks[i].rdev);
415 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
416 }
417 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
418#if 0
419 /* must restore b_page before unlocking buffer... */
420 if (sh->bh_page[i] != bh->b_page) {
421 bh->b_page = sh->bh_page[i];
422 bh->b_data = page_address(bh->b_page);
423 clear_buffer_uptodate(bh);
424 }
425#endif
426 clear_bit(R5_LOCKED, &sh->dev[i].flags);
427 set_bit(STRIPE_HANDLE, &sh->state);
428 release_stripe(sh);
429 return 0;
430}
431
432static int raid6_end_write_request (struct bio *bi, unsigned int bytes_done,
433 int error)
434{
435 struct stripe_head *sh = bi->bi_private;
436 raid6_conf_t *conf = sh->raid_conf;
437 int disks = conf->raid_disks, i;
438 unsigned long flags;
439 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
440
441 if (bi->bi_size)
442 return 1;
443
444 for (i=0 ; i<disks; i++)
445 if (bi == &sh->dev[i].req)
446 break;
447
448 PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
449 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
450 uptodate);
451 if (i == disks) {
452 BUG();
453 return 0;
454 }
455
456 spin_lock_irqsave(&conf->device_lock, flags);
457 if (!uptodate)
458 md_error(conf->mddev, conf->disks[i].rdev);
459
460 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
461
462 clear_bit(R5_LOCKED, &sh->dev[i].flags);
463 set_bit(STRIPE_HANDLE, &sh->state);
464 __release_stripe(conf, sh);
465 spin_unlock_irqrestore(&conf->device_lock, flags);
466 return 0;
467}
468
469
470static sector_t compute_blocknr(struct stripe_head *sh, int i);
471
472static void raid6_build_block (struct stripe_head *sh, int i)
473{
474 struct r5dev *dev = &sh->dev[i];
475 int pd_idx = sh->pd_idx;
476 int qd_idx = raid6_next_disk(pd_idx, sh->raid_conf->raid_disks);
477
478 bio_init(&dev->req);
479 dev->req.bi_io_vec = &dev->vec;
480 dev->req.bi_vcnt++;
481 dev->req.bi_max_vecs++;
482 dev->vec.bv_page = dev->page;
483 dev->vec.bv_len = STRIPE_SIZE;
484 dev->vec.bv_offset = 0;
485
486 dev->req.bi_sector = sh->sector;
487 dev->req.bi_private = sh;
488
489 dev->flags = 0;
490 if (i != pd_idx && i != qd_idx)
491 dev->sector = compute_blocknr(sh, i);
492}
493
494static void error(mddev_t *mddev, mdk_rdev_t *rdev)
495{
496 char b[BDEVNAME_SIZE];
497 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
498 PRINTK("raid6: error called\n");
499
500 if (!rdev->faulty) {
501 mddev->sb_dirty = 1;
502 if (rdev->in_sync) {
503 conf->working_disks--;
504 mddev->degraded++;
505 conf->failed_disks++;
506 rdev->in_sync = 0;
507 /*
508 * if recovery was running, make sure it aborts.
509 */
510 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
511 }
512 rdev->faulty = 1;
513 printk (KERN_ALERT
514 "raid6: Disk failure on %s, disabling device."
515 " Operation continuing on %d devices\n",
516 bdevname(rdev->bdev,b), conf->working_disks);
517 }
518}
519
520/*
521 * Input: a 'big' sector number,
522 * Output: index of the data and parity disk, and the sector # in them.
523 */
524static sector_t raid6_compute_sector(sector_t r_sector, unsigned int raid_disks,
525 unsigned int data_disks, unsigned int * dd_idx,
526 unsigned int * pd_idx, raid6_conf_t *conf)
527{
528 long stripe;
529 unsigned long chunk_number;
530 unsigned int chunk_offset;
531 sector_t new_sector;
532 int sectors_per_chunk = conf->chunk_size >> 9;
533
534 /* First compute the information on this sector */
535
536 /*
537 * Compute the chunk number and the sector offset inside the chunk
538 */
539 chunk_offset = sector_div(r_sector, sectors_per_chunk);
540 chunk_number = r_sector;
541 if ( r_sector != chunk_number ) {
542 printk(KERN_CRIT "raid6: ERROR: r_sector = %llu, chunk_number = %lu\n",
543 (unsigned long long)r_sector, (unsigned long)chunk_number);
544 BUG();
545 }
546
547 /*
548 * Compute the stripe number
549 */
550 stripe = chunk_number / data_disks;
551
552 /*
553 * Compute the data disk and parity disk indexes inside the stripe
554 */
555 *dd_idx = chunk_number % data_disks;
556
557 /*
558 * Select the parity disk based on the user selected algorithm.
559 */
560
561 /**** FIX THIS ****/
562 switch (conf->algorithm) {
563 case ALGORITHM_LEFT_ASYMMETRIC:
564 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
565 if (*pd_idx == raid_disks-1)
566 (*dd_idx)++; /* Q D D D P */
567 else if (*dd_idx >= *pd_idx)
568 (*dd_idx) += 2; /* D D P Q D */
569 break;
570 case ALGORITHM_RIGHT_ASYMMETRIC:
571 *pd_idx = stripe % raid_disks;
572 if (*pd_idx == raid_disks-1)
573 (*dd_idx)++; /* Q D D D P */
574 else if (*dd_idx >= *pd_idx)
575 (*dd_idx) += 2; /* D D P Q D */
576 break;
577 case ALGORITHM_LEFT_SYMMETRIC:
578 *pd_idx = raid_disks - 1 - (stripe % raid_disks);
579 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
580 break;
581 case ALGORITHM_RIGHT_SYMMETRIC:
582 *pd_idx = stripe % raid_disks;
583 *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
584 break;
585 default:
586 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
587 conf->algorithm);
588 }
589
590 PRINTK("raid6: chunk_number = %lu, pd_idx = %u, dd_idx = %u\n",
591 chunk_number, *pd_idx, *dd_idx);
592
593 /*
594 * Finally, compute the new sector number
595 */
596 new_sector = (sector_t) stripe * sectors_per_chunk + chunk_offset;
597 return new_sector;
598}
599
600
601static sector_t compute_blocknr(struct stripe_head *sh, int i)
602{
603 raid6_conf_t *conf = sh->raid_conf;
604 int raid_disks = conf->raid_disks, data_disks = raid_disks - 2;
605 sector_t new_sector = sh->sector, check;
606 int sectors_per_chunk = conf->chunk_size >> 9;
607 sector_t stripe;
608 int chunk_offset;
609 int chunk_number, dummy1, dummy2, dd_idx = i;
610 sector_t r_sector;
611 int i0 = i;
612
613 chunk_offset = sector_div(new_sector, sectors_per_chunk);
614 stripe = new_sector;
615 if ( new_sector != stripe ) {
616 printk(KERN_CRIT "raid6: ERROR: new_sector = %llu, stripe = %lu\n",
617 (unsigned long long)new_sector, (unsigned long)stripe);
618 BUG();
619 }
620
621 switch (conf->algorithm) {
622 case ALGORITHM_LEFT_ASYMMETRIC:
623 case ALGORITHM_RIGHT_ASYMMETRIC:
624 if (sh->pd_idx == raid_disks-1)
625 i--; /* Q D D D P */
626 else if (i > sh->pd_idx)
627 i -= 2; /* D D P Q D */
628 break;
629 case ALGORITHM_LEFT_SYMMETRIC:
630 case ALGORITHM_RIGHT_SYMMETRIC:
631 if (sh->pd_idx == raid_disks-1)
632 i--; /* Q D D D P */
633 else {
634 /* D D P Q D */
635 if (i < sh->pd_idx)
636 i += raid_disks;
637 i -= (sh->pd_idx + 2);
638 }
639 break;
640 default:
641 printk (KERN_CRIT "raid6: unsupported algorithm %d\n",
642 conf->algorithm);
643 }
644
645 PRINTK("raid6: compute_blocknr: pd_idx = %u, i0 = %u, i = %u\n", sh->pd_idx, i0, i);
646
647 chunk_number = stripe * data_disks + i;
648 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
649
650 check = raid6_compute_sector (r_sector, raid_disks, data_disks, &dummy1, &dummy2, conf);
651 if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
652 printk(KERN_CRIT "raid6: compute_blocknr: map not correct\n");
653 return 0;
654 }
655 return r_sector;
656}
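
For reference, the layout switch in raid6_compute_sector() (and its inverse in compute_blocknr() just above) rotates P around the array per stripe, puts Q on the disk after P, and shifts the data indices past the P/Q pair. A small user-space sketch of the ALGORITHM_LEFT_SYMMETRIC case, using a made-up 5-disk array, prints where P, Q and the data chunks land for the first few stripes.

/* User-space sketch of the ALGORITHM_LEFT_SYMMETRIC placement used above. */
#include <stdio.h>

static int next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

int main(void)
{
	const int raid_disks = 5;	/* e.g. 3 data disks + P + Q */
	long stripe;

	for (stripe = 0; stripe < 5; stripe++) {
		int pd_idx = raid_disks - 1 - (int)(stripe % raid_disks);
		int qd_idx = next_disk(pd_idx, raid_disks);	/* Q follows P */
		int k;

		printf("stripe %ld: P on disk %d, Q on disk %d, data on",
		       stripe, pd_idx, qd_idx);
		for (k = 0; k < raid_disks - 2; k++)
			printf(" %d", (pd_idx + 2 + k) % raid_disks);
		printf("\n");
	}
	return 0;
}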
657
658
659
660/*
661 * Copy data between a page in the stripe cache, and one or more bion
662 * The page could align with the middle of the bio, or there could be
663 * several bion, each with several bio_vecs, which cover part of the page
664 * Multiple bion are linked together on bi_next. There may be extras
665 * at the end of this list. We ignore them.
666 */
667static void copy_data(int frombio, struct bio *bio,
668 struct page *page,
669 sector_t sector)
670{
671 char *pa = page_address(page);
672 struct bio_vec *bvl;
673 int i;
674 int page_offset;
675
676 if (bio->bi_sector >= sector)
677 page_offset = (signed)(bio->bi_sector - sector) * 512;
678 else
679 page_offset = (signed)(sector - bio->bi_sector) * -512;
680 bio_for_each_segment(bvl, bio, i) {
681 int len = bio_iovec_idx(bio,i)->bv_len;
682 int clen;
683 int b_offset = 0;
684
685 if (page_offset < 0) {
686 b_offset = -page_offset;
687 page_offset += b_offset;
688 len -= b_offset;
689 }
690
691 if (len > 0 && page_offset + len > STRIPE_SIZE)
692 clen = STRIPE_SIZE - page_offset;
693 else clen = len;
694
695 if (clen > 0) {
696 char *ba = __bio_kmap_atomic(bio, i, KM_USER0);
697 if (frombio)
698 memcpy(pa+page_offset, ba+b_offset, clen);
699 else
700 memcpy(ba+b_offset, pa+page_offset, clen);
701 __bio_kunmap_atomic(ba, KM_USER0);
702 }
703 if (clen < len) /* hit end of page */
704 break;
705 page_offset += len;
706 }
707}
708
709#define check_xor() do { \
710 if (count == MAX_XOR_BLOCKS) { \
711 xor_block(count, STRIPE_SIZE, ptr); \
712 count = 1; \
713 } \
714 } while(0)
715
716/* Compute P and Q syndromes */
717static void compute_parity(struct stripe_head *sh, int method)
718{
719 raid6_conf_t *conf = sh->raid_conf;
720 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
721 struct bio *chosen;
722 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
723 void *ptrs[disks];
724
725 qd_idx = raid6_next_disk(pd_idx, disks);
726 d0_idx = raid6_next_disk(qd_idx, disks);
727
728 PRINTK("compute_parity, stripe %llu, method %d\n",
729 (unsigned long long)sh->sector, method);
730
731 switch(method) {
732 case READ_MODIFY_WRITE:
733 BUG(); /* READ_MODIFY_WRITE N/A for RAID-6 */
734 case RECONSTRUCT_WRITE:
735 for (i= disks; i-- ;)
736 if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
737 chosen = sh->dev[i].towrite;
738 sh->dev[i].towrite = NULL;
739
740 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
741 wake_up(&conf->wait_for_overlap);
742
743 if (sh->dev[i].written) BUG();
744 sh->dev[i].written = chosen;
745 }
746 break;
747 case CHECK_PARITY:
748 BUG(); /* Not implemented yet */
749 }
750
751 for (i = disks; i--;)
752 if (sh->dev[i].written) {
753 sector_t sector = sh->dev[i].sector;
754 struct bio *wbi = sh->dev[i].written;
755 while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
756 copy_data(1, wbi, sh->dev[i].page, sector);
757 wbi = r5_next_bio(wbi, sector);
758 }
759
760 set_bit(R5_LOCKED, &sh->dev[i].flags);
761 set_bit(R5_UPTODATE, &sh->dev[i].flags);
762 }
763
764// switch(method) {
765// case RECONSTRUCT_WRITE:
766// case CHECK_PARITY:
767// case UPDATE_PARITY:
768 /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
769 /* FIX: Is this ordering of drives even remotely optimal? */
770 count = 0;
771 i = d0_idx;
772 do {
773 ptrs[count++] = page_address(sh->dev[i].page);
774 if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
775 printk("block %d/%d not uptodate on parity calc\n", i,count);
776 i = raid6_next_disk(i, disks);
777 } while ( i != d0_idx );
778// break;
779// }
780
781 raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
782
783 switch(method) {
784 case RECONSTRUCT_WRITE:
785 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
786 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
787 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
788 set_bit(R5_LOCKED, &sh->dev[qd_idx].flags);
789 break;
790 case UPDATE_PARITY:
791 set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
792 set_bit(R5_UPTODATE, &sh->dev[qd_idx].flags);
793 break;
794 }
795}
796
797/* Compute one missing block */
798static void compute_block_1(struct stripe_head *sh, int dd_idx)
799{
800 raid6_conf_t *conf = sh->raid_conf;
801 int i, count, disks = conf->raid_disks;
802 void *ptr[MAX_XOR_BLOCKS], *p;
803 int pd_idx = sh->pd_idx;
804 int qd_idx = raid6_next_disk(pd_idx, disks);
805
806 PRINTK("compute_block_1, stripe %llu, idx %d\n",
807 (unsigned long long)sh->sector, dd_idx);
808
809 if ( dd_idx == qd_idx ) {
810 /* We're actually computing the Q drive */
811 compute_parity(sh, UPDATE_PARITY);
812 } else {
813 ptr[0] = page_address(sh->dev[dd_idx].page);
814 memset(ptr[0], 0, STRIPE_SIZE);
815 count = 1;
816 for (i = disks ; i--; ) {
817 if (i == dd_idx || i == qd_idx)
818 continue;
819 p = page_address(sh->dev[i].page);
820 if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
821 ptr[count++] = p;
822 else
823 printk("compute_block() %d, stripe %llu, %d"
824 " not present\n", dd_idx,
825 (unsigned long long)sh->sector, i);
826
827 check_xor();
828 }
829 if (count != 1)
830 xor_block(count, STRIPE_SIZE, ptr);
831 set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
832 }
833}
834
835/* Compute two missing blocks */
836static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
837{
838 raid6_conf_t *conf = sh->raid_conf;
839 int i, count, disks = conf->raid_disks;
840 int pd_idx = sh->pd_idx;
841 int qd_idx = raid6_next_disk(pd_idx, disks);
842 int d0_idx = raid6_next_disk(qd_idx, disks);
843 int faila, failb;
844
845 /* faila and failb are disk numbers relative to d0_idx */
846	/* pd_idx becomes disks-2 and qd_idx becomes disks-1 */
847 faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
848 failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
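	/*
	 * Illustrative example: with disks == 6, pd_idx == 1, qd_idx == 2 and
	 * hence d0_idx == 3, a data disk at dd_idx == 4 maps to 4-3 == 1,
	 * dd_idx == 0 maps to 0+(6-3) == 3, while pd_idx and qd_idx map to
	 * 4 and 5, i.e. disks-2 and disks-1 as stated above.
	 */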
849
850 BUG_ON(faila == failb);
851 if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
852
853 PRINTK("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
854 (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
855
856 if ( failb == disks-1 ) {
857 /* Q disk is one of the missing disks */
858 if ( faila == disks-2 ) {
859 /* Missing P+Q, just recompute */
860 compute_parity(sh, UPDATE_PARITY);
861 return;
862 } else {
863 /* We're missing D+Q; recompute D from P */
864 compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1);
865 compute_parity(sh, UPDATE_PARITY); /* Is this necessary? */
866 return;
867 }
868 }
869
870 /* We're missing D+P or D+D; build pointer table */
871 {
872 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
873 void *ptrs[disks];
874
875 count = 0;
876 i = d0_idx;
877 do {
878 ptrs[count++] = page_address(sh->dev[i].page);
879 i = raid6_next_disk(i, disks);
880 if (i != dd_idx1 && i != dd_idx2 &&
881 !test_bit(R5_UPTODATE, &sh->dev[i].flags))
882 printk("compute_2 with missing block %d/%d\n", count, i);
883 } while ( i != d0_idx );
884
885 if ( failb == disks-2 ) {
886 /* We're missing D+P. */
887 raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
888 } else {
889 /* We're missing D+D. */
890 raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
891 }
892
893 /* Both the above update both missing blocks */
894 set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
895 set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
896 }
897}
898
899
900/*
901 * Each stripe/dev can have one or more bion attached.
902 * toread/towrite point to the first in a chain.
903 * The bi_next chain must be in order.
904 */
905static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
906{
907 struct bio **bip;
908 raid6_conf_t *conf = sh->raid_conf;
909
910	PRINTK("adding bi b#%llu to stripe s#%llu\n",
911 (unsigned long long)bi->bi_sector,
912 (unsigned long long)sh->sector);
913
914
915 spin_lock(&sh->lock);
916 spin_lock_irq(&conf->device_lock);
917 if (forwrite)
918 bip = &sh->dev[dd_idx].towrite;
919 else
920 bip = &sh->dev[dd_idx].toread;
921 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
922 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
923 goto overlap;
924 bip = &(*bip)->bi_next;
925 }
926 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
927 goto overlap;
928
929 if (*bip && bi->bi_next && (*bip) != bi->bi_next)
930 BUG();
931 if (*bip)
932 bi->bi_next = *bip;
933 *bip = bi;
934 bi->bi_phys_segments ++;
935 spin_unlock_irq(&conf->device_lock);
936 spin_unlock(&sh->lock);
937
938 PRINTK("added bi b#%llu to stripe s#%llu, disk %d.\n",
939 (unsigned long long)bi->bi_sector,
940 (unsigned long long)sh->sector, dd_idx);
941
942 if (forwrite) {
943 /* check if page is covered */
944 sector_t sector = sh->dev[dd_idx].sector;
945 for (bi=sh->dev[dd_idx].towrite;
946 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
947 bi && bi->bi_sector <= sector;
948 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
949 if (bi->bi_sector + (bi->bi_size>>9) >= sector)
950 sector = bi->bi_sector + (bi->bi_size>>9);
951 }
952 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
953 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
954 }
955 return 1;
956
957 overlap:
958 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
959 spin_unlock_irq(&conf->device_lock);
960 spin_unlock(&sh->lock);
961 return 0;
962}
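The ordered bi_next chain described in the comment above add_stripe_bio() is what the STRIPE_SECTORS-bounded loops throughout this file rely on. A minimal sketch of such a walk, reusing r5_next_bio() and STRIPE_SECTORS from this driver (the helper name is illustrative):

/* Illustration only: visit every bio on a toread/towrite chain that
 * overlaps the STRIPE_SECTORS window starting at dev_sector. */
static void walk_stripe_bios(struct bio *head, sector_t dev_sector)
{
	struct bio *b;

	for (b = head; b && b->bi_sector < dev_sector + STRIPE_SECTORS;
	     b = r5_next_bio(b, dev_sector)) {
		/* process b here */
	}
}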
963
964
965/*
966 * handle_stripe - do things to a stripe.
967 *
968 * We lock the stripe and then examine the state of various bits
969 * to see what needs to be done.
970 * Possible results:
971 * return some read requests which now have data
972 * return some write requests which are safely on disc
973 * schedule a read on some buffers
974 * schedule a write of some buffers
975 * return confirmation of parity correctness
976 *
977 * Parity calculations are done inside the stripe lock
978 * buffers are taken off read_list or write_list, and bh_cache buffers
979 * get BH_Lock set before the stripe lock is released.
980 *
981 */
982
983static void handle_stripe(struct stripe_head *sh)
984{
985 raid6_conf_t *conf = sh->raid_conf;
986 int disks = conf->raid_disks;
987 struct bio *return_bi= NULL;
988 struct bio *bi;
989 int i;
990 int syncing;
991 int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
992 int non_overwrite = 0;
993 int failed_num[2] = {0, 0};
994 struct r5dev *dev, *pdev, *qdev;
995 int pd_idx = sh->pd_idx;
996 int qd_idx = raid6_next_disk(pd_idx, disks);
997 int p_failed, q_failed;
998
999 PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d, qd_idx=%d\n",
1000 (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count),
1001 pd_idx, qd_idx);
1002
1003 spin_lock(&sh->lock);
1004 clear_bit(STRIPE_HANDLE, &sh->state);
1005 clear_bit(STRIPE_DELAYED, &sh->state);
1006
1007 syncing = test_bit(STRIPE_SYNCING, &sh->state);
1008 /* Now to look around and see what can be done */
1009
1010 for (i=disks; i--; ) {
1011 mdk_rdev_t *rdev;
1012 dev = &sh->dev[i];
1013 clear_bit(R5_Insync, &dev->flags);
1014 clear_bit(R5_Syncio, &dev->flags);
1015
1016 PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
1017 i, dev->flags, dev->toread, dev->towrite, dev->written);
1018 /* maybe we can reply to a read */
1019 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
1020 struct bio *rbi, *rbi2;
1021 PRINTK("Return read for disc %d\n", i);
1022 spin_lock_irq(&conf->device_lock);
1023 rbi = dev->toread;
1024 dev->toread = NULL;
1025 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1026 wake_up(&conf->wait_for_overlap);
1027 spin_unlock_irq(&conf->device_lock);
1028 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1029 copy_data(0, rbi, dev->page, dev->sector);
1030 rbi2 = r5_next_bio(rbi, dev->sector);
1031 spin_lock_irq(&conf->device_lock);
1032 if (--rbi->bi_phys_segments == 0) {
1033 rbi->bi_next = return_bi;
1034 return_bi = rbi;
1035 }
1036 spin_unlock_irq(&conf->device_lock);
1037 rbi = rbi2;
1038 }
1039 }
1040
1041 /* now count some things */
1042 if (test_bit(R5_LOCKED, &dev->flags)) locked++;
1043 if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
1044
1045
1046 if (dev->toread) to_read++;
1047 if (dev->towrite) {
1048 to_write++;
1049 if (!test_bit(R5_OVERWRITE, &dev->flags))
1050 non_overwrite++;
1051 }
1052 if (dev->written) written++;
1053		rdev = conf->disks[i].rdev; /* FIXME, should I be looking at rdev? */
1054 if (!rdev || !rdev->in_sync) {
1055 if ( failed < 2 )
1056 failed_num[failed] = i;
1057 failed++;
1058 } else
1059 set_bit(R5_Insync, &dev->flags);
1060 }
1061 PRINTK("locked=%d uptodate=%d to_read=%d"
1062 " to_write=%d failed=%d failed_num=%d,%d\n",
1063 locked, uptodate, to_read, to_write, failed,
1064 failed_num[0], failed_num[1]);
1065 /* check if the array has lost >2 devices and, if so, some requests might
1066 * need to be failed
1067 */
1068 if (failed > 2 && to_read+to_write+written) {
1069 spin_lock_irq(&conf->device_lock);
1070 for (i=disks; i--; ) {
1071 /* fail all writes first */
1072 bi = sh->dev[i].towrite;
1073 sh->dev[i].towrite = NULL;
1074 if (bi) to_write--;
1075
1076 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1077 wake_up(&conf->wait_for_overlap);
1078
1079 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1080 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1081 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1082 if (--bi->bi_phys_segments == 0) {
1083 md_write_end(conf->mddev);
1084 bi->bi_next = return_bi;
1085 return_bi = bi;
1086 }
1087 bi = nextbi;
1088 }
1089 /* and fail all 'written' */
1090 bi = sh->dev[i].written;
1091 sh->dev[i].written = NULL;
1092 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
1093 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
1094 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1095 if (--bi->bi_phys_segments == 0) {
1096 md_write_end(conf->mddev);
1097 bi->bi_next = return_bi;
1098 return_bi = bi;
1099 }
1100 bi = bi2;
1101 }
1102
1103 /* fail any reads if this device is non-operational */
1104 if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
1105 bi = sh->dev[i].toread;
1106 sh->dev[i].toread = NULL;
1107 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
1108 wake_up(&conf->wait_for_overlap);
1109 if (bi) to_read--;
1110 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
1111 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
1112 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1113 if (--bi->bi_phys_segments == 0) {
1114 bi->bi_next = return_bi;
1115 return_bi = bi;
1116 }
1117 bi = nextbi;
1118 }
1119 }
1120 }
1121 spin_unlock_irq(&conf->device_lock);
1122 }
1123 if (failed > 2 && syncing) {
1124 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
1125 clear_bit(STRIPE_SYNCING, &sh->state);
1126 syncing = 0;
1127 }
1128
1129 /*
1130 * might be able to return some write requests if the parity blocks
1131 * are safe, or on a failed drive
1132 */
1133 pdev = &sh->dev[pd_idx];
1134 p_failed = (failed >= 1 && failed_num[0] == pd_idx)
1135 || (failed >= 2 && failed_num[1] == pd_idx);
1136 qdev = &sh->dev[qd_idx];
1137 q_failed = (failed >= 1 && failed_num[0] == qd_idx)
1138 || (failed >= 2 && failed_num[1] == qd_idx);
1139
1140 if ( written &&
1141 ( p_failed || ((test_bit(R5_Insync, &pdev->flags)
1142 && !test_bit(R5_LOCKED, &pdev->flags)
1143 && test_bit(R5_UPTODATE, &pdev->flags))) ) &&
1144 ( q_failed || ((test_bit(R5_Insync, &qdev->flags)
1145 && !test_bit(R5_LOCKED, &qdev->flags)
1146 && test_bit(R5_UPTODATE, &qdev->flags))) ) ) {
1147 /* any written block on an uptodate or failed drive can be
1148 * returned. Note that if we 'wrote' to a failed drive,
1149 * it will be UPTODATE, but never LOCKED, so we don't need
1150 * to test 'failed' directly.
1151 */
1152 for (i=disks; i--; )
1153 if (sh->dev[i].written) {
1154 dev = &sh->dev[i];
1155 if (!test_bit(R5_LOCKED, &dev->flags) &&
1156 test_bit(R5_UPTODATE, &dev->flags) ) {
1157 /* We can return any write requests */
1158 struct bio *wbi, *wbi2;
1159 PRINTK("Return write for stripe %llu disc %d\n",
1160 (unsigned long long)sh->sector, i);
1161 spin_lock_irq(&conf->device_lock);
1162 wbi = dev->written;
1163 dev->written = NULL;
1164 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
1165 wbi2 = r5_next_bio(wbi, dev->sector);
1166 if (--wbi->bi_phys_segments == 0) {
1167 md_write_end(conf->mddev);
1168 wbi->bi_next = return_bi;
1169 return_bi = wbi;
1170 }
1171 wbi = wbi2;
1172 }
1173 spin_unlock_irq(&conf->device_lock);
1174 }
1175 }
1176 }
1177
1178 /* Now we might consider reading some blocks, either to check/generate
1179 * parity, or to satisfy requests
1180 * or to load a block that is being partially written.
1181 */
1182 if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) {
1183 for (i=disks; i--;) {
1184 dev = &sh->dev[i];
1185 if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1186 (dev->toread ||
1187 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
1188 syncing ||
1189 (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) ||
1190 (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write))
1191 )
1192 ) {
1193 /* we would like to get this block, possibly
1194 * by computing it, but we might not be able to
1195 */
1196 if (uptodate == disks-1) {
1197 PRINTK("Computing stripe %llu block %d\n",
1198 (unsigned long long)sh->sector, i);
1199 compute_block_1(sh, i);
1200 uptodate++;
1201 } else if ( uptodate == disks-2 && failed >= 2 ) {
1202 /* Computing 2-failure is *very* expensive; only do it if failed >= 2 */
1203 int other;
1204 for (other=disks; other--;) {
1205 if ( other == i )
1206 continue;
1207 if ( !test_bit(R5_UPTODATE, &sh->dev[other].flags) )
1208 break;
1209 }
1210 BUG_ON(other < 0);
1211 PRINTK("Computing stripe %llu blocks %d,%d\n",
1212 (unsigned long long)sh->sector, i, other);
1213 compute_block_2(sh, i, other);
1214 uptodate += 2;
1215 } else if (test_bit(R5_Insync, &dev->flags)) {
1216 set_bit(R5_LOCKED, &dev->flags);
1217 set_bit(R5_Wantread, &dev->flags);
1218#if 0
1219 /* if I am just reading this block and we don't have
1220 a failed drive, or any pending writes then sidestep the cache */
1221 if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
1222 ! syncing && !failed && !to_write) {
1223 sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
1224 sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
1225 }
1226#endif
1227 locked++;
1228 PRINTK("Reading block %d (sync=%d)\n",
1229 i, syncing);
1230 if (syncing)
1231 md_sync_acct(conf->disks[i].rdev->bdev,
1232 STRIPE_SECTORS);
1233 }
1234 }
1235 }
1236 set_bit(STRIPE_HANDLE, &sh->state);
1237 }
1238
1239	/* now consider writing, and what else, if anything, should be read */
1240 if (to_write) {
1241 int rcw=0, must_compute=0;
1242 for (i=disks ; i--;) {
1243 dev = &sh->dev[i];
1244 /* Would I have to read this buffer for reconstruct_write */
1245 if (!test_bit(R5_OVERWRITE, &dev->flags)
1246 && i != pd_idx && i != qd_idx
1247 && (!test_bit(R5_LOCKED, &dev->flags)
1248#if 0
1249 || sh->bh_page[i] != bh->b_page
1250#endif
1251 ) &&
1252 !test_bit(R5_UPTODATE, &dev->flags)) {
1253 if (test_bit(R5_Insync, &dev->flags)) rcw++;
1254 else {
1255 PRINTK("raid6: must_compute: disk %d flags=%#lx\n", i, dev->flags);
1256 must_compute++;
1257 }
1258 }
1259 }
1260 PRINTK("for sector %llu, rcw=%d, must_compute=%d\n",
1261 (unsigned long long)sh->sector, rcw, must_compute);
1262 set_bit(STRIPE_HANDLE, &sh->state);
1263
1264 if (rcw > 0)
1265 /* want reconstruct write, but need to get some data */
1266 for (i=disks; i--;) {
1267 dev = &sh->dev[i];
1268 if (!test_bit(R5_OVERWRITE, &dev->flags)
1269 && !(failed == 0 && (i == pd_idx || i == qd_idx))
1270 && !test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
1271 test_bit(R5_Insync, &dev->flags)) {
1272 if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1273 {
1274 PRINTK("Read_old stripe %llu block %d for Reconstruct\n",
1275 (unsigned long long)sh->sector, i);
1276 set_bit(R5_LOCKED, &dev->flags);
1277 set_bit(R5_Wantread, &dev->flags);
1278 locked++;
1279 } else {
1280 PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
1281 (unsigned long long)sh->sector, i);
1282 set_bit(STRIPE_DELAYED, &sh->state);
1283 set_bit(STRIPE_HANDLE, &sh->state);
1284 }
1285 }
1286 }
1287 /* now if nothing is locked, and if we have enough data, we can start a write request */
1288 if (locked == 0 && rcw == 0) {
1289 if ( must_compute > 0 ) {
1290 /* We have failed blocks and need to compute them */
1291 switch ( failed ) {
1292 case 0: BUG();
1293 case 1: compute_block_1(sh, failed_num[0]); break;
1294 case 2: compute_block_2(sh, failed_num[0], failed_num[1]); break;
1295 default: BUG(); /* This request should have been failed? */
1296 }
1297 }
1298
1299 PRINTK("Computing parity for stripe %llu\n", (unsigned long long)sh->sector);
1300 compute_parity(sh, RECONSTRUCT_WRITE);
1301 /* now every locked buffer is ready to be written */
1302 for (i=disks; i--;)
1303 if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
1304 PRINTK("Writing stripe %llu block %d\n",
1305 (unsigned long long)sh->sector, i);
1306 locked++;
1307 set_bit(R5_Wantwrite, &sh->dev[i].flags);
1308#if 0 /**** FIX: I don't understand the logic here... ****/
1309 if (!test_bit(R5_Insync, &sh->dev[i].flags)
1310 || ((i==pd_idx || i==qd_idx) && failed == 0)) /* FIX? */
1311 set_bit(STRIPE_INSYNC, &sh->state);
1312#endif
1313 }
1314 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
1315 atomic_dec(&conf->preread_active_stripes);
1316 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
1317 md_wakeup_thread(conf->mddev->thread);
1318 }
1319 }
1320 }
1321
1322 /* maybe we need to check and possibly fix the parity for this stripe
1323 * Any reads will already have been scheduled, so we just see if enough data
1324 * is available
1325 */
1326 if (syncing && locked == 0 &&
1327 !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 2) {
1328 set_bit(STRIPE_HANDLE, &sh->state);
1329#if 0 /* RAID-6: Don't support CHECK PARITY yet */
1330 if (failed == 0) {
1331 char *pagea;
1332 if (uptodate != disks)
1333 BUG();
1334 compute_parity(sh, CHECK_PARITY);
1335 uptodate--;
1336 pagea = page_address(sh->dev[pd_idx].page);
1337 if ((*(u32*)pagea) == 0 &&
1338 !memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
1339 /* parity is correct (on disc, not in buffer any more) */
1340 set_bit(STRIPE_INSYNC, &sh->state);
1341 }
1342 }
1343#endif
1344 if (!test_bit(STRIPE_INSYNC, &sh->state)) {
1345 int failed_needupdate[2];
1346 struct r5dev *adev, *bdev;
1347
1348 if ( failed < 1 )
1349 failed_num[0] = pd_idx;
1350 if ( failed < 2 )
1351 failed_num[1] = (failed_num[0] == qd_idx) ? pd_idx : qd_idx;
1352
1353 failed_needupdate[0] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[0]].flags);
1354 failed_needupdate[1] = !test_bit(R5_UPTODATE, &sh->dev[failed_num[1]].flags);
1355
1356 PRINTK("sync: failed=%d num=%d,%d fnu=%u%u\n",
1357 failed, failed_num[0], failed_num[1], failed_needupdate[0], failed_needupdate[1]);
1358
1359#if 0 /* RAID-6: This code seems to require that CHECK_PARITY destroys the uptodateness of the parity */
1360 /* should be able to compute the missing block(s) and write to spare */
1361 if ( failed_needupdate[0] ^ failed_needupdate[1] ) {
1362 if (uptodate+1 != disks)
1363 BUG();
1364 compute_block_1(sh, failed_needupdate[0] ? failed_num[0] : failed_num[1]);
1365 uptodate++;
1366 } else if ( failed_needupdate[0] & failed_needupdate[1] ) {
1367 if (uptodate+2 != disks)
1368 BUG();
1369 compute_block_2(sh, failed_num[0], failed_num[1]);
1370 uptodate += 2;
1371 }
1372#else
1373 compute_block_2(sh, failed_num[0], failed_num[1]);
1374 uptodate += failed_needupdate[0] + failed_needupdate[1];
1375#endif
1376
1377 if (uptodate != disks)
1378 BUG();
1379
1380 PRINTK("Marking for sync stripe %llu blocks %d,%d\n",
1381 (unsigned long long)sh->sector, failed_num[0], failed_num[1]);
1382
1383 /**** FIX: Should we really do both of these unconditionally? ****/
1384 adev = &sh->dev[failed_num[0]];
1385 locked += !test_bit(R5_LOCKED, &adev->flags);
1386 set_bit(R5_LOCKED, &adev->flags);
1387 set_bit(R5_Wantwrite, &adev->flags);
1388 bdev = &sh->dev[failed_num[1]];
1389 locked += !test_bit(R5_LOCKED, &bdev->flags);
1390 set_bit(R5_LOCKED, &bdev->flags);
1391 set_bit(R5_Wantwrite, &bdev->flags);
1392
1393 set_bit(STRIPE_INSYNC, &sh->state);
1394 set_bit(R5_Syncio, &adev->flags);
1395 set_bit(R5_Syncio, &bdev->flags);
1396 }
1397 }
1398 if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
1399 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
1400 clear_bit(STRIPE_SYNCING, &sh->state);
1401 }
1402
1403 spin_unlock(&sh->lock);
1404
1405 while ((bi=return_bi)) {
1406 int bytes = bi->bi_size;
1407
1408 return_bi = bi->bi_next;
1409 bi->bi_next = NULL;
1410 bi->bi_size = 0;
1411 bi->bi_end_io(bi, bytes, 0);
1412 }
1413 for (i=disks; i-- ;) {
1414 int rw;
1415 struct bio *bi;
1416 mdk_rdev_t *rdev;
1417 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
1418 rw = 1;
1419 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
1420 rw = 0;
1421 else
1422 continue;
1423
1424 bi = &sh->dev[i].req;
1425
1426 bi->bi_rw = rw;
1427 if (rw)
1428 bi->bi_end_io = raid6_end_write_request;
1429 else
1430 bi->bi_end_io = raid6_end_read_request;
1431
1432 rcu_read_lock();
1433 rdev = conf->disks[i].rdev;
1434 if (rdev && rdev->faulty)
1435 rdev = NULL;
1436 if (rdev)
1437 atomic_inc(&rdev->nr_pending);
1438 rcu_read_unlock();
1439
1440 if (rdev) {
1441 if (test_bit(R5_Syncio, &sh->dev[i].flags))
1442 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
1443
1444 bi->bi_bdev = rdev->bdev;
1445 PRINTK("for %llu schedule op %ld on disc %d\n",
1446 (unsigned long long)sh->sector, bi->bi_rw, i);
1447 atomic_inc(&sh->count);
1448 bi->bi_sector = sh->sector + rdev->data_offset;
1449 bi->bi_flags = 1 << BIO_UPTODATE;
1450 bi->bi_vcnt = 1;
1451 bi->bi_max_vecs = 1;
1452 bi->bi_idx = 0;
1453 bi->bi_io_vec = &sh->dev[i].vec;
1454 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
1455 bi->bi_io_vec[0].bv_offset = 0;
1456 bi->bi_size = STRIPE_SIZE;
1457 bi->bi_next = NULL;
1458 generic_make_request(bi);
1459 } else {
1460 PRINTK("skip op %ld on disc %d for sector %llu\n",
1461 bi->bi_rw, i, (unsigned long long)sh->sector);
1462 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1463 set_bit(STRIPE_HANDLE, &sh->state);
1464 }
1465 }
1466}
1467
1468static inline void raid6_activate_delayed(raid6_conf_t *conf)
1469{
1470 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
1471 while (!list_empty(&conf->delayed_list)) {
1472 struct list_head *l = conf->delayed_list.next;
1473 struct stripe_head *sh;
1474 sh = list_entry(l, struct stripe_head, lru);
1475 list_del_init(l);
1476 clear_bit(STRIPE_DELAYED, &sh->state);
1477 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
1478 atomic_inc(&conf->preread_active_stripes);
1479 list_add_tail(&sh->lru, &conf->handle_list);
1480 }
1481 }
1482}
1483
1484static void unplug_slaves(mddev_t *mddev)
1485{
1486 raid6_conf_t *conf = mddev_to_conf(mddev);
1487 int i;
1488
1489 rcu_read_lock();
1490 for (i=0; i<mddev->raid_disks; i++) {
1491 mdk_rdev_t *rdev = conf->disks[i].rdev;
1492 if (rdev && !rdev->faulty && atomic_read(&rdev->nr_pending)) {
1493 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
1494
1495 atomic_inc(&rdev->nr_pending);
1496 rcu_read_unlock();
1497
1498 if (r_queue->unplug_fn)
1499 r_queue->unplug_fn(r_queue);
1500
1501 rdev_dec_pending(rdev, mddev);
1502 rcu_read_lock();
1503 }
1504 }
1505 rcu_read_unlock();
1506}
1507
1508static void raid6_unplug_device(request_queue_t *q)
1509{
1510 mddev_t *mddev = q->queuedata;
1511 raid6_conf_t *conf = mddev_to_conf(mddev);
1512 unsigned long flags;
1513
1514 spin_lock_irqsave(&conf->device_lock, flags);
1515
1516 if (blk_remove_plug(q))
1517 raid6_activate_delayed(conf);
1518 md_wakeup_thread(mddev->thread);
1519
1520 spin_unlock_irqrestore(&conf->device_lock, flags);
1521
1522 unplug_slaves(mddev);
1523}
1524
1525static int raid6_issue_flush(request_queue_t *q, struct gendisk *disk,
1526 sector_t *error_sector)
1527{
1528 mddev_t *mddev = q->queuedata;
1529 raid6_conf_t *conf = mddev_to_conf(mddev);
1530 int i, ret = 0;
1531
1532 rcu_read_lock();
1533 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
1534 mdk_rdev_t *rdev = conf->disks[i].rdev;
1535 if (rdev && !rdev->faulty) {
1536 struct block_device *bdev = rdev->bdev;
1537 request_queue_t *r_queue = bdev_get_queue(bdev);
1538
1539 if (!r_queue->issue_flush_fn)
1540 ret = -EOPNOTSUPP;
1541 else {
1542 atomic_inc(&rdev->nr_pending);
1543 rcu_read_unlock();
1544 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
1545 error_sector);
1546 rdev_dec_pending(rdev, mddev);
1547 rcu_read_lock();
1548 }
1549 }
1550 }
1551 rcu_read_unlock();
1552 return ret;
1553}
1554
1555static inline void raid6_plug_device(raid6_conf_t *conf)
1556{
1557 spin_lock_irq(&conf->device_lock);
1558 blk_plug_device(conf->mddev->queue);
1559 spin_unlock_irq(&conf->device_lock);
1560}
1561
1562static int make_request (request_queue_t *q, struct bio * bi)
1563{
1564 mddev_t *mddev = q->queuedata;
1565 raid6_conf_t *conf = mddev_to_conf(mddev);
1566 const unsigned int raid_disks = conf->raid_disks;
1567 const unsigned int data_disks = raid_disks - 2;
1568 unsigned int dd_idx, pd_idx;
1569 sector_t new_sector;
1570 sector_t logical_sector, last_sector;
1571 struct stripe_head *sh;
1572
1573 if (bio_data_dir(bi)==WRITE) {
1574 disk_stat_inc(mddev->gendisk, writes);
1575 disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
1576 } else {
1577 disk_stat_inc(mddev->gendisk, reads);
1578 disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi));
1579 }
1580
1581 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
1582 last_sector = bi->bi_sector + (bi->bi_size>>9);
1583
1584 bi->bi_next = NULL;
1585 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
1586 if ( bio_data_dir(bi) == WRITE )
1587 md_write_start(mddev);
1588 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
1589 DEFINE_WAIT(w);
1590
1591 new_sector = raid6_compute_sector(logical_sector,
1592 raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1593
1594 PRINTK("raid6: make_request, sector %llu logical %llu\n",
1595 (unsigned long long)new_sector,
1596 (unsigned long long)logical_sector);
1597
1598 retry:
1599 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
1600 sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
1601 if (sh) {
1602 if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
1603 /* Add failed due to overlap. Flush everything
1604 * and wait a while
1605 */
1606 raid6_unplug_device(mddev->queue);
1607 release_stripe(sh);
1608 schedule();
1609 goto retry;
1610 }
1611 finish_wait(&conf->wait_for_overlap, &w);
1612 raid6_plug_device(conf);
1613 handle_stripe(sh);
1614 release_stripe(sh);
1615 } else {
1616			/* cannot get stripe for read-ahead, just give up */
1617 clear_bit(BIO_UPTODATE, &bi->bi_flags);
1618 finish_wait(&conf->wait_for_overlap, &w);
1619 break;
1620 }
1621
1622 }
1623 spin_lock_irq(&conf->device_lock);
1624 if (--bi->bi_phys_segments == 0) {
1625 int bytes = bi->bi_size;
1626
1627 if ( bio_data_dir(bi) == WRITE )
1628 md_write_end(mddev);
1629 bi->bi_size = 0;
1630 bi->bi_end_io(bi, bytes, 0);
1631 }
1632 spin_unlock_irq(&conf->device_lock);
1633 return 0;
1634}
1635
1636/* FIXME go_faster isn't used */
1637static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
1638{
1639 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
1640 struct stripe_head *sh;
1641 int sectors_per_chunk = conf->chunk_size >> 9;
1642 sector_t x;
1643 unsigned long stripe;
1644 int chunk_offset;
1645 int dd_idx, pd_idx;
1646 sector_t first_sector;
1647 int raid_disks = conf->raid_disks;
1648 int data_disks = raid_disks - 2;
1649
1650 if (sector_nr >= mddev->size <<1) {
1651 /* just being told to finish up .. nothing much to do */
1652 unplug_slaves(mddev);
1653 return 0;
1654 }
1655 /* if there are 2 or more failed drives and we are trying
1656 * to resync, then assert that we are finished, because there is
1657 * nothing we can do.
1658 */
1659 if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
1660 int rv = (mddev->size << 1) - sector_nr;
1661 md_done_sync(mddev, rv, 1);
1662 return rv;
1663 }
1664
1665 x = sector_nr;
1666 chunk_offset = sector_div(x, sectors_per_chunk);
1667 stripe = x;
1668 BUG_ON(x != stripe);
1669
1670 first_sector = raid6_compute_sector((sector_t)stripe*data_disks*sectors_per_chunk
1671 + chunk_offset, raid_disks, data_disks, &dd_idx, &pd_idx, conf);
1672 sh = get_active_stripe(conf, sector_nr, pd_idx, 1);
1673 if (sh == NULL) {
1674 sh = get_active_stripe(conf, sector_nr, pd_idx, 0);
1675 /* make sure we don't swamp the stripe cache if someone else
1676 * is trying to get access
1677 */
1678 set_current_state(TASK_UNINTERRUPTIBLE);
1679 schedule_timeout(1);
1680 }
1681 spin_lock(&sh->lock);
1682 set_bit(STRIPE_SYNCING, &sh->state);
1683 clear_bit(STRIPE_INSYNC, &sh->state);
1684 spin_unlock(&sh->lock);
1685
1686 handle_stripe(sh);
1687 release_stripe(sh);
1688
1689 return STRIPE_SECTORS;
1690}
1691
1692/*
1693 * This is our raid6 kernel thread.
1694 *
1695 * We scan the hash table for stripes which can be handled now.
1696 * During the scan, completed stripes are saved for us by the interrupt
1697 * handler, so that they will not have to wait for our next wakeup.
1698 */
1699static void raid6d (mddev_t *mddev)
1700{
1701 struct stripe_head *sh;
1702 raid6_conf_t *conf = mddev_to_conf(mddev);
1703 int handled;
1704
1705 PRINTK("+++ raid6d active\n");
1706
1707 md_check_recovery(mddev);
1708 md_handle_safemode(mddev);
1709
1710 handled = 0;
1711 spin_lock_irq(&conf->device_lock);
1712 while (1) {
1713 struct list_head *first;
1714
1715 if (list_empty(&conf->handle_list) &&
1716 atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
1717 !blk_queue_plugged(mddev->queue) &&
1718 !list_empty(&conf->delayed_list))
1719 raid6_activate_delayed(conf);
1720
1721 if (list_empty(&conf->handle_list))
1722 break;
1723
1724 first = conf->handle_list.next;
1725 sh = list_entry(first, struct stripe_head, lru);
1726
1727 list_del_init(first);
1728 atomic_inc(&sh->count);
1729 if (atomic_read(&sh->count)!= 1)
1730 BUG();
1731 spin_unlock_irq(&conf->device_lock);
1732
1733 handled++;
1734 handle_stripe(sh);
1735 release_stripe(sh);
1736
1737 spin_lock_irq(&conf->device_lock);
1738 }
1739 PRINTK("%d stripes handled\n", handled);
1740
1741 spin_unlock_irq(&conf->device_lock);
1742
1743 unplug_slaves(mddev);
1744
1745 PRINTK("--- raid6d inactive\n");
1746}
1747
1748static int run (mddev_t *mddev)
1749{
1750 raid6_conf_t *conf;
1751 int raid_disk, memory;
1752 mdk_rdev_t *rdev;
1753 struct disk_info *disk;
1754 struct list_head *tmp;
1755
1756 if (mddev->level != 6) {
1757 PRINTK("raid6: %s: raid level not set to 6 (%d)\n", mdname(mddev), mddev->level);
1758 return -EIO;
1759 }
1760
1761 mddev->private = kmalloc (sizeof (raid6_conf_t)
1762 + mddev->raid_disks * sizeof(struct disk_info),
1763 GFP_KERNEL);
1764 if ((conf = mddev->private) == NULL)
1765 goto abort;
1766 memset (conf, 0, sizeof (*conf) + mddev->raid_disks * sizeof(struct disk_info) );
1767 conf->mddev = mddev;
1768
1769 if ((conf->stripe_hashtbl = (struct stripe_head **) __get_free_pages(GFP_ATOMIC, HASH_PAGES_ORDER)) == NULL)
1770 goto abort;
1771 memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
1772
1773 spin_lock_init(&conf->device_lock);
1774 init_waitqueue_head(&conf->wait_for_stripe);
1775 init_waitqueue_head(&conf->wait_for_overlap);
1776 INIT_LIST_HEAD(&conf->handle_list);
1777 INIT_LIST_HEAD(&conf->delayed_list);
1778 INIT_LIST_HEAD(&conf->inactive_list);
1779 atomic_set(&conf->active_stripes, 0);
1780 atomic_set(&conf->preread_active_stripes, 0);
1781
1782 mddev->queue->unplug_fn = raid6_unplug_device;
1783 mddev->queue->issue_flush_fn = raid6_issue_flush;
1784
1785 PRINTK("raid6: run(%s) called.\n", mdname(mddev));
1786
1787 ITERATE_RDEV(mddev,rdev,tmp) {
1788 raid_disk = rdev->raid_disk;
1789 if (raid_disk >= mddev->raid_disks
1790 || raid_disk < 0)
1791 continue;
1792 disk = conf->disks + raid_disk;
1793
1794 disk->rdev = rdev;
1795
1796 if (rdev->in_sync) {
1797 char b[BDEVNAME_SIZE];
1798 printk(KERN_INFO "raid6: device %s operational as raid"
1799 " disk %d\n", bdevname(rdev->bdev,b),
1800 raid_disk);
1801 conf->working_disks++;
1802 }
1803 }
1804
1805 conf->raid_disks = mddev->raid_disks;
1806
1807 /*
1808 * 0 for a fully functional array, 1 or 2 for a degraded array.
1809 */
1810 mddev->degraded = conf->failed_disks = conf->raid_disks - conf->working_disks;
1811 conf->mddev = mddev;
1812 conf->chunk_size = mddev->chunk_size;
1813 conf->level = mddev->level;
1814 conf->algorithm = mddev->layout;
1815 conf->max_nr_stripes = NR_STRIPES;
1816
1817 /* device size must be a multiple of chunk size */
1818 mddev->size &= ~(mddev->chunk_size/1024 -1);
1819
1820 if (conf->raid_disks < 4) {
1821 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
1822 mdname(mddev), conf->raid_disks);
1823 goto abort;
1824 }
1825 if (!conf->chunk_size || conf->chunk_size % 4) {
1826 printk(KERN_ERR "raid6: invalid chunk size %d for %s\n",
1827 conf->chunk_size, mdname(mddev));
1828 goto abort;
1829 }
1830 if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
1831 printk(KERN_ERR
1832 "raid6: unsupported parity algorithm %d for %s\n",
1833 conf->algorithm, mdname(mddev));
1834 goto abort;
1835 }
1836 if (mddev->degraded > 2) {
1837 printk(KERN_ERR "raid6: not enough operational devices for %s"
1838 " (%d/%d failed)\n",
1839 mdname(mddev), conf->failed_disks, conf->raid_disks);
1840 goto abort;
1841 }
1842
1843#if 0 /* FIX: For now */
1844 if (mddev->degraded > 0 &&
1845 mddev->recovery_cp != MaxSector) {
1846 printk(KERN_ERR "raid6: cannot start dirty degraded array for %s\n", mdname(mddev));
1847 goto abort;
1848 }
1849#endif
1850
1851 {
1852 mddev->thread = md_register_thread(raid6d, mddev, "%s_raid6");
1853 if (!mddev->thread) {
1854 printk(KERN_ERR
1855 "raid6: couldn't allocate thread for %s\n",
1856 mdname(mddev));
1857 goto abort;
1858 }
1859 }
1860
1861 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
1862 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
1863 if (grow_stripes(conf, conf->max_nr_stripes)) {
1864 printk(KERN_ERR
1865 "raid6: couldn't allocate %dkB for buffers\n", memory);
1866 shrink_stripes(conf);
1867 md_unregister_thread(mddev->thread);
1868 goto abort;
1869 } else
1870 printk(KERN_INFO "raid6: allocated %dkB for %s\n",
1871 memory, mdname(mddev));
1872
1873 if (mddev->degraded == 0)
1874 printk(KERN_INFO "raid6: raid level %d set %s active with %d out of %d"
1875 " devices, algorithm %d\n", conf->level, mdname(mddev),
1876 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
1877 conf->algorithm);
1878 else
1879 printk(KERN_ALERT "raid6: raid level %d set %s active with %d"
1880 " out of %d devices, algorithm %d\n", conf->level,
1881 mdname(mddev), mddev->raid_disks - mddev->degraded,
1882 mddev->raid_disks, conf->algorithm);
1883
1884 print_raid6_conf(conf);
1885
1886 /* read-ahead size must cover two whole stripes, which is
1887 * 2 * (n-2) * chunksize where 'n' is the number of raid devices
1888 */
1889 {
1890 int stripe = (mddev->raid_disks-2) * mddev->chunk_size
1891 / PAGE_CACHE_SIZE;
1892 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
1893 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
1894 }
1895
1896 /* Ok, everything is just fine now */
1897 mddev->array_size = mddev->size * (mddev->raid_disks - 2);
1898 return 0;
1899abort:
1900 if (conf) {
1901 print_raid6_conf(conf);
1902 if (conf->stripe_hashtbl)
1903 free_pages((unsigned long) conf->stripe_hashtbl,
1904 HASH_PAGES_ORDER);
1905 kfree(conf);
1906 }
1907 mddev->private = NULL;
1908 printk(KERN_ALERT "raid6: failed to run raid set %s\n", mdname(mddev));
1909 return -EIO;
1910}
1911
1912
1913
1914static int stop (mddev_t *mddev)
1915{
1916 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
1917
1918 md_unregister_thread(mddev->thread);
1919 mddev->thread = NULL;
1920 shrink_stripes(conf);
1921 free_pages((unsigned long) conf->stripe_hashtbl, HASH_PAGES_ORDER);
1922 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
1923 kfree(conf);
1924 mddev->private = NULL;
1925 return 0;
1926}
1927
1928#if RAID6_DUMPSTATE
1929static void print_sh (struct seq_file *seq, struct stripe_head *sh)
1930{
1931 int i;
1932
1933 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
1934 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
1935 seq_printf(seq, "sh %llu, count %d.\n",
1936 (unsigned long long)sh->sector, atomic_read(&sh->count));
1937 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
1938 for (i = 0; i < sh->raid_conf->raid_disks; i++) {
1939 seq_printf(seq, "(cache%d: %p %ld) ",
1940 i, sh->dev[i].page, sh->dev[i].flags);
1941 }
1942 seq_printf(seq, "\n");
1943}
1944
1945static void printall (struct seq_file *seq, raid6_conf_t *conf)
1946{
1947 struct stripe_head *sh;
1948 int i;
1949
1950 spin_lock_irq(&conf->device_lock);
1951 for (i = 0; i < NR_HASH; i++) {
1952 sh = conf->stripe_hashtbl[i];
1953 for (; sh; sh = sh->hash_next) {
1954 if (sh->raid_conf != conf)
1955 continue;
1956 print_sh(seq, sh);
1957 }
1958 }
1959 spin_unlock_irq(&conf->device_lock);
1960}
1961#endif
1962
1963static void status (struct seq_file *seq, mddev_t *mddev)
1964{
1965 raid6_conf_t *conf = (raid6_conf_t *) mddev->private;
1966 int i;
1967
1968 seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
1969 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->working_disks);
1970 for (i = 0; i < conf->raid_disks; i++)
1971 seq_printf (seq, "%s",
1972 conf->disks[i].rdev &&
1973 conf->disks[i].rdev->in_sync ? "U" : "_");
1974 seq_printf (seq, "]");
1975#if RAID6_DUMPSTATE
1976 seq_printf (seq, "\n");
1977 printall(seq, conf);
1978#endif
1979}
1980
1981static void print_raid6_conf (raid6_conf_t *conf)
1982{
1983 int i;
1984 struct disk_info *tmp;
1985
1986 printk("RAID6 conf printout:\n");
1987 if (!conf) {
1988 printk("(conf==NULL)\n");
1989 return;
1990 }
1991 printk(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
1992 conf->working_disks, conf->failed_disks);
1993
1994 for (i = 0; i < conf->raid_disks; i++) {
1995 char b[BDEVNAME_SIZE];
1996 tmp = conf->disks + i;
1997 if (tmp->rdev)
1998 printk(" disk %d, o:%d, dev:%s\n",
1999 i, !tmp->rdev->faulty,
2000 bdevname(tmp->rdev->bdev,b));
2001 }
2002}
2003
2004static int raid6_spare_active(mddev_t *mddev)
2005{
2006 int i;
2007 raid6_conf_t *conf = mddev->private;
2008 struct disk_info *tmp;
2009
2010 for (i = 0; i < conf->raid_disks; i++) {
2011 tmp = conf->disks + i;
2012 if (tmp->rdev
2013 && !tmp->rdev->faulty
2014 && !tmp->rdev->in_sync) {
2015 mddev->degraded--;
2016 conf->failed_disks--;
2017 conf->working_disks++;
2018 tmp->rdev->in_sync = 1;
2019 }
2020 }
2021 print_raid6_conf(conf);
2022 return 0;
2023}
2024
2025static int raid6_remove_disk(mddev_t *mddev, int number)
2026{
2027 raid6_conf_t *conf = mddev->private;
2028 int err = 0;
2029 mdk_rdev_t *rdev;
2030 struct disk_info *p = conf->disks + number;
2031
2032 print_raid6_conf(conf);
2033 rdev = p->rdev;
2034 if (rdev) {
2035 if (rdev->in_sync ||
2036 atomic_read(&rdev->nr_pending)) {
2037 err = -EBUSY;
2038 goto abort;
2039 }
2040 p->rdev = NULL;
2041 synchronize_kernel();
2042 if (atomic_read(&rdev->nr_pending)) {
2043 /* lost the race, try later */
2044 err = -EBUSY;
2045 p->rdev = rdev;
2046 }
2047 }
2048
2049abort:
2050
2051 print_raid6_conf(conf);
2052 return err;
2053}
2054
2055static int raid6_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
2056{
2057 raid6_conf_t *conf = mddev->private;
2058 int found = 0;
2059 int disk;
2060 struct disk_info *p;
2061
2062 if (mddev->degraded > 2)
2063 /* no point adding a device */
2064 return 0;
2065 /*
2066 * find the disk ...
2067 */
2068 for (disk=0; disk < mddev->raid_disks; disk++)
2069 if ((p=conf->disks + disk)->rdev == NULL) {
2070 rdev->in_sync = 0;
2071 rdev->raid_disk = disk;
2072 found = 1;
2073 p->rdev = rdev;
2074 break;
2075 }
2076 print_raid6_conf(conf);
2077 return found;
2078}
2079
2080static int raid6_resize(mddev_t *mddev, sector_t sectors)
2081{
2082 /* no resync is happening, and there is enough space
2083 * on all devices, so we can resize.
2084 * We need to make sure resync covers any new space.
2085 * If the array is shrinking we should possibly wait until
2086 * any io in the removed space completes, but it hardly seems
2087 * worth it.
2088 */
2089 sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
2090 mddev->array_size = (sectors * (mddev->raid_disks-2))>>1;
2091 set_capacity(mddev->gendisk, mddev->array_size << 1);
2092 mddev->changed = 1;
2093 if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) {
2094 mddev->recovery_cp = mddev->size << 1;
2095 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2096 }
2097 mddev->size = sectors /2;
2098 return 0;
2099}
2100
2101static mdk_personality_t raid6_personality=
2102{
2103 .name = "raid6",
2104 .owner = THIS_MODULE,
2105 .make_request = make_request,
2106 .run = run,
2107 .stop = stop,
2108 .status = status,
2109 .error_handler = error,
2110 .hot_add_disk = raid6_add_disk,
2111 .hot_remove_disk= raid6_remove_disk,
2112 .spare_active = raid6_spare_active,
2113 .sync_request = sync_request,
2114 .resize = raid6_resize,
2115};
2116
2117static int __init raid6_init (void)
2118{
2119 int e;
2120
2121 e = raid6_select_algo();
2122 if ( e )
2123 return e;
2124
2125 return register_md_personality (RAID6, &raid6_personality);
2126}
2127
2128static void raid6_exit (void)
2129{
2130 unregister_md_personality (RAID6);
2131}
2132
2133module_init(raid6_init);
2134module_exit(raid6_exit);
2135MODULE_LICENSE("GPL");
2136MODULE_ALIAS("md-personality-8"); /* RAID6 */
diff --git a/drivers/md/raid6mmx.c b/drivers/md/raid6mmx.c
new file mode 100644
index 000000000000..359157aaf9e0
--- /dev/null
+++ b/drivers/md/raid6mmx.c
@@ -0,0 +1,150 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright 2002 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
8 * Boston, MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * raid6mmx.c
15 *
16 * MMX implementation of RAID-6 syndrome functions
17 */
18
19#if defined(__i386__)
20
21#include "raid6.h"
22#include "raid6x86.h"
23
24/* Shared with raid6sse1.c */
25const struct raid6_mmx_constants {
26 u64 x1d;
27} raid6_mmx_constants = {
28 0x1d1d1d1d1d1d1d1dULL,
29};
30
31static int raid6_have_mmx(void)
32{
33#ifdef __KERNEL__
34 /* Not really "boot_cpu" but "all_cpus" */
35 return boot_cpu_has(X86_FEATURE_MMX);
36#else
37 /* User space test code */
38 u32 features = cpuid_features();
39 return ( (features & (1<<23)) == (1<<23) );
40#endif
41}
42
43/*
44 * Plain MMX implementation
45 */
46static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs)
47{
48 u8 **dptr = (u8 **)ptrs;
49 u8 *p, *q;
50 int d, z, z0;
51 raid6_mmx_save_t sa;
52
53 z0 = disks - 3; /* Highest data disk */
54 p = dptr[z0+1]; /* XOR parity */
55 q = dptr[z0+2]; /* RS syndrome */
56
57 raid6_before_mmx(&sa);
58
59 asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
60 asm volatile("pxor %mm5,%mm5"); /* Zero temp */
61
62 for ( d = 0 ; d < bytes ; d += 8 ) {
63 asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
64 asm volatile("movq %mm2,%mm4"); /* Q[0] */
65 for ( z = z0-1 ; z >= 0 ; z-- ) {
66 asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d]));
67 asm volatile("pcmpgtb %mm4,%mm5");
68 asm volatile("paddb %mm4,%mm4");
69 asm volatile("pand %mm0,%mm5");
70 asm volatile("pxor %mm5,%mm4");
71 asm volatile("pxor %mm5,%mm5");
72 asm volatile("pxor %mm6,%mm2");
73 asm volatile("pxor %mm6,%mm4");
74 }
75 asm volatile("movq %%mm2,%0" : "=m" (p[d]));
76 asm volatile("pxor %mm2,%mm2");
77 asm volatile("movq %%mm4,%0" : "=m" (q[d]));
78 asm volatile("pxor %mm4,%mm4");
79 }
80
81 raid6_after_mmx(&sa);
82}
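The pcmpgtb/paddb/pand/pxor sequence above is a byte-parallel multiply-by-2 in GF(2^8) with the 0x11d reduction polynomial, which is where the 0x1d constant comes from. A scalar C sketch of the same step, for illustration only:

/* Illustration only: scalar equivalent of the per-byte multiply-by-2 step. */
static inline u8 gf256_mul2(u8 v)
{
	return (u8)((v << 1) ^ ((v & 0x80) ? 0x1d : 0x00));
}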
83
84const struct raid6_calls raid6_mmxx1 = {
85 raid6_mmx1_gen_syndrome,
86 raid6_have_mmx,
87 "mmxx1",
88 0
89};
90
91/*
92 * Unrolled-by-2 MMX implementation
93 */
94static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs)
95{
96 u8 **dptr = (u8 **)ptrs;
97 u8 *p, *q;
98 int d, z, z0;
99 raid6_mmx_save_t sa;
100
101 z0 = disks - 3; /* Highest data disk */
102 p = dptr[z0+1]; /* XOR parity */
103 q = dptr[z0+2]; /* RS syndrome */
104
105 raid6_before_mmx(&sa);
106
107 asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
108 asm volatile("pxor %mm5,%mm5"); /* Zero temp */
109 asm volatile("pxor %mm7,%mm7"); /* Zero temp */
110
111 for ( d = 0 ; d < bytes ; d += 16 ) {
112 asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
113 asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8]));
114 asm volatile("movq %mm2,%mm4"); /* Q[0] */
115 asm volatile("movq %mm3,%mm6"); /* Q[1] */
116 for ( z = z0-1 ; z >= 0 ; z-- ) {
117 asm volatile("pcmpgtb %mm4,%mm5");
118 asm volatile("pcmpgtb %mm6,%mm7");
119 asm volatile("paddb %mm4,%mm4");
120 asm volatile("paddb %mm6,%mm6");
121 asm volatile("pand %mm0,%mm5");
122 asm volatile("pand %mm0,%mm7");
123 asm volatile("pxor %mm5,%mm4");
124 asm volatile("pxor %mm7,%mm6");
125 asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d]));
126 asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8]));
127 asm volatile("pxor %mm5,%mm2");
128 asm volatile("pxor %mm7,%mm3");
129 asm volatile("pxor %mm5,%mm4");
130 asm volatile("pxor %mm7,%mm6");
131 asm volatile("pxor %mm5,%mm5");
132 asm volatile("pxor %mm7,%mm7");
133 }
134 asm volatile("movq %%mm2,%0" : "=m" (p[d]));
135 asm volatile("movq %%mm3,%0" : "=m" (p[d+8]));
136 asm volatile("movq %%mm4,%0" : "=m" (q[d]));
137 asm volatile("movq %%mm6,%0" : "=m" (q[d+8]));
138 }
139
140 raid6_after_mmx(&sa);
141}
142
143const struct raid6_calls raid6_mmxx2 = {
144 raid6_mmx2_gen_syndrome,
145 raid6_have_mmx,
146 "mmxx2",
147 0
148};
149
150#endif
diff --git a/drivers/md/raid6recov.c b/drivers/md/raid6recov.c
new file mode 100644
index 000000000000..a8c4d9451bd9
--- /dev/null
+++ b/drivers/md/raid6recov.c
@@ -0,0 +1,133 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright 2002 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
8 * Boston, MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * raid6recov.c
15 *
16 * RAID-6 data recovery in dual failure mode. In single failure mode,
17 * use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct
18 * the syndrome).
19 */
20
21#include "raid6.h"
22
23/* Recover two failed data blocks. */
24void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
25 void **ptrs)
26{
27 u8 *p, *q, *dp, *dq;
28 u8 px, qx, db;
29 const u8 *pbmul; /* P multiplier table for B data */
30 const u8 *qmul; /* Q multiplier table (for both) */
31
32 p = (u8 *)ptrs[disks-2];
33 q = (u8 *)ptrs[disks-1];
34
35 /* Compute syndrome with zero for the missing data pages
36 Use the dead data pages as temporary storage for
37 delta p and delta q */
38 dp = (u8 *)ptrs[faila];
39 ptrs[faila] = (void *)raid6_empty_zero_page;
40 ptrs[disks-2] = dp;
41 dq = (u8 *)ptrs[failb];
42 ptrs[failb] = (void *)raid6_empty_zero_page;
43 ptrs[disks-1] = dq;
44
45 raid6_call.gen_syndrome(disks, bytes, ptrs);
46
47 /* Restore pointer table */
48 ptrs[faila] = dp;
49 ptrs[failb] = dq;
50 ptrs[disks-2] = p;
51 ptrs[disks-1] = q;
52
53 /* Now, pick the proper data tables */
54 pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
55 qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
56
57 /* Now do it... */
58 while ( bytes-- ) {
59 px = *p ^ *dp;
60 qx = qmul[*q ^ *dq];
61 *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */
62 *dp++ = db ^ px; /* Reconstructed A */
63 p++; q++;
64 }
65}
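For context, the two lookup tables above implement the GF(256) solution of the syndrome equations. Writing P_d and Q_d for the deltas computed against the zero-filled syndrome, and g for the field generator behind the exp tables (all arithmetic in GF(256), so division means multiplication by the inverse), a sketch of the algebra:

\[
\begin{aligned}
D_a \oplus D_b &= P_d \\
g^{a} D_a \oplus g^{b} D_b &= Q_d \\
D_b &= \frac{P_d}{g^{\,b-a} \oplus 1} \;\oplus\; \frac{Q_d}{g^{a} \oplus g^{b}} \\
D_a &= D_b \oplus P_d
\end{aligned}
\]

pbmul[] performs the multiplication by (g^(b-a) XOR 1)^-1 and qmul[] the multiplication by (g^a XOR g^b)^-1, matching the loop above.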
66
67
68
69
70/* Recover failure of one data block plus the P block */
71void raid6_datap_recov(int disks, size_t bytes, int faila, void **ptrs)
72{
73 u8 *p, *q, *dq;
74 const u8 *qmul; /* Q multiplier table */
75
76 p = (u8 *)ptrs[disks-2];
77 q = (u8 *)ptrs[disks-1];
78
79 /* Compute syndrome with zero for the missing data page
80 Use the dead data page as temporary storage for delta q */
81 dq = (u8 *)ptrs[faila];
82 ptrs[faila] = (void *)raid6_empty_zero_page;
83 ptrs[disks-1] = dq;
84
85 raid6_call.gen_syndrome(disks, bytes, ptrs);
86
87 /* Restore pointer table */
88 ptrs[faila] = dq;
89 ptrs[disks-1] = q;
90
91 /* Now, pick the proper data tables */
92 qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
93
94 /* Now do it... */
95 while ( bytes-- ) {
96 *p++ ^= *dq = qmul[*q ^ *dq];
97 q++; dq++;
98 }
99}
100
101
102#ifndef __KERNEL__ /* Testing only */
103
104/* Recover two failed blocks. */
105void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs)
106{
107 if ( faila > failb ) {
108 int tmp = faila;
109 faila = failb;
110 failb = tmp;
111 }
112
113 if ( failb == disks-1 ) {
114 if ( faila == disks-2 ) {
115 /* P+Q failure. Just rebuild the syndrome. */
116 raid6_call.gen_syndrome(disks, bytes, ptrs);
117 } else {
118 /* data+Q failure. Reconstruct data from P,
119 then rebuild syndrome. */
120 /* NOT IMPLEMENTED - equivalent to RAID-5 */
121 }
122 } else {
123 if ( failb == disks-2 ) {
124 /* data+P failure. */
125 raid6_datap_recov(disks, bytes, faila, ptrs);
126 } else {
127 /* data+data failure. */
128 raid6_2data_recov(disks, bytes, faila, failb, ptrs);
129 }
130 }
131}
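The data+Q branch above is marked NOT IMPLEMENTED; the RAID-5 step it alludes to would rebuild the data block from P and the surviving data, then regenerate the syndrome. A hypothetical sketch under that reading (function name and code are illustrative, not part of the file):

/* Hypothetical sketch: recover one data block from P, then rebuild Q. */
static void raid6_dataq_recov_sketch(int disks, size_t bytes, int faila, void **ptrs)
{
	u8 *dd = (u8 *)ptrs[faila];
	const u8 *p = (const u8 *)ptrs[disks-2];
	size_t b;
	int i;

	for (b = 0; b < bytes; b++)		/* start from P ... */
		dd[b] = p[b];
	for (i = 0; i < disks-2; i++) {		/* ... XOR in the surviving data blocks */
		const u8 *src = (const u8 *)ptrs[i];
		if (i == faila)
			continue;
		for (b = 0; b < bytes; b++)
			dd[b] ^= src[b];
	}
	raid6_call.gen_syndrome(disks, bytes, ptrs);	/* P unchanged, Q rebuilt */
}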
132
133#endif
diff --git a/drivers/md/raid6sse1.c b/drivers/md/raid6sse1.c
new file mode 100644
index 000000000000..f7e7859f71aa
--- /dev/null
+++ b/drivers/md/raid6sse1.c
@@ -0,0 +1,171 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright 2002 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
8 * Boston, MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * raid6sse1.c
15 *
16 * SSE-1/MMXEXT implementation of RAID-6 syndrome functions
17 *
18 * This is really an MMX implementation, but it requires SSE-1 or
19 * AMD MMXEXT for prefetch support and a few other features. The
20 * support for nontemporal memory accesses is enough to make this
21 * worthwhile as a separate implementation.
22 */
23
24#if defined(__i386__)
25
26#include "raid6.h"
27#include "raid6x86.h"
28
29/* Defined in raid6mmx.c */
30extern const struct raid6_mmx_constants {
31 u64 x1d;
32} raid6_mmx_constants;
33
34static int raid6_have_sse1_or_mmxext(void)
35{
36#ifdef __KERNEL__
37 /* Not really boot_cpu but "all_cpus" */
38 return boot_cpu_has(X86_FEATURE_MMX) &&
39 (boot_cpu_has(X86_FEATURE_XMM) ||
40 boot_cpu_has(X86_FEATURE_MMXEXT));
41#else
42	/* User space test code - incorrectly rejects some Athlons (MMXEXT without the SSE bit) */
43 u32 features = cpuid_features();
44 return ( (features & (5<<23)) == (5<<23) );
45#endif
46}
47
48/*
49 * Plain SSE1 implementation
50 */
51static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs)
52{
53 u8 **dptr = (u8 **)ptrs;
54 u8 *p, *q;
55 int d, z, z0;
56 raid6_mmx_save_t sa;
57
58 z0 = disks - 3; /* Highest data disk */
59 p = dptr[z0+1]; /* XOR parity */
60 q = dptr[z0+2]; /* RS syndrome */
61
62 /* This is really MMX code, not SSE */
63 raid6_before_mmx(&sa);
64
65 asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
66 asm volatile("pxor %mm5,%mm5"); /* Zero temp */
67
68 for ( d = 0 ; d < bytes ; d += 8 ) {
69 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
70 asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
71 asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
72 asm volatile("movq %mm2,%mm4"); /* Q[0] */
73 asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d]));
74 for ( z = z0-2 ; z >= 0 ; z-- ) {
75 asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
76 asm volatile("pcmpgtb %mm4,%mm5");
77 asm volatile("paddb %mm4,%mm4");
78 asm volatile("pand %mm0,%mm5");
79 asm volatile("pxor %mm5,%mm4");
80 asm volatile("pxor %mm5,%mm5");
81 asm volatile("pxor %mm6,%mm2");
82 asm volatile("pxor %mm6,%mm4");
83 asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d]));
84 }
85 asm volatile("pcmpgtb %mm4,%mm5");
86 asm volatile("paddb %mm4,%mm4");
87 asm volatile("pand %mm0,%mm5");
88 asm volatile("pxor %mm5,%mm4");
89 asm volatile("pxor %mm5,%mm5");
90 asm volatile("pxor %mm6,%mm2");
91 asm volatile("pxor %mm6,%mm4");
92
93 asm volatile("movntq %%mm2,%0" : "=m" (p[d]));
94 asm volatile("movntq %%mm4,%0" : "=m" (q[d]));
95 }
96
97 raid6_after_mmx(&sa);
98 asm volatile("sfence" : : : "memory");
99}
100
101const struct raid6_calls raid6_sse1x1 = {
102 raid6_sse11_gen_syndrome,
103 raid6_have_sse1_or_mmxext,
104 "sse1x1",
105 1 /* Has cache hints */
106};
107
108/*
109 * Unrolled-by-2 SSE1 implementation
110 */
111static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs)
112{
113 u8 **dptr = (u8 **)ptrs;
114 u8 *p, *q;
115 int d, z, z0;
116 raid6_mmx_save_t sa;
117
118 z0 = disks - 3; /* Highest data disk */
119 p = dptr[z0+1]; /* XOR parity */
120 q = dptr[z0+2]; /* RS syndrome */
121
122 raid6_before_mmx(&sa);
123
124 asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d));
125 asm volatile("pxor %mm5,%mm5"); /* Zero temp */
126 asm volatile("pxor %mm7,%mm7"); /* Zero temp */
127
128 /* We uniformly assume a single prefetch covers at least 16 bytes */
129 for ( d = 0 ; d < bytes ; d += 16 ) {
130 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
131 asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */
132 asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */
133 asm volatile("movq %mm2,%mm4"); /* Q[0] */
134 asm volatile("movq %mm3,%mm6"); /* Q[1] */
135 for ( z = z0-1 ; z >= 0 ; z-- ) {
136 asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
137 asm volatile("pcmpgtb %mm4,%mm5");
138 asm volatile("pcmpgtb %mm6,%mm7");
139 asm volatile("paddb %mm4,%mm4");
140 asm volatile("paddb %mm6,%mm6");
141 asm volatile("pand %mm0,%mm5");
142 asm volatile("pand %mm0,%mm7");
143 asm volatile("pxor %mm5,%mm4");
144 asm volatile("pxor %mm7,%mm6");
145 asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d]));
146 asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8]));
147 asm volatile("pxor %mm5,%mm2");
148 asm volatile("pxor %mm7,%mm3");
149 asm volatile("pxor %mm5,%mm4");
150 asm volatile("pxor %mm7,%mm6");
151 asm volatile("pxor %mm5,%mm5");
152 asm volatile("pxor %mm7,%mm7");
153 }
154 asm volatile("movntq %%mm2,%0" : "=m" (p[d]));
155 asm volatile("movntq %%mm3,%0" : "=m" (p[d+8]));
156 asm volatile("movntq %%mm4,%0" : "=m" (q[d]));
157 asm volatile("movntq %%mm6,%0" : "=m" (q[d+8]));
158 }
159
160 raid6_after_mmx(&sa);
161	asm volatile("sfence" : : : "memory");
162}
163
164const struct raid6_calls raid6_sse1x2 = {
165 raid6_sse12_gen_syndrome,
166 raid6_have_sse1_or_mmxext,
167 "sse1x2",
168 1 /* Has cache hints */
169};
170
171#endif
diff --git a/drivers/md/raid6sse2.c b/drivers/md/raid6sse2.c
new file mode 100644
index 000000000000..b3aa7fe0877e
--- /dev/null
+++ b/drivers/md/raid6sse2.c
@@ -0,0 +1,270 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright 2002 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
8 * Boston, MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * raid6sse2.c
15 *
16 * SSE-2 implementation of RAID-6 syndrome functions
17 *
18 */
19
20#if defined(__i386__) || defined(__x86_64__)
21
22#include "raid6.h"
23#include "raid6x86.h"
24
25static const struct raid6_sse_constants {
26 u64 x1d[2];
27} raid6_sse_constants __attribute__((aligned(16))) = {
28 { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
29};
30
31static int raid6_have_sse2(void)
32{
33#ifdef __KERNEL__
34 /* Not really boot_cpu but "all_cpus" */
35 return boot_cpu_has(X86_FEATURE_MMX) &&
36 boot_cpu_has(X86_FEATURE_FXSR) &&
37 boot_cpu_has(X86_FEATURE_XMM) &&
38 boot_cpu_has(X86_FEATURE_XMM2);
39#else
40 /* User space test code */
41 u32 features = cpuid_features();
42 return ( (features & (15<<23)) == (15<<23) );
43#endif
44}
45
46/*
47 * Plain SSE2 implementation
48 */
49static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
50{
51 u8 **dptr = (u8 **)ptrs;
52 u8 *p, *q;
53 int d, z, z0;
54 raid6_sse_save_t sa;
55
56 z0 = disks - 3; /* Highest data disk */
57 p = dptr[z0+1]; /* XOR parity */
58 q = dptr[z0+2]; /* RS syndrome */
59
60 raid6_before_sse2(&sa);
61
62 asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
63 asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
64
65 for ( d = 0 ; d < bytes ; d += 16 ) {
66 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
67 asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
68 asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
69 asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
70 asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
71 for ( z = z0-2 ; z >= 0 ; z-- ) {
72 asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
73 asm volatile("pcmpgtb %xmm4,%xmm5");
74 asm volatile("paddb %xmm4,%xmm4");
75 asm volatile("pand %xmm0,%xmm5");
76 asm volatile("pxor %xmm5,%xmm4");
77 asm volatile("pxor %xmm5,%xmm5");
78 asm volatile("pxor %xmm6,%xmm2");
79 asm volatile("pxor %xmm6,%xmm4");
80 asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
81 }
82 asm volatile("pcmpgtb %xmm4,%xmm5");
83 asm volatile("paddb %xmm4,%xmm4");
84 asm volatile("pand %xmm0,%xmm5");
85 asm volatile("pxor %xmm5,%xmm4");
86 asm volatile("pxor %xmm5,%xmm5");
87 asm volatile("pxor %xmm6,%xmm2");
88 asm volatile("pxor %xmm6,%xmm4");
89
90 asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
91 asm volatile("pxor %xmm2,%xmm2");
92 asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
93 asm volatile("pxor %xmm4,%xmm4");
94 }
95
96 raid6_after_sse2(&sa);
97 asm volatile("sfence" : : : "memory");
98}
99
100const struct raid6_calls raid6_sse2x1 = {
101 raid6_sse21_gen_syndrome,
102 raid6_have_sse2,
103 "sse2x1",
104 1 /* Has cache hints */
105};
106
107/*
108 * Unrolled-by-2 SSE2 implementation
109 */
110static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
111{
112 u8 **dptr = (u8 **)ptrs;
113 u8 *p, *q;
114 int d, z, z0;
115 raid6_sse_save_t sa;
116
117 z0 = disks - 3; /* Highest data disk */
118 p = dptr[z0+1]; /* XOR parity */
119 q = dptr[z0+2]; /* RS syndrome */
120
121 raid6_before_sse2(&sa);
122
123 asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
124 asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
125 asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */
126
127 /* We uniformly assume a single prefetch covers at least 32 bytes */
128 for ( d = 0 ; d < bytes ; d += 32 ) {
129 asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
130 asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
131 asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
132 asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
133 asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
134 for ( z = z0-1 ; z >= 0 ; z-- ) {
135 asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
136 asm volatile("pcmpgtb %xmm4,%xmm5");
137 asm volatile("pcmpgtb %xmm6,%xmm7");
138 asm volatile("paddb %xmm4,%xmm4");
139 asm volatile("paddb %xmm6,%xmm6");
140 asm volatile("pand %xmm0,%xmm5");
141 asm volatile("pand %xmm0,%xmm7");
142 asm volatile("pxor %xmm5,%xmm4");
143 asm volatile("pxor %xmm7,%xmm6");
144 asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
145 asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
146 asm volatile("pxor %xmm5,%xmm2");
147 asm volatile("pxor %xmm7,%xmm3");
148 asm volatile("pxor %xmm5,%xmm4");
149 asm volatile("pxor %xmm7,%xmm6");
150 asm volatile("pxor %xmm5,%xmm5");
151 asm volatile("pxor %xmm7,%xmm7");
152 }
153 asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
154 asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
155 asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
156 asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
157 }
158
159 raid6_after_sse2(&sa);
160 asm volatile("sfence" : : : "memory");
161}
162
163const struct raid6_calls raid6_sse2x2 = {
164 raid6_sse22_gen_syndrome,
165 raid6_have_sse2,
166 "sse2x2",
167 1 /* Has cache hints */
168};
169
170#endif
171
172#ifdef __x86_64__
173
174/*
175 * Unrolled-by-4 SSE2 implementation
176 */
177static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
178{
179 u8 **dptr = (u8 **)ptrs;
180 u8 *p, *q;
181 int d, z, z0;
182 raid6_sse16_save_t sa;
183
184 z0 = disks - 3; /* Highest data disk */
185 p = dptr[z0+1]; /* XOR parity */
186 q = dptr[z0+2]; /* RS syndrome */
187
188 raid6_before_sse16(&sa);
189
190 asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
191 asm volatile("pxor %xmm2,%xmm2"); /* P[0] */
192 asm volatile("pxor %xmm3,%xmm3"); /* P[1] */
193 asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */
194 asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
195 asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */
196 asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */
197 asm volatile("pxor %xmm10,%xmm10"); /* P[2] */
198 asm volatile("pxor %xmm11,%xmm11"); /* P[3] */
199 asm volatile("pxor %xmm12,%xmm12"); /* Q[2] */
200 asm volatile("pxor %xmm13,%xmm13"); /* Zero temp */
201 asm volatile("pxor %xmm14,%xmm14"); /* Q[3] */
202 asm volatile("pxor %xmm15,%xmm15"); /* Zero temp */
203
204 for ( d = 0 ; d < bytes ; d += 64 ) {
205 for ( z = z0 ; z >= 0 ; z-- ) {
206 /* The second prefetch seems to improve performance... */
207 asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
208 asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
209 asm volatile("pcmpgtb %xmm4,%xmm5");
210 asm volatile("pcmpgtb %xmm6,%xmm7");
211 asm volatile("pcmpgtb %xmm12,%xmm13");
212 asm volatile("pcmpgtb %xmm14,%xmm15");
213 asm volatile("paddb %xmm4,%xmm4");
214 asm volatile("paddb %xmm6,%xmm6");
215 asm volatile("paddb %xmm12,%xmm12");
216 asm volatile("paddb %xmm14,%xmm14");
217 asm volatile("pand %xmm0,%xmm5");
218 asm volatile("pand %xmm0,%xmm7");
219 asm volatile("pand %xmm0,%xmm13");
220 asm volatile("pand %xmm0,%xmm15");
221 asm volatile("pxor %xmm5,%xmm4");
222 asm volatile("pxor %xmm7,%xmm6");
223 asm volatile("pxor %xmm13,%xmm12");
224 asm volatile("pxor %xmm15,%xmm14");
225 asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
226 asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
227 asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
228 asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
229 asm volatile("pxor %xmm5,%xmm2");
230 asm volatile("pxor %xmm7,%xmm3");
231 asm volatile("pxor %xmm13,%xmm10");
232 asm volatile("pxor %xmm15,%xmm11");
233 asm volatile("pxor %xmm5,%xmm4");
234 asm volatile("pxor %xmm7,%xmm6");
235 asm volatile("pxor %xmm13,%xmm12");
236 asm volatile("pxor %xmm15,%xmm14");
237 asm volatile("pxor %xmm5,%xmm5");
238 asm volatile("pxor %xmm7,%xmm7");
239 asm volatile("pxor %xmm13,%xmm13");
240 asm volatile("pxor %xmm15,%xmm15");
241 }
242 asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
243 asm volatile("pxor %xmm2,%xmm2");
244 asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
245 asm volatile("pxor %xmm3,%xmm3");
246 asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
247 asm volatile("pxor %xmm10,%xmm10");
248 asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
249 asm volatile("pxor %xmm11,%xmm11");
250 asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
251 asm volatile("pxor %xmm4,%xmm4");
252 asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
253 asm volatile("pxor %xmm6,%xmm6");
254 asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
255 asm volatile("pxor %xmm12,%xmm12");
256 asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
257 asm volatile("pxor %xmm14,%xmm14");
258 }
259 asm volatile("sfence" : : : "memory");
260 raid6_after_sse16(&sa);
261}
262
263const struct raid6_calls raid6_sse2x4 = {
264 raid6_sse24_gen_syndrome,
265 raid6_have_sse2,
266 "sse2x4",
267 1 /* Has cache hints */
268};
269
270#endif
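
All of the gen_syndrome routines in this file take a single pointer array laid out as the data disks first, then the P block at index disks-2 and the Q block at index disks-1 (hence z0 = disks - 3 is the highest data disk). Below is a hedged user-space usage sketch, assuming the raid6test build environment (raid6.h on the include path declaring struct raid6_calls, the raid6 objects linked in); the buffer count and fill pattern are made up:

#include <stdlib.h>
#include <string.h>
#include "raid6.h"		/* struct raid6_calls, raid6_sse2x1 */

#define NDATA	4
#define NBLKS	(NDATA + 2)	/* data disks + P + Q */
#define BLKSZ	4096

int main(void)
{
	void *ptrs[NBLKS];
	int i;

	for (i = 0; i < NBLKS; i++) {
		if (posix_memalign(&ptrs[i], 16, BLKSZ))	/* movdqa/movntdq need 16-byte alignment */
			return 1;
		memset(ptrs[i], i < NDATA ? i + 1 : 0, BLKSZ);	/* stand-in data, zeroed P/Q */
	}

	if (!raid6_sse2x1.valid || raid6_sse2x1.valid())
		raid6_sse2x1.gen_syndrome(NBLKS, BLKSZ, ptrs);	/* writes ptrs[NDATA] = P, ptrs[NDATA+1] = Q */

	return 0;
}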
diff --git a/drivers/md/raid6test/Makefile b/drivers/md/raid6test/Makefile
new file mode 100644
index 000000000000..557806728609
--- /dev/null
+++ b/drivers/md/raid6test/Makefile
@@ -0,0 +1,58 @@
1#
2# This is a simple Makefile to test some of the RAID-6 code
3# from userspace.
4#
5
6CC = gcc
7OPTFLAGS = -O2 # Adjust as desired
8CFLAGS = -I.. -g $(OPTFLAGS)
9LD = ld
10PERL = perl
11
12.c.o:
13 $(CC) $(CFLAGS) -c -o $@ $<
14
15%.c: ../%.c
16 cp -f $< $@
17
18%.uc: ../%.uc
19 cp -f $< $@
20
21all: raid6.o raid6test
22
23raid6.o: raid6int1.o raid6int2.o raid6int4.o raid6int8.o raid6int16.o \
24 raid6int32.o \
25 raid6mmx.o raid6sse1.o raid6sse2.o \
26 raid6recov.o raid6algos.o \
27 raid6tables.o
28 $(LD) -r -o $@ $^
29
30raid6test: raid6.o test.c
31 $(CC) $(CFLAGS) -o raid6test $^
32
33raid6int1.c: raid6int.uc ../unroll.pl
34 $(PERL) ../unroll.pl 1 < raid6int.uc > $@
35
36raid6int2.c: raid6int.uc ../unroll.pl
37 $(PERL) ../unroll.pl 2 < raid6int.uc > $@
38
39raid6int4.c: raid6int.uc ../unroll.pl
40 $(PERL) ../unroll.pl 4 < raid6int.uc > $@
41
42raid6int8.c: raid6int.uc ../unroll.pl
43 $(PERL) ../unroll.pl 8 < raid6int.uc > $@
44
45raid6int16.c: raid6int.uc ../unroll.pl
46 $(PERL) ../unroll.pl 16 < raid6int.uc > $@
47
48raid6int32.c: raid6int.uc ../unroll.pl
49 $(PERL) ../unroll.pl 32 < raid6int.uc > $@
50
51raid6tables.c: mktables
52 ./mktables > raid6tables.c
53
54clean:
55 rm -f *.o mktables mktables.c raid6int.uc raid6*.c raid6test
56
57spotless: clean
58 rm -f *~
diff --git a/drivers/md/raid6test/test.c b/drivers/md/raid6test/test.c
new file mode 100644
index 000000000000..0d5cd57accd7
--- /dev/null
+++ b/drivers/md/raid6test/test.c
@@ -0,0 +1,103 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright 2002 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 59 Temple Place Ste 330,
8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * raid6test.c
15 *
16 * Test RAID-6 recovery with various algorithms
17 */
18
19#include <stdlib.h>
20#include <stdio.h>
21#include <string.h>
22#include "raid6.h"
23
24#define NDISKS 16 /* Including P and Q */
25
26const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
27struct raid6_calls raid6_call;
28
29char *dataptrs[NDISKS];
30char data[NDISKS][PAGE_SIZE];
31char recovi[PAGE_SIZE], recovj[PAGE_SIZE];
32
33void makedata(void)
34{
35 int i, j;
36
37 for ( i = 0 ; i < NDISKS ; i++ ) {
38 for ( j = 0 ; j < PAGE_SIZE ; j++ ) {
39 data[i][j] = rand();
40 }
41 dataptrs[i] = data[i];
42 }
43}
44
45int main(int argc, char *argv[])
46{
47 const struct raid6_calls * const * algo;
48 int i, j;
49 int erra, errb;
50
51 makedata();
52
53 for ( algo = raid6_algos ; *algo ; algo++ ) {
54 if ( !(*algo)->valid || (*algo)->valid() ) {
55 raid6_call = **algo;
56
57			/* Nuke syndromes: P and Q are the last two (contiguous) pages of data[][] */
58 memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE);
59
60 /* Generate assumed good syndrome */
61 raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, (void **)&dataptrs);
62
63 for ( i = 0 ; i < NDISKS-1 ; i++ ) {
64 for ( j = i+1 ; j < NDISKS ; j++ ) {
65 memset(recovi, 0xf0, PAGE_SIZE);
66 memset(recovj, 0xba, PAGE_SIZE);
67
68 dataptrs[i] = recovi;
69 dataptrs[j] = recovj;
70
71 raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs);
72
73 erra = memcmp(data[i], recovi, PAGE_SIZE);
74 errb = memcmp(data[j], recovj, PAGE_SIZE);
75
76 if ( i < NDISKS-2 && j == NDISKS-1 ) {
77 /* We don't implement the DQ failure scenario, since it's
78 equivalent to a RAID-5 failure (XOR, then recompute Q) */
79 } else {
80 printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n",
81 raid6_call.name,
82 i, (i==NDISKS-2)?'P':'D',
83 j, (j==NDISKS-1)?'Q':(j==NDISKS-2)?'P':'D',
84 (!erra && !errb) ? "OK" :
85 !erra ? "ERRB" :
86 !errb ? "ERRA" :
87 "ERRAB");
88 }
89
90 dataptrs[i] = data[i];
91 dataptrs[j] = data[j];
92 }
93 }
94 }
95 printf("\n");
96 }
97
98 printf("\n");
99	/* Benchmark and pick the best algorithm */
100 raid6_select_algo();
101
102 return 0;
103}
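
The D+Q case skipped above reduces, as the comment says, to a RAID-5 rebuild followed by recomputing Q. A scalar sketch of that path for one byte lane -- illustrative only; the real two-failure recovery lives in raid6recov.c, and gf_mul2() here is just the usual GF(2^8) doubling helper:

#include <stdint.h>

static inline uint8_t gf_mul2(uint8_t v)
{
	return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

/* Rebuild data disk 'faila' from P by plain XOR, then regenerate Q over
 * the now-complete data, highest disk first (Horner evaluation). */
static void recov_d_and_q(int ndata, uint8_t *d, int faila,
			  uint8_t p, uint8_t *q)
{
	uint8_t x = p;
	int z;

	for (z = 0; z < ndata; z++)
		if (z != faila)
			x ^= d[z];
	d[faila] = x;

	*q = 0;
	for (z = ndata - 1; z >= 0; z--)
		*q = gf_mul2(*q) ^ d[z];
}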
diff --git a/drivers/md/raid6x86.h b/drivers/md/raid6x86.h
new file mode 100644
index 000000000000..4cf20534fe44
--- /dev/null
+++ b/drivers/md/raid6x86.h
@@ -0,0 +1,245 @@
1/* ----------------------------------------------------------------------- *
2 *
3 * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, Inc., 59 Temple Place Ste 330,
8 * Boston MA 02111-1307, USA; either version 2 of the License, or
9 * (at your option) any later version; incorporated herein by reference.
10 *
11 * ----------------------------------------------------------------------- */
12
13/*
14 * raid6x86.h
15 *
16 * Definitions common to x86 and x86-64 RAID-6 code only
17 */
18
19#ifndef LINUX_RAID_RAID6X86_H
20#define LINUX_RAID_RAID6X86_H
21
22#if defined(__i386__) || defined(__x86_64__)
23
24#ifdef __x86_64__
25
26typedef struct {
27 unsigned int fsave[27];
28 unsigned long cr0;
29} raid6_mmx_save_t __attribute__((aligned(16)));
30
31/* N.B.: For SSE we only save %xmm0-%xmm7 even for x86-64, since
32 the code doesn't know about the additional x86-64 registers */
33typedef struct {
34 unsigned int sarea[8*4+2];
35 unsigned long cr0;
36} raid6_sse_save_t __attribute__((aligned(16)));
37
38/* This is for x86-64-specific code which uses all 16 XMM registers */
39typedef struct {
40 unsigned int sarea[16*4+2];
41 unsigned long cr0;
42} raid6_sse16_save_t __attribute__((aligned(16)));
43
44/* On x86-64 the stack *SHOULD* be 16-byte aligned, but currently this
45 is buggy in the kernel and it's only 8-byte aligned in places, so
46 we need to do this anyway. Sigh. */
47#define SAREA(x) ((unsigned int *)((((unsigned long)&(x)->sarea)+15) & ~15))
48
49#else /* __i386__ */
50
51typedef struct {
52 unsigned int fsave[27];
53 unsigned long cr0;
54} raid6_mmx_save_t;
55
56/* On i386, the stack is only 8-byte aligned, but SSE requires 16-byte
57 alignment. The +3 is so we have the slack space to manually align
58 a properly-sized area correctly. */
59typedef struct {
60 unsigned int sarea[8*4+3];
61 unsigned long cr0;
62} raid6_sse_save_t;
63
64/* Find the 16-byte aligned save area */
65#define SAREA(x) ((unsigned int *)((((unsigned long)&(x)->sarea)+15) & ~15))
66
67#endif
68
69#ifdef __KERNEL__ /* Real code */
70
71/* Note: %cr0 is 32 bits on i386 and 64 bits on x86-64 */
72
73static inline unsigned long raid6_get_fpu(void)
74{
75 unsigned long cr0;
76
77 preempt_disable();
78 asm volatile("mov %%cr0,%0 ; clts" : "=r" (cr0));
79 return cr0;
80}
81
82static inline void raid6_put_fpu(unsigned long cr0)
83{
84 asm volatile("mov %0,%%cr0" : : "r" (cr0));
85 preempt_enable();
86}
87
88#else /* Dummy code for user space testing */
89
90static inline unsigned long raid6_get_fpu(void)
91{
92 return 0xf00ba6;
93}
94
95static inline void raid6_put_fpu(unsigned long cr0)
96{
97 (void)cr0;
98}
99
100#endif
101
102static inline void raid6_before_mmx(raid6_mmx_save_t *s)
103{
104 s->cr0 = raid6_get_fpu();
105 asm volatile("fsave %0 ; fwait" : "=m" (s->fsave[0]));
106}
107
108static inline void raid6_after_mmx(raid6_mmx_save_t *s)
109{
110 asm volatile("frstor %0" : : "m" (s->fsave[0]));
111 raid6_put_fpu(s->cr0);
112}
113
114static inline void raid6_before_sse(raid6_sse_save_t *s)
115{
116 unsigned int *rsa = SAREA(s);
117
118 s->cr0 = raid6_get_fpu();
119
120 asm volatile("movaps %%xmm0,%0" : "=m" (rsa[0]));
121 asm volatile("movaps %%xmm1,%0" : "=m" (rsa[4]));
122 asm volatile("movaps %%xmm2,%0" : "=m" (rsa[8]));
123 asm volatile("movaps %%xmm3,%0" : "=m" (rsa[12]));
124 asm volatile("movaps %%xmm4,%0" : "=m" (rsa[16]));
125 asm volatile("movaps %%xmm5,%0" : "=m" (rsa[20]));
126 asm volatile("movaps %%xmm6,%0" : "=m" (rsa[24]));
127 asm volatile("movaps %%xmm7,%0" : "=m" (rsa[28]));
128}
129
130static inline void raid6_after_sse(raid6_sse_save_t *s)
131{
132 unsigned int *rsa = SAREA(s);
133
134 asm volatile("movaps %0,%%xmm0" : : "m" (rsa[0]));
135 asm volatile("movaps %0,%%xmm1" : : "m" (rsa[4]));
136 asm volatile("movaps %0,%%xmm2" : : "m" (rsa[8]));
137 asm volatile("movaps %0,%%xmm3" : : "m" (rsa[12]));
138 asm volatile("movaps %0,%%xmm4" : : "m" (rsa[16]));
139 asm volatile("movaps %0,%%xmm5" : : "m" (rsa[20]));
140 asm volatile("movaps %0,%%xmm6" : : "m" (rsa[24]));
141 asm volatile("movaps %0,%%xmm7" : : "m" (rsa[28]));
142
143 raid6_put_fpu(s->cr0);
144}
145
146static inline void raid6_before_sse2(raid6_sse_save_t *s)
147{
148 unsigned int *rsa = SAREA(s);
149
150 s->cr0 = raid6_get_fpu();
151
152 asm volatile("movdqa %%xmm0,%0" : "=m" (rsa[0]));
153 asm volatile("movdqa %%xmm1,%0" : "=m" (rsa[4]));
154 asm volatile("movdqa %%xmm2,%0" : "=m" (rsa[8]));
155 asm volatile("movdqa %%xmm3,%0" : "=m" (rsa[12]));
156 asm volatile("movdqa %%xmm4,%0" : "=m" (rsa[16]));
157 asm volatile("movdqa %%xmm5,%0" : "=m" (rsa[20]));
158 asm volatile("movdqa %%xmm6,%0" : "=m" (rsa[24]));
159 asm volatile("movdqa %%xmm7,%0" : "=m" (rsa[28]));
160}
161
162static inline void raid6_after_sse2(raid6_sse_save_t *s)
163{
164 unsigned int *rsa = SAREA(s);
165
166 asm volatile("movdqa %0,%%xmm0" : : "m" (rsa[0]));
167 asm volatile("movdqa %0,%%xmm1" : : "m" (rsa[4]));
168 asm volatile("movdqa %0,%%xmm2" : : "m" (rsa[8]));
169 asm volatile("movdqa %0,%%xmm3" : : "m" (rsa[12]));
170 asm volatile("movdqa %0,%%xmm4" : : "m" (rsa[16]));
171 asm volatile("movdqa %0,%%xmm5" : : "m" (rsa[20]));
172 asm volatile("movdqa %0,%%xmm6" : : "m" (rsa[24]));
173 asm volatile("movdqa %0,%%xmm7" : : "m" (rsa[28]));
174
175 raid6_put_fpu(s->cr0);
176}
177
178#ifdef __x86_64__
179
180static inline void raid6_before_sse16(raid6_sse16_save_t *s)
181{
182 unsigned int *rsa = SAREA(s);
183
184 s->cr0 = raid6_get_fpu();
185
186 asm volatile("movdqa %%xmm0,%0" : "=m" (rsa[0]));
187 asm volatile("movdqa %%xmm1,%0" : "=m" (rsa[4]));
188 asm volatile("movdqa %%xmm2,%0" : "=m" (rsa[8]));
189 asm volatile("movdqa %%xmm3,%0" : "=m" (rsa[12]));
190 asm volatile("movdqa %%xmm4,%0" : "=m" (rsa[16]));
191 asm volatile("movdqa %%xmm5,%0" : "=m" (rsa[20]));
192 asm volatile("movdqa %%xmm6,%0" : "=m" (rsa[24]));
193 asm volatile("movdqa %%xmm7,%0" : "=m" (rsa[28]));
194 asm volatile("movdqa %%xmm8,%0" : "=m" (rsa[32]));
195 asm volatile("movdqa %%xmm9,%0" : "=m" (rsa[36]));
196 asm volatile("movdqa %%xmm10,%0" : "=m" (rsa[40]));
197 asm volatile("movdqa %%xmm11,%0" : "=m" (rsa[44]));
198 asm volatile("movdqa %%xmm12,%0" : "=m" (rsa[48]));
199 asm volatile("movdqa %%xmm13,%0" : "=m" (rsa[52]));
200 asm volatile("movdqa %%xmm14,%0" : "=m" (rsa[56]));
201 asm volatile("movdqa %%xmm15,%0" : "=m" (rsa[60]));
202}
203
204static inline void raid6_after_sse16(raid6_sse16_save_t *s)
205{
206 unsigned int *rsa = SAREA(s);
207
208 asm volatile("movdqa %0,%%xmm0" : : "m" (rsa[0]));
209 asm volatile("movdqa %0,%%xmm1" : : "m" (rsa[4]));
210 asm volatile("movdqa %0,%%xmm2" : : "m" (rsa[8]));
211 asm volatile("movdqa %0,%%xmm3" : : "m" (rsa[12]));
212 asm volatile("movdqa %0,%%xmm4" : : "m" (rsa[16]));
213 asm volatile("movdqa %0,%%xmm5" : : "m" (rsa[20]));
214 asm volatile("movdqa %0,%%xmm6" : : "m" (rsa[24]));
215 asm volatile("movdqa %0,%%xmm7" : : "m" (rsa[28]));
216 asm volatile("movdqa %0,%%xmm8" : : "m" (rsa[32]));
217 asm volatile("movdqa %0,%%xmm9" : : "m" (rsa[36]));
218 asm volatile("movdqa %0,%%xmm10" : : "m" (rsa[40]));
219 asm volatile("movdqa %0,%%xmm11" : : "m" (rsa[44]));
220 asm volatile("movdqa %0,%%xmm12" : : "m" (rsa[48]));
221 asm volatile("movdqa %0,%%xmm13" : : "m" (rsa[52]));
222 asm volatile("movdqa %0,%%xmm14" : : "m" (rsa[56]));
223 asm volatile("movdqa %0,%%xmm15" : : "m" (rsa[60]));
224
225 raid6_put_fpu(s->cr0);
226}
227
228#endif /* __x86_64__ */
229
230/* User space test hack */
231#ifndef __KERNEL__
232static inline int cpuid_features(void)
233{
234 u32 eax = 1;
235 u32 ebx, ecx, edx;
236
237 asm volatile("cpuid" :
238 "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx));
239
240 return edx;
241}
242#endif /* ndef __KERNEL__ */
243
244#endif
245#endif
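
The SAREA() macro above is the usual round-up-to-16 idiom: add 15, then clear the low four bits, so a save area that is only 8-byte aligned on the stack still lands on a boundary movaps/movdqa can use. A tiny stand-alone illustration with a made-up address:

#include <stdio.h>

int main(void)
{
	unsigned long addr = 0x1008;			/* only 8-byte aligned */
	unsigned long up = (addr + 15) & ~15UL;		/* what SAREA() computes */

	printf("%#lx rounds up to %#lx\n", addr, up);	/* 0x1008 -> 0x1010 */
	return 0;
}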
diff --git a/drivers/md/unroll.pl b/drivers/md/unroll.pl
new file mode 100644
index 000000000000..3acc710a20ea
--- /dev/null
+++ b/drivers/md/unroll.pl
@@ -0,0 +1,24 @@
1#!/usr/bin/perl
2#
3# Take a piece of C code and, for each line which contains the sequence $$,
4# repeat that line n times with $$ replaced by 0...n-1; the sequence $# is
5# replaced by the unrolling factor, and $* by a single $
6#
7
8($n) = @ARGV;
9$n += 0;
10
11while ( defined($line = <STDIN>) ) {
12 if ( $line =~ /\$\$/ ) {
13 $rep = $n;
14 } else {
15 $rep = 1;
16 }
17 for ( $i = 0 ; $i < $rep ; $i++ ) {
18 $tmp = $line;
19 $tmp =~ s/\$\$/$i/g;
20 $tmp =~ s/\$\#/$n/g;
21 $tmp =~ s/\$\*/\$/g;
22 print $tmp;
23 }
24}
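
As a made-up illustration of those substitution rules (the input line below is not from raid6int.uc): each line containing $$ is emitted once per index, $# becomes the unrolling factor, and $* collapses to a plain $. Run through "perl unroll.pl 2", the single C line

	int q$$ = $$;	/* lane $$ of $# */

expands to

	int q0 = 0;	/* lane 0 of 2 */
	int q1 = 1;	/* lane 1 of 2 */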
diff --git a/drivers/md/xor.c b/drivers/md/xor.c
new file mode 100644
index 000000000000..324897c4be4e
--- /dev/null
+++ b/drivers/md/xor.c
@@ -0,0 +1,154 @@
1/*
2 * xor.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 1996, 1997, 1998, 1999, 2000,
5 * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
6 *
7 * Dispatch optimized RAID-5 checksumming functions.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2, or (at your option)
12 * any later version.
13 *
14 * You should have received a copy of the GNU General Public License
15 * (for example /usr/src/linux/COPYING); if not, write to the Free
16 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
17 */
18
19#define BH_TRACE 0
20#include <linux/module.h>
21#include <linux/raid/md.h>
22#include <linux/raid/xor.h>
23#include <asm/xor.h>
24
25/* The xor routines to use. */
26static struct xor_block_template *active_template;
27
28void
29xor_block(unsigned int count, unsigned int bytes, void **ptr)
30{
31 unsigned long *p0, *p1, *p2, *p3, *p4;
32
33 p0 = (unsigned long *) ptr[0];
34 p1 = (unsigned long *) ptr[1];
35 if (count == 2) {
36 active_template->do_2(bytes, p0, p1);
37 return;
38 }
39
40 p2 = (unsigned long *) ptr[2];
41 if (count == 3) {
42 active_template->do_3(bytes, p0, p1, p2);
43 return;
44 }
45
46 p3 = (unsigned long *) ptr[3];
47 if (count == 4) {
48 active_template->do_4(bytes, p0, p1, p2, p3);
49 return;
50 }
51
52 p4 = (unsigned long *) ptr[4];
53 active_template->do_5(bytes, p0, p1, p2, p3, p4);
54}
55
56/* Set of all registered templates. */
57static struct xor_block_template *template_list;
58
59#define BENCH_SIZE (PAGE_SIZE)
60
61static void
62do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
63{
64 int speed;
65 unsigned long now;
66 int i, count, max;
67
68 tmpl->next = template_list;
69 template_list = tmpl;
70
71 /*
72 * Count the number of XORs done during a whole jiffy, and use
73	 * this to calculate the speed of checksumming. Both buffers come from one
74	 * order-2 (four page) allocation, so their relative L1-cache color is fixed.
75 */
76 max = 0;
77 for (i = 0; i < 5; i++) {
78 now = jiffies;
79 count = 0;
80 while (jiffies == now) {
81 mb();
82 tmpl->do_2(BENCH_SIZE, b1, b2);
83 mb();
84 count++;
85 mb();
86 }
87 if (count > max)
88 max = count;
89 }
90
91 speed = max * (HZ * BENCH_SIZE / 1024);
92 tmpl->speed = speed;
93
94 printk(" %-10s: %5d.%03d MB/sec\n", tmpl->name,
95 speed / 1000, speed % 1000);
96}
97
98static int
99calibrate_xor_block(void)
100{
101 void *b1, *b2;
102 struct xor_block_template *f, *fastest;
103
104 b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
105 if (! b1) {
106 printk("raid5: Yikes! No memory available.\n");
107 return -ENOMEM;
108 }
109 b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
110
111 /*
112	 * If this arch/cpu has a short-circuited selection, don't loop through all
113	 * the possible functions; just test the best one.
114 */
115
116 fastest = NULL;
117
118#ifdef XOR_SELECT_TEMPLATE
119 fastest = XOR_SELECT_TEMPLATE(fastest);
120#endif
121
122#define xor_speed(templ) do_xor_speed((templ), b1, b2)
123
124 if (fastest) {
125 printk(KERN_INFO "raid5: automatically using best checksumming function: %s\n",
126 fastest->name);
127 xor_speed(fastest);
128 } else {
129 printk(KERN_INFO "raid5: measuring checksumming speed\n");
130 XOR_TRY_TEMPLATES;
131 fastest = template_list;
132 for (f = fastest; f; f = f->next)
133 if (f->speed > fastest->speed)
134 fastest = f;
135 }
136
137 printk("raid5: using function: %s (%d.%03d MB/sec)\n",
138 fastest->name, fastest->speed / 1000, fastest->speed % 1000);
139
140#undef xor_speed
141
142 free_pages((unsigned long)b1, 2);
143
144 active_template = fastest;
145 return 0;
146}
147
148static __exit void xor_exit(void) { }
149
150EXPORT_SYMBOL(xor_block);
151MODULE_LICENSE("GPL");
152
153module_init(calibrate_xor_block);
154module_exit(xor_exit);
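
To make the units in do_xor_speed() concrete: each do_2() call XORs BENCH_SIZE bytes, max is the number of calls that fit in one jiffy, so max * (HZ * BENCH_SIZE / 1024) is roughly KB/s, and the printk then divides by 1000 and labels the result "MB/sec" (a 1000-vs-1024 approximation). A worked example with assumed numbers (HZ = 1000, PAGE_SIZE = 4096, max = 512 calls per jiffy):

#include <stdio.h>

int main(void)
{
	const int HZ = 1000, BENCH_SIZE = 4096;		/* assumed values */
	int max = 512;					/* do_2() calls completed in one jiffy */
	int speed = max * (HZ * BENCH_SIZE / 1024);	/* 512 * 4000 = 2048000 */

	printf("%5d.%03d MB/sec\n", speed / 1000, speed % 1000);	/* prints 2048.000 MB/sec */
	return 0;
}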