aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2011-01-13 15:00:02 -0500
committerAlasdair G Kergon <agk@redhat.com>2011-01-13 15:00:02 -0500
commit9d09e663d5502c46f2d9481c04c1087e1c2da698 (patch)
tree993f10eb7100a6ce8c00c0cff7951d7ffea9488e /drivers/md
parent99d03c141b40914b67d63c9d23b8da4386422ed7 (diff)
dm: raid456 basic support
This patch is the skeleton for the DM target that will be the bridge from DM to MD (initially RAID456 and later RAID1). It provides a way to use device-mapper interfaces to the MD RAID456 drivers. As with all device-mapper targets, the nominal public interfaces are the constructor (CTR) tables and the status outputs (both STATUSTYPE_INFO and STATUSTYPE_TABLE). The CTR table looks like the following: 1: <s> <l> raid \ 2: <raid_type> <#raid_params> <raid_params> \ 3: <#raid_devs> <meta_dev1> <dev1> .. <meta_devN> <devN> Line 1 contains the standard first three arguments to any device-mapper target - the start, length, and target type fields. The target type in this case is "raid". Line 2 contains the arguments that define the particular raid type/personality/level, the required arguments for that raid type, and any optional arguments. Possible raid types include: raid4, raid5_la, raid5_ls, raid5_rs, raid6_zr, raid6_nr, and raid6_nc. (again, raid1 is planned for the future.) The list of required and optional parameters is the same for all the current raid types. The required parameters are positional, while the optional parameters are given as key/value pairs. The possible parameters are as follows: <chunk_size> Chunk size in sectors. [[no]sync] Force/Prevent RAID initialization [rebuild <idx>] Rebuild the drive indicated by the index [daemon_sleep <ms>] Time between bitmap daemon work to clear bits [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization [max_write_behind <value>] See '-write-behind=' (man mdadm) [stripe_cache <sectors>] Stripe cache size for higher RAIDs Line 3 contains the list of devices that compose the array in metadata/data device pairs. If the metadata is stored separately, a '-' is given for the metadata device position. If a drive has failed or is missing at creation time, a '-' can be given for both the metadata and data drives for a given position. Examples: # RAID4 - 4 data drives, 1 parity # No metadata devices specified to hold superblock/bitmap info # Chunk size of 1MiB # (Lines separated for easy reading) 0 1960893648 raid \ raid4 1 2048 \ 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 # RAID4 - 4 data drives, 1 parity (no metadata devices) # Chunk size of 1MiB, force RAID initialization, # min recovery rate at 20 kiB/sec/disk 0 1960893648 raid \ raid4 4 2048 min_recovery_rate 20 sync\ 5 - 8:17 - 8:33 - 8:49 - 8:65 - 8:81 Performing a 'dmsetup table' should display the CTR table used to construct the mapping (with possible reordering of optional parameters). Performing a 'dmsetup status' will yield information on the state and health of the array. The output is as follows: 1: <s> <l> raid \ 2: <raid_type> <#devices> <1 health char for each dev> <resync_ratio> Line 1 is standard DM output. Line 2 is best shown by example: 0 1960893648 raid raid4 5 AAAAA 2/490221568 Here we can see the RAID type is raid4, there are 5 devices - all of which are 'A'live, and the array is 2/490221568 complete with recovery. Cc: linux-raid@vger.kernel.org Signed-off-by: NeilBrown <neilb@suse.de> Signed-off-by: Jonathan Brassow <jbrassow@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com> Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig24
-rw-r--r--drivers/md/Makefile1
-rw-r--r--drivers/md/dm-raid.c697
3 files changed, 722 insertions, 0 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf1a95e3155..98d9ec85e0e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -240,6 +240,30 @@ config DM_MIRROR
240 Allow volume managers to mirror logical volumes, also 240 Allow volume managers to mirror logical volumes, also
241 needed for live data migration tools such as 'pvmove'. 241 needed for live data migration tools such as 'pvmove'.
242 242
243config DM_RAID
244 tristate "RAID 4/5/6 target (EXPERIMENTAL)"
245 depends on BLK_DEV_DM && EXPERIMENTAL
246 select MD_RAID456
247 select BLK_DEV_MD
248 ---help---
249 A dm target that supports RAID4, RAID5 and RAID6 mappings
250
251 A RAID-5 set of N drives with a capacity of C MB per drive provides
252 the capacity of C * (N - 1) MB, and protects against a failure
253 of a single drive. For a given sector (row) number, (N - 1) drives
254 contain data sectors, and one drive contains the parity protection.
255 For a RAID-4 set, the parity blocks are present on a single drive,
256 while a RAID-5 set distributes the parity across the drives in one
257 of the available parity distribution methods.
258
259 A RAID-6 set of N drives with a capacity of C MB per drive
260 provides the capacity of C * (N - 2) MB, and protects
261 against a failure of any two drives. For a given sector
262 (row) number, (N - 2) drives contain data sectors, and two
263 drives contains two independent redundancy syndromes. Like
264 RAID-5, RAID-6 distributes the syndromes across the drives
265 in one of the available parity distribution methods.
266
243config DM_LOG_USERSPACE 267config DM_LOG_USERSPACE
244 tristate "Mirror userspace logging (EXPERIMENTAL)" 268 tristate "Mirror userspace logging (EXPERIMENTAL)"
245 depends on DM_MIRROR && EXPERIMENTAL && NET 269 depends on DM_MIRROR && EXPERIMENTAL && NET
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5e3aac41919..d0138606c2e 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
36obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o 36obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
37obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o 37obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
38obj-$(CONFIG_DM_ZERO) += dm-zero.o 38obj-$(CONFIG_DM_ZERO) += dm-zero.o
39obj-$(CONFIG_DM_RAID) += dm-raid.o
39 40
40ifeq ($(CONFIG_DM_UEVENT),y) 41ifeq ($(CONFIG_DM_UEVENT),y)
41dm-mod-objs += dm-uevent.o 42dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
new file mode 100644
index 00000000000..b9e1e15ef11
--- /dev/null
+++ b/drivers/md/dm-raid.c
@@ -0,0 +1,697 @@
1/*
2 * Copyright (C) 2010-2011 Neil Brown
3 * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
4 *
5 * This file is released under the GPL.
6 */
7
8#include <linux/slab.h>
9
10#include "md.h"
11#include "raid5.h"
12#include "dm.h"
13#include "bitmap.h"
14
15#define DM_MSG_PREFIX "raid"
16
17/*
18 * If the MD doesn't support MD_SYNC_STATE_FORCED yet, then
19 * make it so the flag doesn't set anything.
20 */
21#ifndef MD_SYNC_STATE_FORCED
22#define MD_SYNC_STATE_FORCED 0
23#endif
24
25struct raid_dev {
26 /*
27 * Two DM devices, one to hold metadata and one to hold the
28 * actual data/parity. The reason for this is to not confuse
29 * ti->len and give more flexibility in altering size and
30 * characteristics.
31 *
32 * While it is possible for this device to be associated
33 * with a different physical device than the data_dev, it
34 * is intended for it to be the same.
35 * |--------- Physical Device ---------|
36 * |- meta_dev -|------ data_dev ------|
37 */
38 struct dm_dev *meta_dev;
39 struct dm_dev *data_dev;
40 struct mdk_rdev_s rdev;
41};
42
43/*
44 * Flags for rs->print_flags field.
45 */
46#define DMPF_DAEMON_SLEEP 0x1
47#define DMPF_MAX_WRITE_BEHIND 0x2
48#define DMPF_SYNC 0x4
49#define DMPF_NOSYNC 0x8
50#define DMPF_STRIPE_CACHE 0x10
51#define DMPF_MIN_RECOVERY_RATE 0x20
52#define DMPF_MAX_RECOVERY_RATE 0x40
53
54struct raid_set {
55 struct dm_target *ti;
56
57 uint64_t print_flags;
58
59 struct mddev_s md;
60 struct raid_type *raid_type;
61 struct dm_target_callbacks callbacks;
62
63 struct raid_dev dev[0];
64};
65
66/* Supported raid types and properties. */
67static struct raid_type {
68 const char *name; /* RAID algorithm. */
69 const char *descr; /* Descriptor text for logging. */
70 const unsigned parity_devs; /* # of parity devices. */
71 const unsigned minimal_devs; /* minimal # of devices in set. */
72 const unsigned level; /* RAID level. */
73 const unsigned algorithm; /* RAID algorithm. */
74} raid_types[] = {
75 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
76 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
77 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
78 {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
79 {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
80 {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
81 {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
82 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
83};
84
85static struct raid_type *get_raid_type(char *name)
86{
87 int i;
88
89 for (i = 0; i < ARRAY_SIZE(raid_types); i++)
90 if (!strcmp(raid_types[i].name, name))
91 return &raid_types[i];
92
93 return NULL;
94}
95
96static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
97{
98 unsigned i;
99 struct raid_set *rs;
100 sector_t sectors_per_dev;
101
102 if (raid_devs <= raid_type->parity_devs) {
103 ti->error = "Insufficient number of devices";
104 return ERR_PTR(-EINVAL);
105 }
106
107 sectors_per_dev = ti->len;
108 if (sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
109 ti->error = "Target length not divisible by number of data devices";
110 return ERR_PTR(-EINVAL);
111 }
112
113 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
114 if (!rs) {
115 ti->error = "Cannot allocate raid context";
116 return ERR_PTR(-ENOMEM);
117 }
118
119 mddev_init(&rs->md);
120
121 rs->ti = ti;
122 rs->raid_type = raid_type;
123 rs->md.raid_disks = raid_devs;
124 rs->md.level = raid_type->level;
125 rs->md.new_level = rs->md.level;
126 rs->md.dev_sectors = sectors_per_dev;
127 rs->md.layout = raid_type->algorithm;
128 rs->md.new_layout = rs->md.layout;
129 rs->md.delta_disks = 0;
130 rs->md.recovery_cp = 0;
131
132 for (i = 0; i < raid_devs; i++)
133 md_rdev_init(&rs->dev[i].rdev);
134
135 /*
136 * Remaining items to be initialized by further RAID params:
137 * rs->md.persistent
138 * rs->md.external
139 * rs->md.chunk_sectors
140 * rs->md.new_chunk_sectors
141 */
142
143 return rs;
144}
145
146static void context_free(struct raid_set *rs)
147{
148 int i;
149
150 for (i = 0; i < rs->md.raid_disks; i++)
151 if (rs->dev[i].data_dev)
152 dm_put_device(rs->ti, rs->dev[i].data_dev);
153
154 kfree(rs);
155}
156
157/*
158 * For every device we have two words
159 * <meta_dev>: meta device name or '-' if missing
160 * <data_dev>: data device name or '-' if missing
161 *
162 * This code parses those words.
163 */
164static int dev_parms(struct raid_set *rs, char **argv)
165{
166 int i;
167 int rebuild = 0;
168 int metadata_available = 0;
169 int ret = 0;
170
171 for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
172 rs->dev[i].rdev.raid_disk = i;
173
174 rs->dev[i].meta_dev = NULL;
175 rs->dev[i].data_dev = NULL;
176
177 /*
178 * There are no offsets, since there is a separate device
179 * for data and metadata.
180 */
181 rs->dev[i].rdev.data_offset = 0;
182 rs->dev[i].rdev.mddev = &rs->md;
183
184 if (strcmp(argv[0], "-")) {
185 rs->ti->error = "Metadata devices not supported";
186 return -EINVAL;
187 }
188
189 if (!strcmp(argv[1], "-")) {
190 if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
191 (!rs->dev[i].rdev.recovery_offset)) {
192 rs->ti->error = "Drive designated for rebuild not specified";
193 return -EINVAL;
194 }
195
196 continue;
197 }
198
199 ret = dm_get_device(rs->ti, argv[1],
200 dm_table_get_mode(rs->ti->table),
201 &rs->dev[i].data_dev);
202 if (ret) {
203 rs->ti->error = "RAID device lookup failure";
204 return ret;
205 }
206
207 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
208 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
209 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
210 rebuild++;
211 }
212
213 if (metadata_available) {
214 rs->md.external = 0;
215 rs->md.persistent = 1;
216 rs->md.major_version = 2;
217 } else if (rebuild && !rs->md.recovery_cp) {
218 /*
219 * Without metadata, we will not be able to tell if the array
220 * is in-sync or not - we must assume it is not. Therefore,
221 * it is impossible to rebuild a drive.
222 *
223 * Even if there is metadata, the on-disk information may
224 * indicate that the array is not in-sync and it will then
225 * fail at that time.
226 *
227 * User could specify 'nosync' option if desperate.
228 */
229 DMERR("Unable to rebuild drive while array is not in-sync");
230 rs->ti->error = "RAID device lookup failure";
231 return -EINVAL;
232 }
233
234 return 0;
235}
236
237/*
238 * Possible arguments are...
239 * RAID456:
240 * <chunk_size> [optional_args]
241 *
242 * Optional args:
243 * [[no]sync] Force or prevent recovery of the entire array
244 * [rebuild <idx>] Rebuild the drive indicated by the index
245 * [daemon_sleep <ms>] Time between bitmap daemon work to clear bits
246 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
247 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
248 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
249 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
250 */
251static int parse_raid_params(struct raid_set *rs, char **argv,
252 unsigned num_raid_params)
253{
254 unsigned i, rebuild_cnt = 0;
255 unsigned long value;
256 char *key;
257
258 /*
259 * First, parse the in-order required arguments
260 */
261 if ((strict_strtoul(argv[0], 10, &value) < 0) ||
262 !is_power_of_2(value) || (value < 8)) {
263 rs->ti->error = "Bad chunk size";
264 return -EINVAL;
265 }
266
267 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
268 argv++;
269 num_raid_params--;
270
271 /*
272 * Second, parse the unordered optional arguments
273 */
274 for (i = 0; i < rs->md.raid_disks; i++)
275 set_bit(In_sync, &rs->dev[i].rdev.flags);
276
277 for (i = 0; i < num_raid_params; i++) {
278 if (!strcmp(argv[i], "nosync")) {
279 rs->md.recovery_cp = MaxSector;
280 rs->print_flags |= DMPF_NOSYNC;
281 rs->md.flags |= MD_SYNC_STATE_FORCED;
282 continue;
283 }
284 if (!strcmp(argv[i], "sync")) {
285 rs->md.recovery_cp = 0;
286 rs->print_flags |= DMPF_SYNC;
287 rs->md.flags |= MD_SYNC_STATE_FORCED;
288 continue;
289 }
290
291 /* The rest of the optional arguments come in key/value pairs */
292 if ((i + 1) >= num_raid_params) {
293 rs->ti->error = "Wrong number of raid parameters given";
294 return -EINVAL;
295 }
296
297 key = argv[i++];
298 if (strict_strtoul(argv[i], 10, &value) < 0) {
299 rs->ti->error = "Bad numerical argument given in raid params";
300 return -EINVAL;
301 }
302
303 if (!strcmp(key, "rebuild")) {
304 if (++rebuild_cnt > rs->raid_type->parity_devs) {
305 rs->ti->error = "Too many rebuild drives given";
306 return -EINVAL;
307 }
308 if (value > rs->md.raid_disks) {
309 rs->ti->error = "Invalid rebuild index given";
310 return -EINVAL;
311 }
312 clear_bit(In_sync, &rs->dev[value].rdev.flags);
313 rs->dev[value].rdev.recovery_offset = 0;
314 } else if (!strcmp(key, "max_write_behind")) {
315 rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
316
317 /*
318 * In device-mapper, we specify things in sectors, but
319 * MD records this value in kB
320 */
321 value /= 2;
322 if (value > COUNTER_MAX) {
323 rs->ti->error = "Max write-behind limit out of range";
324 return -EINVAL;
325 }
326 rs->md.bitmap_info.max_write_behind = value;
327 } else if (!strcmp(key, "daemon_sleep")) {
328 rs->print_flags |= DMPF_DAEMON_SLEEP;
329 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
330 rs->ti->error = "daemon sleep period out of range";
331 return -EINVAL;
332 }
333 rs->md.bitmap_info.daemon_sleep = value;
334 } else if (!strcmp(key, "stripe_cache")) {
335 rs->print_flags |= DMPF_STRIPE_CACHE;
336
337 /*
338 * In device-mapper, we specify things in sectors, but
339 * MD records this value in kB
340 */
341 value /= 2;
342
343 if (rs->raid_type->level < 5) {
344 rs->ti->error = "Inappropriate argument: stripe_cache";
345 return -EINVAL;
346 }
347 if (raid5_set_cache_size(&rs->md, (int)value)) {
348 rs->ti->error = "Bad stripe_cache size";
349 return -EINVAL;
350 }
351 } else if (!strcmp(key, "min_recovery_rate")) {
352 rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
353 if (value > INT_MAX) {
354 rs->ti->error = "min_recovery_rate out of range";
355 return -EINVAL;
356 }
357 rs->md.sync_speed_min = (int)value;
358 } else if (!strcmp(key, "max_recovery_rate")) {
359 rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
360 if (value > INT_MAX) {
361 rs->ti->error = "max_recovery_rate out of range";
362 return -EINVAL;
363 }
364 rs->md.sync_speed_max = (int)value;
365 } else {
366 DMERR("Unable to parse RAID parameter: %s", key);
367 rs->ti->error = "Unable to parse RAID parameters";
368 return -EINVAL;
369 }
370 }
371
372 /* Assume there are no metadata devices until the drives are parsed */
373 rs->md.persistent = 0;
374 rs->md.external = 1;
375
376 return 0;
377}
378
379static void do_table_event(struct work_struct *ws)
380{
381 struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
382
383 dm_table_event(rs->ti->table);
384}
385
386static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
387{
388 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
389
390 return md_raid5_congested(&rs->md, bits);
391}
392
393static void raid_unplug(struct dm_target_callbacks *cb)
394{
395 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
396
397 md_raid5_unplug_device(rs->md.private);
398}
399
400/*
401 * Construct a RAID4/5/6 mapping:
402 * Args:
403 * <raid_type> <#raid_params> <raid_params> \
404 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
405 *
406 * ** metadata devices are not supported yet, use '-' instead **
407 *
408 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
409 * details on possible <raid_params>.
410 */
411static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
412{
413 int ret;
414 struct raid_type *rt;
415 unsigned long num_raid_params, num_raid_devs;
416 struct raid_set *rs = NULL;
417
418 /* Must have at least <raid_type> <#raid_params> */
419 if (argc < 2) {
420 ti->error = "Too few arguments";
421 return -EINVAL;
422 }
423
424 /* raid type */
425 rt = get_raid_type(argv[0]);
426 if (!rt) {
427 ti->error = "Unrecognised raid_type";
428 return -EINVAL;
429 }
430 argc--;
431 argv++;
432
433 /* number of RAID parameters */
434 if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
435 ti->error = "Cannot understand number of RAID parameters";
436 return -EINVAL;
437 }
438 argc--;
439 argv++;
440
441 /* Skip over RAID params for now and find out # of devices */
442 if (num_raid_params + 1 > argc) {
443 ti->error = "Arguments do not agree with counts given";
444 return -EINVAL;
445 }
446
447 if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
448 (num_raid_devs >= INT_MAX)) {
449 ti->error = "Cannot understand number of raid devices";
450 return -EINVAL;
451 }
452
453 rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
454 if (IS_ERR(rs))
455 return PTR_ERR(rs);
456
457 ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
458 if (ret)
459 goto bad;
460
461 ret = -EINVAL;
462
463 argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
464 argv += num_raid_params + 1;
465
466 if (argc != (num_raid_devs * 2)) {
467 ti->error = "Supplied RAID devices does not match the count given";
468 goto bad;
469 }
470
471 ret = dev_parms(rs, argv);
472 if (ret)
473 goto bad;
474
475 INIT_WORK(&rs->md.event_work, do_table_event);
476 ti->split_io = rs->md.chunk_sectors;
477 ti->private = rs;
478
479 mutex_lock(&rs->md.reconfig_mutex);
480 ret = md_run(&rs->md);
481 rs->md.in_sync = 0; /* Assume already marked dirty */
482 mutex_unlock(&rs->md.reconfig_mutex);
483
484 if (ret) {
485 ti->error = "Fail to run raid array";
486 goto bad;
487 }
488
489 rs->callbacks.congested_fn = raid_is_congested;
490 rs->callbacks.unplug_fn = raid_unplug;
491 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
492
493 return 0;
494
495bad:
496 context_free(rs);
497
498 return ret;
499}
500
501static void raid_dtr(struct dm_target *ti)
502{
503 struct raid_set *rs = ti->private;
504
505 list_del_init(&rs->callbacks.list);
506 md_stop(&rs->md);
507 context_free(rs);
508}
509
510static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
511{
512 struct raid_set *rs = ti->private;
513 mddev_t *mddev = &rs->md;
514
515 mddev->pers->make_request(mddev, bio);
516
517 return DM_MAPIO_SUBMITTED;
518}
519
520static int raid_status(struct dm_target *ti, status_type_t type,
521 char *result, unsigned maxlen)
522{
523 struct raid_set *rs = ti->private;
524 unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
525 unsigned sz = 0;
526 int i;
527 sector_t sync;
528
529 switch (type) {
530 case STATUSTYPE_INFO:
531 DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
532
533 for (i = 0; i < rs->md.raid_disks; i++) {
534 if (test_bit(Faulty, &rs->dev[i].rdev.flags))
535 DMEMIT("D");
536 else if (test_bit(In_sync, &rs->dev[i].rdev.flags))
537 DMEMIT("A");
538 else
539 DMEMIT("a");
540 }
541
542 if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
543 sync = rs->md.curr_resync_completed;
544 else
545 sync = rs->md.recovery_cp;
546
547 if (sync > rs->md.resync_max_sectors)
548 sync = rs->md.resync_max_sectors;
549
550 DMEMIT(" %llu/%llu",
551 (unsigned long long) sync,
552 (unsigned long long) rs->md.resync_max_sectors);
553
554 break;
555 case STATUSTYPE_TABLE:
556 /* The string you would use to construct this array */
557 for (i = 0; i < rs->md.raid_disks; i++)
558 if (rs->dev[i].data_dev &&
559 !test_bit(In_sync, &rs->dev[i].rdev.flags))
560 raid_param_cnt++; /* for rebuilds */
561
562 raid_param_cnt += (hweight64(rs->print_flags) * 2);
563 if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
564 raid_param_cnt--;
565
566 DMEMIT("%s %u %u", rs->raid_type->name,
567 raid_param_cnt, rs->md.chunk_sectors);
568
569 if ((rs->print_flags & DMPF_SYNC) &&
570 (rs->md.recovery_cp == MaxSector))
571 DMEMIT(" sync");
572 if (rs->print_flags & DMPF_NOSYNC)
573 DMEMIT(" nosync");
574
575 for (i = 0; i < rs->md.raid_disks; i++)
576 if (rs->dev[i].data_dev &&
577 !test_bit(In_sync, &rs->dev[i].rdev.flags))
578 DMEMIT(" rebuild %u", i);
579
580 if (rs->print_flags & DMPF_DAEMON_SLEEP)
581 DMEMIT(" daemon_sleep %lu",
582 rs->md.bitmap_info.daemon_sleep);
583
584 if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
585 DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
586
587 if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
588 DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
589
590 if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
591 DMEMIT(" max_write_behind %lu",
592 rs->md.bitmap_info.max_write_behind);
593
594 if (rs->print_flags & DMPF_STRIPE_CACHE) {
595 raid5_conf_t *conf = rs->md.private;
596
597 /* convert from kiB to sectors */
598 DMEMIT(" stripe_cache %d",
599 conf ? conf->max_nr_stripes * 2 : 0);
600 }
601
602 DMEMIT(" %d", rs->md.raid_disks);
603 for (i = 0; i < rs->md.raid_disks; i++) {
604 DMEMIT(" -"); /* metadata device */
605
606 if (rs->dev[i].data_dev)
607 DMEMIT(" %s", rs->dev[i].data_dev->name);
608 else
609 DMEMIT(" -");
610 }
611 }
612
613 return 0;
614}
615
616static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
617{
618 struct raid_set *rs = ti->private;
619 unsigned i;
620 int ret = 0;
621
622 for (i = 0; !ret && i < rs->md.raid_disks; i++)
623 if (rs->dev[i].data_dev)
624 ret = fn(ti,
625 rs->dev[i].data_dev,
626 0, /* No offset on data devs */
627 rs->md.dev_sectors,
628 data);
629
630 return ret;
631}
632
633static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
634{
635 struct raid_set *rs = ti->private;
636 unsigned chunk_size = rs->md.chunk_sectors << 9;
637 raid5_conf_t *conf = rs->md.private;
638
639 blk_limits_io_min(limits, chunk_size);
640 blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
641}
642
643static void raid_presuspend(struct dm_target *ti)
644{
645 struct raid_set *rs = ti->private;
646
647 md_stop_writes(&rs->md);
648}
649
650static void raid_postsuspend(struct dm_target *ti)
651{
652 struct raid_set *rs = ti->private;
653
654 mddev_suspend(&rs->md);
655}
656
657static void raid_resume(struct dm_target *ti)
658{
659 struct raid_set *rs = ti->private;
660
661 mddev_resume(&rs->md);
662}
663
664static struct target_type raid_target = {
665 .name = "raid",
666 .version = {1, 0, 0},
667 .module = THIS_MODULE,
668 .ctr = raid_ctr,
669 .dtr = raid_dtr,
670 .map = raid_map,
671 .status = raid_status,
672 .iterate_devices = raid_iterate_devices,
673 .io_hints = raid_io_hints,
674 .presuspend = raid_presuspend,
675 .postsuspend = raid_postsuspend,
676 .resume = raid_resume,
677};
678
679static int __init dm_raid_init(void)
680{
681 return dm_register_target(&raid_target);
682}
683
684static void __exit dm_raid_exit(void)
685{
686 dm_unregister_target(&raid_target);
687}
688
689module_init(dm_raid_init);
690module_exit(dm_raid_exit);
691
692MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
693MODULE_ALIAS("dm-raid4");
694MODULE_ALIAS("dm-raid5");
695MODULE_ALIAS("dm-raid6");
696MODULE_AUTHOR("Neil Brown <dm-devel@redhat.com>");
697MODULE_LICENSE("GPL");