diff options
-rw-r--r-- | Documentation/device-mapper/dm-raid.txt | 44 | ||||
-rw-r--r-- | drivers/md/Kconfig | 11 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 123 | ||||
-rw-r--r-- | drivers/md/md.c | 19 | ||||
-rw-r--r-- | drivers/md/raid0.c | 13 | ||||
-rw-r--r-- | drivers/md/raid1.c | 8 | ||||
-rw-r--r-- | drivers/md/raid10.c | 97 | ||||
-rw-r--r-- | drivers/md/raid10.h | 5 | ||||
-rw-r--r-- | drivers/md/raid5.c | 38 |
9 files changed, 256 insertions, 102 deletions
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 56fb62b09fc5..b428556197c9 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt | |||
@@ -30,6 +30,7 @@ The target is named "raid" and it accepts the following parameters: | |||
30 | raid10 Various RAID10 inspired algorithms chosen by additional params | 30 | raid10 Various RAID10 inspired algorithms chosen by additional params |
31 | - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') | 31 | - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') |
32 | - RAID1E: Integrated Adjacent Stripe Mirroring | 32 | - RAID1E: Integrated Adjacent Stripe Mirroring |
33 | - RAID1E: Integrated Offset Stripe Mirroring | ||
33 | - and other similar RAID10 variants | 34 | - and other similar RAID10 variants |
34 | 35 | ||
35 | Reference: Chapter 4 of | 36 | Reference: Chapter 4 of |
@@ -64,15 +65,15 @@ The target is named "raid" and it accepts the following parameters: | |||
64 | synchronisation state for each region. | 65 | synchronisation state for each region. |
65 | 66 | ||
66 | [raid10_copies <# copies>] | 67 | [raid10_copies <# copies>] |
67 | [raid10_format near] | 68 | [raid10_format <near|far|offset>] |
68 | These two options are used to alter the default layout of | 69 | These two options are used to alter the default layout of |
69 | a RAID10 configuration. The number of copies is can be | 70 | a RAID10 configuration. The number of copies is can be |
70 | specified, but the default is 2. There are other variations | 71 | specified, but the default is 2. There are also three |
71 | to how the copies are laid down - the default and only current | 72 | variations to how the copies are laid down - the default |
72 | option is "near". Near copies are what most people think of | 73 | is "near". Near copies are what most people think of with |
73 | with respect to mirroring. If these options are left | 74 | respect to mirroring. If these options are left unspecified, |
74 | unspecified, or 'raid10_copies 2' and/or 'raid10_format near' | 75 | or 'raid10_copies 2' and/or 'raid10_format near' are given, |
75 | are given, then the layouts for 2, 3 and 4 devices are: | 76 | then the layouts for 2, 3 and 4 devices are: |
76 | 2 drives 3 drives 4 drives | 77 | 2 drives 3 drives 4 drives |
77 | -------- ---------- -------------- | 78 | -------- ---------- -------------- |
78 | A1 A1 A1 A1 A2 A1 A1 A2 A2 | 79 | A1 A1 A1 A1 A2 A1 A1 A2 A2 |
@@ -85,6 +86,33 @@ The target is named "raid" and it accepts the following parameters: | |||
85 | 3-device layout is what might be called a 'RAID1E - Integrated | 86 | 3-device layout is what might be called a 'RAID1E - Integrated |
86 | Adjacent Stripe Mirroring'. | 87 | Adjacent Stripe Mirroring'. |
87 | 88 | ||
89 | If 'raid10_copies 2' and 'raid10_format far', then the layouts | ||
90 | for 2, 3 and 4 devices are: | ||
91 | 2 drives 3 drives 4 drives | ||
92 | -------- -------------- -------------------- | ||
93 | A1 A2 A1 A2 A3 A1 A2 A3 A4 | ||
94 | A3 A4 A4 A5 A6 A5 A6 A7 A8 | ||
95 | A5 A6 A7 A8 A9 A9 A10 A11 A12 | ||
96 | .. .. .. .. .. .. .. .. .. | ||
97 | A2 A1 A3 A1 A2 A2 A1 A4 A3 | ||
98 | A4 A3 A6 A4 A5 A6 A5 A8 A7 | ||
99 | A6 A5 A9 A7 A8 A10 A9 A12 A11 | ||
100 | .. .. .. .. .. .. .. .. .. | ||
101 | |||
102 | If 'raid10_copies 2' and 'raid10_format offset', then the | ||
103 | layouts for 2, 3 and 4 devices are: | ||
104 | 2 drives 3 drives 4 drives | ||
105 | -------- ------------ ----------------- | ||
106 | A1 A2 A1 A2 A3 A1 A2 A3 A4 | ||
107 | A2 A1 A3 A1 A2 A2 A1 A4 A3 | ||
108 | A3 A4 A4 A5 A6 A5 A6 A7 A8 | ||
109 | A4 A3 A6 A4 A5 A6 A5 A8 A7 | ||
110 | A5 A6 A7 A8 A9 A9 A10 A11 A12 | ||
111 | A6 A5 A9 A7 A8 A10 A9 A12 A11 | ||
112 | .. .. .. .. .. .. .. .. .. | ||
113 | Here we see layouts closely akin to 'RAID1E - Integrated | ||
114 | Offset Stripe Mirroring'. | ||
115 | |||
88 | <#raid_devs>: The number of devices composing the array. | 116 | <#raid_devs>: The number of devices composing the array. |
89 | Each device consists of two entries. The first is the device | 117 | Each device consists of two entries. The first is the device |
90 | containing the metadata (if any); the second is the one containing the | 118 | containing the metadata (if any); the second is the one containing the |
@@ -142,3 +170,5 @@ Version History | |||
142 | 1.3.0 Added support for RAID 10 | 170 | 1.3.0 Added support for RAID 10 |
143 | 1.3.1 Allow device replacement/rebuild for RAID 10 | 171 | 1.3.1 Allow device replacement/rebuild for RAID 10 |
144 | 1.3.2 Fix/improve redundancy checking for RAID10 | 172 | 1.3.2 Fix/improve redundancy checking for RAID10 |
173 | 1.4.0 Non-functional change. Removes arg from mapping function. | ||
174 | 1.4.1 Add RAID10 "far" and "offset" algorithm support. | ||
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index e30b490055aa..4d8d90b4fe78 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig | |||
@@ -154,17 +154,6 @@ config MD_RAID456 | |||
154 | 154 | ||
155 | If unsure, say Y. | 155 | If unsure, say Y. |
156 | 156 | ||
157 | config MULTICORE_RAID456 | ||
158 | bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)" | ||
159 | depends on MD_RAID456 | ||
160 | depends on SMP | ||
161 | depends on EXPERIMENTAL | ||
162 | ---help--- | ||
163 | Enable the raid456 module to dispatch per-stripe raid operations to a | ||
164 | thread pool. | ||
165 | |||
166 | If unsure, say N. | ||
167 | |||
168 | config MD_MULTIPATH | 157 | config MD_MULTIPATH |
169 | tristate "Multipath I/O support" | 158 | tristate "Multipath I/O support" |
170 | depends on BLK_DEV_MD | 159 | depends on BLK_DEV_MD |
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9a01d1e4c783..311e3d35b272 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c | |||
@@ -91,15 +91,44 @@ static struct raid_type { | |||
91 | {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} | 91 | {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} |
92 | }; | 92 | }; |
93 | 93 | ||
94 | static char *raid10_md_layout_to_format(int layout) | ||
95 | { | ||
96 | /* | ||
97 | * Bit 16 and 17 stand for "offset" and "use_far_sets" | ||
98 | * Refer to MD's raid10.c for details | ||
99 | */ | ||
100 | if ((layout & 0x10000) && (layout & 0x20000)) | ||
101 | return "offset"; | ||
102 | |||
103 | if ((layout & 0xFF) > 1) | ||
104 | return "near"; | ||
105 | |||
106 | return "far"; | ||
107 | } | ||
108 | |||
94 | static unsigned raid10_md_layout_to_copies(int layout) | 109 | static unsigned raid10_md_layout_to_copies(int layout) |
95 | { | 110 | { |
96 | return layout & 0xFF; | 111 | if ((layout & 0xFF) > 1) |
112 | return layout & 0xFF; | ||
113 | return (layout >> 8) & 0xFF; | ||
97 | } | 114 | } |
98 | 115 | ||
99 | static int raid10_format_to_md_layout(char *format, unsigned copies) | 116 | static int raid10_format_to_md_layout(char *format, unsigned copies) |
100 | { | 117 | { |
101 | /* 1 "far" copy, and 'copies' "near" copies */ | 118 | unsigned n = 1, f = 1; |
102 | return (1 << 8) | (copies & 0xFF); | 119 | |
120 | if (!strcmp("near", format)) | ||
121 | n = copies; | ||
122 | else | ||
123 | f = copies; | ||
124 | |||
125 | if (!strcmp("offset", format)) | ||
126 | return 0x30000 | (f << 8) | n; | ||
127 | |||
128 | if (!strcmp("far", format)) | ||
129 | return 0x20000 | (f << 8) | n; | ||
130 | |||
131 | return (f << 8) | n; | ||
103 | } | 132 | } |
104 | 133 | ||
105 | static struct raid_type *get_raid_type(char *name) | 134 | static struct raid_type *get_raid_type(char *name) |
@@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs) | |||
352 | { | 381 | { |
353 | unsigned i, rebuild_cnt = 0; | 382 | unsigned i, rebuild_cnt = 0; |
354 | unsigned rebuilds_per_group, copies, d; | 383 | unsigned rebuilds_per_group, copies, d; |
384 | unsigned group_size, last_group_start; | ||
355 | 385 | ||
356 | for (i = 0; i < rs->md.raid_disks; i++) | 386 | for (i = 0; i < rs->md.raid_disks; i++) |
357 | if (!test_bit(In_sync, &rs->dev[i].rdev.flags) || | 387 | if (!test_bit(In_sync, &rs->dev[i].rdev.flags) || |
@@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs) | |||
379 | * as long as the failed devices occur in different mirror | 409 | * as long as the failed devices occur in different mirror |
380 | * groups (i.e. different stripes). | 410 | * groups (i.e. different stripes). |
381 | * | 411 | * |
382 | * Right now, we only allow for "near" copies. When other | ||
383 | * formats are added, we will have to check those too. | ||
384 | * | ||
385 | * When checking "near" format, make sure no adjacent devices | 412 | * When checking "near" format, make sure no adjacent devices |
386 | * have failed beyond what can be handled. In addition to the | 413 | * have failed beyond what can be handled. In addition to the |
387 | * simple case where the number of devices is a multiple of the | 414 | * simple case where the number of devices is a multiple of the |
@@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs) | |||
391 | * A A B B C | 418 | * A A B B C |
392 | * C D D E E | 419 | * C D D E E |
393 | */ | 420 | */ |
394 | for (i = 0; i < rs->md.raid_disks * copies; i++) { | 421 | if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) { |
395 | if (!(i % copies)) | 422 | for (i = 0; i < rs->md.raid_disks * copies; i++) { |
423 | if (!(i % copies)) | ||
424 | rebuilds_per_group = 0; | ||
425 | d = i % rs->md.raid_disks; | ||
426 | if ((!rs->dev[d].rdev.sb_page || | ||
427 | !test_bit(In_sync, &rs->dev[d].rdev.flags)) && | ||
428 | (++rebuilds_per_group >= copies)) | ||
429 | goto too_many; | ||
430 | } | ||
431 | break; | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * When checking "far" and "offset" formats, we need to ensure | ||
436 | * that the device that holds its copy is not also dead or | ||
437 | * being rebuilt. (Note that "far" and "offset" formats only | ||
438 | * support two copies right now. These formats also only ever | ||
439 | * use the 'use_far_sets' variant.) | ||
440 | * | ||
441 | * This check is somewhat complicated by the need to account | ||
442 | * for arrays that are not a multiple of (far) copies. This | ||
443 | * results in the need to treat the last (potentially larger) | ||
444 | * set differently. | ||
445 | */ | ||
446 | group_size = (rs->md.raid_disks / copies); | ||
447 | last_group_start = (rs->md.raid_disks / group_size) - 1; | ||
448 | last_group_start *= group_size; | ||
449 | for (i = 0; i < rs->md.raid_disks; i++) { | ||
450 | if (!(i % copies) && !(i > last_group_start)) | ||
396 | rebuilds_per_group = 0; | 451 | rebuilds_per_group = 0; |
397 | d = i % rs->md.raid_disks; | 452 | if ((!rs->dev[i].rdev.sb_page || |
398 | if ((!rs->dev[d].rdev.sb_page || | 453 | !test_bit(In_sync, &rs->dev[i].rdev.flags)) && |
399 | !test_bit(In_sync, &rs->dev[d].rdev.flags)) && | ||
400 | (++rebuilds_per_group >= copies)) | 454 | (++rebuilds_per_group >= copies)) |
401 | goto too_many; | 455 | goto too_many; |
402 | } | 456 | } |
403 | break; | 457 | break; |
404 | default: | 458 | default: |
@@ -433,7 +487,7 @@ too_many: | |||
433 | * | 487 | * |
434 | * RAID10-only options: | 488 | * RAID10-only options: |
435 | * [raid10_copies <# copies>] Number of copies. (Default: 2) | 489 | * [raid10_copies <# copies>] Number of copies. (Default: 2) |
436 | * [raid10_format <near>] Layout algorithm. (Default: near) | 490 | * [raid10_format <near|far|offset>] Layout algorithm. (Default: near) |
437 | */ | 491 | */ |
438 | static int parse_raid_params(struct raid_set *rs, char **argv, | 492 | static int parse_raid_params(struct raid_set *rs, char **argv, |
439 | unsigned num_raid_params) | 493 | unsigned num_raid_params) |
@@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
520 | rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; | 574 | rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; |
521 | return -EINVAL; | 575 | return -EINVAL; |
522 | } | 576 | } |
523 | if (strcmp("near", argv[i])) { | 577 | if (strcmp("near", argv[i]) && |
578 | strcmp("far", argv[i]) && | ||
579 | strcmp("offset", argv[i])) { | ||
524 | rs->ti->error = "Invalid 'raid10_format' value given"; | 580 | rs->ti->error = "Invalid 'raid10_format' value given"; |
525 | return -EINVAL; | 581 | return -EINVAL; |
526 | } | 582 | } |
@@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
644 | return -EINVAL; | 700 | return -EINVAL; |
645 | } | 701 | } |
646 | 702 | ||
703 | /* | ||
704 | * If the format is not "near", we only support | ||
705 | * two copies at the moment. | ||
706 | */ | ||
707 | if (strcmp("near", raid10_format) && (raid10_copies > 2)) { | ||
708 | rs->ti->error = "Too many copies for given RAID10 format."; | ||
709 | return -EINVAL; | ||
710 | } | ||
711 | |||
647 | /* (Len * #mirrors) / #devices */ | 712 | /* (Len * #mirrors) / #devices */ |
648 | sectors_per_dev = rs->ti->len * raid10_copies; | 713 | sectors_per_dev = rs->ti->len * raid10_copies; |
649 | sector_div(sectors_per_dev, rs->md.raid_disks); | 714 | sector_div(sectors_per_dev, rs->md.raid_disks); |
@@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) | |||
854 | /* | 919 | /* |
855 | * Reshaping is not currently allowed | 920 | * Reshaping is not currently allowed |
856 | */ | 921 | */ |
857 | if ((le32_to_cpu(sb->level) != mddev->level) || | 922 | if (le32_to_cpu(sb->level) != mddev->level) { |
858 | (le32_to_cpu(sb->layout) != mddev->layout) || | 923 | DMERR("Reshaping arrays not yet supported. (RAID level change)"); |
859 | (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { | 924 | return -EINVAL; |
860 | DMERR("Reshaping arrays not yet supported."); | 925 | } |
926 | if (le32_to_cpu(sb->layout) != mddev->layout) { | ||
927 | DMERR("Reshaping arrays not yet supported. (RAID layout change)"); | ||
928 | DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout); | ||
929 | DMERR(" Old layout: %s w/ %d copies", | ||
930 | raid10_md_layout_to_format(le32_to_cpu(sb->layout)), | ||
931 | raid10_md_layout_to_copies(le32_to_cpu(sb->layout))); | ||
932 | DMERR(" New layout: %s w/ %d copies", | ||
933 | raid10_md_layout_to_format(mddev->layout), | ||
934 | raid10_md_layout_to_copies(mddev->layout)); | ||
935 | return -EINVAL; | ||
936 | } | ||
937 | if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) { | ||
938 | DMERR("Reshaping arrays not yet supported. (stripe sectors change)"); | ||
861 | return -EINVAL; | 939 | return -EINVAL; |
862 | } | 940 | } |
863 | 941 | ||
864 | /* We can only change the number of devices in RAID1 right now */ | 942 | /* We can only change the number of devices in RAID1 right now */ |
865 | if ((rs->raid_type->level != 1) && | 943 | if ((rs->raid_type->level != 1) && |
866 | (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { | 944 | (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { |
867 | DMERR("Reshaping arrays not yet supported."); | 945 | DMERR("Reshaping arrays not yet supported. (device count change)"); |
868 | return -EINVAL; | 946 | return -EINVAL; |
869 | } | 947 | } |
870 | 948 | ||
@@ -1329,7 +1407,8 @@ static void raid_status(struct dm_target *ti, status_type_t type, | |||
1329 | raid10_md_layout_to_copies(rs->md.layout)); | 1407 | raid10_md_layout_to_copies(rs->md.layout)); |
1330 | 1408 | ||
1331 | if (rs->print_flags & DMPF_RAID10_FORMAT) | 1409 | if (rs->print_flags & DMPF_RAID10_FORMAT) |
1332 | DMEMIT(" raid10_format near"); | 1410 | DMEMIT(" raid10_format %s", |
1411 | raid10_md_layout_to_format(rs->md.layout)); | ||
1333 | 1412 | ||
1334 | DMEMIT(" %d", rs->md.raid_disks); | 1413 | DMEMIT(" %d", rs->md.raid_disks); |
1335 | for (i = 0; i < rs->md.raid_disks; i++) { | 1414 | for (i = 0; i < rs->md.raid_disks; i++) { |
@@ -1418,6 +1497,10 @@ static struct target_type raid_target = { | |||
1418 | 1497 | ||
1419 | static int __init dm_raid_init(void) | 1498 | static int __init dm_raid_init(void) |
1420 | { | 1499 | { |
1500 | DMINFO("Loading target version %u.%u.%u", | ||
1501 | raid_target.version[0], | ||
1502 | raid_target.version[1], | ||
1503 | raid_target.version[2]); | ||
1421 | return dm_register_target(&raid_target); | 1504 | return dm_register_target(&raid_target); |
1422 | } | 1505 | } |
1423 | 1506 | ||
diff --git a/drivers/md/md.c b/drivers/md/md.c index 3db3d1b271f7..fcb878f88796 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -307,6 +307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio) | |||
307 | bio_io_error(bio); | 307 | bio_io_error(bio); |
308 | return; | 308 | return; |
309 | } | 309 | } |
310 | if (mddev->ro == 1 && unlikely(rw == WRITE)) { | ||
311 | bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS); | ||
312 | return; | ||
313 | } | ||
310 | smp_rmb(); /* Ensure implications of 'active' are visible */ | 314 | smp_rmb(); /* Ensure implications of 'active' are visible */ |
311 | rcu_read_lock(); | 315 | rcu_read_lock(); |
312 | if (mddev->suspended) { | 316 | if (mddev->suspended) { |
@@ -2994,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2994 | } else if (!sectors) | 2998 | } else if (!sectors) |
2995 | sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - | 2999 | sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - |
2996 | rdev->data_offset; | 3000 | rdev->data_offset; |
3001 | if (!my_mddev->pers->resize) | ||
3002 | /* Cannot change size for RAID0 or Linear etc */ | ||
3003 | return -EINVAL; | ||
2997 | } | 3004 | } |
2998 | if (sectors < my_mddev->dev_sectors) | 3005 | if (sectors < my_mddev->dev_sectors) |
2999 | return -EINVAL; /* component must fit device */ | 3006 | return -EINVAL; /* component must fit device */ |
@@ -6525,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, | |||
6525 | mddev->ro = 0; | 6532 | mddev->ro = 0; |
6526 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 6533 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
6527 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 6534 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
6528 | md_wakeup_thread(mddev->thread); | 6535 | /* mddev_unlock will wake thread */ |
6536 | /* If a device failed while we were read-only, we | ||
6537 | * need to make sure the metadata is updated now. | ||
6538 | */ | ||
6539 | if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { | ||
6540 | mddev_unlock(mddev); | ||
6541 | wait_event(mddev->sb_wait, | ||
6542 | !test_bit(MD_CHANGE_DEVS, &mddev->flags) && | ||
6543 | !test_bit(MD_CHANGE_PENDING, &mddev->flags)); | ||
6544 | mddev_lock(mddev); | ||
6545 | } | ||
6529 | } else { | 6546 | } else { |
6530 | err = -EROFS; | 6547 | err = -EROFS; |
6531 | goto abort_unlock; | 6548 | goto abort_unlock; |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 24b359717a7e..0505452de8d6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) | |||
175 | rdev1->new_raid_disk = j; | 175 | rdev1->new_raid_disk = j; |
176 | } | 176 | } |
177 | 177 | ||
178 | if (j < 0 || j >= mddev->raid_disks) { | 178 | if (j < 0) { |
179 | printk(KERN_ERR | ||
180 | "md/raid0:%s: remove inactive devices before converting to RAID0\n", | ||
181 | mdname(mddev)); | ||
182 | goto abort; | ||
183 | } | ||
184 | if (j >= mddev->raid_disks) { | ||
179 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " | 185 | printk(KERN_ERR "md/raid0:%s: bad disk number %d - " |
180 | "aborting!\n", mdname(mddev), j); | 186 | "aborting!\n", mdname(mddev), j); |
181 | goto abort; | 187 | goto abort; |
@@ -289,7 +295,7 @@ abort: | |||
289 | kfree(conf->strip_zone); | 295 | kfree(conf->strip_zone); |
290 | kfree(conf->devlist); | 296 | kfree(conf->devlist); |
291 | kfree(conf); | 297 | kfree(conf); |
292 | *private_conf = NULL; | 298 | *private_conf = ERR_PTR(err); |
293 | return err; | 299 | return err; |
294 | } | 300 | } |
295 | 301 | ||
@@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks | |||
411 | "%s does not support generic reshape\n", __func__); | 417 | "%s does not support generic reshape\n", __func__); |
412 | 418 | ||
413 | rdev_for_each(rdev, mddev) | 419 | rdev_for_each(rdev, mddev) |
414 | array_sectors += rdev->sectors; | 420 | array_sectors += (rdev->sectors & |
421 | ~(sector_t)(mddev->chunk_sectors-1)); | ||
415 | 422 | ||
416 | return array_sectors; | 423 | return array_sectors; |
417 | } | 424 | } |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d5bddfc4010e..fd86b372692d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -967,6 +967,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
967 | bio_list_merge(&conf->pending_bio_list, &plug->pending); | 967 | bio_list_merge(&conf->pending_bio_list, &plug->pending); |
968 | conf->pending_count += plug->pending_cnt; | 968 | conf->pending_count += plug->pending_cnt; |
969 | spin_unlock_irq(&conf->device_lock); | 969 | spin_unlock_irq(&conf->device_lock); |
970 | wake_up(&conf->wait_barrier); | ||
970 | md_wakeup_thread(mddev->thread); | 971 | md_wakeup_thread(mddev->thread); |
971 | kfree(plug); | 972 | kfree(plug); |
972 | return; | 973 | return; |
@@ -1000,6 +1001,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1000 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); | 1001 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); |
1001 | const unsigned long do_discard = (bio->bi_rw | 1002 | const unsigned long do_discard = (bio->bi_rw |
1002 | & (REQ_DISCARD | REQ_SECURE)); | 1003 | & (REQ_DISCARD | REQ_SECURE)); |
1004 | const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); | ||
1003 | struct md_rdev *blocked_rdev; | 1005 | struct md_rdev *blocked_rdev; |
1004 | struct blk_plug_cb *cb; | 1006 | struct blk_plug_cb *cb; |
1005 | struct raid1_plug_cb *plug = NULL; | 1007 | struct raid1_plug_cb *plug = NULL; |
@@ -1301,7 +1303,8 @@ read_again: | |||
1301 | conf->mirrors[i].rdev->data_offset); | 1303 | conf->mirrors[i].rdev->data_offset); |
1302 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 1304 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
1303 | mbio->bi_end_io = raid1_end_write_request; | 1305 | mbio->bi_end_io = raid1_end_write_request; |
1304 | mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; | 1306 | mbio->bi_rw = |
1307 | WRITE | do_flush_fua | do_sync | do_discard | do_same; | ||
1305 | mbio->bi_private = r1_bio; | 1308 | mbio->bi_private = r1_bio; |
1306 | 1309 | ||
1307 | atomic_inc(&r1_bio->remaining); | 1310 | atomic_inc(&r1_bio->remaining); |
@@ -2818,6 +2821,9 @@ static int run(struct mddev *mddev) | |||
2818 | if (IS_ERR(conf)) | 2821 | if (IS_ERR(conf)) |
2819 | return PTR_ERR(conf); | 2822 | return PTR_ERR(conf); |
2820 | 2823 | ||
2824 | if (mddev->queue) | ||
2825 | blk_queue_max_write_same_sectors(mddev->queue, | ||
2826 | mddev->chunk_sectors); | ||
2821 | rdev_for_each(rdev, mddev) { | 2827 | rdev_for_each(rdev, mddev) { |
2822 | if (!mddev->gendisk) | 2828 | if (!mddev->gendisk) |
2823 | continue; | 2829 | continue; |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 64d48249c03b..77b562d18a90 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -38,21 +38,36 @@ | |||
38 | * near_copies (stored in low byte of layout) | 38 | * near_copies (stored in low byte of layout) |
39 | * far_copies (stored in second byte of layout) | 39 | * far_copies (stored in second byte of layout) |
40 | * far_offset (stored in bit 16 of layout ) | 40 | * far_offset (stored in bit 16 of layout ) |
41 | * use_far_sets (stored in bit 17 of layout ) | ||
41 | * | 42 | * |
42 | * The data to be stored is divided into chunks using chunksize. | 43 | * The data to be stored is divided into chunks using chunksize. Each device |
43 | * Each device is divided into far_copies sections. | 44 | * is divided into far_copies sections. In each section, chunks are laid out |
44 | * In each section, chunks are laid out in a style similar to raid0, but | 45 | * in a style similar to raid0, but near_copies copies of each chunk is stored |
45 | * near_copies copies of each chunk is stored (each on a different drive). | 46 | * (each on a different drive). The starting device for each section is offset |
46 | * The starting device for each section is offset near_copies from the starting | 47 | * near_copies from the starting device of the previous section. Thus there |
47 | * device of the previous section. | 48 | * are (near_copies * far_copies) of each chunk, and each is on a different |
48 | * Thus they are (near_copies*far_copies) of each chunk, and each is on a different | 49 | * drive. near_copies and far_copies must be at least one, and their product |
49 | * drive. | 50 | * is at most raid_disks. |
50 | * near_copies and far_copies must be at least one, and their product is at most | ||
51 | * raid_disks. | ||
52 | * | 51 | * |
53 | * If far_offset is true, then the far_copies are handled a bit differently. | 52 | * If far_offset is true, then the far_copies are handled a bit differently. |
54 | * The copies are still in different stripes, but instead of be very far apart | 53 | * The copies are still in different stripes, but instead of being very far |
55 | * on disk, there are adjacent stripes. | 54 | * apart on disk, there are adjacent stripes. |
55 | * | ||
56 | * The far and offset algorithms are handled slightly differently if | ||
57 | * 'use_far_sets' is true. In this case, the array's devices are grouped into | ||
58 | * sets that are (near_copies * far_copies) in size. The far copied stripes | ||
59 | * are still shifted by 'near_copies' devices, but this shifting stays confined | ||
60 | * to the set rather than the entire array. This is done to improve the number | ||
61 | * of device combinations that can fail without causing the array to fail. | ||
62 | * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk | ||
63 | * on a device): | ||
64 | * A B C D A B C D E | ||
65 | * ... ... | ||
66 | * D A B C E A B C D | ||
67 | * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s): | ||
68 | * [A B] [C D] [A B] [C D E] | ||
69 | * |...| |...| |...| | ... | | ||
70 | * [B A] [D C] [B A] [E C D] | ||
56 | */ | 71 | */ |
57 | 72 | ||
58 | /* | 73 | /* |
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) | |||
535 | sector_t stripe; | 550 | sector_t stripe; |
536 | int dev; | 551 | int dev; |
537 | int slot = 0; | 552 | int slot = 0; |
553 | int last_far_set_start, last_far_set_size; | ||
554 | |||
555 | last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; | ||
556 | last_far_set_start *= geo->far_set_size; | ||
557 | |||
558 | last_far_set_size = geo->far_set_size; | ||
559 | last_far_set_size += (geo->raid_disks % geo->far_set_size); | ||
538 | 560 | ||
539 | /* now calculate first sector/dev */ | 561 | /* now calculate first sector/dev */ |
540 | chunk = r10bio->sector >> geo->chunk_shift; | 562 | chunk = r10bio->sector >> geo->chunk_shift; |
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) | |||
551 | /* and calculate all the others */ | 573 | /* and calculate all the others */ |
552 | for (n = 0; n < geo->near_copies; n++) { | 574 | for (n = 0; n < geo->near_copies; n++) { |
553 | int d = dev; | 575 | int d = dev; |
576 | int set; | ||
554 | sector_t s = sector; | 577 | sector_t s = sector; |
555 | r10bio->devs[slot].addr = sector; | ||
556 | r10bio->devs[slot].devnum = d; | 578 | r10bio->devs[slot].devnum = d; |
579 | r10bio->devs[slot].addr = s; | ||
557 | slot++; | 580 | slot++; |
558 | 581 | ||
559 | for (f = 1; f < geo->far_copies; f++) { | 582 | for (f = 1; f < geo->far_copies; f++) { |
583 | set = d / geo->far_set_size; | ||
560 | d += geo->near_copies; | 584 | d += geo->near_copies; |
561 | if (d >= geo->raid_disks) | 585 | |
562 | d -= geo->raid_disks; | 586 | if ((geo->raid_disks % geo->far_set_size) && |
587 | (d > last_far_set_start)) { | ||
588 | d -= last_far_set_start; | ||
589 | d %= last_far_set_size; | ||
590 | d += last_far_set_start; | ||
591 | } else { | ||
592 | d %= geo->far_set_size; | ||
593 | d += geo->far_set_size * set; | ||
594 | } | ||
563 | s += geo->stride; | 595 | s += geo->stride; |
564 | r10bio->devs[slot].devnum = d; | 596 | r10bio->devs[slot].devnum = d; |
565 | r10bio->devs[slot].addr = s; | 597 | r10bio->devs[slot].addr = s; |
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) | |||
595 | * or recovery, so reshape isn't happening | 627 | * or recovery, so reshape isn't happening |
596 | */ | 628 | */ |
597 | struct geom *geo = &conf->geo; | 629 | struct geom *geo = &conf->geo; |
630 | int far_set_start = (dev / geo->far_set_size) * geo->far_set_size; | ||
631 | int far_set_size = geo->far_set_size; | ||
632 | int last_far_set_start; | ||
633 | |||
634 | if (geo->raid_disks % geo->far_set_size) { | ||
635 | last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; | ||
636 | last_far_set_start *= geo->far_set_size; | ||
637 | |||
638 | if (dev >= last_far_set_start) { | ||
639 | far_set_size = geo->far_set_size; | ||
640 | far_set_size += (geo->raid_disks % geo->far_set_size); | ||
641 | far_set_start = last_far_set_start; | ||
642 | } | ||
643 | } | ||
598 | 644 | ||
599 | offset = sector & geo->chunk_mask; | 645 | offset = sector & geo->chunk_mask; |
600 | if (geo->far_offset) { | 646 | if (geo->far_offset) { |
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) | |||
602 | chunk = sector >> geo->chunk_shift; | 648 | chunk = sector >> geo->chunk_shift; |
603 | fc = sector_div(chunk, geo->far_copies); | 649 | fc = sector_div(chunk, geo->far_copies); |
604 | dev -= fc * geo->near_copies; | 650 | dev -= fc * geo->near_copies; |
605 | if (dev < 0) | 651 | if (dev < far_set_start) |
606 | dev += geo->raid_disks; | 652 | dev += far_set_size; |
607 | } else { | 653 | } else { |
608 | while (sector >= geo->stride) { | 654 | while (sector >= geo->stride) { |
609 | sector -= geo->stride; | 655 | sector -= geo->stride; |
610 | if (dev < geo->near_copies) | 656 | if (dev < (geo->near_copies + far_set_start)) |
611 | dev += geo->raid_disks - geo->near_copies; | 657 | dev += far_set_size - geo->near_copies; |
612 | else | 658 | else |
613 | dev -= geo->near_copies; | 659 | dev -= geo->near_copies; |
614 | } | 660 | } |
@@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) | |||
1073 | bio_list_merge(&conf->pending_bio_list, &plug->pending); | 1119 | bio_list_merge(&conf->pending_bio_list, &plug->pending); |
1074 | conf->pending_count += plug->pending_cnt; | 1120 | conf->pending_count += plug->pending_cnt; |
1075 | spin_unlock_irq(&conf->device_lock); | 1121 | spin_unlock_irq(&conf->device_lock); |
1122 | wake_up(&conf->wait_barrier); | ||
1076 | md_wakeup_thread(mddev->thread); | 1123 | md_wakeup_thread(mddev->thread); |
1077 | kfree(plug); | 1124 | kfree(plug); |
1078 | return; | 1125 | return; |
@@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1105 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); | 1152 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); |
1106 | const unsigned long do_discard = (bio->bi_rw | 1153 | const unsigned long do_discard = (bio->bi_rw |
1107 | & (REQ_DISCARD | REQ_SECURE)); | 1154 | & (REQ_DISCARD | REQ_SECURE)); |
1155 | const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); | ||
1108 | unsigned long flags; | 1156 | unsigned long flags; |
1109 | struct md_rdev *blocked_rdev; | 1157 | struct md_rdev *blocked_rdev; |
1110 | struct blk_plug_cb *cb; | 1158 | struct blk_plug_cb *cb; |
@@ -1460,7 +1508,8 @@ retry_write: | |||
1460 | rdev)); | 1508 | rdev)); |
1461 | mbio->bi_bdev = rdev->bdev; | 1509 | mbio->bi_bdev = rdev->bdev; |
1462 | mbio->bi_end_io = raid10_end_write_request; | 1510 | mbio->bi_end_io = raid10_end_write_request; |
1463 | mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; | 1511 | mbio->bi_rw = |
1512 | WRITE | do_sync | do_fua | do_discard | do_same; | ||
1464 | mbio->bi_private = r10_bio; | 1513 | mbio->bi_private = r10_bio; |
1465 | 1514 | ||
1466 | atomic_inc(&r10_bio->remaining); | 1515 | atomic_inc(&r10_bio->remaining); |
@@ -1502,7 +1551,8 @@ retry_write: | |||
1502 | r10_bio, rdev)); | 1551 | r10_bio, rdev)); |
1503 | mbio->bi_bdev = rdev->bdev; | 1552 | mbio->bi_bdev = rdev->bdev; |
1504 | mbio->bi_end_io = raid10_end_write_request; | 1553 | mbio->bi_end_io = raid10_end_write_request; |
1505 | mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; | 1554 | mbio->bi_rw = |
1555 | WRITE | do_sync | do_fua | do_discard | do_same; | ||
1506 | mbio->bi_private = r10_bio; | 1556 | mbio->bi_private = r10_bio; |
1507 | 1557 | ||
1508 | atomic_inc(&r10_bio->remaining); | 1558 | atomic_inc(&r10_bio->remaining); |
@@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) | |||
3436 | disks = mddev->raid_disks + mddev->delta_disks; | 3486 | disks = mddev->raid_disks + mddev->delta_disks; |
3437 | break; | 3487 | break; |
3438 | } | 3488 | } |
3439 | if (layout >> 17) | 3489 | if (layout >> 18) |
3440 | return -1; | 3490 | return -1; |
3441 | if (chunk < (PAGE_SIZE >> 9) || | 3491 | if (chunk < (PAGE_SIZE >> 9) || |
3442 | !is_power_of_2(chunk)) | 3492 | !is_power_of_2(chunk)) |
@@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) | |||
3448 | geo->near_copies = nc; | 3498 | geo->near_copies = nc; |
3449 | geo->far_copies = fc; | 3499 | geo->far_copies = fc; |
3450 | geo->far_offset = fo; | 3500 | geo->far_offset = fo; |
3501 | geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks; | ||
3451 | geo->chunk_mask = chunk - 1; | 3502 | geo->chunk_mask = chunk - 1; |
3452 | geo->chunk_shift = ffz(~chunk); | 3503 | geo->chunk_shift = ffz(~chunk); |
3453 | return nc*fc; | 3504 | return nc*fc; |
@@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev) | |||
3569 | if (mddev->queue) { | 3620 | if (mddev->queue) { |
3570 | blk_queue_max_discard_sectors(mddev->queue, | 3621 | blk_queue_max_discard_sectors(mddev->queue, |
3571 | mddev->chunk_sectors); | 3622 | mddev->chunk_sectors); |
3623 | blk_queue_max_write_same_sectors(mddev->queue, | ||
3624 | mddev->chunk_sectors); | ||
3572 | blk_queue_io_min(mddev->queue, chunk_size); | 3625 | blk_queue_io_min(mddev->queue, chunk_size); |
3573 | if (conf->geo.raid_disks % conf->geo.near_copies) | 3626 | if (conf->geo.raid_disks % conf->geo.near_copies) |
3574 | blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); | 3627 | blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 1054cf602345..157d69e83ff4 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h | |||
@@ -33,6 +33,11 @@ struct r10conf { | |||
33 | * far_offset, in which case it is | 33 | * far_offset, in which case it is |
34 | * 1 stripe. | 34 | * 1 stripe. |
35 | */ | 35 | */ |
36 | int far_set_size; /* The number of devices in a set, | ||
37 | * where a 'set' are devices that | ||
38 | * contain far/offset copies of | ||
39 | * each other. | ||
40 | */ | ||
36 | int chunk_shift; /* shift from chunks to sectors */ | 41 | int chunk_shift; /* shift from chunks to sectors */ |
37 | sector_t chunk_mask; | 42 | sector_t chunk_mask; |
38 | } prev, geo; | 43 | } prev, geo; |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5af2d2709081..3ee2912889e7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -1403,7 +1403,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu | |||
1403 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); | 1403 | &sh->ops.zero_sum_result, percpu->spare_page, &submit); |
1404 | } | 1404 | } |
1405 | 1405 | ||
1406 | static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | 1406 | static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) |
1407 | { | 1407 | { |
1408 | int overlap_clear = 0, i, disks = sh->disks; | 1408 | int overlap_clear = 0, i, disks = sh->disks; |
1409 | struct dma_async_tx_descriptor *tx = NULL; | 1409 | struct dma_async_tx_descriptor *tx = NULL; |
@@ -1468,36 +1468,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | |||
1468 | put_cpu(); | 1468 | put_cpu(); |
1469 | } | 1469 | } |
1470 | 1470 | ||
1471 | #ifdef CONFIG_MULTICORE_RAID456 | ||
1472 | static void async_run_ops(void *param, async_cookie_t cookie) | ||
1473 | { | ||
1474 | struct stripe_head *sh = param; | ||
1475 | unsigned long ops_request = sh->ops.request; | ||
1476 | |||
1477 | clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); | ||
1478 | wake_up(&sh->ops.wait_for_ops); | ||
1479 | |||
1480 | __raid_run_ops(sh, ops_request); | ||
1481 | release_stripe(sh); | ||
1482 | } | ||
1483 | |||
1484 | static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) | ||
1485 | { | ||
1486 | /* since handle_stripe can be called outside of raid5d context | ||
1487 | * we need to ensure sh->ops.request is de-staged before another | ||
1488 | * request arrives | ||
1489 | */ | ||
1490 | wait_event(sh->ops.wait_for_ops, | ||
1491 | !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); | ||
1492 | sh->ops.request = ops_request; | ||
1493 | |||
1494 | atomic_inc(&sh->count); | ||
1495 | async_schedule(async_run_ops, sh); | ||
1496 | } | ||
1497 | #else | ||
1498 | #define raid_run_ops __raid_run_ops | ||
1499 | #endif | ||
1500 | |||
1501 | static int grow_one_stripe(struct r5conf *conf) | 1471 | static int grow_one_stripe(struct r5conf *conf) |
1502 | { | 1472 | { |
1503 | struct stripe_head *sh; | 1473 | struct stripe_head *sh; |
@@ -1506,9 +1476,6 @@ static int grow_one_stripe(struct r5conf *conf) | |||
1506 | return 0; | 1476 | return 0; |
1507 | 1477 | ||
1508 | sh->raid_conf = conf; | 1478 | sh->raid_conf = conf; |
1509 | #ifdef CONFIG_MULTICORE_RAID456 | ||
1510 | init_waitqueue_head(&sh->ops.wait_for_ops); | ||
1511 | #endif | ||
1512 | 1479 | ||
1513 | spin_lock_init(&sh->stripe_lock); | 1480 | spin_lock_init(&sh->stripe_lock); |
1514 | 1481 | ||
@@ -1627,9 +1594,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) | |||
1627 | break; | 1594 | break; |
1628 | 1595 | ||
1629 | nsh->raid_conf = conf; | 1596 | nsh->raid_conf = conf; |
1630 | #ifdef CONFIG_MULTICORE_RAID456 | ||
1631 | init_waitqueue_head(&nsh->ops.wait_for_ops); | ||
1632 | #endif | ||
1633 | spin_lock_init(&nsh->stripe_lock); | 1597 | spin_lock_init(&nsh->stripe_lock); |
1634 | 1598 | ||
1635 | list_add(&nsh->lru, &newstripes); | 1599 | list_add(&nsh->lru, &newstripes); |