Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig    |  11
-rw-r--r--  drivers/md/dm-raid.c  | 123
-rw-r--r--  drivers/md/md.c       |  19
-rw-r--r--  drivers/md/raid0.c    |  13
-rw-r--r--  drivers/md/raid1.c    |   8
-rw-r--r--  drivers/md/raid10.c   |  97
-rw-r--r--  drivers/md/raid10.h   |   5
-rw-r--r--  drivers/md/raid5.c    |  38
8 files changed, 219 insertions(+), 95 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index e30b490055aa..4d8d90b4fe78 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -154,17 +154,6 @@ config MD_RAID456
 
 	  If unsure, say Y.
 
-config MULTICORE_RAID456
-	bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
-	depends on MD_RAID456
-	depends on SMP
-	depends on EXPERIMENTAL
-	---help---
-	  Enable the raid456 module to dispatch per-stripe raid operations to a
-	  thread pool.
-
-	  If unsure, say N.
-
 config MD_MULTIPATH
 	tristate "Multipath I/O support"
 	depends on BLK_DEV_MD
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 9a01d1e4c783..311e3d35b272 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -91,15 +91,44 @@ static struct raid_type {
 	{"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
 };
 
+static char *raid10_md_layout_to_format(int layout)
+{
+	/*
+	 * Bit 16 and 17 stand for "offset" and "use_far_sets"
+	 * Refer to MD's raid10.c for details
+	 */
+	if ((layout & 0x10000) && (layout & 0x20000))
+		return "offset";
+
+	if ((layout & 0xFF) > 1)
+		return "near";
+
+	return "far";
+}
+
 static unsigned raid10_md_layout_to_copies(int layout)
 {
-	return layout & 0xFF;
+	if ((layout & 0xFF) > 1)
+		return layout & 0xFF;
+	return (layout >> 8) & 0xFF;
 }
 
 static int raid10_format_to_md_layout(char *format, unsigned copies)
 {
-	/* 1 "far" copy, and 'copies' "near" copies */
-	return (1 << 8) | (copies & 0xFF);
+	unsigned n = 1, f = 1;
+
+	if (!strcmp("near", format))
+		n = copies;
+	else
+		f = copies;
+
+	if (!strcmp("offset", format))
+		return 0x30000 | (f << 8) | n;
+
+	if (!strcmp("far", format))
+		return 0x20000 | (f << 8) | n;
+
+	return (f << 8) | n;
 }
 
 static struct raid_type *get_raid_type(char *name)
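
The two helpers above pack dm-raid's "format" string into MD's RAID10 layout word: near count in the low byte, far count in the second byte, bit 16 for "offset", bit 17 for "use_far_sets". A minimal user-space sketch of that encoding, assuming only the bit meanings documented in the patch (this is an illustration, not kernel code):

#include <stdio.h>
#include <string.h>

static int format_to_layout(const char *format, unsigned copies)
{
	unsigned n = 1, f = 1;

	if (!strcmp("near", format))
		n = copies;
	else
		f = copies;

	if (!strcmp("offset", format))
		return 0x30000 | (f << 8) | n;
	if (!strcmp("far", format))
		return 0x20000 | (f << 8) | n;
	return (f << 8) | n;
}

int main(void)
{
	/* expected: near/2 = 0x102, far/2 = 0x20201, offset/2 = 0x30201 */
	printf("near/2   = 0x%X\n", format_to_layout("near", 2));
	printf("far/2    = 0x%X\n", format_to_layout("far", 2));
	printf("offset/2 = 0x%X\n", format_to_layout("offset", 2));
	return 0;
}
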
@@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
 {
 	unsigned i, rebuild_cnt = 0;
 	unsigned rebuilds_per_group, copies, d;
+	unsigned group_size, last_group_start;
 
 	for (i = 0; i < rs->md.raid_disks; i++)
 		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
@@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs)
 	 * as long as the failed devices occur in different mirror
 	 * groups (i.e. different stripes).
 	 *
-	 * Right now, we only allow for "near" copies.  When other
-	 * formats are added, we will have to check those too.
-	 *
 	 * When checking "near" format, make sure no adjacent devices
 	 * have failed beyond what can be handled.  In addition to the
 	 * simple case where the number of devices is a multiple of the
@@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs)
 	 *          A    A    B    B    C
 	 *          C    D    D    E    E
 	 */
-		for (i = 0; i < rs->md.raid_disks * copies; i++) {
-			if (!(i % copies))
+		if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
+			for (i = 0; i < rs->md.raid_disks * copies; i++) {
+				if (!(i % copies))
+					rebuilds_per_group = 0;
+				d = i % rs->md.raid_disks;
+				if ((!rs->dev[d].rdev.sb_page ||
+				     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+				    (++rebuilds_per_group >= copies))
+					goto too_many;
+			}
+			break;
+		}
+
+		/*
+		 * When checking "far" and "offset" formats, we need to ensure
+		 * that the device that holds its copy is not also dead or
+		 * being rebuilt.  (Note that "far" and "offset" formats only
+		 * support two copies right now.  These formats also only ever
+		 * use the 'use_far_sets' variant.)
+		 *
+		 * This check is somewhat complicated by the need to account
+		 * for arrays that are not a multiple of (far) copies.  This
+		 * results in the need to treat the last (potentially larger)
+		 * set differently.
+		 */
+		group_size = (rs->md.raid_disks / copies);
+		last_group_start = (rs->md.raid_disks / group_size) - 1;
+		last_group_start *= group_size;
+		for (i = 0; i < rs->md.raid_disks; i++) {
+			if (!(i % copies) && !(i > last_group_start))
 				rebuilds_per_group = 0;
-			d = i % rs->md.raid_disks;
-			if ((!rs->dev[d].rdev.sb_page ||
-			     !test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
+			if ((!rs->dev[i].rdev.sb_page ||
+			     !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
 			    (++rebuilds_per_group >= copies))
 				goto too_many;
 		}
 		break;
 	default:
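
The last-group arithmetic above is easiest to see with numbers: with raid_disks = 5 and copies = 2, group_size is 2 and last_group_start is 2, so devices 2-4 form one larger trailing set and the rebuild counter is only reset at set boundaries. A user-space sketch of just that arithmetic, with illustrative values:

#include <stdio.h>

int main(void)
{
	unsigned raid_disks = 5, copies = 2;
	unsigned group_size = raid_disks / copies;              /* 2 */
	unsigned last_group_start =
		((raid_disks / group_size) - 1) * group_size;   /* 2 */

	for (unsigned i = 0; i < raid_disks; i++)
		printf("dev %u -> %s group\n", i,
		       (i >= last_group_start) ? "last (larger)" : "regular");
	return 0;
}
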
@@ -433,7 +487,7 @@ too_many:
  *
  * RAID10-only options:
  *    [raid10_copies <# copies>]        Number of copies.  (Default: 2)
- *    [raid10_format <near>]            Layout algorithm.  (Default: near)
+ *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
  */
 static int parse_raid_params(struct raid_set *rs, char **argv,
 			     unsigned num_raid_params)
@@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
 				return -EINVAL;
 			}
-			if (strcmp("near", argv[i])) {
+			if (strcmp("near", argv[i]) &&
+			    strcmp("far", argv[i]) &&
+			    strcmp("offset", argv[i])) {
 				rs->ti->error = "Invalid 'raid10_format' value given";
 				return -EINVAL;
 			}
@@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			return -EINVAL;
 		}
 
+		/*
+		 * If the format is not "near", we only support
+		 * two copies at the moment.
+		 */
+		if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
+			rs->ti->error = "Too many copies for given RAID10 format.";
+			return -EINVAL;
+		}
+
 		/* (Len * #mirrors) / #devices */
 		sectors_per_dev = rs->ti->len * raid10_copies;
 		sector_div(sectors_per_dev, rs->md.raid_disks);
@@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 	/*
 	 * Reshaping is not currently allowed
 	 */
-	if ((le32_to_cpu(sb->level) != mddev->level) ||
-	    (le32_to_cpu(sb->layout) != mddev->layout) ||
-	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
-		DMERR("Reshaping arrays not yet supported.");
+	if (le32_to_cpu(sb->level) != mddev->level) {
+		DMERR("Reshaping arrays not yet supported. (RAID level change)");
+		return -EINVAL;
+	}
+	if (le32_to_cpu(sb->layout) != mddev->layout) {
+		DMERR("Reshaping arrays not yet supported. (RAID layout change)");
+		DMERR("  0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
+		DMERR("  Old layout: %s w/ %d copies",
+		      raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
+		      raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
+		DMERR("  New layout: %s w/ %d copies",
+		      raid10_md_layout_to_format(mddev->layout),
+		      raid10_md_layout_to_copies(mddev->layout));
+		return -EINVAL;
+	}
+	if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
+		DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
 		return -EINVAL;
 	}
 
 	/* We can only change the number of devices in RAID1 right now */
 	if ((rs->raid_type->level != 1) &&
 	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
-		DMERR("Reshaping arrays not yet supported.");
+		DMERR("Reshaping arrays not yet supported. (device count change)");
 		return -EINVAL;
 	}
 
@@ -1329,7 +1407,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 			  raid10_md_layout_to_copies(rs->md.layout));
 
 		if (rs->print_flags & DMPF_RAID10_FORMAT)
-			DMEMIT(" raid10_format near");
+			DMEMIT(" raid10_format %s",
+			       raid10_md_layout_to_format(rs->md.layout));
 
 		DMEMIT(" %d", rs->md.raid_disks);
 		for (i = 0; i < rs->md.raid_disks; i++) {
@@ -1418,6 +1497,10 @@ static struct target_type raid_target = {
 
 static int __init dm_raid_init(void)
 {
+	DMINFO("Loading target version %u.%u.%u",
+	       raid_target.version[0],
+	       raid_target.version[1],
+	       raid_target.version[2]);
 	return dm_register_target(&raid_target);
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3db3d1b271f7..fcb878f88796 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -307,6 +307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
 		bio_io_error(bio);
 		return;
 	}
+	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
+		bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
+		return;
+	}
 	smp_rmb(); /* Ensure implications of 'active' are visible */
 	rcu_read_lock();
 	if (mddev->suspended) {
@@ -2994,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
 		} else if (!sectors)
 			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
 				rdev->data_offset;
+		if (!my_mddev->pers->resize)
+			/* Cannot change size for RAID0 or Linear etc */
+			return -EINVAL;
 	}
 	if (sectors < my_mddev->dev_sectors)
 		return -EINVAL; /* component must fit device */
@@ -6525,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 			mddev->ro = 0;
 			sysfs_notify_dirent_safe(mddev->sysfs_state);
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-			md_wakeup_thread(mddev->thread);
+			/* mddev_unlock will wake thread */
+			/* If a device failed while we were read-only, we
+			 * need to make sure the metadata is updated now.
+			 */
+			if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
+				mddev_unlock(mddev);
+				wait_event(mddev->sb_wait,
+					   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
+					   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+				mddev_lock(mddev);
+			}
 		} else {
 			err = -EROFS;
 			goto abort_unlock;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 24b359717a7e..0505452de8d6 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 			rdev1->new_raid_disk = j;
 		}
 
-		if (j < 0 || j >= mddev->raid_disks) {
+		if (j < 0) {
+			printk(KERN_ERR
+			       "md/raid0:%s: remove inactive devices before converting to RAID0\n",
+			       mdname(mddev));
+			goto abort;
+		}
+		if (j >= mddev->raid_disks) {
 			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
 				"aborting!\n", mdname(mddev), j);
 			goto abort;
@@ -289,7 +295,7 @@ abort:
 	kfree(conf->strip_zone);
 	kfree(conf->devlist);
 	kfree(conf);
-	*private_conf = NULL;
+	*private_conf = ERR_PTR(err);
 	return err;
 }
 
@@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
 		  "%s does not support generic reshape\n", __func__);
 
 	rdev_for_each(rdev, mddev)
-		array_sectors += rdev->sectors;
+		array_sectors += (rdev->sectors &
+				  ~(sector_t)(mddev->chunk_sectors-1));
 
 	return array_sectors;
 }
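
The raid0_size() change rounds each member's contribution down to a whole number of chunks, since a trailing partial chunk can never be addressed by the striping math. chunk_sectors is a power of two, so a mask suffices; a small sketch with made-up sizes:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sectors = 1000, chunk_sectors = 128;
	/* round down to a multiple of the chunk size: 1000 -> 896 */
	uint64_t usable = sectors & ~(chunk_sectors - 1);

	printf("%llu usable of %llu\n",
	       (unsigned long long)usable, (unsigned long long)sectors);
	return 0;
}
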
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d5bddfc4010e..fd86b372692d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -967,6 +967,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		conf->pending_count += plug->pending_cnt;
 		spin_unlock_irq(&conf->device_lock);
+		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(mddev->thread);
 		kfree(plug);
 		return;
@@ -1000,6 +1001,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	const unsigned long do_discard = (bio->bi_rw
 					  & (REQ_DISCARD | REQ_SECURE));
+	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
 	struct raid1_plug_cb *plug = NULL;
@@ -1301,7 +1303,8 @@ read_again:
 			   conf->mirrors[i].rdev->data_offset);
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io	= raid1_end_write_request;
-		mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard;
+		mbio->bi_rw =
+			WRITE | do_flush_fua | do_sync | do_discard | do_same;
 		mbio->bi_private = r1_bio;
 
 		atomic_inc(&r1_bio->remaining);
@@ -2818,6 +2821,9 @@ static int run(struct mddev *mddev)
 	if (IS_ERR(conf))
 		return PTR_ERR(conf);
 
+	if (mddev->queue)
+		blk_queue_max_write_same_sectors(mddev->queue,
+						 mddev->chunk_sectors);
 	rdev_for_each(rdev, mddev) {
 		if (!mddev->gendisk)
 			continue;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 64d48249c03b..77b562d18a90 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -38,21 +38,36 @@
  *    near_copies (stored in low byte of layout)
  *    far_copies (stored in second byte of layout)
  *    far_offset (stored in bit 16 of layout )
+ *    use_far_sets (stored in bit 17 of layout )
  *
- * The data to be stored is divided into chunks using chunksize.
- * Each device is divided into far_copies sections.
- * In each section, chunks are laid out in a style similar to raid0, but
- * near_copies copies of each chunk is stored (each on a different drive).
- * The starting device for each section is offset near_copies from the starting
- * device of the previous section.
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different
- * drive.
- * near_copies and far_copies must be at least one, and their product is at most
- * raid_disks.
+ * The data to be stored is divided into chunks using chunksize.  Each device
+ * is divided into far_copies sections.  In each section, chunks are laid out
+ * in a style similar to raid0, but near_copies copies of each chunk is stored
+ * (each on a different drive).  The starting device for each section is offset
+ * near_copies from the starting device of the previous section.  Thus there
+ * are (near_copies * far_copies) of each chunk, and each is on a different
+ * drive.  near_copies and far_copies must be at least one, and their product
+ * is at most raid_disks.
  *
  * If far_offset is true, then the far_copies are handled a bit differently.
- * The copies are still in different stripes, but instead of be very far apart
- * on disk, there are adjacent stripes.
+ * The copies are still in different stripes, but instead of being very far
+ * apart on disk, there are adjacent stripes.
+ *
+ * The far and offset algorithms are handled slightly differently if
+ * 'use_far_sets' is true.  In this case, the array's devices are grouped into
+ * sets that are (near_copies * far_copies) in size.  The far copied stripes
+ * are still shifted by 'near_copies' devices, but this shifting stays confined
+ * to the set rather than the entire array.  This is done to improve the number
+ * of device combinations that can fail without causing the array to fail.
+ * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
+ * on a device):
+ *    A B C D    A B C D E
+ *      ...         ...
+ *    D A B C    E A B C D
+ * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
+ *    [A B] [C D]    [A B] [C D E]
+ *    |...| |...|    |...| | ... |
+ *    [B A] [D C]    [B A] [E C D]
  */
 
 /*
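
For the uniform case in the comment above ([A B] [C D] -> [B A] [D C]), the far copy simply moves near_copies devices ahead, wrapping within the set instead of the whole array. A toy model of that mapping, for uniform sets only (the irregular last set is sketched further below):

#include <stdio.h>

int main(void)
{
	int raid_disks = 4, near_copies = 1, far_copies = 2;
	int far_set_size = near_copies * far_copies;	/* 2 */

	for (int d = 0; d < raid_disks; d++) {
		int set = d / far_set_size;
		int far = (d + near_copies) % far_set_size
			  + set * far_set_size;
		printf("chunk on dev %d -> far copy on dev %d\n", d, far);
	}
	return 0;	/* prints 0->1, 1->0, 2->3, 3->2 */
}
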
@@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	sector_t stripe;
 	int dev;
 	int slot = 0;
+	int last_far_set_start, last_far_set_size;
+
+	last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+	last_far_set_start *= geo->far_set_size;
+
+	last_far_set_size = geo->far_set_size;
+	last_far_set_size += (geo->raid_disks % geo->far_set_size);
 
 	/* now calculate first sector/dev */
 	chunk = r10bio->sector >> geo->chunk_shift;
@@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
 	/* and calculate all the others */
 	for (n = 0; n < geo->near_copies; n++) {
 		int d = dev;
+		int set;
 		sector_t s = sector;
-		r10bio->devs[slot].addr = sector;
 		r10bio->devs[slot].devnum = d;
+		r10bio->devs[slot].addr = s;
 		slot++;
 
 		for (f = 1; f < geo->far_copies; f++) {
+			set = d / geo->far_set_size;
 			d += geo->near_copies;
-			if (d >= geo->raid_disks)
-				d -= geo->raid_disks;
+
+			if ((geo->raid_disks % geo->far_set_size) &&
+			    (d > last_far_set_start)) {
+				d -= last_far_set_start;
+				d %= last_far_set_size;
+				d += last_far_set_start;
+			} else {
+				d %= geo->far_set_size;
+				d += geo->far_set_size * set;
+			}
 			s += geo->stride;
 			r10bio->devs[slot].devnum = d;
 			r10bio->devs[slot].addr = s;
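
When raid_disks is not a multiple of far_set_size, the loop above wraps the last, larger set separately. Replaying that wrap logic in user space for a 5-disk 'far' array (far_set_size = 2, last set = devices 2-4) reproduces the [C D E] -> [E C D] diagram; the values are illustrative only:

#include <stdio.h>

int main(void)
{
	int raid_disks = 5, near_copies = 1, far_set_size = 2;
	int last_far_set_start =
		((raid_disks / far_set_size) - 1) * far_set_size;  /* 2 */
	int last_far_set_size =
		far_set_size + (raid_disks % far_set_size);        /* 3 */

	for (int dev = 0; dev < raid_disks; dev++) {
		int set = dev / far_set_size;
		int d = dev + near_copies;

		if ((raid_disks % far_set_size) && (d > last_far_set_start)) {
			d -= last_far_set_start;
			d %= last_far_set_size;
			d += last_far_set_start;
		} else {
			d %= far_set_size;
			d += far_set_size * set;
		}
		printf("dev %d -> far copy on dev %d\n", dev, d);
	}
	return 0;	/* prints 0->1, 1->0, 2->3, 3->4, 4->2 */
}
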
@@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 	 * or recovery, so reshape isn't happening
 	 */
 	struct geom *geo = &conf->geo;
+	int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
+	int far_set_size = geo->far_set_size;
+	int last_far_set_start;
+
+	if (geo->raid_disks % geo->far_set_size) {
+		last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
+		last_far_set_start *= geo->far_set_size;
+
+		if (dev >= last_far_set_start) {
+			far_set_size = geo->far_set_size;
+			far_set_size += (geo->raid_disks % geo->far_set_size);
+			far_set_start = last_far_set_start;
+		}
+	}
 
 	offset = sector & geo->chunk_mask;
 	if (geo->far_offset) {
@@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 		chunk = sector >> geo->chunk_shift;
 		fc = sector_div(chunk, geo->far_copies);
 		dev -= fc * geo->near_copies;
-		if (dev < 0)
-			dev += geo->raid_disks;
+		if (dev < far_set_start)
+			dev += far_set_size;
 	} else {
 		while (sector >= geo->stride) {
 			sector -= geo->stride;
-			if (dev < geo->near_copies)
-				dev += geo->raid_disks - geo->near_copies;
+			if (dev < (geo->near_copies + far_set_start))
+				dev += far_set_size - geo->near_copies;
 			else
 				dev -= geo->near_copies;
 		}
@@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		conf->pending_count += plug->pending_cnt;
 		spin_unlock_irq(&conf->device_lock);
+		wake_up(&conf->wait_barrier);
 		md_wakeup_thread(mddev->thread);
 		kfree(plug);
 		return;
@@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	const unsigned long do_discard = (bio->bi_rw
 					  & (REQ_DISCARD | REQ_SECURE));
+	const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	struct blk_plug_cb *cb;
@@ -1460,7 +1508,8 @@ retry_write:
 							      rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
-			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
 			mbio->bi_private = r10_bio;
 
 			atomic_inc(&r10_bio->remaining);
@@ -1502,7 +1551,8 @@ retry_write:
 							      r10_bio, rdev));
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
-			mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
+			mbio->bi_rw =
+				WRITE | do_sync | do_fua | do_discard | do_same;
 			mbio->bi_private = r10_bio;
 
 			atomic_inc(&r10_bio->remaining);
@@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 		disks = mddev->raid_disks + mddev->delta_disks;
 		break;
 	}
-	if (layout >> 17)
+	if (layout >> 18)
 		return -1;
 	if (chunk < (PAGE_SIZE >> 9) ||
 	    !is_power_of_2(chunk))
@@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
 	geo->near_copies = nc;
 	geo->far_copies = fc;
 	geo->far_offset = fo;
+	geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
 	geo->chunk_mask = chunk - 1;
 	geo->chunk_shift = ffz(~chunk);
 	return nc*fc;
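
setup_geo() now keys the set size off bit 17: old layouts (bit clear) keep one set spanning the whole array, so existing arrays are read back unchanged, while new layouts get disks/fc devices per set. A quick decode of two example layout words (values assumed, following the encoding described earlier):

#include <stdio.h>

int main(void)
{
	int disks = 10, fc = 2;
	int layouts[] = { 0x201, 0x20201 }; /* far=2; far=2 + use_far_sets */

	for (int i = 0; i < 2; i++) {
		int layout = layouts[i];
		int far_set_size = (layout & (1 << 17)) ? disks / fc : disks;

		printf("layout 0x%X -> far_set_size %d\n",
		       layout, far_set_size);
	}
	return 0;	/* 0x201 -> 10, 0x20201 -> 5 */
}
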
@@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev)
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
 					      mddev->chunk_sectors);
+		blk_queue_max_write_same_sectors(mddev->queue,
+						 mddev->chunk_sectors);
 		blk_queue_io_min(mddev->queue, chunk_size);
 		if (conf->geo.raid_disks % conf->geo.near_copies)
 			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1054cf602345..157d69e83ff4 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -33,6 +33,11 @@ struct r10conf {
 							 * far_offset, in which case it is
 							 * 1 stripe.
 							 */
+		int	far_set_size;   /* The number of devices in a set,
+					 * where a 'set' are devices that
+					 * contain far/offset copies of
+					 * each other.
+					 */
 		int	chunk_shift;	/* shift from chunks to sectors */
 		sector_t chunk_mask;
 	} prev, geo;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5af2d2709081..3ee2912889e7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1403,7 +1403,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
 }
 
-static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
+static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 {
 	int overlap_clear = 0, i, disks = sh->disks;
 	struct dma_async_tx_descriptor *tx = NULL;
@@ -1468,36 +1468,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	put_cpu();
 }
 
-#ifdef CONFIG_MULTICORE_RAID456
-static void async_run_ops(void *param, async_cookie_t cookie)
-{
-	struct stripe_head *sh = param;
-	unsigned long ops_request = sh->ops.request;
-
-	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
-	wake_up(&sh->ops.wait_for_ops);
-
-	__raid_run_ops(sh, ops_request);
-	release_stripe(sh);
-}
-
-static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
-{
-	/* since handle_stripe can be called outside of raid5d context
-	 * we need to ensure sh->ops.request is de-staged before another
-	 * request arrives
-	 */
-	wait_event(sh->ops.wait_for_ops,
-		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
-	sh->ops.request = ops_request;
-
-	atomic_inc(&sh->count);
-	async_schedule(async_run_ops, sh);
-}
-#else
-#define raid_run_ops __raid_run_ops
-#endif
-
 static int grow_one_stripe(struct r5conf *conf)
 {
 	struct stripe_head *sh;
@@ -1506,9 +1476,6 @@ static int grow_one_stripe(struct r5conf *conf)
 		return 0;
 
 	sh->raid_conf = conf;
-	#ifdef CONFIG_MULTICORE_RAID456
-	init_waitqueue_head(&sh->ops.wait_for_ops);
-	#endif
 
 	spin_lock_init(&sh->stripe_lock);
 
@@ -1627,9 +1594,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 			break;
 
 		nsh->raid_conf = conf;
-		#ifdef CONFIG_MULTICORE_RAID456
-		init_waitqueue_head(&nsh->ops.wait_for_ops);
-		#endif
 		spin_lock_init(&nsh->stripe_lock);
 
 		list_add(&nsh->lru, &newstripes);