aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-08-01 12:02:01 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-08-01 12:02:01 -0400
commitfcff06c438b60f415af5983efe92811d6aa02ad1 (patch)
tree704f6598b2de60a86774bc5cf152d4f051bd2dc4 /drivers/md
parent068535f1fef4c90aee23eb7b9b9a71c5b72d7cd0 (diff)
parent63f33b8dda88923487004b20fba825486d009e7b (diff)
Merge branch 'for-next' of git://neil.brown.name/md
Pull md updates from NeilBrown. * 'for-next' of git://neil.brown.name/md: DM RAID: Add support for MD RAID10 md/RAID1: Add missing case for attempting to repair known bad blocks. md/raid5: For odirect-write performance, do not set STRIPE_PREREAD_ACTIVE. md/raid1: don't abort a resync on the first badblock. md: remove duplicated test on ->openers when calling do_md_stop() raid5: Add R5_ReadNoMerge flag which prevent bio from merging at block layer md/raid1: prevent merging too large request md/raid1: read balance chooses idlest disk for SSD md/raid1: make sequential read detection per disk based MD RAID10: Export md_raid10_congested MD: Move macros from raid1*.h to raid1*.c MD RAID1: rename mirror_info structure MD RAID10: rename mirror_info structure MD RAID10: Fix compiler warning. raid5: add a per-stripe lock raid5: remove unnecessary bitmap write optimization raid5: lockless access raid5 overrided bi_phys_segments raid5: reduce chance release_stripe() taking device_lock
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/dm-raid.c95
-rw-r--r--drivers/md/md.c8
-rw-r--r--drivers/md/raid1.c164
-rw-r--r--drivers/md/raid1.h30
-rw-r--r--drivers/md/raid10.c92
-rw-r--r--drivers/md/raid10.h23
-rw-r--r--drivers/md/raid5.c205
-rw-r--r--drivers/md/raid5.h2
8 files changed, 400 insertions, 219 deletions
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index f2f29c526544..982e3e390c45 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -11,6 +11,7 @@
11#include "md.h" 11#include "md.h"
12#include "raid1.h" 12#include "raid1.h"
13#include "raid5.h" 13#include "raid5.h"
14#include "raid10.h"
14#include "bitmap.h" 15#include "bitmap.h"
15 16
16#include <linux/device-mapper.h> 17#include <linux/device-mapper.h>
@@ -52,7 +53,10 @@ struct raid_dev {
52#define DMPF_MAX_RECOVERY_RATE 0x20 53#define DMPF_MAX_RECOVERY_RATE 0x20
53#define DMPF_MAX_WRITE_BEHIND 0x40 54#define DMPF_MAX_WRITE_BEHIND 0x40
54#define DMPF_STRIPE_CACHE 0x80 55#define DMPF_STRIPE_CACHE 0x80
55#define DMPF_REGION_SIZE 0X100 56#define DMPF_REGION_SIZE 0x100
57#define DMPF_RAID10_COPIES 0x200
58#define DMPF_RAID10_FORMAT 0x400
59
56struct raid_set { 60struct raid_set {
57 struct dm_target *ti; 61 struct dm_target *ti;
58 62
@@ -76,6 +80,7 @@ static struct raid_type {
76 const unsigned algorithm; /* RAID algorithm. */ 80 const unsigned algorithm; /* RAID algorithm. */
77} raid_types[] = { 81} raid_types[] = {
78 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, 82 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
83 {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */},
79 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, 84 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
80 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, 85 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
81 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, 86 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -86,6 +91,17 @@ static struct raid_type {
86 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} 91 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
87}; 92};
88 93
94static unsigned raid10_md_layout_to_copies(int layout)
95{
96 return layout & 0xFF;
97}
98
99static int raid10_format_to_md_layout(char *format, unsigned copies)
100{
101 /* 1 "far" copy, and 'copies' "near" copies */
102 return (1 << 8) | (copies & 0xFF);
103}
104
89static struct raid_type *get_raid_type(char *name) 105static struct raid_type *get_raid_type(char *name)
90{ 106{
91 int i; 107 int i;
@@ -339,10 +355,16 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
339 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) 355 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
340 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs 356 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
341 * [region_size <sectors>] Defines granularity of bitmap 357 * [region_size <sectors>] Defines granularity of bitmap
358 *
359 * RAID10-only options:
360 * [raid10_copies <# copies>] Number of copies. (Default: 2)
361 * [raid10_format <near>] Layout algorithm. (Default: near)
342 */ 362 */
343static int parse_raid_params(struct raid_set *rs, char **argv, 363static int parse_raid_params(struct raid_set *rs, char **argv,
344 unsigned num_raid_params) 364 unsigned num_raid_params)
345{ 365{
366 char *raid10_format = "near";
367 unsigned raid10_copies = 2;
346 unsigned i, rebuild_cnt = 0; 368 unsigned i, rebuild_cnt = 0;
347 unsigned long value, region_size = 0; 369 unsigned long value, region_size = 0;
348 sector_t sectors_per_dev = rs->ti->len; 370 sector_t sectors_per_dev = rs->ti->len;
@@ -416,11 +438,28 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
416 } 438 }
417 439
418 key = argv[i++]; 440 key = argv[i++];
441
442 /* Parameters that take a string value are checked here. */
443 if (!strcasecmp(key, "raid10_format")) {
444 if (rs->raid_type->level != 10) {
445 rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
446 return -EINVAL;
447 }
448 if (strcmp("near", argv[i])) {
449 rs->ti->error = "Invalid 'raid10_format' value given";
450 return -EINVAL;
451 }
452 raid10_format = argv[i];
453 rs->print_flags |= DMPF_RAID10_FORMAT;
454 continue;
455 }
456
419 if (strict_strtoul(argv[i], 10, &value) < 0) { 457 if (strict_strtoul(argv[i], 10, &value) < 0) {
420 rs->ti->error = "Bad numerical argument given in raid params"; 458 rs->ti->error = "Bad numerical argument given in raid params";
421 return -EINVAL; 459 return -EINVAL;
422 } 460 }
423 461
462 /* Parameters that take a numeric value are checked here */
424 if (!strcasecmp(key, "rebuild")) { 463 if (!strcasecmp(key, "rebuild")) {
425 rebuild_cnt++; 464 rebuild_cnt++;
426 465
@@ -439,6 +478,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
439 return -EINVAL; 478 return -EINVAL;
440 } 479 }
441 break; 480 break;
481 case 10:
442 default: 482 default:
443 DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); 483 DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
444 rs->ti->error = "Rebuild not supported for this RAID type"; 484 rs->ti->error = "Rebuild not supported for this RAID type";
@@ -495,7 +535,8 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
495 */ 535 */
496 value /= 2; 536 value /= 2;
497 537
498 if (rs->raid_type->level < 5) { 538 if ((rs->raid_type->level != 5) &&
539 (rs->raid_type->level != 6)) {
499 rs->ti->error = "Inappropriate argument: stripe_cache"; 540 rs->ti->error = "Inappropriate argument: stripe_cache";
500 return -EINVAL; 541 return -EINVAL;
501 } 542 }
@@ -520,6 +561,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
520 } else if (!strcasecmp(key, "region_size")) { 561 } else if (!strcasecmp(key, "region_size")) {
521 rs->print_flags |= DMPF_REGION_SIZE; 562 rs->print_flags |= DMPF_REGION_SIZE;
522 region_size = value; 563 region_size = value;
564 } else if (!strcasecmp(key, "raid10_copies") &&
565 (rs->raid_type->level == 10)) {
566 if ((value < 2) || (value > 0xFF)) {
567 rs->ti->error = "Bad value for 'raid10_copies'";
568 return -EINVAL;
569 }
570 rs->print_flags |= DMPF_RAID10_COPIES;
571 raid10_copies = value;
523 } else { 572 } else {
524 DMERR("Unable to parse RAID parameter: %s", key); 573 DMERR("Unable to parse RAID parameter: %s", key);
525 rs->ti->error = "Unable to parse RAID parameters"; 574 rs->ti->error = "Unable to parse RAID parameters";
@@ -538,8 +587,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
538 if (dm_set_target_max_io_len(rs->ti, max_io_len)) 587 if (dm_set_target_max_io_len(rs->ti, max_io_len))
539 return -EINVAL; 588 return -EINVAL;
540 589
541 if ((rs->raid_type->level > 1) && 590 if (rs->raid_type->level == 10) {
542 sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) { 591 if (raid10_copies > rs->md.raid_disks) {
592 rs->ti->error = "Not enough devices to satisfy specification";
593 return -EINVAL;
594 }
595
596 /* (Len * #mirrors) / #devices */
597 sectors_per_dev = rs->ti->len * raid10_copies;
598 sector_div(sectors_per_dev, rs->md.raid_disks);
599
600 rs->md.layout = raid10_format_to_md_layout(raid10_format,
601 raid10_copies);
602 rs->md.new_layout = rs->md.layout;
603 } else if ((rs->raid_type->level > 1) &&
604 sector_div(sectors_per_dev,
605 (rs->md.raid_disks - rs->raid_type->parity_devs))) {
543 rs->ti->error = "Target length not divisible by number of data devices"; 606 rs->ti->error = "Target length not divisible by number of data devices";
544 return -EINVAL; 607 return -EINVAL;
545 } 608 }
@@ -566,6 +629,9 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
566 if (rs->raid_type->level == 1) 629 if (rs->raid_type->level == 1)
567 return md_raid1_congested(&rs->md, bits); 630 return md_raid1_congested(&rs->md, bits);
568 631
632 if (rs->raid_type->level == 10)
633 return md_raid10_congested(&rs->md, bits);
634
569 return md_raid5_congested(&rs->md, bits); 635 return md_raid5_congested(&rs->md, bits);
570} 636}
571 637
@@ -884,6 +950,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
884 case 6: 950 case 6:
885 redundancy = rs->raid_type->parity_devs; 951 redundancy = rs->raid_type->parity_devs;
886 break; 952 break;
953 case 10:
954 redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
955 break;
887 default: 956 default:
888 ti->error = "Unknown RAID type"; 957 ti->error = "Unknown RAID type";
889 return -EINVAL; 958 return -EINVAL;
@@ -1049,12 +1118,19 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
1049 goto bad; 1118 goto bad;
1050 } 1119 }
1051 1120
1121 if (ti->len != rs->md.array_sectors) {
1122 ti->error = "Array size does not match requested target length";
1123 ret = -EINVAL;
1124 goto size_mismatch;
1125 }
1052 rs->callbacks.congested_fn = raid_is_congested; 1126 rs->callbacks.congested_fn = raid_is_congested;
1053 dm_table_add_target_callbacks(ti->table, &rs->callbacks); 1127 dm_table_add_target_callbacks(ti->table, &rs->callbacks);
1054 1128
1055 mddev_suspend(&rs->md); 1129 mddev_suspend(&rs->md);
1056 return 0; 1130 return 0;
1057 1131
1132size_mismatch:
1133 md_stop(&rs->md);
1058bad: 1134bad:
1059 context_free(rs); 1135 context_free(rs);
1060 1136
@@ -1203,6 +1279,13 @@ static int raid_status(struct dm_target *ti, status_type_t type,
1203 DMEMIT(" region_size %lu", 1279 DMEMIT(" region_size %lu",
1204 rs->md.bitmap_info.chunksize >> 9); 1280 rs->md.bitmap_info.chunksize >> 9);
1205 1281
1282 if (rs->print_flags & DMPF_RAID10_COPIES)
1283 DMEMIT(" raid10_copies %u",
1284 raid10_md_layout_to_copies(rs->md.layout));
1285
1286 if (rs->print_flags & DMPF_RAID10_FORMAT)
1287 DMEMIT(" raid10_format near");
1288
1206 DMEMIT(" %d", rs->md.raid_disks); 1289 DMEMIT(" %d", rs->md.raid_disks);
1207 for (i = 0; i < rs->md.raid_disks; i++) { 1290 for (i = 0; i < rs->md.raid_disks; i++) {
1208 if (rs->dev[i].meta_dev) 1291 if (rs->dev[i].meta_dev)
@@ -1277,7 +1360,7 @@ static void raid_resume(struct dm_target *ti)
1277 1360
1278static struct target_type raid_target = { 1361static struct target_type raid_target = {
1279 .name = "raid", 1362 .name = "raid",
1280 .version = {1, 2, 0}, 1363 .version = {1, 3, 0},
1281 .module = THIS_MODULE, 1364 .module = THIS_MODULE,
1282 .ctr = raid_ctr, 1365 .ctr = raid_ctr,
1283 .dtr = raid_dtr, 1366 .dtr = raid_dtr,
@@ -1304,6 +1387,8 @@ module_init(dm_raid_init);
1304module_exit(dm_raid_exit); 1387module_exit(dm_raid_exit);
1305 1388
1306MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); 1389MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
1390MODULE_ALIAS("dm-raid1");
1391MODULE_ALIAS("dm-raid10");
1307MODULE_ALIAS("dm-raid4"); 1392MODULE_ALIAS("dm-raid4");
1308MODULE_ALIAS("dm-raid5"); 1393MODULE_ALIAS("dm-raid5");
1309MODULE_ALIAS("dm-raid6"); 1394MODULE_ALIAS("dm-raid6");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d5ab4493c8be..f6c46109b071 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3942,17 +3942,13 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
3942 break; 3942 break;
3943 case clear: 3943 case clear:
3944 /* stopping an active array */ 3944 /* stopping an active array */
3945 if (atomic_read(&mddev->openers) > 0)
3946 return -EBUSY;
3947 err = do_md_stop(mddev, 0, NULL); 3945 err = do_md_stop(mddev, 0, NULL);
3948 break; 3946 break;
3949 case inactive: 3947 case inactive:
3950 /* stopping an active array */ 3948 /* stopping an active array */
3951 if (mddev->pers) { 3949 if (mddev->pers)
3952 if (atomic_read(&mddev->openers) > 0)
3953 return -EBUSY;
3954 err = do_md_stop(mddev, 2, NULL); 3950 err = do_md_stop(mddev, 2, NULL);
3955 } else 3951 else
3956 err = 0; /* already inactive */ 3952 err = 0; /* already inactive */
3957 break; 3953 break;
3958 case suspended: 3954 case suspended:
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cacd008d6864..197f62681db5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,6 +46,20 @@
46 */ 46 */
47#define NR_RAID1_BIOS 256 47#define NR_RAID1_BIOS 256
48 48
49/* when we get a read error on a read-only array, we redirect to another
50 * device without failing the first device, or trying to over-write to
51 * correct the read error. To keep track of bad blocks on a per-bio
52 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
53 */
54#define IO_BLOCKED ((struct bio *)1)
55/* When we successfully write to a known bad-block, we need to remove the
56 * bad-block marking which must be done from process context. So we record
57 * the success by setting bios[n] to IO_MADE_GOOD
58 */
59#define IO_MADE_GOOD ((struct bio *)2)
60
61#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
62
49/* When there are this many requests queued to be written by 63/* When there are this many requests queued to be written by
50 * the raid1 thread, we become 'congested' to provide back-pressure 64 * the raid1 thread, we become 'congested' to provide back-pressure
51 * for writeback. 65 * for writeback.
@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
483 const sector_t this_sector = r1_bio->sector; 497 const sector_t this_sector = r1_bio->sector;
484 int sectors; 498 int sectors;
485 int best_good_sectors; 499 int best_good_sectors;
486 int start_disk; 500 int best_disk, best_dist_disk, best_pending_disk;
487 int best_disk; 501 int has_nonrot_disk;
488 int i; 502 int disk;
489 sector_t best_dist; 503 sector_t best_dist;
504 unsigned int min_pending;
490 struct md_rdev *rdev; 505 struct md_rdev *rdev;
491 int choose_first; 506 int choose_first;
507 int choose_next_idle;
492 508
493 rcu_read_lock(); 509 rcu_read_lock();
494 /* 510 /*
@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
499 retry: 515 retry:
500 sectors = r1_bio->sectors; 516 sectors = r1_bio->sectors;
501 best_disk = -1; 517 best_disk = -1;
518 best_dist_disk = -1;
502 best_dist = MaxSector; 519 best_dist = MaxSector;
520 best_pending_disk = -1;
521 min_pending = UINT_MAX;
503 best_good_sectors = 0; 522 best_good_sectors = 0;
523 has_nonrot_disk = 0;
524 choose_next_idle = 0;
504 525
505 if (conf->mddev->recovery_cp < MaxSector && 526 if (conf->mddev->recovery_cp < MaxSector &&
506 (this_sector + sectors >= conf->next_resync)) { 527 (this_sector + sectors >= conf->next_resync))
507 choose_first = 1; 528 choose_first = 1;
508 start_disk = 0; 529 else
509 } else {
510 choose_first = 0; 530 choose_first = 0;
511 start_disk = conf->last_used;
512 }
513 531
514 for (i = 0 ; i < conf->raid_disks * 2 ; i++) { 532 for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
515 sector_t dist; 533 sector_t dist;
516 sector_t first_bad; 534 sector_t first_bad;
517 int bad_sectors; 535 int bad_sectors;
518 536 unsigned int pending;
519 int disk = start_disk + i; 537 bool nonrot;
520 if (disk >= conf->raid_disks * 2)
521 disk -= conf->raid_disks * 2;
522 538
523 rdev = rcu_dereference(conf->mirrors[disk].rdev); 539 rdev = rcu_dereference(conf->mirrors[disk].rdev);
524 if (r1_bio->bios[disk] == IO_BLOCKED 540 if (r1_bio->bios[disk] == IO_BLOCKED
@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
577 } else 593 } else
578 best_good_sectors = sectors; 594 best_good_sectors = sectors;
579 595
596 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
597 has_nonrot_disk |= nonrot;
598 pending = atomic_read(&rdev->nr_pending);
580 dist = abs(this_sector - conf->mirrors[disk].head_position); 599 dist = abs(this_sector - conf->mirrors[disk].head_position);
581 if (choose_first 600 if (choose_first) {
582 /* Don't change to another disk for sequential reads */ 601 best_disk = disk;
583 || conf->next_seq_sect == this_sector 602 break;
584 || dist == 0 603 }
585 /* If device is idle, use it */ 604 /* Don't change to another disk for sequential reads */
586 || atomic_read(&rdev->nr_pending) == 0) { 605 if (conf->mirrors[disk].next_seq_sect == this_sector
606 || dist == 0) {
607 int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
608 struct raid1_info *mirror = &conf->mirrors[disk];
609
610 best_disk = disk;
611 /*
612 * If buffered sequential IO size exceeds optimal
613 * iosize, check if there is idle disk. If yes, choose
614 * the idle disk. read_balance could already choose an
615 * idle disk before noticing it's a sequential IO in
616 * this disk. This doesn't matter because this disk
617 * will idle, next time it will be utilized after the
618 * first disk has IO size exceeds optimal iosize. In
619 * this way, iosize of the first disk will be optimal
620 * iosize at least. iosize of the second disk might be
621 * small, but not a big deal since when the second disk
622 * starts IO, the first disk is likely still busy.
623 */
624 if (nonrot && opt_iosize > 0 &&
625 mirror->seq_start != MaxSector &&
626 mirror->next_seq_sect > opt_iosize &&
627 mirror->next_seq_sect - opt_iosize >=
628 mirror->seq_start) {
629 choose_next_idle = 1;
630 continue;
631 }
632 break;
633 }
634 /* If device is idle, use it */
635 if (pending == 0) {
587 best_disk = disk; 636 best_disk = disk;
588 break; 637 break;
589 } 638 }
639
640 if (choose_next_idle)
641 continue;
642
643 if (min_pending > pending) {
644 min_pending = pending;
645 best_pending_disk = disk;
646 }
647
590 if (dist < best_dist) { 648 if (dist < best_dist) {
591 best_dist = dist; 649 best_dist = dist;
592 best_disk = disk; 650 best_dist_disk = disk;
593 } 651 }
594 } 652 }
595 653
654 /*
655 * If all disks are rotational, choose the closest disk. If any disk is
656 * non-rotational, choose the disk with the fewest pending requests even if
657 * that disk is rotational, which might or might not be optimal for raids
658 * with mixed rotational/non-rotational disks, depending on workload.
659 */
660 if (best_disk == -1) {
661 if (has_nonrot_disk)
662 best_disk = best_pending_disk;
663 else
664 best_disk = best_dist_disk;
665 }
666
596 if (best_disk >= 0) { 667 if (best_disk >= 0) {
597 rdev = rcu_dereference(conf->mirrors[best_disk].rdev); 668 rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
598 if (!rdev) 669 if (!rdev)
@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
606 goto retry; 677 goto retry;
607 } 678 }
608 sectors = best_good_sectors; 679 sectors = best_good_sectors;
609 conf->next_seq_sect = this_sector + sectors; 680
610 conf->last_used = best_disk; 681 if (conf->mirrors[best_disk].next_seq_sect != this_sector)
682 conf->mirrors[best_disk].seq_start = this_sector;
683
684 conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
611 } 685 }
612 rcu_read_unlock(); 686 rcu_read_unlock();
613 *max_sectors = sectors; 687 *max_sectors = sectors;
@@ -873,7 +947,7 @@ do_sync_io:
873static void make_request(struct mddev *mddev, struct bio * bio) 947static void make_request(struct mddev *mddev, struct bio * bio)
874{ 948{
875 struct r1conf *conf = mddev->private; 949 struct r1conf *conf = mddev->private;
876 struct mirror_info *mirror; 950 struct raid1_info *mirror;
877 struct r1bio *r1_bio; 951 struct r1bio *r1_bio;
878 struct bio *read_bio; 952 struct bio *read_bio;
879 int i, disks; 953 int i, disks;
@@ -1364,7 +1438,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1364 struct r1conf *conf = mddev->private; 1438 struct r1conf *conf = mddev->private;
1365 int err = -EEXIST; 1439 int err = -EEXIST;
1366 int mirror = 0; 1440 int mirror = 0;
1367 struct mirror_info *p; 1441 struct raid1_info *p;
1368 int first = 0; 1442 int first = 0;
1369 int last = conf->raid_disks - 1; 1443 int last = conf->raid_disks - 1;
1370 struct request_queue *q = bdev_get_queue(rdev->bdev); 1444 struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1433,7 +1507,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1433 struct r1conf *conf = mddev->private; 1507 struct r1conf *conf = mddev->private;
1434 int err = 0; 1508 int err = 0;
1435 int number = rdev->raid_disk; 1509 int number = rdev->raid_disk;
1436 struct mirror_info *p = conf->mirrors+ number; 1510 struct raid1_info *p = conf->mirrors + number;
1437 1511
1438 if (rdev != p->rdev) 1512 if (rdev != p->rdev)
1439 p = conf->mirrors + conf->raid_disks + number; 1513 p = conf->mirrors + conf->raid_disks + number;
@@ -2371,6 +2445,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2371 bio->bi_rw = READ; 2445 bio->bi_rw = READ;
2372 bio->bi_end_io = end_sync_read; 2446 bio->bi_end_io = end_sync_read;
2373 read_targets++; 2447 read_targets++;
2448 } else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
2449 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2450 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
2451 /*
2452 * The device is suitable for reading (InSync),
2453 * but has bad block(s) here. Let's try to correct them,
2454 * if we are doing resync or repair. Otherwise, leave
2455 * this device alone for this sync request.
2456 */
2457 bio->bi_rw = WRITE;
2458 bio->bi_end_io = end_sync_write;
2459 write_targets++;
2374 } 2460 }
2375 } 2461 }
2376 if (bio->bi_end_io) { 2462 if (bio->bi_end_io) {
@@ -2428,7 +2514,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
2428 /* There is nowhere to write, so all non-sync 2514 /* There is nowhere to write, so all non-sync
2429 * drives must be failed - so we are finished 2515 * drives must be failed - so we are finished
2430 */ 2516 */
2431 sector_t rv = max_sector - sector_nr; 2517 sector_t rv;
2518 if (min_bad > 0)
2519 max_sector = sector_nr + min_bad;
2520 rv = max_sector - sector_nr;
2432 *skipped = 1; 2521 *skipped = 1;
2433 put_buf(r1_bio); 2522 put_buf(r1_bio);
2434 return rv; 2523 return rv;
@@ -2521,7 +2610,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2521{ 2610{
2522 struct r1conf *conf; 2611 struct r1conf *conf;
2523 int i; 2612 int i;
2524 struct mirror_info *disk; 2613 struct raid1_info *disk;
2525 struct md_rdev *rdev; 2614 struct md_rdev *rdev;
2526 int err = -ENOMEM; 2615 int err = -ENOMEM;
2527 2616
@@ -2529,7 +2618,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2529 if (!conf) 2618 if (!conf)
2530 goto abort; 2619 goto abort;
2531 2620
2532 conf->mirrors = kzalloc(sizeof(struct mirror_info) 2621 conf->mirrors = kzalloc(sizeof(struct raid1_info)
2533 * mddev->raid_disks * 2, 2622 * mddev->raid_disks * 2,
2534 GFP_KERNEL); 2623 GFP_KERNEL);
2535 if (!conf->mirrors) 2624 if (!conf->mirrors)
@@ -2572,6 +2661,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2572 mddev->merge_check_needed = 1; 2661 mddev->merge_check_needed = 1;
2573 2662
2574 disk->head_position = 0; 2663 disk->head_position = 0;
2664 disk->seq_start = MaxSector;
2575 } 2665 }
2576 conf->raid_disks = mddev->raid_disks; 2666 conf->raid_disks = mddev->raid_disks;
2577 conf->mddev = mddev; 2667 conf->mddev = mddev;
@@ -2585,7 +2675,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2585 conf->recovery_disabled = mddev->recovery_disabled - 1; 2675 conf->recovery_disabled = mddev->recovery_disabled - 1;
2586 2676
2587 err = -EIO; 2677 err = -EIO;
2588 conf->last_used = -1;
2589 for (i = 0; i < conf->raid_disks * 2; i++) { 2678 for (i = 0; i < conf->raid_disks * 2; i++) {
2590 2679
2591 disk = conf->mirrors + i; 2680 disk = conf->mirrors + i;
@@ -2611,19 +2700,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2611 if (disk->rdev && 2700 if (disk->rdev &&
2612 (disk->rdev->saved_raid_disk < 0)) 2701 (disk->rdev->saved_raid_disk < 0))
2613 conf->fullsync = 1; 2702 conf->fullsync = 1;
2614 } else if (conf->last_used < 0) 2703 }
2615 /*
2616 * The first working device is used as a
2617 * starting point to read balancing.
2618 */
2619 conf->last_used = i;
2620 } 2704 }
2621 2705
2622 if (conf->last_used < 0) {
2623 printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
2624 mdname(mddev));
2625 goto abort;
2626 }
2627 err = -ENOMEM; 2706 err = -ENOMEM;
2628 conf->thread = md_register_thread(raid1d, mddev, "raid1"); 2707 conf->thread = md_register_thread(raid1d, mddev, "raid1");
2629 if (!conf->thread) { 2708 if (!conf->thread) {
@@ -2798,7 +2877,7 @@ static int raid1_reshape(struct mddev *mddev)
2798 */ 2877 */
2799 mempool_t *newpool, *oldpool; 2878 mempool_t *newpool, *oldpool;
2800 struct pool_info *newpoolinfo; 2879 struct pool_info *newpoolinfo;
2801 struct mirror_info *newmirrors; 2880 struct raid1_info *newmirrors;
2802 struct r1conf *conf = mddev->private; 2881 struct r1conf *conf = mddev->private;
2803 int cnt, raid_disks; 2882 int cnt, raid_disks;
2804 unsigned long flags; 2883 unsigned long flags;
@@ -2841,7 +2920,7 @@ static int raid1_reshape(struct mddev *mddev)
2841 kfree(newpoolinfo); 2920 kfree(newpoolinfo);
2842 return -ENOMEM; 2921 return -ENOMEM;
2843 } 2922 }
2844 newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2, 2923 newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
2845 GFP_KERNEL); 2924 GFP_KERNEL);
2846 if (!newmirrors) { 2925 if (!newmirrors) {
2847 kfree(newpoolinfo); 2926 kfree(newpoolinfo);
@@ -2880,7 +2959,6 @@ static int raid1_reshape(struct mddev *mddev)
2880 conf->raid_disks = mddev->raid_disks = raid_disks; 2959 conf->raid_disks = mddev->raid_disks = raid_disks;
2881 mddev->delta_disks = 0; 2960 mddev->delta_disks = 0;
2882 2961
2883 conf->last_used = 0; /* just make sure it is in-range */
2884 lower_barrier(conf); 2962 lower_barrier(conf);
2885 2963
2886 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2964 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 80ded139314c..0ff3715fb7eb 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -1,9 +1,15 @@
1#ifndef _RAID1_H 1#ifndef _RAID1_H
2#define _RAID1_H 2#define _RAID1_H
3 3
4struct mirror_info { 4struct raid1_info {
5 struct md_rdev *rdev; 5 struct md_rdev *rdev;
6 sector_t head_position; 6 sector_t head_position;
7
8 /* When choosing the best device for a read (read_balance())
9 * we try to keep sequential reads on the same device
10 */
11 sector_t next_seq_sect;
12 sector_t seq_start;
7}; 13};
8 14
9/* 15/*
@@ -24,17 +30,11 @@ struct pool_info {
24 30
25struct r1conf { 31struct r1conf {
26 struct mddev *mddev; 32 struct mddev *mddev;
27 struct mirror_info *mirrors; /* twice 'raid_disks' to 33 struct raid1_info *mirrors; /* twice 'raid_disks' to
28 * allow for replacements. 34 * allow for replacements.
29 */ 35 */
30 int raid_disks; 36 int raid_disks;
31 37
32 /* When choose the best device for a read (read_balance())
33 * we try to keep sequential reads one the same device
34 * using 'last_used' and 'next_seq_sect'
35 */
36 int last_used;
37 sector_t next_seq_sect;
38 /* During resync, read_balancing is only allowed on the part 38 /* During resync, read_balancing is only allowed on the part
39 * of the array that has been resynced. 'next_resync' tells us 39 * of the array that has been resynced. 'next_resync' tells us
40 * where that is. 40 * where that is.
@@ -135,20 +135,6 @@ struct r1bio {
135 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ 135 /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
136}; 136};
137 137
138/* when we get a read error on a read-only array, we redirect to another
139 * device without failing the first device, or trying to over-write to
140 * correct the read error. To keep track of bad blocks on a per-bio
141 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
142 */
143#define IO_BLOCKED ((struct bio *)1)
144/* When we successfully write to a known bad-block, we need to remove the
145 * bad-block marking which must be done from process context. So we record
146 * the success by setting bios[n] to IO_MADE_GOOD
147 */
148#define IO_MADE_GOOD ((struct bio *)2)
149
150#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
151
152/* bits for r1bio.state */ 138/* bits for r1bio.state */
153#define R1BIO_Uptodate 0 139#define R1BIO_Uptodate 0
154#define R1BIO_IsSync 1 140#define R1BIO_IsSync 1
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8da6282254c3..e2549deab7c3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -60,7 +60,21 @@
60 */ 60 */
61#define NR_RAID10_BIOS 256 61#define NR_RAID10_BIOS 256
62 62
63/* When there are this many requests queue to be written by 63/* when we get a read error on a read-only array, we redirect to another
64 * device without failing the first device, or trying to over-write to
65 * correct the read error. To keep track of bad blocks on a per-bio
66 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
67 */
68#define IO_BLOCKED ((struct bio *)1)
69/* When we successfully write to a known bad-block, we need to remove the
70 * bad-block marking which must be done from process context. So we record
71 * the success by setting devs[n].bio to IO_MADE_GOOD
72 */
73#define IO_MADE_GOOD ((struct bio *)2)
74
75#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
76
77/* When there are this many requests queued to be written by
64 * the raid10 thread, we become 'congested' to provide back-pressure 78 * the raid10 thread, we become 'congested' to provide back-pressure
65 * for writeback. 79 * for writeback.
66 */ 80 */
@@ -717,7 +731,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
717 int sectors = r10_bio->sectors; 731 int sectors = r10_bio->sectors;
718 int best_good_sectors; 732 int best_good_sectors;
719 sector_t new_distance, best_dist; 733 sector_t new_distance, best_dist;
720 struct md_rdev *rdev, *best_rdev; 734 struct md_rdev *best_rdev, *rdev = NULL;
721 int do_balance; 735 int do_balance;
722 int best_slot; 736 int best_slot;
723 struct geom *geo = &conf->geo; 737 struct geom *geo = &conf->geo;
@@ -839,9 +853,8 @@ retry:
839 return rdev; 853 return rdev;
840} 854}
841 855
842static int raid10_congested(void *data, int bits) 856int md_raid10_congested(struct mddev *mddev, int bits)
843{ 857{
844 struct mddev *mddev = data;
845 struct r10conf *conf = mddev->private; 858 struct r10conf *conf = mddev->private;
846 int i, ret = 0; 859 int i, ret = 0;
847 860
@@ -849,8 +862,6 @@ static int raid10_congested(void *data, int bits)
849 conf->pending_count >= max_queued_requests) 862 conf->pending_count >= max_queued_requests)
850 return 1; 863 return 1;
851 864
852 if (mddev_congested(mddev, bits))
853 return 1;
854 rcu_read_lock(); 865 rcu_read_lock();
855 for (i = 0; 866 for (i = 0;
856 (i < conf->geo.raid_disks || i < conf->prev.raid_disks) 867 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
@@ -866,6 +877,15 @@ static int raid10_congested(void *data, int bits)
866 rcu_read_unlock(); 877 rcu_read_unlock();
867 return ret; 878 return ret;
868} 879}
880EXPORT_SYMBOL_GPL(md_raid10_congested);
881
882static int raid10_congested(void *data, int bits)
883{
884 struct mddev *mddev = data;
885
886 return mddev_congested(mddev, bits) ||
887 md_raid10_congested(mddev, bits);
888}
869 889
870static void flush_pending_writes(struct r10conf *conf) 890static void flush_pending_writes(struct r10conf *conf)
871{ 891{
@@ -1546,7 +1566,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
1546static void print_conf(struct r10conf *conf) 1566static void print_conf(struct r10conf *conf)
1547{ 1567{
1548 int i; 1568 int i;
1549 struct mirror_info *tmp; 1569 struct raid10_info *tmp;
1550 1570
1551 printk(KERN_DEBUG "RAID10 conf printout:\n"); 1571 printk(KERN_DEBUG "RAID10 conf printout:\n");
1552 if (!conf) { 1572 if (!conf) {
@@ -1580,7 +1600,7 @@ static int raid10_spare_active(struct mddev *mddev)
1580{ 1600{
1581 int i; 1601 int i;
1582 struct r10conf *conf = mddev->private; 1602 struct r10conf *conf = mddev->private;
1583 struct mirror_info *tmp; 1603 struct raid10_info *tmp;
1584 int count = 0; 1604 int count = 0;
1585 unsigned long flags; 1605 unsigned long flags;
1586 1606
@@ -1655,7 +1675,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1655 else 1675 else
1656 mirror = first; 1676 mirror = first;
1657 for ( ; mirror <= last ; mirror++) { 1677 for ( ; mirror <= last ; mirror++) {
1658 struct mirror_info *p = &conf->mirrors[mirror]; 1678 struct raid10_info *p = &conf->mirrors[mirror];
1659 if (p->recovery_disabled == mddev->recovery_disabled) 1679 if (p->recovery_disabled == mddev->recovery_disabled)
1660 continue; 1680 continue;
1661 if (p->rdev) { 1681 if (p->rdev) {
@@ -1709,7 +1729,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1709 int err = 0; 1729 int err = 0;
1710 int number = rdev->raid_disk; 1730 int number = rdev->raid_disk;
1711 struct md_rdev **rdevp; 1731 struct md_rdev **rdevp;
1712 struct mirror_info *p = conf->mirrors + number; 1732 struct raid10_info *p = conf->mirrors + number;
1713 1733
1714 print_conf(conf); 1734 print_conf(conf);
1715 if (rdev == p->rdev) 1735 if (rdev == p->rdev)
@@ -2876,7 +2896,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2876 sector_t sect; 2896 sector_t sect;
2877 int must_sync; 2897 int must_sync;
2878 int any_working; 2898 int any_working;
2879 struct mirror_info *mirror = &conf->mirrors[i]; 2899 struct raid10_info *mirror = &conf->mirrors[i];
2880 2900
2881 if ((mirror->rdev == NULL || 2901 if ((mirror->rdev == NULL ||
2882 test_bit(In_sync, &mirror->rdev->flags)) 2902 test_bit(In_sync, &mirror->rdev->flags))
@@ -3388,7 +3408,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
3388 goto out; 3408 goto out;
3389 3409
3390 /* FIXME calc properly */ 3410 /* FIXME calc properly */
3391 conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks + 3411 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3392 max(0,mddev->delta_disks)), 3412 max(0,mddev->delta_disks)),
3393 GFP_KERNEL); 3413 GFP_KERNEL);
3394 if (!conf->mirrors) 3414 if (!conf->mirrors)
@@ -3452,7 +3472,7 @@ static int run(struct mddev *mddev)
3452{ 3472{
3453 struct r10conf *conf; 3473 struct r10conf *conf;
3454 int i, disk_idx, chunk_size; 3474 int i, disk_idx, chunk_size;
3455 struct mirror_info *disk; 3475 struct raid10_info *disk;
3456 struct md_rdev *rdev; 3476 struct md_rdev *rdev;
3457 sector_t size; 3477 sector_t size;
3458 sector_t min_offset_diff = 0; 3478 sector_t min_offset_diff = 0;
@@ -3472,12 +3492,14 @@ static int run(struct mddev *mddev)
3472 conf->thread = NULL; 3492 conf->thread = NULL;
3473 3493
3474 chunk_size = mddev->chunk_sectors << 9; 3494 chunk_size = mddev->chunk_sectors << 9;
3475 blk_queue_io_min(mddev->queue, chunk_size); 3495 if (mddev->queue) {
3476 if (conf->geo.raid_disks % conf->geo.near_copies) 3496 blk_queue_io_min(mddev->queue, chunk_size);
3477 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); 3497 if (conf->geo.raid_disks % conf->geo.near_copies)
3478 else 3498 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3479 blk_queue_io_opt(mddev->queue, chunk_size * 3499 else
3480 (conf->geo.raid_disks / conf->geo.near_copies)); 3500 blk_queue_io_opt(mddev->queue, chunk_size *
3501 (conf->geo.raid_disks / conf->geo.near_copies));
3502 }
3481 3503
3482 rdev_for_each(rdev, mddev) { 3504 rdev_for_each(rdev, mddev) {
3483 long long diff; 3505 long long diff;
@@ -3511,8 +3533,9 @@ static int run(struct mddev *mddev)
3511 if (first || diff < min_offset_diff) 3533 if (first || diff < min_offset_diff)
3512 min_offset_diff = diff; 3534 min_offset_diff = diff;
3513 3535
3514 disk_stack_limits(mddev->gendisk, rdev->bdev, 3536 if (mddev->gendisk)
3515 rdev->data_offset << 9); 3537 disk_stack_limits(mddev->gendisk, rdev->bdev,
3538 rdev->data_offset << 9);
3516 3539
3517 disk->head_position = 0; 3540 disk->head_position = 0;
3518 } 3541 }
@@ -3575,22 +3598,22 @@ static int run(struct mddev *mddev)
3575 md_set_array_sectors(mddev, size); 3598 md_set_array_sectors(mddev, size);
3576 mddev->resync_max_sectors = size; 3599 mddev->resync_max_sectors = size;
3577 3600
3578 mddev->queue->backing_dev_info.congested_fn = raid10_congested; 3601 if (mddev->queue) {
3579 mddev->queue->backing_dev_info.congested_data = mddev;
3580
3581 /* Calculate max read-ahead size.
3582 * We need to readahead at least twice a whole stripe....
3583 * maybe...
3584 */
3585 {
3586 int stripe = conf->geo.raid_disks * 3602 int stripe = conf->geo.raid_disks *
3587 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 3603 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3604 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3605 mddev->queue->backing_dev_info.congested_data = mddev;
3606
3607 /* Calculate max read-ahead size.
3608 * We need to readahead at least twice a whole stripe....
3609 * maybe...
3610 */
3588 stripe /= conf->geo.near_copies; 3611 stripe /= conf->geo.near_copies;
3589 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 3612 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3590 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 3613 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3614 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3591 } 3615 }
3592 3616
3593 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3594 3617
3595 if (md_integrity_register(mddev)) 3618 if (md_integrity_register(mddev))
3596 goto out_free_conf; 3619 goto out_free_conf;
@@ -3641,7 +3664,10 @@ static int stop(struct mddev *mddev)
3641 lower_barrier(conf); 3664 lower_barrier(conf);
3642 3665
3643 md_unregister_thread(&mddev->thread); 3666 md_unregister_thread(&mddev->thread);
3644 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 3667 if (mddev->queue)
3668 /* the unplug fn references 'conf'*/
3669 blk_sync_queue(mddev->queue);
3670
3645 if (conf->r10bio_pool) 3671 if (conf->r10bio_pool)
3646 mempool_destroy(conf->r10bio_pool); 3672 mempool_destroy(conf->r10bio_pool);
3647 kfree(conf->mirrors); 3673 kfree(conf->mirrors);
@@ -3805,7 +3831,7 @@ static int raid10_check_reshape(struct mddev *mddev)
3805 if (mddev->delta_disks > 0) { 3831 if (mddev->delta_disks > 0) {
3806 /* allocate new 'mirrors' list */ 3832 /* allocate new 'mirrors' list */
3807 conf->mirrors_new = kzalloc( 3833 conf->mirrors_new = kzalloc(
3808 sizeof(struct mirror_info) 3834 sizeof(struct raid10_info)
3809 *(mddev->raid_disks + 3835 *(mddev->raid_disks +
3810 mddev->delta_disks), 3836 mddev->delta_disks),
3811 GFP_KERNEL); 3837 GFP_KERNEL);
@@ -3930,7 +3956,7 @@ static int raid10_start_reshape(struct mddev *mddev)
3930 spin_lock_irq(&conf->device_lock); 3956 spin_lock_irq(&conf->device_lock);
3931 if (conf->mirrors_new) { 3957 if (conf->mirrors_new) {
3932 memcpy(conf->mirrors_new, conf->mirrors, 3958 memcpy(conf->mirrors_new, conf->mirrors,
3933 sizeof(struct mirror_info)*conf->prev.raid_disks); 3959 sizeof(struct raid10_info)*conf->prev.raid_disks);
3934 smp_mb(); 3960 smp_mb();
3935 kfree(conf->mirrors_old); /* FIXME and elsewhere */ 3961 kfree(conf->mirrors_old); /* FIXME and elsewhere */
3936 conf->mirrors_old = conf->mirrors; 3962 conf->mirrors_old = conf->mirrors;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 135b1b0a1554..007c2c68dd83 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -1,7 +1,7 @@
1#ifndef _RAID10_H 1#ifndef _RAID10_H
2#define _RAID10_H 2#define _RAID10_H
3 3
4struct mirror_info { 4struct raid10_info {
5 struct md_rdev *rdev, *replacement; 5 struct md_rdev *rdev, *replacement;
6 sector_t head_position; 6 sector_t head_position;
7 int recovery_disabled; /* matches 7 int recovery_disabled; /* matches
@@ -13,8 +13,8 @@ struct mirror_info {
13 13
14struct r10conf { 14struct r10conf {
15 struct mddev *mddev; 15 struct mddev *mddev;
16 struct mirror_info *mirrors; 16 struct raid10_info *mirrors;
17 struct mirror_info *mirrors_new, *mirrors_old; 17 struct raid10_info *mirrors_new, *mirrors_old;
18 spinlock_t device_lock; 18 spinlock_t device_lock;
19 19
20 /* geometry */ 20 /* geometry */
@@ -123,20 +123,6 @@ struct r10bio {
123 } devs[0]; 123 } devs[0];
124}; 124};
125 125
126/* when we get a read error on a read-only array, we redirect to another
127 * device without failing the first device, or trying to over-write to
128 * correct the read error. To keep track of bad blocks on a per-bio
129 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
130 */
131#define IO_BLOCKED ((struct bio*)1)
132/* When we successfully write to a known bad-block, we need to remove the
133 * bad-block marking which must be done from process context. So we record
134 * the success by setting devs[n].bio to IO_MADE_GOOD
135 */
136#define IO_MADE_GOOD ((struct bio *)2)
137
138#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
139
140/* bits for r10bio.state */ 126/* bits for r10bio.state */
141enum r10bio_state { 127enum r10bio_state {
142 R10BIO_Uptodate, 128 R10BIO_Uptodate,
@@ -159,4 +145,7 @@ enum r10bio_state {
159 */ 145 */
160 R10BIO_Previous, 146 R10BIO_Previous,
161}; 147};
148
149extern int md_raid10_congested(struct mddev *mddev, int bits);
150
162#endif 151#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 04348d76bb30..259f519814ca 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -99,34 +99,40 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
99 * We maintain a biased count of active stripes in the bottom 16 bits of 99 * We maintain a biased count of active stripes in the bottom 16 bits of
100 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 100 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
101 */ 101 */
102static inline int raid5_bi_phys_segments(struct bio *bio) 102static inline int raid5_bi_processed_stripes(struct bio *bio)
103{ 103{
104 return bio->bi_phys_segments & 0xffff; 104 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
105 return (atomic_read(segments) >> 16) & 0xffff;
105} 106}
106 107
107static inline int raid5_bi_hw_segments(struct bio *bio) 108static inline int raid5_dec_bi_active_stripes(struct bio *bio)
108{ 109{
109 return (bio->bi_phys_segments >> 16) & 0xffff; 110 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
111 return atomic_sub_return(1, segments) & 0xffff;
110} 112}
111 113
112static inline int raid5_dec_bi_phys_segments(struct bio *bio) 114static inline void raid5_inc_bi_active_stripes(struct bio *bio)
113{ 115{
114 --bio->bi_phys_segments; 116 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
115 return raid5_bi_phys_segments(bio); 117 atomic_inc(segments);
116} 118}
117 119
118static inline int raid5_dec_bi_hw_segments(struct bio *bio) 120static inline void raid5_set_bi_processed_stripes(struct bio *bio,
121 unsigned int cnt)
119{ 122{
120 unsigned short val = raid5_bi_hw_segments(bio); 123 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
124 int old, new;
121 125
122 --val; 126 do {
123 bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); 127 old = atomic_read(segments);
124 return val; 128 new = (old & 0xffff) | (cnt << 16);
129 } while (atomic_cmpxchg(segments, old, new) != old);
125} 130}
126 131
127static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) 132static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
128{ 133{
129 bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); 134 atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
135 atomic_set(segments, cnt);
130} 136}
131 137
132/* Find first data disk in a raid6 stripe */ 138/* Find first data disk in a raid6 stripe */
@@ -190,49 +196,56 @@ static int stripe_operations_active(struct stripe_head *sh)
190 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 196 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
191} 197}
192 198
193static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) 199static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
194{ 200{
195 if (atomic_dec_and_test(&sh->count)) { 201 BUG_ON(!list_empty(&sh->lru));
196 BUG_ON(!list_empty(&sh->lru)); 202 BUG_ON(atomic_read(&conf->active_stripes)==0);
197 BUG_ON(atomic_read(&conf->active_stripes)==0); 203 if (test_bit(STRIPE_HANDLE, &sh->state)) {
198 if (test_bit(STRIPE_HANDLE, &sh->state)) { 204 if (test_bit(STRIPE_DELAYED, &sh->state) &&
199 if (test_bit(STRIPE_DELAYED, &sh->state) && 205 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
200 !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 206 list_add_tail(&sh->lru, &conf->delayed_list);
201 list_add_tail(&sh->lru, &conf->delayed_list); 207 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
202 else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 208 sh->bm_seq - conf->seq_write > 0)
203 sh->bm_seq - conf->seq_write > 0) 209 list_add_tail(&sh->lru, &conf->bitmap_list);
204 list_add_tail(&sh->lru, &conf->bitmap_list); 210 else {
205 else { 211 clear_bit(STRIPE_DELAYED, &sh->state);
206 clear_bit(STRIPE_DELAYED, &sh->state); 212 clear_bit(STRIPE_BIT_DELAY, &sh->state);
207 clear_bit(STRIPE_BIT_DELAY, &sh->state); 213 list_add_tail(&sh->lru, &conf->handle_list);
208 list_add_tail(&sh->lru, &conf->handle_list); 214 }
209 } 215 md_wakeup_thread(conf->mddev->thread);
210 md_wakeup_thread(conf->mddev->thread); 216 } else {
211 } else { 217 BUG_ON(stripe_operations_active(sh));
212 BUG_ON(stripe_operations_active(sh)); 218 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
213 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 219 if (atomic_dec_return(&conf->preread_active_stripes)
214 if (atomic_dec_return(&conf->preread_active_stripes) 220 < IO_THRESHOLD)
215 < IO_THRESHOLD) 221 md_wakeup_thread(conf->mddev->thread);
216 md_wakeup_thread(conf->mddev->thread); 222 atomic_dec(&conf->active_stripes);
217 atomic_dec(&conf->active_stripes); 223 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
218 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 224 list_add_tail(&sh->lru, &conf->inactive_list);
219 list_add_tail(&sh->lru, &conf->inactive_list); 225 wake_up(&conf->wait_for_stripe);
220 wake_up(&conf->wait_for_stripe); 226 if (conf->retry_read_aligned)
221 if (conf->retry_read_aligned) 227 md_wakeup_thread(conf->mddev->thread);
222 md_wakeup_thread(conf->mddev->thread);
223 }
224 } 228 }
225 } 229 }
226} 230}
227 231
232static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
233{
234 if (atomic_dec_and_test(&sh->count))
235 do_release_stripe(conf, sh);
236}
237
228static void release_stripe(struct stripe_head *sh) 238static void release_stripe(struct stripe_head *sh)
229{ 239{
230 struct r5conf *conf = sh->raid_conf; 240 struct r5conf *conf = sh->raid_conf;
231 unsigned long flags; 241 unsigned long flags;
232 242
233 spin_lock_irqsave(&conf->device_lock, flags); 243 local_irq_save(flags);
234 __release_stripe(conf, sh); 244 if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
235 spin_unlock_irqrestore(&conf->device_lock, flags); 245 do_release_stripe(conf, sh);
246 spin_unlock(&conf->device_lock);
247 }
248 local_irq_restore(flags);
236} 249}
237 250
238static inline void remove_hash(struct stripe_head *sh) 251static inline void remove_hash(struct stripe_head *sh)
@@ -640,6 +653,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
640 else 653 else
641 bi->bi_sector = (sh->sector 654 bi->bi_sector = (sh->sector
642 + rdev->data_offset); 655 + rdev->data_offset);
656 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
657 bi->bi_rw |= REQ_FLUSH;
658
643 bi->bi_flags = 1 << BIO_UPTODATE; 659 bi->bi_flags = 1 << BIO_UPTODATE;
644 bi->bi_idx = 0; 660 bi->bi_idx = 0;
645 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 661 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -749,14 +765,12 @@ static void ops_complete_biofill(void *stripe_head_ref)
749{ 765{
750 struct stripe_head *sh = stripe_head_ref; 766 struct stripe_head *sh = stripe_head_ref;
751 struct bio *return_bi = NULL; 767 struct bio *return_bi = NULL;
752 struct r5conf *conf = sh->raid_conf;
753 int i; 768 int i;
754 769
755 pr_debug("%s: stripe %llu\n", __func__, 770 pr_debug("%s: stripe %llu\n", __func__,
756 (unsigned long long)sh->sector); 771 (unsigned long long)sh->sector);
757 772
758 /* clear completed biofills */ 773 /* clear completed biofills */
759 spin_lock_irq(&conf->device_lock);
760 for (i = sh->disks; i--; ) { 774 for (i = sh->disks; i--; ) {
761 struct r5dev *dev = &sh->dev[i]; 775 struct r5dev *dev = &sh->dev[i];
762 776
@@ -774,7 +788,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
774 while (rbi && rbi->bi_sector < 788 while (rbi && rbi->bi_sector <
775 dev->sector + STRIPE_SECTORS) { 789 dev->sector + STRIPE_SECTORS) {
776 rbi2 = r5_next_bio(rbi, dev->sector); 790 rbi2 = r5_next_bio(rbi, dev->sector);
777 if (!raid5_dec_bi_phys_segments(rbi)) { 791 if (!raid5_dec_bi_active_stripes(rbi)) {
778 rbi->bi_next = return_bi; 792 rbi->bi_next = return_bi;
779 return_bi = rbi; 793 return_bi = rbi;
780 } 794 }
@@ -782,7 +796,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
782 } 796 }
783 } 797 }
784 } 798 }
785 spin_unlock_irq(&conf->device_lock);
786 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 799 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
787 800
788 return_io(return_bi); 801 return_io(return_bi);
@@ -794,7 +807,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
794static void ops_run_biofill(struct stripe_head *sh) 807static void ops_run_biofill(struct stripe_head *sh)
795{ 808{
796 struct dma_async_tx_descriptor *tx = NULL; 809 struct dma_async_tx_descriptor *tx = NULL;
797 struct r5conf *conf = sh->raid_conf;
798 struct async_submit_ctl submit; 810 struct async_submit_ctl submit;
799 int i; 811 int i;
800 812
@@ -805,10 +817,10 @@ static void ops_run_biofill(struct stripe_head *sh)
805 struct r5dev *dev = &sh->dev[i]; 817 struct r5dev *dev = &sh->dev[i];
806 if (test_bit(R5_Wantfill, &dev->flags)) { 818 if (test_bit(R5_Wantfill, &dev->flags)) {
807 struct bio *rbi; 819 struct bio *rbi;
808 spin_lock_irq(&conf->device_lock); 820 spin_lock_irq(&sh->stripe_lock);
809 dev->read = rbi = dev->toread; 821 dev->read = rbi = dev->toread;
810 dev->toread = NULL; 822 dev->toread = NULL;
811 spin_unlock_irq(&conf->device_lock); 823 spin_unlock_irq(&sh->stripe_lock);
812 while (rbi && rbi->bi_sector < 824 while (rbi && rbi->bi_sector <
813 dev->sector + STRIPE_SECTORS) { 825 dev->sector + STRIPE_SECTORS) {
814 tx = async_copy_data(0, rbi, dev->page, 826 tx = async_copy_data(0, rbi, dev->page,
@@ -1144,12 +1156,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1144 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 1156 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1145 struct bio *wbi; 1157 struct bio *wbi;
1146 1158
1147 spin_lock_irq(&sh->raid_conf->device_lock); 1159 spin_lock_irq(&sh->stripe_lock);
1148 chosen = dev->towrite; 1160 chosen = dev->towrite;
1149 dev->towrite = NULL; 1161 dev->towrite = NULL;
1150 BUG_ON(dev->written); 1162 BUG_ON(dev->written);
1151 wbi = dev->written = chosen; 1163 wbi = dev->written = chosen;
1152 spin_unlock_irq(&sh->raid_conf->device_lock); 1164 spin_unlock_irq(&sh->stripe_lock);
1153 1165
1154 while (wbi && wbi->bi_sector < 1166 while (wbi && wbi->bi_sector <
1155 dev->sector + STRIPE_SECTORS) { 1167 dev->sector + STRIPE_SECTORS) {
@@ -1454,6 +1466,8 @@ static int grow_one_stripe(struct r5conf *conf)
1454 init_waitqueue_head(&sh->ops.wait_for_ops); 1466 init_waitqueue_head(&sh->ops.wait_for_ops);
1455 #endif 1467 #endif
1456 1468
1469 spin_lock_init(&sh->stripe_lock);
1470
1457 if (grow_buffers(sh)) { 1471 if (grow_buffers(sh)) {
1458 shrink_buffers(sh); 1472 shrink_buffers(sh);
1459 kmem_cache_free(conf->slab_cache, sh); 1473 kmem_cache_free(conf->slab_cache, sh);
@@ -1739,7 +1753,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
1739 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); 1753 atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
1740 clear_bit(R5_ReadError, &sh->dev[i].flags); 1754 clear_bit(R5_ReadError, &sh->dev[i].flags);
1741 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1755 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1742 } 1756 } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
1757 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1758
1743 if (atomic_read(&rdev->read_errors)) 1759 if (atomic_read(&rdev->read_errors))
1744 atomic_set(&rdev->read_errors, 0); 1760 atomic_set(&rdev->read_errors, 0);
1745 } else { 1761 } else {
@@ -1784,7 +1800,11 @@ static void raid5_end_read_request(struct bio * bi, int error)
1784 else 1800 else
1785 retry = 1; 1801 retry = 1;
1786 if (retry) 1802 if (retry)
1787 set_bit(R5_ReadError, &sh->dev[i].flags); 1803 if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
1804 set_bit(R5_ReadError, &sh->dev[i].flags);
1805 clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1806 } else
1807 set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
1788 else { 1808 else {
1789 clear_bit(R5_ReadError, &sh->dev[i].flags); 1809 clear_bit(R5_ReadError, &sh->dev[i].flags);
1790 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1810 clear_bit(R5_ReWrite, &sh->dev[i].flags);
@@ -2340,11 +2360,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2340 (unsigned long long)bi->bi_sector, 2360 (unsigned long long)bi->bi_sector,
2341 (unsigned long long)sh->sector); 2361 (unsigned long long)sh->sector);
2342 2362
2343 2363 /*
2344 spin_lock_irq(&conf->device_lock); 2364 * If several bio share a stripe. The bio bi_phys_segments acts as a
2365 * reference count to avoid race. The reference count should already be
2366 * increased before this function is called (for example, in
2367 * make_request()), so other bio sharing this stripe will not free the
2368 * stripe. If a stripe is owned by one stripe, the stripe lock will
2369 * protect it.
2370 */
2371 spin_lock_irq(&sh->stripe_lock);
2345 if (forwrite) { 2372 if (forwrite) {
2346 bip = &sh->dev[dd_idx].towrite; 2373 bip = &sh->dev[dd_idx].towrite;
2347 if (*bip == NULL && sh->dev[dd_idx].written == NULL) 2374 if (*bip == NULL)
2348 firstwrite = 1; 2375 firstwrite = 1;
2349 } else 2376 } else
2350 bip = &sh->dev[dd_idx].toread; 2377 bip = &sh->dev[dd_idx].toread;
@@ -2360,7 +2387,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2360 if (*bip) 2387 if (*bip)
2361 bi->bi_next = *bip; 2388 bi->bi_next = *bip;
2362 *bip = bi; 2389 *bip = bi;
2363 bi->bi_phys_segments++; 2390 raid5_inc_bi_active_stripes(bi);
2364 2391
2365 if (forwrite) { 2392 if (forwrite) {
2366 /* check if page is covered */ 2393 /* check if page is covered */
@@ -2375,7 +2402,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2375 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2402 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2376 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2403 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2377 } 2404 }
2378 spin_unlock_irq(&conf->device_lock); 2405 spin_unlock_irq(&sh->stripe_lock);
2379 2406
2380 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2407 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2381 (unsigned long long)(*bip)->bi_sector, 2408 (unsigned long long)(*bip)->bi_sector,
@@ -2391,7 +2418,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2391 2418
2392 overlap: 2419 overlap:
2393 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2420 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2394 spin_unlock_irq(&conf->device_lock); 2421 spin_unlock_irq(&sh->stripe_lock);
2395 return 0; 2422 return 0;
2396} 2423}
2397 2424
@@ -2441,10 +2468,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2441 rdev_dec_pending(rdev, conf->mddev); 2468 rdev_dec_pending(rdev, conf->mddev);
2442 } 2469 }
2443 } 2470 }
2444 spin_lock_irq(&conf->device_lock); 2471 spin_lock_irq(&sh->stripe_lock);
2445 /* fail all writes first */ 2472 /* fail all writes first */
2446 bi = sh->dev[i].towrite; 2473 bi = sh->dev[i].towrite;
2447 sh->dev[i].towrite = NULL; 2474 sh->dev[i].towrite = NULL;
2475 spin_unlock_irq(&sh->stripe_lock);
2448 if (bi) { 2476 if (bi) {
2449 s->to_write--; 2477 s->to_write--;
2450 bitmap_end = 1; 2478 bitmap_end = 1;
@@ -2457,13 +2485,17 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2457 sh->dev[i].sector + STRIPE_SECTORS) { 2485 sh->dev[i].sector + STRIPE_SECTORS) {
2458 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2486 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2459 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2487 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2460 if (!raid5_dec_bi_phys_segments(bi)) { 2488 if (!raid5_dec_bi_active_stripes(bi)) {
2461 md_write_end(conf->mddev); 2489 md_write_end(conf->mddev);
2462 bi->bi_next = *return_bi; 2490 bi->bi_next = *return_bi;
2463 *return_bi = bi; 2491 *return_bi = bi;
2464 } 2492 }
2465 bi = nextbi; 2493 bi = nextbi;
2466 } 2494 }
2495 if (bitmap_end)
2496 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2497 STRIPE_SECTORS, 0, 0);
2498 bitmap_end = 0;
2467 /* and fail all 'written' */ 2499 /* and fail all 'written' */
2468 bi = sh->dev[i].written; 2500 bi = sh->dev[i].written;
2469 sh->dev[i].written = NULL; 2501 sh->dev[i].written = NULL;
@@ -2472,7 +2504,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2472 sh->dev[i].sector + STRIPE_SECTORS) { 2504 sh->dev[i].sector + STRIPE_SECTORS) {
2473 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2505 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2474 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2506 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2475 if (!raid5_dec_bi_phys_segments(bi)) { 2507 if (!raid5_dec_bi_active_stripes(bi)) {
2476 md_write_end(conf->mddev); 2508 md_write_end(conf->mddev);
2477 bi->bi_next = *return_bi; 2509 bi->bi_next = *return_bi;
2478 *return_bi = bi; 2510 *return_bi = bi;
@@ -2496,14 +2528,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2496 struct bio *nextbi = 2528 struct bio *nextbi =
2497 r5_next_bio(bi, sh->dev[i].sector); 2529 r5_next_bio(bi, sh->dev[i].sector);
2498 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2530 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2499 if (!raid5_dec_bi_phys_segments(bi)) { 2531 if (!raid5_dec_bi_active_stripes(bi)) {
2500 bi->bi_next = *return_bi; 2532 bi->bi_next = *return_bi;
2501 *return_bi = bi; 2533 *return_bi = bi;
2502 } 2534 }
2503 bi = nextbi; 2535 bi = nextbi;
2504 } 2536 }
2505 } 2537 }
2506 spin_unlock_irq(&conf->device_lock);
2507 if (bitmap_end) 2538 if (bitmap_end)
2508 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2539 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2509 STRIPE_SECTORS, 0, 0); 2540 STRIPE_SECTORS, 0, 0);
@@ -2707,30 +2738,23 @@ static void handle_stripe_clean_event(struct r5conf *conf,
2707 test_bit(R5_UPTODATE, &dev->flags)) { 2738 test_bit(R5_UPTODATE, &dev->flags)) {
2708 /* We can return any write requests */ 2739 /* We can return any write requests */
2709 struct bio *wbi, *wbi2; 2740 struct bio *wbi, *wbi2;
2710 int bitmap_end = 0;
2711 pr_debug("Return write for disc %d\n", i); 2741 pr_debug("Return write for disc %d\n", i);
2712 spin_lock_irq(&conf->device_lock);
2713 wbi = dev->written; 2742 wbi = dev->written;
2714 dev->written = NULL; 2743 dev->written = NULL;
2715 while (wbi && wbi->bi_sector < 2744 while (wbi && wbi->bi_sector <
2716 dev->sector + STRIPE_SECTORS) { 2745 dev->sector + STRIPE_SECTORS) {
2717 wbi2 = r5_next_bio(wbi, dev->sector); 2746 wbi2 = r5_next_bio(wbi, dev->sector);
2718 if (!raid5_dec_bi_phys_segments(wbi)) { 2747 if (!raid5_dec_bi_active_stripes(wbi)) {
2719 md_write_end(conf->mddev); 2748 md_write_end(conf->mddev);
2720 wbi->bi_next = *return_bi; 2749 wbi->bi_next = *return_bi;
2721 *return_bi = wbi; 2750 *return_bi = wbi;
2722 } 2751 }
2723 wbi = wbi2; 2752 wbi = wbi2;
2724 } 2753 }
2725 if (dev->towrite == NULL) 2754 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2726 bitmap_end = 1; 2755 STRIPE_SECTORS,
2727 spin_unlock_irq(&conf->device_lock);
2728 if (bitmap_end)
2729 bitmap_endwrite(conf->mddev->bitmap,
2730 sh->sector,
2731 STRIPE_SECTORS,
2732 !test_bit(STRIPE_DEGRADED, &sh->state), 2756 !test_bit(STRIPE_DEGRADED, &sh->state),
2733 0); 2757 0);
2734 } 2758 }
2735 } 2759 }
2736 2760
@@ -3182,7 +3206,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3182 3206
3183 /* Now to look around and see what can be done */ 3207 /* Now to look around and see what can be done */
3184 rcu_read_lock(); 3208 rcu_read_lock();
3185 spin_lock_irq(&conf->device_lock);
3186 for (i=disks; i--; ) { 3209 for (i=disks; i--; ) {
3187 struct md_rdev *rdev; 3210 struct md_rdev *rdev;
3188 sector_t first_bad; 3211 sector_t first_bad;
@@ -3328,7 +3351,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
3328 do_recovery = 1; 3351 do_recovery = 1;
3329 } 3352 }
3330 } 3353 }
3331 spin_unlock_irq(&conf->device_lock);
3332 if (test_bit(STRIPE_SYNCING, &sh->state)) { 3354 if (test_bit(STRIPE_SYNCING, &sh->state)) {
3333 /* If there is a failed device being replaced, 3355 /* If there is a failed device being replaced,
3334 * we must be recovering. 3356 * we must be recovering.
@@ -3791,7 +3813,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
3791 * this sets the active strip count to 1 and the processed 3813 * this sets the active strip count to 1 and the processed
3792 * strip count to zero (upper 8 bits) 3814 * strip count to zero (upper 8 bits)
3793 */ 3815 */
3794 bi->bi_phys_segments = 1; /* biased count of active stripes */ 3816 raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
3795 } 3817 }
3796 3818
3797 return bi; 3819 return bi;
@@ -4113,7 +4135,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4113 finish_wait(&conf->wait_for_overlap, &w); 4135 finish_wait(&conf->wait_for_overlap, &w);
4114 set_bit(STRIPE_HANDLE, &sh->state); 4136 set_bit(STRIPE_HANDLE, &sh->state);
4115 clear_bit(STRIPE_DELAYED, &sh->state); 4137 clear_bit(STRIPE_DELAYED, &sh->state);
4116 if ((bi->bi_rw & REQ_SYNC) && 4138 if ((bi->bi_rw & REQ_NOIDLE) &&
4117 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 4139 !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4118 atomic_inc(&conf->preread_active_stripes); 4140 atomic_inc(&conf->preread_active_stripes);
4119 mddev_check_plugged(mddev); 4141 mddev_check_plugged(mddev);
@@ -4126,9 +4148,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4126 } 4148 }
4127 } 4149 }
4128 4150
4129 spin_lock_irq(&conf->device_lock); 4151 remaining = raid5_dec_bi_active_stripes(bi);
4130 remaining = raid5_dec_bi_phys_segments(bi);
4131 spin_unlock_irq(&conf->device_lock);
4132 if (remaining == 0) { 4152 if (remaining == 0) {
4133 4153
4134 if ( rw == WRITE ) 4154 if ( rw == WRITE )
@@ -4484,7 +4504,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4484 sector += STRIPE_SECTORS, 4504 sector += STRIPE_SECTORS,
4485 scnt++) { 4505 scnt++) {
4486 4506
4487 if (scnt < raid5_bi_hw_segments(raid_bio)) 4507 if (scnt < raid5_bi_processed_stripes(raid_bio))
4488 /* already done this stripe */ 4508 /* already done this stripe */
4489 continue; 4509 continue;
4490 4510
@@ -4492,25 +4512,24 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
4492 4512
4493 if (!sh) { 4513 if (!sh) {
4494 /* failed to get a stripe - must wait */ 4514 /* failed to get a stripe - must wait */
4495 raid5_set_bi_hw_segments(raid_bio, scnt); 4515 raid5_set_bi_processed_stripes(raid_bio, scnt);
4496 conf->retry_read_aligned = raid_bio; 4516 conf->retry_read_aligned = raid_bio;
4497 return handled; 4517 return handled;
4498 } 4518 }
4499 4519
4500 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4520 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4501 release_stripe(sh); 4521 release_stripe(sh);
4502 raid5_set_bi_hw_segments(raid_bio, scnt); 4522 raid5_set_bi_processed_stripes(raid_bio, scnt);
4503 conf->retry_read_aligned = raid_bio; 4523 conf->retry_read_aligned = raid_bio;
4504 return handled; 4524 return handled;
4505 } 4525 }
4506 4526
4527 set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
4507 handle_stripe(sh); 4528 handle_stripe(sh);
4508 release_stripe(sh); 4529 release_stripe(sh);
4509 handled++; 4530 handled++;
4510 } 4531 }
4511 spin_lock_irq(&conf->device_lock); 4532 remaining = raid5_dec_bi_active_stripes(raid_bio);
4512 remaining = raid5_dec_bi_phys_segments(raid_bio);
4513 spin_unlock_irq(&conf->device_lock);
4514 if (remaining == 0) 4533 if (remaining == 0)
4515 bio_endio(raid_bio, 0); 4534 bio_endio(raid_bio, 0);
4516 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4535 if (atomic_dec_and_test(&conf->active_aligned_reads))
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2164021f3b5f..61dbb615c30b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -210,6 +210,7 @@ struct stripe_head {
210 int disks; /* disks in stripe */ 210 int disks; /* disks in stripe */
211 enum check_states check_state; 211 enum check_states check_state;
212 enum reconstruct_states reconstruct_state; 212 enum reconstruct_states reconstruct_state;
213 spinlock_t stripe_lock;
213 /** 214 /**
214 * struct stripe_operations 215 * struct stripe_operations
215 * @target - STRIPE_OP_COMPUTE_BLK target 216 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -273,6 +274,7 @@ enum r5dev_flags {
273 R5_Wantwrite, 274 R5_Wantwrite,
274 R5_Overlap, /* There is a pending overlapping request 275 R5_Overlap, /* There is a pending overlapping request
275 * on this block */ 276 * on this block */
277 R5_ReadNoMerge, /* prevent bio from merging in block-layer */
276 R5_ReadError, /* seen a read error here recently */ 278 R5_ReadError, /* seen a read error here recently */
277 R5_ReWrite, /* have tried to over-write the readerror */ 279 R5_ReWrite, /* have tried to over-write the readerror */
278 280