author    Linus Torvalds <torvalds@linux-foundation.org>  2012-08-01 12:02:01 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-08-01 12:02:01 -0400
commit    fcff06c438b60f415af5983efe92811d6aa02ad1
tree      704f6598b2de60a86774bc5cf152d4f051bd2dc4
parent    068535f1fef4c90aee23eb7b9b9a71c5b72d7cd0
parent    63f33b8dda88923487004b20fba825486d009e7b
Merge branch 'for-next' of git://neil.brown.name/md
Pull md updates from NeilBrown.

* 'for-next' of git://neil.brown.name/md:
  DM RAID: Add support for MD RAID10
  md/RAID1: Add missing case for attempting to repair known bad blocks.
  md/raid5: For odirect-write performance, do not set STRIPE_PREREAD_ACTIVE.
  md/raid1: don't abort a resync on the first badblock.
  md: remove duplicated test on ->openers when calling do_md_stop()
  raid5: Add R5_ReadNoMerge flag which prevent bio from merging at block layer
  md/raid1: prevent merging too large request
  md/raid1: read balance chooses idlest disk for SSD
  md/raid1: make sequential read detection per disk based
  MD RAID10: Export md_raid10_congested
  MD: Move macros from raid1*.h to raid1*.c
  MD RAID1: rename mirror_info structure
  MD RAID10: rename mirror_info structure
  MD RAID10: Fix compiler warning.
  raid5: add a per-stripe lock
  raid5: remove unnecessary bitmap write optimization
  raid5: lockless access raid5 overrided bi_phys_segments
  raid5: reduce chance release_stripe() taking device_lock
 Documentation/device-mapper/dm-raid.txt |  26
 drivers/md/dm-raid.c                    |  95
 drivers/md/md.c                         |   8
 drivers/md/raid1.c                      | 164
 drivers/md/raid1.h                      |  30
 drivers/md/raid10.c                     |  92
 drivers/md/raid10.h                     |  23
 drivers/md/raid5.c                      | 205
 drivers/md/raid5.h                      |   2
 9 files changed, 426 insertions(+), 219 deletions(-)
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 946c73342cde..1c1844957166 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -27,6 +27,10 @@ The target is named "raid" and it accepts the following parameters:
          - rotating parity N (right-to-left) with data restart
   raid6_nc	RAID6 N continue
          - rotating parity N (right-to-left) with data continuation
+  raid10	Various RAID10 inspired algorithms chosen by additional params
+		- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
+		- RAID1E: Integrated Adjacent Stripe Mirroring
+		- and other similar RAID10 variants
 
   Reference: Chapter 4 of
   http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf
@@ -59,6 +63,28 @@ The target is named "raid" and it accepts the following parameters:
 	logical size of the array.  The bitmap records the device
 	synchronisation state for each region.
 
+	[raid10_copies   <# copies>]
+	[raid10_format   near]
+		These two options are used to alter the default layout of
+		a RAID10 configuration.  The number of copies can be
+		specified, but the default is 2.  There are other variations
+		to how the copies are laid down - the default and only current
+		option is "near".  Near copies are what most people think of
+		with respect to mirroring.  If these options are left
+		unspecified, or 'raid10_copies 2' and/or 'raid10_format near'
+		are given, then the layouts for 2, 3 and 4 devices are:
+		2 drives    3 drives          4 drives
+		--------    ----------        --------------
+		A1  A1      A1  A1  A2        A1  A1  A2  A2
+		A2  A2      A2  A3  A3        A3  A3  A4  A4
+		A3  A3      A4  A4  A5        A5  A5  A6  A6
+		A4  A4      A5  A6  A6        A7  A7  A8  A8
+		..  ..      ..  ..  ..        ..  ..  ..  ..
+		The 2-device layout is equivalent to 2-way RAID1.  The
+		4-device layout is what a traditional RAID10 would look
+		like.  The 3-device layout is what might be called a
+		'RAID1E - Integrated Adjacent Stripe Mirroring'.
+
 <#raid_devs>: The number of devices composing the array.
 	Each device consists of two entries.  The first is the device
 	containing the metadata (if any); the second is the one containing the
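
For reference, a made-up table line exercising the new options, in the style of the "Example tables" section of this file (length and device names are purely illustrative; note that the target length must now match the resulting array size):

	0 3906250000 raid raid10 5 128 raid10_copies 2 raid10_format near \
		4 - /dev/sda1 - /dev/sdb1 - /dev/sdc1 - /dev/sdd1

The raid parameter count (5) covers the 128-sector chunk size plus the two option pairs, and each "-" means the corresponding data device has no separate metadata device.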
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index f2f29c526544..982e3e390c45 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -11,6 +11,7 @@
11#include "md.h" 11#include "md.h"
12#include "raid1.h" 12#include "raid1.h"
13#include "raid5.h" 13#include "raid5.h"
14#include "raid10.h"
14#include "bitmap.h" 15#include "bitmap.h"
15 16
16#include <linux/device-mapper.h> 17#include <linux/device-mapper.h>
@@ -52,7 +53,10 @@ struct raid_dev {
 #define DMPF_MAX_RECOVERY_RATE 0x20
 #define DMPF_MAX_WRITE_BEHIND  0x40
 #define DMPF_STRIPE_CACHE      0x80
-#define DMPF_REGION_SIZE       0X100
+#define DMPF_REGION_SIZE       0x100
+#define DMPF_RAID10_COPIES     0x200
+#define DMPF_RAID10_FORMAT     0x400
+
 struct raid_set {
 	struct dm_target *ti;
 
@@ -76,6 +80,7 @@ static struct raid_type {
 	const unsigned algorithm;	/* RAID algorithm. */
 } raid_types[] = {
 	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
+	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, UINT_MAX /* Varies */},
 	{"raid4",    "RAID4 (dedicated parity disk)",   1, 2, 5, ALGORITHM_PARITY_0},
 	{"raid5_la", "RAID5 (left asymmetric)",         1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
 	{"raid5_ra", "RAID5 (right asymmetric)",        1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
@@ -86,6 +91,17 @@ static struct raid_type {
86 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} 91 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
87}; 92};
88 93
94static unsigned raid10_md_layout_to_copies(int layout)
95{
96 return layout & 0xFF;
97}
98
99static int raid10_format_to_md_layout(char *format, unsigned copies)
100{
101 /* 1 "far" copy, and 'copies' "near" copies */
102 return (1 << 8) | (copies & 0xFF);
103}
104
89static struct raid_type *get_raid_type(char *name) 105static struct raid_type *get_raid_type(char *name)
90{ 106{
91 int i; 107 int i;
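
The two helpers above pack the RAID10 geometry into md's layout word: near copies in the low byte, far copies in the byte above it. A minimal userspace sketch of the decode (illustrative only, not kernel code; the helper names are made up):

	#include <stdio.h>

	/* Layout word as produced by raid10_format_to_md_layout():
	 * bits 0-7 hold the near-copy count, bits 8-15 the far-copy count. */
	static unsigned near_copies(int layout) { return layout & 0xFF; }
	static unsigned far_copies(int layout)  { return (layout >> 8) & 0xFF; }

	int main(void)
	{
		int layout = (1 << 8) | 2;	/* "near" format with 2 copies */

		/* Prints near=2 far=1: two near copies, a single far set. */
		printf("near=%u far=%u\n", near_copies(layout), far_copies(layout));
		return 0;
	}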
@@ -339,10 +355,16 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
  *    [max_write_behind <sectors>]	See '-write-behind=' (man mdadm)
  *    [stripe_cache <sectors>]		Stripe cache size for higher RAIDs
  *    [region_size <sectors>]		Defines granularity of bitmap
+ *
+ * RAID10-only options:
+ *    [raid10_copies <# copies>]	Number of copies.  (Default: 2)
+ *    [raid10_format <near>]		Layout algorithm.  (Default: near)
  */
 static int parse_raid_params(struct raid_set *rs, char **argv,
 			     unsigned num_raid_params)
 {
+	char *raid10_format = "near";
+	unsigned raid10_copies = 2;
 	unsigned i, rebuild_cnt = 0;
 	unsigned long value, region_size = 0;
 	sector_t sectors_per_dev = rs->ti->len;
@@ -416,11 +438,28 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		}
 
 		key = argv[i++];
+
+		/* Parameters that take a string value are checked here. */
+		if (!strcasecmp(key, "raid10_format")) {
+			if (rs->raid_type->level != 10) {
+				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
+				return -EINVAL;
+			}
+			if (strcmp("near", argv[i])) {
+				rs->ti->error = "Invalid 'raid10_format' value given";
+				return -EINVAL;
+			}
+			raid10_format = argv[i];
+			rs->print_flags |= DMPF_RAID10_FORMAT;
+			continue;
+		}
+
 		if (strict_strtoul(argv[i], 10, &value) < 0) {
 			rs->ti->error = "Bad numerical argument given in raid params";
 			return -EINVAL;
 		}
 
+		/* Parameters that take a numeric value are checked here */
 		if (!strcasecmp(key, "rebuild")) {
 			rebuild_cnt++;
 
@@ -439,6 +478,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 					return -EINVAL;
 				}
 				break;
+			case 10:
 			default:
 				DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
 				rs->ti->error = "Rebuild not supported for this RAID type";
@@ -495,7 +535,8 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			 */
 			value /= 2;
 
-			if (rs->raid_type->level < 5) {
+			if ((rs->raid_type->level != 5) &&
+			    (rs->raid_type->level != 6)) {
 				rs->ti->error = "Inappropriate argument: stripe_cache";
 				return -EINVAL;
 			}
@@ -520,6 +561,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		} else if (!strcasecmp(key, "region_size")) {
 			rs->print_flags |= DMPF_REGION_SIZE;
 			region_size = value;
+		} else if (!strcasecmp(key, "raid10_copies") &&
+			   (rs->raid_type->level == 10)) {
+			if ((value < 2) || (value > 0xFF)) {
+				rs->ti->error = "Bad value for 'raid10_copies'";
+				return -EINVAL;
+			}
+			rs->print_flags |= DMPF_RAID10_COPIES;
+			raid10_copies = value;
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
 			rs->ti->error = "Unable to parse RAID parameters";
@@ -538,8 +587,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	if (dm_set_target_max_io_len(rs->ti, max_io_len))
 		return -EINVAL;
 
-	if ((rs->raid_type->level > 1) &&
-	    sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) {
+	if (rs->raid_type->level == 10) {
+		if (raid10_copies > rs->md.raid_disks) {
+			rs->ti->error = "Not enough devices to satisfy specification";
+			return -EINVAL;
+		}
+
+		/* (Len * #mirrors) / #devices */
+		sectors_per_dev = rs->ti->len * raid10_copies;
+		sector_div(sectors_per_dev, rs->md.raid_disks);
+
+		rs->md.layout = raid10_format_to_md_layout(raid10_format,
+							   raid10_copies);
+		rs->md.new_layout = rs->md.layout;
+	} else if ((rs->raid_type->level > 1) &&
+		   sector_div(sectors_per_dev,
+			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
 		rs->ti->error = "Target length not divisible by number of data devices";
 		return -EINVAL;
 	}
@@ -566,6 +629,9 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
 	if (rs->raid_type->level == 1)
 		return md_raid1_congested(&rs->md, bits);
 
+	if (rs->raid_type->level == 10)
+		return md_raid10_congested(&rs->md, bits);
+
 	return md_raid5_congested(&rs->md, bits);
 }
 
@@ -884,6 +950,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	case 6:
 		redundancy = rs->raid_type->parity_devs;
 		break;
+	case 10:
+		redundancy = raid10_md_layout_to_copies(mddev->layout) - 1;
+		break;
 	default:
 		ti->error = "Unknown RAID type";
 		return -EINVAL;
@@ -1049,12 +1118,19 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 	}
 
+	if (ti->len != rs->md.array_sectors) {
+		ti->error = "Array size does not match requested target length";
+		ret = -EINVAL;
+		goto size_mismatch;
+	}
 	rs->callbacks.congested_fn = raid_is_congested;
 	dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 
 	mddev_suspend(&rs->md);
 	return 0;
 
+size_mismatch:
+	md_stop(&rs->md);
 bad:
 	context_free(rs);
 
@@ -1203,6 +1279,13 @@ static int raid_status(struct dm_target *ti, status_type_t type,
 			DMEMIT(" region_size %lu",
 			       rs->md.bitmap_info.chunksize >> 9);
 
+		if (rs->print_flags & DMPF_RAID10_COPIES)
+			DMEMIT(" raid10_copies %u",
+			       raid10_md_layout_to_copies(rs->md.layout));
+
+		if (rs->print_flags & DMPF_RAID10_FORMAT)
+			DMEMIT(" raid10_format near");
+
 		DMEMIT(" %d", rs->md.raid_disks);
 		for (i = 0; i < rs->md.raid_disks; i++) {
 			if (rs->dev[i].meta_dev)
@@ -1277,7 +1360,7 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 2, 0},
+	.version = {1, 3, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
@@ -1304,6 +1387,8 @@ module_init(dm_raid_init);
 module_exit(dm_raid_exit);
 
 MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
+MODULE_ALIAS("dm-raid1");
+MODULE_ALIAS("dm-raid10");
 MODULE_ALIAS("dm-raid4");
 MODULE_ALIAS("dm-raid5");
 MODULE_ALIAS("dm-raid6");
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d5ab4493c8be..f6c46109b071 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3942,17 +3942,13 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 		break;
 	case clear:
 		/* stopping an active array */
-		if (atomic_read(&mddev->openers) > 0)
-			return -EBUSY;
 		err = do_md_stop(mddev, 0, NULL);
 		break;
 	case inactive:
 		/* stopping an active array */
-		if (mddev->pers) {
-			if (atomic_read(&mddev->openers) > 0)
-				return -EBUSY;
+		if (mddev->pers)
 			err = do_md_stop(mddev, 2, NULL);
-		} else
+		else
 			err = 0; /* already inactive */
 		break;
 	case suspended:
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cacd008d6864..197f62681db5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,6 +46,20 @@
  */
 #define NR_RAID1_BIOS 256
 
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
 /* When there are this many requests queue to be written by
  * the raid1 thread, we become 'congested' to provide back-pressure
  * for writeback.
@@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	const sector_t this_sector = r1_bio->sector;
 	int sectors;
 	int best_good_sectors;
-	int start_disk;
-	int best_disk;
-	int i;
+	int best_disk, best_dist_disk, best_pending_disk;
+	int has_nonrot_disk;
+	int disk;
 	sector_t best_dist;
+	unsigned int min_pending;
 	struct md_rdev *rdev;
 	int choose_first;
+	int choose_next_idle;
 
 	rcu_read_lock();
 	/*
@@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 retry:
 	sectors = r1_bio->sectors;
 	best_disk = -1;
+	best_dist_disk = -1;
 	best_dist = MaxSector;
+	best_pending_disk = -1;
+	min_pending = UINT_MAX;
 	best_good_sectors = 0;
+	has_nonrot_disk = 0;
+	choose_next_idle = 0;
 
 	if (conf->mddev->recovery_cp < MaxSector &&
-	    (this_sector + sectors >= conf->next_resync)) {
+	    (this_sector + sectors >= conf->next_resync))
 		choose_first = 1;
-		start_disk = 0;
-	} else {
+	else
 		choose_first = 0;
-		start_disk = conf->last_used;
-	}
 
-	for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
+	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
 		sector_t dist;
 		sector_t first_bad;
 		int bad_sectors;
-
-		int disk = start_disk + i;
-		if (disk >= conf->raid_disks * 2)
-			disk -= conf->raid_disks * 2;
+		unsigned int pending;
+		bool nonrot;
 
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
@@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		} else
 			best_good_sectors = sectors;
 
+		nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+		has_nonrot_disk |= nonrot;
+		pending = atomic_read(&rdev->nr_pending);
 		dist = abs(this_sector - conf->mirrors[disk].head_position);
-		if (choose_first
-		    /* Don't change to another disk for sequential reads */
-		    || conf->next_seq_sect == this_sector
-		    || dist == 0
-		    /* If device is idle, use it */
-		    || atomic_read(&rdev->nr_pending) == 0) {
+		if (choose_first) {
+			best_disk = disk;
+			break;
+		}
+		/* Don't change to another disk for sequential reads */
+		if (conf->mirrors[disk].next_seq_sect == this_sector
+		    || dist == 0) {
+			int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+			struct raid1_info *mirror = &conf->mirrors[disk];
+
+			best_disk = disk;
+			/*
+			 * If buffered sequential IO size exceeds optimal
+			 * iosize, check if there is idle disk. If yes, choose
+			 * the idle disk. read_balance could already choose an
+			 * idle disk before noticing it's a sequential IO in
+			 * this disk. This doesn't matter because this disk
+			 * will idle, next time it will be utilized after the
+			 * first disk has IO size exceeds optimal iosize. In
+			 * this way, iosize of the first disk will be optimal
+			 * iosize at least. iosize of the second disk might be
+			 * small, but not a big deal since when the second disk
+			 * starts IO, the first disk is likely still busy.
+			 */
+			if (nonrot && opt_iosize > 0 &&
+			    mirror->seq_start != MaxSector &&
+			    mirror->next_seq_sect > opt_iosize &&
+			    mirror->next_seq_sect - opt_iosize >=
+			    mirror->seq_start) {
+				choose_next_idle = 1;
+				continue;
+			}
+			break;
+		}
+		/* If device is idle, use it */
+		if (pending == 0) {
 			best_disk = disk;
 			break;
 		}
+
+		if (choose_next_idle)
+			continue;
+
+		if (min_pending > pending) {
+			min_pending = pending;
+			best_pending_disk = disk;
+		}
+
 		if (dist < best_dist) {
 			best_dist = dist;
-			best_disk = disk;
+			best_dist_disk = disk;
 		}
 	}
 
+	/*
+	 * If all disks are rotational, choose the closest disk. If any disk is
+	 * non-rotational, choose the disk with less pending request even the
+	 * disk is rotational, which might/might not be optimal for raids with
+	 * mixed ratation/non-rotational disks depending on workload.
+	 */
+	if (best_disk == -1) {
+		if (has_nonrot_disk)
+			best_disk = best_pending_disk;
+		else
+			best_disk = best_dist_disk;
+	}
+
 	if (best_disk >= 0) {
 		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
 		if (!rdev)
@@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 			goto retry;
 		}
 		sectors = best_good_sectors;
-		conf->next_seq_sect = this_sector + sectors;
-		conf->last_used = best_disk;
+
+		if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+			conf->mirrors[best_disk].seq_start = this_sector;
+
+		conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
 	}
 	rcu_read_unlock();
 	*max_sectors = sectors;
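
Taken together, the rewritten read_balance() picks a disk in this order: a sequential read stays on its current disk (unless an SSD has already streamed more than the optimal I/O size, in which case an idle disk may take over), an idle disk is taken immediately, and otherwise the fallback depends on whether any member is non-rotational. A compact userspace model of that final decision (field names are simplified stand-ins for the rdev/mirror state, illustrative only):

	struct disk_state {
		int pending;	/* in-flight requests, cf. rdev->nr_pending */
		int nonrot;	/* 1 for SSD, cf. blk_queue_nonrot() */
		long dist;	/* |sector - head_position| */
		int sequential;	/* request continues at next_seq_sect */
	};

	static int pick_disk(const struct disk_state *d, int n)
	{
		int best_pending = -1, best_dist = -1, has_nonrot = 0, i;

		for (i = 0; i < n; i++) {
			has_nonrot |= d[i].nonrot;
			if (d[i].sequential || d[i].pending == 0)
				return i;	/* sticky or idle disk wins outright */
			if (best_pending < 0 || d[i].pending < d[best_pending].pending)
				best_pending = i;	/* least queued I/O */
			if (best_dist < 0 || d[i].dist < d[best_dist].dist)
				best_dist = i;		/* shortest seek */
		}
		/* Any SSD present: queue depth beats seek distance. */
		return has_nonrot ? best_pending : best_dist;
	}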
@@ -873,7 +947,7 @@ do_sync_io:
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r1conf *conf = mddev->private;
-	struct mirror_info *mirror;
+	struct raid1_info *mirror;
 	struct r1bio *r1_bio;
 	struct bio *read_bio;
 	int i, disks;
@@ -1364,7 +1438,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = -EEXIST;
 	int mirror = 0;
-	struct mirror_info *p;
+	struct raid1_info *p;
 	int first = 0;
 	int last = conf->raid_disks - 1;
 	struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1433,7 +1507,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	struct r1conf *conf = mddev->private;
 	int err = 0;
 	int number = rdev->raid_disk;
-	struct mirror_info *p = conf->mirrors+ number;
+	struct raid1_info *p = conf->mirrors + number;
 
 	if (rdev != p->rdev)
 		p = conf->mirrors + conf->raid_disks + number;
@@ -2371,6 +2445,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 				bio->bi_rw = READ;
 				bio->bi_end_io = end_sync_read;
 				read_targets++;
+			} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+				test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+				!test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+				/*
+				 * The device is suitable for reading (InSync),
+				 * but has bad block(s) here. Let's try to correct them,
+				 * if we are doing resync or repair. Otherwise, leave
+				 * this device alone for this sync request.
+				 */
+				bio->bi_rw = WRITE;
+				bio->bi_end_io = end_sync_write;
+				write_targets++;
 			}
 		}
 		if (bio->bi_end_io) {
@@ -2428,7 +2514,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 			/* There is nowhere to write, so all non-sync
 			 * drives must be failed - so we are finished
 			 */
-			sector_t rv = max_sector - sector_nr;
+			sector_t rv;
+			if (min_bad > 0)
+				max_sector = sector_nr + min_bad;
+			rv = max_sector - sector_nr;
 			*skipped = 1;
 			put_buf(r1_bio);
 			return rv;
@@ -2521,7 +2610,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 {
 	struct r1conf *conf;
 	int i;
-	struct mirror_info *disk;
+	struct raid1_info *disk;
 	struct md_rdev *rdev;
 	int err = -ENOMEM;
 
@@ -2529,7 +2618,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf)
 		goto abort;
 
-	conf->mirrors = kzalloc(sizeof(struct mirror_info)
+	conf->mirrors = kzalloc(sizeof(struct raid1_info)
 				* mddev->raid_disks * 2,
 				GFP_KERNEL);
 	if (!conf->mirrors)
@@ -2572,6 +2661,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			mddev->merge_check_needed = 1;
 
 		disk->head_position = 0;
+		disk->seq_start = MaxSector;
 	}
 	conf->raid_disks = mddev->raid_disks;
 	conf->mddev = mddev;
@@ -2585,7 +2675,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->recovery_disabled = mddev->recovery_disabled - 1;
 
 	err = -EIO;
-	conf->last_used = -1;
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 
 		disk = conf->mirrors + i;
@@ -2611,19 +2700,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 			if (disk->rdev &&
 			    (disk->rdev->saved_raid_disk < 0))
 				conf->fullsync = 1;
-		} else if (conf->last_used < 0)
-			/*
-			 * The first working device is used as a
-			 * starting point to read balancing.
-			 */
-			conf->last_used = i;
+		}
 	}
 
-	if (conf->last_used < 0) {
-		printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
-		       mdname(mddev));
-		goto abort;
-	}
 	err = -ENOMEM;
 	conf->thread = md_register_thread(raid1d, mddev, "raid1");
 	if (!conf->thread) {
@@ -2798,7 +2877,7 @@ static int raid1_reshape(struct mddev *mddev)
 	 */
 	mempool_t *newpool, *oldpool;
 	struct pool_info *newpoolinfo;
-	struct mirror_info *newmirrors;
+	struct raid1_info *newmirrors;
 	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
@@ -2841,7 +2920,7 @@ static int raid1_reshape(struct mddev *mddev)
 		kfree(newpoolinfo);
 		return -ENOMEM;
 	}
-	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+	newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
 			     GFP_KERNEL);
 	if (!newmirrors) {
 		kfree(newpoolinfo);
@@ -2880,7 +2959,6 @@ static int raid1_reshape(struct mddev *mddev)
 	conf->raid_disks = mddev->raid_disks = raid_disks;
 	mddev->delta_disks = 0;
 
-	conf->last_used = 0; /* just make sure it is in-range */
 	lower_barrier(conf);
 
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 80ded139314c..0ff3715fb7eb 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -1,9 +1,15 @@
 #ifndef _RAID1_H
 #define _RAID1_H
 
-struct mirror_info {
+struct raid1_info {
 	struct md_rdev	*rdev;
 	sector_t	head_position;
+
+	/* When choose the best device for a read (read_balance())
+	 * we try to keep sequential reads one the same device
+	 */
+	sector_t	next_seq_sect;
+	sector_t	seq_start;
 };
 
 /*
@@ -24,17 +30,11 @@ struct pool_info {
 
 struct r1conf {
 	struct mddev		*mddev;
-	struct mirror_info	*mirrors;	/* twice 'raid_disks' to
+	struct raid1_info	*mirrors;	/* twice 'raid_disks' to
 						 * allow for replacements.
 						 */
 	int			raid_disks;
 
-	/* When choose the best device for a read (read_balance())
-	 * we try to keep sequential reads one the same device
-	 * using 'last_used' and 'next_seq_sect'
-	 */
-	int			last_used;
-	sector_t		next_seq_sect;
 	/* During resync, read_balancing is only allowed on the part
 	 * of the array that has been resynced.  'next_resync' tells us
 	 * where that is.
@@ -135,20 +135,6 @@ struct r1bio {
 	/* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
 };
 
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio *)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting bios[n] to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
 /* bits for r1bio.state */
 #define	R1BIO_Uptodate	0
 #define	R1BIO_IsSync	1
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8da6282254c3..e2549deab7c3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -60,7 +60,21 @@
60 */ 60 */
61#define NR_RAID10_BIOS 256 61#define NR_RAID10_BIOS 256
62 62
63/* When there are this many requests queue to be written by 63/* when we get a read error on a read-only array, we redirect to another
64 * device without failing the first device, or trying to over-write to
65 * correct the read error. To keep track of bad blocks on a per-bio
66 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
67 */
68#define IO_BLOCKED ((struct bio *)1)
69/* When we successfully write to a known bad-block, we need to remove the
70 * bad-block marking which must be done from process context. So we record
71 * the success by setting devs[n].bio to IO_MADE_GOOD
72 */
73#define IO_MADE_GOOD ((struct bio *)2)
74
75#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
76
77/* When there are this many requests queued to be written by
64 * the raid10 thread, we become 'congested' to provide back-pressure 78 * the raid10 thread, we become 'congested' to provide back-pressure
65 * for writeback. 79 * for writeback.
66 */ 80 */
@@ -717,7 +731,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 	int sectors = r10_bio->sectors;
 	int best_good_sectors;
 	sector_t new_distance, best_dist;
-	struct md_rdev *rdev, *best_rdev;
+	struct md_rdev *best_rdev, *rdev = NULL;
 	int do_balance;
 	int best_slot;
 	struct geom *geo = &conf->geo;
@@ -839,9 +853,8 @@ retry:
 	return rdev;
 }
 
-static int raid10_congested(void *data, int bits)
+int md_raid10_congested(struct mddev *mddev, int bits)
 {
-	struct mddev *mddev = data;
 	struct r10conf *conf = mddev->private;
 	int i, ret = 0;
 
@@ -849,8 +862,6 @@ static int raid10_congested(void *data, int bits)
 	    conf->pending_count >= max_queued_requests)
 		return 1;
 
-	if (mddev_congested(mddev, bits))
-		return 1;
 	rcu_read_lock();
 	for (i = 0;
 	     (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
@@ -866,6 +877,15 @@ static int raid10_congested(void *data, int bits)
 	rcu_read_unlock();
 	return ret;
 }
+EXPORT_SYMBOL_GPL(md_raid10_congested);
+
+static int raid10_congested(void *data, int bits)
+{
+	struct mddev *mddev = data;
+
+	return mddev_congested(mddev, bits) ||
+		md_raid10_congested(mddev, bits);
+}
 
 static void flush_pending_writes(struct r10conf *conf)
 {
@@ -1546,7 +1566,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 static void print_conf(struct r10conf *conf)
 {
 	int i;
-	struct mirror_info *tmp;
+	struct raid10_info *tmp;
 
 	printk(KERN_DEBUG "RAID10 conf printout:\n");
 	if (!conf) {
@@ -1580,7 +1600,7 @@ static int raid10_spare_active(struct mddev *mddev)
 {
 	int i;
 	struct r10conf *conf = mddev->private;
-	struct mirror_info *tmp;
+	struct raid10_info *tmp;
 	int count = 0;
 	unsigned long flags;
 
@@ -1655,7 +1675,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	else
 		mirror = first;
 	for ( ; mirror <= last ; mirror++) {
-		struct mirror_info *p = &conf->mirrors[mirror];
+		struct raid10_info *p = &conf->mirrors[mirror];
 		if (p->recovery_disabled == mddev->recovery_disabled)
 			continue;
 		if (p->rdev) {
@@ -1709,7 +1729,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int err = 0;
 	int number = rdev->raid_disk;
 	struct md_rdev **rdevp;
-	struct mirror_info *p = conf->mirrors + number;
+	struct raid10_info *p = conf->mirrors + number;
 
 	print_conf(conf);
 	if (rdev == p->rdev)
@@ -2876,7 +2896,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			sector_t sect;
 			int must_sync;
 			int any_working;
-			struct mirror_info *mirror = &conf->mirrors[i];
+			struct raid10_info *mirror = &conf->mirrors[i];
 
 			if ((mirror->rdev == NULL ||
 			     test_bit(In_sync, &mirror->rdev->flags))
@@ -3388,7 +3408,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 		goto out;
 
 	/* FIXME calc properly */
-	conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks +
+	conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
 							    max(0,mddev->delta_disks)),
 				GFP_KERNEL);
 	if (!conf->mirrors)
@@ -3452,7 +3472,7 @@ static int run(struct mddev *mddev)
 {
 	struct r10conf *conf;
 	int i, disk_idx, chunk_size;
-	struct mirror_info *disk;
+	struct raid10_info *disk;
 	struct md_rdev *rdev;
 	sector_t size;
 	sector_t min_offset_diff = 0;
@@ -3472,12 +3492,14 @@ static int run(struct mddev *mddev)
 	conf->thread = NULL;
 
 	chunk_size = mddev->chunk_sectors << 9;
-	blk_queue_io_min(mddev->queue, chunk_size);
-	if (conf->geo.raid_disks % conf->geo.near_copies)
-		blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
-	else
-		blk_queue_io_opt(mddev->queue, chunk_size *
-				 (conf->geo.raid_disks / conf->geo.near_copies));
+	if (mddev->queue) {
+		blk_queue_io_min(mddev->queue, chunk_size);
+		if (conf->geo.raid_disks % conf->geo.near_copies)
+			blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
+		else
+			blk_queue_io_opt(mddev->queue, chunk_size *
+					 (conf->geo.raid_disks / conf->geo.near_copies));
+	}
 
 	rdev_for_each(rdev, mddev) {
 		long long diff;
@@ -3511,8 +3533,9 @@ static int run(struct mddev *mddev)
 		if (first || diff < min_offset_diff)
 			min_offset_diff = diff;
 
-		disk_stack_limits(mddev->gendisk, rdev->bdev,
-				  rdev->data_offset << 9);
+		if (mddev->gendisk)
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
 
 		disk->head_position = 0;
 	}
@@ -3575,22 +3598,22 @@ static int run(struct mddev *mddev)
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
 
-	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
-	mddev->queue->backing_dev_info.congested_data = mddev;
-
-	/* Calculate max read-ahead size.
-	 * We need to readahead at least twice a whole stripe....
-	 * maybe...
-	 */
-	{
+	if (mddev->queue) {
 		int stripe = conf->geo.raid_disks *
 			((mddev->chunk_sectors << 9) / PAGE_SIZE);
+		mddev->queue->backing_dev_info.congested_fn = raid10_congested;
+		mddev->queue->backing_dev_info.congested_data = mddev;
+
+		/* Calculate max read-ahead size.
+		 * We need to readahead at least twice a whole stripe....
+		 * maybe...
+		 */
 		stripe /= conf->geo.near_copies;
 		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
 	}
 
-	blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
 
 	if (md_integrity_register(mddev))
 		goto out_free_conf;
@@ -3641,7 +3664,10 @@ static int stop(struct mddev *mddev)
 	lower_barrier(conf);
 
 	md_unregister_thread(&mddev->thread);
-	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
+	if (mddev->queue)
+		/* the unplug fn references 'conf'*/
+		blk_sync_queue(mddev->queue);
+
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
 	kfree(conf->mirrors);
@@ -3805,7 +3831,7 @@ static int raid10_check_reshape(struct mddev *mddev)
 	if (mddev->delta_disks > 0) {
 		/* allocate new 'mirrors' list */
 		conf->mirrors_new = kzalloc(
-			sizeof(struct mirror_info)
+			sizeof(struct raid10_info)
 			*(mddev->raid_disks +
 			  mddev->delta_disks),
 			GFP_KERNEL);
@@ -3930,7 +3956,7 @@ static int raid10_start_reshape(struct mddev *mddev)
 	spin_lock_irq(&conf->device_lock);
 	if (conf->mirrors_new) {
 		memcpy(conf->mirrors_new, conf->mirrors,
-		       sizeof(struct mirror_info)*conf->prev.raid_disks);
+		       sizeof(struct raid10_info)*conf->prev.raid_disks);
 		smp_mb();
 		kfree(conf->mirrors_old); /* FIXME and elsewhere */
 		conf->mirrors_old = conf->mirrors;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 135b1b0a1554..007c2c68dd83 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -1,7 +1,7 @@
 #ifndef _RAID10_H
 #define _RAID10_H
 
-struct mirror_info {
+struct raid10_info {
 	struct md_rdev	*rdev, *replacement;
 	sector_t	head_position;
 	int		recovery_disabled;	/* matches
@@ -13,8 +13,8 @@ struct mirror_info {
 
 struct r10conf {
 	struct mddev		*mddev;
-	struct mirror_info	*mirrors;
-	struct mirror_info	*mirrors_new, *mirrors_old;
+	struct raid10_info	*mirrors;
+	struct raid10_info	*mirrors_new, *mirrors_old;
 	spinlock_t		device_lock;
 
 	/* geometry */
@@ -123,20 +123,6 @@ struct r10bio {
 	} devs[0];
 };
 
-/* when we get a read error on a read-only array, we redirect to another
- * device without failing the first device, or trying to over-write to
- * correct the read error. To keep track of bad blocks on a per-bio
- * level, we store IO_BLOCKED in the appropriate 'bios' pointer
- */
-#define IO_BLOCKED ((struct bio*)1)
-/* When we successfully write to a known bad-block, we need to remove the
- * bad-block marking which must be done from process context. So we record
- * the success by setting devs[n].bio to IO_MADE_GOOD
- */
-#define IO_MADE_GOOD ((struct bio *)2)
-
-#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
-
 /* bits for r10bio.state */
 enum r10bio_state {
 	R10BIO_Uptodate,
@@ -159,4 +145,7 @@ enum r10bio_state {
 	 */
 	R10BIO_Previous,
 };
+
+extern int md_raid10_congested(struct mddev *mddev, int bits);
+
 #endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 04348d76bb30..259f519814ca 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -99,34 +99,40 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
  * We maintain a biased count of active stripes in the bottom 16 bits of
  * bi_phys_segments, and a count of processed stripes in the upper 16 bits
  */
-static inline int raid5_bi_phys_segments(struct bio *bio)
+static inline int raid5_bi_processed_stripes(struct bio *bio)
 {
-	return bio->bi_phys_segments & 0xffff;
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+	return (atomic_read(segments) >> 16) & 0xffff;
 }
 
-static inline int raid5_bi_hw_segments(struct bio *bio)
+static inline int raid5_dec_bi_active_stripes(struct bio *bio)
 {
-	return (bio->bi_phys_segments >> 16) & 0xffff;
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+	return atomic_sub_return(1, segments) & 0xffff;
 }
 
-static inline int raid5_dec_bi_phys_segments(struct bio *bio)
+static inline void raid5_inc_bi_active_stripes(struct bio *bio)
 {
-	--bio->bi_phys_segments;
-	return raid5_bi_phys_segments(bio);
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+	atomic_inc(segments);
 }
 
-static inline int raid5_dec_bi_hw_segments(struct bio *bio)
+static inline void raid5_set_bi_processed_stripes(struct bio *bio,
+	unsigned int cnt)
 {
-	unsigned short val = raid5_bi_hw_segments(bio);
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+	int old, new;
 
-	--val;
-	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
-	return val;
+	do {
+		old = atomic_read(segments);
+		new = (old & 0xffff) | (cnt << 16);
+	} while (atomic_cmpxchg(segments, old, new) != old);
 }
 
-static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
+static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
 {
-	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+	atomic_set(segments, cnt);
 }
 
 /* Find first data disk in a raid6 stripe */
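
The rewritten accessors treat bi_phys_segments as one 32-bit word updated atomically: the active-stripe reference count lives in bits 0-15 and the processed-stripe count in bits 16-31, so neither update needs device_lock any more. A userspace model of the packing, using C11 atomics in place of the kernel's atomic_t (illustrative only):

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_uint segments;	/* stands in for bio->bi_phys_segments */

	static unsigned active(void)    { return atomic_load(&segments) & 0xffff; }
	static unsigned processed(void) { return (atomic_load(&segments) >> 16) & 0xffff; }

	static void set_processed(unsigned cnt)
	{
		unsigned old = atomic_load(&segments);

		/* cmpxchg loop: replace the high half, preserve the low half */
		while (!atomic_compare_exchange_weak(&segments, &old,
						     (old & 0xffff) | (cnt << 16)))
			;
	}

	int main(void)
	{
		atomic_fetch_add(&segments, 3);	/* three stripes reference this bio */
		set_processed(2);
		printf("active=%u processed=%u\n", active(), processed());	/* 3 and 2 */
		return 0;
	}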
@@ -190,49 +196,56 @@ static int stripe_operations_active(struct stripe_head *sh)
 	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
-static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh)
 {
-	if (atomic_dec_and_test(&sh->count)) {
-		BUG_ON(!list_empty(&sh->lru));
-		BUG_ON(atomic_read(&conf->active_stripes)==0);
-		if (test_bit(STRIPE_HANDLE, &sh->state)) {
-			if (test_bit(STRIPE_DELAYED, &sh->state) &&
-			    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-				list_add_tail(&sh->lru, &conf->delayed_list);
-			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
-				   sh->bm_seq - conf->seq_write > 0)
-				list_add_tail(&sh->lru, &conf->bitmap_list);
-			else {
-				clear_bit(STRIPE_DELAYED, &sh->state);
-				clear_bit(STRIPE_BIT_DELAY, &sh->state);
-				list_add_tail(&sh->lru, &conf->handle_list);
-			}
-			md_wakeup_thread(conf->mddev->thread);
-		} else {
-			BUG_ON(stripe_operations_active(sh));
-			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
-				if (atomic_dec_return(&conf->preread_active_stripes)
-				    < IO_THRESHOLD)
-					md_wakeup_thread(conf->mddev->thread);
-			atomic_dec(&conf->active_stripes);
-			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
-				list_add_tail(&sh->lru, &conf->inactive_list);
-				wake_up(&conf->wait_for_stripe);
-				if (conf->retry_read_aligned)
-					md_wakeup_thread(conf->mddev->thread);
-			}
+	BUG_ON(!list_empty(&sh->lru));
+	BUG_ON(atomic_read(&conf->active_stripes)==0);
+	if (test_bit(STRIPE_HANDLE, &sh->state)) {
+		if (test_bit(STRIPE_DELAYED, &sh->state) &&
+		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+			list_add_tail(&sh->lru, &conf->delayed_list);
+		else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+			   sh->bm_seq - conf->seq_write > 0)
+			list_add_tail(&sh->lru, &conf->bitmap_list);
+		else {
+			clear_bit(STRIPE_DELAYED, &sh->state);
+			clear_bit(STRIPE_BIT_DELAY, &sh->state);
+			list_add_tail(&sh->lru, &conf->handle_list);
+		}
+		md_wakeup_thread(conf->mddev->thread);
+	} else {
+		BUG_ON(stripe_operations_active(sh));
+		if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+			if (atomic_dec_return(&conf->preread_active_stripes)
+			    < IO_THRESHOLD)
+				md_wakeup_thread(conf->mddev->thread);
+		atomic_dec(&conf->active_stripes);
+		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+			list_add_tail(&sh->lru, &conf->inactive_list);
+			wake_up(&conf->wait_for_stripe);
+			if (conf->retry_read_aligned)
+				md_wakeup_thread(conf->mddev->thread);
 		}
 	}
 }
 
+static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
+{
+	if (atomic_dec_and_test(&sh->count))
+		do_release_stripe(conf, sh);
+}
+
 static void release_stripe(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
 	unsigned long flags;
 
-	spin_lock_irqsave(&conf->device_lock, flags);
-	__release_stripe(conf, sh);
-	spin_unlock_irqrestore(&conf->device_lock, flags);
+	local_irq_save(flags);
+	if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
+		do_release_stripe(conf, sh);
+		spin_unlock(&conf->device_lock);
+	}
+	local_irq_restore(flags);
 }
 
 static inline void remove_hash(struct stripe_head *sh)
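
The release_stripe() fast path now relies on the classic dec-and-lock pattern: the reference count is dropped without device_lock, and the lock is taken only when the count might reach zero. A userspace sketch of atomic_dec_and_lock()'s semantics, using C11 atomics and a pthread mutex (illustrative only, ignoring the irq-disable the kernel version needs):

	#include <pthread.h>
	#include <stdatomic.h>

	/* Returns 1 with the lock held iff the count dropped to zero. */
	static int dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
	{
		int old = atomic_load(cnt);

		/* Fast path: while the count stays above 1, never touch the lock. */
		while (old > 1)
			if (atomic_compare_exchange_weak(cnt, &old, old - 1))
				return 0;

		/* Slow path: possibly the last reference; recheck under the lock. */
		pthread_mutex_lock(lock);
		if (atomic_fetch_sub(cnt, 1) == 1)
			return 1;	/* caller does the cleanup, then unlocks */
		pthread_mutex_unlock(lock);
		return 0;
	}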
@@ -640,6 +653,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		else
 			bi->bi_sector = (sh->sector
 					 + rdev->data_offset);
+		if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+			bi->bi_rw |= REQ_FLUSH;
+
 		bi->bi_flags = 1 << BIO_UPTODATE;
 		bi->bi_idx = 0;
 		bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -749,14 +765,12 @@ static void ops_complete_biofill(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
 	struct bio *return_bi = NULL;
-	struct r5conf *conf = sh->raid_conf;
 	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	/* clear completed biofills */
-	spin_lock_irq(&conf->device_lock);
 	for (i = sh->disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 
@@ -774,7 +788,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
 			while (rbi && rbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
-				if (!raid5_dec_bi_phys_segments(rbi)) {
+				if (!raid5_dec_bi_active_stripes(rbi)) {
 					rbi->bi_next = return_bi;
 					return_bi = rbi;
 				}
@@ -782,7 +796,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
 			}
 		}
 	}
-	spin_unlock_irq(&conf->device_lock);
 	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 
 	return_io(return_bi);
@@ -794,7 +807,6 @@ static void ops_complete_biofill(void *stripe_head_ref)
 static void ops_run_biofill(struct stripe_head *sh)
 {
 	struct dma_async_tx_descriptor *tx = NULL;
-	struct r5conf *conf = sh->raid_conf;
 	struct async_submit_ctl submit;
 	int i;
 
@@ -805,10 +817,10 @@ static void ops_run_biofill(struct stripe_head *sh)
 		struct r5dev *dev = &sh->dev[i];
 		if (test_bit(R5_Wantfill, &dev->flags)) {
 			struct bio *rbi;
-			spin_lock_irq(&conf->device_lock);
+			spin_lock_irq(&sh->stripe_lock);
 			dev->read = rbi = dev->toread;
 			dev->toread = NULL;
-			spin_unlock_irq(&conf->device_lock);
+			spin_unlock_irq(&sh->stripe_lock);
 			while (rbi && rbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				tx = async_copy_data(0, rbi, dev->page,
@@ -1144,12 +1156,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
 			struct bio *wbi;
 
-			spin_lock_irq(&sh->raid_conf->device_lock);
+			spin_lock_irq(&sh->stripe_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
-			spin_unlock_irq(&sh->raid_conf->device_lock);
+			spin_unlock_irq(&sh->stripe_lock);
 
 			while (wbi && wbi->bi_sector <
 				dev->sector + STRIPE_SECTORS) {
@@ -1454,6 +1466,8 @@ static int grow_one_stripe(struct r5conf *conf)
 	init_waitqueue_head(&sh->ops.wait_for_ops);
 	#endif
 
+	spin_lock_init(&sh->stripe_lock);
+
 	if (grow_buffers(sh)) {
 		shrink_buffers(sh);
 		kmem_cache_free(conf->slab_cache, sh);
@@ -1739,7 +1753,9 @@ static void raid5_end_read_request(struct bio * bi, int error)
 				atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
-		}
+		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
+
 		if (atomic_read(&rdev->read_errors))
 			atomic_set(&rdev->read_errors, 0);
 	} else {
@@ -1784,7 +1800,11 @@ static void raid5_end_read_request(struct bio * bi, int error)
 		else
 			retry = 1;
 		if (retry)
-			set_bit(R5_ReadError, &sh->dev[i].flags);
+			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
+				set_bit(R5_ReadError, &sh->dev[i].flags);
+				clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
+			} else
+				set_bit(R5_ReadNoMerge, &sh->dev[i].flags);
 		else {
 			clear_bit(R5_ReadError, &sh->dev[i].flags);
 			clear_bit(R5_ReWrite, &sh->dev[i].flags);
@@ -2340,11 +2360,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		(unsigned long long)bi->bi_sector,
 		(unsigned long long)sh->sector);
 
-
-	spin_lock_irq(&conf->device_lock);
+	/*
+	 * If several bios share a stripe, bi_phys_segments acts as a
+	 * reference count to avoid races. The reference count should already
+	 * be increased before this function is called (for example, in
+	 * make_request()), so other bios sharing this stripe will not free
+	 * the stripe. If a stripe is owned by one bio, the stripe lock will
+	 * protect it.
+	 */
+	spin_lock_irq(&sh->stripe_lock);
 	if (forwrite) {
 		bip = &sh->dev[dd_idx].towrite;
-		if (*bip == NULL && sh->dev[dd_idx].written == NULL)
+		if (*bip == NULL)
 			firstwrite = 1;
 	} else
 		bip = &sh->dev[dd_idx].toread;
@@ -2360,7 +2387,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 	if (*bip)
 		bi->bi_next = *bip;
 	*bip = bi;
-	bi->bi_phys_segments++;
+	raid5_inc_bi_active_stripes(bi);
 
 	if (forwrite) {
 		/* check if page is covered */
@@ -2375,7 +2402,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
 	}
-	spin_unlock_irq(&conf->device_lock);
+	spin_unlock_irq(&sh->stripe_lock);
 
 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
 		(unsigned long long)(*bip)->bi_sector,
@@ -2391,7 +2418,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 
  overlap:
 	set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
-	spin_unlock_irq(&conf->device_lock);
+	spin_unlock_irq(&sh->stripe_lock);
 	return 0;
 }
 
@@ -2441,10 +2468,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 		}
-		spin_lock_irq(&conf->device_lock);
+		spin_lock_irq(&sh->stripe_lock);
 		/* fail all writes first */
 		bi = sh->dev[i].towrite;
 		sh->dev[i].towrite = NULL;
+		spin_unlock_irq(&sh->stripe_lock);
 		if (bi) {
 			s->to_write--;
 			bitmap_end = 1;
@@ -2457,13 +2485,17 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		       sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
-			if (!raid5_dec_bi_phys_segments(bi)) {
+			if (!raid5_dec_bi_active_stripes(bi)) {
 				md_write_end(conf->mddev);
 				bi->bi_next = *return_bi;
 				*return_bi = bi;
 			}
 			bi = nextbi;
 		}
+		if (bitmap_end)
+			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+				STRIPE_SECTORS, 0, 0);
+		bitmap_end = 0;
 		/* and fail all 'written' */
 		bi = sh->dev[i].written;
 		sh->dev[i].written = NULL;
@@ -2472,7 +2504,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		       sh->dev[i].sector + STRIPE_SECTORS) {
 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
-			if (!raid5_dec_bi_phys_segments(bi)) {
+			if (!raid5_dec_bi_active_stripes(bi)) {
 				md_write_end(conf->mddev);
 				bi->bi_next = *return_bi;
 				*return_bi = bi;
@@ -2496,14 +2528,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				struct bio *nextbi =
 					r5_next_bio(bi, sh->dev[i].sector);
 				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (!raid5_dec_bi_phys_segments(bi)) {
+				if (!raid5_dec_bi_active_stripes(bi)) {
 					bi->bi_next = *return_bi;
 					*return_bi = bi;
 				}
 				bi = nextbi;
 			}
 		}
-		spin_unlock_irq(&conf->device_lock);
 		if (bitmap_end)
 			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
 					STRIPE_SECTORS, 0, 0);
@@ -2707,30 +2738,23 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 			    test_bit(R5_UPTODATE, &dev->flags)) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
-				int bitmap_end = 0;
 				pr_debug("Return write for disc %d\n", i);
-				spin_lock_irq(&conf->device_lock);
 				wbi = dev->written;
 				dev->written = NULL;
 				while (wbi && wbi->bi_sector <
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
-					if (!raid5_dec_bi_phys_segments(wbi)) {
+					if (!raid5_dec_bi_active_stripes(wbi)) {
 						md_write_end(conf->mddev);
 						wbi->bi_next = *return_bi;
 						*return_bi = wbi;
 					}
 					wbi = wbi2;
 				}
-				if (dev->towrite == NULL)
-					bitmap_end = 1;
-				spin_unlock_irq(&conf->device_lock);
-				if (bitmap_end)
-					bitmap_endwrite(conf->mddev->bitmap,
-							sh->sector,
-							STRIPE_SECTORS,
+				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+					STRIPE_SECTORS,
 					!test_bit(STRIPE_DEGRADED, &sh->state),
 					0);
 			}
 		}
 
@@ -3182,7 +3206,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
 	/* Now to look around and see what can be done */
 	rcu_read_lock();
-	spin_lock_irq(&conf->device_lock);
 	for (i=disks; i--; ) {
 		struct md_rdev *rdev;
 		sector_t first_bad;
@@ -3328,7 +3351,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 				do_recovery = 1;
 		}
 	}
-	spin_unlock_irq(&conf->device_lock);
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
 		/* If there is a failed device being replaced,
 		 * we must be recovering.
@@ -3791,7 +3813,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
 		 * this sets the active stripe count to 1 and the processed
 		 * stripe count to zero (upper 16 bits)
 		 */
-		bi->bi_phys_segments = 1; /* biased count of active stripes */
+		raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
 	}
 
 	return bi;
@@ -4113,7 +4135,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			finish_wait(&conf->wait_for_overlap, &w);
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
-			if ((bi->bi_rw & REQ_SYNC) &&
+			if ((bi->bi_rw & REQ_NOIDLE) &&
 			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			mddev_check_plugged(mddev);
@@ -4126,9 +4148,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 		}
 	}
 
-	spin_lock_irq(&conf->device_lock);
-	remaining = raid5_dec_bi_phys_segments(bi);
-	spin_unlock_irq(&conf->device_lock);
+	remaining = raid5_dec_bi_active_stripes(bi);
 	if (remaining == 0) {
 
 		if ( rw == WRITE )
@@ -4484,7 +4504,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		     sector += STRIPE_SECTORS,
 		     scnt++) {
 
-		if (scnt < raid5_bi_hw_segments(raid_bio))
+		if (scnt < raid5_bi_processed_stripes(raid_bio))
 			/* already done this stripe */
 			continue;
 
@@ -4492,25 +4512,24 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 
 		if (!sh) {
 			/* failed to get a stripe - must wait */
-			raid5_set_bi_hw_segments(raid_bio, scnt);
+			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
 			return handled;
 		}
 
 		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
 			release_stripe(sh);
-			raid5_set_bi_hw_segments(raid_bio, scnt);
+			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
 			return handled;
 		}
 
+		set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
 		handle_stripe(sh);
 		release_stripe(sh);
 		handled++;
 	}
-	spin_lock_irq(&conf->device_lock);
-	remaining = raid5_dec_bi_phys_segments(raid_bio);
-	spin_unlock_irq(&conf->device_lock);
+	remaining = raid5_dec_bi_active_stripes(raid_bio);
 	if (remaining == 0)
 		bio_endio(raid_bio, 0);
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
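Note on the new helpers used in the hunks above: raid5_inc_bi_active_stripes(), raid5_dec_bi_active_stripes(), raid5_bi_processed_stripes(), raid5_set_bi_processed_stripes() and raid5_set_bi_stripes() replace the device_lock-protected arithmetic that previously overloaded bi_phys_segments. A minimal sketch of the scheme they imply, assuming the committed raid5.h helpers follow the shape suggested by these call sites (the masks and the cmpxchg loop below are a reconstruction, not a quote of the committed code):

/* Would live in drivers/md/raid5.h.  bi_phys_segments is reused as one
 * atomic 32-bit word: bits 0-15 carry a biased count of stripes still
 * referencing the bio, bits 16-31 record how many stripes of an aligned
 * read have already been handled.
 */
static inline int raid5_bi_processed_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;

	return (atomic_read(segments) >> 16) & 0xffff;
}

static inline int raid5_dec_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;

	/* one atomic op; callers no longer need device_lock */
	return atomic_sub_return(1, segments) & 0xffff;
}

static inline void raid5_inc_bi_active_stripes(struct bio *bio)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;

	atomic_inc(segments);
}

static inline void raid5_set_bi_processed_stripes(struct bio *bio,
	unsigned int cnt)
{
	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
	int old, new;

	/* update only the upper half, leaving the active count untouched */
	do {
		old = atomic_read(segments);
		new = (old & 0xffff) | (cnt << 16);
	} while (atomic_cmpxchg(segments, old, new) != old);
}

Because every update is a single atomic operation, make_request() and retry_aligned_read() can drop the critical sections around the final decrement, which is exactly what the hunks above delete.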
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 2164021f3b5f..61dbb615c30b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -210,6 +210,7 @@ struct stripe_head {
 	int			disks;		/* disks in stripe */
 	enum check_states	check_state;
 	enum reconstruct_states reconstruct_state;
+	spinlock_t		stripe_lock;
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -273,6 +274,7 @@ enum r5dev_flags {
 	R5_Wantwrite,
 	R5_Overlap,	/* There is a pending overlapping request
 			 * on this block */
+	R5_ReadNoMerge, /* prevent bio from merging in block-layer */
 	R5_ReadError,	/* seen a read error here recently */
 	R5_ReWrite,	/* have tried to over-write the readerror */
 
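On R5_ReadNoMerge: the hunks above only set the flag (on the first failure of a merged read, and for aligned-read retries) and clear it again (on success, or when escalating to R5_ReadError). The hunk that actually makes the re-issued read unmergeable sits in ops_run_io() and is not part of this excerpt; a sketch of its likely shape, assuming the trick is to mark the bio so the block layer refuses to merge it (flush requests are never merged):

		/* when (re)issuing the per-device read in ops_run_io();
		 * this hunk is not shown above and is reconstructed from
		 * the commit subject "prevent bio from merging at block
		 * layer", so treat it as an assumption
		 */
		if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
			bi->bi_rw |= REQ_FLUSH;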