-rw-r--r--   Documentation/device-mapper/dm-raid.txt |  26
-rw-r--r--   drivers/md/dm-raid.c                    |  95
-rw-r--r--   drivers/md/md.c                         |   8
-rw-r--r--   drivers/md/raid1.c                      | 164
-rw-r--r--   drivers/md/raid1.h                      |  30
-rw-r--r--   drivers/md/raid10.c                     |  92
-rw-r--r--   drivers/md/raid10.h                     |  23
-rw-r--r--   drivers/md/raid5.c                      | 205
-rw-r--r--   drivers/md/raid5.h                      |   2
9 files changed, 426 insertions, 219 deletions
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 946c73342cde..1c1844957166 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
| @@ -27,6 +27,10 @@ The target is named "raid" and it accepts the following parameters: | |||
| 27 | - rotating parity N (right-to-left) with data restart | 27 | - rotating parity N (right-to-left) with data restart |
| 28 | raid6_nc RAID6 N continue | 28 | raid6_nc RAID6 N continue |
| 29 | - rotating parity N (right-to-left) with data continuation | 29 | - rotating parity N (right-to-left) with data continuation |
| 30 | raid10 Various RAID10 inspired algorithms chosen by additional params | ||
| 31 | - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') | ||
| 32 | - RAID1E: Integrated Adjacent Stripe Mirroring | ||
| 33 | - and other similar RAID10 variants | ||
| 30 | 34 | ||
| 31 | Reference: Chapter 4 of | 35 | Reference: Chapter 4 of |
| 32 | http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf | 36 | http://www.snia.org/sites/default/files/SNIA_DDF_Technical_Position_v2.0.pdf |
| @@ -59,6 +63,28 @@ The target is named "raid" and it accepts the following parameters: | |||
| 59 | logical size of the array. The bitmap records the device | 63 | logical size of the array. The bitmap records the device |
| 60 | synchronisation state for each region. | 64 | synchronisation state for each region. |
| 61 | 65 | ||
| 66 | [raid10_copies <# copies>] | ||
| 67 | [raid10_format near] | ||
| 68 | These two options are used to alter the default layout of | ||
| 69 | a RAID10 configuration. The number of copies can be | ||
| 70 | specified, but the default is 2. There are other variations | ||
| 71 | in how the copies are laid down - the default and only current | ||
| 72 | option is "near". Near copies are what most people think of | ||
| 73 | with respect to mirroring. If these options are left | ||
| 74 | unspecified, or 'raid10_copies 2' and/or 'raid10_format near' | ||
| 75 | are given, then the layouts for 2, 3 and 4 devices are: | ||
| 76 | 2 drives 3 drives 4 drives | ||
| 77 | -------- ---------- -------------- | ||
| 78 | A1 A1 A1 A1 A2 A1 A1 A2 A2 | ||
| 79 | A2 A2 A2 A3 A3 A3 A3 A4 A4 | ||
| 80 | A3 A3 A4 A4 A5 A5 A5 A6 A6 | ||
| 81 | A4 A4 A5 A6 A6 A7 A7 A8 A8 | ||
| 82 | .. .. .. .. .. .. .. .. .. | ||
| 83 | The 2-device layout is equivalent to 2-way RAID1. The 4-device | ||
| 84 | layout is what a traditional RAID10 would look like. The | ||
| 85 | 3-device layout is what might be called a 'RAID1E - Integrated | ||
| 86 | Adjacent Stripe Mirroring'. | ||
| 87 | |||
| 62 | <#raid_devs>: The number of devices composing the array. | 88 | <#raid_devs>: The number of devices composing the array. |
| 63 | Each device consists of two entries. The first is the device | 89 | Each device consists of two entries. The first is the device |
| 64 | containing the metadata (if any); the second is the one containing the | 90 | containing the metadata (if any); the second is the one containing the |
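As an illustration only (not part of the patch, and with made-up device numbers and sizes), a raid10 mapping using the new options could be specified with a table line of the following form, in the same style as the existing "Example tables" section of dm-raid.txt:

    0 1953125000 raid \
            raid10 3 64 raid10_copies 2 \
            4 - 8:17 - 8:33 - 8:49 - 8:65

Here "3" counts the raid parameters (chunk size 64 sectors plus "raid10_copies 2"), and two copies across four data devices give the traditional striped-mirror layout shown in the 4-drive diagram above.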
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index f2f29c526544..982e3e390c45 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
| @@ -11,6 +11,7 @@ | |||
| 11 | #include "md.h" | 11 | #include "md.h" |
| 12 | #include "raid1.h" | 12 | #include "raid1.h" |
| 13 | #include "raid5.h" | 13 | #include "raid5.h" |
| 14 | #include "raid10.h" | ||
| 14 | #include "bitmap.h" | 15 | #include "bitmap.h" |
| 15 | 16 | ||
| 16 | #include <linux/device-mapper.h> | 17 | #include <linux/device-mapper.h> |
| @@ -52,7 +53,10 @@ struct raid_dev { | |||
| 52 | #define DMPF_MAX_RECOVERY_RATE 0x20 | 53 | #define DMPF_MAX_RECOVERY_RATE 0x20 |
| 53 | #define DMPF_MAX_WRITE_BEHIND 0x40 | 54 | #define DMPF_MAX_WRITE_BEHIND 0x40 |
| 54 | #define DMPF_STRIPE_CACHE 0x80 | 55 | #define DMPF_STRIPE_CACHE 0x80 |
| 55 | #define DMPF_REGION_SIZE 0X100 | 56 | #define DMPF_REGION_SIZE 0x100 |
| 57 | #define DMPF_RAID10_COPIES 0x200 | ||
| 58 | #define DMPF_RAID10_FORMAT 0x400 | ||
| 59 | |||
| 56 | struct raid_set { | 60 | struct raid_set { |
| 57 | struct dm_target *ti; | 61 | struct dm_target *ti; |
| 58 | 62 | ||
| @@ -76,6 +80,7 @@ static struct raid_type { | |||
| 76 | const unsigned algorithm; /* RAID algorithm. */ | 80 | const unsigned algorithm; /* RAID algorithm. */ |
| 77 | } raid_types[] = { | 81 | } raid_types[] = { |
| 78 | {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, | 82 | {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, |
| 83 | {"raid10", "RAID10 (striped mirrors)", 0, 2, 10, UINT_MAX /* Varies */}, | ||
| 79 | {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, | 84 | {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, |
| 80 | {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, | 85 | {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, |
| 81 | {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, | 86 | {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, |
| @@ -86,6 +91,17 @@ static struct raid_type { | |||
| 86 | {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} | 91 | {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} |
| 87 | }; | 92 | }; |
| 88 | 93 | ||
| 94 | static unsigned raid10_md_layout_to_copies(int layout) | ||
| 95 | { | ||
| 96 | return layout & 0xFF; | ||
| 97 | } | ||
| 98 | |||
| 99 | static int raid10_format_to_md_layout(char *format, unsigned copies) | ||
| 100 | { | ||
| 101 | /* 1 "far" copy, and 'copies' "near" copies */ | ||
| 102 | return (1 << 8) | (copies & 0xFF); | ||
| 103 | } | ||
| 104 | |||
| 89 | static struct raid_type *get_raid_type(char *name) | 105 | static struct raid_type *get_raid_type(char *name) |
| 90 | { | 106 | { |
| 91 | int i; | 107 | int i; |
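The two helpers added above pack the RAID10 geometry into md's layout word: the low byte holds the number of "near" copies, and bits 8-15 hold the "far" copy count (fixed at 1 here, i.e. no extra far copies). A small userspace sketch of the same arithmetic, for illustration only:

    #include <assert.h>

    /* Userspace copies of the helpers above, just to sanity-check the packing. */
    static unsigned layout_to_copies(int layout)
    {
            return layout & 0xFF;                   /* near copies live in the low byte */
    }

    static int format_to_layout(unsigned copies)
    {
            return (1 << 8) | (copies & 0xFF);      /* 1 "far" copy, 'copies' "near" copies */
    }

    int main(void)
    {
            int layout = format_to_layout(2);       /* the default: "near" format, 2 copies */
            assert(layout == 0x102);
            assert(layout_to_copies(layout) == 2);
            return 0;
    }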
| @@ -339,10 +355,16 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) | |||
| 339 | * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) | 355 | * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) |
| 340 | * [stripe_cache <sectors>] Stripe cache size for higher RAIDs | 356 | * [stripe_cache <sectors>] Stripe cache size for higher RAIDs |
| 341 | * [region_size <sectors>] Defines granularity of bitmap | 357 | * [region_size <sectors>] Defines granularity of bitmap |
| 358 | * | ||
| 359 | * RAID10-only options: | ||
| 360 | * [raid10_copies <# copies>] Number of copies. (Default: 2) | ||
| 361 | * [raid10_format <near>] Layout algorithm. (Default: near) | ||
| 342 | */ | 362 | */ |
| 343 | static int parse_raid_params(struct raid_set *rs, char **argv, | 363 | static int parse_raid_params(struct raid_set *rs, char **argv, |
| 344 | unsigned num_raid_params) | 364 | unsigned num_raid_params) |
| 345 | { | 365 | { |
| 366 | char *raid10_format = "near"; | ||
| 367 | unsigned raid10_copies = 2; | ||
| 346 | unsigned i, rebuild_cnt = 0; | 368 | unsigned i, rebuild_cnt = 0; |
| 347 | unsigned long value, region_size = 0; | 369 | unsigned long value, region_size = 0; |
| 348 | sector_t sectors_per_dev = rs->ti->len; | 370 | sector_t sectors_per_dev = rs->ti->len; |
| @@ -416,11 +438,28 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
| 416 | } | 438 | } |
| 417 | 439 | ||
| 418 | key = argv[i++]; | 440 | key = argv[i++]; |
| 441 | |||
| 442 | /* Parameters that take a string value are checked here. */ | ||
| 443 | if (!strcasecmp(key, "raid10_format")) { | ||
| 444 | if (rs->raid_type->level != 10) { | ||
| 445 | rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; | ||
| 446 | return -EINVAL; | ||
| 447 | } | ||
| 448 | if (strcmp("near", argv[i])) { | ||
| 449 | rs->ti->error = "Invalid 'raid10_format' value given"; | ||
| 450 | return -EINVAL; | ||
| 451 | } | ||
| 452 | raid10_format = argv[i]; | ||
| 453 | rs->print_flags |= DMPF_RAID10_FORMAT; | ||
| 454 | continue; | ||
| 455 | } | ||
| 456 | |||
| 419 | if (strict_strtoul(argv[i], 10, &value) < 0) { | 457 | if (strict_strtoul(argv[i], 10, &value) < 0) { |
| 420 | rs->ti->error = "Bad numerical argument given in raid params"; | 458 | rs->ti->error = "Bad numerical argument given in raid params"; |
| 421 | return -EINVAL; | 459 | return -EINVAL; |
| 422 | } | 460 | } |
| 423 | 461 | ||
| 462 | /* Parameters that take a numeric value are checked here */ | ||
| 424 | if (!strcasecmp(key, "rebuild")) { | 463 | if (!strcasecmp(key, "rebuild")) { |
| 425 | rebuild_cnt++; | 464 | rebuild_cnt++; |
| 426 | 465 | ||
| @@ -439,6 +478,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
| 439 | return -EINVAL; | 478 | return -EINVAL; |
| 440 | } | 479 | } |
| 441 | break; | 480 | break; |
| 481 | case 10: | ||
| 442 | default: | 482 | default: |
| 443 | DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); | 483 | DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name); |
| 444 | rs->ti->error = "Rebuild not supported for this RAID type"; | 484 | rs->ti->error = "Rebuild not supported for this RAID type"; |
| @@ -495,7 +535,8 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
| 495 | */ | 535 | */ |
| 496 | value /= 2; | 536 | value /= 2; |
| 497 | 537 | ||
| 498 | if (rs->raid_type->level < 5) { | 538 | if ((rs->raid_type->level != 5) && |
| 539 | (rs->raid_type->level != 6)) { | ||
| 499 | rs->ti->error = "Inappropriate argument: stripe_cache"; | 540 | rs->ti->error = "Inappropriate argument: stripe_cache"; |
| 500 | return -EINVAL; | 541 | return -EINVAL; |
| 501 | } | 542 | } |
| @@ -520,6 +561,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
| 520 | } else if (!strcasecmp(key, "region_size")) { | 561 | } else if (!strcasecmp(key, "region_size")) { |
| 521 | rs->print_flags |= DMPF_REGION_SIZE; | 562 | rs->print_flags |= DMPF_REGION_SIZE; |
| 522 | region_size = value; | 563 | region_size = value; |
| 564 | } else if (!strcasecmp(key, "raid10_copies") && | ||
| 565 | (rs->raid_type->level == 10)) { | ||
| 566 | if ((value < 2) || (value > 0xFF)) { | ||
| 567 | rs->ti->error = "Bad value for 'raid10_copies'"; | ||
| 568 | return -EINVAL; | ||
| 569 | } | ||
| 570 | rs->print_flags |= DMPF_RAID10_COPIES; | ||
| 571 | raid10_copies = value; | ||
| 523 | } else { | 572 | } else { |
| 524 | DMERR("Unable to parse RAID parameter: %s", key); | 573 | DMERR("Unable to parse RAID parameter: %s", key); |
| 525 | rs->ti->error = "Unable to parse RAID parameters"; | 574 | rs->ti->error = "Unable to parse RAID parameters"; |
| @@ -538,8 +587,22 @@ static int parse_raid_params(struct raid_set *rs, char **argv, | |||
| 538 | if (dm_set_target_max_io_len(rs->ti, max_io_len)) | 587 | if (dm_set_target_max_io_len(rs->ti, max_io_len)) |
| 539 | return -EINVAL; | 588 | return -EINVAL; |
| 540 | 589 | ||
| 541 | if ((rs->raid_type->level > 1) && | 590 | if (rs->raid_type->level == 10) { |
| 542 | sector_div(sectors_per_dev, (rs->md.raid_disks - rs->raid_type->parity_devs))) { | 591 | if (raid10_copies > rs->md.raid_disks) { |
| 592 | rs->ti->error = "Not enough devices to satisfy specification"; | ||
| 593 | return -EINVAL; | ||
| 594 | } | ||
| 595 | |||
| 596 | /* (Len * #mirrors) / #devices */ | ||
| 597 | sectors_per_dev = rs->ti->len * raid10_copies; | ||
| 598 | sector_div(sectors_per_dev, rs->md.raid_disks); | ||
| 599 | |||
| 600 | rs->md.layout = raid10_format_to_md_layout(raid10_format, | ||
| 601 | raid10_copies); | ||
| 602 | rs->md.new_layout = rs->md.layout; | ||
| 603 | } else if ((rs->raid_type->level > 1) && | ||
| 604 | sector_div(sectors_per_dev, | ||
| 605 | (rs->md.raid_disks - rs->raid_type->parity_devs))) { | ||
| 543 | rs->ti->error = "Target length not divisible by number of data devices"; | 606 | rs->ti->error = "Target length not divisible by number of data devices"; |
| 544 | return -EINVAL; | 607 | return -EINVAL; |
| 545 | } | 608 | } |
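For the RAID10 branch above, the per-device size works out as (target length × copies) / devices. A trivial userspace check of that arithmetic, with made-up numbers:

    #include <assert.h>

    typedef unsigned long long sector_t;    /* stand-in for the kernel type */

    int main(void)
    {
            sector_t ti_len = 1000000;      /* hypothetical target length in sectors */
            unsigned raid_disks = 4, raid10_copies = 2;

            /* (Len * #mirrors) / #devices, as in parse_raid_params() */
            sector_t sectors_per_dev = ti_len * raid10_copies / raid_disks;

            assert(sectors_per_dev == 500000);      /* each disk holds half the array */
            return 0;
    }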
| @@ -566,6 +629,9 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) | |||
| 566 | if (rs->raid_type->level == 1) | 629 | if (rs->raid_type->level == 1) |
| 567 | return md_raid1_congested(&rs->md, bits); | 630 | return md_raid1_congested(&rs->md, bits); |
| 568 | 631 | ||
| 632 | if (rs->raid_type->level == 10) | ||
| 633 | return md_raid10_congested(&rs->md, bits); | ||
| 634 | |||
| 569 | return md_raid5_congested(&rs->md, bits); | 635 | return md_raid5_congested(&rs->md, bits); |
| 570 | } | 636 | } |
| 571 | 637 | ||
| @@ -884,6 +950,9 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) | |||
| 884 | case 6: | 950 | case 6: |
| 885 | redundancy = rs->raid_type->parity_devs; | 951 | redundancy = rs->raid_type->parity_devs; |
| 886 | break; | 952 | break; |
| 953 | case 10: | ||
| 954 | redundancy = raid10_md_layout_to_copies(mddev->layout) - 1; | ||
| 955 | break; | ||
| 887 | default: | 956 | default: |
| 888 | ti->error = "Unknown RAID type"; | 957 | ti->error = "Unknown RAID type"; |
| 889 | return -EINVAL; | 958 | return -EINVAL; |
| @@ -1049,12 +1118,19 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
| 1049 | goto bad; | 1118 | goto bad; |
| 1050 | } | 1119 | } |
| 1051 | 1120 | ||
| 1121 | if (ti->len != rs->md.array_sectors) { | ||
| 1122 | ti->error = "Array size does not match requested target length"; | ||
| 1123 | ret = -EINVAL; | ||
| 1124 | goto size_mismatch; | ||
| 1125 | } | ||
| 1052 | rs->callbacks.congested_fn = raid_is_congested; | 1126 | rs->callbacks.congested_fn = raid_is_congested; |
| 1053 | dm_table_add_target_callbacks(ti->table, &rs->callbacks); | 1127 | dm_table_add_target_callbacks(ti->table, &rs->callbacks); |
| 1054 | 1128 | ||
| 1055 | mddev_suspend(&rs->md); | 1129 | mddev_suspend(&rs->md); |
| 1056 | return 0; | 1130 | return 0; |
| 1057 | 1131 | ||
| 1132 | size_mismatch: | ||
| 1133 | md_stop(&rs->md); | ||
| 1058 | bad: | 1134 | bad: |
| 1059 | context_free(rs); | 1135 | context_free(rs); |
| 1060 | 1136 | ||
| @@ -1203,6 +1279,13 @@ static int raid_status(struct dm_target *ti, status_type_t type, | |||
| 1203 | DMEMIT(" region_size %lu", | 1279 | DMEMIT(" region_size %lu", |
| 1204 | rs->md.bitmap_info.chunksize >> 9); | 1280 | rs->md.bitmap_info.chunksize >> 9); |
| 1205 | 1281 | ||
| 1282 | if (rs->print_flags & DMPF_RAID10_COPIES) | ||
| 1283 | DMEMIT(" raid10_copies %u", | ||
| 1284 | raid10_md_layout_to_copies(rs->md.layout)); | ||
| 1285 | |||
| 1286 | if (rs->print_flags & DMPF_RAID10_FORMAT) | ||
| 1287 | DMEMIT(" raid10_format near"); | ||
| 1288 | |||
| 1206 | DMEMIT(" %d", rs->md.raid_disks); | 1289 | DMEMIT(" %d", rs->md.raid_disks); |
| 1207 | for (i = 0; i < rs->md.raid_disks; i++) { | 1290 | for (i = 0; i < rs->md.raid_disks; i++) { |
| 1208 | if (rs->dev[i].meta_dev) | 1291 | if (rs->dev[i].meta_dev) |
| @@ -1277,7 +1360,7 @@ static void raid_resume(struct dm_target *ti) | |||
| 1277 | 1360 | ||
| 1278 | static struct target_type raid_target = { | 1361 | static struct target_type raid_target = { |
| 1279 | .name = "raid", | 1362 | .name = "raid", |
| 1280 | .version = {1, 2, 0}, | 1363 | .version = {1, 3, 0}, |
| 1281 | .module = THIS_MODULE, | 1364 | .module = THIS_MODULE, |
| 1282 | .ctr = raid_ctr, | 1365 | .ctr = raid_ctr, |
| 1283 | .dtr = raid_dtr, | 1366 | .dtr = raid_dtr, |
| @@ -1304,6 +1387,8 @@ module_init(dm_raid_init); | |||
| 1304 | module_exit(dm_raid_exit); | 1387 | module_exit(dm_raid_exit); |
| 1305 | 1388 | ||
| 1306 | MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); | 1389 | MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target"); |
| 1390 | MODULE_ALIAS("dm-raid1"); | ||
| 1391 | MODULE_ALIAS("dm-raid10"); | ||
| 1307 | MODULE_ALIAS("dm-raid4"); | 1392 | MODULE_ALIAS("dm-raid4"); |
| 1308 | MODULE_ALIAS("dm-raid5"); | 1393 | MODULE_ALIAS("dm-raid5"); |
| 1309 | MODULE_ALIAS("dm-raid6"); | 1394 | MODULE_ALIAS("dm-raid6"); |
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d5ab4493c8be..f6c46109b071 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
| @@ -3942,17 +3942,13 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) | |||
| 3942 | break; | 3942 | break; |
| 3943 | case clear: | 3943 | case clear: |
| 3944 | /* stopping an active array */ | 3944 | /* stopping an active array */ |
| 3945 | if (atomic_read(&mddev->openers) > 0) | ||
| 3946 | return -EBUSY; | ||
| 3947 | err = do_md_stop(mddev, 0, NULL); | 3945 | err = do_md_stop(mddev, 0, NULL); |
| 3948 | break; | 3946 | break; |
| 3949 | case inactive: | 3947 | case inactive: |
| 3950 | /* stopping an active array */ | 3948 | /* stopping an active array */ |
| 3951 | if (mddev->pers) { | 3949 | if (mddev->pers) |
| 3952 | if (atomic_read(&mddev->openers) > 0) | ||
| 3953 | return -EBUSY; | ||
| 3954 | err = do_md_stop(mddev, 2, NULL); | 3950 | err = do_md_stop(mddev, 2, NULL); |
| 3955 | } else | 3951 | else |
| 3956 | err = 0; /* already inactive */ | 3952 | err = 0; /* already inactive */ |
| 3957 | break; | 3953 | break; |
| 3958 | case suspended: | 3954 | case suspended: |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cacd008d6864..197f62681db5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
| @@ -46,6 +46,20 @@ | |||
| 46 | */ | 46 | */ |
| 47 | #define NR_RAID1_BIOS 256 | 47 | #define NR_RAID1_BIOS 256 |
| 48 | 48 | ||
| 49 | /* when we get a read error on a read-only array, we redirect to another | ||
| 50 | * device without failing the first device, or trying to over-write to | ||
| 51 | * correct the read error. To keep track of bad blocks on a per-bio | ||
| 52 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | ||
| 53 | */ | ||
| 54 | #define IO_BLOCKED ((struct bio *)1) | ||
| 55 | /* When we successfully write to a known bad-block, we need to remove the | ||
| 56 | * bad-block marking which must be done from process context. So we record | ||
| 57 | * the success by setting devs[n].bio to IO_MADE_GOOD | ||
| 58 | */ | ||
| 59 | #define IO_MADE_GOOD ((struct bio *)2) | ||
| 60 | |||
| 61 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
| 62 | |||
| 49 | /* When there are this many requests queue to be written by | 63 | /* When there are this many requests queue to be written by |
| 50 | * the raid1 thread, we become 'congested' to provide back-pressure | 64 | * the raid1 thread, we become 'congested' to provide back-pressure |
| 51 | * for writeback. | 65 | * for writeback. |
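For context, a sketch of how code elsewhere in the driver typically consumes these sentinel values; this fragment is illustrative only and is not taken from the patch:

    struct bio *b = r1_bio->bios[disk];

    if (BIO_SPECIAL(b)) {
            if (b == IO_MADE_GOOD) {
                    /* a write over a known bad block succeeded; the
                     * bad-block record is cleared later, from process
                     * context (the raid1d thread) */
            } else if (b == IO_BLOCKED) {
                    /* the read was redirected away from this device;
                     * don't fail it or try to rewrite the block */
            }
            /* NULL also tests true here, so nothing gets dereferenced */
    } else {
            generic_make_request(b);        /* an ordinary bio: submit it */
    }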
| @@ -483,12 +497,14 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
| 483 | const sector_t this_sector = r1_bio->sector; | 497 | const sector_t this_sector = r1_bio->sector; |
| 484 | int sectors; | 498 | int sectors; |
| 485 | int best_good_sectors; | 499 | int best_good_sectors; |
| 486 | int start_disk; | 500 | int best_disk, best_dist_disk, best_pending_disk; |
| 487 | int best_disk; | 501 | int has_nonrot_disk; |
| 488 | int i; | 502 | int disk; |
| 489 | sector_t best_dist; | 503 | sector_t best_dist; |
| 504 | unsigned int min_pending; | ||
| 490 | struct md_rdev *rdev; | 505 | struct md_rdev *rdev; |
| 491 | int choose_first; | 506 | int choose_first; |
| 507 | int choose_next_idle; | ||
| 492 | 508 | ||
| 493 | rcu_read_lock(); | 509 | rcu_read_lock(); |
| 494 | /* | 510 | /* |
| @@ -499,26 +515,26 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
| 499 | retry: | 515 | retry: |
| 500 | sectors = r1_bio->sectors; | 516 | sectors = r1_bio->sectors; |
| 501 | best_disk = -1; | 517 | best_disk = -1; |
| 518 | best_dist_disk = -1; | ||
| 502 | best_dist = MaxSector; | 519 | best_dist = MaxSector; |
| 520 | best_pending_disk = -1; | ||
| 521 | min_pending = UINT_MAX; | ||
| 503 | best_good_sectors = 0; | 522 | best_good_sectors = 0; |
| 523 | has_nonrot_disk = 0; | ||
| 524 | choose_next_idle = 0; | ||
| 504 | 525 | ||
| 505 | if (conf->mddev->recovery_cp < MaxSector && | 526 | if (conf->mddev->recovery_cp < MaxSector && |
| 506 | (this_sector + sectors >= conf->next_resync)) { | 527 | (this_sector + sectors >= conf->next_resync)) |
| 507 | choose_first = 1; | 528 | choose_first = 1; |
| 508 | start_disk = 0; | 529 | else |
| 509 | } else { | ||
| 510 | choose_first = 0; | 530 | choose_first = 0; |
| 511 | start_disk = conf->last_used; | ||
| 512 | } | ||
| 513 | 531 | ||
| 514 | for (i = 0 ; i < conf->raid_disks * 2 ; i++) { | 532 | for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { |
| 515 | sector_t dist; | 533 | sector_t dist; |
| 516 | sector_t first_bad; | 534 | sector_t first_bad; |
| 517 | int bad_sectors; | 535 | int bad_sectors; |
| 518 | 536 | unsigned int pending; | |
| 519 | int disk = start_disk + i; | 537 | bool nonrot; |
| 520 | if (disk >= conf->raid_disks * 2) | ||
| 521 | disk -= conf->raid_disks * 2; | ||
| 522 | 538 | ||
| 523 | rdev = rcu_dereference(conf->mirrors[disk].rdev); | 539 | rdev = rcu_dereference(conf->mirrors[disk].rdev); |
| 524 | if (r1_bio->bios[disk] == IO_BLOCKED | 540 | if (r1_bio->bios[disk] == IO_BLOCKED |
| @@ -577,22 +593,77 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
| 577 | } else | 593 | } else |
| 578 | best_good_sectors = sectors; | 594 | best_good_sectors = sectors; |
| 579 | 595 | ||
| 596 | nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); | ||
| 597 | has_nonrot_disk |= nonrot; | ||
| 598 | pending = atomic_read(&rdev->nr_pending); | ||
| 580 | dist = abs(this_sector - conf->mirrors[disk].head_position); | 599 | dist = abs(this_sector - conf->mirrors[disk].head_position); |
| 581 | if (choose_first | 600 | if (choose_first) { |
| 582 | /* Don't change to another disk for sequential reads */ | 601 | best_disk = disk; |
| 583 | || conf->next_seq_sect == this_sector | 602 | break; |
| 584 | || dist == 0 | 603 | } |
| 585 | /* If device is idle, use it */ | 604 | /* Don't change to another disk for sequential reads */ |
| 586 | || atomic_read(&rdev->nr_pending) == 0) { | 605 | if (conf->mirrors[disk].next_seq_sect == this_sector |
| 606 | || dist == 0) { | ||
| 607 | int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; | ||
| 608 | struct raid1_info *mirror = &conf->mirrors[disk]; | ||
| 609 | |||
| 610 | best_disk = disk; | ||
| 611 | /* | ||
| 612 | * If buffered sequential IO size exceeds optimal | ||
| 613 | * iosize, check if there is idle disk. If yes, choose | ||
| 614 | * the idle disk. read_balance could already choose an | ||
| 615 | * idle disk before noticing it's a sequential IO in | ||
| 616 | * this disk. This doesn't matter because this disk | ||
| 617 | * will idle, next time it will be utilized after the | ||
| 618 | * first disk has IO size exceeds optimal iosize. In | ||
| 619 | * this way, iosize of the first disk will be optimal | ||
| 620 | * iosize at least. iosize of the second disk might be | ||
| 621 | * small, but not a big deal since when the second disk | ||
| 622 | * starts IO, the first disk is likely still busy. | ||
| 623 | */ | ||
| 624 | if (nonrot && opt_iosize > 0 && | ||
| 625 | mirror->seq_start != MaxSector && | ||
| 626 | mirror->next_seq_sect > opt_iosize && | ||
| 627 | mirror->next_seq_sect - opt_iosize >= | ||
| 628 | mirror->seq_start) { | ||
| 629 | choose_next_idle = 1; | ||
| 630 | continue; | ||
| 631 | } | ||
| 632 | break; | ||
| 633 | } | ||
| 634 | /* If device is idle, use it */ | ||
| 635 | if (pending == 0) { | ||
| 587 | best_disk = disk; | 636 | best_disk = disk; |
| 588 | break; | 637 | break; |
| 589 | } | 638 | } |
| 639 | |||
| 640 | if (choose_next_idle) | ||
| 641 | continue; | ||
| 642 | |||
| 643 | if (min_pending > pending) { | ||
| 644 | min_pending = pending; | ||
| 645 | best_pending_disk = disk; | ||
| 646 | } | ||
| 647 | |||
| 590 | if (dist < best_dist) { | 648 | if (dist < best_dist) { |
| 591 | best_dist = dist; | 649 | best_dist = dist; |
| 592 | best_disk = disk; | 650 | best_dist_disk = disk; |
| 593 | } | 651 | } |
| 594 | } | 652 | } |
| 595 | 653 | ||
| 654 | /* | ||
| 655 | * If all disks are rotational, choose the closest disk. If any disk is | ||
| 656 | * non-rotational, choose the disk with the fewest pending requests, even if | ||
| 657 | * that disk is rotational. This may or may not be optimal for arrays with | ||
| 658 | * mixed rotational/non-rotational disks, depending on workload. | ||
| 659 | */ | ||
| 660 | if (best_disk == -1) { | ||
| 661 | if (has_nonrot_disk) | ||
| 662 | best_disk = best_pending_disk; | ||
| 663 | else | ||
| 664 | best_disk = best_dist_disk; | ||
| 665 | } | ||
| 666 | |||
| 596 | if (best_disk >= 0) { | 667 | if (best_disk >= 0) { |
| 597 | rdev = rcu_dereference(conf->mirrors[best_disk].rdev); | 668 | rdev = rcu_dereference(conf->mirrors[best_disk].rdev); |
| 598 | if (!rdev) | 669 | if (!rdev) |
| @@ -606,8 +677,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
| 606 | goto retry; | 677 | goto retry; |
| 607 | } | 678 | } |
| 608 | sectors = best_good_sectors; | 679 | sectors = best_good_sectors; |
| 609 | conf->next_seq_sect = this_sector + sectors; | 680 | |
| 610 | conf->last_used = best_disk; | 681 | if (conf->mirrors[best_disk].next_seq_sect != this_sector) |
| 682 | conf->mirrors[best_disk].seq_start = this_sector; | ||
| 683 | |||
| 684 | conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; | ||
| 611 | } | 685 | } |
| 612 | rcu_read_unlock(); | 686 | rcu_read_unlock(); |
| 613 | *max_sectors = sectors; | 687 | *max_sectors = sectors; |
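The fallback added above boils down to a simple policy: if every member is a spinning disk, minimise seek distance; if any member is non-rotational, minimise queue depth instead. A userspace sketch of just that decision, with names local to this example:

    /* Mirrors the fallback added to read_balance() above. */
    static int pick_fallback_disk(int has_nonrot_disk,
                                  int best_pending_disk, int best_dist_disk)
    {
            if (has_nonrot_disk)
                    return best_pending_disk;       /* seek time is irrelevant on SSDs */
            return best_dist_disk;                  /* minimise head movement otherwise */
    }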
| @@ -873,7 +947,7 @@ do_sync_io: | |||
| 873 | static void make_request(struct mddev *mddev, struct bio * bio) | 947 | static void make_request(struct mddev *mddev, struct bio * bio) |
| 874 | { | 948 | { |
| 875 | struct r1conf *conf = mddev->private; | 949 | struct r1conf *conf = mddev->private; |
| 876 | struct mirror_info *mirror; | 950 | struct raid1_info *mirror; |
| 877 | struct r1bio *r1_bio; | 951 | struct r1bio *r1_bio; |
| 878 | struct bio *read_bio; | 952 | struct bio *read_bio; |
| 879 | int i, disks; | 953 | int i, disks; |
| @@ -1364,7 +1438,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1364 | struct r1conf *conf = mddev->private; | 1438 | struct r1conf *conf = mddev->private; |
| 1365 | int err = -EEXIST; | 1439 | int err = -EEXIST; |
| 1366 | int mirror = 0; | 1440 | int mirror = 0; |
| 1367 | struct mirror_info *p; | 1441 | struct raid1_info *p; |
| 1368 | int first = 0; | 1442 | int first = 0; |
| 1369 | int last = conf->raid_disks - 1; | 1443 | int last = conf->raid_disks - 1; |
| 1370 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 1444 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
| @@ -1433,7 +1507,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1433 | struct r1conf *conf = mddev->private; | 1507 | struct r1conf *conf = mddev->private; |
| 1434 | int err = 0; | 1508 | int err = 0; |
| 1435 | int number = rdev->raid_disk; | 1509 | int number = rdev->raid_disk; |
| 1436 | struct mirror_info *p = conf->mirrors+ number; | 1510 | struct raid1_info *p = conf->mirrors + number; |
| 1437 | 1511 | ||
| 1438 | if (rdev != p->rdev) | 1512 | if (rdev != p->rdev) |
| 1439 | p = conf->mirrors + conf->raid_disks + number; | 1513 | p = conf->mirrors + conf->raid_disks + number; |
| @@ -2371,6 +2445,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp | |||
| 2371 | bio->bi_rw = READ; | 2445 | bio->bi_rw = READ; |
| 2372 | bio->bi_end_io = end_sync_read; | 2446 | bio->bi_end_io = end_sync_read; |
| 2373 | read_targets++; | 2447 | read_targets++; |
| 2448 | } else if (!test_bit(WriteErrorSeen, &rdev->flags) && | ||
| 2449 | test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && | ||
| 2450 | !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { | ||
| 2451 | /* | ||
| 2452 | * The device is suitable for reading (InSync), | ||
| 2453 | * but has bad block(s) here. Let's try to correct them, | ||
| 2454 | * if we are doing resync or repair. Otherwise, leave | ||
| 2455 | * this device alone for this sync request. | ||
| 2456 | */ | ||
| 2457 | bio->bi_rw = WRITE; | ||
| 2458 | bio->bi_end_io = end_sync_write; | ||
| 2459 | write_targets++; | ||
| 2374 | } | 2460 | } |
| 2375 | } | 2461 | } |
| 2376 | if (bio->bi_end_io) { | 2462 | if (bio->bi_end_io) { |
| @@ -2428,7 +2514,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp | |||
| 2428 | /* There is nowhere to write, so all non-sync | 2514 | /* There is nowhere to write, so all non-sync |
| 2429 | * drives must be failed - so we are finished | 2515 | * drives must be failed - so we are finished |
| 2430 | */ | 2516 | */ |
| 2431 | sector_t rv = max_sector - sector_nr; | 2517 | sector_t rv; |
| 2518 | if (min_bad > 0) | ||
| 2519 | max_sector = sector_nr + min_bad; | ||
| 2520 | rv = max_sector - sector_nr; | ||
| 2432 | *skipped = 1; | 2521 | *skipped = 1; |
| 2433 | put_buf(r1_bio); | 2522 | put_buf(r1_bio); |
| 2434 | return rv; | 2523 | return rv; |
| @@ -2521,7 +2610,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2521 | { | 2610 | { |
| 2522 | struct r1conf *conf; | 2611 | struct r1conf *conf; |
| 2523 | int i; | 2612 | int i; |
| 2524 | struct mirror_info *disk; | 2613 | struct raid1_info *disk; |
| 2525 | struct md_rdev *rdev; | 2614 | struct md_rdev *rdev; |
| 2526 | int err = -ENOMEM; | 2615 | int err = -ENOMEM; |
| 2527 | 2616 | ||
| @@ -2529,7 +2618,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2529 | if (!conf) | 2618 | if (!conf) |
| 2530 | goto abort; | 2619 | goto abort; |
| 2531 | 2620 | ||
| 2532 | conf->mirrors = kzalloc(sizeof(struct mirror_info) | 2621 | conf->mirrors = kzalloc(sizeof(struct raid1_info) |
| 2533 | * mddev->raid_disks * 2, | 2622 | * mddev->raid_disks * 2, |
| 2534 | GFP_KERNEL); | 2623 | GFP_KERNEL); |
| 2535 | if (!conf->mirrors) | 2624 | if (!conf->mirrors) |
| @@ -2572,6 +2661,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2572 | mddev->merge_check_needed = 1; | 2661 | mddev->merge_check_needed = 1; |
| 2573 | 2662 | ||
| 2574 | disk->head_position = 0; | 2663 | disk->head_position = 0; |
| 2664 | disk->seq_start = MaxSector; | ||
| 2575 | } | 2665 | } |
| 2576 | conf->raid_disks = mddev->raid_disks; | 2666 | conf->raid_disks = mddev->raid_disks; |
| 2577 | conf->mddev = mddev; | 2667 | conf->mddev = mddev; |
| @@ -2585,7 +2675,6 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2585 | conf->recovery_disabled = mddev->recovery_disabled - 1; | 2675 | conf->recovery_disabled = mddev->recovery_disabled - 1; |
| 2586 | 2676 | ||
| 2587 | err = -EIO; | 2677 | err = -EIO; |
| 2588 | conf->last_used = -1; | ||
| 2589 | for (i = 0; i < conf->raid_disks * 2; i++) { | 2678 | for (i = 0; i < conf->raid_disks * 2; i++) { |
| 2590 | 2679 | ||
| 2591 | disk = conf->mirrors + i; | 2680 | disk = conf->mirrors + i; |
| @@ -2611,19 +2700,9 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
| 2611 | if (disk->rdev && | 2700 | if (disk->rdev && |
| 2612 | (disk->rdev->saved_raid_disk < 0)) | 2701 | (disk->rdev->saved_raid_disk < 0)) |
| 2613 | conf->fullsync = 1; | 2702 | conf->fullsync = 1; |
| 2614 | } else if (conf->last_used < 0) | 2703 | } |
| 2615 | /* | ||
| 2616 | * The first working device is used as a | ||
| 2617 | * starting point to read balancing. | ||
| 2618 | */ | ||
| 2619 | conf->last_used = i; | ||
| 2620 | } | 2704 | } |
| 2621 | 2705 | ||
| 2622 | if (conf->last_used < 0) { | ||
| 2623 | printk(KERN_ERR "md/raid1:%s: no operational mirrors\n", | ||
| 2624 | mdname(mddev)); | ||
| 2625 | goto abort; | ||
| 2626 | } | ||
| 2627 | err = -ENOMEM; | 2706 | err = -ENOMEM; |
| 2628 | conf->thread = md_register_thread(raid1d, mddev, "raid1"); | 2707 | conf->thread = md_register_thread(raid1d, mddev, "raid1"); |
| 2629 | if (!conf->thread) { | 2708 | if (!conf->thread) { |
| @@ -2798,7 +2877,7 @@ static int raid1_reshape(struct mddev *mddev) | |||
| 2798 | */ | 2877 | */ |
| 2799 | mempool_t *newpool, *oldpool; | 2878 | mempool_t *newpool, *oldpool; |
| 2800 | struct pool_info *newpoolinfo; | 2879 | struct pool_info *newpoolinfo; |
| 2801 | struct mirror_info *newmirrors; | 2880 | struct raid1_info *newmirrors; |
| 2802 | struct r1conf *conf = mddev->private; | 2881 | struct r1conf *conf = mddev->private; |
| 2803 | int cnt, raid_disks; | 2882 | int cnt, raid_disks; |
| 2804 | unsigned long flags; | 2883 | unsigned long flags; |
| @@ -2841,7 +2920,7 @@ static int raid1_reshape(struct mddev *mddev) | |||
| 2841 | kfree(newpoolinfo); | 2920 | kfree(newpoolinfo); |
| 2842 | return -ENOMEM; | 2921 | return -ENOMEM; |
| 2843 | } | 2922 | } |
| 2844 | newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2, | 2923 | newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2, |
| 2845 | GFP_KERNEL); | 2924 | GFP_KERNEL); |
| 2846 | if (!newmirrors) { | 2925 | if (!newmirrors) { |
| 2847 | kfree(newpoolinfo); | 2926 | kfree(newpoolinfo); |
| @@ -2880,7 +2959,6 @@ static int raid1_reshape(struct mddev *mddev) | |||
| 2880 | conf->raid_disks = mddev->raid_disks = raid_disks; | 2959 | conf->raid_disks = mddev->raid_disks = raid_disks; |
| 2881 | mddev->delta_disks = 0; | 2960 | mddev->delta_disks = 0; |
| 2882 | 2961 | ||
| 2883 | conf->last_used = 0; /* just make sure it is in-range */ | ||
| 2884 | lower_barrier(conf); | 2962 | lower_barrier(conf); |
| 2885 | 2963 | ||
| 2886 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 2964 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 80ded139314c..0ff3715fb7eb 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
| @@ -1,9 +1,15 @@ | |||
| 1 | #ifndef _RAID1_H | 1 | #ifndef _RAID1_H |
| 2 | #define _RAID1_H | 2 | #define _RAID1_H |
| 3 | 3 | ||
| 4 | struct mirror_info { | 4 | struct raid1_info { |
| 5 | struct md_rdev *rdev; | 5 | struct md_rdev *rdev; |
| 6 | sector_t head_position; | 6 | sector_t head_position; |
| 7 | |||
| 8 | /* When choosing the best device for a read (read_balance()), | ||
| 9 | * we try to keep sequential reads on the same device. | ||
| 10 | */ | ||
| 11 | sector_t next_seq_sect; | ||
| 12 | sector_t seq_start; | ||
| 7 | }; | 13 | }; |
| 8 | 14 | ||
| 9 | /* | 15 | /* |
| @@ -24,17 +30,11 @@ struct pool_info { | |||
| 24 | 30 | ||
| 25 | struct r1conf { | 31 | struct r1conf { |
| 26 | struct mddev *mddev; | 32 | struct mddev *mddev; |
| 27 | struct mirror_info *mirrors; /* twice 'raid_disks' to | 33 | struct raid1_info *mirrors; /* twice 'raid_disks' to |
| 28 | * allow for replacements. | 34 | * allow for replacements. |
| 29 | */ | 35 | */ |
| 30 | int raid_disks; | 36 | int raid_disks; |
| 31 | 37 | ||
| 32 | /* When choose the best device for a read (read_balance()) | ||
| 33 | * we try to keep sequential reads one the same device | ||
| 34 | * using 'last_used' and 'next_seq_sect' | ||
| 35 | */ | ||
| 36 | int last_used; | ||
| 37 | sector_t next_seq_sect; | ||
| 38 | /* During resync, read_balancing is only allowed on the part | 38 | /* During resync, read_balancing is only allowed on the part |
| 39 | * of the array that has been resynced. 'next_resync' tells us | 39 | * of the array that has been resynced. 'next_resync' tells us |
| 40 | * where that is. | 40 | * where that is. |
| @@ -135,20 +135,6 @@ struct r1bio { | |||
| 135 | /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ | 135 | /* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/ |
| 136 | }; | 136 | }; |
| 137 | 137 | ||
| 138 | /* when we get a read error on a read-only array, we redirect to another | ||
| 139 | * device without failing the first device, or trying to over-write to | ||
| 140 | * correct the read error. To keep track of bad blocks on a per-bio | ||
| 141 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | ||
| 142 | */ | ||
| 143 | #define IO_BLOCKED ((struct bio *)1) | ||
| 144 | /* When we successfully write to a known bad-block, we need to remove the | ||
| 145 | * bad-block marking which must be done from process context. So we record | ||
| 146 | * the success by setting bios[n] to IO_MADE_GOOD | ||
| 147 | */ | ||
| 148 | #define IO_MADE_GOOD ((struct bio *)2) | ||
| 149 | |||
| 150 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
| 151 | |||
| 152 | /* bits for r1bio.state */ | 138 | /* bits for r1bio.state */ |
| 153 | #define R1BIO_Uptodate 0 | 139 | #define R1BIO_Uptodate 0 |
| 154 | #define R1BIO_IsSync 1 | 140 | #define R1BIO_IsSync 1 |
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8da6282254c3..e2549deab7c3 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
| @@ -60,7 +60,21 @@ | |||
| 60 | */ | 60 | */ |
| 61 | #define NR_RAID10_BIOS 256 | 61 | #define NR_RAID10_BIOS 256 |
| 62 | 62 | ||
| 63 | /* When there are this many requests queue to be written by | 63 | /* when we get a read error on a read-only array, we redirect to another |
| 64 | * device without failing the first device, or trying to over-write to | ||
| 65 | * correct the read error. To keep track of bad blocks on a per-bio | ||
| 66 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | ||
| 67 | */ | ||
| 68 | #define IO_BLOCKED ((struct bio *)1) | ||
| 69 | /* When we successfully write to a known bad-block, we need to remove the | ||
| 70 | * bad-block marking which must be done from process context. So we record | ||
| 71 | * the success by setting devs[n].bio to IO_MADE_GOOD | ||
| 72 | */ | ||
| 73 | #define IO_MADE_GOOD ((struct bio *)2) | ||
| 74 | |||
| 75 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
| 76 | |||
| 77 | /* When there are this many requests queued to be written by | ||
| 64 | * the raid10 thread, we become 'congested' to provide back-pressure | 78 | * the raid10 thread, we become 'congested' to provide back-pressure |
| 65 | * for writeback. | 79 | * for writeback. |
| 66 | */ | 80 | */ |
| @@ -717,7 +731,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, | |||
| 717 | int sectors = r10_bio->sectors; | 731 | int sectors = r10_bio->sectors; |
| 718 | int best_good_sectors; | 732 | int best_good_sectors; |
| 719 | sector_t new_distance, best_dist; | 733 | sector_t new_distance, best_dist; |
| 720 | struct md_rdev *rdev, *best_rdev; | 734 | struct md_rdev *best_rdev, *rdev = NULL; |
| 721 | int do_balance; | 735 | int do_balance; |
| 722 | int best_slot; | 736 | int best_slot; |
| 723 | struct geom *geo = &conf->geo; | 737 | struct geom *geo = &conf->geo; |
| @@ -839,9 +853,8 @@ retry: | |||
| 839 | return rdev; | 853 | return rdev; |
| 840 | } | 854 | } |
| 841 | 855 | ||
| 842 | static int raid10_congested(void *data, int bits) | 856 | int md_raid10_congested(struct mddev *mddev, int bits) |
| 843 | { | 857 | { |
| 844 | struct mddev *mddev = data; | ||
| 845 | struct r10conf *conf = mddev->private; | 858 | struct r10conf *conf = mddev->private; |
| 846 | int i, ret = 0; | 859 | int i, ret = 0; |
| 847 | 860 | ||
| @@ -849,8 +862,6 @@ static int raid10_congested(void *data, int bits) | |||
| 849 | conf->pending_count >= max_queued_requests) | 862 | conf->pending_count >= max_queued_requests) |
| 850 | return 1; | 863 | return 1; |
| 851 | 864 | ||
| 852 | if (mddev_congested(mddev, bits)) | ||
| 853 | return 1; | ||
| 854 | rcu_read_lock(); | 865 | rcu_read_lock(); |
| 855 | for (i = 0; | 866 | for (i = 0; |
| 856 | (i < conf->geo.raid_disks || i < conf->prev.raid_disks) | 867 | (i < conf->geo.raid_disks || i < conf->prev.raid_disks) |
| @@ -866,6 +877,15 @@ static int raid10_congested(void *data, int bits) | |||
| 866 | rcu_read_unlock(); | 877 | rcu_read_unlock(); |
| 867 | return ret; | 878 | return ret; |
| 868 | } | 879 | } |
| 880 | EXPORT_SYMBOL_GPL(md_raid10_congested); | ||
| 881 | |||
| 882 | static int raid10_congested(void *data, int bits) | ||
| 883 | { | ||
| 884 | struct mddev *mddev = data; | ||
| 885 | |||
| 886 | return mddev_congested(mddev, bits) || | ||
| 887 | md_raid10_congested(mddev, bits); | ||
| 888 | } | ||
| 869 | 889 | ||
| 870 | static void flush_pending_writes(struct r10conf *conf) | 890 | static void flush_pending_writes(struct r10conf *conf) |
| 871 | { | 891 | { |
| @@ -1546,7 +1566,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1546 | static void print_conf(struct r10conf *conf) | 1566 | static void print_conf(struct r10conf *conf) |
| 1547 | { | 1567 | { |
| 1548 | int i; | 1568 | int i; |
| 1549 | struct mirror_info *tmp; | 1569 | struct raid10_info *tmp; |
| 1550 | 1570 | ||
| 1551 | printk(KERN_DEBUG "RAID10 conf printout:\n"); | 1571 | printk(KERN_DEBUG "RAID10 conf printout:\n"); |
| 1552 | if (!conf) { | 1572 | if (!conf) { |
| @@ -1580,7 +1600,7 @@ static int raid10_spare_active(struct mddev *mddev) | |||
| 1580 | { | 1600 | { |
| 1581 | int i; | 1601 | int i; |
| 1582 | struct r10conf *conf = mddev->private; | 1602 | struct r10conf *conf = mddev->private; |
| 1583 | struct mirror_info *tmp; | 1603 | struct raid10_info *tmp; |
| 1584 | int count = 0; | 1604 | int count = 0; |
| 1585 | unsigned long flags; | 1605 | unsigned long flags; |
| 1586 | 1606 | ||
| @@ -1655,7 +1675,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1655 | else | 1675 | else |
| 1656 | mirror = first; | 1676 | mirror = first; |
| 1657 | for ( ; mirror <= last ; mirror++) { | 1677 | for ( ; mirror <= last ; mirror++) { |
| 1658 | struct mirror_info *p = &conf->mirrors[mirror]; | 1678 | struct raid10_info *p = &conf->mirrors[mirror]; |
| 1659 | if (p->recovery_disabled == mddev->recovery_disabled) | 1679 | if (p->recovery_disabled == mddev->recovery_disabled) |
| 1660 | continue; | 1680 | continue; |
| 1661 | if (p->rdev) { | 1681 | if (p->rdev) { |
| @@ -1709,7 +1729,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
| 1709 | int err = 0; | 1729 | int err = 0; |
| 1710 | int number = rdev->raid_disk; | 1730 | int number = rdev->raid_disk; |
| 1711 | struct md_rdev **rdevp; | 1731 | struct md_rdev **rdevp; |
| 1712 | struct mirror_info *p = conf->mirrors + number; | 1732 | struct raid10_info *p = conf->mirrors + number; |
| 1713 | 1733 | ||
| 1714 | print_conf(conf); | 1734 | print_conf(conf); |
| 1715 | if (rdev == p->rdev) | 1735 | if (rdev == p->rdev) |
| @@ -2876,7 +2896,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
| 2876 | sector_t sect; | 2896 | sector_t sect; |
| 2877 | int must_sync; | 2897 | int must_sync; |
| 2878 | int any_working; | 2898 | int any_working; |
| 2879 | struct mirror_info *mirror = &conf->mirrors[i]; | 2899 | struct raid10_info *mirror = &conf->mirrors[i]; |
| 2880 | 2900 | ||
| 2881 | if ((mirror->rdev == NULL || | 2901 | if ((mirror->rdev == NULL || |
| 2882 | test_bit(In_sync, &mirror->rdev->flags)) | 2902 | test_bit(In_sync, &mirror->rdev->flags)) |
| @@ -3388,7 +3408,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) | |||
| 3388 | goto out; | 3408 | goto out; |
| 3389 | 3409 | ||
| 3390 | /* FIXME calc properly */ | 3410 | /* FIXME calc properly */ |
| 3391 | conf->mirrors = kzalloc(sizeof(struct mirror_info)*(mddev->raid_disks + | 3411 | conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks + |
| 3392 | max(0,mddev->delta_disks)), | 3412 | max(0,mddev->delta_disks)), |
| 3393 | GFP_KERNEL); | 3413 | GFP_KERNEL); |
| 3394 | if (!conf->mirrors) | 3414 | if (!conf->mirrors) |
| @@ -3452,7 +3472,7 @@ static int run(struct mddev *mddev) | |||
| 3452 | { | 3472 | { |
| 3453 | struct r10conf *conf; | 3473 | struct r10conf *conf; |
| 3454 | int i, disk_idx, chunk_size; | 3474 | int i, disk_idx, chunk_size; |
| 3455 | struct mirror_info *disk; | 3475 | struct raid10_info *disk; |
| 3456 | struct md_rdev *rdev; | 3476 | struct md_rdev *rdev; |
| 3457 | sector_t size; | 3477 | sector_t size; |
| 3458 | sector_t min_offset_diff = 0; | 3478 | sector_t min_offset_diff = 0; |
| @@ -3472,12 +3492,14 @@ static int run(struct mddev *mddev) | |||
| 3472 | conf->thread = NULL; | 3492 | conf->thread = NULL; |
| 3473 | 3493 | ||
| 3474 | chunk_size = mddev->chunk_sectors << 9; | 3494 | chunk_size = mddev->chunk_sectors << 9; |
| 3475 | blk_queue_io_min(mddev->queue, chunk_size); | 3495 | if (mddev->queue) { |
| 3476 | if (conf->geo.raid_disks % conf->geo.near_copies) | 3496 | blk_queue_io_min(mddev->queue, chunk_size); |
| 3477 | blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); | 3497 | if (conf->geo.raid_disks % conf->geo.near_copies) |
| 3478 | else | 3498 | blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); |
| 3479 | blk_queue_io_opt(mddev->queue, chunk_size * | 3499 | else |
| 3480 | (conf->geo.raid_disks / conf->geo.near_copies)); | 3500 | blk_queue_io_opt(mddev->queue, chunk_size * |
| 3501 | (conf->geo.raid_disks / conf->geo.near_copies)); | ||
| 3502 | } | ||
| 3481 | 3503 | ||
| 3482 | rdev_for_each(rdev, mddev) { | 3504 | rdev_for_each(rdev, mddev) { |
| 3483 | long long diff; | 3505 | long long diff; |
| @@ -3511,8 +3533,9 @@ static int run(struct mddev *mddev) | |||
| 3511 | if (first || diff < min_offset_diff) | 3533 | if (first || diff < min_offset_diff) |
| 3512 | min_offset_diff = diff; | 3534 | min_offset_diff = diff; |
| 3513 | 3535 | ||
| 3514 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 3536 | if (mddev->gendisk) |
| 3515 | rdev->data_offset << 9); | 3537 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
| 3538 | rdev->data_offset << 9); | ||
| 3516 | 3539 | ||
| 3517 | disk->head_position = 0; | 3540 | disk->head_position = 0; |
| 3518 | } | 3541 | } |
| @@ -3575,22 +3598,22 @@ static int run(struct mddev *mddev) | |||
| 3575 | md_set_array_sectors(mddev, size); | 3598 | md_set_array_sectors(mddev, size); |
| 3576 | mddev->resync_max_sectors = size; | 3599 | mddev->resync_max_sectors = size; |
| 3577 | 3600 | ||
| 3578 | mddev->queue->backing_dev_info.congested_fn = raid10_congested; | 3601 | if (mddev->queue) { |
| 3579 | mddev->queue->backing_dev_info.congested_data = mddev; | ||
| 3580 | |||
| 3581 | /* Calculate max read-ahead size. | ||
| 3582 | * We need to readahead at least twice a whole stripe.... | ||
| 3583 | * maybe... | ||
| 3584 | */ | ||
| 3585 | { | ||
| 3586 | int stripe = conf->geo.raid_disks * | 3602 | int stripe = conf->geo.raid_disks * |
| 3587 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); | 3603 | ((mddev->chunk_sectors << 9) / PAGE_SIZE); |
| 3604 | mddev->queue->backing_dev_info.congested_fn = raid10_congested; | ||
| 3605 | mddev->queue->backing_dev_info.congested_data = mddev; | ||
| 3606 | |||
| 3607 | /* Calculate max read-ahead size. | ||
| 3608 | * We need to readahead at least twice a whole stripe.... | ||
| 3609 | * maybe... | ||
| 3610 | */ | ||
| 3588 | stripe /= conf->geo.near_copies; | 3611 | stripe /= conf->geo.near_copies; |
| 3589 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) | 3612 | if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) |
| 3590 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; | 3613 | mddev->queue->backing_dev_info.ra_pages = 2 * stripe; |
| 3614 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | ||
| 3591 | } | 3615 | } |
| 3592 | 3616 | ||
| 3593 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | ||
| 3594 | 3617 | ||
| 3595 | if (md_integrity_register(mddev)) | 3618 | if (md_integrity_register(mddev)) |
| 3596 | goto out_free_conf; | 3619 | goto out_free_conf; |
| @@ -3641,7 +3664,10 @@ static int stop(struct mddev *mddev) | |||
| 3641 | lower_barrier(conf); | 3664 | lower_barrier(conf); |
| 3642 | 3665 | ||
| 3643 | md_unregister_thread(&mddev->thread); | 3666 | md_unregister_thread(&mddev->thread); |
| 3644 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | 3667 | if (mddev->queue) |
| 3668 | /* the unplug fn references 'conf'*/ | ||
| 3669 | blk_sync_queue(mddev->queue); | ||
| 3670 | |||
| 3645 | if (conf->r10bio_pool) | 3671 | if (conf->r10bio_pool) |
| 3646 | mempool_destroy(conf->r10bio_pool); | 3672 | mempool_destroy(conf->r10bio_pool); |
| 3647 | kfree(conf->mirrors); | 3673 | kfree(conf->mirrors); |
| @@ -3805,7 +3831,7 @@ static int raid10_check_reshape(struct mddev *mddev) | |||
| 3805 | if (mddev->delta_disks > 0) { | 3831 | if (mddev->delta_disks > 0) { |
| 3806 | /* allocate new 'mirrors' list */ | 3832 | /* allocate new 'mirrors' list */ |
| 3807 | conf->mirrors_new = kzalloc( | 3833 | conf->mirrors_new = kzalloc( |
| 3808 | sizeof(struct mirror_info) | 3834 | sizeof(struct raid10_info) |
| 3809 | *(mddev->raid_disks + | 3835 | *(mddev->raid_disks + |
| 3810 | mddev->delta_disks), | 3836 | mddev->delta_disks), |
| 3811 | GFP_KERNEL); | 3837 | GFP_KERNEL); |
| @@ -3930,7 +3956,7 @@ static int raid10_start_reshape(struct mddev *mddev) | |||
| 3930 | spin_lock_irq(&conf->device_lock); | 3956 | spin_lock_irq(&conf->device_lock); |
| 3931 | if (conf->mirrors_new) { | 3957 | if (conf->mirrors_new) { |
| 3932 | memcpy(conf->mirrors_new, conf->mirrors, | 3958 | memcpy(conf->mirrors_new, conf->mirrors, |
| 3933 | sizeof(struct mirror_info)*conf->prev.raid_disks); | 3959 | sizeof(struct raid10_info)*conf->prev.raid_disks); |
| 3934 | smp_mb(); | 3960 | smp_mb(); |
| 3935 | kfree(conf->mirrors_old); /* FIXME and elsewhere */ | 3961 | kfree(conf->mirrors_old); /* FIXME and elsewhere */ |
| 3936 | conf->mirrors_old = conf->mirrors; | 3962 | conf->mirrors_old = conf->mirrors; |
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 135b1b0a1554..007c2c68dd83 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
| @@ -1,7 +1,7 @@ | |||
| 1 | #ifndef _RAID10_H | 1 | #ifndef _RAID10_H |
| 2 | #define _RAID10_H | 2 | #define _RAID10_H |
| 3 | 3 | ||
| 4 | struct mirror_info { | 4 | struct raid10_info { |
| 5 | struct md_rdev *rdev, *replacement; | 5 | struct md_rdev *rdev, *replacement; |
| 6 | sector_t head_position; | 6 | sector_t head_position; |
| 7 | int recovery_disabled; /* matches | 7 | int recovery_disabled; /* matches |
| @@ -13,8 +13,8 @@ struct mirror_info { | |||
| 13 | 13 | ||
| 14 | struct r10conf { | 14 | struct r10conf { |
| 15 | struct mddev *mddev; | 15 | struct mddev *mddev; |
| 16 | struct mirror_info *mirrors; | 16 | struct raid10_info *mirrors; |
| 17 | struct mirror_info *mirrors_new, *mirrors_old; | 17 | struct raid10_info *mirrors_new, *mirrors_old; |
| 18 | spinlock_t device_lock; | 18 | spinlock_t device_lock; |
| 19 | 19 | ||
| 20 | /* geometry */ | 20 | /* geometry */ |
| @@ -123,20 +123,6 @@ struct r10bio { | |||
| 123 | } devs[0]; | 123 | } devs[0]; |
| 124 | }; | 124 | }; |
| 125 | 125 | ||
| 126 | /* when we get a read error on a read-only array, we redirect to another | ||
| 127 | * device without failing the first device, or trying to over-write to | ||
| 128 | * correct the read error. To keep track of bad blocks on a per-bio | ||
| 129 | * level, we store IO_BLOCKED in the appropriate 'bios' pointer | ||
| 130 | */ | ||
| 131 | #define IO_BLOCKED ((struct bio*)1) | ||
| 132 | /* When we successfully write to a known bad-block, we need to remove the | ||
| 133 | * bad-block marking which must be done from process context. So we record | ||
| 134 | * the success by setting devs[n].bio to IO_MADE_GOOD | ||
| 135 | */ | ||
| 136 | #define IO_MADE_GOOD ((struct bio *)2) | ||
| 137 | |||
| 138 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) | ||
| 139 | |||
| 140 | /* bits for r10bio.state */ | 126 | /* bits for r10bio.state */ |
| 141 | enum r10bio_state { | 127 | enum r10bio_state { |
| 142 | R10BIO_Uptodate, | 128 | R10BIO_Uptodate, |
| @@ -159,4 +145,7 @@ enum r10bio_state { | |||
| 159 | */ | 145 | */ |
| 160 | R10BIO_Previous, | 146 | R10BIO_Previous, |
| 161 | }; | 147 | }; |
| 148 | |||
| 149 | extern int md_raid10_congested(struct mddev *mddev, int bits); | ||
| 150 | |||
| 162 | #endif | 151 | #endif |
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 04348d76bb30..259f519814ca 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
| @@ -99,34 +99,40 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) | |||
| 99 | * We maintain a biased count of active stripes in the bottom 16 bits of | 99 | * We maintain a biased count of active stripes in the bottom 16 bits of |
| 100 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits | 100 | * bi_phys_segments, and a count of processed stripes in the upper 16 bits |
| 101 | */ | 101 | */ |
| 102 | static inline int raid5_bi_phys_segments(struct bio *bio) | 102 | static inline int raid5_bi_processed_stripes(struct bio *bio) |
| 103 | { | 103 | { |
| 104 | return bio->bi_phys_segments & 0xffff; | 104 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; |
| 105 | return (atomic_read(segments) >> 16) & 0xffff; | ||
| 105 | } | 106 | } |
| 106 | 107 | ||
| 107 | static inline int raid5_bi_hw_segments(struct bio *bio) | 108 | static inline int raid5_dec_bi_active_stripes(struct bio *bio) |
| 108 | { | 109 | { |
| 109 | return (bio->bi_phys_segments >> 16) & 0xffff; | 110 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; |
| 111 | return atomic_sub_return(1, segments) & 0xffff; | ||
| 110 | } | 112 | } |
| 111 | 113 | ||
| 112 | static inline int raid5_dec_bi_phys_segments(struct bio *bio) | 114 | static inline void raid5_inc_bi_active_stripes(struct bio *bio) |
| 113 | { | 115 | { |
| 114 | --bio->bi_phys_segments; | 116 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; |
| 115 | return raid5_bi_phys_segments(bio); | 117 | atomic_inc(segments); |
| 116 | } | 118 | } |
| 117 | 119 | ||
| 118 | static inline int raid5_dec_bi_hw_segments(struct bio *bio) | 120 | static inline void raid5_set_bi_processed_stripes(struct bio *bio, |
| 121 | unsigned int cnt) | ||
| 119 | { | 122 | { |
| 120 | unsigned short val = raid5_bi_hw_segments(bio); | 123 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; |
| 124 | int old, new; | ||
| 121 | 125 | ||
| 122 | --val; | 126 | do { |
| 123 | bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); | 127 | old = atomic_read(segments); |
| 124 | return val; | 128 | new = (old & 0xffff) | (cnt << 16); |
| 129 | } while (atomic_cmpxchg(segments, old, new) != old); | ||
| 125 | } | 130 | } |
| 126 | 131 | ||
| 127 | static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) | 132 | static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt) |
| 128 | { | 133 | { |
| 129 | bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); | 134 | atomic_t *segments = (atomic_t *)&bio->bi_phys_segments; |
| 135 | atomic_set(segments, cnt); | ||
| 130 | } | 136 | } |
| 131 | 137 | ||
| 132 | /* Find first data disk in a raid6 stripe */ | 138 | /* Find first data disk in a raid6 stripe */ |
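The rewritten accessors above treat bio->bi_phys_segments as two packed 16-bit counters updated atomically: the low half counts active stripes, the high half counts processed stripes. A non-atomic userspace model of that packing (the real code uses atomic_inc/atomic_cmpxchg), for illustration only:

    #include <assert.h>

    static unsigned int segments;       /* stands in for bio->bi_phys_segments */

    /* Shows only where each counter lives; atomicity is omitted here. */
    static void set_processed_stripes(unsigned int cnt)
    {
            segments = (segments & 0xffff) | (cnt << 16);
    }

    int main(void)
    {
            segments = 3;                           /* three active stripes, none processed */
            set_processed_stripes(5);
            assert((segments & 0xffff) == 3);       /* active count untouched */
            assert((segments >> 16) == 5);          /* processed count in the top half */
            return 0;
    }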
| @@ -190,49 +196,56 @@ static int stripe_operations_active(struct stripe_head *sh) | |||
| 190 | test_bit(STRIPE_COMPUTE_RUN, &sh->state); | 196 | test_bit(STRIPE_COMPUTE_RUN, &sh->state); |
| 191 | } | 197 | } |
| 192 | 198 | ||
| 193 | static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) | 199 | static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) |
| 194 | { | 200 | { |
| 195 | if (atomic_dec_and_test(&sh->count)) { | 201 | BUG_ON(!list_empty(&sh->lru)); |
| 196 | BUG_ON(!list_empty(&sh->lru)); | 202 | BUG_ON(atomic_read(&conf->active_stripes)==0); |
| 197 | BUG_ON(atomic_read(&conf->active_stripes)==0); | 203 | if (test_bit(STRIPE_HANDLE, &sh->state)) { |
| 198 | if (test_bit(STRIPE_HANDLE, &sh->state)) { | 204 | if (test_bit(STRIPE_DELAYED, &sh->state) && |
| 199 | if (test_bit(STRIPE_DELAYED, &sh->state) && | 205 | !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
| 200 | !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 206 | list_add_tail(&sh->lru, &conf->delayed_list); |
| 201 | list_add_tail(&sh->lru, &conf->delayed_list); | 207 | else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && |
| 202 | else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && | 208 | sh->bm_seq - conf->seq_write > 0) |
| 203 | sh->bm_seq - conf->seq_write > 0) | 209 | list_add_tail(&sh->lru, &conf->bitmap_list); |
| 204 | list_add_tail(&sh->lru, &conf->bitmap_list); | 210 | else { |
| 205 | else { | 211 | clear_bit(STRIPE_DELAYED, &sh->state); |
| 206 | clear_bit(STRIPE_DELAYED, &sh->state); | 212 | clear_bit(STRIPE_BIT_DELAY, &sh->state); |
| 207 | clear_bit(STRIPE_BIT_DELAY, &sh->state); | 213 | list_add_tail(&sh->lru, &conf->handle_list); |
| 208 | list_add_tail(&sh->lru, &conf->handle_list); | 214 | } |
| 209 | } | 215 | md_wakeup_thread(conf->mddev->thread); |
| 210 | md_wakeup_thread(conf->mddev->thread); | 216 | } else { |
| 211 | } else { | 217 | BUG_ON(stripe_operations_active(sh)); |
| 212 | BUG_ON(stripe_operations_active(sh)); | 218 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
| 213 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 219 | if (atomic_dec_return(&conf->preread_active_stripes) |
| 214 | if (atomic_dec_return(&conf->preread_active_stripes) | 220 | < IO_THRESHOLD) |
| 215 | < IO_THRESHOLD) | 221 | md_wakeup_thread(conf->mddev->thread); |
| 216 | md_wakeup_thread(conf->mddev->thread); | 222 | atomic_dec(&conf->active_stripes); |
| 217 | atomic_dec(&conf->active_stripes); | 223 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { |
| 218 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { | 224 | list_add_tail(&sh->lru, &conf->inactive_list); |
| 219 | list_add_tail(&sh->lru, &conf->inactive_list); | 225 | wake_up(&conf->wait_for_stripe); |
| 220 | wake_up(&conf->wait_for_stripe); | 226 | if (conf->retry_read_aligned) |
| 221 | if (conf->retry_read_aligned) | 227 | md_wakeup_thread(conf->mddev->thread); |
| 222 | md_wakeup_thread(conf->mddev->thread); | ||
| 223 | } | ||
| 224 | } | 228 | } |
| 225 | } | 229 | } |
| 226 | } | 230 | } |
| 227 | 231 | ||
| 232 | static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) | ||
| 233 | { | ||
| 234 | if (atomic_dec_and_test(&sh->count)) | ||
| 235 | do_release_stripe(conf, sh); | ||
| 236 | } | ||
| 237 | |||
| 228 | static void release_stripe(struct stripe_head *sh) | 238 | static void release_stripe(struct stripe_head *sh) |
| 229 | { | 239 | { |
| 230 | struct r5conf *conf = sh->raid_conf; | 240 | struct r5conf *conf = sh->raid_conf; |
| 231 | unsigned long flags; | 241 | unsigned long flags; |
| 232 | 242 | ||
| 233 | spin_lock_irqsave(&conf->device_lock, flags); | 243 | local_irq_save(flags); |
| 234 | __release_stripe(conf, sh); | 244 | if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { |
| 235 | spin_unlock_irqrestore(&conf->device_lock, flags); | 245 | do_release_stripe(conf, sh); |
| 246 | spin_unlock(&conf->device_lock); | ||
| 247 | } | ||
| 248 | local_irq_restore(flags); | ||
| 236 | } | 249 | } |
| 237 | 250 | ||
| 238 | static inline void remove_hash(struct stripe_head *sh) | 251 | static inline void remove_hash(struct stripe_head *sh) |
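The release_stripe() rewrite above takes conf->device_lock out of the common case: atomic_dec_and_lock() decrements sh->count and only acquires device_lock when the count actually reaches zero, i.e. when do_release_stripe() really has to move the stripe onto one of the conf lists. Interrupts are disabled explicitly around the sequence because release_stripe() can run from IRQ context and device_lock must stay IRQ-safe, while atomic_dec_and_lock() itself only does a plain spin_lock(). The new body, annotated (a restatement of the hunk, not a further change):

    local_irq_save(flags);
    /* fast path: not the last reference - no lock taken, nothing to do */
    if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) {
        /* slow path: count hit zero and device_lock is now held */
        do_release_stripe(conf, sh);
        spin_unlock(&conf->device_lock);
    }
    local_irq_restore(flags);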
| @@ -640,6 +653,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 640 | else | 653 | else |
| 641 | bi->bi_sector = (sh->sector | 654 | bi->bi_sector = (sh->sector |
| 642 | + rdev->data_offset); | 655 | + rdev->data_offset); |
| 656 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) | ||
| 657 | bi->bi_rw |= REQ_FLUSH; | ||
| 658 | |||
| 643 | bi->bi_flags = 1 << BIO_UPTODATE; | 659 | bi->bi_flags = 1 << BIO_UPTODATE; |
| 644 | bi->bi_idx = 0; | 660 | bi->bi_idx = 0; |
| 645 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; | 661 | bi->bi_io_vec[0].bv_len = STRIPE_SIZE; |
| @@ -749,14 +765,12 @@ static void ops_complete_biofill(void *stripe_head_ref) | |||
| 749 | { | 765 | { |
| 750 | struct stripe_head *sh = stripe_head_ref; | 766 | struct stripe_head *sh = stripe_head_ref; |
| 751 | struct bio *return_bi = NULL; | 767 | struct bio *return_bi = NULL; |
| 752 | struct r5conf *conf = sh->raid_conf; | ||
| 753 | int i; | 768 | int i; |
| 754 | 769 | ||
| 755 | pr_debug("%s: stripe %llu\n", __func__, | 770 | pr_debug("%s: stripe %llu\n", __func__, |
| 756 | (unsigned long long)sh->sector); | 771 | (unsigned long long)sh->sector); |
| 757 | 772 | ||
| 758 | /* clear completed biofills */ | 773 | /* clear completed biofills */ |
| 759 | spin_lock_irq(&conf->device_lock); | ||
| 760 | for (i = sh->disks; i--; ) { | 774 | for (i = sh->disks; i--; ) { |
| 761 | struct r5dev *dev = &sh->dev[i]; | 775 | struct r5dev *dev = &sh->dev[i]; |
| 762 | 776 | ||
| @@ -774,7 +788,7 @@ static void ops_complete_biofill(void *stripe_head_ref) | |||
| 774 | while (rbi && rbi->bi_sector < | 788 | while (rbi && rbi->bi_sector < |
| 775 | dev->sector + STRIPE_SECTORS) { | 789 | dev->sector + STRIPE_SECTORS) { |
| 776 | rbi2 = r5_next_bio(rbi, dev->sector); | 790 | rbi2 = r5_next_bio(rbi, dev->sector); |
| 777 | if (!raid5_dec_bi_phys_segments(rbi)) { | 791 | if (!raid5_dec_bi_active_stripes(rbi)) { |
| 778 | rbi->bi_next = return_bi; | 792 | rbi->bi_next = return_bi; |
| 779 | return_bi = rbi; | 793 | return_bi = rbi; |
| 780 | } | 794 | } |
| @@ -782,7 +796,6 @@ static void ops_complete_biofill(void *stripe_head_ref) | |||
| 782 | } | 796 | } |
| 783 | } | 797 | } |
| 784 | } | 798 | } |
| 785 | spin_unlock_irq(&conf->device_lock); | ||
| 786 | clear_bit(STRIPE_BIOFILL_RUN, &sh->state); | 799 | clear_bit(STRIPE_BIOFILL_RUN, &sh->state); |
| 787 | 800 | ||
| 788 | return_io(return_bi); | 801 | return_io(return_bi); |
| @@ -794,7 +807,6 @@ static void ops_complete_biofill(void *stripe_head_ref) | |||
| 794 | static void ops_run_biofill(struct stripe_head *sh) | 807 | static void ops_run_biofill(struct stripe_head *sh) |
| 795 | { | 808 | { |
| 796 | struct dma_async_tx_descriptor *tx = NULL; | 809 | struct dma_async_tx_descriptor *tx = NULL; |
| 797 | struct r5conf *conf = sh->raid_conf; | ||
| 798 | struct async_submit_ctl submit; | 810 | struct async_submit_ctl submit; |
| 799 | int i; | 811 | int i; |
| 800 | 812 | ||
| @@ -805,10 +817,10 @@ static void ops_run_biofill(struct stripe_head *sh) | |||
| 805 | struct r5dev *dev = &sh->dev[i]; | 817 | struct r5dev *dev = &sh->dev[i]; |
| 806 | if (test_bit(R5_Wantfill, &dev->flags)) { | 818 | if (test_bit(R5_Wantfill, &dev->flags)) { |
| 807 | struct bio *rbi; | 819 | struct bio *rbi; |
| 808 | spin_lock_irq(&conf->device_lock); | 820 | spin_lock_irq(&sh->stripe_lock); |
| 809 | dev->read = rbi = dev->toread; | 821 | dev->read = rbi = dev->toread; |
| 810 | dev->toread = NULL; | 822 | dev->toread = NULL; |
| 811 | spin_unlock_irq(&conf->device_lock); | 823 | spin_unlock_irq(&sh->stripe_lock); |
| 812 | while (rbi && rbi->bi_sector < | 824 | while (rbi && rbi->bi_sector < |
| 813 | dev->sector + STRIPE_SECTORS) { | 825 | dev->sector + STRIPE_SECTORS) { |
| 814 | tx = async_copy_data(0, rbi, dev->page, | 826 | tx = async_copy_data(0, rbi, dev->page, |
| @@ -1144,12 +1156,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
| 1144 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { | 1156 | if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { |
| 1145 | struct bio *wbi; | 1157 | struct bio *wbi; |
| 1146 | 1158 | ||
| 1147 | spin_lock_irq(&sh->raid_conf->device_lock); | 1159 | spin_lock_irq(&sh->stripe_lock); |
| 1148 | chosen = dev->towrite; | 1160 | chosen = dev->towrite; |
| 1149 | dev->towrite = NULL; | 1161 | dev->towrite = NULL; |
| 1150 | BUG_ON(dev->written); | 1162 | BUG_ON(dev->written); |
| 1151 | wbi = dev->written = chosen; | 1163 | wbi = dev->written = chosen; |
| 1152 | spin_unlock_irq(&sh->raid_conf->device_lock); | 1164 | spin_unlock_irq(&sh->stripe_lock); |
| 1153 | 1165 | ||
| 1154 | while (wbi && wbi->bi_sector < | 1166 | while (wbi && wbi->bi_sector < |
| 1155 | dev->sector + STRIPE_SECTORS) { | 1167 | dev->sector + STRIPE_SECTORS) { |
| @@ -1454,6 +1466,8 @@ static int grow_one_stripe(struct r5conf *conf) | |||
| 1454 | init_waitqueue_head(&sh->ops.wait_for_ops); | 1466 | init_waitqueue_head(&sh->ops.wait_for_ops); |
| 1455 | #endif | 1467 | #endif |
| 1456 | 1468 | ||
| 1469 | spin_lock_init(&sh->stripe_lock); | ||
| 1470 | |||
| 1457 | if (grow_buffers(sh)) { | 1471 | if (grow_buffers(sh)) { |
| 1458 | shrink_buffers(sh); | 1472 | shrink_buffers(sh); |
| 1459 | kmem_cache_free(conf->slab_cache, sh); | 1473 | kmem_cache_free(conf->slab_cache, sh); |
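grow_one_stripe() now initialises a per-stripe spinlock (the stripe_lock field added in the raid5.h hunk at the end of this diff). The bio lists hanging off each r5dev (toread/towrite) are from here on serialised by this per-stripe lock instead of the array-wide conf->device_lock, as the ops_run_biofill()/ops_run_biodrain() hunks above already show. A minimal sketch of the new locking scope, with comments added:

    spin_lock_init(&sh->stripe_lock);   /* one lock per stripe_head */

    /* detaching the queued bios for one device of this stripe now only
     * contends with other users of the same stripe, not the whole array:
     */
    spin_lock_irq(&sh->stripe_lock);
    dev->read = rbi = dev->toread;
    dev->toread = NULL;
    spin_unlock_irq(&sh->stripe_lock);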
| @@ -1739,7 +1753,9 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
| 1739 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); | 1753 | atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); |
| 1740 | clear_bit(R5_ReadError, &sh->dev[i].flags); | 1754 | clear_bit(R5_ReadError, &sh->dev[i].flags); |
| 1741 | clear_bit(R5_ReWrite, &sh->dev[i].flags); | 1755 | clear_bit(R5_ReWrite, &sh->dev[i].flags); |
| 1742 | } | 1756 | } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) |
| 1757 | clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); | ||
| 1758 | |||
| 1743 | if (atomic_read(&rdev->read_errors)) | 1759 | if (atomic_read(&rdev->read_errors)) |
| 1744 | atomic_set(&rdev->read_errors, 0); | 1760 | atomic_set(&rdev->read_errors, 0); |
| 1745 | } else { | 1761 | } else { |
| @@ -1784,7 +1800,11 @@ static void raid5_end_read_request(struct bio * bi, int error) | |||
| 1784 | else | 1800 | else |
| 1785 | retry = 1; | 1801 | retry = 1; |
| 1786 | if (retry) | 1802 | if (retry) |
| 1787 | set_bit(R5_ReadError, &sh->dev[i].flags); | 1803 | if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) { |
| 1804 | set_bit(R5_ReadError, &sh->dev[i].flags); | ||
| 1805 | clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); | ||
| 1806 | } else | ||
| 1807 | set_bit(R5_ReadNoMerge, &sh->dev[i].flags); | ||
| 1788 | else { | 1808 | else { |
| 1789 | clear_bit(R5_ReadError, &sh->dev[i].flags); | 1809 | clear_bit(R5_ReadError, &sh->dev[i].flags); |
| 1790 | clear_bit(R5_ReWrite, &sh->dev[i].flags); | 1810 | clear_bit(R5_ReWrite, &sh->dev[i].flags); |
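The error path of raid5_end_read_request() now escalates in two steps instead of going straight to R5_ReadError: the first failed read only sets R5_ReadNoMerge, so the retry is re-issued with block-layer merging disabled (ops_run_io() tags such a bio with REQ_FLUSH, which the block layer treats as unmergeable at this kernel version); only if that unmerged retry fails as well is R5_ReadError set and R5_ReadNoMerge cleared. Annotated restatement of the retry branch above:

    if (retry)
        if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) {
            /* second failure: give up on the no-merge retry and fall
             * back to the normal read-error/rewrite handling
             */
            set_bit(R5_ReadError, &sh->dev[i].flags);
            clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);
        } else
            /* first failure: retry the read unmerged before declaring
             * a read error
             */
            set_bit(R5_ReadNoMerge, &sh->dev[i].flags);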
| @@ -2340,11 +2360,18 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2340 | (unsigned long long)bi->bi_sector, | 2360 | (unsigned long long)bi->bi_sector, |
| 2341 | (unsigned long long)sh->sector); | 2361 | (unsigned long long)sh->sector); |
| 2342 | 2362 | ||
| 2343 | 2363 | /* | |
| 2344 | spin_lock_irq(&conf->device_lock); | 2364 | * If several bios share a stripe, the bio bi_phys_segments acts as a |
| 2365 | * reference count to avoid race. The reference count should already be | ||
| 2366 | * increased before this function is called (for example, in | ||
| 2367 | * make_request()), so other bio sharing this stripe will not free the | ||
| 2368 | * stripe. If a stripe is owned by one bio, the stripe lock will | ||
| 2369 | * protect it. | ||
| 2370 | */ | ||
| 2371 | spin_lock_irq(&sh->stripe_lock); | ||
| 2345 | if (forwrite) { | 2372 | if (forwrite) { |
| 2346 | bip = &sh->dev[dd_idx].towrite; | 2373 | bip = &sh->dev[dd_idx].towrite; |
| 2347 | if (*bip == NULL && sh->dev[dd_idx].written == NULL) | 2374 | if (*bip == NULL) |
| 2348 | firstwrite = 1; | 2375 | firstwrite = 1; |
| 2349 | } else | 2376 | } else |
| 2350 | bip = &sh->dev[dd_idx].toread; | 2377 | bip = &sh->dev[dd_idx].toread; |
| @@ -2360,7 +2387,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2360 | if (*bip) | 2387 | if (*bip) |
| 2361 | bi->bi_next = *bip; | 2388 | bi->bi_next = *bip; |
| 2362 | *bip = bi; | 2389 | *bip = bi; |
| 2363 | bi->bi_phys_segments++; | 2390 | raid5_inc_bi_active_stripes(bi); |
| 2364 | 2391 | ||
| 2365 | if (forwrite) { | 2392 | if (forwrite) { |
| 2366 | /* check if page is covered */ | 2393 | /* check if page is covered */ |
| @@ -2375,7 +2402,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2375 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | 2402 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) |
| 2376 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | 2403 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); |
| 2377 | } | 2404 | } |
| 2378 | spin_unlock_irq(&conf->device_lock); | 2405 | spin_unlock_irq(&sh->stripe_lock); |
| 2379 | 2406 | ||
| 2380 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | 2407 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", |
| 2381 | (unsigned long long)(*bip)->bi_sector, | 2408 | (unsigned long long)(*bip)->bi_sector, |
| @@ -2391,7 +2418,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
| 2391 | 2418 | ||
| 2392 | overlap: | 2419 | overlap: |
| 2393 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); | 2420 | set_bit(R5_Overlap, &sh->dev[dd_idx].flags); |
| 2394 | spin_unlock_irq(&conf->device_lock); | 2421 | spin_unlock_irq(&sh->stripe_lock); |
| 2395 | return 0; | 2422 | return 0; |
| 2396 | } | 2423 | } |
| 2397 | 2424 | ||
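The comment added to add_stripe_bio() spells out the new contract for the active-stripe count: the caller must already hold a reference on the bio (the biased initial count) before attaching it to a stripe, and add_stripe_bio() itself only needs sh->stripe_lock. Roughly, the life cycle of the count as used by make_request() and remove_bio_from_retry() further down is:

    raid5_set_bi_stripes(bi, 1);        /* biased count: cannot reach zero
                                         * while stripes are still being
                                         * attached */
    ...
    raid5_inc_bi_active_stripes(bi);    /* taken inside add_stripe_bio()
                                         * for every stripe the bio is
                                         * queued on */
    ...
    remaining = raid5_dec_bi_active_stripes(bi); /* drop the bias */
    if (remaining == 0)
        bio_endio(bi, 0);               /* every stripe has finished */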
| @@ -2441,10 +2468,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
| 2441 | rdev_dec_pending(rdev, conf->mddev); | 2468 | rdev_dec_pending(rdev, conf->mddev); |
| 2442 | } | 2469 | } |
| 2443 | } | 2470 | } |
| 2444 | spin_lock_irq(&conf->device_lock); | 2471 | spin_lock_irq(&sh->stripe_lock); |
| 2445 | /* fail all writes first */ | 2472 | /* fail all writes first */ |
| 2446 | bi = sh->dev[i].towrite; | 2473 | bi = sh->dev[i].towrite; |
| 2447 | sh->dev[i].towrite = NULL; | 2474 | sh->dev[i].towrite = NULL; |
| 2475 | spin_unlock_irq(&sh->stripe_lock); | ||
| 2448 | if (bi) { | 2476 | if (bi) { |
| 2449 | s->to_write--; | 2477 | s->to_write--; |
| 2450 | bitmap_end = 1; | 2478 | bitmap_end = 1; |
| @@ -2457,13 +2485,17 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
| 2457 | sh->dev[i].sector + STRIPE_SECTORS) { | 2485 | sh->dev[i].sector + STRIPE_SECTORS) { |
| 2458 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); | 2486 | struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); |
| 2459 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | 2487 | clear_bit(BIO_UPTODATE, &bi->bi_flags); |
| 2460 | if (!raid5_dec_bi_phys_segments(bi)) { | 2488 | if (!raid5_dec_bi_active_stripes(bi)) { |
| 2461 | md_write_end(conf->mddev); | 2489 | md_write_end(conf->mddev); |
| 2462 | bi->bi_next = *return_bi; | 2490 | bi->bi_next = *return_bi; |
| 2463 | *return_bi = bi; | 2491 | *return_bi = bi; |
| 2464 | } | 2492 | } |
| 2465 | bi = nextbi; | 2493 | bi = nextbi; |
| 2466 | } | 2494 | } |
| 2495 | if (bitmap_end) | ||
| 2496 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | ||
| 2497 | STRIPE_SECTORS, 0, 0); | ||
| 2498 | bitmap_end = 0; | ||
| 2467 | /* and fail all 'written' */ | 2499 | /* and fail all 'written' */ |
| 2468 | bi = sh->dev[i].written; | 2500 | bi = sh->dev[i].written; |
| 2469 | sh->dev[i].written = NULL; | 2501 | sh->dev[i].written = NULL; |
| @@ -2472,7 +2504,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
| 2472 | sh->dev[i].sector + STRIPE_SECTORS) { | 2504 | sh->dev[i].sector + STRIPE_SECTORS) { |
| 2473 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); | 2505 | struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); |
| 2474 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | 2506 | clear_bit(BIO_UPTODATE, &bi->bi_flags); |
| 2475 | if (!raid5_dec_bi_phys_segments(bi)) { | 2507 | if (!raid5_dec_bi_active_stripes(bi)) { |
| 2476 | md_write_end(conf->mddev); | 2508 | md_write_end(conf->mddev); |
| 2477 | bi->bi_next = *return_bi; | 2509 | bi->bi_next = *return_bi; |
| 2478 | *return_bi = bi; | 2510 | *return_bi = bi; |
| @@ -2496,14 +2528,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
| 2496 | struct bio *nextbi = | 2528 | struct bio *nextbi = |
| 2497 | r5_next_bio(bi, sh->dev[i].sector); | 2529 | r5_next_bio(bi, sh->dev[i].sector); |
| 2498 | clear_bit(BIO_UPTODATE, &bi->bi_flags); | 2530 | clear_bit(BIO_UPTODATE, &bi->bi_flags); |
| 2499 | if (!raid5_dec_bi_phys_segments(bi)) { | 2531 | if (!raid5_dec_bi_active_stripes(bi)) { |
| 2500 | bi->bi_next = *return_bi; | 2532 | bi->bi_next = *return_bi; |
| 2501 | *return_bi = bi; | 2533 | *return_bi = bi; |
| 2502 | } | 2534 | } |
| 2503 | bi = nextbi; | 2535 | bi = nextbi; |
| 2504 | } | 2536 | } |
| 2505 | } | 2537 | } |
| 2506 | spin_unlock_irq(&conf->device_lock); | ||
| 2507 | if (bitmap_end) | 2538 | if (bitmap_end) |
| 2508 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, | 2539 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, |
| 2509 | STRIPE_SECTORS, 0, 0); | 2540 | STRIPE_SECTORS, 0, 0); |
| @@ -2707,30 +2738,23 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
| 2707 | test_bit(R5_UPTODATE, &dev->flags)) { | 2738 | test_bit(R5_UPTODATE, &dev->flags)) { |
| 2708 | /* We can return any write requests */ | 2739 | /* We can return any write requests */ |
| 2709 | struct bio *wbi, *wbi2; | 2740 | struct bio *wbi, *wbi2; |
| 2710 | int bitmap_end = 0; | ||
| 2711 | pr_debug("Return write for disc %d\n", i); | 2741 | pr_debug("Return write for disc %d\n", i); |
| 2712 | spin_lock_irq(&conf->device_lock); | ||
| 2713 | wbi = dev->written; | 2742 | wbi = dev->written; |
| 2714 | dev->written = NULL; | 2743 | dev->written = NULL; |
| 2715 | while (wbi && wbi->bi_sector < | 2744 | while (wbi && wbi->bi_sector < |
| 2716 | dev->sector + STRIPE_SECTORS) { | 2745 | dev->sector + STRIPE_SECTORS) { |
| 2717 | wbi2 = r5_next_bio(wbi, dev->sector); | 2746 | wbi2 = r5_next_bio(wbi, dev->sector); |
| 2718 | if (!raid5_dec_bi_phys_segments(wbi)) { | 2747 | if (!raid5_dec_bi_active_stripes(wbi)) { |
| 2719 | md_write_end(conf->mddev); | 2748 | md_write_end(conf->mddev); |
| 2720 | wbi->bi_next = *return_bi; | 2749 | wbi->bi_next = *return_bi; |
| 2721 | *return_bi = wbi; | 2750 | *return_bi = wbi; |
| 2722 | } | 2751 | } |
| 2723 | wbi = wbi2; | 2752 | wbi = wbi2; |
| 2724 | } | 2753 | } |
| 2725 | if (dev->towrite == NULL) | 2754 | bitmap_endwrite(conf->mddev->bitmap, sh->sector, |
| 2726 | bitmap_end = 1; | 2755 | STRIPE_SECTORS, |
| 2727 | spin_unlock_irq(&conf->device_lock); | ||
| 2728 | if (bitmap_end) | ||
| 2729 | bitmap_endwrite(conf->mddev->bitmap, | ||
| 2730 | sh->sector, | ||
| 2731 | STRIPE_SECTORS, | ||
| 2732 | !test_bit(STRIPE_DEGRADED, &sh->state), | 2756 | !test_bit(STRIPE_DEGRADED, &sh->state), |
| 2733 | 0); | 2757 | 0); |
| 2734 | } | 2758 | } |
| 2735 | } | 2759 | } |
| 2736 | 2760 | ||
| @@ -3182,7 +3206,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 3182 | 3206 | ||
| 3183 | /* Now to look around and see what can be done */ | 3207 | /* Now to look around and see what can be done */ |
| 3184 | rcu_read_lock(); | 3208 | rcu_read_lock(); |
| 3185 | spin_lock_irq(&conf->device_lock); | ||
| 3186 | for (i=disks; i--; ) { | 3209 | for (i=disks; i--; ) { |
| 3187 | struct md_rdev *rdev; | 3210 | struct md_rdev *rdev; |
| 3188 | sector_t first_bad; | 3211 | sector_t first_bad; |
| @@ -3328,7 +3351,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | |||
| 3328 | do_recovery = 1; | 3351 | do_recovery = 1; |
| 3329 | } | 3352 | } |
| 3330 | } | 3353 | } |
| 3331 | spin_unlock_irq(&conf->device_lock); | ||
| 3332 | if (test_bit(STRIPE_SYNCING, &sh->state)) { | 3354 | if (test_bit(STRIPE_SYNCING, &sh->state)) { |
| 3333 | /* If there is a failed device being replaced, | 3355 | /* If there is a failed device being replaced, |
| 3334 | * we must be recovering. | 3356 | * we must be recovering. |
| @@ -3791,7 +3813,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf) | |||
| 3791 | * this sets the active stripe count to 1 and the processed | 3813 | * this sets the active stripe count to 1 and the processed |
| 3792 | * stripe count to zero (upper 16 bits) | 3814 | * stripe count to zero (upper 16 bits) |
| 3793 | */ | 3815 | */ |
| 3794 | bi->bi_phys_segments = 1; /* biased count of active stripes */ | 3816 | raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */ |
| 3795 | } | 3817 | } |
| 3796 | 3818 | ||
| 3797 | return bi; | 3819 | return bi; |
| @@ -4113,7 +4135,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
| 4113 | finish_wait(&conf->wait_for_overlap, &w); | 4135 | finish_wait(&conf->wait_for_overlap, &w); |
| 4114 | set_bit(STRIPE_HANDLE, &sh->state); | 4136 | set_bit(STRIPE_HANDLE, &sh->state); |
| 4115 | clear_bit(STRIPE_DELAYED, &sh->state); | 4137 | clear_bit(STRIPE_DELAYED, &sh->state); |
| 4116 | if ((bi->bi_rw & REQ_SYNC) && | 4138 | if ((bi->bi_rw & REQ_NOIDLE) && |
| 4117 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | 4139 | !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
| 4118 | atomic_inc(&conf->preread_active_stripes); | 4140 | atomic_inc(&conf->preread_active_stripes); |
| 4119 | mddev_check_plugged(mddev); | 4141 | mddev_check_plugged(mddev); |
| @@ -4126,9 +4148,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
| 4126 | } | 4148 | } |
| 4127 | } | 4149 | } |
| 4128 | 4150 | ||
| 4129 | spin_lock_irq(&conf->device_lock); | 4151 | remaining = raid5_dec_bi_active_stripes(bi); |
| 4130 | remaining = raid5_dec_bi_phys_segments(bi); | ||
| 4131 | spin_unlock_irq(&conf->device_lock); | ||
| 4132 | if (remaining == 0) { | 4152 | if (remaining == 0) { |
| 4133 | 4153 | ||
| 4134 | if ( rw == WRITE ) | 4154 | if ( rw == WRITE ) |
| @@ -4484,7 +4504,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
| 4484 | sector += STRIPE_SECTORS, | 4504 | sector += STRIPE_SECTORS, |
| 4485 | scnt++) { | 4505 | scnt++) { |
| 4486 | 4506 | ||
| 4487 | if (scnt < raid5_bi_hw_segments(raid_bio)) | 4507 | if (scnt < raid5_bi_processed_stripes(raid_bio)) |
| 4488 | /* already done this stripe */ | 4508 | /* already done this stripe */ |
| 4489 | continue; | 4509 | continue; |
| 4490 | 4510 | ||
| @@ -4492,25 +4512,24 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) | |||
| 4492 | 4512 | ||
| 4493 | if (!sh) { | 4513 | if (!sh) { |
| 4494 | /* failed to get a stripe - must wait */ | 4514 | /* failed to get a stripe - must wait */ |
| 4495 | raid5_set_bi_hw_segments(raid_bio, scnt); | 4515 | raid5_set_bi_processed_stripes(raid_bio, scnt); |
| 4496 | conf->retry_read_aligned = raid_bio; | 4516 | conf->retry_read_aligned = raid_bio; |
| 4497 | return handled; | 4517 | return handled; |
| 4498 | } | 4518 | } |
| 4499 | 4519 | ||
| 4500 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { | 4520 | if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { |
| 4501 | release_stripe(sh); | 4521 | release_stripe(sh); |
| 4502 | raid5_set_bi_hw_segments(raid_bio, scnt); | 4522 | raid5_set_bi_processed_stripes(raid_bio, scnt); |
| 4503 | conf->retry_read_aligned = raid_bio; | 4523 | conf->retry_read_aligned = raid_bio; |
| 4504 | return handled; | 4524 | return handled; |
| 4505 | } | 4525 | } |
| 4506 | 4526 | ||
| 4527 | set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); | ||
| 4507 | handle_stripe(sh); | 4528 | handle_stripe(sh); |
| 4508 | release_stripe(sh); | 4529 | release_stripe(sh); |
| 4509 | handled++; | 4530 | handled++; |
| 4510 | } | 4531 | } |
| 4511 | spin_lock_irq(&conf->device_lock); | 4532 | remaining = raid5_dec_bi_active_stripes(raid_bio); |
| 4512 | remaining = raid5_dec_bi_phys_segments(raid_bio); | ||
| 4513 | spin_unlock_irq(&conf->device_lock); | ||
| 4514 | if (remaining == 0) | 4533 | if (remaining == 0) |
| 4515 | bio_endio(raid_bio, 0); | 4534 | bio_endio(raid_bio, 0); |
| 4516 | if (atomic_dec_and_test(&conf->active_aligned_reads)) | 4535 | if (atomic_dec_and_test(&conf->active_aligned_reads)) |
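retry_aligned_read() now records its progress in the processed-stripes half of the same counter: stripes handled on an earlier pass are skipped, and when no stripe_head can be obtained the current position is saved so the next invocation resumes there, all without conf->device_lock. The per-stripe loop body, restated with comments (names as in the hunk above; the non-blocking get_active_stripe() call sits just above the first check):

    if (scnt < raid5_bi_processed_stripes(raid_bio))
        continue;       /* this stripe was finished on a previous pass */

    if (!sh) {
        /* no stripe available: record the resume point and retry later */
        raid5_set_bi_processed_stripes(raid_bio, scnt);
        conf->retry_read_aligned = raid_bio;
        return handled;
    }

    if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
        release_stripe(sh);
        raid5_set_bi_processed_stripes(raid_bio, scnt);
        conf->retry_read_aligned = raid_bio;
        return handled;
    }

    /* the per-stripe retry reads should not be merged back together */
    set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
    handle_stripe(sh);
    release_stripe(sh);
    handled++;

After the loop the final reference is dropped lock-free, and the bio completes once no stripe holds it any more:

    remaining = raid5_dec_bi_active_stripes(raid_bio);
    if (remaining == 0)
        bio_endio(raid_bio, 0);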
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 2164021f3b5f..61dbb615c30b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
| @@ -210,6 +210,7 @@ struct stripe_head { | |||
| 210 | int disks; /* disks in stripe */ | 210 | int disks; /* disks in stripe */ |
| 211 | enum check_states check_state; | 211 | enum check_states check_state; |
| 212 | enum reconstruct_states reconstruct_state; | 212 | enum reconstruct_states reconstruct_state; |
| 213 | spinlock_t stripe_lock; | ||
| 213 | /** | 214 | /** |
| 214 | * struct stripe_operations | 215 | * struct stripe_operations |
| 215 | * @target - STRIPE_OP_COMPUTE_BLK target | 216 | * @target - STRIPE_OP_COMPUTE_BLK target |
| @@ -273,6 +274,7 @@ enum r5dev_flags { | |||
| 273 | R5_Wantwrite, | 274 | R5_Wantwrite, |
| 274 | R5_Overlap, /* There is a pending overlapping request | 275 | R5_Overlap, /* There is a pending overlapping request |
| 275 | * on this block */ | 276 | * on this block */ |
| 277 | R5_ReadNoMerge, /* prevent bio from merging in block-layer */ | ||
| 276 | R5_ReadError, /* seen a read error here recently */ | 278 | R5_ReadError, /* seen a read error here recently */ |
| 277 | R5_ReWrite, /* have tried to over-write the readerror */ | 279 | R5_ReWrite, /* have tried to over-write the readerror */ |
| 278 | 280 | ||
