commit 9db908806b85c1430150fbafe269a7b21b07d15d
tree 3911759c93e0be26b6771e1a92b75612b206ffa5
parent 4d7127dace8cf4b05eb7c8c8531fc204fbb195f4
parent 72f36d5972a166197036c1281963f6863c429bf2
author Linus Torvalds <torvalds@linux-foundation.org> 2012-10-13 16:22:01 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-10-13 16:22:01 -0400
Merge tag 'md-3.7' of git://neil.brown.name/md
Pull md updates from NeilBrown:
- "discard" support, some dm-raid improvements and other assorted bits
and pieces.
* tag 'md-3.7' of git://neil.brown.name/md: (29 commits)
md: refine reporting of resync/reshape delays.
md/raid5: be careful not to resize_stripes too big.
md: make sure manual changes to recovery checkpoint are saved.
md/raid10: use correct limit variable
md: writing to sync_action should clear the read-auto state.
md: change resync_mismatches to atomic64_t to avoid races
md/raid5: make sure to_read and to_write never go negative.
md: When RAID5 is dirty, force reconstruct-write instead of read-modify-write.
md/raid5: protect debug message against NULL dereference.
md/raid5: add some missing locking in handle_failed_stripe.
MD: raid5 avoid unnecessary zero page for trim
MD: raid5 trim support
md/bitmap: Don't use IS_ERR to judge alloc_page().
md/raid1: Don't release reference to device while handling read error.
raid: replace list_for_each_continue_rcu with new interface
add further __init annotations to crypto/xor.c
DM RAID: Fix for "sync" directive ineffectiveness
DM RAID: Fix comparison of index and quantity for "rebuild" parameter
DM RAID: Add rebuild capability for RAID10
DM RAID: Move 'rebuild' checking code to its own function
...
 Documentation/device-mapper/dm-raid.txt |   9
 crypto/xor.c                            |   4
 drivers/md/bitmap.c                     |  17
 drivers/md/dm-raid.c                    | 124
 drivers/md/linear.c                     |  25
 drivers/md/md.c                         | 145
 drivers/md/md.h                         |   9
 drivers/md/multipath.c                  |   3
 drivers/md/raid0.c                      |  19
 drivers/md/raid1.c                      |  37
 drivers/md/raid10.c                     |  95
 drivers/md/raid5.c                      | 219
 drivers/md/raid5.h                      |   1
 13 files changed, 578 insertions(+), 129 deletions(-)
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt
index 1c1844957166..728c38c242d6 100644
--- a/Documentation/device-mapper/dm-raid.txt
+++ b/Documentation/device-mapper/dm-raid.txt
@@ -132,3 +132,12 @@ Here we can see the RAID type is raid4, there are 5 devices - all of
 which are 'A'live, and the array is 2/490221568 complete with recovery.
 Faulty or missing devices are marked 'D'. Devices that are out-of-sync
 are marked 'a'.
+
+
+Version History
+---------------
+1.0.0   Initial version.  Support for RAID 4/5/6
+1.1.0   Added support for RAID 1
+1.2.0   Handle creation of arrays that contain failed devices.
+1.3.0   Added support for RAID 10
+1.3.1   Allow device replacement/rebuild for RAID 10
diff --git a/crypto/xor.c b/crypto/xor.c
index 65c7b416b4a3..35d6b3adf230 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -56,11 +56,11 @@ xor_blocks(unsigned int src_count, unsigned int bytes, void *dest, void **srcs)
 EXPORT_SYMBOL(xor_blocks);
 
 /* Set of all registered templates.  */
-static struct xor_block_template *template_list;
+static struct xor_block_template *__initdata template_list;
 
 #define BENCH_SIZE (PAGE_SIZE)
 
-static void
+static void __init
 do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 {
         int speed;
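
The xor.c hunk only adds section annotations. As a rough sketch of what they buy (the module and symbol names below are hypothetical, not from this patch): anything marked __init/__initdata is placed in the kernel's .init sections, which are discarded once boot completes, and that is exactly right for xor's one-shot calibration code.

#include <linux/init.h>
#include <linux/module.h>

static int threshold __initdata = 10;   /* lives in .init.data, freed after boot */

static int __init example_init(void)    /* lives in .init.text, freed after boot */
{
        pr_info("threshold=%d\n", threshold);
        return 0;                       /* must not reference 'threshold' later */
}
module_init(example_init);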
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 94e7f6ba2e11..7155945f8eb8 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -163,20 +163,17 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
          * As devices are only added or removed when raid_disk is < 0 and
          * nr_pending is 0 and In_sync is clear, the entries we return will
          * still be in the same position on the list when we re-enter
-         * list_for_each_continue_rcu.
+         * list_for_each_entry_continue_rcu.
          */
-        struct list_head *pos;
         rcu_read_lock();
         if (rdev == NULL)
                 /* start at the beginning */
-                pos = &mddev->disks;
+                rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set);
         else {
                 /* release the previous rdev and start from there. */
                 rdev_dec_pending(rdev, mddev);
-                pos = &rdev->same_set;
         }
-        list_for_each_continue_rcu(pos, &mddev->disks) {
-                rdev = list_entry(pos, struct md_rdev, same_set);
+        list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) {
                 if (rdev->raid_disk >= 0 &&
                     !test_bit(Faulty, &rdev->flags)) {
                         /* this is a usable devices */
@@ -473,14 +470,10 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 {
         bitmap_super_t *sb;
         unsigned long chunksize, daemon_sleep, write_behind;
-        int err = -EINVAL;
 
         bitmap->storage.sb_page = alloc_page(GFP_KERNEL);
-        if (IS_ERR(bitmap->storage.sb_page)) {
-                err = PTR_ERR(bitmap->storage.sb_page);
-                bitmap->storage.sb_page = NULL;
-                return err;
-        }
+        if (bitmap->storage.sb_page == NULL)
+                return -ENOMEM;
         bitmap->storage.sb_page->index = 0;
 
         sb = kmap_atomic(bitmap->storage.sb_page);
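
The bitmap.c fix works because alloc_page() reports failure with a NULL return, never an ERR_PTR() value, so the old IS_ERR() test could never fire. A minimal sketch of the correct convention (grab_sb_page() is an illustrative name, not md's):

#include <linux/gfp.h>

static int grab_sb_page(struct page **out)
{
        struct page *page = alloc_page(GFP_KERNEL);

        if (!page)              /* NULL is the only failure indication */
                return -ENOMEM;
        *out = page;
        return 0;
}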
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 982e3e390c45..45d94a7e7f6d 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -338,6 +338,84 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size)
 }
 
 /*
+ * validate_rebuild_devices
+ * @rs
+ *
+ * Determine if the devices specified for rebuild can result in a valid
+ * usable array that is capable of rebuilding the given devices.
+ *
+ * Returns: 0 on success, -EINVAL on failure.
+ */
+static int validate_rebuild_devices(struct raid_set *rs)
+{
+        unsigned i, rebuild_cnt = 0;
+        unsigned rebuilds_per_group, copies, d;
+
+        if (!(rs->print_flags & DMPF_REBUILD))
+                return 0;
+
+        for (i = 0; i < rs->md.raid_disks; i++)
+                if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+                        rebuild_cnt++;
+
+        switch (rs->raid_type->level) {
+        case 1:
+                if (rebuild_cnt >= rs->md.raid_disks)
+                        goto too_many;
+                break;
+        case 4:
+        case 5:
+        case 6:
+                if (rebuild_cnt > rs->raid_type->parity_devs)
+                        goto too_many;
+                break;
+        case 10:
+                copies = raid10_md_layout_to_copies(rs->md.layout);
+                if (rebuild_cnt < copies)
+                        break;
+
+                /*
+                 * It is possible to have a higher rebuild count for RAID10,
+                 * as long as the failed devices occur in different mirror
+                 * groups (i.e. different stripes).
+                 *
+                 * Right now, we only allow for "near" copies.  When other
+                 * formats are added, we will have to check those too.
+                 *
+                 * When checking "near" format, make sure no adjacent devices
+                 * have failed beyond what can be handled.  In addition to the
+                 * simple case where the number of devices is a multiple of the
+                 * number of copies, we must also handle cases where the number
+                 * of devices is not a multiple of the number of copies.
+                 * E.g.    dev1 dev2 dev3 dev4 dev5
+                 *          A    A    B    B    C
+                 *          C    D    D    E    E
+                 */
+                rebuilds_per_group = 0;
+                for (i = 0; i < rs->md.raid_disks * copies; i++) {
+                        d = i % rs->md.raid_disks;
+                        if (!test_bit(In_sync, &rs->dev[d].rdev.flags) &&
+                            (++rebuilds_per_group >= copies))
+                                goto too_many;
+                        if (!((i + 1) % copies))
+                                rebuilds_per_group = 0;
+                }
+                break;
+        default:
+                DMERR("The rebuild parameter is not supported for %s",
+                      rs->raid_type->name);
+                rs->ti->error = "Rebuild not supported for this RAID type";
+                return -EINVAL;
+        }
+
+        return 0;
+
+too_many:
+        rs->ti->error = "Too many rebuild devices specified";
+        return -EINVAL;
+}
+
+/*
  * Possible arguments are...
  *      <chunk_size> [optional_args]
  *
@@ -365,7 +443,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 {
         char *raid10_format = "near";
         unsigned raid10_copies = 2;
-        unsigned i, rebuild_cnt = 0;
+        unsigned i;
         unsigned long value, region_size = 0;
         sector_t sectors_per_dev = rs->ti->len;
         sector_t max_io_len;
@@ -461,31 +539,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 
                 /* Parameters that take a numeric value are checked here */
                 if (!strcasecmp(key, "rebuild")) {
-                        rebuild_cnt++;
-
-                        switch (rs->raid_type->level) {
-                        case 1:
-                                if (rebuild_cnt >= rs->md.raid_disks) {
-                                        rs->ti->error = "Too many rebuild devices specified";
-                                        return -EINVAL;
-                                }
-                                break;
-                        case 4:
-                        case 5:
-                        case 6:
-                                if (rebuild_cnt > rs->raid_type->parity_devs) {
-                                        rs->ti->error = "Too many rebuild devices specified for given RAID type";
-                                        return -EINVAL;
-                                }
-                                break;
-                        case 10:
-                        default:
-                                DMERR("The rebuild parameter is not supported for %s", rs->raid_type->name);
-                                rs->ti->error = "Rebuild not supported for this RAID type";
-                                return -EINVAL;
-                        }
-
-                        if (value > rs->md.raid_disks) {
+                        if (value >= rs->md.raid_disks) {
                                 rs->ti->error = "Invalid rebuild index given";
                                 return -EINVAL;
                         }
@@ -608,6 +662,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
         }
         rs->md.dev_sectors = sectors_per_dev;
 
+        if (validate_rebuild_devices(rs))
+                return -EINVAL;
+
         /* Assume there are no metadata devices until the drives are parsed */
         rs->md.persistent = 0;
         rs->md.external = 1;
@@ -960,6 +1017,19 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 
         freshest = NULL;
         rdev_for_each_safe(rdev, tmp, mddev) {
+                /*
+                 * Skipping super_load due to DMPF_SYNC will cause
+                 * the array to undergo initialization again as
+                 * though it were new.  This is the intended effect
+                 * of the "sync" directive.
+                 *
+                 * When reshaping capability is added, we must ensure
+                 * that the "sync" directive is disallowed during the
+                 * reshape.
+                 */
+                if (rs->print_flags & DMPF_SYNC)
+                        continue;
+
                 if (!rdev->meta_bdev)
                         continue;
 
@@ -1360,7 +1430,7 @@ static void raid_resume(struct dm_target *ti)
 
 static struct target_type raid_target = {
         .name = "raid",
-        .version = {1, 3, 0},
+        .version = {1, 3, 1},
         .module = THIS_MODULE,
         .ctr = raid_ctr,
         .dtr = raid_dtr,
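
The "near"-layout walk in validate_rebuild_devices() is the subtle part of the dm-raid changes above. Below is a standalone userspace re-creation of that loop (hypothetical code, shown only to illustrate the check) using the comment's own five-device, two-copy example: slot i of the walk maps to device i % raid_disks, and the per-group failure counter resets every `copies` slots, so mirror groups that wrap past the end of the device list are checked as well.

#include <stdio.h>

int main(void)
{
        const int raid_disks = 5, copies = 2;
        /* dev1 and dev3 failed: no mirror group loses every copy */
        const int in_sync[5] = { 0, 1, 0, 1, 1 };
        int rebuilds_per_group = 0;

        for (int i = 0; i < raid_disks * copies; i++) {
                int d = i % raid_disks;

                if (!in_sync[d] && ++rebuilds_per_group >= copies) {
                        printf("too many failed devices in one mirror group\n");
                        return 1;
                }
                if (!((i + 1) % copies))        /* next mirror group */
                        rebuilds_per_group = 0;
        }
        printf("rebuild possible\n");
        return 0;
}

With the A/B/C/D/E layout from the comment, the ten slots form the groups (1,2), (3,4), (5,1), (2,3), (4,5); failing dev1 and dev3 leaves one live copy in every group, so the sketch prints "rebuild possible", while failing two adjacent devices trips the counter.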
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index fa211d80fc0a..21014836bdbf 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -138,6 +138,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
         struct linear_conf *conf;
         struct md_rdev *rdev;
         int i, cnt;
+        bool discard_supported = false;
 
         conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
                         GFP_KERNEL);
@@ -171,6 +172,8 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
                 conf->array_sectors += rdev->sectors;
                 cnt++;
 
+                if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+                        discard_supported = true;
         }
         if (cnt != raid_disks) {
                 printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
@@ -178,6 +181,11 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
                 goto out;
         }
 
+        if (!discard_supported)
+                queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+        else
+                queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
         /*
          * Here we calculate the device offsets.
          */
@@ -244,7 +252,9 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
         if (!newconf)
                 return -ENOMEM;
 
-        oldconf = rcu_dereference(mddev->private);
+        oldconf = rcu_dereference_protected(mddev->private,
+                                            lockdep_is_held(
+                                                    &mddev->reconfig_mutex));
         mddev->raid_disks++;
         rcu_assign_pointer(mddev->private, newconf);
         md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
@@ -256,7 +266,10 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
 
 static int linear_stop (struct mddev *mddev)
 {
-        struct linear_conf *conf = mddev->private;
+        struct linear_conf *conf =
+                rcu_dereference_protected(mddev->private,
+                                          lockdep_is_held(
+                                                  &mddev->reconfig_mutex));
 
         /*
          * We do not require rcu protection here since
@@ -326,6 +339,14 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
         bio->bi_sector = bio->bi_sector - start_sector
                 + tmp_dev->rdev->data_offset;
         rcu_read_unlock();
+
+        if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+                     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+                /* Just ignore it */
+                bio_endio(bio, 0);
+                return;
+        }
+
         generic_make_request(bio);
 }
 
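
linear, raid0, raid1 and raid10 all gain the same pass-down rule in this series: discard is advisory, so a REQ_DISCARD bio routed to a member that cannot discard is completed successfully rather than forwarded and failed. A hedged consolidation of that pattern (the helper name is illustrative; each personality open-codes it):

static void submit_or_ignore_discard(struct bio *bio)
{
        if (unlikely((bio->bi_rw & REQ_DISCARD) &&
                     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
                bio_endio(bio, 0);      /* report success; the data is simply not trimmed */
                return;
        }
        generic_make_request(bio);
}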
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 95c88012a3b9..9ab768acfb62 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -674,7 +674,18 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
         return NULL;
 }
 
-static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev)
+static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
+{
+        struct md_rdev *rdev;
+
+        rdev_for_each_rcu(rdev, mddev)
+                if (rdev->desc_nr == nr)
+                        return rdev;
+
+        return NULL;
+}
+
+static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
 {
         struct md_rdev *rdev;
 
@@ -685,6 +696,17 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev)
         return NULL;
 }
 
+static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
+{
+        struct md_rdev *rdev;
+
+        rdev_for_each_rcu(rdev, mddev)
+                if (rdev->bdev->bd_dev == dev)
+                        return rdev;
+
+        return NULL;
+}
+
 static struct md_personality *find_pers(int level, char *clevel)
 {
         struct md_personality *pers;
@@ -2022,8 +2044,14 @@ EXPORT_SYMBOL(md_integrity_register);
 /* Disable data integrity if non-capable/non-matching disk is being added */
 void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
 {
-        struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
-        struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
+        struct blk_integrity *bi_rdev;
+        struct blk_integrity *bi_mddev;
+
+        if (!mddev->gendisk)
+                return;
+
+        bi_rdev = bdev_get_integrity(rdev->bdev);
+        bi_mddev = blk_get_integrity(mddev->gendisk);
 
         if (!bi_mddev) /* nothing to do */
                 return;
@@ -3754,6 +3782,8 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len)
                 return -EINVAL;
 
         mddev->recovery_cp = n;
+        if (mddev->pers)
+                set_bit(MD_CHANGE_CLEAN, &mddev->flags);
         return len;
 }
 static struct md_sysfs_entry md_resync_start =
@@ -4231,6 +4261,13 @@ action_store(struct mddev *mddev, const char *page, size_t len)
                         set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
                 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
         }
+        if (mddev->ro == 2) {
+                /* A write to sync_action is enough to justify
+                 * canceling read-auto mode
+                 */
+                mddev->ro = 0;
+                md_wakeup_thread(mddev->sync_thread);
+        }
         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
         md_wakeup_thread(mddev->thread);
         sysfs_notify_dirent_safe(mddev->sysfs_action);
@@ -4241,7 +4278,8 @@ static ssize_t
 mismatch_cnt_show(struct mddev *mddev, char *page)
 {
         return sprintf(page, "%llu\n",
-                       (unsigned long long) mddev->resync_mismatches);
+                       (unsigned long long)
+                       atomic64_read(&mddev->resync_mismatches));
 }
 
 static struct md_sysfs_entry md_scan_mode =
@@ -4362,6 +4400,10 @@ sync_completed_show(struct mddev *mddev, char *page)
         if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
                 return sprintf(page, "none\n");
 
+        if (mddev->curr_resync == 1 ||
+            mddev->curr_resync == 2)
+                return sprintf(page, "delayed\n");
+
         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
             test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
                 max_sectors = mddev->resync_max_sectors;
@@ -5207,7 +5249,7 @@ static void md_clean(struct mddev *mddev)
         mddev->new_layout = 0;
         mddev->new_chunk_sectors = 0;
         mddev->curr_resync = 0;
-        mddev->resync_mismatches = 0;
+        atomic64_set(&mddev->resync_mismatches, 0);
         mddev->suspend_lo = mddev->suspend_hi = 0;
         mddev->sync_speed_min = mddev->sync_speed_max = 0;
         mddev->recovery = 0;
@@ -5509,8 +5551,9 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
         int nr,working,insync,failed,spare;
         struct md_rdev *rdev;
 
-        nr=working=insync=failed=spare=0;
-        rdev_for_each(rdev, mddev) {
+        nr = working = insync = failed = spare = 0;
+        rcu_read_lock();
+        rdev_for_each_rcu(rdev, mddev) {
                 nr++;
                 if (test_bit(Faulty, &rdev->flags))
                         failed++;
@@ -5522,6 +5565,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
                                 spare++;
                 }
         }
+        rcu_read_unlock();
 
         info.major_version = mddev->major_version;
         info.minor_version = mddev->minor_version;
@@ -5605,7 +5649,8 @@ static int get_disk_info(struct mddev * mddev, void __user * arg)
         if (copy_from_user(&info, arg, sizeof(info)))
                 return -EFAULT;
 
-        rdev = find_rdev_nr(mddev, info.number);
+        rcu_read_lock();
+        rdev = find_rdev_nr_rcu(mddev, info.number);
         if (rdev) {
                 info.major = MAJOR(rdev->bdev->bd_dev);
                 info.minor = MINOR(rdev->bdev->bd_dev);
@@ -5624,6 +5669,7 @@ static int get_disk_info(struct mddev * mddev, void __user * arg)
                 info.raid_disk = -1;
                 info.state = (1<<MD_DISK_REMOVED);
         }
+        rcu_read_unlock();
 
         if (copy_to_user(arg, &info, sizeof(info)))
                 return -EFAULT;
@@ -6232,18 +6278,22 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 static int set_disk_faulty(struct mddev *mddev, dev_t dev)
 {
         struct md_rdev *rdev;
+        int err = 0;
 
         if (mddev->pers == NULL)
                 return -ENODEV;
 
-        rdev = find_rdev(mddev, dev);
+        rcu_read_lock();
+        rdev = find_rdev_rcu(mddev, dev);
         if (!rdev)
-                return -ENODEV;
-
-        md_error(mddev, rdev);
-        if (!test_bit(Faulty, &rdev->flags))
-                return -EBUSY;
-        return 0;
+                err = -ENODEV;
+        else {
+                md_error(mddev, rdev);
+                if (!test_bit(Faulty, &rdev->flags))
+                        err = -EBUSY;
+        }
+        rcu_read_unlock();
+        return err;
 }
 
 /*
@@ -6315,6 +6365,27 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
                 goto abort;
         }
 
+        /* Some actions do not require the mutex */
+        switch (cmd) {
+        case GET_ARRAY_INFO:
+                if (!mddev->raid_disks && !mddev->external)
+                        err = -ENODEV;
+                else
+                        err = get_array_info(mddev, argp);
+                goto abort;
+
+        case GET_DISK_INFO:
+                if (!mddev->raid_disks && !mddev->external)
+                        err = -ENODEV;
+                else
+                        err = get_disk_info(mddev, argp);
+                goto abort;
+
+        case SET_DISK_FAULTY:
+                err = set_disk_faulty(mddev, new_decode_dev(arg));
+                goto abort;
+        }
+
         err = mddev_lock(mddev);
         if (err) {
                 printk(KERN_INFO
@@ -6387,18 +6458,10 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
          */
         switch (cmd)
         {
-                case GET_ARRAY_INFO:
-                        err = get_array_info(mddev, argp);
-                        goto done_unlock;
-
                 case GET_BITMAP_FILE:
                         err = get_bitmap_file(mddev, argp);
                         goto done_unlock;
 
-                case GET_DISK_INFO:
-                        err = get_disk_info(mddev, argp);
-                        goto done_unlock;
-
                 case RESTART_ARRAY_RW:
                         err = restart_array(mddev);
                         goto done_unlock;
@@ -6480,10 +6543,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
                         err = hot_add_disk(mddev, new_decode_dev(arg));
                         goto done_unlock;
 
-                case SET_DISK_FAULTY:
-                        err = set_disk_faulty(mddev, new_decode_dev(arg));
-                        goto done_unlock;
-
                 case RUN_ARRAY:
                         err = do_md_run(mddev);
                         goto done_unlock;
@@ -6641,7 +6700,7 @@ static int md_thread(void * arg)
 
                 clear_bit(THREAD_WAKEUP, &thread->flags);
                 if (!kthread_should_stop())
-                        thread->run(thread->mddev);
+                        thread->run(thread);
         }
 
         return 0;
@@ -6656,8 +6715,8 @@ void md_wakeup_thread(struct md_thread *thread)
         }
 }
 
-struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev,
-                                 const char *name)
+struct md_thread *md_register_thread(void (*run) (struct md_thread *),
+                struct mddev *mddev, const char *name)
 {
         struct md_thread *thread;
 
@@ -6752,7 +6811,11 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev)
         int scale;
         unsigned int per_milli;
 
-        resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
+        if (mddev->curr_resync <= 3)
+                resync = 0;
+        else
+                resync = mddev->curr_resync
+                        - atomic_read(&mddev->recovery_active);
 
         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
             test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
@@ -6978,7 +7041,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
                         if (mddev->curr_resync > 2) {
                                 status_resync(seq, mddev);
                                 seq_printf(seq, "\n      ");
-                        } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
+                        } else if (mddev->curr_resync >= 1)
                                 seq_printf(seq, "\tresync=DELAYED\n      ");
                         else if (mddev->recovery_cp < MaxSector)
                                 seq_printf(seq, "\tresync=PENDING\n      ");
@@ -7206,8 +7269,9 @@ EXPORT_SYMBOL_GPL(md_allow_write);
 
 #define SYNC_MARKS      10
 #define SYNC_MARK_STEP  (3*HZ)
-void md_do_sync(struct mddev *mddev)
+void md_do_sync(struct md_thread *thread)
 {
+        struct mddev *mddev = thread->mddev;
         struct mddev *mddev2;
         unsigned int currspeed = 0,
                  window;
@@ -7311,7 +7375,7 @@ void md_do_sync(struct mddev *mddev)
                  * which defaults to physical size, but can be virtual size
                  */
                 max_sectors = mddev->resync_max_sectors;
-                mddev->resync_mismatches = 0;
+                atomic64_set(&mddev->resync_mismatches, 0);
                 /* we don't use the checkpoint if there's a bitmap */
                 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
                         j = mddev->resync_min;
@@ -7367,8 +7431,11 @@ void md_do_sync(struct mddev *mddev)
                        "md: resuming %s of %s from checkpoint.\n",
                        desc, mdname(mddev));
                 mddev->curr_resync = j;
-        }
+        } else
+                mddev->curr_resync = 3; /* no longer delayed */
         mddev->curr_resync_completed = j;
+        sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+        md_new_event(mddev);
 
         blk_start_plug(&plug);
         while (j < max_sectors) {
@@ -7421,7 +7488,8 @@ void md_do_sync(struct mddev *mddev)
                         break;
 
                 j += sectors;
-                if (j>1) mddev->curr_resync = j;
+                if (j > 2)
+                        mddev->curr_resync = j;
                 mddev->curr_mark_cnt = io_sectors;
                 if (last_check == 0)
                         /* this is the earliest that rebuild will be
@@ -7543,8 +7611,6 @@ static int remove_and_add_spares(struct mddev *mddev)
         int spares = 0;
         int removed = 0;
 
-        mddev->curr_resync_completed = 0;
-
         rdev_for_each(rdev, mddev)
                 if (rdev->raid_disk >= 0 &&
                     !test_bit(Blocked, &rdev->flags) &&
@@ -7739,6 +7805,7 @@ void md_check_recovery(struct mddev *mddev)
                 /* Set RUNNING before clearing NEEDED to avoid
                  * any transients in the value of "sync_action".
                  */
+                mddev->curr_resync_completed = 0;
                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
                 /* Clear some bits that don't mean anything, but
                  * might be left set
@@ -7752,7 +7819,7 @@ void md_check_recovery(struct mddev *mddev)
                 /* no recovery is running.
                  * remove any failed drives, then
                  * add spares if possible.
-                 * Spare are also removed and re-added, to allow
+                 * Spares are also removed and re-added, to allow
                  * the personality to fail the re-add.
                  */
diff --git a/drivers/md/md.h b/drivers/md/md.h
index f385b038589d..af443ab868db 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -282,7 +282,7 @@ struct mddev {
 
         sector_t                        resync_max_sectors; /* may be set by personality */
 
-        sector_t                        resync_mismatches; /* count of sectors where
+        atomic64_t                      resync_mismatches; /* count of sectors where
                                                             * parity/replica mismatch found
                                                             */
 
@@ -540,12 +540,13 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
         list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
 
 struct md_thread {
-        void                    (*run) (struct mddev *mddev);
+        void                    (*run) (struct md_thread *thread);
         struct mddev            *mddev;
         wait_queue_head_t       wqueue;
         unsigned long           flags;
         struct task_struct      *tsk;
         unsigned long           timeout;
+        void                    *private;
 };
 
 #define THREAD_WAKEUP  0
@@ -584,7 +585,7 @@ static inline void safe_put_page(struct page *p)
 extern int register_md_personality(struct md_personality *p);
 extern int unregister_md_personality(struct md_personality *p);
 extern struct md_thread *md_register_thread(
-        void (*run)(struct mddev *mddev),
+        void (*run)(struct md_thread *thread),
         struct mddev *mddev,
         const char *name);
 extern void md_unregister_thread(struct md_thread **threadp);
@@ -603,7 +604,7 @@ extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 extern void md_super_wait(struct mddev *mddev);
 extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
                         struct page *page, int rw, bool metadata_op);
-extern void md_do_sync(struct mddev *mddev);
+extern void md_do_sync(struct md_thread *thread);
 extern void md_new_event(struct mddev *mddev);
 extern int md_allow_write(struct mddev *mddev);
 extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
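
md.h now types resync_mismatches as atomic64_t because several resync paths (raid1, raid10, raid5) can bump the counter concurrently, and a plain sector_t "+=" is a read-modify-write that can lose updates. A hedged sketch of the two sides of the counter (the helper names are illustrative, not md's):

static void note_mismatch(struct mddev *mddev, sector_t sectors)
{
        /* the read-modify-write becomes a single atomic operation */
        atomic64_add(sectors, &mddev->resync_mismatches);
}

static unsigned long long read_mismatches(struct mddev *mddev)
{
        /* readers (e.g. the mismatch_cnt sysfs file) see a consistent value */
        return (unsigned long long)atomic64_read(&mddev->resync_mismatches);
}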
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 61a1833ebaf3..1642eae75a33 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -335,8 +335,9 @@ abort:
  *      3.      Performs writes following reads for array syncronising.
  */
 
-static void multipathd (struct mddev *mddev)
+static void multipathd(struct md_thread *thread)
 {
+        struct mddev *mddev = thread->mddev;
         struct multipath_bh *mp_bh;
         struct bio *bio;
         unsigned long flags;
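
multipathd() above is the template for every personality daemon in this series: the thread function now receives its own struct md_thread and pulls the mddev out of it, which also gives it access to the new ->private slot. A hedged sketch of both halves of the conversion (exampled()/example_run() are hypothetical names):

static void exampled(struct md_thread *thread)
{
        struct mddev *mddev = thread->mddev;

        md_check_recovery(mddev);       /* as the real daemons do */
        /* ... then service the personality's pending work ... */
}

static int example_run(struct mddev *mddev)
{
        /* registration is unchanged apart from the function type */
        mddev->thread = md_register_thread(exampled, mddev, "example");
        return mddev->thread ? 0 : -ENOMEM;
}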
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index a9e4fa95dfaa..24b359717a7e 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -88,6 +88,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
         char b[BDEVNAME_SIZE];
         char b2[BDEVNAME_SIZE];
         struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+        bool discard_supported = false;
 
         if (!conf)
                 return -ENOMEM;
@@ -195,6 +196,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
                 if (!smallest || (rdev1->sectors < smallest->sectors))
                         smallest = rdev1;
                 cnt++;
+
+                if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
+                        discard_supported = true;
         }
         if (cnt != mddev->raid_disks) {
                 printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
@@ -272,6 +276,11 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
         blk_queue_io_opt(mddev->queue,
                          (mddev->chunk_sectors << 9) * mddev->raid_disks);
 
+        if (!discard_supported)
+                queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+        else
+                queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
         pr_debug("md/raid0:%s: done.\n", mdname(mddev));
         *private_conf = conf;
 
@@ -423,6 +432,7 @@ static int raid0_run(struct mddev *mddev)
                 return -EINVAL;
         blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
         blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
+        blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
 
         /* if private is not null, we are here after takeover */
         if (mddev->private == NULL) {
@@ -510,7 +520,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
                 sector_t sector = bio->bi_sector;
                 struct bio_pair *bp;
                 /* Sanity check -- queue functions should prevent this happening */
-                if (bio->bi_vcnt != 1 ||
+                if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
                     bio->bi_idx != 0)
                         goto bad_map;
                 /* This is a one page bio that upper layers
@@ -536,6 +546,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
         bio->bi_sector = sector_offset + zone->dev_start +
                 tmp_dev->data_offset;
 
+        if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+                     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
+                /* Just ignore it */
+                bio_endio(bio, 0);
+                return;
+        }
+
         generic_make_request(bio);
         return;
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 611b5f797618..8034fbd6190c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -333,9 +333,10 @@ static void raid1_end_read_request(struct bio *bio, int error)
                 spin_unlock_irqrestore(&conf->device_lock, flags);
         }
 
-        if (uptodate)
+        if (uptodate) {
                 raid_end_bio_io(r1_bio);
-        else {
+                rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+        } else {
                 /*
                  * oops, read error:
                  */
@@ -349,9 +350,8 @@ static void raid1_end_read_request(struct bio *bio, int error)
                          (unsigned long long)r1_bio->sector);
                 set_bit(R1BIO_ReadError, &r1_bio->state);
                 reschedule_retry(r1_bio);
+                /* don't drop the reference on read_disk yet */
         }
-
-        rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 }
 
 static void close_write(struct r1bio *r1_bio)
@@ -781,7 +781,12 @@ static void flush_pending_writes(struct r1conf *conf)
                 while (bio) { /* submit pending writes */
                         struct bio *next = bio->bi_next;
                         bio->bi_next = NULL;
-                        generic_make_request(bio);
+                        if (unlikely((bio->bi_rw & REQ_DISCARD) &&
+                            !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
+                                /* Just ignore it */
+                                bio_endio(bio, 0);
+                        else
+                                generic_make_request(bio);
                         bio = next;
                 }
         } else
@@ -994,6 +999,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
         const int rw = bio_data_dir(bio);
         const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
         const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
+        const unsigned long do_discard = (bio->bi_rw
+                                          & (REQ_DISCARD | REQ_SECURE));
         struct md_rdev *blocked_rdev;
         struct blk_plug_cb *cb;
         struct raid1_plug_cb *plug = NULL;
@@ -1295,7 +1302,7 @@ read_again:
                            conf->mirrors[i].rdev->data_offset);
                 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
                 mbio->bi_end_io = raid1_end_write_request;
-                mbio->bi_rw = WRITE | do_flush_fua | do_sync;
+                mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard;
                 mbio->bi_private = r1_bio;
 
                 atomic_inc(&r1_bio->remaining);
@@ -1549,6 +1556,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
                 clear_bit(Unmerged, &rdev->flags);
         }
         md_integrity_add_rdev(rdev, mddev);
+        if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+                queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
         print_conf(conf);
         return err;
 }
@@ -1867,7 +1876,7 @@ static int process_checks(struct r1bio *r1_bio)
                 } else
                         j = 0;
                 if (j >= 0)
-                        mddev->resync_mismatches += r1_bio->sectors;
+                        atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
                 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
                               && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
                         /* No need to write to this device. */
@@ -2220,6 +2229,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
                 unfreeze_array(conf);
         } else
                 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+        rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
 
         bio = r1_bio->bios[r1_bio->read_disk];
         bdevname(bio->bi_bdev, b);
@@ -2285,8 +2295,9 @@ read_more:
         }
 }
 
-static void raid1d(struct mddev *mddev)
+static void raid1d(struct md_thread *thread)
 {
+        struct mddev *mddev = thread->mddev;
         struct r1bio *r1_bio;
         unsigned long flags;
         struct r1conf *conf = mddev->private;
@@ -2783,6 +2794,7 @@ static int run(struct mddev *mddev)
         int i;
         struct md_rdev *rdev;
         int ret;
+        bool discard_supported = false;
 
         if (mddev->level != 1) {
                 printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
@@ -2812,6 +2824,8 @@ static int run(struct mddev *mddev)
                         continue;
                 disk_stack_limits(mddev->gendisk, rdev->bdev,
                                   rdev->data_offset << 9);
+                if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+                        discard_supported = true;
         }
 
         mddev->degraded = 0;
@@ -2846,6 +2860,13 @@ static int run(struct mddev *mddev)
                 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
                 mddev->queue->backing_dev_info.congested_data = mddev;
                 blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
+
+                if (discard_supported)
+                        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+                                                mddev->queue);
+                else
+                        queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+                                                  mddev->queue);
         }
 
         ret = md_integrity_register(mddev);
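
The raid1 read-error change above is about reference pairing on nr_pending: the reference taken when the read was issued may only be dropped once nothing can still touch the rdev. On success that is immediately; on error the r1_bio is queued for retry, and handle_read_error() now drops the reference only after md_error() and the re-read have used the device. A condensed sketch of the protocol (end_read() is an illustrative helper, not the driver's function):

static void end_read(struct r1conf *conf, struct r1bio *r1_bio,
                     int mirror, int uptodate)
{
        struct md_rdev *rdev = conf->mirrors[mirror].rdev;

        if (uptodate) {
                raid_end_bio_io(r1_bio);
                rdev_dec_pending(rdev, conf->mddev);    /* done with rdev */
        } else {
                set_bit(R1BIO_ReadError, &r1_bio->state);
                reschedule_retry(r1_bio);
                /* keep the reference: handle_read_error() releases it
                 * once it has finished with rdev */
        }
}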
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0138a727c1f3..906ccbd0f7dc 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -911,7 +911,12 @@ static void flush_pending_writes(struct r10conf *conf) | |||
911 | while (bio) { /* submit pending writes */ | 911 | while (bio) { /* submit pending writes */ |
912 | struct bio *next = bio->bi_next; | 912 | struct bio *next = bio->bi_next; |
913 | bio->bi_next = NULL; | 913 | bio->bi_next = NULL; |
914 | generic_make_request(bio); | 914 | if (unlikely((bio->bi_rw & REQ_DISCARD) && |
915 | !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) | ||
916 | /* Just ignore it */ | ||
917 | bio_endio(bio, 0); | ||
918 | else | ||
919 | generic_make_request(bio); | ||
915 | bio = next; | 920 | bio = next; |
916 | } | 921 | } |
917 | } else | 922 | } else |
@@ -1050,6 +1055,44 @@ static sector_t choose_data_offset(struct r10bio *r10_bio, | |||
1050 | return rdev->new_data_offset; | 1055 | return rdev->new_data_offset; |
1051 | } | 1056 | } |
1052 | 1057 | ||
1058 | struct raid10_plug_cb { | ||
1059 | struct blk_plug_cb cb; | ||
1060 | struct bio_list pending; | ||
1061 | int pending_cnt; | ||
1062 | }; | ||
1063 | |||
1064 | static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) | ||
1065 | { | ||
1066 | struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb, | ||
1067 | cb); | ||
1068 | struct mddev *mddev = plug->cb.data; | ||
1069 | struct r10conf *conf = mddev->private; | ||
1070 | struct bio *bio; | ||
1071 | |||
1072 | if (from_schedule) { | ||
1073 | spin_lock_irq(&conf->device_lock); | ||
1074 | bio_list_merge(&conf->pending_bio_list, &plug->pending); | ||
1075 | conf->pending_count += plug->pending_cnt; | ||
1076 | spin_unlock_irq(&conf->device_lock); | ||
1077 | md_wakeup_thread(mddev->thread); | ||
1078 | kfree(plug); | ||
1079 | return; | ||
1080 | } | ||
1081 | |||
1082 | /* we aren't scheduling, so we can do the write-out directly. */ | ||
1083 | bio = bio_list_get(&plug->pending); | ||
1084 | bitmap_unplug(mddev->bitmap); | ||
1085 | wake_up(&conf->wait_barrier); | ||
1086 | |||
1087 | while (bio) { /* submit pending writes */ | ||
1088 | struct bio *next = bio->bi_next; | ||
1089 | bio->bi_next = NULL; | ||
1090 | generic_make_request(bio); | ||
1091 | bio = next; | ||
1092 | } | ||
1093 | kfree(plug); | ||
1094 | } | ||
1095 | |||
1053 | static void make_request(struct mddev *mddev, struct bio * bio) | 1096 | static void make_request(struct mddev *mddev, struct bio * bio) |
1054 | { | 1097 | { |
1055 | struct r10conf *conf = mddev->private; | 1098 | struct r10conf *conf = mddev->private; |
@@ -1061,8 +1104,12 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1061 | const int rw = bio_data_dir(bio); | 1104 | const int rw = bio_data_dir(bio); |
1062 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); | 1105 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); |
1063 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); | 1106 | const unsigned long do_fua = (bio->bi_rw & REQ_FUA); |
1107 | const unsigned long do_discard = (bio->bi_rw | ||
1108 | & (REQ_DISCARD | REQ_SECURE)); | ||
1064 | unsigned long flags; | 1109 | unsigned long flags; |
1065 | struct md_rdev *blocked_rdev; | 1110 | struct md_rdev *blocked_rdev; |
1111 | struct blk_plug_cb *cb; | ||
1112 | struct raid10_plug_cb *plug = NULL; | ||
1066 | int sectors_handled; | 1113 | int sectors_handled; |
1067 | int max_sectors; | 1114 | int max_sectors; |
1068 | int sectors; | 1115 | int sectors; |
@@ -1081,7 +1128,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) | |||
1081 | || conf->prev.near_copies < conf->prev.raid_disks))) { | 1128 | || conf->prev.near_copies < conf->prev.raid_disks))) { |
1082 | struct bio_pair *bp; | 1129 | struct bio_pair *bp; |
1083 | /* Sanity check -- queue functions should prevent this happening */ | 1130 | /* Sanity check -- queue functions should prevent this happening */ |
1084 | if (bio->bi_vcnt != 1 || | 1131 | if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || |
1085 | bio->bi_idx != 0) | 1132 | bio->bi_idx != 0) |
1086 | goto bad_map; | 1133 | goto bad_map; |
1087 | /* This is a one page bio that upper layers | 1134 | /* This is a one page bio that upper layers |
@@ -1410,15 +1457,26 @@ retry_write: | |||
1410 | conf->mirrors[d].rdev)); | 1457 | conf->mirrors[d].rdev)); |
1411 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; | 1458 | mbio->bi_bdev = conf->mirrors[d].rdev->bdev; |
1412 | mbio->bi_end_io = raid10_end_write_request; | 1459 | mbio->bi_end_io = raid10_end_write_request; |
1413 | mbio->bi_rw = WRITE | do_sync | do_fua; | 1460 | mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; |
1414 | mbio->bi_private = r10_bio; | 1461 | mbio->bi_private = r10_bio; |
1415 | 1462 | ||
1416 | atomic_inc(&r10_bio->remaining); | 1463 | atomic_inc(&r10_bio->remaining); |
1464 | |||
1465 | cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); | ||
1466 | if (cb) | ||
1467 | plug = container_of(cb, struct raid10_plug_cb, cb); | ||
1468 | else | ||
1469 | plug = NULL; | ||
1417 | spin_lock_irqsave(&conf->device_lock, flags); | 1470 | spin_lock_irqsave(&conf->device_lock, flags); |
1418 | bio_list_add(&conf->pending_bio_list, mbio); | 1471 | if (plug) { |
1419 | conf->pending_count++; | 1472 | bio_list_add(&plug->pending, mbio); |
1473 | plug->pending_cnt++; | ||
1474 | } else { | ||
1475 | bio_list_add(&conf->pending_bio_list, mbio); | ||
1476 | conf->pending_count++; | ||
1477 | } | ||
1420 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1478 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1421 | if (!mddev_check_plugged(mddev)) | 1479 | if (!plug) |
1422 | md_wakeup_thread(mddev->thread); | 1480 | md_wakeup_thread(mddev->thread); |
1423 | 1481 | ||
1424 | if (!r10_bio->devs[i].repl_bio) | 1482 | if (!r10_bio->devs[i].repl_bio) |
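
The plugging change relies on blk_check_plugged() handing back a generic struct blk_plug_cb that is embedded as the first member of the caller's larger raid10_plug_cb, so container_of() can recover the wrapper. A self-contained model of that embed-and-recover idiom; plug_cb and raid_plug_cb are stand-ins for the kernel types.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Generic callback header, analogous in role to struct blk_plug_cb. */
struct plug_cb {
        void (*callback)(struct plug_cb *cb);
};

/* Caller-specific wrapper; the generic header is the first member so
 * one pointer can serve both roles. */
struct raid_plug_cb {
        struct plug_cb cb;
        int pending_cnt;
};

static void my_unplug(struct plug_cb *cb)
{
        struct raid_plug_cb *plug = container_of(cb, struct raid_plug_cb, cb);

        printf("flushing %d pending bios\n", plug->pending_cnt);
}

int main(void)
{
        struct raid_plug_cb plug = { { my_unplug }, 3 };

        plug.cb.callback(&plug.cb);     /* generic layer sees only plug_cb */
        return 0;
}
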
@@ -1439,7 +1497,7 @@ retry_write: | |||
1439 | conf->mirrors[d].replacement)); | 1497 | conf->mirrors[d].replacement)); |
1440 | mbio->bi_bdev = conf->mirrors[d].replacement->bdev; | 1498 | mbio->bi_bdev = conf->mirrors[d].replacement->bdev; |
1441 | mbio->bi_end_io = raid10_end_write_request; | 1499 | mbio->bi_end_io = raid10_end_write_request; |
1442 | mbio->bi_rw = WRITE | do_sync | do_fua; | 1500 | mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; |
1443 | mbio->bi_private = r10_bio; | 1501 | mbio->bi_private = r10_bio; |
1444 | 1502 | ||
1445 | atomic_inc(&r10_bio->remaining); | 1503 | atomic_inc(&r10_bio->remaining); |
@@ -1638,7 +1696,7 @@ static int raid10_spare_active(struct mddev *mddev) | |||
1638 | && !test_bit(Faulty, &tmp->rdev->flags) | 1696 | && !test_bit(Faulty, &tmp->rdev->flags) |
1639 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { | 1697 | && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { |
1640 | count++; | 1698 | count++; |
1641 | sysfs_notify_dirent(tmp->rdev->sysfs_state); | 1699 | sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); |
1642 | } | 1700 | } |
1643 | } | 1701 | } |
1644 | spin_lock_irqsave(&conf->device_lock, flags); | 1702 | spin_lock_irqsave(&conf->device_lock, flags); |
@@ -1725,6 +1783,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1725 | clear_bit(Unmerged, &rdev->flags); | 1783 | clear_bit(Unmerged, &rdev->flags); |
1726 | } | 1784 | } |
1727 | md_integrity_add_rdev(rdev, mddev); | 1785 | md_integrity_add_rdev(rdev, mddev); |
1786 | if (blk_queue_discard(bdev_get_queue(rdev->bdev))) | ||
1787 | queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); | ||
1788 | |||
1728 | print_conf(conf); | 1789 | print_conf(conf); |
1729 | return err; | 1790 | return err; |
1730 | } | 1791 | } |
@@ -1952,7 +2013,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |||
1952 | break; | 2013 | break; |
1953 | if (j == vcnt) | 2014 | if (j == vcnt) |
1954 | continue; | 2015 | continue; |
1955 | mddev->resync_mismatches += r10_bio->sectors; | 2016 | atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); |
1956 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) | 2017 | if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) |
1957 | /* Don't fix anything. */ | 2018 | /* Don't fix anything. */ |
1958 | continue; | 2019 | continue; |
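
resync_mismatches used to be bumped with a plain +=, which can lose increments when two contexts race; the series converts it to atomic64_t (the raid5 hunks below get the same treatment). A C11 model of the fix, using atomic_fetch_add on a 64-bit atomic counter.

#include <stdatomic.h>
#include <stdio.h>

/* Shared mismatch counter; atomic, so concurrent adders cannot lose
 * each other's updates the way "counter += n" on a plain field can. */
static atomic_llong resync_mismatches = 0;

static void record_mismatch(long long sectors)
{
        atomic_fetch_add_explicit(&resync_mismatches, sectors,
                                  memory_order_relaxed);
}

int main(void)
{
        record_mismatch(8);     /* e.g. one stripe's worth of sectors */
        record_mismatch(8);
        printf("mismatches: %lld\n",
               atomic_load_explicit(&resync_mismatches,
                                    memory_order_relaxed));
        return 0;
}

Relaxed ordering suffices here because the value is a statistic; nothing else synchronizes through it.
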
@@ -2673,8 +2734,9 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) | |||
2673 | } | 2734 | } |
2674 | } | 2735 | } |
2675 | 2736 | ||
2676 | static void raid10d(struct mddev *mddev) | 2737 | static void raid10d(struct md_thread *thread) |
2677 | { | 2738 | { |
2739 | struct mddev *mddev = thread->mddev; | ||
2678 | struct r10bio *r10_bio; | 2740 | struct r10bio *r10_bio; |
2679 | unsigned long flags; | 2741 | unsigned long flags; |
2680 | struct r10conf *conf = mddev->private; | 2742 | struct r10conf *conf = mddev->private; |
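
raid10d (and raid5d further down) now takes the struct md_thread itself and derives the owning mddev from thread->mddev, giving every personality daemon one uniform prototype. A small sketch of the shape of that change; md_thread_model and raid_daemon are hypothetical names, not kernel API.

#include <stdio.h>

struct mddev;                           /* owner, details omitted */

struct md_thread_model {                /* hypothetical mirror of md_thread */
        void (*run)(struct md_thread_model *thread);
        struct mddev *mddev;            /* back-pointer to the owner */
};

struct mddev {
        const char *name;
};

/* The worker receives its own thread record and recovers the owner
 * from it, instead of being handed the owner directly. */
static void raid_daemon(struct md_thread_model *thread)
{
        struct mddev *mddev = thread->mddev;

        printf("servicing %s\n", mddev->name);
}

int main(void)
{
        struct mddev md = { "md0" };
        struct md_thread_model t = { raid_daemon, &md };

        t.run(&t);
        return 0;
}
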
@@ -3158,7 +3220,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, | |||
3158 | else { | 3220 | else { |
3159 | bad_sectors -= (sector - first_bad); | 3221 | bad_sectors -= (sector - first_bad); |
3160 | if (max_sync > bad_sectors) | 3222 | if (max_sync > bad_sectors) |
3161 | max_sync = max_sync; | 3223 | max_sync = bad_sectors; |
3162 | continue; | 3224 | continue; |
3163 | } | 3225 | } |
3164 | } | 3226 | } |
@@ -3482,6 +3544,7 @@ static int run(struct mddev *mddev) | |||
3482 | sector_t size; | 3544 | sector_t size; |
3483 | sector_t min_offset_diff = 0; | 3545 | sector_t min_offset_diff = 0; |
3484 | int first = 1; | 3546 | int first = 1; |
3547 | bool discard_supported = false; | ||
3485 | 3548 | ||
3486 | if (mddev->private == NULL) { | 3549 | if (mddev->private == NULL) { |
3487 | conf = setup_conf(mddev); | 3550 | conf = setup_conf(mddev); |
@@ -3498,6 +3561,8 @@ static int run(struct mddev *mddev) | |||
3498 | 3561 | ||
3499 | chunk_size = mddev->chunk_sectors << 9; | 3562 | chunk_size = mddev->chunk_sectors << 9; |
3500 | if (mddev->queue) { | 3563 | if (mddev->queue) { |
3564 | blk_queue_max_discard_sectors(mddev->queue, | ||
3565 | mddev->chunk_sectors); | ||
3501 | blk_queue_io_min(mddev->queue, chunk_size); | 3566 | blk_queue_io_min(mddev->queue, chunk_size); |
3502 | if (conf->geo.raid_disks % conf->geo.near_copies) | 3567 | if (conf->geo.raid_disks % conf->geo.near_copies) |
3503 | blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); | 3568 | blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); |
@@ -3543,8 +3608,16 @@ static int run(struct mddev *mddev) | |||
3543 | rdev->data_offset << 9); | 3608 | rdev->data_offset << 9); |
3544 | 3609 | ||
3545 | disk->head_position = 0; | 3610 | disk->head_position = 0; |
3611 | |||
3612 | if (blk_queue_discard(bdev_get_queue(rdev->bdev))) | ||
3613 | discard_supported = true; | ||
3546 | } | 3614 | } |
3547 | 3615 | ||
3616 | if (discard_supported) | ||
3617 | queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); | ||
3618 | else | ||
3619 | queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); | ||
3620 | |||
3548 | /* need to check that every block has at least one working mirror */ | 3621 | /* need to check that every block has at least one working mirror */ |
3549 | if (!enough(conf, -1)) { | 3622 | if (!enough(conf, -1)) { |
3550 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", | 3623 | printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", |
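
Whether the array advertises discard is decided by scanning the members: for raid10, one member that supports discard is enough to set QUEUE_FLAG_DISCARD, otherwise the flag is cleared. A compilable sketch of that aggregation; struct member and array_discard_supported() are illustrative only (raid5 applies a stricter rule, shown further down).

#include <stdbool.h>
#include <stdio.h>

struct member {
        const char *name;
        bool supports_discard;
};

/* Mirrors the scan in run() above: one capable member is enough for
 * raid10 to advertise discard on the array queue. */
static bool array_discard_supported(const struct member *m, int n)
{
        bool supported = false;

        for (int i = 0; i < n; i++)
                if (m[i].supports_discard)
                        supported = true;
        return supported;
}

int main(void)
{
        struct member disks[] = {
                { "sda", false }, { "sdb", true }, { "sdc", false },
        };

        printf("QUEUE_FLAG_DISCARD: %s\n",
               array_discard_supported(disks, 3) ? "set" : "clear");
        return 0;
}
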
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0689173fd9f5..c5439dce0295 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -551,6 +551,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) | |||
551 | rw = WRITE_FUA; | 551 | rw = WRITE_FUA; |
552 | else | 552 | else |
553 | rw = WRITE; | 553 | rw = WRITE; |
554 | if (test_bit(R5_Discard, &sh->dev[i].flags)) | ||
555 | rw |= REQ_DISCARD; | ||
554 | } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) | 556 | } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) |
555 | rw = READ; | 557 | rw = READ; |
556 | else if (test_and_clear_bit(R5_WantReplace, | 558 | else if (test_and_clear_bit(R5_WantReplace, |
@@ -1174,8 +1176,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | |||
1174 | set_bit(R5_WantFUA, &dev->flags); | 1176 | set_bit(R5_WantFUA, &dev->flags); |
1175 | if (wbi->bi_rw & REQ_SYNC) | 1177 | if (wbi->bi_rw & REQ_SYNC) |
1176 | set_bit(R5_SyncIO, &dev->flags); | 1178 | set_bit(R5_SyncIO, &dev->flags); |
1177 | tx = async_copy_data(1, wbi, dev->page, | 1179 | if (wbi->bi_rw & REQ_DISCARD) |
1178 | dev->sector, tx); | 1180 | set_bit(R5_Discard, &dev->flags); |
1181 | else | ||
1182 | tx = async_copy_data(1, wbi, dev->page, | ||
1183 | dev->sector, tx); | ||
1179 | wbi = r5_next_bio(wbi, dev->sector); | 1184 | wbi = r5_next_bio(wbi, dev->sector); |
1180 | } | 1185 | } |
1181 | } | 1186 | } |
@@ -1191,7 +1196,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) | |||
1191 | int pd_idx = sh->pd_idx; | 1196 | int pd_idx = sh->pd_idx; |
1192 | int qd_idx = sh->qd_idx; | 1197 | int qd_idx = sh->qd_idx; |
1193 | int i; | 1198 | int i; |
1194 | bool fua = false, sync = false; | 1199 | bool fua = false, sync = false, discard = false; |
1195 | 1200 | ||
1196 | pr_debug("%s: stripe %llu\n", __func__, | 1201 | pr_debug("%s: stripe %llu\n", __func__, |
1197 | (unsigned long long)sh->sector); | 1202 | (unsigned long long)sh->sector); |
@@ -1199,13 +1204,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref) | |||
1199 | for (i = disks; i--; ) { | 1204 | for (i = disks; i--; ) { |
1200 | fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); | 1205 | fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); |
1201 | sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); | 1206 | sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); |
1207 | discard |= test_bit(R5_Discard, &sh->dev[i].flags); | ||
1202 | } | 1208 | } |
1203 | 1209 | ||
1204 | for (i = disks; i--; ) { | 1210 | for (i = disks; i--; ) { |
1205 | struct r5dev *dev = &sh->dev[i]; | 1211 | struct r5dev *dev = &sh->dev[i]; |
1206 | 1212 | ||
1207 | if (dev->written || i == pd_idx || i == qd_idx) { | 1213 | if (dev->written || i == pd_idx || i == qd_idx) { |
1208 | set_bit(R5_UPTODATE, &dev->flags); | 1214 | if (!discard) |
1215 | set_bit(R5_UPTODATE, &dev->flags); | ||
1209 | if (fua) | 1216 | if (fua) |
1210 | set_bit(R5_WantFUA, &dev->flags); | 1217 | set_bit(R5_WantFUA, &dev->flags); |
1211 | if (sync) | 1218 | if (sync) |
@@ -1241,6 +1248,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
1241 | pr_debug("%s: stripe %llu\n", __func__, | 1248 | pr_debug("%s: stripe %llu\n", __func__, |
1242 | (unsigned long long)sh->sector); | 1249 | (unsigned long long)sh->sector); |
1243 | 1250 | ||
1251 | for (i = 0; i < sh->disks; i++) { | ||
1252 | if (pd_idx == i) | ||
1253 | continue; | ||
1254 | if (!test_bit(R5_Discard, &sh->dev[i].flags)) | ||
1255 | break; | ||
1256 | } | ||
1257 | if (i >= sh->disks) { | ||
1258 | atomic_inc(&sh->count); | ||
1259 | set_bit(R5_Discard, &sh->dev[pd_idx].flags); | ||
1260 | ops_complete_reconstruct(sh); | ||
1261 | return; | ||
1262 | } | ||
1244 | /* check if prexor is active which means only process blocks | 1263 | /* check if prexor is active which means only process blocks |
1245 | * that are part of a read-modify-write (written) | 1264 | * that are part of a read-modify-write (written) |
1246 | */ | 1265 | */ |
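
Both reconstruct paths gain the same short-circuit: if every data block in the stripe carries R5_Discard, there is nothing to XOR, so the parity device(s) are flagged for discard as well and completion runs immediately. A minimal model of that scan-and-skip decision; the fixed-size dev_state array stands in for the stripe's r5dev slots.

#include <stdbool.h>
#include <stdio.h>

#define NDISKS 5

struct dev_state {
        bool discard;                   /* models the R5_Discard flag */
};

/* Returns true when every non-parity device is being discarded; the
 * parity slot is then discarded too and the XOR pass can be skipped,
 * as in the early return added to ops_run_reconstruct5(). */
static bool whole_stripe_discard(struct dev_state *dev, int pd_idx)
{
        int i;

        for (i = 0; i < NDISKS; i++) {
                if (i == pd_idx)
                        continue;
                if (!dev[i].discard)
                        return false;
        }
        dev[pd_idx].discard = true;     /* parity follows the data */
        return true;
}

int main(void)
{
        struct dev_state dev[NDISKS] = {
                { true }, { true }, { false /* parity */ }, { true }, { true },
        };

        printf("skip parity compute: %s\n",
               whole_stripe_discard(dev, 2) ? "yes" : "no");
        return 0;
}
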
@@ -1285,10 +1304,24 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, | |||
1285 | { | 1304 | { |
1286 | struct async_submit_ctl submit; | 1305 | struct async_submit_ctl submit; |
1287 | struct page **blocks = percpu->scribble; | 1306 | struct page **blocks = percpu->scribble; |
1288 | int count; | 1307 | int count, i; |
1289 | 1308 | ||
1290 | pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); | 1309 | pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); |
1291 | 1310 | ||
1311 | for (i = 0; i < sh->disks; i++) { | ||
1312 | if (sh->pd_idx == i || sh->qd_idx == i) | ||
1313 | continue; | ||
1314 | if (!test_bit(R5_Discard, &sh->dev[i].flags)) | ||
1315 | break; | ||
1316 | } | ||
1317 | if (i >= sh->disks) { | ||
1318 | atomic_inc(&sh->count); | ||
1319 | set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); | ||
1320 | set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); | ||
1321 | ops_complete_reconstruct(sh); | ||
1322 | return; | ||
1323 | } | ||
1324 | |||
1292 | count = set_syndrome_sources(blocks, sh); | 1325 | count = set_syndrome_sources(blocks, sh); |
1293 | 1326 | ||
1294 | atomic_inc(&sh->count); | 1327 | atomic_inc(&sh->count); |
@@ -2408,11 +2441,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | |||
2408 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | 2441 | if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) |
2409 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | 2442 | set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); |
2410 | } | 2443 | } |
2411 | spin_unlock_irq(&sh->stripe_lock); | ||
2412 | 2444 | ||
2413 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | 2445 | pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", |
2414 | (unsigned long long)(*bip)->bi_sector, | 2446 | (unsigned long long)(*bip)->bi_sector, |
2415 | (unsigned long long)sh->sector, dd_idx); | 2447 | (unsigned long long)sh->sector, dd_idx); |
2448 | spin_unlock_irq(&sh->stripe_lock); | ||
2416 | 2449 | ||
2417 | if (conf->mddev->bitmap && firstwrite) { | 2450 | if (conf->mddev->bitmap && firstwrite) { |
2418 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, | 2451 | bitmap_startwrite(conf->mddev->bitmap, sh->sector, |
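
Moving the pr_debug() above the unlock matters because the message dereferences *bip, which is only stable while stripe_lock is held. A small pthread sketch of the rule that a diagnostic reading shared state belongs inside the critical section; the names are illustrative.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t stripe_lock = PTHREAD_MUTEX_INITIALIZER;
static int *shared_bip;                 /* protected by stripe_lock */

/* The pointer is only stable while the lock is held, so a diagnostic
 * that dereferences it must run before the unlock, which is exactly
 * why the pr_debug() above moved ahead of spin_unlock_irq(). */
static void add_entry(int *entry)
{
        pthread_mutex_lock(&stripe_lock);
        shared_bip = entry;
        printf("added entry %d\n", *shared_bip);  /* safe: lock held */
        pthread_mutex_unlock(&stripe_lock);
        /* dereferencing shared_bip here could race with a writer */
}

int main(void)
{
        int v = 42;

        add_entry(&v);
        return 0;
}
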
@@ -2479,10 +2512,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
2479 | bi = sh->dev[i].towrite; | 2512 | bi = sh->dev[i].towrite; |
2480 | sh->dev[i].towrite = NULL; | 2513 | sh->dev[i].towrite = NULL; |
2481 | spin_unlock_irq(&sh->stripe_lock); | 2514 | spin_unlock_irq(&sh->stripe_lock); |
2482 | if (bi) { | 2515 | if (bi) |
2483 | s->to_write--; | ||
2484 | bitmap_end = 1; | 2516 | bitmap_end = 1; |
2485 | } | ||
2486 | 2517 | ||
2487 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | 2518 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) |
2488 | wake_up(&conf->wait_for_overlap); | 2519 | wake_up(&conf->wait_for_overlap); |
@@ -2524,11 +2555,12 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | |||
2524 | if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && | 2555 | if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && |
2525 | (!test_bit(R5_Insync, &sh->dev[i].flags) || | 2556 | (!test_bit(R5_Insync, &sh->dev[i].flags) || |
2526 | test_bit(R5_ReadError, &sh->dev[i].flags))) { | 2557 | test_bit(R5_ReadError, &sh->dev[i].flags))) { |
2558 | spin_lock_irq(&sh->stripe_lock); | ||
2527 | bi = sh->dev[i].toread; | 2559 | bi = sh->dev[i].toread; |
2528 | sh->dev[i].toread = NULL; | 2560 | sh->dev[i].toread = NULL; |
2561 | spin_unlock_irq(&sh->stripe_lock); | ||
2529 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) | 2562 | if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) |
2530 | wake_up(&conf->wait_for_overlap); | 2563 | wake_up(&conf->wait_for_overlap); |
2531 | if (bi) s->to_read--; | ||
2532 | while (bi && bi->bi_sector < | 2564 | while (bi && bi->bi_sector < |
2533 | sh->dev[i].sector + STRIPE_SECTORS) { | 2565 | sh->dev[i].sector + STRIPE_SECTORS) { |
2534 | struct bio *nextbi = | 2566 | struct bio *nextbi = |
@@ -2741,7 +2773,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, | |||
2741 | if (sh->dev[i].written) { | 2773 | if (sh->dev[i].written) { |
2742 | dev = &sh->dev[i]; | 2774 | dev = &sh->dev[i]; |
2743 | if (!test_bit(R5_LOCKED, &dev->flags) && | 2775 | if (!test_bit(R5_LOCKED, &dev->flags) && |
2744 | test_bit(R5_UPTODATE, &dev->flags)) { | 2776 | (test_bit(R5_UPTODATE, &dev->flags) || |
2777 | test_and_clear_bit(R5_Discard, &dev->flags))) { | ||
2745 | /* We can return any write requests */ | 2778 | /* We can return any write requests */ |
2746 | struct bio *wbi, *wbi2; | 2779 | struct bio *wbi, *wbi2; |
2747 | pr_debug("Return write for disc %d\n", i); | 2780 | pr_debug("Return write for disc %d\n", i); |
@@ -2775,12 +2808,25 @@ static void handle_stripe_dirtying(struct r5conf *conf, | |||
2775 | int disks) | 2808 | int disks) |
2776 | { | 2809 | { |
2777 | int rmw = 0, rcw = 0, i; | 2810 | int rmw = 0, rcw = 0, i; |
2778 | if (conf->max_degraded == 2) { | 2811 | sector_t recovery_cp = conf->mddev->recovery_cp; |
2779 | /* RAID6 requires 'rcw' in current implementation | 2812 | |
2780 | * Calculate the real rcw later - for now fake it | 2813 | /* RAID6 requires 'rcw' in current implementation. |
2814 | * Otherwise, check whether resync is now happening or should start. | ||
2815 | * If yes, then the array is dirty (after unclean shutdown or | ||
2816 | * initial creation), so parity in some stripes might be inconsistent. | ||
2817 | * In this case, we need to always do reconstruct-write, to ensure | ||
2818 | * that in case of drive failure or read-error correction, we | ||
2819 | * generate correct data from the parity. | ||
2820 | */ | ||
2821 | if (conf->max_degraded == 2 || | ||
2822 | (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { | ||
2823 | /* Calculate the real rcw later - for now make it | ||
2781 | * look like rcw is cheaper | 2824 | * look like rcw is cheaper |
2782 | */ | 2825 | */ |
2783 | rcw = 1; rmw = 2; | 2826 | rcw = 1; rmw = 2; |
2827 | pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", | ||
2828 | conf->max_degraded, (unsigned long long)recovery_cp, | ||
2829 | (unsigned long long)sh->sector); | ||
2784 | } else for (i = disks; i--; ) { | 2830 | } else for (i = disks; i--; ) { |
2785 | /* would I have to read this buffer for read_modify_write */ | 2831 | /* would I have to read this buffer for read_modify_write */ |
2786 | struct r5dev *dev = &sh->dev[i]; | 2832 | struct r5dev *dev = &sh->dev[i]; |
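
The dirtying logic chooses between read-modify-write (read the blocks being overwritten plus parity) and reconstruct-write (read the data blocks not being overwritten and recompute parity from data alone). Only the latter yields parity guaranteed consistent with the data, which is why a dirty array, like RAID6, is forced onto the rcw path. Below is a simplified cost model counting device reads only; the real code also skips blocks already cached up to date.

#include <stdbool.h>
#include <stdio.h>

/* Read cost of the two ways to update single parity for a
 * partial-stripe write (block counts only):
 *   RMW: read each block being written, plus the parity block
 *   RCW: read each data block NOT being written
 * Forcing RCW when parity may be stale regenerates parity purely
 * from data, which is the point of the hunk above. */
static void choose_update(int data_disks, int blocks_written,
                          bool parity_may_be_stale)
{
        int rmw = blocks_written + 1;
        int rcw = data_disks - blocks_written;

        if (parity_may_be_stale || rcw <= rmw)
                printf("reconstruct-write (%d reads)\n", rcw);
        else
                printf("read-modify-write (%d reads)\n", rmw);
}

int main(void)
{
        choose_update(8, 2, false);     /* small write: RMW wins */
        choose_update(8, 2, true);      /* dirty array: RCW forced */
        return 0;
}
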
@@ -2932,7 +2978,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, | |||
2932 | */ | 2978 | */ |
2933 | set_bit(STRIPE_INSYNC, &sh->state); | 2979 | set_bit(STRIPE_INSYNC, &sh->state); |
2934 | else { | 2980 | else { |
2935 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | 2981 | atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); |
2936 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | 2982 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) |
2937 | /* don't try to repair!! */ | 2983 | /* don't try to repair!! */ |
2938 | set_bit(STRIPE_INSYNC, &sh->state); | 2984 | set_bit(STRIPE_INSYNC, &sh->state); |
@@ -3084,7 +3130,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, | |||
3084 | */ | 3130 | */ |
3085 | } | 3131 | } |
3086 | } else { | 3132 | } else { |
3087 | conf->mddev->resync_mismatches += STRIPE_SECTORS; | 3133 | atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); |
3088 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) | 3134 | if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) |
3089 | /* don't try to repair!! */ | 3135 | /* don't try to repair!! */ |
3090 | set_bit(STRIPE_INSYNC, &sh->state); | 3136 | set_bit(STRIPE_INSYNC, &sh->state); |
@@ -3459,10 +3505,12 @@ static void handle_stripe(struct stripe_head *sh) | |||
3459 | if (s.written && | 3505 | if (s.written && |
3460 | (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) | 3506 | (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) |
3461 | && !test_bit(R5_LOCKED, &pdev->flags) | 3507 | && !test_bit(R5_LOCKED, &pdev->flags) |
3462 | && test_bit(R5_UPTODATE, &pdev->flags)))) && | 3508 | && (test_bit(R5_UPTODATE, &pdev->flags) || |
3509 | test_bit(R5_Discard, &pdev->flags))))) && | ||
3463 | (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) | 3510 | (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) |
3464 | && !test_bit(R5_LOCKED, &qdev->flags) | 3511 | && !test_bit(R5_LOCKED, &qdev->flags) |
3465 | && test_bit(R5_UPTODATE, &qdev->flags))))) | 3512 | && (test_bit(R5_UPTODATE, &qdev->flags) || |
3513 | test_bit(R5_Discard, &qdev->flags)))))) | ||
3466 | handle_stripe_clean_event(conf, sh, disks, &s.return_bi); | 3514 | handle_stripe_clean_event(conf, sh, disks, &s.return_bi); |
3467 | 3515 | ||
3468 | /* Now we might consider reading some blocks, either to check/generate | 3516 | /* Now we might consider reading some blocks, either to check/generate |
@@ -3489,9 +3537,11 @@ static void handle_stripe(struct stripe_head *sh) | |||
3489 | /* All the 'written' buffers and the parity block are ready to | 3537 | /* All the 'written' buffers and the parity block are ready to |
3490 | * be written back to disk | 3538 | * be written back to disk |
3491 | */ | 3539 | */ |
3492 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); | 3540 | BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && |
3541 | !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); | ||
3493 | BUG_ON(sh->qd_idx >= 0 && | 3542 | BUG_ON(sh->qd_idx >= 0 && |
3494 | !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); | 3543 | !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && |
3544 | !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); | ||
3495 | for (i = disks; i--; ) { | 3545 | for (i = disks; i--; ) { |
3496 | struct r5dev *dev = &sh->dev[i]; | 3546 | struct r5dev *dev = &sh->dev[i]; |
3497 | if (test_bit(R5_LOCKED, &dev->flags) && | 3547 | if (test_bit(R5_LOCKED, &dev->flags) && |
@@ -4072,6 +4122,88 @@ static void release_stripe_plug(struct mddev *mddev, | |||
4072 | release_stripe(sh); | 4122 | release_stripe(sh); |
4073 | } | 4123 | } |
4074 | 4124 | ||
4125 | static void make_discard_request(struct mddev *mddev, struct bio *bi) | ||
4126 | { | ||
4127 | struct r5conf *conf = mddev->private; | ||
4128 | sector_t logical_sector, last_sector; | ||
4129 | struct stripe_head *sh; | ||
4130 | int remaining; | ||
4131 | int stripe_sectors; | ||
4132 | |||
4133 | if (mddev->reshape_position != MaxSector) | ||
4134 | /* Skip discard while reshape is happening */ | ||
4135 | return; | ||
4136 | |||
4137 | logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); | ||
4138 | last_sector = bi->bi_sector + (bi->bi_size>>9); | ||
4139 | |||
4140 | bi->bi_next = NULL; | ||
4141 | bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ | ||
4142 | |||
4143 | stripe_sectors = conf->chunk_sectors * | ||
4144 | (conf->raid_disks - conf->max_degraded); | ||
4145 | logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, | ||
4146 | stripe_sectors); | ||
4147 | sector_div(last_sector, stripe_sectors); | ||
4148 | |||
4149 | logical_sector *= conf->chunk_sectors; | ||
4150 | last_sector *= conf->chunk_sectors; | ||
4151 | |||
4152 | for (; logical_sector < last_sector; | ||
4153 | logical_sector += STRIPE_SECTORS) { | ||
4154 | DEFINE_WAIT(w); | ||
4155 | int d; | ||
4156 | again: | ||
4157 | sh = get_active_stripe(conf, logical_sector, 0, 0, 0); | ||
4158 | prepare_to_wait(&conf->wait_for_overlap, &w, | ||
4159 | TASK_UNINTERRUPTIBLE); | ||
4160 | spin_lock_irq(&sh->stripe_lock); | ||
4161 | for (d = 0; d < conf->raid_disks; d++) { | ||
4162 | if (d == sh->pd_idx || d == sh->qd_idx) | ||
4163 | continue; | ||
4164 | if (sh->dev[d].towrite || sh->dev[d].toread) { | ||
4165 | set_bit(R5_Overlap, &sh->dev[d].flags); | ||
4166 | spin_unlock_irq(&sh->stripe_lock); | ||
4167 | release_stripe(sh); | ||
4168 | schedule(); | ||
4169 | goto again; | ||
4170 | } | ||
4171 | } | ||
4172 | finish_wait(&conf->wait_for_overlap, &w); | ||
4173 | for (d = 0; d < conf->raid_disks; d++) { | ||
4174 | if (d == sh->pd_idx || d == sh->qd_idx) | ||
4175 | continue; | ||
4176 | sh->dev[d].towrite = bi; | ||
4177 | set_bit(R5_OVERWRITE, &sh->dev[d].flags); | ||
4178 | raid5_inc_bi_active_stripes(bi); | ||
4179 | } | ||
4180 | spin_unlock_irq(&sh->stripe_lock); | ||
4181 | if (conf->mddev->bitmap) { | ||
4182 | for (d = 0; | ||
4183 | d < conf->raid_disks - conf->max_degraded; | ||
4184 | d++) | ||
4185 | bitmap_startwrite(mddev->bitmap, | ||
4186 | sh->sector, | ||
4187 | STRIPE_SECTORS, | ||
4188 | 0); | ||
4189 | sh->bm_seq = conf->seq_flush + 1; | ||
4190 | set_bit(STRIPE_BIT_DELAY, &sh->state); | ||
4191 | } | ||
4192 | |||
4193 | set_bit(STRIPE_HANDLE, &sh->state); | ||
4194 | clear_bit(STRIPE_DELAYED, &sh->state); | ||
4195 | if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) | ||
4196 | atomic_inc(&conf->preread_active_stripes); | ||
4197 | release_stripe_plug(mddev, sh); | ||
4198 | } | ||
4199 | |||
4200 | remaining = raid5_dec_bi_active_stripes(bi); | ||
4201 | if (remaining == 0) { | ||
4202 | md_write_end(mddev); | ||
4203 | bio_endio(bi, 0); | ||
4204 | } | ||
4205 | } | ||
4206 | |||
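
The arithmetic at the top of make_discard_request() keeps only whole stripes: the start sector is rounded up and the end sector rounded down to multiples of stripe_sectors = chunk_sectors * data disks (the kernel then converts the resulting stripe index into per-device addresses by multiplying with chunk_sectors). A worked, compilable example of the rounding in logical-sector space; the sample geometry in main() is made up.

#include <stdio.h>

typedef unsigned long long sector_t;

/* Trim a discard [start, end) down to whole stripes, in logical
 * sector space. Simplification: the kernel goes on to convert the
 * stripe index into per-device addresses via chunk_sectors. */
static void trim_to_stripes(sector_t start, sector_t end,
                            sector_t chunk_sectors, int data_disks)
{
        sector_t stripe_sectors = chunk_sectors * data_disks;
        sector_t first = (start + stripe_sectors - 1) / stripe_sectors;
        sector_t last  = end / stripe_sectors;  /* one past the end */

        if (first >= last) {
                printf("no whole stripe covered, nothing to discard\n");
                return;
        }
        printf("discard stripes %llu..%llu (sectors %llu..%llu)\n",
               first, last - 1,
               first * stripe_sectors, last * stripe_sectors - 1);
}

int main(void)
{
        /* e.g. 64KiB chunks (128 sectors), 3 data disks:
         * stripe_sectors = 384, so [500, 2000) keeps stripes 2..4,
         * i.e. sectors 768..1919; the ragged edges are dropped. */
        trim_to_stripes(500, 2000, 128, 3);
        return 0;
}
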
4075 | static void make_request(struct mddev *mddev, struct bio * bi) | 4207 | static void make_request(struct mddev *mddev, struct bio * bi) |
4076 | { | 4208 | { |
4077 | struct r5conf *conf = mddev->private; | 4209 | struct r5conf *conf = mddev->private; |
@@ -4094,6 +4226,11 @@ static void make_request(struct mddev *mddev, struct bio * bi) | |||
4094 | chunk_aligned_read(mddev,bi)) | 4226 | chunk_aligned_read(mddev,bi)) |
4095 | return; | 4227 | return; |
4096 | 4228 | ||
4229 | if (unlikely(bi->bi_rw & REQ_DISCARD)) { | ||
4230 | make_discard_request(mddev, bi); | ||
4231 | return; | ||
4232 | } | ||
4233 | |||
4097 | logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); | 4234 | logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); |
4098 | last_sector = bi->bi_sector + (bi->bi_size>>9); | 4235 | last_sector = bi->bi_sector + (bi->bi_size>>9); |
4099 | bi->bi_next = NULL; | 4236 | bi->bi_next = NULL; |
@@ -4630,8 +4767,9 @@ static int handle_active_stripes(struct r5conf *conf) | |||
4630 | * During the scan, completed stripes are saved for us by the interrupt | 4767 | * During the scan, completed stripes are saved for us by the interrupt |
4631 | * handler, so that they will not have to wait for our next wakeup. | 4768 | * handler, so that they will not have to wait for our next wakeup. |
4632 | */ | 4769 | */ |
4633 | static void raid5d(struct mddev *mddev) | 4770 | static void raid5d(struct md_thread *thread) |
4634 | { | 4771 | { |
4772 | struct mddev *mddev = thread->mddev; | ||
4635 | struct r5conf *conf = mddev->private; | 4773 | struct r5conf *conf = mddev->private; |
4636 | int handled; | 4774 | int handled; |
4637 | struct blk_plug plug; | 4775 | struct blk_plug plug; |
@@ -5366,6 +5504,7 @@ static int run(struct mddev *mddev) | |||
5366 | 5504 | ||
5367 | if (mddev->queue) { | 5505 | if (mddev->queue) { |
5368 | int chunk_size; | 5506 | int chunk_size; |
5507 | bool discard_supported = true; | ||
5369 | /* read-ahead size must cover two whole stripes, which | 5508 | /* read-ahead size must cover two whole stripes, which |
5370 | * is 2 * (datadisks) * chunksize where 'n' is the | 5509 | * is 2 * (datadisks) * chunksize where 'n' is the |
5371 | * number of raid devices | 5510 | * number of raid devices |
@@ -5385,13 +5524,48 @@ static int run(struct mddev *mddev) | |||
5385 | blk_queue_io_min(mddev->queue, chunk_size); | 5524 | blk_queue_io_min(mddev->queue, chunk_size); |
5386 | blk_queue_io_opt(mddev->queue, chunk_size * | 5525 | blk_queue_io_opt(mddev->queue, chunk_size * |
5387 | (conf->raid_disks - conf->max_degraded)); | 5526 | (conf->raid_disks - conf->max_degraded)); |
5527 | /* | ||
5528 | * We can only discard a whole stripe. It doesn't make sense to | ||
5529 | * discard the data disks but still write the parity disk | ||
5530 | */ | ||
5531 | stripe = stripe * PAGE_SIZE; | ||
5532 | mddev->queue->limits.discard_alignment = stripe; | ||
5533 | mddev->queue->limits.discard_granularity = stripe; | ||
5534 | /* | ||
5535 | * the unaligned part of a discard request will be ignored, so we | ||
5536 | * cannot guarantee discard_zeroes_data | ||
5537 | */ | ||
5538 | mddev->queue->limits.discard_zeroes_data = 0; | ||
5388 | 5539 | ||
5389 | rdev_for_each(rdev, mddev) { | 5540 | rdev_for_each(rdev, mddev) { |
5390 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 5541 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
5391 | rdev->data_offset << 9); | 5542 | rdev->data_offset << 9); |
5392 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 5543 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
5393 | rdev->new_data_offset << 9); | 5544 | rdev->new_data_offset << 9); |
5545 | /* | ||
5546 | * discard_zeroes_data is required, otherwise data | ||
5547 | * could be lost. Consider a scenario: discard a stripe | ||
5548 | * (the stripe could be inconsistent if | ||
5549 | * discard_zeroes_data is 0); write one disk of the | ||
5550 | * stripe (the stripe could be inconsistent again | ||
5551 | * depending on which disks are used to calculate | ||
5552 | * parity); a disk then fails; the stripe data of that | ||
5553 | * disk is lost. | ||
5554 | */ | ||
5555 | if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || | ||
5556 | !bdev_get_queue(rdev->bdev)-> | ||
5557 | limits.discard_zeroes_data) | ||
5558 | discard_supported = false; | ||
5394 | } | 5559 | } |
5560 | |||
5561 | if (discard_supported && | ||
5562 | mddev->queue->limits.max_discard_sectors >= stripe && | ||
5563 | mddev->queue->limits.discard_granularity >= stripe) | ||
5564 | queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, | ||
5565 | mddev->queue); | ||
5566 | else | ||
5567 | queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, | ||
5568 | mddev->queue); | ||
5395 | } | 5569 | } |
5396 | 5570 | ||
5397 | return 0; | 5571 | return 0; |
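
raid5's policy is stricter than raid10's: every member must support discard and guarantee that discarded ranges read back as zeroes, and the stacked queue limits must still admit at least one full stripe, otherwise QUEUE_FLAG_DISCARD is cleared. A compilable sketch of that gate; the types, units and sample limits are illustrative.

#include <stdbool.h>
#include <stdio.h>

struct member_q {
        bool discard;
        bool zeroes_data;       /* discarded range reads back as zeros */
};

/* The stricter raid5 policy from the run() hunk above: every member
 * must discard *and* zero, and the stacked limits must still admit a
 * whole stripe, or the array must not advertise discard at all. */
static bool raid5_can_discard(const struct member_q *m, int n,
                              unsigned max_discard, unsigned granularity,
                              unsigned stripe)
{
        for (int i = 0; i < n; i++)
                if (!m[i].discard || !m[i].zeroes_data)
                        return false;
        return max_discard >= stripe && granularity >= stripe;
}

int main(void)
{
        struct member_q disks[] = {
                { true, true }, { true, true }, { true, false },
        };

        printf("QUEUE_FLAG_DISCARD: %s\n",
               raid5_can_discard(disks, 3, 8192, 4096, 4096)
               ? "set" : "clear");
        return 0;
}

The zeroes requirement is what the long comment above motivates: without it, a discarded stripe may hold garbage, and a later single-disk write followed by a disk failure would reconstruct that disk from inconsistent parity.
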
@@ -5702,7 +5876,8 @@ static int check_reshape(struct mddev *mddev) | |||
5702 | if (!check_stripe_cache(mddev)) | 5876 | if (!check_stripe_cache(mddev)) |
5703 | return -ENOSPC; | 5877 | return -ENOSPC; |
5704 | 5878 | ||
5705 | return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); | 5879 | return resize_stripes(conf, (conf->previous_raid_disks |
5880 | + mddev->delta_disks)); | ||
5706 | } | 5881 | } |
5707 | 5882 | ||
5708 | static int raid5_start_reshape(struct mddev *mddev) | 5883 | static int raid5_start_reshape(struct mddev *mddev) |
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index a9fc24901eda..18b2c4a8a1fd 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h | |||
@@ -298,6 +298,7 @@ enum r5dev_flags { | |||
298 | R5_WantReplace, /* We need to update the replacement, we have read | 298 | R5_WantReplace, /* We need to update the replacement, we have read |
299 | * data in, and now is a good time to write it out. | 299 | * data in, and now is a good time to write it out. |
300 | */ | 300 | */ |
301 | R5_Discard, /* Discard the stripe */ | ||
301 | }; | 302 | }; |
302 | 303 | ||
303 | /* | 304 | /* |