about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2012-10-13 16:22:01 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-10-13 16:22:01 -0400
commit 9db908806b85c1430150fbafe269a7b21b07d15d (patch)
tree 3911759c93e0be26b6771e1a92b75612b206ffa5 /drivers/md/raid5.c
parent 4d7127dace8cf4b05eb7c8c8531fc204fbb195f4 (diff)
parent 72f36d5972a166197036c1281963f6863c429bf2 (diff)
Merge tag 'md-3.7' of git://neil.brown.name/md
Pull md updates from NeilBrown:
 - "discard" support, some dm-raid improvements and other assorted bits
   and pieces.

* tag 'md-3.7' of git://neil.brown.name/md: (29 commits)
  md: refine reporting of resync/reshape delays.
  md/raid5: be careful not to resize_stripes too big.
  md: make sure manual changes to recovery checkpoint are saved.
  md/raid10: use correct limit variable
  md: writing to sync_action should clear the read-auto state.
  Subject: [PATCH] md:change resync_mismatches to atomic64_t to avoid races
  md/raid5: make sure to_read and to_write never go negative.
  md: When RAID5 is dirty, force reconstruct-write instead of read-modify-write.
  md/raid5: protect debug message against NULL derefernce.
  md/raid5: add some missing locking in handle_failed_stripe.
  MD: raid5 avoid unnecessary zero page for trim
  MD: raid5 trim support
  md/bitmap:Don't use IS_ERR to judge alloc_page().
  md/raid1: Don't release reference to device while handling read error.
  raid: replace list_for_each_continue_rcu with new interface
  add further __init annotations to crypto/xor.c
  DM RAID: Fix for "sync" directive ineffectiveness
  DM RAID: Fix comparison of index and quantity for "rebuild" parameter
  DM RAID: Add rebuild capability for RAID10
  DM RAID: Move 'rebuild' checking code to its own function
  ...
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- drivers/md/raid5.c 219
1 files changed, 197 insertions, 22 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0689173fd9f5..c5439dce0295 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -551,6 +551,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
551 rw = WRITE_FUA; 551 rw = WRITE_FUA;
552 else 552 else
553 rw = WRITE; 553 rw = WRITE;
554 if (test_bit(R5_Discard, &sh->dev[i].flags))
555 rw |= REQ_DISCARD;
554 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 556 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
555 rw = READ; 557 rw = READ;
556 else if (test_and_clear_bit(R5_WantReplace, 558 else if (test_and_clear_bit(R5_WantReplace,
@@ -1174,8 +1176,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1174 set_bit(R5_WantFUA, &dev->flags); 1176 set_bit(R5_WantFUA, &dev->flags);
1175 if (wbi->bi_rw & REQ_SYNC) 1177 if (wbi->bi_rw & REQ_SYNC)
1176 set_bit(R5_SyncIO, &dev->flags); 1178 set_bit(R5_SyncIO, &dev->flags);
1177 tx = async_copy_data(1, wbi, dev->page, 1179 if (wbi->bi_rw & REQ_DISCARD)
1178 dev->sector, tx); 1180 set_bit(R5_Discard, &dev->flags);
1181 else
1182 tx = async_copy_data(1, wbi, dev->page,
1183 dev->sector, tx);
1179 wbi = r5_next_bio(wbi, dev->sector); 1184 wbi = r5_next_bio(wbi, dev->sector);
1180 } 1185 }
1181 } 1186 }
@@ -1191,7 +1196,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
1191 int pd_idx = sh->pd_idx; 1196 int pd_idx = sh->pd_idx;
1192 int qd_idx = sh->qd_idx; 1197 int qd_idx = sh->qd_idx;
1193 int i; 1198 int i;
1194 bool fua = false, sync = false; 1199 bool fua = false, sync = false, discard = false;
1195 1200
1196 pr_debug("%s: stripe %llu\n", __func__, 1201 pr_debug("%s: stripe %llu\n", __func__,
1197 (unsigned long long)sh->sector); 1202 (unsigned long long)sh->sector);
@@ -1199,13 +1204,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
1199 for (i = disks; i--; ) { 1204 for (i = disks; i--; ) {
1200 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1205 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1201 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); 1206 sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
1207 discard |= test_bit(R5_Discard, &sh->dev[i].flags);
1202 } 1208 }
1203 1209
1204 for (i = disks; i--; ) { 1210 for (i = disks; i--; ) {
1205 struct r5dev *dev = &sh->dev[i]; 1211 struct r5dev *dev = &sh->dev[i];
1206 1212
1207 if (dev->written || i == pd_idx || i == qd_idx) { 1213 if (dev->written || i == pd_idx || i == qd_idx) {
1208 set_bit(R5_UPTODATE, &dev->flags); 1214 if (!discard)
1215 set_bit(R5_UPTODATE, &dev->flags);
1209 if (fua) 1216 if (fua)
1210 set_bit(R5_WantFUA, &dev->flags); 1217 set_bit(R5_WantFUA, &dev->flags);
1211 if (sync) 1218 if (sync)
@@ -1241,6 +1248,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1241 pr_debug("%s: stripe %llu\n", __func__, 1248 pr_debug("%s: stripe %llu\n", __func__,
1242 (unsigned long long)sh->sector); 1249 (unsigned long long)sh->sector);
1243 1250
1251 for (i = 0; i < sh->disks; i++) {
1252 if (pd_idx == i)
1253 continue;
1254 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1255 break;
1256 }
1257 if (i >= sh->disks) {
1258 atomic_inc(&sh->count);
1259 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1260 ops_complete_reconstruct(sh);
1261 return;
1262 }
1244 /* check if prexor is active which means only process blocks 1263 /* check if prexor is active which means only process blocks
1245 * that are part of a read-modify-write (written) 1264 * that are part of a read-modify-write (written)
1246 */ 1265 */
@@ -1285,10 +1304,24 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1285{ 1304{
1286 struct async_submit_ctl submit; 1305 struct async_submit_ctl submit;
1287 struct page **blocks = percpu->scribble; 1306 struct page **blocks = percpu->scribble;
1288 int count; 1307 int count, i;
1289 1308
1290 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1309 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1291 1310
1311 for (i = 0; i < sh->disks; i++) {
1312 if (sh->pd_idx == i || sh->qd_idx == i)
1313 continue;
1314 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1315 break;
1316 }
1317 if (i >= sh->disks) {
1318 atomic_inc(&sh->count);
1319 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1320 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1321 ops_complete_reconstruct(sh);
1322 return;
1323 }
1324
1292 count = set_syndrome_sources(blocks, sh); 1325 count = set_syndrome_sources(blocks, sh);
1293 1326
1294 atomic_inc(&sh->count); 1327 atomic_inc(&sh->count);
@@ -2408,11 +2441,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
2408 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2441 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2409 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2442 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2410 } 2443 }
2411 spin_unlock_irq(&sh->stripe_lock);
2412 2444
2413 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2445 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2414 (unsigned long long)(*bip)->bi_sector, 2446 (unsigned long long)(*bip)->bi_sector,
2415 (unsigned long long)sh->sector, dd_idx); 2447 (unsigned long long)sh->sector, dd_idx);
2448 spin_unlock_irq(&sh->stripe_lock);
2416 2449
2417 if (conf->mddev->bitmap && firstwrite) { 2450 if (conf->mddev->bitmap && firstwrite) {
2418 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2451 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
@@ -2479,10 +2512,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2479 bi = sh->dev[i].towrite; 2512 bi = sh->dev[i].towrite;
2480 sh->dev[i].towrite = NULL; 2513 sh->dev[i].towrite = NULL;
2481 spin_unlock_irq(&sh->stripe_lock); 2514 spin_unlock_irq(&sh->stripe_lock);
2482 if (bi) { 2515 if (bi)
2483 s->to_write--;
2484 bitmap_end = 1; 2516 bitmap_end = 1;
2485 }
2486 2517
2487 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2518 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2488 wake_up(&conf->wait_for_overlap); 2519 wake_up(&conf->wait_for_overlap);
@@ -2524,11 +2555,12 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
2524 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2555 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
2525 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2556 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2526 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2557 test_bit(R5_ReadError, &sh->dev[i].flags))) {
2558 spin_lock_irq(&sh->stripe_lock);
2527 bi = sh->dev[i].toread; 2559 bi = sh->dev[i].toread;
2528 sh->dev[i].toread = NULL; 2560 sh->dev[i].toread = NULL;
2561 spin_unlock_irq(&sh->stripe_lock);
2529 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2562 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2530 wake_up(&conf->wait_for_overlap); 2563 wake_up(&conf->wait_for_overlap);
2531 if (bi) s->to_read--;
2532 while (bi && bi->bi_sector < 2564 while (bi && bi->bi_sector <
2533 sh->dev[i].sector + STRIPE_SECTORS) { 2565 sh->dev[i].sector + STRIPE_SECTORS) {
2534 struct bio *nextbi = 2566 struct bio *nextbi =
@@ -2741,7 +2773,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
2741 if (sh->dev[i].written) { 2773 if (sh->dev[i].written) {
2742 dev = &sh->dev[i]; 2774 dev = &sh->dev[i];
2743 if (!test_bit(R5_LOCKED, &dev->flags) && 2775 if (!test_bit(R5_LOCKED, &dev->flags) &&
2744 test_bit(R5_UPTODATE, &dev->flags)) { 2776 (test_bit(R5_UPTODATE, &dev->flags) ||
2777 test_and_clear_bit(R5_Discard, &dev->flags))) {
2745 /* We can return any write requests */ 2778 /* We can return any write requests */
2746 struct bio *wbi, *wbi2; 2779 struct bio *wbi, *wbi2;
2747 pr_debug("Return write for disc %d\n", i); 2780 pr_debug("Return write for disc %d\n", i);
@@ -2775,12 +2808,25 @@ static void handle_stripe_dirtying(struct r5conf *conf,
2775 int disks) 2808 int disks)
2776{ 2809{
2777 int rmw = 0, rcw = 0, i; 2810 int rmw = 0, rcw = 0, i;
2778 if (conf->max_degraded == 2) { 2811 sector_t recovery_cp = conf->mddev->recovery_cp;
2779 /* RAID6 requires 'rcw' in current implementation 2812
2780 * Calculate the real rcw later - for now fake it 2813 /* RAID6 requires 'rcw' in current implementation.
2814 * Otherwise, check whether resync is now happening or should start.
2815 * If yes, then the array is dirty (after unclean shutdown or
2816 * initial creation), so parity in some stripes might be inconsistent.
2817 * In this case, we need to always do reconstruct-write, to ensure
2818 * that in case of drive failure or read-error correction, we
2819 * generate correct data from the parity.
2820 */
2821 if (conf->max_degraded == 2 ||
2822 (recovery_cp < MaxSector && sh->sector >= recovery_cp)) {
2823 /* Calculate the real rcw later - for now make it
2781 * look like rcw is cheaper 2824 * look like rcw is cheaper
2782 */ 2825 */
2783 rcw = 1; rmw = 2; 2826 rcw = 1; rmw = 2;
2827 pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
2828 conf->max_degraded, (unsigned long long)recovery_cp,
2829 (unsigned long long)sh->sector);
2784 } else for (i = disks; i--; ) { 2830 } else for (i = disks; i--; ) {
2785 /* would I have to read this buffer for read_modify_write */ 2831 /* would I have to read this buffer for read_modify_write */
2786 struct r5dev *dev = &sh->dev[i]; 2832 struct r5dev *dev = &sh->dev[i];
@@ -2932,7 +2978,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
2932 */ 2978 */
2933 set_bit(STRIPE_INSYNC, &sh->state); 2979 set_bit(STRIPE_INSYNC, &sh->state);
2934 else { 2980 else {
2935 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2981 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
2936 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2982 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2937 /* don't try to repair!! */ 2983 /* don't try to repair!! */
2938 set_bit(STRIPE_INSYNC, &sh->state); 2984 set_bit(STRIPE_INSYNC, &sh->state);
@@ -3084,7 +3130,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
3084 */ 3130 */
3085 } 3131 }
3086 } else { 3132 } else {
3087 conf->mddev->resync_mismatches += STRIPE_SECTORS; 3133 atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
3088 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 3134 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
3089 /* don't try to repair!! */ 3135 /* don't try to repair!! */
3090 set_bit(STRIPE_INSYNC, &sh->state); 3136 set_bit(STRIPE_INSYNC, &sh->state);
@@ -3459,10 +3505,12 @@ static void handle_stripe(struct stripe_head *sh)
3459 if (s.written && 3505 if (s.written &&
3460 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3506 (s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3461 && !test_bit(R5_LOCKED, &pdev->flags) 3507 && !test_bit(R5_LOCKED, &pdev->flags)
3462 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3508 && (test_bit(R5_UPTODATE, &pdev->flags) ||
3509 test_bit(R5_Discard, &pdev->flags))))) &&
3463 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3510 (s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3464 && !test_bit(R5_LOCKED, &qdev->flags) 3511 && !test_bit(R5_LOCKED, &qdev->flags)
3465 && test_bit(R5_UPTODATE, &qdev->flags))))) 3512 && (test_bit(R5_UPTODATE, &qdev->flags) ||
3513 test_bit(R5_Discard, &qdev->flags))))))
3466 handle_stripe_clean_event(conf, sh, disks, &s.return_bi); 3514 handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
3467 3515
3468 /* Now we might consider reading some blocks, either to check/generate 3516 /* Now we might consider reading some blocks, either to check/generate
@@ -3489,9 +3537,11 @@ static void handle_stripe(struct stripe_head *sh)
3489 /* All the 'written' buffers and the parity block are ready to 3537 /* All the 'written' buffers and the parity block are ready to
3490 * be written back to disk 3538 * be written back to disk
3491 */ 3539 */
3492 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3540 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) &&
3541 !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags));
3493 BUG_ON(sh->qd_idx >= 0 && 3542 BUG_ON(sh->qd_idx >= 0 &&
3494 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); 3543 !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) &&
3544 !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags));
3495 for (i = disks; i--; ) { 3545 for (i = disks; i--; ) {
3496 struct r5dev *dev = &sh->dev[i]; 3546 struct r5dev *dev = &sh->dev[i];
3497 if (test_bit(R5_LOCKED, &dev->flags) && 3547 if (test_bit(R5_LOCKED, &dev->flags) &&
@@ -4072,6 +4122,88 @@ static void release_stripe_plug(struct mddev *mddev,
4072 release_stripe(sh); 4122 release_stripe(sh);
4073} 4123}
4074 4124
4125static void make_discard_request(struct mddev *mddev, struct bio *bi)
4126{
4127 struct r5conf *conf = mddev->private;
4128 sector_t logical_sector, last_sector;
4129 struct stripe_head *sh;
4130 int remaining;
4131 int stripe_sectors;
4132
4133 if (mddev->reshape_position != MaxSector)
4134 /* Skip discard while reshape is happening */
4135 return;
4136
4137 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4138 last_sector = bi->bi_sector + (bi->bi_size>>9);
4139
4140 bi->bi_next = NULL;
4141 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
4142
4143 stripe_sectors = conf->chunk_sectors *
4144 (conf->raid_disks - conf->max_degraded);
4145 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
4146 stripe_sectors);
4147 sector_div(last_sector, stripe_sectors);
4148
4149 logical_sector *= conf->chunk_sectors;
4150 last_sector *= conf->chunk_sectors;
4151
4152 for (; logical_sector < last_sector;
4153 logical_sector += STRIPE_SECTORS) {
4154 DEFINE_WAIT(w);
4155 int d;
4156 again:
4157 sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
4158 prepare_to_wait(&conf->wait_for_overlap, &w,
4159 TASK_UNINTERRUPTIBLE);
4160 spin_lock_irq(&sh->stripe_lock);
4161 for (d = 0; d < conf->raid_disks; d++) {
4162 if (d == sh->pd_idx || d == sh->qd_idx)
4163 continue;
4164 if (sh->dev[d].towrite || sh->dev[d].toread) {
4165 set_bit(R5_Overlap, &sh->dev[d].flags);
4166 spin_unlock_irq(&sh->stripe_lock);
4167 release_stripe(sh);
4168 schedule();
4169 goto again;
4170 }
4171 }
4172 finish_wait(&conf->wait_for_overlap, &w);
4173 for (d = 0; d < conf->raid_disks; d++) {
4174 if (d == sh->pd_idx || d == sh->qd_idx)
4175 continue;
4176 sh->dev[d].towrite = bi;
4177 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
4178 raid5_inc_bi_active_stripes(bi);
4179 }
4180 spin_unlock_irq(&sh->stripe_lock);
4181 if (conf->mddev->bitmap) {
4182 for (d = 0;
4183 d < conf->raid_disks - conf->max_degraded;
4184 d++)
4185 bitmap_startwrite(mddev->bitmap,
4186 sh->sector,
4187 STRIPE_SECTORS,
4188 0);
4189 sh->bm_seq = conf->seq_flush + 1;
4190 set_bit(STRIPE_BIT_DELAY, &sh->state);
4191 }
4192
4193 set_bit(STRIPE_HANDLE, &sh->state);
4194 clear_bit(STRIPE_DELAYED, &sh->state);
4195 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4196 atomic_inc(&conf->preread_active_stripes);
4197 release_stripe_plug(mddev, sh);
4198 }
4199
4200 remaining = raid5_dec_bi_active_stripes(bi);
4201 if (remaining == 0) {
4202 md_write_end(mddev);
4203 bio_endio(bi, 0);
4204 }
4205}
4206
4075static void make_request(struct mddev *mddev, struct bio * bi) 4207static void make_request(struct mddev *mddev, struct bio * bi)
4076{ 4208{
4077 struct r5conf *conf = mddev->private; 4209 struct r5conf *conf = mddev->private;
@@ -4094,6 +4226,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4094 chunk_aligned_read(mddev,bi)) 4226 chunk_aligned_read(mddev,bi))
4095 return; 4227 return;
4096 4228
4229 if (unlikely(bi->bi_rw & REQ_DISCARD)) {
4230 make_discard_request(mddev, bi);
4231 return;
4232 }
4233
4097 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4234 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4098 last_sector = bi->bi_sector + (bi->bi_size>>9); 4235 last_sector = bi->bi_sector + (bi->bi_size>>9);
4099 bi->bi_next = NULL; 4236 bi->bi_next = NULL;
@@ -4630,8 +4767,9 @@ static int handle_active_stripes(struct r5conf *conf)
4630 * During the scan, completed stripes are saved for us by the interrupt 4767 * During the scan, completed stripes are saved for us by the interrupt
4631 * handler, so that they will not have to wait for our next wakeup. 4768 * handler, so that they will not have to wait for our next wakeup.
4632 */ 4769 */
4633static void raid5d(struct mddev *mddev) 4770static void raid5d(struct md_thread *thread)
4634{ 4771{
4772 struct mddev *mddev = thread->mddev;
4635 struct r5conf *conf = mddev->private; 4773 struct r5conf *conf = mddev->private;
4636 int handled; 4774 int handled;
4637 struct blk_plug plug; 4775 struct blk_plug plug;
@@ -5366,6 +5504,7 @@ static int run(struct mddev *mddev)
5366 5504
5367 if (mddev->queue) { 5505 if (mddev->queue) {
5368 int chunk_size; 5506 int chunk_size;
5507 bool discard_supported = true;
5369 /* read-ahead size must cover two whole stripes, which 5508 /* read-ahead size must cover two whole stripes, which
5370 * is 2 * (datadisks) * chunksize where 'n' is the 5509 * is 2 * (datadisks) * chunksize where 'n' is the
5371 * number of raid devices 5510 * number of raid devices
@@ -5385,13 +5524,48 @@ static int run(struct mddev *mddev)
5385 blk_queue_io_min(mddev->queue, chunk_size); 5524 blk_queue_io_min(mddev->queue, chunk_size);
5386 blk_queue_io_opt(mddev->queue, chunk_size * 5525 blk_queue_io_opt(mddev->queue, chunk_size *
5387 (conf->raid_disks - conf->max_degraded)); 5526 (conf->raid_disks - conf->max_degraded));
5527 /*
5528 * We can only discard a whole stripe. It doesn't make sense to
5529 * discard data disk but write parity disk
5530 */
5531 stripe = stripe * PAGE_SIZE;
5532 mddev->queue->limits.discard_alignment = stripe;
5533 mddev->queue->limits.discard_granularity = stripe;
5534 /*
5535 * unaligned part of discard request will be ignored, so can't
5536 * guarantee discard_zerors_data
5537 */
5538 mddev->queue->limits.discard_zeroes_data = 0;
5388 5539
5389 rdev_for_each(rdev, mddev) { 5540 rdev_for_each(rdev, mddev) {
5390 disk_stack_limits(mddev->gendisk, rdev->bdev, 5541 disk_stack_limits(mddev->gendisk, rdev->bdev,
5391 rdev->data_offset << 9); 5542 rdev->data_offset << 9);
5392 disk_stack_limits(mddev->gendisk, rdev->bdev, 5543 disk_stack_limits(mddev->gendisk, rdev->bdev,
5393 rdev->new_data_offset << 9); 5544 rdev->new_data_offset << 9);
5545 /*
5546 * discard_zeroes_data is required, otherwise data
5547 * could be lost. Consider a scenario: discard a stripe
5548 * (the stripe could be inconsistent if
5549 * discard_zeroes_data is 0); write one disk of the
5550 * stripe (the stripe could be inconsistent again
5551 * depending on which disks are used to calculate
5552 * parity); the disk is broken; The stripe data of this
5553 * disk is lost.
5554 */
5555 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
5556 !bdev_get_queue(rdev->bdev)->
5557 limits.discard_zeroes_data)
5558 discard_supported = false;
5394 } 5559 }
5560
5561 if (discard_supported &&
5562 mddev->queue->limits.max_discard_sectors >= stripe &&
5563 mddev->queue->limits.discard_granularity >= stripe)
5564 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
5565 mddev->queue);
5566 else
5567 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
5568 mddev->queue);
5395 } 5569 }
5396 5570
5397 return 0; 5571 return 0;
@@ -5702,7 +5876,8 @@ static int check_reshape(struct mddev *mddev)
5702 if (!check_stripe_cache(mddev)) 5876 if (!check_stripe_cache(mddev))
5703 return -ENOSPC; 5877 return -ENOSPC;
5704 5878
5705 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 5879 return resize_stripes(conf, (conf->previous_raid_disks
5880 + mddev->delta_disks));
5706} 5881}
5707 5882
5708static int raid5_start_reshape(struct mddev *mddev) 5883static int raid5_start_reshape(struct mddev *mddev)