about summary refs log tree commit diff stats
path: root/drivers/md/raid5.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- drivers/md/raid5.c 168
1 files changed, 165 insertions, 3 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 81c02d63440b..74dcf19cfe68 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
547 rw = WRITE_FUA; 547 rw = WRITE_FUA;
548 else 548 else
549 rw = WRITE; 549 rw = WRITE;
550 if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
551 rw |= REQ_DISCARD;
550 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 552 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
551 rw = READ; 553 rw = READ;
552 else if (test_and_clear_bit(R5_WantReplace, 554 else if (test_and_clear_bit(R5_WantReplace,
@@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1170 set_bit(R5_WantFUA, &dev->flags); 1172 set_bit(R5_WantFUA, &dev->flags);
1171 if (wbi->bi_rw & REQ_SYNC) 1173 if (wbi->bi_rw & REQ_SYNC)
1172 set_bit(R5_SyncIO, &dev->flags); 1174 set_bit(R5_SyncIO, &dev->flags);
1173 tx = async_copy_data(1, wbi, dev->page, 1175 if (wbi->bi_rw & REQ_DISCARD) {
1174 dev->sector, tx); 1176 memset(page_address(dev->page), 0,
1177 STRIPE_SECTORS << 9);
1178 set_bit(R5_Discard, &dev->flags);
1179 } else
1180 tx = async_copy_data(1, wbi, dev->page,
1181 dev->sector, tx);
1175 wbi = r5_next_bio(wbi, dev->sector); 1182 wbi = r5_next_bio(wbi, dev->sector);
1176 } 1183 }
1177 } 1184 }
@@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1237 pr_debug("%s: stripe %llu\n", __func__, 1244 pr_debug("%s: stripe %llu\n", __func__,
1238 (unsigned long long)sh->sector); 1245 (unsigned long long)sh->sector);
1239 1246
1247 for (i = 0; i < sh->disks; i++) {
1248 if (pd_idx == i)
1249 continue;
1250 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1251 break;
1252 }
1253 if (i >= sh->disks) {
1254 atomic_inc(&sh->count);
1255 memset(page_address(sh->dev[pd_idx].page), 0,
1256 STRIPE_SECTORS << 9);
1257 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1258 ops_complete_reconstruct(sh);
1259 return;
1260 }
1240 /* check if prexor is active which means only process blocks 1261 /* check if prexor is active which means only process blocks
1241 * that are part of a read-modify-write (written) 1262 * that are part of a read-modify-write (written)
1242 */ 1263 */
@@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1281{ 1302{
1282 struct async_submit_ctl submit; 1303 struct async_submit_ctl submit;
1283 struct page **blocks = percpu->scribble; 1304 struct page **blocks = percpu->scribble;
1284 int count; 1305 int count, i;
1285 1306
1286 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1307 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1287 1308
1309 for (i = 0; i < sh->disks; i++) {
1310 if (sh->pd_idx == i || sh->qd_idx == i)
1311 continue;
1312 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1313 break;
1314 }
1315 if (i >= sh->disks) {
1316 atomic_inc(&sh->count);
1317 memset(page_address(sh->dev[sh->pd_idx].page), 0,
1318 STRIPE_SECTORS << 9);
1319 memset(page_address(sh->dev[sh->qd_idx].page), 0,
1320 STRIPE_SECTORS << 9);
1321 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1322 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1323 ops_complete_reconstruct(sh);
1324 return;
1325 }
1326
1288 count = set_syndrome_sources(blocks, sh); 1327 count = set_syndrome_sources(blocks, sh);
1289 1328
1290 atomic_inc(&sh->count); 1329 atomic_inc(&sh->count);
@@ -4067,6 +4106,88 @@ static void release_stripe_plug(struct mddev *mddev,
4067 release_stripe(sh); 4106 release_stripe(sh);
4068} 4107}
4069 4108
/*
 * make_discard_request - queue a discard bio as whole-stripe overwrites.
 *
 * @mddev: array the bio was submitted to; mddev->private is the r5conf.
 * @bi:    the discard bio.
 *
 * The bio's sector range is shrunk inward to whole stripes (start rounded
 * up, end rounded down, in units of chunk_sectors * number of data disks).
 * For every STRIPE_SECTORS step in the resulting range the stripe head is
 * looked up, @bi is attached as the pending write on every non-parity
 * device, each such device is flagged R5_OVERWRITE, and the stripe is
 * handed to release_stripe_plug().  @bi is completed here only if its
 * active-stripe count drops to zero at the end of this function.
 */
4109static void make_discard_request(struct mddev *mddev, struct bio *bi)
4110{
4111 struct r5conf *conf = mddev->private;
4112 sector_t logical_sector, last_sector;
4113 struct stripe_head *sh;
4114 int remaining;
4115 int stripe_sectors;
4116
/*
 * NOTE(review): this early return leaves without calling bio_endio() or
 * md_write_end() on bi, and the visible caller in make_request() returns
 * right after this function.  Confirm that something else completes the
 * bio in the reshape case; otherwise the submitter may wait forever.
 */
4117 if (mddev->reshape_position != MaxSector)
4118 /* Skip discard while reshape is happening */
4119 return;
4120
/* Start is first aligned down to a STRIPE_SECTORS boundary; end is exclusive. */
4121 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4122 last_sector = bi->bi_sector + (bi->bi_size>>9);
4123
4124 bi->bi_next = NULL;
4125 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
4126
/*
 * Convert the range to whole-stripe indices: the start is rounded up and
 * the end is divided in place by sector_div() (presumably truncating --
 * confirm), so partially covered stripes at either edge are dropped.
 */
4127 stripe_sectors = conf->chunk_sectors *
4128 (conf->raid_disks - conf->max_degraded);
4129 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
4130 stripe_sectors);
4131 sector_div(last_sector, stripe_sectors);
4132
/*
 * Scale the stripe indices by chunk_sectors; presumably this yields the
 * sector value that get_active_stripe() expects -- TODO confirm.
 */
4133 logical_sector *= conf->chunk_sectors;
4134 last_sector *= conf->chunk_sectors;
4135
4136 for (; logical_sector < last_sector;
4137 logical_sector += STRIPE_SECTORS) {
4138 DEFINE_WAIT(w);
4139 int d;
4140 again:
4141 sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
/* Register on the overlap wait queue before inspecting the stripe. */
4142 prepare_to_wait(&conf->wait_for_overlap, &w,
4143 TASK_UNINTERRUPTIBLE);
4144 spin_lock_irq(&sh->stripe_lock);
/*
 * If any non-parity device already has a pending read or write, flag it
 * R5_Overlap, drop the lock and the stripe, sleep, then retry this stripe
 * from the lookup.
 */
4145 for (d = 0; d < conf->raid_disks; d++) {
4146 if (d == sh->pd_idx || d == sh->qd_idx)
4147 continue;
4148 if (sh->dev[d].towrite || sh->dev[d].toread) {
4149 set_bit(R5_Overlap, &sh->dev[d].flags);
4150 spin_unlock_irq(&sh->stripe_lock);
4151 release_stripe(sh);
4152 schedule();
4153 goto again;
4154 }
4155 }
4156 finish_wait(&conf->wait_for_overlap, &w);
/*
 * No conflicts: attach bi as the pending write on every non-parity device
 * and take one active-stripe reference on bi per device.
 */
4157 for (d = 0; d < conf->raid_disks; d++) {
4158 if (d == sh->pd_idx || d == sh->qd_idx)
4159 continue;
4160 sh->dev[d].towrite = bi;
4161 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
4162 raid5_inc_bi_active_stripes(bi);
4163 }
4164 spin_unlock_irq(&sh->stripe_lock);
/*
 * With a bitmap present, bitmap_startwrite() is called once per data disk
 * (raid_disks - max_degraded times) with identical arguments, then the
 * stripe is marked STRIPE_BIT_DELAY with bm_seq set to seq_flush + 1.
 */
4165 if (conf->mddev->bitmap) {
4166 for (d = 0;
4167 d < conf->raid_disks - conf->max_degraded;
4168 d++)
4169 bitmap_startwrite(mddev->bitmap,
4170 sh->sector,
4171 STRIPE_SECTORS,
4172 0);
4173 sh->bm_seq = conf->seq_flush + 1;
4174 set_bit(STRIPE_BIT_DELAY, &sh->state);
4175 }
4176
/*
 * Mark the stripe for handling, clear STRIPE_DELAYED, and count it in
 * preread_active_stripes only if STRIPE_PREREAD_ACTIVE was not already set.
 */
4177 set_bit(STRIPE_HANDLE, &sh->state);
4178 clear_bit(STRIPE_DELAYED, &sh->state);
4179 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4180 atomic_inc(&conf->preread_active_stripes);
4181 release_stripe_plug(mddev, sh);
4182 }
4183
/*
 * Drop the initial count set via bi_phys_segments = 1 above; if no stripe
 * still holds a reference, end the write and complete the bio here.
 */
4184 remaining = raid5_dec_bi_active_stripes(bi);
4185 if (remaining == 0) {
4186 md_write_end(mddev);
4187 bio_endio(bi, 0);
4188 }
4189}
4190
4070static void make_request(struct mddev *mddev, struct bio * bi) 4191static void make_request(struct mddev *mddev, struct bio * bi)
4071{ 4192{
4072 struct r5conf *conf = mddev->private; 4193 struct r5conf *conf = mddev->private;
@@ -4089,6 +4210,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4089 chunk_aligned_read(mddev,bi)) 4210 chunk_aligned_read(mddev,bi))
4090 return; 4211 return;
4091 4212
4213 if (unlikely(bi->bi_rw & REQ_DISCARD)) {
4214 make_discard_request(mddev, bi);
4215 return;
4216 }
4217
4092 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4218 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4093 last_sector = bi->bi_sector + (bi->bi_size>>9); 4219 last_sector = bi->bi_sector + (bi->bi_size>>9);
4094 bi->bi_next = NULL; 4220 bi->bi_next = NULL;
@@ -5362,6 +5488,7 @@ static int run(struct mddev *mddev)
5362 5488
5363 if (mddev->queue) { 5489 if (mddev->queue) {
5364 int chunk_size; 5490 int chunk_size;
5491 bool discard_supported = true;
5365 /* read-ahead size must cover two whole stripes, which 5492 /* read-ahead size must cover two whole stripes, which
5366 * is 2 * (datadisks) * chunksize where 'n' is the 5493 * is 2 * (datadisks) * chunksize where 'n' is the
5367 * number of raid devices 5494 * number of raid devices
@@ -5381,13 +5508,48 @@ static int run(struct mddev *mddev)
5381 blk_queue_io_min(mddev->queue, chunk_size); 5508 blk_queue_io_min(mddev->queue, chunk_size);
5382 blk_queue_io_opt(mddev->queue, chunk_size * 5509 blk_queue_io_opt(mddev->queue, chunk_size *
5383 (conf->raid_disks - conf->max_degraded)); 5510 (conf->raid_disks - conf->max_degraded));
5511 /*
5512 * We can only discard a whole stripe. It doesn't make sense to
5513 * discard data disk but write parity disk
5514 */
5515 stripe = stripe * PAGE_SIZE;
5516 mddev->queue->limits.discard_alignment = stripe;
5517 mddev->queue->limits.discard_granularity = stripe;
5518 /*
5519 * unaligned part of discard request will be ignored, so can't
5520 * guarantee discard_zerors_data
5521 */
5522 mddev->queue->limits.discard_zeroes_data = 0;
5384 5523
5385 rdev_for_each(rdev, mddev) { 5524 rdev_for_each(rdev, mddev) {
5386 disk_stack_limits(mddev->gendisk, rdev->bdev, 5525 disk_stack_limits(mddev->gendisk, rdev->bdev,
5387 rdev->data_offset << 9); 5526 rdev->data_offset << 9);
5388 disk_stack_limits(mddev->gendisk, rdev->bdev, 5527 disk_stack_limits(mddev->gendisk, rdev->bdev,
5389 rdev->new_data_offset << 9); 5528 rdev->new_data_offset << 9);
5529 /*
5530 * discard_zeroes_data is required, otherwise data
5531 * could be lost. Consider a scenario: discard a stripe
5532 * (the stripe could be inconsistent if
5533 * discard_zeroes_data is 0); write one disk of the
5534 * stripe (the stripe could be inconsistent again
5535 * depending on which disks are used to calculate
5536 * parity); the disk is broken; The stripe data of this
5537 * disk is lost.
5538 */
5539 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
5540 !bdev_get_queue(rdev->bdev)->
5541 limits.discard_zeroes_data)
5542 discard_supported = false;
5390 } 5543 }
5544
5545 if (discard_supported &&
5546 mddev->queue->limits.max_discard_sectors >= stripe &&
5547 mddev->queue->limits.discard_granularity >= stripe)
5548 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
5549 mddev->queue);
5550 else
5551 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
5552 mddev->queue);
5391 } 5553 }
5392 5554
5393 return 0; 5555 return 0;