aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorShaohua Li <shli@kernel.org>2012-10-10 22:49:05 -0400
committerNeilBrown <neilb@suse.de>2012-10-10 22:49:05 -0400
commit620125f2bf8ff0c4969b79653b54d7bcc9d40637 (patch)
tree373257b7e9a236e66bc3ad99cd1d158e7430014e /drivers/md
parent582e2e056a5c3410174c23f5134e6b00e0db9101 (diff)
MD: raid5 trim support
Discard for raid4/5/6 has limitations. If the discard request is small, we discard on only one disk, but we still need to calculate parity and write the parity disk. To calculate parity correctly, zero_after_discard must be guaranteed. Even if it is, we would discard on one disk but write to the other disks, which makes the parity disks wear out fast. This doesn't make sense. So an efficient discard for raid4/5/6 should discard all data disks and parity disks, which requires the write pattern to be (A, A+chunk_size, A+chunk_size*2...). If A's size is smaller than chunk_size, such a pattern is almost impossible in practice. So in this patch, I only handle the case where A's size equals chunk_size. That is, the discard request should be aligned to the stripe size and its size should be a multiple of the stripe size. Since we can only handle requests with a specific alignment and size (or the part of a request that fits whole stripes), we can't guarantee zero_after_discard even if zero_after_discard is true in the low-level drives. The block layer doesn't send down correctly aligned requests even when the correct discard alignment is set, so I must filter them out. For raid4/5/6 parity calculation, if the data is 0, the parity is 0. So if zero_after_discard is true for all disks, data is consistent after discard. Otherwise, data might be lost. Let's consider a scenario: discard a stripe, then write data to one disk and write the parity disk. The stripe could still be inconsistent at that point, depending on whether data from the other data disks or from the parity disks is used to calculate the new parity. If a disk then breaks, we can't restore it. So in this patch, we only enable discard support if all disks have zero_after_discard. If discard fails on one disk, we face an inconsistency issue similar to the one above. The patch makes discard follow the same path as a normal write request. If discard fails, a resync will be scheduled to make the data consistent. Having extra writes isn't good, but data consistency is important.
If a subsequent read/write request hits the raid5 cache of a discarded stripe, the discarded dev page should be zero-filled so that the data is consistent. This patch always zeroes the dev page for a discarded stripe. This isn't optimal because a discard request doesn't need such a payload. The next patch will avoid it. Signed-off-by: Shaohua Li <shli@fusionio.com> Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/raid5.c168
-rw-r--r--drivers/md/raid5.h1
2 files changed, 166 insertions, 3 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 81c02d63440b..74dcf19cfe68 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
547 rw = WRITE_FUA; 547 rw = WRITE_FUA;
548 else 548 else
549 rw = WRITE; 549 rw = WRITE;
550 if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
551 rw |= REQ_DISCARD;
550 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 552 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
551 rw = READ; 553 rw = READ;
552 else if (test_and_clear_bit(R5_WantReplace, 554 else if (test_and_clear_bit(R5_WantReplace,
@@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1170 set_bit(R5_WantFUA, &dev->flags); 1172 set_bit(R5_WantFUA, &dev->flags);
1171 if (wbi->bi_rw & REQ_SYNC) 1173 if (wbi->bi_rw & REQ_SYNC)
1172 set_bit(R5_SyncIO, &dev->flags); 1174 set_bit(R5_SyncIO, &dev->flags);
1173 tx = async_copy_data(1, wbi, dev->page, 1175 if (wbi->bi_rw & REQ_DISCARD) {
1174 dev->sector, tx); 1176 memset(page_address(dev->page), 0,
1177 STRIPE_SECTORS << 9);
1178 set_bit(R5_Discard, &dev->flags);
1179 } else
1180 tx = async_copy_data(1, wbi, dev->page,
1181 dev->sector, tx);
1175 wbi = r5_next_bio(wbi, dev->sector); 1182 wbi = r5_next_bio(wbi, dev->sector);
1176 } 1183 }
1177 } 1184 }
@@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1237 pr_debug("%s: stripe %llu\n", __func__, 1244 pr_debug("%s: stripe %llu\n", __func__,
1238 (unsigned long long)sh->sector); 1245 (unsigned long long)sh->sector);
1239 1246
1247 for (i = 0; i < sh->disks; i++) {
1248 if (pd_idx == i)
1249 continue;
1250 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1251 break;
1252 }
1253 if (i >= sh->disks) {
1254 atomic_inc(&sh->count);
1255 memset(page_address(sh->dev[pd_idx].page), 0,
1256 STRIPE_SECTORS << 9);
1257 set_bit(R5_Discard, &sh->dev[pd_idx].flags);
1258 ops_complete_reconstruct(sh);
1259 return;
1260 }
1240 /* check if prexor is active which means only process blocks 1261 /* check if prexor is active which means only process blocks
1241 * that are part of a read-modify-write (written) 1262 * that are part of a read-modify-write (written)
1242 */ 1263 */
@@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1281{ 1302{
1282 struct async_submit_ctl submit; 1303 struct async_submit_ctl submit;
1283 struct page **blocks = percpu->scribble; 1304 struct page **blocks = percpu->scribble;
1284 int count; 1305 int count, i;
1285 1306
1286 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1307 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1287 1308
1309 for (i = 0; i < sh->disks; i++) {
1310 if (sh->pd_idx == i || sh->qd_idx == i)
1311 continue;
1312 if (!test_bit(R5_Discard, &sh->dev[i].flags))
1313 break;
1314 }
1315 if (i >= sh->disks) {
1316 atomic_inc(&sh->count);
1317 memset(page_address(sh->dev[sh->pd_idx].page), 0,
1318 STRIPE_SECTORS << 9);
1319 memset(page_address(sh->dev[sh->qd_idx].page), 0,
1320 STRIPE_SECTORS << 9);
1321 set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
1322 set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
1323 ops_complete_reconstruct(sh);
1324 return;
1325 }
1326
1288 count = set_syndrome_sources(blocks, sh); 1327 count = set_syndrome_sources(blocks, sh);
1289 1328
1290 atomic_inc(&sh->count); 1329 atomic_inc(&sh->count);
@@ -4067,6 +4106,88 @@ static void release_stripe_plug(struct mddev *mddev,
4067 release_stripe(sh); 4106 release_stripe(sh);
4068} 4107}
4069 4108
/*
 * make_discard_request - queue a discard bio onto the stripe cache.
 * @mddev: array the request is addressed to; its ->private is the r5conf.
 * @bi:    the discard bio.
 *
 * Only the part of the request that covers whole stripes (chunk_sectors
 * times the number of data disks) is acted on: the start is rounded up and
 * the end is rounded down to a stripe boundary, and anything outside that
 * range is ignored.  For each stripe_head in the range, @bi is attached as
 * the pending write of every data device and the stripe is handed off for
 * handling.  @bi is completed here only if no stripe still holds a
 * reference to it when the loop finishes.
 *
 * NOTE(review): the reshape early-return below leaves without calling
 * bio_endio() or md_write_end() on @bi; nothing visible in this function
 * completes the bio on that path -- confirm the caller copes with this.
 */
4109static void make_discard_request(struct mddev *mddev, struct bio *bi)
4110{
4111 struct r5conf *conf = mddev->private;
4112 sector_t logical_sector, last_sector;
4113 struct stripe_head *sh;
4114 int remaining;
4115 int stripe_sectors;
4116
4117 if (mddev->reshape_position != MaxSector)
4118 /* Skip discard while reshape is happening */
4119 return;
4120
/*
 * Round the start down to a STRIPE_SECTORS boundary; the end is the first
 * sector past the request.
 * NOTE(review): because the start is rounded *down* here before being
 * rounded *up* to a full stripe below, a start that is not
 * STRIPE_SECTORS-aligned but falls in the first STRIPE_SECTORS of a stripe
 * would appear to include that whole stripe -- confirm callers only send
 * suitably aligned discards.
 */
4121 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4122 last_sector = bi->bi_sector + (bi->bi_size>>9);
4123
4124 bi->bi_next = NULL;
/* Start the count at 1; the matching decrement is at the end of this function. */
4125 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
4126
/* Sectors of data in one full stripe: chunk size times number of data disks. */
4127 stripe_sectors = conf->chunk_sectors *
4128 (conf->raid_disks - conf->max_degraded);
/* Convert to stripe numbers: start rounded up, end rounded down (sector_div divides in place). */
4129 logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
4130 stripe_sectors);
4131 sector_div(last_sector, stripe_sectors);
4132
/*
 * Scale the stripe numbers by chunk_sectors.
 * Presumably this yields the per-device sector that get_active_stripe()
 * expects -- TODO confirm against get_active_stripe().
 */
4133 logical_sector *= conf->chunk_sectors;
4134 last_sector *= conf->chunk_sectors;
4135
/* Walk the range one stripe_head (STRIPE_SECTORS) at a time. */
4136 for (; logical_sector < last_sector;
4137 logical_sector += STRIPE_SECTORS) {
4138 DEFINE_WAIT(w);
4139 int d;
4140 again:
4141 sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
/* Register on wait_for_overlap before inspecting the stripe, so the schedule() below has a waiter queued. */
4142 prepare_to_wait(&conf->wait_for_overlap, &w,
4143 TASK_UNINTERRUPTIBLE);
4144 spin_lock_irq(&sh->stripe_lock);
/*
 * If any data device (parity devices are skipped) already has a pending
 * read or write, flag the overlap, drop the lock and the stripe, sleep,
 * and retry this stripe from the top.
 */
4145 for (d = 0; d < conf->raid_disks; d++) {
4146 if (d == sh->pd_idx || d == sh->qd_idx)
4147 continue;
4148 if (sh->dev[d].towrite || sh->dev[d].toread) {
4149 set_bit(R5_Overlap, &sh->dev[d].flags);
4150 spin_unlock_irq(&sh->stripe_lock);
4151 release_stripe(sh);
4152 schedule();
4153 goto again;
4154 }
4155 }
4156 finish_wait(&conf->wait_for_overlap, &w);
/*
 * No conflicts: attach the discard bio as the pending write of every data
 * device, mark each as a full overwrite, and count one reference on the
 * bio per device.
 */
4157 for (d = 0; d < conf->raid_disks; d++) {
4158 if (d == sh->pd_idx || d == sh->qd_idx)
4159 continue;
4160 sh->dev[d].towrite = bi;
4161 set_bit(R5_OVERWRITE, &sh->dev[d].flags);
4162 raid5_inc_bi_active_stripes(bi);
4163 }
4164 spin_unlock_irq(&sh->stripe_lock);
/*
 * With a bitmap present, call bitmap_startwrite() once per data disk
 * (raid_disks - max_degraded iterations), each time for the same
 * sh->sector, then record the flush sequence and set STRIPE_BIT_DELAY.
 */
4165 if (conf->mddev->bitmap) {
4166 for (d = 0;
4167 d < conf->raid_disks - conf->max_degraded;
4168 d++)
4169 bitmap_startwrite(mddev->bitmap,
4170 sh->sector,
4171 STRIPE_SECTORS,
4172 0);
4173 sh->bm_seq = conf->seq_flush + 1;
4174 set_bit(STRIPE_BIT_DELAY, &sh->state);
4175 }
4176
/*
 * Mark the stripe as needing handling and not delayed; bump
 * preread_active_stripes only if STRIPE_PREREAD_ACTIVE was not already set.
 */
4177 set_bit(STRIPE_HANDLE, &sh->state);
4178 clear_bit(STRIPE_DELAYED, &sh->state);
4179 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
4180 atomic_inc(&conf->preread_active_stripes);
4181 release_stripe_plug(mddev, sh);
4182 }
4183
/*
 * Drop the initial count taken above.  If nothing else holds the bio (for
 * example, no full stripe fell inside the request), finish it here.
 */
4184 remaining = raid5_dec_bi_active_stripes(bi);
4185 if (remaining == 0) {
4186 md_write_end(mddev);
4187 bio_endio(bi, 0);
4188 }
4189}
4190
4070static void make_request(struct mddev *mddev, struct bio * bi) 4191static void make_request(struct mddev *mddev, struct bio * bi)
4071{ 4192{
4072 struct r5conf *conf = mddev->private; 4193 struct r5conf *conf = mddev->private;
@@ -4089,6 +4210,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
4089 chunk_aligned_read(mddev,bi)) 4210 chunk_aligned_read(mddev,bi))
4090 return; 4211 return;
4091 4212
4213 if (unlikely(bi->bi_rw & REQ_DISCARD)) {
4214 make_discard_request(mddev, bi);
4215 return;
4216 }
4217
4092 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4218 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4093 last_sector = bi->bi_sector + (bi->bi_size>>9); 4219 last_sector = bi->bi_sector + (bi->bi_size>>9);
4094 bi->bi_next = NULL; 4220 bi->bi_next = NULL;
@@ -5362,6 +5488,7 @@ static int run(struct mddev *mddev)
5362 5488
5363 if (mddev->queue) { 5489 if (mddev->queue) {
5364 int chunk_size; 5490 int chunk_size;
5491 bool discard_supported = true;
5365 /* read-ahead size must cover two whole stripes, which 5492 /* read-ahead size must cover two whole stripes, which
5366 * is 2 * (datadisks) * chunksize where 'n' is the 5493 * is 2 * (datadisks) * chunksize where 'n' is the
5367 * number of raid devices 5494 * number of raid devices
@@ -5381,13 +5508,48 @@ static int run(struct mddev *mddev)
5381 blk_queue_io_min(mddev->queue, chunk_size); 5508 blk_queue_io_min(mddev->queue, chunk_size);
5382 blk_queue_io_opt(mddev->queue, chunk_size * 5509 blk_queue_io_opt(mddev->queue, chunk_size *
5383 (conf->raid_disks - conf->max_degraded)); 5510 (conf->raid_disks - conf->max_degraded));
5511 /*
5512 * We can only discard a whole stripe. It doesn't make sense to
5513 * discard data disk but write parity disk
5514 */
5515 stripe = stripe * PAGE_SIZE;
5516 mddev->queue->limits.discard_alignment = stripe;
5517 mddev->queue->limits.discard_granularity = stripe;
5518 /*
5519 * unaligned part of discard request will be ignored, so can't
5520 * guarantee discard_zeroes_data
5521 */
5522 mddev->queue->limits.discard_zeroes_data = 0;
5384 5523
5385 rdev_for_each(rdev, mddev) { 5524 rdev_for_each(rdev, mddev) {
5386 disk_stack_limits(mddev->gendisk, rdev->bdev, 5525 disk_stack_limits(mddev->gendisk, rdev->bdev,
5387 rdev->data_offset << 9); 5526 rdev->data_offset << 9);
5388 disk_stack_limits(mddev->gendisk, rdev->bdev, 5527 disk_stack_limits(mddev->gendisk, rdev->bdev,
5389 rdev->new_data_offset << 9); 5528 rdev->new_data_offset << 9);
5529 /*
5530 * discard_zeroes_data is required, otherwise data
5531 * could be lost. Consider a scenario: discard a stripe
5532 * (the stripe could be inconsistent if
5533 * discard_zeroes_data is 0); write one disk of the
5534 * stripe (the stripe could be inconsistent again
5535 * depending on which disks are used to calculate
5536 * parity); the disk is broken; The stripe data of this
5537 * disk is lost.
5538 */
5539 if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
5540 !bdev_get_queue(rdev->bdev)->
5541 limits.discard_zeroes_data)
5542 discard_supported = false;
5390 } 5543 }
5544
5545 if (discard_supported &&
5546 mddev->queue->limits.max_discard_sectors >= stripe &&
5547 mddev->queue->limits.discard_granularity >= stripe)
5548 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
5549 mddev->queue);
5550 else
5551 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
5552 mddev->queue);
5391 } 5553 }
5392 5554
5393 return 0; 5555 return 0;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a9fc24901eda..18b2c4a8a1fd 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -298,6 +298,7 @@ enum r5dev_flags {
298 R5_WantReplace, /* We need to update the replacement, we have read 298 R5_WantReplace, /* We need to update the replacement, we have read
299 * data in, and now is a good time to write it out. 299 * data in, and now is a good time to write it out.
300 */ 300 */
301 R5_Discard, /* Discard the stripe */
301}; 302};
302 303
303/* 304/*