 drivers/md/raid5.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++---
 drivers/md/raid5.h |   1 +
 2 files changed, 166 insertions(+), 3 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 81c02d63440b..74dcf19cfe68 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                                 rw = WRITE_FUA;
                         else
                                 rw = WRITE;
+                        if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
+                                rw |= REQ_DISCARD;
                 } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
                         rw = READ;
                 else if (test_and_clear_bit(R5_WantReplace,
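
This first raid5.c hunk is in ops_run_io(): when a stripe device carries the new R5_Discard flag, REQ_DISCARD is ORed into the write command, so the bio sent to the member disk goes out as a discard instead of a data write. A standalone sketch of that flag composition, using toy flag values rather than the kernel's real REQ_* encodings:

#include <stdio.h>

/* Toy values; the real encodings live in linux/blk_types.h. */
#define TOY_WRITE       (1u << 0)
#define TOY_DISCARD     (1u << 2)

int main(void)
{
        unsigned int rw = TOY_WRITE;    /* the R5_Wantwrite path chose WRITE */
        int r5_discard = 1;             /* stands in for the R5_Discard bit */

        if (r5_discard)
                rw |= TOY_DISCARD;      /* same OR as in the hunk above */

        printf("bio rw flags: %#x (discard %s)\n",
               rw, (rw & TOY_DISCARD) ? "set" : "clear");
        return 0;
}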
@@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
                                         set_bit(R5_WantFUA, &dev->flags);
                                 if (wbi->bi_rw & REQ_SYNC)
                                         set_bit(R5_SyncIO, &dev->flags);
-                                tx = async_copy_data(1, wbi, dev->page,
-                                        dev->sector, tx);
+                                if (wbi->bi_rw & REQ_DISCARD) {
+                                        memset(page_address(dev->page), 0,
+                                                STRIPE_SECTORS << 9);
+                                        set_bit(R5_Discard, &dev->flags);
+                                } else
+                                        tx = async_copy_data(1, wbi, dev->page,
+                                                dev->sector, tx);
                                 wbi = r5_next_bio(wbi, dev->sector);
                         }
                 }
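
In ops_run_biodrain(), a discard bio carries no payload, so instead of DMA-copying bio data into the stripe cache, the cache page is zero-filled and the device is marked R5_Discard. The byte count STRIPE_SECTORS << 9 converts sectors to bytes; a quick check, assuming the usual one-page-per-device stripe cache on 4 KiB pages:

#include <stdio.h>

int main(void)
{
        const int STRIPE_SECTORS = 8;   /* PAGE_SIZE >> 9 on 4 KiB pages */

        printf("%d bytes cleared per device\n", STRIPE_SECTORS << 9); /* 4096 */
        return 0;
}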
@@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
         pr_debug("%s: stripe %llu\n", __func__,
                 (unsigned long long)sh->sector);
 
+        for (i = 0; i < sh->disks; i++) {
+                if (pd_idx == i)
+                        continue;
+                if (!test_bit(R5_Discard, &sh->dev[i].flags))
+                        break;
+        }
+        if (i >= sh->disks) {
+                atomic_inc(&sh->count);
+                memset(page_address(sh->dev[pd_idx].page), 0,
+                        STRIPE_SECTORS << 9);
+                set_bit(R5_Discard, &sh->dev[pd_idx].flags);
+                ops_complete_reconstruct(sh);
+                return;
+        }
         /* check if prexor is active which means only process blocks
          * that are part of a read-modify-write (written)
          */
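
ops_run_reconstruct5() gains a fast path: if every data block in the stripe is being discarded, the parity of all-zero data is itself zero, so the code skips the async XOR engine, memsets the parity page, and completes the reconstruct immediately. A standalone demonstration of the identity this relies on:

/* RAID5 parity is P = D0 ^ D1 ^ ... ^ Dn-1; all-zero data gives P = 0. */
#include <stdio.h>
#include <string.h>

#define NDATA   3
#define BLK     16

int main(void)
{
        unsigned char d[NDATA][BLK], p[BLK];
        int i, j;

        memset(d, 0, sizeof(d));        /* discarded blocks are all zero */
        memset(p, 0, sizeof(p));

        for (i = 0; i < NDATA; i++)
                for (j = 0; j < BLK; j++)
                        p[j] ^= d[i][j];

        for (j = 0; j < BLK; j++)
                if (p[j])
                        return 1;       /* never happens */
        printf("P == 0 for all-zero data, as the fast path assumes\n");
        return 0;
}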
@@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 {
         struct async_submit_ctl submit;
         struct page **blocks = percpu->scribble;
-        int count;
+        int count, i;
 
         pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
+        for (i = 0; i < sh->disks; i++) {
+                if (sh->pd_idx == i || sh->qd_idx == i)
+                        continue;
+                if (!test_bit(R5_Discard, &sh->dev[i].flags))
+                        break;
+        }
+        if (i >= sh->disks) {
+                atomic_inc(&sh->count);
+                memset(page_address(sh->dev[sh->pd_idx].page), 0,
+                        STRIPE_SECTORS << 9);
+                memset(page_address(sh->dev[sh->qd_idx].page), 0,
+                        STRIPE_SECTORS << 9);
+                set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
+                set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
+                ops_complete_reconstruct(sh);
+                return;
+        }
+
         count = set_syndrome_sources(blocks, sh);
 
         atomic_inc(&sh->count);
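
ops_run_reconstruct6() does the same for RAID6, zeroing both P and Q. P is plain XOR, and the Q syndrome is a weighted sum over GF(2^8), so it also collapses to zero when every data byte is zero. A minimal sketch using the 0x11d polynomial that Linux's RAID6 math is built on:

#include <stdio.h>

/* Multiply in GF(2^8) modulo x^8 + x^4 + x^3 + x^2 + 1 (0x11d). */
static unsigned char gfmul(unsigned char a, unsigned char b)
{
        unsigned char p = 0;

        while (b) {
                if (b & 1)
                        p ^= a;
                b >>= 1;
                a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
        }
        return p;
}

int main(void)
{
        unsigned char q = 0, coef = 1;
        int i;

        for (i = 0; i < 4; i++) {       /* four all-zero data blocks */
                q ^= gfmul(coef, 0);    /* D_i == 0 after the discard */
                coef = gfmul(coef, 2);  /* next power of the generator */
        }
        printf("Q = %u for all-zero data, so memset is enough\n", q);
        return 0;
}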
@@ -4067,6 +4106,88 @@ static void release_stripe_plug(struct mddev *mddev,
                 release_stripe(sh);
 }
 
+static void make_discard_request(struct mddev *mddev, struct bio *bi)
+{
+        struct r5conf *conf = mddev->private;
+        sector_t logical_sector, last_sector;
+        struct stripe_head *sh;
+        int remaining;
+        int stripe_sectors;
+
+        if (mddev->reshape_position != MaxSector)
+                /* Skip discard while reshape is happening */
+                return;
+
+        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+        last_sector = bi->bi_sector + (bi->bi_size>>9);
+
+        bi->bi_next = NULL;
+        bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+
+        stripe_sectors = conf->chunk_sectors *
+                (conf->raid_disks - conf->max_degraded);
+        logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
+                                               stripe_sectors);
+        sector_div(last_sector, stripe_sectors);
+
+        logical_sector *= conf->chunk_sectors;
+        last_sector *= conf->chunk_sectors;
+
+        for (; logical_sector < last_sector;
+             logical_sector += STRIPE_SECTORS) {
+                DEFINE_WAIT(w);
+                int d;
+        again:
+                sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
+                prepare_to_wait(&conf->wait_for_overlap, &w,
+                                TASK_UNINTERRUPTIBLE);
+                spin_lock_irq(&sh->stripe_lock);
+                for (d = 0; d < conf->raid_disks; d++) {
+                        if (d == sh->pd_idx || d == sh->qd_idx)
+                                continue;
+                        if (sh->dev[d].towrite || sh->dev[d].toread) {
+                                set_bit(R5_Overlap, &sh->dev[d].flags);
+                                spin_unlock_irq(&sh->stripe_lock);
+                                release_stripe(sh);
+                                schedule();
+                                goto again;
+                        }
+                }
+                finish_wait(&conf->wait_for_overlap, &w);
+                for (d = 0; d < conf->raid_disks; d++) {
+                        if (d == sh->pd_idx || d == sh->qd_idx)
+                                continue;
+                        sh->dev[d].towrite = bi;
+                        set_bit(R5_OVERWRITE, &sh->dev[d].flags);
+                        raid5_inc_bi_active_stripes(bi);
+                }
+                spin_unlock_irq(&sh->stripe_lock);
+                if (conf->mddev->bitmap) {
+                        for (d = 0;
+                             d < conf->raid_disks - conf->max_degraded;
+                             d++)
+                                bitmap_startwrite(mddev->bitmap,
+                                                  sh->sector,
+                                                  STRIPE_SECTORS,
+                                                  0);
+                        sh->bm_seq = conf->seq_flush + 1;
+                        set_bit(STRIPE_BIT_DELAY, &sh->state);
+                }
+
+                set_bit(STRIPE_HANDLE, &sh->state);
+                clear_bit(STRIPE_DELAYED, &sh->state);
+                if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+                        atomic_inc(&conf->preread_active_stripes);
+                release_stripe_plug(mddev, sh);
+        }
+
+        remaining = raid5_dec_bi_active_stripes(bi);
+        if (remaining == 0) {
+                md_write_end(mddev);
+                bio_endio(bi, 0);
+        }
+}
+
 static void make_request(struct mddev *mddev, struct bio * bi)
 {
         struct r5conf *conf = mddev->private;
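
make_discard_request() is the heart of the patch: it shrinks the discard inward to whole data stripes (chunk_sectors times the number of data disks), retries while any covered device has an overlapping read or write pending, attaches the bio as a full-stripe overwrite on every data device, and lets normal stripe handling issue the per-disk discards. The unaligned head and tail of the request are silently dropped, which is also why discard_zeroes_data cannot be promised. An illustrative userspace model of the range rounding, with example geometry rather than anything taken from the patch:

#include <stdio.h>

typedef unsigned long long sector_t;

/* Local stand-in for the kernel's DIV_ROUND_UP_SECTOR_T(). */
#define DIV_ROUND_UP_SECTOR_T(x, y)     (((x) + (y) - 1) / (y))

int main(void)
{
        sector_t chunk_sectors = 128;                   /* 64 KiB chunks */
        sector_t data_disks = 4;                        /* raid_disks - max_degraded */
        sector_t stripe_sectors = chunk_sectors * data_disks;

        sector_t bi_sector = 1000, bi_sectors = 4000;   /* incoming bio */
        sector_t first = DIV_ROUND_UP_SECTOR_T(bi_sector, stripe_sectors);
        sector_t last = (bi_sector + bi_sectors) / stripe_sectors;

        if (first >= last) {
                printf("no whole stripe covered; nothing discarded\n");
                return 0;
        }
        printf("discard sectors %llu..%llu; head %llu..%llu and tail %llu..%llu ignored\n",
               first * stripe_sectors, last * stripe_sectors - 1,
               bi_sector, first * stripe_sectors - 1,
               last * stripe_sectors, bi_sector + bi_sectors - 1);
        return 0;
}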
@@ -4089,6 +4210,11 @@ static void make_request(struct mddev *mddev, struct bio * bi)
              chunk_aligned_read(mddev,bi))
                 return;
 
+        if (unlikely(bi->bi_rw & REQ_DISCARD)) {
+                make_discard_request(mddev, bi);
+                return;
+        }
+
         logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
         last_sector = bi->bi_sector + (bi->bi_size>>9);
         bi->bi_next = NULL;
@@ -5362,6 +5488,7 @@ static int run(struct mddev *mddev)
 
         if (mddev->queue) {
                 int chunk_size;
+                bool discard_supported = true;
                 /* read-ahead size must cover two whole stripes, which
                  * is 2 * (datadisks) * chunksize where 'n' is the
                  * number of raid devices
@@ -5381,13 +5508,48 @@ static int run(struct mddev *mddev)
                 blk_queue_io_min(mddev->queue, chunk_size);
                 blk_queue_io_opt(mddev->queue, chunk_size *
                                  (conf->raid_disks - conf->max_degraded));
+                /*
+                 * We can only discard a whole stripe: it doesn't make sense
+                 * to discard the data disks but still write the parity disk.
+                 */
+                stripe = stripe * PAGE_SIZE;
+                mddev->queue->limits.discard_alignment = stripe;
+                mddev->queue->limits.discard_granularity = stripe;
+                /*
+                 * The unaligned part of a discard request is ignored, so we
+                 * can't guarantee discard_zeroes_data.
+                 */
+                mddev->queue->limits.discard_zeroes_data = 0;
 
                 rdev_for_each(rdev, mddev) {
                         disk_stack_limits(mddev->gendisk, rdev->bdev,
                                           rdev->data_offset << 9);
                         disk_stack_limits(mddev->gendisk, rdev->bdev,
                                           rdev->new_data_offset << 9);
+                        /*
+                         * discard_zeroes_data is required, otherwise data
+                         * could be lost. Consider this scenario: discard a
+                         * stripe (the stripe could be inconsistent if
+                         * discard_zeroes_data is 0); write one disk of the
+                         * stripe (the stripe could be inconsistent again
+                         * depending on which disks are used to calculate
+                         * parity); the disk breaks; the stripe data of this
+                         * disk is lost.
+                         */
+                        if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
+                            !bdev_get_queue(rdev->bdev)->
+                                limits.discard_zeroes_data)
+                                discard_supported = false;
                 }
+
+                if (discard_supported &&
+                    mddev->queue->limits.max_discard_sectors >= stripe &&
+                    mddev->queue->limits.discard_granularity >= stripe)
+                        queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+                                                mddev->queue);
+                else
+                        queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
+                                                  mddev->queue);
         }
 
         return 0;
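
The run() changes advertise a discard granularity and alignment of one full stripe, and enable QUEUE_FLAG_DISCARD only when every member device both supports discard and guarantees discard_zeroes_data. A sketch of the resulting granularity, mirroring the way run() builds `stripe` out of the readahead calculation; the geometry is illustrative:

#include <stdio.h>

int main(void)
{
        unsigned long page_size = 4096;
        unsigned long chunk_sectors = 128;      /* 64 KiB chunk */
        int raid_disks = 5, max_degraded = 1;   /* 5-disk RAID5 */

        /* run(): stripe = data_disks * ((chunk_sectors << 9) / PAGE_SIZE),
         * later scaled back to bytes with stripe *= PAGE_SIZE. */
        unsigned long stripe = (raid_disks - max_degraded) *
                               ((chunk_sectors << 9) / page_size);
        stripe *= page_size;

        printf("discard_granularity = %lu bytes\n", stripe);   /* 262144 */
        return 0;
}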
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a9fc24901eda..18b2c4a8a1fd 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -298,6 +298,7 @@ enum r5dev_flags {
         R5_WantReplace, /* We need to update the replacement, we have read
                          * data in, and now is a good time to write it out.
                          */
+        R5_Discard,     /* Discard the stripe */
 };
 
 /*
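
Once an array is assembled from qualifying devices, the advertised limits are visible through the standard block-queue sysfs attributes. A hypothetical userspace check; the md0 device name is an example only:

#include <stdio.h>

int main(void)
{
        char buf[64];
        FILE *f = fopen("/sys/block/md0/queue/discard_granularity", "r");

        if (!f)
                return 1;       /* no such array on this machine */
        if (fgets(buf, sizeof(buf), f))
                printf("discard_granularity: %s", buf);
        fclose(f);
        return 0;
}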