aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/dm.c
diff options
context:
space:
mode:
author Tejun Heo <tj@kernel.org> 2010-09-03 05:56:19 -0400
committer Jens Axboe <jaxboe@fusionio.com> 2010-09-10 06:35:38 -0400
commit d87f4c14f27dc82d215108d8392a7d26687148a1 (patch)
tree 55f2a81f3df5d70fd85c4428089f6fe28540bcf4 /drivers/md/dm.c
parent 3a2edd0d6ddbd5fa3b389ea6db811285415ce6c8 (diff)
dm: implement REQ_FLUSH/FUA support for bio-based dm
This patch converts bio-based dm to support REQ_FLUSH/FUA instead of the now deprecated REQ_HARDBARRIER. * -EOPNOTSUPP handling logic dropped. * Preflush is handled as before but postflush is dropped and replaced with passing down REQ_FUA to member request_queues. This replaces one array wide cache flush w/ member specific FUA writes. * __split_and_process_bio() now calls __clone_and_map_flush() directly for flushes and guarantees all FLUSH bio's going to targets are zero length. * It's now guaranteed that all FLUSH bio's which are passed onto dm targets are zero length. bio_empty_barrier() tests are replaced with REQ_FLUSH tests. * Empty WRITE_BARRIERs are replaced with WRITE_FLUSHes. * Dropped unlikely() around REQ_FLUSH tests. Flushes are not unlikely enough to be marked with unlikely(). * Block layer now filters out REQ_FLUSH/FUA bio's if the request_queue doesn't support cache flushing. Advertise REQ_FLUSH | REQ_FUA capability. * Request based dm isn't converted yet. dm_init_request_based_queue() resets flush support to 0 for now. To avoid disturbing request based dm code, dm->flush_error is added for bio based dm while request based dm continues to use dm->barrier_error. Lightly tested linear, stripe, raid1, snap and crypt targets. Please proceed with caution as I'm not familiar with the code base. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: dm-devel@redhat.com Cc: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r-- drivers/md/dm.c 119
1 files changed, 57 insertions, 62 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b1d92be8f990..32e6622767ad 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -144,15 +144,16 @@ struct mapped_device {
144 spinlock_t deferred_lock; 144 spinlock_t deferred_lock;
145 145
146 /* 146 /*
147 * An error from the barrier request currently being processed. 147 * An error from the flush request currently being processed.
148 */ 148 */
149 int barrier_error; 149 int flush_error;
150 150
151 /* 151 /*
152 * Protect barrier_error from concurrent endio processing 152 * Protect barrier_error from concurrent endio processing
153 * in request-based dm. 153 * in request-based dm.
154 */ 154 */
155 spinlock_t barrier_error_lock; 155 spinlock_t barrier_error_lock;
156 int barrier_error;
156 157
157 /* 158 /*
158 * Processing queue (flush/barriers) 159 * Processing queue (flush/barriers)
@@ -200,8 +201,8 @@ struct mapped_device {
200 /* sysfs handle */ 201 /* sysfs handle */
201 struct kobject kobj; 202 struct kobject kobj;
202 203
203 /* zero-length barrier that will be cloned and submitted to targets */ 204 /* zero-length flush that will be cloned and submitted to targets */
204 struct bio barrier_bio; 205 struct bio flush_bio;
205}; 206};
206 207
207/* 208/*
@@ -512,7 +513,7 @@ static void end_io_acct(struct dm_io *io)
512 513
513 /* 514 /*
514 * After this is decremented the bio must not be touched if it is 515 * After this is decremented the bio must not be touched if it is
515 * a barrier. 516 * a flush.
516 */ 517 */
517 dm_disk(md)->part0.in_flight[rw] = pending = 518 dm_disk(md)->part0.in_flight[rw] = pending =
518 atomic_dec_return(&md->pending[rw]); 519 atomic_dec_return(&md->pending[rw]);
@@ -626,7 +627,7 @@ static void dec_pending(struct dm_io *io, int error)
626 */ 627 */
627 spin_lock_irqsave(&md->deferred_lock, flags); 628 spin_lock_irqsave(&md->deferred_lock, flags);
628 if (__noflush_suspending(md)) { 629 if (__noflush_suspending(md)) {
629 if (!(io->bio->bi_rw & REQ_HARDBARRIER)) 630 if (!(io->bio->bi_rw & REQ_FLUSH))
630 bio_list_add_head(&md->deferred, 631 bio_list_add_head(&md->deferred,
631 io->bio); 632 io->bio);
632 } else 633 } else
@@ -638,20 +639,14 @@ static void dec_pending(struct dm_io *io, int error)
638 io_error = io->error; 639 io_error = io->error;
639 bio = io->bio; 640 bio = io->bio;
640 641
641 if (bio->bi_rw & REQ_HARDBARRIER) { 642 if (bio->bi_rw & REQ_FLUSH) {
642 /* 643 /*
643 * There can be just one barrier request so we use 644 * There can be just one flush request so we use
644 * a per-device variable for error reporting. 645 * a per-device variable for error reporting.
645 * Note that you can't touch the bio after end_io_acct 646 * Note that you can't touch the bio after end_io_acct
646 *
647 * We ignore -EOPNOTSUPP for empty flush reported by
648 * underlying devices. We assume that if the device
649 * doesn't support empty barriers, it doesn't need
650 * cache flushing commands.
651 */ 647 */
652 if (!md->barrier_error && 648 if (!md->flush_error)
653 !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP)) 649 md->flush_error = io_error;
654 md->barrier_error = io_error;
655 end_io_acct(io); 650 end_io_acct(io);
656 free_io(md, io); 651 free_io(md, io);
657 } else { 652 } else {
@@ -1119,7 +1114,7 @@ static void dm_bio_destructor(struct bio *bio)
1119} 1114}
1120 1115
1121/* 1116/*
1122 * Creates a little bio that is just does part of a bvec. 1117 * Creates a little bio that just does part of a bvec.
1123 */ 1118 */
1124static struct bio *split_bvec(struct bio *bio, sector_t sector, 1119static struct bio *split_bvec(struct bio *bio, sector_t sector,
1125 unsigned short idx, unsigned int offset, 1120 unsigned short idx, unsigned int offset,
@@ -1134,7 +1129,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
1134 1129
1135 clone->bi_sector = sector; 1130 clone->bi_sector = sector;
1136 clone->bi_bdev = bio->bi_bdev; 1131 clone->bi_bdev = bio->bi_bdev;
1137 clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER; 1132 clone->bi_rw = bio->bi_rw;
1138 clone->bi_vcnt = 1; 1133 clone->bi_vcnt = 1;
1139 clone->bi_size = to_bytes(len); 1134 clone->bi_size = to_bytes(len);
1140 clone->bi_io_vec->bv_offset = offset; 1135 clone->bi_io_vec->bv_offset = offset;
@@ -1161,7 +1156,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
1161 1156
1162 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1157 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1163 __bio_clone(clone, bio); 1158 __bio_clone(clone, bio);
1164 clone->bi_rw &= ~REQ_HARDBARRIER;
1165 clone->bi_destructor = dm_bio_destructor; 1159 clone->bi_destructor = dm_bio_destructor;
1166 clone->bi_sector = sector; 1160 clone->bi_sector = sector;
1167 clone->bi_idx = idx; 1161 clone->bi_idx = idx;
@@ -1225,7 +1219,7 @@ static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
1225 __issue_target_request(ci, ti, request_nr, len); 1219 __issue_target_request(ci, ti, request_nr, len);
1226} 1220}
1227 1221
1228static int __clone_and_map_empty_barrier(struct clone_info *ci) 1222static int __clone_and_map_flush(struct clone_info *ci)
1229{ 1223{
1230 unsigned target_nr = 0; 1224 unsigned target_nr = 0;
1231 struct dm_target *ti; 1225 struct dm_target *ti;
@@ -1289,9 +1283,6 @@ static int __clone_and_map(struct clone_info *ci)
1289 sector_t len = 0, max; 1283 sector_t len = 0, max;
1290 struct dm_target_io *tio; 1284 struct dm_target_io *tio;
1291 1285
1292 if (unlikely(bio_empty_barrier(bio)))
1293 return __clone_and_map_empty_barrier(ci);
1294
1295 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1286 if (unlikely(bio->bi_rw & REQ_DISCARD))
1296 return __clone_and_map_discard(ci); 1287 return __clone_and_map_discard(ci);
1297 1288
@@ -1383,11 +1374,11 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1383 1374
1384 ci.map = dm_get_live_table(md); 1375 ci.map = dm_get_live_table(md);
1385 if (unlikely(!ci.map)) { 1376 if (unlikely(!ci.map)) {
1386 if (!(bio->bi_rw & REQ_HARDBARRIER)) 1377 if (!(bio->bi_rw & REQ_FLUSH))
1387 bio_io_error(bio); 1378 bio_io_error(bio);
1388 else 1379 else
1389 if (!md->barrier_error) 1380 if (!md->flush_error)
1390 md->barrier_error = -EIO; 1381 md->flush_error = -EIO;
1391 return; 1382 return;
1392 } 1383 }
1393 1384
@@ -1400,14 +1391,22 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1400 ci.io->md = md; 1391 ci.io->md = md;
1401 spin_lock_init(&ci.io->endio_lock); 1392 spin_lock_init(&ci.io->endio_lock);
1402 ci.sector = bio->bi_sector; 1393 ci.sector = bio->bi_sector;
1403 ci.sector_count = bio_sectors(bio); 1394 if (!(bio->bi_rw & REQ_FLUSH))
1404 if (unlikely(bio_empty_barrier(bio))) 1395 ci.sector_count = bio_sectors(bio);
1396 else {
1397 /* all FLUSH bio's reaching here should be empty */
1398 WARN_ON_ONCE(bio_has_data(bio));
1405 ci.sector_count = 1; 1399 ci.sector_count = 1;
1400 }
1406 ci.idx = bio->bi_idx; 1401 ci.idx = bio->bi_idx;
1407 1402
1408 start_io_acct(ci.io); 1403 start_io_acct(ci.io);
1409 while (ci.sector_count && !error) 1404 while (ci.sector_count && !error) {
1410 error = __clone_and_map(&ci); 1405 if (!(bio->bi_rw & REQ_FLUSH))
1406 error = __clone_and_map(&ci);
1407 else
1408 error = __clone_and_map_flush(&ci);
1409 }
1411 1410
1412 /* drop the extra reference count */ 1411 /* drop the extra reference count */
1413 dec_pending(ci.io, error); 1412 dec_pending(ci.io, error);
@@ -1492,11 +1491,11 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
1492 part_stat_unlock(); 1491 part_stat_unlock();
1493 1492
1494 /* 1493 /*
1495 * If we're suspended or the thread is processing barriers 1494 * If we're suspended or the thread is processing flushes
1496 * we have to queue this io for later. 1495 * we have to queue this io for later.
1497 */ 1496 */
1498 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || 1497 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
1499 unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 1498 (bio->bi_rw & REQ_FLUSH)) {
1500 up_read(&md->io_lock); 1499 up_read(&md->io_lock);
1501 1500
1502 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && 1501 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
@@ -1940,6 +1939,7 @@ static void dm_init_md_queue(struct mapped_device *md)
1940 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1939 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1941 md->queue->unplug_fn = dm_unplug_all; 1940 md->queue->unplug_fn = dm_unplug_all;
1942 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1941 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1942 blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
1943} 1943}
1944 1944
1945/* 1945/*
@@ -2245,7 +2245,8 @@ static int dm_init_request_based_queue(struct mapped_device *md)
2245 blk_queue_softirq_done(md->queue, dm_softirq_done); 2245 blk_queue_softirq_done(md->queue, dm_softirq_done);
2246 blk_queue_prep_rq(md->queue, dm_prep_fn); 2246 blk_queue_prep_rq(md->queue, dm_prep_fn);
2247 blk_queue_lld_busy(md->queue, dm_lld_busy); 2247 blk_queue_lld_busy(md->queue, dm_lld_busy);
2248 blk_queue_flush(md->queue, REQ_FLUSH); 2248 /* no flush support for request based dm yet */
2249 blk_queue_flush(md->queue, 0);
2249 2250
2250 elv_register_queue(md->queue); 2251 elv_register_queue(md->queue);
2251 2252
@@ -2406,41 +2407,35 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2406 return r; 2407 return r;
2407} 2408}
2408 2409
2409static void dm_flush(struct mapped_device *md) 2410static void process_flush(struct mapped_device *md, struct bio *bio)
2410{ 2411{
2411 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2412 md->flush_error = 0;
2412
2413 bio_init(&md->barrier_bio);
2414 md->barrier_bio.bi_bdev = md->bdev;
2415 md->barrier_bio.bi_rw = WRITE_BARRIER;
2416 __split_and_process_bio(md, &md->barrier_bio);
2417 2413
2414 /* handle REQ_FLUSH */
2418 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2415 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2419}
2420 2416
2421static void process_barrier(struct mapped_device *md, struct bio *bio) 2417 bio_init(&md->flush_bio);
2422{ 2418 md->flush_bio.bi_bdev = md->bdev;
2423 md->barrier_error = 0; 2419 md->flush_bio.bi_rw = WRITE_FLUSH;
2420 __split_and_process_bio(md, &md->flush_bio);
2424 2421
2425 dm_flush(md); 2422 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2426 2423
2427 if (!bio_empty_barrier(bio)) { 2424 /* if it's an empty flush or the preflush failed, we're done */
2428 __split_and_process_bio(md, bio); 2425 if (!bio_has_data(bio) || md->flush_error) {
2429 /* 2426 if (md->flush_error != DM_ENDIO_REQUEUE)
2430 * If the request isn't supported, don't waste time with 2427 bio_endio(bio, md->flush_error);
2431 * the second flush. 2428 else {
2432 */ 2429 spin_lock_irq(&md->deferred_lock);
2433 if (md->barrier_error != -EOPNOTSUPP) 2430 bio_list_add_head(&md->deferred, bio);
2434 dm_flush(md); 2431 spin_unlock_irq(&md->deferred_lock);
2432 }
2433 return;
2435 } 2434 }
2436 2435
2437 if (md->barrier_error != DM_ENDIO_REQUEUE) 2436 /* issue data + REQ_FUA */
2438 bio_endio(bio, md->barrier_error); 2437 bio->bi_rw &= ~REQ_FLUSH;
2439 else { 2438 __split_and_process_bio(md, bio);
2440 spin_lock_irq(&md->deferred_lock);
2441 bio_list_add_head(&md->deferred, bio);
2442 spin_unlock_irq(&md->deferred_lock);
2443 }
2444} 2439}
2445 2440
2446/* 2441/*
@@ -2469,8 +2464,8 @@ static void dm_wq_work(struct work_struct *work)
2469 if (dm_request_based(md)) 2464 if (dm_request_based(md))
2470 generic_make_request(c); 2465 generic_make_request(c);
2471 else { 2466 else {
2472 if (c->bi_rw & REQ_HARDBARRIER) 2467 if (c->bi_rw & REQ_FLUSH)
2473 process_barrier(md, c); 2468 process_flush(md, c);
2474 else 2469 else
2475 __split_and_process_bio(md, c); 2470 __split_and_process_bio(md, c);
2476 } 2471 }