Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r--	drivers/md/dm.c	477
1 file changed, 114 insertions(+), 363 deletions(-)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ac384b2a6a33..0cf68b478878 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -15,7 +15,6 @@
 #include <linux/blkpg.h>
 #include <linux/bio.h>
 #include <linux/buffer_head.h>
-#include <linux/smp_lock.h>
 #include <linux/mempool.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
@@ -110,7 +109,6 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_FREEING 3
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
-#define DMF_QUEUE_IO_TO_THREAD 6
 
 /*
  * Work processed by per-device workqueue.
@@ -144,24 +142,9 @@ struct mapped_device {
 	spinlock_t deferred_lock;
 
 	/*
-	 * An error from the barrier request currently being processed.
-	 */
-	int barrier_error;
-
-	/*
-	 * Protect barrier_error from concurrent endio processing
-	 * in request-based dm.
-	 */
-	spinlock_t barrier_error_lock;
-
-	/*
-	 * Processing queue (flush/barriers)
+	 * Processing queue (flush)
 	 */
 	struct workqueue_struct *wq;
-	struct work_struct barrier_work;
-
-	/* A pointer to the currently processing pre/post flush request */
-	struct request *flush_request;
 
 	/*
 	 * The current mapping.
@@ -200,8 +183,8 @@ struct mapped_device {
 	/* sysfs handle */
 	struct kobject kobj;
 
-	/* zero-length barrier that will be cloned and submitted to targets */
-	struct bio barrier_bio;
+	/* zero-length flush that will be cloned and submitted to targets */
+	struct bio flush_bio;
 };
 
 /*
@@ -344,7 +327,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 {
 	struct mapped_device *md;
 
-	lock_kernel();
 	spin_lock(&_minor_lock);
 
 	md = bdev->bd_disk->private_data;
@@ -362,7 +344,6 @@ static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 
 out:
 	spin_unlock(&_minor_lock);
-	unlock_kernel();
 
 	return md ? 0 : -ENXIO;
 }
@@ -371,10 +352,12 @@ static int dm_blk_close(struct gendisk *disk, fmode_t mode)
 {
 	struct mapped_device *md = disk->private_data;
 
-	lock_kernel();
+	spin_lock(&_minor_lock);
+
 	atomic_dec(&md->open_count);
 	dm_put(md);
-	unlock_kernel();
+
+	spin_unlock(&_minor_lock);
 
 	return 0;
 }
@@ -494,7 +477,8 @@ static void start_io_acct(struct dm_io *io)
 	cpu = part_stat_lock();
 	part_round_stats(cpu, &dm_disk(md)->part0);
 	part_stat_unlock();
-	dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]);
+	atomic_set(&dm_disk(md)->part0.in_flight[rw],
+		   atomic_inc_return(&md->pending[rw]));
 }
 
 static void end_io_acct(struct dm_io *io)
@@ -512,10 +496,10 @@ static void end_io_acct(struct dm_io *io)
 
 	/*
 	 * After this is decremented the bio must not be touched if it is
-	 * a barrier.
+	 * a flush.
 	 */
-	dm_disk(md)->part0.in_flight[rw] = pending =
-		atomic_dec_return(&md->pending[rw]);
+	pending = atomic_dec_return(&md->pending[rw]);
+	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 	pending += atomic_read(&md->pending[rw^0x1]);
 
 	/* nudge anyone waiting on suspend queue */
@@ -528,16 +512,12 @@ static void end_io_acct(struct dm_io *io)
  */
 static void queue_io(struct mapped_device *md, struct bio *bio)
 {
-	down_write(&md->io_lock);
+	unsigned long flags;
 
-	spin_lock_irq(&md->deferred_lock);
+	spin_lock_irqsave(&md->deferred_lock, flags);
 	bio_list_add(&md->deferred, bio);
-	spin_unlock_irq(&md->deferred_lock);
-
-	if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
-		queue_work(md->wq, &md->work);
-
-	up_write(&md->io_lock);
+	spin_unlock_irqrestore(&md->deferred_lock, flags);
+	queue_work(md->wq, &md->work);
 }
 
 /*
@@ -625,11 +605,9 @@ static void dec_pending(struct dm_io *io, int error)
 		 * Target requested pushing back the I/O.
 		 */
 		spin_lock_irqsave(&md->deferred_lock, flags);
-		if (__noflush_suspending(md)) {
-			if (!(io->bio->bi_rw & REQ_HARDBARRIER))
-				bio_list_add_head(&md->deferred,
-						  io->bio);
-		} else
+		if (__noflush_suspending(md))
+			bio_list_add_head(&md->deferred, io->bio);
+		else
 			/* noflush suspend was interrupted. */
 			io->error = -EIO;
 		spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -637,32 +615,23 @@ static void dec_pending(struct dm_io *io, int error)
 
 		io_error = io->error;
 		bio = io->bio;
+		end_io_acct(io);
+		free_io(md, io);
+
+		if (io_error == DM_ENDIO_REQUEUE)
+			return;
 
-		if (bio->bi_rw & REQ_HARDBARRIER) {
+		if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
 			/*
-			 * There can be just one barrier request so we use
-			 * a per-device variable for error reporting.
-			 * Note that you can't touch the bio after end_io_acct
-			 *
-			 * We ignore -EOPNOTSUPP for empty flush reported by
-			 * underlying devices. We assume that if the device
-			 * doesn't support empty barriers, it doesn't need
-			 * cache flushing commands.
+			 * Preflush done for flush with data, reissue
+			 * without REQ_FLUSH.
 			 */
-			if (!md->barrier_error &&
-			    !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP))
-				md->barrier_error = io_error;
-			end_io_acct(io);
-			free_io(md, io);
+			bio->bi_rw &= ~REQ_FLUSH;
+			queue_io(md, bio);
 		} else {
-			end_io_acct(io);
-			free_io(md, io);
-
-			if (io_error != DM_ENDIO_REQUEUE) {
-				trace_block_bio_complete(md->queue, bio);
-
-				bio_endio(bio, io_error);
-			}
+			/* done with normal IO or empty flush */
+			trace_block_bio_complete(md->queue, bio, io_error);
+			bio_endio(bio, io_error);
 		}
 	}
 }
@@ -755,23 +724,6 @@ static void end_clone_bio(struct bio *clone, int error)
 	blk_update_request(tio->orig, 0, nr_bytes);
 }
 
-static void store_barrier_error(struct mapped_device *md, int error)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&md->barrier_error_lock, flags);
-	/*
-	 * Basically, the first error is taken, but:
-	 * -EOPNOTSUPP supersedes any I/O error.
-	 * Requeue request supersedes any I/O error but -EOPNOTSUPP.
-	 */
-	if (!md->barrier_error || error == -EOPNOTSUPP ||
-	    (md->barrier_error != -EOPNOTSUPP &&
-	     error == DM_ENDIO_REQUEUE))
-		md->barrier_error = error;
-	spin_unlock_irqrestore(&md->barrier_error_lock, flags);
-}
-
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -809,13 +761,11 @@ static void free_rq_clone(struct request *clone)
 static void dm_end_request(struct request *clone, int error)
 {
 	int rw = rq_data_dir(clone);
-	int run_queue = 1;
-	bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 	struct request *rq = tio->orig;
 
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) {
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		rq->errors = clone->errors;
 		rq->resid_len = clone->resid_len;
 
@@ -829,15 +779,8 @@ static void dm_end_request(struct request *clone, int error)
 	}
 
 	free_rq_clone(clone);
-
-	if (unlikely(is_barrier)) {
-		if (unlikely(error))
-			store_barrier_error(md, error);
-		run_queue = 0;
-	} else
-		blk_end_request_all(rq, error);
-
-	rq_completed(md, rw, run_queue);
+	blk_end_request_all(rq, error);
+	rq_completed(md, rw, true);
 }
 
 static void dm_unprep_request(struct request *rq)
@@ -862,21 +805,9 @@ void dm_requeue_unmapped_request(struct request *clone)
 	struct request_queue *q = rq->q;
 	unsigned long flags;
 
-	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-		/*
-		 * Barrier clones share an original request.
-		 * Leave it to dm_end_request(), which handles this special
-		 * case.
-		 */
-		dm_end_request(clone, DM_ENDIO_REQUEUE);
-		return;
-	}
-
 	dm_unprep_request(rq);
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	if (elv_queue_empty(q))
-		blk_plug_device(q);
 	blk_requeue_request(q, rq);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
@@ -961,19 +892,6 @@ static void dm_complete_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
-	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-		/*
-		 * Barrier clones share an original request. So can't use
-		 * softirq_done with the original.
-		 * Pass the clone to dm_done() directly in this special case.
-		 * It is safe (even if clone->q->queue_lock is held here)
-		 * because there is no I/O dispatching during the completion
-		 * of barrier clone.
-		 */
-		dm_done(clone, error, true);
-		return;
-	}
-
 	tio->error = error;
 	rq->completion_data = clone;
 	blk_complete_request(rq);
@@ -990,17 +908,6 @@ void dm_kill_unmapped_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
-	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
-		/*
-		 * Barrier clones share an original request.
-		 * Leave it to dm_end_request(), which handles this special
-		 * case.
-		 */
-		BUG_ON(error > 0);
-		dm_end_request(clone, error);
-		return;
-	}
-
 	rq->cmd_flags |= REQ_FAILED;
 	dm_complete_request(clone, error);
 }
@@ -1081,8 +988,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
 
-		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev, sector);
+		trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
+				      tio->io->bio->bi_bdev->bd_dev, sector);
 
 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
@@ -1119,7 +1026,7 @@ static void dm_bio_destructor(struct bio *bio)
 }
 
 /*
- * Creates a little bio that is just does part of a bvec.
+ * Creates a little bio that just does part of a bvec.
  */
 static struct bio *split_bvec(struct bio *bio, sector_t sector,
 			      unsigned short idx, unsigned int offset,
@@ -1134,7 +1041,7 @@ static struct bio *split_bvec(struct bio *bio, sector_t sector,
 
 	clone->bi_sector = sector;
 	clone->bi_bdev = bio->bi_bdev;
-	clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
+	clone->bi_rw = bio->bi_rw;
 	clone->bi_vcnt = 1;
 	clone->bi_size = to_bytes(len);
 	clone->bi_io_vec->bv_offset = offset;
@@ -1161,7 +1068,6 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 
 	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
 	__bio_clone(clone, bio);
-	clone->bi_rw &= ~REQ_HARDBARRIER;
 	clone->bi_destructor = dm_bio_destructor;
 	clone->bi_sector = sector;
 	clone->bi_idx = idx;
@@ -1225,16 +1131,15 @@ static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
 		__issue_target_request(ci, ti, request_nr, len);
 }
 
-static int __clone_and_map_empty_barrier(struct clone_info *ci)
+static int __clone_and_map_empty_flush(struct clone_info *ci)
 {
 	unsigned target_nr = 0;
 	struct dm_target *ti;
 
+	BUG_ON(bio_has_data(ci->bio));
 	while ((ti = dm_table_get_target(ci->map, target_nr++)))
 		__issue_target_requests(ci, ti, ti->num_flush_requests, 0);
 
-	ci->sector_count = 0;
-
 	return 0;
 }
 
@@ -1289,9 +1194,6 @@ static int __clone_and_map(struct clone_info *ci)
 	sector_t len = 0, max;
 	struct dm_target_io *tio;
 
-	if (unlikely(bio_empty_barrier(bio)))
-		return __clone_and_map_empty_barrier(ci);
-
 	if (unlikely(bio->bi_rw & REQ_DISCARD))
 		return __clone_and_map_discard(ci);
 
@@ -1383,16 +1285,11 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
 	ci.map = dm_get_live_table(md);
 	if (unlikely(!ci.map)) {
-		if (!(bio->bi_rw & REQ_HARDBARRIER))
-			bio_io_error(bio);
-		else
-			if (!md->barrier_error)
-				md->barrier_error = -EIO;
+		bio_io_error(bio);
 		return;
 	}
 
 	ci.md = md;
-	ci.bio = bio;
 	ci.io = alloc_io(md);
 	ci.io->error = 0;
 	atomic_set(&ci.io->io_count, 1);
@@ -1400,14 +1297,20 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 	ci.io->md = md;
 	spin_lock_init(&ci.io->endio_lock);
 	ci.sector = bio->bi_sector;
-	ci.sector_count = bio_sectors(bio);
-	if (unlikely(bio_empty_barrier(bio)))
-		ci.sector_count = 1;
 	ci.idx = bio->bi_idx;
 
 	start_io_acct(ci.io);
-	while (ci.sector_count && !error)
-		error = __clone_and_map(&ci);
+	if (bio->bi_rw & REQ_FLUSH) {
+		ci.bio = &ci.md->flush_bio;
+		ci.sector_count = 0;
+		error = __clone_and_map_empty_flush(&ci);
+		/* dec_pending submits any data associated with flush */
+	} else {
+		ci.bio = bio;
+		ci.sector_count = bio_sectors(bio);
+		while (ci.sector_count && !error)
+			error = __clone_and_map(&ci);
+	}
 
 	/* drop the extra reference count */
 	dec_pending(ci.io, error);
@@ -1491,22 +1394,14 @@ static int _dm_request(struct request_queue *q, struct bio *bio)
 	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
 	part_stat_unlock();
 
-	/*
-	 * If we're suspended or the thread is processing barriers
-	 * we have to queue this io for later.
-	 */
-	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
-	    unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
+	/* if we're suspended, we have to queue this io for later */
+	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
 		up_read(&md->io_lock);
 
-		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
-		    bio_rw(bio) == READA) {
+		if (bio_rw(bio) != READA)
+			queue_io(md, bio);
+		else
 			bio_io_error(bio);
-			return 0;
-		}
-
-		queue_io(md, bio);
-
 		return 0;
 	}
 
@@ -1537,14 +1432,6 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 	return _dm_request(q, bio);
 }
 
-static bool dm_rq_is_flush_request(struct request *rq)
-{
-	if (rq->cmd_flags & REQ_FLUSH)
-		return true;
-	else
-		return false;
-}
-
 void dm_dispatch_request(struct request *rq)
 {
 	int r;
@@ -1592,22 +1479,15 @@ static int setup_clone(struct request *clone, struct request *rq,
 {
 	int r;
 
-	if (dm_rq_is_flush_request(rq)) {
-		blk_rq_init(NULL, clone);
-		clone->cmd_type = REQ_TYPE_FS;
-		clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
-	} else {
-		r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
-				      dm_rq_bio_constructor, tio);
-		if (r)
-			return r;
-
-		clone->cmd = rq->cmd;
-		clone->cmd_len = rq->cmd_len;
-		clone->sense = rq->sense;
-		clone->buffer = rq->buffer;
-	}
+	r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+			      dm_rq_bio_constructor, tio);
+	if (r)
+		return r;
 
+	clone->cmd = rq->cmd;
+	clone->cmd_len = rq->cmd_len;
+	clone->sense = rq->sense;
+	clone->buffer = rq->buffer;
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
 
@@ -1648,9 +1528,6 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
 	struct mapped_device *md = q->queuedata;
 	struct request *clone;
 
-	if (unlikely(dm_rq_is_flush_request(rq)))
-		return BLKPREP_OK;
-
 	if (unlikely(rq->special)) {
 		DMWARN("Already has something in rq->special.");
 		return BLKPREP_KILL;
@@ -1727,6 +1604,7 @@ static void dm_request_fn(struct request_queue *q)
 	struct dm_table *map = dm_get_live_table(md);
 	struct dm_target *ti;
 	struct request *rq, *clone;
+	sector_t pos;
 
 	/*
 	 * For suspend, check blk_queue_stopped() and increment
@@ -1734,22 +1612,21 @@ static void dm_request_fn(struct request_queue *q)
 	 * number of in-flight I/Os after the queue is stopped in
 	 * dm_suspend().
 	 */
-	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
+	while (!blk_queue_stopped(q)) {
 		rq = blk_peek_request(q);
 		if (!rq)
-			goto plug_and_out;
+			goto delay_and_out;
 
-		if (unlikely(dm_rq_is_flush_request(rq))) {
-			BUG_ON(md->flush_request);
-			md->flush_request = rq;
-			blk_start_request(rq);
-			queue_work(md->wq, &md->barrier_work);
-			goto out;
-		}
+		/* always use block 0 to find the target for flushes for now */
+		pos = 0;
+		if (!(rq->cmd_flags & REQ_FLUSH))
+			pos = blk_rq_pos(rq);
+
+		ti = dm_table_find_target(map, pos);
+		BUG_ON(!dm_target_is_valid(ti));
 
-		ti = dm_table_find_target(map, blk_rq_pos(rq));
 		if (ti->type->busy && ti->type->busy(ti))
-			goto plug_and_out;
+			goto delay_and_out;
 
 		blk_start_request(rq);
 		clone = rq->special;
@@ -1759,19 +1636,18 @@ static void dm_request_fn(struct request_queue *q)
 		if (map_request(ti, clone, md))
 			goto requeued;
 
-		spin_lock_irq(q->queue_lock);
+		BUG_ON(!irqs_disabled());
+		spin_lock(q->queue_lock);
 	}
 
 	goto out;
 
 requeued:
-	spin_lock_irq(q->queue_lock);
-
-plug_and_out:
-	if (!elv_queue_empty(q))
-		/* Some requests still remain, retry later */
-		blk_plug_device(q);
+	BUG_ON(!irqs_disabled());
+	spin_lock(q->queue_lock);
 
+delay_and_out:
+	blk_delay_queue(q, HZ / 10);
 out:
 	dm_table_put(map);
 
@@ -1800,20 +1676,6 @@ static int dm_lld_busy(struct request_queue *q)
 	return r;
 }
 
-static void dm_unplug_all(struct request_queue *q)
-{
-	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table(md);
-
-	if (map) {
-		if (dm_request_based(md))
-			generic_unplug_device(q);
-
-		dm_table_unplug_all(map);
-		dm_table_put(map);
-	}
-}
-
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r = bdi_bits;
@@ -1918,7 +1780,6 @@ out:
 static const struct block_device_operations dm_blk_dops;
 
 static void dm_wq_work(struct work_struct *work);
-static void dm_rq_barrier_work(struct work_struct *work);
 
 static void dm_init_md_queue(struct mapped_device *md)
 {
@@ -1938,8 +1799,8 @@ static void dm_init_md_queue(struct mapped_device *md)
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
-	md->queue->unplug_fn = dm_unplug_all;
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+	blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
 }
 
 /*
@@ -1972,7 +1833,6 @@ static struct mapped_device *alloc_dev(int minor)
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->type_lock);
 	spin_lock_init(&md->deferred_lock);
-	spin_lock_init(&md->barrier_error_lock);
 	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
 	atomic_set(&md->open_count, 0);
@@ -1995,7 +1855,6 @@ static struct mapped_device *alloc_dev(int minor)
 	atomic_set(&md->pending[1], 0);
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
-	INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
 	init_waitqueue_head(&md->eventq);
 
 	md->disk->major = _major;
@@ -2007,7 +1866,8 @@ static struct mapped_device *alloc_dev(int minor)
 	add_disk(md->disk);
 	format_dev_t(md->name, MKDEV(_major, minor));
 
-	md->wq = create_singlethread_workqueue("kdmflush");
+	md->wq = alloc_workqueue("kdmflush",
+				 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
 	if (!md->wq)
 		goto bad_thread;
 
@@ -2015,6 +1875,10 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->bdev)
 		goto bad_bdev;
 
+	bio_init(&md->flush_bio);
+	md->flush_bio.bi_bdev = md->bdev;
+	md->flush_bio.bi_rw = WRITE_FLUSH;
+
 	/* Populate the mapping, nobody knows we exist yet */
 	spin_lock(&_minor_lock);
 	old_md = idr_replace(&_minor_idr, md, minor);
@@ -2111,13 +1975,14 @@ static void event_callback(void *context)
 	wake_up(&md->eventq);
 }
 
+/*
+ * Protected by md->suspend_lock obtained by dm_swap_table().
+ */
 static void __set_size(struct mapped_device *md, sector_t size)
 {
 	set_capacity(md->disk, size);
 
-	mutex_lock(&md->bdev->bd_inode->i_mutex);
 	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-	mutex_unlock(&md->bdev->bd_inode->i_mutex);
 }
 
 /*
@@ -2245,7 +2110,6 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
 	blk_queue_lld_busy(md->queue, dm_lld_busy);
-	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);
 
 	elv_register_queue(md->queue);
 
@@ -2380,8 +2244,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 	int r = 0;
 	DECLARE_WAITQUEUE(wait, current);
 
-	dm_unplug_all(md->queue);
-
 	add_wait_queue(&md->wait, &wait);
 
 	while (1) {
@@ -2406,43 +2268,6 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 	return r;
 }
 
-static void dm_flush(struct mapped_device *md)
-{
-	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-
-	bio_init(&md->barrier_bio);
-	md->barrier_bio.bi_bdev = md->bdev;
-	md->barrier_bio.bi_rw = WRITE_BARRIER;
-	__split_and_process_bio(md, &md->barrier_bio);
-
-	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-}
-
-static void process_barrier(struct mapped_device *md, struct bio *bio)
-{
-	md->barrier_error = 0;
-
-	dm_flush(md);
-
-	if (!bio_empty_barrier(bio)) {
-		__split_and_process_bio(md, bio);
-		/*
-		 * If the request isn't supported, don't waste time with
-		 * the second flush.
-		 */
-		if (md->barrier_error != -EOPNOTSUPP)
-			dm_flush(md);
-	}
-
-	if (md->barrier_error != DM_ENDIO_REQUEUE)
-		bio_endio(bio, md->barrier_error);
-	else {
-		spin_lock_irq(&md->deferred_lock);
-		bio_list_add_head(&md->deferred, bio);
-		spin_unlock_irq(&md->deferred_lock);
-	}
-}
-
 /*
  * Process the deferred bios
  */
@@ -2452,33 +2277,27 @@ static void dm_wq_work(struct work_struct *work)
 						work);
 	struct bio *c;
 
-	down_write(&md->io_lock);
+	down_read(&md->io_lock);
 
 	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
 		spin_lock_irq(&md->deferred_lock);
 		c = bio_list_pop(&md->deferred);
 		spin_unlock_irq(&md->deferred_lock);
 
-		if (!c) {
-			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
+		if (!c)
 			break;
-		}
 
-		up_write(&md->io_lock);
+		up_read(&md->io_lock);
 
 		if (dm_request_based(md))
 			generic_make_request(c);
-		else {
-			if (c->bi_rw & REQ_HARDBARRIER)
-				process_barrier(md, c);
-			else
-				__split_and_process_bio(md, c);
-		}
+		else
+			__split_and_process_bio(md, c);
 
-		down_write(&md->io_lock);
+		down_read(&md->io_lock);
 	}
 
-	up_write(&md->io_lock);
+	up_read(&md->io_lock);
 }
 
 static void dm_queue_flush(struct mapped_device *md)
@@ -2488,73 +2307,6 @@ static void dm_queue_flush(struct mapped_device *md)
 	queue_work(md->wq, &md->work);
 }
 
-static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
-{
-	struct dm_rq_target_io *tio = clone->end_io_data;
-
-	tio->info.target_request_nr = request_nr;
-}
-
-/* Issue barrier requests to targets and wait for their completion. */
-static int dm_rq_barrier(struct mapped_device *md)
-{
-	int i, j;
-	struct dm_table *map = dm_get_live_table(md);
-	unsigned num_targets = dm_table_get_num_targets(map);
-	struct dm_target *ti;
-	struct request *clone;
-
-	md->barrier_error = 0;
-
-	for (i = 0; i < num_targets; i++) {
-		ti = dm_table_get_target(map, i);
-		for (j = 0; j < ti->num_flush_requests; j++) {
-			clone = clone_rq(md->flush_request, md, GFP_NOIO);
-			dm_rq_set_target_request_nr(clone, j);
-			atomic_inc(&md->pending[rq_data_dir(clone)]);
-			map_request(ti, clone, md);
-		}
-	}
-
-	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-	dm_table_put(map);
-
-	return md->barrier_error;
-}
-
-static void dm_rq_barrier_work(struct work_struct *work)
-{
-	int error;
-	struct mapped_device *md = container_of(work, struct mapped_device,
-						barrier_work);
-	struct request_queue *q = md->queue;
-	struct request *rq;
-	unsigned long flags;
-
-	/*
-	 * Hold the md reference here and leave it at the last part so that
-	 * the md can't be deleted by device opener when the barrier request
-	 * completes.
-	 */
-	dm_get(md);
-
-	error = dm_rq_barrier(md);
-
-	rq = md->flush_request;
-	md->flush_request = NULL;
-
-	if (error == DM_ENDIO_REQUEUE) {
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_requeue_request(q, rq);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	} else
-		blk_end_request_all(rq, error);
-
-	blk_run_queue(q);
-
-	dm_put(md);
-}
-
 /*
  * Swap in a new table, returning the old one for the caller to destroy.
  */
@@ -2677,23 +2429,17 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	 *
 	 * To get all processes out of __split_and_process_bio in dm_request,
 	 * we take the write lock. To prevent any process from reentering
-	 * __split_and_process_bio from dm_request, we set
-	 * DMF_QUEUE_IO_TO_THREAD.
-	 *
-	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
-	 * and call flush_workqueue(md->wq). flush_workqueue will wait until
-	 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
-	 * further calls to __split_and_process_bio from dm_wq_work.
+	 * __split_and_process_bio from dm_request and quiesce the thread
+	 * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call
+	 * flush_workqueue(md->wq).
 	 */
 	down_write(&md->io_lock);
 	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
-	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
 	up_write(&md->io_lock);
 
 	/*
-	 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
-	 * can be kicked until md->queue is stopped. So stop md->queue before
-	 * flushing md->wq.
+	 * Stop md->queue before flushing md->wq in case request-based
+	 * dm defers requests to md->wq from md->queue.
 	 */
 	if (dm_request_based(md))
 		stop_queue(md->queue);
@@ -2772,7 +2518,6 @@ int dm_resume(struct mapped_device *md)
 
 	clear_bit(DMF_SUSPENDED, &md->flags);
 
-	dm_table_unplug_all(map);
 	r = 0;
 out:
 	dm_table_put(map);
@@ -2876,9 +2621,10 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
 {
 	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
+	unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
 
 	if (!pools)
 		return NULL;
@@ -2895,13 +2641,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
 	if (!pools->tio_pool)
 		goto free_io_pool_and_out;
 
-	pools->bs = (type == DM_TYPE_BIO_BASED) ?
-		bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
+	pools->bs = bioset_create(pool_size, 0);
 	if (!pools->bs)
 		goto free_tio_pool_and_out;
 
+	if (integrity && bioset_integrity_create(pools->bs, pool_size))
+		goto free_bioset_and_out;
+
 	return pools;
 
+free_bioset_and_out:
+	bioset_free(pools->bs);
+
 free_tio_pool_and_out:
 	mempool_destroy(pools->tio_pool);
 