author     Linus Torvalds <torvalds@linux-foundation.org>  2014-06-02 12:29:34 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-06-02 12:29:34 -0400
commit     681a2895486243a82547d8c9f53043eb54b53da0 (patch)
tree       464273280aed6db55a99cc0d8614d4393f94fc48
parent     6c52486dedbb30a1313da64945dcd686b4579c51 (diff)
parent     ed851860b4552fc8963ecf71eab9f6f7a5c19d74 (diff)
Merge branch 'for-3.16/core' of git://git.kernel.dk/linux-block into next
Pull block core updates from Jens Axboe:
 "It's a big(ish) round this time, lots of development effort has gone
  into blk-mq in the last 3 months.  Generally we're heading to where
  3.16 will be a feature complete and performant blk-mq.  scsi-mq is
  progressing nicely and will hopefully be in 3.17.  An NVMe port is in
  progress, and the Micron pci-e flash driver, mtip32xx, is converted
  and will be sent in with the driver pull request for 3.16.

  This pull request contains:

   - Lots of prep and support patches for scsi-mq have been integrated.
     All from Christoph.

   - API and code cleanups for blk-mq from Christoph.

   - Lots of good corner case and error handling cleanup fixes for
     blk-mq from Ming Lei.

   - A slew of blk-mq updates from me:

     * Provide strict mappings so that the driver can rely on the CPU
       to queue mapping.  This enables optimizations in the driver.

     * Provide bitmap tagging instead of percpu_ida, which never really
       worked well for blk-mq.  percpu_ida relies on the fact that we
       have a lot more tags available than we really need; it fails
       miserably for cases where we exhaust (or are close to
       exhausting) the tag space.

     * Provide sane support for shared tag maps, as utilized by scsi-mq.

     * Various fixes for IO timeouts.

     * API cleanups, and lots of perf tweaks and optimizations.

   - Remove 'buffer' from struct request.  This is ancient code, from
     when requests were always virtually mapped.  Kill it, to reclaim
     some space in struct request.  From me.

   - Remove 'magic' from blk_plug.  Since we store these on the stack
     and since we've never caught any actual bugs with this, let's just
     get rid of it.  From me.

   - Only call part_in_flight() once for IO completion, as it includes
     two atomic reads.  Hopefully we'll get a better implementation
     soon, as the part IO stats are now one of the more expensive parts
     of doing IO on blk-mq.  From me.

   - File migration of block code from {mm,fs}/ to block/.  This
     includes bio.c, bio-integrity.c, bounce.c, and ioprio.c.  From me,
     from a discussion on lkml.

  That should describe the meat of the pull request.  Also has various
  little fixes and cleanups from Dave Jones, Shaohua Li, Duan Jiong,
  Fengguang Wu, Fabian Frederick, Randy Dunlap, Robert Elliott, and Sam
  Bradshaw"

* 'for-3.16/core' of git://git.kernel.dk/linux-block: (100 commits)
  blk-mq: push IPI or local end_io decision to __blk_mq_complete_request()
  blk-mq: remember to start timeout handler for direct queue
  block: ensure that the timer is always added
  blk-mq: blk_mq_unregister_hctx() can be static
  blk-mq: make the sysfs mq/ layout reflect current mappings
  blk-mq: blk_mq_tag_to_rq should handle flush request
  block: remove dead code in scsi_ioctl:blk_verify_command
  blk-mq: request initialization optimizations
  block: add queue flag for disabling SG merging
  block: remove 'magic' from struct blk_plug
  blk-mq: remove alloc_hctx and free_hctx methods
  blk-mq: add file comments and update copyright notices
  blk-mq: remove blk_mq_alloc_request_pinned
  blk-mq: do not use blk_mq_alloc_request_pinned in blk_mq_map_request
  blk-mq: remove blk_mq_wait_for_tags
  blk-mq: initialize request in __blk_mq_alloc_request
  blk-mq: merge blk_mq_alloc_reserved_request into blk_mq_alloc_request
  blk-mq: add helper to insert requests from irq context
  blk-mq: remove stale comment for blk_mq_complete_request()
  blk-mq: allow non-softirq completions
  ...
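The bitmap tagging change described above is the core of the blk-mq rework in
this pull.  As a rough illustration of the idea, here is a minimal user-space
sketch (hypothetical code, not the kernel implementation: the names get_tag
and put_tag, the single-word tag_word map, and TAG_DEPTH are invented for
brevity).  A tag is just a bit, cleared means free and set means busy, and
each submitter keeps a cached starting offset so concurrent callers tend to
probe different bits instead of contending on one shared counter.

    /*
     * Minimal sketch of bitmap tag allocation (assumption: a single word
     * of tags; the kernel code uses an array of cache-line-aligned words).
     */
    #include <stdatomic.h>

    #define TAG_DEPTH (8 * sizeof(unsigned long))

    static _Atomic unsigned long tag_word;      /* one bit per tag */

    static int get_tag(unsigned int *last_tag)
    {
            unsigned int i, tag;

            for (i = 0; i < TAG_DEPTH; i++) {
                    /* start probing at the caller's cached hint */
                    tag = (*last_tag + i) % TAG_DEPTH;
                    /* atomically claim the bit; move on if it was already set */
                    if (!(atomic_fetch_or(&tag_word, 1UL << tag) & (1UL << tag))) {
                            *last_tag = tag + 1;  /* remember where to start next time */
                            return (int)tag;
                    }
            }
            /* tag space exhausted; the real code sleeps on a wait queue */
            return -1;
    }

    static void put_tag(unsigned int tag)
    {
            /* clearing the bit frees the tag */
            atomic_fetch_and(&tag_word, ~(1UL << tag));
    }

The implementation added in block/blk-mq-tag.c further down spreads the bits
over multiple cache-line-aligned words (struct blk_align_bitmap) and adds
rolling wait queues for the out-of-tags case, but the allocate/free core is
the same find-a-zero-bit-and-set / clear-and-wake pattern.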
-rw-r--r--  Documentation/DocBook/filesystems.tmpl | 2
-rw-r--r--  block/Makefile | 7
-rw-r--r--  block/bio-integrity.c (renamed from fs/bio-integrity.c) | 2
-rw-r--r--  block/bio.c (renamed from fs/bio.c) | 11
-rw-r--r--  block/blk-core.c | 113
-rw-r--r--  block/blk-flush.c | 40
-rw-r--r--  block/blk-iopoll.c | 4
-rw-r--r--  block/blk-lib.c | 4
-rw-r--r--  block/blk-map.c | 3
-rw-r--r--  block/blk-merge.c | 28
-rw-r--r--  block/blk-mq-cpu.c | 17
-rw-r--r--  block/blk-mq-cpumap.c | 27
-rw-r--r--  block/blk-mq-sysfs.c | 160
-rw-r--r--  block/blk-mq-tag.c | 561
-rw-r--r--  block/blk-mq-tag.h | 71
-rw-r--r--  block/blk-mq.c | 1415
-rw-r--r--  block/blk-mq.h | 32
-rw-r--r--  block/blk-sysfs.c | 47
-rw-r--r--  block/blk-throttle.c | 10
-rw-r--r--  block/blk-timeout.c | 60
-rw-r--r--  block/blk.h | 9
-rw-r--r--  block/bounce.c (renamed from mm/bounce.c) | 0
-rw-r--r--  block/cfq-iosched.c | 4
-rw-r--r--  block/ioprio.c (renamed from fs/ioprio.c) | 0
-rw-r--r--  block/scsi_ioctl.c | 4
-rw-r--r--  drivers/block/amiflop.c | 2
-rw-r--r--  drivers/block/ataflop.c | 2
-rw-r--r--  drivers/block/floppy.c | 18
-rw-r--r--  drivers/block/hd.c | 10
-rw-r--r--  drivers/block/mg_disk.c | 12
-rw-r--r--  drivers/block/null_blk.c | 117
-rw-r--r--  drivers/block/paride/pcd.c | 2
-rw-r--r--  drivers/block/paride/pd.c | 4
-rw-r--r--  drivers/block/paride/pf.c | 4
-rw-r--r--  drivers/block/skd_main.c | 5
-rw-r--r--  drivers/block/swim.c | 2
-rw-r--r--  drivers/block/swim3.c | 6
-rw-r--r--  drivers/block/virtio_blk.c | 75
-rw-r--r--  drivers/block/xen-blkfront.c | 4
-rw-r--r--  drivers/block/xsysace.c | 4
-rw-r--r--  drivers/block/z2ram.c | 6
-rw-r--r--  drivers/cdrom/gdrom.c | 2
-rw-r--r--  drivers/char/random.c | 1
-rw-r--r--  drivers/ide/ide-disk.c | 5
-rw-r--r--  drivers/md/dm.c | 1
-rw-r--r--  drivers/mtd/mtd_blkdevs.c | 3
-rw-r--r--  drivers/mtd/ubi/block.c | 2
-rw-r--r--  drivers/sbus/char/jsflash.c | 2
-rw-r--r--  drivers/scsi/scsi_lib.c | 5
-rw-r--r--  drivers/scsi/sd.c | 13
-rw-r--r--  fs/Makefile | 3
-rw-r--r--  include/linux/bio.h | 2
-rw-r--r--  include/linux/blk-mq.h | 101
-rw-r--r--  include/linux/blk_types.h | 2
-rw-r--r--  include/linux/blkdev.h | 27
-rw-r--r--  mm/Makefile | 1
56 files changed, 2088 insertions, 986 deletions
diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl
index 4f676838da06..bcdfdb9a9277 100644
--- a/Documentation/DocBook/filesystems.tmpl
+++ b/Documentation/DocBook/filesystems.tmpl
@@ -62,7 +62,7 @@
62!Efs/mpage.c 62!Efs/mpage.c
63!Efs/namei.c 63!Efs/namei.c
64!Efs/buffer.c 64!Efs/buffer.c
65!Efs/bio.c 65!Eblock/bio.c
66!Efs/seq_file.c 66!Efs/seq_file.c
67!Efs/filesystems.c 67!Efs/filesystems.c
68!Efs/fs-writeback.c 68!Efs/fs-writeback.c
diff --git a/block/Makefile b/block/Makefile
index 20645e88fb57..a2ce6ac935ec 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -2,13 +2,15 @@
2# Makefile for the kernel block layer 2# Makefile for the kernel block layer
3# 3#
4 4
5obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ 5obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ 6 blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ 7 blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
8 blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ 8 blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
9 blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ 9 blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
10 genhd.o scsi_ioctl.o partition-generic.o partitions/ 10 genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
11 partitions/
11 12
13obj-$(CONFIG_BOUNCE) += bounce.o
12obj-$(CONFIG_BLK_DEV_BSG) += bsg.o 14obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
13obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o 15obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
14obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o 16obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
@@ -20,3 +22,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
20obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o 22obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
21obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o 23obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
22obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o 24obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
25obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
diff --git a/fs/bio-integrity.c b/block/bio-integrity.c
index 1c2ce0c87711..9e241063a616 100644
--- a/fs/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -617,7 +617,7 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
617 if (!bs->bio_integrity_pool) 617 if (!bs->bio_integrity_pool)
618 return -1; 618 return -1;
619 619
620 bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); 620 bs->bvec_integrity_pool = biovec_create_pool(pool_size);
621 if (!bs->bvec_integrity_pool) { 621 if (!bs->bvec_integrity_pool) {
622 mempool_destroy(bs->bio_integrity_pool); 622 mempool_destroy(bs->bio_integrity_pool);
623 return -1; 623 return -1;
diff --git a/fs/bio.c b/block/bio.c
index 6f0362b77806..96d28eee8a1e 100644
--- a/fs/bio.c
+++ b/block/bio.c
@@ -305,6 +305,8 @@ static void bio_chain_endio(struct bio *bio, int error)
305 305
306/** 306/**
307 * bio_chain - chain bio completions 307 * bio_chain - chain bio completions
308 * @bio: the target bio
309 * @parent: the @bio's parent bio
308 * 310 *
309 * The caller won't have a bi_end_io called when @bio completes - instead, 311 * The caller won't have a bi_end_io called when @bio completes - instead,
310 * @parent's bi_end_io won't be called until both @parent and @bio have 312 * @parent's bi_end_io won't be called until both @parent and @bio have
@@ -1011,8 +1013,7 @@ static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
1011 bio->bi_private = bmd; 1013 bio->bi_private = bmd;
1012} 1014}
1013 1015
1014static struct bio_map_data *bio_alloc_map_data(int nr_segs, 1016static struct bio_map_data *bio_alloc_map_data(unsigned int iov_count,
1015 unsigned int iov_count,
1016 gfp_t gfp_mask) 1017 gfp_t gfp_mask)
1017{ 1018{
1018 if (iov_count > UIO_MAXIOV) 1019 if (iov_count > UIO_MAXIOV)
@@ -1154,7 +1155,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
1154 if (offset) 1155 if (offset)
1155 nr_pages++; 1156 nr_pages++;
1156 1157
1157 bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); 1158 bmd = bio_alloc_map_data(iov_count, gfp_mask);
1158 if (!bmd) 1159 if (!bmd)
1159 return ERR_PTR(-ENOMEM); 1160 return ERR_PTR(-ENOMEM);
1160 1161
@@ -1859,7 +1860,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
1859 * create memory pools for biovec's in a bio_set. 1860 * create memory pools for biovec's in a bio_set.
1860 * use the global biovec slabs created for general use. 1861 * use the global biovec slabs created for general use.
1861 */ 1862 */
1862mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) 1863mempool_t *biovec_create_pool(int pool_entries)
1863{ 1864{
1864 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; 1865 struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
1865 1866
@@ -1922,7 +1923,7 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1922 if (!bs->bio_pool) 1923 if (!bs->bio_pool)
1923 goto bad; 1924 goto bad;
1924 1925
1925 bs->bvec_pool = biovec_create_pool(bs, pool_size); 1926 bs->bvec_pool = biovec_create_pool(pool_size);
1926 if (!bs->bvec_pool) 1927 if (!bs->bvec_pool)
1927 goto bad; 1928 goto bad;
1928 1929
diff --git a/block/blk-core.c b/block/blk-core.c
index a0e3096c4bb5..40d654861c33 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -146,8 +146,8 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
146 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", 146 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
147 (unsigned long long)blk_rq_pos(rq), 147 (unsigned long long)blk_rq_pos(rq),
148 blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); 148 blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
149 printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", 149 printk(KERN_INFO " bio %p, biotail %p, len %u\n",
150 rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); 150 rq->bio, rq->biotail, blk_rq_bytes(rq));
151 151
152 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 152 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
153 printk(KERN_INFO " cdb: "); 153 printk(KERN_INFO " cdb: ");
@@ -251,8 +251,10 @@ void blk_sync_queue(struct request_queue *q)
251 struct blk_mq_hw_ctx *hctx; 251 struct blk_mq_hw_ctx *hctx;
252 int i; 252 int i;
253 253
254 queue_for_each_hw_ctx(q, hctx, i) 254 queue_for_each_hw_ctx(q, hctx, i) {
255 cancel_delayed_work_sync(&hctx->delayed_work); 255 cancel_delayed_work_sync(&hctx->run_work);
256 cancel_delayed_work_sync(&hctx->delay_work);
257 }
256 } else { 258 } else {
257 cancel_delayed_work_sync(&q->delay_work); 259 cancel_delayed_work_sync(&q->delay_work);
258 } 260 }
@@ -574,12 +576,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
574 if (!q) 576 if (!q)
575 return NULL; 577 return NULL;
576 578
577 if (percpu_counter_init(&q->mq_usage_counter, 0))
578 goto fail_q;
579
580 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); 579 q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
581 if (q->id < 0) 580 if (q->id < 0)
582 goto fail_c; 581 goto fail_q;
583 582
584 q->backing_dev_info.ra_pages = 583 q->backing_dev_info.ra_pages =
585 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 584 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@@ -637,8 +636,6 @@ fail_bdi:
637 bdi_destroy(&q->backing_dev_info); 636 bdi_destroy(&q->backing_dev_info);
638fail_id: 637fail_id:
639 ida_simple_remove(&blk_queue_ida, q->id); 638 ida_simple_remove(&blk_queue_ida, q->id);
640fail_c:
641 percpu_counter_destroy(&q->mq_usage_counter);
642fail_q: 639fail_q:
643 kmem_cache_free(blk_requestq_cachep, q); 640 kmem_cache_free(blk_requestq_cachep, q);
644 return NULL; 641 return NULL;
@@ -846,6 +843,47 @@ static void freed_request(struct request_list *rl, unsigned int flags)
846 __freed_request(rl, sync ^ 1); 843 __freed_request(rl, sync ^ 1);
847} 844}
848 845
846int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
847{
848 struct request_list *rl;
849
850 spin_lock_irq(q->queue_lock);
851 q->nr_requests = nr;
852 blk_queue_congestion_threshold(q);
853
854 /* congestion isn't cgroup aware and follows root blkcg for now */
855 rl = &q->root_rl;
856
857 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
858 blk_set_queue_congested(q, BLK_RW_SYNC);
859 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
860 blk_clear_queue_congested(q, BLK_RW_SYNC);
861
862 if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
863 blk_set_queue_congested(q, BLK_RW_ASYNC);
864 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
865 blk_clear_queue_congested(q, BLK_RW_ASYNC);
866
867 blk_queue_for_each_rl(rl, q) {
868 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
869 blk_set_rl_full(rl, BLK_RW_SYNC);
870 } else {
871 blk_clear_rl_full(rl, BLK_RW_SYNC);
872 wake_up(&rl->wait[BLK_RW_SYNC]);
873 }
874
875 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
876 blk_set_rl_full(rl, BLK_RW_ASYNC);
877 } else {
878 blk_clear_rl_full(rl, BLK_RW_ASYNC);
879 wake_up(&rl->wait[BLK_RW_ASYNC]);
880 }
881 }
882
883 spin_unlock_irq(q->queue_lock);
884 return 0;
885}
886
849/* 887/*
850 * Determine if elevator data should be initialized when allocating the 888 * Determine if elevator data should be initialized when allocating the
851 * request associated with @bio. 889 * request associated with @bio.
@@ -1135,7 +1173,7 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
1135struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 1173struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
1136{ 1174{
1137 if (q->mq_ops) 1175 if (q->mq_ops)
1138 return blk_mq_alloc_request(q, rw, gfp_mask); 1176 return blk_mq_alloc_request(q, rw, gfp_mask, false);
1139 else 1177 else
1140 return blk_old_get_request(q, rw, gfp_mask); 1178 return blk_old_get_request(q, rw, gfp_mask);
1141} 1179}
@@ -1231,12 +1269,15 @@ static void add_acct_request(struct request_queue *q, struct request *rq,
1231static void part_round_stats_single(int cpu, struct hd_struct *part, 1269static void part_round_stats_single(int cpu, struct hd_struct *part,
1232 unsigned long now) 1270 unsigned long now)
1233{ 1271{
1272 int inflight;
1273
1234 if (now == part->stamp) 1274 if (now == part->stamp)
1235 return; 1275 return;
1236 1276
1237 if (part_in_flight(part)) { 1277 inflight = part_in_flight(part);
1278 if (inflight) {
1238 __part_stat_add(cpu, part, time_in_queue, 1279 __part_stat_add(cpu, part, time_in_queue,
1239 part_in_flight(part) * (now - part->stamp)); 1280 inflight * (now - part->stamp));
1240 __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); 1281 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1241 } 1282 }
1242 part->stamp = now; 1283 part->stamp = now;
@@ -1360,7 +1401,6 @@ void blk_add_request_payload(struct request *rq, struct page *page,
1360 1401
1361 rq->__data_len = rq->resid_len = len; 1402 rq->__data_len = rq->resid_len = len;
1362 rq->nr_phys_segments = 1; 1403 rq->nr_phys_segments = 1;
1363 rq->buffer = bio_data(bio);
1364} 1404}
1365EXPORT_SYMBOL_GPL(blk_add_request_payload); 1405EXPORT_SYMBOL_GPL(blk_add_request_payload);
1366 1406
@@ -1402,12 +1442,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1402 bio->bi_next = req->bio; 1442 bio->bi_next = req->bio;
1403 req->bio = bio; 1443 req->bio = bio;
1404 1444
1405 /*
1406 * may not be valid. if the low level driver said
1407 * it didn't need a bounce buffer then it better
1408 * not touch req->buffer either...
1409 */
1410 req->buffer = bio_data(bio);
1411 req->__sector = bio->bi_iter.bi_sector; 1445 req->__sector = bio->bi_iter.bi_sector;
1412 req->__data_len += bio->bi_iter.bi_size; 1446 req->__data_len += bio->bi_iter.bi_size;
1413 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1447 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
@@ -1432,6 +1466,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
1432 * added on the elevator at this point. In addition, we don't have 1466 * added on the elevator at this point. In addition, we don't have
1433 * reliable access to the elevator outside queue lock. Only check basic 1467 * reliable access to the elevator outside queue lock. Only check basic
1434 * merging parameters without querying the elevator. 1468 * merging parameters without querying the elevator.
1469 *
1470 * Caller must ensure !blk_queue_nomerges(q) beforehand.
1435 */ 1471 */
1436bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 1472bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1437 unsigned int *request_count) 1473 unsigned int *request_count)
@@ -1441,9 +1477,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
1441 bool ret = false; 1477 bool ret = false;
1442 struct list_head *plug_list; 1478 struct list_head *plug_list;
1443 1479
1444 if (blk_queue_nomerges(q))
1445 goto out;
1446
1447 plug = current->plug; 1480 plug = current->plug;
1448 if (!plug) 1481 if (!plug)
1449 goto out; 1482 goto out;
@@ -1522,7 +1555,8 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
1522 * Check if we can merge with the plugged list before grabbing 1555 * Check if we can merge with the plugged list before grabbing
1523 * any locks. 1556 * any locks.
1524 */ 1557 */
1525 if (blk_attempt_plug_merge(q, bio, &request_count)) 1558 if (!blk_queue_nomerges(q) &&
1559 blk_attempt_plug_merge(q, bio, &request_count))
1526 return; 1560 return;
1527 1561
1528 spin_lock_irq(q->queue_lock); 1562 spin_lock_irq(q->queue_lock);
@@ -1654,7 +1688,7 @@ static int __init fail_make_request_debugfs(void)
1654 struct dentry *dir = fault_create_debugfs_attr("fail_make_request", 1688 struct dentry *dir = fault_create_debugfs_attr("fail_make_request",
1655 NULL, &fail_make_request); 1689 NULL, &fail_make_request);
1656 1690
1657 return IS_ERR(dir) ? PTR_ERR(dir) : 0; 1691 return PTR_ERR_OR_ZERO(dir);
1658} 1692}
1659 1693
1660late_initcall(fail_make_request_debugfs); 1694late_initcall(fail_make_request_debugfs);
@@ -2434,7 +2468,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2434 } 2468 }
2435 2469
2436 req->__data_len -= total_bytes; 2470 req->__data_len -= total_bytes;
2437 req->buffer = bio_data(req->bio);
2438 2471
2439 /* update sector only for requests with clear definition of sector */ 2472 /* update sector only for requests with clear definition of sector */
2440 if (req->cmd_type == REQ_TYPE_FS) 2473 if (req->cmd_type == REQ_TYPE_FS)
@@ -2503,7 +2536,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request);
2503/* 2536/*
2504 * queue lock must be held 2537 * queue lock must be held
2505 */ 2538 */
2506static void blk_finish_request(struct request *req, int error) 2539void blk_finish_request(struct request *req, int error)
2507{ 2540{
2508 if (blk_rq_tagged(req)) 2541 if (blk_rq_tagged(req))
2509 blk_queue_end_tag(req->q, req); 2542 blk_queue_end_tag(req->q, req);
@@ -2529,6 +2562,7 @@ static void blk_finish_request(struct request *req, int error)
2529 __blk_put_request(req->q, req); 2562 __blk_put_request(req->q, req);
2530 } 2563 }
2531} 2564}
2565EXPORT_SYMBOL(blk_finish_request);
2532 2566
2533/** 2567/**
2534 * blk_end_bidi_request - Complete a bidi request 2568 * blk_end_bidi_request - Complete a bidi request
@@ -2752,10 +2786,9 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2752 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ 2786 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
2753 rq->cmd_flags |= bio->bi_rw & REQ_WRITE; 2787 rq->cmd_flags |= bio->bi_rw & REQ_WRITE;
2754 2788
2755 if (bio_has_data(bio)) { 2789 if (bio_has_data(bio))
2756 rq->nr_phys_segments = bio_phys_segments(q, bio); 2790 rq->nr_phys_segments = bio_phys_segments(q, bio);
2757 rq->buffer = bio_data(bio); 2791
2758 }
2759 rq->__data_len = bio->bi_iter.bi_size; 2792 rq->__data_len = bio->bi_iter.bi_size;
2760 rq->bio = rq->biotail = bio; 2793 rq->bio = rq->biotail = bio;
2761 2794
@@ -2831,7 +2864,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2831 2864
2832/* 2865/*
2833 * Copy attributes of the original request to the clone request. 2866 * Copy attributes of the original request to the clone request.
2834 * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. 2867 * The actual data parts (e.g. ->cmd, ->sense) are not copied.
2835 */ 2868 */
2836static void __blk_rq_prep_clone(struct request *dst, struct request *src) 2869static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2837{ 2870{
@@ -2857,7 +2890,7 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2857 * 2890 *
2858 * Description: 2891 * Description:
2859 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. 2892 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
2860 * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense) 2893 * The actual data parts of @rq_src (e.g. ->cmd, ->sense)
2861 * are not copied, and copying such parts is the caller's responsibility. 2894 * are not copied, and copying such parts is the caller's responsibility.
2862 * Also, pages which the original bios are pointing to are not copied 2895 * Also, pages which the original bios are pointing to are not copied
2863 * and the cloned bios just point same pages. 2896 * and the cloned bios just point same pages.
@@ -2904,20 +2937,25 @@ free_and_out:
2904} 2937}
2905EXPORT_SYMBOL_GPL(blk_rq_prep_clone); 2938EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
2906 2939
2907int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) 2940int kblockd_schedule_work(struct work_struct *work)
2908{ 2941{
2909 return queue_work(kblockd_workqueue, work); 2942 return queue_work(kblockd_workqueue, work);
2910} 2943}
2911EXPORT_SYMBOL(kblockd_schedule_work); 2944EXPORT_SYMBOL(kblockd_schedule_work);
2912 2945
2913int kblockd_schedule_delayed_work(struct request_queue *q, 2946int kblockd_schedule_delayed_work(struct delayed_work *dwork,
2914 struct delayed_work *dwork, unsigned long delay) 2947 unsigned long delay)
2915{ 2948{
2916 return queue_delayed_work(kblockd_workqueue, dwork, delay); 2949 return queue_delayed_work(kblockd_workqueue, dwork, delay);
2917} 2950}
2918EXPORT_SYMBOL(kblockd_schedule_delayed_work); 2951EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2919 2952
2920#define PLUG_MAGIC 0x91827364 2953int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
2954 unsigned long delay)
2955{
2956 return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
2957}
2958EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
2921 2959
2922/** 2960/**
2923 * blk_start_plug - initialize blk_plug and track it inside the task_struct 2961 * blk_start_plug - initialize blk_plug and track it inside the task_struct
@@ -2937,7 +2975,6 @@ void blk_start_plug(struct blk_plug *plug)
2937{ 2975{
2938 struct task_struct *tsk = current; 2976 struct task_struct *tsk = current;
2939 2977
2940 plug->magic = PLUG_MAGIC;
2941 INIT_LIST_HEAD(&plug->list); 2978 INIT_LIST_HEAD(&plug->list);
2942 INIT_LIST_HEAD(&plug->mq_list); 2979 INIT_LIST_HEAD(&plug->mq_list);
2943 INIT_LIST_HEAD(&plug->cb_list); 2980 INIT_LIST_HEAD(&plug->cb_list);
@@ -3034,8 +3071,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
3034 LIST_HEAD(list); 3071 LIST_HEAD(list);
3035 unsigned int depth; 3072 unsigned int depth;
3036 3073
3037 BUG_ON(plug->magic != PLUG_MAGIC);
3038
3039 flush_plug_callbacks(plug, from_schedule); 3074 flush_plug_callbacks(plug, from_schedule);
3040 3075
3041 if (!list_empty(&plug->mq_list)) 3076 if (!list_empty(&plug->mq_list))
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 43e6b4755e9a..ff87c664b7df 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,21 +130,13 @@ static void blk_flush_restore_request(struct request *rq)
130 blk_clear_rq_complete(rq); 130 blk_clear_rq_complete(rq);
131} 131}
132 132
133static void mq_flush_run(struct work_struct *work)
134{
135 struct request *rq;
136
137 rq = container_of(work, struct request, mq_flush_work);
138
139 memset(&rq->csd, 0, sizeof(rq->csd));
140 blk_mq_insert_request(rq, false, true, false);
141}
142
143static bool blk_flush_queue_rq(struct request *rq, bool add_front) 133static bool blk_flush_queue_rq(struct request *rq, bool add_front)
144{ 134{
145 if (rq->q->mq_ops) { 135 if (rq->q->mq_ops) {
146 INIT_WORK(&rq->mq_flush_work, mq_flush_run); 136 struct request_queue *q = rq->q;
147 kblockd_schedule_work(rq->q, &rq->mq_flush_work); 137
138 blk_mq_add_to_requeue_list(rq, add_front);
139 blk_mq_kick_requeue_list(q);
148 return false; 140 return false;
149 } else { 141 } else {
150 if (add_front) 142 if (add_front)
@@ -231,8 +223,10 @@ static void flush_end_io(struct request *flush_rq, int error)
231 struct request *rq, *n; 223 struct request *rq, *n;
232 unsigned long flags = 0; 224 unsigned long flags = 0;
233 225
234 if (q->mq_ops) 226 if (q->mq_ops) {
235 spin_lock_irqsave(&q->mq_flush_lock, flags); 227 spin_lock_irqsave(&q->mq_flush_lock, flags);
228 q->flush_rq->cmd_flags = 0;
229 }
236 230
237 running = &q->flush_queue[q->flush_running_idx]; 231 running = &q->flush_queue[q->flush_running_idx];
238 BUG_ON(q->flush_pending_idx == q->flush_running_idx); 232 BUG_ON(q->flush_pending_idx == q->flush_running_idx);
@@ -306,23 +300,9 @@ static bool blk_kick_flush(struct request_queue *q)
306 */ 300 */
307 q->flush_pending_idx ^= 1; 301 q->flush_pending_idx ^= 1;
308 302
309 if (q->mq_ops) { 303 blk_rq_init(q, q->flush_rq);
310 struct blk_mq_ctx *ctx = first_rq->mq_ctx; 304 if (q->mq_ops)
311 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); 305 blk_mq_clone_flush_request(q->flush_rq, first_rq);
312
313 blk_mq_rq_init(hctx, q->flush_rq);
314 q->flush_rq->mq_ctx = ctx;
315
316 /*
317 * Reuse the tag value from the fist waiting request,
318 * with blk-mq the tag is generated during request
319 * allocation and drivers can rely on it being inside
320 * the range they asked for.
321 */
322 q->flush_rq->tag = first_rq->tag;
323 } else {
324 blk_rq_init(q, q->flush_rq);
325 }
326 306
327 q->flush_rq->cmd_type = REQ_TYPE_FS; 307 q->flush_rq->cmd_type = REQ_TYPE_FS;
328 q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; 308 q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
index c11d24e379e2..d828b44a404b 100644
--- a/block/blk-iopoll.c
+++ b/block/blk-iopoll.c
@@ -64,12 +64,12 @@ EXPORT_SYMBOL(__blk_iopoll_complete);
64 * iopoll handler will not be invoked again before blk_iopoll_sched_prep() 64 * iopoll handler will not be invoked again before blk_iopoll_sched_prep()
65 * is called. 65 * is called.
66 **/ 66 **/
67void blk_iopoll_complete(struct blk_iopoll *iopoll) 67void blk_iopoll_complete(struct blk_iopoll *iop)
68{ 68{
69 unsigned long flags; 69 unsigned long flags;
70 70
71 local_irq_save(flags); 71 local_irq_save(flags);
72 __blk_iopoll_complete(iopoll); 72 __blk_iopoll_complete(iop);
73 local_irq_restore(flags); 73 local_irq_restore(flags);
74} 74}
75EXPORT_SYMBOL(blk_iopoll_complete); 75EXPORT_SYMBOL(blk_iopoll_complete);
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 97a733cf3d5f..8411be3c19d3 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -226,8 +226,8 @@ EXPORT_SYMBOL(blkdev_issue_write_same);
226 * Generate and issue number of bios with zerofiled pages. 226 * Generate and issue number of bios with zerofiled pages.
227 */ 227 */
228 228
229int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 229static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
230 sector_t nr_sects, gfp_t gfp_mask) 230 sector_t nr_sects, gfp_t gfp_mask)
231{ 231{
232 int ret; 232 int ret;
233 struct bio *bio; 233 struct bio *bio;
diff --git a/block/blk-map.c b/block/blk-map.c
index f7b22bc21518..f890d4345b0c 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -155,7 +155,6 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
155 if (!bio_flagged(bio, BIO_USER_MAPPED)) 155 if (!bio_flagged(bio, BIO_USER_MAPPED))
156 rq->cmd_flags |= REQ_COPY_USER; 156 rq->cmd_flags |= REQ_COPY_USER;
157 157
158 rq->buffer = NULL;
159 return 0; 158 return 0;
160unmap_rq: 159unmap_rq:
161 blk_rq_unmap_user(bio); 160 blk_rq_unmap_user(bio);
@@ -238,7 +237,6 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
238 blk_queue_bounce(q, &bio); 237 blk_queue_bounce(q, &bio);
239 bio_get(bio); 238 bio_get(bio);
240 blk_rq_bio_prep(q, rq, bio); 239 blk_rq_bio_prep(q, rq, bio);
241 rq->buffer = NULL;
242 return 0; 240 return 0;
243} 241}
244EXPORT_SYMBOL(blk_rq_map_user_iov); 242EXPORT_SYMBOL(blk_rq_map_user_iov);
@@ -325,7 +323,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
325 } 323 }
326 324
327 blk_queue_bounce(q, &rq->bio); 325 blk_queue_bounce(q, &rq->bio);
328 rq->buffer = NULL;
329 return 0; 326 return 0;
330} 327}
331EXPORT_SYMBOL(blk_rq_map_kern); 328EXPORT_SYMBOL(blk_rq_map_kern);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6c583f9c5b65..b3bf0df0f4c2 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -13,7 +13,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
13 struct bio *bio) 13 struct bio *bio)
14{ 14{
15 struct bio_vec bv, bvprv = { NULL }; 15 struct bio_vec bv, bvprv = { NULL };
16 int cluster, high, highprv = 1; 16 int cluster, high, highprv = 1, no_sg_merge;
17 unsigned int seg_size, nr_phys_segs; 17 unsigned int seg_size, nr_phys_segs;
18 struct bio *fbio, *bbio; 18 struct bio *fbio, *bbio;
19 struct bvec_iter iter; 19 struct bvec_iter iter;
@@ -35,12 +35,21 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
35 cluster = blk_queue_cluster(q); 35 cluster = blk_queue_cluster(q);
36 seg_size = 0; 36 seg_size = 0;
37 nr_phys_segs = 0; 37 nr_phys_segs = 0;
38 no_sg_merge = test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags);
39 high = 0;
38 for_each_bio(bio) { 40 for_each_bio(bio) {
39 bio_for_each_segment(bv, bio, iter) { 41 bio_for_each_segment(bv, bio, iter) {
40 /* 42 /*
43 * If SG merging is disabled, each bio vector is
44 * a segment
45 */
46 if (no_sg_merge)
47 goto new_segment;
48
49 /*
41 * the trick here is making sure that a high page is 50 * the trick here is making sure that a high page is
42 * never considered part of another segment, since that 51 * never considered part of another segment, since
43 * might change with the bounce page. 52 * that might change with the bounce page.
44 */ 53 */
45 high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); 54 high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
46 if (!high && !highprv && cluster) { 55 if (!high && !highprv && cluster) {
@@ -84,11 +93,16 @@ void blk_recalc_rq_segments(struct request *rq)
84 93
85void blk_recount_segments(struct request_queue *q, struct bio *bio) 94void blk_recount_segments(struct request_queue *q, struct bio *bio)
86{ 95{
87 struct bio *nxt = bio->bi_next; 96 if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags))
97 bio->bi_phys_segments = bio->bi_vcnt;
98 else {
99 struct bio *nxt = bio->bi_next;
100
101 bio->bi_next = NULL;
102 bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
103 bio->bi_next = nxt;
104 }
88 105
89 bio->bi_next = NULL;
90 bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
91 bio->bi_next = nxt;
92 bio->bi_flags |= (1 << BIO_SEG_VALID); 106 bio->bi_flags |= (1 << BIO_SEG_VALID);
93} 107}
94EXPORT_SYMBOL(blk_recount_segments); 108EXPORT_SYMBOL(blk_recount_segments);
diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
index 136ef8643bba..bb3ed488f7b5 100644
--- a/block/blk-mq-cpu.c
+++ b/block/blk-mq-cpu.c
@@ -1,3 +1,8 @@
1/*
2 * CPU notifier helper code for blk-mq
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 */
1#include <linux/kernel.h> 6#include <linux/kernel.h>
2#include <linux/module.h> 7#include <linux/module.h>
3#include <linux/init.h> 8#include <linux/init.h>
@@ -18,14 +23,18 @@ static int blk_mq_main_cpu_notify(struct notifier_block *self,
18{ 23{
19 unsigned int cpu = (unsigned long) hcpu; 24 unsigned int cpu = (unsigned long) hcpu;
20 struct blk_mq_cpu_notifier *notify; 25 struct blk_mq_cpu_notifier *notify;
26 int ret = NOTIFY_OK;
21 27
22 raw_spin_lock(&blk_mq_cpu_notify_lock); 28 raw_spin_lock(&blk_mq_cpu_notify_lock);
23 29
24 list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) 30 list_for_each_entry(notify, &blk_mq_cpu_notify_list, list) {
25 notify->notify(notify->data, action, cpu); 31 ret = notify->notify(notify->data, action, cpu);
32 if (ret != NOTIFY_OK)
33 break;
34 }
26 35
27 raw_spin_unlock(&blk_mq_cpu_notify_lock); 36 raw_spin_unlock(&blk_mq_cpu_notify_lock);
28 return NOTIFY_OK; 37 return ret;
29} 38}
30 39
31void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier) 40void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
@@ -45,7 +54,7 @@ void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
45} 54}
46 55
47void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 56void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
48 void (*fn)(void *, unsigned long, unsigned int), 57 int (*fn)(void *, unsigned long, unsigned int),
49 void *data) 58 void *data)
50{ 59{
51 notifier->notify = fn; 60 notifier->notify = fn;
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 097921329619..1065d7c65fa1 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -1,3 +1,8 @@
1/*
2 * CPU <-> hardware queue mapping helpers
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 */
1#include <linux/kernel.h> 6#include <linux/kernel.h>
2#include <linux/threads.h> 7#include <linux/threads.h>
3#include <linux/module.h> 8#include <linux/module.h>
@@ -80,19 +85,35 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
80 return 0; 85 return 0;
81} 86}
82 87
83unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg) 88unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
84{ 89{
85 unsigned int *map; 90 unsigned int *map;
86 91
87 /* If cpus are offline, map them to first hctx */ 92 /* If cpus are offline, map them to first hctx */
88 map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, 93 map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
89 reg->numa_node); 94 set->numa_node);
90 if (!map) 95 if (!map)
91 return NULL; 96 return NULL;
92 97
93 if (!blk_mq_update_queue_map(map, reg->nr_hw_queues)) 98 if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
94 return map; 99 return map;
95 100
96 kfree(map); 101 kfree(map);
97 return NULL; 102 return NULL;
98} 103}
104
105/*
106 * We have no quick way of doing reverse lookups. This is only used at
107 * queue init time, so runtime isn't important.
108 */
109int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
110{
111 int i;
112
113 for_each_possible_cpu(i) {
114 if (index == mq_map[i])
115 return cpu_to_node(i);
116 }
117
118 return NUMA_NO_NODE;
119}
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index b0ba264b0522..ed5217867555 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -203,59 +203,24 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
203 return ret; 203 return ret;
204} 204}
205 205
206static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page) 206static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
207{
208 ssize_t ret;
209
210 spin_lock(&hctx->lock);
211 ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI));
212 spin_unlock(&hctx->lock);
213
214 return ret;
215}
216
217static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx,
218 const char *page, size_t len)
219{ 207{
220 struct blk_mq_ctx *ctx; 208 return blk_mq_tag_sysfs_show(hctx->tags, page);
221 unsigned long ret;
222 unsigned int i;
223
224 if (kstrtoul(page, 10, &ret)) {
225 pr_err("blk-mq-sysfs: invalid input '%s'\n", page);
226 return -EINVAL;
227 }
228
229 spin_lock(&hctx->lock);
230 if (ret)
231 hctx->flags |= BLK_MQ_F_SHOULD_IPI;
232 else
233 hctx->flags &= ~BLK_MQ_F_SHOULD_IPI;
234 spin_unlock(&hctx->lock);
235
236 hctx_for_each_ctx(hctx, ctx, i)
237 ctx->ipi_redirect = !!ret;
238
239 return len;
240} 209}
241 210
242static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page) 211static ssize_t blk_mq_hw_sysfs_active_show(struct blk_mq_hw_ctx *hctx, char *page)
243{ 212{
244 return blk_mq_tag_sysfs_show(hctx->tags, page); 213 return sprintf(page, "%u\n", atomic_read(&hctx->nr_active));
245} 214}
246 215
247static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) 216static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
248{ 217{
249 unsigned int i, queue_num, first = 1; 218 unsigned int i, first = 1;
250 ssize_t ret = 0; 219 ssize_t ret = 0;
251 220
252 blk_mq_disable_hotplug(); 221 blk_mq_disable_hotplug();
253 222
254 for_each_online_cpu(i) { 223 for_each_cpu(i, hctx->cpumask) {
255 queue_num = hctx->queue->mq_map[i];
256 if (queue_num != hctx->queue_num)
257 continue;
258
259 if (first) 224 if (first)
260 ret += sprintf(ret + page, "%u", i); 225 ret += sprintf(ret + page, "%u", i);
261 else 226 else
@@ -307,15 +272,14 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
307 .attr = {.name = "dispatched", .mode = S_IRUGO }, 272 .attr = {.name = "dispatched", .mode = S_IRUGO },
308 .show = blk_mq_hw_sysfs_dispatched_show, 273 .show = blk_mq_hw_sysfs_dispatched_show,
309}; 274};
275static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_active = {
276 .attr = {.name = "active", .mode = S_IRUGO },
277 .show = blk_mq_hw_sysfs_active_show,
278};
310static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = { 279static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
311 .attr = {.name = "pending", .mode = S_IRUGO }, 280 .attr = {.name = "pending", .mode = S_IRUGO },
312 .show = blk_mq_hw_sysfs_rq_list_show, 281 .show = blk_mq_hw_sysfs_rq_list_show,
313}; 282};
314static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = {
315 .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR},
316 .show = blk_mq_hw_sysfs_ipi_show,
317 .store = blk_mq_hw_sysfs_ipi_store,
318};
319static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = { 283static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
320 .attr = {.name = "tags", .mode = S_IRUGO }, 284 .attr = {.name = "tags", .mode = S_IRUGO },
321 .show = blk_mq_hw_sysfs_tags_show, 285 .show = blk_mq_hw_sysfs_tags_show,
@@ -330,9 +294,9 @@ static struct attribute *default_hw_ctx_attrs[] = {
330 &blk_mq_hw_sysfs_run.attr, 294 &blk_mq_hw_sysfs_run.attr,
331 &blk_mq_hw_sysfs_dispatched.attr, 295 &blk_mq_hw_sysfs_dispatched.attr,
332 &blk_mq_hw_sysfs_pending.attr, 296 &blk_mq_hw_sysfs_pending.attr,
333 &blk_mq_hw_sysfs_ipi.attr,
334 &blk_mq_hw_sysfs_tags.attr, 297 &blk_mq_hw_sysfs_tags.attr,
335 &blk_mq_hw_sysfs_cpus.attr, 298 &blk_mq_hw_sysfs_cpus.attr,
299 &blk_mq_hw_sysfs_active.attr,
336 NULL, 300 NULL,
337}; 301};
338 302
@@ -363,6 +327,42 @@ static struct kobj_type blk_mq_hw_ktype = {
363 .release = blk_mq_sysfs_release, 327 .release = blk_mq_sysfs_release,
364}; 328};
365 329
330static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx)
331{
332 struct blk_mq_ctx *ctx;
333 int i;
334
335 if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
336 return;
337
338 hctx_for_each_ctx(hctx, ctx, i)
339 kobject_del(&ctx->kobj);
340
341 kobject_del(&hctx->kobj);
342}
343
344static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
345{
346 struct request_queue *q = hctx->queue;
347 struct blk_mq_ctx *ctx;
348 int i, ret;
349
350 if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP))
351 return 0;
352
353 ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
354 if (ret)
355 return ret;
356
357 hctx_for_each_ctx(hctx, ctx, i) {
358 ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
359 if (ret)
360 break;
361 }
362
363 return ret;
364}
365
366void blk_mq_unregister_disk(struct gendisk *disk) 366void blk_mq_unregister_disk(struct gendisk *disk)
367{ 367{
368 struct request_queue *q = disk->queue; 368 struct request_queue *q = disk->queue;
@@ -371,11 +371,11 @@ void blk_mq_unregister_disk(struct gendisk *disk)
371 int i, j; 371 int i, j;
372 372
373 queue_for_each_hw_ctx(q, hctx, i) { 373 queue_for_each_hw_ctx(q, hctx, i) {
374 hctx_for_each_ctx(hctx, ctx, j) { 374 blk_mq_unregister_hctx(hctx);
375 kobject_del(&ctx->kobj); 375
376 hctx_for_each_ctx(hctx, ctx, j)
376 kobject_put(&ctx->kobj); 377 kobject_put(&ctx->kobj);
377 } 378
378 kobject_del(&hctx->kobj);
379 kobject_put(&hctx->kobj); 379 kobject_put(&hctx->kobj);
380 } 380 }
381 381
@@ -386,15 +386,30 @@ void blk_mq_unregister_disk(struct gendisk *disk)
386 kobject_put(&disk_to_dev(disk)->kobj); 386 kobject_put(&disk_to_dev(disk)->kobj);
387} 387}
388 388
389static void blk_mq_sysfs_init(struct request_queue *q)
390{
391 struct blk_mq_hw_ctx *hctx;
392 struct blk_mq_ctx *ctx;
393 int i, j;
394
395 kobject_init(&q->mq_kobj, &blk_mq_ktype);
396
397 queue_for_each_hw_ctx(q, hctx, i) {
398 kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
399
400 hctx_for_each_ctx(hctx, ctx, j)
401 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
402 }
403}
404
389int blk_mq_register_disk(struct gendisk *disk) 405int blk_mq_register_disk(struct gendisk *disk)
390{ 406{
391 struct device *dev = disk_to_dev(disk); 407 struct device *dev = disk_to_dev(disk);
392 struct request_queue *q = disk->queue; 408 struct request_queue *q = disk->queue;
393 struct blk_mq_hw_ctx *hctx; 409 struct blk_mq_hw_ctx *hctx;
394 struct blk_mq_ctx *ctx; 410 int ret, i;
395 int ret, i, j;
396 411
397 kobject_init(&q->mq_kobj, &blk_mq_ktype); 412 blk_mq_sysfs_init(q);
398 413
399 ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); 414 ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
400 if (ret < 0) 415 if (ret < 0)
@@ -403,20 +418,10 @@ int blk_mq_register_disk(struct gendisk *disk)
403 kobject_uevent(&q->mq_kobj, KOBJ_ADD); 418 kobject_uevent(&q->mq_kobj, KOBJ_ADD);
404 419
405 queue_for_each_hw_ctx(q, hctx, i) { 420 queue_for_each_hw_ctx(q, hctx, i) {
406 kobject_init(&hctx->kobj, &blk_mq_hw_ktype); 421 hctx->flags |= BLK_MQ_F_SYSFS_UP;
407 ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i); 422 ret = blk_mq_register_hctx(hctx);
408 if (ret) 423 if (ret)
409 break; 424 break;
410
411 if (!hctx->nr_ctx)
412 continue;
413
414 hctx_for_each_ctx(hctx, ctx, j) {
415 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
416 ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
417 if (ret)
418 break;
419 }
420 } 425 }
421 426
422 if (ret) { 427 if (ret) {
@@ -426,3 +431,26 @@ int blk_mq_register_disk(struct gendisk *disk)
426 431
427 return 0; 432 return 0;
428} 433}
434
435void blk_mq_sysfs_unregister(struct request_queue *q)
436{
437 struct blk_mq_hw_ctx *hctx;
438 int i;
439
440 queue_for_each_hw_ctx(q, hctx, i)
441 blk_mq_unregister_hctx(hctx);
442}
443
444int blk_mq_sysfs_register(struct request_queue *q)
445{
446 struct blk_mq_hw_ctx *hctx;
447 int i, ret = 0;
448
449 queue_for_each_hw_ctx(q, hctx, i) {
450 ret = blk_mq_register_hctx(hctx);
451 if (ret)
452 break;
453 }
454
455 return ret;
456}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 83ae96c51a27..d90c4aeb7dd3 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -1,78 +1,345 @@
1/*
2 * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
3 * over multiple cachelines to avoid ping-pong between multiple submitters
4 * or submitter and completer. Uses rolling wakeups to avoid falling off
5 * the scaling cliff when we run out of tags and have to start putting
6 * submitters to sleep.
7 *
8 * Uses active queue tracking to support fairer distribution of tags
9 * between multiple submitters when a shared tag map is used.
10 *
11 * Copyright (C) 2013-2014 Jens Axboe
12 */
1#include <linux/kernel.h> 13#include <linux/kernel.h>
2#include <linux/module.h> 14#include <linux/module.h>
3#include <linux/percpu_ida.h> 15#include <linux/random.h>
4 16
5#include <linux/blk-mq.h> 17#include <linux/blk-mq.h>
6#include "blk.h" 18#include "blk.h"
7#include "blk-mq.h" 19#include "blk-mq.h"
8#include "blk-mq-tag.h" 20#include "blk-mq-tag.h"
9 21
22static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
23{
24 int i;
25
26 for (i = 0; i < bt->map_nr; i++) {
27 struct blk_align_bitmap *bm = &bt->map[i];
28 int ret;
29
30 ret = find_first_zero_bit(&bm->word, bm->depth);
31 if (ret < bm->depth)
32 return true;
33 }
34
35 return false;
36}
37
38bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
39{
40 if (!tags)
41 return true;
42
43 return bt_has_free_tags(&tags->bitmap_tags);
44}
45
46static inline void bt_index_inc(unsigned int *index)
47{
48 *index = (*index + 1) & (BT_WAIT_QUEUES - 1);
49}
50
10/* 51/*
11 * Per tagged queue (tag address space) map 52 * If a previously inactive queue goes active, bump the active user count.
12 */ 53 */
13struct blk_mq_tags { 54bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
14 unsigned int nr_tags; 55{
15 unsigned int nr_reserved_tags; 56 if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
16 unsigned int nr_batch_move; 57 !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
17 unsigned int nr_max_cache; 58 atomic_inc(&hctx->tags->active_queues);
18 59
19 struct percpu_ida free_tags; 60 return true;
20 struct percpu_ida reserved_tags; 61}
21};
22 62
23void blk_mq_wait_for_tags(struct blk_mq_tags *tags) 63/*
64 * Wakeup all potentially sleeping on normal (non-reserved) tags
65 */
66static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags)
24{ 67{
25 int tag = blk_mq_get_tag(tags, __GFP_WAIT, false); 68 struct blk_mq_bitmap_tags *bt;
26 blk_mq_put_tag(tags, tag); 69 int i, wake_index;
70
71 bt = &tags->bitmap_tags;
72 wake_index = bt->wake_index;
73 for (i = 0; i < BT_WAIT_QUEUES; i++) {
74 struct bt_wait_state *bs = &bt->bs[wake_index];
75
76 if (waitqueue_active(&bs->wait))
77 wake_up(&bs->wait);
78
79 bt_index_inc(&wake_index);
80 }
27} 81}
28 82
29bool blk_mq_has_free_tags(struct blk_mq_tags *tags) 83/*
84 * If a previously busy queue goes inactive, potential waiters could now
85 * be allowed to queue. Wake them up and check.
86 */
87void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
88{
89 struct blk_mq_tags *tags = hctx->tags;
90
91 if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
92 return;
93
94 atomic_dec(&tags->active_queues);
95
96 blk_mq_tag_wakeup_all(tags);
97}
98
99/*
100 * For shared tag users, we track the number of currently active users
101 * and attempt to provide a fair share of the tag depth for each of them.
102 */
103static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
104 struct blk_mq_bitmap_tags *bt)
105{
106 unsigned int depth, users;
107
108 if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
109 return true;
110 if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
111 return true;
112
113 /*
114 * Don't try dividing an ant
115 */
116 if (bt->depth == 1)
117 return true;
118
119 users = atomic_read(&hctx->tags->active_queues);
120 if (!users)
121 return true;
122
123 /*
124 * Allow at least some tags
125 */
126 depth = max((bt->depth + users - 1) / users, 4U);
127 return atomic_read(&hctx->nr_active) < depth;
128}
129
130static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
30{ 131{
31 return !tags || 132 int tag, org_last_tag, end;
32 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0; 133
134 org_last_tag = last_tag;
135 end = bm->depth;
136 do {
137restart:
138 tag = find_next_zero_bit(&bm->word, end, last_tag);
139 if (unlikely(tag >= end)) {
140 /*
141 * We started with an offset, start from 0 to
142 * exhaust the map.
143 */
144 if (org_last_tag && last_tag) {
145 end = last_tag;
146 last_tag = 0;
147 goto restart;
148 }
149 return -1;
150 }
151 last_tag = tag + 1;
152 } while (test_and_set_bit_lock(tag, &bm->word));
153
154 return tag;
33} 155}
34 156
35static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp) 157/*
158 * Straight forward bitmap tag implementation, where each bit is a tag
159 * (cleared == free, and set == busy). The small twist is using per-cpu
160 * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
161 * contexts. This enables us to drastically limit the space searched,
162 * without dirtying an extra shared cacheline like we would if we stored
163 * the cache value inside the shared blk_mq_bitmap_tags structure. On top
164 * of that, each word of tags is in a separate cacheline. This means that
165 * multiple users will tend to stick to different cachelines, at least
166 * until the map is exhausted.
167 */
168static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
169 unsigned int *tag_cache)
36{ 170{
171 unsigned int last_tag, org_last_tag;
172 int index, i, tag;
173
174 if (!hctx_may_queue(hctx, bt))
175 return -1;
176
177 last_tag = org_last_tag = *tag_cache;
178 index = TAG_TO_INDEX(bt, last_tag);
179
180 for (i = 0; i < bt->map_nr; i++) {
181 tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag));
182 if (tag != -1) {
183 tag += (index << bt->bits_per_word);
184 goto done;
185 }
186
187 last_tag = 0;
188 if (++index >= bt->map_nr)
189 index = 0;
190 }
191
192 *tag_cache = 0;
193 return -1;
194
195 /*
196 * Only update the cache from the allocation path, if we ended
197 * up using the specific cached tag.
198 */
199done:
200 if (tag == org_last_tag) {
201 last_tag = tag + 1;
202 if (last_tag >= bt->depth - 1)
203 last_tag = 0;
204
205 *tag_cache = last_tag;
206 }
207
208 return tag;
209}
210
211static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
212 struct blk_mq_hw_ctx *hctx)
213{
214 struct bt_wait_state *bs;
215
216 if (!hctx)
217 return &bt->bs[0];
218
219 bs = &bt->bs[hctx->wait_index];
220 bt_index_inc(&hctx->wait_index);
221 return bs;
222}
223
224static int bt_get(struct blk_mq_bitmap_tags *bt, struct blk_mq_hw_ctx *hctx,
225 unsigned int *last_tag, gfp_t gfp)
226{
227 struct bt_wait_state *bs;
228 DEFINE_WAIT(wait);
37 int tag; 229 int tag;
38 230
39 tag = percpu_ida_alloc(&tags->free_tags, (gfp & __GFP_WAIT) ? 231 tag = __bt_get(hctx, bt, last_tag);
40 TASK_UNINTERRUPTIBLE : TASK_RUNNING); 232 if (tag != -1)
41 if (tag < 0) 233 return tag;
42 return BLK_MQ_TAG_FAIL; 234
43 return tag + tags->nr_reserved_tags; 235 if (!(gfp & __GFP_WAIT))
236 return -1;
237
238 bs = bt_wait_ptr(bt, hctx);
239 do {
240 bool was_empty;
241
242 was_empty = list_empty(&wait.task_list);
243 prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
244
245 tag = __bt_get(hctx, bt, last_tag);
246 if (tag != -1)
247 break;
248
249 if (was_empty)
250 atomic_set(&bs->wait_cnt, bt->wake_cnt);
251
252 io_schedule();
253 } while (1);
254
255 finish_wait(&bs->wait, &wait);
256 return tag;
257}
258
259static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags,
260 struct blk_mq_hw_ctx *hctx,
261 unsigned int *last_tag, gfp_t gfp)
262{
263 int tag;
264
265 tag = bt_get(&tags->bitmap_tags, hctx, last_tag, gfp);
266 if (tag >= 0)
267 return tag + tags->nr_reserved_tags;
268
269 return BLK_MQ_TAG_FAIL;
44} 270}
45 271
46static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags, 272static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
47 gfp_t gfp) 273 gfp_t gfp)
48{ 274{
49 int tag; 275 int tag, zero = 0;
50 276
51 if (unlikely(!tags->nr_reserved_tags)) { 277 if (unlikely(!tags->nr_reserved_tags)) {
52 WARN_ON_ONCE(1); 278 WARN_ON_ONCE(1);
53 return BLK_MQ_TAG_FAIL; 279 return BLK_MQ_TAG_FAIL;
54 } 280 }
55 281
56 tag = percpu_ida_alloc(&tags->reserved_tags, (gfp & __GFP_WAIT) ? 282 tag = bt_get(&tags->breserved_tags, NULL, &zero, gfp);
57 TASK_UNINTERRUPTIBLE : TASK_RUNNING);
58 if (tag < 0) 283 if (tag < 0)
59 return BLK_MQ_TAG_FAIL; 284 return BLK_MQ_TAG_FAIL;
285
60 return tag; 286 return tag;
61} 287}
62 288
63unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved) 289unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag,
290 gfp_t gfp, bool reserved)
64{ 291{
65 if (!reserved) 292 if (!reserved)
66 return __blk_mq_get_tag(tags, gfp); 293 return __blk_mq_get_tag(hctx->tags, hctx, last_tag, gfp);
67 294
68 return __blk_mq_get_reserved_tag(tags, gfp); 295 return __blk_mq_get_reserved_tag(hctx->tags, gfp);
296}
297
298static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
299{
300 int i, wake_index;
301
302 wake_index = bt->wake_index;
303 for (i = 0; i < BT_WAIT_QUEUES; i++) {
304 struct bt_wait_state *bs = &bt->bs[wake_index];
305
306 if (waitqueue_active(&bs->wait)) {
307 if (wake_index != bt->wake_index)
308 bt->wake_index = wake_index;
309
310 return bs;
311 }
312
313 bt_index_inc(&wake_index);
314 }
315
316 return NULL;
317}
318
319static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
320{
321 const int index = TAG_TO_INDEX(bt, tag);
322 struct bt_wait_state *bs;
323
324 /*
325 * The unlock memory barrier need to order access to req in free
326 * path and clearing tag bit
327 */
328 clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
329
330 bs = bt_wake_ptr(bt);
331 if (bs && atomic_dec_and_test(&bs->wait_cnt)) {
332 atomic_set(&bs->wait_cnt, bt->wake_cnt);
333 bt_index_inc(&bt->wake_index);
334 wake_up(&bs->wait);
335 }
69} 336}
70 337
71static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 338static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
72{ 339{
73 BUG_ON(tag >= tags->nr_tags); 340 BUG_ON(tag >= tags->nr_tags);
74 341
75 percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags); 342 bt_clear_tag(&tags->bitmap_tags, tag);
76} 343}
77 344
78static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, 345static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
@@ -80,22 +347,43 @@ static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
80{ 347{
81 BUG_ON(tag >= tags->nr_reserved_tags); 348 BUG_ON(tag >= tags->nr_reserved_tags);
82 349
83 percpu_ida_free(&tags->reserved_tags, tag); 350 bt_clear_tag(&tags->breserved_tags, tag);
84} 351}
85 352
86void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) 353void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
354 unsigned int *last_tag)
87{ 355{
88 if (tag >= tags->nr_reserved_tags) 356 struct blk_mq_tags *tags = hctx->tags;
89 __blk_mq_put_tag(tags, tag); 357
90 else 358 if (tag >= tags->nr_reserved_tags) {
359 const int real_tag = tag - tags->nr_reserved_tags;
360
361 __blk_mq_put_tag(tags, real_tag);
362 *last_tag = real_tag;
363 } else
91 __blk_mq_put_reserved_tag(tags, tag); 364 __blk_mq_put_reserved_tag(tags, tag);
92} 365}
93 366
94static int __blk_mq_tag_iter(unsigned id, void *data) 367static void bt_for_each_free(struct blk_mq_bitmap_tags *bt,
368 unsigned long *free_map, unsigned int off)
95{ 369{
96 unsigned long *tag_map = data; 370 int i;
97 __set_bit(id, tag_map); 371
98 return 0; 372 for (i = 0; i < bt->map_nr; i++) {
373 struct blk_align_bitmap *bm = &bt->map[i];
374 int bit = 0;
375
376 do {
377 bit = find_next_zero_bit(&bm->word, bm->depth, bit);
378 if (bit >= bm->depth)
379 break;
380
381 __set_bit(bit + off, free_map);
382 bit++;
383 } while (1);
384
385 off += (1 << bt->bits_per_word);
386 }
99} 387}
100 388
101void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, 389void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
@@ -109,21 +397,128 @@ void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
109 if (!tag_map) 397 if (!tag_map)
110 return; 398 return;
111 399
112 percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map); 400 bt_for_each_free(&tags->bitmap_tags, tag_map, tags->nr_reserved_tags);
113 if (tags->nr_reserved_tags) 401 if (tags->nr_reserved_tags)
114 percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter, 402 bt_for_each_free(&tags->breserved_tags, tag_map, 0);
115 tag_map);
116 403
117 fn(data, tag_map); 404 fn(data, tag_map);
118 kfree(tag_map); 405 kfree(tag_map);
119} 406}
407EXPORT_SYMBOL(blk_mq_tag_busy_iter);
408
409static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
410{
411 unsigned int i, used;
412
413 for (i = 0, used = 0; i < bt->map_nr; i++) {
414 struct blk_align_bitmap *bm = &bt->map[i];
415
416 used += bitmap_weight(&bm->word, bm->depth);
417 }
418
419 return bt->depth - used;
420}
421
422static void bt_update_count(struct blk_mq_bitmap_tags *bt,
423 unsigned int depth)
424{
425 unsigned int tags_per_word = 1U << bt->bits_per_word;
426 unsigned int map_depth = depth;
427
428 if (depth) {
429 int i;
430
431 for (i = 0; i < bt->map_nr; i++) {
432 bt->map[i].depth = min(map_depth, tags_per_word);
433 map_depth -= bt->map[i].depth;
434 }
435 }
436
437 bt->wake_cnt = BT_WAIT_BATCH;
438 if (bt->wake_cnt > depth / 4)
439 bt->wake_cnt = max(1U, depth / 4);
440
441 bt->depth = depth;
442}
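
The resize arithmetic in bt_update_count() spreads the new depth across the per-word maps and caps the wakeup batch at a quarter of the depth, but never below one. A userspace sketch with made-up values (depth 10, 4 tags per word):

#include <stdio.h>

#define BT_WAIT_BATCH	8

int main(void)
{
	unsigned int depth = 10, tags_per_word = 4;	/* illustrative only */
	unsigned int map_depth = depth, wake_cnt;

	/* hand out at most tags_per_word tags to each map word */
	for (int i = 0; map_depth; i++) {
		unsigned int d = map_depth < tags_per_word ? map_depth : tags_per_word;

		printf("map[%d].depth = %u\n", i, d);
		map_depth -= d;
	}

	/* wake_cnt = min(BT_WAIT_BATCH, max(1, depth / 4)) */
	wake_cnt = BT_WAIT_BATCH;
	if (wake_cnt > depth / 4)
		wake_cnt = depth / 4 ? depth / 4 : 1;
	printf("wake_cnt = %u\n", wake_cnt);	/* 2 for a depth of 10 */
	return 0;
}
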
443
444static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
445 int node, bool reserved)
446{
447 int i;
448
449 bt->bits_per_word = ilog2(BITS_PER_LONG);
450
451 /*
452 * Depth can be zero for reserved tags, that's not a failure
453 * condition.
454 */
455 if (depth) {
456 unsigned int nr, tags_per_word;
457
458 tags_per_word = (1 << bt->bits_per_word);
459
460 /*
461 * If the tag space is small, shrink the number of tags
462 * per word so we spread over a few cachelines, at least.
463 * If less than 4 tags, just forget about it, it's not
464 * going to work optimally anyway.
465 */
466 if (depth >= 4) {
467 while (tags_per_word * 4 > depth) {
468 bt->bits_per_word--;
469 tags_per_word = (1 << bt->bits_per_word);
470 }
471 }
472
473 nr = ALIGN(depth, tags_per_word) / tags_per_word;
474 bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
475 GFP_KERNEL, node);
476 if (!bt->map)
477 return -ENOMEM;
478
479 bt->map_nr = nr;
480 }
481
482 bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
483 if (!bt->bs) {
484 kfree(bt->map);
485 return -ENOMEM;
486 }
487
488 for (i = 0; i < BT_WAIT_QUEUES; i++)
489 init_waitqueue_head(&bt->bs[i].wait);
490
491 bt_update_count(bt, depth);
492 return 0;
493}
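
For small tag spaces, bt_alloc() keeps halving the per-word width until the tags spread over at least four words (and hence a few cachelines). A rough standalone sketch of that sizing, with an illustrative depth of 32:

#include <stdio.h>

int main(void)
{
	unsigned int depth = 32;			/* illustrative */
	unsigned int bits_per_word = 6;			/* ilog2(64) on a 64-bit build */
	unsigned int tags_per_word = 1U << bits_per_word;

	if (depth >= 4) {
		/* shrink until at least four words are in use */
		while (tags_per_word * 4 > depth) {
			bits_per_word--;
			tags_per_word = 1U << bits_per_word;
		}
	}

	/* nr = ALIGN(depth, tags_per_word) / tags_per_word */
	unsigned int nr = (depth + tags_per_word - 1) / tags_per_word;

	printf("depth=%u -> %u tags/word, %u words\n", depth, tags_per_word, nr);
	return 0;	/* prints: depth=32 -> 8 tags/word, 4 words */
}
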
494
495static void bt_free(struct blk_mq_bitmap_tags *bt)
496{
497 kfree(bt->map);
498 kfree(bt->bs);
499}
500
501static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
502 int node)
503{
504 unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
505
506 if (bt_alloc(&tags->bitmap_tags, depth, node, false))
507 goto enomem;
508 if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
509 goto enomem;
510
511 return tags;
512enomem:
513 bt_free(&tags->bitmap_tags);
514 kfree(tags);
515 return NULL;
516}
120 517
121struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, 518struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
122 unsigned int reserved_tags, int node) 519 unsigned int reserved_tags, int node)
123{ 520{
124 unsigned int nr_tags, nr_cache;
125 struct blk_mq_tags *tags; 521 struct blk_mq_tags *tags;
126 int ret;
127 522
128 if (total_tags > BLK_MQ_TAG_MAX) { 523 if (total_tags > BLK_MQ_TAG_MAX) {
129 pr_err("blk-mq: tag depth too large\n"); 524 pr_err("blk-mq: tag depth too large\n");
@@ -134,73 +529,59 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
134 if (!tags) 529 if (!tags)
135 return NULL; 530 return NULL;
136 531
137 nr_tags = total_tags - reserved_tags;
138 nr_cache = nr_tags / num_possible_cpus();
139
140 if (nr_cache < BLK_MQ_TAG_CACHE_MIN)
141 nr_cache = BLK_MQ_TAG_CACHE_MIN;
142 else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
143 nr_cache = BLK_MQ_TAG_CACHE_MAX;
144
145 tags->nr_tags = total_tags; 532 tags->nr_tags = total_tags;
146 tags->nr_reserved_tags = reserved_tags; 533 tags->nr_reserved_tags = reserved_tags;
147 tags->nr_max_cache = nr_cache;
148 tags->nr_batch_move = max(1u, nr_cache / 2);
149 534
150 ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags - 535 return blk_mq_init_bitmap_tags(tags, node);
151 tags->nr_reserved_tags, 536}
152 tags->nr_max_cache,
153 tags->nr_batch_move);
154 if (ret)
155 goto err_free_tags;
156 537
157 if (reserved_tags) { 538void blk_mq_free_tags(struct blk_mq_tags *tags)
158 /* 539{
159 * With max_cache and batch set to 1, the allocator falls back to 540 bt_free(&tags->bitmap_tags);
160 * uncached. It's fine if reserved tag allocation is slow. 541 bt_free(&tags->breserved_tags);
161 */ 542 kfree(tags);
162 ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags, 543}
163 1, 1);
164 if (ret)
165 goto err_reserved_tags;
166 }
167 544
168 return tags; 545void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
546{
547 unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
169 548
170err_reserved_tags: 549 *tag = prandom_u32() % depth;
171 percpu_ida_destroy(&tags->free_tags);
172err_free_tags:
173 kfree(tags);
174 return NULL;
175} 550}
176 551
177void blk_mq_free_tags(struct blk_mq_tags *tags) 552int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
178{ 553{
179 percpu_ida_destroy(&tags->free_tags); 554 tdepth -= tags->nr_reserved_tags;
180 percpu_ida_destroy(&tags->reserved_tags); 555 if (tdepth > tags->nr_tags)
181 kfree(tags); 556 return -EINVAL;
557
558 /*
559 * Don't need (or can't) update reserved tags here, they remain
560 * static and should never need resizing.
561 */
562 bt_update_count(&tags->bitmap_tags, tdepth);
563 blk_mq_tag_wakeup_all(tags);
564 return 0;
182} 565}
183 566
184ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) 567ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
185{ 568{
186 char *orig_page = page; 569 char *orig_page = page;
187 unsigned int cpu; 570 unsigned int free, res;
188 571
189 if (!tags) 572 if (!tags)
190 return 0; 573 return 0;
191 574
192 page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u," 575 page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
193 " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags, 576 "bits_per_word=%u\n",
194 tags->nr_batch_move, tags->nr_max_cache); 577 tags->nr_tags, tags->nr_reserved_tags,
578 tags->bitmap_tags.bits_per_word);
195 579
196 page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", 580 free = bt_unused_tags(&tags->bitmap_tags);
197 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids), 581 res = bt_unused_tags(&tags->breserved_tags);
198 percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
199 582
200 for_each_possible_cpu(cpu) { 583 page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
201 page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu, 584 page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
202 percpu_ida_free_tags(&tags->free_tags, cpu));
203 }
204 585
205 return page - orig_page; 586 return page - orig_page;
206} 587}
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 947ba2c6148e..c959de58d2a5 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -1,17 +1,59 @@
1#ifndef INT_BLK_MQ_TAG_H 1#ifndef INT_BLK_MQ_TAG_H
2#define INT_BLK_MQ_TAG_H 2#define INT_BLK_MQ_TAG_H
3 3
4struct blk_mq_tags; 4#include "blk-mq.h"
5
6enum {
7 BT_WAIT_QUEUES = 8,
8 BT_WAIT_BATCH = 8,
9};
10
11struct bt_wait_state {
12 atomic_t wait_cnt;
13 wait_queue_head_t wait;
14} ____cacheline_aligned_in_smp;
15
16#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word)
17#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1))
18
19struct blk_mq_bitmap_tags {
20 unsigned int depth;
21 unsigned int wake_cnt;
22 unsigned int bits_per_word;
23
24 unsigned int map_nr;
25 struct blk_align_bitmap *map;
26
27 unsigned int wake_index;
28 struct bt_wait_state *bs;
29};
30
31/*
32 * Tag address space map.
33 */
34struct blk_mq_tags {
35 unsigned int nr_tags;
36 unsigned int nr_reserved_tags;
37
38 atomic_t active_queues;
39
40 struct blk_mq_bitmap_tags bitmap_tags;
41 struct blk_mq_bitmap_tags breserved_tags;
42
43 struct request **rqs;
44 struct list_head page_list;
45};
46
5 47
6extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node); 48extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
7extern void blk_mq_free_tags(struct blk_mq_tags *tags); 49extern void blk_mq_free_tags(struct blk_mq_tags *tags);
8 50
9extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved); 51extern unsigned int blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, unsigned int *last_tag, gfp_t gfp, bool reserved);
10extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags); 52extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag);
11extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag);
12extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
13extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); 53extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
14extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); 54extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
55extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag);
56extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
15 57
16enum { 58enum {
17 BLK_MQ_TAG_CACHE_MIN = 1, 59 BLK_MQ_TAG_CACHE_MIN = 1,
@@ -24,4 +66,23 @@ enum {
24 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1, 66 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
25}; 67};
26 68
69extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
70extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
71
72static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
73{
74 if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
75 return false;
76
77 return __blk_mq_tag_busy(hctx);
78}
79
80static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
81{
82 if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
83 return;
84
85 __blk_mq_tag_idle(hctx);
86}
87
27#endif 88#endif
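
The TAG_TO_INDEX()/TAG_TO_BIT() macros added above split a tag number into a blk_align_bitmap word index (high bits) and a bit offset inside that word (low bits). A userspace sketch of the mapping, assuming an illustrative bits_per_word of 4:

#include <stdio.h>

int main(void)
{
	unsigned int bits_per_word = 4;		/* 16 tags per word, made up */
	unsigned int tags[] = { 0, 5, 16, 37 };

	for (unsigned int i = 0; i < sizeof(tags) / sizeof(tags[0]); i++) {
		unsigned int index = tags[i] >> bits_per_word;			/* TAG_TO_INDEX() */
		unsigned int bit = tags[i] & ((1 << bits_per_word) - 1);	/* TAG_TO_BIT() */

		printf("tag %2u -> map[%u], bit %u\n", tags[i], index, bit);
	}
	return 0;	/* e.g. tag 37 -> map[2], bit 5 */
}
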
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1d2a9bdbee57..0f5879c42dcd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1,3 +1,9 @@
1/*
2 * Block multiqueue core code
3 *
4 * Copyright (C) 2013-2014 Jens Axboe
5 * Copyright (C) 2013-2014 Christoph Hellwig
6 */
1#include <linux/kernel.h> 7#include <linux/kernel.h>
2#include <linux/module.h> 8#include <linux/module.h>
3#include <linux/backing-dev.h> 9#include <linux/backing-dev.h>
@@ -56,38 +62,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
56{ 62{
57 unsigned int i; 63 unsigned int i;
58 64
59 for (i = 0; i < hctx->nr_ctx_map; i++) 65 for (i = 0; i < hctx->ctx_map.map_size; i++)
60 if (hctx->ctx_map[i]) 66 if (hctx->ctx_map.map[i].word)
61 return true; 67 return true;
62 68
63 return false; 69 return false;
64} 70}
65 71
72static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
73 struct blk_mq_ctx *ctx)
74{
75 return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
76}
77
78#define CTX_TO_BIT(hctx, ctx) \
79 ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
80
66/* 81/*
67 * Mark this ctx as having pending work in this hardware queue 82 * Mark this ctx as having pending work in this hardware queue
68 */ 83 */
69static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 84static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
70 struct blk_mq_ctx *ctx) 85 struct blk_mq_ctx *ctx)
71{ 86{
72 if (!test_bit(ctx->index_hw, hctx->ctx_map)) 87 struct blk_align_bitmap *bm = get_bm(hctx, ctx);
73 set_bit(ctx->index_hw, hctx->ctx_map); 88
89 if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
90 set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
74} 91}
75 92
76static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, 93static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
77 gfp_t gfp, bool reserved) 94 struct blk_mq_ctx *ctx)
78{ 95{
79 struct request *rq; 96 struct blk_align_bitmap *bm = get_bm(hctx, ctx);
80 unsigned int tag;
81 97
82 tag = blk_mq_get_tag(hctx->tags, gfp, reserved); 98 clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
83 if (tag != BLK_MQ_TAG_FAIL) {
84 rq = hctx->rqs[tag];
85 rq->tag = tag;
86
87 return rq;
88 }
89
90 return NULL;
91} 99}
92 100
93static int blk_mq_queue_enter(struct request_queue *q) 101static int blk_mq_queue_enter(struct request_queue *q)
@@ -186,78 +194,95 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
186 if (blk_queue_io_stat(q)) 194 if (blk_queue_io_stat(q))
187 rw_flags |= REQ_IO_STAT; 195 rw_flags |= REQ_IO_STAT;
188 196
197 INIT_LIST_HEAD(&rq->queuelist);
198 /* csd/requeue_work/fifo_time is initialized before use */
199 rq->q = q;
189 rq->mq_ctx = ctx; 200 rq->mq_ctx = ctx;
190 rq->cmd_flags = rw_flags; 201 rq->cmd_flags |= rw_flags;
191 rq->start_time = jiffies; 202 /* do not touch atomic flags, it needs atomic ops against the timer */
203 rq->cpu = -1;
204 INIT_HLIST_NODE(&rq->hash);
205 RB_CLEAR_NODE(&rq->rb_node);
206 rq->rq_disk = NULL;
207 rq->part = NULL;
208#ifdef CONFIG_BLK_CGROUP
209 rq->rl = NULL;
192 set_start_time_ns(rq); 210 set_start_time_ns(rq);
211 rq->io_start_time_ns = 0;
212#endif
213 rq->nr_phys_segments = 0;
214#if defined(CONFIG_BLK_DEV_INTEGRITY)
215 rq->nr_integrity_segments = 0;
216#endif
217 rq->special = NULL;
218 /* tag was already set */
219 rq->errors = 0;
220
221 rq->extra_len = 0;
222 rq->sense_len = 0;
223 rq->resid_len = 0;
224 rq->sense = NULL;
225
226 INIT_LIST_HEAD(&rq->timeout_list);
227 rq->end_io = NULL;
228 rq->end_io_data = NULL;
229 rq->next_rq = NULL;
230
193 ctx->rq_dispatched[rw_is_sync(rw_flags)]++; 231 ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
194} 232}
195 233
196static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, 234static struct request *
197 int rw, gfp_t gfp, 235__blk_mq_alloc_request(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
198 bool reserved) 236 struct blk_mq_ctx *ctx, int rw, gfp_t gfp, bool reserved)
199{ 237{
200 struct request *rq; 238 struct request *rq;
239 unsigned int tag;
201 240
202 do { 241 tag = blk_mq_get_tag(hctx, &ctx->last_tag, gfp, reserved);
203 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); 242 if (tag != BLK_MQ_TAG_FAIL) {
204 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); 243 rq = hctx->tags->rqs[tag];
205 244
206 rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); 245 rq->cmd_flags = 0;
207 if (rq) { 246 if (blk_mq_tag_busy(hctx)) {
208 blk_mq_rq_ctx_init(q, ctx, rq, rw); 247 rq->cmd_flags = REQ_MQ_INFLIGHT;
209 break; 248 atomic_inc(&hctx->nr_active);
210 } 249 }
211 250
212 blk_mq_put_ctx(ctx); 251 rq->tag = tag;
213 if (!(gfp & __GFP_WAIT)) 252 blk_mq_rq_ctx_init(q, ctx, rq, rw);
214 break; 253 return rq;
215 254 }
216 __blk_mq_run_hw_queue(hctx);
217 blk_mq_wait_for_tags(hctx->tags);
218 } while (1);
219 255
220 return rq; 256 return NULL;
221} 257}
222 258
223struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp) 259struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
260 bool reserved)
224{ 261{
262 struct blk_mq_ctx *ctx;
263 struct blk_mq_hw_ctx *hctx;
225 struct request *rq; 264 struct request *rq;
226 265
227 if (blk_mq_queue_enter(q)) 266 if (blk_mq_queue_enter(q))
228 return NULL; 267 return NULL;
229 268
230 rq = blk_mq_alloc_request_pinned(q, rw, gfp, false); 269 ctx = blk_mq_get_ctx(q);
231 if (rq) 270 hctx = q->mq_ops->map_queue(q, ctx->cpu);
232 blk_mq_put_ctx(rq->mq_ctx);
233 return rq;
234}
235
236struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
237 gfp_t gfp)
238{
239 struct request *rq;
240 271
241 if (blk_mq_queue_enter(q)) 272 rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp & ~__GFP_WAIT,
242 return NULL; 273 reserved);
274 if (!rq && (gfp & __GFP_WAIT)) {
275 __blk_mq_run_hw_queue(hctx);
276 blk_mq_put_ctx(ctx);
243 277
244 rq = blk_mq_alloc_request_pinned(q, rw, gfp, true); 278 ctx = blk_mq_get_ctx(q);
245 if (rq) 279 hctx = q->mq_ops->map_queue(q, ctx->cpu);
246 blk_mq_put_ctx(rq->mq_ctx); 280 rq = __blk_mq_alloc_request(q, hctx, ctx, rw, gfp, reserved);
281 }
282 blk_mq_put_ctx(ctx);
247 return rq; 283 return rq;
248} 284}
249EXPORT_SYMBOL(blk_mq_alloc_reserved_request); 285EXPORT_SYMBOL(blk_mq_alloc_request);
250
251/*
252 * Re-init and set pdu, if we have it
253 */
254void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
255{
256 blk_rq_init(hctx->queue, rq);
257
258 if (hctx->cmd_size)
259 rq->special = blk_mq_rq_to_pdu(rq);
260}
261 286
262static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 287static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
263 struct blk_mq_ctx *ctx, struct request *rq) 288 struct blk_mq_ctx *ctx, struct request *rq)
@@ -265,9 +290,11 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
265 const int tag = rq->tag; 290 const int tag = rq->tag;
266 struct request_queue *q = rq->q; 291 struct request_queue *q = rq->q;
267 292
268 blk_mq_rq_init(hctx, rq); 293 if (rq->cmd_flags & REQ_MQ_INFLIGHT)
269 blk_mq_put_tag(hctx->tags, tag); 294 atomic_dec(&hctx->nr_active);
270 295
296 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
297 blk_mq_put_tag(hctx, tag, &ctx->last_tag);
271 blk_mq_queue_exit(q); 298 blk_mq_queue_exit(q);
272} 299}
273 300
@@ -283,20 +310,47 @@ void blk_mq_free_request(struct request *rq)
283 __blk_mq_free_request(hctx, ctx, rq); 310 __blk_mq_free_request(hctx, ctx, rq);
284} 311}
285 312
286bool blk_mq_end_io_partial(struct request *rq, int error, unsigned int nr_bytes) 313/*
314 * Clone all relevant state from a request that has been put on hold in
315 * the flush state machine into the preallocated flush request that hangs
316 * off the request queue.
317 *
318 * For a driver the flush request should be invisible, that's why we are
319 * impersonating the original request here.
320 */
321void blk_mq_clone_flush_request(struct request *flush_rq,
322 struct request *orig_rq)
287{ 323{
288 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 324 struct blk_mq_hw_ctx *hctx =
289 return true; 325 orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
326
327 flush_rq->mq_ctx = orig_rq->mq_ctx;
328 flush_rq->tag = orig_rq->tag;
329 memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
330 hctx->cmd_size);
331}
290 332
333inline void __blk_mq_end_io(struct request *rq, int error)
334{
291 blk_account_io_done(rq); 335 blk_account_io_done(rq);
292 336
293 if (rq->end_io) 337 if (rq->end_io) {
294 rq->end_io(rq, error); 338 rq->end_io(rq, error);
295 else 339 } else {
340 if (unlikely(blk_bidi_rq(rq)))
341 blk_mq_free_request(rq->next_rq);
296 blk_mq_free_request(rq); 342 blk_mq_free_request(rq);
297 return false; 343 }
344}
345EXPORT_SYMBOL(__blk_mq_end_io);
346
347void blk_mq_end_io(struct request *rq, int error)
348{
349 if (blk_update_request(rq, error, blk_rq_bytes(rq)))
350 BUG();
351 __blk_mq_end_io(rq, error);
298} 352}
299EXPORT_SYMBOL(blk_mq_end_io_partial); 353EXPORT_SYMBOL(blk_mq_end_io);
300 354
301static void __blk_mq_complete_request_remote(void *data) 355static void __blk_mq_complete_request_remote(void *data)
302{ 356{
@@ -305,18 +359,22 @@ static void __blk_mq_complete_request_remote(void *data)
305 rq->q->softirq_done_fn(rq); 359 rq->q->softirq_done_fn(rq);
306} 360}
307 361
308void __blk_mq_complete_request(struct request *rq) 362static void blk_mq_ipi_complete_request(struct request *rq)
309{ 363{
310 struct blk_mq_ctx *ctx = rq->mq_ctx; 364 struct blk_mq_ctx *ctx = rq->mq_ctx;
365 bool shared = false;
311 int cpu; 366 int cpu;
312 367
313 if (!ctx->ipi_redirect) { 368 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
314 rq->q->softirq_done_fn(rq); 369 rq->q->softirq_done_fn(rq);
315 return; 370 return;
316 } 371 }
317 372
318 cpu = get_cpu(); 373 cpu = get_cpu();
319 if (cpu != ctx->cpu && cpu_online(ctx->cpu)) { 374 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
375 shared = cpus_share_cache(cpu, ctx->cpu);
376
377 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
320 rq->csd.func = __blk_mq_complete_request_remote; 378 rq->csd.func = __blk_mq_complete_request_remote;
321 rq->csd.info = rq; 379 rq->csd.info = rq;
322 rq->csd.flags = 0; 380 rq->csd.flags = 0;
@@ -327,6 +385,16 @@ void __blk_mq_complete_request(struct request *rq)
327 put_cpu(); 385 put_cpu();
328} 386}
329 387
388void __blk_mq_complete_request(struct request *rq)
389{
390 struct request_queue *q = rq->q;
391
392 if (!q->softirq_done_fn)
393 blk_mq_end_io(rq, rq->errors);
394 else
395 blk_mq_ipi_complete_request(rq);
396}
397
330/** 398/**
331 * blk_mq_complete_request - end I/O on a request 399 * blk_mq_complete_request - end I/O on a request
332 * @rq: the request being processed 400 * @rq: the request being processed
@@ -337,7 +405,9 @@ void __blk_mq_complete_request(struct request *rq)
337 **/ 405 **/
338void blk_mq_complete_request(struct request *rq) 406void blk_mq_complete_request(struct request *rq)
339{ 407{
340 if (unlikely(blk_should_fake_timeout(rq->q))) 408 struct request_queue *q = rq->q;
409
410 if (unlikely(blk_should_fake_timeout(q)))
341 return; 411 return;
342 if (!blk_mark_rq_complete(rq)) 412 if (!blk_mark_rq_complete(rq))
343 __blk_mq_complete_request(rq); 413 __blk_mq_complete_request(rq);
@@ -350,13 +420,31 @@ static void blk_mq_start_request(struct request *rq, bool last)
350 420
351 trace_block_rq_issue(q, rq); 421 trace_block_rq_issue(q, rq);
352 422
423 rq->resid_len = blk_rq_bytes(rq);
424 if (unlikely(blk_bidi_rq(rq)))
425 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
426
353 /* 427 /*
354 * Just mark start time and set the started bit. Due to memory 428 * Just mark start time and set the started bit. Due to memory
355 * ordering, we know we'll see the correct deadline as long as 429 * ordering, we know we'll see the correct deadline as long as
356 * REQ_ATOM_STARTED is seen. 430 * REQ_ATOM_STARTED is seen. Use the default queue timeout,
431 * unless one has been set in the request.
432 */
433 if (!rq->timeout)
434 rq->deadline = jiffies + q->rq_timeout;
435 else
436 rq->deadline = jiffies + rq->timeout;
437
438 /*
439 * Mark us as started and clear complete. Complete might have been
440 * set if requeue raced with timeout, which then marked it as
441 * complete. So be sure to clear complete again when we start
442 * the request, otherwise we'll ignore the completion event.
357 */ 443 */
358 rq->deadline = jiffies + q->rq_timeout; 444 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
359 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 445 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
446 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
447 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
360 448
361 if (q->dma_drain_size && blk_rq_bytes(rq)) { 449 if (q->dma_drain_size && blk_rq_bytes(rq)) {
362 /* 450 /*
@@ -378,7 +466,7 @@ static void blk_mq_start_request(struct request *rq, bool last)
378 rq->cmd_flags |= REQ_END; 466 rq->cmd_flags |= REQ_END;
379} 467}
380 468
381static void blk_mq_requeue_request(struct request *rq) 469static void __blk_mq_requeue_request(struct request *rq)
382{ 470{
383 struct request_queue *q = rq->q; 471 struct request_queue *q = rq->q;
384 472
@@ -391,6 +479,86 @@ static void blk_mq_requeue_request(struct request *rq)
391 rq->nr_phys_segments--; 479 rq->nr_phys_segments--;
392} 480}
393 481
482void blk_mq_requeue_request(struct request *rq)
483{
484 __blk_mq_requeue_request(rq);
485 blk_clear_rq_complete(rq);
486
487 BUG_ON(blk_queued_rq(rq));
488 blk_mq_add_to_requeue_list(rq, true);
489}
490EXPORT_SYMBOL(blk_mq_requeue_request);
491
492static void blk_mq_requeue_work(struct work_struct *work)
493{
494 struct request_queue *q =
495 container_of(work, struct request_queue, requeue_work);
496 LIST_HEAD(rq_list);
497 struct request *rq, *next;
498 unsigned long flags;
499
500 spin_lock_irqsave(&q->requeue_lock, flags);
501 list_splice_init(&q->requeue_list, &rq_list);
502 spin_unlock_irqrestore(&q->requeue_lock, flags);
503
504 list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
505 if (!(rq->cmd_flags & REQ_SOFTBARRIER))
506 continue;
507
508 rq->cmd_flags &= ~REQ_SOFTBARRIER;
509 list_del_init(&rq->queuelist);
510 blk_mq_insert_request(rq, true, false, false);
511 }
512
513 while (!list_empty(&rq_list)) {
514 rq = list_entry(rq_list.next, struct request, queuelist);
515 list_del_init(&rq->queuelist);
516 blk_mq_insert_request(rq, false, false, false);
517 }
518
519 blk_mq_run_queues(q, false);
520}
521
522void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
523{
524 struct request_queue *q = rq->q;
525 unsigned long flags;
526
527 /*
528 * We abuse this flag that is otherwise used by the I/O scheduler to
529 * request head insertion from the workqueue.
530 */
531 BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
532
533 spin_lock_irqsave(&q->requeue_lock, flags);
534 if (at_head) {
535 rq->cmd_flags |= REQ_SOFTBARRIER;
536 list_add(&rq->queuelist, &q->requeue_list);
537 } else {
538 list_add_tail(&rq->queuelist, &q->requeue_list);
539 }
540 spin_unlock_irqrestore(&q->requeue_lock, flags);
541}
542EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
543
544void blk_mq_kick_requeue_list(struct request_queue *q)
545{
546 kblockd_schedule_work(&q->requeue_work);
547}
548EXPORT_SYMBOL(blk_mq_kick_requeue_list);
549
550struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag)
551{
552 struct request_queue *q = hctx->queue;
553
554 if ((q->flush_rq->cmd_flags & REQ_FLUSH_SEQ) &&
555 q->flush_rq->tag == tag)
556 return q->flush_rq;
557
558 return hctx->tags->rqs[tag];
559}
560EXPORT_SYMBOL(blk_mq_tag_to_rq);
561
394struct blk_mq_timeout_data { 562struct blk_mq_timeout_data {
395 struct blk_mq_hw_ctx *hctx; 563 struct blk_mq_hw_ctx *hctx;
396 unsigned long *next; 564 unsigned long *next;
@@ -412,12 +580,13 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
412 do { 580 do {
413 struct request *rq; 581 struct request *rq;
414 582
415 tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag); 583 tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
416 if (tag >= hctx->queue_depth) 584 if (tag >= hctx->tags->nr_tags)
417 break; 585 break;
418 586
419 rq = hctx->rqs[tag++]; 587 rq = blk_mq_tag_to_rq(hctx, tag++);
420 588 if (rq->q != hctx->queue)
589 continue;
421 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 590 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
422 continue; 591 continue;
423 592
@@ -442,6 +611,28 @@ static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
442 blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data); 611 blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
443} 612}
444 613
614static enum blk_eh_timer_return blk_mq_rq_timed_out(struct request *rq)
615{
616 struct request_queue *q = rq->q;
617
618 /*
619 * We know that complete is set at this point. If STARTED isn't set
620 * anymore, then the request isn't active and the "timeout" should
621 * just be ignored. This can happen due to the bitflag ordering.
622 * Timeout first checks if STARTED is set, and if it is, assumes
623 * the request is active. But if we race with completion, then
624 * both flags will get cleared. So check here again, and ignore
625 * a timeout event with a request that isn't active.
626 */
627 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
628 return BLK_EH_NOT_HANDLED;
629
630 if (!q->mq_ops->timeout)
631 return BLK_EH_RESET_TIMER;
632
633 return q->mq_ops->timeout(rq);
634}
635
445static void blk_mq_rq_timer(unsigned long data) 636static void blk_mq_rq_timer(unsigned long data)
446{ 637{
447 struct request_queue *q = (struct request_queue *) data; 638 struct request_queue *q = (struct request_queue *) data;
@@ -449,11 +640,24 @@ static void blk_mq_rq_timer(unsigned long data)
449 unsigned long next = 0; 640 unsigned long next = 0;
450 int i, next_set = 0; 641 int i, next_set = 0;
451 642
452 queue_for_each_hw_ctx(q, hctx, i) 643 queue_for_each_hw_ctx(q, hctx, i) {
644 /*
645 * If no software queues are currently mapped to this
646 * hardware queue, there's nothing to check
647 */
648 if (!hctx->nr_ctx || !hctx->tags)
649 continue;
650
453 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set); 651 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
652 }
454 653
455 if (next_set) 654 if (next_set) {
456 mod_timer(&q->timeout, round_jiffies_up(next)); 655 next = blk_rq_timeout(round_jiffies_up(next));
656 mod_timer(&q->timeout, next);
657 } else {
658 queue_for_each_hw_ctx(q, hctx, i)
659 blk_mq_tag_idle(hctx);
660 }
457} 661}
458 662
459/* 663/*
@@ -495,9 +699,38 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
495 return false; 699 return false;
496} 700}
497 701
498void blk_mq_add_timer(struct request *rq) 702/*
703 * Process software queues that have been marked busy, splicing them
704 * to the for-dispatch list
705 */
706static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
499{ 707{
500 __blk_add_timer(rq, NULL); 708 struct blk_mq_ctx *ctx;
709 int i;
710
711 for (i = 0; i < hctx->ctx_map.map_size; i++) {
712 struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
713 unsigned int off, bit;
714
715 if (!bm->word)
716 continue;
717
718 bit = 0;
719 off = i * hctx->ctx_map.bits_per_word;
720 do {
721 bit = find_next_bit(&bm->word, bm->depth, bit);
722 if (bit >= bm->depth)
723 break;
724
725 ctx = hctx->ctxs[bit + off];
726 clear_bit(bit, &bm->word);
727 spin_lock(&ctx->lock);
728 list_splice_tail_init(&ctx->rq_list, list);
729 spin_unlock(&ctx->lock);
730
731 bit++;
732 } while (1);
733 }
501} 734}
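
flush_busy_ctxs() walks the sparse ctx map word by word; for every set bit it recovers the global software-queue index as word * bits_per_word + bit before splicing that queue's requests. A standalone sketch of the same scan over a made-up map:

#include <stdio.h>

int main(void)
{
	unsigned int bits_per_word = 8;			/* illustrative */
	unsigned long map[] = { 0x05, 0x00, 0x81 };	/* pretend ctx_map words */
	unsigned int nr_words = sizeof(map) / sizeof(map[0]);

	for (unsigned int i = 0; i < nr_words; i++) {
		unsigned int off = i * bits_per_word;

		for (unsigned int bit = 0; bit < bits_per_word; bit++) {
			if (map[i] & (1UL << bit))
				printf("busy ctx %u\n", off + bit);	/* hctx->ctxs[bit + off] */
		}
	}
	return 0;	/* prints ctx 0, 2, 16, 23 */
}
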
502 735
503/* 736/*
@@ -509,10 +742,11 @@ void blk_mq_add_timer(struct request *rq)
509static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 742static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
510{ 743{
511 struct request_queue *q = hctx->queue; 744 struct request_queue *q = hctx->queue;
512 struct blk_mq_ctx *ctx;
513 struct request *rq; 745 struct request *rq;
514 LIST_HEAD(rq_list); 746 LIST_HEAD(rq_list);
515 int bit, queued; 747 int queued;
748
749 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
516 750
517 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 751 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
518 return; 752 return;
@@ -522,15 +756,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
522 /* 756 /*
523 * Touch any software queue that has pending entries. 757 * Touch any software queue that has pending entries.
524 */ 758 */
525 for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) { 759 flush_busy_ctxs(hctx, &rq_list);
526 clear_bit(bit, hctx->ctx_map);
527 ctx = hctx->ctxs[bit];
528 BUG_ON(bit != ctx->index_hw);
529
530 spin_lock(&ctx->lock);
531 list_splice_tail_init(&ctx->rq_list, &rq_list);
532 spin_unlock(&ctx->lock);
533 }
534 760
535 /* 761 /*
536 * If we have previous entries on our dispatch list, grab them 762 * If we have previous entries on our dispatch list, grab them
@@ -544,13 +770,9 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
544 } 770 }
545 771
546 /* 772 /*
547 * Delete and return all entries from our dispatch list
548 */
549 queued = 0;
550
551 /*
552 * Now process all the entries, sending them to the driver. 773 * Now process all the entries, sending them to the driver.
553 */ 774 */
775 queued = 0;
554 while (!list_empty(&rq_list)) { 776 while (!list_empty(&rq_list)) {
555 int ret; 777 int ret;
556 778
@@ -565,13 +787,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
565 queued++; 787 queued++;
566 continue; 788 continue;
567 case BLK_MQ_RQ_QUEUE_BUSY: 789 case BLK_MQ_RQ_QUEUE_BUSY:
568 /*
569 * FIXME: we should have a mechanism to stop the queue
570 * like blk_stop_queue, otherwise we will waste cpu
571 * time
572 */
573 list_add(&rq->queuelist, &rq_list); 790 list_add(&rq->queuelist, &rq_list);
574 blk_mq_requeue_request(rq); 791 __blk_mq_requeue_request(rq);
575 break; 792 break;
576 default: 793 default:
577 pr_err("blk-mq: bad return on queue: %d\n", ret); 794 pr_err("blk-mq: bad return on queue: %d\n", ret);
@@ -601,17 +818,44 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
601 } 818 }
602} 819}
603 820
821/*
822 * It'd be great if the workqueue API had a way to pass
823 * in a mask and had some smarts for more clever placement.
824 * For now we just round-robin here, switching once every
825 * BLK_MQ_CPU_WORK_BATCH queued items.
826 */
827static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
828{
829 int cpu = hctx->next_cpu;
830
831 if (--hctx->next_cpu_batch <= 0) {
832 int next_cpu;
833
834 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
835 if (next_cpu >= nr_cpu_ids)
836 next_cpu = cpumask_first(hctx->cpumask);
837
838 hctx->next_cpu = next_cpu;
839 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
840 }
841
842 return cpu;
843}
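
The helper above reuses the same CPU for BLK_MQ_CPU_WORK_BATCH pieces of work before stepping to the next CPU in the hardware queue's mask, wrapping around at the end. A userspace sketch of that policy (the mask and batch size below are invented):

#include <stdio.h>

#define BATCH	8	/* stands in for BLK_MQ_CPU_WORK_BATCH */

int main(void)
{
	int mask[] = { 1, 4, 6 };	/* pretend hctx->cpumask */
	int nr = sizeof(mask) / sizeof(mask[0]);
	int cur = 0, batch = BATCH;

	for (int work = 0; work < 20; work++) {
		int cpu = mask[cur];		/* hand out the current CPU... */

		if (--batch <= 0) {		/* ...and advance after a full batch */
			cur = (cur + 1) % nr;
			batch = BATCH;
		}
		printf("work %2d -> cpu %d\n", work, cpu);
	}
	return 0;
}
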
844
604void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 845void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
605{ 846{
606 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 847 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
607 return; 848 return;
608 849
609 if (!async) 850 if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
610 __blk_mq_run_hw_queue(hctx); 851 __blk_mq_run_hw_queue(hctx);
852 else if (hctx->queue->nr_hw_queues == 1)
853 kblockd_schedule_delayed_work(&hctx->run_work, 0);
611 else { 854 else {
612 struct request_queue *q = hctx->queue; 855 unsigned int cpu;
613 856
614 kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0); 857 cpu = blk_mq_hctx_next_cpu(hctx);
858 kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
615 } 859 }
616} 860}
617 861
@@ -626,14 +870,17 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
626 test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 870 test_bit(BLK_MQ_S_STOPPED, &hctx->state))
627 continue; 871 continue;
628 872
873 preempt_disable();
629 blk_mq_run_hw_queue(hctx, async); 874 blk_mq_run_hw_queue(hctx, async);
875 preempt_enable();
630 } 876 }
631} 877}
632EXPORT_SYMBOL(blk_mq_run_queues); 878EXPORT_SYMBOL(blk_mq_run_queues);
633 879
634void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 880void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
635{ 881{
636 cancel_delayed_work(&hctx->delayed_work); 882 cancel_delayed_work(&hctx->run_work);
883 cancel_delayed_work(&hctx->delay_work);
637 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 884 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
638} 885}
639EXPORT_SYMBOL(blk_mq_stop_hw_queue); 886EXPORT_SYMBOL(blk_mq_stop_hw_queue);
@@ -651,11 +898,25 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queues);
651void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 898void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
652{ 899{
653 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 900 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
901
902 preempt_disable();
654 __blk_mq_run_hw_queue(hctx); 903 __blk_mq_run_hw_queue(hctx);
904 preempt_enable();
655} 905}
656EXPORT_SYMBOL(blk_mq_start_hw_queue); 906EXPORT_SYMBOL(blk_mq_start_hw_queue);
657 907
658void blk_mq_start_stopped_hw_queues(struct request_queue *q) 908void blk_mq_start_hw_queues(struct request_queue *q)
909{
910 struct blk_mq_hw_ctx *hctx;
911 int i;
912
913 queue_for_each_hw_ctx(q, hctx, i)
914 blk_mq_start_hw_queue(hctx);
915}
916EXPORT_SYMBOL(blk_mq_start_hw_queues);
917
918
919void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
659{ 920{
660 struct blk_mq_hw_ctx *hctx; 921 struct blk_mq_hw_ctx *hctx;
661 int i; 922 int i;
@@ -665,19 +926,47 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q)
665 continue; 926 continue;
666 927
667 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 928 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
668 blk_mq_run_hw_queue(hctx, true); 929 preempt_disable();
930 blk_mq_run_hw_queue(hctx, async);
931 preempt_enable();
669 } 932 }
670} 933}
671EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 934EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
672 935
673static void blk_mq_work_fn(struct work_struct *work) 936static void blk_mq_run_work_fn(struct work_struct *work)
674{ 937{
675 struct blk_mq_hw_ctx *hctx; 938 struct blk_mq_hw_ctx *hctx;
676 939
677 hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work); 940 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
941
678 __blk_mq_run_hw_queue(hctx); 942 __blk_mq_run_hw_queue(hctx);
679} 943}
680 944
945static void blk_mq_delay_work_fn(struct work_struct *work)
946{
947 struct blk_mq_hw_ctx *hctx;
948
949 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
950
951 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
952 __blk_mq_run_hw_queue(hctx);
953}
954
955void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
956{
957 unsigned long tmo = msecs_to_jiffies(msecs);
958
959 if (hctx->queue->nr_hw_queues == 1)
960 kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
961 else {
962 unsigned int cpu;
963
964 cpu = blk_mq_hctx_next_cpu(hctx);
965 kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
966 }
967}
968EXPORT_SYMBOL(blk_mq_delay_queue);
969
681static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 970static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
682 struct request *rq, bool at_head) 971 struct request *rq, bool at_head)
683{ 972{
@@ -689,12 +978,13 @@ static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
689 list_add(&rq->queuelist, &ctx->rq_list); 978 list_add(&rq->queuelist, &ctx->rq_list);
690 else 979 else
691 list_add_tail(&rq->queuelist, &ctx->rq_list); 980 list_add_tail(&rq->queuelist, &ctx->rq_list);
981
692 blk_mq_hctx_mark_pending(hctx, ctx); 982 blk_mq_hctx_mark_pending(hctx, ctx);
693 983
694 /* 984 /*
695 * We do this early, to ensure we are on the right CPU. 985 * We do this early, to ensure we are on the right CPU.
696 */ 986 */
697 blk_mq_add_timer(rq); 987 blk_add_timer(rq);
698} 988}
699 989
700void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, 990void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
@@ -719,10 +1009,10 @@ void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
719 spin_unlock(&ctx->lock); 1009 spin_unlock(&ctx->lock);
720 } 1010 }
721 1011
722 blk_mq_put_ctx(current_ctx);
723
724 if (run_queue) 1012 if (run_queue)
725 blk_mq_run_hw_queue(hctx, async); 1013 blk_mq_run_hw_queue(hctx, async);
1014
1015 blk_mq_put_ctx(current_ctx);
726} 1016}
727 1017
728static void blk_mq_insert_requests(struct request_queue *q, 1018static void blk_mq_insert_requests(struct request_queue *q,
@@ -758,9 +1048,8 @@ static void blk_mq_insert_requests(struct request_queue *q,
758 } 1048 }
759 spin_unlock(&ctx->lock); 1049 spin_unlock(&ctx->lock);
760 1050
761 blk_mq_put_ctx(current_ctx);
762
763 blk_mq_run_hw_queue(hctx, from_schedule); 1051 blk_mq_run_hw_queue(hctx, from_schedule);
1052 blk_mq_put_ctx(current_ctx);
764} 1053}
765 1054
766static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 1055static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -823,24 +1112,169 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
823static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1112static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
824{ 1113{
825 init_request_from_bio(rq, bio); 1114 init_request_from_bio(rq, bio);
826 blk_account_io_start(rq, 1); 1115
1116 if (blk_do_io_stat(rq)) {
1117 rq->start_time = jiffies;
1118 blk_account_io_start(rq, 1);
1119 }
827} 1120}
828 1121
829static void blk_mq_make_request(struct request_queue *q, struct bio *bio) 1122static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
1123 struct blk_mq_ctx *ctx,
1124 struct request *rq, struct bio *bio)
1125{
1126 struct request_queue *q = hctx->queue;
1127
1128 if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE)) {
1129 blk_mq_bio_to_request(rq, bio);
1130 spin_lock(&ctx->lock);
1131insert_rq:
1132 __blk_mq_insert_request(hctx, rq, false);
1133 spin_unlock(&ctx->lock);
1134 return false;
1135 } else {
1136 spin_lock(&ctx->lock);
1137 if (!blk_mq_attempt_merge(q, ctx, bio)) {
1138 blk_mq_bio_to_request(rq, bio);
1139 goto insert_rq;
1140 }
1141
1142 spin_unlock(&ctx->lock);
1143 __blk_mq_free_request(hctx, ctx, rq);
1144 return true;
1145 }
1146}
1147
1148struct blk_map_ctx {
1149 struct blk_mq_hw_ctx *hctx;
1150 struct blk_mq_ctx *ctx;
1151};
1152
1153static struct request *blk_mq_map_request(struct request_queue *q,
1154 struct bio *bio,
1155 struct blk_map_ctx *data)
830{ 1156{
831 struct blk_mq_hw_ctx *hctx; 1157 struct blk_mq_hw_ctx *hctx;
832 struct blk_mq_ctx *ctx; 1158 struct blk_mq_ctx *ctx;
1159 struct request *rq;
1160 int rw = bio_data_dir(bio);
1161
1162 if (unlikely(blk_mq_queue_enter(q))) {
1163 bio_endio(bio, -EIO);
1164 return NULL;
1165 }
1166
1167 ctx = blk_mq_get_ctx(q);
1168 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1169
1170 if (rw_is_sync(bio->bi_rw))
1171 rw |= REQ_SYNC;
1172
1173 trace_block_getrq(q, bio, rw);
1174 rq = __blk_mq_alloc_request(q, hctx, ctx, rw, GFP_ATOMIC, false);
1175 if (unlikely(!rq)) {
1176 __blk_mq_run_hw_queue(hctx);
1177 blk_mq_put_ctx(ctx);
1178 trace_block_sleeprq(q, bio, rw);
1179
1180 ctx = blk_mq_get_ctx(q);
1181 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1182 rq = __blk_mq_alloc_request(q, hctx, ctx, rw,
1183 __GFP_WAIT|GFP_ATOMIC, false);
1184 }
1185
1186 hctx->queued++;
1187 data->hctx = hctx;
1188 data->ctx = ctx;
1189 return rq;
1190}
1191
1192/*
1193 * Multiple hardware queue variant. This will not use per-process plugs,
1194 * but will attempt to bypass the hctx queueing if we can go straight to
1195 * hardware for SYNC IO.
1196 */
1197static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1198{
833 const int is_sync = rw_is_sync(bio->bi_rw); 1199 const int is_sync = rw_is_sync(bio->bi_rw);
834 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1200 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
835 int rw = bio_data_dir(bio); 1201 struct blk_map_ctx data;
836 struct request *rq; 1202 struct request *rq;
1203
1204 blk_queue_bounce(q, &bio);
1205
1206 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1207 bio_endio(bio, -EIO);
1208 return;
1209 }
1210
1211 rq = blk_mq_map_request(q, bio, &data);
1212 if (unlikely(!rq))
1213 return;
1214
1215 if (unlikely(is_flush_fua)) {
1216 blk_mq_bio_to_request(rq, bio);
1217 blk_insert_flush(rq);
1218 goto run_queue;
1219 }
1220
1221 if (is_sync) {
1222 int ret;
1223
1224 blk_mq_bio_to_request(rq, bio);
1225 blk_mq_start_request(rq, true);
1226 blk_add_timer(rq);
1227
1228 /*
1229 * For OK queue, we are done. For error, kill it. Any other
1230 * error (busy), just add it to our list as we previously
1231 * would have done
1232 */
1233 ret = q->mq_ops->queue_rq(data.hctx, rq);
1234 if (ret == BLK_MQ_RQ_QUEUE_OK)
1235 goto done;
1236 else {
1237 __blk_mq_requeue_request(rq);
1238
1239 if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1240 rq->errors = -EIO;
1241 blk_mq_end_io(rq, rq->errors);
1242 goto done;
1243 }
1244 }
1245 }
1246
1247 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1248 /*
1249 * For a SYNC request, send it to the hardware immediately. For
1250 * an ASYNC request, just ensure that we run it later on. The
1251 * latter allows for merging opportunities and more efficient
1252 * dispatching.
1253 */
1254run_queue:
1255 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1256 }
1257done:
1258 blk_mq_put_ctx(data.ctx);
1259}
1260
1261/*
1262 * Single hardware queue variant. This will attempt to use any per-process
1263 * plug for merging and IO deferral.
1264 */
1265static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1266{
1267 const int is_sync = rw_is_sync(bio->bi_rw);
1268 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
837 unsigned int use_plug, request_count = 0; 1269 unsigned int use_plug, request_count = 0;
1270 struct blk_map_ctx data;
1271 struct request *rq;
838 1272
839 /* 1273 /*
840 * If we have multiple hardware queues, just go directly to 1274 * If we have multiple hardware queues, just go directly to
841 * one of those for sync IO. 1275 * one of those for sync IO.
842 */ 1276 */
843 use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync); 1277 use_plug = !is_flush_fua && !is_sync;
844 1278
845 blk_queue_bounce(q, &bio); 1279 blk_queue_bounce(q, &bio);
846 1280
@@ -849,37 +1283,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
849 return; 1283 return;
850 } 1284 }
851 1285
852 if (use_plug && blk_attempt_plug_merge(q, bio, &request_count)) 1286 if (use_plug && !blk_queue_nomerges(q) &&
1287 blk_attempt_plug_merge(q, bio, &request_count))
853 return; 1288 return;
854 1289
855 if (blk_mq_queue_enter(q)) { 1290 rq = blk_mq_map_request(q, bio, &data);
856 bio_endio(bio, -EIO);
857 return;
858 }
859
860 ctx = blk_mq_get_ctx(q);
861 hctx = q->mq_ops->map_queue(q, ctx->cpu);
862
863 if (is_sync)
864 rw |= REQ_SYNC;
865 trace_block_getrq(q, bio, rw);
866 rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
867 if (likely(rq))
868 blk_mq_rq_ctx_init(q, ctx, rq, rw);
869 else {
870 blk_mq_put_ctx(ctx);
871 trace_block_sleeprq(q, bio, rw);
872 rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
873 false);
874 ctx = rq->mq_ctx;
875 hctx = q->mq_ops->map_queue(q, ctx->cpu);
876 }
877
878 hctx->queued++;
879 1291
880 if (unlikely(is_flush_fua)) { 1292 if (unlikely(is_flush_fua)) {
881 blk_mq_bio_to_request(rq, bio); 1293 blk_mq_bio_to_request(rq, bio);
882 blk_mq_put_ctx(ctx);
883 blk_insert_flush(rq); 1294 blk_insert_flush(rq);
884 goto run_queue; 1295 goto run_queue;
885 } 1296 }
@@ -901,31 +1312,23 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
901 trace_block_plug(q); 1312 trace_block_plug(q);
902 } 1313 }
903 list_add_tail(&rq->queuelist, &plug->mq_list); 1314 list_add_tail(&rq->queuelist, &plug->mq_list);
904 blk_mq_put_ctx(ctx); 1315 blk_mq_put_ctx(data.ctx);
905 return; 1316 return;
906 } 1317 }
907 } 1318 }
908 1319
909 spin_lock(&ctx->lock); 1320 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
910 1321 /*
911 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 1322 * For a SYNC request, send it to the hardware immediately. For
912 blk_mq_attempt_merge(q, ctx, bio)) 1323 * an ASYNC request, just ensure that we run it later on. The
913 __blk_mq_free_request(hctx, ctx, rq); 1324 * latter allows for merging opportunities and more efficient
914 else { 1325 * dispatching.
915 blk_mq_bio_to_request(rq, bio); 1326 */
916 __blk_mq_insert_request(hctx, rq, false); 1327run_queue:
1328 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
917 } 1329 }
918 1330
919 spin_unlock(&ctx->lock); 1331 blk_mq_put_ctx(data.ctx);
920 blk_mq_put_ctx(ctx);
921
922 /*
923 * For a SYNC request, send it to the hardware immediately. For an
924 * ASYNC request, just ensure that we run it later on. The latter
925 * allows for merging opportunities and more efficient dispatching.
926 */
927run_queue:
928 blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
929} 1332}
930 1333
931/* 1334/*
@@ -937,32 +1340,153 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
937} 1340}
938EXPORT_SYMBOL(blk_mq_map_queue); 1341EXPORT_SYMBOL(blk_mq_map_queue);
939 1342
940struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg, 1343static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
941 unsigned int hctx_index) 1344 struct blk_mq_tags *tags, unsigned int hctx_idx)
942{ 1345{
943 return kmalloc_node(sizeof(struct blk_mq_hw_ctx), 1346 struct page *page;
944 GFP_KERNEL | __GFP_ZERO, reg->numa_node); 1347
1348 if (tags->rqs && set->ops->exit_request) {
1349 int i;
1350
1351 for (i = 0; i < tags->nr_tags; i++) {
1352 if (!tags->rqs[i])
1353 continue;
1354 set->ops->exit_request(set->driver_data, tags->rqs[i],
1355 hctx_idx, i);
1356 }
1357 }
1358
1359 while (!list_empty(&tags->page_list)) {
1360 page = list_first_entry(&tags->page_list, struct page, lru);
1361 list_del_init(&page->lru);
1362 __free_pages(page, page->private);
1363 }
1364
1365 kfree(tags->rqs);
1366
1367 blk_mq_free_tags(tags);
945} 1368}
946EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
947 1369
948void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx, 1370static size_t order_to_size(unsigned int order)
949 unsigned int hctx_index)
950{ 1371{
951 kfree(hctx); 1372 return (size_t)PAGE_SIZE << order;
952} 1373}
953EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
954 1374
955static void blk_mq_hctx_notify(void *data, unsigned long action, 1375static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
956 unsigned int cpu) 1376 unsigned int hctx_idx)
1377{
1378 struct blk_mq_tags *tags;
1379 unsigned int i, j, entries_per_page, max_order = 4;
1380 size_t rq_size, left;
1381
1382 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
1383 set->numa_node);
1384 if (!tags)
1385 return NULL;
1386
1387 INIT_LIST_HEAD(&tags->page_list);
1388
1389 tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
1390 GFP_KERNEL, set->numa_node);
1391 if (!tags->rqs) {
1392 blk_mq_free_tags(tags);
1393 return NULL;
1394 }
1395
1396 /*
1397 * rq_size is the size of the request plus driver payload, rounded
1398 * to the cacheline size
1399 */
1400 rq_size = round_up(sizeof(struct request) + set->cmd_size,
1401 cache_line_size());
1402 left = rq_size * set->queue_depth;
1403
1404 for (i = 0; i < set->queue_depth; ) {
1405 int this_order = max_order;
1406 struct page *page;
1407 int to_do;
1408 void *p;
1409
1410 while (left < order_to_size(this_order - 1) && this_order)
1411 this_order--;
1412
1413 do {
1414 page = alloc_pages_node(set->numa_node, GFP_KERNEL,
1415 this_order);
1416 if (page)
1417 break;
1418 if (!this_order--)
1419 break;
1420 if (order_to_size(this_order) < rq_size)
1421 break;
1422 } while (1);
1423
1424 if (!page)
1425 goto fail;
1426
1427 page->private = this_order;
1428 list_add_tail(&page->lru, &tags->page_list);
1429
1430 p = page_address(page);
1431 entries_per_page = order_to_size(this_order) / rq_size;
1432 to_do = min(entries_per_page, set->queue_depth - i);
1433 left -= to_do * rq_size;
1434 for (j = 0; j < to_do; j++) {
1435 tags->rqs[i] = p;
1436 if (set->ops->init_request) {
1437 if (set->ops->init_request(set->driver_data,
1438 tags->rqs[i], hctx_idx, i,
1439 set->numa_node))
1440 goto fail;
1441 }
1442
1443 p += rq_size;
1444 i++;
1445 }
1446 }
1447
1448 return tags;
1449
1450fail:
1451 pr_warn("%s: failed to allocate requests\n", __func__);
1452 blk_mq_free_rq_map(set, tags, hctx_idx);
1453 return NULL;
1454}
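
The allocation loop above rounds each request plus driver pdu up to a cache line, then carves requests out of power-of-two chunks no bigger than what is still needed, so large queue depths avoid one giant allocation. A userspace sketch of just the sizing math (all sizes are made up):

#include <stdio.h>

#define PAGE_SZ		4096UL
#define CACHE_LINE	64UL

static unsigned long order_to_size(unsigned int order)
{
	return PAGE_SZ << order;
}

int main(void)
{
	unsigned long request_size = 384;	/* stand-in for sizeof(struct request) */
	unsigned long cmd_size = 200;		/* pretend driver pdu size */
	unsigned int queue_depth = 128, allocated = 0, max_order = 4;

	/* rq_size = round_up(sizeof(struct request) + cmd_size, cache_line_size()) */
	unsigned long rq_size = (request_size + cmd_size + CACHE_LINE - 1) & ~(CACHE_LINE - 1);
	unsigned long left = rq_size * queue_depth;

	while (allocated < queue_depth) {
		unsigned int order = max_order;

		/* don't grab a chunk much bigger than what is still needed */
		while (order && left < order_to_size(order - 1))
			order--;

		unsigned int per_chunk = order_to_size(order) / rq_size;
		unsigned int to_do = per_chunk < queue_depth - allocated ?
					per_chunk : queue_depth - allocated;

		printf("order-%u chunk: %u requests\n", order, to_do);
		left -= (unsigned long)to_do * rq_size;
		allocated += to_do;
	}
	return 0;	/* order-4 chunk: 102 requests, order-3 chunk: 26 requests */
}
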
1455
1456static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
1457{
1458 kfree(bitmap->map);
1459}
1460
1461static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
1462{
1463 unsigned int bpw = 8, total, num_maps, i;
1464
1465 bitmap->bits_per_word = bpw;
1466
1467 num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
1468 bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
1469 GFP_KERNEL, node);
1470 if (!bitmap->map)
1471 return -ENOMEM;
1472
1473 bitmap->map_size = num_maps;
1474
1475 total = nr_cpu_ids;
1476 for (i = 0; i < num_maps; i++) {
1477 bitmap->map[i].depth = min(total, bitmap->bits_per_word);
1478 total -= bitmap->map[i].depth;
1479 }
1480
1481 return 0;
1482}
1483
1484static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
957{ 1485{
958 struct blk_mq_hw_ctx *hctx = data;
959 struct request_queue *q = hctx->queue; 1486 struct request_queue *q = hctx->queue;
960 struct blk_mq_ctx *ctx; 1487 struct blk_mq_ctx *ctx;
961 LIST_HEAD(tmp); 1488 LIST_HEAD(tmp);
962 1489
963 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
964 return;
965
966 /* 1490 /*
967 * Move ctx entries to new CPU, if this one is going away. 1491 * Move ctx entries to new CPU, if this one is going away.
968 */ 1492 */
@@ -971,12 +1495,12 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
971 spin_lock(&ctx->lock); 1495 spin_lock(&ctx->lock);
972 if (!list_empty(&ctx->rq_list)) { 1496 if (!list_empty(&ctx->rq_list)) {
973 list_splice_init(&ctx->rq_list, &tmp); 1497 list_splice_init(&ctx->rq_list, &tmp);
974 clear_bit(ctx->index_hw, hctx->ctx_map); 1498 blk_mq_hctx_clear_pending(hctx, ctx);
975 } 1499 }
976 spin_unlock(&ctx->lock); 1500 spin_unlock(&ctx->lock);
977 1501
978 if (list_empty(&tmp)) 1502 if (list_empty(&tmp))
979 return; 1503 return NOTIFY_OK;
980 1504
981 ctx = blk_mq_get_ctx(q); 1505 ctx = blk_mq_get_ctx(q);
982 spin_lock(&ctx->lock); 1506 spin_lock(&ctx->lock);
@@ -993,210 +1517,103 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
993 blk_mq_hctx_mark_pending(hctx, ctx); 1517 blk_mq_hctx_mark_pending(hctx, ctx);
994 1518
995 spin_unlock(&ctx->lock); 1519 spin_unlock(&ctx->lock);
996 blk_mq_put_ctx(ctx);
997 1520
998 blk_mq_run_hw_queue(hctx, true); 1521 blk_mq_run_hw_queue(hctx, true);
1522 blk_mq_put_ctx(ctx);
1523 return NOTIFY_OK;
999} 1524}
1000 1525
1001static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx, 1526static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
1002 int (*init)(void *, struct blk_mq_hw_ctx *,
1003 struct request *, unsigned int),
1004 void *data)
1005{ 1527{
1006 unsigned int i; 1528 struct request_queue *q = hctx->queue;
1007 int ret = 0; 1529 struct blk_mq_tag_set *set = q->tag_set;
1008
1009 for (i = 0; i < hctx->queue_depth; i++) {
1010 struct request *rq = hctx->rqs[i];
1011
1012 ret = init(data, hctx, rq, i);
1013 if (ret)
1014 break;
1015 }
1016
1017 return ret;
1018}
1019 1530
1020int blk_mq_init_commands(struct request_queue *q, 1531 if (set->tags[hctx->queue_num])
1021 int (*init)(void *, struct blk_mq_hw_ctx *, 1532 return NOTIFY_OK;
1022 struct request *, unsigned int),
1023 void *data)
1024{
1025 struct blk_mq_hw_ctx *hctx;
1026 unsigned int i;
1027 int ret = 0;
1028 1533
1029 queue_for_each_hw_ctx(q, hctx, i) { 1534 set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
1030 ret = blk_mq_init_hw_commands(hctx, init, data); 1535 if (!set->tags[hctx->queue_num])
1031 if (ret) 1536 return NOTIFY_STOP;
1032 break;
1033 }
1034 1537
1035 return ret; 1538 hctx->tags = set->tags[hctx->queue_num];
1539 return NOTIFY_OK;
1036} 1540}
1037EXPORT_SYMBOL(blk_mq_init_commands);
1038 1541
1039static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx, 1542static int blk_mq_hctx_notify(void *data, unsigned long action,
1040 void (*free)(void *, struct blk_mq_hw_ctx *, 1543 unsigned int cpu)
1041 struct request *, unsigned int),
1042 void *data)
1043{ 1544{
1044 unsigned int i; 1545 struct blk_mq_hw_ctx *hctx = data;
1045 1546
1046 for (i = 0; i < hctx->queue_depth; i++) { 1547 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1047 struct request *rq = hctx->rqs[i]; 1548 return blk_mq_hctx_cpu_offline(hctx, cpu);
1549 else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
1550 return blk_mq_hctx_cpu_online(hctx, cpu);
1048 1551
1049 free(data, hctx, rq, i); 1552 return NOTIFY_OK;
1050 }
1051} 1553}
1052 1554
1053void blk_mq_free_commands(struct request_queue *q, 1555static void blk_mq_exit_hw_queues(struct request_queue *q,
1054 void (*free)(void *, struct blk_mq_hw_ctx *, 1556 struct blk_mq_tag_set *set, int nr_queue)
1055 struct request *, unsigned int),
1056 void *data)
1057{ 1557{
1058 struct blk_mq_hw_ctx *hctx; 1558 struct blk_mq_hw_ctx *hctx;
1059 unsigned int i; 1559 unsigned int i;
1060 1560
1061 queue_for_each_hw_ctx(q, hctx, i) 1561 queue_for_each_hw_ctx(q, hctx, i) {
1062 blk_mq_free_hw_commands(hctx, free, data); 1562 if (i == nr_queue)
1063} 1563 break;
1064EXPORT_SYMBOL(blk_mq_free_commands);
1065 1564
1066static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx) 1565 if (set->ops->exit_hctx)
1067{ 1566 set->ops->exit_hctx(hctx, i);
1068 struct page *page;
1069 1567
1070 while (!list_empty(&hctx->page_list)) { 1568 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1071 page = list_first_entry(&hctx->page_list, struct page, lru); 1569 kfree(hctx->ctxs);
1072 list_del_init(&page->lru); 1570 blk_mq_free_bitmap(&hctx->ctx_map);
1073 __free_pages(page, page->private);
1074 } 1571 }
1075 1572
1076 kfree(hctx->rqs);
1077
1078 if (hctx->tags)
1079 blk_mq_free_tags(hctx->tags);
1080}
1081
1082static size_t order_to_size(unsigned int order)
1083{
1084 size_t ret = PAGE_SIZE;
1085
1086 while (order--)
1087 ret *= 2;
1088
1089 return ret;
1090} 1573}
1091 1574
1092static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx, 1575static void blk_mq_free_hw_queues(struct request_queue *q,
1093 unsigned int reserved_tags, int node) 1576 struct blk_mq_tag_set *set)
1094{ 1577{
1095 unsigned int i, j, entries_per_page, max_order = 4; 1578 struct blk_mq_hw_ctx *hctx;
1096 size_t rq_size, left; 1579 unsigned int i;
1097
1098 INIT_LIST_HEAD(&hctx->page_list);
1099
1100 hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
1101 GFP_KERNEL, node);
1102 if (!hctx->rqs)
1103 return -ENOMEM;
1104
1105 /*
1106 * rq_size is the size of the request plus driver payload, rounded
1107 * to the cacheline size
1108 */
1109 rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
1110 cache_line_size());
1111 left = rq_size * hctx->queue_depth;
1112
1113 for (i = 0; i < hctx->queue_depth;) {
1114 int this_order = max_order;
1115 struct page *page;
1116 int to_do;
1117 void *p;
1118
1119 while (left < order_to_size(this_order - 1) && this_order)
1120 this_order--;
1121
1122 do {
1123 page = alloc_pages_node(node, GFP_KERNEL, this_order);
1124 if (page)
1125 break;
1126 if (!this_order--)
1127 break;
1128 if (order_to_size(this_order) < rq_size)
1129 break;
1130 } while (1);
1131
1132 if (!page)
1133 break;
1134
1135 page->private = this_order;
1136 list_add_tail(&page->lru, &hctx->page_list);
1137
1138 p = page_address(page);
1139 entries_per_page = order_to_size(this_order) / rq_size;
1140 to_do = min(entries_per_page, hctx->queue_depth - i);
1141 left -= to_do * rq_size;
1142 for (j = 0; j < to_do; j++) {
1143 hctx->rqs[i] = p;
1144 blk_mq_rq_init(hctx, hctx->rqs[i]);
1145 p += rq_size;
1146 i++;
1147 }
1148 }
1149
1150 if (i < (reserved_tags + BLK_MQ_TAG_MIN))
1151 goto err_rq_map;
1152 else if (i != hctx->queue_depth) {
1153 hctx->queue_depth = i;
1154 pr_warn("%s: queue depth set to %u because of low memory\n",
1155 __func__, i);
1156 }
1157 1580
1158 hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node); 1581 queue_for_each_hw_ctx(q, hctx, i) {
1159 if (!hctx->tags) { 1582 free_cpumask_var(hctx->cpumask);
1160err_rq_map: 1583 kfree(hctx);
1161 blk_mq_free_rq_map(hctx);
1162 return -ENOMEM;
1163 } 1584 }
1164
1165 return 0;
1166} 1585}
1167 1586
1168static int blk_mq_init_hw_queues(struct request_queue *q, 1587static int blk_mq_init_hw_queues(struct request_queue *q,
1169 struct blk_mq_reg *reg, void *driver_data) 1588 struct blk_mq_tag_set *set)
1170{ 1589{
1171 struct blk_mq_hw_ctx *hctx; 1590 struct blk_mq_hw_ctx *hctx;
1172 unsigned int i, j; 1591 unsigned int i;
1173 1592
1174 /* 1593 /*
1175 * Initialize hardware queues 1594 * Initialize hardware queues
1176 */ 1595 */
1177 queue_for_each_hw_ctx(q, hctx, i) { 1596 queue_for_each_hw_ctx(q, hctx, i) {
1178 unsigned int num_maps;
1179 int node; 1597 int node;
1180 1598
1181 node = hctx->numa_node; 1599 node = hctx->numa_node;
1182 if (node == NUMA_NO_NODE) 1600 if (node == NUMA_NO_NODE)
1183 node = hctx->numa_node = reg->numa_node; 1601 node = hctx->numa_node = set->numa_node;
1184 1602
1185 INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn); 1603 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1604 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1186 spin_lock_init(&hctx->lock); 1605 spin_lock_init(&hctx->lock);
1187 INIT_LIST_HEAD(&hctx->dispatch); 1606 INIT_LIST_HEAD(&hctx->dispatch);
1188 hctx->queue = q; 1607 hctx->queue = q;
1189 hctx->queue_num = i; 1608 hctx->queue_num = i;
1190 hctx->flags = reg->flags; 1609 hctx->flags = set->flags;
1191 hctx->queue_depth = reg->queue_depth; 1610 hctx->cmd_size = set->cmd_size;
1192 hctx->cmd_size = reg->cmd_size;
1193 1611
1194 blk_mq_init_cpu_notifier(&hctx->cpu_notifier, 1612 blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1195 blk_mq_hctx_notify, hctx); 1613 blk_mq_hctx_notify, hctx);
1196 blk_mq_register_cpu_notifier(&hctx->cpu_notifier); 1614 blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1197 1615
1198 if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node)) 1616 hctx->tags = set->tags[i];
1199 break;
1200 1617
1201 /* 1618 /*
1202 * Allocate space for all possible cpus to avoid allocation in 1619 * Allocate space for all possible cpus to avoid allocation in
@@ -1207,17 +1624,13 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
1207 if (!hctx->ctxs) 1624 if (!hctx->ctxs)
1208 break; 1625 break;
1209 1626
1210 num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG; 1627 if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
1211 hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
1212 GFP_KERNEL, node);
1213 if (!hctx->ctx_map)
1214 break; 1628 break;
1215 1629
1216 hctx->nr_ctx_map = num_maps;
1217 hctx->nr_ctx = 0; 1630 hctx->nr_ctx = 0;
1218 1631
1219 if (reg->ops->init_hctx && 1632 if (set->ops->init_hctx &&
1220 reg->ops->init_hctx(hctx, driver_data, i)) 1633 set->ops->init_hctx(hctx, set->driver_data, i))
1221 break; 1634 break;
1222 } 1635 }
1223 1636
@@ -1227,17 +1640,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
1227 /* 1640 /*
1228 * Init failed 1641 * Init failed
1229 */ 1642 */
1230 queue_for_each_hw_ctx(q, hctx, j) { 1643 blk_mq_exit_hw_queues(q, set, i);
1231 if (i == j)
1232 break;
1233
1234 if (reg->ops->exit_hctx)
1235 reg->ops->exit_hctx(hctx, j);
1236
1237 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1238 blk_mq_free_rq_map(hctx);
1239 kfree(hctx->ctxs);
1240 }
1241 1644
1242 return 1; 1645 return 1;
1243} 1646}
@@ -1258,12 +1661,13 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
1258 __ctx->queue = q; 1661 __ctx->queue = q;
1259 1662
1260 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1663 /* If the cpu isn't online, the cpu is mapped to first hctx */
1261 hctx = q->mq_ops->map_queue(q, i);
1262 hctx->nr_ctx++;
1263
1264 if (!cpu_online(i)) 1664 if (!cpu_online(i))
1265 continue; 1665 continue;
1266 1666
1667 hctx = q->mq_ops->map_queue(q, i);
1668 cpumask_set_cpu(i, hctx->cpumask);
1669 hctx->nr_ctx++;
1670
1267 /* 1671 /*
1268 * Set local node, IFF we have more than one hw queue. If 1672 * Set local node, IFF we have more than one hw queue. If
1269 * not, we remain on the home node of the device 1673 * not, we remain on the home node of the device
@@ -1280,6 +1684,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1280 struct blk_mq_ctx *ctx; 1684 struct blk_mq_ctx *ctx;
1281 1685
1282 queue_for_each_hw_ctx(q, hctx, i) { 1686 queue_for_each_hw_ctx(q, hctx, i) {
1687 cpumask_clear(hctx->cpumask);
1283 hctx->nr_ctx = 0; 1688 hctx->nr_ctx = 0;
1284 } 1689 }
1285 1690
@@ -1288,115 +1693,208 @@ static void blk_mq_map_swqueue(struct request_queue *q)
1288 */ 1693 */
1289 queue_for_each_ctx(q, ctx, i) { 1694 queue_for_each_ctx(q, ctx, i) {
1290 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1695 /* If the cpu isn't online, the cpu is mapped to first hctx */
1696 if (!cpu_online(i))
1697 continue;
1698
1291 hctx = q->mq_ops->map_queue(q, i); 1699 hctx = q->mq_ops->map_queue(q, i);
1700 cpumask_set_cpu(i, hctx->cpumask);
1292 ctx->index_hw = hctx->nr_ctx; 1701 ctx->index_hw = hctx->nr_ctx;
1293 hctx->ctxs[hctx->nr_ctx++] = ctx; 1702 hctx->ctxs[hctx->nr_ctx++] = ctx;
1294 } 1703 }
1704
1705 queue_for_each_hw_ctx(q, hctx, i) {
1706 /*
 1707 * If no software queues are mapped to this hardware queue,
1708 * disable it and free the request entries
1709 */
1710 if (!hctx->nr_ctx) {
1711 struct blk_mq_tag_set *set = q->tag_set;
1712
1713 if (set->tags[i]) {
1714 blk_mq_free_rq_map(set, set->tags[i], i);
1715 set->tags[i] = NULL;
1716 hctx->tags = NULL;
1717 }
1718 continue;
1719 }
1720
1721 /*
1722 * Initialize batch roundrobin counts
1723 */
1724 hctx->next_cpu = cpumask_first(hctx->cpumask);
1725 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1726 }
1295} 1727}
1296 1728
1297struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, 1729static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
1298 void *driver_data)
1299{ 1730{
1300 struct blk_mq_hw_ctx **hctxs; 1731 struct blk_mq_hw_ctx *hctx;
1301 struct blk_mq_ctx *ctx;
1302 struct request_queue *q; 1732 struct request_queue *q;
1733 bool shared;
1303 int i; 1734 int i;
1304 1735
1305 if (!reg->nr_hw_queues || 1736 if (set->tag_list.next == set->tag_list.prev)
1306 !reg->ops->queue_rq || !reg->ops->map_queue || 1737 shared = false;
1307 !reg->ops->alloc_hctx || !reg->ops->free_hctx) 1738 else
1308 return ERR_PTR(-EINVAL); 1739 shared = true;
1740
1741 list_for_each_entry(q, &set->tag_list, tag_set_list) {
1742 blk_mq_freeze_queue(q);
1309 1743
1310 if (!reg->queue_depth) 1744 queue_for_each_hw_ctx(q, hctx, i) {
1311 reg->queue_depth = BLK_MQ_MAX_DEPTH; 1745 if (shared)
1312 else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) { 1746 hctx->flags |= BLK_MQ_F_TAG_SHARED;
1313 pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth); 1747 else
1314 reg->queue_depth = BLK_MQ_MAX_DEPTH; 1748 hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
1749 }
1750 blk_mq_unfreeze_queue(q);
1315 } 1751 }
1752}
1316 1753
1317 if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) 1754static void blk_mq_del_queue_tag_set(struct request_queue *q)
1318 return ERR_PTR(-EINVAL); 1755{
1756 struct blk_mq_tag_set *set = q->tag_set;
1757
1758 blk_mq_freeze_queue(q);
1759
1760 mutex_lock(&set->tag_list_lock);
1761 list_del_init(&q->tag_set_list);
1762 blk_mq_update_tag_set_depth(set);
1763 mutex_unlock(&set->tag_list_lock);
1764
1765 blk_mq_unfreeze_queue(q);
1766}
1767
1768static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
1769 struct request_queue *q)
1770{
1771 q->tag_set = set;
1772
1773 mutex_lock(&set->tag_list_lock);
1774 list_add_tail(&q->tag_set_list, &set->tag_list);
1775 blk_mq_update_tag_set_depth(set);
1776 mutex_unlock(&set->tag_list_lock);
1777}
1778
1779struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1780{
1781 struct blk_mq_hw_ctx **hctxs;
1782 struct blk_mq_ctx *ctx;
1783 struct request_queue *q;
1784 unsigned int *map;
1785 int i;
1319 1786
1320 ctx = alloc_percpu(struct blk_mq_ctx); 1787 ctx = alloc_percpu(struct blk_mq_ctx);
1321 if (!ctx) 1788 if (!ctx)
1322 return ERR_PTR(-ENOMEM); 1789 return ERR_PTR(-ENOMEM);
1323 1790
1324 hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, 1791 hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1325 reg->numa_node); 1792 set->numa_node);
1326 1793
1327 if (!hctxs) 1794 if (!hctxs)
1328 goto err_percpu; 1795 goto err_percpu;
1329 1796
1330 for (i = 0; i < reg->nr_hw_queues; i++) { 1797 map = blk_mq_make_queue_map(set);
1331 hctxs[i] = reg->ops->alloc_hctx(reg, i); 1798 if (!map)
1799 goto err_map;
1800
1801 for (i = 0; i < set->nr_hw_queues; i++) {
1802 int node = blk_mq_hw_queue_to_node(map, i);
1803
1804 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
1805 GFP_KERNEL, node);
1332 if (!hctxs[i]) 1806 if (!hctxs[i])
1333 goto err_hctxs; 1807 goto err_hctxs;
1334 1808
1335 hctxs[i]->numa_node = NUMA_NO_NODE; 1809 if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
1810 goto err_hctxs;
1811
1812 atomic_set(&hctxs[i]->nr_active, 0);
1813 hctxs[i]->numa_node = node;
1336 hctxs[i]->queue_num = i; 1814 hctxs[i]->queue_num = i;
1337 } 1815 }
1338 1816
1339 q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node); 1817 q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1340 if (!q) 1818 if (!q)
1341 goto err_hctxs; 1819 goto err_hctxs;
1342 1820
1343 q->mq_map = blk_mq_make_queue_map(reg); 1821 if (percpu_counter_init(&q->mq_usage_counter, 0))
1344 if (!q->mq_map)
1345 goto err_map; 1822 goto err_map;
1346 1823
1347 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); 1824 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1348 blk_queue_rq_timeout(q, 30000); 1825 blk_queue_rq_timeout(q, 30000);
1349 1826
1350 q->nr_queues = nr_cpu_ids; 1827 q->nr_queues = nr_cpu_ids;
1351 q->nr_hw_queues = reg->nr_hw_queues; 1828 q->nr_hw_queues = set->nr_hw_queues;
1829 q->mq_map = map;
1352 1830
1353 q->queue_ctx = ctx; 1831 q->queue_ctx = ctx;
1354 q->queue_hw_ctx = hctxs; 1832 q->queue_hw_ctx = hctxs;
1355 1833
1356 q->mq_ops = reg->ops; 1834 q->mq_ops = set->ops;
1357 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 1835 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
1358 1836
1837 if (!(set->flags & BLK_MQ_F_SG_MERGE))
1838 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
1839
1359 q->sg_reserved_size = INT_MAX; 1840 q->sg_reserved_size = INT_MAX;
1360 1841
1361 blk_queue_make_request(q, blk_mq_make_request); 1842 INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
1362 blk_queue_rq_timed_out(q, reg->ops->timeout); 1843 INIT_LIST_HEAD(&q->requeue_list);
1363 if (reg->timeout) 1844 spin_lock_init(&q->requeue_lock);
1364 blk_queue_rq_timeout(q, reg->timeout); 1845
1846 if (q->nr_hw_queues > 1)
1847 blk_queue_make_request(q, blk_mq_make_request);
1848 else
1849 blk_queue_make_request(q, blk_sq_make_request);
1850
1851 blk_queue_rq_timed_out(q, blk_mq_rq_timed_out);
1852 if (set->timeout)
1853 blk_queue_rq_timeout(q, set->timeout);
1854
1855 /*
1856 * Do this after blk_queue_make_request() overrides it...
1857 */
1858 q->nr_requests = set->queue_depth;
1365 1859
1366 if (reg->ops->complete) 1860 if (set->ops->complete)
1367 blk_queue_softirq_done(q, reg->ops->complete); 1861 blk_queue_softirq_done(q, set->ops->complete);
1368 1862
1369 blk_mq_init_flush(q); 1863 blk_mq_init_flush(q);
1370 blk_mq_init_cpu_queues(q, reg->nr_hw_queues); 1864 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
1371 1865
1372 q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size, 1866 q->flush_rq = kzalloc(round_up(sizeof(struct request) +
1373 cache_line_size()), GFP_KERNEL); 1867 set->cmd_size, cache_line_size()),
1868 GFP_KERNEL);
1374 if (!q->flush_rq) 1869 if (!q->flush_rq)
1375 goto err_hw; 1870 goto err_hw;
1376 1871
1377 if (blk_mq_init_hw_queues(q, reg, driver_data)) 1872 if (blk_mq_init_hw_queues(q, set))
1378 goto err_flush_rq; 1873 goto err_flush_rq;
1379 1874
1380 blk_mq_map_swqueue(q);
1381
1382 mutex_lock(&all_q_mutex); 1875 mutex_lock(&all_q_mutex);
1383 list_add_tail(&q->all_q_node, &all_q_list); 1876 list_add_tail(&q->all_q_node, &all_q_list);
1384 mutex_unlock(&all_q_mutex); 1877 mutex_unlock(&all_q_mutex);
1385 1878
1879 blk_mq_add_queue_tag_set(set, q);
1880
1881 blk_mq_map_swqueue(q);
1882
1386 return q; 1883 return q;
1387 1884
1388err_flush_rq: 1885err_flush_rq:
1389 kfree(q->flush_rq); 1886 kfree(q->flush_rq);
1390err_hw: 1887err_hw:
1391 kfree(q->mq_map);
1392err_map:
1393 blk_cleanup_queue(q); 1888 blk_cleanup_queue(q);
1394err_hctxs: 1889err_hctxs:
1395 for (i = 0; i < reg->nr_hw_queues; i++) { 1890 kfree(map);
1891 for (i = 0; i < set->nr_hw_queues; i++) {
1396 if (!hctxs[i]) 1892 if (!hctxs[i])
1397 break; 1893 break;
1398 reg->ops->free_hctx(hctxs[i], i); 1894 free_cpumask_var(hctxs[i]->cpumask);
1895 kfree(hctxs[i]);
1399 } 1896 }
1897err_map:
1400 kfree(hctxs); 1898 kfree(hctxs);
1401err_percpu: 1899err_percpu:
1402 free_percpu(ctx); 1900 free_percpu(ctx);
@@ -1406,18 +1904,14 @@ EXPORT_SYMBOL(blk_mq_init_queue);
1406 1904
1407void blk_mq_free_queue(struct request_queue *q) 1905void blk_mq_free_queue(struct request_queue *q)
1408{ 1906{
1409 struct blk_mq_hw_ctx *hctx; 1907 struct blk_mq_tag_set *set = q->tag_set;
1410 int i;
1411 1908
1412 queue_for_each_hw_ctx(q, hctx, i) { 1909 blk_mq_del_queue_tag_set(q);
1413 kfree(hctx->ctx_map); 1910
1414 kfree(hctx->ctxs); 1911 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
1415 blk_mq_free_rq_map(hctx); 1912 blk_mq_free_hw_queues(q, set);
1416 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1913
1417 if (q->mq_ops->exit_hctx) 1914 percpu_counter_destroy(&q->mq_usage_counter);
1418 q->mq_ops->exit_hctx(hctx, i);
1419 q->mq_ops->free_hctx(hctx, i);
1420 }
1421 1915
1422 free_percpu(q->queue_ctx); 1916 free_percpu(q->queue_ctx);
1423 kfree(q->queue_hw_ctx); 1917 kfree(q->queue_hw_ctx);
@@ -1437,6 +1931,8 @@ static void blk_mq_queue_reinit(struct request_queue *q)
1437{ 1931{
1438 blk_mq_freeze_queue(q); 1932 blk_mq_freeze_queue(q);
1439 1933
1934 blk_mq_sysfs_unregister(q);
1935
1440 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); 1936 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
1441 1937
1442 /* 1938 /*
@@ -1447,6 +1943,8 @@ static void blk_mq_queue_reinit(struct request_queue *q)
1447 1943
1448 blk_mq_map_swqueue(q); 1944 blk_mq_map_swqueue(q);
1449 1945
1946 blk_mq_sysfs_register(q);
1947
1450 blk_mq_unfreeze_queue(q); 1948 blk_mq_unfreeze_queue(q);
1451} 1949}
1452 1950
@@ -1456,10 +1954,10 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1456 struct request_queue *q; 1954 struct request_queue *q;
1457 1955
1458 /* 1956 /*
1459 * Before new mapping is established, hotadded cpu might already start 1957 * Before new mappings are established, hotadded cpu might already
1460 * handling requests. This doesn't break anything as we map offline 1958 * start handling requests. This doesn't break anything as we map
1461 * CPUs to first hardware queue. We will re-init queue below to get 1959 * offline CPUs to first hardware queue. We will re-init the queue
1462 * optimal settings. 1960 * below to get optimal settings.
1463 */ 1961 */
1464 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && 1962 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1465 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) 1963 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
@@ -1472,6 +1970,81 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1472 return NOTIFY_OK; 1970 return NOTIFY_OK;
1473} 1971}
1474 1972
1973int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
1974{
1975 int i;
1976
1977 if (!set->nr_hw_queues)
1978 return -EINVAL;
1979 if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH)
1980 return -EINVAL;
1981 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
1982 return -EINVAL;
1983
1984 if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
1985 return -EINVAL;
1986
1987
1988 set->tags = kmalloc_node(set->nr_hw_queues *
1989 sizeof(struct blk_mq_tags *),
1990 GFP_KERNEL, set->numa_node);
1991 if (!set->tags)
1992 goto out;
1993
1994 for (i = 0; i < set->nr_hw_queues; i++) {
1995 set->tags[i] = blk_mq_init_rq_map(set, i);
1996 if (!set->tags[i])
1997 goto out_unwind;
1998 }
1999
2000 mutex_init(&set->tag_list_lock);
2001 INIT_LIST_HEAD(&set->tag_list);
2002
2003 return 0;
2004
2005out_unwind:
2006 while (--i >= 0)
2007 blk_mq_free_rq_map(set, set->tags[i], i);
2008out:
2009 return -ENOMEM;
2010}
2011EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2012
2013void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2014{
2015 int i;
2016
2017 for (i = 0; i < set->nr_hw_queues; i++) {
2018 if (set->tags[i])
2019 blk_mq_free_rq_map(set, set->tags[i], i);
2020 }
2021
2022 kfree(set->tags);
2023}
2024EXPORT_SYMBOL(blk_mq_free_tag_set);
2025
2026int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2027{
2028 struct blk_mq_tag_set *set = q->tag_set;
2029 struct blk_mq_hw_ctx *hctx;
2030 int i, ret;
2031
2032 if (!set || nr > set->queue_depth)
2033 return -EINVAL;
2034
2035 ret = 0;
2036 queue_for_each_hw_ctx(q, hctx, i) {
2037 ret = blk_mq_tag_update_depth(hctx->tags, nr);
2038 if (ret)
2039 break;
2040 }
2041
2042 if (!ret)
2043 q->nr_requests = nr;
2044
2045 return ret;
2046}
2047
1475void blk_mq_disable_hotplug(void) 2048void blk_mq_disable_hotplug(void)
1476{ 2049{
1477 mutex_lock(&all_q_mutex); 2050 mutex_lock(&all_q_mutex);
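
A minimal sketch of the driver-facing flow after this conversion, modelled on the null_blk changes later in this same patch: the tag set is filled in and allocated once, then handed to blk_mq_init_queue(). The mydrv_* names, the single hardware queue, and the use of the stock blk_mq_map_queue helper for .map_queue are illustrative assumptions, not something these hunks define.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

struct mydrv_cmd {			/* per-request PDU, sized via set->cmd_size */
	struct request *rq;
};

struct mydrv {
	struct blk_mq_tag_set	tag_set;
	struct request_queue	*q;
};

static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

	cmd->rq = rq;
	/* hand off to hardware; completion comes later from the IRQ path */
	return BLK_MQ_RQ_QUEUE_OK;
}

static struct blk_mq_ops mydrv_mq_ops = {
	.queue_rq	= mydrv_queue_rq,
	.map_queue	= blk_mq_map_queue,	/* assumed: the stock CPU->queue map */
};

static int mydrv_init_queue(struct mydrv *drv)
{
	struct blk_mq_tag_set *set = &drv->tag_set;
	int ret;

	set->ops		= &mydrv_mq_ops;
	set->nr_hw_queues	= 1;
	set->queue_depth	= 64;
	set->numa_node		= NUMA_NO_NODE;
	set->cmd_size		= sizeof(struct mydrv_cmd);
	set->flags		= BLK_MQ_F_SHOULD_MERGE;
	set->driver_data	= drv;

	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	drv->q = blk_mq_init_queue(set);
	if (IS_ERR(drv->q)) {
		blk_mq_free_tag_set(set);
		return PTR_ERR(drv->q);
	}
	return 0;
}

static void mydrv_exit_queue(struct mydrv *drv)
{
	blk_cleanup_queue(drv->q);
	blk_mq_free_tag_set(&drv->tag_set);
}

Unlike the old blk_mq_reg interface being removed here, the tag set (and the requests blk_mq_init_rq_map() pre-allocates for it) lives outside any single request_queue; blk_mq_add_queue_tag_set() and BLK_MQ_F_TAG_SHARED above are what let several queues share one set.
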
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ebbe6bac9d61..de7b3bbd5bd6 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -1,6 +1,8 @@
1#ifndef INT_BLK_MQ_H 1#ifndef INT_BLK_MQ_H
2#define INT_BLK_MQ_H 2#define INT_BLK_MQ_H
3 3
4struct blk_mq_tag_set;
5
4struct blk_mq_ctx { 6struct blk_mq_ctx {
5 struct { 7 struct {
6 spinlock_t lock; 8 spinlock_t lock;
@@ -9,7 +11,8 @@ struct blk_mq_ctx {
9 11
10 unsigned int cpu; 12 unsigned int cpu;
11 unsigned int index_hw; 13 unsigned int index_hw;
12 unsigned int ipi_redirect; 14
15 unsigned int last_tag ____cacheline_aligned_in_smp;
13 16
14 /* incremented at dispatch time */ 17 /* incremented at dispatch time */
15 unsigned long rq_dispatched[2]; 18 unsigned long rq_dispatched[2];
@@ -20,21 +23,23 @@ struct blk_mq_ctx {
20 23
21 struct request_queue *queue; 24 struct request_queue *queue;
22 struct kobject kobj; 25 struct kobject kobj;
23}; 26} ____cacheline_aligned_in_smp;
24 27
25void __blk_mq_complete_request(struct request *rq); 28void __blk_mq_complete_request(struct request *rq);
26void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); 29void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
27void blk_mq_init_flush(struct request_queue *q); 30void blk_mq_init_flush(struct request_queue *q);
28void blk_mq_drain_queue(struct request_queue *q); 31void blk_mq_drain_queue(struct request_queue *q);
29void blk_mq_free_queue(struct request_queue *q); 32void blk_mq_free_queue(struct request_queue *q);
30void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq); 33void blk_mq_clone_flush_request(struct request *flush_rq,
34 struct request *orig_rq);
35int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
31 36
32/* 37/*
33 * CPU hotplug helpers 38 * CPU hotplug helpers
34 */ 39 */
35struct blk_mq_cpu_notifier; 40struct blk_mq_cpu_notifier;
36void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier, 41void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
37 void (*fn)(void *, unsigned long, unsigned int), 42 int (*fn)(void *, unsigned long, unsigned int),
38 void *data); 43 void *data);
39void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 44void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
40void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier); 45void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
@@ -45,10 +50,23 @@ void blk_mq_disable_hotplug(void);
45/* 50/*
46 * CPU -> queue mappings 51 * CPU -> queue mappings
47 */ 52 */
48struct blk_mq_reg; 53extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
49extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);
50extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); 54extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
55extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
51 56
52void blk_mq_add_timer(struct request *rq); 57/*
58 * sysfs helpers
59 */
60extern int blk_mq_sysfs_register(struct request_queue *q);
61extern void blk_mq_sysfs_unregister(struct request_queue *q);
62
63/*
64 * Basic implementation of a sparser bitmap, allowing the user to spread
65 * the bits over more cachelines.
66 */
67struct blk_align_bitmap {
68 unsigned long word;
69 unsigned long depth;
70} ____cacheline_aligned_in_smp;
53 71
54#endif 72#endif
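
The blk_align_bitmap added here backs the per-hctx ctx_map that blk_mq_alloc_bitmap() and blk_mq_hctx_clear_pending() manipulate earlier in the diff: each word sits on its own cacheline, so CPUs marking different software queues pending do not fight over one line. Below is a rough userspace illustration of the layout idea only; the names are invented and the kernel code uses atomic bit operations where this does not.

#define _POSIX_C_SOURCE 200112L
#include <stdbool.h>
#include <stdlib.h>

#define CACHELINE 64

struct sparse_word {
	unsigned long word;
	unsigned long depth;		/* useful bits carried by this word */
} __attribute__((aligned(CACHELINE)));

struct sparse_bitmap {
	unsigned int nr_words;
	unsigned int bits_per_word;
	struct sparse_word *map;
};

static int sparse_bitmap_init(struct sparse_bitmap *bm, unsigned int nr_bits,
			      unsigned int bits_per_word)
{
	unsigned int i;

	bm->bits_per_word = bits_per_word;
	bm->nr_words = (nr_bits + bits_per_word - 1) / bits_per_word;
	if (posix_memalign((void **)&bm->map, CACHELINE,
			   bm->nr_words * sizeof(*bm->map)))
		return -1;
	for (i = 0; i < bm->nr_words; i++) {
		bm->map[i].word = 0;
		bm->map[i].depth = bits_per_word;
	}
	return 0;
}

static void sparse_bit_set(struct sparse_bitmap *bm, unsigned int bit)
{
	/* one word (and one cacheline) per group of bits_per_word bits */
	bm->map[bit / bm->bits_per_word].word |= 1UL << (bit % bm->bits_per_word);
}

static bool sparse_bit_test(struct sparse_bitmap *bm, unsigned int bit)
{
	return bm->map[bit / bm->bits_per_word].word &
	       (1UL << (bit % bm->bits_per_word));
}
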
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7500f876dae4..23321fbab293 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -48,11 +48,10 @@ static ssize_t queue_requests_show(struct request_queue *q, char *page)
48static ssize_t 48static ssize_t
49queue_requests_store(struct request_queue *q, const char *page, size_t count) 49queue_requests_store(struct request_queue *q, const char *page, size_t count)
50{ 50{
51 struct request_list *rl;
52 unsigned long nr; 51 unsigned long nr;
53 int ret; 52 int ret, err;
54 53
55 if (!q->request_fn) 54 if (!q->request_fn && !q->mq_ops)
56 return -EINVAL; 55 return -EINVAL;
57 56
58 ret = queue_var_store(&nr, page, count); 57 ret = queue_var_store(&nr, page, count);
@@ -62,40 +61,14 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
62 if (nr < BLKDEV_MIN_RQ) 61 if (nr < BLKDEV_MIN_RQ)
63 nr = BLKDEV_MIN_RQ; 62 nr = BLKDEV_MIN_RQ;
64 63
65 spin_lock_irq(q->queue_lock); 64 if (q->request_fn)
66 q->nr_requests = nr; 65 err = blk_update_nr_requests(q, nr);
67 blk_queue_congestion_threshold(q); 66 else
68 67 err = blk_mq_update_nr_requests(q, nr);
69 /* congestion isn't cgroup aware and follows root blkcg for now */ 68
70 rl = &q->root_rl; 69 if (err)
71 70 return err;
72 if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
73 blk_set_queue_congested(q, BLK_RW_SYNC);
74 else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
75 blk_clear_queue_congested(q, BLK_RW_SYNC);
76
77 if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
78 blk_set_queue_congested(q, BLK_RW_ASYNC);
79 else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
80 blk_clear_queue_congested(q, BLK_RW_ASYNC);
81
82 blk_queue_for_each_rl(rl, q) {
83 if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
84 blk_set_rl_full(rl, BLK_RW_SYNC);
85 } else {
86 blk_clear_rl_full(rl, BLK_RW_SYNC);
87 wake_up(&rl->wait[BLK_RW_SYNC]);
88 }
89
90 if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
91 blk_set_rl_full(rl, BLK_RW_ASYNC);
92 } else {
93 blk_clear_rl_full(rl, BLK_RW_ASYNC);
94 wake_up(&rl->wait[BLK_RW_ASYNC]);
95 }
96 }
97 71
98 spin_unlock_irq(q->queue_lock);
99 return ret; 72 return ret;
100} 73}
101 74
@@ -544,8 +517,6 @@ static void blk_release_queue(struct kobject *kobj)
544 if (q->queue_tags) 517 if (q->queue_tags)
545 __blk_queue_free_tags(q); 518 __blk_queue_free_tags(q);
546 519
547 percpu_counter_destroy(&q->mq_usage_counter);
548
549 if (q->mq_ops) 520 if (q->mq_ops)
550 blk_mq_free_queue(q); 521 blk_mq_free_queue(q);
551 522
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 033745cd7fba..9353b4683359 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -744,7 +744,7 @@ static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
744static bool throtl_slice_used(struct throtl_grp *tg, bool rw) 744static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
745{ 745{
746 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 746 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
747 return 0; 747 return false;
748 748
749 return 1; 749 return 1;
750} 750}
@@ -842,7 +842,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
842 if (tg->io_disp[rw] + 1 <= io_allowed) { 842 if (tg->io_disp[rw] + 1 <= io_allowed) {
843 if (wait) 843 if (wait)
844 *wait = 0; 844 *wait = 0;
845 return 1; 845 return true;
846 } 846 }
847 847
848 /* Calc approx time to dispatch */ 848 /* Calc approx time to dispatch */
@@ -880,7 +880,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
880 if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { 880 if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
881 if (wait) 881 if (wait)
882 *wait = 0; 882 *wait = 0;
883 return 1; 883 return true;
884 } 884 }
885 885
886 /* Calc approx time to dispatch */ 886 /* Calc approx time to dispatch */
@@ -923,7 +923,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
923 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { 923 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
924 if (wait) 924 if (wait)
925 *wait = 0; 925 *wait = 0;
926 return 1; 926 return true;
927 } 927 }
928 928
929 /* 929 /*
@@ -1258,7 +1258,7 @@ out_unlock:
1258 * of throtl_data->service_queue. Those bio's are ready and issued by this 1258 * of throtl_data->service_queue. Those bio's are ready and issued by this
1259 * function. 1259 * function.
1260 */ 1260 */
1261void blk_throtl_dispatch_work_fn(struct work_struct *work) 1261static void blk_throtl_dispatch_work_fn(struct work_struct *work)
1262{ 1262{
1263 struct throtl_data *td = container_of(work, struct throtl_data, 1263 struct throtl_data *td = container_of(work, struct throtl_data,
1264 dispatch_work); 1264 dispatch_work);
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index d96f7061c6fd..95a09590ccfd 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -96,11 +96,7 @@ static void blk_rq_timed_out(struct request *req)
96 __blk_complete_request(req); 96 __blk_complete_request(req);
97 break; 97 break;
98 case BLK_EH_RESET_TIMER: 98 case BLK_EH_RESET_TIMER:
99 if (q->mq_ops) 99 blk_add_timer(req);
100 blk_mq_add_timer(req);
101 else
102 blk_add_timer(req);
103
104 blk_clear_rq_complete(req); 100 blk_clear_rq_complete(req);
105 break; 101 break;
106 case BLK_EH_NOT_HANDLED: 102 case BLK_EH_NOT_HANDLED:
@@ -170,7 +166,26 @@ void blk_abort_request(struct request *req)
170} 166}
171EXPORT_SYMBOL_GPL(blk_abort_request); 167EXPORT_SYMBOL_GPL(blk_abort_request);
172 168
173void __blk_add_timer(struct request *req, struct list_head *timeout_list) 169unsigned long blk_rq_timeout(unsigned long timeout)
170{
171 unsigned long maxt;
172
173 maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
174 if (time_after(timeout, maxt))
175 timeout = maxt;
176
177 return timeout;
178}
179
180/**
181 * blk_add_timer - Start timeout timer for a single request
182 * @req: request that is about to start running.
183 *
184 * Notes:
185 * Each request has its own timer, and as it is added to the queue, we
186 * set up the timer. When the request completes, we cancel the timer.
187 */
188void blk_add_timer(struct request *req)
174{ 189{
175 struct request_queue *q = req->q; 190 struct request_queue *q = req->q;
176 unsigned long expiry; 191 unsigned long expiry;
@@ -188,32 +203,29 @@ void __blk_add_timer(struct request *req, struct list_head *timeout_list)
188 req->timeout = q->rq_timeout; 203 req->timeout = q->rq_timeout;
189 204
190 req->deadline = jiffies + req->timeout; 205 req->deadline = jiffies + req->timeout;
191 if (timeout_list) 206 if (!q->mq_ops)
192 list_add_tail(&req->timeout_list, timeout_list); 207 list_add_tail(&req->timeout_list, &req->q->timeout_list);
193 208
194 /* 209 /*
195 * If the timer isn't already pending or this timeout is earlier 210 * If the timer isn't already pending or this timeout is earlier
196 * than an existing one, modify the timer. Round up to next nearest 211 * than an existing one, modify the timer. Round up to next nearest
197 * second. 212 * second.
198 */ 213 */
199 expiry = round_jiffies_up(req->deadline); 214 expiry = blk_rq_timeout(round_jiffies_up(req->deadline));
200 215
201 if (!timer_pending(&q->timeout) || 216 if (!timer_pending(&q->timeout) ||
202 time_before(expiry, q->timeout.expires)) 217 time_before(expiry, q->timeout.expires)) {
203 mod_timer(&q->timeout, expiry); 218 unsigned long diff = q->timeout.expires - expiry;
204 219
205} 220 /*
221 * Due to added timer slack to group timers, the timer
222 * will often be a little in front of what we asked for.
223 * So apply some tolerance here too, otherwise we keep
224 * modifying the timer because expires for value X
225 * will be X + something.
226 */
227 if (!timer_pending(&q->timeout) || (diff >= HZ / 2))
228 mod_timer(&q->timeout, expiry);
229 }
206 230
207/**
208 * blk_add_timer - Start timeout timer for a single request
209 * @req: request that is about to start running.
210 *
211 * Notes:
212 * Each request has its own timer, and as it is added to the queue, we
213 * set up the timer. When the request completes, we cancel the timer.
214 */
215void blk_add_timer(struct request *req)
216{
217 __blk_add_timer(req, &req->q->timeout_list);
218} 231}
219
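
Two rules govern the queue-wide timeout timer after this change: blk_rq_timeout() caps how far into the future it may be armed (BLK_MAX_TIMEOUT, 5 * HZ, added to block/blk.h in the next hunk), and an already-pending timer is only moved earlier when the gain is at least HZ/2, to tolerate the slack that round_jiffies_up() introduces. A condensed, kernel-context restatement; the helper name is invented, the real logic sits in blk_add_timer() above.

#include <linux/jiffies.h>
#include <linux/timer.h>
#include "blk.h"	/* for blk_rq_timeout() */

static unsigned long pick_queue_timer_expiry(struct timer_list *queue_timer,
					     unsigned long req_deadline)
{
	unsigned long expiry = blk_rq_timeout(round_jiffies_up(req_deadline));

	if (!timer_pending(queue_timer))
		return expiry;
	/* only re-arm an already-pending timer if it wins at least HZ/2 */
	if (time_before(expiry, queue_timer->expires) &&
	    queue_timer->expires - expiry >= HZ / 2)
		return expiry;
	return queue_timer->expires;
}
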
diff --git a/block/blk.h b/block/blk.h
index 1d880f1f957f..45385e9abf6f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -9,6 +9,9 @@
9/* Number of requests a "batching" process may submit */ 9/* Number of requests a "batching" process may submit */
10#define BLK_BATCH_REQ 32 10#define BLK_BATCH_REQ 32
11 11
12/* Max future timer expiry for timeouts */
13#define BLK_MAX_TIMEOUT (5 * HZ)
14
12extern struct kmem_cache *blk_requestq_cachep; 15extern struct kmem_cache *blk_requestq_cachep;
13extern struct kmem_cache *request_cachep; 16extern struct kmem_cache *request_cachep;
14extern struct kobj_type blk_queue_ktype; 17extern struct kobj_type blk_queue_ktype;
@@ -37,9 +40,9 @@ bool __blk_end_bidi_request(struct request *rq, int error,
37void blk_rq_timed_out_timer(unsigned long data); 40void blk_rq_timed_out_timer(unsigned long data);
38void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, 41void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
39 unsigned int *next_set); 42 unsigned int *next_set);
40void __blk_add_timer(struct request *req, struct list_head *timeout_list); 43unsigned long blk_rq_timeout(unsigned long timeout);
44void blk_add_timer(struct request *req);
41void blk_delete_timer(struct request *); 45void blk_delete_timer(struct request *);
42void blk_add_timer(struct request *);
43 46
44 47
45bool bio_attempt_front_merge(struct request_queue *q, struct request *req, 48bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
@@ -185,6 +188,8 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
185 return q->nr_congestion_off; 188 return q->nr_congestion_off;
186} 189}
187 190
191extern int blk_update_nr_requests(struct request_queue *, unsigned int);
192
188/* 193/*
189 * Contribute to IO statistics IFF: 194 * Contribute to IO statistics IFF:
190 * 195 *
diff --git a/mm/bounce.c b/block/bounce.c
index 523918b8c6dc..523918b8c6dc 100644
--- a/mm/bounce.c
+++ b/block/bounce.c
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e0985f1955e7..22dffebc7c73 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -908,7 +908,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
908{ 908{
909 if (cfqd->busy_queues) { 909 if (cfqd->busy_queues) {
910 cfq_log(cfqd, "schedule dispatch"); 910 cfq_log(cfqd, "schedule dispatch");
911 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); 911 kblockd_schedule_work(&cfqd->unplug_work);
912 } 912 }
913} 913}
914 914
@@ -4460,7 +4460,7 @@ out_free:
4460static ssize_t 4460static ssize_t
4461cfq_var_show(unsigned int var, char *page) 4461cfq_var_show(unsigned int var, char *page)
4462{ 4462{
4463 return sprintf(page, "%d\n", var); 4463 return sprintf(page, "%u\n", var);
4464} 4464}
4465 4465
4466static ssize_t 4466static ssize_t
diff --git a/fs/ioprio.c b/block/ioprio.c
index e50170ca7c33..e50170ca7c33 100644
--- a/fs/ioprio.c
+++ b/block/ioprio.c
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 26487972ac54..9c28a5b38042 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -205,10 +205,6 @@ int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
205 if (capable(CAP_SYS_RAWIO)) 205 if (capable(CAP_SYS_RAWIO))
206 return 0; 206 return 0;
207 207
208 /* if there's no filter set, assume we're filtering everything out */
209 if (!filter)
210 return -EPERM;
211
212 /* Anybody who can open the device can do a read-safe command */ 208 /* Anybody who can open the device can do a read-safe command */
213 if (test_bit(cmd[0], filter->read_ok)) 209 if (test_bit(cmd[0], filter->read_ok))
214 return 0; 210 return 0;
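
Every driver hunk from here on makes the same substitution: with the request 'buffer' field going away, these legacy one-segment-at-a-time drivers take the kernel-virtual address of the current chunk from bio_data(rq->bio) and re-derive it after each partial completion (their queues bounce highmem pages, so the address is directly usable). Reduced to a sketch, with mydrv_* as placeholder names:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* placeholder for the hardware-specific PIO/DMA copy of one chunk */
static void mydrv_transfer(void *buf, sector_t pos, unsigned int sectors)
{
}

/* the __blk_end_request helpers expect the queue lock to be held */
static void mydrv_do_request(struct request *rq)
{
	bool more = true;

	while (more) {
		void *buf = bio_data(rq->bio);		/* replaces rq->buffer */
		unsigned int sectors = blk_rq_cur_sectors(rq);

		mydrv_transfer(buf, blk_rq_pos(rq), sectors);
		/* completes the current chunk and advances rq->bio */
		more = __blk_end_request_cur(rq, 0);
	}
}
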
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 748dea4f34dc..758da2287d9a 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1406,7 +1406,7 @@ next_segment:
1406 1406
1407 track = block / (floppy->dtype->sects * floppy->type->sect_mult); 1407 track = block / (floppy->dtype->sects * floppy->type->sect_mult);
1408 sector = block % (floppy->dtype->sects * floppy->type->sect_mult); 1408 sector = block % (floppy->dtype->sects * floppy->type->sect_mult);
1409 data = rq->buffer + 512 * cnt; 1409 data = bio_data(rq->bio) + 512 * cnt;
1410#ifdef DEBUG 1410#ifdef DEBUG
1411 printk("access to track %d, sector %d, with buffer at " 1411 printk("access to track %d, sector %d, with buffer at "
1412 "0x%08lx\n", track, sector, data); 1412 "0x%08lx\n", track, sector, data);
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index cfa64bdf01c9..2104b1b4ccda 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1484,7 +1484,7 @@ repeat:
1484 ReqCnt = 0; 1484 ReqCnt = 0;
1485 ReqCmd = rq_data_dir(fd_request); 1485 ReqCmd = rq_data_dir(fd_request);
1486 ReqBlock = blk_rq_pos(fd_request); 1486 ReqBlock = blk_rq_pos(fd_request);
1487 ReqBuffer = fd_request->buffer; 1487 ReqBuffer = bio_data(fd_request->bio);
1488 setup_req_params( drive ); 1488 setup_req_params( drive );
1489 do_fd_action( drive ); 1489 do_fd_action( drive );
1490 1490
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index fa9bb742df6e..dc3a41c82b38 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2351,7 +2351,7 @@ static void rw_interrupt(void)
2351 } 2351 }
2352 2352
2353 if (CT(COMMAND) != FD_READ || 2353 if (CT(COMMAND) != FD_READ ||
2354 raw_cmd->kernel_data == current_req->buffer) { 2354 raw_cmd->kernel_data == bio_data(current_req->bio)) {
2355 /* transfer directly from buffer */ 2355 /* transfer directly from buffer */
2356 cont->done(1); 2356 cont->done(1);
2357 } else if (CT(COMMAND) == FD_READ) { 2357 } else if (CT(COMMAND) == FD_READ) {
@@ -2640,7 +2640,7 @@ static int make_raw_rw_request(void)
2640 raw_cmd->flags &= ~FD_RAW_WRITE; 2640 raw_cmd->flags &= ~FD_RAW_WRITE;
2641 raw_cmd->flags |= FD_RAW_READ; 2641 raw_cmd->flags |= FD_RAW_READ;
2642 COMMAND = FM_MODE(_floppy, FD_READ); 2642 COMMAND = FM_MODE(_floppy, FD_READ);
2643 } else if ((unsigned long)current_req->buffer < MAX_DMA_ADDRESS) { 2643 } else if ((unsigned long)bio_data(current_req->bio) < MAX_DMA_ADDRESS) {
2644 unsigned long dma_limit; 2644 unsigned long dma_limit;
2645 int direct, indirect; 2645 int direct, indirect;
2646 2646
@@ -2654,13 +2654,13 @@ static int make_raw_rw_request(void)
2654 */ 2654 */
2655 max_size = buffer_chain_size(); 2655 max_size = buffer_chain_size();
2656 dma_limit = (MAX_DMA_ADDRESS - 2656 dma_limit = (MAX_DMA_ADDRESS -
2657 ((unsigned long)current_req->buffer)) >> 9; 2657 ((unsigned long)bio_data(current_req->bio))) >> 9;
2658 if ((unsigned long)max_size > dma_limit) 2658 if ((unsigned long)max_size > dma_limit)
2659 max_size = dma_limit; 2659 max_size = dma_limit;
2660 /* 64 kb boundaries */ 2660 /* 64 kb boundaries */
2661 if (CROSS_64KB(current_req->buffer, max_size << 9)) 2661 if (CROSS_64KB(bio_data(current_req->bio), max_size << 9))
2662 max_size = (K_64 - 2662 max_size = (K_64 -
2663 ((unsigned long)current_req->buffer) % 2663 ((unsigned long)bio_data(current_req->bio)) %
2664 K_64) >> 9; 2664 K_64) >> 9;
2665 direct = transfer_size(ssize, max_sector, max_size) - fsector_t; 2665 direct = transfer_size(ssize, max_sector, max_size) - fsector_t;
2666 /* 2666 /*
@@ -2677,7 +2677,7 @@ static int make_raw_rw_request(void)
2677 (DP->read_track & (1 << DRS->probed_format)))))) { 2677 (DP->read_track & (1 << DRS->probed_format)))))) {
2678 max_size = blk_rq_sectors(current_req); 2678 max_size = blk_rq_sectors(current_req);
2679 } else { 2679 } else {
2680 raw_cmd->kernel_data = current_req->buffer; 2680 raw_cmd->kernel_data = bio_data(current_req->bio);
2681 raw_cmd->length = current_count_sectors << 9; 2681 raw_cmd->length = current_count_sectors << 9;
2682 if (raw_cmd->length == 0) { 2682 if (raw_cmd->length == 0) {
2683 DPRINT("%s: zero dma transfer attempted\n", __func__); 2683 DPRINT("%s: zero dma transfer attempted\n", __func__);
@@ -2731,7 +2731,7 @@ static int make_raw_rw_request(void)
2731 raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1; 2731 raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1;
2732 raw_cmd->length <<= 9; 2732 raw_cmd->length <<= 9;
2733 if ((raw_cmd->length < current_count_sectors << 9) || 2733 if ((raw_cmd->length < current_count_sectors << 9) ||
2734 (raw_cmd->kernel_data != current_req->buffer && 2734 (raw_cmd->kernel_data != bio_data(current_req->bio) &&
2735 CT(COMMAND) == FD_WRITE && 2735 CT(COMMAND) == FD_WRITE &&
2736 (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max || 2736 (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max ||
2737 aligned_sector_t < buffer_min)) || 2737 aligned_sector_t < buffer_min)) ||
@@ -2739,7 +2739,7 @@ static int make_raw_rw_request(void)
2739 raw_cmd->length <= 0 || current_count_sectors <= 0) { 2739 raw_cmd->length <= 0 || current_count_sectors <= 0) {
2740 DPRINT("fractionary current count b=%lx s=%lx\n", 2740 DPRINT("fractionary current count b=%lx s=%lx\n",
2741 raw_cmd->length, current_count_sectors); 2741 raw_cmd->length, current_count_sectors);
2742 if (raw_cmd->kernel_data != current_req->buffer) 2742 if (raw_cmd->kernel_data != bio_data(current_req->bio))
2743 pr_info("addr=%d, length=%ld\n", 2743 pr_info("addr=%d, length=%ld\n",
2744 (int)((raw_cmd->kernel_data - 2744 (int)((raw_cmd->kernel_data -
2745 floppy_track_buffer) >> 9), 2745 floppy_track_buffer) >> 9),
@@ -2756,7 +2756,7 @@ static int make_raw_rw_request(void)
2756 return 0; 2756 return 0;
2757 } 2757 }
2758 2758
2759 if (raw_cmd->kernel_data != current_req->buffer) { 2759 if (raw_cmd->kernel_data != bio_data(current_req->bio)) {
2760 if (raw_cmd->kernel_data < floppy_track_buffer || 2760 if (raw_cmd->kernel_data < floppy_track_buffer ||
2761 current_count_sectors < 0 || 2761 current_count_sectors < 0 ||
2762 raw_cmd->length < 0 || 2762 raw_cmd->length < 0 ||
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index bf397bf108b7..8a290c08262f 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -464,11 +464,11 @@ static void read_intr(void)
464 464
465ok_to_read: 465ok_to_read:
466 req = hd_req; 466 req = hd_req;
467 insw(HD_DATA, req->buffer, 256); 467 insw(HD_DATA, bio_data(req->bio), 256);
468#ifdef DEBUG 468#ifdef DEBUG
469 printk("%s: read: sector %ld, remaining = %u, buffer=%p\n", 469 printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
470 req->rq_disk->disk_name, blk_rq_pos(req) + 1, 470 req->rq_disk->disk_name, blk_rq_pos(req) + 1,
471 blk_rq_sectors(req) - 1, req->buffer+512); 471 blk_rq_sectors(req) - 1, bio_data(req->bio)+512);
472#endif 472#endif
473 if (hd_end_request(0, 512)) { 473 if (hd_end_request(0, 512)) {
474 SET_HANDLER(&read_intr); 474 SET_HANDLER(&read_intr);
@@ -505,7 +505,7 @@ static void write_intr(void)
505ok_to_write: 505ok_to_write:
506 if (hd_end_request(0, 512)) { 506 if (hd_end_request(0, 512)) {
507 SET_HANDLER(&write_intr); 507 SET_HANDLER(&write_intr);
508 outsw(HD_DATA, req->buffer, 256); 508 outsw(HD_DATA, bio_data(req->bio), 256);
509 return; 509 return;
510 } 510 }
511 511
@@ -624,7 +624,7 @@ repeat:
624 printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n", 624 printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n",
625 req->rq_disk->disk_name, 625 req->rq_disk->disk_name,
626 req_data_dir(req) == READ ? "read" : "writ", 626 req_data_dir(req) == READ ? "read" : "writ",
627 cyl, head, sec, nsect, req->buffer); 627 cyl, head, sec, nsect, bio_data(req->bio));
628#endif 628#endif
629 if (req->cmd_type == REQ_TYPE_FS) { 629 if (req->cmd_type == REQ_TYPE_FS) {
630 switch (rq_data_dir(req)) { 630 switch (rq_data_dir(req)) {
@@ -643,7 +643,7 @@ repeat:
643 bad_rw_intr(); 643 bad_rw_intr();
644 goto repeat; 644 goto repeat;
645 } 645 }
646 outsw(HD_DATA, req->buffer, 256); 646 outsw(HD_DATA, bio_data(req->bio), 256);
647 break; 647 break;
648 default: 648 default:
649 printk("unknown hd-command\n"); 649 printk("unknown hd-command\n");
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index eb59b1241366..e352cac707e8 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -479,7 +479,7 @@ static unsigned int mg_out(struct mg_host *host,
479 479
480static void mg_read_one(struct mg_host *host, struct request *req) 480static void mg_read_one(struct mg_host *host, struct request *req)
481{ 481{
482 u16 *buff = (u16 *)req->buffer; 482 u16 *buff = (u16 *)bio_data(req->bio);
483 u32 i; 483 u32 i;
484 484
485 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) 485 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
@@ -496,7 +496,7 @@ static void mg_read(struct request *req)
496 mg_bad_rw_intr(host); 496 mg_bad_rw_intr(host);
497 497
498 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", 498 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
499 blk_rq_sectors(req), blk_rq_pos(req), req->buffer); 499 blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio));
500 500
501 do { 501 do {
502 if (mg_wait(host, ATA_DRQ, 502 if (mg_wait(host, ATA_DRQ,
@@ -514,7 +514,7 @@ static void mg_read(struct request *req)
514 514
515static void mg_write_one(struct mg_host *host, struct request *req) 515static void mg_write_one(struct mg_host *host, struct request *req)
516{ 516{
517 u16 *buff = (u16 *)req->buffer; 517 u16 *buff = (u16 *)bio_data(req->bio);
518 u32 i; 518 u32 i;
519 519
520 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) 520 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
@@ -534,7 +534,7 @@ static void mg_write(struct request *req)
534 } 534 }
535 535
536 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", 536 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
537 rem, blk_rq_pos(req), req->buffer); 537 rem, blk_rq_pos(req), bio_data(req->bio));
538 538
539 if (mg_wait(host, ATA_DRQ, 539 if (mg_wait(host, ATA_DRQ,
540 MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { 540 MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
@@ -585,7 +585,7 @@ ok_to_read:
585 mg_read_one(host, req); 585 mg_read_one(host, req);
586 586
587 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", 587 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
588 blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer); 588 blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio));
589 589
590 /* send read confirm */ 590 /* send read confirm */
591 outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); 591 outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
@@ -624,7 +624,7 @@ ok_to_write:
624 /* write 1 sector and set handler if remains */ 624 /* write 1 sector and set handler if remains */
625 mg_write_one(host, req); 625 mg_write_one(host, req);
626 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", 626 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
627 blk_rq_pos(req), blk_rq_sectors(req), req->buffer); 627 blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio));
628 host->mg_do_intr = mg_write_intr; 628 host->mg_do_intr = mg_write_intr;
629 mod_timer(&host->timer, jiffies + 3 * HZ); 629 mod_timer(&host->timer, jiffies + 3 * HZ);
630 } 630 }
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 091b9ea14feb..b40af63a5476 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -32,6 +32,7 @@ struct nullb {
32 unsigned int index; 32 unsigned int index;
33 struct request_queue *q; 33 struct request_queue *q;
34 struct gendisk *disk; 34 struct gendisk *disk;
35 struct blk_mq_tag_set tag_set;
35 struct hrtimer timer; 36 struct hrtimer timer;
36 unsigned int queue_depth; 37 unsigned int queue_depth;
37 spinlock_t lock; 38 spinlock_t lock;
@@ -226,7 +227,7 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
226 227
227static void null_softirq_done_fn(struct request *rq) 228static void null_softirq_done_fn(struct request *rq)
228{ 229{
229 end_cmd(rq->special); 230 end_cmd(blk_mq_rq_to_pdu(rq));
230} 231}
231 232
232static inline void null_handle_cmd(struct nullb_cmd *cmd) 233static inline void null_handle_cmd(struct nullb_cmd *cmd)
@@ -311,7 +312,7 @@ static void null_request_fn(struct request_queue *q)
311 312
312static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq) 313static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
313{ 314{
314 struct nullb_cmd *cmd = rq->special; 315 struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
315 316
316 cmd->rq = rq; 317 cmd->rq = rq;
317 cmd->nq = hctx->driver_data; 318 cmd->nq = hctx->driver_data;
@@ -320,46 +321,6 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
320 return BLK_MQ_RQ_QUEUE_OK; 321 return BLK_MQ_RQ_QUEUE_OK;
321} 322}
322 323
323static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index)
324{
325 int b_size = DIV_ROUND_UP(reg->nr_hw_queues, nr_online_nodes);
326 int tip = (reg->nr_hw_queues % nr_online_nodes);
327 int node = 0, i, n;
328
329 /*
330 * Split submit queues evenly wrt to the number of nodes. If uneven,
331 * fill the first buckets with one extra, until the rest is filled with
332 * no extra.
333 */
334 for (i = 0, n = 1; i < hctx_index; i++, n++) {
335 if (n % b_size == 0) {
336 n = 0;
337 node++;
338
339 tip--;
340 if (!tip)
341 b_size = reg->nr_hw_queues / nr_online_nodes;
342 }
343 }
344
345 /*
346 * A node might not be online, therefore map the relative node id to the
347 * real node id.
348 */
349 for_each_online_node(n) {
350 if (!node)
351 break;
352 node--;
353 }
354
355 return kzalloc_node(sizeof(struct blk_mq_hw_ctx), GFP_KERNEL, n);
356}
357
358static void null_free_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_index)
359{
360 kfree(hctx);
361}
362
363static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) 324static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
364{ 325{
365 BUG_ON(!nullb); 326 BUG_ON(!nullb);
@@ -389,19 +350,14 @@ static struct blk_mq_ops null_mq_ops = {
389 .complete = null_softirq_done_fn, 350 .complete = null_softirq_done_fn,
390}; 351};
391 352
392static struct blk_mq_reg null_mq_reg = {
393 .ops = &null_mq_ops,
394 .queue_depth = 64,
395 .cmd_size = sizeof(struct nullb_cmd),
396 .flags = BLK_MQ_F_SHOULD_MERGE,
397};
398
399static void null_del_dev(struct nullb *nullb) 353static void null_del_dev(struct nullb *nullb)
400{ 354{
401 list_del_init(&nullb->list); 355 list_del_init(&nullb->list);
402 356
403 del_gendisk(nullb->disk); 357 del_gendisk(nullb->disk);
404 blk_cleanup_queue(nullb->q); 358 blk_cleanup_queue(nullb->q);
359 if (queue_mode == NULL_Q_MQ)
360 blk_mq_free_tag_set(&nullb->tag_set);
405 put_disk(nullb->disk); 361 put_disk(nullb->disk);
406 kfree(nullb); 362 kfree(nullb);
407} 363}
@@ -506,7 +462,7 @@ static int null_add_dev(void)
506 462
507 nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); 463 nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
508 if (!nullb) 464 if (!nullb)
509 return -ENOMEM; 465 goto out;
510 466
511 spin_lock_init(&nullb->lock); 467 spin_lock_init(&nullb->lock);
512 468
@@ -514,49 +470,44 @@ static int null_add_dev(void)
514 submit_queues = nr_online_nodes; 470 submit_queues = nr_online_nodes;
515 471
516 if (setup_queues(nullb)) 472 if (setup_queues(nullb))
517 goto err; 473 goto out_free_nullb;
518 474
519 if (queue_mode == NULL_Q_MQ) { 475 if (queue_mode == NULL_Q_MQ) {
520 null_mq_reg.numa_node = home_node; 476 nullb->tag_set.ops = &null_mq_ops;
521 null_mq_reg.queue_depth = hw_queue_depth; 477 nullb->tag_set.nr_hw_queues = submit_queues;
522 null_mq_reg.nr_hw_queues = submit_queues; 478 nullb->tag_set.queue_depth = hw_queue_depth;
523 479 nullb->tag_set.numa_node = home_node;
524 if (use_per_node_hctx) { 480 nullb->tag_set.cmd_size = sizeof(struct nullb_cmd);
525 null_mq_reg.ops->alloc_hctx = null_alloc_hctx; 481 nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
526 null_mq_reg.ops->free_hctx = null_free_hctx; 482 nullb->tag_set.driver_data = nullb;
527 } else { 483
528 null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue; 484 if (blk_mq_alloc_tag_set(&nullb->tag_set))
529 null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue; 485 goto out_cleanup_queues;
530 } 486
531 487 nullb->q = blk_mq_init_queue(&nullb->tag_set);
532 nullb->q = blk_mq_init_queue(&null_mq_reg, nullb); 488 if (!nullb->q)
489 goto out_cleanup_tags;
533 } else if (queue_mode == NULL_Q_BIO) { 490 } else if (queue_mode == NULL_Q_BIO) {
534 nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node); 491 nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
492 if (!nullb->q)
493 goto out_cleanup_queues;
535 blk_queue_make_request(nullb->q, null_queue_bio); 494 blk_queue_make_request(nullb->q, null_queue_bio);
536 init_driver_queues(nullb); 495 init_driver_queues(nullb);
537 } else { 496 } else {
538 nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node); 497 nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
498 if (!nullb->q)
499 goto out_cleanup_queues;
539 blk_queue_prep_rq(nullb->q, null_rq_prep_fn); 500 blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
540 if (nullb->q) 501 blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
541 blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
542 init_driver_queues(nullb); 502 init_driver_queues(nullb);
543 } 503 }
544 504
545 if (!nullb->q)
546 goto queue_fail;
547
548 nullb->q->queuedata = nullb; 505 nullb->q->queuedata = nullb;
549 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); 506 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
550 507
551 disk = nullb->disk = alloc_disk_node(1, home_node); 508 disk = nullb->disk = alloc_disk_node(1, home_node);
552 if (!disk) { 509 if (!disk)
553queue_fail: 510 goto out_cleanup_blk_queue;
554 blk_cleanup_queue(nullb->q);
555 cleanup_queues(nullb);
556err:
557 kfree(nullb);
558 return -ENOMEM;
559 }
560 511
561 mutex_lock(&lock); 512 mutex_lock(&lock);
562 list_add_tail(&nullb->list, &nullb_list); 513 list_add_tail(&nullb->list, &nullb_list);
@@ -579,6 +530,18 @@ err:
579 sprintf(disk->disk_name, "nullb%d", nullb->index); 530 sprintf(disk->disk_name, "nullb%d", nullb->index);
580 add_disk(disk); 531 add_disk(disk);
581 return 0; 532 return 0;
533
534out_cleanup_blk_queue:
535 blk_cleanup_queue(nullb->q);
536out_cleanup_tags:
537 if (queue_mode == NULL_Q_MQ)
538 blk_mq_free_tag_set(&nullb->tag_set);
539out_cleanup_queues:
540 cleanup_queues(nullb);
541out_free_nullb:
542 kfree(nullb);
543out:
544 return -ENOMEM;
582} 545}
583 546
584static int __init null_init(void) 547static int __init null_init(void)
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index e76bdc074dbe..719cb1bc1640 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -747,7 +747,7 @@ static void do_pcd_request(struct request_queue * q)
747 pcd_current = cd; 747 pcd_current = cd;
748 pcd_sector = blk_rq_pos(pcd_req); 748 pcd_sector = blk_rq_pos(pcd_req);
749 pcd_count = blk_rq_cur_sectors(pcd_req); 749 pcd_count = blk_rq_cur_sectors(pcd_req);
750 pcd_buf = pcd_req->buffer; 750 pcd_buf = bio_data(pcd_req->bio);
751 pcd_busy = 1; 751 pcd_busy = 1;
752 ps_set_intr(do_pcd_read, NULL, 0, nice); 752 ps_set_intr(do_pcd_read, NULL, 0, nice);
753 return; 753 return;
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 19ad8f0c83ef..fea7e76a00de 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -454,7 +454,7 @@ static enum action do_pd_io_start(void)
454 if (pd_block + pd_count > get_capacity(pd_req->rq_disk)) 454 if (pd_block + pd_count > get_capacity(pd_req->rq_disk))
455 return Fail; 455 return Fail;
456 pd_run = blk_rq_sectors(pd_req); 456 pd_run = blk_rq_sectors(pd_req);
457 pd_buf = pd_req->buffer; 457 pd_buf = bio_data(pd_req->bio);
458 pd_retries = 0; 458 pd_retries = 0;
459 if (pd_cmd == READ) 459 if (pd_cmd == READ)
460 return do_pd_read_start(); 460 return do_pd_read_start();
@@ -485,7 +485,7 @@ static int pd_next_buf(void)
485 spin_lock_irqsave(&pd_lock, saved_flags); 485 spin_lock_irqsave(&pd_lock, saved_flags);
486 __blk_end_request_cur(pd_req, 0); 486 __blk_end_request_cur(pd_req, 0);
487 pd_count = blk_rq_cur_sectors(pd_req); 487 pd_count = blk_rq_cur_sectors(pd_req);
488 pd_buf = pd_req->buffer; 488 pd_buf = bio_data(pd_req->bio);
489 spin_unlock_irqrestore(&pd_lock, saved_flags); 489 spin_unlock_irqrestore(&pd_lock, saved_flags);
490 return 0; 490 return 0;
491} 491}
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index f5c86d523ba0..9a15fd3c9349 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -795,7 +795,7 @@ repeat:
795 } 795 }
796 796
797 pf_cmd = rq_data_dir(pf_req); 797 pf_cmd = rq_data_dir(pf_req);
798 pf_buf = pf_req->buffer; 798 pf_buf = bio_data(pf_req->bio);
799 pf_retries = 0; 799 pf_retries = 0;
800 800
801 pf_busy = 1; 801 pf_busy = 1;
@@ -827,7 +827,7 @@ static int pf_next_buf(void)
827 if (!pf_req) 827 if (!pf_req)
828 return 1; 828 return 1;
829 pf_count = blk_rq_cur_sectors(pf_req); 829 pf_count = blk_rq_cur_sectors(pf_req);
830 pf_buf = pf_req->buffer; 830 pf_buf = bio_data(pf_req->bio);
831 } 831 }
832 return 0; 832 return 0;
833} 833}
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index a69dd93d1bd5..c48d9084c965 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -563,7 +563,6 @@ skd_prep_discard_cdb(struct skd_scsi_request *scsi_req,
563 563
564 req = skreq->req; 564 req = skreq->req;
565 blk_add_request_payload(req, page, len); 565 blk_add_request_payload(req, page, len);
566 req->buffer = buf;
567} 566}
568 567
569static void skd_request_fn_not_online(struct request_queue *q); 568static void skd_request_fn_not_online(struct request_queue *q);
@@ -744,6 +743,7 @@ static void skd_request_fn(struct request_queue *q)
744 break; 743 break;
745 } 744 }
746 skreq->discard_page = 1; 745 skreq->discard_page = 1;
746 req->completion_data = page;
747 skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); 747 skd_prep_discard_cdb(scsi_req, skreq, page, lba, count);
748 748
749 } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { 749 } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) {
@@ -858,8 +858,7 @@ static void skd_end_request(struct skd_device *skdev,
858 (skreq->discard_page == 1)) { 858 (skreq->discard_page == 1)) {
859 pr_debug("%s:%s:%d, free the page!", 859 pr_debug("%s:%s:%d, free the page!",
860 skdev->name, __func__, __LINE__); 860 skdev->name, __func__, __LINE__);
861 free_page((unsigned long)req->buffer); 861 __free_page(req->completion_data);
862 req->buffer = NULL;
863 } 862 }
864 863
865 if (unlikely(error)) { 864 if (unlikely(error)) {
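
skd keeps the discard payload page in req->completion_data now that req->buffer is gone, and frees it with __free_page() on the struct page instead of free_page() on a kernel virtual address; the sd driver further down makes the same switch. A minimal sketch of the hand-off, with hypothetical my_prep_discard()/my_end_discard() helpers:

/*
 * Minimal sketch of the discard-payload hand-off; my_prep_discard() and
 * my_end_discard() are hypothetical, the field usage mirrors skd and sd.
 */
#include <linux/blkdev.h>
#include <linux/errno.h>
#include <linux/gfp.h>

static int my_prep_discard(struct request *req, unsigned int payload_len)
{
	struct page *page = alloc_page(GFP_ATOMIC | __GFP_ZERO);

	if (!page)
		return -ENOMEM;

	req->completion_data = page;		/* carried to completion */
	blk_add_request_payload(req, page, payload_len);
	return 0;
}

static void my_end_discard(struct request *req)
{
	/* was: free_page((unsigned long)req->buffer); req->buffer = NULL; */
	__free_page(req->completion_data);
}
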
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index b02d53a399f3..6b44bbe528b7 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -549,7 +549,7 @@ static void redo_fd_request(struct request_queue *q)
549 case READ: 549 case READ:
550 err = floppy_read_sectors(fs, blk_rq_pos(req), 550 err = floppy_read_sectors(fs, blk_rq_pos(req),
551 blk_rq_cur_sectors(req), 551 blk_rq_cur_sectors(req),
552 req->buffer); 552 bio_data(req->bio));
553 break; 553 break;
554 } 554 }
555 done: 555 done:
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index c74f7b56e7c4..523ee8fd4c15 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -342,7 +342,7 @@ static void start_request(struct floppy_state *fs)
342 swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n", 342 swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
343 req->rq_disk->disk_name, req->cmd, 343 req->rq_disk->disk_name, req->cmd,
344 (long)blk_rq_pos(req), blk_rq_sectors(req), 344 (long)blk_rq_pos(req), blk_rq_sectors(req),
345 req->buffer); 345 bio_data(req->bio));
346 swim3_dbg(" errors=%d current_nr_sectors=%u\n", 346 swim3_dbg(" errors=%d current_nr_sectors=%u\n",
347 req->errors, blk_rq_cur_sectors(req)); 347 req->errors, blk_rq_cur_sectors(req));
348#endif 348#endif
@@ -479,11 +479,11 @@ static inline void setup_transfer(struct floppy_state *fs)
479 /* Set up 3 dma commands: write preamble, data, postamble */ 479 /* Set up 3 dma commands: write preamble, data, postamble */
480 init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble)); 480 init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble));
481 ++cp; 481 ++cp;
482 init_dma(cp, OUTPUT_MORE, req->buffer, 512); 482 init_dma(cp, OUTPUT_MORE, bio_data(req->bio), 512);
483 ++cp; 483 ++cp;
484 init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble)); 484 init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble));
485 } else { 485 } else {
486 init_dma(cp, INPUT_LAST, req->buffer, n * 512); 486 init_dma(cp, INPUT_LAST, bio_data(req->bio), n * 512);
487 } 487 }
488 ++cp; 488 ++cp;
489 out_le16(&cp->command, DBDMA_STOP); 489 out_le16(&cp->command, DBDMA_STOP);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index cb9b1f8326c3..c8f286e8d80f 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -30,6 +30,9 @@ struct virtio_blk
30 /* The disk structure for the kernel. */ 30 /* The disk structure for the kernel. */
31 struct gendisk *disk; 31 struct gendisk *disk;
32 32
33 /* Block layer tags. */
34 struct blk_mq_tag_set tag_set;
35
33 /* Process context for config space updates */ 36 /* Process context for config space updates */
34 struct work_struct config_work; 37 struct work_struct config_work;
35 38
@@ -112,7 +115,7 @@ static int __virtblk_add_req(struct virtqueue *vq,
112 115
113static inline void virtblk_request_done(struct request *req) 116static inline void virtblk_request_done(struct request *req)
114{ 117{
115 struct virtblk_req *vbr = req->special; 118 struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
116 int error = virtblk_result(vbr); 119 int error = virtblk_result(vbr);
117 120
118 if (req->cmd_type == REQ_TYPE_BLOCK_PC) { 121 if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
@@ -147,14 +150,14 @@ static void virtblk_done(struct virtqueue *vq)
147 150
148 /* In case queue is stopped waiting for more buffers. */ 151 /* In case queue is stopped waiting for more buffers. */
149 if (req_done) 152 if (req_done)
150 blk_mq_start_stopped_hw_queues(vblk->disk->queue); 153 blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
151 spin_unlock_irqrestore(&vblk->vq_lock, flags); 154 spin_unlock_irqrestore(&vblk->vq_lock, flags);
152} 155}
153 156
154static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req) 157static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
155{ 158{
156 struct virtio_blk *vblk = hctx->queue->queuedata; 159 struct virtio_blk *vblk = hctx->queue->queuedata;
157 struct virtblk_req *vbr = req->special; 160 struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
158 unsigned long flags; 161 unsigned long flags;
159 unsigned int num; 162 unsigned int num;
160 const bool last = (req->cmd_flags & REQ_END) != 0; 163 const bool last = (req->cmd_flags & REQ_END) != 0;
@@ -480,33 +483,27 @@ static const struct device_attribute dev_attr_cache_type_rw =
480 __ATTR(cache_type, S_IRUGO|S_IWUSR, 483 __ATTR(cache_type, S_IRUGO|S_IWUSR,
481 virtblk_cache_type_show, virtblk_cache_type_store); 484 virtblk_cache_type_show, virtblk_cache_type_store);
482 485
483static struct blk_mq_ops virtio_mq_ops = { 486static int virtblk_init_request(void *data, struct request *rq,
484 .queue_rq = virtio_queue_rq, 487 unsigned int hctx_idx, unsigned int request_idx,
485 .map_queue = blk_mq_map_queue, 488 unsigned int numa_node)
486 .alloc_hctx = blk_mq_alloc_single_hw_queue,
487 .free_hctx = blk_mq_free_single_hw_queue,
488 .complete = virtblk_request_done,
489};
490
491static struct blk_mq_reg virtio_mq_reg = {
492 .ops = &virtio_mq_ops,
493 .nr_hw_queues = 1,
494 .queue_depth = 0, /* Set in virtblk_probe */
495 .numa_node = NUMA_NO_NODE,
496 .flags = BLK_MQ_F_SHOULD_MERGE,
497};
498module_param_named(queue_depth, virtio_mq_reg.queue_depth, uint, 0444);
499
500static int virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx,
501 struct request *rq, unsigned int nr)
502{ 489{
503 struct virtio_blk *vblk = data; 490 struct virtio_blk *vblk = data;
504 struct virtblk_req *vbr = rq->special; 491 struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
505 492
506 sg_init_table(vbr->sg, vblk->sg_elems); 493 sg_init_table(vbr->sg, vblk->sg_elems);
507 return 0; 494 return 0;
508} 495}
509 496
497static struct blk_mq_ops virtio_mq_ops = {
498 .queue_rq = virtio_queue_rq,
499 .map_queue = blk_mq_map_queue,
500 .complete = virtblk_request_done,
501 .init_request = virtblk_init_request,
502};
503
504static unsigned int virtblk_queue_depth;
505module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
506
510static int virtblk_probe(struct virtio_device *vdev) 507static int virtblk_probe(struct virtio_device *vdev)
511{ 508{
512 struct virtio_blk *vblk; 509 struct virtio_blk *vblk;
@@ -561,24 +558,34 @@ static int virtblk_probe(struct virtio_device *vdev)
561 } 558 }
562 559
563 /* Default queue sizing is to fill the ring. */ 560 /* Default queue sizing is to fill the ring. */
564 if (!virtio_mq_reg.queue_depth) { 561 if (!virtblk_queue_depth) {
565 virtio_mq_reg.queue_depth = vblk->vq->num_free; 562 virtblk_queue_depth = vblk->vq->num_free;
566 /* ... but without indirect descs, we use 2 descs per req */ 563 /* ... but without indirect descs, we use 2 descs per req */
567 if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) 564 if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
568 virtio_mq_reg.queue_depth /= 2; 565 virtblk_queue_depth /= 2;
569 } 566 }
570 virtio_mq_reg.cmd_size = 567
568 memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
569 vblk->tag_set.ops = &virtio_mq_ops;
570 vblk->tag_set.nr_hw_queues = 1;
571 vblk->tag_set.queue_depth = virtblk_queue_depth;
572 vblk->tag_set.numa_node = NUMA_NO_NODE;
573 vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
574 vblk->tag_set.cmd_size =
571 sizeof(struct virtblk_req) + 575 sizeof(struct virtblk_req) +
572 sizeof(struct scatterlist) * sg_elems; 576 sizeof(struct scatterlist) * sg_elems;
577 vblk->tag_set.driver_data = vblk;
573 578
574 q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk); 579 err = blk_mq_alloc_tag_set(&vblk->tag_set);
580 if (err)
581 goto out_put_disk;
582
583 q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);
575 if (!q) { 584 if (!q) {
576 err = -ENOMEM; 585 err = -ENOMEM;
577 goto out_put_disk; 586 goto out_free_tags;
578 } 587 }
579 588
580 blk_mq_init_commands(q, virtblk_init_vbr, vblk);
581
582 q->queuedata = vblk; 589 q->queuedata = vblk;
583 590
584 virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); 591 virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
@@ -679,6 +686,8 @@ static int virtblk_probe(struct virtio_device *vdev)
679out_del_disk: 686out_del_disk:
680 del_gendisk(vblk->disk); 687 del_gendisk(vblk->disk);
681 blk_cleanup_queue(vblk->disk->queue); 688 blk_cleanup_queue(vblk->disk->queue);
689out_free_tags:
690 blk_mq_free_tag_set(&vblk->tag_set);
682out_put_disk: 691out_put_disk:
683 put_disk(vblk->disk); 692 put_disk(vblk->disk);
684out_free_vq: 693out_free_vq:
@@ -705,6 +714,8 @@ static void virtblk_remove(struct virtio_device *vdev)
705 del_gendisk(vblk->disk); 714 del_gendisk(vblk->disk);
706 blk_cleanup_queue(vblk->disk->queue); 715 blk_cleanup_queue(vblk->disk->queue);
707 716
717 blk_mq_free_tag_set(&vblk->tag_set);
718
708 /* Stop all the virtqueues. */ 719 /* Stop all the virtqueues. */
709 vdev->config->reset(vdev); 720 vdev->config->reset(vdev);
710 721
@@ -749,7 +760,7 @@ static int virtblk_restore(struct virtio_device *vdev)
749 vblk->config_enable = true; 760 vblk->config_enable = true;
750 ret = init_vq(vdev->priv); 761 ret = init_vq(vdev->priv);
751 if (!ret) 762 if (!ret)
752 blk_mq_start_stopped_hw_queues(vblk->disk->queue); 763 blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
753 764
754 return ret; 765 return ret;
755} 766}
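
The virtio-blk conversion shows the per-request side of the new interface: driver data lives in the area blk-mq allocates directly behind struct request (sized by tag_set.cmd_size) and is reached with blk_mq_rq_to_pdu() instead of req->special, while one-time setup moves from blk_mq_init_commands() into the .init_request callback. A minimal sketch, where struct mydrv_cmd and MYDRV_SEGS are hypothetical:

/*
 * Minimal sketch; struct mydrv_cmd and MYDRV_SEGS are hypothetical, and
 * tag_set.cmd_size is assumed to be at least sizeof(struct mydrv_cmd) so
 * that blk_mq_rq_to_pdu() points at valid per-request space.
 */
#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/scatterlist.h>

#define MYDRV_SEGS	8

struct mydrv_cmd {
	struct scatterlist sg[MYDRV_SEGS];
};

static int mydrv_init_request(void *data, struct request *rq,
			      unsigned int hctx_idx, unsigned int request_idx,
			      unsigned int numa_node)
{
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);	/* was: rq->special */

	sg_init_table(cmd->sg, MYDRV_SEGS);		/* one-time setup */
	return 0;
}

static void mydrv_complete(struct request *rq)
{
	struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

	/* ... read completion status out of cmd ... */
	(void)cmd;
	blk_mq_end_io(rq, 0);
}

The other recurring change in this hunk, blk_mq_start_stopped_hw_queues(q, true), is the same call as before with an explicit flag asking for the restart to run asynchronously.
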
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index efe1b4761735..283a30e88287 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -612,10 +612,10 @@ static void do_blkif_request(struct request_queue *rq)
612 } 612 }
613 613
614 pr_debug("do_blk_req %p: cmd %p, sec %lx, " 614 pr_debug("do_blk_req %p: cmd %p, sec %lx, "
615 "(%u/%u) buffer:%p [%s]\n", 615 "(%u/%u) [%s]\n",
616 req, req->cmd, (unsigned long)blk_rq_pos(req), 616 req, req->cmd, (unsigned long)blk_rq_pos(req),
617 blk_rq_cur_sectors(req), blk_rq_sectors(req), 617 blk_rq_cur_sectors(req), blk_rq_sectors(req),
618 req->buffer, rq_data_dir(req) ? "write" : "read"); 618 rq_data_dir(req) ? "write" : "read");
619 619
620 if (blkif_queue_request(req)) { 620 if (blkif_queue_request(req)) {
621 blk_requeue_request(rq, req); 621 blk_requeue_request(rq, req);
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 1393b8871a28..ab3ea62e5dfc 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -661,7 +661,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
661 rq_data_dir(req)); 661 rq_data_dir(req));
662 662
663 ace->req = req; 663 ace->req = req;
664 ace->data_ptr = req->buffer; 664 ace->data_ptr = bio_data(req->bio);
665 ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR; 665 ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR;
666 ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF); 666 ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF);
667 667
@@ -733,7 +733,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
733 * blk_rq_sectors(ace->req), 733 * blk_rq_sectors(ace->req),
734 * blk_rq_cur_sectors(ace->req)); 734 * blk_rq_cur_sectors(ace->req));
735 */ 735 */
736 ace->data_ptr = ace->req->buffer; 736 ace->data_ptr = bio_data(ace->req->bio);
737 ace->data_count = blk_rq_cur_sectors(ace->req) * 16; 737 ace->data_count = blk_rq_cur_sectors(ace->req) * 16;
738 ace_fsm_yieldirq(ace); 738 ace_fsm_yieldirq(ace);
739 break; 739 break;
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 27de5046708a..968f9e52effa 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -87,13 +87,15 @@ static void do_z2_request(struct request_queue *q)
87 while (len) { 87 while (len) {
88 unsigned long addr = start & Z2RAM_CHUNKMASK; 88 unsigned long addr = start & Z2RAM_CHUNKMASK;
89 unsigned long size = Z2RAM_CHUNKSIZE - addr; 89 unsigned long size = Z2RAM_CHUNKSIZE - addr;
90 void *buffer = bio_data(req->bio);
91
90 if (len < size) 92 if (len < size)
91 size = len; 93 size = len;
92 addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ]; 94 addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ];
93 if (rq_data_dir(req) == READ) 95 if (rq_data_dir(req) == READ)
94 memcpy(req->buffer, (char *)addr, size); 96 memcpy(buffer, (char *)addr, size);
95 else 97 else
96 memcpy((char *)addr, req->buffer, size); 98 memcpy((char *)addr, buffer, size);
97 start += size; 99 start += size;
98 len -= size; 100 len -= size;
99 } 101 }
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 51e75ad96422..584bc3126403 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -602,7 +602,7 @@ static void gdrom_readdisk_dma(struct work_struct *work)
602 spin_unlock(&gdrom_lock); 602 spin_unlock(&gdrom_lock);
603 block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET; 603 block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET;
604 block_cnt = blk_rq_sectors(req)/GD_TO_BLK; 604 block_cnt = blk_rq_sectors(req)/GD_TO_BLK;
605 __raw_writel(virt_to_phys(req->buffer), GDROM_DMA_STARTADDR_REG); 605 __raw_writel(virt_to_phys(bio_data(req->bio)), GDROM_DMA_STARTADDR_REG);
606 __raw_writel(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG); 606 __raw_writel(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG);
607 __raw_writel(1, GDROM_DMA_DIRECTION_REG); 607 __raw_writel(1, GDROM_DMA_DIRECTION_REG);
608 __raw_writel(1, GDROM_DMA_ENABLE_REG); 608 __raw_writel(1, GDROM_DMA_ENABLE_REG);
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 102c50d38902..06cea7ff3a7c 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -902,6 +902,7 @@ void add_disk_randomness(struct gendisk *disk)
902 add_timer_randomness(disk->random, 0x100 + disk_devt(disk)); 902 add_timer_randomness(disk->random, 0x100 + disk_devt(disk));
903 trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool)); 903 trace_add_disk_randomness(disk_devt(disk), ENTROPY_BITS(&input_pool));
904} 904}
905EXPORT_SYMBOL_GPL(add_disk_randomness);
905#endif 906#endif
906 907
907/********************************************************************* 908/*********************************************************************
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 16f69be820c7..ee880382e3bc 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -188,10 +188,9 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
188 188
189 ledtrig_ide_activity(); 189 ledtrig_ide_activity();
190 190
191 pr_debug("%s: %sing: block=%llu, sectors=%u, buffer=0x%08lx\n", 191 pr_debug("%s: %sing: block=%llu, sectors=%u\n",
192 drive->name, rq_data_dir(rq) == READ ? "read" : "writ", 192 drive->name, rq_data_dir(rq) == READ ? "read" : "writ",
193 (unsigned long long)block, blk_rq_sectors(rq), 193 (unsigned long long)block, blk_rq_sectors(rq));
194 (unsigned long)rq->buffer);
195 194
196 if (hwif->rw_disk) 195 if (hwif->rw_disk)
197 hwif->rw_disk(drive, rq); 196 hwif->rw_disk(drive, rq);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 455e64916498..6a71bc7c9133 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1544,7 +1544,6 @@ static int setup_clone(struct request *clone, struct request *rq,
1544 clone->cmd = rq->cmd; 1544 clone->cmd = rq->cmd;
1545 clone->cmd_len = rq->cmd_len; 1545 clone->cmd_len = rq->cmd_len;
1546 clone->sense = rq->sense; 1546 clone->sense = rq->sense;
1547 clone->buffer = rq->buffer;
1548 clone->end_io = end_clone_request; 1547 clone->end_io = end_clone_request;
1549 clone->end_io_data = tio; 1548 clone->end_io_data = tio;
1550 1549
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 0b2ccb68c0d0..4dbfaee9aa95 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -82,8 +82,7 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
82 82
83 block = blk_rq_pos(req) << 9 >> tr->blkshift; 83 block = blk_rq_pos(req) << 9 >> tr->blkshift;
84 nsect = blk_rq_cur_bytes(req) >> tr->blkshift; 84 nsect = blk_rq_cur_bytes(req) >> tr->blkshift;
85 85 buf = bio_data(req->bio);
86 buf = req->buffer;
87 86
88 if (req->cmd_type != REQ_TYPE_FS) 87 if (req->cmd_type != REQ_TYPE_FS)
89 return -EIO; 88 return -EIO;
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index 8d659e6a1b4c..20a667c95da4 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -253,7 +253,7 @@ static int do_ubiblock_request(struct ubiblock *dev, struct request *req)
253 * flash access anyway. 253 * flash access anyway.
254 */ 254 */
255 mutex_lock(&dev->dev_mutex); 255 mutex_lock(&dev->dev_mutex);
256 ret = ubiblock_read(dev, req->buffer, sec, len); 256 ret = ubiblock_read(dev, bio_data(req->bio), sec, len);
257 mutex_unlock(&dev->dev_mutex); 257 mutex_unlock(&dev->dev_mutex);
258 258
259 return ret; 259 return ret;
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index 4ccb5d869389..a40ee1e37486 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -207,7 +207,7 @@ static void jsfd_do_request(struct request_queue *q)
207 goto end; 207 goto end;
208 } 208 }
209 209
210 jsfd_read(req->buffer, jdp->dbase + offset, len); 210 jsfd_read(bio_data(req->bio), jdp->dbase + offset, len);
211 err = 0; 211 err = 0;
212 end: 212 end:
213 if (!__blk_end_request_cur(req, err)) 213 if (!__blk_end_request_cur(req, err))
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9db097a28a74..a0c95cac91f0 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -140,7 +140,7 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy)
140 cmd->result = 0; 140 cmd->result = 0;
141 spin_lock_irqsave(q->queue_lock, flags); 141 spin_lock_irqsave(q->queue_lock, flags);
142 blk_requeue_request(q, cmd->request); 142 blk_requeue_request(q, cmd->request);
143 kblockd_schedule_work(q, &device->requeue_work); 143 kblockd_schedule_work(&device->requeue_work);
144 spin_unlock_irqrestore(q->queue_lock, flags); 144 spin_unlock_irqrestore(q->queue_lock, flags);
145} 145}
146 146
@@ -1019,8 +1019,6 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
1019 return BLKPREP_DEFER; 1019 return BLKPREP_DEFER;
1020 } 1020 }
1021 1021
1022 req->buffer = NULL;
1023
1024 /* 1022 /*
1025 * Next, walk the list, and fill in the addresses and sizes of 1023 * Next, walk the list, and fill in the addresses and sizes of
1026 * each segment. 1024 * each segment.
@@ -1158,7 +1156,6 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
1158 BUG_ON(blk_rq_bytes(req)); 1156 BUG_ON(blk_rq_bytes(req));
1159 1157
1160 memset(&cmd->sdb, 0, sizeof(cmd->sdb)); 1158 memset(&cmd->sdb, 0, sizeof(cmd->sdb));
1161 req->buffer = NULL;
1162 } 1159 }
1163 1160
1164 cmd->cmd_len = req->cmd_len; 1161 cmd->cmd_len = req->cmd_len;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index efcbcd182863..96af195224f2 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -737,16 +737,14 @@ static int sd_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq)
737 goto out; 737 goto out;
738 } 738 }
739 739
740 rq->completion_data = page;
740 blk_add_request_payload(rq, page, len); 741 blk_add_request_payload(rq, page, len);
741 ret = scsi_setup_blk_pc_cmnd(sdp, rq); 742 ret = scsi_setup_blk_pc_cmnd(sdp, rq);
742 rq->buffer = page_address(page);
743 rq->__data_len = nr_bytes; 743 rq->__data_len = nr_bytes;
744 744
745out: 745out:
746 if (ret != BLKPREP_OK) { 746 if (ret != BLKPREP_OK)
747 __free_page(page); 747 __free_page(page);
748 rq->buffer = NULL;
749 }
750 return ret; 748 return ret;
751} 749}
752 750
@@ -842,10 +840,9 @@ static void sd_unprep_fn(struct request_queue *q, struct request *rq)
842{ 840{
843 struct scsi_cmnd *SCpnt = rq->special; 841 struct scsi_cmnd *SCpnt = rq->special;
844 842
845 if (rq->cmd_flags & REQ_DISCARD) { 843 if (rq->cmd_flags & REQ_DISCARD)
846 free_page((unsigned long)rq->buffer); 844 __free_page(rq->completion_data);
847 rq->buffer = NULL; 845
848 }
849 if (SCpnt->cmnd != rq->cmd) { 846 if (SCpnt->cmnd != rq->cmd) {
850 mempool_free(SCpnt->cmnd, sd_cdb_pool); 847 mempool_free(SCpnt->cmnd, sd_cdb_pool);
851 SCpnt->cmnd = NULL; 848 SCpnt->cmnd = NULL;
diff --git a/fs/Makefile b/fs/Makefile
index f9cb9876e466..4030cbfbc9af 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \
14 stack.o fs_struct.o statfs.o 14 stack.o fs_struct.o statfs.o
15 15
16ifeq ($(CONFIG_BLOCK),y) 16ifeq ($(CONFIG_BLOCK),y)
17obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o 17obj-y += buffer.o block_dev.o direct-io.o mpage.o
18else 18else
19obj-y += no-block.o 19obj-y += no-block.o
20endif 20endif
21 21
22obj-$(CONFIG_PROC_FS) += proc_namespace.o 22obj-$(CONFIG_PROC_FS) += proc_namespace.o
23 23
24obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
25obj-y += notify/ 24obj-y += notify/
26obj-$(CONFIG_EPOLL) += eventpoll.o 25obj-$(CONFIG_EPOLL) += eventpoll.o
27obj-$(CONFIG_ANON_INODES) += anon_inodes.o 26obj-$(CONFIG_ANON_INODES) += anon_inodes.o
diff --git a/include/linux/bio.h b/include/linux/bio.h
index bba550826921..5a645769f020 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -333,7 +333,7 @@ static inline struct bio *bio_next_split(struct bio *bio, int sectors,
333 333
334extern struct bio_set *bioset_create(unsigned int, unsigned int); 334extern struct bio_set *bioset_create(unsigned int, unsigned int);
335extern void bioset_free(struct bio_set *); 335extern void bioset_free(struct bio_set *);
336extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); 336extern mempool_t *biovec_create_pool(int pool_entries);
337 337
338extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); 338extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
339extern void bio_put(struct bio *); 339extern void bio_put(struct bio *);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0120451545d8..c15128833100 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -8,7 +8,13 @@ struct blk_mq_tags;
8struct blk_mq_cpu_notifier { 8struct blk_mq_cpu_notifier {
9 struct list_head list; 9 struct list_head list;
10 void *data; 10 void *data;
11 void (*notify)(void *data, unsigned long action, unsigned int cpu); 11 int (*notify)(void *data, unsigned long action, unsigned int cpu);
12};
13
14struct blk_mq_ctxmap {
15 unsigned int map_size;
16 unsigned int bits_per_word;
17 struct blk_align_bitmap *map;
12}; 18};
13 19
14struct blk_mq_hw_ctx { 20struct blk_mq_hw_ctx {
@@ -18,7 +24,11 @@ struct blk_mq_hw_ctx {
18 } ____cacheline_aligned_in_smp; 24 } ____cacheline_aligned_in_smp;
19 25
20 unsigned long state; /* BLK_MQ_S_* flags */ 26 unsigned long state; /* BLK_MQ_S_* flags */
21 struct delayed_work delayed_work; 27 struct delayed_work run_work;
28 struct delayed_work delay_work;
29 cpumask_var_t cpumask;
30 int next_cpu;
31 int next_cpu_batch;
22 32
23 unsigned long flags; /* BLK_MQ_F_* flags */ 33 unsigned long flags; /* BLK_MQ_F_* flags */
24 34
@@ -27,13 +37,13 @@ struct blk_mq_hw_ctx {
27 37
28 void *driver_data; 38 void *driver_data;
29 39
40 struct blk_mq_ctxmap ctx_map;
41
30 unsigned int nr_ctx; 42 unsigned int nr_ctx;
31 struct blk_mq_ctx **ctxs; 43 struct blk_mq_ctx **ctxs;
32 unsigned int nr_ctx_map;
33 unsigned long *ctx_map;
34 44
35 struct request **rqs; 45 unsigned int wait_index;
36 struct list_head page_list; 46
37 struct blk_mq_tags *tags; 47 struct blk_mq_tags *tags;
38 48
39 unsigned long queued; 49 unsigned long queued;
@@ -41,31 +51,40 @@ struct blk_mq_hw_ctx {
41#define BLK_MQ_MAX_DISPATCH_ORDER 10 51#define BLK_MQ_MAX_DISPATCH_ORDER 10
42 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; 52 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
43 53
44 unsigned int queue_depth;
45 unsigned int numa_node; 54 unsigned int numa_node;
46 unsigned int cmd_size; /* per-request extra data */ 55 unsigned int cmd_size; /* per-request extra data */
47 56
57 atomic_t nr_active;
58
48 struct blk_mq_cpu_notifier cpu_notifier; 59 struct blk_mq_cpu_notifier cpu_notifier;
49 struct kobject kobj; 60 struct kobject kobj;
50}; 61};
51 62
52struct blk_mq_reg { 63struct blk_mq_tag_set {
53 struct blk_mq_ops *ops; 64 struct blk_mq_ops *ops;
54 unsigned int nr_hw_queues; 65 unsigned int nr_hw_queues;
55 unsigned int queue_depth; 66 unsigned int queue_depth; /* max hw supported */
56 unsigned int reserved_tags; 67 unsigned int reserved_tags;
57 unsigned int cmd_size; /* per-request extra data */ 68 unsigned int cmd_size; /* per-request extra data */
58 int numa_node; 69 int numa_node;
59 unsigned int timeout; 70 unsigned int timeout;
60 unsigned int flags; /* BLK_MQ_F_* */ 71 unsigned int flags; /* BLK_MQ_F_* */
72 void *driver_data;
73
74 struct blk_mq_tags **tags;
75
76 struct mutex tag_list_lock;
77 struct list_head tag_list;
61}; 78};
62 79
63typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *); 80typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
64typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); 81typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
65typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int);
66typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
67typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); 82typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
68typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); 83typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
84typedef int (init_request_fn)(void *, struct request *, unsigned int,
85 unsigned int, unsigned int);
86typedef void (exit_request_fn)(void *, struct request *, unsigned int,
87 unsigned int);
69 88
70struct blk_mq_ops { 89struct blk_mq_ops {
71 /* 90 /*
@@ -86,18 +105,20 @@ struct blk_mq_ops {
86 softirq_done_fn *complete; 105 softirq_done_fn *complete;
87 106
88 /* 107 /*
89 * Override for hctx allocations (should probably go)
90 */
91 alloc_hctx_fn *alloc_hctx;
92 free_hctx_fn *free_hctx;
93
94 /*
95 * Called when the block layer side of a hardware queue has been 108 * Called when the block layer side of a hardware queue has been
96 * set up, allowing the driver to allocate/init matching structures. 109 * set up, allowing the driver to allocate/init matching structures.
97 * Ditto for exit/teardown. 110 * Ditto for exit/teardown.
98 */ 111 */
99 init_hctx_fn *init_hctx; 112 init_hctx_fn *init_hctx;
100 exit_hctx_fn *exit_hctx; 113 exit_hctx_fn *exit_hctx;
114
115 /*
116 * Called for every command allocated by the block layer to allow
117 * the driver to set up driver specific data.
118 * Ditto for exit/teardown.
119 */
120 init_request_fn *init_request;
121 exit_request_fn *exit_request;
101}; 122};
102 123
103enum { 124enum {
@@ -107,18 +128,24 @@ enum {
107 128
108 BLK_MQ_F_SHOULD_MERGE = 1 << 0, 129 BLK_MQ_F_SHOULD_MERGE = 1 << 0,
109 BLK_MQ_F_SHOULD_SORT = 1 << 1, 130 BLK_MQ_F_SHOULD_SORT = 1 << 1,
110 BLK_MQ_F_SHOULD_IPI = 1 << 2, 131 BLK_MQ_F_TAG_SHARED = 1 << 2,
132 BLK_MQ_F_SG_MERGE = 1 << 3,
133 BLK_MQ_F_SYSFS_UP = 1 << 4,
111 134
112 BLK_MQ_S_STOPPED = 0, 135 BLK_MQ_S_STOPPED = 0,
136 BLK_MQ_S_TAG_ACTIVE = 1,
113 137
114 BLK_MQ_MAX_DEPTH = 2048, 138 BLK_MQ_MAX_DEPTH = 2048,
139
140 BLK_MQ_CPU_WORK_BATCH = 8,
115}; 141};
116 142
117struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *); 143struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
118int blk_mq_register_disk(struct gendisk *); 144int blk_mq_register_disk(struct gendisk *);
119void blk_mq_unregister_disk(struct gendisk *); 145void blk_mq_unregister_disk(struct gendisk *);
120int blk_mq_init_commands(struct request_queue *, int (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); 146
121void blk_mq_free_commands(struct request_queue *, void (*free)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data); 147int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
148void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
122 149
123void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); 150void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
124 151
@@ -126,28 +153,28 @@ void blk_mq_insert_request(struct request *, bool, bool, bool);
126void blk_mq_run_queues(struct request_queue *q, bool async); 153void blk_mq_run_queues(struct request_queue *q, bool async);
127void blk_mq_free_request(struct request *rq); 154void blk_mq_free_request(struct request *rq);
128bool blk_mq_can_queue(struct blk_mq_hw_ctx *); 155bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
129struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp); 156struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
130struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp); 157 gfp_t gfp, bool reserved);
131struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag); 158struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx, unsigned int tag);
132 159
133struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); 160struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
134struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int); 161struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int);
135void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
136 162
137bool blk_mq_end_io_partial(struct request *rq, int error, 163void blk_mq_end_io(struct request *rq, int error);
138 unsigned int nr_bytes); 164void __blk_mq_end_io(struct request *rq, int error);
139static inline void blk_mq_end_io(struct request *rq, int error)
140{
141 bool done = !blk_mq_end_io_partial(rq, error, blk_rq_bytes(rq));
142 BUG_ON(!done);
143}
144 165
166void blk_mq_requeue_request(struct request *rq);
167void blk_mq_add_to_requeue_list(struct request *rq, bool at_head);
168void blk_mq_kick_requeue_list(struct request_queue *q);
145void blk_mq_complete_request(struct request *rq); 169void blk_mq_complete_request(struct request *rq);
146 170
147void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); 171void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
148void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); 172void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
149void blk_mq_stop_hw_queues(struct request_queue *q); 173void blk_mq_stop_hw_queues(struct request_queue *q);
150void blk_mq_start_stopped_hw_queues(struct request_queue *q); 174void blk_mq_start_hw_queues(struct request_queue *q);
175void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
176void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
177void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
151 178
152/* 179/*
153 * Driver command data is immediately after the request. So subtract request 180 * Driver command data is immediately after the request. So subtract request
@@ -162,12 +189,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
162 return (void *) rq + sizeof(*rq); 189 return (void *) rq + sizeof(*rq);
163} 190}
164 191
165static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
166 unsigned int tag)
167{
168 return hctx->rqs[tag];
169}
170
171#define queue_for_each_hw_ctx(q, hctx, i) \ 192#define queue_for_each_hw_ctx(q, hctx, i) \
172 for ((i) = 0; (i) < (q)->nr_hw_queues && \ 193 for ((i) = 0; (i) < (q)->nr_hw_queues && \
173 ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) 194 ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
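
Taken together, the header changes describe the new driver-facing lifecycle: fill in a struct blk_mq_tag_set, allocate it once with blk_mq_alloc_tag_set(), build the queue from it with blk_mq_init_queue(), and release queue and set in reverse order on teardown; the tag_list/tag_set_list fields are what let several queues reference one set. A hedged skeleton follows; every mydrv_* name is hypothetical and the three ops callbacks are assumed to be defined elsewhere.

/*
 * Hedged skeleton of the setup/teardown flow; every mydrv_* symbol is
 * hypothetical and the ops callbacks are assumed to exist elsewhere.
 */
#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/errno.h>
#include <linux/numa.h>
#include <linux/string.h>

static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq);
static void mydrv_complete(struct request *rq);
static int mydrv_init_request(void *data, struct request *rq,
			      unsigned int hctx_idx, unsigned int request_idx,
			      unsigned int numa_node);

static struct blk_mq_ops mydrv_mq_ops = {
	.queue_rq	= mydrv_queue_rq,
	.map_queue	= blk_mq_map_queue,
	.complete	= mydrv_complete,
	.init_request	= mydrv_init_request,	/* replaces blk_mq_init_commands() */
};

struct mydrv {
	struct blk_mq_tag_set	tag_set;
	struct request_queue	*queue;
};

static int mydrv_setup_queue(struct mydrv *d)
{
	int err;

	memset(&d->tag_set, 0, sizeof(d->tag_set));
	d->tag_set.ops		= &mydrv_mq_ops;
	d->tag_set.nr_hw_queues	= 1;
	d->tag_set.queue_depth	= 64;			/* assumed hw limit */
	d->tag_set.numa_node	= NUMA_NO_NODE;
	d->tag_set.cmd_size	= 0;			/* plus any per-request pdu */
	d->tag_set.flags	= BLK_MQ_F_SHOULD_MERGE;
	d->tag_set.driver_data	= d;

	err = blk_mq_alloc_tag_set(&d->tag_set);	/* tags now live in the set */
	if (err)
		return err;

	d->queue = blk_mq_init_queue(&d->tag_set);	/* was: blk_mq_init_queue(&reg, data) */
	if (!d->queue) {
		blk_mq_free_tag_set(&d->tag_set);
		return -ENOMEM;
	}
	d->queue->queuedata = d;
	return 0;
}

static void mydrv_teardown_queue(struct mydrv *d)
{
	blk_cleanup_queue(d->queue);
	blk_mq_free_tag_set(&d->tag_set);
}

The virtio-blk hunks above follow exactly this shape, including the out_free_tags unwind when blk_mq_init_queue() fails.
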
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index aa0eaa2d0bd8..d8e4cea23a25 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -190,6 +190,7 @@ enum rq_flag_bits {
190 __REQ_PM, /* runtime pm request */ 190 __REQ_PM, /* runtime pm request */
191 __REQ_END, /* last of chain of requests */ 191 __REQ_END, /* last of chain of requests */
192 __REQ_HASHED, /* on IO scheduler merge hash */ 192 __REQ_HASHED, /* on IO scheduler merge hash */
193 __REQ_MQ_INFLIGHT, /* track inflight for MQ */
193 __REQ_NR_BITS, /* stops here */ 194 __REQ_NR_BITS, /* stops here */
194}; 195};
195 196
@@ -243,5 +244,6 @@ enum rq_flag_bits {
243#define REQ_PM (1ULL << __REQ_PM) 244#define REQ_PM (1ULL << __REQ_PM)
244#define REQ_END (1ULL << __REQ_END) 245#define REQ_END (1ULL << __REQ_END)
245#define REQ_HASHED (1ULL << __REQ_HASHED) 246#define REQ_HASHED (1ULL << __REQ_HASHED)
247#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT)
246 248
247#endif /* __LINUX_BLK_TYPES_H */ 249#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0d84981ee03f..695b9fd41efe 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -90,15 +90,15 @@ enum rq_cmd_type_bits {
90#define BLK_MAX_CDB 16 90#define BLK_MAX_CDB 16
91 91
92/* 92/*
93 * try to put the fields that are referenced together in the same cacheline. 93 * Try to put the fields that are referenced together in the same cacheline.
94 * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init() 94 *
95 * as well! 95 * If you modify this structure, make sure to update blk_rq_init() and
96 * especially blk_mq_rq_ctx_init() to take care of the added fields.
96 */ 97 */
97struct request { 98struct request {
98 struct list_head queuelist; 99 struct list_head queuelist;
99 union { 100 union {
100 struct call_single_data csd; 101 struct call_single_data csd;
101 struct work_struct mq_flush_work;
102 unsigned long fifo_time; 102 unsigned long fifo_time;
103 }; 103 };
104 104
@@ -178,7 +178,6 @@ struct request {
178 unsigned short ioprio; 178 unsigned short ioprio;
179 179
180 void *special; /* opaque pointer available for LLD use */ 180 void *special; /* opaque pointer available for LLD use */
181 char *buffer; /* kaddr of the current segment if available */
182 181
183 int tag; 182 int tag;
184 int errors; 183 int errors;
@@ -463,6 +462,10 @@ struct request_queue {
463 struct request *flush_rq; 462 struct request *flush_rq;
464 spinlock_t mq_flush_lock; 463 spinlock_t mq_flush_lock;
465 464
465 struct list_head requeue_list;
466 spinlock_t requeue_lock;
467 struct work_struct requeue_work;
468
466 struct mutex sysfs_lock; 469 struct mutex sysfs_lock;
467 470
468 int bypass_depth; 471 int bypass_depth;
@@ -481,6 +484,9 @@ struct request_queue {
481 wait_queue_head_t mq_freeze_wq; 484 wait_queue_head_t mq_freeze_wq;
482 struct percpu_counter mq_usage_counter; 485 struct percpu_counter mq_usage_counter;
483 struct list_head all_q_node; 486 struct list_head all_q_node;
487
488 struct blk_mq_tag_set *tag_set;
489 struct list_head tag_set_list;
484}; 490};
485 491
486#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ 492#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
@@ -504,6 +510,7 @@ struct request_queue {
504#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ 510#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */
505#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */ 511#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */
506#define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */ 512#define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */
513#define QUEUE_FLAG_NO_SG_MERGE 21 /* don't attempt to merge SG segments*/
507 514
508#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 515#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
509 (1 << QUEUE_FLAG_STACKABLE) | \ 516 (1 << QUEUE_FLAG_STACKABLE) | \
@@ -937,6 +944,7 @@ extern struct request *blk_fetch_request(struct request_queue *q);
937 */ 944 */
938extern bool blk_update_request(struct request *rq, int error, 945extern bool blk_update_request(struct request *rq, int error,
939 unsigned int nr_bytes); 946 unsigned int nr_bytes);
947extern void blk_finish_request(struct request *rq, int error);
940extern bool blk_end_request(struct request *rq, int error, 948extern bool blk_end_request(struct request *rq, int error,
941 unsigned int nr_bytes); 949 unsigned int nr_bytes);
942extern void blk_end_request_all(struct request *rq, int error); 950extern void blk_end_request_all(struct request *rq, int error);
@@ -1053,7 +1061,6 @@ static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
1053 * schedule() where blk_schedule_flush_plug() is called. 1061 * schedule() where blk_schedule_flush_plug() is called.
1054 */ 1062 */
1055struct blk_plug { 1063struct blk_plug {
1056 unsigned long magic; /* detect uninitialized use-cases */
1057 struct list_head list; /* requests */ 1064 struct list_head list; /* requests */
1058 struct list_head mq_list; /* blk-mq requests */ 1065 struct list_head mq_list; /* blk-mq requests */
1059 struct list_head cb_list; /* md requires an unplug callback */ 1066 struct list_head cb_list; /* md requires an unplug callback */
@@ -1102,7 +1109,8 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
1102/* 1109/*
1103 * tag stuff 1110 * tag stuff
1104 */ 1111 */
1105#define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED) 1112#define blk_rq_tagged(rq) \
1113 ((rq)->mq_ctx || ((rq)->cmd_flags & REQ_QUEUED))
1106extern int blk_queue_start_tag(struct request_queue *, struct request *); 1114extern int blk_queue_start_tag(struct request_queue *, struct request *);
1107extern struct request *blk_queue_find_tag(struct request_queue *, int); 1115extern struct request *blk_queue_find_tag(struct request_queue *, int);
1108extern void blk_queue_end_tag(struct request_queue *, struct request *); 1116extern void blk_queue_end_tag(struct request_queue *, struct request *);
@@ -1370,8 +1378,9 @@ static inline void put_dev_sector(Sector p)
1370} 1378}
1371 1379
1372struct work_struct; 1380struct work_struct;
1373int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1381int kblockd_schedule_work(struct work_struct *work);
1374int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay); 1382int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
1383int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
1375 1384
1376#ifdef CONFIG_BLK_CGROUP 1385#ifdef CONFIG_BLK_CGROUP
1377/* 1386/*
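
Besides dropping request->buffer and the blk_plug 'magic' field, the blkdev.h hunk reworks the kblockd helpers: kblockd_schedule_work() and kblockd_schedule_delayed_work() lose their request_queue argument, and a CPU-pinned kblockd_schedule_delayed_work_on() variant is added (blk_rq_tagged() also learns to treat blk-mq requests, which carry mq_ctx, as tagged). A minimal sketch of the new call shapes, with hypothetical work items:

/*
 * Minimal sketch of the new kblockd call shapes; the handler and the two
 * work items are hypothetical driver state.
 */
#include <linux/blkdev.h>
#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void mydrv_work_fn(struct work_struct *work)
{
	/* requeue/poll logic would live here */
}

static DECLARE_WORK(mydrv_requeue_work, mydrv_work_fn);
static DECLARE_DELAYED_WORK(mydrv_poll_work, mydrv_work_fn);

static void mydrv_kick(void)
{
	/* The request_queue argument is gone from both existing helpers. */
	kblockd_schedule_work(&mydrv_requeue_work);
	kblockd_schedule_delayed_work(&mydrv_poll_work, msecs_to_jiffies(10));
}
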
diff --git a/mm/Makefile b/mm/Makefile
index b484452dac57..0173940407f6 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,7 +30,6 @@ endif
30 30
31obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 31obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
32 32
33obj-$(CONFIG_BOUNCE) += bounce.o
34obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o 33obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
35obj-$(CONFIG_FRONTSWAP) += frontswap.o 34obj-$(CONFIG_FRONTSWAP) += frontswap.o
36obj-$(CONFIG_ZSWAP) += zswap.o 35obj-$(CONFIG_ZSWAP) += zswap.o