author     Jens Axboe <axboe@kernel.dk>	2013-10-24 04:20:05 -0400
committer  Jens Axboe <axboe@kernel.dk>	2013-10-25 06:56:00 -0400
commit     320ae51feed5c2f13664aa05a76bec198967e04d (patch)
tree       ad37ccbcc5ddb1c9c19e48965bf8fec1b05217dc
parent     1dddc01af0d42b21058e0cb9c1ca9e8d5204d9b0 (diff)
blk-mq: new multi-queue block IO queueing mechanism
Linux currently has two models for block devices:

- The classic request_fn based approach, where drivers use struct
  request units for IO. The block layer provides various helper
  functionalities to let drivers share code, things like tag
  management, timeout handling, queueing, etc.

- The "stacked" approach, where a driver squeezes in between the
  block layer and IO submitter. Since this bypasses the IO stack,
  drivers generally have to manage everything themselves.

With drivers being written for new high IOPS devices, the classic
request_fn based driver doesn't work well enough. The design dates
back to when both SMP and high IOPS were rare. It has problems with
scaling to bigger machines, and runs into scaling issues even on
smaller machines when you have IOPS in the hundreds of thousands
per device.

The stacked approach is then most often selected as the model for
the driver. But this means that everybody has to re-invent
everything, and along with that we get all the problems again that
the shared approach solved.

This commit introduces blk-mq, block multi queue support. The
design is centered around per-cpu queues for queueing IO, which
then funnel down into a number of hardware submission queues. We
might have a 1:1 mapping between the two, or it might be an N:M
mapping. That all depends on what the hardware supports.

blk-mq provides various helper functions, which include:

- Scalable support for request tagging. Most devices need to be
  able to uniquely identify a request both in the driver and to
  the hardware. The tagging uses per-cpu caches for freed tags,
  to enable cache hot reuse.

- Timeout handling without tracking requests on a per-device
  basis. Basically the driver should be able to get a
  notification, if a request happens to fail.

- Optional support for non-1:1 mappings between issue and
  submission queues. blk-mq can redirect IO completions to the
  desired location.

- Support for per-request payloads. Drivers almost always need to
  associate a request structure with some driver private command
  structure. Drivers can tell blk-mq this at init time, and then
  any request handed to the driver will have the required size of
  memory associated with it.

- Support for merging of IO, and plugging. The stacked model gets
  neither of these. Even for high IOPS devices, merging sequential
  IO reduces per-command overhead and thus increases bandwidth.

For now, this is provided as a potential third queueing model, with
the hope being that, as it matures, it can replace both the classic
and stacked model. That would get us back to having just one real
model for block devices, leaving the stacked approach to dm/md
devices (as it was originally intended).

Contributions in this patch from the following people:

Shaohua Li <shli@fusionio.com>
Alexander Gordeev <agordeev@redhat.com>
Christoph Hellwig <hch@infradead.org>
Mike Christie <michaelc@cs.wisc.edu>
Matias Bjorling <m@bjorling.me>
Jeff Moyer <jmoyer@redhat.com>

Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
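
[Editor's note] As a rough illustration of the per-request payload point above: a driver declares its command size once at init, and every request it is handed carries that many private bytes, reachable through blk_mq_rq_to_pdu() (used later in this patch). The struct and function names below are hypothetical, a sketch rather than code from this series.

/* Sketch only: my_cmd and my_queue_rq are made-up names. The mechanism
 * shown (a driver-private pdu sized at init time, fetched per request
 * with blk_mq_rq_to_pdu()) is what this patch provides. */
struct my_cmd {
        unsigned int opcode;
        sector_t lba;
};

static void my_queue_rq(struct request *rq)
{
        /* blk-mq reserved sizeof(struct my_cmd) bytes behind this request */
        struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);

        cmd->opcode = rq_data_dir(rq);
        cmd->lba = blk_rq_pos(rq);
        /* ... issue to hardware, later complete with blk_mq_end_io(rq, 0) ... */
}
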
-rw-r--r--	block/Makefile	5
-rw-r--r--	block/blk-core.c	142
-rw-r--r--	block/blk-exec.c	7
-rw-r--r--	block/blk-flush.c	154
-rw-r--r--	block/blk-mq-cpu.c	93
-rw-r--r--	block/blk-mq-cpumap.c	108
-rw-r--r--	block/blk-mq-sysfs.c	384
-rw-r--r--	block/blk-mq-tag.c	204
-rw-r--r--	block/blk-mq-tag.h	27
-rw-r--r--	block/blk-mq.c	1480
-rw-r--r--	block/blk-mq.h	52
-rw-r--r--	block/blk-sysfs.c	13
-rw-r--r--	block/blk-timeout.c	73
-rw-r--r--	block/blk.h	17
-rw-r--r--	include/linux/bio.h	2
-rw-r--r--	include/linux/blk-mq.h	182
-rw-r--r--	include/linux/blk_types.h	2
-rw-r--r--	include/linux/blkdev.h	54
18 files changed, 2890 insertions(+), 109 deletions(-)
diff --git a/block/Makefile b/block/Makefile
index 671a83d063a5..20645e88fb57 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,8 +5,9 @@
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
                        blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-                       blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \
-                       partition-generic.o partitions/
+                       blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
+                       blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
+                       genhd.o scsi_ioctl.o partition-generic.o partitions/
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
 obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
diff --git a/block/blk-core.c b/block/blk-core.c
index 18faa7e81d3b..3bb9e9f7f87e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -16,6 +16,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/highmem.h>
 #include <linux/mm.h>
 #include <linux/kernel_stat.h>
@@ -48,7 +49,7 @@ DEFINE_IDA(blk_queue_ida);
 /*
  * For the allocated request tables
  */
-static struct kmem_cache *request_cachep;
+struct kmem_cache *request_cachep = NULL;
 
 /*
  * For queue allocation
@@ -60,42 +61,6 @@ struct kmem_cache *blk_requestq_cachep;
  */
 static struct workqueue_struct *kblockd_workqueue;
 
-static void drive_stat_acct(struct request *rq, int new_io)
-{
-        struct hd_struct *part;
-        int rw = rq_data_dir(rq);
-        int cpu;
-
-        if (!blk_do_io_stat(rq))
-                return;
-
-        cpu = part_stat_lock();
-
-        if (!new_io) {
-                part = rq->part;
-                part_stat_inc(cpu, part, merges[rw]);
-        } else {
-                part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
-                if (!hd_struct_try_get(part)) {
-                        /*
-                         * The partition is already being removed,
-                         * the request will be accounted on the disk only
-                         *
-                         * We take a reference on disk->part0 although that
-                         * partition will never be deleted, so we can treat
-                         * it as any other partition.
-                         */
-                        part = &rq->rq_disk->part0;
-                        hd_struct_get(part);
-                }
-                part_round_stats(cpu, part);
-                part_inc_in_flight(part, rw);
-                rq->part = part;
-        }
-
-        part_stat_unlock();
-}
-
 void blk_queue_congestion_threshold(struct request_queue *q)
 {
         int nr;
@@ -594,9 +559,12 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
         if (!q)
                 return NULL;
 
+        if (percpu_counter_init(&q->mq_usage_counter, 0))
+                goto fail_q;
+
         q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
         if (q->id < 0)
-                goto fail_q;
+                goto fail_c;
 
         q->backing_dev_info.ra_pages =
                         (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
@@ -643,6 +611,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
         q->bypass_depth = 1;
         __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags);
 
+        init_waitqueue_head(&q->mq_freeze_wq);
+
         if (blkcg_init_queue(q))
                 goto fail_id;
 
@@ -650,6 +620,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 fail_id:
         ida_simple_remove(&blk_queue_ida, q->id);
+fail_c:
+        percpu_counter_destroy(&q->mq_usage_counter);
 fail_q:
         kmem_cache_free(blk_requestq_cachep, q);
         return NULL;
@@ -1108,7 +1080,8 @@ retry:
                 goto retry;
 }
 
-struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
+static struct request *blk_old_get_request(struct request_queue *q, int rw,
+                gfp_t gfp_mask)
 {
         struct request *rq;
 
@@ -1125,6 +1098,14 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 
         return rq;
 }
+
+struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
+{
+        if (q->mq_ops)
+                return blk_mq_alloc_request(q, rw, gfp_mask);
+        else
+                return blk_old_get_request(q, rw, gfp_mask);
+}
 EXPORT_SYMBOL(blk_get_request);
 
 /**
@@ -1210,7 +1191,7 @@ EXPORT_SYMBOL(blk_requeue_request);
 static void add_acct_request(struct request_queue *q, struct request *rq,
                              int where)
 {
-        drive_stat_acct(rq, 1);
+        blk_account_io_start(rq, true);
         __elv_add_request(q, rq, where);
 }
 
@@ -1299,12 +1280,17 @@ EXPORT_SYMBOL_GPL(__blk_put_request);
 
 void blk_put_request(struct request *req)
 {
-        unsigned long flags;
         struct request_queue *q = req->q;
 
-        spin_lock_irqsave(q->queue_lock, flags);
-        __blk_put_request(q, req);
-        spin_unlock_irqrestore(q->queue_lock, flags);
+        if (q->mq_ops)
+                blk_mq_free_request(req);
+        else {
+                unsigned long flags;
+
+                spin_lock_irqsave(q->queue_lock, flags);
+                __blk_put_request(q, req);
+                spin_unlock_irqrestore(q->queue_lock, flags);
+        }
 }
 EXPORT_SYMBOL(blk_put_request);
 
@@ -1340,8 +1326,8 @@ void blk_add_request_payload(struct request *rq, struct page *page,
 }
 EXPORT_SYMBOL_GPL(blk_add_request_payload);
 
-static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
-                                   struct bio *bio)
+bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
+                            struct bio *bio)
 {
         const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 
@@ -1358,12 +1344,12 @@ static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
         req->__data_len += bio->bi_size;
         req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
 
-        drive_stat_acct(req, 0);
+        blk_account_io_start(req, false);
         return true;
 }
 
-static bool bio_attempt_front_merge(struct request_queue *q,
-                                    struct request *req, struct bio *bio)
+bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
+                             struct bio *bio)
 {
         const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
 
@@ -1388,12 +1374,12 @@ static bool bio_attempt_front_merge(struct request_queue *q,
         req->__data_len += bio->bi_size;
         req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
 
-        drive_stat_acct(req, 0);
+        blk_account_io_start(req, false);
         return true;
 }
 
 /**
- * attempt_plug_merge - try to merge with %current's plugged list
+ * blk_attempt_plug_merge - try to merge with %current's plugged list
  * @q: request_queue new bio is being queued at
  * @bio: new bio being queued
  * @request_count: out parameter for number of traversed plugged requests
@@ -1409,8 +1395,8 @@ static bool bio_attempt_front_merge(struct request_queue *q,
  * reliable access to the elevator outside queue lock. Only check basic
  * merging parameters without querying the elevator.
  */
-static bool attempt_plug_merge(struct request_queue *q, struct bio *bio,
-                               unsigned int *request_count)
+bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
+                            unsigned int *request_count)
 {
         struct blk_plug *plug;
         struct request *rq;
@@ -1489,7 +1475,7 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio)
          * Check if we can merge with the plugged list before grabbing
          * any locks.
          */
-        if (attempt_plug_merge(q, bio, &request_count))
+        if (blk_attempt_plug_merge(q, bio, &request_count))
                 return;
 
         spin_lock_irq(q->queue_lock);
@@ -1557,7 +1543,7 @@ get_rq:
                         }
                 }
                 list_add_tail(&req->queuelist, &plug->list);
-                drive_stat_acct(req, 1);
+                blk_account_io_start(req, true);
         } else {
                 spin_lock_irq(q->queue_lock);
                 add_acct_request(q, req, where);
@@ -2011,7 +1997,7 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
 
-static void blk_account_io_completion(struct request *req, unsigned int bytes)
+void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
         if (blk_do_io_stat(req)) {
                 const int rw = rq_data_dir(req);
@@ -2025,7 +2011,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
         }
 }
 
-static void blk_account_io_done(struct request *req)
+void blk_account_io_done(struct request *req)
 {
         /*
          * Account IO completion. flush_rq isn't accounted as a
@@ -2073,6 +2059,42 @@ static inline struct request *blk_pm_peek_request(struct request_queue *q,
 }
 #endif
 
+void blk_account_io_start(struct request *rq, bool new_io)
+{
+        struct hd_struct *part;
+        int rw = rq_data_dir(rq);
+        int cpu;
+
+        if (!blk_do_io_stat(rq))
+                return;
+
+        cpu = part_stat_lock();
+
+        if (!new_io) {
+                part = rq->part;
+                part_stat_inc(cpu, part, merges[rw]);
+        } else {
+                part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+                if (!hd_struct_try_get(part)) {
+                        /*
+                         * The partition is already being removed,
+                         * the request will be accounted on the disk only
+                         *
+                         * We take a reference on disk->part0 although that
+                         * partition will never be deleted, so we can treat
+                         * it as any other partition.
+                         */
+                        part = &rq->rq_disk->part0;
+                        hd_struct_get(part);
+                }
+                part_round_stats(cpu, part);
+                part_inc_in_flight(part, rw);
+                rq->part = part;
+        }
+
+        part_stat_unlock();
+}
+
 /**
  * blk_peek_request - peek at the top of a request queue
  * @q: request queue to peek at
@@ -2448,7 +2470,6 @@ static void blk_finish_request(struct request *req, int error)
         if (req->cmd_flags & REQ_DONTPREP)
                 blk_unprep_request(req);
 
-
         blk_account_io_done(req);
 
         if (req->end_io)
@@ -2870,6 +2891,7 @@ void blk_start_plug(struct blk_plug *plug)
 
         plug->magic = PLUG_MAGIC;
         INIT_LIST_HEAD(&plug->list);
+        INIT_LIST_HEAD(&plug->mq_list);
         INIT_LIST_HEAD(&plug->cb_list);
 
         /*
@@ -2967,6 +2989,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
         BUG_ON(plug->magic != PLUG_MAGIC);
 
         flush_plug_callbacks(plug, from_schedule);
+
+        if (!list_empty(&plug->mq_list))
+                blk_mq_flush_plug_list(plug, from_schedule);
+
         if (list_empty(&plug->list))
                 return;
 
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 6b18d82d91c5..c3edf9dff566 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -5,6 +5,7 @@
 #include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/sched/sysctl.h>
 
 #include "blk.h"
@@ -58,6 +59,12 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 
         rq->rq_disk = bd_disk;
         rq->end_io = done;
+
+        if (q->mq_ops) {
+                blk_mq_insert_request(q, rq, true);
+                return;
+        }
+
         /*
          * need to check this before __blk_run_queue(), because rq can
          * be freed before that returns.
diff --git a/block/blk-flush.c b/block/blk-flush.c
index cc2b827a853c..3e4cc9c7890a 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -69,8 +69,10 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/gfp.h>
+#include <linux/blk-mq.h>
 
 #include "blk.h"
+#include "blk-mq.h"
 
 /* FLUSH/FUA sequences */
 enum {
@@ -124,6 +126,24 @@ static void blk_flush_restore_request(struct request *rq)
         /* make @rq a normal request */
         rq->cmd_flags &= ~REQ_FLUSH_SEQ;
         rq->end_io = rq->flush.saved_end_io;
+
+        blk_clear_rq_complete(rq);
+}
+
+static void mq_flush_data_run(struct work_struct *work)
+{
+        struct request *rq;
+
+        rq = container_of(work, struct request, mq_flush_data);
+
+        memset(&rq->csd, 0, sizeof(rq->csd));
+        blk_mq_run_request(rq, true, false);
+}
+
+static void blk_mq_flush_data_insert(struct request *rq)
+{
+        INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
+        kblockd_schedule_work(rq->q, &rq->mq_flush_data);
 }
 
 /**
@@ -136,7 +156,7 @@ static void blk_flush_restore_request(struct request *rq)
  * completion and trigger the next step.
  *
  * CONTEXT:
- * spin_lock_irq(q->queue_lock)
+ * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
  *
  * RETURNS:
  * %true if requests were added to the dispatch queue, %false otherwise.
@@ -146,7 +166,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
 {
         struct request_queue *q = rq->q;
         struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
-        bool queued = false;
+        bool queued = false, kicked;
 
         BUG_ON(rq->flush.seq & seq);
         rq->flush.seq |= seq;
@@ -167,8 +187,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
 
         case REQ_FSEQ_DATA:
                 list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
-                list_add(&rq->queuelist, &q->queue_head);
-                queued = true;
+                if (q->mq_ops)
+                        blk_mq_flush_data_insert(rq);
+                else {
+                        list_add(&rq->queuelist, &q->queue_head);
+                        queued = true;
+                }
                 break;
 
         case REQ_FSEQ_DONE:
@@ -181,28 +205,43 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
                 BUG_ON(!list_empty(&rq->queuelist));
                 list_del_init(&rq->flush.list);
                 blk_flush_restore_request(rq);
-                __blk_end_request_all(rq, error);
+                if (q->mq_ops)
+                        blk_mq_end_io(rq, error);
+                else
+                        __blk_end_request_all(rq, error);
                 break;
 
         default:
                 BUG();
         }
 
-        return blk_kick_flush(q) | queued;
+        kicked = blk_kick_flush(q);
+        /* blk_mq_run_flush will run queue */
+        if (q->mq_ops)
+                return queued;
+        return kicked | queued;
 }
 
 static void flush_end_io(struct request *flush_rq, int error)
 {
         struct request_queue *q = flush_rq->q;
-        struct list_head *running = &q->flush_queue[q->flush_running_idx];
+        struct list_head *running;
         bool queued = false;
         struct request *rq, *n;
+        unsigned long flags = 0;
 
+        if (q->mq_ops) {
+                blk_mq_free_request(flush_rq);
+                spin_lock_irqsave(&q->mq_flush_lock, flags);
+        }
+        running = &q->flush_queue[q->flush_running_idx];
         BUG_ON(q->flush_pending_idx == q->flush_running_idx);
 
         /* account completion of the flush request */
         q->flush_running_idx ^= 1;
-        elv_completed_request(q, flush_rq);
+
+        if (!q->mq_ops)
+                elv_completed_request(q, flush_rq);
 
         /* and push the waiting requests to the next stage */
         list_for_each_entry_safe(rq, n, running, flush.list) {
@@ -223,9 +262,48 @@ static void flush_end_io(struct request *flush_rq, int error)
          * directly into request_fn may confuse the driver. Always use
          * kblockd.
          */
-        if (queued || q->flush_queue_delayed)
-                blk_run_queue_async(q);
+        if (queued || q->flush_queue_delayed) {
+                if (!q->mq_ops)
+                        blk_run_queue_async(q);
+                else
+                        /*
+                         * This can be optimized to only run queues with requests
+                         * queued if necessary.
+                         */
+                        blk_mq_run_queues(q, true);
+        }
         q->flush_queue_delayed = 0;
+        if (q->mq_ops)
+                spin_unlock_irqrestore(&q->mq_flush_lock, flags);
+}
+
+static void mq_flush_work(struct work_struct *work)
+{
+        struct request_queue *q;
+        struct request *rq;
+
+        q = container_of(work, struct request_queue, mq_flush_work);
+
+        /* We don't need set REQ_FLUSH_SEQ, it's for consistency */
+        rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
+                __GFP_WAIT|GFP_ATOMIC);
+        rq->cmd_type = REQ_TYPE_FS;
+        rq->end_io = flush_end_io;
+
+        blk_mq_run_request(rq, true, false);
+}
+
+/*
+ * We can't directly use q->flush_rq, because it doesn't have tag and is not in
+ * hctx->rqs[]. so we must allocate a new request, since we can't sleep here,
+ * so offload the work to workqueue.
+ *
+ * Note: we assume a flush request finished in any hardware queue will flush
+ * the whole disk cache.
+ */
+static void mq_run_flush(struct request_queue *q)
+{
+        kblockd_schedule_work(q, &q->mq_flush_work);
 }
 
 /**
@@ -236,7 +314,7 @@ static void flush_end_io(struct request *flush_rq, int error)
  * Please read the comment at the top of this file for more info.
  *
  * CONTEXT:
- * spin_lock_irq(q->queue_lock)
+ * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
  *
  * RETURNS:
  * %true if flush was issued, %false otherwise.
@@ -261,13 +339,18 @@ static bool blk_kick_flush(struct request_queue *q)
          * Issue flush and toggle pending_idx. This makes pending_idx
          * different from running_idx, which means flush is in flight.
          */
+        q->flush_pending_idx ^= 1;
+        if (q->mq_ops) {
+                mq_run_flush(q);
+                return true;
+        }
+
         blk_rq_init(q, &q->flush_rq);
         q->flush_rq.cmd_type = REQ_TYPE_FS;
         q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
         q->flush_rq.rq_disk = first_rq->rq_disk;
         q->flush_rq.end_io = flush_end_io;
 
-        q->flush_pending_idx ^= 1;
         list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
         return true;
 }
@@ -284,16 +367,37 @@ static void flush_data_end_io(struct request *rq, int error)
         blk_run_queue_async(q);
 }
 
+static void mq_flush_data_end_io(struct request *rq, int error)
+{
+        struct request_queue *q = rq->q;
+        struct blk_mq_hw_ctx *hctx;
+        struct blk_mq_ctx *ctx;
+        unsigned long flags;
+
+        ctx = rq->mq_ctx;
+        hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+        /*
+         * After populating an empty queue, kick it to avoid stall. Read
+         * the comment in flush_end_io().
+         */
+        spin_lock_irqsave(&q->mq_flush_lock, flags);
+        if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
+                blk_mq_run_hw_queue(hctx, true);
+        spin_unlock_irqrestore(&q->mq_flush_lock, flags);
+}
+
 /**
  * blk_insert_flush - insert a new FLUSH/FUA request
  * @rq: request to insert
  *
  * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
+ * or __blk_mq_run_hw_queue() to dispatch request.
  * @rq is being submitted. Analyze what needs to be done and put it on the
  * right queue.
  *
  * CONTEXT:
- * spin_lock_irq(q->queue_lock)
+ * spin_lock_irq(q->queue_lock) in !mq case
  */
 void blk_insert_flush(struct request *rq)
 {
@@ -316,7 +420,10 @@ void blk_insert_flush(struct request *rq)
          * complete the request.
          */
         if (!policy) {
-                __blk_end_bidi_request(rq, 0, 0, 0);
+                if (q->mq_ops)
+                        blk_mq_end_io(rq, 0);
+                else
+                        __blk_end_bidi_request(rq, 0, 0, 0);
                 return;
         }
 
@@ -329,7 +436,10 @@ void blk_insert_flush(struct request *rq)
          */
         if ((policy & REQ_FSEQ_DATA) &&
             !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
-                list_add_tail(&rq->queuelist, &q->queue_head);
+                if (q->mq_ops) {
+                        blk_mq_run_request(rq, false, true);
+                } else
+                        list_add_tail(&rq->queuelist, &q->queue_head);
                 return;
         }
 
@@ -341,6 +451,14 @@ void blk_insert_flush(struct request *rq)
         INIT_LIST_HEAD(&rq->flush.list);
         rq->cmd_flags |= REQ_FLUSH_SEQ;
         rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
+        if (q->mq_ops) {
+                rq->end_io = mq_flush_data_end_io;
+
+                spin_lock_irq(&q->mq_flush_lock);
+                blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
+                spin_unlock_irq(&q->mq_flush_lock);
+                return;
+        }
         rq->end_io = flush_data_end_io;
 
         blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
@@ -453,3 +571,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
         return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
+
+void blk_mq_init_flush(struct request_queue *q)
+{
+        spin_lock_init(&q->mq_flush_lock);
+        INIT_WORK(&q->mq_flush_work, mq_flush_work);
+}
diff --git a/block/blk-mq-cpu.c b/block/blk-mq-cpu.c
new file mode 100644
index 000000000000..f8ea39d7ae54
--- /dev/null
+++ b/block/blk-mq-cpu.c
@@ -0,0 +1,93 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/init.h>
4#include <linux/blkdev.h>
5#include <linux/list.h>
6#include <linux/llist.h>
7#include <linux/smp.h>
8#include <linux/cpu.h>
9
10#include <linux/blk-mq.h>
11#include "blk-mq.h"
12
13static LIST_HEAD(blk_mq_cpu_notify_list);
14static DEFINE_SPINLOCK(blk_mq_cpu_notify_lock);
15
16static int __cpuinit blk_mq_main_cpu_notify(struct notifier_block *self,
17 unsigned long action, void *hcpu)
18{
19 unsigned int cpu = (unsigned long) hcpu;
20 struct blk_mq_cpu_notifier *notify;
21
22 spin_lock(&blk_mq_cpu_notify_lock);
23
24 list_for_each_entry(notify, &blk_mq_cpu_notify_list, list)
25 notify->notify(notify->data, action, cpu);
26
27 spin_unlock(&blk_mq_cpu_notify_lock);
28 return NOTIFY_OK;
29}
30
31static void __cpuinit blk_mq_cpu_notify(void *data, unsigned long action,
32 unsigned int cpu)
33{
34 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
35 /*
36 * If the CPU goes away, ensure that we run any pending
37 * completions.
38 */
39 struct llist_node *node;
40 struct request *rq;
41
42 local_irq_disable();
43
44 node = llist_del_all(&per_cpu(ipi_lists, cpu));
45 while (node) {
46 struct llist_node *next = node->next;
47
48 rq = llist_entry(node, struct request, ll_list);
49 __blk_mq_end_io(rq, rq->errors);
50 node = next;
51 }
52
53 local_irq_enable();
54 }
55}
56
57static struct notifier_block __cpuinitdata blk_mq_main_cpu_notifier = {
58 .notifier_call = blk_mq_main_cpu_notify,
59};
60
61void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
62{
63 BUG_ON(!notifier->notify);
64
65 spin_lock(&blk_mq_cpu_notify_lock);
66 list_add_tail(&notifier->list, &blk_mq_cpu_notify_list);
67 spin_unlock(&blk_mq_cpu_notify_lock);
68}
69
70void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier)
71{
72 spin_lock(&blk_mq_cpu_notify_lock);
73 list_del(&notifier->list);
74 spin_unlock(&blk_mq_cpu_notify_lock);
75}
76
77void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
78 void (*fn)(void *, unsigned long, unsigned int),
79 void *data)
80{
81 notifier->notify = fn;
82 notifier->data = data;
83}
84
85static struct blk_mq_cpu_notifier __cpuinitdata cpu_notifier = {
86 .notify = blk_mq_cpu_notify,
87};
88
89void __init blk_mq_cpu_init(void)
90{
91 register_hotcpu_notifier(&blk_mq_main_cpu_notifier);
92 blk_mq_register_cpu_notifier(&cpu_notifier);
93}
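
[Editor's note] A minimal consumer of the notifier helpers added above might look like the sketch below; the callback and struct names are hypothetical, only the blk_mq_init_cpu_notifier()/blk_mq_register_cpu_notifier() calls come from this file.

/* Sketch, not part of the patch: my_cpu_notify and my_notifier are
 * hypothetical. The notifier is filled in and hooked onto the global
 * list that blk_mq_main_cpu_notify() walks on every hotplug event. */
static void my_cpu_notify(void *data, unsigned long action, unsigned int cpu)
{
        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
                /* e.g. hand per-cpu work from 'cpu' to a surviving CPU */
        }
}

static struct blk_mq_cpu_notifier my_notifier;

static void my_setup(void *data)
{
        blk_mq_init_cpu_notifier(&my_notifier, my_cpu_notify, data);
        blk_mq_register_cpu_notifier(&my_notifier);
}
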
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
new file mode 100644
index 000000000000..f8721278601c
--- /dev/null
+++ b/block/blk-mq-cpumap.c
@@ -0,0 +1,108 @@
1#include <linux/kernel.h>
2#include <linux/threads.h>
3#include <linux/module.h>
4#include <linux/mm.h>
5#include <linux/smp.h>
6#include <linux/cpu.h>
7
8#include <linux/blk-mq.h>
9#include "blk.h"
10#include "blk-mq.h"
11
12static void show_map(unsigned int *map, unsigned int nr)
13{
14 int i;
15
16 pr_info("blk-mq: CPU -> queue map\n");
17 for_each_online_cpu(i)
18 pr_info(" CPU%2u -> Queue %u\n", i, map[i]);
19}
20
21static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
22 const int cpu)
23{
24 return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
25}
26
27static int get_first_sibling(unsigned int cpu)
28{
29 unsigned int ret;
30
31 ret = cpumask_first(topology_thread_cpumask(cpu));
32 if (ret < nr_cpu_ids)
33 return ret;
34
35 return cpu;
36}
37
38int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
39{
40 unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
41 cpumask_var_t cpus;
42
43 if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
44 return 1;
45
46 cpumask_clear(cpus);
47 nr_cpus = nr_uniq_cpus = 0;
48 for_each_online_cpu(i) {
49 nr_cpus++;
50 first_sibling = get_first_sibling(i);
51 if (!cpumask_test_cpu(first_sibling, cpus))
52 nr_uniq_cpus++;
53 cpumask_set_cpu(i, cpus);
54 }
55
56 queue = 0;
57 for_each_possible_cpu(i) {
58 if (!cpu_online(i)) {
59 map[i] = 0;
60 continue;
61 }
62
63 /*
64 * Easy case - we have equal or more hardware queues. Or
65 * there are no thread siblings to take into account. Do
66 * 1:1 if enough, or sequential mapping if less.
67 */
68 if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
69 map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
70 queue++;
71 continue;
72 }
73
74 /*
75 * Less then nr_cpus queues, and we have some number of
76 * threads per cores. Map sibling threads to the same
77 * queue.
78 */
79 first_sibling = get_first_sibling(i);
80 if (first_sibling == i) {
81 map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
82 queue);
83 queue++;
84 } else
85 map[i] = map[first_sibling];
86 }
87
88 show_map(map, nr_cpus);
89 free_cpumask_var(cpus);
90 return 0;
91}
92
93unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg)
94{
95 unsigned int *map;
96
97 /* If cpus are offline, map them to first hctx */
98 map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
99 reg->numa_node);
100 if (!map)
101 return NULL;
102
103 if (!blk_mq_update_queue_map(map, reg->nr_hw_queues))
104 return map;
105
106 kfree(map);
107 return NULL;
108}
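
[Editor's note] To make the arithmetic in cpu_to_queue_index() concrete: with 8 online CPUs that have no thread siblings and 2 hardware queues (values chosen only for illustration), the divisor is (8 + 2 - 1) / 2 = 4, so CPUs 0-3 map to queue 0 and CPUs 4-7 to queue 1, as in the sketch below.

/* Illustration only, not part of the patch: prints the spread computed
 * by the same formula cpu_to_queue_index() uses. */
static void demo_queue_map(void)
{
        unsigned int nr_cpus = 8, nr_queues = 2, cpu;

        for (cpu = 0; cpu < nr_cpus; cpu++)
                pr_info("CPU%u -> queue %u\n", cpu,
                        cpu / ((nr_cpus + nr_queues - 1) / nr_queues));
        /* CPU0..CPU3 -> queue 0, CPU4..CPU7 -> queue 1 */
}
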
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
new file mode 100644
index 000000000000..ba6cf8e9aa0a
--- /dev/null
+++ b/block/blk-mq-sysfs.c
@@ -0,0 +1,384 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/backing-dev.h>
4#include <linux/bio.h>
5#include <linux/blkdev.h>
6#include <linux/mm.h>
7#include <linux/init.h>
8#include <linux/slab.h>
9#include <linux/workqueue.h>
10#include <linux/smp.h>
11
12#include <linux/blk-mq.h>
13#include "blk-mq.h"
14#include "blk-mq-tag.h"
15
16static void blk_mq_sysfs_release(struct kobject *kobj)
17{
18}
19
20struct blk_mq_ctx_sysfs_entry {
21 struct attribute attr;
22 ssize_t (*show)(struct blk_mq_ctx *, char *);
23 ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t);
24};
25
26struct blk_mq_hw_ctx_sysfs_entry {
27 struct attribute attr;
28 ssize_t (*show)(struct blk_mq_hw_ctx *, char *);
29 ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t);
30};
31
32static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
33 char *page)
34{
35 struct blk_mq_ctx_sysfs_entry *entry;
36 struct blk_mq_ctx *ctx;
37 struct request_queue *q;
38 ssize_t res;
39
40 entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
41 ctx = container_of(kobj, struct blk_mq_ctx, kobj);
42 q = ctx->queue;
43
44 if (!entry->show)
45 return -EIO;
46
47 res = -ENOENT;
48 mutex_lock(&q->sysfs_lock);
49 if (!blk_queue_dying(q))
50 res = entry->show(ctx, page);
51 mutex_unlock(&q->sysfs_lock);
52 return res;
53}
54
55static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
56 const char *page, size_t length)
57{
58 struct blk_mq_ctx_sysfs_entry *entry;
59 struct blk_mq_ctx *ctx;
60 struct request_queue *q;
61 ssize_t res;
62
63 entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
64 ctx = container_of(kobj, struct blk_mq_ctx, kobj);
65 q = ctx->queue;
66
67 if (!entry->store)
68 return -EIO;
69
70 res = -ENOENT;
71 mutex_lock(&q->sysfs_lock);
72 if (!blk_queue_dying(q))
73 res = entry->store(ctx, page, length);
74 mutex_unlock(&q->sysfs_lock);
75 return res;
76}
77
78static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
79 struct attribute *attr, char *page)
80{
81 struct blk_mq_hw_ctx_sysfs_entry *entry;
82 struct blk_mq_hw_ctx *hctx;
83 struct request_queue *q;
84 ssize_t res;
85
86 entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
87 hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
88 q = hctx->queue;
89
90 if (!entry->show)
91 return -EIO;
92
93 res = -ENOENT;
94 mutex_lock(&q->sysfs_lock);
95 if (!blk_queue_dying(q))
96 res = entry->show(hctx, page);
97 mutex_unlock(&q->sysfs_lock);
98 return res;
99}
100
101static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
102 struct attribute *attr, const char *page,
103 size_t length)
104{
105 struct blk_mq_hw_ctx_sysfs_entry *entry;
106 struct blk_mq_hw_ctx *hctx;
107 struct request_queue *q;
108 ssize_t res;
109
110 entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
111 hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
112 q = hctx->queue;
113
114 if (!entry->store)
115 return -EIO;
116
117 res = -ENOENT;
118 mutex_lock(&q->sysfs_lock);
119 if (!blk_queue_dying(q))
120 res = entry->store(hctx, page, length);
121 mutex_unlock(&q->sysfs_lock);
122 return res;
123}
124
125static ssize_t blk_mq_sysfs_dispatched_show(struct blk_mq_ctx *ctx, char *page)
126{
127 return sprintf(page, "%lu %lu\n", ctx->rq_dispatched[1],
128 ctx->rq_dispatched[0]);
129}
130
131static ssize_t blk_mq_sysfs_merged_show(struct blk_mq_ctx *ctx, char *page)
132{
133 return sprintf(page, "%lu\n", ctx->rq_merged);
134}
135
136static ssize_t blk_mq_sysfs_completed_show(struct blk_mq_ctx *ctx, char *page)
137{
138 return sprintf(page, "%lu %lu\n", ctx->rq_completed[1],
139 ctx->rq_completed[0]);
140}
141
142static ssize_t sysfs_list_show(char *page, struct list_head *list, char *msg)
143{
144 char *start_page = page;
145 struct request *rq;
146
147 page += sprintf(page, "%s:\n", msg);
148
149 list_for_each_entry(rq, list, queuelist)
150 page += sprintf(page, "\t%p\n", rq);
151
152 return page - start_page;
153}
154
155static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page)
156{
157 ssize_t ret;
158
159 spin_lock(&ctx->lock);
160 ret = sysfs_list_show(page, &ctx->rq_list, "CTX pending");
161 spin_unlock(&ctx->lock);
162
163 return ret;
164}
165
166static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx,
167 char *page)
168{
169 return sprintf(page, "%lu\n", hctx->queued);
170}
171
172static ssize_t blk_mq_hw_sysfs_run_show(struct blk_mq_hw_ctx *hctx, char *page)
173{
174 return sprintf(page, "%lu\n", hctx->run);
175}
176
177static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx,
178 char *page)
179{
180 char *start_page = page;
181 int i;
182
183 page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
184
185 for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) {
186 unsigned long d = 1U << (i - 1);
187
188 page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]);
189 }
190
191 return page - start_page;
192}
193
194static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
195 char *page)
196{
197 ssize_t ret;
198
199 spin_lock(&hctx->lock);
200 ret = sysfs_list_show(page, &hctx->dispatch, "HCTX pending");
201 spin_unlock(&hctx->lock);
202
203 return ret;
204}
205
206static ssize_t blk_mq_hw_sysfs_ipi_show(struct blk_mq_hw_ctx *hctx, char *page)
207{
208 ssize_t ret;
209
210 spin_lock(&hctx->lock);
211 ret = sprintf(page, "%u\n", !!(hctx->flags & BLK_MQ_F_SHOULD_IPI));
212 spin_unlock(&hctx->lock);
213
214 return ret;
215}
216
217static ssize_t blk_mq_hw_sysfs_ipi_store(struct blk_mq_hw_ctx *hctx,
218 const char *page, size_t len)
219{
220 struct blk_mq_ctx *ctx;
221 unsigned long ret;
222 unsigned int i;
223
224 if (kstrtoul(page, 10, &ret)) {
225 pr_err("blk-mq-sysfs: invalid input '%s'\n", page);
226 return -EINVAL;
227 }
228
229 spin_lock(&hctx->lock);
230 if (ret)
231 hctx->flags |= BLK_MQ_F_SHOULD_IPI;
232 else
233 hctx->flags &= ~BLK_MQ_F_SHOULD_IPI;
234 spin_unlock(&hctx->lock);
235
236 hctx_for_each_ctx(hctx, ctx, i)
237 ctx->ipi_redirect = !!ret;
238
239 return len;
240}
241
242static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
243{
244 return blk_mq_tag_sysfs_show(hctx->tags, page);
245}
246
247static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
248 .attr = {.name = "dispatched", .mode = S_IRUGO },
249 .show = blk_mq_sysfs_dispatched_show,
250};
251static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_merged = {
252 .attr = {.name = "merged", .mode = S_IRUGO },
253 .show = blk_mq_sysfs_merged_show,
254};
255static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_completed = {
256 .attr = {.name = "completed", .mode = S_IRUGO },
257 .show = blk_mq_sysfs_completed_show,
258};
259static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_rq_list = {
260 .attr = {.name = "rq_list", .mode = S_IRUGO },
261 .show = blk_mq_sysfs_rq_list_show,
262};
263
264static struct attribute *default_ctx_attrs[] = {
265 &blk_mq_sysfs_dispatched.attr,
266 &blk_mq_sysfs_merged.attr,
267 &blk_mq_sysfs_completed.attr,
268 &blk_mq_sysfs_rq_list.attr,
269 NULL,
270};
271
272static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_queued = {
273 .attr = {.name = "queued", .mode = S_IRUGO },
274 .show = blk_mq_hw_sysfs_queued_show,
275};
276static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_run = {
277 .attr = {.name = "run", .mode = S_IRUGO },
278 .show = blk_mq_hw_sysfs_run_show,
279};
280static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_dispatched = {
281 .attr = {.name = "dispatched", .mode = S_IRUGO },
282 .show = blk_mq_hw_sysfs_dispatched_show,
283};
284static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
285 .attr = {.name = "pending", .mode = S_IRUGO },
286 .show = blk_mq_hw_sysfs_rq_list_show,
287};
288static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_ipi = {
289 .attr = {.name = "ipi_redirect", .mode = S_IRUGO | S_IWUSR},
290 .show = blk_mq_hw_sysfs_ipi_show,
291 .store = blk_mq_hw_sysfs_ipi_store,
292};
293static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
294 .attr = {.name = "tags", .mode = S_IRUGO },
295 .show = blk_mq_hw_sysfs_tags_show,
296};
297
298static struct attribute *default_hw_ctx_attrs[] = {
299 &blk_mq_hw_sysfs_queued.attr,
300 &blk_mq_hw_sysfs_run.attr,
301 &blk_mq_hw_sysfs_dispatched.attr,
302 &blk_mq_hw_sysfs_pending.attr,
303 &blk_mq_hw_sysfs_ipi.attr,
304 &blk_mq_hw_sysfs_tags.attr,
305 NULL,
306};
307
308static const struct sysfs_ops blk_mq_sysfs_ops = {
309 .show = blk_mq_sysfs_show,
310 .store = blk_mq_sysfs_store,
311};
312
313static const struct sysfs_ops blk_mq_hw_sysfs_ops = {
314 .show = blk_mq_hw_sysfs_show,
315 .store = blk_mq_hw_sysfs_store,
316};
317
318static struct kobj_type blk_mq_ktype = {
319 .sysfs_ops = &blk_mq_sysfs_ops,
320 .release = blk_mq_sysfs_release,
321};
322
323static struct kobj_type blk_mq_ctx_ktype = {
324 .sysfs_ops = &blk_mq_sysfs_ops,
325 .default_attrs = default_ctx_attrs,
326 .release = blk_mq_sysfs_release,
327};
328
329static struct kobj_type blk_mq_hw_ktype = {
330 .sysfs_ops = &blk_mq_hw_sysfs_ops,
331 .default_attrs = default_hw_ctx_attrs,
332 .release = blk_mq_sysfs_release,
333};
334
335void blk_mq_unregister_disk(struct gendisk *disk)
336{
337 struct request_queue *q = disk->queue;
338
339 kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
340 kobject_del(&q->mq_kobj);
341
342 kobject_put(&disk_to_dev(disk)->kobj);
343}
344
345int blk_mq_register_disk(struct gendisk *disk)
346{
347 struct device *dev = disk_to_dev(disk);
348 struct request_queue *q = disk->queue;
349 struct blk_mq_hw_ctx *hctx;
350 struct blk_mq_ctx *ctx;
351 int ret, i, j;
352
353 kobject_init(&q->mq_kobj, &blk_mq_ktype);
354
355 ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
356 if (ret < 0)
357 return ret;
358
359 kobject_uevent(&q->mq_kobj, KOBJ_ADD);
360
361 queue_for_each_hw_ctx(q, hctx, i) {
362 kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
363 ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", i);
364 if (ret)
365 break;
366
367 if (!hctx->nr_ctx)
368 continue;
369
370 hctx_for_each_ctx(hctx, ctx, j) {
371 kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
372 ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
373 if (ret)
374 break;
375 }
376 }
377
378 if (ret) {
379 blk_mq_unregister_disk(disk);
380 return ret;
381 }
382
383 return 0;
384}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
new file mode 100644
index 000000000000..d64a02fb1f73
--- /dev/null
+++ b/block/blk-mq-tag.c
@@ -0,0 +1,204 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/percpu_ida.h>
4
5#include <linux/blk-mq.h>
6#include "blk.h"
7#include "blk-mq.h"
8#include "blk-mq-tag.h"
9
10/*
11 * Per tagged queue (tag address space) map
12 */
13struct blk_mq_tags {
14 unsigned int nr_tags;
15 unsigned int nr_reserved_tags;
16 unsigned int nr_batch_move;
17 unsigned int nr_max_cache;
18
19 struct percpu_ida free_tags;
20 struct percpu_ida reserved_tags;
21};
22
23void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
24{
25 int tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
26 blk_mq_put_tag(tags, tag);
27}
28
29bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
30{
31 return !tags ||
32 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids) != 0;
33}
34
35static unsigned int __blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp)
36{
37 int tag;
38
39 tag = percpu_ida_alloc(&tags->free_tags, gfp);
40 if (tag < 0)
41 return BLK_MQ_TAG_FAIL;
42 return tag + tags->nr_reserved_tags;
43}
44
45static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_tags *tags,
46 gfp_t gfp)
47{
48 int tag;
49
50 if (unlikely(!tags->nr_reserved_tags)) {
51 WARN_ON_ONCE(1);
52 return BLK_MQ_TAG_FAIL;
53 }
54
55 tag = percpu_ida_alloc(&tags->reserved_tags, gfp);
56 if (tag < 0)
57 return BLK_MQ_TAG_FAIL;
58 return tag;
59}
60
61unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved)
62{
63 if (!reserved)
64 return __blk_mq_get_tag(tags, gfp);
65
66 return __blk_mq_get_reserved_tag(tags, gfp);
67}
68
69static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
70{
71 BUG_ON(tag >= tags->nr_tags);
72
73 percpu_ida_free(&tags->free_tags, tag - tags->nr_reserved_tags);
74}
75
76static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
77 unsigned int tag)
78{
79 BUG_ON(tag >= tags->nr_reserved_tags);
80
81 percpu_ida_free(&tags->reserved_tags, tag);
82}
83
84void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
85{
86 if (tag >= tags->nr_reserved_tags)
87 __blk_mq_put_tag(tags, tag);
88 else
89 __blk_mq_put_reserved_tag(tags, tag);
90}
91
92static int __blk_mq_tag_iter(unsigned id, void *data)
93{
94 unsigned long *tag_map = data;
95 __set_bit(id, tag_map);
96 return 0;
97}
98
99void blk_mq_tag_busy_iter(struct blk_mq_tags *tags,
100 void (*fn)(void *, unsigned long *), void *data)
101{
102 unsigned long *tag_map;
103 size_t map_size;
104
105 map_size = ALIGN(tags->nr_tags, BITS_PER_LONG) / BITS_PER_LONG;
106 tag_map = kzalloc(map_size * sizeof(unsigned long), GFP_ATOMIC);
107 if (!tag_map)
108 return;
109
110 percpu_ida_for_each_free(&tags->free_tags, __blk_mq_tag_iter, tag_map);
111 if (tags->nr_reserved_tags)
112 percpu_ida_for_each_free(&tags->reserved_tags, __blk_mq_tag_iter,
113 tag_map);
114
115 fn(data, tag_map);
116 kfree(tag_map);
117}
118
119struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
120 unsigned int reserved_tags, int node)
121{
122 unsigned int nr_tags, nr_cache;
123 struct blk_mq_tags *tags;
124 int ret;
125
126 if (total_tags > BLK_MQ_TAG_MAX) {
127 pr_err("blk-mq: tag depth too large\n");
128 return NULL;
129 }
130
131 tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
132 if (!tags)
133 return NULL;
134
135 nr_tags = total_tags - reserved_tags;
136 nr_cache = nr_tags / num_possible_cpus();
137
138 if (nr_cache < BLK_MQ_TAG_CACHE_MIN)
139 nr_cache = BLK_MQ_TAG_CACHE_MIN;
140 else if (nr_cache > BLK_MQ_TAG_CACHE_MAX)
141 nr_cache = BLK_MQ_TAG_CACHE_MAX;
142
143 tags->nr_tags = total_tags;
144 tags->nr_reserved_tags = reserved_tags;
145 tags->nr_max_cache = nr_cache;
146 tags->nr_batch_move = max(1u, nr_cache / 2);
147
148 ret = __percpu_ida_init(&tags->free_tags, tags->nr_tags -
149 tags->nr_reserved_tags,
150 tags->nr_max_cache,
151 tags->nr_batch_move);
152 if (ret)
153 goto err_free_tags;
154
155 if (reserved_tags) {
156 /*
157 * With max_cahe and batch set to 1, the allocator fallbacks to
158 * no cached. It's fine reserved tags allocation is slow.
159 */
160 ret = __percpu_ida_init(&tags->reserved_tags, reserved_tags,
161 1, 1);
162 if (ret)
163 goto err_reserved_tags;
164 }
165
166 return tags;
167
168err_reserved_tags:
169 percpu_ida_destroy(&tags->free_tags);
170err_free_tags:
171 kfree(tags);
172 return NULL;
173}
174
175void blk_mq_free_tags(struct blk_mq_tags *tags)
176{
177 percpu_ida_destroy(&tags->free_tags);
178 percpu_ida_destroy(&tags->reserved_tags);
179 kfree(tags);
180}
181
182ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
183{
184 char *orig_page = page;
185 int cpu;
186
187 if (!tags)
188 return 0;
189
190 page += sprintf(page, "nr_tags=%u, reserved_tags=%u, batch_move=%u,"
191 " max_cache=%u\n", tags->nr_tags, tags->nr_reserved_tags,
192 tags->nr_batch_move, tags->nr_max_cache);
193
194 page += sprintf(page, "nr_free=%u, nr_reserved=%u\n",
195 percpu_ida_free_tags(&tags->free_tags, nr_cpu_ids),
196 percpu_ida_free_tags(&tags->reserved_tags, nr_cpu_ids));
197
198 for_each_possible_cpu(cpu) {
199 page += sprintf(page, " cpu%02u: nr_free=%u\n", cpu,
200 percpu_ida_free_tags(&tags->free_tags, cpu));
201 }
202
203 return page - orig_page;
204}
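
[Editor's note] The split between the two percpu_ida pools above shows up in the tag values themselves: reserved tags occupy 0..nr_reserved_tags-1 while regular tags come back offset by nr_reserved_tags, which is how blk_mq_put_tag() decides which pool to free into. A short sketch of the round trip (the GFP flags and the assumption of an already-initialized 'tags' pointer are illustrative):

/* Sketch only. With, say, 64 total tags and 2 reserved, regular tags
 * are returned as 2..63 and reserved ones as 0..1; blk_mq_put_tag()
 * compares against the reserved count to pick the right free list. */
static void demo_tag_round_trip(struct blk_mq_tags *tags)
{
        unsigned int tag = blk_mq_get_tag(tags, GFP_KERNEL, false);

        if (tag != BLK_MQ_TAG_FAIL)
                blk_mq_put_tag(tags, tag); /* >= reserved count: free_tags pool */
}
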
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
new file mode 100644
index 000000000000..947ba2c6148e
--- /dev/null
+++ b/block/blk-mq-tag.h
@@ -0,0 +1,27 @@
1#ifndef INT_BLK_MQ_TAG_H
2#define INT_BLK_MQ_TAG_H
3
4struct blk_mq_tags;
5
6extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node);
7extern void blk_mq_free_tags(struct blk_mq_tags *tags);
8
9extern unsigned int blk_mq_get_tag(struct blk_mq_tags *tags, gfp_t gfp, bool reserved);
10extern void blk_mq_wait_for_tags(struct blk_mq_tags *tags);
11extern void blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag);
12extern void blk_mq_tag_busy_iter(struct blk_mq_tags *tags, void (*fn)(void *data, unsigned long *), void *data);
13extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
14extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
15
16enum {
17 BLK_MQ_TAG_CACHE_MIN = 1,
18 BLK_MQ_TAG_CACHE_MAX = 64,
19};
20
21enum {
22 BLK_MQ_TAG_FAIL = -1U,
23 BLK_MQ_TAG_MIN = BLK_MQ_TAG_CACHE_MIN,
24 BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
25};
26
27#endif
diff --git a/block/blk-mq.c b/block/blk-mq.c
new file mode 100644
index 000000000000..f21ec964e411
--- /dev/null
+++ b/block/blk-mq.c
@@ -0,0 +1,1480 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/backing-dev.h>
4#include <linux/bio.h>
5#include <linux/blkdev.h>
6#include <linux/mm.h>
7#include <linux/init.h>
8#include <linux/slab.h>
9#include <linux/workqueue.h>
10#include <linux/smp.h>
11#include <linux/llist.h>
12#include <linux/list_sort.h>
13#include <linux/cpu.h>
14#include <linux/cache.h>
15#include <linux/sched/sysctl.h>
16#include <linux/delay.h>
17
18#include <trace/events/block.h>
19
20#include <linux/blk-mq.h>
21#include "blk.h"
22#include "blk-mq.h"
23#include "blk-mq-tag.h"
24
25static DEFINE_MUTEX(all_q_mutex);
26static LIST_HEAD(all_q_list);
27
28static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
29
30DEFINE_PER_CPU(struct llist_head, ipi_lists);
31
32static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
33 unsigned int cpu)
34{
35 return per_cpu_ptr(q->queue_ctx, cpu);
36}
37
38/*
39 * This assumes per-cpu software queueing queues. They could be per-node
40 * as well, for instance. For now this is hardcoded as-is. Note that we don't
41 * care about preemption, since we know the ctx's are persistent. This does
42 * mean that we can't rely on ctx always matching the currently running CPU.
43 */
44static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
45{
46 return __blk_mq_get_ctx(q, get_cpu());
47}
48
49static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
50{
51 put_cpu();
52}
53
54/*
55 * Check if any of the ctx's have pending work in this hardware queue
56 */
57static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
58{
59 unsigned int i;
60
61 for (i = 0; i < hctx->nr_ctx_map; i++)
62 if (hctx->ctx_map[i])
63 return true;
64
65 return false;
66}
67
68/*
69 * Mark this ctx as having pending work in this hardware queue
70 */
71static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
72 struct blk_mq_ctx *ctx)
73{
74 if (!test_bit(ctx->index_hw, hctx->ctx_map))
75 set_bit(ctx->index_hw, hctx->ctx_map);
76}
77
78static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp,
79 bool reserved)
80{
81 struct request *rq;
82 unsigned int tag;
83
84 tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
85 if (tag != BLK_MQ_TAG_FAIL) {
86 rq = hctx->rqs[tag];
87 rq->tag = tag;
88
89 return rq;
90 }
91
92 return NULL;
93}
94
95static int blk_mq_queue_enter(struct request_queue *q)
96{
97 int ret;
98
99 __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
100 smp_wmb();
101 /* we have problems to freeze the queue if it's initializing */
102 if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
103 return 0;
104
105 __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
106
107 spin_lock_irq(q->queue_lock);
108 ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
109 !blk_queue_bypass(q), *q->queue_lock);
110 /* inc usage with lock hold to avoid freeze_queue runs here */
111 if (!ret)
112 __percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
113 spin_unlock_irq(q->queue_lock);
114
115 return ret;
116}
117
118static void blk_mq_queue_exit(struct request_queue *q)
119{
120 __percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
121}
122
123/*
124 * Guarantee no request is in use, so we can change any data structure of
125 * the queue afterward.
126 */
127static void blk_mq_freeze_queue(struct request_queue *q)
128{
129 bool drain;
130
131 spin_lock_irq(q->queue_lock);
132 drain = !q->bypass_depth++;
133 queue_flag_set(QUEUE_FLAG_BYPASS, q);
134 spin_unlock_irq(q->queue_lock);
135
136 if (!drain)
137 return;
138
139 while (true) {
140 s64 count;
141
142 spin_lock_irq(q->queue_lock);
143 count = percpu_counter_sum(&q->mq_usage_counter);
144 spin_unlock_irq(q->queue_lock);
145
146 if (count == 0)
147 break;
148 blk_mq_run_queues(q, false);
149 msleep(10);
150 }
151}
152
153static void blk_mq_unfreeze_queue(struct request_queue *q)
154{
155 bool wake = false;
156
157 spin_lock_irq(q->queue_lock);
158 if (!--q->bypass_depth) {
159 queue_flag_clear(QUEUE_FLAG_BYPASS, q);
160 wake = true;
161 }
162 WARN_ON_ONCE(q->bypass_depth < 0);
163 spin_unlock_irq(q->queue_lock);
164 if (wake)
165 wake_up_all(&q->mq_freeze_wq);
166}
167
168bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
169{
170 return blk_mq_has_free_tags(hctx->tags);
171}
172EXPORT_SYMBOL(blk_mq_can_queue);
173
174static void blk_mq_rq_ctx_init(struct blk_mq_ctx *ctx, struct request *rq,
175 unsigned int rw_flags)
176{
177 rq->mq_ctx = ctx;
178 rq->cmd_flags = rw_flags;
179 ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
180}
181
182static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
183 gfp_t gfp, bool reserved)
184{
185 return blk_mq_alloc_rq(hctx, gfp, reserved);
186}
187
188static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
189 int rw, gfp_t gfp,
190 bool reserved)
191{
192 struct request *rq;
193
194 do {
195 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
196 struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
197
198 rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
199 if (rq) {
200 blk_mq_rq_ctx_init(ctx, rq, rw);
201 break;
202 } else if (!(gfp & __GFP_WAIT))
203 break;
204
205 blk_mq_put_ctx(ctx);
206 __blk_mq_run_hw_queue(hctx);
207 blk_mq_wait_for_tags(hctx->tags);
208 } while (1);
209
210 return rq;
211}
212
213struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
214{
215 struct request *rq;
216
217 if (blk_mq_queue_enter(q))
218 return NULL;
219
220 rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
221 blk_mq_put_ctx(rq->mq_ctx);
222 return rq;
223}
224
225struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
226 gfp_t gfp)
227{
228 struct request *rq;
229
230 if (blk_mq_queue_enter(q))
231 return NULL;
232
233 rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
234 blk_mq_put_ctx(rq->mq_ctx);
235 return rq;
236}
237EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
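Reserved tags are carved out of the tag space at init time (reg->reserved_tags), so an internal command can still make progress when normal IO has consumed every regular tag. A hedged sketch of how a driver might use the reserved allocator for such a command; "mydrv" and the REQ_TYPE_SPECIAL usage here are illustrative, not something this patch mandates.

/* Illustrative: allocate an abort/admin command from the reserved pool */
static struct request *mydrv_get_internal_rq(struct request_queue *q)
{
        struct request *rq;

        rq = blk_mq_alloc_reserved_request(q, WRITE, __GFP_WAIT);
        if (!rq)
                return NULL;

        rq->cmd_type = REQ_TYPE_SPECIAL;        /* not filesystem IO */
        return rq;      /* later returned with blk_mq_free_request() */
}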
238
239/*
240 * Re-init and set pdu, if we have it
241 */
242static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
243{
244 blk_rq_init(hctx->queue, rq);
245
246 if (hctx->cmd_size)
247 rq->special = blk_mq_rq_to_pdu(rq);
248}
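The pdu is the cmd_size bytes of driver-private memory that sits directly behind each struct request; pointing rq->special at it keeps older helpers working. A sketch of how a driver would fill it in, assuming a hypothetical per-command structure (nothing named mydrv_* exists in this patch):

/* Hypothetical per-request payload, sized via reg->cmd_size */
struct mydrv_cmd {
        u32 opcode;
        u64 lba;
};

static void mydrv_prep_cmd(struct request *rq)
{
        struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

        cmd->opcode = rq_data_dir(rq) ? 0x01 : 0x02;    /* write : read */
        cmd->lba = blk_rq_pos(rq);
}

Because the memory is laid out once per tag in blk_mq_init_rq_map(), the hot path never allocates; blk_mq_rq_from_pdu() does the reverse mapping on completion.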
249
250static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
251 struct blk_mq_ctx *ctx, struct request *rq)
252{
253 const int tag = rq->tag;
254 struct request_queue *q = rq->q;
255
256 blk_mq_rq_init(hctx, rq);
257 blk_mq_put_tag(hctx->tags, tag);
258
259 blk_mq_queue_exit(q);
260}
261
262void blk_mq_free_request(struct request *rq)
263{
264 struct blk_mq_ctx *ctx = rq->mq_ctx;
265 struct blk_mq_hw_ctx *hctx;
266 struct request_queue *q = rq->q;
267
268 ctx->rq_completed[rq_is_sync(rq)]++;
269
270 hctx = q->mq_ops->map_queue(q, ctx->cpu);
271 __blk_mq_free_request(hctx, ctx, rq);
272}
273
274static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error)
275{
276 if (error)
277 clear_bit(BIO_UPTODATE, &bio->bi_flags);
278 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
279 error = -EIO;
280
281 if (unlikely(rq->cmd_flags & REQ_QUIET))
282 set_bit(BIO_QUIET, &bio->bi_flags);
283
284 /* don't actually finish bio if it's part of flush sequence */
285 if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
286 bio_endio(bio, error);
287}
288
289void blk_mq_complete_request(struct request *rq, int error)
290{
291 struct bio *bio = rq->bio;
292 unsigned int bytes = 0;
293
294 trace_block_rq_complete(rq->q, rq);
295
296 while (bio) {
297 struct bio *next = bio->bi_next;
298
299 bio->bi_next = NULL;
300 bytes += bio->bi_size;
301 blk_mq_bio_endio(rq, bio, error);
302 bio = next;
303 }
304
305 blk_account_io_completion(rq, bytes);
306
307 if (rq->end_io)
308 rq->end_io(rq, error);
309 else
310 blk_mq_free_request(rq);
311
312 blk_account_io_done(rq);
313}
314
315void __blk_mq_end_io(struct request *rq, int error)
316{
317 if (!blk_mark_rq_complete(rq))
318 blk_mq_complete_request(rq, error);
319}
320
321#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
322
323/*
324 * Called with interrupts disabled.
325 */
326static void ipi_end_io(void *data)
327{
328 struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id());
329 struct llist_node *entry, *next;
330 struct request *rq;
331
332 entry = llist_del_all(list);
333
334 while (entry) {
335 next = entry->next;
336 rq = llist_entry(entry, struct request, ll_list);
337 __blk_mq_end_io(rq, rq->errors);
338 entry = next;
339 }
340}
341
342static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
343 struct request *rq, const int error)
344{
345 struct call_single_data *data = &rq->csd;
346
347 rq->errors = error;
348 rq->ll_list.next = NULL;
349
350 /*
351 * If the list is non-empty, an existing IPI must already
352 * be "in flight". If that is the case, we need not schedule
353 * a new one.
354 */
355 if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) {
356 data->func = ipi_end_io;
357 data->flags = 0;
358 __smp_call_function_single(ctx->cpu, data, 0);
359 }
360
361 return true;
362}
363#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
364static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
365 struct request *rq, const int error)
366{
367 return false;
368}
369#endif
370
371/*
372 * End IO on this request on a multiqueue enabled driver. We'll either do
373 * it directly inline, or punt it to an IPI handler that runs on the
374 * matching remote CPU.
375 */
376void blk_mq_end_io(struct request *rq, int error)
377{
378 struct blk_mq_ctx *ctx = rq->mq_ctx;
379 int cpu;
380
381 if (!ctx->ipi_redirect)
382 return __blk_mq_end_io(rq, error);
383
384 cpu = get_cpu();
385
386 if (cpu == ctx->cpu || !cpu_online(ctx->cpu) ||
387 !ipi_remote_cpu(ctx, cpu, rq, error))
388 __blk_mq_end_io(rq, error);
389
390 put_cpu();
391}
392EXPORT_SYMBOL(blk_mq_end_io);
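For a driver the completion contract is simply "look the request up by tag and call blk_mq_end_io()"; whether that runs inline or is bounced to the submitting CPU via the IPI path above is decided by ctx->ipi_redirect and hidden from the caller. A hedged sketch of an interrupt handler, where the mydrv_* completion ring is hypothetical:

/* Illustrative IRQ handler; assumes the hw completion entry carries the tag */
static irqreturn_t mydrv_irq(int irq, void *data)
{
        struct blk_mq_hw_ctx *hctx = data;
        unsigned int tag;
        int status;

        while (mydrv_next_completion(hctx->driver_data, &tag, &status)) {
                struct request *rq = blk_mq_tag_to_rq(hctx, tag);

                blk_mq_end_io(rq, status ? -EIO : 0);
        }

        return IRQ_HANDLED;
}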
393
394static void blk_mq_start_request(struct request *rq)
395{
396 struct request_queue *q = rq->q;
397
398 trace_block_rq_issue(q, rq);
399
400 /*
401 * Just mark start time and set the started bit. Due to memory
402 * ordering, we know we'll see the correct deadline as long as
403 * REQ_ATOM_STARTED is seen.
404 */
405 rq->deadline = jiffies + q->rq_timeout;
406 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
407}
408
409static void blk_mq_requeue_request(struct request *rq)
410{
411 struct request_queue *q = rq->q;
412
413 trace_block_rq_requeue(q, rq);
414 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
415}
416
417struct blk_mq_timeout_data {
418 struct blk_mq_hw_ctx *hctx;
419 unsigned long *next;
420 unsigned int *next_set;
421};
422
423static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
424{
425 struct blk_mq_timeout_data *data = __data;
426 struct blk_mq_hw_ctx *hctx = data->hctx;
427 unsigned int tag;
428
429 /*
430 * It may not be in flight yet (this is where REQ_ATOM_STARTED comes
431 * in). The requests are statically allocated, so we know it's always
432 * safe to access the memory associated with a bit offset into ->rqs[].
433 */
434 tag = 0;
435 do {
436 struct request *rq;
437
438 tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
439 if (tag >= hctx->queue_depth)
440 break;
441
442 rq = hctx->rqs[tag++];
443
444 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
445 continue;
446
447 blk_rq_check_expired(rq, data->next, data->next_set);
448 } while (1);
449}
450
451static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
452 unsigned long *next,
453 unsigned int *next_set)
454{
455 struct blk_mq_timeout_data data = {
456 .hctx = hctx,
457 .next = next,
458 .next_set = next_set,
459 };
460
461 /*
462 * Ask the tagging code to iterate busy requests, so we can
463 * check them for timeout.
464 */
465 blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
466}
467
468static void blk_mq_rq_timer(unsigned long data)
469{
470 struct request_queue *q = (struct request_queue *) data;
471 struct blk_mq_hw_ctx *hctx;
472 unsigned long next = 0;
473 int i, next_set = 0;
474
475 queue_for_each_hw_ctx(q, hctx, i)
476 blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
477
478 if (next_set)
479 mod_timer(&q->timeout, round_jiffies_up(next));
480}
481
482/*
483 * Reverse check our software queue for entries that we could potentially
484 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
485 * too much time checking for merges.
486 */
487static bool blk_mq_attempt_merge(struct request_queue *q,
488 struct blk_mq_ctx *ctx, struct bio *bio)
489{
490 struct request *rq;
491 int checked = 8;
492
493 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
494 int el_ret;
495
496 if (!checked--)
497 break;
498
499 if (!blk_rq_merge_ok(rq, bio))
500 continue;
501
502 el_ret = blk_try_merge(rq, bio);
503 if (el_ret == ELEVATOR_BACK_MERGE) {
504 if (bio_attempt_back_merge(q, rq, bio)) {
505 ctx->rq_merged++;
506 return true;
507 }
508 break;
509 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
510 if (bio_attempt_front_merge(q, rq, bio)) {
511 ctx->rq_merged++;
512 return true;
513 }
514 break;
515 }
516 }
517
518 return false;
519}
520
521void blk_mq_add_timer(struct request *rq)
522{
523 __blk_add_timer(rq, NULL);
524}
525
526/*
527 * Run this hardware queue, pulling any software queues mapped to it in.
528 * Note that this function currently has various problems around ordering
529 * of IO. In particular, we'd like FIFO behaviour on handling existing
530 * items on the hctx->dispatch list. Ignore that for now.
531 */
532static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
533{
534 struct request_queue *q = hctx->queue;
535 struct blk_mq_ctx *ctx;
536 struct request *rq;
537 LIST_HEAD(rq_list);
538 int bit, queued;
539
540 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
541 return;
542
543 hctx->run++;
544
545 /*
546 * Touch any software queue that has pending entries.
547 */
548 for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
549 clear_bit(bit, hctx->ctx_map);
550 ctx = hctx->ctxs[bit];
551 BUG_ON(bit != ctx->index_hw);
552
553 spin_lock(&ctx->lock);
554 list_splice_tail_init(&ctx->rq_list, &rq_list);
555 spin_unlock(&ctx->lock);
556 }
557
558 /*
559 * If we have previous entries on our dispatch list, grab them
560 * and stuff them at the front for more fair dispatch.
561 */
562 if (!list_empty_careful(&hctx->dispatch)) {
563 spin_lock(&hctx->lock);
564 if (!list_empty(&hctx->dispatch))
565 list_splice_init(&hctx->dispatch, &rq_list);
566 spin_unlock(&hctx->lock);
567 }
568
569 /*
570 * Delete and return all entries from our dispatch list
571 */
572 queued = 0;
573
574 /*
575 * Now process all the entries, sending them to the driver.
576 */
577 while (!list_empty(&rq_list)) {
578 int ret;
579
580 rq = list_first_entry(&rq_list, struct request, queuelist);
581 list_del_init(&rq->queuelist);
582 blk_mq_start_request(rq);
583
584 /*
585 * Last request in the series. Flag it as such, this
586 * enables drivers to know when IO should be kicked off,
587 * if they don't do it on a per-request basis.
588 *
589 * Note: the flag isn't the only condition on which drivers
590 * should kick off IO. If the drive is busy, the last
591 * request might not have the bit set.
592 */
593 if (list_empty(&rq_list))
594 rq->cmd_flags |= REQ_END;
595
596 ret = q->mq_ops->queue_rq(hctx, rq);
597 switch (ret) {
598 case BLK_MQ_RQ_QUEUE_OK:
599 queued++;
600 continue;
601 case BLK_MQ_RQ_QUEUE_BUSY:
602 /*
603 * FIXME: we should have a mechanism to stop the queue
604 * like blk_stop_queue, otherwise we will waste cpu
605 * time
606 */
607 list_add(&rq->queuelist, &rq_list);
608 blk_mq_requeue_request(rq);
609 break;
610 default:
611 pr_err("blk-mq: bad return on queue: %d\n", ret);
612 rq->errors = -EIO;
613 case BLK_MQ_RQ_QUEUE_ERROR:
614 blk_mq_end_io(rq, rq->errors);
615 break;
616 }
617
618 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
619 break;
620 }
621
622 if (!queued)
623 hctx->dispatched[0]++;
624 else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
625 hctx->dispatched[ilog2(queued) + 1]++;
626
627 /*
628 * Any items that need requeuing? Stuff them into hctx->dispatch,
629 * that is where we will continue on next queue run.
630 */
631 if (!list_empty(&rq_list)) {
632 spin_lock(&hctx->lock);
633 list_splice(&rq_list, &hctx->dispatch);
634 spin_unlock(&hctx->lock);
635 }
636}
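The switch above is the ->queue_rq() contract: return BLK_MQ_RQ_QUEUE_OK once the command is owned by the hardware, BLK_MQ_RQ_QUEUE_BUSY to have the request parked on hctx->dispatch and retried on the next queue run, or BLK_MQ_RQ_QUEUE_ERROR to have blk-mq end it with rq->errors. A minimal, hypothetical implementation (the mydrv_* helpers are made up) showing the three cases and the REQ_END hint set just above:

static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
        struct mydrv_queue *hwq = hctx->driver_data;

        if (blk_rq_sectors(rq) > hwq->max_sectors)
                return BLK_MQ_RQ_QUEUE_ERROR;   /* blk-mq ends the IO */

        if (mydrv_ring_full(hwq))
                return BLK_MQ_RQ_QUEUE_BUSY;    /* requeued and retried */

        mydrv_write_sqe(hwq, rq, rq->tag);
        if (rq->cmd_flags & REQ_END)            /* last of the batch */
                mydrv_ring_doorbell(hwq);
        return BLK_MQ_RQ_QUEUE_OK;
}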
637
638void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
639{
640 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
641 return;
642
643 if (!async)
644 __blk_mq_run_hw_queue(hctx);
645 else {
646 struct request_queue *q = hctx->queue;
647
648 kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
649 }
650}
651
652void blk_mq_run_queues(struct request_queue *q, bool async)
653{
654 struct blk_mq_hw_ctx *hctx;
655 int i;
656
657 queue_for_each_hw_ctx(q, hctx, i) {
658 if ((!blk_mq_hctx_has_pending(hctx) &&
659 list_empty_careful(&hctx->dispatch)) ||
660 test_bit(BLK_MQ_S_STOPPED, &hctx->state))
661 continue;
662
663 blk_mq_run_hw_queue(hctx, async);
664 }
665}
666EXPORT_SYMBOL(blk_mq_run_queues);
667
668void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
669{
670 cancel_delayed_work(&hctx->delayed_work);
671 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
672}
673EXPORT_SYMBOL(blk_mq_stop_hw_queue);
674
675void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
676{
677 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
678 __blk_mq_run_hw_queue(hctx);
679}
680EXPORT_SYMBOL(blk_mq_start_hw_queue);
681
682void blk_mq_start_stopped_hw_queues(struct request_queue *q)
683{
684 struct blk_mq_hw_ctx *hctx;
685 int i;
686
687 queue_for_each_hw_ctx(q, hctx, i) {
688 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
689 continue;
690
691 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
692 blk_mq_run_hw_queue(hctx, true);
693 }
694}
695EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
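These helpers are the intended answer to the FIXME in the BUSY case earlier: rather than letting blk-mq spin on requeues while the device ring is full, a driver can stop the hardware queue and restart it from its completion path once space frees up. A hypothetical sketch (mydrv_* is made up):

/* In ->queue_rq(), when the submission ring is full */
static int mydrv_queue_rq_throttled(struct blk_mq_hw_ctx *hctx,
                                    struct request *rq)
{
        struct mydrv_queue *hwq = hctx->driver_data;

        if (mydrv_ring_full(hwq)) {
                blk_mq_stop_hw_queue(hctx);     /* no further queue runs */
                return BLK_MQ_RQ_QUEUE_BUSY;    /* rq lands on hctx->dispatch */
        }

        mydrv_submit(hwq, rq);
        return BLK_MQ_RQ_QUEUE_OK;
}

/* In the completion path, once at least one slot is free again */
static void mydrv_ring_space_freed(struct request_queue *q)
{
        blk_mq_start_stopped_hw_queues(q);      /* dispatch list drains first */
}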
696
697static void blk_mq_work_fn(struct work_struct *work)
698{
699 struct blk_mq_hw_ctx *hctx;
700
701 hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
702 __blk_mq_run_hw_queue(hctx);
703}
704
705static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
706 struct request *rq)
707{
708 struct blk_mq_ctx *ctx = rq->mq_ctx;
709
710 list_add_tail(&rq->queuelist, &ctx->rq_list);
711 blk_mq_hctx_mark_pending(hctx, ctx);
712
713 /*
714 * We do this early, to ensure we are on the right CPU.
715 */
716 blk_mq_add_timer(rq);
717}
718
719void blk_mq_insert_request(struct request_queue *q, struct request *rq,
720 bool run_queue)
721{
722 struct blk_mq_hw_ctx *hctx;
723 struct blk_mq_ctx *ctx, *current_ctx;
724
725 ctx = rq->mq_ctx;
726 hctx = q->mq_ops->map_queue(q, ctx->cpu);
727
728 if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
729 blk_insert_flush(rq);
730 } else {
731 current_ctx = blk_mq_get_ctx(q);
732
733 if (!cpu_online(ctx->cpu)) {
734 ctx = current_ctx;
735 hctx = q->mq_ops->map_queue(q, ctx->cpu);
736 rq->mq_ctx = ctx;
737 }
738 spin_lock(&ctx->lock);
739 __blk_mq_insert_request(hctx, rq);
740 spin_unlock(&ctx->lock);
741
742 blk_mq_put_ctx(current_ctx);
743 }
744
745 if (run_queue)
746 __blk_mq_run_hw_queue(hctx);
747}
748EXPORT_SYMBOL(blk_mq_insert_request);
749
750/*
751 * This is a special version of blk_mq_insert_request to bypass FLUSH request
752 * check. Should only be used internally.
753 */
754void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
755{
756 struct request_queue *q = rq->q;
757 struct blk_mq_hw_ctx *hctx;
758 struct blk_mq_ctx *ctx, *current_ctx;
759
760 current_ctx = blk_mq_get_ctx(q);
761
762 ctx = rq->mq_ctx;
763 if (!cpu_online(ctx->cpu)) {
764 ctx = current_ctx;
765 rq->mq_ctx = ctx;
766 }
767 hctx = q->mq_ops->map_queue(q, ctx->cpu);
768
769 /* ctx->cpu might be offline */
770 spin_lock(&ctx->lock);
771 __blk_mq_insert_request(hctx, rq);
772 spin_unlock(&ctx->lock);
773
774 blk_mq_put_ctx(current_ctx);
775
776 if (run_queue)
777 blk_mq_run_hw_queue(hctx, async);
778}
779
780static void blk_mq_insert_requests(struct request_queue *q,
781 struct blk_mq_ctx *ctx,
782 struct list_head *list,
783 int depth,
784 bool from_schedule)
785
786{
787 struct blk_mq_hw_ctx *hctx;
788 struct blk_mq_ctx *current_ctx;
789
790 trace_block_unplug(q, depth, !from_schedule);
791
792 current_ctx = blk_mq_get_ctx(q);
793
794 if (!cpu_online(ctx->cpu))
795 ctx = current_ctx;
796 hctx = q->mq_ops->map_queue(q, ctx->cpu);
797
798 /*
799 * preemption doesn't flush the plug list, so it's possible that
800 * ctx->cpu is offline now
801 */
802 spin_lock(&ctx->lock);
803 while (!list_empty(list)) {
804 struct request *rq;
805
806 rq = list_first_entry(list, struct request, queuelist);
807 list_del_init(&rq->queuelist);
808 rq->mq_ctx = ctx;
809 __blk_mq_insert_request(hctx, rq);
810 }
811 spin_unlock(&ctx->lock);
812
813 blk_mq_put_ctx(current_ctx);
814
815 blk_mq_run_hw_queue(hctx, from_schedule);
816}
817
818static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
819{
820 struct request *rqa = container_of(a, struct request, queuelist);
821 struct request *rqb = container_of(b, struct request, queuelist);
822
823 return !(rqa->mq_ctx < rqb->mq_ctx ||
824 (rqa->mq_ctx == rqb->mq_ctx &&
825 blk_rq_pos(rqa) < blk_rq_pos(rqb)));
826}
827
828void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
829{
830 struct blk_mq_ctx *this_ctx;
831 struct request_queue *this_q;
832 struct request *rq;
833 LIST_HEAD(list);
834 LIST_HEAD(ctx_list);
835 unsigned int depth;
836
837 list_splice_init(&plug->mq_list, &list);
838
839 list_sort(NULL, &list, plug_ctx_cmp);
840
841 this_q = NULL;
842 this_ctx = NULL;
843 depth = 0;
844
845 while (!list_empty(&list)) {
846 rq = list_entry_rq(list.next);
847 list_del_init(&rq->queuelist);
848 BUG_ON(!rq->q);
849 if (rq->mq_ctx != this_ctx) {
850 if (this_ctx) {
851 blk_mq_insert_requests(this_q, this_ctx,
852 &ctx_list, depth,
853 from_schedule);
854 }
855
856 this_ctx = rq->mq_ctx;
857 this_q = rq->q;
858 depth = 0;
859 }
860
861 depth++;
862 list_add_tail(&rq->queuelist, &ctx_list);
863 }
864
865 /*
866 * If 'this_ctx' is set, we know we have entries to complete
867 * on 'ctx_list'. Do those.
868 */
869 if (this_ctx) {
870 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
871 from_schedule);
872 }
873}
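Submitters never call blk_mq_flush_plug_list() directly; they get this batching for free by wrapping their submissions in the normal plugging API, since the requests land on plug->mq_list and blk_flush_plug_list() hands them to the sort-and-insert path above. A small illustrative example (the bios are assumed to have been built elsewhere):

/* Illustrative: batch a set of already-built bios under one plug */
static void mydrv_submit_batch(struct bio **bios, int nr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);          /* requests gather on plug->mq_list */
        for (i = 0; i < nr; i++)
                submit_bio(bios[i]->bi_rw, bios[i]);
        blk_finish_plug(&plug);         /* sorted per ctx, then inserted */
}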
874
875static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
876{
877 init_request_from_bio(rq, bio);
878 blk_account_io_start(rq, 1);
879}
880
881static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
882{
883 struct blk_mq_hw_ctx *hctx;
884 struct blk_mq_ctx *ctx;
885 const int is_sync = rw_is_sync(bio->bi_rw);
886 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
887 int rw = bio_data_dir(bio);
888 struct request *rq;
889 unsigned int use_plug, request_count = 0;
890
891 /*
892 * If we have multiple hardware queues, just go directly to
893 * one of those for sync IO.
894 */
895 use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);
896
897 blk_queue_bounce(q, &bio);
898
899 if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
900 return;
901
902 if (blk_mq_queue_enter(q)) {
903 bio_endio(bio, -EIO);
904 return;
905 }
906
907 ctx = blk_mq_get_ctx(q);
908 hctx = q->mq_ops->map_queue(q, ctx->cpu);
909
910 trace_block_getrq(q, bio, rw);
911 rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
912 if (likely(rq))
913 blk_mq_rq_ctx_init(ctx, rq, rw);
914 else {
915 blk_mq_put_ctx(ctx);
916 trace_block_sleeprq(q, bio, rw);
917 rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
918 false);
919 ctx = rq->mq_ctx;
920 hctx = q->mq_ops->map_queue(q, ctx->cpu);
921 }
922
923 hctx->queued++;
924
925 if (unlikely(is_flush_fua)) {
926 blk_mq_bio_to_request(rq, bio);
927 blk_mq_put_ctx(ctx);
928 blk_insert_flush(rq);
929 goto run_queue;
930 }
931
932 /*
933 * If a task plug is active, use it to temporarily store requests
934 * until the task is either done or scheduled away. This is
935 * completely lockless, since the plug list is per-task.
936 */
937 if (use_plug) {
938 struct blk_plug *plug = current->plug;
939
940 if (plug) {
941 blk_mq_bio_to_request(rq, bio);
942 if (list_empty(&plug->mq_list))
943 trace_block_plug(q);
944 else if (request_count >= BLK_MAX_REQUEST_COUNT) {
945 blk_flush_plug_list(plug, false);
946 trace_block_plug(q);
947 }
948 list_add_tail(&rq->queuelist, &plug->mq_list);
949 blk_mq_put_ctx(ctx);
950 return;
951 }
952 }
953
954 spin_lock(&ctx->lock);
955
956 if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
957 blk_mq_attempt_merge(q, ctx, bio))
958 __blk_mq_free_request(hctx, ctx, rq);
959 else {
960 blk_mq_bio_to_request(rq, bio);
961 __blk_mq_insert_request(hctx, rq);
962 }
963
964 spin_unlock(&ctx->lock);
965 blk_mq_put_ctx(ctx);
966
967 /*
968 * For a SYNC request, send it to the hardware immediately. For an
969 * ASYNC request, just ensure that we run it later on. The latter
970 * allows for merging opportunities and more efficient dispatching.
971 */
972run_queue:
973 blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
974}
975
976/*
977 * Default mapping to a software queue, since we use one per CPU.
978 */
979struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
980{
981 return q->queue_hw_ctx[q->mq_map[cpu]];
982}
983EXPORT_SYMBOL(blk_mq_map_queue);
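This is only the default ->map_queue; a driver that wants a different ctx-to-hctx policy supplies its own. As a hedged example, a trivial N:M mapping that spreads CPUs round-robin over the hardware queues, ignoring the topology awareness that blk_mq_make_queue_map() builds into q->mq_map:

/* Illustrative custom ->map_queue: round-robin CPUs over hw queues */
static struct blk_mq_hw_ctx *mydrv_map_queue(struct request_queue *q,
                                             const int cpu)
{
        return q->queue_hw_ctx[cpu % q->nr_hw_queues];
}

Most drivers can simply point reg->ops->map_queue at blk_mq_map_queue and rely on the generated map.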
984
985struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
986 unsigned int hctx_index)
987{
988 return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
989 GFP_KERNEL | __GFP_ZERO, reg->numa_node);
990}
991EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
992
993void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
994 unsigned int hctx_index)
995{
996 kfree(hctx);
997}
998EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
999
1000static void blk_mq_hctx_notify(void *data, unsigned long action,
1001 unsigned int cpu)
1002{
1003 struct blk_mq_hw_ctx *hctx = data;
1004 struct blk_mq_ctx *ctx;
1005 LIST_HEAD(tmp);
1006
1007 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1008 return;
1009
1010 /*
1011 * Move ctx entries to new CPU, if this one is going away.
1012 */
1013 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
1014
1015 spin_lock(&ctx->lock);
1016 if (!list_empty(&ctx->rq_list)) {
1017 list_splice_init(&ctx->rq_list, &tmp);
1018 clear_bit(ctx->index_hw, hctx->ctx_map);
1019 }
1020 spin_unlock(&ctx->lock);
1021
1022 if (list_empty(&tmp))
1023 return;
1024
1025 ctx = blk_mq_get_ctx(hctx->queue);
1026 spin_lock(&ctx->lock);
1027
1028 while (!list_empty(&tmp)) {
1029 struct request *rq;
1030
1031 rq = list_first_entry(&tmp, struct request, queuelist);
1032 rq->mq_ctx = ctx;
1033 list_move_tail(&rq->queuelist, &ctx->rq_list);
1034 }
1035
1036 blk_mq_hctx_mark_pending(hctx, ctx);
1037
1038 spin_unlock(&ctx->lock);
1039 blk_mq_put_ctx(ctx);
1040}
1041
1042static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
1043 void (*init)(void *, struct blk_mq_hw_ctx *,
1044 struct request *, unsigned int),
1045 void *data)
1046{
1047 unsigned int i;
1048
1049 for (i = 0; i < hctx->queue_depth; i++) {
1050 struct request *rq = hctx->rqs[i];
1051
1052 init(data, hctx, rq, i);
1053 }
1054}
1055
1056void blk_mq_init_commands(struct request_queue *q,
1057 void (*init)(void *, struct blk_mq_hw_ctx *,
1058 struct request *, unsigned int),
1059 void *data)
1060{
1061 struct blk_mq_hw_ctx *hctx;
1062 unsigned int i;
1063
1064 queue_for_each_hw_ctx(q, hctx, i)
1065 blk_mq_init_hw_commands(hctx, init, data);
1066}
1067EXPORT_SYMBOL(blk_mq_init_commands);
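blk_mq_init_commands() walks every preallocated request exactly once at setup time, which is the natural place to tie the pdu to fixed hardware resources so the IO path never allocates. A hypothetical init callback (the mydrv_* fields are invented for illustration):

/* Illustrative init callback passed to blk_mq_init_commands() */
static void mydrv_init_cmd(void *data, struct blk_mq_hw_ctx *hctx,
                           struct request *rq, unsigned int i)
{
        struct mydrv_dev *dev = data;
        struct mydrv_cmd *cmd = blk_mq_rq_to_pdu(rq);

        cmd->tag = i;                           /* mirrors the blk-mq tag */
        cmd->sg = dev->sg_pool + i * dev->sg_per_cmd;
}

The driver would call blk_mq_init_commands(q, mydrv_init_cmd, dev) once, right after blk_mq_init_queue() returns.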
1068
1069static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
1070{
1071 struct page *page;
1072
1073 while (!list_empty(&hctx->page_list)) {
1074 page = list_first_entry(&hctx->page_list, struct page, list);
1075 list_del_init(&page->list);
1076 __free_pages(page, page->private);
1077 }
1078
1079 kfree(hctx->rqs);
1080
1081 if (hctx->tags)
1082 blk_mq_free_tags(hctx->tags);
1083}
1084
1085static size_t order_to_size(unsigned int order)
1086{
1087 size_t ret = PAGE_SIZE;
1088
1089 while (order--)
1090 ret *= 2;
1091
1092 return ret;
1093}
1094
1095static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
1096 unsigned int reserved_tags, int node)
1097{
1098 unsigned int i, j, entries_per_page, max_order = 4;
1099 size_t rq_size, left;
1100
1101 INIT_LIST_HEAD(&hctx->page_list);
1102
1103 hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
1104 GFP_KERNEL, node);
1105 if (!hctx->rqs)
1106 return -ENOMEM;
1107
1108 /*
1109 * rq_size is the size of the request plus driver payload, rounded
1110 * to the cacheline size
1111 */
1112 rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
1113 cache_line_size());
1114 left = rq_size * hctx->queue_depth;
1115
1116 for (i = 0; i < hctx->queue_depth;) {
1117 int this_order = max_order;
1118 struct page *page;
1119 int to_do;
1120 void *p;
1121
1122 while (this_order && left < order_to_size(this_order - 1))
1123 this_order--;
1124
1125 do {
1126 page = alloc_pages_node(node, GFP_KERNEL, this_order);
1127 if (page)
1128 break;
1129 if (!this_order--)
1130 break;
1131 if (order_to_size(this_order) < rq_size)
1132 break;
1133 } while (1);
1134
1135 if (!page)
1136 break;
1137
1138 page->private = this_order;
1139 list_add_tail(&page->list, &hctx->page_list);
1140
1141 p = page_address(page);
1142 entries_per_page = order_to_size(this_order) / rq_size;
1143 to_do = min(entries_per_page, hctx->queue_depth - i);
1144 left -= to_do * rq_size;
1145 for (j = 0; j < to_do; j++) {
1146 hctx->rqs[i] = p;
1147 blk_mq_rq_init(hctx, hctx->rqs[i]);
1148 p += rq_size;
1149 i++;
1150 }
1151 }
1152
1153 if (i < (reserved_tags + BLK_MQ_TAG_MIN))
1154 goto err_rq_map;
1155 else if (i != hctx->queue_depth) {
1156 hctx->queue_depth = i;
1157 pr_warn("%s: queue depth set to %u because of low memory\n",
1158 __func__, i);
1159 }
1160
1161 hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
1162 if (!hctx->tags) {
1163err_rq_map:
1164 blk_mq_free_rq_map(hctx);
1165 return -ENOMEM;
1166 }
1167
1168 return 0;
1169}
1170
1171static int blk_mq_init_hw_queues(struct request_queue *q,
1172 struct blk_mq_reg *reg, void *driver_data)
1173{
1174 struct blk_mq_hw_ctx *hctx;
1175 unsigned int i, j;
1176
1177 /*
1178 * Initialize hardware queues
1179 */
1180 queue_for_each_hw_ctx(q, hctx, i) {
1181 unsigned int num_maps;
1182 int node;
1183
1184 node = hctx->numa_node;
1185 if (node == NUMA_NO_NODE)
1186 node = hctx->numa_node = reg->numa_node;
1187
1188 INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
1189 spin_lock_init(&hctx->lock);
1190 INIT_LIST_HEAD(&hctx->dispatch);
1191 hctx->queue = q;
1192 hctx->queue_num = i;
1193 hctx->flags = reg->flags;
1194 hctx->queue_depth = reg->queue_depth;
1195 hctx->cmd_size = reg->cmd_size;
1196
1197 blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1198 blk_mq_hctx_notify, hctx);
1199 blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1200
1201 if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
1202 break;
1203
1204 /*
1205 * Allocate space for all possible CPUs to avoid allocating at
1206 * runtime
1207 */
1208 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1209 GFP_KERNEL, node);
1210 if (!hctx->ctxs)
1211 break;
1212
1213 num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
1214 hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
1215 GFP_KERNEL, node);
1216 if (!hctx->ctx_map)
1217 break;
1218
1219 hctx->nr_ctx_map = num_maps;
1220 hctx->nr_ctx = 0;
1221
1222 if (reg->ops->init_hctx &&
1223 reg->ops->init_hctx(hctx, driver_data, i))
1224 break;
1225 }
1226
1227 if (i == q->nr_hw_queues)
1228 return 0;
1229
1230 /*
1231 * Init failed
1232 */
1233 queue_for_each_hw_ctx(q, hctx, j) {
1234 if (i == j)
1235 break;
1236
1237 if (reg->ops->exit_hctx)
1238 reg->ops->exit_hctx(hctx, j);
1239
1240 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1241 blk_mq_free_rq_map(hctx);
1242 kfree(hctx->ctxs);
1243 }
1244
1245 return 1;
1246}
1247
1248static void blk_mq_init_cpu_queues(struct request_queue *q,
1249 unsigned int nr_hw_queues)
1250{
1251 unsigned int i;
1252
1253 for_each_possible_cpu(i) {
1254 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1255 struct blk_mq_hw_ctx *hctx;
1256
1257 memset(__ctx, 0, sizeof(*__ctx));
1258 __ctx->cpu = i;
1259 spin_lock_init(&__ctx->lock);
1260 INIT_LIST_HEAD(&__ctx->rq_list);
1261 __ctx->queue = q;
1262
1263 /* If the cpu isn't online, it is mapped to the first hctx */
1264 hctx = q->mq_ops->map_queue(q, i);
1265 hctx->nr_ctx++;
1266
1267 if (!cpu_online(i))
1268 continue;
1269
1270 /*
1271 * Set local node, IFF we have more than one hw queue. If
1272 * not, we remain on the home node of the device
1273 */
1274 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1275 hctx->numa_node = cpu_to_node(i);
1276 }
1277}
1278
1279static void blk_mq_map_swqueue(struct request_queue *q)
1280{
1281 unsigned int i;
1282 struct blk_mq_hw_ctx *hctx;
1283 struct blk_mq_ctx *ctx;
1284
1285 queue_for_each_hw_ctx(q, hctx, i) {
1286 hctx->nr_ctx = 0;
1287 }
1288
1289 /*
1290 * Map software to hardware queues
1291 */
1292 queue_for_each_ctx(q, ctx, i) {
1293 /* If the cpu isn't online, it is mapped to the first hctx */
1294 hctx = q->mq_ops->map_queue(q, i);
1295 ctx->index_hw = hctx->nr_ctx;
1296 hctx->ctxs[hctx->nr_ctx++] = ctx;
1297 }
1298}
1299
1300struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
1301 void *driver_data)
1302{
1303 struct blk_mq_hw_ctx **hctxs;
1304 struct blk_mq_ctx *ctx;
1305 struct request_queue *q;
1306 int i;
1307
1308 if (!reg->nr_hw_queues ||
1309 !reg->ops->queue_rq || !reg->ops->map_queue ||
1310 !reg->ops->alloc_hctx || !reg->ops->free_hctx)
1311 return ERR_PTR(-EINVAL);
1312
1313 if (!reg->queue_depth)
1314 reg->queue_depth = BLK_MQ_MAX_DEPTH;
1315 else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
1316 pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
1317 reg->queue_depth = BLK_MQ_MAX_DEPTH;
1318 }
1319
1320 if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
1321 return ERR_PTR(-EINVAL);
1322
1323 ctx = alloc_percpu(struct blk_mq_ctx);
1324 if (!ctx)
1325 return ERR_PTR(-ENOMEM);
1326
1327 hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1328 reg->numa_node);
1329
1330 if (!hctxs)
1331 goto err_percpu;
1332
1333 for (i = 0; i < reg->nr_hw_queues; i++) {
1334 hctxs[i] = reg->ops->alloc_hctx(reg, i);
1335 if (!hctxs[i])
1336 goto err_hctxs;
1337
1338 hctxs[i]->numa_node = NUMA_NO_NODE;
1339 hctxs[i]->queue_num = i;
1340 }
1341
1342 q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
1343 if (!q)
1344 goto err_hctxs;
1345
1346 q->mq_map = blk_mq_make_queue_map(reg);
1347 if (!q->mq_map)
1348 goto err_map;
1349
1350 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1351 blk_queue_rq_timeout(q, 30000);
1352
1353 q->nr_queues = nr_cpu_ids;
1354 q->nr_hw_queues = reg->nr_hw_queues;
1355
1356 q->queue_ctx = ctx;
1357 q->queue_hw_ctx = hctxs;
1358
1359 q->mq_ops = reg->ops;
1360
1361 blk_queue_make_request(q, blk_mq_make_request);
1362 blk_queue_rq_timed_out(q, reg->ops->timeout);
1363 if (reg->timeout)
1364 blk_queue_rq_timeout(q, reg->timeout);
1365
1366 blk_mq_init_flush(q);
1367 blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
1368
1369 if (blk_mq_init_hw_queues(q, reg, driver_data))
1370 goto err_hw;
1371
1372 blk_mq_map_swqueue(q);
1373
1374 mutex_lock(&all_q_mutex);
1375 list_add_tail(&q->all_q_node, &all_q_list);
1376 mutex_unlock(&all_q_mutex);
1377
1378 return q;
1379err_hw:
1380 kfree(q->mq_map);
1381err_map:
1382 blk_cleanup_queue(q);
1383err_hctxs:
1384 for (i = 0; i < reg->nr_hw_queues; i++) {
1385 if (!hctxs[i])
1386 break;
1387 reg->ops->free_hctx(hctxs[i], i);
1388 }
1389 kfree(hctxs);
1390err_percpu:
1391 free_percpu(ctx);
1392 return ERR_PTR(-ENOMEM);
1393}
1394EXPORT_SYMBOL(blk_mq_init_queue);
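Putting the registration interface together: a driver describes itself in a struct blk_mq_reg (ops, hardware queue count, depth, per-request payload size, flags), calls blk_mq_init_queue(), and attaches the resulting queue to its gendisk. The sketch below shows the minimum, with error handling trimmed; every mydrv_* name is hypothetical, and mydrv_queue_rq is the kind of handler sketched earlier.

static struct blk_mq_ops mydrv_mq_ops = {
        .queue_rq       = mydrv_queue_rq,               /* hypothetical */
        .map_queue      = blk_mq_map_queue,             /* default mapping */
        .alloc_hctx     = blk_mq_alloc_single_hw_queue,
        .free_hctx      = blk_mq_free_single_hw_queue,
};

static struct blk_mq_reg mydrv_mq_reg = {
        .ops            = &mydrv_mq_ops,
        .nr_hw_queues   = 1,
        .queue_depth    = 64,
        .cmd_size       = sizeof(struct mydrv_cmd),     /* pdu behind each rq */
        .numa_node      = NUMA_NO_NODE,
        .flags          = BLK_MQ_F_SHOULD_MERGE,
};

static int mydrv_setup_queue(struct mydrv_dev *dev)
{
        struct request_queue *q;

        q = blk_mq_init_queue(&mydrv_mq_reg, dev);
        if (IS_ERR(q))
                return PTR_ERR(q);

        dev->disk->queue = q;           /* registered via add_disk() later */
        return 0;
}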
1395
1396void blk_mq_free_queue(struct request_queue *q)
1397{
1398 struct blk_mq_hw_ctx *hctx;
1399 int i;
1400
1401 queue_for_each_hw_ctx(q, hctx, i) {
1402 cancel_delayed_work_sync(&hctx->delayed_work);
1403 kfree(hctx->ctx_map);
1404 kfree(hctx->ctxs);
1405 blk_mq_free_rq_map(hctx);
1406 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1407 if (q->mq_ops->exit_hctx)
1408 q->mq_ops->exit_hctx(hctx, i);
1409 q->mq_ops->free_hctx(hctx, i);
1410 }
1411
1412 free_percpu(q->queue_ctx);
1413 kfree(q->queue_hw_ctx);
1414 kfree(q->mq_map);
1415
1416 q->queue_ctx = NULL;
1417 q->queue_hw_ctx = NULL;
1418 q->mq_map = NULL;
1419
1420 mutex_lock(&all_q_mutex);
1421 list_del_init(&q->all_q_node);
1422 mutex_unlock(&all_q_mutex);
1423}
1424EXPORT_SYMBOL(blk_mq_free_queue);
1425
1426/* Basically redo blk_mq_init_queue with the queue frozen */
1427static void __cpuinit blk_mq_queue_reinit(struct request_queue *q)
1428{
1429 blk_mq_freeze_queue(q);
1430
1431 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
1432
1433 /*
1434 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
1435 * we should change hctx numa_node according to the new topology (this
1436 * involves freeing and re-allocating memory; is it worth doing?)
1437 */
1438
1439 blk_mq_map_swqueue(q);
1440
1441 blk_mq_unfreeze_queue(q);
1442}
1443
1444static int __cpuinit blk_mq_queue_reinit_notify(struct notifier_block *nb,
1445 unsigned long action, void *hcpu)
1446{
1447 struct request_queue *q;
1448
1449 /*
1450 * Before the new mapping is established, a hot-added CPU might already
1451 * start handling requests. This doesn't break anything, as we map
1452 * offline CPUs to the first hardware queue. We will re-init the queue
1453 * below to get optimal settings.
1454 */
1455 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1456 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
1457 return NOTIFY_OK;
1458
1459 mutex_lock(&all_q_mutex);
1460 list_for_each_entry(q, &all_q_list, all_q_node)
1461 blk_mq_queue_reinit(q);
1462 mutex_unlock(&all_q_mutex);
1463 return NOTIFY_OK;
1464}
1465
1466static int __init blk_mq_init(void)
1467{
1468 unsigned int i;
1469
1470 for_each_possible_cpu(i)
1471 init_llist_head(&per_cpu(ipi_lists, i));
1472
1473 blk_mq_cpu_init();
1474
1475 /* Must be called after percpu_counter_hotcpu_callback() */
1476 hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
1477
1478 return 0;
1479}
1480subsys_initcall(blk_mq_init);
diff --git a/block/blk-mq.h b/block/blk-mq.h
new file mode 100644
index 000000000000..52bf1f96a2c2
--- /dev/null
+++ b/block/blk-mq.h
@@ -0,0 +1,52 @@
1#ifndef INT_BLK_MQ_H
2#define INT_BLK_MQ_H
3
4struct blk_mq_ctx {
5 struct {
6 spinlock_t lock;
7 struct list_head rq_list;
8 } ____cacheline_aligned_in_smp;
9
10 unsigned int cpu;
11 unsigned int index_hw;
12 unsigned int ipi_redirect;
13
14 /* incremented at dispatch time */
15 unsigned long rq_dispatched[2];
16 unsigned long rq_merged;
17
18 /* incremented at completion time */
19 unsigned long ____cacheline_aligned_in_smp rq_completed[2];
20
21 struct request_queue *queue;
22 struct kobject kobj;
23};
24
25void __blk_mq_end_io(struct request *rq, int error);
26void blk_mq_complete_request(struct request *rq, int error);
27void blk_mq_run_request(struct request *rq, bool run_queue, bool async);
28void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
29void blk_mq_init_flush(struct request_queue *q);
30
31/*
32 * CPU hotplug helpers
33 */
34struct blk_mq_cpu_notifier;
35void blk_mq_init_cpu_notifier(struct blk_mq_cpu_notifier *notifier,
36 void (*fn)(void *, unsigned long, unsigned int),
37 void *data);
38void blk_mq_register_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
39void blk_mq_unregister_cpu_notifier(struct blk_mq_cpu_notifier *notifier);
40void blk_mq_cpu_init(void);
41DECLARE_PER_CPU(struct llist_head, ipi_lists);
42
43/*
44 * CPU -> queue mappings
45 */
46struct blk_mq_reg;
47extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);
48extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
49
50void blk_mq_add_timer(struct request *rq);
51
52#endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3aa5b195f4dd..4f8c4d90ec73 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -7,6 +7,7 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/blktrace_api.h>
+#include <linux/blk-mq.h>
 
 #include "blk.h"
 #include "blk-cgroup.h"
@@ -542,6 +543,11 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
+	percpu_counter_destroy(&q->mq_usage_counter);
+
+	if (q->mq_ops)
+		blk_mq_free_queue(q);
+
 	blk_trace_shutdown(q);
 
 	bdi_destroy(&q->backing_dev_info);
@@ -575,6 +581,7 @@ int blk_register_queue(struct gendisk *disk)
 	 * bypass from queue allocation.
 	 */
 	blk_queue_bypass_end(q);
+	queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
 
 	ret = blk_trace_init_sysfs(dev);
 	if (ret)
@@ -588,6 +595,9 @@ int blk_register_queue(struct gendisk *disk)
 
 	kobject_uevent(&q->kobj, KOBJ_ADD);
 
+	if (q->mq_ops)
+		blk_mq_register_disk(disk);
+
 	if (!q->request_fn)
 		return 0;
 
@@ -610,6 +620,9 @@ void blk_unregister_queue(struct gendisk *disk)
 	if (WARN_ON(!q))
 		return;
 
+	if (q->mq_ops)
+		blk_mq_unregister_disk(disk);
+
 	if (q->request_fn)
 		elv_unregister_queue(q);
 
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 65f103563969..22846cf3595a 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -7,6 +7,7 @@
 #include <linux/fault-inject.h>
 
 #include "blk.h"
+#include "blk-mq.h"
 
 #ifdef CONFIG_FAIL_IO_TIMEOUT
 
@@ -88,11 +89,18 @@ static void blk_rq_timed_out(struct request *req)
 	ret = q->rq_timed_out_fn(req);
 	switch (ret) {
 	case BLK_EH_HANDLED:
-		__blk_complete_request(req);
+		/* Can we use req->errors here? */
+		if (q->mq_ops)
+			blk_mq_complete_request(req, req->errors);
+		else
+			__blk_complete_request(req);
 		break;
 	case BLK_EH_RESET_TIMER:
 		blk_clear_rq_complete(req);
-		blk_add_timer(req);
+		if (q->mq_ops)
+			blk_mq_add_timer(req);
+		else
+			blk_add_timer(req);
 		break;
 	case BLK_EH_NOT_HANDLED:
 		/*
@@ -108,6 +116,23 @@ static void blk_rq_timed_out(struct request *req)
 	}
 }
 
+void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
+			  unsigned int *next_set)
+{
+	if (time_after_eq(jiffies, rq->deadline)) {
+		list_del_init(&rq->timeout_list);
+
+		/*
+		 * Check if we raced with end io completion
+		 */
+		if (!blk_mark_rq_complete(rq))
+			blk_rq_timed_out(rq);
+	} else if (!*next_set || time_after(*next_timeout, rq->deadline)) {
+		*next_timeout = rq->deadline;
+		*next_set = 1;
+	}
+}
+
 void blk_rq_timed_out_timer(unsigned long data)
 {
 	struct request_queue *q = (struct request_queue *) data;
@@ -117,21 +142,8 @@ void blk_rq_timed_out_timer(unsigned long data)
 
 	spin_lock_irqsave(q->queue_lock, flags);
 
-	list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) {
-		if (time_after_eq(jiffies, rq->deadline)) {
-			list_del_init(&rq->timeout_list);
-
-			/*
-			 * Check if we raced with end io completion
-			 */
-			if (blk_mark_rq_complete(rq))
-				continue;
-			blk_rq_timed_out(rq);
-		} else if (!next_set || time_after(next, rq->deadline)) {
-			next = rq->deadline;
-			next_set = 1;
-		}
-	}
+	list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
+		blk_rq_check_expired(rq, &next, &next_set);
 
 	if (next_set)
 		mod_timer(&q->timeout, round_jiffies_up(next));
@@ -157,15 +169,7 @@ void blk_abort_request(struct request *req)
 }
 EXPORT_SYMBOL_GPL(blk_abort_request);
 
-/**
- * blk_add_timer - Start timeout timer for a single request
- * @req:	request that is about to start running.
- *
- * Notes:
- *    Each request has its own timer, and as it is added to the queue, we
- *    set up the timer. When the request completes, we cancel the timer.
- */
-void blk_add_timer(struct request *req)
+void __blk_add_timer(struct request *req, struct list_head *timeout_list)
 {
 	struct request_queue *q = req->q;
 	unsigned long expiry;
@@ -184,7 +188,8 @@ void blk_add_timer(struct request *req)
 	req->timeout = q->rq_timeout;
 
 	req->deadline = jiffies + req->timeout;
-	list_add_tail(&req->timeout_list, &q->timeout_list);
+	if (timeout_list)
+		list_add_tail(&req->timeout_list, timeout_list);
 
 	/*
 	 * If the timer isn't already pending or this timeout is earlier
@@ -196,5 +201,19 @@ void blk_add_timer(struct request *req)
 	if (!timer_pending(&q->timeout) ||
 	    time_before(expiry, q->timeout.expires))
 		mod_timer(&q->timeout, expiry);
+
+}
+
+/**
+ * blk_add_timer - Start timeout timer for a single request
+ * @req:	request that is about to start running.
+ *
+ * Notes:
+ *    Each request has its own timer, and as it is added to the queue, we
+ *    set up the timer. When the request completes, we cancel the timer.
+ */
+void blk_add_timer(struct request *req)
+{
+	__blk_add_timer(req, &req->q->timeout_list);
 }
 
diff --git a/block/blk.h b/block/blk.h
index e837b8f619b7..c90e1d8f7a2b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -10,6 +10,7 @@
 #define BLK_BATCH_REQ	32
 
 extern struct kmem_cache *blk_requestq_cachep;
+extern struct kmem_cache *request_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 
@@ -34,14 +35,30 @@ bool __blk_end_bidi_request(struct request *rq, int error,
 			    unsigned int nr_bytes, unsigned int bidi_bytes);
 
 void blk_rq_timed_out_timer(unsigned long data);
+void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
+			  unsigned int *next_set);
+void __blk_add_timer(struct request *req, struct list_head *timeout_list);
 void blk_delete_timer(struct request *);
 void blk_add_timer(struct request *);
 
+
+bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
+			     struct bio *bio);
+bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
+			    struct bio *bio);
+bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
+			    unsigned int *request_count);
+
+void blk_account_io_start(struct request *req, bool new_io);
+void blk_account_io_completion(struct request *req, unsigned int bytes);
+void blk_account_io_done(struct request *req);
+
 /*
  * Internal atomic flags for request handling
  */
 enum rq_atomic_flags {
 	REQ_ATOM_COMPLETE = 0,
+	REQ_ATOM_STARTED,
 };
 
 /*
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ec48bac5b039..4c2775443dcf 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -419,6 +419,8 @@ static inline void bio_list_init(struct bio_list *bl)
 	bl->head = bl->tail = NULL;
 }
 
+#define BIO_EMPTY_LIST	{ NULL, NULL }
+
 #define bio_list_for_each(bio, bl) \
 	for (bio = (bl)->head; bio; bio = bio->bi_next)
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
new file mode 100644
index 000000000000..746042ff321a
--- /dev/null
+++ b/include/linux/blk-mq.h
@@ -0,0 +1,182 @@
1#ifndef BLK_MQ_H
2#define BLK_MQ_H
3
4#include <linux/blkdev.h>
5
6struct blk_mq_tags;
7
8struct blk_mq_cpu_notifier {
9 struct list_head list;
10 void *data;
11 void (*notify)(void *data, unsigned long action, unsigned int cpu);
12};
13
14struct blk_mq_hw_ctx {
15 struct {
16 spinlock_t lock;
17 struct list_head dispatch;
18 } ____cacheline_aligned_in_smp;
19
20 unsigned long state; /* BLK_MQ_S_* flags */
21 struct delayed_work delayed_work;
22
23 unsigned long flags; /* BLK_MQ_F_* flags */
24
25 struct request_queue *queue;
26 unsigned int queue_num;
27
28 void *driver_data;
29
30 unsigned int nr_ctx;
31 struct blk_mq_ctx **ctxs;
32 unsigned int nr_ctx_map;
33 unsigned long *ctx_map;
34
35 struct request **rqs;
36 struct list_head page_list;
37 struct blk_mq_tags *tags;
38
39 unsigned long queued;
40 unsigned long run;
41#define BLK_MQ_MAX_DISPATCH_ORDER 10
42 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
43
44 unsigned int queue_depth;
45 unsigned int numa_node;
46 unsigned int cmd_size; /* per-request extra data */
47
48 struct blk_mq_cpu_notifier cpu_notifier;
49 struct kobject kobj;
50};
51
52struct blk_mq_reg {
53 struct blk_mq_ops *ops;
54 unsigned int nr_hw_queues;
55 unsigned int queue_depth;
56 unsigned int reserved_tags;
57 unsigned int cmd_size; /* per-request extra data */
58 int numa_node;
59 unsigned int timeout;
60 unsigned int flags; /* BLK_MQ_F_* */
61};
62
63typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
64typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
65typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *, unsigned int);
66typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
67typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
68typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
69
70struct blk_mq_ops {
71 /*
72 * Queue request
73 */
74 queue_rq_fn *queue_rq;
75
76 /*
77 * Map to specific hardware queue
78 */
79 map_queue_fn *map_queue;
80
81 /*
82 * Called on request timeout
83 */
84 rq_timed_out_fn *timeout;
85
86 /*
87 * Override for hctx allocations (should probably go)
88 */
89 alloc_hctx_fn *alloc_hctx;
90 free_hctx_fn *free_hctx;
91
92 /*
93 * Called when the block layer side of a hardware queue has been
94 * set up, allowing the driver to allocate/init matching structures.
95 * Ditto for exit/teardown.
96 */
97 init_hctx_fn *init_hctx;
98 exit_hctx_fn *exit_hctx;
99};
100
101enum {
102 BLK_MQ_RQ_QUEUE_OK = 0, /* queued fine */
103 BLK_MQ_RQ_QUEUE_BUSY = 1, /* requeue IO for later */
104 BLK_MQ_RQ_QUEUE_ERROR = 2, /* end IO with error */
105
106 BLK_MQ_F_SHOULD_MERGE = 1 << 0,
107 BLK_MQ_F_SHOULD_SORT = 1 << 1,
108 BLK_MQ_F_SHOULD_IPI = 1 << 2,
109
110 BLK_MQ_S_STOPPED = 1 << 0,
111
112 BLK_MQ_MAX_DEPTH = 2048,
113};
114
115struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *);
116void blk_mq_free_queue(struct request_queue *);
117int blk_mq_register_disk(struct gendisk *);
118void blk_mq_unregister_disk(struct gendisk *);
119void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);
120
121void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
122
123void blk_mq_insert_request(struct request_queue *, struct request *, bool);
124void blk_mq_run_queues(struct request_queue *q, bool async);
125void blk_mq_free_request(struct request *rq);
126bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
127struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
128struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
129struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
130
131struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
132struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int);
133void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
134
135void blk_mq_end_io(struct request *rq, int error);
136
137void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
138void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
139void blk_mq_start_stopped_hw_queues(struct request_queue *q);
140
141/*
142 * Driver command data is immediately after the request. So subtract request
143 * size to get back to the original request.
144 */
145static inline struct request *blk_mq_rq_from_pdu(void *pdu)
146{
147 return pdu - sizeof(struct request);
148}
149static inline void *blk_mq_rq_to_pdu(struct request *rq)
150{
151 return (void *) rq + sizeof(*rq);
152}
153
154static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
155 unsigned int tag)
156{
157 return hctx->rqs[tag];
158}
159
160#define queue_for_each_hw_ctx(q, hctx, i) \
161 for ((i) = 0, hctx = (q)->queue_hw_ctx[0]; \
162 (i) < (q)->nr_hw_queues; (i)++, hctx = (q)->queue_hw_ctx[i])
163
164#define queue_for_each_ctx(q, ctx, i) \
165 for ((i) = 0, ctx = per_cpu_ptr((q)->queue_ctx, 0); \
166 (i) < (q)->nr_queues; (i)++, ctx = per_cpu_ptr(q->queue_ctx, (i)))
167
168#define hctx_for_each_ctx(hctx, ctx, i) \
169 for ((i) = 0, ctx = (hctx)->ctxs[0]; \
170 (i) < (hctx)->nr_ctx; (i)++, ctx = (hctx)->ctxs[(i)])
171
172#define blk_ctx_sum(q, sum) \
173({ \
174 struct blk_mq_ctx *__x; \
175 unsigned int __ret = 0, __i; \
176 \
177 queue_for_each_ctx((q), __x, __i) \
178 __ret += sum; \
179 __ret; \
180})
181
182#endif
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index c26801e14788..238ef0ed62f8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -178,6 +178,7 @@ enum rq_flag_bits {
 	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
 	__REQ_KERNEL,		/* direct IO to kernel pages */
 	__REQ_PM,		/* runtime pm request */
+	__REQ_END,		/* last of chain of requests */
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -229,5 +230,6 @@ enum rq_flag_bits {
 #define REQ_SECURE		(1ULL << __REQ_SECURE)
 #define REQ_KERNEL		(1ULL << __REQ_KERNEL)
 #define REQ_PM			(1ULL << __REQ_PM)
+#define REQ_END			(1ULL << __REQ_END)
 
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0a8da96274c3..f26ec20f6354 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -8,6 +8,7 @@
 #include <linux/major.h>
 #include <linux/genhd.h>
 #include <linux/list.h>
+#include <linux/llist.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/pagemap.h>
@@ -94,10 +95,17 @@ enum rq_cmd_type_bits {
  * as well!
  */
 struct request {
-	struct list_head queuelist;
-	struct call_single_data csd;
+	union {
+		struct list_head queuelist;
+		struct llist_node ll_list;
+	};
+	union {
+		struct call_single_data csd;
+		struct work_struct mq_flush_data;
+	};
 
 	struct request_queue *q;
+	struct blk_mq_ctx *mq_ctx;
 
 	u64 cmd_flags;
 	enum rq_cmd_type_bits cmd_type;
@@ -213,6 +221,8 @@ struct request_pm_state
 
 #include <linux/elevator.h>
 
+struct blk_queue_ctx;
+
 typedef void (request_fn_proc) (struct request_queue *q);
 typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
@@ -311,6 +321,18 @@ struct request_queue {
 	dma_drain_needed_fn	*dma_drain_needed;
 	lld_busy_fn		*lld_busy_fn;
 
+	struct blk_mq_ops	*mq_ops;
+
+	unsigned int		*mq_map;
+
+	/* sw queues */
+	struct blk_mq_ctx	*queue_ctx;
+	unsigned int		nr_queues;
+
+	/* hw dispatch queues */
+	struct blk_mq_hw_ctx	**queue_hw_ctx;
+	unsigned int		nr_hw_queues;
+
 	/*
 	 * Dispatch queue sorting
 	 */
@@ -359,6 +381,11 @@ struct request_queue {
 	 */
 	struct kobject kobj;
 
+	/*
+	 * mq queue kobject
+	 */
+	struct kobject mq_kobj;
+
 #ifdef CONFIG_PM_RUNTIME
 	struct device *dev;
 	int rpm_status;
@@ -423,7 +450,13 @@ struct request_queue {
 	unsigned long		flush_pending_since;
 	struct list_head	flush_queue[2];
 	struct list_head	flush_data_in_flight;
-	struct request		flush_rq;
+	union {
+		struct request	flush_rq;
+		struct {
+			spinlock_t mq_flush_lock;
+			struct work_struct mq_flush_work;
+		};
+	};
 
 	struct mutex		sysfs_lock;
 
@@ -435,14 +468,14 @@ struct request_queue {
 	struct bsg_class_device bsg_dev;
 #endif
 
-#ifdef CONFIG_BLK_CGROUP
-	struct list_head	all_q_node;
-#endif
 #ifdef CONFIG_BLK_DEV_THROTTLING
 	/* Throttle data */
 	struct throtl_data *td;
 #endif
 	struct rcu_head		rcu_head;
+	wait_queue_head_t	mq_freeze_wq;
+	struct percpu_counter	mq_usage_counter;
+	struct list_head	all_q_node;
 };
 
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
@@ -465,6 +498,7 @@ struct request_queue {
 #define QUEUE_FLAG_SECDISCARD	17	/* supports SECDISCARD */
 #define QUEUE_FLAG_SAME_FORCE	18	/* force complete on same CPU */
 #define QUEUE_FLAG_DEAD		19	/* queue tear-down finished */
+#define QUEUE_FLAG_INIT_DONE	20	/* queue is initialized */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
@@ -537,6 +571,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
 #define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
 #define blk_queue_bypass(q)	test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags)
+#define blk_queue_init_done(q)	test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_noxmerges(q)	\
 	test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
@@ -1011,6 +1046,7 @@ static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
 struct blk_plug {
 	unsigned long magic; /* detect uninitialized use-cases */
 	struct list_head list; /* requests */
+	struct list_head mq_list; /* blk-mq requests */
 	struct list_head cb_list; /* md requires an unplug callback */
 };
 #define BLK_MAX_REQUEST_COUNT 16
@@ -1048,7 +1084,10 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 {
 	struct blk_plug *plug = tsk->plug;
 
-	return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list));
+	return plug &&
+		(!list_empty(&plug->list) ||
+		 !list_empty(&plug->mq_list) ||
+		 !list_empty(&plug->cb_list));
 }
 
 /*
@@ -1323,6 +1362,7 @@ static inline void put_dev_sector(Sector p)
 
 struct work_struct;
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
+int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
 
 #ifdef CONFIG_BLK_CGROUP
 /*