blk-mq: new multi-queue block IO queueing mechanism

Linux currently has two models for block devices: - The classic request_fn based approach, where drivers use struct request units for IO. The block layer provides various helper functionalities to let drivers share code, things like tag management, timeout handling, queueing, etc. - The "stacked" approach, where a driver squeezes in between the block layer and IO submitter. Since this bypasses the IO stack, drivers generally have to manage everything themselves. With drivers being written for new high IOPS devices, the classic request_fn based driver doesn't work well enough. The design dates back to when both SMP and high IOPS were rare. It has problems with scaling to bigger machines, and runs into scaling issues even on smaller machines when you have IOPS in the hundreds of thousands per device. The stacked approach is then most often selected as the model for the driver. But this means that everybody has to re-invent everything, and along with that we get all the problems again that the shared approach solved. This commit introduces blk-mq, block multi queue support. The design is centered around per-cpu queues for queueing IO, which then funnel down into x number of hardware submission queues. We might have a 1:1 mapping between the two, or it might be an N:M mapping. That all depends on what the hardware supports. blk-mq provides various helper functions, which include: - Scalable support for request tagging. Most devices need to be able to uniquely identify a request both in the driver and to the hardware. The tagging uses per-cpu caches for freed tags, to enable cache hot reuse. - Timeout handling without tracking requests on a per-device basis. Basically the driver should be able to get a notification, if a request happens to fail. - Optional support for non 1:1 mappings between issue and submission queues. blk-mq can redirect IO completions to the desired location. - Support for per-request payloads. 
Drivers almost always need to associate a request structure with some driver private command structure. Drivers can tell blk-mq this at init time, and then any request handed to the driver will have the required size of memory associated with it. - Support for merging of IO, and plugging. The stacked model gets neither of these. Even for high IOPS devices, merging sequential IO reduces per-command overhead and thus increases bandwidth. For now, this is provided as a potential 3rd queueing model, with the hope being that, as it matures, it can replace both the classic and stacked model. That would get us back to having just 1 real model for block devices, leaving the stacked approach to dm/md devices (as it was originally intended). Contributions in this patch from the following people: Shaohua Li <shli@fusionio.com> Alexander Gordeev <agordeev@redhat.com> Christoph Hellwig <hch@infradead.org> Mike Christie <michaelc@cs.wisc.edu> Matias Bjorling <m@bjorling.me> Jeff Moyer <jmoyer@redhat.com> Acked-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@kernel.dk>
author: Jens Axboe <axboe@kernel.dk> 2013-10-24 04:20:05 -0400
committer: Jens Axboe <axboe@kernel.dk> 2013-10-25 06:56:00 -0400
commit: 320ae51feed5c2f13664aa05a76bec198967e04d (patch)
tree: ad37ccbcc5ddb1c9c19e48965bf8fec1b05217dc /include
parent: 1dddc01af0d42b21058e0cb9c1ca9e8d5204d9b0 (diff)
4 files changed, 233 insertions, 7 deletions
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ec48bac5b039..4c2775443dcf 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -419,6 +419,8 @@ static inline void bio_list_init(struct bio_list *bl)
        bl->head = bl->tail = NULL;
 }
+#define BIO_EMPTY_LIST  { NULL, NULL }
 #define bio_list_for_each(bio, bl) \
        for (bio = (bl)->head; bio; bio = bio->bi_next)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
new file mode 100644
index 000000000000..746042ff321a
--- /dev/null
+++ b/include/linux/blk-mq.h
@@ -0,0 +1,182 @@
+#ifndef BLK_MQ_H
+#define BLK_MQ_H
+#include <linux/blkdev.h>
+struct blk_mq_tags;
+struct blk_mq_cpu_notifier {
+        struct list_head list;
+        void *data;
+        void (*notify)(void *data, unsigned long action, unsigned int cpu);
+};
+struct blk_mq_hw_ctx {
+        struct {
+                spinlock_t              lock;
+                struct list_head        dispatch;
+        } ____cacheline_aligned_in_smp;
+        unsigned long           state;          /* BLK_MQ_S_* flags */
+        struct delayed_work     delayed_work;
+        unsigned long           flags;          /* BLK_MQ_F_* flags */
+        struct request_queue    *queue;
+        unsigned int            queue_num;
+        void                    *driver_data;
+        unsigned int            nr_ctx;
+        struct blk_mq_ctx       **ctxs;
+        unsigned int            nr_ctx_map;
+        unsigned long           *ctx_map;
+        struct request          **rqs;
+        struct list_head        page_list;
+        struct blk_mq_tags      *tags;
+        unsigned long           queued;
+        unsigned long           run;
+#define BLK_MQ_MAX_DISPATCH_ORDER       10
+        unsigned long           dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
+        unsigned int            queue_depth;
+        unsigned int            numa_node;
+        unsigned int            cmd_size;       /* per-request extra data */
+        struct blk_mq_cpu_notifier      cpu_notifier;
+        struct kobject          kobj;
+};
+struct blk_mq_reg {
+        struct blk_mq_ops       *ops;
+        unsigned int            nr_hw_queues;
+        unsigned int            queue_depth;
+        unsigned int            reserved_tags;
+        unsigned int            cmd_size;       /* per-request extra data */
+        int                     numa_node;
+        unsigned int            timeout;
+        unsigned int            flags;          /* BLK_MQ_F_* */
+};
+typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
+typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
+typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int);
+typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
+typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
+struct blk_mq_ops {
+        /*
+         * Queue request
+         */
+        queue_rq_fn             *queue_rq;
+        /*
+         * Map to specific hardware queue
+         */
+        map_queue_fn            *map_queue;
+        /*
+         * Called on request timeout
+         */
+        rq_timed_out_fn         *timeout;
+        /*
+         * Override for hctx allocations (should probably go)
+         */
+        alloc_hctx_fn           *alloc_hctx;
+        free_hctx_fn            *free_hctx;
+        /*
+         * Called when the block layer side of a hardware queue has been
+         * set up, allowing the driver to allocate/init matching structures.
+         * Ditto for exit/teardown.
+         */
+        init_hctx_fn            *init_hctx;
+        exit_hctx_fn            *exit_hctx;
+};
+enum {
+        BLK_MQ_RQ_QUEUE_OK      = 0,    /* queued fine */
+        BLK_MQ_RQ_QUEUE_BUSY    = 1,    /* requeue IO for later */
+        BLK_MQ_RQ_QUEUE_ERROR   = 2,    /* end IO with error */
+        BLK_MQ_F_SHOULD_MERGE   = 1 << 0,
+        BLK_MQ_F_SHOULD_SORT    = 1 << 1,
+        BLK_MQ_F_SHOULD_IPI     = 1 << 2,
+        BLK_MQ_S_STOPPED        = 1 << 0,
+        BLK_MQ_MAX_DEPTH        = 2048,
+};
+struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *);
+void blk_mq_free_queue(struct request_queue *);
+int blk_mq_register_disk(struct gendisk *);
+void blk_mq_unregister_disk(struct gendisk *);
+void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);
+void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
+void blk_mq_insert_request(struct request_queue *, struct request *, bool);
+void blk_mq_run_queues(struct request_queue *q, bool async);
+void blk_mq_free_request(struct request *rq);
+bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
+struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
+struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
+struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
+struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int);
+void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
+void blk_mq_end_io(struct request *rq, int error);
+void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
+void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
+void blk_mq_start_stopped_hw_queues(struct request_queue *q);
+/*
+ * Driver command data is immediately after the request. So subtract request
+ * size to get back to the original request.
+ */
+static inline struct request *blk_mq_rq_from_pdu(void *pdu)
+{
+        return pdu - sizeof(struct request);
+}
+static inline void *blk_mq_rq_to_pdu(struct request *rq)
+{
+        return (void *) rq + sizeof(*rq);
+}
+static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
+                                               unsigned int tag)
+{
+        return hctx->rqs[tag];
+}
+#define queue_for_each_hw_ctx(q, hctx, i)                               \
+        for ((i) = 0, hctx = (q)->queue_hw_ctx[0];                      \
+             (i) < (q)->nr_hw_queues; (i)++, hctx = (q)->queue_hw_ctx[i])
+#define queue_for_each_ctx(q, ctx, i)                                   \
+        for ((i) = 0, ctx = per_cpu_ptr((q)->queue_ctx, 0);             \
+             (i) < (q)->nr_queues; (i)++, ctx = per_cpu_ptr(q->queue_ctx, (i)))
+#define hctx_for_each_ctx(hctx, ctx, i)                                 \
+        for ((i) = 0, ctx = (hctx)->ctxs[0];                            \
+             (i) < (hctx)->nr_ctx; (i)++, ctx = (hctx)->ctxs[(i)])
+#define blk_ctx_sum(q, sum)                                             \
+({                                                                      \
+        struct blk_mq_ctx *__x;                                         \
+        unsigned int __ret = 0, __i;                                    \
+                                                                        \
+        queue_for_each_ctx((q), __x, __i)                               \
+                __ret += sum;                                           \
+        __ret;                                                          \
+})
+#endif
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index c26801e14788..238ef0ed62f8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -178,6 +178,7 @@ enum rq_flag_bits {
        __REQ_MIXED_MERGE,      /* merge of different types, fail separately */
        __REQ_KERNEL,           /* direct IO to kernel pages */
        __REQ_PM,               /* runtime pm request */
+        __REQ_END,              /* last of chain of requests */
        __REQ_NR_BITS,          /* stops here */
 };
@@ -229,5 +230,6 @@ enum rq_flag_bits {
 #define REQ_SECURE              (1ULL << __REQ_SECURE)
 #define REQ_KERNEL              (1ULL << __REQ_KERNEL)
 #define REQ_PM                  (1ULL << __REQ_PM)
+#define REQ_END                 (1ULL << __REQ_END)
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0a8da96274c3..f26ec20f6354 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -8,6 +8,7 @@
 #include <linux/major.h>
 #include <linux/genhd.h>
 #include <linux/list.h>
+#include <linux/llist.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/pagemap.h>
@@ -94,10 +95,17 @@ enum rq_cmd_type_bits {
 * as well!
 */
 struct request {
-        struct list_head queuelist;
+        union {
-        struct call_single_data csd;
+                struct list_head queuelist;
+                struct llist_node ll_list;
+        };
+        union {
+                struct call_single_data csd;
+                struct work_struct mq_flush_data;
+        };
        struct request_queue *q;
+        struct blk_mq_ctx *mq_ctx;
        u64 cmd_flags;
        enum rq_cmd_type_bits cmd_type;
@@ -213,6 +221,8 @@ struct request_pm_state
 #include <linux/elevator.h>
+struct blk_queue_ctx;
 typedef void (request_fn_proc) (struct request_queue *q);
 typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
@@ -311,6 +321,18 @@ struct request_queue {
        dma_drain_needed_fn     *dma_drain_needed;
        lld_busy_fn             *lld_busy_fn;
+        struct blk_mq_ops       *mq_ops;
+        unsigned int            *mq_map;
+        /* sw queues */
+        struct blk_mq_ctx       *queue_ctx;
+        unsigned int            nr_queues;
+        /* hw dispatch queues */
+        struct blk_mq_hw_ctx    **queue_hw_ctx;
+        unsigned int            nr_hw_queues;
        /*
         * Dispatch queue sorting
         */
@@ -359,6 +381,11 @@ struct request_queue {
         */
        struct kobject kobj;
+        /*
+         * mq queue kobject
+         */
+        struct kobject mq_kobj;
 #ifdef CONFIG_PM_RUNTIME
        struct device           *dev;
        int                     rpm_status;
@@ -423,7 +450,13 @@ struct request_queue {
        unsigned long           flush_pending_since;
        struct list_head        flush_queue[2];
        struct list_head        flush_data_in_flight;
-        struct request          flush_rq;
+        union {
+                struct request  flush_rq;
+                struct {
+                        spinlock_t mq_flush_lock;
+                        struct work_struct mq_flush_work;
+                };
+        };
        struct mutex            sysfs_lock;
@@ -435,14 +468,14 @@ struct request_queue {
        struct bsg_class_device bsg_dev;
 #endif
-#ifdef CONFIG_BLK_CGROUP
-        struct list_head        all_q_node;
-#endif
 #ifdef CONFIG_BLK_DEV_THROTTLING
        /* Throttle data */
        struct throtl_data *td;
 #endif
        struct rcu_head         rcu_head;
+        wait_queue_head_t       mq_freeze_wq;
+        struct percpu_counter   mq_usage_counter;
+        struct list_head        all_q_node;
 };
 #define QUEUE_FLAG_QUEUED       1       /* uses generic tag queueing */
@@ -465,6 +498,7 @@ struct request_queue {
 #define QUEUE_FLAG_SECDISCARD  17       /* supports SECDISCARD */
 #define QUEUE_FLAG_SAME_FORCE  18       /* force complete on same CPU */
 #define QUEUE_FLAG_DEAD        19       /* queue tear-down finished */
+#define QUEUE_FLAG_INIT_DONE   20       /* queue is initialized */
 #define QUEUE_FLAG_DEFAULT      ((1 << QUEUE_FLAG_IO_STAT) |            \
                                 (1 << QUEUE_FLAG_STACKABLE)    |       \
@@ -537,6 +571,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 #define blk_queue_dying(q)      test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
 #define blk_queue_dead(q)       test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
 #define blk_queue_bypass(q)     test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags)
+#define blk_queue_init_done(q)  test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
 #define blk_queue_nomerges(q)   test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_noxmerges(q)  \
        test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
@@ -1011,6 +1046,7 @@ static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
 struct blk_plug {
        unsigned long magic; /* detect uninitialized use-cases */
        struct list_head list; /* requests */
+        struct list_head mq_list; /* blk-mq requests */
        struct list_head cb_list; /* md requires an unplug callback */
 };
 #define BLK_MAX_REQUEST_COUNT 16
@@ -1048,7 +1084,10 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 {
        struct blk_plug *plug = tsk->plug;
-        return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list));
+        return plug &&
+                (!list_empty(&plug->list) ||
+                 !list_empty(&plug->mq_list) ||
+                 !list_empty(&plug->cb_list));
 }
 /*
@@ -1323,6 +1362,7 @@ static inline void put_dev_sector(Sector p)
 struct work_struct;
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
+int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
 #ifdef CONFIG_BLK_CGROUP
 /*
author	Jens Axboe <axboe@kernel.dk>	2013-10-24 04:20:05 -0400
committer	Jens Axboe <axboe@kernel.dk>	2013-10-25 06:56:00 -0400
commit	320ae51feed5c2f13664aa05a76bec198967e04d (patch)
tree	ad37ccbcc5ddb1c9c19e48965bf8fec1b05217dc /include
parent	1dddc01af0d42b21058e0cb9c1ca9e8d5204d9b0 (diff)

diff --git a/include/linux/bio.h b/include/linux/bio.h index ec48bac5b039..4c2775443dcf 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h
@@ -419,6 +419,8 @@ static inline void bio_list_init(struct bio_list *bl)
419	bl->head = bl->tail = NULL;	419	bl->head = bl->tail = NULL;
420	}	420	}
421		421
		422	#define BIO_EMPTY_LIST { NULL, NULL }
		423
422	#define bio_list_for_each(bio, bl) \	424	#define bio_list_for_each(bio, bl) \
423	for (bio = (bl)->head; bio; bio = bio->bi_next)	425	for (bio = (bl)->head; bio; bio = bio->bi_next)
424		426


diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h new file mode 100644 index 000000000000..746042ff321a --- /dev/null +++ b/include/linux/blk-mq.h
@@ -0,0 +1,182 @@
		1	#ifndef BLK_MQ_H
		2	#define BLK_MQ_H
		3
		4	#include <linux/blkdev.h>
		5
		6	struct blk_mq_tags;
		7
		8	struct blk_mq_cpu_notifier {
		9	struct list_head list;
		10	void *data;
		11	void (*notify)(void *data, unsigned long action, unsigned int cpu);
		12	};
		13
		14	struct blk_mq_hw_ctx {
		15	struct {
		16	spinlock_t lock;
		17	struct list_head dispatch;
		18	} ____cacheline_aligned_in_smp;
		19
		20	unsigned long state; /* BLK_MQ_S_* flags */
		21	struct delayed_work delayed_work;
		22
		23	unsigned long flags; /* BLK_MQ_F_* flags */
		24
		25	struct request_queue *queue;
		26	unsigned int queue_num;
		27
		28	void *driver_data;
		29
		30	unsigned int nr_ctx;
		31	struct blk_mq_ctx **ctxs;
		32	unsigned int nr_ctx_map;
		33	unsigned long *ctx_map;
		34
		35	struct request **rqs;
		36	struct list_head page_list;
		37	struct blk_mq_tags *tags;
		38
		39	unsigned long queued;
		40	unsigned long run;
		41	#define BLK_MQ_MAX_DISPATCH_ORDER 10
		42	unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
		43
		44	unsigned int queue_depth;
		45	unsigned int numa_node;
		46	unsigned int cmd_size; /* per-request extra data */
		47
		48	struct blk_mq_cpu_notifier cpu_notifier;
		49	struct kobject kobj;
		50	};
		51
		52	struct blk_mq_reg {
		53	struct blk_mq_ops *ops;
		54	unsigned int nr_hw_queues;
		55	unsigned int queue_depth;
		56	unsigned int reserved_tags;
		57	unsigned int cmd_size; /* per-request extra data */
		58	int numa_node;
		59	unsigned int timeout;
		60	unsigned int flags; /* BLK_MQ_F_* */
		61	};
		62
		63	typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
		64	typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
		65	typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int);
		66	typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
		67	typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
		68	typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
		69
		70	struct blk_mq_ops {
		71	/*
		72	* Queue request
		73	*/
		74	queue_rq_fn *queue_rq;
		75
		76	/*
		77	* Map to specific hardware queue
		78	*/
		79	map_queue_fn *map_queue;
		80
		81	/*
		82	* Called on request timeout
		83	*/
		84	rq_timed_out_fn *timeout;
		85
		86	/*
		87	* Override for hctx allocations (should probably go)
		88	*/
		89	alloc_hctx_fn *alloc_hctx;
		90	free_hctx_fn *free_hctx;
		91
		92	/*
		93	* Called when the block layer side of a hardware queue has been
		94	* set up, allowing the driver to allocate/init matching structures.
		95	* Ditto for exit/teardown.
		96	*/
		97	init_hctx_fn *init_hctx;
		98	exit_hctx_fn *exit_hctx;
		99	};
		100
		101	enum {
		102	BLK_MQ_RQ_QUEUE_OK = 0, /* queued fine */
		103	BLK_MQ_RQ_QUEUE_BUSY = 1, /* requeue IO for later */
		104	BLK_MQ_RQ_QUEUE_ERROR = 2, /* end IO with error */
		105
		106	BLK_MQ_F_SHOULD_MERGE = 1 << 0,
		107	BLK_MQ_F_SHOULD_SORT = 1 << 1,
		108	BLK_MQ_F_SHOULD_IPI = 1 << 2,
		109
		110	BLK_MQ_S_STOPPED = 1 << 0,
		111
		112	BLK_MQ_MAX_DEPTH = 2048,
		113	};
		114
		115	struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *);
		116	void blk_mq_free_queue(struct request_queue *);
		117	int blk_mq_register_disk(struct gendisk *);
		118	void blk_mq_unregister_disk(struct gendisk *);
		119	void blk_mq_init_commands(struct request_queue *, void (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);
		120
		121	void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
		122
		123	void blk_mq_insert_request(struct request_queue *, struct request *, bool);
		124	void blk_mq_run_queues(struct request_queue *q, bool async);
		125	void blk_mq_free_request(struct request *rq);
		126	bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
		127	struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
		128	struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
		129	struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
		130
		131	struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
		132	struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int);
		133	void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
		134
		135	void blk_mq_end_io(struct request *rq, int error);
		136
		137	void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
		138	void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
		139	void blk_mq_start_stopped_hw_queues(struct request_queue *q);
		140
		141	/*
		142	* Driver command data is immediately after the request. So subtract request
		143	* size to get back to the original request.
		144	*/
		145	static inline struct request *blk_mq_rq_from_pdu(void *pdu)
		146	{
		147	return pdu - sizeof(struct request);
		148	}
		149	static inline void *blk_mq_rq_to_pdu(struct request *rq)
		150	{
		151	return (void *) rq + sizeof(*rq);
		152	}
		153
		154	static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
		155	unsigned int tag)
		156	{
		157	return hctx->rqs[tag];
		158	}
		159
		160	#define queue_for_each_hw_ctx(q, hctx, i) \
		161	for ((i) = 0, hctx = (q)->queue_hw_ctx[0]; \
		162	(i) < (q)->nr_hw_queues; (i)++, hctx = (q)->queue_hw_ctx[i])
		163
		164	#define queue_for_each_ctx(q, ctx, i) \
		165	for ((i) = 0, ctx = per_cpu_ptr((q)->queue_ctx, 0); \
		166	(i) < (q)->nr_queues; (i)++, ctx = per_cpu_ptr(q->queue_ctx, (i)))
		167
		168	#define hctx_for_each_ctx(hctx, ctx, i) \
		169	for ((i) = 0, ctx = (hctx)->ctxs[0]; \
		170	(i) < (hctx)->nr_ctx; (i)++, ctx = (hctx)->ctxs[(i)])
		171
		172	#define blk_ctx_sum(q, sum) \
		173	({ \
		174	struct blk_mq_ctx *__x; \
		175	unsigned int __ret = 0, __i; \
		176	\
		177	queue_for_each_ctx((q), __x, __i) \
		178	__ret += sum; \
		179	__ret; \
		180	})
		181
		182	#endif


diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c26801e14788..238ef0ed62f8 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h
@@ -178,6 +178,7 @@ enum rq_flag_bits {
178	__REQ_MIXED_MERGE, /* merge of different types, fail separately */	178	__REQ_MIXED_MERGE, /* merge of different types, fail separately */
179	__REQ_KERNEL, /* direct IO to kernel pages */	179	__REQ_KERNEL, /* direct IO to kernel pages */
180	__REQ_PM, /* runtime pm request */	180	__REQ_PM, /* runtime pm request */
		181	__REQ_END, /* last of chain of requests */
181	__REQ_NR_BITS, /* stops here */	182	__REQ_NR_BITS, /* stops here */
182	};	183	};
183		184
@@ -229,5 +230,6 @@ enum rq_flag_bits {
229	#define REQ_SECURE (1ULL << __REQ_SECURE)	230	#define REQ_SECURE (1ULL << __REQ_SECURE)
230	#define REQ_KERNEL (1ULL << __REQ_KERNEL)	231	#define REQ_KERNEL (1ULL << __REQ_KERNEL)
231	#define REQ_PM (1ULL << __REQ_PM)	232	#define REQ_PM (1ULL << __REQ_PM)
		233	#define REQ_END (1ULL << __REQ_END)
232		234
233	#endif /* __LINUX_BLK_TYPES_H */	235	#endif /* __LINUX_BLK_TYPES_H */


diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0a8da96274c3..f26ec20f6354 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h
@@ -8,6 +8,7 @@
8	#include <linux/major.h>	8	#include <linux/major.h>
9	#include <linux/genhd.h>	9	#include <linux/genhd.h>
10	#include <linux/list.h>	10	#include <linux/list.h>
		11	#include <linux/llist.h>
11	#include <linux/timer.h>	12	#include <linux/timer.h>
12	#include <linux/workqueue.h>	13	#include <linux/workqueue.h>
13	#include <linux/pagemap.h>	14	#include <linux/pagemap.h>
@@ -94,10 +95,17 @@ enum rq_cmd_type_bits {
94	* as well!	95	* as well!
95	*/	96	*/
96	struct request {	97	struct request {
97	struct list_head queuelist;	98	union {
98	struct call_single_data csd;	99	struct list_head queuelist;
		100	struct llist_node ll_list;
		101	};
		102	union {
		103	struct call_single_data csd;
		104	struct work_struct mq_flush_data;
		105	};
99		106
100	struct request_queue *q;	107	struct request_queue *q;
		108	struct blk_mq_ctx *mq_ctx;
101		109
102	u64 cmd_flags;	110	u64 cmd_flags;
103	enum rq_cmd_type_bits cmd_type;	111	enum rq_cmd_type_bits cmd_type;
@@ -213,6 +221,8 @@ struct request_pm_state
213		221
214	#include <linux/elevator.h>	222	#include <linux/elevator.h>
215		223
		224	struct blk_queue_ctx;
		225
216	typedef void (request_fn_proc) (struct request_queue *q);	226	typedef void (request_fn_proc) (struct request_queue *q);
217	typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);	227	typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
218	typedef int (prep_rq_fn) (struct request_queue *, struct request *);	228	typedef int (prep_rq_fn) (struct request_queue *, struct request *);
@@ -311,6 +321,18 @@ struct request_queue {
311	dma_drain_needed_fn *dma_drain_needed;	321	dma_drain_needed_fn *dma_drain_needed;
312	lld_busy_fn *lld_busy_fn;	322	lld_busy_fn *lld_busy_fn;
313		323
		324	struct blk_mq_ops *mq_ops;
		325
		326	unsigned int *mq_map;
		327
		328	/* sw queues */
		329	struct blk_mq_ctx *queue_ctx;
		330	unsigned int nr_queues;
		331
		332	/* hw dispatch queues */
		333	struct blk_mq_hw_ctx **queue_hw_ctx;
		334	unsigned int nr_hw_queues;
		335
314	/*	336	/*
315	* Dispatch queue sorting	337	* Dispatch queue sorting
316	*/	338	*/
@@ -359,6 +381,11 @@ struct request_queue {
359	*/	381	*/
360	struct kobject kobj;	382	struct kobject kobj;
361		383
		384	/*
		385	* mq queue kobject
		386	*/
		387	struct kobject mq_kobj;
		388
362	#ifdef CONFIG_PM_RUNTIME	389	#ifdef CONFIG_PM_RUNTIME
363	struct device *dev;	390	struct device *dev;
364	int rpm_status;	391	int rpm_status;
@@ -423,7 +450,13 @@ struct request_queue {
423	unsigned long flush_pending_since;	450	unsigned long flush_pending_since;
424	struct list_head flush_queue[2];	451	struct list_head flush_queue[2];
425	struct list_head flush_data_in_flight;	452	struct list_head flush_data_in_flight;
426	struct request flush_rq;	453	union {
		454	struct request flush_rq;
		455	struct {
		456	spinlock_t mq_flush_lock;
		457	struct work_struct mq_flush_work;
		458	};
		459	};
427		460
428	struct mutex sysfs_lock;	461	struct mutex sysfs_lock;
429		462
@@ -435,14 +468,14 @@ struct request_queue {
435	struct bsg_class_device bsg_dev;	468	struct bsg_class_device bsg_dev;
436	#endif	469	#endif
437		470
438	#ifdef CONFIG_BLK_CGROUP
439	struct list_head all_q_node;
440	#endif
441	#ifdef CONFIG_BLK_DEV_THROTTLING	471	#ifdef CONFIG_BLK_DEV_THROTTLING
442	/* Throttle data */	472	/* Throttle data */
443	struct throtl_data *td;	473	struct throtl_data *td;
444	#endif	474	#endif
445	struct rcu_head rcu_head;	475	struct rcu_head rcu_head;
		476	wait_queue_head_t mq_freeze_wq;
		477	struct percpu_counter mq_usage_counter;
		478	struct list_head all_q_node;
446	};	479	};
447		480
448	#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */	481	#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
@@ -465,6 +498,7 @@ struct request_queue {
465	#define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */	498	#define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */
466	#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */	499	#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */
467	#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */	500	#define QUEUE_FLAG_DEAD 19 /* queue tear-down finished */
		501	#define QUEUE_FLAG_INIT_DONE 20 /* queue is initialized */
468		502
469	#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \	503	#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
470	(1 << QUEUE_FLAG_STACKABLE) | \	504	(1 << QUEUE_FLAG_STACKABLE) | \
@@ -537,6 +571,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
537	#define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)	571	#define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
538	#define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)	572	#define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
539	#define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags)	573	#define blk_queue_bypass(q) test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags)
		574	#define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
540	#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)	575	#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
541	#define blk_queue_noxmerges(q) \	576	#define blk_queue_noxmerges(q) \
542	test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)	577	test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
@@ -1011,6 +1046,7 @@ static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
1011	struct blk_plug {	1046	struct blk_plug {
1012	unsigned long magic; /* detect uninitialized use-cases */	1047	unsigned long magic; /* detect uninitialized use-cases */
1013	struct list_head list; /* requests */	1048	struct list_head list; /* requests */
		1049	struct list_head mq_list; /* blk-mq requests */
1014	struct list_head cb_list; /* md requires an unplug callback */	1050	struct list_head cb_list; /* md requires an unplug callback */
1015	};	1051	};
1016	#define BLK_MAX_REQUEST_COUNT 16	1052	#define BLK_MAX_REQUEST_COUNT 16
@@ -1048,7 +1084,10 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
1048	{	1084	{
1049	struct blk_plug *plug = tsk->plug;	1085	struct blk_plug *plug = tsk->plug;
1050		1086
1051	return plug && (!list_empty(&plug->list) || !list_empty(&plug->cb_list));	1087	return plug &&
		1088	(!list_empty(&plug->list) ||
		1089	!list_empty(&plug->mq_list) ||
		1090	!list_empty(&plug->cb_list));
1052	}	1091	}
1053		1092
1054	/*	1093	/*
@@ -1323,6 +1362,7 @@ static inline void put_dev_sector(Sector p)
1323		1362
1324	struct work_struct;	1363	struct work_struct;
1325	int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);	1364	int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
		1365	int kblockd_schedule_delayed_work(struct request_queue *q, struct delayed_work *dwork, unsigned long delay);
1326		1366
1327	#ifdef CONFIG_BLK_CGROUP	1367	#ifdef CONFIG_BLK_CGROUP
1328	/*	1368	/*