aboutsummaryrefslogtreecommitdiffstats
path: root/block/blk-flush.c
diff options
context:
space:
mode:
author Jens Axboe <axboe@kernel.dk> 2013-10-24 04:20:05 -0400
committer Jens Axboe <axboe@kernel.dk> 2013-10-25 06:56:00 -0400
commit 320ae51feed5c2f13664aa05a76bec198967e04d (patch)
tree ad37ccbcc5ddb1c9c19e48965bf8fec1b05217dc /block/blk-flush.c
parent 1dddc01af0d42b21058e0cb9c1ca9e8d5204d9b0 (diff)
blk-mq: new multi-queue block IO queueing mechanism
Linux currently has two models for block devices: - The classic request_fn based approach, where drivers use struct request units for IO. The block layer provides various helper functionalities to let drivers share code, things like tag management, timeout handling, queueing, etc. - The "stacked" approach, where a driver squeezes in between the block layer and IO submitter. Since this bypasses the IO stack, drivers generally have to manage everything themselves. With drivers being written for new high IOPS devices, the classic request_fn based driver doesn't work well enough. The design dates back to when both SMP and high IOPS were rare. It has problems with scaling to bigger machines, and runs into scaling issues even on smaller machines when you have IOPS in the hundreds of thousands per device. The stacked approach is then most often selected as the model for the driver. But this means that everybody has to re-invent everything, and along with that we get all the problems again that the shared approach solved. This commit introduces blk-mq, block multi queue support. The design is centered around per-cpu queues for queueing IO, which then funnel down into x number of hardware submission queues. We might have a 1:1 mapping between the two, or it might be an N:M mapping. That all depends on what the hardware supports. blk-mq provides various helper functions, which include: - Scalable support for request tagging. Most devices need to be able to uniquely identify a request both in the driver and to the hardware. The tagging uses per-cpu caches for freed tags, to enable cache hot reuse. - Timeout handling without tracking requests on a per-device basis. Basically the driver should be able to get a notification, if a request happens to fail. - Optional support for non 1:1 mappings between issue and submission queues. blk-mq can redirect IO completions to the desired location. - Support for per-request payloads.
Drivers almost always need to associate a request structure with some driver private command structure. Drivers can tell blk-mq this at init time, and then any request handed to the driver will have the required size of memory associated with it. - Support for merging of IO, and plugging. The stacked model gets neither of these. Even for high IOPS devices, merging sequential IO reduces per-command overhead and thus increases bandwidth. For now, this is provided as a potential 3rd queueing model, with the hope being that, as it matures, it can replace both the classic and stacked model. That would get us back to having just 1 real model for block devices, leaving the stacked approach to dm/md devices (as it was originally intended). Contributions in this patch from the following people: Shaohua Li <shli@fusionio.com> Alexander Gordeev <agordeev@redhat.com> Christoph Hellwig <hch@infradead.org> Mike Christie <michaelc@cs.wisc.edu> Matias Bjorling <m@bjorling.me> Jeff Moyer <jmoyer@redhat.com> Acked-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'block/blk-flush.c')
-rw-r--r-- block/blk-flush.c 154
1 files changed, 139 insertions, 15 deletions
diff --git a/block/blk-flush.c b/block/blk-flush.c
index cc2b827a853c..3e4cc9c7890a 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -69,8 +69,10 @@
69#include <linux/bio.h> 69#include <linux/bio.h>
70#include <linux/blkdev.h> 70#include <linux/blkdev.h>
71#include <linux/gfp.h> 71#include <linux/gfp.h>
72#include <linux/blk-mq.h>
72 73
73#include "blk.h" 74#include "blk.h"
75#include "blk-mq.h"
74 76
75/* FLUSH/FUA sequences */ 77/* FLUSH/FUA sequences */
76enum { 78enum {
@@ -124,6 +126,24 @@ static void blk_flush_restore_request(struct request *rq)
124 /* make @rq a normal request */ 126 /* make @rq a normal request */
125 rq->cmd_flags &= ~REQ_FLUSH_SEQ; 127 rq->cmd_flags &= ~REQ_FLUSH_SEQ;
126 rq->end_io = rq->flush.saved_end_io; 128 rq->end_io = rq->flush.saved_end_io;
129
130 blk_clear_rq_complete(rq);
131}
132
133static void mq_flush_data_run(struct work_struct *work)
134{
135 struct request *rq;
136
137 rq = container_of(work, struct request, mq_flush_data);
138
139 memset(&rq->csd, 0, sizeof(rq->csd));
140 blk_mq_run_request(rq, true, false);
141}
142
143static void blk_mq_flush_data_insert(struct request *rq)
144{
145 INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
146 kblockd_schedule_work(rq->q, &rq->mq_flush_data);
127} 147}
128 148
129/** 149/**
@@ -136,7 +156,7 @@ static void blk_flush_restore_request(struct request *rq)
136 * completion and trigger the next step. 156 * completion and trigger the next step.
137 * 157 *
138 * CONTEXT: 158 * CONTEXT:
139 * spin_lock_irq(q->queue_lock) 159 * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
140 * 160 *
141 * RETURNS: 161 * RETURNS:
142 * %true if requests were added to the dispatch queue, %false otherwise. 162 * %true if requests were added to the dispatch queue, %false otherwise.
@@ -146,7 +166,7 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
146{ 166{
147 struct request_queue *q = rq->q; 167 struct request_queue *q = rq->q;
148 struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; 168 struct list_head *pending = &q->flush_queue[q->flush_pending_idx];
149 bool queued = false; 169 bool queued = false, kicked;
150 170
151 BUG_ON(rq->flush.seq & seq); 171 BUG_ON(rq->flush.seq & seq);
152 rq->flush.seq |= seq; 172 rq->flush.seq |= seq;
@@ -167,8 +187,12 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
167 187
168 case REQ_FSEQ_DATA: 188 case REQ_FSEQ_DATA:
169 list_move_tail(&rq->flush.list, &q->flush_data_in_flight); 189 list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
170 list_add(&rq->queuelist, &q->queue_head); 190 if (q->mq_ops)
171 queued = true; 191 blk_mq_flush_data_insert(rq);
192 else {
193 list_add(&rq->queuelist, &q->queue_head);
194 queued = true;
195 }
172 break; 196 break;
173 197
174 case REQ_FSEQ_DONE: 198 case REQ_FSEQ_DONE:
@@ -181,28 +205,43 @@ static bool blk_flush_complete_seq(struct request *rq, unsigned int seq,
181 BUG_ON(!list_empty(&rq->queuelist)); 205 BUG_ON(!list_empty(&rq->queuelist));
182 list_del_init(&rq->flush.list); 206 list_del_init(&rq->flush.list);
183 blk_flush_restore_request(rq); 207 blk_flush_restore_request(rq);
184 __blk_end_request_all(rq, error); 208 if (q->mq_ops)
209 blk_mq_end_io(rq, error);
210 else
211 __blk_end_request_all(rq, error);
185 break; 212 break;
186 213
187 default: 214 default:
188 BUG(); 215 BUG();
189 } 216 }
190 217
191 return blk_kick_flush(q) | queued; 218 kicked = blk_kick_flush(q);
219 /* blk_mq_run_flush will run queue */
220 if (q->mq_ops)
221 return queued;
222 return kicked | queued;
192} 223}
193 224
194static void flush_end_io(struct request *flush_rq, int error) 225static void flush_end_io(struct request *flush_rq, int error)
195{ 226{
196 struct request_queue *q = flush_rq->q; 227 struct request_queue *q = flush_rq->q;
197 struct list_head *running = &q->flush_queue[q->flush_running_idx]; 228 struct list_head *running;
198 bool queued = false; 229 bool queued = false;
199 struct request *rq, *n; 230 struct request *rq, *n;
231 unsigned long flags = 0;
200 232
233 if (q->mq_ops) {
234 blk_mq_free_request(flush_rq);
235 spin_lock_irqsave(&q->mq_flush_lock, flags);
236 }
237 running = &q->flush_queue[q->flush_running_idx];
201 BUG_ON(q->flush_pending_idx == q->flush_running_idx); 238 BUG_ON(q->flush_pending_idx == q->flush_running_idx);
202 239
203 /* account completion of the flush request */ 240 /* account completion of the flush request */
204 q->flush_running_idx ^= 1; 241 q->flush_running_idx ^= 1;
205 elv_completed_request(q, flush_rq); 242
243 if (!q->mq_ops)
244 elv_completed_request(q, flush_rq);
206 245
207 /* and push the waiting requests to the next stage */ 246 /* and push the waiting requests to the next stage */
208 list_for_each_entry_safe(rq, n, running, flush.list) { 247 list_for_each_entry_safe(rq, n, running, flush.list) {
@@ -223,9 +262,48 @@ static void flush_end_io(struct request *flush_rq, int error)
223 * directly into request_fn may confuse the driver. Always use 262 * directly into request_fn may confuse the driver. Always use
224 * kblockd. 263 * kblockd.
225 */ 264 */
226 if (queued || q->flush_queue_delayed) 265 if (queued || q->flush_queue_delayed) {
227 blk_run_queue_async(q); 266 if (!q->mq_ops)
267 blk_run_queue_async(q);
268 else
269 /*
270 * This can be optimized to only run queues with requests
271 * queued if necessary.
272 */
273 blk_mq_run_queues(q, true);
274 }
228 q->flush_queue_delayed = 0; 275 q->flush_queue_delayed = 0;
276 if (q->mq_ops)
277 spin_unlock_irqrestore(&q->mq_flush_lock, flags);
278}
279
280static void mq_flush_work(struct work_struct *work)
281{
282 struct request_queue *q;
283 struct request *rq;
284
285 q = container_of(work, struct request_queue, mq_flush_work);
286
287 /* We don't need set REQ_FLUSH_SEQ, it's for consistency */
288 rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
289 __GFP_WAIT|GFP_ATOMIC);
290 rq->cmd_type = REQ_TYPE_FS;
291 rq->end_io = flush_end_io;
292
293 blk_mq_run_request(rq, true, false);
294}
295
296/*
297 * We can't directly use q->flush_rq, because it doesn't have a tag and is not
298 * in hctx->rqs[], so we must allocate a new request. Since we can't sleep
299 * here, offload the work to a workqueue.
300 *
301 * Note: we assume a flush request finished in any hardware queue will flush
302 * the whole disk cache.
303 */
304static void mq_run_flush(struct request_queue *q)
305{
306 kblockd_schedule_work(q, &q->mq_flush_work);
229} 307}
230 308
231/** 309/**
@@ -236,7 +314,7 @@ static void flush_end_io(struct request *flush_rq, int error)
236 * Please read the comment at the top of this file for more info. 314 * Please read the comment at the top of this file for more info.
237 * 315 *
238 * CONTEXT: 316 * CONTEXT:
239 * spin_lock_irq(q->queue_lock) 317 * spin_lock_irq(q->queue_lock or q->mq_flush_lock)
240 * 318 *
241 * RETURNS: 319 * RETURNS:
242 * %true if flush was issued, %false otherwise. 320 * %true if flush was issued, %false otherwise.
@@ -261,13 +339,18 @@ static bool blk_kick_flush(struct request_queue *q)
261 * Issue flush and toggle pending_idx. This makes pending_idx 339 * Issue flush and toggle pending_idx. This makes pending_idx
262 * different from running_idx, which means flush is in flight. 340 * different from running_idx, which means flush is in flight.
263 */ 341 */
342 q->flush_pending_idx ^= 1;
343 if (q->mq_ops) {
344 mq_run_flush(q);
345 return true;
346 }
347
264 blk_rq_init(q, &q->flush_rq); 348 blk_rq_init(q, &q->flush_rq);
265 q->flush_rq.cmd_type = REQ_TYPE_FS; 349 q->flush_rq.cmd_type = REQ_TYPE_FS;
266 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ; 350 q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
267 q->flush_rq.rq_disk = first_rq->rq_disk; 351 q->flush_rq.rq_disk = first_rq->rq_disk;
268 q->flush_rq.end_io = flush_end_io; 352 q->flush_rq.end_io = flush_end_io;
269 353
270 q->flush_pending_idx ^= 1;
271 list_add_tail(&q->flush_rq.queuelist, &q->queue_head); 354 list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
272 return true; 355 return true;
273} 356}
@@ -284,16 +367,37 @@ static void flush_data_end_io(struct request *rq, int error)
284 blk_run_queue_async(q); 367 blk_run_queue_async(q);
285} 368}
286 369
370static void mq_flush_data_end_io(struct request *rq, int error)
371{
372 struct request_queue *q = rq->q;
373 struct blk_mq_hw_ctx *hctx;
374 struct blk_mq_ctx *ctx;
375 unsigned long flags;
376
377 ctx = rq->mq_ctx;
378 hctx = q->mq_ops->map_queue(q, ctx->cpu);
379
380 /*
381 * After populating an empty queue, kick it to avoid stall. Read
382 * the comment in flush_end_io().
383 */
384 spin_lock_irqsave(&q->mq_flush_lock, flags);
385 if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error))
386 blk_mq_run_hw_queue(hctx, true);
387 spin_unlock_irqrestore(&q->mq_flush_lock, flags);
388}
389
287/** 390/**
288 * blk_insert_flush - insert a new FLUSH/FUA request 391 * blk_insert_flush - insert a new FLUSH/FUA request
289 * @rq: request to insert 392 * @rq: request to insert
290 * 393 *
291 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. 394 * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
395 * or __blk_mq_run_hw_queue() to dispatch request.
292 * @rq is being submitted. Analyze what needs to be done and put it on the 396 * @rq is being submitted. Analyze what needs to be done and put it on the
293 * right queue. 397 * right queue.
294 * 398 *
295 * CONTEXT: 399 * CONTEXT:
296 * spin_lock_irq(q->queue_lock) 400 * spin_lock_irq(q->queue_lock) in !mq case
297 */ 401 */
298void blk_insert_flush(struct request *rq) 402void blk_insert_flush(struct request *rq)
299{ 403{
@@ -316,7 +420,10 @@ void blk_insert_flush(struct request *rq)
316 * complete the request. 420 * complete the request.
317 */ 421 */
318 if (!policy) { 422 if (!policy) {
319 __blk_end_bidi_request(rq, 0, 0, 0); 423 if (q->mq_ops)
424 blk_mq_end_io(rq, 0);
425 else
426 __blk_end_bidi_request(rq, 0, 0, 0);
320 return; 427 return;
321 } 428 }
322 429
@@ -329,7 +436,10 @@ void blk_insert_flush(struct request *rq)
329 */ 436 */
330 if ((policy & REQ_FSEQ_DATA) && 437 if ((policy & REQ_FSEQ_DATA) &&
331 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { 438 !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
332 list_add_tail(&rq->queuelist, &q->queue_head); 439 if (q->mq_ops) {
440 blk_mq_run_request(rq, false, true);
441 } else
442 list_add_tail(&rq->queuelist, &q->queue_head);
333 return; 443 return;
334 } 444 }
335 445
@@ -341,6 +451,14 @@ void blk_insert_flush(struct request *rq)
341 INIT_LIST_HEAD(&rq->flush.list); 451 INIT_LIST_HEAD(&rq->flush.list);
342 rq->cmd_flags |= REQ_FLUSH_SEQ; 452 rq->cmd_flags |= REQ_FLUSH_SEQ;
343 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ 453 rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
454 if (q->mq_ops) {
455 rq->end_io = mq_flush_data_end_io;
456
457 spin_lock_irq(&q->mq_flush_lock);
458 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
459 spin_unlock_irq(&q->mq_flush_lock);
460 return;
461 }
344 rq->end_io = flush_data_end_io; 462 rq->end_io = flush_data_end_io;
345 463
346 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); 464 blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
@@ -453,3 +571,9 @@ int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
453 return ret; 571 return ret;
454} 572}
455EXPORT_SYMBOL(blkdev_issue_flush); 573EXPORT_SYMBOL(blkdev_issue_flush);
574
575void blk_mq_init_flush(struct request_queue *q)
576{
577 spin_lock_init(&q->mq_flush_lock);
578 INIT_WORK(&q->mq_flush_work, mq_flush_work);
579}