summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMing Lei <ming.lei@redhat.com>2017-11-02 11:24:38 -0400
committerJens Axboe <axboe@kernel.dk>2017-11-04 14:40:13 -0400
commit923218f6166a84688973acdc39094f3bee1e9ad4 (patch)
tree1013b8c39764532780292633f7e4214c99480aaf
parent244c65a3ccaa06fd15cc940315606674d3108b2f (diff)
blk-mq: don't allocate driver tag upfront for flush rq
The idea behind it is simple: 1) for the none scheduler, the driver tag has to be borrowed for the flush rq, otherwise we may run out of tags, and that causes an IO hang. And get/put driver tag is actually a no-op for none, so reordering tags isn't necessary at all. 2) for a real I/O scheduler, we need not allocate a driver tag upfront for the flush rq. It works just fine to follow the same approach as normal requests: allocate the driver tag for each rq just before calling ->queue_rq(). One driver-visible change is that the driver tag isn't shared in the flush request sequence. That won't be a problem, since we always do that in the legacy path. Then the flush rq need not be treated specially with respect to get/put driver tag. This cleans up the code - for instance, reorder_tags_to_front() can be removed, and we needn't worry about request ordering in the dispatch list for avoiding I/O deadlock. Also we have to put the driver tag before requeueing. Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--block/blk-flush.c35
-rw-r--r--block/blk-mq-sched.c42
-rw-r--r--block/blk-mq.c41
3 files changed, 37 insertions, 81 deletions
diff --git a/block/blk-flush.c b/block/blk-flush.c
index a9773d2075ac..f17170675917 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -231,8 +231,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
231 /* release the tag's ownership to the req cloned from */ 231 /* release the tag's ownership to the req cloned from */
232 spin_lock_irqsave(&fq->mq_flush_lock, flags); 232 spin_lock_irqsave(&fq->mq_flush_lock, flags);
233 hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); 233 hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
234 blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); 234 if (!q->elevator) {
235 flush_rq->tag = -1; 235 blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
236 flush_rq->tag = -1;
237 } else {
238 blk_mq_put_driver_tag_hctx(hctx, flush_rq);
239 flush_rq->internal_tag = -1;
240 }
236 } 241 }
237 242
238 running = &fq->flush_queue[fq->flush_running_idx]; 243 running = &fq->flush_queue[fq->flush_running_idx];
@@ -318,19 +323,26 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
318 blk_rq_init(q, flush_rq); 323 blk_rq_init(q, flush_rq);
319 324
320 /* 325 /*
321 * Borrow tag from the first request since they can't 326 * In case of none scheduler, borrow tag from the first request
322 * be in flight at the same time. And acquire the tag's 327 * since they can't be in flight at the same time. And acquire
323 * ownership for flush req. 328 * the tag's ownership for flush req.
329 *
330 * In case of IO scheduler, flush rq need to borrow scheduler tag
331 * just for cheating put/get driver tag.
324 */ 332 */
325 if (q->mq_ops) { 333 if (q->mq_ops) {
326 struct blk_mq_hw_ctx *hctx; 334 struct blk_mq_hw_ctx *hctx;
327 335
328 flush_rq->mq_ctx = first_rq->mq_ctx; 336 flush_rq->mq_ctx = first_rq->mq_ctx;
329 flush_rq->tag = first_rq->tag;
330 fq->orig_rq = first_rq;
331 337
332 hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); 338 if (!q->elevator) {
333 blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); 339 fq->orig_rq = first_rq;
340 flush_rq->tag = first_rq->tag;
341 hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
342 blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
343 } else {
344 flush_rq->internal_tag = first_rq->internal_tag;
345 }
334 } 346 }
335 347
336 flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; 348 flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
@@ -394,6 +406,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
394 406
395 hctx = blk_mq_map_queue(q, ctx->cpu); 407 hctx = blk_mq_map_queue(q, ctx->cpu);
396 408
409 if (q->elevator) {
410 WARN_ON(rq->tag < 0);
411 blk_mq_put_driver_tag_hctx(hctx, rq);
412 }
413
397 /* 414 /*
398 * After populating an empty queue, kick it to avoid stall. Read 415 * After populating an empty queue, kick it to avoid stall. Read
399 * the comment in flush_end_io(). 416 * the comment in flush_end_io().
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index e7094f44afaf..01a43fed6b8c 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -356,29 +356,12 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
356 return true; 356 return true;
357 } 357 }
358 358
359 if (has_sched) { 359 if (has_sched)
360 rq->rq_flags |= RQF_SORTED; 360 rq->rq_flags |= RQF_SORTED;
361 WARN_ON(rq->tag != -1);
362 }
363 361
364 return false; 362 return false;
365} 363}
366 364
367/*
368 * Add flush/fua to the queue. If we fail getting a driver tag, then
369 * punt to the requeue list. Requeue will re-invoke us from a context
370 * that's safe to block from.
371 */
372static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
373 struct request *rq, bool can_block)
374{
375 if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
376 blk_insert_flush(rq);
377 blk_mq_run_hw_queue(hctx, true);
378 } else
379 blk_mq_add_to_requeue_list(rq, false, true);
380}
381
382void blk_mq_sched_insert_request(struct request *rq, bool at_head, 365void blk_mq_sched_insert_request(struct request *rq, bool at_head,
383 bool run_queue, bool async, bool can_block) 366 bool run_queue, bool async, bool can_block)
384{ 367{
@@ -389,10 +372,12 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
389 372
390 /* flush rq in flush machinery need to be dispatched directly */ 373 /* flush rq in flush machinery need to be dispatched directly */
391 if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { 374 if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
392 blk_mq_sched_insert_flush(hctx, rq, can_block); 375 blk_insert_flush(rq);
393 return; 376 goto run;
394 } 377 }
395 378
379 WARN_ON(e && (rq->tag != -1));
380
396 if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) 381 if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
397 goto run; 382 goto run;
398 383
@@ -419,23 +404,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
419 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); 404 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
420 struct elevator_queue *e = hctx->queue->elevator; 405 struct elevator_queue *e = hctx->queue->elevator;
421 406
422 if (e) {
423 struct request *rq, *next;
424
425 /*
426 * We bypass requests that already have a driver tag assigned,
427 * which should only be flushes. Flushes are only ever inserted
428 * as single requests, so we shouldn't ever hit the
429 * WARN_ON_ONCE() below (but let's handle it just in case).
430 */
431 list_for_each_entry_safe(rq, next, list, queuelist) {
432 if (WARN_ON_ONCE(rq->tag != -1)) {
433 list_del_init(&rq->queuelist);
434 blk_mq_sched_bypass_insert(hctx, true, rq);
435 }
436 }
437 }
438
439 if (e && e->type->ops.mq.insert_requests) 407 if (e && e->type->ops.mq.insert_requests)
440 e->type->ops.mq.insert_requests(hctx, list, false); 408 e->type->ops.mq.insert_requests(hctx, list, false);
441 else 409 else
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 14f6886fbec8..c501cbd0de93 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -653,6 +653,8 @@ static void __blk_mq_requeue_request(struct request *rq)
653{ 653{
654 struct request_queue *q = rq->q; 654 struct request_queue *q = rq->q;
655 655
656 blk_mq_put_driver_tag(rq);
657
656 trace_block_rq_requeue(q, rq); 658 trace_block_rq_requeue(q, rq);
657 wbt_requeue(q->rq_wb, &rq->issue_stat); 659 wbt_requeue(q->rq_wb, &rq->issue_stat);
658 blk_mq_sched_requeue_request(rq); 660 blk_mq_sched_requeue_request(rq);
@@ -996,30 +998,6 @@ done:
996 return rq->tag != -1; 998 return rq->tag != -1;
997} 999}
998 1000
999/*
1000 * If we fail getting a driver tag because all the driver tags are already
1001 * assigned and on the dispatch list, BUT the first entry does not have a
1002 * tag, then we could deadlock. For that case, move entries with assigned
1003 * driver tags to the front, leaving the set of tagged requests in the
1004 * same order, and the untagged set in the same order.
1005 */
1006static bool reorder_tags_to_front(struct list_head *list)
1007{
1008 struct request *rq, *tmp, *first = NULL;
1009
1010 list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) {
1011 if (rq == first)
1012 break;
1013 if (rq->tag != -1) {
1014 list_move(&rq->queuelist, list);
1015 if (!first)
1016 first = rq;
1017 }
1018 }
1019
1020 return first != NULL;
1021}
1022
1023static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, 1001static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
1024 void *key) 1002 void *key)
1025{ 1003{
@@ -1080,9 +1058,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1080 1058
1081 rq = list_first_entry(list, struct request, queuelist); 1059 rq = list_first_entry(list, struct request, queuelist);
1082 if (!blk_mq_get_driver_tag(rq, &hctx, false)) { 1060 if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
1083 if (!queued && reorder_tags_to_front(list))
1084 continue;
1085
1086 /* 1061 /*
1087 * The initial allocation attempt failed, so we need to 1062 * The initial allocation attempt failed, so we need to
1088 * rerun the hardware queue when a tag is freed. 1063 * rerun the hardware queue when a tag is freed.
@@ -1133,7 +1108,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1133 nxt = list_first_entry(list, struct request, queuelist); 1108 nxt = list_first_entry(list, struct request, queuelist);
1134 blk_mq_put_driver_tag(nxt); 1109 blk_mq_put_driver_tag(nxt);
1135 } 1110 }
1136 blk_mq_put_driver_tag_hctx(hctx, rq);
1137 list_add(&rq->queuelist, list); 1111 list_add(&rq->queuelist, list);
1138 __blk_mq_requeue_request(rq); 1112 __blk_mq_requeue_request(rq);
1139 break; 1113 break;
@@ -1698,13 +1672,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1698 if (unlikely(is_flush_fua)) { 1672 if (unlikely(is_flush_fua)) {
1699 blk_mq_put_ctx(data.ctx); 1673 blk_mq_put_ctx(data.ctx);
1700 blk_mq_bio_to_request(rq, bio); 1674 blk_mq_bio_to_request(rq, bio);
1701 if (q->elevator) { 1675
1702 blk_mq_sched_insert_request(rq, false, true, true, 1676 /* bypass scheduler for flush rq */
1703 true); 1677 blk_insert_flush(rq);
1704 } else { 1678 blk_mq_run_hw_queue(data.hctx, true);
1705 blk_insert_flush(rq);
1706 blk_mq_run_hw_queue(data.hctx, true);
1707 }
1708 } else if (plug && q->nr_hw_queues == 1) { 1679 } else if (plug && q->nr_hw_queues == 1) {
1709 struct request *last = NULL; 1680 struct request *last = NULL;
1710 1681