Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--  block/blk-mq.c | 229
1 file changed, 157 insertions(+), 72 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f5e57c80a82b..156203876c8c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -483,6 +483,7 @@ void blk_mq_free_request(struct request *rq)
 	if (blk_rq_rl(rq))
 		blk_put_rl(blk_rq_rl(rq));
 
+	blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
 	if (rq->tag != -1)
@@ -530,6 +531,8 @@ static void __blk_mq_complete_request(struct request *rq)
 	bool shared = false;
 	int cpu;
 
+	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
+
 	if (rq->internal_tag != -1)
 		blk_mq_sched_completed_request(rq);
 	if (rq->rq_flags & RQF_STATS) {
@@ -573,6 +576,36 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
 		*srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
 }
 
+static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
+{
+	unsigned long flags;
+
+	/*
+	 * blk_mq_rq_aborted_gstate() is used from the completion path and
+	 * can thus be called from irq context.  u64_stats_fetch in the
+	 * middle of update on the same CPU leads to lockup.  Disable irq
+	 * while updating.
+	 */
+	local_irq_save(flags);
+	u64_stats_update_begin(&rq->aborted_gstate_sync);
+	rq->aborted_gstate = gstate;
+	u64_stats_update_end(&rq->aborted_gstate_sync);
+	local_irq_restore(flags);
+}
+
+static u64 blk_mq_rq_aborted_gstate(struct request *rq)
+{
+	unsigned int start;
+	u64 aborted_gstate;
+
+	do {
+		start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
+		aborted_gstate = rq->aborted_gstate;
+	} while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
+
+	return aborted_gstate;
+}
+
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq:		the request being processed
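The two helpers just added are a textbook use of the kernel's u64_stats_sync primitive: on 64-bit builds it compiles away, while on 32-bit it degenerates to a seqcount so a reader can never observe a torn 64-bit ->aborted_gstate. A minimal, hedged sketch of the same writer/reader pairing follows; the struct and function names are illustrative only, not part of this patch.

#include <linux/u64_stats_sync.h>
#include <linux/irqflags.h>

/* Illustrative stand-in for rq->aborted_gstate + rq->aborted_gstate_sync. */
struct gen_val {
	u64			val;
	struct u64_stats_sync	sync;	/* u64_stats_init() once at setup */
};

/*
 * Writer: IRQs are disabled so an interrupt on the same CPU cannot enter
 * the reader in the middle of an update and spin forever, which is the
 * lockup the comment in blk_mq_rq_update_aborted_gstate() warns about.
 */
static void gen_val_set(struct gen_val *g, u64 v)
{
	unsigned long flags;

	local_irq_save(flags);
	u64_stats_update_begin(&g->sync);
	g->val = v;
	u64_stats_update_end(&g->sync);
	local_irq_restore(flags);
}

/* Reader: retry until the snapshot did not overlap a concurrent update. */
static u64 gen_val_get(struct gen_val *g)
{
	unsigned int start;
	u64 v;

	do {
		start = u64_stats_fetch_begin(&g->sync);
		v = g->val;
	} while (u64_stats_fetch_retry(&g->sync, start));

	return v;
}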
@@ -590,8 +623,20 @@ void blk_mq_complete_request(struct request *rq)
 	if (unlikely(blk_should_fake_timeout(q)))
 		return;
 
+	/*
+	 * If @rq->aborted_gstate equals the current instance, timeout is
+	 * claiming @rq and we lost.  This is synchronized through
+	 * hctx_lock().  See blk_mq_timeout_work() for details.
+	 *
+	 * Completion path never blocks and we can directly use RCU here
+	 * instead of hctx_lock() which can be either RCU or SRCU.
+	 * However, that would complicate paths which want to synchronize
+	 * against us.  Let stay in sync with the issue path so that
+	 * hctx_lock() covers both issue and completion paths.
+	 */
 	hctx_lock(hctx, &srcu_idx);
-	if (!blk_mark_rq_complete(rq))
+	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate &&
+	    !blk_mark_rq_complete(rq))
 		__blk_mq_complete_request(rq);
 	hctx_unlock(hctx, srcu_idx);
 }
@@ -617,34 +662,32 @@ void blk_mq_start_request(struct request *rq)
617 wbt_issue(q->rq_wb, &rq->issue_stat); 662 wbt_issue(q->rq_wb, &rq->issue_stat);
618 } 663 }
619 664
620 blk_add_timer(rq); 665 WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
621
622 WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)); 666 WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
623 667
624 /* 668 /*
625 * Mark us as started and clear complete. Complete might have been 669 * Mark @rq in-flight which also advances the generation number,
626 * set if requeue raced with timeout, which then marked it as 670 * and register for timeout. Protect with a seqcount to allow the
627 * complete. So be sure to clear complete again when we start 671 * timeout path to read both @rq->gstate and @rq->deadline
628 * the request, otherwise we'll ignore the completion event. 672 * coherently.
629 * 673 *
630 * Ensure that ->deadline is visible before we set STARTED, such that 674 * This is the only place where a request is marked in-flight. If
631 * blk_mq_check_expired() is guaranteed to observe our ->deadline when 675 * the timeout path reads an in-flight @rq->gstate, the
632 * it observes STARTED. 676 * @rq->deadline it reads together under @rq->gstate_seq is
677 * guaranteed to be the matching one.
633 */ 678 */
634 smp_wmb(); 679 preempt_disable();
680 write_seqcount_begin(&rq->gstate_seq);
681
682 blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
683 blk_add_timer(rq);
684
685 write_seqcount_end(&rq->gstate_seq);
686 preempt_enable();
687
635 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 688 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
636 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) { 689 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
637 /*
638 * Coherence order guarantees these consecutive stores to a
639 * single variable propagate in the specified order. Thus the
640 * clear_bit() is ordered _after_ the set bit. See
641 * blk_mq_check_expired().
642 *
643 * (the bits must be part of the same byte for this to be
644 * true).
645 */
646 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); 690 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
647 }
648 691
649 if (q->dma_drain_size && blk_rq_bytes(rq)) { 692 if (q->dma_drain_size && blk_rq_bytes(rq)) {
650 /* 693 /*
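The seqcount write section above is paired with the retry-loop reader this patch adds to blk_mq_check_expired() further below: the timeout path either observes the previous generation with the previous deadline or the new generation with the new deadline, never a mix. A hedged sketch of that pairing, with illustrative names standing in for the rq->gstate_seq / rq->gstate / rq->deadline trio:

#include <linux/seqlock.h>
#include <linux/preempt.h>
#include <linux/compiler.h>

/* Illustrative stand-in for the fields published by blk_mq_start_request();
 * seqcount_init(&s->seq) must run once before use. */
struct issue_stamp {
	seqcount_t	seq;		/* like rq->gstate_seq */
	u64		gen;		/* like rq->gstate */
	unsigned long	deadline;	/* like rq->deadline */
};

/*
 * Issue side: publish generation and deadline as one unit.  Preemption is
 * disabled so the write section stays short and a reader never spins
 * behind a preempted writer.
 */
static void issue_stamp_publish(struct issue_stamp *s, u64 gen,
				unsigned long deadline)
{
	preempt_disable();
	write_seqcount_begin(&s->seq);
	WRITE_ONCE(s->gen, gen);
	s->deadline = deadline;
	write_seqcount_end(&s->seq);
	preempt_enable();
}

/* Timeout side: take a coherent snapshot, retrying if a writer overlapped. */
static void issue_stamp_snapshot(struct issue_stamp *s, u64 *gen,
				 unsigned long *deadline)
{
	unsigned int start;

	do {
		start = read_seqcount_begin(&s->seq);
		*gen = READ_ONCE(s->gen);
		*deadline = s->deadline;
	} while (read_seqcount_retry(&s->seq, start));
}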
@@ -677,6 +720,7 @@ static void __blk_mq_requeue_request(struct request *rq)
 	blk_mq_sched_requeue_request(rq);
 
 	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+		blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
 		if (q->dma_drain_size && blk_rq_bytes(rq))
 			rq->nr_phys_segments--;
 	}
@@ -774,6 +818,7 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq);
 struct blk_mq_timeout_data {
 	unsigned long next;
 	unsigned int next_set;
+	unsigned int nr_expired;
 };
 
 void blk_mq_rq_timed_out(struct request *req, bool reserved)
@@ -801,6 +846,12 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
 		__blk_mq_complete_request(req);
 		break;
 	case BLK_EH_RESET_TIMER:
+		/*
+		 * As nothing prevents from completion happening while
+		 * ->aborted_gstate is set, this may lead to ignored
+		 * completions and further spurious timeouts.
+		 */
+		blk_mq_rq_update_aborted_gstate(req, 0);
 		blk_add_timer(req);
 		blk_clear_rq_complete(req);
 		break;
@@ -816,50 +867,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, void *priv, bool reserved)
 {
 	struct blk_mq_timeout_data *data = priv;
-	unsigned long deadline;
+	unsigned long gstate, deadline;
+	int start;
+
+	might_sleep();
 
 	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 		return;
 
-	/*
-	 * Ensures that if we see STARTED we must also see our
-	 * up-to-date deadline, see blk_mq_start_request().
-	 */
-	smp_rmb();
-
-	deadline = READ_ONCE(rq->deadline);
+	/* read coherent snapshots of @rq->state_gen and @rq->deadline */
+	while (true) {
+		start = read_seqcount_begin(&rq->gstate_seq);
+		gstate = READ_ONCE(rq->gstate);
+		deadline = rq->deadline;
+		if (!read_seqcount_retry(&rq->gstate_seq, start))
+			break;
+		cond_resched();
+	}
 
-	/*
-	 * The rq being checked may have been freed and reallocated
-	 * out already here, we avoid this race by checking rq->deadline
-	 * and REQ_ATOM_COMPLETE flag together:
-	 *
-	 * - if rq->deadline is observed as new value because of
-	 *   reusing, the rq won't be timed out because of timing.
-	 * - if rq->deadline is observed as previous value,
-	 *   REQ_ATOM_COMPLETE flag won't be cleared in reuse path
-	 *   because we put a barrier between setting rq->deadline
-	 *   and clearing the flag in blk_mq_start_request(), so
-	 *   this rq won't be timed out too.
-	 */
-	if (time_after_eq(jiffies, deadline)) {
-		if (!blk_mark_rq_complete(rq)) {
-			/*
-			 * Again coherence order ensures that consecutive reads
-			 * from the same variable must be in that order. This
-			 * ensures that if we see COMPLETE clear, we must then
-			 * see STARTED set and we'll ignore this timeout.
-			 *
-			 * (There's also the MB implied by the test_and_clear())
-			 */
-			blk_mq_rq_timed_out(rq, reserved);
-		}
+	/* if in-flight && overdue, mark for abortion */
+	if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
+	    time_after_eq(jiffies, deadline)) {
+		blk_mq_rq_update_aborted_gstate(rq, gstate);
+		data->nr_expired++;
+		hctx->nr_expired++;
 	} else if (!data->next_set || time_after(data->next, deadline)) {
 		data->next = deadline;
 		data->next_set = 1;
 	}
 }
 
+static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+		struct request *rq, void *priv, bool reserved)
+{
+	/*
+	 * We marked @rq->aborted_gstate and waited for RCU.  If there were
+	 * completions that we lost to, they would have finished and
+	 * updated @rq->gstate by now; otherwise, the completion path is
+	 * now guaranteed to see @rq->aborted_gstate and yield.  If
+	 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+	 */
+	if (READ_ONCE(rq->gstate) == rq->aborted_gstate &&
+	    !blk_mark_rq_complete(rq))
+		blk_mq_rq_timed_out(rq, reserved);
+}
+
 static void blk_mq_timeout_work(struct work_struct *work)
 {
 	struct request_queue *q =
@@ -867,7 +919,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	struct blk_mq_timeout_data data = {
 		.next		= 0,
 		.next_set	= 0,
+		.nr_expired	= 0,
 	};
+	struct blk_mq_hw_ctx *hctx;
 	int i;
 
 	/* A deadlock might occur if a request is stuck requiring a
@@ -886,14 +940,40 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
+	/* scan for the expired ones and set their ->aborted_gstate */
 	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 
+	if (data.nr_expired) {
+		bool has_rcu = false;
+
+		/*
+		 * Wait till everyone sees ->aborted_gstate.  The
+		 * sequential waits for SRCUs aren't ideal.  If this ever
+		 * becomes a problem, we can add per-hw_ctx rcu_head and
+		 * wait in parallel.
+		 */
+		queue_for_each_hw_ctx(q, hctx, i) {
+			if (!hctx->nr_expired)
+				continue;
+
+			if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+				has_rcu = true;
+			else
+				synchronize_srcu(hctx->queue_rq_srcu);
+
+			hctx->nr_expired = 0;
+		}
+		if (has_rcu)
+			synchronize_rcu();
+
+		/* terminate the ones we won */
+		blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+	}
+
 	if (data.next_set) {
 		data.next = blk_rq_timeout(round_jiffies_up(data.next));
 		mod_timer(&q->timeout, data.next);
 	} else {
-		struct blk_mq_hw_ctx *hctx;
-
 		queue_for_each_hw_ctx(q, hctx, i) {
 			/* the hctx may be unmapped, so check it here */
 			if (blk_mq_hw_queue_mapped(hctx))
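blk_mq_timeout_work() above now runs in two passes separated by a grace period: the first iteration only marks ->aborted_gstate, then one RCU (or SRCU, for BLK_MQ_F_BLOCKING hctxs) grace period guarantees that every racing completion has either finished and advanced ->gstate or will see the mark and back off, and only then does the second iteration time out the requests whose generation still matches. The following hedged sketch shows just that mark/wait/reap shape on a hypothetical list; it deliberately omits the per-hctx SRCU handling and is not blk-mq code.

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/compiler.h>
#include <linux/printk.h>

/* Illustrative item raced over by a "completion" path and a reaper. */
struct reap_item {
	struct list_head	node;
	u64			gen;		/* bumped when the item completes */
	u64			aborted_gen;	/* reaper's claim on the item */
};

static void reap_overdue(struct list_head *items)
{
	struct reap_item *it;

	/* Pass 1: mark the candidates while completions may still race. */
	list_for_each_entry(it, items, node)
		WRITE_ONCE(it->aborted_gen, READ_ONCE(it->gen));

	/*
	 * Grace period: assuming the completing side runs under
	 * rcu_read_lock() (as blk-mq's hctx_lock() does for non-blocking
	 * hctxs), every completion that raced with the marking has either
	 * bumped ->gen (completion won) or is guaranteed to observe
	 * ->aborted_gen and yield (reaper won).
	 */
	synchronize_rcu();

	/* Pass 2: act only on the items the reaper won. */
	list_for_each_entry(it, items, node)
		if (READ_ONCE(it->gen) == it->aborted_gen)
			pr_info("reaping overdue item %p\n", it);
}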
@@ -1893,6 +1973,22 @@ static size_t order_to_size(unsigned int order)
 	return (size_t)PAGE_SIZE << order;
 }
 
+static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+			       unsigned int hctx_idx, int node)
+{
+	int ret;
+
+	if (set->ops->init_request) {
+		ret = set->ops->init_request(set, rq, hctx_idx, node);
+		if (ret)
+			return ret;
+	}
+
+	seqcount_init(&rq->gstate_seq);
+	u64_stats_init(&rq->aborted_gstate_sync);
+	return 0;
+}
+
 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		     unsigned int hctx_idx, unsigned int depth)
 {
@@ -1954,12 +2050,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 			struct request *rq = p;
 
 			tags->static_rqs[i] = rq;
-			if (set->ops->init_request) {
-				if (set->ops->init_request(set, rq, hctx_idx,
-						node)) {
-					tags->static_rqs[i] = NULL;
-					goto fail;
-				}
+			if (blk_mq_init_request(set, rq, hctx_idx, node)) {
+				tags->static_rqs[i] = NULL;
+				goto fail;
 			}
 
 			p += rq_size;
@@ -2099,9 +2192,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	if (!hctx->fq)
 		goto sched_exit_hctx;
 
-	if (set->ops->init_request &&
-	    set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
-				   node))
+	if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
 		goto free_fq;
 
 	if (hctx->flags & BLK_MQ_F_BLOCKING)
@@ -3019,12 +3110,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 
 static int __init blk_mq_init(void)
 {
-	/*
-	 * See comment in block/blk.h rq_atomic_flags enum
-	 */
-	BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
-			(REQ_ATOM_COMPLETE / BITS_PER_BYTE));
-
 	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
 				blk_mq_hctx_notify_dead);
 	return 0;