Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--  block/blk-mq.c  229
1 file changed, 157 insertions(+), 72 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f5e57c80a82b..156203876c8c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -483,6 +483,7 @@ void blk_mq_free_request(struct request *rq)
         if (blk_rq_rl(rq))
                 blk_put_rl(blk_rq_rl(rq));
 
+        blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
         clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
         if (rq->tag != -1)
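
The MQ_RQ_* helpers used throughout this patch (blk_mq_rq_state(), blk_mq_rq_update_state(), MQ_RQ_STATE_MASK) are presumably defined in the companion block/blk-mq.h change, which is outside this diffstat. As a rough sketch only, with assumed names and values (the EX_* identifiers below are invented for illustration), the idea is that the low bits of rq->gstate hold the state while the remaining bits form a generation counter that advances when a request goes in-flight:

/* Illustrative sketch, not the real blk-mq.h hunk; names and values are assumed. */
#define EX_RQ_STATE_BITS        2
#define EX_RQ_STATE_MASK        ((1ULL << EX_RQ_STATE_BITS) - 1)
#define EX_RQ_GEN_INC           (1ULL << EX_RQ_STATE_BITS)

enum { EX_RQ_IDLE = 0, EX_RQ_IN_FLIGHT = 1 };   /* assumed encoding */

static inline unsigned int ex_rq_state(u64 gstate)
{
        return gstate & EX_RQ_STATE_MASK;       /* state lives in the low bits */
}

static inline u64 ex_rq_update_state(u64 gstate, unsigned int state)
{
        gstate &= ~EX_RQ_STATE_MASK;
        if (state == EX_RQ_IN_FLIGHT)           /* going in-flight advances the generation */
                gstate += EX_RQ_GEN_INC;
        return gstate | state;
}
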
@@ -530,6 +531,8 @@ static void __blk_mq_complete_request(struct request *rq)
         bool shared = false;
         int cpu;
 
+        WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
+
         if (rq->internal_tag != -1)
                 blk_mq_sched_completed_request(rq);
         if (rq->rq_flags & RQF_STATS) {
@@ -573,6 +576,36 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
                 *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
 }
 
+static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
+{
+        unsigned long flags;
+
+        /*
+         * blk_mq_rq_aborted_gstate() is used from the completion path and
+         * can thus be called from irq context. u64_stats_fetch in the
+         * middle of update on the same CPU leads to lockup. Disable irq
+         * while updating.
+         */
+        local_irq_save(flags);
+        u64_stats_update_begin(&rq->aborted_gstate_sync);
+        rq->aborted_gstate = gstate;
+        u64_stats_update_end(&rq->aborted_gstate_sync);
+        local_irq_restore(flags);
+}
+
+static u64 blk_mq_rq_aborted_gstate(struct request *rq)
+{
+        unsigned int start;
+        u64 aborted_gstate;
+
+        do {
+                start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
+                aborted_gstate = rq->aborted_gstate;
+        } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
+
+        return aborted_gstate;
+}
+
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq: the request being processed
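
blk_mq_rq_update_aborted_gstate()/blk_mq_rq_aborted_gstate() above are a standard u64_stats_sync pairing: on 32-bit SMP a 64-bit load can tear against a concurrent store, so the reader retries until it sees an undisturbed value (on 64-bit the sync object compiles away). A minimal, self-contained sketch of the same pattern with made-up names:

#include <linux/u64_stats_sync.h>

struct foo_counter {                    /* hypothetical structure */
        u64 value;                      /* 64-bit value; loads may tear on 32-bit */
        struct u64_stats_sync sync;
};

static void foo_counter_init(struct foo_counter *c)
{
        c->value = 0;
        u64_stats_init(&c->sync);       /* one-time init, as blk_mq_init_request() does below */
}

static void foo_counter_set(struct foo_counter *c, u64 v)
{
        u64_stats_update_begin(&c->sync);       /* writer side */
        c->value = v;
        u64_stats_update_end(&c->sync);
}

static u64 foo_counter_read(struct foo_counter *c)
{
        unsigned int start;
        u64 v;

        do {                                    /* reader retries if a writer raced */
                start = u64_stats_fetch_begin(&c->sync);
                v = c->value;
        } while (u64_stats_fetch_retry(&c->sync, start));

        return v;
}
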
@@ -590,8 +623,20 @@ void blk_mq_complete_request(struct request *rq)
         if (unlikely(blk_should_fake_timeout(q)))
                 return;
 
+        /*
+         * If @rq->aborted_gstate equals the current instance, timeout is
+         * claiming @rq and we lost. This is synchronized through
+         * hctx_lock(). See blk_mq_timeout_work() for details.
+         *
+         * Completion path never blocks and we can directly use RCU here
+         * instead of hctx_lock() which can be either RCU or SRCU.
+         * However, that would complicate paths which want to synchronize
+         * against us. Let's stay in sync with the issue path so that
+         * hctx_lock() covers both issue and completion paths.
+         */
         hctx_lock(hctx, &srcu_idx);
-        if (!blk_mark_rq_complete(rq))
+        if (blk_mq_rq_aborted_gstate(rq) != rq->gstate &&
+            !blk_mark_rq_complete(rq))
                 __blk_mq_complete_request(rq);
         hctx_unlock(hctx, srcu_idx);
 }
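
The hctx_lock()/hctx_unlock() pair the comment relies on is only partially visible in the context above. Judging from that context line and the BLK_MQ_F_BLOCKING test in blk_mq_timeout_work() further down, it takes plain RCU for non-blocking hardware queues and SRCU otherwise; the sketch below is a reconstruction consistent with those lines, not the verbatim helpers.

/* Sketch only; reconstructed from the context lines, not the verbatim helpers. */
static void hctx_lock_sketch(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
{
        if (!(hctx->flags & BLK_MQ_F_BLOCKING))
                rcu_read_lock();                /* ->queue_rq() never sleeps here */
        else
                *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
}

static void hctx_unlock_sketch(struct blk_mq_hw_ctx *hctx, int srcu_idx)
{
        if (!(hctx->flags & BLK_MQ_F_BLOCKING))
                rcu_read_unlock();
        else
                srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
}

Because blk_mq_timeout_work() below calls synchronize_rcu()/synchronize_srcu() after setting ->aborted_gstate, any completion that entered this read-side section before the claim has finished by the time the timeout path re-checks the generation.
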
@@ -617,34 +662,32 @@ void blk_mq_start_request(struct request *rq)
                 wbt_issue(q->rq_wb, &rq->issue_stat);
         }
 
-        blk_add_timer(rq);
-
+        WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
         WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
 
         /*
-         * Mark us as started and clear complete. Complete might have been
-         * set if requeue raced with timeout, which then marked it as
-         * complete. So be sure to clear complete again when we start
-         * the request, otherwise we'll ignore the completion event.
+         * Mark @rq in-flight which also advances the generation number,
+         * and register for timeout. Protect with a seqcount to allow the
+         * timeout path to read both @rq->gstate and @rq->deadline
+         * coherently.
          *
-         * Ensure that ->deadline is visible before we set STARTED, such that
-         * blk_mq_check_expired() is guaranteed to observe our ->deadline when
-         * it observes STARTED.
+         * This is the only place where a request is marked in-flight. If
+         * the timeout path reads an in-flight @rq->gstate, the
+         * @rq->deadline it reads together under @rq->gstate_seq is
+         * guaranteed to be the matching one.
         */
-        smp_wmb();
+        preempt_disable();
+        write_seqcount_begin(&rq->gstate_seq);
+
+        blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
+        blk_add_timer(rq);
+
+        write_seqcount_end(&rq->gstate_seq);
+        preempt_enable();
+
         set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-        if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
-                /*
-                 * Coherence order guarantees these consecutive stores to a
-                 * single variable propagate in the specified order. Thus the
-                 * clear_bit() is ordered _after_ the set bit. See
-                 * blk_mq_check_expired().
-                 *
-                 * (the bits must be part of the same byte for this to be
-                 * true).
-                 */
+        if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
                 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
-        }
 
         if (q->dma_drain_size && blk_rq_bytes(rq)) {
                 /*
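
The write_seqcount_begin()/write_seqcount_end() pair above publishes @rq->gstate and @rq->deadline as one coherent update; blk_mq_check_expired() below is the matching reader. A minimal sketch of that seqcount pairing on a hypothetical structure:

#include <linux/seqlock.h>

struct item {                           /* hypothetical; mirrors gstate_seq/gstate/deadline */
        seqcount_t seq;
        u64 gen;
        unsigned long deadline;
};

static void item_publish(struct item *it, unsigned long deadline)
{
        preempt_disable();              /* keep the write section short, as in the hunk above */
        write_seqcount_begin(&it->seq);
        it->gen++;                      /* both fields change inside one write section */
        it->deadline = deadline;
        write_seqcount_end(&it->seq);
        preempt_enable();
}

static void item_snapshot(struct item *it, u64 *gen, unsigned long *deadline)
{
        unsigned int start;

        do {                            /* retry if a writer overlapped the reads */
                start = read_seqcount_begin(&it->seq);
                *gen = it->gen;
                *deadline = it->deadline;
        } while (read_seqcount_retry(&it->seq, start));
}
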
@@ -677,6 +720,7 @@ static void __blk_mq_requeue_request(struct request *rq)
         blk_mq_sched_requeue_request(rq);
 
         if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+                blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
                 if (q->dma_drain_size && blk_rq_bytes(rq))
                         rq->nr_phys_segments--;
         }
@@ -774,6 +818,7 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq);
 struct blk_mq_timeout_data {
         unsigned long next;
         unsigned int next_set;
+        unsigned int nr_expired;
 };
 
 void blk_mq_rq_timed_out(struct request *req, bool reserved)
@@ -801,6 +846,12 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
                 __blk_mq_complete_request(req);
                 break;
         case BLK_EH_RESET_TIMER:
+                /*
+                 * As nothing prevents completion from happening while
+                 * ->aborted_gstate is set, this may lead to ignored
+                 * completions and further spurious timeouts.
+                 */
+                blk_mq_rq_update_aborted_gstate(req, 0);
                 blk_add_timer(req);
                 blk_clear_rq_complete(req);
                 break;
@@ -816,50 +867,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
                 struct request *rq, void *priv, bool reserved)
 {
         struct blk_mq_timeout_data *data = priv;
-        unsigned long deadline;
+        unsigned long gstate, deadline;
+        int start;
+
+        might_sleep();
 
         if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
                 return;
 
-        /*
-         * Ensures that if we see STARTED we must also see our
-         * up-to-date deadline, see blk_mq_start_request().
-         */
-        smp_rmb();
-
-        deadline = READ_ONCE(rq->deadline);
+        /* read coherent snapshots of @rq->gstate and @rq->deadline */
+        while (true) {
+                start = read_seqcount_begin(&rq->gstate_seq);
+                gstate = READ_ONCE(rq->gstate);
+                deadline = rq->deadline;
+                if (!read_seqcount_retry(&rq->gstate_seq, start))
+                        break;
+                cond_resched();
+        }
 
-        /*
-         * The rq being checked may have been freed and reallocated
-         * out already here, we avoid this race by checking rq->deadline
-         * and REQ_ATOM_COMPLETE flag together:
-         *
-         * - if rq->deadline is observed as new value because of
-         *   reusing, the rq won't be timed out because of timing.
-         * - if rq->deadline is observed as previous value,
-         *   REQ_ATOM_COMPLETE flag won't be cleared in reuse path
-         *   because we put a barrier between setting rq->deadline
-         *   and clearing the flag in blk_mq_start_request(), so
-         *   this rq won't be timed out too.
-         */
-        if (time_after_eq(jiffies, deadline)) {
-                if (!blk_mark_rq_complete(rq)) {
-                        /*
-                         * Again coherence order ensures that consecutive reads
-                         * from the same variable must be in that order. This
-                         * ensures that if we see COMPLETE clear, we must then
-                         * see STARTED set and we'll ignore this timeout.
-                         *
-                         * (There's also the MB implied by the test_and_clear())
-                         */
-                        blk_mq_rq_timed_out(rq, reserved);
-                }
+        /* if in-flight && overdue, mark for abortion */
+        if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
+            time_after_eq(jiffies, deadline)) {
+                blk_mq_rq_update_aborted_gstate(rq, gstate);
+                data->nr_expired++;
+                hctx->nr_expired++;
         } else if (!data->next_set || time_after(data->next, deadline)) {
                 data->next = deadline;
                 data->next_set = 1;
         }
 }
 
+static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+                struct request *rq, void *priv, bool reserved)
+{
+        /*
+         * We marked @rq->aborted_gstate and waited for RCU. If there were
+         * completions that we lost to, they would have finished and
+         * updated @rq->gstate by now; otherwise, the completion path is
+         * now guaranteed to see @rq->aborted_gstate and yield. If
+         * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+         */
+        if (READ_ONCE(rq->gstate) == rq->aborted_gstate &&
+            !blk_mark_rq_complete(rq))
+                blk_mq_rq_timed_out(rq, reserved);
+}
+
 static void blk_mq_timeout_work(struct work_struct *work)
 {
         struct request_queue *q =
@@ -867,7 +919,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
         struct blk_mq_timeout_data data = {
                 .next = 0,
                 .next_set = 0,
+                .nr_expired = 0,
         };
+        struct blk_mq_hw_ctx *hctx;
         int i;
 
         /* A deadlock might occur if a request is stuck requiring a
@@ -886,14 +940,40 @@ static void blk_mq_timeout_work(struct work_struct *work)
         if (!percpu_ref_tryget(&q->q_usage_counter))
                 return;
 
+        /* scan for the expired ones and set their ->aborted_gstate */
         blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 
+        if (data.nr_expired) {
+                bool has_rcu = false;
+
+                /*
+                 * Wait till everyone sees ->aborted_gstate. The
+                 * sequential waits for SRCUs aren't ideal. If this ever
+                 * becomes a problem, we can add per-hw_ctx rcu_head and
+                 * wait in parallel.
+                 */
+                queue_for_each_hw_ctx(q, hctx, i) {
+                        if (!hctx->nr_expired)
+                                continue;
+
+                        if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+                                has_rcu = true;
+                        else
+                                synchronize_srcu(hctx->queue_rq_srcu);
+
+                        hctx->nr_expired = 0;
+                }
+                if (has_rcu)
+                        synchronize_rcu();
+
+                /* terminate the ones we won */
+                blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+        }
+
         if (data.next_set) {
                 data.next = blk_rq_timeout(round_jiffies_up(data.next));
                 mod_timer(&q->timeout, data.next);
         } else {
-                struct blk_mq_hw_ctx *hctx;
-
                 queue_for_each_hw_ctx(q, hctx, i) {
                         /* the hctx may be unmapped, so check it here */
                         if (blk_mq_hw_queue_mapped(hctx))
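
Taken together, the timeout path claims a request in three steps: record the generation it intends to abort, wait an RCU/SRCU grace period so every completion that raced with the scan has either finished or is guaranteed to see the claim, and then time out only the requests whose generation is still unchanged. A toy model of that flow (hypothetical type, not blk-mq code):

#include <linux/rcupdate.h>

struct toy_req {
        u64 gstate;             /* advances when the request is completed and reissued */
        u64 aborted_gstate;     /* generation the timeout path has claimed */
};

/* completion side: runs inside an RCU read-side section (cf. hctx_lock()) */
static void toy_complete(struct toy_req *rq)
{
        rcu_read_lock();
        if (READ_ONCE(rq->aborted_gstate) != rq->gstate) {
                /* we won the race: complete; a later reissue bumps ->gstate */
        }
        rcu_read_unlock();
}

/* timeout side (cf. blk_mq_timeout_work() + blk_mq_terminate_expired()) */
static void toy_timeout(struct toy_req *rq)
{
        WRITE_ONCE(rq->aborted_gstate, rq->gstate);     /* 1. claim this generation */
        synchronize_rcu();                              /* 2. racing completions have drained */
        if (READ_ONCE(rq->gstate) == rq->aborted_gstate) {
                /* 3. nothing completed it meanwhile; the timeout owns the request */
        }
}

In the real code the claim is stored with the u64_stats helpers shown earlier rather than a plain WRITE_ONCE(), and the BLK_EH_RESET_TIMER case drops the claim again by writing 0.
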
@@ -1893,6 +1973,22 @@ static size_t order_to_size(unsigned int order)
         return (size_t)PAGE_SIZE << order;
 }
 
+static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+                unsigned int hctx_idx, int node)
+{
+        int ret;
+
+        if (set->ops->init_request) {
+                ret = set->ops->init_request(set, rq, hctx_idx, node);
+                if (ret)
+                        return ret;
+        }
+
+        seqcount_init(&rq->gstate_seq);
+        u64_stats_init(&rq->aborted_gstate_sync);
+        return 0;
+}
+
 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                      unsigned int hctx_idx, unsigned int depth)
 {
@@ -1954,12 +2050,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                         struct request *rq = p;
 
                         tags->static_rqs[i] = rq;
-                        if (set->ops->init_request) {
-                                if (set->ops->init_request(set, rq, hctx_idx,
-                                                node)) {
-                                        tags->static_rqs[i] = NULL;
-                                        goto fail;
-                                }
+                        if (blk_mq_init_request(set, rq, hctx_idx, node)) {
+                                tags->static_rqs[i] = NULL;
+                                goto fail;
                         }
 
                         p += rq_size;
@@ -2099,9 +2192,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
         if (!hctx->fq)
                 goto sched_exit_hctx;
 
-        if (set->ops->init_request &&
-            set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
-                                   node))
+        if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
                 goto free_fq;
 
         if (hctx->flags & BLK_MQ_F_BLOCKING)
@@ -3019,12 +3110,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 
 static int __init blk_mq_init(void)
 {
-        /*
-         * See comment in block/blk.h rq_atomic_flags enum
-         */
-        BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
-                     (REQ_ATOM_COMPLETE / BITS_PER_BYTE));
-
         cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                 blk_mq_hctx_notify_dead);
         return 0;