author     Tejun Heo <tj@kernel.org>        2018-01-09 11:29:48 -0500
committer  Jens Axboe <axboe@kernel.dk>     2018-01-09 11:31:15 -0500
commit     1d9bd5161ba32db5665a617edc8b0723880f543e (patch)
tree       673fe21cec5b35a1dd56cd8078bc73e0d72e0388 /block
parent     5197c05e16b49885cc9086f1676455371e821b0e (diff)
blk-mq: replace timeout synchronization with a RCU and generation based scheme
Currently, the blk-mq timeout path synchronizes against the usual
issue/completion path using a complex scheme involving atomic bitflags,
REQ_ATOM_*, memory barriers and subtle memory coherence rules.
Unfortunately, it contains quite a few holes.

There's a complex dance around REQ_ATOM_STARTED and REQ_ATOM_COMPLETE
between the issue/completion and timeout paths; however, they don't have
a synchronization point across request recycle instances and it isn't
clear what the barriers add. blk_mq_check_expired() can easily read
STARTED from the N-2'th iteration, the deadline from the N-1'th, and run
blk_mark_rq_complete() against the Nth instance.

In fact, it's pretty easy to make blk_mq_check_expired() terminate a
later instance of a request. If we induce a 5 sec delay before the
time_after_eq() test in blk_mq_check_expired(), shorten the timeout to
2s, and issue back-to-back large IOs, blk-mq starts timing out requests
spuriously pretty quickly. Nothing actually timed out; the code just
made the call on a recycled instance of a request and then terminated a
later instance long after the original instance finished. The scenario
isn't theoretical either.

This patch replaces the broken synchronization mechanism with an RCU and
generation number based one.

1. Each request has a u64 generation + state value, which can be updated
   only by the request owner. Whenever a request becomes in-flight, the
   generation number gets bumped up too. This provides the basis for the
   timeout path to distinguish different recycle instances of the
   request.

   Also, marking a request in-flight and setting its deadline are
   protected with a seqcount so that the timeout path can fetch both
   values coherently.

2. The timeout path fetches the generation, state and deadline. If the
   verdict is timeout, it records the generation into a dedicated
   request abortion field and does an RCU wait.

3. The completion path is also protected by RCU (from the previous
   patch) and checks whether the current generation number and state
   match the abortion field. If so, it skips completion.

4. The timeout path, after the RCU wait, scans requests again and
   terminates the ones whose generation and state still match the ones
   requested for abortion. By now, the timeout path knows that either
   the generation number and state changed if it lost the race, or the
   completion will yield to it and it can safely time out the request.

While it's more lines of code, it's conceptually simpler, doesn't depend
on direct use of subtle memory ordering or coherence, and hopefully
doesn't terminate the wrong instance.

While this change makes REQ_ATOM_COMPLETE synchronization unnecessary
between the issue/complete and timeout paths, REQ_ATOM_COMPLETE isn't
removed yet as it's still used in other places. Future patches will move
all state tracking to the new mechanism and remove all bitops in the hot
paths.

Note that this patch adds a comment explaining a race condition in the
BLK_EH_RESET_TIMER path. The race has always been there and this patch
doesn't change it; it only documents the existing race.

v2: - Fixed BLK_EH_RESET_TIMER handling as pointed out by Jianchao.
    - s/request->gstate_seqc/request->gstate_seq/ as suggested by Peter.
    - READ_ONCE() added in blk_mq_rq_update_state() as suggested by Peter.

v3: - Fixed possible extended seqcount / u64_stats_sync read looping
      spotted by Peter.
    - MQ_RQ_IDLE was incorrectly being set in complete_request instead
      of free_request. Fixed.

v4: - Rebased on top of the hctx_lock() refactoring patch.
    - Added a comment explaining the use of hctx_lock() in the
      completion path.

v5: - Added comments requested by Bart.
    - Noted the BLK_EH_RESET_TIMER race condition in the commit message.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: "jianchao.wang" <jianchao.w.wang@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bart Van Assche <Bart.VanAssche@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
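To make the generation + state encoding described above concrete, the following is a minimal, self-contained userspace C sketch, not the kernel code. All demo_* names are invented for illustration, and the seqcount protection and RCU grace period that the real code relies on are deliberately left out (the sketch is single-threaded); comments mark where they would sit.

#include <stdint.h>
#include <stdio.h>

/* Lower two bits of the generation word carry the state, the upper bits
 * the generation number (mirrors the mq_rq_state enum this patch adds
 * to block/blk-mq.h). */
enum demo_rq_state {
        DEMO_RQ_IDLE       = 0,
        DEMO_RQ_IN_FLIGHT  = 1,

        DEMO_RQ_STATE_BITS = 2,
        DEMO_RQ_STATE_MASK = (1 << DEMO_RQ_STATE_BITS) - 1,
        DEMO_RQ_GEN_INC    = 1 << DEMO_RQ_STATE_BITS,
};

struct demo_request {
        uint64_t gstate;         /* generation + state, updated only by the owner */
        uint64_t aborted_gstate; /* snapshot recorded by the timeout path */
};

static int demo_rq_state(const struct demo_request *rq)
{
        return rq->gstate & DEMO_RQ_STATE_MASK;
}

/* Single-updater state change; entering IN_FLIGHT bumps the generation,
 * which is what lets the timeout path tell recycle instances apart. */
static void demo_rq_update_state(struct demo_request *rq, enum demo_rq_state state)
{
        uint64_t new_val = (rq->gstate & ~(uint64_t)DEMO_RQ_STATE_MASK) | state;

        if (state == DEMO_RQ_IN_FLIGHT)
                new_val += DEMO_RQ_GEN_INC;

        rq->gstate = new_val;
}

int main(void)
{
        struct demo_request rq = { 0, 0 };

        /* First issue: generation 1, in flight. In the kernel this update
         * and the deadline write sit inside a gstate_seq write section. */
        demo_rq_update_state(&rq, DEMO_RQ_IN_FLIGHT);

        /* The timeout path decides to abort this instance and records it. */
        rq.aborted_gstate = rq.gstate;

        /* The request completes and gets recycled before the second scan. */
        demo_rq_update_state(&rq, DEMO_RQ_IDLE);
        demo_rq_update_state(&rq, DEMO_RQ_IN_FLIGHT); /* generation 2 */

        /* The stale abort no longer matches, so the new instance is spared. */
        printf("state=%d generation=%llu abort still matches: %s\n",
               demo_rq_state(&rq),
               (unsigned long long)(rq.gstate >> DEMO_RQ_STATE_BITS),
               rq.gstate == rq.aborted_gstate ? "yes" : "no");
        return 0;
}

The real helpers (blk_mq_rq_state() and blk_mq_rq_update_state() in the blk-mq.h hunk below) follow the same idea, with READ_ONCE()/WRITE_ONCE() and a WARN_ON_ONCE() on illegal transitions on top.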
Diffstat (limited to 'block')
-rw-r--r--  block/blk-core.c     |   2
-rw-r--r--  block/blk-mq.c       | 229
-rw-r--r--  block/blk-mq.h       |  46
-rw-r--r--  block/blk-timeout.c  |   2
-rw-r--r--  block/blk.h          |   6
5 files changed, 206 insertions, 79 deletions
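For orientation before the diff itself, here is a simplified, single-threaded C sketch of the timeout-side flow (steps 2-4 above): scan for overdue in-flight requests and record their exact gstate, wait out a grace period, then terminate only the instances whose gstate is unchanged. Every demo_* name is made up for illustration, and the real synchronize_rcu()/synchronize_srcu() wait is only indicated by a comment.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define DEMO_STATE_BITS 2
#define DEMO_STATE_MASK ((1ULL << DEMO_STATE_BITS) - 1)
#define DEMO_IN_FLIGHT  1ULL

struct demo_req {
        uint64_t gstate;         /* generation in the upper bits, state in the lower two */
        uint64_t aborted_gstate; /* snapshot recorded by the timeout scan */
        time_t   deadline;
};

/* Pass 1: mark overdue in-flight requests by recording the exact
 * generation + state snapshot the timeout path decided to abort. */
static int demo_scan_expired(struct demo_req *reqs, int nr, time_t now)
{
        int nr_expired = 0;

        for (int i = 0; i < nr; i++) {
                if ((reqs[i].gstate & DEMO_STATE_MASK) == DEMO_IN_FLIGHT &&
                    now >= reqs[i].deadline) {
                        reqs[i].aborted_gstate = reqs[i].gstate;
                        nr_expired++;
                }
        }
        return nr_expired;
}

/* Pass 2: after the grace period (synchronize_rcu()/synchronize_srcu()
 * in the real code), terminate only the requests whose gstate still
 * matches the snapshot; anything completed and recycled in between is
 * skipped. */
static void demo_terminate_expired(struct demo_req *reqs, int nr)
{
        for (int i = 0; i < nr; i++) {
                if (reqs[i].gstate == reqs[i].aborted_gstate)
                        printf("request %d: timed out (generation %llu)\n", i,
                               (unsigned long long)(reqs[i].gstate >> DEMO_STATE_BITS));
                else
                        printf("request %d: lost the race, completion wins\n", i);
        }
}

int main(void)
{
        time_t now = time(NULL);
        struct demo_req reqs[2] = {
                /* generation 1, in flight, already overdue */
                { .gstate = (1ULL << DEMO_STATE_BITS) | DEMO_IN_FLIGHT, .deadline = now - 5 },
                /* generation 3, in flight, already overdue */
                { .gstate = (3ULL << DEMO_STATE_BITS) | DEMO_IN_FLIGHT, .deadline = now - 5 },
        };

        if (demo_scan_expired(reqs, 2, now)) {
                /* ... the RCU/SRCU grace period would go here ... */

                /* request 1 completes and is re-issued meanwhile: generation 4 */
                reqs[1].gstate = (4ULL << DEMO_STATE_BITS) | DEMO_IN_FLIGHT;

                demo_terminate_expired(reqs, 2);
        }
        return 0;
}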
diff --git a/block/blk-core.c b/block/blk-core.c
index 2e0d041e2daf..f843ae4f858d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -126,6 +126,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
         rq->start_time = jiffies;
         set_start_time_ns(rq);
         rq->part = NULL;
+        seqcount_init(&rq->gstate_seq);
+        u64_stats_init(&rq->aborted_gstate_sync);
 }
 EXPORT_SYMBOL(blk_rq_init);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f5e57c80a82b..156203876c8c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -483,6 +483,7 @@ void blk_mq_free_request(struct request *rq)
         if (blk_rq_rl(rq))
                 blk_put_rl(blk_rq_rl(rq));
 
+        blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
         clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
         if (rq->tag != -1)
@@ -530,6 +531,8 @@ static void __blk_mq_complete_request(struct request *rq)
         bool shared = false;
         int cpu;
 
+        WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
+
         if (rq->internal_tag != -1)
                 blk_mq_sched_completed_request(rq);
         if (rq->rq_flags & RQF_STATS) {
@@ -573,6 +576,36 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
         *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
 }
 
+static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
+{
+        unsigned long flags;
+
+        /*
+         * blk_mq_rq_aborted_gstate() is used from the completion path and
+         * can thus be called from irq context. u64_stats_fetch in the
+         * middle of update on the same CPU leads to lockup. Disable irq
+         * while updating.
+         */
+        local_irq_save(flags);
+        u64_stats_update_begin(&rq->aborted_gstate_sync);
+        rq->aborted_gstate = gstate;
+        u64_stats_update_end(&rq->aborted_gstate_sync);
+        local_irq_restore(flags);
+}
+
+static u64 blk_mq_rq_aborted_gstate(struct request *rq)
+{
+        unsigned int start;
+        u64 aborted_gstate;
+
+        do {
+                start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
+                aborted_gstate = rq->aborted_gstate;
+        } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
+
+        return aborted_gstate;
+}
+
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq: the request being processed
@@ -590,8 +623,20 @@ void blk_mq_complete_request(struct request *rq)
         if (unlikely(blk_should_fake_timeout(q)))
                 return;
 
+        /*
+         * If @rq->aborted_gstate equals the current instance, timeout is
+         * claiming @rq and we lost. This is synchronized through
+         * hctx_lock(). See blk_mq_timeout_work() for details.
+         *
+         * Completion path never blocks and we can directly use RCU here
+         * instead of hctx_lock() which can be either RCU or SRCU.
+         * However, that would complicate paths which want to synchronize
+         * against us. Let stay in sync with the issue path so that
+         * hctx_lock() covers both issue and completion paths.
+         */
         hctx_lock(hctx, &srcu_idx);
-        if (!blk_mark_rq_complete(rq))
+        if (blk_mq_rq_aborted_gstate(rq) != rq->gstate &&
+            !blk_mark_rq_complete(rq))
                 __blk_mq_complete_request(rq);
         hctx_unlock(hctx, srcu_idx);
 }
@@ -617,34 +662,32 @@ void blk_mq_start_request(struct request *rq)
                 wbt_issue(q->rq_wb, &rq->issue_stat);
         }
 
-        blk_add_timer(rq);
-
+        WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
         WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
 
         /*
-         * Mark us as started and clear complete. Complete might have been
-         * set if requeue raced with timeout, which then marked it as
-         * complete. So be sure to clear complete again when we start
-         * the request, otherwise we'll ignore the completion event.
+         * Mark @rq in-flight which also advances the generation number,
+         * and register for timeout. Protect with a seqcount to allow the
+         * timeout path to read both @rq->gstate and @rq->deadline
+         * coherently.
          *
-         * Ensure that ->deadline is visible before we set STARTED, such that
-         * blk_mq_check_expired() is guaranteed to observe our ->deadline when
-         * it observes STARTED.
+         * This is the only place where a request is marked in-flight. If
+         * the timeout path reads an in-flight @rq->gstate, the
+         * @rq->deadline it reads together under @rq->gstate_seq is
+         * guaranteed to be the matching one.
          */
-        smp_wmb();
+        preempt_disable();
+        write_seqcount_begin(&rq->gstate_seq);
+
+        blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
+        blk_add_timer(rq);
+
+        write_seqcount_end(&rq->gstate_seq);
+        preempt_enable();
+
         set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-        if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
-                /*
-                 * Coherence order guarantees these consecutive stores to a
-                 * single variable propagate in the specified order. Thus the
-                 * clear_bit() is ordered _after_ the set bit. See
-                 * blk_mq_check_expired().
-                 *
-                 * (the bits must be part of the same byte for this to be
-                 * true).
-                 */
+        if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
                 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
-        }
 
         if (q->dma_drain_size && blk_rq_bytes(rq)) {
                 /*
@@ -677,6 +720,7 @@ static void __blk_mq_requeue_request(struct request *rq)
         blk_mq_sched_requeue_request(rq);
 
         if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+                blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
                 if (q->dma_drain_size && blk_rq_bytes(rq))
                         rq->nr_phys_segments--;
         }
@@ -774,6 +818,7 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq);
 struct blk_mq_timeout_data {
         unsigned long next;
         unsigned int next_set;
+        unsigned int nr_expired;
 };
 
 void blk_mq_rq_timed_out(struct request *req, bool reserved)
@@ -801,6 +846,12 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
                 __blk_mq_complete_request(req);
                 break;
         case BLK_EH_RESET_TIMER:
+                /*
+                 * As nothing prevents from completion happening while
+                 * ->aborted_gstate is set, this may lead to ignored
+                 * completions and further spurious timeouts.
+                 */
+                blk_mq_rq_update_aborted_gstate(req, 0);
                 blk_add_timer(req);
                 blk_clear_rq_complete(req);
                 break;
@@ -816,50 +867,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
                                  struct request *rq, void *priv, bool reserved)
 {
         struct blk_mq_timeout_data *data = priv;
-        unsigned long deadline;
+        unsigned long gstate, deadline;
+        int start;
+
+        might_sleep();
 
         if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
                 return;
 
-        /*
-         * Ensures that if we see STARTED we must also see our
-         * up-to-date deadline, see blk_mq_start_request().
-         */
-        smp_rmb();
-
-        deadline = READ_ONCE(rq->deadline);
+        /* read coherent snapshots of @rq->state_gen and @rq->deadline */
+        while (true) {
+                start = read_seqcount_begin(&rq->gstate_seq);
+                gstate = READ_ONCE(rq->gstate);
+                deadline = rq->deadline;
+                if (!read_seqcount_retry(&rq->gstate_seq, start))
+                        break;
+                cond_resched();
+        }
 
-        /*
-         * The rq being checked may have been freed and reallocated
-         * out already here, we avoid this race by checking rq->deadline
-         * and REQ_ATOM_COMPLETE flag together:
-         *
-         * - if rq->deadline is observed as new value because of
-         *   reusing, the rq won't be timed out because of timing.
-         * - if rq->deadline is observed as previous value,
-         *   REQ_ATOM_COMPLETE flag won't be cleared in reuse path
-         *   because we put a barrier between setting rq->deadline
-         *   and clearing the flag in blk_mq_start_request(), so
-         *   this rq won't be timed out too.
-         */
-        if (time_after_eq(jiffies, deadline)) {
-                if (!blk_mark_rq_complete(rq)) {
-                        /*
-                         * Again coherence order ensures that consecutive reads
-                         * from the same variable must be in that order. This
-                         * ensures that if we see COMPLETE clear, we must then
-                         * see STARTED set and we'll ignore this timeout.
-                         *
-                         * (There's also the MB implied by the test_and_clear())
-                         */
-                        blk_mq_rq_timed_out(rq, reserved);
-                }
+        /* if in-flight && overdue, mark for abortion */
+        if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
+            time_after_eq(jiffies, deadline)) {
+                blk_mq_rq_update_aborted_gstate(rq, gstate);
+                data->nr_expired++;
+                hctx->nr_expired++;
         } else if (!data->next_set || time_after(data->next, deadline)) {
                 data->next = deadline;
                 data->next_set = 1;
         }
 }
 
+static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+                struct request *rq, void *priv, bool reserved)
+{
+        /*
+         * We marked @rq->aborted_gstate and waited for RCU. If there were
+         * completions that we lost to, they would have finished and
+         * updated @rq->gstate by now; otherwise, the completion path is
+         * now guaranteed to see @rq->aborted_gstate and yield. If
+         * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+         */
+        if (READ_ONCE(rq->gstate) == rq->aborted_gstate &&
+            !blk_mark_rq_complete(rq))
+                blk_mq_rq_timed_out(rq, reserved);
+}
+
 static void blk_mq_timeout_work(struct work_struct *work)
 {
         struct request_queue *q =
@@ -867,7 +919,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
         struct blk_mq_timeout_data data = {
                 .next = 0,
                 .next_set = 0,
+                .nr_expired = 0,
         };
+        struct blk_mq_hw_ctx *hctx;
         int i;
 
         /* A deadlock might occur if a request is stuck requiring a
@@ -886,14 +940,40 @@ static void blk_mq_timeout_work(struct work_struct *work)
         if (!percpu_ref_tryget(&q->q_usage_counter))
                 return;
 
+        /* scan for the expired ones and set their ->aborted_gstate */
         blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 
+        if (data.nr_expired) {
+                bool has_rcu = false;
+
+                /*
+                 * Wait till everyone sees ->aborted_gstate. The
+                 * sequential waits for SRCUs aren't ideal. If this ever
+                 * becomes a problem, we can add per-hw_ctx rcu_head and
+                 * wait in parallel.
+                 */
+                queue_for_each_hw_ctx(q, hctx, i) {
+                        if (!hctx->nr_expired)
+                                continue;
+
+                        if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+                                has_rcu = true;
+                        else
+                                synchronize_srcu(hctx->queue_rq_srcu);
+
+                        hctx->nr_expired = 0;
+                }
+                if (has_rcu)
+                        synchronize_rcu();
+
+                /* terminate the ones we won */
+                blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+        }
+
         if (data.next_set) {
                 data.next = blk_rq_timeout(round_jiffies_up(data.next));
                 mod_timer(&q->timeout, data.next);
         } else {
-                struct blk_mq_hw_ctx *hctx;
-
                 queue_for_each_hw_ctx(q, hctx, i) {
                         /* the hctx may be unmapped, so check it here */
                         if (blk_mq_hw_queue_mapped(hctx))
@@ -1893,6 +1973,22 @@ static size_t order_to_size(unsigned int order)
         return (size_t)PAGE_SIZE << order;
 }
 
+static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+                               unsigned int hctx_idx, int node)
+{
+        int ret;
+
+        if (set->ops->init_request) {
+                ret = set->ops->init_request(set, rq, hctx_idx, node);
+                if (ret)
+                        return ret;
+        }
+
+        seqcount_init(&rq->gstate_seq);
+        u64_stats_init(&rq->aborted_gstate_sync);
+        return 0;
+}
+
 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                      unsigned int hctx_idx, unsigned int depth)
 {
@@ -1954,12 +2050,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                 struct request *rq = p;
 
                 tags->static_rqs[i] = rq;
-                if (set->ops->init_request) {
-                        if (set->ops->init_request(set, rq, hctx_idx,
-                                        node)) {
-                                tags->static_rqs[i] = NULL;
-                                goto fail;
-                        }
+                if (blk_mq_init_request(set, rq, hctx_idx, node)) {
+                        tags->static_rqs[i] = NULL;
+                        goto fail;
                 }
 
                 p += rq_size;
@@ -2099,9 +2192,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
         if (!hctx->fq)
                 goto sched_exit_hctx;
 
-        if (set->ops->init_request &&
-            set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
-                                   node))
+        if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
                 goto free_fq;
 
         if (hctx->flags & BLK_MQ_F_BLOCKING)
@@ -3019,12 +3110,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 
 static int __init blk_mq_init(void)
 {
-        /*
-         * See comment in block/blk.h rq_atomic_flags enum
-         */
-        BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
-                     (REQ_ATOM_COMPLETE / BITS_PER_BYTE));
-
         cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                 blk_mq_hctx_notify_dead);
         return 0;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 6c7c3ff5bf62..cf01f6f8c73d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -27,6 +27,19 @@ struct blk_mq_ctx {
         struct kobject kobj;
 } ____cacheline_aligned_in_smp;
 
+/*
+ * Bits for request->gstate. The lower two bits carry MQ_RQ_* state value
+ * and the upper bits the generation number.
+ */
+enum mq_rq_state {
+        MQ_RQ_IDLE = 0,
+        MQ_RQ_IN_FLIGHT = 1,
+
+        MQ_RQ_STATE_BITS = 2,
+        MQ_RQ_STATE_MASK = (1 << MQ_RQ_STATE_BITS) - 1,
+        MQ_RQ_GEN_INC = 1 << MQ_RQ_STATE_BITS,
+};
+
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -85,6 +98,39 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved);
 
 void blk_mq_release(struct request_queue *q);
 
+/**
+ * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
+ * @rq: target request.
+ */
+static inline int blk_mq_rq_state(struct request *rq)
+{
+        return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
+}
+
+/**
+ * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
+ * @rq: target request.
+ * @state: new state to set.
+ *
+ * Set @rq's state to @state. The caller is responsible for ensuring that
+ * there are no other updaters. A request can transition into IN_FLIGHT
+ * only from IDLE and doing so increments the generation number.
+ */
+static inline void blk_mq_rq_update_state(struct request *rq,
+                                          enum mq_rq_state state)
+{
+        u64 old_val = READ_ONCE(rq->gstate);
+        u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
+
+        if (state == MQ_RQ_IN_FLIGHT) {
+                WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
+                new_val += MQ_RQ_GEN_INC;
+        }
+
+        /* avoid exposing interim values */
+        WRITE_ONCE(rq->gstate, new_val);
+}
+
 static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
                                                   unsigned int cpu)
 {
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 764ecf9aeb30..6427be7ac363 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -208,7 +208,7 @@ void blk_add_timer(struct request *req)
         if (!req->timeout)
                 req->timeout = q->rq_timeout;
 
-        WRITE_ONCE(req->deadline, jiffies + req->timeout);
+        req->deadline = jiffies + req->timeout;
 
         /*
          * Only the non-mq case needs to add the request to a protected list.
diff --git a/block/blk.h b/block/blk.h
index 3f1446937aec..9cb2739edb6a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -123,12 +123,6 @@ void blk_account_io_done(struct request *req);
  * Internal atomic flags for request handling
  */
 enum rq_atomic_flags {
-        /*
-         * Keep these two bits first - not because we depend on the
-         * value of them, but we do depend on them being in the same
-         * byte of storage to ensure ordering on writes. Keeping them
-         * first will achieve that nicely.
-         */
         REQ_ATOM_COMPLETE = 0,
         REQ_ATOM_STARTED,
 