Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--  block/blk-mq.c  229
1 file changed, 157 insertions(+), 72 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f5e57c80a82b..156203876c8c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -483,6 +483,7 @@ void blk_mq_free_request(struct request *rq)
         if (blk_rq_rl(rq))
                 blk_put_rl(blk_rq_rl(rq));
 
+        blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
         clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
         if (rq->tag != -1)
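
The MQ_RQ_* helpers used throughout this patch (blk_mq_rq_state(), blk_mq_rq_update_state(), MQ_RQ_STATE_MASK) are presumably defined in the companion block/blk-mq.h change, which is outside this diffstat. As a rough sketch only, with assumed names and values (the EX_* identifiers below are invented for illustration), the idea is that the low bits of rq->gstate hold the state while the remaining bits form a generation counter that advances when a request goes in-flight:

/* Illustrative sketch, not the real blk-mq.h hunk; names and values are assumed. */
#define EX_RQ_STATE_BITS        2
#define EX_RQ_STATE_MASK        ((1ULL << EX_RQ_STATE_BITS) - 1)
#define EX_RQ_GEN_INC           (1ULL << EX_RQ_STATE_BITS)

enum { EX_RQ_IDLE = 0, EX_RQ_IN_FLIGHT = 1 };   /* assumed encoding */

static inline unsigned int ex_rq_state(u64 gstate)
{
        return gstate & EX_RQ_STATE_MASK;       /* state lives in the low bits */
}

static inline u64 ex_rq_update_state(u64 gstate, unsigned int state)
{
        gstate &= ~EX_RQ_STATE_MASK;
        if (state == EX_RQ_IN_FLIGHT)           /* going in-flight advances the generation */
                gstate += EX_RQ_GEN_INC;
        return gstate | state;
}
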
@@ -530,6 +531,8 @@ static void __blk_mq_complete_request(struct request *rq)
         bool shared = false;
         int cpu;
 
+        WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
+
         if (rq->internal_tag != -1)
                 blk_mq_sched_completed_request(rq);
         if (rq->rq_flags & RQF_STATS) {
@@ -573,6 +576,36 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
                 *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
 }
 
+static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
+{
+        unsigned long flags;
+
+        /*
+         * blk_mq_rq_aborted_gstate() is used from the completion path and
+         * can thus be called from irq context. u64_stats_fetch in the
+         * middle of update on the same CPU leads to lockup. Disable irq
+         * while updating.
+         */
+        local_irq_save(flags);
+        u64_stats_update_begin(&rq->aborted_gstate_sync);
+        rq->aborted_gstate = gstate;
+        u64_stats_update_end(&rq->aborted_gstate_sync);
+        local_irq_restore(flags);
+}
+
+static u64 blk_mq_rq_aborted_gstate(struct request *rq)
+{
+        unsigned int start;
+        u64 aborted_gstate;
+
+        do {
+                start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
+                aborted_gstate = rq->aborted_gstate;
+        } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
+
+        return aborted_gstate;
+}
+
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq: the request being processed
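
blk_mq_rq_update_aborted_gstate()/blk_mq_rq_aborted_gstate() above are a standard u64_stats_sync pairing: on 32-bit SMP a 64-bit load can tear against a concurrent store, so the reader retries until it sees an undisturbed value (on 64-bit the sync object compiles away). A minimal, self-contained sketch of the same pattern with made-up names:

#include <linux/u64_stats_sync.h>

struct foo_counter {                    /* hypothetical structure */
        u64 value;                      /* 64-bit value; loads may tear on 32-bit */
        struct u64_stats_sync sync;
};

static void foo_counter_init(struct foo_counter *c)
{
        c->value = 0;
        u64_stats_init(&c->sync);       /* one-time init, as blk_mq_init_request() does below */
}

static void foo_counter_set(struct foo_counter *c, u64 v)
{
        u64_stats_update_begin(&c->sync);       /* writer side */
        c->value = v;
        u64_stats_update_end(&c->sync);
}

static u64 foo_counter_read(struct foo_counter *c)
{
        unsigned int start;
        u64 v;

        do {                                    /* reader retries if a writer raced */
                start = u64_stats_fetch_begin(&c->sync);
                v = c->value;
        } while (u64_stats_fetch_retry(&c->sync, start));

        return v;
}
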
@@ -590,8 +623,20 @@ void blk_mq_complete_request(struct request *rq)
         if (unlikely(blk_should_fake_timeout(q)))
                 return;
 
+        /*
+         * If @rq->aborted_gstate equals the current instance, timeout is
+         * claiming @rq and we lost. This is synchronized through
+         * hctx_lock(). See blk_mq_timeout_work() for details.
+         *
+         * Completion path never blocks and we can directly use RCU here
+         * instead of hctx_lock() which can be either RCU or SRCU.
+         * However, that would complicate paths which want to synchronize
+         * against us. Let's stay in sync with the issue path so that
+         * hctx_lock() covers both issue and completion paths.
+         */
         hctx_lock(hctx, &srcu_idx);
-        if (!blk_mark_rq_complete(rq))
+        if (blk_mq_rq_aborted_gstate(rq) != rq->gstate &&
+            !blk_mark_rq_complete(rq))
                 __blk_mq_complete_request(rq);
         hctx_unlock(hctx, srcu_idx);
 }
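
The hctx_lock()/hctx_unlock() pair the comment relies on is only partially visible in the context above. Judging from that context line and the BLK_MQ_F_BLOCKING test in blk_mq_timeout_work() further down, it takes plain RCU for non-blocking hardware queues and SRCU otherwise; the sketch below is a reconstruction consistent with those lines, not the verbatim helpers.

/* Sketch only; reconstructed from the context lines, not the verbatim helpers. */
static void hctx_lock_sketch(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
{
        if (!(hctx->flags & BLK_MQ_F_BLOCKING))
                rcu_read_lock();                /* ->queue_rq() never sleeps here */
        else
                *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
}

static void hctx_unlock_sketch(struct blk_mq_hw_ctx *hctx, int srcu_idx)
{
        if (!(hctx->flags & BLK_MQ_F_BLOCKING))
                rcu_read_unlock();
        else
                srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
}

Because blk_mq_timeout_work() below calls synchronize_rcu()/synchronize_srcu() after setting ->aborted_gstate, any completion that entered this read-side section before the claim has finished by the time the timeout path re-checks the generation.
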
@@ -617,34 +662,32 @@ void blk_mq_start_request(struct request *rq)
                 wbt_issue(q->rq_wb, &rq->issue_stat);
         }
 
-        blk_add_timer(rq);
-
+        WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
         WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
 
         /*
-         * Mark us as started and clear complete. Complete might have been
-         * set if requeue raced with timeout, which then marked it as
-         * complete. So be sure to clear complete again when we start
-         * the request, otherwise we'll ignore the completion event.
+         * Mark @rq in-flight which also advances the generation number,
+         * and register for timeout. Protect with a seqcount to allow the
+         * timeout path to read both @rq->gstate and @rq->deadline
+         * coherently.
          *
-         * Ensure that ->deadline is visible before we set STARTED, such that
-         * blk_mq_check_expired() is guaranteed to observe our ->deadline when
-         * it observes STARTED.
+         * This is the only place where a request is marked in-flight. If
+         * the timeout path reads an in-flight @rq->gstate, the
+         * @rq->deadline it reads together under @rq->gstate_seq is
+         * guaranteed to be the matching one.
         */
-        smp_wmb();
+        preempt_disable();
+        write_seqcount_begin(&rq->gstate_seq);
+
+        blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
+        blk_add_timer(rq);
+
+        write_seqcount_end(&rq->gstate_seq);
+        preempt_enable();
+
         set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-        if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
-                /*
-                 * Coherence order guarantees these consecutive stores to a
-                 * single variable propagate in the specified order. Thus the
-                 * clear_bit() is ordered _after_ the set bit. See
-                 * blk_mq_check_expired().
-                 *
-                 * (the bits must be part of the same byte for this to be
-                 * true).
-                 */
+        if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
                 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
-        }
 
         if (q->dma_drain_size && blk_rq_bytes(rq)) {
                 /*
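
The write_seqcount_begin()/write_seqcount_end() pair above publishes @rq->gstate and @rq->deadline as one coherent update; blk_mq_check_expired() below is the matching reader. A minimal sketch of that seqcount pairing on a hypothetical structure:

#include <linux/seqlock.h>

struct item {                           /* hypothetical; mirrors gstate_seq/gstate/deadline */
        seqcount_t seq;
        u64 gen;
        unsigned long deadline;
};

static void item_publish(struct item *it, unsigned long deadline)
{
        preempt_disable();              /* keep the write section short, as in the hunk above */
        write_seqcount_begin(&it->seq);
        it->gen++;                      /* both fields change inside one write section */
        it->deadline = deadline;
        write_seqcount_end(&it->seq);
        preempt_enable();
}

static void item_snapshot(struct item *it, u64 *gen, unsigned long *deadline)
{
        unsigned int start;

        do {                            /* retry if a writer overlapped the reads */
                start = read_seqcount_begin(&it->seq);
                *gen = it->gen;
                *deadline = it->deadline;
        } while (read_seqcount_retry(&it->seq, start));
}
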
@@ -677,6 +720,7 @@ static void __blk_mq_requeue_request(struct request *rq)
         blk_mq_sched_requeue_request(rq);
 
         if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+                blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
                 if (q->dma_drain_size && blk_rq_bytes(rq))
                         rq->nr_phys_segments--;
         }
@@ -774,6 +818,7 @@ EXPORT_SYMBOL(blk_mq_tag_to_rq);
 struct blk_mq_timeout_data {
         unsigned long next;
         unsigned int next_set;
+        unsigned int nr_expired;
 };
 
 void blk_mq_rq_timed_out(struct request *req, bool reserved)
@@ -801,6 +846,12 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
                 __blk_mq_complete_request(req);
                 break;
         case BLK_EH_RESET_TIMER:
+                /*
+                 * As nothing prevents completion from happening while
+                 * ->aborted_gstate is set, this may lead to ignored
+                 * completions and further spurious timeouts.
+                 */
+                blk_mq_rq_update_aborted_gstate(req, 0);
                 blk_add_timer(req);
                 blk_clear_rq_complete(req);
                 break;
@@ -816,50 +867,51 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
                 struct request *rq, void *priv, bool reserved)
 {
         struct blk_mq_timeout_data *data = priv;
-        unsigned long deadline;
+        unsigned long gstate, deadline;
+        int start;
+
+        might_sleep();
 
         if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
                 return;
 
-        /*
-         * Ensures that if we see STARTED we must also see our
-         * up-to-date deadline, see blk_mq_start_request().
-         */
-        smp_rmb();
-
-        deadline = READ_ONCE(rq->deadline);
+        /* read coherent snapshots of @rq->gstate and @rq->deadline */
+        while (true) {
+                start = read_seqcount_begin(&rq->gstate_seq);
+                gstate = READ_ONCE(rq->gstate);
+                deadline = rq->deadline;
+                if (!read_seqcount_retry(&rq->gstate_seq, start))
+                        break;
+                cond_resched();
+        }
 
-        /*
-         * The rq being checked may have been freed and reallocated
-         * out already here, we avoid this race by checking rq->deadline
-         * and REQ_ATOM_COMPLETE flag together:
-         *
-         * - if rq->deadline is observed as new value because of
-         *   reusing, the rq won't be timed out because of timing.
-         * - if rq->deadline is observed as previous value,
-         *   REQ_ATOM_COMPLETE flag won't be cleared in reuse path
-         *   because we put a barrier between setting rq->deadline
-         *   and clearing the flag in blk_mq_start_request(), so
-         *   this rq won't be timed out too.
-         */
-        if (time_after_eq(jiffies, deadline)) {
-                if (!blk_mark_rq_complete(rq)) {
-                        /*
-                         * Again coherence order ensures that consecutive reads
-                         * from the same variable must be in that order. This
-                         * ensures that if we see COMPLETE clear, we must then
-                         * see STARTED set and we'll ignore this timeout.
-                         *
-                         * (There's also the MB implied by the test_and_clear())
-                         */
-                        blk_mq_rq_timed_out(rq, reserved);
-                }
+        /* if in-flight && overdue, mark for abortion */
+        if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
+            time_after_eq(jiffies, deadline)) {
+                blk_mq_rq_update_aborted_gstate(rq, gstate);
+                data->nr_expired++;
+                hctx->nr_expired++;
         } else if (!data->next_set || time_after(data->next, deadline)) {
                 data->next = deadline;
                 data->next_set = 1;
         }
 }
 
+static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+                struct request *rq, void *priv, bool reserved)
+{
+        /*
+         * We marked @rq->aborted_gstate and waited for RCU. If there were
+         * completions that we lost to, they would have finished and
+         * updated @rq->gstate by now; otherwise, the completion path is
+         * now guaranteed to see @rq->aborted_gstate and yield. If
+         * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+         */
+        if (READ_ONCE(rq->gstate) == rq->aborted_gstate &&
+            !blk_mark_rq_complete(rq))
+                blk_mq_rq_timed_out(rq, reserved);
+}
+
 static void blk_mq_timeout_work(struct work_struct *work)
 {
         struct request_queue *q =
@@ -867,7 +919,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
         struct blk_mq_timeout_data data = {
                 .next = 0,
                 .next_set = 0,
+                .nr_expired = 0,
         };
+        struct blk_mq_hw_ctx *hctx;
         int i;
 
         /* A deadlock might occur if a request is stuck requiring a
@@ -886,14 +940,40 @@ static void blk_mq_timeout_work(struct work_struct *work)
         if (!percpu_ref_tryget(&q->q_usage_counter))
                 return;
 
+        /* scan for the expired ones and set their ->aborted_gstate */
         blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 
+        if (data.nr_expired) {
+                bool has_rcu = false;
+
+                /*
+                 * Wait till everyone sees ->aborted_gstate. The
+                 * sequential waits for SRCUs aren't ideal. If this ever
+                 * becomes a problem, we can add per-hw_ctx rcu_head and
+                 * wait in parallel.
+                 */
+                queue_for_each_hw_ctx(q, hctx, i) {
+                        if (!hctx->nr_expired)
+                                continue;
+
+                        if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+                                has_rcu = true;
+                        else
+                                synchronize_srcu(hctx->queue_rq_srcu);
+
+                        hctx->nr_expired = 0;
+                }
+                if (has_rcu)
+                        synchronize_rcu();
+
+                /* terminate the ones we won */
+                blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+        }
+
         if (data.next_set) {
                 data.next = blk_rq_timeout(round_jiffies_up(data.next));
                 mod_timer(&q->timeout, data.next);
         } else {
-                struct blk_mq_hw_ctx *hctx;
-
                 queue_for_each_hw_ctx(q, hctx, i) {
                         /* the hctx may be unmapped, so check it here */
                         if (blk_mq_hw_queue_mapped(hctx))
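
Taken together, the timeout path claims a request in three steps: record the generation it intends to abort, wait an RCU/SRCU grace period so every completion that raced with the scan has either finished or is guaranteed to see the claim, and then time out only the requests whose generation is still unchanged. A toy model of that flow (hypothetical type, not blk-mq code):

#include <linux/rcupdate.h>

struct toy_req {
        u64 gstate;             /* advances when the request is completed and reissued */
        u64 aborted_gstate;     /* generation the timeout path has claimed */
};

/* completion side: runs inside an RCU read-side section (cf. hctx_lock()) */
static void toy_complete(struct toy_req *rq)
{
        rcu_read_lock();
        if (READ_ONCE(rq->aborted_gstate) != rq->gstate) {
                /* we won the race: complete; a later reissue bumps ->gstate */
        }
        rcu_read_unlock();
}

/* timeout side (cf. blk_mq_timeout_work() + blk_mq_terminate_expired()) */
static void toy_timeout(struct toy_req *rq)
{
        WRITE_ONCE(rq->aborted_gstate, rq->gstate);     /* 1. claim this generation */
        synchronize_rcu();                              /* 2. racing completions have drained */
        if (READ_ONCE(rq->gstate) == rq->aborted_gstate) {
                /* 3. nothing completed it meanwhile; the timeout owns the request */
        }
}

In the real code the claim is stored with the u64_stats helpers shown earlier rather than a plain WRITE_ONCE(), and the BLK_EH_RESET_TIMER case drops the claim again by writing 0.
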
@@ -1893,6 +1973,22 @@ static size_t order_to_size(unsigned int order)
         return (size_t)PAGE_SIZE << order;
 }
 
+static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+                unsigned int hctx_idx, int node)
+{
+        int ret;
+
+        if (set->ops->init_request) {
+                ret = set->ops->init_request(set, rq, hctx_idx, node);
+                if (ret)
+                        return ret;
+        }
+
+        seqcount_init(&rq->gstate_seq);
+        u64_stats_init(&rq->aborted_gstate_sync);
+        return 0;
+}
+
 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                      unsigned int hctx_idx, unsigned int depth)
 {
@@ -1954,12 +2050,9 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                         struct request *rq = p;
 
                         tags->static_rqs[i] = rq;
-                        if (set->ops->init_request) {
-                                if (set->ops->init_request(set, rq, hctx_idx,
-                                                node)) {
-                                        tags->static_rqs[i] = NULL;
-                                        goto fail;
-                                }
+                        if (blk_mq_init_request(set, rq, hctx_idx, node)) {
+                                tags->static_rqs[i] = NULL;
+                                goto fail;
                         }
 
                         p += rq_size;
@@ -2099,9 +2192,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
         if (!hctx->fq)
                 goto sched_exit_hctx;
 
-        if (set->ops->init_request &&
-            set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
-                                   node))
+        if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
                 goto free_fq;
 
         if (hctx->flags & BLK_MQ_F_BLOCKING)
@@ -3019,12 +3110,6 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 
 static int __init blk_mq_init(void)
 {
-        /*
-         * See comment in block/blk.h rq_atomic_flags enum
-         */
-        BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
-                     (REQ_ATOM_COMPLETE / BITS_PER_BYTE));
-
         cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                 blk_mq_hctx_notify_dead);
         return 0;