 block/bfq-iosched.c | 270
 block/bfq-iosched.h |  25
 2 files changed, 261 insertions(+), 34 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index d5bc32371ace..9e2fbb7d1fb6 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -157,6 +157,7 @@ BFQ_BFQQ_FNS(in_large_burst);
 BFQ_BFQQ_FNS(coop);
 BFQ_BFQQ_FNS(split_coop);
 BFQ_BFQQ_FNS(softrt_update);
+BFQ_BFQQ_FNS(has_waker);
 #undef BFQ_BFQQ_FNS						\
 
 /* Expiration time of sync (0) and async (1) requests, in ns. */
@@ -1815,6 +1816,111 @@ static void bfq_add_request(struct request *rq)
 
 	if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
 		/*
+		 * Detect whether bfqq's I/O seems synchronized with
+		 * that of some other queue, i.e., whether bfqq, after
+		 * remaining empty, happens to receive new I/O only
+		 * right after some I/O request of the other queue has
+		 * been completed. We call waker queue the other
+		 * queue, and we assume, for simplicity, that bfqq may
+		 * have at most one waker queue.
+		 *
+		 * A remarkable throughput boost can be reached by
+		 * unconditionally injecting the I/O of the waker
+		 * queue, every time a new bfq_dispatch_request
+		 * happens to be invoked while I/O is being plugged
+		 * for bfqq. In addition to boosting throughput, this
+		 * unblocks bfqq's I/O, thereby improving bandwidth
+		 * and latency for bfqq. Note that these same results
+		 * may be achieved with the general injection
+		 * mechanism, but less effectively. For details on
+		 * this aspect, see the comments on the choice of the
+		 * queue for injection in bfq_select_queue().
+		 *
+		 * Turning back to the detection of a waker queue, a
+		 * queue Q is deemed as a waker queue for bfqq if, for
+		 * two consecutive times, bfqq happens to become non
+		 * empty right after a request of Q has been
+		 * completed. In particular, on the first time, Q is
+		 * tentatively set as a candidate waker queue, while
+		 * on the second time, the flag
+		 * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
+		 * is a waker queue for bfqq. These detection steps
+		 * are performed only if bfqq has a long think time,
+		 * so as to make it more likely that bfqq's I/O is
+		 * actually being blocked by a synchronization. This
+		 * last filter, plus the above two-times requirement,
+		 * make false positives less likely.
+		 *
+		 * NOTE
+		 *
+		 * The sooner a waker queue is detected, the sooner
+		 * throughput can be boosted by injecting I/O from the
+		 * waker queue. Fortunately, detection is likely to be
+		 * actually fast, for the following reasons. While
+		 * blocked by synchronization, bfqq has a long think
+		 * time. This implies that bfqq's inject limit is at
+		 * least equal to 1 (see the comments in
+		 * bfq_update_inject_limit()). So, thanks to
+		 * injection, the waker queue is likely to be served
+		 * during the very first I/O-plugging time interval
+		 * for bfqq. This triggers the first step of the
+		 * detection mechanism. Thanks again to injection, the
+		 * candidate waker queue is then likely to be
+		 * confirmed no later than during the next
+		 * I/O-plugging interval for bfqq.
+		 */
+		if (!bfq_bfqq_has_short_ttime(bfqq) &&
+		    ktime_get_ns() - bfqd->last_completion <
+		    200 * NSEC_PER_USEC) {
+			if (bfqd->last_completed_rq_bfqq != bfqq &&
+			    bfqd->last_completed_rq_bfqq !=
+			    bfqq->waker_bfqq) {
+				/*
+				 * First synchronization detected with
+				 * a candidate waker queue, or with a
+				 * different candidate waker queue
+				 * from the current one.
+				 */
+				bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
+
+				/*
+				 * If the waker queue disappears, then
+				 * bfqq->waker_bfqq must be reset. To
+				 * this goal, we maintain in each
+				 * waker queue a list, woken_list, of
+				 * all the queues that reference the
+				 * waker queue through their
+				 * waker_bfqq pointer. When the waker
+				 * queue exits, the waker_bfqq pointer
+				 * of all the queues in the woken_list
+				 * is reset.
+				 *
+				 * In addition, if bfqq is already in
+				 * the woken_list of a waker queue,
+				 * then, before being inserted into
+				 * the woken_list of a new waker
+				 * queue, bfqq must be removed from
+				 * the woken_list of the old waker
+				 * queue.
+				 */
+				if (!hlist_unhashed(&bfqq->woken_list_node))
+					hlist_del_init(&bfqq->woken_list_node);
+				hlist_add_head(&bfqq->woken_list_node,
+					       &bfqd->last_completed_rq_bfqq->woken_list);
+
+				bfq_clear_bfqq_has_waker(bfqq);
+			} else if (bfqd->last_completed_rq_bfqq ==
+				   bfqq->waker_bfqq &&
+				   !bfq_bfqq_has_waker(bfqq)) {
+				/*
+				 * synchronization with waker_bfqq
+				 * seen for the second time
+				 */
+				bfq_mark_bfqq_has_waker(bfqq);
+			}
+		}
+
+		/*
 		 * Periodically reset inject limit, to make sure that
 		 * the latter eventually drops in case workload
 		 * changes, see step (3) in the comments on
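
Stripped of the kernel plumbing, the detection added above is a two-step state machine. The following is a minimal userspace C sketch of just that state machine (the struct, field and function names are illustrative stand-ins, not kernel definitions; the real code additionally gates detection on a long think time and on the 200 us completion window):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the few bfq_queue fields involved. */
struct queue {
	const char *name;
	struct queue *waker;	/* plays the role of waker_bfqq */
	bool has_waker;		/* plays the role of BFQQF_has_waker */
};

/*
 * Model of the two-step detection: called when a queue becomes
 * non-empty right after a request of 'last_completed' completes.
 */
static void detect_waker(struct queue *bfqq, struct queue *last_completed)
{
	if (last_completed != bfqq && last_completed != bfqq->waker) {
		/* step 1: record (or replace) the candidate waker */
		bfqq->waker = last_completed;
		bfqq->has_waker = false;
	} else if (last_completed == bfqq->waker && !bfqq->has_waker) {
		/* step 2: same queue seen twice in a row: confirm it */
		bfqq->has_waker = true;
	}
}

int main(void)
{
	struct queue q = { "bfqq", NULL, false };
	struct queue w = { "waker", NULL, false };

	detect_waker(&q, &w);	/* w becomes the candidate */
	detect_waker(&q, &w);	/* w is confirmed */
	printf("confirmed=%d waker=%s\n", q.has_waker, q.waker->name);
	return 0;
}
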
@@ -4164,18 +4270,89 @@ check_queue:
 			bfqq->bic->bfqq[0] : NULL;
 
 		/*
-		 * If the process associated with bfqq has also async
-		 * I/O pending, then inject it
-		 * unconditionally. Injecting I/O from the same
-		 * process can cause no harm to the process. On the
-		 * contrary, it can only increase bandwidth and reduce
-		 * latency for the process.
+		 * The next three mutually-exclusive ifs decide
+		 * whether to try injection, and choose the queue to
+		 * pick an I/O request from.
+		 *
+		 * The first if checks whether the process associated
+		 * with bfqq has also async I/O pending. If so, it
+		 * injects such I/O unconditionally. Injecting async
+		 * I/O from the same process can cause no harm to the
+		 * process. On the contrary, it can only increase
+		 * bandwidth and reduce latency for the process.
+		 *
+		 * The second if checks whether there happens to be a
+		 * non-empty waker queue for bfqq, i.e., a queue whose
+		 * I/O needs to be completed for bfqq to receive new
+		 * I/O. This happens, e.g., if bfqq is associated with
+		 * a process that does some sync. A sync generates
+		 * extra blocking I/O, which must be completed before
+		 * the process associated with bfqq can go on with its
+		 * I/O. If the I/O of the waker queue is not served,
+		 * then bfqq remains empty, and no I/O is dispatched,
+		 * until the idle timeout fires for bfqq. This is
+		 * likely to result in lower bandwidth and higher
+		 * latencies for bfqq, and in a severe loss of total
+		 * throughput. The best action to take is therefore to
+		 * serve the waker queue as soon as possible. So do it
+		 * (without relying on the third alternative below for
+		 * eventually serving waker_bfqq's I/O; see the last
+		 * paragraph for further details). This systematic
+		 * injection of I/O from the waker queue does not
+		 * cause any delay to bfqq's I/O. On the contrary,
+		 * next bfqq's I/O is brought forward dramatically,
+		 * for it is not blocked for milliseconds.
+		 *
+		 * The third if checks whether bfqq is a queue for
+		 * which it is better to avoid injection. It is so if
+		 * bfqq delivers more throughput when served without
+		 * any further I/O from other queues in the middle, or
+		 * if the service times of bfqq's I/O requests both
+		 * count more than overall throughput, and may be
+		 * easily increased by injection (this happens if bfqq
+		 * has a short think time). If none of these
+		 * conditions holds, then a candidate queue for
+		 * injection is looked for through
+		 * bfq_choose_bfqq_for_injection(). Note that the
+		 * latter may return NULL (for example if the inject
+		 * limit for bfqq is currently 0).
+		 *
+		 * NOTE: motivation for the second alternative
+		 *
+		 * Thanks to the way the inject limit is updated in
+		 * bfq_update_has_short_ttime(), it is rather likely
+		 * that, if I/O is being plugged for bfqq and the
+		 * waker queue has pending I/O requests that are
+		 * blocking bfqq's I/O, then the third alternative
+		 * above lets the waker queue get served before the
+		 * I/O-plugging timeout fires. So one may deem the
+		 * second alternative superfluous. It is not, because
+		 * the third alternative may be way less effective in
+		 * case of a synchronization. For two main
+		 * reasons. First, throughput may be low because the
+		 * inject limit may be too low to guarantee the same
+		 * amount of injected I/O, from the waker queue or
+		 * other queues, that the second alternative
+		 * guarantees (the second alternative unconditionally
+		 * injects a pending I/O request of the waker queue
+		 * for each bfq_dispatch_request()). Second, with the
+		 * third alternative, the duration of the plugging,
+		 * i.e., the time before bfqq finally receives new I/O,
+		 * may not be minimized, because the waker queue may
+		 * happen to be served only after other queues.
 		 */
 		if (async_bfqq &&
 		    icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
 		    bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
 		    bfq_bfqq_budget_left(async_bfqq))
 			bfqq = bfqq->bic->bfqq[0];
+		else if (bfq_bfqq_has_waker(bfqq) &&
+			   bfq_bfqq_busy(bfqq->waker_bfqq) &&
+			   bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
+					      bfqq->waker_bfqq) <=
+			   bfq_bfqq_budget_left(bfqq->waker_bfqq)
+			)
+			bfqq = bfqq->waker_bfqq;
 		else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
 			 (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
 			  !bfq_bfqq_has_short_ttime(bfqq)))
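
In compressed form, the three alternatives reduce to the priority order below. This sketch uses boolean parameters as hypothetical stand-ins for the kernel-side busy and budget checks (each "fits" flag corresponds to a bfq_serv_to_charge() <= bfq_bfqq_budget_left() test):

#include <stdbool.h>
#include <stddef.h>

struct queue;

/* Hypothetical condensation of the three mutually-exclusive ifs. */
static struct queue *pick_source(struct queue *bfqq,
				 struct queue *async_bfqq, bool async_fits,
				 struct queue *waker_bfqq, bool waker_fits,
				 struct queue *inject_candidate,
				 bool injection_makes_sense)
{
	if (async_bfqq && async_fits)
		return async_bfqq;	 /* 1: same-process async I/O */
	if (waker_bfqq && waker_fits)
		return waker_bfqq;	 /* 2: confirmed, busy waker queue */
	if (injection_makes_sense)
		return inject_candidate; /* 3: generic injection (may be NULL) */
	return bfqq;			 /* no injection: keep serving bfqq */
}
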
@@ -4564,6 +4741,9 @@ static void bfq_put_cooperator(struct bfq_queue *bfqq)
 
 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
+	struct bfq_queue *item;
+	struct hlist_node *n;
+
 	if (bfqq == bfqd->in_service_queue) {
 		__bfq_bfqq_expire(bfqd, bfqq);
 		bfq_schedule_dispatch(bfqd);
@@ -4573,6 +4753,18 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 
 	bfq_put_cooperator(bfqq);
 
+	/* remove bfqq from woken list */
+	if (!hlist_unhashed(&bfqq->woken_list_node))
+		hlist_del_init(&bfqq->woken_list_node);
+
+	/* reset waker for all queues in woken list */
+	hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
+				  woken_list_node) {
+		item->waker_bfqq = NULL;
+		bfq_clear_bfqq_has_waker(item);
+		hlist_del_init(&item->woken_list_node);
+	}
+
 	bfq_put_queue(bfqq); /* release process reference */
 }
 
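
The cleanup just added has two parts: drop bfqq from the woken_list it may sit on, then sever every back-reference to bfqq. A minimal userspace model follows, with a plain singly linked list standing in for the kernel hlist (an assumption made for brevity):

#include <stdbool.h>
#include <stddef.h>

struct queue {
	struct queue *waker;		/* like waker_bfqq */
	bool has_waker;			/* like BFQQF_has_waker */
	struct queue *woken_next;	/* link in the waker's woken list */
	struct queue *woken_head;	/* head of the queues this one wakes */
};

static void exit_queue(struct queue *bfqq)
{
	struct queue *item, *next, **pp;

	/* remove bfqq from its waker's woken list, if any */
	if (bfqq->waker) {
		for (pp = &bfqq->waker->woken_head; *pp;
		     pp = &(*pp)->woken_next) {
			if (*pp == bfqq) {
				*pp = bfqq->woken_next;
				break;
			}
		}
		bfqq->woken_next = NULL;
	}

	/* reset the waker of every queue bfqq used to wake */
	for (item = bfqq->woken_head; item; item = next) {
		next = item->woken_next;
		item->waker = NULL;
		item->has_waker = false;
		item->woken_next = NULL;
	}
	bfqq->woken_head = NULL;
}
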
@@ -4691,6 +4883,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	RB_CLEAR_NODE(&bfqq->entity.rb_node);
 	INIT_LIST_HEAD(&bfqq->fifo);
 	INIT_HLIST_NODE(&bfqq->burst_list_node);
+	INIT_HLIST_NODE(&bfqq->woken_list_node);
+	INIT_HLIST_HEAD(&bfqq->woken_list);
 
 	bfqq->ref = 0;
 	bfqq->bfqd = bfqd;
@@ -4909,28 +5103,27 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
 	 * bfqq may have a long think time because of a
 	 * synchronization with some other queue, i.e., because the
 	 * I/O of some other queue may need to be completed for bfqq
-	 * to receive new I/O. This happens, e.g., if bfqq is
-	 * associated with a process that does some sync. A sync
-	 * generates extra blocking I/O, which must be completed
-	 * before the process associated with bfqq can go on with its
-	 * I/O.
+	 * to receive new I/O. Details in the comments on the choice
+	 * of the queue for injection in bfq_select_queue().
 	 *
-	 * If such a synchronization is actually in place, then,
-	 * without injection on bfqq, the blocking I/O cannot happen
-	 * to served while bfqq is in service. As a consequence, if
-	 * bfqq is granted I/O-dispatch-plugging, then bfqq remains
-	 * empty, and no I/O is dispatched, until the idle timeout
-	 * fires. This is likely to result in lower bandwidth and
-	 * higher latencies for bfqq, and in a severe loss of total
-	 * throughput.
+	 * As stressed in those comments, if such a synchronization is
+	 * actually in place, then, without injection on bfqq, the
+	 * blocking I/O cannot happen to served while bfqq is in
+	 * service. As a consequence, if bfqq is granted
+	 * I/O-dispatch-plugging, then bfqq remains empty, and no I/O
+	 * is dispatched, until the idle timeout fires. This is likely
+	 * to result in lower bandwidth and higher latencies for bfqq,
+	 * and in a severe loss of total throughput.
 	 *
 	 * On the opposite end, a non-zero inject limit may allow the
 	 * I/O that blocks bfqq to be executed soon, and therefore
-	 * bfqq to receive new I/O soon. But, if this actually
-	 * happens, then the next think-time sample for bfqq may be
-	 * very low. This in turn may cause bfqq's think time to be
-	 * deemed short. Without the 100 ms barrier, this new state
-	 * change would cause the body of the next if to be executed
+	 * bfqq to receive new I/O soon.
+	 *
+	 * But, if the blocking gets actually eliminated, then the
+	 * next think-time sample for bfqq may be very low. This in
+	 * turn may cause bfqq's think time to be deemed
+	 * short. Without the 100 ms barrier, this new state change
+	 * would cause the body of the next if to be executed
 	 * immediately. But this would set to 0 the inject
 	 * limit. Without injection, the blocking I/O would cause the
 	 * think time of bfqq to become long again, and therefore the
@@ -4941,11 +5134,11 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
 	 * In contrast, if the inject limit is not reset during such a
 	 * long time interval as 100 ms, then the number of short
 	 * think time samples can grow significantly before the reset
-	 * is allowed. As a consequence, the think time state can
-	 * become stable before the reset. There will be no state
-	 * change when the 100 ms elapse, and therefore no reset of
-	 * the inject limit. The inject limit remains steadily equal
-	 * to 1 both during and after the 100 ms. So injection can be
+	 * is performed. As a consequence, the think time state can
+	 * become stable before the reset. Therefore there will be no
+	 * state change when the 100 ms elapse, and no reset of the
+	 * inject limit. The inject limit remains steadily equal to 1
+	 * both during and after the 100 ms. So injection can be
 	 * performed at all times, and throughput gets boosted.
 	 *
 	 * An inject limit equal to 1 is however in conflict, in
@@ -4960,10 +5153,20 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
 	 * brought forward, because it is not blocked for
 	 * milliseconds.
 	 *
-	 * In addition, during the 100 ms, the base value for the
-	 * total service time is likely to get finally computed,
-	 * freeing the inject limit from its relation with the think
-	 * time.
+	 * In addition, serving the blocking I/O much sooner, and much
+	 * more frequently than once per I/O-plugging timeout, makes
+	 * it much quicker to detect a waker queue (the concept of
+	 * waker queue is defined in the comments in
+	 * bfq_add_request()). This makes it possible to start sooner
+	 * to boost throughput more effectively, by injecting the I/O
+	 * of the waker queue unconditionally on every
+	 * bfq_dispatch_request().
+	 *
+	 * One last, important benefit of not resetting the inject
+	 * limit before 100 ms is that, during this time interval, the
+	 * base value for the total service time is likely to get
+	 * finally computed for bfqq, freeing the inject limit from
+	 * its relation with the think time.
 	 */
 	if (state_changed && bfqq->last_serv_time_ns == 0 &&
 	    (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
@@ -5278,6 +5481,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 			1UL<<(BFQ_RATE_SHIFT - 10))
 		bfq_update_rate_reset(bfqd, NULL);
 	bfqd->last_completion = now_ns;
+	bfqd->last_completed_rq_bfqq = bfqq;
 
 	/*
 	 * If we are waiting to discover whether the request pattern
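
The completion-side bookkeeping that feeds the detection in bfq_add_request() is small enough to summarize in a sketch (the types and signatures are simplified assumptions; only the 200 us constant and the two recorded fields come from the patch, and NSEC_PER_USEC is 1000 as in the kernel's time64 definitions):

#include <stdbool.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL

struct queue;

/* The two facts bfq_completed_request() records (sketch). */
struct sched_data {
	uint64_t last_completion;		/* completion time, ns */
	struct queue *last_completed_rq_bfqq;	/* owner of that request */
};

static void on_completion(struct sched_data *d, struct queue *q,
			  uint64_t now_ns)
{
	d->last_completion = now_ns;
	d->last_completed_rq_bfqq = q;
}

/* Freshness test used by the waker detection in bfq_add_request(). */
static bool completion_is_fresh(const struct sched_data *d, uint64_t now_ns)
{
	return now_ns - d->last_completion < 200 * NSEC_PER_USEC;
}
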
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 584d3c9ed8ba..e80adf822bbe 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -357,6 +357,24 @@ struct bfq_queue {
 
 	/* max service rate measured so far */
 	u32 max_service_rate;
+
+	/*
+	 * Pointer to the waker queue for this queue, i.e., to the
+	 * queue Q such that this queue happens to get new I/O right
+	 * after some I/O request of Q is completed. For details, see
+	 * the comments on the choice of the queue for injection in
+	 * bfq_select_queue().
+	 */
+	struct bfq_queue *waker_bfqq;
+	/* node for woken_list, see below */
+	struct hlist_node woken_list_node;
+	/*
+	 * Head of the list of the woken queues for this queue, i.e.,
+	 * of the list of the queues for which this queue is a waker
+	 * queue. This list is used to reset the waker_bfqq pointer in
+	 * the woken queues when this queue exits.
+	 */
+	struct hlist_head woken_list;
 };
 
 /**
@@ -533,6 +551,9 @@ struct bfq_data {
 	/* time of last request completion (ns) */
 	u64 last_completion;
 
+	/* bfqq owning the last completed rq */
+	struct bfq_queue *last_completed_rq_bfqq;
+
 	/* time of last transition from empty to non-empty (ns) */
 	u64 last_empty_occupied_ns;
 
@@ -743,7 +764,8 @@ enum bfqq_state_flags {
 	 * update
 	 */
 	BFQQF_coop,		/* bfqq is shared */
-	BFQQF_split_coop	/* shared bfqq will be split */
+	BFQQF_split_coop,	/* shared bfqq will be split */
+	BFQQF_has_waker		/* bfqq has a waker queue */
 };
 
 #define BFQ_BFQQ_FNS(name)						\
@@ -763,6 +785,7 @@ BFQ_BFQQ_FNS(in_large_burst);
 BFQ_BFQQ_FNS(coop);
 BFQ_BFQQ_FNS(split_coop);
 BFQ_BFQQ_FNS(softrt_update);
+BFQ_BFQQ_FNS(has_waker);
 #undef BFQ_BFQQ_FNS
 
 /* Expiration reasons. */
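
For completeness, BFQ_BFQQ_FNS(has_waker) generates the bfq_mark_bfqq_has_waker(), bfq_clear_bfqq_has_waker() and bfq_bfqq_has_waker() helpers used throughout the patch. A simplified model of the expansion follows (the real macro in bfq-iosched.h operates on bfqq->flags via the kernel's __set_bit()/__clear_bit()/test_bit() helpers; the plain bit arithmetic, the model struct and the bit index here are assumptions for brevity):

#include <stdbool.h>

struct bfqq_model { unsigned long flags; };

enum { BFQQF_has_waker = 12 };	/* illustrative bit index */

#define BFQ_BFQQ_FNS(name)						\
static void bfq_mark_bfqq_##name(struct bfqq_model *bfqq)		\
{									\
	bfqq->flags |= 1UL << BFQQF_##name;				\
}									\
static void bfq_clear_bfqq_##name(struct bfqq_model *bfqq)		\
{									\
	bfqq->flags &= ~(1UL << BFQQF_##name);				\
}									\
static bool bfq_bfqq_##name(const struct bfqq_model *bfqq)		\
{									\
	return bfqq->flags & (1UL << BFQQF_##name);			\
}

BFQ_BFQQ_FNS(has_waker)
#undef BFQ_BFQQ_FNS
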