 block/bfq-iosched.c | 270
 block/bfq-iosched.h |  25
 2 files changed, 261 insertions(+), 34 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index d5bc32371ace..9e2fbb7d1fb6 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -157,6 +157,7 @@ BFQ_BFQQ_FNS(in_large_burst);
 BFQ_BFQQ_FNS(coop);
 BFQ_BFQQ_FNS(split_coop);
 BFQ_BFQQ_FNS(softrt_update);
+BFQ_BFQQ_FNS(has_waker);
 #undef BFQ_BFQQ_FNS						\
 
 /* Expiration time of sync (0) and async (1) requests, in ns. */
@@ -1815,6 +1816,111 @@ static void bfq_add_request(struct request *rq)
 
 	if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
 		/*
+		 * Detect whether bfqq's I/O seems synchronized with
+		 * that of some other queue, i.e., whether bfqq, after
+		 * remaining empty, happens to receive new I/O only
+		 * right after some I/O request of the other queue has
+		 * been completed. We call waker queue the other
+		 * queue, and we assume, for simplicity, that bfqq may
+		 * have at most one waker queue.
+		 *
+		 * A remarkable throughput boost can be reached by
+		 * unconditionally injecting the I/O of the waker
+		 * queue, every time a new bfq_dispatch_request
+		 * happens to be invoked while I/O is being plugged
+		 * for bfqq. In addition to boosting throughput, this
+		 * unblocks bfqq's I/O, thereby improving bandwidth
+		 * and latency for bfqq. Note that these same results
+		 * may be achieved with the general injection
+		 * mechanism, but less effectively. For details on
+		 * this aspect, see the comments on the choice of the
+		 * queue for injection in bfq_select_queue().
+		 *
+		 * Turning back to the detection of a waker queue, a
+		 * queue Q is deemed as a waker queue for bfqq if, for
+		 * two consecutive times, bfqq happens to become non
+		 * empty right after a request of Q has been
+		 * completed. In particular, on the first time, Q is
+		 * tentatively set as a candidate waker queue, while
+		 * on the second time, the flag
+		 * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
+		 * is a waker queue for bfqq. These detection steps
+		 * are performed only if bfqq has a long think time,
+		 * so as to make it more likely that bfqq's I/O is
+		 * actually being blocked by a synchronization. This
+		 * last filter, plus the above two-times requirement,
+		 * make false positives less likely.
+		 *
+		 * NOTE
+		 *
+		 * The sooner a waker queue is detected, the sooner
+		 * throughput can be boosted by injecting I/O from the
+		 * waker queue. Fortunately, detection is likely to be
+		 * actually fast, for the following reasons. While
+		 * blocked by synchronization, bfqq has a long think
+		 * time. This implies that bfqq's inject limit is at
+		 * least equal to 1 (see the comments in
+		 * bfq_update_inject_limit()). So, thanks to
+		 * injection, the waker queue is likely to be served
+		 * during the very first I/O-plugging time interval
+		 * for bfqq. This triggers the first step of the
+		 * detection mechanism. Thanks again to injection, the
+		 * candidate waker queue is then likely to be
+		 * confirmed no later than during the next
+		 * I/O-plugging interval for bfqq.
+		 */
+		if (!bfq_bfqq_has_short_ttime(bfqq) &&
+		    ktime_get_ns() - bfqd->last_completion <
+		    200 * NSEC_PER_USEC) {
+			if (bfqd->last_completed_rq_bfqq != bfqq &&
+			    bfqd->last_completed_rq_bfqq !=
+			    bfqq->waker_bfqq) {
+				/*
+				 * First synchronization detected with
+				 * a candidate waker queue, or with a
+				 * different candidate waker queue
+				 * from the current one.
+				 */
+				bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
+
+				/*
+				 * If the waker queue disappears, then
+				 * bfqq->waker_bfqq must be reset. To
+				 * this goal, we maintain in each
+				 * waker queue a list, woken_list, of
+				 * all the queues that reference the
+				 * waker queue through their
+				 * waker_bfqq pointer. When the waker
+				 * queue exits, the waker_bfqq pointer
+				 * of all the queues in the woken_list
+				 * is reset.
+				 *
+				 * In addition, if bfqq is already in
+				 * the woken_list of a waker queue,
+				 * then, before being inserted into
+				 * the woken_list of a new waker
+				 * queue, bfqq must be removed from
+				 * the woken_list of the old waker
+				 * queue.
+				 */
+				if (!hlist_unhashed(&bfqq->woken_list_node))
+					hlist_del_init(&bfqq->woken_list_node);
+				hlist_add_head(&bfqq->woken_list_node,
+					       &bfqd->last_completed_rq_bfqq->woken_list);
+
+				bfq_clear_bfqq_has_waker(bfqq);
+			} else if (bfqd->last_completed_rq_bfqq ==
+				   bfqq->waker_bfqq &&
+				   !bfq_bfqq_has_waker(bfqq)) {
+				/*
+				 * synchronization with waker_bfqq
+				 * seen for the second time
+				 */
+				bfq_mark_bfqq_has_waker(bfqq);
+			}
+		}
+
+		/*
 		 * Periodically reset inject limit, to make sure that
 		 * the latter eventually drops in case workload
 		 * changes, see step (3) in the comments on
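
Stripped of the kernel plumbing, the detection added above is a two-step state machine. The following is a minimal userspace C sketch of just that state machine (the struct, field and function names are illustrative stand-ins, not kernel definitions; the real code additionally gates detection on a long think time and on the 200 us completion window):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the few bfq_queue fields involved. */
struct queue {
	const char *name;
	struct queue *waker;	/* plays the role of waker_bfqq */
	bool has_waker;		/* plays the role of BFQQF_has_waker */
};

/*
 * Model of the two-step detection: called when a queue becomes
 * non-empty right after a request of 'last_completed' completes.
 */
static void detect_waker(struct queue *bfqq, struct queue *last_completed)
{
	if (last_completed != bfqq && last_completed != bfqq->waker) {
		/* step 1: record (or replace) the candidate waker */
		bfqq->waker = last_completed;
		bfqq->has_waker = false;
	} else if (last_completed == bfqq->waker && !bfqq->has_waker) {
		/* step 2: same queue seen twice in a row: confirm it */
		bfqq->has_waker = true;
	}
}

int main(void)
{
	struct queue q = { "bfqq", NULL, false };
	struct queue w = { "waker", NULL, false };

	detect_waker(&q, &w);	/* w becomes the candidate */
	detect_waker(&q, &w);	/* w is confirmed */
	printf("confirmed=%d waker=%s\n", q.has_waker, q.waker->name);
	return 0;
}
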
@@ -4164,18 +4270,89 @@ check_queue:
 			bfqq->bic->bfqq[0] : NULL;
 
 		/*
-		 * If the process associated with bfqq has also async
-		 * I/O pending, then inject it
-		 * unconditionally. Injecting I/O from the same
-		 * process can cause no harm to the process. On the
-		 * contrary, it can only increase bandwidth and reduce
-		 * latency for the process.
+		 * The next three mutually-exclusive ifs decide
+		 * whether to try injection, and choose the queue to
+		 * pick an I/O request from.
+		 *
+		 * The first if checks whether the process associated
+		 * with bfqq has also async I/O pending. If so, it
+		 * injects such I/O unconditionally. Injecting async
+		 * I/O from the same process can cause no harm to the
+		 * process. On the contrary, it can only increase
+		 * bandwidth and reduce latency for the process.
+		 *
+		 * The second if checks whether there happens to be a
+		 * non-empty waker queue for bfqq, i.e., a queue whose
+		 * I/O needs to be completed for bfqq to receive new
+		 * I/O. This happens, e.g., if bfqq is associated with
+		 * a process that does some sync. A sync generates
+		 * extra blocking I/O, which must be completed before
+		 * the process associated with bfqq can go on with its
+		 * I/O. If the I/O of the waker queue is not served,
+		 * then bfqq remains empty, and no I/O is dispatched,
+		 * until the idle timeout fires for bfqq. This is
+		 * likely to result in lower bandwidth and higher
+		 * latencies for bfqq, and in a severe loss of total
+		 * throughput. The best action to take is therefore to
+		 * serve the waker queue as soon as possible. So do it
+		 * (without relying on the third alternative below for
+		 * eventually serving waker_bfqq's I/O; see the last
+		 * paragraph for further details). This systematic
+		 * injection of I/O from the waker queue does not
+		 * cause any delay to bfqq's I/O. On the contrary,
+		 * next bfqq's I/O is brought forward dramatically,
+		 * for it is not blocked for milliseconds.
+		 *
+		 * The third if checks whether bfqq is a queue for
+		 * which it is better to avoid injection. It is so if
+		 * bfqq delivers more throughput when served without
+		 * any further I/O from other queues in the middle, or
+		 * if the service times of bfqq's I/O requests both
+		 * count more than overall throughput, and may be
+		 * easily increased by injection (this happens if bfqq
+		 * has a short think time). If none of these
+		 * conditions holds, then a candidate queue for
+		 * injection is looked for through
+		 * bfq_choose_bfqq_for_injection(). Note that the
+		 * latter may return NULL (for example if the inject
+		 * limit for bfqq is currently 0).
+		 *
+		 * NOTE: motivation for the second alternative
+		 *
+		 * Thanks to the way the inject limit is updated in
+		 * bfq_update_has_short_ttime(), it is rather likely
+		 * that, if I/O is being plugged for bfqq and the
+		 * waker queue has pending I/O requests that are
+		 * blocking bfqq's I/O, then the third alternative
+		 * above lets the waker queue get served before the
+		 * I/O-plugging timeout fires. So one may deem the
+		 * second alternative superfluous. It is not, because
+		 * the third alternative may be way less effective in
+		 * case of a synchronization. For two main
+		 * reasons. First, throughput may be low because the
+		 * inject limit may be too low to guarantee the same
+		 * amount of injected I/O, from the waker queue or
+		 * other queues, that the second alternative
+		 * guarantees (the second alternative unconditionally
+		 * injects a pending I/O request of the waker queue
+		 * for each bfq_dispatch_request()). Second, with the
+		 * third alternative, the duration of the plugging,
+		 * i.e., the time before bfqq finally receives new I/O,
+		 * may not be minimized, because the waker queue may
+		 * happen to be served only after other queues.
 		 */
 		if (async_bfqq &&
 		    icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
 		    bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
 		    bfq_bfqq_budget_left(async_bfqq))
 			bfqq = bfqq->bic->bfqq[0];
+		else if (bfq_bfqq_has_waker(bfqq) &&
+			   bfq_bfqq_busy(bfqq->waker_bfqq) &&
+			   bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
+					      bfqq->waker_bfqq) <=
+			   bfq_bfqq_budget_left(bfqq->waker_bfqq)
+			)
+			bfqq = bfqq->waker_bfqq;
 		else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
 			 (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
 			  !bfq_bfqq_has_short_ttime(bfqq)))
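
In compressed form, the three alternatives reduce to the priority order below. This sketch uses boolean parameters as hypothetical stand-ins for the kernel-side busy and budget checks (each "fits" flag corresponds to a bfq_serv_to_charge() <= bfq_bfqq_budget_left() test):

#include <stdbool.h>
#include <stddef.h>

struct queue;

/* Hypothetical condensation of the three mutually-exclusive ifs. */
static struct queue *pick_source(struct queue *bfqq,
				 struct queue *async_bfqq, bool async_fits,
				 struct queue *waker_bfqq, bool waker_fits,
				 struct queue *inject_candidate,
				 bool injection_makes_sense)
{
	if (async_bfqq && async_fits)
		return async_bfqq;	 /* 1: same-process async I/O */
	if (waker_bfqq && waker_fits)
		return waker_bfqq;	 /* 2: confirmed, busy waker queue */
	if (injection_makes_sense)
		return inject_candidate; /* 3: generic injection (may be NULL) */
	return bfqq;			 /* no injection: keep serving bfqq */
}
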
@@ -4564,6 +4741,9 @@ static void bfq_put_cooperator(struct bfq_queue *bfqq)
 
 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
+	struct bfq_queue *item;
+	struct hlist_node *n;
+
 	if (bfqq == bfqd->in_service_queue) {
 		__bfq_bfqq_expire(bfqd, bfqq);
 		bfq_schedule_dispatch(bfqd);
@@ -4573,6 +4753,18 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 
 	bfq_put_cooperator(bfqq);
 
+	/* remove bfqq from woken list */
+	if (!hlist_unhashed(&bfqq->woken_list_node))
+		hlist_del_init(&bfqq->woken_list_node);
+
+	/* reset waker for all queues in woken list */
+	hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
+				  woken_list_node) {
+		item->waker_bfqq = NULL;
+		bfq_clear_bfqq_has_waker(item);
+		hlist_del_init(&item->woken_list_node);
+	}
+
 	bfq_put_queue(bfqq); /* release process reference */
 }
 
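
The cleanup just added has two parts: drop bfqq from the woken_list it may sit on, then sever every back-reference to bfqq. A minimal userspace model follows, with a plain singly linked list standing in for the kernel hlist (an assumption made for brevity):

#include <stdbool.h>
#include <stddef.h>

struct queue {
	struct queue *waker;		/* like waker_bfqq */
	bool has_waker;			/* like BFQQF_has_waker */
	struct queue *woken_next;	/* link in the waker's woken list */
	struct queue *woken_head;	/* head of the queues this one wakes */
};

static void exit_queue(struct queue *bfqq)
{
	struct queue *item, *next, **pp;

	/* remove bfqq from its waker's woken list, if any */
	if (bfqq->waker) {
		for (pp = &bfqq->waker->woken_head; *pp;
		     pp = &(*pp)->woken_next) {
			if (*pp == bfqq) {
				*pp = bfqq->woken_next;
				break;
			}
		}
		bfqq->woken_next = NULL;
	}

	/* reset the waker of every queue bfqq used to wake */
	for (item = bfqq->woken_head; item; item = next) {
		next = item->woken_next;
		item->waker = NULL;
		item->has_waker = false;
		item->woken_next = NULL;
	}
	bfqq->woken_head = NULL;
}
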
@@ -4691,6 +4883,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	RB_CLEAR_NODE(&bfqq->entity.rb_node);
 	INIT_LIST_HEAD(&bfqq->fifo);
 	INIT_HLIST_NODE(&bfqq->burst_list_node);
+	INIT_HLIST_NODE(&bfqq->woken_list_node);
+	INIT_HLIST_HEAD(&bfqq->woken_list);
 
 	bfqq->ref = 0;
 	bfqq->bfqd = bfqd;
@@ -4909,28 +5103,27 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
 	 * bfqq may have a long think time because of a
 	 * synchronization with some other queue, i.e., because the
 	 * I/O of some other queue may need to be completed for bfqq
-	 * to receive new I/O. This happens, e.g., if bfqq is
-	 * associated with a process that does some sync. A sync
-	 * generates extra blocking I/O, which must be completed
-	 * before the process associated with bfqq can go on with its
-	 * I/O.
+	 * to receive new I/O. Details in the comments on the choice
+	 * of the queue for injection in bfq_select_queue().
 	 *
-	 * If such a synchronization is actually in place, then,
-	 * without injection on bfqq, the blocking I/O cannot happen
-	 * to served while bfqq is in service. As a consequence, if
-	 * bfqq is granted I/O-dispatch-plugging, then bfqq remains
-	 * empty, and no I/O is dispatched, until the idle timeout
-	 * fires. This is likely to result in lower bandwidth and
-	 * higher latencies for bfqq, and in a severe loss of total
-	 * throughput.
+	 * As stressed in those comments, if such a synchronization is
+	 * actually in place, then, without injection on bfqq, the
+	 * blocking I/O cannot happen to served while bfqq is in
+	 * service. As a consequence, if bfqq is granted
+	 * I/O-dispatch-plugging, then bfqq remains empty, and no I/O
+	 * is dispatched, until the idle timeout fires. This is likely
+	 * to result in lower bandwidth and higher latencies for bfqq,
+	 * and in a severe loss of total throughput.
 	 *
 	 * On the opposite end, a non-zero inject limit may allow the
 	 * I/O that blocks bfqq to be executed soon, and therefore
-	 * bfqq to receive new I/O soon. But, if this actually
-	 * happens, then the next think-time sample for bfqq may be
-	 * very low. This in turn may cause bfqq's think time to be
-	 * deemed short. Without the 100 ms barrier, this new state
-	 * change would cause the body of the next if to be executed
+	 * bfqq to receive new I/O soon.
+	 *
+	 * But, if the blocking gets actually eliminated, then the
+	 * next think-time sample for bfqq may be very low. This in
+	 * turn may cause bfqq's think time to be deemed
+	 * short. Without the 100 ms barrier, this new state change
+	 * would cause the body of the next if to be executed
 	 * immediately. But this would set to 0 the inject
 	 * limit. Without injection, the blocking I/O would cause the
 	 * think time of bfqq to become long again, and therefore the
@@ -4941,11 +5134,11 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
 	 * In contrast, if the inject limit is not reset during such a
 	 * long time interval as 100 ms, then the number of short
 	 * think time samples can grow significantly before the reset
-	 * is allowed. As a consequence, the think time state can
-	 * become stable before the reset. There will be no state
-	 * change when the 100 ms elapse, and therefore no reset of
-	 * the inject limit. The inject limit remains steadily equal
-	 * to 1 both during and after the 100 ms. So injection can be
+	 * is performed. As a consequence, the think time state can
+	 * become stable before the reset. Therefore there will be no
+	 * state change when the 100 ms elapse, and no reset of the
+	 * inject limit. The inject limit remains steadily equal to 1
+	 * both during and after the 100 ms. So injection can be
 	 * performed at all times, and throughput gets boosted.
 	 *
 	 * An inject limit equal to 1 is however in conflict, in
@@ -4960,10 +5153,20 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
 	 * brought forward, because it is not blocked for
 	 * milliseconds.
 	 *
-	 * In addition, during the 100 ms, the base value for the
-	 * total service time is likely to get finally computed,
-	 * freeing the inject limit from its relation with the think
-	 * time.
+	 * In addition, serving the blocking I/O much sooner, and much
+	 * more frequently than once per I/O-plugging timeout, makes
+	 * it much quicker to detect a waker queue (the concept of
+	 * waker queue is defined in the comments in
+	 * bfq_add_request()). This makes it possible to start sooner
+	 * to boost throughput more effectively, by injecting the I/O
+	 * of the waker queue unconditionally on every
+	 * bfq_dispatch_request().
+	 *
+	 * One last, important benefit of not resetting the inject
+	 * limit before 100 ms is that, during this time interval, the
+	 * base value for the total service time is likely to get
+	 * finally computed for bfqq, freeing the inject limit from
+	 * its relation with the think time.
 	 */
 	if (state_changed && bfqq->last_serv_time_ns == 0 &&
 	    (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
@@ -5278,6 +5481,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 			1UL<<(BFQ_RATE_SHIFT - 10))
 		bfq_update_rate_reset(bfqd, NULL);
 	bfqd->last_completion = now_ns;
+	bfqd->last_completed_rq_bfqq = bfqq;
 
 	/*
 	 * If we are waiting to discover whether the request pattern
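
The completion-side bookkeeping that feeds the detection in bfq_add_request() is small enough to summarize in a sketch (the types and signatures are simplified assumptions; only the 200 us constant and the two recorded fields come from the patch, and NSEC_PER_USEC is 1000 as in the kernel's time64 definitions):

#include <stdbool.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL

struct queue;

/* The two facts bfq_completed_request() records (sketch). */
struct sched_data {
	uint64_t last_completion;		/* completion time, ns */
	struct queue *last_completed_rq_bfqq;	/* owner of that request */
};

static void on_completion(struct sched_data *d, struct queue *q,
			  uint64_t now_ns)
{
	d->last_completion = now_ns;
	d->last_completed_rq_bfqq = q;
}

/* Freshness test used by the waker detection in bfq_add_request(). */
static bool completion_is_fresh(const struct sched_data *d, uint64_t now_ns)
{
	return now_ns - d->last_completion < 200 * NSEC_PER_USEC;
}
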
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 584d3c9ed8ba..e80adf822bbe 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -357,6 +357,24 @@ struct bfq_queue {
 
 	/* max service rate measured so far */
 	u32 max_service_rate;
+
+	/*
+	 * Pointer to the waker queue for this queue, i.e., to the
+	 * queue Q such that this queue happens to get new I/O right
+	 * after some I/O request of Q is completed. For details, see
+	 * the comments on the choice of the queue for injection in
+	 * bfq_select_queue().
+	 */
+	struct bfq_queue *waker_bfqq;
+	/* node for woken_list, see below */
+	struct hlist_node woken_list_node;
+	/*
+	 * Head of the list of the woken queues for this queue, i.e.,
+	 * of the list of the queues for which this queue is a waker
+	 * queue. This list is used to reset the waker_bfqq pointer in
+	 * the woken queues when this queue exits.
+	 */
+	struct hlist_head woken_list;
 };
 
 /**
@@ -533,6 +551,9 @@ struct bfq_data {
 	/* time of last request completion (ns) */
 	u64 last_completion;
 
+	/* bfqq owning the last completed rq */
+	struct bfq_queue *last_completed_rq_bfqq;
+
 	/* time of last transition from empty to non-empty (ns) */
 	u64 last_empty_occupied_ns;
 
@@ -743,7 +764,8 @@ enum bfqq_state_flags {
 	 * update
 	 */
 	BFQQF_coop,		/* bfqq is shared */
-	BFQQF_split_coop	/* shared bfqq will be split */
+	BFQQF_split_coop,	/* shared bfqq will be split */
+	BFQQF_has_waker		/* bfqq has a waker queue */
 };
 
 #define BFQ_BFQQ_FNS(name)						\
@@ -763,6 +785,7 @@ BFQ_BFQQ_FNS(in_large_burst);
 BFQ_BFQQ_FNS(coop);
 BFQ_BFQQ_FNS(split_coop);
 BFQ_BFQQ_FNS(softrt_update);
+BFQ_BFQQ_FNS(has_waker);
 #undef BFQ_BFQQ_FNS
 
 /* Expiration reasons. */
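
For completeness, BFQ_BFQQ_FNS(has_waker) generates the bfq_mark_bfqq_has_waker(), bfq_clear_bfqq_has_waker() and bfq_bfqq_has_waker() helpers used throughout the patch. A simplified model of the expansion follows (the real macro in bfq-iosched.h operates on bfqq->flags via the kernel's __set_bit()/__clear_bit()/test_bit() helpers; the plain bit arithmetic, the model struct and the bit index here are assumptions for brevity):

#include <stdbool.h>

struct bfqq_model { unsigned long flags; };

enum { BFQQF_has_waker = 12 };	/* illustrative bit index */

#define BFQ_BFQQ_FNS(name)						\
static void bfq_mark_bfqq_##name(struct bfqq_model *bfqq)		\
{									\
	bfqq->flags |= 1UL << BFQQF_##name;				\
}									\
static void bfq_clear_bfqq_##name(struct bfqq_model *bfqq)		\
{									\
	bfqq->flags &= ~(1UL << BFQQF_##name);				\
}									\
static bool bfq_bfqq_##name(const struct bfqq_model *bfqq)		\
{									\
	return bfqq->flags & (1UL << BFQQF_##name);			\
}

BFQ_BFQQ_FNS(has_waker)
#undef BFQ_BFQQ_FNS
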