summaryrefslogtreecommitdiffstats
path: root/block/bfq-iosched.c
diff options
context:
space:
mode:
authorPaolo Valente <paolo.valente@linaro.org>2019-06-25 01:12:48 -0400
committerJens Axboe <axboe@kernel.dk>2019-06-25 11:07:35 -0400
commit96a291c38c329910738c002de83a9e3f6bf8c6e7 (patch)
treed6b3293b5e867a6e2b25f6d86de4f9737aeb9f74 /block/bfq-iosched.c
parent13a857a4c4e826c587cde3a69bc3d1162d247d9d (diff)
block, bfq: preempt lower-weight or lower-priority queues
BFQ enqueues the I/O coming from each process into a separate bfq_queue, and serves bfq_queues one at a time. Each bfq_queue may be served for at most timeout_sync milliseconds (default: 125 ms). This service scheme is prone to the following inaccuracy. While a bfq_queue Q1 is in service, some empty bfq_queue Q2 may receive I/O, and, according to BFQ's scheduling policy, may become the right bfq_queue to serve, in place of the currently in-service bfq_queue. In this respect, postponing the service of Q2 to after the service of Q1 finishes may delay the completion of Q2's I/O, compared with an ideal service in which all non-empty bfq_queues are served in parallel, and every non-empty bfq_queue is served at a rate proportional to the bfq_queue's weight. This additional delay is equal at most to the time Q1 may unjustly remain in service before switching to Q2. If Q1 and Q2 have the same weight, then this time is most likely negligible compared with the completion time to be guaranteed to Q2's I/O. In addition, first, one of the reasons why BFQ may want to serve Q1 for a while is that this boosts throughput and, second, serving Q1 longer reduces BFQ's overhead. As a conclusion, it is usually better not to preempt Q1 if both Q1 and Q2 have the same weight. In contrast, as Q2's weight or priority becomes higher and higher compared with that of Q1, the above delay becomes larger and larger, compared with the I/O completion times that have to be guaranteed to Q2 according to Q2's weight. So reducing this delay may be more important than avoiding the costs of preempting Q1. Accordingly, this commit preempts Q1 if Q2 has a higher weight or a higher priority than Q1. Preemption causes Q1 to be re-scheduled, and triggers a new choice of the next bfq_queue to serve. If Q2 really is the next bfq_queue to serve, then Q2 will be set in service immediately. This change reduces the component of the I/O latency caused by the above delay by about 80%. 
For example, on an (old) PLEXTOR PX-256M5 SSD, the maximum latency reported by fio drops from 15.1 to 3.2 ms for a process doing sporadic random reads while another process is doing continuous sequential reads. Signed-off-by: Nicola Bottura <bottura.nicola95@gmail.com> Signed-off-by: Paolo Valente <paolo.valente@linaro.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'block/bfq-iosched.c')
-rw-r--r--block/bfq-iosched.c95
1 file changed, 75 insertions, 20 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 9e2fbb7d1fb6..6a3d05023300 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -1428,17 +1428,19 @@ static int bfq_min_budget(struct bfq_data *bfqd)
1428 * mechanism may be re-designed in such a way to make it possible to 1428 * mechanism may be re-designed in such a way to make it possible to
1429 * know whether preemption is needed without needing to update service 1429 * know whether preemption is needed without needing to update service
1430 * trees). In addition, queue preemptions almost always cause random 1430 * trees). In addition, queue preemptions almost always cause random
1431 * I/O, and thus loss of throughput. Because of these facts, the next 1431 * I/O, which may in turn cause loss of throughput. Finally, there may
1432 * function adopts the following simple scheme to avoid both costly 1432 * even be no in-service queue when the next function is invoked (so,
1433 * operations and too frequent preemptions: it requests the expiration 1433 * no queue to compare timestamps with). Because of these facts, the
1434 * of the in-service queue (unconditionally) only for queues that need 1434 * next function adopts the following simple scheme to avoid costly
1435 * to recover a hole, or that either are weight-raised or deserve to 1435 * operations, too frequent preemptions and too many dependencies on
1436 * be weight-raised. 1436 * the state of the scheduler: it requests the expiration of the
1437 * in-service queue (unconditionally) only for queues that need to
1438 * recover a hole. Then it delegates to other parts of the code the
1439 * responsibility of handling the above case 2.
1437 */ 1440 */
1438static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, 1441static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
1439 struct bfq_queue *bfqq, 1442 struct bfq_queue *bfqq,
1440 bool arrived_in_time, 1443 bool arrived_in_time)
1441 bool wr_or_deserves_wr)
1442{ 1444{
1443 struct bfq_entity *entity = &bfqq->entity; 1445 struct bfq_entity *entity = &bfqq->entity;
1444 1446
@@ -1493,7 +1495,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
1493 entity->budget = max_t(unsigned long, bfqq->max_budget, 1495 entity->budget = max_t(unsigned long, bfqq->max_budget,
1494 bfq_serv_to_charge(bfqq->next_rq, bfqq)); 1496 bfq_serv_to_charge(bfqq->next_rq, bfqq));
1495 bfq_clear_bfqq_non_blocking_wait_rq(bfqq); 1497 bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
1496 return wr_or_deserves_wr; 1498 return false;
1497} 1499}
1498 1500
1499/* 1501/*
@@ -1611,6 +1613,36 @@ static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
1611 bfqd->bfq_wr_min_idle_time); 1613 bfqd->bfq_wr_min_idle_time);
1612} 1614}
1613 1615
1616
1617/*
1618 * Return true if bfqq is in a higher priority class, or has a higher
1619 * weight than the in-service queue.
1620 */
1621static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
1622 struct bfq_queue *in_serv_bfqq)
1623{
1624 int bfqq_weight, in_serv_weight;
1625
1626 if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class)
1627 return true;
1628
1629 if (in_serv_bfqq->entity.parent == bfqq->entity.parent) {
1630 bfqq_weight = bfqq->entity.weight;
1631 in_serv_weight = in_serv_bfqq->entity.weight;
1632 } else {
1633 if (bfqq->entity.parent)
1634 bfqq_weight = bfqq->entity.parent->weight;
1635 else
1636 bfqq_weight = bfqq->entity.weight;
1637 if (in_serv_bfqq->entity.parent)
1638 in_serv_weight = in_serv_bfqq->entity.parent->weight;
1639 else
1640 in_serv_weight = in_serv_bfqq->entity.weight;
1641 }
1642
1643 return bfqq_weight > in_serv_weight;
1644}
1645
1614static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, 1646static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
1615 struct bfq_queue *bfqq, 1647 struct bfq_queue *bfqq,
1616 int old_wr_coeff, 1648 int old_wr_coeff,
@@ -1655,8 +1687,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
1655 */ 1687 */
1656 bfqq_wants_to_preempt = 1688 bfqq_wants_to_preempt =
1657 bfq_bfqq_update_budg_for_activation(bfqd, bfqq, 1689 bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
1658 arrived_in_time, 1690 arrived_in_time);
1659 wr_or_deserves_wr);
1660 1691
1661 /* 1692 /*
1662 * If bfqq happened to be activated in a burst, but has been 1693 * If bfqq happened to be activated in a burst, but has been
@@ -1721,16 +1752,40 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
1721 1752
1722 /* 1753 /*
1723 * Expire in-service queue only if preemption may be needed 1754 * Expire in-service queue only if preemption may be needed
1724 * for guarantees. In this respect, the function 1755 * for guarantees. In particular, we care only about two
1725 * next_queue_may_preempt just checks a simple, necessary 1756 * cases. The first is that bfqq has to recover a service
1726 * condition, and not a sufficient condition based on 1757 * hole, as explained in the comments on
1727 * timestamps. In fact, for the latter condition to be 1758 * bfq_bfqq_update_budg_for_activation(), i.e., that
1728 * evaluated, timestamps would need first to be updated, and 1759 * bfqq_wants_to_preempt is true. However, if bfqq does not
1729 * this operation is quite costly (see the comments on the 1760 * carry time-critical I/O, then bfqq's bandwidth is less
1730 * function bfq_bfqq_update_budg_for_activation). 1761 * important than that of queues that carry time-critical I/O.
1762 * So, as a further constraint, we consider this case only if
1763 * bfqq is at least as weight-raised, i.e., at least as time
1764 * critical, as the in-service queue.
1765 *
1766 * The second case is that bfqq is in a higher priority class,
1767 * or has a higher weight than the in-service queue. If this
1768 * condition does not hold, we don't care because, even if
1769 * bfqq does not start to be served immediately, the resulting
1770 * delay for bfqq's I/O is however lower or much lower than
1771 * the ideal completion time to be guaranteed to bfqq's I/O.
1772 *
1773 * In both cases, preemption is needed only if, according to
1774 * the timestamps of both bfqq and of the in-service queue,
1775 * bfqq actually is the next queue to serve. So, to reduce
1776 * useless preemptions, the return value of
1777 * next_queue_may_preempt() is considered in the next compound
1778 * condition too. Yet next_queue_may_preempt() just checks a
1779 * simple, necessary condition for bfqq to be the next queue
1780 * to serve. In fact, to evaluate a sufficient condition, the
1781 * timestamps of the in-service queue would need to be
1782 * updated, and this operation is quite costly (see the
1783 * comments on bfq_bfqq_update_budg_for_activation()).
1731 */ 1784 */
1732 if (bfqd->in_service_queue && bfqq_wants_to_preempt && 1785 if (bfqd->in_service_queue &&
1733 bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && 1786 ((bfqq_wants_to_preempt &&
1787 bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
1788 bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
1734 next_queue_may_preempt(bfqd)) 1789 next_queue_may_preempt(bfqd))
1735 bfq_bfqq_expire(bfqd, bfqd->in_service_queue, 1790 bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
1736 false, BFQQE_PREEMPTED); 1791 false, BFQQE_PREEMPTED);