 -rw-r--r--  Documentation/block/bfq-iosched.txt |   9
 -rw-r--r--  block/bfq-iosched.c                 | 740
 2 files changed, 675 insertions, 74 deletions
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
index 461b27fce979..1b87df6cd476 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt
@@ -375,6 +375,11 @@ default, low latency mode is enabled. If enabled, interactive and soft
 real-time applications are privileged and experience a lower latency,
 as explained in more detail in the description of how BFQ works.
 
+DO NOT enable this mode if you need full control on bandwidth
+distribution. In fact, if it is enabled, then BFQ automatically
+increases the bandwidth share of privileged applications, as the main
+means to guarantee a lower latency to them.
+
 timeout_sync
 ------------
 
@@ -507,6 +512,10 @@ linear mapping between ioprio and weights, described at the beginning
 of the tunable section, is still valid, but all weights higher than
 IOPRIO_BE_NR*10 are mapped to ioprio 0.
 
+Recall that, if low-latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
 
 [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
     Scheduler", Proceedings of the First Workshop on Mobile System
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index dce273b91015..1a32c8341ab0 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -339,6 +339,17 @@ struct bfq_queue {
 
 	/* pid of the process owning the queue, used for logging purposes */
 	pid_t pid;
+
+	/* current maximum weight-raising time for this queue */
+	unsigned long wr_cur_max_time;
+	/*
+	 * Start time of the current weight-raising period if
+	 * the @bfq-queue is being weight-raised, otherwise
+	 * finish time of the last weight-raising period.
+	 */
+	unsigned long last_wr_start_finish;
+	/* factor by which the weight of this queue is multiplied */
+	unsigned int wr_coeff;
 };
 
 /**
@@ -356,6 +367,11 @@ struct bfq_io_cq {
 #endif
 };
 
+enum bfq_device_speed {
+	BFQ_BFQD_FAST,
+	BFQ_BFQD_SLOW,
+};
+
 /**
  * struct bfq_data - per-device data structure.
  *
@@ -487,6 +503,34 @@ struct bfq_data {
 	 */
 	bool strict_guarantees;
 
+	/* if set to true, low-latency heuristics are enabled */
+	bool low_latency;
+	/*
+	 * Maximum factor by which the weight of a weight-raised queue
+	 * is multiplied.
+	 */
+	unsigned int bfq_wr_coeff;
+	/* maximum duration of a weight-raising period (jiffies) */
+	unsigned int bfq_wr_max_time;
+	/*
+	 * Minimum idle period after which weight-raising may be
+	 * reactivated for a queue (in jiffies).
+	 */
+	unsigned int bfq_wr_min_idle_time;
+	/*
+	 * Minimum period between request arrivals after which
+	 * weight-raising may be reactivated for an already busy async
+	 * queue (in jiffies).
+	 */
+	unsigned long bfq_wr_min_inter_arr_async;
+	/*
+	 * Cached value of the product R*T, used for computing the
+	 * maximum duration of weight raising automatically.
+	 */
+	u64 RT_prod;
+	/* device-speed class for the low-latency heuristic */
+	enum bfq_device_speed device_speed;
+
 	/* fallback dummy bfqq for extreme OOM conditions */
 	struct bfq_queue oom_bfqq;
 
@@ -515,7 +559,6 @@ enum bfqq_state_flags {
 	BFQQF_fifo_expire,	/* FIFO checked in this slice */
 	BFQQF_idle_window,	/* slice idling enabled */
 	BFQQF_sync,		/* synchronous queue */
-	BFQQF_budget_new,	/* no completion with this budget */
 	BFQQF_IO_bound,		/*
 				 * bfqq has timed-out at least once
 				 * having consumed at most 2/10 of
@@ -543,7 +586,6 @@ BFQ_BFQQ_FNS(non_blocking_wait_rq);
 BFQ_BFQQ_FNS(fifo_expire);
 BFQ_BFQQ_FNS(idle_window);
 BFQ_BFQQ_FNS(sync);
-BFQ_BFQQ_FNS(budget_new);
 BFQ_BFQQ_FNS(IO_bound);
 #undef BFQ_BFQQ_FNS
 
@@ -637,7 +679,7 @@ struct bfq_group_data {
 	/* must be the first member */
 	struct blkcg_policy_data pd;
 
-	unsigned short weight;
+	unsigned int weight;
 };
 
 /**
@@ -732,6 +774,8 @@ static void bfq_put_queue(struct bfq_queue *bfqq);
 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 				       struct bio *bio, bool is_sync,
 				       struct bfq_io_cq *bic);
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+				    struct bfq_group *bfqg);
 static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
 
@@ -787,6 +831,56 @@ static struct kmem_cache *bfq_pool;
 /* Shift used for peak rate fixed precision calculations. */
 #define BFQ_RATE_SHIFT		16
 
+/*
+ * By default, BFQ computes the duration of the weight raising for
+ * interactive applications automatically, using the following formula:
+ * duration = (R / r) * T, where r is the peak rate of the device, and
+ * R and T are two reference parameters.
+ * In particular, R is the peak rate of the reference device (see below),
+ * and T is a reference time: given the systems that are likely to be
+ * installed on the reference device according to its speed class, T is
+ * about the maximum time needed, under BFQ and while reading two files in
+ * parallel, to load typical large applications on these systems.
+ * In practice, the slower/faster the device at hand is, the more/less it
+ * takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
+ * applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+ * . whether the device is slow, such as old or portable HDDs, as well as
+ *   SD cards, or fast, such as newer HDDs and SSDs.
+ *
+ * The device's speed class is dynamically (re)detected in
+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ *
+ * In the following definitions, R_slow[0]/R_fast[0] and
+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast
+ * rotational device, whereas R_slow[1]/R_fast[1] and
+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast
+ * non-rotational device. Finally, device_speed_thresh are the
+ * thresholds used to switch between speed classes. The reference
+ * rates are not the actual peak rates of the devices used as a
+ * reference, but slightly lower values. The reason for using these
+ * slightly lower values is that the peak-rate estimator tends to
+ * yield slightly lower values than the actual peak rate (it can yield
+ * the actual peak rate only if there is only one process doing I/O,
+ * and the process does sequential I/O).
+ *
+ * Both the reference peak rates and the thresholds are measured in
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ */
+static int R_slow[2] = {1000, 10700};
+static int R_fast[2] = {14000, 33000};
+/*
+ * To improve readability, a conversion function is used to initialize the
+ * following arrays, which entails that they can be initialized only in a
+ * function.
+ */
+static int T_slow[2];
+static int T_fast[2];
+static int device_speed_thresh[2];
+
 #define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
 				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
 
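
The formula in the comment above, duration = (R / r) * T, is what ties the reference pairs to the device at hand. A minimal user-space sketch of the same arithmetic, with an assumed peak rate r (illustrative only; the kernel path uses RT_prod and do_div() in bfq_wr_duration() further below):

    /* Sketch of duration = (R / r) * T with the fast non-rotational
     * reference pair from the arrays above; r is an assumed estimate.
     */
    #include <stdio.h>

    int main(void)
    {
            double R = 33000;      /* R_fast[1], scaled sectors/usec */
            double T_ms = 2500;    /* T_fast[1], about 2.5 s in msec */
            double r = 16500;      /* assumed peak rate: half of R   */

            /* A device measured at half the reference rate gets twice
             * the reference weight-raising duration: 5000 ms here.
             */
            printf("duration = %.0f ms\n", (R / r) * T_ms);
            return 0;
    }
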
@@ -1486,7 +1580,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 
 	if (entity->prio_changed) {
 		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-		unsigned short prev_weight, new_weight;
+		unsigned int prev_weight, new_weight;
 		struct bfq_data *bfqd = NULL;
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 		struct bfq_sched_data *sd;
@@ -1535,7 +1629,8 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 		new_st = bfq_entity_service_tree(entity);
 
 		prev_weight = entity->weight;
-		new_weight = entity->orig_weight;
+		new_weight = entity->orig_weight *
+			     (bfqq ? bfqq->wr_coeff : 1);
 		entity->weight = new_weight;
 
 		new_st->wsum += entity->weight;
@@ -1630,6 +1725,8 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
 					    struct bfq_service_tree *st,
 					    bool backshifted)
 {
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
 	st = __bfq_entity_update_weight_prio(st, entity);
 	bfq_calc_finish(entity, entity->budget);
 
@@ -1659,10 +1756,19 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
 	 * time. This may introduce a little unfairness among queues
 	 * with backshifted timestamps, but it does not break
 	 * worst-case fairness guarantees.
+	 *
+	 * As a special case, if bfqq is weight-raised, push up
+	 * timestamps much less, to keep very low the probability that
+	 * this push up causes the backshifted finish timestamps of
+	 * weight-raised queues to become higher than the backshifted
+	 * finish timestamps of non weight-raised queues.
 	 */
 	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
 		unsigned long delta = st->vtime - entity->finish;
 
+		if (bfqq)
+			delta /= bfqq->wr_coeff;
+
 		entity->start += delta;
 		entity->finish += delta;
 	}
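
To see why dividing the push-up by wr_coeff keeps weight-raised queues ahead, it helps to plug in numbers. A tiny user-space sketch with assumed timestamps and the default coefficient of 30 (illustrative only, not kernel code):

    #include <stdio.h>

    int main(void)
    {
            unsigned long vtime = 10000, finish = 4000; /* assumed timestamps */
            unsigned long delta = vtime - finish;       /* raw push-up: 6000  */
            unsigned int wr_coeff = 30;                 /* default bfq_wr_coeff */

            /* A non weight-raised queue is pushed up by the full delta; a
             * weight-raised one only by delta / wr_coeff, so its backshifted
             * finish time stays well below that of non weight-raised queues.
             */
            printf("push-up, wr_coeff=1:  %lu\n", delta);
            printf("push-up, wr_coeff=30: %lu\n", delta / wr_coeff);
            return 0;
    }
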
@@ -3070,6 +3176,18 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
 	bfqg_stats_xfer_dead(bfqg);
 }
 
+static void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+	struct blkcg_gq *blkg;
+
+	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
+		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+
+		bfq_end_wr_async_queues(bfqd, bfqg);
+	}
+	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
 static int bfq_io_show_weight(struct seq_file *sf, void *v)
 {
 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
@@ -3433,6 +3551,11 @@ static void bfq_init_entity(struct bfq_entity *entity,
 
 static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
 
+static void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
 static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
 					    struct blkcg *blkcg)
 {
@@ -3613,7 +3736,7 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
 static unsigned long bfq_serv_to_charge(struct request *rq,
 					struct bfq_queue *bfqq)
 {
-	if (bfq_bfqq_sync(bfqq))
+	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
 		return blk_rq_sectors(rq);
 
 	return blk_rq_sectors(rq) * bfq_async_charge_factor;
@@ -3700,12 +3823,12 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
  * whether the in-service queue should be expired, by returning
  * true. The purpose of expiring the in-service queue is to give bfqq
  * the chance to possibly preempt the in-service queue, and the reason
- * for preempting the in-service queue is to achieve the following
- * goal: guarantee to bfqq its reserved bandwidth even if bfqq has
- * expired because it has remained idle.
+ * for preempting the in-service queue is to achieve one of the two
+ * goals below.
  *
- * In particular, bfqq may have expired for one of the following two
- * reasons:
+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
+ * expired because it has remained idle. In particular, bfqq may have
+ * expired for one of the following two reasons:
  *
  * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
  *   and did not make it to issue a new request before its last
@@ -3769,10 +3892,36 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
  * above-described special way, and signals that the in-service queue
  * should be expired. Timestamp back-shifting is done later in
  * __bfq_activate_entity.
+ *
+ * 2. Reduce latency. Even if timestamps are not backshifted to let
+ * the process associated with bfqq recover a service hole, bfqq may
+ * however happen to have, after being (re)activated, a lower finish
+ * timestamp than the in-service queue. That is, the next budget of
+ * bfqq may have to be completed before the one of the in-service
+ * queue. If this is the case, then preempting the in-service queue
+ * allows this goal to be achieved, apart from the unpreemptible,
+ * outstanding requests mentioned above.
+ *
+ * Unfortunately, regardless of which of the above two goals one wants
+ * to achieve, service trees need first to be updated to know whether
+ * the in-service queue must be preempted. To have service trees
+ * correctly updated, the in-service queue must be expired and
+ * rescheduled, and bfqq must be scheduled too. This is one of the
+ * most costly operations (in future versions, the scheduling
+ * mechanism may be re-designed in such a way to make it possible to
+ * know whether preemption is needed without needing to update service
+ * trees). In addition, queue preemptions almost always cause random
+ * I/O, and thus loss of throughput. Because of these facts, the next
+ * function adopts the following simple scheme to avoid both costly
+ * operations and too frequent preemptions: it requests the expiration
+ * of the in-service queue (unconditionally) only for queues that need
+ * to recover a hole, or that either are weight-raised or deserve to
+ * be weight-raised.
  */
 static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
 						struct bfq_queue *bfqq,
-						bool arrived_in_time)
+						bool arrived_in_time,
+						bool wr_or_deserves_wr)
 {
 	struct bfq_entity *entity = &bfqq->entity;
 
@@ -3807,14 +3956,85 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
 	entity->budget = max_t(unsigned long, bfqq->max_budget,
 			       bfq_serv_to_charge(bfqq->next_rq, bfqq));
 	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
-	return false;
+	return wr_or_deserves_wr;
+}
+
+static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+{
+	u64 dur;
+
+	if (bfqd->bfq_wr_max_time > 0)
+		return bfqd->bfq_wr_max_time;
+
+	dur = bfqd->RT_prod;
+	do_div(dur, bfqd->peak_rate);
+
+	/*
+	 * Limit duration between 3 and 13 seconds. Tests show that
+	 * higher values than 13 seconds often yield the opposite of
+	 * the desired result, i.e., worsen responsiveness by letting
+	 * non-interactive and non-soft-real-time applications
+	 * preserve weight raising for a too long time interval.
+	 *
+	 * On the other hand, lower values than 3 seconds make it
+	 * difficult for most interactive tasks to complete their jobs
+	 * before weight-raising finishes.
+	 */
+	if (dur > msecs_to_jiffies(13000))
+		dur = msecs_to_jiffies(13000);
+	else if (dur < msecs_to_jiffies(3000))
+		dur = msecs_to_jiffies(3000);
+
+	return dur;
+}
+
+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
+					     struct bfq_queue *bfqq,
+					     unsigned int old_wr_coeff,
+					     bool wr_or_deserves_wr,
+					     bool interactive)
+{
+	if (old_wr_coeff == 1 && wr_or_deserves_wr) {
+		/* start a weight-raising period */
+		bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+		/* update wr duration */
+		bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+
+		/*
+		 * If needed, further reduce budget to make sure it is
+		 * close to bfqq's backlog, so as to reduce the
+		 * scheduling-error component due to a too large
+		 * budget. Do not care about throughput consequences,
+		 * but only about latency. Finally, do not assign a
+		 * too small budget either, to avoid increasing
+		 * latency by causing too frequent expirations.
+		 */
+		bfqq->entity.budget = min_t(unsigned long,
+					    bfqq->entity.budget,
+					    2 * bfq_min_budget(bfqd));
+	} else if (old_wr_coeff > 1) {
+		/* update wr duration */
+		bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+	}
+}
+
+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
+					struct bfq_queue *bfqq)
+{
+	return bfqq->dispatched == 0 &&
+		time_is_before_jiffies(
+			bfqq->budget_timeout +
+			bfqd->bfq_wr_min_idle_time);
 }
 
 static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 					     struct bfq_queue *bfqq,
-					     struct request *rq)
+					     int old_wr_coeff,
+					     struct request *rq,
+					     bool *interactive)
 {
-	bool bfqq_wants_to_preempt,
+	bool wr_or_deserves_wr, bfqq_wants_to_preempt,
+	     idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
 	/*
 	 * See the comments on
 	 * bfq_bfqq_update_budg_for_activation for
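
The clamp in bfq_wr_duration() above can be checked in isolation. A small user-space rendition of the same computation, with jiffies replaced by milliseconds and peak rates assumed (a sketch, not the kernel code path):

    #include <stdio.h>

    /* User-space rendition of the bfq_wr_duration() clamp, in milliseconds. */
    static unsigned long wr_duration_ms(unsigned long long rt_prod,
                                        unsigned long long peak_rate)
    {
            unsigned long long dur = rt_prod / peak_rate;   /* (R * T) / r */

            if (dur > 13000)        /* upper bound: 13 s */
                    dur = 13000;
            else if (dur < 3000)    /* lower bound: 3 s */
                    dur = 3000;
            return (unsigned long)dur;
    }

    int main(void)
    {
            /* Assumed RT_prod for a fast SSD: R = 33000, T = 2500 ms. */
            unsigned long long rt_prod = 33000ULL * 2500ULL;

            printf("%lu ms\n", wr_duration_ms(rt_prod, 33000)); /* 3000: clamped up   */
            printf("%lu ms\n", wr_duration_ms(rt_prod, 8000));  /* ~10312: in range   */
            printf("%lu ms\n", wr_duration_ms(rt_prod, 4000));  /* 13000: clamped down */
            return 0;
    }
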
@@ -3827,12 +4047,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);
 
 	/*
-	 * Update budget and check whether bfqq may want to preempt
-	 * the in-service queue.
+	 * bfqq deserves to be weight-raised if:
+	 * - it is sync,
+	 * - it has been idle for enough time.
+	 */
+	*interactive = idle_for_long_time;
+	wr_or_deserves_wr = bfqd->low_latency &&
+		(bfqq->wr_coeff > 1 ||
+		 (bfq_bfqq_sync(bfqq) && *interactive));
+
+	/*
+	 * Using the last flag, update budget and check whether bfqq
+	 * may want to preempt the in-service queue.
 	 */
 	bfqq_wants_to_preempt =
 		bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
-						    arrived_in_time);
+						    arrived_in_time,
+						    wr_or_deserves_wr);
 
 	if (!bfq_bfqq_IO_bound(bfqq)) {
 		if (arrived_in_time) {
@@ -3844,6 +4075,16 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 		bfqq->requests_within_timer = 0;
 	}
 
+	if (bfqd->low_latency) {
+		bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
+						 old_wr_coeff,
+						 wr_or_deserves_wr,
+						 *interactive);
+
+		if (old_wr_coeff != bfqq->wr_coeff)
+			bfqq->entity.prio_changed = 1;
+	}
+
 	bfq_add_bfqq_busy(bfqd, bfqq);
 
 	/*
@@ -3857,6 +4098,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 	 * function bfq_bfqq_update_budg_for_activation).
 	 */
 	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
+	    bfqd->in_service_queue->wr_coeff == 1 &&
 	    next_queue_may_preempt(bfqd))
 		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
 				false, BFQQE_PREEMPTED);
@@ -3867,6 +4109,8 @@ static void bfq_add_request(struct request *rq)
 	struct bfq_queue *bfqq = RQ_BFQQ(rq);
 	struct bfq_data *bfqd = bfqq->bfqd;
 	struct request *next_rq, *prev;
+	unsigned int old_wr_coeff = bfqq->wr_coeff;
+	bool interactive = false;
 
 	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
 	bfqq->queued[rq_is_sync(rq)]++;
@@ -3882,9 +4126,45 @@ static void bfq_add_request(struct request *rq)
 	bfqq->next_rq = next_rq;
 
 	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
-		bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, rq);
-	else if (prev != bfqq->next_rq)
-		bfq_updated_next_req(bfqd, bfqq);
+		bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
+						 rq, &interactive);
+	else {
+		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
+		    time_is_before_jiffies(
+				bfqq->last_wr_start_finish +
+				bfqd->bfq_wr_min_inter_arr_async)) {
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+
+			bfqq->entity.prio_changed = 1;
+		}
+		if (prev != bfqq->next_rq)
+			bfq_updated_next_req(bfqd, bfqq);
+	}
+
+	/*
+	 * Assign jiffies to last_wr_start_finish in the following
+	 * cases:
+	 *
+	 * . if bfqq is not going to be weight-raised, because, for
+	 *   non weight-raised queues, last_wr_start_finish stores the
+	 *   arrival time of the last request; as of now, this piece
+	 *   of information is used only for deciding whether to
+	 *   weight-raise async queues
+	 *
+	 * . if bfqq is not weight-raised, because, if bfqq is now
+	 *   switching to weight-raised, then last_wr_start_finish
+	 *   stores the time when weight-raising starts
+	 *
+	 * . if bfqq is interactive, because, regardless of whether
+	 *   bfqq is currently weight-raised, the weight-raising
+	 *   period must start or restart (this case is considered
+	 *   separately because it is not detected by the above
+	 *   conditions, if bfqq is already weight-raised)
+	 */
+	if (bfqd->low_latency &&
+	    (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
+		bfqq->last_wr_start_finish = jiffies;
 }
 
 static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
@@ -4087,6 +4367,46 @@ end:
 	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
 }
 
+/* Must be called with bfqq != NULL */
+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
+{
+	bfqq->wr_coeff = 1;
+	bfqq->wr_cur_max_time = 0;
+	/*
+	 * Trigger a weight change on the next invocation of
+	 * __bfq_entity_update_weight_prio.
+	 */
+	bfqq->entity.prio_changed = 1;
+}
+
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+				    struct bfq_group *bfqg)
+{
+	int i, j;
+
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < IOPRIO_BE_NR; j++)
+			if (bfqg->async_bfqq[i][j])
+				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
+	if (bfqg->async_idle_bfqq)
+		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
+}
+
+static void bfq_end_wr(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq;
+
+	spin_lock_irq(&bfqd->lock);
+
+	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
+		bfq_bfqq_end_wr(bfqq);
+	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
+		bfq_bfqq_end_wr(bfqq);
+	bfq_end_wr_async(bfqd);
+
+	spin_unlock_irq(&bfqd->lock);
+}
+
 static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
 				struct bio *bio)
 {
@@ -4110,16 +4430,32 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
 	return bfqq == RQ_BFQQ(rq);
 }
 
+/*
+ * Set the maximum time for the in-service queue to consume its
+ * budget. This prevents seeky processes from lowering the throughput.
+ * In practice, a time-slice service scheme is used with seeky
+ * processes.
+ */
+static void bfq_set_budget_timeout(struct bfq_data *bfqd,
+				   struct bfq_queue *bfqq)
+{
+	bfqd->last_budget_start = ktime_get();
+
+	bfqq->budget_timeout = jiffies +
+		bfqd->bfq_timeout *
+		(bfqq->entity.weight / bfqq->entity.orig_weight);
+}
+
 static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
 				       struct bfq_queue *bfqq)
 {
 	if (bfqq) {
 		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
-		bfq_mark_bfqq_budget_new(bfqq);
 		bfq_clear_bfqq_fifo_expire(bfqq);
 
 		bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;
 
+		bfq_set_budget_timeout(bfqd, bfqq);
 		bfq_log_bfqq(bfqd, bfqq,
 			     "set_in_service_queue, cur-budget = %d",
 			     bfqq->entity.budget);
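
The new bfq_set_budget_timeout() scales the budget timeout by entity.weight / entity.orig_weight, which for a weight-raised queue equals its wr_coeff. A small user-space illustration with an assumed base timeout (the timeout value and weights are assumptions, not taken from the patch):

    #include <stdio.h>

    int main(void)
    {
            unsigned int bfq_timeout_ms = 125;   /* assumed base timeout */
            unsigned int orig_weight = 100;

            unsigned int weight_normal = orig_weight;        /* wr_coeff == 1  */
            unsigned int weight_raised = orig_weight * 30;   /* wr_coeff == 30 */

            /* budget_timeout = now + bfq_timeout * (weight / orig_weight) */
            printf("normal queue:        +%u ms\n",
                   bfq_timeout_ms * (weight_normal / orig_weight));   /* 125 ms  */
            printf("weight-raised queue: +%u ms\n",
                   bfq_timeout_ms * (weight_raised / orig_weight));   /* 3750 ms */
            return 0;
    }
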
@@ -4159,9 +4495,13 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 	 */
 	sl = bfqd->bfq_slice_idle;
 	/*
-	 * Grant only minimum idle time if the queue is seeky.
+	 * Unless the queue is being weight-raised, grant only minimum
+	 * idle time if the queue is seeky. A long idling is preserved
+	 * for a weight-raised queue, because it is needed for
+	 * guaranteeing to the queue its reserved share of the
+	 * throughput.
 	 */
-	if (BFQQ_SEEKY(bfqq))
+	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1)
 		sl = min_t(u64, sl, BFQ_MIN_TT);
 
 	bfqd->last_idling_start = ktime_get();
@@ -4171,27 +4511,6 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 }
 
 /*
- * Set the maximum time for the in-service queue to consume its
- * budget. This prevents seeky processes from lowering the disk
- * throughput (always guaranteed with a time slice scheme as in CFQ).
- */
-static void bfq_set_budget_timeout(struct bfq_data *bfqd)
-{
-	struct bfq_queue *bfqq = bfqd->in_service_queue;
-	unsigned int timeout_coeff = bfqq->entity.weight /
-		bfqq->entity.orig_weight;
-
-	bfqd->last_budget_start = ktime_get();
-
-	bfq_clear_bfqq_budget_new(bfqq);
-	bfqq->budget_timeout = jiffies +
-		bfqd->bfq_timeout * timeout_coeff;
-
-	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
-		jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));
-}
-
-/*
  * In autotuning mode, max_budget is dynamically recomputed as the
  * amount of sectors transferred in timeout at the estimated peak
  * rate. This enables BFQ to utilize a full timeslice with a full
@@ -4204,6 +4523,42 @@ static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
 		jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
 }
 
+/*
+ * Update parameters related to throughput and responsiveness, as a
+ * function of the estimated peak rate. See comments on
+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
+ */
+static void update_thr_responsiveness_params(struct bfq_data *bfqd)
+{
+	int dev_type = blk_queue_nonrot(bfqd->queue);
+
+	if (bfqd->bfq_user_max_budget == 0)
+		bfqd->bfq_max_budget =
+			bfq_calc_max_budget(bfqd);
+
+	if (bfqd->device_speed == BFQ_BFQD_FAST &&
+	    bfqd->peak_rate < device_speed_thresh[dev_type]) {
+		bfqd->device_speed = BFQ_BFQD_SLOW;
+		bfqd->RT_prod = R_slow[dev_type] *
+			T_slow[dev_type];
+	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
+		   bfqd->peak_rate > device_speed_thresh[dev_type]) {
+		bfqd->device_speed = BFQ_BFQD_FAST;
+		bfqd->RT_prod = R_fast[dev_type] *
+			T_fast[dev_type];
+	}
+
+	bfq_log(bfqd,
+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
+		dev_type == 0 ? "ROT" : "NONROT",
+		bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
+		bfqd->device_speed == BFQ_BFQD_FAST ?
+		(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
+		(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
+		(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
+			BFQ_RATE_SHIFT);
+}
+
 static void bfq_reset_rate_computation(struct bfq_data *bfqd,
 				       struct request *rq)
 {
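
The log statement above converts the left-shifted rates back to sectors/sec, i.e. (USEC_PER_SEC * R) >> BFQ_RATE_SHIFT. Applying the same conversion to the reference values gives a feel for the units; a user-space sketch (512-byte sectors assumed; integer division truncates):

    #include <stdio.h>

    #define BFQ_RATE_SHIFT  16
    #define USEC_PER_SEC    1000000ULL

    int main(void)
    {
            /* Reference rates from the patch, in sectors/usec << BFQ_RATE_SHIFT. */
            unsigned long long R_slow_rot = 1000, R_fast_nonrot = 33000;

            unsigned long long slow_sects = (USEC_PER_SEC * R_slow_rot) >> BFQ_RATE_SHIFT;
            unsigned long long fast_sects = (USEC_PER_SEC * R_fast_nonrot) >> BFQ_RATE_SHIFT;

            /* Roughly 7-8 MB/s for the slow rotational reference and
             * ~258 MB/s for the fast non-rotational one.
             */
            printf("slow rotational ref:     %llu sectors/sec (~%llu MB/s)\n",
                   slow_sects, slow_sects * 512 / 1000000);
            printf("fast non-rotational ref: %llu sectors/sec (~%llu MB/s)\n",
                   fast_sects, fast_sects * 512 / 1000000);
            return 0;
    }
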
@@ -4315,9 +4670,7 @@ static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
 	rate /= divisor;	/* smoothing constant alpha = 1/divisor */
 
 	bfqd->peak_rate += rate;
-	if (bfqd->bfq_user_max_budget == 0)
-		bfqd->bfq_max_budget =
-			bfq_calc_max_budget(bfqd);
+	update_thr_responsiveness_params(bfqd);
 
 reset_computation:
 	bfq_reset_rate_computation(bfqd, rq);
@@ -4439,9 +4792,18 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
 
 static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
-	if (RB_EMPTY_ROOT(&bfqq->sort_list))
+	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+		if (bfqq->dispatched == 0)
+			/*
+			 * Overloading budget_timeout field to store
+			 * the time at which the queue remains with no
+			 * backlog and no outstanding request; used by
+			 * the weight-raising mechanism.
+			 */
+			bfqq->budget_timeout = jiffies;
+
 		bfq_del_bfqq_busy(bfqd, bfqq, true);
-	else
+	} else
 		bfq_requeue_bfqq(bfqd, bfqq);
 
 	/*
@@ -4468,9 +4830,18 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
 	struct request *next_rq;
 	int budget, min_budget;
 
-	budget = bfqq->max_budget;
 	min_budget = bfq_min_budget(bfqd);
 
+	if (bfqq->wr_coeff == 1)
+		budget = bfqq->max_budget;
+	else /*
+	      * Use a constant, low budget for weight-raised queues,
+	      * to help achieve a low latency. Keep it slightly higher
+	      * than the minimum possible budget, to cause a little
+	      * bit fewer expirations.
+	      */
+		budget = 2 * min_budget;
+
 	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
 		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
 	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
@@ -4478,7 +4849,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
 	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
 		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
 
-	if (bfq_bfqq_sync(bfqq)) {
+	if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
 		switch (reason) {
 		/*
 		 * Caveat: in all the following cases we trade latency
@@ -4577,7 +4948,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
 		default:
 			return;
 		}
-	} else {
+	} else if (!bfq_bfqq_sync(bfqq)) {
 		/*
 		 * Async queues get always the maximum possible
 		 * budget, as for them we do not care about latency
@@ -4766,15 +5137,19 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
 	 * bandwidth, and not time, distribution with little unlucky
 	 * or quasi-sequential processes.
 	 */
-	if (slow ||
-	    (reason == BFQQE_BUDGET_TIMEOUT &&
-	     bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))
+	if (bfqq->wr_coeff == 1 &&
+	    (slow ||
+	     (reason == BFQQE_BUDGET_TIMEOUT &&
+	      bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
 		bfq_bfqq_charge_time(bfqd, bfqq, delta);
 
 	if (reason == BFQQE_TOO_IDLE &&
 	    entity->service <= 2 * entity->budget / 10)
 		bfq_clear_bfqq_IO_bound(bfqq);
 
+	if (bfqd->low_latency && bfqq->wr_coeff == 1)
+		bfqq->last_wr_start_finish = jiffies;
+
 	bfq_log_bfqq(bfqd, bfqq,
 		"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
 		slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
@@ -4801,10 +5176,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd,
  */
 static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
 {
-	if (bfq_bfqq_budget_new(bfqq) ||
-	    time_is_after_jiffies(bfqq->budget_timeout))
-		return false;
-	return true;
+	return time_is_before_eq_jiffies(bfqq->budget_timeout);
 }
 
 /*
@@ -4831,19 +5203,40 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
 
 /*
  * For a queue that becomes empty, device idling is allowed only if
- * this function returns true for the queue. And this function returns
- * true only if idling is beneficial for throughput.
+ * this function returns true for the queue. As a consequence, since
+ * device idling plays a critical role in both throughput boosting and
+ * service guarantees, the return value of this function plays a
+ * critical role in both these aspects as well.
+ *
+ * In a nutshell, this function returns true only if idling is
+ * beneficial for throughput or, even if detrimental for throughput,
+ * idling is however necessary to preserve service guarantees (low
+ * latency, desired throughput distribution, ...). In particular, on
+ * NCQ-capable devices, this function tries to return false, so as to
+ * help keep the drives' internal queues full, whenever this helps the
+ * device boost the throughput without causing any service-guarantee
+ * issue.
+ *
+ * In more detail, the return value of this function is obtained by,
+ * first, computing a number of boolean variables that take into
+ * account throughput and service-guarantee issues, and, then,
+ * combining these variables in a logical expression. Most of the
+ * issues taken into account are not trivial. We discuss these issues
+ * individually while introducing the variables.
  */
 static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 {
 	struct bfq_data *bfqd = bfqq->bfqd;
-	bool idling_boosts_thr;
+	bool idling_boosts_thr, asymmetric_scenario;
 
 	if (bfqd->strict_guarantees)
 		return true;
 
 	/*
-	 * The value of the next variable is computed considering that
+	 * The next variable takes into account the cases where idling
+	 * boosts the throughput.
+	 *
+	 * The value of the variable is computed considering that
 	 * idling is usually beneficial for the throughput if:
 	 * (a) the device is not NCQ-capable, or
 	 * (b) regardless of the presence of NCQ, the request pattern
@@ -4857,13 +5250,80 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
 	idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq);
 
 	/*
-	 * We have now the components we need to compute the return
-	 * value of the function, which is true only if both the
-	 * following conditions hold:
+	 * There is then a case where idling must be performed not for
+	 * throughput concerns, but to preserve service guarantees. To
+	 * introduce it, we can note that allowing the drive to
+	 * enqueue more than one request at a time, and hence
+	 * delegating de facto final scheduling decisions to the
+	 * drive's internal scheduler, causes loss of control on the
+	 * actual request service order. In particular, the critical
+	 * situation is when requests from different processes happen
+	 * to be present, at the same time, in the internal queue(s)
+	 * of the drive. In such a situation, the drive, by deciding
+	 * the service order of the internally-queued requests, does
+	 * determine also the actual throughput distribution among
+	 * these processes. But the drive typically has no notion or
+	 * concern about per-process throughput distribution, and
+	 * makes its decisions only on a per-request basis. Therefore,
+	 * the service distribution enforced by the drive's internal
+	 * scheduler is likely to coincide with the desired
+	 * device-throughput distribution only in a completely
+	 * symmetric scenario where: (i) each of these processes must
+	 * get the same throughput as the others; (ii) all these
+	 * processes have the same I/O pattern (either sequential or
+	 * random). In fact, in such a scenario, the drive will tend
+	 * to treat the requests of each of these processes in about
+	 * the same way as the requests of the others, and thus to
+	 * provide each of these processes with about the same
+	 * throughput (which is exactly the desired throughput
+	 * distribution). In contrast, in any asymmetric scenario,
+	 * device idling is certainly needed to guarantee that bfqq
+	 * receives its assigned fraction of the device throughput
+	 * (see [1] for details).
+	 *
+	 * As for sub-condition (i), actually we check only whether
+	 * bfqq is being weight-raised. In fact, if bfqq is not being
+	 * weight-raised, we have that:
+	 * - if the process associated with bfqq is not I/O-bound, then
+	 *   it is not either latency- or throughput-critical; therefore
+	 *   idling is not needed for bfqq;
+	 * - if the process associated with bfqq is I/O-bound, then
+	 *   idling is already granted with bfqq (see the comments on
+	 *   idling_boosts_thr).
+	 *
+	 * We do not check sub-condition (ii) at all, i.e., the next
+	 * variable is true if and only if bfqq is being
+	 * weight-raised. We do not need to control sub-condition (ii)
+	 * for the following reason:
+	 * - if bfqq is being weight-raised, then idling is already
+	 *   guaranteed to bfqq by sub-condition (i);
+	 * - if bfqq is not being weight-raised, then idling is
+	 *   already guaranteed to bfqq (only) if it matters, i.e., if
+	 *   bfqq is associated to a currently I/O-bound process (see
+	 *   the above comment on sub-condition (i)).
+	 *
+	 * As a side note, it is worth considering that the above
+	 * device-idling countermeasures may however fail in the
+	 * following unlucky scenario: if idling is (correctly)
+	 * disabled in a time period during which the symmetry
+	 * sub-condition holds, and hence the device is allowed to
+	 * enqueue many requests, but at some later point in time some
+	 * sub-condition stops to hold, then it may become impossible
+	 * to let requests be served in the desired order until all
+	 * the requests already queued in the device have been served.
+	 */
+	asymmetric_scenario = bfqq->wr_coeff > 1;
+
+	/*
+	 * We have now all the components we need to compute the return
+	 * value of the function, which is true only if both the following
+	 * conditions hold:
 	 * 1) bfqq is sync, because idling make sense only for sync queues;
-	 * 2) idling boosts the throughput.
+	 * 2) idling either boosts the throughput (without issues), or
+	 *    is necessary to preserve service guarantees.
 	 */
-	return bfq_bfqq_sync(bfqq) && idling_boosts_thr;
+	return bfq_bfqq_sync(bfqq) &&
+		(idling_boosts_thr || asymmetric_scenario);
 }
 
 /*
@@ -4986,6 +5446,43 @@ keep_queue:
 	return bfqq;
 }
 
+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
+		bfq_log_bfqq(bfqd, bfqq,
+			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
+			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+			jiffies_to_msecs(bfqq->wr_cur_max_time),
+			bfqq->wr_coeff,
+			bfqq->entity.weight, bfqq->entity.orig_weight);
+
+		if (entity->prio_changed)
+			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
+
+		/*
+		 * If too much time has elapsed from the beginning of
+		 * this weight-raising period, then end weight
+		 * raising.
+		 */
+		if (time_is_before_jiffies(bfqq->last_wr_start_finish +
+					   bfqq->wr_cur_max_time)) {
+			bfqq->last_wr_start_finish = jiffies;
+			bfq_log_bfqq(bfqd, bfqq,
+				     "wrais ending at %lu, rais_max_time %u",
+				     bfqq->last_wr_start_finish,
+				     jiffies_to_msecs(bfqq->wr_cur_max_time));
+			bfq_bfqq_end_wr(bfqq);
+		}
+	}
+	/* Update weight both if it must be raised and if it must be lowered */
+	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
+		__bfq_entity_update_weight_prio(
+			bfq_entity_service_tree(entity),
+			entity);
+}
+
 /*
  * Dispatch next request from bfqq.
  */
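
The final condition in bfq_update_wr_data(), (entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1), fires only when the weight stored in the service tree disagrees with the current weight-raising state, in either direction. A small user-space truth-table sketch with assumed weights (illustrative only):

    #include <stdio.h>
    #include <stdbool.h>

    int main(void)
    {
            /* Four representative states of a queue. */
            struct { unsigned int weight, orig_weight, wr_coeff; } q[] = {
                    {  100, 100,  1 },  /* consistent: nothing to do     */
                    {  100, 100, 30 },  /* raising just started: update  */
                    { 3000, 100, 30 },  /* already raised: nothing to do */
                    { 3000, 100,  1 },  /* raising just ended: update    */
            };

            for (int i = 0; i < 4; i++) {
                    bool update = (q[i].weight > q[i].orig_weight) !=
                                  (q[i].wr_coeff > 1);
                    printf("case %d: update weight? %s\n",
                           i, update ? "yes" : "no");
            }
            return 0;
    }
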
@@ -5001,6 +5498,19 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
 
 	bfq_dispatch_remove(bfqd->queue, rq);
 
+	/*
+	 * If weight raising has to terminate for bfqq, then next
+	 * function causes an immediate update of bfqq's weight,
+	 * without waiting for next activation. As a consequence, on
+	 * expiration, bfqq will be timestamped as if it has never been
+	 * weight-raised during this service slot, even if it has
+	 * received part or even most of the service as a
+	 * weight-raised queue. This inflates bfqq's timestamps, which
+	 * is beneficial, as bfqq is then more willing to leave the
+	 * device immediately to possible other weight-raised queues.
+	 */
+	bfq_update_wr_data(bfqd, bfqq);
+
 	if (!bfqd->in_service_bic) {
 		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
 		bfqd->in_service_bic = RQ_BIC(rq);
@@ -5306,6 +5816,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
 	bfqq->budget_timeout = bfq_smallest_from_now();
 
+	bfqq->wr_coeff = 1;
+	bfqq->last_wr_start_finish = bfq_smallest_from_now();
+
 	/* first request is almost certainly seeky */
 	bfqq->seek_history = 1;
 }
@@ -5440,7 +5953,8 @@ static void bfq_update_idle_window(struct bfq_data *bfqd,
 		 (bfqd->hw_tag && BFQQ_SEEKY(bfqq)))
 		enable_idle = 0;
 	else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
-		if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)
+		if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
+		    bfqq->wr_coeff == 1)
 			enable_idle = 0;
 		else
 			enable_idle = 1;
@@ -5618,6 +6132,16 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 	bfqd->rq_in_driver--;
 	bfqq->dispatched--;
 
+	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
+		/*
+		 * Set budget_timeout (which we overload to store the
+		 * time at which the queue remains with no backlog and
+		 * no outstanding request; used by the weight-raising
+		 * mechanism).
+		 */
+		bfqq->budget_timeout = jiffies;
+	}
+
 	now_ns = ktime_get_ns();
 
 	bfqq->ttime.last_end_request = now_ns;
@@ -5655,10 +6179,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
 	 * or if we want to idle in case it has no pending requests.
 	 */
 	if (bfqd->in_service_queue == bfqq) {
-		if (bfq_bfqq_budget_new(bfqq))
-			bfq_set_budget_timeout(bfqd);
-
-		if (bfq_bfqq_must_idle(bfqq)) {
+		if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
 			bfq_arm_slice_timer(bfqd);
 			return;
 		} else if (bfq_may_expire_for_budg_timeout(bfqq))
@@ -5966,6 +6487,26 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 
 	bfqd->bfq_requests_within_timer = 120;
 
+	bfqd->low_latency = true;
+
+	/*
+	 * Trade-off between responsiveness and fairness.
+	 */
+	bfqd->bfq_wr_coeff = 30;
+	bfqd->bfq_wr_max_time = 0;
+	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
+	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
+
+	/*
+	 * Begin by assuming, optimistically, that the device is a
+	 * high-speed one, and that its peak rate is equal to 2/3 of
+	 * the highest reference rate.
+	 */
+	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
+			T_fast[blk_queue_nonrot(bfqd->queue)];
+	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
+	bfqd->device_speed = BFQ_BFQD_FAST;
+
 	spin_lock_init(&bfqd->lock);
 
 	/*
@@ -6047,6 +6588,7 @@ SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
 SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
 SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
 SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
 #undef SHOW_FUNCTION
 
 #define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
@@ -6167,6 +6709,22 @@ static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
 	return ret;
 }
 
+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
+				     const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long uninitialized_var(__data);
+	int ret = bfq_var_store(&__data, (page), count);
+
+	if (__data > 1)
+		__data = 1;
+	if (__data == 0 && bfqd->low_latency != 0)
+		bfq_end_wr(bfqd);
+	bfqd->low_latency = __data;
+
+	return ret;
+}
+
 #define BFQ_ATTR(name) \
 	__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
 
@@ -6180,6 +6738,7 @@ static struct elv_fs_entry bfq_attrs[] = {
 	BFQ_ATTR(max_budget),
 	BFQ_ATTR(timeout_sync),
 	BFQ_ATTR(strict_guarantees),
+	BFQ_ATTR(low_latency),
 	__ATTR_NULL
 };
 
@@ -6242,6 +6801,39 @@ static int __init bfq_init(void)
 	if (bfq_slab_setup())
 		goto err_pol_unreg;
 
+	/*
+	 * Times to load large popular applications for the typical
+	 * systems installed on the reference devices (see the
+	 * comments before the definitions of the next two
+	 * arrays). Actually, we use slightly slower values, as the
+	 * estimated peak rate tends to be smaller than the actual
+	 * peak rate. The reason for this last fact is that estimates
+	 * are computed over much shorter time intervals than the long
+	 * intervals typically used for benchmarking. Why? First, to
+	 * adapt more quickly to variations. Second, because an I/O
+	 * scheduler cannot rely on a peak-rate-evaluation workload to
+	 * be run for a long time.
+	 */
+	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
+	T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
+	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
+	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
+
+	/*
+	 * Thresholds that determine the switch between speed classes
+	 * (see the comments before the definition of the array
+	 * device_speed_thresh). These thresholds are biased towards
+	 * transitions to the fast class. This is safer than the
+	 * opposite bias. In fact, a wrong transition to the slow
+	 * class results in short weight-raising periods, because the
+	 * speed of the device then tends to be higher than the
+	 * reference peak rate. On the opposite end, a wrong
+	 * transition to the fast class tends to increase
+	 * weight-raising periods, because of the opposite reason.
+	 */
+	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
+	device_speed_thresh[1] = (4 * R_slow[1]) / 3;
+
 	ret = elv_register(&iosched_bfq_mq);
 	if (ret)
 		goto err_pol_unreg;
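
The thresholds computed above sit at only 4/3 of the slow reference rates, well below the fast reference rates, which is what biases classification towards the fast class. A user-space sketch of the same computation (values copied from the patch; illustrative only):

    #include <stdio.h>

    int main(void)
    {
            /* Reference rates from the patch (scaled units). */
            int R_slow[2] = { 1000, 10700 };
            int R_fast[2] = { 14000, 33000 };
            int thresh[2];

            /* Only 4/3 of the slow reference rate, far below R_fast, so a
             * device has to look quite slow before it is (re)classified
             * as slow.
             */
            for (int i = 0; i < 2; i++) {
                    thresh[i] = (4 * R_slow[i]) / 3;
                    printf("%s: thresh %d vs R_fast %d\n",
                           i == 0 ? "rotational" : "non-rotational",
                           thresh[i], R_fast[i]);
            }
            return 0;
    }
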