 block/bfq-iosched.c | 387 ++++++++++++++++++++++++++++----------------------
 1 file changed, 203 insertions(+), 184 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 6a3d05023300..72840ebf953e 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -3210,7 +3210,186 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
 	bfq_remove_request(q, rq);
 }
 
-static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+/*
+ * There is a case where idling does not have to be performed for
+ * throughput concerns, but to preserve the throughput share of
+ * the process associated with bfqq.
+ *
+ * To introduce this case, we can note that allowing the drive
+ * to enqueue more than one request at a time, and hence
+ * delegating de facto final scheduling decisions to the
+ * drive's internal scheduler, entails loss of control on the
+ * actual request service order. In particular, the critical
+ * situation is when requests from different processes happen
+ * to be present, at the same time, in the internal queue(s)
+ * of the drive. In such a situation, the drive, by deciding
+ * the service order of the internally-queued requests, does
+ * determine also the actual throughput distribution among
+ * these processes. But the drive typically has no notion or
+ * concern about per-process throughput distribution, and
+ * makes its decisions only on a per-request basis. Therefore,
+ * the service distribution enforced by the drive's internal
+ * scheduler is likely to coincide with the desired throughput
+ * distribution only in a completely symmetric, or favorably
+ * skewed scenario where:
+ * (i-a) each of these processes must get the same throughput as
+ *       the others,
+ * (i-b) in case (i-a) does not hold, it holds that the process
+ *       associated with bfqq must receive a lower or equal
+ *       throughput than any of the other processes;
+ * (ii)  the I/O of each process has the same properties, in
+ *       terms of locality (sequential or random), direction
+ *       (reads or writes), request sizes, greediness
+ *       (from I/O-bound to sporadic), and so on;
+
+ * In fact, in such a scenario, the drive tends to treat the requests
+ * of each process in about the same way as the requests of the
+ * others, and thus to provide each of these processes with about the
+ * same throughput. This is exactly the desired throughput
+ * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
+ * even more convenient distribution for (the process associated with)
+ * bfqq.
+ *
+ * In contrast, in any asymmetric or unfavorable scenario, device
+ * idling (I/O-dispatch plugging) is certainly needed to guarantee
+ * that bfqq receives its assigned fraction of the device throughput
+ * (see [1] for details).
+ *
+ * The problem is that idling may significantly reduce throughput with
+ * certain combinations of types of I/O and devices. An important
+ * example is sync random I/O on flash storage with command
+ * queueing. So, unless bfqq falls in cases where idling also boosts
+ * throughput, it is important to check conditions (i-a), i(-b) and
+ * (ii) accurately, so as to avoid idling when not strictly needed for
+ * service guarantees.
+ *
+ * Unfortunately, it is extremely difficult to thoroughly check
+ * condition (ii). And, in case there are active groups, it becomes
+ * very difficult to check conditions (i-a) and (i-b) too. In fact,
+ * if there are active groups, then, for conditions (i-a) or (i-b) to
+ * become false 'indirectly', it is enough that an active group
+ * contains more active processes or sub-groups than some other active
+ * group. More precisely, for conditions (i-a) or (i-b) to become
+ * false because of such a group, it is not even necessary that the
+ * group is (still) active: it is sufficient that, even if the group
+ * has become inactive, some of its descendant processes still have
+ * some request already dispatched but still waiting for
+ * completion. In fact, requests have still to be guaranteed their
+ * share of the throughput even after being dispatched. In this
+ * respect, it is easy to show that, if a group frequently becomes
+ * inactive while still having in-flight requests, and if, when this
+ * happens, the group is not considered in the calculation of whether
+ * the scenario is asymmetric, then the group may fail to be
+ * guaranteed its fair share of the throughput (basically because
+ * idling may not be performed for the descendant processes of the
+ * group, but it had to be). We address this issue with the following
+ * bi-modal behavior, implemented in the function
+ * bfq_asymmetric_scenario().
+ *
+ * If there are groups with requests waiting for completion
+ * (as commented above, some of these groups may even be
+ * already inactive), then the scenario is tagged as
+ * asymmetric, conservatively, without checking any of the
+ * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
+ * This behavior matches also the fact that groups are created
+ * exactly if controlling I/O is a primary concern (to
+ * preserve bandwidth and latency guarantees).
+ *
+ * On the opposite end, if there are no groups with requests waiting
+ * for completion, then only conditions (i-a) and (i-b) are actually
+ * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
+ * idling is not performed, regardless of whether condition (ii)
+ * holds. In other words, only if conditions (i-a) and (i-b) do not
+ * hold, then idling is allowed, and the device tends to be prevented
+ * from queueing many requests, possibly of several processes. Since
+ * there are no groups with requests waiting for completion, then, to
+ * control conditions (i-a) and (i-b) it is enough to check just
+ * whether all the queues with requests waiting for completion also
+ * have the same weight.
+ *
+ * Not checking condition (ii) evidently exposes bfqq to the
+ * risk of getting less throughput than its fair share.
+ * However, for queues with the same weight, a further
+ * mechanism, preemption, mitigates or even eliminates this
+ * problem. And it does so without consequences on overall
+ * throughput. This mechanism and its benefits are explained
+ * in the next three paragraphs.
+ *
+ * Even if a queue, say Q, is expired when it remains idle, Q
+ * can still preempt the new in-service queue if the next
+ * request of Q arrives soon (see the comments on
+ * bfq_bfqq_update_budg_for_activation). If all queues and
+ * groups have the same weight, this form of preemption,
+ * combined with the hole-recovery heuristic described in the
+ * comments on function bfq_bfqq_update_budg_for_activation,
+ * are enough to preserve a correct bandwidth distribution in
+ * the mid term, even without idling. In fact, even if not
+ * idling allows the internal queues of the device to contain
+ * many requests, and thus to reorder requests, we can rather
+ * safely assume that the internal scheduler still preserves a
+ * minimum of mid-term fairness.
+ *
+ * More precisely, this preemption-based, idleless approach
+ * provides fairness in terms of IOPS, and not sectors per
+ * second. This can be seen with a simple example. Suppose
+ * that there are two queues with the same weight, but that
+ * the first queue receives requests of 8 sectors, while the
+ * second queue receives requests of 1024 sectors. In
+ * addition, suppose that each of the two queues contains at
+ * most one request at a time, which implies that each queue
+ * always remains idle after it is served. Finally, after
+ * remaining idle, each queue receives very quickly a new
+ * request. It follows that the two queues are served
+ * alternatively, preempting each other if needed. This
+ * implies that, although both queues have the same weight,
+ * the queue with large requests receives a service that is
+ * 1024/8 times as high as the service received by the other
+ * queue.
+ *
+ * The motivation for using preemption instead of idling (for
+ * queues with the same weight) is that, by not idling,
+ * service guarantees are preserved (completely or at least in
+ * part) without minimally sacrificing throughput. And, if
+ * there is no active group, then the primary expectation for
+ * this device is probably a high throughput.
+ *
+ * We are now left only with explaining the additional
+ * compound condition that is checked below for deciding
+ * whether the scenario is asymmetric. To explain this
+ * compound condition, we need to add that the function
+ * bfq_asymmetric_scenario checks the weights of only
+ * non-weight-raised queues, for efficiency reasons (see
+ * comments on bfq_weights_tree_add()). Then the fact that
+ * bfqq is weight-raised is checked explicitly here. More
+ * precisely, the compound condition below takes into account
+ * also the fact that, even if bfqq is being weight-raised,
+ * the scenario is still symmetric if all queues with requests
+ * waiting for completion happen to be
+ * weight-raised. Actually, we should be even more precise
+ * here, and differentiate between interactive weight raising
+ * and soft real-time weight raising.
+ *
+ * As a side note, it is worth considering that the above
+ * device-idling countermeasures may however fail in the
+ * following unlucky scenario: if idling is (correctly)
+ * disabled in a time period during which all symmetry
+ * sub-conditions hold, and hence the device is allowed to
+ * enqueue many requests, but at some later point in time some
+ * sub-condition stops to hold, then it may become impossible
+ * to let requests be served in the desired order until all
+ * the requests already queued in the device have been served.
+ */
+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
+						 struct bfq_queue *bfqq)
+{
+	return (bfqq->wr_coeff > 1 &&
+		bfqd->wr_busy_queues <
+		bfq_tot_busy_queues(bfqd)) ||
+		bfq_asymmetric_scenario(bfqd, bfqq);
+}
+
+static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			      enum bfqq_expiration reason)
 {
 	/*
 	 * If this bfqq is shared between multiple processes, check
@@ -3221,7 +3400,22 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
 		bfq_mark_bfqq_split_coop(bfqq);
 
-	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+	/*
+	 * Consider queues with a higher finish virtual time than
+	 * bfqq. If idling_needed_for_service_guarantees(bfqq) returns
+	 * true, then bfqq's bandwidth would be violated if an
+	 * uncontrolled amount of I/O from these queues were
+	 * dispatched while bfqq is waiting for its new I/O to
+	 * arrive. This is exactly what may happen if this is a forced
+	 * expiration caused by a preemption attempt, and if bfqq is
+	 * not re-scheduled. To prevent this from happening, re-queue
+	 * bfqq if it needs I/O-dispatch plugging, even if it is
+	 * empty. By doing so, bfqq is granted to be served before the
+	 * above queues (provided that bfqq is of course eligible).
+	 */
+	if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
+	    !(reason == BFQQE_PREEMPTED &&
+	      idling_needed_for_service_guarantees(bfqd, bfqq))) {
 		if (bfqq->dispatched == 0)
 			/*
 			 * Overloading budget_timeout field to store
@@ -3238,7 +3432,8 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 		 * Resort priority tree of potential close cooperators.
 		 * See comments on bfq_pos_tree_add_move() for the unlikely().
 		 */
-		if (unlikely(!bfqd->nonrot_with_queueing))
+		if (unlikely(!bfqd->nonrot_with_queueing &&
+			     !RB_EMPTY_ROOT(&bfqq->sort_list)))
 			bfq_pos_tree_add_move(bfqd, bfqq);
 	}
 
@@ -3739,7 +3934,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
 	 * reason.
 	 */
 	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
-	if (__bfq_bfqq_expire(bfqd, bfqq))
+	if (__bfq_bfqq_expire(bfqd, bfqq, reason))
 		/* bfqq is gone, no more actions on it */
 		return;
 
@@ -3886,184 +4081,6 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
 }
 
 /*
- * There is a case where idling does not have to be performed for
- * throughput concerns, but to preserve the throughput share of
- * the process associated with bfqq.
- *
- * To introduce this case, we can note that allowing the drive
- * to enqueue more than one request at a time, and hence
- * delegating de facto final scheduling decisions to the
- * drive's internal scheduler, entails loss of control on the
- * actual request service order. In particular, the critical
- * situation is when requests from different processes happen
- * to be present, at the same time, in the internal queue(s)
- * of the drive. In such a situation, the drive, by deciding
- * the service order of the internally-queued requests, does
- * determine also the actual throughput distribution among
- * these processes. But the drive typically has no notion or
- * concern about per-process throughput distribution, and
- * makes its decisions only on a per-request basis. Therefore,
- * the service distribution enforced by the drive's internal
- * scheduler is likely to coincide with the desired throughput
- * distribution only in a completely symmetric, or favorably
- * skewed scenario where:
- * (i-a) each of these processes must get the same throughput as
- *       the others,
- * (i-b) in case (i-a) does not hold, it holds that the process
- *       associated with bfqq must receive a lower or equal
- *       throughput than any of the other processes;
- * (ii)  the I/O of each process has the same properties, in
- *       terms of locality (sequential or random), direction
- *       (reads or writes), request sizes, greediness
- *       (from I/O-bound to sporadic), and so on;
-
- * In fact, in such a scenario, the drive tends to treat the requests
- * of each process in about the same way as the requests of the
- * others, and thus to provide each of these processes with about the
- * same throughput. This is exactly the desired throughput
- * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
- * even more convenient distribution for (the process associated with)
- * bfqq.
- *
- * In contrast, in any asymmetric or unfavorable scenario, device
- * idling (I/O-dispatch plugging) is certainly needed to guarantee
- * that bfqq receives its assigned fraction of the device throughput
- * (see [1] for details).
- *
- * The problem is that idling may significantly reduce throughput with
- * certain combinations of types of I/O and devices. An important
- * example is sync random I/O on flash storage with command
- * queueing. So, unless bfqq falls in cases where idling also boosts
- * throughput, it is important to check conditions (i-a), i(-b) and
- * (ii) accurately, so as to avoid idling when not strictly needed for
- * service guarantees.
- *
- * Unfortunately, it is extremely difficult to thoroughly check
- * condition (ii). And, in case there are active groups, it becomes
- * very difficult to check conditions (i-a) and (i-b) too. In fact,
- * if there are active groups, then, for conditions (i-a) or (i-b) to
- * become false 'indirectly', it is enough that an active group
- * contains more active processes or sub-groups than some other active
- * group. More precisely, for conditions (i-a) or (i-b) to become
- * false because of such a group, it is not even necessary that the
- * group is (still) active: it is sufficient that, even if the group
- * has become inactive, some of its descendant processes still have
- * some request already dispatched but still waiting for
- * completion. In fact, requests have still to be guaranteed their
- * share of the throughput even after being dispatched. In this
- * respect, it is easy to show that, if a group frequently becomes
- * inactive while still having in-flight requests, and if, when this
- * happens, the group is not considered in the calculation of whether
- * the scenario is asymmetric, then the group may fail to be
- * guaranteed its fair share of the throughput (basically because
- * idling may not be performed for the descendant processes of the
- * group, but it had to be). We address this issue with the following
- * bi-modal behavior, implemented in the function
- * bfq_asymmetric_scenario().
- *
- * If there are groups with requests waiting for completion
- * (as commented above, some of these groups may even be
- * already inactive), then the scenario is tagged as
- * asymmetric, conservatively, without checking any of the
- * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
- * This behavior matches also the fact that groups are created
- * exactly if controlling I/O is a primary concern (to
- * preserve bandwidth and latency guarantees).
- *
- * On the opposite end, if there are no groups with requests waiting
- * for completion, then only conditions (i-a) and (i-b) are actually
- * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
- * idling is not performed, regardless of whether condition (ii)
- * holds. In other words, only if conditions (i-a) and (i-b) do not
- * hold, then idling is allowed, and the device tends to be prevented
- * from queueing many requests, possibly of several processes. Since
- * there are no groups with requests waiting for completion, then, to
- * control conditions (i-a) and (i-b) it is enough to check just
- * whether all the queues with requests waiting for completion also
- * have the same weight.
- *
- * Not checking condition (ii) evidently exposes bfqq to the
- * risk of getting less throughput than its fair share.
- * However, for queues with the same weight, a further
- * mechanism, preemption, mitigates or even eliminates this
- * problem. And it does so without consequences on overall
- * throughput. This mechanism and its benefits are explained
- * in the next three paragraphs.
- *
- * Even if a queue, say Q, is expired when it remains idle, Q
- * can still preempt the new in-service queue if the next
- * request of Q arrives soon (see the comments on
- * bfq_bfqq_update_budg_for_activation). If all queues and
- * groups have the same weight, this form of preemption,
- * combined with the hole-recovery heuristic described in the
- * comments on function bfq_bfqq_update_budg_for_activation,
- * are enough to preserve a correct bandwidth distribution in
- * the mid term, even without idling. In fact, even if not
- * idling allows the internal queues of the device to contain
- * many requests, and thus to reorder requests, we can rather
- * safely assume that the internal scheduler still preserves a
- * minimum of mid-term fairness.
- *
- * More precisely, this preemption-based, idleless approach
- * provides fairness in terms of IOPS, and not sectors per
- * second. This can be seen with a simple example. Suppose
- * that there are two queues with the same weight, but that
- * the first queue receives requests of 8 sectors, while the
- * second queue receives requests of 1024 sectors. In
- * addition, suppose that each of the two queues contains at
- * most one request at a time, which implies that each queue
- * always remains idle after it is served. Finally, after
- * remaining idle, each queue receives very quickly a new
- * request. It follows that the two queues are served
- * alternatively, preempting each other if needed. This
- * implies that, although both queues have the same weight,
- * the queue with large requests receives a service that is
- * 1024/8 times as high as the service received by the other
- * queue.
- *
- * The motivation for using preemption instead of idling (for
- * queues with the same weight) is that, by not idling,
- * service guarantees are preserved (completely or at least in
- * part) without minimally sacrificing throughput. And, if
- * there is no active group, then the primary expectation for
- * this device is probably a high throughput.
- *
- * We are now left only with explaining the additional
- * compound condition that is checked below for deciding
- * whether the scenario is asymmetric. To explain this
- * compound condition, we need to add that the function
- * bfq_asymmetric_scenario checks the weights of only
- * non-weight-raised queues, for efficiency reasons (see
- * comments on bfq_weights_tree_add()). Then the fact that
- * bfqq is weight-raised is checked explicitly here. More
- * precisely, the compound condition below takes into account
- * also the fact that, even if bfqq is being weight-raised,
- * the scenario is still symmetric if all queues with requests
- * waiting for completion happen to be
- * weight-raised. Actually, we should be even more precise
- * here, and differentiate between interactive weight raising
- * and soft real-time weight raising.
- *
- * As a side note, it is worth considering that the above
- * device-idling countermeasures may however fail in the
- * following unlucky scenario: if idling is (correctly)
- * disabled in a time period during which all symmetry
- * sub-conditions hold, and hence the device is allowed to
- * enqueue many requests, but at some later point in time some
- * sub-condition stops to hold, then it may become impossible
- * to let requests be served in the desired order until all
- * the requests already queued in the device have been served.
- */
-static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
-						 struct bfq_queue *bfqq)
-{
-	return (bfqq->wr_coeff > 1 &&
-		bfqd->wr_busy_queues <
-		bfq_tot_busy_queues(bfqd)) ||
-		bfq_asymmetric_scenario(bfqd, bfqq);
-}
-
-/*
  * For a queue that becomes empty, device idling is allowed only if
  * this function returns true for that queue. As a consequence, since
  * device idling plays a critical role for both throughput boosting
@@ -4321,7 +4338,8 @@ check_queue:
 	    (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
 		struct bfq_queue *async_bfqq =
 			bfqq->bic && bfqq->bic->bfqq[0] &&
-			bfq_bfqq_busy(bfqq->bic->bfqq[0]) ?
+			bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
+			bfqq->bic->bfqq[0]->next_rq ?
 			bfqq->bic->bfqq[0] : NULL;
 
 		/*
@@ -4403,6 +4421,7 @@ check_queue:
 			bfqq = bfqq->bic->bfqq[0];
 		else if (bfq_bfqq_has_waker(bfqq) &&
 			 bfq_bfqq_busy(bfqq->waker_bfqq) &&
+			 bfqq->next_rq &&
 			 bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
 					    bfqq->waker_bfqq) <=
 			 bfq_bfqq_budget_left(bfqq->waker_bfqq)
@@ -4800,7 +4819,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 	struct hlist_node *n;
 
 	if (bfqq == bfqd->in_service_queue) {
-		__bfq_bfqq_expire(bfqd, bfqq);
+		__bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT);
 		bfq_schedule_dispatch(bfqd);
 	}
 
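
The decision this patch changes in __bfq_bfqq_expire() can be followed outside the kernel sources. The stand-alone sketch below uses made-up types and helper names (queue_state, needs_dispatch_plugging, remove_from_service_tree are illustrative, not BFQ identifiers); only the boolean conditions mirror the patched code: an empty in-service queue is no longer dropped from the scheduler when its expiration was forced by a preemption and I/O-dispatch plugging would anyway be needed to protect its bandwidth.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the BFQ state that the decision reads. */
enum expiration_reason { PREEMPTED, BUDGET_TIMEOUT, NO_MORE_REQUESTS };

struct queue_state {
	bool has_pending_io;  /* sort_list non-empty */
	bool weight_raised;   /* bfqq->wr_coeff > 1 */
	bool all_busy_raised; /* wr_busy_queues == tot_busy_queues */
	bool asymmetric;      /* result of bfq_asymmetric_scenario() */
};

/*
 * Mirrors idling_needed_for_service_guarantees(): plug dispatch if the
 * queue is weight-raised while some busy queue is not, or if the
 * scenario is otherwise asymmetric.
 */
static bool needs_dispatch_plugging(const struct queue_state *q)
{
	return (q->weight_raised && !q->all_busy_raised) || q->asymmetric;
}

/*
 * Mirrors the patched test in __bfq_bfqq_expire(): remove the queue
 * from the service tree only if it is empty AND it is not a preempted
 * queue that still needs plugging to defend its bandwidth.
 */
static bool remove_from_service_tree(const struct queue_state *q,
				     enum expiration_reason reason)
{
	return !q->has_pending_io &&
	       !(reason == PREEMPTED && needs_dispatch_plugging(q));
}

int main(void)
{
	struct queue_state q = {
		.has_pending_io = false, .weight_raised = true,
		.all_busy_raised = false, .asymmetric = false,
	};

	printf("preempted: remove=%d\n", remove_from_service_tree(&q, PREEMPTED));
	printf("timeout:   remove=%d\n", remove_from_service_tree(&q, BUDGET_TIMEOUT));
	return 0;
}

With the state above (empty, weight-raised, while not all busy queues are weight-raised), the preempted case keeps the queue scheduled while the budget-timeout case removes it; before this patch both cases removed it, which is the behavioural difference the commit introduces.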