aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPaolo Valente <paolo.valente@linaro.org>2018-12-06 13:18:18 -0500
committerJens Axboe <axboe@kernel.dk>2018-12-07 09:40:07 -0500
commitba7aeae5539c7a7cccc4cf07a2bc61281a93c50e (patch)
tree3c629f7194a500adb395329925ea0ce83c2a28d3
parentffe81d45322cc3cb140f0db080a4727ea284661e (diff)
block, bfq: fix decrement of num_active_groups
Since commit '2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection")', if there are process groups with I/O requests waiting for completion, then BFQ tags the scenario as 'asymmetric'. This detection is needed for preserving service guarantees (for details, see comments on the computation * of the variable asymmetric_scenario in the function bfq_better_to_idle). Unfortunately, commit '2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection")' contains an error exactly in the updating of the number of groups with I/O requests waiting for completion: if a group has more than one descendant process, then the above number of groups, which is renamed from num_active_groups to a more appropriate num_groups_with_pending_reqs by this commit, may happen to be wrongly decremented multiple times, namely every time one of the descendant processes gets all its pending I/O requests completed. A correct, complete solution should work as follows. Consider a group that is inactive, i.e., that has no descendant process with pending I/O inside BFQ queues. Then suppose that num_groups_with_pending_reqs is still accounting for this group, because the group still has some descendant process with some I/O request still in flight. num_groups_with_pending_reqs should be decremented when the in-flight request of the last descendant process is finally completed (assuming that nothing else has changed for the group in the meantime, in terms of composition of the group and active/inactive state of child groups and processes). To accomplish this, an additional pending-request counter must be added to entities, and must be updated correctly. To avoid this additional field and operations, this commit resorts to the following tradeoff between simplicity and accuracy: for an inactive group that is still counted in num_groups_with_pending_reqs, this commit decrements num_groups_with_pending_reqs when the first descendant process of the group remains with no request waiting for completion. This simplified scheme provides a fix to the unbalanced decrements introduced by 2d29c9f89fcd. Since this error was also caused by lack of comments on this non-trivial issue, this commit also adds related comments. Fixes: 2d29c9f89fcd ("block, bfq: improve asymmetric scenarios detection") Reported-by: Steven Barrett <steven@liquorix.net> Tested-by: Steven Barrett <steven@liquorix.net> Tested-by: Lucjan Lucjanov <lucjan.lucjanov@gmail.com> Reviewed-by: Federico Motta <federico@willer.it> Signed-off-by: Paolo Valente <paolo.valente@linaro.org> Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r--block/bfq-iosched.c76
-rw-r--r--block/bfq-iosched.h51
-rw-r--r--block/bfq-wf2q.c5
3 files changed, 107 insertions, 25 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3a27d31fcda6..97337214bec4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -638,7 +638,7 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
638 bfqd->queue_weights_tree.rb_node->rb_right) 638 bfqd->queue_weights_tree.rb_node->rb_right)
639#ifdef CONFIG_BFQ_GROUP_IOSCHED 639#ifdef CONFIG_BFQ_GROUP_IOSCHED
640 ) || 640 ) ||
641 (bfqd->num_active_groups > 0 641 (bfqd->num_groups_with_pending_reqs > 0
642#endif 642#endif
643 ); 643 );
644} 644}
@@ -802,7 +802,21 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
802 */ 802 */
803 break; 803 break;
804 } 804 }
805 bfqd->num_active_groups--; 805
806 /*
807 * The decrement of num_groups_with_pending_reqs is
808 * not performed immediately upon the deactivation of
809 * entity, but it is delayed to when it also happens
810 * that the first leaf descendant bfqq of entity gets
811 * all its pending requests completed. The following
812 * instructions perform this delayed decrement, if
813 * needed. See the comments on
814 * num_groups_with_pending_reqs for details.
815 */
816 if (entity->in_groups_with_pending_reqs) {
817 entity->in_groups_with_pending_reqs = false;
818 bfqd->num_groups_with_pending_reqs--;
819 }
806 } 820 }
807} 821}
808 822
@@ -3529,27 +3543,44 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
3529 * fact, if there are active groups, then, for condition (i) 3543 * fact, if there are active groups, then, for condition (i)
3530 * to become false, it is enough that an active group contains 3544 * to become false, it is enough that an active group contains
3531 * more active processes or sub-groups than some other active 3545 * more active processes or sub-groups than some other active
3532 * group. We address this issue with the following bi-modal 3546 * group. More precisely, for condition (i) to hold because of
3533 * behavior, implemented in the function 3547 * such a group, it is not even necessary that the group is
3548 * (still) active: it is sufficient that, even if the group
3549 * has become inactive, some of its descendant processes still
3550 * have some request already dispatched but still waiting for
3551 * completion. In fact, requests have still to be guaranteed
3552 * their share of the throughput even after being
3553 * dispatched. In this respect, it is easy to show that, if a
3554 * group frequently becomes inactive while still having
3555 * in-flight requests, and if, when this happens, the group is
3556 * not considered in the calculation of whether the scenario
3557 * is asymmetric, then the group may fail to be guaranteed its
3558 * fair share of the throughput (basically because idling may
3559 * not be performed for the descendant processes of the group,
3560 * but it had to be). We address this issue with the
3561 * following bi-modal behavior, implemented in the function
3534 * bfq_symmetric_scenario(). 3562 * bfq_symmetric_scenario().
3535 * 3563 *
3536 * If there are active groups, then the scenario is tagged as 3564 * If there are groups with requests waiting for completion
3565 * (as commented above, some of these groups may even be
3566 * already inactive), then the scenario is tagged as
3537 * asymmetric, conservatively, without checking any of the 3567 * asymmetric, conservatively, without checking any of the
3538 * conditions (i) and (ii). So the device is idled for bfqq. 3568 * conditions (i) and (ii). So the device is idled for bfqq.
3539 * This behavior matches also the fact that groups are created 3569 * This behavior matches also the fact that groups are created
3540 * exactly if controlling I/O (to preserve bandwidth and 3570 * exactly if controlling I/O is a primary concern (to
3541 * latency guarantees) is a primary concern. 3571 * preserve bandwidth and latency guarantees).
3542 * 3572 *
3543 * On the opposite end, if there are no active groups, then 3573 * On the opposite end, if there are no groups with requests
3544 * only condition (i) is actually controlled, i.e., provided 3574 * waiting for completion, then only condition (i) is actually
3545 * that condition (i) holds, idling is not performed, 3575 * controlled, i.e., provided that condition (i) holds, idling
3546 * regardless of whether condition (ii) holds. In other words, 3576 * is not performed, regardless of whether condition (ii)
3547 * only if condition (i) does not hold, then idling is 3577 * holds. In other words, only if condition (i) does not hold,
3548 * allowed, and the device tends to be prevented from queueing 3578 * then idling is allowed, and the device tends to be
3549 * many requests, possibly of several processes. Since there 3579 * prevented from queueing many requests, possibly of several
3550 * are no active groups, then, to control condition (i) it is 3580 * processes. Since there are no groups with requests waiting
3551 * enough to check whether all active queues have the same 3581 * for completion, then, to control condition (i) it is enough
3552 * weight. 3582 * to check just whether all the queues with requests waiting
3583 * for completion also have the same weight.
3553 * 3584 *
3554 * Not checking condition (ii) evidently exposes bfqq to the 3585 * Not checking condition (ii) evidently exposes bfqq to the
3555 * risk of getting less throughput than its fair share. 3586 * risk of getting less throughput than its fair share.
@@ -3607,10 +3638,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
3607 * bfqq is weight-raised is checked explicitly here. More 3638 * bfqq is weight-raised is checked explicitly here. More
3608 * precisely, the compound condition below takes into account 3639 * precisely, the compound condition below takes into account
3609 * also the fact that, even if bfqq is being weight-raised, 3640 * also the fact that, even if bfqq is being weight-raised,
3610 * the scenario is still symmetric if all active queues happen 3641 * the scenario is still symmetric if all queues with requests
3611 * to be weight-raised. Actually, we should be even more 3642 * waiting for completion happen to be
3612 * precise here, and differentiate between interactive weight 3643 * weight-raised. Actually, we should be even more precise
3613 * raising and soft real-time weight raising. 3644 * here, and differentiate between interactive weight raising
3645 * and soft real-time weight raising.
3614 * 3646 *
3615 * As a side note, it is worth considering that the above 3647 * As a side note, it is worth considering that the above
3616 * device-idling countermeasures may however fail in the 3648 * device-idling countermeasures may however fail in the
@@ -5417,7 +5449,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
5417 bfqd->idle_slice_timer.function = bfq_idle_slice_timer; 5449 bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
5418 5450
5419 bfqd->queue_weights_tree = RB_ROOT; 5451 bfqd->queue_weights_tree = RB_ROOT;
5420 bfqd->num_active_groups = 0; 5452 bfqd->num_groups_with_pending_reqs = 0;
5421 5453
5422 INIT_LIST_HEAD(&bfqd->active_list); 5454 INIT_LIST_HEAD(&bfqd->active_list);
5423 INIT_LIST_HEAD(&bfqd->idle_list); 5455 INIT_LIST_HEAD(&bfqd->idle_list);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 77651d817ecd..0b02bf302de0 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -196,6 +196,9 @@ struct bfq_entity {
196 196
197 /* flag, set to request a weight, ioprio or ioprio_class change */ 197 /* flag, set to request a weight, ioprio or ioprio_class change */
198 int prio_changed; 198 int prio_changed;
199
200 /* flag, set if the entity is counted in groups_with_pending_reqs */
201 bool in_groups_with_pending_reqs;
199}; 202};
200 203
201struct bfq_group; 204struct bfq_group;
@@ -448,10 +451,54 @@ struct bfq_data {
448 * bfq_weights_tree_[add|remove] for further details). 451 * bfq_weights_tree_[add|remove] for further details).
449 */ 452 */
450 struct rb_root queue_weights_tree; 453 struct rb_root queue_weights_tree;
454
451 /* 455 /*
452 * number of groups with requests still waiting for completion 456 * Number of groups with at least one descendant process that
457 * has at least one request waiting for completion. Note that
458 * this accounts for also requests already dispatched, but not
459 * yet completed. Therefore this number of groups may differ
460 * (be larger) than the number of active groups, as a group is
461 * considered active only if its corresponding entity has
462 * descendant queues with at least one request queued. This
463 * number is used to decide whether a scenario is symmetric.
464 * For a detailed explanation see comments on the computation
465 * of the variable asymmetric_scenario in the function
466 * bfq_better_to_idle().
467 *
468 * However, it is hard to compute this number exactly, for
469 * groups with multiple descendant processes. Consider a group
470 * that is inactive, i.e., that has no descendant process with
471 * pending I/O inside BFQ queues. Then suppose that
472 * num_groups_with_pending_reqs is still accounting for this
473 * group, because the group has descendant processes with some
474 * I/O request still in flight. num_groups_with_pending_reqs
475 * should be decremented when the in-flight request of the
476 * last descendant process is finally completed (assuming that
477 * nothing else has changed for the group in the meantime, in
478 * terms of composition of the group and active/inactive state of child
479 * groups and processes). To accomplish this, an additional
480 * pending-request counter must be added to entities, and must
481 * be updated correctly. To avoid this additional field and operations,
482 * we resort to the following tradeoff between simplicity and
483 * accuracy: for an inactive group that is still counted in
484 * num_groups_with_pending_reqs, we decrement
485 * num_groups_with_pending_reqs when the first descendant
486 * process of the group remains with no request waiting for
487 * completion.
488 *
489 * Even this simpler decrement strategy requires a little
490 * carefulness: to avoid multiple decrements, we flag a group,
491 * more precisely an entity representing a group, as still
492 * counted in num_groups_with_pending_reqs when it becomes
493 * inactive. Then, when the first descendant queue of the
494 * entity remains with no request waiting for completion,
495 * num_groups_with_pending_reqs is decremented, and this flag
496 * is reset. After this flag is reset for the entity,
497 * num_groups_with_pending_reqs won't be decremented any
498 * longer in case a new descendant queue of the entity remains
499 * with no request waiting for completion.
453 */ 500 */
454 unsigned int num_active_groups; 501 unsigned int num_groups_with_pending_reqs;
455 502
456 /* 503 /*
457 * Number of bfq_queues containing requests (including the 504 * Number of bfq_queues containing requests (including the
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 4b0d5fb69160..63e0f12be7c9 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1012,7 +1012,10 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
1012 container_of(entity, struct bfq_group, entity); 1012 container_of(entity, struct bfq_group, entity);
1013 struct bfq_data *bfqd = bfqg->bfqd; 1013 struct bfq_data *bfqd = bfqg->bfqd;
1014 1014
1015 bfqd->num_active_groups++; 1015 if (!entity->in_groups_with_pending_reqs) {
1016 entity->in_groups_with_pending_reqs = true;
1017 bfqd->num_groups_with_pending_reqs++;
1018 }
1016 } 1019 }
1017#endif 1020#endif
1018 1021