3 files changed, 107 insertions, 25 deletions
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3a27d31fcda6..97337214bec4 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -638,7 +638,7 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
                 bfqd->queue_weights_tree.rb_node->rb_right)
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
               ) ||
-                (bfqd->num_active_groups > 0
+                (bfqd->num_groups_with_pending_reqs > 0
 #endif
               );
 }
@@ -802,7 +802,21 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
                         */
                        break;
                }
-                bfqd->num_active_groups--;
+                /*
+                 * The decrement of num_groups_with_pending_reqs is
+                 * not performed immediately upon the deactivation of
+                 * entity, but it is delayed to when it also happens
+                 * that the first leaf descendant bfqq of entity gets
+                 * all its pending requests completed. The following
+                 * instructions perform this delayed decrement, if
+                 * needed. See the comments on
+                 * num_groups_with_pending_reqs for details.
+                 */
+                if (entity->in_groups_with_pending_reqs) {
+                        entity->in_groups_with_pending_reqs = false;
+                        bfqd->num_groups_with_pending_reqs--;
+                }
        }
 }
@@ -3529,27 +3543,44 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
         * fact, if there are active groups, then, for condition (i)
         * to become false, it is enough that an active group contains
         * more active processes or sub-groups than some other active
-         * group. We address this issue with the following bi-modal
+         * group. More precisely, for condition (i) to hold because of
-         * behavior, implemented in the function
+         * such a group, it is not even necessary that the group is
+         * (still) active: it is sufficient that, even if the group
+         * has become inactive, some of its descendant processes still
+         * have some request already dispatched but still waiting for
+         * completion. In fact, requests have still to be guaranteed
+         * their share of the throughput even after being
+         * dispatched. In this respect, it is easy to show that, if a
+         * group frequently becomes inactive while still having
+         * in-flight requests, and if, when this happens, the group is
+         * not considered in the calculation of whether the scenario
+         * is asymmetric, then the group may fail to be guaranteed its
+         * fair share of the throughput (basically because idling may
+         * not be performed for the descendant processes of the group,
+         * but it had to be).  We address this issue with the
+         * following bi-modal behavior, implemented in the function
         * bfq_symmetric_scenario().
         *
-         * If there are active groups, then the scenario is tagged as
+         * If there are groups with requests waiting for completion
+         * (as commented above, some of these groups may even be
+         * already inactive), then the scenario is tagged as
         * asymmetric, conservatively, without checking any of the
         * conditions (i) and (ii). So the device is idled for bfqq.
         * This behavior matches also the fact that groups are created
-         * exactly if controlling I/O (to preserve bandwidth and
+         * exactly if controlling I/O is a primary concern (to
-         * latency guarantees) is a primary concern.
+         * preserve bandwidth and latency guarantees).
         *
-         * On the opposite end, if there are no active groups, then
+         * On the opposite end, if there are no groups with requests
-         * only condition (i) is actually controlled, i.e., provided
+         * waiting for completion, then only condition (i) is actually
-         * that condition (i) holds, idling is not performed,
+         * controlled, i.e., provided that condition (i) holds, idling
-         * regardless of whether condition (ii) holds. In other words,
+         * is not performed, regardless of whether condition (ii)
-         * only if condition (i) does not hold, then idling is
+         * holds. In other words, only if condition (i) does not hold,
-         * allowed, and the device tends to be prevented from queueing
+         * then idling is allowed, and the device tends to be
-         * many requests, possibly of several processes. Since there
+         * prevented from queueing many requests, possibly of several
-         * are no active groups, then, to control condition (i) it is
+         * processes. Since there are no groups with requests waiting
-         * enough to check whether all active queues have the same
+         * for completion, then, to control condition (i) it is enough
-         * weight.
+         * to check just whether all the queues with requests waiting
+         * for completion also have the same weight.
         *
         * Not checking condition (ii) evidently exposes bfqq to the
         * risk of getting less throughput than its fair share.
@@ -3607,10 +3638,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
         * bfqq is weight-raised is checked explicitly here. More
         * precisely, the compound condition below takes into account
         * also the fact that, even if bfqq is being weight-raised,
-         * the scenario is still symmetric if all active queues happen
+         * the scenario is still symmetric if all queues with requests
-         * to be weight-raised. Actually, we should be even more
+         * waiting for completion happen to be
-         * precise here, and differentiate between interactive weight
+         * weight-raised. Actually, we should be even more precise
-         * raising and soft real-time weight raising.
+         * here, and differentiate between interactive weight raising
+         * and soft real-time weight raising.
         *
         * As a side note, it is worth considering that the above
         * device-idling countermeasures may however fail in the
@@ -5417,7 +5449,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
        bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
        bfqd->queue_weights_tree = RB_ROOT;
-        bfqd->num_active_groups = 0;
+        bfqd->num_groups_with_pending_reqs = 0;
        INIT_LIST_HEAD(&bfqd->active_list);
        INIT_LIST_HEAD(&bfqd->idle_list);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 77651d817ecd..0b02bf302de0 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -196,6 +196,9 @@ struct bfq_entity {
        /* flag, set to request a weight, ioprio or ioprio_class change  */
        int prio_changed;
+        /* flag, set if the entity is counted in groups_with_pending_reqs */
+        bool in_groups_with_pending_reqs;
 };
 struct bfq_group;
@@ -448,10 +451,54 @@ struct bfq_data {
         * bfq_weights_tree_[add|remove] for further details).
         */
        struct rb_root queue_weights_tree;
        /*
-         * number of groups with requests still waiting for completion
+         * Number of groups with at least one descendant process that
+         * has at least one request waiting for completion. Note that
+         * this accounts for also requests already dispatched, but not
+         * yet completed. Therefore this number of groups may differ
+         * (be larger) than the number of active groups, as a group is
+         * considered active only if its corresponding entity has
+         * descendant queues with at least one request queued. This
+         * number is used to decide whether a scenario is symmetric.
+         * For a detailed explanation see comments on the computation
+         * of the variable asymmetric_scenario in the function
+         * bfq_better_to_idle().
+         *
+         * However, it is hard to compute this number exactly, for
+         * groups with multiple descendant processes. Consider a group
+         * that is inactive, i.e., that has no descendant process with
+         * pending I/O inside BFQ queues. Then suppose that
+         * num_groups_with_pending_reqs is still accounting for this
+         * group, because the group has descendant processes with some
+         * I/O request still in flight. num_groups_with_pending_reqs
+         * should be decremented when the in-flight request of the
+         * last descendant process is finally completed (assuming that
+         * nothing else has changed for the group in the meantime, in
+         * terms of composition of the group and active/inactive state of child
+         * groups and processes). To accomplish this, an additional
+         * pending-request counter must be added to entities, and must
+         * be updated correctly. To avoid this additional field and operations,
+         * we resort to the following tradeoff between simplicity and
+         * accuracy: for an inactive group that is still counted in
+         * num_groups_with_pending_reqs, we decrement
+         * num_groups_with_pending_reqs when the first descendant
+         * process of the group remains with no request waiting for
+         * completion.
+         *
+         * Even this simpler decrement strategy requires a little
+         * carefulness: to avoid multiple decrements, we flag a group,
+         * more precisely an entity representing a group, as still
+         * counted in num_groups_with_pending_reqs when it becomes
+         * inactive. Then, when the first descendant queue of the
+         * entity remains with no request waiting for completion,
+         * num_groups_with_pending_reqs is decremented, and this flag
+         * is reset. After this flag is reset for the entity,
+         * num_groups_with_pending_reqs won't be decremented any
+         * longer in case a new descendant queue of the entity remains
+         * with no request waiting for completion.
         */
-        unsigned int num_active_groups;
+        unsigned int num_groups_with_pending_reqs;
        /*
         * Number of bfq_queues containing requests (including the
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 4b0d5fb69160..63e0f12be7c9 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1012,7 +1012,10 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
                        container_of(entity, struct bfq_group, entity);
                struct bfq_data *bfqd = bfqg->bfqd;
-                bfqd->num_active_groups++;
+                if (!entity->in_groups_with_pending_reqs) {
+                        entity->in_groups_with_pending_reqs = true;
+                        bfqd->num_groups_with_pending_reqs++;
+                }
        }
 #endif

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3a27d31fcda6..97337214bec4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c
@@ -638,7 +638,7 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd)
638	bfqd->queue_weights_tree.rb_node->rb_right)	638	bfqd->queue_weights_tree.rb_node->rb_right)
639	#ifdef CONFIG_BFQ_GROUP_IOSCHED	639	#ifdef CONFIG_BFQ_GROUP_IOSCHED
640	) \|\|	640	) \|\|
641	(bfqd->num_active_groups > 0	641	(bfqd->num_groups_with_pending_reqs > 0
642	#endif	642	#endif
643	);	643	);
644	}	644	}
@@ -802,7 +802,21 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd,
802	*/	802	*/
803	break;	803	break;
804	}	804	}
805	bfqd->num_active_groups--;	805
		806	/*
		807	* The decrement of num_groups_with_pending_reqs is
		808	* not performed immediately upon the deactivation of
		809	* entity, but it is delayed to when it also happens
		810	* that the first leaf descendant bfqq of entity gets
		811	* all its pending requests completed. The following
		812	* instructions perform this delayed decrement, if
		813	* needed. See the comments on
		814	* num_groups_with_pending_reqs for details.
		815	*/
		816	if (entity->in_groups_with_pending_reqs) {
		817	entity->in_groups_with_pending_reqs = false;
		818	bfqd->num_groups_with_pending_reqs--;
		819	}
806	}	820	}
807	}	821	}
808		822
@@ -3529,27 +3543,44 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
3529	* fact, if there are active groups, then, for condition (i)	3543	* fact, if there are active groups, then, for condition (i)
3530	* to become false, it is enough that an active group contains	3544	* to become false, it is enough that an active group contains
3531	* more active processes or sub-groups than some other active	3545	* more active processes or sub-groups than some other active
3532	* group. We address this issue with the following bi-modal	3546	* group. More precisely, for condition (i) to hold because of
3533	* behavior, implemented in the function	3547	* such a group, it is not even necessary that the group is
		3548	* (still) active: it is sufficient that, even if the group
		3549	* has become inactive, some of its descendant processes still
		3550	* have some request already dispatched but still waiting for
		3551	* completion. In fact, requests have still to be guaranteed
		3552	* their share of the throughput even after being
		3553	* dispatched. In this respect, it is easy to show that, if a
		3554	* group frequently becomes inactive while still having
		3555	* in-flight requests, and if, when this happens, the group is
		3556	* not considered in the calculation of whether the scenario
		3557	* is asymmetric, then the group may fail to be guaranteed its
		3558	* fair share of the throughput (basically because idling may
		3559	* not be performed for the descendant processes of the group,
		3560	* but it had to be). We address this issue with the
		3561	* following bi-modal behavior, implemented in the function
3534	* bfq_symmetric_scenario().	3562	* bfq_symmetric_scenario().
3535	*	3563	*
3536	* If there are active groups, then the scenario is tagged as	3564	* If there are groups with requests waiting for completion
		3565	* (as commented above, some of these groups may even be
		3566	* already inactive), then the scenario is tagged as
3537	* asymmetric, conservatively, without checking any of the	3567	* asymmetric, conservatively, without checking any of the
3538	* conditions (i) and (ii). So the device is idled for bfqq.	3568	* conditions (i) and (ii). So the device is idled for bfqq.
3539	* This behavior matches also the fact that groups are created	3569	* This behavior matches also the fact that groups are created
3540	* exactly if controlling I/O (to preserve bandwidth and	3570	* exactly if controlling I/O is a primary concern (to
3541	* latency guarantees) is a primary concern.	3571	* preserve bandwidth and latency guarantees).
3542	*	3572	*
3543	* On the opposite end, if there are no active groups, then	3573	* On the opposite end, if there are no groups with requests
3544	* only condition (i) is actually controlled, i.e., provided	3574	* waiting for completion, then only condition (i) is actually
3545	* that condition (i) holds, idling is not performed,	3575	* controlled, i.e., provided that condition (i) holds, idling
3546	* regardless of whether condition (ii) holds. In other words,	3576	* is not performed, regardless of whether condition (ii)
3547	* only if condition (i) does not hold, then idling is	3577	* holds. In other words, only if condition (i) does not hold,
3548	* allowed, and the device tends to be prevented from queueing	3578	* then idling is allowed, and the device tends to be
3549	* many requests, possibly of several processes. Since there	3579	* prevented from queueing many requests, possibly of several
3550	* are no active groups, then, to control condition (i) it is	3580	* processes. Since there are no groups with requests waiting
3551	* enough to check whether all active queues have the same	3581	* for completion, then, to control condition (i) it is enough
3552	* weight.	3582	* to check just whether all the queues with requests waiting
		3583	* for completion also have the same weight.
3553	*	3584	*
3554	* Not checking condition (ii) evidently exposes bfqq to the	3585	* Not checking condition (ii) evidently exposes bfqq to the
3555	* risk of getting less throughput than its fair share.	3586	* risk of getting less throughput than its fair share.
@@ -3607,10 +3638,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
3607	* bfqq is weight-raised is checked explicitly here. More	3638	* bfqq is weight-raised is checked explicitly here. More
3608	* precisely, the compound condition below takes into account	3639	* precisely, the compound condition below takes into account
3609	* also the fact that, even if bfqq is being weight-raised,	3640	* also the fact that, even if bfqq is being weight-raised,
3610	* the scenario is still symmetric if all active queues happen	3641	* the scenario is still symmetric if all queues with requests
3611	* to be weight-raised. Actually, we should be even more	3642	* waiting for completion happen to be
3612	* precise here, and differentiate between interactive weight	3643	* weight-raised. Actually, we should be even more precise
3613	* raising and soft real-time weight raising.	3644	* here, and differentiate between interactive weight raising
		3645	* and soft real-time weight raising.
3614	*	3646	*
3615	* As a side note, it is worth considering that the above	3647	* As a side note, it is worth considering that the above
3616	* device-idling countermeasures may however fail in the	3648	* device-idling countermeasures may however fail in the
@@ -5417,7 +5449,7 @@ static int bfq_init_queue(struct request_queue q, struct elevator_type e)
5417	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;	5449	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
5418		5450
5419	bfqd->queue_weights_tree = RB_ROOT;	5451	bfqd->queue_weights_tree = RB_ROOT;
5420	bfqd->num_active_groups = 0;	5452	bfqd->num_groups_with_pending_reqs = 0;
5421		5453
5422	INIT_LIST_HEAD(&bfqd->active_list);	5454	INIT_LIST_HEAD(&bfqd->active_list);
5423	INIT_LIST_HEAD(&bfqd->idle_list);	5455	INIT_LIST_HEAD(&bfqd->idle_list);


diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 77651d817ecd..0b02bf302de0 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h
@@ -196,6 +196,9 @@ struct bfq_entity {
196		196
197	/* flag, set to request a weight, ioprio or ioprio_class change */	197	/* flag, set to request a weight, ioprio or ioprio_class change */
198	int prio_changed;	198	int prio_changed;
		199
		200	/* flag, set if the entity is counted in groups_with_pending_reqs */
		201	bool in_groups_with_pending_reqs;
199	};	202	};
200		203
201	struct bfq_group;	204	struct bfq_group;
@@ -448,10 +451,54 @@ struct bfq_data {
448	* bfq_weights_tree_[add\|remove] for further details).	451	* bfq_weights_tree_[add\|remove] for further details).
449	*/	452	*/
450	struct rb_root queue_weights_tree;	453	struct rb_root queue_weights_tree;
		454
451	/*	455	/*
452	* number of groups with requests still waiting for completion	456	* Number of groups with at least one descendant process that
		457	* has at least one request waiting for completion. Note that
		458	* this accounts for also requests already dispatched, but not
		459	* yet completed. Therefore this number of groups may differ
		460	* (be larger) than the number of active groups, as a group is
		461	* considered active only if its corresponding entity has
		462	* descendant queues with at least one request queued. This
		463	* number is used to decide whether a scenario is symmetric.
		464	* For a detailed explanation see comments on the computation
		465	* of the variable asymmetric_scenario in the function
		466	* bfq_better_to_idle().
		467	*
		468	* However, it is hard to compute this number exactly, for
		469	* groups with multiple descendant processes. Consider a group
		470	* that is inactive, i.e., that has no descendant process with
		471	* pending I/O inside BFQ queues. Then suppose that
		472	* num_groups_with_pending_reqs is still accounting for this
		473	* group, because the group has descendant processes with some
		474	* I/O request still in flight. num_groups_with_pending_reqs
		475	* should be decremented when the in-flight request of the
		476	* last descendant process is finally completed (assuming that
		477	* nothing else has changed for the group in the meantime, in
		478	* terms of composition of the group and active/inactive state of child
		479	* groups and processes). To accomplish this, an additional
		480	* pending-request counter must be added to entities, and must
		481	* be updated correctly. To avoid this additional field and operations,
		482	* we resort to the following tradeoff between simplicity and
		483	* accuracy: for an inactive group that is still counted in
		484	* num_groups_with_pending_reqs, we decrement
		485	* num_groups_with_pending_reqs when the first descendant
		486	* process of the group remains with no request waiting for
		487	* completion.
		488	*
		489	* Even this simpler decrement strategy requires a little
		490	* carefulness: to avoid multiple decrements, we flag a group,
		491	* more precisely an entity representing a group, as still
		492	* counted in num_groups_with_pending_reqs when it becomes
		493	* inactive. Then, when the first descendant queue of the
		494	* entity remains with no request waiting for completion,
		495	* num_groups_with_pending_reqs is decremented, and this flag
		496	* is reset. After this flag is reset for the entity,
		497	* num_groups_with_pending_reqs won't be decremented any
		498	* longer in case a new descendant queue of the entity remains
		499	* with no request waiting for completion.
453	*/	500	*/
454	unsigned int num_active_groups;	501	unsigned int num_groups_with_pending_reqs;
455		502
456	/*	503	/*
457	* Number of bfq_queues containing requests (including the	504	* Number of bfq_queues containing requests (including the


diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 4b0d5fb69160..63e0f12be7c9 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c
@@ -1012,7 +1012,10 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
1012	container_of(entity, struct bfq_group, entity);	1012	container_of(entity, struct bfq_group, entity);
1013	struct bfq_data *bfqd = bfqg->bfqd;	1013	struct bfq_data *bfqd = bfqg->bfqd;
1014		1014
1015	bfqd->num_active_groups++;	1015	if (!entity->in_groups_with_pending_reqs) {
		1016	entity->in_groups_with_pending_reqs = true;
		1017	bfqd->num_groups_with_pending_reqs++;
		1018	}
1016	}	1019	}
1017	#endif	1020	#endif
1018		1021