diff options
| -rw-r--r-- | kernel/sched/fair.c | 143 |
1 file changed, 141 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b3c94584d947..ca469646ebe1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -6454,6 +6454,137 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) | |||
| 6454 | } | 6454 | } |
| 6455 | 6455 | ||
| 6456 | /* | 6456 | /* |
| 6457 | * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the | ||
| 6458 | * waking task. find_energy_efficient_cpu() looks for the CPU with maximum | ||
| 6459 | * spare capacity in each performance domain and uses it as a potential | ||
| 6460 | * candidate to execute the task. Then, it uses the Energy Model to figure | ||
| 6461 | * out which of the CPU candidates is the most energy-efficient. | ||
| 6462 | * | ||
| 6463 | * The rationale for this heuristic is as follows. In a performance domain, | ||
| 6464 | * all the most energy efficient CPU candidates (according to the Energy | ||
| 6465 | * Model) are those for which we'll request a low frequency. When there are | ||
| 6466 | * several CPUs for which the frequency request will be the same, we don't | ||
| 6467 | * have enough data to break the tie between them, because the Energy Model | ||
| 6468 | * only includes active power costs. With this model, if we assume that | ||
| 6469 | * frequency requests follow utilization (e.g. using schedutil), the CPU with | ||
| 6470 | * the maximum spare capacity in a performance domain is guaranteed to be among | ||
| 6471 | * the best candidates of the performance domain. | ||
| 6472 | * | ||
| 6473 | * In practice, it could be preferable from an energy standpoint to pack | ||
| 6474 | * small tasks on a CPU in order to let other CPUs go in deeper idle states, | ||
| 6475 | * but that could also hurt our chances to go cluster idle, and we have no | ||
| 6476 | * ways to tell with the current Energy Model if this is actually a good | ||
| 6477 | * idea or not. So, find_energy_efficient_cpu() basically favors | ||
| 6478 | * cluster-packing, and spreading inside a cluster. That should at least be | ||
| 6479 | * a good thing for latency, and this is consistent with the idea that most | ||
| 6480 | * of the energy savings of EAS come from the asymmetry of the system, and | ||
| 6481 | * not so much from breaking the tie between identical CPUs. That's also the | ||
| 6482 | * reason why EAS is enabled in the topology code only for systems where | ||
| 6483 | * SD_ASYM_CPUCAPACITY is set. | ||
| 6484 | * | ||
| 6485 | * NOTE: Forkees are not accepted in the energy-aware wake-up path because | ||
| 6486 | * they don't have any useful utilization data yet and it's not possible to | ||
| 6487 | * forecast their impact on energy consumption. Consequently, they will be | ||
| 6488 | * placed by find_idlest_cpu() on the least loaded CPU, which might turn out | ||
| 6489 | * to be energy-inefficient in some use-cases. The alternative would be to | ||
| 6490 | * bias new tasks towards specific types of CPUs first, or to try to infer | ||
| 6491 | * their util_avg from the parent task, but those heuristics could hurt | ||
| 6492 | * other use-cases too. So, until someone finds a better way to solve this, | ||
| 6493 | * let's keep things simple by re-using the existing slow path. | ||
| 6494 | */ | ||
| 6495 | |||
| 6496 | static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) | ||
| 6497 | { | ||
| 6498 | unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX; | ||
| 6499 | struct root_domain *rd = cpu_rq(smp_processor_id())->rd; | ||
| 6500 | int cpu, best_energy_cpu = prev_cpu; | ||
| 6501 | struct perf_domain *head, *pd; | ||
| 6502 | unsigned long cpu_cap, util; | ||
| 6503 | struct sched_domain *sd; | ||
| 6504 | |||
| 6505 | rcu_read_lock(); | ||
| 6506 | pd = rcu_dereference(rd->pd); | ||
| 6507 | if (!pd || READ_ONCE(rd->overutilized)) | ||
| 6508 | goto fail; | ||
| 6509 | head = pd; | ||
| 6510 | |||
| 6511 | /* | ||
| 6512 | * Energy-aware wake-up happens on the lowest sched_domain starting | ||
| 6513 | * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. | ||
| 6514 | */ | ||
| 6515 | sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); | ||
| 6516 | while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) | ||
| 6517 | sd = sd->parent; | ||
| 6518 | if (!sd) | ||
| 6519 | goto fail; | ||
| 6520 | |||
| 6521 | sync_entity_load_avg(&p->se); | ||
| 6522 | if (!task_util_est(p)) | ||
| 6523 | goto unlock; | ||
| 6524 | |||
| 6525 | for (; pd; pd = pd->next) { | ||
| 6526 | unsigned long cur_energy, spare_cap, max_spare_cap = 0; | ||
| 6527 | int max_spare_cap_cpu = -1; | ||
| 6528 | |||
| 6529 | for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { | ||
| 6530 | if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) | ||
| 6531 | continue; | ||
| 6532 | |||
| 6533 | /* Skip CPUs that will be overutilized. */ | ||
| 6534 | util = cpu_util_next(cpu, p, cpu); | ||
| 6535 | cpu_cap = capacity_of(cpu); | ||
| 6536 | if (cpu_cap * 1024 < util * capacity_margin) | ||
| 6537 | continue; | ||
| 6538 | |||
| 6539 | /* Always use prev_cpu as a candidate. */ | ||
| 6540 | if (cpu == prev_cpu) { | ||
| 6541 | prev_energy = compute_energy(p, prev_cpu, head); | ||
| 6542 | best_energy = min(best_energy, prev_energy); | ||
| 6543 | continue; | ||
| 6544 | } | ||
| 6545 | |||
| 6546 | /* | ||
| 6547 | * Find the CPU with the maximum spare capacity in | ||
| 6548 | * the performance domain | ||
| 6549 | */ | ||
| 6550 | spare_cap = cpu_cap - util; | ||
| 6551 | if (spare_cap > max_spare_cap) { | ||
| 6552 | max_spare_cap = spare_cap; | ||
| 6553 | max_spare_cap_cpu = cpu; | ||
| 6554 | } | ||
| 6555 | } | ||
| 6556 | |||
| 6557 | /* Evaluate the energy impact of using this CPU. */ | ||
| 6558 | if (max_spare_cap_cpu >= 0) { | ||
| 6559 | cur_energy = compute_energy(p, max_spare_cap_cpu, head); | ||
| 6560 | if (cur_energy < best_energy) { | ||
| 6561 | best_energy = cur_energy; | ||
| 6562 | best_energy_cpu = max_spare_cap_cpu; | ||
| 6563 | } | ||
| 6564 | } | ||
| 6565 | } | ||
| 6566 | unlock: | ||
| 6567 | rcu_read_unlock(); | ||
| 6568 | |||
| 6569 | /* | ||
| 6570 | * Pick the best CPU if prev_cpu cannot be used, or if it saves at | ||
| 6571 | * least 6% of the energy used by prev_cpu. | ||
| 6572 | */ | ||
| 6573 | if (prev_energy == ULONG_MAX) | ||
| 6574 | return best_energy_cpu; | ||
| 6575 | |||
| 6576 | if ((prev_energy - best_energy) > (prev_energy >> 4)) | ||
| 6577 | return best_energy_cpu; | ||
| 6578 | |||
| 6579 | return prev_cpu; | ||
| 6580 | |||
| 6581 | fail: | ||
| 6582 | rcu_read_unlock(); | ||
| 6583 | |||
| 6584 | return -1; | ||
| 6585 | } | ||
| 6586 | |||
| 6587 | /* | ||
| 6457 | * select_task_rq_fair: Select target runqueue for the waking task in domains | 6588 | * select_task_rq_fair: Select target runqueue for the waking task in domains |
| 6458 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, | 6589 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, |
| 6459 | * SD_BALANCE_FORK, or SD_BALANCE_EXEC. | 6590 | * SD_BALANCE_FORK, or SD_BALANCE_EXEC. |
| @@ -6476,8 +6607,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 6476 | 6607 | ||
| 6477 | if (sd_flag & SD_BALANCE_WAKE) { | 6608 | if (sd_flag & SD_BALANCE_WAKE) { |
| 6478 | record_wakee(p); | 6609 | record_wakee(p); |
| 6479 | want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) | 6610 | |
| 6480 | && cpumask_test_cpu(cpu, &p->cpus_allowed); | 6611 | if (static_branch_unlikely(&sched_energy_present)) { |
| 6612 | new_cpu = find_energy_efficient_cpu(p, prev_cpu); | ||
| 6613 | if (new_cpu >= 0) | ||
| 6614 | return new_cpu; | ||
| 6615 | new_cpu = prev_cpu; | ||
| 6616 | } | ||
| 6617 | |||
| 6618 | want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && | ||
| 6619 | cpumask_test_cpu(cpu, &p->cpus_allowed); | ||
| 6481 | } | 6620 | } |
| 6482 | 6621 | ||
| 6483 | rcu_read_lock(); | 6622 | rcu_read_lock(); |
