Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c  | 24
-rw-r--r--  kernel/sched/fair.c  | 66
-rw-r--r--  kernel/sched/psi.c   | 71
-rw-r--r--  kernel/sched/sched.h |  4
-rw-r--r--  kernel/sched/stats.h |  8
5 files changed, 112 insertions(+), 61 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f12225f26b70..6fedf3a98581 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5738,15 +5738,10 @@ int sched_cpu_activate(unsigned int cpu)
 
 #ifdef CONFIG_SCHED_SMT
         /*
-         * The sched_smt_present static key needs to be evaluated on every
-         * hotplug event because at boot time SMT might be disabled when
-         * the number of booted CPUs is limited.
-         *
-         * If then later a sibling gets hotplugged, then the key would stay
-         * off and SMT scheduling would never be functional.
+         * When going up, increment the number of cores with SMT present.
          */
-        if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
-                static_branch_enable_cpuslocked(&sched_smt_present);
+        if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+                static_branch_inc_cpuslocked(&sched_smt_present);
 #endif
         set_cpu_active(cpu, true);
 
@@ -5790,6 +5785,14 @@ int sched_cpu_deactivate(unsigned int cpu)
          */
         synchronize_rcu_mult(call_rcu, call_rcu_sched);
 
+#ifdef CONFIG_SCHED_SMT
+        /*
+         * When going down, decrement the number of cores with SMT present.
+         */
+        if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+                static_branch_dec_cpuslocked(&sched_smt_present);
+#endif
+
         if (!sched_smp_initialized)
                 return 0;
 
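The two hunks above turn sched_smt_present from a one-way enable into a reference count: each core bumps the key when its second sibling comes online and drops it when that sibling goes away, so "SMT present" only turns off once the last core with an online sibling loses it. A minimal standalone sketch of that counting idea (plain C, not kernel code; all names below are illustrative):

#include <stdio.h>
#include <stdbool.h>

static int smt_cores;                     /* models sched_smt_present's count */

static void sibling_online(int online_siblings)
{
        if (online_siblings == 2)         /* second sibling just came up */
                smt_cores++;
}

static void sibling_offline(int online_siblings)
{
        if (online_siblings == 2)         /* about to drop back to one */
                smt_cores--;
}

static bool smt_present(void)
{
        return smt_cores > 0;
}

int main(void)
{
        sibling_online(2);                /* core 0 gains its sibling */
        sibling_online(2);                /* core 1 gains its sibling */
        sibling_offline(2);               /* core 0 loses its sibling */
        printf("SMT still present: %s\n", smt_present() ? "yes" : "no");
        sibling_offline(2);               /* core 1 loses its sibling */
        printf("SMT still present: %s\n", smt_present() ? "yes" : "no");
        return 0;
}

With a plain boolean key, the first offline would have switched SMT scheduling off for every remaining core; the counter keeps it on until the last one.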
@@ -5851,11 +5854,14 @@ void __init sched_init_smp(void)
         /*
          * There's no userspace yet to cause hotplug operations; hence all the
          * CPU masks are stable and all blatant races in the below code cannot
-         * happen.
+         * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
+         * but there won't be any contention on it.
          */
+        cpus_read_lock();
         mutex_lock(&sched_domains_mutex);
         sched_init_domains(cpu_active_mask);
         mutex_unlock(&sched_domains_mutex);
+        cpus_read_unlock();
 
         /* Move init over to a non-isolated CPU */
         if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
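The sched_init_smp() hunk wraps the domain build in cpus_read_lock()/cpus_read_unlock() purely so lockdep sees the same hotplug-lock-then-sched_domains_mutex ordering as the runtime hotplug paths; nothing can contend at this point in boot. A rough standalone model of that ordering, using a pthread rwlock as a stand-in for the hotplug lock (illustrative only, not kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER; /* models cpus_read_lock() */
static pthread_mutex_t domains_mutex = PTHREAD_MUTEX_INITIALIZER;  /* models sched_domains_mutex */

static void init_domains(void)
{
        printf("building scheduler domains\n");
}

int main(void)
{
        /* take the outer read lock first, then the mutex, like the hotplug paths do */
        pthread_rwlock_rdlock(&hotplug_lock);
        pthread_mutex_lock(&domains_mutex);
        init_domains();
        pthread_mutex_unlock(&domains_mutex);
        pthread_rwlock_unlock(&hotplug_lock);
        return 0;
}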
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ee271bb661cc..ac855b2f4774 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2400,8 +2400,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
                 local = 1;
 
         /*
-         * Retry task to preferred node migration periodically, in case it
-         * case it previously failed, or the scheduler moved us.
+         * Retry to migrate task to preferred node periodically, in case it
+         * previously failed, or the scheduler moved us.
          */
         if (time_after(jiffies, p->numa_migrate_retry)) {
                 task_numa_placement(p);
@@ -5674,11 +5674,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
         return target;
 }
 
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
 
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
 {
-        return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+        return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
 }
 
 /*
@@ -5738,7 +5738,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
                         avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 
-                        spare_cap = capacity_spare_wake(i, p);
+                        spare_cap = capacity_spare_without(i, p);
 
                         if (spare_cap > max_spare_cap)
                                 max_spare_cap = spare_cap;
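capacity_spare_without(), renamed above and used here to pick the group with the most headroom, is just the CPU's capacity minus the utilization of everything except the waking task, clamped at zero. A tiny standalone model of that arithmetic (plain C, illustrative values, not kernel code):

#include <stdio.h>

static long capacity_spare_without(long cpu_capacity, long cpu_util_without_p)
{
        long spare = cpu_capacity - cpu_util_without_p;

        return spare > 0 ? spare : 0;   /* models max_t(long, ..., 0) */
}

int main(void)
{
        printf("%ld\n", capacity_spare_without(1024, 300));    /* 724 spare */
        printf("%ld\n", capacity_spare_without(1024, 1500));   /* over-utilized: clamped to 0 */
        return 0;
}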
@@ -5889,8 +5889,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
                 return prev_cpu;
 
         /*
-         * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
-         * last_update_time.
+         * We need task's util for capacity_spare_without, sync it up to
+         * prev_cpu's last_update_time.
          */
         if (!(sd_flag & SD_BALANCE_FORK))
                 sync_entity_load_avg(&p->se);
@@ -6216,10 +6216,19 @@ static inline unsigned long cpu_util(int cpu)
 }
 
 /*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
  */
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 {
         struct cfs_rq *cfs_rq;
         unsigned int util;
@@ -6231,7 +6240,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
         cfs_rq = &cpu_rq(cpu)->cfs;
         util = READ_ONCE(cfs_rq->avg.util_avg);
 
-        /* Discount task's blocked util from CPU's util */
+        /* Discount task's util from CPU's util */
         util -= min_t(unsigned int, util, task_util(p));
 
         /*
@@ -6240,14 +6249,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
          * a) if *p is the only task sleeping on this CPU, then:
          *      cpu_util (== task_util) > util_est (== 0)
          *    and thus we return:
-         *      cpu_util_wake = (cpu_util - task_util) = 0
+         *      cpu_util_without = (cpu_util - task_util) = 0
          *
          * b) if other tasks are SLEEPING on this CPU, which is now exiting
          *    IDLE, then:
          *      cpu_util >= task_util
          *      cpu_util > util_est (== 0)
          *    and thus we discount *p's blocked utilization to return:
-         *      cpu_util_wake = (cpu_util - task_util) >= 0
+         *      cpu_util_without = (cpu_util - task_util) >= 0
          *
          * c) if other tasks are RUNNABLE on that CPU and
          *      util_est > cpu_util
@@ -6260,8 +6269,33 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
          * covered by the following code when estimated utilization is
          * enabled.
          */
-        if (sched_feat(UTIL_EST))
-                util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+        if (sched_feat(UTIL_EST)) {
+                unsigned int estimated =
+                        READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+                /*
+                 * Despite the following checks we still have a small window
+                 * for a possible race, when an execl's select_task_rq_fair()
+                 * races with LB's detach_task():
+                 *
+                 *   detach_task()
+                 *     p->on_rq = TASK_ON_RQ_MIGRATING;
+                 *     ---------------------------------- A
+                 *     deactivate_task()                      \
+                 *       dequeue_task()                        + RaceTime
+                 *         util_est_dequeue()                 /
+                 *     ---------------------------------- B
+                 *
+                 * The additional check on "current == p" it's required to
+                 * properly fix the execl regression and it helps in further
+                 * reducing the chances for the above race.
+                 */
+                if (unlikely(task_on_rq_queued(p) || current == p)) {
+                        estimated -= min_t(unsigned int, estimated,
+                                           (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+                }
+                util = max(util, estimated);
+        }
 
         /*
          * Utilization (estimated) can exceed the CPU capacity, thus let's
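The cpu_util_without() changes above discount the task from both signals: its average utilization is always subtracted (saturating at zero), while its estimated utilization is subtracted only while the task is still accounted on the CPU (queued, or currently running as in the execl case); the result is the max of the two. A standalone model of that logic (plain C; parameter names and values are illustrative, not kernel code):

#include <stdio.h>
#include <stdbool.h>

static unsigned int sat_sub(unsigned int a, unsigned int b)
{
        return a - (a < b ? a : b);     /* models util -= min_t(util, task_util) */
}

static unsigned int cpu_util_without(unsigned int util_avg, unsigned int util_est,
                                     unsigned int task_util, unsigned int task_util_est,
                                     bool task_still_accounted)
{
        unsigned int util = sat_sub(util_avg, task_util);

        if (task_still_accounted)       /* models task_on_rq_queued(p) || current == p */
                util_est = sat_sub(util_est, task_util_est);

        return util > util_est ? util : util_est;
}

int main(void)
{
        /* p is the only (sleeping) task on the CPU: both terms collapse to 0 */
        printf("%u\n", cpu_util_without(250, 0, 250, 250, true));
        /* other runnable tasks dominate through the estimated utilization: 600 */
        printf("%u\n", cpu_util_without(300, 700, 100, 100, true));
        return 0;
}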
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7cdecfc010af..fe24de3fbc93 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -136,8 +136,18 @@
 
 static int psi_bug __read_mostly;
 
-bool psi_disabled __read_mostly;
-core_param(psi_disabled, psi_disabled, bool, 0644);
+DEFINE_STATIC_KEY_FALSE(psi_disabled);
+
+#ifdef CONFIG_PSI_DEFAULT_DISABLED
+bool psi_enable;
+#else
+bool psi_enable = true;
+#endif
+static int __init setup_psi(char *str)
+{
+        return kstrtobool(str, &psi_enable) == 0;
+}
+__setup("psi=", setup_psi);
 
 /* Running averages - we need to be higher-res than loadavg */
 #define PSI_FREQ        (2*HZ+1)        /* 2 sec intervals */
@@ -169,8 +179,10 @@ static void group_init(struct psi_group *group)
 
 void __init psi_init(void)
 {
-        if (psi_disabled)
+        if (!psi_enable) {
+                static_branch_enable(&psi_disabled);
                 return;
+        }
 
         psi_period = jiffies_to_nsecs(PSI_FREQ);
         group_init(&psi_system);
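The psi.c hunks above split the knob in two: the boot-time "psi=" parameter fills in psi_enable, and psi_init() flips the internal psi_disabled static key only when that flag is off, so every hot path can branch on a runtime-patched jump instead of loading a bool. A standalone model of that inversion (plain C; the parsing only approximates kstrtobool(), not kernel code):

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

static bool psi_enable = true;          /* default; CONFIG_PSI_DEFAULT_DISABLED would flip it */
static bool psi_disabled;               /* models the static key consumed on hot paths */

static void parse_psi_param(const char *str)
{
        /* models kstrtobool() accepting the usual 0/1/on/off spellings */
        if (!strcmp(str, "0") || !strcmp(str, "n") || !strcmp(str, "off"))
                psi_enable = false;
        else if (!strcmp(str, "1") || !strcmp(str, "y") || !strcmp(str, "on"))
                psi_enable = true;
}

static void psi_init(void)
{
        if (!psi_enable)
                psi_disabled = true;    /* models static_branch_enable(&psi_disabled) */
}

int main(void)
{
        parse_psi_param("0");           /* booted with psi=0 */
        psi_init();
        printf("psi accounting %s\n", psi_disabled ? "off" : "on");
        return 0;
}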
@@ -549,7 +561,7 @@ void psi_memstall_enter(unsigned long *flags)
         struct rq_flags rf;
         struct rq *rq;
 
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
 
         *flags = current->flags & PF_MEMSTALL;
@@ -579,7 +591,7 @@ void psi_memstall_leave(unsigned long *flags)
         struct rq_flags rf;
         struct rq *rq;
 
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
 
         if (*flags)
@@ -600,7 +612,7 @@ void psi_memstall_leave(unsigned long *flags)
 #ifdef CONFIG_CGROUPS
 int psi_cgroup_alloc(struct cgroup *cgroup)
 {
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return 0;
 
         cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
@@ -612,7 +624,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
 
 void psi_cgroup_free(struct cgroup *cgroup)
 {
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
 
         cancel_delayed_work_sync(&cgroup->psi.clock_work);
@@ -633,38 +645,39 @@ void psi_cgroup_free(struct cgroup *cgroup)
  */
 void cgroup_move_task(struct task_struct *task, struct css_set *to)
 {
-        bool move_psi = !psi_disabled;
         unsigned int task_flags = 0;
         struct rq_flags rf;
         struct rq *rq;
 
-        if (move_psi) {
-                rq = task_rq_lock(task, &rf);
+        if (static_branch_likely(&psi_disabled)) {
+                /*
+                 * Lame to do this here, but the scheduler cannot be locked
+                 * from the outside, so we move cgroups from inside sched/.
+                 */
+                rcu_assign_pointer(task->cgroups, to);
+                return;
+        }
 
-                if (task_on_rq_queued(task))
-                        task_flags = TSK_RUNNING;
-                else if (task->in_iowait)
-                        task_flags = TSK_IOWAIT;
+        rq = task_rq_lock(task, &rf);
 
-                if (task->flags & PF_MEMSTALL)
-                        task_flags |= TSK_MEMSTALL;
+        if (task_on_rq_queued(task))
+                task_flags = TSK_RUNNING;
+        else if (task->in_iowait)
+                task_flags = TSK_IOWAIT;
 
-                if (task_flags)
-                        psi_task_change(task, task_flags, 0);
-        }
+        if (task->flags & PF_MEMSTALL)
+                task_flags |= TSK_MEMSTALL;
 
-        /*
-         * Lame to do this here, but the scheduler cannot be locked
-         * from the outside, so we move cgroups from inside sched/.
-         */
-        rcu_assign_pointer(task->cgroups, to);
+        if (task_flags)
+                psi_task_change(task, task_flags, 0);
 
-        if (move_psi) {
-                if (task_flags)
-                        psi_task_change(task, 0, task_flags);
+        /* See comment above */
+        rcu_assign_pointer(task->cgroups, to);
 
-                task_rq_unlock(rq, task, &rf);
-        }
+        if (task_flags)
+                psi_task_change(task, 0, task_flags);
+
+        task_rq_unlock(rq, task, &rf);
 }
 #endif /* CONFIG_CGROUPS */
 
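The rewritten cgroup_move_task() above either just switches the cgroup pointer (PSI disabled) or, under the rq lock, clears the task's PSI flags in the old group, switches the pointer, and re-sets the same flags in the new group. A toy standalone model of that clear/move/set ordering (plain C, illustrative accounting, not kernel code):

#include <stdio.h>
#include <stdbool.h>

struct group { int nr_running; };

struct task {
        struct group *grp;
        bool running;
};

static bool psi_disabled;

static void psi_task_change(struct task *t, int clear, int set)
{
        t->grp->nr_running += set - clear;      /* toy stand-in for the real accounting */
}

static void cgroup_move_task(struct task *t, struct group *to)
{
        bool running = t->running;

        if (psi_disabled) {
                t->grp = to;                    /* nothing to account, just switch groups */
                return;
        }

        if (running)
                psi_task_change(t, 1, 0);       /* leave old group's counters */
        t->grp = to;
        if (running)
                psi_task_change(t, 0, 1);       /* enter new group's counters */
}

int main(void)
{
        struct group a = { .nr_running = 1 }, b = { .nr_running = 0 };
        struct task t = { .grp = &a, .running = true };

        cgroup_move_task(&t, &b);
        printf("a=%d b=%d\n", a.nr_running, b.nr_running);     /* a=0 b=1 */
        return 0;
}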
@@ -672,7 +685,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 {
         int full;
 
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return -EOPNOTSUPP;
 
         update_stats(group);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 618577fc9aa8..4e524ab589c9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -23,6 +23,7 @@
 #include <linux/sched/prio.h>
 #include <linux/sched/rt.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/smt.h>
 #include <linux/sched/stat.h>
 #include <linux/sched/sysctl.h>
 #include <linux/sched/task.h>
@@ -936,9 +937,6 @@ static inline int cpu_of(struct rq *rq)
 
 
 #ifdef CONFIG_SCHED_SMT
-
-extern struct static_key_false sched_smt_present;
-
 extern void __update_idle_core(struct rq *rq);
 
 static inline void update_idle_core(struct rq *rq)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 4904c4677000..aa0de240fb41 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -66,7 +66,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 {
         int clear = 0, set = TSK_RUNNING;
 
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
 
         if (!wakeup || p->sched_psi_wake_requeue) {
@@ -86,7 +86,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 {
         int clear = TSK_RUNNING, set = 0;
 
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
 
         if (!sleep) {
@@ -102,7 +102,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
 
 static inline void psi_ttwu_dequeue(struct task_struct *p)
 {
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
         /*
          * Is the task being migrated during a wakeup? Make sure to
@@ -128,7 +128,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)
 
 static inline void psi_task_tick(struct rq *rq)
 {
-        if (psi_disabled)
+        if (static_branch_likely(&psi_disabled))
                 return;
 
         if (unlikely(rq->curr->flags & PF_MEMSTALL))
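All of the stats.h helpers above now share the same shape: one predicted check on the psi_disabled static key at the top, so the entire PSI bookkeeping collapses to an early return when the feature is off. A rough standalone sketch of that guard pattern (plain C; __builtin_expect only approximates what the runtime-patched static key does, and the names are illustrative):

#include <stdio.h>
#include <stdbool.h>

/* toy approximation of static_branch_likely(); the real key patches a jump */
#define branch_likely(k)        __builtin_expect(!!(k), 1)

static bool psi_disabled = true;

static void psi_enqueue(int *nr_running)
{
        if (branch_likely(psi_disabled))
                return;                 /* fast path: no accounting at all */

        (*nr_running)++;                /* toy stand-in for psi_task_change() */
}

int main(void)
{
        int nr_running = 0;

        psi_enqueue(&nr_running);
        printf("accounted tasks: %d\n", nr_running);    /* 0: PSI is off */
        return 0;
}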