Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/core.c	5
-rw-r--r--	kernel/sched/fair.c	66
-rw-r--r--	kernel/sched/psi.c	43
3 files changed, 76 insertions, 38 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f12225f26b70..091e089063be 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5851,11 +5851,14 @@ void __init sched_init_smp(void)
 	/*
 	 * There's no userspace yet to cause hotplug operations; hence all the
 	 * CPU masks are stable and all blatant races in the below code cannot
-	 * happen.
+	 * happen. The hotplug lock is nevertheless taken to satisfy lockdep,
+	 * but there won't be any contention on it.
 	 */
+	cpus_read_lock();
 	mutex_lock(&sched_domains_mutex);
 	sched_init_domains(cpu_active_mask);
 	mutex_unlock(&sched_domains_mutex);
+	cpus_read_unlock();
 
 	/* Move init over to a non-isolated CPU */
 	if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
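The hunk above establishes a lock-nesting rule rather than fixing a real race: sched_domains_mutex is now always taken with the CPU hotplug lock read-held, so lockdep sees one consistent ordering even during early init when no writer can exist. A minimal userspace sketch of the same idea, using a pthread rwlock as a stand-in for the hotplug lock and a plain mutex for sched_domains_mutex (hotplug_lock, domains_mutex and rebuild_domains() are invented for this illustration, not kernel objects):

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for cpu_hotplug_lock and sched_domains_mutex. */
static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t domains_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Always nest the mutex inside the read-held rwlock, even when no
 * writer can exist yet; a lock checker then sees a single ordering.
 */
static void rebuild_domains(const char *why)
{
	pthread_rwlock_rdlock(&hotplug_lock);
	pthread_mutex_lock(&domains_mutex);
	printf("rebuilding domains: %s\n", why);
	pthread_mutex_unlock(&domains_mutex);
	pthread_rwlock_unlock(&hotplug_lock);
}

int main(void)
{
	rebuild_domains("early init, no contention expected");
	return 0;
}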
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ee271bb661cc..ac855b2f4774 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2400,8 +2400,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 		local = 1;
 
 	/*
-	 * Retry task to preferred node migration periodically, in case it
-	 * case it previously failed, or the scheduler moved us.
+	 * Retry to migrate task to preferred node periodically, in case it
+	 * previously failed, or the scheduler moved us.
 	 */
 	if (time_after(jiffies, p->numa_migrate_retry)) {
 		task_numa_placement(p);
@@ -5674,11 +5674,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return target;
 }
 
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
 
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
 {
-	return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+	return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
 }
 
 /*
@@ -5738,7 +5738,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
 			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 
-			spare_cap = capacity_spare_wake(i, p);
+			spare_cap = capacity_spare_without(i, p);
 
 			if (spare_cap > max_spare_cap)
 				max_spare_cap = spare_cap;
@@ -5889,8 +5889,8 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 		return prev_cpu;
 
 	/*
-	 * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
-	 * last_update_time.
+	 * We need task's util for capacity_spare_without, sync it up to
+	 * prev_cpu's last_update_time.
 	 */
 	if (!(sd_flag & SD_BALANCE_FORK))
 		sync_entity_load_avg(&p->se);
@@ -6216,10 +6216,19 @@ static inline unsigned long cpu_util(int cpu)
 }
 
 /*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
  */
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq;
 	unsigned int util;
@@ -6231,7 +6240,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	cfs_rq = &cpu_rq(cpu)->cfs;
 	util = READ_ONCE(cfs_rq->avg.util_avg);
 
-	/* Discount task's blocked util from CPU's util */
+	/* Discount task's util from CPU's util */
 	util -= min_t(unsigned int, util, task_util(p));
 
 	/*
@@ -6240,14 +6249,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	 * a) if *p is the only task sleeping on this CPU, then:
 	 *      cpu_util (== task_util) > util_est (== 0)
 	 *    and thus we return:
-	 *      cpu_util_wake = (cpu_util - task_util) = 0
+	 *      cpu_util_without = (cpu_util - task_util) = 0
 	 *
 	 * b) if other tasks are SLEEPING on this CPU, which is now exiting
 	 *    IDLE, then:
 	 *      cpu_util >= task_util
 	 *      cpu_util > util_est (== 0)
 	 *    and thus we discount *p's blocked utilization to return:
-	 *      cpu_util_wake = (cpu_util - task_util) >= 0
+	 *      cpu_util_without = (cpu_util - task_util) >= 0
 	 *
 	 * c) if other tasks are RUNNABLE on that CPU and
 	 *      util_est > cpu_util
@@ -6260,8 +6269,33 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 	 * covered by the following code when estimated utilization is
 	 * enabled.
 	 */
-	if (sched_feat(UTIL_EST))
-		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+	if (sched_feat(UTIL_EST)) {
+		unsigned int estimated =
+			READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+		/*
+		 * Despite the following checks we still have a small window
+		 * for a possible race, when an execl's select_task_rq_fair()
+		 * races with LB's detach_task():
+		 *
+		 *   detach_task()
+		 *     p->on_rq = TASK_ON_RQ_MIGRATING;
+		 *     ---------------------------------- A
+		 *     deactivate_task()                   \
+		 *       dequeue_task()                     + RaceTime
+		 *         util_est_dequeue()              /
+		 *     ---------------------------------- B
+		 *
+		 * The additional check on "current == p" it's required to
+		 * properly fix the execl regression and it helps in further
+		 * reducing the chances for the above race.
+		 */
+		if (unlikely(task_on_rq_queued(p) || current == p)) {
+			estimated -= min_t(unsigned int, estimated,
+					   (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+		}
+		util = max(util, estimated);
+	}
 
 	/*
 	 * Utilization (estimated) can exceed the CPU capacity, thus let's
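Taken together, the fair.c hunks rename cpu_util_wake() to cpu_util_without() and make the estimated-utilization path discount the task as well, so that both signals exclude *p before the max is taken. A standalone sketch of the resulting arithmetic, with plain integers in place of the rq/cfs_rq structures (util_without_task(), sub_clamped() and the parameters are invented for this illustration, not kernel functions):

#include <stdio.h>

/* Clamp helpers mirroring the min_t()/max()/min() uses in the hunks. */
static unsigned int sub_clamped(unsigned int a, unsigned int b)
{
	return a - (b < a ? b : a);	/* never underflows below zero */
}

static unsigned int max_u(unsigned int a, unsigned int b) { return a > b ? a : b; }
static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

/*
 * CPU utilization with the contribution of one task removed: discount it
 * from the running average and, if the task is still counted in the
 * estimated utilization, from that too; then take the max of the two
 * signals and clamp to the CPU capacity.
 */
static unsigned int util_without_task(unsigned int cpu_util_avg,
				      unsigned int cpu_util_est,
				      unsigned int task_util,
				      unsigned int task_util_est,
				      int task_counted_in_est,
				      unsigned int capacity)
{
	unsigned int util = sub_clamped(cpu_util_avg, task_util);
	unsigned int est = cpu_util_est;

	if (task_counted_in_est)
		est = sub_clamped(est, task_util_est);

	return min_u(max_u(util, est), capacity);
}

int main(void)
{
	/* Case a) above: *p is the only (sleeping) task, result is 0. */
	printf("%u\n", util_without_task(300, 0, 300, 300, 0, 1024));
	/* Task still enqueued: both signals get discounted. */
	printf("%u\n", util_without_task(700, 650, 200, 180, 1, 1024));
	return 0;
}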
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7cdecfc010af..3d7355d7c3e3 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -633,38 +633,39 @@ void psi_cgroup_free(struct cgroup *cgroup)
  */
 void cgroup_move_task(struct task_struct *task, struct css_set *to)
 {
-	bool move_psi = !psi_disabled;
 	unsigned int task_flags = 0;
 	struct rq_flags rf;
 	struct rq *rq;
 
-	if (move_psi) {
-		rq = task_rq_lock(task, &rf);
+	if (psi_disabled) {
+		/*
+		 * Lame to do this here, but the scheduler cannot be locked
+		 * from the outside, so we move cgroups from inside sched/.
+		 */
+		rcu_assign_pointer(task->cgroups, to);
+		return;
+	}
 
-		if (task_on_rq_queued(task))
-			task_flags = TSK_RUNNING;
-		else if (task->in_iowait)
-			task_flags = TSK_IOWAIT;
+	rq = task_rq_lock(task, &rf);
 
-		if (task->flags & PF_MEMSTALL)
-			task_flags |= TSK_MEMSTALL;
+	if (task_on_rq_queued(task))
+		task_flags = TSK_RUNNING;
+	else if (task->in_iowait)
+		task_flags = TSK_IOWAIT;
 
-		if (task_flags)
-			psi_task_change(task, task_flags, 0);
-	}
+	if (task->flags & PF_MEMSTALL)
+		task_flags |= TSK_MEMSTALL;
 
-	/*
-	 * Lame to do this here, but the scheduler cannot be locked
-	 * from the outside, so we move cgroups from inside sched/.
-	 */
-	rcu_assign_pointer(task->cgroups, to);
+	if (task_flags)
+		psi_task_change(task, task_flags, 0);
 
-	if (move_psi) {
-		if (task_flags)
-			psi_task_change(task, 0, task_flags);
+	/* See comment above */
+	rcu_assign_pointer(task->cgroups, to);
 
-		task_rq_unlock(rq, task, &rf);
-	}
+	if (task_flags)
+		psi_task_change(task, 0, task_flags);
+
+	task_rq_unlock(rq, task, &rf);
 }
 #endif /* CONFIG_CGROUPS */
 
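The restructured cgroup_move_task() still performs the same three steps when PSI is enabled: clear the task's PSI state flags in its current cgroup, switch the css_set pointer, then re-set the same flags so they are accounted to the new cgroup. A toy sketch of that clear/move/set pattern, with simple per-group counters in place of the real PSI accounting (struct group, move_task() and the counters are invented stand-ins, not the psi implementation):

#include <stdio.h>

#define TSK_RUNNING	(1 << 0)
#define TSK_IOWAIT	(1 << 1)
#define TSK_MEMSTALL	(1 << 2)

/* Toy stand-in for a PSI group: one counter per tracked state. */
struct group {
	const char *name;
	int nr_running, nr_iowait, nr_memstall;
};

struct task {
	unsigned int flags;
	struct group *grp;
};

/* Rough analogue of psi_task_change(task, clear, set). */
static void account(struct group *g, unsigned int clear, unsigned int set)
{
	g->nr_running  += !!(set & TSK_RUNNING)  - !!(clear & TSK_RUNNING);
	g->nr_iowait   += !!(set & TSK_IOWAIT)   - !!(clear & TSK_IOWAIT);
	g->nr_memstall += !!(set & TSK_MEMSTALL) - !!(clear & TSK_MEMSTALL);
}

/* Clear flags in the old group, move the task, re-set them in the new one. */
static void move_task(struct task *t, struct group *to)
{
	if (t->flags)
		account(t->grp, t->flags, 0);
	t->grp = to;
	if (t->flags)
		account(t->grp, 0, t->flags);
}

int main(void)
{
	struct group a = { "a", 1, 0, 0 }, b = { "b", 0, 0, 0 };
	struct task t = { TSK_RUNNING, &a };

	move_task(&t, &b);
	printf("%s: %d running, %s: %d running\n",
	       a.name, a.nr_running, b.name, b.nr_running);
	return 0;
}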