author     Linus Torvalds <torvalds@linux-foundation.org>  2018-06-04 20:45:38 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-06-04 20:45:38 -0400
commit     f7f4e7fc6c517708738d1d1984b170e9475a130f (patch)
tree       9744eba2f74f1f19818d8a4ab8b8d65f865ddec8 /kernel/sched
parent     d9b446e294f21a9616d36a786087466da64afe0a (diff)
parent     2539fc82aa9b07d968cf9ba1ffeec3e0416ac721 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - power-aware scheduling improvements (Patrick Bellasi)
 - NUMA balancing improvements (Mel Gorman)
 - vCPU scheduling fixes (Rohit Jain)

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Update util_est before updating schedutil
  sched/cpufreq: Modify aggregate utilization to always include blocked FAIR utilization
  sched/deadline/Documentation: Add overrun signal and GRUB-PA documentation
  sched/core: Distinguish between idle_cpu() calls based on desired effect, introduce available_idle_cpu()
  sched/wait: Include <linux/wait.h> in <linux/swait.h>
  sched/numa: Stagger NUMA balancing scan periods for new threads
  sched/core: Don't schedule threads on pre-empted vCPUs
  sched/fair: Avoid calling sync_entity_load_avg() unnecessarily
  sched/fair: Rearrange select_task_rq_fair() to optimize it
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c                39
-rw-r--r--  kernel/sched/cpufreq_schedutil.c   17
-rw-r--r--  kernel/sched/fair.c               117
-rw-r--r--  kernel/sched/sched.h                6
4 files changed, 111 insertions, 68 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e27034bd954e..e9866f86f304 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2194,27 +2194,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
 
-#ifdef CONFIG_NUMA_BALANCING
-	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
-		p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
-		p->mm->numa_scan_seq = 0;
-	}
-
-	if (clone_flags & CLONE_VM)
-		p->numa_preferred_nid = current->numa_preferred_nid;
-	else
-		p->numa_preferred_nid = -1;
-
-	p->node_stamp = 0ULL;
-	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
-	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
-	p->numa_work.next = &p->numa_work;
-	p->numa_faults = NULL;
-	p->last_task_numa_placement = 0;
-	p->last_sum_exec_runtime = 0;
-
-	p->numa_group = NULL;
-#endif /* CONFIG_NUMA_BALANCING */
+	init_numa_balancing(clone_flags, p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -4050,6 +4030,23 @@ int idle_cpu(int cpu)
 }
 
 /**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+	if (!idle_cpu(cpu))
+		return 0;
+
+	if (vcpu_is_preempted(cpu))
+		return 0;
+
+	return 1;
+}
+
+/**
  * idle_task - return the idle task for a given CPU.
  * @cpu: the processor in question.
  *
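The new available_idle_cpu() helper above tightens what "idle" means for task placement: a CPU only counts if it is idle and its vCPU has not been preempted by the host. A minimal user-space sketch of that predicate follows; stub_idle_cpu() and stub_vcpu_is_preempted() are made-up stand-ins, not the kernel implementations.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in stubs, not the kernel's idle_cpu()/vcpu_is_preempted(). */
static bool stub_idle_cpu(int cpu)          { return cpu != 0; } /* pretend CPU0 is busy */
static bool stub_vcpu_is_preempted(int cpu) { return cpu == 2; } /* pretend vCPU2 lost its host CPU */

/* Same shape as the kernel helper: usable only if idle AND not preempted. */
static int stub_available_idle_cpu(int cpu)
{
	if (!stub_idle_cpu(cpu))
		return 0;

	if (stub_vcpu_is_preempted(cpu))
		return 0;

	return 1;
}

int main(void)
{
	for (int cpu = 0; cpu < 4; cpu++)
		printf("cpu%d: idle=%d available=%d\n", cpu,
		       stub_idle_cpu(cpu), stub_available_idle_cpu(cpu));
	return 0;
}

With these stubs, CPU2 reports idle=1 but available=0, which is exactly the case the fair.c call sites below now want to skip.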
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e13df951aca7..28592b62b1d5 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -183,22 +183,21 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
 static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
 	struct rq *rq = cpu_rq(sg_cpu->cpu);
-	unsigned long util;
 
-	if (rq->rt.rt_nr_running) {
-		util = sg_cpu->max;
-	} else {
-		util = sg_cpu->util_dl;
-		if (rq->cfs.h_nr_running)
-			util += sg_cpu->util_cfs;
-	}
+	if (rq->rt.rt_nr_running)
+		return sg_cpu->max;
 
 	/*
+	 * Utilization required by DEADLINE must always be granted while, for
+	 * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
+	 * gracefully reduce the frequency when no tasks show up for longer
+	 * periods of time.
+	 *
 	 * Ideally we would like to set util_dl as min/guaranteed freq and
 	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet
 	 * ready for such an interface. So, we only do the latter for now.
 	 */
-	return min(util, sg_cpu->max);
+	return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
 }
 
 static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
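The schedutil change above always feeds blocked FAIR utilization into the aggregate, so an idle CPU's frequency request decays gracefully instead of collapsing to util_dl alone. A self-contained sketch of the resulting rule, using a made-up toy_sg_cpu struct and example numbers rather than the kernel's sugov_cpu:

#include <stdio.h>

/* Illustrative stand-in for the fields sugov_aggregate_util() reads. */
struct toy_sg_cpu {
	unsigned long max;            /* CPU capacity */
	unsigned long util_dl;        /* DEADLINE utilization */
	unsigned long util_cfs;       /* FAIR utilization, including blocked */
	unsigned int  rt_nr_running;  /* runnable RT tasks on this CPU */
};

static unsigned long toy_aggregate_util(const struct toy_sg_cpu *sg)
{
	unsigned long util;

	/* RT runnable: request maximum capacity, as in the hunk above. */
	if (sg->rt_nr_running)
		return sg->max;

	/* Otherwise DL + CFS (blocked included), clamped to capacity. */
	util = sg->util_dl + sg->util_cfs;
	return util < sg->max ? util : sg->max;
}

int main(void)
{
	struct toy_sg_cpu sg = { .max = 1024, .util_dl = 100, .util_cfs = 300 };

	printf("aggregate = %lu\n", toy_aggregate_util(&sg)); /* 400 */
	sg.util_cfs = 1000;
	printf("aggregate = %lu\n", toy_aggregate_util(&sg)); /* clamped to 1024 */
	return 0;
}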
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 79f574dba096..e497c05aab7f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1139,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p)
 	return max(smin, smax);
 }
 
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+	int mm_users = 0;
+	struct mm_struct *mm = p->mm;
+
+	if (mm) {
+		mm_users = atomic_read(&mm->mm_users);
+		if (mm_users == 1) {
+			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+			mm->numa_scan_seq = 0;
+		}
+	}
+	p->node_stamp = 0;
+	p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
+	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+	p->numa_work.next = &p->numa_work;
+	p->numa_faults = NULL;
+	p->numa_group = NULL;
+	p->last_task_numa_placement = 0;
+	p->last_sum_exec_runtime = 0;
+
+	/* New address space, reset the preferred nid */
+	if (!(clone_flags & CLONE_VM)) {
+		p->numa_preferred_nid = -1;
+		return;
+	}
+
+	/*
+	 * New thread, keep existing numa_preferred_nid which should be copied
+	 * already by arch_dup_task_struct but stagger when scans start.
+	 */
+	if (mm) {
+		unsigned int delay;
+
+		delay = min_t(unsigned int, task_scan_max(current),
+			current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+		delay += 2 * TICK_NSEC;
+		p->node_stamp = delay;
+	}
+}
+
 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -5345,6 +5386,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	struct sched_entity *se = &p->se;
 
 	/*
+	 * The code below (indirectly) updates schedutil which looks at
+	 * the cfs_rq utilization to select a frequency.
+	 * Let's add the task's estimated utilization to the cfs_rq's
+	 * estimated utilization, before we update schedutil.
+	 */
+	util_est_enqueue(&rq->cfs, p);
+
+	/*
 	 * If in_iowait is set, the code below may not trigger any cpufreq
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
 	 * passed.
@@ -5385,7 +5434,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se)
 		add_nr_running(rq, 1);
 
-	util_est_enqueue(&rq->cfs, p);
 	hrtick_update(rq);
 }
 
@@ -5858,8 +5906,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 	 * a cpufreq perspective, it's better to have higher utilisation
 	 * on one CPU.
 	 */
-	if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
-		return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
+	if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+		return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
 		return this_cpu;
@@ -6102,7 +6150,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 
 	/* Traverse only the allowed CPUs */
 	for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
-		if (idle_cpu(i)) {
+		if (available_idle_cpu(i)) {
 			struct rq *rq = cpu_rq(i);
 			struct cpuidle_state *idle = idle_get_state(rq);
 			if (idle && idle->exit_latency < min_exit_latency) {
@@ -6144,6 +6192,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
 		return prev_cpu;
 
+	/*
+	 * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
+	 * last_update_time.
+	 */
+	if (!(sd_flag & SD_BALANCE_FORK))
+		sync_entity_load_avg(&p->se);
+
 	while (sd) {
 		struct sched_group *group;
 		struct sched_domain *tmp;
@@ -6224,7 +6279,7 @@ void __update_idle_core(struct rq *rq)
 		if (cpu == core)
 			continue;
 
-		if (!idle_cpu(cpu))
+		if (!available_idle_cpu(cpu))
 			goto unlock;
 	}
 
@@ -6256,7 +6311,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 
 		for_each_cpu(cpu, cpu_smt_mask(core)) {
 			cpumask_clear_cpu(cpu, cpus);
-			if (!idle_cpu(cpu))
+			if (!available_idle_cpu(cpu))
 				idle = false;
 		}
 
@@ -6285,7 +6340,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 	for_each_cpu(cpu, cpu_smt_mask(target)) {
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
+		if (available_idle_cpu(cpu))
 			return cpu;
 	}
 
@@ -6348,7 +6403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 			return -1;
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
-		if (idle_cpu(cpu))
+		if (available_idle_cpu(cpu))
 			break;
 	}
 
@@ -6368,13 +6423,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	struct sched_domain *sd;
 	int i, recent_used_cpu;
 
-	if (idle_cpu(target))
+	if (available_idle_cpu(target))
 		return target;
 
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
-	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+	if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
 		return prev;
 
 	/* Check a recently used CPU as a potential idle candidate: */
@@ -6382,7 +6437,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
-	    idle_cpu(recent_used_cpu) &&
+	    available_idle_cpu(recent_used_cpu) &&
 	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
 		/*
 		 * Replace recent_used_cpu with prev as it is a potential
@@ -6558,7 +6613,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 static int
 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 {
-	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
 	int new_cpu = prev_cpu;
 	int want_affine = 0;
@@ -6581,7 +6636,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 		 */
 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
-			affine_sd = tmp;
+			if (cpu != prev_cpu)
+				new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
+
+			sd = NULL; /* Prefer wake_affine over balance flags */
 			break;
 		}
 
@@ -6591,33 +6649,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 			break;
 	}
 
-	if (affine_sd) {
-		sd = NULL; /* Prefer wake_affine over balance flags */
-		if (cpu == prev_cpu)
-			goto pick_cpu;
-
-		new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
-	}
-
-	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
-		/*
-		 * We're going to need the task's util for capacity_spare_wake
-		 * in find_idlest_group. Sync it up to prev_cpu's
-		 * last_update_time.
-		 */
-		sync_entity_load_avg(&p->se);
-	}
-
-	if (!sd) {
-pick_cpu:
-		if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
-			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
-
-			if (want_affine)
-				current->recent_used_cpu = cpu;
-		}
-	} else {
-		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
-	}
+	if (unlikely(sd)) {
+		/* Slow path */
+		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+	} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+		/* Fast path */
+
+		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+
+		if (want_affine)
+			current->recent_used_cpu = cpu;
+	}
 	rcu_read_unlock();
 
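One piece of the fair.c changes worth a closer look is the scan staggering added in init_numa_balancing(): a new thread's first NUMA scan is deferred by roughly numa_scan_period times the number of threads already sharing the mm, capped at the maximum scan period, plus two ticks. A rough user-space illustration of that arithmetic follows; the tunable values and the point at which the cap is applied are simplifications, not the kernel's exact min_t() expression.

#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL
#define TOY_TICK_NSEC 1000000ULL /* assume 1 ms ticks (HZ=1000) for the example */

/*
 * Stagger a new thread's first scan: grow the delay with the number of
 * threads sharing the mm, cap it at the maximum scan period, then add
 * two ticks, mirroring the intent of init_numa_balancing() above.
 */
static unsigned long long toy_scan_delay_ns(unsigned int scan_period_ms,
					    unsigned int scan_max_ms,
					    unsigned int mm_users)
{
	unsigned long long delay_ms = (unsigned long long)scan_period_ms * mm_users;

	if (delay_ms > scan_max_ms)
		delay_ms = scan_max_ms;

	return delay_ms * NSEC_PER_MSEC + 2 * TOY_TICK_NSEC;
}

int main(void)
{
	/* e.g. 1000 ms initial scan delay, 60000 ms max, doubling thread counts */
	for (unsigned int users = 1; users <= 128; users *= 2)
		printf("%3u threads -> node_stamp %llu ns\n",
		       users, toy_scan_delay_ns(1000, 60000, users));
	return 0;
}

The effect is that threads of a heavily threaded process no longer all start scanning at the same time, spreading the page-table scanning cost out over time.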
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cb467c221b15..6601baf2361c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1069,6 +1069,12 @@ enum numa_faults_stats {
 extern void sched_setnuma(struct task_struct *p, int node);
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *, struct task_struct *);
+extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+#else
+static inline void
+init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_SMP
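The sched.h hunk uses a common kernel header pattern: declare init_numa_balancing() when CONFIG_NUMA_BALANCING is set, and provide an empty static inline stub otherwise, so __sched_fork() can call it unconditionally. A generic sketch of the same pattern, with a hypothetical CONFIG_FOO switch and foo_init() helper that are illustrations only:

#include <stdio.h>

/* Hypothetical feature switch; normally set by the build system (e.g. -DCONFIG_FOO). */
#ifdef CONFIG_FOO
void foo_init(int arg); /* real implementation lives in the feature's .c file */
#else
static inline void foo_init(int arg) { (void)arg; } /* stub compiles away */
#endif

int main(void)
{
	foo_init(42); /* no #ifdef needed at the call site */
	printf("foo_init() called unconditionally\n");
	return 0;
}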