author    Linus Torvalds <torvalds@linux-foundation.org>  2016-03-24 12:42:50 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-03-24 12:42:50 -0400
commit    be53f58fa0fcd97c62a84f2eb98cff528f8b2443 (patch)
tree      8026c54554a32777130f535a0b1685cb5078210d
parent    19d6f04cd374b886b98d7b070ebf287c93bff7ac (diff)
parent    73e6aafd9ea81498d31361f01db84a0118da2d1c (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
"Misc fixes: a cgroup fix, a fair-scheduler migration accounting fix, a
cputime fix and two cpuacct cleanups"
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/cpuacct: Simplify the cpuacct code
sched/cpuacct: Rename parameter in cpuusage_write() for readability
sched/fair: Add comments to explain select_idle_sibling()
sched/fair: Fix fairness issue on migration
sched/cgroup: Fix/cleanup cgroup teardown/init
sched/cputime: Fix steal time accounting vs. CPU hotplug
-rw-r--r--  kernel/sched/core.c     36
-rw-r--r--  kernel/sched/cpuacct.c  35
-rw-r--r--  kernel/sched/cpuacct.h   4
-rw-r--r--  kernel/sched/fair.c     39
-rw-r--r--  kernel/sched/sched.h    13
5 files changed, 72 insertions, 55 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44db0fffa8be..d8465eeab8b3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5371,6 +5371,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 	case CPU_UP_PREPARE:
 		rq->calc_load_update = calc_load_update;
+		account_reset_rq(rq);
 		break;
 
 	case CPU_ONLINE:
@@ -7537,7 +7538,7 @@ void set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
-static void free_sched_group(struct task_group *tg)
+static void sched_free_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
@@ -7563,7 +7564,7 @@ struct task_group *sched_create_group(struct task_group *parent)
 	return tg;
 
 err:
-	free_sched_group(tg);
+	sched_free_group(tg);
 	return ERR_PTR(-ENOMEM);
 }
 
@@ -7583,17 +7584,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
 }
 
 /* rcu callback to free various structures associated with a task group */
-static void free_sched_group_rcu(struct rcu_head *rhp)
+static void sched_free_group_rcu(struct rcu_head *rhp)
 {
 	/* now it should be safe to free those cfs_rqs */
-	free_sched_group(container_of(rhp, struct task_group, rcu));
+	sched_free_group(container_of(rhp, struct task_group, rcu));
 }
 
-/* Destroy runqueue etc associated with a task group */
 void sched_destroy_group(struct task_group *tg)
 {
 	/* wait for possible concurrent references to cfs_rqs complete */
-	call_rcu(&tg->rcu, free_sched_group_rcu);
+	call_rcu(&tg->rcu, sched_free_group_rcu);
 }
 
 void sched_offline_group(struct task_group *tg)
@@ -8052,31 +8052,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
 
+	sched_online_group(tg, parent);
+
 	return &tg->css;
 }
 
-static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
-	struct task_group *parent = css_tg(css->parent);
 
-	if (parent)
-		sched_online_group(tg, parent);
-	return 0;
+	sched_offline_group(tg);
 }
 
 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
 
-	sched_destroy_group(tg);
-}
-
-static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
-{
-	struct task_group *tg = css_tg(css);
-
-	sched_offline_group(tg);
+	/*
+	 * Relies on the RCU grace period between css_released() and this.
+	 */
+	sched_free_group(tg);
 }
 
 static void cpu_cgroup_fork(struct task_struct *task)
@@ -8436,9 +8431,8 @@ static struct cftype cpu_files[] = {
 
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc = cpu_cgroup_css_alloc,
+	.css_released = cpu_cgroup_css_released,
 	.css_free = cpu_cgroup_css_free,
-	.css_online = cpu_cgroup_css_online,
-	.css_offline = cpu_cgroup_css_offline,
 	.fork = cpu_cgroup_fork,
 	.can_attach = cpu_cgroup_can_attach,
 	.attach = cpu_cgroup_attach,
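
[Editor's note] The core.c changes move sched_online_group() into css_alloc() and sched_offline_group() into css_released(), so the final free in css_free() can rely on the RCU grace period the cgroup core provides between css_released() and css_free() (per the patch's own comment). Below is a minimal userspace sketch, not from this patch, of the rcu_head/container_of free idiom that sched_free_group_rcu() uses; the immediate callback invocation is a stand-in for a real grace period:

/* Minimal sketch (not kernel code) of the rcu_head/container_of free
 * idiom used by sched_free_group_rcu(). call_rcu() here invokes the
 * callback immediately; the real kernel defers it past an RCU grace
 * period, which is what cpu_cgroup_css_free() relies on. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct rcu_head {
	void (*func)(struct rcu_head *head);
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct task_group {
	int id;
	struct rcu_head rcu;	/* embedded, so the callback can recover us */
};

static void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *))
{
	func(head);	/* stand-in for "after a grace period" */
}

static void sched_free_group_rcu(struct rcu_head *rhp)
{
	/* Recover the enclosing task_group from its embedded rcu_head. */
	struct task_group *tg = container_of(rhp, struct task_group, rcu);

	printf("freeing task_group %d\n", tg->id);
	free(tg);
}

int main(void)
{
	struct task_group *tg = malloc(sizeof(*tg));

	tg->id = 1;
	call_rcu(&tg->rcu, sched_free_group_rcu);
	return 0;
}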
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 2ddaebf7469a..4a811203c04a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -145,13 +145,16 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
 }
 
 static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
-			  u64 reset)
+			  u64 val)
 {
 	struct cpuacct *ca = css_ca(css);
 	int err = 0;
 	int i;
 
-	if (reset) {
+	/*
+	 * Only allow '0' here to do a reset.
+	 */
+	if (val) {
 		err = -EINVAL;
 		goto out;
 	}
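
[Editor's note] As the handler above shows, writing any non-zero value returns -EINVAL; writing "0" resets the group's usage counter. A small userspace sketch of that reset; the cgroup mount point and group name here are assumptions, adjust for your system's layout:

/* Sketch: reset a group's usage counter by writing "0" to
 * cpuacct.usage, as cpuusage_write() above permits. The path is an
 * assumed v1 cpuacct mount; it may differ on your system. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/cpuacct/mygroup/cpuacct.usage";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Any non-zero value would make the kernel return -EINVAL. */
	if (fputs("0", f) == EOF)
		perror("fputs");
	fclose(f);
	return 0;
}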
@@ -235,23 +238,10 @@ static struct cftype files[] = {
 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
 	struct cpuacct *ca;
-	int cpu;
-
-	cpu = task_cpu(tsk);
 
 	rcu_read_lock();
-
-	ca = task_ca(tsk);
-
-	while (true) {
-		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-		*cpuusage += cputime;
-
-		ca = parent_ca(ca);
-		if (!ca)
-			break;
-	}
-
+	for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
+		*this_cpu_ptr(ca->cpuusage) += cputime;
 	rcu_read_unlock();
 }
 
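
[Editor's note] The simplified loop charges the task's accounting group and every ancestor up to the root. A standalone model of that walk, with a plain parent pointer and one counter standing in for the per-CPU one:

/* Illustrative sketch (not kernel code): charge a cpuacct group and
 * all of its ancestors, mirroring the for-loop above. */
#include <stdio.h>

struct cpuacct {
	const char *name;
	unsigned long long cpuusage;	/* stand-in for the per-CPU counter */
	struct cpuacct *parent;
};

static void charge(struct cpuacct *ca, unsigned long long cputime)
{
	/* Walk from the task's group up to the root, as cpuacct_charge()
	 * does with task_ca()/parent_ca() under rcu_read_lock(). */
	for (; ca; ca = ca->parent)
		ca->cpuusage += cputime;
}

int main(void)
{
	struct cpuacct root = { "root", 0, NULL };
	struct cpuacct child = { "child", 0, &root };

	charge(&child, 1000);	/* 1000 ns of CPU time */
	printf("%s=%llu %s=%llu\n", child.name, child.cpuusage,
	       root.name, root.cpuusage);	/* child=1000 root=1000 */
	return 0;
}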
@@ -260,18 +250,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  *
  * Note: it's the caller that updates the account of the root cgroup.
  */
-void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
-	struct kernel_cpustat *kcpustat;
 	struct cpuacct *ca;
 
 	rcu_read_lock();
-	ca = task_ca(p);
-	while (ca != &root_cpuacct) {
-		kcpustat = this_cpu_ptr(ca->cpustat);
-		kcpustat->cpustat[index] += val;
-		ca = parent_ca(ca);
-	}
+	for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
+		this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
 	rcu_read_unlock();
 }
 
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ed605624a5e7..ba72807c73d4 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -1,7 +1,7 @@
 #ifdef CONFIG_CGROUP_CPUACCT
 
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
 
 #else
 
@@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 }
 
 static inline void
-cpuacct_account_field(struct task_struct *p, int index, u64 val)
+cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 46d64e4ccfde..0fe30e66aff1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3181,17 +3181,25 @@ static inline void check_schedstat_required(void)
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
+	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
+	bool curr = cfs_rq->curr == se;
+
 	/*
-	 * Update the normalized vruntime before updating min_vruntime
-	 * through calling update_curr().
+	 * If we're the current task, we must renormalise before calling
+	 * update_curr().
 	 */
-	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+	if (renorm && curr)
 		se->vruntime += cfs_rq->min_vruntime;
 
+	update_curr(cfs_rq);
+
 	/*
-	 * Update run-time statistics of the 'current'.
+	 * Otherwise, renormalise after, such that we're placed at the current
+	 * moment in time, instead of some random moment in the past.
 	 */
-	update_curr(cfs_rq);
+	if (renorm && !curr)
+		se->vruntime += cfs_rq->min_vruntime;
+
 	enqueue_entity_load_avg(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
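
[Editor's note] The ordering matters because update_curr() advances cfs_rq->min_vruntime. A toy calculation, with made-up numbers, of the fairness gap the old ordering could open for a migrated (non-current) entity:

/* Toy illustration (invented numbers, not kernel code): a migrated
 * entity is enqueued with a normalised vruntime of 5. Adding
 * min_vruntime before update_curr() advances it places the task in
 * the past, handing it extra runtime. */
#include <stdio.h>

int main(void)
{
	unsigned long long se_vruntime = 5;	/* normalised at dequeue */
	unsigned long long min_before = 100;	/* stale min_vruntime */
	unsigned long long min_after = 160;	/* after update_curr() */

	printf("old order: vruntime = %llu (placed %llu behind 'now')\n",
	       se_vruntime + min_before, min_after - min_before);
	printf("new order: vruntime = %llu (placed at 'now')\n",
	       se_vruntime + min_after);
	return 0;
}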
@@ -3207,7 +3215,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		update_stats_enqueue(cfs_rq, se);
 		check_spread(cfs_rq, se);
 	}
-	if (se != cfs_rq->curr)
+	if (!curr)
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
 
@@ -5071,7 +5079,19 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		return i;
 
 	/*
-	 * Otherwise, iterate the domains and find an elegible idle cpu.
+	 * Otherwise, iterate the domains and find an eligible idle cpu.
+	 *
+	 * A completely idle sched group at higher domains is more
+	 * desirable than an idle group at a lower level, because lower
+	 * domains have smaller groups and usually share hardware
+	 * resources which causes tasks to contend on them, e.g. x86
+	 * hyperthread siblings in the lowest domain (SMT) can contend
+	 * on the shared cpu pipeline.
+	 *
+	 * However, while we prefer idle groups at higher domains
+	 * finding an idle cpu at the lowest domain is still better than
+	 * returning 'target', which we've already established, isn't
+	 * idle.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
 	for_each_lower_domain(sd) {
@@ -5081,11 +5101,16 @@ static int select_idle_sibling(struct task_struct *p, int target)
 				  tsk_cpus_allowed(p)))
 			goto next;
 
+		/* Ensure the entire group is idle */
 		for_each_cpu(i, sched_group_cpus(sg)) {
 			if (i == target || !idle_cpu(i))
 				goto next;
 		}
 
+		/*
+		 * It doesn't matter which cpu we pick, the
+		 * whole group is idle.
+		 */
 		target = cpumask_first_and(sched_group_cpus(sg),
 				tsk_cpus_allowed(p));
 		goto done;
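
[Editor's note] A standalone model of the scan the new comments document: look for a group whose every CPU is idle and pick any CPU from it, otherwise fall back to the busy target. The group layout and idle states are hard-coded assumptions for illustration:

/* Standalone model (not kernel code) of the group scan above. Two
 * "groups" of SMT siblings stand in for sched groups; a flag replaces
 * the kernel's goto-based flow. */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

static bool cpu_idle[NR_CPUS] = { false, true, true, true };

/* Two groups of SMT siblings: {0,1} and {2,3}. */
static const int groups[2][2] = { { 0, 1 }, { 2, 3 } };

static int select_idle_group_cpu(int target)
{
	for (int g = 0; g < 2; g++) {
		bool all_idle = true;

		/* Ensure the entire group is idle, like the kernel loop. */
		for (int j = 0; j < 2; j++) {
			int i = groups[g][j];
			if (i == target || !cpu_idle[i])
				all_idle = false;
		}
		/* Any cpu will do: the whole group is idle. */
		if (all_idle)
			return groups[g][0];
	}
	return target;	/* nothing better than the busy target */
}

int main(void)
{
	printf("picked cpu %d\n", select_idle_group_cpu(0)); /* -> 2 */
	return 0;
}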
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 382848a24ed9..ec2e8d23527e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1841,3 +1841,16 @@ static inline void cpufreq_trigger_update(u64 time)
 static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
 static inline void cpufreq_trigger_update(u64 time) {}
 #endif /* CONFIG_CPU_FREQ */
+
+static inline void account_reset_rq(struct rq *rq)
+{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	rq->prev_irq_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT
+	rq->prev_steal_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	rq->prev_steal_time_rq = 0;
+#endif
+}
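
[Editor's note] account_reset_rq(), called from CPU_UP_PREPARE in the core.c hunk above, backs the "steal time accounting vs. CPU hotplug" fix in this pull: if the steal clock observed after an offline/online cycle restarts below the stale rq->prev_steal_time snapshot, an unsigned subtraction in the steal-time path plausibly wraps to an enormous bogus delta. A toy demonstration with invented values:

/* Toy demonstration (invented values, not kernel code) of why stale
 * prev_steal_time snapshots are cleared on CPU_UP_PREPARE: if the
 * steal clock restarts below the old snapshot, the unsigned delta
 * wraps around to a huge value. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t prev_steal_time = 5000000000ULL; /* stale, pre-hotplug */
	uint64_t steal_clock = 1000000ULL;	  /* restarted after online */

	/* Without the reset: unsigned underflow. */
	printf("stale delta: %" PRIu64 " ns\n", steal_clock - prev_steal_time);

	/* With account_reset_rq(): snapshot cleared first. */
	prev_steal_time = 0;
	printf("reset delta: %" PRIu64 " ns\n", steal_clock - prev_steal_time);
	return 0;
}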