author    Linus Torvalds <torvalds@linux-foundation.org>  2016-03-24 12:42:50 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-03-24 12:42:50 -0400
commit    be53f58fa0fcd97c62a84f2eb98cff528f8b2443 (patch)
tree      8026c54554a32777130f535a0b1685cb5078210d
parent    19d6f04cd374b886b98d7b070ebf287c93bff7ac (diff)
parent    73e6aafd9ea81498d31361f01db84a0118da2d1c (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
 "Misc fixes: a cgroup fix, a fair-scheduler migration accounting fix, a
  cputime fix and two cpuacct cleanups"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/cpuacct: Simplify the cpuacct code
  sched/cpuacct: Rename parameter in cpuusage_write() for readability
  sched/fair: Add comments to explain select_idle_sibling()
  sched/fair: Fix fairness issue on migration
  sched/cgroup: Fix/cleanup cgroup teardown/init
  sched/cputime: Fix steal time accounting vs. CPU hotplug
-rw-r--r--  kernel/sched/core.c     | 36
-rw-r--r--  kernel/sched/cpuacct.c  | 35
-rw-r--r--  kernel/sched/cpuacct.h  |  4
-rw-r--r--  kernel/sched/fair.c     | 39
-rw-r--r--  kernel/sched/sched.h    | 13
5 files changed, 72 insertions, 55 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44db0fffa8be..d8465eeab8b3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5371,6 +5371,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 	case CPU_UP_PREPARE:
 		rq->calc_load_update = calc_load_update;
+		account_reset_rq(rq);
 		break;
 
 	case CPU_ONLINE:
@@ -7537,7 +7538,7 @@ void set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
-static void free_sched_group(struct task_group *tg)
+static void sched_free_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
@@ -7563,7 +7564,7 @@ struct task_group *sched_create_group(struct task_group *parent)
 	return tg;
 
 err:
-	free_sched_group(tg);
+	sched_free_group(tg);
 	return ERR_PTR(-ENOMEM);
 }
 
@@ -7583,17 +7584,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
 }
 
 /* rcu callback to free various structures associated with a task group */
-static void free_sched_group_rcu(struct rcu_head *rhp)
+static void sched_free_group_rcu(struct rcu_head *rhp)
 {
 	/* now it should be safe to free those cfs_rqs */
-	free_sched_group(container_of(rhp, struct task_group, rcu));
+	sched_free_group(container_of(rhp, struct task_group, rcu));
 }
 
-/* Destroy runqueue etc associated with a task group */
 void sched_destroy_group(struct task_group *tg)
 {
 	/* wait for possible concurrent references to cfs_rqs complete */
-	call_rcu(&tg->rcu, free_sched_group_rcu);
+	call_rcu(&tg->rcu, sched_free_group_rcu);
 }
 
 void sched_offline_group(struct task_group *tg)
@@ -8052,31 +8052,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
 
+	sched_online_group(tg, parent);
+
 	return &tg->css;
 }
 
-static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
-	struct task_group *parent = css_tg(css->parent);
 
-	if (parent)
-		sched_online_group(tg, parent);
-	return 0;
+	sched_offline_group(tg);
 }
 
 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
 
-	sched_destroy_group(tg);
-}
-
-static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
-{
-	struct task_group *tg = css_tg(css);
-
-	sched_offline_group(tg);
+	/*
+	 * Relies on the RCU grace period between css_released() and this.
+	 */
+	sched_free_group(tg);
 }
 
 static void cpu_cgroup_fork(struct task_struct *task)
@@ -8436,9 +8431,8 @@ static struct cftype cpu_files[] = {
 
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc	= cpu_cgroup_css_alloc,
+	.css_released	= cpu_cgroup_css_released,
 	.css_free	= cpu_cgroup_css_free,
-	.css_online	= cpu_cgroup_css_online,
-	.css_offline	= cpu_cgroup_css_offline,
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
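The core.c changes above move the task group's online/offline handling out of css_online()/css_offline() and into css_alloc()/css_released(), with the actual freeing deferred to css_free(), which cgroup core only invokes after an RCU grace period. A trivial user-space sketch of the resulting teardown ordering; the stub names mirror the kernel callbacks but are stand-ins, not the cgroup API:

#include <stdio.h>

/* Stand-ins for the cpu cgroup callbacks after this patch (illustrative only). */
static void css_alloc(void)    { printf("alloc:    create group + sched_online_group()\n"); }
static void css_released(void) { printf("released: sched_offline_group()\n"); }
static void rcu_grace(void)    { printf("          ... RCU grace period ...\n"); }
static void css_free(void)     { printf("free:     sched_free_group(), readers are gone\n"); }

int main(void)
{
	/* Teardown now goes released -> grace period -> free, so concurrent
	 * RCU readers of the task group finish before it is freed. */
	css_alloc();
	css_released();
	rcu_grace();
	css_free();
	return 0;
}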
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 2ddaebf7469a..4a811203c04a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -145,13 +145,16 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
 }
 
 static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
-			  u64 reset)
+			  u64 val)
 {
 	struct cpuacct *ca = css_ca(css);
 	int err = 0;
 	int i;
 
-	if (reset) {
+	/*
+	 * Only allow '0' here to do a reset.
+	 */
+	if (val) {
 		err = -EINVAL;
 		goto out;
 	}
@@ -235,23 +238,10 @@ static struct cftype files[] = {
 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
 	struct cpuacct *ca;
-	int cpu;
-
-	cpu = task_cpu(tsk);
 
 	rcu_read_lock();
-
-	ca = task_ca(tsk);
-
-	while (true) {
-		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-		*cpuusage += cputime;
-
-		ca = parent_ca(ca);
-		if (!ca)
-			break;
-	}
-
+	for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
+		*this_cpu_ptr(ca->cpuusage) += cputime;
 	rcu_read_unlock();
 }
 
@@ -260,18 +250,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  *
  * Note: it's the caller that updates the account of the root cgroup.
  */
-void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
-	struct kernel_cpustat *kcpustat;
 	struct cpuacct *ca;
 
 	rcu_read_lock();
-	ca = task_ca(p);
-	while (ca != &root_cpuacct) {
-		kcpustat = this_cpu_ptr(ca->cpustat);
-		kcpustat->cpustat[index] += val;
-		ca = parent_ca(ca);
-	}
+	for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
+		this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
 	rcu_read_unlock();
 }
 
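The simplified cpuacct code above replaces the open-coded while loops with a single for loop that walks from the task's cpuacct group up through its ancestors, charging each level. A minimal user-space sketch of that parent-walk accounting pattern; acct_group and charge_hierarchy are stand-ins for illustration, not kernel APIs, and the per-CPU counters and rcu_read_lock() of the real code are left out:

#include <stdio.h>

/* Stand-in for a hierarchical accounting group (illustrative only). */
struct acct_group {
	const char *name;
	unsigned long long usage;	/* accumulated time */
	struct acct_group *parent;	/* NULL at the root */
};

/* Charge 'delta' to a group and every ancestor, mirroring the
 * for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) walk in the patch. */
static void charge_hierarchy(struct acct_group *grp, unsigned long long delta)
{
	struct acct_group *g;

	for (g = grp; g; g = g->parent)
		g->usage += delta;
}

int main(void)
{
	struct acct_group root  = { "root",  0, NULL  };
	struct acct_group child = { "child", 0, &root };

	charge_hierarchy(&child, 1000);	/* both levels see the charge */
	printf("%s=%llu %s=%llu\n", root.name, root.usage,
	       child.name, child.usage);
	return 0;
}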
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ed605624a5e7..ba72807c73d4 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -1,7 +1,7 @@
1#ifdef CONFIG_CGROUP_CPUACCT 1#ifdef CONFIG_CGROUP_CPUACCT
2 2
3extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); 3extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
4extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); 4extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
5 5
6#else 6#else
7 7
@@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 }
 
 static inline void
-cpuacct_account_field(struct task_struct *p, int index, u64 val)
+cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 46d64e4ccfde..0fe30e66aff1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3181,17 +3181,25 @@ static inline void check_schedstat_required(void)
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
+	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
+	bool curr = cfs_rq->curr == se;
+
 	/*
-	 * Update the normalized vruntime before updating min_vruntime
-	 * through calling update_curr().
+	 * If we're the current task, we must renormalise before calling
+	 * update_curr().
 	 */
-	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+	if (renorm && curr)
 		se->vruntime += cfs_rq->min_vruntime;
 
+	update_curr(cfs_rq);
+
 	/*
-	 * Update run-time statistics of the 'current'.
+	 * Otherwise, renormalise after, such that we're placed at the current
+	 * moment in time, instead of some random moment in the past.
 	 */
-	update_curr(cfs_rq);
+	if (renorm && !curr)
+		se->vruntime += cfs_rq->min_vruntime;
+
 	enqueue_entity_load_avg(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
@@ -3207,7 +3215,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		update_stats_enqueue(cfs_rq, se);
 		check_spread(cfs_rq, se);
 	}
-	if (se != cfs_rq->curr)
+	if (!curr)
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
 
@@ -5071,7 +5079,19 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		return i;
 
 	/*
-	 * Otherwise, iterate the domains and find an elegible idle cpu.
+	 * Otherwise, iterate the domains and find an eligible idle cpu.
+	 *
+	 * A completely idle sched group at higher domains is more
+	 * desirable than an idle group at a lower level, because lower
+	 * domains have smaller groups and usually share hardware
+	 * resources which causes tasks to contend on them, e.g. x86
+	 * hyperthread siblings in the lowest domain (SMT) can contend
+	 * on the shared cpu pipeline.
+	 *
+	 * However, while we prefer idle groups at higher domains
+	 * finding an idle cpu at the lowest domain is still better than
+	 * returning 'target', which we've already established, isn't
+	 * idle.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
 	for_each_lower_domain(sd) {
@@ -5081,11 +5101,16 @@ static int select_idle_sibling(struct task_struct *p, int target)
 						tsk_cpus_allowed(p)))
 				goto next;
 
+			/* Ensure the entire group is idle */
 			for_each_cpu(i, sched_group_cpus(sg)) {
 				if (i == target || !idle_cpu(i))
 					goto next;
 			}
 
+			/*
+			 * It doesn't matter which cpu we pick, the
+			 * whole group is idle.
+			 */
 			target = cpumask_first_and(sched_group_cpus(sg),
 					tsk_cpus_allowed(p));
 			goto done;
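The enqueue_entity() hunk earlier in this file splits the renormalisation of se->vruntime around update_curr(): the current task must be renormalised before update_curr() runs, while every other entity is renormalised afterwards, so that it is placed against the freshly updated min_vruntime rather than a stale one. A toy user-space sketch of that ordering decision; the structs and numbers are made up, and the renorm flag handling of the real code is omitted:

#include <stdbool.h>
#include <stdio.h>

/* Minimal stand-ins for cfs_rq/sched_entity (illustrative only). */
struct toy_rq { unsigned long long min_vruntime; };
struct toy_se { unsigned long long vruntime; };	/* stored relative while dequeued */

/* Pretend the clock moved and min_vruntime advanced while the task was away. */
static void toy_update_curr(struct toy_rq *rq)
{
	rq->min_vruntime += 500;
}

static void toy_enqueue(struct toy_rq *rq, struct toy_se *se, bool is_curr)
{
	/* The current task must be renormalised before update_curr(). */
	if (is_curr)
		se->vruntime += rq->min_vruntime;

	toy_update_curr(rq);

	/* Everyone else is placed against the *updated* min_vruntime. */
	if (!is_curr)
		se->vruntime += rq->min_vruntime;
}

int main(void)
{
	struct toy_rq rq = { .min_vruntime = 1000 };
	struct toy_se waker = { .vruntime = 10 };

	toy_enqueue(&rq, &waker, false);
	printf("placed at %llu (min_vruntime now %llu)\n",
	       waker.vruntime, rq.min_vruntime);	/* 1510, not 1010 */
	return 0;
}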
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 382848a24ed9..ec2e8d23527e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1841,3 +1841,16 @@ static inline void cpufreq_trigger_update(u64 time)
 static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
 static inline void cpufreq_trigger_update(u64 time) {}
 #endif /* CONFIG_CPU_FREQ */
+
+static inline void account_reset_rq(struct rq *rq)
+{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	rq->prev_irq_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT
+	rq->prev_steal_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	rq->prev_steal_time_rq = 0;
+#endif
+}
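The account_reset_rq() helper added above is called from the CPU_UP_PREPARE hotplug path (see the core.c hunk), so the irq/steal time snapshots do not survive a CPU going down and coming back up. If rq->prev_steal_time were left stale and larger than the next steal-clock reading, the unsigned delta in the steal accounting would wrap to an enormous value. A minimal user-space illustration of that wrap hazard; the values and the steal_delta() helper are made up, not the kernel's cputime code:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: a stale snapshot larger than the current reading
 * makes the unsigned delta wrap to a huge value. */
static uint64_t steal_delta(uint64_t now, uint64_t prev_snapshot)
{
	return now - prev_snapshot;	/* unsigned math, can wrap */
}

int main(void)
{
	uint64_t stale_prev = 5000000;	/* snapshot taken before the CPU went down */
	uint64_t now = 1200;		/* steal clock restarted after the CPU came back */

	printf("stale snapshot: delta = %" PRIu64 "\n",
	       steal_delta(now, stale_prev));	/* wraps toward 2^64 */
	printf("reset snapshot: delta = %" PRIu64 "\n",
	       steal_delta(now, 0));		/* sane: 1200 */
	return 0;
}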