author    Linus Torvalds <torvalds@linux-foundation.org>  2016-03-24 12:42:50 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-03-24 12:42:50 -0400
commit    be53f58fa0fcd97c62a84f2eb98cff528f8b2443 (patch)
tree      8026c54554a32777130f535a0b1685cb5078210d
parent    19d6f04cd374b886b98d7b070ebf287c93bff7ac (diff)
parent    73e6aafd9ea81498d31361f01db84a0118da2d1c (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
"Misc fixes: a cgroup fix, a fair-scheduler migration accounting fix, a
cputime fix and two cpuacct cleanups"
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/cpuacct: Simplify the cpuacct code
sched/cpuacct: Rename parameter in cpuusage_write() for readability
sched/fair: Add comments to explain select_idle_sibling()
sched/fair: Fix fairness issue on migration
sched/cgroup: Fix/cleanup cgroup teardown/init
sched/cputime: Fix steal time accounting vs. CPU hotplug
-rw-r--r--  kernel/sched/core.c     36
-rw-r--r--  kernel/sched/cpuacct.c  35
-rw-r--r--  kernel/sched/cpuacct.h   4
-rw-r--r--  kernel/sched/fair.c     39
-rw-r--r--  kernel/sched/sched.h    13
5 files changed, 72 insertions, 55 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44db0fffa8be..d8465eeab8b3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5371,6 +5371,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 	case CPU_UP_PREPARE:
 		rq->calc_load_update = calc_load_update;
+		account_reset_rq(rq);
 		break;
 
 	case CPU_ONLINE:
@@ -7537,7 +7538,7 @@ void set_curr_task(int cpu, struct task_struct *p)
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
-static void free_sched_group(struct task_group *tg)
+static void sched_free_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
@@ -7563,7 +7564,7 @@ struct task_group *sched_create_group(struct task_group *parent)
 	return tg;
 
 err:
-	free_sched_group(tg);
+	sched_free_group(tg);
 	return ERR_PTR(-ENOMEM);
 }
 
@@ -7583,17 +7584,16 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
 }
 
 /* rcu callback to free various structures associated with a task group */
-static void free_sched_group_rcu(struct rcu_head *rhp)
+static void sched_free_group_rcu(struct rcu_head *rhp)
 {
 	/* now it should be safe to free those cfs_rqs */
-	free_sched_group(container_of(rhp, struct task_group, rcu));
+	sched_free_group(container_of(rhp, struct task_group, rcu));
 }
 
-/* Destroy runqueue etc associated with a task group */
 void sched_destroy_group(struct task_group *tg)
 {
 	/* wait for possible concurrent references to cfs_rqs complete */
-	call_rcu(&tg->rcu, free_sched_group_rcu);
+	call_rcu(&tg->rcu, sched_free_group_rcu);
 }
 
 void sched_offline_group(struct task_group *tg)
@@ -8052,31 +8052,26 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
 
+	sched_online_group(tg, parent);
+
 	return &tg->css;
 }
 
-static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
-	struct task_group *parent = css_tg(css->parent);
 
-	if (parent)
-		sched_online_group(tg, parent);
-	return 0;
+	sched_offline_group(tg);
 }
 
 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
 
-	sched_destroy_group(tg);
-}
-
-static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
-{
-	struct task_group *tg = css_tg(css);
-
-	sched_offline_group(tg);
+	/*
+	 * Relies on the RCU grace period between css_released() and this.
+	 */
+	sched_free_group(tg);
 }
 
 static void cpu_cgroup_fork(struct task_struct *task)
@@ -8436,9 +8431,8 @@ static struct cftype cpu_files[] = {
 
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc = cpu_cgroup_css_alloc,
+	.css_released = cpu_cgroup_css_released,
 	.css_free = cpu_cgroup_css_free,
-	.css_online = cpu_cgroup_css_online,
-	.css_offline = cpu_cgroup_css_offline,
 	.fork = cpu_cgroup_fork,
 	.can_attach = cpu_cgroup_can_attach,
 	.attach = cpu_cgroup_attach,
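
[Editor's note] The core.c changes move sched_online_group() into css_alloc() and sched_offline_group() into css_released(), so the final free in css_free() can rely on the RCU grace period the cgroup core provides between css_released() and css_free() (per the patch's own comment). Below is a minimal userspace sketch, not from this patch, of the rcu_head/container_of free idiom that sched_free_group_rcu() uses; the immediate callback invocation is a stand-in for a real grace period:

/* Minimal sketch (not kernel code) of the rcu_head/container_of free
 * idiom used by sched_free_group_rcu(). call_rcu() here invokes the
 * callback immediately; the real kernel defers it past an RCU grace
 * period, which is what cpu_cgroup_css_free() relies on. */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct rcu_head {
	void (*func)(struct rcu_head *head);
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct task_group {
	int id;
	struct rcu_head rcu;	/* embedded, so the callback can recover us */
};

static void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *))
{
	func(head);	/* stand-in for "after a grace period" */
}

static void sched_free_group_rcu(struct rcu_head *rhp)
{
	/* Recover the enclosing task_group from its embedded rcu_head. */
	struct task_group *tg = container_of(rhp, struct task_group, rcu);

	printf("freeing task_group %d\n", tg->id);
	free(tg);
}

int main(void)
{
	struct task_group *tg = malloc(sizeof(*tg));

	tg->id = 1;
	call_rcu(&tg->rcu, sched_free_group_rcu);
	return 0;
}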
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 2ddaebf7469a..4a811203c04a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -145,13 +145,16 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
 }
 
 static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
-			  u64 reset)
+			  u64 val)
 {
 	struct cpuacct *ca = css_ca(css);
 	int err = 0;
 	int i;
 
-	if (reset) {
+	/*
+	 * Only allow '0' here to do a reset.
+	 */
+	if (val) {
 		err = -EINVAL;
 		goto out;
 	}
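
[Editor's note] As the handler above shows, writing any non-zero value returns -EINVAL; writing "0" resets the group's usage counter. A small userspace sketch of that reset; the cgroup mount point and group name here are assumptions, adjust for your system's layout:

/* Sketch: reset a group's usage counter by writing "0" to
 * cpuacct.usage, as cpuusage_write() above permits. The path is an
 * assumed v1 cpuacct mount; it may differ on your system. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/cpuacct/mygroup/cpuacct.usage";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Any non-zero value would make the kernel return -EINVAL. */
	if (fputs("0", f) == EOF)
		perror("fputs");
	fclose(f);
	return 0;
}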
@@ -235,23 +238,10 @@ static struct cftype files[] = {
 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
 	struct cpuacct *ca;
-	int cpu;
-
-	cpu = task_cpu(tsk);
 
 	rcu_read_lock();
-
-	ca = task_ca(tsk);
-
-	while (true) {
-		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-		*cpuusage += cputime;
-
-		ca = parent_ca(ca);
-		if (!ca)
-			break;
-	}
-
+	for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
+		*this_cpu_ptr(ca->cpuusage) += cputime;
 	rcu_read_unlock();
 }
 
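
[Editor's note] The simplified loop charges the task's accounting group and every ancestor up to the root. A standalone model of that walk, with a plain parent pointer and one counter standing in for the per-CPU one:

/* Illustrative sketch (not kernel code): charge a cpuacct group and
 * all of its ancestors, mirroring the for-loop above. */
#include <stdio.h>

struct cpuacct {
	const char *name;
	unsigned long long cpuusage;	/* stand-in for the per-CPU counter */
	struct cpuacct *parent;
};

static void charge(struct cpuacct *ca, unsigned long long cputime)
{
	/* Walk from the task's group up to the root, as cpuacct_charge()
	 * does with task_ca()/parent_ca() under rcu_read_lock(). */
	for (; ca; ca = ca->parent)
		ca->cpuusage += cputime;
}

int main(void)
{
	struct cpuacct root = { "root", 0, NULL };
	struct cpuacct child = { "child", 0, &root };

	charge(&child, 1000);	/* 1000 ns of CPU time */
	printf("%s=%llu %s=%llu\n", child.name, child.cpuusage,
	       root.name, root.cpuusage);	/* child=1000 root=1000 */
	return 0;
}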
@@ -260,18 +250,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  *
  * Note: it's the caller that updates the account of the root cgroup.
  */
-void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
-	struct kernel_cpustat *kcpustat;
 	struct cpuacct *ca;
 
 	rcu_read_lock();
-	ca = task_ca(p);
-	while (ca != &root_cpuacct) {
-		kcpustat = this_cpu_ptr(ca->cpustat);
-		kcpustat->cpustat[index] += val;
-		ca = parent_ca(ca);
-	}
+	for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
+		this_cpu_ptr(ca->cpustat)->cpustat[index] += val;
 	rcu_read_unlock();
 }
 
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ed605624a5e7..ba72807c73d4 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -1,7 +1,7 @@
 #ifdef CONFIG_CGROUP_CPUACCT
 
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
 
 #else
 
@@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 }
 
 static inline void
-cpuacct_account_field(struct task_struct *p, int index, u64 val)
+cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 46d64e4ccfde..0fe30e66aff1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3181,17 +3181,25 @@ static inline void check_schedstat_required(void)
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
+	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
+	bool curr = cfs_rq->curr == se;
+
 	/*
-	 * Update the normalized vruntime before updating min_vruntime
-	 * through calling update_curr().
+	 * If we're the current task, we must renormalise before calling
+	 * update_curr().
 	 */
-	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+	if (renorm && curr)
 		se->vruntime += cfs_rq->min_vruntime;
 
+	update_curr(cfs_rq);
+
 	/*
-	 * Update run-time statistics of the 'current'.
+	 * Otherwise, renormalise after, such that we're placed at the current
+	 * moment in time, instead of some random moment in the past.
 	 */
-	update_curr(cfs_rq);
+	if (renorm && !curr)
+		se->vruntime += cfs_rq->min_vruntime;
+
 	enqueue_entity_load_avg(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
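
[Editor's note] The ordering matters because update_curr() advances cfs_rq->min_vruntime. A toy calculation, with made-up numbers, of the fairness gap the old ordering could open for a migrated (non-current) entity:

/* Toy illustration (invented numbers, not kernel code): a migrated
 * entity is enqueued with a normalised vruntime of 5. Adding
 * min_vruntime before update_curr() advances it places the task in
 * the past, handing it extra runtime. */
#include <stdio.h>

int main(void)
{
	unsigned long long se_vruntime = 5;	/* normalised at dequeue */
	unsigned long long min_before = 100;	/* stale min_vruntime */
	unsigned long long min_after = 160;	/* after update_curr() */

	printf("old order: vruntime = %llu (placed %llu behind 'now')\n",
	       se_vruntime + min_before, min_after - min_before);
	printf("new order: vruntime = %llu (placed at 'now')\n",
	       se_vruntime + min_after);
	return 0;
}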
@@ -3207,7 +3215,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		update_stats_enqueue(cfs_rq, se);
 		check_spread(cfs_rq, se);
 	}
-	if (se != cfs_rq->curr)
+	if (!curr)
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
 
@@ -5071,7 +5079,19 @@ static int select_idle_sibling(struct task_struct *p, int target)
 		return i;
 
 	/*
-	 * Otherwise, iterate the domains and find an elegible idle cpu.
+	 * Otherwise, iterate the domains and find an eligible idle cpu.
+	 *
+	 * A completely idle sched group at higher domains is more
+	 * desirable than an idle group at a lower level, because lower
+	 * domains have smaller groups and usually share hardware
+	 * resources which causes tasks to contend on them, e.g. x86
+	 * hyperthread siblings in the lowest domain (SMT) can contend
+	 * on the shared cpu pipeline.
+	 *
+	 * However, while we prefer idle groups at higher domains
+	 * finding an idle cpu at the lowest domain is still better than
+	 * returning 'target', which we've already established, isn't
+	 * idle.
 	 */
 	sd = rcu_dereference(per_cpu(sd_llc, target));
 	for_each_lower_domain(sd) {
@@ -5081,11 +5101,16 @@ static int select_idle_sibling(struct task_struct *p, int target)
 				  tsk_cpus_allowed(p)))
 			goto next;
 
+		/* Ensure the entire group is idle */
 		for_each_cpu(i, sched_group_cpus(sg)) {
 			if (i == target || !idle_cpu(i))
 				goto next;
 		}
 
+		/*
+		 * It doesn't matter which cpu we pick, the
+		 * whole group is idle.
+		 */
 		target = cpumask_first_and(sched_group_cpus(sg),
 				tsk_cpus_allowed(p));
 		goto done;
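
[Editor's note] A standalone model of the scan the new comments document: look for a group whose every CPU is idle and pick any CPU from it, otherwise fall back to the busy target. The group layout and idle states are hard-coded assumptions for illustration:

/* Standalone model (not kernel code) of the group scan above. Two
 * "groups" of SMT siblings stand in for sched groups; a flag replaces
 * the kernel's goto-based flow. */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

static bool cpu_idle[NR_CPUS] = { false, true, true, true };

/* Two groups of SMT siblings: {0,1} and {2,3}. */
static const int groups[2][2] = { { 0, 1 }, { 2, 3 } };

static int select_idle_group_cpu(int target)
{
	for (int g = 0; g < 2; g++) {
		bool all_idle = true;

		/* Ensure the entire group is idle, like the kernel loop. */
		for (int j = 0; j < 2; j++) {
			int i = groups[g][j];
			if (i == target || !cpu_idle[i])
				all_idle = false;
		}
		/* Any cpu will do: the whole group is idle. */
		if (all_idle)
			return groups[g][0];
	}
	return target;	/* nothing better than the busy target */
}

int main(void)
{
	printf("picked cpu %d\n", select_idle_group_cpu(0)); /* -> 2 */
	return 0;
}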
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 382848a24ed9..ec2e8d23527e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1841,3 +1841,16 @@ static inline void cpufreq_trigger_update(u64 time)
 static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
 static inline void cpufreq_trigger_update(u64 time) {}
 #endif /* CONFIG_CPU_FREQ */
+
+static inline void account_reset_rq(struct rq *rq)
+{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	rq->prev_irq_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT
+	rq->prev_steal_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	rq->prev_steal_time_rq = 0;
+#endif
+}
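
[Editor's note] account_reset_rq(), called from CPU_UP_PREPARE in the core.c hunk above, backs the "steal time accounting vs. CPU hotplug" fix in this pull: if the steal clock observed after an offline/online cycle restarts below the stale rq->prev_steal_time snapshot, an unsigned subtraction in the steal-time path plausibly wraps to an enormous bogus delta. A toy demonstration with invented values:

/* Toy demonstration (invented values, not kernel code) of why stale
 * prev_steal_time snapshots are cleared on CPU_UP_PREPARE: if the
 * steal clock restarts below the old snapshot, the unsigned delta
 * wraps around to a huge value. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t prev_steal_time = 5000000000ULL; /* stale, pre-hotplug */
	uint64_t steal_clock = 1000000ULL;	  /* restarted after online */

	/* Without the reset: unsigned underflow. */
	printf("stale delta: %" PRIu64 " ns\n", steal_clock - prev_steal_time);

	/* With account_reset_rq(): snapshot cleared first. */
	prev_steal_time = 0;
	printf("reset delta: %" PRIu64 " ns\n", steal_clock - prev_steal_time);
	return 0;
}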