author     Frederic Weisbecker <fweisbec@gmail.com>   2013-05-02 11:37:49 -0400
committer  Frederic Weisbecker <fweisbec@gmail.com>   2013-05-02 11:54:19 -0400
commit     c032862fba51a3ca504752d3a25186b324c5ce83 (patch)
tree       955dc2ba4ab3df76ecc2bb780ee84aca04967e8d /kernel/sched
parent     fda76e074c7737fc57855dd17c762e50ed526052 (diff)
parent     8700c95adb033843fc163d112b9d21d4fda78018 (diff)
Merge commit '8700c95adb03' into timers/nohz
The full dynticks tree needs the latest RCU and sched upstream updates
in order to fix some dependencies.

Merge a common upstream merge point that has these updates.

Conflicts:
	include/linux/perf_event.h
	kernel/rcutree.h
	kernel/rcutree_plugin.h

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Diffstat (limited to 'kernel/sched')
-rw-r--r--   kernel/sched/Makefile        1
-rw-r--r--   kernel/sched/clock.c        26
-rw-r--r--   kernel/sched/core.c        299
-rw-r--r--   kernel/sched/cpuacct.c     296
-rw-r--r--   kernel/sched/cpuacct.h      17
-rw-r--r--   kernel/sched/cputime.c      20
-rw-r--r--   kernel/sched/fair.c        131
-rw-r--r--   kernel/sched/features.h      7
-rw-r--r--   kernel/sched/idle_task.c    16
-rw-r--r--   kernel/sched/sched.h        61
10 files changed, 464 insertions, 410 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index f06d249e103b..deaf90e4a1de 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492df..c3ae1446461c 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
 	u64 this_clock, remote_clock;
 	u64 *ptr, old_val, val;
 
+#if BITS_PER_LONG != 64
+again:
+	/*
+	 * Careful here: The local and the remote clock values need to
+	 * be read out atomic as we need to compare the values and
+	 * then update either the local or the remote side. So the
+	 * cmpxchg64 below only protects one readout.
+	 *
+	 * We must reread via sched_clock_local() in the retry case on
+	 * 32bit as an NMI could use sched_clock_local() via the
+	 * tracer and hit between the readout of
+	 * the low32bit and the high 32bit portion.
+	 */
+	this_clock = sched_clock_local(my_scd);
+	/*
+	 * We must enforce atomic readout on 32bit, otherwise the
+	 * update on the remote cpu can hit inbetween the readout of
+	 * the low32bit and the high 32bit portion.
+	 */
+	remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+	/*
+	 * On 64bit the read of [my]scd->clock is atomic versus the
+	 * update, so we can avoid the above 32bit dance.
+	 */
 	sched_clock_local(my_scd);
 again:
 	this_clock = my_scd->clock;
 	remote_clock = scd->clock;
+#endif
 
 	/*
 	 * Use the opportunity that we have both locks
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dd09def88567..e94842d4400c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -512,11 +512,6 @@ static inline void init_hrtick(void)
  * the target CPU.
  */
 #ifdef CONFIG_SMP
-
-#ifndef tsk_is_polling
-#define tsk_is_polling(t) 0
-#endif
-
 void resched_task(struct task_struct *p)
 {
 	int cpu;
@@ -1536,8 +1531,10 @@ static void try_to_wake_up_local(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
 
-	BUG_ON(rq != this_rq());
-	BUG_ON(p == current);
+	if (WARN_ON_ONCE(rq != this_rq()) ||
+	    WARN_ON_ONCE(p == current))
+		return;
+
 	lockdep_assert_held(&rq->lock);
 
 	if (!raw_spin_trylock(&p->pi_lock)) {
@@ -3037,51 +3034,6 @@ void __sched schedule_preempt_disabled(void)
 	preempt_disable();
 }
 
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
-	if (lock->owner != owner)
-		return false;
-
-	/*
-	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
-	 * lock->owner still matches owner, if that fails, owner might
-	 * point to free()d memory, if it still matches, the rcu_read_lock()
-	 * ensures the memory stays valid.
-	 */
-	barrier();
-
-	return owner->on_cpu;
-}
-
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
-{
-	if (!sched_feat(OWNER_SPIN))
-		return 0;
-
-	rcu_read_lock();
-	while (owner_running(lock, owner)) {
-		if (need_resched())
-			break;
-
-		arch_mutex_cpu_relax();
-	}
-	rcu_read_unlock();
-
-	/*
-	 * We break out the loop above on need_resched() and when the
-	 * owner changed, which is a sign for heavy contention. Return
-	 * success only when lock->owner is NULL.
-	 */
-	return lock->owner == NULL;
-}
-#endif
-
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
@@ -4170,6 +4122,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	get_task_struct(p);
 	rcu_read_unlock();
 
+	if (p->flags & PF_NO_SETAFFINITY) {
+		retval = -EINVAL;
+		goto out_put_task;
+	}
 	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
 		retval = -ENOMEM;
 		goto out_put_task;
@@ -4817,11 +4773,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 		goto out;
 	}
 
-	if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
 	do_set_cpus_allowed(p, new_mask);
 
 	/* Can the task run on the task's current CPU? If so, we're done */
@@ -5043,7 +4994,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
 }
 
 static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
 
 static void
 set_table_entry(struct ctl_table *entry,
@@ -6292,7 +6243,7 @@ static void sched_init_numa(void)
 	 * 'level' contains the number of unique distances, excluding the
 	 * identity distance node_distance(i,i).
 	 *
-	 * The sched_domains_nume_distance[] array includes the actual distance
+	 * The sched_domains_numa_distance[] array includes the actual distance
 	 * numbers.
 	 */
 
@@ -6913,7 +6864,7 @@ struct task_group root_task_group;
 LIST_HEAD(task_groups);
 #endif
 
-DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
 
 void __init sched_init(void)
 {
@@ -6950,7 +6901,7 @@ void __init sched_init(void)
 #endif /* CONFIG_RT_GROUP_SCHED */
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	for_each_possible_cpu(i) {
-		per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+		per_cpu(load_balance_mask, i) = (void *)ptr;
 		ptr += cpumask_size();
 	}
 #endif /* CONFIG_CPUMASK_OFFSTACK */
@@ -6976,12 +6927,6 @@ void __init sched_init(void)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-#ifdef CONFIG_CGROUP_CPUACCT
-	root_cpuacct.cpustat = &kernel_cpustat;
-	root_cpuacct.cpuusage = alloc_percpu(u64);
-	/* Too early, not expected to fail */
-	BUG_ON(!root_cpuacct.cpuusage);
-#endif
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -8083,226 +8028,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-#ifdef CONFIG_CGROUP_CPUACCT
-
-/*
- * CPU accounting code for task groups.
- *
- * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
- * (balbir@in.ibm.com).
- */
-
-struct cpuacct root_cpuacct;
-
-/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
-{
-	struct cpuacct *ca;
-
-	if (!cgrp->parent)
-		return &root_cpuacct.css;
-
-	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-	if (!ca)
-		goto out;
-
-	ca->cpuusage = alloc_percpu(u64);
-	if (!ca->cpuusage)
-		goto out_free_ca;
-
-	ca->cpustat = alloc_percpu(struct kernel_cpustat);
-	if (!ca->cpustat)
-		goto out_free_cpuusage;
-
-	return &ca->css;
-
-out_free_cpuusage:
-	free_percpu(ca->cpuusage);
-out_free_ca:
-	kfree(ca);
-out:
-	return ERR_PTR(-ENOMEM);
-}
-
-/* destroy an existing cpu accounting group */
-static void cpuacct_css_free(struct cgroup *cgrp)
-{
-	struct cpuacct *ca = cgroup_ca(cgrp);
-
-	free_percpu(ca->cpustat);
-	free_percpu(ca->cpuusage);
-	kfree(ca);
-}
-
-static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
-{
-	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-	u64 data;
-
-#ifndef CONFIG_64BIT
-	/*
-	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
-	 */
-	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-	data = *cpuusage;
-	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-	data = *cpuusage;
-#endif
-
-	return data;
-}
-
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
-{
-	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
-#ifndef CONFIG_64BIT
-	/*
-	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
-	 */
-	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-	*cpuusage = val;
-	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-	*cpuusage = val;
-#endif
-}
-
-/* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
-{
-	struct cpuacct *ca = cgroup_ca(cgrp);
-	u64 totalcpuusage = 0;
-	int i;
-
-	for_each_present_cpu(i)
-		totalcpuusage += cpuacct_cpuusage_read(ca, i);
-
-	return totalcpuusage;
-}
-
-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
-			  u64 reset)
-{
-	struct cpuacct *ca = cgroup_ca(cgrp);
-	int err = 0;
-	int i;
-
-	if (reset) {
-		err = -EINVAL;
-		goto out;
-	}
-
-	for_each_present_cpu(i)
-		cpuacct_cpuusage_write(ca, i, 0);
-
-out:
-	return err;
-}
-
-static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
-				   struct seq_file *m)
-{
-	struct cpuacct *ca = cgroup_ca(cgroup);
-	u64 percpu;
-	int i;
-
-	for_each_present_cpu(i) {
-		percpu = cpuacct_cpuusage_read(ca, i);
-		seq_printf(m, "%llu ", (unsigned long long) percpu);
-	}
-	seq_printf(m, "\n");
-	return 0;
-}
-
-static const char *cpuacct_stat_desc[] = {
-	[CPUACCT_STAT_USER] = "user",
-	[CPUACCT_STAT_SYSTEM] = "system",
-};
-
-static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
-			      struct cgroup_map_cb *cb)
-{
-	struct cpuacct *ca = cgroup_ca(cgrp);
-	int cpu;
-	s64 val = 0;
-
-	for_each_online_cpu(cpu) {
-		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_USER];
-		val += kcpustat->cpustat[CPUTIME_NICE];
-	}
-	val = cputime64_to_clock_t(val);
-	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
-
-	val = 0;
-	for_each_online_cpu(cpu) {
-		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_SYSTEM];
-		val += kcpustat->cpustat[CPUTIME_IRQ];
-		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
-	}
-
-	val = cputime64_to_clock_t(val);
-	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
-
-	return 0;
-}
-
-static struct cftype files[] = {
-	{
-		.name = "usage",
-		.read_u64 = cpuusage_read,
-		.write_u64 = cpuusage_write,
-	},
-	{
-		.name = "usage_percpu",
-		.read_seq_string = cpuacct_percpu_seq_read,
-	},
-	{
-		.name = "stat",
-		.read_map = cpuacct_stats_show,
-	},
-	{ }	/* terminate */
-};
-
-/*
- * charge this task's execution time to its accounting group.
- *
- * called with rq->lock held.
- */
-void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
-	struct cpuacct *ca;
-	int cpu;
-
-	if (unlikely(!cpuacct_subsys.active))
-		return;
-
-	cpu = task_cpu(tsk);
-
-	rcu_read_lock();
-
-	ca = task_ca(tsk);
-
-	for (; ca; ca = parent_ca(ca)) {
-		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-		*cpuusage += cputime;
-	}
-
-	rcu_read_unlock();
-}
-
-struct cgroup_subsys cpuacct_subsys = {
-	.name		= "cpuacct",
-	.css_alloc	= cpuacct_css_alloc,
-	.css_free	= cpuacct_css_free,
-	.subsys_id	= cpuacct_subsys_id,
-	.base_cftypes	= files,
-};
-#endif /* CONFIG_CGROUP_CPUACCT */
-
 void dump_cpu_task(int cpu)
 {
 	pr_info("Task dump for CPU %d:\n", cpu);
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 000000000000..dbb7e2cd95eb
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,296 @@
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/cpumask.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel_stat.h>
+#include <linux/err.h>
+
+#include "sched.h"
+
+/*
+ * CPU accounting code for task groups.
+ *
+ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
+ * (balbir@in.ibm.com).
+ */
+
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+	CPUACCT_STAT_USER,	/* ... user mode */
+	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */
+
+	CPUACCT_STAT_NSTATS,
+};
+
+/* track cpu usage of a group of tasks and its child groups */
+struct cpuacct {
+	struct cgroup_subsys_state css;
+	/* cpuusage holds pointer to a u64-type object on every cpu */
+	u64 __percpu *cpuusage;
+	struct kernel_cpustat __percpu *cpustat;
+};
+
+/* return cpu accounting group corresponding to this container */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+{
+	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
+			    struct cpuacct, css);
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
+			    struct cpuacct, css);
+}
+
+static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
+{
+	return cgroup_ca(ca->css.cgroup->parent);
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+	if (!ca->css.cgroup->parent)
+		return NULL;
+	return cgroup_ca(ca->css.cgroup->parent);
+}
+
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
+static struct cpuacct root_cpuacct = {
+	.cpustat	= &kernel_cpustat,
+	.cpuusage	= &root_cpuacct_cpuusage,
+};
+
+/* create a new cpu accounting group */
+static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
+{
+	struct cpuacct *ca;
+
+	if (!cgrp->parent)
+		return &root_cpuacct.css;
+
+	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+	if (!ca)
+		goto out;
+
+	ca->cpuusage = alloc_percpu(u64);
+	if (!ca->cpuusage)
+		goto out_free_ca;
+
+	ca->cpustat = alloc_percpu(struct kernel_cpustat);
+	if (!ca->cpustat)
+		goto out_free_cpuusage;
+
+	return &ca->css;
+
+out_free_cpuusage:
+	free_percpu(ca->cpuusage);
+out_free_ca:
+	kfree(ca);
+out:
+	return ERR_PTR(-ENOMEM);
+}
+
+/* destroy an existing cpu accounting group */
+static void cpuacct_css_free(struct cgroup *cgrp)
+{
+	struct cpuacct *ca = cgroup_ca(cgrp);
+
+	free_percpu(ca->cpustat);
+	free_percpu(ca->cpuusage);
+	kfree(ca);
+}
+
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	u64 data;
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+	 */
+	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+	data = *cpuusage;
+	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	data = *cpuusage;
+#endif
+
+	return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+	 */
+	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+	*cpuusage = val;
+	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	*cpuusage = val;
+#endif
+}
+
+/* return total cpu usage (in nanoseconds) of a group */
+static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct cpuacct *ca = cgroup_ca(cgrp);
+	u64 totalcpuusage = 0;
+	int i;
+
+	for_each_present_cpu(i)
+		totalcpuusage += cpuacct_cpuusage_read(ca, i);
+
+	return totalcpuusage;
+}
+
+static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
+			  u64 reset)
+{
+	struct cpuacct *ca = cgroup_ca(cgrp);
+	int err = 0;
+	int i;
+
+	if (reset) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	for_each_present_cpu(i)
+		cpuacct_cpuusage_write(ca, i, 0);
+
+out:
+	return err;
+}
+
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+				   struct seq_file *m)
+{
+	struct cpuacct *ca = cgroup_ca(cgroup);
+	u64 percpu;
+	int i;
+
+	for_each_present_cpu(i) {
+		percpu = cpuacct_cpuusage_read(ca, i);
+		seq_printf(m, "%llu ", (unsigned long long) percpu);
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+
+static const char * const cpuacct_stat_desc[] = {
+	[CPUACCT_STAT_USER] = "user",
+	[CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+			      struct cgroup_map_cb *cb)
+{
+	struct cpuacct *ca = cgroup_ca(cgrp);
+	int cpu;
+	s64 val = 0;
+
+	for_each_online_cpu(cpu) {
+		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+		val += kcpustat->cpustat[CPUTIME_USER];
+		val += kcpustat->cpustat[CPUTIME_NICE];
+	}
+	val = cputime64_to_clock_t(val);
+	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+
+	val = 0;
+	for_each_online_cpu(cpu) {
+		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+		val += kcpustat->cpustat[CPUTIME_SYSTEM];
+		val += kcpustat->cpustat[CPUTIME_IRQ];
+		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+	}
+
+	val = cputime64_to_clock_t(val);
+	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
+	return 0;
+}
+
+static struct cftype files[] = {
+	{
+		.name = "usage",
+		.read_u64 = cpuusage_read,
+		.write_u64 = cpuusage_write,
+	},
+	{
+		.name = "usage_percpu",
+		.read_seq_string = cpuacct_percpu_seq_read,
+	},
+	{
+		.name = "stat",
+		.read_map = cpuacct_stats_show,
+	},
+	{ }	/* terminate */
+};
+
+/*
+ * charge this task's execution time to its accounting group.
+ *
+ * called with rq->lock held.
+ */
+void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+	struct cpuacct *ca;
+	int cpu;
+
+	cpu = task_cpu(tsk);
+
+	rcu_read_lock();
+
+	ca = task_ca(tsk);
+
+	while (true) {
+		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+		*cpuusage += cputime;
+
+		ca = parent_ca(ca);
+		if (!ca)
+			break;
+	}
+
+	rcu_read_unlock();
+}
+
+/*
+ * Add user/system time to cpuacct.
+ *
+ * Note: it's the caller that updates the account of the root cgroup.
+ */
+void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+	struct kernel_cpustat *kcpustat;
+	struct cpuacct *ca;
+
+	rcu_read_lock();
+	ca = task_ca(p);
+	while (ca != &root_cpuacct) {
+		kcpustat = this_cpu_ptr(ca->cpustat);
+		kcpustat->cpustat[index] += val;
+		ca = __parent_ca(ca);
+	}
+	rcu_read_unlock();
+}
+
+struct cgroup_subsys cpuacct_subsys = {
+	.name		= "cpuacct",
+	.css_alloc	= cpuacct_css_alloc,
+	.css_free	= cpuacct_css_free,
+	.subsys_id	= cpuacct_subsys_id,
+	.base_cftypes	= files,
+	.early_init	= 1,
+};
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 000000000000..ed605624a5e7
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@
+#ifdef CONFIG_CGROUP_CPUACCT
+
+extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+
+#else
+
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+}
+
+static inline void
+cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+}
+
+#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 699d59756ece..ea32f02bf2c3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void)
 static inline void task_group_account_field(struct task_struct *p, int index,
 					    u64 tmp)
 {
-#ifdef CONFIG_CGROUP_CPUACCT
-	struct kernel_cpustat *kcpustat;
-	struct cpuacct *ca;
-#endif
 	/*
 	 * Since all updates are sure to touch the root cgroup, we
 	 * get ourselves ahead and touch it first. If the root cgroup
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
 	 */
 	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
 
-#ifdef CONFIG_CGROUP_CPUACCT
-	if (unlikely(!cpuacct_subsys.active))
-		return;
-
-	rcu_read_lock();
-	ca = task_ca(p);
-	while (ca && (ca != &root_cpuacct)) {
-		kcpustat = this_cpu_ptr(ca->cpustat);
-		kcpustat->cpustat[index] += tmp;
-		ca = parent_ca(ca);
-	}
-	rcu_read_unlock();
-#endif
+	cpuacct_account_field(p, index, tmp);
 }
 
 /*
@@ -310,7 +294,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 
 	t = tsk;
 	do {
-		task_cputime(tsk, &utime, &stime);
+		task_cputime(t, &utime, &stime);
 		times->utime += utime;
 		times->stime += stime;
 		times->sum_exec_runtime += task_sched_runtime(t);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c97fca091a7..c61a614465c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1563,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
 }
+
+/*
+ * Update the rq's load with the elapsed running time before entering
+ * idle. if the last scheduled task is not a CFS task, idle_enter will
+ * be the only way to update the runnable statistic.
+ */
+void idle_enter_fair(struct rq *this_rq)
+{
+	update_rq_runnable_avg(this_rq, 1);
+}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit_fair(struct rq *this_rq)
+{
+	update_rq_runnable_avg(this_rq, 0);
+}
+
 #else
 static inline void update_entity_load_avg(struct sched_entity *se,
 					  int update_cfs_rq) {}
@@ -3875,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	int tsk_cache_hot = 0;
 	/*
 	 * We do not migrate tasks that are:
-	 * 1) running (obviously), or
+	 * 1) throttled_lb_pair, or
 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
-	 * 3) are cache-hot on their current CPU.
+	 * 3) running (obviously), or
+	 * 4) are cache-hot on their current CPU.
 	 */
+	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+		return 0;
+
 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
-		int new_dst_cpu;
+		int cpu;
 
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 
@@ -3895,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
 			return 0;
 
-		new_dst_cpu = cpumask_first_and(env->dst_grpmask,
-						tsk_cpus_allowed(p));
-		if (new_dst_cpu < nr_cpu_ids) {
-			env->flags |= LBF_SOME_PINNED;
-			env->new_dst_cpu = new_dst_cpu;
+		/* Prevent to re-select dst_cpu via env's cpus */
+		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+				env->flags |= LBF_SOME_PINNED;
+				env->new_dst_cpu = cpu;
+				break;
+			}
 		}
+
 		return 0;
 	}
 
@@ -3921,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
 	if (!tsk_cache_hot ||
 		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-#ifdef CONFIG_SCHEDSTATS
+
 		if (tsk_cache_hot) {
 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
 			schedstat_inc(p, se.statistics.nr_forced_migrations);
 		}
-#endif
+
 		return 1;
 	}
 
-	if (tsk_cache_hot) {
-		schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
-		return 0;
-	}
-	return 1;
+	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+	return 0;
 }
 
 /*
@@ -3949,9 +3974,6 @@ static int move_one_task(struct lb_env *env)
 	struct task_struct *p, *n;
 
 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
-		if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
-			continue;
-
 		if (!can_migrate_task(p, env))
 			continue;
 
@@ -4003,7 +4025,7 @@ static int move_tasks(struct lb_env *env)
 			break;
 		}
 
-		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+		if (!can_migrate_task(p, env))
 			goto next;
 
 		load = task_h_load(p);
@@ -4014,9 +4036,6 @@ static int move_tasks(struct lb_env *env)
 		if ((load / 2) > env->imbalance)
 			goto next;
 
-		if (!can_migrate_task(p, env))
-			goto next;
-
 		move_task(p, env);
 		pulled++;
 		env->imbalance -= load;
@@ -4961,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 #define MAX_PINNED_INTERVAL	512
 
 /* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 
 static int need_active_balance(struct lb_env *env)
 {
@@ -4992,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			int *balance)
 {
 	int ld_moved, cur_ld_moved, active_balance = 0;
-	int lb_iterations, max_lb_iterations;
 	struct sched_group *group;
 	struct rq *busiest;
 	unsigned long flags;
-	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
+	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
 
 	struct lb_env env = {
 		.sd		= sd,
@@ -5008,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		.cpus		= cpus,
 	};
 
+	/*
+	 * For NEWLY_IDLE load_balancing, we don't need to consider
+	 * other cpus in our group
+	 */
+	if (idle == CPU_NEWLY_IDLE)
+		env.dst_grpmask = NULL;
+
 	cpumask_copy(cpus, cpu_active_mask);
-	max_lb_iterations = cpumask_weight(env.dst_grpmask);
 
 	schedstat_inc(sd, lb_count[idle]);
 
@@ -5035,7 +5059,6 @@ redo:
 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
 
 	ld_moved = 0;
-	lb_iterations = 1;
 	if (busiest->nr_running > 1) {
 		/*
 		 * Attempt to move tasks. If find_busiest_group has found
@@ -5062,17 +5085,17 @@ more_balance:
 		double_rq_unlock(env.dst_rq, busiest);
 		local_irq_restore(flags);
 
-		if (env.flags & LBF_NEED_BREAK) {
-			env.flags &= ~LBF_NEED_BREAK;
-			goto more_balance;
-		}
-
 		/*
 		 * some other cpu did the load balance for us.
 		 */
 		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
 			resched_cpu(env.dst_cpu);
 
+		if (env.flags & LBF_NEED_BREAK) {
+			env.flags &= ~LBF_NEED_BREAK;
+			goto more_balance;
+		}
+
 		/*
 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
 		 * us and move them to an alternate dst_cpu in our sched_group
@@ -5092,14 +5115,17 @@ more_balance:
 		 * moreover subsequent load balance cycles should correct the
 		 * excess load moved.
 		 */
-		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
-				lb_iterations++ < max_lb_iterations) {
+		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
 
 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
 			env.dst_cpu	 = env.new_dst_cpu;
 			env.flags	&= ~LBF_SOME_PINNED;
 			env.loop	 = 0;
 			env.loop_break	 = sched_nr_migrate_break;
+
+			/* Prevent to re-select dst_cpu via env's cpus */
+			cpumask_clear_cpu(env.dst_cpu, env.cpus);
+
 			/*
 			 * Go back to "more_balance" rather than "redo" since we
 			 * need to continue with same src_cpu.
@@ -5220,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 	if (this_rq->avg_idle < sysctl_sched_migration_cost)
 		return;
 
-	update_rq_runnable_avg(this_rq, 1);
-
 	/*
 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
 	 */
@@ -5396,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void)
 	struct sched_domain *sd;
 	int cpu = smp_processor_id();
 
-	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
-		return;
-	clear_bit(NOHZ_IDLE, nohz_flags(cpu));
-
 	rcu_read_lock();
-	for_each_domain(cpu, sd)
+	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+
+	if (!sd || !sd->nohz_idle)
+		goto unlock;
+	sd->nohz_idle = 0;
+
+	for (; sd; sd = sd->parent)
 		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+unlock:
 	rcu_read_unlock();
 }
 
@@ -5411,13 +5438,16 @@ void set_cpu_sd_state_idle(void)
 	struct sched_domain *sd;
 	int cpu = smp_processor_id();
 
-	if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
-		return;
-	set_bit(NOHZ_IDLE, nohz_flags(cpu));
-
 	rcu_read_lock();
-	for_each_domain(cpu, sd)
+	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+
+	if (!sd || sd->nohz_idle)
+		goto unlock;
+	sd->nohz_idle = 1;
+
+	for (; sd; sd = sd->parent)
 		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+unlock:
 	rcu_read_unlock();
 }
 
@@ -5469,7 +5499,7 @@ void update_max_interval(void)
  * It checks each scheduling domain to see if it is due to be balanced,
  * and initiates a balancing operation if so.
  *
- * Balancing parameters are set up in arch_init_sched_domains.
+ * Balancing parameters are set up in init_sched_domains.
  */
 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 {
@@ -5507,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
 			if (load_balance(cpu, rq, sd, idle, &balance)) {
 				/*
-				 * We've pulled tasks over so either we're no
-				 * longer idle.
+				 * The LBF_SOME_PINNED logic could have changed
+				 * env->dst_cpu, so we can't know our idle
+				 * state even if we migrated tasks. Update it.
 				 */
-				idle = CPU_NOT_IDLE;
+				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
 			}
 			sd->last_balance = jiffies;
 		}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b5395f..99399f8e4799 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false)
 SCHED_FEAT(LB_BIAS, true)
 
 /*
- * Spin-wait on mutex acquisition when the mutex owner is running on
- * another cpu -- assumes that when the owner is running, it will soon
- * release the lock. Decreases scheduling overhead.
- */
-SCHED_FEAT(OWNER_SPIN, true)
-
-/*
  * Decrement CPU power based on time not spent running tasks
  */
 SCHED_FEAT(NONTASK_POWER, true)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae9..b8ce77328341 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
+
+static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
+{
+	idle_exit_fair(rq);
+}
+
+static void post_schedule_idle(struct rq *rq)
+{
+	idle_enter_fair(rq);
+}
 #endif /* CONFIG_SMP */
 /*
  * Idle tasks are unconditionally rescheduled:
@@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
+#ifdef CONFIG_SMP
+	/* Trigger the post schedule to do an idle_enter for CFS */
+	rq->post_schedule = 1;
+#endif
 	return rq->idle;
 }
 
@@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_idle,
+	.pre_schedule		= pre_schedule_idle,
+	.post_schedule		= post_schedule_idle,
 #endif
 
 	.set_curr_task          = set_curr_task_idle,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eb363aa5d83c..24dc29897749 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -8,6 +8,7 @@
 #include <linux/tick.h>
 
 #include "cpupri.h"
+#include "cpuacct.h"
 
 extern __read_mostly int scheduler_running;
 
@@ -951,14 +952,6 @@ static const u32 prio_to_wmult[40] = {
  /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-/* Time spent by the tasks of the cpu accounting group executing in ... */
-enum cpuacct_stat_index {
-	CPUACCT_STAT_USER,	/* ... user mode */
-	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */
-
-	CPUACCT_STAT_NSTATS,
-};
-
 #define ENQUEUE_WAKEUP		1
 #define ENQUEUE_HEAD		2
 #ifdef CONFIG_SMP
@@ -1032,6 +1025,18 @@ extern void update_group_power(struct sched_domain *sd, int cpu);
 extern void trigger_load_balance(struct rq *rq, int cpu);
 extern void idle_balance(int this_cpu, struct rq *this_rq);
 
+/*
+ * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
+ * becomes useful in lb
+ */
+#if defined(CONFIG_FAIR_GROUP_SCHED)
+extern void idle_enter_fair(struct rq *this_rq);
+extern void idle_exit_fair(struct rq *this_rq);
+#else
+static inline void idle_enter_fair(struct rq *this_rq) {}
+static inline void idle_exit_fair(struct rq *this_rq) {}
+#endif
+
 #else	/* CONFIG_SMP */
 
 static inline void idle_balance(int cpu, struct rq *rq)
@@ -1055,45 +1060,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
 
 extern void update_idle_cpu_load(struct rq *this_rq);
 
-#ifdef CONFIG_CGROUP_CPUACCT
-#include <linux/cgroup.h>
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
-	struct cgroup_subsys_state css;
-	/* cpuusage holds pointer to a u64-type object on every cpu */
-	u64 __percpu *cpuusage;
-	struct kernel_cpustat __percpu *cpustat;
-};
-
-extern struct cgroup_subsys cpuacct_subsys;
-extern struct cpuacct root_cpuacct;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
-	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
-	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
-static inline struct cpuacct *parent_ca(struct cpuacct *ca)
-{
-	if (!ca || !ca->css.cgroup->parent)
-		return NULL;
-	return cgroup_ca(ca->css.cgroup->parent);
-}
-
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-#else
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
-#endif
-
 #ifdef CONFIG_PARAVIRT
 static inline u64 steal_ticks(u64 steal)
 {
@@ -1348,7 +1314,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
 enum rq_nohz_flag_bits {
 	NOHZ_TICK_STOPPED,
 	NOHZ_BALANCE_KICK,
-	NOHZ_IDLE,
 };
 
 #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)