author    Frederic Weisbecker <fweisbec@gmail.com>   2013-05-02 11:37:49 -0400
committer Frederic Weisbecker <fweisbec@gmail.com>   2013-05-02 11:54:19 -0400
commit    c032862fba51a3ca504752d3a25186b324c5ce83 (patch)
tree      955dc2ba4ab3df76ecc2bb780ee84aca04967e8d /kernel/sched
parent    fda76e074c7737fc57855dd17c762e50ed526052 (diff)
parent    8700c95adb033843fc163d112b9d21d4fda78018 (diff)
Merge commit '8700c95adb03' into timers/nohz
The full dynticks tree needs the latest RCU and sched
upstream updates in order to fix some dependencies.
Merge a common upstream merge point that has these
updates.
Conflicts:
include/linux/perf_event.h
kernel/rcutree.h
kernel/rcutree_plugin.h
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile    |   1
-rw-r--r--  kernel/sched/clock.c     |  26
-rw-r--r--  kernel/sched/core.c      | 299
-rw-r--r--  kernel/sched/cpuacct.c   | 296
-rw-r--r--  kernel/sched/cpuacct.h   |  17
-rw-r--r--  kernel/sched/cputime.c   |  20
-rw-r--r--  kernel/sched/fair.c      | 131
-rw-r--r--  kernel/sched/features.h  |   7
-rw-r--r--  kernel/sched/idle_task.c |  16
-rw-r--r--  kernel/sched/sched.h     |  61
10 files changed, 464 insertions, 410 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index f06d249e103b..deaf90e4a1de 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o | |||
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 18 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
19 | obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o | ||
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492df..c3ae1446461c 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd) | |||
176 | u64 this_clock, remote_clock; | 176 | u64 this_clock, remote_clock; |
177 | u64 *ptr, old_val, val; | 177 | u64 *ptr, old_val, val; |
178 | 178 | ||
179 | #if BITS_PER_LONG != 64 | ||
180 | again: | ||
181 | /* | ||
182 | * Careful here: The local and the remote clock values need to | ||
183 | * be read out atomic as we need to compare the values and | ||
184 | * then update either the local or the remote side. So the | ||
185 | * cmpxchg64 below only protects one readout. | ||
186 | * | ||
187 | * We must reread via sched_clock_local() in the retry case on | ||
188 | * 32bit as an NMI could use sched_clock_local() via the | ||
189 | * tracer and hit between the readout of | ||
190 | * the low32bit and the high 32bit portion. | ||
191 | */ | ||
192 | this_clock = sched_clock_local(my_scd); | ||
193 | /* | ||
194 | * We must enforce atomic readout on 32bit, otherwise the | ||
195 | * update on the remote cpu can hit inbetween the readout of | ||
196 | * the low32bit and the high 32bit portion. | ||
197 | */ | ||
198 | remote_clock = cmpxchg64(&scd->clock, 0, 0); | ||
199 | #else | ||
200 | /* | ||
201 | * On 64bit the read of [my]scd->clock is atomic versus the | ||
202 | * update, so we can avoid the above 32bit dance. | ||
203 | */ | ||
179 | sched_clock_local(my_scd); | 204 | sched_clock_local(my_scd); |
180 | again: | 205 | again: |
181 | this_clock = my_scd->clock; | 206 | this_clock = my_scd->clock; |
182 | remote_clock = scd->clock; | 207 | remote_clock = scd->clock; |
208 | #endif | ||
183 | 209 | ||
184 | /* | 210 | /* |
185 | * Use the opportunity that we have both locks | 211 | * Use the opportunity that we have both locks |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dd09def88567..e94842d4400c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -512,11 +512,6 @@ static inline void init_hrtick(void) | |||
512 | * the target CPU. | 512 | * the target CPU. |
513 | */ | 513 | */ |
514 | #ifdef CONFIG_SMP | 514 | #ifdef CONFIG_SMP |
515 | |||
516 | #ifndef tsk_is_polling | ||
517 | #define tsk_is_polling(t) 0 | ||
518 | #endif | ||
519 | |||
520 | void resched_task(struct task_struct *p) | 515 | void resched_task(struct task_struct *p) |
521 | { | 516 | { |
522 | int cpu; | 517 | int cpu; |
@@ -1536,8 +1531,10 @@ static void try_to_wake_up_local(struct task_struct *p) | |||
1536 | { | 1531 | { |
1537 | struct rq *rq = task_rq(p); | 1532 | struct rq *rq = task_rq(p); |
1538 | 1533 | ||
1539 | BUG_ON(rq != this_rq()); | 1534 | if (WARN_ON_ONCE(rq != this_rq()) || |
1540 | BUG_ON(p == current); | 1535 | WARN_ON_ONCE(p == current)) |
1536 | return; | ||
1537 | |||
1541 | lockdep_assert_held(&rq->lock); | 1538 | lockdep_assert_held(&rq->lock); |
1542 | 1539 | ||
1543 | if (!raw_spin_trylock(&p->pi_lock)) { | 1540 | if (!raw_spin_trylock(&p->pi_lock)) { |
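The BUG_ON() to WARN_ON_ONCE() conversion above works because WARN_ON_ONCE() evaluates to the tested condition, so it can gate an early return instead of panicking the machine. A rough user-space stand-in for that pattern (the macro below is a simplification for illustration, not the kernel's implementation):

#include <stdio.h>
#include <stdbool.h>

/* crude stand-in for the kernel's WARN_ON_ONCE(): warn once, return cond */
#define WARN_ON_ONCE(cond) ({						\
	static bool __warned;						\
	bool __c = (cond);						\
	if (__c && !__warned) {						\
		__warned = true;					\
		fprintf(stderr, "warning: %s\n", #cond);		\
	}								\
	__c;								\
})

static void wake_local(int rq_cpu, int this_cpu)
{
	/* refuse to continue on an unexpected runqueue, but keep running */
	if (WARN_ON_ONCE(rq_cpu != this_cpu))
		return;
	printf("waking task on cpu %d\n", this_cpu);
}

int main(void)
{
	wake_local(0, 0);	/* normal path */
	wake_local(1, 0);	/* warns once and bails out */
	wake_local(1, 0);	/* silent, still bails out */
	return 0;
}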
@@ -3037,51 +3034,6 @@ void __sched schedule_preempt_disabled(void) | |||
3037 | preempt_disable(); | 3034 | preempt_disable(); |
3038 | } | 3035 | } |
3039 | 3036 | ||
3040 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER | ||
3041 | |||
3042 | static inline bool owner_running(struct mutex *lock, struct task_struct *owner) | ||
3043 | { | ||
3044 | if (lock->owner != owner) | ||
3045 | return false; | ||
3046 | |||
3047 | /* | ||
3048 | * Ensure we emit the owner->on_cpu, dereference _after_ checking | ||
3049 | * lock->owner still matches owner, if that fails, owner might | ||
3050 | * point to free()d memory, if it still matches, the rcu_read_lock() | ||
3051 | * ensures the memory stays valid. | ||
3052 | */ | ||
3053 | barrier(); | ||
3054 | |||
3055 | return owner->on_cpu; | ||
3056 | } | ||
3057 | |||
3058 | /* | ||
3059 | * Look out! "owner" is an entirely speculative pointer | ||
3060 | * access and not reliable. | ||
3061 | */ | ||
3062 | int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) | ||
3063 | { | ||
3064 | if (!sched_feat(OWNER_SPIN)) | ||
3065 | return 0; | ||
3066 | |||
3067 | rcu_read_lock(); | ||
3068 | while (owner_running(lock, owner)) { | ||
3069 | if (need_resched()) | ||
3070 | break; | ||
3071 | |||
3072 | arch_mutex_cpu_relax(); | ||
3073 | } | ||
3074 | rcu_read_unlock(); | ||
3075 | |||
3076 | /* | ||
3077 | * We break out the loop above on need_resched() and when the | ||
3078 | * owner changed, which is a sign for heavy contention. Return | ||
3079 | * success only when lock->owner is NULL. | ||
3080 | */ | ||
3081 | return lock->owner == NULL; | ||
3082 | } | ||
3083 | #endif | ||
3084 | |||
3085 | #ifdef CONFIG_PREEMPT | 3037 | #ifdef CONFIG_PREEMPT |
3086 | /* | 3038 | /* |
3087 | * this is the entry point to schedule() from in-kernel preemption | 3039 | * this is the entry point to schedule() from in-kernel preemption |
@@ -4170,6 +4122,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
4170 | get_task_struct(p); | 4122 | get_task_struct(p); |
4171 | rcu_read_unlock(); | 4123 | rcu_read_unlock(); |
4172 | 4124 | ||
4125 | if (p->flags & PF_NO_SETAFFINITY) { | ||
4126 | retval = -EINVAL; | ||
4127 | goto out_put_task; | ||
4128 | } | ||
4173 | if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { | 4129 | if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { |
4174 | retval = -ENOMEM; | 4130 | retval = -ENOMEM; |
4175 | goto out_put_task; | 4131 | goto out_put_task; |
@@ -4817,11 +4773,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
4817 | goto out; | 4773 | goto out; |
4818 | } | 4774 | } |
4819 | 4775 | ||
4820 | if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { | ||
4821 | ret = -EINVAL; | ||
4822 | goto out; | ||
4823 | } | ||
4824 | |||
4825 | do_set_cpus_allowed(p, new_mask); | 4776 | do_set_cpus_allowed(p, new_mask); |
4826 | 4777 | ||
4827 | /* Can the task run on the task's current CPU? If so, we're done */ | 4778 | /* Can the task run on the task's current CPU? If so, we're done */ |
@@ -5043,7 +4994,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) | |||
5043 | } | 4994 | } |
5044 | 4995 | ||
5045 | static int min_load_idx = 0; | 4996 | static int min_load_idx = 0; |
5046 | static int max_load_idx = CPU_LOAD_IDX_MAX; | 4997 | static int max_load_idx = CPU_LOAD_IDX_MAX-1; |
5047 | 4998 | ||
5048 | static void | 4999 | static void |
5049 | set_table_entry(struct ctl_table *entry, | 5000 | set_table_entry(struct ctl_table *entry, |
@@ -6292,7 +6243,7 @@ static void sched_init_numa(void) | |||
6292 | * 'level' contains the number of unique distances, excluding the | 6243 | * 'level' contains the number of unique distances, excluding the |
6293 | * identity distance node_distance(i,i). | 6244 | * identity distance node_distance(i,i). |
6294 | * | 6245 | * |
6295 | * The sched_domains_nume_distance[] array includes the actual distance | 6246 | * The sched_domains_numa_distance[] array includes the actual distance |
6296 | * numbers. | 6247 | * numbers. |
6297 | */ | 6248 | */ |
6298 | 6249 | ||
@@ -6913,7 +6864,7 @@ struct task_group root_task_group; | |||
6913 | LIST_HEAD(task_groups); | 6864 | LIST_HEAD(task_groups); |
6914 | #endif | 6865 | #endif |
6915 | 6866 | ||
6916 | DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 6867 | DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); |
6917 | 6868 | ||
6918 | void __init sched_init(void) | 6869 | void __init sched_init(void) |
6919 | { | 6870 | { |
@@ -6950,7 +6901,7 @@ void __init sched_init(void) | |||
6950 | #endif /* CONFIG_RT_GROUP_SCHED */ | 6901 | #endif /* CONFIG_RT_GROUP_SCHED */ |
6951 | #ifdef CONFIG_CPUMASK_OFFSTACK | 6902 | #ifdef CONFIG_CPUMASK_OFFSTACK |
6952 | for_each_possible_cpu(i) { | 6903 | for_each_possible_cpu(i) { |
6953 | per_cpu(load_balance_tmpmask, i) = (void *)ptr; | 6904 | per_cpu(load_balance_mask, i) = (void *)ptr; |
6954 | ptr += cpumask_size(); | 6905 | ptr += cpumask_size(); |
6955 | } | 6906 | } |
6956 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 6907 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
@@ -6976,12 +6927,6 @@ void __init sched_init(void) | |||
6976 | 6927 | ||
6977 | #endif /* CONFIG_CGROUP_SCHED */ | 6928 | #endif /* CONFIG_CGROUP_SCHED */ |
6978 | 6929 | ||
6979 | #ifdef CONFIG_CGROUP_CPUACCT | ||
6980 | root_cpuacct.cpustat = &kernel_cpustat; | ||
6981 | root_cpuacct.cpuusage = alloc_percpu(u64); | ||
6982 | /* Too early, not expected to fail */ | ||
6983 | BUG_ON(!root_cpuacct.cpuusage); | ||
6984 | #endif | ||
6985 | for_each_possible_cpu(i) { | 6930 | for_each_possible_cpu(i) { |
6986 | struct rq *rq; | 6931 | struct rq *rq; |
6987 | 6932 | ||
@@ -8083,226 +8028,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
8083 | 8028 | ||
8084 | #endif /* CONFIG_CGROUP_SCHED */ | 8029 | #endif /* CONFIG_CGROUP_SCHED */ |
8085 | 8030 | ||
8086 | #ifdef CONFIG_CGROUP_CPUACCT | ||
8087 | |||
8088 | /* | ||
8089 | * CPU accounting code for task groups. | ||
8090 | * | ||
8091 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | ||
8092 | * (balbir@in.ibm.com). | ||
8093 | */ | ||
8094 | |||
8095 | struct cpuacct root_cpuacct; | ||
8096 | |||
8097 | /* create a new cpu accounting group */ | ||
8098 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) | ||
8099 | { | ||
8100 | struct cpuacct *ca; | ||
8101 | |||
8102 | if (!cgrp->parent) | ||
8103 | return &root_cpuacct.css; | ||
8104 | |||
8105 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
8106 | if (!ca) | ||
8107 | goto out; | ||
8108 | |||
8109 | ca->cpuusage = alloc_percpu(u64); | ||
8110 | if (!ca->cpuusage) | ||
8111 | goto out_free_ca; | ||
8112 | |||
8113 | ca->cpustat = alloc_percpu(struct kernel_cpustat); | ||
8114 | if (!ca->cpustat) | ||
8115 | goto out_free_cpuusage; | ||
8116 | |||
8117 | return &ca->css; | ||
8118 | |||
8119 | out_free_cpuusage: | ||
8120 | free_percpu(ca->cpuusage); | ||
8121 | out_free_ca: | ||
8122 | kfree(ca); | ||
8123 | out: | ||
8124 | return ERR_PTR(-ENOMEM); | ||
8125 | } | ||
8126 | |||
8127 | /* destroy an existing cpu accounting group */ | ||
8128 | static void cpuacct_css_free(struct cgroup *cgrp) | ||
8129 | { | ||
8130 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8131 | |||
8132 | free_percpu(ca->cpustat); | ||
8133 | free_percpu(ca->cpuusage); | ||
8134 | kfree(ca); | ||
8135 | } | ||
8136 | |||
8137 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) | ||
8138 | { | ||
8139 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
8140 | u64 data; | ||
8141 | |||
8142 | #ifndef CONFIG_64BIT | ||
8143 | /* | ||
8144 | * Take rq->lock to make 64-bit read safe on 32-bit platforms. | ||
8145 | */ | ||
8146 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
8147 | data = *cpuusage; | ||
8148 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
8149 | #else | ||
8150 | data = *cpuusage; | ||
8151 | #endif | ||
8152 | |||
8153 | return data; | ||
8154 | } | ||
8155 | |||
8156 | static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | ||
8157 | { | ||
8158 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
8159 | |||
8160 | #ifndef CONFIG_64BIT | ||
8161 | /* | ||
8162 | * Take rq->lock to make 64-bit write safe on 32-bit platforms. | ||
8163 | */ | ||
8164 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
8165 | *cpuusage = val; | ||
8166 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
8167 | #else | ||
8168 | *cpuusage = val; | ||
8169 | #endif | ||
8170 | } | ||
8171 | |||
8172 | /* return total cpu usage (in nanoseconds) of a group */ | ||
8173 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | ||
8174 | { | ||
8175 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8176 | u64 totalcpuusage = 0; | ||
8177 | int i; | ||
8178 | |||
8179 | for_each_present_cpu(i) | ||
8180 | totalcpuusage += cpuacct_cpuusage_read(ca, i); | ||
8181 | |||
8182 | return totalcpuusage; | ||
8183 | } | ||
8184 | |||
8185 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | ||
8186 | u64 reset) | ||
8187 | { | ||
8188 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8189 | int err = 0; | ||
8190 | int i; | ||
8191 | |||
8192 | if (reset) { | ||
8193 | err = -EINVAL; | ||
8194 | goto out; | ||
8195 | } | ||
8196 | |||
8197 | for_each_present_cpu(i) | ||
8198 | cpuacct_cpuusage_write(ca, i, 0); | ||
8199 | |||
8200 | out: | ||
8201 | return err; | ||
8202 | } | ||
8203 | |||
8204 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | ||
8205 | struct seq_file *m) | ||
8206 | { | ||
8207 | struct cpuacct *ca = cgroup_ca(cgroup); | ||
8208 | u64 percpu; | ||
8209 | int i; | ||
8210 | |||
8211 | for_each_present_cpu(i) { | ||
8212 | percpu = cpuacct_cpuusage_read(ca, i); | ||
8213 | seq_printf(m, "%llu ", (unsigned long long) percpu); | ||
8214 | } | ||
8215 | seq_printf(m, "\n"); | ||
8216 | return 0; | ||
8217 | } | ||
8218 | |||
8219 | static const char *cpuacct_stat_desc[] = { | ||
8220 | [CPUACCT_STAT_USER] = "user", | ||
8221 | [CPUACCT_STAT_SYSTEM] = "system", | ||
8222 | }; | ||
8223 | |||
8224 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
8225 | struct cgroup_map_cb *cb) | ||
8226 | { | ||
8227 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
8228 | int cpu; | ||
8229 | s64 val = 0; | ||
8230 | |||
8231 | for_each_online_cpu(cpu) { | ||
8232 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
8233 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
8234 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
8235 | } | ||
8236 | val = cputime64_to_clock_t(val); | ||
8237 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
8238 | |||
8239 | val = 0; | ||
8240 | for_each_online_cpu(cpu) { | ||
8241 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
8242 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; | ||
8243 | val += kcpustat->cpustat[CPUTIME_IRQ]; | ||
8244 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
8245 | } | ||
8246 | |||
8247 | val = cputime64_to_clock_t(val); | ||
8248 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | ||
8249 | |||
8250 | return 0; | ||
8251 | } | ||
8252 | |||
8253 | static struct cftype files[] = { | ||
8254 | { | ||
8255 | .name = "usage", | ||
8256 | .read_u64 = cpuusage_read, | ||
8257 | .write_u64 = cpuusage_write, | ||
8258 | }, | ||
8259 | { | ||
8260 | .name = "usage_percpu", | ||
8261 | .read_seq_string = cpuacct_percpu_seq_read, | ||
8262 | }, | ||
8263 | { | ||
8264 | .name = "stat", | ||
8265 | .read_map = cpuacct_stats_show, | ||
8266 | }, | ||
8267 | { } /* terminate */ | ||
8268 | }; | ||
8269 | |||
8270 | /* | ||
8271 | * charge this task's execution time to its accounting group. | ||
8272 | * | ||
8273 | * called with rq->lock held. | ||
8274 | */ | ||
8275 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) | ||
8276 | { | ||
8277 | struct cpuacct *ca; | ||
8278 | int cpu; | ||
8279 | |||
8280 | if (unlikely(!cpuacct_subsys.active)) | ||
8281 | return; | ||
8282 | |||
8283 | cpu = task_cpu(tsk); | ||
8284 | |||
8285 | rcu_read_lock(); | ||
8286 | |||
8287 | ca = task_ca(tsk); | ||
8288 | |||
8289 | for (; ca; ca = parent_ca(ca)) { | ||
8290 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
8291 | *cpuusage += cputime; | ||
8292 | } | ||
8293 | |||
8294 | rcu_read_unlock(); | ||
8295 | } | ||
8296 | |||
8297 | struct cgroup_subsys cpuacct_subsys = { | ||
8298 | .name = "cpuacct", | ||
8299 | .css_alloc = cpuacct_css_alloc, | ||
8300 | .css_free = cpuacct_css_free, | ||
8301 | .subsys_id = cpuacct_subsys_id, | ||
8302 | .base_cftypes = files, | ||
8303 | }; | ||
8304 | #endif /* CONFIG_CGROUP_CPUACCT */ | ||
8305 | |||
8306 | void dump_cpu_task(int cpu) | 8031 | void dump_cpu_task(int cpu) |
8307 | { | 8032 | { |
8308 | pr_info("Task dump for CPU %d:\n", cpu); | 8033 | pr_info("Task dump for CPU %d:\n", cpu); |
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 000000000000..dbb7e2cd95eb
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,296 @@ | |||
1 | #include <linux/cgroup.h> | ||
2 | #include <linux/slab.h> | ||
3 | #include <linux/percpu.h> | ||
4 | #include <linux/spinlock.h> | ||
5 | #include <linux/cpumask.h> | ||
6 | #include <linux/seq_file.h> | ||
7 | #include <linux/rcupdate.h> | ||
8 | #include <linux/kernel_stat.h> | ||
9 | #include <linux/err.h> | ||
10 | |||
11 | #include "sched.h" | ||
12 | |||
13 | /* | ||
14 | * CPU accounting code for task groups. | ||
15 | * | ||
16 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | ||
17 | * (balbir@in.ibm.com). | ||
18 | */ | ||
19 | |||
20 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | ||
21 | enum cpuacct_stat_index { | ||
22 | CPUACCT_STAT_USER, /* ... user mode */ | ||
23 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | ||
24 | |||
25 | CPUACCT_STAT_NSTATS, | ||
26 | }; | ||
27 | |||
28 | /* track cpu usage of a group of tasks and its child groups */ | ||
29 | struct cpuacct { | ||
30 | struct cgroup_subsys_state css; | ||
31 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
32 | u64 __percpu *cpuusage; | ||
33 | struct kernel_cpustat __percpu *cpustat; | ||
34 | }; | ||
35 | |||
36 | /* return cpu accounting group corresponding to this container */ | ||
37 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
38 | { | ||
39 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
40 | struct cpuacct, css); | ||
41 | } | ||
42 | |||
43 | /* return cpu accounting group to which this task belongs */ | ||
44 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
45 | { | ||
46 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
47 | struct cpuacct, css); | ||
48 | } | ||
49 | |||
50 | static inline struct cpuacct *__parent_ca(struct cpuacct *ca) | ||
51 | { | ||
52 | return cgroup_ca(ca->css.cgroup->parent); | ||
53 | } | ||
54 | |||
55 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | ||
56 | { | ||
57 | if (!ca->css.cgroup->parent) | ||
58 | return NULL; | ||
59 | return cgroup_ca(ca->css.cgroup->parent); | ||
60 | } | ||
61 | |||
62 | static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); | ||
63 | static struct cpuacct root_cpuacct = { | ||
64 | .cpustat = &kernel_cpustat, | ||
65 | .cpuusage = &root_cpuacct_cpuusage, | ||
66 | }; | ||
67 | |||
68 | /* create a new cpu accounting group */ | ||
69 | static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) | ||
70 | { | ||
71 | struct cpuacct *ca; | ||
72 | |||
73 | if (!cgrp->parent) | ||
74 | return &root_cpuacct.css; | ||
75 | |||
76 | ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
77 | if (!ca) | ||
78 | goto out; | ||
79 | |||
80 | ca->cpuusage = alloc_percpu(u64); | ||
81 | if (!ca->cpuusage) | ||
82 | goto out_free_ca; | ||
83 | |||
84 | ca->cpustat = alloc_percpu(struct kernel_cpustat); | ||
85 | if (!ca->cpustat) | ||
86 | goto out_free_cpuusage; | ||
87 | |||
88 | return &ca->css; | ||
89 | |||
90 | out_free_cpuusage: | ||
91 | free_percpu(ca->cpuusage); | ||
92 | out_free_ca: | ||
93 | kfree(ca); | ||
94 | out: | ||
95 | return ERR_PTR(-ENOMEM); | ||
96 | } | ||
97 | |||
98 | /* destroy an existing cpu accounting group */ | ||
99 | static void cpuacct_css_free(struct cgroup *cgrp) | ||
100 | { | ||
101 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
102 | |||
103 | free_percpu(ca->cpustat); | ||
104 | free_percpu(ca->cpuusage); | ||
105 | kfree(ca); | ||
106 | } | ||
107 | |||
108 | static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) | ||
109 | { | ||
110 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
111 | u64 data; | ||
112 | |||
113 | #ifndef CONFIG_64BIT | ||
114 | /* | ||
115 | * Take rq->lock to make 64-bit read safe on 32-bit platforms. | ||
116 | */ | ||
117 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
118 | data = *cpuusage; | ||
119 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
120 | #else | ||
121 | data = *cpuusage; | ||
122 | #endif | ||
123 | |||
124 | return data; | ||
125 | } | ||
126 | |||
127 | static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | ||
128 | { | ||
129 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
130 | |||
131 | #ifndef CONFIG_64BIT | ||
132 | /* | ||
133 | * Take rq->lock to make 64-bit write safe on 32-bit platforms. | ||
134 | */ | ||
135 | raw_spin_lock_irq(&cpu_rq(cpu)->lock); | ||
136 | *cpuusage = val; | ||
137 | raw_spin_unlock_irq(&cpu_rq(cpu)->lock); | ||
138 | #else | ||
139 | *cpuusage = val; | ||
140 | #endif | ||
141 | } | ||
142 | |||
143 | /* return total cpu usage (in nanoseconds) of a group */ | ||
144 | static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) | ||
145 | { | ||
146 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
147 | u64 totalcpuusage = 0; | ||
148 | int i; | ||
149 | |||
150 | for_each_present_cpu(i) | ||
151 | totalcpuusage += cpuacct_cpuusage_read(ca, i); | ||
152 | |||
153 | return totalcpuusage; | ||
154 | } | ||
155 | |||
156 | static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, | ||
157 | u64 reset) | ||
158 | { | ||
159 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
160 | int err = 0; | ||
161 | int i; | ||
162 | |||
163 | if (reset) { | ||
164 | err = -EINVAL; | ||
165 | goto out; | ||
166 | } | ||
167 | |||
168 | for_each_present_cpu(i) | ||
169 | cpuacct_cpuusage_write(ca, i, 0); | ||
170 | |||
171 | out: | ||
172 | return err; | ||
173 | } | ||
174 | |||
175 | static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, | ||
176 | struct seq_file *m) | ||
177 | { | ||
178 | struct cpuacct *ca = cgroup_ca(cgroup); | ||
179 | u64 percpu; | ||
180 | int i; | ||
181 | |||
182 | for_each_present_cpu(i) { | ||
183 | percpu = cpuacct_cpuusage_read(ca, i); | ||
184 | seq_printf(m, "%llu ", (unsigned long long) percpu); | ||
185 | } | ||
186 | seq_printf(m, "\n"); | ||
187 | return 0; | ||
188 | } | ||
189 | |||
190 | static const char * const cpuacct_stat_desc[] = { | ||
191 | [CPUACCT_STAT_USER] = "user", | ||
192 | [CPUACCT_STAT_SYSTEM] = "system", | ||
193 | }; | ||
194 | |||
195 | static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, | ||
196 | struct cgroup_map_cb *cb) | ||
197 | { | ||
198 | struct cpuacct *ca = cgroup_ca(cgrp); | ||
199 | int cpu; | ||
200 | s64 val = 0; | ||
201 | |||
202 | for_each_online_cpu(cpu) { | ||
203 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
204 | val += kcpustat->cpustat[CPUTIME_USER]; | ||
205 | val += kcpustat->cpustat[CPUTIME_NICE]; | ||
206 | } | ||
207 | val = cputime64_to_clock_t(val); | ||
208 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); | ||
209 | |||
210 | val = 0; | ||
211 | for_each_online_cpu(cpu) { | ||
212 | struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); | ||
213 | val += kcpustat->cpustat[CPUTIME_SYSTEM]; | ||
214 | val += kcpustat->cpustat[CPUTIME_IRQ]; | ||
215 | val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; | ||
216 | } | ||
217 | |||
218 | val = cputime64_to_clock_t(val); | ||
219 | cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); | ||
220 | |||
221 | return 0; | ||
222 | } | ||
223 | |||
224 | static struct cftype files[] = { | ||
225 | { | ||
226 | .name = "usage", | ||
227 | .read_u64 = cpuusage_read, | ||
228 | .write_u64 = cpuusage_write, | ||
229 | }, | ||
230 | { | ||
231 | .name = "usage_percpu", | ||
232 | .read_seq_string = cpuacct_percpu_seq_read, | ||
233 | }, | ||
234 | { | ||
235 | .name = "stat", | ||
236 | .read_map = cpuacct_stats_show, | ||
237 | }, | ||
238 | { } /* terminate */ | ||
239 | }; | ||
240 | |||
241 | /* | ||
242 | * charge this task's execution time to its accounting group. | ||
243 | * | ||
244 | * called with rq->lock held. | ||
245 | */ | ||
246 | void cpuacct_charge(struct task_struct *tsk, u64 cputime) | ||
247 | { | ||
248 | struct cpuacct *ca; | ||
249 | int cpu; | ||
250 | |||
251 | cpu = task_cpu(tsk); | ||
252 | |||
253 | rcu_read_lock(); | ||
254 | |||
255 | ca = task_ca(tsk); | ||
256 | |||
257 | while (true) { | ||
258 | u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); | ||
259 | *cpuusage += cputime; | ||
260 | |||
261 | ca = parent_ca(ca); | ||
262 | if (!ca) | ||
263 | break; | ||
264 | } | ||
265 | |||
266 | rcu_read_unlock(); | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * Add user/system time to cpuacct. | ||
271 | * | ||
272 | * Note: it's the caller that updates the account of the root cgroup. | ||
273 | */ | ||
274 | void cpuacct_account_field(struct task_struct *p, int index, u64 val) | ||
275 | { | ||
276 | struct kernel_cpustat *kcpustat; | ||
277 | struct cpuacct *ca; | ||
278 | |||
279 | rcu_read_lock(); | ||
280 | ca = task_ca(p); | ||
281 | while (ca != &root_cpuacct) { | ||
282 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
283 | kcpustat->cpustat[index] += val; | ||
284 | ca = __parent_ca(ca); | ||
285 | } | ||
286 | rcu_read_unlock(); | ||
287 | } | ||
288 | |||
289 | struct cgroup_subsys cpuacct_subsys = { | ||
290 | .name = "cpuacct", | ||
291 | .css_alloc = cpuacct_css_alloc, | ||
292 | .css_free = cpuacct_css_free, | ||
293 | .subsys_id = cpuacct_subsys_id, | ||
294 | .base_cftypes = files, | ||
295 | .early_init = 1, | ||
296 | }; | ||
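The cftype table in the new file keeps exporting the same cgroup control files as before the code move: cpuacct.usage (total accumulated CPU time in nanoseconds), cpuacct.usage_percpu, and cpuacct.stat (user/system time in USER_HZ ticks, via cputime64_to_clock_t). A small sketch of a consumer, assuming the legacy cpuacct hierarchy is mounted at /sys/fs/cgroup/cpuacct and reading the root group; the mount point and the chosen group are assumptions of the sketch, not part of the patch:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f;

	/* total accumulated CPU time of the group, in nanoseconds */
	f = fopen("/sys/fs/cgroup/cpuacct/cpuacct.usage", "r");
	if (f) {
		unsigned long long ns;
		if (fscanf(f, "%llu", &ns) == 1)
			printf("total usage: %llu ns\n", ns);
		fclose(f);
	}

	/* two lines: "user <ticks>" and "system <ticks>", in USER_HZ */
	f = fopen("/sys/fs/cgroup/cpuacct/cpuacct.stat", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}
	return 0;
}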
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 000000000000..ed605624a5e7
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@ | |||
1 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2 | |||
3 | extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
4 | extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); | ||
5 | |||
6 | #else | ||
7 | |||
8 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) | ||
9 | { | ||
10 | } | ||
11 | |||
12 | static inline void | ||
13 | cpuacct_account_field(struct task_struct *p, int index, u64 val) | ||
14 | { | ||
15 | } | ||
16 | |||
17 | #endif | ||
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 699d59756ece..ea32f02bf2c3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void) | |||
115 | static inline void task_group_account_field(struct task_struct *p, int index, | 115 | static inline void task_group_account_field(struct task_struct *p, int index, |
116 | u64 tmp) | 116 | u64 tmp) |
117 | { | 117 | { |
118 | #ifdef CONFIG_CGROUP_CPUACCT | ||
119 | struct kernel_cpustat *kcpustat; | ||
120 | struct cpuacct *ca; | ||
121 | #endif | ||
122 | /* | 118 | /* |
123 | * Since all updates are sure to touch the root cgroup, we | 119 | * Since all updates are sure to touch the root cgroup, we |
124 | * get ourselves ahead and touch it first. If the root cgroup | 120 | * get ourselves ahead and touch it first. If the root cgroup |
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
127 | */ | 123 | */ |
128 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | 124 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; |
129 | 125 | ||
130 | #ifdef CONFIG_CGROUP_CPUACCT | 126 | cpuacct_account_field(p, index, tmp); |
131 | if (unlikely(!cpuacct_subsys.active)) | ||
132 | return; | ||
133 | |||
134 | rcu_read_lock(); | ||
135 | ca = task_ca(p); | ||
136 | while (ca && (ca != &root_cpuacct)) { | ||
137 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
138 | kcpustat->cpustat[index] += tmp; | ||
139 | ca = parent_ca(ca); | ||
140 | } | ||
141 | rcu_read_unlock(); | ||
142 | #endif | ||
143 | } | 127 | } |
144 | 128 | ||
145 | /* | 129 | /* |
@@ -310,7 +294,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
310 | 294 | ||
311 | t = tsk; | 295 | t = tsk; |
312 | do { | 296 | do { |
313 | task_cputime(tsk, &utime, &stime); | 297 | task_cputime(t, &utime, &stime); |
314 | times->utime += utime; | 298 | times->utime += utime; |
315 | times->stime += stime; | 299 | times->stime += stime; |
316 | times->sum_exec_runtime += task_sched_runtime(t); | 300 | times->sum_exec_runtime += task_sched_runtime(t); |
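The one-character fix in the last hunk (task_cputime(t, ...) instead of task_cputime(tsk, ...)) matters because the do/while loop walks every thread in the group; before the fix it kept re-reading the group leader's times, so the leader's time was counted once per thread. A toy sketch of the intended accumulation, with a plain struct standing in for task_struct (purely illustrative):

#include <stdio.h>

struct thread_times {
	unsigned long long utime, stime;
};

/*
 * Sum per-thread user/system time across a thread group. Reading
 * times[0] (the group leader) on every iteration, the pre-fix
 * behaviour, would multiply the leader's time by the thread count.
 */
static void group_cputime(const struct thread_times *times, int nr,
			  unsigned long long *utime,
			  unsigned long long *stime)
{
	int i;

	*utime = *stime = 0;
	for (i = 0; i < nr; i++) {
		*utime += times[i].utime;	/* times[i], not times[0] */
		*stime += times[i].stime;
	}
}

int main(void)
{
	struct thread_times t[3] = { {10, 1}, {20, 2}, {30, 3} };
	unsigned long long u, s;

	group_cputime(t, 3, &u, &s);
	printf("utime=%llu stime=%llu\n", u, s);	/* 60 and 6 */
	return 0;
}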
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c97fca091a7..c61a614465c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1563,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | |||
1563 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | 1563 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); |
1564 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ | 1564 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ |
1565 | } | 1565 | } |
1566 | |||
1567 | /* | ||
1568 | * Update the rq's load with the elapsed running time before entering | ||
1569 | * idle. if the last scheduled task is not a CFS task, idle_enter will | ||
1570 | * be the only way to update the runnable statistic. | ||
1571 | */ | ||
1572 | void idle_enter_fair(struct rq *this_rq) | ||
1573 | { | ||
1574 | update_rq_runnable_avg(this_rq, 1); | ||
1575 | } | ||
1576 | |||
1577 | /* | ||
1578 | * Update the rq's load with the elapsed idle time before a task is | ||
1579 | * scheduled. if the newly scheduled task is not a CFS task, idle_exit will | ||
1580 | * be the only way to update the runnable statistic. | ||
1581 | */ | ||
1582 | void idle_exit_fair(struct rq *this_rq) | ||
1583 | { | ||
1584 | update_rq_runnable_avg(this_rq, 0); | ||
1585 | } | ||
1586 | |||
1566 | #else | 1587 | #else |
1567 | static inline void update_entity_load_avg(struct sched_entity *se, | 1588 | static inline void update_entity_load_avg(struct sched_entity *se, |
1568 | int update_cfs_rq) {} | 1589 | int update_cfs_rq) {} |
@@ -3875,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3875 | int tsk_cache_hot = 0; | 3896 | int tsk_cache_hot = 0; |
3876 | /* | 3897 | /* |
3877 | * We do not migrate tasks that are: | 3898 | * We do not migrate tasks that are: |
3878 | * 1) running (obviously), or | 3899 | * 1) throttled_lb_pair, or |
3879 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 3900 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
3880 | * 3) are cache-hot on their current CPU. | 3901 | * 3) running (obviously), or |
3902 | * 4) are cache-hot on their current CPU. | ||
3881 | */ | 3903 | */ |
3904 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) | ||
3905 | return 0; | ||
3906 | |||
3882 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { | 3907 | if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { |
3883 | int new_dst_cpu; | 3908 | int cpu; |
3884 | 3909 | ||
3885 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); | 3910 | schedstat_inc(p, se.statistics.nr_failed_migrations_affine); |
3886 | 3911 | ||
@@ -3895,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3895 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) | 3920 | if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) |
3896 | return 0; | 3921 | return 0; |
3897 | 3922 | ||
3898 | new_dst_cpu = cpumask_first_and(env->dst_grpmask, | 3923 | /* Prevent to re-select dst_cpu via env's cpus */ |
3899 | tsk_cpus_allowed(p)); | 3924 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
3900 | if (new_dst_cpu < nr_cpu_ids) { | 3925 | if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { |
3901 | env->flags |= LBF_SOME_PINNED; | 3926 | env->flags |= LBF_SOME_PINNED; |
3902 | env->new_dst_cpu = new_dst_cpu; | 3927 | env->new_dst_cpu = cpu; |
3928 | break; | ||
3929 | } | ||
3903 | } | 3930 | } |
3931 | |||
3904 | return 0; | 3932 | return 0; |
3905 | } | 3933 | } |
3906 | 3934 | ||
@@ -3921,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
3921 | tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); | 3949 | tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); |
3922 | if (!tsk_cache_hot || | 3950 | if (!tsk_cache_hot || |
3923 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { | 3951 | env->sd->nr_balance_failed > env->sd->cache_nice_tries) { |
3924 | #ifdef CONFIG_SCHEDSTATS | 3952 | |
3925 | if (tsk_cache_hot) { | 3953 | if (tsk_cache_hot) { |
3926 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); | 3954 | schedstat_inc(env->sd, lb_hot_gained[env->idle]); |
3927 | schedstat_inc(p, se.statistics.nr_forced_migrations); | 3955 | schedstat_inc(p, se.statistics.nr_forced_migrations); |
3928 | } | 3956 | } |
3929 | #endif | 3957 | |
3930 | return 1; | 3958 | return 1; |
3931 | } | 3959 | } |
3932 | 3960 | ||
3933 | if (tsk_cache_hot) { | 3961 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); |
3934 | schedstat_inc(p, se.statistics.nr_failed_migrations_hot); | 3962 | return 0; |
3935 | return 0; | ||
3936 | } | ||
3937 | return 1; | ||
3938 | } | 3963 | } |
3939 | 3964 | ||
3940 | /* | 3965 | /* |
@@ -3949,9 +3974,6 @@ static int move_one_task(struct lb_env *env) | |||
3949 | struct task_struct *p, *n; | 3974 | struct task_struct *p, *n; |
3950 | 3975 | ||
3951 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { | 3976 | list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { |
3952 | if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) | ||
3953 | continue; | ||
3954 | |||
3955 | if (!can_migrate_task(p, env)) | 3977 | if (!can_migrate_task(p, env)) |
3956 | continue; | 3978 | continue; |
3957 | 3979 | ||
@@ -4003,7 +4025,7 @@ static int move_tasks(struct lb_env *env) | |||
4003 | break; | 4025 | break; |
4004 | } | 4026 | } |
4005 | 4027 | ||
4006 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) | 4028 | if (!can_migrate_task(p, env)) |
4007 | goto next; | 4029 | goto next; |
4008 | 4030 | ||
4009 | load = task_h_load(p); | 4031 | load = task_h_load(p); |
@@ -4014,9 +4036,6 @@ static int move_tasks(struct lb_env *env) | |||
4014 | if ((load / 2) > env->imbalance) | 4036 | if ((load / 2) > env->imbalance) |
4015 | goto next; | 4037 | goto next; |
4016 | 4038 | ||
4017 | if (!can_migrate_task(p, env)) | ||
4018 | goto next; | ||
4019 | |||
4020 | move_task(p, env); | 4039 | move_task(p, env); |
4021 | pulled++; | 4040 | pulled++; |
4022 | env->imbalance -= load; | 4041 | env->imbalance -= load; |
@@ -4961,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
4961 | #define MAX_PINNED_INTERVAL 512 | 4980 | #define MAX_PINNED_INTERVAL 512 |
4962 | 4981 | ||
4963 | /* Working cpumask for load_balance and load_balance_newidle. */ | 4982 | /* Working cpumask for load_balance and load_balance_newidle. */ |
4964 | DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 4983 | DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); |
4965 | 4984 | ||
4966 | static int need_active_balance(struct lb_env *env) | 4985 | static int need_active_balance(struct lb_env *env) |
4967 | { | 4986 | { |
@@ -4992,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
4992 | int *balance) | 5011 | int *balance) |
4993 | { | 5012 | { |
4994 | int ld_moved, cur_ld_moved, active_balance = 0; | 5013 | int ld_moved, cur_ld_moved, active_balance = 0; |
4995 | int lb_iterations, max_lb_iterations; | ||
4996 | struct sched_group *group; | 5014 | struct sched_group *group; |
4997 | struct rq *busiest; | 5015 | struct rq *busiest; |
4998 | unsigned long flags; | 5016 | unsigned long flags; |
4999 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 5017 | struct cpumask *cpus = __get_cpu_var(load_balance_mask); |
5000 | 5018 | ||
5001 | struct lb_env env = { | 5019 | struct lb_env env = { |
5002 | .sd = sd, | 5020 | .sd = sd, |
@@ -5008,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
5008 | .cpus = cpus, | 5026 | .cpus = cpus, |
5009 | }; | 5027 | }; |
5010 | 5028 | ||
5029 | /* | ||
5030 | * For NEWLY_IDLE load_balancing, we don't need to consider | ||
5031 | * other cpus in our group | ||
5032 | */ | ||
5033 | if (idle == CPU_NEWLY_IDLE) | ||
5034 | env.dst_grpmask = NULL; | ||
5035 | |||
5011 | cpumask_copy(cpus, cpu_active_mask); | 5036 | cpumask_copy(cpus, cpu_active_mask); |
5012 | max_lb_iterations = cpumask_weight(env.dst_grpmask); | ||
5013 | 5037 | ||
5014 | schedstat_inc(sd, lb_count[idle]); | 5038 | schedstat_inc(sd, lb_count[idle]); |
5015 | 5039 | ||
@@ -5035,7 +5059,6 @@ redo: | |||
5035 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 5059 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
5036 | 5060 | ||
5037 | ld_moved = 0; | 5061 | ld_moved = 0; |
5038 | lb_iterations = 1; | ||
5039 | if (busiest->nr_running > 1) { | 5062 | if (busiest->nr_running > 1) { |
5040 | /* | 5063 | /* |
5041 | * Attempt to move tasks. If find_busiest_group has found | 5064 | * Attempt to move tasks. If find_busiest_group has found |
@@ -5062,17 +5085,17 @@ more_balance: | |||
5062 | double_rq_unlock(env.dst_rq, busiest); | 5085 | double_rq_unlock(env.dst_rq, busiest); |
5063 | local_irq_restore(flags); | 5086 | local_irq_restore(flags); |
5064 | 5087 | ||
5065 | if (env.flags & LBF_NEED_BREAK) { | ||
5066 | env.flags &= ~LBF_NEED_BREAK; | ||
5067 | goto more_balance; | ||
5068 | } | ||
5069 | |||
5070 | /* | 5088 | /* |
5071 | * some other cpu did the load balance for us. | 5089 | * some other cpu did the load balance for us. |
5072 | */ | 5090 | */ |
5073 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) | 5091 | if (cur_ld_moved && env.dst_cpu != smp_processor_id()) |
5074 | resched_cpu(env.dst_cpu); | 5092 | resched_cpu(env.dst_cpu); |
5075 | 5093 | ||
5094 | if (env.flags & LBF_NEED_BREAK) { | ||
5095 | env.flags &= ~LBF_NEED_BREAK; | ||
5096 | goto more_balance; | ||
5097 | } | ||
5098 | |||
5076 | /* | 5099 | /* |
5077 | * Revisit (affine) tasks on src_cpu that couldn't be moved to | 5100 | * Revisit (affine) tasks on src_cpu that couldn't be moved to |
5078 | * us and move them to an alternate dst_cpu in our sched_group | 5101 | * us and move them to an alternate dst_cpu in our sched_group |
@@ -5092,14 +5115,17 @@ more_balance: | |||
5092 | * moreover subsequent load balance cycles should correct the | 5115 | * moreover subsequent load balance cycles should correct the |
5093 | * excess load moved. | 5116 | * excess load moved. |
5094 | */ | 5117 | */ |
5095 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && | 5118 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { |
5096 | lb_iterations++ < max_lb_iterations) { | ||
5097 | 5119 | ||
5098 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 5120 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
5099 | env.dst_cpu = env.new_dst_cpu; | 5121 | env.dst_cpu = env.new_dst_cpu; |
5100 | env.flags &= ~LBF_SOME_PINNED; | 5122 | env.flags &= ~LBF_SOME_PINNED; |
5101 | env.loop = 0; | 5123 | env.loop = 0; |
5102 | env.loop_break = sched_nr_migrate_break; | 5124 | env.loop_break = sched_nr_migrate_break; |
5125 | |||
5126 | /* Prevent to re-select dst_cpu via env's cpus */ | ||
5127 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | ||
5128 | |||
5103 | /* | 5129 | /* |
5104 | * Go back to "more_balance" rather than "redo" since we | 5130 | * Go back to "more_balance" rather than "redo" since we |
5105 | * need to continue with same src_cpu. | 5131 | * need to continue with same src_cpu. |
@@ -5220,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) | |||
5220 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | 5246 | if (this_rq->avg_idle < sysctl_sched_migration_cost) |
5221 | return; | 5247 | return; |
5222 | 5248 | ||
5223 | update_rq_runnable_avg(this_rq, 1); | ||
5224 | |||
5225 | /* | 5249 | /* |
5226 | * Drop the rq->lock, but keep IRQ/preempt disabled. | 5250 | * Drop the rq->lock, but keep IRQ/preempt disabled. |
5227 | */ | 5251 | */ |
@@ -5396,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void) | |||
5396 | struct sched_domain *sd; | 5420 | struct sched_domain *sd; |
5397 | int cpu = smp_processor_id(); | 5421 | int cpu = smp_processor_id(); |
5398 | 5422 | ||
5399 | if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) | ||
5400 | return; | ||
5401 | clear_bit(NOHZ_IDLE, nohz_flags(cpu)); | ||
5402 | |||
5403 | rcu_read_lock(); | 5423 | rcu_read_lock(); |
5404 | for_each_domain(cpu, sd) | 5424 | sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); |
5425 | |||
5426 | if (!sd || !sd->nohz_idle) | ||
5427 | goto unlock; | ||
5428 | sd->nohz_idle = 0; | ||
5429 | |||
5430 | for (; sd; sd = sd->parent) | ||
5405 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); | 5431 | atomic_inc(&sd->groups->sgp->nr_busy_cpus); |
5432 | unlock: | ||
5406 | rcu_read_unlock(); | 5433 | rcu_read_unlock(); |
5407 | } | 5434 | } |
5408 | 5435 | ||
@@ -5411,13 +5438,16 @@ void set_cpu_sd_state_idle(void) | |||
5411 | struct sched_domain *sd; | 5438 | struct sched_domain *sd; |
5412 | int cpu = smp_processor_id(); | 5439 | int cpu = smp_processor_id(); |
5413 | 5440 | ||
5414 | if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) | ||
5415 | return; | ||
5416 | set_bit(NOHZ_IDLE, nohz_flags(cpu)); | ||
5417 | |||
5418 | rcu_read_lock(); | 5441 | rcu_read_lock(); |
5419 | for_each_domain(cpu, sd) | 5442 | sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); |
5443 | |||
5444 | if (!sd || sd->nohz_idle) | ||
5445 | goto unlock; | ||
5446 | sd->nohz_idle = 1; | ||
5447 | |||
5448 | for (; sd; sd = sd->parent) | ||
5420 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); | 5449 | atomic_dec(&sd->groups->sgp->nr_busy_cpus); |
5450 | unlock: | ||
5421 | rcu_read_unlock(); | 5451 | rcu_read_unlock(); |
5422 | } | 5452 | } |
5423 | 5453 | ||
@@ -5469,7 +5499,7 @@ void update_max_interval(void) | |||
5469 | * It checks each scheduling domain to see if it is due to be balanced, | 5499 | * It checks each scheduling domain to see if it is due to be balanced, |
5470 | * and initiates a balancing operation if so. | 5500 | * and initiates a balancing operation if so. |
5471 | * | 5501 | * |
5472 | * Balancing parameters are set up in arch_init_sched_domains. | 5502 | * Balancing parameters are set up in init_sched_domains. |
5473 | */ | 5503 | */ |
5474 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) | 5504 | static void rebalance_domains(int cpu, enum cpu_idle_type idle) |
5475 | { | 5505 | { |
@@ -5507,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
5507 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 5537 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
5508 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 5538 | if (load_balance(cpu, rq, sd, idle, &balance)) { |
5509 | /* | 5539 | /* |
5510 | * We've pulled tasks over so either we're no | 5540 | * The LBF_SOME_PINNED logic could have changed |
5511 | * longer idle. | 5541 | * env->dst_cpu, so we can't know our idle |
5542 | * state even if we migrated tasks. Update it. | ||
5512 | */ | 5543 | */ |
5513 | idle = CPU_NOT_IDLE; | 5544 | idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; |
5514 | } | 5545 | } |
5515 | sd->last_balance = jiffies; | 5546 | sd->last_balance = jiffies; |
5516 | } | 5547 | } |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b5395f..99399f8e4799 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false) | |||
46 | SCHED_FEAT(LB_BIAS, true) | 46 | SCHED_FEAT(LB_BIAS, true) |
47 | 47 | ||
48 | /* | 48 | /* |
49 | * Spin-wait on mutex acquisition when the mutex owner is running on | ||
50 | * another cpu -- assumes that when the owner is running, it will soon | ||
51 | * release the lock. Decreases scheduling overhead. | ||
52 | */ | ||
53 | SCHED_FEAT(OWNER_SPIN, true) | ||
54 | |||
55 | /* | ||
56 | * Decrement CPU power based on time not spent running tasks | 49 | * Decrement CPU power based on time not spent running tasks |
57 | */ | 50 | */ |
58 | SCHED_FEAT(NONTASK_POWER, true) | 51 | SCHED_FEAT(NONTASK_POWER, true) |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae9..b8ce77328341 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) | |||
13 | { | 13 | { |
14 | return task_cpu(p); /* IDLE tasks as never migrated */ | 14 | return task_cpu(p); /* IDLE tasks as never migrated */ |
15 | } | 15 | } |
16 | |||
17 | static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) | ||
18 | { | ||
19 | idle_exit_fair(rq); | ||
20 | } | ||
21 | |||
22 | static void post_schedule_idle(struct rq *rq) | ||
23 | { | ||
24 | idle_enter_fair(rq); | ||
25 | } | ||
16 | #endif /* CONFIG_SMP */ | 26 | #endif /* CONFIG_SMP */ |
17 | /* | 27 | /* |
18 | * Idle tasks are unconditionally rescheduled: | 28 | * Idle tasks are unconditionally rescheduled: |
@@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
25 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 35 | static struct task_struct *pick_next_task_idle(struct rq *rq) |
26 | { | 36 | { |
27 | schedstat_inc(rq, sched_goidle); | 37 | schedstat_inc(rq, sched_goidle); |
38 | #ifdef CONFIG_SMP | ||
39 | /* Trigger the post schedule to do an idle_enter for CFS */ | ||
40 | rq->post_schedule = 1; | ||
41 | #endif | ||
28 | return rq->idle; | 42 | return rq->idle; |
29 | } | 43 | } |
30 | 44 | ||
@@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = { | |||
86 | 100 | ||
87 | #ifdef CONFIG_SMP | 101 | #ifdef CONFIG_SMP |
88 | .select_task_rq = select_task_rq_idle, | 102 | .select_task_rq = select_task_rq_idle, |
103 | .pre_schedule = pre_schedule_idle, | ||
104 | .post_schedule = post_schedule_idle, | ||
89 | #endif | 105 | #endif |
90 | 106 | ||
91 | .set_curr_task = set_curr_task_idle, | 107 | .set_curr_task = set_curr_task_idle, |
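The idle class above reuses the existing pre_schedule/post_schedule callbacks so that idle_exit_fair() runs when the CPU leaves the idle task and idle_enter_fair() runs once it has switched to it. A compressed user-space sketch of how such optional per-class hooks can be dispatched; the dispatch function and names below are illustrative only, not the scheduler's actual call sites:

#include <stdio.h>

struct rq;

struct sched_class {
	const char *name;
	void (*pre_schedule)(struct rq *rq);	/* optional, may be NULL */
	void (*post_schedule)(struct rq *rq);	/* optional, may be NULL */
};

struct rq {
	const struct sched_class *curr_class;
	int post_schedule;			/* request flag, as in the patch */
};

static void idle_pre(struct rq *rq)  { (void)rq; printf("idle_exit_fair()\n"); }
static void idle_post(struct rq *rq) { (void)rq; printf("idle_enter_fair()\n"); }

static const struct sched_class fair_class = { .name = "fair" };
static const struct sched_class idle_class = {
	.name = "idle", .pre_schedule = idle_pre, .post_schedule = idle_post,
};

/* illustrative: prev class hook before picking, next class hook afterwards */
static void schedule_step(struct rq *rq, const struct sched_class *next)
{
	if (rq->curr_class->pre_schedule)
		rq->curr_class->pre_schedule(rq);

	rq->curr_class = next;			/* "pick_next_task" */
	if (next == &idle_class)
		rq->post_schedule = 1;		/* as pick_next_task_idle() now does */

	if (rq->post_schedule) {
		rq->post_schedule = 0;
		if (rq->curr_class->post_schedule)
			rq->curr_class->post_schedule(rq);
	}
}

int main(void)
{
	struct rq rq = { .curr_class = &fair_class };

	schedule_step(&rq, &idle_class);	/* going idle   -> idle_enter_fair */
	schedule_step(&rq, &fair_class);	/* leaving idle -> idle_exit_fair  */
	return 0;
}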
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eb363aa5d83c..24dc29897749 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/tick.h> | 8 | #include <linux/tick.h> |
9 | 9 | ||
10 | #include "cpupri.h" | 10 | #include "cpupri.h" |
11 | #include "cpuacct.h" | ||
11 | 12 | ||
12 | extern __read_mostly int scheduler_running; | 13 | extern __read_mostly int scheduler_running; |
13 | 14 | ||
@@ -951,14 +952,6 @@ static const u32 prio_to_wmult[40] = { | |||
951 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 952 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
952 | }; | 953 | }; |
953 | 954 | ||
954 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | ||
955 | enum cpuacct_stat_index { | ||
956 | CPUACCT_STAT_USER, /* ... user mode */ | ||
957 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | ||
958 | |||
959 | CPUACCT_STAT_NSTATS, | ||
960 | }; | ||
961 | |||
962 | #define ENQUEUE_WAKEUP 1 | 955 | #define ENQUEUE_WAKEUP 1 |
963 | #define ENQUEUE_HEAD 2 | 956 | #define ENQUEUE_HEAD 2 |
964 | #ifdef CONFIG_SMP | 957 | #ifdef CONFIG_SMP |
@@ -1032,6 +1025,18 @@ extern void update_group_power(struct sched_domain *sd, int cpu); | |||
1032 | extern void trigger_load_balance(struct rq *rq, int cpu); | 1025 | extern void trigger_load_balance(struct rq *rq, int cpu); |
1033 | extern void idle_balance(int this_cpu, struct rq *this_rq); | 1026 | extern void idle_balance(int this_cpu, struct rq *this_rq); |
1034 | 1027 | ||
1028 | /* | ||
1029 | * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg | ||
1030 | * becomes useful in lb | ||
1031 | */ | ||
1032 | #if defined(CONFIG_FAIR_GROUP_SCHED) | ||
1033 | extern void idle_enter_fair(struct rq *this_rq); | ||
1034 | extern void idle_exit_fair(struct rq *this_rq); | ||
1035 | #else | ||
1036 | static inline void idle_enter_fair(struct rq *this_rq) {} | ||
1037 | static inline void idle_exit_fair(struct rq *this_rq) {} | ||
1038 | #endif | ||
1039 | |||
1035 | #else /* CONFIG_SMP */ | 1040 | #else /* CONFIG_SMP */ |
1036 | 1041 | ||
1037 | static inline void idle_balance(int cpu, struct rq *rq) | 1042 | static inline void idle_balance(int cpu, struct rq *rq) |
@@ -1055,45 +1060,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime | |||
1055 | 1060 | ||
1056 | extern void update_idle_cpu_load(struct rq *this_rq); | 1061 | extern void update_idle_cpu_load(struct rq *this_rq); |
1057 | 1062 | ||
1058 | #ifdef CONFIG_CGROUP_CPUACCT | ||
1059 | #include <linux/cgroup.h> | ||
1060 | /* track cpu usage of a group of tasks and its child groups */ | ||
1061 | struct cpuacct { | ||
1062 | struct cgroup_subsys_state css; | ||
1063 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
1064 | u64 __percpu *cpuusage; | ||
1065 | struct kernel_cpustat __percpu *cpustat; | ||
1066 | }; | ||
1067 | |||
1068 | extern struct cgroup_subsys cpuacct_subsys; | ||
1069 | extern struct cpuacct root_cpuacct; | ||
1070 | |||
1071 | /* return cpu accounting group corresponding to this container */ | ||
1072 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | ||
1073 | { | ||
1074 | return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), | ||
1075 | struct cpuacct, css); | ||
1076 | } | ||
1077 | |||
1078 | /* return cpu accounting group to which this task belongs */ | ||
1079 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
1080 | { | ||
1081 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
1082 | struct cpuacct, css); | ||
1083 | } | ||
1084 | |||
1085 | static inline struct cpuacct *parent_ca(struct cpuacct *ca) | ||
1086 | { | ||
1087 | if (!ca || !ca->css.cgroup->parent) | ||
1088 | return NULL; | ||
1089 | return cgroup_ca(ca->css.cgroup->parent); | ||
1090 | } | ||
1091 | |||
1092 | extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
1093 | #else | ||
1094 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | ||
1095 | #endif | ||
1096 | |||
1097 | #ifdef CONFIG_PARAVIRT | 1063 | #ifdef CONFIG_PARAVIRT |
1098 | static inline u64 steal_ticks(u64 steal) | 1064 | static inline u64 steal_ticks(u64 steal) |
1099 | { | 1065 | { |
@@ -1348,7 +1314,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | |||
1348 | enum rq_nohz_flag_bits { | 1314 | enum rq_nohz_flag_bits { |
1349 | NOHZ_TICK_STOPPED, | 1315 | NOHZ_TICK_STOPPED, |
1350 | NOHZ_BALANCE_KICK, | 1316 | NOHZ_BALANCE_KICK, |
1351 | NOHZ_IDLE, | ||
1352 | }; | 1317 | }; |
1353 | 1318 | ||
1354 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | 1319 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) |