author    Linus Torvalds <torvalds@linux-foundation.org>  2013-04-30 10:43:28 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2013-04-30 10:43:28 -0400
commit    16fa94b532b1958f508e07eca1a9256351241fbc (patch)
tree      90012a7b7fe2b8cf96f6f5ec12490e0c5e152291 /kernel/sched
parent    e0972916e8fe943f342b0dd1c9d43dbf5bc261c2 (diff)
parent    25f55d9d01ad7a7ad248fd5af1d22675ffd202c5 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
 "The main changes in this development cycle were:

   - full dynticks preparatory work by Frederic Weisbecker

   - factor out the cpu time accounting code better, by Li Zefan

   - multi-CPU load balancer cleanups and improvements by Joonsoo Kim

   - various smaller fixes and cleanups"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (45 commits)
  sched: Fix init NOHZ_IDLE flag
  sched: Prevent to re-select dst-cpu in load_balance()
  sched: Rename load_balance_tmpmask to load_balance_mask
  sched: Move up affinity check to mitigate useless redoing overhead
  sched: Don't consider other cpus in our group in case of NEWLY_IDLE
  sched: Explicitly cpu_idle_type checking in rebalance_domains()
  sched: Change position of resched_cpu() in load_balance()
  sched: Fix wrong rq's runnable_avg update with rt tasks
  sched: Document task_struct::personality field
  sched/cpuacct/UML: Fix header file dependency bug on the UML build
  cgroup: Kill subsys.active flag
  sched/cpuacct: No need to check subsys active state
  sched/cpuacct: Initialize cpuacct subsystem earlier
  sched/cpuacct: Initialize root cpuacct earlier
  sched/cpuacct: Allocate per_cpu cpuusage for root cpuacct statically
  sched/cpuacct: Clean up cpuacct.h
  sched/cpuacct: Remove redundant NULL checks in cpuacct_acount_field()
  sched/cpuacct: Remove redundant NULL checks in cpuacct_charge()
  sched/cpuacct: Add cpuacct_acount_field()
  sched/cpuacct: Add cpuacct_init()
  ...
Diffstat (limited to 'kernel/sched')
 -rw-r--r--  kernel/sched/Makefile    |   1
 -rw-r--r--  kernel/sched/core.c      | 254
 -rw-r--r--  kernel/sched/cpuacct.c   | 296
 -rw-r--r--  kernel/sched/cpuacct.h   |  17
 -rw-r--r--  kernel/sched/cputime.c   | 214
 -rw-r--r--  kernel/sched/fair.c      | 148
 -rw-r--r--  kernel/sched/idle_task.c |  16
 -rw-r--r--  kernel/sched/sched.h     | 219
 8 files changed, 718 insertions(+), 447 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index f06d249e103b..deaf90e4a1de 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8285eb0cde6..ebdb19541218 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1288,8 +1288,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1288static void 1288static void
1289ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1289ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1290{ 1290{
1291 trace_sched_wakeup(p, true);
1292 check_preempt_curr(rq, p, wake_flags); 1291 check_preempt_curr(rq, p, wake_flags);
1292 trace_sched_wakeup(p, true);
1293 1293
1294 p->state = TASK_RUNNING; 1294 p->state = TASK_RUNNING;
1295#ifdef CONFIG_SMP 1295#ifdef CONFIG_SMP
@@ -3039,11 +3039,13 @@ EXPORT_SYMBOL(preempt_schedule);
3039asmlinkage void __sched preempt_schedule_irq(void) 3039asmlinkage void __sched preempt_schedule_irq(void)
3040{ 3040{
3041 struct thread_info *ti = current_thread_info(); 3041 struct thread_info *ti = current_thread_info();
3042 enum ctx_state prev_state;
3042 3043
3043 /* Catch callers which need to be fixed */ 3044 /* Catch callers which need to be fixed */
3044 BUG_ON(ti->preempt_count || !irqs_disabled()); 3045 BUG_ON(ti->preempt_count || !irqs_disabled());
3045 3046
3046 user_exit(); 3047 prev_state = exception_enter();
3048
3047 do { 3049 do {
3048 add_preempt_count(PREEMPT_ACTIVE); 3050 add_preempt_count(PREEMPT_ACTIVE);
3049 local_irq_enable(); 3051 local_irq_enable();
@@ -3057,6 +3059,8 @@ asmlinkage void __sched preempt_schedule_irq(void)
3057 */ 3059 */
3058 barrier(); 3060 barrier();
3059 } while (need_resched()); 3061 } while (need_resched());
3062
3063 exception_exit(prev_state);
3060} 3064}
3061 3065
3062#endif /* CONFIG_PREEMPT */ 3066#endif /* CONFIG_PREEMPT */
@@ -6204,7 +6208,7 @@ static void sched_init_numa(void)
6204 * 'level' contains the number of unique distances, excluding the 6208 * 'level' contains the number of unique distances, excluding the
6205 * identity distance node_distance(i,i). 6209 * identity distance node_distance(i,i).
6206 * 6210 *
6207 * The sched_domains_nume_distance[] array includes the actual distance 6211 * The sched_domains_numa_distance[] array includes the actual distance
6208 * numbers. 6212 * numbers.
6209 */ 6213 */
6210 6214
@@ -6817,11 +6821,15 @@ int in_sched_functions(unsigned long addr)
6817} 6821}
6818 6822
6819#ifdef CONFIG_CGROUP_SCHED 6823#ifdef CONFIG_CGROUP_SCHED
6824/*
6825 * Default task group.
6826 * Every task in system belongs to this group at bootup.
6827 */
6820struct task_group root_task_group; 6828struct task_group root_task_group;
6821LIST_HEAD(task_groups); 6829LIST_HEAD(task_groups);
6822#endif 6830#endif
6823 6831
6824DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 6832DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6825 6833
6826void __init sched_init(void) 6834void __init sched_init(void)
6827{ 6835{
@@ -6858,7 +6866,7 @@ void __init sched_init(void)
6858#endif /* CONFIG_RT_GROUP_SCHED */ 6866#endif /* CONFIG_RT_GROUP_SCHED */
6859#ifdef CONFIG_CPUMASK_OFFSTACK 6867#ifdef CONFIG_CPUMASK_OFFSTACK
6860 for_each_possible_cpu(i) { 6868 for_each_possible_cpu(i) {
6861 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 6869 per_cpu(load_balance_mask, i) = (void *)ptr;
6862 ptr += cpumask_size(); 6870 ptr += cpumask_size();
6863 } 6871 }
6864#endif /* CONFIG_CPUMASK_OFFSTACK */ 6872#endif /* CONFIG_CPUMASK_OFFSTACK */
@@ -6884,12 +6892,6 @@ void __init sched_init(void)
6884 6892
6885#endif /* CONFIG_CGROUP_SCHED */ 6893#endif /* CONFIG_CGROUP_SCHED */
6886 6894
6887#ifdef CONFIG_CGROUP_CPUACCT
6888 root_cpuacct.cpustat = &kernel_cpustat;
6889 root_cpuacct.cpuusage = alloc_percpu(u64);
6890 /* Too early, not expected to fail */
6891 BUG_ON(!root_cpuacct.cpuusage);
6892#endif
6893 for_each_possible_cpu(i) { 6895 for_each_possible_cpu(i) {
6894 struct rq *rq; 6896 struct rq *rq;
6895 6897
@@ -7411,7 +7413,7 @@ unlock:
7411 return err; 7413 return err;
7412} 7414}
7413 7415
7414int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7416static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7415{ 7417{
7416 u64 rt_runtime, rt_period; 7418 u64 rt_runtime, rt_period;
7417 7419
@@ -7423,7 +7425,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7423 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7425 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7424} 7426}
7425 7427
7426long sched_group_rt_runtime(struct task_group *tg) 7428static long sched_group_rt_runtime(struct task_group *tg)
7427{ 7429{
7428 u64 rt_runtime_us; 7430 u64 rt_runtime_us;
7429 7431
@@ -7435,7 +7437,7 @@ long sched_group_rt_runtime(struct task_group *tg)
7435 return rt_runtime_us; 7437 return rt_runtime_us;
7436} 7438}
7437 7439
7438int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7440static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7439{ 7441{
7440 u64 rt_runtime, rt_period; 7442 u64 rt_runtime, rt_period;
7441 7443
@@ -7448,7 +7450,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7448 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7450 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7449} 7451}
7450 7452
7451long sched_group_rt_period(struct task_group *tg) 7453static long sched_group_rt_period(struct task_group *tg)
7452{ 7454{
7453 u64 rt_period_us; 7455 u64 rt_period_us;
7454 7456
@@ -7483,7 +7485,7 @@ static int sched_rt_global_constraints(void)
7483 return ret; 7485 return ret;
7484} 7486}
7485 7487
7486int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7488static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7487{ 7489{
7488 /* Don't accept realtime tasks when there is no way for them to run */ 7490 /* Don't accept realtime tasks when there is no way for them to run */
7489 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7491 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
@@ -7991,226 +7993,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7991 7993
7992#endif /* CONFIG_CGROUP_SCHED */ 7994#endif /* CONFIG_CGROUP_SCHED */
7993 7995
7994#ifdef CONFIG_CGROUP_CPUACCT
7995
7996/*
7997 * CPU accounting code for task groups.
7998 *
7999 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
8000 * (balbir@in.ibm.com).
8001 */
8002
8003struct cpuacct root_cpuacct;
8004
8005/* create a new cpu accounting group */
8006static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
8007{
8008 struct cpuacct *ca;
8009
8010 if (!cgrp->parent)
8011 return &root_cpuacct.css;
8012
8013 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8014 if (!ca)
8015 goto out;
8016
8017 ca->cpuusage = alloc_percpu(u64);
8018 if (!ca->cpuusage)
8019 goto out_free_ca;
8020
8021 ca->cpustat = alloc_percpu(struct kernel_cpustat);
8022 if (!ca->cpustat)
8023 goto out_free_cpuusage;
8024
8025 return &ca->css;
8026
8027out_free_cpuusage:
8028 free_percpu(ca->cpuusage);
8029out_free_ca:
8030 kfree(ca);
8031out:
8032 return ERR_PTR(-ENOMEM);
8033}
8034
8035/* destroy an existing cpu accounting group */
8036static void cpuacct_css_free(struct cgroup *cgrp)
8037{
8038 struct cpuacct *ca = cgroup_ca(cgrp);
8039
8040 free_percpu(ca->cpustat);
8041 free_percpu(ca->cpuusage);
8042 kfree(ca);
8043}
8044
8045static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8046{
8047 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8048 u64 data;
8049
8050#ifndef CONFIG_64BIT
8051 /*
8052 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
8053 */
8054 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8055 data = *cpuusage;
8056 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8057#else
8058 data = *cpuusage;
8059#endif
8060
8061 return data;
8062}
8063
8064static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8065{
8066 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8067
8068#ifndef CONFIG_64BIT
8069 /*
8070 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8071 */
8072 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8073 *cpuusage = val;
8074 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8075#else
8076 *cpuusage = val;
8077#endif
8078}
8079
8080/* return total cpu usage (in nanoseconds) of a group */
8081static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8082{
8083 struct cpuacct *ca = cgroup_ca(cgrp);
8084 u64 totalcpuusage = 0;
8085 int i;
8086
8087 for_each_present_cpu(i)
8088 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8089
8090 return totalcpuusage;
8091}
8092
8093static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8094 u64 reset)
8095{
8096 struct cpuacct *ca = cgroup_ca(cgrp);
8097 int err = 0;
8098 int i;
8099
8100 if (reset) {
8101 err = -EINVAL;
8102 goto out;
8103 }
8104
8105 for_each_present_cpu(i)
8106 cpuacct_cpuusage_write(ca, i, 0);
8107
8108out:
8109 return err;
8110}
8111
8112static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8113 struct seq_file *m)
8114{
8115 struct cpuacct *ca = cgroup_ca(cgroup);
8116 u64 percpu;
8117 int i;
8118
8119 for_each_present_cpu(i) {
8120 percpu = cpuacct_cpuusage_read(ca, i);
8121 seq_printf(m, "%llu ", (unsigned long long) percpu);
8122 }
8123 seq_printf(m, "\n");
8124 return 0;
8125}
8126
8127static const char *cpuacct_stat_desc[] = {
8128 [CPUACCT_STAT_USER] = "user",
8129 [CPUACCT_STAT_SYSTEM] = "system",
8130};
8131
8132static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8133 struct cgroup_map_cb *cb)
8134{
8135 struct cpuacct *ca = cgroup_ca(cgrp);
8136 int cpu;
8137 s64 val = 0;
8138
8139 for_each_online_cpu(cpu) {
8140 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8141 val += kcpustat->cpustat[CPUTIME_USER];
8142 val += kcpustat->cpustat[CPUTIME_NICE];
8143 }
8144 val = cputime64_to_clock_t(val);
8145 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8146
8147 val = 0;
8148 for_each_online_cpu(cpu) {
8149 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8150 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8151 val += kcpustat->cpustat[CPUTIME_IRQ];
8152 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8153 }
8154
8155 val = cputime64_to_clock_t(val);
8156 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8157
8158 return 0;
8159}
8160
8161static struct cftype files[] = {
8162 {
8163 .name = "usage",
8164 .read_u64 = cpuusage_read,
8165 .write_u64 = cpuusage_write,
8166 },
8167 {
8168 .name = "usage_percpu",
8169 .read_seq_string = cpuacct_percpu_seq_read,
8170 },
8171 {
8172 .name = "stat",
8173 .read_map = cpuacct_stats_show,
8174 },
8175 { } /* terminate */
8176};
8177
8178/*
8179 * charge this task's execution time to its accounting group.
8180 *
8181 * called with rq->lock held.
8182 */
8183void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8184{
8185 struct cpuacct *ca;
8186 int cpu;
8187
8188 if (unlikely(!cpuacct_subsys.active))
8189 return;
8190
8191 cpu = task_cpu(tsk);
8192
8193 rcu_read_lock();
8194
8195 ca = task_ca(tsk);
8196
8197 for (; ca; ca = parent_ca(ca)) {
8198 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8199 *cpuusage += cputime;
8200 }
8201
8202 rcu_read_unlock();
8203}
8204
8205struct cgroup_subsys cpuacct_subsys = {
8206 .name = "cpuacct",
8207 .css_alloc = cpuacct_css_alloc,
8208 .css_free = cpuacct_css_free,
8209 .subsys_id = cpuacct_subsys_id,
8210 .base_cftypes = files,
8211};
8212#endif /* CONFIG_CGROUP_CPUACCT */
8213
8214void dump_cpu_task(int cpu) 7996void dump_cpu_task(int cpu)
8215{ 7997{
8216 pr_info("Task dump for CPU %d:\n", cpu); 7998 pr_info("Task dump for CPU %d:\n", cpu);
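The preempt_schedule_irq() hunk above replaces the bare user_exit() call with an exception_enter()/exception_exit() pair, so the previous context-tracking state is saved and then restored around the preemption loop. A toy userspace sketch of that save/restore pattern follows; the ctx_state enum and helper names are hypothetical stand-ins, not the kernel's context-tracking API:

#include <stdio.h>

/* Hypothetical stand-in for the kernel's context-tracking state. */
enum ctx_state { CTX_KERNEL, CTX_USER };

static enum ctx_state current_state = CTX_USER;

/* Save the current state and switch to kernel context, like exception_enter(). */
static enum ctx_state exception_enter_sketch(void)
{
        enum ctx_state prev = current_state;

        current_state = CTX_KERNEL;
        return prev;
}

/* Restore whatever state was active before the exception, like exception_exit(). */
static void exception_exit_sketch(enum ctx_state prev)
{
        current_state = prev;
}

int main(void)
{
        enum ctx_state prev = exception_enter_sketch();

        /* ... the preemption loop would run here, in kernel context ... */

        exception_exit_sketch(prev);
        printf("restored state: %s\n", current_state == CTX_USER ? "user" : "kernel");
        return 0;
}

Returning the previous state from the enter helper is what lets nested exceptions restore correctly, which a plain user_exit()/user_enter() pair could not guarantee.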
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 000000000000..dbb7e2cd95eb
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,296 @@
1#include <linux/cgroup.h>
2#include <linux/slab.h>
3#include <linux/percpu.h>
4#include <linux/spinlock.h>
5#include <linux/cpumask.h>
6#include <linux/seq_file.h>
7#include <linux/rcupdate.h>
8#include <linux/kernel_stat.h>
9#include <linux/err.h>
10
11#include "sched.h"
12
13/*
14 * CPU accounting code for task groups.
15 *
16 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
17 * (balbir@in.ibm.com).
18 */
19
20/* Time spent by the tasks of the cpu accounting group executing in ... */
21enum cpuacct_stat_index {
22 CPUACCT_STAT_USER, /* ... user mode */
23 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
24
25 CPUACCT_STAT_NSTATS,
26};
27
28/* track cpu usage of a group of tasks and its child groups */
29struct cpuacct {
30 struct cgroup_subsys_state css;
31 /* cpuusage holds pointer to a u64-type object on every cpu */
32 u64 __percpu *cpuusage;
33 struct kernel_cpustat __percpu *cpustat;
34};
35
36/* return cpu accounting group corresponding to this container */
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
40 struct cpuacct, css);
41}
42
43/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53}
54
55static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{
57 if (!ca->css.cgroup->parent)
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60}
61
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
63static struct cpuacct root_cpuacct = {
64 .cpustat = &kernel_cpustat,
65 .cpuusage = &root_cpuacct_cpuusage,
66};
67
68/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
70{
71 struct cpuacct *ca;
72
73 if (!cgrp->parent)
74 return &root_cpuacct.css;
75
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
77 if (!ca)
78 goto out;
79
80 ca->cpuusage = alloc_percpu(u64);
81 if (!ca->cpuusage)
82 goto out_free_ca;
83
84 ca->cpustat = alloc_percpu(struct kernel_cpustat);
85 if (!ca->cpustat)
86 goto out_free_cpuusage;
87
88 return &ca->css;
89
90out_free_cpuusage:
91 free_percpu(ca->cpuusage);
92out_free_ca:
93 kfree(ca);
94out:
95 return ERR_PTR(-ENOMEM);
96}
97
98/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp)
100{
101 struct cpuacct *ca = cgroup_ca(cgrp);
102
103 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage);
105 kfree(ca);
106}
107
108static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
109{
110 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
111 u64 data;
112
113#ifndef CONFIG_64BIT
114 /*
115 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
116 */
117 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
118 data = *cpuusage;
119 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
120#else
121 data = *cpuusage;
122#endif
123
124 return data;
125}
126
127static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
128{
129 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
130
131#ifndef CONFIG_64BIT
132 /*
133 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
134 */
135 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
136 *cpuusage = val;
137 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
138#else
139 *cpuusage = val;
140#endif
141}
142
143/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
145{
146 struct cpuacct *ca = cgroup_ca(cgrp);
147 u64 totalcpuusage = 0;
148 int i;
149
150 for_each_present_cpu(i)
151 totalcpuusage += cpuacct_cpuusage_read(ca, i);
152
153 return totalcpuusage;
154}
155
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
157 u64 reset)
158{
159 struct cpuacct *ca = cgroup_ca(cgrp);
160 int err = 0;
161 int i;
162
163 if (reset) {
164 err = -EINVAL;
165 goto out;
166 }
167
168 for_each_present_cpu(i)
169 cpuacct_cpuusage_write(ca, i, 0);
170
171out:
172 return err;
173}
174
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
176 struct seq_file *m)
177{
178 struct cpuacct *ca = cgroup_ca(cgroup);
179 u64 percpu;
180 int i;
181
182 for_each_present_cpu(i) {
183 percpu = cpuacct_cpuusage_read(ca, i);
184 seq_printf(m, "%llu ", (unsigned long long) percpu);
185 }
186 seq_printf(m, "\n");
187 return 0;
188}
189
190static const char * const cpuacct_stat_desc[] = {
191 [CPUACCT_STAT_USER] = "user",
192 [CPUACCT_STAT_SYSTEM] = "system",
193};
194
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
196 struct cgroup_map_cb *cb)
197{
198 struct cpuacct *ca = cgroup_ca(cgrp);
199 int cpu;
200 s64 val = 0;
201
202 for_each_online_cpu(cpu) {
203 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
204 val += kcpustat->cpustat[CPUTIME_USER];
205 val += kcpustat->cpustat[CPUTIME_NICE];
206 }
207 val = cputime64_to_clock_t(val);
208 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
209
210 val = 0;
211 for_each_online_cpu(cpu) {
212 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
213 val += kcpustat->cpustat[CPUTIME_SYSTEM];
214 val += kcpustat->cpustat[CPUTIME_IRQ];
215 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
216 }
217
218 val = cputime64_to_clock_t(val);
219 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
220
221 return 0;
222}
223
224static struct cftype files[] = {
225 {
226 .name = "usage",
227 .read_u64 = cpuusage_read,
228 .write_u64 = cpuusage_write,
229 },
230 {
231 .name = "usage_percpu",
232 .read_seq_string = cpuacct_percpu_seq_read,
233 },
234 {
235 .name = "stat",
236 .read_map = cpuacct_stats_show,
237 },
238 { } /* terminate */
239};
240
241/*
242 * charge this task's execution time to its accounting group.
243 *
244 * called with rq->lock held.
245 */
246void cpuacct_charge(struct task_struct *tsk, u64 cputime)
247{
248 struct cpuacct *ca;
249 int cpu;
250
251 cpu = task_cpu(tsk);
252
253 rcu_read_lock();
254
255 ca = task_ca(tsk);
256
257 while (true) {
258 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
259 *cpuusage += cputime;
260
261 ca = parent_ca(ca);
262 if (!ca)
263 break;
264 }
265
266 rcu_read_unlock();
267}
268
269/*
270 * Add user/system time to cpuacct.
271 *
272 * Note: it's the caller that updates the account of the root cgroup.
273 */
274void cpuacct_account_field(struct task_struct *p, int index, u64 val)
275{
276 struct kernel_cpustat *kcpustat;
277 struct cpuacct *ca;
278
279 rcu_read_lock();
280 ca = task_ca(p);
281 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca);
285 }
286 rcu_read_unlock();
287}
288
289struct cgroup_subsys cpuacct_subsys = {
290 .name = "cpuacct",
291 .css_alloc = cpuacct_css_alloc,
292 .css_free = cpuacct_css_free,
293 .subsys_id = cpuacct_subsys_id,
294 .base_cftypes = files,
295 .early_init = 1,
296};
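The cpuacct_charge() loop above walks from the task's group up to the root, so every ancestor accumulates its children's CPU time. A minimal userspace sketch of that charging pattern, using a hypothetical acct_node type rather than the kernel's struct cpuacct and per-cpu counters:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Toy stand-in for struct cpuacct: one usage counter plus a parent link. */
struct acct_node {
        const char *name;
        uint64_t usage;
        struct acct_node *parent;
};

/* Mirror of the cpuacct_charge() walk: charge the group and every ancestor. */
static void charge(struct acct_node *node, uint64_t delta)
{
        for (; node; node = node->parent)
                node->usage += delta;
}

int main(void)
{
        struct acct_node root  = { "root",  0, NULL  };
        struct acct_node child = { "child", 0, &root };

        charge(&child, 1000);   /* 1000ns of CPU time spent in "child" */
        printf("%s=%llu %s=%llu\n",
               child.name, (unsigned long long)child.usage,
               root.name, (unsigned long long)root.usage);
        return 0;
}

Paying the hierarchy walk on the accounting hot path is what lets cpuusage_read() above sum only a group's own per-cpu counters instead of traversing its descendants.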
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 000000000000..ed605624a5e7
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@
+#ifdef CONFIG_CGROUP_CPUACCT
+
+extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+
+#else
+
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+}
+
+static inline void
+cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+}
+
+#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index e93cca92f38b..ea32f02bf2c3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void)
115static inline void task_group_account_field(struct task_struct *p, int index, 115static inline void task_group_account_field(struct task_struct *p, int index,
116 u64 tmp) 116 u64 tmp)
117{ 117{
118#ifdef CONFIG_CGROUP_CPUACCT
119 struct kernel_cpustat *kcpustat;
120 struct cpuacct *ca;
121#endif
122 /* 118 /*
123 * Since all updates are sure to touch the root cgroup, we 119 * Since all updates are sure to touch the root cgroup, we
124 * get ourselves ahead and touch it first. If the root cgroup 120 * get ourselves ahead and touch it first. If the root cgroup
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
127 */ 123 */
128 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 124 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
129 125
130#ifdef CONFIG_CGROUP_CPUACCT 126 cpuacct_account_field(p, index, tmp);
131 if (unlikely(!cpuacct_subsys.active))
132 return;
133
134 rcu_read_lock();
135 ca = task_ca(p);
136 while (ca && (ca != &root_cpuacct)) {
137 kcpustat = this_cpu_ptr(ca->cpustat);
138 kcpustat->cpustat[index] += tmp;
139 ca = parent_ca(ca);
140 }
141 rcu_read_unlock();
142#endif
143} 127}
144 128
145/* 129/*
@@ -388,82 +372,10 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
388 struct rq *rq) {} 372 struct rq *rq) {}
389#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 373#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
390 374
391#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
392/*
393 * Account a single tick of cpu time.
394 * @p: the process that the cpu time gets accounted to
395 * @user_tick: indicates if the tick is a user or a system tick
396 */
397void account_process_tick(struct task_struct *p, int user_tick)
398{
399 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
400 struct rq *rq = this_rq();
401
402 if (vtime_accounting_enabled())
403 return;
404
405 if (sched_clock_irqtime) {
406 irqtime_account_process_tick(p, user_tick, rq);
407 return;
408 }
409
410 if (steal_account_process_tick())
411 return;
412
413 if (user_tick)
414 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
415 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
416 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
417 one_jiffy_scaled);
418 else
419 account_idle_time(cputime_one_jiffy);
420}
421
422/*
423 * Account multiple ticks of steal time.
424 * @p: the process from which the cpu time has been stolen
425 * @ticks: number of stolen ticks
426 */
427void account_steal_ticks(unsigned long ticks)
428{
429 account_steal_time(jiffies_to_cputime(ticks));
430}
431
432/*
433 * Account multiple ticks of idle time.
434 * @ticks: number of stolen ticks
435 */
436void account_idle_ticks(unsigned long ticks)
437{
438
439 if (sched_clock_irqtime) {
440 irqtime_account_idle_ticks(ticks);
441 return;
442 }
443
444 account_idle_time(jiffies_to_cputime(ticks));
445}
446#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
447
448/* 375/*
449 * Use precise platform statistics if available: 376 * Use precise platform statistics if available:
450 */ 377 */
451#ifdef CONFIG_VIRT_CPU_ACCOUNTING 378#ifdef CONFIG_VIRT_CPU_ACCOUNTING
452void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
453{
454 *ut = p->utime;
455 *st = p->stime;
456}
457
458void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
459{
460 struct task_cputime cputime;
461
462 thread_group_cputime(p, &cputime);
463
464 *ut = cputime.utime;
465 *st = cputime.stime;
466}
467 379
468#ifndef __ARCH_HAS_VTIME_TASK_SWITCH 380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
469void vtime_task_switch(struct task_struct *prev) 381void vtime_task_switch(struct task_struct *prev)
@@ -518,21 +430,111 @@ void vtime_account_irq_enter(struct task_struct *tsk)
518} 430}
519EXPORT_SYMBOL_GPL(vtime_account_irq_enter); 431EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
520#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 432#endif /* __ARCH_HAS_VTIME_ACCOUNT */
433#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
434
435
436#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
437void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
438{
439 *ut = p->utime;
440 *st = p->stime;
441}
521 442
522#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ 443void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
444{
445 struct task_cputime cputime;
523 446
524static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) 447 thread_group_cputime(p, &cputime);
448
449 *ut = cputime.utime;
450 *st = cputime.stime;
451}
452#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
453/*
454 * Account a single tick of cpu time.
455 * @p: the process that the cpu time gets accounted to
456 * @user_tick: indicates if the tick is a user or a system tick
457 */
458void account_process_tick(struct task_struct *p, int user_tick)
525{ 459{
526 u64 temp = (__force u64) rtime; 460 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
461 struct rq *rq = this_rq();
527 462
528 temp *= (__force u64) stime; 463 if (vtime_accounting_enabled())
464 return;
465
466 if (sched_clock_irqtime) {
467 irqtime_account_process_tick(p, user_tick, rq);
468 return;
469 }
470
471 if (steal_account_process_tick())
472 return;
529 473
530 if (sizeof(cputime_t) == 4) 474 if (user_tick)
531 temp = div_u64(temp, (__force u32) total); 475 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
476 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
477 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
478 one_jiffy_scaled);
532 else 479 else
533 temp = div64_u64(temp, (__force u64) total); 480 account_idle_time(cputime_one_jiffy);
481}
534 482
535 return (__force cputime_t) temp; 483/*
484 * Account multiple ticks of steal time.
485 * @p: the process from which the cpu time has been stolen
486 * @ticks: number of stolen ticks
487 */
488void account_steal_ticks(unsigned long ticks)
489{
490 account_steal_time(jiffies_to_cputime(ticks));
491}
492
493/*
494 * Account multiple ticks of idle time.
495 * @ticks: number of stolen ticks
496 */
497void account_idle_ticks(unsigned long ticks)
498{
499
500 if (sched_clock_irqtime) {
501 irqtime_account_idle_ticks(ticks);
502 return;
503 }
504
505 account_idle_time(jiffies_to_cputime(ticks));
506}
507
508/*
509 * Perform (stime * rtime) / total with reduced chances
510 * of multiplication overflows by using smaller factors
511 * like quotient and remainders of divisions between
512 * rtime and total.
513 */
514static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
515{
516 u64 rem, res, scaled;
517
518 if (rtime >= total) {
519 /*
520 * Scale up to rtime / total then add
521 * the remainder scaled to stime / total.
522 */
523 res = div64_u64_rem(rtime, total, &rem);
524 scaled = stime * res;
525 scaled += div64_u64(stime * rem, total);
526 } else {
527 /*
528 * Same in reverse: scale down to total / rtime
529 * then substract that result scaled to
530 * to the remaining part.
531 */
532 res = div64_u64_rem(total, rtime, &rem);
533 scaled = div64_u64(stime, res);
534 scaled -= div64_u64(scaled * rem, total);
535 }
536
537 return (__force cputime_t) scaled;
536} 538}
537 539
538/* 540/*
@@ -545,6 +547,12 @@ static void cputime_adjust(struct task_cputime *curr,
545{ 547{
546 cputime_t rtime, stime, total; 548 cputime_t rtime, stime, total;
547 549
550 if (vtime_accounting_enabled()) {
551 *ut = curr->utime;
552 *st = curr->stime;
553 return;
554 }
555
548 stime = curr->stime; 556 stime = curr->stime;
549 total = stime + curr->utime; 557 total = stime + curr->utime;
550 558
@@ -560,10 +568,14 @@ static void cputime_adjust(struct task_cputime *curr,
560 */ 568 */
561 rtime = nsecs_to_cputime(curr->sum_exec_runtime); 569 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
562 570
563 if (total) 571 if (!rtime) {
564 stime = scale_stime(stime, rtime, total); 572 stime = 0;
565 else 573 } else if (!total) {
566 stime = rtime; 574 stime = rtime;
575 } else {
576 stime = scale_stime((__force u64)stime,
577 (__force u64)rtime, (__force u64)total);
578 }
567 579
568 /* 580 /*
569 * If the tick based count grows faster than the scheduler one, 581 * If the tick based count grows faster than the scheduler one,
@@ -597,7 +609,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
597 thread_group_cputime(p, &cputime); 609 thread_group_cputime(p, &cputime);
598 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 610 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
599} 611}
600#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ 612#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
601 613
602#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 614#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
603static unsigned long long vtime_delta(struct task_struct *tsk) 615static unsigned long long vtime_delta(struct task_struct *tsk)
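The new scale_stime() above computes stime * rtime / total while reducing the chance that the intermediate product overflows 64 bits: the larger of the two ratios is split into quotient and remainder first. A self-contained sketch of that arithmetic in plain C, with illustrative nanosecond values and none of the kernel's cputime_t plumbing:

#include <stdint.h>
#include <stdio.h>

/* Approximate stime * rtime / total without a 128-bit intermediate. */
static uint64_t scale_stime_sketch(uint64_t stime, uint64_t rtime, uint64_t total)
{
        uint64_t res, rem, scaled;

        if (rtime >= total) {
                res = rtime / total;
                rem = rtime % total;
                scaled  = stime * res;                  /* whole multiples of total */
                scaled += stime * rem / total;          /* remainder scaled to stime/total */
        } else {
                res = total / rtime;
                rem = total % rtime;
                scaled  = stime / res;                  /* scale down to total/rtime first */
                scaled -= scaled * rem / total;         /* subtract the remainder's share */
        }
        return scaled;
}

int main(void)
{
        /* 3s of tick-based stime out of 10s total, 4s of precise runtime. */
        uint64_t stime = 3000000000ULL, rtime = 4000000000ULL, total = 10000000000ULL;

        printf("%llu\n", (unsigned long long)scale_stime_sketch(stime, rtime, total));
        return 0;
}

For these values the sketch prints 1200000000, i.e. 1.2s, which matches the exact stime * rtime / total that cputime_adjust() wants.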
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e5986fc5..8bf7081b1ec5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
431 * Scheduling class tree data structure manipulation methods: 431 * Scheduling class tree data structure manipulation methods:
432 */ 432 */
433 433
434static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) 434static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
435{ 435{
436 s64 delta = (s64)(vruntime - min_vruntime); 436 s64 delta = (s64)(vruntime - max_vruntime);
437 if (delta > 0) 437 if (delta > 0)
438 min_vruntime = vruntime; 438 max_vruntime = vruntime;
439 439
440 return min_vruntime; 440 return max_vruntime;
441} 441}
442 442
443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) 443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
@@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
473 vruntime = min_vruntime(vruntime, se->vruntime); 473 vruntime = min_vruntime(vruntime, se->vruntime);
474 } 474 }
475 475
476 /* ensure we never gain time by being placed backwards. */
476 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 477 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477#ifndef CONFIG_64BIT 478#ifndef CONFIG_64BIT
478 smp_wmb(); 479 smp_wmb();
@@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
652} 653}
653 654
654/* 655/*
655 * We calculate the vruntime slice of a to be inserted task 656 * We calculate the vruntime slice of a to-be-inserted task.
656 * 657 *
657 * vs = s/w 658 * vs = s/w
658 */ 659 */
@@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 1563 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */ 1564 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1564} 1565}
1566
1567/*
1568 * Update the rq's load with the elapsed running time before entering
1569 * idle. if the last scheduled task is not a CFS task, idle_enter will
1570 * be the only way to update the runnable statistic.
1571 */
1572void idle_enter_fair(struct rq *this_rq)
1573{
1574 update_rq_runnable_avg(this_rq, 1);
1575}
1576
1577/*
1578 * Update the rq's load with the elapsed idle time before a task is
1579 * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
1580 * be the only way to update the runnable statistic.
1581 */
1582void idle_exit_fair(struct rq *this_rq)
1583{
1584 update_rq_runnable_avg(this_rq, 0);
1585}
1586
1565#else 1587#else
1566static inline void update_entity_load_avg(struct sched_entity *se, 1588static inline void update_entity_load_avg(struct sched_entity *se,
1567 int update_cfs_rq) {} 1589 int update_cfs_rq) {}
@@ -3874,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3874 int tsk_cache_hot = 0; 3896 int tsk_cache_hot = 0;
3875 /* 3897 /*
3876 * We do not migrate tasks that are: 3898 * We do not migrate tasks that are:
3877 * 1) running (obviously), or 3899 * 1) throttled_lb_pair, or
3878 * 2) cannot be migrated to this CPU due to cpus_allowed, or 3900 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3879 * 3) are cache-hot on their current CPU. 3901 * 3) running (obviously), or
3902 * 4) are cache-hot on their current CPU.
3880 */ 3903 */
3904 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3905 return 0;
3906
3881 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 3907 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3882 int new_dst_cpu; 3908 int cpu;
3883 3909
3884 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3910 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3885 3911
@@ -3894,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3894 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 3920 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3895 return 0; 3921 return 0;
3896 3922
3897 new_dst_cpu = cpumask_first_and(env->dst_grpmask, 3923 /* Prevent to re-select dst_cpu via env's cpus */
3898 tsk_cpus_allowed(p)); 3924 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
3899 if (new_dst_cpu < nr_cpu_ids) { 3925 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
3900 env->flags |= LBF_SOME_PINNED; 3926 env->flags |= LBF_SOME_PINNED;
3901 env->new_dst_cpu = new_dst_cpu; 3927 env->new_dst_cpu = cpu;
3928 break;
3929 }
3902 } 3930 }
3931
3903 return 0; 3932 return 0;
3904 } 3933 }
3905 3934
@@ -3920,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3920 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); 3949 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3921 if (!tsk_cache_hot || 3950 if (!tsk_cache_hot ||
3922 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 3951 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3923#ifdef CONFIG_SCHEDSTATS 3952
3924 if (tsk_cache_hot) { 3953 if (tsk_cache_hot) {
3925 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 3954 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3926 schedstat_inc(p, se.statistics.nr_forced_migrations); 3955 schedstat_inc(p, se.statistics.nr_forced_migrations);
3927 } 3956 }
3928#endif 3957
3929 return 1; 3958 return 1;
3930 } 3959 }
3931 3960
3932 if (tsk_cache_hot) { 3961 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3933 schedstat_inc(p, se.statistics.nr_failed_migrations_hot); 3962 return 0;
3934 return 0;
3935 }
3936 return 1;
3937} 3963}
3938 3964
3939/* 3965/*
@@ -3948,9 +3974,6 @@ static int move_one_task(struct lb_env *env)
3948 struct task_struct *p, *n; 3974 struct task_struct *p, *n;
3949 3975
3950 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 3976 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3951 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3952 continue;
3953
3954 if (!can_migrate_task(p, env)) 3977 if (!can_migrate_task(p, env))
3955 continue; 3978 continue;
3956 3979
@@ -4002,7 +4025,7 @@ static int move_tasks(struct lb_env *env)
4002 break; 4025 break;
4003 } 4026 }
4004 4027
4005 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 4028 if (!can_migrate_task(p, env))
4006 goto next; 4029 goto next;
4007 4030
4008 load = task_h_load(p); 4031 load = task_h_load(p);
@@ -4013,9 +4036,6 @@ static int move_tasks(struct lb_env *env)
4013 if ((load / 2) > env->imbalance) 4036 if ((load / 2) > env->imbalance)
4014 goto next; 4037 goto next;
4015 4038
4016 if (!can_migrate_task(p, env))
4017 goto next;
4018
4019 move_task(p, env); 4039 move_task(p, env);
4020 pulled++; 4040 pulled++;
4021 env->imbalance -= load; 4041 env->imbalance -= load;
@@ -4245,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
4245 return load_idx; 4265 return load_idx;
4246} 4266}
4247 4267
4248unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 4268static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4249{ 4269{
4250 return SCHED_POWER_SCALE; 4270 return SCHED_POWER_SCALE;
4251} 4271}
@@ -4255,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4255 return default_scale_freq_power(sd, cpu); 4275 return default_scale_freq_power(sd, cpu);
4256} 4276}
4257 4277
4258unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 4278static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4259{ 4279{
4260 unsigned long weight = sd->span_weight; 4280 unsigned long weight = sd->span_weight;
4261 unsigned long smt_gain = sd->smt_gain; 4281 unsigned long smt_gain = sd->smt_gain;
@@ -4270,7 +4290,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4270 return default_scale_smt_power(sd, cpu); 4290 return default_scale_smt_power(sd, cpu);
4271} 4291}
4272 4292
4273unsigned long scale_rt_power(int cpu) 4293static unsigned long scale_rt_power(int cpu)
4274{ 4294{
4275 struct rq *rq = cpu_rq(cpu); 4295 struct rq *rq = cpu_rq(cpu);
4276 u64 total, available, age_stamp, avg; 4296 u64 total, available, age_stamp, avg;
@@ -4960,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4960#define MAX_PINNED_INTERVAL 512 4980#define MAX_PINNED_INTERVAL 512
4961 4981
4962/* Working cpumask for load_balance and load_balance_newidle. */ 4982/* Working cpumask for load_balance and load_balance_newidle. */
4963DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4983DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
4964 4984
4965static int need_active_balance(struct lb_env *env) 4985static int need_active_balance(struct lb_env *env)
4966{ 4986{
@@ -4991,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4991 int *balance) 5011 int *balance)
4992{ 5012{
4993 int ld_moved, cur_ld_moved, active_balance = 0; 5013 int ld_moved, cur_ld_moved, active_balance = 0;
4994 int lb_iterations, max_lb_iterations;
4995 struct sched_group *group; 5014 struct sched_group *group;
4996 struct rq *busiest; 5015 struct rq *busiest;
4997 unsigned long flags; 5016 unsigned long flags;
4998 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 5017 struct cpumask *cpus = __get_cpu_var(load_balance_mask);
4999 5018
5000 struct lb_env env = { 5019 struct lb_env env = {
5001 .sd = sd, 5020 .sd = sd,
@@ -5007,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5007 .cpus = cpus, 5026 .cpus = cpus,
5008 }; 5027 };
5009 5028
5029 /*
5030 * For NEWLY_IDLE load_balancing, we don't need to consider
5031 * other cpus in our group
5032 */
5033 if (idle == CPU_NEWLY_IDLE)
5034 env.dst_grpmask = NULL;
5035
5010 cpumask_copy(cpus, cpu_active_mask); 5036 cpumask_copy(cpus, cpu_active_mask);
5011 max_lb_iterations = cpumask_weight(env.dst_grpmask);
5012 5037
5013 schedstat_inc(sd, lb_count[idle]); 5038 schedstat_inc(sd, lb_count[idle]);
5014 5039
@@ -5034,7 +5059,6 @@ redo:
5034 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 5059 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5035 5060
5036 ld_moved = 0; 5061 ld_moved = 0;
5037 lb_iterations = 1;
5038 if (busiest->nr_running > 1) { 5062 if (busiest->nr_running > 1) {
5039 /* 5063 /*
5040 * Attempt to move tasks. If find_busiest_group has found 5064 * Attempt to move tasks. If find_busiest_group has found
@@ -5061,17 +5085,17 @@ more_balance:
5061 double_rq_unlock(env.dst_rq, busiest); 5085 double_rq_unlock(env.dst_rq, busiest);
5062 local_irq_restore(flags); 5086 local_irq_restore(flags);
5063 5087
5064 if (env.flags & LBF_NEED_BREAK) {
5065 env.flags &= ~LBF_NEED_BREAK;
5066 goto more_balance;
5067 }
5068
5069 /* 5088 /*
5070 * some other cpu did the load balance for us. 5089 * some other cpu did the load balance for us.
5071 */ 5090 */
5072 if (cur_ld_moved && env.dst_cpu != smp_processor_id()) 5091 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5073 resched_cpu(env.dst_cpu); 5092 resched_cpu(env.dst_cpu);
5074 5093
5094 if (env.flags & LBF_NEED_BREAK) {
5095 env.flags &= ~LBF_NEED_BREAK;
5096 goto more_balance;
5097 }
5098
5075 /* 5099 /*
5076 * Revisit (affine) tasks on src_cpu that couldn't be moved to 5100 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5077 * us and move them to an alternate dst_cpu in our sched_group 5101 * us and move them to an alternate dst_cpu in our sched_group
@@ -5091,14 +5115,17 @@ more_balance:
5091 * moreover subsequent load balance cycles should correct the 5115 * moreover subsequent load balance cycles should correct the
5092 * excess load moved. 5116 * excess load moved.
5093 */ 5117 */
5094 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && 5118 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
5095 lb_iterations++ < max_lb_iterations) {
5096 5119
5097 env.dst_rq = cpu_rq(env.new_dst_cpu); 5120 env.dst_rq = cpu_rq(env.new_dst_cpu);
5098 env.dst_cpu = env.new_dst_cpu; 5121 env.dst_cpu = env.new_dst_cpu;
5099 env.flags &= ~LBF_SOME_PINNED; 5122 env.flags &= ~LBF_SOME_PINNED;
5100 env.loop = 0; 5123 env.loop = 0;
5101 env.loop_break = sched_nr_migrate_break; 5124 env.loop_break = sched_nr_migrate_break;
5125
5126 /* Prevent to re-select dst_cpu via env's cpus */
5127 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5128
5102 /* 5129 /*
5103 * Go back to "more_balance" rather than "redo" since we 5130 * Go back to "more_balance" rather than "redo" since we
5104 * need to continue with same src_cpu. 5131 * need to continue with same src_cpu.
@@ -5219,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5219 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5246 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5220 return; 5247 return;
5221 5248
5222 update_rq_runnable_avg(this_rq, 1);
5223
5224 /* 5249 /*
5225 * Drop the rq->lock, but keep IRQ/preempt disabled. 5250 * Drop the rq->lock, but keep IRQ/preempt disabled.
5226 */ 5251 */
@@ -5395,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void)
5395 struct sched_domain *sd; 5420 struct sched_domain *sd;
5396 int cpu = smp_processor_id(); 5421 int cpu = smp_processor_id();
5397 5422
5398 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5399 return;
5400 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
5401
5402 rcu_read_lock(); 5423 rcu_read_lock();
5403 for_each_domain(cpu, sd) 5424 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5425
5426 if (!sd || !sd->nohz_idle)
5427 goto unlock;
5428 sd->nohz_idle = 0;
5429
5430 for (; sd; sd = sd->parent)
5404 atomic_inc(&sd->groups->sgp->nr_busy_cpus); 5431 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5432unlock:
5405 rcu_read_unlock(); 5433 rcu_read_unlock();
5406} 5434}
5407 5435
@@ -5410,13 +5438,16 @@ void set_cpu_sd_state_idle(void)
5410 struct sched_domain *sd; 5438 struct sched_domain *sd;
5411 int cpu = smp_processor_id(); 5439 int cpu = smp_processor_id();
5412 5440
5413 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5414 return;
5415 set_bit(NOHZ_IDLE, nohz_flags(cpu));
5416
5417 rcu_read_lock(); 5441 rcu_read_lock();
5418 for_each_domain(cpu, sd) 5442 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5443
5444 if (!sd || sd->nohz_idle)
5445 goto unlock;
5446 sd->nohz_idle = 1;
5447
5448 for (; sd; sd = sd->parent)
5419 atomic_dec(&sd->groups->sgp->nr_busy_cpus); 5449 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5450unlock:
5420 rcu_read_unlock(); 5451 rcu_read_unlock();
5421} 5452}
5422 5453
@@ -5468,7 +5499,7 @@ void update_max_interval(void)
5468 * It checks each scheduling domain to see if it is due to be balanced, 5499 * It checks each scheduling domain to see if it is due to be balanced,
5469 * and initiates a balancing operation if so. 5500 * and initiates a balancing operation if so.
5470 * 5501 *
5471 * Balancing parameters are set up in arch_init_sched_domains. 5502 * Balancing parameters are set up in init_sched_domains.
5472 */ 5503 */
5473static void rebalance_domains(int cpu, enum cpu_idle_type idle) 5504static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5474{ 5505{
@@ -5506,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5506 if (time_after_eq(jiffies, sd->last_balance + interval)) { 5537 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5507 if (load_balance(cpu, rq, sd, idle, &balance)) { 5538 if (load_balance(cpu, rq, sd, idle, &balance)) {
5508 /* 5539 /*
5509 * We've pulled tasks over so either we're no 5540 * The LBF_SOME_PINNED logic could have changed
5510 * longer idle. 5541 * env->dst_cpu, so we can't know our idle
5542 * state even if we migrated tasks. Update it.
5511 */ 5543 */
5512 idle = CPU_NOT_IDLE; 5544 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
5513 } 5545 }
5514 sd->last_balance = jiffies; 5546 sd->last_balance = jiffies;
5515 } 5547 }
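The max_vruntime() change at the top of this fair.c diff only renames the local variable, but the body it keeps is worth noting: the comparison goes through a signed delta, which makes the "is this vruntime ahead?" test safe across u64 wraparound. A standalone sketch of that comparison trick, with illustrative values:

#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "keep the later vruntime" helper, as in max_vruntime(). */
static uint64_t max_vruntime_sketch(uint64_t max_vruntime, uint64_t vruntime)
{
        int64_t delta = (int64_t)(vruntime - max_vruntime);

        if (delta > 0)
                max_vruntime = vruntime;
        return max_vruntime;
}

int main(void)
{
        /* Near the u64 wrap point, 10 is logically "after" UINT64_MAX - 5. */
        uint64_t before_wrap = UINT64_MAX - 5, after_wrap = 10;

        printf("%llu\n",
               (unsigned long long)max_vruntime_sketch(before_wrap, after_wrap));
        return 0;
}

The unsigned subtraction wraps to a small value and the s64 cast keeps it positive, so the helper returns 10 instead of sticking with the huge pre-wrap value.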
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae9..b8ce77328341 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
+
+static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
+{
+	idle_exit_fair(rq);
+}
+
+static void post_schedule_idle(struct rq *rq)
+{
+	idle_enter_fair(rq);
+}
 #endif /* CONFIG_SMP */
 /*
  * Idle tasks are unconditionally rescheduled:
@@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
+#ifdef CONFIG_SMP
+	/* Trigger the post schedule to do an idle_enter for CFS */
+	rq->post_schedule = 1;
+#endif
 	return rq->idle;
 }
 
@@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_idle,
+	.pre_schedule		= pre_schedule_idle,
+	.post_schedule		= post_schedule_idle,
 #endif
 
 	.set_curr_task          = set_curr_task_idle,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfdf469f..4c225c4c7111 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -7,6 +7,7 @@
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8 8
9#include "cpupri.h" 9#include "cpupri.h"
10#include "cpuacct.h"
10 11
11extern __read_mostly int scheduler_running; 12extern __read_mostly int scheduler_running;
12 13
@@ -33,6 +34,31 @@ extern __read_mostly int scheduler_running;
33 */ 34 */
34#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 35#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
35 36
37/*
38 * Increase resolution of nice-level calculations for 64-bit architectures.
39 * The extra resolution improves shares distribution and load balancing of
40 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
41 * hierarchies, especially on larger systems. This is not a user-visible change
42 * and does not change the user-interface for setting shares/weights.
43 *
44 * We increase resolution only if we have enough bits to allow this increased
45 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
46 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
47 * increased costs.
48 */
49#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
50# define SCHED_LOAD_RESOLUTION 10
51# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
52# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
53#else
54# define SCHED_LOAD_RESOLUTION 0
55# define scale_load(w) (w)
56# define scale_load_down(w) (w)
57#endif
58
59#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
60#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
61
36#define NICE_0_LOAD SCHED_LOAD_SCALE 62#define NICE_0_LOAD SCHED_LOAD_SCALE
37#define NICE_0_SHIFT SCHED_LOAD_SHIFT 63#define NICE_0_SHIFT SCHED_LOAD_SHIFT
38 64
@@ -154,11 +180,6 @@ struct task_group {
154#define MAX_SHARES (1UL << 18) 180#define MAX_SHARES (1UL << 18)
155#endif 181#endif
156 182
157/* Default task group.
158 * Every task in system belong to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *); 183typedef int (*tg_visitor)(struct task_group *, void *);
163 184
164extern int walk_tg_tree_from(struct task_group *from, 185extern int walk_tg_tree_from(struct task_group *from,
@@ -196,6 +217,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu, 217 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent); 218 struct sched_rt_entity *parent);
198 219
220extern struct task_group *sched_create_group(struct task_group *parent);
221extern void sched_online_group(struct task_group *tg,
222 struct task_group *parent);
223extern void sched_destroy_group(struct task_group *tg);
224extern void sched_offline_group(struct task_group *tg);
225
226extern void sched_move_task(struct task_struct *tsk);
227
228#ifdef CONFIG_FAIR_GROUP_SCHED
229extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
230#endif
231
199#else /* CONFIG_CGROUP_SCHED */ 232#else /* CONFIG_CGROUP_SCHED */
200 233
201struct cfs_bandwidth { }; 234struct cfs_bandwidth { };
@@ -547,6 +580,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
547DECLARE_PER_CPU(struct sched_domain *, sd_llc); 580DECLARE_PER_CPU(struct sched_domain *, sd_llc);
548DECLARE_PER_CPU(int, sd_llc_id); 581DECLARE_PER_CPU(int, sd_llc_id);
549 582
583struct sched_group_power {
584 atomic_t ref;
585 /*
586 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
587 * single CPU.
588 */
589 unsigned int power, power_orig;
590 unsigned long next_update;
591 /*
592 * Number of busy cpus in this group.
593 */
594 atomic_t nr_busy_cpus;
595
596 unsigned long cpumask[0]; /* iteration mask */
597};
598
599struct sched_group {
600 struct sched_group *next; /* Must be a circular list */
601 atomic_t ref;
602
603 unsigned int group_weight;
604 struct sched_group_power *sgp;
605
606 /*
607 * The CPUs this group covers.
608 *
609 * NOTE: this field is variable length. (Allocated dynamically
610 * by attaching extra space to the end of the structure,
611 * depending on how many CPUs the kernel has booted up with)
612 */
613 unsigned long cpumask[0];
614};
615
616static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
617{
618 return to_cpumask(sg->cpumask);
619}
620
621/*
622 * cpumask masking which cpus in the group are allowed to iterate up the domain
623 * tree.
624 */
625static inline struct cpumask *sched_group_mask(struct sched_group *sg)
626{
627 return to_cpumask(sg->sgp->cpumask);
628}
629
630/**
631 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
632 * @group: The group whose first cpu is to be returned.
633 */
634static inline unsigned int group_first_cpu(struct sched_group *group)
635{
636 return cpumask_first(sched_group_cpus(group));
637}
638
550extern int group_balance_cpu(struct sched_group *sg); 639extern int group_balance_cpu(struct sched_group *sg);
551 640
552#endif /* CONFIG_SMP */ 641#endif /* CONFIG_SMP */
@@ -784,6 +873,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
784} 873}
785#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 874#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
786 875
876/*
877 * wake flags
878 */
879#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
880#define WF_FORK 0x02 /* child wakeup after fork */
881#define WF_MIGRATED 0x4 /* internal use, task got migrated */
787 882
788static inline void update_load_add(struct load_weight *lw, unsigned long inc) 883static inline void update_load_add(struct load_weight *lw, unsigned long inc)
789{ 884{
@@ -856,14 +951,61 @@ static const u32 prio_to_wmult[40] = {
856 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 951 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
857}; 952};
858 953
859/* Time spent by the tasks of the cpu accounting group executing in ... */ 954#define ENQUEUE_WAKEUP 1
860enum cpuacct_stat_index { 955#define ENQUEUE_HEAD 2
861 CPUACCT_STAT_USER, /* ... user mode */ 956#ifdef CONFIG_SMP
862 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 957#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
958#else
959#define ENQUEUE_WAKING 0
960#endif
863 961
864 CPUACCT_STAT_NSTATS, 962#define DEQUEUE_SLEEP 1
865}; 963
964struct sched_class {
965 const struct sched_class *next;
966
967 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
968 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
969 void (*yield_task) (struct rq *rq);
970 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
971
972 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
973
974 struct task_struct * (*pick_next_task) (struct rq *rq);
975 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
976
977#ifdef CONFIG_SMP
978 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
979 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
980
981 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
982 void (*post_schedule) (struct rq *this_rq);
983 void (*task_waking) (struct task_struct *task);
984 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
985
986 void (*set_cpus_allowed)(struct task_struct *p,
987 const struct cpumask *newmask);
866 988
989 void (*rq_online)(struct rq *rq);
990 void (*rq_offline)(struct rq *rq);
991#endif
992
993 void (*set_curr_task) (struct rq *rq);
994 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
995 void (*task_fork) (struct task_struct *p);
996
997 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
998 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
999 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1000 int oldprio);
1001
1002 unsigned int (*get_rr_interval) (struct rq *rq,
1003 struct task_struct *task);
1004
1005#ifdef CONFIG_FAIR_GROUP_SCHED
1006 void (*task_move_group) (struct task_struct *p, int on_rq);
1007#endif
1008};
867 1009
868#define sched_class_highest (&stop_sched_class) 1010#define sched_class_highest (&stop_sched_class)
869#define for_each_class(class) \ 1011#define for_each_class(class) \
@@ -877,9 +1019,23 @@ extern const struct sched_class idle_sched_class;
877 1019
878#ifdef CONFIG_SMP 1020#ifdef CONFIG_SMP
879 1021
1022extern void update_group_power(struct sched_domain *sd, int cpu);
1023
880extern void trigger_load_balance(struct rq *rq, int cpu); 1024extern void trigger_load_balance(struct rq *rq, int cpu);
881extern void idle_balance(int this_cpu, struct rq *this_rq); 1025extern void idle_balance(int this_cpu, struct rq *this_rq);
882 1026
1027/*
1028 * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
1029 * becomes useful in lb
1030 */
1031#if defined(CONFIG_FAIR_GROUP_SCHED)
1032extern void idle_enter_fair(struct rq *this_rq);
1033extern void idle_exit_fair(struct rq *this_rq);
1034#else
1035static inline void idle_enter_fair(struct rq *this_rq) {}
1036static inline void idle_exit_fair(struct rq *this_rq) {}
1037#endif
1038
883#else /* CONFIG_SMP */ 1039#else /* CONFIG_SMP */
884 1040
885static inline void idle_balance(int cpu, struct rq *rq) 1041static inline void idle_balance(int cpu, struct rq *rq)
@@ -891,7 +1047,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
891extern void sysrq_sched_debug_show(void); 1047extern void sysrq_sched_debug_show(void);
892extern void sched_init_granularity(void); 1048extern void sched_init_granularity(void);
893extern void update_max_interval(void); 1049extern void update_max_interval(void);
894extern void update_group_power(struct sched_domain *sd, int cpu);
895extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); 1050extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
896extern void init_sched_rt_class(void); 1051extern void init_sched_rt_class(void);
897extern void init_sched_fair_class(void); 1052extern void init_sched_fair_class(void);
@@ -904,45 +1059,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
904 1059
905extern void update_idle_cpu_load(struct rq *this_rq); 1060extern void update_idle_cpu_load(struct rq *this_rq);
906 1061
907#ifdef CONFIG_CGROUP_CPUACCT
908#include <linux/cgroup.h>
909/* track cpu usage of a group of tasks and its child groups */
910struct cpuacct {
911 struct cgroup_subsys_state css;
912 /* cpuusage holds pointer to a u64-type object on every cpu */
913 u64 __percpu *cpuusage;
914 struct kernel_cpustat __percpu *cpustat;
915};
916
917extern struct cgroup_subsys cpuacct_subsys;
918extern struct cpuacct root_cpuacct;
919
920/* return cpu accounting group corresponding to this container */
921static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
922{
923 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
924 struct cpuacct, css);
925}
926
927/* return cpu accounting group to which this task belongs */
928static inline struct cpuacct *task_ca(struct task_struct *tsk)
929{
930 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
931 struct cpuacct, css);
932}
933
934static inline struct cpuacct *parent_ca(struct cpuacct *ca)
935{
936 if (!ca || !ca->css.cgroup->parent)
937 return NULL;
938 return cgroup_ca(ca->css.cgroup->parent);
939}
940
941extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
942#else
943static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
944#endif
945
946#ifdef CONFIG_PARAVIRT 1062#ifdef CONFIG_PARAVIRT
947static inline u64 steal_ticks(u64 steal) 1063static inline u64 steal_ticks(u64 steal)
948{ 1064{
@@ -1187,7 +1303,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1187enum rq_nohz_flag_bits { 1303enum rq_nohz_flag_bits {
1188 NOHZ_TICK_STOPPED, 1304 NOHZ_TICK_STOPPED,
1189 NOHZ_BALANCE_KICK, 1305 NOHZ_BALANCE_KICK,
1190 NOHZ_IDLE,
1191}; 1306};
1192 1307
1193#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) 1308#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
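The SCHED_LOAD_RESOLUTION block moved into sched.h earlier in this diff is a plain fixed-point shift applied to load weights; the hunk keeps it forced to 0 via #if 0 because the higher resolution was found to increase power usage under light load. A minimal sketch of the scale_load()/scale_load_down() round trip using the 10-bit resolution the 64-bit path would enable; the constants below are illustrative stand-ins, not the kernel headers:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the macros shown in the sched.h hunk. */
#define SCHED_LOAD_RESOLUTION   10
#define scale_load(w)           ((uint64_t)(w) << SCHED_LOAD_RESOLUTION)
#define scale_load_down(w)      ((uint64_t)(w) >> SCHED_LOAD_RESOLUTION)

int main(void)
{
        uint64_t nice_0_weight = 1024;  /* the weight of a nice-0 task */

        /* The extra bits only change internal math; the round trip is lossless. */
        printf("scaled=%llu round-trip=%llu\n",
               (unsigned long long)scale_load(nice_0_weight),
               (unsigned long long)scale_load_down(scale_load(nice_0_weight)));
        return 0;
}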