author    Linus Torvalds <torvalds@linux-foundation.org>  2012-01-06 11:33:28 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-01-06 11:44:54 -0500
commit    0db49b72bce26341274b74fd968501489a361ae3 (patch)
tree      cdb076827aefb38d719d4c42f8ef291c36072fa8
parent    35b740e4662ef386f0c60e1b60aaf5b44db9914c (diff)
parent    1ac9bc6943edf7d181b4b1cc734981350d4f6bae (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (40 commits)
  sched/tracing: Add a new tracepoint for sleeptime
  sched: Disable scheduler warnings during oopses
  sched: Fix cgroup movement of waking process
  sched: Fix cgroup movement of newly created process
  sched: Fix cgroup movement of forking process
  sched: Remove cfs bandwidth period check in tg_set_cfs_period()
  sched: Fix load-balance lock-breaking
  sched: Replace all_pinned with a generic flags field
  sched: Only queue remote wakeups when crossing cache boundaries
  sched: Add missing rcu_dereference() around ->real_parent usage
  [S390] fix cputime overflow in uptime_proc_show
  [S390] cputime: add sparse checking and cleanup
  sched: Mark parent and real_parent as __rcu
  sched, nohz: Fix missing RCU read lock
  sched, nohz: Set the NOHZ_BALANCE_KICK flag for idle load balancer
  sched, nohz: Fix the idle cpu check in nohz_idle_balance
  sched: Use jump_labels for sched_feat
  sched/accounting: Fix parameter passing in task_group_account_field
  sched/accounting: Fix user/system tick double accounting
  sched/accounting: Re-use scheduler statistics for the root cgroup
  ...

Fix up conflicts in
 - arch/ia64/include/asm/cputime.h, include/asm-generic/cputime.h
	usecs_to_cputime64() vs the sparse cleanups
 - kernel/sched/fair.c, kernel/time/tick-sched.c
	scheduler changes in multiple branches
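The sparse cleanup referenced above ("[S390] cputime: add sparse checking and cleanup", plus the matching ia64/powerpc/asm-generic hunks below) works by declaring cputime_t as a __nocast type and routing every conversion through an explicit __force cast, so mixing cputime values with plain integers shows up as a sparse warning while the generated code is unchanged. A minimal, stand-alone sketch of that pattern follows; the fallback macro definitions, the nanosecond granularity and the main() harness are illustrative assumptions, not code from this merge:

#include <stdio.h>

/*
 * Under sparse (__CHECKER__) the annotations become attributes the checker
 * enforces; under a normal compiler they expand to nothing, as in the
 * kernel's compiler.h. This sketch assumes a nanosecond-resolution cputime_t.
 */
#ifdef __CHECKER__
#define __nocast	__attribute__((nocast))
#define __force		__attribute__((force))
#else
#define __nocast
#define __force
#endif

#define NSEC_PER_SEC	1000000000ULL

typedef unsigned long long __nocast cputime_t;

/* The __force cast is the one sanctioned way into the opaque type. */
static inline cputime_t secs_to_cputime(unsigned long long secs)
{
	return (__force cputime_t)(secs * NSEC_PER_SEC);
}

/* ...and the matching cast back out when arithmetic on raw units is needed. */
static inline unsigned long long cputime_to_secs(cputime_t ct)
{
	return (__force unsigned long long) ct / NSEC_PER_SEC;
}

int main(void)
{
	cputime_t ct = secs_to_cputime(5);

	/*
	 * Handing ct to plain-integer code without going through a
	 * conversion helper is the implicit cast that sparse's nocast
	 * attribute is meant to flag.
	 */
	printf("%llu seconds\n", cputime_to_secs(ct));
	return 0;
}

Checked with sparse the unannotated mixes warn; built with a regular compiler the program simply prints "5 seconds".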
Diffstat:
-rw-r--r--  arch/ia64/include/asm/cputime.h | 72
-rw-r--r--  arch/powerpc/include/asm/cputime.h | 70
-rw-r--r--  arch/s390/appldata/appldata_os.c | 16
-rw-r--r--  arch/s390/include/asm/cputime.h | 140
-rw-r--r--  arch/x86/include/asm/i387.h | 2
-rw-r--r--  drivers/cpufreq/cpufreq_conservative.c | 50
-rw-r--r--  drivers/cpufreq/cpufreq_ondemand.c | 54
-rw-r--r--  drivers/cpufreq/cpufreq_stats.c | 5
-rw-r--r--  drivers/macintosh/rack-meter.c | 14
-rw-r--r--  fs/proc/array.c | 8
-rw-r--r--  fs/proc/stat.c | 63
-rw-r--r--  fs/proc/uptime.c | 11
-rw-r--r--  include/asm-generic/cputime.h | 65
-rw-r--r--  include/linux/kernel_stat.h | 36
-rw-r--r--  include/linux/latencytop.h | 3
-rw-r--r--  include/linux/sched.h | 23
-rw-r--r--  include/trace/events/sched.h | 57
-rw-r--r--  kernel/Makefile | 20
-rw-r--r--  kernel/acct.c | 4
-rw-r--r--  kernel/cpu.c | 3
-rw-r--r--  kernel/exit.c | 22
-rw-r--r--  kernel/fork.c | 14
-rw-r--r--  kernel/itimer.c | 15
-rw-r--r--  kernel/posix-cpu-timers.c | 132
-rw-r--r--  kernel/sched/Makefile | 20
-rw-r--r--  kernel/sched/auto_group.c (renamed from kernel/sched_autogroup.c) | 33
-rw-r--r--  kernel/sched/auto_group.h (renamed from kernel/sched_autogroup.h) | 26
-rw-r--r--  kernel/sched/clock.c (renamed from kernel/sched_clock.c) | 0
-rw-r--r--  kernel/sched/core.c (renamed from kernel/sched.c) | 2187
-rw-r--r--  kernel/sched/cpupri.c (renamed from kernel/sched_cpupri.c) | 4
-rw-r--r--  kernel/sched/cpupri.h (renamed from kernel/sched_cpupri.h) | 0
-rw-r--r--  kernel/sched/debug.c (renamed from kernel/sched_debug.c) | 6
-rw-r--r--  kernel/sched/fair.c (renamed from kernel/sched_fair.c) | 1000
-rw-r--r--  kernel/sched/features.h (renamed from kernel/sched_features.h) | 30
-rw-r--r--  kernel/sched/idle_task.c (renamed from kernel/sched_idletask.c) | 4
-rw-r--r--  kernel/sched/rt.c (renamed from kernel/sched_rt.c) | 218
-rw-r--r--  kernel/sched/sched.h | 1166
-rw-r--r--  kernel/sched/stats.c | 111
-rw-r--r--  kernel/sched/stats.h (renamed from kernel/sched_stats.h) | 109
-rw-r--r--  kernel/sched/stop_task.c (renamed from kernel/sched_stoptask.c) | 4
-rw-r--r--  kernel/signal.c | 6
-rw-r--r--  kernel/sys.c | 6
-rw-r--r--  kernel/time/tick-sched.c | 8
-rw-r--r--  kernel/tsacct.c | 2
44 files changed, 3049 insertions(+), 2790 deletions(-)
diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h
index 5a274af31b2b..3deac956d325 100644
--- a/arch/ia64/include/asm/cputime.h
+++ b/arch/ia64/include/asm/cputime.h
@@ -26,60 +26,53 @@
 #include <linux/jiffies.h>
 #include <asm/processor.h>

-typedef u64 cputime_t;
-typedef u64 cputime64_t;
+typedef u64 __nocast cputime_t;
+typedef u64 __nocast cputime64_t;

-#define cputime_zero			((cputime_t)0)
 #define cputime_one_jiffy		jiffies_to_cputime(1)
-#define cputime_max			((~((cputime_t)0) >> 1) - 1)
-#define cputime_add(__a, __b)		((__a) + (__b))
-#define cputime_sub(__a, __b)		((__a) - (__b))
-#define cputime_div(__a, __n)		((__a) / (__n))
-#define cputime_halve(__a)		((__a) >> 1)
-#define cputime_eq(__a, __b)		((__a) == (__b))
-#define cputime_gt(__a, __b)		((__a) > (__b))
-#define cputime_ge(__a, __b)		((__a) >= (__b))
-#define cputime_lt(__a, __b)		((__a) < (__b))
-#define cputime_le(__a, __b)		((__a) <= (__b))
-
-#define cputime64_zero			((cputime64_t)0)
-#define cputime64_add(__a, __b)		((__a) + (__b))
-#define cputime64_sub(__a, __b)		((__a) - (__b))
-#define cputime_to_cputime64(__ct)	(__ct)

 /*
  * Convert cputime <-> jiffies (HZ)
  */
-#define cputime_to_jiffies(__ct)	((__ct) / (NSEC_PER_SEC / HZ))
-#define jiffies_to_cputime(__jif)	((__jif) * (NSEC_PER_SEC / HZ))
-#define cputime64_to_jiffies64(__ct)	((__ct) / (NSEC_PER_SEC / HZ))
-#define jiffies64_to_cputime64(__jif)	((__jif) * (NSEC_PER_SEC / HZ))
+#define cputime_to_jiffies(__ct)	\
+	((__force u64)(__ct) / (NSEC_PER_SEC / HZ))
+#define jiffies_to_cputime(__jif)	\
+	(__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ))
+#define cputime64_to_jiffies64(__ct)	\
+	((__force u64)(__ct) / (NSEC_PER_SEC / HZ))
+#define jiffies64_to_cputime64(__jif)	\
+	(__force cputime64_t)((__jif) * (NSEC_PER_SEC / HZ))

 /*
  * Convert cputime <-> microseconds
  */
-#define cputime_to_usecs(__ct)		((__ct) / NSEC_PER_USEC)
-#define usecs_to_cputime(__usecs)	((__usecs) * NSEC_PER_USEC)
-#define usecs_to_cputime64(__usecs)	usecs_to_cputime(__usecs)
+#define cputime_to_usecs(__ct)		\
+	((__force u64)(__ct) / NSEC_PER_USEC)
+#define usecs_to_cputime(__usecs)	\
+	(__force cputime_t)((__usecs) * NSEC_PER_USEC)
+#define usecs_to_cputime64(__usecs)	\
+	(__force cputime64_t)((__usecs) * NSEC_PER_USEC)

 /*
  * Convert cputime <-> seconds
  */
-#define cputime_to_secs(__ct)		((__ct) / NSEC_PER_SEC)
-#define secs_to_cputime(__secs)		((__secs) * NSEC_PER_SEC)
+#define cputime_to_secs(__ct)		\
+	((__force u64)(__ct) / NSEC_PER_SEC)
+#define secs_to_cputime(__secs)		\
+	(__force cputime_t)((__secs) * NSEC_PER_SEC)

 /*
  * Convert cputime <-> timespec (nsec)
  */
 static inline cputime_t timespec_to_cputime(const struct timespec *val)
 {
-	cputime_t ret = val->tv_sec * NSEC_PER_SEC;
-	return (ret + val->tv_nsec);
+	u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_nsec;
+	return (__force cputime_t) ret;
 }
 static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val)
 {
-	val->tv_sec  = ct / NSEC_PER_SEC;
-	val->tv_nsec = ct % NSEC_PER_SEC;
+	val->tv_sec  = (__force u64) ct / NSEC_PER_SEC;
+	val->tv_nsec = (__force u64) ct % NSEC_PER_SEC;
 }

 /*
@@ -87,25 +80,28 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val)
  */
 static inline cputime_t timeval_to_cputime(struct timeval *val)
 {
-	cputime_t ret = val->tv_sec * NSEC_PER_SEC;
-	return (ret + val->tv_usec * NSEC_PER_USEC);
+	u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_usec * NSEC_PER_USEC;
+	return (__force cputime_t) ret;
 }
 static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val)
 {
-	val->tv_sec = ct / NSEC_PER_SEC;
-	val->tv_usec = (ct % NSEC_PER_SEC) / NSEC_PER_USEC;
+	val->tv_sec = (__force u64) ct / NSEC_PER_SEC;
+	val->tv_usec = ((__force u64) ct % NSEC_PER_SEC) / NSEC_PER_USEC;
 }

 /*
  * Convert cputime <-> clock (USER_HZ)
  */
-#define cputime_to_clock_t(__ct)	((__ct) / (NSEC_PER_SEC / USER_HZ))
-#define clock_t_to_cputime(__x)		((__x) * (NSEC_PER_SEC / USER_HZ))
+#define cputime_to_clock_t(__ct)	\
+	((__force u64)(__ct) / (NSEC_PER_SEC / USER_HZ))
+#define clock_t_to_cputime(__x)		\
+	(__force cputime_t)((__x) * (NSEC_PER_SEC / USER_HZ))

 /*
  * Convert cputime64 to clock.
  */
-#define cputime64_to_clock_t(__ct)	cputime_to_clock_t((cputime_t)__ct)
+#define cputime64_to_clock_t(__ct)	\
+	cputime_to_clock_t((__force cputime_t)__ct)

 #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
 #endif /* __IA64_CPUTIME_H */
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 98b7c4b49c9d..6ec1c380a4d6 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -29,25 +29,8 @@ static inline void setup_cputime_one_jiffy(void) { }
 #include <asm/time.h>
 #include <asm/param.h>

-typedef u64 cputime_t;
-typedef u64 cputime64_t;
-
-#define cputime_zero			((cputime_t)0)
-#define cputime_max			((~((cputime_t)0) >> 1) - 1)
-#define cputime_add(__a, __b)		((__a) + (__b))
-#define cputime_sub(__a, __b)		((__a) - (__b))
-#define cputime_div(__a, __n)		((__a) / (__n))
-#define cputime_halve(__a)		((__a) >> 1)
-#define cputime_eq(__a, __b)		((__a) == (__b))
-#define cputime_gt(__a, __b)		((__a) > (__b))
-#define cputime_ge(__a, __b)		((__a) >= (__b))
-#define cputime_lt(__a, __b)		((__a) < (__b))
-#define cputime_le(__a, __b)		((__a) <= (__b))
-
-#define cputime64_zero			((cputime64_t)0)
-#define cputime64_add(__a, __b)		((__a) + (__b))
-#define cputime64_sub(__a, __b)		((__a) - (__b))
-#define cputime_to_cputime64(__ct)	(__ct)
+typedef u64 __nocast cputime_t;
+typedef u64 __nocast cputime64_t;

 #ifdef __KERNEL__

@@ -65,7 +48,7 @@ DECLARE_PER_CPU(unsigned long, cputime_scaled_last_delta);

 static inline unsigned long cputime_to_jiffies(const cputime_t ct)
 {
-	return mulhdu(ct, __cputime_jiffies_factor);
+	return mulhdu((__force u64) ct, __cputime_jiffies_factor);
 }

 /* Estimate the scaled cputime by scaling the real cputime based on
@@ -74,14 +57,15 @@ static inline cputime_t cputime_to_scaled(const cputime_t ct)
 {
 	if (cpu_has_feature(CPU_FTR_SPURR) &&
 	    __get_cpu_var(cputime_last_delta))
-		return ct * __get_cpu_var(cputime_scaled_last_delta) /
-			__get_cpu_var(cputime_last_delta);
+		return (__force u64) ct *
+			__get_cpu_var(cputime_scaled_last_delta) /
+			__get_cpu_var(cputime_last_delta);
 	return ct;
 }

 static inline cputime_t jiffies_to_cputime(const unsigned long jif)
 {
-	cputime_t ct;
+	u64 ct;
 	unsigned long sec;

 	/* have to be a little careful about overflow */
@@ -93,7 +77,7 @@ static inline cputime_t jiffies_to_cputime(const unsigned long jif)
 	}
 	if (sec)
 		ct += (cputime_t) sec * tb_ticks_per_sec;
-	return ct;
+	return (__force cputime_t) ct;
 }

 static inline void setup_cputime_one_jiffy(void)
@@ -103,7 +87,7 @@ static inline void setup_cputime_one_jiffy(void)

 static inline cputime64_t jiffies64_to_cputime64(const u64 jif)
 {
-	cputime_t ct;
+	u64 ct;
 	u64 sec;

 	/* have to be a little careful about overflow */
@@ -114,13 +98,13 @@ static inline cputime64_t jiffies64_to_cputime64(const u64 jif)
 		do_div(ct, HZ);
 	}
 	if (sec)
-		ct += (cputime_t) sec * tb_ticks_per_sec;
-	return ct;
+		ct += (u64) sec * tb_ticks_per_sec;
+	return (__force cputime64_t) ct;
 }

 static inline u64 cputime64_to_jiffies64(const cputime_t ct)
 {
-	return mulhdu(ct, __cputime_jiffies_factor);
+	return mulhdu((__force u64) ct, __cputime_jiffies_factor);
 }

 /*
@@ -130,12 +114,12 @@ extern u64 __cputime_msec_factor;

 static inline unsigned long cputime_to_usecs(const cputime_t ct)
 {
-	return mulhdu(ct, __cputime_msec_factor) * USEC_PER_MSEC;
+	return mulhdu((__force u64) ct, __cputime_msec_factor) * USEC_PER_MSEC;
 }

 static inline cputime_t usecs_to_cputime(const unsigned long us)
 {
-	cputime_t ct;
+	u64 ct;
 	unsigned long sec;

 	/* have to be a little careful about overflow */
@@ -147,7 +131,7 @@ static inline cputime_t usecs_to_cputime(const unsigned long us)
 	}
 	if (sec)
 		ct += (cputime_t) sec * tb_ticks_per_sec;
-	return ct;
+	return (__force cputime_t) ct;
 }

 #define usecs_to_cputime64(us)		usecs_to_cputime(us)
@@ -159,12 +143,12 @@ extern u64 __cputime_sec_factor;

 static inline unsigned long cputime_to_secs(const cputime_t ct)
 {
-	return mulhdu(ct, __cputime_sec_factor);
+	return mulhdu((__force u64) ct, __cputime_sec_factor);
 }

 static inline cputime_t secs_to_cputime(const unsigned long sec)
 {
-	return (cputime_t) sec * tb_ticks_per_sec;
+	return (__force cputime_t)((u64) sec * tb_ticks_per_sec);
 }

 /*
@@ -172,7 +156,7 @@ static inline cputime_t secs_to_cputime(const unsigned long sec)
  */
 static inline void cputime_to_timespec(const cputime_t ct, struct timespec *p)
 {
-	u64 x = ct;
+	u64 x = (__force u64) ct;
 	unsigned int frac;

 	frac = do_div(x, tb_ticks_per_sec);
@@ -184,11 +168,11 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *p)

 static inline cputime_t timespec_to_cputime(const struct timespec *p)
 {
-	cputime_t ct;
+	u64 ct;

 	ct = (u64) p->tv_nsec * tb_ticks_per_sec;
 	do_div(ct, 1000000000);
-	return ct + (u64) p->tv_sec * tb_ticks_per_sec;
+	return (__force cputime_t)(ct + (u64) p->tv_sec * tb_ticks_per_sec);
 }

 /*
@@ -196,7 +180,7 @@ static inline cputime_t timespec_to_cputime(const struct timespec *p)
  */
 static inline void cputime_to_timeval(const cputime_t ct, struct timeval *p)
 {
-	u64 x = ct;
+	u64 x = (__force u64) ct;
 	unsigned int frac;

 	frac = do_div(x, tb_ticks_per_sec);
@@ -208,11 +192,11 @@ static inline void cputime_to_timeval(const cputime_t ct, struct timeval *p)

 static inline cputime_t timeval_to_cputime(const struct timeval *p)
 {
-	cputime_t ct;
+	u64 ct;

 	ct = (u64) p->tv_usec * tb_ticks_per_sec;
 	do_div(ct, 1000000);
-	return ct + (u64) p->tv_sec * tb_ticks_per_sec;
+	return (__force cputime_t)(ct + (u64) p->tv_sec * tb_ticks_per_sec);
 }

 /*
@@ -222,12 +206,12 @@ extern u64 __cputime_clockt_factor;

 static inline unsigned long cputime_to_clock_t(const cputime_t ct)
 {
-	return mulhdu(ct, __cputime_clockt_factor);
+	return mulhdu((__force u64) ct, __cputime_clockt_factor);
 }

 static inline cputime_t clock_t_to_cputime(const unsigned long clk)
 {
-	cputime_t ct;
+	u64 ct;
 	unsigned long sec;

 	/* have to be a little careful about overflow */
@@ -238,8 +222,8 @@ static inline cputime_t clock_t_to_cputime(const unsigned long clk)
 		do_div(ct, USER_HZ);
 	}
 	if (sec)
-		ct += (cputime_t) sec * tb_ticks_per_sec;
-	return ct;
+		ct += (u64) sec * tb_ticks_per_sec;
+	return (__force cputime_t) ct;
 }

 #define cputime64_to_clock_t(ct)	cputime_to_clock_t((cputime_t)(ct))
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index 92f1cb745d69..4de031d6b76c 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -115,21 +115,21 @@ static void appldata_get_os_data(void *data)
 	j = 0;
 	for_each_online_cpu(i) {
 		os_data->os_cpu[j].per_cpu_user =
-			cputime_to_jiffies(kstat_cpu(i).cpustat.user);
+			cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]);
 		os_data->os_cpu[j].per_cpu_nice =
-			cputime_to_jiffies(kstat_cpu(i).cpustat.nice);
+			cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]);
 		os_data->os_cpu[j].per_cpu_system =
-			cputime_to_jiffies(kstat_cpu(i).cpustat.system);
+			cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]);
 		os_data->os_cpu[j].per_cpu_idle =
-			cputime_to_jiffies(kstat_cpu(i).cpustat.idle);
+			cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]);
 		os_data->os_cpu[j].per_cpu_irq =
-			cputime_to_jiffies(kstat_cpu(i).cpustat.irq);
+			cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]);
 		os_data->os_cpu[j].per_cpu_softirq =
-			cputime_to_jiffies(kstat_cpu(i).cpustat.softirq);
+			cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]);
 		os_data->os_cpu[j].per_cpu_iowait =
-			cputime_to_jiffies(kstat_cpu(i).cpustat.iowait);
+			cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]);
 		os_data->os_cpu[j].per_cpu_steal =
-			cputime_to_jiffies(kstat_cpu(i).cpustat.steal);
+			cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]);
 		os_data->os_cpu[j].cpu_id = i;
 		j++;
 	}
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index b9acaaa175d8..c23c3900c304 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -16,75 +16,60 @@
 
 /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */

-typedef unsigned long long cputime_t;
-typedef unsigned long long cputime64_t;
+typedef unsigned long long __nocast cputime_t;
+typedef unsigned long long __nocast cputime64_t;

-#ifndef __s390x__
-
-static inline unsigned int
-__div(unsigned long long n, unsigned int base)
+static inline unsigned long __div(unsigned long long n, unsigned long base)
 {
+#ifndef __s390x__
 	register_pair rp;

 	rp.pair = n >> 1;
 	asm ("dr %0,%1" : "+d" (rp) : "d" (base >> 1));
 	return rp.subreg.odd;
+#else /* __s390x__ */
+	return n / base;
+#endif /* __s390x__ */
 }

-#else /* __s390x__ */
+#define cputime_one_jiffy		jiffies_to_cputime(1)

-static inline unsigned int
-__div(unsigned long long n, unsigned int base)
+/*
+ * Convert cputime to jiffies and back.
+ */
+static inline unsigned long cputime_to_jiffies(const cputime_t cputime)
 {
-	return n / base;
+	return __div((__force unsigned long long) cputime, 4096000000ULL / HZ);
 }

-#endif /* __s390x__ */
+static inline cputime_t jiffies_to_cputime(const unsigned int jif)
+{
+	return (__force cputime_t)(jif * (4096000000ULL / HZ));
+}

-#define cputime_zero			(0ULL)
-#define cputime_one_jiffy		jiffies_to_cputime(1)
-#define cputime_max			((~0UL >> 1) - 1)
-#define cputime_add(__a, __b)		((__a) + (__b))
-#define cputime_sub(__a, __b)		((__a) - (__b))
-#define cputime_div(__a, __n) ({ \
-	unsigned long long __div = (__a); \
-	do_div(__div,__n); \
-	__div; \
-})
-#define cputime_halve(__a)		((__a) >> 1)
-#define cputime_eq(__a, __b)		((__a) == (__b))
-#define cputime_gt(__a, __b)		((__a) > (__b))
-#define cputime_ge(__a, __b)		((__a) >= (__b))
-#define cputime_lt(__a, __b)		((__a) < (__b))
-#define cputime_le(__a, __b)		((__a) <= (__b))
-#define cputime_to_jiffies(__ct)	(__div((__ct), 4096000000ULL / HZ))
-#define cputime_to_scaled(__ct)		(__ct)
-#define jiffies_to_cputime(__hz)	((cputime_t)(__hz) * (4096000000ULL / HZ))
-
-#define cputime64_zero			(0ULL)
-#define cputime64_add(__a, __b)		((__a) + (__b))
-#define cputime_to_cputime64(__ct)	(__ct)
-
-static inline u64
-cputime64_to_jiffies64(cputime64_t cputime)
+static inline u64 cputime64_to_jiffies64(cputime64_t cputime)
 {
-	do_div(cputime, 4096000000ULL / HZ);
-	return cputime;
+	unsigned long long jif = (__force unsigned long long) cputime;
+	do_div(jif, 4096000000ULL / HZ);
+	return jif;
+}
+
+static inline cputime64_t jiffies64_to_cputime64(const u64 jif)
+{
+	return (__force cputime64_t)(jif * (4096000000ULL / HZ));
 }

 /*
  * Convert cputime to microseconds and back.
  */
-static inline unsigned int
-cputime_to_usecs(const cputime_t cputime)
+static inline unsigned int cputime_to_usecs(const cputime_t cputime)
 {
-	return cputime_div(cputime, 4096);
+	return (__force unsigned long long) cputime >> 12;
 }

-static inline cputime_t
-usecs_to_cputime(const unsigned int m)
+static inline cputime_t usecs_to_cputime(const unsigned int m)
 {
-	return (cputime_t) m * 4096;
+	return (__force cputime_t)(m * 4096ULL);
 }

 #define usecs_to_cputime64(m)		usecs_to_cputime(m)
@@ -92,40 +77,39 @@ usecs_to_cputime(const unsigned int m)
 /*
  * Convert cputime to milliseconds and back.
  */
-static inline unsigned int
-cputime_to_secs(const cputime_t cputime)
+static inline unsigned int cputime_to_secs(const cputime_t cputime)
 {
-	return __div(cputime, 2048000000) >> 1;
+	return __div((__force unsigned long long) cputime, 2048000000) >> 1;
 }

-static inline cputime_t
-secs_to_cputime(const unsigned int s)
+static inline cputime_t secs_to_cputime(const unsigned int s)
 {
-	return (cputime_t) s * 4096000000ULL;
+	return (__force cputime_t)(s * 4096000000ULL);
 }

 /*
  * Convert cputime to timespec and back.
  */
-static inline cputime_t
-timespec_to_cputime(const struct timespec *value)
+static inline cputime_t timespec_to_cputime(const struct timespec *value)
 {
-	return value->tv_nsec * 4096 / 1000 + (u64) value->tv_sec * 4096000000ULL;
+	unsigned long long ret = value->tv_sec * 4096000000ULL;
+	return (__force cputime_t)(ret + value->tv_nsec * 4096 / 1000);
 }

-static inline void
-cputime_to_timespec(const cputime_t cputime, struct timespec *value)
+static inline void cputime_to_timespec(const cputime_t cputime,
+				       struct timespec *value)
 {
+	unsigned long long __cputime = (__force unsigned long long) cputime;
 #ifndef __s390x__
 	register_pair rp;

-	rp.pair = cputime >> 1;
+	rp.pair = __cputime >> 1;
 	asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL));
 	value->tv_nsec = rp.subreg.even * 1000 / 4096;
 	value->tv_sec = rp.subreg.odd;
 #else
-	value->tv_nsec = (cputime % 4096000000ULL) * 1000 / 4096;
-	value->tv_sec = cputime / 4096000000ULL;
+	value->tv_nsec = (__cputime % 4096000000ULL) * 1000 / 4096;
+	value->tv_sec = __cputime / 4096000000ULL;
 #endif
 }

@@ -134,50 +118,52 @@ cputime_to_timespec(const cputime_t cputime, struct timespec *value)
  * Since cputime and timeval have the same resolution (microseconds)
  * this is easy.
  */
-static inline cputime_t
-timeval_to_cputime(const struct timeval *value)
+static inline cputime_t timeval_to_cputime(const struct timeval *value)
 {
-	return value->tv_usec * 4096 + (u64) value->tv_sec * 4096000000ULL;
+	unsigned long long ret = value->tv_sec * 4096000000ULL;
+	return (__force cputime_t)(ret + value->tv_usec * 4096ULL);
 }

-static inline void
-cputime_to_timeval(const cputime_t cputime, struct timeval *value)
+static inline void cputime_to_timeval(const cputime_t cputime,
+				      struct timeval *value)
 {
+	unsigned long long __cputime = (__force unsigned long long) cputime;
 #ifndef __s390x__
 	register_pair rp;

-	rp.pair = cputime >> 1;
+	rp.pair = __cputime >> 1;
 	asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL));
 	value->tv_usec = rp.subreg.even / 4096;
 	value->tv_sec = rp.subreg.odd;
 #else
-	value->tv_usec = (cputime % 4096000000ULL) / 4096;
-	value->tv_sec = cputime / 4096000000ULL;
+	value->tv_usec = (__cputime % 4096000000ULL) / 4096;
+	value->tv_sec = __cputime / 4096000000ULL;
 #endif
 }

 /*
  * Convert cputime to clock and back.
  */
-static inline clock_t
-cputime_to_clock_t(cputime_t cputime)
+static inline clock_t cputime_to_clock_t(cputime_t cputime)
 {
-	return cputime_div(cputime, 4096000000ULL / USER_HZ);
+	unsigned long long clock = (__force unsigned long long) cputime;
+	do_div(clock, 4096000000ULL / USER_HZ);
+	return clock;
 }

-static inline cputime_t
-clock_t_to_cputime(unsigned long x)
+static inline cputime_t clock_t_to_cputime(unsigned long x)
 {
-	return (cputime_t) x * (4096000000ULL / USER_HZ);
+	return (__force cputime_t)(x * (4096000000ULL / USER_HZ));
 }

 /*
  * Convert cputime64 to clock.
  */
-static inline clock_t
-cputime64_to_clock_t(cputime64_t cputime)
+static inline clock_t cputime64_to_clock_t(cputime64_t cputime)
 {
-	return cputime_div(cputime, 4096000000ULL / USER_HZ);
+	unsigned long long clock = (__force unsigned long long) cputime;
+	do_div(clock, 4096000000ULL / USER_HZ);
+	return clock;
 }

 struct s390_idle_data {
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c9e09ea05644..6919e936345b 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -218,7 +218,7 @@ static inline void fpu_fxsave(struct fpu *fpu)
 #ifdef CONFIG_SMP
 #define safe_address (__per_cpu_offset[0])
 #else
-#define safe_address (kstat_cpu(0).cpustat.user)
+#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER])
 #endif

 /*
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index c97b468ee9f7..235a340e81f2 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -95,27 +95,26 @@ static struct dbs_tuners {
 	.freq_step = 5,
 };

-static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
-						  cputime64_t *wall)
+static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
 {
-	cputime64_t idle_time;
-	cputime64_t cur_wall_time;
-	cputime64_t busy_time;
+	u64 idle_time;
+	u64 cur_wall_time;
+	u64 busy_time;

 	cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
-	busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
-			kstat_cpu(cpu).cpustat.system);

-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);
+	busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];

-	idle_time = cputime64_sub(cur_wall_time, busy_time);
+	idle_time = cur_wall_time - busy_time;
 	if (wall)
-		*wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
+		*wall = jiffies_to_usecs(cur_wall_time);

-	return (cputime64_t)jiffies_to_usecs(idle_time);
+	return jiffies_to_usecs(idle_time);
 }

 static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
@@ -272,7 +271,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
 		dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
 						&dbs_info->prev_cpu_wall);
 		if (dbs_tuners_ins.ignore_nice)
-			dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+			dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 	}
 	return count;
 }
@@ -353,20 +352,20 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)

 		cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);

-		wall_time = (unsigned int) cputime64_sub(cur_wall_time,
-				j_dbs_info->prev_cpu_wall);
+		wall_time = (unsigned int)
+			(cur_wall_time - j_dbs_info->prev_cpu_wall);
 		j_dbs_info->prev_cpu_wall = cur_wall_time;

-		idle_time = (unsigned int) cputime64_sub(cur_idle_time,
-				j_dbs_info->prev_cpu_idle);
+		idle_time = (unsigned int)
+			(cur_idle_time - j_dbs_info->prev_cpu_idle);
 		j_dbs_info->prev_cpu_idle = cur_idle_time;

 		if (dbs_tuners_ins.ignore_nice) {
-			cputime64_t cur_nice;
+			u64 cur_nice;
 			unsigned long cur_nice_jiffies;

-			cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
-					j_dbs_info->prev_cpu_nice);
+			cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
+					j_dbs_info->prev_cpu_nice;
 			/*
 			 * Assumption: nice time between sampling periods will
 			 * be less than 2^32 jiffies for 32 bit sys
@@ -374,7 +373,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 			cur_nice_jiffies = (unsigned long)
 					cputime64_to_jiffies64(cur_nice);

-			j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+			j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 			idle_time += jiffies_to_usecs(cur_nice_jiffies);
 		}

@@ -501,10 +500,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,

 			j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
 						&j_dbs_info->prev_cpu_wall);
-			if (dbs_tuners_ins.ignore_nice) {
+			if (dbs_tuners_ins.ignore_nice)
 				j_dbs_info->prev_cpu_nice =
-						kstat_cpu(j).cpustat.nice;
-			}
+					kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 		}
 		this_dbs_info->down_skip = 0;
 		this_dbs_info->requested_freq = policy->cur;
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index fa8af4ebb1d6..3d679eee70a1 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -119,27 +119,26 @@ static struct dbs_tuners {
 	.powersave_bias = 0,
 };

-static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu,
-						  cputime64_t *wall)
+static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
 {
-	cputime64_t idle_time;
-	cputime64_t cur_wall_time;
-	cputime64_t busy_time;
+	u64 idle_time;
+	u64 cur_wall_time;
+	u64 busy_time;

 	cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
-	busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
-			kstat_cpu(cpu).cpustat.system);

-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);
-	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice);
+	busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];

-	idle_time = cputime64_sub(cur_wall_time, busy_time);
+	idle_time = cur_wall_time - busy_time;
 	if (wall)
-		*wall = (cputime64_t)jiffies_to_usecs(cur_wall_time);
+		*wall = jiffies_to_usecs(cur_wall_time);

-	return (cputime64_t)jiffies_to_usecs(idle_time);
+	return jiffies_to_usecs(idle_time);
 }

 static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
@@ -345,7 +344,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b,
 		dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
 						&dbs_info->prev_cpu_wall);
 		if (dbs_tuners_ins.ignore_nice)
-			dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+			dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];

 	}
 	return count;
@@ -442,24 +441,24 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 		cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);
 		cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time);

-		wall_time = (unsigned int) cputime64_sub(cur_wall_time,
-				j_dbs_info->prev_cpu_wall);
+		wall_time = (unsigned int)
+			(cur_wall_time - j_dbs_info->prev_cpu_wall);
 		j_dbs_info->prev_cpu_wall = cur_wall_time;

-		idle_time = (unsigned int) cputime64_sub(cur_idle_time,
-				j_dbs_info->prev_cpu_idle);
+		idle_time = (unsigned int)
+			(cur_idle_time - j_dbs_info->prev_cpu_idle);
 		j_dbs_info->prev_cpu_idle = cur_idle_time;

-		iowait_time = (unsigned int) cputime64_sub(cur_iowait_time,
-				j_dbs_info->prev_cpu_iowait);
+		iowait_time = (unsigned int)
+			(cur_iowait_time - j_dbs_info->prev_cpu_iowait);
 		j_dbs_info->prev_cpu_iowait = cur_iowait_time;

 		if (dbs_tuners_ins.ignore_nice) {
-			cputime64_t cur_nice;
+			u64 cur_nice;
 			unsigned long cur_nice_jiffies;

-			cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice,
-					j_dbs_info->prev_cpu_nice);
+			cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] -
+					j_dbs_info->prev_cpu_nice;
 			/*
 			 * Assumption: nice time between sampling periods will
 			 * be less than 2^32 jiffies for 32 bit sys
@@ -467,7 +466,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
 			cur_nice_jiffies = (unsigned long)
 					cputime64_to_jiffies64(cur_nice);

-			j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice;
+			j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 			idle_time += jiffies_to_usecs(cur_nice_jiffies);
 		}

@@ -646,10 +645,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy,

 			j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
 						&j_dbs_info->prev_cpu_wall);
-			if (dbs_tuners_ins.ignore_nice) {
+			if (dbs_tuners_ins.ignore_nice)
 				j_dbs_info->prev_cpu_nice =
-						kstat_cpu(j).cpustat.nice;
-			}
+					kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 		}
 		this_dbs_info->cpu = cpu;
 		this_dbs_info->rate_mult = 1;
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index c5072a91e848..2a508edd768b 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -61,9 +61,8 @@ static int cpufreq_stats_update(unsigned int cpu)
 	spin_lock(&cpufreq_stats_lock);
 	stat = per_cpu(cpufreq_stats_table, cpu);
 	if (stat->time_in_state)
-		stat->time_in_state[stat->last_index] =
-			cputime64_add(stat->time_in_state[stat->last_index],
-				      cputime_sub(cur_time, stat->last_time));
+		stat->time_in_state[stat->last_index] +=
+			cur_time - stat->last_time;
 	stat->last_time = cur_time;
 	spin_unlock(&cpufreq_stats_lock);
 	return 0;
diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c
index 2637c139777b..6dc26b61219b 100644
--- a/drivers/macintosh/rack-meter.c
+++ b/drivers/macintosh/rack-meter.c
@@ -81,13 +81,13 @@ static int rackmeter_ignore_nice;
  */
 static inline cputime64_t get_cpu_idle_time(unsigned int cpu)
 {
-	cputime64_t retval;
+	u64 retval;

-	retval = cputime64_add(kstat_cpu(cpu).cpustat.idle,
-			kstat_cpu(cpu).cpustat.iowait);
+	retval = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE] +
+		 kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];

 	if (rackmeter_ignore_nice)
-		retval = cputime64_add(retval, kstat_cpu(cpu).cpustat.nice);
+		retval += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];

 	return retval;
 }
@@ -220,13 +220,11 @@ static void rackmeter_do_timer(struct work_struct *work)
 	int i, offset, load, cumm, pause;

 	cur_jiffies = jiffies64_to_cputime64(get_jiffies_64());
-	total_ticks = (unsigned int)cputime64_sub(cur_jiffies,
-						  rcpu->prev_wall);
+	total_ticks = (unsigned int) (cur_jiffies - rcpu->prev_wall);
 	rcpu->prev_wall = cur_jiffies;

 	total_idle_ticks = get_cpu_idle_time(cpu);
-	idle_ticks = (unsigned int) cputime64_sub(total_idle_ticks,
-				rcpu->prev_idle);
+	idle_ticks = (unsigned int) (total_idle_ticks - rcpu->prev_idle);
 	rcpu->prev_idle = total_idle_ticks;

 	/* We do a very dumb calculation to update the LEDs for now,
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3a1dafd228d1..8c344f037bd0 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -394,8 +394,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	sigemptyset(&sigign);
 	sigemptyset(&sigcatch);
-	cutime = cstime = utime = stime = cputime_zero;
-	cgtime = gtime = cputime_zero;
+	cutime = cstime = utime = stime = 0;
+	cgtime = gtime = 0;

 	if (lock_task_sighand(task, &flags)) {
 		struct signal_struct *sig = task->signal;
@@ -423,14 +423,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		do {
 			min_flt += t->min_flt;
 			maj_flt += t->maj_flt;
-			gtime = cputime_add(gtime, t->gtime);
+			gtime += t->gtime;
 			t = next_thread(t);
 		} while (t != task);

 		min_flt += sig->min_flt;
 		maj_flt += sig->maj_flt;
 		thread_group_times(task, &utime, &stime);
-		gtime = cputime_add(gtime, sig->gtime);
+		gtime += sig->gtime;
 	}

 	sid = task_session_nr_ns(task, ns);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 0855e6f20391..d76ca6ae2b1b 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -22,29 +22,27 @@
 #define arch_idle_time(cpu) 0
 #endif

-static cputime64_t get_idle_time(int cpu)
+static u64 get_idle_time(int cpu)
 {
-	u64 idle_time = get_cpu_idle_time_us(cpu, NULL);
-	cputime64_t idle;
+	u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL);

 	if (idle_time == -1ULL) {
 		/* !NO_HZ so we can rely on cpustat.idle */
-		idle = kstat_cpu(cpu).cpustat.idle;
-		idle = cputime64_add(idle, arch_idle_time(cpu));
+		idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+		idle += arch_idle_time(cpu);
 	} else
 		idle = usecs_to_cputime64(idle_time);

 	return idle;
 }

-static cputime64_t get_iowait_time(int cpu)
+static u64 get_iowait_time(int cpu)
 {
-	u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL);
-	cputime64_t iowait;
+	u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL);

 	if (iowait_time == -1ULL)
 		/* !NO_HZ so we can rely on cpustat.iowait */
-		iowait = kstat_cpu(cpu).cpustat.iowait;
+		iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
 	else
 		iowait = usecs_to_cputime64(iowait_time);

@@ -55,33 +53,30 @@ static int show_stat(struct seq_file *p, void *v)
 {
 	int i, j;
 	unsigned long jif;
-	cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
-	cputime64_t guest, guest_nice;
+	u64 user, nice, system, idle, iowait, irq, softirq, steal;
+	u64 guest, guest_nice;
 	u64 sum = 0;
 	u64 sum_softirq = 0;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
 	struct timespec boottime;

 	user = nice = system = idle = iowait =
-		irq = softirq = steal = cputime64_zero;
-	guest = guest_nice = cputime64_zero;
+		irq = softirq = steal = 0;
+	guest = guest_nice = 0;
 	getboottime(&boottime);
 	jif = boottime.tv_sec;

 	for_each_possible_cpu(i) {
-		user = cputime64_add(user, kstat_cpu(i).cpustat.user);
-		nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
-		system = cputime64_add(system, kstat_cpu(i).cpustat.system);
-		idle = cputime64_add(idle, get_idle_time(i));
-		iowait = cputime64_add(iowait, get_iowait_time(i));
-		irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
-		softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
-		steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
-		guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-		guest_nice = cputime64_add(guest_nice,
-			kstat_cpu(i).cpustat.guest_nice);
-		sum += kstat_cpu_irqs_sum(i);
-		sum += arch_irq_stat_cpu(i);
+		user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
+		nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
+		system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
+		idle += get_idle_time(i);
+		iowait += get_iowait_time(i);
+		irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
+		softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
+		steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
+		guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
+		guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];

 		for (j = 0; j < NR_SOFTIRQS; j++) {
 			unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
@@ -106,16 +101,16 @@ static int show_stat(struct seq_file *p, void *v)
 		(unsigned long long)cputime64_to_clock_t(guest_nice));
 	for_each_online_cpu(i) {
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
-		user = kstat_cpu(i).cpustat.user;
-		nice = kstat_cpu(i).cpustat.nice;
-		system = kstat_cpu(i).cpustat.system;
+		user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
+		nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
+		system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
 		idle = get_idle_time(i);
 		iowait = get_iowait_time(i);
-		irq = kstat_cpu(i).cpustat.irq;
-		softirq = kstat_cpu(i).cpustat.softirq;
-		steal = kstat_cpu(i).cpustat.steal;
-		guest = kstat_cpu(i).cpustat.guest;
-		guest_nice = kstat_cpu(i).cpustat.guest_nice;
+		irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
+		softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
+		steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
+		guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
+		guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
 		seq_printf(p,
 			"cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
 			"%llu\n",
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 766b1d456050..9610ac772d7e 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -11,15 +11,20 @@ static int uptime_proc_show(struct seq_file *m, void *v)
 {
 	struct timespec uptime;
 	struct timespec idle;
+	u64 idletime;
+	u64 nsec;
+	u32 rem;
 	int i;
-	cputime_t idletime = cputime_zero;

+	idletime = 0;
 	for_each_possible_cpu(i)
-		idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle);
+		idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];

 	do_posix_clock_monotonic_gettime(&uptime);
 	monotonic_to_bootbased(&uptime);
-	cputime_to_timespec(idletime, &idle);
+	nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
+	idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+	idle.tv_nsec = rem;
 	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
 			(unsigned long) uptime.tv_sec,
 			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
index 12a1764f612b..9a62937c56ca 100644
--- a/include/asm-generic/cputime.h
+++ b/include/asm-generic/cputime.h
@@ -4,71 +4,66 @@
 #include <linux/time.h>
 #include <linux/jiffies.h>

-typedef unsigned long cputime_t;
+typedef unsigned long __nocast cputime_t;

-#define cputime_zero			(0UL)
 #define cputime_one_jiffy		jiffies_to_cputime(1)
-#define cputime_max			((~0UL >> 1) - 1)
-#define cputime_add(__a, __b)		((__a) + (__b))
-#define cputime_sub(__a, __b)		((__a) - (__b))
-#define cputime_div(__a, __n)		((__a) / (__n))
-#define cputime_halve(__a)		((__a) >> 1)
-#define cputime_eq(__a, __b)		((__a) == (__b))
-#define cputime_gt(__a, __b)		((__a) > (__b))
-#define cputime_ge(__a, __b)		((__a) >= (__b))
-#define cputime_lt(__a, __b)		((__a) < (__b))
-#define cputime_le(__a, __b)		((__a) <= (__b))
-#define cputime_to_jiffies(__ct)	(__ct)
+#define cputime_to_jiffies(__ct)	(__force unsigned long)(__ct)
 #define cputime_to_scaled(__ct)		(__ct)
-#define jiffies_to_cputime(__hz)	(__hz)
+#define jiffies_to_cputime(__hz)	(__force cputime_t)(__hz)

-typedef u64 cputime64_t;
+typedef u64 __nocast cputime64_t;

-#define cputime64_zero			(0ULL)
-#define cputime64_add(__a, __b)		((__a) + (__b))
-#define cputime64_sub(__a, __b)		((__a) - (__b))
-#define cputime64_to_jiffies64(__ct)	(__ct)
-#define jiffies64_to_cputime64(__jif)	(__jif)
-#define cputime_to_cputime64(__ct)	((u64) __ct)
-#define cputime64_gt(__a, __b)		((__a) > (__b))
+#define cputime64_to_jiffies64(__ct)	(__force u64)(__ct)
+#define jiffies64_to_cputime64(__jif)	(__force cputime64_t)(__jif)

-#define nsecs_to_cputime64(__ct)	nsecs_to_jiffies64(__ct)
+#define nsecs_to_cputime64(__ct)	\
+	jiffies64_to_cputime64(nsecs_to_jiffies64(__ct))


 /*
  * Convert cputime to microseconds and back.
  */
-#define cputime_to_usecs(__ct)		jiffies_to_usecs(__ct)
-#define usecs_to_cputime(__msecs)	usecs_to_jiffies(__msecs)
-#define usecs_to_cputime64(__msecs)	nsecs_to_jiffies64((__msecs) * 1000)
+#define cputime_to_usecs(__ct)		\
+	jiffies_to_usecs(cputime_to_jiffies(__ct))
+#define usecs_to_cputime(__usec)	\
+	jiffies_to_cputime(usecs_to_jiffies(__usec))
+#define usecs_to_cputime64(__usec)	\
+	jiffies64_to_cputime64(nsecs_to_jiffies64((__usec) * 1000))

 /*
  * Convert cputime to seconds and back.
  */
-#define cputime_to_secs(jif)		((jif) / HZ)
-#define secs_to_cputime(sec)		((sec) * HZ)
+#define cputime_to_secs(jif)		(cputime_to_jiffies(jif) / HZ)
+#define secs_to_cputime(sec)		jiffies_to_cputime((sec) * HZ)

 /*
  * Convert cputime to timespec and back.
  */
-#define timespec_to_cputime(__val)	timespec_to_jiffies(__val)
-#define cputime_to_timespec(__ct,__val)	jiffies_to_timespec(__ct,__val)
+#define timespec_to_cputime(__val)	\
+	jiffies_to_cputime(timespec_to_jiffies(__val))
+#define cputime_to_timespec(__ct,__val)	\
+	jiffies_to_timespec(cputime_to_jiffies(__ct),__val)

 /*
  * Convert cputime to timeval and back.
  */
-#define timeval_to_cputime(__val)	timeval_to_jiffies(__val)
-#define cputime_to_timeval(__ct,__val)	jiffies_to_timeval(__ct,__val)
+#define timeval_to_cputime(__val)	\
+	jiffies_to_cputime(timeval_to_jiffies(__val))
+#define cputime_to_timeval(__ct,__val)	\
+	jiffies_to_timeval(cputime_to_jiffies(__ct),__val)

 /*
  * Convert cputime to clock and back.
  */
-#define cputime_to_clock_t(__ct)	jiffies_to_clock_t(__ct)
-#define clock_t_to_cputime(__x)		clock_t_to_jiffies(__x)
+#define cputime_to_clock_t(__ct)	\
+	jiffies_to_clock_t(cputime_to_jiffies(__ct))
+#define clock_t_to_cputime(__x)		\
+	jiffies_to_cputime(clock_t_to_jiffies(__x))

 /*
  * Convert cputime64 to clock.
  */
-#define cputime64_to_clock_t(__ct)	jiffies_64_to_clock_t(__ct)
+#define cputime64_to_clock_t(__ct)	\
+	jiffies_64_to_clock_t(cputime64_to_jiffies64(__ct))

 #endif
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 0cce2db580c3..2fbd9053c2df 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -6,6 +6,7 @@
6#include <linux/percpu.h> 6#include <linux/percpu.h>
7#include <linux/cpumask.h> 7#include <linux/cpumask.h>
8#include <linux/interrupt.h> 8#include <linux/interrupt.h>
9#include <linux/sched.h>
9#include <asm/irq.h> 10#include <asm/irq.h>
10#include <asm/cputime.h> 11#include <asm/cputime.h>
11 12
@@ -15,21 +16,25 @@
15 * used by rstatd/perfmeter 16 * used by rstatd/perfmeter
16 */ 17 */
17 18
18struct cpu_usage_stat { 19enum cpu_usage_stat {
19 cputime64_t user; 20 CPUTIME_USER,
20 cputime64_t nice; 21 CPUTIME_NICE,
21 cputime64_t system; 22 CPUTIME_SYSTEM,
22 cputime64_t softirq; 23 CPUTIME_SOFTIRQ,
23 cputime64_t irq; 24 CPUTIME_IRQ,
24 cputime64_t idle; 25 CPUTIME_IDLE,
25 cputime64_t iowait; 26 CPUTIME_IOWAIT,
26 cputime64_t steal; 27 CPUTIME_STEAL,
27 cputime64_t guest; 28 CPUTIME_GUEST,
28 cputime64_t guest_nice; 29 CPUTIME_GUEST_NICE,
30 NR_STATS,
31};
32
33struct kernel_cpustat {
34 u64 cpustat[NR_STATS];
29}; 35};
30 36
31struct kernel_stat { 37struct kernel_stat {
32 struct cpu_usage_stat cpustat;
33#ifndef CONFIG_GENERIC_HARDIRQS 38#ifndef CONFIG_GENERIC_HARDIRQS
34 unsigned int irqs[NR_IRQS]; 39 unsigned int irqs[NR_IRQS];
35#endif 40#endif
@@ -38,10 +43,13 @@ struct kernel_stat {
38}; 43};
39 44
40DECLARE_PER_CPU(struct kernel_stat, kstat); 45DECLARE_PER_CPU(struct kernel_stat, kstat);
46DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
41 47
42#define kstat_cpu(cpu) per_cpu(kstat, cpu)
43/* Must have preemption disabled for this to be meaningful. */ 48/* Must have preemption disabled for this to be meaningful. */
44#define kstat_this_cpu __get_cpu_var(kstat) 49#define kstat_this_cpu (&__get_cpu_var(kstat))
50#define kcpustat_this_cpu (&__get_cpu_var(kernel_cpustat))
51#define kstat_cpu(cpu) per_cpu(kstat, cpu)
52#define kcpustat_cpu(cpu) per_cpu(kernel_cpustat, cpu)
45 53
46extern unsigned long long nr_context_switches(void); 54extern unsigned long long nr_context_switches(void);
47 55
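
struct cpu_usage_stat's ten named cputime64_t fields become an enum-indexed u64 array inside the new struct kernel_cpustat, with per-cpu instances reachable through kcpustat_cpu()/kcpustat_this_cpu. A hedged sketch of the resulting access pattern, with the per-cpu machinery replaced by a plain four-entry array (account_tick() and the kcpustat array are this sketch's own; only the enum and struct layout mirror the header above):

#include <stdint.h>
#include <stdio.h>

enum cpu_usage_stat {                 /* mirrors the new enum in kernel_stat.h */
	CPUTIME_USER, CPUTIME_NICE, CPUTIME_SYSTEM, CPUTIME_SOFTIRQ,
	CPUTIME_IRQ, CPUTIME_IDLE, CPUTIME_IOWAIT, CPUTIME_STEAL,
	CPUTIME_GUEST, CPUTIME_GUEST_NICE, NR_STATS,
};

struct kernel_cpustat { uint64_t cpustat[NR_STATS]; };

/* one accumulator per "cpu" in this toy model, instead of DEFINE_PER_CPU */
static struct kernel_cpustat kcpustat[4];

/* account some ticks of a given category on a given cpu */
static void account_tick(int cpu, enum cpu_usage_stat idx, uint64_t ticks)
{
	kcpustat[cpu].cpustat[idx] += ticks;
}

int main(void)
{
	account_tick(0, CPUTIME_USER, 1);
	account_tick(0, CPUTIME_SYSTEM, 2);
	printf("cpu0 user=%llu system=%llu\n",
	       (unsigned long long)kcpustat[0].cpustat[CPUTIME_USER],
	       (unsigned long long)kcpustat[0].cpustat[CPUTIME_SYSTEM]);
	return 0;
}
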
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index b0e99898527c..e23121f9d82a 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -10,6 +10,8 @@
10#define _INCLUDE_GUARD_LATENCYTOP_H_ 10#define _INCLUDE_GUARD_LATENCYTOP_H_
11 11
12#include <linux/compiler.h> 12#include <linux/compiler.h>
13struct task_struct;
14
13#ifdef CONFIG_LATENCYTOP 15#ifdef CONFIG_LATENCYTOP
14 16
15#define LT_SAVECOUNT 32 17#define LT_SAVECOUNT 32
@@ -23,7 +25,6 @@ struct latency_record {
23}; 25};
24 26
25 27
26struct task_struct;
27 28
28extern int latencytop_enabled; 29extern int latencytop_enabled;
29void __account_scheduler_latency(struct task_struct *task, int usecs, int inter); 30void __account_scheduler_latency(struct task_struct *task, int usecs, int inter);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4a7e4d333a27..cf0eb342bcba 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -273,9 +273,11 @@ extern int runqueue_is_locked(int cpu);
273 273
274#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 274#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
275extern void select_nohz_load_balancer(int stop_tick); 275extern void select_nohz_load_balancer(int stop_tick);
276extern void set_cpu_sd_state_idle(void);
276extern int get_nohz_timer_target(void); 277extern int get_nohz_timer_target(void);
277#else 278#else
278static inline void select_nohz_load_balancer(int stop_tick) { } 279static inline void select_nohz_load_balancer(int stop_tick) { }
280static inline void set_cpu_sd_state_idle(void) { }
279#endif 281#endif
280 282
281/* 283/*
@@ -483,8 +485,8 @@ struct task_cputime {
483 485
484#define INIT_CPUTIME \ 486#define INIT_CPUTIME \
485 (struct task_cputime) { \ 487 (struct task_cputime) { \
486 .utime = cputime_zero, \ 488 .utime = 0, \
487 .stime = cputime_zero, \ 489 .stime = 0, \
488 .sum_exec_runtime = 0, \ 490 .sum_exec_runtime = 0, \
489 } 491 }
490 492
@@ -901,6 +903,10 @@ struct sched_group_power {
901 * single CPU. 903 * single CPU.
902 */ 904 */
903 unsigned int power, power_orig; 905 unsigned int power, power_orig;
906 /*
907 * Number of busy cpus in this group.
908 */
909 atomic_t nr_busy_cpus;
904}; 910};
905 911
906struct sched_group { 912struct sched_group {
@@ -925,6 +931,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
925 return to_cpumask(sg->cpumask); 931 return to_cpumask(sg->cpumask);
926} 932}
927 933
934/**
935 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
936 * @group: The group whose first cpu is to be returned.
937 */
938static inline unsigned int group_first_cpu(struct sched_group *group)
939{
940 return cpumask_first(sched_group_cpus(group));
941}
942
928struct sched_domain_attr { 943struct sched_domain_attr {
929 int relax_domain_level; 944 int relax_domain_level;
930}; 945};
@@ -1315,8 +1330,8 @@ struct task_struct {
1315 * older sibling, respectively. (p->father can be replaced with 1330 * older sibling, respectively. (p->father can be replaced with
1316 * p->real_parent->pid) 1331 * p->real_parent->pid)
1317 */ 1332 */
1318 struct task_struct *real_parent; /* real parent process */ 1333 struct task_struct __rcu *real_parent; /* real parent process */
1319 struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */ 1334 struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
1320 /* 1335 /*
1321 * children/sibling forms the list of my natural children 1336 * children/sibling forms the list of my natural children
1322 */ 1337 */
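
The sched.h hunk above adds a per-group nr_busy_cpus counter and a group_first_cpu() helper that returns the lowest-numbered CPU in a group's cpumask. A toy model of that first-cpu lookup, using a plain 64-bit bitmask instead of struct cpumask (mask_first_cpu() and the mask value are this sketch's own):

#include <stdio.h>

/* toy stand-in for cpumask_first(): lowest set bit, or 64 if the mask is empty */
static unsigned int mask_first_cpu(unsigned long long mask)
{
	unsigned int cpu;

	for (cpu = 0; cpu < 64; cpu++)
		if (mask & (1ULL << cpu))
			return cpu;
	return 64;
}

int main(void)
{
	unsigned long long group_mask = 0xF0;  /* cpus 4..7 */

	printf("first cpu in group: %u\n", mask_first_cpu(group_mask));
	return 0;
}
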
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 959ff18b63b6..6ba596b07a72 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -331,6 +331,13 @@ DEFINE_EVENT(sched_stat_template, sched_stat_iowait,
331 TP_ARGS(tsk, delay)); 331 TP_ARGS(tsk, delay));
332 332
333/* 333/*
334 * Tracepoint for accounting blocked time (time the task is in uninterruptible).
335 */
336DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
337 TP_PROTO(struct task_struct *tsk, u64 delay),
338 TP_ARGS(tsk, delay));
339
340/*
334 * Tracepoint for accounting runtime (time the task is executing 341 * Tracepoint for accounting runtime (time the task is executing
335 * on a CPU). 342 * on a CPU).
336 */ 343 */
@@ -363,6 +370,56 @@ TRACE_EVENT(sched_stat_runtime,
363 (unsigned long long)__entry->vruntime) 370 (unsigned long long)__entry->vruntime)
364); 371);
365 372
373#ifdef CREATE_TRACE_POINTS
374static inline u64 trace_get_sleeptime(struct task_struct *tsk)
375{
376#ifdef CONFIG_SCHEDSTATS
377 u64 block, sleep;
378
379 block = tsk->se.statistics.block_start;
380 sleep = tsk->se.statistics.sleep_start;
381 tsk->se.statistics.block_start = 0;
382 tsk->se.statistics.sleep_start = 0;
383
384 return block ? block : sleep ? sleep : 0;
385#else
386 return 0;
387#endif
388}
389#endif
390
391/*
392 * Tracepoint for accounting sleeptime (time the task is sleeping
393 * or waiting for I/O).
394 */
395TRACE_EVENT(sched_stat_sleeptime,
396
397 TP_PROTO(struct task_struct *tsk, u64 now),
398
399 TP_ARGS(tsk, now),
400
401 TP_STRUCT__entry(
402 __array( char, comm, TASK_COMM_LEN )
403 __field( pid_t, pid )
404 __field( u64, sleeptime )
405 ),
406
407 TP_fast_assign(
408 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
409 __entry->pid = tsk->pid;
410 __entry->sleeptime = trace_get_sleeptime(tsk);
411 __entry->sleeptime = __entry->sleeptime ?
412 now - __entry->sleeptime : 0;
413 )
414 TP_perf_assign(
415 __perf_count(__entry->sleeptime);
416 ),
417
418 TP_printk("comm=%s pid=%d sleeptime=%Lu [ns]",
419 __entry->comm, __entry->pid,
420 (unsigned long long)__entry->sleeptime)
421);
422
366/* 423/*
367 * Tracepoint for showing priority inheritance modifying a tasks 424 * Tracepoint for showing priority inheritance modifying a tasks
368 * priority. 425 * priority.
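
The new sched_stat_sleeptime event above records how long a task spent blocked or sleeping: trace_get_sleeptime() reads whichever of block_start/sleep_start is set, clears both, and the event then stores now minus that start, or 0 when neither was set. A stand-alone model of that pick-and-clear logic (struct task_stats, get_sleeptime() and the nanosecond values are this sketch's own, not the scheduler's types):

#include <stdint.h>
#include <stdio.h>

struct task_stats {
	uint64_t block_start;   /* ns timestamp when the task went uninterruptible */
	uint64_t sleep_start;   /* ns timestamp when the task went to sleep */
};

/* mirror of trace_get_sleeptime(): take and clear whichever start is set */
static uint64_t get_sleeptime(struct task_stats *st)
{
	uint64_t block = st->block_start, sleep = st->sleep_start;

	st->block_start = 0;
	st->sleep_start = 0;
	return block ? block : sleep ? sleep : 0;
}

int main(void)
{
	struct task_stats st = { .block_start = 0, .sleep_start = 1000 };
	uint64_t now = 5500;
	uint64_t start = get_sleeptime(&st);
	uint64_t sleeptime = start ? now - start : 0;

	printf("sleeptime=%llu ns\n", (unsigned long long)sleeptime);
	return 0;
}
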
diff --git a/kernel/Makefile b/kernel/Makefile
index e898c5b9d02c..f70396e5a24b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,16 +2,15 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o 13 async.o range.o groups.o
14obj-y += groups.o
15 14
16ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
17# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg
20CFLAGS_REMOVE_mutex-debug.o = -pg 19CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 20CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 21CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_irq_work.o = -pg 22CFLAGS_REMOVE_irq_work.o = -pg
25endif 23endif
26 24
25obj-y += sched/
26
27obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
28obj-$(CONFIG_PROFILING) += profile.o 28obj-$(CONFIG_PROFILING) += profile.o
29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
@@ -99,7 +99,6 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 100obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o
103obj-$(CONFIG_IRQ_WORK) += irq_work.o 102obj-$(CONFIG_IRQ_WORK) += irq_work.o
104obj-$(CONFIG_CPU_PM) += cpu_pm.o 103obj-$(CONFIG_CPU_PM) += cpu_pm.o
105 104
@@ -110,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
111obj-$(CONFIG_JUMP_LABEL) += jump_label.o 110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
112 111
113ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
114# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
115# needed for x86 only. Why this used to be enabled for all architectures is beyond
116# me. I suspect most platforms don't need this, but until we know that for sure
117# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
118# to get a correct value for the wait-channel (WCHAN in ps). --davidm
119CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
120endif
121
122$(obj)/configs.o: $(obj)/config_data.h 112$(obj)/configs.o: $(obj)/config_data.h
123 113
124# config_data.h contains the same information as ikconfig.h but gzipped. 114# config_data.h contains the same information as ikconfig.h but gzipped.
diff --git a/kernel/acct.c b/kernel/acct.c
index fa7eb3de2ddc..203dfead2e06 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -613,8 +613,8 @@ void acct_collect(long exitcode, int group_dead)
613 pacct->ac_flag |= ACORE; 613 pacct->ac_flag |= ACORE;
614 if (current->flags & PF_SIGNALED) 614 if (current->flags & PF_SIGNALED)
615 pacct->ac_flag |= AXSIG; 615 pacct->ac_flag |= AXSIG;
616 pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); 616 pacct->ac_utime += current->utime;
617 pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); 617 pacct->ac_stime += current->stime;
618 pacct->ac_minflt += current->min_flt; 618 pacct->ac_minflt += current->min_flt;
619 pacct->ac_majflt += current->maj_flt; 619 pacct->ac_majflt += current->maj_flt;
620 spin_unlock_irq(&current->sighand->siglock); 620 spin_unlock_irq(&current->sighand->siglock);
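
With cputime_t now behaving as an ordinary unsigned integer, cputime_add()/cputime_eq()/cputime_lt() and friends collapse into plain C operators; this acct.c hunk and the cpu.c, exit.c, fork.c and itimer.c hunks below all follow that one pattern. A trivial sketch of the equivalence (cputime_t modeled as unsigned long long; the wrapper definitions here are re-created only for comparison):

#include <stdio.h>

typedef unsigned long long cputime_t;

/* the old helper style, re-created as trivial wrappers for comparison */
static cputime_t cputime_add(cputime_t a, cputime_t b) { return a + b; }
static int cputime_eq(cputime_t a, cputime_t b)        { return a == b; }

int main(void)
{
	cputime_t ac_utime = 5, utime = 3;

	ac_utime = cputime_add(ac_utime, utime);   /* old style */
	ac_utime += utime;                         /* new style, same arithmetic */

	printf("ac_utime=%llu nonzero=%d\n", ac_utime, !cputime_eq(ac_utime, 0));
	return 0;
}
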
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9d448ddb2247..5ca38d5d238a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -178,8 +178,7 @@ static inline void check_for_tasks(int cpu)
178 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
179 for_each_process(p) { 179 for_each_process(p) {
180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
181 (!cputime_eq(p->utime, cputime_zero) || 181 (p->utime || p->stime))
182 !cputime_eq(p->stime, cputime_zero)))
183 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 182 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
184 "(state = %ld, flags = %x)\n", 183 "(state = %ld, flags = %x)\n",
185 p->comm, task_pid_nr(p), cpu, 184 p->comm, task_pid_nr(p), cpu,
diff --git a/kernel/exit.c b/kernel/exit.c
index e6e01b959a0e..d579a459309d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -121,9 +121,9 @@ static void __exit_signal(struct task_struct *tsk)
121 * We won't ever get here for the group leader, since it 121 * We won't ever get here for the group leader, since it
122 * will have been the last reference on the signal_struct. 122 * will have been the last reference on the signal_struct.
123 */ 123 */
124 sig->utime = cputime_add(sig->utime, tsk->utime); 124 sig->utime += tsk->utime;
125 sig->stime = cputime_add(sig->stime, tsk->stime); 125 sig->stime += tsk->stime;
126 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 126 sig->gtime += tsk->gtime;
127 sig->min_flt += tsk->min_flt; 127 sig->min_flt += tsk->min_flt;
128 sig->maj_flt += tsk->maj_flt; 128 sig->maj_flt += tsk->maj_flt;
129 sig->nvcsw += tsk->nvcsw; 129 sig->nvcsw += tsk->nvcsw;
@@ -1255,19 +1255,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1255 spin_lock_irq(&p->real_parent->sighand->siglock); 1255 spin_lock_irq(&p->real_parent->sighand->siglock);
1256 psig = p->real_parent->signal; 1256 psig = p->real_parent->signal;
1257 sig = p->signal; 1257 sig = p->signal;
1258 psig->cutime = 1258 psig->cutime += tgutime + sig->cutime;
1259 cputime_add(psig->cutime, 1259 psig->cstime += tgstime + sig->cstime;
1260 cputime_add(tgutime, 1260 psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
1261 sig->cutime));
1262 psig->cstime =
1263 cputime_add(psig->cstime,
1264 cputime_add(tgstime,
1265 sig->cstime));
1266 psig->cgtime =
1267 cputime_add(psig->cgtime,
1268 cputime_add(p->gtime,
1269 cputime_add(sig->gtime,
1270 sig->cgtime)));
1271 psig->cmin_flt += 1261 psig->cmin_flt +=
1272 p->min_flt + sig->min_flt + sig->cmin_flt; 1262 p->min_flt + sig->min_flt + sig->cmin_flt;
1273 psig->cmaj_flt += 1263 psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a10d088..b058c5820ecd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1023,8 +1023,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1023 */ 1023 */
1024static void posix_cpu_timers_init(struct task_struct *tsk) 1024static void posix_cpu_timers_init(struct task_struct *tsk)
1025{ 1025{
1026 tsk->cputime_expires.prof_exp = cputime_zero; 1026 tsk->cputime_expires.prof_exp = 0;
1027 tsk->cputime_expires.virt_exp = cputime_zero; 1027 tsk->cputime_expires.virt_exp = 0;
1028 tsk->cputime_expires.sched_exp = 0; 1028 tsk->cputime_expires.sched_exp = 0;
1029 INIT_LIST_HEAD(&tsk->cpu_timers[0]); 1029 INIT_LIST_HEAD(&tsk->cpu_timers[0]);
1030 INIT_LIST_HEAD(&tsk->cpu_timers[1]); 1030 INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1132,14 +1132,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1132 1132
1133 init_sigpending(&p->pending); 1133 init_sigpending(&p->pending);
1134 1134
1135 p->utime = cputime_zero; 1135 p->utime = p->stime = p->gtime = 0;
1136 p->stime = cputime_zero; 1136 p->utimescaled = p->stimescaled = 0;
1137 p->gtime = cputime_zero;
1138 p->utimescaled = cputime_zero;
1139 p->stimescaled = cputime_zero;
1140#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1137#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1141 p->prev_utime = cputime_zero; 1138 p->prev_utime = p->prev_stime = 0;
1142 p->prev_stime = cputime_zero;
1143#endif 1139#endif
1144#if defined(SPLIT_RSS_COUNTING) 1140#if defined(SPLIT_RSS_COUNTING)
1145 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1141 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
diff --git a/kernel/itimer.c b/kernel/itimer.c
index d802883153da..22000c3db0dd 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
52 52
53 cval = it->expires; 53 cval = it->expires;
54 cinterval = it->incr; 54 cinterval = it->incr;
55 if (!cputime_eq(cval, cputime_zero)) { 55 if (cval) {
56 struct task_cputime cputime; 56 struct task_cputime cputime;
57 cputime_t t; 57 cputime_t t;
58 58
59 thread_group_cputimer(tsk, &cputime); 59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF) 60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime_add(cputime.utime, cputime.stime); 61 t = cputime.utime + cputime.stime;
62 else 62 else
63 /* CPUCLOCK_VIRT */ 63 /* CPUCLOCK_VIRT */
64 t = cputime.utime; 64 t = cputime.utime;
65 65
66 if (cputime_le(cval, t)) 66 if (cval < t)
67 /* about to fire */ 67 /* about to fire */
68 cval = cputime_one_jiffy; 68 cval = cputime_one_jiffy;
69 else 69 else
70 cval = cputime_sub(cval, t); 70 cval = cval - t;
71 } 71 }
72 72
73 spin_unlock_irq(&tsk->sighand->siglock); 73 spin_unlock_irq(&tsk->sighand->siglock);
@@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
161 161
162 cval = it->expires; 162 cval = it->expires;
163 cinterval = it->incr; 163 cinterval = it->incr;
164 if (!cputime_eq(cval, cputime_zero) || 164 if (cval || nval) {
165 !cputime_eq(nval, cputime_zero)) { 165 if (nval > 0)
166 if (cputime_gt(nval, cputime_zero)) 166 nval += cputime_one_jiffy;
167 nval = cputime_add(nval, cputime_one_jiffy);
168 set_process_cpu_timer(tsk, clock_id, &nval, &cval); 167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
169 } 168 }
170 it->expires = nval; 169 it->expires = nval;
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e7cb76dc18f5..125cb67daa21 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock,
78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
79 return now.sched < then.sched; 79 return now.sched < then.sched;
80 } else { 80 } else {
81 return cputime_lt(now.cpu, then.cpu); 81 return now.cpu < then.cpu;
82 } 82 }
83} 83}
84static inline void cpu_time_add(const clockid_t which_clock, 84static inline void cpu_time_add(const clockid_t which_clock,
@@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock,
88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
89 acc->sched += val.sched; 89 acc->sched += val.sched;
90 } else { 90 } else {
91 acc->cpu = cputime_add(acc->cpu, val.cpu); 91 acc->cpu += val.cpu;
92 } 92 }
93} 93}
94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, 94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
@@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
99 a.sched -= b.sched; 99 a.sched -= b.sched;
100 } else { 100 } else {
101 a.cpu = cputime_sub(a.cpu, b.cpu); 101 a.cpu -= b.cpu;
102 } 102 }
103 return a; 103 return a;
104} 104}
105 105
106/* 106/*
107 * Divide and limit the result to res >= 1
108 *
109 * This is necessary to prevent signal delivery starvation, when the result of
110 * the division would be rounded down to 0.
111 */
112static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
113{
114 cputime_t res = cputime_div(time, div);
115
116 return max_t(cputime_t, res, 1);
117}
118
119/*
120 * Update expiry time from increment, and increase overrun count, 107 * Update expiry time from increment, and increase overrun count,
121 * given the current clock sample. 108 * given the current clock sample.
122 */ 109 */
@@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer,
148 } else { 135 } else {
149 cputime_t delta, incr; 136 cputime_t delta, incr;
150 137
151 if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) 138 if (now.cpu < timer->it.cpu.expires.cpu)
152 return; 139 return;
153 incr = timer->it.cpu.incr.cpu; 140 incr = timer->it.cpu.incr.cpu;
154 delta = cputime_sub(cputime_add(now.cpu, incr), 141 delta = now.cpu + incr - timer->it.cpu.expires.cpu;
155 timer->it.cpu.expires.cpu);
156 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 142 /* Don't use (incr*2 < delta), incr*2 might overflow. */
157 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) 143 for (i = 0; incr < delta - incr; i++)
158 incr = cputime_add(incr, incr); 144 incr += incr;
159 for (; i >= 0; incr = cputime_halve(incr), i--) { 145 for (; i >= 0; incr = incr >> 1, i--) {
160 if (cputime_lt(delta, incr)) 146 if (delta < incr)
161 continue; 147 continue;
162 timer->it.cpu.expires.cpu = 148 timer->it.cpu.expires.cpu += incr;
163 cputime_add(timer->it.cpu.expires.cpu, incr);
164 timer->it_overrun += 1 << i; 149 timer->it_overrun += 1 << i;
165 delta = cputime_sub(delta, incr); 150 delta -= incr;
166 } 151 }
167 } 152 }
168} 153}
169 154
170static inline cputime_t prof_ticks(struct task_struct *p) 155static inline cputime_t prof_ticks(struct task_struct *p)
171{ 156{
172 return cputime_add(p->utime, p->stime); 157 return p->utime + p->stime;
173} 158}
174static inline cputime_t virt_ticks(struct task_struct *p) 159static inline cputime_t virt_ticks(struct task_struct *p)
175{ 160{
@@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
248 233
249 t = tsk; 234 t = tsk;
250 do { 235 do {
251 times->utime = cputime_add(times->utime, t->utime); 236 times->utime += t->utime;
252 times->stime = cputime_add(times->stime, t->stime); 237 times->stime += t->stime;
253 times->sum_exec_runtime += task_sched_runtime(t); 238 times->sum_exec_runtime += task_sched_runtime(t);
254 } while_each_thread(tsk, t); 239 } while_each_thread(tsk, t);
255out: 240out:
@@ -258,10 +243,10 @@ out:
258 243
259static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 244static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
260{ 245{
261 if (cputime_gt(b->utime, a->utime)) 246 if (b->utime > a->utime)
262 a->utime = b->utime; 247 a->utime = b->utime;
263 248
264 if (cputime_gt(b->stime, a->stime)) 249 if (b->stime > a->stime)
265 a->stime = b->stime; 250 a->stime = b->stime;
266 251
267 if (b->sum_exec_runtime > a->sum_exec_runtime) 252 if (b->sum_exec_runtime > a->sum_exec_runtime)
@@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
306 return -EINVAL; 291 return -EINVAL;
307 case CPUCLOCK_PROF: 292 case CPUCLOCK_PROF:
308 thread_group_cputime(p, &cputime); 293 thread_group_cputime(p, &cputime);
309 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 294 cpu->cpu = cputime.utime + cputime.stime;
310 break; 295 break;
311 case CPUCLOCK_VIRT: 296 case CPUCLOCK_VIRT:
312 thread_group_cputime(p, &cputime); 297 thread_group_cputime(p, &cputime);
@@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head,
470 unsigned long long sum_exec_runtime) 455 unsigned long long sum_exec_runtime)
471{ 456{
472 struct cpu_timer_list *timer, *next; 457 struct cpu_timer_list *timer, *next;
473 cputime_t ptime = cputime_add(utime, stime); 458 cputime_t ptime = utime + stime;
474 459
475 list_for_each_entry_safe(timer, next, head, entry) { 460 list_for_each_entry_safe(timer, next, head, entry) {
476 list_del_init(&timer->entry); 461 list_del_init(&timer->entry);
477 if (cputime_lt(timer->expires.cpu, ptime)) { 462 if (timer->expires.cpu < ptime) {
478 timer->expires.cpu = cputime_zero; 463 timer->expires.cpu = 0;
479 } else { 464 } else {
480 timer->expires.cpu = cputime_sub(timer->expires.cpu, 465 timer->expires.cpu -= ptime;
481 ptime);
482 } 466 }
483 } 467 }
484 468
485 ++head; 469 ++head;
486 list_for_each_entry_safe(timer, next, head, entry) { 470 list_for_each_entry_safe(timer, next, head, entry) {
487 list_del_init(&timer->entry); 471 list_del_init(&timer->entry);
488 if (cputime_lt(timer->expires.cpu, utime)) { 472 if (timer->expires.cpu < utime) {
489 timer->expires.cpu = cputime_zero; 473 timer->expires.cpu = 0;
490 } else { 474 } else {
491 timer->expires.cpu = cputime_sub(timer->expires.cpu, 475 timer->expires.cpu -= utime;
492 utime);
493 } 476 }
494 } 477 }
495 478
@@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
520 struct signal_struct *const sig = tsk->signal; 503 struct signal_struct *const sig = tsk->signal;
521 504
522 cleanup_timers(tsk->signal->cpu_timers, 505 cleanup_timers(tsk->signal->cpu_timers,
523 cputime_add(tsk->utime, sig->utime), 506 tsk->utime + sig->utime, tsk->stime + sig->stime,
524 cputime_add(tsk->stime, sig->stime),
525 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 507 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
526} 508}
527 509
@@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
540 522
541static inline int expires_gt(cputime_t expires, cputime_t new_exp) 523static inline int expires_gt(cputime_t expires, cputime_t new_exp)
542{ 524{
543 return cputime_eq(expires, cputime_zero) || 525 return expires == 0 || expires > new_exp;
544 cputime_gt(expires, new_exp);
545} 526}
546 527
547/* 528/*
@@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
651 default: 632 default:
652 return -EINVAL; 633 return -EINVAL;
653 case CPUCLOCK_PROF: 634 case CPUCLOCK_PROF:
654 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 635 cpu->cpu = cputime.utime + cputime.stime;
655 break; 636 break;
656 case CPUCLOCK_VIRT: 637 case CPUCLOCK_VIRT:
657 cpu->cpu = cputime.utime; 638 cpu->cpu = cputime.utime;
@@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk,
918 unsigned long soft; 899 unsigned long soft;
919 900
920 maxfire = 20; 901 maxfire = 20;
921 tsk->cputime_expires.prof_exp = cputime_zero; 902 tsk->cputime_expires.prof_exp = 0;
922 while (!list_empty(timers)) { 903 while (!list_empty(timers)) {
923 struct cpu_timer_list *t = list_first_entry(timers, 904 struct cpu_timer_list *t = list_first_entry(timers,
924 struct cpu_timer_list, 905 struct cpu_timer_list,
925 entry); 906 entry);
926 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 907 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
927 tsk->cputime_expires.prof_exp = t->expires.cpu; 908 tsk->cputime_expires.prof_exp = t->expires.cpu;
928 break; 909 break;
929 } 910 }
@@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk,
933 914
934 ++timers; 915 ++timers;
935 maxfire = 20; 916 maxfire = 20;
936 tsk->cputime_expires.virt_exp = cputime_zero; 917 tsk->cputime_expires.virt_exp = 0;
937 while (!list_empty(timers)) { 918 while (!list_empty(timers)) {
938 struct cpu_timer_list *t = list_first_entry(timers, 919 struct cpu_timer_list *t = list_first_entry(timers,
939 struct cpu_timer_list, 920 struct cpu_timer_list,
940 entry); 921 entry);
941 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 922 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
942 tsk->cputime_expires.virt_exp = t->expires.cpu; 923 tsk->cputime_expires.virt_exp = t->expires.cpu;
943 break; 924 break;
944 } 925 }
@@ -1009,20 +990,19 @@ static u32 onecputick;
1009static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 990static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1010 cputime_t *expires, cputime_t cur_time, int signo) 991 cputime_t *expires, cputime_t cur_time, int signo)
1011{ 992{
1012 if (cputime_eq(it->expires, cputime_zero)) 993 if (!it->expires)
1013 return; 994 return;
1014 995
1015 if (cputime_ge(cur_time, it->expires)) { 996 if (cur_time >= it->expires) {
1016 if (!cputime_eq(it->incr, cputime_zero)) { 997 if (it->incr) {
1017 it->expires = cputime_add(it->expires, it->incr); 998 it->expires += it->incr;
1018 it->error += it->incr_error; 999 it->error += it->incr_error;
1019 if (it->error >= onecputick) { 1000 if (it->error >= onecputick) {
1020 it->expires = cputime_sub(it->expires, 1001 it->expires -= cputime_one_jiffy;
1021 cputime_one_jiffy);
1022 it->error -= onecputick; 1002 it->error -= onecputick;
1023 } 1003 }
1024 } else { 1004 } else {
1025 it->expires = cputime_zero; 1005 it->expires = 0;
1026 } 1006 }
1027 1007
1028 trace_itimer_expire(signo == SIGPROF ? 1008 trace_itimer_expire(signo == SIGPROF ?
@@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1031 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); 1011 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
1032 } 1012 }
1033 1013
1034 if (!cputime_eq(it->expires, cputime_zero) && 1014 if (it->expires && (!*expires || it->expires < *expires)) {
1035 (cputime_eq(*expires, cputime_zero) ||
1036 cputime_lt(it->expires, *expires))) {
1037 *expires = it->expires; 1015 *expires = it->expires;
1038 } 1016 }
1039} 1017}
@@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1048 */ 1026 */
1049static inline int task_cputime_zero(const struct task_cputime *cputime) 1027static inline int task_cputime_zero(const struct task_cputime *cputime)
1050{ 1028{
1051 if (cputime_eq(cputime->utime, cputime_zero) && 1029 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
1052 cputime_eq(cputime->stime, cputime_zero) &&
1053 cputime->sum_exec_runtime == 0)
1054 return 1; 1030 return 1;
1055 return 0; 1031 return 0;
1056} 1032}
@@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk,
1076 */ 1052 */
1077 thread_group_cputimer(tsk, &cputime); 1053 thread_group_cputimer(tsk, &cputime);
1078 utime = cputime.utime; 1054 utime = cputime.utime;
1079 ptime = cputime_add(utime, cputime.stime); 1055 ptime = utime + cputime.stime;
1080 sum_sched_runtime = cputime.sum_exec_runtime; 1056 sum_sched_runtime = cputime.sum_exec_runtime;
1081 maxfire = 20; 1057 maxfire = 20;
1082 prof_expires = cputime_zero; 1058 prof_expires = 0;
1083 while (!list_empty(timers)) { 1059 while (!list_empty(timers)) {
1084 struct cpu_timer_list *tl = list_first_entry(timers, 1060 struct cpu_timer_list *tl = list_first_entry(timers,
1085 struct cpu_timer_list, 1061 struct cpu_timer_list,
1086 entry); 1062 entry);
1087 if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { 1063 if (!--maxfire || ptime < tl->expires.cpu) {
1088 prof_expires = tl->expires.cpu; 1064 prof_expires = tl->expires.cpu;
1089 break; 1065 break;
1090 } 1066 }
@@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk,
1094 1070
1095 ++timers; 1071 ++timers;
1096 maxfire = 20; 1072 maxfire = 20;
1097 virt_expires = cputime_zero; 1073 virt_expires = 0;
1098 while (!list_empty(timers)) { 1074 while (!list_empty(timers)) {
1099 struct cpu_timer_list *tl = list_first_entry(timers, 1075 struct cpu_timer_list *tl = list_first_entry(timers,
1100 struct cpu_timer_list, 1076 struct cpu_timer_list,
1101 entry); 1077 entry);
1102 if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { 1078 if (!--maxfire || utime < tl->expires.cpu) {
1103 virt_expires = tl->expires.cpu; 1079 virt_expires = tl->expires.cpu;
1104 break; 1080 break;
1105 } 1081 }
@@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk,
1154 } 1130 }
1155 } 1131 }
1156 x = secs_to_cputime(soft); 1132 x = secs_to_cputime(soft);
1157 if (cputime_eq(prof_expires, cputime_zero) || 1133 if (!prof_expires || x < prof_expires) {
1158 cputime_lt(x, prof_expires)) {
1159 prof_expires = x; 1134 prof_expires = x;
1160 } 1135 }
1161 } 1136 }
@@ -1249,12 +1224,9 @@ out:
1249static inline int task_cputime_expired(const struct task_cputime *sample, 1224static inline int task_cputime_expired(const struct task_cputime *sample,
1250 const struct task_cputime *expires) 1225 const struct task_cputime *expires)
1251{ 1226{
1252 if (!cputime_eq(expires->utime, cputime_zero) && 1227 if (expires->utime && sample->utime >= expires->utime)
1253 cputime_ge(sample->utime, expires->utime))
1254 return 1; 1228 return 1;
1255 if (!cputime_eq(expires->stime, cputime_zero) && 1229 if (expires->stime && sample->utime + sample->stime >= expires->stime)
1256 cputime_ge(cputime_add(sample->utime, sample->stime),
1257 expires->stime))
1258 return 1; 1230 return 1;
1259 if (expires->sum_exec_runtime != 0 && 1231 if (expires->sum_exec_runtime != 0 &&
1260 sample->sum_exec_runtime >= expires->sum_exec_runtime) 1232 sample->sum_exec_runtime >= expires->sum_exec_runtime)
@@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1389 * it to be relative, *newval argument is relative and we update 1361 * it to be relative, *newval argument is relative and we update
1390 * it to be absolute. 1362 * it to be absolute.
1391 */ 1363 */
1392 if (!cputime_eq(*oldval, cputime_zero)) { 1364 if (*oldval) {
1393 if (cputime_le(*oldval, now.cpu)) { 1365 if (*oldval <= now.cpu) {
1394 /* Just about to fire. */ 1366 /* Just about to fire. */
1395 *oldval = cputime_one_jiffy; 1367 *oldval = cputime_one_jiffy;
1396 } else { 1368 } else {
1397 *oldval = cputime_sub(*oldval, now.cpu); 1369 *oldval -= now.cpu;
1398 } 1370 }
1399 } 1371 }
1400 1372
1401 if (cputime_eq(*newval, cputime_zero)) 1373 if (!*newval)
1402 return; 1374 return;
1403 *newval = cputime_add(*newval, now.cpu); 1375 *newval += now.cpu;
1404 } 1376 }
1405 1377
1406 /* 1378 /*
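
The bump_cpu_timer() hunk above advances a periodic CPU timer past the current sample without dividing: it doubles the increment while incr < delta - incr (rather than testing incr*2 < delta, which might overflow), then halves its way back down, adding each multiple that still fits and crediting 1 << i overruns per step. A self-contained sketch of that loop on plain unsigned arithmetic (cputime_t modeled as unsigned long long; bump_timer(), its parameters and the driver in main() are this sketch's own names):

#include <stdio.h>

/* Advance *expires past 'now' in steps of 'incr' and return how many
 * whole periods were skipped, using the same doubling/halving walk as
 * the simplified bump_cpu_timer() above. */
static unsigned long bump_timer(unsigned long long *expires,
				unsigned long long incr,
				unsigned long long now)
{
	unsigned long overrun = 0;
	unsigned long long delta;
	int i;

	if (now < *expires)
		return 0;                       /* not expired yet */

	delta = now + incr - *expires;
	/* Don't test (incr*2 < delta): incr*2 might overflow. */
	for (i = 0; incr < delta - incr; i++)
		incr += incr;
	for (; i >= 0; incr >>= 1, i--) {
		if (delta < incr)
			continue;
		*expires += incr;
		overrun += 1UL << i;
		delta -= incr;
	}
	return overrun;
}

int main(void)
{
	unsigned long long expires = 100, incr = 7, now = 1000;
	unsigned long n = bump_timer(&expires, incr, now);

	/* prints: skipped 129 periods, next expiry 1003 */
	printf("skipped %lu periods, next expiry %llu\n", n, expires);
	return 0;
}
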
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
new file mode 100644
index 000000000000..9a7dd35102a3
--- /dev/null
+++ b/kernel/sched/Makefile
@@ -0,0 +1,20 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg
3endif
4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
6# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
7# needed for x86 only. Why this used to be enabled for all architectures is beyond
8# me. I suspect most platforms don't need this, but until we know that for sure
9# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
10# to get a correct value for the wait-channel (WCHAN in ps). --davidm
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif
13
14obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c
index 429242f3c484..e8a1f83ee0e7 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched/auto_group.c
@@ -1,15 +1,19 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include "sched.h"
4
3#include <linux/proc_fs.h> 5#include <linux/proc_fs.h>
4#include <linux/seq_file.h> 6#include <linux/seq_file.h>
5#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
6#include <linux/utsname.h> 8#include <linux/utsname.h>
9#include <linux/security.h>
10#include <linux/export.h>
7 11
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 12unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default; 13static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr; 14static atomic_t autogroup_seq_nr;
11 15
12static void __init autogroup_init(struct task_struct *init_task) 16void __init autogroup_init(struct task_struct *init_task)
13{ 17{
14 autogroup_default.tg = &root_task_group; 18 autogroup_default.tg = &root_task_group;
15 kref_init(&autogroup_default.kref); 19 kref_init(&autogroup_default.kref);
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task)
17 init_task->signal->autogroup = &autogroup_default; 21 init_task->signal->autogroup = &autogroup_default;
18} 22}
19 23
20static inline void autogroup_free(struct task_group *tg) 24void autogroup_free(struct task_group *tg)
21{ 25{
22 kfree(tg->autogroup); 26 kfree(tg->autogroup);
23} 27}
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
59 return ag; 63 return ag;
60} 64}
61 65
62#ifdef CONFIG_RT_GROUP_SCHED
63static void free_rt_sched_group(struct task_group *tg);
64#endif
65
66static inline struct autogroup *autogroup_create(void) 66static inline struct autogroup *autogroup_create(void)
67{ 67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); 68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -108,8 +108,7 @@ out_fail:
108 return autogroup_kref_get(&autogroup_default); 108 return autogroup_kref_get(&autogroup_default);
109} 109}
110 110
111static inline bool 111bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
112task_wants_autogroup(struct task_struct *p, struct task_group *tg)
113{ 112{
114 if (tg != &root_task_group) 113 if (tg != &root_task_group)
115 return false; 114 return false;
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
127 return true; 126 return true;
128} 127}
129 128
130static inline bool task_group_is_autogroup(struct task_group *tg)
131{
132 return !!tg->autogroup;
133}
134
135static inline struct task_group *
136autogroup_task_group(struct task_struct *p, struct task_group *tg)
137{
138 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
139
140 if (enabled && task_wants_autogroup(p, tg))
141 return p->signal->autogroup->tg;
142
143 return tg;
144}
145
146static void 129static void
147autogroup_move_group(struct task_struct *p, struct autogroup *ag) 130autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148{ 131{
@@ -263,7 +246,7 @@ out:
263#endif /* CONFIG_PROC_FS */ 246#endif /* CONFIG_PROC_FS */
264 247
265#ifdef CONFIG_SCHED_DEBUG 248#ifdef CONFIG_SCHED_DEBUG
266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 249int autogroup_path(struct task_group *tg, char *buf, int buflen)
267{ 250{
268 if (!task_group_is_autogroup(tg)) 251 if (!task_group_is_autogroup(tg))
269 return 0; 252 return 0;
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h
index c2f0e7248dca..8bd047142816 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched/auto_group.h
@@ -1,5 +1,8 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include <linux/kref.h>
4#include <linux/rwsem.h>
5
3struct autogroup { 6struct autogroup {
4 /* 7 /*
5 * reference doesn't mean how many thread attach to this 8 * reference doesn't mean how many thread attach to this
@@ -13,9 +16,28 @@ struct autogroup {
13 int nice; 16 int nice;
14}; 17};
15 18
16static inline bool task_group_is_autogroup(struct task_group *tg); 19extern void autogroup_init(struct task_struct *init_task);
20extern void autogroup_free(struct task_group *tg);
21
22static inline bool task_group_is_autogroup(struct task_group *tg)
23{
24 return !!tg->autogroup;
25}
26
27extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
28
17static inline struct task_group * 29static inline struct task_group *
18autogroup_task_group(struct task_struct *p, struct task_group *tg); 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
33
34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg;
36
37 return tg;
38}
39
40extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
19 41
20#else /* !CONFIG_SCHED_AUTOGROUP */ 42#else /* !CONFIG_SCHED_AUTOGROUP */
21 43
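
task_group_is_autogroup() and autogroup_task_group() move from auto_group.c into this header as inlines: when autogrouping is enabled and task_wants_autogroup() accepts the task, accounting uses the session autogroup's task_group instead of the group passed in. A simplified stand-alone sketch of that selection (struct task, wants_autogroup and autogroup_tg are this sketch's own stand-ins for the kernel structures and for task_wants_autogroup()):

#include <stdbool.h>
#include <stdio.h>

struct task_group { const char *name; };

struct task {
	bool wants_autogroup;              /* stands in for task_wants_autogroup() */
	struct task_group *autogroup_tg;   /* stands in for p->signal->autogroup->tg */
};

static bool sysctl_sched_autogroup_enabled = true;

/* simplified model of the inline autogroup_task_group() now in auto_group.h */
static struct task_group *autogroup_task_group(struct task *p,
					       struct task_group *tg)
{
	if (sysctl_sched_autogroup_enabled && p->wants_autogroup)
		return p->autogroup_tg;
	return tg;
}

int main(void)
{
	struct task_group root = { "root" }, ag = { "autogroup-42" };
	struct task p = { .wants_autogroup = true, .autogroup_tg = &ag };

	printf("accounted in: %s\n", autogroup_task_group(&p, &root)->name);
	return 0;
}
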
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c
index c685e31492df..c685e31492df 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched/clock.c
diff --git a/kernel/sched.c b/kernel/sched/core.c
index d6b149ccf925..4dbfd04a2148 100644
--- a/kernel/sched.c
+++ b/kernel/sched/core.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched.c 2 * kernel/sched/core.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
@@ -56,7 +56,6 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/proc_fs.h> 57#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 58#include <linux/seq_file.h>
59#include <linux/stop_machine.h>
60#include <linux/sysctl.h> 59#include <linux/sysctl.h>
61#include <linux/syscalls.h> 60#include <linux/syscalls.h>
62#include <linux/times.h> 61#include <linux/times.h>
@@ -75,129 +74,17 @@
75 74
76#include <asm/tlb.h> 75#include <asm/tlb.h>
77#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
78#include <asm/mutex.h>
79#ifdef CONFIG_PARAVIRT 77#ifdef CONFIG_PARAVIRT
80#include <asm/paravirt.h> 78#include <asm/paravirt.h>
81#endif 79#endif
82 80
83#include "sched_cpupri.h" 81#include "sched.h"
84#include "workqueue_sched.h" 82#include "../workqueue_sched.h"
85#include "sched_autogroup.h"
86 83
87#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 85#include <trace/events/sched.h>
89 86
90/* 87void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
91 * Convert user-nice values [ -20 ... 0 ... 19 ]
92 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
93 * and back.
94 */
95#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
96#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
97#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
98
99/*
100 * 'User priority' is the nice value converted to something we
101 * can work with better when scaling various scheduler parameters,
102 * it's a [ 0 ... 39 ] range.
103 */
104#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
105#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
106#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
107
108/*
109 * Helpers for converting nanosecond timing to jiffy resolution
110 */
111#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
112
113#define NICE_0_LOAD SCHED_LOAD_SCALE
114#define NICE_0_SHIFT SCHED_LOAD_SHIFT
115
116/*
117 * These are the 'tuning knobs' of the scheduler:
118 *
119 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
120 * Timeslices get refilled after they expire.
121 */
122#define DEF_TIMESLICE (100 * HZ / 1000)
123
124/*
125 * single value that denotes runtime == period, ie unlimited time.
126 */
127#define RUNTIME_INF ((u64)~0ULL)
128
129static inline int rt_policy(int policy)
130{
131 if (policy == SCHED_FIFO || policy == SCHED_RR)
132 return 1;
133 return 0;
134}
135
136static inline int task_has_rt_policy(struct task_struct *p)
137{
138 return rt_policy(p->policy);
139}
140
141/*
142 * This is the priority-queue data structure of the RT scheduling class:
143 */
144struct rt_prio_array {
145 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
146 struct list_head queue[MAX_RT_PRIO];
147};
148
149struct rt_bandwidth {
150 /* nests inside the rq lock: */
151 raw_spinlock_t rt_runtime_lock;
152 ktime_t rt_period;
153 u64 rt_runtime;
154 struct hrtimer rt_period_timer;
155};
156
157static struct rt_bandwidth def_rt_bandwidth;
158
159static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
160
161static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
162{
163 struct rt_bandwidth *rt_b =
164 container_of(timer, struct rt_bandwidth, rt_period_timer);
165 ktime_t now;
166 int overrun;
167 int idle = 0;
168
169 for (;;) {
170 now = hrtimer_cb_get_time(timer);
171 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
172
173 if (!overrun)
174 break;
175
176 idle = do_sched_rt_period_timer(rt_b, overrun);
177 }
178
179 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
180}
181
182static
183void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
184{
185 rt_b->rt_period = ns_to_ktime(period);
186 rt_b->rt_runtime = runtime;
187
188 raw_spin_lock_init(&rt_b->rt_runtime_lock);
189
190 hrtimer_init(&rt_b->rt_period_timer,
191 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
192 rt_b->rt_period_timer.function = sched_rt_period_timer;
193}
194
195static inline int rt_bandwidth_enabled(void)
196{
197 return sysctl_sched_rt_runtime >= 0;
198}
199
200static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
201{ 88{
202 unsigned long delta; 89 unsigned long delta;
203 ktime_t soft, hard, now; 90 ktime_t soft, hard, now;
@@ -217,580 +104,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
217 } 104 }
218} 105}
219 106
220static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 107DEFINE_MUTEX(sched_domains_mutex);
221{ 108DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
222 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
223 return;
224
225 if (hrtimer_active(&rt_b->rt_period_timer))
226 return;
227
228 raw_spin_lock(&rt_b->rt_runtime_lock);
229 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
230 raw_spin_unlock(&rt_b->rt_runtime_lock);
231}
232
233#ifdef CONFIG_RT_GROUP_SCHED
234static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
235{
236 hrtimer_cancel(&rt_b->rt_period_timer);
237}
238#endif
239
240/*
241 * sched_domains_mutex serializes calls to init_sched_domains,
242 * detach_destroy_domains and partition_sched_domains.
243 */
244static DEFINE_MUTEX(sched_domains_mutex);
245
246#ifdef CONFIG_CGROUP_SCHED
247
248#include <linux/cgroup.h>
249
250struct cfs_rq;
251
252static LIST_HEAD(task_groups);
253
254struct cfs_bandwidth {
255#ifdef CONFIG_CFS_BANDWIDTH
256 raw_spinlock_t lock;
257 ktime_t period;
258 u64 quota, runtime;
259 s64 hierarchal_quota;
260 u64 runtime_expires;
261
262 int idle, timer_active;
263 struct hrtimer period_timer, slack_timer;
264 struct list_head throttled_cfs_rq;
265
266 /* statistics */
267 int nr_periods, nr_throttled;
268 u64 throttled_time;
269#endif
270};
271
272/* task group related information */
273struct task_group {
274 struct cgroup_subsys_state css;
275
276#ifdef CONFIG_FAIR_GROUP_SCHED
277 /* schedulable entities of this group on each cpu */
278 struct sched_entity **se;
279 /* runqueue "owned" by this group on each cpu */
280 struct cfs_rq **cfs_rq;
281 unsigned long shares;
282
283 atomic_t load_weight;
284#endif
285
286#ifdef CONFIG_RT_GROUP_SCHED
287 struct sched_rt_entity **rt_se;
288 struct rt_rq **rt_rq;
289
290 struct rt_bandwidth rt_bandwidth;
291#endif
292
293 struct rcu_head rcu;
294 struct list_head list;
295
296 struct task_group *parent;
297 struct list_head siblings;
298 struct list_head children;
299
300#ifdef CONFIG_SCHED_AUTOGROUP
301 struct autogroup *autogroup;
302#endif
303
304 struct cfs_bandwidth cfs_bandwidth;
305};
306
307/* task_group_lock serializes the addition/removal of task groups */
308static DEFINE_SPINLOCK(task_group_lock);
309
310#ifdef CONFIG_FAIR_GROUP_SCHED
311
312# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
313
314/*
315 * A weight of 0 or 1 can cause arithmetics problems.
316 * A weight of a cfs_rq is the sum of weights of which entities
317 * are queued on this cfs_rq, so a weight of a entity should not be
318 * too large, so as the shares value of a task group.
319 * (The default weight is 1024 - so there's no practical
320 * limitation from this.)
321 */
322#define MIN_SHARES (1UL << 1)
323#define MAX_SHARES (1UL << 18)
324
325static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
326#endif
327
328/* Default task group.
329 * Every task in system belong to this group at bootup.
330 */
331struct task_group root_task_group;
332
333#endif /* CONFIG_CGROUP_SCHED */
334
335/* CFS-related fields in a runqueue */
336struct cfs_rq {
337 struct load_weight load;
338 unsigned long nr_running, h_nr_running;
339
340 u64 exec_clock;
341 u64 min_vruntime;
342#ifndef CONFIG_64BIT
343 u64 min_vruntime_copy;
344#endif
345
346 struct rb_root tasks_timeline;
347 struct rb_node *rb_leftmost;
348
349 struct list_head tasks;
350 struct list_head *balance_iterator;
351
352 /*
353 * 'curr' points to currently running entity on this cfs_rq.
354 * It is set to NULL otherwise (i.e when none are currently running).
355 */
356 struct sched_entity *curr, *next, *last, *skip;
357
358#ifdef CONFIG_SCHED_DEBUG
359 unsigned int nr_spread_over;
360#endif
361
362#ifdef CONFIG_FAIR_GROUP_SCHED
363 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
364
365 /*
366 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
367 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
368 * (like users, containers etc.)
369 *
370 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
371 * list is used during load balance.
372 */
373 int on_list;
374 struct list_head leaf_cfs_rq_list;
375 struct task_group *tg; /* group that "owns" this runqueue */
376
377#ifdef CONFIG_SMP
378 /*
379 * the part of load.weight contributed by tasks
380 */
381 unsigned long task_weight;
382
383 /*
384 * h_load = weight * f(tg)
385 *
386 * Where f(tg) is the recursive weight fraction assigned to
387 * this group.
388 */
389 unsigned long h_load;
390
391 /*
392 * Maintaining per-cpu shares distribution for group scheduling
393 *
394 * load_stamp is the last time we updated the load average
395 * load_last is the last time we updated the load average and saw load
396 * load_unacc_exec_time is currently unaccounted execution time
397 */
398 u64 load_avg;
399 u64 load_period;
400 u64 load_stamp, load_last, load_unacc_exec_time;
401
402 unsigned long load_contribution;
403#endif
404#ifdef CONFIG_CFS_BANDWIDTH
405 int runtime_enabled;
406 u64 runtime_expires;
407 s64 runtime_remaining;
408
409 u64 throttled_timestamp;
410 int throttled, throttle_count;
411 struct list_head throttled_list;
412#endif
413#endif
414};
415
416#ifdef CONFIG_FAIR_GROUP_SCHED
417#ifdef CONFIG_CFS_BANDWIDTH
418static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
419{
420 return &tg->cfs_bandwidth;
421}
422
423static inline u64 default_cfs_period(void);
424static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
425static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
426
427static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
428{
429 struct cfs_bandwidth *cfs_b =
430 container_of(timer, struct cfs_bandwidth, slack_timer);
431 do_sched_cfs_slack_timer(cfs_b);
432
433 return HRTIMER_NORESTART;
434}
435
436static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
437{
438 struct cfs_bandwidth *cfs_b =
439 container_of(timer, struct cfs_bandwidth, period_timer);
440 ktime_t now;
441 int overrun;
442 int idle = 0;
443
444 for (;;) {
445 now = hrtimer_cb_get_time(timer);
446 overrun = hrtimer_forward(timer, now, cfs_b->period);
447
448 if (!overrun)
449 break;
450
451 idle = do_sched_cfs_period_timer(cfs_b, overrun);
452 }
453
454 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
455}
456
457static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
458{
459 raw_spin_lock_init(&cfs_b->lock);
460 cfs_b->runtime = 0;
461 cfs_b->quota = RUNTIME_INF;
462 cfs_b->period = ns_to_ktime(default_cfs_period());
463
464 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
465 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
466 cfs_b->period_timer.function = sched_cfs_period_timer;
467 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
468 cfs_b->slack_timer.function = sched_cfs_slack_timer;
469}
470
471static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
472{
473 cfs_rq->runtime_enabled = 0;
474 INIT_LIST_HEAD(&cfs_rq->throttled_list);
475}
476
477/* requires cfs_b->lock, may release to reprogram timer */
478static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
479{
480 /*
481 * The timer may be active because we're trying to set a new bandwidth
482 * period or because we're racing with the tear-down path
483 * (timer_active==0 becomes visible before the hrtimer call-back
484 * terminates). In either case we ensure that it's re-programmed
485 */
486 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
487 raw_spin_unlock(&cfs_b->lock);
488 /* ensure cfs_b->lock is available while we wait */
489 hrtimer_cancel(&cfs_b->period_timer);
490
491 raw_spin_lock(&cfs_b->lock);
492 /* if someone else restarted the timer then we're done */
493 if (cfs_b->timer_active)
494 return;
495 }
496
497 cfs_b->timer_active = 1;
498 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
499}
500
501static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
502{
503 hrtimer_cancel(&cfs_b->period_timer);
504 hrtimer_cancel(&cfs_b->slack_timer);
505}
506#else
507static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
508static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
510
511static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
512{
513 return NULL;
514}
515#endif /* CONFIG_CFS_BANDWIDTH */
516#endif /* CONFIG_FAIR_GROUP_SCHED */
517
518/* Real-Time classes' related field in a runqueue: */
519struct rt_rq {
520 struct rt_prio_array active;
521 unsigned long rt_nr_running;
522#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
523 struct {
524 int curr; /* highest queued rt task prio */
525#ifdef CONFIG_SMP
526 int next; /* next highest */
527#endif
528 } highest_prio;
529#endif
530#ifdef CONFIG_SMP
531 unsigned long rt_nr_migratory;
532 unsigned long rt_nr_total;
533 int overloaded;
534 struct plist_head pushable_tasks;
535#endif
536 int rt_throttled;
537 u64 rt_time;
538 u64 rt_runtime;
539 /* Nests inside the rq lock: */
540 raw_spinlock_t rt_runtime_lock;
541
542#ifdef CONFIG_RT_GROUP_SCHED
543 unsigned long rt_nr_boosted;
544
545 struct rq *rq;
546 struct list_head leaf_rt_rq_list;
547 struct task_group *tg;
548#endif
549};
550
551#ifdef CONFIG_SMP
552
553/*
554 * We add the notion of a root-domain which will be used to define per-domain
555 * variables. Each exclusive cpuset essentially defines an island domain by
556 * fully partitioning the member cpus from any other cpuset. Whenever a new
557 * exclusive cpuset is created, we also create and attach a new root-domain
558 * object.
559 *
560 */
561struct root_domain {
562 atomic_t refcount;
563 atomic_t rto_count;
564 struct rcu_head rcu;
565 cpumask_var_t span;
566 cpumask_var_t online;
567
568 /*
569 * The "RT overload" flag: it gets set if a CPU has more than
570 * one runnable RT task.
571 */
572 cpumask_var_t rto_mask;
573 struct cpupri cpupri;
574};
575
576/*
577 * By default the system creates a single root-domain with all cpus as
578 * members (mimicking the global state we have today).
579 */
580static struct root_domain def_root_domain;
581
582#endif /* CONFIG_SMP */
583
584/*
585 * This is the main, per-CPU runqueue data structure.
586 *
587 * Locking rule: those places that want to lock multiple runqueues
588 * (such as the load balancing or the thread migration code), lock
589 * acquire operations must be ordered by ascending &runqueue.
590 */
591struct rq {
592 /* runqueue lock: */
593 raw_spinlock_t lock;
594
595 /*
596 * nr_running and cpu_load should be in the same cacheline because
597 * remote CPUs use both these fields when doing load calculation.
598 */
599 unsigned long nr_running;
600 #define CPU_LOAD_IDX_MAX 5
601 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
602 unsigned long last_load_update_tick;
603#ifdef CONFIG_NO_HZ
604 u64 nohz_stamp;
605 unsigned char nohz_balance_kick;
606#endif
607 int skip_clock_update;
608
609 /* capture load from *all* tasks on this cpu: */
610 struct load_weight load;
611 unsigned long nr_load_updates;
612 u64 nr_switches;
613
614 struct cfs_rq cfs;
615 struct rt_rq rt;
616
617#ifdef CONFIG_FAIR_GROUP_SCHED
618 /* list of leaf cfs_rq on this cpu: */
619 struct list_head leaf_cfs_rq_list;
620#endif
621#ifdef CONFIG_RT_GROUP_SCHED
622 struct list_head leaf_rt_rq_list;
623#endif
624
625 /*
626 * This is part of a global counter where only the total sum
627 * over all CPUs matters. A task can increase this counter on
628 * one CPU and if it got migrated afterwards it may decrease
629 * it on another CPU. Always updated under the runqueue lock:
630 */
631 unsigned long nr_uninterruptible;
632
633 struct task_struct *curr, *idle, *stop;
634 unsigned long next_balance;
635 struct mm_struct *prev_mm;
636
637 u64 clock;
638 u64 clock_task;
639
640 atomic_t nr_iowait;
641
642#ifdef CONFIG_SMP
643 struct root_domain *rd;
644 struct sched_domain *sd;
645
646 unsigned long cpu_power;
647
648 unsigned char idle_balance;
649 /* For active balancing */
650 int post_schedule;
651 int active_balance;
652 int push_cpu;
653 struct cpu_stop_work active_balance_work;
654 /* cpu of this runqueue: */
655 int cpu;
656 int online;
657
658 u64 rt_avg;
659 u64 age_stamp;
660 u64 idle_stamp;
661 u64 avg_idle;
662#endif
663
664#ifdef CONFIG_IRQ_TIME_ACCOUNTING
665 u64 prev_irq_time;
666#endif
667#ifdef CONFIG_PARAVIRT
668 u64 prev_steal_time;
669#endif
670#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
671 u64 prev_steal_time_rq;
672#endif
673
674 /* calc_load related fields */
675 unsigned long calc_load_update;
676 long calc_load_active;
677
678#ifdef CONFIG_SCHED_HRTICK
679#ifdef CONFIG_SMP
680 int hrtick_csd_pending;
681 struct call_single_data hrtick_csd;
682#endif
683 struct hrtimer hrtick_timer;
684#endif
685
686#ifdef CONFIG_SCHEDSTATS
687 /* latency stats */
688 struct sched_info rq_sched_info;
689 unsigned long long rq_cpu_time;
690 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
691
692 /* sys_sched_yield() stats */
693 unsigned int yld_count;
694
695 /* schedule() stats */
696 unsigned int sched_switch;
697 unsigned int sched_count;
698 unsigned int sched_goidle;
699
700 /* try_to_wake_up() stats */
701 unsigned int ttwu_count;
702 unsigned int ttwu_local;
703#endif
704
705#ifdef CONFIG_SMP
706 struct llist_head wake_list;
707#endif
708};
709
710static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
711
712
713static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
714
715static inline int cpu_of(struct rq *rq)
716{
717#ifdef CONFIG_SMP
718 return rq->cpu;
719#else
720 return 0;
721#endif
722}
723
724#define rcu_dereference_check_sched_domain(p) \
725 rcu_dereference_check((p), \
726 lockdep_is_held(&sched_domains_mutex))
727
728/*
729 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
730 * See detach_destroy_domains: synchronize_sched for details.
731 *
732 * The domain tree of any CPU may only be accessed from within
733 * preempt-disabled sections.
734 */
735#define for_each_domain(cpu, __sd) \
736 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
737
738#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
739#define this_rq() (&__get_cpu_var(runqueues))
740#define task_rq(p) cpu_rq(task_cpu(p))
741#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
742#define raw_rq() (&__raw_get_cpu_var(runqueues))
743
744#ifdef CONFIG_CGROUP_SCHED
745
746/*
746 * Return the group to which this task belongs.
748 *
749 * We use task_subsys_state_check() and extend the RCU verification with
750 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
751 * task it moves into the cgroup. Therefore by holding either of those locks,
752 * we pin the task to the current cgroup.
753 */
754static inline struct task_group *task_group(struct task_struct *p)
755{
756 struct task_group *tg;
757 struct cgroup_subsys_state *css;
758
759 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
760 lockdep_is_held(&p->pi_lock) ||
761 lockdep_is_held(&task_rq(p)->lock));
762 tg = container_of(css, struct task_group, css);
763
764 return autogroup_task_group(p, tg);
765}
766
767/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
768static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
769{
770#ifdef CONFIG_FAIR_GROUP_SCHED
771 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
772 p->se.parent = task_group(p)->se[cpu];
773#endif
774
775#ifdef CONFIG_RT_GROUP_SCHED
776 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
777 p->rt.parent = task_group(p)->rt_se[cpu];
778#endif
779}
780
781#else /* CONFIG_CGROUP_SCHED */
782
783static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
784static inline struct task_group *task_group(struct task_struct *p)
785{
786 return NULL;
787}
788
789#endif /* CONFIG_CGROUP_SCHED */
790 109
791static void update_rq_clock_task(struct rq *rq, s64 delta); 110static void update_rq_clock_task(struct rq *rq, s64 delta);
792 111
793static void update_rq_clock(struct rq *rq) 112void update_rq_clock(struct rq *rq)
794{ 113{
795 s64 delta; 114 s64 delta;
796 115
@@ -803,44 +122,14 @@ static void update_rq_clock(struct rq *rq)
803} 122}
804 123
805/* 124/*
806 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
807 */
808#ifdef CONFIG_SCHED_DEBUG
809# define const_debug __read_mostly
810#else
811# define const_debug static const
812#endif
813
814/**
815 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
816 * @cpu: the processor in question.
817 *
818 * This interface allows printk to be called with the runqueue lock
819 * held and know whether or not it is OK to wake up the klogd.
820 */
821int runqueue_is_locked(int cpu)
822{
823 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
824}
825
826/*
827 * Debugging: various feature bits 125 * Debugging: various feature bits
828 */ 126 */
829 127
830#define SCHED_FEAT(name, enabled) \ 128#define SCHED_FEAT(name, enabled) \
831 __SCHED_FEAT_##name ,
832
833enum {
834#include "sched_features.h"
835};
836
837#undef SCHED_FEAT
838
839#define SCHED_FEAT(name, enabled) \
840 (1UL << __SCHED_FEAT_##name) * enabled | 129 (1UL << __SCHED_FEAT_##name) * enabled |
841 130
842const_debug unsigned int sysctl_sched_features = 131const_debug unsigned int sysctl_sched_features =
843#include "sched_features.h" 132#include "features.h"
844 0; 133 0;
845 134
846#undef SCHED_FEAT 135#undef SCHED_FEAT
@@ -850,7 +139,7 @@ const_debug unsigned int sysctl_sched_features =
850 #name , 139 #name ,
851 140
852static __read_mostly char *sched_feat_names[] = { 141static __read_mostly char *sched_feat_names[] = {
853#include "sched_features.h" 142#include "features.h"
854 NULL 143 NULL
855}; 144};
856 145
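
The feature-flag machinery above relies on an X-macro idiom: features.h (renamed from sched_features.h) is included several times, each time with a different definition of SCHED_FEAT(), so a single list of features produces the __SCHED_FEAT_* enum, the sysctl_sched_features default bitmask and the sched_feat_names[] table used by the sched_features debugfs file. A minimal userspace sketch of the same preprocessor trick follows; the feature names and DEMO_* identifiers are invented for illustration, not taken from the scheduler.

/* Stand-in for features.h: one FEAT() entry per feature (names are made up). */
#define DEMO_FEATURES \
        FEAT(GENTLE_SLEEPERS, 1) \
        FEAT(HRTICK, 0) \
        FEAT(LB_BIAS, 1)

/* Pass 1: an enum of feature indices, mirroring __SCHED_FEAT_##name. */
#define FEAT(name, enabled) DEMO_FEAT_##name,
enum { DEMO_FEATURES DEMO_FEAT_NR };
#undef FEAT

/* Pass 2: the default bitmask, mirroring sysctl_sched_features. */
#define FEAT(name, enabled) (1UL << DEMO_FEAT_##name) * (enabled) |
static const unsigned long demo_features = DEMO_FEATURES 0;
#undef FEAT

/* Pass 3: the name table, mirroring sched_feat_names[]. */
#define FEAT(name, enabled) #name,
static const char * const demo_feat_names[] = { DEMO_FEATURES NULL };
#undef FEAT

#include <stdio.h>

int main(void)
{
        /* Print each feature, prefixed with NO_ when disabled, just like
         * the sched_features show path above. */
        for (int i = 0; i < DEMO_FEAT_NR; i++)
                printf("%s%s ", (demo_features & (1UL << i)) ? "" : "NO_",
                       demo_feat_names[i]);
        printf("\n");
        return 0;
}
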
@@ -860,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
860{ 149{
861 int i; 150 int i;
862 151
863 for (i = 0; sched_feat_names[i]; i++) { 152 for (i = 0; i < __SCHED_FEAT_NR; i++) {
864 if (!(sysctl_sched_features & (1UL << i))) 153 if (!(sysctl_sched_features & (1UL << i)))
865 seq_puts(m, "NO_"); 154 seq_puts(m, "NO_");
866 seq_printf(m, "%s ", sched_feat_names[i]); 155 seq_printf(m, "%s ", sched_feat_names[i]);
@@ -870,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v)
870 return 0; 159 return 0;
871} 160}
872 161
162#ifdef HAVE_JUMP_LABEL
163
164#define jump_label_key__true jump_label_key_enabled
165#define jump_label_key__false jump_label_key_disabled
166
167#define SCHED_FEAT(name, enabled) \
168 jump_label_key__##enabled ,
169
170struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
171#include "features.h"
172};
173
174#undef SCHED_FEAT
175
176static void sched_feat_disable(int i)
177{
178 if (jump_label_enabled(&sched_feat_keys[i]))
179 jump_label_dec(&sched_feat_keys[i]);
180}
181
182static void sched_feat_enable(int i)
183{
184 if (!jump_label_enabled(&sched_feat_keys[i]))
185 jump_label_inc(&sched_feat_keys[i]);
186}
187#else
188static void sched_feat_disable(int i) { };
189static void sched_feat_enable(int i) { };
190#endif /* HAVE_JUMP_LABEL */
191
873static ssize_t 192static ssize_t
874sched_feat_write(struct file *filp, const char __user *ubuf, 193sched_feat_write(struct file *filp, const char __user *ubuf,
875 size_t cnt, loff_t *ppos) 194 size_t cnt, loff_t *ppos)
@@ -893,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
893 cmp += 3; 212 cmp += 3;
894 } 213 }
895 214
896 for (i = 0; sched_feat_names[i]; i++) { 215 for (i = 0; i < __SCHED_FEAT_NR; i++) {
897 if (strcmp(cmp, sched_feat_names[i]) == 0) { 216 if (strcmp(cmp, sched_feat_names[i]) == 0) {
898 if (neg) 217 if (neg) {
899 sysctl_sched_features &= ~(1UL << i); 218 sysctl_sched_features &= ~(1UL << i);
900 else 219 sched_feat_disable(i);
220 } else {
901 sysctl_sched_features |= (1UL << i); 221 sysctl_sched_features |= (1UL << i);
222 sched_feat_enable(i);
223 }
902 break; 224 break;
903 } 225 }
904 } 226 }
905 227
906 if (!sched_feat_names[i]) 228 if (i == __SCHED_FEAT_NR)
907 return -EINVAL; 229 return -EINVAL;
908 230
909 *ppos += cnt; 231 *ppos += cnt;
@@ -932,10 +254,7 @@ static __init int sched_init_debug(void)
932 return 0; 254 return 0;
933} 255}
934late_initcall(sched_init_debug); 256late_initcall(sched_init_debug);
935 257#endif /* CONFIG_SCHED_DEBUG */
936#endif
937
938#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
939 258
940/* 259/*
941 * Number of tasks to iterate in a single balance run. 260 * Number of tasks to iterate in a single balance run.
@@ -957,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
957 */ 276 */
958unsigned int sysctl_sched_rt_period = 1000000; 277unsigned int sysctl_sched_rt_period = 1000000;
959 278
960static __read_mostly int scheduler_running; 279__read_mostly int scheduler_running;
961 280
962/* 281/*
963 * part of the period that we allow rt tasks to run in us. 282 * part of the period that we allow rt tasks to run in us.
@@ -965,112 +284,7 @@ static __read_mostly int scheduler_running;
965 */ 284 */
966int sysctl_sched_rt_runtime = 950000; 285int sysctl_sched_rt_runtime = 950000;
967 286
968static inline u64 global_rt_period(void)
969{
970 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
971}
972 287
973static inline u64 global_rt_runtime(void)
974{
975 if (sysctl_sched_rt_runtime < 0)
976 return RUNTIME_INF;
977
978 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
979}
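
The two removed helpers only convert the sysctl values (specified in microseconds) to nanoseconds, with a runtime of -1 meaning RUNTIME_INF. With the defaults shown above, realtime tasks are allowed at most 95% of every one-second period. A small standalone check of that arithmetic, assuming nothing beyond the defaults visible in this hunk:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL

int main(void)
{
        unsigned int rt_period_us = 1000000;    /* sysctl_sched_rt_period default */
        int rt_runtime_us = 950000;             /* sysctl_sched_rt_runtime; -1 = unlimited */

        uint64_t period = (uint64_t)rt_period_us * NSEC_PER_USEC;
        uint64_t runtime = (uint64_t)rt_runtime_us * NSEC_PER_USEC;

        /* RT tasks may consume at most runtime/period of each period: 95%. */
        printf("period %llu ns, runtime %llu ns, max RT share %.0f%%\n",
               (unsigned long long)period, (unsigned long long)runtime,
               100.0 * runtime / period);
        return 0;
}
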
980
981#ifndef prepare_arch_switch
982# define prepare_arch_switch(next) do { } while (0)
983#endif
984#ifndef finish_arch_switch
985# define finish_arch_switch(prev) do { } while (0)
986#endif
987
988static inline int task_current(struct rq *rq, struct task_struct *p)
989{
990 return rq->curr == p;
991}
992
993static inline int task_running(struct rq *rq, struct task_struct *p)
994{
995#ifdef CONFIG_SMP
996 return p->on_cpu;
997#else
998 return task_current(rq, p);
999#endif
1000}
1001
1002#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1003static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1004{
1005#ifdef CONFIG_SMP
1006 /*
1007 * We can optimise this out completely for !SMP, because the
1008 * SMP rebalancing from interrupt is the only thing that cares
1009 * here.
1010 */
1011 next->on_cpu = 1;
1012#endif
1013}
1014
1015static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1016{
1017#ifdef CONFIG_SMP
1018 /*
1019 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1020 * We must ensure this doesn't happen until the switch is completely
1021 * finished.
1022 */
1023 smp_wmb();
1024 prev->on_cpu = 0;
1025#endif
1026#ifdef CONFIG_DEBUG_SPINLOCK
1027 /* this is a valid case when another task releases the spinlock */
1028 rq->lock.owner = current;
1029#endif
1030 /*
1031 * If we are tracking spinlock dependencies then we have to
1032 * fix up the runqueue lock - which gets 'carried over' from
1033 * prev into current:
1034 */
1035 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1036
1037 raw_spin_unlock_irq(&rq->lock);
1038}
1039
1040#else /* __ARCH_WANT_UNLOCKED_CTXSW */
1041static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1042{
1043#ifdef CONFIG_SMP
1044 /*
1045 * We can optimise this out completely for !SMP, because the
1046 * SMP rebalancing from interrupt is the only thing that cares
1047 * here.
1048 */
1049 next->on_cpu = 1;
1050#endif
1051#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1052 raw_spin_unlock_irq(&rq->lock);
1053#else
1054 raw_spin_unlock(&rq->lock);
1055#endif
1056}
1057
1058static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1059{
1060#ifdef CONFIG_SMP
1061 /*
1062 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1063 * We must ensure this doesn't happen until the switch is completely
1064 * finished.
1065 */
1066 smp_wmb();
1067 prev->on_cpu = 0;
1068#endif
1069#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1070 local_irq_enable();
1071#endif
1072}
1073#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1074 288
1075/* 289/*
1076 * __task_rq_lock - lock the rq @p resides on. 290 * __task_rq_lock - lock the rq @p resides on.
@@ -1153,20 +367,6 @@ static struct rq *this_rq_lock(void)
1153 * rq->lock. 367 * rq->lock.
1154 */ 368 */
1155 369
1156/*
1157 * Use hrtick when:
1158 * - enabled by features
1159 * - hrtimer is actually high res
1160 */
1161static inline int hrtick_enabled(struct rq *rq)
1162{
1163 if (!sched_feat(HRTICK))
1164 return 0;
1165 if (!cpu_active(cpu_of(rq)))
1166 return 0;
1167 return hrtimer_is_hres_active(&rq->hrtick_timer);
1168}
1169
1170static void hrtick_clear(struct rq *rq) 370static void hrtick_clear(struct rq *rq)
1171{ 371{
1172 if (hrtimer_active(&rq->hrtick_timer)) 372 if (hrtimer_active(&rq->hrtick_timer))
@@ -1210,7 +410,7 @@ static void __hrtick_start(void *arg)
1210 * 410 *
1211 * called with rq->lock held and irqs disabled 411 * called with rq->lock held and irqs disabled
1212 */ 412 */
1213static void hrtick_start(struct rq *rq, u64 delay) 413void hrtick_start(struct rq *rq, u64 delay)
1214{ 414{
1215 struct hrtimer *timer = &rq->hrtick_timer; 415 struct hrtimer *timer = &rq->hrtick_timer;
1216 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 416 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@ -1254,7 +454,7 @@ static __init void init_hrtick(void)
1254 * 454 *
1255 * called with rq->lock held and irqs disabled 455 * called with rq->lock held and irqs disabled
1256 */ 456 */
1257static void hrtick_start(struct rq *rq, u64 delay) 457void hrtick_start(struct rq *rq, u64 delay)
1258{ 458{
1259 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 459 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1260 HRTIMER_MODE_REL_PINNED, 0); 460 HRTIMER_MODE_REL_PINNED, 0);
@@ -1305,7 +505,7 @@ static inline void init_hrtick(void)
1305#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 505#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1306#endif 506#endif
1307 507
1308static void resched_task(struct task_struct *p) 508void resched_task(struct task_struct *p)
1309{ 509{
1310 int cpu; 510 int cpu;
1311 511
@@ -1326,7 +526,7 @@ static void resched_task(struct task_struct *p)
1326 smp_send_reschedule(cpu); 526 smp_send_reschedule(cpu);
1327} 527}
1328 528
1329static void resched_cpu(int cpu) 529void resched_cpu(int cpu)
1330{ 530{
1331 struct rq *rq = cpu_rq(cpu); 531 struct rq *rq = cpu_rq(cpu);
1332 unsigned long flags; 532 unsigned long flags;
@@ -1407,7 +607,8 @@ void wake_up_idle_cpu(int cpu)
1407 607
1408static inline bool got_nohz_idle_kick(void) 608static inline bool got_nohz_idle_kick(void)
1409{ 609{
1410 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; 610 int cpu = smp_processor_id();
611 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
1411} 612}
1412 613
1413#else /* CONFIG_NO_HZ */ 614#else /* CONFIG_NO_HZ */
@@ -1419,12 +620,7 @@ static inline bool got_nohz_idle_kick(void)
1419 620
1420#endif /* CONFIG_NO_HZ */ 621#endif /* CONFIG_NO_HZ */
1421 622
1422static u64 sched_avg_period(void) 623void sched_avg_update(struct rq *rq)
1423{
1424 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1425}
1426
1427static void sched_avg_update(struct rq *rq)
1428{ 624{
1429 s64 period = sched_avg_period(); 625 s64 period = sched_avg_period();
1430 626
@@ -1440,193 +636,23 @@ static void sched_avg_update(struct rq *rq)
1440 } 636 }
1441} 637}
1442 638
1443static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1444{
1445 rq->rt_avg += rt_delta;
1446 sched_avg_update(rq);
1447}
1448
1449#else /* !CONFIG_SMP */ 639#else /* !CONFIG_SMP */
1450static void resched_task(struct task_struct *p) 640void resched_task(struct task_struct *p)
1451{ 641{
1452 assert_raw_spin_locked(&task_rq(p)->lock); 642 assert_raw_spin_locked(&task_rq(p)->lock);
1453 set_tsk_need_resched(p); 643 set_tsk_need_resched(p);
1454} 644}
1455
1456static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1457{
1458}
1459
1460static void sched_avg_update(struct rq *rq)
1461{
1462}
1463#endif /* CONFIG_SMP */ 645#endif /* CONFIG_SMP */
1464 646
1465#if BITS_PER_LONG == 32
1466# define WMULT_CONST (~0UL)
1467#else
1468# define WMULT_CONST (1UL << 32)
1469#endif
1470
1471#define WMULT_SHIFT 32
1472
1473/*
1474 * Shift right and round:
1475 */
1476#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1477
1478/*
1479 * delta *= weight / lw
1480 */
1481static unsigned long
1482calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1483 struct load_weight *lw)
1484{
1485 u64 tmp;
1486
1487 /*
1488 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1489 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1490 * 2^SCHED_LOAD_RESOLUTION.
1491 */
1492 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1493 tmp = (u64)delta_exec * scale_load_down(weight);
1494 else
1495 tmp = (u64)delta_exec;
1496
1497 if (!lw->inv_weight) {
1498 unsigned long w = scale_load_down(lw->weight);
1499
1500 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1501 lw->inv_weight = 1;
1502 else if (unlikely(!w))
1503 lw->inv_weight = WMULT_CONST;
1504 else
1505 lw->inv_weight = WMULT_CONST / w;
1506 }
1507
1508 /*
1509 * Check whether we'd overflow the 64-bit multiplication:
1510 */
1511 if (unlikely(tmp > WMULT_CONST))
1512 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1513 WMULT_SHIFT/2);
1514 else
1515 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1516
1517 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1518}
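
calc_delta_mine() scales an execution delta by weight/lw->weight entirely in fixed point: the reciprocal WMULT_CONST/lw->weight is cached in lw->inv_weight, so each update costs a multiply and a rounding shift instead of a division. The following is a simplified userspace sketch of that computation; it omits the branch in the real code that splits the shift in two when tmp exceeds WMULT_CONST, and the helper names are invented.

#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT     32
#define WMULT_CONST     (1ULL << 32)    /* 64-bit variant from above */

/* Shift right and round, as in the SRR() macro. */
static uint64_t srr(uint64_t x, unsigned int y)
{
        return (x + (1ULL << (y - 1))) >> y;
}

/* delta_exec * weight / lw_weight, using the cached reciprocal. */
static unsigned long scale_delta(unsigned long delta_exec,
                                 unsigned long weight, unsigned long lw_weight)
{
        uint64_t inv_weight = WMULT_CONST / lw_weight;  /* lw->inv_weight */
        uint64_t tmp = (uint64_t)delta_exec * weight;

        /* The real code splits the shift in two halves when tmp is large
         * enough that tmp * inv_weight could overflow 64 bits. */
        return (unsigned long)srr(tmp * inv_weight, WMULT_SHIFT);
}

int main(void)
{
        /* A nice-0 entity (weight 1024) on a queue of total weight 3072
         * gets one third of a 6ms slice: ~2,000,000 ns. */
        printf("%lu ns\n", scale_delta(6000000UL, 1024UL, 3072UL));
        return 0;
}
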
1519
1520static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1521{
1522 lw->weight += inc;
1523 lw->inv_weight = 0;
1524}
1525
1526static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1527{
1528 lw->weight -= dec;
1529 lw->inv_weight = 0;
1530}
1531
1532static inline void update_load_set(struct load_weight *lw, unsigned long w)
1533{
1534 lw->weight = w;
1535 lw->inv_weight = 0;
1536}
1537
1538/*
1539 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1540 * of tasks with abnormal "nice" values across CPUs, the contribution that
1541 * each task makes to its run queue's load is weighted according to its
1542 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1543 * scaled version of the new time slice allocation that they receive on time
1544 * slice expiry etc.
1545 */
1546
1547#define WEIGHT_IDLEPRIO 3
1548#define WMULT_IDLEPRIO 1431655765
1549
1550/*
1551 * Nice levels are multiplicative, with a gentle 10% change for every
1552 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1553 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1554 * that remained on nice 0.
1555 *
1556 * The "10% effect" is relative and cumulative: from _any_ nice level,
1557 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1558 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1559 * If a task goes up by ~10% and another task goes down by ~10% then
1560 * the relative distance between them is ~25%.)
1561 */
1562static const int prio_to_weight[40] = {
1563 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1564 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1565 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1566 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1567 /* 0 */ 1024, 820, 655, 526, 423,
1568 /* 5 */ 335, 272, 215, 172, 137,
1569 /* 10 */ 110, 87, 70, 56, 45,
1570 /* 15 */ 36, 29, 23, 18, 15,
1571};
1572
1573/*
1574 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1575 *
1576 * In cases where the weight does not change often, we can use the
1577 * precalculated inverse to speed up arithmetic by turning divisions
1578 * into multiplications:
1579 */
1580static const u32 prio_to_wmult[40] = {
1581 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1582 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1583 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1584 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1585 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1586 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1587 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1588 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1589};
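
The two tables encode the rule from the comment above: each nice step changes the weight by roughly a factor of 1.25, which is what yields the "10% relative CPU per nice level" behaviour, and prio_to_wmult[] is simply 2^32/weight precomputed so calc_delta_mine() can multiply instead of divide. A quick standalone check of both properties, using only values visible in the tables:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        int weight_nice0 = 1024;        /* prio_to_weight[20] */
        int weight_nice1 = 820;         /* prio_to_weight[21] */

        /* Neighbouring nice levels differ in weight by ~1.25x. */
        printf("nice0/nice1 weight ratio: %.3f\n",
               (double)weight_nice0 / weight_nice1);            /* ~1.249 */

        /* The wmult table is the precomputed inverse 2^32 / weight. */
        printf("2^32 / 1024 = %llu (prio_to_wmult[20] is 4194304)\n",
               (unsigned long long)((1ULL << 32) / weight_nice0));
        return 0;
}
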
1590
1591/* Time spent by the tasks of the cpu accounting group executing in ... */
1592enum cpuacct_stat_index {
1593 CPUACCT_STAT_USER, /* ... user mode */
1594 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1595
1596 CPUACCT_STAT_NSTATS,
1597};
1598
1599#ifdef CONFIG_CGROUP_CPUACCT
1600static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1601static void cpuacct_update_stats(struct task_struct *tsk,
1602 enum cpuacct_stat_index idx, cputime_t val);
1603#else
1604static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1605static inline void cpuacct_update_stats(struct task_struct *tsk,
1606 enum cpuacct_stat_index idx, cputime_t val) {}
1607#endif
1608
1609static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1610{
1611 update_load_add(&rq->load, load);
1612}
1613
1614static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1615{
1616 update_load_sub(&rq->load, load);
1617}
1618
1619#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 647#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1620 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 648 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1621typedef int (*tg_visitor)(struct task_group *, void *);
1622
1623/* 649/*
1624 * Iterate task_group tree rooted at *from, calling @down when first entering a 650 * Iterate task_group tree rooted at *from, calling @down when first entering a
1625 * node and @up when leaving it for the final time. 651 * node and @up when leaving it for the final time.
1626 * 652 *
1627 * Caller must hold rcu_lock or sufficient equivalent. 653 * Caller must hold rcu_lock or sufficient equivalent.
1628 */ 654 */
1629static int walk_tg_tree_from(struct task_group *from, 655int walk_tg_tree_from(struct task_group *from,
1630 tg_visitor down, tg_visitor up, void *data) 656 tg_visitor down, tg_visitor up, void *data)
1631{ 657{
1632 struct task_group *parent, *child; 658 struct task_group *parent, *child;
@@ -1657,270 +683,13 @@ out:
1657 return ret; 683 return ret;
1658} 684}
1659 685
1660/* 686int tg_nop(struct task_group *tg, void *data)
1661 * Iterate the full tree, calling @down when first entering a node and @up when
1662 * leaving it for the final time.
1663 *
1664 * Caller must hold rcu_lock or sufficient equivalent.
1665 */
1666
1667static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1668{
1669 return walk_tg_tree_from(&root_task_group, down, up, data);
1670}
1671
1672static int tg_nop(struct task_group *tg, void *data)
1673{ 687{
1674 return 0; 688 return 0;
1675} 689}
1676#endif 690#endif
1677 691
1678#ifdef CONFIG_SMP 692void update_cpu_load(struct rq *this_rq);
1679/* Used instead of source_load when we know the type == 0 */
1680static unsigned long weighted_cpuload(const int cpu)
1681{
1682 return cpu_rq(cpu)->load.weight;
1683}
1684
1685/*
1686 * Return a low guess at the load of a migration-source cpu weighted
1687 * according to the scheduling class and "nice" value.
1688 *
1689 * We want to under-estimate the load of migration sources, to
1690 * balance conservatively.
1691 */
1692static unsigned long source_load(int cpu, int type)
1693{
1694 struct rq *rq = cpu_rq(cpu);
1695 unsigned long total = weighted_cpuload(cpu);
1696
1697 if (type == 0 || !sched_feat(LB_BIAS))
1698 return total;
1699
1700 return min(rq->cpu_load[type-1], total);
1701}
1702
1703/*
1704 * Return a high guess at the load of a migration-target cpu weighted
1705 * according to the scheduling class and "nice" value.
1706 */
1707static unsigned long target_load(int cpu, int type)
1708{
1709 struct rq *rq = cpu_rq(cpu);
1710 unsigned long total = weighted_cpuload(cpu);
1711
1712 if (type == 0 || !sched_feat(LB_BIAS))
1713 return total;
1714
1715 return max(rq->cpu_load[type-1], total);
1716}
1717
1718static unsigned long power_of(int cpu)
1719{
1720 return cpu_rq(cpu)->cpu_power;
1721}
1722
1723static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1724
1725static unsigned long cpu_avg_load_per_task(int cpu)
1726{
1727 struct rq *rq = cpu_rq(cpu);
1728 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1729
1730 if (nr_running)
1731 return rq->load.weight / nr_running;
1732
1733 return 0;
1734}
1735
1736#ifdef CONFIG_PREEMPT
1737
1738static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1739
1740/*
1741 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1742 * way at the expense of forcing extra atomic operations in all
1743 * invocations. This assures that the double_lock is acquired using the
1744 * same underlying policy as the spinlock_t on this architecture, which
1745 * reduces latency compared to the unfair variant below. However, it
1746 * also adds more overhead and therefore may reduce throughput.
1747 */
1748static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1749 __releases(this_rq->lock)
1750 __acquires(busiest->lock)
1751 __acquires(this_rq->lock)
1752{
1753 raw_spin_unlock(&this_rq->lock);
1754 double_rq_lock(this_rq, busiest);
1755
1756 return 1;
1757}
1758
1759#else
1760/*
1761 * Unfair double_lock_balance: Optimizes throughput at the expense of
1762 * latency by eliminating extra atomic operations when the locks are
1763 * already in proper order on entry. This favors lower cpu-ids and will
1764 * grant the double lock to lower cpus over higher ids under contention,
1765 * regardless of entry order into the function.
1766 */
1767static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1768 __releases(this_rq->lock)
1769 __acquires(busiest->lock)
1770 __acquires(this_rq->lock)
1771{
1772 int ret = 0;
1773
1774 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1775 if (busiest < this_rq) {
1776 raw_spin_unlock(&this_rq->lock);
1777 raw_spin_lock(&busiest->lock);
1778 raw_spin_lock_nested(&this_rq->lock,
1779 SINGLE_DEPTH_NESTING);
1780 ret = 1;
1781 } else
1782 raw_spin_lock_nested(&busiest->lock,
1783 SINGLE_DEPTH_NESTING);
1784 }
1785 return ret;
1786}
1787
1788#endif /* CONFIG_PREEMPT */
1789
1790/*
1791 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1792 */
1793static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1794{
1795 if (unlikely(!irqs_disabled())) {
1796 /* printk() doesn't work well under rq->lock */
1797 raw_spin_unlock(&this_rq->lock);
1798 BUG_ON(1);
1799 }
1800
1801 return _double_lock_balance(this_rq, busiest);
1802}
1803
1804static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 __releases(busiest->lock)
1806{
1807 raw_spin_unlock(&busiest->lock);
1808 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1809}
1810
1811/*
1812 * double_rq_lock - safely lock two runqueues
1813 *
1814 * Note this does not disable interrupts like task_rq_lock,
1815 * you need to do so manually before calling.
1816 */
1817static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1818 __acquires(rq1->lock)
1819 __acquires(rq2->lock)
1820{
1821 BUG_ON(!irqs_disabled());
1822 if (rq1 == rq2) {
1823 raw_spin_lock(&rq1->lock);
1824 __acquire(rq2->lock); /* Fake it out ;) */
1825 } else {
1826 if (rq1 < rq2) {
1827 raw_spin_lock(&rq1->lock);
1828 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1829 } else {
1830 raw_spin_lock(&rq2->lock);
1831 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1832 }
1833 }
1834}
1835
1836/*
1837 * double_rq_unlock - safely unlock two runqueues
1838 *
1839 * Note this does not restore interrupts like task_rq_unlock,
1840 * you need to do so manually after calling.
1841 */
1842static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1843 __releases(rq1->lock)
1844 __releases(rq2->lock)
1845{
1846 raw_spin_unlock(&rq1->lock);
1847 if (rq1 != rq2)
1848 raw_spin_unlock(&rq2->lock);
1849 else
1850 __release(rq2->lock);
1851}
1852
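
Both the !CONFIG_PREEMPT variant of _double_lock_balance() and double_rq_lock() above avoid ABBA deadlock the same way: when two runqueue locks must be held, the lower-addressed one is always taken first, and locking the "pair" when both arguments are the same queue degenerates to a single lock. A userspace sketch of just that ordering rule, using pthread mutexes instead of raw spinlocks (function names are invented):

#include <pthread.h>

/* Lock two mutexes in a globally consistent (address) order so that two
 * threads locking the same pair with swapped arguments cannot deadlock. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);          /* same queue: one lock only */
        } else if (a < b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}

int main(void)
{
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        /* Whichever order the caller passes them in, the lower-addressed
         * mutex is locked first. */
        lock_pair(&m2, &m1);
        unlock_pair(&m2, &m1);
        return 0;
}
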
1853#else /* CONFIG_SMP */
1854
1855/*
1856 * double_rq_lock - safely lock two runqueues
1857 *
1858 * Note this does not disable interrupts like task_rq_lock,
1859 * you need to do so manually before calling.
1860 */
1861static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1862 __acquires(rq1->lock)
1863 __acquires(rq2->lock)
1864{
1865 BUG_ON(!irqs_disabled());
1866 BUG_ON(rq1 != rq2);
1867 raw_spin_lock(&rq1->lock);
1868 __acquire(rq2->lock); /* Fake it out ;) */
1869}
1870
1871/*
1872 * double_rq_unlock - safely unlock two runqueues
1873 *
1874 * Note this does not restore interrupts like task_rq_unlock,
1875 * you need to do so manually after calling.
1876 */
1877static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1878 __releases(rq1->lock)
1879 __releases(rq2->lock)
1880{
1881 BUG_ON(rq1 != rq2);
1882 raw_spin_unlock(&rq1->lock);
1883 __release(rq2->lock);
1884}
1885
1886#endif
1887
1888static void calc_load_account_idle(struct rq *this_rq);
1889static void update_sysctl(void);
1890static int get_update_sysctl_factor(void);
1891static void update_cpu_load(struct rq *this_rq);
1892
1893static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1894{
1895 set_task_rq(p, cpu);
1896#ifdef CONFIG_SMP
1897 /*
1898 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1899 * successfully executed on another CPU. We must ensure that updates of
1900 * per-task data have been completed by this moment.
1901 */
1902 smp_wmb();
1903 task_thread_info(p)->cpu = cpu;
1904#endif
1905}
1906
1907static const struct sched_class rt_sched_class;
1908
1909#define sched_class_highest (&stop_sched_class)
1910#define for_each_class(class) \
1911 for (class = sched_class_highest; class; class = class->next)
1912
1913#include "sched_stats.h"
1914
1915static void inc_nr_running(struct rq *rq)
1916{
1917 rq->nr_running++;
1918}
1919
1920static void dec_nr_running(struct rq *rq)
1921{
1922 rq->nr_running--;
1923}
1924 693
1925static void set_load_weight(struct task_struct *p) 694static void set_load_weight(struct task_struct *p)
1926{ 695{
@@ -1957,7 +726,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1957/* 726/*
1958 * activate_task - move a task to the runqueue. 727 * activate_task - move a task to the runqueue.
1959 */ 728 */
1960static void activate_task(struct rq *rq, struct task_struct *p, int flags) 729void activate_task(struct rq *rq, struct task_struct *p, int flags)
1961{ 730{
1962 if (task_contributes_to_load(p)) 731 if (task_contributes_to_load(p))
1963 rq->nr_uninterruptible--; 732 rq->nr_uninterruptible--;
@@ -1968,7 +737,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1968/* 737/*
1969 * deactivate_task - remove a task from the runqueue. 738 * deactivate_task - remove a task from the runqueue.
1970 */ 739 */
1971static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 740void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1972{ 741{
1973 if (task_contributes_to_load(p)) 742 if (task_contributes_to_load(p))
1974 rq->nr_uninterruptible++; 743 rq->nr_uninterruptible++;
@@ -2159,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2159#ifdef CONFIG_IRQ_TIME_ACCOUNTING 928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2160static int irqtime_account_hi_update(void) 929static int irqtime_account_hi_update(void)
2161{ 930{
2162 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 931 u64 *cpustat = kcpustat_this_cpu->cpustat;
2163 unsigned long flags; 932 unsigned long flags;
2164 u64 latest_ns; 933 u64 latest_ns;
2165 int ret = 0; 934 int ret = 0;
2166 935
2167 local_irq_save(flags); 936 local_irq_save(flags);
2168 latest_ns = this_cpu_read(cpu_hardirq_time); 937 latest_ns = this_cpu_read(cpu_hardirq_time);
2169 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) 938 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
2170 ret = 1; 939 ret = 1;
2171 local_irq_restore(flags); 940 local_irq_restore(flags);
2172 return ret; 941 return ret;
@@ -2174,14 +943,14 @@ static int irqtime_account_hi_update(void)
2174 943
2175static int irqtime_account_si_update(void) 944static int irqtime_account_si_update(void)
2176{ 945{
2177 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 946 u64 *cpustat = kcpustat_this_cpu->cpustat;
2178 unsigned long flags; 947 unsigned long flags;
2179 u64 latest_ns; 948 u64 latest_ns;
2180 int ret = 0; 949 int ret = 0;
2181 950
2182 local_irq_save(flags); 951 local_irq_save(flags);
2183 latest_ns = this_cpu_read(cpu_softirq_time); 952 latest_ns = this_cpu_read(cpu_softirq_time);
2184 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) 953 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
2185 ret = 1; 954 ret = 1;
2186 local_irq_restore(flags); 955 local_irq_restore(flags);
2187 return ret; 956 return ret;
@@ -2193,15 +962,6 @@ static int irqtime_account_si_update(void)
2193 962
2194#endif 963#endif
2195 964
2196#include "sched_idletask.c"
2197#include "sched_fair.c"
2198#include "sched_rt.c"
2199#include "sched_autogroup.c"
2200#include "sched_stoptask.c"
2201#ifdef CONFIG_SCHED_DEBUG
2202# include "sched_debug.c"
2203#endif
2204
2205void sched_set_stop_task(int cpu, struct task_struct *stop) 965void sched_set_stop_task(int cpu, struct task_struct *stop)
2206{ 966{
2207 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 967 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2299,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2299 p->sched_class->prio_changed(rq, p, oldprio); 1059 p->sched_class->prio_changed(rq, p, oldprio);
2300} 1060}
2301 1061
2302static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1062void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2303{ 1063{
2304 const struct sched_class *class; 1064 const struct sched_class *class;
2305 1065
@@ -2325,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2325} 1085}
2326 1086
2327#ifdef CONFIG_SMP 1087#ifdef CONFIG_SMP
2328/*
2329 * Is this task likely cache-hot:
2330 */
2331static int
2332task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2333{
2334 s64 delta;
2335
2336 if (p->sched_class != &fair_sched_class)
2337 return 0;
2338
2339 if (unlikely(p->policy == SCHED_IDLE))
2340 return 0;
2341
2342 /*
2343 * Buddy candidates are cache hot:
2344 */
2345 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2346 (&p->se == cfs_rq_of(&p->se)->next ||
2347 &p->se == cfs_rq_of(&p->se)->last))
2348 return 1;
2349
2350 if (sysctl_sched_migration_cost == -1)
2351 return 1;
2352 if (sysctl_sched_migration_cost == 0)
2353 return 0;
2354
2355 delta = now - p->se.exec_start;
2356
2357 return delta < (s64)sysctl_sched_migration_cost;
2358}
2359
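
task_hot(), removed from this file here (it moves out of core.c along with the load-balancing code), treats a task as cache hot if it is a buddy candidate or if it ran more recently than sysctl_sched_migration_cost. A tiny sketch of that last check, assuming the usual default of 500000 ns (0.5 ms) for the migration cost; the function name below is made up:

#include <stdio.h>
#include <stdint.h>

/* The final check in task_hot(): a task that ran within the last
 * migration_cost nanoseconds is considered cache hot. A value of -1
 * means "always hot", 0 means "never hot". */
static int recently_ran(int64_t now, int64_t exec_start, int64_t migration_cost)
{
        if (migration_cost == -1)
                return 1;
        if (migration_cost == 0)
                return 0;
        return (now - exec_start) < migration_cost;
}

int main(void)
{
        int64_t cost = 500000;  /* assumed default: 0.5ms in nanoseconds */

        printf("%d %d\n",
               recently_ran(1000000, 800000, cost),     /* ran 0.2ms ago -> hot */
               recently_ran(1000000, 200000, cost));    /* ran 0.8ms ago -> cold */
        return 0;
}
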
2360void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1088void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2361{ 1089{
2362#ifdef CONFIG_SCHED_DEBUG 1090#ifdef CONFIG_SCHED_DEBUG
@@ -2783,6 +1511,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2783 1511
2784} 1512}
2785#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1513#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1514
1515static inline int ttwu_share_cache(int this_cpu, int that_cpu)
1516{
1517 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1518}
2786#endif /* CONFIG_SMP */ 1519#endif /* CONFIG_SMP */
2787 1520
2788static void ttwu_queue(struct task_struct *p, int cpu) 1521static void ttwu_queue(struct task_struct *p, int cpu)
@@ -2790,7 +1523,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
2790 struct rq *rq = cpu_rq(cpu); 1523 struct rq *rq = cpu_rq(cpu);
2791 1524
2792#if defined(CONFIG_SMP) 1525#if defined(CONFIG_SMP)
2793 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 1526 if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
2794 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1527 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2795 ttwu_queue_remote(p, cpu); 1528 ttwu_queue_remote(p, cpu);
2796 return; 1529 return;
@@ -3204,6 +1937,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3204 local_irq_enable(); 1937 local_irq_enable();
3205#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1938#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3206 finish_lock_switch(rq, prev); 1939 finish_lock_switch(rq, prev);
1940 trace_sched_stat_sleeptime(current, rq->clock);
3207 1941
3208 fire_sched_in_preempt_notifiers(current); 1942 fire_sched_in_preempt_notifiers(current);
3209 if (mm) 1943 if (mm)
@@ -3439,7 +2173,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
3439 */ 2173 */
3440static atomic_long_t calc_load_tasks_idle; 2174static atomic_long_t calc_load_tasks_idle;
3441 2175
3442static void calc_load_account_idle(struct rq *this_rq) 2176void calc_load_account_idle(struct rq *this_rq)
3443{ 2177{
3444 long delta; 2178 long delta;
3445 2179
@@ -3583,7 +2317,7 @@ static void calc_global_nohz(unsigned long ticks)
3583 */ 2317 */
3584} 2318}
3585#else 2319#else
3586static void calc_load_account_idle(struct rq *this_rq) 2320void calc_load_account_idle(struct rq *this_rq)
3587{ 2321{
3588} 2322}
3589 2323
@@ -3726,7 +2460,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3726 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2460 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3727 * every tick. We fix it up based on jiffies. 2461 * every tick. We fix it up based on jiffies.
3728 */ 2462 */
3729static void update_cpu_load(struct rq *this_rq) 2463void update_cpu_load(struct rq *this_rq)
3730{ 2464{
3731 unsigned long this_load = this_rq->load.weight; 2465 unsigned long this_load = this_rq->load.weight;
3732 unsigned long curr_jiffies = jiffies; 2466 unsigned long curr_jiffies = jiffies;
@@ -3804,8 +2538,10 @@ unlock:
3804#endif 2538#endif
3805 2539
3806DEFINE_PER_CPU(struct kernel_stat, kstat); 2540DEFINE_PER_CPU(struct kernel_stat, kstat);
2541DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3807 2542
3808EXPORT_PER_CPU_SYMBOL(kstat); 2543EXPORT_PER_CPU_SYMBOL(kstat);
2544EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3809 2545
3810/* 2546/*
3811 * Return any ns on the sched_clock that have not yet been accounted in 2547 * Return any ns on the sched_clock that have not yet been accounted in
@@ -3858,6 +2594,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3858 return ns; 2594 return ns;
3859} 2595}
3860 2596
2597#ifdef CONFIG_CGROUP_CPUACCT
2598struct cgroup_subsys cpuacct_subsys;
2599struct cpuacct root_cpuacct;
2600#endif
2601
2602static inline void task_group_account_field(struct task_struct *p, int index,
2603 u64 tmp)
2604{
2605#ifdef CONFIG_CGROUP_CPUACCT
2606 struct kernel_cpustat *kcpustat;
2607 struct cpuacct *ca;
2608#endif
2609 /*
2610 * Since every update is sure to touch the root cgroup, charge it
2611 * first. If the root cgroup is the only cgroup, nothing further
2612 * needs to be done.
2613 *
2614 */
2615 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2616
2617#ifdef CONFIG_CGROUP_CPUACCT
2618 if (unlikely(!cpuacct_subsys.active))
2619 return;
2620
2621 rcu_read_lock();
2622 ca = task_ca(p);
2623 while (ca && (ca != &root_cpuacct)) {
2624 kcpustat = this_cpu_ptr(ca->cpustat);
2625 kcpustat->cpustat[index] += tmp;
2626 ca = parent_ca(ca);
2627 }
2628 rcu_read_unlock();
2629#endif
2630}
2631
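
The new task_group_account_field() helper charges the root kernel_cpustat unconditionally and then walks the task's cpuacct hierarchy up to, but not including, the root, adding the same increment at every level. A userspace sketch of that hierarchical charge with a hypothetical two-level hierarchy (struct and function names are invented):

#include <stdio.h>

/* One accounting bucket per group; parent == NULL marks the root. */
struct group {
        const char *name;
        unsigned long long cputime;
        struct group *parent;
};

static struct group root = { "root", 0, NULL };

static void charge(struct group *g, unsigned long long delta)
{
        root.cputime += delta;                  /* the root is always charged */
        for (; g && g != &root; g = g->parent)
                g->cputime += delta;            /* then every ancestor below it */
}

int main(void)
{
        struct group parent = { "parent", 0, &root };
        struct group child = { "child", 0, &parent };

        charge(&child, 1000);
        printf("root=%llu parent=%llu child=%llu\n",
               root.cputime, parent.cputime, child.cputime);    /* all 1000 */
        return 0;
}
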
2632
3861/* 2633/*
3862 * Account user cpu time to a process. 2634 * Account user cpu time to a process.
3863 * @p: the process that the cpu time gets accounted to 2635 * @p: the process that the cpu time gets accounted to
@@ -3867,22 +2639,18 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3867void account_user_time(struct task_struct *p, cputime_t cputime, 2639void account_user_time(struct task_struct *p, cputime_t cputime,
3868 cputime_t cputime_scaled) 2640 cputime_t cputime_scaled)
3869{ 2641{
3870 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2642 int index;
3871 cputime64_t tmp;
3872 2643
3873 /* Add user time to process. */ 2644 /* Add user time to process. */
3874 p->utime = cputime_add(p->utime, cputime); 2645 p->utime += cputime;
3875 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2646 p->utimescaled += cputime_scaled;
3876 account_group_user_time(p, cputime); 2647 account_group_user_time(p, cputime);
3877 2648
2649 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2650
3878 /* Add user time to cpustat. */ 2651 /* Add user time to cpustat. */
3879 tmp = cputime_to_cputime64(cputime); 2652 task_group_account_field(p, index, (__force u64) cputime);
3880 if (TASK_NICE(p) > 0)
3881 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3882 else
3883 cpustat->user = cputime64_add(cpustat->user, tmp);
3884 2653
3885 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3886 /* Account for user time used */ 2654 /* Account for user time used */
3887 acct_update_integrals(p); 2655 acct_update_integrals(p);
3888} 2656}
@@ -3896,24 +2664,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
3896static void account_guest_time(struct task_struct *p, cputime_t cputime, 2664static void account_guest_time(struct task_struct *p, cputime_t cputime,
3897 cputime_t cputime_scaled) 2665 cputime_t cputime_scaled)
3898{ 2666{
3899 cputime64_t tmp; 2667 u64 *cpustat = kcpustat_this_cpu->cpustat;
3900 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3901
3902 tmp = cputime_to_cputime64(cputime);
3903 2668
3904 /* Add guest time to process. */ 2669 /* Add guest time to process. */
3905 p->utime = cputime_add(p->utime, cputime); 2670 p->utime += cputime;
3906 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2671 p->utimescaled += cputime_scaled;
3907 account_group_user_time(p, cputime); 2672 account_group_user_time(p, cputime);
3908 p->gtime = cputime_add(p->gtime, cputime); 2673 p->gtime += cputime;
3909 2674
3910 /* Add guest time to cpustat. */ 2675 /* Add guest time to cpustat. */
3911 if (TASK_NICE(p) > 0) { 2676 if (TASK_NICE(p) > 0) {
3912 cpustat->nice = cputime64_add(cpustat->nice, tmp); 2677 cpustat[CPUTIME_NICE] += (__force u64) cputime;
3913 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 2678 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
3914 } else { 2679 } else {
3915 cpustat->user = cputime64_add(cpustat->user, tmp); 2680 cpustat[CPUTIME_USER] += (__force u64) cputime;
3916 cpustat->guest = cputime64_add(cpustat->guest, tmp); 2681 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
3917 } 2682 }
3918} 2683}
3919 2684
@@ -3926,18 +2691,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3926 */ 2691 */
3927static inline 2692static inline
3928void __account_system_time(struct task_struct *p, cputime_t cputime, 2693void __account_system_time(struct task_struct *p, cputime_t cputime,
3929 cputime_t cputime_scaled, cputime64_t *target_cputime64) 2694 cputime_t cputime_scaled, int index)
3930{ 2695{
3931 cputime64_t tmp = cputime_to_cputime64(cputime);
3932
3933 /* Add system time to process. */ 2696 /* Add system time to process. */
3934 p->stime = cputime_add(p->stime, cputime); 2697 p->stime += cputime;
3935 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 2698 p->stimescaled += cputime_scaled;
3936 account_group_system_time(p, cputime); 2699 account_group_system_time(p, cputime);
3937 2700
3938 /* Add system time to cpustat. */ 2701 /* Add system time to cpustat. */
3939 *target_cputime64 = cputime64_add(*target_cputime64, tmp); 2702 task_group_account_field(p, index, (__force u64) cputime);
3940 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3941 2703
3942 /* Account for system time used */ 2704 /* Account for system time used */
3943 acct_update_integrals(p); 2705 acct_update_integrals(p);
@@ -3953,8 +2715,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
3953void account_system_time(struct task_struct *p, int hardirq_offset, 2715void account_system_time(struct task_struct *p, int hardirq_offset,
3954 cputime_t cputime, cputime_t cputime_scaled) 2716 cputime_t cputime, cputime_t cputime_scaled)
3955{ 2717{
3956 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2718 int index;
3957 cputime64_t *target_cputime64;
3958 2719
3959 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 2720 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3960 account_guest_time(p, cputime, cputime_scaled); 2721 account_guest_time(p, cputime, cputime_scaled);
@@ -3962,13 +2723,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3962 } 2723 }
3963 2724
3964 if (hardirq_count() - hardirq_offset) 2725 if (hardirq_count() - hardirq_offset)
3965 target_cputime64 = &cpustat->irq; 2726 index = CPUTIME_IRQ;
3966 else if (in_serving_softirq()) 2727 else if (in_serving_softirq())
3967 target_cputime64 = &cpustat->softirq; 2728 index = CPUTIME_SOFTIRQ;
3968 else 2729 else
3969 target_cputime64 = &cpustat->system; 2730 index = CPUTIME_SYSTEM;
3970 2731
3971 __account_system_time(p, cputime, cputime_scaled, target_cputime64); 2732 __account_system_time(p, cputime, cputime_scaled, index);
3972} 2733}
3973 2734
3974/* 2735/*
@@ -3977,10 +2738,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3977 */ 2738 */
3978void account_steal_time(cputime_t cputime) 2739void account_steal_time(cputime_t cputime)
3979{ 2740{
3980 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2741 u64 *cpustat = kcpustat_this_cpu->cpustat;
3981 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3982 2742
3983 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 2743 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
3984} 2744}
3985 2745
3986/* 2746/*
@@ -3989,14 +2749,13 @@ void account_steal_time(cputime_t cputime)
3989 */ 2749 */
3990void account_idle_time(cputime_t cputime) 2750void account_idle_time(cputime_t cputime)
3991{ 2751{
3992 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2752 u64 *cpustat = kcpustat_this_cpu->cpustat;
3993 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3994 struct rq *rq = this_rq(); 2753 struct rq *rq = this_rq();
3995 2754
3996 if (atomic_read(&rq->nr_iowait) > 0) 2755 if (atomic_read(&rq->nr_iowait) > 0)
3997 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 2756 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
3998 else 2757 else
3999 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 2758 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
4000} 2759}
4001 2760
4002static __always_inline bool steal_account_process_tick(void) 2761static __always_inline bool steal_account_process_tick(void)
@@ -4046,16 +2805,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4046 struct rq *rq) 2805 struct rq *rq)
4047{ 2806{
4048 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 2807 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
4049 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 2808 u64 *cpustat = kcpustat_this_cpu->cpustat;
4050 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4051 2809
4052 if (steal_account_process_tick()) 2810 if (steal_account_process_tick())
4053 return; 2811 return;
4054 2812
4055 if (irqtime_account_hi_update()) { 2813 if (irqtime_account_hi_update()) {
4056 cpustat->irq = cputime64_add(cpustat->irq, tmp); 2814 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
4057 } else if (irqtime_account_si_update()) { 2815 } else if (irqtime_account_si_update()) {
4058 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 2816 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
4059 } else if (this_cpu_ksoftirqd() == p) { 2817 } else if (this_cpu_ksoftirqd() == p) {
4060 /* 2818 /*
4061 * ksoftirqd time do not get accounted in cpu_softirq_time. 2819 * ksoftirqd time do not get accounted in cpu_softirq_time.
@@ -4063,7 +2821,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4063 * Also, p->stime needs to be updated for ksoftirqd. 2821 * Also, p->stime needs to be updated for ksoftirqd.
4064 */ 2822 */
4065 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2823 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4066 &cpustat->softirq); 2824 CPUTIME_SOFTIRQ);
4067 } else if (user_tick) { 2825 } else if (user_tick) {
4068 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 2826 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
4069 } else if (p == rq->idle) { 2827 } else if (p == rq->idle) {
@@ -4072,7 +2830,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4072 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 2830 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
4073 } else { 2831 } else {
4074 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2832 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4075 &cpustat->system); 2833 CPUTIME_SYSTEM);
4076 } 2834 }
4077} 2835}
4078 2836
@@ -4171,7 +2929,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4171 2929
4172void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2930void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4173{ 2931{
4174 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 2932 cputime_t rtime, utime = p->utime, total = utime + p->stime;
4175 2933
4176 /* 2934 /*
4177 * Use CFS's precise accounting: 2935 * Use CFS's precise accounting:
@@ -4179,11 +2937,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4179 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 2937 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
4180 2938
4181 if (total) { 2939 if (total) {
4182 u64 temp = rtime; 2940 u64 temp = (__force u64) rtime;
4183 2941
4184 temp *= utime; 2942 temp *= (__force u64) utime;
4185 do_div(temp, total); 2943 do_div(temp, (__force u32) total);
4186 utime = (cputime_t)temp; 2944 utime = (__force cputime_t) temp;
4187 } else 2945 } else
4188 utime = rtime; 2946 utime = rtime;
4189 2947
@@ -4191,7 +2949,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4191 * Compare with previous values, to keep monotonicity: 2949 * Compare with previous values, to keep monotonicity:
4192 */ 2950 */
4193 p->prev_utime = max(p->prev_utime, utime); 2951 p->prev_utime = max(p->prev_utime, utime);
4194 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 2952 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
4195 2953
4196 *ut = p->prev_utime; 2954 *ut = p->prev_utime;
4197 *st = p->prev_stime; 2955 *st = p->prev_stime;
@@ -4208,21 +2966,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4208 2966
4209 thread_group_cputime(p, &cputime); 2967 thread_group_cputime(p, &cputime);
4210 2968
4211 total = cputime_add(cputime.utime, cputime.stime); 2969 total = cputime.utime + cputime.stime;
4212 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 2970 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
4213 2971
4214 if (total) { 2972 if (total) {
4215 u64 temp = rtime; 2973 u64 temp = (__force u64) rtime;
4216 2974
4217 temp *= cputime.utime; 2975 temp *= (__force u64) cputime.utime;
4218 do_div(temp, total); 2976 do_div(temp, (__force u32) total);
4219 utime = (cputime_t)temp; 2977 utime = (__force cputime_t) temp;
4220 } else 2978 } else
4221 utime = rtime; 2979 utime = rtime;
4222 2980
4223 sig->prev_utime = max(sig->prev_utime, utime); 2981 sig->prev_utime = max(sig->prev_utime, utime);
4224 sig->prev_stime = max(sig->prev_stime, 2982 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
4225 cputime_sub(rtime, sig->prev_utime));
4226 2983
4227 *ut = sig->prev_utime; 2984 *ut = sig->prev_utime;
4228 *st = sig->prev_stime; 2985 *st = sig->prev_stime;
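
Both task_times() and thread_group_times() split the precise CFS runtime (rtime) between user and system time in the same ratio as the tick-sampled utime/stime, then clamp against the previously reported values so the numbers never go backwards. A standalone sketch of just the scaling step, in arbitrary tick units with made-up numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t utime = 300, stime = 100;      /* tick-sampled split */
        uint64_t rtime = 500;                   /* precise sum_exec_runtime */
        uint64_t total = utime + stime;

        /* rtime * utime / total, the same computation done with do_div(). */
        uint64_t scaled_utime = total ? rtime * utime / total : rtime;

        printf("utime %llu, stime %llu\n",
               (unsigned long long)scaled_utime,
               (unsigned long long)(rtime - scaled_utime));     /* 375, 125 */
        return 0;
}
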
@@ -4321,6 +3078,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
4321{ 3078{
4322 struct pt_regs *regs = get_irq_regs(); 3079 struct pt_regs *regs = get_irq_regs();
4323 3080
3081 if (oops_in_progress)
3082 return;
3083
4324 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3084 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4325 prev->comm, prev->pid, preempt_count()); 3085 prev->comm, prev->pid, preempt_count());
4326 3086
@@ -5852,6 +4612,13 @@ again:
5852 */ 4612 */
5853 if (preempt && rq != p_rq) 4613 if (preempt && rq != p_rq)
5854 resched_task(p_rq->curr); 4614 resched_task(p_rq->curr);
4615 } else {
4616 /*
4617 * We might have set it in task_yield_fair(), but are
4618 * not going to schedule(), so don't want to skip
4619 * the next update.
4620 */
4621 rq->skip_clock_update = 0;
5855 } 4622 }
5856 4623
5857out: 4624out:
@@ -6019,7 +4786,7 @@ void sched_show_task(struct task_struct *p)
6019 free = stack_not_used(p); 4786 free = stack_not_used(p);
6020#endif 4787#endif
6021 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4788 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6022 task_pid_nr(p), task_pid_nr(p->real_parent), 4789 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
6023 (unsigned long)task_thread_info(p)->flags); 4790 (unsigned long)task_thread_info(p)->flags);
6024 4791
6025 show_stack(p, NULL); 4792 show_stack(p, NULL);
@@ -6118,53 +4885,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6118#endif 4885#endif
6119} 4886}
6120 4887
6121/*
6122 * Increase the granularity value when there are more CPUs,
6123 * because with more CPUs the 'effective latency' as visible
6124 * to users decreases. But the relationship is not linear,
6125 * so pick a second-best guess by going with the log2 of the
6126 * number of CPUs.
6127 *
6128 * This idea comes from the SD scheduler of Con Kolivas:
6129 */
6130static int get_update_sysctl_factor(void)
6131{
6132 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6133 unsigned int factor;
6134
6135 switch (sysctl_sched_tunable_scaling) {
6136 case SCHED_TUNABLESCALING_NONE:
6137 factor = 1;
6138 break;
6139 case SCHED_TUNABLESCALING_LINEAR:
6140 factor = cpus;
6141 break;
6142 case SCHED_TUNABLESCALING_LOG:
6143 default:
6144 factor = 1 + ilog2(cpus);
6145 break;
6146 }
6147
6148 return factor;
6149}
6150
6151static void update_sysctl(void)
6152{
6153 unsigned int factor = get_update_sysctl_factor();
6154
6155#define SET_SYSCTL(name) \
6156 (sysctl_##name = (factor) * normalized_sysctl_##name)
6157 SET_SYSCTL(sched_min_granularity);
6158 SET_SYSCTL(sched_latency);
6159 SET_SYSCTL(sched_wakeup_granularity);
6160#undef SET_SYSCTL
6161}
6162
6163static inline void sched_init_granularity(void)
6164{
6165 update_sysctl();
6166}
6167
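
The helpers removed above scale the base scheduler tunables (sched_min_granularity, sched_latency, sched_wakeup_granularity) by a factor derived from the online CPU count; with the default SCHED_TUNABLESCALING_LOG policy that factor is 1 + ilog2(min(cpus, 8)). A standalone computation of the factor, mirroring get_update_sysctl_factor() with a local ilog2 stand-in:

#include <stdio.h>

/* floor(log2(x)), enough for the small values used here. */
static unsigned int ilog2_u(unsigned int x)
{
        unsigned int r = 0;

        while (x >>= 1)
                r++;
        return r;
}

int main(void)
{
        /* SCHED_TUNABLESCALING_LOG (the default): factor = 1 + ilog2(cpus),
         * with the cpu count capped at 8 as in the removed code. */
        for (unsigned int cpus = 1; cpus <= 16; cpus *= 2) {
                unsigned int capped = cpus < 8 ? cpus : 8;

                printf("%2u cpus -> factor %u\n", cpus, 1 + ilog2_u(capped));
        }
        return 0;
}
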
6168#ifdef CONFIG_SMP 4888#ifdef CONFIG_SMP
6169void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4889void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6170{ 4890{
@@ -6351,30 +5071,6 @@ static void calc_global_load_remove(struct rq *rq)
6351 rq->calc_load_active = 0; 5071 rq->calc_load_active = 0;
6352} 5072}
6353 5073
6354#ifdef CONFIG_CFS_BANDWIDTH
6355static void unthrottle_offline_cfs_rqs(struct rq *rq)
6356{
6357 struct cfs_rq *cfs_rq;
6358
6359 for_each_leaf_cfs_rq(rq, cfs_rq) {
6360 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6361
6362 if (!cfs_rq->runtime_enabled)
6363 continue;
6364
6365 /*
6366 * clock_task is not advancing so we just need to make sure
6367 * there's some valid quota amount
6368 */
6369 cfs_rq->runtime_remaining = cfs_b->quota;
6370 if (cfs_rq_throttled(cfs_rq))
6371 unthrottle_cfs_rq(cfs_rq);
6372 }
6373}
6374#else
6375static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6376#endif
6377
6378/* 5074/*
6379 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5075 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6380 * try_to_wake_up()->select_task_rq(). 5076 * try_to_wake_up()->select_task_rq().
@@ -6980,6 +5676,12 @@ out:
6980 return -ENOMEM; 5676 return -ENOMEM;
6981} 5677}
6982 5678
5679/*
5680 * By default the system creates a single root-domain with all cpus as
5681 * members (mimicking the global state we have today).
5682 */
5683struct root_domain def_root_domain;
5684
6983static void init_defrootdomain(void) 5685static void init_defrootdomain(void)
6984{ 5686{
6985 init_rootdomain(&def_root_domain); 5687 init_rootdomain(&def_root_domain);
@@ -7051,6 +5753,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
7051} 5753}
7052 5754
7053/* 5755/*
5756 * Keep a special pointer to the highest sched_domain that has
5757 * SD_SHARE_PKG_RESOURCES set (the Last Level Cache domain); this
5758 * allows us to avoid some pointer chasing in select_idle_sibling().
5759 *
5760 * Also keep a unique ID per domain (we use the first cpu number in
5761 * the cpumask of the domain); this allows us to quickly tell if
5762 * two cpus are in the same cache domain, see ttwu_share_cache().
5763 */
5764DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5765DEFINE_PER_CPU(int, sd_llc_id);
5766
5767static void update_top_cache_domain(int cpu)
5768{
5769 struct sched_domain *sd;
5770 int id = cpu;
5771
5772 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5773 if (sd)
5774 id = cpumask_first(sched_domain_span(sd));
5775
5776 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5777 per_cpu(sd_llc_id, cpu) = id;
5778}
5779
5780/*
7054 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5781 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
7055 * hold the hotplug lock. 5782 * hold the hotplug lock.
7056 */ 5783 */
@@ -7089,6 +5816,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
7089 tmp = rq->sd; 5816 tmp = rq->sd;
7090 rcu_assign_pointer(rq->sd, sd); 5817 rcu_assign_pointer(rq->sd, sd);
7091 destroy_sched_domains(tmp, cpu); 5818 destroy_sched_domains(tmp, cpu);
5819
5820 update_top_cache_domain(cpu);
7092} 5821}
7093 5822
7094/* cpus with isolated domains */ 5823/* cpus with isolated domains */
@@ -7248,7 +5977,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7248 continue; 5977 continue;
7249 5978
7250 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5979 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7251 GFP_KERNEL, cpu_to_node(i)); 5980 GFP_KERNEL, cpu_to_node(cpu));
7252 5981
7253 if (!sg) 5982 if (!sg)
7254 goto fail; 5983 goto fail;
@@ -7386,6 +6115,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7386 return; 6115 return;
7387 6116
7388 update_group_power(sd, cpu); 6117 update_group_power(sd, cpu);
6118 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6119}
6120
6121int __weak arch_sd_sibling_asym_packing(void)
6122{
6123 return 0*SD_ASYM_PACKING;
7389} 6124}
7390 6125
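arch_sd_sibling_asym_packing() is declared __weak so the generic scheduler gets a do-nothing default (no SD_ASYM_PACKING) while an architecture that wants asymmetric SMT packing can link in a strong definition. A minimal sketch of the weak-symbol mechanism with GCC/Clang; the flag value below is invented for the demo, and the strong override is only shown in a comment since it would live in a separate object file.

#include <stdio.h>

#define SD_ASYM_PACKING 0x0100	/* illustrative flag value, not the kernel's */

/* weak default: no asymmetric packing unless an arch overrides it */
__attribute__((weak)) int arch_sd_sibling_asym_packing(void)
{
	return 0 * SD_ASYM_PACKING;
}

/*
 * An architecture would provide a non-weak definition in its own file, e.g.
 *
 *	int arch_sd_sibling_asym_packing(void)
 *	{
 *		return SD_ASYM_PACKING;
 *	}
 *
 * and the linker would pick that one over the weak default above.
 */

int main(void)
{
	printf("asym packing flags: %#x\n", arch_sd_sibling_asym_packing());
	return 0;
}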
7391/* 6126/*
@@ -8023,29 +6758,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
8023 } 6758 }
8024} 6759}
8025 6760
8026static int update_runtime(struct notifier_block *nfb,
8027 unsigned long action, void *hcpu)
8028{
8029 int cpu = (int)(long)hcpu;
8030
8031 switch (action) {
8032 case CPU_DOWN_PREPARE:
8033 case CPU_DOWN_PREPARE_FROZEN:
8034 disable_runtime(cpu_rq(cpu));
8035 return NOTIFY_OK;
8036
8037 case CPU_DOWN_FAILED:
8038 case CPU_DOWN_FAILED_FROZEN:
8039 case CPU_ONLINE:
8040 case CPU_ONLINE_FROZEN:
8041 enable_runtime(cpu_rq(cpu));
8042 return NOTIFY_OK;
8043
8044 default:
8045 return NOTIFY_DONE;
8046 }
8047}
8048
8049void __init sched_init_smp(void) 6761void __init sched_init_smp(void)
8050{ 6762{
8051 cpumask_var_t non_isolated_cpus; 6763 cpumask_var_t non_isolated_cpus;
@@ -8094,104 +6806,11 @@ int in_sched_functions(unsigned long addr)
8094 && addr < (unsigned long)__sched_text_end); 6806 && addr < (unsigned long)__sched_text_end);
8095} 6807}
8096 6808
8097static void init_cfs_rq(struct cfs_rq *cfs_rq) 6809#ifdef CONFIG_CGROUP_SCHED
8098{ 6810struct task_group root_task_group;
8099 cfs_rq->tasks_timeline = RB_ROOT;
8100 INIT_LIST_HEAD(&cfs_rq->tasks);
8101 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8102#ifndef CONFIG_64BIT
8103 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8104#endif
8105}
8106
8107static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8108{
8109 struct rt_prio_array *array;
8110 int i;
8111
8112 array = &rt_rq->active;
8113 for (i = 0; i < MAX_RT_PRIO; i++) {
8114 INIT_LIST_HEAD(array->queue + i);
8115 __clear_bit(i, array->bitmap);
8116 }
8117 /* delimiter for bitsearch: */
8118 __set_bit(MAX_RT_PRIO, array->bitmap);
8119
8120#if defined CONFIG_SMP
8121 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8122 rt_rq->highest_prio.next = MAX_RT_PRIO;
8123 rt_rq->rt_nr_migratory = 0;
8124 rt_rq->overloaded = 0;
8125 plist_head_init(&rt_rq->pushable_tasks);
8126#endif
8127
8128 rt_rq->rt_time = 0;
8129 rt_rq->rt_throttled = 0;
8130 rt_rq->rt_runtime = 0;
8131 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8132}
8133
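init_rt_rq(), removed above as part of the code reshuffle, clears the per-priority bitmap and then sets one extra bit at MAX_RT_PRIO as a delimiter: a find-first-bit scan then always terminates at that sentinel when every real queue is empty, so the fast path needs no separate emptiness check. A userspace sketch of the sentinel trick, assuming the kernel's 100 RT priority levels:

#include <stdio.h>
#include <string.h>

#define MAX_RT_PRIO 100
#define NBITS (MAX_RT_PRIO + 1)		/* +1 for the delimiter bit */
#define WORDS ((NBITS + 63) / 64)

static unsigned long long bitmap[WORDS];

static void set_bit64(int i)   { bitmap[i / 64] |=  (1ULL << (i % 64)); }
static void clear_bit64(int i) { bitmap[i / 64] &= ~(1ULL << (i % 64)); }

/* find first set bit; the sentinel guarantees this always finds something */
static int first_set(void)
{
	for (int w = 0; w < WORDS; w++)
		if (bitmap[w])
			return w * 64 + __builtin_ctzll(bitmap[w]);
	return -1;	/* unreachable once the sentinel is set */
}

int main(void)
{
	memset(bitmap, 0, sizeof(bitmap));
	set_bit64(MAX_RT_PRIO);			/* delimiter for bitsearch */

	printf("empty: first set = %d\n", first_set());	       /* 100 == nothing queued */

	set_bit64(42);				/* queue something at prio 42 */
	printf("queued: first set = %d\n", first_set());       /* 42 */

	clear_bit64(42);
	printf("empty again: first set = %d\n", first_set());  /* 100 */
	return 0;
}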
8134#ifdef CONFIG_FAIR_GROUP_SCHED
8135static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8136 struct sched_entity *se, int cpu,
8137 struct sched_entity *parent)
8138{
8139 struct rq *rq = cpu_rq(cpu);
8140
8141 cfs_rq->tg = tg;
8142 cfs_rq->rq = rq;
8143#ifdef CONFIG_SMP
8144 /* allow initial update_cfs_load() to truncate */
8145 cfs_rq->load_stamp = 1;
8146#endif
8147 init_cfs_rq_runtime(cfs_rq);
8148
8149 tg->cfs_rq[cpu] = cfs_rq;
8150 tg->se[cpu] = se;
8151
8152 /* se could be NULL for root_task_group */
8153 if (!se)
8154 return;
8155
8156 if (!parent)
8157 se->cfs_rq = &rq->cfs;
8158 else
8159 se->cfs_rq = parent->my_q;
8160
8161 se->my_q = cfs_rq;
8162 update_load_set(&se->load, 0);
8163 se->parent = parent;
8164}
8165#endif 6811#endif
8166 6812
8167#ifdef CONFIG_RT_GROUP_SCHED 6813DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
8168static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8169 struct sched_rt_entity *rt_se, int cpu,
8170 struct sched_rt_entity *parent)
8171{
8172 struct rq *rq = cpu_rq(cpu);
8173
8174 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8175 rt_rq->rt_nr_boosted = 0;
8176 rt_rq->rq = rq;
8177 rt_rq->tg = tg;
8178
8179 tg->rt_rq[cpu] = rt_rq;
8180 tg->rt_se[cpu] = rt_se;
8181
8182 if (!rt_se)
8183 return;
8184
8185 if (!parent)
8186 rt_se->rt_rq = &rq->rt;
8187 else
8188 rt_se->rt_rq = parent->my_q;
8189
8190 rt_se->my_q = rt_rq;
8191 rt_se->parent = parent;
8192 INIT_LIST_HEAD(&rt_se->run_list);
8193}
8194#endif
8195 6814
8196void __init sched_init(void) 6815void __init sched_init(void)
8197{ 6816{
@@ -8249,9 +6868,17 @@ void __init sched_init(void)
8249#ifdef CONFIG_CGROUP_SCHED 6868#ifdef CONFIG_CGROUP_SCHED
8250 list_add(&root_task_group.list, &task_groups); 6869 list_add(&root_task_group.list, &task_groups);
8251 INIT_LIST_HEAD(&root_task_group.children); 6870 INIT_LIST_HEAD(&root_task_group.children);
6871 INIT_LIST_HEAD(&root_task_group.siblings);
8252 autogroup_init(&init_task); 6872 autogroup_init(&init_task);
6873
8253#endif /* CONFIG_CGROUP_SCHED */ 6874#endif /* CONFIG_CGROUP_SCHED */
8254 6875
6876#ifdef CONFIG_CGROUP_CPUACCT
6877 root_cpuacct.cpustat = &kernel_cpustat;
6878 root_cpuacct.cpuusage = alloc_percpu(u64);
6879 /* Too early, not expected to fail */
6880 BUG_ON(!root_cpuacct.cpuusage);
6881#endif
8255 for_each_possible_cpu(i) { 6882 for_each_possible_cpu(i) {
8256 struct rq *rq; 6883 struct rq *rq;
8257 6884
@@ -8263,7 +6890,7 @@ void __init sched_init(void)
8263 init_cfs_rq(&rq->cfs); 6890 init_cfs_rq(&rq->cfs);
8264 init_rt_rq(&rq->rt, rq); 6891 init_rt_rq(&rq->rt, rq);
8265#ifdef CONFIG_FAIR_GROUP_SCHED 6892#ifdef CONFIG_FAIR_GROUP_SCHED
8266 root_task_group.shares = root_task_group_load; 6893 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8267 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6894 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8268 /* 6895 /*
8269 * How much cpu bandwidth does root_task_group get? 6896 * How much cpu bandwidth does root_task_group get?
@@ -8313,7 +6940,7 @@ void __init sched_init(void)
8313 rq->avg_idle = 2*sysctl_sched_migration_cost; 6940 rq->avg_idle = 2*sysctl_sched_migration_cost;
8314 rq_attach_root(rq, &def_root_domain); 6941 rq_attach_root(rq, &def_root_domain);
8315#ifdef CONFIG_NO_HZ 6942#ifdef CONFIG_NO_HZ
8316 rq->nohz_balance_kick = 0; 6943 rq->nohz_flags = 0;
8317#endif 6944#endif
8318#endif 6945#endif
8319 init_rq_hrtick(rq); 6946 init_rq_hrtick(rq);
@@ -8326,10 +6953,6 @@ void __init sched_init(void)
8326 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6953 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8327#endif 6954#endif
8328 6955
8329#ifdef CONFIG_SMP
8330 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8331#endif
8332
8333#ifdef CONFIG_RT_MUTEXES 6956#ifdef CONFIG_RT_MUTEXES
8334 plist_head_init(&init_task.pi_waiters); 6957 plist_head_init(&init_task.pi_waiters);
8335#endif 6958#endif
@@ -8357,17 +6980,11 @@ void __init sched_init(void)
8357 6980
8358#ifdef CONFIG_SMP 6981#ifdef CONFIG_SMP
8359 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6982 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8360#ifdef CONFIG_NO_HZ
8361 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8362 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8363 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8364 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8365 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8366#endif
8367 /* May be allocated at isolcpus cmdline parse time */ 6983 /* May be allocated at isolcpus cmdline parse time */
8368 if (cpu_isolated_map == NULL) 6984 if (cpu_isolated_map == NULL)
8369 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6985 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8370#endif /* SMP */ 6986#endif
6987 init_sched_fair_class();
8371 6988
8372 scheduler_running = 1; 6989 scheduler_running = 1;
8373} 6990}
@@ -8519,169 +7136,14 @@ void set_curr_task(int cpu, struct task_struct *p)
8519 7136
8520#endif 7137#endif
8521 7138
8522#ifdef CONFIG_FAIR_GROUP_SCHED
8523static void free_fair_sched_group(struct task_group *tg)
8524{
8525 int i;
8526
8527 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8528
8529 for_each_possible_cpu(i) {
8530 if (tg->cfs_rq)
8531 kfree(tg->cfs_rq[i]);
8532 if (tg->se)
8533 kfree(tg->se[i]);
8534 }
8535
8536 kfree(tg->cfs_rq);
8537 kfree(tg->se);
8538}
8539
8540static
8541int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8542{
8543 struct cfs_rq *cfs_rq;
8544 struct sched_entity *se;
8545 int i;
8546
8547 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8548 if (!tg->cfs_rq)
8549 goto err;
8550 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8551 if (!tg->se)
8552 goto err;
8553
8554 tg->shares = NICE_0_LOAD;
8555
8556 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8557
8558 for_each_possible_cpu(i) {
8559 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8560 GFP_KERNEL, cpu_to_node(i));
8561 if (!cfs_rq)
8562 goto err;
8563
8564 se = kzalloc_node(sizeof(struct sched_entity),
8565 GFP_KERNEL, cpu_to_node(i));
8566 if (!se)
8567 goto err_free_rq;
8568
8569 init_cfs_rq(cfs_rq);
8570 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8571 }
8572
8573 return 1;
8574
8575err_free_rq:
8576 kfree(cfs_rq);
8577err:
8578 return 0;
8579}
8580
8581static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8582{
8583 struct rq *rq = cpu_rq(cpu);
8584 unsigned long flags;
8585
8586 /*
8587 * Only empty task groups can be destroyed; so we can speculatively
8588 * check on_list without danger of it being re-added.
8589 */
8590 if (!tg->cfs_rq[cpu]->on_list)
8591 return;
8592
8593 raw_spin_lock_irqsave(&rq->lock, flags);
8594 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8595 raw_spin_unlock_irqrestore(&rq->lock, flags);
8596}
8597#else /* !CONFIG_FAIR_GROUP_SCHED */
8598static inline void free_fair_sched_group(struct task_group *tg)
8599{
8600}
8601
8602static inline
8603int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8604{
8605 return 1;
8606}
8607
8608static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8609{
8610}
8611#endif /* CONFIG_FAIR_GROUP_SCHED */
8612
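alloc_fair_sched_group(), removed above as part of the move into kernel/sched/fair.c, uses the usual kernel idiom of forward gotos to stacked error labels so each failure point releases exactly what the current step acquired, leaving earlier allocations to the caller's free path. A stripped-down userspace sketch of that structure, with plain calloc/free standing in for kzalloc_node and per-cpu objects invented for the demo:

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct group {
	int *per_cpu_rq[NR_CPUS];
	int *per_cpu_se[NR_CPUS];
};

static void group_free(struct group *g)
{
	/* free(NULL) is a no-op, so partially built groups are fine here */
	for (int i = 0; i < NR_CPUS; i++) {
		free(g->per_cpu_rq[i]);
		free(g->per_cpu_se[i]);
	}
}

/* 1 on success, 0 on failure; the caller frees on failure, mirroring how
 * sched_create_group() falls back to free_fair_sched_group() */
static int group_alloc(struct group *g)
{
	int *rq, *se;

	for (int i = 0; i < NR_CPUS; i++) {
		rq = calloc(1, sizeof(*rq));
		if (!rq)
			goto err;
		se = calloc(1, sizeof(*se));
		if (!se)
			goto err_free_rq;	/* undo only this iteration's first half */

		g->per_cpu_rq[i] = rq;
		g->per_cpu_se[i] = se;
	}
	return 1;

err_free_rq:
	free(rq);
err:
	return 0;
}

int main(void)
{
	struct group g = {0};

	printf("alloc %s\n", group_alloc(&g) ? "succeeded" : "failed");
	group_free(&g);
	return 0;
}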
8613#ifdef CONFIG_RT_GROUP_SCHED 7139#ifdef CONFIG_RT_GROUP_SCHED
8614static void free_rt_sched_group(struct task_group *tg)
8615{
8616 int i;
8617
8618 if (tg->rt_se)
8619 destroy_rt_bandwidth(&tg->rt_bandwidth);
8620
8621 for_each_possible_cpu(i) {
8622 if (tg->rt_rq)
8623 kfree(tg->rt_rq[i]);
8624 if (tg->rt_se)
8625 kfree(tg->rt_se[i]);
8626 }
8627
8628 kfree(tg->rt_rq);
8629 kfree(tg->rt_se);
8630}
8631
8632static
8633int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8634{
8635 struct rt_rq *rt_rq;
8636 struct sched_rt_entity *rt_se;
8637 int i;
8638
8639 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8640 if (!tg->rt_rq)
8641 goto err;
8642 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8643 if (!tg->rt_se)
8644 goto err;
8645
8646 init_rt_bandwidth(&tg->rt_bandwidth,
8647 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8648
8649 for_each_possible_cpu(i) {
8650 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8651 GFP_KERNEL, cpu_to_node(i));
8652 if (!rt_rq)
8653 goto err;
8654
8655 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8656 GFP_KERNEL, cpu_to_node(i));
8657 if (!rt_se)
8658 goto err_free_rq;
8659
8660 init_rt_rq(rt_rq, cpu_rq(i));
8661 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8662 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8663 }
8664
8665 return 1;
8666
8667err_free_rq:
8668 kfree(rt_rq);
8669err:
8670 return 0;
8671}
8672#else /* !CONFIG_RT_GROUP_SCHED */ 7140#else /* !CONFIG_RT_GROUP_SCHED */
8673static inline void free_rt_sched_group(struct task_group *tg)
8674{
8675}
8676
8677static inline
8678int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8679{
8680 return 1;
8681}
8682#endif /* CONFIG_RT_GROUP_SCHED */ 7141#endif /* CONFIG_RT_GROUP_SCHED */
8683 7142
8684#ifdef CONFIG_CGROUP_SCHED 7143#ifdef CONFIG_CGROUP_SCHED
7144/* task_group_lock serializes the addition/removal of task groups */
7145static DEFINE_SPINLOCK(task_group_lock);
7146
8685static void free_sched_group(struct task_group *tg) 7147static void free_sched_group(struct task_group *tg)
8686{ 7148{
8687 free_fair_sched_group(tg); 7149 free_fair_sched_group(tg);
@@ -8787,47 +7249,6 @@ void sched_move_task(struct task_struct *tsk)
8787#endif /* CONFIG_CGROUP_SCHED */ 7249#endif /* CONFIG_CGROUP_SCHED */
8788 7250
8789#ifdef CONFIG_FAIR_GROUP_SCHED 7251#ifdef CONFIG_FAIR_GROUP_SCHED
8790static DEFINE_MUTEX(shares_mutex);
8791
8792int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8793{
8794 int i;
8795 unsigned long flags;
8796
8797 /*
8798 * We can't change the weight of the root cgroup.
8799 */
8800 if (!tg->se[0])
8801 return -EINVAL;
8802
8803 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8804
8805 mutex_lock(&shares_mutex);
8806 if (tg->shares == shares)
8807 goto done;
8808
8809 tg->shares = shares;
8810 for_each_possible_cpu(i) {
8811 struct rq *rq = cpu_rq(i);
8812 struct sched_entity *se;
8813
8814 se = tg->se[i];
8815 /* Propagate contribution to hierarchy */
8816 raw_spin_lock_irqsave(&rq->lock, flags);
8817 for_each_sched_entity(se)
8818 update_cfs_shares(group_cfs_rq(se));
8819 raw_spin_unlock_irqrestore(&rq->lock, flags);
8820 }
8821
8822done:
8823 mutex_unlock(&shares_mutex);
8824 return 0;
8825}
8826
8827unsigned long sched_group_shares(struct task_group *tg)
8828{
8829 return tg->shares;
8830}
8831#endif 7252#endif
8832 7253
8833#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7254#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
@@ -8852,7 +7273,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
8852 struct task_struct *g, *p; 7273 struct task_struct *g, *p;
8853 7274
8854 do_each_thread(g, p) { 7275 do_each_thread(g, p) {
8855 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 7276 if (rt_task(p) && task_rq(p)->rt.tg == tg)
8856 return 1; 7277 return 1;
8857 } while_each_thread(g, p); 7278 } while_each_thread(g, p);
8858 7279
@@ -9203,8 +7624,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9203 7624
9204static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7625static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9205{ 7626{
9206 int i, ret = 0, runtime_enabled; 7627 int i, ret = 0, runtime_enabled, runtime_was_enabled;
9207 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7628 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9208 7629
9209 if (tg == &root_task_group) 7630 if (tg == &root_task_group)
9210 return -EINVAL; 7631 return -EINVAL;
@@ -9231,6 +7652,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9231 goto out_unlock; 7652 goto out_unlock;
9232 7653
9233 runtime_enabled = quota != RUNTIME_INF; 7654 runtime_enabled = quota != RUNTIME_INF;
7655 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7656 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
9234 raw_spin_lock_irq(&cfs_b->lock); 7657 raw_spin_lock_irq(&cfs_b->lock);
9235 cfs_b->period = ns_to_ktime(period); 7658 cfs_b->period = ns_to_ktime(period);
9236 cfs_b->quota = quota; 7659 cfs_b->quota = quota;
@@ -9246,13 +7669,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9246 7669
9247 for_each_possible_cpu(i) { 7670 for_each_possible_cpu(i) {
9248 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7671 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9249 struct rq *rq = rq_of(cfs_rq); 7672 struct rq *rq = cfs_rq->rq;
9250 7673
9251 raw_spin_lock_irq(&rq->lock); 7674 raw_spin_lock_irq(&rq->lock);
9252 cfs_rq->runtime_enabled = runtime_enabled; 7675 cfs_rq->runtime_enabled = runtime_enabled;
9253 cfs_rq->runtime_remaining = 0; 7676 cfs_rq->runtime_remaining = 0;
9254 7677
9255 if (cfs_rq_throttled(cfs_rq)) 7678 if (cfs_rq->throttled)
9256 unthrottle_cfs_rq(cfs_rq); 7679 unthrottle_cfs_rq(cfs_rq);
9257 raw_spin_unlock_irq(&rq->lock); 7680 raw_spin_unlock_irq(&rq->lock);
9258 } 7681 }
@@ -9266,7 +7689,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9266{ 7689{
9267 u64 quota, period; 7690 u64 quota, period;
9268 7691
9269 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7692 period = ktime_to_ns(tg->cfs_bandwidth.period);
9270 if (cfs_quota_us < 0) 7693 if (cfs_quota_us < 0)
9271 quota = RUNTIME_INF; 7694 quota = RUNTIME_INF;
9272 else 7695 else
@@ -9279,10 +7702,10 @@ long tg_get_cfs_quota(struct task_group *tg)
9279{ 7702{
9280 u64 quota_us; 7703 u64 quota_us;
9281 7704
9282 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) 7705 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
9283 return -1; 7706 return -1;
9284 7707
9285 quota_us = tg_cfs_bandwidth(tg)->quota; 7708 quota_us = tg->cfs_bandwidth.quota;
9286 do_div(quota_us, NSEC_PER_USEC); 7709 do_div(quota_us, NSEC_PER_USEC);
9287 7710
9288 return quota_us; 7711 return quota_us;
@@ -9293,10 +7716,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9293 u64 quota, period; 7716 u64 quota, period;
9294 7717
9295 period = (u64)cfs_period_us * NSEC_PER_USEC; 7718 period = (u64)cfs_period_us * NSEC_PER_USEC;
9296 quota = tg_cfs_bandwidth(tg)->quota; 7719 quota = tg->cfs_bandwidth.quota;
9297
9298 if (period <= 0)
9299 return -EINVAL;
9300 7720
9301 return tg_set_cfs_bandwidth(tg, period, quota); 7721 return tg_set_cfs_bandwidth(tg, period, quota);
9302} 7722}
@@ -9305,7 +7725,7 @@ long tg_get_cfs_period(struct task_group *tg)
9305{ 7725{
9306 u64 cfs_period_us; 7726 u64 cfs_period_us;
9307 7727
9308 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7728 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9309 do_div(cfs_period_us, NSEC_PER_USEC); 7729 do_div(cfs_period_us, NSEC_PER_USEC);
9310 7730
9311 return cfs_period_us; 7731 return cfs_period_us;
@@ -9365,13 +7785,13 @@ static u64 normalize_cfs_quota(struct task_group *tg,
9365static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7785static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9366{ 7786{
9367 struct cfs_schedulable_data *d = data; 7787 struct cfs_schedulable_data *d = data;
9368 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7788 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9369 s64 quota = 0, parent_quota = -1; 7789 s64 quota = 0, parent_quota = -1;
9370 7790
9371 if (!tg->parent) { 7791 if (!tg->parent) {
9372 quota = RUNTIME_INF; 7792 quota = RUNTIME_INF;
9373 } else { 7793 } else {
9374 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); 7794 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9375 7795
9376 quota = normalize_cfs_quota(tg, d); 7796 quota = normalize_cfs_quota(tg, d);
9377 parent_quota = parent_b->hierarchal_quota; 7797 parent_quota = parent_b->hierarchal_quota;
@@ -9415,7 +7835,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9415 struct cgroup_map_cb *cb) 7835 struct cgroup_map_cb *cb)
9416{ 7836{
9417 struct task_group *tg = cgroup_tg(cgrp); 7837 struct task_group *tg = cgroup_tg(cgrp);
9418 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7838 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9419 7839
9420 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7840 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9421 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7841 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
@@ -9516,38 +7936,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9516 * (balbir@in.ibm.com). 7936 * (balbir@in.ibm.com).
9517 */ 7937 */
9518 7938
9519/* track cpu usage of a group of tasks and its child groups */
9520struct cpuacct {
9521 struct cgroup_subsys_state css;
9522 /* cpuusage holds pointer to a u64-type object on every cpu */
9523 u64 __percpu *cpuusage;
9524 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9525 struct cpuacct *parent;
9526};
9527
9528struct cgroup_subsys cpuacct_subsys;
9529
9530/* return cpu accounting group corresponding to this container */
9531static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9532{
9533 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9534 struct cpuacct, css);
9535}
9536
9537/* return cpu accounting group to which this task belongs */
9538static inline struct cpuacct *task_ca(struct task_struct *tsk)
9539{
9540 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9541 struct cpuacct, css);
9542}
9543
9544/* create a new cpu accounting group */ 7939/* create a new cpu accounting group */
9545static struct cgroup_subsys_state *cpuacct_create( 7940static struct cgroup_subsys_state *cpuacct_create(
9546 struct cgroup_subsys *ss, struct cgroup *cgrp) 7941 struct cgroup_subsys *ss, struct cgroup *cgrp)
9547{ 7942{
9548 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7943 struct cpuacct *ca;
9549 int i;
9550 7944
7945 if (!cgrp->parent)
7946 return &root_cpuacct.css;
7947
7948 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9551 if (!ca) 7949 if (!ca)
9552 goto out; 7950 goto out;
9553 7951
@@ -9555,18 +7953,13 @@ static struct cgroup_subsys_state *cpuacct_create(
9555 if (!ca->cpuusage) 7953 if (!ca->cpuusage)
9556 goto out_free_ca; 7954 goto out_free_ca;
9557 7955
9558 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7956 ca->cpustat = alloc_percpu(struct kernel_cpustat);
9559 if (percpu_counter_init(&ca->cpustat[i], 0)) 7957 if (!ca->cpustat)
9560 goto out_free_counters; 7958 goto out_free_cpuusage;
9561
9562 if (cgrp->parent)
9563 ca->parent = cgroup_ca(cgrp->parent);
9564 7959
9565 return &ca->css; 7960 return &ca->css;
9566 7961
9567out_free_counters: 7962out_free_cpuusage:
9568 while (--i >= 0)
9569 percpu_counter_destroy(&ca->cpustat[i]);
9570 free_percpu(ca->cpuusage); 7963 free_percpu(ca->cpuusage);
9571out_free_ca: 7964out_free_ca:
9572 kfree(ca); 7965 kfree(ca);
@@ -9579,10 +7972,8 @@ static void
9579cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7972cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9580{ 7973{
9581 struct cpuacct *ca = cgroup_ca(cgrp); 7974 struct cpuacct *ca = cgroup_ca(cgrp);
9582 int i;
9583 7975
9584 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7976 free_percpu(ca->cpustat);
9585 percpu_counter_destroy(&ca->cpustat[i]);
9586 free_percpu(ca->cpuusage); 7977 free_percpu(ca->cpuusage);
9587 kfree(ca); 7978 kfree(ca);
9588} 7979}
@@ -9675,16 +8066,31 @@ static const char *cpuacct_stat_desc[] = {
9675}; 8066};
9676 8067
9677static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8068static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9678 struct cgroup_map_cb *cb) 8069 struct cgroup_map_cb *cb)
9679{ 8070{
9680 struct cpuacct *ca = cgroup_ca(cgrp); 8071 struct cpuacct *ca = cgroup_ca(cgrp);
9681 int i; 8072 int cpu;
8073 s64 val = 0;
8074
8075 for_each_online_cpu(cpu) {
8076 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8077 val += kcpustat->cpustat[CPUTIME_USER];
8078 val += kcpustat->cpustat[CPUTIME_NICE];
8079 }
8080 val = cputime64_to_clock_t(val);
8081 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
9682 8082
9683 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 8083 val = 0;
9684 s64 val = percpu_counter_read(&ca->cpustat[i]); 8084 for_each_online_cpu(cpu) {
9685 val = cputime64_to_clock_t(val); 8085 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
9686 cb->fill(cb, cpuacct_stat_desc[i], val); 8086 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8087 val += kcpustat->cpustat[CPUTIME_IRQ];
8088 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
9687 } 8089 }
8090
8091 val = cputime64_to_clock_t(val);
8092 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8093
9688 return 0; 8094 return 0;
9689} 8095}
9690 8096
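The rewritten cpuacct_stats_show() above drops the per-index percpu_counters and instead walks every online CPU's kernel_cpustat, summing USER+NICE into the user figure and SYSTEM+IRQ+SOFTIRQ into the system figure before converting to clock ticks. A userspace sketch of that aggregation, with a flat array and invented numbers standing in for per_cpu_ptr(ca->cpustat, cpu):

#include <stdio.h>
#include <stdint.h>

enum { CPUTIME_USER, CPUTIME_NICE, CPUTIME_SYSTEM,
       CPUTIME_IRQ, CPUTIME_SOFTIRQ, NR_STATS };

#define NR_CPUS 4

/* stand-in for each CPU's kernel_cpustat.cpustat[]; values are made up */
static const uint64_t cpustat[NR_CPUS][NR_STATS] = {
	{ 1000, 10, 300, 5, 7 },
	{ 2000, 20, 400, 6, 8 },
	{  500,  5, 100, 1, 2 },
	{ 1500, 15, 250, 3, 4 },
};

int main(void)
{
	uint64_t user = 0, sys = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		user += cpustat[cpu][CPUTIME_USER] + cpustat[cpu][CPUTIME_NICE];
		sys  += cpustat[cpu][CPUTIME_SYSTEM] +
			cpustat[cpu][CPUTIME_IRQ] +
			cpustat[cpu][CPUTIME_SOFTIRQ];
	}

	/* cputime64_to_clock_t() is treated as identity: 1 unit == 1 tick here */
	printf("user   %llu\n", (unsigned long long)user);
	printf("system %llu\n", (unsigned long long)sys);
	return 0;
}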
@@ -9714,7 +8120,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9714 * 8120 *
9715 * called with rq->lock held. 8121 * called with rq->lock held.
9716 */ 8122 */
9717static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8123void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9718{ 8124{
9719 struct cpuacct *ca; 8125 struct cpuacct *ca;
9720 int cpu; 8126 int cpu;
@@ -9728,7 +8134,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9728 8134
9729 ca = task_ca(tsk); 8135 ca = task_ca(tsk);
9730 8136
9731 for (; ca; ca = ca->parent) { 8137 for (; ca; ca = parent_ca(ca)) {
9732 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8138 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9733 *cpuusage += cputime; 8139 *cpuusage += cputime;
9734 } 8140 }
@@ -9736,45 +8142,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9736 rcu_read_unlock(); 8142 rcu_read_unlock();
9737} 8143}
9738 8144
9739/*
9740 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9741 * in cputime_t units. As a result, cpuacct_update_stats calls
9742 * percpu_counter_add with values large enough to always overflow the
9743 * per cpu batch limit causing bad SMP scalability.
9744 *
9745 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9746 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9747 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9748 */
9749#ifdef CONFIG_SMP
9750#define CPUACCT_BATCH \
9751 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9752#else
9753#define CPUACCT_BATCH 0
9754#endif
9755
9756/*
9757 * Charge the system/user time to the task's accounting group.
9758 */
9759static void cpuacct_update_stats(struct task_struct *tsk,
9760 enum cpuacct_stat_index idx, cputime_t val)
9761{
9762 struct cpuacct *ca;
9763 int batch = CPUACCT_BATCH;
9764
9765 if (unlikely(!cpuacct_subsys.active))
9766 return;
9767
9768 rcu_read_lock();
9769 ca = task_ca(tsk);
9770
9771 do {
9772 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9773 ca = ca->parent;
9774 } while (ca);
9775 rcu_read_unlock();
9776}
9777
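The deleted comment above records why cpuacct scaled its percpu_counter batch at all: with CONFIG_VIRT_CPU_ACCOUNTING a single jiffy is a very large cputime value, so the default batch would be exceeded on every add and the counters would fall back to the slow, contended path. The fix was to multiply the batch by cputime_one_jiffy and clamp at INT_MAX. A quick illustration of that min_t() computation, with invented magnitudes (real values depend on architecture and config):

#include <stdio.h>
#include <limits.h>

int main(void)
{
	/* invented numbers for illustration only */
	long long percpu_counter_batch = 64;		/* default-ish batch          */
	long long cputime_one_jiffy    = 40960000;	/* huge with VIRT_CPU_ACCOUNTING */

	long long scaled = percpu_counter_batch * cputime_one_jiffy;
	long long batch  = scaled < INT_MAX ? scaled : INT_MAX; /* min_t(long, ..., INT_MAX) */

	printf("scaled batch = %lld, clamped batch = %lld\n", scaled, batch);
	return 0;
}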
9778struct cgroup_subsys cpuacct_subsys = { 8145struct cgroup_subsys cpuacct_subsys = {
9779 .name = "cpuacct", 8146 .name = "cpuacct",
9780 .create = cpuacct_create, 8147 .create = cpuacct_create,
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c
index a86cf9d9eb11..b0d798eaf130 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched_cpupri.c 2 * kernel/sched/cpupri.c
3 * 3 *
4 * CPU priority management 4 * CPU priority management
5 * 5 *
@@ -28,7 +28,7 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include "sched_cpupri.h" 31#include "cpupri.h"
32 32
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
34static int convert_prio(int prio) 34static int convert_prio(int prio)
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched/cpupri.h
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c
index a6710a112b4f..2a075e10004b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched/debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/time/sched_debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree
5 * 5 *
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19#include "sched.h"
20
19static DEFINE_SPINLOCK(sched_debug_lock); 21static DEFINE_SPINLOCK(sched_debug_lock);
20 22
21/* 23/*
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
373 return 0; 375 return 0;
374} 376}
375 377
376static void sysrq_sched_debug_show(void) 378void sysrq_sched_debug_show(void)
377{ 379{
378 sched_debug_show(NULL, NULL); 380 sched_debug_show(NULL, NULL);
379} 381}
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c
index 8a39fa3e3c6c..8e42de9105f8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,13 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29
30#include <trace/events/sched.h>
31
32#include "sched.h"
26 33
27/* 34/*
28 * Targeted preemption latency for CPU-bound tasks: 35 * Targeted preemption latency for CPU-bound tasks:
@@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 110unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif 111#endif
105 112
106static const struct sched_class fair_sched_class; 113/*
114 * Increase the granularity value when there are more CPUs,
115 * because with more CPUs the 'effective latency' as visible
116 * to users decreases. But the relationship is not linear,
117 * so pick a second-best guess by going with the log2 of the
118 * number of CPUs.
119 *
120 * This idea comes from the SD scheduler of Con Kolivas:
121 */
122static int get_update_sysctl_factor(void)
123{
124 unsigned int cpus = min_t(int, num_online_cpus(), 8);
125 unsigned int factor;
126
127 switch (sysctl_sched_tunable_scaling) {
128 case SCHED_TUNABLESCALING_NONE:
129 factor = 1;
130 break;
131 case SCHED_TUNABLESCALING_LINEAR:
132 factor = cpus;
133 break;
134 case SCHED_TUNABLESCALING_LOG:
135 default:
136 factor = 1 + ilog2(cpus);
137 break;
138 }
139
140 return factor;
141}
142
143static void update_sysctl(void)
144{
145 unsigned int factor = get_update_sysctl_factor();
146
147#define SET_SYSCTL(name) \
148 (sysctl_##name = (factor) * normalized_sysctl_##name)
149 SET_SYSCTL(sched_min_granularity);
150 SET_SYSCTL(sched_latency);
151 SET_SYSCTL(sched_wakeup_granularity);
152#undef SET_SYSCTL
153}
154
155void sched_init_granularity(void)
156{
157 update_sysctl();
158}
159
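The block just added to fair.c carries its rationale with it: the effective latency a user sees shrinks as CPUs are added, so the base tunables are multiplied by 1, by the CPU count, or by 1 + log2(cpus) (with the count capped at 8) depending on sysctl_sched_tunable_scaling, and SET_SYSCTL() applies that factor to each normalized value. A small userspace sketch of the same arithmetic; the normalized defaults assumed below (6 ms latency, 0.75 ms minimum granularity) are roughly the values of this era and are used purely for illustration.

#include <stdio.h>

enum { NONE, LINEAR, LOG };

/* same capped-at-8 logic as get_update_sysctl_factor(), userspace sketch */
static unsigned int scale_factor(unsigned int online_cpus, int mode)
{
	unsigned int cpus = online_cpus < 8 ? online_cpus : 8;

	switch (mode) {
	case NONE:   return 1;
	case LINEAR: return cpus;
	case LOG:
	default:     return 1 + (31 - __builtin_clz(cpus));	/* 1 + ilog2(cpus) */
	}
}

int main(void)
{
	/* assumed normalized defaults: 6 ms latency, 0.75 ms min granularity */
	unsigned long latency_ns = 6000000UL, min_gran_ns = 750000UL;

	for (unsigned int cpus = 1; cpus <= 16; cpus *= 2) {
		unsigned int f = scale_factor(cpus, LOG);
		printf("%2u cpus -> factor %u, latency %lu ns, min_gran %lu ns\n",
		       cpus, f, f * latency_ns, f * min_gran_ns);
	}
	return 0;
}

With the default logarithmic mode an 8-way box ends up with 4x the single-CPU values rather than the 8x that linear scaling would give.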
160#if BITS_PER_LONG == 32
161# define WMULT_CONST (~0UL)
162#else
163# define WMULT_CONST (1UL << 32)
164#endif
165
166#define WMULT_SHIFT 32
167
168/*
169 * Shift right and round:
170 */
171#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
172
173/*
174 * delta *= weight / lw
175 */
176static unsigned long
177calc_delta_mine(unsigned long delta_exec, unsigned long weight,
178 struct load_weight *lw)
179{
180 u64 tmp;
181
182 /*
183 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
184 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
185 * 2^SCHED_LOAD_RESOLUTION.
186 */
187 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
188 tmp = (u64)delta_exec * scale_load_down(weight);
189 else
190 tmp = (u64)delta_exec;
191
192 if (!lw->inv_weight) {
193 unsigned long w = scale_load_down(lw->weight);
194
195 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
196 lw->inv_weight = 1;
197 else if (unlikely(!w))
198 lw->inv_weight = WMULT_CONST;
199 else
200 lw->inv_weight = WMULT_CONST / w;
201 }
202
203 /*
204 * Check whether we'd overflow the 64-bit multiplication:
205 */
206 if (unlikely(tmp > WMULT_CONST))
207 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
208 WMULT_SHIFT/2);
209 else
210 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
211
212 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
213}
214
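calc_delta_mine() computes delta * weight / lw->weight without a division on the hot path: it caches a reciprocal, inv_weight = 2^32 / lw->weight, multiplies by it, and shifts back down by WMULT_SHIFT with rounding (SRR), splitting the shift into two halves when the intermediate product could overflow 64 bits. A userspace sketch of the same arithmetic; the weights below are ordinary CFS-style values picked for illustration (a weight-1024 entity on a queue of total weight 3072 should be credited about a third of the elapsed time).

#include <stdio.h>
#include <stdint.h>

#define WMULT_CONST (1ULL << 32)
#define WMULT_SHIFT 32

/* shift right and round, as in the kernel's SRR() */
static uint64_t srr(uint64_t x, unsigned int y)
{
	return (x + (1ULL << (y - 1))) >> y;
}

/* delta * weight / lw_weight in fixed point, mirroring calc_delta_mine() */
static unsigned long calc_delta(unsigned long delta, unsigned long weight,
				unsigned long lw_weight)
{
	uint64_t inv = WMULT_CONST / lw_weight;	/* cached as lw->inv_weight */
	uint64_t tmp = (uint64_t)delta * weight;

	if (tmp > WMULT_CONST)			/* avoid overflowing tmp * inv */
		return (unsigned long)srr(srr(tmp, WMULT_SHIFT / 2) * inv,
					  WMULT_SHIFT / 2);
	return (unsigned long)srr(tmp * inv, WMULT_SHIFT);
}

int main(void)
{
	unsigned long delta_ns = 6000000;	/* 6 ms of wall-clock time */

	printf("scaled delta = %lu ns\n", calc_delta(delta_ns, 1024, 3072));
	return 0;
}

Compiled and run, this prints roughly 2000000 ns, the expected one-third share, using only multiplies and shifts on the scaled path.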
215
216const struct sched_class fair_sched_class;
107 217
108/************************************************************** 218/**************************************************************
109 * CFS operations on generic schedulable entities: 219 * CFS operations on generic schedulable entities:
@@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
413 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 523 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
414} 524}
415 525
416static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 526struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
417{ 527{
418 struct rb_node *left = cfs_rq->rb_leftmost; 528 struct rb_node *left = cfs_rq->rb_leftmost;
419 529
@@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
434} 544}
435 545
436#ifdef CONFIG_SCHED_DEBUG 546#ifdef CONFIG_SCHED_DEBUG
437static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 547struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
438{ 548{
439 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 549 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
440 550
@@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
684{ 794{
685 update_load_add(&cfs_rq->load, se->load.weight); 795 update_load_add(&cfs_rq->load, se->load.weight);
686 if (!parent_entity(se)) 796 if (!parent_entity(se))
687 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 797 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
688 if (entity_is_task(se)) { 798 if (entity_is_task(se)) {
689 add_cfs_task_weight(cfs_rq, se->load.weight); 799 add_cfs_task_weight(cfs_rq, se->load.weight);
690 list_add(&se->group_node, &cfs_rq->tasks); 800 list_add(&se->group_node, &cfs_rq->tasks);
@@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
697{ 807{
698 update_load_sub(&cfs_rq->load, se->load.weight); 808 update_load_sub(&cfs_rq->load, se->load.weight);
699 if (!parent_entity(se)) 809 if (!parent_entity(se))
700 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 810 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
701 if (entity_is_task(se)) { 811 if (entity_is_task(se)) {
702 add_cfs_task_weight(cfs_rq, -se->load.weight); 812 add_cfs_task_weight(cfs_rq, -se->load.weight);
703 list_del_init(&se->group_node); 813 list_del_init(&se->group_node);
@@ -893,7 +1003,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
893 if (unlikely(delta > se->statistics.sleep_max)) 1003 if (unlikely(delta > se->statistics.sleep_max))
894 se->statistics.sleep_max = delta; 1004 se->statistics.sleep_max = delta;
895 1005
896 se->statistics.sleep_start = 0;
897 se->statistics.sum_sleep_runtime += delta; 1006 se->statistics.sum_sleep_runtime += delta;
898 1007
899 if (tsk) { 1008 if (tsk) {
@@ -910,7 +1019,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
910 if (unlikely(delta > se->statistics.block_max)) 1019 if (unlikely(delta > se->statistics.block_max))
911 se->statistics.block_max = delta; 1020 se->statistics.block_max = delta;
912 1021
913 se->statistics.block_start = 0;
914 se->statistics.sum_sleep_runtime += delta; 1022 se->statistics.sum_sleep_runtime += delta;
915 1023
916 if (tsk) { 1024 if (tsk) {
@@ -920,6 +1028,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
920 trace_sched_stat_iowait(tsk, delta); 1028 trace_sched_stat_iowait(tsk, delta);
921 } 1029 }
922 1030
1031 trace_sched_stat_blocked(tsk, delta);
1032
923 /* 1033 /*
924 * Blocking time is in units of nanosecs, so shift by 1034 * Blocking time is in units of nanosecs, so shift by
925 * 20 to get a milliseconds-range estimation of the 1035 * 20 to get a milliseconds-range estimation of the
@@ -1287,6 +1397,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1287 */ 1397 */
1288 1398
1289#ifdef CONFIG_CFS_BANDWIDTH 1399#ifdef CONFIG_CFS_BANDWIDTH
1400
1401#ifdef HAVE_JUMP_LABEL
1402static struct jump_label_key __cfs_bandwidth_used;
1403
1404static inline bool cfs_bandwidth_used(void)
1405{
1406 return static_branch(&__cfs_bandwidth_used);
1407}
1408
1409void account_cfs_bandwidth_used(int enabled, int was_enabled)
1410{
1411 /* only need to count groups transitioning between enabled/!enabled */
1412 if (enabled && !was_enabled)
1413 jump_label_inc(&__cfs_bandwidth_used);
1414 else if (!enabled && was_enabled)
1415 jump_label_dec(&__cfs_bandwidth_used);
1416}
1417#else /* HAVE_JUMP_LABEL */
1418static bool cfs_bandwidth_used(void)
1419{
1420 return true;
1421}
1422
1423void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1424#endif /* HAVE_JUMP_LABEL */
1425
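cfs_bandwidth_used() turns the global "does any group have a CFS quota at all?" question into a jump label: on HAVE_JUMP_LABEL kernels the check is a patched branch that costs essentially nothing while no group is constrained, and account_cfs_bandwidth_used() bumps the key only when a group transitions between limited and unlimited. A plain-C sketch of the counting logic, with an ordinary reference count standing in for the self-patching key (the run-time code patching itself cannot be shown from userspace):

#include <stdio.h>

/* stand-in for struct jump_label_key: just a reference count here */
static int cfs_bandwidth_key;

static int cfs_bandwidth_used(void)
{
	return cfs_bandwidth_key > 0;	/* kernel: static_branch(&key) */
}

/* only count groups transitioning between enabled and !enabled */
static void account_cfs_bandwidth_used(int enabled, int was_enabled)
{
	if (enabled && !was_enabled)
		cfs_bandwidth_key++;	/* kernel: jump_label_inc() */
	else if (!enabled && was_enabled)
		cfs_bandwidth_key--;	/* kernel: jump_label_dec() */
}

int main(void)
{
	printf("used: %d\n", cfs_bandwidth_used());	/* 0: nobody throttles */

	account_cfs_bandwidth_used(1, 0);		/* group A sets a quota        */
	account_cfs_bandwidth_used(1, 1);		/* group A changes quota: no-op */
	printf("used: %d\n", cfs_bandwidth_used());	/* 1 */

	account_cfs_bandwidth_used(0, 1);		/* group A back to RUNTIME_INF */
	printf("used: %d\n", cfs_bandwidth_used());	/* 0 */
	return 0;
}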
1290/* 1426/*
1291 * default period for cfs group bandwidth. 1427 * default period for cfs group bandwidth.
1292 * default: 0.1s, units: nanoseconds 1428 * default: 0.1s, units: nanoseconds
@@ -1308,7 +1444,7 @@ static inline u64 sched_cfs_bandwidth_slice(void)
1308 * 1444 *
1309 * requires cfs_b->lock 1445 * requires cfs_b->lock
1310 */ 1446 */
1311static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) 1447void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1312{ 1448{
1313 u64 now; 1449 u64 now;
1314 1450
@@ -1320,6 +1456,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1320 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); 1456 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1321} 1457}
1322 1458
1459static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1460{
1461 return &tg->cfs_bandwidth;
1462}
1463
1323/* returns 0 on failure to allocate runtime */ 1464/* returns 0 on failure to allocate runtime */
1324static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1465static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1325{ 1466{
@@ -1421,7 +1562,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1421static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 1562static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1422 unsigned long delta_exec) 1563 unsigned long delta_exec)
1423{ 1564{
1424 if (!cfs_rq->runtime_enabled) 1565 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1425 return; 1566 return;
1426 1567
1427 __account_cfs_rq_runtime(cfs_rq, delta_exec); 1568 __account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1429,13 +1570,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1429 1570
1430static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 1571static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1431{ 1572{
1432 return cfs_rq->throttled; 1573 return cfs_bandwidth_used() && cfs_rq->throttled;
1433} 1574}
1434 1575
1435/* check whether cfs_rq, or any parent, is throttled */ 1576/* check whether cfs_rq, or any parent, is throttled */
1436static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) 1577static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1437{ 1578{
1438 return cfs_rq->throttle_count; 1579 return cfs_bandwidth_used() && cfs_rq->throttle_count;
1439} 1580}
1440 1581
1441/* 1582/*
@@ -1530,7 +1671,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1530 raw_spin_unlock(&cfs_b->lock); 1671 raw_spin_unlock(&cfs_b->lock);
1531} 1672}
1532 1673
1533static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) 1674void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1534{ 1675{
1535 struct rq *rq = rq_of(cfs_rq); 1676 struct rq *rq = rq_of(cfs_rq);
1536 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 1677 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
@@ -1756,6 +1897,9 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1756 1897
1757static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1898static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1758{ 1899{
1900 if (!cfs_bandwidth_used())
1901 return;
1902
1759 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) 1903 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1760 return; 1904 return;
1761 1905
@@ -1801,6 +1945,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1801 */ 1945 */
1802static void check_enqueue_throttle(struct cfs_rq *cfs_rq) 1946static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1803{ 1947{
1948 if (!cfs_bandwidth_used())
1949 return;
1950
1804 /* an active group must be handled by the update_curr()->put() path */ 1951 /* an active group must be handled by the update_curr()->put() path */
1805 if (!cfs_rq->runtime_enabled || cfs_rq->curr) 1952 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1806 return; 1953 return;
@@ -1818,6 +1965,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1818/* conditionally throttle active cfs_rq's from put_prev_entity() */ 1965/* conditionally throttle active cfs_rq's from put_prev_entity() */
1819static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1966static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1820{ 1967{
1968 if (!cfs_bandwidth_used())
1969 return;
1970
1821 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 1971 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1822 return; 1972 return;
1823 1973
@@ -1830,7 +1980,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1830 1980
1831 throttle_cfs_rq(cfs_rq); 1981 throttle_cfs_rq(cfs_rq);
1832} 1982}
1833#else 1983
1984static inline u64 default_cfs_period(void);
1985static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1986static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1987
1988static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1989{
1990 struct cfs_bandwidth *cfs_b =
1991 container_of(timer, struct cfs_bandwidth, slack_timer);
1992 do_sched_cfs_slack_timer(cfs_b);
1993
1994 return HRTIMER_NORESTART;
1995}
1996
1997static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
1998{
1999 struct cfs_bandwidth *cfs_b =
2000 container_of(timer, struct cfs_bandwidth, period_timer);
2001 ktime_t now;
2002 int overrun;
2003 int idle = 0;
2004
2005 for (;;) {
2006 now = hrtimer_cb_get_time(timer);
2007 overrun = hrtimer_forward(timer, now, cfs_b->period);
2008
2009 if (!overrun)
2010 break;
2011
2012 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2013 }
2014
2015 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2016}
2017
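sched_cfs_period_timer() follows the standard hrtimer idiom for periodic work: hrtimer_forward() pushes the expiry forward by whole periods and reports how many were skipped, and the handler loops, doing one bandwidth refill per overrun, until the expiry is back in the future. A userspace sketch of just the forwarding arithmetic, with invented timestamps and a 100 ms period (the CFS default); unlike the kernel loop, "now" is fixed here, so the loop ends after a single refill pass.

#include <stdio.h>
#include <stdint.h>

/* advance *expires by whole periods until it is in the future; return how
 * many periods were skipped (the "overrun"), like hrtimer_forward() */
static int forward(uint64_t *expires, uint64_t now, uint64_t period)
{
	int overrun = 0;

	while (*expires <= now) {
		*expires += period;
		overrun++;
	}
	return overrun;
}

int main(void)
{
	uint64_t period  = 100000000;	/* 100 ms in ns, the default cfs period */
	uint64_t expires = 1000000000;	/* invented: timer was armed for t = 1 s */
	uint64_t now     = 1350000000;	/* handler runs late, at t = 1.35 s      */
	int overrun;

	while ((overrun = forward(&expires, now, period)) != 0) {
		/* kernel: do_sched_cfs_period_timer(cfs_b, overrun) refills runtime */
		printf("refill for %d missed period(s), next expiry at %llu ns\n",
		       overrun, (unsigned long long)expires);
	}
	return 0;
}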
2018void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2019{
2020 raw_spin_lock_init(&cfs_b->lock);
2021 cfs_b->runtime = 0;
2022 cfs_b->quota = RUNTIME_INF;
2023 cfs_b->period = ns_to_ktime(default_cfs_period());
2024
2025 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2026 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2027 cfs_b->period_timer.function = sched_cfs_period_timer;
2028 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2029 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2030}
2031
2032static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2033{
2034 cfs_rq->runtime_enabled = 0;
2035 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2036}
2037
2038/* requires cfs_b->lock, may release to reprogram timer */
2039void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2040{
2041 /*
2042 * The timer may be active because we're trying to set a new bandwidth
2043 * period or because we're racing with the tear-down path
2044 * (timer_active==0 becomes visible before the hrtimer call-back
2045 * terminates). In either case we ensure that it's re-programmed
2046 */
2047 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2048 raw_spin_unlock(&cfs_b->lock);
2049 /* ensure cfs_b->lock is available while we wait */
2050 hrtimer_cancel(&cfs_b->period_timer);
2051
2052 raw_spin_lock(&cfs_b->lock);
2053 /* if someone else restarted the timer then we're done */
2054 if (cfs_b->timer_active)
2055 return;
2056 }
2057
2058 cfs_b->timer_active = 1;
2059 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2060}
2061
2062static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2063{
2064 hrtimer_cancel(&cfs_b->period_timer);
2065 hrtimer_cancel(&cfs_b->slack_timer);
2066}
2067
2068void unthrottle_offline_cfs_rqs(struct rq *rq)
2069{
2070 struct cfs_rq *cfs_rq;
2071
2072 for_each_leaf_cfs_rq(rq, cfs_rq) {
2073 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2074
2075 if (!cfs_rq->runtime_enabled)
2076 continue;
2077
2078 /*
2079 * clock_task is not advancing so we just need to make sure
2080 * there's some valid quota amount
2081 */
2082 cfs_rq->runtime_remaining = cfs_b->quota;
2083 if (cfs_rq_throttled(cfs_rq))
2084 unthrottle_cfs_rq(cfs_rq);
2085 }
2086}
2087
2088#else /* CONFIG_CFS_BANDWIDTH */
1834static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2089static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1835 unsigned long delta_exec) {} 2090 unsigned long delta_exec) {}
1836static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2091static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -1852,8 +2107,22 @@ static inline int throttled_lb_pair(struct task_group *tg,
1852{ 2107{
1853 return 0; 2108 return 0;
1854} 2109}
2110
2111void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2112
2113#ifdef CONFIG_FAIR_GROUP_SCHED
2114static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1855#endif 2115#endif
1856 2116
2117static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2118{
2119 return NULL;
2120}
2121static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2122void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2123
2124#endif /* CONFIG_CFS_BANDWIDTH */
2125
1857/************************************************** 2126/**************************************************
1858 * CFS operations on tasks: 2127 * CFS operations on tasks:
1859 */ 2128 */
@@ -1866,7 +2135,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
1866 2135
1867 WARN_ON(task_rq(p) != rq); 2136 WARN_ON(task_rq(p) != rq);
1868 2137
1869 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { 2138 if (cfs_rq->nr_running > 1) {
1870 u64 slice = sched_slice(cfs_rq, se); 2139 u64 slice = sched_slice(cfs_rq, se);
1871 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 2140 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1872 s64 delta = slice - ran; 2141 s64 delta = slice - ran;
@@ -1897,7 +2166,7 @@ static void hrtick_update(struct rq *rq)
1897{ 2166{
1898 struct task_struct *curr = rq->curr; 2167 struct task_struct *curr = rq->curr;
1899 2168
1900 if (curr->sched_class != &fair_sched_class) 2169 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
1901 return; 2170 return;
1902 2171
1903 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) 2172 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
@@ -2020,6 +2289,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2020} 2289}
2021 2290
2022#ifdef CONFIG_SMP 2291#ifdef CONFIG_SMP
2292/* Used instead of source_load when we know the type == 0 */
2293static unsigned long weighted_cpuload(const int cpu)
2294{
2295 return cpu_rq(cpu)->load.weight;
2296}
2297
2298/*
2299 * Return a low guess at the load of a migration-source cpu weighted
2300 * according to the scheduling class and "nice" value.
2301 *
2302 * We want to under-estimate the load of migration sources, to
2303 * balance conservatively.
2304 */
2305static unsigned long source_load(int cpu, int type)
2306{
2307 struct rq *rq = cpu_rq(cpu);
2308 unsigned long total = weighted_cpuload(cpu);
2309
2310 if (type == 0 || !sched_feat(LB_BIAS))
2311 return total;
2312
2313 return min(rq->cpu_load[type-1], total);
2314}
2315
2316/*
2317 * Return a high guess at the load of a migration-target cpu weighted
2318 * according to the scheduling class and "nice" value.
2319 */
2320static unsigned long target_load(int cpu, int type)
2321{
2322 struct rq *rq = cpu_rq(cpu);
2323 unsigned long total = weighted_cpuload(cpu);
2324
2325 if (type == 0 || !sched_feat(LB_BIAS))
2326 return total;
2327
2328 return max(rq->cpu_load[type-1], total);
2329}
2330
2331static unsigned long power_of(int cpu)
2332{
2333 return cpu_rq(cpu)->cpu_power;
2334}
2335
2336static unsigned long cpu_avg_load_per_task(int cpu)
2337{
2338 struct rq *rq = cpu_rq(cpu);
2339 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2340
2341 if (nr_running)
2342 return rq->load.weight / nr_running;
2343
2344 return 0;
2345}
2346
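The relocated load helpers bias balancing decisions deliberately: source_load() returns the smaller of the decayed cpu_load history and the instantaneous runqueue weight (a low guess for the CPU we might pull from), while target_load() returns the larger (a high guess for the CPU we might push to), so marginal migrations are avoided. A small userspace sketch of the min/max biasing with made-up load figures:

#include <stdio.h>

struct fake_rq {
	unsigned long load_weight;	/* instantaneous weighted_cpuload() */
	unsigned long cpu_load[5];	/* decayed history, index = load_idx */
};

static unsigned long source_load(const struct fake_rq *rq, int type)
{
	if (type == 0)
		return rq->load_weight;
	return rq->cpu_load[type - 1] < rq->load_weight ?
	       rq->cpu_load[type - 1] : rq->load_weight;	/* low guess */
}

static unsigned long target_load(const struct fake_rq *rq, int type)
{
	if (type == 0)
		return rq->load_weight;
	return rq->cpu_load[type - 1] > rq->load_weight ?
	       rq->cpu_load[type - 1] : rq->load_weight;	/* high guess */
}

int main(void)
{
	/* invented: instantaneous load (3072) above the decayed history (2048) */
	struct fake_rq rq = {
		.load_weight = 3072,
		.cpu_load = { 2048, 2048, 2048, 2048, 2048 },
	};

	printf("as migration source: %lu\n", source_load(&rq, 1));	/* 2048 */
	printf("as migration target: %lu\n", target_load(&rq, 1));	/* 3072 */
	return 0;
}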
2023 2347
2024static void task_waking_fair(struct task_struct *p) 2348static void task_waking_fair(struct task_struct *p)
2025{ 2349{
@@ -2327,7 +2651,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
2327 int prev_cpu = task_cpu(p); 2651 int prev_cpu = task_cpu(p);
2328 struct sched_domain *sd; 2652 struct sched_domain *sd;
2329 struct sched_group *sg; 2653 struct sched_group *sg;
2330 int i, smt = 0; 2654 int i;
2331 2655
2332 /* 2656 /*
2333 * If the task is going to be woken-up on this cpu and if it is 2657 * If the task is going to be woken-up on this cpu and if it is
@@ -2347,17 +2671,9 @@ static int select_idle_sibling(struct task_struct *p, int target)
2347 * Otherwise, iterate the domains and find an elegible idle cpu. 2671 * Otherwise, iterate the domains and find an elegible idle cpu.
2348 */ 2672 */
2349 rcu_read_lock(); 2673 rcu_read_lock();
2350again:
2351 for_each_domain(target, sd) {
2352 if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
2353 continue;
2354
2355 if (smt && !(sd->flags & SD_SHARE_CPUPOWER))
2356 break;
2357
2358 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
2359 break;
2360 2674
2675 sd = rcu_dereference(per_cpu(sd_llc, target));
2676 for_each_lower_domain(sd) {
2361 sg = sd->groups; 2677 sg = sd->groups;
2362 do { 2678 do {
2363 if (!cpumask_intersects(sched_group_cpus(sg), 2679 if (!cpumask_intersects(sched_group_cpus(sg),
@@ -2376,10 +2692,6 @@ next:
2376 sg = sg->next; 2692 sg = sg->next;
2377 } while (sg != sd->groups); 2693 } while (sg != sd->groups);
2378 } 2694 }
2379 if (!smt) {
2380 smt = 1;
2381 goto again;
2382 }
2383done: 2695done:
2384 rcu_read_unlock(); 2696 rcu_read_unlock();
2385 2697
@@ -2408,6 +2720,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2408 int want_sd = 1; 2720 int want_sd = 1;
2409 int sync = wake_flags & WF_SYNC; 2721 int sync = wake_flags & WF_SYNC;
2410 2722
2723 if (p->rt.nr_cpus_allowed == 1)
2724 return prev_cpu;
2725
2411 if (sd_flag & SD_BALANCE_WAKE) { 2726 if (sd_flag & SD_BALANCE_WAKE) {
2412 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) 2727 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
2413 want_affine = 1; 2728 want_affine = 1;
@@ -2692,7 +3007,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
2692 } while (cfs_rq); 3007 } while (cfs_rq);
2693 3008
2694 p = task_of(se); 3009 p = task_of(se);
2695 hrtick_start_fair(rq, p); 3010 if (hrtick_enabled(rq))
3011 hrtick_start_fair(rq, p);
2696 3012
2697 return p; 3013 return p;
2698} 3014}
@@ -2736,6 +3052,12 @@ static void yield_task_fair(struct rq *rq)
2736 * Update run-time statistics of the 'current'. 3052 * Update run-time statistics of the 'current'.
2737 */ 3053 */
2738 update_curr(cfs_rq); 3054 update_curr(cfs_rq);
3055 /*
3056 * Tell update_rq_clock() that we've just updated,
3057 * so we don't do microscopic update in schedule()
3058 * and double the fastpath cost.
3059 */
3060 rq->skip_clock_update = 1;
2739 } 3061 }
2740 3062
2741 set_skip_buddy(se); 3063 set_skip_buddy(se);
@@ -2776,12 +3098,48 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2776} 3098}
2777 3099
2778/* 3100/*
3101 * Is this task likely cache-hot:
3102 */
3103static int
3104task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3105{
3106 s64 delta;
3107
3108 if (p->sched_class != &fair_sched_class)
3109 return 0;
3110
3111 if (unlikely(p->policy == SCHED_IDLE))
3112 return 0;
3113
3114 /*
3115 * Buddy candidates are cache hot:
3116 */
3117 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3118 (&p->se == cfs_rq_of(&p->se)->next ||
3119 &p->se == cfs_rq_of(&p->se)->last))
3120 return 1;
3121
3122 if (sysctl_sched_migration_cost == -1)
3123 return 1;
3124 if (sysctl_sched_migration_cost == 0)
3125 return 0;
3126
3127 delta = now - p->se.exec_start;
3128
3129 return delta < (s64)sysctl_sched_migration_cost;
3130}
3131
3132#define LBF_ALL_PINNED 0x01
3133#define LBF_NEED_BREAK 0x02
3134#define LBF_ABORT 0x04
3135
3136/*
2779 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3137 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2780 */ 3138 */
2781static 3139static
2782int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3140int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2783 struct sched_domain *sd, enum cpu_idle_type idle, 3141 struct sched_domain *sd, enum cpu_idle_type idle,
2784 int *all_pinned) 3142 int *lb_flags)
2785{ 3143{
2786 int tsk_cache_hot = 0; 3144 int tsk_cache_hot = 0;
2787 /* 3145 /*
@@ -2794,7 +3152,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2794 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3152 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2795 return 0; 3153 return 0;
2796 } 3154 }
2797 *all_pinned = 0; 3155 *lb_flags &= ~LBF_ALL_PINNED;
2798 3156
2799 if (task_running(rq, p)) { 3157 if (task_running(rq, p)) {
2800 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 3158 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
@@ -2868,7 +3226,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2868static unsigned long 3226static unsigned long
2869balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3227balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2870 unsigned long max_load_move, struct sched_domain *sd, 3228 unsigned long max_load_move, struct sched_domain *sd,
2871 enum cpu_idle_type idle, int *all_pinned, 3229 enum cpu_idle_type idle, int *lb_flags,
2872 struct cfs_rq *busiest_cfs_rq) 3230 struct cfs_rq *busiest_cfs_rq)
2873{ 3231{
2874 int loops = 0, pulled = 0; 3232 int loops = 0, pulled = 0;
@@ -2879,12 +3237,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2879 goto out; 3237 goto out;
2880 3238
2881 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 3239 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2882 if (loops++ > sysctl_sched_nr_migrate) 3240 if (loops++ > sysctl_sched_nr_migrate) {
3241 *lb_flags |= LBF_NEED_BREAK;
2883 break; 3242 break;
3243 }
2884 3244
2885 if ((p->se.load.weight >> 1) > rem_load_move || 3245 if ((p->se.load.weight >> 1) > rem_load_move ||
2886 !can_migrate_task(p, busiest, this_cpu, sd, idle, 3246 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2887 all_pinned)) 3247 lb_flags))
2888 continue; 3248 continue;
2889 3249
2890 pull_task(busiest, p, this_rq, this_cpu); 3250 pull_task(busiest, p, this_rq, this_cpu);
@@ -2897,8 +3257,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2897 * kernels will stop after the first task is pulled to minimize 3257 * kernels will stop after the first task is pulled to minimize
2898 * the critical section. 3258 * the critical section.
2899 */ 3259 */
2900 if (idle == CPU_NEWLY_IDLE) 3260 if (idle == CPU_NEWLY_IDLE) {
3261 *lb_flags |= LBF_ABORT;
2901 break; 3262 break;
3263 }
2902#endif 3264#endif
2903 3265
2904 /* 3266 /*
@@ -3003,7 +3365,7 @@ static unsigned long
3003load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3365load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3004 unsigned long max_load_move, 3366 unsigned long max_load_move,
3005 struct sched_domain *sd, enum cpu_idle_type idle, 3367 struct sched_domain *sd, enum cpu_idle_type idle,
3006 int *all_pinned) 3368 int *lb_flags)
3007{ 3369{
3008 long rem_load_move = max_load_move; 3370 long rem_load_move = max_load_move;
3009 struct cfs_rq *busiest_cfs_rq; 3371 struct cfs_rq *busiest_cfs_rq;
@@ -3016,6 +3378,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3016 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 3378 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
3017 u64 rem_load, moved_load; 3379 u64 rem_load, moved_load;
3018 3380
3381 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3382 break;
3383
3019 /* 3384 /*
3020 * empty group or part of a throttled hierarchy 3385 * empty group or part of a throttled hierarchy
3021 */ 3386 */
@@ -3027,7 +3392,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3027 rem_load = div_u64(rem_load, busiest_h_load + 1); 3392 rem_load = div_u64(rem_load, busiest_h_load + 1);
3028 3393
3029 moved_load = balance_tasks(this_rq, this_cpu, busiest, 3394 moved_load = balance_tasks(this_rq, this_cpu, busiest,
3030 rem_load, sd, idle, all_pinned, 3395 rem_load, sd, idle, lb_flags,
3031 busiest_cfs_rq); 3396 busiest_cfs_rq);
3032 3397
3033 if (!moved_load) 3398 if (!moved_load)
@@ -3053,10 +3418,10 @@ static unsigned long
3053load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3418load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3054 unsigned long max_load_move, 3419 unsigned long max_load_move,
3055 struct sched_domain *sd, enum cpu_idle_type idle, 3420 struct sched_domain *sd, enum cpu_idle_type idle,
3056 int *all_pinned) 3421 int *lb_flags)
3057{ 3422{
3058 return balance_tasks(this_rq, this_cpu, busiest, 3423 return balance_tasks(this_rq, this_cpu, busiest,
3059 max_load_move, sd, idle, all_pinned, 3424 max_load_move, sd, idle, lb_flags,
3060 &busiest->cfs); 3425 &busiest->cfs);
3061} 3426}
3062#endif 3427#endif
@@ -3071,29 +3436,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3071static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3436static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3072 unsigned long max_load_move, 3437 unsigned long max_load_move,
3073 struct sched_domain *sd, enum cpu_idle_type idle, 3438 struct sched_domain *sd, enum cpu_idle_type idle,
3074 int *all_pinned) 3439 int *lb_flags)
3075{ 3440{
3076 unsigned long total_load_moved = 0, load_moved; 3441 unsigned long total_load_moved = 0, load_moved;
3077 3442
3078 do { 3443 do {
3079 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 3444 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
3080 max_load_move - total_load_moved, 3445 max_load_move - total_load_moved,
3081 sd, idle, all_pinned); 3446 sd, idle, lb_flags);
3082 3447
3083 total_load_moved += load_moved; 3448 total_load_moved += load_moved;
3084 3449
3450 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3451 break;
3452
3085#ifdef CONFIG_PREEMPT 3453#ifdef CONFIG_PREEMPT
3086 /* 3454 /*
3087 * NEWIDLE balancing is a source of latency, so preemptible 3455 * NEWIDLE balancing is a source of latency, so preemptible
3088 * kernels will stop after the first task is pulled to minimize 3456 * kernels will stop after the first task is pulled to minimize
3089 * the critical section. 3457 * the critical section.
3090 */ 3458 */
3091 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3459 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
3092 break; 3460 *lb_flags |= LBF_ABORT;
3093
3094 if (raw_spin_is_contended(&this_rq->lock) ||
3095 raw_spin_is_contended(&busiest->lock))
3096 break; 3461 break;
3462 }
3097#endif 3463#endif
3098 } while (load_moved && max_load_move > total_load_moved); 3464 } while (load_moved && max_load_move > total_load_moved);
3099 3465
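The all_pinned flag becomes one bit in a small lb_flags bitmask so the balance loop can also report "need a break" and "abort" back to load_balance(). The LBF_* constants are defined elsewhere in kernel/sched/fair.c and are not shown in this hunk; the values below are assumptions used only to illustrate the bookkeeping pattern in a standalone sketch.

#include <stdio.h>

/* Assumed values -- the authoritative definitions live in kernel/sched/fair.c. */
#define LBF_ALL_PINNED  0x01    /* no candidate task could leave the busiest cpu */
#define LBF_NEED_BREAK  0x02    /* sysctl_sched_nr_migrate hit; retry after a lock break */
#define LBF_ABORT       0x04    /* newly-idle balance must stop immediately */

int main(void)
{
        int lb_flags = 0;

        lb_flags |= LBF_ALL_PINNED;     /* assume the worst before scanning tasks */
        lb_flags &= ~LBF_ALL_PINNED;    /* cleared once a migratable task is found */
        lb_flags |= LBF_NEED_BREAK;     /* loop limit reached in balance_tasks() */

        if (lb_flags & (LBF_NEED_BREAK | LBF_ABORT))
                printf("balance pass interrupted, lb_flags=%#x\n", lb_flags);
        return 0;
}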
@@ -3155,15 +3521,6 @@ struct sg_lb_stats {
3155}; 3521};
3156 3522
3157/** 3523/**
3158 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3159 * @group: The group whose first cpu is to be returned.
3160 */
3161static inline unsigned int group_first_cpu(struct sched_group *group)
3162{
3163 return cpumask_first(sched_group_cpus(group));
3164}
3165
3166/**
3167 * get_sd_load_idx - Obtain the load index for a given sched domain. 3524 * get_sd_load_idx - Obtain the load index for a given sched domain.
3168 * @sd: The sched_domain whose load_idx is to be obtained. 3525 * @sd: The sched_domain whose load_idx is to be obtained.
3169 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 3526 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
@@ -3412,7 +3769,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
3412 sdg->sgp->power = power; 3769 sdg->sgp->power = power;
3413} 3770}
3414 3771
3415static void update_group_power(struct sched_domain *sd, int cpu) 3772void update_group_power(struct sched_domain *sd, int cpu)
3416{ 3773{
3417 struct sched_domain *child = sd->child; 3774 struct sched_domain *child = sd->child;
3418 struct sched_group *group, *sdg = sd->groups; 3775 struct sched_group *group, *sdg = sd->groups;
@@ -3678,11 +4035,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3678 } while (sg != sd->groups); 4035 } while (sg != sd->groups);
3679} 4036}
3680 4037
3681int __weak arch_sd_sibling_asym_packing(void)
3682{
3683 return 0*SD_ASYM_PACKING;
3684}
3685
3686/** 4038/**
3687 * check_asym_packing - Check to see if the group is packed into the 4039 * check_asym_packing - Check to see if the group is packed into the
3688 * sched doman. 4040 * sched doman.
@@ -4046,7 +4398,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4046#define MAX_PINNED_INTERVAL 512 4398#define MAX_PINNED_INTERVAL 512
4047 4399
4048/* Working cpumask for load_balance and load_balance_newidle. */ 4400/* Working cpumask for load_balance and load_balance_newidle. */
4049static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4401DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4050 4402
4051static int need_active_balance(struct sched_domain *sd, int idle, 4403static int need_active_balance(struct sched_domain *sd, int idle,
4052 int busiest_cpu, int this_cpu) 4404 int busiest_cpu, int this_cpu)
@@ -4097,7 +4449,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4097 struct sched_domain *sd, enum cpu_idle_type idle, 4449 struct sched_domain *sd, enum cpu_idle_type idle,
4098 int *balance) 4450 int *balance)
4099{ 4451{
4100 int ld_moved, all_pinned = 0, active_balance = 0; 4452 int ld_moved, lb_flags = 0, active_balance = 0;
4101 struct sched_group *group; 4453 struct sched_group *group;
4102 unsigned long imbalance; 4454 unsigned long imbalance;
4103 struct rq *busiest; 4455 struct rq *busiest;
@@ -4138,11 +4490,11 @@ redo:
4138 * still unbalanced. ld_moved simply stays zero, so it is 4490 * still unbalanced. ld_moved simply stays zero, so it is
4139 * correctly treated as an imbalance. 4491 * correctly treated as an imbalance.
4140 */ 4492 */
4141 all_pinned = 1; 4493 lb_flags |= LBF_ALL_PINNED;
4142 local_irq_save(flags); 4494 local_irq_save(flags);
4143 double_rq_lock(this_rq, busiest); 4495 double_rq_lock(this_rq, busiest);
4144 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4496 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4145 imbalance, sd, idle, &all_pinned); 4497 imbalance, sd, idle, &lb_flags);
4146 double_rq_unlock(this_rq, busiest); 4498 double_rq_unlock(this_rq, busiest);
4147 local_irq_restore(flags); 4499 local_irq_restore(flags);
4148 4500
@@ -4152,8 +4504,16 @@ redo:
4152 if (ld_moved && this_cpu != smp_processor_id()) 4504 if (ld_moved && this_cpu != smp_processor_id())
4153 resched_cpu(this_cpu); 4505 resched_cpu(this_cpu);
4154 4506
4507 if (lb_flags & LBF_ABORT)
4508 goto out_balanced;
4509
4510 if (lb_flags & LBF_NEED_BREAK) {
4511 lb_flags &= ~LBF_NEED_BREAK;
4512 goto redo;
4513 }
4514
4155 /* All tasks on this runqueue were pinned by CPU affinity */ 4515 /* All tasks on this runqueue were pinned by CPU affinity */
4156 if (unlikely(all_pinned)) { 4516 if (unlikely(lb_flags & LBF_ALL_PINNED)) {
4157 cpumask_clear_cpu(cpu_of(busiest), cpus); 4517 cpumask_clear_cpu(cpu_of(busiest), cpus);
4158 if (!cpumask_empty(cpus)) 4518 if (!cpumask_empty(cpus))
4159 goto redo; 4519 goto redo;
@@ -4183,7 +4543,7 @@ redo:
4183 tsk_cpus_allowed(busiest->curr))) { 4543 tsk_cpus_allowed(busiest->curr))) {
4184 raw_spin_unlock_irqrestore(&busiest->lock, 4544 raw_spin_unlock_irqrestore(&busiest->lock,
4185 flags); 4545 flags);
4186 all_pinned = 1; 4546 lb_flags |= LBF_ALL_PINNED;
4187 goto out_one_pinned; 4547 goto out_one_pinned;
4188 } 4548 }
4189 4549
@@ -4236,7 +4596,8 @@ out_balanced:
4236 4596
4237out_one_pinned: 4597out_one_pinned:
4238 /* tune up the balancing interval */ 4598 /* tune up the balancing interval */
4239 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 4599 if (((lb_flags & LBF_ALL_PINNED) &&
4600 sd->balance_interval < MAX_PINNED_INTERVAL) ||
4240 (sd->balance_interval < sd->max_interval)) 4601 (sd->balance_interval < sd->max_interval))
4241 sd->balance_interval *= 2; 4602 sd->balance_interval *= 2;
4242 4603
@@ -4249,7 +4610,7 @@ out:
4249 * idle_balance is called by schedule() if this_cpu is about to become 4610 * idle_balance is called by schedule() if this_cpu is about to become
4250 * idle. Attempts to pull tasks from other CPUs. 4611 * idle. Attempts to pull tasks from other CPUs.
4251 */ 4612 */
4252static void idle_balance(int this_cpu, struct rq *this_rq) 4613void idle_balance(int this_cpu, struct rq *this_rq)
4253{ 4614{
4254 struct sched_domain *sd; 4615 struct sched_domain *sd;
4255 int pulled_task = 0; 4616 int pulled_task = 0;
@@ -4364,28 +4725,16 @@ out_unlock:
4364#ifdef CONFIG_NO_HZ 4725#ifdef CONFIG_NO_HZ
4365/* 4726/*
4366 * idle load balancing details 4727 * idle load balancing details
4367 * - One of the idle CPUs nominates itself as idle load_balancer, while
4368 * entering idle.
4369 * - This idle load balancer CPU will also go into tickless mode when
4370 * it is idle, just like all other idle CPUs
4371 * - When one of the busy CPUs notices that there may be an idle rebalancing 4728 * - When one of the busy CPUs notices that there may be an idle rebalancing
4372 * needed, it will kick the idle load balancer, which then does idle 4729 * needed, it will kick the idle load balancer, which then does idle
4373 * load balancing for all the idle CPUs. 4730 * load balancing for all the idle CPUs.
4374 */ 4731 */
4375static struct { 4732static struct {
4376 atomic_t load_balancer;
4377 atomic_t first_pick_cpu;
4378 atomic_t second_pick_cpu;
4379 cpumask_var_t idle_cpus_mask; 4733 cpumask_var_t idle_cpus_mask;
4380 cpumask_var_t grp_idle_mask; 4734 atomic_t nr_cpus;
4381 unsigned long next_balance; /* in jiffy units */ 4735 unsigned long next_balance; /* in jiffy units */
4382} nohz ____cacheline_aligned; 4736} nohz ____cacheline_aligned;
4383 4737
4384int get_nohz_load_balancer(void)
4385{
4386 return atomic_read(&nohz.load_balancer);
4387}
4388
4389#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4738#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4390/** 4739/**
4391 * lowest_flag_domain - Return lowest sched_domain containing flag. 4740 * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -4422,33 +4771,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4422 (sd && (sd->flags & flag)); sd = sd->parent) 4771 (sd && (sd->flags & flag)); sd = sd->parent)
4423 4772
4424/** 4773/**
4425 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4426 * @ilb_group: group to be checked for semi-idleness
4427 *
4428 * Returns: 1 if the group is semi-idle. 0 otherwise.
4429 *
4430 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4431 * and atleast one non-idle CPU. This helper function checks if the given
4432 * sched_group is semi-idle or not.
4433 */
4434static inline int is_semi_idle_group(struct sched_group *ilb_group)
4435{
4436 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
4437 sched_group_cpus(ilb_group));
4438
4439 /*
4440 * A sched_group is semi-idle when it has atleast one busy cpu
4441 * and atleast one idle cpu.
4442 */
4443 if (cpumask_empty(nohz.grp_idle_mask))
4444 return 0;
4445
4446 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
4447 return 0;
4448
4449 return 1;
4450}
4451/**
4452 * find_new_ilb - Finds the optimum idle load balancer for nomination. 4774 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4453 * @cpu: The cpu which is nominating a new idle_load_balancer. 4775 * @cpu: The cpu which is nominating a new idle_load_balancer.
4454 * 4776 *
@@ -4462,9 +4784,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
4462 */ 4784 */
4463static int find_new_ilb(int cpu) 4785static int find_new_ilb(int cpu)
4464{ 4786{
4787 int ilb = cpumask_first(nohz.idle_cpus_mask);
4788 struct sched_group *ilbg;
4465 struct sched_domain *sd; 4789 struct sched_domain *sd;
4466 struct sched_group *ilb_group;
4467 int ilb = nr_cpu_ids;
4468 4790
4469 /* 4791 /*
4470 * Have idle load balancer selection from semi-idle packages only 4792 * Have idle load balancer selection from semi-idle packages only
@@ -4482,23 +4804,28 @@ static int find_new_ilb(int cpu)
4482 4804
4483 rcu_read_lock(); 4805 rcu_read_lock();
4484 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 4806 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4485 ilb_group = sd->groups; 4807 ilbg = sd->groups;
4486 4808
4487 do { 4809 do {
4488 if (is_semi_idle_group(ilb_group)) { 4810 if (ilbg->group_weight !=
4489 ilb = cpumask_first(nohz.grp_idle_mask); 4811 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4812 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4813 sched_group_cpus(ilbg));
4490 goto unlock; 4814 goto unlock;
4491 } 4815 }
4492 4816
4493 ilb_group = ilb_group->next; 4817 ilbg = ilbg->next;
4494 4818
4495 } while (ilb_group != sd->groups); 4819 } while (ilbg != sd->groups);
4496 } 4820 }
4497unlock: 4821unlock:
4498 rcu_read_unlock(); 4822 rcu_read_unlock();
4499 4823
4500out_done: 4824out_done:
4501 return ilb; 4825 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4826 return ilb;
4827
4828 return nr_cpu_ids;
4502} 4829}
4503#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 4830#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4504static inline int find_new_ilb(int call_cpu) 4831static inline int find_new_ilb(int call_cpu)
@@ -4518,99 +4845,68 @@ static void nohz_balancer_kick(int cpu)
4518 4845
4519 nohz.next_balance++; 4846 nohz.next_balance++;
4520 4847
4521 ilb_cpu = get_nohz_load_balancer(); 4848 ilb_cpu = find_new_ilb(cpu);
4522
4523 if (ilb_cpu >= nr_cpu_ids) {
4524 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
4525 if (ilb_cpu >= nr_cpu_ids)
4526 return;
4527 }
4528 4849
4529 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4850 if (ilb_cpu >= nr_cpu_ids)
4530 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4851 return;
4531 4852
4532 smp_mb(); 4853 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
4533 /* 4854 return;
4534 * Use smp_send_reschedule() instead of resched_cpu(). 4855 /*
4535 * This way we generate a sched IPI on the target cpu which 4856 * Use smp_send_reschedule() instead of resched_cpu().
4536 * is idle. And the softirq performing nohz idle load balance 4857 * This way we generate a sched IPI on the target cpu which
4537 * will be run before returning from the IPI. 4858 * is idle. And the softirq performing nohz idle load balance
4538 */ 4859 * will be run before returning from the IPI.
4539 smp_send_reschedule(ilb_cpu); 4860 */
4540 } 4861 smp_send_reschedule(ilb_cpu);
4541 return; 4862 return;
4542} 4863}
4543 4864
4544/* 4865static inline void set_cpu_sd_state_busy(void)
4545 * This routine will try to nominate the ilb (idle load balancing)
4546 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4547 * load balancing on behalf of all those cpus.
4548 *
4549 * When the ilb owner becomes busy, we will not have new ilb owner until some
4550 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
4551 * idle load balancing by kicking one of the idle CPUs.
4552 *
4553 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
4554 * ilb owner CPU in future (when there is a need for idle load balancing on
4555 * behalf of all idle CPUs).
4556 */
4557void select_nohz_load_balancer(int stop_tick)
4558{ 4866{
4867 struct sched_domain *sd;
4559 int cpu = smp_processor_id(); 4868 int cpu = smp_processor_id();
4560 4869
4561 if (stop_tick) { 4870 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4562 if (!cpu_active(cpu)) { 4871 return;
4563 if (atomic_read(&nohz.load_balancer) != cpu) 4872 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
4564 return;
4565
4566 /*
4567 * If we are going offline and still the leader,
4568 * give up!
4569 */
4570 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4571 nr_cpu_ids) != cpu)
4572 BUG();
4573 4873
4574 return; 4874 rcu_read_lock();
4575 } 4875 for_each_domain(cpu, sd)
4876 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
4877 rcu_read_unlock();
4878}
4576 4879
4577 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 4880void set_cpu_sd_state_idle(void)
4881{
4882 struct sched_domain *sd;
4883 int cpu = smp_processor_id();
4578 4884
4579 if (atomic_read(&nohz.first_pick_cpu) == cpu) 4885 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4580 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); 4886 return;
4581 if (atomic_read(&nohz.second_pick_cpu) == cpu) 4887 set_bit(NOHZ_IDLE, nohz_flags(cpu));
4582 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
4583 4888
4584 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { 4889 rcu_read_lock();
4585 int new_ilb; 4890 for_each_domain(cpu, sd)
4891 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
4892 rcu_read_unlock();
4893}
4586 4894
4587 /* make me the ilb owner */ 4895/*
4588 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, 4896 * This routine will record that this cpu is going idle with tick stopped.
4589 cpu) != nr_cpu_ids) 4897 * This info will be used in performing idle load balancing in the future.
4590 return; 4898 */
4899void select_nohz_load_balancer(int stop_tick)
4900{
4901 int cpu = smp_processor_id();
4591 4902
4592 /* 4903 if (stop_tick) {
4593 * Check to see if there is a more power-efficient 4904 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4594 * ilb.
4595 */
4596 new_ilb = find_new_ilb(cpu);
4597 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4598 atomic_set(&nohz.load_balancer, nr_cpu_ids);
4599 resched_cpu(new_ilb);
4600 return;
4601 }
4602 return;
4603 }
4604 } else {
4605 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
4606 return; 4905 return;
4607 4906
4608 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 4907 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4609 4908 atomic_inc(&nohz.nr_cpus);
4610 if (atomic_read(&nohz.load_balancer) == cpu) 4909 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4611 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4612 nr_cpu_ids) != cpu)
4613 BUG();
4614 } 4910 }
4615 return; 4911 return;
4616} 4912}
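The rewritten nohz code keys everything off per-cpu bits in rq->nohz_flags (the field itself appears in the new kernel/sched/sched.h later in this diff). The bit names used above are declared in that header rather than in this hunk; roughly, the assumed shape is:

/* Assumed shape of the per-cpu nohz state consumed above; the authoritative
 * declarations are in kernel/sched/sched.h, not in this hunk. */
enum {
        NOHZ_TICK_STOPPED,      /* this cpu went into tickless idle */
        NOHZ_BALANCE_KICK,      /* this cpu was asked to run idle load balancing */
        NOHZ_IDLE,              /* nr_busy_cpus accounting currently counts it idle */
};

#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)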
@@ -4624,7 +4920,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4624 * Scale the max load_balance interval with the number of CPUs in the system. 4920 * Scale the max load_balance interval with the number of CPUs in the system.
4625 * This trades load-balance latency on larger machines for less cross talk. 4921 * This trades load-balance latency on larger machines for less cross talk.
4626 */ 4922 */
4627static void update_max_interval(void) 4923void update_max_interval(void)
4628{ 4924{
4629 max_load_balance_interval = HZ*num_online_cpus()/10; 4925 max_load_balance_interval = HZ*num_online_cpus()/10;
4630} 4926}
@@ -4716,11 +5012,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4716 struct rq *rq; 5012 struct rq *rq;
4717 int balance_cpu; 5013 int balance_cpu;
4718 5014
4719 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) 5015 if (idle != CPU_IDLE ||
4720 return; 5016 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5017 goto end;
4721 5018
4722 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { 5019 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
4723 if (balance_cpu == this_cpu) 5020 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
4724 continue; 5021 continue;
4725 5022
4726 /* 5023 /*
@@ -4728,10 +5025,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4728 * work being done for other cpus. Next load 5025 * work being done for other cpus. Next load
4729 * balancing owner will pick it up. 5026 * balancing owner will pick it up.
4730 */ 5027 */
4731 if (need_resched()) { 5028 if (need_resched())
4732 this_rq->nohz_balance_kick = 0;
4733 break; 5029 break;
4734 }
4735 5030
4736 raw_spin_lock_irq(&this_rq->lock); 5031 raw_spin_lock_irq(&this_rq->lock);
4737 update_rq_clock(this_rq); 5032 update_rq_clock(this_rq);
@@ -4745,53 +5040,75 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4745 this_rq->next_balance = rq->next_balance; 5040 this_rq->next_balance = rq->next_balance;
4746 } 5041 }
4747 nohz.next_balance = this_rq->next_balance; 5042 nohz.next_balance = this_rq->next_balance;
4748 this_rq->nohz_balance_kick = 0; 5043end:
5044 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
4749} 5045}
4750 5046
4751/* 5047/*
4752 * Current heuristic for kicking the idle load balancer 5048 * Current heuristic for kicking the idle load balancer in the presence
4753 * - first_pick_cpu is the one of the busy CPUs. It will kick 5049 * of an idle cpu in the system.
4754 * idle load balancer when it has more than one process active. This 5050 * - This rq has more than one task.
4755 * eliminates the need for idle load balancing altogether when we have 5051 * - At any scheduler domain level, this cpu's scheduler group has multiple
4756 * only one running process in the system (common case). 5052 * busy cpus exceeding the group's power.
4757 * - If there are more than one busy CPU, idle load balancer may have 5053 * - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
4758 * to run for active_load_balance to happen (i.e., two busy CPUs are 5054 * domain span are idle.
4759 * SMT or core siblings and can run better if they move to different
4760 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
4761 * which will kick idle load balancer as soon as it has any load.
4762 */ 5055 */
4763static inline int nohz_kick_needed(struct rq *rq, int cpu) 5056static inline int nohz_kick_needed(struct rq *rq, int cpu)
4764{ 5057{
4765 unsigned long now = jiffies; 5058 unsigned long now = jiffies;
4766 int ret; 5059 struct sched_domain *sd;
4767 int first_pick_cpu, second_pick_cpu;
4768 5060
4769 if (time_before(now, nohz.next_balance)) 5061 if (unlikely(idle_cpu(cpu)))
4770 return 0; 5062 return 0;
4771 5063
4772 if (idle_cpu(cpu)) 5064 /*
4773 return 0; 5065 * We may be recently in ticked or tickless idle mode. At the first
5066 * busy tick after returning from idle, we will update the busy stats.
5067 */
5068 set_cpu_sd_state_busy();
5069 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5070 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5071 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5072 atomic_dec(&nohz.nr_cpus);
5073 }
4774 5074
4775 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 5075 /*
4776 second_pick_cpu = atomic_read(&nohz.second_pick_cpu); 5076 * None are in tickless mode and hence no need for NOHZ idle load
5077 * balancing.
5078 */
5079 if (likely(!atomic_read(&nohz.nr_cpus)))
5080 return 0;
4777 5081
4778 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && 5082 if (time_before(now, nohz.next_balance))
4779 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
4780 return 0; 5083 return 0;
4781 5084
4782 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); 5085 if (rq->nr_running >= 2)
4783 if (ret == nr_cpu_ids || ret == cpu) { 5086 goto need_kick;
4784 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); 5087
4785 if (rq->nr_running > 1) 5088 rcu_read_lock();
4786 return 1; 5089 for_each_domain(cpu, sd) {
4787 } else { 5090 struct sched_group *sg = sd->groups;
4788 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); 5091 struct sched_group_power *sgp = sg->sgp;
4789 if (ret == nr_cpu_ids || ret == cpu) { 5092 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
4790 if (rq->nr_running) 5093
4791 return 1; 5094 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
4792 } 5095 goto need_kick_unlock;
5096
5097 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5098 && (cpumask_first_and(nohz.idle_cpus_mask,
5099 sched_domain_span(sd)) < cpu))
5100 goto need_kick_unlock;
5101
5102 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5103 break;
4793 } 5104 }
5105 rcu_read_unlock();
4794 return 0; 5106 return 0;
5107
5108need_kick_unlock:
5109 rcu_read_unlock();
5110need_kick:
5111 return 1;
4795} 5112}
4796#else 5113#else
4797static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 5114static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
@@ -4826,14 +5143,14 @@ static inline int on_null_domain(int cpu)
4826/* 5143/*
4827 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 5144 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4828 */ 5145 */
4829static inline void trigger_load_balance(struct rq *rq, int cpu) 5146void trigger_load_balance(struct rq *rq, int cpu)
4830{ 5147{
4831 /* Don't need to rebalance while attached to NULL domain */ 5148 /* Don't need to rebalance while attached to NULL domain */
4832 if (time_after_eq(jiffies, rq->next_balance) && 5149 if (time_after_eq(jiffies, rq->next_balance) &&
4833 likely(!on_null_domain(cpu))) 5150 likely(!on_null_domain(cpu)))
4834 raise_softirq(SCHED_SOFTIRQ); 5151 raise_softirq(SCHED_SOFTIRQ);
4835#ifdef CONFIG_NO_HZ 5152#ifdef CONFIG_NO_HZ
4836 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 5153 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4837 nohz_balancer_kick(cpu); 5154 nohz_balancer_kick(cpu);
4838#endif 5155#endif
4839} 5156}
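Note the dropped "else": with this change a single scheduler tick can both raise SCHED_SOFTIRQ for its own rebalance and kick an idle CPU. A simplified view of the per-tick flow, not literal kernel code:

/*
 * scheduler_tick()
 *   -> trigger_load_balance(rq, cpu)
 *        time_after_eq(jiffies, rq->next_balance)  -> raise_softirq(SCHED_SOFTIRQ)
 *        nohz_kick_needed(rq, cpu)                 -> nohz_balancer_kick(cpu)
 *
 * The softirq handler (run_rebalance_domains) then does the regular rebalance
 * and, when NOHZ_BALANCE_KICK is set, nohz_idle_balance() on behalf of the
 * tickless cpus.
 */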
@@ -4848,15 +5165,6 @@ static void rq_offline_fair(struct rq *rq)
4848 update_sysctl(); 5165 update_sysctl();
4849} 5166}
4850 5167
4851#else /* CONFIG_SMP */
4852
4853/*
4854 * on UP we do not need to balance between CPUs:
4855 */
4856static inline void idle_balance(int cpu, struct rq *rq)
4857{
4858}
4859
4860#endif /* CONFIG_SMP */ 5168#endif /* CONFIG_SMP */
4861 5169
4862/* 5170/*
@@ -4880,8 +5188,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4880 */ 5188 */
4881static void task_fork_fair(struct task_struct *p) 5189static void task_fork_fair(struct task_struct *p)
4882{ 5190{
4883 struct cfs_rq *cfs_rq = task_cfs_rq(current); 5191 struct cfs_rq *cfs_rq;
4884 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 5192 struct sched_entity *se = &p->se, *curr;
4885 int this_cpu = smp_processor_id(); 5193 int this_cpu = smp_processor_id();
4886 struct rq *rq = this_rq(); 5194 struct rq *rq = this_rq();
4887 unsigned long flags; 5195 unsigned long flags;
@@ -4890,6 +5198,9 @@ static void task_fork_fair(struct task_struct *p)
4890 5198
4891 update_rq_clock(rq); 5199 update_rq_clock(rq);
4892 5200
5201 cfs_rq = task_cfs_rq(current);
5202 curr = cfs_rq->curr;
5203
4893 if (unlikely(task_cpu(p) != this_cpu)) { 5204 if (unlikely(task_cpu(p) != this_cpu)) {
4894 rcu_read_lock(); 5205 rcu_read_lock();
4895 __set_task_cpu(p, this_cpu); 5206 __set_task_cpu(p, this_cpu);
@@ -4999,6 +5310,16 @@ static void set_curr_task_fair(struct rq *rq)
4999 } 5310 }
5000} 5311}
5001 5312
5313void init_cfs_rq(struct cfs_rq *cfs_rq)
5314{
5315 cfs_rq->tasks_timeline = RB_ROOT;
5316 INIT_LIST_HEAD(&cfs_rq->tasks);
5317 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5318#ifndef CONFIG_64BIT
5319 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5320#endif
5321}
5322
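min_vruntime deliberately starts about a million units below the u64 wrap point so that wraparound is exercised soon after boot; comparisons elsewhere therefore go through signed-delta helpers. A standalone sketch of that comparison style (the helper mirrors the CFS max_vruntime() helper and is shown here only for illustration):

#include <stdio.h>
#include <stdint.h>

/* Compare via the signed difference so the result stays correct when the
 * unsigned counter wraps around; illustrative copy of the CFS-style helper. */
static uint64_t max_vruntime(uint64_t max, uint64_t v)
{
        if ((int64_t)(v - max) > 0)
                max = v;
        return max;
}

int main(void)
{
        uint64_t min_vruntime = (uint64_t)(-(1LL << 20));  /* the initial value above */
        uint64_t v = min_vruntime + (1ULL << 21);          /* lands past the wrap     */

        /* prints 1: v is still seen as "later" even though it wrapped to a
         * numerically smaller value */
        printf("%d\n", max_vruntime(min_vruntime, v) == v);
        return 0;
}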
5002#ifdef CONFIG_FAIR_GROUP_SCHED 5323#ifdef CONFIG_FAIR_GROUP_SCHED
5003static void task_move_group_fair(struct task_struct *p, int on_rq) 5324static void task_move_group_fair(struct task_struct *p, int on_rq)
5004{ 5325{
@@ -5015,13 +5336,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
5015 * to another cgroup's rq. This does somewhat interfere with the 5336 * to another cgroup's rq. This does somewhat interfere with the
5016 * fair sleeper stuff for the first placement, but who cares. 5337 * fair sleeper stuff for the first placement, but who cares.
5017 */ 5338 */
5339 /*
5340 * When !on_rq, vruntime of the task has usually NOT been normalized.
5341 * But there are some cases where it has already been normalized:
5342 *
5343 * - Moving a forked child which is waiting for being woken up by
5344 * wake_up_new_task().
5345 * - Moving a task which has been woken up by try_to_wake_up() and
5346 * waiting for actually being woken up by sched_ttwu_pending().
5347 *
5348 * To prevent boost or penalty in the new cfs_rq caused by delta
5349 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5350 */
5351 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5352 on_rq = 1;
5353
5018 if (!on_rq) 5354 if (!on_rq)
5019 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 5355 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5020 set_task_rq(p, task_cpu(p)); 5356 set_task_rq(p, task_cpu(p));
5021 if (!on_rq) 5357 if (!on_rq)
5022 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5358 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
5023} 5359}
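The renormalization above is plain arithmetic: strip the old queue's min_vruntime, add the new one's, so the task keeps the same relative lag. A worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
        unsigned long long vruntime = 1040;     /* task on the old cfs_rq      */
        unsigned long long old_min  = 1000;     /* old cfs_rq->min_vruntime    */
        unsigned long long new_min  = 5000;     /* new cfs_rq->min_vruntime    */

        vruntime -= old_min;    /* 40: lag relative to the old queue           */
        vruntime += new_min;    /* 5040: same 40 units ahead of the new queue  */
        printf("%llu\n", vruntime);

        /* Skipping this for tasks that are already normalized (fresh forks,
         * TASK_WAKING) is what the on_rq override above is for: applying the
         * offset twice would hand out a spurious boost or penalty. */
        return 0;
}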
5360
5361void free_fair_sched_group(struct task_group *tg)
5362{
5363 int i;
5364
5365 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5366
5367 for_each_possible_cpu(i) {
5368 if (tg->cfs_rq)
5369 kfree(tg->cfs_rq[i]);
5370 if (tg->se)
5371 kfree(tg->se[i]);
5372 }
5373
5374 kfree(tg->cfs_rq);
5375 kfree(tg->se);
5376}
5377
5378int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5379{
5380 struct cfs_rq *cfs_rq;
5381 struct sched_entity *se;
5382 int i;
5383
5384 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5385 if (!tg->cfs_rq)
5386 goto err;
5387 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5388 if (!tg->se)
5389 goto err;
5390
5391 tg->shares = NICE_0_LOAD;
5392
5393 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5394
5395 for_each_possible_cpu(i) {
5396 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5397 GFP_KERNEL, cpu_to_node(i));
5398 if (!cfs_rq)
5399 goto err;
5400
5401 se = kzalloc_node(sizeof(struct sched_entity),
5402 GFP_KERNEL, cpu_to_node(i));
5403 if (!se)
5404 goto err_free_rq;
5405
5406 init_cfs_rq(cfs_rq);
5407 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5408 }
5409
5410 return 1;
5411
5412err_free_rq:
5413 kfree(cfs_rq);
5414err:
5415 return 0;
5416}
5417
5418void unregister_fair_sched_group(struct task_group *tg, int cpu)
5419{
5420 struct rq *rq = cpu_rq(cpu);
5421 unsigned long flags;
5422
5423 /*
5424 * Only empty task groups can be destroyed; so we can speculatively
5425 * check on_list without danger of it being re-added.
5426 */
5427 if (!tg->cfs_rq[cpu]->on_list)
5428 return;
5429
5430 raw_spin_lock_irqsave(&rq->lock, flags);
5431 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5432 raw_spin_unlock_irqrestore(&rq->lock, flags);
5433}
5434
5435void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5436 struct sched_entity *se, int cpu,
5437 struct sched_entity *parent)
5438{
5439 struct rq *rq = cpu_rq(cpu);
5440
5441 cfs_rq->tg = tg;
5442 cfs_rq->rq = rq;
5443#ifdef CONFIG_SMP
5444 /* allow initial update_cfs_load() to truncate */
5445 cfs_rq->load_stamp = 1;
5024#endif 5446#endif
5447 init_cfs_rq_runtime(cfs_rq);
5448
5449 tg->cfs_rq[cpu] = cfs_rq;
5450 tg->se[cpu] = se;
5451
5452 /* se could be NULL for root_task_group */
5453 if (!se)
5454 return;
5455
5456 if (!parent)
5457 se->cfs_rq = &rq->cfs;
5458 else
5459 se->cfs_rq = parent->my_q;
5460
5461 se->my_q = cfs_rq;
5462 update_load_set(&se->load, 0);
5463 se->parent = parent;
5464}
5465
5466static DEFINE_MUTEX(shares_mutex);
5467
5468int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5469{
5470 int i;
5471 unsigned long flags;
5472
5473 /*
5474 * We can't change the weight of the root cgroup.
5475 */
5476 if (!tg->se[0])
5477 return -EINVAL;
5478
5479 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5480
5481 mutex_lock(&shares_mutex);
5482 if (tg->shares == shares)
5483 goto done;
5484
5485 tg->shares = shares;
5486 for_each_possible_cpu(i) {
5487 struct rq *rq = cpu_rq(i);
5488 struct sched_entity *se;
5489
5490 se = tg->se[i];
5491 /* Propagate contribution to hierarchy */
5492 raw_spin_lock_irqsave(&rq->lock, flags);
5493 for_each_sched_entity(se)
5494 update_cfs_shares(group_cfs_rq(se));
5495 raw_spin_unlock_irqrestore(&rq->lock, flags);
5496 }
5497
5498done:
5499 mutex_unlock(&shares_mutex);
5500 return 0;
5501}
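Callers are expected to pass a load-scaled weight; for instance the cgroup "cpu.shares" write path would funnel into this function along the lines below. This is a sketch with an assumed wrapper name; the real handler lives in kernel/sched/core.c and is not part of this hunk.

/* Illustrative caller, not part of the patch. 'shares' is in cpu.shares
 * units (1024 == the default weight). */
static int set_group_weight(struct task_group *tg, unsigned long shares)
{
        return sched_group_set_shares(tg, scale_load(shares));
}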
5502#else /* CONFIG_FAIR_GROUP_SCHED */
5503
5504void free_fair_sched_group(struct task_group *tg) { }
5505
5506int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5507{
5508 return 1;
5509}
5510
5511void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
5512
5513#endif /* CONFIG_FAIR_GROUP_SCHED */
5514
5025 5515
5026static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 5516static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
5027{ 5517{
@@ -5041,7 +5531,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
5041/* 5531/*
5042 * All the scheduling class methods: 5532 * All the scheduling class methods:
5043 */ 5533 */
5044static const struct sched_class fair_sched_class = { 5534const struct sched_class fair_sched_class = {
5045 .next = &idle_sched_class, 5535 .next = &idle_sched_class,
5046 .enqueue_task = enqueue_task_fair, 5536 .enqueue_task = enqueue_task_fair,
5047 .dequeue_task = dequeue_task_fair, 5537 .dequeue_task = dequeue_task_fair,
@@ -5078,7 +5568,7 @@ static const struct sched_class fair_sched_class = {
5078}; 5568};
5079 5569
5080#ifdef CONFIG_SCHED_DEBUG 5570#ifdef CONFIG_SCHED_DEBUG
5081static void print_cfs_stats(struct seq_file *m, int cpu) 5571void print_cfs_stats(struct seq_file *m, int cpu)
5082{ 5572{
5083 struct cfs_rq *cfs_rq; 5573 struct cfs_rq *cfs_rq;
5084 5574
@@ -5088,3 +5578,15 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
5088 rcu_read_unlock(); 5578 rcu_read_unlock();
5089} 5579}
5090#endif 5580#endif
5581
5582__init void init_sched_fair_class(void)
5583{
5584#ifdef CONFIG_SMP
5585 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5586
5587#ifdef CONFIG_NO_HZ
5588 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5589#endif
5590#endif /* SMP */
5591
5592}
diff --git a/kernel/sched_features.h b/kernel/sched/features.h
index 84802245abd2..e61fd73913d0 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched/features.h
@@ -3,13 +3,13 @@
3 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
4 * rip the spread apart. 4 * rip the spread apart.
5 */ 5 */
6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
7 7
8/* 8/*
9 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
10 * tasks 10 * tasks
11 */ 11 */
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, true)
13 13
14/* 14/*
15 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
@@ -17,54 +17,54 @@ SCHED_FEAT(START_DEBIT, 1)
17 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
18 * generated by pipes and the like, see also SYNC_WAKEUPS. 18 * generated by pipes and the like, see also SYNC_WAKEUPS.
19 */ 19 */
20SCHED_FEAT(AFFINE_WAKEUPS, 1) 20SCHED_FEAT(AFFINE_WAKEUPS, true)
21 21
22/* 22/*
23 * Prefer to schedule the task we woke last (assuming it failed 23 * Prefer to schedule the task we woke last (assuming it failed
24 * wakeup-preemption), since it's likely going to consume data we 24 * wakeup-preemption), since it's likely going to consume data we
25 * touched, increases cache locality. 25 * touched, increases cache locality.
26 */ 26 */
27SCHED_FEAT(NEXT_BUDDY, 0) 27SCHED_FEAT(NEXT_BUDDY, false)
28 28
29/* 29/*
30 * Prefer to schedule the task that ran last (when we did 30 * Prefer to schedule the task that ran last (when we did
31 * wake-preempt) as that likely will touch the same data, increases 31 * wake-preempt) as that likely will touch the same data, increases
32 * cache locality. 32 * cache locality.
33 */ 33 */
34SCHED_FEAT(LAST_BUDDY, 1) 34SCHED_FEAT(LAST_BUDDY, true)
35 35
36/* 36/*
37 * Consider buddies to be cache hot, decreases the likelihood of a 37 * Consider buddies to be cache hot, decreases the likelihood of a
38 * cache buddy being migrated away, increases cache locality. 38 * cache buddy being migrated away, increases cache locality.
39 */ 39 */
40SCHED_FEAT(CACHE_HOT_BUDDY, 1) 40SCHED_FEAT(CACHE_HOT_BUDDY, true)
41 41
42/* 42/*
43 * Use arch dependent cpu power functions 43 * Use arch dependent cpu power functions
44 */ 44 */
45SCHED_FEAT(ARCH_POWER, 0) 45SCHED_FEAT(ARCH_POWER, false)
46 46
47SCHED_FEAT(HRTICK, 0) 47SCHED_FEAT(HRTICK, false)
48SCHED_FEAT(DOUBLE_TICK, 0) 48SCHED_FEAT(DOUBLE_TICK, false)
49SCHED_FEAT(LB_BIAS, 1) 49SCHED_FEAT(LB_BIAS, true)
50 50
51/* 51/*
52 * Spin-wait on mutex acquisition when the mutex owner is running on 52 * Spin-wait on mutex acquisition when the mutex owner is running on
53 * another cpu -- assumes that when the owner is running, it will soon 53 * another cpu -- assumes that when the owner is running, it will soon
54 * release the lock. Decreases scheduling overhead. 54 * release the lock. Decreases scheduling overhead.
55 */ 55 */
56SCHED_FEAT(OWNER_SPIN, 1) 56SCHED_FEAT(OWNER_SPIN, true)
57 57
58/* 58/*
59 * Decrement CPU power based on time not spent running tasks 59 * Decrement CPU power based on time not spent running tasks
60 */ 60 */
61SCHED_FEAT(NONTASK_POWER, 1) 61SCHED_FEAT(NONTASK_POWER, true)
62 62
63/* 63/*
64 * Queue remote wakeups on the target CPU and process them 64 * Queue remote wakeups on the target CPU and process them
65 * using the scheduler IPI. Reduces rq->lock contention/bounces. 65 * using the scheduler IPI. Reduces rq->lock contention/bounces.
66 */ 66 */
67SCHED_FEAT(TTWU_QUEUE, 1) 67SCHED_FEAT(TTWU_QUEUE, true)
68 68
69SCHED_FEAT(FORCE_SD_OVERLAP, 0) 69SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, 1) 70SCHED_FEAT(RT_RUNTIME_SHARE, true)
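features.h stays an x-macro list; switching the second argument to true/false matches the new consumer, which redefines SCHED_FEAT before including the file. The actual wiring, including the jump-label fast path added by this series, lives in kernel/sched/sched.h and core.c; a rough sketch of the slow-path shape, assumed for illustration:

enum {
#define SCHED_FEAT(name, enabled)       __SCHED_FEAT_##name,
#include "features.h"
        __SCHED_FEAT_NR,
};
#undef SCHED_FEAT

/* slow-path test; the series also adds a static-key based variant */
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))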
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c
index 0a51882534ea..91b4c957f289 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched/idle_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * idle-task scheduling class. 4 * idle-task scheduling class.
3 * 5 *
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
71/* 73/*
72 * Simple, special scheduling class for the per-CPU idle tasks: 74 * Simple, special scheduling class for the per-CPU idle tasks:
73 */ 75 */
74static const struct sched_class idle_sched_class = { 76const struct sched_class idle_sched_class = {
75 /* .next is NULL */ 77 /* .next is NULL */
76 /* no enqueue/yield_task for idle tasks */ 78 /* no enqueue/yield_task for idle tasks */
77 79
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c
index 583a1368afe6..3640ebbb466b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched/rt.c
@@ -3,7 +3,92 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#include "sched.h"
7
8#include <linux/slab.h>
9
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11
12struct rt_bandwidth def_rt_bandwidth;
13
14static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
15{
16 struct rt_bandwidth *rt_b =
17 container_of(timer, struct rt_bandwidth, rt_period_timer);
18 ktime_t now;
19 int overrun;
20 int idle = 0;
21
22 for (;;) {
23 now = hrtimer_cb_get_time(timer);
24 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
25
26 if (!overrun)
27 break;
28
29 idle = do_sched_rt_period_timer(rt_b, overrun);
30 }
31
32 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
33}
34
35void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
36{
37 rt_b->rt_period = ns_to_ktime(period);
38 rt_b->rt_runtime = runtime;
39
40 raw_spin_lock_init(&rt_b->rt_runtime_lock);
41
42 hrtimer_init(&rt_b->rt_period_timer,
43 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
44 rt_b->rt_period_timer.function = sched_rt_period_timer;
45}
46
47static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
48{
49 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
50 return;
51
52 if (hrtimer_active(&rt_b->rt_period_timer))
53 return;
54
55 raw_spin_lock(&rt_b->rt_runtime_lock);
56 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
57 raw_spin_unlock(&rt_b->rt_runtime_lock);
58}
59
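def_rt_bandwidth declared above is filled in by the core at boot, roughly like this (a sketch; the actual call sits in sched_init() in kernel/sched/core.c, with period and runtime derived from the sched_rt_period_us and sched_rt_runtime_us sysctls):

init_rt_bandwidth(&def_rt_bandwidth,
                  global_rt_period(), global_rt_runtime());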
60void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
61{
62 struct rt_prio_array *array;
63 int i;
64
65 array = &rt_rq->active;
66 for (i = 0; i < MAX_RT_PRIO; i++) {
67 INIT_LIST_HEAD(array->queue + i);
68 __clear_bit(i, array->bitmap);
69 }
70 /* delimiter for bitsearch: */
71 __set_bit(MAX_RT_PRIO, array->bitmap);
72
73#if defined CONFIG_SMP
74 rt_rq->highest_prio.curr = MAX_RT_PRIO;
75 rt_rq->highest_prio.next = MAX_RT_PRIO;
76 rt_rq->rt_nr_migratory = 0;
77 rt_rq->overloaded = 0;
78 plist_head_init(&rt_rq->pushable_tasks);
79#endif
80
81 rt_rq->rt_time = 0;
82 rt_rq->rt_throttled = 0;
83 rt_rq->rt_runtime = 0;
84 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
85}
86
6#ifdef CONFIG_RT_GROUP_SCHED 87#ifdef CONFIG_RT_GROUP_SCHED
88static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
89{
90 hrtimer_cancel(&rt_b->rt_period_timer);
91}
7 92
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 93#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9 94
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
25 return rt_se->rt_rq; 110 return rt_se->rt_rq;
26} 111}
27 112
113void free_rt_sched_group(struct task_group *tg)
114{
115 int i;
116
117 if (tg->rt_se)
118 destroy_rt_bandwidth(&tg->rt_bandwidth);
119
120 for_each_possible_cpu(i) {
121 if (tg->rt_rq)
122 kfree(tg->rt_rq[i]);
123 if (tg->rt_se)
124 kfree(tg->rt_se[i]);
125 }
126
127 kfree(tg->rt_rq);
128 kfree(tg->rt_se);
129}
130
131void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
132 struct sched_rt_entity *rt_se, int cpu,
133 struct sched_rt_entity *parent)
134{
135 struct rq *rq = cpu_rq(cpu);
136
137 rt_rq->highest_prio.curr = MAX_RT_PRIO;
138 rt_rq->rt_nr_boosted = 0;
139 rt_rq->rq = rq;
140 rt_rq->tg = tg;
141
142 tg->rt_rq[cpu] = rt_rq;
143 tg->rt_se[cpu] = rt_se;
144
145 if (!rt_se)
146 return;
147
148 if (!parent)
149 rt_se->rt_rq = &rq->rt;
150 else
151 rt_se->rt_rq = parent->my_q;
152
153 rt_se->my_q = rt_rq;
154 rt_se->parent = parent;
155 INIT_LIST_HEAD(&rt_se->run_list);
156}
157
158int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
159{
160 struct rt_rq *rt_rq;
161 struct sched_rt_entity *rt_se;
162 int i;
163
164 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
165 if (!tg->rt_rq)
166 goto err;
167 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
168 if (!tg->rt_se)
169 goto err;
170
171 init_rt_bandwidth(&tg->rt_bandwidth,
172 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
173
174 for_each_possible_cpu(i) {
175 rt_rq = kzalloc_node(sizeof(struct rt_rq),
176 GFP_KERNEL, cpu_to_node(i));
177 if (!rt_rq)
178 goto err;
179
180 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
181 GFP_KERNEL, cpu_to_node(i));
182 if (!rt_se)
183 goto err_free_rq;
184
185 init_rt_rq(rt_rq, cpu_rq(i));
186 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
187 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
188 }
189
190 return 1;
191
192err_free_rq:
193 kfree(rt_rq);
194err:
195 return 0;
196}
197
28#else /* CONFIG_RT_GROUP_SCHED */ 198#else /* CONFIG_RT_GROUP_SCHED */
29 199
30#define rt_entity_is_task(rt_se) (1) 200#define rt_entity_is_task(rt_se) (1)
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
47 return &rq->rt; 217 return &rq->rt;
48} 218}
49 219
220void free_rt_sched_group(struct task_group *tg) { }
221
222int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
223{
224 return 1;
225}
50#endif /* CONFIG_RT_GROUP_SCHED */ 226#endif /* CONFIG_RT_GROUP_SCHED */
51 227
52#ifdef CONFIG_SMP 228#ifdef CONFIG_SMP
@@ -556,6 +732,28 @@ static void enable_runtime(struct rq *rq)
556 raw_spin_unlock_irqrestore(&rq->lock, flags); 732 raw_spin_unlock_irqrestore(&rq->lock, flags);
557} 733}
558 734
735int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
736{
737 int cpu = (int)(long)hcpu;
738
739 switch (action) {
740 case CPU_DOWN_PREPARE:
741 case CPU_DOWN_PREPARE_FROZEN:
742 disable_runtime(cpu_rq(cpu));
743 return NOTIFY_OK;
744
745 case CPU_DOWN_FAILED:
746 case CPU_DOWN_FAILED_FROZEN:
747 case CPU_ONLINE:
748 case CPU_ONLINE_FROZEN:
749 enable_runtime(cpu_rq(cpu));
750 return NOTIFY_OK;
751
752 default:
753 return NOTIFY_DONE;
754 }
755}
756
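update_runtime() is made non-static so the core can register it as a CPU hotplug notifier; the registration is expected to look roughly like this (a sketch; the real call site is in kernel/sched/core.c, outside this diff):

/* during SMP bring-up, e.g. from sched_init_smp() */
hotcpu_notifier(update_runtime, 0);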
559static int balance_runtime(struct rt_rq *rt_rq) 757static int balance_runtime(struct rt_rq *rt_rq)
560{ 758{
561 int more = 0; 759 int more = 0;
@@ -648,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
648 if (rt_rq->rt_throttled) 846 if (rt_rq->rt_throttled)
649 return rt_rq_throttled(rt_rq); 847 return rt_rq_throttled(rt_rq);
650 848
651 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 849 if (runtime >= sched_rt_period(rt_rq))
652 return 0; 850 return 0;
653 851
654 balance_runtime(rt_rq); 852 balance_runtime(rt_rq);
@@ -957,8 +1155,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
957} 1155}
958 1156
959/* 1157/*
960 * Put task to the end of the run list without the overhead of dequeue 1158 * Put task to the head or the end of the run list without the overhead of
961 * followed by enqueue. 1159 * dequeue followed by enqueue.
962 */ 1160 */
963static void 1161static void
964requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) 1162requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
@@ -1002,6 +1200,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1002 1200
1003 cpu = task_cpu(p); 1201 cpu = task_cpu(p);
1004 1202
1203 if (p->rt.nr_cpus_allowed == 1)
1204 goto out;
1205
1005 /* For anything but wake ups, just return the task_cpu */ 1206 /* For anything but wake ups, just return the task_cpu */
1006 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1207 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1007 goto out; 1208 goto out;
@@ -1178,8 +1379,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1178/* Only try algorithms three times */ 1379/* Only try algorithms three times */
1179#define RT_MAX_TRIES 3 1380#define RT_MAX_TRIES 3
1180 1381
1181static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1182
1183static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1382static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1184{ 1383{
1185 if (!task_running(rq, p) && 1384 if (!task_running(rq, p) &&
@@ -1653,13 +1852,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1653 pull_rt_task(rq); 1852 pull_rt_task(rq);
1654} 1853}
1655 1854
1656static inline void init_sched_rt_class(void) 1855void init_sched_rt_class(void)
1657{ 1856{
1658 unsigned int i; 1857 unsigned int i;
1659 1858
1660 for_each_possible_cpu(i) 1859 for_each_possible_cpu(i) {
1661 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1860 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1662 GFP_KERNEL, cpu_to_node(i)); 1861 GFP_KERNEL, cpu_to_node(i));
1862 }
1663} 1863}
1664#endif /* CONFIG_SMP */ 1864#endif /* CONFIG_SMP */
1665 1865
@@ -1800,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1800 return 0; 2000 return 0;
1801} 2001}
1802 2002
1803static const struct sched_class rt_sched_class = { 2003const struct sched_class rt_sched_class = {
1804 .next = &fair_sched_class, 2004 .next = &fair_sched_class,
1805 .enqueue_task = enqueue_task_rt, 2005 .enqueue_task = enqueue_task_rt,
1806 .dequeue_task = dequeue_task_rt, 2006 .dequeue_task = dequeue_task_rt,
@@ -1835,7 +2035,7 @@ static const struct sched_class rt_sched_class = {
1835#ifdef CONFIG_SCHED_DEBUG 2035#ifdef CONFIG_SCHED_DEBUG
1836extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 2036extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1837 2037
1838static void print_rt_stats(struct seq_file *m, int cpu) 2038void print_rt_stats(struct seq_file *m, int cpu)
1839{ 2039{
1840 rt_rq_iter_t iter; 2040 rt_rq_iter_t iter;
1841 struct rt_rq *rt_rq; 2041 struct rt_rq *rt_rq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
new file mode 100644
index 000000000000..98c0c2623db8
--- /dev/null
+++ b/kernel/sched/sched.h
@@ -0,0 +1,1166 @@
1
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/spinlock.h>
5#include <linux/stop_machine.h>
6
7#include "cpupri.h"
8
9extern __read_mostly int scheduler_running;
10
11/*
12 * Convert user-nice values [ -20 ... 0 ... 19 ]
13 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
14 * and back.
15 */
16#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
17#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
18#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
19
20/*
21 * 'User priority' is the nice value converted to something we
22 * can work with better when scaling various scheduler parameters,
23 * it's a [ 0 ... 39 ] range.
24 */
25#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
26#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
27#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
28
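Worked numbers for the conversions above, assuming the usual mainline values MAX_RT_PRIO == 100 and MAX_PRIO == 140:

/*
 *   NICE_TO_PRIO(-20) == 100     NICE_TO_PRIO(0) == 120     NICE_TO_PRIO(19) == 139
 *   PRIO_TO_NICE(120) ==   0
 *   USER_PRIO(120)    ==  20     MAX_USER_PRIO   ==  40
 */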
29/*
30 * Helpers for converting nanosecond timing to jiffy resolution
31 */
32#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
33
34#define NICE_0_LOAD SCHED_LOAD_SCALE
35#define NICE_0_SHIFT SCHED_LOAD_SHIFT
36
37/*
38 * These are the 'tuning knobs' of the scheduler:
39 *
40 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
41 * Timeslices get refilled after they expire.
42 */
43#define DEF_TIMESLICE (100 * HZ / 1000)
44
45/*
46 * single value that denotes runtime == period, ie unlimited time.
47 */
48#define RUNTIME_INF ((u64)~0ULL)
49
50static inline int rt_policy(int policy)
51{
52 if (policy == SCHED_FIFO || policy == SCHED_RR)
53 return 1;
54 return 0;
55}
56
57static inline int task_has_rt_policy(struct task_struct *p)
58{
59 return rt_policy(p->policy);
60}
61
62/*
63 * This is the priority-queue data structure of the RT scheduling class:
64 */
65struct rt_prio_array {
66 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
67 struct list_head queue[MAX_RT_PRIO];
68};
69
70struct rt_bandwidth {
71 /* nests inside the rq lock: */
72 raw_spinlock_t rt_runtime_lock;
73 ktime_t rt_period;
74 u64 rt_runtime;
75 struct hrtimer rt_period_timer;
76};
77
78extern struct mutex sched_domains_mutex;
79
80#ifdef CONFIG_CGROUP_SCHED
81
82#include <linux/cgroup.h>
83
84struct cfs_rq;
85struct rt_rq;
86
87static LIST_HEAD(task_groups);
88
89struct cfs_bandwidth {
90#ifdef CONFIG_CFS_BANDWIDTH
91 raw_spinlock_t lock;
92 ktime_t period;
93 u64 quota, runtime;
94 s64 hierarchal_quota;
95 u64 runtime_expires;
96
97 int idle, timer_active;
98 struct hrtimer period_timer, slack_timer;
99 struct list_head throttled_cfs_rq;
100
101 /* statistics */
102 int nr_periods, nr_throttled;
103 u64 throttled_time;
104#endif
105};
106
107/* task group related information */
108struct task_group {
109 struct cgroup_subsys_state css;
110
111#ifdef CONFIG_FAIR_GROUP_SCHED
112 /* schedulable entities of this group on each cpu */
113 struct sched_entity **se;
114 /* runqueue "owned" by this group on each cpu */
115 struct cfs_rq **cfs_rq;
116 unsigned long shares;
117
118 atomic_t load_weight;
119#endif
120
121#ifdef CONFIG_RT_GROUP_SCHED
122 struct sched_rt_entity **rt_se;
123 struct rt_rq **rt_rq;
124
125 struct rt_bandwidth rt_bandwidth;
126#endif
127
128 struct rcu_head rcu;
129 struct list_head list;
130
131 struct task_group *parent;
132 struct list_head siblings;
133 struct list_head children;
134
135#ifdef CONFIG_SCHED_AUTOGROUP
136 struct autogroup *autogroup;
137#endif
138
139 struct cfs_bandwidth cfs_bandwidth;
140};
141
142#ifdef CONFIG_FAIR_GROUP_SCHED
143#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
144
145/*
146 * A weight of 0 or 1 can cause arithmetic problems.
147 * The weight of a cfs_rq is the sum of the weights of the entities
148 * queued on that cfs_rq, so the weight of an entity should not be
149 * too large, and neither should the shares value of a task group.
150 * (The default weight is 1024 - so there's no practical
151 * limitation from this.)
152 */
153#define MIN_SHARES (1UL << 1)
154#define MAX_SHARES (1UL << 18)
155#endif
156
157/* Default task group.
158 * Every task in the system belongs to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *);
163
164extern int walk_tg_tree_from(struct task_group *from,
165 tg_visitor down, tg_visitor up, void *data);
166
167/*
168 * Iterate the full tree, calling @down when first entering a node and @up when
169 * leaving it for the final time.
170 *
171 * Caller must hold rcu_lock or sufficient equivalent.
172 */
173static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
174{
175 return walk_tg_tree_from(&root_task_group, down, up, data);
176}
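Any function with the tg_visitor signature can be plugged in; tg_nop (declared just below) is the usual no-op for the direction a caller does not care about. An illustrative visitor, not part of the patch:

/* Counts task groups while walking the hierarchy. */
static int tg_count(struct task_group *tg, void *data)
{
        (*(int *)data)++;
        return 0;               /* non-zero would abort the walk */
}

static int count_task_groups(void)
{
        int n = 0;

        rcu_read_lock();        /* walk_tg_tree() requires RCU read-side */
        walk_tg_tree(tg_count, tg_nop, &n);
        rcu_read_unlock();
        return n;
}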
177
178extern int tg_nop(struct task_group *tg, void *data);
179
180extern void free_fair_sched_group(struct task_group *tg);
181extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
182extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
183extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
184 struct sched_entity *se, int cpu,
185 struct sched_entity *parent);
186extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
187extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
188
189extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
190extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
191extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
192
193extern void free_rt_sched_group(struct task_group *tg);
194extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
195extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent);
198
199#else /* CONFIG_CGROUP_SCHED */
200
201struct cfs_bandwidth { };
202
203#endif /* CONFIG_CGROUP_SCHED */
204
205/* CFS-related fields in a runqueue */
206struct cfs_rq {
207 struct load_weight load;
208 unsigned long nr_running, h_nr_running;
209
210 u64 exec_clock;
211 u64 min_vruntime;
212#ifndef CONFIG_64BIT
213 u64 min_vruntime_copy;
214#endif
215
216 struct rb_root tasks_timeline;
217 struct rb_node *rb_leftmost;
218
219 struct list_head tasks;
220 struct list_head *balance_iterator;
221
222 /*
223 * 'curr' points to currently running entity on this cfs_rq.
224 * It is set to NULL otherwise (i.e when none are currently running).
225 */
226 struct sched_entity *curr, *next, *last, *skip;
227
228#ifdef CONFIG_SCHED_DEBUG
229 unsigned int nr_spread_over;
230#endif
231
232#ifdef CONFIG_FAIR_GROUP_SCHED
233 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
234
235 /*
236 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
237 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
238 * (like users, containers etc.)
239 *
240 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
241 * list is used during load balance.
242 */
243 int on_list;
244 struct list_head leaf_cfs_rq_list;
245 struct task_group *tg; /* group that "owns" this runqueue */
246
247#ifdef CONFIG_SMP
248 /*
249 * the part of load.weight contributed by tasks
250 */
251 unsigned long task_weight;
252
253 /*
254 * h_load = weight * f(tg)
255 *
256 * Where f(tg) is the recursive weight fraction assigned to
257 * this group.
258 */
259 unsigned long h_load;
260
261 /*
262 * Maintaining per-cpu shares distribution for group scheduling
263 *
264 * load_stamp is the last time we updated the load average
265 * load_last is the last time we updated the load average and saw load
266 * load_unacc_exec_time is currently unaccounted execution time
267 */
268 u64 load_avg;
269 u64 load_period;
270 u64 load_stamp, load_last, load_unacc_exec_time;
271
272 unsigned long load_contribution;
273#endif /* CONFIG_SMP */
274#ifdef CONFIG_CFS_BANDWIDTH
275 int runtime_enabled;
276 u64 runtime_expires;
277 s64 runtime_remaining;
278
279 u64 throttled_timestamp;
280 int throttled, throttle_count;
281 struct list_head throttled_list;
282#endif /* CONFIG_CFS_BANDWIDTH */
283#endif /* CONFIG_FAIR_GROUP_SCHED */
284};
285
286static inline int rt_bandwidth_enabled(void)
287{
288 return sysctl_sched_rt_runtime >= 0;
289}
290
291/* Real-Time classes' related field in a runqueue: */
292struct rt_rq {
293 struct rt_prio_array active;
294 unsigned long rt_nr_running;
295#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
296 struct {
297 int curr; /* highest queued rt task prio */
298#ifdef CONFIG_SMP
299 int next; /* next highest */
300#endif
301 } highest_prio;
302#endif
303#ifdef CONFIG_SMP
304 unsigned long rt_nr_migratory;
305 unsigned long rt_nr_total;
306 int overloaded;
307 struct plist_head pushable_tasks;
308#endif
309 int rt_throttled;
310 u64 rt_time;
311 u64 rt_runtime;
312 /* Nests inside the rq lock: */
313 raw_spinlock_t rt_runtime_lock;
314
315#ifdef CONFIG_RT_GROUP_SCHED
316 unsigned long rt_nr_boosted;
317
318 struct rq *rq;
319 struct list_head leaf_rt_rq_list;
320 struct task_group *tg;
321#endif
322};
323
324#ifdef CONFIG_SMP
325
326/*
327 * We add the notion of a root-domain which will be used to define per-domain
328 * variables. Each exclusive cpuset essentially defines an island domain by
329 * fully partitioning the member cpus from any other cpuset. Whenever a new
330 * exclusive cpuset is created, we also create and attach a new root-domain
331 * object.
332 *
333 */
334struct root_domain {
335 atomic_t refcount;
336 atomic_t rto_count;
337 struct rcu_head rcu;
338 cpumask_var_t span;
339 cpumask_var_t online;
340
341 /*
342 * The "RT overload" flag: it gets set if a CPU has more than
343 * one runnable RT task.
344 */
345 cpumask_var_t rto_mask;
346 struct cpupri cpupri;
347};
348
349extern struct root_domain def_root_domain;
350
351#endif /* CONFIG_SMP */
352
353/*
354 * This is the main, per-CPU runqueue data structure.
355 *
356 * Locking rule: code that wants to lock multiple runqueues (such as
357 * the load balancing or the thread migration code) must acquire the
358 * locks in ascending &runqueue order.
359 */
360struct rq {
361 /* runqueue lock: */
362 raw_spinlock_t lock;
363
364 /*
365 * nr_running and cpu_load should be in the same cacheline because
366 * remote CPUs use both these fields when doing load calculation.
367 */
368 unsigned long nr_running;
369 #define CPU_LOAD_IDX_MAX 5
370 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
371 unsigned long last_load_update_tick;
372#ifdef CONFIG_NO_HZ
373 u64 nohz_stamp;
374 unsigned long nohz_flags;
375#endif
376 int skip_clock_update;
377
378 /* capture load from *all* tasks on this cpu: */
379 struct load_weight load;
380 unsigned long nr_load_updates;
381 u64 nr_switches;
382
383 struct cfs_rq cfs;
384 struct rt_rq rt;
385
386#ifdef CONFIG_FAIR_GROUP_SCHED
387 /* list of leaf cfs_rq on this cpu: */
388 struct list_head leaf_cfs_rq_list;
389#endif
390#ifdef CONFIG_RT_GROUP_SCHED
391 struct list_head leaf_rt_rq_list;
392#endif
393
394 /*
395 * This is part of a global counter where only the total sum
396 * over all CPUs matters. A task can increase this counter on
397 * one CPU and if it got migrated afterwards it may decrease
398 * it on another CPU. Always updated under the runqueue lock:
399 */
400 unsigned long nr_uninterruptible;
401
402 struct task_struct *curr, *idle, *stop;
403 unsigned long next_balance;
404 struct mm_struct *prev_mm;
405
406 u64 clock;
407 u64 clock_task;
408
409 atomic_t nr_iowait;
410
411#ifdef CONFIG_SMP
412 struct root_domain *rd;
413 struct sched_domain *sd;
414
415 unsigned long cpu_power;
416
417 unsigned char idle_balance;
418 /* For active balancing */
419 int post_schedule;
420 int active_balance;
421 int push_cpu;
422 struct cpu_stop_work active_balance_work;
423 /* cpu of this runqueue: */
424 int cpu;
425 int online;
426
427 u64 rt_avg;
428 u64 age_stamp;
429 u64 idle_stamp;
430 u64 avg_idle;
431#endif
432
433#ifdef CONFIG_IRQ_TIME_ACCOUNTING
434 u64 prev_irq_time;
435#endif
436#ifdef CONFIG_PARAVIRT
437 u64 prev_steal_time;
438#endif
439#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
440 u64 prev_steal_time_rq;
441#endif
442
443 /* calc_load related fields */
444 unsigned long calc_load_update;
445 long calc_load_active;
446
447#ifdef CONFIG_SCHED_HRTICK
448#ifdef CONFIG_SMP
449 int hrtick_csd_pending;
450 struct call_single_data hrtick_csd;
451#endif
452 struct hrtimer hrtick_timer;
453#endif
454
455#ifdef CONFIG_SCHEDSTATS
456 /* latency stats */
457 struct sched_info rq_sched_info;
458 unsigned long long rq_cpu_time;
459 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
460
461 /* sys_sched_yield() stats */
462 unsigned int yld_count;
463
464 /* schedule() stats */
465 unsigned int sched_switch;
466 unsigned int sched_count;
467 unsigned int sched_goidle;
468
469 /* try_to_wake_up() stats */
470 unsigned int ttwu_count;
471 unsigned int ttwu_local;
472#endif
473
474#ifdef CONFIG_SMP
475 struct llist_head wake_list;
476#endif
477};
478
479static inline int cpu_of(struct rq *rq)
480{
481#ifdef CONFIG_SMP
482 return rq->cpu;
483#else
484 return 0;
485#endif
486}
487
488DECLARE_PER_CPU(struct rq, runqueues);
489
490#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
491#define this_rq() (&__get_cpu_var(runqueues))
492#define task_rq(p) cpu_rq(task_cpu(p))
493#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
494#define raw_rq() (&__raw_get_cpu_var(runqueues))
495
496#ifdef CONFIG_SMP
497
498#define rcu_dereference_check_sched_domain(p) \
499 rcu_dereference_check((p), \
500 lockdep_is_held(&sched_domains_mutex))
501
502/*
503 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
504 * See detach_destroy_domains: synchronize_sched for details.
505 *
506 * The domain tree of any CPU may only be accessed from within
507 * preempt-disabled sections.
508 */
509#define for_each_domain(cpu, __sd) \
510 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
511 __sd; __sd = __sd->parent)
512
513#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
514
515/**
516 * highest_flag_domain - Return highest sched_domain containing flag.
517 * @cpu: The cpu whose highest level of sched domain is to
518 * be returned.
519 * @flag: The flag to check for the highest sched_domain
520 * for the given cpu.
521 *
522 * Returns the highest sched_domain of a cpu which contains the given flag.
523 */
524static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
525{
526 struct sched_domain *sd, *hsd = NULL;
527
528 for_each_domain(cpu, sd) {
529 if (!(sd->flags & flag))
530 break;
531 hsd = sd;
532 }
533
534 return hsd;
535}
536
537DECLARE_PER_CPU(struct sched_domain *, sd_llc);
538DECLARE_PER_CPU(int, sd_llc_id);
539
540#endif /* CONFIG_SMP */
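
A minimal sketch of how highest_flag_domain() is typically consumed, and how the sd_llc/sd_llc_id per-cpu variables declared above can be filled in. The helper below is not part of this hunk (it would live in core.c), so treat its exact shape as an assumption: walking up from the CPU's base domain stops at the first level that no longer shares package resources, i.e. the widest domain whose CPUs still share a last-level cache.

	static void update_top_cache_domain(int cpu)
	{
		struct sched_domain *sd;
		int id = cpu;

		/* widest domain whose CPUs share package resources (the LLC) */
		sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
		if (sd)
			id = cpumask_first(sched_domain_span(sd));

		rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
		per_cpu(sd_llc_id, cpu) = id;
	}

Wake-up and NOHZ balancing paths can then start from rcu_dereference(per_cpu(sd_llc, cpu)) instead of re-walking the whole domain tree.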
541
542#include "stats.h"
543#include "auto_group.h"
544
545#ifdef CONFIG_CGROUP_SCHED
546
547/*
548 * Return the group to which this task belongs.
549 *
550 * We use task_subsys_state_check() and extend the RCU verification with
551 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
552 * task it moves into the cgroup. Therefore by holding either of those locks,
553 * we pin the task to the current cgroup.
554 */
555static inline struct task_group *task_group(struct task_struct *p)
556{
557 struct task_group *tg;
558 struct cgroup_subsys_state *css;
559
560 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
561 lockdep_is_held(&p->pi_lock) ||
562 lockdep_is_held(&task_rq(p)->lock));
563 tg = container_of(css, struct task_group, css);
564
565 return autogroup_task_group(p, tg);
566}
567
568/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
569static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
570{
571#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
572 struct task_group *tg = task_group(p);
573#endif
574
575#ifdef CONFIG_FAIR_GROUP_SCHED
576 p->se.cfs_rq = tg->cfs_rq[cpu];
577 p->se.parent = tg->se[cpu];
578#endif
579
580#ifdef CONFIG_RT_GROUP_SCHED
581 p->rt.rt_rq = tg->rt_rq[cpu];
582 p->rt.parent = tg->rt_se[cpu];
583#endif
584}
585
586#else /* CONFIG_CGROUP_SCHED */
587
588static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
589static inline struct task_group *task_group(struct task_struct *p)
590{
591 return NULL;
592}
593
594#endif /* CONFIG_CGROUP_SCHED */
595
596static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
597{
598 set_task_rq(p, cpu);
599#ifdef CONFIG_SMP
600 /*
601 * After ->cpu is set to a new value, task_rq_lock(p, ...) can be
602 * successfully executed on another CPU. We must ensure that updates of
603 * per-task data have been completed by this moment.
604 */
605 smp_wmb();
606 task_thread_info(p)->cpu = cpu;
607#endif
608}
609
610/*
611 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
612 */
613#ifdef CONFIG_SCHED_DEBUG
614# include <linux/jump_label.h>
615# define const_debug __read_mostly
616#else
617# define const_debug const
618#endif
619
620extern const_debug unsigned int sysctl_sched_features;
621
622#define SCHED_FEAT(name, enabled) \
623 __SCHED_FEAT_##name ,
624
625enum {
626#include "features.h"
627 __SCHED_FEAT_NR,
628};
629
630#undef SCHED_FEAT
631
632#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
633static __always_inline bool static_branch__true(struct jump_label_key *key)
634{
635 return likely(static_branch(key)); /* Not out of line branch. */
636}
637
638static __always_inline bool static_branch__false(struct jump_label_key *key)
639{
640 return unlikely(static_branch(key)); /* Out of line branch. */
641}
642
643#define SCHED_FEAT(name, enabled) \
644static __always_inline bool static_branch_##name(struct jump_label_key *key) \
645{ \
646 return static_branch__##enabled(key); \
647}
648
649#include "features.h"
650
651#undef SCHED_FEAT
652
653extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
654#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
655#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
656#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
657#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
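
The SCHED_FEAT() block above is an X-macro: features.h is included twice with two different definitions of SCHED_FEAT(), once to generate the enum and once to generate a per-feature static-branch helper. As a rough sketch (the feature name is only illustrative), an entry like SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) first expands to the enumerator __SCHED_FEAT_GENTLE_FAIR_SLEEPERS and then, under SCHED_DEBUG plus jump labels, to:

	static __always_inline bool
	static_branch_GENTLE_FAIR_SLEEPERS(struct jump_label_key *key)
	{
		return static_branch__true(key);	/* 'true' is the feature's default */
	}

so sched_feat(GENTLE_FAIR_SLEEPERS) becomes a runtime-patchable jump on &sched_feat_keys[__SCHED_FEAT_GENTLE_FAIR_SLEEPERS] rather than a load-and-test of sysctl_sched_features.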
658
659static inline u64 global_rt_period(void)
660{
661 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
662}
663
664static inline u64 global_rt_runtime(void)
665{
666 if (sysctl_sched_rt_runtime < 0)
667 return RUNTIME_INF;
668
669 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
670}
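
For a feel of the numbers (assuming the stock defaults of sysctl_sched_rt_period = 1,000,000 us and sysctl_sched_rt_runtime = 950,000 us, which this hunk does not show): global_rt_period() returns 1,000,000,000 ns and global_rt_runtime() returns 950,000,000 ns, i.e. realtime tasks are by default throttled to 95% of each one-second period. Writing -1 to the runtime sysctl makes global_rt_runtime() return RUNTIME_INF and disables the throttling.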
671
672
673
674static inline int task_current(struct rq *rq, struct task_struct *p)
675{
676 return rq->curr == p;
677}
678
679static inline int task_running(struct rq *rq, struct task_struct *p)
680{
681#ifdef CONFIG_SMP
682 return p->on_cpu;
683#else
684 return task_current(rq, p);
685#endif
686}
687
688
689#ifndef prepare_arch_switch
690# define prepare_arch_switch(next) do { } while (0)
691#endif
692#ifndef finish_arch_switch
693# define finish_arch_switch(prev) do { } while (0)
694#endif
695
696#ifndef __ARCH_WANT_UNLOCKED_CTXSW
697static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
698{
699#ifdef CONFIG_SMP
700 /*
701 * We can optimise this out completely for !SMP, because the
702 * SMP rebalancing from interrupt is the only thing that cares
703 * here.
704 */
705 next->on_cpu = 1;
706#endif
707}
708
709static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
710{
711#ifdef CONFIG_SMP
712 /*
713 * After ->on_cpu is cleared, the task can be moved to a different CPU.
714 * We must ensure this doesn't happen until the switch is completely
715 * finished.
716 */
717 smp_wmb();
718 prev->on_cpu = 0;
719#endif
720#ifdef CONFIG_DEBUG_SPINLOCK
721 /* this is a valid case when another task releases the spinlock */
722 rq->lock.owner = current;
723#endif
724 /*
725 * If we are tracking spinlock dependencies then we have to
726 * fix up the runqueue lock - which gets 'carried over' from
727 * prev into current:
728 */
729 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
730
731 raw_spin_unlock_irq(&rq->lock);
732}
733
734#else /* __ARCH_WANT_UNLOCKED_CTXSW */
735static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
736{
737#ifdef CONFIG_SMP
738 /*
739 * We can optimise this out completely for !SMP, because the
740 * SMP rebalancing from interrupt is the only thing that cares
741 * here.
742 */
743 next->on_cpu = 1;
744#endif
745#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
746 raw_spin_unlock_irq(&rq->lock);
747#else
748 raw_spin_unlock(&rq->lock);
749#endif
750}
751
752static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
753{
754#ifdef CONFIG_SMP
755 /*
756 * After ->on_cpu is cleared, the task can be moved to a different CPU.
757 * We must ensure this doesn't happen until the switch is completely
758 * finished.
759 */
760 smp_wmb();
761 prev->on_cpu = 0;
762#endif
763#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
764 local_irq_enable();
765#endif
766}
767#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
768
769
770static inline void update_load_add(struct load_weight *lw, unsigned long inc)
771{
772 lw->weight += inc;
773 lw->inv_weight = 0;
774}
775
776static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
777{
778 lw->weight -= dec;
779 lw->inv_weight = 0;
780}
781
782static inline void update_load_set(struct load_weight *lw, unsigned long w)
783{
784 lw->weight = w;
785 lw->inv_weight = 0;
786}
787
788/*
789 * To avoid subverting "niceness" when tasks with unusual "nice" values
790 * are unevenly distributed across CPUs, the contribution that each task
791 * makes to its run queue's load is weighted according to its scheduling
792 * class and "nice" value. For SCHED_NORMAL tasks this is just a scaled
793 * version of the new time slice allocation that they receive on time
794 * slice expiry, etc.
795 */
796
797#define WEIGHT_IDLEPRIO 3
798#define WMULT_IDLEPRIO 1431655765
799
800/*
801 * Nice levels are multiplicative, with a gentle 10% change for every
802 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
803 * nice 1, it will get ~10% less CPU time than another CPU-bound task
804 * that remained on nice 0.
805 *
806 * The "10% effect" is relative and cumulative: from _any_ nice level,
807 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
808 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
809 * If a task goes up by ~10% and another task goes down by ~10% then
810 * the relative distance between them is ~25%.)
811 */
812static const int prio_to_weight[40] = {
813 /* -20 */ 88761, 71755, 56483, 46273, 36291,
814 /* -15 */ 29154, 23254, 18705, 14949, 11916,
815 /* -10 */ 9548, 7620, 6100, 4904, 3906,
816 /* -5 */ 3121, 2501, 1991, 1586, 1277,
817 /* 0 */ 1024, 820, 655, 526, 423,
818 /* 5 */ 335, 272, 215, 172, 137,
819 /* 10 */ 110, 87, 70, 56, 45,
820 /* 15 */ 36, 29, 23, 18, 15,
821};
822
823/*
824 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
825 *
826 * In cases where the weight does not change often, we can use the
827 * precalculated inverse to speed up arithmetic by turning divisions
828 * into multiplications:
829 */
830static const u32 prio_to_wmult[40] = {
831 /* -20 */ 48388, 59856, 76040, 92818, 118348,
832 /* -15 */ 147320, 184698, 229616, 287308, 360437,
833 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
834 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
835 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
836 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
837 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
838 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
839};
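
A quick, standalone sanity check of the two tables (userspace C, not kernel code): adjacent nice levels differ by the ~1.25x weight ratio described above, and prio_to_wmult[] holds 2^32/weight so that a division by the weight can be replaced by a multiply and a 32-bit shift.

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		const unsigned long w_nice0 = 1024, w_nice1 = 820;	/* prio_to_weight[20], [21] */
		const uint32_t inv_nice1 = 5237765;			/* prio_to_wmult[21] ~= 2^32/820 */
		uint64_t delta = 3000000;				/* some delta_exec, in ns */

		/* one nice level is ~1.25x in weight: 1024/820 ~= 1.249 */
		printf("nice0/nice1 weight ratio = %.3f\n", (double)w_nice0 / w_nice1);

		/* delta/weight computed the slow way and via the precalculated inverse */
		printf("div = %llu, mul+shift = %llu\n",
		       (unsigned long long)(delta / w_nice1),
		       (unsigned long long)((delta * inv_nice1) >> 32));
		return 0;
	}

Both prints agree (3658 here), which is essentially the trick the weight/inverse-weight scaling in the fair class relies on.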
840
841/* Time spent by the tasks of the cpu accounting group executing in ... */
842enum cpuacct_stat_index {
843 CPUACCT_STAT_USER, /* ... user mode */
844 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
845
846 CPUACCT_STAT_NSTATS,
847};
848
849
850#define sched_class_highest (&stop_sched_class)
851#define for_each_class(class) \
852 for (class = sched_class_highest; class; class = class->next)
853
854extern const struct sched_class stop_sched_class;
855extern const struct sched_class rt_sched_class;
856extern const struct sched_class fair_sched_class;
857extern const struct sched_class idle_sched_class;
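
For orientation (inferred from the class definitions, e.g. the stop_task.c hunk below where stop_sched_class's .next points at rt_sched_class): the classes form a singly linked chain stop -> rt -> fair -> idle, terminated by a NULL .next in the idle class, so for_each_class() above visits them in strictly descending priority order and sched_class_highest is simply the head of that chain.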
858
859
860#ifdef CONFIG_SMP
861
862extern void trigger_load_balance(struct rq *rq, int cpu);
863extern void idle_balance(int this_cpu, struct rq *this_rq);
864
865#else /* CONFIG_SMP */
866
867static inline void idle_balance(int cpu, struct rq *rq)
868{
869}
870
871#endif
872
873extern void sysrq_sched_debug_show(void);
874extern void sched_init_granularity(void);
875extern void update_max_interval(void);
876extern void update_group_power(struct sched_domain *sd, int cpu);
877extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
878extern void init_sched_rt_class(void);
879extern void init_sched_fair_class(void);
880
881extern void resched_task(struct task_struct *p);
882extern void resched_cpu(int cpu);
883
884extern struct rt_bandwidth def_rt_bandwidth;
885extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
886
887extern void update_cpu_load(struct rq *this_rq);
888
889#ifdef CONFIG_CGROUP_CPUACCT
890#include <linux/cgroup.h>
891/* track cpu usage of a group of tasks and its child groups */
892struct cpuacct {
893 struct cgroup_subsys_state css;
894 /* cpuusage holds pointer to a u64-type object on every cpu */
895 u64 __percpu *cpuusage;
896 struct kernel_cpustat __percpu *cpustat;
897};
898
899/* return cpu accounting group corresponding to this container */
900static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
901{
902 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
903 struct cpuacct, css);
904}
905
906/* return cpu accounting group to which this task belongs */
907static inline struct cpuacct *task_ca(struct task_struct *tsk)
908{
909 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
910 struct cpuacct, css);
911}
912
913static inline struct cpuacct *parent_ca(struct cpuacct *ca)
914{
915 if (!ca || !ca->css.cgroup->parent)
916 return NULL;
917 return cgroup_ca(ca->css.cgroup->parent);
918}
919
920extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
921#else
922static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
923#endif
924
925static inline void inc_nr_running(struct rq *rq)
926{
927 rq->nr_running++;
928}
929
930static inline void dec_nr_running(struct rq *rq)
931{
932 rq->nr_running--;
933}
934
935extern void update_rq_clock(struct rq *rq);
936
937extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
938extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
939
940extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
941
942extern const_debug unsigned int sysctl_sched_time_avg;
943extern const_debug unsigned int sysctl_sched_nr_migrate;
944extern const_debug unsigned int sysctl_sched_migration_cost;
945
946static inline u64 sched_avg_period(void)
947{
948 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
949}
950
951void calc_load_account_idle(struct rq *this_rq);
952
953#ifdef CONFIG_SCHED_HRTICK
954
955/*
956 * Use hrtick when:
957 * - enabled by features
958 * - hrtimer is actually high res
959 */
960static inline int hrtick_enabled(struct rq *rq)
961{
962 if (!sched_feat(HRTICK))
963 return 0;
964 if (!cpu_active(cpu_of(rq)))
965 return 0;
966 return hrtimer_is_hres_active(&rq->hrtick_timer);
967}
968
969void hrtick_start(struct rq *rq, u64 delay);
970
971#else
972
973static inline int hrtick_enabled(struct rq *rq)
974{
975 return 0;
976}
977
978#endif /* CONFIG_SCHED_HRTICK */
979
980#ifdef CONFIG_SMP
981extern void sched_avg_update(struct rq *rq);
982static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
983{
984 rq->rt_avg += rt_delta;
985 sched_avg_update(rq);
986}
987#else
988static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
989static inline void sched_avg_update(struct rq *rq) { }
990#endif
991
992extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
993
994#ifdef CONFIG_SMP
995#ifdef CONFIG_PREEMPT
996
997static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
998
999/*
1000 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1001 * way at the expense of forcing extra atomic operations in all
1002 * invocations. This assures that the double_lock is acquired using the
1003 * same underlying policy as the spinlock_t on this architecture, which
1004 * reduces latency compared to the unfair variant below. However, it
1005 * also adds more overhead and therefore may reduce throughput.
1006 */
1007static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1008 __releases(this_rq->lock)
1009 __acquires(busiest->lock)
1010 __acquires(this_rq->lock)
1011{
1012 raw_spin_unlock(&this_rq->lock);
1013 double_rq_lock(this_rq, busiest);
1014
1015 return 1;
1016}
1017
1018#else
1019/*
1020 * Unfair double_lock_balance: Optimizes throughput at the expense of
1021 * latency by eliminating extra atomic operations when the locks are
1022 * already in proper order on entry. This favors lower cpu-ids and will
1023 * grant the double lock to lower cpus over higher ids under contention,
1024 * regardless of entry order into the function.
1025 */
1026static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1027 __releases(this_rq->lock)
1028 __acquires(busiest->lock)
1029 __acquires(this_rq->lock)
1030{
1031 int ret = 0;
1032
1033 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1034 if (busiest < this_rq) {
1035 raw_spin_unlock(&this_rq->lock);
1036 raw_spin_lock(&busiest->lock);
1037 raw_spin_lock_nested(&this_rq->lock,
1038 SINGLE_DEPTH_NESTING);
1039 ret = 1;
1040 } else
1041 raw_spin_lock_nested(&busiest->lock,
1042 SINGLE_DEPTH_NESTING);
1043 }
1044 return ret;
1045}
1046
1047#endif /* CONFIG_PREEMPT */
1048
1049/*
1050 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1051 */
1052static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1053{
1054 if (unlikely(!irqs_disabled())) {
1055 /* printk() doesn't work well under rq->lock */
1056 raw_spin_unlock(&this_rq->lock);
1057 BUG_ON(1);
1058 }
1059
1060 return _double_lock_balance(this_rq, busiest);
1061}
1062
1063static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1064 __releases(busiest->lock)
1065{
1066 raw_spin_unlock(&busiest->lock);
1067 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1068}
1069
1070/*
1071 * double_rq_lock - safely lock two runqueues
1072 *
1073 * Note this does not disable interrupts like task_rq_lock;
1074 * you need to do so manually before calling.
1075 */
1076static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1077 __acquires(rq1->lock)
1078 __acquires(rq2->lock)
1079{
1080 BUG_ON(!irqs_disabled());
1081 if (rq1 == rq2) {
1082 raw_spin_lock(&rq1->lock);
1083 __acquire(rq2->lock); /* Fake it out ;) */
1084 } else {
1085 if (rq1 < rq2) {
1086 raw_spin_lock(&rq1->lock);
1087 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1088 } else {
1089 raw_spin_lock(&rq2->lock);
1090 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1091 }
1092 }
1093}
1094
1095/*
1096 * double_rq_unlock - safely unlock two runqueues
1097 *
1098 * Note this does not restore interrupts like task_rq_unlock;
1099 * you need to do so manually after calling.
1100 */
1101static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1102 __releases(rq1->lock)
1103 __releases(rq2->lock)
1104{
1105 raw_spin_unlock(&rq1->lock);
1106 if (rq1 != rq2)
1107 raw_spin_unlock(&rq2->lock);
1108 else
1109 __release(rq2->lock);
1110}
1111
1112#else /* CONFIG_SMP */
1113
1114/*
1115 * double_rq_lock - safely lock two runqueues
1116 *
1117 * Note this does not disable interrupts like task_rq_lock;
1118 * you need to do so manually before calling.
1119 */
1120static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1121 __acquires(rq1->lock)
1122 __acquires(rq2->lock)
1123{
1124 BUG_ON(!irqs_disabled());
1125 BUG_ON(rq1 != rq2);
1126 raw_spin_lock(&rq1->lock);
1127 __acquire(rq2->lock); /* Fake it out ;) */
1128}
1129
1130/*
1131 * double_rq_unlock - safely unlock two runqueues
1132 *
1133 * Note this does not restore interrupts like task_rq_unlock;
1134 * you need to do so manually after calling.
1135 */
1136static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1137 __releases(rq1->lock)
1138 __releases(rq2->lock)
1139{
1140 BUG_ON(rq1 != rq2);
1141 raw_spin_unlock(&rq1->lock);
1142 __release(rq2->lock);
1143}
1144
1145#endif
1146
1147extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1148extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1149extern void print_cfs_stats(struct seq_file *m, int cpu);
1150extern void print_rt_stats(struct seq_file *m, int cpu);
1151
1152extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1153extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1154extern void unthrottle_offline_cfs_rqs(struct rq *rq);
1155
1156extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1157
1158#ifdef CONFIG_NO_HZ
1159enum rq_nohz_flag_bits {
1160 NOHZ_TICK_STOPPED,
1161 NOHZ_BALANCE_KICK,
1162 NOHZ_IDLE,
1163};
1164
1165#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1166#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
new file mode 100644
index 000000000000..2a581ba8e190
--- /dev/null
+++ b/kernel/sched/stats.c
@@ -0,0 +1,111 @@
1
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/seq_file.h>
5#include <linux/proc_fs.h>
6
7#include "sched.h"
8
9/*
10 * bump this up when changing the output format or the meaning of an existing
11 * format, so that tools can adapt (or abort)
12 */
13#define SCHEDSTAT_VERSION 15
14
15static int show_schedstat(struct seq_file *seq, void *v)
16{
17 int cpu;
18 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
19 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
20
21 if (mask_str == NULL)
22 return -ENOMEM;
23
24 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
25 seq_printf(seq, "timestamp %lu\n", jiffies);
26 for_each_online_cpu(cpu) {
27 struct rq *rq = cpu_rq(cpu);
28#ifdef CONFIG_SMP
29 struct sched_domain *sd;
30 int dcount = 0;
31#endif
32
33 /* runqueue-specific stats */
34 seq_printf(seq,
35 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
36 cpu, rq->yld_count,
37 rq->sched_switch, rq->sched_count, rq->sched_goidle,
38 rq->ttwu_count, rq->ttwu_local,
39 rq->rq_cpu_time,
40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
41
42 seq_printf(seq, "\n");
43
44#ifdef CONFIG_SMP
45 /* domain-specific stats */
46 rcu_read_lock();
47 for_each_domain(cpu, sd) {
48 enum cpu_idle_type itype;
49
50 cpumask_scnprintf(mask_str, mask_len,
51 sched_domain_span(sd));
52 seq_printf(seq, "domain%d %s", dcount++, mask_str);
53 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
54 itype++) {
55 seq_printf(seq, " %u %u %u %u %u %u %u %u",
56 sd->lb_count[itype],
57 sd->lb_balanced[itype],
58 sd->lb_failed[itype],
59 sd->lb_imbalance[itype],
60 sd->lb_gained[itype],
61 sd->lb_hot_gained[itype],
62 sd->lb_nobusyq[itype],
63 sd->lb_nobusyg[itype]);
64 }
65 seq_printf(seq,
66 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
67 sd->alb_count, sd->alb_failed, sd->alb_pushed,
68 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
69 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
70 sd->ttwu_wake_remote, sd->ttwu_move_affine,
71 sd->ttwu_move_balance);
72 }
73 rcu_read_unlock();
74#endif
75 }
76 kfree(mask_str);
77 return 0;
78}
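
For orientation, the output this produces looks roughly like the following (illustrative, made-up numbers): a version line, a timestamp line, and then one "cpuN" line per online CPU carrying the nine runqueue counters in the order printed above.

	version 15
	timestamp 4294937296
	cpu0 0 0 122045 11892 59234 30107 812340921 60761554 91342
	cpu1 0 0 118771 10417 57990 29488 799412307 58120993 88609

On SMP, each cpu line is followed by one "domainN" line per sched_domain, carrying the domain's cpumask, eight load-balance counters for each of the three cpu_idle_type states, and then the twelve active-balance, select, fork, exec and wakeup counters.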
79
80static int schedstat_open(struct inode *inode, struct file *file)
81{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86
87 if (!buf)
88 return -ENOMEM;
89 res = single_open(file, show_schedstat, NULL);
90 if (!res) {
91 m = file->private_data;
92 m->buf = buf;
93 m->size = size;
94 } else
95 kfree(buf);
96 return res;
97}
98
99static const struct file_operations proc_schedstat_operations = {
100 .open = schedstat_open,
101 .read = seq_read,
102 .llseek = seq_lseek,
103 .release = single_release,
104};
105
106static int __init proc_schedstat_init(void)
107{
108 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
109 return 0;
110}
111module_init(proc_schedstat_init);
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h
index 87f9e36ea56e..2ef90a51ec5e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched/stats.h
@@ -1,108 +1,5 @@
1 1
2#ifdef CONFIG_SCHEDSTATS 2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 15
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14
15 if (mask_str == NULL)
16 return -ENOMEM;
17
18 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
19 seq_printf(seq, "timestamp %lu\n", jiffies);
20 for_each_online_cpu(cpu) {
21 struct rq *rq = cpu_rq(cpu);
22#ifdef CONFIG_SMP
23 struct sched_domain *sd;
24 int dcount = 0;
25#endif
26
27 /* runqueue-specific stats */
28 seq_printf(seq,
29 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
30 cpu, rq->yld_count,
31 rq->sched_switch, rq->sched_count, rq->sched_goidle,
32 rq->ttwu_count, rq->ttwu_local,
33 rq->rq_cpu_time,
34 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
35
36 seq_printf(seq, "\n");
37
38#ifdef CONFIG_SMP
39 /* domain-specific stats */
40 rcu_read_lock();
41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype;
43
44 cpumask_scnprintf(mask_str, mask_len,
45 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) {
49 seq_printf(seq, " %u %u %u %u %u %u %u %u",
50 sd->lb_count[itype],
51 sd->lb_balanced[itype],
52 sd->lb_failed[itype],
53 sd->lb_imbalance[itype],
54 sd->lb_gained[itype],
55 sd->lb_hot_gained[itype],
56 sd->lb_nobusyq[itype],
57 sd->lb_nobusyg[itype]);
58 }
59 seq_printf(seq,
60 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
61 sd->alb_count, sd->alb_failed, sd->alb_pushed,
62 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
63 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance);
66 }
67 rcu_read_unlock();
68#endif
69 }
70 kfree(mask_str);
71 return 0;
72}
73
74static int schedstat_open(struct inode *inode, struct file *file)
75{
76 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
77 char *buf = kmalloc(size, GFP_KERNEL);
78 struct seq_file *m;
79 int res;
80
81 if (!buf)
82 return -ENOMEM;
83 res = single_open(file, show_schedstat, NULL);
84 if (!res) {
85 m = file->private_data;
86 m->buf = buf;
87 m->size = size;
88 } else
89 kfree(buf);
90 return res;
91}
92
93static const struct file_operations proc_schedstat_operations = {
94 .open = schedstat_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
100static int __init proc_schedstat_init(void)
101{
102 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
103 return 0;
104}
105module_init(proc_schedstat_init);
106 3
107/* 4/*
108 * Expects runqueue lock to be held for atomicity of update 5 * Expects runqueue lock to be held for atomicity of update
@@ -283,8 +180,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
283 return; 180 return;
284 181
285 raw_spin_lock(&cputimer->lock); 182 raw_spin_lock(&cputimer->lock);
286 cputimer->cputime.utime = 183 cputimer->cputime.utime += cputime;
287 cputime_add(cputimer->cputime.utime, cputime);
288 raw_spin_unlock(&cputimer->lock); 184 raw_spin_unlock(&cputimer->lock);
289} 185}
290 186
@@ -307,8 +203,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
307 return; 203 return;
308 204
309 raw_spin_lock(&cputimer->lock); 205 raw_spin_lock(&cputimer->lock);
310 cputimer->cputime.stime = 206 cputimer->cputime.stime += cputime;
311 cputime_add(cputimer->cputime.stime, cputime);
312 raw_spin_unlock(&cputimer->lock); 207 raw_spin_unlock(&cputimer->lock);
313} 208}
314 209
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c
index 8b44e7fa7fb3..7b386e86fd23 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched/stop_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * stop-task scheduling class. 4 * stop-task scheduling class.
3 * 5 *
@@ -80,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
80/* 82/*
81 * Simple, special scheduling class for the per-CPU stop tasks: 83 * Simple, special scheduling class for the per-CPU stop tasks:
82 */ 84 */
83static const struct sched_class stop_sched_class = { 85const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class, 86 .next = &rt_sched_class,
85 87
86 .enqueue_task = enqueue_task_stop, 88 .enqueue_task = enqueue_task_stop,
diff --git a/kernel/signal.c b/kernel/signal.c
index 206551563cce..56ce3a618b28 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1629,10 +1629,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1629 info.si_uid = __task_cred(tsk)->uid; 1629 info.si_uid = __task_cred(tsk)->uid;
1630 rcu_read_unlock(); 1630 rcu_read_unlock();
1631 1631
1632 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, 1632 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
1633 tsk->signal->utime)); 1633 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
1634 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1635 tsk->signal->stime));
1636 1634
1637 info.si_status = tsk->exit_code & 0x7f; 1635 info.si_status = tsk->exit_code & 0x7f;
1638 if (tsk->exit_code & 0x80) 1636 if (tsk->exit_code & 0x80)
diff --git a/kernel/sys.c b/kernel/sys.c
index 481611fbd079..ddf8155bf3f8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1605,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1605 unsigned long maxrss = 0; 1605 unsigned long maxrss = 0;
1606 1606
1607 memset((char *) r, 0, sizeof *r); 1607 memset((char *) r, 0, sizeof *r);
1608 utime = stime = cputime_zero; 1608 utime = stime = 0;
1609 1609
1610 if (who == RUSAGE_THREAD) { 1610 if (who == RUSAGE_THREAD) {
1611 task_times(current, &utime, &stime); 1611 task_times(current, &utime, &stime);
@@ -1635,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1635 1635
1636 case RUSAGE_SELF: 1636 case RUSAGE_SELF:
1637 thread_group_times(p, &tgutime, &tgstime); 1637 thread_group_times(p, &tgutime, &tgstime);
1638 utime = cputime_add(utime, tgutime); 1638 utime += tgutime;
1639 stime = cputime_add(stime, tgstime); 1639 stime += tgstime;
1640 r->ru_nvcsw += p->signal->nvcsw; 1640 r->ru_nvcsw += p->signal->nvcsw;
1641 r->ru_nivcsw += p->signal->nivcsw; 1641 r->ru_nivcsw += p->signal->nivcsw;
1642 r->ru_minflt += p->signal->min_flt; 1642 r->ru_minflt += p->signal->min_flt;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 0ec8b832ab6b..7656642e4b8e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -466,6 +466,14 @@ void tick_nohz_idle_enter(void)
466 466
467 WARN_ON_ONCE(irqs_disabled()); 467 WARN_ON_ONCE(irqs_disabled());
468 468
469 /*
470 * Update the idle state in the scheduler domain hierarchy
471 * when tick_nohz_stop_sched_tick() is called from the idle loop.
472 * State will be updated to busy during the first busy tick after
473 * exiting idle.
474 */
475 set_cpu_sd_state_idle();
476
469 local_irq_disable(); 477 local_irq_disable();
470 478
471 ts = &__get_cpu_var(tick_cpu_sched); 479 ts = &__get_cpu_var(tick_cpu_sched);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 5bbfac85866e..23b4d784ebdd 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk)
127 127
128 local_irq_save(flags); 128 local_irq_save(flags);
129 time = tsk->stime + tsk->utime; 129 time = tsk->stime + tsk->utime;
130 dtime = cputime_sub(time, tsk->acct_timexpd); 130 dtime = time - tsk->acct_timexpd;
131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
132 delta = value.tv_sec; 132 delta = value.tv_sec;
133 delta = delta * USEC_PER_SEC + value.tv_usec; 133 delta = delta * USEC_PER_SEC + value.tv_usec;