-rw-r--r--  Documentation/scheduler/sched-arch.txt | 10
-rw-r--r--  arch/Kconfig | 9
-rw-r--r--  arch/ia64/Kconfig | 12
-rw-r--r--  arch/ia64/include/asm/switch_to.h | 8
-rw-r--r--  arch/ia64/kernel/time.c | 66
-rw-r--r--  arch/powerpc/include/asm/time.h | 6
-rw-r--r--  arch/powerpc/kernel/process.c | 3
-rw-r--r--  arch/powerpc/kernel/time.c | 55
-rw-r--r--  arch/powerpc/platforms/Kconfig.cputype | 16
-rw-r--r--  arch/s390/Kconfig | 5
-rw-r--r--  arch/s390/include/asm/cputime.h | 3
-rw-r--r--  arch/s390/include/asm/switch_to.h | 4
-rw-r--r--  arch/s390/kernel/vtime.c | 8
-rw-r--r--  arch/tile/include/asm/topology.h | 1
-rw-r--r--  arch/x86/Kconfig | 12
-rw-r--r--  include/linux/hardirq.h | 8
-rw-r--r--  include/linux/kernel_stat.h | 8
-rw-r--r--  include/linux/kvm_host.h | 4
-rw-r--r--  include/linux/sched.h | 10
-rw-r--r--  include/linux/topology.h | 2
-rw-r--r--  init/Kconfig | 157
-rw-r--r--  kernel/fork.c | 4
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/core.c | 675
-rw-r--r--  kernel/sched/cputime.c | 530
-rw-r--r--  kernel/sched/fair.c | 81
-rw-r--r--  kernel/sched/features.h | 10
-rw-r--r--  kernel/sched/rt.c | 5
-rw-r--r--  kernel/sched/sched.h | 69
-rw-r--r--  kernel/softirq.c | 6
-rw-r--r--  kernel/sysctl.c | 6
-rw-r--r--  kernel/time/tick-sched.c | 3
32 files changed, 892 insertions, 906 deletions
diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt
index 28aa1075e291..b1b8587b86f0 100644
--- a/Documentation/scheduler/sched-arch.txt
+++ b/Documentation/scheduler/sched-arch.txt
@@ -17,16 +17,6 @@ you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file
 Unlocked context switches introduce only a very minor performance
 penalty to the core scheduler implementation in the CONFIG_SMP case.
 
-2. Interrupt status
-By default, the switch_to arch function is called with interrupts
-disabled. Interrupts may be enabled over the call if it is likely to
-introduce a significant interrupt latency by adding the line
-`#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for
-unlocked context switches. This define also implies
-`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an
-example.
-
-
 CPU idle
 ========
 Your cpu_idle routines need to obey the following rules:
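
The opt-in mentioned in the retained text above is a plain preprocessor define; as a minimal sketch, an architecture that wants unlocked context switches adds the line to one of its own headers (the path below is illustrative only):

    /* arch/foo/include/asm/switch_to.h -- illustrative location, not a real file */
    #define __ARCH_WANT_UNLOCKED_CTXSW	/* run context switches without rq->lock held */
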
diff --git a/arch/Kconfig b/arch/Kconfig
index 1a7b468abf4a..a62965d057f6 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -304,4 +304,13 @@ config HAVE_RCU_USER_QS
	  are already protected inside rcu_irq_enter/rcu_irq_exit() but
	  preemption or signal handling on irq exit still need to be protected.
 
+config HAVE_VIRT_CPU_ACCOUNTING
+	bool
+
+config HAVE_IRQ_TIME_ACCOUNTING
+	bool
+	help
+	  Archs need to ensure they use a high enough resolution clock to
+	  support irq time accounting and then call enable_sched_clock_irqtime().
+
 source "kernel/gcov/Kconfig"
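
The HAVE_IRQ_TIME_ACCOUNTING help text above leaves the runtime side implicit; a hedged sketch of what an architecture is expected to do once it has verified its clock is fine grained enough, loosely modelled on the x86 TSC setup path (the probe helper is hypothetical):

    /* Sketch only: enable IRQ time accounting once the clock is known to be
     * high resolution and stable; example_clock_is_stable() is a hypothetical
     * arch-specific check, not an existing kernel helper. */
    static void __init example_init_irqtime_accounting(void)
    {
    	if (example_clock_is_stable())
    		enable_sched_clock_irqtime();
    }
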
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 310cf5781fad..3c720ef6c32d 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -25,6 +25,7 @@ config IA64
 	select HAVE_GENERIC_HARDIRQS
 	select HAVE_MEMBLOCK
 	select HAVE_MEMBLOCK_NODE_MAP
+	select HAVE_VIRT_CPU_ACCOUNTING
 	select ARCH_DISCARD_MEMBLOCK
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
@@ -340,17 +341,6 @@ config FORCE_MAX_ZONEORDER
 	default "17" if HUGETLB_PAGE
 	default "11"
 
-config VIRT_CPU_ACCOUNTING
-	bool "Deterministic task and CPU time accounting"
-	default n
-	help
-	  Select this option to enable more accurate task and CPU time
-	  accounting. This is done by reading a CPU counter on each
-	  kernel entry and exit and on transitions within the kernel
-	  between system, softirq and hardirq state, so there is a
-	  small performance impact.
-	  If in doubt, say N here.
-
 config SMP
 	bool "Symmetric multi-processing support"
 	select USE_GENERIC_SMP_HELPERS
diff --git a/arch/ia64/include/asm/switch_to.h b/arch/ia64/include/asm/switch_to.h
index cb2412fcd17f..d38c7ea5eea5 100644
--- a/arch/ia64/include/asm/switch_to.h
+++ b/arch/ia64/include/asm/switch_to.h
@@ -30,13 +30,6 @@ extern struct task_struct *ia64_switch_to (void *next_task);
 extern void ia64_save_extra (struct task_struct *task);
 extern void ia64_load_extra (struct task_struct *task);
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct *next);
-# define IA64_ACCOUNT_ON_SWITCH(p,n) ia64_account_on_switch(p,n)
-#else
-# define IA64_ACCOUNT_ON_SWITCH(p,n)
-#endif
-
 #ifdef CONFIG_PERFMON
   DECLARE_PER_CPU(unsigned long, pfm_syst_info);
 # define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1)
@@ -49,7 +42,6 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct
 			 || PERFMON_IS_SYSWIDE())
 
 #define __switch_to(prev,next,last)	do {			\
-	IA64_ACCOUNT_ON_SWITCH(prev, next);			\
 	if (IA64_HAS_EXTRA_STATE(prev))				\
 		ia64_save_extra(prev);				\
 	if (IA64_HAS_EXTRA_STATE(next))				\
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index ecc904b33c5f..80ff9acc5edf 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -83,32 +83,36 @@ static struct clocksource *itc_clocksource;
 
 extern cputime_t cycle_to_cputime(u64 cyc);
 
+static void vtime_account_user(struct task_struct *tsk)
+{
+	cputime_t delta_utime;
+	struct thread_info *ti = task_thread_info(tsk);
+
+	if (ti->ac_utime) {
+		delta_utime = cycle_to_cputime(ti->ac_utime);
+		account_user_time(tsk, delta_utime, delta_utime);
+		ti->ac_utime = 0;
+	}
+}
+
 /*
  * Called from the context switch with interrupts disabled, to charge all
  * accumulated times to the current process, and to prepare accounting on
  * the next process.
  */
-void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next)
+void vtime_task_switch(struct task_struct *prev)
 {
 	struct thread_info *pi = task_thread_info(prev);
-	struct thread_info *ni = task_thread_info(next);
-	cputime_t delta_stime, delta_utime;
-	__u64 now;
+	struct thread_info *ni = task_thread_info(current);
 
-	now = ia64_get_itc();
-
-	delta_stime = cycle_to_cputime(pi->ac_stime + (now - pi->ac_stamp));
 	if (idle_task(smp_processor_id()) != prev)
-		account_system_time(prev, 0, delta_stime, delta_stime);
+		vtime_account_system(prev);
 	else
-		account_idle_time(delta_stime);
+		vtime_account_idle(prev);
 
-	if (pi->ac_utime) {
-		delta_utime = cycle_to_cputime(pi->ac_utime);
-		account_user_time(prev, delta_utime, delta_utime);
-	}
+	vtime_account_user(prev);
 
-	pi->ac_stamp = ni->ac_stamp = now;
+	pi->ac_stamp = ni->ac_stamp;
 	ni->ac_stime = ni->ac_utime = 0;
 }
 
@@ -116,29 +120,32 @@ void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next)
  * Account time for a transition between system, hard irq or soft irq state.
  * Note that this function is called with interrupts enabled.
  */
-void account_system_vtime(struct task_struct *tsk)
+static cputime_t vtime_delta(struct task_struct *tsk)
 {
 	struct thread_info *ti = task_thread_info(tsk);
-	unsigned long flags;
 	cputime_t delta_stime;
 	__u64 now;
 
-	local_irq_save(flags);
-
 	now = ia64_get_itc();
 
 	delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp));
-	if (irq_count() || idle_task(smp_processor_id()) != tsk)
-		account_system_time(tsk, 0, delta_stime, delta_stime);
-	else
-		account_idle_time(delta_stime);
 	ti->ac_stime = 0;
-
 	ti->ac_stamp = now;
 
-	local_irq_restore(flags);
+	return delta_stime;
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+	cputime_t delta = vtime_delta(tsk);
+
+	account_system_time(tsk, 0, delta, delta);
+}
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+	account_idle_time(vtime_delta(tsk));
 }
-EXPORT_SYMBOL_GPL(account_system_vtime);
 
 /*
  * Called from the timer interrupt handler to charge accumulated user time
@@ -146,14 +153,7 @@ EXPORT_SYMBOL_GPL(account_system_vtime);
  */
 void account_process_tick(struct task_struct *p, int user_tick)
 {
-	struct thread_info *ti = task_thread_info(p);
-	cputime_t delta_utime;
-
-	if (ti->ac_utime) {
-		delta_utime = cycle_to_cputime(ti->ac_utime);
-		account_user_time(p, delta_utime, delta_utime);
-		ti->ac_utime = 0;
-	}
+	vtime_account_user(p);
 }
 
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
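
The irq_count()/idle_task() checks and the local_irq_save()/restore() pair dropped from the ia64 code above do not simply disappear: they move into a generic vtime_account() wrapper in the new kernel/sched/cputime.c (listed in the diffstat but not excerpted here). A hedged sketch of its shape, reconstructed from the hunks rather than quoted from that file:

    /* Sketch of the generic dispatcher; architectures that keep their own
     * variant (e.g. s390) opt out via __ARCH_HAS_VTIME_ACCOUNT. */
    #ifndef __ARCH_HAS_VTIME_ACCOUNT
    void vtime_account(struct task_struct *tsk)
    {
    	unsigned long flags;
    
    	local_irq_save(flags);
    
    	if (in_interrupt() || !is_idle_task(tsk))
    		vtime_account_system(tsk);
    	else
    		vtime_account_idle(tsk);
    
    	local_irq_restore(flags);
    }
    EXPORT_SYMBOL_GPL(vtime_account);
    #endif /* __ARCH_HAS_VTIME_ACCOUNT */
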
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 3b4b4a8da922..c1f267694acb 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -197,12 +197,6 @@ struct cpu_usage {
 
 DECLARE_PER_CPU(struct cpu_usage, cpu_usage_array);
 
-#if defined(CONFIG_VIRT_CPU_ACCOUNTING)
-#define account_process_vtime(tsk)	account_process_tick(tsk, 0)
-#else
-#define account_process_vtime(tsk)	do { } while (0)
-#endif
-
 extern void secondary_cpu_time_init(void);
 
 DECLARE_PER_CPU(u64, decrementers_next_tb);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1a1f2ddfb581..e9cb51f5f801 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -514,9 +514,6 @@ struct task_struct *__switch_to(struct task_struct *prev,
 
 	local_irq_save(flags);
 
-	account_system_vtime(current);
-	account_process_vtime(current);
-
 	/*
 	 * We can't take a PMU exception inside _switch() since there is a
 	 * window where the kernel stack SLB and the kernel stack are out
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index e49e93191b69..eaa9d0e6abca 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -291,13 +291,12 @@ static inline u64 calculate_stolen_time(u64 stop_tb)
  * Account time for a transition between system, hard irq
  * or soft irq state.
  */
-void account_system_vtime(struct task_struct *tsk)
+static u64 vtime_delta(struct task_struct *tsk,
+			u64 *sys_scaled, u64 *stolen)
 {
-	u64 now, nowscaled, delta, deltascaled;
-	unsigned long flags;
-	u64 stolen, udelta, sys_scaled, user_scaled;
+	u64 now, nowscaled, deltascaled;
+	u64 udelta, delta, user_scaled;
 
-	local_irq_save(flags);
 	now = mftb();
 	nowscaled = read_spurr(now);
 	get_paca()->system_time += now - get_paca()->starttime;
@@ -305,7 +304,7 @@ void account_system_vtime(struct task_struct *tsk)
 	deltascaled = nowscaled - get_paca()->startspurr;
 	get_paca()->startspurr = nowscaled;
 
-	stolen = calculate_stolen_time(now);
+	*stolen = calculate_stolen_time(now);
 
 	delta = get_paca()->system_time;
 	get_paca()->system_time = 0;
@@ -322,35 +321,45 @@ void account_system_vtime(struct task_struct *tsk)
 	 * the user ticks get saved up in paca->user_time_scaled to be
 	 * used by account_process_tick.
 	 */
-	sys_scaled = delta;
+	*sys_scaled = delta;
 	user_scaled = udelta;
 	if (deltascaled != delta + udelta) {
 		if (udelta) {
-			sys_scaled = deltascaled * delta / (delta + udelta);
-			user_scaled = deltascaled - sys_scaled;
+			*sys_scaled = deltascaled * delta / (delta + udelta);
+			user_scaled = deltascaled - *sys_scaled;
 		} else {
-			sys_scaled = deltascaled;
+			*sys_scaled = deltascaled;
 		}
 	}
 	get_paca()->user_time_scaled += user_scaled;
 
-	if (in_interrupt() || idle_task(smp_processor_id()) != tsk) {
-		account_system_time(tsk, 0, delta, sys_scaled);
-		if (stolen)
-			account_steal_time(stolen);
-	} else {
-		account_idle_time(delta + stolen);
-	}
-	local_irq_restore(flags);
+	return delta;
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+	u64 delta, sys_scaled, stolen;
+
+	delta = vtime_delta(tsk, &sys_scaled, &stolen);
+	account_system_time(tsk, 0, delta, sys_scaled);
+	if (stolen)
+		account_steal_time(stolen);
+}
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+	u64 delta, sys_scaled, stolen;
+
+	delta = vtime_delta(tsk, &sys_scaled, &stolen);
+	account_idle_time(delta + stolen);
 }
-EXPORT_SYMBOL_GPL(account_system_vtime);
 
 /*
  * Transfer the user and system times accumulated in the paca
  * by the exception entry and exit code to the generic process
  * user and system time records.
  * Must be called with interrupts disabled.
- * Assumes that account_system_vtime() has been called recently
+ * Assumes that vtime_account() has been called recently
  * (i.e. since the last entry from usermode) so that
  * get_paca()->user_time_scaled is up to date.
  */
@@ -366,6 +375,12 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
 	account_user_time(tsk, utime, utimescaled);
 }
 
+void vtime_task_switch(struct task_struct *prev)
+{
+	vtime_account(prev);
+	account_process_tick(prev, 0);
+}
+
 #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
 #define calc_cputime_factors()
 #endif
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 30fd01de6bed..72afd2888cad 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
 config PPC64
 	bool "64-bit kernel"
 	default n
+	select HAVE_VIRT_CPU_ACCOUNTING
 	help
 	  This option selects whether a 32-bit or a 64-bit kernel
 	  will be built.
@@ -337,21 +338,6 @@ config PPC_MM_SLICES
 	default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES)
 	default n
 
-config VIRT_CPU_ACCOUNTING
-	bool "Deterministic task and CPU time accounting"
-	depends on PPC64
-	default y
-	help
-	  Select this option to enable more accurate task and CPU time
-	  accounting. This is done by reading a CPU counter on each
-	  kernel entry and exit and on transitions within the kernel
-	  between system, softirq and hardirq state, so there is a
-	  small performance impact. This also enables accounting of
-	  stolen time on logically-partitioned systems running on
-	  IBM POWER5-based machines.
-
-	  If in doubt, say Y here.
-
 config PPC_HAVE_PMU_SUPPORT
 	bool
 
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 107610e01a29..f5ab543396da 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -49,9 +49,6 @@ config GENERIC_LOCKBREAK
 config PGSTE
 	def_bool y if KVM
 
-config VIRT_CPU_ACCOUNTING
-	def_bool y
-
 config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	def_bool y
 
@@ -89,6 +86,8 @@ config S390
 	select HAVE_MEMBLOCK
 	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_CMPXCHG_LOCAL
+	select HAVE_VIRT_CPU_ACCOUNTING
+	select VIRT_CPU_ACCOUNTING
 	select ARCH_DISCARD_MEMBLOCK
 	select BUILDTIME_EXTABLE_SORT
 	select ARCH_INLINE_SPIN_TRYLOCK
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index 8709bdef233c..023d5ae24482 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -12,6 +12,9 @@
 #include <linux/spinlock.h>
 #include <asm/div64.h>
 
+
+#define __ARCH_HAS_VTIME_ACCOUNT
+
 /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */
 
 typedef unsigned long long __nocast cputime_t;
diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h
index f223068b7822..314cc9426fc4 100644
--- a/arch/s390/include/asm/switch_to.h
+++ b/arch/s390/include/asm/switch_to.h
@@ -89,12 +89,8 @@ static inline void restore_access_regs(unsigned int *acrs)
 	prev = __switch_to(prev,next);				\
 } while (0)
 
-extern void account_vtime(struct task_struct *, struct task_struct *);
-extern void account_tick_vtime(struct task_struct *);
-
 #define finish_arch_switch(prev) do {				\
 	set_fs(current->thread.mm_segment);			\
-	account_vtime(prev, current);				\
 } while (0)
 
 #endif /* __ASM_SWITCH_TO_H */
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 4fc97b40a6e1..cb5093c26d16 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -99,7 +99,7 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset)
 	return virt_timer_forward(user + system);
 }
 
-void account_vtime(struct task_struct *prev, struct task_struct *next)
+void vtime_task_switch(struct task_struct *prev)
 {
 	struct thread_info *ti;
 
@@ -107,7 +107,7 @@ void account_vtime(struct task_struct *prev, struct task_struct *next)
 	ti = task_thread_info(prev);
 	ti->user_timer = S390_lowcore.user_timer;
 	ti->system_timer = S390_lowcore.system_timer;
-	ti = task_thread_info(next);
+	ti = task_thread_info(current);
 	S390_lowcore.user_timer = ti->user_timer;
 	S390_lowcore.system_timer = ti->system_timer;
 }
@@ -122,7 +122,7 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
  * Update process times based on virtual cpu times stored by entry.S
  * to the lowcore fields user_timer, system_timer & steal_clock.
  */
-void account_system_vtime(struct task_struct *tsk)
+void vtime_account(struct task_struct *tsk)
 {
 	struct thread_info *ti = task_thread_info(tsk);
 	u64 timer, system;
@@ -138,7 +138,7 @@ void account_system_vtime(struct task_struct *tsk)
 
 	virt_timer_forward(system);
 }
-EXPORT_SYMBOL_GPL(account_system_vtime);
+EXPORT_SYMBOL_GPL(vtime_account);
 
 void __kprobes vtime_stop_cpu(void)
 {
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 7a7ce390534f..d5e86c9f74fd 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -69,7 +69,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
 				| 1*SD_BALANCE_FORK		\
 				| 0*SD_BALANCE_WAKE		\
 				| 0*SD_WAKE_AFFINE		\
-				| 0*SD_PREFER_LOCAL		\
 				| 0*SD_SHARE_CPUPOWER		\
 				| 0*SD_SHARE_PKG_RESOURCES	\
 				| 0*SD_SERIALIZE		\
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8ff1f56a0188..488ba8da8fef 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -101,6 +101,7 @@ config X86
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
 	select HAVE_RCU_USER_QS if X86_64
+	select HAVE_IRQ_TIME_ACCOUNTING
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS || UPROBES)
@@ -800,17 +801,6 @@ config SCHED_MC
 	  making when dealing with multi-core CPU chips at a cost of slightly
 	  increased overhead in some places. If unsure say N here.
 
-config IRQ_TIME_ACCOUNTING
-	bool "Fine granularity task level IRQ time accounting"
-	default n
-	---help---
-	  Select this option to enable fine granularity task irq time
-	  accounting. This is done by reading a timestamp on each
-	  transitions between softirq and hardirq state, so there can be a
-	  small performance impact.
-
-	  If in doubt, say N here.
-
 source "kernel/Kconfig.preempt"
 
 config X86_UP_APIC
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 305f23cd7cff..cab3da3d0949 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -132,11 +132,11 @@ extern void synchronize_irq(unsigned int irq);
 struct task_struct;
 
 #if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
-static inline void account_system_vtime(struct task_struct *tsk)
+static inline void vtime_account(struct task_struct *tsk)
 {
 }
 #else
-extern void account_system_vtime(struct task_struct *tsk);
+extern void vtime_account(struct task_struct *tsk);
 #endif
 
 #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
@@ -162,7 +162,7 @@ extern void rcu_nmi_exit(void);
  */
 #define __irq_enter()					\
 	do {						\
-		account_system_vtime(current);		\
+		vtime_account(current);			\
 		add_preempt_count(HARDIRQ_OFFSET);	\
 		trace_hardirq_enter();			\
 	} while (0)
@@ -178,7 +178,7 @@ extern void irq_enter(void);
 #define __irq_exit()					\
 	do {						\
 		trace_hardirq_exit();			\
-		account_system_vtime(current);		\
+		vtime_account(current);			\
 		sub_preempt_count(HARDIRQ_OFFSET);	\
 	} while (0)
 
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 2fbd9053c2df..36d12f0884c3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -130,4 +130,12 @@ extern void account_process_tick(struct task_struct *, int user);
 extern void account_steal_ticks(unsigned long ticks);
 extern void account_idle_ticks(unsigned long ticks);
 
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+extern void vtime_task_switch(struct task_struct *prev);
+extern void vtime_account_system(struct task_struct *tsk);
+extern void vtime_account_idle(struct task_struct *tsk);
+#else
+static inline void vtime_task_switch(struct task_struct *prev) { }
+#endif
+
 #endif /* _LINUX_KERNEL_STAT_H */
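
With the stub above compiled in for !CONFIG_VIRT_CPU_ACCOUNTING, generic code can invoke the hook unconditionally; a minimal sketch of the call-site pattern the scheduler core adopts in this series (the surrounding steps are illustrative, not a quote of finish_task_switch()):

    /* Sketch of the caller side: the hook costs nothing when the config is off. */
    static void example_finish_task_switch(struct task_struct *prev)
    {
    	vtime_task_switch(prev);	/* empty inline unless CONFIG_VIRT_CPU_ACCOUNTING */
    	/* ... perf events, lock release, etc. continue as before ... */
    }
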
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b70b48b01098..8a59e0abe5fa 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -685,7 +685,7 @@ static inline int kvm_deassign_device(struct kvm *kvm,
 static inline void kvm_guest_enter(void)
 {
 	BUG_ON(preemptible());
-	account_system_vtime(current);
+	vtime_account(current);
 	current->flags |= PF_VCPU;
 	/* KVM does not hold any references to rcu protected data when it
 	 * switches CPU into a guest mode. In fact switching to a guest mode
@@ -699,7 +699,7 @@ static inline void kvm_guest_enter(void)
 
 static inline void kvm_guest_exit(void)
 {
-	account_system_vtime(current);
+	vtime_account(current);
 	current->flags &= ~PF_VCPU;
 }
 
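
For context on the two vtime_account() calls above: they flush pending host system time on both edges of guest execution. A hedged sketch of the usual call pattern around a guest entry, with the vcpu-run helper being a hypothetical stand-in rather than a real arch hook:

    /* Sketch only: example_enter_guest() is hypothetical; kvm_guest_enter()
     * must run with preemption disabled (it asserts !preemptible()). */
    static int example_vcpu_run(struct kvm_vcpu *vcpu)
    {
    	int ret;
    
    	preempt_disable();
    	kvm_guest_enter();		/* charges pending host time, sets PF_VCPU */
    	ret = example_enter_guest(vcpu);
    	kvm_guest_exit();		/* charges the guest interval, clears PF_VCPU */
    	preempt_enable();
    
    	return ret;
    }
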
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 83035269e597..765dffbb085e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -273,11 +273,11 @@ extern void init_idle_bootup_task(struct task_struct *idle);
 extern int runqueue_is_locked(int cpu);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
-extern void select_nohz_load_balancer(int stop_tick);
+extern void nohz_balance_enter_idle(int cpu);
 extern void set_cpu_sd_state_idle(void);
 extern int get_nohz_timer_target(void);
 #else
-static inline void select_nohz_load_balancer(int stop_tick) { }
+static inline void nohz_balance_enter_idle(int cpu) { }
 static inline void set_cpu_sd_state_idle(void) { }
 #endif
 
@@ -681,11 +681,6 @@ struct signal_struct {
 					 * (notably. ptrace) */
 };
 
-/* Context switch must be unlocked if interrupts are to be enabled */
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-# define __ARCH_WANT_UNLOCKED_CTXSW
-#endif
-
 /*
  * Bits in flags field of signal_struct.
  */
@@ -863,7 +858,6 @@ enum cpu_idle_type {
 #define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
 #define SD_BALANCE_WAKE		0x0010	/* Balance on wakeup */
 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
-#define SD_PREFER_LOCAL		0x0040	/* Prefer to keep tasks local to this domain */
 #define SD_SHARE_CPUPOWER	0x0080	/* Domain members share cpu power */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index fec12d667211..d3cf0d6e7712 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -129,7 +129,6 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_FORK		\
 				| 0*SD_BALANCE_WAKE		\
 				| 1*SD_WAKE_AFFINE		\
-				| 0*SD_PREFER_LOCAL		\
 				| 0*SD_SHARE_CPUPOWER		\
 				| 1*SD_SHARE_PKG_RESOURCES	\
 				| 0*SD_SERIALIZE		\
@@ -160,7 +159,6 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_FORK		\
 				| 0*SD_BALANCE_WAKE		\
 				| 1*SD_WAKE_AFFINE		\
-				| 0*SD_PREFER_LOCAL		\
 				| 0*SD_SHARE_CPUPOWER		\
 				| 0*SD_SHARE_PKG_RESOURCES	\
 				| 0*SD_SERIALIZE		\
diff --git a/init/Kconfig b/init/Kconfig
index c26b8a1d2b57..3466a6e017b7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -267,6 +267,106 @@ config POSIX_MQUEUE_SYSCTL
267 depends on SYSCTL 267 depends on SYSCTL
268 default y 268 default y
269 269
270config FHANDLE
271 bool "open by fhandle syscalls"
272 select EXPORTFS
273 help
274 If you say Y here, a user level program will be able to map
275 file names to handle and then later use the handle for
276 different file system operations. This is useful in implementing
277 userspace file servers, which now track files using handles instead
278 of names. The handle would remain the same even if file names
279 get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
280 syscalls.
281
282config AUDIT
283 bool "Auditing support"
284 depends on NET
285 help
286 Enable auditing infrastructure that can be used with another
287 kernel subsystem, such as SELinux (which requires this for
288 logging of avc messages output). Does not do system-call
289 auditing without CONFIG_AUDITSYSCALL.
290
291config AUDITSYSCALL
292 bool "Enable system-call auditing support"
293 depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT))
294 default y if SECURITY_SELINUX
295 help
296 Enable low-overhead system-call auditing infrastructure that
297 can be used independently or with another kernel subsystem,
298 such as SELinux.
299
300config AUDIT_WATCH
301 def_bool y
302 depends on AUDITSYSCALL
303 select FSNOTIFY
304
305config AUDIT_TREE
306 def_bool y
307 depends on AUDITSYSCALL
308 select FSNOTIFY
309
310config AUDIT_LOGINUID_IMMUTABLE
311 bool "Make audit loginuid immutable"
312 depends on AUDIT
313 help
314 The config option toggles if a task setting its loginuid requires
315 CAP_SYS_AUDITCONTROL or if that task should require no special permissions
316 but should instead only allow setting its loginuid if it was never
317 previously set. On systems which use systemd or a similar central
318 process to restart login services this should be set to true. On older
319 systems in which an admin would typically have to directly stop and
320 start processes this should be set to false. Setting this to true allows
321 one to drop potentially dangerous capabilites from the login tasks,
322 but may not be backwards compatible with older init systems.
323
324source "kernel/irq/Kconfig"
325source "kernel/time/Kconfig"
326
327menu "CPU/Task time and stats accounting"
328
329choice
330 prompt "Cputime accounting"
331 default TICK_CPU_ACCOUNTING if !PPC64
332 default VIRT_CPU_ACCOUNTING if PPC64
333
334# Kind of a stub config for the pure tick based cputime accounting
335config TICK_CPU_ACCOUNTING
336 bool "Simple tick based cputime accounting"
337 depends on !S390
338 help
339 This is the basic tick based cputime accounting that maintains
340 statistics about user, system and idle time spent on per jiffies
341 granularity.
342
343 If unsure, say Y.
344
345config VIRT_CPU_ACCOUNTING
346 bool "Deterministic task and CPU time accounting"
347 depends on HAVE_VIRT_CPU_ACCOUNTING
348 help
349 Select this option to enable more accurate task and CPU time
350 accounting. This is done by reading a CPU counter on each
351 kernel entry and exit and on transitions within the kernel
352 between system, softirq and hardirq state, so there is a
353 small performance impact. In the case of s390 or IBM POWER > 5,
354 this also enables accounting of stolen time on logically-partitioned
355 systems.
356
357config IRQ_TIME_ACCOUNTING
358 bool "Fine granularity task level IRQ time accounting"
359 depends on HAVE_IRQ_TIME_ACCOUNTING
360 help
361 Select this option to enable fine granularity task irq time
362 accounting. This is done by reading a timestamp on each
363 transitions between softirq and hardirq state, so there can be a
364 small performance impact.
365
366 If in doubt, say N here.
367
368endchoice
369
270config BSD_PROCESS_ACCT 370config BSD_PROCESS_ACCT
271 bool "BSD Process Accounting" 371 bool "BSD Process Accounting"
272 help 372 help
@@ -292,18 +392,6 @@ config BSD_PROCESS_ACCT_V3
292 for processing it. A preliminary version of these tools is available 392 for processing it. A preliminary version of these tools is available
293 at <http://www.gnu.org/software/acct/>. 393 at <http://www.gnu.org/software/acct/>.
294 394
295config FHANDLE
296 bool "open by fhandle syscalls"
297 select EXPORTFS
298 help
299 If you say Y here, a user level program will be able to map
300 file names to handle and then later use the handle for
301 different file system operations. This is useful in implementing
302 userspace file servers, which now track files using handles instead
303 of names. The handle would remain the same even if file names
304 get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
305 syscalls.
306
307config TASKSTATS 395config TASKSTATS
308 bool "Export task/process statistics through netlink (EXPERIMENTAL)" 396 bool "Export task/process statistics through netlink (EXPERIMENTAL)"
309 depends on NET 397 depends on NET
@@ -346,50 +434,7 @@ config TASK_IO_ACCOUNTING
346 434
347 Say N if unsure. 435 Say N if unsure.
348 436
349config AUDIT 437endmenu # "CPU/Task time and stats accounting"
350 bool "Auditing support"
351 depends on NET
352 help
353 Enable auditing infrastructure that can be used with another
354 kernel subsystem, such as SELinux (which requires this for
355 logging of avc messages output). Does not do system-call
356 auditing without CONFIG_AUDITSYSCALL.
357
358config AUDITSYSCALL
359 bool "Enable system-call auditing support"
360 depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT))
361 default y if SECURITY_SELINUX
362 help
363 Enable low-overhead system-call auditing infrastructure that
364 can be used independently or with another kernel subsystem,
365 such as SELinux.
366
367config AUDIT_WATCH
368 def_bool y
369 depends on AUDITSYSCALL
370 select FSNOTIFY
371
372config AUDIT_TREE
373 def_bool y
374 depends on AUDITSYSCALL
375 select FSNOTIFY
376
377config AUDIT_LOGINUID_IMMUTABLE
378 bool "Make audit loginuid immutable"
379 depends on AUDIT
380 help
381 The config option toggles if a task setting its loginuid requires
382 CAP_SYS_AUDITCONTROL or if that task should require no special permissions
383 but should instead only allow setting its loginuid if it was never
384 previously set. On systems which use systemd or a similar central
385 process to restart login services this should be set to true. On older
386 systems in which an admin would typically have to directly stop and
387 start processes this should be set to false. Setting this to true allows
388 one to drop potentially dangerous capabilites from the login tasks,
389 but may not be backwards compatible with older init systems.
390
391source "kernel/irq/Kconfig"
392source "kernel/time/Kconfig"
393 438
394menu "RCU Subsystem" 439menu "RCU Subsystem"
395 440
diff --git a/kernel/fork.c b/kernel/fork.c
index 2343c9eaaaf4..5a0e74d89a5a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1276,11 +1276,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-	p->hardirqs_enabled = 1;
-#else
 	p->hardirqs_enabled = 0;
-#endif
 	p->hardirq_enable_ip = 0;
 	p->hardirq_enable_event = 0;
 	p->hardirq_disable_ip = _THIS_IP_;
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 173ea52f3af0..f06d249e103b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
+obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
 obj-$(CONFIG_SMP) += cpupri.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3c4dec0594d6..c17747236438 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -740,126 +740,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
740 dequeue_task(rq, p, flags); 740 dequeue_task(rq, p, flags);
741} 741}
742 742
743#ifdef CONFIG_IRQ_TIME_ACCOUNTING
744
745/*
746 * There are no locks covering percpu hardirq/softirq time.
747 * They are only modified in account_system_vtime, on corresponding CPU
748 * with interrupts disabled. So, writes are safe.
749 * They are read and saved off onto struct rq in update_rq_clock().
750 * This may result in other CPU reading this CPU's irq time and can
751 * race with irq/account_system_vtime on this CPU. We would either get old
752 * or new value with a side effect of accounting a slice of irq time to wrong
753 * task when irq is in progress while we read rq->clock. That is a worthy
754 * compromise in place of having locks on each irq in account_system_time.
755 */
756static DEFINE_PER_CPU(u64, cpu_hardirq_time);
757static DEFINE_PER_CPU(u64, cpu_softirq_time);
758
759static DEFINE_PER_CPU(u64, irq_start_time);
760static int sched_clock_irqtime;
761
762void enable_sched_clock_irqtime(void)
763{
764 sched_clock_irqtime = 1;
765}
766
767void disable_sched_clock_irqtime(void)
768{
769 sched_clock_irqtime = 0;
770}
771
772#ifndef CONFIG_64BIT
773static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
774
775static inline void irq_time_write_begin(void)
776{
777 __this_cpu_inc(irq_time_seq.sequence);
778 smp_wmb();
779}
780
781static inline void irq_time_write_end(void)
782{
783 smp_wmb();
784 __this_cpu_inc(irq_time_seq.sequence);
785}
786
787static inline u64 irq_time_read(int cpu)
788{
789 u64 irq_time;
790 unsigned seq;
791
792 do {
793 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
794 irq_time = per_cpu(cpu_softirq_time, cpu) +
795 per_cpu(cpu_hardirq_time, cpu);
796 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
797
798 return irq_time;
799}
800#else /* CONFIG_64BIT */
801static inline void irq_time_write_begin(void)
802{
803}
804
805static inline void irq_time_write_end(void)
806{
807}
808
809static inline u64 irq_time_read(int cpu)
810{
811 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
812}
813#endif /* CONFIG_64BIT */
814
815/*
816 * Called before incrementing preempt_count on {soft,}irq_enter
817 * and before decrementing preempt_count on {soft,}irq_exit.
818 */
819void account_system_vtime(struct task_struct *curr)
820{
821 unsigned long flags;
822 s64 delta;
823 int cpu;
824
825 if (!sched_clock_irqtime)
826 return;
827
828 local_irq_save(flags);
829
830 cpu = smp_processor_id();
831 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
832 __this_cpu_add(irq_start_time, delta);
833
834 irq_time_write_begin();
835 /*
836 * We do not account for softirq time from ksoftirqd here.
837 * We want to continue accounting softirq time to ksoftirqd thread
838 * in that case, so as not to confuse scheduler with a special task
839 * that do not consume any time, but still wants to run.
840 */
841 if (hardirq_count())
842 __this_cpu_add(cpu_hardirq_time, delta);
843 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
844 __this_cpu_add(cpu_softirq_time, delta);
845
846 irq_time_write_end();
847 local_irq_restore(flags);
848}
849EXPORT_SYMBOL_GPL(account_system_vtime);
850
851#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
852
853#ifdef CONFIG_PARAVIRT
854static inline u64 steal_ticks(u64 steal)
855{
856 if (unlikely(steal > NSEC_PER_SEC))
857 return div_u64(steal, TICK_NSEC);
858
859 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
860}
861#endif
862
863static void update_rq_clock_task(struct rq *rq, s64 delta) 743static void update_rq_clock_task(struct rq *rq, s64 delta)
864{ 744{
865/* 745/*
@@ -920,43 +800,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
920#endif 800#endif
921} 801}
922 802
923#ifdef CONFIG_IRQ_TIME_ACCOUNTING
924static int irqtime_account_hi_update(void)
925{
926 u64 *cpustat = kcpustat_this_cpu->cpustat;
927 unsigned long flags;
928 u64 latest_ns;
929 int ret = 0;
930
931 local_irq_save(flags);
932 latest_ns = this_cpu_read(cpu_hardirq_time);
933 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
934 ret = 1;
935 local_irq_restore(flags);
936 return ret;
937}
938
939static int irqtime_account_si_update(void)
940{
941 u64 *cpustat = kcpustat_this_cpu->cpustat;
942 unsigned long flags;
943 u64 latest_ns;
944 int ret = 0;
945
946 local_irq_save(flags);
947 latest_ns = this_cpu_read(cpu_softirq_time);
948 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
949 ret = 1;
950 local_irq_restore(flags);
951 return ret;
952}
953
954#else /* CONFIG_IRQ_TIME_ACCOUNTING */
955
956#define sched_clock_irqtime (0)
957
958#endif
959
960void sched_set_stop_task(int cpu, struct task_struct *stop) 803void sched_set_stop_task(int cpu, struct task_struct *stop)
961{ 804{
962 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 805 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -1518,25 +1361,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
1518 smp_send_reschedule(cpu); 1361 smp_send_reschedule(cpu);
1519} 1362}
1520 1363
1521#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1522static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
1523{
1524 struct rq *rq;
1525 int ret = 0;
1526
1527 rq = __task_rq_lock(p);
1528 if (p->on_cpu) {
1529 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1530 ttwu_do_wakeup(rq, p, wake_flags);
1531 ret = 1;
1532 }
1533 __task_rq_unlock(rq);
1534
1535 return ret;
1536
1537}
1538#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1539
1540bool cpus_share_cache(int this_cpu, int that_cpu) 1364bool cpus_share_cache(int this_cpu, int that_cpu)
1541{ 1365{
1542 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1366 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
@@ -1597,21 +1421,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1597 * If the owning (remote) cpu is still in the middle of schedule() with 1421 * If the owning (remote) cpu is still in the middle of schedule() with
1598 * this task as prev, wait until its done referencing the task. 1422 * this task as prev, wait until its done referencing the task.
1599 */ 1423 */
1600 while (p->on_cpu) { 1424 while (p->on_cpu)
1601#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1602 /*
1603 * In case the architecture enables interrupts in
1604 * context_switch(), we cannot busy wait, since that
1605 * would lead to deadlocks when an interrupt hits and
1606 * tries to wake up @prev. So bail and do a complete
1607 * remote wakeup.
1608 */
1609 if (ttwu_activate_remote(p, wake_flags))
1610 goto stat;
1611#else
1612 cpu_relax(); 1425 cpu_relax();
1613#endif
1614 }
1615 /* 1426 /*
1616 * Pairs with the smp_wmb() in finish_lock_switch(). 1427 * Pairs with the smp_wmb() in finish_lock_switch().
1617 */ 1428 */
@@ -1953,14 +1764,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1953 * Manfred Spraul <manfred@colorfullife.com> 1764 * Manfred Spraul <manfred@colorfullife.com>
1954 */ 1765 */
1955 prev_state = prev->state; 1766 prev_state = prev->state;
1767 vtime_task_switch(prev);
1956 finish_arch_switch(prev); 1768 finish_arch_switch(prev);
1957#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1958 local_irq_disable();
1959#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1960 perf_event_task_sched_in(prev, current); 1769 perf_event_task_sched_in(prev, current);
1961#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1962 local_irq_enable();
1963#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1964 finish_lock_switch(rq, prev); 1770 finish_lock_switch(rq, prev);
1965 finish_arch_post_lock_switch(); 1771 finish_arch_post_lock_switch();
1966 1772
@@ -2810,404 +2616,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2810 return ns; 2616 return ns;
2811} 2617}
2812 2618
2813#ifdef CONFIG_CGROUP_CPUACCT
2814struct cgroup_subsys cpuacct_subsys;
2815struct cpuacct root_cpuacct;
2816#endif
2817
2818static inline void task_group_account_field(struct task_struct *p, int index,
2819 u64 tmp)
2820{
2821#ifdef CONFIG_CGROUP_CPUACCT
2822 struct kernel_cpustat *kcpustat;
2823 struct cpuacct *ca;
2824#endif
2825 /*
2826 * Since all updates are sure to touch the root cgroup, we
2827 * get ourselves ahead and touch it first. If the root cgroup
2828 * is the only cgroup, then nothing else should be necessary.
2829 *
2830 */
2831 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2832
2833#ifdef CONFIG_CGROUP_CPUACCT
2834 if (unlikely(!cpuacct_subsys.active))
2835 return;
2836
2837 rcu_read_lock();
2838 ca = task_ca(p);
2839 while (ca && (ca != &root_cpuacct)) {
2840 kcpustat = this_cpu_ptr(ca->cpustat);
2841 kcpustat->cpustat[index] += tmp;
2842 ca = parent_ca(ca);
2843 }
2844 rcu_read_unlock();
2845#endif
2846}
2847
2848
2849/*
2850 * Account user cpu time to a process.
2851 * @p: the process that the cpu time gets accounted to
2852 * @cputime: the cpu time spent in user space since the last update
2853 * @cputime_scaled: cputime scaled by cpu frequency
2854 */
2855void account_user_time(struct task_struct *p, cputime_t cputime,
2856 cputime_t cputime_scaled)
2857{
2858 int index;
2859
2860 /* Add user time to process. */
2861 p->utime += cputime;
2862 p->utimescaled += cputime_scaled;
2863 account_group_user_time(p, cputime);
2864
2865 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2866
2867 /* Add user time to cpustat. */
2868 task_group_account_field(p, index, (__force u64) cputime);
2869
2870 /* Account for user time used */
2871 acct_update_integrals(p);
2872}
2873
2874/*
2875 * Account guest cpu time to a process.
2876 * @p: the process that the cpu time gets accounted to
2877 * @cputime: the cpu time spent in virtual machine since the last update
2878 * @cputime_scaled: cputime scaled by cpu frequency
2879 */
2880static void account_guest_time(struct task_struct *p, cputime_t cputime,
2881 cputime_t cputime_scaled)
2882{
2883 u64 *cpustat = kcpustat_this_cpu->cpustat;
2884
2885 /* Add guest time to process. */
2886 p->utime += cputime;
2887 p->utimescaled += cputime_scaled;
2888 account_group_user_time(p, cputime);
2889 p->gtime += cputime;
2890
2891 /* Add guest time to cpustat. */
2892 if (TASK_NICE(p) > 0) {
2893 cpustat[CPUTIME_NICE] += (__force u64) cputime;
2894 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
2895 } else {
2896 cpustat[CPUTIME_USER] += (__force u64) cputime;
2897 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
2898 }
2899}
2900
2901/*
2902 * Account system cpu time to a process and desired cpustat field
2903 * @p: the process that the cpu time gets accounted to
2904 * @cputime: the cpu time spent in kernel space since the last update
2905 * @cputime_scaled: cputime scaled by cpu frequency
2906 * @target_cputime64: pointer to cpustat field that has to be updated
2907 */
2908static inline
2909void __account_system_time(struct task_struct *p, cputime_t cputime,
2910 cputime_t cputime_scaled, int index)
2911{
2912 /* Add system time to process. */
2913 p->stime += cputime;
2914 p->stimescaled += cputime_scaled;
2915 account_group_system_time(p, cputime);
2916
2917 /* Add system time to cpustat. */
2918 task_group_account_field(p, index, (__force u64) cputime);
2919
2920 /* Account for system time used */
2921 acct_update_integrals(p);
2922}
2923
2924/*
2925 * Account system cpu time to a process.
2926 * @p: the process that the cpu time gets accounted to
2927 * @hardirq_offset: the offset to subtract from hardirq_count()
2928 * @cputime: the cpu time spent in kernel space since the last update
2929 * @cputime_scaled: cputime scaled by cpu frequency
2930 */
2931void account_system_time(struct task_struct *p, int hardirq_offset,
2932 cputime_t cputime, cputime_t cputime_scaled)
2933{
2934 int index;
2935
2936 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
2937 account_guest_time(p, cputime, cputime_scaled);
2938 return;
2939 }
2940
2941 if (hardirq_count() - hardirq_offset)
2942 index = CPUTIME_IRQ;
2943 else if (in_serving_softirq())
2944 index = CPUTIME_SOFTIRQ;
2945 else
2946 index = CPUTIME_SYSTEM;
2947
2948 __account_system_time(p, cputime, cputime_scaled, index);
2949}
2950
2951/*
2952 * Account for involuntary wait time.
2953 * @cputime: the cpu time spent in involuntary wait
2954 */
2955void account_steal_time(cputime_t cputime)
2956{
2957 u64 *cpustat = kcpustat_this_cpu->cpustat;
2958
2959 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
2960}
2961
2962/*
2963 * Account for idle time.
2964 * @cputime: the cpu time spent in idle wait
2965 */
2966void account_idle_time(cputime_t cputime)
2967{
2968 u64 *cpustat = kcpustat_this_cpu->cpustat;
2969 struct rq *rq = this_rq();
2970
2971 if (atomic_read(&rq->nr_iowait) > 0)
2972 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
2973 else
2974 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
2975}
2976
2977static __always_inline bool steal_account_process_tick(void)
2978{
2979#ifdef CONFIG_PARAVIRT
2980 if (static_key_false(&paravirt_steal_enabled)) {
2981 u64 steal, st = 0;
2982
2983 steal = paravirt_steal_clock(smp_processor_id());
2984 steal -= this_rq()->prev_steal_time;
2985
2986 st = steal_ticks(steal);
2987 this_rq()->prev_steal_time += st * TICK_NSEC;
2988
2989 account_steal_time(st);
2990 return st;
2991 }
2992#endif
2993 return false;
2994}
2995
2996#ifndef CONFIG_VIRT_CPU_ACCOUNTING
2997
2998#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2999/*
3000 * Account a tick to a process and cpustat
3001 * @p: the process that the cpu time gets accounted to
3002 * @user_tick: is the tick from userspace
3003 * @rq: the pointer to rq
3004 *
3005 * Tick demultiplexing follows the order
3006 * - pending hardirq update
3007 * - pending softirq update
3008 * - user_time
3009 * - idle_time
3010 * - system time
3011 * - check for guest_time
3012 * - else account as system_time
3013 *
3014 * Check for hardirq is done both for system and user time as there is
3015 * no timer going off while we are on hardirq and hence we may never get an
3016 * opportunity to update it solely in system time.
3017 * p->stime and friends are only updated on system time and not on irq
3018 * softirq as those do not count in task exec_runtime any more.
3019 */
3020static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3021 struct rq *rq)
3022{
3023 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3024 u64 *cpustat = kcpustat_this_cpu->cpustat;
3025
3026 if (steal_account_process_tick())
3027 return;
3028
3029 if (irqtime_account_hi_update()) {
3030 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
3031 } else if (irqtime_account_si_update()) {
3032 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
3033 } else if (this_cpu_ksoftirqd() == p) {
3034 /*
3035 * ksoftirqd time do not get accounted in cpu_softirq_time.
3036 * So, we have to handle it separately here.
3037 * Also, p->stime needs to be updated for ksoftirqd.
3038 */
3039 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3040 CPUTIME_SOFTIRQ);
3041 } else if (user_tick) {
3042 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3043 } else if (p == rq->idle) {
3044 account_idle_time(cputime_one_jiffy);
3045 } else if (p->flags & PF_VCPU) { /* System time or guest time */
3046 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3047 } else {
3048 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3049 CPUTIME_SYSTEM);
3050 }
3051}
3052
3053static void irqtime_account_idle_ticks(int ticks)
3054{
3055 int i;
3056 struct rq *rq = this_rq();
3057
3058 for (i = 0; i < ticks; i++)
3059 irqtime_account_process_tick(current, 0, rq);
3060}
3061#else /* CONFIG_IRQ_TIME_ACCOUNTING */
3062static void irqtime_account_idle_ticks(int ticks) {}
3063static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3064 struct rq *rq) {}
3065#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3066
3067/*
3068 * Account a single tick of cpu time.
3069 * @p: the process that the cpu time gets accounted to
3070 * @user_tick: indicates if the tick is a user or a system tick
3071 */
3072void account_process_tick(struct task_struct *p, int user_tick)
3073{
3074 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3075 struct rq *rq = this_rq();
3076
3077 if (sched_clock_irqtime) {
3078 irqtime_account_process_tick(p, user_tick, rq);
3079 return;
3080 }
3081
3082 if (steal_account_process_tick())
3083 return;
3084
3085 if (user_tick)
3086 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3087 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3088 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3089 one_jiffy_scaled);
3090 else
3091 account_idle_time(cputime_one_jiffy);
3092}
3093
3094/*
3095 * Account multiple ticks of steal time.
3096 * @p: the process from which the cpu time has been stolen
3097 * @ticks: number of stolen ticks
3098 */
3099void account_steal_ticks(unsigned long ticks)
3100{
3101 account_steal_time(jiffies_to_cputime(ticks));
3102}
3103
3104/*
3105 * Account multiple ticks of idle time.
3106 * @ticks: number of idle ticks
3107 */
3108void account_idle_ticks(unsigned long ticks)
3109{
3110
3111 if (sched_clock_irqtime) {
3112 irqtime_account_idle_ticks(ticks);
3113 return;
3114 }
3115
3116 account_idle_time(jiffies_to_cputime(ticks));
3117}
3118
3119#endif
3120
3121/*
3122 * Use precise platform statistics if available:
3123 */
3124#ifdef CONFIG_VIRT_CPU_ACCOUNTING
3125void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3126{
3127 *ut = p->utime;
3128 *st = p->stime;
3129}
3130
3131void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3132{
3133 struct task_cputime cputime;
3134
3135 thread_group_cputime(p, &cputime);
3136
3137 *ut = cputime.utime;
3138 *st = cputime.stime;
3139}
3140#else
3141
3142#ifndef nsecs_to_cputime
3143# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3144#endif
3145
3146static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
3147{
3148 u64 temp = (__force u64) rtime;
3149
3150 temp *= (__force u64) utime;
3151
3152 if (sizeof(cputime_t) == 4)
3153 temp = div_u64(temp, (__force u32) total);
3154 else
3155 temp = div64_u64(temp, (__force u64) total);
3156
3157 return (__force cputime_t) temp;
3158}
3159
3160void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3161{
3162 cputime_t rtime, utime = p->utime, total = utime + p->stime;
3163
3164 /*
3165 * Use CFS's precise accounting:
3166 */
3167 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3168
3169 if (total)
3170 utime = scale_utime(utime, rtime, total);
3171 else
3172 utime = rtime;
3173
3174 /*
3175 * Compare with previous values, to keep monotonicity:
3176 */
3177 p->prev_utime = max(p->prev_utime, utime);
3178 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
3179
3180 *ut = p->prev_utime;
3181 *st = p->prev_stime;
3182}
3183
3184/*
3185 * Must be called with siglock held.
3186 */
3187void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3188{
3189 struct signal_struct *sig = p->signal;
3190 struct task_cputime cputime;
3191 cputime_t rtime, utime, total;
3192
3193 thread_group_cputime(p, &cputime);
3194
3195 total = cputime.utime + cputime.stime;
3196 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3197
3198 if (total)
3199 utime = scale_utime(cputime.utime, rtime, total);
3200 else
3201 utime = rtime;
3202
3203 sig->prev_utime = max(sig->prev_utime, utime);
3204 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
3205
3206 *ut = sig->prev_utime;
3207 *st = sig->prev_stime;
3208}
3209#endif
3210
3211/* 2619/*
3212 * This function gets called by the timer code, with HZ frequency. 2620 * This function gets called by the timer code, with HZ frequency.
3213 * We call it with interrupts disabled. 2621 * We call it with interrupts disabled.
@@ -3368,6 +2776,40 @@ pick_next_task(struct rq *rq)
3368 2776
3369/* 2777/*
3370 * __schedule() is the main scheduler function. 2778 * __schedule() is the main scheduler function.
2779 *
2780 * The main means of driving the scheduler and thus entering this function are:
2781 *
2782 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
2783 *
2784 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
2785 * paths. For example, see arch/x86/entry_64.S.
2786 *
2787 * To drive preemption between tasks, the scheduler sets the flag in timer
2788 * interrupt handler scheduler_tick().
2789 *
2790 * 3. Wakeups don't really cause entry into schedule(). They add a
2791 * task to the run-queue and that's it.
2792 *
2793 * Now, if the new task added to the run-queue preempts the current
2794 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
2795 * called on the nearest possible occasion:
2796 *
2797 * - If the kernel is preemptible (CONFIG_PREEMPT=y):
2798 *
2799 * - in syscall or exception context, at the next outermost
2800 * preempt_enable(). (this might be as soon as the wake_up()'s
2801 * spin_unlock()!)
2802 *
2803 * - in IRQ context, return from interrupt-handler to
2804 * preemptible context
2805 *
2806 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
2807 * then at the next:
2808 *
2809 * - cond_resched() call
2810 * - explicit schedule() call
2811 * - return from syscall or exception to user-space
2812 * - return from interrupt-handler to user-space
3371 */ 2813 */
3372static void __sched __schedule(void) 2814static void __sched __schedule(void)
3373{ 2815{
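
To make the non-preemptible case in the comment above concrete, here is a minimal illustrative sketch (not part of this patch; example_scan() and its workload are made up) of a kernel loop that offers the scheduler a voluntary preemption point, matching the "cond_resched() call" entry in the list:

/*
 * Illustrative only: a long-running kernel loop with a voluntary
 * preemption point.  With CONFIG_PREEMPT=n, a TIF_NEED_RESCHED set by
 * scheduler_tick() is acted upon at the cond_resched() below rather
 * than at the next preempt_enable().
 */
static int example_scan(void *data)
{
	int i;

	for (i = 0; i < 1000000; i++) {
		cpu_relax();		/* stand-in for real per-iteration work */
		cond_resched();		/* may drop into __schedule() if needed */
	}
	return 0;
}
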
@@ -4885,13 +4327,6 @@ again:
4885 */ 4327 */
4886 if (preempt && rq != p_rq) 4328 if (preempt && rq != p_rq)
4887 resched_task(p_rq->curr); 4329 resched_task(p_rq->curr);
4888 } else {
4889 /*
4890 * We might have set it in task_yield_fair(), but are
4891 * not going to schedule(), so don't want to skip
4892 * the next update.
4893 */
4894 rq->skip_clock_update = 0;
4895 } 4330 }
4896 4331
4897out: 4332out:
@@ -5433,16 +4868,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
5433 *tablep = NULL; 4868 *tablep = NULL;
5434} 4869}
5435 4870
4871static int min_load_idx = 0;
4872static int max_load_idx = CPU_LOAD_IDX_MAX;
4873
5436static void 4874static void
5437set_table_entry(struct ctl_table *entry, 4875set_table_entry(struct ctl_table *entry,
5438 const char *procname, void *data, int maxlen, 4876 const char *procname, void *data, int maxlen,
5439 umode_t mode, proc_handler *proc_handler) 4877 umode_t mode, proc_handler *proc_handler,
4878 bool load_idx)
5440{ 4879{
5441 entry->procname = procname; 4880 entry->procname = procname;
5442 entry->data = data; 4881 entry->data = data;
5443 entry->maxlen = maxlen; 4882 entry->maxlen = maxlen;
5444 entry->mode = mode; 4883 entry->mode = mode;
5445 entry->proc_handler = proc_handler; 4884 entry->proc_handler = proc_handler;
4885
4886 if (load_idx) {
4887 entry->extra1 = &min_load_idx;
4888 entry->extra2 = &max_load_idx;
4889 }
5446} 4890}
5447 4891
5448static struct ctl_table * 4892static struct ctl_table *
@@ -5454,30 +4898,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5454 return NULL; 4898 return NULL;
5455 4899
5456 set_table_entry(&table[0], "min_interval", &sd->min_interval, 4900 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5457 sizeof(long), 0644, proc_doulongvec_minmax); 4901 sizeof(long), 0644, proc_doulongvec_minmax, false);
5458 set_table_entry(&table[1], "max_interval", &sd->max_interval, 4902 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5459 sizeof(long), 0644, proc_doulongvec_minmax); 4903 sizeof(long), 0644, proc_doulongvec_minmax, false);
5460 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 4904 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5461 sizeof(int), 0644, proc_dointvec_minmax); 4905 sizeof(int), 0644, proc_dointvec_minmax, true);
5462 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 4906 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5463 sizeof(int), 0644, proc_dointvec_minmax); 4907 sizeof(int), 0644, proc_dointvec_minmax, true);
5464 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 4908 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5465 sizeof(int), 0644, proc_dointvec_minmax); 4909 sizeof(int), 0644, proc_dointvec_minmax, true);
5466 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 4910 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5467 sizeof(int), 0644, proc_dointvec_minmax); 4911 sizeof(int), 0644, proc_dointvec_minmax, true);
5468 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 4912 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5469 sizeof(int), 0644, proc_dointvec_minmax); 4913 sizeof(int), 0644, proc_dointvec_minmax, true);
5470 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 4914 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5471 sizeof(int), 0644, proc_dointvec_minmax); 4915 sizeof(int), 0644, proc_dointvec_minmax, false);
5472 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 4916 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5473 sizeof(int), 0644, proc_dointvec_minmax); 4917 sizeof(int), 0644, proc_dointvec_minmax, false);
5474 set_table_entry(&table[9], "cache_nice_tries", 4918 set_table_entry(&table[9], "cache_nice_tries",
5475 &sd->cache_nice_tries, 4919 &sd->cache_nice_tries,
5476 sizeof(int), 0644, proc_dointvec_minmax); 4920 sizeof(int), 0644, proc_dointvec_minmax, false);
5477 set_table_entry(&table[10], "flags", &sd->flags, 4921 set_table_entry(&table[10], "flags", &sd->flags,
5478 sizeof(int), 0644, proc_dointvec_minmax); 4922 sizeof(int), 0644, proc_dointvec_minmax, false);
5479 set_table_entry(&table[11], "name", sd->name, 4923 set_table_entry(&table[11], "name", sd->name,
5480 CORENAME_MAX_SIZE, 0444, proc_dostring); 4924 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
5481 /* &table[12] is terminator */ 4925 /* &table[12] is terminator */
5482 4926
5483 return table; 4927 return table;
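
For readers unfamiliar with the extra1/extra2 convention introduced above: proc_dointvec_minmax() restricts accepted values to the [*extra1, *extra2] range, which is how the per-domain *_idx entries are now kept within the bounds of the cpu_load[] array. A minimal sketch of the same pattern (not from this patch; the example_* names are made up and a literal stands in for the real CPU_LOAD_IDX_MAX bound):

#include <linux/sysctl.h>

static int example_min;			/* lower bound: 0 */
static int example_max = 4;		/* stand-in for the CPU_LOAD_IDX_MAX bound */
static int example_idx;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_idx",
		.data		= &example_idx,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &example_min,	/* writes below this are not accepted */
		.extra2		= &example_max,	/* writes above this are not accepted */
	},
	{ }	/* table terminator */
};
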
@@ -6556,7 +6000,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6556 | 0*SD_BALANCE_FORK 6000 | 0*SD_BALANCE_FORK
6557 | 0*SD_BALANCE_WAKE 6001 | 0*SD_BALANCE_WAKE
6558 | 0*SD_WAKE_AFFINE 6002 | 0*SD_WAKE_AFFINE
6559 | 0*SD_PREFER_LOCAL
6560 | 0*SD_SHARE_CPUPOWER 6003 | 0*SD_SHARE_CPUPOWER
6561 | 0*SD_SHARE_PKG_RESOURCES 6004 | 0*SD_SHARE_PKG_RESOURCES
6562 | 1*SD_SERIALIZE 6005 | 1*SD_SERIALIZE
@@ -8354,6 +7797,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8354 * (balbir@in.ibm.com). 7797 * (balbir@in.ibm.com).
8355 */ 7798 */
8356 7799
7800struct cpuacct root_cpuacct;
7801
8357/* create a new cpu accounting group */ 7802/* create a new cpu accounting group */
8358static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) 7803static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
8359{ 7804{
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
new file mode 100644
index 000000000000..81b763ba58a6
--- /dev/null
+++ b/kernel/sched/cputime.c
@@ -0,0 +1,530 @@
1#include <linux/export.h>
2#include <linux/sched.h>
3#include <linux/tsacct_kern.h>
4#include <linux/kernel_stat.h>
5#include <linux/static_key.h>
6#include "sched.h"
7
8
9#ifdef CONFIG_IRQ_TIME_ACCOUNTING
10
11/*
12 * There are no locks covering percpu hardirq/softirq time.
13 * They are only modified in vtime_account, on corresponding CPU
14 * with interrupts disabled. So, writes are safe.
15 * They are read and saved off onto struct rq in update_rq_clock().
16 * This may result in another CPU reading this CPU's irq time and racing
17 * with irq/vtime_account on this CPU. We would either get the old or the
18 * new value, with a side effect of accounting a slice of irq time to the
19 * wrong task when an irq is in progress while we read rq->clock. That is a
20 * worthwhile compromise compared to taking locks on each irq in account_system_time.
21 */
22DEFINE_PER_CPU(u64, cpu_hardirq_time);
23DEFINE_PER_CPU(u64, cpu_softirq_time);
24
25static DEFINE_PER_CPU(u64, irq_start_time);
26static int sched_clock_irqtime;
27
28void enable_sched_clock_irqtime(void)
29{
30 sched_clock_irqtime = 1;
31}
32
33void disable_sched_clock_irqtime(void)
34{
35 sched_clock_irqtime = 0;
36}
37
38#ifndef CONFIG_64BIT
39DEFINE_PER_CPU(seqcount_t, irq_time_seq);
40#endif /* CONFIG_64BIT */
41
42/*
43 * Called before incrementing preempt_count on {soft,}irq_enter
44 * and before decrementing preempt_count on {soft,}irq_exit.
45 */
46void vtime_account(struct task_struct *curr)
47{
48 unsigned long flags;
49 s64 delta;
50 int cpu;
51
52 if (!sched_clock_irqtime)
53 return;
54
55 local_irq_save(flags);
56
57 cpu = smp_processor_id();
58 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
59 __this_cpu_add(irq_start_time, delta);
60
61 irq_time_write_begin();
62 /*
63 * We do not account for softirq time from ksoftirqd here.
64 * We want to keep accounting that softirq time to the ksoftirqd thread
65 * in that case, so as not to confuse the scheduler with a special task
66 * that does not consume any time, but still wants to run.
67 */
68 if (hardirq_count())
69 __this_cpu_add(cpu_hardirq_time, delta);
70 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
71 __this_cpu_add(cpu_softirq_time, delta);
72
73 irq_time_write_end();
74 local_irq_restore(flags);
75}
76EXPORT_SYMBOL_GPL(vtime_account);
77
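
In this configuration vtime_account() is driven from the interrupt paths rather than from the tick: the kernel/softirq.c hunks later in this patch switch __do_softirq() and irq_exit() from account_system_vtime() to vtime_account(), so the per-cpu cpu_hardirq_time/cpu_softirq_time counters above are advanced on those transitions and only folded into cpustat later, one jiffy at a time, by irqtime_account_hi_update()/irqtime_account_si_update() below.
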
78static int irqtime_account_hi_update(void)
79{
80 u64 *cpustat = kcpustat_this_cpu->cpustat;
81 unsigned long flags;
82 u64 latest_ns;
83 int ret = 0;
84
85 local_irq_save(flags);
86 latest_ns = this_cpu_read(cpu_hardirq_time);
87 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
88 ret = 1;
89 local_irq_restore(flags);
90 return ret;
91}
92
93static int irqtime_account_si_update(void)
94{
95 u64 *cpustat = kcpustat_this_cpu->cpustat;
96 unsigned long flags;
97 u64 latest_ns;
98 int ret = 0;
99
100 local_irq_save(flags);
101 latest_ns = this_cpu_read(cpu_softirq_time);
102 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
103 ret = 1;
104 local_irq_restore(flags);
105 return ret;
106}
107
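
A worked example of the comparison above (illustrative numbers, assuming HZ=1000 so one jiffy is 1,000,000 ns): if this CPU's cpu_hardirq_time has accumulated 12,300,000 ns but cpustat[CPUTIME_IRQ] so far reflects only 10 jiffies, nsecs_to_cputime64(latest_ns) exceeds the stored value, irqtime_account_hi_update() returns 1, and irqtime_account_process_tick() charges the next tick to CPUTIME_IRQ instead of to the interrupted task.
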
108#else /* CONFIG_IRQ_TIME_ACCOUNTING */
109
110#define sched_clock_irqtime (0)
111
112#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
113
114static inline void task_group_account_field(struct task_struct *p, int index,
115 u64 tmp)
116{
117#ifdef CONFIG_CGROUP_CPUACCT
118 struct kernel_cpustat *kcpustat;
119 struct cpuacct *ca;
120#endif
121 /*
122 * Since all updates are sure to touch the root cgroup, we
123 * get ourselves ahead and touch it first. If the root cgroup
124 * is the only cgroup, then nothing else should be necessary.
125 *
126 */
127 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
128
129#ifdef CONFIG_CGROUP_CPUACCT
130 if (unlikely(!cpuacct_subsys.active))
131 return;
132
133 rcu_read_lock();
134 ca = task_ca(p);
135 while (ca && (ca != &root_cpuacct)) {
136 kcpustat = this_cpu_ptr(ca->cpustat);
137 kcpustat->cpustat[index] += tmp;
138 ca = parent_ca(ca);
139 }
140 rcu_read_unlock();
141#endif
142}
143
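
As an illustration (hypothetical hierarchy, not taken from this patch): for a task in cgroup /A/B, task_group_account_field() first bumps the per-cpu kernel_cpustat, i.e. the system-wide figures behind /proc/stat, then walks task_ca(p) upwards, charging B's and then A's percpu cpustat, and stops before root_cpuacct because the root-level view was already covered by that first update.
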
144/*
145 * Account user cpu time to a process.
146 * @p: the process that the cpu time gets accounted to
147 * @cputime: the cpu time spent in user space since the last update
148 * @cputime_scaled: cputime scaled by cpu frequency
149 */
150void account_user_time(struct task_struct *p, cputime_t cputime,
151 cputime_t cputime_scaled)
152{
153 int index;
154
155 /* Add user time to process. */
156 p->utime += cputime;
157 p->utimescaled += cputime_scaled;
158 account_group_user_time(p, cputime);
159
160 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
161
162 /* Add user time to cpustat. */
163 task_group_account_field(p, index, (__force u64) cputime);
164
165 /* Account for user time used */
166 acct_update_integrals(p);
167}
168
169/*
170 * Account guest cpu time to a process.
171 * @p: the process that the cpu time gets accounted to
172 * @cputime: the cpu time spent in virtual machine since the last update
173 * @cputime_scaled: cputime scaled by cpu frequency
174 */
175static void account_guest_time(struct task_struct *p, cputime_t cputime,
176 cputime_t cputime_scaled)
177{
178 u64 *cpustat = kcpustat_this_cpu->cpustat;
179
180 /* Add guest time to process. */
181 p->utime += cputime;
182 p->utimescaled += cputime_scaled;
183 account_group_user_time(p, cputime);
184 p->gtime += cputime;
185
186 /* Add guest time to cpustat. */
187 if (TASK_NICE(p) > 0) {
188 cpustat[CPUTIME_NICE] += (__force u64) cputime;
189 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
190 } else {
191 cpustat[CPUTIME_USER] += (__force u64) cputime;
192 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
193 }
194}
195
196/*
197 * Account system cpu time to a process and desired cpustat field
198 * @p: the process that the cpu time gets accounted to
199 * @cputime: the cpu time spent in kernel space since the last update
200 * @cputime_scaled: cputime scaled by cpu frequency
201 * @index: index of the cpustat field that has to be updated
202 */
203static inline
204void __account_system_time(struct task_struct *p, cputime_t cputime,
205 cputime_t cputime_scaled, int index)
206{
207 /* Add system time to process. */
208 p->stime += cputime;
209 p->stimescaled += cputime_scaled;
210 account_group_system_time(p, cputime);
211
212 /* Add system time to cpustat. */
213 task_group_account_field(p, index, (__force u64) cputime);
214
215 /* Account for system time used */
216 acct_update_integrals(p);
217}
218
219/*
220 * Account system cpu time to a process.
221 * @p: the process that the cpu time gets accounted to
222 * @hardirq_offset: the offset to subtract from hardirq_count()
223 * @cputime: the cpu time spent in kernel space since the last update
224 * @cputime_scaled: cputime scaled by cpu frequency
225 */
226void account_system_time(struct task_struct *p, int hardirq_offset,
227 cputime_t cputime, cputime_t cputime_scaled)
228{
229 int index;
230
231 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
232 account_guest_time(p, cputime, cputime_scaled);
233 return;
234 }
235
236 if (hardirq_count() - hardirq_offset)
237 index = CPUTIME_IRQ;
238 else if (in_serving_softirq())
239 index = CPUTIME_SOFTIRQ;
240 else
241 index = CPUTIME_SYSTEM;
242
243 __account_system_time(p, cputime, cputime_scaled, index);
244}
245
246/*
247 * Account for involuntary wait time.
248 * @cputime: the cpu time spent in involuntary wait
249 */
250void account_steal_time(cputime_t cputime)
251{
252 u64 *cpustat = kcpustat_this_cpu->cpustat;
253
254 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
255}
256
257/*
258 * Account for idle time.
259 * @cputime: the cpu time spent in idle wait
260 */
261void account_idle_time(cputime_t cputime)
262{
263 u64 *cpustat = kcpustat_this_cpu->cpustat;
264 struct rq *rq = this_rq();
265
266 if (atomic_read(&rq->nr_iowait) > 0)
267 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
268 else
269 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
270}
271
272static __always_inline bool steal_account_process_tick(void)
273{
274#ifdef CONFIG_PARAVIRT
275 if (static_key_false(&paravirt_steal_enabled)) {
276 u64 steal, st = 0;
277
278 steal = paravirt_steal_clock(smp_processor_id());
279 steal -= this_rq()->prev_steal_time;
280
281 st = steal_ticks(steal);
282 this_rq()->prev_steal_time += st * TICK_NSEC;
283
284 account_steal_time(st);
285 return st;
286 }
287#endif
288 return false;
289}
290
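
A worked example of the bookkeeping above (illustrative numbers, assuming HZ=1000 so TICK_NSEC is 1,000,000): paravirt_steal_clock() reports cumulative stolen time, so if it has advanced 3,400,000 ns past prev_steal_time, steal_ticks() yields st = 3, three ticks are accounted as steal time, and prev_steal_time is advanced by exactly 3 * TICK_NSEC so that the remaining 400,000 ns carry over to a later tick. The non-zero return value also tells the tick-accounting callers below to skip charging this tick to the current task.
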
291#ifndef CONFIG_VIRT_CPU_ACCOUNTING
292
293#ifdef CONFIG_IRQ_TIME_ACCOUNTING
294/*
295 * Account a tick to a process and cpustat
296 * @p: the process that the cpu time gets accounted to
297 * @user_tick: is the tick from userspace
298 * @rq: the pointer to rq
299 *
300 * Tick demultiplexing follows the order
301 * - pending hardirq update
302 * - pending softirq update
303 * - user_time
304 * - idle_time
305 * - system time
306 * - check for guest_time
307 * - else account as system_time
308 *
309 * The check for hardirq is done for both system and user time, as there is
310 * no timer going off while we are in hardirq context, and hence we may never
311 * get an opportunity to update it solely in system time.
312 * p->stime and friends are only updated on system time and not on irq or
313 * softirq time, as those no longer count in task exec_runtime.
314 */
315static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
316 struct rq *rq)
317{
318 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
319 u64 *cpustat = kcpustat_this_cpu->cpustat;
320
321 if (steal_account_process_tick())
322 return;
323
324 if (irqtime_account_hi_update()) {
325 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
326 } else if (irqtime_account_si_update()) {
327 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
328 } else if (this_cpu_ksoftirqd() == p) {
329 /*
330 * ksoftirqd time does not get accounted in cpu_softirq_time.
331 * So, we have to handle it separately here.
332 * Also, p->stime needs to be updated for ksoftirqd.
333 */
334 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
335 CPUTIME_SOFTIRQ);
336 } else if (user_tick) {
337 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
338 } else if (p == rq->idle) {
339 account_idle_time(cputime_one_jiffy);
340 } else if (p->flags & PF_VCPU) { /* System time or guest time */
341 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
342 } else {
343 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
344 CPUTIME_SYSTEM);
345 }
346}
347
348static void irqtime_account_idle_ticks(int ticks)
349{
350 int i;
351 struct rq *rq = this_rq();
352
353 for (i = 0; i < ticks; i++)
354 irqtime_account_process_tick(current, 0, rq);
355}
356#else /* CONFIG_IRQ_TIME_ACCOUNTING */
357static void irqtime_account_idle_ticks(int ticks) {}
358static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
359 struct rq *rq) {}
360#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
361
362/*
363 * Account a single tick of cpu time.
364 * @p: the process that the cpu time gets accounted to
365 * @user_tick: indicates if the tick is a user or a system tick
366 */
367void account_process_tick(struct task_struct *p, int user_tick)
368{
369 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
370 struct rq *rq = this_rq();
371
372 if (sched_clock_irqtime) {
373 irqtime_account_process_tick(p, user_tick, rq);
374 return;
375 }
376
377 if (steal_account_process_tick())
378 return;
379
380 if (user_tick)
381 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
382 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
383 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
384 one_jiffy_scaled);
385 else
386 account_idle_time(cputime_one_jiffy);
387}
388
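
One detail worth spelling out in account_process_tick(): the (p != rq->idle) || (irq_count() != HARDIRQ_OFFSET) test means a tick is charged as idle time only when the idle task was interrupted by nothing but the timer interrupt itself. If the tick lands while a nested hardirq or a softirq is being serviced on top of the idle task, it goes through account_system_time() instead, which then routes it to CPUTIME_IRQ or CPUTIME_SOFTIRQ as appropriate.
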
389/*
390 * Account multiple ticks of steal time.
391 * @p: the process from which the cpu time has been stolen
392 * @ticks: number of stolen ticks
393 */
394void account_steal_ticks(unsigned long ticks)
395{
396 account_steal_time(jiffies_to_cputime(ticks));
397}
398
399/*
400 * Account multiple ticks of idle time.
401 * @ticks: number of idle ticks
402 */
403void account_idle_ticks(unsigned long ticks)
404{
405
406 if (sched_clock_irqtime) {
407 irqtime_account_idle_ticks(ticks);
408 return;
409 }
410
411 account_idle_time(jiffies_to_cputime(ticks));
412}
413
414#endif
415
416/*
417 * Use precise platform statistics if available:
418 */
419#ifdef CONFIG_VIRT_CPU_ACCOUNTING
420void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
421{
422 *ut = p->utime;
423 *st = p->stime;
424}
425
426void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
427{
428 struct task_cputime cputime;
429
430 thread_group_cputime(p, &cputime);
431
432 *ut = cputime.utime;
433 *st = cputime.stime;
434}
435
436/*
437 * Archs that account the whole time spent in the idle task
438 * (outside irq) as idle time can rely on this and just implement
439 * vtime_account_system() and vtime_account_idle(). Archs that
440 * have a different meaning of idle time (s390 only includes the
441 * time spent by the CPU when it's in low power mode) must override
442 * vtime_account().
443 */
444#ifndef __ARCH_HAS_VTIME_ACCOUNT
445void vtime_account(struct task_struct *tsk)
446{
447 unsigned long flags;
448
449 local_irq_save(flags);
450
451 if (in_interrupt() || !is_idle_task(tsk))
452 vtime_account_system(tsk);
453 else
454 vtime_account_idle(tsk);
455
456 local_irq_restore(flags);
457}
458EXPORT_SYMBOL_GPL(vtime_account);
459#endif /* __ARCH_HAS_VTIME_ACCOUNT */
460
461#else
462
463#ifndef nsecs_to_cputime
464# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
465#endif
466
467static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
468{
469 u64 temp = (__force u64) rtime;
470
471 temp *= (__force u64) utime;
472
473 if (sizeof(cputime_t) == 4)
474 temp = div_u64(temp, (__force u32) total);
475 else
476 temp = div64_u64(temp, (__force u64) total);
477
478 return (__force cputime_t) temp;
479}
480
481void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
482{
483 cputime_t rtime, utime = p->utime, total = utime + p->stime;
484
485 /*
486 * Use CFS's precise accounting:
487 */
488 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
489
490 if (total)
491 utime = scale_utime(utime, rtime, total);
492 else
493 utime = rtime;
494
495 /*
496 * Compare with previous values, to keep monotonicity:
497 */
498 p->prev_utime = max(p->prev_utime, utime);
499 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
500
501 *ut = p->prev_utime;
502 *st = p->prev_stime;
503}
504
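
A worked example of the scaling above (illustrative numbers): with p->utime = 60 ticks and p->stime = 40 ticks (total = 100) but a CFS sum_exec_runtime corresponding to rtime = 90 ticks, utime is rescaled to 60 * 90 / 100 = 54 ticks, prev_utime becomes max(prev_utime, 54) and prev_stime becomes max(prev_stime, 90 - 54 = 36), so the reported split tracks the precise runtime while each component only ever moves forward. The sizeof(cputime_t) == 4 branch in scale_utime() lets 32-bit configurations use the cheaper 64-by-32 div_u64() instead of a full div64_u64().
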
505/*
506 * Must be called with siglock held.
507 */
508void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
509{
510 struct signal_struct *sig = p->signal;
511 struct task_cputime cputime;
512 cputime_t rtime, utime, total;
513
514 thread_group_cputime(p, &cputime);
515
516 total = cputime.utime + cputime.stime;
517 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
518
519 if (total)
520 utime = scale_utime(cputime.utime, rtime, total);
521 else
522 utime = rtime;
523
524 sig->prev_utime = max(sig->prev_utime, utime);
525 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
526
527 *ut = sig->prev_utime;
528 *st = sig->prev_stime;
529}
530#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 96e2b18b6283..6b800a14b990 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -597,7 +597,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se)
597/* 597/*
598 * The idea is to set a period in which each task runs once. 598 * The idea is to set a period in which each task runs once.
599 * 599 *
600 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch 600 * When there are too many tasks (sched_nr_latency) we have to stretch
601 * this period because otherwise the slices get too small. 601 * this period because otherwise the slices get too small.
602 * 602 *
603 * p = (nr <= nl) ? l : l*nr/nl 603 * p = (nr <= nl) ? l : l*nr/nl
@@ -2700,7 +2700,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2700 int prev_cpu = task_cpu(p); 2700 int prev_cpu = task_cpu(p);
2701 int new_cpu = cpu; 2701 int new_cpu = cpu;
2702 int want_affine = 0; 2702 int want_affine = 0;
2703 int want_sd = 1;
2704 int sync = wake_flags & WF_SYNC; 2703 int sync = wake_flags & WF_SYNC;
2705 2704
2706 if (p->nr_cpus_allowed == 1) 2705 if (p->nr_cpus_allowed == 1)
@@ -2718,48 +2717,21 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2718 continue; 2717 continue;
2719 2718
2720 /* 2719 /*
2721 * If power savings logic is enabled for a domain, see if we
2722 * are not overloaded, if so, don't balance wider.
2723 */
2724 if (tmp->flags & (SD_PREFER_LOCAL)) {
2725 unsigned long power = 0;
2726 unsigned long nr_running = 0;
2727 unsigned long capacity;
2728 int i;
2729
2730 for_each_cpu(i, sched_domain_span(tmp)) {
2731 power += power_of(i);
2732 nr_running += cpu_rq(i)->cfs.nr_running;
2733 }
2734
2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2736
2737 if (nr_running < capacity)
2738 want_sd = 0;
2739 }
2740
2741 /*
2742 * If both cpu and prev_cpu are part of this domain, 2720 * If both cpu and prev_cpu are part of this domain,
2743 * cpu is a valid SD_WAKE_AFFINE target. 2721 * cpu is a valid SD_WAKE_AFFINE target.
2744 */ 2722 */
2745 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 2723 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
2746 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { 2724 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
2747 affine_sd = tmp; 2725 affine_sd = tmp;
2748 want_affine = 0;
2749 }
2750
2751 if (!want_sd && !want_affine)
2752 break; 2726 break;
2727 }
2753 2728
2754 if (!(tmp->flags & sd_flag)) 2729 if (tmp->flags & sd_flag)
2755 continue;
2756
2757 if (want_sd)
2758 sd = tmp; 2730 sd = tmp;
2759 } 2731 }
2760 2732
2761 if (affine_sd) { 2733 if (affine_sd) {
2762 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 2734 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
2763 prev_cpu = cpu; 2735 prev_cpu = cpu;
2764 2736
2765 new_cpu = select_idle_sibling(p, prev_cpu); 2737 new_cpu = select_idle_sibling(p, prev_cpu);
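
The reworked affine_sd block is behaviourally equivalent for the cpu == prev_cpu case: the old cpu == prev_cpu || ... test short-circuited past wake_affine() and then performed a no-op assignment, while the new code simply never consults wake_affine() unless the waking cpu differs from prev_cpu; select_idle_sibling() is still searched around prev_cpu either way.
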
@@ -4295,7 +4267,7 @@ redo:
4295 goto out_balanced; 4267 goto out_balanced;
4296 } 4268 }
4297 4269
4298 BUG_ON(busiest == this_rq); 4270 BUG_ON(busiest == env.dst_rq);
4299 4271
4300 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 4272 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4301 4273
@@ -4316,7 +4288,7 @@ redo:
4316 update_h_load(env.src_cpu); 4288 update_h_load(env.src_cpu);
4317more_balance: 4289more_balance:
4318 local_irq_save(flags); 4290 local_irq_save(flags);
4319 double_rq_lock(this_rq, busiest); 4291 double_rq_lock(env.dst_rq, busiest);
4320 4292
4321 /* 4293 /*
4322 * cur_ld_moved - load moved in current iteration 4294 * cur_ld_moved - load moved in current iteration
@@ -4324,7 +4296,7 @@ more_balance:
4324 */ 4296 */
4325 cur_ld_moved = move_tasks(&env); 4297 cur_ld_moved = move_tasks(&env);
4326 ld_moved += cur_ld_moved; 4298 ld_moved += cur_ld_moved;
4327 double_rq_unlock(this_rq, busiest); 4299 double_rq_unlock(env.dst_rq, busiest);
4328 local_irq_restore(flags); 4300 local_irq_restore(flags);
4329 4301
4330 if (env.flags & LBF_NEED_BREAK) { 4302 if (env.flags & LBF_NEED_BREAK) {
@@ -4360,8 +4332,7 @@ more_balance:
4360 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && 4332 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
4361 lb_iterations++ < max_lb_iterations) { 4333 lb_iterations++ < max_lb_iterations) {
4362 4334
4363 this_rq = cpu_rq(env.new_dst_cpu); 4335 env.dst_rq = cpu_rq(env.new_dst_cpu);
4364 env.dst_rq = this_rq;
4365 env.dst_cpu = env.new_dst_cpu; 4336 env.dst_cpu = env.new_dst_cpu;
4366 env.flags &= ~LBF_SOME_PINNED; 4337 env.flags &= ~LBF_SOME_PINNED;
4367 env.loop = 0; 4338 env.loop = 0;
@@ -4646,7 +4617,7 @@ static void nohz_balancer_kick(int cpu)
4646 return; 4617 return;
4647} 4618}
4648 4619
4649static inline void clear_nohz_tick_stopped(int cpu) 4620static inline void nohz_balance_exit_idle(int cpu)
4650{ 4621{
4651 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 4622 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
4652 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 4623 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
@@ -4686,28 +4657,23 @@ void set_cpu_sd_state_idle(void)
4686} 4657}
4687 4658
4688/* 4659/*
4689 * This routine will record that this cpu is going idle with tick stopped. 4660 * This routine will record that the cpu is going idle with tick stopped.
4690 * This info will be used in performing idle load balancing in the future. 4661 * This info will be used in performing idle load balancing in the future.
4691 */ 4662 */
4692void select_nohz_load_balancer(int stop_tick) 4663void nohz_balance_enter_idle(int cpu)
4693{ 4664{
4694 int cpu = smp_processor_id();
4695
4696 /* 4665 /*
4697 * If this cpu is going down, then nothing needs to be done. 4666 * If this cpu is going down, then nothing needs to be done.
4698 */ 4667 */
4699 if (!cpu_active(cpu)) 4668 if (!cpu_active(cpu))
4700 return; 4669 return;
4701 4670
4702 if (stop_tick) { 4671 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4703 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 4672 return;
4704 return;
4705 4673
4706 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 4674 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4707 atomic_inc(&nohz.nr_cpus); 4675 atomic_inc(&nohz.nr_cpus);
4708 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); 4676 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4709 }
4710 return;
4711} 4677}
4712 4678
4713static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, 4679static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
@@ -4715,7 +4681,7 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
4715{ 4681{
4716 switch (action & ~CPU_TASKS_FROZEN) { 4682 switch (action & ~CPU_TASKS_FROZEN) {
4717 case CPU_DYING: 4683 case CPU_DYING:
4718 clear_nohz_tick_stopped(smp_processor_id()); 4684 nohz_balance_exit_idle(smp_processor_id());
4719 return NOTIFY_OK; 4685 return NOTIFY_OK;
4720 default: 4686 default:
4721 return NOTIFY_DONE; 4687 return NOTIFY_DONE;
@@ -4837,14 +4803,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4837 if (need_resched()) 4803 if (need_resched())
4838 break; 4804 break;
4839 4805
4840 raw_spin_lock_irq(&this_rq->lock); 4806 rq = cpu_rq(balance_cpu);
4841 update_rq_clock(this_rq); 4807
4842 update_idle_cpu_load(this_rq); 4808 raw_spin_lock_irq(&rq->lock);
4843 raw_spin_unlock_irq(&this_rq->lock); 4809 update_rq_clock(rq);
4810 update_idle_cpu_load(rq);
4811 raw_spin_unlock_irq(&rq->lock);
4844 4812
4845 rebalance_domains(balance_cpu, CPU_IDLE); 4813 rebalance_domains(balance_cpu, CPU_IDLE);
4846 4814
4847 rq = cpu_rq(balance_cpu);
4848 if (time_after(this_rq->next_balance, rq->next_balance)) 4815 if (time_after(this_rq->next_balance, rq->next_balance))
4849 this_rq->next_balance = rq->next_balance; 4816 this_rq->next_balance = rq->next_balance;
4850 } 4817 }
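
The point of the reshuffle above is that update_rq_clock() and update_idle_cpu_load() now operate on cpu_rq(balance_cpu), the idle CPU being balanced on behalf of, instead of on this_rq: previously the balancing CPU kept refreshing its own clock and load while the idle CPUs' decayed cpu_load[] values, the ones rebalance_domains() actually consults for them, stayed stale.
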
@@ -4875,7 +4842,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
4875 * busy tick after returning from idle, we will update the busy stats. 4842 * busy tick after returning from idle, we will update the busy stats.
4876 */ 4843 */
4877 set_cpu_sd_state_busy(); 4844 set_cpu_sd_state_busy();
4878 clear_nohz_tick_stopped(cpu); 4845 nohz_balance_exit_idle(cpu);
4879 4846
4880 /* 4847 /*
4881 * None are in tickless mode and hence no need for NOHZ idle load 4848 * None are in tickless mode and hence no need for NOHZ idle load
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index de00a486c5c6..eebefcad7027 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
12SCHED_FEAT(START_DEBIT, true) 12SCHED_FEAT(START_DEBIT, true)
13 13
14/* 14/*
15 * Based on load and program behaviour, see if it makes sense to place
16 * a newly woken task on the same cpu as the task that woke it --
17 * improve cache locality. Typically used with SYNC wakeups as
18 * generated by pipes and the like, see also SYNC_WAKEUPS.
19 */
20SCHED_FEAT(AFFINE_WAKEUPS, true)
21
22/*
23 * Prefer to schedule the task we woke last (assuming it failed 15 * Prefer to schedule the task we woke last (assuming it failed
24 * wakeup-preemption), since it's likely going to consume data we 16 * wakeup-preemption), since it's likely going to consume data we
25 * touched, increases cache locality. 17 * touched, increases cache locality.
@@ -42,7 +34,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
42/* 34/*
43 * Use arch dependent cpu power functions 35 * Use arch dependent cpu power functions
44 */ 36 */
45SCHED_FEAT(ARCH_POWER, false) 37SCHED_FEAT(ARCH_POWER, true)
46 38
47SCHED_FEAT(HRTICK, false) 39SCHED_FEAT(HRTICK, false)
48SCHED_FEAT(DOUBLE_TICK, false) 40SCHED_FEAT(DOUBLE_TICK, false)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e0b7ba9c040f..418feb01344e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1632,11 +1632,6 @@ static int push_rt_task(struct rq *rq)
1632 if (!next_task) 1632 if (!next_task)
1633 return 0; 1633 return 0;
1634 1634
1635#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1636 if (unlikely(task_running(rq, next_task)))
1637 return 0;
1638#endif
1639
1640retry: 1635retry:
1641 if (unlikely(next_task == rq->curr)) { 1636 if (unlikely(next_task == rq->curr)) {
1642 WARN_ON(1); 1637 WARN_ON(1);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0848fa36c383..7a7db09cfabc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -737,11 +737,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
737 */ 737 */
738 next->on_cpu = 1; 738 next->on_cpu = 1;
739#endif 739#endif
740#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
741 raw_spin_unlock_irq(&rq->lock);
742#else
743 raw_spin_unlock(&rq->lock); 740 raw_spin_unlock(&rq->lock);
744#endif
745} 741}
746 742
747static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 743static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
@@ -755,9 +751,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
755 smp_wmb(); 751 smp_wmb();
756 prev->on_cpu = 0; 752 prev->on_cpu = 0;
757#endif 753#endif
758#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
759 local_irq_enable(); 754 local_irq_enable();
760#endif
761} 755}
762#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 756#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
763 757
@@ -891,6 +885,9 @@ struct cpuacct {
891 struct kernel_cpustat __percpu *cpustat; 885 struct kernel_cpustat __percpu *cpustat;
892}; 886};
893 887
888extern struct cgroup_subsys cpuacct_subsys;
889extern struct cpuacct root_cpuacct;
890
894/* return cpu accounting group corresponding to this container */ 891/* return cpu accounting group corresponding to this container */
895static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 892static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
896{ 893{
@@ -917,6 +914,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
917static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 914static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
918#endif 915#endif
919 916
917#ifdef CONFIG_PARAVIRT
918static inline u64 steal_ticks(u64 steal)
919{
920 if (unlikely(steal > NSEC_PER_SEC))
921 return div_u64(steal, TICK_NSEC);
922
923 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
924}
925#endif
926
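
steal_ticks() above is deliberately two-speed: __iter_div_u64_rem() divides by repeated subtraction, which is cheap for the common case of a backlog below one second (at most on the order of HZ iterations), while a real div_u64() is used once a large amount of stolen time has piled up. As a quick check with HZ=100 (TICK_NSEC = 10,000,000): steal = 25,000,000 ns yields 2 ticks and leaves 5,000,000 ns in the remainder.
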
920static inline void inc_nr_running(struct rq *rq) 927static inline void inc_nr_running(struct rq *rq)
921{ 928{
922 rq->nr_running++; 929 rq->nr_running++;
@@ -1156,3 +1163,53 @@ enum rq_nohz_flag_bits {
1156 1163
1157#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) 1164#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1158#endif 1165#endif
1166
1167#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1168
1169DECLARE_PER_CPU(u64, cpu_hardirq_time);
1170DECLARE_PER_CPU(u64, cpu_softirq_time);
1171
1172#ifndef CONFIG_64BIT
1173DECLARE_PER_CPU(seqcount_t, irq_time_seq);
1174
1175static inline void irq_time_write_begin(void)
1176{
1177 __this_cpu_inc(irq_time_seq.sequence);
1178 smp_wmb();
1179}
1180
1181static inline void irq_time_write_end(void)
1182{
1183 smp_wmb();
1184 __this_cpu_inc(irq_time_seq.sequence);
1185}
1186
1187static inline u64 irq_time_read(int cpu)
1188{
1189 u64 irq_time;
1190 unsigned seq;
1191
1192 do {
1193 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1194 irq_time = per_cpu(cpu_softirq_time, cpu) +
1195 per_cpu(cpu_hardirq_time, cpu);
1196 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1197
1198 return irq_time;
1199}
1200#else /* CONFIG_64BIT */
1201static inline void irq_time_write_begin(void)
1202{
1203}
1204
1205static inline void irq_time_write_end(void)
1206{
1207}
1208
1209static inline u64 irq_time_read(int cpu)
1210{
1211 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1212}
1213#endif /* CONFIG_64BIT */
1214#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1215
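
The 32-bit variant of irq_time_read() above needs the seqcount because a 64-bit per-cpu counter cannot be loaded atomically there, so a reader could otherwise combine the low half of a new value with the high half of an old one. Writers bump irq_time_seq around the irq time updates in vtime_account() via irq_time_write_begin()/irq_time_write_end(), and readers retry until they see an unchanged, even sequence; on 64-bit the loads are atomic anyway and the helpers are empty.
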
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5c6a5bd8462f..cc96bdc0c2c9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void)
221 current->flags &= ~PF_MEMALLOC; 221 current->flags &= ~PF_MEMALLOC;
222 222
223 pending = local_softirq_pending(); 223 pending = local_softirq_pending();
224 account_system_vtime(current); 224 vtime_account(current);
225 225
226 __local_bh_disable((unsigned long)__builtin_return_address(0), 226 __local_bh_disable((unsigned long)__builtin_return_address(0),
227 SOFTIRQ_OFFSET); 227 SOFTIRQ_OFFSET);
@@ -272,7 +272,7 @@ restart:
272 272
273 lockdep_softirq_exit(); 273 lockdep_softirq_exit();
274 274
275 account_system_vtime(current); 275 vtime_account(current);
276 __local_bh_enable(SOFTIRQ_OFFSET); 276 __local_bh_enable(SOFTIRQ_OFFSET);
277 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 277 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
278} 278}
@@ -341,7 +341,7 @@ static inline void invoke_softirq(void)
341 */ 341 */
342void irq_exit(void) 342void irq_exit(void)
343{ 343{
344 account_system_vtime(current); 344 vtime_account(current);
345 trace_hardirq_exit(); 345 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 346 sub_preempt_count(IRQ_EXIT_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 87174ef59161..81c7b1a1a307 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -307,7 +307,7 @@ static struct ctl_table kern_table[] = {
307 .extra2 = &max_sched_tunable_scaling, 307 .extra2 = &max_sched_tunable_scaling,
308 }, 308 },
309 { 309 {
310 .procname = "sched_migration_cost", 310 .procname = "sched_migration_cost_ns",
311 .data = &sysctl_sched_migration_cost, 311 .data = &sysctl_sched_migration_cost,
312 .maxlen = sizeof(unsigned int), 312 .maxlen = sizeof(unsigned int),
313 .mode = 0644, 313 .mode = 0644,
@@ -321,14 +321,14 @@ static struct ctl_table kern_table[] = {
321 .proc_handler = proc_dointvec, 321 .proc_handler = proc_dointvec,
322 }, 322 },
323 { 323 {
324 .procname = "sched_time_avg", 324 .procname = "sched_time_avg_ms",
325 .data = &sysctl_sched_time_avg, 325 .data = &sysctl_sched_time_avg,
326 .maxlen = sizeof(unsigned int), 326 .maxlen = sizeof(unsigned int),
327 .mode = 0644, 327 .mode = 0644,
328 .proc_handler = proc_dointvec, 328 .proc_handler = proc_dointvec,
329 }, 329 },
330 { 330 {
331 .procname = "sched_shares_window", 331 .procname = "sched_shares_window_ns",
332 .data = &sysctl_sched_shares_window, 332 .data = &sysctl_sched_shares_window,
333 .maxlen = sizeof(unsigned int), 333 .maxlen = sizeof(unsigned int),
334 .mode = 0644, 334 .mode = 0644,
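
The renames above only touch procname, so the tunables keep their data and handlers but now advertise their units under /proc/sys/kernel (sched_migration_cost_ns, sched_time_avg_ms, sched_shares_window_ns); anything that writes the old unitless names will need updating.
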
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cf5f6b262673..f423bdd035c2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -372,7 +372,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
372 * the scheduler tick in nohz_restart_sched_tick. 372 * the scheduler tick in nohz_restart_sched_tick.
373 */ 373 */
374 if (!ts->tick_stopped) { 374 if (!ts->tick_stopped) {
375 select_nohz_load_balancer(1); 375 nohz_balance_enter_idle(cpu);
376 calc_load_enter_idle(); 376 calc_load_enter_idle();
377 377
378 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 378 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -570,7 +570,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
570static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) 570static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
571{ 571{
572 /* Update jiffies first */ 572 /* Update jiffies first */
573 select_nohz_load_balancer(0);
574 tick_do_update_jiffies64(now); 573 tick_do_update_jiffies64(now);
575 update_cpu_load_nohz(); 574 update_cpu_load_nohz();
576 575