32 files changed, 892 insertions, 906 deletions
diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt
index 28aa1075e291..b1b8587b86f0 100644
--- a/Documentation/scheduler/sched-arch.txt
+++ b/Documentation/scheduler/sched-arch.txt
@@ -17,16 +17,6 @@ you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file
17 | Unlocked context switches introduce only a very minor performance | 17 | Unlocked context switches introduce only a very minor performance |
18 | penalty to the core scheduler implementation in the CONFIG_SMP case. | 18 | penalty to the core scheduler implementation in the CONFIG_SMP case. |
19 | 19 | ||
20 | 2. Interrupt status | ||
21 | By default, the switch_to arch function is called with interrupts | ||
22 | disabled. Interrupts may be enabled over the call if it is likely to | ||
23 | introduce a significant interrupt latency by adding the line | ||
24 | `#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for | ||
25 | unlocked context switches. This define also implies | ||
26 | `__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an | ||
27 | example. | ||
28 | |||
29 | |||
30 | CPU idle | 20 | CPU idle |
31 | ======== | 21 | ======== |
32 | Your cpu_idle routines need to obey the following rules: | 22 | Your cpu_idle routines need to obey the following rules: |
diff --git a/arch/Kconfig b/arch/Kconfig
index 1a7b468abf4a..a62965d057f6 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -304,4 +304,13 @@ config HAVE_RCU_USER_QS
304 | are already protected inside rcu_irq_enter/rcu_irq_exit() but | 304 | are already protected inside rcu_irq_enter/rcu_irq_exit() but |
305 | preemption or signal handling on irq exit still need to be protected. | 305 | preemption or signal handling on irq exit still need to be protected. |
306 | 306 | ||
307 | config HAVE_VIRT_CPU_ACCOUNTING | ||
308 | bool | ||
309 | |||
310 | config HAVE_IRQ_TIME_ACCOUNTING | ||
311 | bool | ||
312 | help | ||
313 | Archs need to ensure they use a high enough resolution clock to | ||
314 | support irq time accounting and then call enable_sched_clock_irqtime(). | ||
315 | |||
307 | source "kernel/gcov/Kconfig" | 316 | source "kernel/gcov/Kconfig" |
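The HAVE_IRQ_TIME_ACCOUNTING help text above asks arch maintainers to guarantee a sufficiently fine-grained clock and then call enable_sched_clock_irqtime(). As a rough illustration only (not part of this patch; the myarch_* names are invented), an architecture's early init could opt in like this:

#include <linux/init.h>
#include <linux/sched.h>

static int __init myarch_irqtime_init(void)
{
	/* Opt in only if sched_clock() is backed by a fine-grained counter. */
	if (myarch_sched_clock_is_high_resolution())
		enable_sched_clock_irqtime();
	return 0;
}
early_initcall(myarch_irqtime_init);

This roughly mirrors what x86 (which selects HAVE_IRQ_TIME_ACCOUNTING later in this diff) does from its TSC setup once the TSC is usable as sched_clock().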
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 310cf5781fad..3c720ef6c32d 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -25,6 +25,7 @@ config IA64
25 | select HAVE_GENERIC_HARDIRQS | 25 | select HAVE_GENERIC_HARDIRQS |
26 | select HAVE_MEMBLOCK | 26 | select HAVE_MEMBLOCK |
27 | select HAVE_MEMBLOCK_NODE_MAP | 27 | select HAVE_MEMBLOCK_NODE_MAP |
28 | select HAVE_VIRT_CPU_ACCOUNTING | ||
28 | select ARCH_DISCARD_MEMBLOCK | 29 | select ARCH_DISCARD_MEMBLOCK |
29 | select GENERIC_IRQ_PROBE | 30 | select GENERIC_IRQ_PROBE |
30 | select GENERIC_PENDING_IRQ if SMP | 31 | select GENERIC_PENDING_IRQ if SMP |
@@ -340,17 +341,6 @@ config FORCE_MAX_ZONEORDER
340 | default "17" if HUGETLB_PAGE | 341 | default "17" if HUGETLB_PAGE |
341 | default "11" | 342 | default "11" |
342 | 343 | ||
343 | config VIRT_CPU_ACCOUNTING | ||
344 | bool "Deterministic task and CPU time accounting" | ||
345 | default n | ||
346 | help | ||
347 | Select this option to enable more accurate task and CPU time | ||
348 | accounting. This is done by reading a CPU counter on each | ||
349 | kernel entry and exit and on transitions within the kernel | ||
350 | between system, softirq and hardirq state, so there is a | ||
351 | small performance impact. | ||
352 | If in doubt, say N here. | ||
353 | |||
354 | config SMP | 344 | config SMP |
355 | bool "Symmetric multi-processing support" | 345 | bool "Symmetric multi-processing support" |
356 | select USE_GENERIC_SMP_HELPERS | 346 | select USE_GENERIC_SMP_HELPERS |
diff --git a/arch/ia64/include/asm/switch_to.h b/arch/ia64/include/asm/switch_to.h
index cb2412fcd17f..d38c7ea5eea5 100644
--- a/arch/ia64/include/asm/switch_to.h
+++ b/arch/ia64/include/asm/switch_to.h
@@ -30,13 +30,6 @@ extern struct task_struct *ia64_switch_to (void *next_task);
30 | extern void ia64_save_extra (struct task_struct *task); | 30 | extern void ia64_save_extra (struct task_struct *task); |
31 | extern void ia64_load_extra (struct task_struct *task); | 31 | extern void ia64_load_extra (struct task_struct *task); |
32 | 32 | ||
33 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
34 | extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct *next); | ||
35 | # define IA64_ACCOUNT_ON_SWITCH(p,n) ia64_account_on_switch(p,n) | ||
36 | #else | ||
37 | # define IA64_ACCOUNT_ON_SWITCH(p,n) | ||
38 | #endif | ||
39 | |||
40 | #ifdef CONFIG_PERFMON | 33 | #ifdef CONFIG_PERFMON |
41 | DECLARE_PER_CPU(unsigned long, pfm_syst_info); | 34 | DECLARE_PER_CPU(unsigned long, pfm_syst_info); |
42 | # define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1) | 35 | # define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1) |
@@ -49,7 +42,6 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct
49 | || PERFMON_IS_SYSWIDE()) | 42 | || PERFMON_IS_SYSWIDE()) |
50 | 43 | ||
51 | #define __switch_to(prev,next,last) do { \ | 44 | #define __switch_to(prev,next,last) do { \ |
52 | IA64_ACCOUNT_ON_SWITCH(prev, next); \ | ||
53 | if (IA64_HAS_EXTRA_STATE(prev)) \ | 45 | if (IA64_HAS_EXTRA_STATE(prev)) \ |
54 | ia64_save_extra(prev); \ | 46 | ia64_save_extra(prev); \ |
55 | if (IA64_HAS_EXTRA_STATE(next)) \ | 47 | if (IA64_HAS_EXTRA_STATE(next)) \ |
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index ecc904b33c5f..80ff9acc5edf 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -83,32 +83,36 @@ static struct clocksource *itc_clocksource;
83 | 83 | ||
84 | extern cputime_t cycle_to_cputime(u64 cyc); | 84 | extern cputime_t cycle_to_cputime(u64 cyc); |
85 | 85 | ||
86 | static void vtime_account_user(struct task_struct *tsk) | ||
87 | { | ||
88 | cputime_t delta_utime; | ||
89 | struct thread_info *ti = task_thread_info(tsk); | ||
90 | |||
91 | if (ti->ac_utime) { | ||
92 | delta_utime = cycle_to_cputime(ti->ac_utime); | ||
93 | account_user_time(tsk, delta_utime, delta_utime); | ||
94 | ti->ac_utime = 0; | ||
95 | } | ||
96 | } | ||
97 | |||
86 | /* | 98 | /* |
87 | * Called from the context switch with interrupts disabled, to charge all | 99 | * Called from the context switch with interrupts disabled, to charge all |
88 | * accumulated times to the current process, and to prepare accounting on | 100 | * accumulated times to the current process, and to prepare accounting on |
89 | * the next process. | 101 | * the next process. |
90 | */ | 102 | */ |
91 | void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next) | 103 | void vtime_task_switch(struct task_struct *prev) |
92 | { | 104 | { |
93 | struct thread_info *pi = task_thread_info(prev); | 105 | struct thread_info *pi = task_thread_info(prev); |
94 | struct thread_info *ni = task_thread_info(next); | 106 | struct thread_info *ni = task_thread_info(current); |
95 | cputime_t delta_stime, delta_utime; | ||
96 | __u64 now; | ||
97 | 107 | ||
98 | now = ia64_get_itc(); | ||
99 | |||
100 | delta_stime = cycle_to_cputime(pi->ac_stime + (now - pi->ac_stamp)); | ||
101 | if (idle_task(smp_processor_id()) != prev) | 108 | if (idle_task(smp_processor_id()) != prev) |
102 | account_system_time(prev, 0, delta_stime, delta_stime); | 109 | vtime_account_system(prev); |
103 | else | 110 | else |
104 | account_idle_time(delta_stime); | 111 | vtime_account_idle(prev); |
105 | 112 | ||
106 | if (pi->ac_utime) { | 113 | vtime_account_user(prev); |
107 | delta_utime = cycle_to_cputime(pi->ac_utime); | ||
108 | account_user_time(prev, delta_utime, delta_utime); | ||
109 | } | ||
110 | 114 | ||
111 | pi->ac_stamp = ni->ac_stamp = now; | 115 | pi->ac_stamp = ni->ac_stamp; |
112 | ni->ac_stime = ni->ac_utime = 0; | 116 | ni->ac_stime = ni->ac_utime = 0; |
113 | } | 117 | } |
114 | 118 | ||
@@ -116,29 +120,32 @@ void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next)
116 | * Account time for a transition between system, hard irq or soft irq state. | 120 | * Account time for a transition between system, hard irq or soft irq state. |
117 | * Note that this function is called with interrupts enabled. | 121 | * Note that this function is called with interrupts enabled. |
118 | */ | 122 | */ |
119 | void account_system_vtime(struct task_struct *tsk) | 123 | static cputime_t vtime_delta(struct task_struct *tsk) |
120 | { | 124 | { |
121 | struct thread_info *ti = task_thread_info(tsk); | 125 | struct thread_info *ti = task_thread_info(tsk); |
122 | unsigned long flags; | ||
123 | cputime_t delta_stime; | 126 | cputime_t delta_stime; |
124 | __u64 now; | 127 | __u64 now; |
125 | 128 | ||
126 | local_irq_save(flags); | ||
127 | |||
128 | now = ia64_get_itc(); | 129 | now = ia64_get_itc(); |
129 | 130 | ||
130 | delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); | 131 | delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); |
131 | if (irq_count() || idle_task(smp_processor_id()) != tsk) | ||
132 | account_system_time(tsk, 0, delta_stime, delta_stime); | ||
133 | else | ||
134 | account_idle_time(delta_stime); | ||
135 | ti->ac_stime = 0; | 132 | ti->ac_stime = 0; |
136 | |||
137 | ti->ac_stamp = now; | 133 | ti->ac_stamp = now; |
138 | 134 | ||
139 | local_irq_restore(flags); | 135 | return delta_stime; |
136 | } | ||
137 | |||
138 | void vtime_account_system(struct task_struct *tsk) | ||
139 | { | ||
140 | cputime_t delta = vtime_delta(tsk); | ||
141 | |||
142 | account_system_time(tsk, 0, delta, delta); | ||
143 | } | ||
144 | |||
145 | void vtime_account_idle(struct task_struct *tsk) | ||
146 | { | ||
147 | account_idle_time(vtime_delta(tsk)); | ||
140 | } | 148 | } |
141 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
142 | 149 | ||
143 | /* | 150 | /* |
144 | * Called from the timer interrupt handler to charge accumulated user time | 151 | * Called from the timer interrupt handler to charge accumulated user time |
@@ -146,14 +153,7 @@ EXPORT_SYMBOL_GPL(account_system_vtime);
146 | */ | 153 | */ |
147 | void account_process_tick(struct task_struct *p, int user_tick) | 154 | void account_process_tick(struct task_struct *p, int user_tick) |
148 | { | 155 | { |
149 | struct thread_info *ti = task_thread_info(p); | 156 | vtime_account_user(p); |
150 | cputime_t delta_utime; | ||
151 | |||
152 | if (ti->ac_utime) { | ||
153 | delta_utime = cycle_to_cputime(ti->ac_utime); | ||
154 | account_user_time(p, delta_utime, delta_utime); | ||
155 | ti->ac_utime = 0; | ||
156 | } | ||
157 | } | 157 | } |
158 | 158 | ||
159 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | 159 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ |
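With this conversion the ia64 hooks no longer disable interrupts or decide between idle and system time themselves; vtime_account_system() and vtime_account_idle() just account a computed delta. The idle-versus-system dispatch and the irq masking are expected to move into a common vtime_account() entry point called from __irq_enter()/__irq_exit() (see the hardirq.h change later in this diff). An approximate sketch of that generic wrapper, for architectures that do not supply their own (s390 does, hence its __ARCH_HAS_VTIME_ACCOUNT define below):

/* Approximate sketch; not copied verbatim from this series. */
void vtime_account(struct task_struct *tsk)
{
	unsigned long flags;

	local_irq_save(flags);

	if (in_interrupt() || !is_idle_task(tsk))
		vtime_account_system(tsk);
	else
		vtime_account_idle(tsk);

	local_irq_restore(flags);
}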
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 3b4b4a8da922..c1f267694acb 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -197,12 +197,6 @@ struct cpu_usage {
197 | 197 | ||
198 | DECLARE_PER_CPU(struct cpu_usage, cpu_usage_array); | 198 | DECLARE_PER_CPU(struct cpu_usage, cpu_usage_array); |
199 | 199 | ||
200 | #if defined(CONFIG_VIRT_CPU_ACCOUNTING) | ||
201 | #define account_process_vtime(tsk) account_process_tick(tsk, 0) | ||
202 | #else | ||
203 | #define account_process_vtime(tsk) do { } while (0) | ||
204 | #endif | ||
205 | |||
206 | extern void secondary_cpu_time_init(void); | 200 | extern void secondary_cpu_time_init(void); |
207 | 201 | ||
208 | DECLARE_PER_CPU(u64, decrementers_next_tb); | 202 | DECLARE_PER_CPU(u64, decrementers_next_tb); |
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1a1f2ddfb581..e9cb51f5f801 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -514,9 +514,6 @@ struct task_struct *__switch_to(struct task_struct *prev,
514 | 514 | ||
515 | local_irq_save(flags); | 515 | local_irq_save(flags); |
516 | 516 | ||
517 | account_system_vtime(current); | ||
518 | account_process_vtime(current); | ||
519 | |||
520 | /* | 517 | /* |
521 | * We can't take a PMU exception inside _switch() since there is a | 518 | * We can't take a PMU exception inside _switch() since there is a |
522 | * window where the kernel stack SLB and the kernel stack are out | 519 | * window where the kernel stack SLB and the kernel stack are out |
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index e49e93191b69..eaa9d0e6abca 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -291,13 +291,12 @@ static inline u64 calculate_stolen_time(u64 stop_tb)
291 | * Account time for a transition between system, hard irq | 291 | * Account time for a transition between system, hard irq |
292 | * or soft irq state. | 292 | * or soft irq state. |
293 | */ | 293 | */ |
294 | void account_system_vtime(struct task_struct *tsk) | 294 | static u64 vtime_delta(struct task_struct *tsk, |
295 | u64 *sys_scaled, u64 *stolen) | ||
295 | { | 296 | { |
296 | u64 now, nowscaled, delta, deltascaled; | 297 | u64 now, nowscaled, deltascaled; |
297 | unsigned long flags; | 298 | u64 udelta, delta, user_scaled; |
298 | u64 stolen, udelta, sys_scaled, user_scaled; | ||
299 | 299 | ||
300 | local_irq_save(flags); | ||
301 | now = mftb(); | 300 | now = mftb(); |
302 | nowscaled = read_spurr(now); | 301 | nowscaled = read_spurr(now); |
303 | get_paca()->system_time += now - get_paca()->starttime; | 302 | get_paca()->system_time += now - get_paca()->starttime; |
@@ -305,7 +304,7 @@ void account_system_vtime(struct task_struct *tsk)
305 | deltascaled = nowscaled - get_paca()->startspurr; | 304 | deltascaled = nowscaled - get_paca()->startspurr; |
306 | get_paca()->startspurr = nowscaled; | 305 | get_paca()->startspurr = nowscaled; |
307 | 306 | ||
308 | stolen = calculate_stolen_time(now); | 307 | *stolen = calculate_stolen_time(now); |
309 | 308 | ||
310 | delta = get_paca()->system_time; | 309 | delta = get_paca()->system_time; |
311 | get_paca()->system_time = 0; | 310 | get_paca()->system_time = 0; |
@@ -322,35 +321,45 @@ void account_system_vtime(struct task_struct *tsk)
322 | * the user ticks get saved up in paca->user_time_scaled to be | 321 | * the user ticks get saved up in paca->user_time_scaled to be |
323 | * used by account_process_tick. | 322 | * used by account_process_tick. |
324 | */ | 323 | */ |
325 | sys_scaled = delta; | 324 | *sys_scaled = delta; |
326 | user_scaled = udelta; | 325 | user_scaled = udelta; |
327 | if (deltascaled != delta + udelta) { | 326 | if (deltascaled != delta + udelta) { |
328 | if (udelta) { | 327 | if (udelta) { |
329 | sys_scaled = deltascaled * delta / (delta + udelta); | 328 | *sys_scaled = deltascaled * delta / (delta + udelta); |
330 | user_scaled = deltascaled - sys_scaled; | 329 | user_scaled = deltascaled - *sys_scaled; |
331 | } else { | 330 | } else { |
332 | sys_scaled = deltascaled; | 331 | *sys_scaled = deltascaled; |
333 | } | 332 | } |
334 | } | 333 | } |
335 | get_paca()->user_time_scaled += user_scaled; | 334 | get_paca()->user_time_scaled += user_scaled; |
336 | 335 | ||
337 | if (in_interrupt() || idle_task(smp_processor_id()) != tsk) { | 336 | return delta; |
338 | account_system_time(tsk, 0, delta, sys_scaled); | 337 | } |
339 | if (stolen) | 338 | |
340 | account_steal_time(stolen); | 339 | void vtime_account_system(struct task_struct *tsk) |
341 | } else { | 340 | { |
342 | account_idle_time(delta + stolen); | 341 | u64 delta, sys_scaled, stolen; |
343 | } | 342 | |
344 | local_irq_restore(flags); | 343 | delta = vtime_delta(tsk, &sys_scaled, &stolen); |
344 | account_system_time(tsk, 0, delta, sys_scaled); | ||
345 | if (stolen) | ||
346 | account_steal_time(stolen); | ||
347 | } | ||
348 | |||
349 | void vtime_account_idle(struct task_struct *tsk) | ||
350 | { | ||
351 | u64 delta, sys_scaled, stolen; | ||
352 | |||
353 | delta = vtime_delta(tsk, &sys_scaled, &stolen); | ||
354 | account_idle_time(delta + stolen); | ||
345 | } | 355 | } |
346 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
347 | 356 | ||
348 | /* | 357 | /* |
349 | * Transfer the user and system times accumulated in the paca | 358 | * Transfer the user and system times accumulated in the paca |
350 | * by the exception entry and exit code to the generic process | 359 | * by the exception entry and exit code to the generic process |
351 | * user and system time records. | 360 | * user and system time records. |
352 | * Must be called with interrupts disabled. | 361 | * Must be called with interrupts disabled. |
353 | * Assumes that account_system_vtime() has been called recently | 362 | * Assumes that vtime_account() has been called recently |
354 | * (i.e. since the last entry from usermode) so that | 363 | * (i.e. since the last entry from usermode) so that |
355 | * get_paca()->user_time_scaled is up to date. | 364 | * get_paca()->user_time_scaled is up to date. |
356 | */ | 365 | */ |
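To make the scaling in vtime_delta() above concrete with illustrative numbers (not taken from the patch): if the timebase records delta = 100 ticks of system time and udelta = 100 ticks of user time while the SPURR advanced by deltascaled = 300, then sys_scaled = 300 * 100 / (100 + 100) = 150 and user_scaled = 300 - 150 = 150, i.e. the SPURR-scaled time is split in the same system/user proportion as the raw timebase deltas.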
@@ -366,6 +375,12 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
366 | account_user_time(tsk, utime, utimescaled); | 375 | account_user_time(tsk, utime, utimescaled); |
367 | } | 376 | } |
368 | 377 | ||
378 | void vtime_task_switch(struct task_struct *prev) | ||
379 | { | ||
380 | vtime_account(prev); | ||
381 | account_process_tick(prev, 0); | ||
382 | } | ||
383 | |||
369 | #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ | 384 | #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ |
370 | #define calc_cputime_factors() | 385 | #define calc_cputime_factors() |
371 | #endif | 386 | #endif |
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 30fd01de6bed..72afd2888cad 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
1 | config PPC64 | 1 | config PPC64 |
2 | bool "64-bit kernel" | 2 | bool "64-bit kernel" |
3 | default n | 3 | default n |
4 | select HAVE_VIRT_CPU_ACCOUNTING | ||
4 | help | 5 | help |
5 | This option selects whether a 32-bit or a 64-bit kernel | 6 | This option selects whether a 32-bit or a 64-bit kernel |
6 | will be built. | 7 | will be built. |
@@ -337,21 +338,6 @@ config PPC_MM_SLICES
337 | default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES) | 338 | default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES) |
338 | default n | 339 | default n |
339 | 340 | ||
340 | config VIRT_CPU_ACCOUNTING | ||
341 | bool "Deterministic task and CPU time accounting" | ||
342 | depends on PPC64 | ||
343 | default y | ||
344 | help | ||
345 | Select this option to enable more accurate task and CPU time | ||
346 | accounting. This is done by reading a CPU counter on each | ||
347 | kernel entry and exit and on transitions within the kernel | ||
348 | between system, softirq and hardirq state, so there is a | ||
349 | small performance impact. This also enables accounting of | ||
350 | stolen time on logically-partitioned systems running on | ||
351 | IBM POWER5-based machines. | ||
352 | |||
353 | If in doubt, say Y here. | ||
354 | |||
355 | config PPC_HAVE_PMU_SUPPORT | 341 | config PPC_HAVE_PMU_SUPPORT |
356 | bool | 342 | bool |
357 | 343 | ||
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 107610e01a29..f5ab543396da 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -49,9 +49,6 @@ config GENERIC_LOCKBREAK
49 | config PGSTE | 49 | config PGSTE |
50 | def_bool y if KVM | 50 | def_bool y if KVM |
51 | 51 | ||
52 | config VIRT_CPU_ACCOUNTING | ||
53 | def_bool y | ||
54 | |||
55 | config ARCH_SUPPORTS_DEBUG_PAGEALLOC | 52 | config ARCH_SUPPORTS_DEBUG_PAGEALLOC |
56 | def_bool y | 53 | def_bool y |
57 | 54 | ||
@@ -89,6 +86,8 @@ config S390
89 | select HAVE_MEMBLOCK | 86 | select HAVE_MEMBLOCK |
90 | select HAVE_MEMBLOCK_NODE_MAP | 87 | select HAVE_MEMBLOCK_NODE_MAP |
91 | select HAVE_CMPXCHG_LOCAL | 88 | select HAVE_CMPXCHG_LOCAL |
89 | select HAVE_VIRT_CPU_ACCOUNTING | ||
90 | select VIRT_CPU_ACCOUNTING | ||
92 | select ARCH_DISCARD_MEMBLOCK | 91 | select ARCH_DISCARD_MEMBLOCK |
93 | select BUILDTIME_EXTABLE_SORT | 92 | select BUILDTIME_EXTABLE_SORT |
94 | select ARCH_INLINE_SPIN_TRYLOCK | 93 | select ARCH_INLINE_SPIN_TRYLOCK |
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index 8709bdef233c..023d5ae24482 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
@@ -12,6 +12,9 @@
12 | #include <linux/spinlock.h> | 12 | #include <linux/spinlock.h> |
13 | #include <asm/div64.h> | 13 | #include <asm/div64.h> |
14 | 14 | ||
15 | |||
16 | #define __ARCH_HAS_VTIME_ACCOUNT | ||
17 | |||
15 | /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ | 18 | /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ |
16 | 19 | ||
17 | typedef unsigned long long __nocast cputime_t; | 20 | typedef unsigned long long __nocast cputime_t; |
diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h
index f223068b7822..314cc9426fc4 100644
--- a/arch/s390/include/asm/switch_to.h
+++ b/arch/s390/include/asm/switch_to.h
@@ -89,12 +89,8 @@ static inline void restore_access_regs(unsigned int *acrs)
89 | prev = __switch_to(prev,next); \ | 89 | prev = __switch_to(prev,next); \ |
90 | } while (0) | 90 | } while (0) |
91 | 91 | ||
92 | extern void account_vtime(struct task_struct *, struct task_struct *); | ||
93 | extern void account_tick_vtime(struct task_struct *); | ||
94 | |||
95 | #define finish_arch_switch(prev) do { \ | 92 | #define finish_arch_switch(prev) do { \ |
96 | set_fs(current->thread.mm_segment); \ | 93 | set_fs(current->thread.mm_segment); \ |
97 | account_vtime(prev, current); \ | ||
98 | } while (0) | 94 | } while (0) |
99 | 95 | ||
100 | #endif /* __ASM_SWITCH_TO_H */ | 96 | #endif /* __ASM_SWITCH_TO_H */ |
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 4fc97b40a6e1..cb5093c26d16 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -99,7 +99,7 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset)
99 | return virt_timer_forward(user + system); | 99 | return virt_timer_forward(user + system); |
100 | } | 100 | } |
101 | 101 | ||
102 | void account_vtime(struct task_struct *prev, struct task_struct *next) | 102 | void vtime_task_switch(struct task_struct *prev) |
103 | { | 103 | { |
104 | struct thread_info *ti; | 104 | struct thread_info *ti; |
105 | 105 | ||
@@ -107,7 +107,7 @@ void account_vtime(struct task_struct *prev, struct task_struct *next)
107 | ti = task_thread_info(prev); | 107 | ti = task_thread_info(prev); |
108 | ti->user_timer = S390_lowcore.user_timer; | 108 | ti->user_timer = S390_lowcore.user_timer; |
109 | ti->system_timer = S390_lowcore.system_timer; | 109 | ti->system_timer = S390_lowcore.system_timer; |
110 | ti = task_thread_info(next); | 110 | ti = task_thread_info(current); |
111 | S390_lowcore.user_timer = ti->user_timer; | 111 | S390_lowcore.user_timer = ti->user_timer; |
112 | S390_lowcore.system_timer = ti->system_timer; | 112 | S390_lowcore.system_timer = ti->system_timer; |
113 | } | 113 | } |
@@ -122,7 +122,7 @@ void account_process_tick(struct task_struct *tsk, int user_tick)
122 | * Update process times based on virtual cpu times stored by entry.S | 122 | * Update process times based on virtual cpu times stored by entry.S |
123 | * to the lowcore fields user_timer, system_timer & steal_clock. | 123 | * to the lowcore fields user_timer, system_timer & steal_clock. |
124 | */ | 124 | */ |
125 | void account_system_vtime(struct task_struct *tsk) | 125 | void vtime_account(struct task_struct *tsk) |
126 | { | 126 | { |
127 | struct thread_info *ti = task_thread_info(tsk); | 127 | struct thread_info *ti = task_thread_info(tsk); |
128 | u64 timer, system; | 128 | u64 timer, system; |
@@ -138,7 +138,7 @@ void account_system_vtime(struct task_struct *tsk)
138 | 138 | ||
139 | virt_timer_forward(system); | 139 | virt_timer_forward(system); |
140 | } | 140 | } |
141 | EXPORT_SYMBOL_GPL(account_system_vtime); | 141 | EXPORT_SYMBOL_GPL(vtime_account); |
142 | 142 | ||
143 | void __kprobes vtime_stop_cpu(void) | 143 | void __kprobes vtime_stop_cpu(void) |
144 | { | 144 | { |
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 7a7ce390534f..d5e86c9f74fd 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -69,7 +69,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
69 | | 1*SD_BALANCE_FORK \ | 69 | | 1*SD_BALANCE_FORK \ |
70 | | 0*SD_BALANCE_WAKE \ | 70 | | 0*SD_BALANCE_WAKE \ |
71 | | 0*SD_WAKE_AFFINE \ | 71 | | 0*SD_WAKE_AFFINE \ |
72 | | 0*SD_PREFER_LOCAL \ | ||
73 | | 0*SD_SHARE_CPUPOWER \ | 72 | | 0*SD_SHARE_CPUPOWER \ |
74 | | 0*SD_SHARE_PKG_RESOURCES \ | 73 | | 0*SD_SHARE_PKG_RESOURCES \ |
75 | | 0*SD_SERIALIZE \ | 74 | | 0*SD_SERIALIZE \ |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8ff1f56a0188..488ba8da8fef 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -101,6 +101,7 @@ config X86
101 | select GENERIC_STRNCPY_FROM_USER | 101 | select GENERIC_STRNCPY_FROM_USER |
102 | select GENERIC_STRNLEN_USER | 102 | select GENERIC_STRNLEN_USER |
103 | select HAVE_RCU_USER_QS if X86_64 | 103 | select HAVE_RCU_USER_QS if X86_64 |
104 | select HAVE_IRQ_TIME_ACCOUNTING | ||
104 | 105 | ||
105 | config INSTRUCTION_DECODER | 106 | config INSTRUCTION_DECODER |
106 | def_bool (KPROBES || PERF_EVENTS || UPROBES) | 107 | def_bool (KPROBES || PERF_EVENTS || UPROBES) |
@@ -800,17 +801,6 @@ config SCHED_MC
800 | making when dealing with multi-core CPU chips at a cost of slightly | 801 | making when dealing with multi-core CPU chips at a cost of slightly |
801 | increased overhead in some places. If unsure say N here. | 802 | increased overhead in some places. If unsure say N here. |
802 | 803 | ||
803 | config IRQ_TIME_ACCOUNTING | ||
804 | bool "Fine granularity task level IRQ time accounting" | ||
805 | default n | ||
806 | ---help--- | ||
807 | Select this option to enable fine granularity task irq time | ||
808 | accounting. This is done by reading a timestamp on each | ||
809 | transitions between softirq and hardirq state, so there can be a | ||
810 | small performance impact. | ||
811 | |||
812 | If in doubt, say N here. | ||
813 | |||
814 | source "kernel/Kconfig.preempt" | 804 | source "kernel/Kconfig.preempt" |
815 | 805 | ||
816 | config X86_UP_APIC | 806 | config X86_UP_APIC |
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 305f23cd7cff..cab3da3d0949 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -132,11 +132,11 @@ extern void synchronize_irq(unsigned int irq);
132 | struct task_struct; | 132 | struct task_struct; |
133 | 133 | ||
134 | #if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) | 134 | #if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) |
135 | static inline void account_system_vtime(struct task_struct *tsk) | 135 | static inline void vtime_account(struct task_struct *tsk) |
136 | { | 136 | { |
137 | } | 137 | } |
138 | #else | 138 | #else |
139 | extern void account_system_vtime(struct task_struct *tsk); | 139 | extern void vtime_account(struct task_struct *tsk); |
140 | #endif | 140 | #endif |
141 | 141 | ||
142 | #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) | 142 | #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) |
@@ -162,7 +162,7 @@ extern void rcu_nmi_exit(void);
162 | */ | 162 | */ |
163 | #define __irq_enter() \ | 163 | #define __irq_enter() \ |
164 | do { \ | 164 | do { \ |
165 | account_system_vtime(current); \ | 165 | vtime_account(current); \ |
166 | add_preempt_count(HARDIRQ_OFFSET); \ | 166 | add_preempt_count(HARDIRQ_OFFSET); \ |
167 | trace_hardirq_enter(); \ | 167 | trace_hardirq_enter(); \ |
168 | } while (0) | 168 | } while (0) |
@@ -178,7 +178,7 @@ extern void irq_enter(void);
178 | #define __irq_exit() \ | 178 | #define __irq_exit() \ |
179 | do { \ | 179 | do { \ |
180 | trace_hardirq_exit(); \ | 180 | trace_hardirq_exit(); \ |
181 | account_system_vtime(current); \ | 181 | vtime_account(current); \ |
182 | sub_preempt_count(HARDIRQ_OFFSET); \ | 182 | sub_preempt_count(HARDIRQ_OFFSET); \ |
183 | } while (0) | 183 | } while (0) |
184 | 184 | ||
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 2fbd9053c2df..36d12f0884c3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -130,4 +130,12 @@ extern void account_process_tick(struct task_struct *, int user);
130 | extern void account_steal_ticks(unsigned long ticks); | 130 | extern void account_steal_ticks(unsigned long ticks); |
131 | extern void account_idle_ticks(unsigned long ticks); | 131 | extern void account_idle_ticks(unsigned long ticks); |
132 | 132 | ||
133 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
134 | extern void vtime_task_switch(struct task_struct *prev); | ||
135 | extern void vtime_account_system(struct task_struct *tsk); | ||
136 | extern void vtime_account_idle(struct task_struct *tsk); | ||
137 | #else | ||
138 | static inline void vtime_task_switch(struct task_struct *prev) { } | ||
139 | #endif | ||
140 | |||
133 | #endif /* _LINUX_KERNEL_STAT_H */ | 141 | #endif /* _LINUX_KERNEL_STAT_H */ |
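These declarations spell out the contract for an architecture that selects HAVE_VIRT_CPU_ACCOUNTING, while the !CONFIG_VIRT_CPU_ACCOUNTING stub of vtime_task_switch() keeps the scheduler core free of #ifdefs. A purely hypothetical skeleton of the three hooks, where arch_consume_cputime() is an invented stand-in for "read and reset the per-task cycle counter" (the ia64 and powerpc conversions above show what that really involves):

/* Hypothetical skeleton; arch_consume_cputime() is an invented helper. */
void vtime_account_system(struct task_struct *tsk)
{
	cputime_t delta = arch_consume_cputime(tsk);

	account_system_time(tsk, 0, delta, delta);
}

void vtime_account_idle(struct task_struct *tsk)
{
	account_idle_time(arch_consume_cputime(tsk));
}

void vtime_task_switch(struct task_struct *prev)
{
	/* Flush whatever @prev accumulated before current starts accruing. */
	vtime_account(prev);
}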
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b70b48b01098..8a59e0abe5fa 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -685,7 +685,7 @@ static inline int kvm_deassign_device(struct kvm *kvm,
685 | static inline void kvm_guest_enter(void) | 685 | static inline void kvm_guest_enter(void) |
686 | { | 686 | { |
687 | BUG_ON(preemptible()); | 687 | BUG_ON(preemptible()); |
688 | account_system_vtime(current); | 688 | vtime_account(current); |
689 | current->flags |= PF_VCPU; | 689 | current->flags |= PF_VCPU; |
690 | /* KVM does not hold any references to rcu protected data when it | 690 | /* KVM does not hold any references to rcu protected data when it |
691 | * switches CPU into a guest mode. In fact switching to a guest mode | 691 | * switches CPU into a guest mode. In fact switching to a guest mode |
@@ -699,7 +699,7 @@ static inline void kvm_guest_enter(void)
699 | 699 | ||
700 | static inline void kvm_guest_exit(void) | 700 | static inline void kvm_guest_exit(void) |
701 | { | 701 | { |
702 | account_system_vtime(current); | 702 | vtime_account(current); |
703 | current->flags &= ~PF_VCPU; | 703 | current->flags &= ~PF_VCPU; |
704 | } | 704 | } |
705 | 705 | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 83035269e597..765dffbb085e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -273,11 +273,11 @@ extern void init_idle_bootup_task(struct task_struct *idle);
273 | extern int runqueue_is_locked(int cpu); | 273 | extern int runqueue_is_locked(int cpu); |
274 | 274 | ||
275 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) | 275 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) |
276 | extern void select_nohz_load_balancer(int stop_tick); | 276 | extern void nohz_balance_enter_idle(int cpu); |
277 | extern void set_cpu_sd_state_idle(void); | 277 | extern void set_cpu_sd_state_idle(void); |
278 | extern int get_nohz_timer_target(void); | 278 | extern int get_nohz_timer_target(void); |
279 | #else | 279 | #else |
280 | static inline void select_nohz_load_balancer(int stop_tick) { } | 280 | static inline void nohz_balance_enter_idle(int cpu) { } |
281 | static inline void set_cpu_sd_state_idle(void) { } | 281 | static inline void set_cpu_sd_state_idle(void) { } |
282 | #endif | 282 | #endif |
283 | 283 | ||
@@ -681,11 +681,6 @@ struct signal_struct {
681 | * (notably. ptrace) */ | 681 | * (notably. ptrace) */ |
682 | }; | 682 | }; |
683 | 683 | ||
684 | /* Context switch must be unlocked if interrupts are to be enabled */ | ||
685 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
686 | # define __ARCH_WANT_UNLOCKED_CTXSW | ||
687 | #endif | ||
688 | |||
689 | /* | 684 | /* |
690 | * Bits in flags field of signal_struct. | 685 | * Bits in flags field of signal_struct. |
691 | */ | 686 | */ |
@@ -863,7 +858,6 @@ enum cpu_idle_type {
863 | #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ | 858 | #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ |
864 | #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ | 859 | #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ |
865 | #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ | 860 | #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ |
866 | #define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */ | ||
867 | #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ | 861 | #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ |
868 | #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ | 862 | #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ |
869 | #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ | 863 | #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ |
diff --git a/include/linux/topology.h b/include/linux/topology.h
index fec12d667211..d3cf0d6e7712 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -129,7 +129,6 @@ int arch_update_cpu_topology(void);
129 | | 1*SD_BALANCE_FORK \ | 129 | | 1*SD_BALANCE_FORK \ |
130 | | 0*SD_BALANCE_WAKE \ | 130 | | 0*SD_BALANCE_WAKE \ |
131 | | 1*SD_WAKE_AFFINE \ | 131 | | 1*SD_WAKE_AFFINE \ |
132 | | 0*SD_PREFER_LOCAL \ | ||
133 | | 0*SD_SHARE_CPUPOWER \ | 132 | | 0*SD_SHARE_CPUPOWER \ |
134 | | 1*SD_SHARE_PKG_RESOURCES \ | 133 | | 1*SD_SHARE_PKG_RESOURCES \ |
135 | | 0*SD_SERIALIZE \ | 134 | | 0*SD_SERIALIZE \ |
@@ -160,7 +159,6 @@ int arch_update_cpu_topology(void);
160 | | 1*SD_BALANCE_FORK \ | 159 | | 1*SD_BALANCE_FORK \ |
161 | | 0*SD_BALANCE_WAKE \ | 160 | | 0*SD_BALANCE_WAKE \ |
162 | | 1*SD_WAKE_AFFINE \ | 161 | | 1*SD_WAKE_AFFINE \ |
163 | | 0*SD_PREFER_LOCAL \ | ||
164 | | 0*SD_SHARE_CPUPOWER \ | 162 | | 0*SD_SHARE_CPUPOWER \ |
165 | | 0*SD_SHARE_PKG_RESOURCES \ | 163 | | 0*SD_SHARE_PKG_RESOURCES \ |
166 | | 0*SD_SERIALIZE \ | 164 | | 0*SD_SERIALIZE \ |
diff --git a/init/Kconfig b/init/Kconfig
index c26b8a1d2b57..3466a6e017b7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -267,6 +267,106 @@ config POSIX_MQUEUE_SYSCTL
267 | depends on SYSCTL | 267 | depends on SYSCTL |
268 | default y | 268 | default y |
269 | 269 | ||
270 | config FHANDLE | ||
271 | bool "open by fhandle syscalls" | ||
272 | select EXPORTFS | ||
273 | help | ||
274 | If you say Y here, a user level program will be able to map | ||
275 | file names to handle and then later use the handle for | ||
276 | different file system operations. This is useful in implementing | ||
277 | userspace file servers, which now track files using handles instead | ||
278 | of names. The handle would remain the same even if file names | ||
279 | get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) | ||
280 | syscalls. | ||
281 | |||
282 | config AUDIT | ||
283 | bool "Auditing support" | ||
284 | depends on NET | ||
285 | help | ||
286 | Enable auditing infrastructure that can be used with another | ||
287 | kernel subsystem, such as SELinux (which requires this for | ||
288 | logging of avc messages output). Does not do system-call | ||
289 | auditing without CONFIG_AUDITSYSCALL. | ||
290 | |||
291 | config AUDITSYSCALL | ||
292 | bool "Enable system-call auditing support" | ||
293 | depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT)) | ||
294 | default y if SECURITY_SELINUX | ||
295 | help | ||
296 | Enable low-overhead system-call auditing infrastructure that | ||
297 | can be used independently or with another kernel subsystem, | ||
298 | such as SELinux. | ||
299 | |||
300 | config AUDIT_WATCH | ||
301 | def_bool y | ||
302 | depends on AUDITSYSCALL | ||
303 | select FSNOTIFY | ||
304 | |||
305 | config AUDIT_TREE | ||
306 | def_bool y | ||
307 | depends on AUDITSYSCALL | ||
308 | select FSNOTIFY | ||
309 | |||
310 | config AUDIT_LOGINUID_IMMUTABLE | ||
311 | bool "Make audit loginuid immutable" | ||
312 | depends on AUDIT | ||
313 | help | ||
314 | The config option toggles if a task setting its loginuid requires | ||
315 | CAP_SYS_AUDITCONTROL or if that task should require no special permissions | ||
316 | but should instead only allow setting its loginuid if it was never | ||
317 | previously set. On systems which use systemd or a similar central | ||
318 | process to restart login services this should be set to true. On older | ||
319 | systems in which an admin would typically have to directly stop and | ||
320 | start processes this should be set to false. Setting this to true allows | ||
321 | one to drop potentially dangerous capabilites from the login tasks, | ||
322 | but may not be backwards compatible with older init systems. | ||
323 | |||
324 | source "kernel/irq/Kconfig" | ||
325 | source "kernel/time/Kconfig" | ||
326 | |||
327 | menu "CPU/Task time and stats accounting" | ||
328 | |||
329 | choice | ||
330 | prompt "Cputime accounting" | ||
331 | default TICK_CPU_ACCOUNTING if !PPC64 | ||
332 | default VIRT_CPU_ACCOUNTING if PPC64 | ||
333 | |||
334 | # Kind of a stub config for the pure tick based cputime accounting | ||
335 | config TICK_CPU_ACCOUNTING | ||
336 | bool "Simple tick based cputime accounting" | ||
337 | depends on !S390 | ||
338 | help | ||
339 | This is the basic tick based cputime accounting that maintains | ||
340 | statistics about user, system and idle time spent on per jiffies | ||
341 | granularity. | ||
342 | |||
343 | If unsure, say Y. | ||
344 | |||
345 | config VIRT_CPU_ACCOUNTING | ||
346 | bool "Deterministic task and CPU time accounting" | ||
347 | depends on HAVE_VIRT_CPU_ACCOUNTING | ||
348 | help | ||
349 | Select this option to enable more accurate task and CPU time | ||
350 | accounting. This is done by reading a CPU counter on each | ||
351 | kernel entry and exit and on transitions within the kernel | ||
352 | between system, softirq and hardirq state, so there is a | ||
353 | small performance impact. In the case of s390 or IBM POWER > 5, | ||
354 | this also enables accounting of stolen time on logically-partitioned | ||
355 | systems. | ||
356 | |||
357 | config IRQ_TIME_ACCOUNTING | ||
358 | bool "Fine granularity task level IRQ time accounting" | ||
359 | depends on HAVE_IRQ_TIME_ACCOUNTING | ||
360 | help | ||
361 | Select this option to enable fine granularity task irq time | ||
362 | accounting. This is done by reading a timestamp on each | ||
363 | transition between softirq and hardirq state, so there can be a | ||
364 | small performance impact. | ||
365 | |||
366 | If in doubt, say N here. | ||
367 | |||
368 | endchoice | ||
369 | |||
270 | config BSD_PROCESS_ACCT | 370 | config BSD_PROCESS_ACCT |
271 | bool "BSD Process Accounting" | 371 | bool "BSD Process Accounting" |
272 | help | 372 | help |
@@ -292,18 +392,6 @@ config BSD_PROCESS_ACCT_V3
292 | for processing it. A preliminary version of these tools is available | 392 | for processing it. A preliminary version of these tools is available |
293 | at <http://www.gnu.org/software/acct/>. | 393 | at <http://www.gnu.org/software/acct/>. |
294 | 394 | ||
295 | config FHANDLE | ||
296 | bool "open by fhandle syscalls" | ||
297 | select EXPORTFS | ||
298 | help | ||
299 | If you say Y here, a user level program will be able to map | ||
300 | file names to handle and then later use the handle for | ||
301 | different file system operations. This is useful in implementing | ||
302 | userspace file servers, which now track files using handles instead | ||
303 | of names. The handle would remain the same even if file names | ||
304 | get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) | ||
305 | syscalls. | ||
306 | |||
307 | config TASKSTATS | 395 | config TASKSTATS |
308 | bool "Export task/process statistics through netlink (EXPERIMENTAL)" | 396 | bool "Export task/process statistics through netlink (EXPERIMENTAL)" |
309 | depends on NET | 397 | depends on NET |
@@ -346,50 +434,7 @@ config TASK_IO_ACCOUNTING
346 | 434 | ||
347 | Say N if unsure. | 435 | Say N if unsure. |
348 | 436 | ||
349 | config AUDIT | 437 | endmenu # "CPU/Task time and stats accounting" |
350 | bool "Auditing support" | ||
351 | depends on NET | ||
352 | help | ||
353 | Enable auditing infrastructure that can be used with another | ||
354 | kernel subsystem, such as SELinux (which requires this for | ||
355 | logging of avc messages output). Does not do system-call | ||
356 | auditing without CONFIG_AUDITSYSCALL. | ||
357 | |||
358 | config AUDITSYSCALL | ||
359 | bool "Enable system-call auditing support" | ||
360 | depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT)) | ||
361 | default y if SECURITY_SELINUX | ||
362 | help | ||
363 | Enable low-overhead system-call auditing infrastructure that | ||
364 | can be used independently or with another kernel subsystem, | ||
365 | such as SELinux. | ||
366 | |||
367 | config AUDIT_WATCH | ||
368 | def_bool y | ||
369 | depends on AUDITSYSCALL | ||
370 | select FSNOTIFY | ||
371 | |||
372 | config AUDIT_TREE | ||
373 | def_bool y | ||
374 | depends on AUDITSYSCALL | ||
375 | select FSNOTIFY | ||
376 | |||
377 | config AUDIT_LOGINUID_IMMUTABLE | ||
378 | bool "Make audit loginuid immutable" | ||
379 | depends on AUDIT | ||
380 | help | ||
381 | The config option toggles if a task setting its loginuid requires | ||
382 | CAP_SYS_AUDITCONTROL or if that task should require no special permissions | ||
383 | but should instead only allow setting its loginuid if it was never | ||
384 | previously set. On systems which use systemd or a similar central | ||
385 | process to restart login services this should be set to true. On older | ||
386 | systems in which an admin would typically have to directly stop and | ||
387 | start processes this should be set to false. Setting this to true allows | ||
388 | one to drop potentially dangerous capabilites from the login tasks, | ||
389 | but may not be backwards compatible with older init systems. | ||
390 | |||
391 | source "kernel/irq/Kconfig" | ||
392 | source "kernel/time/Kconfig" | ||
393 | 438 | ||
394 | menu "RCU Subsystem" | 439 | menu "RCU Subsystem" |
395 | 440 | ||
diff --git a/kernel/fork.c b/kernel/fork.c
index 2343c9eaaaf4..5a0e74d89a5a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1276,11 +1276,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1276 | #endif | 1276 | #endif |
1277 | #ifdef CONFIG_TRACE_IRQFLAGS | 1277 | #ifdef CONFIG_TRACE_IRQFLAGS |
1278 | p->irq_events = 0; | 1278 | p->irq_events = 0; |
1279 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1280 | p->hardirqs_enabled = 1; | ||
1281 | #else | ||
1282 | p->hardirqs_enabled = 0; | 1279 | p->hardirqs_enabled = 0; |
1283 | #endif | ||
1284 | p->hardirq_enable_ip = 0; | 1280 | p->hardirq_enable_ip = 0; |
1285 | p->hardirq_enable_event = 0; | 1281 | p->hardirq_enable_event = 0; |
1286 | p->hardirq_disable_ip = _THIS_IP_; | 1282 | p->hardirq_disable_ip = _THIS_IP_; |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 173ea52f3af0..f06d249e103b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer |
12 | endif | 12 | endif |
13 | 13 | ||
14 | obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o | 14 | obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o |
15 | obj-$(CONFIG_SMP) += cpupri.o | 15 | obj-$(CONFIG_SMP) += cpupri.o |
16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3c4dec0594d6..c17747236438 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -740,126 +740,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
740 | dequeue_task(rq, p, flags); | 740 | dequeue_task(rq, p, flags); |
741 | } | 741 | } |
742 | 742 | ||
743 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
744 | |||
745 | /* | ||
746 | * There are no locks covering percpu hardirq/softirq time. | ||
747 | * They are only modified in account_system_vtime, on corresponding CPU | ||
748 | * with interrupts disabled. So, writes are safe. | ||
749 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
750 | * This may result in other CPU reading this CPU's irq time and can | ||
751 | * race with irq/account_system_vtime on this CPU. We would either get old | ||
752 | * or new value with a side effect of accounting a slice of irq time to wrong | ||
753 | * task when irq is in progress while we read rq->clock. That is a worthy | ||
754 | * compromise in place of having locks on each irq in account_system_time. | ||
755 | */ | ||
756 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
757 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
758 | |||
759 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
760 | static int sched_clock_irqtime; | ||
761 | |||
762 | void enable_sched_clock_irqtime(void) | ||
763 | { | ||
764 | sched_clock_irqtime = 1; | ||
765 | } | ||
766 | |||
767 | void disable_sched_clock_irqtime(void) | ||
768 | { | ||
769 | sched_clock_irqtime = 0; | ||
770 | } | ||
771 | |||
772 | #ifndef CONFIG_64BIT | ||
773 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
774 | |||
775 | static inline void irq_time_write_begin(void) | ||
776 | { | ||
777 | __this_cpu_inc(irq_time_seq.sequence); | ||
778 | smp_wmb(); | ||
779 | } | ||
780 | |||
781 | static inline void irq_time_write_end(void) | ||
782 | { | ||
783 | smp_wmb(); | ||
784 | __this_cpu_inc(irq_time_seq.sequence); | ||
785 | } | ||
786 | |||
787 | static inline u64 irq_time_read(int cpu) | ||
788 | { | ||
789 | u64 irq_time; | ||
790 | unsigned seq; | ||
791 | |||
792 | do { | ||
793 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
794 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
795 | per_cpu(cpu_hardirq_time, cpu); | ||
796 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
797 | |||
798 | return irq_time; | ||
799 | } | ||
800 | #else /* CONFIG_64BIT */ | ||
801 | static inline void irq_time_write_begin(void) | ||
802 | { | ||
803 | } | ||
804 | |||
805 | static inline void irq_time_write_end(void) | ||
806 | { | ||
807 | } | ||
808 | |||
809 | static inline u64 irq_time_read(int cpu) | ||
810 | { | ||
811 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
812 | } | ||
813 | #endif /* CONFIG_64BIT */ | ||
814 | |||
815 | /* | ||
816 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
817 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
818 | */ | ||
819 | void account_system_vtime(struct task_struct *curr) | ||
820 | { | ||
821 | unsigned long flags; | ||
822 | s64 delta; | ||
823 | int cpu; | ||
824 | |||
825 | if (!sched_clock_irqtime) | ||
826 | return; | ||
827 | |||
828 | local_irq_save(flags); | ||
829 | |||
830 | cpu = smp_processor_id(); | ||
831 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | ||
832 | __this_cpu_add(irq_start_time, delta); | ||
833 | |||
834 | irq_time_write_begin(); | ||
835 | /* | ||
836 | * We do not account for softirq time from ksoftirqd here. | ||
837 | * We want to continue accounting softirq time to ksoftirqd thread | ||
838 | * in that case, so as not to confuse scheduler with a special task | ||
839 | * that do not consume any time, but still wants to run. | ||
840 | */ | ||
841 | if (hardirq_count()) | ||
842 | __this_cpu_add(cpu_hardirq_time, delta); | ||
843 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | ||
844 | __this_cpu_add(cpu_softirq_time, delta); | ||
845 | |||
846 | irq_time_write_end(); | ||
847 | local_irq_restore(flags); | ||
848 | } | ||
849 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
850 | |||
851 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
852 | |||
853 | #ifdef CONFIG_PARAVIRT | ||
854 | static inline u64 steal_ticks(u64 steal) | ||
855 | { | ||
856 | if (unlikely(steal > NSEC_PER_SEC)) | ||
857 | return div_u64(steal, TICK_NSEC); | ||
858 | |||
859 | return __iter_div_u64_rem(steal, TICK_NSEC, &steal); | ||
860 | } | ||
861 | #endif | ||
862 | |||
863 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 743 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
864 | { | 744 | { |
865 | /* | 745 | /* |
@@ -920,43 +800,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
920 | #endif | 800 | #endif |
921 | } | 801 | } |
922 | 802 | ||
923 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
924 | static int irqtime_account_hi_update(void) | ||
925 | { | ||
926 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
927 | unsigned long flags; | ||
928 | u64 latest_ns; | ||
929 | int ret = 0; | ||
930 | |||
931 | local_irq_save(flags); | ||
932 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
933 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) | ||
934 | ret = 1; | ||
935 | local_irq_restore(flags); | ||
936 | return ret; | ||
937 | } | ||
938 | |||
939 | static int irqtime_account_si_update(void) | ||
940 | { | ||
941 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
942 | unsigned long flags; | ||
943 | u64 latest_ns; | ||
944 | int ret = 0; | ||
945 | |||
946 | local_irq_save(flags); | ||
947 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
948 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) | ||
949 | ret = 1; | ||
950 | local_irq_restore(flags); | ||
951 | return ret; | ||
952 | } | ||
953 | |||
954 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
955 | |||
956 | #define sched_clock_irqtime (0) | ||
957 | |||
958 | #endif | ||
959 | |||
960 | void sched_set_stop_task(int cpu, struct task_struct *stop) | 803 | void sched_set_stop_task(int cpu, struct task_struct *stop) |
961 | { | 804 | { |
962 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | 805 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; |
@@ -1518,25 +1361,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
1518 | smp_send_reschedule(cpu); | 1361 | smp_send_reschedule(cpu); |
1519 | } | 1362 | } |
1520 | 1363 | ||
1521 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1522 | static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | ||
1523 | { | ||
1524 | struct rq *rq; | ||
1525 | int ret = 0; | ||
1526 | |||
1527 | rq = __task_rq_lock(p); | ||
1528 | if (p->on_cpu) { | ||
1529 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | ||
1530 | ttwu_do_wakeup(rq, p, wake_flags); | ||
1531 | ret = 1; | ||
1532 | } | ||
1533 | __task_rq_unlock(rq); | ||
1534 | |||
1535 | return ret; | ||
1536 | |||
1537 | } | ||
1538 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
1539 | |||
1540 | bool cpus_share_cache(int this_cpu, int that_cpu) | 1364 | bool cpus_share_cache(int this_cpu, int that_cpu) |
1541 | { | 1365 | { |
1542 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); | 1366 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); |
@@ -1597,21 +1421,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1597 | * If the owning (remote) cpu is still in the middle of schedule() with | 1421 | * If the owning (remote) cpu is still in the middle of schedule() with |
1598 | * this task as prev, wait until its done referencing the task. | 1422 | * this task as prev, wait until its done referencing the task. |
1599 | */ | 1423 | */ |
1600 | while (p->on_cpu) { | 1424 | while (p->on_cpu) |
1601 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1602 | /* | ||
1603 | * In case the architecture enables interrupts in | ||
1604 | * context_switch(), we cannot busy wait, since that | ||
1605 | * would lead to deadlocks when an interrupt hits and | ||
1606 | * tries to wake up @prev. So bail and do a complete | ||
1607 | * remote wakeup. | ||
1608 | */ | ||
1609 | if (ttwu_activate_remote(p, wake_flags)) | ||
1610 | goto stat; | ||
1611 | #else | ||
1612 | cpu_relax(); | 1425 | cpu_relax(); |
1613 | #endif | ||
1614 | } | ||
1615 | /* | 1426 | /* |
1616 | * Pairs with the smp_wmb() in finish_lock_switch(). | 1427 | * Pairs with the smp_wmb() in finish_lock_switch(). |
1617 | */ | 1428 | */ |
@@ -1953,14 +1764,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1953 | * Manfred Spraul <manfred@colorfullife.com> | 1764 | * Manfred Spraul <manfred@colorfullife.com> |
1954 | */ | 1765 | */ |
1955 | prev_state = prev->state; | 1766 | prev_state = prev->state; |
1767 | vtime_task_switch(prev); | ||
1956 | finish_arch_switch(prev); | 1768 | finish_arch_switch(prev); |
1957 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1958 | local_irq_disable(); | ||
1959 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
1960 | perf_event_task_sched_in(prev, current); | 1769 | perf_event_task_sched_in(prev, current); |
1961 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1962 | local_irq_enable(); | ||
1963 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
1964 | finish_lock_switch(rq, prev); | 1770 | finish_lock_switch(rq, prev); |
1965 | finish_arch_post_lock_switch(); | 1771 | finish_arch_post_lock_switch(); |
1966 | 1772 | ||
@@ -2810,404 +2616,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2810 | return ns; | 2616 | return ns; |
2811 | } | 2617 | } |
2812 | 2618 | ||
2813 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2814 | struct cgroup_subsys cpuacct_subsys; | ||
2815 | struct cpuacct root_cpuacct; | ||
2816 | #endif | ||
2817 | |||
2818 | static inline void task_group_account_field(struct task_struct *p, int index, | ||
2819 | u64 tmp) | ||
2820 | { | ||
2821 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2822 | struct kernel_cpustat *kcpustat; | ||
2823 | struct cpuacct *ca; | ||
2824 | #endif | ||
2825 | /* | ||
2826 | * Since all updates are sure to touch the root cgroup, we | ||
2827 | * get ourselves ahead and touch it first. If the root cgroup | ||
2828 | * is the only cgroup, then nothing else should be necessary. | ||
2829 | * | ||
2830 | */ | ||
2831 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | ||
2832 | |||
2833 | #ifdef CONFIG_CGROUP_CPUACCT | ||
2834 | if (unlikely(!cpuacct_subsys.active)) | ||
2835 | return; | ||
2836 | |||
2837 | rcu_read_lock(); | ||
2838 | ca = task_ca(p); | ||
2839 | while (ca && (ca != &root_cpuacct)) { | ||
2840 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
2841 | kcpustat->cpustat[index] += tmp; | ||
2842 | ca = parent_ca(ca); | ||
2843 | } | ||
2844 | rcu_read_unlock(); | ||
2845 | #endif | ||
2846 | } | ||
2847 | |||
2848 | |||
2849 | /* | ||
2850 | * Account user cpu time to a process. | ||
2851 | * @p: the process that the cpu time gets accounted to | ||
2852 | * @cputime: the cpu time spent in user space since the last update | ||
2853 | * @cputime_scaled: cputime scaled by cpu frequency | ||
2854 | */ | ||
2855 | void account_user_time(struct task_struct *p, cputime_t cputime, | ||
2856 | cputime_t cputime_scaled) | ||
2857 | { | ||
2858 | int index; | ||
2859 | |||
2860 | /* Add user time to process. */ | ||
2861 | p->utime += cputime; | ||
2862 | p->utimescaled += cputime_scaled; | ||
2863 | account_group_user_time(p, cputime); | ||
2864 | |||
2865 | index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | ||
2866 | |||
2867 | /* Add user time to cpustat. */ | ||
2868 | task_group_account_field(p, index, (__force u64) cputime); | ||
2869 | |||
2870 | /* Account for user time used */ | ||
2871 | acct_update_integrals(p); | ||
2872 | } | ||
2873 | |||
2874 | /* | ||
2875 | * Account guest cpu time to a process. | ||
2876 | * @p: the process that the cpu time gets accounted to | ||
2877 | * @cputime: the cpu time spent in virtual machine since the last update | ||
2878 | * @cputime_scaled: cputime scaled by cpu frequency | ||
2879 | */ | ||
2880 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | ||
2881 | cputime_t cputime_scaled) | ||
2882 | { | ||
2883 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
2884 | |||
2885 | /* Add guest time to process. */ | ||
2886 | p->utime += cputime; | ||
2887 | p->utimescaled += cputime_scaled; | ||
2888 | account_group_user_time(p, cputime); | ||
2889 | p->gtime += cputime; | ||
2890 | |||
2891 | /* Add guest time to cpustat. */ | ||
2892 | if (TASK_NICE(p) > 0) { | ||
2893 | cpustat[CPUTIME_NICE] += (__force u64) cputime; | ||
2894 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; | ||
2895 | } else { | ||
2896 | cpustat[CPUTIME_USER] += (__force u64) cputime; | ||
2897 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; | ||
2898 | } | ||
2899 | } | ||
2900 | |||
2901 | /* | ||
2902 | * Account system cpu time to a process and desired cpustat field | ||
2903 | * @p: the process that the cpu time gets accounted to | ||
2904 | * @cputime: the cpu time spent in kernel space since the last update | ||
2905 | * @cputime_scaled: cputime scaled by cpu frequency | ||
2906 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
2907 | */ | ||
2908 | static inline | ||
2909 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
2910 | cputime_t cputime_scaled, int index) | ||
2911 | { | ||
2912 | /* Add system time to process. */ | ||
2913 | p->stime += cputime; | ||
2914 | p->stimescaled += cputime_scaled; | ||
2915 | account_group_system_time(p, cputime); | ||
2916 | |||
2917 | /* Add system time to cpustat. */ | ||
2918 | task_group_account_field(p, index, (__force u64) cputime); | ||
2919 | |||
2920 | /* Account for system time used */ | ||
2921 | acct_update_integrals(p); | ||
2922 | } | ||
2923 | |||
2924 | /* | ||
2925 | * Account system cpu time to a process. | ||
2926 | * @p: the process that the cpu time gets accounted to | ||
2927 | * @hardirq_offset: the offset to subtract from hardirq_count() | ||
2928 | * @cputime: the cpu time spent in kernel space since the last update | ||
2929 | * @cputime_scaled: cputime scaled by cpu frequency | ||
2930 | */ | ||
2931 | void account_system_time(struct task_struct *p, int hardirq_offset, | ||
2932 | cputime_t cputime, cputime_t cputime_scaled) | ||
2933 | { | ||
2934 | int index; | ||
2935 | |||
2936 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | ||
2937 | account_guest_time(p, cputime, cputime_scaled); | ||
2938 | return; | ||
2939 | } | ||
2940 | |||
2941 | if (hardirq_count() - hardirq_offset) | ||
2942 | index = CPUTIME_IRQ; | ||
2943 | else if (in_serving_softirq()) | ||
2944 | index = CPUTIME_SOFTIRQ; | ||
2945 | else | ||
2946 | index = CPUTIME_SYSTEM; | ||
2947 | |||
2948 | __account_system_time(p, cputime, cputime_scaled, index); | ||
2949 | } | ||
2950 | |||
2951 | /* | ||
2952 | * Account for involuntary wait time. | ||
2953 | * @cputime: the cpu time spent in involuntary wait | ||
2954 | */ | ||
2955 | void account_steal_time(cputime_t cputime) | ||
2956 | { | ||
2957 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
2958 | |||
2959 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; | ||
2960 | } | ||
2961 | |||
2962 | /* | ||
2963 | * Account for idle time. | ||
2964 | * @cputime: the cpu time spent in idle wait | ||
2965 | */ | ||
2966 | void account_idle_time(cputime_t cputime) | ||
2967 | { | ||
2968 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
2969 | struct rq *rq = this_rq(); | ||
2970 | |||
2971 | if (atomic_read(&rq->nr_iowait) > 0) | ||
2972 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; | ||
2973 | else | ||
2974 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | ||
2975 | } | ||
2976 | |||
2977 | static __always_inline bool steal_account_process_tick(void) | ||
2978 | { | ||
2979 | #ifdef CONFIG_PARAVIRT | ||
2980 | if (static_key_false(¶virt_steal_enabled)) { | ||
2981 | u64 steal, st = 0; | ||
2982 | |||
2983 | steal = paravirt_steal_clock(smp_processor_id()); | ||
2984 | steal -= this_rq()->prev_steal_time; | ||
2985 | |||
2986 | st = steal_ticks(steal); | ||
2987 | this_rq()->prev_steal_time += st * TICK_NSEC; | ||
2988 | |||
2989 | account_steal_time(st); | ||
2990 | return st; | ||
2991 | } | ||
2992 | #endif | ||
2993 | return false; | ||
2994 | } | ||
2995 | |||
2996 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
2997 | |||
2998 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
2999 | /* | ||
3000 | * Account a tick to a process and cpustat | ||
3001 | * @p: the process that the cpu time gets accounted to | ||
3002 | * @user_tick: is the tick from userspace | ||
3003 | * @rq: the pointer to rq | ||
3004 | * | ||
3005 | * Tick demultiplexing follows the order | ||
3006 | * - pending hardirq update | ||
3007 | * - pending softirq update | ||
3008 | * - user_time | ||
3009 | * - idle_time | ||
3010 | * - system time | ||
3011 | * - check for guest_time | ||
3012 | * - else account as system_time | ||
3013 | * | ||
3014 | * Check for hardirq is done both for system and user time as there is | ||
3015 | * no timer going off while we are on hardirq and hence we may never get an | ||
3016 | * opportunity to update it solely in system time. | ||
3017 | * p->stime and friends are only updated on system time and not on irq | ||
3018 | * softirq as those do not count in task exec_runtime any more. | ||
3019 | */ | ||
3020 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3021 | struct rq *rq) | ||
3022 | { | ||
3023 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
3024 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
3025 | |||
3026 | if (steal_account_process_tick()) | ||
3027 | return; | ||
3028 | |||
3029 | if (irqtime_account_hi_update()) { | ||
3030 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; | ||
3031 | } else if (irqtime_account_si_update()) { | ||
3032 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; | ||
3033 | } else if (this_cpu_ksoftirqd() == p) { | ||
3034 | /* | ||
3035 | * ksoftirqd time do not get accounted in cpu_softirq_time. | ||
3036 | * So, we have to handle it separately here. | ||
3037 | * Also, p->stime needs to be updated for ksoftirqd. | ||
3038 | */ | ||
3039 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3040 | CPUTIME_SOFTIRQ); | ||
3041 | } else if (user_tick) { | ||
3042 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3043 | } else if (p == rq->idle) { | ||
3044 | account_idle_time(cputime_one_jiffy); | ||
3045 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
3046 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3047 | } else { | ||
3048 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3049 | CPUTIME_SYSTEM); | ||
3050 | } | ||
3051 | } | ||
3052 | |||
3053 | static void irqtime_account_idle_ticks(int ticks) | ||
3054 | { | ||
3055 | int i; | ||
3056 | struct rq *rq = this_rq(); | ||
3057 | |||
3058 | for (i = 0; i < ticks; i++) | ||
3059 | irqtime_account_process_tick(current, 0, rq); | ||
3060 | } | ||
3061 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3062 | static void irqtime_account_idle_ticks(int ticks) {} | ||
3063 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3064 | struct rq *rq) {} | ||
3065 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3066 | |||
3067 | /* | ||
3068 | * Account a single tick of cpu time. | ||
3069 | * @p: the process that the cpu time gets accounted to | ||
3070 | * @user_tick: indicates if the tick is a user or a system tick | ||
3071 | */ | ||
3072 | void account_process_tick(struct task_struct *p, int user_tick) | ||
3073 | { | ||
3074 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
3075 | struct rq *rq = this_rq(); | ||
3076 | |||
3077 | if (sched_clock_irqtime) { | ||
3078 | irqtime_account_process_tick(p, user_tick, rq); | ||
3079 | return; | ||
3080 | } | ||
3081 | |||
3082 | if (steal_account_process_tick()) | ||
3083 | return; | ||
3084 | |||
3085 | if (user_tick) | ||
3086 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3087 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | ||
3088 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, | ||
3089 | one_jiffy_scaled); | ||
3090 | else | ||
3091 | account_idle_time(cputime_one_jiffy); | ||
3092 | } | ||
3093 | |||
3094 | /* | ||
3095 | * Account multiple ticks of steal time. | ||
3096 | * @p: the process from which the cpu time has been stolen | ||
3097 | * @ticks: number of stolen ticks | ||
3098 | */ | ||
3099 | void account_steal_ticks(unsigned long ticks) | ||
3100 | { | ||
3101 | account_steal_time(jiffies_to_cputime(ticks)); | ||
3102 | } | ||
3103 | |||
3104 | /* | ||
3105 | * Account multiple ticks of idle time. | ||
3106 | * @ticks: number of stolen ticks | ||
3107 | */ | ||
3108 | void account_idle_ticks(unsigned long ticks) | ||
3109 | { | ||
3110 | |||
3111 | if (sched_clock_irqtime) { | ||
3112 | irqtime_account_idle_ticks(ticks); | ||
3113 | return; | ||
3114 | } | ||
3115 | |||
3116 | account_idle_time(jiffies_to_cputime(ticks)); | ||
3117 | } | ||
3118 | |||
3119 | #endif | ||
3120 | |||
3121 | /* | ||
3122 | * Use precise platform statistics if available: | ||
3123 | */ | ||
3124 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
3125 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
3126 | { | ||
3127 | *ut = p->utime; | ||
3128 | *st = p->stime; | ||
3129 | } | ||
3130 | |||
3131 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
3132 | { | ||
3133 | struct task_cputime cputime; | ||
3134 | |||
3135 | thread_group_cputime(p, &cputime); | ||
3136 | |||
3137 | *ut = cputime.utime; | ||
3138 | *st = cputime.stime; | ||
3139 | } | ||
3140 | #else | ||
3141 | |||
3142 | #ifndef nsecs_to_cputime | ||
3143 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | ||
3144 | #endif | ||
3145 | |||
3146 | static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) | ||
3147 | { | ||
3148 | u64 temp = (__force u64) rtime; | ||
3149 | |||
3150 | temp *= (__force u64) utime; | ||
3151 | |||
3152 | if (sizeof(cputime_t) == 4) | ||
3153 | temp = div_u64(temp, (__force u32) total); | ||
3154 | else | ||
3155 | temp = div64_u64(temp, (__force u64) total); | ||
3156 | |||
3157 | return (__force cputime_t) temp; | ||
3158 | } | ||
3159 | |||
3160 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
3161 | { | ||
3162 | cputime_t rtime, utime = p->utime, total = utime + p->stime; | ||
3163 | |||
3164 | /* | ||
3165 | * Use CFS's precise accounting: | ||
3166 | */ | ||
3167 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | ||
3168 | |||
3169 | if (total) | ||
3170 | utime = scale_utime(utime, rtime, total); | ||
3171 | else | ||
3172 | utime = rtime; | ||
3173 | |||
3174 | /* | ||
3175 | * Compare with previous values, to keep monotonicity: | ||
3176 | */ | ||
3177 | p->prev_utime = max(p->prev_utime, utime); | ||
3178 | p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); | ||
3179 | |||
3180 | *ut = p->prev_utime; | ||
3181 | *st = p->prev_stime; | ||
3182 | } | ||
3183 | |||
3184 | /* | ||
3185 | * Must be called with siglock held. | ||
3186 | */ | ||
3187 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
3188 | { | ||
3189 | struct signal_struct *sig = p->signal; | ||
3190 | struct task_cputime cputime; | ||
3191 | cputime_t rtime, utime, total; | ||
3192 | |||
3193 | thread_group_cputime(p, &cputime); | ||
3194 | |||
3195 | total = cputime.utime + cputime.stime; | ||
3196 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | ||
3197 | |||
3198 | if (total) | ||
3199 | utime = scale_utime(cputime.utime, rtime, total); | ||
3200 | else | ||
3201 | utime = rtime; | ||
3202 | |||
3203 | sig->prev_utime = max(sig->prev_utime, utime); | ||
3204 | sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); | ||
3205 | |||
3206 | *ut = sig->prev_utime; | ||
3207 | *st = sig->prev_stime; | ||
3208 | } | ||
3209 | #endif | ||
3210 | |||
3211 | /* | 2619 | /* |
3212 | * This function gets called by the timer code, with HZ frequency. | 2620 | * This function gets called by the timer code, with HZ frequency. |
3213 | * We call it with interrupts disabled. | 2621 | * We call it with interrupts disabled. |
@@ -3368,6 +2776,40 @@ pick_next_task(struct rq *rq) | |||
3368 | 2776 | ||
3369 | /* | 2777 | /* |
3370 | * __schedule() is the main scheduler function. | 2778 | * __schedule() is the main scheduler function. |
2779 | * | ||
2780 | * The main means of driving the scheduler and thus entering this function are: | ||
2781 | * | ||
2782 | * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. | ||
2783 | * | ||
2784 | * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return | ||
2785 | * paths. For example, see arch/x86/entry_64.S. | ||
2786 | * | ||
2787 | * To drive preemption between tasks, the scheduler sets the flag in timer | ||
2788 | * interrupt handler scheduler_tick(). | ||
2789 | * | ||
2790 | * 3. Wakeups don't really cause entry into schedule(). They add a | ||
2791 | * task to the run-queue and that's it. | ||
2792 | * | ||
2793 | * Now, if the new task added to the run-queue preempts the current | ||
2794 | * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets | ||
2795 | * called on the nearest possible occasion: | ||
2796 | * | ||
2797 | * - If the kernel is preemptible (CONFIG_PREEMPT=y): | ||
2798 | * | ||
2799 | * - in syscall or exception context, at the next outermost | ||
2800 | * preempt_enable(). (this might be as soon as the wake_up()'s | ||
2801 | * spin_unlock()!) | ||
2802 | * | ||
2803 | * - in IRQ context, return from interrupt-handler to | ||
2804 | * preemptible context | ||
2805 | * | ||
2806 | * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) | ||
2807 | * then at the next: | ||
2808 | * | ||
2809 | * - cond_resched() call | ||
2810 | * - explicit schedule() call | ||
2811 | * - return from syscall or exception to user-space | ||
2812 | * - return from interrupt-handler to user-space | ||
3371 | */ | 2813 | */ |
3372 | static void __sched __schedule(void) | 2814 | static void __sched __schedule(void) |
3373 | { | 2815 | { |
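As an aside on the comment block just added above: the TIF_NEED_RESCHED mechanism it describes is exactly what a voluntary preemption point boils down to. A minimal sketch (illustrative only, not the kernel's actual cond_resched() implementation; need_resched() and schedule() are the real APIs):

	/* Voluntary preemption point for the CONFIG_PREEMPT=n case described
	 * above: scheduler_tick() or a wakeup sets TIF_NEED_RESCHED, and the
	 * next check like this one enters __schedule(). */
	static inline void example_resched_point(void)
	{
		if (need_resched())	/* tests TIF_NEED_RESCHED */
			schedule();	/* explicit schedule() call */
	}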
@@ -4885,13 +4327,6 @@ again: | |||
4885 | */ | 4327 | */ |
4886 | if (preempt && rq != p_rq) | 4328 | if (preempt && rq != p_rq) |
4887 | resched_task(p_rq->curr); | 4329 | resched_task(p_rq->curr); |
4888 | } else { | ||
4889 | /* | ||
4890 | * We might have set it in task_yield_fair(), but are | ||
4891 | * not going to schedule(), so don't want to skip | ||
4892 | * the next update. | ||
4893 | */ | ||
4894 | rq->skip_clock_update = 0; | ||
4895 | } | 4330 | } |
4896 | 4331 | ||
4897 | out: | 4332 | out: |
@@ -5433,16 +4868,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) | |||
5433 | *tablep = NULL; | 4868 | *tablep = NULL; |
5434 | } | 4869 | } |
5435 | 4870 | ||
4871 | static int min_load_idx = 0; | ||
4872 | static int max_load_idx = CPU_LOAD_IDX_MAX; | ||
4873 | |||
5436 | static void | 4874 | static void |
5437 | set_table_entry(struct ctl_table *entry, | 4875 | set_table_entry(struct ctl_table *entry, |
5438 | const char *procname, void *data, int maxlen, | 4876 | const char *procname, void *data, int maxlen, |
5439 | umode_t mode, proc_handler *proc_handler) | 4877 | umode_t mode, proc_handler *proc_handler, |
4878 | bool load_idx) | ||
5440 | { | 4879 | { |
5441 | entry->procname = procname; | 4880 | entry->procname = procname; |
5442 | entry->data = data; | 4881 | entry->data = data; |
5443 | entry->maxlen = maxlen; | 4882 | entry->maxlen = maxlen; |
5444 | entry->mode = mode; | 4883 | entry->mode = mode; |
5445 | entry->proc_handler = proc_handler; | 4884 | entry->proc_handler = proc_handler; |
4885 | |||
4886 | if (load_idx) { | ||
4887 | entry->extra1 = &min_load_idx; | ||
4888 | entry->extra2 = &max_load_idx; | ||
4889 | } | ||
5446 | } | 4890 | } |
5447 | 4891 | ||
5448 | static struct ctl_table * | 4892 | static struct ctl_table * |
@@ -5454,30 +4898,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
5454 | return NULL; | 4898 | return NULL; |
5455 | 4899 | ||
5456 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 4900 | set_table_entry(&table[0], "min_interval", &sd->min_interval, |
5457 | sizeof(long), 0644, proc_doulongvec_minmax); | 4901 | sizeof(long), 0644, proc_doulongvec_minmax, false); |
5458 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | 4902 | set_table_entry(&table[1], "max_interval", &sd->max_interval, |
5459 | sizeof(long), 0644, proc_doulongvec_minmax); | 4903 | sizeof(long), 0644, proc_doulongvec_minmax, false); |
5460 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | 4904 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, |
5461 | sizeof(int), 0644, proc_dointvec_minmax); | 4905 | sizeof(int), 0644, proc_dointvec_minmax, true); |
5462 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | 4906 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, |
5463 | sizeof(int), 0644, proc_dointvec_minmax); | 4907 | sizeof(int), 0644, proc_dointvec_minmax, true); |
5464 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | 4908 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, |
5465 | sizeof(int), 0644, proc_dointvec_minmax); | 4909 | sizeof(int), 0644, proc_dointvec_minmax, true); |
5466 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | 4910 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, |
5467 | sizeof(int), 0644, proc_dointvec_minmax); | 4911 | sizeof(int), 0644, proc_dointvec_minmax, true); |
5468 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | 4912 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, |
5469 | sizeof(int), 0644, proc_dointvec_minmax); | 4913 | sizeof(int), 0644, proc_dointvec_minmax, true); |
5470 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | 4914 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, |
5471 | sizeof(int), 0644, proc_dointvec_minmax); | 4915 | sizeof(int), 0644, proc_dointvec_minmax, false); |
5472 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | 4916 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, |
5473 | sizeof(int), 0644, proc_dointvec_minmax); | 4917 | sizeof(int), 0644, proc_dointvec_minmax, false); |
5474 | set_table_entry(&table[9], "cache_nice_tries", | 4918 | set_table_entry(&table[9], "cache_nice_tries", |
5475 | &sd->cache_nice_tries, | 4919 | &sd->cache_nice_tries, |
5476 | sizeof(int), 0644, proc_dointvec_minmax); | 4920 | sizeof(int), 0644, proc_dointvec_minmax, false); |
5477 | set_table_entry(&table[10], "flags", &sd->flags, | 4921 | set_table_entry(&table[10], "flags", &sd->flags, |
5478 | sizeof(int), 0644, proc_dointvec_minmax); | 4922 | sizeof(int), 0644, proc_dointvec_minmax, false); |
5479 | set_table_entry(&table[11], "name", sd->name, | 4923 | set_table_entry(&table[11], "name", sd->name, |
5480 | CORENAME_MAX_SIZE, 0444, proc_dostring); | 4924 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); |
5481 | /* &table[12] is terminator */ | 4925 | /* &table[12] is terminator */ |
5482 | 4926 | ||
5483 | return table; | 4927 | return table; |
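For reference, the load_idx plumbing above bounds the *_idx sysctls through proc_dointvec_minmax(), which rejects writes outside [*extra1, *extra2] with -EINVAL. A minimal sketch of the same pattern for a standalone table (my_idx and my_table are hypothetical names; the handler and ctl_table fields are the real sysctl API):

	static int my_idx;
	static int idx_min = 0, idx_max = CPU_LOAD_IDX_MAX;

	static struct ctl_table my_table[] = {
		{
			.procname	= "my_idx",
			.data		= &my_idx,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_minmax,
			.extra1		= &idx_min,	/* lower bound */
			.extra2		= &idx_max,	/* upper bound */
		},
		{ }	/* terminator */
	};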
@@ -6556,7 +6000,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | |||
6556 | | 0*SD_BALANCE_FORK | 6000 | | 0*SD_BALANCE_FORK |
6557 | | 0*SD_BALANCE_WAKE | 6001 | | 0*SD_BALANCE_WAKE |
6558 | | 0*SD_WAKE_AFFINE | 6002 | | 0*SD_WAKE_AFFINE |
6559 | | 0*SD_PREFER_LOCAL | ||
6560 | | 0*SD_SHARE_CPUPOWER | 6003 | | 0*SD_SHARE_CPUPOWER |
6561 | | 0*SD_SHARE_PKG_RESOURCES | 6004 | | 0*SD_SHARE_PKG_RESOURCES |
6562 | | 1*SD_SERIALIZE | 6005 | | 1*SD_SERIALIZE |
@@ -8354,6 +7797,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
8354 | * (balbir@in.ibm.com). | 7797 | * (balbir@in.ibm.com). |
8355 | */ | 7798 | */ |
8356 | 7799 | ||
7800 | struct cpuacct root_cpuacct; | ||
7801 | |||
8357 | /* create a new cpu accounting group */ | 7802 | /* create a new cpu accounting group */ |
8358 | static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) | 7803 | static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) |
8359 | { | 7804 | { |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c new file mode 100644 index 000000000000..81b763ba58a6 --- /dev/null +++ b/kernel/sched/cputime.c | |||
@@ -0,0 +1,530 @@ | |||
1 | #include <linux/export.h> | ||
2 | #include <linux/sched.h> | ||
3 | #include <linux/tsacct_kern.h> | ||
4 | #include <linux/kernel_stat.h> | ||
5 | #include <linux/static_key.h> | ||
6 | #include "sched.h" | ||
7 | |||
8 | |||
9 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
10 | |||
11 | /* | ||
12 | * There are no locks covering percpu hardirq/softirq time. | ||
13 | * They are only modified in vtime_account, on corresponding CPU | ||
14 | * with interrupts disabled. So, writes are safe. | ||
15 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
16 | * This may result in another CPU reading this CPU's irq time and can | ||
17 | * race with irq/vtime_account on this CPU. We would either get old | ||
18 | * or new value with a side effect of accounting a slice of irq time to wrong | ||
19 | * task when irq is in progress while we read rq->clock. That is a worthy | ||
20 | * compromise in place of having locks on each irq in account_system_time. | ||
21 | */ | ||
22 | DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
23 | DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
24 | |||
25 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
26 | static int sched_clock_irqtime; | ||
27 | |||
28 | void enable_sched_clock_irqtime(void) | ||
29 | { | ||
30 | sched_clock_irqtime = 1; | ||
31 | } | ||
32 | |||
33 | void disable_sched_clock_irqtime(void) | ||
34 | { | ||
35 | sched_clock_irqtime = 0; | ||
36 | } | ||
37 | |||
38 | #ifndef CONFIG_64BIT | ||
39 | DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
40 | #endif /* CONFIG_64BIT */ | ||
41 | |||
42 | /* | ||
43 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
44 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
45 | */ | ||
46 | void vtime_account(struct task_struct *curr) | ||
47 | { | ||
48 | unsigned long flags; | ||
49 | s64 delta; | ||
50 | int cpu; | ||
51 | |||
52 | if (!sched_clock_irqtime) | ||
53 | return; | ||
54 | |||
55 | local_irq_save(flags); | ||
56 | |||
57 | cpu = smp_processor_id(); | ||
58 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | ||
59 | __this_cpu_add(irq_start_time, delta); | ||
60 | |||
61 | irq_time_write_begin(); | ||
62 | /* | ||
63 | * We do not account for softirq time from ksoftirqd here. | ||
64 | * We want to continue accounting softirq time to ksoftirqd thread | ||
65 | * in that case, so as not to confuse the scheduler with a special task | ||
66 | * that does not consume any time but still wants to run. | ||
67 | */ | ||
68 | if (hardirq_count()) | ||
69 | __this_cpu_add(cpu_hardirq_time, delta); | ||
70 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | ||
71 | __this_cpu_add(cpu_softirq_time, delta); | ||
72 | |||
73 | irq_time_write_end(); | ||
74 | local_irq_restore(flags); | ||
75 | } | ||
76 | EXPORT_SYMBOL_GPL(vtime_account); | ||
77 | |||
78 | static int irqtime_account_hi_update(void) | ||
79 | { | ||
80 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
81 | unsigned long flags; | ||
82 | u64 latest_ns; | ||
83 | int ret = 0; | ||
84 | |||
85 | local_irq_save(flags); | ||
86 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
87 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) | ||
88 | ret = 1; | ||
89 | local_irq_restore(flags); | ||
90 | return ret; | ||
91 | } | ||
92 | |||
93 | static int irqtime_account_si_update(void) | ||
94 | { | ||
95 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
96 | unsigned long flags; | ||
97 | u64 latest_ns; | ||
98 | int ret = 0; | ||
99 | |||
100 | local_irq_save(flags); | ||
101 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
102 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) | ||
103 | ret = 1; | ||
104 | local_irq_restore(flags); | ||
105 | return ret; | ||
106 | } | ||
107 | |||
108 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
109 | |||
110 | #define sched_clock_irqtime (0) | ||
111 | |||
112 | #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ | ||
113 | |||
114 | static inline void task_group_account_field(struct task_struct *p, int index, | ||
115 | u64 tmp) | ||
116 | { | ||
117 | #ifdef CONFIG_CGROUP_CPUACCT | ||
118 | struct kernel_cpustat *kcpustat; | ||
119 | struct cpuacct *ca; | ||
120 | #endif | ||
121 | /* | ||
122 | * Since all updates are sure to touch the root cgroup, we | ||
123 | * get ourselves ahead and touch it first. If the root cgroup | ||
124 | * is the only cgroup, then nothing else should be necessary. | ||
125 | * | ||
126 | */ | ||
127 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | ||
128 | |||
129 | #ifdef CONFIG_CGROUP_CPUACCT | ||
130 | if (unlikely(!cpuacct_subsys.active)) | ||
131 | return; | ||
132 | |||
133 | rcu_read_lock(); | ||
134 | ca = task_ca(p); | ||
135 | while (ca && (ca != &root_cpuacct)) { | ||
136 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
137 | kcpustat->cpustat[index] += tmp; | ||
138 | ca = parent_ca(ca); | ||
139 | } | ||
140 | rcu_read_unlock(); | ||
141 | #endif | ||
142 | } | ||
143 | |||
144 | /* | ||
145 | * Account user cpu time to a process. | ||
146 | * @p: the process that the cpu time gets accounted to | ||
147 | * @cputime: the cpu time spent in user space since the last update | ||
148 | * @cputime_scaled: cputime scaled by cpu frequency | ||
149 | */ | ||
150 | void account_user_time(struct task_struct *p, cputime_t cputime, | ||
151 | cputime_t cputime_scaled) | ||
152 | { | ||
153 | int index; | ||
154 | |||
155 | /* Add user time to process. */ | ||
156 | p->utime += cputime; | ||
157 | p->utimescaled += cputime_scaled; | ||
158 | account_group_user_time(p, cputime); | ||
159 | |||
160 | index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | ||
161 | |||
162 | /* Add user time to cpustat. */ | ||
163 | task_group_account_field(p, index, (__force u64) cputime); | ||
164 | |||
165 | /* Account for user time used */ | ||
166 | acct_update_integrals(p); | ||
167 | } | ||
168 | |||
169 | /* | ||
170 | * Account guest cpu time to a process. | ||
171 | * @p: the process that the cpu time gets accounted to | ||
172 | * @cputime: the cpu time spent in virtual machine since the last update | ||
173 | * @cputime_scaled: cputime scaled by cpu frequency | ||
174 | */ | ||
175 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | ||
176 | cputime_t cputime_scaled) | ||
177 | { | ||
178 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
179 | |||
180 | /* Add guest time to process. */ | ||
181 | p->utime += cputime; | ||
182 | p->utimescaled += cputime_scaled; | ||
183 | account_group_user_time(p, cputime); | ||
184 | p->gtime += cputime; | ||
185 | |||
186 | /* Add guest time to cpustat. */ | ||
187 | if (TASK_NICE(p) > 0) { | ||
188 | cpustat[CPUTIME_NICE] += (__force u64) cputime; | ||
189 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; | ||
190 | } else { | ||
191 | cpustat[CPUTIME_USER] += (__force u64) cputime; | ||
192 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; | ||
193 | } | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * Account system cpu time to a process and desired cpustat field | ||
198 | * @p: the process that the cpu time gets accounted to | ||
199 | * @cputime: the cpu time spent in kernel space since the last update | ||
200 | * @cputime_scaled: cputime scaled by cpu frequency | ||
201 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
202 | */ | ||
203 | static inline | ||
204 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
205 | cputime_t cputime_scaled, int index) | ||
206 | { | ||
207 | /* Add system time to process. */ | ||
208 | p->stime += cputime; | ||
209 | p->stimescaled += cputime_scaled; | ||
210 | account_group_system_time(p, cputime); | ||
211 | |||
212 | /* Add system time to cpustat. */ | ||
213 | task_group_account_field(p, index, (__force u64) cputime); | ||
214 | |||
215 | /* Account for system time used */ | ||
216 | acct_update_integrals(p); | ||
217 | } | ||
218 | |||
219 | /* | ||
220 | * Account system cpu time to a process. | ||
221 | * @p: the process that the cpu time gets accounted to | ||
222 | * @hardirq_offset: the offset to subtract from hardirq_count() | ||
223 | * @cputime: the cpu time spent in kernel space since the last update | ||
224 | * @cputime_scaled: cputime scaled by cpu frequency | ||
225 | */ | ||
226 | void account_system_time(struct task_struct *p, int hardirq_offset, | ||
227 | cputime_t cputime, cputime_t cputime_scaled) | ||
228 | { | ||
229 | int index; | ||
230 | |||
231 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | ||
232 | account_guest_time(p, cputime, cputime_scaled); | ||
233 | return; | ||
234 | } | ||
235 | |||
236 | if (hardirq_count() - hardirq_offset) | ||
237 | index = CPUTIME_IRQ; | ||
238 | else if (in_serving_softirq()) | ||
239 | index = CPUTIME_SOFTIRQ; | ||
240 | else | ||
241 | index = CPUTIME_SYSTEM; | ||
242 | |||
243 | __account_system_time(p, cputime, cputime_scaled, index); | ||
244 | } | ||
245 | |||
246 | /* | ||
247 | * Account for involuntary wait time. | ||
248 | * @cputime: the cpu time spent in involuntary wait | ||
249 | */ | ||
250 | void account_steal_time(cputime_t cputime) | ||
251 | { | ||
252 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
253 | |||
254 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; | ||
255 | } | ||
256 | |||
257 | /* | ||
258 | * Account for idle time. | ||
259 | * @cputime: the cpu time spent in idle wait | ||
260 | */ | ||
261 | void account_idle_time(cputime_t cputime) | ||
262 | { | ||
263 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
264 | struct rq *rq = this_rq(); | ||
265 | |||
266 | if (atomic_read(&rq->nr_iowait) > 0) | ||
267 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; | ||
268 | else | ||
269 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | ||
270 | } | ||
271 | |||
272 | static __always_inline bool steal_account_process_tick(void) | ||
273 | { | ||
274 | #ifdef CONFIG_PARAVIRT | ||
275 | if (static_key_false(¶virt_steal_enabled)) { | ||
276 | u64 steal, st = 0; | ||
277 | |||
278 | steal = paravirt_steal_clock(smp_processor_id()); | ||
279 | steal -= this_rq()->prev_steal_time; | ||
280 | |||
281 | st = steal_ticks(steal); | ||
282 | this_rq()->prev_steal_time += st * TICK_NSEC; | ||
283 | |||
284 | account_steal_time(st); | ||
285 | return st; | ||
286 | } | ||
287 | #endif | ||
288 | return false; | ||
289 | } | ||
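A worked example of the carry-over above, assuming HZ=1000 so TICK_NSEC == 1000000 (numbers are illustrative; steal_ticks() is the helper this patch adds to kernel/sched/sched.h):

	u64 steal = 3500000;		/* 3.5 ms stolen since the last tick */
	u64 st = steal_ticks(steal);	/* == 3 whole ticks */
	/* prev_steal_time advances by st * TICK_NSEC == 3000000 ns, so the
	 * remaining 500000 ns carry over and are accounted on a later tick. */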
290 | |||
291 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
292 | |||
293 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
294 | /* | ||
295 | * Account a tick to a process and cpustat | ||
296 | * @p: the process that the cpu time gets accounted to | ||
297 | * @user_tick: is the tick from userspace | ||
298 | * @rq: the pointer to rq | ||
299 | * | ||
300 | * Tick demultiplexing follows the order | ||
301 | * - pending hardirq update | ||
302 | * - pending softirq update | ||
303 | * - user_time | ||
304 | * - idle_time | ||
305 | * - system time | ||
306 | * - check for guest_time | ||
307 | * - else account as system_time | ||
308 | * | ||
309 | * Check for hardirq is done both for system and user time as there is | ||
310 | * no timer going off while we are on hardirq and hence we may never get an | ||
311 | * opportunity to update it solely in system time. | ||
312 | * p->stime and friends are only updated on system time and not on irq | ||
313 | * softirq as those do not count in task exec_runtime any more. | ||
314 | */ | ||
315 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
316 | struct rq *rq) | ||
317 | { | ||
318 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
319 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
320 | |||
321 | if (steal_account_process_tick()) | ||
322 | return; | ||
323 | |||
324 | if (irqtime_account_hi_update()) { | ||
325 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; | ||
326 | } else if (irqtime_account_si_update()) { | ||
327 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; | ||
328 | } else if (this_cpu_ksoftirqd() == p) { | ||
329 | /* | ||
330 | * ksoftirqd time does not get accounted in cpu_softirq_time. | ||
331 | * So, we have to handle it separately here. | ||
332 | * Also, p->stime needs to be updated for ksoftirqd. | ||
333 | */ | ||
334 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
335 | CPUTIME_SOFTIRQ); | ||
336 | } else if (user_tick) { | ||
337 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
338 | } else if (p == rq->idle) { | ||
339 | account_idle_time(cputime_one_jiffy); | ||
340 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
341 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
342 | } else { | ||
343 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
344 | CPUTIME_SYSTEM); | ||
345 | } | ||
346 | } | ||
347 | |||
348 | static void irqtime_account_idle_ticks(int ticks) | ||
349 | { | ||
350 | int i; | ||
351 | struct rq *rq = this_rq(); | ||
352 | |||
353 | for (i = 0; i < ticks; i++) | ||
354 | irqtime_account_process_tick(current, 0, rq); | ||
355 | } | ||
356 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
357 | static void irqtime_account_idle_ticks(int ticks) {} | ||
358 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
359 | struct rq *rq) {} | ||
360 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
361 | |||
362 | /* | ||
363 | * Account a single tick of cpu time. | ||
364 | * @p: the process that the cpu time gets accounted to | ||
365 | * @user_tick: indicates if the tick is a user or a system tick | ||
366 | */ | ||
367 | void account_process_tick(struct task_struct *p, int user_tick) | ||
368 | { | ||
369 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
370 | struct rq *rq = this_rq(); | ||
371 | |||
372 | if (sched_clock_irqtime) { | ||
373 | irqtime_account_process_tick(p, user_tick, rq); | ||
374 | return; | ||
375 | } | ||
376 | |||
377 | if (steal_account_process_tick()) | ||
378 | return; | ||
379 | |||
380 | if (user_tick) | ||
381 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
382 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | ||
383 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, | ||
384 | one_jiffy_scaled); | ||
385 | else | ||
386 | account_idle_time(cputime_one_jiffy); | ||
387 | } | ||
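For context, account_process_tick() is driven from the timer path; roughly, a simplified sketch of update_process_times() in kernel/timer.c (not the full function):

	void update_process_times(int user_tick)
	{
		struct task_struct *p = current;

		account_process_tick(p, user_tick);
		/* ... followed by run_local_timers(), scheduler_tick(), ... */
	}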
388 | |||
389 | /* | ||
390 | * Account multiple ticks of steal time. | ||
391 | * @p: the process from which the cpu time has been stolen | ||
392 | * @ticks: number of stolen ticks | ||
393 | */ | ||
394 | void account_steal_ticks(unsigned long ticks) | ||
395 | { | ||
396 | account_steal_time(jiffies_to_cputime(ticks)); | ||
397 | } | ||
398 | |||
399 | /* | ||
400 | * Account multiple ticks of idle time. | ||
401 | * @ticks: number of idle ticks | ||
402 | */ | ||
403 | void account_idle_ticks(unsigned long ticks) | ||
404 | { | ||
405 | |||
406 | if (sched_clock_irqtime) { | ||
407 | irqtime_account_idle_ticks(ticks); | ||
408 | return; | ||
409 | } | ||
410 | |||
411 | account_idle_time(jiffies_to_cputime(ticks)); | ||
412 | } | ||
413 | |||
414 | #endif | ||
415 | |||
416 | /* | ||
417 | * Use precise platform statistics if available: | ||
418 | */ | ||
419 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
420 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
421 | { | ||
422 | *ut = p->utime; | ||
423 | *st = p->stime; | ||
424 | } | ||
425 | |||
426 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
427 | { | ||
428 | struct task_cputime cputime; | ||
429 | |||
430 | thread_group_cputime(p, &cputime); | ||
431 | |||
432 | *ut = cputime.utime; | ||
433 | *st = cputime.stime; | ||
434 | } | ||
435 | |||
436 | /* | ||
437 | * Archs that account the whole time spent in the idle task | ||
438 | * (outside irq) as idle time can rely on this and just implement | ||
439 | * vtime_account_system() and vtime_account_idle(). Archs that | ||
440 | * have other meaning of the idle time (s390 only includes the | ||
441 | * time spent by the CPU when it's in low power mode) must override | ||
442 | * vtime_account(). | ||
443 | */ | ||
444 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | ||
445 | void vtime_account(struct task_struct *tsk) | ||
446 | { | ||
447 | unsigned long flags; | ||
448 | |||
449 | local_irq_save(flags); | ||
450 | |||
451 | if (in_interrupt() || !is_idle_task(tsk)) | ||
452 | vtime_account_system(tsk); | ||
453 | else | ||
454 | vtime_account_idle(tsk); | ||
455 | |||
456 | local_irq_restore(flags); | ||
457 | } | ||
458 | EXPORT_SYMBOL_GPL(vtime_account); | ||
459 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | ||
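As the comment above says, a VIRT_CPU_ACCOUNTING arch only has to supply vtime_account_system() and vtime_account_idle(). A purely hypothetical sketch of such hooks, built on the generic helpers earlier in this file (arch_vtime_delta() is an assumed helper, not a real kernel function):

	void vtime_account_system(struct task_struct *tsk)
	{
		cputime_t delta = arch_vtime_delta(tsk);	/* assumed: time since last stamp */

		account_system_time(tsk, 0, delta, delta);
	}

	void vtime_account_idle(struct task_struct *tsk)
	{
		account_idle_time(arch_vtime_delta(tsk));	/* assumed helper */
	}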
460 | |||
461 | #else | ||
462 | |||
463 | #ifndef nsecs_to_cputime | ||
464 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | ||
465 | #endif | ||
466 | |||
467 | static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) | ||
468 | { | ||
469 | u64 temp = (__force u64) rtime; | ||
470 | |||
471 | temp *= (__force u64) utime; | ||
472 | |||
473 | if (sizeof(cputime_t) == 4) | ||
474 | temp = div_u64(temp, (__force u32) total); | ||
475 | else | ||
476 | temp = div64_u64(temp, (__force u64) total); | ||
477 | |||
478 | return (__force cputime_t) temp; | ||
479 | } | ||
480 | |||
481 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
482 | { | ||
483 | cputime_t rtime, utime = p->utime, total = utime + p->stime; | ||
484 | |||
485 | /* | ||
486 | * Use CFS's precise accounting: | ||
487 | */ | ||
488 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | ||
489 | |||
490 | if (total) | ||
491 | utime = scale_utime(utime, rtime, total); | ||
492 | else | ||
493 | utime = rtime; | ||
494 | |||
495 | /* | ||
496 | * Compare with previous values, to keep monotonicity: | ||
497 | */ | ||
498 | p->prev_utime = max(p->prev_utime, utime); | ||
499 | p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); | ||
500 | |||
501 | *ut = p->prev_utime; | ||
502 | *st = p->prev_stime; | ||
503 | } | ||
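A worked example of the scaling and monotonicity logic above (illustrative jiffies values): with p->utime = 30 and p->stime = 10, total = 40; if sum_exec_runtime converts to rtime = 60, scale_utime() yields 60*30/40 = 45 for utime, and the reported stime becomes rtime - prev_utime = 60 - 45 = 15, preserving the 3:1 user/system ratio while matching the precise runtime. The max() against prev_utime/prev_stime guarantees both reported values only ever grow between calls.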
504 | |||
505 | /* | ||
506 | * Must be called with siglock held. | ||
507 | */ | ||
508 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
509 | { | ||
510 | struct signal_struct *sig = p->signal; | ||
511 | struct task_cputime cputime; | ||
512 | cputime_t rtime, utime, total; | ||
513 | |||
514 | thread_group_cputime(p, &cputime); | ||
515 | |||
516 | total = cputime.utime + cputime.stime; | ||
517 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | ||
518 | |||
519 | if (total) | ||
520 | utime = scale_utime(cputime.utime, rtime, total); | ||
521 | else | ||
522 | utime = rtime; | ||
523 | |||
524 | sig->prev_utime = max(sig->prev_utime, utime); | ||
525 | sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); | ||
526 | |||
527 | *ut = sig->prev_utime; | ||
528 | *st = sig->prev_stime; | ||
529 | } | ||
530 | #endif | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96e2b18b6283..6b800a14b990 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -597,7 +597,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se) | |||
597 | /* | 597 | /* |
598 | * The idea is to set a period in which each task runs once. | 598 | * The idea is to set a period in which each task runs once. |
599 | * | 599 | * |
600 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch | 600 | * When there are too many tasks (sched_nr_latency) we have to stretch |
601 | * this period because otherwise the slices get too small. | 601 | * this period because otherwise the slices get too small. |
602 | * | 602 | * |
603 | * p = (nr <= nl) ? l : l*nr/nl | 603 | * p = (nr <= nl) ? l : l*nr/nl |
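A quick worked instance of that formula, using the stock defaults at the time of this patch (latency l = 6 ms, sched_nr_latency nl = 8, minimum granularity 0.75 ms): with nr = 4 runnable tasks the period stays at 6 ms and each task gets 1.5 ms, while with nr = 20 the period stretches to 6*20/8 = 15 ms so each slice stays at the 0.75 ms floor instead of shrinking further.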
@@ -2700,7 +2700,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2700 | int prev_cpu = task_cpu(p); | 2700 | int prev_cpu = task_cpu(p); |
2701 | int new_cpu = cpu; | 2701 | int new_cpu = cpu; |
2702 | int want_affine = 0; | 2702 | int want_affine = 0; |
2703 | int want_sd = 1; | ||
2704 | int sync = wake_flags & WF_SYNC; | 2703 | int sync = wake_flags & WF_SYNC; |
2705 | 2704 | ||
2706 | if (p->nr_cpus_allowed == 1) | 2705 | if (p->nr_cpus_allowed == 1) |
@@ -2718,48 +2717,21 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
2718 | continue; | 2717 | continue; |
2719 | 2718 | ||
2720 | /* | 2719 | /* |
2721 | * If power savings logic is enabled for a domain, see if we | ||
2722 | * are not overloaded, if so, don't balance wider. | ||
2723 | */ | ||
2724 | if (tmp->flags & (SD_PREFER_LOCAL)) { | ||
2725 | unsigned long power = 0; | ||
2726 | unsigned long nr_running = 0; | ||
2727 | unsigned long capacity; | ||
2728 | int i; | ||
2729 | |||
2730 | for_each_cpu(i, sched_domain_span(tmp)) { | ||
2731 | power += power_of(i); | ||
2732 | nr_running += cpu_rq(i)->cfs.nr_running; | ||
2733 | } | ||
2734 | |||
2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | ||
2736 | |||
2737 | if (nr_running < capacity) | ||
2738 | want_sd = 0; | ||
2739 | } | ||
2740 | |||
2741 | /* | ||
2742 | * If both cpu and prev_cpu are part of this domain, | 2720 | * If both cpu and prev_cpu are part of this domain, |
2743 | * cpu is a valid SD_WAKE_AFFINE target. | 2721 | * cpu is a valid SD_WAKE_AFFINE target. |
2744 | */ | 2722 | */ |
2745 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && | 2723 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && |
2746 | cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { | 2724 | cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { |
2747 | affine_sd = tmp; | 2725 | affine_sd = tmp; |
2748 | want_affine = 0; | ||
2749 | } | ||
2750 | |||
2751 | if (!want_sd && !want_affine) | ||
2752 | break; | 2726 | break; |
2727 | } | ||
2753 | 2728 | ||
2754 | if (!(tmp->flags & sd_flag)) | 2729 | if (tmp->flags & sd_flag) |
2755 | continue; | ||
2756 | |||
2757 | if (want_sd) | ||
2758 | sd = tmp; | 2730 | sd = tmp; |
2759 | } | 2731 | } |
2760 | 2732 | ||
2761 | if (affine_sd) { | 2733 | if (affine_sd) { |
2762 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 2734 | if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) |
2763 | prev_cpu = cpu; | 2735 | prev_cpu = cpu; |
2764 | 2736 | ||
2765 | new_cpu = select_idle_sibling(p, prev_cpu); | 2737 | new_cpu = select_idle_sibling(p, prev_cpu); |
@@ -4295,7 +4267,7 @@ redo: | |||
4295 | goto out_balanced; | 4267 | goto out_balanced; |
4296 | } | 4268 | } |
4297 | 4269 | ||
4298 | BUG_ON(busiest == this_rq); | 4270 | BUG_ON(busiest == env.dst_rq); |
4299 | 4271 | ||
4300 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 4272 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
4301 | 4273 | ||
@@ -4316,7 +4288,7 @@ redo: | |||
4316 | update_h_load(env.src_cpu); | 4288 | update_h_load(env.src_cpu); |
4317 | more_balance: | 4289 | more_balance: |
4318 | local_irq_save(flags); | 4290 | local_irq_save(flags); |
4319 | double_rq_lock(this_rq, busiest); | 4291 | double_rq_lock(env.dst_rq, busiest); |
4320 | 4292 | ||
4321 | /* | 4293 | /* |
4322 | * cur_ld_moved - load moved in current iteration | 4294 | * cur_ld_moved - load moved in current iteration |
@@ -4324,7 +4296,7 @@ more_balance: | |||
4324 | */ | 4296 | */ |
4325 | cur_ld_moved = move_tasks(&env); | 4297 | cur_ld_moved = move_tasks(&env); |
4326 | ld_moved += cur_ld_moved; | 4298 | ld_moved += cur_ld_moved; |
4327 | double_rq_unlock(this_rq, busiest); | 4299 | double_rq_unlock(env.dst_rq, busiest); |
4328 | local_irq_restore(flags); | 4300 | local_irq_restore(flags); |
4329 | 4301 | ||
4330 | if (env.flags & LBF_NEED_BREAK) { | 4302 | if (env.flags & LBF_NEED_BREAK) { |
@@ -4360,8 +4332,7 @@ more_balance: | |||
4360 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && | 4332 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && |
4361 | lb_iterations++ < max_lb_iterations) { | 4333 | lb_iterations++ < max_lb_iterations) { |
4362 | 4334 | ||
4363 | this_rq = cpu_rq(env.new_dst_cpu); | 4335 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
4364 | env.dst_rq = this_rq; | ||
4365 | env.dst_cpu = env.new_dst_cpu; | 4336 | env.dst_cpu = env.new_dst_cpu; |
4366 | env.flags &= ~LBF_SOME_PINNED; | 4337 | env.flags &= ~LBF_SOME_PINNED; |
4367 | env.loop = 0; | 4338 | env.loop = 0; |
@@ -4646,7 +4617,7 @@ static void nohz_balancer_kick(int cpu) | |||
4646 | return; | 4617 | return; |
4647 | } | 4618 | } |
4648 | 4619 | ||
4649 | static inline void clear_nohz_tick_stopped(int cpu) | 4620 | static inline void nohz_balance_exit_idle(int cpu) |
4650 | { | 4621 | { |
4651 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { | 4622 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { |
4652 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | 4623 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); |
@@ -4686,28 +4657,23 @@ void set_cpu_sd_state_idle(void) | |||
4686 | } | 4657 | } |
4687 | 4658 | ||
4688 | /* | 4659 | /* |
4689 | * This routine will record that this cpu is going idle with tick stopped. | 4660 | * This routine will record that the cpu is going idle with tick stopped. |
4690 | * This info will be used in performing idle load balancing in the future. | 4661 | * This info will be used in performing idle load balancing in the future. |
4691 | */ | 4662 | */ |
4692 | void select_nohz_load_balancer(int stop_tick) | 4663 | void nohz_balance_enter_idle(int cpu) |
4693 | { | 4664 | { |
4694 | int cpu = smp_processor_id(); | ||
4695 | |||
4696 | /* | 4665 | /* |
4697 | * If this cpu is going down, then nothing needs to be done. | 4666 | * If this cpu is going down, then nothing needs to be done. |
4698 | */ | 4667 | */ |
4699 | if (!cpu_active(cpu)) | 4668 | if (!cpu_active(cpu)) |
4700 | return; | 4669 | return; |
4701 | 4670 | ||
4702 | if (stop_tick) { | 4671 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) |
4703 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) | 4672 | return; |
4704 | return; | ||
4705 | 4673 | ||
4706 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | 4674 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
4707 | atomic_inc(&nohz.nr_cpus); | 4675 | atomic_inc(&nohz.nr_cpus); |
4708 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | 4676 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); |
4709 | } | ||
4710 | return; | ||
4711 | } | 4677 | } |
4712 | 4678 | ||
4713 | static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | 4679 | static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, |
@@ -4715,7 +4681,7 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | |||
4715 | { | 4681 | { |
4716 | switch (action & ~CPU_TASKS_FROZEN) { | 4682 | switch (action & ~CPU_TASKS_FROZEN) { |
4717 | case CPU_DYING: | 4683 | case CPU_DYING: |
4718 | clear_nohz_tick_stopped(smp_processor_id()); | 4684 | nohz_balance_exit_idle(smp_processor_id()); |
4719 | return NOTIFY_OK; | 4685 | return NOTIFY_OK; |
4720 | default: | 4686 | default: |
4721 | return NOTIFY_DONE; | 4687 | return NOTIFY_DONE; |
@@ -4837,14 +4803,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
4837 | if (need_resched()) | 4803 | if (need_resched()) |
4838 | break; | 4804 | break; |
4839 | 4805 | ||
4840 | raw_spin_lock_irq(&this_rq->lock); | 4806 | rq = cpu_rq(balance_cpu); |
4841 | update_rq_clock(this_rq); | 4807 | |
4842 | update_idle_cpu_load(this_rq); | 4808 | raw_spin_lock_irq(&rq->lock); |
4843 | raw_spin_unlock_irq(&this_rq->lock); | 4809 | update_rq_clock(rq); |
4810 | update_idle_cpu_load(rq); | ||
4811 | raw_spin_unlock_irq(&rq->lock); | ||
4844 | 4812 | ||
4845 | rebalance_domains(balance_cpu, CPU_IDLE); | 4813 | rebalance_domains(balance_cpu, CPU_IDLE); |
4846 | 4814 | ||
4847 | rq = cpu_rq(balance_cpu); | ||
4848 | if (time_after(this_rq->next_balance, rq->next_balance)) | 4815 | if (time_after(this_rq->next_balance, rq->next_balance)) |
4849 | this_rq->next_balance = rq->next_balance; | 4816 | this_rq->next_balance = rq->next_balance; |
4850 | } | 4817 | } |
@@ -4875,7 +4842,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
4875 | * busy tick after returning from idle, we will update the busy stats. | 4842 | * busy tick after returning from idle, we will update the busy stats. |
4876 | */ | 4843 | */ |
4877 | set_cpu_sd_state_busy(); | 4844 | set_cpu_sd_state_busy(); |
4878 | clear_nohz_tick_stopped(cpu); | 4845 | nohz_balance_exit_idle(cpu); |
4879 | 4846 | ||
4880 | /* | 4847 | /* |
4881 | * None are in tickless mode and hence no need for NOHZ idle load | 4848 | * None are in tickless mode and hence no need for NOHZ idle load |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index de00a486c5c6..eebefcad7027 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) | |||
12 | SCHED_FEAT(START_DEBIT, true) | 12 | SCHED_FEAT(START_DEBIT, true) |
13 | 13 | ||
14 | /* | 14 | /* |
15 | * Based on load and program behaviour, see if it makes sense to place | ||
16 | * a newly woken task on the same cpu as the task that woke it -- | ||
17 | * improve cache locality. Typically used with SYNC wakeups as | ||
18 | * generated by pipes and the like, see also SYNC_WAKEUPS. | ||
19 | */ | ||
20 | SCHED_FEAT(AFFINE_WAKEUPS, true) | ||
21 | |||
22 | /* | ||
23 | * Prefer to schedule the task we woke last (assuming it failed | 15 | * Prefer to schedule the task we woke last (assuming it failed |
24 | * wakeup-preemption), since its likely going to consume data we | 16 | * wakeup-preemption), since its likely going to consume data we |
25 | * touched, increases cache locality. | 17 | * touched, increases cache locality. |
@@ -42,7 +34,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) | |||
42 | /* | 34 | /* |
43 | * Use arch dependent cpu power functions | 35 | * Use arch dependent cpu power functions |
44 | */ | 36 | */ |
45 | SCHED_FEAT(ARCH_POWER, false) | 37 | SCHED_FEAT(ARCH_POWER, true) |
46 | 38 | ||
47 | SCHED_FEAT(HRTICK, false) | 39 | SCHED_FEAT(HRTICK, false) |
48 | SCHED_FEAT(DOUBLE_TICK, false) | 40 | SCHED_FEAT(DOUBLE_TICK, false) |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e0b7ba9c040f..418feb01344e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -1632,11 +1632,6 @@ static int push_rt_task(struct rq *rq) | |||
1632 | if (!next_task) | 1632 | if (!next_task) |
1633 | return 0; | 1633 | return 0; |
1634 | 1634 | ||
1635 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
1636 | if (unlikely(task_running(rq, next_task))) | ||
1637 | return 0; | ||
1638 | #endif | ||
1639 | |||
1640 | retry: | 1635 | retry: |
1641 | if (unlikely(next_task == rq->curr)) { | 1636 | if (unlikely(next_task == rq->curr)) { |
1642 | WARN_ON(1); | 1637 | WARN_ON(1); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0848fa36c383..7a7db09cfabc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -737,11 +737,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | |||
737 | */ | 737 | */ |
738 | next->on_cpu = 1; | 738 | next->on_cpu = 1; |
739 | #endif | 739 | #endif |
740 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
741 | raw_spin_unlock_irq(&rq->lock); | ||
742 | #else | ||
743 | raw_spin_unlock(&rq->lock); | 740 | raw_spin_unlock(&rq->lock); |
744 | #endif | ||
745 | } | 741 | } |
746 | 742 | ||
747 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 743 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
@@ -755,9 +751,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
755 | smp_wmb(); | 751 | smp_wmb(); |
756 | prev->on_cpu = 0; | 752 | prev->on_cpu = 0; |
757 | #endif | 753 | #endif |
758 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
759 | local_irq_enable(); | 754 | local_irq_enable(); |
760 | #endif | ||
761 | } | 755 | } |
762 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 756 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
763 | 757 | ||
@@ -891,6 +885,9 @@ struct cpuacct { | |||
891 | struct kernel_cpustat __percpu *cpustat; | 885 | struct kernel_cpustat __percpu *cpustat; |
892 | }; | 886 | }; |
893 | 887 | ||
888 | extern struct cgroup_subsys cpuacct_subsys; | ||
889 | extern struct cpuacct root_cpuacct; | ||
890 | |||
894 | /* return cpu accounting group corresponding to this container */ | 891 | /* return cpu accounting group corresponding to this container */ |
895 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | 892 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) |
896 | { | 893 | { |
@@ -917,6 +914,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
917 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 914 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
918 | #endif | 915 | #endif |
919 | 916 | ||
917 | #ifdef CONFIG_PARAVIRT | ||
918 | static inline u64 steal_ticks(u64 steal) | ||
919 | { | ||
920 | if (unlikely(steal > NSEC_PER_SEC)) | ||
921 | return div_u64(steal, TICK_NSEC); | ||
922 | |||
923 | return __iter_div_u64_rem(steal, TICK_NSEC, &steal); | ||
924 | } | ||
925 | #endif | ||
926 | |||
920 | static inline void inc_nr_running(struct rq *rq) | 927 | static inline void inc_nr_running(struct rq *rq) |
921 | { | 928 | { |
922 | rq->nr_running++; | 929 | rq->nr_running++; |
@@ -1156,3 +1163,53 @@ enum rq_nohz_flag_bits { | |||
1156 | 1163 | ||
1157 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | 1164 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) |
1158 | #endif | 1165 | #endif |
1166 | |||
1167 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
1168 | |||
1169 | DECLARE_PER_CPU(u64, cpu_hardirq_time); | ||
1170 | DECLARE_PER_CPU(u64, cpu_softirq_time); | ||
1171 | |||
1172 | #ifndef CONFIG_64BIT | ||
1173 | DECLARE_PER_CPU(seqcount_t, irq_time_seq); | ||
1174 | |||
1175 | static inline void irq_time_write_begin(void) | ||
1176 | { | ||
1177 | __this_cpu_inc(irq_time_seq.sequence); | ||
1178 | smp_wmb(); | ||
1179 | } | ||
1180 | |||
1181 | static inline void irq_time_write_end(void) | ||
1182 | { | ||
1183 | smp_wmb(); | ||
1184 | __this_cpu_inc(irq_time_seq.sequence); | ||
1185 | } | ||
1186 | |||
1187 | static inline u64 irq_time_read(int cpu) | ||
1188 | { | ||
1189 | u64 irq_time; | ||
1190 | unsigned seq; | ||
1191 | |||
1192 | do { | ||
1193 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
1194 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
1195 | per_cpu(cpu_hardirq_time, cpu); | ||
1196 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
1197 | |||
1198 | return irq_time; | ||
1199 | } | ||
1200 | #else /* CONFIG_64BIT */ | ||
1201 | static inline void irq_time_write_begin(void) | ||
1202 | { | ||
1203 | } | ||
1204 | |||
1205 | static inline void irq_time_write_end(void) | ||
1206 | { | ||
1207 | } | ||
1208 | |||
1209 | static inline u64 irq_time_read(int cpu) | ||
1210 | { | ||
1211 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
1212 | } | ||
1213 | #endif /* CONFIG_64BIT */ | ||
1214 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
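The irq_time_read() helper above gives readers on other CPUs a torn-free 64-bit snapshot (via the seqcount retry loop on 32-bit). A simplified sketch of how a consumer uses it: take a snapshot, diff it against the previously saved value, and treat the difference as time the CPU spent in hard/soft irq (prev_irq_time here is a stand-in local, not necessarily the real field name):

	u64 now = irq_time_read(cpu);
	s64 irq_delta = now - prev_irq_time;	/* irq time accrued since last look */

	if (irq_delta > 0)
		prev_irq_time = now;		/* remember for the next delta */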
1215 | |||
diff --git a/kernel/softirq.c b/kernel/softirq.c index 5c6a5bd8462f..cc96bdc0c2c9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void) | |||
221 | current->flags &= ~PF_MEMALLOC; | 221 | current->flags &= ~PF_MEMALLOC; |
222 | 222 | ||
223 | pending = local_softirq_pending(); | 223 | pending = local_softirq_pending(); |
224 | account_system_vtime(current); | 224 | vtime_account(current); |
225 | 225 | ||
226 | __local_bh_disable((unsigned long)__builtin_return_address(0), | 226 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
227 | SOFTIRQ_OFFSET); | 227 | SOFTIRQ_OFFSET); |
@@ -272,7 +272,7 @@ restart: | |||
272 | 272 | ||
273 | lockdep_softirq_exit(); | 273 | lockdep_softirq_exit(); |
274 | 274 | ||
275 | account_system_vtime(current); | 275 | vtime_account(current); |
276 | __local_bh_enable(SOFTIRQ_OFFSET); | 276 | __local_bh_enable(SOFTIRQ_OFFSET); |
277 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 277 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); |
278 | } | 278 | } |
@@ -341,7 +341,7 @@ static inline void invoke_softirq(void) | |||
341 | */ | 341 | */ |
342 | void irq_exit(void) | 342 | void irq_exit(void) |
343 | { | 343 | { |
344 | account_system_vtime(current); | 344 | vtime_account(current); |
345 | trace_hardirq_exit(); | 345 | trace_hardirq_exit(); |
346 | sub_preempt_count(IRQ_EXIT_OFFSET); | 346 | sub_preempt_count(IRQ_EXIT_OFFSET); |
347 | if (!in_interrupt() && local_softirq_pending()) | 347 | if (!in_interrupt() && local_softirq_pending()) |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87174ef59161..81c7b1a1a307 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -307,7 +307,7 @@ static struct ctl_table kern_table[] = { | |||
307 | .extra2 = &max_sched_tunable_scaling, | 307 | .extra2 = &max_sched_tunable_scaling, |
308 | }, | 308 | }, |
309 | { | 309 | { |
310 | .procname = "sched_migration_cost", | 310 | .procname = "sched_migration_cost_ns", |
311 | .data = &sysctl_sched_migration_cost, | 311 | .data = &sysctl_sched_migration_cost, |
312 | .maxlen = sizeof(unsigned int), | 312 | .maxlen = sizeof(unsigned int), |
313 | .mode = 0644, | 313 | .mode = 0644, |
@@ -321,14 +321,14 @@ static struct ctl_table kern_table[] = { | |||
321 | .proc_handler = proc_dointvec, | 321 | .proc_handler = proc_dointvec, |
322 | }, | 322 | }, |
323 | { | 323 | { |
324 | .procname = "sched_time_avg", | 324 | .procname = "sched_time_avg_ms", |
325 | .data = &sysctl_sched_time_avg, | 325 | .data = &sysctl_sched_time_avg, |
326 | .maxlen = sizeof(unsigned int), | 326 | .maxlen = sizeof(unsigned int), |
327 | .mode = 0644, | 327 | .mode = 0644, |
328 | .proc_handler = proc_dointvec, | 328 | .proc_handler = proc_dointvec, |
329 | }, | 329 | }, |
330 | { | 330 | { |
331 | .procname = "sched_shares_window", | 331 | .procname = "sched_shares_window_ns", |
332 | .data = &sysctl_sched_shares_window, | 332 | .data = &sysctl_sched_shares_window, |
333 | .maxlen = sizeof(unsigned int), | 333 | .maxlen = sizeof(unsigned int), |
334 | .mode = 0644, | 334 | .mode = 0644, |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cf5f6b262673..f423bdd035c2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -372,7 +372,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
372 | * the scheduler tick in nohz_restart_sched_tick. | 372 | * the scheduler tick in nohz_restart_sched_tick. |
373 | */ | 373 | */ |
374 | if (!ts->tick_stopped) { | 374 | if (!ts->tick_stopped) { |
375 | select_nohz_load_balancer(1); | 375 | nohz_balance_enter_idle(cpu); |
376 | calc_load_enter_idle(); | 376 | calc_load_enter_idle(); |
377 | 377 | ||
378 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); | 378 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
@@ -570,7 +570,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
570 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | 570 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) |
571 | { | 571 | { |
572 | /* Update jiffies first */ | 572 | /* Update jiffies first */ |
573 | select_nohz_load_balancer(0); | ||
574 | tick_do_update_jiffies64(now); | 573 | tick_do_update_jiffies64(now); |
575 | update_cpu_load_nohz(); | 574 | update_cpu_load_nohz(); |
576 | 575 | ||