 32 files changed, 892 insertions(+), 906 deletions(-)
diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt
index 28aa1075e291..b1b8587b86f0 100644
--- a/Documentation/scheduler/sched-arch.txt
+++ b/Documentation/scheduler/sched-arch.txt
| @@ -17,16 +17,6 @@ you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file | |||
| 17 | Unlocked context switches introduce only a very minor performance | 17 | Unlocked context switches introduce only a very minor performance |
| 18 | penalty to the core scheduler implementation in the CONFIG_SMP case. | 18 | penalty to the core scheduler implementation in the CONFIG_SMP case. |
| 19 | 19 | ||
| 20 | 2. Interrupt status | ||
| 21 | By default, the switch_to arch function is called with interrupts | ||
| 22 | disabled. Interrupts may be enabled over the call if it is likely to | ||
| 23 | introduce a significant interrupt latency by adding the line | ||
| 24 | `#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for | ||
| 25 | unlocked context switches. This define also implies | ||
| 26 | `__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an | ||
| 27 | example. | ||
| 28 | |||
| 29 | |||
| 30 | CPU idle | 20 | CPU idle |
| 31 | ======== | 21 | ======== |
| 32 | Your cpu_idle routines need to obey the following rules: | 22 | Your cpu_idle routines need to obey the following rules: |
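For reference, the only per-arch opt-in left documented in this file is the unlocked-context-switch define mentioned above; a minimal sketch of what that opt-in looks like (hypothetical arch header, illustrative only — the interrupts-on variant removed here no longer exists):

	/*
	 * Illustrative sketch only: a hypothetical architecture opting in to
	 * unlocked context switches after this change.
	 */
	#ifndef _ASM_MYARCH_SWITCH_TO_H
	#define _ASM_MYARCH_SWITCH_TO_H

	#define __ARCH_WANT_UNLOCKED_CTXSW

	#endif /* _ASM_MYARCH_SWITCH_TO_H */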
diff --git a/arch/Kconfig b/arch/Kconfig
index 1a7b468abf4a..a62965d057f6 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
| @@ -304,4 +304,13 @@ config HAVE_RCU_USER_QS | |||
| 304 | are already protected inside rcu_irq_enter/rcu_irq_exit() but | 304 | are already protected inside rcu_irq_enter/rcu_irq_exit() but |
| 305 | preemption or signal handling on irq exit still need to be protected. | 305 | preemption or signal handling on irq exit still need to be protected. |
| 306 | 306 | ||
| 307 | config HAVE_VIRT_CPU_ACCOUNTING | ||
| 308 | bool | ||
| 309 | |||
| 310 | config HAVE_IRQ_TIME_ACCOUNTING | ||
| 311 | bool | ||
| 312 | help | ||
| 313 | Archs need to ensure they use a high enough resolution clock to | ||
| 314 | support irq time accounting and then call enable_sched_clock_irqtime(). | ||
| 315 | |||
| 307 | source "kernel/gcov/Kconfig" | 316 | source "kernel/gcov/Kconfig" |
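As a sketch of what the new HAVE_IRQ_TIME_ACCOUNTING help text asks of an architecture (hypothetical arch and helper names; only enable_sched_clock_irqtime() is the real interface):

	#include <linux/init.h>
	#include <linux/sched.h>

	/*
	 * Sketch only: an arch selecting HAVE_IRQ_TIME_ACCOUNTING is expected to
	 * call enable_sched_clock_irqtime() once it knows sched_clock() is backed
	 * by a sufficiently fine-grained, stable counter.
	 */
	void __init myarch_time_init(void)
	{
		if (myarch_cycle_counter_is_stable())	/* hypothetical check */
			enable_sched_clock_irqtime();
	}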
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 310cf5781fad..3c720ef6c32d 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
| @@ -25,6 +25,7 @@ config IA64 | |||
| 25 | select HAVE_GENERIC_HARDIRQS | 25 | select HAVE_GENERIC_HARDIRQS |
| 26 | select HAVE_MEMBLOCK | 26 | select HAVE_MEMBLOCK |
| 27 | select HAVE_MEMBLOCK_NODE_MAP | 27 | select HAVE_MEMBLOCK_NODE_MAP |
| 28 | select HAVE_VIRT_CPU_ACCOUNTING | ||
| 28 | select ARCH_DISCARD_MEMBLOCK | 29 | select ARCH_DISCARD_MEMBLOCK |
| 29 | select GENERIC_IRQ_PROBE | 30 | select GENERIC_IRQ_PROBE |
| 30 | select GENERIC_PENDING_IRQ if SMP | 31 | select GENERIC_PENDING_IRQ if SMP |
| @@ -340,17 +341,6 @@ config FORCE_MAX_ZONEORDER | |||
| 340 | default "17" if HUGETLB_PAGE | 341 | default "17" if HUGETLB_PAGE |
| 341 | default "11" | 342 | default "11" |
| 342 | 343 | ||
| 343 | config VIRT_CPU_ACCOUNTING | ||
| 344 | bool "Deterministic task and CPU time accounting" | ||
| 345 | default n | ||
| 346 | help | ||
| 347 | Select this option to enable more accurate task and CPU time | ||
| 348 | accounting. This is done by reading a CPU counter on each | ||
| 349 | kernel entry and exit and on transitions within the kernel | ||
| 350 | between system, softirq and hardirq state, so there is a | ||
| 351 | small performance impact. | ||
| 352 | If in doubt, say N here. | ||
| 353 | |||
| 354 | config SMP | 344 | config SMP |
| 355 | bool "Symmetric multi-processing support" | 345 | bool "Symmetric multi-processing support" |
| 356 | select USE_GENERIC_SMP_HELPERS | 346 | select USE_GENERIC_SMP_HELPERS |
diff --git a/arch/ia64/include/asm/switch_to.h b/arch/ia64/include/asm/switch_to.h
index cb2412fcd17f..d38c7ea5eea5 100644
--- a/arch/ia64/include/asm/switch_to.h
+++ b/arch/ia64/include/asm/switch_to.h
| @@ -30,13 +30,6 @@ extern struct task_struct *ia64_switch_to (void *next_task); | |||
| 30 | extern void ia64_save_extra (struct task_struct *task); | 30 | extern void ia64_save_extra (struct task_struct *task); |
| 31 | extern void ia64_load_extra (struct task_struct *task); | 31 | extern void ia64_load_extra (struct task_struct *task); |
| 32 | 32 | ||
| 33 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 34 | extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct *next); | ||
| 35 | # define IA64_ACCOUNT_ON_SWITCH(p,n) ia64_account_on_switch(p,n) | ||
| 36 | #else | ||
| 37 | # define IA64_ACCOUNT_ON_SWITCH(p,n) | ||
| 38 | #endif | ||
| 39 | |||
| 40 | #ifdef CONFIG_PERFMON | 33 | #ifdef CONFIG_PERFMON |
| 41 | DECLARE_PER_CPU(unsigned long, pfm_syst_info); | 34 | DECLARE_PER_CPU(unsigned long, pfm_syst_info); |
| 42 | # define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1) | 35 | # define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1) |
| @@ -49,7 +42,6 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct | |||
| 49 | || PERFMON_IS_SYSWIDE()) | 42 | || PERFMON_IS_SYSWIDE()) |
| 50 | 43 | ||
| 51 | #define __switch_to(prev,next,last) do { \ | 44 | #define __switch_to(prev,next,last) do { \ |
| 52 | IA64_ACCOUNT_ON_SWITCH(prev, next); \ | ||
| 53 | if (IA64_HAS_EXTRA_STATE(prev)) \ | 45 | if (IA64_HAS_EXTRA_STATE(prev)) \ |
| 54 | ia64_save_extra(prev); \ | 46 | ia64_save_extra(prev); \ |
| 55 | if (IA64_HAS_EXTRA_STATE(next)) \ | 47 | if (IA64_HAS_EXTRA_STATE(next)) \ |
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index ecc904b33c5f..80ff9acc5edf 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
| @@ -83,32 +83,36 @@ static struct clocksource *itc_clocksource; | |||
| 83 | 83 | ||
| 84 | extern cputime_t cycle_to_cputime(u64 cyc); | 84 | extern cputime_t cycle_to_cputime(u64 cyc); |
| 85 | 85 | ||
| 86 | static void vtime_account_user(struct task_struct *tsk) | ||
| 87 | { | ||
| 88 | cputime_t delta_utime; | ||
| 89 | struct thread_info *ti = task_thread_info(tsk); | ||
| 90 | |||
| 91 | if (ti->ac_utime) { | ||
| 92 | delta_utime = cycle_to_cputime(ti->ac_utime); | ||
| 93 | account_user_time(tsk, delta_utime, delta_utime); | ||
| 94 | ti->ac_utime = 0; | ||
| 95 | } | ||
| 96 | } | ||
| 97 | |||
| 86 | /* | 98 | /* |
| 87 | * Called from the context switch with interrupts disabled, to charge all | 99 | * Called from the context switch with interrupts disabled, to charge all |
| 88 | * accumulated times to the current process, and to prepare accounting on | 100 | * accumulated times to the current process, and to prepare accounting on |
| 89 | * the next process. | 101 | * the next process. |
| 90 | */ | 102 | */ |
| 91 | void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next) | 103 | void vtime_task_switch(struct task_struct *prev) |
| 92 | { | 104 | { |
| 93 | struct thread_info *pi = task_thread_info(prev); | 105 | struct thread_info *pi = task_thread_info(prev); |
| 94 | struct thread_info *ni = task_thread_info(next); | 106 | struct thread_info *ni = task_thread_info(current); |
| 95 | cputime_t delta_stime, delta_utime; | ||
| 96 | __u64 now; | ||
| 97 | 107 | ||
| 98 | now = ia64_get_itc(); | ||
| 99 | |||
| 100 | delta_stime = cycle_to_cputime(pi->ac_stime + (now - pi->ac_stamp)); | ||
| 101 | if (idle_task(smp_processor_id()) != prev) | 108 | if (idle_task(smp_processor_id()) != prev) |
| 102 | account_system_time(prev, 0, delta_stime, delta_stime); | 109 | vtime_account_system(prev); |
| 103 | else | 110 | else |
| 104 | account_idle_time(delta_stime); | 111 | vtime_account_idle(prev); |
| 105 | 112 | ||
| 106 | if (pi->ac_utime) { | 113 | vtime_account_user(prev); |
| 107 | delta_utime = cycle_to_cputime(pi->ac_utime); | ||
| 108 | account_user_time(prev, delta_utime, delta_utime); | ||
| 109 | } | ||
| 110 | 114 | ||
| 111 | pi->ac_stamp = ni->ac_stamp = now; | 115 | pi->ac_stamp = ni->ac_stamp; |
| 112 | ni->ac_stime = ni->ac_utime = 0; | 116 | ni->ac_stime = ni->ac_utime = 0; |
| 113 | } | 117 | } |
| 114 | 118 | ||
| @@ -116,29 +120,32 @@ void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next) | |||
| 116 | * Account time for a transition between system, hard irq or soft irq state. | 120 | * Account time for a transition between system, hard irq or soft irq state. |
| 117 | * Note that this function is called with interrupts enabled. | 121 | * Note that this function is called with interrupts enabled. |
| 118 | */ | 122 | */ |
| 119 | void account_system_vtime(struct task_struct *tsk) | 123 | static cputime_t vtime_delta(struct task_struct *tsk) |
| 120 | { | 124 | { |
| 121 | struct thread_info *ti = task_thread_info(tsk); | 125 | struct thread_info *ti = task_thread_info(tsk); |
| 122 | unsigned long flags; | ||
| 123 | cputime_t delta_stime; | 126 | cputime_t delta_stime; |
| 124 | __u64 now; | 127 | __u64 now; |
| 125 | 128 | ||
| 126 | local_irq_save(flags); | ||
| 127 | |||
| 128 | now = ia64_get_itc(); | 129 | now = ia64_get_itc(); |
| 129 | 130 | ||
| 130 | delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); | 131 | delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); |
| 131 | if (irq_count() || idle_task(smp_processor_id()) != tsk) | ||
| 132 | account_system_time(tsk, 0, delta_stime, delta_stime); | ||
| 133 | else | ||
| 134 | account_idle_time(delta_stime); | ||
| 135 | ti->ac_stime = 0; | 132 | ti->ac_stime = 0; |
| 136 | |||
| 137 | ti->ac_stamp = now; | 133 | ti->ac_stamp = now; |
| 138 | 134 | ||
| 139 | local_irq_restore(flags); | 135 | return delta_stime; |
| 136 | } | ||
| 137 | |||
| 138 | void vtime_account_system(struct task_struct *tsk) | ||
| 139 | { | ||
| 140 | cputime_t delta = vtime_delta(tsk); | ||
| 141 | |||
| 142 | account_system_time(tsk, 0, delta, delta); | ||
| 143 | } | ||
| 144 | |||
| 145 | void vtime_account_idle(struct task_struct *tsk) | ||
| 146 | { | ||
| 147 | account_idle_time(vtime_delta(tsk)); | ||
| 140 | } | 148 | } |
| 141 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
| 142 | 149 | ||
| 143 | /* | 150 | /* |
| 144 | * Called from the timer interrupt handler to charge accumulated user time | 151 | * Called from the timer interrupt handler to charge accumulated user time |
| @@ -146,14 +153,7 @@ EXPORT_SYMBOL_GPL(account_system_vtime); | |||
| 146 | */ | 153 | */ |
| 147 | void account_process_tick(struct task_struct *p, int user_tick) | 154 | void account_process_tick(struct task_struct *p, int user_tick) |
| 148 | { | 155 | { |
| 149 | struct thread_info *ti = task_thread_info(p); | 156 | vtime_account_user(p); |
| 150 | cputime_t delta_utime; | ||
| 151 | |||
| 152 | if (ti->ac_utime) { | ||
| 153 | delta_utime = cycle_to_cputime(ti->ac_utime); | ||
| 154 | account_user_time(p, delta_utime, delta_utime); | ||
| 155 | ti->ac_utime = 0; | ||
| 156 | } | ||
| 157 | } | 157 | } |
| 158 | 158 | ||
| 159 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | 159 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ |
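For context, the consumer of these new arch hooks is the generic context-switch path; in rough outline (the real call site is the finish_task_switch() hunk in kernel/sched/core.c near the end of this diff):

	/*
	 * Rough outline only; see the kernel/sched/core.c hunk below for the
	 * actual finish_task_switch() sequence.
	 */
	vtime_task_switch(prev);		/* arch hook: charge prev, prime current */
	finish_arch_switch(prev);
	perf_event_task_sched_in(prev, current);
	finish_lock_switch(rq, prev);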
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 3b4b4a8da922..c1f267694acb 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
| @@ -197,12 +197,6 @@ struct cpu_usage { | |||
| 197 | 197 | ||
| 198 | DECLARE_PER_CPU(struct cpu_usage, cpu_usage_array); | 198 | DECLARE_PER_CPU(struct cpu_usage, cpu_usage_array); |
| 199 | 199 | ||
| 200 | #if defined(CONFIG_VIRT_CPU_ACCOUNTING) | ||
| 201 | #define account_process_vtime(tsk) account_process_tick(tsk, 0) | ||
| 202 | #else | ||
| 203 | #define account_process_vtime(tsk) do { } while (0) | ||
| 204 | #endif | ||
| 205 | |||
| 206 | extern void secondary_cpu_time_init(void); | 200 | extern void secondary_cpu_time_init(void); |
| 207 | 201 | ||
| 208 | DECLARE_PER_CPU(u64, decrementers_next_tb); | 202 | DECLARE_PER_CPU(u64, decrementers_next_tb); |
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1a1f2ddfb581..e9cb51f5f801 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
| @@ -514,9 +514,6 @@ struct task_struct *__switch_to(struct task_struct *prev, | |||
| 514 | 514 | ||
| 515 | local_irq_save(flags); | 515 | local_irq_save(flags); |
| 516 | 516 | ||
| 517 | account_system_vtime(current); | ||
| 518 | account_process_vtime(current); | ||
| 519 | |||
| 520 | /* | 517 | /* |
| 521 | * We can't take a PMU exception inside _switch() since there is a | 518 | * We can't take a PMU exception inside _switch() since there is a |
| 522 | * window where the kernel stack SLB and the kernel stack are out | 519 | * window where the kernel stack SLB and the kernel stack are out |
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index e49e93191b69..eaa9d0e6abca 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
| @@ -291,13 +291,12 @@ static inline u64 calculate_stolen_time(u64 stop_tb) | |||
| 291 | * Account time for a transition between system, hard irq | 291 | * Account time for a transition between system, hard irq |
| 292 | * or soft irq state. | 292 | * or soft irq state. |
| 293 | */ | 293 | */ |
| 294 | void account_system_vtime(struct task_struct *tsk) | 294 | static u64 vtime_delta(struct task_struct *tsk, |
| 295 | u64 *sys_scaled, u64 *stolen) | ||
| 295 | { | 296 | { |
| 296 | u64 now, nowscaled, delta, deltascaled; | 297 | u64 now, nowscaled, deltascaled; |
| 297 | unsigned long flags; | 298 | u64 udelta, delta, user_scaled; |
| 298 | u64 stolen, udelta, sys_scaled, user_scaled; | ||
| 299 | 299 | ||
| 300 | local_irq_save(flags); | ||
| 301 | now = mftb(); | 300 | now = mftb(); |
| 302 | nowscaled = read_spurr(now); | 301 | nowscaled = read_spurr(now); |
| 303 | get_paca()->system_time += now - get_paca()->starttime; | 302 | get_paca()->system_time += now - get_paca()->starttime; |
| @@ -305,7 +304,7 @@ void account_system_vtime(struct task_struct *tsk) | |||
| 305 | deltascaled = nowscaled - get_paca()->startspurr; | 304 | deltascaled = nowscaled - get_paca()->startspurr; |
| 306 | get_paca()->startspurr = nowscaled; | 305 | get_paca()->startspurr = nowscaled; |
| 307 | 306 | ||
| 308 | stolen = calculate_stolen_time(now); | 307 | *stolen = calculate_stolen_time(now); |
| 309 | 308 | ||
| 310 | delta = get_paca()->system_time; | 309 | delta = get_paca()->system_time; |
| 311 | get_paca()->system_time = 0; | 310 | get_paca()->system_time = 0; |
| @@ -322,35 +321,45 @@ void account_system_vtime(struct task_struct *tsk) | |||
| 322 | * the user ticks get saved up in paca->user_time_scaled to be | 321 | * the user ticks get saved up in paca->user_time_scaled to be |
| 323 | * used by account_process_tick. | 322 | * used by account_process_tick. |
| 324 | */ | 323 | */ |
| 325 | sys_scaled = delta; | 324 | *sys_scaled = delta; |
| 326 | user_scaled = udelta; | 325 | user_scaled = udelta; |
| 327 | if (deltascaled != delta + udelta) { | 326 | if (deltascaled != delta + udelta) { |
| 328 | if (udelta) { | 327 | if (udelta) { |
| 329 | sys_scaled = deltascaled * delta / (delta + udelta); | 328 | *sys_scaled = deltascaled * delta / (delta + udelta); |
| 330 | user_scaled = deltascaled - sys_scaled; | 329 | user_scaled = deltascaled - *sys_scaled; |
| 331 | } else { | 330 | } else { |
| 332 | sys_scaled = deltascaled; | 331 | *sys_scaled = deltascaled; |
| 333 | } | 332 | } |
| 334 | } | 333 | } |
| 335 | get_paca()->user_time_scaled += user_scaled; | 334 | get_paca()->user_time_scaled += user_scaled; |
| 336 | 335 | ||
| 337 | if (in_interrupt() || idle_task(smp_processor_id()) != tsk) { | 336 | return delta; |
| 338 | account_system_time(tsk, 0, delta, sys_scaled); | 337 | } |
| 339 | if (stolen) | 338 | |
| 340 | account_steal_time(stolen); | 339 | void vtime_account_system(struct task_struct *tsk) |
| 341 | } else { | 340 | { |
| 342 | account_idle_time(delta + stolen); | 341 | u64 delta, sys_scaled, stolen; |
| 343 | } | 342 | |
| 344 | local_irq_restore(flags); | 343 | delta = vtime_delta(tsk, &sys_scaled, &stolen); |
| 344 | account_system_time(tsk, 0, delta, sys_scaled); | ||
| 345 | if (stolen) | ||
| 346 | account_steal_time(stolen); | ||
| 347 | } | ||
| 348 | |||
| 349 | void vtime_account_idle(struct task_struct *tsk) | ||
| 350 | { | ||
| 351 | u64 delta, sys_scaled, stolen; | ||
| 352 | |||
| 353 | delta = vtime_delta(tsk, &sys_scaled, &stolen); | ||
| 354 | account_idle_time(delta + stolen); | ||
| 345 | } | 355 | } |
| 346 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
| 347 | 356 | ||
| 348 | /* | 357 | /* |
| 349 | * Transfer the user and system times accumulated in the paca | 358 | * Transfer the user and system times accumulated in the paca |
| 350 | * by the exception entry and exit code to the generic process | 359 | * by the exception entry and exit code to the generic process |
| 351 | * user and system time records. | 360 | * user and system time records. |
| 352 | * Must be called with interrupts disabled. | 361 | * Must be called with interrupts disabled. |
| 353 | * Assumes that account_system_vtime() has been called recently | 362 | * Assumes that vtime_account() has been called recently |
| 354 | * (i.e. since the last entry from usermode) so that | 363 | * (i.e. since the last entry from usermode) so that |
| 355 | * get_paca()->user_time_scaled is up to date. | 364 | * get_paca()->user_time_scaled is up to date. |
| 356 | */ | 365 | */ |
| @@ -366,6 +375,12 @@ void account_process_tick(struct task_struct *tsk, int user_tick) | |||
| 366 | account_user_time(tsk, utime, utimescaled); | 375 | account_user_time(tsk, utime, utimescaled); |
| 367 | } | 376 | } |
| 368 | 377 | ||
| 378 | void vtime_task_switch(struct task_struct *prev) | ||
| 379 | { | ||
| 380 | vtime_account(prev); | ||
| 381 | account_process_tick(prev, 0); | ||
| 382 | } | ||
| 383 | |||
| 369 | #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ | 384 | #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ |
| 370 | #define calc_cputime_factors() | 385 | #define calc_cputime_factors() |
| 371 | #endif | 386 | #endif |
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 30fd01de6bed..72afd2888cad 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
| @@ -1,6 +1,7 @@ | |||
| 1 | config PPC64 | 1 | config PPC64 |
| 2 | bool "64-bit kernel" | 2 | bool "64-bit kernel" |
| 3 | default n | 3 | default n |
| 4 | select HAVE_VIRT_CPU_ACCOUNTING | ||
| 4 | help | 5 | help |
| 5 | This option selects whether a 32-bit or a 64-bit kernel | 6 | This option selects whether a 32-bit or a 64-bit kernel |
| 6 | will be built. | 7 | will be built. |
| @@ -337,21 +338,6 @@ config PPC_MM_SLICES | |||
| 337 | default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES) | 338 | default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES) |
| 338 | default n | 339 | default n |
| 339 | 340 | ||
| 340 | config VIRT_CPU_ACCOUNTING | ||
| 341 | bool "Deterministic task and CPU time accounting" | ||
| 342 | depends on PPC64 | ||
| 343 | default y | ||
| 344 | help | ||
| 345 | Select this option to enable more accurate task and CPU time | ||
| 346 | accounting. This is done by reading a CPU counter on each | ||
| 347 | kernel entry and exit and on transitions within the kernel | ||
| 348 | between system, softirq and hardirq state, so there is a | ||
| 349 | small performance impact. This also enables accounting of | ||
| 350 | stolen time on logically-partitioned systems running on | ||
| 351 | IBM POWER5-based machines. | ||
| 352 | |||
| 353 | If in doubt, say Y here. | ||
| 354 | |||
| 355 | config PPC_HAVE_PMU_SUPPORT | 341 | config PPC_HAVE_PMU_SUPPORT |
| 356 | bool | 342 | bool |
| 357 | 343 | ||
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 107610e01a29..f5ab543396da 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
| @@ -49,9 +49,6 @@ config GENERIC_LOCKBREAK | |||
| 49 | config PGSTE | 49 | config PGSTE |
| 50 | def_bool y if KVM | 50 | def_bool y if KVM |
| 51 | 51 | ||
| 52 | config VIRT_CPU_ACCOUNTING | ||
| 53 | def_bool y | ||
| 54 | |||
| 55 | config ARCH_SUPPORTS_DEBUG_PAGEALLOC | 52 | config ARCH_SUPPORTS_DEBUG_PAGEALLOC |
| 56 | def_bool y | 53 | def_bool y |
| 57 | 54 | ||
| @@ -89,6 +86,8 @@ config S390 | |||
| 89 | select HAVE_MEMBLOCK | 86 | select HAVE_MEMBLOCK |
| 90 | select HAVE_MEMBLOCK_NODE_MAP | 87 | select HAVE_MEMBLOCK_NODE_MAP |
| 91 | select HAVE_CMPXCHG_LOCAL | 88 | select HAVE_CMPXCHG_LOCAL |
| 89 | select HAVE_VIRT_CPU_ACCOUNTING | ||
| 90 | select VIRT_CPU_ACCOUNTING | ||
| 92 | select ARCH_DISCARD_MEMBLOCK | 91 | select ARCH_DISCARD_MEMBLOCK |
| 93 | select BUILDTIME_EXTABLE_SORT | 92 | select BUILDTIME_EXTABLE_SORT |
| 94 | select ARCH_INLINE_SPIN_TRYLOCK | 93 | select ARCH_INLINE_SPIN_TRYLOCK |
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h
index 8709bdef233c..023d5ae24482 100644
--- a/arch/s390/include/asm/cputime.h
+++ b/arch/s390/include/asm/cputime.h
| @@ -12,6 +12,9 @@ | |||
| 12 | #include <linux/spinlock.h> | 12 | #include <linux/spinlock.h> |
| 13 | #include <asm/div64.h> | 13 | #include <asm/div64.h> |
| 14 | 14 | ||
| 15 | |||
| 16 | #define __ARCH_HAS_VTIME_ACCOUNT | ||
| 17 | |||
| 15 | /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ | 18 | /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ |
| 16 | 19 | ||
| 17 | typedef unsigned long long __nocast cputime_t; | 20 | typedef unsigned long long __nocast cputime_t; |
diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h
index f223068b7822..314cc9426fc4 100644
--- a/arch/s390/include/asm/switch_to.h
+++ b/arch/s390/include/asm/switch_to.h
| @@ -89,12 +89,8 @@ static inline void restore_access_regs(unsigned int *acrs) | |||
| 89 | prev = __switch_to(prev,next); \ | 89 | prev = __switch_to(prev,next); \ |
| 90 | } while (0) | 90 | } while (0) |
| 91 | 91 | ||
| 92 | extern void account_vtime(struct task_struct *, struct task_struct *); | ||
| 93 | extern void account_tick_vtime(struct task_struct *); | ||
| 94 | |||
| 95 | #define finish_arch_switch(prev) do { \ | 92 | #define finish_arch_switch(prev) do { \ |
| 96 | set_fs(current->thread.mm_segment); \ | 93 | set_fs(current->thread.mm_segment); \ |
| 97 | account_vtime(prev, current); \ | ||
| 98 | } while (0) | 94 | } while (0) |
| 99 | 95 | ||
| 100 | #endif /* __ASM_SWITCH_TO_H */ | 96 | #endif /* __ASM_SWITCH_TO_H */ |
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 4fc97b40a6e1..cb5093c26d16 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
| @@ -99,7 +99,7 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) | |||
| 99 | return virt_timer_forward(user + system); | 99 | return virt_timer_forward(user + system); |
| 100 | } | 100 | } |
| 101 | 101 | ||
| 102 | void account_vtime(struct task_struct *prev, struct task_struct *next) | 102 | void vtime_task_switch(struct task_struct *prev) |
| 103 | { | 103 | { |
| 104 | struct thread_info *ti; | 104 | struct thread_info *ti; |
| 105 | 105 | ||
| @@ -107,7 +107,7 @@ void account_vtime(struct task_struct *prev, struct task_struct *next) | |||
| 107 | ti = task_thread_info(prev); | 107 | ti = task_thread_info(prev); |
| 108 | ti->user_timer = S390_lowcore.user_timer; | 108 | ti->user_timer = S390_lowcore.user_timer; |
| 109 | ti->system_timer = S390_lowcore.system_timer; | 109 | ti->system_timer = S390_lowcore.system_timer; |
| 110 | ti = task_thread_info(next); | 110 | ti = task_thread_info(current); |
| 111 | S390_lowcore.user_timer = ti->user_timer; | 111 | S390_lowcore.user_timer = ti->user_timer; |
| 112 | S390_lowcore.system_timer = ti->system_timer; | 112 | S390_lowcore.system_timer = ti->system_timer; |
| 113 | } | 113 | } |
| @@ -122,7 +122,7 @@ void account_process_tick(struct task_struct *tsk, int user_tick) | |||
| 122 | * Update process times based on virtual cpu times stored by entry.S | 122 | * Update process times based on virtual cpu times stored by entry.S |
| 123 | * to the lowcore fields user_timer, system_timer & steal_clock. | 123 | * to the lowcore fields user_timer, system_timer & steal_clock. |
| 124 | */ | 124 | */ |
| 125 | void account_system_vtime(struct task_struct *tsk) | 125 | void vtime_account(struct task_struct *tsk) |
| 126 | { | 126 | { |
| 127 | struct thread_info *ti = task_thread_info(tsk); | 127 | struct thread_info *ti = task_thread_info(tsk); |
| 128 | u64 timer, system; | 128 | u64 timer, system; |
| @@ -138,7 +138,7 @@ void account_system_vtime(struct task_struct *tsk) | |||
| 138 | 138 | ||
| 139 | virt_timer_forward(system); | 139 | virt_timer_forward(system); |
| 140 | } | 140 | } |
| 141 | EXPORT_SYMBOL_GPL(account_system_vtime); | 141 | EXPORT_SYMBOL_GPL(vtime_account); |
| 142 | 142 | ||
| 143 | void __kprobes vtime_stop_cpu(void) | 143 | void __kprobes vtime_stop_cpu(void) |
| 144 | { | 144 | { |
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 7a7ce390534f..d5e86c9f74fd 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
| @@ -69,7 +69,6 @@ static inline const struct cpumask *cpumask_of_node(int node) | |||
| 69 | | 1*SD_BALANCE_FORK \ | 69 | | 1*SD_BALANCE_FORK \ |
| 70 | | 0*SD_BALANCE_WAKE \ | 70 | | 0*SD_BALANCE_WAKE \ |
| 71 | | 0*SD_WAKE_AFFINE \ | 71 | | 0*SD_WAKE_AFFINE \ |
| 72 | | 0*SD_PREFER_LOCAL \ | ||
| 73 | | 0*SD_SHARE_CPUPOWER \ | 72 | | 0*SD_SHARE_CPUPOWER \ |
| 74 | | 0*SD_SHARE_PKG_RESOURCES \ | 73 | | 0*SD_SHARE_PKG_RESOURCES \ |
| 75 | | 0*SD_SERIALIZE \ | 74 | | 0*SD_SERIALIZE \ |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8ff1f56a0188..488ba8da8fef 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
| @@ -101,6 +101,7 @@ config X86 | |||
| 101 | select GENERIC_STRNCPY_FROM_USER | 101 | select GENERIC_STRNCPY_FROM_USER |
| 102 | select GENERIC_STRNLEN_USER | 102 | select GENERIC_STRNLEN_USER |
| 103 | select HAVE_RCU_USER_QS if X86_64 | 103 | select HAVE_RCU_USER_QS if X86_64 |
| 104 | select HAVE_IRQ_TIME_ACCOUNTING | ||
| 104 | 105 | ||
| 105 | config INSTRUCTION_DECODER | 106 | config INSTRUCTION_DECODER |
| 106 | def_bool (KPROBES || PERF_EVENTS || UPROBES) | 107 | def_bool (KPROBES || PERF_EVENTS || UPROBES) |
| @@ -800,17 +801,6 @@ config SCHED_MC | |||
| 800 | making when dealing with multi-core CPU chips at a cost of slightly | 801 | making when dealing with multi-core CPU chips at a cost of slightly |
| 801 | increased overhead in some places. If unsure say N here. | 802 | increased overhead in some places. If unsure say N here. |
| 802 | 803 | ||
| 803 | config IRQ_TIME_ACCOUNTING | ||
| 804 | bool "Fine granularity task level IRQ time accounting" | ||
| 805 | default n | ||
| 806 | ---help--- | ||
| 807 | Select this option to enable fine granularity task irq time | ||
| 808 | accounting. This is done by reading a timestamp on each | ||
| 809 | transitions between softirq and hardirq state, so there can be a | ||
| 810 | small performance impact. | ||
| 811 | |||
| 812 | If in doubt, say N here. | ||
| 813 | |||
| 814 | source "kernel/Kconfig.preempt" | 804 | source "kernel/Kconfig.preempt" |
| 815 | 805 | ||
| 816 | config X86_UP_APIC | 806 | config X86_UP_APIC |
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 305f23cd7cff..cab3da3d0949 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
| @@ -132,11 +132,11 @@ extern void synchronize_irq(unsigned int irq); | |||
| 132 | struct task_struct; | 132 | struct task_struct; |
| 133 | 133 | ||
| 134 | #if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) | 134 | #if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) |
| 135 | static inline void account_system_vtime(struct task_struct *tsk) | 135 | static inline void vtime_account(struct task_struct *tsk) |
| 136 | { | 136 | { |
| 137 | } | 137 | } |
| 138 | #else | 138 | #else |
| 139 | extern void account_system_vtime(struct task_struct *tsk); | 139 | extern void vtime_account(struct task_struct *tsk); |
| 140 | #endif | 140 | #endif |
| 141 | 141 | ||
| 142 | #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) | 142 | #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) |
| @@ -162,7 +162,7 @@ extern void rcu_nmi_exit(void); | |||
| 162 | */ | 162 | */ |
| 163 | #define __irq_enter() \ | 163 | #define __irq_enter() \ |
| 164 | do { \ | 164 | do { \ |
| 165 | account_system_vtime(current); \ | 165 | vtime_account(current); \ |
| 166 | add_preempt_count(HARDIRQ_OFFSET); \ | 166 | add_preempt_count(HARDIRQ_OFFSET); \ |
| 167 | trace_hardirq_enter(); \ | 167 | trace_hardirq_enter(); \ |
| 168 | } while (0) | 168 | } while (0) |
| @@ -178,7 +178,7 @@ extern void irq_enter(void); | |||
| 178 | #define __irq_exit() \ | 178 | #define __irq_exit() \ |
| 179 | do { \ | 179 | do { \ |
| 180 | trace_hardirq_exit(); \ | 180 | trace_hardirq_exit(); \ |
| 181 | account_system_vtime(current); \ | 181 | vtime_account(current); \ |
| 182 | sub_preempt_count(HARDIRQ_OFFSET); \ | 182 | sub_preempt_count(HARDIRQ_OFFSET); \ |
| 183 | } while (0) | 183 | } while (0) |
| 184 | 184 | ||
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 2fbd9053c2df..36d12f0884c3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
| @@ -130,4 +130,12 @@ extern void account_process_tick(struct task_struct *, int user); | |||
| 130 | extern void account_steal_ticks(unsigned long ticks); | 130 | extern void account_steal_ticks(unsigned long ticks); |
| 131 | extern void account_idle_ticks(unsigned long ticks); | 131 | extern void account_idle_ticks(unsigned long ticks); |
| 132 | 132 | ||
| 133 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 134 | extern void vtime_task_switch(struct task_struct *prev); | ||
| 135 | extern void vtime_account_system(struct task_struct *tsk); | ||
| 136 | extern void vtime_account_idle(struct task_struct *tsk); | ||
| 137 | #else | ||
| 138 | static inline void vtime_task_switch(struct task_struct *prev) { } | ||
| 139 | #endif | ||
| 140 | |||
| 133 | #endif /* _LINUX_KERNEL_STAT_H */ | 141 | #endif /* _LINUX_KERNEL_STAT_H */ |
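The vtime_account() callers in the hardirq.h hunk above pair with these declarations. For an arch that does not define __ARCH_HAS_VTIME_ACCOUNT (s390 does, per its cputime.h hunk), the generic fallback that picks between system and idle time presumably looks roughly like this sketch (the real code lives in kernel/sched/cputime.c and is not shown in this diff):

	#include <linux/hardirq.h>
	#include <linux/kernel_stat.h>
	#include <linux/sched.h>

	/*
	 * Sketch only: generic vtime_account() fallback. Archs that define
	 * __ARCH_HAS_VTIME_ACCOUNT provide their own irq-safe version instead.
	 */
	void vtime_account(struct task_struct *tsk)
	{
		unsigned long flags;

		local_irq_save(flags);
		if (in_interrupt() || !is_idle_task(tsk))
			vtime_account_system(tsk);
		else
			vtime_account_idle(tsk);
		local_irq_restore(flags);
	}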
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b70b48b01098..8a59e0abe5fa 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
| @@ -685,7 +685,7 @@ static inline int kvm_deassign_device(struct kvm *kvm, | |||
| 685 | static inline void kvm_guest_enter(void) | 685 | static inline void kvm_guest_enter(void) |
| 686 | { | 686 | { |
| 687 | BUG_ON(preemptible()); | 687 | BUG_ON(preemptible()); |
| 688 | account_system_vtime(current); | 688 | vtime_account(current); |
| 689 | current->flags |= PF_VCPU; | 689 | current->flags |= PF_VCPU; |
| 690 | /* KVM does not hold any references to rcu protected data when it | 690 | /* KVM does not hold any references to rcu protected data when it |
| 691 | * switches CPU into a guest mode. In fact switching to a guest mode | 691 | * switches CPU into a guest mode. In fact switching to a guest mode |
| @@ -699,7 +699,7 @@ static inline void kvm_guest_enter(void) | |||
| 699 | 699 | ||
| 700 | static inline void kvm_guest_exit(void) | 700 | static inline void kvm_guest_exit(void) |
| 701 | { | 701 | { |
| 702 | account_system_vtime(current); | 702 | vtime_account(current); |
| 703 | current->flags &= ~PF_VCPU; | 703 | current->flags &= ~PF_VCPU; |
| 704 | } | 704 | } |
| 705 | 705 | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 83035269e597..765dffbb085e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
| @@ -273,11 +273,11 @@ extern void init_idle_bootup_task(struct task_struct *idle); | |||
| 273 | extern int runqueue_is_locked(int cpu); | 273 | extern int runqueue_is_locked(int cpu); |
| 274 | 274 | ||
| 275 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) | 275 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) |
| 276 | extern void select_nohz_load_balancer(int stop_tick); | 276 | extern void nohz_balance_enter_idle(int cpu); |
| 277 | extern void set_cpu_sd_state_idle(void); | 277 | extern void set_cpu_sd_state_idle(void); |
| 278 | extern int get_nohz_timer_target(void); | 278 | extern int get_nohz_timer_target(void); |
| 279 | #else | 279 | #else |
| 280 | static inline void select_nohz_load_balancer(int stop_tick) { } | 280 | static inline void nohz_balance_enter_idle(int cpu) { } |
| 281 | static inline void set_cpu_sd_state_idle(void) { } | 281 | static inline void set_cpu_sd_state_idle(void) { } |
| 282 | #endif | 282 | #endif |
| 283 | 283 | ||
| @@ -681,11 +681,6 @@ struct signal_struct { | |||
| 681 | * (notably. ptrace) */ | 681 | * (notably. ptrace) */ |
| 682 | }; | 682 | }; |
| 683 | 683 | ||
| 684 | /* Context switch must be unlocked if interrupts are to be enabled */ | ||
| 685 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 686 | # define __ARCH_WANT_UNLOCKED_CTXSW | ||
| 687 | #endif | ||
| 688 | |||
| 689 | /* | 684 | /* |
| 690 | * Bits in flags field of signal_struct. | 685 | * Bits in flags field of signal_struct. |
| 691 | */ | 686 | */ |
| @@ -863,7 +858,6 @@ enum cpu_idle_type { | |||
| 863 | #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ | 858 | #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ |
| 864 | #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ | 859 | #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ |
| 865 | #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ | 860 | #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ |
| 866 | #define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */ | ||
| 867 | #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ | 861 | #define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ |
| 868 | #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ | 862 | #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ |
| 869 | #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ | 863 | #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ |
diff --git a/include/linux/topology.h b/include/linux/topology.h
index fec12d667211..d3cf0d6e7712 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
| @@ -129,7 +129,6 @@ int arch_update_cpu_topology(void); | |||
| 129 | | 1*SD_BALANCE_FORK \ | 129 | | 1*SD_BALANCE_FORK \ |
| 130 | | 0*SD_BALANCE_WAKE \ | 130 | | 0*SD_BALANCE_WAKE \ |
| 131 | | 1*SD_WAKE_AFFINE \ | 131 | | 1*SD_WAKE_AFFINE \ |
| 132 | | 0*SD_PREFER_LOCAL \ | ||
| 133 | | 0*SD_SHARE_CPUPOWER \ | 132 | | 0*SD_SHARE_CPUPOWER \ |
| 134 | | 1*SD_SHARE_PKG_RESOURCES \ | 133 | | 1*SD_SHARE_PKG_RESOURCES \ |
| 135 | | 0*SD_SERIALIZE \ | 134 | | 0*SD_SERIALIZE \ |
| @@ -160,7 +159,6 @@ int arch_update_cpu_topology(void); | |||
| 160 | | 1*SD_BALANCE_FORK \ | 159 | | 1*SD_BALANCE_FORK \ |
| 161 | | 0*SD_BALANCE_WAKE \ | 160 | | 0*SD_BALANCE_WAKE \ |
| 162 | | 1*SD_WAKE_AFFINE \ | 161 | | 1*SD_WAKE_AFFINE \ |
| 163 | | 0*SD_PREFER_LOCAL \ | ||
| 164 | | 0*SD_SHARE_CPUPOWER \ | 162 | | 0*SD_SHARE_CPUPOWER \ |
| 165 | | 0*SD_SHARE_PKG_RESOURCES \ | 163 | | 0*SD_SHARE_PKG_RESOURCES \ |
| 166 | | 0*SD_SERIALIZE \ | 164 | | 0*SD_SERIALIZE \ |
diff --git a/init/Kconfig b/init/Kconfig
index c26b8a1d2b57..3466a6e017b7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
| @@ -267,6 +267,106 @@ config POSIX_MQUEUE_SYSCTL | |||
| 267 | depends on SYSCTL | 267 | depends on SYSCTL |
| 268 | default y | 268 | default y |
| 269 | 269 | ||
| 270 | config FHANDLE | ||
| 271 | bool "open by fhandle syscalls" | ||
| 272 | select EXPORTFS | ||
| 273 | help | ||
| 274 | If you say Y here, a user level program will be able to map | ||
| 275 | file names to handle and then later use the handle for | ||
| 276 | different file system operations. This is useful in implementing | ||
| 277 | userspace file servers, which now track files using handles instead | ||
| 278 | of names. The handle would remain the same even if file names | ||
| 279 | get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) | ||
| 280 | syscalls. | ||
| 281 | |||
| 282 | config AUDIT | ||
| 283 | bool "Auditing support" | ||
| 284 | depends on NET | ||
| 285 | help | ||
| 286 | Enable auditing infrastructure that can be used with another | ||
| 287 | kernel subsystem, such as SELinux (which requires this for | ||
| 288 | logging of avc messages output). Does not do system-call | ||
| 289 | auditing without CONFIG_AUDITSYSCALL. | ||
| 290 | |||
| 291 | config AUDITSYSCALL | ||
| 292 | bool "Enable system-call auditing support" | ||
| 293 | depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT)) | ||
| 294 | default y if SECURITY_SELINUX | ||
| 295 | help | ||
| 296 | Enable low-overhead system-call auditing infrastructure that | ||
| 297 | can be used independently or with another kernel subsystem, | ||
| 298 | such as SELinux. | ||
| 299 | |||
| 300 | config AUDIT_WATCH | ||
| 301 | def_bool y | ||
| 302 | depends on AUDITSYSCALL | ||
| 303 | select FSNOTIFY | ||
| 304 | |||
| 305 | config AUDIT_TREE | ||
| 306 | def_bool y | ||
| 307 | depends on AUDITSYSCALL | ||
| 308 | select FSNOTIFY | ||
| 309 | |||
| 310 | config AUDIT_LOGINUID_IMMUTABLE | ||
| 311 | bool "Make audit loginuid immutable" | ||
| 312 | depends on AUDIT | ||
| 313 | help | ||
| 314 | The config option toggles if a task setting its loginuid requires | ||
| 315 | CAP_SYS_AUDITCONTROL or if that task should require no special permissions | ||
| 316 | but should instead only allow setting its loginuid if it was never | ||
| 317 | previously set. On systems which use systemd or a similar central | ||
| 318 | process to restart login services this should be set to true. On older | ||
| 319 | systems in which an admin would typically have to directly stop and | ||
| 320 | start processes this should be set to false. Setting this to true allows | ||
| 321 | one to drop potentially dangerous capabilites from the login tasks, | ||
| 322 | but may not be backwards compatible with older init systems. | ||
| 323 | |||
| 324 | source "kernel/irq/Kconfig" | ||
| 325 | source "kernel/time/Kconfig" | ||
| 326 | |||
| 327 | menu "CPU/Task time and stats accounting" | ||
| 328 | |||
| 329 | choice | ||
| 330 | prompt "Cputime accounting" | ||
| 331 | default TICK_CPU_ACCOUNTING if !PPC64 | ||
| 332 | default VIRT_CPU_ACCOUNTING if PPC64 | ||
| 333 | |||
| 334 | # Kind of a stub config for the pure tick based cputime accounting | ||
| 335 | config TICK_CPU_ACCOUNTING | ||
| 336 | bool "Simple tick based cputime accounting" | ||
| 337 | depends on !S390 | ||
| 338 | help | ||
| 339 | This is the basic tick based cputime accounting that maintains | ||
| 340 | statistics about user, system and idle time spent on per jiffies | ||
| 341 | granularity. | ||
| 342 | |||
| 343 | If unsure, say Y. | ||
| 344 | |||
| 345 | config VIRT_CPU_ACCOUNTING | ||
| 346 | bool "Deterministic task and CPU time accounting" | ||
| 347 | depends on HAVE_VIRT_CPU_ACCOUNTING | ||
| 348 | help | ||
| 349 | Select this option to enable more accurate task and CPU time | ||
| 350 | accounting. This is done by reading a CPU counter on each | ||
| 351 | kernel entry and exit and on transitions within the kernel | ||
| 352 | between system, softirq and hardirq state, so there is a | ||
| 353 | small performance impact. In the case of s390 or IBM POWER > 5, | ||
| 354 | this also enables accounting of stolen time on logically-partitioned | ||
| 355 | systems. | ||
| 356 | |||
| 357 | config IRQ_TIME_ACCOUNTING | ||
| 358 | bool "Fine granularity task level IRQ time accounting" | ||
| 359 | depends on HAVE_IRQ_TIME_ACCOUNTING | ||
| 360 | help | ||
| 361 | Select this option to enable fine granularity task irq time | ||
| 362 | accounting. This is done by reading a timestamp on each | ||
| 363 | transitions between softirq and hardirq state, so there can be a | ||
| 364 | small performance impact. | ||
| 365 | |||
| 366 | If in doubt, say N here. | ||
| 367 | |||
| 368 | endchoice | ||
| 369 | |||
| 270 | config BSD_PROCESS_ACCT | 370 | config BSD_PROCESS_ACCT |
| 271 | bool "BSD Process Accounting" | 371 | bool "BSD Process Accounting" |
| 272 | help | 372 | help |
| @@ -292,18 +392,6 @@ config BSD_PROCESS_ACCT_V3 | |||
| 292 | for processing it. A preliminary version of these tools is available | 392 | for processing it. A preliminary version of these tools is available |
| 293 | at <http://www.gnu.org/software/acct/>. | 393 | at <http://www.gnu.org/software/acct/>. |
| 294 | 394 | ||
| 295 | config FHANDLE | ||
| 296 | bool "open by fhandle syscalls" | ||
| 297 | select EXPORTFS | ||
| 298 | help | ||
| 299 | If you say Y here, a user level program will be able to map | ||
| 300 | file names to handle and then later use the handle for | ||
| 301 | different file system operations. This is useful in implementing | ||
| 302 | userspace file servers, which now track files using handles instead | ||
| 303 | of names. The handle would remain the same even if file names | ||
| 304 | get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) | ||
| 305 | syscalls. | ||
| 306 | |||
| 307 | config TASKSTATS | 395 | config TASKSTATS |
| 308 | bool "Export task/process statistics through netlink (EXPERIMENTAL)" | 396 | bool "Export task/process statistics through netlink (EXPERIMENTAL)" |
| 309 | depends on NET | 397 | depends on NET |
| @@ -346,50 +434,7 @@ config TASK_IO_ACCOUNTING | |||
| 346 | 434 | ||
| 347 | Say N if unsure. | 435 | Say N if unsure. |
| 348 | 436 | ||
| 349 | config AUDIT | 437 | endmenu # "CPU/Task time and stats accounting" |
| 350 | bool "Auditing support" | ||
| 351 | depends on NET | ||
| 352 | help | ||
| 353 | Enable auditing infrastructure that can be used with another | ||
| 354 | kernel subsystem, such as SELinux (which requires this for | ||
| 355 | logging of avc messages output). Does not do system-call | ||
| 356 | auditing without CONFIG_AUDITSYSCALL. | ||
| 357 | |||
| 358 | config AUDITSYSCALL | ||
| 359 | bool "Enable system-call auditing support" | ||
| 360 | depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT)) | ||
| 361 | default y if SECURITY_SELINUX | ||
| 362 | help | ||
| 363 | Enable low-overhead system-call auditing infrastructure that | ||
| 364 | can be used independently or with another kernel subsystem, | ||
| 365 | such as SELinux. | ||
| 366 | |||
| 367 | config AUDIT_WATCH | ||
| 368 | def_bool y | ||
| 369 | depends on AUDITSYSCALL | ||
| 370 | select FSNOTIFY | ||
| 371 | |||
| 372 | config AUDIT_TREE | ||
| 373 | def_bool y | ||
| 374 | depends on AUDITSYSCALL | ||
| 375 | select FSNOTIFY | ||
| 376 | |||
| 377 | config AUDIT_LOGINUID_IMMUTABLE | ||
| 378 | bool "Make audit loginuid immutable" | ||
| 379 | depends on AUDIT | ||
| 380 | help | ||
| 381 | The config option toggles if a task setting its loginuid requires | ||
| 382 | CAP_SYS_AUDITCONTROL or if that task should require no special permissions | ||
| 383 | but should instead only allow setting its loginuid if it was never | ||
| 384 | previously set. On systems which use systemd or a similar central | ||
| 385 | process to restart login services this should be set to true. On older | ||
| 386 | systems in which an admin would typically have to directly stop and | ||
| 387 | start processes this should be set to false. Setting this to true allows | ||
| 388 | one to drop potentially dangerous capabilites from the login tasks, | ||
| 389 | but may not be backwards compatible with older init systems. | ||
| 390 | |||
| 391 | source "kernel/irq/Kconfig" | ||
| 392 | source "kernel/time/Kconfig" | ||
| 393 | 438 | ||
| 394 | menu "RCU Subsystem" | 439 | menu "RCU Subsystem" |
| 395 | 440 | ||
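An architecture opts in to the new cputime menu the same way the ia64, s390 and x86 hunks in this diff do; schematically (hypothetical arch symbol, pattern only):

	config MYARCH
		bool
		select HAVE_VIRT_CPU_ACCOUNTING	# exposes "Deterministic task and CPU time accounting"
		select HAVE_IRQ_TIME_ACCOUNTING	# exposes "Fine granularity task level IRQ time accounting"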
diff --git a/kernel/fork.c b/kernel/fork.c
index 2343c9eaaaf4..5a0e74d89a5a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
| @@ -1276,11 +1276,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1276 | #endif | 1276 | #endif |
| 1277 | #ifdef CONFIG_TRACE_IRQFLAGS | 1277 | #ifdef CONFIG_TRACE_IRQFLAGS |
| 1278 | p->irq_events = 0; | 1278 | p->irq_events = 0; |
| 1279 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1280 | p->hardirqs_enabled = 1; | ||
| 1281 | #else | ||
| 1282 | p->hardirqs_enabled = 0; | 1279 | p->hardirqs_enabled = 0; |
| 1283 | #endif | ||
| 1284 | p->hardirq_enable_ip = 0; | 1280 | p->hardirq_enable_ip = 0; |
| 1285 | p->hardirq_enable_event = 0; | 1281 | p->hardirq_enable_event = 0; |
| 1286 | p->hardirq_disable_ip = _THIS_IP_; | 1282 | p->hardirq_disable_ip = _THIS_IP_; |
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 173ea52f3af0..f06d249e103b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
| @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | |||
| 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer |
| 12 | endif | 12 | endif |
| 13 | 13 | ||
| 14 | obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o | 14 | obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o |
| 15 | obj-$(CONFIG_SMP) += cpupri.o | 15 | obj-$(CONFIG_SMP) += cpupri.o |
| 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
| 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3c4dec0594d6..c17747236438 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
| @@ -740,126 +740,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 740 | dequeue_task(rq, p, flags); | 740 | dequeue_task(rq, p, flags); |
| 741 | } | 741 | } |
| 742 | 742 | ||
| 743 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 744 | |||
| 745 | /* | ||
| 746 | * There are no locks covering percpu hardirq/softirq time. | ||
| 747 | * They are only modified in account_system_vtime, on corresponding CPU | ||
| 748 | * with interrupts disabled. So, writes are safe. | ||
| 749 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
| 750 | * This may result in other CPU reading this CPU's irq time and can | ||
| 751 | * race with irq/account_system_vtime on this CPU. We would either get old | ||
| 752 | * or new value with a side effect of accounting a slice of irq time to wrong | ||
| 753 | * task when irq is in progress while we read rq->clock. That is a worthy | ||
| 754 | * compromise in place of having locks on each irq in account_system_time. | ||
| 755 | */ | ||
| 756 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
| 757 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
| 758 | |||
| 759 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
| 760 | static int sched_clock_irqtime; | ||
| 761 | |||
| 762 | void enable_sched_clock_irqtime(void) | ||
| 763 | { | ||
| 764 | sched_clock_irqtime = 1; | ||
| 765 | } | ||
| 766 | |||
| 767 | void disable_sched_clock_irqtime(void) | ||
| 768 | { | ||
| 769 | sched_clock_irqtime = 0; | ||
| 770 | } | ||
| 771 | |||
| 772 | #ifndef CONFIG_64BIT | ||
| 773 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
| 774 | |||
| 775 | static inline void irq_time_write_begin(void) | ||
| 776 | { | ||
| 777 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 778 | smp_wmb(); | ||
| 779 | } | ||
| 780 | |||
| 781 | static inline void irq_time_write_end(void) | ||
| 782 | { | ||
| 783 | smp_wmb(); | ||
| 784 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 785 | } | ||
| 786 | |||
| 787 | static inline u64 irq_time_read(int cpu) | ||
| 788 | { | ||
| 789 | u64 irq_time; | ||
| 790 | unsigned seq; | ||
| 791 | |||
| 792 | do { | ||
| 793 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
| 794 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
| 795 | per_cpu(cpu_hardirq_time, cpu); | ||
| 796 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
| 797 | |||
| 798 | return irq_time; | ||
| 799 | } | ||
| 800 | #else /* CONFIG_64BIT */ | ||
| 801 | static inline void irq_time_write_begin(void) | ||
| 802 | { | ||
| 803 | } | ||
| 804 | |||
| 805 | static inline void irq_time_write_end(void) | ||
| 806 | { | ||
| 807 | } | ||
| 808 | |||
| 809 | static inline u64 irq_time_read(int cpu) | ||
| 810 | { | ||
| 811 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
| 812 | } | ||
| 813 | #endif /* CONFIG_64BIT */ | ||
| 814 | |||
| 815 | /* | ||
| 816 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
| 817 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
| 818 | */ | ||
| 819 | void account_system_vtime(struct task_struct *curr) | ||
| 820 | { | ||
| 821 | unsigned long flags; | ||
| 822 | s64 delta; | ||
| 823 | int cpu; | ||
| 824 | |||
| 825 | if (!sched_clock_irqtime) | ||
| 826 | return; | ||
| 827 | |||
| 828 | local_irq_save(flags); | ||
| 829 | |||
| 830 | cpu = smp_processor_id(); | ||
| 831 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | ||
| 832 | __this_cpu_add(irq_start_time, delta); | ||
| 833 | |||
| 834 | irq_time_write_begin(); | ||
| 835 | /* | ||
| 836 | * We do not account for softirq time from ksoftirqd here. | ||
| 837 | * We want to continue accounting softirq time to ksoftirqd thread | ||
| 838 | * in that case, so as not to confuse scheduler with a special task | ||
| 839 | * that do not consume any time, but still wants to run. | ||
| 840 | */ | ||
| 841 | if (hardirq_count()) | ||
| 842 | __this_cpu_add(cpu_hardirq_time, delta); | ||
| 843 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | ||
| 844 | __this_cpu_add(cpu_softirq_time, delta); | ||
| 845 | |||
| 846 | irq_time_write_end(); | ||
| 847 | local_irq_restore(flags); | ||
| 848 | } | ||
| 849 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
| 850 | |||
| 851 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 852 | |||
| 853 | #ifdef CONFIG_PARAVIRT | ||
| 854 | static inline u64 steal_ticks(u64 steal) | ||
| 855 | { | ||
| 856 | if (unlikely(steal > NSEC_PER_SEC)) | ||
| 857 | return div_u64(steal, TICK_NSEC); | ||
| 858 | |||
| 859 | return __iter_div_u64_rem(steal, TICK_NSEC, &steal); | ||
| 860 | } | ||
| 861 | #endif | ||
| 862 | |||
| 863 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 743 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
| 864 | { | 744 | { |
| 865 | /* | 745 | /* |
| @@ -920,43 +800,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
| 920 | #endif | 800 | #endif |
| 921 | } | 801 | } |
| 922 | 802 | ||
| 923 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 924 | static int irqtime_account_hi_update(void) | ||
| 925 | { | ||
| 926 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 927 | unsigned long flags; | ||
| 928 | u64 latest_ns; | ||
| 929 | int ret = 0; | ||
| 930 | |||
| 931 | local_irq_save(flags); | ||
| 932 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
| 933 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) | ||
| 934 | ret = 1; | ||
| 935 | local_irq_restore(flags); | ||
| 936 | return ret; | ||
| 937 | } | ||
| 938 | |||
| 939 | static int irqtime_account_si_update(void) | ||
| 940 | { | ||
| 941 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 942 | unsigned long flags; | ||
| 943 | u64 latest_ns; | ||
| 944 | int ret = 0; | ||
| 945 | |||
| 946 | local_irq_save(flags); | ||
| 947 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
| 948 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) | ||
| 949 | ret = 1; | ||
| 950 | local_irq_restore(flags); | ||
| 951 | return ret; | ||
| 952 | } | ||
| 953 | |||
| 954 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 955 | |||
| 956 | #define sched_clock_irqtime (0) | ||
| 957 | |||
| 958 | #endif | ||
| 959 | |||
| 960 | void sched_set_stop_task(int cpu, struct task_struct *stop) | 803 | void sched_set_stop_task(int cpu, struct task_struct *stop) |
| 961 | { | 804 | { |
| 962 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | 805 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; |
| @@ -1518,25 +1361,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) | |||
| 1518 | smp_send_reschedule(cpu); | 1361 | smp_send_reschedule(cpu); |
| 1519 | } | 1362 | } |
| 1520 | 1363 | ||
| 1521 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1522 | static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | ||
| 1523 | { | ||
| 1524 | struct rq *rq; | ||
| 1525 | int ret = 0; | ||
| 1526 | |||
| 1527 | rq = __task_rq_lock(p); | ||
| 1528 | if (p->on_cpu) { | ||
| 1529 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | ||
| 1530 | ttwu_do_wakeup(rq, p, wake_flags); | ||
| 1531 | ret = 1; | ||
| 1532 | } | ||
| 1533 | __task_rq_unlock(rq); | ||
| 1534 | |||
| 1535 | return ret; | ||
| 1536 | |||
| 1537 | } | ||
| 1538 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
| 1539 | |||
| 1540 | bool cpus_share_cache(int this_cpu, int that_cpu) | 1364 | bool cpus_share_cache(int this_cpu, int that_cpu) |
| 1541 | { | 1365 | { |
| 1542 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); | 1366 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); |
| @@ -1597,21 +1421,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 1597 | * If the owning (remote) cpu is still in the middle of schedule() with | 1421 | * If the owning (remote) cpu is still in the middle of schedule() with |
| 1598 | * this task as prev, wait until its done referencing the task. | 1422 | * this task as prev, wait until its done referencing the task. |
| 1599 | */ | 1423 | */ |
| 1600 | while (p->on_cpu) { | 1424 | while (p->on_cpu) |
| 1601 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1602 | /* | ||
| 1603 | * In case the architecture enables interrupts in | ||
| 1604 | * context_switch(), we cannot busy wait, since that | ||
| 1605 | * would lead to deadlocks when an interrupt hits and | ||
| 1606 | * tries to wake up @prev. So bail and do a complete | ||
| 1607 | * remote wakeup. | ||
| 1608 | */ | ||
| 1609 | if (ttwu_activate_remote(p, wake_flags)) | ||
| 1610 | goto stat; | ||
| 1611 | #else | ||
| 1612 | cpu_relax(); | 1425 | cpu_relax(); |
| 1613 | #endif | ||
| 1614 | } | ||
| 1615 | /* | 1426 | /* |
| 1616 | * Pairs with the smp_wmb() in finish_lock_switch(). | 1427 | * Pairs with the smp_wmb() in finish_lock_switch(). |
| 1617 | */ | 1428 | */ |
| @@ -1953,14 +1764,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 1953 | * Manfred Spraul <manfred@colorfullife.com> | 1764 | * Manfred Spraul <manfred@colorfullife.com> |
| 1954 | */ | 1765 | */ |
| 1955 | prev_state = prev->state; | 1766 | prev_state = prev->state; |
| 1767 | vtime_task_switch(prev); | ||
| 1956 | finish_arch_switch(prev); | 1768 | finish_arch_switch(prev); |
| 1957 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1958 | local_irq_disable(); | ||
| 1959 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
| 1960 | perf_event_task_sched_in(prev, current); | 1769 | perf_event_task_sched_in(prev, current); |
| 1961 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1962 | local_irq_enable(); | ||
| 1963 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
| 1964 | finish_lock_switch(rq, prev); | 1770 | finish_lock_switch(rq, prev); |
| 1965 | finish_arch_post_lock_switch(); | 1771 | finish_arch_post_lock_switch(); |
| 1966 | 1772 | ||
| @@ -2810,404 +2616,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
| 2810 | return ns; | 2616 | return ns; |
| 2811 | } | 2617 | } |
| 2812 | 2618 | ||
| 2813 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 2814 | struct cgroup_subsys cpuacct_subsys; | ||
| 2815 | struct cpuacct root_cpuacct; | ||
| 2816 | #endif | ||
| 2817 | |||
| 2818 | static inline void task_group_account_field(struct task_struct *p, int index, | ||
| 2819 | u64 tmp) | ||
| 2820 | { | ||
| 2821 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 2822 | struct kernel_cpustat *kcpustat; | ||
| 2823 | struct cpuacct *ca; | ||
| 2824 | #endif | ||
| 2825 | /* | ||
| 2826 | * Since all updates are sure to touch the root cgroup, we | ||
| 2827 | * get ourselves ahead and touch it first. If the root cgroup | ||
| 2828 | * is the only cgroup, then nothing else should be necessary. | ||
| 2829 | * | ||
| 2830 | */ | ||
| 2831 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | ||
| 2832 | |||
| 2833 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 2834 | if (unlikely(!cpuacct_subsys.active)) | ||
| 2835 | return; | ||
| 2836 | |||
| 2837 | rcu_read_lock(); | ||
| 2838 | ca = task_ca(p); | ||
| 2839 | while (ca && (ca != &root_cpuacct)) { | ||
| 2840 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
| 2841 | kcpustat->cpustat[index] += tmp; | ||
| 2842 | ca = parent_ca(ca); | ||
| 2843 | } | ||
| 2844 | rcu_read_unlock(); | ||
| 2845 | #endif | ||
| 2846 | } | ||
| 2847 | |||
| 2848 | |||
| 2849 | /* | ||
| 2850 | * Account user cpu time to a process. | ||
| 2851 | * @p: the process that the cpu time gets accounted to | ||
| 2852 | * @cputime: the cpu time spent in user space since the last update | ||
| 2853 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 2854 | */ | ||
| 2855 | void account_user_time(struct task_struct *p, cputime_t cputime, | ||
| 2856 | cputime_t cputime_scaled) | ||
| 2857 | { | ||
| 2858 | int index; | ||
| 2859 | |||
| 2860 | /* Add user time to process. */ | ||
| 2861 | p->utime += cputime; | ||
| 2862 | p->utimescaled += cputime_scaled; | ||
| 2863 | account_group_user_time(p, cputime); | ||
| 2864 | |||
| 2865 | index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | ||
| 2866 | |||
| 2867 | /* Add user time to cpustat. */ | ||
| 2868 | task_group_account_field(p, index, (__force u64) cputime); | ||
| 2869 | |||
| 2870 | /* Account for user time used */ | ||
| 2871 | acct_update_integrals(p); | ||
| 2872 | } | ||
| 2873 | |||
| 2874 | /* | ||
| 2875 | * Account guest cpu time to a process. | ||
| 2876 | * @p: the process that the cpu time gets accounted to | ||
| 2877 | * @cputime: the cpu time spent in virtual machine since the last update | ||
| 2878 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 2879 | */ | ||
| 2880 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | ||
| 2881 | cputime_t cputime_scaled) | ||
| 2882 | { | ||
| 2883 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 2884 | |||
| 2885 | /* Add guest time to process. */ | ||
| 2886 | p->utime += cputime; | ||
| 2887 | p->utimescaled += cputime_scaled; | ||
| 2888 | account_group_user_time(p, cputime); | ||
| 2889 | p->gtime += cputime; | ||
| 2890 | |||
| 2891 | /* Add guest time to cpustat. */ | ||
| 2892 | if (TASK_NICE(p) > 0) { | ||
| 2893 | cpustat[CPUTIME_NICE] += (__force u64) cputime; | ||
| 2894 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; | ||
| 2895 | } else { | ||
| 2896 | cpustat[CPUTIME_USER] += (__force u64) cputime; | ||
| 2897 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; | ||
| 2898 | } | ||
| 2899 | } | ||
| 2900 | |||
| 2901 | /* | ||
| 2902 | * Account system cpu time to a process and desired cpustat field | ||
| 2903 | * @p: the process that the cpu time gets accounted to | ||
| 2904 | * @cputime: the cpu time spent in kernel space since the last update | ||
| 2905 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 2906 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
| 2907 | */ | ||
| 2908 | static inline | ||
| 2909 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
| 2910 | cputime_t cputime_scaled, int index) | ||
| 2911 | { | ||
| 2912 | /* Add system time to process. */ | ||
| 2913 | p->stime += cputime; | ||
| 2914 | p->stimescaled += cputime_scaled; | ||
| 2915 | account_group_system_time(p, cputime); | ||
| 2916 | |||
| 2917 | /* Add system time to cpustat. */ | ||
| 2918 | task_group_account_field(p, index, (__force u64) cputime); | ||
| 2919 | |||
| 2920 | /* Account for system time used */ | ||
| 2921 | acct_update_integrals(p); | ||
| 2922 | } | ||
| 2923 | |||
| 2924 | /* | ||
| 2925 | * Account system cpu time to a process. | ||
| 2926 | * @p: the process that the cpu time gets accounted to | ||
| 2927 | * @hardirq_offset: the offset to subtract from hardirq_count() | ||
| 2928 | * @cputime: the cpu time spent in kernel space since the last update | ||
| 2929 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 2930 | */ | ||
| 2931 | void account_system_time(struct task_struct *p, int hardirq_offset, | ||
| 2932 | cputime_t cputime, cputime_t cputime_scaled) | ||
| 2933 | { | ||
| 2934 | int index; | ||
| 2935 | |||
| 2936 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | ||
| 2937 | account_guest_time(p, cputime, cputime_scaled); | ||
| 2938 | return; | ||
| 2939 | } | ||
| 2940 | |||
| 2941 | if (hardirq_count() - hardirq_offset) | ||
| 2942 | index = CPUTIME_IRQ; | ||
| 2943 | else if (in_serving_softirq()) | ||
| 2944 | index = CPUTIME_SOFTIRQ; | ||
| 2945 | else | ||
| 2946 | index = CPUTIME_SYSTEM; | ||
| 2947 | |||
| 2948 | __account_system_time(p, cputime, cputime_scaled, index); | ||
| 2949 | } | ||
| 2950 | |||
| 2951 | /* | ||
| 2952 | * Account for involuntary wait time. | ||
| 2953 | * @cputime: the cpu time spent in involuntary wait | ||
| 2954 | */ | ||
| 2955 | void account_steal_time(cputime_t cputime) | ||
| 2956 | { | ||
| 2957 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 2958 | |||
| 2959 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; | ||
| 2960 | } | ||
| 2961 | |||
| 2962 | /* | ||
| 2963 | * Account for idle time. | ||
| 2964 | * @cputime: the cpu time spent in idle wait | ||
| 2965 | */ | ||
| 2966 | void account_idle_time(cputime_t cputime) | ||
| 2967 | { | ||
| 2968 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 2969 | struct rq *rq = this_rq(); | ||
| 2970 | |||
| 2971 | if (atomic_read(&rq->nr_iowait) > 0) | ||
| 2972 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; | ||
| 2973 | else | ||
| 2974 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | ||
| 2975 | } | ||
| 2976 | |||
| 2977 | static __always_inline bool steal_account_process_tick(void) | ||
| 2978 | { | ||
| 2979 | #ifdef CONFIG_PARAVIRT | ||
| 2980 | if (static_key_false(¶virt_steal_enabled)) { | ||
| 2981 | u64 steal, st = 0; | ||
| 2982 | |||
| 2983 | steal = paravirt_steal_clock(smp_processor_id()); | ||
| 2984 | steal -= this_rq()->prev_steal_time; | ||
| 2985 | |||
| 2986 | st = steal_ticks(steal); | ||
| 2987 | this_rq()->prev_steal_time += st * TICK_NSEC; | ||
| 2988 | |||
| 2989 | account_steal_time(st); | ||
| 2990 | return st; | ||
| 2991 | } | ||
| 2992 | #endif | ||
| 2993 | return false; | ||
| 2994 | } | ||
| 2995 | |||
| 2996 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 2997 | |||
| 2998 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 2999 | /* | ||
| 3000 | * Account a tick to a process and cpustat | ||
| 3001 | * @p: the process that the cpu time gets accounted to | ||
| 3002 | * @user_tick: is the tick from userspace | ||
| 3003 | * @rq: the pointer to rq | ||
| 3004 | * | ||
| 3005 | * Tick demultiplexing follows the order | ||
| 3006 | * - pending hardirq update | ||
| 3007 | * - pending softirq update | ||
| 3008 | * - user_time | ||
| 3009 | * - idle_time | ||
| 3010 | * - system time | ||
| 3011 | * - check for guest_time | ||
| 3012 | * - else account as system_time | ||
| 3013 | * | ||
| 3014 | * Check for hardirq is done both for system and user time as there is | ||
| 3015 | * no timer going off while we are on hardirq and hence we may never get an | ||
| 3016 | * opportunity to update it solely in system time. | ||
| 3017 | * p->stime and friends are only updated on system time and not on irq | ||
| 3018 | * softirq as those do not count in task exec_runtime any more. | ||
| 3019 | */ | ||
| 3020 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
| 3021 | struct rq *rq) | ||
| 3022 | { | ||
| 3023 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
| 3024 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 3025 | |||
| 3026 | if (steal_account_process_tick()) | ||
| 3027 | return; | ||
| 3028 | |||
| 3029 | if (irqtime_account_hi_update()) { | ||
| 3030 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; | ||
| 3031 | } else if (irqtime_account_si_update()) { | ||
| 3032 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; | ||
| 3033 | } else if (this_cpu_ksoftirqd() == p) { | ||
| 3034 | /* | ||
| 3035 | * ksoftirqd time do not get accounted in cpu_softirq_time. | ||
| 3036 | * So, we have to handle it separately here. | ||
| 3037 | * Also, p->stime needs to be updated for ksoftirqd. | ||
| 3038 | */ | ||
| 3039 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
| 3040 | CPUTIME_SOFTIRQ); | ||
| 3041 | } else if (user_tick) { | ||
| 3042 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 3043 | } else if (p == rq->idle) { | ||
| 3044 | account_idle_time(cputime_one_jiffy); | ||
| 3045 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
| 3046 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 3047 | } else { | ||
| 3048 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
| 3049 | CPUTIME_SYSTEM); | ||
| 3050 | } | ||
| 3051 | } | ||
| 3052 | |||
| 3053 | static void irqtime_account_idle_ticks(int ticks) | ||
| 3054 | { | ||
| 3055 | int i; | ||
| 3056 | struct rq *rq = this_rq(); | ||
| 3057 | |||
| 3058 | for (i = 0; i < ticks; i++) | ||
| 3059 | irqtime_account_process_tick(current, 0, rq); | ||
| 3060 | } | ||
| 3061 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 3062 | static void irqtime_account_idle_ticks(int ticks) {} | ||
| 3063 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
| 3064 | struct rq *rq) {} | ||
| 3065 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 3066 | |||
| 3067 | /* | ||
| 3068 | * Account a single tick of cpu time. | ||
| 3069 | * @p: the process that the cpu time gets accounted to | ||
| 3070 | * @user_tick: indicates if the tick is a user or a system tick | ||
| 3071 | */ | ||
| 3072 | void account_process_tick(struct task_struct *p, int user_tick) | ||
| 3073 | { | ||
| 3074 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
| 3075 | struct rq *rq = this_rq(); | ||
| 3076 | |||
| 3077 | if (sched_clock_irqtime) { | ||
| 3078 | irqtime_account_process_tick(p, user_tick, rq); | ||
| 3079 | return; | ||
| 3080 | } | ||
| 3081 | |||
| 3082 | if (steal_account_process_tick()) | ||
| 3083 | return; | ||
| 3084 | |||
| 3085 | if (user_tick) | ||
| 3086 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 3087 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | ||
| 3088 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, | ||
| 3089 | one_jiffy_scaled); | ||
| 3090 | else | ||
| 3091 | account_idle_time(cputime_one_jiffy); | ||
| 3092 | } | ||
| 3093 | |||
| 3094 | /* | ||
| 3095 | * Account multiple ticks of steal time. | ||
| 3096 | * @p: the process from which the cpu time has been stolen | ||
| 3097 | * @ticks: number of stolen ticks | ||
| 3098 | */ | ||
| 3099 | void account_steal_ticks(unsigned long ticks) | ||
| 3100 | { | ||
| 3101 | account_steal_time(jiffies_to_cputime(ticks)); | ||
| 3102 | } | ||
| 3103 | |||
| 3104 | /* | ||
| 3105 | * Account multiple ticks of idle time. | ||
| 3106 | * @ticks: number of stolen ticks | ||
| 3107 | */ | ||
| 3108 | void account_idle_ticks(unsigned long ticks) | ||
| 3109 | { | ||
| 3110 | |||
| 3111 | if (sched_clock_irqtime) { | ||
| 3112 | irqtime_account_idle_ticks(ticks); | ||
| 3113 | return; | ||
| 3114 | } | ||
| 3115 | |||
| 3116 | account_idle_time(jiffies_to_cputime(ticks)); | ||
| 3117 | } | ||
| 3118 | |||
| 3119 | #endif | ||
| 3120 | |||
| 3121 | /* | ||
| 3122 | * Use precise platform statistics if available: | ||
| 3123 | */ | ||
| 3124 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 3125 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 3126 | { | ||
| 3127 | *ut = p->utime; | ||
| 3128 | *st = p->stime; | ||
| 3129 | } | ||
| 3130 | |||
| 3131 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 3132 | { | ||
| 3133 | struct task_cputime cputime; | ||
| 3134 | |||
| 3135 | thread_group_cputime(p, &cputime); | ||
| 3136 | |||
| 3137 | *ut = cputime.utime; | ||
| 3138 | *st = cputime.stime; | ||
| 3139 | } | ||
| 3140 | #else | ||
| 3141 | |||
| 3142 | #ifndef nsecs_to_cputime | ||
| 3143 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | ||
| 3144 | #endif | ||
| 3145 | |||
| 3146 | static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) | ||
| 3147 | { | ||
| 3148 | u64 temp = (__force u64) rtime; | ||
| 3149 | |||
| 3150 | temp *= (__force u64) utime; | ||
| 3151 | |||
| 3152 | if (sizeof(cputime_t) == 4) | ||
| 3153 | temp = div_u64(temp, (__force u32) total); | ||
| 3154 | else | ||
| 3155 | temp = div64_u64(temp, (__force u64) total); | ||
| 3156 | |||
| 3157 | return (__force cputime_t) temp; | ||
| 3158 | } | ||
| 3159 | |||
| 3160 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 3161 | { | ||
| 3162 | cputime_t rtime, utime = p->utime, total = utime + p->stime; | ||
| 3163 | |||
| 3164 | /* | ||
| 3165 | * Use CFS's precise accounting: | ||
| 3166 | */ | ||
| 3167 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | ||
| 3168 | |||
| 3169 | if (total) | ||
| 3170 | utime = scale_utime(utime, rtime, total); | ||
| 3171 | else | ||
| 3172 | utime = rtime; | ||
| 3173 | |||
| 3174 | /* | ||
| 3175 | * Compare with previous values, to keep monotonicity: | ||
| 3176 | */ | ||
| 3177 | p->prev_utime = max(p->prev_utime, utime); | ||
| 3178 | p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); | ||
| 3179 | |||
| 3180 | *ut = p->prev_utime; | ||
| 3181 | *st = p->prev_stime; | ||
| 3182 | } | ||
| 3183 | |||
| 3184 | /* | ||
| 3185 | * Must be called with siglock held. | ||
| 3186 | */ | ||
| 3187 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 3188 | { | ||
| 3189 | struct signal_struct *sig = p->signal; | ||
| 3190 | struct task_cputime cputime; | ||
| 3191 | cputime_t rtime, utime, total; | ||
| 3192 | |||
| 3193 | thread_group_cputime(p, &cputime); | ||
| 3194 | |||
| 3195 | total = cputime.utime + cputime.stime; | ||
| 3196 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | ||
| 3197 | |||
| 3198 | if (total) | ||
| 3199 | utime = scale_utime(cputime.utime, rtime, total); | ||
| 3200 | else | ||
| 3201 | utime = rtime; | ||
| 3202 | |||
| 3203 | sig->prev_utime = max(sig->prev_utime, utime); | ||
| 3204 | sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); | ||
| 3205 | |||
| 3206 | *ut = sig->prev_utime; | ||
| 3207 | *st = sig->prev_stime; | ||
| 3208 | } | ||
| 3209 | #endif | ||
| 3210 | |||
| 3211 | /* | 2619 | /* |
| 3212 | * This function gets called by the timer code, with HZ frequency. | 2620 | * This function gets called by the timer code, with HZ frequency. |
| 3213 | * We call it with interrupts disabled. | 2621 | * We call it with interrupts disabled. |
| @@ -3368,6 +2776,40 @@ pick_next_task(struct rq *rq) | |||
| 3368 | 2776 | ||
| 3369 | /* | 2777 | /* |
| 3370 | * __schedule() is the main scheduler function. | 2778 | * __schedule() is the main scheduler function. |
| 2779 | * | ||
| 2780 | * The main means of driving the scheduler and thus entering this function are: | ||
| 2781 | * | ||
| 2782 | * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. | ||
| 2783 | * | ||
| 2784 | * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return | ||
| 2785 | * paths. For example, see arch/x86/entry_64.S. | ||
| 2786 | * | ||
| 2787 | * To drive preemption between tasks, the scheduler sets the flag in timer | ||
| 2788 | * interrupt handler scheduler_tick(). | ||
| 2789 | * | ||
| 2790 | * 3. Wakeups don't really cause entry into schedule(). They add a | ||
| 2791 | * task to the run-queue and that's it. | ||
| 2792 | * | ||
| 2793 | * Now, if the new task added to the run-queue preempts the current | ||
| 2794 | * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets | ||
| 2795 | * called on the nearest possible occasion: | ||
| 2796 | * | ||
| 2797 | * - If the kernel is preemptible (CONFIG_PREEMPT=y): | ||
| 2798 | * | ||
| 2799 | * - in syscall or exception context, at the next outermost | ||
| 2800 | * preempt_enable(). (this might be as soon as the wake_up()'s | ||
| 2801 | * spin_unlock()!) | ||
| 2802 | * | ||
| 2803 | * - in IRQ context, return from interrupt-handler to | ||
| 2804 | * preemptible context | ||
| 2805 | * | ||
| 2806 | * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) | ||
| 2807 | * then at the next: | ||
| 2808 | * | ||
| 2809 | * - cond_resched() call | ||
| 2810 | * - explicit schedule() call | ||
| 2811 | * - return from syscall or exception to user-space | ||
| 2812 | * - return from interrupt-handler to user-space | ||
| 3371 | */ | 2813 | */ |
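As a hedged illustration of the non-preemptible case listed above, a long-running kernel loop usually reaches one of those reschedule points by calling cond_resched(); the work-item type and the process_one() helper below are hypothetical, only the cond_resched() call is the real API.

    /* Sketch only: struct work_item and process_one() are made up. */
    static int process_many(struct work_item *items, int n)
    {
            int i;

            for (i = 0; i < n; i++) {
                    process_one(&items[i]);
                    cond_resched();         /* voluntary preemption point */
            }
            return 0;
    }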
| 3372 | static void __sched __schedule(void) | 2814 | static void __sched __schedule(void) |
| 3373 | { | 2815 | { |
| @@ -4885,13 +4327,6 @@ again: | |||
| 4885 | */ | 4327 | */ |
| 4886 | if (preempt && rq != p_rq) | 4328 | if (preempt && rq != p_rq) |
| 4887 | resched_task(p_rq->curr); | 4329 | resched_task(p_rq->curr); |
| 4888 | } else { | ||
| 4889 | /* | ||
| 4890 | * We might have set it in task_yield_fair(), but are | ||
| 4891 | * not going to schedule(), so don't want to skip | ||
| 4892 | * the next update. | ||
| 4893 | */ | ||
| 4894 | rq->skip_clock_update = 0; | ||
| 4895 | } | 4330 | } |
| 4896 | 4331 | ||
| 4897 | out: | 4332 | out: |
| @@ -5433,16 +4868,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) | |||
| 5433 | *tablep = NULL; | 4868 | *tablep = NULL; |
| 5434 | } | 4869 | } |
| 5435 | 4870 | ||
| 4871 | static int min_load_idx = 0; | ||
| 4872 | static int max_load_idx = CPU_LOAD_IDX_MAX; | ||
| 4873 | |||
| 5436 | static void | 4874 | static void |
| 5437 | set_table_entry(struct ctl_table *entry, | 4875 | set_table_entry(struct ctl_table *entry, |
| 5438 | const char *procname, void *data, int maxlen, | 4876 | const char *procname, void *data, int maxlen, |
| 5439 | umode_t mode, proc_handler *proc_handler) | 4877 | umode_t mode, proc_handler *proc_handler, |
| 4878 | bool load_idx) | ||
| 5440 | { | 4879 | { |
| 5441 | entry->procname = procname; | 4880 | entry->procname = procname; |
| 5442 | entry->data = data; | 4881 | entry->data = data; |
| 5443 | entry->maxlen = maxlen; | 4882 | entry->maxlen = maxlen; |
| 5444 | entry->mode = mode; | 4883 | entry->mode = mode; |
| 5445 | entry->proc_handler = proc_handler; | 4884 | entry->proc_handler = proc_handler; |
| 4885 | |||
| 4886 | if (load_idx) { | ||
| 4887 | entry->extra1 = &min_load_idx; | ||
| 4888 | entry->extra2 = &max_load_idx; | ||
| 4889 | } | ||
| 5446 | } | 4890 | } |
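The extra1/extra2 pointers installed above are what proc_dointvec_minmax() consults when the corresponding /proc/sys/kernel/sched_domain file is written. A small user-space model of that range check, assuming CPU_LOAD_IDX_MAX expands to 5:

    #include <stdio.h>

    static int min_load_idx = 0;
    static int max_load_idx = 5;            /* assumed value of CPU_LOAD_IDX_MAX */

    /* Model of the bounds check: out-of-range writes are refused. */
    static int write_load_idx(int *field, int val)
    {
            if (val < min_load_idx || val > max_load_idx)
                    return -1;              /* the kernel would reject the write */
            *field = val;
            return 0;
    }

    int main(void)
    {
            int busy_idx = 0;

            printf("%d\n", write_load_idx(&busy_idx, 3));    /* accepted */
            printf("%d\n", write_load_idx(&busy_idx, 42));   /* rejected */
            return 0;
    }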
| 5447 | 4891 | ||
| 5448 | static struct ctl_table * | 4892 | static struct ctl_table * |
| @@ -5454,30 +4898,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
| 5454 | return NULL; | 4898 | return NULL; |
| 5455 | 4899 | ||
| 5456 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 4900 | set_table_entry(&table[0], "min_interval", &sd->min_interval, |
| 5457 | sizeof(long), 0644, proc_doulongvec_minmax); | 4901 | sizeof(long), 0644, proc_doulongvec_minmax, false); |
| 5458 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | 4902 | set_table_entry(&table[1], "max_interval", &sd->max_interval, |
| 5459 | sizeof(long), 0644, proc_doulongvec_minmax); | 4903 | sizeof(long), 0644, proc_doulongvec_minmax, false); |
| 5460 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | 4904 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, |
| 5461 | sizeof(int), 0644, proc_dointvec_minmax); | 4905 | sizeof(int), 0644, proc_dointvec_minmax, true); |
| 5462 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | 4906 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, |
| 5463 | sizeof(int), 0644, proc_dointvec_minmax); | 4907 | sizeof(int), 0644, proc_dointvec_minmax, true); |
| 5464 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | 4908 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, |
| 5465 | sizeof(int), 0644, proc_dointvec_minmax); | 4909 | sizeof(int), 0644, proc_dointvec_minmax, true); |
| 5466 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | 4910 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, |
| 5467 | sizeof(int), 0644, proc_dointvec_minmax); | 4911 | sizeof(int), 0644, proc_dointvec_minmax, true); |
| 5468 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | 4912 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, |
| 5469 | sizeof(int), 0644, proc_dointvec_minmax); | 4913 | sizeof(int), 0644, proc_dointvec_minmax, true); |
| 5470 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | 4914 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, |
| 5471 | sizeof(int), 0644, proc_dointvec_minmax); | 4915 | sizeof(int), 0644, proc_dointvec_minmax, false); |
| 5472 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | 4916 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, |
| 5473 | sizeof(int), 0644, proc_dointvec_minmax); | 4917 | sizeof(int), 0644, proc_dointvec_minmax, false); |
| 5474 | set_table_entry(&table[9], "cache_nice_tries", | 4918 | set_table_entry(&table[9], "cache_nice_tries", |
| 5475 | &sd->cache_nice_tries, | 4919 | &sd->cache_nice_tries, |
| 5476 | sizeof(int), 0644, proc_dointvec_minmax); | 4920 | sizeof(int), 0644, proc_dointvec_minmax, false); |
| 5477 | set_table_entry(&table[10], "flags", &sd->flags, | 4921 | set_table_entry(&table[10], "flags", &sd->flags, |
| 5478 | sizeof(int), 0644, proc_dointvec_minmax); | 4922 | sizeof(int), 0644, proc_dointvec_minmax, false); |
| 5479 | set_table_entry(&table[11], "name", sd->name, | 4923 | set_table_entry(&table[11], "name", sd->name, |
| 5480 | CORENAME_MAX_SIZE, 0444, proc_dostring); | 4924 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); |
| 5481 | /* &table[12] is terminator */ | 4925 | /* &table[12] is terminator */ |
| 5482 | 4926 | ||
| 5483 | return table; | 4927 | return table; |
| @@ -6556,7 +6000,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | |||
| 6556 | | 0*SD_BALANCE_FORK | 6000 | | 0*SD_BALANCE_FORK |
| 6557 | | 0*SD_BALANCE_WAKE | 6001 | | 0*SD_BALANCE_WAKE |
| 6558 | | 0*SD_WAKE_AFFINE | 6002 | | 0*SD_WAKE_AFFINE |
| 6559 | | 0*SD_PREFER_LOCAL | ||
| 6560 | | 0*SD_SHARE_CPUPOWER | 6003 | | 0*SD_SHARE_CPUPOWER |
| 6561 | | 0*SD_SHARE_PKG_RESOURCES | 6004 | | 0*SD_SHARE_PKG_RESOURCES |
| 6562 | | 1*SD_SERIALIZE | 6005 | | 1*SD_SERIALIZE |
| @@ -8354,6 +7797,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
| 8354 | * (balbir@in.ibm.com). | 7797 | * (balbir@in.ibm.com). |
| 8355 | */ | 7798 | */ |
| 8356 | 7799 | ||
| 7800 | struct cpuacct root_cpuacct; | ||
| 7801 | |||
| 8357 | /* create a new cpu accounting group */ | 7802 | /* create a new cpu accounting group */ |
| 8358 | static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) | 7803 | static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) |
| 8359 | { | 7804 | { |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c new file mode 100644 index 000000000000..81b763ba58a6 --- /dev/null +++ b/kernel/sched/cputime.c | |||
| @@ -0,0 +1,530 @@ | |||
| 1 | #include <linux/export.h> | ||
| 2 | #include <linux/sched.h> | ||
| 3 | #include <linux/tsacct_kern.h> | ||
| 4 | #include <linux/kernel_stat.h> | ||
| 5 | #include <linux/static_key.h> | ||
| 6 | #include "sched.h" | ||
| 7 | |||
| 8 | |||
| 9 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 10 | |||
| 11 | /* | ||
| 12 | * There are no locks covering percpu hardirq/softirq time. | ||
| 13 | * They are only modified in vtime_account, on the corresponding CPU | ||
| 14 | * with interrupts disabled, so writes are safe. | ||
| 15 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
| 16 | * Another CPU may therefore read this CPU's irq time and race with | ||
| 17 | * irq/vtime_account on this CPU. It would see either the old or the new | ||
| 18 | * value, with the side effect of accounting a slice of irq time to the | ||
| 19 | * wrong task when an irq is in progress while we read rq->clock. That is | ||
| 20 | * a worthy compromise compared to taking locks on each irq in account_system_time. | ||
| 21 | */ | ||
| 22 | DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
| 23 | DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
| 24 | |||
| 25 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
| 26 | static int sched_clock_irqtime; | ||
| 27 | |||
| 28 | void enable_sched_clock_irqtime(void) | ||
| 29 | { | ||
| 30 | sched_clock_irqtime = 1; | ||
| 31 | } | ||
| 32 | |||
| 33 | void disable_sched_clock_irqtime(void) | ||
| 34 | { | ||
| 35 | sched_clock_irqtime = 0; | ||
| 36 | } | ||
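These two toggles are meant to be flipped from architecture code. A hedged sketch of the opt-in, where the init hook and the stability predicate are hypothetical names and only enable_sched_clock_irqtime() is the real call:

    /* Hypothetical arch init code: opt in to irq time accounting only once
     * the local clock is known to be suitable for per-irq deltas. */
    static int __init my_arch_irqtime_init(void)
    {
            if (my_arch_sched_clock_is_stable())    /* hypothetical predicate */
                    enable_sched_clock_irqtime();
            return 0;
    }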
| 37 | |||
| 38 | #ifndef CONFIG_64BIT | ||
| 39 | DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
| 40 | #endif /* CONFIG_64BIT */ | ||
| 41 | |||
| 42 | /* | ||
| 43 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
| 44 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
| 45 | */ | ||
| 46 | void vtime_account(struct task_struct *curr) | ||
| 47 | { | ||
| 48 | unsigned long flags; | ||
| 49 | s64 delta; | ||
| 50 | int cpu; | ||
| 51 | |||
| 52 | if (!sched_clock_irqtime) | ||
| 53 | return; | ||
| 54 | |||
| 55 | local_irq_save(flags); | ||
| 56 | |||
| 57 | cpu = smp_processor_id(); | ||
| 58 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | ||
| 59 | __this_cpu_add(irq_start_time, delta); | ||
| 60 | |||
| 61 | irq_time_write_begin(); | ||
| 62 | /* | ||
| 63 | * We do not account for softirq time from ksoftirqd here. | ||
| 64 | * We want to continue accounting softirq time to the ksoftirqd thread | ||
| 65 | * in that case, so as not to confuse the scheduler with a special task | ||
| 66 | * that does not consume any time but still wants to run. | ||
| 67 | */ | ||
| 68 | if (hardirq_count()) | ||
| 69 | __this_cpu_add(cpu_hardirq_time, delta); | ||
| 70 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | ||
| 71 | __this_cpu_add(cpu_softirq_time, delta); | ||
| 72 | |||
| 73 | irq_time_write_end(); | ||
| 74 | local_irq_restore(flags); | ||
| 75 | } | ||
| 76 | EXPORT_SYMBOL_GPL(vtime_account); | ||
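A user-space model of the bookkeeping in vtime_account() above, with the per-cpu variables collapsed into one struct; only the arithmetic is modeled, not the percpu machinery, the seqcount, or the irq masking.

    #include <stdint.h>
    #include <stdbool.h>

    struct irqtime {
            uint64_t irq_start_time;        /* last accounting snapshot */
            uint64_t hardirq_time;
            uint64_t softirq_time;
    };

    /* Charge the time since the last snapshot to the right bucket and
     * advance the snapshot, mirroring the delta handling above. */
    void account_irq_delta(struct irqtime *it, uint64_t now,
                           bool in_hardirq, bool in_softirq_not_ksoftirqd)
    {
            uint64_t delta = now - it->irq_start_time;

            it->irq_start_time += delta;
            if (in_hardirq)
                    it->hardirq_time += delta;
            else if (in_softirq_not_ksoftirqd)
                    it->softirq_time += delta;
    }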
| 77 | |||
| 78 | static int irqtime_account_hi_update(void) | ||
| 79 | { | ||
| 80 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 81 | unsigned long flags; | ||
| 82 | u64 latest_ns; | ||
| 83 | int ret = 0; | ||
| 84 | |||
| 85 | local_irq_save(flags); | ||
| 86 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
| 87 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) | ||
| 88 | ret = 1; | ||
| 89 | local_irq_restore(flags); | ||
| 90 | return ret; | ||
| 91 | } | ||
| 92 | |||
| 93 | static int irqtime_account_si_update(void) | ||
| 94 | { | ||
| 95 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 96 | unsigned long flags; | ||
| 97 | u64 latest_ns; | ||
| 98 | int ret = 0; | ||
| 99 | |||
| 100 | local_irq_save(flags); | ||
| 101 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
| 102 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) | ||
| 103 | ret = 1; | ||
| 104 | local_irq_restore(flags); | ||
| 105 | return ret; | ||
| 106 | } | ||
| 107 | |||
| 108 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 109 | |||
| 110 | #define sched_clock_irqtime (0) | ||
| 111 | |||
| 112 | #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 113 | |||
| 114 | static inline void task_group_account_field(struct task_struct *p, int index, | ||
| 115 | u64 tmp) | ||
| 116 | { | ||
| 117 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 118 | struct kernel_cpustat *kcpustat; | ||
| 119 | struct cpuacct *ca; | ||
| 120 | #endif | ||
| 121 | /* | ||
| 122 | * Since all updates are sure to touch the root cgroup, we | ||
| 123 | * get ourselves ahead and touch it first. If the root cgroup | ||
| 124 | * is the only cgroup, then nothing else should be necessary. | ||
| 125 | * | ||
| 126 | */ | ||
| 127 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | ||
| 128 | |||
| 129 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 130 | if (unlikely(!cpuacct_subsys.active)) | ||
| 131 | return; | ||
| 132 | |||
| 133 | rcu_read_lock(); | ||
| 134 | ca = task_ca(p); | ||
| 135 | while (ca && (ca != &root_cpuacct)) { | ||
| 136 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
| 137 | kcpustat->cpustat[index] += tmp; | ||
| 138 | ca = parent_ca(ca); | ||
| 139 | } | ||
| 140 | rcu_read_unlock(); | ||
| 141 | #endif | ||
| 142 | } | ||
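A compact user-space model of the walk above, assuming a plain parent pointer stands in for the cpuacct hierarchy: the root statistics are always charged, then every non-root ancestor of the task receives the same increment.

    struct acct_group {
            struct acct_group *parent;              /* NULL above the root */
            unsigned long long cpustat[8];
    };

    void charge_field(struct acct_group *root, struct acct_group *ca,
                      int index, unsigned long long tmp)
    {
            root->cpustat[index] += tmp;            /* root is always touched */
            while (ca && ca != root) {
                    ca->cpustat[index] += tmp;      /* each ancestor below root */
                    ca = ca->parent;
            }
    }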
| 143 | |||
| 144 | /* | ||
| 145 | * Account user cpu time to a process. | ||
| 146 | * @p: the process that the cpu time gets accounted to | ||
| 147 | * @cputime: the cpu time spent in user space since the last update | ||
| 148 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 149 | */ | ||
| 150 | void account_user_time(struct task_struct *p, cputime_t cputime, | ||
| 151 | cputime_t cputime_scaled) | ||
| 152 | { | ||
| 153 | int index; | ||
| 154 | |||
| 155 | /* Add user time to process. */ | ||
| 156 | p->utime += cputime; | ||
| 157 | p->utimescaled += cputime_scaled; | ||
| 158 | account_group_user_time(p, cputime); | ||
| 159 | |||
| 160 | index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | ||
| 161 | |||
| 162 | /* Add user time to cpustat. */ | ||
| 163 | task_group_account_field(p, index, (__force u64) cputime); | ||
| 164 | |||
| 165 | /* Account for user time used */ | ||
| 166 | acct_update_integrals(p); | ||
| 167 | } | ||
| 168 | |||
| 169 | /* | ||
| 170 | * Account guest cpu time to a process. | ||
| 171 | * @p: the process that the cpu time gets accounted to | ||
| 172 | * @cputime: the cpu time spent in virtual machine since the last update | ||
| 173 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 174 | */ | ||
| 175 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | ||
| 176 | cputime_t cputime_scaled) | ||
| 177 | { | ||
| 178 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 179 | |||
| 180 | /* Add guest time to process. */ | ||
| 181 | p->utime += cputime; | ||
| 182 | p->utimescaled += cputime_scaled; | ||
| 183 | account_group_user_time(p, cputime); | ||
| 184 | p->gtime += cputime; | ||
| 185 | |||
| 186 | /* Add guest time to cpustat. */ | ||
| 187 | if (TASK_NICE(p) > 0) { | ||
| 188 | cpustat[CPUTIME_NICE] += (__force u64) cputime; | ||
| 189 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; | ||
| 190 | } else { | ||
| 191 | cpustat[CPUTIME_USER] += (__force u64) cputime; | ||
| 192 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; | ||
| 193 | } | ||
| 194 | } | ||
| 195 | |||
| 196 | /* | ||
| 197 | * Account system cpu time to a process and desired cpustat field | ||
| 198 | * @p: the process that the cpu time gets accounted to | ||
| 199 | * @cputime: the cpu time spent in kernel space since the last update | ||
| 200 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 201 | * @index: index of the cpustat field that has to be updated | ||
| 202 | */ | ||
| 203 | static inline | ||
| 204 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
| 205 | cputime_t cputime_scaled, int index) | ||
| 206 | { | ||
| 207 | /* Add system time to process. */ | ||
| 208 | p->stime += cputime; | ||
| 209 | p->stimescaled += cputime_scaled; | ||
| 210 | account_group_system_time(p, cputime); | ||
| 211 | |||
| 212 | /* Add system time to cpustat. */ | ||
| 213 | task_group_account_field(p, index, (__force u64) cputime); | ||
| 214 | |||
| 215 | /* Account for system time used */ | ||
| 216 | acct_update_integrals(p); | ||
| 217 | } | ||
| 218 | |||
| 219 | /* | ||
| 220 | * Account system cpu time to a process. | ||
| 221 | * @p: the process that the cpu time gets accounted to | ||
| 222 | * @hardirq_offset: the offset to subtract from hardirq_count() | ||
| 223 | * @cputime: the cpu time spent in kernel space since the last update | ||
| 224 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 225 | */ | ||
| 226 | void account_system_time(struct task_struct *p, int hardirq_offset, | ||
| 227 | cputime_t cputime, cputime_t cputime_scaled) | ||
| 228 | { | ||
| 229 | int index; | ||
| 230 | |||
| 231 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | ||
| 232 | account_guest_time(p, cputime, cputime_scaled); | ||
| 233 | return; | ||
| 234 | } | ||
| 235 | |||
| 236 | if (hardirq_count() - hardirq_offset) | ||
| 237 | index = CPUTIME_IRQ; | ||
| 238 | else if (in_serving_softirq()) | ||
| 239 | index = CPUTIME_SOFTIRQ; | ||
| 240 | else | ||
| 241 | index = CPUTIME_SYSTEM; | ||
| 242 | |||
| 243 | __account_system_time(p, cputime, cputime_scaled, index); | ||
| 244 | } | ||
| 245 | |||
| 246 | /* | ||
| 247 | * Account for involuntary wait time. | ||
| 248 | * @cputime: the cpu time spent in involuntary wait | ||
| 249 | */ | ||
| 250 | void account_steal_time(cputime_t cputime) | ||
| 251 | { | ||
| 252 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 253 | |||
| 254 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; | ||
| 255 | } | ||
| 256 | |||
| 257 | /* | ||
| 258 | * Account for idle time. | ||
| 259 | * @cputime: the cpu time spent in idle wait | ||
| 260 | */ | ||
| 261 | void account_idle_time(cputime_t cputime) | ||
| 262 | { | ||
| 263 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 264 | struct rq *rq = this_rq(); | ||
| 265 | |||
| 266 | if (atomic_read(&rq->nr_iowait) > 0) | ||
| 267 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; | ||
| 268 | else | ||
| 269 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | ||
| 270 | } | ||
| 271 | |||
| 272 | static __always_inline bool steal_account_process_tick(void) | ||
| 273 | { | ||
| 274 | #ifdef CONFIG_PARAVIRT | ||
| 275 | if (static_key_false(¶virt_steal_enabled)) { | ||
| 276 | u64 steal, st = 0; | ||
| 277 | |||
| 278 | steal = paravirt_steal_clock(smp_processor_id()); | ||
| 279 | steal -= this_rq()->prev_steal_time; | ||
| 280 | |||
| 281 | st = steal_ticks(steal); | ||
| 282 | this_rq()->prev_steal_time += st * TICK_NSEC; | ||
| 283 | |||
| 284 | account_steal_time(st); | ||
| 285 | return st; | ||
| 286 | } | ||
| 287 | #endif | ||
| 288 | return false; | ||
| 289 | } | ||
| 290 | |||
| 291 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 292 | |||
| 293 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 294 | /* | ||
| 295 | * Account a tick to a process and cpustat | ||
| 296 | * @p: the process that the cpu time gets accounted to | ||
| 297 | * @user_tick: is the tick from userspace | ||
| 298 | * @rq: the pointer to rq | ||
| 299 | * | ||
| 300 | * Tick demultiplexing follows the order | ||
| 301 | * - pending hardirq update | ||
| 302 | * - pending softirq update | ||
| 303 | * - user_time | ||
| 304 | * - idle_time | ||
| 305 | * - system time | ||
| 306 | * - check for guest_time | ||
| 307 | * - else account as system_time | ||
| 308 | * | ||
| 309 | * The check for hardirq is done for both system and user time, as there is | ||
| 310 | * no timer going off while we are in a hardirq and hence we may never get an | ||
| 311 | * opportunity to update it solely on a system tick. | ||
| 312 | * p->stime and friends are only updated on system ticks and not on irq or | ||
| 313 | * softirq ticks, as those no longer count toward task exec_runtime. | ||
| 314 | */ | ||
| 315 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
| 316 | struct rq *rq) | ||
| 317 | { | ||
| 318 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
| 319 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 320 | |||
| 321 | if (steal_account_process_tick()) | ||
| 322 | return; | ||
| 323 | |||
| 324 | if (irqtime_account_hi_update()) { | ||
| 325 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; | ||
| 326 | } else if (irqtime_account_si_update()) { | ||
| 327 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; | ||
| 328 | } else if (this_cpu_ksoftirqd() == p) { | ||
| 329 | /* | ||
| 330 | * ksoftirqd time does not get accounted in cpu_softirq_time. | ||
| 331 | * So, we have to handle it separately here. | ||
| 332 | * Also, p->stime needs to be updated for ksoftirqd. | ||
| 333 | */ | ||
| 334 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
| 335 | CPUTIME_SOFTIRQ); | ||
| 336 | } else if (user_tick) { | ||
| 337 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 338 | } else if (p == rq->idle) { | ||
| 339 | account_idle_time(cputime_one_jiffy); | ||
| 340 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
| 341 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 342 | } else { | ||
| 343 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
| 344 | CPUTIME_SYSTEM); | ||
| 345 | } | ||
| 346 | } | ||
| 347 | |||
| 348 | static void irqtime_account_idle_ticks(int ticks) | ||
| 349 | { | ||
| 350 | int i; | ||
| 351 | struct rq *rq = this_rq(); | ||
| 352 | |||
| 353 | for (i = 0; i < ticks; i++) | ||
| 354 | irqtime_account_process_tick(current, 0, rq); | ||
| 355 | } | ||
| 356 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 357 | static void irqtime_account_idle_ticks(int ticks) {} | ||
| 358 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
| 359 | struct rq *rq) {} | ||
| 360 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 361 | |||
| 362 | /* | ||
| 363 | * Account a single tick of cpu time. | ||
| 364 | * @p: the process that the cpu time gets accounted to | ||
| 365 | * @user_tick: indicates if the tick is a user or a system tick | ||
| 366 | */ | ||
| 367 | void account_process_tick(struct task_struct *p, int user_tick) | ||
| 368 | { | ||
| 369 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
| 370 | struct rq *rq = this_rq(); | ||
| 371 | |||
| 372 | if (sched_clock_irqtime) { | ||
| 373 | irqtime_account_process_tick(p, user_tick, rq); | ||
| 374 | return; | ||
| 375 | } | ||
| 376 | |||
| 377 | if (steal_account_process_tick()) | ||
| 378 | return; | ||
| 379 | |||
| 380 | if (user_tick) | ||
| 381 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 382 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | ||
| 383 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, | ||
| 384 | one_jiffy_scaled); | ||
| 385 | else | ||
| 386 | account_idle_time(cputime_one_jiffy); | ||
| 387 | } | ||
| 388 | |||
| 389 | /* | ||
| 390 | * Account multiple ticks of steal time. | ||
| 391 | * @ticks: number of stolen ticks | ||
| 392 | * (steal time is accounted per-cpu; there is no task argument) | ||
| 393 | */ | ||
| 394 | void account_steal_ticks(unsigned long ticks) | ||
| 395 | { | ||
| 396 | account_steal_time(jiffies_to_cputime(ticks)); | ||
| 397 | } | ||
| 398 | |||
| 399 | /* | ||
| 400 | * Account multiple ticks of idle time. | ||
| 401 | * @ticks: number of idle ticks | ||
| 402 | */ | ||
| 403 | void account_idle_ticks(unsigned long ticks) | ||
| 404 | { | ||
| 405 | |||
| 406 | if (sched_clock_irqtime) { | ||
| 407 | irqtime_account_idle_ticks(ticks); | ||
| 408 | return; | ||
| 409 | } | ||
| 410 | |||
| 411 | account_idle_time(jiffies_to_cputime(ticks)); | ||
| 412 | } | ||
| 413 | |||
| 414 | #endif | ||
| 415 | |||
| 416 | /* | ||
| 417 | * Use precise platform statistics if available: | ||
| 418 | */ | ||
| 419 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 420 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 421 | { | ||
| 422 | *ut = p->utime; | ||
| 423 | *st = p->stime; | ||
| 424 | } | ||
| 425 | |||
| 426 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 427 | { | ||
| 428 | struct task_cputime cputime; | ||
| 429 | |||
| 430 | thread_group_cputime(p, &cputime); | ||
| 431 | |||
| 432 | *ut = cputime.utime; | ||
| 433 | *st = cputime.stime; | ||
| 434 | } | ||
| 435 | |||
| 436 | /* | ||
| 437 | * Archs that account the whole time spent in the idle task | ||
| 438 | * (outside irq) as idle time can rely on this and just implement | ||
| 439 | * vtime_account_system() and vtime_account_idle(). Archs that | ||
| 440 | * attach a different meaning to idle time (s390 only includes the | ||
| 441 | * time the CPU spends in low power mode) must override | ||
| 442 | * vtime_account(). | ||
| 443 | */ | ||
| 444 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | ||
| 445 | void vtime_account(struct task_struct *tsk) | ||
| 446 | { | ||
| 447 | unsigned long flags; | ||
| 448 | |||
| 449 | local_irq_save(flags); | ||
| 450 | |||
| 451 | if (in_interrupt() || !is_idle_task(tsk)) | ||
| 452 | vtime_account_system(tsk); | ||
| 453 | else | ||
| 454 | vtime_account_idle(tsk); | ||
| 455 | |||
| 456 | local_irq_restore(flags); | ||
| 457 | } | ||
| 458 | EXPORT_SYMBOL_GPL(vtime_account); | ||
| 459 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | ||
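For the CONFIG_VIRT_CPU_ACCOUNTING side, here is a hedged sketch of the two hooks an architecture might provide when it relies on the generic vtime_account() above; the timer-reading helper is hypothetical, and real arch code would pick the hardirq offset and scaling to match its own entry paths.

    /* Hypothetical arch code: flush the time measured by the arch CPU timer
     * into the system or idle buckets via the accounting API above. */
    void vtime_account_system(struct task_struct *tsk)
    {
            cputime_t delta = my_arch_cpu_timer_delta();    /* hypothetical */

            account_system_time(tsk, 0, delta, cputime_to_scaled(delta));
    }

    void vtime_account_idle(struct task_struct *tsk)
    {
            account_idle_time(my_arch_cpu_timer_delta());   /* hypothetical */
    }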
| 460 | |||
| 461 | #else | ||
| 462 | |||
| 463 | #ifndef nsecs_to_cputime | ||
| 464 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | ||
| 465 | #endif | ||
| 466 | |||
| 467 | static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) | ||
| 468 | { | ||
| 469 | u64 temp = (__force u64) rtime; | ||
| 470 | |||
| 471 | temp *= (__force u64) utime; | ||
| 472 | |||
| 473 | if (sizeof(cputime_t) == 4) | ||
| 474 | temp = div_u64(temp, (__force u32) total); | ||
| 475 | else | ||
| 476 | temp = div64_u64(temp, (__force u64) total); | ||
| 477 | |||
| 478 | return (__force cputime_t) temp; | ||
| 479 | } | ||
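A worked user-space example of the scaling above, assuming cputime_t is modeled as 64-bit jiffies: if sampling credited 30 user ticks and 10 system ticks but CFS measured 60 ticks of runtime, the reported user share becomes 60 * 30 / 40 = 45.

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t scale_utime(uint64_t utime, uint64_t rtime, uint64_t total)
    {
            return (rtime * utime) / total; /* caller guarantees total != 0 */
    }

    int main(void)
    {
            uint64_t utime = 30, stime = 10, rtime = 60;

            printf("scaled utime = %llu\n",
                   (unsigned long long)scale_utime(utime, rtime, utime + stime));
            return 0;
    }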
| 480 | |||
| 481 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 482 | { | ||
| 483 | cputime_t rtime, utime = p->utime, total = utime + p->stime; | ||
| 484 | |||
| 485 | /* | ||
| 486 | * Use CFS's precise accounting: | ||
| 487 | */ | ||
| 488 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | ||
| 489 | |||
| 490 | if (total) | ||
| 491 | utime = scale_utime(utime, rtime, total); | ||
| 492 | else | ||
| 493 | utime = rtime; | ||
| 494 | |||
| 495 | /* | ||
| 496 | * Compare with previous values, to keep monotonicity: | ||
| 497 | */ | ||
| 498 | p->prev_utime = max(p->prev_utime, utime); | ||
| 499 | p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); | ||
| 500 | |||
| 501 | *ut = p->prev_utime; | ||
| 502 | *st = p->prev_stime; | ||
| 503 | } | ||
| 504 | |||
| 505 | /* | ||
| 506 | * Must be called with siglock held. | ||
| 507 | */ | ||
| 508 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 509 | { | ||
| 510 | struct signal_struct *sig = p->signal; | ||
| 511 | struct task_cputime cputime; | ||
| 512 | cputime_t rtime, utime, total; | ||
| 513 | |||
| 514 | thread_group_cputime(p, &cputime); | ||
| 515 | |||
| 516 | total = cputime.utime + cputime.stime; | ||
| 517 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | ||
| 518 | |||
| 519 | if (total) | ||
| 520 | utime = scale_utime(cputime.utime, rtime, total); | ||
| 521 | else | ||
| 522 | utime = rtime; | ||
| 523 | |||
| 524 | sig->prev_utime = max(sig->prev_utime, utime); | ||
| 525 | sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); | ||
| 526 | |||
| 527 | *ut = sig->prev_utime; | ||
| 528 | *st = sig->prev_stime; | ||
| 529 | } | ||
| 530 | #endif | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96e2b18b6283..6b800a14b990 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -597,7 +597,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se) | |||
| 597 | /* | 597 | /* |
| 598 | * The idea is to set a period in which each task runs once. | 598 | * The idea is to set a period in which each task runs once. |
| 599 | * | 599 | * |
| 600 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch | 600 | * When there are too many tasks (sched_nr_latency) we have to stretch |
| 601 | * this period because otherwise the slices get too small. | 601 | * this period because otherwise the slices get too small. |
| 602 | * | 602 | * |
| 603 | * p = (nr <= nl) ? l : l*nr/nl | 603 | * p = (nr <= nl) ? l : l*nr/nl |
| @@ -2700,7 +2700,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
| 2700 | int prev_cpu = task_cpu(p); | 2700 | int prev_cpu = task_cpu(p); |
| 2701 | int new_cpu = cpu; | 2701 | int new_cpu = cpu; |
| 2702 | int want_affine = 0; | 2702 | int want_affine = 0; |
| 2703 | int want_sd = 1; | ||
| 2704 | int sync = wake_flags & WF_SYNC; | 2703 | int sync = wake_flags & WF_SYNC; |
| 2705 | 2704 | ||
| 2706 | if (p->nr_cpus_allowed == 1) | 2705 | if (p->nr_cpus_allowed == 1) |
| @@ -2718,48 +2717,21 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
| 2718 | continue; | 2717 | continue; |
| 2719 | 2718 | ||
| 2720 | /* | 2719 | /* |
| 2721 | * If power savings logic is enabled for a domain, see if we | ||
| 2722 | * are not overloaded, if so, don't balance wider. | ||
| 2723 | */ | ||
| 2724 | if (tmp->flags & (SD_PREFER_LOCAL)) { | ||
| 2725 | unsigned long power = 0; | ||
| 2726 | unsigned long nr_running = 0; | ||
| 2727 | unsigned long capacity; | ||
| 2728 | int i; | ||
| 2729 | |||
| 2730 | for_each_cpu(i, sched_domain_span(tmp)) { | ||
| 2731 | power += power_of(i); | ||
| 2732 | nr_running += cpu_rq(i)->cfs.nr_running; | ||
| 2733 | } | ||
| 2734 | |||
| 2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | ||
| 2736 | |||
| 2737 | if (nr_running < capacity) | ||
| 2738 | want_sd = 0; | ||
| 2739 | } | ||
| 2740 | |||
| 2741 | /* | ||
| 2742 | * If both cpu and prev_cpu are part of this domain, | 2720 | * If both cpu and prev_cpu are part of this domain, |
| 2743 | * cpu is a valid SD_WAKE_AFFINE target. | 2721 | * cpu is a valid SD_WAKE_AFFINE target. |
| 2744 | */ | 2722 | */ |
| 2745 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && | 2723 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && |
| 2746 | cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { | 2724 | cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { |
| 2747 | affine_sd = tmp; | 2725 | affine_sd = tmp; |
| 2748 | want_affine = 0; | ||
| 2749 | } | ||
| 2750 | |||
| 2751 | if (!want_sd && !want_affine) | ||
| 2752 | break; | 2726 | break; |
| 2727 | } | ||
| 2753 | 2728 | ||
| 2754 | if (!(tmp->flags & sd_flag)) | 2729 | if (tmp->flags & sd_flag) |
| 2755 | continue; | ||
| 2756 | |||
| 2757 | if (want_sd) | ||
| 2758 | sd = tmp; | 2730 | sd = tmp; |
| 2759 | } | 2731 | } |
| 2760 | 2732 | ||
| 2761 | if (affine_sd) { | 2733 | if (affine_sd) { |
| 2762 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 2734 | if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) |
| 2763 | prev_cpu = cpu; | 2735 | prev_cpu = cpu; |
| 2764 | 2736 | ||
| 2765 | new_cpu = select_idle_sibling(p, prev_cpu); | 2737 | new_cpu = select_idle_sibling(p, prev_cpu); |
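The rewritten condition above appears behaviour-preserving for the chosen prev_cpu: when cpu == prev_cpu the old assignment was a no-op anyway. A small user-space check of that equivalence, with wake_affine() reduced to an arbitrary flag:

    #include <stdio.h>

    static int old_pick(int cpu, int prev_cpu, int affine)
    {
            if (cpu == prev_cpu || affine)
                    prev_cpu = cpu;
            return prev_cpu;
    }

    static int new_pick(int cpu, int prev_cpu, int affine)
    {
            if (cpu != prev_cpu && affine)
                    prev_cpu = cpu;
            return prev_cpu;
    }

    int main(void)
    {
            int cpu, prev, affine, ok = 1;

            for (cpu = 0; cpu < 2; cpu++)
                    for (prev = 0; prev < 2; prev++)
                            for (affine = 0; affine < 2; affine++)
                                    if (old_pick(cpu, prev, affine) !=
                                        new_pick(cpu, prev, affine))
                                            ok = 0;
            printf("%s\n", ok ? "equivalent" : "different");
            return 0;
    }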
| @@ -4295,7 +4267,7 @@ redo: | |||
| 4295 | goto out_balanced; | 4267 | goto out_balanced; |
| 4296 | } | 4268 | } |
| 4297 | 4269 | ||
| 4298 | BUG_ON(busiest == this_rq); | 4270 | BUG_ON(busiest == env.dst_rq); |
| 4299 | 4271 | ||
| 4300 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 4272 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
| 4301 | 4273 | ||
| @@ -4316,7 +4288,7 @@ redo: | |||
| 4316 | update_h_load(env.src_cpu); | 4288 | update_h_load(env.src_cpu); |
| 4317 | more_balance: | 4289 | more_balance: |
| 4318 | local_irq_save(flags); | 4290 | local_irq_save(flags); |
| 4319 | double_rq_lock(this_rq, busiest); | 4291 | double_rq_lock(env.dst_rq, busiest); |
| 4320 | 4292 | ||
| 4321 | /* | 4293 | /* |
| 4322 | * cur_ld_moved - load moved in current iteration | 4294 | * cur_ld_moved - load moved in current iteration |
| @@ -4324,7 +4296,7 @@ more_balance: | |||
| 4324 | */ | 4296 | */ |
| 4325 | cur_ld_moved = move_tasks(&env); | 4297 | cur_ld_moved = move_tasks(&env); |
| 4326 | ld_moved += cur_ld_moved; | 4298 | ld_moved += cur_ld_moved; |
| 4327 | double_rq_unlock(this_rq, busiest); | 4299 | double_rq_unlock(env.dst_rq, busiest); |
| 4328 | local_irq_restore(flags); | 4300 | local_irq_restore(flags); |
| 4329 | 4301 | ||
| 4330 | if (env.flags & LBF_NEED_BREAK) { | 4302 | if (env.flags & LBF_NEED_BREAK) { |
| @@ -4360,8 +4332,7 @@ more_balance: | |||
| 4360 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && | 4332 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && |
| 4361 | lb_iterations++ < max_lb_iterations) { | 4333 | lb_iterations++ < max_lb_iterations) { |
| 4362 | 4334 | ||
| 4363 | this_rq = cpu_rq(env.new_dst_cpu); | 4335 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
| 4364 | env.dst_rq = this_rq; | ||
| 4365 | env.dst_cpu = env.new_dst_cpu; | 4336 | env.dst_cpu = env.new_dst_cpu; |
| 4366 | env.flags &= ~LBF_SOME_PINNED; | 4337 | env.flags &= ~LBF_SOME_PINNED; |
| 4367 | env.loop = 0; | 4338 | env.loop = 0; |
| @@ -4646,7 +4617,7 @@ static void nohz_balancer_kick(int cpu) | |||
| 4646 | return; | 4617 | return; |
| 4647 | } | 4618 | } |
| 4648 | 4619 | ||
| 4649 | static inline void clear_nohz_tick_stopped(int cpu) | 4620 | static inline void nohz_balance_exit_idle(int cpu) |
| 4650 | { | 4621 | { |
| 4651 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { | 4622 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { |
| 4652 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | 4623 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); |
| @@ -4686,28 +4657,23 @@ void set_cpu_sd_state_idle(void) | |||
| 4686 | } | 4657 | } |
| 4687 | 4658 | ||
| 4688 | /* | 4659 | /* |
| 4689 | * This routine will record that this cpu is going idle with tick stopped. | 4660 | * This routine will record that the cpu is going idle with tick stopped. |
| 4690 | * This info will be used in performing idle load balancing in the future. | 4661 | * This info will be used in performing idle load balancing in the future. |
| 4691 | */ | 4662 | */ |
| 4692 | void select_nohz_load_balancer(int stop_tick) | 4663 | void nohz_balance_enter_idle(int cpu) |
| 4693 | { | 4664 | { |
| 4694 | int cpu = smp_processor_id(); | ||
| 4695 | |||
| 4696 | /* | 4665 | /* |
| 4697 | * If this cpu is going down, then nothing needs to be done. | 4666 | * If this cpu is going down, then nothing needs to be done. |
| 4698 | */ | 4667 | */ |
| 4699 | if (!cpu_active(cpu)) | 4668 | if (!cpu_active(cpu)) |
| 4700 | return; | 4669 | return; |
| 4701 | 4670 | ||
| 4702 | if (stop_tick) { | 4671 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) |
| 4703 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) | 4672 | return; |
| 4704 | return; | ||
| 4705 | 4673 | ||
| 4706 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | 4674 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
| 4707 | atomic_inc(&nohz.nr_cpus); | 4675 | atomic_inc(&nohz.nr_cpus); |
| 4708 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | 4676 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); |
| 4709 | } | ||
| 4710 | return; | ||
| 4711 | } | 4677 | } |
| 4712 | 4678 | ||
| 4713 | static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | 4679 | static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, |
| @@ -4715,7 +4681,7 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | |||
| 4715 | { | 4681 | { |
| 4716 | switch (action & ~CPU_TASKS_FROZEN) { | 4682 | switch (action & ~CPU_TASKS_FROZEN) { |
| 4717 | case CPU_DYING: | 4683 | case CPU_DYING: |
| 4718 | clear_nohz_tick_stopped(smp_processor_id()); | 4684 | nohz_balance_exit_idle(smp_processor_id()); |
| 4719 | return NOTIFY_OK; | 4685 | return NOTIFY_OK; |
| 4720 | default: | 4686 | default: |
| 4721 | return NOTIFY_DONE; | 4687 | return NOTIFY_DONE; |
| @@ -4837,14 +4803,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
| 4837 | if (need_resched()) | 4803 | if (need_resched()) |
| 4838 | break; | 4804 | break; |
| 4839 | 4805 | ||
| 4840 | raw_spin_lock_irq(&this_rq->lock); | 4806 | rq = cpu_rq(balance_cpu); |
| 4841 | update_rq_clock(this_rq); | 4807 | |
| 4842 | update_idle_cpu_load(this_rq); | 4808 | raw_spin_lock_irq(&rq->lock); |
| 4843 | raw_spin_unlock_irq(&this_rq->lock); | 4809 | update_rq_clock(rq); |
| 4810 | update_idle_cpu_load(rq); | ||
| 4811 | raw_spin_unlock_irq(&rq->lock); | ||
| 4844 | 4812 | ||
| 4845 | rebalance_domains(balance_cpu, CPU_IDLE); | 4813 | rebalance_domains(balance_cpu, CPU_IDLE); |
| 4846 | 4814 | ||
| 4847 | rq = cpu_rq(balance_cpu); | ||
| 4848 | if (time_after(this_rq->next_balance, rq->next_balance)) | 4815 | if (time_after(this_rq->next_balance, rq->next_balance)) |
| 4849 | this_rq->next_balance = rq->next_balance; | 4816 | this_rq->next_balance = rq->next_balance; |
| 4850 | } | 4817 | } |
| @@ -4875,7 +4842,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
| 4875 | * busy tick after returning from idle, we will update the busy stats. | 4842 | * busy tick after returning from idle, we will update the busy stats. |
| 4876 | */ | 4843 | */ |
| 4877 | set_cpu_sd_state_busy(); | 4844 | set_cpu_sd_state_busy(); |
| 4878 | clear_nohz_tick_stopped(cpu); | 4845 | nohz_balance_exit_idle(cpu); |
| 4879 | 4846 | ||
| 4880 | /* | 4847 | /* |
| 4881 | * None are in tickless mode and hence no need for NOHZ idle load | 4848 | * None are in tickless mode and hence no need for NOHZ idle load |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index de00a486c5c6..eebefcad7027 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
| @@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) | |||
| 12 | SCHED_FEAT(START_DEBIT, true) | 12 | SCHED_FEAT(START_DEBIT, true) |
| 13 | 13 | ||
| 14 | /* | 14 | /* |
| 15 | * Based on load and program behaviour, see if it makes sense to place | ||
| 16 | * a newly woken task on the same cpu as the task that woke it -- | ||
| 17 | * improve cache locality. Typically used with SYNC wakeups as | ||
| 18 | * generated by pipes and the like, see also SYNC_WAKEUPS. | ||
| 19 | */ | ||
| 20 | SCHED_FEAT(AFFINE_WAKEUPS, true) | ||
| 21 | |||
| 22 | /* | ||
| 23 | * Prefer to schedule the task we woke last (assuming it failed | 15 | * Prefer to schedule the task we woke last (assuming it failed |
| 24 | * wakeup-preemption), since its likely going to consume data we | 16 | * wakeup-preemption), since its likely going to consume data we |
| 25 | * touched, increases cache locality. | 17 | * touched, increases cache locality. |
| @@ -42,7 +34,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) | |||
| 42 | /* | 34 | /* |
| 43 | * Use arch dependent cpu power functions | 35 | * Use arch dependent cpu power functions |
| 44 | */ | 36 | */ |
| 45 | SCHED_FEAT(ARCH_POWER, false) | 37 | SCHED_FEAT(ARCH_POWER, true) |
| 46 | 38 | ||
| 47 | SCHED_FEAT(HRTICK, false) | 39 | SCHED_FEAT(HRTICK, false) |
| 48 | SCHED_FEAT(DOUBLE_TICK, false) | 40 | SCHED_FEAT(DOUBLE_TICK, false) |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e0b7ba9c040f..418feb01344e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -1632,11 +1632,6 @@ static int push_rt_task(struct rq *rq) | |||
| 1632 | if (!next_task) | 1632 | if (!next_task) |
| 1633 | return 0; | 1633 | return 0; |
| 1634 | 1634 | ||
| 1635 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1636 | if (unlikely(task_running(rq, next_task))) | ||
| 1637 | return 0; | ||
| 1638 | #endif | ||
| 1639 | |||
| 1640 | retry: | 1635 | retry: |
| 1641 | if (unlikely(next_task == rq->curr)) { | 1636 | if (unlikely(next_task == rq->curr)) { |
| 1642 | WARN_ON(1); | 1637 | WARN_ON(1); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0848fa36c383..7a7db09cfabc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -737,11 +737,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | |||
| 737 | */ | 737 | */ |
| 738 | next->on_cpu = 1; | 738 | next->on_cpu = 1; |
| 739 | #endif | 739 | #endif |
| 740 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 741 | raw_spin_unlock_irq(&rq->lock); | ||
| 742 | #else | ||
| 743 | raw_spin_unlock(&rq->lock); | 740 | raw_spin_unlock(&rq->lock); |
| 744 | #endif | ||
| 745 | } | 741 | } |
| 746 | 742 | ||
| 747 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 743 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
| @@ -755,9 +751,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
| 755 | smp_wmb(); | 751 | smp_wmb(); |
| 756 | prev->on_cpu = 0; | 752 | prev->on_cpu = 0; |
| 757 | #endif | 753 | #endif |
| 758 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 759 | local_irq_enable(); | 754 | local_irq_enable(); |
| 760 | #endif | ||
| 761 | } | 755 | } |
| 762 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 756 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
| 763 | 757 | ||
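
With __ARCH_WANT_INTERRUPTS_ON_CTXSW gone, prepare_lock_switch() and finish_lock_switch() keep a single code path: next->on_cpu is published before rq->lock is dropped, and prev->on_cpu is cleared only after smp_wmb(), so a remote waker spinning on ->on_cpu cannot see the flag drop before the outgoing task's state is fully written; interrupts are then re-enabled unconditionally. A minimal C11 sketch of that publish/observe handoff under the same ordering rule (generic thread and variable names, not the scheduler's data structures; build with -pthread):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static int task_state;            /* stand-in for state written by the outgoing task */
    static atomic_int on_cpu = 1;     /* stand-in for prev->on_cpu */

    /* Plays the role of finish_lock_switch(): finish the writes, then drop on_cpu. */
    static void *outgoing(void *arg)
    {
        (void)arg;
        task_state = 42;              /* stores that must be visible before the flag drops */
        /* release store ~ smp_wmb(); prev->on_cpu = 0; */
        atomic_store_explicit(&on_cpu, 0, memory_order_release);
        return NULL;
    }

    /* Plays the role of a remote waker spinning until the task is off the CPU. */
    static void *waker(void *arg)
    {
        (void)arg;
        while (atomic_load_explicit(&on_cpu, memory_order_acquire))
            ;                         /* ~ spinning on task_running() */
        printf("saw task_state = %d\n", task_state);   /* guaranteed to print 42 */
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;

        pthread_create(&a, NULL, waker, NULL);
        pthread_create(&b, NULL, outgoing, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
    }
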
| @@ -891,6 +885,9 @@ struct cpuacct { | |||
| 891 | struct kernel_cpustat __percpu *cpustat; | 885 | struct kernel_cpustat __percpu *cpustat; |
| 892 | }; | 886 | }; |
| 893 | 887 | ||
| 888 | extern struct cgroup_subsys cpuacct_subsys; | ||
| 889 | extern struct cpuacct root_cpuacct; | ||
| 890 | |||
| 894 | /* return cpu accounting group corresponding to this container */ | 891 | /* return cpu accounting group corresponding to this container */ |
| 895 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | 892 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) |
| 896 | { | 893 | { |
| @@ -917,6 +914,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
| 917 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 914 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
| 918 | #endif | 915 | #endif |
| 919 | 916 | ||
| 917 | #ifdef CONFIG_PARAVIRT | ||
| 918 | static inline u64 steal_ticks(u64 steal) | ||
| 919 | { | ||
| 920 | if (unlikely(steal > NSEC_PER_SEC)) | ||
| 921 | return div_u64(steal, TICK_NSEC); | ||
| 922 | |||
| 923 | return __iter_div_u64_rem(steal, TICK_NSEC, &steal); | ||
| 924 | } | ||
| 925 | #endif | ||
| 926 | |||
| 920 | static inline void inc_nr_running(struct rq *rq) | 927 | static inline void inc_nr_running(struct rq *rq) |
| 921 | { | 928 | { |
| 922 | rq->nr_running++; | 929 | rq->nr_running++; |
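
The new steal_ticks() helper converts a nanosecond backlog of paravirt steal time into scheduler ticks: for the common sub-second case it uses __iter_div_u64_rem(), which divides by repeated subtraction (cheap when the quotient is tiny, especially on 32-bit), and only falls back to a full 64-bit division for backlogs above one second. A standalone sketch of the same logic, with illustrative HZ=1000 constants rather than the kernel's configured TICK_NSEC:

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000ULL
    #define TICK_NSEC    1000000ULL       /* 1 ms per tick, i.e. HZ == 1000 */

    /* Divide by repeated subtraction; fine when the quotient is expected to be small. */
    static uint64_t iter_div_u64_rem(uint64_t dividend, uint32_t divisor,
                                     uint64_t *remainder)
    {
        uint64_t quot = 0;

        while (dividend >= divisor) {
            dividend -= divisor;
            quot++;
        }
        *remainder = dividend;
        return quot;
    }

    /* Same shape as the steal_ticks() helper added in the hunk above. */
    static uint64_t steal_ticks(uint64_t steal_ns)
    {
        if (steal_ns > NSEC_PER_SEC)      /* large backlog: a real division is cheaper */
            return steal_ns / TICK_NSEC;

        return iter_div_u64_rem(steal_ns, TICK_NSEC, &steal_ns);
    }

    int main(void)
    {
        printf("%llu ticks\n", (unsigned long long)steal_ticks(3500000));          /* 3 */
        printf("%llu ticks\n", (unsigned long long)steal_ticks(2 * NSEC_PER_SEC)); /* 2000 */
        return 0;
    }
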
| @@ -1156,3 +1163,53 @@ enum rq_nohz_flag_bits { | |||
| 1156 | 1163 | ||
| 1157 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | 1164 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) |
| 1158 | #endif | 1165 | #endif |
| 1166 | |||
| 1167 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 1168 | |||
| 1169 | DECLARE_PER_CPU(u64, cpu_hardirq_time); | ||
| 1170 | DECLARE_PER_CPU(u64, cpu_softirq_time); | ||
| 1171 | |||
| 1172 | #ifndef CONFIG_64BIT | ||
| 1173 | DECLARE_PER_CPU(seqcount_t, irq_time_seq); | ||
| 1174 | |||
| 1175 | static inline void irq_time_write_begin(void) | ||
| 1176 | { | ||
| 1177 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 1178 | smp_wmb(); | ||
| 1179 | } | ||
| 1180 | |||
| 1181 | static inline void irq_time_write_end(void) | ||
| 1182 | { | ||
| 1183 | smp_wmb(); | ||
| 1184 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 1185 | } | ||
| 1186 | |||
| 1187 | static inline u64 irq_time_read(int cpu) | ||
| 1188 | { | ||
| 1189 | u64 irq_time; | ||
| 1190 | unsigned seq; | ||
| 1191 | |||
| 1192 | do { | ||
| 1193 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
| 1194 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
| 1195 | per_cpu(cpu_hardirq_time, cpu); | ||
| 1196 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
| 1197 | |||
| 1198 | return irq_time; | ||
| 1199 | } | ||
| 1200 | #else /* CONFIG_64BIT */ | ||
| 1201 | static inline void irq_time_write_begin(void) | ||
| 1202 | { | ||
| 1203 | } | ||
| 1204 | |||
| 1205 | static inline void irq_time_write_end(void) | ||
| 1206 | { | ||
| 1207 | } | ||
| 1208 | |||
| 1209 | static inline u64 irq_time_read(int cpu) | ||
| 1210 | { | ||
| 1211 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
| 1212 | } | ||
| 1213 | #endif /* CONFIG_64BIT */ | ||
| 1214 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 1215 | |||
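
The CONFIG_IRQ_TIME_ACCOUNTING block moved into sched.h keeps the 32-bit special case: the sum of cpu_hardirq_time and cpu_softirq_time cannot be loaded atomically on 32-bit, so the writer brackets its updates with sequence-counter increments and smp_wmb(), and readers retry whenever they observe an odd or changed sequence. Below is a portable C11 sketch of that seqcount read/retry pattern; the payload is made relaxed-atomic so the example is race-free under the C memory model, whereas the kernel uses plain per-CPU u64s plus barriers. Build with -pthread; the assert shows that no torn update is ever observed.

    #include <assert.h>
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint64_t hardirq_time, softirq_time;  /* writer keeps these equal */
    static atomic_uint seq;                              /* even = stable, odd = update in flight */

    static void write_begin(void)
    {
        atomic_store_explicit(&seq,
                              atomic_load_explicit(&seq, memory_order_relaxed) + 1,
                              memory_order_relaxed);
        atomic_thread_fence(memory_order_release);       /* kernel: smp_wmb() */
    }

    static void write_end(void)
    {
        atomic_store_explicit(&seq,
                              atomic_load_explicit(&seq, memory_order_relaxed) + 1,
                              memory_order_release);     /* kernel: smp_wmb() + inc */
    }

    static void read_pair(uint64_t *h, uint64_t *s)
    {
        unsigned int start;

        do {
            start = atomic_load_explicit(&seq, memory_order_acquire);
            *h = atomic_load_explicit(&hardirq_time, memory_order_relaxed);
            *s = atomic_load_explicit(&softirq_time, memory_order_relaxed);
            atomic_thread_fence(memory_order_acquire);
        } while ((start & 1) ||
                 start != atomic_load_explicit(&seq, memory_order_relaxed));
    }

    static void *writer(void *arg)
    {
        (void)arg;
        for (uint64_t i = 1; i <= 1000000; i++) {
            write_begin();
            atomic_store_explicit(&hardirq_time, i, memory_order_relaxed);
            atomic_store_explicit(&softirq_time, i, memory_order_relaxed);
            write_end();
        }
        return NULL;
    }

    static void *reader(void *arg)
    {
        (void)arg;
        for (int i = 0; i < 1000000; i++) {
            uint64_t h, s;

            read_pair(&h, &s);
            assert(h == s);           /* a torn update would break this */
        }
        return NULL;
    }

    int main(void)
    {
        pthread_t w, r;

        pthread_create(&w, NULL, writer, NULL);
        pthread_create(&r, NULL, reader, NULL);
        pthread_join(w, NULL);
        pthread_join(r, NULL);
        puts("all reads were consistent");
        return 0;
    }
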
diff --git a/kernel/softirq.c b/kernel/softirq.c index 5c6a5bd8462f..cc96bdc0c2c9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -221,7 +221,7 @@ asmlinkage void __do_softirq(void) | |||
| 221 | current->flags &= ~PF_MEMALLOC; | 221 | current->flags &= ~PF_MEMALLOC; |
| 222 | 222 | ||
| 223 | pending = local_softirq_pending(); | 223 | pending = local_softirq_pending(); |
| 224 | account_system_vtime(current); | 224 | vtime_account(current); |
| 225 | 225 | ||
| 226 | __local_bh_disable((unsigned long)__builtin_return_address(0), | 226 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
| 227 | SOFTIRQ_OFFSET); | 227 | SOFTIRQ_OFFSET); |
| @@ -272,7 +272,7 @@ restart: | |||
| 272 | 272 | ||
| 273 | lockdep_softirq_exit(); | 273 | lockdep_softirq_exit(); |
| 274 | 274 | ||
| 275 | account_system_vtime(current); | 275 | vtime_account(current); |
| 276 | __local_bh_enable(SOFTIRQ_OFFSET); | 276 | __local_bh_enable(SOFTIRQ_OFFSET); |
| 277 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 277 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); |
| 278 | } | 278 | } |
| @@ -341,7 +341,7 @@ static inline void invoke_softirq(void) | |||
| 341 | */ | 341 | */ |
| 342 | void irq_exit(void) | 342 | void irq_exit(void) |
| 343 | { | 343 | { |
| 344 | account_system_vtime(current); | 344 | vtime_account(current); |
| 345 | trace_hardirq_exit(); | 345 | trace_hardirq_exit(); |
| 346 | sub_preempt_count(IRQ_EXIT_OFFSET); | 346 | sub_preempt_count(IRQ_EXIT_OFFSET); |
| 347 | if (!in_interrupt() && local_softirq_pending()) | 347 | if (!in_interrupt() && local_softirq_pending()) |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87174ef59161..81c7b1a1a307 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -307,7 +307,7 @@ static struct ctl_table kern_table[] = { | |||
| 307 | .extra2 = &max_sched_tunable_scaling, | 307 | .extra2 = &max_sched_tunable_scaling, |
| 308 | }, | 308 | }, |
| 309 | { | 309 | { |
| 310 | .procname = "sched_migration_cost", | 310 | .procname = "sched_migration_cost_ns", |
| 311 | .data = &sysctl_sched_migration_cost, | 311 | .data = &sysctl_sched_migration_cost, |
| 312 | .maxlen = sizeof(unsigned int), | 312 | .maxlen = sizeof(unsigned int), |
| 313 | .mode = 0644, | 313 | .mode = 0644, |
| @@ -321,14 +321,14 @@ static struct ctl_table kern_table[] = { | |||
| 321 | .proc_handler = proc_dointvec, | 321 | .proc_handler = proc_dointvec, |
| 322 | }, | 322 | }, |
| 323 | { | 323 | { |
| 324 | .procname = "sched_time_avg", | 324 | .procname = "sched_time_avg_ms", |
| 325 | .data = &sysctl_sched_time_avg, | 325 | .data = &sysctl_sched_time_avg, |
| 326 | .maxlen = sizeof(unsigned int), | 326 | .maxlen = sizeof(unsigned int), |
| 327 | .mode = 0644, | 327 | .mode = 0644, |
| 328 | .proc_handler = proc_dointvec, | 328 | .proc_handler = proc_dointvec, |
| 329 | }, | 329 | }, |
| 330 | { | 330 | { |
| 331 | .procname = "sched_shares_window", | 331 | .procname = "sched_shares_window_ns", |
| 332 | .data = &sysctl_sched_shares_window, | 332 | .data = &sysctl_sched_shares_window, |
| 333 | .maxlen = sizeof(unsigned int), | 333 | .maxlen = sizeof(unsigned int), |
| 334 | .mode = 0644, | 334 | .mode = 0644, |
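
The sysctl hunks only add unit suffixes to the /proc/sys names (sched_migration_cost_ns, sched_time_avg_ms, sched_shares_window_ns); the backing variables and semantics are unchanged, so existing scripts need only a path update. A small reader for the renamed migration-cost tunable, assuming a kernel of this era that still exposes it under /proc/sys/kernel (later kernels moved several scheduler tunables to debugfs):

    #include <stdio.h>

    int main(void)
    {
        const char *path = "/proc/sys/kernel/sched_migration_cost_ns";
        unsigned long long ns;
        FILE *f = fopen(path, "r");

        if (!f) {
            perror(path);
            return 1;
        }
        if (fscanf(f, "%llu", &ns) != 1) {
            fprintf(stderr, "could not parse %s\n", path);
            fclose(f);
            return 1;
        }
        fclose(f);
        /* tasks are treated as cache-hot for this long when migration is considered */
        printf("sched_migration_cost = %llu ns (%.3f ms)\n", ns, ns / 1e6);
        return 0;
    }
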
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cf5f6b262673..f423bdd035c2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -372,7 +372,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 372 | * the scheduler tick in nohz_restart_sched_tick. | 372 | * the scheduler tick in nohz_restart_sched_tick. |
| 373 | */ | 373 | */ |
| 374 | if (!ts->tick_stopped) { | 374 | if (!ts->tick_stopped) { |
| 375 | select_nohz_load_balancer(1); | 375 | nohz_balance_enter_idle(cpu); |
| 376 | calc_load_enter_idle(); | 376 | calc_load_enter_idle(); |
| 377 | 377 | ||
| 378 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); | 378 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
| @@ -570,7 +570,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
| 570 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | 570 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) |
| 571 | { | 571 | { |
| 572 | /* Update jiffies first */ | 572 | /* Update jiffies first */ |
| 573 | select_nohz_load_balancer(0); | ||
| 574 | tick_do_update_jiffies64(now); | 573 | tick_do_update_jiffies64(now); |
| 575 | update_cpu_load_nohz(); | 574 | update_cpu_load_nohz(); |
| 576 | 575 | ||
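
nohz_balance_enter_idle(cpu) replaces the old select_nohz_load_balancer() nomination scheme: each CPU that stops its tick records itself in a shared idle mask, the matching exit bookkeeping no longer lives in the tick-restart path, and whichever CPU later runs the idle-balance pass scans that mask. A toy single-threaded sketch of the bookkeeping only; the names mirror the kernel's, but the bodies are simplified stand-ins, not the real nohz.idle_cpus_mask machinery:

    #include <stdio.h>

    #define NR_CPUS 8

    /* Stand-ins for nohz.idle_cpus_mask and nohz.nr_cpus. */
    static unsigned int idle_cpus_mask;
    static int nr_idle_cpus;

    /* Called when a CPU stops its tick and goes NOHZ-idle. */
    static void nohz_balance_enter_idle(int cpu)
    {
        if (idle_cpus_mask & (1u << cpu))
            return;                       /* already recorded */
        idle_cpus_mask |= 1u << cpu;
        nr_idle_cpus++;
    }

    /* Called once the CPU is ticking and busy again. */
    static void nohz_balance_exit_idle(int cpu)
    {
        if (!(idle_cpus_mask & (1u << cpu)))
            return;
        idle_cpus_mask &= ~(1u << cpu);
        nr_idle_cpus--;
    }

    /* The balance pass walks the recorded idle CPUs. */
    static void nohz_idle_balance(void)
    {
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            if (idle_cpus_mask & (1u << cpu))
                printf("would pull load toward idle cpu %d\n", cpu);
    }

    int main(void)
    {
        nohz_balance_enter_idle(2);
        nohz_balance_enter_idle(5);
        nohz_idle_balance();
        nohz_balance_exit_idle(2);
        printf("%d cpu(s) still nohz-idle\n", nr_idle_cpus);
        return 0;
    }
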
