-rw-r--r--  arch/x86/include/asm/context_tracking.h | 21
-rw-r--r--  arch/x86/kernel/kvm.c | 8
-rw-r--r--  arch/x86/kernel/traps.c | 68
-rw-r--r--  arch/x86/mm/fault.c | 8
-rw-r--r--  include/linux/cgroup.h | 1
-rw-r--r--  include/linux/context_tracking.h | 24
-rw-r--r--  include/linux/math64.h | 19
-rw-r--r--  include/linux/sched.h | 204
-rw-r--r--  init/Kconfig | 1
-rw-r--r--  kernel/cgroup.c | 3
-rw-r--r--  kernel/fork.c | 2
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/core.c | 254
-rw-r--r--  kernel/sched/cpuacct.c | 296
-rw-r--r--  kernel/sched/cpuacct.h | 17
-rw-r--r--  kernel/sched/cputime.c | 214
-rw-r--r--  kernel/sched/fair.c | 148
-rw-r--r--  kernel/sched/idle_task.c | 16
-rw-r--r--  kernel/sched/sched.h | 219
-rw-r--r--  lib/div64.c | 19
20 files changed, 834 insertions, 709 deletions
diff --git a/arch/x86/include/asm/context_tracking.h b/arch/x86/include/asm/context_tracking.h
index 1616562683e9..1fe49704b146 100644
--- a/arch/x86/include/asm/context_tracking.h
+++ b/arch/x86/include/asm/context_tracking.h
@@ -1,31 +1,10 @@
1#ifndef _ASM_X86_CONTEXT_TRACKING_H 1#ifndef _ASM_X86_CONTEXT_TRACKING_H
2#define _ASM_X86_CONTEXT_TRACKING_H 2#define _ASM_X86_CONTEXT_TRACKING_H
3 3
4#ifndef __ASSEMBLY__
5#include <linux/context_tracking.h>
6#include <asm/ptrace.h>
7
8static inline void exception_enter(struct pt_regs *regs)
9{
10 user_exit();
11}
12
13static inline void exception_exit(struct pt_regs *regs)
14{
15#ifdef CONFIG_CONTEXT_TRACKING
16 if (user_mode(regs))
17 user_enter();
18#endif
19}
20
21#else /* __ASSEMBLY__ */
22
23#ifdef CONFIG_CONTEXT_TRACKING 4#ifdef CONFIG_CONTEXT_TRACKING
24# define SCHEDULE_USER call schedule_user 5# define SCHEDULE_USER call schedule_user
25#else 6#else
26# define SCHEDULE_USER call schedule 7# define SCHEDULE_USER call schedule
27#endif 8#endif
28 9
29#endif /* !__ASSEMBLY__ */
30
31#endif 10#endif
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b686a904d7c3..cd6d9a5a42f6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -20,6 +20,7 @@
20 * Authors: Anthony Liguori <aliguori@us.ibm.com> 20 * Authors: Anthony Liguori <aliguori@us.ibm.com>
21 */ 21 */
22 22
23#include <linux/context_tracking.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/kernel.h> 25#include <linux/kernel.h>
25#include <linux/kvm_para.h> 26#include <linux/kvm_para.h>
@@ -43,7 +44,6 @@
43#include <asm/apicdef.h> 44#include <asm/apicdef.h>
44#include <asm/hypervisor.h> 45#include <asm/hypervisor.h>
45#include <asm/kvm_guest.h> 46#include <asm/kvm_guest.h>
46#include <asm/context_tracking.h>
47 47
48static int kvmapf = 1; 48static int kvmapf = 1;
49 49
@@ -254,16 +254,18 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
254dotraplinkage void __kprobes 254dotraplinkage void __kprobes
255do_async_page_fault(struct pt_regs *regs, unsigned long error_code) 255do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
256{ 256{
257 enum ctx_state prev_state;
258
257 switch (kvm_read_and_reset_pf_reason()) { 259 switch (kvm_read_and_reset_pf_reason()) {
258 default: 260 default:
259 do_page_fault(regs, error_code); 261 do_page_fault(regs, error_code);
260 break; 262 break;
261 case KVM_PV_REASON_PAGE_NOT_PRESENT: 263 case KVM_PV_REASON_PAGE_NOT_PRESENT:
262 /* page is swapped out by the host. */ 264 /* page is swapped out by the host. */
263 exception_enter(regs); 265 prev_state = exception_enter();
264 exit_idle(); 266 exit_idle();
265 kvm_async_pf_task_wait((u32)read_cr2()); 267 kvm_async_pf_task_wait((u32)read_cr2());
266 exception_exit(regs); 268 exception_exit(prev_state);
267 break; 269 break;
268 case KVM_PV_REASON_PAGE_READY: 270 case KVM_PV_REASON_PAGE_READY:
269 rcu_irq_enter(); 271 rcu_irq_enter();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 68bda7a84159..ff6d2271cbe2 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -12,6 +12,7 @@
12 12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 14
15#include <linux/context_tracking.h>
15#include <linux/interrupt.h> 16#include <linux/interrupt.h>
16#include <linux/kallsyms.h> 17#include <linux/kallsyms.h>
17#include <linux/spinlock.h> 18#include <linux/spinlock.h>
@@ -55,8 +56,6 @@
55#include <asm/i387.h> 56#include <asm/i387.h>
56#include <asm/fpu-internal.h> 57#include <asm/fpu-internal.h>
57#include <asm/mce.h> 58#include <asm/mce.h>
58#include <asm/context_tracking.h>
59
60#include <asm/mach_traps.h> 59#include <asm/mach_traps.h>
61 60
62#ifdef CONFIG_X86_64 61#ifdef CONFIG_X86_64
@@ -176,34 +175,38 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
176#define DO_ERROR(trapnr, signr, str, name) \ 175#define DO_ERROR(trapnr, signr, str, name) \
177dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ 176dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
178{ \ 177{ \
179 exception_enter(regs); \ 178 enum ctx_state prev_state; \
179 \
180 prev_state = exception_enter(); \
180 if (notify_die(DIE_TRAP, str, regs, error_code, \ 181 if (notify_die(DIE_TRAP, str, regs, error_code, \
181 trapnr, signr) == NOTIFY_STOP) { \ 182 trapnr, signr) == NOTIFY_STOP) { \
182 exception_exit(regs); \ 183 exception_exit(prev_state); \
183 return; \ 184 return; \
184 } \ 185 } \
185 conditional_sti(regs); \ 186 conditional_sti(regs); \
186 do_trap(trapnr, signr, str, regs, error_code, NULL); \ 187 do_trap(trapnr, signr, str, regs, error_code, NULL); \
187 exception_exit(regs); \ 188 exception_exit(prev_state); \
188} 189}
189 190
190#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 191#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
191dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ 192dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
192{ \ 193{ \
193 siginfo_t info; \ 194 siginfo_t info; \
195 enum ctx_state prev_state; \
196 \
194 info.si_signo = signr; \ 197 info.si_signo = signr; \
195 info.si_errno = 0; \ 198 info.si_errno = 0; \
196 info.si_code = sicode; \ 199 info.si_code = sicode; \
197 info.si_addr = (void __user *)siaddr; \ 200 info.si_addr = (void __user *)siaddr; \
198 exception_enter(regs); \ 201 prev_state = exception_enter(); \
199 if (notify_die(DIE_TRAP, str, regs, error_code, \ 202 if (notify_die(DIE_TRAP, str, regs, error_code, \
200 trapnr, signr) == NOTIFY_STOP) { \ 203 trapnr, signr) == NOTIFY_STOP) { \
201 exception_exit(regs); \ 204 exception_exit(prev_state); \
202 return; \ 205 return; \
203 } \ 206 } \
204 conditional_sti(regs); \ 207 conditional_sti(regs); \
205 do_trap(trapnr, signr, str, regs, error_code, &info); \ 208 do_trap(trapnr, signr, str, regs, error_code, &info); \
206 exception_exit(regs); \ 209 exception_exit(prev_state); \
207} 210}
208 211
209DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, 212DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV,
@@ -226,14 +229,16 @@ DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,
226/* Runs on IST stack */ 229/* Runs on IST stack */
227dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) 230dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
228{ 231{
229 exception_enter(regs); 232 enum ctx_state prev_state;
233
234 prev_state = exception_enter();
230 if (notify_die(DIE_TRAP, "stack segment", regs, error_code, 235 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
231 X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { 236 X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) {
232 preempt_conditional_sti(regs); 237 preempt_conditional_sti(regs);
233 do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); 238 do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL);
234 preempt_conditional_cli(regs); 239 preempt_conditional_cli(regs);
235 } 240 }
236 exception_exit(regs); 241 exception_exit(prev_state);
237} 242}
238 243
239dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) 244dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
@@ -241,7 +246,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
241 static const char str[] = "double fault"; 246 static const char str[] = "double fault";
242 struct task_struct *tsk = current; 247 struct task_struct *tsk = current;
243 248
244 exception_enter(regs); 249 exception_enter();
245 /* Return not checked because double check cannot be ignored */ 250 /* Return not checked because double check cannot be ignored */
246 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); 251 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
247 252
@@ -261,8 +266,9 @@ dotraplinkage void __kprobes
261do_general_protection(struct pt_regs *regs, long error_code) 266do_general_protection(struct pt_regs *regs, long error_code)
262{ 267{
263 struct task_struct *tsk; 268 struct task_struct *tsk;
269 enum ctx_state prev_state;
264 270
265 exception_enter(regs); 271 prev_state = exception_enter();
266 conditional_sti(regs); 272 conditional_sti(regs);
267 273
268#ifdef CONFIG_X86_32 274#ifdef CONFIG_X86_32
@@ -300,12 +306,14 @@ do_general_protection(struct pt_regs *regs, long error_code)
300 306
301 force_sig(SIGSEGV, tsk); 307 force_sig(SIGSEGV, tsk);
302exit: 308exit:
303 exception_exit(regs); 309 exception_exit(prev_state);
304} 310}
305 311
306/* May run on IST stack. */ 312/* May run on IST stack. */
307dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) 313dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code)
308{ 314{
315 enum ctx_state prev_state;
316
309#ifdef CONFIG_DYNAMIC_FTRACE 317#ifdef CONFIG_DYNAMIC_FTRACE
310 /* 318 /*
311 * ftrace must be first, everything else may cause a recursive crash. 319 * ftrace must be first, everything else may cause a recursive crash.
@@ -315,7 +323,7 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
315 ftrace_int3_handler(regs)) 323 ftrace_int3_handler(regs))
316 return; 324 return;
317#endif 325#endif
318 exception_enter(regs); 326 prev_state = exception_enter();
319#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP 327#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
320 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 328 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
321 SIGTRAP) == NOTIFY_STOP) 329 SIGTRAP) == NOTIFY_STOP)
@@ -336,7 +344,7 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co
336 preempt_conditional_cli(regs); 344 preempt_conditional_cli(regs);
337 debug_stack_usage_dec(); 345 debug_stack_usage_dec();
338exit: 346exit:
339 exception_exit(regs); 347 exception_exit(prev_state);
340} 348}
341 349
342#ifdef CONFIG_X86_64 350#ifdef CONFIG_X86_64
@@ -393,11 +401,12 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
393dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) 401dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
394{ 402{
395 struct task_struct *tsk = current; 403 struct task_struct *tsk = current;
404 enum ctx_state prev_state;
396 int user_icebp = 0; 405 int user_icebp = 0;
397 unsigned long dr6; 406 unsigned long dr6;
398 int si_code; 407 int si_code;
399 408
400 exception_enter(regs); 409 prev_state = exception_enter();
401 410
402 get_debugreg(dr6, 6); 411 get_debugreg(dr6, 6);
403 412
@@ -467,7 +476,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
467 debug_stack_usage_dec(); 476 debug_stack_usage_dec();
468 477
469exit: 478exit:
470 exception_exit(regs); 479 exception_exit(prev_state);
471} 480}
472 481
473/* 482/*
@@ -561,17 +570,21 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
561 570
562dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) 571dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
563{ 572{
564 exception_enter(regs); 573 enum ctx_state prev_state;
574
575 prev_state = exception_enter();
565 math_error(regs, error_code, X86_TRAP_MF); 576 math_error(regs, error_code, X86_TRAP_MF);
566 exception_exit(regs); 577 exception_exit(prev_state);
567} 578}
568 579
569dotraplinkage void 580dotraplinkage void
570do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 581do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
571{ 582{
572 exception_enter(regs); 583 enum ctx_state prev_state;
584
585 prev_state = exception_enter();
573 math_error(regs, error_code, X86_TRAP_XF); 586 math_error(regs, error_code, X86_TRAP_XF);
574 exception_exit(regs); 587 exception_exit(prev_state);
575} 588}
576 589
577dotraplinkage void 590dotraplinkage void
@@ -639,7 +652,9 @@ EXPORT_SYMBOL_GPL(math_state_restore);
639dotraplinkage void __kprobes 652dotraplinkage void __kprobes
640do_device_not_available(struct pt_regs *regs, long error_code) 653do_device_not_available(struct pt_regs *regs, long error_code)
641{ 654{
642 exception_enter(regs); 655 enum ctx_state prev_state;
656
657 prev_state = exception_enter();
643 BUG_ON(use_eager_fpu()); 658 BUG_ON(use_eager_fpu());
644 659
645#ifdef CONFIG_MATH_EMULATION 660#ifdef CONFIG_MATH_EMULATION
@@ -650,7 +665,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
650 665
651 info.regs = regs; 666 info.regs = regs;
652 math_emulate(&info); 667 math_emulate(&info);
653 exception_exit(regs); 668 exception_exit(prev_state);
654 return; 669 return;
655 } 670 }
656#endif 671#endif
@@ -658,15 +673,16 @@ do_device_not_available(struct pt_regs *regs, long error_code)
658#ifdef CONFIG_X86_32 673#ifdef CONFIG_X86_32
659 conditional_sti(regs); 674 conditional_sti(regs);
660#endif 675#endif
661 exception_exit(regs); 676 exception_exit(prev_state);
662} 677}
663 678
664#ifdef CONFIG_X86_32 679#ifdef CONFIG_X86_32
665dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) 680dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
666{ 681{
667 siginfo_t info; 682 siginfo_t info;
683 enum ctx_state prev_state;
668 684
669 exception_enter(regs); 685 prev_state = exception_enter();
670 local_irq_enable(); 686 local_irq_enable();
671 687
672 info.si_signo = SIGILL; 688 info.si_signo = SIGILL;
@@ -678,7 +694,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
678 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, 694 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
679 &info); 695 &info);
680 } 696 }
681 exception_exit(regs); 697 exception_exit(prev_state);
682} 698}
683#endif 699#endif
684 700
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 0e883364abb5..022a9a0a3c63 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -13,12 +13,12 @@
13#include <linux/perf_event.h> /* perf_sw_event */ 13#include <linux/perf_event.h> /* perf_sw_event */
14#include <linux/hugetlb.h> /* hstate_index_to_shift */ 14#include <linux/hugetlb.h> /* hstate_index_to_shift */
15#include <linux/prefetch.h> /* prefetchw */ 15#include <linux/prefetch.h> /* prefetchw */
16#include <linux/context_tracking.h> /* exception_enter(), ... */
16 17
17#include <asm/traps.h> /* dotraplinkage, ... */ 18#include <asm/traps.h> /* dotraplinkage, ... */
18#include <asm/pgalloc.h> /* pgd_*(), ... */ 19#include <asm/pgalloc.h> /* pgd_*(), ... */
19#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ 20#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
20#include <asm/fixmap.h> /* VSYSCALL_START */ 21#include <asm/fixmap.h> /* VSYSCALL_START */
21#include <asm/context_tracking.h> /* exception_enter(), ... */
22 22
23/* 23/*
24 * Page fault error code bits: 24 * Page fault error code bits:
@@ -1224,7 +1224,9 @@ good_area:
1224dotraplinkage void __kprobes 1224dotraplinkage void __kprobes
1225do_page_fault(struct pt_regs *regs, unsigned long error_code) 1225do_page_fault(struct pt_regs *regs, unsigned long error_code)
1226{ 1226{
1227 exception_enter(regs); 1227 enum ctx_state prev_state;
1228
1229 prev_state = exception_enter();
1228 __do_page_fault(regs, error_code); 1230 __do_page_fault(regs, error_code);
1229 exception_exit(regs); 1231 exception_exit(prev_state);
1230} 1232}
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d86e215ca2b8..646ab9d15e42 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -586,7 +586,6 @@ struct cgroup_subsys {
586 void (*bind)(struct cgroup *root); 586 void (*bind)(struct cgroup *root);
587 587
588 int subsys_id; 588 int subsys_id;
589 int active;
590 int disabled; 589 int disabled;
591 int early_init; 590 int early_init;
592 /* 591 /*
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index b28d161c1091..365f4a61bf04 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -1,9 +1,9 @@
1#ifndef _LINUX_CONTEXT_TRACKING_H 1#ifndef _LINUX_CONTEXT_TRACKING_H
2#define _LINUX_CONTEXT_TRACKING_H 2#define _LINUX_CONTEXT_TRACKING_H
3 3
4#ifdef CONFIG_CONTEXT_TRACKING
5#include <linux/sched.h> 4#include <linux/sched.h>
6#include <linux/percpu.h> 5#include <linux/percpu.h>
6#include <asm/ptrace.h>
7 7
8struct context_tracking { 8struct context_tracking {
9 /* 9 /*
@@ -13,12 +13,13 @@ struct context_tracking {
13 * may be further optimized using static keys. 13 * may be further optimized using static keys.
14 */ 14 */
15 bool active; 15 bool active;
16 enum { 16 enum ctx_state {
17 IN_KERNEL = 0, 17 IN_KERNEL = 0,
18 IN_USER, 18 IN_USER,
19 } state; 19 } state;
20}; 20};
21 21
22#ifdef CONFIG_CONTEXT_TRACKING
22DECLARE_PER_CPU(struct context_tracking, context_tracking); 23DECLARE_PER_CPU(struct context_tracking, context_tracking);
23 24
24static inline bool context_tracking_in_user(void) 25static inline bool context_tracking_in_user(void)
@@ -33,12 +34,31 @@ static inline bool context_tracking_active(void)
33 34
34extern void user_enter(void); 35extern void user_enter(void);
35extern void user_exit(void); 36extern void user_exit(void);
37
38static inline enum ctx_state exception_enter(void)
39{
40 enum ctx_state prev_ctx;
41
42 prev_ctx = this_cpu_read(context_tracking.state);
43 user_exit();
44
45 return prev_ctx;
46}
47
48static inline void exception_exit(enum ctx_state prev_ctx)
49{
50 if (prev_ctx == IN_USER)
51 user_enter();
52}
53
36extern void context_tracking_task_switch(struct task_struct *prev, 54extern void context_tracking_task_switch(struct task_struct *prev,
37 struct task_struct *next); 55 struct task_struct *next);
38#else 56#else
39static inline bool context_tracking_in_user(void) { return false; } 57static inline bool context_tracking_in_user(void) { return false; }
40static inline void user_enter(void) { } 58static inline void user_enter(void) { }
41static inline void user_exit(void) { } 59static inline void user_exit(void) { }
60static inline enum ctx_state exception_enter(void) { return 0; }
61static inline void exception_exit(enum ctx_state prev_ctx) { }
42static inline void context_tracking_task_switch(struct task_struct *prev, 62static inline void context_tracking_task_switch(struct task_struct *prev,
43 struct task_struct *next) { } 63 struct task_struct *next) { }
44#endif /* !CONFIG_CONTEXT_TRACKING */ 64#endif /* !CONFIG_CONTEXT_TRACKING */
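
(Illustration, not part of the patch.) A minimal sketch of the calling convention the hunks above and below switch to: a handler saves the previous context-tracking state returned by exception_enter() and hands it back to exception_exit(), instead of passing pt_regs. The handler name is made up; the pattern mirrors the traps.c conversions earlier in this diff.

/* Hypothetical handler, following the converted x86 handlers in this series. */
dotraplinkage void do_example_trap(struct pt_regs *regs, long error_code)
{
	enum ctx_state prev_state;

	prev_state = exception_enter();	/* tell context tracking we left user mode */
	/* ... handle the trap; may schedule, use RCU, send signals ... */
	exception_exit(prev_state);	/* re-enter user mode only if we came from it */
}
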
diff --git a/include/linux/math64.h b/include/linux/math64.h
index b8ba85544721..931a619407bf 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -30,6 +30,15 @@ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder)
30} 30}
31 31
32/** 32/**
33 * div64_u64_rem - unsigned 64bit divide with 64bit divisor
34 */
35static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
36{
37 *remainder = dividend % divisor;
38 return dividend / divisor;
39}
40
41/**
33 * div64_u64 - unsigned 64bit divide with 64bit divisor 42 * div64_u64 - unsigned 64bit divide with 64bit divisor
34 */ 43 */
35static inline u64 div64_u64(u64 dividend, u64 divisor) 44static inline u64 div64_u64(u64 dividend, u64 divisor)
@@ -61,8 +70,16 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
61extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); 70extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder);
62#endif 71#endif
63 72
73#ifndef div64_u64_rem
74extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder);
75#endif
76
64#ifndef div64_u64 77#ifndef div64_u64
65extern u64 div64_u64(u64 dividend, u64 divisor); 78static inline u64 div64_u64(u64 dividend, u64 divisor)
79{
80 u64 remainder;
81 return div64_u64_rem(dividend, divisor, &remainder);
82}
66#endif 83#endif
67 84
68#ifndef div64_s64 85#ifndef div64_s64
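
(Illustration, not part of the patch.) A short usage sketch of the div64_u64_rem() helper introduced above; div64_u64() is now defined in terms of it, so both yield the same quotient. The values here are arbitrary.

u64 bytes = 1000000007ULL, bucket = 4096ULL, rem;
u64 full = div64_u64_rem(bytes, bucket, &rem);	/* full = 244140, rem = 2567 */
u64 same = div64_u64(bytes, bucket);		/* same quotient, remainder discarded */
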
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bcbc30397f23..01c7d85bcaa7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -127,18 +127,6 @@ extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
127extern void proc_sched_set_task(struct task_struct *p); 127extern void proc_sched_set_task(struct task_struct *p);
128extern void 128extern void
129print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); 129print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
130#else
131static inline void
132proc_sched_show_task(struct task_struct *p, struct seq_file *m)
133{
134}
135static inline void proc_sched_set_task(struct task_struct *p)
136{
137}
138static inline void
139print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
140{
141}
142#endif 130#endif
143 131
144/* 132/*
@@ -570,7 +558,7 @@ struct signal_struct {
570 cputime_t utime, stime, cutime, cstime; 558 cputime_t utime, stime, cutime, cstime;
571 cputime_t gtime; 559 cputime_t gtime;
572 cputime_t cgtime; 560 cputime_t cgtime;
573#ifndef CONFIG_VIRT_CPU_ACCOUNTING 561#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
574 struct cputime prev_cputime; 562 struct cputime prev_cputime;
575#endif 563#endif
576 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; 564 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
@@ -768,31 +756,6 @@ enum cpu_idle_type {
768}; 756};
769 757
770/* 758/*
771 * Increase resolution of nice-level calculations for 64-bit architectures.
772 * The extra resolution improves shares distribution and load balancing of
773 * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
774 * hierarchies, especially on larger systems. This is not a user-visible change
775 * and does not change the user-interface for setting shares/weights.
776 *
777 * We increase resolution only if we have enough bits to allow this increased
778 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
779 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
780 * increased costs.
781 */
782#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
783# define SCHED_LOAD_RESOLUTION 10
784# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
785# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
786#else
787# define SCHED_LOAD_RESOLUTION 0
788# define scale_load(w) (w)
789# define scale_load_down(w) (w)
790#endif
791
792#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
793#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
794
795/*
796 * Increase resolution of cpu_power calculations 759 * Increase resolution of cpu_power calculations
797 */ 760 */
798#define SCHED_POWER_SHIFT 10 761#define SCHED_POWER_SHIFT 10
@@ -817,62 +780,6 @@ enum cpu_idle_type {
817 780
818extern int __weak arch_sd_sibiling_asym_packing(void); 781extern int __weak arch_sd_sibiling_asym_packing(void);
819 782
820struct sched_group_power {
821 atomic_t ref;
822 /*
823 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
824 * single CPU.
825 */
826 unsigned int power, power_orig;
827 unsigned long next_update;
828 /*
829 * Number of busy cpus in this group.
830 */
831 atomic_t nr_busy_cpus;
832
833 unsigned long cpumask[0]; /* iteration mask */
834};
835
836struct sched_group {
837 struct sched_group *next; /* Must be a circular list */
838 atomic_t ref;
839
840 unsigned int group_weight;
841 struct sched_group_power *sgp;
842
843 /*
844 * The CPUs this group covers.
845 *
846 * NOTE: this field is variable length. (Allocated dynamically
847 * by attaching extra space to the end of the structure,
848 * depending on how many CPUs the kernel has booted up with)
849 */
850 unsigned long cpumask[0];
851};
852
853static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
854{
855 return to_cpumask(sg->cpumask);
856}
857
858/*
859 * cpumask masking which cpus in the group are allowed to iterate up the domain
860 * tree.
861 */
862static inline struct cpumask *sched_group_mask(struct sched_group *sg)
863{
864 return to_cpumask(sg->sgp->cpumask);
865}
866
867/**
868 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
869 * @group: The group whose first cpu is to be returned.
870 */
871static inline unsigned int group_first_cpu(struct sched_group *group)
872{
873 return cpumask_first(sched_group_cpus(group));
874}
875
876struct sched_domain_attr { 783struct sched_domain_attr {
877 int relax_domain_level; 784 int relax_domain_level;
878}; 785};
@@ -883,6 +790,8 @@ struct sched_domain_attr {
883 790
884extern int sched_domain_level_max; 791extern int sched_domain_level_max;
885 792
793struct sched_group;
794
886struct sched_domain { 795struct sched_domain {
887 /* These fields must be setup */ 796 /* These fields must be setup */
888 struct sched_domain *parent; /* top domain must be null terminated */ 797 struct sched_domain *parent; /* top domain must be null terminated */
@@ -899,6 +808,8 @@ struct sched_domain {
899 unsigned int wake_idx; 808 unsigned int wake_idx;
900 unsigned int forkexec_idx; 809 unsigned int forkexec_idx;
901 unsigned int smt_gain; 810 unsigned int smt_gain;
811
812 int nohz_idle; /* NOHZ IDLE status */
902 int flags; /* See SD_* */ 813 int flags; /* See SD_* */
903 int level; 814 int level;
904 815
@@ -971,18 +882,6 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
971cpumask_var_t *alloc_sched_domains(unsigned int ndoms); 882cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
972void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); 883void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
973 884
974/* Test a flag in parent sched domain */
975static inline int test_sd_parent(struct sched_domain *sd, int flag)
976{
977 if (sd->parent && (sd->parent->flags & flag))
978 return 1;
979
980 return 0;
981}
982
983unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
984unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
985
986bool cpus_share_cache(int this_cpu, int that_cpu); 885bool cpus_share_cache(int this_cpu, int that_cpu);
987 886
988#else /* CONFIG_SMP */ 887#else /* CONFIG_SMP */
@@ -1017,72 +916,6 @@ struct mempolicy;
1017struct pipe_inode_info; 916struct pipe_inode_info;
1018struct uts_namespace; 917struct uts_namespace;
1019 918
1020struct rq;
1021struct sched_domain;
1022
1023/*
1024 * wake flags
1025 */
1026#define WF_SYNC 0x01 /* waker goes to sleep after wakup */
1027#define WF_FORK 0x02 /* child wakeup after fork */
1028#define WF_MIGRATED 0x04 /* internal use, task got migrated */
1029
1030#define ENQUEUE_WAKEUP 1
1031#define ENQUEUE_HEAD 2
1032#ifdef CONFIG_SMP
1033#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
1034#else
1035#define ENQUEUE_WAKING 0
1036#endif
1037
1038#define DEQUEUE_SLEEP 1
1039
1040struct sched_class {
1041 const struct sched_class *next;
1042
1043 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1044 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1045 void (*yield_task) (struct rq *rq);
1046 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
1047
1048 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1049
1050 struct task_struct * (*pick_next_task) (struct rq *rq);
1051 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1052
1053#ifdef CONFIG_SMP
1054 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
1055 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
1056
1057 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1058 void (*post_schedule) (struct rq *this_rq);
1059 void (*task_waking) (struct task_struct *task);
1060 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
1061
1062 void (*set_cpus_allowed)(struct task_struct *p,
1063 const struct cpumask *newmask);
1064
1065 void (*rq_online)(struct rq *rq);
1066 void (*rq_offline)(struct rq *rq);
1067#endif
1068
1069 void (*set_curr_task) (struct rq *rq);
1070 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1071 void (*task_fork) (struct task_struct *p);
1072
1073 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1074 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1075 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1076 int oldprio);
1077
1078 unsigned int (*get_rr_interval) (struct rq *rq,
1079 struct task_struct *task);
1080
1081#ifdef CONFIG_FAIR_GROUP_SCHED
1082 void (*task_move_group) (struct task_struct *p, int on_rq);
1083#endif
1084};
1085
1086struct load_weight { 919struct load_weight {
1087 unsigned long weight, inv_weight; 920 unsigned long weight, inv_weight;
1088}; 921};
@@ -1274,8 +1107,10 @@ struct task_struct {
1274 int exit_code, exit_signal; 1107 int exit_code, exit_signal;
1275 int pdeath_signal; /* The signal sent when the parent dies */ 1108 int pdeath_signal; /* The signal sent when the parent dies */
1276 unsigned int jobctl; /* JOBCTL_*, siglock protected */ 1109 unsigned int jobctl; /* JOBCTL_*, siglock protected */
1277 /* ??? */ 1110
1111 /* Used for emulating ABI behavior of previous Linux versions */
1278 unsigned int personality; 1112 unsigned int personality;
1113
1279 unsigned did_exec:1; 1114 unsigned did_exec:1;
1280 unsigned in_execve:1; /* Tell the LSMs that the process is doing an 1115 unsigned in_execve:1; /* Tell the LSMs that the process is doing an
1281 * execve */ 1116 * execve */
@@ -1327,7 +1162,7 @@ struct task_struct {
1327 1162
1328 cputime_t utime, stime, utimescaled, stimescaled; 1163 cputime_t utime, stime, utimescaled, stimescaled;
1329 cputime_t gtime; 1164 cputime_t gtime;
1330#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1165#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
1331 struct cputime prev_cputime; 1166 struct cputime prev_cputime;
1332#endif 1167#endif
1333#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1168#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -2681,28 +2516,7 @@ extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
2681extern long sched_getaffinity(pid_t pid, struct cpumask *mask); 2516extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
2682 2517
2683#ifdef CONFIG_CGROUP_SCHED 2518#ifdef CONFIG_CGROUP_SCHED
2684
2685extern struct task_group root_task_group; 2519extern struct task_group root_task_group;
2686
2687extern struct task_group *sched_create_group(struct task_group *parent);
2688extern void sched_online_group(struct task_group *tg,
2689 struct task_group *parent);
2690extern void sched_destroy_group(struct task_group *tg);
2691extern void sched_offline_group(struct task_group *tg);
2692extern void sched_move_task(struct task_struct *tsk);
2693#ifdef CONFIG_FAIR_GROUP_SCHED
2694extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
2695extern unsigned long sched_group_shares(struct task_group *tg);
2696#endif
2697#ifdef CONFIG_RT_GROUP_SCHED
2698extern int sched_group_set_rt_runtime(struct task_group *tg,
2699 long rt_runtime_us);
2700extern long sched_group_rt_runtime(struct task_group *tg);
2701extern int sched_group_set_rt_period(struct task_group *tg,
2702 long rt_period_us);
2703extern long sched_group_rt_period(struct task_group *tg);
2704extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
2705#endif
2706#endif /* CONFIG_CGROUP_SCHED */ 2520#endif /* CONFIG_CGROUP_SCHED */
2707 2521
2708extern int task_can_switch_user(struct user_struct *up, 2522extern int task_can_switch_user(struct user_struct *up,
diff --git a/init/Kconfig b/init/Kconfig
index 71bb9e73011a..4367e1379002 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -505,6 +505,7 @@ config RCU_USER_QS
505config CONTEXT_TRACKING_FORCE 505config CONTEXT_TRACKING_FORCE
506 bool "Force context tracking" 506 bool "Force context tracking"
507 depends on CONTEXT_TRACKING 507 depends on CONTEXT_TRACKING
508 default CONTEXT_TRACKING
508 help 509 help
509 Probe on user/kernel boundaries by default in order to 510 Probe on user/kernel boundaries by default in order to
510 test the features that rely on it such as userspace RCU extended 511 test the features that rely on it such as userspace RCU extended
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index eeb7e49946b2..d3abce2d6455 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4380,7 +4380,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4380 * need to invoke fork callbacks here. */ 4380 * need to invoke fork callbacks here. */
4381 BUG_ON(!list_empty(&init_task.tasks)); 4381 BUG_ON(!list_empty(&init_task.tasks));
4382 4382
4383 ss->active = 1;
4384 BUG_ON(online_css(ss, dummytop)); 4383 BUG_ON(online_css(ss, dummytop));
4385 4384
4386 mutex_unlock(&cgroup_mutex); 4385 mutex_unlock(&cgroup_mutex);
@@ -4485,7 +4484,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4485 } 4484 }
4486 write_unlock(&css_set_lock); 4485 write_unlock(&css_set_lock);
4487 4486
4488 ss->active = 1;
4489 ret = online_css(ss, dummytop); 4487 ret = online_css(ss, dummytop);
4490 if (ret) 4488 if (ret)
4491 goto err_unload; 4489 goto err_unload;
@@ -4526,7 +4524,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4526 mutex_lock(&cgroup_mutex); 4524 mutex_lock(&cgroup_mutex);
4527 4525
4528 offline_css(ss, dummytop); 4526 offline_css(ss, dummytop);
4529 ss->active = 0;
4530 4527
4531 if (ss->use_id) 4528 if (ss->use_id)
4532 idr_destroy(&ss->idr); 4529 idr_destroy(&ss->idr);
diff --git a/kernel/fork.c b/kernel/fork.c
index 1766d324d5e3..339f60dfd62b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1233,7 +1233,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1233 1233
1234 p->utime = p->stime = p->gtime = 0; 1234 p->utime = p->stime = p->gtime = 0;
1235 p->utimescaled = p->stimescaled = 0; 1235 p->utimescaled = p->stimescaled = 0;
1236#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1236#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
1237 p->prev_cputime.utime = p->prev_cputime.stime = 0; 1237 p->prev_cputime.utime = p->prev_cputime.stime = 0;
1238#endif 1238#endif
1239#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1239#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index f06d249e103b..deaf90e4a1de 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o 18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8285eb0cde6..ebdb19541218 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1288,8 +1288,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1288static void 1288static void
1289ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1289ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1290{ 1290{
1291 trace_sched_wakeup(p, true);
1292 check_preempt_curr(rq, p, wake_flags); 1291 check_preempt_curr(rq, p, wake_flags);
1292 trace_sched_wakeup(p, true);
1293 1293
1294 p->state = TASK_RUNNING; 1294 p->state = TASK_RUNNING;
1295#ifdef CONFIG_SMP 1295#ifdef CONFIG_SMP
@@ -3039,11 +3039,13 @@ EXPORT_SYMBOL(preempt_schedule);
3039asmlinkage void __sched preempt_schedule_irq(void) 3039asmlinkage void __sched preempt_schedule_irq(void)
3040{ 3040{
3041 struct thread_info *ti = current_thread_info(); 3041 struct thread_info *ti = current_thread_info();
3042 enum ctx_state prev_state;
3042 3043
3043 /* Catch callers which need to be fixed */ 3044 /* Catch callers which need to be fixed */
3044 BUG_ON(ti->preempt_count || !irqs_disabled()); 3045 BUG_ON(ti->preempt_count || !irqs_disabled());
3045 3046
3046 user_exit(); 3047 prev_state = exception_enter();
3048
3047 do { 3049 do {
3048 add_preempt_count(PREEMPT_ACTIVE); 3050 add_preempt_count(PREEMPT_ACTIVE);
3049 local_irq_enable(); 3051 local_irq_enable();
@@ -3057,6 +3059,8 @@ asmlinkage void __sched preempt_schedule_irq(void)
3057 */ 3059 */
3058 barrier(); 3060 barrier();
3059 } while (need_resched()); 3061 } while (need_resched());
3062
3063 exception_exit(prev_state);
3060} 3064}
3061 3065
3062#endif /* CONFIG_PREEMPT */ 3066#endif /* CONFIG_PREEMPT */
@@ -6204,7 +6208,7 @@ static void sched_init_numa(void)
6204 * 'level' contains the number of unique distances, excluding the 6208 * 'level' contains the number of unique distances, excluding the
6205 * identity distance node_distance(i,i). 6209 * identity distance node_distance(i,i).
6206 * 6210 *
6207 * The sched_domains_nume_distance[] array includes the actual distance 6211 * The sched_domains_numa_distance[] array includes the actual distance
6208 * numbers. 6212 * numbers.
6209 */ 6213 */
6210 6214
@@ -6817,11 +6821,15 @@ int in_sched_functions(unsigned long addr)
6817} 6821}
6818 6822
6819#ifdef CONFIG_CGROUP_SCHED 6823#ifdef CONFIG_CGROUP_SCHED
6824/*
6825 * Default task group.
6826 * Every task in system belongs to this group at bootup.
6827 */
6820struct task_group root_task_group; 6828struct task_group root_task_group;
6821LIST_HEAD(task_groups); 6829LIST_HEAD(task_groups);
6822#endif 6830#endif
6823 6831
6824DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 6832DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6825 6833
6826void __init sched_init(void) 6834void __init sched_init(void)
6827{ 6835{
@@ -6858,7 +6866,7 @@ void __init sched_init(void)
6858#endif /* CONFIG_RT_GROUP_SCHED */ 6866#endif /* CONFIG_RT_GROUP_SCHED */
6859#ifdef CONFIG_CPUMASK_OFFSTACK 6867#ifdef CONFIG_CPUMASK_OFFSTACK
6860 for_each_possible_cpu(i) { 6868 for_each_possible_cpu(i) {
6861 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 6869 per_cpu(load_balance_mask, i) = (void *)ptr;
6862 ptr += cpumask_size(); 6870 ptr += cpumask_size();
6863 } 6871 }
6864#endif /* CONFIG_CPUMASK_OFFSTACK */ 6872#endif /* CONFIG_CPUMASK_OFFSTACK */
@@ -6884,12 +6892,6 @@ void __init sched_init(void)
6884 6892
6885#endif /* CONFIG_CGROUP_SCHED */ 6893#endif /* CONFIG_CGROUP_SCHED */
6886 6894
6887#ifdef CONFIG_CGROUP_CPUACCT
6888 root_cpuacct.cpustat = &kernel_cpustat;
6889 root_cpuacct.cpuusage = alloc_percpu(u64);
6890 /* Too early, not expected to fail */
6891 BUG_ON(!root_cpuacct.cpuusage);
6892#endif
6893 for_each_possible_cpu(i) { 6895 for_each_possible_cpu(i) {
6894 struct rq *rq; 6896 struct rq *rq;
6895 6897
@@ -7411,7 +7413,7 @@ unlock:
7411 return err; 7413 return err;
7412} 7414}
7413 7415
7414int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7416static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7415{ 7417{
7416 u64 rt_runtime, rt_period; 7418 u64 rt_runtime, rt_period;
7417 7419
@@ -7423,7 +7425,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7423 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7425 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7424} 7426}
7425 7427
7426long sched_group_rt_runtime(struct task_group *tg) 7428static long sched_group_rt_runtime(struct task_group *tg)
7427{ 7429{
7428 u64 rt_runtime_us; 7430 u64 rt_runtime_us;
7429 7431
@@ -7435,7 +7437,7 @@ long sched_group_rt_runtime(struct task_group *tg)
7435 return rt_runtime_us; 7437 return rt_runtime_us;
7436} 7438}
7437 7439
7438int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7440static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7439{ 7441{
7440 u64 rt_runtime, rt_period; 7442 u64 rt_runtime, rt_period;
7441 7443
@@ -7448,7 +7450,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7448 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7450 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7449} 7451}
7450 7452
7451long sched_group_rt_period(struct task_group *tg) 7453static long sched_group_rt_period(struct task_group *tg)
7452{ 7454{
7453 u64 rt_period_us; 7455 u64 rt_period_us;
7454 7456
@@ -7483,7 +7485,7 @@ static int sched_rt_global_constraints(void)
7483 return ret; 7485 return ret;
7484} 7486}
7485 7487
7486int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7488static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7487{ 7489{
7488 /* Don't accept realtime tasks when there is no way for them to run */ 7490 /* Don't accept realtime tasks when there is no way for them to run */
7489 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7491 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
@@ -7991,226 +7993,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7991 7993
7992#endif /* CONFIG_CGROUP_SCHED */ 7994#endif /* CONFIG_CGROUP_SCHED */
7993 7995
7994#ifdef CONFIG_CGROUP_CPUACCT
7995
7996/*
7997 * CPU accounting code for task groups.
7998 *
7999 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
8000 * (balbir@in.ibm.com).
8001 */
8002
8003struct cpuacct root_cpuacct;
8004
8005/* create a new cpu accounting group */
8006static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
8007{
8008 struct cpuacct *ca;
8009
8010 if (!cgrp->parent)
8011 return &root_cpuacct.css;
8012
8013 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8014 if (!ca)
8015 goto out;
8016
8017 ca->cpuusage = alloc_percpu(u64);
8018 if (!ca->cpuusage)
8019 goto out_free_ca;
8020
8021 ca->cpustat = alloc_percpu(struct kernel_cpustat);
8022 if (!ca->cpustat)
8023 goto out_free_cpuusage;
8024
8025 return &ca->css;
8026
8027out_free_cpuusage:
8028 free_percpu(ca->cpuusage);
8029out_free_ca:
8030 kfree(ca);
8031out:
8032 return ERR_PTR(-ENOMEM);
8033}
8034
8035/* destroy an existing cpu accounting group */
8036static void cpuacct_css_free(struct cgroup *cgrp)
8037{
8038 struct cpuacct *ca = cgroup_ca(cgrp);
8039
8040 free_percpu(ca->cpustat);
8041 free_percpu(ca->cpuusage);
8042 kfree(ca);
8043}
8044
8045static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8046{
8047 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8048 u64 data;
8049
8050#ifndef CONFIG_64BIT
8051 /*
8052 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
8053 */
8054 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8055 data = *cpuusage;
8056 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8057#else
8058 data = *cpuusage;
8059#endif
8060
8061 return data;
8062}
8063
8064static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8065{
8066 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8067
8068#ifndef CONFIG_64BIT
8069 /*
8070 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8071 */
8072 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8073 *cpuusage = val;
8074 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8075#else
8076 *cpuusage = val;
8077#endif
8078}
8079
8080/* return total cpu usage (in nanoseconds) of a group */
8081static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8082{
8083 struct cpuacct *ca = cgroup_ca(cgrp);
8084 u64 totalcpuusage = 0;
8085 int i;
8086
8087 for_each_present_cpu(i)
8088 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8089
8090 return totalcpuusage;
8091}
8092
8093static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8094 u64 reset)
8095{
8096 struct cpuacct *ca = cgroup_ca(cgrp);
8097 int err = 0;
8098 int i;
8099
8100 if (reset) {
8101 err = -EINVAL;
8102 goto out;
8103 }
8104
8105 for_each_present_cpu(i)
8106 cpuacct_cpuusage_write(ca, i, 0);
8107
8108out:
8109 return err;
8110}
8111
8112static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8113 struct seq_file *m)
8114{
8115 struct cpuacct *ca = cgroup_ca(cgroup);
8116 u64 percpu;
8117 int i;
8118
8119 for_each_present_cpu(i) {
8120 percpu = cpuacct_cpuusage_read(ca, i);
8121 seq_printf(m, "%llu ", (unsigned long long) percpu);
8122 }
8123 seq_printf(m, "\n");
8124 return 0;
8125}
8126
8127static const char *cpuacct_stat_desc[] = {
8128 [CPUACCT_STAT_USER] = "user",
8129 [CPUACCT_STAT_SYSTEM] = "system",
8130};
8131
8132static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8133 struct cgroup_map_cb *cb)
8134{
8135 struct cpuacct *ca = cgroup_ca(cgrp);
8136 int cpu;
8137 s64 val = 0;
8138
8139 for_each_online_cpu(cpu) {
8140 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8141 val += kcpustat->cpustat[CPUTIME_USER];
8142 val += kcpustat->cpustat[CPUTIME_NICE];
8143 }
8144 val = cputime64_to_clock_t(val);
8145 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8146
8147 val = 0;
8148 for_each_online_cpu(cpu) {
8149 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8150 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8151 val += kcpustat->cpustat[CPUTIME_IRQ];
8152 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8153 }
8154
8155 val = cputime64_to_clock_t(val);
8156 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8157
8158 return 0;
8159}
8160
8161static struct cftype files[] = {
8162 {
8163 .name = "usage",
8164 .read_u64 = cpuusage_read,
8165 .write_u64 = cpuusage_write,
8166 },
8167 {
8168 .name = "usage_percpu",
8169 .read_seq_string = cpuacct_percpu_seq_read,
8170 },
8171 {
8172 .name = "stat",
8173 .read_map = cpuacct_stats_show,
8174 },
8175 { } /* terminate */
8176};
8177
8178/*
8179 * charge this task's execution time to its accounting group.
8180 *
8181 * called with rq->lock held.
8182 */
8183void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8184{
8185 struct cpuacct *ca;
8186 int cpu;
8187
8188 if (unlikely(!cpuacct_subsys.active))
8189 return;
8190
8191 cpu = task_cpu(tsk);
8192
8193 rcu_read_lock();
8194
8195 ca = task_ca(tsk);
8196
8197 for (; ca; ca = parent_ca(ca)) {
8198 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8199 *cpuusage += cputime;
8200 }
8201
8202 rcu_read_unlock();
8203}
8204
8205struct cgroup_subsys cpuacct_subsys = {
8206 .name = "cpuacct",
8207 .css_alloc = cpuacct_css_alloc,
8208 .css_free = cpuacct_css_free,
8209 .subsys_id = cpuacct_subsys_id,
8210 .base_cftypes = files,
8211};
8212#endif /* CONFIG_CGROUP_CPUACCT */
8213
8214void dump_cpu_task(int cpu) 7996void dump_cpu_task(int cpu)
8215{ 7997{
8216 pr_info("Task dump for CPU %d:\n", cpu); 7998 pr_info("Task dump for CPU %d:\n", cpu);
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 000000000000..dbb7e2cd95eb
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,296 @@
1#include <linux/cgroup.h>
2#include <linux/slab.h>
3#include <linux/percpu.h>
4#include <linux/spinlock.h>
5#include <linux/cpumask.h>
6#include <linux/seq_file.h>
7#include <linux/rcupdate.h>
8#include <linux/kernel_stat.h>
9#include <linux/err.h>
10
11#include "sched.h"
12
13/*
14 * CPU accounting code for task groups.
15 *
16 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
17 * (balbir@in.ibm.com).
18 */
19
20/* Time spent by the tasks of the cpu accounting group executing in ... */
21enum cpuacct_stat_index {
22 CPUACCT_STAT_USER, /* ... user mode */
23 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
24
25 CPUACCT_STAT_NSTATS,
26};
27
28/* track cpu usage of a group of tasks and its child groups */
29struct cpuacct {
30 struct cgroup_subsys_state css;
31 /* cpuusage holds pointer to a u64-type object on every cpu */
32 u64 __percpu *cpuusage;
33 struct kernel_cpustat __percpu *cpustat;
34};
35
36/* return cpu accounting group corresponding to this container */
37static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
38{
39 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
40 struct cpuacct, css);
41}
42
43/* return cpu accounting group to which this task belongs */
44static inline struct cpuacct *task_ca(struct task_struct *tsk)
45{
46 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
47 struct cpuacct, css);
48}
49
50static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
51{
52 return cgroup_ca(ca->css.cgroup->parent);
53}
54
55static inline struct cpuacct *parent_ca(struct cpuacct *ca)
56{
57 if (!ca->css.cgroup->parent)
58 return NULL;
59 return cgroup_ca(ca->css.cgroup->parent);
60}
61
62static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
63static struct cpuacct root_cpuacct = {
64 .cpustat = &kernel_cpustat,
65 .cpuusage = &root_cpuacct_cpuusage,
66};
67
68/* create a new cpu accounting group */
69static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
70{
71 struct cpuacct *ca;
72
73 if (!cgrp->parent)
74 return &root_cpuacct.css;
75
76 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
77 if (!ca)
78 goto out;
79
80 ca->cpuusage = alloc_percpu(u64);
81 if (!ca->cpuusage)
82 goto out_free_ca;
83
84 ca->cpustat = alloc_percpu(struct kernel_cpustat);
85 if (!ca->cpustat)
86 goto out_free_cpuusage;
87
88 return &ca->css;
89
90out_free_cpuusage:
91 free_percpu(ca->cpuusage);
92out_free_ca:
93 kfree(ca);
94out:
95 return ERR_PTR(-ENOMEM);
96}
97
98/* destroy an existing cpu accounting group */
99static void cpuacct_css_free(struct cgroup *cgrp)
100{
101 struct cpuacct *ca = cgroup_ca(cgrp);
102
103 free_percpu(ca->cpustat);
104 free_percpu(ca->cpuusage);
105 kfree(ca);
106}
107
108static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
109{
110 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
111 u64 data;
112
113#ifndef CONFIG_64BIT
114 /*
115 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
116 */
117 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
118 data = *cpuusage;
119 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
120#else
121 data = *cpuusage;
122#endif
123
124 return data;
125}
126
127static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
128{
129 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
130
131#ifndef CONFIG_64BIT
132 /*
133 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
134 */
135 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
136 *cpuusage = val;
137 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
138#else
139 *cpuusage = val;
140#endif
141}
142
143/* return total cpu usage (in nanoseconds) of a group */
144static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
145{
146 struct cpuacct *ca = cgroup_ca(cgrp);
147 u64 totalcpuusage = 0;
148 int i;
149
150 for_each_present_cpu(i)
151 totalcpuusage += cpuacct_cpuusage_read(ca, i);
152
153 return totalcpuusage;
154}
155
156static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
157 u64 reset)
158{
159 struct cpuacct *ca = cgroup_ca(cgrp);
160 int err = 0;
161 int i;
162
163 if (reset) {
164 err = -EINVAL;
165 goto out;
166 }
167
168 for_each_present_cpu(i)
169 cpuacct_cpuusage_write(ca, i, 0);
170
171out:
172 return err;
173}
174
175static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
176 struct seq_file *m)
177{
178 struct cpuacct *ca = cgroup_ca(cgroup);
179 u64 percpu;
180 int i;
181
182 for_each_present_cpu(i) {
183 percpu = cpuacct_cpuusage_read(ca, i);
184 seq_printf(m, "%llu ", (unsigned long long) percpu);
185 }
186 seq_printf(m, "\n");
187 return 0;
188}
189
190static const char * const cpuacct_stat_desc[] = {
191 [CPUACCT_STAT_USER] = "user",
192 [CPUACCT_STAT_SYSTEM] = "system",
193};
194
195static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
196 struct cgroup_map_cb *cb)
197{
198 struct cpuacct *ca = cgroup_ca(cgrp);
199 int cpu;
200 s64 val = 0;
201
202 for_each_online_cpu(cpu) {
203 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
204 val += kcpustat->cpustat[CPUTIME_USER];
205 val += kcpustat->cpustat[CPUTIME_NICE];
206 }
207 val = cputime64_to_clock_t(val);
208 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
209
210 val = 0;
211 for_each_online_cpu(cpu) {
212 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
213 val += kcpustat->cpustat[CPUTIME_SYSTEM];
214 val += kcpustat->cpustat[CPUTIME_IRQ];
215 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
216 }
217
218 val = cputime64_to_clock_t(val);
219 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
220
221 return 0;
222}
223
224static struct cftype files[] = {
225 {
226 .name = "usage",
227 .read_u64 = cpuusage_read,
228 .write_u64 = cpuusage_write,
229 },
230 {
231 .name = "usage_percpu",
232 .read_seq_string = cpuacct_percpu_seq_read,
233 },
234 {
235 .name = "stat",
236 .read_map = cpuacct_stats_show,
237 },
238 { } /* terminate */
239};
240
241/*
242 * charge this task's execution time to its accounting group.
243 *
244 * called with rq->lock held.
245 */
246void cpuacct_charge(struct task_struct *tsk, u64 cputime)
247{
248 struct cpuacct *ca;
249 int cpu;
250
251 cpu = task_cpu(tsk);
252
253 rcu_read_lock();
254
255 ca = task_ca(tsk);
256
257 while (true) {
258 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
259 *cpuusage += cputime;
260
261 ca = parent_ca(ca);
262 if (!ca)
263 break;
264 }
265
266 rcu_read_unlock();
267}
268
269/*
270 * Add user/system time to cpuacct.
271 *
272 * Note: it's the caller that updates the account of the root cgroup.
273 */
274void cpuacct_account_field(struct task_struct *p, int index, u64 val)
275{
276 struct kernel_cpustat *kcpustat;
277 struct cpuacct *ca;
278
279 rcu_read_lock();
280 ca = task_ca(p);
281 while (ca != &root_cpuacct) {
282 kcpustat = this_cpu_ptr(ca->cpustat);
283 kcpustat->cpustat[index] += val;
284 ca = __parent_ca(ca);
285 }
286 rcu_read_unlock();
287}
288
289struct cgroup_subsys cpuacct_subsys = {
290 .name = "cpuacct",
291 .css_alloc = cpuacct_css_alloc,
292 .css_free = cpuacct_css_free,
293 .subsys_id = cpuacct_subsys_id,
294 .base_cftypes = files,
295 .early_init = 1,
296};
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 000000000000..ed605624a5e7
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@
1#ifdef CONFIG_CGROUP_CPUACCT
2
3extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
4extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
5
6#else
7
8static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9{
10}
11
12static inline void
13cpuacct_account_field(struct task_struct *p, int index, u64 val)
14{
15}
16
17#endif
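
(Illustration, not part of the patch.) A hypothetical caller of the cpuacct interface declared above; in this series the real call sites are update_curr() in kernel/sched/fair.c for cpuacct_charge() and task_group_account_field() in kernel/sched/cputime.c for cpuacct_account_field(). With CONFIG_CGROUP_CPUACCT disabled, both calls compile down to the empty inline stubs.

/* Hypothetical helper: charge 'delta' ns of runtime plus a cpustat field for 'p'. */
static void account_example(struct task_struct *p, u64 delta)
{
	cpuacct_charge(p, delta);			/* per-cpu usage, walks up the cpuacct hierarchy */
	cpuacct_account_field(p, CPUTIME_SYSTEM, delta);	/* caller accounts the root cgroup itself */
}
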
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index e93cca92f38b..ea32f02bf2c3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void)
115static inline void task_group_account_field(struct task_struct *p, int index, 115static inline void task_group_account_field(struct task_struct *p, int index,
116 u64 tmp) 116 u64 tmp)
117{ 117{
118#ifdef CONFIG_CGROUP_CPUACCT
119 struct kernel_cpustat *kcpustat;
120 struct cpuacct *ca;
121#endif
122 /* 118 /*
123 * Since all updates are sure to touch the root cgroup, we 119 * Since all updates are sure to touch the root cgroup, we
124 * get ourselves ahead and touch it first. If the root cgroup 120 * get ourselves ahead and touch it first. If the root cgroup
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
127 */ 123 */
128 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 124 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
129 125
130#ifdef CONFIG_CGROUP_CPUACCT 126 cpuacct_account_field(p, index, tmp);
131 if (unlikely(!cpuacct_subsys.active))
132 return;
133
134 rcu_read_lock();
135 ca = task_ca(p);
136 while (ca && (ca != &root_cpuacct)) {
137 kcpustat = this_cpu_ptr(ca->cpustat);
138 kcpustat->cpustat[index] += tmp;
139 ca = parent_ca(ca);
140 }
141 rcu_read_unlock();
142#endif
143} 127}
144 128
145/* 129/*
@@ -388,82 +372,10 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
388 struct rq *rq) {} 372 struct rq *rq) {}
389#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 373#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
390 374
391#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
392/*
393 * Account a single tick of cpu time.
394 * @p: the process that the cpu time gets accounted to
395 * @user_tick: indicates if the tick is a user or a system tick
396 */
397void account_process_tick(struct task_struct *p, int user_tick)
398{
399 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
400 struct rq *rq = this_rq();
401
402 if (vtime_accounting_enabled())
403 return;
404
405 if (sched_clock_irqtime) {
406 irqtime_account_process_tick(p, user_tick, rq);
407 return;
408 }
409
410 if (steal_account_process_tick())
411 return;
412
413 if (user_tick)
414 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
415 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
416 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
417 one_jiffy_scaled);
418 else
419 account_idle_time(cputime_one_jiffy);
420}
421
422/*
423 * Account multiple ticks of steal time.
424 * @p: the process from which the cpu time has been stolen
425 * @ticks: number of stolen ticks
426 */
427void account_steal_ticks(unsigned long ticks)
428{
429 account_steal_time(jiffies_to_cputime(ticks));
430}
431
432/*
433 * Account multiple ticks of idle time.
434 * @ticks: number of stolen ticks
435 */
436void account_idle_ticks(unsigned long ticks)
437{
438
439 if (sched_clock_irqtime) {
440 irqtime_account_idle_ticks(ticks);
441 return;
442 }
443
444 account_idle_time(jiffies_to_cputime(ticks));
445}
446#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
447
448/* 375/*
449 * Use precise platform statistics if available: 376 * Use precise platform statistics if available:
450 */ 377 */
451#ifdef CONFIG_VIRT_CPU_ACCOUNTING 378#ifdef CONFIG_VIRT_CPU_ACCOUNTING
452void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
453{
454 *ut = p->utime;
455 *st = p->stime;
456}
457
458void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
459{
460 struct task_cputime cputime;
461
462 thread_group_cputime(p, &cputime);
463
464 *ut = cputime.utime;
465 *st = cputime.stime;
466}
467 379
468#ifndef __ARCH_HAS_VTIME_TASK_SWITCH 380#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
469void vtime_task_switch(struct task_struct *prev) 381void vtime_task_switch(struct task_struct *prev)
@@ -518,21 +430,111 @@ void vtime_account_irq_enter(struct task_struct *tsk)
518} 430}
519EXPORT_SYMBOL_GPL(vtime_account_irq_enter); 431EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
520#endif /* __ARCH_HAS_VTIME_ACCOUNT */ 432#endif /* __ARCH_HAS_VTIME_ACCOUNT */
433#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
434
435
436#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
437void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
438{
439 *ut = p->utime;
440 *st = p->stime;
441}
521 442
522#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ 443void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
444{
445 struct task_cputime cputime;
523 446
524static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) 447 thread_group_cputime(p, &cputime);
448
449 *ut = cputime.utime;
450 *st = cputime.stime;
451}
452#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
453/*
454 * Account a single tick of cpu time.
455 * @p: the process that the cpu time gets accounted to
456 * @user_tick: indicates if the tick is a user or a system tick
457 */
458void account_process_tick(struct task_struct *p, int user_tick)
525{ 459{
526 u64 temp = (__force u64) rtime; 460 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
461 struct rq *rq = this_rq();
527 462
528 temp *= (__force u64) stime; 463 if (vtime_accounting_enabled())
464 return;
465
466 if (sched_clock_irqtime) {
467 irqtime_account_process_tick(p, user_tick, rq);
468 return;
469 }
470
471 if (steal_account_process_tick())
472 return;
529 473
530 if (sizeof(cputime_t) == 4) 474 if (user_tick)
531 temp = div_u64(temp, (__force u32) total); 475 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
476 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
477 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
478 one_jiffy_scaled);
532 else 479 else
533 temp = div64_u64(temp, (__force u64) total); 480 account_idle_time(cputime_one_jiffy);
481}
534 482
535 return (__force cputime_t) temp; 483/*
484 * Account multiple ticks of steal time.
485 * @p: the process from which the cpu time has been stolen
486 * @ticks: number of stolen ticks
487 */
488void account_steal_ticks(unsigned long ticks)
489{
490 account_steal_time(jiffies_to_cputime(ticks));
491}
492
493/*
494 * Account multiple ticks of idle time.
495 * @ticks: number of idle ticks
496 */
497void account_idle_ticks(unsigned long ticks)
498{
499
500 if (sched_clock_irqtime) {
501 irqtime_account_idle_ticks(ticks);
502 return;
503 }
504
505 account_idle_time(jiffies_to_cputime(ticks));
506}
507
508/*
509 * Perform (stime * rtime) / total with reduced chances
510 * of multiplication overflows by using smaller factors,
511 * namely the quotient and remainder of the division
512 * between rtime and total.
513 */
514static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
515{
516 u64 rem, res, scaled;
517
518 if (rtime >= total) {
519 /*
520 * Scale up to rtime / total then add
521 * the remainder scaled to stime / total.
522 */
523 res = div64_u64_rem(rtime, total, &rem);
524 scaled = stime * res;
525 scaled += div64_u64(stime * rem, total);
526 } else {
527 /*
528 * Same in reverse: scale down to total / rtime
529 * then subtract that result scaled to
530 * the remaining part.
531 */
532 res = div64_u64_rem(total, rtime, &rem);
533 scaled = div64_u64(stime, res);
534 scaled -= div64_u64(scaled * rem, total);
535 }
536
537 return (__force cputime_t) scaled;
536} 538}
537 539
538/* 540/*
@@ -545,6 +547,12 @@ static void cputime_adjust(struct task_cputime *curr,
545{ 547{
546 cputime_t rtime, stime, total; 548 cputime_t rtime, stime, total;
547 549
550 if (vtime_accounting_enabled()) {
551 *ut = curr->utime;
552 *st = curr->stime;
553 return;
554 }
555
548 stime = curr->stime; 556 stime = curr->stime;
549 total = stime + curr->utime; 557 total = stime + curr->utime;
550 558
@@ -560,10 +568,14 @@ static void cputime_adjust(struct task_cputime *curr,
560 */ 568 */
561 rtime = nsecs_to_cputime(curr->sum_exec_runtime); 569 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
562 570
563 if (total) 571 if (!rtime) {
564 stime = scale_stime(stime, rtime, total); 572 stime = 0;
565 else 573 } else if (!total) {
566 stime = rtime; 574 stime = rtime;
575 } else {
576 stime = scale_stime((__force u64)stime,
577 (__force u64)rtime, (__force u64)total);
578 }
567 579
568 /* 580 /*
569 * If the tick based count grows faster than the scheduler one, 581 * If the tick based count grows faster than the scheduler one,
@@ -597,7 +609,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
597 thread_group_cputime(p, &cputime); 609 thread_group_cputime(p, &cputime);
598 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); 610 cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
599} 611}
600#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ 612#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
601 613
602#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 614#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
603static unsigned long long vtime_delta(struct task_struct *tsk) 615static unsigned long long vtime_delta(struct task_struct *tsk)
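The new scale_stime() above avoids forming the full stime * rtime product in one go: it splits the larger of the two ratios into quotient and remainder and scales stime by each part separately, at the cost of a little truncation error. A rough user-space approximation (names, test values and the __int128 cross-check are illustrative assumptions, not the kernel code) shows the idea:

#include <stdint.h>
#include <stdio.h>

/* Approximate stime * rtime / total without a 128-bit product: split the
 * larger ratio into quotient and remainder, as the kernel's scale_stime()
 * does.  stime * quot can still overflow for extreme inputs; the scheme
 * assumes rtime and total are of comparable magnitude. */
static uint64_t scale_stime_demo(uint64_t stime, uint64_t rtime, uint64_t total)
{
	uint64_t quot, rem, scaled;

	if (rtime >= total) {
		quot = rtime / total;
		rem = rtime % total;
		scaled = stime * quot + (stime * rem) / total;
	} else {
		quot = total / rtime;
		rem = total % rtime;
		scaled = stime / quot;
		scaled -= (scaled * rem) / total;
	}
	return scaled;
}

int main(void)
{
	uint64_t stime = 123456789, rtime = 987654321, total = 222222222;
	unsigned __int128 exact = (unsigned __int128)stime * rtime / total;

	printf("approx=%llu exact=%llu\n",
	       (unsigned long long)scale_stime_demo(stime, rtime, total),
	       (unsigned long long)exact);
	return 0;
}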
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e5986fc5..8bf7081b1ec5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
431 * Scheduling class tree data structure manipulation methods: 431 * Scheduling class tree data structure manipulation methods:
432 */ 432 */
433 433
434static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) 434static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
435{ 435{
436 s64 delta = (s64)(vruntime - min_vruntime); 436 s64 delta = (s64)(vruntime - max_vruntime);
437 if (delta > 0) 437 if (delta > 0)
438 min_vruntime = vruntime; 438 max_vruntime = vruntime;
439 439
440 return min_vruntime; 440 return max_vruntime;
441} 441}
442 442
443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) 443static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
@@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
473 vruntime = min_vruntime(vruntime, se->vruntime); 473 vruntime = min_vruntime(vruntime, se->vruntime);
474 } 474 }
475 475
476 /* ensure we never gain time by being placed backwards. */
476 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); 477 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
477#ifndef CONFIG_64BIT 478#ifndef CONFIG_64BIT
478 smp_wmb(); 479 smp_wmb();
@@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
652} 653}
653 654
654/* 655/*
655 * We calculate the vruntime slice of a to be inserted task 656 * We calculate the vruntime slice of a to-be-inserted task.
656 * 657 *
657 * vs = s/w 658 * vs = s/w
658 */ 659 */
@@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 1563 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */ 1564 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1564} 1565}
1566
1567/*
1568 * Update the rq's load with the elapsed running time before entering
1569 * idle. If the last scheduled task is not a CFS task, idle_enter will
1570 * be the only way to update the runnable statistic.
1571 */
1572void idle_enter_fair(struct rq *this_rq)
1573{
1574 update_rq_runnable_avg(this_rq, 1);
1575}
1576
1577/*
1578 * Update the rq's load with the elapsed idle time before a task is
1579 * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
1580 * be the only way to update the runnable statistic.
1581 */
1582void idle_exit_fair(struct rq *this_rq)
1583{
1584 update_rq_runnable_avg(this_rq, 0);
1585}
1586
1565#else 1587#else
1566static inline void update_entity_load_avg(struct sched_entity *se, 1588static inline void update_entity_load_avg(struct sched_entity *se,
1567 int update_cfs_rq) {} 1589 int update_cfs_rq) {}
@@ -3874,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3874 int tsk_cache_hot = 0; 3896 int tsk_cache_hot = 0;
3875 /* 3897 /*
3876 * We do not migrate tasks that are: 3898 * We do not migrate tasks that are:
3877 * 1) running (obviously), or 3899 * 1) throttled_lb_pair, or
3878 * 2) cannot be migrated to this CPU due to cpus_allowed, or 3900 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3879 * 3) are cache-hot on their current CPU. 3901 * 3) running (obviously), or
3902 * 4) are cache-hot on their current CPU.
3880 */ 3903 */
3904 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3905 return 0;
3906
3881 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 3907 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3882 int new_dst_cpu; 3908 int cpu;
3883 3909
3884 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3910 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3885 3911
@@ -3894,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3894 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 3920 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3895 return 0; 3921 return 0;
3896 3922
3897 new_dst_cpu = cpumask_first_and(env->dst_grpmask, 3923 /* Prevent re-selecting dst_cpu via env's cpus */
3898 tsk_cpus_allowed(p)); 3924 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
3899 if (new_dst_cpu < nr_cpu_ids) { 3925 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
3900 env->flags |= LBF_SOME_PINNED; 3926 env->flags |= LBF_SOME_PINNED;
3901 env->new_dst_cpu = new_dst_cpu; 3927 env->new_dst_cpu = cpu;
3928 break;
3929 }
3902 } 3930 }
3931
3903 return 0; 3932 return 0;
3904 } 3933 }
3905 3934
@@ -3920,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3920 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); 3949 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3921 if (!tsk_cache_hot || 3950 if (!tsk_cache_hot ||
3922 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 3951 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3923#ifdef CONFIG_SCHEDSTATS 3952
3924 if (tsk_cache_hot) { 3953 if (tsk_cache_hot) {
3925 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 3954 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3926 schedstat_inc(p, se.statistics.nr_forced_migrations); 3955 schedstat_inc(p, se.statistics.nr_forced_migrations);
3927 } 3956 }
3928#endif 3957
3929 return 1; 3958 return 1;
3930 } 3959 }
3931 3960
3932 if (tsk_cache_hot) { 3961 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3933 schedstat_inc(p, se.statistics.nr_failed_migrations_hot); 3962 return 0;
3934 return 0;
3935 }
3936 return 1;
3937} 3963}
3938 3964
3939/* 3965/*
@@ -3948,9 +3974,6 @@ static int move_one_task(struct lb_env *env)
3948 struct task_struct *p, *n; 3974 struct task_struct *p, *n;
3949 3975
3950 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 3976 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3951 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3952 continue;
3953
3954 if (!can_migrate_task(p, env)) 3977 if (!can_migrate_task(p, env))
3955 continue; 3978 continue;
3956 3979
@@ -4002,7 +4025,7 @@ static int move_tasks(struct lb_env *env)
4002 break; 4025 break;
4003 } 4026 }
4004 4027
4005 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 4028 if (!can_migrate_task(p, env))
4006 goto next; 4029 goto next;
4007 4030
4008 load = task_h_load(p); 4031 load = task_h_load(p);
@@ -4013,9 +4036,6 @@ static int move_tasks(struct lb_env *env)
4013 if ((load / 2) > env->imbalance) 4036 if ((load / 2) > env->imbalance)
4014 goto next; 4037 goto next;
4015 4038
4016 if (!can_migrate_task(p, env))
4017 goto next;
4018
4019 move_task(p, env); 4039 move_task(p, env);
4020 pulled++; 4040 pulled++;
4021 env->imbalance -= load; 4041 env->imbalance -= load;
@@ -4245,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
4245 return load_idx; 4265 return load_idx;
4246} 4266}
4247 4267
4248unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 4268static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4249{ 4269{
4250 return SCHED_POWER_SCALE; 4270 return SCHED_POWER_SCALE;
4251} 4271}
@@ -4255,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4255 return default_scale_freq_power(sd, cpu); 4275 return default_scale_freq_power(sd, cpu);
4256} 4276}
4257 4277
4258unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 4278static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4259{ 4279{
4260 unsigned long weight = sd->span_weight; 4280 unsigned long weight = sd->span_weight;
4261 unsigned long smt_gain = sd->smt_gain; 4281 unsigned long smt_gain = sd->smt_gain;
@@ -4270,7 +4290,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4270 return default_scale_smt_power(sd, cpu); 4290 return default_scale_smt_power(sd, cpu);
4271} 4291}
4272 4292
4273unsigned long scale_rt_power(int cpu) 4293static unsigned long scale_rt_power(int cpu)
4274{ 4294{
4275 struct rq *rq = cpu_rq(cpu); 4295 struct rq *rq = cpu_rq(cpu);
4276 u64 total, available, age_stamp, avg; 4296 u64 total, available, age_stamp, avg;
@@ -4960,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
4960#define MAX_PINNED_INTERVAL 512 4980#define MAX_PINNED_INTERVAL 512
4961 4981
4962/* Working cpumask for load_balance and load_balance_newidle. */ 4982/* Working cpumask for load_balance and load_balance_newidle. */
4963DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4983DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
4964 4984
4965static int need_active_balance(struct lb_env *env) 4985static int need_active_balance(struct lb_env *env)
4966{ 4986{
@@ -4991,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4991 int *balance) 5011 int *balance)
4992{ 5012{
4993 int ld_moved, cur_ld_moved, active_balance = 0; 5013 int ld_moved, cur_ld_moved, active_balance = 0;
4994 int lb_iterations, max_lb_iterations;
4995 struct sched_group *group; 5014 struct sched_group *group;
4996 struct rq *busiest; 5015 struct rq *busiest;
4997 unsigned long flags; 5016 unsigned long flags;
4998 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 5017 struct cpumask *cpus = __get_cpu_var(load_balance_mask);
4999 5018
5000 struct lb_env env = { 5019 struct lb_env env = {
5001 .sd = sd, 5020 .sd = sd,
@@ -5007,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5007 .cpus = cpus, 5026 .cpus = cpus,
5008 }; 5027 };
5009 5028
5029 /*
5030 * For NEWLY_IDLE load_balancing, we don't need to consider
5031 * other cpus in our group
5032 */
5033 if (idle == CPU_NEWLY_IDLE)
5034 env.dst_grpmask = NULL;
5035
5010 cpumask_copy(cpus, cpu_active_mask); 5036 cpumask_copy(cpus, cpu_active_mask);
5011 max_lb_iterations = cpumask_weight(env.dst_grpmask);
5012 5037
5013 schedstat_inc(sd, lb_count[idle]); 5038 schedstat_inc(sd, lb_count[idle]);
5014 5039
@@ -5034,7 +5059,6 @@ redo:
5034 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 5059 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5035 5060
5036 ld_moved = 0; 5061 ld_moved = 0;
5037 lb_iterations = 1;
5038 if (busiest->nr_running > 1) { 5062 if (busiest->nr_running > 1) {
5039 /* 5063 /*
5040 * Attempt to move tasks. If find_busiest_group has found 5064 * Attempt to move tasks. If find_busiest_group has found
@@ -5061,17 +5085,17 @@ more_balance:
5061 double_rq_unlock(env.dst_rq, busiest); 5085 double_rq_unlock(env.dst_rq, busiest);
5062 local_irq_restore(flags); 5086 local_irq_restore(flags);
5063 5087
5064 if (env.flags & LBF_NEED_BREAK) {
5065 env.flags &= ~LBF_NEED_BREAK;
5066 goto more_balance;
5067 }
5068
5069 /* 5088 /*
5070 * some other cpu did the load balance for us. 5089 * some other cpu did the load balance for us.
5071 */ 5090 */
5072 if (cur_ld_moved && env.dst_cpu != smp_processor_id()) 5091 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5073 resched_cpu(env.dst_cpu); 5092 resched_cpu(env.dst_cpu);
5074 5093
5094 if (env.flags & LBF_NEED_BREAK) {
5095 env.flags &= ~LBF_NEED_BREAK;
5096 goto more_balance;
5097 }
5098
5075 /* 5099 /*
5076 * Revisit (affine) tasks on src_cpu that couldn't be moved to 5100 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5077 * us and move them to an alternate dst_cpu in our sched_group 5101 * us and move them to an alternate dst_cpu in our sched_group
@@ -5091,14 +5115,17 @@ more_balance:
5091 * moreover subsequent load balance cycles should correct the 5115 * moreover subsequent load balance cycles should correct the
5092 * excess load moved. 5116 * excess load moved.
5093 */ 5117 */
5094 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && 5118 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
5095 lb_iterations++ < max_lb_iterations) {
5096 5119
5097 env.dst_rq = cpu_rq(env.new_dst_cpu); 5120 env.dst_rq = cpu_rq(env.new_dst_cpu);
5098 env.dst_cpu = env.new_dst_cpu; 5121 env.dst_cpu = env.new_dst_cpu;
5099 env.flags &= ~LBF_SOME_PINNED; 5122 env.flags &= ~LBF_SOME_PINNED;
5100 env.loop = 0; 5123 env.loop = 0;
5101 env.loop_break = sched_nr_migrate_break; 5124 env.loop_break = sched_nr_migrate_break;
5125
5126 /* Prevent re-selecting dst_cpu via env's cpus */
5127 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5128
5102 /* 5129 /*
5103 * Go back to "more_balance" rather than "redo" since we 5130 * Go back to "more_balance" rather than "redo" since we
5104 * need to continue with same src_cpu. 5131 * need to continue with same src_cpu.
@@ -5219,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5219 if (this_rq->avg_idle < sysctl_sched_migration_cost) 5246 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5220 return; 5247 return;
5221 5248
5222 update_rq_runnable_avg(this_rq, 1);
5223
5224 /* 5249 /*
5225 * Drop the rq->lock, but keep IRQ/preempt disabled. 5250 * Drop the rq->lock, but keep IRQ/preempt disabled.
5226 */ 5251 */
@@ -5395,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void)
5395 struct sched_domain *sd; 5420 struct sched_domain *sd;
5396 int cpu = smp_processor_id(); 5421 int cpu = smp_processor_id();
5397 5422
5398 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5399 return;
5400 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
5401
5402 rcu_read_lock(); 5423 rcu_read_lock();
5403 for_each_domain(cpu, sd) 5424 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5425
5426 if (!sd || !sd->nohz_idle)
5427 goto unlock;
5428 sd->nohz_idle = 0;
5429
5430 for (; sd; sd = sd->parent)
5404 atomic_inc(&sd->groups->sgp->nr_busy_cpus); 5431 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5432unlock:
5405 rcu_read_unlock(); 5433 rcu_read_unlock();
5406} 5434}
5407 5435
@@ -5410,13 +5438,16 @@ void set_cpu_sd_state_idle(void)
5410 struct sched_domain *sd; 5438 struct sched_domain *sd;
5411 int cpu = smp_processor_id(); 5439 int cpu = smp_processor_id();
5412 5440
5413 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5414 return;
5415 set_bit(NOHZ_IDLE, nohz_flags(cpu));
5416
5417 rcu_read_lock(); 5441 rcu_read_lock();
5418 for_each_domain(cpu, sd) 5442 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5443
5444 if (!sd || sd->nohz_idle)
5445 goto unlock;
5446 sd->nohz_idle = 1;
5447
5448 for (; sd; sd = sd->parent)
5419 atomic_dec(&sd->groups->sgp->nr_busy_cpus); 5449 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5450unlock:
5420 rcu_read_unlock(); 5451 rcu_read_unlock();
5421} 5452}
5422 5453
@@ -5468,7 +5499,7 @@ void update_max_interval(void)
5468 * It checks each scheduling domain to see if it is due to be balanced, 5499 * It checks each scheduling domain to see if it is due to be balanced,
5469 * and initiates a balancing operation if so. 5500 * and initiates a balancing operation if so.
5470 * 5501 *
5471 * Balancing parameters are set up in arch_init_sched_domains. 5502 * Balancing parameters are set up in init_sched_domains.
5472 */ 5503 */
5473static void rebalance_domains(int cpu, enum cpu_idle_type idle) 5504static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5474{ 5505{
@@ -5506,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5506 if (time_after_eq(jiffies, sd->last_balance + interval)) { 5537 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5507 if (load_balance(cpu, rq, sd, idle, &balance)) { 5538 if (load_balance(cpu, rq, sd, idle, &balance)) {
5508 /* 5539 /*
5509 * We've pulled tasks over so either we're no 5540 * The LBF_SOME_PINNED logic could have changed
5510 * longer idle. 5541 * env->dst_cpu, so we can't know our idle
5542 * state even if we migrated tasks. Update it.
5511 */ 5543 */
5512 idle = CPU_NOT_IDLE; 5544 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
5513 } 5545 }
5514 sd->last_balance = jiffies; 5546 sd->last_balance = jiffies;
5515 } 5547 }
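The renamed max_vruntime() above keeps the signed-delta comparison, which stays correct even when the unsigned 64-bit vruntime counters wrap, provided the two values are within 2^63 of each other; update_min_vruntime() then feeds its result through it so min_vruntime only ever moves forward, as the new comment notes. A standalone restatement of the trick (demo names):

#include <stdint.h>

/* Wrap-safe "a is after b" for monotonically increasing 64-bit counters:
 * the subtraction is done modulo 2^64 and interpreted as signed, so a
 * counter that has wrapped still compares as newer. */
static inline int after64(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

static inline uint64_t max_vruntime_demo(uint64_t cur_max, uint64_t vruntime)
{
	return after64(vruntime, cur_max) ? vruntime : cur_max;
}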
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae9..b8ce77328341 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks are never migrated */ 14 return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20}
21
22static void post_schedule_idle(struct rq *rq)
23{
24 idle_enter_fair(rq);
25}
16#endif /* CONFIG_SMP */ 26#endif /* CONFIG_SMP */
17/* 27/*
18 * Idle tasks are unconditionally rescheduled: 28 * Idle tasks are unconditionally rescheduled:
@@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
25static struct task_struct *pick_next_task_idle(struct rq *rq) 35static struct task_struct *pick_next_task_idle(struct rq *rq)
26{ 36{
27 schedstat_inc(rq, sched_goidle); 37 schedstat_inc(rq, sched_goidle);
38#ifdef CONFIG_SMP
39 /* Trigger the post schedule to do an idle_enter for CFS */
40 rq->post_schedule = 1;
41#endif
28 return rq->idle; 42 return rq->idle;
29} 43}
30 44
@@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = {
86 100
87#ifdef CONFIG_SMP 101#ifdef CONFIG_SMP
88 .select_task_rq = select_task_rq_idle, 102 .select_task_rq = select_task_rq_idle,
103 .pre_schedule = pre_schedule_idle,
104 .post_schedule = post_schedule_idle,
89#endif 105#endif
90 106
91 .set_curr_task = set_curr_task_idle, 107 .set_curr_task = set_curr_task_idle,
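The idle-class hooks added above lean on the core scheduler's pre_schedule/post_schedule mechanism: leaving the idle task runs idle_exit_fair() before the switch, while pick_next_task_idle() sets rq->post_schedule so that idle_enter_fair() runs once the switch has completed. A schematic of that flag-driven callback, with hypothetical names standing in for the real runqueue plumbing:

/* Hypothetical, heavily simplified runqueue: just enough to show the
 * post_schedule flag pattern used by pick_next_task_idle(). */
struct demo_rq {
	int post_schedule;		/* request a callback after the switch */
	void (*post_schedule_fn)(struct demo_rq *rq);
};

static void demo_pick_idle(struct demo_rq *rq)
{
	rq->post_schedule = 1;		/* as pick_next_task_idle() now does */
}

static void demo_finish_task_switch(struct demo_rq *rq)
{
	if (rq->post_schedule && rq->post_schedule_fn) {
		rq->post_schedule_fn(rq);	/* e.g. post_schedule_idle() */
		rq->post_schedule = 0;
	}
}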
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfdf469f..4c225c4c7111 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -7,6 +7,7 @@
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8 8
9#include "cpupri.h" 9#include "cpupri.h"
10#include "cpuacct.h"
10 11
11extern __read_mostly int scheduler_running; 12extern __read_mostly int scheduler_running;
12 13
@@ -33,6 +34,31 @@ extern __read_mostly int scheduler_running;
33 */ 34 */
34#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 35#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
35 36
37/*
38 * Increase resolution of nice-level calculations for 64-bit architectures.
39 * The extra resolution improves shares distribution and load balancing of
40 * low-weight task groups (e.g. nice +19 on an autogroup), deeper taskgroup
41 * hierarchies, especially on larger systems. This is not a user-visible change
42 * and does not change the user-interface for setting shares/weights.
43 *
44 * We increase resolution only if we have enough bits to allow this increased
45 * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
46 * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
47 * increased costs.
48 */
49#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
50# define SCHED_LOAD_RESOLUTION 10
51# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
52# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
53#else
54# define SCHED_LOAD_RESOLUTION 0
55# define scale_load(w) (w)
56# define scale_load_down(w) (w)
57#endif
58
59#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
60#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
61
36#define NICE_0_LOAD SCHED_LOAD_SCALE 62#define NICE_0_LOAD SCHED_LOAD_SCALE
37#define NICE_0_SHIFT SCHED_LOAD_SHIFT 63#define NICE_0_SHIFT SCHED_LOAD_SHIFT
38 64
@@ -154,11 +180,6 @@ struct task_group {
154#define MAX_SHARES (1UL << 18) 180#define MAX_SHARES (1UL << 18)
155#endif 181#endif
156 182
157/* Default task group.
158 * Every task in system belong to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *); 183typedef int (*tg_visitor)(struct task_group *, void *);
163 184
164extern int walk_tg_tree_from(struct task_group *from, 185extern int walk_tg_tree_from(struct task_group *from,
@@ -196,6 +217,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu, 217 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent); 218 struct sched_rt_entity *parent);
198 219
220extern struct task_group *sched_create_group(struct task_group *parent);
221extern void sched_online_group(struct task_group *tg,
222 struct task_group *parent);
223extern void sched_destroy_group(struct task_group *tg);
224extern void sched_offline_group(struct task_group *tg);
225
226extern void sched_move_task(struct task_struct *tsk);
227
228#ifdef CONFIG_FAIR_GROUP_SCHED
229extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
230#endif
231
199#else /* CONFIG_CGROUP_SCHED */ 232#else /* CONFIG_CGROUP_SCHED */
200 233
201struct cfs_bandwidth { }; 234struct cfs_bandwidth { };
@@ -547,6 +580,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
547DECLARE_PER_CPU(struct sched_domain *, sd_llc); 580DECLARE_PER_CPU(struct sched_domain *, sd_llc);
548DECLARE_PER_CPU(int, sd_llc_id); 581DECLARE_PER_CPU(int, sd_llc_id);
549 582
583struct sched_group_power {
584 atomic_t ref;
585 /*
586 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
587 * single CPU.
588 */
589 unsigned int power, power_orig;
590 unsigned long next_update;
591 /*
592 * Number of busy cpus in this group.
593 */
594 atomic_t nr_busy_cpus;
595
596 unsigned long cpumask[0]; /* iteration mask */
597};
598
599struct sched_group {
600 struct sched_group *next; /* Must be a circular list */
601 atomic_t ref;
602
603 unsigned int group_weight;
604 struct sched_group_power *sgp;
605
606 /*
607 * The CPUs this group covers.
608 *
609 * NOTE: this field is variable length. (Allocated dynamically
610 * by attaching extra space to the end of the structure,
611 * depending on how many CPUs the kernel has booted up with)
612 */
613 unsigned long cpumask[0];
614};
615
616static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
617{
618 return to_cpumask(sg->cpumask);
619}
620
621/*
622 * cpumask masking which cpus in the group are allowed to iterate up the domain
623 * tree.
624 */
625static inline struct cpumask *sched_group_mask(struct sched_group *sg)
626{
627 return to_cpumask(sg->sgp->cpumask);
628}
629
630/**
631 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
632 * @group: The group whose first cpu is to be returned.
633 */
634static inline unsigned int group_first_cpu(struct sched_group *group)
635{
636 return cpumask_first(sched_group_cpus(group));
637}
638
550extern int group_balance_cpu(struct sched_group *sg); 639extern int group_balance_cpu(struct sched_group *sg);
551 640
552#endif /* CONFIG_SMP */ 641#endif /* CONFIG_SMP */
@@ -784,6 +873,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
784} 873}
785#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 874#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
786 875
876/*
877 * wake flags
878 */
879#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
880#define WF_FORK 0x02 /* child wakeup after fork */
881#define WF_MIGRATED 0x4 /* internal use, task got migrated */
787 882
788static inline void update_load_add(struct load_weight *lw, unsigned long inc) 883static inline void update_load_add(struct load_weight *lw, unsigned long inc)
789{ 884{
@@ -856,14 +951,61 @@ static const u32 prio_to_wmult[40] = {
856 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 951 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
857}; 952};
858 953
859/* Time spent by the tasks of the cpu accounting group executing in ... */ 954#define ENQUEUE_WAKEUP 1
860enum cpuacct_stat_index { 955#define ENQUEUE_HEAD 2
861 CPUACCT_STAT_USER, /* ... user mode */ 956#ifdef CONFIG_SMP
862 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 957#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
958#else
959#define ENQUEUE_WAKING 0
960#endif
863 961
864 CPUACCT_STAT_NSTATS, 962#define DEQUEUE_SLEEP 1
865}; 963
964struct sched_class {
965 const struct sched_class *next;
966
967 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
968 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
969 void (*yield_task) (struct rq *rq);
970 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
971
972 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
973
974 struct task_struct * (*pick_next_task) (struct rq *rq);
975 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
976
977#ifdef CONFIG_SMP
978 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
979 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
980
981 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
982 void (*post_schedule) (struct rq *this_rq);
983 void (*task_waking) (struct task_struct *task);
984 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
985
986 void (*set_cpus_allowed)(struct task_struct *p,
987 const struct cpumask *newmask);
866 988
989 void (*rq_online)(struct rq *rq);
990 void (*rq_offline)(struct rq *rq);
991#endif
992
993 void (*set_curr_task) (struct rq *rq);
994 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
995 void (*task_fork) (struct task_struct *p);
996
997 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
998 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
999 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1000 int oldprio);
1001
1002 unsigned int (*get_rr_interval) (struct rq *rq,
1003 struct task_struct *task);
1004
1005#ifdef CONFIG_FAIR_GROUP_SCHED
1006 void (*task_move_group) (struct task_struct *p, int on_rq);
1007#endif
1008};
867 1009
868#define sched_class_highest (&stop_sched_class) 1010#define sched_class_highest (&stop_sched_class)
869#define for_each_class(class) \ 1011#define for_each_class(class) \
@@ -877,9 +1019,23 @@ extern const struct sched_class idle_sched_class;
877 1019
878#ifdef CONFIG_SMP 1020#ifdef CONFIG_SMP
879 1021
1022extern void update_group_power(struct sched_domain *sd, int cpu);
1023
880extern void trigger_load_balance(struct rq *rq, int cpu); 1024extern void trigger_load_balance(struct rq *rq, int cpu);
881extern void idle_balance(int this_cpu, struct rq *this_rq); 1025extern void idle_balance(int this_cpu, struct rq *this_rq);
882 1026
1027/*
1028 * Only depends on SMP; FAIR_GROUP_SCHED may be removed when runnable_avg
1029 * becomes useful in load balancing.
1030 */
1031#if defined(CONFIG_FAIR_GROUP_SCHED)
1032extern void idle_enter_fair(struct rq *this_rq);
1033extern void idle_exit_fair(struct rq *this_rq);
1034#else
1035static inline void idle_enter_fair(struct rq *this_rq) {}
1036static inline void idle_exit_fair(struct rq *this_rq) {}
1037#endif
1038
883#else /* CONFIG_SMP */ 1039#else /* CONFIG_SMP */
884 1040
885static inline void idle_balance(int cpu, struct rq *rq) 1041static inline void idle_balance(int cpu, struct rq *rq)
@@ -891,7 +1047,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
891extern void sysrq_sched_debug_show(void); 1047extern void sysrq_sched_debug_show(void);
892extern void sched_init_granularity(void); 1048extern void sched_init_granularity(void);
893extern void update_max_interval(void); 1049extern void update_max_interval(void);
894extern void update_group_power(struct sched_domain *sd, int cpu);
895extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); 1050extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
896extern void init_sched_rt_class(void); 1051extern void init_sched_rt_class(void);
897extern void init_sched_fair_class(void); 1052extern void init_sched_fair_class(void);
@@ -904,45 +1059,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
904 1059
905extern void update_idle_cpu_load(struct rq *this_rq); 1060extern void update_idle_cpu_load(struct rq *this_rq);
906 1061
907#ifdef CONFIG_CGROUP_CPUACCT
908#include <linux/cgroup.h>
909/* track cpu usage of a group of tasks and its child groups */
910struct cpuacct {
911 struct cgroup_subsys_state css;
912 /* cpuusage holds pointer to a u64-type object on every cpu */
913 u64 __percpu *cpuusage;
914 struct kernel_cpustat __percpu *cpustat;
915};
916
917extern struct cgroup_subsys cpuacct_subsys;
918extern struct cpuacct root_cpuacct;
919
920/* return cpu accounting group corresponding to this container */
921static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
922{
923 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
924 struct cpuacct, css);
925}
926
927/* return cpu accounting group to which this task belongs */
928static inline struct cpuacct *task_ca(struct task_struct *tsk)
929{
930 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
931 struct cpuacct, css);
932}
933
934static inline struct cpuacct *parent_ca(struct cpuacct *ca)
935{
936 if (!ca || !ca->css.cgroup->parent)
937 return NULL;
938 return cgroup_ca(ca->css.cgroup->parent);
939}
940
941extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
942#else
943static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
944#endif
945
946#ifdef CONFIG_PARAVIRT 1062#ifdef CONFIG_PARAVIRT
947static inline u64 steal_ticks(u64 steal) 1063static inline u64 steal_ticks(u64 steal)
948{ 1064{
@@ -1187,7 +1303,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1187enum rq_nohz_flag_bits { 1303enum rq_nohz_flag_bits {
1188 NOHZ_TICK_STOPPED, 1304 NOHZ_TICK_STOPPED,
1189 NOHZ_BALANCE_KICK, 1305 NOHZ_BALANCE_KICK,
1190 NOHZ_IDLE,
1191}; 1306};
1192 1307
1193#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) 1308#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
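The SCHED_LOAD_RESOLUTION block moved into sched.h above documents a fixed-point trick: when enabled (it is kept disabled here via #if 0), task weights gain 10 extra fractional bits for internal math while the user-visible nice weights are untouched. A small round-trip demonstration with demo names (the real macros are scale_load() and scale_load_down()):

#include <stdint.h>
#include <stdio.h>

#define DEMO_LOAD_RESOLUTION	10
#define demo_scale_load(w)	((uint64_t)(w) << DEMO_LOAD_RESOLUTION)
#define demo_scale_load_down(w)	((uint64_t)(w) >> DEMO_LOAD_RESOLUTION)

int main(void)
{
	uint64_t nice0 = 1024;		/* the nice-0 task weight */
	uint64_t internal = demo_scale_load(nice0);

	/* Internal math sees 1024 << 10 = 1048576; user space still sees 1024. */
	printf("user=%llu internal=%llu back=%llu\n",
	       (unsigned long long)nice0,
	       (unsigned long long)internal,
	       (unsigned long long)demo_scale_load_down(internal));
	return 0;
}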
diff --git a/lib/div64.c b/lib/div64.c
index a163b6caef73..3af5728d95fd 100644
--- a/lib/div64.c
+++ b/lib/div64.c
@@ -79,9 +79,10 @@ EXPORT_SYMBOL(div_s64_rem);
79#endif 79#endif
80 80
81/** 81/**
82 * div64_u64 - unsigned 64bit divide with 64bit divisor 82 * div64_u64_rem - unsigned 64bit divide with 64bit divisor and 64bit remainder
83 * @dividend: 64bit dividend 83 * @dividend: 64bit dividend
84 * @divisor: 64bit divisor 84 * @divisor: 64bit divisor
85 * @remainder: 64bit remainder
85 * 86 *
86 * This implementation is a modified version of the algorithm proposed 87 * This implementation is a modified version of the algorithm proposed
87 * by the book 'Hacker's Delight'. The original source and full proof 88 * by the book 'Hacker's Delight'. The original source and full proof
@@ -89,27 +90,33 @@ EXPORT_SYMBOL(div_s64_rem);
89 * 90 *
90 * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt' 91 * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt'
91 */ 92 */
92#ifndef div64_u64 93#ifndef div64_u64_rem
93u64 div64_u64(u64 dividend, u64 divisor) 94u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
94{ 95{
95 u32 high = divisor >> 32; 96 u32 high = divisor >> 32;
96 u64 quot; 97 u64 quot;
97 98
98 if (high == 0) { 99 if (high == 0) {
99 quot = div_u64(dividend, divisor); 100 u32 rem32;
101 quot = div_u64_rem(dividend, divisor, &rem32);
102 *remainder = rem32;
100 } else { 103 } else {
101 int n = 1 + fls(high); 104 int n = 1 + fls(high);
102 quot = div_u64(dividend >> n, divisor >> n); 105 quot = div_u64(dividend >> n, divisor >> n);
103 106
104 if (quot != 0) 107 if (quot != 0)
105 quot--; 108 quot--;
106 if ((dividend - quot * divisor) >= divisor) 109
110 *remainder = dividend - quot * divisor;
111 if (*remainder >= divisor) {
107 quot++; 112 quot++;
113 *remainder -= divisor;
114 }
108 } 115 }
109 116
110 return quot; 117 return quot;
111} 118}
112EXPORT_SYMBOL(div64_u64); 119EXPORT_SYMBOL(div64_u64_rem);
113#endif 120#endif
114 121
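div64_u64_rem() keeps the Hacker's Delight approach of the old div64_u64(): when the divisor does not fit in 32 bits, both operands are shifted right until it does, the resulting quotient estimate is decremented, and a single fix-up against the computed remainder corrects an estimate that can be at most one too small. A user-space sketch of that estimate-and-correct step (illustrative only; the real code hands 32-bit divisors to div_u64_rem() instead of the plain division used here):

#include <stdint.h>
#include <stdio.h>

/* 1-based index of the highest set bit, 0 for x == 0 (mirrors fls()). */
static int fls32(uint32_t x)
{
	int n = 0;

	while (x) {
		n++;
		x >>= 1;
	}
	return n;
}

static uint64_t div64_rem_demo(uint64_t dividend, uint64_t divisor, uint64_t *rem)
{
	uint32_t high = divisor >> 32;
	uint64_t quot;

	if (high == 0) {
		quot = dividend / divisor;
	} else {
		/* Shrink the divisor below 2^32, divide, then back off by one
		 * so the estimate never exceeds the true quotient. */
		int n = 1 + fls32(high);

		quot = (dividend >> n) / (divisor >> n);
		if (quot != 0)
			quot--;
	}
	*rem = dividend - quot * divisor;
	if (*rem >= divisor) {		/* at most one correction is needed */
		quot++;
		*rem -= divisor;
	}
	return quot;
}

int main(void)
{
	uint64_t rem;
	uint64_t quot = div64_rem_demo(0x0123456789abcdefULL, 0x100000001ULL, &rem);

	printf("quot=%llu rem=%llu\n",
	       (unsigned long long)quot, (unsigned long long)rem);
	return 0;
}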
115/** 122/**