124 files changed, 3140 insertions, 3370 deletions
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index 8e37b0ba2c9d..cbc1b46cbf70 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt | |||
@@ -408,6 +408,11 @@ CONTENTS | |||
408 | * the new scheduling related syscalls that manipulate it, i.e., | 408 | * the new scheduling related syscalls that manipulate it, i.e., |
409 | sched_setattr() and sched_getattr() are implemented. | 409 | sched_setattr() and sched_getattr() are implemented. |
410 | 410 | ||
411 | For debugging purposes, the leftover runtime and absolute deadline of a | ||
412 | SCHED_DEADLINE task can be retrieved through /proc/<pid>/sched (entries | ||
413 | dl.runtime and dl.deadline, both values in ns). A programmatic way to | ||
414 | retrieve these values from production code is under discussion. | ||
415 | |||
411 | 416 | ||
412 | 4.3 Default behavior | 417 | 4.3 Default behavior |
413 | --------------------- | 418 | --------------------- |
@@ -476,6 +481,7 @@ CONTENTS | |||
476 | 481 | ||
477 | Still missing: | 482 | Still missing: |
478 | 483 | ||
484 | - programmatic way to retrieve current runtime and absolute deadline | ||
479 | - refinements to deadline inheritance, especially regarding the possibility | 485 | - refinements to deadline inheritance, especially regarding the possibility |
480 | of retaining bandwidth isolation among non-interacting tasks. This is | 486 | of retaining bandwidth isolation among non-interacting tasks. This is |
481 | being studied from both theoretical and practical points of view, and | 487 | being studied from both theoretical and practical points of view, and |
diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index a03f0d944fe6..d8fce3e78457 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt | |||
@@ -158,11 +158,11 @@ as its prone to starvation without deadline scheduling. | |||
158 | Consider two sibling groups A and B; both have 50% bandwidth, but A's | 158 | Consider two sibling groups A and B; both have 50% bandwidth, but A's |
159 | period is twice the length of B's. | 159 | period is twice the length of B's. |
160 | 160 | ||
161 | * group A: period=100000us, runtime=10000us | 161 | * group A: period=100000us, runtime=50000us |
162 | - this runs for 0.01s once every 0.1s | 162 | - this runs for 0.05s once every 0.1s |
163 | 163 | ||
164 | * group B: period= 50000us, runtime=10000us | 164 | * group B: period= 50000us, runtime=25000us |
165 | - this runs for 0.01s twice every 0.1s (or once every 0.05 sec). | 165 | - this runs for 0.025s twice every 0.1s (or once every 0.05 sec). |
166 | 166 | ||
167 | This means that currently a while (1) loop in A will run for the full period of | 167 | This means that currently a while (1) loop in A will run for the full period of |
168 | B and can starve B's tasks (assuming they are of lower priority) for a whole | 168 | B and can starve B's tasks (assuming they are of lower priority) for a whole |
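(Not part of the patch: a quick arithmetic check, in plain C, of the corrected numbers above. Both groups get 50% bandwidth, but spread over different periods, so their per-0.1s runtime differs.)

    #include <stdio.h>

    int main(void)
    {
            const double a_period = 100000, a_runtime = 50000;  /* us */
            const double b_period =  50000, b_runtime = 25000;  /* us */

            printf("A: %.0f%% bandwidth, runs %.3fs once per 0.1s\n",
                   100.0 * a_runtime / a_period, a_runtime / 1e6);
            printf("B: %.0f%% bandwidth, runs %.3fs twice per 0.1s\n",
                   100.0 * b_runtime / b_period, b_runtime / 1e6);
            return 0;
    }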
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index bf8475ce85ee..baa152b9348e 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild | |||
@@ -1,7 +1,6 @@ | |||
1 | 1 | ||
2 | 2 | ||
3 | generic-y += clkdev.h | 3 | generic-y += clkdev.h |
4 | generic-y += cputime.h | ||
5 | generic-y += exec.h | 4 | generic-y += exec.h |
6 | generic-y += export.h | 5 | generic-y += export.h |
7 | generic-y += irq_work.h | 6 | generic-y += irq_work.h |
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 54d8616644e2..9d27a7d333dc 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c | |||
@@ -1145,7 +1145,7 @@ struct rusage32 { | |||
1145 | SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) | 1145 | SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) |
1146 | { | 1146 | { |
1147 | struct rusage32 r; | 1147 | struct rusage32 r; |
1148 | cputime_t utime, stime; | 1148 | u64 utime, stime; |
1149 | unsigned long utime_jiffies, stime_jiffies; | 1149 | unsigned long utime_jiffies, stime_jiffies; |
1150 | 1150 | ||
1151 | if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) | 1151 | if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN) |
@@ -1155,16 +1155,16 @@ SYSCALL_DEFINE2(osf_getrusage, int, who, struct rusage32 __user *, ru) | |||
1155 | switch (who) { | 1155 | switch (who) { |
1156 | case RUSAGE_SELF: | 1156 | case RUSAGE_SELF: |
1157 | task_cputime(current, &utime, &stime); | 1157 | task_cputime(current, &utime, &stime); |
1158 | utime_jiffies = cputime_to_jiffies(utime); | 1158 | utime_jiffies = nsecs_to_jiffies(utime); |
1159 | stime_jiffies = cputime_to_jiffies(stime); | 1159 | stime_jiffies = nsecs_to_jiffies(stime); |
1160 | jiffies_to_timeval32(utime_jiffies, &r.ru_utime); | 1160 | jiffies_to_timeval32(utime_jiffies, &r.ru_utime); |
1161 | jiffies_to_timeval32(stime_jiffies, &r.ru_stime); | 1161 | jiffies_to_timeval32(stime_jiffies, &r.ru_stime); |
1162 | r.ru_minflt = current->min_flt; | 1162 | r.ru_minflt = current->min_flt; |
1163 | r.ru_majflt = current->maj_flt; | 1163 | r.ru_majflt = current->maj_flt; |
1164 | break; | 1164 | break; |
1165 | case RUSAGE_CHILDREN: | 1165 | case RUSAGE_CHILDREN: |
1166 | utime_jiffies = cputime_to_jiffies(current->signal->cutime); | 1166 | utime_jiffies = nsecs_to_jiffies(current->signal->cutime); |
1167 | stime_jiffies = cputime_to_jiffies(current->signal->cstime); | 1167 | stime_jiffies = nsecs_to_jiffies(current->signal->cstime); |
1168 | jiffies_to_timeval32(utime_jiffies, &r.ru_utime); | 1168 | jiffies_to_timeval32(utime_jiffies, &r.ru_utime); |
1169 | jiffies_to_timeval32(stime_jiffies, &r.ru_stime); | 1169 | jiffies_to_timeval32(stime_jiffies, &r.ru_stime); |
1170 | r.ru_minflt = current->signal->cmin_flt; | 1170 | r.ru_minflt = current->signal->cmin_flt; |
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index c332604606dd..63a04013d05a 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild | |||
@@ -2,7 +2,6 @@ generic-y += auxvec.h | |||
2 | generic-y += bitsperlong.h | 2 | generic-y += bitsperlong.h |
3 | generic-y += bugs.h | 3 | generic-y += bugs.h |
4 | generic-y += clkdev.h | 4 | generic-y += clkdev.h |
5 | generic-y += cputime.h | ||
6 | generic-y += device.h | 5 | generic-y += device.h |
7 | generic-y += div64.h | 6 | generic-y += div64.h |
8 | generic-y += emergency-restart.h | 7 | generic-y += emergency-restart.h |
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index efb21757d41f..b14e8c7d71bd 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild | |||
@@ -2,7 +2,6 @@ | |||
2 | 2 | ||
3 | generic-y += bitsperlong.h | 3 | generic-y += bitsperlong.h |
4 | generic-y += clkdev.h | 4 | generic-y += clkdev.h |
5 | generic-y += cputime.h | ||
6 | generic-y += current.h | 5 | generic-y += current.h |
7 | generic-y += early_ioremap.h | 6 | generic-y += early_ioremap.h |
8 | generic-y += emergency-restart.h | 7 | generic-y += emergency-restart.h |
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 8365a84c2640..a12f1afc95a3 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild | |||
@@ -1,6 +1,5 @@ | |||
1 | generic-y += bugs.h | 1 | generic-y += bugs.h |
2 | generic-y += clkdev.h | 2 | generic-y += clkdev.h |
3 | generic-y += cputime.h | ||
4 | generic-y += delay.h | 3 | generic-y += delay.h |
5 | generic-y += div64.h | 4 | generic-y += div64.h |
6 | generic-y += dma.h | 5 | generic-y += dma.h |
diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild index 241b9b9729d8..3d7ef2c17a7c 100644 --- a/arch/avr32/include/asm/Kbuild +++ b/arch/avr32/include/asm/Kbuild | |||
@@ -1,6 +1,5 @@ | |||
1 | 1 | ||
2 | generic-y += clkdev.h | 2 | generic-y += clkdev.h |
3 | generic-y += cputime.h | ||
4 | generic-y += delay.h | 3 | generic-y += delay.h |
5 | generic-y += device.h | 4 | generic-y += device.h |
6 | generic-y += div64.h | 5 | generic-y += div64.h |
diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild index 2fb67b59d188..d6fa60b158be 100644 --- a/arch/blackfin/include/asm/Kbuild +++ b/arch/blackfin/include/asm/Kbuild | |||
@@ -2,7 +2,6 @@ | |||
2 | generic-y += auxvec.h | 2 | generic-y += auxvec.h |
3 | generic-y += bitsperlong.h | 3 | generic-y += bitsperlong.h |
4 | generic-y += bugs.h | 4 | generic-y += bugs.h |
5 | generic-y += cputime.h | ||
6 | generic-y += current.h | 5 | generic-y += current.h |
7 | generic-y += device.h | 6 | generic-y += device.h |
8 | generic-y += div64.h | 7 | generic-y += div64.h |
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index 64465e7e2245..4e9f57433f3a 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild | |||
@@ -5,7 +5,6 @@ generic-y += barrier.h | |||
5 | generic-y += bitsperlong.h | 5 | generic-y += bitsperlong.h |
6 | generic-y += bugs.h | 6 | generic-y += bugs.h |
7 | generic-y += clkdev.h | 7 | generic-y += clkdev.h |
8 | generic-y += cputime.h | ||
9 | generic-y += current.h | 8 | generic-y += current.h |
10 | generic-y += device.h | 9 | generic-y += device.h |
11 | generic-y += div64.h | 10 | generic-y += div64.h |
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index 1778805f6380..9f19e19bff9d 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild | |||
@@ -4,7 +4,6 @@ generic-y += barrier.h | |||
4 | generic-y += bitsperlong.h | 4 | generic-y += bitsperlong.h |
5 | generic-y += clkdev.h | 5 | generic-y += clkdev.h |
6 | generic-y += cmpxchg.h | 6 | generic-y += cmpxchg.h |
7 | generic-y += cputime.h | ||
8 | generic-y += device.h | 7 | generic-y += device.h |
9 | generic-y += div64.h | 8 | generic-y += div64.h |
10 | generic-y += errno.h | 9 | generic-y += errno.h |
diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild index 1fa084cf1a43..0f5b0d5d313c 100644 --- a/arch/frv/include/asm/Kbuild +++ b/arch/frv/include/asm/Kbuild | |||
@@ -1,6 +1,5 @@ | |||
1 | 1 | ||
2 | generic-y += clkdev.h | 2 | generic-y += clkdev.h |
3 | generic-y += cputime.h | ||
4 | generic-y += exec.h | 3 | generic-y += exec.h |
5 | generic-y += irq_work.h | 4 | generic-y += irq_work.h |
6 | generic-y += mcs_spinlock.h | 5 | generic-y += mcs_spinlock.h |
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index 373cb23301e3..5efd0c87f3c0 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild | |||
@@ -5,7 +5,6 @@ generic-y += bugs.h | |||
5 | generic-y += cacheflush.h | 5 | generic-y += cacheflush.h |
6 | generic-y += checksum.h | 6 | generic-y += checksum.h |
7 | generic-y += clkdev.h | 7 | generic-y += clkdev.h |
8 | generic-y += cputime.h | ||
9 | generic-y += current.h | 8 | generic-y += current.h |
10 | generic-y += delay.h | 9 | generic-y += delay.h |
11 | generic-y += device.h | 10 | generic-y += device.h |
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index db8ddabc6bd2..a43a7c90e4af 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild | |||
@@ -6,7 +6,6 @@ generic-y += barrier.h | |||
6 | generic-y += bug.h | 6 | generic-y += bug.h |
7 | generic-y += bugs.h | 7 | generic-y += bugs.h |
8 | generic-y += clkdev.h | 8 | generic-y += clkdev.h |
9 | generic-y += cputime.h | ||
10 | generic-y += current.h | 9 | generic-y += current.h |
11 | generic-y += device.h | 10 | generic-y += device.h |
12 | generic-y += div64.h | 11 | generic-y += div64.h |
diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h index e2d3f5baf265..3d665c0627a8 100644 --- a/arch/ia64/include/asm/cputime.h +++ b/arch/ia64/include/asm/cputime.h | |||
@@ -18,11 +18,7 @@ | |||
18 | #ifndef __IA64_CPUTIME_H | 18 | #ifndef __IA64_CPUTIME_H |
19 | #define __IA64_CPUTIME_H | 19 | #define __IA64_CPUTIME_H |
20 | 20 | ||
21 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 21 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
22 | # include <asm-generic/cputime.h> | ||
23 | #else | ||
24 | # include <asm/processor.h> | ||
25 | # include <asm-generic/cputime_nsecs.h> | ||
26 | extern void arch_vtime_task_switch(struct task_struct *tsk); | 22 | extern void arch_vtime_task_switch(struct task_struct *tsk); |
27 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 23 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
28 | 24 | ||
diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index c7026429816b..8742d741d19a 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h | |||
@@ -27,6 +27,12 @@ struct thread_info { | |||
27 | mm_segment_t addr_limit; /* user-level address space limit */ | 27 | mm_segment_t addr_limit; /* user-level address space limit */ |
28 | int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ | 28 | int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ |
29 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 29 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
30 | __u64 utime; | ||
31 | __u64 stime; | ||
32 | __u64 gtime; | ||
33 | __u64 hardirq_time; | ||
34 | __u64 softirq_time; | ||
35 | __u64 idle_time; | ||
30 | __u64 ac_stamp; | 36 | __u64 ac_stamp; |
31 | __u64 ac_leave; | 37 | __u64 ac_leave; |
32 | __u64 ac_stime; | 38 | __u64 ac_stime; |
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S index c9b5e942f671..3204fddc439c 100644 --- a/arch/ia64/kernel/head.S +++ b/arch/ia64/kernel/head.S | |||
@@ -1031,7 +1031,7 @@ GLOBAL_ENTRY(ia64_native_sched_clock) | |||
1031 | END(ia64_native_sched_clock) | 1031 | END(ia64_native_sched_clock) |
1032 | 1032 | ||
1033 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 1033 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
1034 | GLOBAL_ENTRY(cycle_to_cputime) | 1034 | GLOBAL_ENTRY(cycle_to_nsec) |
1035 | alloc r16=ar.pfs,1,0,0,0 | 1035 | alloc r16=ar.pfs,1,0,0,0 |
1036 | addl r8=THIS_CPU(ia64_cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 | 1036 | addl r8=THIS_CPU(ia64_cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0 |
1037 | ;; | 1037 | ;; |
@@ -1047,7 +1047,7 @@ GLOBAL_ENTRY(cycle_to_cputime) | |||
1047 | ;; | 1047 | ;; |
1048 | shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT | 1048 | shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT |
1049 | br.ret.sptk.many rp | 1049 | br.ret.sptk.many rp |
1050 | END(cycle_to_cputime) | 1050 | END(cycle_to_nsec) |
1051 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 1051 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
1052 | 1052 | ||
1053 | #ifdef CONFIG_IA64_BRL_EMU | 1053 | #ifdef CONFIG_IA64_BRL_EMU |
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 7ec7acc844c2..c483ece3eb84 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c | |||
@@ -619,6 +619,8 @@ setup_arch (char **cmdline_p) | |||
619 | check_sal_cache_flush(); | 619 | check_sal_cache_flush(); |
620 | #endif | 620 | #endif |
621 | paging_init(); | 621 | paging_init(); |
622 | |||
623 | clear_sched_clock_stable(); | ||
622 | } | 624 | } |
623 | 625 | ||
624 | /* | 626 | /* |
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 71775b95d6cc..faa116822c4c 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/timex.h> | 21 | #include <linux/timex.h> |
22 | #include <linux/timekeeper_internal.h> | 22 | #include <linux/timekeeper_internal.h> |
23 | #include <linux/platform_device.h> | 23 | #include <linux/platform_device.h> |
24 | #include <linux/cputime.h> | ||
24 | 25 | ||
25 | #include <asm/machvec.h> | 26 | #include <asm/machvec.h> |
26 | #include <asm/delay.h> | 27 | #include <asm/delay.h> |
@@ -59,18 +60,43 @@ static struct clocksource *itc_clocksource; | |||
59 | 60 | ||
60 | #include <linux/kernel_stat.h> | 61 | #include <linux/kernel_stat.h> |
61 | 62 | ||
62 | extern cputime_t cycle_to_cputime(u64 cyc); | 63 | extern u64 cycle_to_nsec(u64 cyc); |
63 | 64 | ||
64 | void vtime_account_user(struct task_struct *tsk) | 65 | void vtime_flush(struct task_struct *tsk) |
65 | { | 66 | { |
66 | cputime_t delta_utime; | ||
67 | struct thread_info *ti = task_thread_info(tsk); | 67 | struct thread_info *ti = task_thread_info(tsk); |
68 | u64 delta; | ||
68 | 69 | ||
69 | if (ti->ac_utime) { | 70 | if (ti->utime) |
70 | delta_utime = cycle_to_cputime(ti->ac_utime); | 71 | account_user_time(tsk, cycle_to_nsec(ti->utime)); |
71 | account_user_time(tsk, delta_utime); | 72 | |
72 | ti->ac_utime = 0; | 73 | if (ti->gtime) |
74 | account_guest_time(tsk, cycle_to_nsec(ti->gtime)); | ||
75 | |||
76 | if (ti->idle_time) | ||
77 | account_idle_time(cycle_to_nsec(ti->idle_time)); | ||
78 | |||
79 | if (ti->stime) { | ||
80 | delta = cycle_to_nsec(ti->stime); | ||
81 | account_system_index_time(tsk, delta, CPUTIME_SYSTEM); | ||
82 | } | ||
83 | |||
84 | if (ti->hardirq_time) { | ||
85 | delta = cycle_to_nsec(ti->hardirq_time); | ||
86 | account_system_index_time(tsk, delta, CPUTIME_IRQ); | ||
87 | } | ||
88 | |||
89 | if (ti->softirq_time) { | ||
90 | delta = cycle_to_nsec(ti->softirq_time); | ||
91 | account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ); | ||
73 | } | 92 | } |
93 | |||
94 | ti->utime = 0; | ||
95 | ti->gtime = 0; | ||
96 | ti->idle_time = 0; | ||
97 | ti->stime = 0; | ||
98 | ti->hardirq_time = 0; | ||
99 | ti->softirq_time = 0; | ||
74 | } | 100 | } |
75 | 101 | ||
76 | /* | 102 | /* |
@@ -83,7 +109,7 @@ void arch_vtime_task_switch(struct task_struct *prev) | |||
83 | struct thread_info *pi = task_thread_info(prev); | 109 | struct thread_info *pi = task_thread_info(prev); |
84 | struct thread_info *ni = task_thread_info(current); | 110 | struct thread_info *ni = task_thread_info(current); |
85 | 111 | ||
86 | pi->ac_stamp = ni->ac_stamp; | 112 | ni->ac_stamp = pi->ac_stamp; |
87 | ni->ac_stime = ni->ac_utime = 0; | 113 | ni->ac_stime = ni->ac_utime = 0; |
88 | } | 114 | } |
89 | 115 | ||
@@ -91,18 +117,15 @@ void arch_vtime_task_switch(struct task_struct *prev) | |||
91 | * Account time for a transition between system, hard irq or soft irq state. | 117 | * Account time for a transition between system, hard irq or soft irq state. |
92 | * Note that this function is called with interrupts enabled. | 118 | * Note that this function is called with interrupts enabled. |
93 | */ | 119 | */ |
94 | static cputime_t vtime_delta(struct task_struct *tsk) | 120 | static __u64 vtime_delta(struct task_struct *tsk) |
95 | { | 121 | { |
96 | struct thread_info *ti = task_thread_info(tsk); | 122 | struct thread_info *ti = task_thread_info(tsk); |
97 | cputime_t delta_stime; | 123 | __u64 now, delta_stime; |
98 | __u64 now; | ||
99 | 124 | ||
100 | WARN_ON_ONCE(!irqs_disabled()); | 125 | WARN_ON_ONCE(!irqs_disabled()); |
101 | 126 | ||
102 | now = ia64_get_itc(); | 127 | now = ia64_get_itc(); |
103 | 128 | delta_stime = now - ti->ac_stamp; | |
104 | delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); | ||
105 | ti->ac_stime = 0; | ||
106 | ti->ac_stamp = now; | 129 | ti->ac_stamp = now; |
107 | 130 | ||
108 | return delta_stime; | 131 | return delta_stime; |
@@ -110,15 +133,25 @@ static cputime_t vtime_delta(struct task_struct *tsk) | |||
110 | 133 | ||
111 | void vtime_account_system(struct task_struct *tsk) | 134 | void vtime_account_system(struct task_struct *tsk) |
112 | { | 135 | { |
113 | cputime_t delta = vtime_delta(tsk); | 136 | struct thread_info *ti = task_thread_info(tsk); |
114 | 137 | __u64 stime = vtime_delta(tsk); | |
115 | account_system_time(tsk, 0, delta); | 138 | |
139 | if ((tsk->flags & PF_VCPU) && !irq_count()) | ||
140 | ti->gtime += stime; | ||
141 | else if (hardirq_count()) | ||
142 | ti->hardirq_time += stime; | ||
143 | else if (in_serving_softirq()) | ||
144 | ti->softirq_time += stime; | ||
145 | else | ||
146 | ti->stime += stime; | ||
116 | } | 147 | } |
117 | EXPORT_SYMBOL_GPL(vtime_account_system); | 148 | EXPORT_SYMBOL_GPL(vtime_account_system); |
118 | 149 | ||
119 | void vtime_account_idle(struct task_struct *tsk) | 150 | void vtime_account_idle(struct task_struct *tsk) |
120 | { | 151 | { |
121 | account_idle_time(vtime_delta(tsk)); | 152 | struct thread_info *ti = task_thread_info(tsk); |
153 | |||
154 | ti->idle_time += vtime_delta(tsk); | ||
122 | } | 155 | } |
123 | 156 | ||
124 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 157 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild index 860e440611c9..652100b64a71 100644 --- a/arch/m32r/include/asm/Kbuild +++ b/arch/m32r/include/asm/Kbuild | |||
@@ -1,6 +1,5 @@ | |||
1 | 1 | ||
2 | generic-y += clkdev.h | 2 | generic-y += clkdev.h |
3 | generic-y += cputime.h | ||
4 | generic-y += exec.h | 3 | generic-y += exec.h |
5 | generic-y += irq_work.h | 4 | generic-y += irq_work.h |
6 | generic-y += kvm_para.h | 5 | generic-y += kvm_para.h |
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 1f2e5d31cb24..6c76d6c24b3d 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild | |||
@@ -1,7 +1,6 @@ | |||
1 | generic-y += barrier.h | 1 | generic-y += barrier.h |
2 | generic-y += bitsperlong.h | 2 | generic-y += bitsperlong.h |
3 | generic-y += clkdev.h | 3 | generic-y += clkdev.h |
4 | generic-y += cputime.h | ||
5 | generic-y += device.h | 4 | generic-y += device.h |
6 | generic-y += emergency-restart.h | 5 | generic-y += emergency-restart.h |
7 | generic-y += errno.h | 6 | generic-y += errno.h |
diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild index 167150c701d1..d3731f0db73b 100644 --- a/arch/metag/include/asm/Kbuild +++ b/arch/metag/include/asm/Kbuild | |||
@@ -2,7 +2,6 @@ generic-y += auxvec.h | |||
2 | generic-y += bitsperlong.h | 2 | generic-y += bitsperlong.h |
3 | generic-y += bugs.h | 3 | generic-y += bugs.h |
4 | generic-y += clkdev.h | 4 | generic-y += clkdev.h |
5 | generic-y += cputime.h | ||
6 | generic-y += current.h | 5 | generic-y += current.h |
7 | generic-y += device.h | 6 | generic-y += device.h |
8 | generic-y += dma.h | 7 | generic-y += dma.h |
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index b0ae88c9fed9..6275eb051801 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild | |||
@@ -1,7 +1,6 @@ | |||
1 | 1 | ||
2 | generic-y += barrier.h | 2 | generic-y += barrier.h |
3 | generic-y += clkdev.h | 3 | generic-y += clkdev.h |
4 | generic-y += cputime.h | ||
5 | generic-y += device.h | 4 | generic-y += device.h |
6 | generic-y += exec.h | 5 | generic-y += exec.h |
7 | generic-y += irq_work.h | 6 | generic-y += irq_work.h |
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 3269b742a75e..994b1c4392be 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild | |||
@@ -1,7 +1,6 @@ | |||
1 | # MIPS headers | 1 | # MIPS headers |
2 | generic-$(CONFIG_GENERIC_CSUM) += checksum.h | 2 | generic-$(CONFIG_GENERIC_CSUM) += checksum.h |
3 | generic-y += clkdev.h | 3 | generic-y += clkdev.h |
4 | generic-y += cputime.h | ||
5 | generic-y += current.h | 4 | generic-y += current.h |
6 | generic-y += dma-contiguous.h | 5 | generic-y += dma-contiguous.h |
7 | generic-y += emergency-restart.h | 6 | generic-y += emergency-restart.h |
diff --git a/arch/mips/kernel/binfmt_elfn32.c b/arch/mips/kernel/binfmt_elfn32.c index 9c7f3e136d50..4a2ff3953b99 100644 --- a/arch/mips/kernel/binfmt_elfn32.c +++ b/arch/mips/kernel/binfmt_elfn32.c | |||
@@ -99,15 +99,7 @@ jiffies_to_compat_timeval(unsigned long jiffies, struct compat_timeval *value) | |||
99 | #undef TASK_SIZE | 99 | #undef TASK_SIZE |
100 | #define TASK_SIZE TASK_SIZE32 | 100 | #define TASK_SIZE TASK_SIZE32 |
101 | 101 | ||
102 | #undef cputime_to_timeval | 102 | #undef ns_to_timeval |
103 | #define cputime_to_timeval cputime_to_compat_timeval | 103 | #define ns_to_timeval ns_to_compat_timeval |
104 | static __inline__ void | ||
105 | cputime_to_compat_timeval(const cputime_t cputime, struct compat_timeval *value) | ||
106 | { | ||
107 | unsigned long jiffies = cputime_to_jiffies(cputime); | ||
108 | |||
109 | value->tv_usec = (jiffies % HZ) * (1000000L / HZ); | ||
110 | value->tv_sec = jiffies / HZ; | ||
111 | } | ||
112 | 104 | ||
113 | #include "../../../fs/binfmt_elf.c" | 105 | #include "../../../fs/binfmt_elf.c" |
diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c index 1ab34322dd97..3916404e7fd1 100644 --- a/arch/mips/kernel/binfmt_elfo32.c +++ b/arch/mips/kernel/binfmt_elfo32.c | |||
@@ -102,15 +102,7 @@ jiffies_to_compat_timeval(unsigned long jiffies, struct compat_timeval *value) | |||
102 | #undef TASK_SIZE | 102 | #undef TASK_SIZE |
103 | #define TASK_SIZE TASK_SIZE32 | 103 | #define TASK_SIZE TASK_SIZE32 |
104 | 104 | ||
105 | #undef cputime_to_timeval | 105 | #undef ns_to_timeval |
106 | #define cputime_to_timeval cputime_to_compat_timeval | 106 | #define ns_to_timeval ns_to_compat_timeval |
107 | static __inline__ void | ||
108 | cputime_to_compat_timeval(const cputime_t cputime, struct compat_timeval *value) | ||
109 | { | ||
110 | unsigned long jiffies = cputime_to_jiffies(cputime); | ||
111 | |||
112 | value->tv_usec = (jiffies % HZ) * (1000000L / HZ); | ||
113 | value->tv_sec = jiffies / HZ; | ||
114 | } | ||
115 | 107 | ||
116 | #include "../../../fs/binfmt_elf.c" | 108 | #include "../../../fs/binfmt_elf.c" |
diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index 1c8dd0f5cd5d..97f64c723a0c 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild | |||
@@ -1,7 +1,6 @@ | |||
1 | 1 | ||
2 | generic-y += barrier.h | 2 | generic-y += barrier.h |
3 | generic-y += clkdev.h | 3 | generic-y += clkdev.h |
4 | generic-y += cputime.h | ||
5 | generic-y += exec.h | 4 | generic-y += exec.h |
6 | generic-y += irq_work.h | 5 | generic-y += irq_work.h |
7 | generic-y += mcs_spinlock.h | 6 | generic-y += mcs_spinlock.h |
diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index d63330e88379..35b0e883761a 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild | |||
@@ -6,7 +6,6 @@ generic-y += bitsperlong.h | |||
6 | generic-y += bug.h | 6 | generic-y += bug.h |
7 | generic-y += bugs.h | 7 | generic-y += bugs.h |
8 | generic-y += clkdev.h | 8 | generic-y += clkdev.h |
9 | generic-y += cputime.h | ||
10 | generic-y += current.h | 9 | generic-y += current.h |
11 | generic-y += device.h | 10 | generic-y += device.h |
12 | generic-y += div64.h | 11 | generic-y += div64.h |
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 2832f031fb11..ef8d1ccc3e45 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild | |||
@@ -12,7 +12,6 @@ generic-y += checksum.h | |||
12 | generic-y += clkdev.h | 12 | generic-y += clkdev.h |
13 | generic-y += cmpxchg-local.h | 13 | generic-y += cmpxchg-local.h |
14 | generic-y += cmpxchg.h | 14 | generic-y += cmpxchg.h |
15 | generic-y += cputime.h | ||
16 | generic-y += current.h | 15 | generic-y += current.h |
17 | generic-y += device.h | 16 | generic-y += device.h |
18 | generic-y += div64.h | 17 | generic-y += div64.h |
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 91f53c07f410..4e179d770d69 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild | |||
@@ -2,7 +2,6 @@ | |||
2 | generic-y += auxvec.h | 2 | generic-y += auxvec.h |
3 | generic-y += barrier.h | 3 | generic-y += barrier.h |
4 | generic-y += clkdev.h | 4 | generic-y += clkdev.h |
5 | generic-y += cputime.h | ||
6 | generic-y += device.h | 5 | generic-y += device.h |
7 | generic-y += div64.h | 6 | generic-y += div64.h |
8 | generic-y += emergency-restart.h | 7 | generic-y += emergency-restart.h |
diff --git a/arch/parisc/kernel/binfmt_elf32.c b/arch/parisc/kernel/binfmt_elf32.c index 00dc66f9c2ba..f2adcf33f8f2 100644 --- a/arch/parisc/kernel/binfmt_elf32.c +++ b/arch/parisc/kernel/binfmt_elf32.c | |||
@@ -91,14 +91,7 @@ struct elf_prpsinfo32 | |||
91 | current->thread.map_base = DEFAULT_MAP_BASE32; \ | 91 | current->thread.map_base = DEFAULT_MAP_BASE32; \ |
92 | current->thread.task_size = DEFAULT_TASK_SIZE32 \ | 92 | current->thread.task_size = DEFAULT_TASK_SIZE32 \ |
93 | 93 | ||
94 | #undef cputime_to_timeval | 94 | #undef ns_to_timeval |
95 | #define cputime_to_timeval cputime_to_compat_timeval | 95 | #define ns_to_timeval ns_to_compat_timeval |
96 | static __inline__ void | ||
97 | cputime_to_compat_timeval(const cputime_t cputime, struct compat_timeval *value) | ||
98 | { | ||
99 | unsigned long jiffies = cputime_to_jiffies(cputime); | ||
100 | value->tv_usec = (jiffies % HZ) * (1000000L / HZ); | ||
101 | value->tv_sec = jiffies / HZ; | ||
102 | } | ||
103 | 96 | ||
104 | #include "../../../fs/binfmt_elf.c" | 97 | #include "../../../fs/binfmt_elf.c" |
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index 2e66a887788e..068ed3607bac 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #undef PCI_DEBUG | 36 | #undef PCI_DEBUG |
37 | #include <linux/proc_fs.h> | 37 | #include <linux/proc_fs.h> |
38 | #include <linux/export.h> | 38 | #include <linux/export.h> |
39 | #include <linux/sched.h> | ||
39 | 40 | ||
40 | #include <asm/processor.h> | 41 | #include <asm/processor.h> |
41 | #include <asm/sections.h> | 42 | #include <asm/sections.h> |
@@ -176,6 +177,7 @@ void __init setup_arch(char **cmdline_p) | |||
176 | conswitchp = &dummy_con; /* we use do_take_over_console() later ! */ | 177 | conswitchp = &dummy_con; /* we use do_take_over_console() later ! */ |
177 | #endif | 178 | #endif |
178 | 179 | ||
180 | clear_sched_clock_stable(); | ||
179 | } | 181 | } |
180 | 182 | ||
181 | /* | 183 | /* |
diff --git a/arch/powerpc/include/asm/accounting.h b/arch/powerpc/include/asm/accounting.h index c133246df467..3abcf98ed2e0 100644 --- a/arch/powerpc/include/asm/accounting.h +++ b/arch/powerpc/include/asm/accounting.h | |||
@@ -12,9 +12,17 @@ | |||
12 | 12 | ||
13 | /* Stuff for accurate time accounting */ | 13 | /* Stuff for accurate time accounting */ |
14 | struct cpu_accounting_data { | 14 | struct cpu_accounting_data { |
15 | unsigned long user_time; /* accumulated usermode TB ticks */ | 15 | /* Accumulated cputime values to flush on ticks*/ |
16 | unsigned long system_time; /* accumulated system TB ticks */ | 16 | unsigned long utime; |
17 | unsigned long user_time_scaled; /* accumulated usermode SPURR ticks */ | 17 | unsigned long stime; |
18 | unsigned long utime_scaled; | ||
19 | unsigned long stime_scaled; | ||
20 | unsigned long gtime; | ||
21 | unsigned long hardirq_time; | ||
22 | unsigned long softirq_time; | ||
23 | unsigned long steal_time; | ||
24 | unsigned long idle_time; | ||
25 | /* Internal counters */ | ||
18 | unsigned long starttime; /* TB value snapshot */ | 26 | unsigned long starttime; /* TB value snapshot */ |
19 | unsigned long starttime_user; /* TB value on exit to usermode */ | 27 | unsigned long starttime_user; /* TB value on exit to usermode */ |
20 | unsigned long startspurr; /* SPURR value snapshot */ | 28 | unsigned long startspurr; /* SPURR value snapshot */ |
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index aa2e6a34b872..99b541865d8d 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h | |||
@@ -16,12 +16,7 @@ | |||
16 | #ifndef __POWERPC_CPUTIME_H | 16 | #ifndef __POWERPC_CPUTIME_H |
17 | #define __POWERPC_CPUTIME_H | 17 | #define __POWERPC_CPUTIME_H |
18 | 18 | ||
19 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 19 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
20 | #include <asm-generic/cputime.h> | ||
21 | #ifdef __KERNEL__ | ||
22 | static inline void setup_cputime_one_jiffy(void) { } | ||
23 | #endif | ||
24 | #else | ||
25 | 20 | ||
26 | #include <linux/types.h> | 21 | #include <linux/types.h> |
27 | #include <linux/time.h> | 22 | #include <linux/time.h> |
@@ -36,65 +31,6 @@ typedef u64 __nocast cputime64_t; | |||
36 | #define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) | 31 | #define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) |
37 | 32 | ||
38 | #ifdef __KERNEL__ | 33 | #ifdef __KERNEL__ |
39 | |||
40 | /* | ||
41 | * One jiffy in timebase units computed during initialization | ||
42 | */ | ||
43 | extern cputime_t cputime_one_jiffy; | ||
44 | |||
45 | /* | ||
46 | * Convert cputime <-> jiffies | ||
47 | */ | ||
48 | extern u64 __cputime_jiffies_factor; | ||
49 | |||
50 | static inline unsigned long cputime_to_jiffies(const cputime_t ct) | ||
51 | { | ||
52 | return mulhdu((__force u64) ct, __cputime_jiffies_factor); | ||
53 | } | ||
54 | |||
55 | static inline cputime_t jiffies_to_cputime(const unsigned long jif) | ||
56 | { | ||
57 | u64 ct; | ||
58 | unsigned long sec; | ||
59 | |||
60 | /* have to be a little careful about overflow */ | ||
61 | ct = jif % HZ; | ||
62 | sec = jif / HZ; | ||
63 | if (ct) { | ||
64 | ct *= tb_ticks_per_sec; | ||
65 | do_div(ct, HZ); | ||
66 | } | ||
67 | if (sec) | ||
68 | ct += (cputime_t) sec * tb_ticks_per_sec; | ||
69 | return (__force cputime_t) ct; | ||
70 | } | ||
71 | |||
72 | static inline void setup_cputime_one_jiffy(void) | ||
73 | { | ||
74 | cputime_one_jiffy = jiffies_to_cputime(1); | ||
75 | } | ||
76 | |||
77 | static inline cputime64_t jiffies64_to_cputime64(const u64 jif) | ||
78 | { | ||
79 | u64 ct; | ||
80 | u64 sec = jif; | ||
81 | |||
82 | /* have to be a little careful about overflow */ | ||
83 | ct = do_div(sec, HZ); | ||
84 | if (ct) { | ||
85 | ct *= tb_ticks_per_sec; | ||
86 | do_div(ct, HZ); | ||
87 | } | ||
88 | if (sec) | ||
89 | ct += (u64) sec * tb_ticks_per_sec; | ||
90 | return (__force cputime64_t) ct; | ||
91 | } | ||
92 | |||
93 | static inline u64 cputime64_to_jiffies64(const cputime_t ct) | ||
94 | { | ||
95 | return mulhdu((__force u64) ct, __cputime_jiffies_factor); | ||
96 | } | ||
97 | |||
98 | /* | 34 | /* |
99 | * Convert cputime <-> microseconds | 35 | * Convert cputime <-> microseconds |
100 | */ | 36 | */ |
@@ -105,117 +41,6 @@ static inline unsigned long cputime_to_usecs(const cputime_t ct) | |||
105 | return mulhdu((__force u64) ct, __cputime_usec_factor); | 41 | return mulhdu((__force u64) ct, __cputime_usec_factor); |
106 | } | 42 | } |
107 | 43 | ||
108 | static inline cputime_t usecs_to_cputime(const unsigned long us) | ||
109 | { | ||
110 | u64 ct; | ||
111 | unsigned long sec; | ||
112 | |||
113 | /* have to be a little careful about overflow */ | ||
114 | ct = us % 1000000; | ||
115 | sec = us / 1000000; | ||
116 | if (ct) { | ||
117 | ct *= tb_ticks_per_sec; | ||
118 | do_div(ct, 1000000); | ||
119 | } | ||
120 | if (sec) | ||
121 | ct += (cputime_t) sec * tb_ticks_per_sec; | ||
122 | return (__force cputime_t) ct; | ||
123 | } | ||
124 | |||
125 | #define usecs_to_cputime64(us) usecs_to_cputime(us) | ||
126 | |||
127 | /* | ||
128 | * Convert cputime <-> seconds | ||
129 | */ | ||
130 | extern u64 __cputime_sec_factor; | ||
131 | |||
132 | static inline unsigned long cputime_to_secs(const cputime_t ct) | ||
133 | { | ||
134 | return mulhdu((__force u64) ct, __cputime_sec_factor); | ||
135 | } | ||
136 | |||
137 | static inline cputime_t secs_to_cputime(const unsigned long sec) | ||
138 | { | ||
139 | return (__force cputime_t)((u64) sec * tb_ticks_per_sec); | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * Convert cputime <-> timespec | ||
144 | */ | ||
145 | static inline void cputime_to_timespec(const cputime_t ct, struct timespec *p) | ||
146 | { | ||
147 | u64 x = (__force u64) ct; | ||
148 | unsigned int frac; | ||
149 | |||
150 | frac = do_div(x, tb_ticks_per_sec); | ||
151 | p->tv_sec = x; | ||
152 | x = (u64) frac * 1000000000; | ||
153 | do_div(x, tb_ticks_per_sec); | ||
154 | p->tv_nsec = x; | ||
155 | } | ||
156 | |||
157 | static inline cputime_t timespec_to_cputime(const struct timespec *p) | ||
158 | { | ||
159 | u64 ct; | ||
160 | |||
161 | ct = (u64) p->tv_nsec * tb_ticks_per_sec; | ||
162 | do_div(ct, 1000000000); | ||
163 | return (__force cputime_t)(ct + (u64) p->tv_sec * tb_ticks_per_sec); | ||
164 | } | ||
165 | |||
166 | /* | ||
167 | * Convert cputime <-> timeval | ||
168 | */ | ||
169 | static inline void cputime_to_timeval(const cputime_t ct, struct timeval *p) | ||
170 | { | ||
171 | u64 x = (__force u64) ct; | ||
172 | unsigned int frac; | ||
173 | |||
174 | frac = do_div(x, tb_ticks_per_sec); | ||
175 | p->tv_sec = x; | ||
176 | x = (u64) frac * 1000000; | ||
177 | do_div(x, tb_ticks_per_sec); | ||
178 | p->tv_usec = x; | ||
179 | } | ||
180 | |||
181 | static inline cputime_t timeval_to_cputime(const struct timeval *p) | ||
182 | { | ||
183 | u64 ct; | ||
184 | |||
185 | ct = (u64) p->tv_usec * tb_ticks_per_sec; | ||
186 | do_div(ct, 1000000); | ||
187 | return (__force cputime_t)(ct + (u64) p->tv_sec * tb_ticks_per_sec); | ||
188 | } | ||
189 | |||
190 | /* | ||
191 | * Convert cputime <-> clock_t (units of 1/USER_HZ seconds) | ||
192 | */ | ||
193 | extern u64 __cputime_clockt_factor; | ||
194 | |||
195 | static inline unsigned long cputime_to_clock_t(const cputime_t ct) | ||
196 | { | ||
197 | return mulhdu((__force u64) ct, __cputime_clockt_factor); | ||
198 | } | ||
199 | |||
200 | static inline cputime_t clock_t_to_cputime(const unsigned long clk) | ||
201 | { | ||
202 | u64 ct; | ||
203 | unsigned long sec; | ||
204 | |||
205 | /* have to be a little careful about overflow */ | ||
206 | ct = clk % USER_HZ; | ||
207 | sec = clk / USER_HZ; | ||
208 | if (ct) { | ||
209 | ct *= tb_ticks_per_sec; | ||
210 | do_div(ct, USER_HZ); | ||
211 | } | ||
212 | if (sec) | ||
213 | ct += (u64) sec * tb_ticks_per_sec; | ||
214 | return (__force cputime_t) ct; | ||
215 | } | ||
216 | |||
217 | #define cputime64_to_clock_t(ct) cputime_to_clock_t((cputime_t)(ct)) | ||
218 | |||
219 | /* | 44 | /* |
220 | * PPC64 uses PACA which is task independent for storing accounting data while | 45 | * PPC64 uses PACA which is task independent for storing accounting data while |
221 | * PPC32 uses struct thread_info, therefore at task switch the accounting data | 46 | * PPC32 uses struct thread_info, therefore at task switch the accounting data |
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 6a6792bb39fb..708c3e592eeb 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h | |||
@@ -187,7 +187,6 @@ struct paca_struct { | |||
187 | 187 | ||
188 | /* Stuff for accurate time accounting */ | 188 | /* Stuff for accurate time accounting */ |
189 | struct cpu_accounting_data accounting; | 189 | struct cpu_accounting_data accounting; |
190 | u64 stolen_time; /* TB ticks taken by hypervisor */ | ||
191 | u64 dtl_ridx; /* read index in dispatch log */ | 190 | u64 dtl_ridx; /* read index in dispatch log */ |
192 | struct dtl_entry *dtl_curr; /* pointer corresponding to dtl_ridx */ | 191 | struct dtl_entry *dtl_curr; /* pointer corresponding to dtl_ridx */ |
193 | 192 | ||
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 195a9fc8f81c..9e8e771f8acb 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c | |||
@@ -249,9 +249,9 @@ int main(void) | |||
249 | DEFINE(ACCOUNT_STARTTIME_USER, | 249 | DEFINE(ACCOUNT_STARTTIME_USER, |
250 | offsetof(struct paca_struct, accounting.starttime_user)); | 250 | offsetof(struct paca_struct, accounting.starttime_user)); |
251 | DEFINE(ACCOUNT_USER_TIME, | 251 | DEFINE(ACCOUNT_USER_TIME, |
252 | offsetof(struct paca_struct, accounting.user_time)); | 252 | offsetof(struct paca_struct, accounting.utime)); |
253 | DEFINE(ACCOUNT_SYSTEM_TIME, | 253 | DEFINE(ACCOUNT_SYSTEM_TIME, |
254 | offsetof(struct paca_struct, accounting.system_time)); | 254 | offsetof(struct paca_struct, accounting.stime)); |
255 | DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save)); | 255 | DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save)); |
256 | DEFINE(PACA_NAPSTATELOST, offsetof(struct paca_struct, nap_state_lost)); | 256 | DEFINE(PACA_NAPSTATELOST, offsetof(struct paca_struct, nap_state_lost)); |
257 | DEFINE(PACA_SPRG_VDSO, offsetof(struct paca_struct, sprg_vdso)); | 257 | DEFINE(PACA_SPRG_VDSO, offsetof(struct paca_struct, sprg_vdso)); |
@@ -262,9 +262,9 @@ int main(void) | |||
262 | DEFINE(ACCOUNT_STARTTIME_USER, | 262 | DEFINE(ACCOUNT_STARTTIME_USER, |
263 | offsetof(struct thread_info, accounting.starttime_user)); | 263 | offsetof(struct thread_info, accounting.starttime_user)); |
264 | DEFINE(ACCOUNT_USER_TIME, | 264 | DEFINE(ACCOUNT_USER_TIME, |
265 | offsetof(struct thread_info, accounting.user_time)); | 265 | offsetof(struct thread_info, accounting.utime)); |
266 | DEFINE(ACCOUNT_SYSTEM_TIME, | 266 | DEFINE(ACCOUNT_SYSTEM_TIME, |
267 | offsetof(struct thread_info, accounting.system_time)); | 267 | offsetof(struct thread_info, accounting.stime)); |
268 | #endif | 268 | #endif |
269 | #endif /* CONFIG_PPC64 */ | 269 | #endif /* CONFIG_PPC64 */ |
270 | 270 | ||
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index bc2e08d415fa..14e485525e31 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c | |||
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/clk-provider.h> | 57 | #include <linux/clk-provider.h> |
58 | #include <linux/suspend.h> | 58 | #include <linux/suspend.h> |
59 | #include <linux/rtc.h> | 59 | #include <linux/rtc.h> |
60 | #include <linux/cputime.h> | ||
60 | #include <asm/trace.h> | 61 | #include <asm/trace.h> |
61 | 62 | ||
62 | #include <asm/io.h> | 63 | #include <asm/io.h> |
@@ -72,7 +73,6 @@ | |||
72 | #include <asm/smp.h> | 73 | #include <asm/smp.h> |
73 | #include <asm/vdso_datapage.h> | 74 | #include <asm/vdso_datapage.h> |
74 | #include <asm/firmware.h> | 75 | #include <asm/firmware.h> |
75 | #include <asm/cputime.h> | ||
76 | #include <asm/asm-prototypes.h> | 76 | #include <asm/asm-prototypes.h> |
77 | 77 | ||
78 | /* powerpc clocksource/clockevent code */ | 78 | /* powerpc clocksource/clockevent code */ |
@@ -152,20 +152,11 @@ EXPORT_SYMBOL_GPL(ppc_tb_freq); | |||
152 | 152 | ||
153 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 153 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
154 | /* | 154 | /* |
155 | * Factors for converting from cputime_t (timebase ticks) to | 155 | * Factor for converting from cputime_t (timebase ticks) to |
156 | * jiffies, microseconds, seconds, and clock_t (1/USER_HZ seconds). | 156 | * microseconds. This is stored as 0.64 fixed-point binary fraction. |
157 | * These are all stored as 0.64 fixed-point binary fractions. | ||
158 | */ | 157 | */ |
159 | u64 __cputime_jiffies_factor; | ||
160 | EXPORT_SYMBOL(__cputime_jiffies_factor); | ||
161 | u64 __cputime_usec_factor; | 158 | u64 __cputime_usec_factor; |
162 | EXPORT_SYMBOL(__cputime_usec_factor); | 159 | EXPORT_SYMBOL(__cputime_usec_factor); |
163 | u64 __cputime_sec_factor; | ||
164 | EXPORT_SYMBOL(__cputime_sec_factor); | ||
165 | u64 __cputime_clockt_factor; | ||
166 | EXPORT_SYMBOL(__cputime_clockt_factor); | ||
167 | |||
168 | cputime_t cputime_one_jiffy; | ||
169 | 160 | ||
170 | #ifdef CONFIG_PPC_SPLPAR | 161 | #ifdef CONFIG_PPC_SPLPAR |
171 | void (*dtl_consumer)(struct dtl_entry *, u64); | 162 | void (*dtl_consumer)(struct dtl_entry *, u64); |
@@ -181,14 +172,8 @@ static void calc_cputime_factors(void) | |||
181 | { | 172 | { |
182 | struct div_result res; | 173 | struct div_result res; |
183 | 174 | ||
184 | div128_by_32(HZ, 0, tb_ticks_per_sec, &res); | ||
185 | __cputime_jiffies_factor = res.result_low; | ||
186 | div128_by_32(1000000, 0, tb_ticks_per_sec, &res); | 175 | div128_by_32(1000000, 0, tb_ticks_per_sec, &res); |
187 | __cputime_usec_factor = res.result_low; | 176 | __cputime_usec_factor = res.result_low; |
188 | div128_by_32(1, 0, tb_ticks_per_sec, &res); | ||
189 | __cputime_sec_factor = res.result_low; | ||
190 | div128_by_32(USER_HZ, 0, tb_ticks_per_sec, &res); | ||
191 | __cputime_clockt_factor = res.result_low; | ||
192 | } | 177 | } |
193 | 178 | ||
194 | /* | 179 | /* |
@@ -271,25 +256,19 @@ void accumulate_stolen_time(void) | |||
271 | 256 | ||
272 | sst = scan_dispatch_log(acct->starttime_user); | 257 | sst = scan_dispatch_log(acct->starttime_user); |
273 | ust = scan_dispatch_log(acct->starttime); | 258 | ust = scan_dispatch_log(acct->starttime); |
274 | acct->system_time -= sst; | 259 | acct->stime -= sst; |
275 | acct->user_time -= ust; | 260 | acct->utime -= ust; |
276 | local_paca->stolen_time += ust + sst; | 261 | acct->steal_time += ust + sst; |
277 | 262 | ||
278 | local_paca->soft_enabled = save_soft_enabled; | 263 | local_paca->soft_enabled = save_soft_enabled; |
279 | } | 264 | } |
280 | 265 | ||
281 | static inline u64 calculate_stolen_time(u64 stop_tb) | 266 | static inline u64 calculate_stolen_time(u64 stop_tb) |
282 | { | 267 | { |
283 | u64 stolen = 0; | 268 | if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) |
269 | return scan_dispatch_log(stop_tb); | ||
284 | 270 | ||
285 | if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) { | 271 | return 0; |
286 | stolen = scan_dispatch_log(stop_tb); | ||
287 | get_paca()->accounting.system_time -= stolen; | ||
288 | } | ||
289 | |||
290 | stolen += get_paca()->stolen_time; | ||
291 | get_paca()->stolen_time = 0; | ||
292 | return stolen; | ||
293 | } | 272 | } |
294 | 273 | ||
295 | #else /* CONFIG_PPC_SPLPAR */ | 274 | #else /* CONFIG_PPC_SPLPAR */ |
@@ -305,28 +284,27 @@ static inline u64 calculate_stolen_time(u64 stop_tb) | |||
305 | * or soft irq state. | 284 | * or soft irq state. |
306 | */ | 285 | */ |
307 | static unsigned long vtime_delta(struct task_struct *tsk, | 286 | static unsigned long vtime_delta(struct task_struct *tsk, |
308 | unsigned long *sys_scaled, | 287 | unsigned long *stime_scaled, |
309 | unsigned long *stolen) | 288 | unsigned long *steal_time) |
310 | { | 289 | { |
311 | unsigned long now, nowscaled, deltascaled; | 290 | unsigned long now, nowscaled, deltascaled; |
312 | unsigned long udelta, delta, user_scaled; | 291 | unsigned long stime; |
292 | unsigned long utime, utime_scaled; | ||
313 | struct cpu_accounting_data *acct = get_accounting(tsk); | 293 | struct cpu_accounting_data *acct = get_accounting(tsk); |
314 | 294 | ||
315 | WARN_ON_ONCE(!irqs_disabled()); | 295 | WARN_ON_ONCE(!irqs_disabled()); |
316 | 296 | ||
317 | now = mftb(); | 297 | now = mftb(); |
318 | nowscaled = read_spurr(now); | 298 | nowscaled = read_spurr(now); |
319 | acct->system_time += now - acct->starttime; | 299 | stime = now - acct->starttime; |
320 | acct->starttime = now; | 300 | acct->starttime = now; |
321 | deltascaled = nowscaled - acct->startspurr; | 301 | deltascaled = nowscaled - acct->startspurr; |
322 | acct->startspurr = nowscaled; | 302 | acct->startspurr = nowscaled; |
323 | 303 | ||
324 | *stolen = calculate_stolen_time(now); | 304 | *steal_time = calculate_stolen_time(now); |
325 | 305 | ||
326 | delta = acct->system_time; | 306 | utime = acct->utime - acct->utime_sspurr; |
327 | acct->system_time = 0; | 307 | acct->utime_sspurr = acct->utime; |
328 | udelta = acct->user_time - acct->utime_sspurr; | ||
329 | acct->utime_sspurr = acct->user_time; | ||
330 | 308 | ||
331 | /* | 309 | /* |
332 | * Because we don't read the SPURR on every kernel entry/exit, | 310 | * Because we don't read the SPURR on every kernel entry/exit, |
@@ -338,62 +316,105 @@ static unsigned long vtime_delta(struct task_struct *tsk, | |||
338 | * the user ticks get saved up in paca->user_time_scaled to be | 316 | * the user ticks get saved up in paca->user_time_scaled to be |
339 | * used by account_process_tick. | 317 | * used by account_process_tick. |
340 | */ | 318 | */ |
341 | *sys_scaled = delta; | 319 | *stime_scaled = stime; |
342 | user_scaled = udelta; | 320 | utime_scaled = utime; |
343 | if (deltascaled != delta + udelta) { | 321 | if (deltascaled != stime + utime) { |
344 | if (udelta) { | 322 | if (utime) { |
345 | *sys_scaled = deltascaled * delta / (delta + udelta); | 323 | *stime_scaled = deltascaled * stime / (stime + utime); |
346 | user_scaled = deltascaled - *sys_scaled; | 324 | utime_scaled = deltascaled - *stime_scaled; |
347 | } else { | 325 | } else { |
348 | *sys_scaled = deltascaled; | 326 | *stime_scaled = deltascaled; |
349 | } | 327 | } |
350 | } | 328 | } |
351 | acct->user_time_scaled += user_scaled; | 329 | acct->utime_scaled += utime_scaled; |
352 | 330 | ||
353 | return delta; | 331 | return stime; |
354 | } | 332 | } |
355 | 333 | ||
356 | void vtime_account_system(struct task_struct *tsk) | 334 | void vtime_account_system(struct task_struct *tsk) |
357 | { | 335 | { |
358 | unsigned long delta, sys_scaled, stolen; | 336 | unsigned long stime, stime_scaled, steal_time; |
337 | struct cpu_accounting_data *acct = get_accounting(tsk); | ||
338 | |||
339 | stime = vtime_delta(tsk, &stime_scaled, &steal_time); | ||
359 | 340 | ||
360 | delta = vtime_delta(tsk, &sys_scaled, &stolen); | 341 | stime -= min(stime, steal_time); |
361 | account_system_time(tsk, 0, delta); | 342 | acct->steal_time += steal_time; |
362 | tsk->stimescaled += sys_scaled; | 343 | |
363 | if (stolen) | 344 | if ((tsk->flags & PF_VCPU) && !irq_count()) { |
364 | account_steal_time(stolen); | 345 | acct->gtime += stime; |
346 | acct->utime_scaled += stime_scaled; | ||
347 | } else { | ||
348 | if (hardirq_count()) | ||
349 | acct->hardirq_time += stime; | ||
350 | else if (in_serving_softirq()) | ||
351 | acct->softirq_time += stime; | ||
352 | else | ||
353 | acct->stime += stime; | ||
354 | |||
355 | acct->stime_scaled += stime_scaled; | ||
356 | } | ||
365 | } | 357 | } |
366 | EXPORT_SYMBOL_GPL(vtime_account_system); | 358 | EXPORT_SYMBOL_GPL(vtime_account_system); |
367 | 359 | ||
368 | void vtime_account_idle(struct task_struct *tsk) | 360 | void vtime_account_idle(struct task_struct *tsk) |
369 | { | 361 | { |
370 | unsigned long delta, sys_scaled, stolen; | 362 | unsigned long stime, stime_scaled, steal_time; |
363 | struct cpu_accounting_data *acct = get_accounting(tsk); | ||
371 | 364 | ||
372 | delta = vtime_delta(tsk, &sys_scaled, &stolen); | 365 | stime = vtime_delta(tsk, &stime_scaled, &steal_time); |
373 | account_idle_time(delta + stolen); | 366 | acct->idle_time += stime + steal_time; |
374 | } | 367 | } |
375 | 368 | ||
376 | /* | 369 | /* |
377 | * Transfer the user time accumulated in the paca | 370 | * Account the whole cputime accumulated in the paca |
378 | * by the exception entry and exit code to the generic | ||
379 | * process user time records. | ||
380 | * Must be called with interrupts disabled. | 371 | * Must be called with interrupts disabled. |
381 | * Assumes that vtime_account_system/idle() has been called | 372 | * Assumes that vtime_account_system/idle() has been called |
382 | * recently (i.e. since the last entry from usermode) so that | 373 | * recently (i.e. since the last entry from usermode) so that |
383 | * get_paca()->user_time_scaled is up to date. | 374 | * get_paca()->user_time_scaled is up to date. |
384 | */ | 375 | */ |
385 | void vtime_account_user(struct task_struct *tsk) | 376 | void vtime_flush(struct task_struct *tsk) |
386 | { | 377 | { |
387 | cputime_t utime, utimescaled; | ||
388 | struct cpu_accounting_data *acct = get_accounting(tsk); | 378 | struct cpu_accounting_data *acct = get_accounting(tsk); |
389 | 379 | ||
390 | utime = acct->user_time; | 380 | if (acct->utime) |
391 | utimescaled = acct->user_time_scaled; | 381 | account_user_time(tsk, cputime_to_nsecs(acct->utime)); |
392 | acct->user_time = 0; | 382 | |
393 | acct->user_time_scaled = 0; | 383 | if (acct->utime_scaled) |
384 | tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled); | ||
385 | |||
386 | if (acct->gtime) | ||
387 | account_guest_time(tsk, cputime_to_nsecs(acct->gtime)); | ||
388 | |||
389 | if (acct->steal_time) | ||
390 | account_steal_time(cputime_to_nsecs(acct->steal_time)); | ||
391 | |||
392 | if (acct->idle_time) | ||
393 | account_idle_time(cputime_to_nsecs(acct->idle_time)); | ||
394 | |||
395 | if (acct->stime) | ||
396 | account_system_index_time(tsk, cputime_to_nsecs(acct->stime), | ||
397 | CPUTIME_SYSTEM); | ||
398 | if (acct->stime_scaled) | ||
399 | tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled); | ||
400 | |||
401 | if (acct->hardirq_time) | ||
402 | account_system_index_time(tsk, cputime_to_nsecs(acct->hardirq_time), | ||
403 | CPUTIME_IRQ); | ||
404 | if (acct->softirq_time) | ||
405 | account_system_index_time(tsk, cputime_to_nsecs(acct->softirq_time), | ||
406 | CPUTIME_SOFTIRQ); | ||
407 | |||
408 | acct->utime = 0; | ||
409 | acct->utime_scaled = 0; | ||
394 | acct->utime_sspurr = 0; | 410 | acct->utime_sspurr = 0; |
395 | account_user_time(tsk, utime); | 411 | acct->gtime = 0; |
396 | tsk->utimescaled += utimescaled; | 412 | acct->steal_time = 0; |
413 | acct->idle_time = 0; | ||
414 | acct->stime = 0; | ||
415 | acct->stime_scaled = 0; | ||
416 | acct->hardirq_time = 0; | ||
417 | acct->softirq_time = 0; | ||
397 | } | 418 | } |
398 | 419 | ||
399 | #ifdef CONFIG_PPC32 | 420 | #ifdef CONFIG_PPC32 |
@@ -407,8 +428,7 @@ void arch_vtime_task_switch(struct task_struct *prev) | |||
407 | struct cpu_accounting_data *acct = get_accounting(current); | 428 | struct cpu_accounting_data *acct = get_accounting(current); |
408 | 429 | ||
409 | acct->starttime = get_accounting(prev)->starttime; | 430 | acct->starttime = get_accounting(prev)->starttime; |
410 | acct->system_time = 0; | 431 | acct->startspurr = get_accounting(prev)->startspurr; |
411 | acct->user_time = 0; | ||
412 | } | 432 | } |
413 | #endif /* CONFIG_PPC32 */ | 433 | #endif /* CONFIG_PPC32 */ |
414 | 434 | ||
@@ -1018,7 +1038,6 @@ void __init time_init(void) | |||
1018 | tb_ticks_per_sec = ppc_tb_freq; | 1038 | tb_ticks_per_sec = ppc_tb_freq; |
1019 | tb_ticks_per_usec = ppc_tb_freq / 1000000; | 1039 | tb_ticks_per_usec = ppc_tb_freq / 1000000; |
1020 | calc_cputime_factors(); | 1040 | calc_cputime_factors(); |
1021 | setup_cputime_one_jiffy(); | ||
1022 | 1041 | ||
1023 | /* | 1042 | /* |
1024 | * Compute scale factor for sched_clock. | 1043 | * Compute scale factor for sched_clock. |
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 9c0e17cf6886..3f864c36d847 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c | |||
@@ -2287,14 +2287,14 @@ static void dump_one_paca(int cpu) | |||
2287 | DUMP(p, subcore_sibling_mask, "x"); | 2287 | DUMP(p, subcore_sibling_mask, "x"); |
2288 | #endif | 2288 | #endif |
2289 | 2289 | ||
2290 | DUMP(p, accounting.user_time, "llx"); | 2290 | DUMP(p, accounting.utime, "llx"); |
2291 | DUMP(p, accounting.system_time, "llx"); | 2291 | DUMP(p, accounting.stime, "llx"); |
2292 | DUMP(p, accounting.user_time_scaled, "llx"); | 2292 | DUMP(p, accounting.utime_scaled, "llx"); |
2293 | DUMP(p, accounting.starttime, "llx"); | 2293 | DUMP(p, accounting.starttime, "llx"); |
2294 | DUMP(p, accounting.starttime_user, "llx"); | 2294 | DUMP(p, accounting.starttime_user, "llx"); |
2295 | DUMP(p, accounting.startspurr, "llx"); | 2295 | DUMP(p, accounting.startspurr, "llx"); |
2296 | DUMP(p, accounting.utime_sspurr, "llx"); | 2296 | DUMP(p, accounting.utime_sspurr, "llx"); |
2297 | DUMP(p, stolen_time, "llx"); | 2297 | DUMP(p, accounting.steal_time, "llx"); |
2298 | #undef DUMP | 2298 | #undef DUMP |
2299 | 2299 | ||
2300 | catch_memory_errors = 0; | 2300 | catch_memory_errors = 0; |
diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 69b23b25ac34..08b9e942a262 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c | |||
@@ -113,21 +113,21 @@ static void appldata_get_os_data(void *data) | |||
113 | j = 0; | 113 | j = 0; |
114 | for_each_online_cpu(i) { | 114 | for_each_online_cpu(i) { |
115 | os_data->os_cpu[j].per_cpu_user = | 115 | os_data->os_cpu[j].per_cpu_user = |
116 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]); | 116 | nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]); |
117 | os_data->os_cpu[j].per_cpu_nice = | 117 | os_data->os_cpu[j].per_cpu_nice = |
118 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]); | 118 | nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]); |
119 | os_data->os_cpu[j].per_cpu_system = | 119 | os_data->os_cpu[j].per_cpu_system = |
120 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]); | 120 | nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]); |
121 | os_data->os_cpu[j].per_cpu_idle = | 121 | os_data->os_cpu[j].per_cpu_idle = |
122 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]); | 122 | nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]); |
123 | os_data->os_cpu[j].per_cpu_irq = | 123 | os_data->os_cpu[j].per_cpu_irq = |
124 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]); | 124 | nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]); |
125 | os_data->os_cpu[j].per_cpu_softirq = | 125 | os_data->os_cpu[j].per_cpu_softirq = |
126 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]); | 126 | nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]); |
127 | os_data->os_cpu[j].per_cpu_iowait = | 127 | os_data->os_cpu[j].per_cpu_iowait = |
128 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]); | 128 | nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]); |
129 | os_data->os_cpu[j].per_cpu_steal = | 129 | os_data->os_cpu[j].per_cpu_steal = |
130 | cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]); | 130 | nsecs_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]); |
131 | os_data->os_cpu[j].cpu_id = i; | 131 | os_data->os_cpu[j].cpu_id = i; |
132 | j++; | 132 | j++; |
133 | } | 133 | } |
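
The appldata conversion above keeps the jiffies-based interface toward z/VM, but the kcpustat values it reads are now nanoseconds, so each field goes through nsecs_to_jiffies(). A minimal standalone sketch of that arithmetic, not part of the patch, with HZ=100 assumed for the example:

	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_SEC 1000000000ULL
	#define HZ 100ULL                      /* assumed tick rate for this example */

	/* simplified stand-in for the kernel's nsecs_to_jiffies() */
	static uint64_t nsecs_to_jiffies_sketch(uint64_t nsecs)
	{
		return nsecs / (NSEC_PER_SEC / HZ);
	}

	int main(void)
	{
		uint64_t user_ns = 2500000000ULL;  /* 2.5 s of CPUTIME_USER, in ns */

		/* 2.5 s at HZ=100 -> 250 jiffies */
		printf("%llu\n", (unsigned long long)nsecs_to_jiffies_sketch(user_ns));
		return 0;
	}
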
diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index 221b454c734a..d1c407ddf703 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h | |||
@@ -25,33 +25,6 @@ static inline unsigned long __div(unsigned long long n, unsigned long base) | |||
25 | return n / base; | 25 | return n / base; |
26 | } | 26 | } |
27 | 27 | ||
28 | #define cputime_one_jiffy jiffies_to_cputime(1) | ||
29 | |||
30 | /* | ||
31 | * Convert cputime to jiffies and back. | ||
32 | */ | ||
33 | static inline unsigned long cputime_to_jiffies(const cputime_t cputime) | ||
34 | { | ||
35 | return __div((__force unsigned long long) cputime, CPUTIME_PER_SEC / HZ); | ||
36 | } | ||
37 | |||
38 | static inline cputime_t jiffies_to_cputime(const unsigned int jif) | ||
39 | { | ||
40 | return (__force cputime_t)(jif * (CPUTIME_PER_SEC / HZ)); | ||
41 | } | ||
42 | |||
43 | static inline u64 cputime64_to_jiffies64(cputime64_t cputime) | ||
44 | { | ||
45 | unsigned long long jif = (__force unsigned long long) cputime; | ||
46 | do_div(jif, CPUTIME_PER_SEC / HZ); | ||
47 | return jif; | ||
48 | } | ||
49 | |||
50 | static inline cputime64_t jiffies64_to_cputime64(const u64 jif) | ||
51 | { | ||
52 | return (__force cputime64_t)(jif * (CPUTIME_PER_SEC / HZ)); | ||
53 | } | ||
54 | |||
55 | /* | 28 | /* |
56 | * Convert cputime to microseconds and back. | 29 | * Convert cputime to microseconds and back. |
57 | */ | 30 | */ |
@@ -60,88 +33,8 @@ static inline unsigned int cputime_to_usecs(const cputime_t cputime) | |||
60 | return (__force unsigned long long) cputime >> 12; | 33 | return (__force unsigned long long) cputime >> 12; |
61 | } | 34 | } |
62 | 35 | ||
63 | static inline cputime_t usecs_to_cputime(const unsigned int m) | ||
64 | { | ||
65 | return (__force cputime_t)(m * CPUTIME_PER_USEC); | ||
66 | } | ||
67 | |||
68 | #define usecs_to_cputime64(m) usecs_to_cputime(m) | ||
69 | |||
70 | /* | ||
71 | * Convert cputime to milliseconds and back. | ||
72 | */ | ||
73 | static inline unsigned int cputime_to_secs(const cputime_t cputime) | ||
74 | { | ||
75 | return __div((__force unsigned long long) cputime, CPUTIME_PER_SEC / 2) >> 1; | ||
76 | } | ||
77 | |||
78 | static inline cputime_t secs_to_cputime(const unsigned int s) | ||
79 | { | ||
80 | return (__force cputime_t)(s * CPUTIME_PER_SEC); | ||
81 | } | ||
82 | |||
83 | /* | ||
84 | * Convert cputime to timespec and back. | ||
85 | */ | ||
86 | static inline cputime_t timespec_to_cputime(const struct timespec *value) | ||
87 | { | ||
88 | unsigned long long ret = value->tv_sec * CPUTIME_PER_SEC; | ||
89 | return (__force cputime_t)(ret + __div(value->tv_nsec * CPUTIME_PER_USEC, NSEC_PER_USEC)); | ||
90 | } | ||
91 | |||
92 | static inline void cputime_to_timespec(const cputime_t cputime, | ||
93 | struct timespec *value) | ||
94 | { | ||
95 | unsigned long long __cputime = (__force unsigned long long) cputime; | ||
96 | value->tv_nsec = (__cputime % CPUTIME_PER_SEC) * NSEC_PER_USEC / CPUTIME_PER_USEC; | ||
97 | value->tv_sec = __cputime / CPUTIME_PER_SEC; | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * Convert cputime to timeval and back. | ||
102 | * Since cputime and timeval have the same resolution (microseconds) | ||
103 | * this is easy. | ||
104 | */ | ||
105 | static inline cputime_t timeval_to_cputime(const struct timeval *value) | ||
106 | { | ||
107 | unsigned long long ret = value->tv_sec * CPUTIME_PER_SEC; | ||
108 | return (__force cputime_t)(ret + value->tv_usec * CPUTIME_PER_USEC); | ||
109 | } | ||
110 | |||
111 | static inline void cputime_to_timeval(const cputime_t cputime, | ||
112 | struct timeval *value) | ||
113 | { | ||
114 | unsigned long long __cputime = (__force unsigned long long) cputime; | ||
115 | value->tv_usec = (__cputime % CPUTIME_PER_SEC) / CPUTIME_PER_USEC; | ||
116 | value->tv_sec = __cputime / CPUTIME_PER_SEC; | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * Convert cputime to clock and back. | ||
121 | */ | ||
122 | static inline clock_t cputime_to_clock_t(cputime_t cputime) | ||
123 | { | ||
124 | unsigned long long clock = (__force unsigned long long) cputime; | ||
125 | do_div(clock, CPUTIME_PER_SEC / USER_HZ); | ||
126 | return clock; | ||
127 | } | ||
128 | |||
129 | static inline cputime_t clock_t_to_cputime(unsigned long x) | ||
130 | { | ||
131 | return (__force cputime_t)(x * (CPUTIME_PER_SEC / USER_HZ)); | ||
132 | } | ||
133 | |||
134 | /* | ||
135 | * Convert cputime64 to clock. | ||
136 | */ | ||
137 | static inline clock_t cputime64_to_clock_t(cputime64_t cputime) | ||
138 | { | ||
139 | unsigned long long clock = (__force unsigned long long) cputime; | ||
140 | do_div(clock, CPUTIME_PER_SEC / USER_HZ); | ||
141 | return clock; | ||
142 | } | ||
143 | 36 | ||
144 | cputime64_t arch_cpu_idle_time(int cpu); | 37 | u64 arch_cpu_idle_time(int cpu); |
145 | 38 | ||
146 | #define arch_idle_time(cpu) arch_cpu_idle_time(cpu) | 39 | #define arch_idle_time(cpu) arch_cpu_idle_time(cpu) |
147 | 40 | ||
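
With the jiffies/timespec/timeval/clock_t converters gone, the only converter left in this header is cputime_to_usecs(), which relies on the s390 CPU-timer format where, as the >> 12 implies, one microsecond corresponds to 4096 timer units. A rough standalone illustration of that scaling (the in-kernel helpers such as cputime_to_nsecs() are defined elsewhere and differ in detail):

	#include <stdio.h>
	#include <stdint.h>

	/* s390 CPU-timer format: 4096 units per microsecond (hence the >> 12) */
	#define CPUTIME_PER_USEC 4096ULL

	static uint64_t cputime_to_usecs_sketch(uint64_t cputime)
	{
		return cputime >> 12;
	}

	static uint64_t cputime_to_nsecs_sketch(uint64_t cputime)
	{
		/* scale to ns without dropping the sub-microsecond bits */
		return cputime * 1000ULL / CPUTIME_PER_USEC;
	}

	int main(void)
	{
		uint64_t one_ms = 1000ULL * CPUTIME_PER_USEC;  /* 1 ms worth of timer units */

		printf("%llu us\n", (unsigned long long)cputime_to_usecs_sketch(one_ms)); /* 1000 */
		printf("%llu ns\n", (unsigned long long)cputime_to_nsecs_sketch(one_ms)); /* 1000000 */
		return 0;
	}
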
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 9bfad2ad6312..61261e0e95c0 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h | |||
@@ -85,53 +85,56 @@ struct lowcore { | |||
85 | __u64 mcck_enter_timer; /* 0x02c0 */ | 85 | __u64 mcck_enter_timer; /* 0x02c0 */ |
86 | __u64 exit_timer; /* 0x02c8 */ | 86 | __u64 exit_timer; /* 0x02c8 */ |
87 | __u64 user_timer; /* 0x02d0 */ | 87 | __u64 user_timer; /* 0x02d0 */ |
88 | __u64 system_timer; /* 0x02d8 */ | 88 | __u64 guest_timer; /* 0x02d8 */ |
89 | __u64 steal_timer; /* 0x02e0 */ | 89 | __u64 system_timer; /* 0x02e0 */ |
90 | __u64 last_update_timer; /* 0x02e8 */ | 90 | __u64 hardirq_timer; /* 0x02e8 */ |
91 | __u64 last_update_clock; /* 0x02f0 */ | 91 | __u64 softirq_timer; /* 0x02f0 */ |
92 | __u64 int_clock; /* 0x02f8 */ | 92 | __u64 steal_timer; /* 0x02f8 */ |
93 | __u64 mcck_clock; /* 0x0300 */ | 93 | __u64 last_update_timer; /* 0x0300 */ |
94 | __u64 clock_comparator; /* 0x0308 */ | 94 | __u64 last_update_clock; /* 0x0308 */ |
95 | __u64 int_clock; /* 0x0310 */ | ||
96 | __u64 mcck_clock; /* 0x0318 */ | ||
97 | __u64 clock_comparator; /* 0x0320 */ | ||
95 | 98 | ||
96 | /* Current process. */ | 99 | /* Current process. */ |
97 | __u64 current_task; /* 0x0310 */ | 100 | __u64 current_task; /* 0x0328 */ |
98 | __u8 pad_0x318[0x320-0x318]; /* 0x0318 */ | 101 | __u8 pad_0x330[0x338-0x330]; /* 0x0330 */ |
99 | __u64 kernel_stack; /* 0x0320 */ | 102 | __u64 kernel_stack; /* 0x0338 */ |
100 | 103 | ||
101 | /* Interrupt, panic and restart stack. */ | 104 | /* Interrupt, panic and restart stack. */ |
102 | __u64 async_stack; /* 0x0328 */ | 105 | __u64 async_stack; /* 0x0340 */ |
103 | __u64 panic_stack; /* 0x0330 */ | 106 | __u64 panic_stack; /* 0x0348 */ |
104 | __u64 restart_stack; /* 0x0338 */ | 107 | __u64 restart_stack; /* 0x0350 */ |
105 | 108 | ||
106 | /* Restart function and parameter. */ | 109 | /* Restart function and parameter. */ |
107 | __u64 restart_fn; /* 0x0340 */ | 110 | __u64 restart_fn; /* 0x0358 */ |
108 | __u64 restart_data; /* 0x0348 */ | 111 | __u64 restart_data; /* 0x0360 */ |
109 | __u64 restart_source; /* 0x0350 */ | 112 | __u64 restart_source; /* 0x0368 */ |
110 | 113 | ||
111 | /* Address space pointer. */ | 114 | /* Address space pointer. */ |
112 | __u64 kernel_asce; /* 0x0358 */ | 115 | __u64 kernel_asce; /* 0x0370 */ |
113 | __u64 user_asce; /* 0x0360 */ | 116 | __u64 user_asce; /* 0x0378 */ |
114 | 117 | ||
115 | /* | 118 | /* |
116 | * The lpp and current_pid fields form a | 119 | * The lpp and current_pid fields form a |
117 | * 64-bit value that is set as program | 120 | * 64-bit value that is set as program |
118 | * parameter with the LPP instruction. | 121 | * parameter with the LPP instruction. |
119 | */ | 122 | */ |
120 | __u32 lpp; /* 0x0368 */ | 123 | __u32 lpp; /* 0x0380 */ |
121 | __u32 current_pid; /* 0x036c */ | 124 | __u32 current_pid; /* 0x0384 */ |
122 | 125 | ||
123 | /* SMP info area */ | 126 | /* SMP info area */ |
124 | __u32 cpu_nr; /* 0x0370 */ | 127 | __u32 cpu_nr; /* 0x0388 */ |
125 | __u32 softirq_pending; /* 0x0374 */ | 128 | __u32 softirq_pending; /* 0x038c */ |
126 | __u64 percpu_offset; /* 0x0378 */ | 129 | __u64 percpu_offset; /* 0x0390 */ |
127 | __u64 vdso_per_cpu_data; /* 0x0380 */ | 130 | __u64 vdso_per_cpu_data; /* 0x0398 */ |
128 | __u64 machine_flags; /* 0x0388 */ | 131 | __u64 machine_flags; /* 0x03a0 */ |
129 | __u32 preempt_count; /* 0x0390 */ | 132 | __u32 preempt_count; /* 0x03a8 */ |
130 | __u8 pad_0x0394[0x0398-0x0394]; /* 0x0394 */ | 133 | __u8 pad_0x03ac[0x03b0-0x03ac]; /* 0x03ac */ |
131 | __u64 gmap; /* 0x0398 */ | 134 | __u64 gmap; /* 0x03b0 */ |
132 | __u32 spinlock_lockval; /* 0x03a0 */ | 135 | __u32 spinlock_lockval; /* 0x03b8 */ |
133 | __u32 fpu_flags; /* 0x03a4 */ | 136 | __u32 fpu_flags; /* 0x03bc */ |
134 | __u8 pad_0x03a8[0x0400-0x03a8]; /* 0x03a8 */ | 137 | __u8 pad_0x03c0[0x0400-0x03c0]; /* 0x03c0 */ |
135 | 138 | ||
136 | /* Per cpu primary space access list */ | 139 | /* Per cpu primary space access list */ |
137 | __u32 paste[16]; /* 0x0400 */ | 140 | __u32 paste[16]; /* 0x0400 */ |
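
The offset renumbering above follows directly from the three new __u64 fields (guest_timer, hardirq_timer, softirq_timer): 3 * 8 bytes = 0x18, so everything from current_task onwards shifts by that amount. A quick arithmetic check, not part of the patch:

	#include <stdio.h>

	int main(void)
	{
		unsigned long shift = 3 * sizeof(unsigned long long); /* guest_timer + hardirq_timer + softirq_timer */
		unsigned long old_off[] = { 0x310, 0x320, 0x388 };    /* current_task, kernel_stack, machine_flags */
		unsigned int i;

		printf("shift = %#lx\n", shift);                      /* 0x18 */
		for (i = 0; i < 3; i++)
			printf("%#lx -> %#lx\n", old_off[i], old_off[i] + shift);
		/* 0x310 -> 0x328, 0x320 -> 0x338, 0x388 -> 0x3a0, matching the new comments */
		return 0;
	}
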
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 6bca916a5ba0..977a5b6501b8 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h | |||
@@ -111,7 +111,10 @@ struct thread_struct { | |||
111 | unsigned int acrs[NUM_ACRS]; | 111 | unsigned int acrs[NUM_ACRS]; |
112 | unsigned long ksp; /* kernel stack pointer */ | 112 | unsigned long ksp; /* kernel stack pointer */ |
113 | unsigned long user_timer; /* task cputime in user space */ | 113 | unsigned long user_timer; /* task cputime in user space */ |
114 | unsigned long guest_timer; /* task cputime in kvm guest */ | ||
114 | unsigned long system_timer; /* task cputime in kernel space */ | 115 | unsigned long system_timer; /* task cputime in kernel space */ |
116 | unsigned long hardirq_timer; /* task cputime in hardirq context */ | ||
117 | unsigned long softirq_timer; /* task cputime in softirq context */ | ||
115 | unsigned long sys_call_table; /* system call table address */ | 118 | unsigned long sys_call_table; /* system call table address */ |
116 | mm_segment_t mm_segment; | 119 | mm_segment_t mm_segment; |
117 | unsigned long gmap_addr; /* address of last gmap fault. */ | 120 | unsigned long gmap_addr; /* address of last gmap fault. */ |
diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index 7a55c29b0b33..d3bf69ef42cf 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c | |||
@@ -12,7 +12,7 @@ | |||
12 | #include <linux/notifier.h> | 12 | #include <linux/notifier.h> |
13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <asm/cputime.h> | 15 | #include <linux/cputime.h> |
16 | #include <asm/nmi.h> | 16 | #include <asm/nmi.h> |
17 | #include <asm/smp.h> | 17 | #include <asm/smp.h> |
18 | #include "entry.h" | 18 | #include "entry.h" |
@@ -43,7 +43,7 @@ void enabled_wait(void) | |||
43 | idle->clock_idle_enter = idle->clock_idle_exit = 0ULL; | 43 | idle->clock_idle_enter = idle->clock_idle_exit = 0ULL; |
44 | idle->idle_time += idle_time; | 44 | idle->idle_time += idle_time; |
45 | idle->idle_count++; | 45 | idle->idle_count++; |
46 | account_idle_time(idle_time); | 46 | account_idle_time(cputime_to_nsecs(idle_time)); |
47 | write_seqcount_end(&idle->seqcount); | 47 | write_seqcount_end(&idle->seqcount); |
48 | } | 48 | } |
49 | NOKPROBE_SYMBOL(enabled_wait); | 49 | NOKPROBE_SYMBOL(enabled_wait); |
@@ -84,7 +84,7 @@ static ssize_t show_idle_time(struct device *dev, | |||
84 | } | 84 | } |
85 | DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); | 85 | DEVICE_ATTR(idle_time_us, 0444, show_idle_time, NULL); |
86 | 86 | ||
87 | cputime64_t arch_cpu_idle_time(int cpu) | 87 | u64 arch_cpu_idle_time(int cpu) |
88 | { | 88 | { |
89 | struct s390_idle_data *idle = &per_cpu(s390_idle, cpu); | 89 | struct s390_idle_data *idle = &per_cpu(s390_idle, cpu); |
90 | unsigned long long now, idle_enter, idle_exit; | 90 | unsigned long long now, idle_enter, idle_exit; |
@@ -96,7 +96,8 @@ cputime64_t arch_cpu_idle_time(int cpu) | |||
96 | idle_enter = ACCESS_ONCE(idle->clock_idle_enter); | 96 | idle_enter = ACCESS_ONCE(idle->clock_idle_enter); |
97 | idle_exit = ACCESS_ONCE(idle->clock_idle_exit); | 97 | idle_exit = ACCESS_ONCE(idle->clock_idle_exit); |
98 | } while (read_seqcount_retry(&idle->seqcount, seq)); | 98 | } while (read_seqcount_retry(&idle->seqcount, seq)); |
99 | return idle_enter ? ((idle_exit ?: now) - idle_enter) : 0; | 99 | |
100 | return cputime_to_nsecs(idle_enter ? ((idle_exit ?: now) - idle_enter) : 0); | ||
100 | } | 101 | } |
101 | 102 | ||
102 | void arch_cpu_idle_enter(void) | 103 | void arch_cpu_idle_enter(void) |
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 1b5c5ee9fc1b..b4a3e9e06ef2 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c | |||
@@ -6,13 +6,13 @@ | |||
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/kernel_stat.h> | 8 | #include <linux/kernel_stat.h> |
9 | #include <linux/cputime.h> | ||
9 | #include <linux/export.h> | 10 | #include <linux/export.h> |
10 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
11 | #include <linux/timex.h> | 12 | #include <linux/timex.h> |
12 | #include <linux/types.h> | 13 | #include <linux/types.h> |
13 | #include <linux/time.h> | 14 | #include <linux/time.h> |
14 | 15 | ||
15 | #include <asm/cputime.h> | ||
16 | #include <asm/vtimer.h> | 16 | #include <asm/vtimer.h> |
17 | #include <asm/vtime.h> | 17 | #include <asm/vtime.h> |
18 | #include <asm/cpu_mf.h> | 18 | #include <asm/cpu_mf.h> |
@@ -90,14 +90,41 @@ static void update_mt_scaling(void) | |||
90 | __this_cpu_write(mt_scaling_jiffies, jiffies_64); | 90 | __this_cpu_write(mt_scaling_jiffies, jiffies_64); |
91 | } | 91 | } |
92 | 92 | ||
93 | static inline u64 update_tsk_timer(unsigned long *tsk_vtime, u64 new) | ||
94 | { | ||
95 | u64 delta; | ||
96 | |||
97 | delta = new - *tsk_vtime; | ||
98 | *tsk_vtime = new; | ||
99 | return delta; | ||
100 | } | ||
101 | |||
102 | |||
103 | static inline u64 scale_vtime(u64 vtime) | ||
104 | { | ||
105 | u64 mult = __this_cpu_read(mt_scaling_mult); | ||
106 | u64 div = __this_cpu_read(mt_scaling_div); | ||
107 | |||
108 | if (smp_cpu_mtid) | ||
109 | return vtime * mult / div; | ||
110 | return vtime; | ||
111 | } | ||
112 | |||
113 | static void account_system_index_scaled(struct task_struct *p, | ||
114 | cputime_t cputime, cputime_t scaled, | ||
115 | enum cpu_usage_stat index) | ||
116 | { | ||
117 | p->stimescaled += cputime_to_nsecs(scaled); | ||
118 | account_system_index_time(p, cputime_to_nsecs(cputime), index); | ||
119 | } | ||
120 | |||
93 | /* | 121 | /* |
94 | * Update process times based on virtual cpu times stored by entry.S | 122 | * Update process times based on virtual cpu times stored by entry.S |
95 | * to the lowcore fields user_timer, system_timer & steal_clock. | 123 | * to the lowcore fields user_timer, system_timer & steal_clock. |
96 | */ | 124 | */ |
97 | static int do_account_vtime(struct task_struct *tsk) | 125 | static int do_account_vtime(struct task_struct *tsk) |
98 | { | 126 | { |
99 | u64 timer, clock, user, system, steal; | 127 | u64 timer, clock, user, guest, system, hardirq, softirq, steal; |
100 | u64 user_scaled, system_scaled; | ||
101 | 128 | ||
102 | timer = S390_lowcore.last_update_timer; | 129 | timer = S390_lowcore.last_update_timer; |
103 | clock = S390_lowcore.last_update_clock; | 130 | clock = S390_lowcore.last_update_clock; |
@@ -110,53 +137,76 @@ static int do_account_vtime(struct task_struct *tsk) | |||
110 | #endif | 137 | #endif |
111 | : "=m" (S390_lowcore.last_update_timer), | 138 | : "=m" (S390_lowcore.last_update_timer), |
112 | "=m" (S390_lowcore.last_update_clock)); | 139 | "=m" (S390_lowcore.last_update_clock)); |
113 | S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer; | 140 | clock = S390_lowcore.last_update_clock - clock; |
114 | S390_lowcore.steal_timer += S390_lowcore.last_update_clock - clock; | 141 | timer -= S390_lowcore.last_update_timer; |
142 | |||
143 | if (hardirq_count()) | ||
144 | S390_lowcore.hardirq_timer += timer; | ||
145 | else | ||
146 | S390_lowcore.system_timer += timer; | ||
115 | 147 | ||
116 | /* Update MT utilization calculation */ | 148 | /* Update MT utilization calculation */ |
117 | if (smp_cpu_mtid && | 149 | if (smp_cpu_mtid && |
118 | time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) | 150 | time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) |
119 | update_mt_scaling(); | 151 | update_mt_scaling(); |
120 | 152 | ||
121 | user = S390_lowcore.user_timer - tsk->thread.user_timer; | 153 | /* Calculate cputime delta */ |
122 | S390_lowcore.steal_timer -= user; | 154 | user = update_tsk_timer(&tsk->thread.user_timer, |
123 | tsk->thread.user_timer = S390_lowcore.user_timer; | 155 | READ_ONCE(S390_lowcore.user_timer)); |
124 | 156 | guest = update_tsk_timer(&tsk->thread.guest_timer, | |
125 | system = S390_lowcore.system_timer - tsk->thread.system_timer; | 157 | READ_ONCE(S390_lowcore.guest_timer)); |
126 | S390_lowcore.steal_timer -= system; | 158 | system = update_tsk_timer(&tsk->thread.system_timer, |
127 | tsk->thread.system_timer = S390_lowcore.system_timer; | 159 | READ_ONCE(S390_lowcore.system_timer)); |
128 | 160 | hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, | |
129 | user_scaled = user; | 161 | READ_ONCE(S390_lowcore.hardirq_timer)); |
130 | system_scaled = system; | 162 | softirq = update_tsk_timer(&tsk->thread.softirq_timer, |
131 | /* Do MT utilization scaling */ | 163 | READ_ONCE(S390_lowcore.softirq_timer)); |
132 | if (smp_cpu_mtid) { | 164 | S390_lowcore.steal_timer += |
133 | u64 mult = __this_cpu_read(mt_scaling_mult); | 165 | clock - user - guest - system - hardirq - softirq; |
134 | u64 div = __this_cpu_read(mt_scaling_div); | 166 | |
167 | /* Push account value */ | ||
168 | if (user) { | ||
169 | account_user_time(tsk, cputime_to_nsecs(user)); | ||
170 | tsk->utimescaled += cputime_to_nsecs(scale_vtime(user)); | ||
171 | } | ||
135 | 172 | ||
136 | user_scaled = (user_scaled * mult) / div; | 173 | if (guest) { |
137 | system_scaled = (system_scaled * mult) / div; | 174 | account_guest_time(tsk, cputime_to_nsecs(guest)); |
175 | tsk->utimescaled += cputime_to_nsecs(scale_vtime(guest)); | ||
138 | } | 176 | } |
139 | account_user_time(tsk, user); | 177 | |
140 | tsk->utimescaled += user_scaled; | 178 | if (system) |
141 | account_system_time(tsk, 0, system); | 179 | account_system_index_scaled(tsk, system, scale_vtime(system), |
142 | tsk->stimescaled += system_scaled; | 180 | CPUTIME_SYSTEM); |
181 | if (hardirq) | ||
182 | account_system_index_scaled(tsk, hardirq, scale_vtime(hardirq), | ||
183 | CPUTIME_IRQ); | ||
184 | if (softirq) | ||
185 | account_system_index_scaled(tsk, softirq, scale_vtime(softirq), | ||
186 | CPUTIME_SOFTIRQ); | ||
143 | 187 | ||
144 | steal = S390_lowcore.steal_timer; | 188 | steal = S390_lowcore.steal_timer; |
145 | if ((s64) steal > 0) { | 189 | if ((s64) steal > 0) { |
146 | S390_lowcore.steal_timer = 0; | 190 | S390_lowcore.steal_timer = 0; |
147 | account_steal_time(steal); | 191 | account_steal_time(cputime_to_nsecs(steal)); |
148 | } | 192 | } |
149 | 193 | ||
150 | return virt_timer_forward(user + system); | 194 | return virt_timer_forward(user + guest + system + hardirq + softirq); |
151 | } | 195 | } |
152 | 196 | ||
153 | void vtime_task_switch(struct task_struct *prev) | 197 | void vtime_task_switch(struct task_struct *prev) |
154 | { | 198 | { |
155 | do_account_vtime(prev); | 199 | do_account_vtime(prev); |
156 | prev->thread.user_timer = S390_lowcore.user_timer; | 200 | prev->thread.user_timer = S390_lowcore.user_timer; |
201 | prev->thread.guest_timer = S390_lowcore.guest_timer; | ||
157 | prev->thread.system_timer = S390_lowcore.system_timer; | 202 | prev->thread.system_timer = S390_lowcore.system_timer; |
203 | prev->thread.hardirq_timer = S390_lowcore.hardirq_timer; | ||
204 | prev->thread.softirq_timer = S390_lowcore.softirq_timer; | ||
158 | S390_lowcore.user_timer = current->thread.user_timer; | 205 | S390_lowcore.user_timer = current->thread.user_timer; |
206 | S390_lowcore.guest_timer = current->thread.guest_timer; | ||
159 | S390_lowcore.system_timer = current->thread.system_timer; | 207 | S390_lowcore.system_timer = current->thread.system_timer; |
208 | S390_lowcore.hardirq_timer = current->thread.hardirq_timer; | ||
209 | S390_lowcore.softirq_timer = current->thread.softirq_timer; | ||
160 | } | 210 | } |
161 | 211 | ||
162 | /* | 212 | /* |
@@ -164,7 +214,7 @@ void vtime_task_switch(struct task_struct *prev) | |||
164 | * accounting system time in order to correctly compute | 214 | * accounting system time in order to correctly compute |
165 | * the stolen time accounting. | 215 | * the stolen time accounting. |
166 | */ | 216 | */ |
167 | void vtime_account_user(struct task_struct *tsk) | 217 | void vtime_flush(struct task_struct *tsk) |
168 | { | 218 | { |
169 | if (do_account_vtime(tsk)) | 219 | if (do_account_vtime(tsk)) |
170 | virt_timer_expire(); | 220 | virt_timer_expire(); |
@@ -176,32 +226,22 @@ void vtime_account_user(struct task_struct *tsk) | |||
176 | */ | 226 | */ |
177 | void vtime_account_irq_enter(struct task_struct *tsk) | 227 | void vtime_account_irq_enter(struct task_struct *tsk) |
178 | { | 228 | { |
179 | u64 timer, system, system_scaled; | 229 | u64 timer; |
180 | 230 | ||
181 | timer = S390_lowcore.last_update_timer; | 231 | timer = S390_lowcore.last_update_timer; |
182 | S390_lowcore.last_update_timer = get_vtimer(); | 232 | S390_lowcore.last_update_timer = get_vtimer(); |
183 | S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer; | 233 | timer -= S390_lowcore.last_update_timer; |
184 | 234 | ||
185 | /* Update MT utilization calculation */ | 235 | if ((tsk->flags & PF_VCPU) && (irq_count() == 0)) |
186 | if (smp_cpu_mtid && | 236 | S390_lowcore.guest_timer += timer; |
187 | time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) | 237 | else if (hardirq_count()) |
188 | update_mt_scaling(); | 238 | S390_lowcore.hardirq_timer += timer; |
189 | 239 | else if (in_serving_softirq()) | |
190 | system = S390_lowcore.system_timer - tsk->thread.system_timer; | 240 | S390_lowcore.softirq_timer += timer; |
191 | S390_lowcore.steal_timer -= system; | 241 | else |
192 | tsk->thread.system_timer = S390_lowcore.system_timer; | 242 | S390_lowcore.system_timer += timer; |
193 | system_scaled = system; | 243 | |
194 | /* Do MT utilization scaling */ | 244 | virt_timer_forward(timer); |
195 | if (smp_cpu_mtid) { | ||
196 | u64 mult = __this_cpu_read(mt_scaling_mult); | ||
197 | u64 div = __this_cpu_read(mt_scaling_div); | ||
198 | |||
199 | system_scaled = (system_scaled * mult) / div; | ||
200 | } | ||
201 | account_system_time(tsk, 0, system); | ||
202 | tsk->stimescaled += system_scaled; | ||
203 | |||
204 | virt_timer_forward(system); | ||
205 | } | 245 | } |
206 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); | 246 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); |
207 | 247 | ||
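
The reworked do_account_vtime()/vtime_account_irq_enter() pair classifies each CPU-timer delta by context (guest, hardirq, softirq, or plain system time) and derives steal time as whatever wall-clock time the CPU timer did not cover. A compressed, userspace-style sketch of that bucketing; the struct and flag names here are invented for the illustration:

	#include <stdio.h>
	#include <stdint.h>
	#include <stdbool.h>

	/* illustration only: stand-ins for the per-cpu lowcore timer fields */
	struct acct_buckets {
		uint64_t user, guest, system, hardirq, softirq, steal;
	};

	/* charge one CPU-timer delta to the bucket matching the current context */
	static void charge_delta(struct acct_buckets *a, uint64_t delta,
				 bool in_guest, bool in_hardirq, bool in_softirq)
	{
		if (in_guest)
			a->guest += delta;
		else if (in_hardirq)
			a->hardirq += delta;
		else if (in_softirq)
			a->softirq += delta;
		else
			a->system += delta;
	}

	int main(void)
	{
		struct acct_buckets a = { .user = 700 };
		uint64_t wall_delta = 1000;                 /* elapsed wall clock */

		charge_delta(&a, 150, false, true, false);  /* hardirq context */
		charge_delta(&a, 100, false, false, false); /* plain kernel context */

		/* time the CPU timer did not see was stolen by the hypervisor */
		a.steal = wall_delta - (a.user + a.guest + a.system +
					a.hardirq + a.softirq);
		printf("steal = %llu\n", (unsigned long long)a.steal); /* 50 */
		return 0;
	}
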
diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild index a05218ff3fe4..51970bb6c4fe 100644 --- a/arch/score/include/asm/Kbuild +++ b/arch/score/include/asm/Kbuild | |||
@@ -4,7 +4,6 @@ header-y += | |||
4 | 4 | ||
5 | generic-y += barrier.h | 5 | generic-y += barrier.h |
6 | generic-y += clkdev.h | 6 | generic-y += clkdev.h |
7 | generic-y += cputime.h | ||
8 | generic-y += irq_work.h | 7 | generic-y += irq_work.h |
9 | generic-y += mcs_spinlock.h | 8 | generic-y += mcs_spinlock.h |
10 | generic-y += mm-arch-hooks.h | 9 | generic-y += mm-arch-hooks.h |
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 751c3373a92c..cf2a75063b53 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild | |||
@@ -1,7 +1,6 @@ | |||
1 | 1 | ||
2 | generic-y += bitsperlong.h | 2 | generic-y += bitsperlong.h |
3 | generic-y += clkdev.h | 3 | generic-y += clkdev.h |
4 | generic-y += cputime.h | ||
5 | generic-y += current.h | 4 | generic-y += current.h |
6 | generic-y += delay.h | 5 | generic-y += delay.h |
7 | generic-y += div64.h | 6 | generic-y += div64.h |
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 0569bfac4afb..e9e837bc3158 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild | |||
@@ -2,7 +2,6 @@ | |||
2 | 2 | ||
3 | 3 | ||
4 | generic-y += clkdev.h | 4 | generic-y += clkdev.h |
5 | generic-y += cputime.h | ||
6 | generic-y += div64.h | 5 | generic-y += div64.h |
7 | generic-y += emergency-restart.h | 6 | generic-y += emergency-restart.h |
8 | generic-y += exec.h | 7 | generic-y += exec.h |
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild index 20f2ba6d79be..aa48b6eaff2d 100644 --- a/arch/tile/include/asm/Kbuild +++ b/arch/tile/include/asm/Kbuild | |||
@@ -4,7 +4,6 @@ header-y += ../arch/ | |||
4 | generic-y += bug.h | 4 | generic-y += bug.h |
5 | generic-y += bugs.h | 5 | generic-y += bugs.h |
6 | generic-y += clkdev.h | 6 | generic-y += clkdev.h |
7 | generic-y += cputime.h | ||
8 | generic-y += emergency-restart.h | 7 | generic-y += emergency-restart.h |
9 | generic-y += errno.h | 8 | generic-y += errno.h |
10 | generic-y += exec.h | 9 | generic-y += exec.h |
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 052f7f6d0551..90c281cd7e1d 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild | |||
@@ -1,7 +1,6 @@ | |||
1 | generic-y += barrier.h | 1 | generic-y += barrier.h |
2 | generic-y += bug.h | 2 | generic-y += bug.h |
3 | generic-y += clkdev.h | 3 | generic-y += clkdev.h |
4 | generic-y += cputime.h | ||
5 | generic-y += current.h | 4 | generic-y += current.h |
6 | generic-y += delay.h | 5 | generic-y += delay.h |
7 | generic-y += device.h | 6 | generic-y += device.h |
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index 256c45b3ae34..5d51ade89f4c 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild | |||
@@ -4,7 +4,6 @@ generic-y += auxvec.h | |||
4 | generic-y += bitsperlong.h | 4 | generic-y += bitsperlong.h |
5 | generic-y += bugs.h | 5 | generic-y += bugs.h |
6 | generic-y += clkdev.h | 6 | generic-y += clkdev.h |
7 | generic-y += cputime.h | ||
8 | generic-y += current.h | 7 | generic-y += current.h |
9 | generic-y += device.h | 8 | generic-y += device.h |
10 | generic-y += div64.h | 9 | generic-y += div64.h |
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 2b892e2313a9..5d6a53fd7521 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild | |||
@@ -7,7 +7,6 @@ generated-y += unistd_64_x32.h | |||
7 | generated-y += xen-hypercalls.h | 7 | generated-y += xen-hypercalls.h |
8 | 8 | ||
9 | generic-y += clkdev.h | 9 | generic-y += clkdev.h |
10 | generic-y += cputime.h | ||
11 | generic-y += dma-contiguous.h | 10 | generic-y += dma-contiguous.h |
12 | generic-y += early_ioremap.h | 11 | generic-y += early_ioremap.h |
13 | generic-y += mcs_spinlock.h | 12 | generic-y += mcs_spinlock.h |
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 45d44c173cf9..4a7080c84a5a 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c | |||
@@ -905,8 +905,8 @@ static int apm_cpu_idle(struct cpuidle_device *dev, | |||
905 | { | 905 | { |
906 | static int use_apm_idle; /* = 0 */ | 906 | static int use_apm_idle; /* = 0 */ |
907 | static unsigned int last_jiffies; /* = 0 */ | 907 | static unsigned int last_jiffies; /* = 0 */ |
908 | static unsigned int last_stime; /* = 0 */ | 908 | static u64 last_stime; /* = 0 */ |
909 | cputime_t stime, utime; | 909 | u64 stime, utime; |
910 | 910 | ||
911 | int apm_idle_done = 0; | 911 | int apm_idle_done = 0; |
912 | unsigned int jiffies_since_last_check = jiffies - last_jiffies; | 912 | unsigned int jiffies_since_last_check = jiffies - last_jiffies; |
@@ -919,7 +919,7 @@ recalc: | |||
919 | } else if (jiffies_since_last_check > idle_period) { | 919 | } else if (jiffies_since_last_check > idle_period) { |
920 | unsigned int idle_percentage; | 920 | unsigned int idle_percentage; |
921 | 921 | ||
922 | idle_percentage = cputime_to_jiffies(stime - last_stime); | 922 | idle_percentage = nsecs_to_jiffies(stime - last_stime); |
923 | idle_percentage *= 100; | 923 | idle_percentage *= 100; |
924 | idle_percentage /= jiffies_since_last_check; | 924 | idle_percentage /= jiffies_since_last_check; |
925 | use_apm_idle = (idle_percentage > idle_threshold); | 925 | use_apm_idle = (idle_percentage > idle_threshold); |
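
apm_cpu_idle() now tracks the task's system time in nanoseconds and converts the delta to jiffies before forming the percentage. The same calculation written out as a standalone sketch, with HZ=100 assumed and the numbers invented for the example:

	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_SEC 1000000000ULL
	#define HZ 100ULL    /* assumed */

	int main(void)
	{
		uint64_t stime_delta_ns = 300000000ULL;       /* 0.3 s since the last check */
		unsigned int jiffies_since_last_check = 100;  /* 1 s at HZ=100 */

		unsigned int idle_percentage =
			(unsigned int)(stime_delta_ns / (NSEC_PER_SEC / HZ)); /* 30 jiffies */
		idle_percentage *= 100;
		idle_percentage /= jiffies_since_last_check;

		printf("%u%%\n", idle_percentage);  /* 30 */
		return 0;
	}
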
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2b4cf04239b6..4e95b2e0d95f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -555,8 +555,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) | |||
555 | if (c->x86_power & (1 << 8)) { | 555 | if (c->x86_power & (1 << 8)) { |
556 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | 556 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); |
557 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); | 557 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); |
558 | if (!check_tsc_unstable()) | 558 | if (check_tsc_unstable()) |
559 | set_sched_clock_stable(); | 559 | clear_sched_clock_stable(); |
560 | } else { | ||
561 | clear_sched_clock_stable(); | ||
560 | } | 562 | } |
561 | 563 | ||
562 | /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ | 564 | /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */ |
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 1661d8ec9280..2c234a6d94c4 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c | |||
@@ -1,5 +1,5 @@ | |||
1 | #include <linux/bitops.h> | 1 | |
2 | #include <linux/kernel.h> | 2 | #include <linux/sched.h> |
3 | 3 | ||
4 | #include <asm/cpufeature.h> | 4 | #include <asm/cpufeature.h> |
5 | #include <asm/e820.h> | 5 | #include <asm/e820.h> |
@@ -104,6 +104,8 @@ static void early_init_centaur(struct cpuinfo_x86 *c) | |||
104 | #ifdef CONFIG_X86_64 | 104 | #ifdef CONFIG_X86_64 |
105 | set_cpu_cap(c, X86_FEATURE_SYSENTER32); | 105 | set_cpu_cap(c, X86_FEATURE_SYSENTER32); |
106 | #endif | 106 | #endif |
107 | |||
108 | clear_sched_clock_stable(); | ||
107 | } | 109 | } |
108 | 110 | ||
109 | static void init_centaur(struct cpuinfo_x86 *c) | 111 | static void init_centaur(struct cpuinfo_x86 *c) |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ede03e849a8b..3bcf6d880611 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -83,6 +83,7 @@ static void default_init(struct cpuinfo_x86 *c) | |||
83 | strcpy(c->x86_model_id, "386"); | 83 | strcpy(c->x86_model_id, "386"); |
84 | } | 84 | } |
85 | #endif | 85 | #endif |
86 | clear_sched_clock_stable(); | ||
86 | } | 87 | } |
87 | 88 | ||
88 | static const struct cpu_dev default_cpu = { | 89 | static const struct cpu_dev default_cpu = { |
@@ -1056,6 +1057,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) | |||
1056 | */ | 1057 | */ |
1057 | if (this_cpu->c_init) | 1058 | if (this_cpu->c_init) |
1058 | this_cpu->c_init(c); | 1059 | this_cpu->c_init(c); |
1060 | else | ||
1061 | clear_sched_clock_stable(); | ||
1059 | 1062 | ||
1060 | /* Disable the PN if appropriate */ | 1063 | /* Disable the PN if appropriate */ |
1061 | squash_the_stupid_serial_number(c); | 1064 | squash_the_stupid_serial_number(c); |
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index bd9dcd6b712d..47416f959a48 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <asm/pci-direct.h> | 9 | #include <asm/pci-direct.h> |
10 | #include <asm/tsc.h> | 10 | #include <asm/tsc.h> |
11 | #include <asm/cpufeature.h> | 11 | #include <asm/cpufeature.h> |
12 | #include <linux/sched.h> | ||
12 | 13 | ||
13 | #include "cpu.h" | 14 | #include "cpu.h" |
14 | 15 | ||
@@ -183,6 +184,7 @@ static void early_init_cyrix(struct cpuinfo_x86 *c) | |||
183 | set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); | 184 | set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); |
184 | break; | 185 | break; |
185 | } | 186 | } |
187 | clear_sched_clock_stable(); | ||
186 | } | 188 | } |
187 | 189 | ||
188 | static void init_cyrix(struct cpuinfo_x86 *c) | 190 | static void init_cyrix(struct cpuinfo_x86 *c) |
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 203f860d2ab3..026c728d6ba7 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c | |||
@@ -119,8 +119,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) | |||
119 | if (c->x86_power & (1 << 8)) { | 119 | if (c->x86_power & (1 << 8)) { |
120 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | 120 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); |
121 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); | 121 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); |
122 | if (!check_tsc_unstable()) | 122 | if (check_tsc_unstable()) |
123 | set_sched_clock_stable(); | 123 | clear_sched_clock_stable(); |
124 | } else { | ||
125 | clear_sched_clock_stable(); | ||
124 | } | 126 | } |
125 | 127 | ||
126 | /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ | 128 | /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ |
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 34178564be2a..c1ea5b999839 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c | |||
@@ -1,4 +1,5 @@ | |||
1 | #include <linux/kernel.h> | 1 | #include <linux/kernel.h> |
2 | #include <linux/sched.h> | ||
2 | #include <linux/mm.h> | 3 | #include <linux/mm.h> |
3 | #include <asm/cpufeature.h> | 4 | #include <asm/cpufeature.h> |
4 | #include <asm/msr.h> | 5 | #include <asm/msr.h> |
@@ -14,6 +15,8 @@ static void early_init_transmeta(struct cpuinfo_x86 *c) | |||
14 | if (xlvl >= 0x80860001) | 15 | if (xlvl >= 0x80860001) |
15 | c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); | 16 | c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); |
16 | } | 17 | } |
18 | |||
19 | clear_sched_clock_stable(); | ||
17 | } | 20 | } |
18 | 21 | ||
19 | static void init_transmeta(struct cpuinfo_x86 *c) | 22 | static void init_transmeta(struct cpuinfo_x86 *c) |
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index cb9c1ed1d391..f73f475d0573 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c | |||
@@ -132,10 +132,8 @@ int sched_set_itmt_support(void) | |||
132 | 132 | ||
133 | sysctl_sched_itmt_enabled = 1; | 133 | sysctl_sched_itmt_enabled = 1; |
134 | 134 | ||
135 | if (sysctl_sched_itmt_enabled) { | 135 | x86_topology_update = true; |
136 | x86_topology_update = true; | 136 | rebuild_sched_domains(); |
137 | rebuild_sched_domains(); | ||
138 | } | ||
139 | 137 | ||
140 | mutex_unlock(&itmt_update_mutex); | 138 | mutex_unlock(&itmt_update_mutex); |
141 | 139 | ||
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2a5cafdf8808..542710b99f52 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c | |||
@@ -107,12 +107,12 @@ static inline void kvm_sched_clock_init(bool stable) | |||
107 | { | 107 | { |
108 | if (!stable) { | 108 | if (!stable) { |
109 | pv_time_ops.sched_clock = kvm_clock_read; | 109 | pv_time_ops.sched_clock = kvm_clock_read; |
110 | clear_sched_clock_stable(); | ||
110 | return; | 111 | return; |
111 | } | 112 | } |
112 | 113 | ||
113 | kvm_sched_clock_offset = kvm_clock_read(); | 114 | kvm_sched_clock_offset = kvm_clock_read(); |
114 | pv_time_ops.sched_clock = kvm_sched_clock_read; | 115 | pv_time_ops.sched_clock = kvm_sched_clock_read; |
115 | set_sched_clock_stable(); | ||
116 | 116 | ||
117 | printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", | 117 | printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", |
118 | kvm_sched_clock_offset); | 118 | kvm_sched_clock_offset); |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 37e7cf544e51..2724dc82f992 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -1107,6 +1107,16 @@ static u64 read_tsc(struct clocksource *cs) | |||
1107 | return (u64)rdtsc_ordered(); | 1107 | return (u64)rdtsc_ordered(); |
1108 | } | 1108 | } |
1109 | 1109 | ||
1110 | static void tsc_cs_mark_unstable(struct clocksource *cs) | ||
1111 | { | ||
1112 | if (tsc_unstable) | ||
1113 | return; | ||
1114 | tsc_unstable = 1; | ||
1115 | clear_sched_clock_stable(); | ||
1116 | disable_sched_clock_irqtime(); | ||
1117 | pr_info("Marking TSC unstable due to clocksource watchdog\n"); | ||
1118 | } | ||
1119 | |||
1110 | /* | 1120 | /* |
1111 | * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() | 1121 | * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() |
1112 | */ | 1122 | */ |
@@ -1119,6 +1129,7 @@ static struct clocksource clocksource_tsc = { | |||
1119 | CLOCK_SOURCE_MUST_VERIFY, | 1129 | CLOCK_SOURCE_MUST_VERIFY, |
1120 | .archdata = { .vclock_mode = VCLOCK_TSC }, | 1130 | .archdata = { .vclock_mode = VCLOCK_TSC }, |
1121 | .resume = tsc_resume, | 1131 | .resume = tsc_resume, |
1132 | .mark_unstable = tsc_cs_mark_unstable, | ||
1122 | }; | 1133 | }; |
1123 | 1134 | ||
1124 | void mark_tsc_unstable(char *reason) | 1135 | void mark_tsc_unstable(char *reason) |
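
tsc_cs_mark_unstable() hangs off the clocksource's new .mark_unstable hook, so the clocksource watchdog can tell the TSC code to drop sched_clock stability and irqtime accounting as soon as it declares the TSC unstable. The general callback shape, reduced to a hypothetical standalone sketch (names invented, kernel calls stubbed out as comments):

	#include <stdio.h>
	#include <stdbool.h>

	struct clocksource_sketch {
		const char *name;
		void (*mark_unstable)(struct clocksource_sketch *cs);
	};

	static bool tsc_unstable;

	static void tsc_mark_unstable(struct clocksource_sketch *cs)
	{
		if (tsc_unstable)
			return;          /* idempotent, as in tsc_cs_mark_unstable() */
		tsc_unstable = true;
		printf("Marking %s unstable due to clocksource watchdog\n", cs->name);
		/* the real callback also calls clear_sched_clock_stable() and
		   disable_sched_clock_irqtime() here */
	}

	static void watchdog_detects_skew(struct clocksource_sketch *cs)
	{
		if (cs->mark_unstable)
			cs->mark_unstable(cs);
	}

	int main(void)
	{
		struct clocksource_sketch tsc = { "tsc", tsc_mark_unstable };

		watchdog_detects_skew(&tsc);
		watchdog_detects_skew(&tsc);   /* second call is a no-op */
		return 0;
	}
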
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 1572c35b4f1a..2ecd7dab4631 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c | |||
@@ -964,10 +964,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, | |||
964 | /* Calculate cpu time spent by current task in 100ns units */ | 964 | /* Calculate cpu time spent by current task in 100ns units */ |
965 | static u64 current_task_runtime_100ns(void) | 965 | static u64 current_task_runtime_100ns(void) |
966 | { | 966 | { |
967 | cputime_t utime, stime; | 967 | u64 utime, stime; |
968 | 968 | ||
969 | task_cputime_adjusted(current, &utime, &stime); | 969 | task_cputime_adjusted(current, &utime, &stime); |
970 | return div_u64(cputime_to_nsecs(utime + stime), 100); | 970 | |
971 | return div_u64(utime + stime, 100); | ||
971 | } | 972 | } |
972 | 973 | ||
973 | static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) | 974 | static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) |
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index b7fbaa56b51a..9e9760b20be5 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild | |||
@@ -1,7 +1,6 @@ | |||
1 | generic-y += bitsperlong.h | 1 | generic-y += bitsperlong.h |
2 | generic-y += bug.h | 2 | generic-y += bug.h |
3 | generic-y += clkdev.h | 3 | generic-y += clkdev.h |
4 | generic-y += cputime.h | ||
5 | generic-y += div64.h | 4 | generic-y += div64.h |
6 | generic-y += dma-contiguous.h | 5 | generic-y += dma-contiguous.h |
7 | generic-y += emergency-restart.h | 6 | generic-y += emergency-restart.h |
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index cc475eff90b3..3e9b319a2e79 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c | |||
@@ -132,7 +132,7 @@ static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) | |||
132 | u64 cur_wall_time; | 132 | u64 cur_wall_time; |
133 | u64 busy_time; | 133 | u64 busy_time; |
134 | 134 | ||
135 | cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); | 135 | cur_wall_time = jiffies64_to_nsecs(get_jiffies_64()); |
136 | 136 | ||
137 | busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; | 137 | busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; |
138 | busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; | 138 | busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; |
@@ -143,9 +143,9 @@ static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) | |||
143 | 143 | ||
144 | idle_time = cur_wall_time - busy_time; | 144 | idle_time = cur_wall_time - busy_time; |
145 | if (wall) | 145 | if (wall) |
146 | *wall = cputime_to_usecs(cur_wall_time); | 146 | *wall = div_u64(cur_wall_time, NSEC_PER_USEC); |
147 | 147 | ||
148 | return cputime_to_usecs(idle_time); | 148 | return div_u64(idle_time, NSEC_PER_USEC); |
149 | } | 149 | } |
150 | 150 | ||
151 | u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy) | 151 | u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy) |
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 0196467280bd..631bd2c86c5e 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c | |||
@@ -152,7 +152,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy) | |||
152 | if (ignore_nice) { | 152 | if (ignore_nice) { |
153 | u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; | 153 | u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; |
154 | 154 | ||
155 | idle_time += cputime_to_usecs(cur_nice - j_cdbs->prev_cpu_nice); | 155 | idle_time += div_u64(cur_nice - j_cdbs->prev_cpu_nice, NSEC_PER_USEC); |
156 | j_cdbs->prev_cpu_nice = cur_nice; | 156 | j_cdbs->prev_cpu_nice = cur_nice; |
157 | } | 157 | } |
158 | 158 | ||
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index ac284e66839c..17048bbec287 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c | |||
@@ -13,7 +13,6 @@ | |||
13 | #include <linux/cpufreq.h> | 13 | #include <linux/cpufreq.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
16 | #include <linux/cputime.h> | ||
17 | 16 | ||
18 | static DEFINE_SPINLOCK(cpufreq_stats_lock); | 17 | static DEFINE_SPINLOCK(cpufreq_stats_lock); |
19 | 18 | ||
diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c index 9cb4b621fbc3..b324474c0c12 100644 --- a/drivers/isdn/mISDN/stack.c +++ b/drivers/isdn/mISDN/stack.c | |||
@@ -203,7 +203,7 @@ mISDNStackd(void *data) | |||
203 | { | 203 | { |
204 | struct mISDNstack *st = data; | 204 | struct mISDNstack *st = data; |
205 | #ifdef MISDN_MSG_STATS | 205 | #ifdef MISDN_MSG_STATS |
206 | cputime_t utime, stime; | 206 | u64 utime, stime; |
207 | #endif | 207 | #endif |
208 | int err = 0; | 208 | int err = 0; |
209 | 209 | ||
@@ -308,7 +308,7 @@ mISDNStackd(void *data) | |||
308 | st->stopped_cnt); | 308 | st->stopped_cnt); |
309 | task_cputime(st->thread, &utime, &stime); | 309 | task_cputime(st->thread, &utime, &stime); |
310 | printk(KERN_DEBUG | 310 | printk(KERN_DEBUG |
311 | "mISDNStackd daemon for %s utime(%ld) stime(%ld)\n", | 311 | "mISDNStackd daemon for %s utime(%llu) stime(%llu)\n", |
312 | dev_name(&st->dev->dev), utime, stime); | 312 | dev_name(&st->dev->dev), utime, stime); |
313 | printk(KERN_DEBUG | 313 | printk(KERN_DEBUG |
314 | "mISDNStackd daemon for %s nvcsw(%ld) nivcsw(%ld)\n", | 314 | "mISDNStackd daemon for %s nvcsw(%ld) nivcsw(%ld)\n", |
diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c index 775527135b93..e199fd6c71ce 100644 --- a/drivers/macintosh/rack-meter.c +++ b/drivers/macintosh/rack-meter.c | |||
@@ -52,8 +52,8 @@ struct rackmeter_dma { | |||
52 | struct rackmeter_cpu { | 52 | struct rackmeter_cpu { |
53 | struct delayed_work sniffer; | 53 | struct delayed_work sniffer; |
54 | struct rackmeter *rm; | 54 | struct rackmeter *rm; |
55 | cputime64_t prev_wall; | 55 | u64 prev_wall; |
56 | cputime64_t prev_idle; | 56 | u64 prev_idle; |
57 | int zero; | 57 | int zero; |
58 | } ____cacheline_aligned; | 58 | } ____cacheline_aligned; |
59 | 59 | ||
@@ -81,7 +81,7 @@ static int rackmeter_ignore_nice; | |||
81 | /* This is copied from cpufreq_ondemand, maybe we should put it in | 81 | /* This is copied from cpufreq_ondemand, maybe we should put it in |
82 | * a common header somewhere | 82 | * a common header somewhere |
83 | */ | 83 | */ |
84 | static inline cputime64_t get_cpu_idle_time(unsigned int cpu) | 84 | static inline u64 get_cpu_idle_time(unsigned int cpu) |
85 | { | 85 | { |
86 | u64 retval; | 86 | u64 retval; |
87 | 87 | ||
@@ -217,23 +217,23 @@ static void rackmeter_do_timer(struct work_struct *work) | |||
217 | container_of(work, struct rackmeter_cpu, sniffer.work); | 217 | container_of(work, struct rackmeter_cpu, sniffer.work); |
218 | struct rackmeter *rm = rcpu->rm; | 218 | struct rackmeter *rm = rcpu->rm; |
219 | unsigned int cpu = smp_processor_id(); | 219 | unsigned int cpu = smp_processor_id(); |
220 | cputime64_t cur_jiffies, total_idle_ticks; | 220 | u64 cur_nsecs, total_idle_nsecs; |
221 | unsigned int total_ticks, idle_ticks; | 221 | u64 total_nsecs, idle_nsecs; |
222 | int i, offset, load, cumm, pause; | 222 | int i, offset, load, cumm, pause; |
223 | 223 | ||
224 | cur_jiffies = jiffies64_to_cputime64(get_jiffies_64()); | 224 | cur_nsecs = jiffies64_to_nsecs(get_jiffies_64()); |
225 | total_ticks = (unsigned int) (cur_jiffies - rcpu->prev_wall); | 225 | total_nsecs = cur_nsecs - rcpu->prev_wall; |
226 | rcpu->prev_wall = cur_jiffies; | 226 | rcpu->prev_wall = cur_nsecs; |
227 | 227 | ||
228 | total_idle_ticks = get_cpu_idle_time(cpu); | 228 | total_idle_nsecs = get_cpu_idle_time(cpu); |
229 | idle_ticks = (unsigned int) (total_idle_ticks - rcpu->prev_idle); | 229 | idle_nsecs = total_idle_nsecs - rcpu->prev_idle; |
230 | idle_ticks = min(idle_ticks, total_ticks); | 230 | idle_nsecs = min(idle_nsecs, total_nsecs); |
231 | rcpu->prev_idle = total_idle_ticks; | 231 | rcpu->prev_idle = total_idle_nsecs; |
232 | 232 | ||
233 | /* We do a very dumb calculation to update the LEDs for now, | 233 | /* We do a very dumb calculation to update the LEDs for now, |
234 | * we'll do better once we have actual PWM implemented | 234 | * we'll do better once we have actual PWM implemented |
235 | */ | 235 | */ |
236 | load = (9 * (total_ticks - idle_ticks)) / total_ticks; | 236 | load = div64_u64(9 * (total_nsecs - idle_nsecs), total_nsecs); |
237 | 237 | ||
238 | offset = cpu << 3; | 238 | offset = cpu << 3; |
239 | cumm = 0; | 239 | cumm = 0; |
@@ -278,7 +278,7 @@ static void rackmeter_init_cpu_sniffer(struct rackmeter *rm) | |||
278 | continue; | 278 | continue; |
279 | rcpu = &rm->cpu[cpu]; | 279 | rcpu = &rm->cpu[cpu]; |
280 | rcpu->prev_idle = get_cpu_idle_time(cpu); | 280 | rcpu->prev_idle = get_cpu_idle_time(cpu); |
281 | rcpu->prev_wall = jiffies64_to_cputime64(get_jiffies_64()); | 281 | rcpu->prev_wall = jiffies64_to_nsecs(get_jiffies_64()); |
282 | schedule_delayed_work_on(cpu, &rm->cpu[cpu].sniffer, | 282 | schedule_delayed_work_on(cpu, &rm->cpu[cpu].sniffer, |
283 | msecs_to_jiffies(CPU_SAMPLING_RATE)); | 283 | msecs_to_jiffies(CPU_SAMPLING_RATE)); |
284 | } | 284 | } |
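
rack-meter's load estimate is unchanged in spirit, but with wall and idle time now in nanoseconds the intermediate values no longer fit the old unsigned int tick counters, hence the u64 fields and div64_u64(). The formula from rackmeter_do_timer(), evaluated with example numbers:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* one sampling period, values in nanoseconds (example numbers) */
		uint64_t total_nsecs = 4000000000ULL;   /* 4 s wall time */
		uint64_t idle_nsecs  = 1000000000ULL;   /* 1 s idle      */

		/* 0..9 LED scale: 9 * busy / total */
		uint64_t load = 9 * (total_nsecs - idle_nsecs) / total_nsecs;

		printf("load = %llu\n", (unsigned long long)load);  /* 6 */
		return 0;
	}
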
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 422370293cfd..e7bf01373bc4 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c | |||
@@ -1428,17 +1428,18 @@ static void fill_prstatus(struct elf_prstatus *prstatus, | |||
1428 | * group-wide total, not its individual thread total. | 1428 | * group-wide total, not its individual thread total. |
1429 | */ | 1429 | */ |
1430 | thread_group_cputime(p, &cputime); | 1430 | thread_group_cputime(p, &cputime); |
1431 | cputime_to_timeval(cputime.utime, &prstatus->pr_utime); | 1431 | prstatus->pr_utime = ns_to_timeval(cputime.utime); |
1432 | cputime_to_timeval(cputime.stime, &prstatus->pr_stime); | 1432 | prstatus->pr_stime = ns_to_timeval(cputime.stime); |
1433 | } else { | 1433 | } else { |
1434 | cputime_t utime, stime; | 1434 | u64 utime, stime; |
1435 | 1435 | ||
1436 | task_cputime(p, &utime, &stime); | 1436 | task_cputime(p, &utime, &stime); |
1437 | cputime_to_timeval(utime, &prstatus->pr_utime); | 1437 | prstatus->pr_utime = ns_to_timeval(utime); |
1438 | cputime_to_timeval(stime, &prstatus->pr_stime); | 1438 | prstatus->pr_stime = ns_to_timeval(stime); |
1439 | } | 1439 | } |
1440 | cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); | 1440 | |
1441 | cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); | 1441 | prstatus->pr_cutime = ns_to_timeval(p->signal->cutime); |
1442 | prstatus->pr_cstime = ns_to_timeval(p->signal->cstime); | ||
1442 | } | 1443 | } |
1443 | 1444 | ||
1444 | static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, | 1445 | static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, |
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index d2e36f82c35d..ffca4bbc3d63 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c | |||
@@ -1349,17 +1349,17 @@ static void fill_prstatus(struct elf_prstatus *prstatus, | |||
1349 | * group-wide total, not its individual thread total. | 1349 | * group-wide total, not its individual thread total. |
1350 | */ | 1350 | */ |
1351 | thread_group_cputime(p, &cputime); | 1351 | thread_group_cputime(p, &cputime); |
1352 | cputime_to_timeval(cputime.utime, &prstatus->pr_utime); | 1352 | prstatus->pr_utime = ns_to_timeval(cputime.utime); |
1353 | cputime_to_timeval(cputime.stime, &prstatus->pr_stime); | 1353 | prstatus->pr_stime = ns_to_timeval(cputime.stime); |
1354 | } else { | 1354 | } else { |
1355 | cputime_t utime, stime; | 1355 | u64 utime, stime; |
1356 | 1356 | ||
1357 | task_cputime(p, &utime, &stime); | 1357 | task_cputime(p, &utime, &stime); |
1358 | cputime_to_timeval(utime, &prstatus->pr_utime); | 1358 | prstatus->pr_utime = ns_to_timeval(utime); |
1359 | cputime_to_timeval(stime, &prstatus->pr_stime); | 1359 | prstatus->pr_stime = ns_to_timeval(stime); |
1360 | } | 1360 | } |
1361 | cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime); | 1361 | prstatus->pr_cutime = ns_to_timeval(p->signal->cutime); |
1362 | cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime); | 1362 | prstatus->pr_cstime = ns_to_timeval(p->signal->cstime); |
1363 | 1363 | ||
1364 | prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; | 1364 | prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; |
1365 | prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; | 1365 | prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; |
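
Both ELF core-dump paths now carry cputime in nanoseconds and let ns_to_timeval() split it into seconds and microseconds. A simplified standalone sketch of that conversion (the kernel helper also handles negative values, which is glossed over here):

	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_SEC  1000000000LL
	#define NSEC_PER_USEC 1000LL

	struct timeval_sketch {
		long long tv_sec;
		long long tv_usec;
	};

	/* simplified stand-in for the kernel's ns_to_timeval() */
	static struct timeval_sketch ns_to_timeval_sketch(int64_t nsec)
	{
		struct timeval_sketch tv;

		tv.tv_sec = nsec / NSEC_PER_SEC;
		tv.tv_usec = (nsec % NSEC_PER_SEC) / NSEC_PER_USEC;
		return tv;
	}

	int main(void)
	{
		struct timeval_sketch tv = ns_to_timeval_sketch(1234567890LL);

		printf("%lld s %lld us\n", tv.tv_sec, tv.tv_usec); /* 1 s 234567 us */
		return 0;
	}
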
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 4d24d17bcfc1..504b3c3539dc 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c | |||
@@ -51,22 +51,8 @@ | |||
51 | #define elf_prstatus compat_elf_prstatus | 51 | #define elf_prstatus compat_elf_prstatus |
52 | #define elf_prpsinfo compat_elf_prpsinfo | 52 | #define elf_prpsinfo compat_elf_prpsinfo |
53 | 53 | ||
54 | /* | 54 | #undef ns_to_timeval |
55 | * Compat version of cputime_to_compat_timeval, perhaps this | 55 | #define ns_to_timeval ns_to_compat_timeval |
56 | * should be an inline in <linux/compat.h>. | ||
57 | */ | ||
58 | static void cputime_to_compat_timeval(const cputime_t cputime, | ||
59 | struct compat_timeval *value) | ||
60 | { | ||
61 | struct timeval tv; | ||
62 | cputime_to_timeval(cputime, &tv); | ||
63 | value->tv_sec = tv.tv_sec; | ||
64 | value->tv_usec = tv.tv_usec; | ||
65 | } | ||
66 | |||
67 | #undef cputime_to_timeval | ||
68 | #define cputime_to_timeval cputime_to_compat_timeval | ||
69 | |||
70 | 56 | ||
71 | /* | 57 | /* |
72 | * To use this file, asm/elf.h must define compat_elf_check_arch. | 58 | * To use this file, asm/elf.h must define compat_elf_check_arch. |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 8c514367ba5a..b6b194ec1b4f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -393,7 +393,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
393 | /* Do we need to erase the effects of a prior jbd2_journal_flush? */ | 393 | /* Do we need to erase the effects of a prior jbd2_journal_flush? */ |
394 | if (journal->j_flags & JBD2_FLUSHED) { | 394 | if (journal->j_flags & JBD2_FLUSHED) { |
395 | jbd_debug(3, "super block updated\n"); | 395 | jbd_debug(3, "super block updated\n"); |
396 | mutex_lock(&journal->j_checkpoint_mutex); | 396 | mutex_lock_io(&journal->j_checkpoint_mutex); |
397 | /* | 397 | /* |
398 | * We hold j_checkpoint_mutex so tail cannot change under us. | 398 | * We hold j_checkpoint_mutex so tail cannot change under us. |
399 | * We don't need any special data guarantees for writing sb | 399 | * We don't need any special data guarantees for writing sb |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a097048ed1a3..d8a5d0a08f07 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -944,7 +944,7 @@ out: | |||
944 | */ | 944 | */ |
945 | void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) | 945 | void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) |
946 | { | 946 | { |
947 | mutex_lock(&journal->j_checkpoint_mutex); | 947 | mutex_lock_io(&journal->j_checkpoint_mutex); |
948 | if (tid_gt(tid, journal->j_tail_sequence)) | 948 | if (tid_gt(tid, journal->j_tail_sequence)) |
949 | __jbd2_update_log_tail(journal, tid, block); | 949 | __jbd2_update_log_tail(journal, tid, block); |
950 | mutex_unlock(&journal->j_checkpoint_mutex); | 950 | mutex_unlock(&journal->j_checkpoint_mutex); |
@@ -1304,7 +1304,7 @@ static int journal_reset(journal_t *journal) | |||
1304 | journal->j_flags |= JBD2_FLUSHED; | 1304 | journal->j_flags |= JBD2_FLUSHED; |
1305 | } else { | 1305 | } else { |
1306 | /* Lock here to make assertions happy... */ | 1306 | /* Lock here to make assertions happy... */ |
1307 | mutex_lock(&journal->j_checkpoint_mutex); | 1307 | mutex_lock_io(&journal->j_checkpoint_mutex); |
1308 | /* | 1308 | /* |
1309 | * Update log tail information. We use REQ_FUA since new | 1309 | * Update log tail information. We use REQ_FUA since new |
1310 | * transaction will start reusing journal space and so we | 1310 | * transaction will start reusing journal space and so we |
@@ -1691,7 +1691,7 @@ int jbd2_journal_destroy(journal_t *journal) | |||
1691 | spin_lock(&journal->j_list_lock); | 1691 | spin_lock(&journal->j_list_lock); |
1692 | while (journal->j_checkpoint_transactions != NULL) { | 1692 | while (journal->j_checkpoint_transactions != NULL) { |
1693 | spin_unlock(&journal->j_list_lock); | 1693 | spin_unlock(&journal->j_list_lock); |
1694 | mutex_lock(&journal->j_checkpoint_mutex); | 1694 | mutex_lock_io(&journal->j_checkpoint_mutex); |
1695 | err = jbd2_log_do_checkpoint(journal); | 1695 | err = jbd2_log_do_checkpoint(journal); |
1696 | mutex_unlock(&journal->j_checkpoint_mutex); | 1696 | mutex_unlock(&journal->j_checkpoint_mutex); |
1697 | /* | 1697 | /* |
@@ -1713,7 +1713,7 @@ int jbd2_journal_destroy(journal_t *journal) | |||
1713 | 1713 | ||
1714 | if (journal->j_sb_buffer) { | 1714 | if (journal->j_sb_buffer) { |
1715 | if (!is_journal_aborted(journal)) { | 1715 | if (!is_journal_aborted(journal)) { |
1716 | mutex_lock(&journal->j_checkpoint_mutex); | 1716 | mutex_lock_io(&journal->j_checkpoint_mutex); |
1717 | 1717 | ||
1718 | write_lock(&journal->j_state_lock); | 1718 | write_lock(&journal->j_state_lock); |
1719 | journal->j_tail_sequence = | 1719 | journal->j_tail_sequence = |
@@ -1955,7 +1955,7 @@ int jbd2_journal_flush(journal_t *journal) | |||
1955 | spin_lock(&journal->j_list_lock); | 1955 | spin_lock(&journal->j_list_lock); |
1956 | while (!err && journal->j_checkpoint_transactions != NULL) { | 1956 | while (!err && journal->j_checkpoint_transactions != NULL) { |
1957 | spin_unlock(&journal->j_list_lock); | 1957 | spin_unlock(&journal->j_list_lock); |
1958 | mutex_lock(&journal->j_checkpoint_mutex); | 1958 | mutex_lock_io(&journal->j_checkpoint_mutex); |
1959 | err = jbd2_log_do_checkpoint(journal); | 1959 | err = jbd2_log_do_checkpoint(journal); |
1960 | mutex_unlock(&journal->j_checkpoint_mutex); | 1960 | mutex_unlock(&journal->j_checkpoint_mutex); |
1961 | spin_lock(&journal->j_list_lock); | 1961 | spin_lock(&journal->j_list_lock); |
@@ -1965,7 +1965,7 @@ int jbd2_journal_flush(journal_t *journal) | |||
1965 | if (is_journal_aborted(journal)) | 1965 | if (is_journal_aborted(journal)) |
1966 | return -EIO; | 1966 | return -EIO; |
1967 | 1967 | ||
1968 | mutex_lock(&journal->j_checkpoint_mutex); | 1968 | mutex_lock_io(&journal->j_checkpoint_mutex); |
1969 | if (!err) { | 1969 | if (!err) { |
1970 | err = jbd2_cleanup_journal_tail(journal); | 1970 | err = jbd2_cleanup_journal_tail(journal); |
1971 | if (err < 0) { | 1971 | if (err < 0) { |
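The hunks above convert the jbd2 checkpoint-mutex call sites to the io-wait-aware locking variant introduced later in this series (see the include/linux/mutex.h and kernel/locking/mutex.c hunks below), so time blocked on checkpointing is charged as iowait rather than plain uninterruptible sleep. As a rough illustration of the calling pattern only — example_dev and example_flush are hypothetical and not part of this patch:

#include <linux/mutex.h>

/* Hypothetical device structure, for illustration only. */
struct example_dev {
        struct mutex flush_mutex;
};

static void example_flush(struct example_dev *dev)
{
        /* Contended wait here is accounted as iowait. */
        mutex_lock_io(&dev->flush_mutex);
        /* ... submit and wait for block I/O under the lock ... */
        mutex_unlock(&dev->flush_mutex);
}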
diff --git a/fs/proc/array.c b/fs/proc/array.c index 51a4213afa2e..fe12b519d09b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
@@ -401,8 +401,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | |||
401 | unsigned long long start_time; | 401 | unsigned long long start_time; |
402 | unsigned long cmin_flt = 0, cmaj_flt = 0; | 402 | unsigned long cmin_flt = 0, cmaj_flt = 0; |
403 | unsigned long min_flt = 0, maj_flt = 0; | 403 | unsigned long min_flt = 0, maj_flt = 0; |
404 | cputime_t cutime, cstime, utime, stime; | 404 | u64 cutime, cstime, utime, stime; |
405 | cputime_t cgtime, gtime; | 405 | u64 cgtime, gtime; |
406 | unsigned long rsslim = 0; | 406 | unsigned long rsslim = 0; |
407 | char tcomm[sizeof(task->comm)]; | 407 | char tcomm[sizeof(task->comm)]; |
408 | unsigned long flags; | 408 | unsigned long flags; |
@@ -497,10 +497,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | |||
497 | seq_put_decimal_ull(m, " ", cmin_flt); | 497 | seq_put_decimal_ull(m, " ", cmin_flt); |
498 | seq_put_decimal_ull(m, " ", maj_flt); | 498 | seq_put_decimal_ull(m, " ", maj_flt); |
499 | seq_put_decimal_ull(m, " ", cmaj_flt); | 499 | seq_put_decimal_ull(m, " ", cmaj_flt); |
500 | seq_put_decimal_ull(m, " ", cputime_to_clock_t(utime)); | 500 | seq_put_decimal_ull(m, " ", nsec_to_clock_t(utime)); |
501 | seq_put_decimal_ull(m, " ", cputime_to_clock_t(stime)); | 501 | seq_put_decimal_ull(m, " ", nsec_to_clock_t(stime)); |
502 | seq_put_decimal_ll(m, " ", cputime_to_clock_t(cutime)); | 502 | seq_put_decimal_ll(m, " ", nsec_to_clock_t(cutime)); |
503 | seq_put_decimal_ll(m, " ", cputime_to_clock_t(cstime)); | 503 | seq_put_decimal_ll(m, " ", nsec_to_clock_t(cstime)); |
504 | seq_put_decimal_ll(m, " ", priority); | 504 | seq_put_decimal_ll(m, " ", priority); |
505 | seq_put_decimal_ll(m, " ", nice); | 505 | seq_put_decimal_ll(m, " ", nice); |
506 | seq_put_decimal_ll(m, " ", num_threads); | 506 | seq_put_decimal_ll(m, " ", num_threads); |
@@ -542,8 +542,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | |||
542 | seq_put_decimal_ull(m, " ", task->rt_priority); | 542 | seq_put_decimal_ull(m, " ", task->rt_priority); |
543 | seq_put_decimal_ull(m, " ", task->policy); | 543 | seq_put_decimal_ull(m, " ", task->policy); |
544 | seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task)); | 544 | seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task)); |
545 | seq_put_decimal_ull(m, " ", cputime_to_clock_t(gtime)); | 545 | seq_put_decimal_ull(m, " ", nsec_to_clock_t(gtime)); |
546 | seq_put_decimal_ll(m, " ", cputime_to_clock_t(cgtime)); | 546 | seq_put_decimal_ll(m, " ", nsec_to_clock_t(cgtime)); |
547 | 547 | ||
548 | if (mm && permitted) { | 548 | if (mm && permitted) { |
549 | seq_put_decimal_ull(m, " ", mm->start_data); | 549 | seq_put_decimal_ull(m, " ", mm->start_data); |
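With do_task_stat() now feeding nsec_to_clock_t(), /proc/<pid>/stat keeps reporting utime/stime in USER_HZ clock ticks, so existing readers are unaffected. A small stand-alone reader for reference (a simplified sketch; field positions follow proc(5), and the parsing ignores unusual comm values):

#include <stdio.h>
#include <unistd.h>

/* Print this process's user/system CPU time in seconds from /proc/self/stat.
 * utime and stime are fields 14 and 15, in clock ticks regardless of the
 * kernel-internal time unit. */
int main(void)
{
        FILE *f = fopen("/proc/self/stat", "r");
        unsigned long long utime, stime;
        long hz = sysconf(_SC_CLK_TCK);
        char comm[64];

        if (!f)
                return 1;
        /* pid (comm) state ppid pgrp session tty tpgid flags min cmin maj cmaj utime stime */
        if (fscanf(f, "%*d (%63[^)]) %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %llu %llu",
                   comm, &utime, &stime) != 3) {
                fclose(f);
                return 1;
        }
        fclose(f);
        printf("%s: utime=%.2fs stime=%.2fs\n",
               comm, (double)utime / hz, (double)stime / hz);
        return 0;
}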
diff --git a/fs/proc/stat.c b/fs/proc/stat.c index d700c42b3572..e47c3e8c4dfe 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c | |||
@@ -21,9 +21,9 @@ | |||
21 | 21 | ||
22 | #ifdef arch_idle_time | 22 | #ifdef arch_idle_time |
23 | 23 | ||
24 | static cputime64_t get_idle_time(int cpu) | 24 | static u64 get_idle_time(int cpu) |
25 | { | 25 | { |
26 | cputime64_t idle; | 26 | u64 idle; |
27 | 27 | ||
28 | idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; | 28 | idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; |
29 | if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) | 29 | if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) |
@@ -31,9 +31,9 @@ static cputime64_t get_idle_time(int cpu) | |||
31 | return idle; | 31 | return idle; |
32 | } | 32 | } |
33 | 33 | ||
34 | static cputime64_t get_iowait_time(int cpu) | 34 | static u64 get_iowait_time(int cpu) |
35 | { | 35 | { |
36 | cputime64_t iowait; | 36 | u64 iowait; |
37 | 37 | ||
38 | iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; | 38 | iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; |
39 | if (cpu_online(cpu) && nr_iowait_cpu(cpu)) | 39 | if (cpu_online(cpu) && nr_iowait_cpu(cpu)) |
@@ -45,32 +45,32 @@ static cputime64_t get_iowait_time(int cpu) | |||
45 | 45 | ||
46 | static u64 get_idle_time(int cpu) | 46 | static u64 get_idle_time(int cpu) |
47 | { | 47 | { |
48 | u64 idle, idle_time = -1ULL; | 48 | u64 idle, idle_usecs = -1ULL; |
49 | 49 | ||
50 | if (cpu_online(cpu)) | 50 | if (cpu_online(cpu)) |
51 | idle_time = get_cpu_idle_time_us(cpu, NULL); | 51 | idle_usecs = get_cpu_idle_time_us(cpu, NULL); |
52 | 52 | ||
53 | if (idle_time == -1ULL) | 53 | if (idle_usecs == -1ULL) |
54 | /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ | 54 | /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ |
55 | idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; | 55 | idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; |
56 | else | 56 | else |
57 | idle = usecs_to_cputime64(idle_time); | 57 | idle = idle_usecs * NSEC_PER_USEC; |
58 | 58 | ||
59 | return idle; | 59 | return idle; |
60 | } | 60 | } |
61 | 61 | ||
62 | static u64 get_iowait_time(int cpu) | 62 | static u64 get_iowait_time(int cpu) |
63 | { | 63 | { |
64 | u64 iowait, iowait_time = -1ULL; | 64 | u64 iowait, iowait_usecs = -1ULL; |
65 | 65 | ||
66 | if (cpu_online(cpu)) | 66 | if (cpu_online(cpu)) |
67 | iowait_time = get_cpu_iowait_time_us(cpu, NULL); | 67 | iowait_usecs = get_cpu_iowait_time_us(cpu, NULL); |
68 | 68 | ||
69 | if (iowait_time == -1ULL) | 69 | if (iowait_usecs == -1ULL) |
70 | /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ | 70 | /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ |
71 | iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; | 71 | iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; |
72 | else | 72 | else |
73 | iowait = usecs_to_cputime64(iowait_time); | 73 | iowait = iowait_usecs * NSEC_PER_USEC; |
74 | 74 | ||
75 | return iowait; | 75 | return iowait; |
76 | } | 76 | } |
@@ -115,16 +115,16 @@ static int show_stat(struct seq_file *p, void *v) | |||
115 | } | 115 | } |
116 | sum += arch_irq_stat(); | 116 | sum += arch_irq_stat(); |
117 | 117 | ||
118 | seq_put_decimal_ull(p, "cpu ", cputime64_to_clock_t(user)); | 118 | seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user)); |
119 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice)); | 119 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); |
120 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system)); | 120 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); |
121 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle)); | 121 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); |
122 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait)); | 122 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); |
123 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq)); | 123 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); |
124 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq)); | 124 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); |
125 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal)); | 125 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); |
126 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest)); | 126 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); |
127 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice)); | 127 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); |
128 | seq_putc(p, '\n'); | 128 | seq_putc(p, '\n'); |
129 | 129 | ||
130 | for_each_online_cpu(i) { | 130 | for_each_online_cpu(i) { |
@@ -140,16 +140,16 @@ static int show_stat(struct seq_file *p, void *v) | |||
140 | guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; | 140 | guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; |
141 | guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; | 141 | guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; |
142 | seq_printf(p, "cpu%d", i); | 142 | seq_printf(p, "cpu%d", i); |
143 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(user)); | 143 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); |
144 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice)); | 144 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); |
145 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system)); | 145 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); |
146 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle)); | 146 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); |
147 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait)); | 147 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); |
148 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq)); | 148 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); |
149 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq)); | 149 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); |
150 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal)); | 150 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); |
151 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest)); | 151 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); |
152 | seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice)); | 152 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); |
153 | seq_putc(p, '\n'); | 153 | seq_putc(p, '\n'); |
154 | } | 154 | } |
155 | seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); | 155 | seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); |
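show_stat() now divides nanosecond counters straight down to clock ticks. Assuming the common configuration where NSEC_PER_SEC is an exact multiple of USER_HZ, the conversion amounts to a single division, as in this user-space model (USER_HZ=100 is an assumption about the build; the in-kernel helper lives in kernel/time/time.c):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL
#define USER_HZ      100ULL     /* what sysconf(_SC_CLK_TCK) typically reports */

/* Rough model of nsec_to_clock_t() for the NSEC_PER_SEC % USER_HZ == 0 case. */
static uint64_t model_nsec_to_clock_t(uint64_t nsec)
{
        return nsec / (NSEC_PER_SEC / USER_HZ);
}

int main(void)
{
        uint64_t idle_ns = 12345678900ULL;      /* 12.3456789 s of idle time */

        /* Prints 1234 ticks at USER_HZ=100, i.e. what /proc/stat would show. */
        printf("%llu ns -> %llu clock ticks\n",
               (unsigned long long)idle_ns,
               (unsigned long long)model_nsec_to_clock_t(idle_ns));
        return 0;
}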
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 33de567c25af..7981c4ffe787 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c | |||
@@ -5,23 +5,20 @@ | |||
5 | #include <linux/seq_file.h> | 5 | #include <linux/seq_file.h> |
6 | #include <linux/time.h> | 6 | #include <linux/time.h> |
7 | #include <linux/kernel_stat.h> | 7 | #include <linux/kernel_stat.h> |
8 | #include <linux/cputime.h> | ||
9 | 8 | ||
10 | static int uptime_proc_show(struct seq_file *m, void *v) | 9 | static int uptime_proc_show(struct seq_file *m, void *v) |
11 | { | 10 | { |
12 | struct timespec uptime; | 11 | struct timespec uptime; |
13 | struct timespec idle; | 12 | struct timespec idle; |
14 | u64 idletime; | ||
15 | u64 nsec; | 13 | u64 nsec; |
16 | u32 rem; | 14 | u32 rem; |
17 | int i; | 15 | int i; |
18 | 16 | ||
19 | idletime = 0; | 17 | nsec = 0; |
20 | for_each_possible_cpu(i) | 18 | for_each_possible_cpu(i) |
21 | idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; | 19 | nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; |
22 | 20 | ||
23 | get_monotonic_boottime(&uptime); | 21 | get_monotonic_boottime(&uptime); |
24 | nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; | ||
25 | idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); | 22 | idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); |
26 | idle.tv_nsec = rem; | 23 | idle.tv_nsec = rem; |
27 | seq_printf(m, "%lu.%02lu %lu.%02lu\n", | 24 | seq_printf(m, "%lu.%02lu %lu.%02lu\n", |
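uptime_proc_show() now sums CPUTIME_IDLE, which is already in nanoseconds, and splits it with div_u64_rem(); the jiffies round-trip is gone. A user-space model of the same split and formatting (the idle value here is made up):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
        /* Aggregate idle time across CPUs, already in nanoseconds. */
        uint64_t idle_nsec = 7654321012345ULL;

        uint64_t sec = idle_nsec / NSEC_PER_SEC;
        unsigned int centi = (unsigned int)((idle_nsec % NSEC_PER_SEC) /
                                            (NSEC_PER_SEC / 100));

        /* Same shape as the /proc/uptime idle field: seconds.centiseconds */
        printf("%llu.%02u\n", (unsigned long long)sec, centi);
        return 0;
}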
diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h deleted file mode 100644 index 51969436b8b8..000000000000 --- a/include/asm-generic/cputime.h +++ /dev/null | |||
@@ -1,15 +0,0 @@ | |||
1 | #ifndef _ASM_GENERIC_CPUTIME_H | ||
2 | #define _ASM_GENERIC_CPUTIME_H | ||
3 | |||
4 | #include <linux/time.h> | ||
5 | #include <linux/jiffies.h> | ||
6 | |||
7 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
8 | # include <asm-generic/cputime_jiffies.h> | ||
9 | #endif | ||
10 | |||
11 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | ||
12 | # include <asm-generic/cputime_nsecs.h> | ||
13 | #endif | ||
14 | |||
15 | #endif | ||
diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h deleted file mode 100644 index 6bb8cd45f53b..000000000000 --- a/include/asm-generic/cputime_jiffies.h +++ /dev/null | |||
@@ -1,75 +0,0 @@ | |||
1 | #ifndef _ASM_GENERIC_CPUTIME_JIFFIES_H | ||
2 | #define _ASM_GENERIC_CPUTIME_JIFFIES_H | ||
3 | |||
4 | typedef unsigned long __nocast cputime_t; | ||
5 | |||
6 | #define cmpxchg_cputime(ptr, old, new) cmpxchg(ptr, old, new) | ||
7 | |||
8 | #define cputime_one_jiffy jiffies_to_cputime(1) | ||
9 | #define cputime_to_jiffies(__ct) (__force unsigned long)(__ct) | ||
10 | #define jiffies_to_cputime(__hz) (__force cputime_t)(__hz) | ||
11 | |||
12 | typedef u64 __nocast cputime64_t; | ||
13 | |||
14 | #define cputime64_to_jiffies64(__ct) (__force u64)(__ct) | ||
15 | #define jiffies64_to_cputime64(__jif) (__force cputime64_t)(__jif) | ||
16 | |||
17 | |||
18 | /* | ||
19 | * Convert nanoseconds <-> cputime | ||
20 | */ | ||
21 | #define cputime_to_nsecs(__ct) \ | ||
22 | jiffies_to_nsecs(cputime_to_jiffies(__ct)) | ||
23 | #define nsecs_to_cputime64(__nsec) \ | ||
24 | jiffies64_to_cputime64(nsecs_to_jiffies64(__nsec)) | ||
25 | #define nsecs_to_cputime(__nsec) \ | ||
26 | jiffies_to_cputime(nsecs_to_jiffies(__nsec)) | ||
27 | |||
28 | |||
29 | /* | ||
30 | * Convert cputime to microseconds and back. | ||
31 | */ | ||
32 | #define cputime_to_usecs(__ct) \ | ||
33 | jiffies_to_usecs(cputime_to_jiffies(__ct)) | ||
34 | #define usecs_to_cputime(__usec) \ | ||
35 | jiffies_to_cputime(usecs_to_jiffies(__usec)) | ||
36 | #define usecs_to_cputime64(__usec) \ | ||
37 | jiffies64_to_cputime64(nsecs_to_jiffies64((__usec) * 1000)) | ||
38 | |||
39 | /* | ||
40 | * Convert cputime to seconds and back. | ||
41 | */ | ||
42 | #define cputime_to_secs(jif) (cputime_to_jiffies(jif) / HZ) | ||
43 | #define secs_to_cputime(sec) jiffies_to_cputime((sec) * HZ) | ||
44 | |||
45 | /* | ||
46 | * Convert cputime to timespec and back. | ||
47 | */ | ||
48 | #define timespec_to_cputime(__val) \ | ||
49 | jiffies_to_cputime(timespec_to_jiffies(__val)) | ||
50 | #define cputime_to_timespec(__ct,__val) \ | ||
51 | jiffies_to_timespec(cputime_to_jiffies(__ct),__val) | ||
52 | |||
53 | /* | ||
54 | * Convert cputime to timeval and back. | ||
55 | */ | ||
56 | #define timeval_to_cputime(__val) \ | ||
57 | jiffies_to_cputime(timeval_to_jiffies(__val)) | ||
58 | #define cputime_to_timeval(__ct,__val) \ | ||
59 | jiffies_to_timeval(cputime_to_jiffies(__ct),__val) | ||
60 | |||
61 | /* | ||
62 | * Convert cputime to clock and back. | ||
63 | */ | ||
64 | #define cputime_to_clock_t(__ct) \ | ||
65 | jiffies_to_clock_t(cputime_to_jiffies(__ct)) | ||
66 | #define clock_t_to_cputime(__x) \ | ||
67 | jiffies_to_cputime(clock_t_to_jiffies(__x)) | ||
68 | |||
69 | /* | ||
70 | * Convert cputime64 to clock. | ||
71 | */ | ||
72 | #define cputime64_to_clock_t(__ct) \ | ||
73 | jiffies_64_to_clock_t(cputime64_to_jiffies64(__ct)) | ||
74 | |||
75 | #endif | ||
diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h deleted file mode 100644 index 4e3b18e559b1..000000000000 --- a/include/asm-generic/cputime_nsecs.h +++ /dev/null | |||
@@ -1,121 +0,0 @@ | |||
1 | /* | ||
2 | * Definitions for measuring cputime in nsecs resolution. | ||
3 | * | ||
4 | * Based on <arch/ia64/include/asm/cputime.h> | ||
5 | * | ||
6 | * Copyright (C) 2007 FUJITSU LIMITED | ||
7 | * Copyright (C) 2007 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or | ||
10 | * modify it under the terms of the GNU General Public License | ||
11 | * as published by the Free Software Foundation; either version | ||
12 | * 2 of the License, or (at your option) any later version. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #ifndef _ASM_GENERIC_CPUTIME_NSECS_H | ||
17 | #define _ASM_GENERIC_CPUTIME_NSECS_H | ||
18 | |||
19 | #include <linux/math64.h> | ||
20 | |||
21 | typedef u64 __nocast cputime_t; | ||
22 | typedef u64 __nocast cputime64_t; | ||
23 | |||
24 | #define cmpxchg_cputime(ptr, old, new) cmpxchg64(ptr, old, new) | ||
25 | |||
26 | #define cputime_one_jiffy jiffies_to_cputime(1) | ||
27 | |||
28 | #define cputime_div(__ct, divisor) div_u64((__force u64)__ct, divisor) | ||
29 | #define cputime_div_rem(__ct, divisor, remainder) \ | ||
30 | div_u64_rem((__force u64)__ct, divisor, remainder); | ||
31 | |||
32 | /* | ||
33 | * Convert cputime <-> jiffies (HZ) | ||
34 | */ | ||
35 | #define cputime_to_jiffies(__ct) \ | ||
36 | cputime_div(__ct, NSEC_PER_SEC / HZ) | ||
37 | #define jiffies_to_cputime(__jif) \ | ||
38 | (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ)) | ||
39 | #define cputime64_to_jiffies64(__ct) \ | ||
40 | cputime_div(__ct, NSEC_PER_SEC / HZ) | ||
41 | #define jiffies64_to_cputime64(__jif) \ | ||
42 | (__force cputime64_t)((__jif) * (NSEC_PER_SEC / HZ)) | ||
43 | |||
44 | |||
45 | /* | ||
46 | * Convert cputime <-> nanoseconds | ||
47 | */ | ||
48 | #define cputime_to_nsecs(__ct) \ | ||
49 | (__force u64)(__ct) | ||
50 | #define nsecs_to_cputime(__nsecs) \ | ||
51 | (__force cputime_t)(__nsecs) | ||
52 | #define nsecs_to_cputime64(__nsecs) \ | ||
53 | (__force cputime64_t)(__nsecs) | ||
54 | |||
55 | |||
56 | /* | ||
57 | * Convert cputime <-> microseconds | ||
58 | */ | ||
59 | #define cputime_to_usecs(__ct) \ | ||
60 | cputime_div(__ct, NSEC_PER_USEC) | ||
61 | #define usecs_to_cputime(__usecs) \ | ||
62 | (__force cputime_t)((__usecs) * NSEC_PER_USEC) | ||
63 | #define usecs_to_cputime64(__usecs) \ | ||
64 | (__force cputime64_t)((__usecs) * NSEC_PER_USEC) | ||
65 | |||
66 | /* | ||
67 | * Convert cputime <-> seconds | ||
68 | */ | ||
69 | #define cputime_to_secs(__ct) \ | ||
70 | cputime_div(__ct, NSEC_PER_SEC) | ||
71 | #define secs_to_cputime(__secs) \ | ||
72 | (__force cputime_t)((__secs) * NSEC_PER_SEC) | ||
73 | |||
74 | /* | ||
75 | * Convert cputime <-> timespec (nsec) | ||
76 | */ | ||
77 | static inline cputime_t timespec_to_cputime(const struct timespec *val) | ||
78 | { | ||
79 | u64 ret = (u64)val->tv_sec * NSEC_PER_SEC + val->tv_nsec; | ||
80 | return (__force cputime_t) ret; | ||
81 | } | ||
82 | static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val) | ||
83 | { | ||
84 | u32 rem; | ||
85 | |||
86 | val->tv_sec = cputime_div_rem(ct, NSEC_PER_SEC, &rem); | ||
87 | val->tv_nsec = rem; | ||
88 | } | ||
89 | |||
90 | /* | ||
91 | * Convert cputime <-> timeval (msec) | ||
92 | */ | ||
93 | static inline cputime_t timeval_to_cputime(const struct timeval *val) | ||
94 | { | ||
95 | u64 ret = (u64)val->tv_sec * NSEC_PER_SEC + | ||
96 | val->tv_usec * NSEC_PER_USEC; | ||
97 | return (__force cputime_t) ret; | ||
98 | } | ||
99 | static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val) | ||
100 | { | ||
101 | u32 rem; | ||
102 | |||
103 | val->tv_sec = cputime_div_rem(ct, NSEC_PER_SEC, &rem); | ||
104 | val->tv_usec = rem / NSEC_PER_USEC; | ||
105 | } | ||
106 | |||
107 | /* | ||
108 | * Convert cputime <-> clock (USER_HZ) | ||
109 | */ | ||
110 | #define cputime_to_clock_t(__ct) \ | ||
111 | cputime_div(__ct, (NSEC_PER_SEC / USER_HZ)) | ||
112 | #define clock_t_to_cputime(__x) \ | ||
113 | (__force cputime_t)((__x) * (NSEC_PER_SEC / USER_HZ)) | ||
114 | |||
115 | /* | ||
116 | * Convert cputime64 to clock. | ||
117 | */ | ||
118 | #define cputime64_to_clock_t(__ct) \ | ||
119 | cputime_to_clock_t((__force cputime_t)__ct) | ||
120 | |||
121 | #endif | ||
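Both deleted headers expressed the same conversions, one backed by jiffies and one by nanoseconds; at clock_t (USER_HZ) resolution they agree up to the coarser tick quantization, which is why callers can move to plain u64 nanoseconds without changing what user space sees. A small comparison, assuming typical HZ=1000 and USER_HZ=100:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL
#define HZ           1000ULL    /* assumed tick rate for the example */
#define USER_HZ      100ULL

int main(void)
{
        uint64_t ns = 1500000000ULL;            /* 1.5 s of CPU time */

        /* Old jiffies-backed cputime_t: quantize to 1/HZ, then scale. */
        uint64_t jiffies = ns / (NSEC_PER_SEC / HZ);
        uint64_t clock_via_jiffies = jiffies * USER_HZ / HZ;

        /* Old nsecs-backed cputime_t, and the new plain-u64 code path. */
        uint64_t clock_via_nsecs = ns / (NSEC_PER_SEC / USER_HZ);

        /* Both print 150 ticks. */
        printf("via jiffies: %llu ticks, via nsecs: %llu ticks\n",
               (unsigned long long)clock_via_jiffies,
               (unsigned long long)clock_via_nsecs);
        return 0;
}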
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index e315d04a2fd9..cfc75848a35d 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h | |||
@@ -62,6 +62,8 @@ struct module; | |||
62 | * @archdata: arch-specific data | 62 | * @archdata: arch-specific data |
63 | * @suspend: suspend function for the clocksource, if necessary | 63 | * @suspend: suspend function for the clocksource, if necessary |
64 | * @resume: resume function for the clocksource, if necessary | 64 | * @resume: resume function for the clocksource, if necessary |
65 | * @mark_unstable: Optional function to inform the clocksource driver that | ||
66 | * the watchdog marked the clocksource unstable | ||
65 | * @owner: module reference, must be set by clocksource in modules | 67 | * @owner: module reference, must be set by clocksource in modules |
66 | * | 68 | * |
67 | * Note: This struct is not used in hotpathes of the timekeeping code | 69 | * Note: This struct is not used in hotpathes of the timekeeping code |
@@ -93,6 +95,7 @@ struct clocksource { | |||
93 | unsigned long flags; | 95 | unsigned long flags; |
94 | void (*suspend)(struct clocksource *cs); | 96 | void (*suspend)(struct clocksource *cs); |
95 | void (*resume)(struct clocksource *cs); | 97 | void (*resume)(struct clocksource *cs); |
98 | void (*mark_unstable)(struct clocksource *cs); | ||
96 | 99 | ||
97 | /* private: */ | 100 | /* private: */ |
98 | #ifdef CONFIG_CLOCKSOURCE_WATCHDOG | 101 | #ifdef CONFIG_CLOCKSOURCE_WATCHDOG |
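The new optional mark_unstable callback gives a clocksource driver a notification when the watchdog declares it unstable. A hypothetical driver-side sketch — example_cs, example_mark_unstable and the chosen rating are illustrative, and most required fields are omitted:

#include <linux/clocksource.h>
#include <linux/printk.h>

/* Hypothetical hook: called when the watchdog marks this clocksource
 * unstable, so the driver can disable fast paths, log, etc. */
static void example_mark_unstable(struct clocksource *cs)
{
        pr_warn("clocksource %s marked unstable by watchdog\n", cs->name);
}

static struct clocksource example_cs = {
        .name           = "example",
        .rating         = 300,
        .mark_unstable  = example_mark_unstable,
        /* .read, .mask, .flags, etc. omitted from this sketch */
};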
diff --git a/include/linux/compat.h b/include/linux/compat.h index 63609398ef9f..9e40be522793 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h | |||
@@ -731,7 +731,25 @@ asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32, | |||
731 | static inline bool in_compat_syscall(void) { return is_compat_task(); } | 731 | static inline bool in_compat_syscall(void) { return is_compat_task(); } |
732 | #endif | 732 | #endif |
733 | 733 | ||
734 | #else | 734 | /** |
735 | * ns_to_compat_timeval - Compat version of ns_to_timeval | ||
736 | * @nsec: the nanoseconds value to be converted | ||
737 | * | ||
738 | * Returns the compat_timeval representation of the nsec parameter. | ||
739 | */ | ||
740 | static inline struct compat_timeval ns_to_compat_timeval(s64 nsec) | ||
741 | { | ||
742 | struct timeval tv; | ||
743 | struct compat_timeval ctv; | ||
744 | |||
745 | tv = ns_to_timeval(nsec); | ||
746 | ctv.tv_sec = tv.tv_sec; | ||
747 | ctv.tv_usec = tv.tv_usec; | ||
748 | |||
749 | return ctv; | ||
750 | } | ||
751 | |||
752 | #else /* !CONFIG_COMPAT */ | ||
735 | 753 | ||
736 | #define is_compat_task() (0) | 754 | #define is_compat_task() (0) |
737 | static inline bool in_compat_syscall(void) { return false; } | 755 | static inline bool in_compat_syscall(void) { return false; } |
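ns_to_compat_timeval() simply goes through ns_to_timeval() and narrows the fields to the compat sizes. A user-space analogue of that split, using the libc struct timeval rather than the kernel's compat type and ignoring the negative-value rounding the kernel helper inherits from ns_to_timeval():

#include <stdio.h>
#include <stdint.h>
#include <sys/time.h>

/* Split signed nanoseconds into seconds and microseconds. */
static struct timeval ns_to_tv(int64_t nsec)
{
        struct timeval tv;

        tv.tv_sec = nsec / 1000000000;
        tv.tv_usec = (nsec % 1000000000) / 1000;
        return tv;
}

int main(void)
{
        struct timeval tv = ns_to_tv(1234567890LL);     /* 1.23456789 s */

        /* Prints 1.234567 */
        printf("%lld.%06lld\n", (long long)tv.tv_sec, (long long)tv.tv_usec);
        return 0;
}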
diff --git a/include/linux/cputime.h b/include/linux/cputime.h index f2eb2ee535ca..a691dc4ddc13 100644 --- a/include/linux/cputime.h +++ b/include/linux/cputime.h | |||
@@ -1,6 +1,7 @@ | |||
1 | #ifndef __LINUX_CPUTIME_H | 1 | #ifndef __LINUX_CPUTIME_H |
2 | #define __LINUX_CPUTIME_H | 2 | #define __LINUX_CPUTIME_H |
3 | 3 | ||
4 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | ||
4 | #include <asm/cputime.h> | 5 | #include <asm/cputime.h> |
5 | 6 | ||
6 | #ifndef cputime_to_nsecs | 7 | #ifndef cputime_to_nsecs |
@@ -8,9 +9,5 @@ | |||
8 | (cputime_to_usecs(__ct) * NSEC_PER_USEC) | 9 | (cputime_to_usecs(__ct) * NSEC_PER_USEC) |
9 | #endif | 10 | #endif |
10 | 11 | ||
11 | #ifndef nsecs_to_cputime | 12 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
12 | # define nsecs_to_cputime(__nsecs) \ | ||
13 | usecs_to_cputime((__nsecs) / NSEC_PER_USEC) | ||
14 | #endif | ||
15 | |||
16 | #endif /* __LINUX_CPUTIME_H */ | 13 | #endif /* __LINUX_CPUTIME_H */ |
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 6cee17c22313..00e60f79a9cc 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h | |||
@@ -17,6 +17,7 @@ | |||
17 | #ifndef _LINUX_DELAYACCT_H | 17 | #ifndef _LINUX_DELAYACCT_H |
18 | #define _LINUX_DELAYACCT_H | 18 | #define _LINUX_DELAYACCT_H |
19 | 19 | ||
20 | #include <uapi/linux/taskstats.h> | ||
20 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
21 | #include <linux/slab.h> | 22 | #include <linux/slab.h> |
22 | 23 | ||
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 589d14e970ad..624215cebee5 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h | |||
@@ -293,6 +293,8 @@ static inline u64 jiffies_to_nsecs(const unsigned long j) | |||
293 | return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC; | 293 | return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC; |
294 | } | 294 | } |
295 | 295 | ||
296 | extern u64 jiffies64_to_nsecs(u64 j); | ||
297 | |||
296 | extern unsigned long __msecs_to_jiffies(const unsigned int m); | 298 | extern unsigned long __msecs_to_jiffies(const unsigned int m); |
297 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) | 299 | #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) |
298 | /* | 300 | /* |
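The new jiffies64_to_nsecs() covers the remaining jiffy-granular counters that must now be reported in nanoseconds. Assuming the common case where HZ divides NSEC_PER_SEC evenly, it reduces to a multiplication by the tick length, as in this model (HZ=250 is an arbitrary example value):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL
#define HZ           250ULL     /* assumed tick rate for the example */

/* Rough model of jiffies64_to_nsecs() when NSEC_PER_SEC % HZ == 0. */
static uint64_t model_jiffies64_to_nsecs(uint64_t j)
{
        return j * (NSEC_PER_SEC / HZ);
}

int main(void)
{
        /* 250 ticks at HZ=250 is exactly one second: prints 1000000000 ns. */
        printf("%llu ns\n", (unsigned long long)model_jiffies64_to_nsecs(250));
        return 0;
}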
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 00f776816aa3..66be8b6beceb 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h | |||
@@ -9,7 +9,6 @@ | |||
9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
10 | #include <linux/vtime.h> | 10 | #include <linux/vtime.h> |
11 | #include <asm/irq.h> | 11 | #include <asm/irq.h> |
12 | #include <linux/cputime.h> | ||
13 | 12 | ||
14 | /* | 13 | /* |
15 | * 'kernel_stat.h' contains the definitions needed for doing | 14 | * 'kernel_stat.h' contains the definitions needed for doing |
@@ -78,15 +77,18 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) | |||
78 | return kstat_cpu(cpu).irqs_sum; | 77 | return kstat_cpu(cpu).irqs_sum; |
79 | } | 78 | } |
80 | 79 | ||
81 | extern void account_user_time(struct task_struct *, cputime_t); | 80 | extern void account_user_time(struct task_struct *, u64); |
82 | extern void account_system_time(struct task_struct *, int, cputime_t); | 81 | extern void account_guest_time(struct task_struct *, u64); |
83 | extern void account_steal_time(cputime_t); | 82 | extern void account_system_time(struct task_struct *, int, u64); |
84 | extern void account_idle_time(cputime_t); | 83 | extern void account_system_index_time(struct task_struct *, u64, |
84 | enum cpu_usage_stat); | ||
85 | extern void account_steal_time(u64); | ||
86 | extern void account_idle_time(u64); | ||
85 | 87 | ||
86 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 88 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
87 | static inline void account_process_tick(struct task_struct *tsk, int user) | 89 | static inline void account_process_tick(struct task_struct *tsk, int user) |
88 | { | 90 | { |
89 | vtime_account_user(tsk); | 91 | vtime_flush(tsk); |
90 | } | 92 | } |
91 | #else | 93 | #else |
92 | extern void account_process_tick(struct task_struct *, int user); | 94 | extern void account_process_tick(struct task_struct *, int user); |
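With the accounting entry points taking plain nanoseconds, a tick-based caller can pass the tick length directly instead of a cputime_t. A hypothetical sketch of that caller side — example_account_tick is illustrative only; TICK_NSEC and HARDIRQ_OFFSET are the existing kernel constants:

#include <linux/kernel_stat.h>
#include <linux/jiffies.h>      /* TICK_NSEC */
#include <linux/hardirq.h>      /* HARDIRQ_OFFSET */

/* Hypothetical tick hook (sketch): charge one tick's worth of nanoseconds
 * to user or system time, with no cputime_t conversion in between. */
static void example_account_tick(struct task_struct *p, int user_tick)
{
        if (user_tick)
                account_user_time(p, TICK_NSEC);
        else
                account_system_time(p, HARDIRQ_OFFSET, TICK_NSEC);
}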
diff --git a/include/linux/mutex.h b/include/linux/mutex.h index b97870f2debd..7fffbfcd5430 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h | |||
@@ -156,10 +156,12 @@ extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, | |||
156 | unsigned int subclass); | 156 | unsigned int subclass); |
157 | extern int __must_check mutex_lock_killable_nested(struct mutex *lock, | 157 | extern int __must_check mutex_lock_killable_nested(struct mutex *lock, |
158 | unsigned int subclass); | 158 | unsigned int subclass); |
159 | extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass); | ||
159 | 160 | ||
160 | #define mutex_lock(lock) mutex_lock_nested(lock, 0) | 161 | #define mutex_lock(lock) mutex_lock_nested(lock, 0) |
161 | #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0) | 162 | #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0) |
162 | #define mutex_lock_killable(lock) mutex_lock_killable_nested(lock, 0) | 163 | #define mutex_lock_killable(lock) mutex_lock_killable_nested(lock, 0) |
164 | #define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0) | ||
163 | 165 | ||
164 | #define mutex_lock_nest_lock(lock, nest_lock) \ | 166 | #define mutex_lock_nest_lock(lock, nest_lock) \ |
165 | do { \ | 167 | do { \ |
@@ -171,11 +173,13 @@ do { \ | |||
171 | extern void mutex_lock(struct mutex *lock); | 173 | extern void mutex_lock(struct mutex *lock); |
172 | extern int __must_check mutex_lock_interruptible(struct mutex *lock); | 174 | extern int __must_check mutex_lock_interruptible(struct mutex *lock); |
173 | extern int __must_check mutex_lock_killable(struct mutex *lock); | 175 | extern int __must_check mutex_lock_killable(struct mutex *lock); |
176 | extern void mutex_lock_io(struct mutex *lock); | ||
174 | 177 | ||
175 | # define mutex_lock_nested(lock, subclass) mutex_lock(lock) | 178 | # define mutex_lock_nested(lock, subclass) mutex_lock(lock) |
176 | # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) | 179 | # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) |
177 | # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) | 180 | # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) |
178 | # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) | 181 | # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) |
182 | # define mutex_lock_io_nested(lock, subclass) mutex_lock(lock) | ||
179 | #endif | 183 | #endif |
180 | 184 | ||
181 | /* | 185 | /* |
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 62d44c176071..64aa189efe21 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h | |||
@@ -8,19 +8,9 @@ | |||
8 | #include <linux/alarmtimer.h> | 8 | #include <linux/alarmtimer.h> |
9 | 9 | ||
10 | 10 | ||
11 | static inline unsigned long long cputime_to_expires(cputime_t expires) | ||
12 | { | ||
13 | return (__force unsigned long long)expires; | ||
14 | } | ||
15 | |||
16 | static inline cputime_t expires_to_cputime(unsigned long long expires) | ||
17 | { | ||
18 | return (__force cputime_t)expires; | ||
19 | } | ||
20 | |||
21 | struct cpu_timer_list { | 11 | struct cpu_timer_list { |
22 | struct list_head entry; | 12 | struct list_head entry; |
23 | unsigned long long expires, incr; | 13 | u64 expires, incr; |
24 | struct task_struct *task; | 14 | struct task_struct *task; |
25 | int firing; | 15 | int firing; |
26 | }; | 16 | }; |
@@ -129,7 +119,7 @@ void run_posix_cpu_timers(struct task_struct *task); | |||
129 | void posix_cpu_timers_exit(struct task_struct *task); | 119 | void posix_cpu_timers_exit(struct task_struct *task); |
130 | void posix_cpu_timers_exit_group(struct task_struct *task); | 120 | void posix_cpu_timers_exit_group(struct task_struct *task); |
131 | void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, | 121 | void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, |
132 | cputime_t *newval, cputime_t *oldval); | 122 | u64 *newval, u64 *oldval); |
133 | 123 | ||
134 | long clock_nanosleep_restart(struct restart_block *restart_block); | 124 | long clock_nanosleep_restart(struct restart_block *restart_block); |
135 | 125 | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e4782eae076..c89b7fdec41e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -29,7 +29,6 @@ struct sched_param { | |||
29 | 29 | ||
30 | #include <asm/page.h> | 30 | #include <asm/page.h> |
31 | #include <asm/ptrace.h> | 31 | #include <asm/ptrace.h> |
32 | #include <linux/cputime.h> | ||
33 | 32 | ||
34 | #include <linux/smp.h> | 33 | #include <linux/smp.h> |
35 | #include <linux/sem.h> | 34 | #include <linux/sem.h> |
@@ -461,12 +460,10 @@ extern signed long schedule_timeout_idle(signed long timeout); | |||
461 | asmlinkage void schedule(void); | 460 | asmlinkage void schedule(void); |
462 | extern void schedule_preempt_disabled(void); | 461 | extern void schedule_preempt_disabled(void); |
463 | 462 | ||
463 | extern int __must_check io_schedule_prepare(void); | ||
464 | extern void io_schedule_finish(int token); | ||
464 | extern long io_schedule_timeout(long timeout); | 465 | extern long io_schedule_timeout(long timeout); |
465 | 466 | extern void io_schedule(void); | |
466 | static inline void io_schedule(void) | ||
467 | { | ||
468 | io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); | ||
469 | } | ||
470 | 467 | ||
471 | void __noreturn do_task_dead(void); | 468 | void __noreturn do_task_dead(void); |
472 | 469 | ||
@@ -565,15 +562,13 @@ struct pacct_struct { | |||
565 | int ac_flag; | 562 | int ac_flag; |
566 | long ac_exitcode; | 563 | long ac_exitcode; |
567 | unsigned long ac_mem; | 564 | unsigned long ac_mem; |
568 | cputime_t ac_utime, ac_stime; | 565 | u64 ac_utime, ac_stime; |
569 | unsigned long ac_minflt, ac_majflt; | 566 | unsigned long ac_minflt, ac_majflt; |
570 | }; | 567 | }; |
571 | 568 | ||
572 | struct cpu_itimer { | 569 | struct cpu_itimer { |
573 | cputime_t expires; | 570 | u64 expires; |
574 | cputime_t incr; | 571 | u64 incr; |
575 | u32 error; | ||
576 | u32 incr_error; | ||
577 | }; | 572 | }; |
578 | 573 | ||
579 | /** | 574 | /** |
@@ -587,8 +582,8 @@ struct cpu_itimer { | |||
587 | */ | 582 | */ |
588 | struct prev_cputime { | 583 | struct prev_cputime { |
589 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 584 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE |
590 | cputime_t utime; | 585 | u64 utime; |
591 | cputime_t stime; | 586 | u64 stime; |
592 | raw_spinlock_t lock; | 587 | raw_spinlock_t lock; |
593 | #endif | 588 | #endif |
594 | }; | 589 | }; |
@@ -603,8 +598,8 @@ static inline void prev_cputime_init(struct prev_cputime *prev) | |||
603 | 598 | ||
604 | /** | 599 | /** |
605 | * struct task_cputime - collected CPU time counts | 600 | * struct task_cputime - collected CPU time counts |
606 | * @utime: time spent in user mode, in &cputime_t units | 601 | * @utime: time spent in user mode, in nanoseconds |
607 | * @stime: time spent in kernel mode, in &cputime_t units | 602 | * @stime: time spent in kernel mode, in nanoseconds |
608 | * @sum_exec_runtime: total time spent on the CPU, in nanoseconds | 603 | * @sum_exec_runtime: total time spent on the CPU, in nanoseconds |
609 | * | 604 | * |
610 | * This structure groups together three kinds of CPU time that are tracked for | 605 | * This structure groups together three kinds of CPU time that are tracked for |
@@ -612,8 +607,8 @@ static inline void prev_cputime_init(struct prev_cputime *prev) | |||
612 | * these counts together and treat all three of them in parallel. | 607 | * these counts together and treat all three of them in parallel. |
613 | */ | 608 | */ |
614 | struct task_cputime { | 609 | struct task_cputime { |
615 | cputime_t utime; | 610 | u64 utime; |
616 | cputime_t stime; | 611 | u64 stime; |
617 | unsigned long long sum_exec_runtime; | 612 | unsigned long long sum_exec_runtime; |
618 | }; | 613 | }; |
619 | 614 | ||
@@ -622,13 +617,6 @@ struct task_cputime { | |||
622 | #define prof_exp stime | 617 | #define prof_exp stime |
623 | #define sched_exp sum_exec_runtime | 618 | #define sched_exp sum_exec_runtime |
624 | 619 | ||
625 | #define INIT_CPUTIME \ | ||
626 | (struct task_cputime) { \ | ||
627 | .utime = 0, \ | ||
628 | .stime = 0, \ | ||
629 | .sum_exec_runtime = 0, \ | ||
630 | } | ||
631 | |||
632 | /* | 620 | /* |
633 | * This is the atomic variant of task_cputime, which can be used for | 621 | * This is the atomic variant of task_cputime, which can be used for |
634 | * storing and updating task_cputime statistics without locking. | 622 | * storing and updating task_cputime statistics without locking. |
@@ -787,9 +775,9 @@ struct signal_struct { | |||
787 | * in __exit_signal, except for the group leader. | 775 | * in __exit_signal, except for the group leader. |
788 | */ | 776 | */ |
789 | seqlock_t stats_lock; | 777 | seqlock_t stats_lock; |
790 | cputime_t utime, stime, cutime, cstime; | 778 | u64 utime, stime, cutime, cstime; |
791 | cputime_t gtime; | 779 | u64 gtime; |
792 | cputime_t cgtime; | 780 | u64 cgtime; |
793 | struct prev_cputime prev_cputime; | 781 | struct prev_cputime prev_cputime; |
794 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; | 782 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; |
795 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; | 783 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; |
@@ -1668,11 +1656,11 @@ struct task_struct { | |||
1668 | int __user *set_child_tid; /* CLONE_CHILD_SETTID */ | 1656 | int __user *set_child_tid; /* CLONE_CHILD_SETTID */ |
1669 | int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ | 1657 | int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ |
1670 | 1658 | ||
1671 | cputime_t utime, stime; | 1659 | u64 utime, stime; |
1672 | #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME | 1660 | #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME |
1673 | cputime_t utimescaled, stimescaled; | 1661 | u64 utimescaled, stimescaled; |
1674 | #endif | 1662 | #endif |
1675 | cputime_t gtime; | 1663 | u64 gtime; |
1676 | struct prev_cputime prev_cputime; | 1664 | struct prev_cputime prev_cputime; |
1677 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 1665 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
1678 | seqcount_t vtime_seqcount; | 1666 | seqcount_t vtime_seqcount; |
@@ -1824,7 +1812,7 @@ struct task_struct { | |||
1824 | #if defined(CONFIG_TASK_XACCT) | 1812 | #if defined(CONFIG_TASK_XACCT) |
1825 | u64 acct_rss_mem1; /* accumulated rss usage */ | 1813 | u64 acct_rss_mem1; /* accumulated rss usage */ |
1826 | u64 acct_vm_mem1; /* accumulated virtual memory usage */ | 1814 | u64 acct_vm_mem1; /* accumulated virtual memory usage */ |
1827 | cputime_t acct_timexpd; /* stime + utime since last update */ | 1815 | u64 acct_timexpd; /* stime + utime since last update */ |
1828 | #endif | 1816 | #endif |
1829 | #ifdef CONFIG_CPUSETS | 1817 | #ifdef CONFIG_CPUSETS |
1830 | nodemask_t mems_allowed; /* Protected by alloc_lock */ | 1818 | nodemask_t mems_allowed; /* Protected by alloc_lock */ |
@@ -2269,17 +2257,17 @@ struct task_struct *try_get_task_struct(struct task_struct **ptask); | |||
2269 | 2257 | ||
2270 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 2258 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
2271 | extern void task_cputime(struct task_struct *t, | 2259 | extern void task_cputime(struct task_struct *t, |
2272 | cputime_t *utime, cputime_t *stime); | 2260 | u64 *utime, u64 *stime); |
2273 | extern cputime_t task_gtime(struct task_struct *t); | 2261 | extern u64 task_gtime(struct task_struct *t); |
2274 | #else | 2262 | #else |
2275 | static inline void task_cputime(struct task_struct *t, | 2263 | static inline void task_cputime(struct task_struct *t, |
2276 | cputime_t *utime, cputime_t *stime) | 2264 | u64 *utime, u64 *stime) |
2277 | { | 2265 | { |
2278 | *utime = t->utime; | 2266 | *utime = t->utime; |
2279 | *stime = t->stime; | 2267 | *stime = t->stime; |
2280 | } | 2268 | } |
2281 | 2269 | ||
2282 | static inline cputime_t task_gtime(struct task_struct *t) | 2270 | static inline u64 task_gtime(struct task_struct *t) |
2283 | { | 2271 | { |
2284 | return t->gtime; | 2272 | return t->gtime; |
2285 | } | 2273 | } |
@@ -2287,23 +2275,23 @@ static inline cputime_t task_gtime(struct task_struct *t) | |||
2287 | 2275 | ||
2288 | #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME | 2276 | #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME |
2289 | static inline void task_cputime_scaled(struct task_struct *t, | 2277 | static inline void task_cputime_scaled(struct task_struct *t, |
2290 | cputime_t *utimescaled, | 2278 | u64 *utimescaled, |
2291 | cputime_t *stimescaled) | 2279 | u64 *stimescaled) |
2292 | { | 2280 | { |
2293 | *utimescaled = t->utimescaled; | 2281 | *utimescaled = t->utimescaled; |
2294 | *stimescaled = t->stimescaled; | 2282 | *stimescaled = t->stimescaled; |
2295 | } | 2283 | } |
2296 | #else | 2284 | #else |
2297 | static inline void task_cputime_scaled(struct task_struct *t, | 2285 | static inline void task_cputime_scaled(struct task_struct *t, |
2298 | cputime_t *utimescaled, | 2286 | u64 *utimescaled, |
2299 | cputime_t *stimescaled) | 2287 | u64 *stimescaled) |
2300 | { | 2288 | { |
2301 | task_cputime(t, utimescaled, stimescaled); | 2289 | task_cputime(t, utimescaled, stimescaled); |
2302 | } | 2290 | } |
2303 | #endif | 2291 | #endif |
2304 | 2292 | ||
2305 | extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); | 2293 | extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); |
2306 | extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); | 2294 | extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st); |
2307 | 2295 | ||
2308 | /* | 2296 | /* |
2309 | * Per process flags | 2297 | * Per process flags |
@@ -2522,10 +2510,18 @@ extern u64 sched_clock_cpu(int cpu); | |||
2522 | extern void sched_clock_init(void); | 2510 | extern void sched_clock_init(void); |
2523 | 2511 | ||
2524 | #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 2512 | #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
2513 | static inline void sched_clock_init_late(void) | ||
2514 | { | ||
2515 | } | ||
2516 | |||
2525 | static inline void sched_clock_tick(void) | 2517 | static inline void sched_clock_tick(void) |
2526 | { | 2518 | { |
2527 | } | 2519 | } |
2528 | 2520 | ||
2521 | static inline void clear_sched_clock_stable(void) | ||
2522 | { | ||
2523 | } | ||
2524 | |||
2529 | static inline void sched_clock_idle_sleep_event(void) | 2525 | static inline void sched_clock_idle_sleep_event(void) |
2530 | { | 2526 | { |
2531 | } | 2527 | } |
@@ -2544,6 +2540,7 @@ static inline u64 local_clock(void) | |||
2544 | return sched_clock(); | 2540 | return sched_clock(); |
2545 | } | 2541 | } |
2546 | #else | 2542 | #else |
2543 | extern void sched_clock_init_late(void); | ||
2547 | /* | 2544 | /* |
2548 | * Architectures can set this to 1 if they have specified | 2545 | * Architectures can set this to 1 if they have specified |
2549 | * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, | 2546 | * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, |
@@ -2551,7 +2548,6 @@ static inline u64 local_clock(void) | |||
2551 | * is reliable after all: | 2548 | * is reliable after all: |
2552 | */ | 2549 | */ |
2553 | extern int sched_clock_stable(void); | 2550 | extern int sched_clock_stable(void); |
2554 | extern void set_sched_clock_stable(void); | ||
2555 | extern void clear_sched_clock_stable(void); | 2551 | extern void clear_sched_clock_stable(void); |
2556 | 2552 | ||
2557 | extern void sched_clock_tick(void); | 2553 | extern void sched_clock_tick(void); |
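Since task_cputime() now reports u64 nanoseconds, callers can do arithmetic on the values directly. A hypothetical helper sketch (example_task_cputime_ns is not part of this patch):

#include <linux/sched.h>

/* Hypothetical helper (sketch): total CPU time of a task in nanoseconds.
 * No cputime_to_nsecs() round-trip is needed any more. */
static u64 example_task_cputime_ns(struct task_struct *t)
{
        u64 utime, stime;

        task_cputime(t, &utime, &stime);
        return utime + stime;
}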
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 441145351301..49308e142aae 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h | |||
@@ -59,6 +59,7 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice; | |||
59 | extern unsigned int sysctl_sched_autogroup_enabled; | 59 | extern unsigned int sysctl_sched_autogroup_enabled; |
60 | #endif | 60 | #endif |
61 | 61 | ||
62 | extern int sysctl_sched_rr_timeslice; | ||
62 | extern int sched_rr_timeslice; | 63 | extern int sched_rr_timeslice; |
63 | 64 | ||
64 | extern int sched_rr_handler(struct ctl_table *table, int write, | 65 | extern int sched_rr_handler(struct ctl_table *table, int write, |
diff --git a/include/linux/vtime.h b/include/linux/vtime.h index aa9bfea8804a..0681fe25abeb 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h | |||
@@ -58,27 +58,28 @@ static inline void vtime_task_switch(struct task_struct *prev) | |||
58 | 58 | ||
59 | extern void vtime_account_system(struct task_struct *tsk); | 59 | extern void vtime_account_system(struct task_struct *tsk); |
60 | extern void vtime_account_idle(struct task_struct *tsk); | 60 | extern void vtime_account_idle(struct task_struct *tsk); |
61 | extern void vtime_account_user(struct task_struct *tsk); | ||
62 | 61 | ||
63 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ | 62 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ |
64 | 63 | ||
65 | static inline void vtime_task_switch(struct task_struct *prev) { } | 64 | static inline void vtime_task_switch(struct task_struct *prev) { } |
66 | static inline void vtime_account_system(struct task_struct *tsk) { } | 65 | static inline void vtime_account_system(struct task_struct *tsk) { } |
67 | static inline void vtime_account_user(struct task_struct *tsk) { } | ||
68 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ | 66 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ |
69 | 67 | ||
70 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 68 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
71 | extern void arch_vtime_task_switch(struct task_struct *tsk); | 69 | extern void arch_vtime_task_switch(struct task_struct *tsk); |
70 | extern void vtime_account_user(struct task_struct *tsk); | ||
72 | extern void vtime_user_enter(struct task_struct *tsk); | 71 | extern void vtime_user_enter(struct task_struct *tsk); |
73 | 72 | ||
74 | static inline void vtime_user_exit(struct task_struct *tsk) | 73 | static inline void vtime_user_exit(struct task_struct *tsk) |
75 | { | 74 | { |
76 | vtime_account_user(tsk); | 75 | vtime_account_user(tsk); |
77 | } | 76 | } |
77 | |||
78 | extern void vtime_guest_enter(struct task_struct *tsk); | 78 | extern void vtime_guest_enter(struct task_struct *tsk); |
79 | extern void vtime_guest_exit(struct task_struct *tsk); | 79 | extern void vtime_guest_exit(struct task_struct *tsk); |
80 | extern void vtime_init_idle(struct task_struct *tsk, int cpu); | 80 | extern void vtime_init_idle(struct task_struct *tsk, int cpu); |
81 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ | 81 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ |
82 | static inline void vtime_account_user(struct task_struct *tsk) { } | ||
82 | static inline void vtime_user_enter(struct task_struct *tsk) { } | 83 | static inline void vtime_user_enter(struct task_struct *tsk) { } |
83 | static inline void vtime_user_exit(struct task_struct *tsk) { } | 84 | static inline void vtime_user_exit(struct task_struct *tsk) { } |
84 | static inline void vtime_guest_enter(struct task_struct *tsk) { } | 85 | static inline void vtime_guest_enter(struct task_struct *tsk) { } |
@@ -93,9 +94,11 @@ static inline void vtime_account_irq_exit(struct task_struct *tsk) | |||
93 | /* On hard|softirq exit we always account to hard|softirq cputime */ | 94 | /* On hard|softirq exit we always account to hard|softirq cputime */ |
94 | vtime_account_system(tsk); | 95 | vtime_account_system(tsk); |
95 | } | 96 | } |
97 | extern void vtime_flush(struct task_struct *tsk); | ||
96 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 98 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
97 | static inline void vtime_account_irq_enter(struct task_struct *tsk) { } | 99 | static inline void vtime_account_irq_enter(struct task_struct *tsk) { } |
98 | static inline void vtime_account_irq_exit(struct task_struct *tsk) { } | 100 | static inline void vtime_account_irq_exit(struct task_struct *tsk) { } |
101 | static inline void vtime_flush(struct task_struct *tsk) { } | ||
99 | #endif | 102 | #endif |
100 | 103 | ||
101 | 104 | ||
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 1448637616d6..1bca99dbb98f 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h | |||
@@ -269,17 +269,17 @@ DEFINE_EVENT(hrtimer_class, hrtimer_cancel, | |||
269 | TRACE_EVENT(itimer_state, | 269 | TRACE_EVENT(itimer_state, |
270 | 270 | ||
271 | TP_PROTO(int which, const struct itimerval *const value, | 271 | TP_PROTO(int which, const struct itimerval *const value, |
272 | cputime_t expires), | 272 | unsigned long long expires), |
273 | 273 | ||
274 | TP_ARGS(which, value, expires), | 274 | TP_ARGS(which, value, expires), |
275 | 275 | ||
276 | TP_STRUCT__entry( | 276 | TP_STRUCT__entry( |
277 | __field( int, which ) | 277 | __field( int, which ) |
278 | __field( cputime_t, expires ) | 278 | __field( unsigned long long, expires ) |
279 | __field( long, value_sec ) | 279 | __field( long, value_sec ) |
280 | __field( long, value_usec ) | 280 | __field( long, value_usec ) |
281 | __field( long, interval_sec ) | 281 | __field( long, interval_sec ) |
282 | __field( long, interval_usec ) | 282 | __field( long, interval_usec ) |
283 | ), | 283 | ), |
284 | 284 | ||
285 | TP_fast_assign( | 285 | TP_fast_assign( |
@@ -292,7 +292,7 @@ TRACE_EVENT(itimer_state, | |||
292 | ), | 292 | ), |
293 | 293 | ||
294 | TP_printk("which=%d expires=%llu it_value=%ld.%ld it_interval=%ld.%ld", | 294 | TP_printk("which=%d expires=%llu it_value=%ld.%ld it_interval=%ld.%ld", |
295 | __entry->which, (unsigned long long)__entry->expires, | 295 | __entry->which, __entry->expires, |
296 | __entry->value_sec, __entry->value_usec, | 296 | __entry->value_sec, __entry->value_usec, |
297 | __entry->interval_sec, __entry->interval_usec) | 297 | __entry->interval_sec, __entry->interval_usec) |
298 | ); | 298 | ); |
@@ -305,14 +305,14 @@ TRACE_EVENT(itimer_state, | |||
305 | */ | 305 | */ |
306 | TRACE_EVENT(itimer_expire, | 306 | TRACE_EVENT(itimer_expire, |
307 | 307 | ||
308 | TP_PROTO(int which, struct pid *pid, cputime_t now), | 308 | TP_PROTO(int which, struct pid *pid, unsigned long long now), |
309 | 309 | ||
310 | TP_ARGS(which, pid, now), | 310 | TP_ARGS(which, pid, now), |
311 | 311 | ||
312 | TP_STRUCT__entry( | 312 | TP_STRUCT__entry( |
313 | __field( int , which ) | 313 | __field( int , which ) |
314 | __field( pid_t, pid ) | 314 | __field( pid_t, pid ) |
315 | __field( cputime_t, now ) | 315 | __field( unsigned long long, now ) |
316 | ), | 316 | ), |
317 | 317 | ||
318 | TP_fast_assign( | 318 | TP_fast_assign( |
@@ -322,7 +322,7 @@ TRACE_EVENT(itimer_expire, | |||
322 | ), | 322 | ), |
323 | 323 | ||
324 | TP_printk("which=%d pid=%d now=%llu", __entry->which, | 324 | TP_printk("which=%d pid=%d now=%llu", __entry->which, |
325 | (int) __entry->pid, (unsigned long long)__entry->now) | 325 | (int) __entry->pid, __entry->now) |
326 | ); | 326 | ); |
327 | 327 | ||
328 | #ifdef CONFIG_NO_HZ_COMMON | 328 | #ifdef CONFIG_NO_HZ_COMMON |
diff --git a/init/main.c b/init/main.c index 9648d707eea5..6ced14a3df12 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -625,7 +625,6 @@ asmlinkage __visible void __init start_kernel(void) | |||
625 | numa_policy_init(); | 625 | numa_policy_init(); |
626 | if (late_time_init) | 626 | if (late_time_init) |
627 | late_time_init(); | 627 | late_time_init(); |
628 | sched_clock_init(); | ||
629 | calibrate_delay(); | 628 | calibrate_delay(); |
630 | pidmap_init(); | 629 | pidmap_init(); |
631 | anon_vma_init(); | 630 | anon_vma_init(); |
diff --git a/kernel/acct.c b/kernel/acct.c index 74963d192c5d..ca9cb55b5855 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
@@ -453,8 +453,8 @@ static void fill_ac(acct_t *ac) | |||
453 | spin_lock_irq(¤t->sighand->siglock); | 453 | spin_lock_irq(¤t->sighand->siglock); |
454 | tty = current->signal->tty; /* Safe as we hold the siglock */ | 454 | tty = current->signal->tty; /* Safe as we hold the siglock */ |
455 | ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; | 455 | ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; |
456 | ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); | 456 | ac->ac_utime = encode_comp_t(nsec_to_AHZ(pacct->ac_utime)); |
457 | ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); | 457 | ac->ac_stime = encode_comp_t(nsec_to_AHZ(pacct->ac_stime)); |
458 | ac->ac_flag = pacct->ac_flag; | 458 | ac->ac_flag = pacct->ac_flag; |
459 | ac->ac_mem = encode_comp_t(pacct->ac_mem); | 459 | ac->ac_mem = encode_comp_t(pacct->ac_mem); |
460 | ac->ac_minflt = encode_comp_t(pacct->ac_minflt); | 460 | ac->ac_minflt = encode_comp_t(pacct->ac_minflt); |
@@ -530,7 +530,7 @@ out: | |||
530 | void acct_collect(long exitcode, int group_dead) | 530 | void acct_collect(long exitcode, int group_dead) |
531 | { | 531 | { |
532 | struct pacct_struct *pacct = ¤t->signal->pacct; | 532 | struct pacct_struct *pacct = ¤t->signal->pacct; |
533 | cputime_t utime, stime; | 533 | u64 utime, stime; |
534 | unsigned long vsize = 0; | 534 | unsigned long vsize = 0; |
535 | 535 | ||
536 | if (group_dead && current->mm) { | 536 | if (group_dead && current->mm) { |
@@ -559,6 +559,7 @@ void acct_collect(long exitcode, int group_dead) | |||
559 | pacct->ac_flag |= ACORE; | 559 | pacct->ac_flag |= ACORE; |
560 | if (current->flags & PF_SIGNALED) | 560 | if (current->flags & PF_SIGNALED) |
561 | pacct->ac_flag |= AXSIG; | 561 | pacct->ac_flag |= AXSIG; |
562 | |||
562 | task_cputime(current, &utime, &stime); | 563 | task_cputime(current, &utime, &stime); |
563 | pacct->ac_utime += utime; | 564 | pacct->ac_utime += utime; |
564 | pacct->ac_stime += stime; | 565 | pacct->ac_stime += stime; |
diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 435c14a45118..660549656991 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c | |||
@@ -82,19 +82,19 @@ void __delayacct_blkio_end(void) | |||
82 | 82 | ||
83 | int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) | 83 | int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) |
84 | { | 84 | { |
85 | cputime_t utime, stime, stimescaled, utimescaled; | 85 | u64 utime, stime, stimescaled, utimescaled; |
86 | unsigned long long t2, t3; | 86 | unsigned long long t2, t3; |
87 | unsigned long flags, t1; | 87 | unsigned long flags, t1; |
88 | s64 tmp; | 88 | s64 tmp; |
89 | 89 | ||
90 | task_cputime(tsk, &utime, &stime); | 90 | task_cputime(tsk, &utime, &stime); |
91 | tmp = (s64)d->cpu_run_real_total; | 91 | tmp = (s64)d->cpu_run_real_total; |
92 | tmp += cputime_to_nsecs(utime + stime); | 92 | tmp += utime + stime; |
93 | d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; | 93 | d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; |
94 | 94 | ||
95 | task_cputime_scaled(tsk, &utimescaled, &stimescaled); | 95 | task_cputime_scaled(tsk, &utimescaled, &stimescaled); |
96 | tmp = (s64)d->cpu_scaled_run_real_total; | 96 | tmp = (s64)d->cpu_scaled_run_real_total; |
97 | tmp += cputime_to_nsecs(utimescaled + stimescaled); | 97 | tmp += utimescaled + stimescaled; |
98 | d->cpu_scaled_run_real_total = | 98 | d->cpu_scaled_run_real_total = |
99 | (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; | 99 | (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; |
100 | 100 | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 8f14b866f9f6..8e5e21338b3a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -86,7 +86,7 @@ static void __exit_signal(struct task_struct *tsk) | |||
86 | bool group_dead = thread_group_leader(tsk); | 86 | bool group_dead = thread_group_leader(tsk); |
87 | struct sighand_struct *sighand; | 87 | struct sighand_struct *sighand; |
88 | struct tty_struct *uninitialized_var(tty); | 88 | struct tty_struct *uninitialized_var(tty); |
89 | cputime_t utime, stime; | 89 | u64 utime, stime; |
90 | 90 | ||
91 | sighand = rcu_dereference_check(tsk->sighand, | 91 | sighand = rcu_dereference_check(tsk->sighand, |
92 | lockdep_tasklist_lock_is_held()); | 92 | lockdep_tasklist_lock_is_held()); |
@@ -1091,7 +1091,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) | |||
1091 | struct signal_struct *sig = p->signal; | 1091 | struct signal_struct *sig = p->signal; |
1092 | struct signal_struct *psig = current->signal; | 1092 | struct signal_struct *psig = current->signal; |
1093 | unsigned long maxrss; | 1093 | unsigned long maxrss; |
1094 | cputime_t tgutime, tgstime; | 1094 | u64 tgutime, tgstime; |
1095 | 1095 | ||
1096 | /* | 1096 | /* |
1097 | * The resource counters for the group leader are in its | 1097 | * The resource counters for the group leader are in its |
diff --git a/kernel/fork.c b/kernel/fork.c index 105c6676d93b..f6995cdfe714 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1314,7 +1314,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) | |||
1314 | 1314 | ||
1315 | cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); | 1315 | cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); |
1316 | if (cpu_limit != RLIM_INFINITY) { | 1316 | if (cpu_limit != RLIM_INFINITY) { |
1317 | sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); | 1317 | sig->cputime_expires.prof_exp = cpu_limit * NSEC_PER_SEC; |
1318 | sig->cputimer.running = true; | 1318 | sig->cputimer.running = true; |
1319 | } | 1319 | } |
1320 | 1320 | ||
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 9b349619f431..8464a5cbab97 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
@@ -783,6 +783,20 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) | |||
783 | } | 783 | } |
784 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); | 784 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); |
785 | 785 | ||
786 | void __sched | ||
787 | mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) | ||
788 | { | ||
789 | int token; | ||
790 | |||
791 | might_sleep(); | ||
792 | |||
793 | token = io_schedule_prepare(); | ||
794 | __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, | ||
795 | subclass, NULL, _RET_IP_, NULL, 0); | ||
796 | io_schedule_finish(token); | ||
797 | } | ||
798 | EXPORT_SYMBOL_GPL(mutex_lock_io_nested); | ||
799 | |||
786 | static inline int | 800 | static inline int |
787 | ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | 801 | ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) |
788 | { | 802 | { |
@@ -950,6 +964,16 @@ int __sched mutex_lock_killable(struct mutex *lock) | |||
950 | } | 964 | } |
951 | EXPORT_SYMBOL(mutex_lock_killable); | 965 | EXPORT_SYMBOL(mutex_lock_killable); |
952 | 966 | ||
967 | void __sched mutex_lock_io(struct mutex *lock) | ||
968 | { | ||
969 | int token; | ||
970 | |||
971 | token = io_schedule_prepare(); | ||
972 | mutex_lock(lock); | ||
973 | io_schedule_finish(token); | ||
974 | } | ||
975 | EXPORT_SYMBOL_GPL(mutex_lock_io); | ||
976 | |||
953 | static noinline void __sched | 977 | static noinline void __sched |
954 | __mutex_lock_slowpath(struct mutex *lock) | 978 | __mutex_lock_slowpath(struct mutex *lock) |
955 | { | 979 | { |
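Both new entry points simply bracket the normal mutex acquisition with io_schedule_prepare()/io_schedule_finish(), so time spent sleeping on the lock is charged as IO-wait for the caller. A hypothetical caller sketch (struct my_bdev and its fields are made-up names for illustration, not kernel API):

  #include <linux/mutex.h>

  struct my_bdev {
          struct mutex    io_lock;
          /* ... */
  };

  static void my_bdev_submit(struct my_bdev *bdev)
  {
          mutex_lock_io(&bdev->io_lock);  /* waiting here is accounted as iowait */
          /* queue the request ... */
          mutex_unlock(&bdev->io_lock);
  }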
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 5e59b832ae2b..89ab6758667b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -18,8 +18,8 @@ endif | |||
18 | obj-y += core.o loadavg.o clock.o cputime.o | 18 | obj-y += core.o loadavg.o clock.o cputime.o |
19 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o | 19 | obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o |
20 | obj-y += wait.o swait.o completion.o idle.o | 20 | obj-y += wait.o swait.o completion.o idle.o |
21 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o | 21 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o |
22 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 22 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o |
23 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 23 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
24 | obj-$(CONFIG_SCHED_DEBUG) += debug.o | 24 | obj-$(CONFIG_SCHED_DEBUG) += debug.o |
25 | obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o | 25 | obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o |
diff --git a/kernel/sched/auto_group.c b/kernel/sched/autogroup.c index da39489d2d80..da39489d2d80 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/autogroup.c | |||
diff --git a/kernel/sched/auto_group.h b/kernel/sched/autogroup.h index 890c95f2587a..890c95f2587a 100644 --- a/kernel/sched/auto_group.h +++ b/kernel/sched/autogroup.h | |||
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e85a725e5c34..ad64efe41722 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
@@ -77,41 +77,88 @@ EXPORT_SYMBOL_GPL(sched_clock); | |||
77 | 77 | ||
78 | __read_mostly int sched_clock_running; | 78 | __read_mostly int sched_clock_running; |
79 | 79 | ||
80 | void sched_clock_init(void) | ||
81 | { | ||
82 | sched_clock_running = 1; | ||
83 | } | ||
84 | |||
80 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | 85 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK |
81 | static struct static_key __sched_clock_stable = STATIC_KEY_INIT; | 86 | /* |
82 | static int __sched_clock_stable_early; | 87 | * We must start with !__sched_clock_stable because the unstable -> stable |
88 | * transition is accurate, while the stable -> unstable transition is not. | ||
89 | * | ||
90 | * Similarly we start with __sched_clock_stable_early, thereby assuming we | ||
91 | * will become stable, such that there's only a single 1 -> 0 transition. | ||
92 | */ | ||
93 | static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); | ||
94 | static int __sched_clock_stable_early = 1; | ||
83 | 95 | ||
84 | int sched_clock_stable(void) | 96 | /* |
97 | * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset | ||
98 | */ | ||
99 | static __read_mostly u64 raw_offset; | ||
100 | static __read_mostly u64 gtod_offset; | ||
101 | |||
102 | struct sched_clock_data { | ||
103 | u64 tick_raw; | ||
104 | u64 tick_gtod; | ||
105 | u64 clock; | ||
106 | }; | ||
107 | |||
108 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); | ||
109 | |||
110 | static inline struct sched_clock_data *this_scd(void) | ||
85 | { | 111 | { |
86 | return static_key_false(&__sched_clock_stable); | 112 | return this_cpu_ptr(&sched_clock_data); |
87 | } | 113 | } |
88 | 114 | ||
89 | static void __set_sched_clock_stable(void) | 115 | static inline struct sched_clock_data *cpu_sdc(int cpu) |
90 | { | 116 | { |
91 | if (!sched_clock_stable()) | 117 | return &per_cpu(sched_clock_data, cpu); |
92 | static_key_slow_inc(&__sched_clock_stable); | 118 | } |
93 | 119 | ||
94 | tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); | 120 | int sched_clock_stable(void) |
121 | { | ||
122 | return static_branch_likely(&__sched_clock_stable); | ||
95 | } | 123 | } |
96 | 124 | ||
97 | void set_sched_clock_stable(void) | 125 | static void __set_sched_clock_stable(void) |
98 | { | 126 | { |
99 | __sched_clock_stable_early = 1; | 127 | struct sched_clock_data *scd = this_scd(); |
100 | 128 | ||
101 | smp_mb(); /* matches sched_clock_init() */ | 129 | /* |
130 | * Attempt to make the (initial) unstable->stable transition continuous. | ||
131 | */ | ||
132 | raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw); | ||
102 | 133 | ||
103 | if (!sched_clock_running) | 134 | printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", |
104 | return; | 135 | scd->tick_gtod, gtod_offset, |
136 | scd->tick_raw, raw_offset); | ||
105 | 137 | ||
106 | __set_sched_clock_stable(); | 138 | static_branch_enable(&__sched_clock_stable); |
139 | tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); | ||
107 | } | 140 | } |
108 | 141 | ||
109 | static void __clear_sched_clock_stable(struct work_struct *work) | 142 | static void __clear_sched_clock_stable(struct work_struct *work) |
110 | { | 143 | { |
111 | /* XXX worry about clock continuity */ | 144 | struct sched_clock_data *scd = this_scd(); |
112 | if (sched_clock_stable()) | 145 | |
113 | static_key_slow_dec(&__sched_clock_stable); | 146 | /* |
147 | * Attempt to make the stable->unstable transition continuous. | ||
148 | * | ||
149 | * Trouble is, this is typically called from the TSC watchdog | ||
150 | * timer, which is late per definition. This means the tick | ||
151 | * values can already be screwy. | ||
152 | * | ||
153 | * Still do what we can. | ||
154 | */ | ||
155 | gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod); | ||
156 | |||
157 | printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", | ||
158 | scd->tick_gtod, gtod_offset, | ||
159 | scd->tick_raw, raw_offset); | ||
114 | 160 | ||
161 | static_branch_disable(&__sched_clock_stable); | ||
115 | tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); | 162 | tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); |
116 | } | 163 | } |
117 | 164 | ||
@@ -121,47 +168,15 @@ void clear_sched_clock_stable(void) | |||
121 | { | 168 | { |
122 | __sched_clock_stable_early = 0; | 169 | __sched_clock_stable_early = 0; |
123 | 170 | ||
124 | smp_mb(); /* matches sched_clock_init() */ | 171 | smp_mb(); /* matches sched_clock_init_late() */ |
125 | |||
126 | if (!sched_clock_running) | ||
127 | return; | ||
128 | 172 | ||
129 | schedule_work(&sched_clock_work); | 173 | if (sched_clock_running == 2) |
174 | schedule_work(&sched_clock_work); | ||
130 | } | 175 | } |
131 | 176 | ||
132 | struct sched_clock_data { | 177 | void sched_clock_init_late(void) |
133 | u64 tick_raw; | ||
134 | u64 tick_gtod; | ||
135 | u64 clock; | ||
136 | }; | ||
137 | |||
138 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); | ||
139 | |||
140 | static inline struct sched_clock_data *this_scd(void) | ||
141 | { | 178 | { |
142 | return this_cpu_ptr(&sched_clock_data); | 179 | sched_clock_running = 2; |
143 | } | ||
144 | |||
145 | static inline struct sched_clock_data *cpu_sdc(int cpu) | ||
146 | { | ||
147 | return &per_cpu(sched_clock_data, cpu); | ||
148 | } | ||
149 | |||
150 | void sched_clock_init(void) | ||
151 | { | ||
152 | u64 ktime_now = ktime_to_ns(ktime_get()); | ||
153 | int cpu; | ||
154 | |||
155 | for_each_possible_cpu(cpu) { | ||
156 | struct sched_clock_data *scd = cpu_sdc(cpu); | ||
157 | |||
158 | scd->tick_raw = 0; | ||
159 | scd->tick_gtod = ktime_now; | ||
160 | scd->clock = ktime_now; | ||
161 | } | ||
162 | |||
163 | sched_clock_running = 1; | ||
164 | |||
165 | /* | 180 | /* |
166 | * Ensure that it is impossible to not do a static_key update. | 181 | * Ensure that it is impossible to not do a static_key update. |
167 | * | 182 | * |
@@ -173,8 +188,6 @@ void sched_clock_init(void) | |||
173 | 188 | ||
174 | if (__sched_clock_stable_early) | 189 | if (__sched_clock_stable_early) |
175 | __set_sched_clock_stable(); | 190 | __set_sched_clock_stable(); |
176 | else | ||
177 | __clear_sched_clock_stable(NULL); | ||
178 | } | 191 | } |
179 | 192 | ||
180 | /* | 193 | /* |
@@ -216,7 +229,7 @@ again: | |||
216 | * scd->tick_gtod + TICK_NSEC); | 229 | * scd->tick_gtod + TICK_NSEC); |
217 | */ | 230 | */ |
218 | 231 | ||
219 | clock = scd->tick_gtod + delta; | 232 | clock = scd->tick_gtod + gtod_offset + delta; |
220 | min_clock = wrap_max(scd->tick_gtod, old_clock); | 233 | min_clock = wrap_max(scd->tick_gtod, old_clock); |
221 | max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); | 234 | max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); |
222 | 235 | ||
@@ -302,7 +315,7 @@ u64 sched_clock_cpu(int cpu) | |||
302 | u64 clock; | 315 | u64 clock; |
303 | 316 | ||
304 | if (sched_clock_stable()) | 317 | if (sched_clock_stable()) |
305 | return sched_clock(); | 318 | return sched_clock() + raw_offset; |
306 | 319 | ||
307 | if (unlikely(!sched_clock_running)) | 320 | if (unlikely(!sched_clock_running)) |
308 | return 0ull; | 321 | return 0ull; |
@@ -323,23 +336,22 @@ EXPORT_SYMBOL_GPL(sched_clock_cpu); | |||
323 | void sched_clock_tick(void) | 336 | void sched_clock_tick(void) |
324 | { | 337 | { |
325 | struct sched_clock_data *scd; | 338 | struct sched_clock_data *scd; |
326 | u64 now, now_gtod; | ||
327 | |||
328 | if (sched_clock_stable()) | ||
329 | return; | ||
330 | |||
331 | if (unlikely(!sched_clock_running)) | ||
332 | return; | ||
333 | 339 | ||
334 | WARN_ON_ONCE(!irqs_disabled()); | 340 | WARN_ON_ONCE(!irqs_disabled()); |
335 | 341 | ||
342 | /* | ||
343 | * Update these values even if sched_clock_stable(), because it can | ||
344 | * become unstable at any point in time at which point we need some | ||
345 | * values to fall back on. | ||
346 | * | ||
347 | * XXX arguably we can skip this if we expose tsc_clocksource_reliable | ||
348 | */ | ||
336 | scd = this_scd(); | 349 | scd = this_scd(); |
337 | now_gtod = ktime_to_ns(ktime_get()); | 350 | scd->tick_raw = sched_clock(); |
338 | now = sched_clock(); | 351 | scd->tick_gtod = ktime_get_ns(); |
339 | 352 | ||
340 | scd->tick_raw = now; | 353 | if (!sched_clock_stable() && likely(sched_clock_running)) |
341 | scd->tick_gtod = now_gtod; | 354 | sched_clock_local(scd); |
342 | sched_clock_local(scd); | ||
343 | } | 355 | } |
344 | 356 | ||
345 | /* | 357 | /* |
@@ -366,11 +378,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | |||
366 | 378 | ||
367 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 379 | #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
368 | 380 | ||
369 | void sched_clock_init(void) | ||
370 | { | ||
371 | sched_clock_running = 1; | ||
372 | } | ||
373 | |||
374 | u64 sched_clock_cpu(int cpu) | 381 | u64 sched_clock_cpu(int cpu) |
375 | { | 382 | { |
376 | if (unlikely(!sched_clock_running)) | 383 | if (unlikely(!sched_clock_running)) |
@@ -378,6 +385,7 @@ u64 sched_clock_cpu(int cpu) | |||
378 | 385 | ||
379 | return sched_clock(); | 386 | return sched_clock(); |
380 | } | 387 | } |
388 | |||
381 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ | 389 | #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ |
382 | 390 | ||
383 | /* | 391 | /* |
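The rework keeps two offsets so that flipping between the raw sched_clock() and the ktime-based clock never produces a jump: raw_offset is fixed at the unstable->stable transition, gtod_offset at the reverse one, maintaining the invariant ktime_get_ns() + gtod_offset == sched_clock() + raw_offset. A minimal user-space model of that bookkeeping (illustrative types only):

  #include <stdint.h>

  struct scd {
          uint64_t tick_raw;      /* last sched_clock() sample at the tick */
          uint64_t tick_gtod;     /* last ktime_get_ns() sample at the tick */
  };

  static uint64_t raw_offset, gtod_offset;

  /* unstable -> stable: later reads return raw + raw_offset */
  static void mark_stable(const struct scd *scd)
  {
          raw_offset = (scd->tick_gtod + gtod_offset) - scd->tick_raw;
  }

  /* stable -> unstable: later reads return gtod + gtod_offset (+ delta) */
  static void mark_unstable(const struct scd *scd)
  {
          gtod_offset = (scd->tick_raw + raw_offset) - scd->tick_gtod;
  }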
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 8d0f35debf35..f063a25d4449 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c | |||
@@ -31,7 +31,8 @@ void complete(struct completion *x) | |||
31 | unsigned long flags; | 31 | unsigned long flags; |
32 | 32 | ||
33 | spin_lock_irqsave(&x->wait.lock, flags); | 33 | spin_lock_irqsave(&x->wait.lock, flags); |
34 | x->done++; | 34 | if (x->done != UINT_MAX) |
35 | x->done++; | ||
35 | __wake_up_locked(&x->wait, TASK_NORMAL, 1); | 36 | __wake_up_locked(&x->wait, TASK_NORMAL, 1); |
36 | spin_unlock_irqrestore(&x->wait.lock, flags); | 37 | spin_unlock_irqrestore(&x->wait.lock, flags); |
37 | } | 38 | } |
@@ -51,7 +52,7 @@ void complete_all(struct completion *x) | |||
51 | unsigned long flags; | 52 | unsigned long flags; |
52 | 53 | ||
53 | spin_lock_irqsave(&x->wait.lock, flags); | 54 | spin_lock_irqsave(&x->wait.lock, flags); |
54 | x->done += UINT_MAX/2; | 55 | x->done = UINT_MAX; |
55 | __wake_up_locked(&x->wait, TASK_NORMAL, 0); | 56 | __wake_up_locked(&x->wait, TASK_NORMAL, 0); |
56 | spin_unlock_irqrestore(&x->wait.lock, flags); | 57 | spin_unlock_irqrestore(&x->wait.lock, flags); |
57 | } | 58 | } |
@@ -79,7 +80,8 @@ do_wait_for_common(struct completion *x, | |||
79 | if (!x->done) | 80 | if (!x->done) |
80 | return timeout; | 81 | return timeout; |
81 | } | 82 | } |
82 | x->done--; | 83 | if (x->done != UINT_MAX) |
84 | x->done--; | ||
83 | return timeout ?: 1; | 85 | return timeout ?: 1; |
84 | } | 86 | } |
85 | 87 | ||
@@ -280,7 +282,7 @@ bool try_wait_for_completion(struct completion *x) | |||
280 | spin_lock_irqsave(&x->wait.lock, flags); | 282 | spin_lock_irqsave(&x->wait.lock, flags); |
281 | if (!x->done) | 283 | if (!x->done) |
282 | ret = 0; | 284 | ret = 0; |
283 | else | 285 | else if (x->done != UINT_MAX) |
284 | x->done--; | 286 | x->done--; |
285 | spin_unlock_irqrestore(&x->wait.lock, flags); | 287 | spin_unlock_irqrestore(&x->wait.lock, flags); |
286 | return ret; | 288 | return ret; |
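After these hunks, complete_all() saturates ->done at UINT_MAX, and neither complete() nor the wait paths ever move a saturated counter, so "completed forever" is sticky and cannot be consumed away. A plain-C sketch of the resulting semantics (not the kernel API; locking omitted):

  #include <limits.h>

  static void complete_one(unsigned int *done)
  {
          if (*done != UINT_MAX)
                  (*done)++;              /* normal one-shot completion */
  }

  static void complete_all_sticky(unsigned int *done)
  {
          *done = UINT_MAX;               /* sticky: never decremented again */
  }

  static int try_consume(unsigned int *done)
  {
          if (!*done)
                  return 0;               /* nothing completed yet */
          if (*done != UINT_MAX)
                  (*done)--;              /* consume one completion */
          return 1;
  }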
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c56fb57f2991..34e2291a9a6c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1,88 +1,28 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched/core.c | 2 | * kernel/sched/core.c |
3 | * | 3 | * |
4 | * Kernel scheduler and related syscalls | 4 | * Core kernel scheduler code and related syscalls |
5 | * | 5 | * |
6 | * Copyright (C) 1991-2002 Linus Torvalds | 6 | * Copyright (C) 1991-2002 Linus Torvalds |
7 | * | ||
8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and | ||
9 | * make semaphores SMP safe | ||
10 | * 1998-11-19 Implemented schedule_timeout() and related stuff | ||
11 | * by Andrea Arcangeli | ||
12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: | ||
13 | * hybrid priority-list and round-robin design with | ||
14 | * an array-switch method of distributing timeslices | ||
15 | * and per-CPU runqueues. Cleanups and useful suggestions | ||
16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. | ||
17 | * 2003-09-03 Interactivity tuning by Con Kolivas. | ||
18 | * 2004-04-02 Scheduler domains code by Nick Piggin | ||
19 | * 2007-04-15 Work begun on replacing all interactivity tuning with a | ||
20 | * fair scheduling design by Con Kolivas. | ||
21 | * 2007-05-05 Load balancing (smp-nice) and other improvements | ||
22 | * by Peter Williams | ||
23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith | ||
24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri | ||
25 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, | ||
26 | * Thomas Gleixner, Mike Kravetz | ||
27 | */ | 7 | */ |
28 | 8 | #include <linux/sched.h> | |
29 | #include <linux/kasan.h> | ||
30 | #include <linux/mm.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <linux/nmi.h> | ||
33 | #include <linux/init.h> | ||
34 | #include <linux/uaccess.h> | ||
35 | #include <linux/highmem.h> | ||
36 | #include <linux/mmu_context.h> | ||
37 | #include <linux/interrupt.h> | ||
38 | #include <linux/capability.h> | ||
39 | #include <linux/completion.h> | ||
40 | #include <linux/kernel_stat.h> | ||
41 | #include <linux/debug_locks.h> | ||
42 | #include <linux/perf_event.h> | ||
43 | #include <linux/security.h> | ||
44 | #include <linux/notifier.h> | ||
45 | #include <linux/profile.h> | ||
46 | #include <linux/freezer.h> | ||
47 | #include <linux/vmalloc.h> | ||
48 | #include <linux/blkdev.h> | ||
49 | #include <linux/delay.h> | ||
50 | #include <linux/pid_namespace.h> | ||
51 | #include <linux/smp.h> | ||
52 | #include <linux/threads.h> | ||
53 | #include <linux/timer.h> | ||
54 | #include <linux/rcupdate.h> | ||
55 | #include <linux/cpu.h> | ||
56 | #include <linux/cpuset.h> | 9 | #include <linux/cpuset.h> |
57 | #include <linux/percpu.h> | ||
58 | #include <linux/proc_fs.h> | ||
59 | #include <linux/seq_file.h> | ||
60 | #include <linux/sysctl.h> | ||
61 | #include <linux/syscalls.h> | ||
62 | #include <linux/times.h> | ||
63 | #include <linux/tsacct_kern.h> | ||
64 | #include <linux/kprobes.h> | ||
65 | #include <linux/delayacct.h> | 10 | #include <linux/delayacct.h> |
66 | #include <linux/unistd.h> | ||
67 | #include <linux/pagemap.h> | ||
68 | #include <linux/hrtimer.h> | ||
69 | #include <linux/tick.h> | ||
70 | #include <linux/ctype.h> | ||
71 | #include <linux/ftrace.h> | ||
72 | #include <linux/slab.h> | ||
73 | #include <linux/init_task.h> | 11 | #include <linux/init_task.h> |
74 | #include <linux/context_tracking.h> | 12 | #include <linux/context_tracking.h> |
75 | #include <linux/compiler.h> | 13 | |
76 | #include <linux/frame.h> | 14 | #include <linux/blkdev.h> |
15 | #include <linux/kprobes.h> | ||
16 | #include <linux/mmu_context.h> | ||
17 | #include <linux/module.h> | ||
18 | #include <linux/nmi.h> | ||
77 | #include <linux/prefetch.h> | 19 | #include <linux/prefetch.h> |
78 | #include <linux/mutex.h> | 20 | #include <linux/profile.h> |
21 | #include <linux/security.h> | ||
22 | #include <linux/syscalls.h> | ||
79 | 23 | ||
80 | #include <asm/switch_to.h> | 24 | #include <asm/switch_to.h> |
81 | #include <asm/tlb.h> | 25 | #include <asm/tlb.h> |
82 | #include <asm/irq_regs.h> | ||
83 | #ifdef CONFIG_PARAVIRT | ||
84 | #include <asm/paravirt.h> | ||
85 | #endif | ||
86 | 26 | ||
87 | #include "sched.h" | 27 | #include "sched.h" |
88 | #include "../workqueue_internal.h" | 28 | #include "../workqueue_internal.h" |
@@ -91,27 +31,8 @@ | |||
91 | #define CREATE_TRACE_POINTS | 31 | #define CREATE_TRACE_POINTS |
92 | #include <trace/events/sched.h> | 32 | #include <trace/events/sched.h> |
93 | 33 | ||
94 | DEFINE_MUTEX(sched_domains_mutex); | ||
95 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 34 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
96 | 35 | ||
97 | static void update_rq_clock_task(struct rq *rq, s64 delta); | ||
98 | |||
99 | void update_rq_clock(struct rq *rq) | ||
100 | { | ||
101 | s64 delta; | ||
102 | |||
103 | lockdep_assert_held(&rq->lock); | ||
104 | |||
105 | if (rq->clock_skip_update & RQCF_ACT_SKIP) | ||
106 | return; | ||
107 | |||
108 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | ||
109 | if (delta < 0) | ||
110 | return; | ||
111 | rq->clock += delta; | ||
112 | update_rq_clock_task(rq, delta); | ||
113 | } | ||
114 | |||
115 | /* | 36 | /* |
116 | * Debugging: various feature bits | 37 | * Debugging: various feature bits |
117 | */ | 38 | */ |
@@ -140,7 +61,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; | |||
140 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | 61 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; |
141 | 62 | ||
142 | /* | 63 | /* |
143 | * period over which we measure -rt task cpu usage in us. | 64 | * period over which we measure -rt task CPU usage in us. |
144 | * default: 1s | 65 | * default: 1s |
145 | */ | 66 | */ |
146 | unsigned int sysctl_sched_rt_period = 1000000; | 67 | unsigned int sysctl_sched_rt_period = 1000000; |
@@ -153,7 +74,7 @@ __read_mostly int scheduler_running; | |||
153 | */ | 74 | */ |
154 | int sysctl_sched_rt_runtime = 950000; | 75 | int sysctl_sched_rt_runtime = 950000; |
155 | 76 | ||
156 | /* cpus with isolated domains */ | 77 | /* CPUs with isolated domains */ |
157 | cpumask_var_t cpu_isolated_map; | 78 | cpumask_var_t cpu_isolated_map; |
158 | 79 | ||
159 | /* | 80 | /* |
@@ -185,7 +106,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
185 | rq = task_rq(p); | 106 | rq = task_rq(p); |
186 | raw_spin_lock(&rq->lock); | 107 | raw_spin_lock(&rq->lock); |
187 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { | 108 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { |
188 | rf->cookie = lockdep_pin_lock(&rq->lock); | 109 | rq_pin_lock(rq, rf); |
189 | return rq; | 110 | return rq; |
190 | } | 111 | } |
191 | raw_spin_unlock(&rq->lock); | 112 | raw_spin_unlock(&rq->lock); |
@@ -221,11 +142,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
221 | * If we observe the old cpu in task_rq_lock, the acquire of | 142 | * If we observe the old cpu in task_rq_lock, the acquire of |
222 | * the old rq->lock will fully serialize against the stores. | 143 | * the old rq->lock will fully serialize against the stores. |
223 | * | 144 | * |
224 | * If we observe the new cpu in task_rq_lock, the acquire will | 145 | * If we observe the new CPU in task_rq_lock, the acquire will |
225 | * pair with the WMB to ensure we must then also see migrating. | 146 | * pair with the WMB to ensure we must then also see migrating. |
226 | */ | 147 | */ |
227 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { | 148 | if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { |
228 | rf->cookie = lockdep_pin_lock(&rq->lock); | 149 | rq_pin_lock(rq, rf); |
229 | return rq; | 150 | return rq; |
230 | } | 151 | } |
231 | raw_spin_unlock(&rq->lock); | 152 | raw_spin_unlock(&rq->lock); |
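Both call sites above switch from open-coded lockdep pin cookies to rq_pin_lock()/rq_unpin_lock()/rq_repin_lock() operating on struct rq_flags. A hedged sketch of the wrapper shape these conversions imply; the real helpers live in kernel/sched/sched.h (where struct rq_flags is already defined) and may carry extra debug bookkeeping beyond the cookie:

  static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
  {
          rf->cookie = lockdep_pin_lock(&rq->lock);
  }

  static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
  {
          lockdep_unpin_lock(&rq->lock, rf->cookie);
  }

  static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
  {
          lockdep_repin_lock(&rq->lock, rf->cookie);
  }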
@@ -236,6 +157,84 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
236 | } | 157 | } |
237 | } | 158 | } |
238 | 159 | ||
160 | /* | ||
161 | * RQ-clock updating methods: | ||
162 | */ | ||
163 | |||
164 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
165 | { | ||
166 | /* | ||
167 | * In theory, the compile should just see 0 here, and optimize out the call | ||
168 | * to sched_rt_avg_update. But I don't trust it... | ||
169 | */ | ||
170 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
171 | s64 steal = 0, irq_delta = 0; | ||
172 | #endif | ||
173 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
174 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | ||
175 | |||
176 | /* | ||
177 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
178 | * this case when a previous update_rq_clock() happened inside a | ||
179 | * {soft,}irq region. | ||
180 | * | ||
181 | * When this happens, we stop ->clock_task and only update the | ||
182 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
183 | * update will consume the rest. This ensures ->clock_task is | ||
184 | * monotonic. | ||
185 | * | ||
186 | * It does however cause some slight miss-attribution of {soft,}irq | ||
187 | * time, a more accurate solution would be to update the irq_time using | ||
188 | * the current rq->clock timestamp, except that would require using | ||
189 | * atomic ops. | ||
190 | */ | ||
191 | if (irq_delta > delta) | ||
192 | irq_delta = delta; | ||
193 | |||
194 | rq->prev_irq_time += irq_delta; | ||
195 | delta -= irq_delta; | ||
196 | #endif | ||
197 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
198 | if (static_key_false((¶virt_steal_rq_enabled))) { | ||
199 | steal = paravirt_steal_clock(cpu_of(rq)); | ||
200 | steal -= rq->prev_steal_time_rq; | ||
201 | |||
202 | if (unlikely(steal > delta)) | ||
203 | steal = delta; | ||
204 | |||
205 | rq->prev_steal_time_rq += steal; | ||
206 | delta -= steal; | ||
207 | } | ||
208 | #endif | ||
209 | |||
210 | rq->clock_task += delta; | ||
211 | |||
212 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
213 | if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) | ||
214 | sched_rt_avg_update(rq, irq_delta + steal); | ||
215 | #endif | ||
216 | } | ||
217 | |||
218 | void update_rq_clock(struct rq *rq) | ||
219 | { | ||
220 | s64 delta; | ||
221 | |||
222 | lockdep_assert_held(&rq->lock); | ||
223 | |||
224 | if (rq->clock_update_flags & RQCF_ACT_SKIP) | ||
225 | return; | ||
226 | |||
227 | #ifdef CONFIG_SCHED_DEBUG | ||
228 | rq->clock_update_flags |= RQCF_UPDATED; | ||
229 | #endif | ||
230 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | ||
231 | if (delta < 0) | ||
232 | return; | ||
233 | rq->clock += delta; | ||
234 | update_rq_clock_task(rq, delta); | ||
235 | } | ||
236 | |||
237 | |||
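The clipping in update_rq_clock_task() above ensures rq->clock_task never moves backwards even when the IRQ-time or steal-time deltas overshoot the rq clock delta. A small arithmetic model of that step (user-space types, illustrative only):

  #include <stdint.h>

  /* Returns how far clock_task advances for a given rq clock delta. */
  static int64_t clock_task_delta(int64_t delta, int64_t irq_delta, int64_t steal)
  {
          if (irq_delta > delta)
                  irq_delta = delta;      /* stop clock_task, catch up later */
          delta -= irq_delta;

          if (steal > delta)
                  steal = delta;
          delta -= steal;

          return delta;                   /* never negative: clock_task stays monotonic */
  }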
239 | #ifdef CONFIG_SCHED_HRTICK | 238 | #ifdef CONFIG_SCHED_HRTICK |
240 | /* | 239 | /* |
241 | * Use HR-timers to deliver accurate preemption points. | 240 | * Use HR-timers to deliver accurate preemption points. |
@@ -458,7 +457,7 @@ void wake_up_q(struct wake_q_head *head) | |||
458 | 457 | ||
459 | task = container_of(node, struct task_struct, wake_q); | 458 | task = container_of(node, struct task_struct, wake_q); |
460 | BUG_ON(!task); | 459 | BUG_ON(!task); |
461 | /* task can safely be re-inserted now */ | 460 | /* Task can safely be re-inserted now: */ |
462 | node = node->next; | 461 | node = node->next; |
463 | task->wake_q.next = NULL; | 462 | task->wake_q.next = NULL; |
464 | 463 | ||
@@ -516,12 +515,12 @@ void resched_cpu(int cpu) | |||
516 | #ifdef CONFIG_SMP | 515 | #ifdef CONFIG_SMP |
517 | #ifdef CONFIG_NO_HZ_COMMON | 516 | #ifdef CONFIG_NO_HZ_COMMON |
518 | /* | 517 | /* |
519 | * In the semi idle case, use the nearest busy cpu for migrating timers | 518 | * In the semi idle case, use the nearest busy CPU for migrating timers |
520 | * from an idle cpu. This is good for power-savings. | 519 | * from an idle CPU. This is good for power-savings. |
521 | * | 520 | * |
522 | * We don't do similar optimization for completely idle system, as | 521 | * We don't do similar optimization for completely idle system, as |
523 | * selecting an idle cpu will add more delays to the timers than intended | 522 | * selecting an idle CPU will add more delays to the timers than intended |
524 | * (as that cpu's timer base may not be uptodate wrt jiffies etc). | 523 | * (as that CPU's timer base may not be uptodate wrt jiffies etc). |
525 | */ | 524 | */ |
526 | int get_nohz_timer_target(void) | 525 | int get_nohz_timer_target(void) |
527 | { | 526 | { |
@@ -550,6 +549,7 @@ unlock: | |||
550 | rcu_read_unlock(); | 549 | rcu_read_unlock(); |
551 | return cpu; | 550 | return cpu; |
552 | } | 551 | } |
552 | |||
553 | /* | 553 | /* |
554 | * When add_timer_on() enqueues a timer into the timer wheel of an | 554 | * When add_timer_on() enqueues a timer into the timer wheel of an |
555 | * idle CPU then this timer might expire before the next timer event | 555 | * idle CPU then this timer might expire before the next timer event |
@@ -784,60 +784,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
784 | dequeue_task(rq, p, flags); | 784 | dequeue_task(rq, p, flags); |
785 | } | 785 | } |
786 | 786 | ||
787 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
788 | { | ||
789 | /* | ||
790 | * In theory, the compile should just see 0 here, and optimize out the call | ||
791 | * to sched_rt_avg_update. But I don't trust it... | ||
792 | */ | ||
793 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
794 | s64 steal = 0, irq_delta = 0; | ||
795 | #endif | ||
796 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
797 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | ||
798 | |||
799 | /* | ||
800 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
801 | * this case when a previous update_rq_clock() happened inside a | ||
802 | * {soft,}irq region. | ||
803 | * | ||
804 | * When this happens, we stop ->clock_task and only update the | ||
805 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
806 | * update will consume the rest. This ensures ->clock_task is | ||
807 | * monotonic. | ||
808 | * | ||
809 | * It does however cause some slight miss-attribution of {soft,}irq | ||
810 | * time, a more accurate solution would be to update the irq_time using | ||
811 | * the current rq->clock timestamp, except that would require using | ||
812 | * atomic ops. | ||
813 | */ | ||
814 | if (irq_delta > delta) | ||
815 | irq_delta = delta; | ||
816 | |||
817 | rq->prev_irq_time += irq_delta; | ||
818 | delta -= irq_delta; | ||
819 | #endif | ||
820 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | ||
821 | if (static_key_false((¶virt_steal_rq_enabled))) { | ||
822 | steal = paravirt_steal_clock(cpu_of(rq)); | ||
823 | steal -= rq->prev_steal_time_rq; | ||
824 | |||
825 | if (unlikely(steal > delta)) | ||
826 | steal = delta; | ||
827 | |||
828 | rq->prev_steal_time_rq += steal; | ||
829 | delta -= steal; | ||
830 | } | ||
831 | #endif | ||
832 | |||
833 | rq->clock_task += delta; | ||
834 | |||
835 | #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) | ||
836 | if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) | ||
837 | sched_rt_avg_update(rq, irq_delta + steal); | ||
838 | #endif | ||
839 | } | ||
840 | |||
841 | void sched_set_stop_task(int cpu, struct task_struct *stop) | 787 | void sched_set_stop_task(int cpu, struct task_struct *stop) |
842 | { | 788 | { |
843 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | 789 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; |
@@ -1018,7 +964,7 @@ struct migration_arg { | |||
1018 | }; | 964 | }; |
1019 | 965 | ||
1020 | /* | 966 | /* |
1021 | * Move (not current) task off this cpu, onto dest cpu. We're doing | 967 | * Move (not current) task off this CPU, onto the destination CPU. We're doing |
1022 | * this because either it can't run here any more (set_cpus_allowed() | 968 | * this because either it can't run here any more (set_cpus_allowed() |
1023 | * away from this CPU, or CPU going down), or because we're | 969 | * away from this CPU, or CPU going down), or because we're |
1024 | * attempting to rebalance this task on exec (sched_exec). | 970 | * attempting to rebalance this task on exec (sched_exec). |
@@ -1052,8 +998,8 @@ static int migration_cpu_stop(void *data) | |||
1052 | struct rq *rq = this_rq(); | 998 | struct rq *rq = this_rq(); |
1053 | 999 | ||
1054 | /* | 1000 | /* |
1055 | * The original target cpu might have gone down and we might | 1001 | * The original target CPU might have gone down and we might |
1056 | * be on another cpu but it doesn't matter. | 1002 | * be on another CPU but it doesn't matter. |
1057 | */ | 1003 | */ |
1058 | local_irq_disable(); | 1004 | local_irq_disable(); |
1059 | /* | 1005 | /* |
@@ -1171,7 +1117,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, | |||
1171 | if (p->flags & PF_KTHREAD) { | 1117 | if (p->flags & PF_KTHREAD) { |
1172 | /* | 1118 | /* |
1173 | * For kernel threads that do indeed end up on online && | 1119 | * For kernel threads that do indeed end up on online && |
1174 | * !active we want to ensure they are strict per-cpu threads. | 1120 | * !active we want to ensure they are strict per-CPU threads. |
1175 | */ | 1121 | */ |
1176 | WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && | 1122 | WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && |
1177 | !cpumask_intersects(new_mask, cpu_active_mask) && | 1123 | !cpumask_intersects(new_mask, cpu_active_mask) && |
@@ -1195,9 +1141,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, | |||
1195 | * OK, since we're going to drop the lock immediately | 1141 | * OK, since we're going to drop the lock immediately |
1196 | * afterwards anyway. | 1142 | * afterwards anyway. |
1197 | */ | 1143 | */ |
1198 | lockdep_unpin_lock(&rq->lock, rf.cookie); | 1144 | rq_unpin_lock(rq, &rf); |
1199 | rq = move_queued_task(rq, p, dest_cpu); | 1145 | rq = move_queued_task(rq, p, dest_cpu); |
1200 | lockdep_repin_lock(&rq->lock, rf.cookie); | 1146 | rq_repin_lock(rq, &rf); |
1201 | } | 1147 | } |
1202 | out: | 1148 | out: |
1203 | task_rq_unlock(rq, p, &rf); | 1149 | task_rq_unlock(rq, p, &rf); |
@@ -1276,7 +1222,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) | |||
1276 | /* | 1222 | /* |
1277 | * Task isn't running anymore; make it appear like we migrated | 1223 | * Task isn't running anymore; make it appear like we migrated |
1278 | * it before it went to sleep. This means on wakeup we make the | 1224 | * it before it went to sleep. This means on wakeup we make the |
1279 | * previous cpu our target instead of where it really is. | 1225 | * previous CPU our target instead of where it really is. |
1280 | */ | 1226 | */ |
1281 | p->wake_cpu = cpu; | 1227 | p->wake_cpu = cpu; |
1282 | } | 1228 | } |
@@ -1508,12 +1454,12 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
1508 | * | 1454 | * |
1509 | * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, | 1455 | * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, |
1510 | * see __set_cpus_allowed_ptr(). At this point the newly online | 1456 | * see __set_cpus_allowed_ptr(). At this point the newly online |
1511 | * cpu isn't yet part of the sched domains, and balancing will not | 1457 | * CPU isn't yet part of the sched domains, and balancing will not |
1512 | * see it. | 1458 | * see it. |
1513 | * | 1459 | * |
1514 | * - on cpu-down we clear cpu_active() to mask the sched domains and | 1460 | * - on CPU-down we clear cpu_active() to mask the sched domains and |
1515 | * avoid the load balancer to place new tasks on the to be removed | 1461 | * avoid the load balancer to place new tasks on the to be removed |
1516 | * cpu. Existing tasks will remain running there and will be taken | 1462 | * CPU. Existing tasks will remain running there and will be taken |
1517 | * off. | 1463 | * off. |
1518 | * | 1464 | * |
1519 | * This means that fallback selection must not select !active CPUs. | 1465 | * This means that fallback selection must not select !active CPUs. |
@@ -1529,9 +1475,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
1529 | int dest_cpu; | 1475 | int dest_cpu; |
1530 | 1476 | ||
1531 | /* | 1477 | /* |
1532 | * If the node that the cpu is on has been offlined, cpu_to_node() | 1478 | * If the node that the CPU is on has been offlined, cpu_to_node() |
1533 | * will return -1. There is no cpu on the node, and we should | 1479 | * will return -1. There is no CPU on the node, and we should |
1534 | * select the cpu on the other node. | 1480 | * select the CPU on the other node. |
1535 | */ | 1481 | */ |
1536 | if (nid != -1) { | 1482 | if (nid != -1) { |
1537 | nodemask = cpumask_of_node(nid); | 1483 | nodemask = cpumask_of_node(nid); |
@@ -1563,7 +1509,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
1563 | state = possible; | 1509 | state = possible; |
1564 | break; | 1510 | break; |
1565 | } | 1511 | } |
1566 | /* fall-through */ | 1512 | /* Fall-through */ |
1567 | case possible: | 1513 | case possible: |
1568 | do_set_cpus_allowed(p, cpu_possible_mask); | 1514 | do_set_cpus_allowed(p, cpu_possible_mask); |
1569 | state = fail; | 1515 | state = fail; |
@@ -1607,7 +1553,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) | |||
1607 | /* | 1553 | /* |
1608 | * In order not to call set_task_cpu() on a blocking task we need | 1554 | * In order not to call set_task_cpu() on a blocking task we need |
1609 | * to rely on ttwu() to place the task on a valid ->cpus_allowed | 1555 | * to rely on ttwu() to place the task on a valid ->cpus_allowed |
1610 | * cpu. | 1556 | * CPU. |
1611 | * | 1557 | * |
1612 | * Since this is common to all placement strategies, this lives here. | 1558 | * Since this is common to all placement strategies, this lives here. |
1613 | * | 1559 | * |
@@ -1681,7 +1627,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl | |||
1681 | activate_task(rq, p, en_flags); | 1627 | activate_task(rq, p, en_flags); |
1682 | p->on_rq = TASK_ON_RQ_QUEUED; | 1628 | p->on_rq = TASK_ON_RQ_QUEUED; |
1683 | 1629 | ||
1684 | /* if a worker is waking up, notify workqueue */ | 1630 | /* If a worker is waking up, notify the workqueue: */ |
1685 | if (p->flags & PF_WQ_WORKER) | 1631 | if (p->flags & PF_WQ_WORKER) |
1686 | wq_worker_waking_up(p, cpu_of(rq)); | 1632 | wq_worker_waking_up(p, cpu_of(rq)); |
1687 | } | 1633 | } |
@@ -1690,7 +1636,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl | |||
1690 | * Mark the task runnable and perform wakeup-preemption. | 1636 | * Mark the task runnable and perform wakeup-preemption. |
1691 | */ | 1637 | */ |
1692 | static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, | 1638 | static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, |
1693 | struct pin_cookie cookie) | 1639 | struct rq_flags *rf) |
1694 | { | 1640 | { |
1695 | check_preempt_curr(rq, p, wake_flags); | 1641 | check_preempt_curr(rq, p, wake_flags); |
1696 | p->state = TASK_RUNNING; | 1642 | p->state = TASK_RUNNING; |
@@ -1702,9 +1648,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, | |||
1702 | * Our task @p is fully woken up and running; so its safe to | 1648 | * Our task @p is fully woken up and running; so its safe to |
1703 | * drop the rq->lock, hereafter rq is only used for statistics. | 1649 | * drop the rq->lock, hereafter rq is only used for statistics. |
1704 | */ | 1650 | */ |
1705 | lockdep_unpin_lock(&rq->lock, cookie); | 1651 | rq_unpin_lock(rq, rf); |
1706 | p->sched_class->task_woken(rq, p); | 1652 | p->sched_class->task_woken(rq, p); |
1707 | lockdep_repin_lock(&rq->lock, cookie); | 1653 | rq_repin_lock(rq, rf); |
1708 | } | 1654 | } |
1709 | 1655 | ||
1710 | if (rq->idle_stamp) { | 1656 | if (rq->idle_stamp) { |
@@ -1723,7 +1669,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, | |||
1723 | 1669 | ||
1724 | static void | 1670 | static void |
1725 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, | 1671 | ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, |
1726 | struct pin_cookie cookie) | 1672 | struct rq_flags *rf) |
1727 | { | 1673 | { |
1728 | int en_flags = ENQUEUE_WAKEUP; | 1674 | int en_flags = ENQUEUE_WAKEUP; |
1729 | 1675 | ||
@@ -1738,7 +1684,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, | |||
1738 | #endif | 1684 | #endif |
1739 | 1685 | ||
1740 | ttwu_activate(rq, p, en_flags); | 1686 | ttwu_activate(rq, p, en_flags); |
1741 | ttwu_do_wakeup(rq, p, wake_flags, cookie); | 1687 | ttwu_do_wakeup(rq, p, wake_flags, rf); |
1742 | } | 1688 | } |
1743 | 1689 | ||
1744 | /* | 1690 | /* |
@@ -1757,7 +1703,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) | |||
1757 | if (task_on_rq_queued(p)) { | 1703 | if (task_on_rq_queued(p)) { |
1758 | /* check_preempt_curr() may use rq clock */ | 1704 | /* check_preempt_curr() may use rq clock */ |
1759 | update_rq_clock(rq); | 1705 | update_rq_clock(rq); |
1760 | ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); | 1706 | ttwu_do_wakeup(rq, p, wake_flags, &rf); |
1761 | ret = 1; | 1707 | ret = 1; |
1762 | } | 1708 | } |
1763 | __task_rq_unlock(rq, &rf); | 1709 | __task_rq_unlock(rq, &rf); |
@@ -1770,15 +1716,15 @@ void sched_ttwu_pending(void) | |||
1770 | { | 1716 | { |
1771 | struct rq *rq = this_rq(); | 1717 | struct rq *rq = this_rq(); |
1772 | struct llist_node *llist = llist_del_all(&rq->wake_list); | 1718 | struct llist_node *llist = llist_del_all(&rq->wake_list); |
1773 | struct pin_cookie cookie; | ||
1774 | struct task_struct *p; | 1719 | struct task_struct *p; |
1775 | unsigned long flags; | 1720 | unsigned long flags; |
1721 | struct rq_flags rf; | ||
1776 | 1722 | ||
1777 | if (!llist) | 1723 | if (!llist) |
1778 | return; | 1724 | return; |
1779 | 1725 | ||
1780 | raw_spin_lock_irqsave(&rq->lock, flags); | 1726 | raw_spin_lock_irqsave(&rq->lock, flags); |
1781 | cookie = lockdep_pin_lock(&rq->lock); | 1727 | rq_pin_lock(rq, &rf); |
1782 | 1728 | ||
1783 | while (llist) { | 1729 | while (llist) { |
1784 | int wake_flags = 0; | 1730 | int wake_flags = 0; |
@@ -1789,10 +1735,10 @@ void sched_ttwu_pending(void) | |||
1789 | if (p->sched_remote_wakeup) | 1735 | if (p->sched_remote_wakeup) |
1790 | wake_flags = WF_MIGRATED; | 1736 | wake_flags = WF_MIGRATED; |
1791 | 1737 | ||
1792 | ttwu_do_activate(rq, p, wake_flags, cookie); | 1738 | ttwu_do_activate(rq, p, wake_flags, &rf); |
1793 | } | 1739 | } |
1794 | 1740 | ||
1795 | lockdep_unpin_lock(&rq->lock, cookie); | 1741 | rq_unpin_lock(rq, &rf); |
1796 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 1742 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
1797 | } | 1743 | } |
1798 | 1744 | ||
@@ -1864,7 +1810,7 @@ void wake_up_if_idle(int cpu) | |||
1864 | raw_spin_lock_irqsave(&rq->lock, flags); | 1810 | raw_spin_lock_irqsave(&rq->lock, flags); |
1865 | if (is_idle_task(rq->curr)) | 1811 | if (is_idle_task(rq->curr)) |
1866 | smp_send_reschedule(cpu); | 1812 | smp_send_reschedule(cpu); |
1867 | /* Else cpu is not in idle, do nothing here */ | 1813 | /* Else CPU is not idle, do nothing here: */ |
1868 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 1814 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
1869 | } | 1815 | } |
1870 | 1816 | ||
@@ -1881,20 +1827,20 @@ bool cpus_share_cache(int this_cpu, int that_cpu) | |||
1881 | static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | 1827 | static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) |
1882 | { | 1828 | { |
1883 | struct rq *rq = cpu_rq(cpu); | 1829 | struct rq *rq = cpu_rq(cpu); |
1884 | struct pin_cookie cookie; | 1830 | struct rq_flags rf; |
1885 | 1831 | ||
1886 | #if defined(CONFIG_SMP) | 1832 | #if defined(CONFIG_SMP) |
1887 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { | 1833 | if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { |
1888 | sched_clock_cpu(cpu); /* sync clocks x-cpu */ | 1834 | sched_clock_cpu(cpu); /* Sync clocks across CPUs */ |
1889 | ttwu_queue_remote(p, cpu, wake_flags); | 1835 | ttwu_queue_remote(p, cpu, wake_flags); |
1890 | return; | 1836 | return; |
1891 | } | 1837 | } |
1892 | #endif | 1838 | #endif |
1893 | 1839 | ||
1894 | raw_spin_lock(&rq->lock); | 1840 | raw_spin_lock(&rq->lock); |
1895 | cookie = lockdep_pin_lock(&rq->lock); | 1841 | rq_pin_lock(rq, &rf); |
1896 | ttwu_do_activate(rq, p, wake_flags, cookie); | 1842 | ttwu_do_activate(rq, p, wake_flags, &rf); |
1897 | lockdep_unpin_lock(&rq->lock, cookie); | 1843 | rq_unpin_lock(rq, &rf); |
1898 | raw_spin_unlock(&rq->lock); | 1844 | raw_spin_unlock(&rq->lock); |
1899 | } | 1845 | } |
1900 | 1846 | ||
@@ -1904,8 +1850,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
1904 | * MIGRATION | 1850 | * MIGRATION |
1905 | * | 1851 | * |
1906 | * The basic program-order guarantee on SMP systems is that when a task [t] | 1852 | * The basic program-order guarantee on SMP systems is that when a task [t] |
1907 | * migrates, all its activity on its old cpu [c0] happens-before any subsequent | 1853 | * migrates, all its activity on its old CPU [c0] happens-before any subsequent |
1908 | * execution on its new cpu [c1]. | 1854 | * execution on its new CPU [c1]. |
1909 | * | 1855 | * |
1910 | * For migration (of runnable tasks) this is provided by the following means: | 1856 | * For migration (of runnable tasks) this is provided by the following means: |
1911 | * | 1857 | * |
@@ -1916,7 +1862,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
1916 | * | 1862 | * |
1917 | * Transitivity guarantees that B happens after A and C after B. | 1863 | * Transitivity guarantees that B happens after A and C after B. |
1918 | * Note: we only require RCpc transitivity. | 1864 | * Note: we only require RCpc transitivity. |
1919 | * Note: the cpu doing B need not be c0 or c1 | 1865 | * Note: the CPU doing B need not be c0 or c1 |
1920 | * | 1866 | * |
1921 | * Example: | 1867 | * Example: |
1922 | * | 1868 | * |
@@ -2024,7 +1970,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2024 | 1970 | ||
2025 | trace_sched_waking(p); | 1971 | trace_sched_waking(p); |
2026 | 1972 | ||
2027 | success = 1; /* we're going to change ->state */ | 1973 | /* We're going to change ->state: */ |
1974 | success = 1; | ||
2028 | cpu = task_cpu(p); | 1975 | cpu = task_cpu(p); |
2029 | 1976 | ||
2030 | /* | 1977 | /* |
@@ -2073,7 +2020,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2073 | smp_rmb(); | 2020 | smp_rmb(); |
2074 | 2021 | ||
2075 | /* | 2022 | /* |
2076 | * If the owning (remote) cpu is still in the middle of schedule() with | 2023 | * If the owning (remote) CPU is still in the middle of schedule() with |
2077 | * this task as prev, wait until its done referencing the task. | 2024 | * this task as prev, wait until its done referencing the task. |
2078 | * | 2025 | * |
2079 | * Pairs with the smp_store_release() in finish_lock_switch(). | 2026 | * Pairs with the smp_store_release() in finish_lock_switch(). |
@@ -2086,11 +2033,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2086 | p->sched_contributes_to_load = !!task_contributes_to_load(p); | 2033 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
2087 | p->state = TASK_WAKING; | 2034 | p->state = TASK_WAKING; |
2088 | 2035 | ||
2036 | if (p->in_iowait) { | ||
2037 | delayacct_blkio_end(); | ||
2038 | atomic_dec(&task_rq(p)->nr_iowait); | ||
2039 | } | ||
2040 | |||
2089 | cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); | 2041 | cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); |
2090 | if (task_cpu(p) != cpu) { | 2042 | if (task_cpu(p) != cpu) { |
2091 | wake_flags |= WF_MIGRATED; | 2043 | wake_flags |= WF_MIGRATED; |
2092 | set_task_cpu(p, cpu); | 2044 | set_task_cpu(p, cpu); |
2093 | } | 2045 | } |
2046 | |||
2047 | #else /* CONFIG_SMP */ | ||
2048 | |||
2049 | if (p->in_iowait) { | ||
2050 | delayacct_blkio_end(); | ||
2051 | atomic_dec(&task_rq(p)->nr_iowait); | ||
2052 | } | ||
2053 | |||
2094 | #endif /* CONFIG_SMP */ | 2054 | #endif /* CONFIG_SMP */ |
2095 | 2055 | ||
2096 | ttwu_queue(p, cpu, wake_flags); | 2056 | ttwu_queue(p, cpu, wake_flags); |
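Ending the delayacct block-IO interval and dropping nr_iowait here means IO-wait stops being accounted as soon as the task becomes runnable, not when it next gets onto a CPU. A minimal model of the pairing (user-space types; the increment side, done where an in_iowait task actually blocks, is not shown in this diff, and clearing in_iowait is simplified here):

  #include <stdatomic.h>

  struct task { int in_iowait; };
  static atomic_int nr_iowait;

  static void block_on_io(struct task *t)
  {
          t->in_iowait = 1;
          atomic_fetch_add(&nr_iowait, 1);
          /* ... sleep until the IO completes ... */
  }

  static void wake(struct task *t)
  {
          if (t->in_iowait) {             /* mirrors the hunk above */
                  atomic_fetch_sub(&nr_iowait, 1);
                  t->in_iowait = 0;       /* simplified; the kernel restores this elsewhere */
          }
          /* ... make the task runnable ... */
  }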
@@ -2111,7 +2071,7 @@ out: | |||
2111 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2071 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2112 | * the current task. | 2072 | * the current task. |
2113 | */ | 2073 | */ |
2114 | static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) | 2074 | static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) |
2115 | { | 2075 | { |
2116 | struct rq *rq = task_rq(p); | 2076 | struct rq *rq = task_rq(p); |
2117 | 2077 | ||
@@ -2128,11 +2088,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie | |||
2128 | * disabled avoiding further scheduler activity on it and we've | 2088 | * disabled avoiding further scheduler activity on it and we've |
2129 | * not yet picked a replacement task. | 2089 | * not yet picked a replacement task. |
2130 | */ | 2090 | */ |
2131 | lockdep_unpin_lock(&rq->lock, cookie); | 2091 | rq_unpin_lock(rq, rf); |
2132 | raw_spin_unlock(&rq->lock); | 2092 | raw_spin_unlock(&rq->lock); |
2133 | raw_spin_lock(&p->pi_lock); | 2093 | raw_spin_lock(&p->pi_lock); |
2134 | raw_spin_lock(&rq->lock); | 2094 | raw_spin_lock(&rq->lock); |
2135 | lockdep_repin_lock(&rq->lock, cookie); | 2095 | rq_repin_lock(rq, rf); |
2136 | } | 2096 | } |
2137 | 2097 | ||
2138 | if (!(p->state & TASK_NORMAL)) | 2098 | if (!(p->state & TASK_NORMAL)) |
@@ -2140,10 +2100,15 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie | |||
2140 | 2100 | ||
2141 | trace_sched_waking(p); | 2101 | trace_sched_waking(p); |
2142 | 2102 | ||
2143 | if (!task_on_rq_queued(p)) | 2103 | if (!task_on_rq_queued(p)) { |
2104 | if (p->in_iowait) { | ||
2105 | delayacct_blkio_end(); | ||
2106 | atomic_dec(&rq->nr_iowait); | ||
2107 | } | ||
2144 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | 2108 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); |
2109 | } | ||
2145 | 2110 | ||
2146 | ttwu_do_wakeup(rq, p, 0, cookie); | 2111 | ttwu_do_wakeup(rq, p, 0, rf); |
2147 | ttwu_stat(p, smp_processor_id(), 0); | 2112 | ttwu_stat(p, smp_processor_id(), 0); |
2148 | out: | 2113 | out: |
2149 | raw_spin_unlock(&p->pi_lock); | 2114 | raw_spin_unlock(&p->pi_lock); |
@@ -2427,7 +2392,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2427 | */ | 2392 | */ |
2428 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2393 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2429 | /* | 2394 | /* |
2430 | * We're setting the cpu for the first time, we don't migrate, | 2395 | * We're setting the CPU for the first time, we don't migrate, |
2431 | * so use __set_task_cpu(). | 2396 | * so use __set_task_cpu(). |
2432 | */ | 2397 | */ |
2433 | __set_task_cpu(p, cpu); | 2398 | __set_task_cpu(p, cpu); |
@@ -2570,7 +2535,7 @@ void wake_up_new_task(struct task_struct *p) | |||
2570 | /* | 2535 | /* |
2571 | * Fork balancing, do it here and not earlier because: | 2536 | * Fork balancing, do it here and not earlier because: |
2572 | * - cpus_allowed can change in the fork path | 2537 | * - cpus_allowed can change in the fork path |
2573 | * - any previously selected cpu might disappear through hotplug | 2538 | * - any previously selected CPU might disappear through hotplug |
2574 | * | 2539 | * |
2575 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, | 2540 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, |
2576 | * as we're not fully set-up yet. | 2541 | * as we're not fully set-up yet. |
@@ -2578,6 +2543,7 @@ void wake_up_new_task(struct task_struct *p) | |||
2578 | __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); | 2543 | __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); |
2579 | #endif | 2544 | #endif |
2580 | rq = __task_rq_lock(p, &rf); | 2545 | rq = __task_rq_lock(p, &rf); |
2546 | update_rq_clock(rq); | ||
2581 | post_init_entity_util_avg(&p->se); | 2547 | post_init_entity_util_avg(&p->se); |
2582 | 2548 | ||
2583 | activate_task(rq, p, 0); | 2549 | activate_task(rq, p, 0); |
@@ -2590,9 +2556,9 @@ void wake_up_new_task(struct task_struct *p) | |||
2590 | * Nothing relies on rq->lock after this, so its fine to | 2556 | * Nothing relies on rq->lock after this, so its fine to |
2591 | * drop it. | 2557 | * drop it. |
2592 | */ | 2558 | */ |
2593 | lockdep_unpin_lock(&rq->lock, rf.cookie); | 2559 | rq_unpin_lock(rq, &rf); |
2594 | p->sched_class->task_woken(rq, p); | 2560 | p->sched_class->task_woken(rq, p); |
2595 | lockdep_repin_lock(&rq->lock, rf.cookie); | 2561 | rq_repin_lock(rq, &rf); |
2596 | } | 2562 | } |
2597 | #endif | 2563 | #endif |
2598 | task_rq_unlock(rq, p, &rf); | 2564 | task_rq_unlock(rq, p, &rf); |
@@ -2861,7 +2827,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) | |||
2861 | */ | 2827 | */ |
2862 | static __always_inline struct rq * | 2828 | static __always_inline struct rq * |
2863 | context_switch(struct rq *rq, struct task_struct *prev, | 2829 | context_switch(struct rq *rq, struct task_struct *prev, |
2864 | struct task_struct *next, struct pin_cookie cookie) | 2830 | struct task_struct *next, struct rq_flags *rf) |
2865 | { | 2831 | { |
2866 | struct mm_struct *mm, *oldmm; | 2832 | struct mm_struct *mm, *oldmm; |
2867 | 2833 | ||
@@ -2887,13 +2853,16 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2887 | prev->active_mm = NULL; | 2853 | prev->active_mm = NULL; |
2888 | rq->prev_mm = oldmm; | 2854 | rq->prev_mm = oldmm; |
2889 | } | 2855 | } |
2856 | |||
2857 | rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); | ||
2858 | |||
2890 | /* | 2859 | /* |
2891 | * Since the runqueue lock will be released by the next | 2860 | * Since the runqueue lock will be released by the next |
2892 | * task (which is an invalid locking op but in the case | 2861 | * task (which is an invalid locking op but in the case |
2893 | * of the scheduler it's an obvious special-case), so we | 2862 | * of the scheduler it's an obvious special-case), so we |
2894 | * do an early lockdep release here: | 2863 | * do an early lockdep release here: |
2895 | */ | 2864 | */ |
2896 | lockdep_unpin_lock(&rq->lock, cookie); | 2865 | rq_unpin_lock(rq, rf); |
2897 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 2866 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
2898 | 2867 | ||
2899 | /* Here we just switch the register state and the stack. */ | 2868 | /* Here we just switch the register state and the stack. */ |
@@ -2920,7 +2889,7 @@ unsigned long nr_running(void) | |||
2920 | } | 2889 | } |
2921 | 2890 | ||
2922 | /* | 2891 | /* |
2923 | * Check if only the current task is running on the cpu. | 2892 | * Check if only the current task is running on the CPU. |
2924 | * | 2893 | * |
2925 | * Caution: this function does not check that the caller has disabled | 2894 | * Caution: this function does not check that the caller has disabled |
2926 | * preemption, thus the result might have a time-of-check-to-time-of-use | 2895 | * preemption, thus the result might have a time-of-check-to-time-of-use |
@@ -2949,6 +2918,36 @@ unsigned long long nr_context_switches(void) | |||
2949 | return sum; | 2918 | return sum; |
2950 | } | 2919 | } |
2951 | 2920 | ||
2921 | /* | ||
2922 | * IO-wait accounting, and how its mostly bollocks (on SMP). | ||
2923 | * | ||
2924 | * The idea behind IO-wait accounting is to account the idle time that we could | ||
2925 | * have spent running if it were not for IO. That is, if we were to improve the | ||
2926 | * storage performance, we'd have a proportional reduction in IO-wait time. | ||
2927 | * | ||
2928 | * This all works nicely on UP, where, when a task blocks on IO, we account | ||
2929 | * idle time as IO-wait, because if the storage were faster, it could've been | ||
2930 | * running and we'd not be idle. | ||
2931 | * | ||
2932 | * This has been extended to SMP, by doing the same for each CPU. This however | ||
2933 | * is broken. | ||
2934 | * | ||
2935 | * Imagine for instance the case where two tasks block on one CPU, only the one | ||
2936 | * CPU will have IO-wait accounted, while the other has regular idle. Even | ||
2937 | * though, if the storage were faster, both could've run at the same time, | ||
2938 | * utilising both CPUs. | ||
2939 | * | ||
2940 | * This means, that when looking globally, the current IO-wait accounting on | ||
2941 | * SMP is a lower bound, by reason of under accounting. | ||
2942 | * | ||
2943 | * Worse, since the numbers are provided per CPU, they are sometimes | ||
2944 | * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly | ||
2945 | * associated with any one particular CPU, it can wake to another CPU than it | ||
2946 | * blocked on. This means the per CPU IO-wait number is meaningless. | ||
2947 | * | ||
2948 | * Task CPU affinities can make all that even more 'interesting'. | ||
2949 | */ | ||
2950 | |||
2952 | unsigned long nr_iowait(void) | 2951 | unsigned long nr_iowait(void) |
2953 | { | 2952 | { |
2954 | unsigned long i, sum = 0; | 2953 | unsigned long i, sum = 0; |
@@ -2959,6 +2958,13 @@ unsigned long nr_iowait(void) | |||
2959 | return sum; | 2958 | return sum; |
2960 | } | 2959 | } |
2961 | 2960 | ||
2961 | /* | ||
2962 | * Consumers of these two interfaces, like for example the cpufreq menu | ||
2963 | * governor, are using nonsensical data: they boost the frequency of a CPU that | ||
2964 | * shows IO-wait even though the blocked task might not end up running on that | ||
2965 | * CPU once it becomes runnable. | ||
2966 | */ | ||
2967 | |||
2962 | unsigned long nr_iowait_cpu(int cpu) | 2968 | unsigned long nr_iowait_cpu(int cpu) |
2963 | { | 2969 | { |
2964 | struct rq *this = cpu_rq(cpu); | 2970 | struct rq *this = cpu_rq(cpu); |
@@ -3042,8 +3048,8 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3042 | * So we have a optimization chance when the task's delta_exec is 0. | 3048 | * So we have a optimization chance when the task's delta_exec is 0. |
3043 | * Reading ->on_cpu is racy, but this is ok. | 3049 | * Reading ->on_cpu is racy, but this is ok. |
3044 | * | 3050 | * |
3045 | * If we race with it leaving cpu, we'll take a lock. So we're correct. | 3051 | * If we race with it leaving CPU, we'll take a lock. So we're correct. |
3046 | * If we race with it entering cpu, unaccounted time is 0. This is | 3052 | * If we race with it entering CPU, unaccounted time is 0. This is |
3047 | * indistinguishable from the read occurring a few cycles earlier. | 3053 | * indistinguishable from the read occurring a few cycles earlier. |
3048 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has | 3054 | * If we see ->on_cpu without ->on_rq, the task is leaving, and has |
3049 | * been accounted, so we're correct here as well. | 3055 | * been accounted, so we're correct here as well. |
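The reasoning in this comment boils down to a racy peek that only skips the lock when the race cannot matter; a self-contained sketch of that pattern (types and names are illustrative, not the kernel's):

  #include <stdbool.h>

  struct task {
          bool on_cpu;                    /* racy hint: currently executing        */
          bool on_rq;                     /* racy hint: queued on a runqueue       */
          unsigned long long sum_exec;    /* accounted runtime, updated under lock */
  };

  unsigned long long read_runtime(struct task *p,
                                  unsigned long long (*locked_read)(struct task *))
  {
          /*
           * Either not currently running (delta_exec is 0) or no longer queued
           * (already fully accounted): losing a race merely looks like reading
           * a few cycles earlier, so the lockless value is as good as the
           * locked one.
           */
          if (!p->on_cpu || !p->on_rq)
                  return p->sum_exec;

          return locked_read(p);          /* slow path: take the lock, add the delta */
  }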
@@ -3257,31 +3263,30 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3257 | * Pick up the highest-prio task: | 3263 | * Pick up the highest-prio task: |
3258 | */ | 3264 | */ |
3259 | static inline struct task_struct * | 3265 | static inline struct task_struct * |
3260 | pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 3266 | pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
3261 | { | 3267 | { |
3262 | const struct sched_class *class = &fair_sched_class; | 3268 | const struct sched_class *class; |
3263 | struct task_struct *p; | 3269 | struct task_struct *p; |
3264 | 3270 | ||
3265 | /* | 3271 | /* |
3266 | * Optimization: we know that if all tasks are in | 3272 | * Optimization: we know that if all tasks are in |
3267 | * the fair class we can call that function directly: | 3273 | * the fair class we can call that function directly: |
3268 | */ | 3274 | */ |
3269 | if (likely(prev->sched_class == class && | 3275 | if (likely(rq->nr_running == rq->cfs.h_nr_running)) { |
3270 | rq->nr_running == rq->cfs.h_nr_running)) { | 3276 | p = fair_sched_class.pick_next_task(rq, prev, rf); |
3271 | p = fair_sched_class.pick_next_task(rq, prev, cookie); | ||
3272 | if (unlikely(p == RETRY_TASK)) | 3277 | if (unlikely(p == RETRY_TASK)) |
3273 | goto again; | 3278 | goto again; |
3274 | 3279 | ||
3275 | /* assumes fair_sched_class->next == idle_sched_class */ | 3280 | /* Assumes fair_sched_class->next == idle_sched_class */ |
3276 | if (unlikely(!p)) | 3281 | if (unlikely(!p)) |
3277 | p = idle_sched_class.pick_next_task(rq, prev, cookie); | 3282 | p = idle_sched_class.pick_next_task(rq, prev, rf); |
3278 | 3283 | ||
3279 | return p; | 3284 | return p; |
3280 | } | 3285 | } |
3281 | 3286 | ||
3282 | again: | 3287 | again: |
3283 | for_each_class(class) { | 3288 | for_each_class(class) { |
3284 | p = class->pick_next_task(rq, prev, cookie); | 3289 | p = class->pick_next_task(rq, prev, rf); |
3285 | if (p) { | 3290 | if (p) { |
3286 | if (unlikely(p == RETRY_TASK)) | 3291 | if (unlikely(p == RETRY_TASK)) |
3287 | goto again; | 3292 | goto again; |
@@ -3289,7 +3294,8 @@ again: | |||
3289 | } | 3294 | } |
3290 | } | 3295 | } |
3291 | 3296 | ||
3292 | BUG(); /* the idle class will always have a runnable task */ | 3297 | /* The idle class should always have a runnable task: */ |
3298 | BUG(); | ||
3293 | } | 3299 | } |
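The fast path above now keys purely off a cheap counter comparison; a self-contained sketch of the idea, with made-up type and function names:

  struct rq_counts { unsigned int nr_running, cfs_h_nr_running; };

  enum pick_path { PICK_FAIR_DIRECT, PICK_WALK_CLASSES };

  /*
   * If every runnable task is in the fair class, the higher-priority classes
   * (deadline, rt) have nothing queued, so the fair picker can be called
   * directly instead of walking the whole scheduling-class list.
   */
  static enum pick_path choose_path(const struct rq_counts *c)
  {
          return c->nr_running == c->cfs_h_nr_running ? PICK_FAIR_DIRECT
                                                      : PICK_WALK_CLASSES;
  }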
3294 | 3300 | ||
3295 | /* | 3301 | /* |
@@ -3335,7 +3341,7 @@ static void __sched notrace __schedule(bool preempt) | |||
3335 | { | 3341 | { |
3336 | struct task_struct *prev, *next; | 3342 | struct task_struct *prev, *next; |
3337 | unsigned long *switch_count; | 3343 | unsigned long *switch_count; |
3338 | struct pin_cookie cookie; | 3344 | struct rq_flags rf; |
3339 | struct rq *rq; | 3345 | struct rq *rq; |
3340 | int cpu; | 3346 | int cpu; |
3341 | 3347 | ||
@@ -3358,9 +3364,10 @@ static void __sched notrace __schedule(bool preempt) | |||
3358 | */ | 3364 | */ |
3359 | smp_mb__before_spinlock(); | 3365 | smp_mb__before_spinlock(); |
3360 | raw_spin_lock(&rq->lock); | 3366 | raw_spin_lock(&rq->lock); |
3361 | cookie = lockdep_pin_lock(&rq->lock); | 3367 | rq_pin_lock(rq, &rf); |
3362 | 3368 | ||
3363 | rq->clock_skip_update <<= 1; /* promote REQ to ACT */ | 3369 | /* Promote REQ to ACT */ |
3370 | rq->clock_update_flags <<= 1; | ||
3364 | 3371 | ||
3365 | switch_count = &prev->nivcsw; | 3372 | switch_count = &prev->nivcsw; |
3366 | if (!preempt && prev->state) { | 3373 | if (!preempt && prev->state) { |
@@ -3370,6 +3377,11 @@ static void __sched notrace __schedule(bool preempt) | |||
3370 | deactivate_task(rq, prev, DEQUEUE_SLEEP); | 3377 | deactivate_task(rq, prev, DEQUEUE_SLEEP); |
3371 | prev->on_rq = 0; | 3378 | prev->on_rq = 0; |
3372 | 3379 | ||
3380 | if (prev->in_iowait) { | ||
3381 | atomic_inc(&rq->nr_iowait); | ||
3382 | delayacct_blkio_start(); | ||
3383 | } | ||
3384 | |||
3373 | /* | 3385 | /* |
3374 | * If a worker went to sleep, notify and ask workqueue | 3386 | * If a worker went to sleep, notify and ask workqueue |
3375 | * whether it wants to wake up a task to maintain | 3387 | * whether it wants to wake up a task to maintain |
@@ -3380,7 +3392,7 @@ static void __sched notrace __schedule(bool preempt) | |||
3380 | 3392 | ||
3381 | to_wakeup = wq_worker_sleeping(prev); | 3393 | to_wakeup = wq_worker_sleeping(prev); |
3382 | if (to_wakeup) | 3394 | if (to_wakeup) |
3383 | try_to_wake_up_local(to_wakeup, cookie); | 3395 | try_to_wake_up_local(to_wakeup, &rf); |
3384 | } | 3396 | } |
3385 | } | 3397 | } |
3386 | switch_count = &prev->nvcsw; | 3398 | switch_count = &prev->nvcsw; |
@@ -3389,10 +3401,9 @@ static void __sched notrace __schedule(bool preempt) | |||
3389 | if (task_on_rq_queued(prev)) | 3401 | if (task_on_rq_queued(prev)) |
3390 | update_rq_clock(rq); | 3402 | update_rq_clock(rq); |
3391 | 3403 | ||
3392 | next = pick_next_task(rq, prev, cookie); | 3404 | next = pick_next_task(rq, prev, &rf); |
3393 | clear_tsk_need_resched(prev); | 3405 | clear_tsk_need_resched(prev); |
3394 | clear_preempt_need_resched(); | 3406 | clear_preempt_need_resched(); |
3395 | rq->clock_skip_update = 0; | ||
3396 | 3407 | ||
3397 | if (likely(prev != next)) { | 3408 | if (likely(prev != next)) { |
3398 | rq->nr_switches++; | 3409 | rq->nr_switches++; |
@@ -3400,9 +3411,12 @@ static void __sched notrace __schedule(bool preempt) | |||
3400 | ++*switch_count; | 3411 | ++*switch_count; |
3401 | 3412 | ||
3402 | trace_sched_switch(preempt, prev, next); | 3413 | trace_sched_switch(preempt, prev, next); |
3403 | rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ | 3414 | |
3415 | /* Also unlocks the rq: */ | ||
3416 | rq = context_switch(rq, prev, next, &rf); | ||
3404 | } else { | 3417 | } else { |
3405 | lockdep_unpin_lock(&rq->lock, cookie); | 3418 | rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); |
3419 | rq_unpin_lock(rq, &rf); | ||
3406 | raw_spin_unlock_irq(&rq->lock); | 3420 | raw_spin_unlock_irq(&rq->lock); |
3407 | } | 3421 | } |
3408 | 3422 | ||
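The "Promote REQ to ACT" line above is a bit-shift trick; a worked example, assuming flag values of RQCF_REQ_SKIP=0x01 and RQCF_ACT_SKIP=0x02 (the names appear in this hunk, the exact values are an assumption):

  #include <assert.h>

  #define RQCF_REQ_SKIP   0x01    /* assumed value */
  #define RQCF_ACT_SKIP   0x02    /* assumed value */

  int main(void)
  {
          unsigned int flags = RQCF_REQ_SKIP;     /* a clock-update skip was requested */

          flags <<= 1;                            /* "Promote REQ to ACT"              */
          assert(flags & RQCF_ACT_SKIP);          /* the request is now in effect...   */
          assert(!(flags & RQCF_REQ_SKIP));       /* ...and no longer pending          */

          flags &= ~(RQCF_ACT_SKIP | RQCF_REQ_SKIP);      /* as in the !switch branch above */
          assert(flags == 0);
          return 0;
  }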
@@ -3426,14 +3440,18 @@ void __noreturn do_task_dead(void) | |||
3426 | smp_mb(); | 3440 | smp_mb(); |
3427 | raw_spin_unlock_wait(¤t->pi_lock); | 3441 | raw_spin_unlock_wait(¤t->pi_lock); |
3428 | 3442 | ||
3429 | /* causes final put_task_struct in finish_task_switch(). */ | 3443 | /* Causes final put_task_struct in finish_task_switch(): */ |
3430 | __set_current_state(TASK_DEAD); | 3444 | __set_current_state(TASK_DEAD); |
3431 | current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ | 3445 | |
3446 | /* Tell freezer to ignore us: */ | ||
3447 | current->flags |= PF_NOFREEZE; | ||
3448 | |||
3432 | __schedule(false); | 3449 | __schedule(false); |
3433 | BUG(); | 3450 | BUG(); |
3434 | /* Avoid "noreturn function does return". */ | 3451 | |
3452 | /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ | ||
3435 | for (;;) | 3453 | for (;;) |
3436 | cpu_relax(); /* For when BUG is null */ | 3454 | cpu_relax(); |
3437 | } | 3455 | } |
3438 | 3456 | ||
3439 | static inline void sched_submit_work(struct task_struct *tsk) | 3457 | static inline void sched_submit_work(struct task_struct *tsk) |
@@ -3651,6 +3669,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3651 | BUG_ON(prio > MAX_PRIO); | 3669 | BUG_ON(prio > MAX_PRIO); |
3652 | 3670 | ||
3653 | rq = __task_rq_lock(p, &rf); | 3671 | rq = __task_rq_lock(p, &rf); |
3672 | update_rq_clock(rq); | ||
3654 | 3673 | ||
3655 | /* | 3674 | /* |
3656 | * Idle task boosting is a nono in general. There is one | 3675 | * Idle task boosting is a nono in general. There is one |
@@ -3725,7 +3744,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
3725 | 3744 | ||
3726 | check_class_changed(rq, p, prev_class, oldprio); | 3745 | check_class_changed(rq, p, prev_class, oldprio); |
3727 | out_unlock: | 3746 | out_unlock: |
3728 | preempt_disable(); /* avoid rq from going away on us */ | 3747 | /* Prevent the rq from going away on us: */ |
3748 | preempt_disable(); | ||
3729 | __task_rq_unlock(rq, &rf); | 3749 | __task_rq_unlock(rq, &rf); |
3730 | 3750 | ||
3731 | balance_callback(rq); | 3751 | balance_callback(rq); |
@@ -3747,6 +3767,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
3747 | * the task might be in the middle of scheduling on another CPU. | 3767 | * the task might be in the middle of scheduling on another CPU. |
3748 | */ | 3768 | */ |
3749 | rq = task_rq_lock(p, &rf); | 3769 | rq = task_rq_lock(p, &rf); |
3770 | update_rq_clock(rq); | ||
3771 | |||
3750 | /* | 3772 | /* |
3751 | * The RT priorities are set via sched_setscheduler(), but we still | 3773 | * The RT priorities are set via sched_setscheduler(), but we still |
3752 | * allow the 'normal' nice value to be set - but as expected | 3774 | * allow the 'normal' nice value to be set - but as expected |
@@ -3793,7 +3815,7 @@ EXPORT_SYMBOL(set_user_nice); | |||
3793 | */ | 3815 | */ |
3794 | int can_nice(const struct task_struct *p, const int nice) | 3816 | int can_nice(const struct task_struct *p, const int nice) |
3795 | { | 3817 | { |
3796 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 3818 | /* Convert nice value [19,-20] to rlimit style value [1,40]: */ |
3797 | int nice_rlim = nice_to_rlimit(nice); | 3819 | int nice_rlim = nice_to_rlimit(nice); |
3798 | 3820 | ||
3799 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || | 3821 | return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || |
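A worked example of the [19,-20] -> [1,40] mapping mentioned in the comment, assuming the usual nice_to_rlimit() definition of MAX_NICE - nice + 1:

  #include <assert.h>

  #define MAX_NICE 19

  static long nice_to_rlimit_sketch(long nice)
  {
          return MAX_NICE - nice + 1;     /* nice 19 -> 1, nice -20 -> 40 */
  }

  int main(void)
  {
          assert(nice_to_rlimit_sketch(19)  == 1);        /* weakest nice, smallest rlimit  */
          assert(nice_to_rlimit_sketch(0)   == 20);
          assert(nice_to_rlimit_sketch(-20) == 40);       /* strongest nice, largest rlimit */
          return 0;
  }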
@@ -3849,7 +3871,7 @@ int task_prio(const struct task_struct *p) | |||
3849 | } | 3871 | } |
3850 | 3872 | ||
3851 | /** | 3873 | /** |
3852 | * idle_cpu - is a given cpu idle currently? | 3874 | * idle_cpu - is a given CPU idle currently? |
3853 | * @cpu: the processor in question. | 3875 | * @cpu: the processor in question. |
3854 | * | 3876 | * |
3855 | * Return: 1 if the CPU is currently idle. 0 otherwise. | 3877 | * Return: 1 if the CPU is currently idle. 0 otherwise. |
@@ -3873,10 +3895,10 @@ int idle_cpu(int cpu) | |||
3873 | } | 3895 | } |
3874 | 3896 | ||
3875 | /** | 3897 | /** |
3876 | * idle_task - return the idle task for a given cpu. | 3898 | * idle_task - return the idle task for a given CPU. |
3877 | * @cpu: the processor in question. | 3899 | * @cpu: the processor in question. |
3878 | * | 3900 | * |
3879 | * Return: The idle task for the cpu @cpu. | 3901 | * Return: The idle task for the CPU @cpu. |
3880 | */ | 3902 | */ |
3881 | struct task_struct *idle_task(int cpu) | 3903 | struct task_struct *idle_task(int cpu) |
3882 | { | 3904 | { |
@@ -4042,7 +4064,7 @@ __checkparam_dl(const struct sched_attr *attr) | |||
4042 | } | 4064 | } |
4043 | 4065 | ||
4044 | /* | 4066 | /* |
4045 | * check the target process has a UID that matches the current process's | 4067 | * Check the target process has a UID that matches the current process's: |
4046 | */ | 4068 | */ |
4047 | static bool check_same_owner(struct task_struct *p) | 4069 | static bool check_same_owner(struct task_struct *p) |
4048 | { | 4070 | { |
@@ -4057,8 +4079,7 @@ static bool check_same_owner(struct task_struct *p) | |||
4057 | return match; | 4079 | return match; |
4058 | } | 4080 | } |
4059 | 4081 | ||
4060 | static bool dl_param_changed(struct task_struct *p, | 4082 | static bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) |
4061 | const struct sched_attr *attr) | ||
4062 | { | 4083 | { |
4063 | struct sched_dl_entity *dl_se = &p->dl; | 4084 | struct sched_dl_entity *dl_se = &p->dl; |
4064 | 4085 | ||
@@ -4085,10 +4106,10 @@ static int __sched_setscheduler(struct task_struct *p, | |||
4085 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; | 4106 | int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; |
4086 | struct rq *rq; | 4107 | struct rq *rq; |
4087 | 4108 | ||
4088 | /* may grab non-irq protected spin_locks */ | 4109 | /* May grab non-irq protected spin_locks: */ |
4089 | BUG_ON(in_interrupt()); | 4110 | BUG_ON(in_interrupt()); |
4090 | recheck: | 4111 | recheck: |
4091 | /* double check policy once rq lock held */ | 4112 | /* Double check policy once rq lock held: */ |
4092 | if (policy < 0) { | 4113 | if (policy < 0) { |
4093 | reset_on_fork = p->sched_reset_on_fork; | 4114 | reset_on_fork = p->sched_reset_on_fork; |
4094 | policy = oldpolicy = p->policy; | 4115 | policy = oldpolicy = p->policy; |
@@ -4128,11 +4149,11 @@ recheck: | |||
4128 | unsigned long rlim_rtprio = | 4149 | unsigned long rlim_rtprio = |
4129 | task_rlimit(p, RLIMIT_RTPRIO); | 4150 | task_rlimit(p, RLIMIT_RTPRIO); |
4130 | 4151 | ||
4131 | /* can't set/change the rt policy */ | 4152 | /* Can't set/change the rt policy: */ |
4132 | if (policy != p->policy && !rlim_rtprio) | 4153 | if (policy != p->policy && !rlim_rtprio) |
4133 | return -EPERM; | 4154 | return -EPERM; |
4134 | 4155 | ||
4135 | /* can't increase priority */ | 4156 | /* Can't increase priority: */ |
4136 | if (attr->sched_priority > p->rt_priority && | 4157 | if (attr->sched_priority > p->rt_priority && |
4137 | attr->sched_priority > rlim_rtprio) | 4158 | attr->sched_priority > rlim_rtprio) |
4138 | return -EPERM; | 4159 | return -EPERM; |
@@ -4156,11 +4177,11 @@ recheck: | |||
4156 | return -EPERM; | 4177 | return -EPERM; |
4157 | } | 4178 | } |
4158 | 4179 | ||
4159 | /* can't change other user's priorities */ | 4180 | /* Can't change other user's priorities: */ |
4160 | if (!check_same_owner(p)) | 4181 | if (!check_same_owner(p)) |
4161 | return -EPERM; | 4182 | return -EPERM; |
4162 | 4183 | ||
4163 | /* Normal users shall not reset the sched_reset_on_fork flag */ | 4184 | /* Normal users shall not reset the sched_reset_on_fork flag: */ |
4164 | if (p->sched_reset_on_fork && !reset_on_fork) | 4185 | if (p->sched_reset_on_fork && !reset_on_fork) |
4165 | return -EPERM; | 4186 | return -EPERM; |
4166 | } | 4187 | } |
@@ -4172,16 +4193,17 @@ recheck: | |||
4172 | } | 4193 | } |
4173 | 4194 | ||
4174 | /* | 4195 | /* |
4175 | * make sure no PI-waiters arrive (or leave) while we are | 4196 | * Make sure no PI-waiters arrive (or leave) while we are |
4176 | * changing the priority of the task: | 4197 | * changing the priority of the task: |
4177 | * | 4198 | * |
4178 | * To be able to change p->policy safely, the appropriate | 4199 | * To be able to change p->policy safely, the appropriate |
4179 | * runqueue lock must be held. | 4200 | * runqueue lock must be held. |
4180 | */ | 4201 | */ |
4181 | rq = task_rq_lock(p, &rf); | 4202 | rq = task_rq_lock(p, &rf); |
4203 | update_rq_clock(rq); | ||
4182 | 4204 | ||
4183 | /* | 4205 | /* |
4184 | * Changing the policy of the stop threads is a very bad idea | 4206 | * Changing the policy of the stop threads is a very bad idea: |
4185 | */ | 4207 | */ |
4186 | if (p == rq->stop) { | 4208 | if (p == rq->stop) { |
4187 | task_rq_unlock(rq, p, &rf); | 4209 | task_rq_unlock(rq, p, &rf); |
@@ -4237,7 +4259,7 @@ change: | |||
4237 | #endif | 4259 | #endif |
4238 | } | 4260 | } |
4239 | 4261 | ||
4240 | /* recheck policy now with rq lock held */ | 4262 | /* Re-check policy now with rq lock held: */ |
4241 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4263 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
4242 | policy = oldpolicy = -1; | 4264 | policy = oldpolicy = -1; |
4243 | task_rq_unlock(rq, p, &rf); | 4265 | task_rq_unlock(rq, p, &rf); |
@@ -4294,15 +4316,15 @@ change: | |||
4294 | set_curr_task(rq, p); | 4316 | set_curr_task(rq, p); |
4295 | 4317 | ||
4296 | check_class_changed(rq, p, prev_class, oldprio); | 4318 | check_class_changed(rq, p, prev_class, oldprio); |
4297 | preempt_disable(); /* avoid rq from going away on us */ | 4319 | |
4320 | /* Prevent the rq from going away on us: */ | ||
4321 | preempt_disable(); | ||
4298 | task_rq_unlock(rq, p, &rf); | 4322 | task_rq_unlock(rq, p, &rf); |
4299 | 4323 | ||
4300 | if (pi) | 4324 | if (pi) |
4301 | rt_mutex_adjust_pi(p); | 4325 | rt_mutex_adjust_pi(p); |
4302 | 4326 | ||
4303 | /* | 4327 | /* Run balance callbacks after we've adjusted the PI chain: */ |
4304 | * Run balance callbacks after we've adjusted the PI chain. | ||
4305 | */ | ||
4306 | balance_callback(rq); | 4328 | balance_callback(rq); |
4307 | preempt_enable(); | 4329 | preempt_enable(); |
4308 | 4330 | ||
@@ -4395,8 +4417,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
4395 | /* | 4417 | /* |
4396 | * Mimics kernel/events/core.c perf_copy_attr(). | 4418 | * Mimics kernel/events/core.c perf_copy_attr(). |
4397 | */ | 4419 | */ |
4398 | static int sched_copy_attr(struct sched_attr __user *uattr, | 4420 | static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) |
4399 | struct sched_attr *attr) | ||
4400 | { | 4421 | { |
4401 | u32 size; | 4422 | u32 size; |
4402 | int ret; | 4423 | int ret; |
@@ -4404,19 +4425,19 @@ static int sched_copy_attr(struct sched_attr __user *uattr, | |||
4404 | if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) | 4425 | if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) |
4405 | return -EFAULT; | 4426 | return -EFAULT; |
4406 | 4427 | ||
4407 | /* | 4428 | /* Zero the full structure, so that a short copy will be nice: */ |
4408 | * zero the full structure, so that a short copy will be nice. | ||
4409 | */ | ||
4410 | memset(attr, 0, sizeof(*attr)); | 4429 | memset(attr, 0, sizeof(*attr)); |
4411 | 4430 | ||
4412 | ret = get_user(size, &uattr->size); | 4431 | ret = get_user(size, &uattr->size); |
4413 | if (ret) | 4432 | if (ret) |
4414 | return ret; | 4433 | return ret; |
4415 | 4434 | ||
4416 | if (size > PAGE_SIZE) /* silly large */ | 4435 | /* Bail out on silly large sizes: */ |
4436 | if (size > PAGE_SIZE) | ||
4417 | goto err_size; | 4437 | goto err_size; |
4418 | 4438 | ||
4419 | if (!size) /* abi compat */ | 4439 | /* ABI compatibility quirk: */ |
4440 | if (!size) | ||
4420 | size = SCHED_ATTR_SIZE_VER0; | 4441 | size = SCHED_ATTR_SIZE_VER0; |
4421 | 4442 | ||
4422 | if (size < SCHED_ATTR_SIZE_VER0) | 4443 | if (size < SCHED_ATTR_SIZE_VER0) |
@@ -4451,7 +4472,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr, | |||
4451 | return -EFAULT; | 4472 | return -EFAULT; |
4452 | 4473 | ||
4453 | /* | 4474 | /* |
4454 | * XXX: do we want to be lenient like existing syscalls; or do we want | 4475 | * XXX: Do we want to be lenient like existing syscalls; or do we want |
4455 | * to be strict and return an error on out-of-bounds values? | 4476 | * to be strict and return an error on out-of-bounds values? |
4456 | */ | 4477 | */ |
4457 | attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); | 4478 | attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); |
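The lenient option the XXX comment describes is plain clamping; a small self-contained check of what that means for out-of-range input (the clamp macro below is a stand-in for the kernel's):

  #include <assert.h>

  #define MIN_NICE -20
  #define MAX_NICE  19
  #define clamp(v, lo, hi) ((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

  int main(void)
  {
          /* Lenient: silently clip instead of returning an error. */
          assert(clamp(100, MIN_NICE, MAX_NICE) == MAX_NICE);
          assert(clamp(-99, MIN_NICE, MAX_NICE) == MIN_NICE);
          assert(clamp(5,   MIN_NICE, MAX_NICE) == 5);
          return 0;
  }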
@@ -4471,10 +4492,8 @@ err_size: | |||
4471 | * | 4492 | * |
4472 | * Return: 0 on success. An error code otherwise. | 4493 | * Return: 0 on success. An error code otherwise. |
4473 | */ | 4494 | */ |
4474 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, | 4495 | SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) |
4475 | struct sched_param __user *, param) | ||
4476 | { | 4496 | { |
4477 | /* negative values for policy are not valid */ | ||
4478 | if (policy < 0) | 4497 | if (policy < 0) |
4479 | return -EINVAL; | 4498 | return -EINVAL; |
4480 | 4499 | ||
@@ -4784,10 +4803,10 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, | |||
4784 | } | 4803 | } |
4785 | 4804 | ||
4786 | /** | 4805 | /** |
4787 | * sys_sched_setaffinity - set the cpu affinity of a process | 4806 | * sys_sched_setaffinity - set the CPU affinity of a process |
4788 | * @pid: pid of the process | 4807 | * @pid: pid of the process |
4789 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4808 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4790 | * @user_mask_ptr: user-space pointer to the new cpu mask | 4809 | * @user_mask_ptr: user-space pointer to the new CPU mask |
4791 | * | 4810 | * |
4792 | * Return: 0 on success. An error code otherwise. | 4811 | * Return: 0 on success. An error code otherwise. |
4793 | */ | 4812 | */ |
@@ -4835,10 +4854,10 @@ out_unlock: | |||
4835 | } | 4854 | } |
4836 | 4855 | ||
4837 | /** | 4856 | /** |
4838 | * sys_sched_getaffinity - get the cpu affinity of a process | 4857 | * sys_sched_getaffinity - get the CPU affinity of a process |
4839 | * @pid: pid of the process | 4858 | * @pid: pid of the process |
4840 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4859 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4841 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 4860 | * @user_mask_ptr: user-space pointer to hold the current CPU mask |
4842 | * | 4861 | * |
4843 | * Return: size of CPU mask copied to user_mask_ptr on success. An | 4862 | * Return: size of CPU mask copied to user_mask_ptr on success. An |
4844 | * error code otherwise. | 4863 | * error code otherwise. |
@@ -4966,7 +4985,7 @@ EXPORT_SYMBOL(__cond_resched_softirq); | |||
4966 | * Typical broken usage is: | 4985 | * Typical broken usage is: |
4967 | * | 4986 | * |
4968 | * while (!event) | 4987 | * while (!event) |
4969 | * yield(); | 4988 | * yield(); |
4970 | * | 4989 | * |
4971 | * where one assumes that yield() will let 'the other' process run that will | 4990 | * where one assumes that yield() will let 'the other' process run that will |
4972 | * make event true. If the current task is a SCHED_FIFO task that will never | 4991 | * make event true. If the current task is a SCHED_FIFO task that will never |
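For contrast with the broken loop above, a hedged sketch of the usual event-wait alternative (the waitqueue and variable names here are illustrative):

  #include <linux/wait.h>

  static DECLARE_WAIT_QUEUE_HEAD(event_wq);
  static int event;

  static void consumer(void)
  {
          wait_event(event_wq, event);    /* sleep until the condition holds   */
  }

  static void producer(void)
  {
          event = 1;                      /* make the condition true...        */
          wake_up(&event_wq);             /* ...then wake any sleeping waiters */
  }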
@@ -5057,31 +5076,48 @@ out_irq: | |||
5057 | } | 5076 | } |
5058 | EXPORT_SYMBOL_GPL(yield_to); | 5077 | EXPORT_SYMBOL_GPL(yield_to); |
5059 | 5078 | ||
5079 | int io_schedule_prepare(void) | ||
5080 | { | ||
5081 | int old_iowait = current->in_iowait; | ||
5082 | |||
5083 | current->in_iowait = 1; | ||
5084 | blk_schedule_flush_plug(current); | ||
5085 | |||
5086 | return old_iowait; | ||
5087 | } | ||
5088 | |||
5089 | void io_schedule_finish(int token) | ||
5090 | { | ||
5091 | current->in_iowait = token; | ||
5092 | } | ||
5093 | |||
5060 | /* | 5094 | /* |
5061 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 5095 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
5062 | * that process accounting knows that this is a task in IO wait state. | 5096 | * that process accounting knows that this is a task in IO wait state. |
5063 | */ | 5097 | */ |
5064 | long __sched io_schedule_timeout(long timeout) | 5098 | long __sched io_schedule_timeout(long timeout) |
5065 | { | 5099 | { |
5066 | int old_iowait = current->in_iowait; | 5100 | int token; |
5067 | struct rq *rq; | ||
5068 | long ret; | 5101 | long ret; |
5069 | 5102 | ||
5070 | current->in_iowait = 1; | 5103 | token = io_schedule_prepare(); |
5071 | blk_schedule_flush_plug(current); | ||
5072 | |||
5073 | delayacct_blkio_start(); | ||
5074 | rq = raw_rq(); | ||
5075 | atomic_inc(&rq->nr_iowait); | ||
5076 | ret = schedule_timeout(timeout); | 5104 | ret = schedule_timeout(timeout); |
5077 | current->in_iowait = old_iowait; | 5105 | io_schedule_finish(token); |
5078 | atomic_dec(&rq->nr_iowait); | ||
5079 | delayacct_blkio_end(); | ||
5080 | 5106 | ||
5081 | return ret; | 5107 | return ret; |
5082 | } | 5108 | } |
5083 | EXPORT_SYMBOL(io_schedule_timeout); | 5109 | EXPORT_SYMBOL(io_schedule_timeout); |
5084 | 5110 | ||
5111 | void io_schedule(void) | ||
5112 | { | ||
5113 | int token; | ||
5114 | |||
5115 | token = io_schedule_prepare(); | ||
5116 | schedule(); | ||
5117 | io_schedule_finish(token); | ||
5118 | } | ||
5119 | EXPORT_SYMBOL(io_schedule); | ||
5120 | |||
5085 | /** | 5121 | /** |
5086 | * sys_sched_get_priority_max - return maximum RT priority. | 5122 | * sys_sched_get_priority_max - return maximum RT priority. |
5087 | * @policy: scheduling class. | 5123 | * @policy: scheduling class. |
@@ -5264,7 +5300,7 @@ void init_idle_bootup_task(struct task_struct *idle) | |||
5264 | /** | 5300 | /** |
5265 | * init_idle - set up an idle thread for a given CPU | 5301 | * init_idle - set up an idle thread for a given CPU |
5266 | * @idle: task in question | 5302 | * @idle: task in question |
5267 | * @cpu: cpu the idle task belongs to | 5303 | * @cpu: CPU the idle task belongs to |
5268 | * | 5304 | * |
5269 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 5305 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
5270 | * flag, to make booting more robust. | 5306 | * flag, to make booting more robust. |
@@ -5295,7 +5331,7 @@ void init_idle(struct task_struct *idle, int cpu) | |||
5295 | #endif | 5331 | #endif |
5296 | /* | 5332 | /* |
5297 | * We're having a chicken and egg problem, even though we are | 5333 | * We're having a chicken and egg problem, even though we are |
5298 | * holding rq->lock, the cpu isn't yet set to this cpu so the | 5334 | * holding rq->lock, the CPU isn't yet set to this CPU so the |
5299 | * lockdep check in task_group() will fail. | 5335 | * lockdep check in task_group() will fail. |
5300 | * | 5336 | * |
5301 | * Similar case to sched_fork(). / Alternatively we could | 5337 | * Similar case to sched_fork(). / Alternatively we could |
@@ -5360,7 +5396,7 @@ int task_can_attach(struct task_struct *p, | |||
5360 | 5396 | ||
5361 | /* | 5397 | /* |
5362 | * Kthreads which disallow setaffinity shouldn't be moved | 5398 | * Kthreads which disallow setaffinity shouldn't be moved |
5363 | * to a new cpuset; we don't want to change their cpu | 5399 | * to a new cpuset; we don't want to change their CPU |
5364 | * affinity and isolating such threads by their set of | 5400 | * affinity and isolating such threads by their set of |
5365 | * allowed nodes is unnecessary. Thus, cpusets are not | 5401 | * allowed nodes is unnecessary. Thus, cpusets are not |
5366 | * applicable for such threads. This prevents checking for | 5402 | * applicable for such threads. This prevents checking for |
@@ -5409,7 +5445,7 @@ out: | |||
5409 | 5445 | ||
5410 | #ifdef CONFIG_SMP | 5446 | #ifdef CONFIG_SMP |
5411 | 5447 | ||
5412 | static bool sched_smp_initialized __read_mostly; | 5448 | bool sched_smp_initialized __read_mostly; |
5413 | 5449 | ||
5414 | #ifdef CONFIG_NUMA_BALANCING | 5450 | #ifdef CONFIG_NUMA_BALANCING |
5415 | /* Migrate current task p to target_cpu */ | 5451 | /* Migrate current task p to target_cpu */ |
@@ -5461,7 +5497,7 @@ void sched_setnuma(struct task_struct *p, int nid) | |||
5461 | 5497 | ||
5462 | #ifdef CONFIG_HOTPLUG_CPU | 5498 | #ifdef CONFIG_HOTPLUG_CPU |
5463 | /* | 5499 | /* |
5464 | * Ensures that the idle task is using init_mm right before its cpu goes | 5500 | * Ensure that the idle task is using init_mm right before its CPU goes |
5465 | * offline. | 5501 | * offline. |
5466 | */ | 5502 | */ |
5467 | void idle_task_exit(void) | 5503 | void idle_task_exit(void) |
@@ -5521,7 +5557,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5521 | { | 5557 | { |
5522 | struct rq *rq = dead_rq; | 5558 | struct rq *rq = dead_rq; |
5523 | struct task_struct *next, *stop = rq->stop; | 5559 | struct task_struct *next, *stop = rq->stop; |
5524 | struct pin_cookie cookie; | 5560 | struct rq_flags rf, old_rf; |
5525 | int dest_cpu; | 5561 | int dest_cpu; |
5526 | 5562 | ||
5527 | /* | 5563 | /* |
@@ -5545,16 +5581,16 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5545 | for (;;) { | 5581 | for (;;) { |
5546 | /* | 5582 | /* |
5547 | * There's this thread running, bail when that's the only | 5583 | * There's this thread running, bail when that's the only |
5548 | * remaining thread. | 5584 | * remaining thread: |
5549 | */ | 5585 | */ |
5550 | if (rq->nr_running == 1) | 5586 | if (rq->nr_running == 1) |
5551 | break; | 5587 | break; |
5552 | 5588 | ||
5553 | /* | 5589 | /* |
5554 | * pick_next_task assumes pinned rq->lock. | 5590 | * pick_next_task() assumes pinned rq->lock: |
5555 | */ | 5591 | */ |
5556 | cookie = lockdep_pin_lock(&rq->lock); | 5592 | rq_pin_lock(rq, &rf); |
5557 | next = pick_next_task(rq, &fake_task, cookie); | 5593 | next = pick_next_task(rq, &fake_task, &rf); |
5558 | BUG_ON(!next); | 5594 | BUG_ON(!next); |
5559 | next->sched_class->put_prev_task(rq, next); | 5595 | next->sched_class->put_prev_task(rq, next); |
5560 | 5596 | ||
@@ -5567,7 +5603,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5567 | * because !cpu_active at this point, which means load-balance | 5603 | * because !cpu_active at this point, which means load-balance |
5568 | * will not interfere. Also, stop-machine. | 5604 | * will not interfere. Also, stop-machine. |
5569 | */ | 5605 | */ |
5570 | lockdep_unpin_lock(&rq->lock, cookie); | 5606 | rq_unpin_lock(rq, &rf); |
5571 | raw_spin_unlock(&rq->lock); | 5607 | raw_spin_unlock(&rq->lock); |
5572 | raw_spin_lock(&next->pi_lock); | 5608 | raw_spin_lock(&next->pi_lock); |
5573 | raw_spin_lock(&rq->lock); | 5609 | raw_spin_lock(&rq->lock); |
@@ -5582,6 +5618,13 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5582 | continue; | 5618 | continue; |
5583 | } | 5619 | } |
5584 | 5620 | ||
5621 | /* | ||
5622 | * __migrate_task() may return with a different | ||
5623 | * rq->lock held and a new cookie in 'rf', but we need | ||
5624 | * to preserve rf::clock_update_flags for 'dead_rq'. | ||
5625 | */ | ||
5626 | old_rf = rf; | ||
5627 | |||
5585 | /* Find suitable destination for @next, with force if needed. */ | 5628 | /* Find suitable destination for @next, with force if needed. */ |
5586 | dest_cpu = select_fallback_rq(dead_rq->cpu, next); | 5629 | dest_cpu = select_fallback_rq(dead_rq->cpu, next); |
5587 | 5630 | ||
@@ -5590,6 +5633,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5590 | raw_spin_unlock(&rq->lock); | 5633 | raw_spin_unlock(&rq->lock); |
5591 | rq = dead_rq; | 5634 | rq = dead_rq; |
5592 | raw_spin_lock(&rq->lock); | 5635 | raw_spin_lock(&rq->lock); |
5636 | rf = old_rf; | ||
5593 | } | 5637 | } |
5594 | raw_spin_unlock(&next->pi_lock); | 5638 | raw_spin_unlock(&next->pi_lock); |
5595 | } | 5639 | } |
@@ -5598,7 +5642,7 @@ static void migrate_tasks(struct rq *dead_rq) | |||
5598 | } | 5642 | } |
5599 | #endif /* CONFIG_HOTPLUG_CPU */ | 5643 | #endif /* CONFIG_HOTPLUG_CPU */ |
5600 | 5644 | ||
5601 | static void set_rq_online(struct rq *rq) | 5645 | void set_rq_online(struct rq *rq) |
5602 | { | 5646 | { |
5603 | if (!rq->online) { | 5647 | if (!rq->online) { |
5604 | const struct sched_class *class; | 5648 | const struct sched_class *class; |
@@ -5613,7 +5657,7 @@ static void set_rq_online(struct rq *rq) | |||
5613 | } | 5657 | } |
5614 | } | 5658 | } |
5615 | 5659 | ||
5616 | static void set_rq_offline(struct rq *rq) | 5660 | void set_rq_offline(struct rq *rq) |
5617 | { | 5661 | { |
5618 | if (rq->online) { | 5662 | if (rq->online) { |
5619 | const struct sched_class *class; | 5663 | const struct sched_class *class; |
@@ -5635,1647 +5679,10 @@ static void set_cpu_rq_start_time(unsigned int cpu) | |||
5635 | rq->age_stamp = sched_clock_cpu(cpu); | 5679 | rq->age_stamp = sched_clock_cpu(cpu); |
5636 | } | 5680 | } |
5637 | 5681 | ||
5638 | static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | ||
5639 | |||
5640 | #ifdef CONFIG_SCHED_DEBUG | ||
5641 | |||
5642 | static __read_mostly int sched_debug_enabled; | ||
5643 | |||
5644 | static int __init sched_debug_setup(char *str) | ||
5645 | { | ||
5646 | sched_debug_enabled = 1; | ||
5647 | |||
5648 | return 0; | ||
5649 | } | ||
5650 | early_param("sched_debug", sched_debug_setup); | ||
5651 | |||
5652 | static inline bool sched_debug(void) | ||
5653 | { | ||
5654 | return sched_debug_enabled; | ||
5655 | } | ||
5656 | |||
5657 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | ||
5658 | struct cpumask *groupmask) | ||
5659 | { | ||
5660 | struct sched_group *group = sd->groups; | ||
5661 | |||
5662 | cpumask_clear(groupmask); | ||
5663 | |||
5664 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | ||
5665 | |||
5666 | if (!(sd->flags & SD_LOAD_BALANCE)) { | ||
5667 | printk("does not load-balance\n"); | ||
5668 | if (sd->parent) | ||
5669 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | ||
5670 | " has parent"); | ||
5671 | return -1; | ||
5672 | } | ||
5673 | |||
5674 | printk(KERN_CONT "span %*pbl level %s\n", | ||
5675 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | ||
5676 | |||
5677 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
5678 | printk(KERN_ERR "ERROR: domain->span does not contain " | ||
5679 | "CPU%d\n", cpu); | ||
5680 | } | ||
5681 | if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { | ||
5682 | printk(KERN_ERR "ERROR: domain->groups does not contain" | ||
5683 | " CPU%d\n", cpu); | ||
5684 | } | ||
5685 | |||
5686 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | ||
5687 | do { | ||
5688 | if (!group) { | ||
5689 | printk("\n"); | ||
5690 | printk(KERN_ERR "ERROR: group is NULL\n"); | ||
5691 | break; | ||
5692 | } | ||
5693 | |||
5694 | if (!cpumask_weight(sched_group_cpus(group))) { | ||
5695 | printk(KERN_CONT "\n"); | ||
5696 | printk(KERN_ERR "ERROR: empty group\n"); | ||
5697 | break; | ||
5698 | } | ||
5699 | |||
5700 | if (!(sd->flags & SD_OVERLAP) && | ||
5701 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
5702 | printk(KERN_CONT "\n"); | ||
5703 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | ||
5704 | break; | ||
5705 | } | ||
5706 | |||
5707 | cpumask_or(groupmask, groupmask, sched_group_cpus(group)); | ||
5708 | |||
5709 | printk(KERN_CONT " %*pbl", | ||
5710 | cpumask_pr_args(sched_group_cpus(group))); | ||
5711 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { | ||
5712 | printk(KERN_CONT " (cpu_capacity = %lu)", | ||
5713 | group->sgc->capacity); | ||
5714 | } | ||
5715 | |||
5716 | group = group->next; | ||
5717 | } while (group != sd->groups); | ||
5718 | printk(KERN_CONT "\n"); | ||
5719 | |||
5720 | if (!cpumask_equal(sched_domain_span(sd), groupmask)) | ||
5721 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | ||
5722 | |||
5723 | if (sd->parent && | ||
5724 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) | ||
5725 | printk(KERN_ERR "ERROR: parent span is not a superset " | ||
5726 | "of domain->span\n"); | ||
5727 | return 0; | ||
5728 | } | ||
5729 | |||
5730 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | ||
5731 | { | ||
5732 | int level = 0; | ||
5733 | |||
5734 | if (!sched_debug_enabled) | ||
5735 | return; | ||
5736 | |||
5737 | if (!sd) { | ||
5738 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | ||
5739 | return; | ||
5740 | } | ||
5741 | |||
5742 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | ||
5743 | |||
5744 | for (;;) { | ||
5745 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) | ||
5746 | break; | ||
5747 | level++; | ||
5748 | sd = sd->parent; | ||
5749 | if (!sd) | ||
5750 | break; | ||
5751 | } | ||
5752 | } | ||
5753 | #else /* !CONFIG_SCHED_DEBUG */ | ||
5754 | |||
5755 | # define sched_debug_enabled 0 | ||
5756 | # define sched_domain_debug(sd, cpu) do { } while (0) | ||
5757 | static inline bool sched_debug(void) | ||
5758 | { | ||
5759 | return false; | ||
5760 | } | ||
5761 | #endif /* CONFIG_SCHED_DEBUG */ | ||
5762 | |||
5763 | static int sd_degenerate(struct sched_domain *sd) | ||
5764 | { | ||
5765 | if (cpumask_weight(sched_domain_span(sd)) == 1) | ||
5766 | return 1; | ||
5767 | |||
5768 | /* Following flags need at least 2 groups */ | ||
5769 | if (sd->flags & (SD_LOAD_BALANCE | | ||
5770 | SD_BALANCE_NEWIDLE | | ||
5771 | SD_BALANCE_FORK | | ||
5772 | SD_BALANCE_EXEC | | ||
5773 | SD_SHARE_CPUCAPACITY | | ||
5774 | SD_ASYM_CPUCAPACITY | | ||
5775 | SD_SHARE_PKG_RESOURCES | | ||
5776 | SD_SHARE_POWERDOMAIN)) { | ||
5777 | if (sd->groups != sd->groups->next) | ||
5778 | return 0; | ||
5779 | } | ||
5780 | |||
5781 | /* Following flags don't use groups */ | ||
5782 | if (sd->flags & (SD_WAKE_AFFINE)) | ||
5783 | return 0; | ||
5784 | |||
5785 | return 1; | ||
5786 | } | ||
5787 | |||
5788 | static int | ||
5789 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | ||
5790 | { | ||
5791 | unsigned long cflags = sd->flags, pflags = parent->flags; | ||
5792 | |||
5793 | if (sd_degenerate(parent)) | ||
5794 | return 1; | ||
5795 | |||
5796 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | ||
5797 | return 0; | ||
5798 | |||
5799 | /* Flags needing groups don't count if only 1 group in parent */ | ||
5800 | if (parent->groups == parent->groups->next) { | ||
5801 | pflags &= ~(SD_LOAD_BALANCE | | ||
5802 | SD_BALANCE_NEWIDLE | | ||
5803 | SD_BALANCE_FORK | | ||
5804 | SD_BALANCE_EXEC | | ||
5805 | SD_ASYM_CPUCAPACITY | | ||
5806 | SD_SHARE_CPUCAPACITY | | ||
5807 | SD_SHARE_PKG_RESOURCES | | ||
5808 | SD_PREFER_SIBLING | | ||
5809 | SD_SHARE_POWERDOMAIN); | ||
5810 | if (nr_node_ids == 1) | ||
5811 | pflags &= ~SD_SERIALIZE; | ||
5812 | } | ||
5813 | if (~cflags & pflags) | ||
5814 | return 0; | ||
5815 | |||
5816 | return 1; | ||
5817 | } | ||
5818 | |||
5819 | static void free_rootdomain(struct rcu_head *rcu) | ||
5820 | { | ||
5821 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); | ||
5822 | |||
5823 | cpupri_cleanup(&rd->cpupri); | ||
5824 | cpudl_cleanup(&rd->cpudl); | ||
5825 | free_cpumask_var(rd->dlo_mask); | ||
5826 | free_cpumask_var(rd->rto_mask); | ||
5827 | free_cpumask_var(rd->online); | ||
5828 | free_cpumask_var(rd->span); | ||
5829 | kfree(rd); | ||
5830 | } | ||
5831 | |||
5832 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | ||
5833 | { | ||
5834 | struct root_domain *old_rd = NULL; | ||
5835 | unsigned long flags; | ||
5836 | |||
5837 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5838 | |||
5839 | if (rq->rd) { | ||
5840 | old_rd = rq->rd; | ||
5841 | |||
5842 | if (cpumask_test_cpu(rq->cpu, old_rd->online)) | ||
5843 | set_rq_offline(rq); | ||
5844 | |||
5845 | cpumask_clear_cpu(rq->cpu, old_rd->span); | ||
5846 | |||
5847 | /* | ||
5848 | * If we don't want to free the old_rd yet then | ||
5849 | * set old_rd to NULL to skip the freeing later | ||
5850 | * in this function: | ||
5851 | */ | ||
5852 | if (!atomic_dec_and_test(&old_rd->refcount)) | ||
5853 | old_rd = NULL; | ||
5854 | } | ||
5855 | |||
5856 | atomic_inc(&rd->refcount); | ||
5857 | rq->rd = rd; | ||
5858 | |||
5859 | cpumask_set_cpu(rq->cpu, rd->span); | ||
5860 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) | ||
5861 | set_rq_online(rq); | ||
5862 | |||
5863 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5864 | |||
5865 | if (old_rd) | ||
5866 | call_rcu_sched(&old_rd->rcu, free_rootdomain); | ||
5867 | } | ||
5868 | |||
5869 | static int init_rootdomain(struct root_domain *rd) | ||
5870 | { | ||
5871 | memset(rd, 0, sizeof(*rd)); | ||
5872 | |||
5873 | if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) | ||
5874 | goto out; | ||
5875 | if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) | ||
5876 | goto free_span; | ||
5877 | if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) | ||
5878 | goto free_online; | ||
5879 | if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | ||
5880 | goto free_dlo_mask; | ||
5881 | |||
5882 | init_dl_bw(&rd->dl_bw); | ||
5883 | if (cpudl_init(&rd->cpudl) != 0) | ||
5884 | goto free_dlo_mask; | ||
5885 | |||
5886 | if (cpupri_init(&rd->cpupri) != 0) | ||
5887 | goto free_rto_mask; | ||
5888 | return 0; | ||
5889 | |||
5890 | free_rto_mask: | ||
5891 | free_cpumask_var(rd->rto_mask); | ||
5892 | free_dlo_mask: | ||
5893 | free_cpumask_var(rd->dlo_mask); | ||
5894 | free_online: | ||
5895 | free_cpumask_var(rd->online); | ||
5896 | free_span: | ||
5897 | free_cpumask_var(rd->span); | ||
5898 | out: | ||
5899 | return -ENOMEM; | ||
5900 | } | ||
5901 | |||
5902 | /* | ||
5903 | * By default the system creates a single root-domain with all cpus as | ||
5904 | * members (mimicking the global state we have today). | ||
5905 | */ | ||
5906 | struct root_domain def_root_domain; | ||
5907 | |||
5908 | static void init_defrootdomain(void) | ||
5909 | { | ||
5910 | init_rootdomain(&def_root_domain); | ||
5911 | |||
5912 | atomic_set(&def_root_domain.refcount, 1); | ||
5913 | } | ||
5914 | |||
5915 | static struct root_domain *alloc_rootdomain(void) | ||
5916 | { | ||
5917 | struct root_domain *rd; | ||
5918 | |||
5919 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
5920 | if (!rd) | ||
5921 | return NULL; | ||
5922 | |||
5923 | if (init_rootdomain(rd) != 0) { | ||
5924 | kfree(rd); | ||
5925 | return NULL; | ||
5926 | } | ||
5927 | |||
5928 | return rd; | ||
5929 | } | ||
5930 | |||
5931 | static void free_sched_groups(struct sched_group *sg, int free_sgc) | ||
5932 | { | ||
5933 | struct sched_group *tmp, *first; | ||
5934 | |||
5935 | if (!sg) | ||
5936 | return; | ||
5937 | |||
5938 | first = sg; | ||
5939 | do { | ||
5940 | tmp = sg->next; | ||
5941 | |||
5942 | if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) | ||
5943 | kfree(sg->sgc); | ||
5944 | |||
5945 | kfree(sg); | ||
5946 | sg = tmp; | ||
5947 | } while (sg != first); | ||
5948 | } | ||
5949 | |||
5950 | static void destroy_sched_domain(struct sched_domain *sd) | ||
5951 | { | ||
5952 | /* | ||
5953 | * If it's an overlapping domain it has private groups; iterate and | ||
5954 | * nuke them all. | ||
5955 | */ | ||
5956 | if (sd->flags & SD_OVERLAP) { | ||
5957 | free_sched_groups(sd->groups, 1); | ||
5958 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | ||
5959 | kfree(sd->groups->sgc); | ||
5960 | kfree(sd->groups); | ||
5961 | } | ||
5962 | if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) | ||
5963 | kfree(sd->shared); | ||
5964 | kfree(sd); | ||
5965 | } | ||
5966 | |||
5967 | static void destroy_sched_domains_rcu(struct rcu_head *rcu) | ||
5968 | { | ||
5969 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
5970 | |||
5971 | while (sd) { | ||
5972 | struct sched_domain *parent = sd->parent; | ||
5973 | destroy_sched_domain(sd); | ||
5974 | sd = parent; | ||
5975 | } | ||
5976 | } | ||
5977 | |||
5978 | static void destroy_sched_domains(struct sched_domain *sd) | ||
5979 | { | ||
5980 | if (sd) | ||
5981 | call_rcu(&sd->rcu, destroy_sched_domains_rcu); | ||
5982 | } | ||
5983 | |||
5984 | /* | ||
5985 | * Keep a special pointer to the highest sched_domain that has | ||
5986 | * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) since this | ||
5987 | * allows us to avoid some pointer chasing in select_idle_sibling(). | ||
5988 | * | ||
5989 | * Also keep a unique ID per domain (we use the first cpu number in | ||
5990 | * the cpumask of the domain); this allows us to quickly tell if | ||
5991 | * two cpus are in the same cache domain, see cpus_share_cache(). | ||
5992 | */ | ||
5993 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | ||
5994 | DEFINE_PER_CPU(int, sd_llc_size); | ||
5995 | DEFINE_PER_CPU(int, sd_llc_id); | ||
5996 | DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
5997 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | ||
5998 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | ||
5999 | |||
6000 | static void update_top_cache_domain(int cpu) | ||
6001 | { | ||
6002 | struct sched_domain_shared *sds = NULL; | ||
6003 | struct sched_domain *sd; | ||
6004 | int id = cpu; | ||
6005 | int size = 1; | ||
6006 | |||
6007 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | ||
6008 | if (sd) { | ||
6009 | id = cpumask_first(sched_domain_span(sd)); | ||
6010 | size = cpumask_weight(sched_domain_span(sd)); | ||
6011 | sds = sd->shared; | ||
6012 | } | ||
6013 | |||
6014 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | ||
6015 | per_cpu(sd_llc_size, cpu) = size; | ||
6016 | per_cpu(sd_llc_id, cpu) = id; | ||
6017 | rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); | ||
6018 | |||
6019 | sd = lowest_flag_domain(cpu, SD_NUMA); | ||
6020 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | ||
6021 | |||
6022 | sd = highest_flag_domain(cpu, SD_ASYM_PACKING); | ||
6023 | rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); | ||
6024 | } | ||
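The per-CPU sd_llc_id set up above is what makes the "same cache domain" test cheap; a short sketch of how cpus_share_cache() can use it (simplified, ignoring RCU):

  #include <linux/percpu.h>

  /* Uses the per-CPU sd_llc_id declared above; two CPUs share a cache
   * domain iff they report the same LLC id. */
  static bool cpus_share_cache_sketch(int this_cpu, int that_cpu)
  {
          return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
  }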
6025 | |||
6026 | /* | ||
6027 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | ||
6028 | * hold the hotplug lock. | ||
6029 | */ | ||
6030 | static void | ||
6031 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||
6032 | { | ||
6033 | struct rq *rq = cpu_rq(cpu); | ||
6034 | struct sched_domain *tmp; | ||
6035 | |||
6036 | /* Remove the sched domains which do not contribute to scheduling. */ | ||
6037 | for (tmp = sd; tmp; ) { | ||
6038 | struct sched_domain *parent = tmp->parent; | ||
6039 | if (!parent) | ||
6040 | break; | ||
6041 | |||
6042 | if (sd_parent_degenerate(tmp, parent)) { | ||
6043 | tmp->parent = parent->parent; | ||
6044 | if (parent->parent) | ||
6045 | parent->parent->child = tmp; | ||
6046 | /* | ||
6047 | * Transfer SD_PREFER_SIBLING down in case of a | ||
6048 | * degenerate parent; the spans match for this | ||
6049 | * so the property transfers. | ||
6050 | */ | ||
6051 | if (parent->flags & SD_PREFER_SIBLING) | ||
6052 | tmp->flags |= SD_PREFER_SIBLING; | ||
6053 | destroy_sched_domain(parent); | ||
6054 | } else | ||
6055 | tmp = tmp->parent; | ||
6056 | } | ||
6057 | |||
6058 | if (sd && sd_degenerate(sd)) { | ||
6059 | tmp = sd; | ||
6060 | sd = sd->parent; | ||
6061 | destroy_sched_domain(tmp); | ||
6062 | if (sd) | ||
6063 | sd->child = NULL; | ||
6064 | } | ||
6065 | |||
6066 | sched_domain_debug(sd, cpu); | ||
6067 | |||
6068 | rq_attach_root(rq, rd); | ||
6069 | tmp = rq->sd; | ||
6070 | rcu_assign_pointer(rq->sd, sd); | ||
6071 | destroy_sched_domains(tmp); | ||
6072 | |||
6073 | update_top_cache_domain(cpu); | ||
6074 | } | ||
6075 | |||
6076 | /* Setup the mask of cpus configured for isolated domains */ | ||
6077 | static int __init isolated_cpu_setup(char *str) | ||
6078 | { | ||
6079 | int ret; | ||
6080 | |||
6081 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | ||
6082 | ret = cpulist_parse(str, cpu_isolated_map); | ||
6083 | if (ret) { | ||
6084 | pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); | ||
6085 | return 0; | ||
6086 | } | ||
6087 | return 1; | ||
6088 | } | ||
6089 | __setup("isolcpus=", isolated_cpu_setup); | ||
6090 | |||
6091 | struct s_data { | ||
6092 | struct sched_domain ** __percpu sd; | ||
6093 | struct root_domain *rd; | ||
6094 | }; | ||
6095 | |||
6096 | enum s_alloc { | ||
6097 | sa_rootdomain, | ||
6098 | sa_sd, | ||
6099 | sa_sd_storage, | ||
6100 | sa_none, | ||
6101 | }; | ||
6102 | |||
6103 | /* | ||
6104 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
6105 | * domain traversal. | ||
6106 | * | ||
6107 | * Asymmetric node setups can result in situations where the domain tree is of | ||
6108 | * unequal depth, make sure to skip domains that already cover the entire | ||
6109 | * range. | ||
6110 | * | ||
6111 | * In that case build_sched_domains() will have terminated the iteration early | ||
6112 | * and our sibling sd spans will be empty. Domains should always include the | ||
6113 | * cpu they're built on, so check that. | ||
6114 | * | ||
6115 | */ | ||
6116 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
6117 | { | ||
6118 | const struct cpumask *span = sched_domain_span(sd); | ||
6119 | struct sd_data *sdd = sd->private; | ||
6120 | struct sched_domain *sibling; | ||
6121 | int i; | ||
6122 | |||
6123 | for_each_cpu(i, span) { | ||
6124 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
6125 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
6126 | continue; | ||
6127 | |||
6128 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
6129 | } | ||
6130 | } | ||
6131 | |||
6132 | /* | ||
6133 | * Return the canonical balance cpu for this group, this is the first cpu | ||
6134 | * of this group that's also in the iteration mask. | ||
6135 | */ | ||
6136 | int group_balance_cpu(struct sched_group *sg) | ||
6137 | { | ||
6138 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
6139 | } | ||
6140 | |||
6141 | static int | ||
6142 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | ||
6143 | { | ||
6144 | struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; | ||
6145 | const struct cpumask *span = sched_domain_span(sd); | ||
6146 | struct cpumask *covered = sched_domains_tmpmask; | ||
6147 | struct sd_data *sdd = sd->private; | ||
6148 | struct sched_domain *sibling; | ||
6149 | int i; | ||
6150 | |||
6151 | cpumask_clear(covered); | ||
6152 | |||
6153 | for_each_cpu(i, span) { | ||
6154 | struct cpumask *sg_span; | ||
6155 | |||
6156 | if (cpumask_test_cpu(i, covered)) | ||
6157 | continue; | ||
6158 | |||
6159 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
6160 | |||
6161 | /* See the comment near build_group_mask(). */ | ||
6162 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
6163 | continue; | ||
6164 | |||
6165 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
6166 | GFP_KERNEL, cpu_to_node(cpu)); | ||
6167 | |||
6168 | if (!sg) | ||
6169 | goto fail; | ||
6170 | |||
6171 | sg_span = sched_group_cpus(sg); | ||
6172 | if (sibling->child) | ||
6173 | cpumask_copy(sg_span, sched_domain_span(sibling->child)); | ||
6174 | else | ||
6175 | cpumask_set_cpu(i, sg_span); | ||
6176 | |||
6177 | cpumask_or(covered, covered, sg_span); | ||
6178 | |||
6179 | sg->sgc = *per_cpu_ptr(sdd->sgc, i); | ||
6180 | if (atomic_inc_return(&sg->sgc->ref) == 1) | ||
6181 | build_group_mask(sd, sg); | ||
6182 | |||
6183 | /* | ||
6184 | * Initialize sgc->capacity such that even if we mess up the | ||
6185 | * domains and no possible iteration will get us here, we won't | ||
6186 | * die on a /0 trap. | ||
6187 | */ | ||
6188 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | ||
6189 | sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; | ||
6190 | |||
6191 | /* | ||
6192 | * Make sure the first group of this domain contains the | ||
6193 | * canonical balance cpu. Otherwise the sched_domain iteration | ||
6194 | * breaks. See update_sg_lb_stats(). | ||
6195 | */ | ||
6196 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
6197 | group_balance_cpu(sg) == cpu) | ||
6198 | groups = sg; | ||
6199 | |||
6200 | if (!first) | ||
6201 | first = sg; | ||
6202 | if (last) | ||
6203 | last->next = sg; | ||
6204 | last = sg; | ||
6205 | last->next = first; | ||
6206 | } | ||
6207 | sd->groups = groups; | ||
6208 | |||
6209 | return 0; | ||
6210 | |||
6211 | fail: | ||
6212 | free_sched_groups(first, 0); | ||
6213 | |||
6214 | return -ENOMEM; | ||
6215 | } | ||
6216 | |||
6217 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | ||
6218 | { | ||
6219 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
6220 | struct sched_domain *child = sd->child; | ||
6221 | |||
6222 | if (child) | ||
6223 | cpu = cpumask_first(sched_domain_span(child)); | ||
6224 | |||
6225 | if (sg) { | ||
6226 | *sg = *per_cpu_ptr(sdd->sg, cpu); | ||
6227 | (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); | ||
6228 | atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ | ||
6229 | } | ||
6230 | |||
6231 | return cpu; | ||
6232 | } | ||
6233 | |||
6234 | /* | ||
6235 | * build_sched_groups will build a circular linked list of the groups | ||
6236 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
6237 | * and ->cpu_capacity to 0. | ||
6238 | * | ||
6239 | * Assumes the sched_domain tree is fully constructed | ||
6240 | */ | ||
6241 | static int | ||
6242 | build_sched_groups(struct sched_domain *sd, int cpu) | ||
6243 | { | ||
6244 | struct sched_group *first = NULL, *last = NULL; | ||
6245 | struct sd_data *sdd = sd->private; | ||
6246 | const struct cpumask *span = sched_domain_span(sd); | ||
6247 | struct cpumask *covered; | ||
6248 | int i; | ||
6249 | |||
6250 | get_group(cpu, sdd, &sd->groups); | ||
6251 | atomic_inc(&sd->groups->ref); | ||
6252 | |||
6253 | if (cpu != cpumask_first(span)) | ||
6254 | return 0; | ||
6255 | |||
6256 | lockdep_assert_held(&sched_domains_mutex); | ||
6257 | covered = sched_domains_tmpmask; | ||
6258 | |||
6259 | cpumask_clear(covered); | ||
6260 | |||
6261 | for_each_cpu(i, span) { | ||
6262 | struct sched_group *sg; | ||
6263 | int group, j; | ||
6264 | |||
6265 | if (cpumask_test_cpu(i, covered)) | ||
6266 | continue; | ||
6267 | |||
6268 | group = get_group(i, sdd, &sg); | ||
6269 | cpumask_setall(sched_group_mask(sg)); | ||
6270 | |||
6271 | for_each_cpu(j, span) { | ||
6272 | if (get_group(j, sdd, NULL) != group) | ||
6273 | continue; | ||
6274 | |||
6275 | cpumask_set_cpu(j, covered); | ||
6276 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
6277 | } | ||
6278 | |||
6279 | if (!first) | ||
6280 | first = sg; | ||
6281 | if (last) | ||
6282 | last->next = sg; | ||
6283 | last = sg; | ||
6284 | } | ||
6285 | last->next = first; | ||
6286 | |||
6287 | return 0; | ||
6288 | } | ||
6289 | |||
6290 | /* | ||
6291 | * Initialize sched groups cpu_capacity. | ||
6292 | * | ||
6293 | * cpu_capacity indicates the capacity of sched group, which is used while | ||
6294 | * distributing the load between different sched groups in a sched domain. | ||
6295 | * Typically cpu_capacity for all the groups in a sched domain will be the same | ||
6296 | * unless there are asymmetries in the topology. If there are asymmetries, the | ||
6297 | * group having more cpu_capacity will pick up more load compared to the | ||
6298 | * group having less cpu_capacity. | ||
6299 | */ | ||
6300 | static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) | ||
6301 | { | ||
6302 | struct sched_group *sg = sd->groups; | ||
6303 | |||
6304 | WARN_ON(!sg); | ||
6305 | |||
6306 | do { | ||
6307 | int cpu, max_cpu = -1; | ||
6308 | |||
6309 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | ||
6310 | |||
6311 | if (!(sd->flags & SD_ASYM_PACKING)) | ||
6312 | goto next; | ||
6313 | |||
6314 | for_each_cpu(cpu, sched_group_cpus(sg)) { | ||
6315 | if (max_cpu < 0) | ||
6316 | max_cpu = cpu; | ||
6317 | else if (sched_asym_prefer(cpu, max_cpu)) | ||
6318 | max_cpu = cpu; | ||
6319 | } | ||
6320 | sg->asym_prefer_cpu = max_cpu; | ||
6321 | |||
6322 | next: | ||
6323 | sg = sg->next; | ||
6324 | } while (sg != sd->groups); | ||
6325 | |||
6326 | if (cpu != group_balance_cpu(sg)) | ||
6327 | return; | ||
6328 | |||
6329 | update_group_capacity(sd, cpu); | ||
6330 | } | ||
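A tiny worked example of the "more cpu_capacity picks up more load" statement in the comment above, using illustrative numbers (capacity units of 1024 per CPU are an assumption, in the style of SCHED_CAPACITY_SCALE):

  #include <stdio.h>

  int main(void)
  {
          /* Group A: two full CPUs, group B: one CPU (illustrative capacities). */
          unsigned long cap_a = 2048, cap_b = 1024, load = 3000;

          /* Load is split in proportion to group capacity. */
          unsigned long share_a = load * cap_a / (cap_a + cap_b);
          unsigned long share_b = load - share_a;

          printf("A gets %lu, B gets %lu\n", share_a, share_b);  /* 2000 vs 1000 */
          return 0;
  }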
6331 | |||
6332 | /* | ||
6333 | * Initializers for schedule domains | ||
6334 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | ||
6335 | */ | ||
6336 | |||
6337 | static int default_relax_domain_level = -1; | ||
6338 | int sched_domain_level_max; | ||
6339 | |||
6340 | static int __init setup_relax_domain_level(char *str) | ||
6341 | { | ||
6342 | if (kstrtoint(str, 0, &default_relax_domain_level)) | ||
6343 | pr_warn("Unable to set relax_domain_level\n"); | ||
6344 | |||
6345 | return 1; | ||
6346 | } | ||
6347 | __setup("relax_domain_level=", setup_relax_domain_level); | ||
6348 | |||
6349 | static void set_domain_attribute(struct sched_domain *sd, | ||
6350 | struct sched_domain_attr *attr) | ||
6351 | { | ||
6352 | int request; | ||
6353 | |||
6354 | if (!attr || attr->relax_domain_level < 0) { | ||
6355 | if (default_relax_domain_level < 0) | ||
6356 | return; | ||
6357 | else | ||
6358 | request = default_relax_domain_level; | ||
6359 | } else | ||
6360 | request = attr->relax_domain_level; | ||
6361 | if (request < sd->level) { | ||
6362 | /* turn off idle balance on this domain */ | ||
6363 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
6364 | } else { | ||
6365 | /* turn on idle balance on this domain */ | ||
6366 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
6367 | } | ||
6368 | } | ||
6369 | |||
6370 | static void __sdt_free(const struct cpumask *cpu_map); | ||
6371 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
6372 | |||
6373 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | ||
6374 | const struct cpumask *cpu_map) | ||
6375 | { | ||
6376 | switch (what) { | ||
6377 | case sa_rootdomain: | ||
6378 | if (!atomic_read(&d->rd->refcount)) | ||
6379 | free_rootdomain(&d->rd->rcu); /* fall through */ | ||
6380 | case sa_sd: | ||
6381 | free_percpu(d->sd); /* fall through */ | ||
6382 | case sa_sd_storage: | ||
6383 | __sdt_free(cpu_map); /* fall through */ | ||
6384 | case sa_none: | ||
6385 | break; | ||
6386 | } | ||
6387 | } | ||
6388 | |||
6389 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | ||
6390 | const struct cpumask *cpu_map) | ||
6391 | { | ||
6392 | memset(d, 0, sizeof(*d)); | ||
6393 | |||
6394 | if (__sdt_alloc(cpu_map)) | ||
6395 | return sa_sd_storage; | ||
6396 | d->sd = alloc_percpu(struct sched_domain *); | ||
6397 | if (!d->sd) | ||
6398 | return sa_sd_storage; | ||
6399 | d->rd = alloc_rootdomain(); | ||
6400 | if (!d->rd) | ||
6401 | return sa_sd; | ||
6402 | return sa_rootdomain; | ||
6403 | } | ||
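Editor's note: these two helpers form a small rollback state machine: __visit_domain_allocation_hell() reports the stage the allocations reached, and __free_domain_allocs() falls through from that stage to release what was set up. A standalone sketch of the same fall-through teardown pattern with plain malloc()/free() (the stage names and buffers are invented; the kernel's equivalent enum is s_alloc):

#include <stdlib.h>

enum stage { st_none, st_buf_a, st_buf_b, st_all };

struct ctx { void *a, *b, *c; };

/* Free everything from @what downwards; mirrors the fall-through switch
 * in __free_domain_allocs(). */
static void ctx_free(struct ctx *d, enum stage what)
{
	switch (what) {
	case st_all:
		free(d->c);	/* fall through */
	case st_buf_b:
		free(d->b);	/* fall through */
	case st_buf_a:
		free(d->a);	/* fall through */
	case st_none:
		break;
	}
}

/* Allocate in order and report the stage reached, so the caller knows
 * exactly what to roll back. */
static enum stage ctx_alloc(struct ctx *d)
{
	d->a = malloc(64);
	if (!d->a)
		return st_none;
	d->b = malloc(64);
	if (!d->b)
		return st_buf_a;
	d->c = malloc(64);
	if (!d->c)
		return st_buf_b;
	return st_all;
}

int main(void)
{
	struct ctx d = { 0 };
	enum stage state = ctx_alloc(&d);

	/* ... use the buffers when state == st_all ... */

	ctx_free(&d, state);	/* releases only what was actually allocated */
	return state == st_all ? 0 : 1;
}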
6404 | |||
6405 | /* | ||
6406 | * NULL the sd_data elements we've used to build the sched_domain and | ||
6407 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
6408 | * will not free the data we're using. | ||
6409 | */ | ||
6410 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
6411 | { | ||
6412 | struct sd_data *sdd = sd->private; | ||
6413 | |||
6414 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); | ||
6415 | *per_cpu_ptr(sdd->sd, cpu) = NULL; | ||
6416 | |||
6417 | if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) | ||
6418 | *per_cpu_ptr(sdd->sds, cpu) = NULL; | ||
6419 | |||
6420 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) | ||
6421 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | ||
6422 | |||
6423 | if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) | ||
6424 | *per_cpu_ptr(sdd->sgc, cpu) = NULL; | ||
6425 | } | ||
6426 | |||
6427 | #ifdef CONFIG_NUMA | ||
6428 | static int sched_domains_numa_levels; | ||
6429 | enum numa_topology_type sched_numa_topology_type; | ||
6430 | static int *sched_domains_numa_distance; | ||
6431 | int sched_max_numa_distance; | ||
6432 | static struct cpumask ***sched_domains_numa_masks; | ||
6433 | static int sched_domains_curr_level; | ||
6434 | #endif | ||
6435 | |||
6436 | /* | ||
6437 | * SD_flags allowed in topology descriptions. | ||
6438 | * | ||
6439 | * These flags are purely descriptive of the topology and do not prescribe | ||
6440 | * behaviour. Behaviour is artificial and mapped in the below sd_init() | ||
6441 | * function: | ||
6442 | * | ||
6443 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | ||
6444 | * SD_SHARE_PKG_RESOURCES - describes shared caches | ||
6445 | * SD_NUMA - describes NUMA topologies | ||
6446 | * SD_SHARE_POWERDOMAIN - describes shared power domain | ||
6447 | * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies | ||
6448 | * | ||
6449 | * Odd one out, which beside describing the topology has a quirk also | ||
6450 | * prescribes the desired behaviour that goes along with it: | ||
6451 | * | ||
6452 | * SD_ASYM_PACKING - describes SMT quirks | ||
6453 | */ | ||
6454 | #define TOPOLOGY_SD_FLAGS \ | ||
6455 | (SD_SHARE_CPUCAPACITY | \ | ||
6456 | SD_SHARE_PKG_RESOURCES | \ | ||
6457 | SD_NUMA | \ | ||
6458 | SD_ASYM_PACKING | \ | ||
6459 | SD_ASYM_CPUCAPACITY | \ | ||
6460 | SD_SHARE_POWERDOMAIN) | ||
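Editor's note: sd_init() below rejects any flag outside this set with a one-time warning and masks it off. A standalone model of that check, using invented X_* bit values in place of the real SD_* flags:

#include <stdio.h>

/* Stand-in flag bits; the real SD_* values live in the kernel headers. */
#define X_SHARE_CPUCAPACITY	0x0001	/* SMT siblings */
#define X_SHARE_PKG_RESOURCES	0x0002	/* shared cache */
#define X_NUMA			0x0004	/* NUMA level */
#define X_SERIALIZE		0x0008	/* behavioural flag, not topological */

#define X_TOPOLOGY_FLAGS \
	(X_SHARE_CPUCAPACITY | X_SHARE_PKG_RESOURCES | X_NUMA)

/* Topology levels may only pass descriptive flags; anything else is
 * reported and stripped, like the WARN_ONCE() in sd_init(). */
static int sanitize_topology_flags(int sd_flags)
{
	if (sd_flags & ~X_TOPOLOGY_FLAGS) {
		fprintf(stderr, "wrong sd_flags in topology description\n");
		sd_flags &= X_TOPOLOGY_FLAGS;
	}
	return sd_flags;
}

int main(void)
{
	/* A buggy topology level trying to smuggle in a behavioural flag: */
	int flags = X_SHARE_PKG_RESOURCES | X_SERIALIZE;

	printf("sanitized flags: 0x%x\n", sanitize_topology_flags(flags));
	return 0;
}

Running it prints the warning and a sanitized value of 0x2.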
6461 | |||
6462 | static struct sched_domain * | ||
6463 | sd_init(struct sched_domain_topology_level *tl, | ||
6464 | const struct cpumask *cpu_map, | ||
6465 | struct sched_domain *child, int cpu) | ||
6466 | { | ||
6467 | struct sd_data *sdd = &tl->data; | ||
6468 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
6469 | int sd_id, sd_weight, sd_flags = 0; | ||
6470 | |||
6471 | #ifdef CONFIG_NUMA | ||
6472 | /* | ||
6473 | * Ugly hack to pass state to sd_numa_mask()... | ||
6474 | */ | ||
6475 | sched_domains_curr_level = tl->numa_level; | ||
6476 | #endif | ||
6477 | |||
6478 | sd_weight = cpumask_weight(tl->mask(cpu)); | ||
6479 | |||
6480 | if (tl->sd_flags) | ||
6481 | sd_flags = (*tl->sd_flags)(); | ||
6482 | if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, | ||
6483 | "wrong sd_flags in topology description\n")) | ||
6484 | sd_flags &= ~TOPOLOGY_SD_FLAGS; | ||
6485 | |||
6486 | *sd = (struct sched_domain){ | ||
6487 | .min_interval = sd_weight, | ||
6488 | .max_interval = 2*sd_weight, | ||
6489 | .busy_factor = 32, | ||
6490 | .imbalance_pct = 125, | ||
6491 | |||
6492 | .cache_nice_tries = 0, | ||
6493 | .busy_idx = 0, | ||
6494 | .idle_idx = 0, | ||
6495 | .newidle_idx = 0, | ||
6496 | .wake_idx = 0, | ||
6497 | .forkexec_idx = 0, | ||
6498 | |||
6499 | .flags = 1*SD_LOAD_BALANCE | ||
6500 | | 1*SD_BALANCE_NEWIDLE | ||
6501 | | 1*SD_BALANCE_EXEC | ||
6502 | | 1*SD_BALANCE_FORK | ||
6503 | | 0*SD_BALANCE_WAKE | ||
6504 | | 1*SD_WAKE_AFFINE | ||
6505 | | 0*SD_SHARE_CPUCAPACITY | ||
6506 | | 0*SD_SHARE_PKG_RESOURCES | ||
6507 | | 0*SD_SERIALIZE | ||
6508 | | 0*SD_PREFER_SIBLING | ||
6509 | | 0*SD_NUMA | ||
6510 | | sd_flags | ||
6511 | , | ||
6512 | |||
6513 | .last_balance = jiffies, | ||
6514 | .balance_interval = sd_weight, | ||
6515 | .smt_gain = 0, | ||
6516 | .max_newidle_lb_cost = 0, | ||
6517 | .next_decay_max_lb_cost = jiffies, | ||
6518 | .child = child, | ||
6519 | #ifdef CONFIG_SCHED_DEBUG | ||
6520 | .name = tl->name, | ||
6521 | #endif | ||
6522 | }; | ||
6523 | |||
6524 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
6525 | sd_id = cpumask_first(sched_domain_span(sd)); | ||
6526 | |||
6527 | /* | ||
6528 | * Convert topological properties into behaviour. | ||
6529 | */ | ||
6530 | |||
6531 | if (sd->flags & SD_ASYM_CPUCAPACITY) { | ||
6532 | struct sched_domain *t = sd; | ||
6533 | |||
6534 | for_each_lower_domain(t) | ||
6535 | t->flags |= SD_BALANCE_WAKE; | ||
6536 | } | ||
6537 | |||
6538 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | ||
6539 | sd->flags |= SD_PREFER_SIBLING; | ||
6540 | sd->imbalance_pct = 110; | ||
6541 | sd->smt_gain = 1178; /* ~15% */ | ||
6542 | |||
6543 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
6544 | sd->imbalance_pct = 117; | ||
6545 | sd->cache_nice_tries = 1; | ||
6546 | sd->busy_idx = 2; | ||
6547 | |||
6548 | #ifdef CONFIG_NUMA | ||
6549 | } else if (sd->flags & SD_NUMA) { | ||
6550 | sd->cache_nice_tries = 2; | ||
6551 | sd->busy_idx = 3; | ||
6552 | sd->idle_idx = 2; | ||
6553 | |||
6554 | sd->flags |= SD_SERIALIZE; | ||
6555 | if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { | ||
6556 | sd->flags &= ~(SD_BALANCE_EXEC | | ||
6557 | SD_BALANCE_FORK | | ||
6558 | SD_WAKE_AFFINE); | ||
6559 | } | ||
6560 | |||
6561 | #endif | ||
6562 | } else { | ||
6563 | sd->flags |= SD_PREFER_SIBLING; | ||
6564 | sd->cache_nice_tries = 1; | ||
6565 | sd->busy_idx = 2; | ||
6566 | sd->idle_idx = 1; | ||
6567 | } | ||
6568 | |||
6569 | /* | ||
6570 | * For all levels sharing cache; connect a sched_domain_shared | ||
6571 | * instance. | ||
6572 | */ | ||
6573 | if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
6574 | sd->shared = *per_cpu_ptr(sdd->sds, sd_id); | ||
6575 | atomic_inc(&sd->shared->ref); | ||
6576 | atomic_set(&sd->shared->nr_busy_cpus, sd_weight); | ||
6577 | } | ||
6578 | |||
6579 | sd->private = sdd; | ||
6580 | |||
6581 | return sd; | ||
6582 | } | ||
6583 | |||
6584 | /* | ||
6585 | * Topology list, bottom-up. | ||
6586 | */ | ||
6587 | static struct sched_domain_topology_level default_topology[] = { | ||
6588 | #ifdef CONFIG_SCHED_SMT | ||
6589 | { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, | ||
6590 | #endif | ||
6591 | #ifdef CONFIG_SCHED_MC | ||
6592 | { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, | ||
6593 | #endif | ||
6594 | { cpu_cpu_mask, SD_INIT_NAME(DIE) }, | ||
6595 | { NULL, }, | ||
6596 | }; | ||
6597 | |||
6598 | static struct sched_domain_topology_level *sched_domain_topology = | ||
6599 | default_topology; | ||
6600 | |||
6601 | #define for_each_sd_topology(tl) \ | ||
6602 | for (tl = sched_domain_topology; tl->mask; tl++) | ||
6603 | |||
6604 | void set_sched_topology(struct sched_domain_topology_level *tl) | ||
6605 | { | ||
6606 | if (WARN_ON_ONCE(sched_smp_initialized)) | ||
6607 | return; | ||
6608 | |||
6609 | sched_domain_topology = tl; | ||
6610 | } | ||
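Editor's note: architectures with non-default topologies call set_sched_topology() with their own bottom-up, NULL-terminated table before SMP bring-up. The sketch below models such a table in userspace with simplified stand-in types (struct topo_level and the mask helpers are invented; the real type is struct sched_domain_topology_level):

#include <stdio.h>

/* Simplified stand-in for struct sched_domain_topology_level. */
struct topo_level {
	unsigned int (*mask)(int cpu);	/* span of the level containing @cpu */
	const char *name;
};

/* Hypothetical 4-CPU machine: CPUs {0,1} and {2,3} are SMT siblings. */
static unsigned int smt_mask(int cpu)  { return cpu < 2 ? 0x3 : 0xc; }
static unsigned int die_mask(int cpu)  { (void)cpu; return 0xf; }

/* Bottom-up, smallest span first, NULL-terminated like default_topology[]. */
static const struct topo_level my_topology[] = {
	{ smt_mask, "SMT" },
	{ die_mask, "DIE" },
	{ NULL, NULL },
};

int main(void)
{
	int cpu = 2;

	/* Walk the table the way for_each_sd_topology() does. */
	for (const struct topo_level *tl = my_topology; tl->mask; tl++)
		printf("cpu%d %s span: 0x%x\n", cpu, tl->name, tl->mask(cpu));
	return 0;
}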
6611 | |||
6612 | #ifdef CONFIG_NUMA | ||
6613 | |||
6614 | static const struct cpumask *sd_numa_mask(int cpu) | ||
6615 | { | ||
6616 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
6617 | } | ||
6618 | |||
6619 | static void sched_numa_warn(const char *str) | ||
6620 | { | ||
6621 | static int done = false; | ||
6622 | int i,j; | ||
6623 | |||
6624 | if (done) | ||
6625 | return; | ||
6626 | |||
6627 | done = true; | ||
6628 | |||
6629 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
6630 | |||
6631 | for (i = 0; i < nr_node_ids; i++) { | ||
6632 | printk(KERN_WARNING " "); | ||
6633 | for (j = 0; j < nr_node_ids; j++) | ||
6634 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
6635 | printk(KERN_CONT "\n"); | ||
6636 | } | ||
6637 | printk(KERN_WARNING "\n"); | ||
6638 | } | ||
6639 | |||
6640 | bool find_numa_distance(int distance) | ||
6641 | { | ||
6642 | int i; | ||
6643 | |||
6644 | if (distance == node_distance(0, 0)) | ||
6645 | return true; | ||
6646 | |||
6647 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6648 | if (sched_domains_numa_distance[i] == distance) | ||
6649 | return true; | ||
6650 | } | ||
6651 | |||
6652 | return false; | ||
6653 | } | ||
6654 | |||
6655 | /* | ||
6656 | * A system can have three types of NUMA topology: | ||
6657 | * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system | ||
6658 | * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes | ||
6659 | * NUMA_BACKPLANE: nodes can reach other nodes through a backplane | ||
6660 | * | ||
6661 | * The difference between a glueless mesh topology and a backplane | ||
6662 | * topology lies in whether communication between not directly | ||
6663 | * connected nodes goes through intermediary nodes (where programs | ||
6664 | * could run), or through backplane controllers. This affects | ||
6665 | * placement of programs. | ||
6666 | * | ||
6667 | * The type of topology can be discerned with the following tests: | ||
6668 | * - If the maximum distance between any nodes is 1 hop, the system | ||
6669 | * is directly connected. | ||
6670 | * - If for two nodes A and B, located N > 1 hops away from each other, | ||
6671 | * there is an intermediary node C, which is < N hops away from both | ||
6672 | * nodes A and B, the system is a glueless mesh. | ||
6673 | */ | ||
6674 | static void init_numa_topology_type(void) | ||
6675 | { | ||
6676 | int a, b, c, n; | ||
6677 | |||
6678 | n = sched_max_numa_distance; | ||
6679 | |||
6680 | if (sched_domains_numa_levels <= 1) { | ||
6681 | sched_numa_topology_type = NUMA_DIRECT; | ||
6682 | return; | ||
6683 | } | ||
6684 | |||
6685 | for_each_online_node(a) { | ||
6686 | for_each_online_node(b) { | ||
6687 | /* Find two nodes furthest removed from each other. */ | ||
6688 | if (node_distance(a, b) < n) | ||
6689 | continue; | ||
6690 | |||
6691 | /* Is there an intermediary node between a and b? */ | ||
6692 | for_each_online_node(c) { | ||
6693 | if (node_distance(a, c) < n && | ||
6694 | node_distance(b, c) < n) { | ||
6695 | sched_numa_topology_type = | ||
6696 | NUMA_GLUELESS_MESH; | ||
6697 | return; | ||
6698 | } | ||
6699 | } | ||
6700 | |||
6701 | sched_numa_topology_type = NUMA_BACKPLANE; | ||
6702 | return; | ||
6703 | } | ||
6704 | } | ||
6705 | } | ||
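Editor's note: a worked example of the classification above, as a standalone program with an invented 4-node distance table. Nodes 0 and 3 are maximally distant, but node 1 is strictly closer to both of them, so the rules settle on NUMA_GLUELESS_MESH:

#include <stdio.h>

#define NODES 4

/* Hypothetical symmetric distance table (10 = local). */
static const int dist[NODES][NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 20, 20 },
	{ 20, 20, 10, 20 },
	{ 30, 20, 20, 10 },
};

static const char *numa_type(void)
{
	int a, b, c, max = 0, min_remote = 1 << 30;

	for (a = 0; a < NODES; a++) {
		for (b = 0; b < NODES; b++) {
			if (a == b)
				continue;
			if (dist[a][b] > max)
				max = dist[a][b];
			if (dist[a][b] < min_remote)
				min_remote = dist[a][b];
		}
	}

	if (max == min_remote)		/* every remote node is one hop away */
		return "NUMA_DIRECT";

	for (a = 0; a < NODES; a++) {
		for (b = 0; b < NODES; b++) {
			if (dist[a][b] < max)	/* want a maximally distant pair */
				continue;

			/* Is some node c strictly closer to both a and b? */
			for (c = 0; c < NODES; c++)
				if (dist[a][c] < max && dist[b][c] < max)
					return "NUMA_GLUELESS_MESH";

			return "NUMA_BACKPLANE";
		}
	}
	return "NUMA_DIRECT";
}

int main(void)
{
	printf("topology type: %s\n", numa_type());
	return 0;
}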
6706 | |||
6707 | static void sched_init_numa(void) | ||
6708 | { | ||
6709 | int next_distance, curr_distance = node_distance(0, 0); | ||
6710 | struct sched_domain_topology_level *tl; | ||
6711 | int level = 0; | ||
6712 | int i, j, k; | ||
6713 | |||
6714 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
6715 | if (!sched_domains_numa_distance) | ||
6716 | return; | ||
6717 | |||
6718 | /* | ||
6719 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
6720 | * unique distances in the node_distance() table. | ||
6721 | * | ||
6722 | * Assumes node_distance(0,j) includes all distances in | ||
6723 | * node_distance(i,j) in order to avoid cubic time. | ||
6724 | */ | ||
6725 | next_distance = curr_distance; | ||
6726 | for (i = 0; i < nr_node_ids; i++) { | ||
6727 | for (j = 0; j < nr_node_ids; j++) { | ||
6728 | for (k = 0; k < nr_node_ids; k++) { | ||
6729 | int distance = node_distance(i, k); | ||
6730 | |||
6731 | if (distance > curr_distance && | ||
6732 | (distance < next_distance || | ||
6733 | next_distance == curr_distance)) | ||
6734 | next_distance = distance; | ||
6735 | |||
6736 | /* | ||
6737 | * While not a strong assumption, it would be nice to know | ||
6738 | * about cases where node A is connected to B, but B is not | ||
6739 | * equally connected to A. | ||
6740 | */ | ||
6741 | if (sched_debug() && node_distance(k, i) != distance) | ||
6742 | sched_numa_warn("Node-distance not symmetric"); | ||
6743 | |||
6744 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
6745 | sched_numa_warn("Node-0 not representative"); | ||
6746 | } | ||
6747 | if (next_distance != curr_distance) { | ||
6748 | sched_domains_numa_distance[level++] = next_distance; | ||
6749 | sched_domains_numa_levels = level; | ||
6750 | curr_distance = next_distance; | ||
6751 | } else break; | ||
6752 | } | ||
6753 | |||
6754 | /* | ||
6755 | * In case of sched_debug() we verify the above assumption. | ||
6756 | */ | ||
6757 | if (!sched_debug()) | ||
6758 | break; | ||
6759 | } | ||
6760 | |||
6761 | if (!level) | ||
6762 | return; | ||
6763 | |||
6764 | /* | ||
6765 | * 'level' contains the number of unique distances, excluding the | ||
6766 | * identity distance node_distance(i,i). | ||
6767 | * | ||
6768 | * The sched_domains_numa_distance[] array includes the actual distance | ||
6769 | * numbers. | ||
6770 | */ | ||
6771 | |||
6772 | /* | ||
6773 | * Here, we should temporarily reset sched_domains_numa_levels to 0. | ||
6774 | * If it fails to allocate memory for array sched_domains_numa_masks[][], | ||
6775 | * the array will contain fewer than 'level' members. This could be | ||
6776 | * dangerous when we use it to iterate the array sched_domains_numa_masks[][] | ||
6777 | * in other functions. | ||
6778 | * | ||
6779 | * We reset it to 'level' at the end of this function. | ||
6780 | */ | ||
6781 | sched_domains_numa_levels = 0; | ||
6782 | |||
6783 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
6784 | if (!sched_domains_numa_masks) | ||
6785 | return; | ||
6786 | |||
6787 | /* | ||
6788 | * Now for each level, construct a mask per node which contains all | ||
6789 | * cpus of nodes that are that many hops away from us. | ||
6790 | */ | ||
6791 | for (i = 0; i < level; i++) { | ||
6792 | sched_domains_numa_masks[i] = | ||
6793 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
6794 | if (!sched_domains_numa_masks[i]) | ||
6795 | return; | ||
6796 | |||
6797 | for (j = 0; j < nr_node_ids; j++) { | ||
6798 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
6799 | if (!mask) | ||
6800 | return; | ||
6801 | |||
6802 | sched_domains_numa_masks[i][j] = mask; | ||
6803 | |||
6804 | for_each_node(k) { | ||
6805 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
6806 | continue; | ||
6807 | |||
6808 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
6809 | } | ||
6810 | } | ||
6811 | } | ||
6812 | |||
6813 | /* Compute default topology size */ | ||
6814 | for (i = 0; sched_domain_topology[i].mask; i++); | ||
6815 | |||
6816 | tl = kzalloc((i + level + 1) * | ||
6817 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
6818 | if (!tl) | ||
6819 | return; | ||
6820 | |||
6821 | /* | ||
6822 | * Copy the default topology bits.. | ||
6823 | */ | ||
6824 | for (i = 0; sched_domain_topology[i].mask; i++) | ||
6825 | tl[i] = sched_domain_topology[i]; | ||
6826 | |||
6827 | /* | ||
6828 | * .. and append 'j' levels of NUMA goodness. | ||
6829 | */ | ||
6830 | for (j = 0; j < level; i++, j++) { | ||
6831 | tl[i] = (struct sched_domain_topology_level){ | ||
6832 | .mask = sd_numa_mask, | ||
6833 | .sd_flags = cpu_numa_flags, | ||
6834 | .flags = SDTL_OVERLAP, | ||
6835 | .numa_level = j, | ||
6836 | SD_INIT_NAME(NUMA) | ||
6837 | }; | ||
6838 | } | ||
6839 | |||
6840 | sched_domain_topology = tl; | ||
6841 | |||
6842 | sched_domains_numa_levels = level; | ||
6843 | sched_max_numa_distance = sched_domains_numa_distance[level - 1]; | ||
6844 | |||
6845 | init_numa_topology_type(); | ||
6846 | } | ||
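Editor's note: to make the construction concrete, the standalone sketch below runs the same two steps on an invented 4-node distance table: extract the unique remote distances (the levels), then build a per-level, per-node mask of the nodes within that distance. The kernel stores the analogous data in sched_domains_numa_distance[] and sched_domains_numa_masks[][], with cpumasks rather than the node masks used here:

#include <stdio.h>

#define NODES 4

static const int dist[NODES][NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 20, 20 },
	{ 20, 20, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int distances[NODES * NODES];
	int levels = 0, i, j, k, curr = dist[0][0], next;

	/* Collect the unique distances above the local one, in ascending order. */
	for (;;) {
		next = curr;
		for (i = 0; i < NODES; i++)
			for (j = 0; j < NODES; j++)
				if (dist[i][j] > curr &&
				    (dist[i][j] < next || next == curr))
					next = dist[i][j];
		if (next == curr)
			break;
		distances[levels++] = next;
		curr = next;
	}

	/* For each level, the mask of nodes within that distance of node j. */
	for (i = 0; i < levels; i++) {
		printf("level %d (distance <= %d):\n", i, distances[i]);
		for (j = 0; j < NODES; j++) {
			unsigned int mask = 0;

			for (k = 0; k < NODES; k++)
				if (dist[j][k] <= distances[i])
					mask |= 1u << k;
			printf("  node %d: node mask 0x%x\n", j, mask);
		}
	}
	return 0;
}

For this table it finds two levels (distances 20 and 30); at the widest level every node's mask covers the whole machine.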
6847 | |||
6848 | static void sched_domains_numa_masks_set(unsigned int cpu) | ||
6849 | { | ||
6850 | int node = cpu_to_node(cpu); | ||
6851 | int i, j; | ||
6852 | |||
6853 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6854 | for (j = 0; j < nr_node_ids; j++) { | ||
6855 | if (node_distance(j, node) <= sched_domains_numa_distance[i]) | ||
6856 | cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
6857 | } | ||
6858 | } | ||
6859 | } | ||
6860 | |||
6861 | static void sched_domains_numa_masks_clear(unsigned int cpu) | ||
6862 | { | ||
6863 | int i, j; | ||
6864 | |||
6865 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6866 | for (j = 0; j < nr_node_ids; j++) | ||
6867 | cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
6868 | } | ||
6869 | } | ||
6870 | |||
6871 | #else | ||
6872 | static inline void sched_init_numa(void) { } | ||
6873 | static void sched_domains_numa_masks_set(unsigned int cpu) { } | ||
6874 | static void sched_domains_numa_masks_clear(unsigned int cpu) { } | ||
6875 | #endif /* CONFIG_NUMA */ | ||
6876 | |||
6877 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
6878 | { | ||
6879 | struct sched_domain_topology_level *tl; | ||
6880 | int j; | ||
6881 | |||
6882 | for_each_sd_topology(tl) { | ||
6883 | struct sd_data *sdd = &tl->data; | ||
6884 | |||
6885 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
6886 | if (!sdd->sd) | ||
6887 | return -ENOMEM; | ||
6888 | |||
6889 | sdd->sds = alloc_percpu(struct sched_domain_shared *); | ||
6890 | if (!sdd->sds) | ||
6891 | return -ENOMEM; | ||
6892 | |||
6893 | sdd->sg = alloc_percpu(struct sched_group *); | ||
6894 | if (!sdd->sg) | ||
6895 | return -ENOMEM; | ||
6896 | |||
6897 | sdd->sgc = alloc_percpu(struct sched_group_capacity *); | ||
6898 | if (!sdd->sgc) | ||
6899 | return -ENOMEM; | ||
6900 | |||
6901 | for_each_cpu(j, cpu_map) { | ||
6902 | struct sched_domain *sd; | ||
6903 | struct sched_domain_shared *sds; | ||
6904 | struct sched_group *sg; | ||
6905 | struct sched_group_capacity *sgc; | ||
6906 | |||
6907 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
6908 | GFP_KERNEL, cpu_to_node(j)); | ||
6909 | if (!sd) | ||
6910 | return -ENOMEM; | ||
6911 | |||
6912 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
6913 | |||
6914 | sds = kzalloc_node(sizeof(struct sched_domain_shared), | ||
6915 | GFP_KERNEL, cpu_to_node(j)); | ||
6916 | if (!sds) | ||
6917 | return -ENOMEM; | ||
6918 | |||
6919 | *per_cpu_ptr(sdd->sds, j) = sds; | ||
6920 | |||
6921 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
6922 | GFP_KERNEL, cpu_to_node(j)); | ||
6923 | if (!sg) | ||
6924 | return -ENOMEM; | ||
6925 | |||
6926 | sg->next = sg; | ||
6927 | |||
6928 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
6929 | |||
6930 | sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), | ||
6931 | GFP_KERNEL, cpu_to_node(j)); | ||
6932 | if (!sgc) | ||
6933 | return -ENOMEM; | ||
6934 | |||
6935 | *per_cpu_ptr(sdd->sgc, j) = sgc; | ||
6936 | } | ||
6937 | } | ||
6938 | |||
6939 | return 0; | ||
6940 | } | ||
6941 | |||
6942 | static void __sdt_free(const struct cpumask *cpu_map) | ||
6943 | { | ||
6944 | struct sched_domain_topology_level *tl; | ||
6945 | int j; | ||
6946 | |||
6947 | for_each_sd_topology(tl) { | ||
6948 | struct sd_data *sdd = &tl->data; | ||
6949 | |||
6950 | for_each_cpu(j, cpu_map) { | ||
6951 | struct sched_domain *sd; | ||
6952 | |||
6953 | if (sdd->sd) { | ||
6954 | sd = *per_cpu_ptr(sdd->sd, j); | ||
6955 | if (sd && (sd->flags & SD_OVERLAP)) | ||
6956 | free_sched_groups(sd->groups, 0); | ||
6957 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
6958 | } | ||
6959 | |||
6960 | if (sdd->sds) | ||
6961 | kfree(*per_cpu_ptr(sdd->sds, j)); | ||
6962 | if (sdd->sg) | ||
6963 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
6964 | if (sdd->sgc) | ||
6965 | kfree(*per_cpu_ptr(sdd->sgc, j)); | ||
6966 | } | ||
6967 | free_percpu(sdd->sd); | ||
6968 | sdd->sd = NULL; | ||
6969 | free_percpu(sdd->sds); | ||
6970 | sdd->sds = NULL; | ||
6971 | free_percpu(sdd->sg); | ||
6972 | sdd->sg = NULL; | ||
6973 | free_percpu(sdd->sgc); | ||
6974 | sdd->sgc = NULL; | ||
6975 | } | ||
6976 | } | ||
6977 | |||
6978 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
6979 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
6980 | struct sched_domain *child, int cpu) | ||
6981 | { | ||
6982 | struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); | ||
6983 | |||
6984 | if (child) { | ||
6985 | sd->level = child->level + 1; | ||
6986 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
6987 | child->parent = sd; | ||
6988 | |||
6989 | if (!cpumask_subset(sched_domain_span(child), | ||
6990 | sched_domain_span(sd))) { | ||
6991 | pr_err("BUG: arch topology borken\n"); | ||
6992 | #ifdef CONFIG_SCHED_DEBUG | ||
6993 | pr_err(" the %s domain not a subset of the %s domain\n", | ||
6994 | child->name, sd->name); | ||
6995 | #endif | ||
6996 | /* Fixup, ensure @sd has at least @child cpus. */ | ||
6997 | cpumask_or(sched_domain_span(sd), | ||
6998 | sched_domain_span(sd), | ||
6999 | sched_domain_span(child)); | ||
7000 | } | ||
7001 | |||
7002 | } | ||
7003 | set_domain_attribute(sd, attr); | ||
7004 | |||
7005 | return sd; | ||
7006 | } | ||
7007 | |||
7008 | /* | 5682 | /* |
7009 | * Build sched domains for a given set of cpus and attach the sched domains | 5683 | * used to mark begin/end of suspend/resume: |
7010 | * to the individual cpus | ||
7011 | */ | 5684 | */ |
7012 | static int build_sched_domains(const struct cpumask *cpu_map, | 5685 | static int num_cpus_frozen; |
7013 | struct sched_domain_attr *attr) | ||
7014 | { | ||
7015 | enum s_alloc alloc_state; | ||
7016 | struct sched_domain *sd; | ||
7017 | struct s_data d; | ||
7018 | struct rq *rq = NULL; | ||
7019 | int i, ret = -ENOMEM; | ||
7020 | |||
7021 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | ||
7022 | if (alloc_state != sa_rootdomain) | ||
7023 | goto error; | ||
7024 | |||
7025 | /* Set up domains for cpus specified by the cpu_map. */ | ||
7026 | for_each_cpu(i, cpu_map) { | ||
7027 | struct sched_domain_topology_level *tl; | ||
7028 | |||
7029 | sd = NULL; | ||
7030 | for_each_sd_topology(tl) { | ||
7031 | sd = build_sched_domain(tl, cpu_map, attr, sd, i); | ||
7032 | if (tl == sched_domain_topology) | ||
7033 | *per_cpu_ptr(d.sd, i) = sd; | ||
7034 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | ||
7035 | sd->flags |= SD_OVERLAP; | ||
7036 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | ||
7037 | break; | ||
7038 | } | ||
7039 | } | ||
7040 | |||
7041 | /* Build the groups for the domains */ | ||
7042 | for_each_cpu(i, cpu_map) { | ||
7043 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
7044 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); | ||
7045 | if (sd->flags & SD_OVERLAP) { | ||
7046 | if (build_overlap_sched_groups(sd, i)) | ||
7047 | goto error; | ||
7048 | } else { | ||
7049 | if (build_sched_groups(sd, i)) | ||
7050 | goto error; | ||
7051 | } | ||
7052 | } | ||
7053 | } | ||
7054 | |||
7055 | /* Calculate CPU capacity for physical packages and nodes */ | ||
7056 | for (i = nr_cpumask_bits-1; i >= 0; i--) { | ||
7057 | if (!cpumask_test_cpu(i, cpu_map)) | ||
7058 | continue; | ||
7059 | |||
7060 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
7061 | claim_allocations(i, sd); | ||
7062 | init_sched_groups_capacity(i, sd); | ||
7063 | } | ||
7064 | } | ||
7065 | |||
7066 | /* Attach the domains */ | ||
7067 | rcu_read_lock(); | ||
7068 | for_each_cpu(i, cpu_map) { | ||
7069 | rq = cpu_rq(i); | ||
7070 | sd = *per_cpu_ptr(d.sd, i); | ||
7071 | |||
7072 | /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ | ||
7073 | if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) | ||
7074 | WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); | ||
7075 | |||
7076 | cpu_attach_domain(sd, d.rd, i); | ||
7077 | } | ||
7078 | rcu_read_unlock(); | ||
7079 | |||
7080 | if (rq && sched_debug_enabled) { | ||
7081 | pr_info("span: %*pbl (max cpu_capacity = %lu)\n", | ||
7082 | cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); | ||
7083 | } | ||
7084 | |||
7085 | ret = 0; | ||
7086 | error: | ||
7087 | __free_domain_allocs(&d, alloc_state, cpu_map); | ||
7088 | return ret; | ||
7089 | } | ||
7090 | |||
7091 | static cpumask_var_t *doms_cur; /* current sched domains */ | ||
7092 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | ||
7093 | static struct sched_domain_attr *dattr_cur; | ||
7094 | /* attributes of custom domains in 'doms_cur' */ | ||
7095 | |||
7096 | /* | ||
7097 | * Special case: If a kmalloc of a doms_cur partition (array of | ||
7098 | * cpumask) fails, then fallback to a single sched domain, | ||
7099 | * as determined by the single cpumask fallback_doms. | ||
7100 | */ | ||
7101 | static cpumask_var_t fallback_doms; | ||
7102 | |||
7103 | /* | ||
7104 | * arch_update_cpu_topology lets virtualized architectures update the | ||
7105 | * cpu core maps. It is supposed to return 1 if the topology changed | ||
7106 | * or 0 if it stayed the same. | ||
7107 | */ | ||
7108 | int __weak arch_update_cpu_topology(void) | ||
7109 | { | ||
7110 | return 0; | ||
7111 | } | ||
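Editor's note: the __weak default above is meant to be overridden by an architecture that can detect topology changes (for example under a hypervisor). A minimal illustration of the weak-symbol mechanism itself, assuming a GCC or Clang toolchain and built as three separate files:

/* topo_default.c -- generic fallback, like the kernel's __weak definition. */
int __attribute__((weak)) arch_update_cpu_topology(void)
{
	return 0;	/* topology unchanged */
}

/* topo_arch.c -- an "architecture" override; the strong symbol wins at link time. */
int arch_update_cpu_topology(void)
{
	return 1;	/* pretend the core maps changed, forcing a domain rebuild */
}

/* main.c */
#include <stdio.h>

int arch_update_cpu_topology(void);

int main(void)
{
	printf("topology changed: %d\n", arch_update_cpu_topology());
	return 0;
}

Linking all three prints 1; leaving topo_arch.c out falls back to the weak default and prints 0.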
7112 | |||
7113 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) | ||
7114 | { | ||
7115 | int i; | ||
7116 | cpumask_var_t *doms; | ||
7117 | |||
7118 | doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); | ||
7119 | if (!doms) | ||
7120 | return NULL; | ||
7121 | for (i = 0; i < ndoms; i++) { | ||
7122 | if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { | ||
7123 | free_sched_domains(doms, i); | ||
7124 | return NULL; | ||
7125 | } | ||
7126 | } | ||
7127 | return doms; | ||
7128 | } | ||
7129 | |||
7130 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | ||
7131 | { | ||
7132 | unsigned int i; | ||
7133 | for (i = 0; i < ndoms; i++) | ||
7134 | free_cpumask_var(doms[i]); | ||
7135 | kfree(doms); | ||
7136 | } | ||
7137 | |||
7138 | /* | ||
7139 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
7140 | * For now this just excludes isolated cpus, but could be used to | ||
7141 | * exclude other special cases in the future. | ||
7142 | */ | ||
7143 | static int init_sched_domains(const struct cpumask *cpu_map) | ||
7144 | { | ||
7145 | int err; | ||
7146 | |||
7147 | arch_update_cpu_topology(); | ||
7148 | ndoms_cur = 1; | ||
7149 | doms_cur = alloc_sched_domains(ndoms_cur); | ||
7150 | if (!doms_cur) | ||
7151 | doms_cur = &fallback_doms; | ||
7152 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | ||
7153 | err = build_sched_domains(doms_cur[0], NULL); | ||
7154 | register_sched_domain_sysctl(); | ||
7155 | |||
7156 | return err; | ||
7157 | } | ||
7158 | |||
7159 | /* | ||
7160 | * Detach sched domains from a group of cpus specified in cpu_map | ||
7161 | * These cpus will now be attached to the NULL domain | ||
7162 | */ | ||
7163 | static void detach_destroy_domains(const struct cpumask *cpu_map) | ||
7164 | { | ||
7165 | int i; | ||
7166 | |||
7167 | rcu_read_lock(); | ||
7168 | for_each_cpu(i, cpu_map) | ||
7169 | cpu_attach_domain(NULL, &def_root_domain, i); | ||
7170 | rcu_read_unlock(); | ||
7171 | } | ||
7172 | |||
7173 | /* handle null as "default" */ | ||
7174 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | ||
7175 | struct sched_domain_attr *new, int idx_new) | ||
7176 | { | ||
7177 | struct sched_domain_attr tmp; | ||
7178 | |||
7179 | /* fast path */ | ||
7180 | if (!new && !cur) | ||
7181 | return 1; | ||
7182 | |||
7183 | tmp = SD_ATTR_INIT; | ||
7184 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | ||
7185 | new ? (new + idx_new) : &tmp, | ||
7186 | sizeof(struct sched_domain_attr)); | ||
7187 | } | ||
7188 | |||
7189 | /* | ||
7190 | * Partition sched domains as specified by the 'ndoms_new' | ||
7191 | * cpumasks in the array doms_new[] of cpumasks. This compares | ||
7192 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | ||
7193 | * It destroys each deleted domain and builds each new domain. | ||
7194 | * | ||
7195 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. | ||
7196 | * The masks don't intersect (don't overlap). We should set up one | ||
7197 | * sched domain for each mask. CPUs not in any of the cpumasks will | ||
7198 | * not be load balanced. If the same cpumask appears both in the | ||
7199 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | ||
7200 | * it as it is. | ||
7201 | * | ||
7202 | * The passed in 'doms_new' should be allocated using | ||
7203 | * alloc_sched_domains. This routine takes ownership of it and will | ||
7204 | * free_sched_domains it when done with it. If the caller failed the | ||
7205 | * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, | ||
7206 | * and partition_sched_domains() will fall back to the single partition | ||
7207 | * 'fallback_doms'; this also forces the domains to be rebuilt. | ||
7208 | * | ||
7209 | * If doms_new == NULL it will be replaced with cpu_online_mask. | ||
7210 | * ndoms_new == 0 is a special case for destroying existing domains, | ||
7211 | * and it will not create the default domain. | ||
7212 | * | ||
7213 | * Call with hotplug lock held | ||
7214 | */ | ||
7215 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], | ||
7216 | struct sched_domain_attr *dattr_new) | ||
7217 | { | ||
7218 | int i, j, n; | ||
7219 | int new_topology; | ||
7220 | |||
7221 | mutex_lock(&sched_domains_mutex); | ||
7222 | |||
7223 | /* always unregister in case we don't destroy any domains */ | ||
7224 | unregister_sched_domain_sysctl(); | ||
7225 | |||
7226 | /* Let architecture update cpu core mappings. */ | ||
7227 | new_topology = arch_update_cpu_topology(); | ||
7228 | |||
7229 | n = doms_new ? ndoms_new : 0; | ||
7230 | |||
7231 | /* Destroy deleted domains */ | ||
7232 | for (i = 0; i < ndoms_cur; i++) { | ||
7233 | for (j = 0; j < n && !new_topology; j++) { | ||
7234 | if (cpumask_equal(doms_cur[i], doms_new[j]) | ||
7235 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | ||
7236 | goto match1; | ||
7237 | } | ||
7238 | /* no match - a current sched domain not in new doms_new[] */ | ||
7239 | detach_destroy_domains(doms_cur[i]); | ||
7240 | match1: | ||
7241 | ; | ||
7242 | } | ||
7243 | |||
7244 | n = ndoms_cur; | ||
7245 | if (doms_new == NULL) { | ||
7246 | n = 0; | ||
7247 | doms_new = &fallback_doms; | ||
7248 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | ||
7249 | WARN_ON_ONCE(dattr_new); | ||
7250 | } | ||
7251 | |||
7252 | /* Build new domains */ | ||
7253 | for (i = 0; i < ndoms_new; i++) { | ||
7254 | for (j = 0; j < n && !new_topology; j++) { | ||
7255 | if (cpumask_equal(doms_new[i], doms_cur[j]) | ||
7256 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | ||
7257 | goto match2; | ||
7258 | } | ||
7259 | /* no match - add a new doms_new */ | ||
7260 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); | ||
7261 | match2: | ||
7262 | ; | ||
7263 | } | ||
7264 | |||
7265 | /* Remember the new sched domains */ | ||
7266 | if (doms_cur != &fallback_doms) | ||
7267 | free_sched_domains(doms_cur, ndoms_cur); | ||
7268 | kfree(dattr_cur); /* kfree(NULL) is safe */ | ||
7269 | doms_cur = doms_new; | ||
7270 | dattr_cur = dattr_new; | ||
7271 | ndoms_cur = ndoms_new; | ||
7272 | |||
7273 | register_sched_domain_sysctl(); | ||
7274 | |||
7275 | mutex_unlock(&sched_domains_mutex); | ||
7276 | } | ||
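Editor's note: the core of partition_sched_domains() is the two matching passes: tear down current partitions with no identical counterpart in the new set, then build only the genuinely new ones. A standalone model of that diff over plain CPU bitmasks (the masks are invented for the example):

#include <stdio.h>

/* Current and new partitions as CPU bitmasks (non-intersecting within a set). */
static const unsigned int doms_cur[] = { 0x0f, 0xf0 };		/* cpus 0-3, 4-7 */
static const unsigned int doms_new[] = { 0x0f, 0x30, 0xc0 };	/* 0-3, 4-5, 6-7 */

static int in_set(unsigned int dom, const unsigned int *set, int n)
{
	for (int i = 0; i < n; i++)
		if (set[i] == dom)
			return 1;
	return 0;
}

int main(void)
{
	const int ncur = sizeof(doms_cur) / sizeof(doms_cur[0]);
	const int nnew = sizeof(doms_new) / sizeof(doms_new[0]);

	/* Pass 1: tear down current domains missing from the new set. */
	for (int i = 0; i < ncur; i++)
		if (!in_set(doms_cur[i], doms_new, nnew))
			printf("detach_destroy_domains(0x%02x)\n", doms_cur[i]);

	/* Pass 2: build new domains that did not already exist. */
	for (int i = 0; i < nnew; i++)
		if (!in_set(doms_new[i], doms_cur, ncur))
			printf("build_sched_domains(0x%02x)\n", doms_new[i]);

	return 0;
}

Here 0x0f appears in both sets and is left untouched, which is the "leave it as it is" case from the comment; 0xf0 is destroyed and 0x30/0xc0 are built.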
7277 | |||
7278 | static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ | ||
7279 | 5686 | ||
7280 | /* | 5687 | /* |
7281 | * Update cpusets according to cpu_active mask. If cpusets are | 5688 | * Update cpusets according to cpu_active mask. If cpusets are |
@@ -7352,7 +5759,7 @@ int sched_cpu_activate(unsigned int cpu) | |||
7352 | * Put the rq online, if not already. This happens: | 5759 | * Put the rq online, if not already. This happens: |
7353 | * | 5760 | * |
7354 | * 1) In the early boot process, because we build the real domains | 5761 | * 1) In the early boot process, because we build the real domains |
7355 | * after all cpus have been brought up. | 5762 | * after all CPUs have been brought up. |
7356 | * | 5763 | * |
7357 | * 2) At runtime, if cpuset_cpu_active() fails to rebuild the | 5764 | * 2) At runtime, if cpuset_cpu_active() fails to rebuild the |
7358 | * domains. | 5765 | * domains. |
@@ -7467,7 +5874,7 @@ void __init sched_init_smp(void) | |||
7467 | 5874 | ||
7468 | /* | 5875 | /* |
7469 | * There's no userspace yet to cause hotplug operations; hence all the | 5876 | * There's no userspace yet to cause hotplug operations; hence all the |
7470 | * cpu masks are stable and all blatant races in the below code cannot | 5877 | * CPU masks are stable and all blatant races in the below code cannot |
7471 | * happen. | 5878 | * happen. |
7472 | */ | 5879 | */ |
7473 | mutex_lock(&sched_domains_mutex); | 5880 | mutex_lock(&sched_domains_mutex); |
@@ -7487,6 +5894,7 @@ void __init sched_init_smp(void) | |||
7487 | init_sched_dl_class(); | 5894 | init_sched_dl_class(); |
7488 | 5895 | ||
7489 | sched_init_smt(); | 5896 | sched_init_smt(); |
5897 | sched_clock_init_late(); | ||
7490 | 5898 | ||
7491 | sched_smp_initialized = true; | 5899 | sched_smp_initialized = true; |
7492 | } | 5900 | } |
@@ -7502,6 +5910,7 @@ early_initcall(migration_init); | |||
7502 | void __init sched_init_smp(void) | 5910 | void __init sched_init_smp(void) |
7503 | { | 5911 | { |
7504 | sched_init_granularity(); | 5912 | sched_init_granularity(); |
5913 | sched_clock_init_late(); | ||
7505 | } | 5914 | } |
7506 | #endif /* CONFIG_SMP */ | 5915 | #endif /* CONFIG_SMP */ |
7507 | 5916 | ||
@@ -7545,6 +5954,8 @@ void __init sched_init(void) | |||
7545 | int i, j; | 5954 | int i, j; |
7546 | unsigned long alloc_size = 0, ptr; | 5955 | unsigned long alloc_size = 0, ptr; |
7547 | 5956 | ||
5957 | sched_clock_init(); | ||
5958 | |||
7548 | for (i = 0; i < WAIT_TABLE_SIZE; i++) | 5959 | for (i = 0; i < WAIT_TABLE_SIZE; i++) |
7549 | init_waitqueue_head(bit_wait_table + i); | 5960 | init_waitqueue_head(bit_wait_table + i); |
7550 | 5961 | ||
@@ -7583,10 +5994,8 @@ void __init sched_init(void) | |||
7583 | } | 5994 | } |
7584 | #endif /* CONFIG_CPUMASK_OFFSTACK */ | 5995 | #endif /* CONFIG_CPUMASK_OFFSTACK */ |
7585 | 5996 | ||
7586 | init_rt_bandwidth(&def_rt_bandwidth, | 5997 | init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); |
7587 | global_rt_period(), global_rt_runtime()); | 5998 | init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime()); |
7588 | init_dl_bandwidth(&def_dl_bandwidth, | ||
7589 | global_rt_period(), global_rt_runtime()); | ||
7590 | 5999 | ||
7591 | #ifdef CONFIG_SMP | 6000 | #ifdef CONFIG_SMP |
7592 | init_defrootdomain(); | 6001 | init_defrootdomain(); |
@@ -7622,18 +6031,18 @@ void __init sched_init(void) | |||
7622 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6031 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7623 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; | 6032 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; |
7624 | /* | 6033 | /* |
7625 | * How much cpu bandwidth does root_task_group get? | 6034 | * How much CPU bandwidth does root_task_group get? |
7626 | * | 6035 | * |
7627 | * In case of task-groups formed thr' the cgroup filesystem, it | 6036 | * In case of task-groups formed thr' the cgroup filesystem, it |
7628 | * gets 100% of the cpu resources in the system. This overall | 6037 | * gets 100% of the CPU resources in the system. This overall |
7629 | * system cpu resource is divided among the tasks of | 6038 | * system CPU resource is divided among the tasks of |
7630 | * root_task_group and its child task-groups in a fair manner, | 6039 | * root_task_group and its child task-groups in a fair manner, |
7631 | * based on each entity's (task or task-group's) weight | 6040 | * based on each entity's (task or task-group's) weight |
7632 | * (se->load.weight). | 6041 | * (se->load.weight). |
7633 | * | 6042 | * |
7634 | * In other words, if root_task_group has 10 tasks of weight | 6043 | * In other words, if root_task_group has 10 tasks of weight |
7635 | * 1024) and two child groups A0 and A1 (of weight 1024 each), | 6044 | * 1024) and two child groups A0 and A1 (of weight 1024 each), |
7636 | * then A0's share of the cpu resource is: | 6045 | * then A0's share of the CPU resource is: |
7637 | * | 6046 | * |
7638 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | 6047 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% |
7639 | * | 6048 | * |
@@ -7742,10 +6151,14 @@ EXPORT_SYMBOL(__might_sleep); | |||
7742 | 6151 | ||
7743 | void ___might_sleep(const char *file, int line, int preempt_offset) | 6152 | void ___might_sleep(const char *file, int line, int preempt_offset) |
7744 | { | 6153 | { |
7745 | static unsigned long prev_jiffy; /* ratelimiting */ | 6154 | /* Ratelimiting timestamp: */ |
6155 | static unsigned long prev_jiffy; | ||
6156 | |||
7746 | unsigned long preempt_disable_ip; | 6157 | unsigned long preempt_disable_ip; |
7747 | 6158 | ||
7748 | rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ | 6159 | /* WARN_ON_ONCE() by default, no rate limit required: */ |
6160 | rcu_sleep_check(); | ||
6161 | |||
7749 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && | 6162 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && |
7750 | !is_idle_task(current)) || | 6163 | !is_idle_task(current)) || |
7751 | system_state != SYSTEM_RUNNING || oops_in_progress) | 6164 | system_state != SYSTEM_RUNNING || oops_in_progress) |
@@ -7754,7 +6167,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
7754 | return; | 6167 | return; |
7755 | prev_jiffy = jiffies; | 6168 | prev_jiffy = jiffies; |
7756 | 6169 | ||
7757 | /* Save this before calling printk(), since that will clobber it */ | 6170 | /* Save this before calling printk(), since that will clobber it: */ |
7758 | preempt_disable_ip = get_preempt_disable_ip(current); | 6171 | preempt_disable_ip = get_preempt_disable_ip(current); |
7759 | 6172 | ||
7760 | printk(KERN_ERR | 6173 | printk(KERN_ERR |
@@ -7833,7 +6246,7 @@ void normalize_rt_tasks(void) | |||
7833 | */ | 6246 | */ |
7834 | 6247 | ||
7835 | /** | 6248 | /** |
7836 | * curr_task - return the current task for a given cpu. | 6249 | * curr_task - return the current task for a given CPU. |
7837 | * @cpu: the processor in question. | 6250 | * @cpu: the processor in question. |
7838 | * | 6251 | * |
7839 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6252 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
@@ -7849,13 +6262,13 @@ struct task_struct *curr_task(int cpu) | |||
7849 | 6262 | ||
7850 | #ifdef CONFIG_IA64 | 6263 | #ifdef CONFIG_IA64 |
7851 | /** | 6264 | /** |
7852 | * set_curr_task - set the current task for a given cpu. | 6265 | * set_curr_task - set the current task for a given CPU. |
7853 | * @cpu: the processor in question. | 6266 | * @cpu: the processor in question. |
7854 | * @p: the task pointer to set. | 6267 | * @p: the task pointer to set. |
7855 | * | 6268 | * |
7856 | * Description: This function must only be used when non-maskable interrupts | 6269 | * Description: This function must only be used when non-maskable interrupts |
7857 | * are serviced on a separate stack. It allows the architecture to switch the | 6270 | * are serviced on a separate stack. It allows the architecture to switch the |
7858 | * notion of the current task on a cpu in a non-blocking manner. This function | 6271 | * notion of the current task on a CPU in a non-blocking manner. This function |
7859 | * must be called with all CPUs synchronized, and interrupts disabled; the | 6272 | * must be called with all CPUs synchronized, and interrupts disabled; the
7860 | * caller must save the original value of the current task (see | 6273 | * caller must save the original value of the current task (see
7861 | * curr_task() above) and restore that value before reenabling interrupts and | 6274 | * curr_task() above) and restore that value before reenabling interrupts and |
@@ -7911,7 +6324,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) | |||
7911 | spin_lock_irqsave(&task_group_lock, flags); | 6324 | spin_lock_irqsave(&task_group_lock, flags); |
7912 | list_add_rcu(&tg->list, &task_groups); | 6325 | list_add_rcu(&tg->list, &task_groups); |
7913 | 6326 | ||
7914 | WARN_ON(!parent); /* root should already exist */ | 6327 | /* Root should already exist: */ |
6328 | WARN_ON(!parent); | ||
7915 | 6329 | ||
7916 | tg->parent = parent; | 6330 | tg->parent = parent; |
7917 | INIT_LIST_HEAD(&tg->children); | 6331 | INIT_LIST_HEAD(&tg->children); |
@@ -7924,13 +6338,13 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) | |||
7924 | /* rcu callback to free various structures associated with a task group */ | 6338 | /* rcu callback to free various structures associated with a task group */ |
7925 | static void sched_free_group_rcu(struct rcu_head *rhp) | 6339 | static void sched_free_group_rcu(struct rcu_head *rhp) |
7926 | { | 6340 | { |
7927 | /* now it should be safe to free those cfs_rqs */ | 6341 | /* Now it should be safe to free those cfs_rqs: */ |
7928 | sched_free_group(container_of(rhp, struct task_group, rcu)); | 6342 | sched_free_group(container_of(rhp, struct task_group, rcu)); |
7929 | } | 6343 | } |
7930 | 6344 | ||
7931 | void sched_destroy_group(struct task_group *tg) | 6345 | void sched_destroy_group(struct task_group *tg) |
7932 | { | 6346 | { |
7933 | /* wait for possible concurrent references to cfs_rqs complete */ | 6347 | /* Wait for possible concurrent references to cfs_rqs complete: */ |
7934 | call_rcu(&tg->rcu, sched_free_group_rcu); | 6348 | call_rcu(&tg->rcu, sched_free_group_rcu); |
7935 | } | 6349 | } |
7936 | 6350 | ||
@@ -7938,7 +6352,7 @@ void sched_offline_group(struct task_group *tg) | |||
7938 | { | 6352 | { |
7939 | unsigned long flags; | 6353 | unsigned long flags; |
7940 | 6354 | ||
7941 | /* end participation in shares distribution */ | 6355 | /* End participation in shares distribution: */ |
7942 | unregister_fair_sched_group(tg); | 6356 | unregister_fair_sched_group(tg); |
7943 | 6357 | ||
7944 | spin_lock_irqsave(&task_group_lock, flags); | 6358 | spin_lock_irqsave(&task_group_lock, flags); |
@@ -7983,20 +6397,21 @@ void sched_move_task(struct task_struct *tsk) | |||
7983 | struct rq *rq; | 6397 | struct rq *rq; |
7984 | 6398 | ||
7985 | rq = task_rq_lock(tsk, &rf); | 6399 | rq = task_rq_lock(tsk, &rf); |
6400 | update_rq_clock(rq); | ||
7986 | 6401 | ||
7987 | running = task_current(rq, tsk); | 6402 | running = task_current(rq, tsk); |
7988 | queued = task_on_rq_queued(tsk); | 6403 | queued = task_on_rq_queued(tsk); |
7989 | 6404 | ||
7990 | if (queued) | 6405 | if (queued) |
7991 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); | 6406 | dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); |
7992 | if (unlikely(running)) | 6407 | if (running) |
7993 | put_prev_task(rq, tsk); | 6408 | put_prev_task(rq, tsk); |
7994 | 6409 | ||
7995 | sched_change_group(tsk, TASK_MOVE_GROUP); | 6410 | sched_change_group(tsk, TASK_MOVE_GROUP); |
7996 | 6411 | ||
7997 | if (queued) | 6412 | if (queued) |
7998 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); | 6413 | enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); |
7999 | if (unlikely(running)) | 6414 | if (running) |
8000 | set_curr_task(rq, tsk); | 6415 | set_curr_task(rq, tsk); |
8001 | 6416 | ||
8002 | task_rq_unlock(rq, tsk, &rf); | 6417 | task_rq_unlock(rq, tsk, &rf); |
@@ -8366,11 +6781,14 @@ int sched_rr_handler(struct ctl_table *table, int write, | |||
8366 | 6781 | ||
8367 | mutex_lock(&mutex); | 6782 | mutex_lock(&mutex); |
8368 | ret = proc_dointvec(table, write, buffer, lenp, ppos); | 6783 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
8369 | /* make sure that internally we keep jiffies */ | 6784 | /* |
8370 | /* also, writing zero resets timeslice to default */ | 6785 | * Make sure that internally we keep jiffies. |
6786 | * Also, writing zero resets the timeslice to default: | ||
6787 | */ | ||
8371 | if (!ret && write) { | 6788 | if (!ret && write) { |
8372 | sched_rr_timeslice = sched_rr_timeslice <= 0 ? | 6789 | sched_rr_timeslice = |
8373 | RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); | 6790 | sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : |
6791 | msecs_to_jiffies(sysctl_sched_rr_timeslice); | ||
8374 | } | 6792 | } |
8375 | mutex_unlock(&mutex); | 6793 | mutex_unlock(&mutex); |
8376 | return ret; | 6794 | return ret; |
@@ -8431,6 +6849,7 @@ static void cpu_cgroup_fork(struct task_struct *task) | |||
8431 | 6849 | ||
8432 | rq = task_rq_lock(task, &rf); | 6850 | rq = task_rq_lock(task, &rf); |
8433 | 6851 | ||
6852 | update_rq_clock(rq); | ||
8434 | sched_change_group(task, TASK_SET_GROUP); | 6853 | sched_change_group(task, TASK_SET_GROUP); |
8435 | 6854 | ||
8436 | task_rq_unlock(rq, task, &rf); | 6855 | task_rq_unlock(rq, task, &rf); |
@@ -8550,9 +6969,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) | |||
8550 | cfs_b->quota = quota; | 6969 | cfs_b->quota = quota; |
8551 | 6970 | ||
8552 | __refill_cfs_bandwidth_runtime(cfs_b); | 6971 | __refill_cfs_bandwidth_runtime(cfs_b); |
8553 | /* restart the period timer (if active) to handle new period expiry */ | 6972 | |
6973 | /* Restart the period timer (if active) to handle new period expiry: */ | ||
8554 | if (runtime_enabled) | 6974 | if (runtime_enabled) |
8555 | start_cfs_bandwidth(cfs_b); | 6975 | start_cfs_bandwidth(cfs_b); |
6976 | |||
8556 | raw_spin_unlock_irq(&cfs_b->lock); | 6977 | raw_spin_unlock_irq(&cfs_b->lock); |
8557 | 6978 | ||
8558 | for_each_online_cpu(i) { | 6979 | for_each_online_cpu(i) { |
@@ -8690,8 +7111,8 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) | |||
8690 | parent_quota = parent_b->hierarchical_quota; | 7111 | parent_quota = parent_b->hierarchical_quota; |
8691 | 7112 | ||
8692 | /* | 7113 | /* |
8693 | * ensure max(child_quota) <= parent_quota, inherit when no | 7114 | * Ensure max(child_quota) <= parent_quota, inherit when no |
8694 | * limit is set | 7115 | * limit is set: |
8695 | */ | 7116 | */ |
8696 | if (quota == RUNTIME_INF) | 7117 | if (quota == RUNTIME_INF) |
8697 | quota = parent_quota; | 7118 | quota = parent_quota; |
@@ -8800,7 +7221,7 @@ static struct cftype cpu_files[] = { | |||
8800 | .write_u64 = cpu_rt_period_write_uint, | 7221 | .write_u64 = cpu_rt_period_write_uint, |
8801 | }, | 7222 | }, |
8802 | #endif | 7223 | #endif |
8803 | { } /* terminate */ | 7224 | { } /* Terminate */ |
8804 | }; | 7225 | }; |
8805 | 7226 | ||
8806 | struct cgroup_subsys cpu_cgrp_subsys = { | 7227 | struct cgroup_subsys cpu_cgrp_subsys = { |
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9add206b5608..f95ab29a45d0 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
@@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) | |||
297 | for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { | 297 | for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { |
298 | seq_printf(sf, "%s %lld\n", | 298 | seq_printf(sf, "%s %lld\n", |
299 | cpuacct_stat_desc[stat], | 299 | cpuacct_stat_desc[stat], |
300 | (long long)cputime64_to_clock_t(val[stat])); | 300 | (long long)nsec_to_clock_t(val[stat])); |
301 | } | 301 | } |
302 | 302 | ||
303 | return 0; | 303 | return 0; |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7700a9cba335..2ecec3a4f1ee 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/kernel_stat.h> | 4 | #include <linux/kernel_stat.h> |
5 | #include <linux/static_key.h> | 5 | #include <linux/static_key.h> |
6 | #include <linux/context_tracking.h> | 6 | #include <linux/context_tracking.h> |
7 | #include <linux/cputime.h> | ||
7 | #include "sched.h" | 8 | #include "sched.h" |
8 | #ifdef CONFIG_PARAVIRT | 9 | #ifdef CONFIG_PARAVIRT |
9 | #include <asm/paravirt.h> | 10 | #include <asm/paravirt.h> |
@@ -44,6 +45,7 @@ void disable_sched_clock_irqtime(void) | |||
44 | void irqtime_account_irq(struct task_struct *curr) | 45 | void irqtime_account_irq(struct task_struct *curr) |
45 | { | 46 | { |
46 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); | 47 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); |
48 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
47 | s64 delta; | 49 | s64 delta; |
48 | int cpu; | 50 | int cpu; |
49 | 51 | ||
@@ -61,49 +63,34 @@ void irqtime_account_irq(struct task_struct *curr) | |||
61 | * in that case, so as not to confuse the scheduler with a special task | 63 | * in that case, so as not to confuse the scheduler with a special task
62 | * that does not consume any time, but still wants to run. | 64 | * that does not consume any time, but still wants to run.
63 | */ | 65 | */ |
64 | if (hardirq_count()) | 66 | if (hardirq_count()) { |
65 | irqtime->hardirq_time += delta; | 67 | cpustat[CPUTIME_IRQ] += delta; |
66 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | 68 | irqtime->tick_delta += delta; |
67 | irqtime->softirq_time += delta; | 69 | } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) { |
70 | cpustat[CPUTIME_SOFTIRQ] += delta; | ||
71 | irqtime->tick_delta += delta; | ||
72 | } | ||
68 | 73 | ||
69 | u64_stats_update_end(&irqtime->sync); | 74 | u64_stats_update_end(&irqtime->sync); |
70 | } | 75 | } |
71 | EXPORT_SYMBOL_GPL(irqtime_account_irq); | 76 | EXPORT_SYMBOL_GPL(irqtime_account_irq); |
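Editor's note: on the new side of this hunk, every hard or soft IRQ delta is charged to cpustat immediately and also accumulated in irqtime->tick_delta, which irqtime_tick_accounted() (just below) later drains in portions capped by the caller's maxtime. A standalone model of that drain with invented numbers:

#include <stdio.h>
#include <stdint.h>

static uint64_t tick_delta;	/* ns of IRQ time not yet folded into tick accounting */

/* Mirror of irqtime_tick_accounted(): consume at most @maxtime of the backlog. */
static uint64_t irqtime_tick_accounted(uint64_t maxtime)
{
	uint64_t delta = tick_delta < maxtime ? tick_delta : maxtime;

	tick_delta -= delta;
	return delta;
}

int main(void)
{
	const uint64_t tick_ns = 4000000;	/* 4 ms tick, i.e. HZ=250 */

	tick_delta = 6000000;			/* 6 ms of IRQ work piled up */

	/* First tick is fully eaten by IRQ time, the next one only partially. */
	printf("tick 1: irq %llu ns\n",
	       (unsigned long long)irqtime_tick_accounted(tick_ns));
	printf("tick 2: irq %llu ns\n",
	       (unsigned long long)irqtime_tick_accounted(tick_ns));
	return 0;
}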
72 | 77 | ||
73 | static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime) | 78 | static u64 irqtime_tick_accounted(u64 maxtime) |
74 | { | 79 | { |
75 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 80 | struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); |
76 | cputime_t irq_cputime; | 81 | u64 delta; |
77 | |||
78 | irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx]; | ||
79 | irq_cputime = min(irq_cputime, maxtime); | ||
80 | cpustat[idx] += irq_cputime; | ||
81 | 82 | ||
82 | return irq_cputime; | 83 | delta = min(irqtime->tick_delta, maxtime); |
83 | } | 84 | irqtime->tick_delta -= delta; |
84 | 85 | ||
85 | static cputime_t irqtime_account_hi_update(cputime_t maxtime) | 86 | return delta; |
86 | { | ||
87 | return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time), | ||
88 | CPUTIME_IRQ, maxtime); | ||
89 | } | ||
90 | |||
91 | static cputime_t irqtime_account_si_update(cputime_t maxtime) | ||
92 | { | ||
93 | return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time), | ||
94 | CPUTIME_SOFTIRQ, maxtime); | ||
95 | } | 87 | } |
96 | 88 | ||
97 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 89 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
98 | 90 | ||
99 | #define sched_clock_irqtime (0) | 91 | #define sched_clock_irqtime (0) |
100 | 92 | ||
101 | static cputime_t irqtime_account_hi_update(cputime_t dummy) | 93 | static u64 irqtime_tick_accounted(u64 dummy) |
102 | { | ||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | static cputime_t irqtime_account_si_update(cputime_t dummy) | ||
107 | { | 94 | { |
108 | return 0; | 95 | return 0; |
109 | } | 96 | } |
@@ -129,7 +116,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
129 | * @p: the process that the cpu time gets accounted to | 116 | * @p: the process that the cpu time gets accounted to |
130 | * @cputime: the cpu time spent in user space since the last update | 117 | * @cputime: the cpu time spent in user space since the last update |
131 | */ | 118 | */ |
132 | void account_user_time(struct task_struct *p, cputime_t cputime) | 119 | void account_user_time(struct task_struct *p, u64 cputime) |
133 | { | 120 | { |
134 | int index; | 121 | int index; |
135 | 122 | ||
@@ -140,7 +127,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) | |||
140 | index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | 127 | index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; |
141 | 128 | ||
142 | /* Add user time to cpustat. */ | 129 | /* Add user time to cpustat. */ |
143 | task_group_account_field(p, index, (__force u64) cputime); | 130 | task_group_account_field(p, index, cputime); |
144 | 131 | ||
145 | /* Account for user time used */ | 132 | /* Account for user time used */ |
146 | acct_account_cputime(p); | 133 | acct_account_cputime(p); |
@@ -151,7 +138,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) | |||
151 | * @p: the process that the cpu time gets accounted to | 138 | * @p: the process that the cpu time gets accounted to |
152 | * @cputime: the cpu time spent in virtual machine since the last update | 139 | * @cputime: the cpu time spent in virtual machine since the last update |
153 | */ | 140 | */ |
154 | static void account_guest_time(struct task_struct *p, cputime_t cputime) | 141 | void account_guest_time(struct task_struct *p, u64 cputime) |
155 | { | 142 | { |
156 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 143 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
157 | 144 | ||
@@ -162,11 +149,11 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) | |||
162 | 149 | ||
163 | /* Add guest time to cpustat. */ | 150 | /* Add guest time to cpustat. */ |
164 | if (task_nice(p) > 0) { | 151 | if (task_nice(p) > 0) { |
165 | cpustat[CPUTIME_NICE] += (__force u64) cputime; | 152 | cpustat[CPUTIME_NICE] += cputime; |
166 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; | 153 | cpustat[CPUTIME_GUEST_NICE] += cputime; |
167 | } else { | 154 | } else { |
168 | cpustat[CPUTIME_USER] += (__force u64) cputime; | 155 | cpustat[CPUTIME_USER] += cputime; |
169 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; | 156 | cpustat[CPUTIME_GUEST] += cputime; |
170 | } | 157 | } |
171 | } | 158 | } |
172 | 159 | ||
@@ -176,15 +163,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) | |||
176 | * @cputime: the cpu time spent in kernel space since the last update | 163 | * @cputime: the cpu time spent in kernel space since the last update |
177 | * @index: pointer to cpustat field that has to be updated | 164 | * @index: pointer to cpustat field that has to be updated |
178 | */ | 165 | */ |
179 | static inline | 166 | void account_system_index_time(struct task_struct *p, |
180 | void __account_system_time(struct task_struct *p, cputime_t cputime, int index) | 167 | u64 cputime, enum cpu_usage_stat index) |
181 | { | 168 | { |
182 | /* Add system time to process. */ | 169 | /* Add system time to process. */ |
183 | p->stime += cputime; | 170 | p->stime += cputime; |
184 | account_group_system_time(p, cputime); | 171 | account_group_system_time(p, cputime); |
185 | 172 | ||
186 | /* Add system time to cpustat. */ | 173 | /* Add system time to cpustat. */ |
187 | task_group_account_field(p, index, (__force u64) cputime); | 174 | task_group_account_field(p, index, cputime); |
188 | 175 | ||
189 | /* Account for system time used */ | 176 | /* Account for system time used */ |
190 | acct_account_cputime(p); | 177 | acct_account_cputime(p); |
@@ -196,8 +183,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, int index) | |||
196 | * @hardirq_offset: the offset to subtract from hardirq_count() | 183 | * @hardirq_offset: the offset to subtract from hardirq_count() |
197 | * @cputime: the cpu time spent in kernel space since the last update | 184 | * @cputime: the cpu time spent in kernel space since the last update |
198 | */ | 185 | */ |
199 | void account_system_time(struct task_struct *p, int hardirq_offset, | 186 | void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) |
200 | cputime_t cputime) | ||
201 | { | 187 | { |
202 | int index; | 188 | int index; |
203 | 189 | ||
@@ -213,33 +199,33 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
213 | else | 199 | else |
214 | index = CPUTIME_SYSTEM; | 200 | index = CPUTIME_SYSTEM; |
215 | 201 | ||
216 | __account_system_time(p, cputime, index); | 202 | account_system_index_time(p, cputime, index); |
217 | } | 203 | } |
218 | 204 | ||
219 | /* | 205 | /* |
220 | * Account for involuntary wait time. | 206 | * Account for involuntary wait time. |
221 | * @cputime: the cpu time spent in involuntary wait | 207 | * @cputime: the cpu time spent in involuntary wait |
222 | */ | 208 | */ |
223 | void account_steal_time(cputime_t cputime) | 209 | void account_steal_time(u64 cputime) |
224 | { | 210 | { |
225 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 211 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
226 | 212 | ||
227 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; | 213 | cpustat[CPUTIME_STEAL] += cputime; |
228 | } | 214 | } |
229 | 215 | ||
230 | /* | 216 | /* |
231 | * Account for idle time. | 217 | * Account for idle time. |
232 | * @cputime: the cpu time spent in idle wait | 218 | * @cputime: the cpu time spent in idle wait |
233 | */ | 219 | */ |
234 | void account_idle_time(cputime_t cputime) | 220 | void account_idle_time(u64 cputime) |
235 | { | 221 | { |
236 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 222 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
237 | struct rq *rq = this_rq(); | 223 | struct rq *rq = this_rq(); |
238 | 224 | ||
239 | if (atomic_read(&rq->nr_iowait) > 0) | 225 | if (atomic_read(&rq->nr_iowait) > 0) |
240 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; | 226 | cpustat[CPUTIME_IOWAIT] += cputime; |
241 | else | 227 | else |
242 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | 228 | cpustat[CPUTIME_IDLE] += cputime; |
243 | } | 229 | } |
244 | 230 | ||
245 | /* | 231 | /* |
@@ -247,21 +233,19 @@ void account_idle_time(cputime_t cputime) | |||
247 | * ticks are not redelivered later. Due to that, this function may on | 233 | * ticks are not redelivered later. Due to that, this function may on |
248 | * occasion account more time than the calling functions think elapsed. | 234 | * occasion account more time than the calling functions think elapsed. |
249 | */ | 235 | */ |
250 | static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) | 236 | static __always_inline u64 steal_account_process_time(u64 maxtime) |
251 | { | 237 | { |
252 | #ifdef CONFIG_PARAVIRT | 238 | #ifdef CONFIG_PARAVIRT |
253 | if (static_key_false(¶virt_steal_enabled)) { | 239 | if (static_key_false(¶virt_steal_enabled)) { |
254 | cputime_t steal_cputime; | ||
255 | u64 steal; | 240 | u64 steal; |
256 | 241 | ||
257 | steal = paravirt_steal_clock(smp_processor_id()); | 242 | steal = paravirt_steal_clock(smp_processor_id()); |
258 | steal -= this_rq()->prev_steal_time; | 243 | steal -= this_rq()->prev_steal_time; |
244 | steal = min(steal, maxtime); | ||
245 | account_steal_time(steal); | ||
246 | this_rq()->prev_steal_time += steal; | ||
259 | 247 | ||
260 | steal_cputime = min(nsecs_to_cputime(steal), maxtime); | 248 | return steal; |
261 | account_steal_time(steal_cputime); | ||
262 | this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime); | ||
263 | |||
264 | return steal_cputime; | ||
265 | } | 249 | } |
266 | #endif | 250 | #endif |
267 | return 0; | 251 | return 0; |
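The rewrite of steal_account_process_time() above drops the nsecs_to_cputime()/cputime_to_nsecs() round trip: the paravirt steal delta is already in nanoseconds, so it only needs to be clamped to the caller's remaining budget before being accounted and folded into prev_steal_time. A user-space sketch of that clamping, where prev_steal_time_ns is a hypothetical stand-in for this_rq()->prev_steal_time:

#include <stdint.h>

/* Hypothetical stand-in for this_rq()->prev_steal_time. */
static uint64_t prev_steal_time_ns;

/*
 * Clamp the raw steal delta to @maxtime_ns so that no more time is
 * accounted than actually elapsed; the unaccounted remainder stays in
 * the hypervisor's counter and is picked up by a later call.
 */
static uint64_t steal_account_sketch(uint64_t steal_clock_ns, uint64_t maxtime_ns)
{
	uint64_t steal = steal_clock_ns - prev_steal_time_ns;

	if (steal > maxtime_ns)
		steal = maxtime_ns;

	prev_steal_time_ns += steal;
	return steal;
}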
@@ -270,9 +254,9 @@ static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) | |||
270 | /* | 254 | /* |
271 | * Account how much elapsed time was spent in steal, irq, or softirq time. | 255 | * Account how much elapsed time was spent in steal, irq, or softirq time. |
272 | */ | 256 | */ |
273 | static inline cputime_t account_other_time(cputime_t max) | 257 | static inline u64 account_other_time(u64 max) |
274 | { | 258 | { |
275 | cputime_t accounted; | 259 | u64 accounted; |
276 | 260 | ||
277 | /* Shall be converted to a lockdep-enabled lightweight check */ | 261 | /* Shall be converted to a lockdep-enabled lightweight check */ |
278 | WARN_ON_ONCE(!irqs_disabled()); | 262 | WARN_ON_ONCE(!irqs_disabled()); |
@@ -280,10 +264,7 @@ static inline cputime_t account_other_time(cputime_t max) | |||
280 | accounted = steal_account_process_time(max); | 264 | accounted = steal_account_process_time(max); |
281 | 265 | ||
282 | if (accounted < max) | 266 | if (accounted < max) |
283 | accounted += irqtime_account_hi_update(max - accounted); | 267 | accounted += irqtime_tick_accounted(max - accounted); |
284 | |||
285 | if (accounted < max) | ||
286 | accounted += irqtime_account_si_update(max - accounted); | ||
287 | 268 | ||
288 | return accounted; | 269 | return accounted; |
289 | } | 270 | } |
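account_other_time() hands each source only the part of @max that is still unconsumed, so steal plus irq/softirq time can never exceed the elapsed time the caller measured (this change also folds the separate hardirq/softirq updates into one irqtime_tick_accounted() call). A compact sketch of that budget pattern:

#include <stdint.h>

/*
 * Budget pattern from account_other_time(): each source may only take
 * what remains of @max after the previous one, so the total accounted
 * time is bounded by the elapsed time supplied by the caller.
 */
static uint64_t account_other_sketch(uint64_t max, uint64_t steal, uint64_t irq)
{
	uint64_t accounted = steal < max ? steal : max;

	if (accounted < max) {
		uint64_t room = max - accounted;

		accounted += irq < room ? irq : room;
	}

	return accounted;
}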
@@ -315,7 +296,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) | |||
315 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | 296 | void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) |
316 | { | 297 | { |
317 | struct signal_struct *sig = tsk->signal; | 298 | struct signal_struct *sig = tsk->signal; |
318 | cputime_t utime, stime; | 299 | u64 utime, stime; |
319 | struct task_struct *t; | 300 | struct task_struct *t; |
320 | unsigned int seq, nextseq; | 301 | unsigned int seq, nextseq; |
321 | unsigned long flags; | 302 | unsigned long flags; |
@@ -379,8 +360,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
379 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 360 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
380 | struct rq *rq, int ticks) | 361 | struct rq *rq, int ticks) |
381 | { | 362 | { |
382 | u64 cputime = (__force u64) cputime_one_jiffy * ticks; | 363 | u64 other, cputime = TICK_NSEC * ticks; |
383 | cputime_t other; | ||
384 | 364 | ||
385 | /* | 365 | /* |
386 | * When returning from idle, many ticks can get accounted at | 366 | * When returning from idle, many ticks can get accounted at |
@@ -392,6 +372,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
392 | other = account_other_time(ULONG_MAX); | 372 | other = account_other_time(ULONG_MAX); |
393 | if (other >= cputime) | 373 | if (other >= cputime) |
394 | return; | 374 | return; |
375 | |||
395 | cputime -= other; | 376 | cputime -= other; |
396 | 377 | ||
397 | if (this_cpu_ksoftirqd() == p) { | 378 | if (this_cpu_ksoftirqd() == p) { |
@@ -400,7 +381,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
400 | * So, we have to handle it separately here. | 381 | * So, we have to handle it separately here. |
401 | * Also, p->stime needs to be updated for ksoftirqd. | 382 | * Also, p->stime needs to be updated for ksoftirqd. |
402 | */ | 383 | */ |
403 | __account_system_time(p, cputime, CPUTIME_SOFTIRQ); | 384 | account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); |
404 | } else if (user_tick) { | 385 | } else if (user_tick) { |
405 | account_user_time(p, cputime); | 386 | account_user_time(p, cputime); |
406 | } else if (p == rq->idle) { | 387 | } else if (p == rq->idle) { |
@@ -408,7 +389,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
408 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | 389 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ |
409 | account_guest_time(p, cputime); | 390 | account_guest_time(p, cputime); |
410 | } else { | 391 | } else { |
411 | __account_system_time(p, cputime, CPUTIME_SYSTEM); | 392 | account_system_index_time(p, cputime, CPUTIME_SYSTEM); |
412 | } | 393 | } |
413 | } | 394 | } |
414 | 395 | ||
@@ -437,9 +418,7 @@ void vtime_common_task_switch(struct task_struct *prev) | |||
437 | else | 418 | else |
438 | vtime_account_system(prev); | 419 | vtime_account_system(prev); |
439 | 420 | ||
440 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE | 421 | vtime_flush(prev); |
441 | vtime_account_user(prev); | ||
442 | #endif | ||
443 | arch_vtime_task_switch(prev); | 422 | arch_vtime_task_switch(prev); |
444 | } | 423 | } |
445 | #endif | 424 | #endif |
@@ -467,14 +446,14 @@ void vtime_account_irq_enter(struct task_struct *tsk) | |||
467 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); | 446 | EXPORT_SYMBOL_GPL(vtime_account_irq_enter); |
468 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | 447 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ |
469 | 448 | ||
470 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 449 | void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) |
471 | { | 450 | { |
472 | *ut = p->utime; | 451 | *ut = p->utime; |
473 | *st = p->stime; | 452 | *st = p->stime; |
474 | } | 453 | } |
475 | EXPORT_SYMBOL_GPL(task_cputime_adjusted); | 454 | EXPORT_SYMBOL_GPL(task_cputime_adjusted); |
476 | 455 | ||
477 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 456 | void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) |
478 | { | 457 | { |
479 | struct task_cputime cputime; | 458 | struct task_cputime cputime; |
480 | 459 | ||
@@ -491,7 +470,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime | |||
491 | */ | 470 | */ |
492 | void account_process_tick(struct task_struct *p, int user_tick) | 471 | void account_process_tick(struct task_struct *p, int user_tick) |
493 | { | 472 | { |
494 | cputime_t cputime, steal; | 473 | u64 cputime, steal; |
495 | struct rq *rq = this_rq(); | 474 | struct rq *rq = this_rq(); |
496 | 475 | ||
497 | if (vtime_accounting_cpu_enabled()) | 476 | if (vtime_accounting_cpu_enabled()) |
@@ -502,7 +481,7 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
502 | return; | 481 | return; |
503 | } | 482 | } |
504 | 483 | ||
505 | cputime = cputime_one_jiffy; | 484 | cputime = TICK_NSEC; |
506 | steal = steal_account_process_time(ULONG_MAX); | 485 | steal = steal_account_process_time(ULONG_MAX); |
507 | 486 | ||
508 | if (steal >= cputime) | 487 | if (steal >= cputime) |
@@ -524,14 +503,14 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
524 | */ | 503 | */ |
525 | void account_idle_ticks(unsigned long ticks) | 504 | void account_idle_ticks(unsigned long ticks) |
526 | { | 505 | { |
527 | cputime_t cputime, steal; | 506 | u64 cputime, steal; |
528 | 507 | ||
529 | if (sched_clock_irqtime) { | 508 | if (sched_clock_irqtime) { |
530 | irqtime_account_idle_ticks(ticks); | 509 | irqtime_account_idle_ticks(ticks); |
531 | return; | 510 | return; |
532 | } | 511 | } |
533 | 512 | ||
534 | cputime = jiffies_to_cputime(ticks); | 513 | cputime = ticks * TICK_NSEC; |
535 | steal = steal_account_process_time(ULONG_MAX); | 514 | steal = steal_account_process_time(ULONG_MAX); |
536 | 515 | ||
537 | if (steal >= cputime) | 516 | if (steal >= cputime) |
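With cputime_t gone, one tick is simply TICK_NSEC nanoseconds, so cputime_one_jiffy and jiffies_to_cputime() in the two hunks above reduce to plain multiplications. A quick sketch of the arithmetic, assuming the usual TICK_NSEC of roughly NSEC_PER_SEC / HZ and an HZ=250 build:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL
#define HZ		250			/* example configuration */
#define TICK_NSEC	(NSEC_PER_SEC / HZ)	/* 4000000 ns per tick at HZ=250 */

int main(void)
{
	unsigned long ticks = 3;

	/* ticks * TICK_NSEC replaces jiffies_to_cputime(ticks) */
	printf("%lu ticks = %llu ns\n", ticks, (unsigned long long)(ticks * TICK_NSEC));
	return 0;
}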
@@ -545,7 +524,7 @@ void account_idle_ticks(unsigned long ticks) | |||
545 | * Perform (stime * rtime) / total, but avoid multiplication overflow by | 524 | * Perform (stime * rtime) / total, but avoid multiplication overflow by |
546 | * losing precision when the numbers are big. | 525 | * losing precision when the numbers are big. |
547 | */ | 526 | */ |
548 | static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) | 527 | static u64 scale_stime(u64 stime, u64 rtime, u64 total) |
549 | { | 528 | { |
550 | u64 scaled; | 529 | u64 scaled; |
551 | 530 | ||
@@ -582,7 +561,7 @@ drop_precision: | |||
582 | * followed by a 64/32->64 divide. | 561 | * followed by a 64/32->64 divide. |
583 | */ | 562 | */ |
584 | scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); | 563 | scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); |
585 | return (__force cputime_t) scaled; | 564 | return scaled; |
586 | } | 565 | } |
587 | 566 | ||
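scale_stime() computes stime * rtime / total without a 128-bit divide by dropping low-order bits until the operands fit a 32x32->64 multiply and a 64/32 divide. Where the toolchain offers unsigned __int128 (an assumption, not something the kernel relies on), the exact quantity it approximates can be written directly; a reference sketch for cross-checking the scaled result:

#include <stdint.h>

/*
 * Exact (stime * rtime) / total as a reference for scale_stime().
 * The kernel caller never passes total == 0; the guard here only keeps
 * the sketch well-defined.
 */
static uint64_t scale_stime_ref(uint64_t stime, uint64_t rtime, uint64_t total)
{
	if (!total)
		return rtime;

	return (uint64_t)(((unsigned __int128)stime * rtime) / total);
}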
588 | /* | 567 | /* |
@@ -607,14 +586,14 @@ drop_precision: | |||
607 | */ | 586 | */ |
608 | static void cputime_adjust(struct task_cputime *curr, | 587 | static void cputime_adjust(struct task_cputime *curr, |
609 | struct prev_cputime *prev, | 588 | struct prev_cputime *prev, |
610 | cputime_t *ut, cputime_t *st) | 589 | u64 *ut, u64 *st) |
611 | { | 590 | { |
612 | cputime_t rtime, stime, utime; | 591 | u64 rtime, stime, utime; |
613 | unsigned long flags; | 592 | unsigned long flags; |
614 | 593 | ||
615 | /* Serialize concurrent callers such that we can honour our guarantees */ | 594 | /* Serialize concurrent callers such that we can honour our guarantees */ |
616 | raw_spin_lock_irqsave(&prev->lock, flags); | 595 | raw_spin_lock_irqsave(&prev->lock, flags); |
617 | rtime = nsecs_to_cputime(curr->sum_exec_runtime); | 596 | rtime = curr->sum_exec_runtime; |
618 | 597 | ||
619 | /* | 598 | /* |
620 | * This is possible under two circumstances: | 599 | * This is possible under two circumstances: |
@@ -645,8 +624,7 @@ static void cputime_adjust(struct task_cputime *curr, | |||
645 | goto update; | 624 | goto update; |
646 | } | 625 | } |
647 | 626 | ||
648 | stime = scale_stime((__force u64)stime, (__force u64)rtime, | 627 | stime = scale_stime(stime, rtime, stime + utime); |
649 | (__force u64)(stime + utime)); | ||
650 | 628 | ||
651 | update: | 629 | update: |
652 | /* | 630 | /* |
@@ -679,7 +657,7 @@ out: | |||
679 | raw_spin_unlock_irqrestore(&prev->lock, flags); | 657 | raw_spin_unlock_irqrestore(&prev->lock, flags); |
680 | } | 658 | } |
681 | 659 | ||
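cputime_adjust() redistributes the precisely measured rtime (sum_exec_runtime, already in nanoseconds after this change) between utime and stime in proportion to the tick-sampled values, so the reported pair always sums to rtime. A simplified sketch of that split; it deliberately leaves out the clamping against previously reported values that the full function does to keep the results monotonic, and the all-user fallback for the no-samples case is just one reasonable choice:

#include <stdint.h>

/* Proportional split of rtime into (*ut, *st); a simplified sketch. */
static void split_rtime(uint64_t rtime, uint64_t utime_sample, uint64_t stime_sample,
			uint64_t *ut, uint64_t *st)
{
	uint64_t total = utime_sample + stime_sample;
	uint64_t stime;

	if (!total) {
		/* No tick samples yet; attribute everything to user time. */
		*ut = rtime;
		*st = 0;
		return;
	}

	stime = (uint64_t)(((unsigned __int128)stime_sample * rtime) / total);
	*st = stime;
	*ut = rtime - stime;	/* guarantees *ut + *st == rtime */
}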
682 | void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 660 | void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) |
683 | { | 661 | { |
684 | struct task_cputime cputime = { | 662 | struct task_cputime cputime = { |
685 | .sum_exec_runtime = p->se.sum_exec_runtime, | 663 | .sum_exec_runtime = p->se.sum_exec_runtime, |
@@ -690,7 +668,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | |||
690 | } | 668 | } |
691 | EXPORT_SYMBOL_GPL(task_cputime_adjusted); | 669 | EXPORT_SYMBOL_GPL(task_cputime_adjusted); |
692 | 670 | ||
693 | void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) | 671 | void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) |
694 | { | 672 | { |
695 | struct task_cputime cputime; | 673 | struct task_cputime cputime; |
696 | 674 | ||
@@ -700,20 +678,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime | |||
700 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 678 | #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ |
701 | 679 | ||
702 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | 680 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN |
703 | static cputime_t vtime_delta(struct task_struct *tsk) | 681 | static u64 vtime_delta(struct task_struct *tsk) |
704 | { | 682 | { |
705 | unsigned long now = READ_ONCE(jiffies); | 683 | unsigned long now = READ_ONCE(jiffies); |
706 | 684 | ||
707 | if (time_before(now, (unsigned long)tsk->vtime_snap)) | 685 | if (time_before(now, (unsigned long)tsk->vtime_snap)) |
708 | return 0; | 686 | return 0; |
709 | 687 | ||
710 | return jiffies_to_cputime(now - tsk->vtime_snap); | 688 | return jiffies_to_nsecs(now - tsk->vtime_snap); |
711 | } | 689 | } |
712 | 690 | ||
713 | static cputime_t get_vtime_delta(struct task_struct *tsk) | 691 | static u64 get_vtime_delta(struct task_struct *tsk) |
714 | { | 692 | { |
715 | unsigned long now = READ_ONCE(jiffies); | 693 | unsigned long now = READ_ONCE(jiffies); |
716 | cputime_t delta, other; | 694 | u64 delta, other; |
717 | 695 | ||
718 | /* | 696 | /* |
719 | * Unlike tick based timing, vtime based timing never has lost | 697 | * Unlike tick based timing, vtime based timing never has lost |
@@ -722,7 +700,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) | |||
722 | * elapsed time. Limit account_other_time to prevent rounding | 700 | * elapsed time. Limit account_other_time to prevent rounding |
723 | * errors from causing elapsed vtime to go negative. | 701 | * errors from causing elapsed vtime to go negative. |
724 | */ | 702 | */ |
725 | delta = jiffies_to_cputime(now - tsk->vtime_snap); | 703 | delta = jiffies_to_nsecs(now - tsk->vtime_snap); |
726 | other = account_other_time(delta); | 704 | other = account_other_time(delta); |
727 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); | 705 | WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); |
728 | tsk->vtime_snap = now; | 706 | tsk->vtime_snap = now; |
@@ -732,9 +710,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk) | |||
732 | 710 | ||
733 | static void __vtime_account_system(struct task_struct *tsk) | 711 | static void __vtime_account_system(struct task_struct *tsk) |
734 | { | 712 | { |
735 | cputime_t delta_cpu = get_vtime_delta(tsk); | 713 | account_system_time(tsk, irq_count(), get_vtime_delta(tsk)); |
736 | |||
737 | account_system_time(tsk, irq_count(), delta_cpu); | ||
738 | } | 714 | } |
739 | 715 | ||
740 | void vtime_account_system(struct task_struct *tsk) | 716 | void vtime_account_system(struct task_struct *tsk) |
@@ -749,14 +725,10 @@ void vtime_account_system(struct task_struct *tsk) | |||
749 | 725 | ||
750 | void vtime_account_user(struct task_struct *tsk) | 726 | void vtime_account_user(struct task_struct *tsk) |
751 | { | 727 | { |
752 | cputime_t delta_cpu; | ||
753 | |||
754 | write_seqcount_begin(&tsk->vtime_seqcount); | 728 | write_seqcount_begin(&tsk->vtime_seqcount); |
755 | tsk->vtime_snap_whence = VTIME_SYS; | 729 | tsk->vtime_snap_whence = VTIME_SYS; |
756 | if (vtime_delta(tsk)) { | 730 | if (vtime_delta(tsk)) |
757 | delta_cpu = get_vtime_delta(tsk); | 731 | account_user_time(tsk, get_vtime_delta(tsk)); |
758 | account_user_time(tsk, delta_cpu); | ||
759 | } | ||
760 | write_seqcount_end(&tsk->vtime_seqcount); | 732 | write_seqcount_end(&tsk->vtime_seqcount); |
761 | } | 733 | } |
762 | 734 | ||
@@ -797,9 +769,7 @@ EXPORT_SYMBOL_GPL(vtime_guest_exit); | |||
797 | 769 | ||
798 | void vtime_account_idle(struct task_struct *tsk) | 770 | void vtime_account_idle(struct task_struct *tsk) |
799 | { | 771 | { |
800 | cputime_t delta_cpu = get_vtime_delta(tsk); | 772 | account_idle_time(get_vtime_delta(tsk)); |
801 | |||
802 | account_idle_time(delta_cpu); | ||
803 | } | 773 | } |
804 | 774 | ||
805 | void arch_vtime_task_switch(struct task_struct *prev) | 775 | void arch_vtime_task_switch(struct task_struct *prev) |
@@ -826,10 +796,10 @@ void vtime_init_idle(struct task_struct *t, int cpu) | |||
826 | local_irq_restore(flags); | 796 | local_irq_restore(flags); |
827 | } | 797 | } |
828 | 798 | ||
829 | cputime_t task_gtime(struct task_struct *t) | 799 | u64 task_gtime(struct task_struct *t) |
830 | { | 800 | { |
831 | unsigned int seq; | 801 | unsigned int seq; |
832 | cputime_t gtime; | 802 | u64 gtime; |
833 | 803 | ||
834 | if (!vtime_accounting_enabled()) | 804 | if (!vtime_accounting_enabled()) |
835 | return t->gtime; | 805 | return t->gtime; |
@@ -851,9 +821,9 @@ cputime_t task_gtime(struct task_struct *t) | |||
851 | * add up the pending nohz execution time since the last | 821 | * add up the pending nohz execution time since the last |
852 | * cputime snapshot. | 822 | * cputime snapshot. |
853 | */ | 823 | */ |
854 | void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) | 824 | void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) |
855 | { | 825 | { |
856 | cputime_t delta; | 826 | u64 delta; |
857 | unsigned int seq; | 827 | unsigned int seq; |
858 | 828 | ||
859 | if (!vtime_accounting_enabled()) { | 829 | if (!vtime_accounting_enabled()) { |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 70ef2b1901e4..27737f34757d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -663,9 +663,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
663 | * Nothing relies on rq->lock after this, so it's safe to drop | 663 | * Nothing relies on rq->lock after this, so it's safe to drop |
664 | * rq->lock. | 664 | * rq->lock. |
665 | */ | 665 | */ |
666 | lockdep_unpin_lock(&rq->lock, rf.cookie); | 666 | rq_unpin_lock(rq, &rf); |
667 | push_dl_task(rq); | 667 | push_dl_task(rq); |
668 | lockdep_repin_lock(&rq->lock, rf.cookie); | 668 | rq_repin_lock(rq, &rf); |
669 | } | 669 | } |
670 | #endif | 670 | #endif |
671 | 671 | ||
@@ -1118,7 +1118,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, | |||
1118 | } | 1118 | } |
1119 | 1119 | ||
1120 | struct task_struct * | 1120 | struct task_struct * |
1121 | pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 1121 | pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
1122 | { | 1122 | { |
1123 | struct sched_dl_entity *dl_se; | 1123 | struct sched_dl_entity *dl_se; |
1124 | struct task_struct *p; | 1124 | struct task_struct *p; |
@@ -1133,9 +1133,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo | |||
1133 | * disabled avoiding further scheduler activity on it and we're | 1133 | * disabled avoiding further scheduler activity on it and we're |
1134 | * being very careful to re-start the picking loop. | 1134 | * being very careful to re-start the picking loop. |
1135 | */ | 1135 | */ |
1136 | lockdep_unpin_lock(&rq->lock, cookie); | 1136 | rq_unpin_lock(rq, rf); |
1137 | pull_dl_task(rq); | 1137 | pull_dl_task(rq); |
1138 | lockdep_repin_lock(&rq->lock, cookie); | 1138 | rq_repin_lock(rq, rf); |
1139 | /* | 1139 | /* |
1140 | * pull_dl_task() can drop (and re-acquire) rq->lock; this | 1140 | * pull_dl_task() can drop (and re-acquire) rq->lock; this |
1141 | * means a stop task can slip in, in which case we need to | 1141 | * means a stop task can slip in, in which case we need to |
@@ -1729,12 +1729,11 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
1729 | #ifdef CONFIG_SMP | 1729 | #ifdef CONFIG_SMP |
1730 | if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) | 1730 | if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) |
1731 | queue_push_tasks(rq); | 1731 | queue_push_tasks(rq); |
1732 | #else | 1732 | #endif |
1733 | if (dl_task(rq->curr)) | 1733 | if (dl_task(rq->curr)) |
1734 | check_preempt_curr_dl(rq, p, 0); | 1734 | check_preempt_curr_dl(rq, p, 0); |
1735 | else | 1735 | else |
1736 | resched_curr(rq); | 1736 | resched_curr(rq); |
1737 | #endif | ||
1738 | } | 1737 | } |
1739 | } | 1738 | } |
1740 | 1739 | ||
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index fa178b62ea79..109adc0e9cb9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -953,6 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
953 | #endif | 953 | #endif |
954 | P(policy); | 954 | P(policy); |
955 | P(prio); | 955 | P(prio); |
956 | if (p->policy == SCHED_DEADLINE) { | ||
957 | P(dl.runtime); | ||
958 | P(dl.deadline); | ||
959 | } | ||
956 | #undef PN_SCHEDSTAT | 960 | #undef PN_SCHEDSTAT |
957 | #undef PN | 961 | #undef PN |
958 | #undef __PN | 962 | #undef __PN |
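The debug.c hunk makes the current dl.runtime and dl.deadline of a SCHED_DEADLINE task visible through /proc/<pid>/sched (the P() macro prints the field name followed by its value). A small user-space reader for those two entries; matching the field names by substring is an assumption about the exact output layout, so treat it as a sketch:

#include <stdio.h>
#include <string.h>

/* Print the dl.runtime and dl.deadline lines from /proc/<pid>/sched, if any. */
static int print_dl_params(int pid)
{
	char path[64], line[256];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%d/sched", pid);
	f = fopen(path, "r");
	if (!f)
		return -1;

	while (fgets(line, sizeof(line), f)) {
		if (strstr(line, "dl.runtime") || strstr(line, "dl.deadline"))
			fputs(line, stdout);
	}

	fclose(f);
	return 0;
}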
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6559d197e08a..274c747a01ce 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -2657,6 +2657,18 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) | |||
2657 | if (tg_weight) | 2657 | if (tg_weight) |
2658 | shares /= tg_weight; | 2658 | shares /= tg_weight; |
2659 | 2659 | ||
2660 | /* | ||
2661 | * MIN_SHARES has to be unscaled here to support per-CPU partitioning | ||
2662 | * of a group with small tg->shares value. It is a floor value which is | ||
2663 | * assigned as a minimum load.weight to the sched_entity representing | ||
2664 | * the group on a CPU. | ||
2665 | * | ||
2666 | * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024 | ||
2667 | * on an 8-core system with 8 tasks each runnable on one CPU, shares has | ||
2668 | * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In | ||
2669 | * case no task is runnable on a CPU, MIN_SHARES=2 should be returned | ||
2670 | * instead of 0. | ||
2671 | */ | ||
2660 | if (shares < MIN_SHARES) | 2672 | if (shares < MIN_SHARES) |
2661 | shares = MIN_SHARES; | 2673 | shares = MIN_SHARES; |
2662 | if (shares > tg->shares) | 2674 | if (shares > tg->shares) |
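The arithmetic in the new comment is the whole point of keeping MIN_SHARES unscaled here: with tg->shares = scale_load(15) = 15*1024 spread across 8 CPUs, a CPU carrying 1/8 of the group's weight must end up with 15*1024/8 = 1920, which a scaled floor of 2*1024 would incorrectly round up. A worked sketch of the clamp under those assumptions (a simplification, not the kernel function):

#include <assert.h>

#define MIN_SHARES	2UL		/* unscaled floor */
#define SCALE		1024UL		/* scale_load() factor on 64-bit */

/* shares = tg_shares * cfs_rq_load / tg_weight, clamped to [MIN_SHARES, tg_shares]. */
static unsigned long calc_shares_sketch(unsigned long tg_shares,
					unsigned long cfs_rq_load,
					unsigned long tg_weight)
{
	unsigned long shares = tg_shares;

	if (tg_weight)
		shares = tg_shares * cfs_rq_load / tg_weight;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;	/* 2, not scale_load(2) = 2048 */
	if (shares > tg_shares)
		shares = tg_shares;

	return shares;
}

int main(void)
{
	/* 8 CPUs, each holding 1/8 of the group weight: 15*1024/8 = 1920. */
	assert(calc_shares_sketch(15 * SCALE, 1, 8) == 1920);
	/* A CPU with no runnable task still gets the unscaled floor of 2. */
	assert(calc_shares_sketch(15 * SCALE, 0, 8) == MIN_SHARES);
	return 0;
}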
@@ -2689,16 +2701,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | |||
2689 | 2701 | ||
2690 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | 2702 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); |
2691 | 2703 | ||
2692 | static void update_cfs_shares(struct cfs_rq *cfs_rq) | 2704 | static void update_cfs_shares(struct sched_entity *se) |
2693 | { | 2705 | { |
2706 | struct cfs_rq *cfs_rq = group_cfs_rq(se); | ||
2694 | struct task_group *tg; | 2707 | struct task_group *tg; |
2695 | struct sched_entity *se; | ||
2696 | long shares; | 2708 | long shares; |
2697 | 2709 | ||
2698 | tg = cfs_rq->tg; | 2710 | if (!cfs_rq) |
2699 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | 2711 | return; |
2700 | if (!se || throttled_hierarchy(cfs_rq)) | 2712 | |
2713 | if (throttled_hierarchy(cfs_rq)) | ||
2701 | return; | 2714 | return; |
2715 | |||
2716 | tg = cfs_rq->tg; | ||
2717 | |||
2702 | #ifndef CONFIG_SMP | 2718 | #ifndef CONFIG_SMP |
2703 | if (likely(se->load.weight == tg->shares)) | 2719 | if (likely(se->load.weight == tg->shares)) |
2704 | return; | 2720 | return; |
@@ -2707,8 +2723,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) | |||
2707 | 2723 | ||
2708 | reweight_entity(cfs_rq_of(se), se, shares); | 2724 | reweight_entity(cfs_rq_of(se), se, shares); |
2709 | } | 2725 | } |
2726 | |||
2710 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 2727 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
2711 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) | 2728 | static inline void update_cfs_shares(struct sched_entity *se) |
2712 | { | 2729 | { |
2713 | } | 2730 | } |
2714 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 2731 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
@@ -3424,7 +3441,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) | |||
3424 | return cfs_rq->avg.load_avg; | 3441 | return cfs_rq->avg.load_avg; |
3425 | } | 3442 | } |
3426 | 3443 | ||
3427 | static int idle_balance(struct rq *this_rq); | 3444 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf); |
3428 | 3445 | ||
3429 | #else /* CONFIG_SMP */ | 3446 | #else /* CONFIG_SMP */ |
3430 | 3447 | ||
@@ -3453,7 +3470,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | |||
3453 | static inline void | 3470 | static inline void |
3454 | detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} | 3471 | detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} |
3455 | 3472 | ||
3456 | static inline int idle_balance(struct rq *rq) | 3473 | static inline int idle_balance(struct rq *rq, struct rq_flags *rf) |
3457 | { | 3474 | { |
3458 | return 0; | 3475 | return 0; |
3459 | } | 3476 | } |
@@ -3582,10 +3599,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3582 | if (renorm && !curr) | 3599 | if (renorm && !curr) |
3583 | se->vruntime += cfs_rq->min_vruntime; | 3600 | se->vruntime += cfs_rq->min_vruntime; |
3584 | 3601 | ||
3602 | /* | ||
3603 | * When enqueuing a sched_entity, we must: | ||
3604 | * - Update loads to have both entity and cfs_rq synced with now. | ||
3605 | * - Add its load to cfs_rq->runnable_avg | ||
3606 | * - For group_entity, update its weight to reflect the new share of | ||
3607 | * its group cfs_rq | ||
3608 | * - Add its new weight to cfs_rq->load.weight | ||
3609 | */ | ||
3585 | update_load_avg(se, UPDATE_TG); | 3610 | update_load_avg(se, UPDATE_TG); |
3586 | enqueue_entity_load_avg(cfs_rq, se); | 3611 | enqueue_entity_load_avg(cfs_rq, se); |
3612 | update_cfs_shares(se); | ||
3587 | account_entity_enqueue(cfs_rq, se); | 3613 | account_entity_enqueue(cfs_rq, se); |
3588 | update_cfs_shares(cfs_rq); | ||
3589 | 3614 | ||
3590 | if (flags & ENQUEUE_WAKEUP) | 3615 | if (flags & ENQUEUE_WAKEUP) |
3591 | place_entity(cfs_rq, se, 0); | 3616 | place_entity(cfs_rq, se, 0); |
@@ -3657,6 +3682,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3657 | * Update run-time statistics of the 'current'. | 3682 | * Update run-time statistics of the 'current'. |
3658 | */ | 3683 | */ |
3659 | update_curr(cfs_rq); | 3684 | update_curr(cfs_rq); |
3685 | |||
3686 | /* | ||
3687 | * When dequeuing a sched_entity, we must: | ||
3688 | * - Update loads to have both entity and cfs_rq synced with now. | ||
3689 | * - Subtract its load from the cfs_rq->runnable_avg. | ||
3690 | * - Subtract its previous weight from cfs_rq->load.weight. | ||
3691 | * - For group entity, update its weight to reflect the new share | ||
3692 | * of its group cfs_rq. | ||
3693 | */ | ||
3660 | update_load_avg(se, UPDATE_TG); | 3694 | update_load_avg(se, UPDATE_TG); |
3661 | dequeue_entity_load_avg(cfs_rq, se); | 3695 | dequeue_entity_load_avg(cfs_rq, se); |
3662 | 3696 | ||
@@ -3681,7 +3715,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3681 | /* return excess runtime on last dequeue */ | 3715 | /* return excess runtime on last dequeue */ |
3682 | return_cfs_rq_runtime(cfs_rq); | 3716 | return_cfs_rq_runtime(cfs_rq); |
3683 | 3717 | ||
3684 | update_cfs_shares(cfs_rq); | 3718 | update_cfs_shares(se); |
3685 | 3719 | ||
3686 | /* | 3720 | /* |
3687 | * Now advance min_vruntime if @se was the entity holding it back, | 3721 | * Now advance min_vruntime if @se was the entity holding it back, |
@@ -3864,7 +3898,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
3864 | * Ensure that runnable average is periodically updated. | 3898 | * Ensure that runnable average is periodically updated. |
3865 | */ | 3899 | */ |
3866 | update_load_avg(curr, UPDATE_TG); | 3900 | update_load_avg(curr, UPDATE_TG); |
3867 | update_cfs_shares(cfs_rq); | 3901 | update_cfs_shares(curr); |
3868 | 3902 | ||
3869 | #ifdef CONFIG_SCHED_HRTICK | 3903 | #ifdef CONFIG_SCHED_HRTICK |
3870 | /* | 3904 | /* |
@@ -4761,7 +4795,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4761 | break; | 4795 | break; |
4762 | 4796 | ||
4763 | update_load_avg(se, UPDATE_TG); | 4797 | update_load_avg(se, UPDATE_TG); |
4764 | update_cfs_shares(cfs_rq); | 4798 | update_cfs_shares(se); |
4765 | } | 4799 | } |
4766 | 4800 | ||
4767 | if (!se) | 4801 | if (!se) |
@@ -4820,7 +4854,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4820 | break; | 4854 | break; |
4821 | 4855 | ||
4822 | update_load_avg(se, UPDATE_TG); | 4856 | update_load_avg(se, UPDATE_TG); |
4823 | update_cfs_shares(cfs_rq); | 4857 | update_cfs_shares(se); |
4824 | } | 4858 | } |
4825 | 4859 | ||
4826 | if (!se) | 4860 | if (!se) |
@@ -6213,7 +6247,7 @@ preempt: | |||
6213 | } | 6247 | } |
6214 | 6248 | ||
6215 | static struct task_struct * | 6249 | static struct task_struct * |
6216 | pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 6250 | pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
6217 | { | 6251 | { |
6218 | struct cfs_rq *cfs_rq = &rq->cfs; | 6252 | struct cfs_rq *cfs_rq = &rq->cfs; |
6219 | struct sched_entity *se; | 6253 | struct sched_entity *se; |
@@ -6320,15 +6354,8 @@ simple: | |||
6320 | return p; | 6354 | return p; |
6321 | 6355 | ||
6322 | idle: | 6356 | idle: |
6323 | /* | 6357 | new_tasks = idle_balance(rq, rf); |
6324 | * This is OK, because current is on_cpu, which avoids it being picked | 6358 | |
6325 | * for load-balance and preemption/IRQs are still disabled avoiding | ||
6326 | * further scheduler activity on it and we're being very careful to | ||
6327 | * re-start the picking loop. | ||
6328 | */ | ||
6329 | lockdep_unpin_lock(&rq->lock, cookie); | ||
6330 | new_tasks = idle_balance(rq); | ||
6331 | lockdep_repin_lock(&rq->lock, cookie); | ||
6332 | /* | 6359 | /* |
6333 | * Because idle_balance() releases (and re-acquires) rq->lock, it is | 6360 | * Because idle_balance() releases (and re-acquires) rq->lock, it is |
6334 | * possible for any higher priority task to appear. In that case we | 6361 | * possible for any higher priority task to appear. In that case we |
@@ -8077,6 +8104,7 @@ redo: | |||
8077 | 8104 | ||
8078 | more_balance: | 8105 | more_balance: |
8079 | raw_spin_lock_irqsave(&busiest->lock, flags); | 8106 | raw_spin_lock_irqsave(&busiest->lock, flags); |
8107 | update_rq_clock(busiest); | ||
8080 | 8108 | ||
8081 | /* | 8109 | /* |
8082 | * cur_ld_moved - load moved in current iteration | 8110 | * cur_ld_moved - load moved in current iteration |
@@ -8297,7 +8325,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) | |||
8297 | * idle_balance is called by schedule() if this_cpu is about to become | 8325 | * idle_balance is called by schedule() if this_cpu is about to become |
8298 | * idle. Attempts to pull tasks from other CPUs. | 8326 | * idle. Attempts to pull tasks from other CPUs. |
8299 | */ | 8327 | */ |
8300 | static int idle_balance(struct rq *this_rq) | 8328 | static int idle_balance(struct rq *this_rq, struct rq_flags *rf) |
8301 | { | 8329 | { |
8302 | unsigned long next_balance = jiffies + HZ; | 8330 | unsigned long next_balance = jiffies + HZ; |
8303 | int this_cpu = this_rq->cpu; | 8331 | int this_cpu = this_rq->cpu; |
@@ -8311,6 +8339,14 @@ static int idle_balance(struct rq *this_rq) | |||
8311 | */ | 8339 | */ |
8312 | this_rq->idle_stamp = rq_clock(this_rq); | 8340 | this_rq->idle_stamp = rq_clock(this_rq); |
8313 | 8341 | ||
8342 | /* | ||
8343 | * This is OK, because current is on_cpu, which avoids it being picked | ||
8344 | * for load-balance and preemption/IRQs are still disabled avoiding | ||
8345 | * further scheduler activity on it and we're being very careful to | ||
8346 | * re-start the picking loop. | ||
8347 | */ | ||
8348 | rq_unpin_lock(this_rq, rf); | ||
8349 | |||
8314 | if (this_rq->avg_idle < sysctl_sched_migration_cost || | 8350 | if (this_rq->avg_idle < sysctl_sched_migration_cost || |
8315 | !this_rq->rd->overload) { | 8351 | !this_rq->rd->overload) { |
8316 | rcu_read_lock(); | 8352 | rcu_read_lock(); |
@@ -8388,6 +8424,8 @@ out: | |||
8388 | if (pulled_task) | 8424 | if (pulled_task) |
8389 | this_rq->idle_stamp = 0; | 8425 | this_rq->idle_stamp = 0; |
8390 | 8426 | ||
8427 | rq_repin_lock(this_rq, rf); | ||
8428 | |||
8391 | return pulled_task; | 8429 | return pulled_task; |
8392 | } | 8430 | } |
8393 | 8431 | ||
@@ -8443,6 +8481,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
8443 | }; | 8481 | }; |
8444 | 8482 | ||
8445 | schedstat_inc(sd->alb_count); | 8483 | schedstat_inc(sd->alb_count); |
8484 | update_rq_clock(busiest_rq); | ||
8446 | 8485 | ||
8447 | p = detach_one_task(&env); | 8486 | p = detach_one_task(&env); |
8448 | if (p) { | 8487 | if (p) { |
@@ -9264,6 +9303,7 @@ void online_fair_sched_group(struct task_group *tg) | |||
9264 | se = tg->se[i]; | 9303 | se = tg->se[i]; |
9265 | 9304 | ||
9266 | raw_spin_lock_irq(&rq->lock); | 9305 | raw_spin_lock_irq(&rq->lock); |
9306 | update_rq_clock(rq); | ||
9267 | attach_entity_cfs_rq(se); | 9307 | attach_entity_cfs_rq(se); |
9268 | sync_throttle(tg, i); | 9308 | sync_throttle(tg, i); |
9269 | raw_spin_unlock_irq(&rq->lock); | 9309 | raw_spin_unlock_irq(&rq->lock); |
@@ -9356,8 +9396,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
9356 | 9396 | ||
9357 | /* Possible calls to update_curr() need rq clock */ | 9397 | /* Possible calls to update_curr() need rq clock */ |
9358 | update_rq_clock(rq); | 9398 | update_rq_clock(rq); |
9359 | for_each_sched_entity(se) | 9399 | for_each_sched_entity(se) { |
9360 | update_cfs_shares(group_cfs_rq(se)); | 9400 | update_load_avg(se, UPDATE_TG); |
9401 | update_cfs_shares(se); | ||
9402 | } | ||
9361 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 9403 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
9362 | } | 9404 | } |
9363 | 9405 | ||
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 5405d3feb112..0c00172db63e 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c | |||
@@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
24 | } | 24 | } |
25 | 25 | ||
26 | static struct task_struct * | 26 | static struct task_struct * |
27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 27 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
28 | { | 28 | { |
29 | put_prev_task(rq, prev); | 29 | put_prev_task(rq, prev); |
30 | update_idle_core(rq); | 30 | update_idle_core(rq); |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a688a8206727..e8836cfc4cdb 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/irq_work.h> | 9 | #include <linux/irq_work.h> |
10 | 10 | ||
11 | int sched_rr_timeslice = RR_TIMESLICE; | 11 | int sched_rr_timeslice = RR_TIMESLICE; |
12 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; | ||
12 | 13 | ||
13 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); | 14 | static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); |
14 | 15 | ||
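sched_rr_timeslice stays in jiffies for internal use while the new sysctl_sched_rr_timeslice mirrors it in milliseconds via MSEC_PER_SEC / HZ. A quick sanity check of the conversion, assuming RR_TIMESLICE encodes 100 ms worth of jiffies and an HZ=250 build (both are assumptions of this sketch, not taken from the patch):

#include <stdio.h>

#define MSEC_PER_SEC	1000
#define HZ		250			/* example configuration */
#define RR_TIMESLICE	(100 * HZ / 1000)	/* assumed: 100 ms in jiffies -> 25 */

int main(void)
{
	int ms = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;	/* 4 ms per jiffy * 25 = 100 */

	printf("RR timeslice: %d jiffies = %d ms\n", RR_TIMESLICE, ms);
	return 0;
}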
@@ -1523,7 +1524,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) | |||
1523 | } | 1524 | } |
1524 | 1525 | ||
1525 | static struct task_struct * | 1526 | static struct task_struct * |
1526 | pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 1527 | pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
1527 | { | 1528 | { |
1528 | struct task_struct *p; | 1529 | struct task_struct *p; |
1529 | struct rt_rq *rt_rq = &rq->rt; | 1530 | struct rt_rq *rt_rq = &rq->rt; |
@@ -1535,9 +1536,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie coo | |||
1535 | * disabled avoiding further scheduler activity on it and we're | 1536 | * disabled avoiding further scheduler activity on it and we're |
1536 | * being very careful to re-start the picking loop. | 1537 | * being very careful to re-start the picking loop. |
1537 | */ | 1538 | */ |
1538 | lockdep_unpin_lock(&rq->lock, cookie); | 1539 | rq_unpin_lock(rq, rf); |
1539 | pull_rt_task(rq); | 1540 | pull_rt_task(rq); |
1540 | lockdep_repin_lock(&rq->lock, cookie); | 1541 | rq_repin_lock(rq, rf); |
1541 | /* | 1542 | /* |
1542 | * pull_rt_task() can drop (and re-acquire) rq->lock; this | 1543 | * pull_rt_task() can drop (and re-acquire) rq->lock; this |
1543 | * means a dl or stop task can slip in, in which case we need | 1544 | * means a dl or stop task can slip in, in which case we need |
@@ -2198,10 +2199,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
2198 | #ifdef CONFIG_SMP | 2199 | #ifdef CONFIG_SMP |
2199 | if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) | 2200 | if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) |
2200 | queue_push_tasks(rq); | 2201 | queue_push_tasks(rq); |
2201 | #else | 2202 | #endif /* CONFIG_SMP */ |
2202 | if (p->prio < rq->curr->prio) | 2203 | if (p->prio < rq->curr->prio) |
2203 | resched_curr(rq); | 2204 | resched_curr(rq); |
2204 | #endif /* CONFIG_SMP */ | ||
2205 | } | 2205 | } |
2206 | } | 2206 | } |
2207 | 2207 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7b34c7826ca5..71b10a9b73cf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/sched/rt.h> | 4 | #include <linux/sched/rt.h> |
5 | #include <linux/u64_stats_sync.h> | 5 | #include <linux/u64_stats_sync.h> |
6 | #include <linux/sched/deadline.h> | 6 | #include <linux/sched/deadline.h> |
7 | #include <linux/kernel_stat.h> | ||
7 | #include <linux/binfmts.h> | 8 | #include <linux/binfmts.h> |
8 | #include <linux/mutex.h> | 9 | #include <linux/mutex.h> |
9 | #include <linux/spinlock.h> | 10 | #include <linux/spinlock.h> |
@@ -222,7 +223,7 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | |||
222 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | 223 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; |
223 | } | 224 | } |
224 | 225 | ||
225 | extern struct mutex sched_domains_mutex; | 226 | extern void init_dl_bw(struct dl_bw *dl_b); |
226 | 227 | ||
227 | #ifdef CONFIG_CGROUP_SCHED | 228 | #ifdef CONFIG_CGROUP_SCHED |
228 | 229 | ||
@@ -583,6 +584,13 @@ struct root_domain { | |||
583 | }; | 584 | }; |
584 | 585 | ||
585 | extern struct root_domain def_root_domain; | 586 | extern struct root_domain def_root_domain; |
587 | extern struct mutex sched_domains_mutex; | ||
588 | extern cpumask_var_t fallback_doms; | ||
589 | extern cpumask_var_t sched_domains_tmpmask; | ||
590 | |||
591 | extern void init_defrootdomain(void); | ||
592 | extern int init_sched_domains(const struct cpumask *cpu_map); | ||
593 | extern void rq_attach_root(struct rq *rq, struct root_domain *rd); | ||
586 | 594 | ||
587 | #endif /* CONFIG_SMP */ | 595 | #endif /* CONFIG_SMP */ |
588 | 596 | ||
@@ -644,7 +652,7 @@ struct rq { | |||
644 | unsigned long next_balance; | 652 | unsigned long next_balance; |
645 | struct mm_struct *prev_mm; | 653 | struct mm_struct *prev_mm; |
646 | 654 | ||
647 | unsigned int clock_skip_update; | 655 | unsigned int clock_update_flags; |
648 | u64 clock; | 656 | u64 clock; |
649 | u64 clock_task; | 657 | u64 clock_task; |
650 | 658 | ||
@@ -768,28 +776,110 @@ static inline u64 __rq_clock_broken(struct rq *rq) | |||
768 | return READ_ONCE(rq->clock); | 776 | return READ_ONCE(rq->clock); |
769 | } | 777 | } |
770 | 778 | ||
779 | /* | ||
780 | * rq::clock_update_flags bits | ||
781 | * | ||
782 | * %RQCF_REQ_SKIP - will request skipping of clock update on the next | ||
783 | * call to __schedule(). This is an optimisation to avoid | ||
784 | * neighbouring rq clock updates. | ||
785 | * | ||
786 | * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is | ||
787 | * in effect and calls to update_rq_clock() are being ignored. | ||
788 | * | ||
789 | * %RQCF_UPDATED - is a debug flag that indicates whether a call has been | ||
790 | * made to update_rq_clock() since the last time rq::lock was pinned. | ||
791 | * | ||
792 | * If inside of __schedule(), clock_update_flags will have been | ||
793 | * shifted left (a left shift is a cheap operation for the fast path | ||
794 | * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use, | ||
795 | * | ||
796 | * if (rq->clock_update_flags >= RQCF_UPDATED) | ||
797 | * | ||
798 | * to check if %RQCF_UPDATED is set. It'll never be shifted more than | ||
799 | * one position though, because the next rq_unpin_lock() will shift it | ||
800 | * back. | ||
801 | */ | ||
802 | #define RQCF_REQ_SKIP 0x01 | ||
803 | #define RQCF_ACT_SKIP 0x02 | ||
804 | #define RQCF_UPDATED 0x04 | ||
805 | |||
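The flag values above are laid out so a single left shift promotes a pending skip request into an active one, and the comment's advice to test with ">= RQCF_UPDATED" rather than a bitwise AND covers the case where the UPDATED bit itself has been shifted up one position. A tiny sketch of both effects:

#include <assert.h>

#define RQCF_REQ_SKIP	0x01
#define RQCF_ACT_SKIP	0x02
#define RQCF_UPDATED	0x04

int main(void)
{
	unsigned int flags;

	/* Entering __schedule(): the left shift turns REQ_SKIP into ACT_SKIP. */
	flags = RQCF_REQ_SKIP;
	flags <<= 1;
	assert(flags == RQCF_ACT_SKIP);

	/* The same shift moves a set UPDATED bit up one position, so a plain
	 * bitwise test would miss it while ">= RQCF_UPDATED" still holds. */
	flags = RQCF_UPDATED;
	flags <<= 1;
	assert(!(flags & RQCF_UPDATED));
	assert(flags >= RQCF_UPDATED);

	return 0;
}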
806 | static inline void assert_clock_updated(struct rq *rq) | ||
807 | { | ||
808 | /* | ||
809 | * The only reason for not seeing a clock update since the | ||
810 | * last rq_pin_lock() is if we're currently skipping updates. | ||
811 | */ | ||
812 | SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); | ||
813 | } | ||
814 | |||
771 | static inline u64 rq_clock(struct rq *rq) | 815 | static inline u64 rq_clock(struct rq *rq) |
772 | { | 816 | { |
773 | lockdep_assert_held(&rq->lock); | 817 | lockdep_assert_held(&rq->lock); |
818 | assert_clock_updated(rq); | ||
819 | |||
774 | return rq->clock; | 820 | return rq->clock; |
775 | } | 821 | } |
776 | 822 | ||
777 | static inline u64 rq_clock_task(struct rq *rq) | 823 | static inline u64 rq_clock_task(struct rq *rq) |
778 | { | 824 | { |
779 | lockdep_assert_held(&rq->lock); | 825 | lockdep_assert_held(&rq->lock); |
826 | assert_clock_updated(rq); | ||
827 | |||
780 | return rq->clock_task; | 828 | return rq->clock_task; |
781 | } | 829 | } |
782 | 830 | ||
783 | #define RQCF_REQ_SKIP 0x01 | ||
784 | #define RQCF_ACT_SKIP 0x02 | ||
785 | |||
786 | static inline void rq_clock_skip_update(struct rq *rq, bool skip) | 831 | static inline void rq_clock_skip_update(struct rq *rq, bool skip) |
787 | { | 832 | { |
788 | lockdep_assert_held(&rq->lock); | 833 | lockdep_assert_held(&rq->lock); |
789 | if (skip) | 834 | if (skip) |
790 | rq->clock_skip_update |= RQCF_REQ_SKIP; | 835 | rq->clock_update_flags |= RQCF_REQ_SKIP; |
791 | else | 836 | else |
792 | rq->clock_skip_update &= ~RQCF_REQ_SKIP; | 837 | rq->clock_update_flags &= ~RQCF_REQ_SKIP; |
838 | } | ||
839 | |||
840 | struct rq_flags { | ||
841 | unsigned long flags; | ||
842 | struct pin_cookie cookie; | ||
843 | #ifdef CONFIG_SCHED_DEBUG | ||
844 | /* | ||
845 | * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the | ||
846 | * current pin context is stashed here in case it needs to be | ||
847 | * restored in rq_repin_lock(). | ||
848 | */ | ||
849 | unsigned int clock_update_flags; | ||
850 | #endif | ||
851 | }; | ||
852 | |||
853 | static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) | ||
854 | { | ||
855 | rf->cookie = lockdep_pin_lock(&rq->lock); | ||
856 | |||
857 | #ifdef CONFIG_SCHED_DEBUG | ||
858 | rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); | ||
859 | rf->clock_update_flags = 0; | ||
860 | #endif | ||
861 | } | ||
862 | |||
863 | static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) | ||
864 | { | ||
865 | #ifdef CONFIG_SCHED_DEBUG | ||
866 | if (rq->clock_update_flags > RQCF_ACT_SKIP) | ||
867 | rf->clock_update_flags = RQCF_UPDATED; | ||
868 | #endif | ||
869 | |||
870 | lockdep_unpin_lock(&rq->lock, rf->cookie); | ||
871 | } | ||
872 | |||
873 | static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) | ||
874 | { | ||
875 | lockdep_repin_lock(&rq->lock, rf->cookie); | ||
876 | |||
877 | #ifdef CONFIG_SCHED_DEBUG | ||
878 | /* | ||
879 | * Restore the value we stashed in @rf for this pin context. | ||
880 | */ | ||
881 | rq->clock_update_flags |= rf->clock_update_flags; | ||
882 | #endif | ||
793 | } | 883 | } |
794 | 884 | ||
795 | #ifdef CONFIG_NUMA | 885 | #ifdef CONFIG_NUMA |
@@ -803,6 +893,16 @@ extern int sched_max_numa_distance; | |||
803 | extern bool find_numa_distance(int distance); | 893 | extern bool find_numa_distance(int distance); |
804 | #endif | 894 | #endif |
805 | 895 | ||
896 | #ifdef CONFIG_NUMA | ||
897 | extern void sched_init_numa(void); | ||
898 | extern void sched_domains_numa_masks_set(unsigned int cpu); | ||
899 | extern void sched_domains_numa_masks_clear(unsigned int cpu); | ||
900 | #else | ||
901 | static inline void sched_init_numa(void) { } | ||
902 | static inline void sched_domains_numa_masks_set(unsigned int cpu) { } | ||
903 | static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } | ||
904 | #endif | ||
905 | |||
806 | #ifdef CONFIG_NUMA_BALANCING | 906 | #ifdef CONFIG_NUMA_BALANCING |
807 | /* The regions in numa_faults array from task_struct */ | 907 | /* The regions in numa_faults array from task_struct */ |
808 | enum numa_faults_stats { | 908 | enum numa_faults_stats { |
@@ -969,7 +1069,7 @@ static inline void sched_ttwu_pending(void) { } | |||
969 | #endif /* CONFIG_SMP */ | 1069 | #endif /* CONFIG_SMP */ |
970 | 1070 | ||
971 | #include "stats.h" | 1071 | #include "stats.h" |
972 | #include "auto_group.h" | 1072 | #include "autogroup.h" |
973 | 1073 | ||
974 | #ifdef CONFIG_CGROUP_SCHED | 1074 | #ifdef CONFIG_CGROUP_SCHED |
975 | 1075 | ||
@@ -1245,7 +1345,7 @@ struct sched_class { | |||
1245 | */ | 1345 | */ |
1246 | struct task_struct * (*pick_next_task) (struct rq *rq, | 1346 | struct task_struct * (*pick_next_task) (struct rq *rq, |
1247 | struct task_struct *prev, | 1347 | struct task_struct *prev, |
1248 | struct pin_cookie cookie); | 1348 | struct rq_flags *rf); |
1249 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1349 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); |
1250 | 1350 | ||
1251 | #ifdef CONFIG_SMP | 1351 | #ifdef CONFIG_SMP |
@@ -1501,11 +1601,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } | |||
1501 | static inline void sched_avg_update(struct rq *rq) { } | 1601 | static inline void sched_avg_update(struct rq *rq) { } |
1502 | #endif | 1602 | #endif |
1503 | 1603 | ||
1504 | struct rq_flags { | ||
1505 | unsigned long flags; | ||
1506 | struct pin_cookie cookie; | ||
1507 | }; | ||
1508 | |||
1509 | struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) | 1604 | struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) |
1510 | __acquires(rq->lock); | 1605 | __acquires(rq->lock); |
1511 | struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | 1606 | struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) |
@@ -1515,7 +1610,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
1515 | static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) | 1610 | static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) |
1516 | __releases(rq->lock) | 1611 | __releases(rq->lock) |
1517 | { | 1612 | { |
1518 | lockdep_unpin_lock(&rq->lock, rf->cookie); | 1613 | rq_unpin_lock(rq, rf); |
1519 | raw_spin_unlock(&rq->lock); | 1614 | raw_spin_unlock(&rq->lock); |
1520 | } | 1615 | } |
1521 | 1616 | ||
@@ -1524,7 +1619,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) | |||
1524 | __releases(rq->lock) | 1619 | __releases(rq->lock) |
1525 | __releases(p->pi_lock) | 1620 | __releases(p->pi_lock) |
1526 | { | 1621 | { |
1527 | lockdep_unpin_lock(&rq->lock, rf->cookie); | 1622 | rq_unpin_lock(rq, rf); |
1528 | raw_spin_unlock(&rq->lock); | 1623 | raw_spin_unlock(&rq->lock); |
1529 | raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); | 1624 | raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); |
1530 | } | 1625 | } |
@@ -1674,6 +1769,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1674 | __release(rq2->lock); | 1769 | __release(rq2->lock); |
1675 | } | 1770 | } |
1676 | 1771 | ||
1772 | extern void set_rq_online (struct rq *rq); | ||
1773 | extern void set_rq_offline(struct rq *rq); | ||
1774 | extern bool sched_smp_initialized; | ||
1775 | |||
1677 | #else /* CONFIG_SMP */ | 1776 | #else /* CONFIG_SMP */ |
1678 | 1777 | ||
1679 | /* | 1778 | /* |
@@ -1750,8 +1849,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { } | |||
1750 | 1849 | ||
1751 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 1850 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
1752 | struct irqtime { | 1851 | struct irqtime { |
1753 | u64 hardirq_time; | 1852 | u64 tick_delta; |
1754 | u64 softirq_time; | ||
1755 | u64 irq_start_time; | 1853 | u64 irq_start_time; |
1756 | struct u64_stats_sync sync; | 1854 | struct u64_stats_sync sync; |
1757 | }; | 1855 | }; |
@@ -1761,12 +1859,13 @@ DECLARE_PER_CPU(struct irqtime, cpu_irqtime); | |||
1761 | static inline u64 irq_time_read(int cpu) | 1859 | static inline u64 irq_time_read(int cpu) |
1762 | { | 1860 | { |
1763 | struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); | 1861 | struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); |
1862 | u64 *cpustat = kcpustat_cpu(cpu).cpustat; | ||
1764 | unsigned int seq; | 1863 | unsigned int seq; |
1765 | u64 total; | 1864 | u64 total; |
1766 | 1865 | ||
1767 | do { | 1866 | do { |
1768 | seq = __u64_stats_fetch_begin(&irqtime->sync); | 1867 | seq = __u64_stats_fetch_begin(&irqtime->sync); |
1769 | total = irqtime->softirq_time + irqtime->hardirq_time; | 1868 | total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ]; |
1770 | } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); | 1869 | } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); |
1771 | 1870 | ||
1772 | return total; | 1871 | return total; |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c69a9870ab79..bf0da0aa0a14 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -224,7 +224,7 @@ struct thread_group_cputimer *get_running_cputimer(struct task_struct *tsk) | |||
224 | * running CPU and update the utime field there. | 224 | * running CPU and update the utime field there. |
225 | */ | 225 | */ |
226 | static inline void account_group_user_time(struct task_struct *tsk, | 226 | static inline void account_group_user_time(struct task_struct *tsk, |
227 | cputime_t cputime) | 227 | u64 cputime) |
228 | { | 228 | { |
229 | struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); | 229 | struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); |
230 | 230 | ||
@@ -245,7 +245,7 @@ static inline void account_group_user_time(struct task_struct *tsk, | |||
245 | * running CPU and update the stime field there. | 245 | * running CPU and update the stime field there. |
246 | */ | 246 | */ |
247 | static inline void account_group_system_time(struct task_struct *tsk, | 247 | static inline void account_group_system_time(struct task_struct *tsk, |
248 | cputime_t cputime) | 248 | u64 cputime) |
249 | { | 249 | { |
250 | struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); | 250 | struct thread_group_cputimer *cputimer = get_running_cputimer(tsk); |
251 | 251 | ||
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 604297a08b3a..9f69fb630853 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) | |||
24 | } | 24 | } |
25 | 25 | ||
26 | static struct task_struct * | 26 | static struct task_struct * |
27 | pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) | 27 | pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) |
28 | { | 28 | { |
29 | struct task_struct *stop = rq->stop; | 29 | struct task_struct *stop = rq->stop; |
30 | 30 | ||
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c new file mode 100644 index 000000000000..1b0b4fb12837 --- /dev/null +++ b/kernel/sched/topology.c | |||
@@ -0,0 +1,1658 @@ | |||
1 | /* | ||
2 | * Scheduler topology setup/handling methods | ||
3 | */ | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/mutex.h> | ||
6 | |||
7 | #include "sched.h" | ||
8 | |||
9 | DEFINE_MUTEX(sched_domains_mutex); | ||
10 | |||
11 | /* Protected by sched_domains_mutex: */ | ||
12 | cpumask_var_t sched_domains_tmpmask; | ||
13 | |||
14 | #ifdef CONFIG_SCHED_DEBUG | ||
15 | |||
16 | static __read_mostly int sched_debug_enabled; | ||
17 | |||
18 | static int __init sched_debug_setup(char *str) | ||
19 | { | ||
20 | sched_debug_enabled = 1; | ||
21 | |||
22 | return 0; | ||
23 | } | ||
24 | early_param("sched_debug", sched_debug_setup); | ||
25 | |||
26 | static inline bool sched_debug(void) | ||
27 | { | ||
28 | return sched_debug_enabled; | ||
29 | } | ||
30 | |||
31 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | ||
32 | struct cpumask *groupmask) | ||
33 | { | ||
34 | struct sched_group *group = sd->groups; | ||
35 | |||
36 | cpumask_clear(groupmask); | ||
37 | |||
38 | printk(KERN_DEBUG "%*s domain %d: ", level, "", level); | ||
39 | |||
40 | if (!(sd->flags & SD_LOAD_BALANCE)) { | ||
41 | printk("does not load-balance\n"); | ||
42 | if (sd->parent) | ||
43 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | ||
44 | " has parent"); | ||
45 | return -1; | ||
46 | } | ||
47 | |||
48 | printk(KERN_CONT "span %*pbl level %s\n", | ||
49 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | ||
50 | |||
51 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
52 | printk(KERN_ERR "ERROR: domain->span does not contain " | ||
53 | "CPU%d\n", cpu); | ||
54 | } | ||
55 | if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { | ||
56 | printk(KERN_ERR "ERROR: domain->groups does not contain" | ||
57 | " CPU%d\n", cpu); | ||
58 | } | ||
59 | |||
60 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | ||
61 | do { | ||
62 | if (!group) { | ||
63 | printk("\n"); | ||
64 | printk(KERN_ERR "ERROR: group is NULL\n"); | ||
65 | break; | ||
66 | } | ||
67 | |||
68 | if (!cpumask_weight(sched_group_cpus(group))) { | ||
69 | printk(KERN_CONT "\n"); | ||
70 | printk(KERN_ERR "ERROR: empty group\n"); | ||
71 | break; | ||
72 | } | ||
73 | |||
74 | if (!(sd->flags & SD_OVERLAP) && | ||
75 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
76 | printk(KERN_CONT "\n"); | ||
77 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | ||
78 | break; | ||
79 | } | ||
80 | |||
81 | cpumask_or(groupmask, groupmask, sched_group_cpus(group)); | ||
82 | |||
83 | printk(KERN_CONT " %*pbl", | ||
84 | cpumask_pr_args(sched_group_cpus(group))); | ||
85 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { | ||
86 | printk(KERN_CONT " (cpu_capacity = %lu)", | ||
87 | group->sgc->capacity); | ||
88 | } | ||
89 | |||
90 | group = group->next; | ||
91 | } while (group != sd->groups); | ||
92 | printk(KERN_CONT "\n"); | ||
93 | |||
94 | if (!cpumask_equal(sched_domain_span(sd), groupmask)) | ||
95 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | ||
96 | |||
97 | if (sd->parent && | ||
98 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) | ||
99 | printk(KERN_ERR "ERROR: parent span is not a superset " | ||
100 | "of domain->span\n"); | ||
101 | return 0; | ||
102 | } | ||
103 | |||
104 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | ||
105 | { | ||
106 | int level = 0; | ||
107 | |||
108 | if (!sched_debug_enabled) | ||
109 | return; | ||
110 | |||
111 | if (!sd) { | ||
112 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | ||
113 | return; | ||
114 | } | ||
115 | |||
116 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | ||
117 | |||
118 | for (;;) { | ||
119 | if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) | ||
120 | break; | ||
121 | level++; | ||
122 | sd = sd->parent; | ||
123 | if (!sd) | ||
124 | break; | ||
125 | } | ||
126 | } | ||
127 | #else /* !CONFIG_SCHED_DEBUG */ | ||
128 | |||
129 | # define sched_debug_enabled 0 | ||
130 | # define sched_domain_debug(sd, cpu) do { } while (0) | ||
131 | static inline bool sched_debug(void) | ||
132 | { | ||
133 | return false; | ||
134 | } | ||
135 | #endif /* CONFIG_SCHED_DEBUG */ | ||
136 | |||
137 | static int sd_degenerate(struct sched_domain *sd) | ||
138 | { | ||
139 | if (cpumask_weight(sched_domain_span(sd)) == 1) | ||
140 | return 1; | ||
141 | |||
142 | /* Following flags need at least 2 groups */ | ||
143 | if (sd->flags & (SD_LOAD_BALANCE | | ||
144 | SD_BALANCE_NEWIDLE | | ||
145 | SD_BALANCE_FORK | | ||
146 | SD_BALANCE_EXEC | | ||
147 | SD_SHARE_CPUCAPACITY | | ||
148 | SD_ASYM_CPUCAPACITY | | ||
149 | SD_SHARE_PKG_RESOURCES | | ||
150 | SD_SHARE_POWERDOMAIN)) { | ||
151 | if (sd->groups != sd->groups->next) | ||
152 | return 0; | ||
153 | } | ||
154 | |||
155 | /* Following flags don't use groups */ | ||
156 | if (sd->flags & (SD_WAKE_AFFINE)) | ||
157 | return 0; | ||
158 | |||
159 | return 1; | ||
160 | } | ||
161 | |||
162 | static int | ||
163 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | ||
164 | { | ||
165 | unsigned long cflags = sd->flags, pflags = parent->flags; | ||
166 | |||
167 | if (sd_degenerate(parent)) | ||
168 | return 1; | ||
169 | |||
170 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | ||
171 | return 0; | ||
172 | |||
173 | /* Flags needing groups don't count if only 1 group in parent */ | ||
174 | if (parent->groups == parent->groups->next) { | ||
175 | pflags &= ~(SD_LOAD_BALANCE | | ||
176 | SD_BALANCE_NEWIDLE | | ||
177 | SD_BALANCE_FORK | | ||
178 | SD_BALANCE_EXEC | | ||
179 | SD_ASYM_CPUCAPACITY | | ||
180 | SD_SHARE_CPUCAPACITY | | ||
181 | SD_SHARE_PKG_RESOURCES | | ||
182 | SD_PREFER_SIBLING | | ||
183 | SD_SHARE_POWERDOMAIN); | ||
184 | if (nr_node_ids == 1) | ||
185 | pflags &= ~SD_SERIALIZE; | ||
186 | } | ||
187 | if (~cflags & pflags) | ||
188 | return 0; | ||
189 | |||
190 | return 1; | ||
191 | } | ||
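The decision above hinges on the bit test '~cflags & pflags': the degenerate parent only survives if it still carries a flag the child lacks. A minimal standalone sketch of that subset test follows; the flag values are invented for illustration and are not the kernel's definitions.

#include <stdio.h>

/* Illustrative flag values, not the kernel's. */
#define SD_LOAD_BALANCE 0x01
#define SD_BALANCE_EXEC 0x02
#define SD_SERIALIZE    0x04

/* The parent adds nothing iff its flags are a subset of the child's. */
static int parent_is_redundant(unsigned long cflags, unsigned long pflags)
{
        return (~cflags & pflags) == 0;
}

int main(void)
{
        unsigned long child = SD_LOAD_BALANCE | SD_BALANCE_EXEC;

        printf("%d\n", parent_is_redundant(child, SD_LOAD_BALANCE));                /* 1 */
        printf("%d\n", parent_is_redundant(child, SD_LOAD_BALANCE | SD_SERIALIZE)); /* 0 */
        return 0;
}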
192 | |||
193 | static void free_rootdomain(struct rcu_head *rcu) | ||
194 | { | ||
195 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); | ||
196 | |||
197 | cpupri_cleanup(&rd->cpupri); | ||
198 | cpudl_cleanup(&rd->cpudl); | ||
199 | free_cpumask_var(rd->dlo_mask); | ||
200 | free_cpumask_var(rd->rto_mask); | ||
201 | free_cpumask_var(rd->online); | ||
202 | free_cpumask_var(rd->span); | ||
203 | kfree(rd); | ||
204 | } | ||
205 | |||
206 | void rq_attach_root(struct rq *rq, struct root_domain *rd) | ||
207 | { | ||
208 | struct root_domain *old_rd = NULL; | ||
209 | unsigned long flags; | ||
210 | |||
211 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
212 | |||
213 | if (rq->rd) { | ||
214 | old_rd = rq->rd; | ||
215 | |||
216 | if (cpumask_test_cpu(rq->cpu, old_rd->online)) | ||
217 | set_rq_offline(rq); | ||
218 | |||
219 | cpumask_clear_cpu(rq->cpu, old_rd->span); | ||
220 | |||
221 | /* | ||
222 | * If we don't want to free the old_rd yet then | ||
223 | * set old_rd to NULL to skip the freeing later | ||
224 | * in this function: | ||
225 | */ | ||
226 | if (!atomic_dec_and_test(&old_rd->refcount)) | ||
227 | old_rd = NULL; | ||
228 | } | ||
229 | |||
230 | atomic_inc(&rd->refcount); | ||
231 | rq->rd = rd; | ||
232 | |||
233 | cpumask_set_cpu(rq->cpu, rd->span); | ||
234 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) | ||
235 | set_rq_online(rq); | ||
236 | |||
237 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
238 | |||
239 | if (old_rd) | ||
240 | call_rcu_sched(&old_rd->rcu, free_rootdomain); | ||
241 | } | ||
242 | |||
243 | static int init_rootdomain(struct root_domain *rd) | ||
244 | { | ||
245 | memset(rd, 0, sizeof(*rd)); | ||
246 | |||
247 | if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) | ||
248 | goto out; | ||
249 | if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) | ||
250 | goto free_span; | ||
251 | if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) | ||
252 | goto free_online; | ||
253 | if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) | ||
254 | goto free_dlo_mask; | ||
255 | |||
256 | init_dl_bw(&rd->dl_bw); | ||
257 | if (cpudl_init(&rd->cpudl) != 0) | ||
258 | goto free_rto_mask; | ||
259 | |||
260 | if (cpupri_init(&rd->cpupri) != 0) | ||
261 | goto free_cpudl; | ||
262 | return 0; | ||
263 | |||
264 | free_cpudl: | ||
265 | cpudl_cleanup(&rd->cpudl); | ||
266 | free_rto_mask: | ||
267 | free_cpumask_var(rd->rto_mask); | ||
268 | free_dlo_mask: | ||
269 | free_cpumask_var(rd->dlo_mask); | ||
270 | free_online: | ||
271 | free_cpumask_var(rd->online); | ||
272 | free_span: | ||
273 | free_cpumask_var(rd->span); | ||
274 | out: | ||
275 | return -ENOMEM; | ||
276 | } | ||
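init_rootdomain() above uses the usual goto-unwind error handling: each failed allocation jumps to a label that releases everything allocated before it, in reverse order. A self-contained userspace sketch of the same pattern (struct fields and sizes are invented):

#include <stdlib.h>
#include <errno.h>

struct rd_model {
        void *span, *online, *rto_mask;
};

static int init_rd_model(struct rd_model *rd)
{
        rd->span = malloc(128);
        if (!rd->span)
                goto out;
        rd->online = malloc(128);
        if (!rd->online)
                goto free_span;
        rd->rto_mask = malloc(128);
        if (!rd->rto_mask)
                goto free_online;
        return 0;

free_online:
        free(rd->online);
free_span:
        free(rd->span);
out:
        return -ENOMEM;
}

int main(void)
{
        struct rd_model rd;

        if (init_rd_model(&rd))
                return 1;
        free(rd.rto_mask);
        free(rd.online);
        free(rd.span);
        return 0;
}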
277 | |||
278 | /* | ||
279 | * By default the system creates a single root-domain with all CPUs as | ||
280 | * members (mimicking the global state we have today). | ||
281 | */ | ||
282 | struct root_domain def_root_domain; | ||
283 | |||
284 | void init_defrootdomain(void) | ||
285 | { | ||
286 | init_rootdomain(&def_root_domain); | ||
287 | |||
288 | atomic_set(&def_root_domain.refcount, 1); | ||
289 | } | ||
290 | |||
291 | static struct root_domain *alloc_rootdomain(void) | ||
292 | { | ||
293 | struct root_domain *rd; | ||
294 | |||
295 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
296 | if (!rd) | ||
297 | return NULL; | ||
298 | |||
299 | if (init_rootdomain(rd) != 0) { | ||
300 | kfree(rd); | ||
301 | return NULL; | ||
302 | } | ||
303 | |||
304 | return rd; | ||
305 | } | ||
306 | |||
307 | static void free_sched_groups(struct sched_group *sg, int free_sgc) | ||
308 | { | ||
309 | struct sched_group *tmp, *first; | ||
310 | |||
311 | if (!sg) | ||
312 | return; | ||
313 | |||
314 | first = sg; | ||
315 | do { | ||
316 | tmp = sg->next; | ||
317 | |||
318 | if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) | ||
319 | kfree(sg->sgc); | ||
320 | |||
321 | kfree(sg); | ||
322 | sg = tmp; | ||
323 | } while (sg != first); | ||
324 | } | ||
325 | |||
326 | static void destroy_sched_domain(struct sched_domain *sd) | ||
327 | { | ||
328 | /* | ||
329 | * If it's an overlapping domain it has private groups; iterate and | ||
330 | * nuke them all. | ||
331 | */ | ||
332 | if (sd->flags & SD_OVERLAP) { | ||
333 | free_sched_groups(sd->groups, 1); | ||
334 | } else if (atomic_dec_and_test(&sd->groups->ref)) { | ||
335 | kfree(sd->groups->sgc); | ||
336 | kfree(sd->groups); | ||
337 | } | ||
338 | if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) | ||
339 | kfree(sd->shared); | ||
340 | kfree(sd); | ||
341 | } | ||
342 | |||
343 | static void destroy_sched_domains_rcu(struct rcu_head *rcu) | ||
344 | { | ||
345 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); | ||
346 | |||
347 | while (sd) { | ||
348 | struct sched_domain *parent = sd->parent; | ||
349 | destroy_sched_domain(sd); | ||
350 | sd = parent; | ||
351 | } | ||
352 | } | ||
353 | |||
354 | static void destroy_sched_domains(struct sched_domain *sd) | ||
355 | { | ||
356 | if (sd) | ||
357 | call_rcu(&sd->rcu, destroy_sched_domains_rcu); | ||
358 | } | ||
359 | |||
360 | /* | ||
361 | * Keep a special pointer to the highest sched_domain that has | ||
362 | * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this | ||
363 | * allows us to avoid some pointer chasing in select_idle_sibling(). | ||
364 | * | ||
365 | * Also keep a unique ID per domain (we use the first CPU number in | ||
366 | * the cpumask of the domain); this allows us to quickly tell if | ||
367 | * two CPUs are in the same cache domain, see cpus_share_cache(). | ||
368 | */ | ||
369 | DEFINE_PER_CPU(struct sched_domain *, sd_llc); | ||
370 | DEFINE_PER_CPU(int, sd_llc_size); | ||
371 | DEFINE_PER_CPU(int, sd_llc_id); | ||
372 | DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); | ||
373 | DEFINE_PER_CPU(struct sched_domain *, sd_numa); | ||
374 | DEFINE_PER_CPU(struct sched_domain *, sd_asym); | ||
375 | |||
376 | static void update_top_cache_domain(int cpu) | ||
377 | { | ||
378 | struct sched_domain_shared *sds = NULL; | ||
379 | struct sched_domain *sd; | ||
380 | int id = cpu; | ||
381 | int size = 1; | ||
382 | |||
383 | sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); | ||
384 | if (sd) { | ||
385 | id = cpumask_first(sched_domain_span(sd)); | ||
386 | size = cpumask_weight(sched_domain_span(sd)); | ||
387 | sds = sd->shared; | ||
388 | } | ||
389 | |||
390 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); | ||
391 | per_cpu(sd_llc_size, cpu) = size; | ||
392 | per_cpu(sd_llc_id, cpu) = id; | ||
393 | rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); | ||
394 | |||
395 | sd = lowest_flag_domain(cpu, SD_NUMA); | ||
396 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); | ||
397 | |||
398 | sd = highest_flag_domain(cpu, SD_ASYM_PACKING); | ||
399 | rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); | ||
400 | } | ||
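The cached sd_llc_id set up above is what makes the "same cache domain" test cheap: two CPUs share a last-level cache exactly when their cached ids match, which is essentially what cpus_share_cache() (referenced in the comment) checks. A standalone model with an invented 4-CPU layout:

#include <stdio.h>

/* Invented layout: CPUs 0-1 share one LLC, CPUs 2-3 share another. */
static const int sd_llc_id[4] = { 0, 0, 2, 2 };

static int cpus_share_cache(int this_cpu, int that_cpu)
{
        return sd_llc_id[this_cpu] == sd_llc_id[that_cpu];
}

int main(void)
{
        printf("CPU0/CPU1: %d\n", cpus_share_cache(0, 1));      /* 1 */
        printf("CPU1/CPU2: %d\n", cpus_share_cache(1, 2));      /* 0 */
        return 0;
}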
401 | |||
402 | /* | ||
403 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | ||
404 | * hold the hotplug lock. | ||
405 | */ | ||
406 | static void | ||
407 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||
408 | { | ||
409 | struct rq *rq = cpu_rq(cpu); | ||
410 | struct sched_domain *tmp; | ||
411 | |||
412 | /* Remove the sched domains which do not contribute to scheduling. */ | ||
413 | for (tmp = sd; tmp; ) { | ||
414 | struct sched_domain *parent = tmp->parent; | ||
415 | if (!parent) | ||
416 | break; | ||
417 | |||
418 | if (sd_parent_degenerate(tmp, parent)) { | ||
419 | tmp->parent = parent->parent; | ||
420 | if (parent->parent) | ||
421 | parent->parent->child = tmp; | ||
422 | /* | ||
423 | * Transfer SD_PREFER_SIBLING down in case of a | ||
424 | * degenerate parent; the spans match for this | ||
425 | * so the property transfers. | ||
426 | */ | ||
427 | if (parent->flags & SD_PREFER_SIBLING) | ||
428 | tmp->flags |= SD_PREFER_SIBLING; | ||
429 | destroy_sched_domain(parent); | ||
430 | } else | ||
431 | tmp = tmp->parent; | ||
432 | } | ||
433 | |||
434 | if (sd && sd_degenerate(sd)) { | ||
435 | tmp = sd; | ||
436 | sd = sd->parent; | ||
437 | destroy_sched_domain(tmp); | ||
438 | if (sd) | ||
439 | sd->child = NULL; | ||
440 | } | ||
441 | |||
442 | sched_domain_debug(sd, cpu); | ||
443 | |||
444 | rq_attach_root(rq, rd); | ||
445 | tmp = rq->sd; | ||
446 | rcu_assign_pointer(rq->sd, sd); | ||
447 | destroy_sched_domains(tmp); | ||
448 | |||
449 | update_top_cache_domain(cpu); | ||
450 | } | ||
451 | |||
452 | /* Setup the mask of CPUs configured for isolated domains */ | ||
453 | static int __init isolated_cpu_setup(char *str) | ||
454 | { | ||
455 | int ret; | ||
456 | |||
457 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | ||
458 | ret = cpulist_parse(str, cpu_isolated_map); | ||
459 | if (ret) { | ||
460 | pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); | ||
461 | return 0; | ||
462 | } | ||
463 | return 1; | ||
464 | } | ||
465 | __setup("isolcpus=", isolated_cpu_setup); | ||
466 | |||
467 | struct s_data { | ||
468 | struct sched_domain ** __percpu sd; | ||
469 | struct root_domain *rd; | ||
470 | }; | ||
471 | |||
472 | enum s_alloc { | ||
473 | sa_rootdomain, | ||
474 | sa_sd, | ||
475 | sa_sd_storage, | ||
476 | sa_none, | ||
477 | }; | ||
478 | |||
479 | /* | ||
480 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
481 | * domain traversal. | ||
482 | * | ||
483 | * Asymmetric node setups can result in situations where the domain tree is of | ||
484 | * unequal depth; make sure to skip domains that already cover the entire | ||
485 | * range. | ||
486 | * | ||
487 | * In that case build_sched_domains() will have terminated the iteration early | ||
488 | * and our sibling sd spans will be empty. Domains should always include the | ||
489 | * CPU they're built on, so check that. | ||
490 | */ | ||
491 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
492 | { | ||
493 | const struct cpumask *span = sched_domain_span(sd); | ||
494 | struct sd_data *sdd = sd->private; | ||
495 | struct sched_domain *sibling; | ||
496 | int i; | ||
497 | |||
498 | for_each_cpu(i, span) { | ||
499 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
500 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
501 | continue; | ||
502 | |||
503 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
504 | } | ||
505 | } | ||
506 | |||
507 | /* | ||
508 | * Return the canonical balance CPU for this group, this is the first CPU | ||
509 | * of this group that's also in the iteration mask. | ||
510 | */ | ||
511 | int group_balance_cpu(struct sched_group *sg) | ||
512 | { | ||
513 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
514 | } | ||
515 | |||
516 | static int | ||
517 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | ||
518 | { | ||
519 | struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; | ||
520 | const struct cpumask *span = sched_domain_span(sd); | ||
521 | struct cpumask *covered = sched_domains_tmpmask; | ||
522 | struct sd_data *sdd = sd->private; | ||
523 | struct sched_domain *sibling; | ||
524 | int i; | ||
525 | |||
526 | cpumask_clear(covered); | ||
527 | |||
528 | for_each_cpu(i, span) { | ||
529 | struct cpumask *sg_span; | ||
530 | |||
531 | if (cpumask_test_cpu(i, covered)) | ||
532 | continue; | ||
533 | |||
534 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
535 | |||
536 | /* See the comment near build_group_mask(). */ | ||
537 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
538 | continue; | ||
539 | |||
540 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
541 | GFP_KERNEL, cpu_to_node(cpu)); | ||
542 | |||
543 | if (!sg) | ||
544 | goto fail; | ||
545 | |||
546 | sg_span = sched_group_cpus(sg); | ||
547 | if (sibling->child) | ||
548 | cpumask_copy(sg_span, sched_domain_span(sibling->child)); | ||
549 | else | ||
550 | cpumask_set_cpu(i, sg_span); | ||
551 | |||
552 | cpumask_or(covered, covered, sg_span); | ||
553 | |||
554 | sg->sgc = *per_cpu_ptr(sdd->sgc, i); | ||
555 | if (atomic_inc_return(&sg->sgc->ref) == 1) | ||
556 | build_group_mask(sd, sg); | ||
557 | |||
558 | /* | ||
559 | * Initialize sgc->capacity such that even if we mess up the | ||
560 | * domains and no possible iteration will get us here, we won't | ||
561 | * die on a /0 trap. | ||
562 | */ | ||
563 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | ||
564 | sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; | ||
565 | |||
566 | /* | ||
567 | * Make sure the first group of this domain contains the | ||
568 | * canonical balance CPU. Otherwise the sched_domain iteration | ||
569 | * breaks. See update_sg_lb_stats(). | ||
570 | */ | ||
571 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
572 | group_balance_cpu(sg) == cpu) | ||
573 | groups = sg; | ||
574 | |||
575 | if (!first) | ||
576 | first = sg; | ||
577 | if (last) | ||
578 | last->next = sg; | ||
579 | last = sg; | ||
580 | last->next = first; | ||
581 | } | ||
582 | sd->groups = groups; | ||
583 | |||
584 | return 0; | ||
585 | |||
586 | fail: | ||
587 | free_sched_groups(first, 0); | ||
588 | |||
589 | return -ENOMEM; | ||
590 | } | ||
591 | |||
592 | static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) | ||
593 | { | ||
594 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
595 | struct sched_domain *child = sd->child; | ||
596 | |||
597 | if (child) | ||
598 | cpu = cpumask_first(sched_domain_span(child)); | ||
599 | |||
600 | if (sg) { | ||
601 | *sg = *per_cpu_ptr(sdd->sg, cpu); | ||
602 | (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); | ||
603 | |||
604 | /* For claim_allocations: */ | ||
605 | atomic_set(&(*sg)->sgc->ref, 1); | ||
606 | } | ||
607 | |||
608 | return cpu; | ||
609 | } | ||
610 | |||
611 | /* | ||
612 | * build_sched_groups will build a circular linked list of the groups | ||
613 | * covered by the given span, and will set each group's ->cpumask correctly, | ||
614 | * and ->cpu_capacity to 0. | ||
615 | * | ||
616 | * Assumes the sched_domain tree is fully constructed | ||
617 | */ | ||
618 | static int | ||
619 | build_sched_groups(struct sched_domain *sd, int cpu) | ||
620 | { | ||
621 | struct sched_group *first = NULL, *last = NULL; | ||
622 | struct sd_data *sdd = sd->private; | ||
623 | const struct cpumask *span = sched_domain_span(sd); | ||
624 | struct cpumask *covered; | ||
625 | int i; | ||
626 | |||
627 | get_group(cpu, sdd, &sd->groups); | ||
628 | atomic_inc(&sd->groups->ref); | ||
629 | |||
630 | if (cpu != cpumask_first(span)) | ||
631 | return 0; | ||
632 | |||
633 | lockdep_assert_held(&sched_domains_mutex); | ||
634 | covered = sched_domains_tmpmask; | ||
635 | |||
636 | cpumask_clear(covered); | ||
637 | |||
638 | for_each_cpu(i, span) { | ||
639 | struct sched_group *sg; | ||
640 | int group, j; | ||
641 | |||
642 | if (cpumask_test_cpu(i, covered)) | ||
643 | continue; | ||
644 | |||
645 | group = get_group(i, sdd, &sg); | ||
646 | cpumask_setall(sched_group_mask(sg)); | ||
647 | |||
648 | for_each_cpu(j, span) { | ||
649 | if (get_group(j, sdd, NULL) != group) | ||
650 | continue; | ||
651 | |||
652 | cpumask_set_cpu(j, covered); | ||
653 | cpumask_set_cpu(j, sched_group_cpus(sg)); | ||
654 | } | ||
655 | |||
656 | if (!first) | ||
657 | first = sg; | ||
658 | if (last) | ||
659 | last->next = sg; | ||
660 | last = sg; | ||
661 | } | ||
662 | last->next = first; | ||
663 | |||
664 | return 0; | ||
665 | } | ||
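Both build_overlap_sched_groups() and build_sched_groups() above chain groups with the same first/last idiom, closing the list into a ring so iteration can start at any group. A self-contained sketch of just that idiom (the group contents are omitted):

#include <stdio.h>
#include <stdlib.h>

struct group {
        int id;
        struct group *next;
};

int main(void)
{
        struct group *first = NULL, *last = NULL, *g;
        int i;

        for (i = 0; i < 3; i++) {
                g = calloc(1, sizeof(*g));
                if (!g)
                        return 1;
                g->id = i;
                if (!first)
                        first = g;
                if (last)
                        last->next = g;
                last = g;
        }
        last->next = first;     /* close the ring */

        /* Walk the ring exactly once, starting from any member. */
        g = first;
        do {
                printf("group %d\n", g->id);
                g = g->next;
        } while (g != first);

        return 0;
}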
666 | |||
667 | /* | ||
668 | * Initialize sched groups cpu_capacity. | ||
669 | * | ||
670 | * cpu_capacity indicates the capacity of sched group, which is used while | ||
671 | * distributing the load between different sched groups in a sched domain. | ||
672 | * Typically cpu_capacity for all the groups in a sched domain will be the same | ||
673 | * unless there are asymmetries in the topology. If there are asymmetries, the | ||
674 | * group having more cpu_capacity will pick up more load compared to the | ||
675 | * group having less cpu_capacity. | ||
676 | */ | ||
677 | static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) | ||
678 | { | ||
679 | struct sched_group *sg = sd->groups; | ||
680 | |||
681 | WARN_ON(!sg); | ||
682 | |||
683 | do { | ||
684 | int cpu, max_cpu = -1; | ||
685 | |||
686 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | ||
687 | |||
688 | if (!(sd->flags & SD_ASYM_PACKING)) | ||
689 | goto next; | ||
690 | |||
691 | for_each_cpu(cpu, sched_group_cpus(sg)) { | ||
692 | if (max_cpu < 0) | ||
693 | max_cpu = cpu; | ||
694 | else if (sched_asym_prefer(cpu, max_cpu)) | ||
695 | max_cpu = cpu; | ||
696 | } | ||
697 | sg->asym_prefer_cpu = max_cpu; | ||
698 | |||
699 | next: | ||
700 | sg = sg->next; | ||
701 | } while (sg != sd->groups); | ||
702 | |||
703 | if (cpu != group_balance_cpu(sg)) | ||
704 | return; | ||
705 | |||
706 | update_group_capacity(sd, cpu); | ||
707 | } | ||
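For SD_ASYM_PACKING the loop above remembers the CPU that sched_asym_prefer() ranks highest; the ranking itself is supplied by the architecture. A standalone model with an invented priority table:

#include <stdio.h>

#define NR_CPUS 4

/* Invented per-CPU priorities; higher means "prefer packing work here". */
static const int prio[NR_CPUS] = { 1, 3, 2, 0 };

static int sched_asym_prefer(int a, int b)
{
        return prio[a] > prio[b];
}

int main(void)
{
        int cpu, max_cpu = -1;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                if (max_cpu < 0 || sched_asym_prefer(cpu, max_cpu))
                        max_cpu = cpu;
        }
        printf("asym_prefer_cpu = %d\n", max_cpu);      /* CPU 1 */
        return 0;
}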
708 | |||
709 | /* | ||
710 | * Initializers for schedule domains | ||
711 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() | ||
712 | */ | ||
713 | |||
714 | static int default_relax_domain_level = -1; | ||
715 | int sched_domain_level_max; | ||
716 | |||
717 | static int __init setup_relax_domain_level(char *str) | ||
718 | { | ||
719 | if (kstrtoint(str, 0, &default_relax_domain_level)) | ||
720 | pr_warn("Unable to set relax_domain_level\n"); | ||
721 | |||
722 | return 1; | ||
723 | } | ||
724 | __setup("relax_domain_level=", setup_relax_domain_level); | ||
725 | |||
726 | static void set_domain_attribute(struct sched_domain *sd, | ||
727 | struct sched_domain_attr *attr) | ||
728 | { | ||
729 | int request; | ||
730 | |||
731 | if (!attr || attr->relax_domain_level < 0) { | ||
732 | if (default_relax_domain_level < 0) | ||
733 | return; | ||
734 | else | ||
735 | request = default_relax_domain_level; | ||
736 | } else | ||
737 | request = attr->relax_domain_level; | ||
738 | if (request < sd->level) { | ||
739 | /* Turn off idle balance on this domain: */ | ||
740 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
741 | } else { | ||
742 | /* Turn on idle balance on this domain: */ | ||
743 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); | ||
744 | } | ||
745 | } | ||
746 | |||
747 | static void __sdt_free(const struct cpumask *cpu_map); | ||
748 | static int __sdt_alloc(const struct cpumask *cpu_map); | ||
749 | |||
750 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | ||
751 | const struct cpumask *cpu_map) | ||
752 | { | ||
753 | switch (what) { | ||
754 | case sa_rootdomain: | ||
755 | if (!atomic_read(&d->rd->refcount)) | ||
756 | free_rootdomain(&d->rd->rcu); | ||
757 | /* Fall through */ | ||
758 | case sa_sd: | ||
759 | free_percpu(d->sd); | ||
760 | /* Fall through */ | ||
761 | case sa_sd_storage: | ||
762 | __sdt_free(cpu_map); | ||
763 | /* Fall through */ | ||
764 | case sa_none: | ||
765 | break; | ||
766 | } | ||
767 | } | ||
768 | |||
769 | static enum s_alloc | ||
770 | __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) | ||
771 | { | ||
772 | memset(d, 0, sizeof(*d)); | ||
773 | |||
774 | if (__sdt_alloc(cpu_map)) | ||
775 | return sa_sd_storage; | ||
776 | d->sd = alloc_percpu(struct sched_domain *); | ||
777 | if (!d->sd) | ||
778 | return sa_sd_storage; | ||
779 | d->rd = alloc_rootdomain(); | ||
780 | if (!d->rd) | ||
781 | return sa_sd; | ||
782 | return sa_rootdomain; | ||
783 | } | ||
784 | |||
785 | /* | ||
786 | * NULL the sd_data elements we've used to build the sched_domain and | ||
787 | * sched_group structure so that the subsequent __free_domain_allocs() | ||
788 | * will not free the data we're using. | ||
789 | */ | ||
790 | static void claim_allocations(int cpu, struct sched_domain *sd) | ||
791 | { | ||
792 | struct sd_data *sdd = sd->private; | ||
793 | |||
794 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); | ||
795 | *per_cpu_ptr(sdd->sd, cpu) = NULL; | ||
796 | |||
797 | if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) | ||
798 | *per_cpu_ptr(sdd->sds, cpu) = NULL; | ||
799 | |||
800 | if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) | ||
801 | *per_cpu_ptr(sdd->sg, cpu) = NULL; | ||
802 | |||
803 | if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) | ||
804 | *per_cpu_ptr(sdd->sgc, cpu) = NULL; | ||
805 | } | ||
806 | |||
807 | #ifdef CONFIG_NUMA | ||
808 | static int sched_domains_numa_levels; | ||
809 | enum numa_topology_type sched_numa_topology_type; | ||
810 | static int *sched_domains_numa_distance; | ||
811 | int sched_max_numa_distance; | ||
812 | static struct cpumask ***sched_domains_numa_masks; | ||
813 | static int sched_domains_curr_level; | ||
814 | #endif | ||
815 | |||
816 | /* | ||
817 | * SD_flags allowed in topology descriptions. | ||
818 | * | ||
819 | * These flags are purely descriptive of the topology and do not prescribe | ||
820 | * behaviour. Behaviour is artificial and mapped in the below sd_init() | ||
821 | * function: | ||
822 | * | ||
823 | * SD_SHARE_CPUCAPACITY - describes SMT topologies | ||
824 | * SD_SHARE_PKG_RESOURCES - describes shared caches | ||
825 | * SD_NUMA - describes NUMA topologies | ||
826 | * SD_SHARE_POWERDOMAIN - describes shared power domain | ||
827 | * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies | ||
828 | * | ||
829 | * Odd one out: besides describing the topology, it also | ||
830 | * prescribes the desired behaviour that goes along with it: | ||
831 | * | ||
832 | * SD_ASYM_PACKING - describes SMT quirks | ||
833 | */ | ||
834 | #define TOPOLOGY_SD_FLAGS \ | ||
835 | (SD_SHARE_CPUCAPACITY | \ | ||
836 | SD_SHARE_PKG_RESOURCES | \ | ||
837 | SD_NUMA | \ | ||
838 | SD_ASYM_PACKING | \ | ||
839 | SD_ASYM_CPUCAPACITY | \ | ||
840 | SD_SHARE_POWERDOMAIN) | ||
841 | |||
842 | static struct sched_domain * | ||
843 | sd_init(struct sched_domain_topology_level *tl, | ||
844 | const struct cpumask *cpu_map, | ||
845 | struct sched_domain *child, int cpu) | ||
846 | { | ||
847 | struct sd_data *sdd = &tl->data; | ||
848 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); | ||
849 | int sd_id, sd_weight, sd_flags = 0; | ||
850 | |||
851 | #ifdef CONFIG_NUMA | ||
852 | /* | ||
853 | * Ugly hack to pass state to sd_numa_mask()... | ||
854 | */ | ||
855 | sched_domains_curr_level = tl->numa_level; | ||
856 | #endif | ||
857 | |||
858 | sd_weight = cpumask_weight(tl->mask(cpu)); | ||
859 | |||
860 | if (tl->sd_flags) | ||
861 | sd_flags = (*tl->sd_flags)(); | ||
862 | if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, | ||
863 | "wrong sd_flags in topology description\n")) | ||
864 | sd_flags &= ~TOPOLOGY_SD_FLAGS; | ||
865 | |||
866 | *sd = (struct sched_domain){ | ||
867 | .min_interval = sd_weight, | ||
868 | .max_interval = 2*sd_weight, | ||
869 | .busy_factor = 32, | ||
870 | .imbalance_pct = 125, | ||
871 | |||
872 | .cache_nice_tries = 0, | ||
873 | .busy_idx = 0, | ||
874 | .idle_idx = 0, | ||
875 | .newidle_idx = 0, | ||
876 | .wake_idx = 0, | ||
877 | .forkexec_idx = 0, | ||
878 | |||
879 | .flags = 1*SD_LOAD_BALANCE | ||
880 | | 1*SD_BALANCE_NEWIDLE | ||
881 | | 1*SD_BALANCE_EXEC | ||
882 | | 1*SD_BALANCE_FORK | ||
883 | | 0*SD_BALANCE_WAKE | ||
884 | | 1*SD_WAKE_AFFINE | ||
885 | | 0*SD_SHARE_CPUCAPACITY | ||
886 | | 0*SD_SHARE_PKG_RESOURCES | ||
887 | | 0*SD_SERIALIZE | ||
888 | | 0*SD_PREFER_SIBLING | ||
889 | | 0*SD_NUMA | ||
890 | | sd_flags | ||
891 | , | ||
892 | |||
893 | .last_balance = jiffies, | ||
894 | .balance_interval = sd_weight, | ||
895 | .smt_gain = 0, | ||
896 | .max_newidle_lb_cost = 0, | ||
897 | .next_decay_max_lb_cost = jiffies, | ||
898 | .child = child, | ||
899 | #ifdef CONFIG_SCHED_DEBUG | ||
900 | .name = tl->name, | ||
901 | #endif | ||
902 | }; | ||
903 | |||
904 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | ||
905 | sd_id = cpumask_first(sched_domain_span(sd)); | ||
906 | |||
907 | /* | ||
908 | * Convert topological properties into behaviour. | ||
909 | */ | ||
910 | |||
911 | if (sd->flags & SD_ASYM_CPUCAPACITY) { | ||
912 | struct sched_domain *t = sd; | ||
913 | |||
914 | for_each_lower_domain(t) | ||
915 | t->flags |= SD_BALANCE_WAKE; | ||
916 | } | ||
917 | |||
918 | if (sd->flags & SD_SHARE_CPUCAPACITY) { | ||
919 | sd->flags |= SD_PREFER_SIBLING; | ||
920 | sd->imbalance_pct = 110; | ||
921 | sd->smt_gain = 1178; /* ~15% */ | ||
922 | |||
923 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
924 | sd->imbalance_pct = 117; | ||
925 | sd->cache_nice_tries = 1; | ||
926 | sd->busy_idx = 2; | ||
927 | |||
928 | #ifdef CONFIG_NUMA | ||
929 | } else if (sd->flags & SD_NUMA) { | ||
930 | sd->cache_nice_tries = 2; | ||
931 | sd->busy_idx = 3; | ||
932 | sd->idle_idx = 2; | ||
933 | |||
934 | sd->flags |= SD_SERIALIZE; | ||
935 | if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { | ||
936 | sd->flags &= ~(SD_BALANCE_EXEC | | ||
937 | SD_BALANCE_FORK | | ||
938 | SD_WAKE_AFFINE); | ||
939 | } | ||
940 | |||
941 | #endif | ||
942 | } else { | ||
943 | sd->flags |= SD_PREFER_SIBLING; | ||
944 | sd->cache_nice_tries = 1; | ||
945 | sd->busy_idx = 2; | ||
946 | sd->idle_idx = 1; | ||
947 | } | ||
948 | |||
949 | /* | ||
950 | * For all levels sharing cache; connect a sched_domain_shared | ||
951 | * instance. | ||
952 | */ | ||
953 | if (sd->flags & SD_SHARE_PKG_RESOURCES) { | ||
954 | sd->shared = *per_cpu_ptr(sdd->sds, sd_id); | ||
955 | atomic_inc(&sd->shared->ref); | ||
956 | atomic_set(&sd->shared->nr_busy_cpus, sd_weight); | ||
957 | } | ||
958 | |||
959 | sd->private = sdd; | ||
960 | |||
961 | return sd; | ||
962 | } | ||
963 | |||
964 | /* | ||
965 | * Topology list, bottom-up. | ||
966 | */ | ||
967 | static struct sched_domain_topology_level default_topology[] = { | ||
968 | #ifdef CONFIG_SCHED_SMT | ||
969 | { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, | ||
970 | #endif | ||
971 | #ifdef CONFIG_SCHED_MC | ||
972 | { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, | ||
973 | #endif | ||
974 | { cpu_cpu_mask, SD_INIT_NAME(DIE) }, | ||
975 | { NULL, }, | ||
976 | }; | ||
977 | |||
978 | static struct sched_domain_topology_level *sched_domain_topology = | ||
979 | default_topology; | ||
980 | |||
981 | #define for_each_sd_topology(tl) \ | ||
982 | for (tl = sched_domain_topology; tl->mask; tl++) | ||
983 | |||
984 | void set_sched_topology(struct sched_domain_topology_level *tl) | ||
985 | { | ||
986 | if (WARN_ON_ONCE(sched_smp_initialized)) | ||
987 | return; | ||
988 | |||
989 | sched_domain_topology = tl; | ||
990 | } | ||
991 | |||
992 | #ifdef CONFIG_NUMA | ||
993 | |||
994 | static const struct cpumask *sd_numa_mask(int cpu) | ||
995 | { | ||
996 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
997 | } | ||
998 | |||
999 | static void sched_numa_warn(const char *str) | ||
1000 | { | ||
1001 | static int done = false; | ||
1002 | int i,j; | ||
1003 | |||
1004 | if (done) | ||
1005 | return; | ||
1006 | |||
1007 | done = true; | ||
1008 | |||
1009 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
1010 | |||
1011 | for (i = 0; i < nr_node_ids; i++) { | ||
1012 | printk(KERN_WARNING " "); | ||
1013 | for (j = 0; j < nr_node_ids; j++) | ||
1014 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
1015 | printk(KERN_CONT "\n"); | ||
1016 | } | ||
1017 | printk(KERN_WARNING "\n"); | ||
1018 | } | ||
1019 | |||
1020 | bool find_numa_distance(int distance) | ||
1021 | { | ||
1022 | int i; | ||
1023 | |||
1024 | if (distance == node_distance(0, 0)) | ||
1025 | return true; | ||
1026 | |||
1027 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
1028 | if (sched_domains_numa_distance[i] == distance) | ||
1029 | return true; | ||
1030 | } | ||
1031 | |||
1032 | return false; | ||
1033 | } | ||
1034 | |||
1035 | /* | ||
1036 | * A system can have three types of NUMA topology: | ||
1037 | * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system | ||
1038 | * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes | ||
1039 | * NUMA_BACKPLANE: nodes can reach other nodes through a backplane | ||
1040 | * | ||
1041 | * The difference between a glueless mesh topology and a backplane | ||
1042 | * topology lies in whether communication between not directly | ||
1043 | * connected nodes goes through intermediary nodes (where programs | ||
1044 | * could run), or through backplane controllers. This affects | ||
1045 | * placement of programs. | ||
1046 | * | ||
1047 | * The type of topology can be discerned with the following tests: | ||
1048 | * - If the maximum distance between any nodes is 1 hop, the system | ||
1049 | * is directly connected. | ||
1050 | * - If for two nodes A and B, located N > 1 hops away from each other, | ||
1051 | * there is an intermediary node C, which is < N hops away from both | ||
1052 | * nodes A and B, the system is a glueless mesh. | ||
1053 | */ | ||
1054 | static void init_numa_topology_type(void) | ||
1055 | { | ||
1056 | int a, b, c, n; | ||
1057 | |||
1058 | n = sched_max_numa_distance; | ||
1059 | |||
1060 | if (sched_domains_numa_levels <= 1) { | ||
1061 | sched_numa_topology_type = NUMA_DIRECT; | ||
1062 | return; | ||
1063 | } | ||
1064 | |||
1065 | for_each_online_node(a) { | ||
1066 | for_each_online_node(b) { | ||
1067 | /* Find two nodes furthest removed from each other. */ | ||
1068 | if (node_distance(a, b) < n) | ||
1069 | continue; | ||
1070 | |||
1071 | /* Is there an intermediary node between a and b? */ | ||
1072 | for_each_online_node(c) { | ||
1073 | if (node_distance(a, c) < n && | ||
1074 | node_distance(b, c) < n) { | ||
1075 | sched_numa_topology_type = | ||
1076 | NUMA_GLUELESS_MESH; | ||
1077 | return; | ||
1078 | } | ||
1079 | } | ||
1080 | |||
1081 | sched_numa_topology_type = NUMA_BACKPLANE; | ||
1082 | return; | ||
1083 | } | ||
1084 | } | ||
1085 | } | ||
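A self-contained model of the classification above, applied to an invented 4-node distance table (the kernel reads node_distance() and only considers online nodes):

#include <stdio.h>

#define N 4

/* Invented distance table; this particular one is a glueless mesh. */
static const int dist[N][N] = {
        { 10, 20, 20, 30 },
        { 20, 10, 20, 20 },
        { 20, 20, 10, 20 },
        { 30, 20, 20, 10 },
};

/* Count distinct distances above the local one and report the maximum. */
static int count_levels(int *max_out)
{
        int uniq[N * N];
        int n = 0, max = 0;
        int i, j, k;

        for (i = 0; i < N; i++) {
                for (j = 0; j < N; j++) {
                        int d = dist[i][j];
                        int seen = 0;

                        if (d <= dist[i][i])
                                continue;
                        if (d > max)
                                max = d;
                        for (k = 0; k < n; k++) {
                                if (uniq[k] == d)
                                        seen = 1;
                        }
                        if (!seen)
                                uniq[n++] = d;
                }
        }
        *max_out = max;
        return n;
}

int main(void)
{
        int max, a, b, c;

        if (count_levels(&max) <= 1) {
                puts("NUMA_DIRECT");
                return 0;
        }

        for (a = 0; a < N; a++) {
                for (b = 0; b < N; b++) {
                        if (dist[a][b] < max)
                                continue;

                        /* A closer intermediary node makes this a mesh. */
                        for (c = 0; c < N; c++) {
                                if (dist[a][c] < max && dist[b][c] < max) {
                                        puts("NUMA_GLUELESS_MESH");
                                        return 0;
                                }
                        }
                        puts("NUMA_BACKPLANE");
                        return 0;
                }
        }
        puts("NUMA_DIRECT");
        return 0;
}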
1086 | |||
1087 | void sched_init_numa(void) | ||
1088 | { | ||
1089 | int next_distance, curr_distance = node_distance(0, 0); | ||
1090 | struct sched_domain_topology_level *tl; | ||
1091 | int level = 0; | ||
1092 | int i, j, k; | ||
1093 | |||
1094 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
1095 | if (!sched_domains_numa_distance) | ||
1096 | return; | ||
1097 | |||
1098 | /* | ||
1099 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
1100 | * unique distances in the node_distance() table. | ||
1101 | * | ||
1102 | * Assumes node_distance(0,j) includes all distances in | ||
1103 | * node_distance(i,j) in order to avoid cubic time. | ||
1104 | */ | ||
1105 | next_distance = curr_distance; | ||
1106 | for (i = 0; i < nr_node_ids; i++) { | ||
1107 | for (j = 0; j < nr_node_ids; j++) { | ||
1108 | for (k = 0; k < nr_node_ids; k++) { | ||
1109 | int distance = node_distance(i, k); | ||
1110 | |||
1111 | if (distance > curr_distance && | ||
1112 | (distance < next_distance || | ||
1113 | next_distance == curr_distance)) | ||
1114 | next_distance = distance; | ||
1115 | |||
1116 | /* | ||
1117 | * While not a strong assumption, it would be nice to know | ||
1118 | * about cases where node A is connected to B but B is not | ||
1119 | * equally connected to A. | ||
1120 | */ | ||
1121 | if (sched_debug() && node_distance(k, i) != distance) | ||
1122 | sched_numa_warn("Node-distance not symmetric"); | ||
1123 | |||
1124 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
1125 | sched_numa_warn("Node-0 not representative"); | ||
1126 | } | ||
1127 | if (next_distance != curr_distance) { | ||
1128 | sched_domains_numa_distance[level++] = next_distance; | ||
1129 | sched_domains_numa_levels = level; | ||
1130 | curr_distance = next_distance; | ||
1131 | } else break; | ||
1132 | } | ||
1133 | |||
1134 | /* | ||
1135 | * In case of sched_debug() we verify the above assumption. | ||
1136 | */ | ||
1137 | if (!sched_debug()) | ||
1138 | break; | ||
1139 | } | ||
1140 | |||
1141 | if (!level) | ||
1142 | return; | ||
1143 | |||
1144 | /* | ||
1145 | * 'level' contains the number of unique distances, excluding the | ||
1146 | * identity distance node_distance(i,i). | ||
1147 | * | ||
1148 | * The sched_domains_numa_distance[] array includes the actual distance | ||
1149 | * numbers. | ||
1150 | */ | ||
1151 | |||
1152 | /* | ||
1153 | * Here, we should temporarily reset sched_domains_numa_levels to 0. | ||
1154 | * If it fails to allocate memory for array sched_domains_numa_masks[][], | ||
1155 | * the array will contain fewer than 'level' members. This could be | ||
1156 | * dangerous when we use it to iterate array sched_domains_numa_masks[][] | ||
1157 | * in other functions. | ||
1158 | * | ||
1159 | * We reset it to 'level' at the end of this function. | ||
1160 | */ | ||
1161 | sched_domains_numa_levels = 0; | ||
1162 | |||
1163 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
1164 | if (!sched_domains_numa_masks) | ||
1165 | return; | ||
1166 | |||
1167 | /* | ||
1168 | * Now for each level, construct a mask per node which contains all | ||
1169 | * CPUs of nodes that are that many hops away from us. | ||
1170 | */ | ||
1171 | for (i = 0; i < level; i++) { | ||
1172 | sched_domains_numa_masks[i] = | ||
1173 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
1174 | if (!sched_domains_numa_masks[i]) | ||
1175 | return; | ||
1176 | |||
1177 | for (j = 0; j < nr_node_ids; j++) { | ||
1178 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
1179 | if (!mask) | ||
1180 | return; | ||
1181 | |||
1182 | sched_domains_numa_masks[i][j] = mask; | ||
1183 | |||
1184 | for_each_node(k) { | ||
1185 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
1186 | continue; | ||
1187 | |||
1188 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
1189 | } | ||
1190 | } | ||
1191 | } | ||
1192 | |||
1193 | /* Compute default topology size */ | ||
1194 | for (i = 0; sched_domain_topology[i].mask; i++); | ||
1195 | |||
1196 | tl = kzalloc((i + level + 1) * | ||
1197 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
1198 | if (!tl) | ||
1199 | return; | ||
1200 | |||
1201 | /* | ||
1202 | * Copy the default topology bits.. | ||
1203 | */ | ||
1204 | for (i = 0; sched_domain_topology[i].mask; i++) | ||
1205 | tl[i] = sched_domain_topology[i]; | ||
1206 | |||
1207 | /* | ||
1208 | * .. and append 'j' levels of NUMA goodness. | ||
1209 | */ | ||
1210 | for (j = 0; j < level; i++, j++) { | ||
1211 | tl[i] = (struct sched_domain_topology_level){ | ||
1212 | .mask = sd_numa_mask, | ||
1213 | .sd_flags = cpu_numa_flags, | ||
1214 | .flags = SDTL_OVERLAP, | ||
1215 | .numa_level = j, | ||
1216 | SD_INIT_NAME(NUMA) | ||
1217 | }; | ||
1218 | } | ||
1219 | |||
1220 | sched_domain_topology = tl; | ||
1221 | |||
1222 | sched_domains_numa_levels = level; | ||
1223 | sched_max_numa_distance = sched_domains_numa_distance[level - 1]; | ||
1224 | |||
1225 | init_numa_topology_type(); | ||
1226 | } | ||
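The distance-level extraction at the top of sched_init_numa() is easier to see in isolation: starting from the local distance, repeatedly pick the smallest distance strictly greater than the current one. A runnable model on a small invented table:

#include <stdio.h>

#define N 3

/* Invented symmetric distance table; 10 is the local (identity) distance. */
static const int dist[N][N] = {
        { 10, 20, 30 },
        { 20, 10, 20 },
        { 30, 20, 10 },
};

int main(void)
{
        int curr = dist[0][0];
        int level = 0;
        int i, j;

        for (;;) {
                int next = curr;

                for (i = 0; i < N; i++) {
                        for (j = 0; j < N; j++) {
                                if (dist[i][j] > curr &&
                                    (dist[i][j] < next || next == curr))
                                        next = dist[i][j];
                        }
                }
                if (next == curr)
                        break;          /* nothing left above 'curr' */

                printf("level %d: distance %d\n", level++, next);
                curr = next;
        }
        return 0;
}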
1227 | |||
1228 | void sched_domains_numa_masks_set(unsigned int cpu) | ||
1229 | { | ||
1230 | int node = cpu_to_node(cpu); | ||
1231 | int i, j; | ||
1232 | |||
1233 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
1234 | for (j = 0; j < nr_node_ids; j++) { | ||
1235 | if (node_distance(j, node) <= sched_domains_numa_distance[i]) | ||
1236 | cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
1237 | } | ||
1238 | } | ||
1239 | } | ||
1240 | |||
1241 | void sched_domains_numa_masks_clear(unsigned int cpu) | ||
1242 | { | ||
1243 | int i, j; | ||
1244 | |||
1245 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
1246 | for (j = 0; j < nr_node_ids; j++) | ||
1247 | cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); | ||
1248 | } | ||
1249 | } | ||
1250 | |||
1251 | #endif /* CONFIG_NUMA */ | ||
1252 | |||
1253 | static int __sdt_alloc(const struct cpumask *cpu_map) | ||
1254 | { | ||
1255 | struct sched_domain_topology_level *tl; | ||
1256 | int j; | ||
1257 | |||
1258 | for_each_sd_topology(tl) { | ||
1259 | struct sd_data *sdd = &tl->data; | ||
1260 | |||
1261 | sdd->sd = alloc_percpu(struct sched_domain *); | ||
1262 | if (!sdd->sd) | ||
1263 | return -ENOMEM; | ||
1264 | |||
1265 | sdd->sds = alloc_percpu(struct sched_domain_shared *); | ||
1266 | if (!sdd->sds) | ||
1267 | return -ENOMEM; | ||
1268 | |||
1269 | sdd->sg = alloc_percpu(struct sched_group *); | ||
1270 | if (!sdd->sg) | ||
1271 | return -ENOMEM; | ||
1272 | |||
1273 | sdd->sgc = alloc_percpu(struct sched_group_capacity *); | ||
1274 | if (!sdd->sgc) | ||
1275 | return -ENOMEM; | ||
1276 | |||
1277 | for_each_cpu(j, cpu_map) { | ||
1278 | struct sched_domain *sd; | ||
1279 | struct sched_domain_shared *sds; | ||
1280 | struct sched_group *sg; | ||
1281 | struct sched_group_capacity *sgc; | ||
1282 | |||
1283 | sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), | ||
1284 | GFP_KERNEL, cpu_to_node(j)); | ||
1285 | if (!sd) | ||
1286 | return -ENOMEM; | ||
1287 | |||
1288 | *per_cpu_ptr(sdd->sd, j) = sd; | ||
1289 | |||
1290 | sds = kzalloc_node(sizeof(struct sched_domain_shared), | ||
1291 | GFP_KERNEL, cpu_to_node(j)); | ||
1292 | if (!sds) | ||
1293 | return -ENOMEM; | ||
1294 | |||
1295 | *per_cpu_ptr(sdd->sds, j) = sds; | ||
1296 | |||
1297 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
1298 | GFP_KERNEL, cpu_to_node(j)); | ||
1299 | if (!sg) | ||
1300 | return -ENOMEM; | ||
1301 | |||
1302 | sg->next = sg; | ||
1303 | |||
1304 | *per_cpu_ptr(sdd->sg, j) = sg; | ||
1305 | |||
1306 | sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), | ||
1307 | GFP_KERNEL, cpu_to_node(j)); | ||
1308 | if (!sgc) | ||
1309 | return -ENOMEM; | ||
1310 | |||
1311 | *per_cpu_ptr(sdd->sgc, j) = sgc; | ||
1312 | } | ||
1313 | } | ||
1314 | |||
1315 | return 0; | ||
1316 | } | ||
1317 | |||
1318 | static void __sdt_free(const struct cpumask *cpu_map) | ||
1319 | { | ||
1320 | struct sched_domain_topology_level *tl; | ||
1321 | int j; | ||
1322 | |||
1323 | for_each_sd_topology(tl) { | ||
1324 | struct sd_data *sdd = &tl->data; | ||
1325 | |||
1326 | for_each_cpu(j, cpu_map) { | ||
1327 | struct sched_domain *sd; | ||
1328 | |||
1329 | if (sdd->sd) { | ||
1330 | sd = *per_cpu_ptr(sdd->sd, j); | ||
1331 | if (sd && (sd->flags & SD_OVERLAP)) | ||
1332 | free_sched_groups(sd->groups, 0); | ||
1333 | kfree(*per_cpu_ptr(sdd->sd, j)); | ||
1334 | } | ||
1335 | |||
1336 | if (sdd->sds) | ||
1337 | kfree(*per_cpu_ptr(sdd->sds, j)); | ||
1338 | if (sdd->sg) | ||
1339 | kfree(*per_cpu_ptr(sdd->sg, j)); | ||
1340 | if (sdd->sgc) | ||
1341 | kfree(*per_cpu_ptr(sdd->sgc, j)); | ||
1342 | } | ||
1343 | free_percpu(sdd->sd); | ||
1344 | sdd->sd = NULL; | ||
1345 | free_percpu(sdd->sds); | ||
1346 | sdd->sds = NULL; | ||
1347 | free_percpu(sdd->sg); | ||
1348 | sdd->sg = NULL; | ||
1349 | free_percpu(sdd->sgc); | ||
1350 | sdd->sgc = NULL; | ||
1351 | } | ||
1352 | } | ||
1353 | |||
1354 | struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | ||
1355 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
1356 | struct sched_domain *child, int cpu) | ||
1357 | { | ||
1358 | struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); | ||
1359 | |||
1360 | if (child) { | ||
1361 | sd->level = child->level + 1; | ||
1362 | sched_domain_level_max = max(sched_domain_level_max, sd->level); | ||
1363 | child->parent = sd; | ||
1364 | |||
1365 | if (!cpumask_subset(sched_domain_span(child), | ||
1366 | sched_domain_span(sd))) { | ||
1367 | pr_err("BUG: arch topology borken\n"); | ||
1368 | #ifdef CONFIG_SCHED_DEBUG | ||
1369 | pr_err(" the %s domain not a subset of the %s domain\n", | ||
1370 | child->name, sd->name); | ||
1371 | #endif | ||
1372 | /* Fixup, ensure @sd has at least @child cpus. */ | ||
1373 | cpumask_or(sched_domain_span(sd), | ||
1374 | sched_domain_span(sd), | ||
1375 | sched_domain_span(child)); | ||
1376 | } | ||
1377 | |||
1378 | } | ||
1379 | set_domain_attribute(sd, attr); | ||
1380 | |||
1381 | return sd; | ||
1382 | } | ||
1383 | |||
1384 | /* | ||
1385 | * Build sched domains for a given set of CPUs and attach the sched domains | ||
1386 | * to the individual CPUs | ||
1387 | */ | ||
1388 | static int | ||
1389 | build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) | ||
1390 | { | ||
1391 | enum s_alloc alloc_state; | ||
1392 | struct sched_domain *sd; | ||
1393 | struct s_data d; | ||
1394 | struct rq *rq = NULL; | ||
1395 | int i, ret = -ENOMEM; | ||
1396 | |||
1397 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); | ||
1398 | if (alloc_state != sa_rootdomain) | ||
1399 | goto error; | ||
1400 | |||
1401 | /* Set up domains for CPUs specified by the cpu_map: */ | ||
1402 | for_each_cpu(i, cpu_map) { | ||
1403 | struct sched_domain_topology_level *tl; | ||
1404 | |||
1405 | sd = NULL; | ||
1406 | for_each_sd_topology(tl) { | ||
1407 | sd = build_sched_domain(tl, cpu_map, attr, sd, i); | ||
1408 | if (tl == sched_domain_topology) | ||
1409 | *per_cpu_ptr(d.sd, i) = sd; | ||
1410 | if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) | ||
1411 | sd->flags |= SD_OVERLAP; | ||
1412 | if (cpumask_equal(cpu_map, sched_domain_span(sd))) | ||
1413 | break; | ||
1414 | } | ||
1415 | } | ||
1416 | |||
1417 | /* Build the groups for the domains */ | ||
1418 | for_each_cpu(i, cpu_map) { | ||
1419 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
1420 | sd->span_weight = cpumask_weight(sched_domain_span(sd)); | ||
1421 | if (sd->flags & SD_OVERLAP) { | ||
1422 | if (build_overlap_sched_groups(sd, i)) | ||
1423 | goto error; | ||
1424 | } else { | ||
1425 | if (build_sched_groups(sd, i)) | ||
1426 | goto error; | ||
1427 | } | ||
1428 | } | ||
1429 | } | ||
1430 | |||
1431 | /* Calculate CPU capacity for physical packages and nodes */ | ||
1432 | for (i = nr_cpumask_bits-1; i >= 0; i--) { | ||
1433 | if (!cpumask_test_cpu(i, cpu_map)) | ||
1434 | continue; | ||
1435 | |||
1436 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { | ||
1437 | claim_allocations(i, sd); | ||
1438 | init_sched_groups_capacity(i, sd); | ||
1439 | } | ||
1440 | } | ||
1441 | |||
1442 | /* Attach the domains */ | ||
1443 | rcu_read_lock(); | ||
1444 | for_each_cpu(i, cpu_map) { | ||
1445 | rq = cpu_rq(i); | ||
1446 | sd = *per_cpu_ptr(d.sd, i); | ||
1447 | |||
1448 | /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ | ||
1449 | if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) | ||
1450 | WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); | ||
1451 | |||
1452 | cpu_attach_domain(sd, d.rd, i); | ||
1453 | } | ||
1454 | rcu_read_unlock(); | ||
1455 | |||
1456 | if (rq && sched_debug_enabled) { | ||
1457 | pr_info("span: %*pbl (max cpu_capacity = %lu)\n", | ||
1458 | cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); | ||
1459 | } | ||
1460 | |||
1461 | ret = 0; | ||
1462 | error: | ||
1463 | __free_domain_allocs(&d, alloc_state, cpu_map); | ||
1464 | return ret; | ||
1465 | } | ||
1466 | |||
1467 | /* Current sched domains: */ | ||
1468 | static cpumask_var_t *doms_cur; | ||
1469 | |||
1470 | /* Number of sched domains in 'doms_cur': */ | ||
1471 | static int ndoms_cur; | ||
1472 | |||
1473 | /* Attributes of custom domains in 'doms_cur' */ | ||
1474 | static struct sched_domain_attr *dattr_cur; | ||
1475 | |||
1476 | /* | ||
1477 | * Special case: If a kmalloc() of a doms_cur partition (array of | ||
1478 | * cpumask) fails, then fallback to a single sched domain, | ||
1479 | * as determined by the single cpumask fallback_doms. | ||
1480 | */ | ||
1481 | cpumask_var_t fallback_doms; | ||
1482 | |||
1483 | /* | ||
1484 | * arch_update_cpu_topology lets virtualized architectures update the | ||
1485 | * CPU core maps. It is supposed to return 1 if the topology changed | ||
1486 | * or 0 if it stayed the same. | ||
1487 | */ | ||
1488 | int __weak arch_update_cpu_topology(void) | ||
1489 | { | ||
1490 | return 0; | ||
1491 | } | ||
1492 | |||
1493 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) | ||
1494 | { | ||
1495 | int i; | ||
1496 | cpumask_var_t *doms; | ||
1497 | |||
1498 | doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); | ||
1499 | if (!doms) | ||
1500 | return NULL; | ||
1501 | for (i = 0; i < ndoms; i++) { | ||
1502 | if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { | ||
1503 | free_sched_domains(doms, i); | ||
1504 | return NULL; | ||
1505 | } | ||
1506 | } | ||
1507 | return doms; | ||
1508 | } | ||
1509 | |||
1510 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | ||
1511 | { | ||
1512 | unsigned int i; | ||
1513 | for (i = 0; i < ndoms; i++) | ||
1514 | free_cpumask_var(doms[i]); | ||
1515 | kfree(doms); | ||
1516 | } | ||
1517 | |||
1518 | /* | ||
1519 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
1520 | * For now this just excludes isolated CPUs, but could be used to | ||
1521 | * exclude other special cases in the future. | ||
1522 | */ | ||
1523 | int init_sched_domains(const struct cpumask *cpu_map) | ||
1524 | { | ||
1525 | int err; | ||
1526 | |||
1527 | arch_update_cpu_topology(); | ||
1528 | ndoms_cur = 1; | ||
1529 | doms_cur = alloc_sched_domains(ndoms_cur); | ||
1530 | if (!doms_cur) | ||
1531 | doms_cur = &fallback_doms; | ||
1532 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | ||
1533 | err = build_sched_domains(doms_cur[0], NULL); | ||
1534 | register_sched_domain_sysctl(); | ||
1535 | |||
1536 | return err; | ||
1537 | } | ||
1538 | |||
1539 | /* | ||
1540 | * Detach sched domains from a group of CPUs specified in cpu_map | ||
1541 | * These CPUs will now be attached to the NULL domain | ||
1542 | */ | ||
1543 | static void detach_destroy_domains(const struct cpumask *cpu_map) | ||
1544 | { | ||
1545 | int i; | ||
1546 | |||
1547 | rcu_read_lock(); | ||
1548 | for_each_cpu(i, cpu_map) | ||
1549 | cpu_attach_domain(NULL, &def_root_domain, i); | ||
1550 | rcu_read_unlock(); | ||
1551 | } | ||
1552 | |||
1553 | /* handle null as "default" */ | ||
1554 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | ||
1555 | struct sched_domain_attr *new, int idx_new) | ||
1556 | { | ||
1557 | struct sched_domain_attr tmp; | ||
1558 | |||
1559 | /* Fast path: */ | ||
1560 | if (!new && !cur) | ||
1561 | return 1; | ||
1562 | |||
1563 | tmp = SD_ATTR_INIT; | ||
1564 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | ||
1565 | new ? (new + idx_new) : &tmp, | ||
1566 | sizeof(struct sched_domain_attr)); | ||
1567 | } | ||
1568 | |||
1569 | /* | ||
1570 | * Partition sched domains as specified by the 'ndoms_new' | ||
1571 | * cpumasks in the array doms_new[] of cpumasks. This compares | ||
1572 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | ||
1573 | * It destroys each deleted domain and builds each new domain. | ||
1574 | * | ||
1575 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. | ||
1576 | * The masks don't intersect (don't overlap). We should set up one | ||
1577 | * sched domain for each mask. CPUs not in any of the cpumasks will | ||
1578 | * not be load balanced. If the same cpumask appears both in the | ||
1579 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | ||
1580 | * it as it is. | ||
1581 | * | ||
1582 | * The passed-in 'doms_new' should be allocated using | ||
1583 | * alloc_sched_domains. This routine takes ownership of it and will | ||
1584 | * free_sched_domains it when done with it. If the caller failed the | ||
1585 | * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, | ||
1586 | * and partition_sched_domains() will fall back to the single partition | ||
1587 | * 'fallback_doms'; this also forces the domains to be rebuilt. | ||
1588 | * | ||
1589 | * If doms_new == NULL it will be replaced with cpu_online_mask. | ||
1590 | * ndoms_new == 0 is a special case for destroying existing domains, | ||
1591 | * and it will not create the default domain. | ||
1592 | * | ||
1593 | * Call with hotplug lock held | ||
1594 | */ | ||
1595 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], | ||
1596 | struct sched_domain_attr *dattr_new) | ||
1597 | { | ||
1598 | int i, j, n; | ||
1599 | int new_topology; | ||
1600 | |||
1601 | mutex_lock(&sched_domains_mutex); | ||
1602 | |||
1603 | /* Always unregister in case we don't destroy any domains: */ | ||
1604 | unregister_sched_domain_sysctl(); | ||
1605 | |||
1606 | /* Let the architecture update CPU core mappings: */ | ||
1607 | new_topology = arch_update_cpu_topology(); | ||
1608 | |||
1609 | n = doms_new ? ndoms_new : 0; | ||
1610 | |||
1611 | /* Destroy deleted domains: */ | ||
1612 | for (i = 0; i < ndoms_cur; i++) { | ||
1613 | for (j = 0; j < n && !new_topology; j++) { | ||
1614 | if (cpumask_equal(doms_cur[i], doms_new[j]) | ||
1615 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | ||
1616 | goto match1; | ||
1617 | } | ||
1618 | /* No match - a current sched domain not in new doms_new[] */ | ||
1619 | detach_destroy_domains(doms_cur[i]); | ||
1620 | match1: | ||
1621 | ; | ||
1622 | } | ||
1623 | |||
1624 | n = ndoms_cur; | ||
1625 | if (doms_new == NULL) { | ||
1626 | n = 0; | ||
1627 | doms_new = &fallback_doms; | ||
1628 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); | ||
1629 | WARN_ON_ONCE(dattr_new); | ||
1630 | } | ||
1631 | |||
1632 | /* Build new domains: */ | ||
1633 | for (i = 0; i < ndoms_new; i++) { | ||
1634 | for (j = 0; j < n && !new_topology; j++) { | ||
1635 | if (cpumask_equal(doms_new[i], doms_cur[j]) | ||
1636 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | ||
1637 | goto match2; | ||
1638 | } | ||
1639 | /* No match - add a new doms_new */ | ||
1640 | build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); | ||
1641 | match2: | ||
1642 | ; | ||
1643 | } | ||
1644 | |||
1645 | /* Remember the new sched domains: */ | ||
1646 | if (doms_cur != &fallback_doms) | ||
1647 | free_sched_domains(doms_cur, ndoms_cur); | ||
1648 | |||
1649 | kfree(dattr_cur); | ||
1650 | doms_cur = doms_new; | ||
1651 | dattr_cur = dattr_new; | ||
1652 | ndoms_cur = ndoms_new; | ||
1653 | |||
1654 | register_sched_domain_sysctl(); | ||
1655 | |||
1656 | mutex_unlock(&sched_domains_mutex); | ||
1657 | } | ||
1658 | |||
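partition_sched_domains() reconciles the old and new partition sets with the two match loops above: current domains with no counterpart are destroyed, new ones are built, and exact matches are left alone. A userspace model of just that reconciliation, with partitions represented as plain CPU bitmasks (values invented):

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* Invented partitions: each unsigned long is a bitmask of CPUs. */
static const unsigned long doms_cur[] = { 0x0f, 0xf0 };        /* CPUs 0-3 and 4-7 */
static const unsigned long doms_new[] = { 0x0f, 0xc0, 0x30 };  /* 0-3 kept, 4-7 split */

int main(void)
{
        unsigned int i, j;

        /* Destroy current partitions that no longer appear. */
        for (i = 0; i < ARRAY_SIZE(doms_cur); i++) {
                for (j = 0; j < ARRAY_SIZE(doms_new); j++) {
                        if (doms_cur[i] == doms_new[j])
                                goto match1;
                }
                printf("destroy domains for %#lx\n", doms_cur[i]);
match1:
                ;
        }

        /* Build new partitions that were not there before. */
        for (i = 0; i < ARRAY_SIZE(doms_new); i++) {
                for (j = 0; j < ARRAY_SIZE(doms_cur); j++) {
                        if (doms_new[i] == doms_cur[j])
                                goto match2;
                }
                printf("build domains for %#lx\n", doms_new[i]);
match2:
                ;
        }
        return 0;
}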
diff --git a/kernel/signal.c b/kernel/signal.c index 3603d93a1968..13f9def8b24a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -1581,7 +1581,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
1581 | unsigned long flags; | 1581 | unsigned long flags; |
1582 | struct sighand_struct *psig; | 1582 | struct sighand_struct *psig; |
1583 | bool autoreap = false; | 1583 | bool autoreap = false; |
1584 | cputime_t utime, stime; | 1584 | u64 utime, stime; |
1585 | 1585 | ||
1586 | BUG_ON(sig == -1); | 1586 | BUG_ON(sig == -1); |
1587 | 1587 | ||
@@ -1620,8 +1620,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
1620 | rcu_read_unlock(); | 1620 | rcu_read_unlock(); |
1621 | 1621 | ||
1622 | task_cputime(tsk, &utime, &stime); | 1622 | task_cputime(tsk, &utime, &stime); |
1623 | info.si_utime = cputime_to_clock_t(utime + tsk->signal->utime); | 1623 | info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime); |
1624 | info.si_stime = cputime_to_clock_t(stime + tsk->signal->stime); | 1624 | info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime); |
1625 | 1625 | ||
1626 | info.si_status = tsk->exit_code & 0x7f; | 1626 | info.si_status = tsk->exit_code & 0x7f; |
1627 | if (tsk->exit_code & 0x80) | 1627 | if (tsk->exit_code & 0x80) |
@@ -1685,7 +1685,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
1685 | unsigned long flags; | 1685 | unsigned long flags; |
1686 | struct task_struct *parent; | 1686 | struct task_struct *parent; |
1687 | struct sighand_struct *sighand; | 1687 | struct sighand_struct *sighand; |
1688 | cputime_t utime, stime; | 1688 | u64 utime, stime; |
1689 | 1689 | ||
1690 | if (for_ptracer) { | 1690 | if (for_ptracer) { |
1691 | parent = tsk->parent; | 1691 | parent = tsk->parent; |
@@ -1705,8 +1705,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, | |||
1705 | rcu_read_unlock(); | 1705 | rcu_read_unlock(); |
1706 | 1706 | ||
1707 | task_cputime(tsk, &utime, &stime); | 1707 | task_cputime(tsk, &utime, &stime); |
1708 | info.si_utime = cputime_to_clock_t(utime); | 1708 | info.si_utime = nsec_to_clock_t(utime); |
1709 | info.si_stime = cputime_to_clock_t(stime); | 1709 | info.si_stime = nsec_to_clock_t(stime); |
1710 | 1710 | ||
1711 | info.si_code = why; | 1711 | info.si_code = why; |
1712 | switch (why) { | 1712 | switch (why) { |
diff --git a/kernel/sys.c b/kernel/sys.c index 842914ef7de4..7d4a9a6df956 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -881,15 +881,15 @@ SYSCALL_DEFINE0(getegid) | |||
881 | 881 | ||
882 | void do_sys_times(struct tms *tms) | 882 | void do_sys_times(struct tms *tms) |
883 | { | 883 | { |
884 | cputime_t tgutime, tgstime, cutime, cstime; | 884 | u64 tgutime, tgstime, cutime, cstime; |
885 | 885 | ||
886 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); | 886 | thread_group_cputime_adjusted(current, &tgutime, &tgstime); |
887 | cutime = current->signal->cutime; | 887 | cutime = current->signal->cutime; |
888 | cstime = current->signal->cstime; | 888 | cstime = current->signal->cstime; |
889 | tms->tms_utime = cputime_to_clock_t(tgutime); | 889 | tms->tms_utime = nsec_to_clock_t(tgutime); |
890 | tms->tms_stime = cputime_to_clock_t(tgstime); | 890 | tms->tms_stime = nsec_to_clock_t(tgstime); |
891 | tms->tms_cutime = cputime_to_clock_t(cutime); | 891 | tms->tms_cutime = nsec_to_clock_t(cutime); |
892 | tms->tms_cstime = cputime_to_clock_t(cstime); | 892 | tms->tms_cstime = nsec_to_clock_t(cstime); |
893 | } | 893 | } |
894 | 894 | ||
895 | SYSCALL_DEFINE1(times, struct tms __user *, tbuf) | 895 | SYSCALL_DEFINE1(times, struct tms __user *, tbuf) |
@@ -1544,7 +1544,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1544 | { | 1544 | { |
1545 | struct task_struct *t; | 1545 | struct task_struct *t; |
1546 | unsigned long flags; | 1546 | unsigned long flags; |
1547 | cputime_t tgutime, tgstime, utime, stime; | 1547 | u64 tgutime, tgstime, utime, stime; |
1548 | unsigned long maxrss = 0; | 1548 | unsigned long maxrss = 0; |
1549 | 1549 | ||
1550 | memset((char *)r, 0, sizeof (*r)); | 1550 | memset((char *)r, 0, sizeof (*r)); |
@@ -1600,8 +1600,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1600 | unlock_task_sighand(p, &flags); | 1600 | unlock_task_sighand(p, &flags); |
1601 | 1601 | ||
1602 | out: | 1602 | out: |
1603 | cputime_to_timeval(utime, &r->ru_utime); | 1603 | r->ru_utime = ns_to_timeval(utime); |
1604 | cputime_to_timeval(stime, &r->ru_stime); | 1604 | r->ru_stime = ns_to_timeval(stime); |
1605 | 1605 | ||
1606 | if (who != RUSAGE_CHILDREN) { | 1606 | if (who != RUSAGE_CHILDREN) { |
1607 | struct mm_struct *mm = get_task_mm(p); | 1607 | struct mm_struct *mm = get_task_mm(p); |
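do_sys_times() follows the same nsec_to_clock_t() pattern, while k_getrusage() needs an ns_to_timeval() split instead. A rough userspace sketch of that split; the helper is a simplified, non-negative-only stand-in for the kernel's ns_to_timeval().

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/time.h>

    #define NSEC_PER_SEC  1000000000ULL
    #define NSEC_PER_USEC 1000ULL

    /* simplified stand-in: assumes a non-negative nanosecond count */
    static struct timeval ns_to_timeval_sketch(uint64_t ns)
    {
            struct timeval tv;

            tv.tv_sec  = ns / NSEC_PER_SEC;
            tv.tv_usec = (ns % NSEC_PER_SEC) / NSEC_PER_USEC;
            return tv;
    }

    int main(void)
    {
            struct timeval tv = ns_to_timeval_sketch(2345678901ULL);

            printf("ru_utime = %ld s %ld us\n",
                   (long)tv.tv_sec, (long)tv.tv_usec);
            return 0;
    }

2,345,678,901 ns of CPU time lands in ru_utime as 2 s 345678 us.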
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1aea594a54db..bb260ceb3718 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -416,7 +416,7 @@ static struct ctl_table kern_table[] = { | |||
416 | }, | 416 | }, |
417 | { | 417 | { |
418 | .procname = "sched_rr_timeslice_ms", | 418 | .procname = "sched_rr_timeslice_ms", |
419 | .data = &sched_rr_timeslice, | 419 | .data = &sysctl_sched_rr_timeslice, |
420 | .maxlen = sizeof(int), | 420 | .maxlen = sizeof(int), |
421 | .mode = 0644, | 421 | .mode = 0644, |
422 | .proc_handler = sched_rr_handler, | 422 | .proc_handler = sched_rr_handler, |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 665985b0a89a..93621ae718d3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -141,6 +141,10 @@ static void __clocksource_unstable(struct clocksource *cs) | |||
141 | { | 141 | { |
142 | cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); | 142 | cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); |
143 | cs->flags |= CLOCK_SOURCE_UNSTABLE; | 143 | cs->flags |= CLOCK_SOURCE_UNSTABLE; |
144 | |||
145 | if (cs->mark_unstable) | ||
146 | cs->mark_unstable(cs); | ||
147 | |||
144 | if (finished_booting) | 148 | if (finished_booting) |
145 | schedule_work(&watchdog_work); | 149 | schedule_work(&watchdog_work); |
146 | } | 150 | } |
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 8c89143f9ebf..a95f13c31464 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c | |||
@@ -45,16 +45,16 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) | |||
45 | static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, | 45 | static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, |
46 | struct itimerval *const value) | 46 | struct itimerval *const value) |
47 | { | 47 | { |
48 | cputime_t cval, cinterval; | 48 | u64 val, interval; |
49 | struct cpu_itimer *it = &tsk->signal->it[clock_id]; | 49 | struct cpu_itimer *it = &tsk->signal->it[clock_id]; |
50 | 50 | ||
51 | spin_lock_irq(&tsk->sighand->siglock); | 51 | spin_lock_irq(&tsk->sighand->siglock); |
52 | 52 | ||
53 | cval = it->expires; | 53 | val = it->expires; |
54 | cinterval = it->incr; | 54 | interval = it->incr; |
55 | if (cval) { | 55 | if (val) { |
56 | struct task_cputime cputime; | 56 | struct task_cputime cputime; |
57 | cputime_t t; | 57 | u64 t; |
58 | 58 | ||
59 | thread_group_cputimer(tsk, &cputime); | 59 | thread_group_cputimer(tsk, &cputime); |
60 | if (clock_id == CPUCLOCK_PROF) | 60 | if (clock_id == CPUCLOCK_PROF) |
@@ -63,17 +63,17 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, | |||
63 | /* CPUCLOCK_VIRT */ | 63 | /* CPUCLOCK_VIRT */ |
64 | t = cputime.utime; | 64 | t = cputime.utime; |
65 | 65 | ||
66 | if (cval < t) | 66 | if (val < t) |
67 | /* about to fire */ | 67 | /* about to fire */ |
68 | cval = cputime_one_jiffy; | 68 | val = TICK_NSEC; |
69 | else | 69 | else |
70 | cval = cval - t; | 70 | val -= t; |
71 | } | 71 | } |
72 | 72 | ||
73 | spin_unlock_irq(&tsk->sighand->siglock); | 73 | spin_unlock_irq(&tsk->sighand->siglock); |
74 | 74 | ||
75 | cputime_to_timeval(cval, &value->it_value); | 75 | value->it_value = ns_to_timeval(val); |
76 | cputime_to_timeval(cinterval, &value->it_interval); | 76 | value->it_interval = ns_to_timeval(interval); |
77 | } | 77 | } |
78 | 78 | ||
79 | int do_getitimer(int which, struct itimerval *value) | 79 | int do_getitimer(int which, struct itimerval *value) |
@@ -129,55 +129,35 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) | |||
129 | return HRTIMER_NORESTART; | 129 | return HRTIMER_NORESTART; |
130 | } | 130 | } |
131 | 131 | ||
132 | static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) | ||
133 | { | ||
134 | struct timespec ts; | ||
135 | s64 cpu_ns; | ||
136 | |||
137 | cputime_to_timespec(ct, &ts); | ||
138 | cpu_ns = timespec_to_ns(&ts); | ||
139 | |||
140 | return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; | ||
141 | } | ||
142 | |||
143 | static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, | 132 | static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, |
144 | const struct itimerval *const value, | 133 | const struct itimerval *const value, |
145 | struct itimerval *const ovalue) | 134 | struct itimerval *const ovalue) |
146 | { | 135 | { |
147 | cputime_t cval, nval, cinterval, ninterval; | 136 | u64 oval, nval, ointerval, ninterval; |
148 | s64 ns_ninterval, ns_nval; | ||
149 | u32 error, incr_error; | ||
150 | struct cpu_itimer *it = &tsk->signal->it[clock_id]; | 137 | struct cpu_itimer *it = &tsk->signal->it[clock_id]; |
151 | 138 | ||
152 | nval = timeval_to_cputime(&value->it_value); | 139 | nval = timeval_to_ns(&value->it_value); |
153 | ns_nval = timeval_to_ns(&value->it_value); | 140 | ninterval = timeval_to_ns(&value->it_interval); |
154 | ninterval = timeval_to_cputime(&value->it_interval); | ||
155 | ns_ninterval = timeval_to_ns(&value->it_interval); | ||
156 | |||
157 | error = cputime_sub_ns(nval, ns_nval); | ||
158 | incr_error = cputime_sub_ns(ninterval, ns_ninterval); | ||
159 | 141 | ||
160 | spin_lock_irq(&tsk->sighand->siglock); | 142 | spin_lock_irq(&tsk->sighand->siglock); |
161 | 143 | ||
162 | cval = it->expires; | 144 | oval = it->expires; |
163 | cinterval = it->incr; | 145 | ointerval = it->incr; |
164 | if (cval || nval) { | 146 | if (oval || nval) { |
165 | if (nval > 0) | 147 | if (nval > 0) |
166 | nval += cputime_one_jiffy; | 148 | nval += TICK_NSEC; |
167 | set_process_cpu_timer(tsk, clock_id, &nval, &cval); | 149 | set_process_cpu_timer(tsk, clock_id, &nval, &oval); |
168 | } | 150 | } |
169 | it->expires = nval; | 151 | it->expires = nval; |
170 | it->incr = ninterval; | 152 | it->incr = ninterval; |
171 | it->error = error; | ||
172 | it->incr_error = incr_error; | ||
173 | trace_itimer_state(clock_id == CPUCLOCK_VIRT ? | 153 | trace_itimer_state(clock_id == CPUCLOCK_VIRT ? |
174 | ITIMER_VIRTUAL : ITIMER_PROF, value, nval); | 154 | ITIMER_VIRTUAL : ITIMER_PROF, value, nval); |
175 | 155 | ||
176 | spin_unlock_irq(&tsk->sighand->siglock); | 156 | spin_unlock_irq(&tsk->sighand->siglock); |
177 | 157 | ||
178 | if (ovalue) { | 158 | if (ovalue) { |
179 | cputime_to_timeval(cval, &ovalue->it_value); | 159 | ovalue->it_value = ns_to_timeval(oval); |
180 | cputime_to_timeval(cinterval, &ovalue->it_interval); | 160 | ovalue->it_interval = ns_to_timeval(ointerval); |
181 | } | 161 | } |
182 | } | 162 | } |
183 | 163 | ||
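The itimer rewrite is more than a type substitution: while it->expires was a jiffies-granular cputime_t, a microsecond itimerval had to be rounded to whole ticks and the dropped it->error/it->incr_error fields accumulated the remainder; with the state in u64 nanoseconds, timeval_to_ns() is exact and the compensation machinery goes away. A userspace sketch of the simplified arming arithmetic; HZ=100 (TICK_NSEC = 10 ms) and the helper name are assumptions for illustration.

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/time.h>

    #define NSEC_PER_SEC  1000000000ULL
    #define NSEC_PER_USEC 1000ULL
    #define TICK_NSEC     10000000ULL   /* assumes HZ=100 */

    static uint64_t timeval_to_ns_sketch(const struct timeval *tv)
    {
            return (uint64_t)tv->tv_sec * NSEC_PER_SEC +
                   (uint64_t)tv->tv_usec * NSEC_PER_USEC;
    }

    int main(void)
    {
            struct timeval value = { .tv_sec = 0, .tv_usec = 1500 }; /* 1.5 ms */
            uint64_t nval = timeval_to_ns_sketch(&value);

            if (nval > 0)
                    nval += TICK_NSEC;  /* keep the one-tick slack, as before */

            printf("armed for %llu ns\n", (unsigned long long)nval);
            return 0;
    }

A 1.5 ms request arms for 11,500,000 ns; the extra tick preserves the old one-tick slack against accounting granularity.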
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a4a0e478e44d..7906b3f0c41a 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c | |||
@@ -27,19 +27,8 @@ | |||
27 | 27 | ||
28 | #include "timekeeping.h" | 28 | #include "timekeeping.h" |
29 | 29 | ||
30 | /* The Jiffies based clocksource is the lowest common | ||
31 | * denominator clock source which should function on | ||
32 | * all systems. It has the same coarse resolution as | ||
33 | * the timer interrupt frequency HZ and it suffers | ||
34 | * inaccuracies caused by missed or lost timer | ||
35 | * interrupts and the inability for the timer | ||
36 | * interrupt hardware to accuratly tick at the | ||
37 | * requested HZ value. It is also not recommended | ||
38 | * for "tick-less" systems. | ||
39 | */ | ||
40 | #define NSEC_PER_JIFFY ((NSEC_PER_SEC+HZ/2)/HZ) | ||
41 | 30 | ||
42 | /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier | 31 | /* Since jiffies uses a simple TICK_NSEC multiplier |
43 | * conversion, the .shift value could be zero. However | 32 | * conversion, the .shift value could be zero. However |
44 | * this would make NTP adjustments impossible as they are | 33 | * this would make NTP adjustments impossible as they are |
45 | * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to | 34 | * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to |
@@ -47,8 +36,8 @@ | |||
47 | * amount, and give ntp adjustments in units of 1/2^8 | 36 | * amount, and give ntp adjustments in units of 1/2^8 |
48 | * | 37 | * |
49 | * The value 8 is somewhat carefully chosen, as anything | 38 | * The value 8 is somewhat carefully chosen, as anything |
50 | * larger can result in overflows. NSEC_PER_JIFFY grows as | 39 | * larger can result in overflows. TICK_NSEC grows as HZ |
51 | * HZ shrinks, so values greater than 8 overflow 32bits when | 40 | * shrinks, so values greater than 8 overflow 32bits when |
52 | * HZ=100. | 41 | * HZ=100. |
53 | */ | 42 | */ |
54 | #if HZ < 34 | 43 | #if HZ < 34 |
@@ -64,12 +53,23 @@ static u64 jiffies_read(struct clocksource *cs) | |||
64 | return (u64) jiffies; | 53 | return (u64) jiffies; |
65 | } | 54 | } |
66 | 55 | ||
56 | /* | ||
57 | * The Jiffies based clocksource is the lowest common | ||
58 | * denominator clock source which should function on | ||
59 | * all systems. It has the same coarse resolution as | ||
60 | * the timer interrupt frequency HZ and it suffers | ||
61 | * inaccuracies caused by missed or lost timer | ||
62 | * interrupts and the inability for the timer | ||
63 | * interrupt hardware to accuratly tick at the | ||
64 | * requested HZ value. It is also not recommended | ||
65 | * for "tick-less" systems. | ||
66 | */ | ||
67 | static struct clocksource clocksource_jiffies = { | 67 | static struct clocksource clocksource_jiffies = { |
68 | .name = "jiffies", | 68 | .name = "jiffies", |
69 | .rating = 1, /* lowest valid rating*/ | 69 | .rating = 1, /* lowest valid rating*/ |
70 | .read = jiffies_read, | 70 | .read = jiffies_read, |
71 | .mask = CLOCKSOURCE_MASK(32), | 71 | .mask = CLOCKSOURCE_MASK(32), |
72 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | 72 | .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ |
73 | .shift = JIFFIES_SHIFT, | 73 | .shift = JIFFIES_SHIFT, |
74 | .max_cycles = 10, | 74 | .max_cycles = 10, |
75 | }; | 75 | }; |
@@ -125,7 +125,7 @@ int register_refined_jiffies(long cycles_per_second) | |||
125 | shift_hz += cycles_per_tick/2; | 125 | shift_hz += cycles_per_tick/2; |
126 | do_div(shift_hz, cycles_per_tick); | 126 | do_div(shift_hz, cycles_per_tick); |
127 | /* Calculate nsec_per_tick using shift_hz */ | 127 | /* Calculate nsec_per_tick using shift_hz */ |
128 | nsec_per_tick = (u64)NSEC_PER_SEC << 8; | 128 | nsec_per_tick = (u64)TICK_NSEC << 8; |
129 | nsec_per_tick += (u32)shift_hz/2; | 129 | nsec_per_tick += (u32)shift_hz/2; |
130 | do_div(nsec_per_tick, (u32)shift_hz); | 130 | do_div(nsec_per_tick, (u32)shift_hz); |
131 | 131 | ||
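The comment block above .mult keeps its overflow argument across the NSEC_PER_JIFFY -> TICK_NSEC rename, and the claim is easy to verify with a little arithmetic; the sketch below assumes HZ=100, so TICK_NSEC is 10,000,000 ns.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t tick_nsec = 10000000ULL;   /* TICK_NSEC at HZ=100 */

            printf("shift 8: %llu (fits 32 bits: %s)\n",
                   (unsigned long long)(tick_nsec << 8),
                   (tick_nsec << 8) <= UINT32_MAX ? "yes" : "no");
            printf("shift 9: %llu (fits 32 bits: %s)\n",
                   (unsigned long long)(tick_nsec << 9),
                   (tick_nsec << 9) <= UINT32_MAX ? "yes" : "no");
            return 0;
    }

A shift of 8 gives 2,560,000,000, which still fits the clocksource's 32-bit .mult field; a shift of 9 gives 5,120,000,000 and does not.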
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index e9e8c10f0d9a..b4377a5e4269 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
@@ -20,10 +20,10 @@ | |||
20 | */ | 20 | */ |
21 | void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) | 21 | void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) |
22 | { | 22 | { |
23 | cputime_t cputime = secs_to_cputime(rlim_new); | 23 | u64 nsecs = rlim_new * NSEC_PER_SEC; |
24 | 24 | ||
25 | spin_lock_irq(&task->sighand->siglock); | 25 | spin_lock_irq(&task->sighand->siglock); |
26 | set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); | 26 | set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL); |
27 | spin_unlock_irq(&task->sighand->siglock); | 27 | spin_unlock_irq(&task->sighand->siglock); |
28 | } | 28 | } |
29 | 29 | ||
@@ -50,39 +50,14 @@ static int check_clock(const clockid_t which_clock) | |||
50 | return error; | 50 | return error; |
51 | } | 51 | } |
52 | 52 | ||
53 | static inline unsigned long long | ||
54 | timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) | ||
55 | { | ||
56 | unsigned long long ret; | ||
57 | |||
58 | ret = 0; /* high half always zero when .cpu used */ | ||
59 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | ||
60 | ret = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; | ||
61 | } else { | ||
62 | ret = cputime_to_expires(timespec_to_cputime(tp)); | ||
63 | } | ||
64 | return ret; | ||
65 | } | ||
66 | |||
67 | static void sample_to_timespec(const clockid_t which_clock, | ||
68 | unsigned long long expires, | ||
69 | struct timespec *tp) | ||
70 | { | ||
71 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) | ||
72 | *tp = ns_to_timespec(expires); | ||
73 | else | ||
74 | cputime_to_timespec((__force cputime_t)expires, tp); | ||
75 | } | ||
76 | |||
77 | /* | 53 | /* |
78 | * Update expiry time from increment, and increase overrun count, | 54 | * Update expiry time from increment, and increase overrun count, |
79 | * given the current clock sample. | 55 | * given the current clock sample. |
80 | */ | 56 | */ |
81 | static void bump_cpu_timer(struct k_itimer *timer, | 57 | static void bump_cpu_timer(struct k_itimer *timer, u64 now) |
82 | unsigned long long now) | ||
83 | { | 58 | { |
84 | int i; | 59 | int i; |
85 | unsigned long long delta, incr; | 60 | u64 delta, incr; |
86 | 61 | ||
87 | if (timer->it.cpu.incr == 0) | 62 | if (timer->it.cpu.incr == 0) |
88 | return; | 63 | return; |
@@ -122,21 +97,21 @@ static inline int task_cputime_zero(const struct task_cputime *cputime) | |||
122 | return 0; | 97 | return 0; |
123 | } | 98 | } |
124 | 99 | ||
125 | static inline unsigned long long prof_ticks(struct task_struct *p) | 100 | static inline u64 prof_ticks(struct task_struct *p) |
126 | { | 101 | { |
127 | cputime_t utime, stime; | 102 | u64 utime, stime; |
128 | 103 | ||
129 | task_cputime(p, &utime, &stime); | 104 | task_cputime(p, &utime, &stime); |
130 | 105 | ||
131 | return cputime_to_expires(utime + stime); | 106 | return utime + stime; |
132 | } | 107 | } |
133 | static inline unsigned long long virt_ticks(struct task_struct *p) | 108 | static inline u64 virt_ticks(struct task_struct *p) |
134 | { | 109 | { |
135 | cputime_t utime, stime; | 110 | u64 utime, stime; |
136 | 111 | ||
137 | task_cputime(p, &utime, &stime); | 112 | task_cputime(p, &utime, &stime); |
138 | 113 | ||
139 | return cputime_to_expires(utime); | 114 | return utime; |
140 | } | 115 | } |
141 | 116 | ||
142 | static int | 117 | static int |
@@ -176,8 +151,8 @@ posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) | |||
176 | /* | 151 | /* |
177 | * Sample a per-thread clock for the given task. | 152 | * Sample a per-thread clock for the given task. |
178 | */ | 153 | */ |
179 | static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, | 154 | static int cpu_clock_sample(const clockid_t which_clock, |
180 | unsigned long long *sample) | 155 | struct task_struct *p, u64 *sample) |
181 | { | 156 | { |
182 | switch (CPUCLOCK_WHICH(which_clock)) { | 157 | switch (CPUCLOCK_WHICH(which_clock)) { |
183 | default: | 158 | default: |
@@ -260,7 +235,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) | |||
260 | */ | 235 | */ |
261 | static int cpu_clock_sample_group(const clockid_t which_clock, | 236 | static int cpu_clock_sample_group(const clockid_t which_clock, |
262 | struct task_struct *p, | 237 | struct task_struct *p, |
263 | unsigned long long *sample) | 238 | u64 *sample) |
264 | { | 239 | { |
265 | struct task_cputime cputime; | 240 | struct task_cputime cputime; |
266 | 241 | ||
@@ -269,11 +244,11 @@ static int cpu_clock_sample_group(const clockid_t which_clock, | |||
269 | return -EINVAL; | 244 | return -EINVAL; |
270 | case CPUCLOCK_PROF: | 245 | case CPUCLOCK_PROF: |
271 | thread_group_cputime(p, &cputime); | 246 | thread_group_cputime(p, &cputime); |
272 | *sample = cputime_to_expires(cputime.utime + cputime.stime); | 247 | *sample = cputime.utime + cputime.stime; |
273 | break; | 248 | break; |
274 | case CPUCLOCK_VIRT: | 249 | case CPUCLOCK_VIRT: |
275 | thread_group_cputime(p, &cputime); | 250 | thread_group_cputime(p, &cputime); |
276 | *sample = cputime_to_expires(cputime.utime); | 251 | *sample = cputime.utime; |
277 | break; | 252 | break; |
278 | case CPUCLOCK_SCHED: | 253 | case CPUCLOCK_SCHED: |
279 | thread_group_cputime(p, &cputime); | 254 | thread_group_cputime(p, &cputime); |
@@ -288,7 +263,7 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, | |||
288 | struct timespec *tp) | 263 | struct timespec *tp) |
289 | { | 264 | { |
290 | int err = -EINVAL; | 265 | int err = -EINVAL; |
291 | unsigned long long rtn; | 266 | u64 rtn; |
292 | 267 | ||
293 | if (CPUCLOCK_PERTHREAD(which_clock)) { | 268 | if (CPUCLOCK_PERTHREAD(which_clock)) { |
294 | if (same_thread_group(tsk, current)) | 269 | if (same_thread_group(tsk, current)) |
@@ -299,7 +274,7 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, | |||
299 | } | 274 | } |
300 | 275 | ||
301 | if (!err) | 276 | if (!err) |
302 | sample_to_timespec(which_clock, rtn, tp); | 277 | *tp = ns_to_timespec(rtn); |
303 | 278 | ||
304 | return err; | 279 | return err; |
305 | } | 280 | } |
@@ -453,7 +428,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) | |||
453 | cleanup_timers(tsk->signal->cpu_timers); | 428 | cleanup_timers(tsk->signal->cpu_timers); |
454 | } | 429 | } |
455 | 430 | ||
456 | static inline int expires_gt(cputime_t expires, cputime_t new_exp) | 431 | static inline int expires_gt(u64 expires, u64 new_exp) |
457 | { | 432 | { |
458 | return expires == 0 || expires > new_exp; | 433 | return expires == 0 || expires > new_exp; |
459 | } | 434 | } |
@@ -488,7 +463,7 @@ static void arm_timer(struct k_itimer *timer) | |||
488 | list_add(&nt->entry, listpos); | 463 | list_add(&nt->entry, listpos); |
489 | 464 | ||
490 | if (listpos == head) { | 465 | if (listpos == head) { |
491 | unsigned long long exp = nt->expires; | 466 | u64 exp = nt->expires; |
492 | 467 | ||
493 | /* | 468 | /* |
494 | * We are the new earliest-expiring POSIX 1.b timer, hence | 469 | * We are the new earliest-expiring POSIX 1.b timer, hence |
@@ -499,16 +474,15 @@ static void arm_timer(struct k_itimer *timer) | |||
499 | 474 | ||
500 | switch (CPUCLOCK_WHICH(timer->it_clock)) { | 475 | switch (CPUCLOCK_WHICH(timer->it_clock)) { |
501 | case CPUCLOCK_PROF: | 476 | case CPUCLOCK_PROF: |
502 | if (expires_gt(cputime_expires->prof_exp, expires_to_cputime(exp))) | 477 | if (expires_gt(cputime_expires->prof_exp, exp)) |
503 | cputime_expires->prof_exp = expires_to_cputime(exp); | 478 | cputime_expires->prof_exp = exp; |
504 | break; | 479 | break; |
505 | case CPUCLOCK_VIRT: | 480 | case CPUCLOCK_VIRT: |
506 | if (expires_gt(cputime_expires->virt_exp, expires_to_cputime(exp))) | 481 | if (expires_gt(cputime_expires->virt_exp, exp)) |
507 | cputime_expires->virt_exp = expires_to_cputime(exp); | 482 | cputime_expires->virt_exp = exp; |
508 | break; | 483 | break; |
509 | case CPUCLOCK_SCHED: | 484 | case CPUCLOCK_SCHED: |
510 | if (cputime_expires->sched_exp == 0 || | 485 | if (expires_gt(cputime_expires->sched_exp, exp)) |
511 | cputime_expires->sched_exp > exp) | ||
512 | cputime_expires->sched_exp = exp; | 486 | cputime_expires->sched_exp = exp; |
513 | break; | 487 | break; |
514 | } | 488 | } |
@@ -559,8 +533,7 @@ static void cpu_timer_fire(struct k_itimer *timer) | |||
559 | * traversal. | 533 | * traversal. |
560 | */ | 534 | */ |
561 | static int cpu_timer_sample_group(const clockid_t which_clock, | 535 | static int cpu_timer_sample_group(const clockid_t which_clock, |
562 | struct task_struct *p, | 536 | struct task_struct *p, u64 *sample) |
563 | unsigned long long *sample) | ||
564 | { | 537 | { |
565 | struct task_cputime cputime; | 538 | struct task_cputime cputime; |
566 | 539 | ||
@@ -569,10 +542,10 @@ static int cpu_timer_sample_group(const clockid_t which_clock, | |||
569 | default: | 542 | default: |
570 | return -EINVAL; | 543 | return -EINVAL; |
571 | case CPUCLOCK_PROF: | 544 | case CPUCLOCK_PROF: |
572 | *sample = cputime_to_expires(cputime.utime + cputime.stime); | 545 | *sample = cputime.utime + cputime.stime; |
573 | break; | 546 | break; |
574 | case CPUCLOCK_VIRT: | 547 | case CPUCLOCK_VIRT: |
575 | *sample = cputime_to_expires(cputime.utime); | 548 | *sample = cputime.utime; |
576 | break; | 549 | break; |
577 | case CPUCLOCK_SCHED: | 550 | case CPUCLOCK_SCHED: |
578 | *sample = cputime.sum_exec_runtime; | 551 | *sample = cputime.sum_exec_runtime; |
@@ -593,12 +566,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
593 | unsigned long flags; | 566 | unsigned long flags; |
594 | struct sighand_struct *sighand; | 567 | struct sighand_struct *sighand; |
595 | struct task_struct *p = timer->it.cpu.task; | 568 | struct task_struct *p = timer->it.cpu.task; |
596 | unsigned long long old_expires, new_expires, old_incr, val; | 569 | u64 old_expires, new_expires, old_incr, val; |
597 | int ret; | 570 | int ret; |
598 | 571 | ||
599 | WARN_ON_ONCE(p == NULL); | 572 | WARN_ON_ONCE(p == NULL); |
600 | 573 | ||
601 | new_expires = timespec_to_sample(timer->it_clock, &new->it_value); | 574 | new_expires = timespec_to_ns(&new->it_value); |
602 | 575 | ||
603 | /* | 576 | /* |
604 | * Protect against sighand release/switch in exit/exec and p->cpu_timers | 577 | * Protect against sighand release/switch in exit/exec and p->cpu_timers |
@@ -659,9 +632,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
659 | bump_cpu_timer(timer, val); | 632 | bump_cpu_timer(timer, val); |
660 | if (val < timer->it.cpu.expires) { | 633 | if (val < timer->it.cpu.expires) { |
661 | old_expires = timer->it.cpu.expires - val; | 634 | old_expires = timer->it.cpu.expires - val; |
662 | sample_to_timespec(timer->it_clock, | 635 | old->it_value = ns_to_timespec(old_expires); |
663 | old_expires, | ||
664 | &old->it_value); | ||
665 | } else { | 636 | } else { |
666 | old->it_value.tv_nsec = 1; | 637 | old->it_value.tv_nsec = 1; |
667 | old->it_value.tv_sec = 0; | 638 | old->it_value.tv_sec = 0; |
@@ -699,8 +670,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
699 | * Install the new reload setting, and | 670 | * Install the new reload setting, and |
700 | * set up the signal and overrun bookkeeping. | 671 | * set up the signal and overrun bookkeeping. |
701 | */ | 672 | */ |
702 | timer->it.cpu.incr = timespec_to_sample(timer->it_clock, | 673 | timer->it.cpu.incr = timespec_to_ns(&new->it_interval); |
703 | &new->it_interval); | ||
704 | 674 | ||
705 | /* | 675 | /* |
706 | * This acts as a modification timestamp for the timer, | 676 | * This acts as a modification timestamp for the timer, |
@@ -723,17 +693,15 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, | |||
723 | 693 | ||
724 | ret = 0; | 694 | ret = 0; |
725 | out: | 695 | out: |
726 | if (old) { | 696 | if (old) |
727 | sample_to_timespec(timer->it_clock, | 697 | old->it_interval = ns_to_timespec(old_incr); |
728 | old_incr, &old->it_interval); | ||
729 | } | ||
730 | 698 | ||
731 | return ret; | 699 | return ret; |
732 | } | 700 | } |
733 | 701 | ||
734 | static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | 702 | static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) |
735 | { | 703 | { |
736 | unsigned long long now; | 704 | u64 now; |
737 | struct task_struct *p = timer->it.cpu.task; | 705 | struct task_struct *p = timer->it.cpu.task; |
738 | 706 | ||
739 | WARN_ON_ONCE(p == NULL); | 707 | WARN_ON_ONCE(p == NULL); |
@@ -741,8 +709,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
741 | /* | 709 | /* |
742 | * Easy part: convert the reload time. | 710 | * Easy part: convert the reload time. |
743 | */ | 711 | */ |
744 | sample_to_timespec(timer->it_clock, | 712 | itp->it_interval = ns_to_timespec(timer->it.cpu.incr); |
745 | timer->it.cpu.incr, &itp->it_interval); | ||
746 | 713 | ||
747 | if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ | 714 | if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ |
748 | itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; | 715 | itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; |
@@ -771,8 +738,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
771 | * Call the timer disarmed, nothing else to do. | 738 | * Call the timer disarmed, nothing else to do. |
772 | */ | 739 | */ |
773 | timer->it.cpu.expires = 0; | 740 | timer->it.cpu.expires = 0; |
774 | sample_to_timespec(timer->it_clock, timer->it.cpu.expires, | 741 | itp->it_value = ns_to_timespec(timer->it.cpu.expires); |
775 | &itp->it_value); | ||
776 | return; | 742 | return; |
777 | } else { | 743 | } else { |
778 | cpu_timer_sample_group(timer->it_clock, p, &now); | 744 | cpu_timer_sample_group(timer->it_clock, p, &now); |
@@ -781,9 +747,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
781 | } | 747 | } |
782 | 748 | ||
783 | if (now < timer->it.cpu.expires) { | 749 | if (now < timer->it.cpu.expires) { |
784 | sample_to_timespec(timer->it_clock, | 750 | itp->it_value = ns_to_timespec(timer->it.cpu.expires - now); |
785 | timer->it.cpu.expires - now, | ||
786 | &itp->it_value); | ||
787 | } else { | 751 | } else { |
788 | /* | 752 | /* |
789 | * The timer should have expired already, but the firing | 753 | * The timer should have expired already, but the firing |
@@ -827,7 +791,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
827 | struct list_head *timers = tsk->cpu_timers; | 791 | struct list_head *timers = tsk->cpu_timers; |
828 | struct signal_struct *const sig = tsk->signal; | 792 | struct signal_struct *const sig = tsk->signal; |
829 | struct task_cputime *tsk_expires = &tsk->cputime_expires; | 793 | struct task_cputime *tsk_expires = &tsk->cputime_expires; |
830 | unsigned long long expires; | 794 | u64 expires; |
831 | unsigned long soft; | 795 | unsigned long soft; |
832 | 796 | ||
833 | /* | 797 | /* |
@@ -838,10 +802,10 @@ static void check_thread_timers(struct task_struct *tsk, | |||
838 | return; | 802 | return; |
839 | 803 | ||
840 | expires = check_timers_list(timers, firing, prof_ticks(tsk)); | 804 | expires = check_timers_list(timers, firing, prof_ticks(tsk)); |
841 | tsk_expires->prof_exp = expires_to_cputime(expires); | 805 | tsk_expires->prof_exp = expires; |
842 | 806 | ||
843 | expires = check_timers_list(++timers, firing, virt_ticks(tsk)); | 807 | expires = check_timers_list(++timers, firing, virt_ticks(tsk)); |
844 | tsk_expires->virt_exp = expires_to_cputime(expires); | 808 | tsk_expires->virt_exp = expires; |
845 | 809 | ||
846 | tsk_expires->sched_exp = check_timers_list(++timers, firing, | 810 | tsk_expires->sched_exp = check_timers_list(++timers, firing, |
847 | tsk->se.sum_exec_runtime); | 811 | tsk->se.sum_exec_runtime); |
@@ -890,26 +854,17 @@ static inline void stop_process_timers(struct signal_struct *sig) | |||
890 | tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); | 854 | tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); |
891 | } | 855 | } |
892 | 856 | ||
893 | static u32 onecputick; | ||
894 | |||
895 | static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | 857 | static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, |
896 | unsigned long long *expires, | 858 | u64 *expires, u64 cur_time, int signo) |
897 | unsigned long long cur_time, int signo) | ||
898 | { | 859 | { |
899 | if (!it->expires) | 860 | if (!it->expires) |
900 | return; | 861 | return; |
901 | 862 | ||
902 | if (cur_time >= it->expires) { | 863 | if (cur_time >= it->expires) { |
903 | if (it->incr) { | 864 | if (it->incr) |
904 | it->expires += it->incr; | 865 | it->expires += it->incr; |
905 | it->error += it->incr_error; | 866 | else |
906 | if (it->error >= onecputick) { | ||
907 | it->expires -= cputime_one_jiffy; | ||
908 | it->error -= onecputick; | ||
909 | } | ||
910 | } else { | ||
911 | it->expires = 0; | 867 | it->expires = 0; |
912 | } | ||
913 | 868 | ||
914 | trace_itimer_expire(signo == SIGPROF ? | 869 | trace_itimer_expire(signo == SIGPROF ? |
915 | ITIMER_PROF : ITIMER_VIRTUAL, | 870 | ITIMER_PROF : ITIMER_VIRTUAL, |
@@ -917,9 +872,8 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | |||
917 | __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); | 872 | __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); |
918 | } | 873 | } |
919 | 874 | ||
920 | if (it->expires && (!*expires || it->expires < *expires)) { | 875 | if (it->expires && (!*expires || it->expires < *expires)) |
921 | *expires = it->expires; | 876 | *expires = it->expires; |
922 | } | ||
923 | } | 877 | } |
924 | 878 | ||
925 | /* | 879 | /* |
@@ -931,8 +885,8 @@ static void check_process_timers(struct task_struct *tsk, | |||
931 | struct list_head *firing) | 885 | struct list_head *firing) |
932 | { | 886 | { |
933 | struct signal_struct *const sig = tsk->signal; | 887 | struct signal_struct *const sig = tsk->signal; |
934 | unsigned long long utime, ptime, virt_expires, prof_expires; | 888 | u64 utime, ptime, virt_expires, prof_expires; |
935 | unsigned long long sum_sched_runtime, sched_expires; | 889 | u64 sum_sched_runtime, sched_expires; |
936 | struct list_head *timers = sig->cpu_timers; | 890 | struct list_head *timers = sig->cpu_timers; |
937 | struct task_cputime cputime; | 891 | struct task_cputime cputime; |
938 | unsigned long soft; | 892 | unsigned long soft; |
@@ -954,8 +908,8 @@ static void check_process_timers(struct task_struct *tsk, | |||
954 | * Collect the current process totals. | 908 | * Collect the current process totals. |
955 | */ | 909 | */ |
956 | thread_group_cputimer(tsk, &cputime); | 910 | thread_group_cputimer(tsk, &cputime); |
957 | utime = cputime_to_expires(cputime.utime); | 911 | utime = cputime.utime; |
958 | ptime = utime + cputime_to_expires(cputime.stime); | 912 | ptime = utime + cputime.stime; |
959 | sum_sched_runtime = cputime.sum_exec_runtime; | 913 | sum_sched_runtime = cputime.sum_exec_runtime; |
960 | 914 | ||
961 | prof_expires = check_timers_list(timers, firing, ptime); | 915 | prof_expires = check_timers_list(timers, firing, ptime); |
@@ -971,10 +925,10 @@ static void check_process_timers(struct task_struct *tsk, | |||
971 | SIGVTALRM); | 925 | SIGVTALRM); |
972 | soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); | 926 | soft = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); |
973 | if (soft != RLIM_INFINITY) { | 927 | if (soft != RLIM_INFINITY) { |
974 | unsigned long psecs = cputime_to_secs(ptime); | 928 | unsigned long psecs = div_u64(ptime, NSEC_PER_SEC); |
975 | unsigned long hard = | 929 | unsigned long hard = |
976 | READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); | 930 | READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); |
977 | cputime_t x; | 931 | u64 x; |
978 | if (psecs >= hard) { | 932 | if (psecs >= hard) { |
979 | /* | 933 | /* |
980 | * At the hard limit, we just die. | 934 | * At the hard limit, we just die. |
@@ -993,14 +947,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
993 | sig->rlim[RLIMIT_CPU].rlim_cur = soft; | 947 | sig->rlim[RLIMIT_CPU].rlim_cur = soft; |
994 | } | 948 | } |
995 | } | 949 | } |
996 | x = secs_to_cputime(soft); | 950 | x = soft * NSEC_PER_SEC; |
997 | if (!prof_expires || x < prof_expires) { | 951 | if (!prof_expires || x < prof_expires) |
998 | prof_expires = x; | 952 | prof_expires = x; |
999 | } | ||
1000 | } | 953 | } |
1001 | 954 | ||
1002 | sig->cputime_expires.prof_exp = expires_to_cputime(prof_expires); | 955 | sig->cputime_expires.prof_exp = prof_expires; |
1003 | sig->cputime_expires.virt_exp = expires_to_cputime(virt_expires); | 956 | sig->cputime_expires.virt_exp = virt_expires; |
1004 | sig->cputime_expires.sched_exp = sched_expires; | 957 | sig->cputime_expires.sched_exp = sched_expires; |
1005 | if (task_cputime_zero(&sig->cputime_expires)) | 958 | if (task_cputime_zero(&sig->cputime_expires)) |
1006 | stop_process_timers(sig); | 959 | stop_process_timers(sig); |
@@ -1017,7 +970,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1017 | struct sighand_struct *sighand; | 970 | struct sighand_struct *sighand; |
1018 | unsigned long flags; | 971 | unsigned long flags; |
1019 | struct task_struct *p = timer->it.cpu.task; | 972 | struct task_struct *p = timer->it.cpu.task; |
1020 | unsigned long long now; | 973 | u64 now; |
1021 | 974 | ||
1022 | WARN_ON_ONCE(p == NULL); | 975 | WARN_ON_ONCE(p == NULL); |
1023 | 976 | ||
@@ -1214,9 +1167,9 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1214 | * The tsk->sighand->siglock must be held by the caller. | 1167 | * The tsk->sighand->siglock must be held by the caller. |
1215 | */ | 1168 | */ |
1216 | void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | 1169 | void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, |
1217 | cputime_t *newval, cputime_t *oldval) | 1170 | u64 *newval, u64 *oldval) |
1218 | { | 1171 | { |
1219 | unsigned long long now; | 1172 | u64 now; |
1220 | 1173 | ||
1221 | WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); | 1174 | WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); |
1222 | cpu_timer_sample_group(clock_idx, tsk, &now); | 1175 | cpu_timer_sample_group(clock_idx, tsk, &now); |
@@ -1230,7 +1183,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1230 | if (*oldval) { | 1183 | if (*oldval) { |
1231 | if (*oldval <= now) { | 1184 | if (*oldval <= now) { |
1232 | /* Just about to fire. */ | 1185 | /* Just about to fire. */ |
1233 | *oldval = cputime_one_jiffy; | 1186 | *oldval = TICK_NSEC; |
1234 | } else { | 1187 | } else { |
1235 | *oldval -= now; | 1188 | *oldval -= now; |
1236 | } | 1189 | } |
@@ -1310,7 +1263,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | |||
1310 | /* | 1263 | /* |
1311 | * We were interrupted by a signal. | 1264 | * We were interrupted by a signal. |
1312 | */ | 1265 | */ |
1313 | sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); | 1266 | *rqtp = ns_to_timespec(timer.it.cpu.expires); |
1314 | error = posix_cpu_timer_set(&timer, 0, &zero_it, it); | 1267 | error = posix_cpu_timer_set(&timer, 0, &zero_it, it); |
1315 | if (!error) { | 1268 | if (!error) { |
1316 | /* | 1269 | /* |
@@ -1476,15 +1429,10 @@ static __init int init_posix_cpu_timers(void) | |||
1476 | .clock_get = thread_cpu_clock_get, | 1429 | .clock_get = thread_cpu_clock_get, |
1477 | .timer_create = thread_cpu_timer_create, | 1430 | .timer_create = thread_cpu_timer_create, |
1478 | }; | 1431 | }; |
1479 | struct timespec ts; | ||
1480 | 1432 | ||
1481 | posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); | 1433 | posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); |
1482 | posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); | 1434 | posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); |
1483 | 1435 | ||
1484 | cputime_to_timespec(cputime_one_jiffy, &ts); | ||
1485 | onecputick = ts.tv_nsec; | ||
1486 | WARN_ON(ts.tv_sec != 0); | ||
1487 | |||
1488 | return 0; | 1436 | return 0; |
1489 | } | 1437 | } |
1490 | __initcall(init_posix_cpu_timers); | 1438 | __initcall(init_posix_cpu_timers); |
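Throughout posix-cpu-timers.c every sample and expiry is now a u64 nanosecond count regardless of clock type, which is why the per-clock timespec_to_sample()/sample_to_timespec() wrappers collapse into plain timespec_to_ns()/ns_to_timespec() calls. A self-contained sketch of that round trip; the helpers are simplified stand-ins for the kernel ones.

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    #define NSEC_PER_SEC 1000000000ULL

    static uint64_t timespec_to_ns_sketch(const struct timespec *ts)
    {
            return (uint64_t)ts->tv_sec * NSEC_PER_SEC + (uint64_t)ts->tv_nsec;
    }

    static struct timespec ns_to_timespec_sketch(uint64_t ns)
    {
            struct timespec ts = {
                    .tv_sec  = ns / NSEC_PER_SEC,
                    .tv_nsec = ns % NSEC_PER_SEC,
            };

            return ts;
    }

    int main(void)
    {
            struct timespec in  = { .tv_sec = 2, .tv_nsec = 500000001 };
            struct timespec out =
                    ns_to_timespec_sketch(timespec_to_ns_sketch(&in));

            printf("%ld.%09ld -> %ld.%09ld\n",
                   (long)in.tv_sec, in.tv_nsec, (long)out.tv_sec, out.tv_nsec);
            return 0;
    }

The conversion is identical for CPUCLOCK_PROF, CPUCLOCK_VIRT and CPUCLOCK_SCHED, and nothing is lost at nanosecond resolution.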
diff --git a/kernel/time/time.c b/kernel/time/time.c index a3a9a8a029dc..25bdd2504571 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
@@ -702,6 +702,16 @@ u64 nsec_to_clock_t(u64 x) | |||
702 | #endif | 702 | #endif |
703 | } | 703 | } |
704 | 704 | ||
705 | u64 jiffies64_to_nsecs(u64 j) | ||
706 | { | ||
707 | #if !(NSEC_PER_SEC % HZ) | ||
708 | return (NSEC_PER_SEC / HZ) * j; | ||
709 | # else | ||
710 | return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN); | ||
711 | #endif | ||
712 | } | ||
713 | EXPORT_SYMBOL(jiffies64_to_nsecs); | ||
714 | |||
705 | /** | 715 | /** |
706 | * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 | 716 | * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 |
707 | * | 717 | * |
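jiffies64_to_nsecs() above takes a multiply-only fast path when NSEC_PER_SEC is a multiple of HZ and otherwise falls back to the build-time HZ_TO_NSEC_NUM/HZ_TO_NSEC_DEN pair. A userspace sketch for HZ=300, one of the values that needs the fallback; the HZ choice and constants are illustrative, and the kernel selects the branch with the preprocessor rather than at run time.

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_SEC   1000000000ULL
    #define HZ             300ULL       /* assumed, does not divide 1e9 */
    #define HZ_TO_NSEC_NUM 10000000ULL  /* 1e9 / gcd(300, 1e9) */
    #define HZ_TO_NSEC_DEN 3ULL         /* 300 / gcd(300, 1e9) */

    static uint64_t jiffies64_to_nsecs_sketch(uint64_t j)
    {
            if (!(NSEC_PER_SEC % HZ))
                    return (NSEC_PER_SEC / HZ) * j;
            return (j * HZ_TO_NSEC_NUM) / HZ_TO_NSEC_DEN;
    }

    int main(void)
    {
            printf("3 jiffies = %llu ns\n",
                   (unsigned long long)jiffies64_to_nsecs_sketch(3));
            return 0;
    }

Three jiffies at HZ=300 come out as exactly 10,000,000 ns, with no cumulative rounding.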
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc index c48688904f9f..f83bbb81600b 100644 --- a/kernel/time/timeconst.bc +++ b/kernel/time/timeconst.bc | |||
@@ -98,6 +98,12 @@ define timeconst(hz) { | |||
98 | print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" | 98 | print "#define HZ_TO_USEC_DEN\t\t", hz/cd, "\n" |
99 | print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" | 99 | print "#define USEC_TO_HZ_NUM\t\t", hz/cd, "\n" |
100 | print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" | 100 | print "#define USEC_TO_HZ_DEN\t\t", 1000000/cd, "\n" |
101 | |||
102 | cd=gcd(hz,1000000000) | ||
103 | print "#define HZ_TO_NSEC_NUM\t\t", 1000000000/cd, "\n" | ||
104 | print "#define HZ_TO_NSEC_DEN\t\t", hz/cd, "\n" | ||
105 | print "#define NSEC_TO_HZ_NUM\t\t", hz/cd, "\n" | ||
106 | print "#define NSEC_TO_HZ_DEN\t\t", 1000000000/cd, "\n" | ||
101 | print "\n" | 107 | print "\n" |
102 | 108 | ||
103 | print "#endif /* KERNEL_TIMECONST_H */\n" | 109 | print "#endif /* KERNEL_TIMECONST_H */\n" |
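The new timeconst.bc stanza derives those constants by reducing HZ and NSEC_PER_SEC with their gcd. A small C sketch of the same reduction, printed for a few common HZ values; the list is illustrative.

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t gcd64(uint64_t a, uint64_t b)
    {
            while (b) {
                    uint64_t t = a % b;

                    a = b;
                    b = t;
            }
            return a;
    }

    int main(void)
    {
            const uint64_t nsec_per_sec = 1000000000ULL;
            const uint64_t hz_values[] = { 100, 250, 300, 1000 };
            unsigned int i;

            for (i = 0; i < 4; i++) {
                    uint64_t hz = hz_values[i];
                    uint64_t cd = gcd64(hz, nsec_per_sec);

                    printf("HZ=%4llu: HZ_TO_NSEC_NUM=%llu HZ_TO_NSEC_DEN=%llu\n",
                           (unsigned long long)hz,
                           (unsigned long long)(nsec_per_sec / cd),
                           (unsigned long long)(hz / cd));
            }
            return 0;
    }

For HZ=100, 250 and 1000 the denominator reduces to 1, so the fast path in jiffies64_to_nsecs() applies anyway; HZ=300 keeps a denominator of 3.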
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index f8e26ab963ed..5c21f0535056 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -31,7 +31,7 @@ void bacct_add_tsk(struct user_namespace *user_ns, | |||
31 | struct taskstats *stats, struct task_struct *tsk) | 31 | struct taskstats *stats, struct task_struct *tsk) |
32 | { | 32 | { |
33 | const struct cred *tcred; | 33 | const struct cred *tcred; |
34 | cputime_t utime, stime, utimescaled, stimescaled; | 34 | u64 utime, stime, utimescaled, stimescaled; |
35 | u64 delta; | 35 | u64 delta; |
36 | 36 | ||
37 | BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); | 37 | BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); |
@@ -67,12 +67,12 @@ void bacct_add_tsk(struct user_namespace *user_ns, | |||
67 | rcu_read_unlock(); | 67 | rcu_read_unlock(); |
68 | 68 | ||
69 | task_cputime(tsk, &utime, &stime); | 69 | task_cputime(tsk, &utime, &stime); |
70 | stats->ac_utime = cputime_to_usecs(utime); | 70 | stats->ac_utime = div_u64(utime, NSEC_PER_USEC); |
71 | stats->ac_stime = cputime_to_usecs(stime); | 71 | stats->ac_stime = div_u64(stime, NSEC_PER_USEC); |
72 | 72 | ||
73 | task_cputime_scaled(tsk, &utimescaled, &stimescaled); | 73 | task_cputime_scaled(tsk, &utimescaled, &stimescaled); |
74 | stats->ac_utimescaled = cputime_to_usecs(utimescaled); | 74 | stats->ac_utimescaled = div_u64(utimescaled, NSEC_PER_USEC); |
75 | stats->ac_stimescaled = cputime_to_usecs(stimescaled); | 75 | stats->ac_stimescaled = div_u64(stimescaled, NSEC_PER_USEC); |
76 | 76 | ||
77 | stats->ac_minflt = tsk->min_flt; | 77 | stats->ac_minflt = tsk->min_flt; |
78 | stats->ac_majflt = tsk->maj_flt; | 78 | stats->ac_majflt = tsk->maj_flt; |
@@ -123,18 +123,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) | |||
123 | #undef MB | 123 | #undef MB |
124 | 124 | ||
125 | static void __acct_update_integrals(struct task_struct *tsk, | 125 | static void __acct_update_integrals(struct task_struct *tsk, |
126 | cputime_t utime, cputime_t stime) | 126 | u64 utime, u64 stime) |
127 | { | 127 | { |
128 | cputime_t time, dtime; | 128 | u64 time, delta; |
129 | u64 delta; | ||
130 | 129 | ||
131 | if (!likely(tsk->mm)) | 130 | if (!likely(tsk->mm)) |
132 | return; | 131 | return; |
133 | 132 | ||
134 | time = stime + utime; | 133 | time = stime + utime; |
135 | dtime = time - tsk->acct_timexpd; | 134 | delta = time - tsk->acct_timexpd; |
136 | /* Avoid division: cputime_t is often in nanoseconds already. */ | ||
137 | delta = cputime_to_nsecs(dtime); | ||
138 | 135 | ||
139 | if (delta < TICK_NSEC) | 136 | if (delta < TICK_NSEC) |
140 | return; | 137 | return; |
@@ -155,7 +152,7 @@ static void __acct_update_integrals(struct task_struct *tsk, | |||
155 | */ | 152 | */ |
156 | void acct_update_integrals(struct task_struct *tsk) | 153 | void acct_update_integrals(struct task_struct *tsk) |
157 | { | 154 | { |
158 | cputime_t utime, stime; | 155 | u64 utime, stime; |
159 | unsigned long flags; | 156 | unsigned long flags; |
160 | 157 | ||
161 | local_irq_save(flags); | 158 | local_irq_save(flags); |