30 files changed, 938 insertions, 115 deletions
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index e38b8df3d727..8e9359de1d28 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt | |||
@@ -191,7 +191,7 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that | |||
191 | o A hardware or software issue shuts off the scheduler-clock | 191 | o A hardware or software issue shuts off the scheduler-clock |
192 | interrupt on a CPU that is not in dyntick-idle mode. This | 192 | interrupt on a CPU that is not in dyntick-idle mode. This |
193 | problem really has happened, and seems to be most likely to | 193 | problem really has happened, and seems to be most likely to |
194 | result in RCU CPU stall warnings for CONFIG_NO_HZ=n kernels. | 194 | result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. |
195 | 195 | ||
196 | o A bug in the RCU implementation. | 196 | o A bug in the RCU implementation. |
197 | 197 | ||
diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt index c7a2eb8450c2..e3e5d9ae50cd 100644 --- a/Documentation/cpu-freq/governors.txt +++ b/Documentation/cpu-freq/governors.txt | |||
@@ -131,8 +131,8 @@ sampling_rate_min: | |||
131 | The sampling rate is limited by the HW transition latency: | 131 | The sampling rate is limited by the HW transition latency: |
132 | transition_latency * 100 | 132 | transition_latency * 100 |
133 | Or by kernel restrictions: | 133 | Or by kernel restrictions: |
134 | If CONFIG_NO_HZ is set, the limit is 10ms fixed. | 134 | If CONFIG_NO_HZ_COMMON is set, the limit is 10ms fixed. |
135 | If CONFIG_NO_HZ is not set or nohz=off boot parameter is used, the | 135 | If CONFIG_NO_HZ_COMMON is not set or nohz=off boot parameter is used, the |
136 | limits depend on the CONFIG_HZ option: | 136 | limits depend on the CONFIG_HZ option: |
137 | HZ=1000: min=20000us (20ms) | 137 | HZ=1000: min=20000us (20ms) |
138 | HZ=250: min=80000us (80ms) | 138 | HZ=250: min=80000us (80ms) |
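Put concretely, the effective minimum sampling rate is the larger of the hardware-derived limit and the kernel limit quoted above. A minimal C sketch, assuming an illustrative helper name and treating the kernel limit as a fixed 10 ms when NO_HZ_COMMON is set or 20 jiffies otherwise (which reproduces the 20 ms figure at HZ=1000 and the 80 ms figure at HZ=250); this is not part of the patch:

        static unsigned int min_sampling_rate_us(unsigned int transition_latency_us,
                                                 unsigned int hz, int nohz_common)
        {
                unsigned int hw_min = transition_latency_us * 100;
                unsigned int kernel_min = nohz_common ? 10000 : 20 * (1000000 / hz);

                return hw_min > kernel_min ? hw_min : kernel_min;
        }
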
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index de12397b60a9..7d55ebb5660c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt | |||
@@ -1951,6 +1951,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
1951 | Valid arguments: on, off | 1951 | Valid arguments: on, off |
1952 | Default: on | 1952 | Default: on |
1953 | 1953 | ||
1954 | nohz_full= [KNL,BOOT] | ||
1955 | In kernels built with CONFIG_NO_HZ_FULL=y, set | ||
1956 | the specified list of CPUs whose tick will be stopped | ||
1957 | whenever possible. The boot CPU will be forced outside | ||
1958 | the range to maintain the timekeeping. | ||
1959 | The CPUs in this range must also be included in the | ||
1960 | rcu_nocbs= set. | ||
1961 | |||
1954 | noiotrap [SH] Disables trapped I/O port accesses. | 1962 | noiotrap [SH] Disables trapped I/O port accesses. |
1955 | 1963 | ||
1956 | noirqdebug [X86-32] Disables the code which attempts to detect and | 1964 | noirqdebug [X86-32] Disables the code which attempts to detect and |
diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt new file mode 100644 index 000000000000..5b5322024067 --- /dev/null +++ b/Documentation/timers/NO_HZ.txt | |||
@@ -0,0 +1,273 @@ | |||
1 | NO_HZ: Reducing Scheduling-Clock Ticks | ||
2 | |||
3 | |||
4 | This document describes Kconfig options and boot parameters that can | ||
5 | reduce the number of scheduling-clock interrupts, thereby improving energy | ||
6 | efficiency and reducing OS jitter. Reducing OS jitter is important for | ||
7 | some types of computationally intensive high-performance computing (HPC) | ||
8 | applications and for real-time applications. | ||
9 | |||
10 | There are two main contexts in which the number of scheduling-clock | ||
11 | interrupts can be reduced compared to the old-school approach of sending | ||
12 | a scheduling-clock interrupt to all CPUs every jiffy whether they need | ||
13 | it or not (CONFIG_HZ_PERIODIC=y or CONFIG_NO_HZ=n for older kernels): | ||
14 | |||
15 | 1. Idle CPUs (CONFIG_NO_HZ_IDLE=y or CONFIG_NO_HZ=y for older kernels). | ||
16 | |||
17 | 2. CPUs having only one runnable task (CONFIG_NO_HZ_FULL=y). | ||
18 | |||
19 | These two cases are described in the following two sections, followed | ||
20 | by a third section on RCU-specific considerations and a fourth and final | ||
21 | section listing known issues. | ||
22 | |||
23 | |||
24 | IDLE CPUs | ||
25 | |||
26 | If a CPU is idle, there is little point in sending it a scheduling-clock | ||
27 | interrupt. After all, the primary purpose of a scheduling-clock interrupt | ||
28 | is to force a busy CPU to shift its attention among multiple duties, | ||
29 | and an idle CPU has no duties to shift its attention among. | ||
30 | |||
31 | The CONFIG_NO_HZ_IDLE=y Kconfig option causes the kernel to avoid sending | ||
32 | scheduling-clock interrupts to idle CPUs, which is critically important | ||
33 | both to battery-powered devices and to highly virtualized mainframes. | ||
34 | A battery-powered device running a CONFIG_HZ_PERIODIC=y kernel would | ||
35 | drain its battery very quickly, easily 2-3 times as fast as would the | ||
36 | same device running a CONFIG_NO_HZ_IDLE=y kernel. A mainframe running | ||
37 | 1,500 OS instances might find that half of its CPU time was consumed by | ||
38 | unnecessary scheduling-clock interrupts. In these situations, there | ||
39 | is strong motivation to avoid sending scheduling-clock interrupts to | ||
40 | idle CPUs. That said, dyntick-idle mode is not free: | ||
41 | |||
42 | 1. It increases the number of instructions executed on the path | ||
43 | to and from the idle loop. | ||
44 | |||
45 | 2. On many architectures, dyntick-idle mode also increases the | ||
46 | number of expensive clock-reprogramming operations. | ||
47 | |||
48 | Therefore, systems with aggressive real-time response constraints often | ||
49 | run CONFIG_HZ_PERIODIC=y kernels (or CONFIG_NO_HZ=n for older kernels) | ||
50 | in order to avoid degrading from-idle transition latencies. | ||
51 | |||
52 | An idle CPU that is not receiving scheduling-clock interrupts is said to | ||
53 | be "dyntick-idle", "in dyntick-idle mode", "in nohz mode", or "running | ||
54 | tickless". The remainder of this document will use "dyntick-idle mode". | ||
55 | |||
56 | There is also a boot parameter "nohz=" that can be used to disable | ||
57 | dyntick-idle mode in CONFIG_NO_HZ_IDLE=y kernels by specifying "nohz=off". | ||
58 | By default, CONFIG_NO_HZ_IDLE=y kernels boot with "nohz=on", enabling | ||
59 | dyntick-idle mode. | ||
60 | |||
61 | |||
62 | CPUs WITH ONLY ONE RUNNABLE TASK | ||
63 | |||
64 | If a CPU has only one runnable task, there is little point in sending it | ||
65 | a scheduling-clock interrupt because there is no other task to switch to. | ||
66 | |||
67 | The CONFIG_NO_HZ_FULL=y Kconfig option causes the kernel to avoid | ||
68 | sending scheduling-clock interrupts to CPUs with a single runnable task, | ||
69 | and such CPUs are said to be "adaptive-ticks CPUs". This is important | ||
70 | for applications with aggressive real-time response constraints because | ||
71 | it allows them to improve their worst-case response times by the maximum | ||
72 | duration of a scheduling-clock interrupt. It is also important for | ||
73 | computationally intensive short-iteration workloads: If any CPU is | ||
74 | delayed during a given iteration, all the other CPUs will be forced to | ||
75 | wait idle while the delayed CPU finishes. Thus, the delay is multiplied | ||
76 | by one less than the number of CPUs. In these situations, there is | ||
77 | again strong motivation to avoid sending scheduling-clock interrupts. | ||
78 | |||
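To make the arithmetic concrete: with 64 CPUs, a one-millisecond delay on a single CPU during one iteration costs roughly 63 CPU-milliseconds of idle waiting on the other CPUs, every iteration.
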
79 | By default, no CPU will be an adaptive-ticks CPU. The "nohz_full=" | ||
80 | boot parameter specifies the adaptive-ticks CPUs. For example, | ||
81 | "nohz_full=1,6-8" says that CPUs 1, 6, 7, and 8 are to be adaptive-ticks | ||
82 | CPUs. Note that you are prohibited from marking all of the CPUs as | ||
83 | adaptive-tick CPUs: At least one non-adaptive-tick CPU must remain | ||
84 | online to handle timekeeping tasks in order to ensure that system calls | ||
85 | like gettimeofday() return accurate values on adaptive-tick CPUs. | ||
86 | (This is not an issue for CONFIG_NO_HZ_IDLE=y because there are no | ||
87 | running user processes to observe slight drifts in clock rate.) | ||
88 | Therefore, the boot CPU is prohibited from entering adaptive-ticks | ||
89 | mode. Specifying a "nohz_full=" mask that includes the boot CPU will | ||
90 | result in a boot-time error message, and the boot CPU will be removed | ||
91 | from the mask. | ||
92 | |||
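For example, on an eight-CPU system, booting with "nohz_full=1-7 rcu_nocbs=1-7" (an illustrative command line, not taken from this patch) leaves CPU 0, the boot CPU, as the timekeeping CPU while CPUs 1-7 may run tickless whenever each has only one runnable task; the rcu_nocbs= portion is covered in the "RCU IMPLICATIONS" section below.
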
93 | Alternatively, the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter specifies | ||
94 | that all CPUs other than the boot CPU are adaptive-ticks CPUs. This | ||
95 | Kconfig parameter will be overridden by the "nohz_full=" boot parameter, | ||
96 | so that if both the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter and | ||
97 | the "nohz_full=1" boot parameter are specified, the boot parameter will | ||
98 | prevail so that only CPU 1 will be an adaptive-ticks CPU. | ||
99 | |||
100 | Finally, adaptive-ticks CPUs must have their RCU callbacks offloaded. | ||
101 | This is covered in the "RCU IMPLICATIONS" section below. | ||
102 | |||
103 | Normally, a CPU remains in adaptive-ticks mode as long as possible. | ||
104 | In particular, transitioning to kernel mode does not automatically change | ||
105 | the mode. Instead, the CPU will exit adaptive-ticks mode only if needed, | ||
106 | for example, if that CPU enqueues an RCU callback. | ||
107 | |||
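The decision is cooperative: the tick stays off only while every interested subsystem agrees. A minimal sketch combining the per-subsystem helpers declared elsewhere in this series (sched_can_stop_tick(), posix_cpu_timers_can_stop_tick(), perf_event_can_stop_tick()); the combining function shown here, including its name and the exact order of checks, is an assumption:

        /* Illustrative sketch: may this adaptive-ticks CPU stop its tick? */
        static bool can_stop_full_tick(void)
        {
                if (!sched_can_stop_tick())                     /* more than one runnable task */
                        return false;
                if (!posix_cpu_timers_can_stop_tick(current))   /* armed CPU timers */
                        return false;
                if (!perf_event_can_stop_tick())                /* pending perf rotation */
                        return false;
                return true;
        }
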
108 | Just as with dyntick-idle mode, the benefits of adaptive-tick mode do | ||
109 | not come for free: | ||
110 | |||
111 | 1. CONFIG_NO_HZ_FULL selects CONFIG_NO_HZ_COMMON, so you cannot run | ||
112 | adaptive ticks without also running dyntick idle. This dependency | ||
113 | extends down into the implementation, so that all of the costs | ||
114 | of CONFIG_NO_HZ_IDLE are also incurred by CONFIG_NO_HZ_FULL. | ||
115 | |||
116 | 2. The user/kernel transitions are slightly more expensive due | ||
117 | to the need to inform kernel subsystems (such as RCU) about | ||
118 | the change in mode. | ||
119 | |||
120 | 3. POSIX CPU timers on adaptive-tick CPUs may miss their deadlines | ||
121 | (perhaps indefinitely) because they currently rely on | ||
122 | scheduling-tick interrupts. This will likely be fixed in | ||
123 | one of two ways: (1) Prevent CPUs with POSIX CPU timers from | ||
124 | entering adaptive-tick mode, or (2) Use hrtimers or other | ||
125 | adaptive-ticks-immune mechanism to cause the POSIX CPU timer to | ||
126 | fire properly. | ||
127 | |||
128 | 4. If there are more perf events pending than the hardware can | ||
129 | accommodate, they are normally round-robined so as to collect | ||
130 | all of them over time. Adaptive-tick mode may prevent this | ||
131 | round-robining from happening. This will likely be fixed by | ||
132 | preventing CPUs with large numbers of perf events pending from | ||
133 | entering adaptive-tick mode. | ||
134 | |||
135 | 5. Scheduler statistics for adaptive-tick CPUs may be computed | ||
136 | slightly differently than those for non-adaptive-tick CPUs. | ||
137 | This might in turn perturb load-balancing of real-time tasks. | ||
138 | |||
139 | 6. The LB_BIAS scheduler feature is disabled by adaptive ticks. | ||
140 | |||
141 | Although improvements are expected over time, adaptive ticks is quite | ||
142 | useful for many types of real-time and compute-intensive applications. | ||
143 | However, the drawbacks listed above mean that adaptive ticks should not | ||
144 | (yet) be enabled by default. | ||
145 | |||
146 | |||
147 | RCU IMPLICATIONS | ||
148 | |||
149 | There are situations in which idle CPUs cannot be permitted to | ||
150 | enter either dyntick-idle mode or adaptive-tick mode, the most | ||
151 | common being when that CPU has RCU callbacks pending. | ||
152 | |||
153 | The CONFIG_RCU_FAST_NO_HZ=y Kconfig option may be used to cause such CPUs | ||
154 | to enter dyntick-idle mode or adaptive-tick mode anyway. In this case, | ||
155 | a timer will awaken these CPUs every four jiffies in order to ensure | ||
156 | that the RCU callbacks are processed in a timely fashion. | ||
157 | |||
158 | Another approach is to offload RCU callback processing to "rcuo" kthreads | ||
159 | using the CONFIG_RCU_NOCB_CPU=y Kconfig option. The specific CPUs to | ||
160 | offload may be selected via several methods: | ||
161 | |||
162 | 1. One of three mutually exclusive Kconfig options specifies a | ||
163 | build-time default for the CPUs to offload: | ||
164 | |||
165 | a. The CONFIG_RCU_NOCB_CPU_NONE=y Kconfig option results in | ||
166 | no CPUs being offloaded. | ||
167 | |||
168 | b. The CONFIG_RCU_NOCB_CPU_ZERO=y Kconfig option causes | ||
169 | CPU 0 to be offloaded. | ||
170 | |||
171 | c. The CONFIG_RCU_NOCB_CPU_ALL=y Kconfig option causes all | ||
172 | CPUs to be offloaded. Note that the callbacks will be | ||
173 | offloaded to "rcuo" kthreads, and that those kthreads | ||
174 | will in fact run on some CPU. However, this approach | ||
175 | gives fine-grained control on exactly which CPUs the | ||
176 | callbacks run on, along with their scheduling priority | ||
177 | (including the default of SCHED_OTHER), and it further | ||
178 | allows this control to be varied dynamically at runtime. | ||
179 | |||
180 | 2. The "rcu_nocbs=" kernel boot parameter, which takes a comma-separated | ||
181 | list of CPUs and CPU ranges, for example, "1,3-5" selects CPUs 1, | ||
182 | 3, 4, and 5. The specified CPUs will be offloaded in addition to | ||
183 | any CPUs specified as offloaded by CONFIG_RCU_NOCB_CPU_ZERO=y or | ||
184 | CONFIG_RCU_NOCB_CPU_ALL=y. This means that the "rcu_nocbs=" boot | ||
185 | parameter has no effect for kernels built with CONFIG_RCU_NOCB_CPU_ALL=y. | ||
186 | |||
187 | The offloaded CPUs will never queue RCU callbacks, and therefore RCU | ||
188 | never prevents offloaded CPUs from entering either dyntick-idle mode | ||
189 | or adaptive-tick mode. That said, note that it is up to userspace to | ||
190 | pin the "rcuo" kthreads to specific CPUs if desired. Otherwise, the | ||
191 | scheduler will decide where to run them, which might or might not be | ||
192 | where you want them to run. | ||
193 | |||
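As a hedged userspace sketch of that pinning step (the PID below is a placeholder for an rcuo kthread looked up with ps, and CPU 0 is assumed to be the housekeeping CPU):

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>
        #include <sys/types.h>

        int main(void)
        {
                cpu_set_t set;
                pid_t rcuo_pid = 17;            /* hypothetical PID of an rcuo kthread */

                CPU_ZERO(&set);
                CPU_SET(0, &set);               /* pin to the housekeeping CPU */
                if (sched_setaffinity(rcuo_pid, sizeof(set), &set))
                        perror("sched_setaffinity");
                return 0;
        }
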
194 | |||
195 | KNOWN ISSUES | ||
196 | |||
197 | o Dyntick-idle slows transitions to and from idle slightly. | ||
198 | In practice, this has not been a problem except for the most | ||
199 | aggressive real-time workloads, which have the option of disabling | ||
200 | dyntick-idle mode, an option that most of them take. However, | ||
201 | some workloads will no doubt want to use adaptive ticks to | ||
202 | eliminate scheduling-clock interrupt latencies. Here are some | ||
203 | options for these workloads: | ||
204 | |||
205 | a. Use PM QoS from userspace to inform the kernel of your | ||
206 | latency requirements (preferred); see the sketch below. | ||
207 | |||
208 | b. On x86 systems, use the "idle=mwait" boot parameter. | ||
209 | |||
210 | c. On x86 systems, use the "intel_idle.max_cstate=" to limit | ||
211 | the maximum C-state depth. | ||
212 | |||
213 | d. On x86 systems, use the "idle=poll" boot parameter. | ||
214 | However, please note that use of this parameter can cause | ||
215 | your CPU to overheat, which may cause thermal throttling | ||
216 | to degrade your latencies -- and that this degradation can | ||
217 | be even worse than that of dyntick-idle. Furthermore, | ||
218 | this parameter effectively disables Turbo Mode on Intel | ||
219 | CPUs, which can significantly reduce maximum performance. | ||
220 | |||
221 | o Adaptive-ticks slows user/kernel transitions slightly. | ||
222 | This is not expected to be a problem for computationally intensive | ||
223 | workloads, which have few such transitions. Careful benchmarking | ||
224 | will be required to determine whether or not other workloads | ||
225 | are significantly affected by this effect. | ||
226 | |||
227 | o Adaptive-ticks does not do anything unless there is only one | ||
228 | runnable task for a given CPU, even though there are a number | ||
229 | of other situations where the scheduling-clock tick is not | ||
230 | needed. To give but one example, consider a CPU that has one | ||
231 | runnable high-priority SCHED_FIFO task and an arbitrary number | ||
232 | of low-priority SCHED_OTHER tasks. In this case, the CPU is | ||
233 | required to run the SCHED_FIFO task until it either blocks or | ||
234 | some other higher-priority task awakens on (or is assigned to) | ||
235 | this CPU, so there is no point in sending a scheduling-clock | ||
236 | interrupt to this CPU. However, the current implementation | ||
237 | nevertheless sends scheduling-clock interrupts to CPUs having a | ||
238 | single runnable SCHED_FIFO task and multiple runnable SCHED_OTHER | ||
239 | tasks, even though these interrupts are unnecessary. | ||
240 | |||
241 | Better handling of these sorts of situations is future work. | ||
242 | |||
243 | o A reboot is required to reconfigure both adaptive idle and RCU | ||
244 | callback offloading. Runtime reconfiguration could be provided | ||
245 | if needed; however, due to the complexity of reconfiguring RCU at | ||
246 | runtime, there would need to be an earthshakingly good reason, | ||
247 | especially since you have the straightforward option of | ||
248 | simply offloading RCU callbacks from all CPUs and pinning them | ||
249 | where you want them whenever you want them pinned. | ||
250 | |||
251 | o Additional configuration is required to deal with other sources | ||
252 | of OS jitter, including interrupts and system-utility tasks | ||
253 | and processes. This configuration normally involves binding | ||
254 | interrupts and tasks to particular CPUs. | ||
255 | |||
256 | o Some sources of OS jitter can currently be eliminated only by | ||
257 | constraining the workload. For example, the only way to eliminate | ||
258 | OS jitter due to global TLB shootdowns is to avoid the unmapping | ||
259 | operations (such as kernel module unload operations) that | ||
260 | result in these shootdowns. For another example, page faults | ||
261 | and TLB misses can be reduced (and in some cases eliminated) by | ||
262 | using huge pages and by constraining the amount of memory used | ||
263 | by the application. Pre-faulting the working set can also be | ||
264 | helpful, especially when combined with the mlock() and mlockall() | ||
265 | system calls (see the mlockall() sketch below). | ||
266 | |||
267 | o Unless all CPUs are idle, at least one CPU must keep the | ||
268 | scheduling-clock interrupt going in order to support accurate | ||
269 | timekeeping. | ||
270 | |||
271 | o If there are adaptive-ticks CPUs, there will be at least one | ||
272 | CPU keeping the scheduling-clock interrupt going, even if all | ||
273 | CPUs are otherwise idle. | ||
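The PM QoS option mentioned in the first known-issues item above can be exercised from userspace by writing a latency bound, as a binary 32-bit integer number of microseconds, to /dev/cpu_dma_latency and holding the file descriptor open for as long as the bound should apply. A hedged sketch (the 20-microsecond bound is only an example):

        #include <fcntl.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int32_t latency_us = 20;        /* example latency bound */
                int fd = open("/dev/cpu_dma_latency", O_WRONLY);

                if (fd < 0 || write(fd, &latency_us, sizeof(latency_us)) < 0) {
                        perror("cpu_dma_latency");
                        return 1;
                }
                pause();        /* the request holds only while the fd stays open */
                return 0;
        }
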
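Similarly, the mlockall() approach mentioned above amounts to locking current and future mappings and touching the working set once up front, so page faults happen before the latency-sensitive work begins. A hedged sketch with an arbitrary working-set size:

        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>
        #include <sys/mman.h>

        #define WORKING_SET (64 * 1024 * 1024)  /* arbitrary example size */

        int main(void)
        {
                char *buf;

                if (mlockall(MCL_CURRENT | MCL_FUTURE)) {       /* lock all mappings */
                        perror("mlockall");
                        return 1;
                }
                buf = malloc(WORKING_SET);
                if (!buf)
                        return 1;
                memset(buf, 0, WORKING_SET);    /* pre-fault the working set */
                /* ... latency-sensitive work runs here ... */
                free(buf);
                return 0;
        }
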
diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 2df313b6a586..c92306809029 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h | |||
@@ -30,8 +30,8 @@ DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC); | |||
30 | #ifdef CONFIG_PRINTK | 30 | #ifdef CONFIG_PRINTK |
31 | DEFINE(UML_CONFIG_PRINTK, CONFIG_PRINTK); | 31 | DEFINE(UML_CONFIG_PRINTK, CONFIG_PRINTK); |
32 | #endif | 32 | #endif |
33 | #ifdef CONFIG_NO_HZ | 33 | #ifdef CONFIG_NO_HZ_COMMON |
34 | DEFINE(UML_CONFIG_NO_HZ, CONFIG_NO_HZ); | 34 | DEFINE(UML_CONFIG_NO_HZ_COMMON, CONFIG_NO_HZ_COMMON); |
35 | #endif | 35 | #endif |
36 | #ifdef CONFIG_UML_X86 | 36 | #ifdef CONFIG_UML_X86 |
37 | DEFINE(UML_CONFIG_UML_X86, CONFIG_UML_X86); | 37 | DEFINE(UML_CONFIG_UML_X86, CONFIG_UML_X86); |
diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c index fac388cb464f..e9824d5dd7d5 100644 --- a/arch/um/os-Linux/time.c +++ b/arch/um/os-Linux/time.c | |||
@@ -79,7 +79,7 @@ long long os_nsecs(void) | |||
79 | return timeval_to_ns(&tv); | 79 | return timeval_to_ns(&tv); |
80 | } | 80 | } |
81 | 81 | ||
82 | #ifdef UML_CONFIG_NO_HZ | 82 | #ifdef UML_CONFIG_NO_HZ_COMMON |
83 | static int after_sleep_interval(struct timespec *ts) | 83 | static int after_sleep_interval(struct timespec *ts) |
84 | { | 84 | { |
85 | return 0; | 85 | return 0; |
diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h index a8ece9a33aef..2c9e62c2bfd0 100644 --- a/include/asm-generic/cputime_nsecs.h +++ b/include/asm-generic/cputime_nsecs.h | |||
@@ -16,21 +16,27 @@ | |||
16 | #ifndef _ASM_GENERIC_CPUTIME_NSECS_H | 16 | #ifndef _ASM_GENERIC_CPUTIME_NSECS_H |
17 | #define _ASM_GENERIC_CPUTIME_NSECS_H | 17 | #define _ASM_GENERIC_CPUTIME_NSECS_H |
18 | 18 | ||
19 | #include <linux/math64.h> | ||
20 | |||
19 | typedef u64 __nocast cputime_t; | 21 | typedef u64 __nocast cputime_t; |
20 | typedef u64 __nocast cputime64_t; | 22 | typedef u64 __nocast cputime64_t; |
21 | 23 | ||
22 | #define cputime_one_jiffy jiffies_to_cputime(1) | 24 | #define cputime_one_jiffy jiffies_to_cputime(1) |
23 | 25 | ||
26 | #define cputime_div(__ct, divisor) div_u64((__force u64)__ct, divisor) | ||
27 | #define cputime_div_rem(__ct, divisor, remainder) \ | ||
28 | div_u64_rem((__force u64)__ct, divisor, remainder); | ||
29 | |||
24 | /* | 30 | /* |
25 | * Convert cputime <-> jiffies (HZ) | 31 | * Convert cputime <-> jiffies (HZ) |
26 | */ | 32 | */ |
27 | #define cputime_to_jiffies(__ct) \ | 33 | #define cputime_to_jiffies(__ct) \ |
28 | ((__force u64)(__ct) / (NSEC_PER_SEC / HZ)) | 34 | cputime_div(__ct, NSEC_PER_SEC / HZ) |
29 | #define cputime_to_scaled(__ct) (__ct) | 35 | #define cputime_to_scaled(__ct) (__ct) |
30 | #define jiffies_to_cputime(__jif) \ | 36 | #define jiffies_to_cputime(__jif) \ |
31 | (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ)) | 37 | (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ)) |
32 | #define cputime64_to_jiffies64(__ct) \ | 38 | #define cputime64_to_jiffies64(__ct) \ |
33 | ((__force u64)(__ct) / (NSEC_PER_SEC / HZ)) | 39 | cputime_div(__ct, NSEC_PER_SEC / HZ) |
34 | #define jiffies64_to_cputime64(__jif) \ | 40 | #define jiffies64_to_cputime64(__jif) \ |
35 | (__force cputime64_t)((__jif) * (NSEC_PER_SEC / HZ)) | 41 | (__force cputime64_t)((__jif) * (NSEC_PER_SEC / HZ)) |
36 | 42 | ||
@@ -45,7 +51,7 @@ typedef u64 __nocast cputime64_t; | |||
45 | * Convert cputime <-> microseconds | 51 | * Convert cputime <-> microseconds |
46 | */ | 52 | */ |
47 | #define cputime_to_usecs(__ct) \ | 53 | #define cputime_to_usecs(__ct) \ |
48 | ((__force u64)(__ct) / NSEC_PER_USEC) | 54 | cputime_div(__ct, NSEC_PER_USEC) |
49 | #define usecs_to_cputime(__usecs) \ | 55 | #define usecs_to_cputime(__usecs) \ |
50 | (__force cputime_t)((__usecs) * NSEC_PER_USEC) | 56 | (__force cputime_t)((__usecs) * NSEC_PER_USEC) |
51 | #define usecs_to_cputime64(__usecs) \ | 57 | #define usecs_to_cputime64(__usecs) \ |
@@ -55,7 +61,7 @@ typedef u64 __nocast cputime64_t; | |||
55 | * Convert cputime <-> seconds | 61 | * Convert cputime <-> seconds |
56 | */ | 62 | */ |
57 | #define cputime_to_secs(__ct) \ | 63 | #define cputime_to_secs(__ct) \ |
58 | ((__force u64)(__ct) / NSEC_PER_SEC) | 64 | cputime_div(__ct, NSEC_PER_SEC) |
59 | #define secs_to_cputime(__secs) \ | 65 | #define secs_to_cputime(__secs) \ |
60 | (__force cputime_t)((__secs) * NSEC_PER_SEC) | 66 | (__force cputime_t)((__secs) * NSEC_PER_SEC) |
61 | 67 | ||
@@ -69,8 +75,10 @@ static inline cputime_t timespec_to_cputime(const struct timespec *val) | |||
69 | } | 75 | } |
70 | static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val) | 76 | static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val) |
71 | { | 77 | { |
72 | val->tv_sec = (__force u64) ct / NSEC_PER_SEC; | 78 | u32 rem; |
73 | val->tv_nsec = (__force u64) ct % NSEC_PER_SEC; | 79 | |
80 | val->tv_sec = cputime_div_rem(ct, NSEC_PER_SEC, &rem); | ||
81 | val->tv_nsec = rem; | ||
74 | } | 82 | } |
75 | 83 | ||
76 | /* | 84 | /* |
@@ -83,15 +91,17 @@ static inline cputime_t timeval_to_cputime(const struct timeval *val) | |||
83 | } | 91 | } |
84 | static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val) | 92 | static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val) |
85 | { | 93 | { |
86 | val->tv_sec = (__force u64) ct / NSEC_PER_SEC; | 94 | u32 rem; |
87 | val->tv_usec = ((__force u64) ct % NSEC_PER_SEC) / NSEC_PER_USEC; | 95 | |
96 | val->tv_sec = cputime_div_rem(ct, NSEC_PER_SEC, &rem); | ||
97 | val->tv_usec = rem / NSEC_PER_USEC; | ||
88 | } | 98 | } |
89 | 99 | ||
90 | /* | 100 | /* |
91 | * Convert cputime <-> clock (USER_HZ) | 101 | * Convert cputime <-> clock (USER_HZ) |
92 | */ | 102 | */ |
93 | #define cputime_to_clock_t(__ct) \ | 103 | #define cputime_to_clock_t(__ct) \ |
94 | ((__force u64)(__ct) / (NSEC_PER_SEC / USER_HZ)) | 104 | cputime_div(__ct, (NSEC_PER_SEC / USER_HZ)) |
95 | #define clock_t_to_cputime(__x) \ | 105 | #define clock_t_to_cputime(__x) \ |
96 | (__force cputime_t)((__x) * (NSEC_PER_SEC / USER_HZ)) | 106 | (__force cputime_t)((__x) * (NSEC_PER_SEC / USER_HZ)) |
97 | 107 | ||
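The cputime_div()/cputime_div_rem() wrappers above expand to div_u64()/div_u64_rem() from <linux/math64.h> (hence the added include), which divide a 64-bit dividend by a 32-bit divisor. On 32-bit architectures a plain '/' on a u64 dividend would pull in libgcc's 64-bit division helper, which is presumably what this change avoids now that cputime_t is a nanosecond-granularity u64. For instance, the timespec conversion above expands roughly to:

        u32 rem;
        u64 secs = div_u64_rem((u64)ct, NSEC_PER_SEC, &rem);   /* whole seconds, ns remainder */
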
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e0373d26c244..f463a46424e2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h | |||
@@ -788,6 +788,12 @@ static inline int __perf_event_disable(void *info) { return -1; } | |||
788 | static inline void perf_event_task_tick(void) { } | 788 | static inline void perf_event_task_tick(void) { } |
789 | #endif | 789 | #endif |
790 | 790 | ||
791 | #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL) | ||
792 | extern bool perf_event_can_stop_tick(void); | ||
793 | #else | ||
794 | static inline bool perf_event_can_stop_tick(void) { return true; } | ||
795 | #endif | ||
796 | |||
791 | #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) | 797 | #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) |
792 | extern void perf_restore_debug_store(void); | 798 | extern void perf_restore_debug_store(void); |
793 | #else | 799 | #else |
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 042058fdb0af..3698d9d08978 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h | |||
@@ -122,6 +122,8 @@ void run_posix_cpu_timers(struct task_struct *task); | |||
122 | void posix_cpu_timers_exit(struct task_struct *task); | 122 | void posix_cpu_timers_exit(struct task_struct *task); |
123 | void posix_cpu_timers_exit_group(struct task_struct *task); | 123 | void posix_cpu_timers_exit_group(struct task_struct *task); |
124 | 124 | ||
125 | bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk); | ||
126 | |||
125 | void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, | 127 | void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, |
126 | cputime_t *newval, cputime_t *oldval); | 128 | cputime_t *newval, cputime_t *oldval); |
127 | 129 | ||
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9ed2c9a4de45..4ccd68e49b00 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h | |||
@@ -1000,4 +1000,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) | |||
1000 | #define kfree_rcu(ptr, rcu_head) \ | 1000 | #define kfree_rcu(ptr, rcu_head) \ |
1001 | __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) | 1001 | __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) |
1002 | 1002 | ||
1003 | #ifdef CONFIG_RCU_NOCB_CPU | ||
1004 | extern bool rcu_is_nocb_cpu(int cpu); | ||
1005 | #else | ||
1006 | static inline bool rcu_is_nocb_cpu(int cpu) { return false; } | ||
1007 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | ||
1008 | |||
1009 | |||
1003 | #endif /* __LINUX_RCUPDATE_H */ | 1010 | #endif /* __LINUX_RCUPDATE_H */ |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 981ab6887259..ebf7095158a9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -231,7 +231,7 @@ extern void init_idle_bootup_task(struct task_struct *idle); | |||
231 | 231 | ||
232 | extern int runqueue_is_locked(int cpu); | 232 | extern int runqueue_is_locked(int cpu); |
233 | 233 | ||
234 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) | 234 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) |
235 | extern void nohz_balance_enter_idle(int cpu); | 235 | extern void nohz_balance_enter_idle(int cpu); |
236 | extern void set_cpu_sd_state_idle(void); | 236 | extern void set_cpu_sd_state_idle(void); |
237 | extern int get_nohz_timer_target(void); | 237 | extern int get_nohz_timer_target(void); |
@@ -1762,13 +1762,13 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, | |||
1762 | } | 1762 | } |
1763 | #endif | 1763 | #endif |
1764 | 1764 | ||
1765 | #ifdef CONFIG_NO_HZ | 1765 | #ifdef CONFIG_NO_HZ_COMMON |
1766 | void calc_load_enter_idle(void); | 1766 | void calc_load_enter_idle(void); |
1767 | void calc_load_exit_idle(void); | 1767 | void calc_load_exit_idle(void); |
1768 | #else | 1768 | #else |
1769 | static inline void calc_load_enter_idle(void) { } | 1769 | static inline void calc_load_enter_idle(void) { } |
1770 | static inline void calc_load_exit_idle(void) { } | 1770 | static inline void calc_load_exit_idle(void) { } |
1771 | #endif /* CONFIG_NO_HZ */ | 1771 | #endif /* CONFIG_NO_HZ_COMMON */ |
1772 | 1772 | ||
1773 | #ifndef CONFIG_CPUMASK_OFFSTACK | 1773 | #ifndef CONFIG_CPUMASK_OFFSTACK |
1774 | static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | 1774 | static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) |
@@ -1854,10 +1854,16 @@ extern void idle_task_exit(void); | |||
1854 | static inline void idle_task_exit(void) {} | 1854 | static inline void idle_task_exit(void) {} |
1855 | #endif | 1855 | #endif |
1856 | 1856 | ||
1857 | #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) | 1857 | #if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) |
1858 | extern void wake_up_idle_cpu(int cpu); | 1858 | extern void wake_up_nohz_cpu(int cpu); |
1859 | #else | 1859 | #else |
1860 | static inline void wake_up_idle_cpu(int cpu) { } | 1860 | static inline void wake_up_nohz_cpu(int cpu) { } |
1861 | #endif | ||
1862 | |||
1863 | #ifdef CONFIG_NO_HZ_FULL | ||
1864 | extern bool sched_can_stop_tick(void); | ||
1865 | #else | ||
1866 | static inline bool sched_can_stop_tick(void) { return false; } | ||
1861 | #endif | 1867 | #endif |
1862 | 1868 | ||
1863 | #ifdef CONFIG_SCHED_AUTOGROUP | 1869 | #ifdef CONFIG_SCHED_AUTOGROUP |
diff --git a/include/linux/tick.h b/include/linux/tick.h index 553272e6af55..9180f4b85e6d 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h | |||
@@ -82,7 +82,7 @@ extern int tick_program_event(ktime_t expires, int force); | |||
82 | extern void tick_setup_sched_timer(void); | 82 | extern void tick_setup_sched_timer(void); |
83 | # endif | 83 | # endif |
84 | 84 | ||
85 | # if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS | 85 | # if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS |
86 | extern void tick_cancel_sched_timer(int cpu); | 86 | extern void tick_cancel_sched_timer(int cpu); |
87 | # else | 87 | # else |
88 | static inline void tick_cancel_sched_timer(int cpu) { } | 88 | static inline void tick_cancel_sched_timer(int cpu) { } |
@@ -123,7 +123,7 @@ static inline void tick_check_idle(int cpu) { } | |||
123 | static inline int tick_oneshot_mode_active(void) { return 0; } | 123 | static inline int tick_oneshot_mode_active(void) { return 0; } |
124 | #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ | 124 | #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ |
125 | 125 | ||
126 | # ifdef CONFIG_NO_HZ | 126 | # ifdef CONFIG_NO_HZ_COMMON |
127 | DECLARE_PER_CPU(struct tick_sched, tick_cpu_sched); | 127 | DECLARE_PER_CPU(struct tick_sched, tick_cpu_sched); |
128 | 128 | ||
129 | static inline int tick_nohz_tick_stopped(void) | 129 | static inline int tick_nohz_tick_stopped(void) |
@@ -138,7 +138,7 @@ extern ktime_t tick_nohz_get_sleep_length(void); | |||
138 | extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); | 138 | extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); |
139 | extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); | 139 | extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); |
140 | 140 | ||
141 | # else /* !CONFIG_NO_HZ */ | 141 | # else /* !CONFIG_NO_HZ_COMMON */ |
142 | static inline int tick_nohz_tick_stopped(void) | 142 | static inline int tick_nohz_tick_stopped(void) |
143 | { | 143 | { |
144 | return 0; | 144 | return 0; |
@@ -155,7 +155,24 @@ static inline ktime_t tick_nohz_get_sleep_length(void) | |||
155 | } | 155 | } |
156 | static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } | 156 | static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } |
157 | static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } | 157 | static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } |
158 | # endif /* !NO_HZ */ | 158 | # endif /* !CONFIG_NO_HZ_COMMON */ |
159 | |||
160 | #ifdef CONFIG_NO_HZ_FULL | ||
161 | extern void tick_nohz_init(void); | ||
162 | extern int tick_nohz_full_cpu(int cpu); | ||
163 | extern void tick_nohz_full_check(void); | ||
164 | extern void tick_nohz_full_kick(void); | ||
165 | extern void tick_nohz_full_kick_all(void); | ||
166 | extern void tick_nohz_task_switch(struct task_struct *tsk); | ||
167 | #else | ||
168 | static inline void tick_nohz_init(void) { } | ||
169 | static inline int tick_nohz_full_cpu(int cpu) { return 0; } | ||
170 | static inline void tick_nohz_full_check(void) { } | ||
171 | static inline void tick_nohz_full_kick(void) { } | ||
172 | static inline void tick_nohz_full_kick_all(void) { } | ||
173 | static inline void tick_nohz_task_switch(struct task_struct *tsk) { } | ||
174 | #endif | ||
175 | |||
159 | 176 | ||
160 | # ifdef CONFIG_CPU_IDLE_GOV_MENU | 177 | # ifdef CONFIG_CPU_IDLE_GOV_MENU |
161 | extern void menu_hrtimer_cancel(void); | 178 | extern void menu_hrtimer_cancel(void); |
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 425bcfe56c62..e967dd8a34c6 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h | |||
@@ -323,6 +323,27 @@ TRACE_EVENT(itimer_expire, | |||
323 | (int) __entry->pid, (unsigned long long)__entry->now) | 323 | (int) __entry->pid, (unsigned long long)__entry->now) |
324 | ); | 324 | ); |
325 | 325 | ||
326 | #ifdef CONFIG_NO_HZ_COMMON | ||
327 | TRACE_EVENT(tick_stop, | ||
328 | |||
329 | TP_PROTO(int success, char *error_msg), | ||
330 | |||
331 | TP_ARGS(success, error_msg), | ||
332 | |||
333 | TP_STRUCT__entry( | ||
334 | __field( int , success ) | ||
335 | __string( msg, error_msg ) | ||
336 | ), | ||
337 | |||
338 | TP_fast_assign( | ||
339 | __entry->success = success; | ||
340 | __assign_str(msg, error_msg); | ||
341 | ), | ||
342 | |||
343 | TP_printk("success=%s msg=%s", __entry->success ? "yes" : "no", __get_str(msg)) | ||
344 | ); | ||
345 | #endif | ||
346 | |||
326 | #endif /* _TRACE_TIMER_H */ | 347 | #endif /* _TRACE_TIMER_H */ |
327 | 348 | ||
328 | /* This part must be outside protection */ | 349 | /* This part must be outside protection */ |
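A TRACE_EVENT(tick_stop, ...) definition such as the one above generates a trace_tick_stop() call site for the tick-stopping code. A hedged sketch of how it might be invoked (the guard variable and the reason string are illustrative, not taken from this patch):

        if (!can_stop_tick)
                trace_tick_stop(0, "more than one runnable task");
        else
                trace_tick_stop(1, " ");
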
diff --git a/init/Kconfig b/init/Kconfig index 4367e1379002..66f67afad4fa 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -302,7 +302,7 @@ choice | |||
302 | # Kind of a stub config for the pure tick based cputime accounting | 302 | # Kind of a stub config for the pure tick based cputime accounting |
303 | config TICK_CPU_ACCOUNTING | 303 | config TICK_CPU_ACCOUNTING |
304 | bool "Simple tick based cputime accounting" | 304 | bool "Simple tick based cputime accounting" |
305 | depends on !S390 | 305 | depends on !S390 && !NO_HZ_FULL |
306 | help | 306 | help |
307 | This is the basic tick based cputime accounting that maintains | 307 | This is the basic tick based cputime accounting that maintains |
308 | statistics about user, system and idle time spent on per jiffies | 308 | statistics about user, system and idle time spent on per jiffies |
@@ -312,7 +312,7 @@ config TICK_CPU_ACCOUNTING | |||
312 | 312 | ||
313 | config VIRT_CPU_ACCOUNTING_NATIVE | 313 | config VIRT_CPU_ACCOUNTING_NATIVE |
314 | bool "Deterministic task and CPU time accounting" | 314 | bool "Deterministic task and CPU time accounting" |
315 | depends on HAVE_VIRT_CPU_ACCOUNTING | 315 | depends on HAVE_VIRT_CPU_ACCOUNTING && !NO_HZ_FULL |
316 | select VIRT_CPU_ACCOUNTING | 316 | select VIRT_CPU_ACCOUNTING |
317 | help | 317 | help |
318 | Select this option to enable more accurate task and CPU time | 318 | Select this option to enable more accurate task and CPU time |
@@ -342,7 +342,7 @@ config VIRT_CPU_ACCOUNTING_GEN | |||
342 | 342 | ||
343 | config IRQ_TIME_ACCOUNTING | 343 | config IRQ_TIME_ACCOUNTING |
344 | bool "Fine granularity task level IRQ time accounting" | 344 | bool "Fine granularity task level IRQ time accounting" |
345 | depends on HAVE_IRQ_TIME_ACCOUNTING | 345 | depends on HAVE_IRQ_TIME_ACCOUNTING && !NO_HZ_FULL |
346 | help | 346 | help |
347 | Select this option to enable fine granularity task irq time | 347 | Select this option to enable fine granularity task irq time |
348 | accounting. This is done by reading a timestamp on each | 348 | accounting. This is done by reading a timestamp on each |
@@ -576,7 +576,7 @@ config RCU_FANOUT_EXACT | |||
576 | 576 | ||
577 | config RCU_FAST_NO_HZ | 577 | config RCU_FAST_NO_HZ |
578 | bool "Accelerate last non-dyntick-idle CPU's grace periods" | 578 | bool "Accelerate last non-dyntick-idle CPU's grace periods" |
579 | depends on NO_HZ && SMP | 579 | depends on NO_HZ_COMMON && SMP |
580 | default n | 580 | default n |
581 | help | 581 | help |
582 | This option permits CPUs to enter dynticks-idle state even if | 582 | This option permits CPUs to enter dynticks-idle state even if |
diff --git a/init/main.c b/init/main.c index 12c366944dbd..1952bf2f6875 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -545,6 +545,7 @@ asmlinkage void __init start_kernel(void) | |||
545 | idr_init_cache(); | 545 | idr_init_cache(); |
546 | perf_event_init(); | 546 | perf_event_init(); |
547 | rcu_init(); | 547 | rcu_init(); |
548 | tick_nohz_init(); | ||
548 | radix_tree_init(); | 549 | radix_tree_init(); |
549 | /* init some links before init_ISA_irqs() */ | 550 | /* init some links before init_ISA_irqs() */ |
550 | early_irq_init(); | 551 | early_irq_init(); |
diff --git a/kernel/events/core.c b/kernel/events/core.c index 3820e3cefbae..6b41c1899a8b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/poll.h> | 18 | #include <linux/poll.h> |
19 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
20 | #include <linux/hash.h> | 20 | #include <linux/hash.h> |
21 | #include <linux/tick.h> | ||
21 | #include <linux/sysfs.h> | 22 | #include <linux/sysfs.h> |
22 | #include <linux/dcache.h> | 23 | #include <linux/dcache.h> |
23 | #include <linux/percpu.h> | 24 | #include <linux/percpu.h> |
@@ -685,8 +686,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu) | |||
685 | 686 | ||
686 | WARN_ON(!irqs_disabled()); | 687 | WARN_ON(!irqs_disabled()); |
687 | 688 | ||
688 | if (list_empty(&cpuctx->rotation_list)) | 689 | if (list_empty(&cpuctx->rotation_list)) { |
690 | int was_empty = list_empty(head); | ||
689 | list_add(&cpuctx->rotation_list, head); | 691 | list_add(&cpuctx->rotation_list, head); |
692 | if (was_empty) | ||
693 | tick_nohz_full_kick(); | ||
694 | } | ||
690 | } | 695 | } |
691 | 696 | ||
692 | static void get_ctx(struct perf_event_context *ctx) | 697 | static void get_ctx(struct perf_event_context *ctx) |
@@ -2591,6 +2596,16 @@ done: | |||
2591 | list_del_init(&cpuctx->rotation_list); | 2596 | list_del_init(&cpuctx->rotation_list); |
2592 | } | 2597 | } |
2593 | 2598 | ||
2599 | #ifdef CONFIG_NO_HZ_FULL | ||
2600 | bool perf_event_can_stop_tick(void) | ||
2601 | { | ||
2602 | if (list_empty(&__get_cpu_var(rotation_list))) | ||
2603 | return true; | ||
2604 | else | ||
2605 | return false; | ||
2606 | } | ||
2607 | #endif | ||
2608 | |||
2594 | void perf_event_task_tick(void) | 2609 | void perf_event_task_tick(void) |
2595 | { | 2610 | { |
2596 | struct list_head *head = &__get_cpu_var(rotation_list); | 2611 | struct list_head *head = &__get_cpu_var(rotation_list); |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 14be27feda49..abfd89d687ac 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -161,7 +161,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, | |||
161 | */ | 161 | */ |
162 | static int hrtimer_get_target(int this_cpu, int pinned) | 162 | static int hrtimer_get_target(int this_cpu, int pinned) |
163 | { | 163 | { |
164 | #ifdef CONFIG_NO_HZ | 164 | #ifdef CONFIG_NO_HZ_COMMON |
165 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) | 165 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) |
166 | return get_nohz_timer_target(); | 166 | return get_nohz_timer_target(); |
167 | #endif | 167 | #endif |
@@ -1107,7 +1107,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) | |||
1107 | } | 1107 | } |
1108 | EXPORT_SYMBOL_GPL(hrtimer_get_remaining); | 1108 | EXPORT_SYMBOL_GPL(hrtimer_get_remaining); |
1109 | 1109 | ||
1110 | #ifdef CONFIG_NO_HZ | 1110 | #ifdef CONFIG_NO_HZ_COMMON |
1111 | /** | 1111 | /** |
1112 | * hrtimer_get_next_event - get the time until next expiry event | 1112 | * hrtimer_get_next_event - get the time until next expiry event |
1113 | * | 1113 | * |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 8fd709c9bb58..42670e9b44e0 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -10,6 +10,8 @@ | |||
10 | #include <linux/kernel_stat.h> | 10 | #include <linux/kernel_stat.h> |
11 | #include <trace/events/timer.h> | 11 | #include <trace/events/timer.h> |
12 | #include <linux/random.h> | 12 | #include <linux/random.h> |
13 | #include <linux/tick.h> | ||
14 | #include <linux/workqueue.h> | ||
13 | 15 | ||
14 | /* | 16 | /* |
15 | * Called after updating RLIMIT_CPU to run cpu timer and update | 17 | * Called after updating RLIMIT_CPU to run cpu timer and update |
@@ -153,6 +155,21 @@ static void bump_cpu_timer(struct k_itimer *timer, | |||
153 | } | 155 | } |
154 | } | 156 | } |
155 | 157 | ||
158 | /** | ||
159 | * task_cputime_zero - Check a task_cputime struct for all zero fields. | ||
160 | * | ||
161 | * @cputime: The struct to compare. | ||
162 | * | ||
163 | * Checks @cputime to see if all fields are zero. Returns true if all fields | ||
164 | * are zero, false if any field is nonzero. | ||
165 | */ | ||
166 | static inline int task_cputime_zero(const struct task_cputime *cputime) | ||
167 | { | ||
168 | if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) | ||
169 | return 1; | ||
170 | return 0; | ||
171 | } | ||
172 | |||
156 | static inline cputime_t prof_ticks(struct task_struct *p) | 173 | static inline cputime_t prof_ticks(struct task_struct *p) |
157 | { | 174 | { |
158 | cputime_t utime, stime; | 175 | cputime_t utime, stime; |
@@ -636,6 +653,37 @@ static int cpu_timer_sample_group(const clockid_t which_clock, | |||
636 | return 0; | 653 | return 0; |
637 | } | 654 | } |
638 | 655 | ||
656 | #ifdef CONFIG_NO_HZ_FULL | ||
657 | static void nohz_kick_work_fn(struct work_struct *work) | ||
658 | { | ||
659 | tick_nohz_full_kick_all(); | ||
660 | } | ||
661 | |||
662 | static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); | ||
663 | |||
664 | /* | ||
665 | * We need the IPIs to be sent from sane process context. | ||
666 | * The posix cpu timers are always set with irqs disabled. | ||
667 | */ | ||
668 | static void posix_cpu_timer_kick_nohz(void) | ||
669 | { | ||
670 | schedule_work(&nohz_kick_work); | ||
671 | } | ||
672 | |||
673 | bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) | ||
674 | { | ||
675 | if (!task_cputime_zero(&tsk->cputime_expires)) | ||
676 | return false; | ||
677 | |||
678 | if (tsk->signal->cputimer.running) | ||
679 | return false; | ||
680 | |||
681 | return true; | ||
682 | } | ||
683 | #else | ||
684 | static inline void posix_cpu_timer_kick_nohz(void) { } | ||
685 | #endif | ||
686 | |||
639 | /* | 687 | /* |
640 | * Guts of sys_timer_settime for CPU timers. | 688 | * Guts of sys_timer_settime for CPU timers. |
641 | * This is called with the timer locked and interrupts disabled. | 689 | * This is called with the timer locked and interrupts disabled. |
@@ -794,6 +842,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
794 | sample_to_timespec(timer->it_clock, | 842 | sample_to_timespec(timer->it_clock, |
795 | old_incr, &old->it_interval); | 843 | old_incr, &old->it_interval); |
796 | } | 844 | } |
845 | if (!ret) | ||
846 | posix_cpu_timer_kick_nohz(); | ||
797 | return ret; | 847 | return ret; |
798 | } | 848 | } |
799 | 849 | ||
@@ -1008,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, | |||
1008 | } | 1058 | } |
1009 | } | 1059 | } |
1010 | 1060 | ||
1011 | /** | ||
1012 | * task_cputime_zero - Check a task_cputime struct for all zero fields. | ||
1013 | * | ||
1014 | * @cputime: The struct to compare. | ||
1015 | * | ||
1016 | * Checks @cputime to see if all fields are zero. Returns true if all fields | ||
1017 | * are zero, false if any field is nonzero. | ||
1018 | */ | ||
1019 | static inline int task_cputime_zero(const struct task_cputime *cputime) | ||
1020 | { | ||
1021 | if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) | ||
1022 | return 1; | ||
1023 | return 0; | ||
1024 | } | ||
1025 | |||
1026 | /* | 1061 | /* |
1027 | * Check for any per-thread CPU timers that have fired and move them | 1062 | * Check for any per-thread CPU timers that have fired and move them |
1028 | * off the tsk->*_timers list onto the firing list. Per-thread timers | 1063 | * off the tsk->*_timers list onto the firing list. Per-thread timers |
@@ -1336,6 +1371,13 @@ void run_posix_cpu_timers(struct task_struct *tsk) | |||
1336 | cpu_timer_fire(timer); | 1371 | cpu_timer_fire(timer); |
1337 | spin_unlock(&timer->it_lock); | 1372 | spin_unlock(&timer->it_lock); |
1338 | } | 1373 | } |
1374 | |||
1375 | /* | ||
1376 | * In case some timers were rescheduled after the queue got emptied, | ||
1377 | * wake up full dynticks CPUs. | ||
1378 | */ | ||
1379 | if (tsk->signal->cputimer.running) | ||
1380 | posix_cpu_timer_kick_nohz(); | ||
1339 | } | 1381 | } |
1340 | 1382 | ||
1341 | /* | 1383 | /* |
@@ -1366,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1366 | } | 1408 | } |
1367 | 1409 | ||
1368 | if (!*newval) | 1410 | if (!*newval) |
1369 | return; | 1411 | goto out; |
1370 | *newval += now.cpu; | 1412 | *newval += now.cpu; |
1371 | } | 1413 | } |
1372 | 1414 | ||
@@ -1384,6 +1426,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, | |||
1384 | tsk->signal->cputime_expires.virt_exp = *newval; | 1426 | tsk->signal->cputime_expires.virt_exp = *newval; |
1385 | break; | 1427 | break; |
1386 | } | 1428 | } |
1429 | out: | ||
1430 | posix_cpu_timer_kick_nohz(); | ||
1387 | } | 1431 | } |
1388 | 1432 | ||
1389 | static int do_cpu_nanosleep(const clockid_t which_clock, int flags, | 1433 | static int do_cpu_nanosleep(const clockid_t which_clock, int flags, |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d8534308fd05..16ea67925015 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -799,6 +799,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
799 | rdp->offline_fqs++; | 799 | rdp->offline_fqs++; |
800 | return 1; | 800 | return 1; |
801 | } | 801 | } |
802 | |||
803 | /* | ||
804 | * There is a possibility that a CPU in adaptive-ticks state | ||
805 | * might run in the kernel with the scheduling-clock tick disabled | ||
806 | * for an extended time period. Invoke rcu_kick_nohz_cpu() to | ||
807 | * force the CPU to restart the scheduling-clock tick if this | ||
808 | * CPU is in this state. | ||
809 | */ | ||
810 | rcu_kick_nohz_cpu(rdp->cpu); | ||
811 | |||
802 | return 0; | 812 | return 0; |
803 | } | 813 | } |
804 | 814 | ||
@@ -1820,7 +1830,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | |||
1820 | struct rcu_node *rnp, struct rcu_data *rdp) | 1830 | struct rcu_node *rnp, struct rcu_data *rdp) |
1821 | { | 1831 | { |
1822 | /* No-CBs CPUs do not have orphanable callbacks. */ | 1832 | /* No-CBs CPUs do not have orphanable callbacks. */ |
1823 | if (is_nocb_cpu(rdp->cpu)) | 1833 | if (rcu_is_nocb_cpu(rdp->cpu)) |
1824 | return; | 1834 | return; |
1825 | 1835 | ||
1826 | /* | 1836 | /* |
@@ -2892,10 +2902,10 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
2892 | * corresponding CPU's preceding callbacks have been invoked. | 2902 | * corresponding CPU's preceding callbacks have been invoked. |
2893 | */ | 2903 | */ |
2894 | for_each_possible_cpu(cpu) { | 2904 | for_each_possible_cpu(cpu) { |
2895 | if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) | 2905 | if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) |
2896 | continue; | 2906 | continue; |
2897 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2907 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2898 | if (is_nocb_cpu(cpu)) { | 2908 | if (rcu_is_nocb_cpu(cpu)) { |
2899 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, | 2909 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, |
2900 | rsp->n_barrier_done); | 2910 | rsp->n_barrier_done); |
2901 | atomic_inc(&rsp->barrier_cpu_count); | 2911 | atomic_inc(&rsp->barrier_cpu_count); |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 14ee40795d6f..da77a8f57ff9 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -530,13 +530,13 @@ static int rcu_nocb_needs_gp(struct rcu_state *rsp); | |||
530 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); | 530 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); |
531 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); | 531 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); |
532 | static void rcu_init_one_nocb(struct rcu_node *rnp); | 532 | static void rcu_init_one_nocb(struct rcu_node *rnp); |
533 | static bool is_nocb_cpu(int cpu); | ||
534 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 533 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, |
535 | bool lazy); | 534 | bool lazy); |
536 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | 535 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, |
537 | struct rcu_data *rdp); | 536 | struct rcu_data *rdp); |
538 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | 537 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); |
539 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | 538 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); |
539 | static void rcu_kick_nohz_cpu(int cpu); | ||
540 | static bool init_nocb_callback_list(struct rcu_data *rdp); | 540 | static bool init_nocb_callback_list(struct rcu_data *rdp); |
541 | 541 | ||
542 | #endif /* #ifndef RCU_TREE_NONCORE */ | 542 | #endif /* #ifndef RCU_TREE_NONCORE */ |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index d084ae3f281c..71bd7337d0cc 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/gfp.h> | 28 | #include <linux/gfp.h> |
29 | #include <linux/oom.h> | 29 | #include <linux/oom.h> |
30 | #include <linux/smpboot.h> | 30 | #include <linux/smpboot.h> |
31 | #include <linux/tick.h> | ||
31 | 32 | ||
32 | #define RCU_KTHREAD_PRIO 1 | 33 | #define RCU_KTHREAD_PRIO 1 |
33 | 34 | ||
@@ -2052,7 +2053,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) | |||
2052 | } | 2053 | } |
2053 | 2054 | ||
2054 | /* Is the specified CPU a no-CBs CPU? */ | 2055 | /* Is the specified CPU a no-CBs CPU? */ |
2055 | static bool is_nocb_cpu(int cpu) | 2056 | bool rcu_is_nocb_cpu(int cpu) |
2056 | { | 2057 | { |
2057 | if (have_rcu_nocb_mask) | 2058 | if (have_rcu_nocb_mask) |
2058 | return cpumask_test_cpu(cpu, rcu_nocb_mask); | 2059 | return cpumask_test_cpu(cpu, rcu_nocb_mask); |
@@ -2110,7 +2111,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | |||
2110 | bool lazy) | 2111 | bool lazy) |
2111 | { | 2112 | { |
2112 | 2113 | ||
2113 | if (!is_nocb_cpu(rdp->cpu)) | 2114 | if (!rcu_is_nocb_cpu(rdp->cpu)) |
2114 | return 0; | 2115 | return 0; |
2115 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); | 2116 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); |
2116 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) | 2117 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) |
@@ -2134,7 +2135,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
2134 | long qll = rsp->qlen_lazy; | 2135 | long qll = rsp->qlen_lazy; |
2135 | 2136 | ||
2136 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ | 2137 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ |
2137 | if (!is_nocb_cpu(smp_processor_id())) | 2138 | if (!rcu_is_nocb_cpu(smp_processor_id())) |
2138 | return 0; | 2139 | return 0; |
2139 | rsp->qlen = 0; | 2140 | rsp->qlen = 0; |
2140 | rsp->qlen_lazy = 0; | 2141 | rsp->qlen_lazy = 0; |
@@ -2306,11 +2307,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) | |||
2306 | { | 2307 | { |
2307 | } | 2308 | } |
2308 | 2309 | ||
2309 | static bool is_nocb_cpu(int cpu) | ||
2310 | { | ||
2311 | return false; | ||
2312 | } | ||
2313 | |||
2314 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 2310 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, |
2315 | bool lazy) | 2311 | bool lazy) |
2316 | { | 2312 | { |
@@ -2337,3 +2333,20 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) | |||
2337 | } | 2333 | } |
2338 | 2334 | ||
2339 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | 2335 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ |
2336 | |||
2337 | /* | ||
2338 | * An adaptive-ticks CPU can potentially execute in kernel mode for an | ||
2339 | * arbitrarily long period of time with the scheduling-clock tick turned | ||
2340 | * off. RCU will be paying attention to this CPU because it is in the | ||
2341 | * kernel, but the CPU cannot be guaranteed to be executing the RCU state | ||
2342 | * machine because the scheduling-clock tick has been disabled. Therefore, | ||
2343 | * if an adaptive-ticks CPU is failing to respond to the current grace | ||
2344 | * period and has not been idle from an RCU perspective, kick it. | ||
2345 | */ | ||
2346 | static void rcu_kick_nohz_cpu(int cpu) | ||
2347 | { | ||
2348 | #ifdef CONFIG_NO_HZ_FULL | ||
2349 | if (tick_nohz_full_cpu(cpu)) | ||
2350 | smp_send_reschedule(cpu); | ||
2351 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | ||
2352 | } | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c70a8814a767..e94842d4400c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -544,7 +544,7 @@ void resched_cpu(int cpu) | |||
544 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 544 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
545 | } | 545 | } |
546 | 546 | ||
547 | #ifdef CONFIG_NO_HZ | 547 | #ifdef CONFIG_NO_HZ_COMMON |
548 | /* | 548 | /* |
549 | * In the semi idle case, use the nearest busy cpu for migrating timers | 549 | * In the semi idle case, use the nearest busy cpu for migrating timers |
550 | * from an idle cpu. This is good for power-savings. | 550 | * from an idle cpu. This is good for power-savings. |
@@ -582,7 +582,7 @@ unlock: | |||
582 | * account when the CPU goes back to idle and evaluates the timer | 582 | * account when the CPU goes back to idle and evaluates the timer |
583 | * wheel for the next timer event. | 583 | * wheel for the next timer event. |
584 | */ | 584 | */ |
585 | void wake_up_idle_cpu(int cpu) | 585 | static void wake_up_idle_cpu(int cpu) |
586 | { | 586 | { |
587 | struct rq *rq = cpu_rq(cpu); | 587 | struct rq *rq = cpu_rq(cpu); |
588 | 588 | ||
@@ -612,20 +612,56 @@ void wake_up_idle_cpu(int cpu) | |||
612 | smp_send_reschedule(cpu); | 612 | smp_send_reschedule(cpu); |
613 | } | 613 | } |
614 | 614 | ||
615 | static bool wake_up_full_nohz_cpu(int cpu) | ||
616 | { | ||
617 | if (tick_nohz_full_cpu(cpu)) { | ||
618 | if (cpu != smp_processor_id() || | ||
619 | tick_nohz_tick_stopped()) | ||
620 | smp_send_reschedule(cpu); | ||
621 | return true; | ||
622 | } | ||
623 | |||
624 | return false; | ||
625 | } | ||
626 | |||
627 | void wake_up_nohz_cpu(int cpu) | ||
628 | { | ||
629 | if (!wake_up_full_nohz_cpu(cpu)) | ||
630 | wake_up_idle_cpu(cpu); | ||
631 | } | ||
632 | |||
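add_timer_on() (see the kernel/timer.c hunk at the end of this patch) used to call wake_up_idle_cpu() when arming a timer on a remote CPU; with full dynticks the remote CPU may instead be running a task with its tick stopped, so the wake-up is split into wake_up_full_nohz_cpu() plus the old idle path behind the new wake_up_nohz_cpu() entry point. A minimal userspace sketch of the resulting decision, where the *_stub helpers merely stand in for tick_nohz_full_cpu(), tick_nohz_tick_stopped() and smp_processor_id(), and their return values are made up for illustration:

    /* Sketch of the wake_up_nohz_cpu() decision; not kernel code. */
    #include <stdbool.h>
    #include <stdio.h>

    static bool full_dynticks_stub(int cpu) { return cpu != 0; } /* CPUs 1.. are nohz_full */
    static bool tick_stopped_stub(void)     { return false; }    /* our own tick still runs */
    static int  this_cpu_stub(void)         { return 0; }

    static const char *wake_up_nohz_cpu_model(int cpu)
    {
            if (full_dynticks_stub(cpu)) {
                    /* Kick a full-dynticks CPU unless it is ourselves
                     * and our own tick has not been stopped yet. */
                    if (cpu != this_cpu_stub() || tick_stopped_stub())
                            return "reschedule IPI";
                    return "nothing to do";
            }
            return "idle wake-up path";
    }

    int main(void)
    {
            for (int cpu = 0; cpu < 3; cpu++)
                    printf("cpu%d: %s\n", cpu, wake_up_nohz_cpu_model(cpu));
            return 0;
    }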
615 | static inline bool got_nohz_idle_kick(void) | 633 | static inline bool got_nohz_idle_kick(void) |
616 | { | 634 | { |
617 | int cpu = smp_processor_id(); | 635 | int cpu = smp_processor_id(); |
618 | return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); | 636 | return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); |
619 | } | 637 | } |
620 | 638 | ||
621 | #else /* CONFIG_NO_HZ */ | 639 | #else /* CONFIG_NO_HZ_COMMON */ |
622 | 640 | ||
623 | static inline bool got_nohz_idle_kick(void) | 641 | static inline bool got_nohz_idle_kick(void) |
624 | { | 642 | { |
625 | return false; | 643 | return false; |
626 | } | 644 | } |
627 | 645 | ||
628 | #endif /* CONFIG_NO_HZ */ | 646 | #endif /* CONFIG_NO_HZ_COMMON */ |
647 | |||
648 | #ifdef CONFIG_NO_HZ_FULL | ||
649 | bool sched_can_stop_tick(void) | ||
650 | { | ||
651 | struct rq *rq; | ||
652 | |||
653 | rq = this_rq(); | ||
654 | |||
655 | /* Make sure rq->nr_running update is visible after the IPI */ | ||
656 | smp_rmb(); | ||
657 | |||
658 | /* More than one running task needs preemption */ | ||
659 | if (rq->nr_running > 1) | ||
660 | return false; | ||
661 | |||
662 | return true; | ||
663 | } | ||
664 | #endif /* CONFIG_NO_HZ_FULL */ | ||
629 | 665 | ||
630 | void sched_avg_update(struct rq *rq) | 666 | void sched_avg_update(struct rq *rq) |
631 | { | 667 | { |
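sched_can_stop_tick() lets a full-dynticks CPU stop its tick only while at most one task is runnable; with two or more, the periodic tick is needed for preemption. The smp_rmb() here pairs with the smp_wmb() that inc_nr_running() issues before sending the reschedule IPI (see the kernel/sched/sched.h hunk below), so the IPI handler is guaranteed to observe the updated nr_running. A userspace model of that publication pattern, using C11 fences in place of the kernel barriers (a simplified sketch, not kernel code):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int  nr_running  = 1;
    static atomic_bool ipi_pending = false;

    /* Producer: enqueue a second task, then "send the IPI". */
    static void enqueue_second_task(void)
    {
            atomic_store_explicit(&nr_running, 2, memory_order_relaxed);
            atomic_thread_fence(memory_order_release);          /* smp_wmb() */
            atomic_store_explicit(&ipi_pending, true, memory_order_relaxed);
    }

    /* Consumer: the "IPI handler" re-evaluates whether the tick may stop. */
    static bool can_stop_tick(void)
    {
            if (atomic_load_explicit(&ipi_pending, memory_order_relaxed)) {
                    atomic_thread_fence(memory_order_acquire);  /* smp_rmb() */
                    return atomic_load_explicit(&nr_running,
                                                memory_order_relaxed) <= 1;
            }
            return true;
    }

    int main(void)
    {
            enqueue_second_task();
            printf("tick can stop: %s\n", can_stop_tick() ? "yes" : "no");
            return 0;
    }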
@@ -1357,7 +1393,8 @@ static void sched_ttwu_pending(void) | |||
1357 | 1393 | ||
1358 | void scheduler_ipi(void) | 1394 | void scheduler_ipi(void) |
1359 | { | 1395 | { |
1360 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) | 1396 | if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() |
1397 | && !tick_nohz_full_cpu(smp_processor_id())) | ||
1361 | return; | 1398 | return; |
1362 | 1399 | ||
1363 | /* | 1400 | /* |
@@ -1374,6 +1411,7 @@ void scheduler_ipi(void) | |||
1374 | * somewhat pessimize the simple resched case. | 1411 | * somewhat pessimize the simple resched case. |
1375 | */ | 1412 | */ |
1376 | irq_enter(); | 1413 | irq_enter(); |
1414 | tick_nohz_full_check(); | ||
1377 | sched_ttwu_pending(); | 1415 | sched_ttwu_pending(); |
1378 | 1416 | ||
1379 | /* | 1417 | /* |
@@ -1855,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1855 | kprobe_flush_task(prev); | 1893 | kprobe_flush_task(prev); |
1856 | put_task_struct(prev); | 1894 | put_task_struct(prev); |
1857 | } | 1895 | } |
1896 | |||
1897 | tick_nohz_task_switch(current); | ||
1858 | } | 1898 | } |
1859 | 1899 | ||
1860 | #ifdef CONFIG_SMP | 1900 | #ifdef CONFIG_SMP |
@@ -2118,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
2118 | return load >> FSHIFT; | 2158 | return load >> FSHIFT; |
2119 | } | 2159 | } |
2120 | 2160 | ||
2121 | #ifdef CONFIG_NO_HZ | 2161 | #ifdef CONFIG_NO_HZ_COMMON |
2122 | /* | 2162 | /* |
2123 | * Handle NO_HZ for the global load-average. | 2163 | * Handle NO_HZ for the global load-average. |
2124 | * | 2164 | * |
@@ -2344,12 +2384,12 @@ static void calc_global_nohz(void) | |||
2344 | smp_wmb(); | 2384 | smp_wmb(); |
2345 | calc_load_idx++; | 2385 | calc_load_idx++; |
2346 | } | 2386 | } |
2347 | #else /* !CONFIG_NO_HZ */ | 2387 | #else /* !CONFIG_NO_HZ_COMMON */ |
2348 | 2388 | ||
2349 | static inline long calc_load_fold_idle(void) { return 0; } | 2389 | static inline long calc_load_fold_idle(void) { return 0; } |
2350 | static inline void calc_global_nohz(void) { } | 2390 | static inline void calc_global_nohz(void) { } |
2351 | 2391 | ||
2352 | #endif /* CONFIG_NO_HZ */ | 2392 | #endif /* CONFIG_NO_HZ_COMMON */ |
2353 | 2393 | ||
2354 | /* | 2394 | /* |
2355 | * calc_load - update the avenrun load estimates 10 ticks after the | 2395 | * calc_load - update the avenrun load estimates 10 ticks after the |
@@ -2509,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | |||
2509 | sched_avg_update(this_rq); | 2549 | sched_avg_update(this_rq); |
2510 | } | 2550 | } |
2511 | 2551 | ||
2512 | #ifdef CONFIG_NO_HZ | 2552 | #ifdef CONFIG_NO_HZ_COMMON |
2513 | /* | 2553 | /* |
2514 | * There is no sane way to deal with nohz on smp when using jiffies because the | 2554 | * There is no sane way to deal with nohz on smp when using jiffies because the |
2515 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | 2555 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading |
@@ -2569,7 +2609,7 @@ void update_cpu_load_nohz(void) | |||
2569 | } | 2609 | } |
2570 | raw_spin_unlock(&this_rq->lock); | 2610 | raw_spin_unlock(&this_rq->lock); |
2571 | } | 2611 | } |
2572 | #endif /* CONFIG_NO_HZ */ | 2612 | #endif /* CONFIG_NO_HZ_COMMON */ |
2573 | 2613 | ||
2574 | /* | 2614 | /* |
2575 | * Called from scheduler_tick() | 2615 | * Called from scheduler_tick() |
@@ -6950,7 +6990,7 @@ void __init sched_init(void) | |||
6950 | INIT_LIST_HEAD(&rq->cfs_tasks); | 6990 | INIT_LIST_HEAD(&rq->cfs_tasks); |
6951 | 6991 | ||
6952 | rq_attach_root(rq, &def_root_domain); | 6992 | rq_attach_root(rq, &def_root_domain); |
6953 | #ifdef CONFIG_NO_HZ | 6993 | #ifdef CONFIG_NO_HZ_COMMON |
6954 | rq->nohz_flags = 0; | 6994 | rq->nohz_flags = 0; |
6955 | #endif | 6995 | #endif |
6956 | #endif | 6996 | #endif |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8bf7081b1ec5..c61a614465c8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -5355,7 +5355,7 @@ out_unlock: | |||
5355 | return 0; | 5355 | return 0; |
5356 | } | 5356 | } |
5357 | 5357 | ||
5358 | #ifdef CONFIG_NO_HZ | 5358 | #ifdef CONFIG_NO_HZ_COMMON |
5359 | /* | 5359 | /* |
5360 | * idle load balancing details | 5360 | * idle load balancing details |
5361 | * - When one of the busy CPUs notice that there may be an idle rebalancing | 5361 | * - When one of the busy CPUs notice that there may be an idle rebalancing |
@@ -5572,9 +5572,9 @@ out: | |||
5572 | rq->next_balance = next_balance; | 5572 | rq->next_balance = next_balance; |
5573 | } | 5573 | } |
5574 | 5574 | ||
5575 | #ifdef CONFIG_NO_HZ | 5575 | #ifdef CONFIG_NO_HZ_COMMON |
5576 | /* | 5576 | /* |
5577 | * In CONFIG_NO_HZ case, the idle balance kickee will do the | 5577 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the |
5578 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 5578 | * rebalancing for all the cpus for whom scheduler ticks are stopped. |
5579 | */ | 5579 | */ |
5580 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | 5580 | static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) |
@@ -5717,7 +5717,7 @@ void trigger_load_balance(struct rq *rq, int cpu) | |||
5717 | if (time_after_eq(jiffies, rq->next_balance) && | 5717 | if (time_after_eq(jiffies, rq->next_balance) && |
5718 | likely(!on_null_domain(cpu))) | 5718 | likely(!on_null_domain(cpu))) |
5719 | raise_softirq(SCHED_SOFTIRQ); | 5719 | raise_softirq(SCHED_SOFTIRQ); |
5720 | #ifdef CONFIG_NO_HZ | 5720 | #ifdef CONFIG_NO_HZ_COMMON |
5721 | if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) | 5721 | if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) |
5722 | nohz_balancer_kick(cpu); | 5722 | nohz_balancer_kick(cpu); |
5723 | #endif | 5723 | #endif |
@@ -6187,7 +6187,7 @@ __init void init_sched_fair_class(void) | |||
6187 | #ifdef CONFIG_SMP | 6187 | #ifdef CONFIG_SMP |
6188 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); | 6188 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); |
6189 | 6189 | ||
6190 | #ifdef CONFIG_NO_HZ | 6190 | #ifdef CONFIG_NO_HZ_COMMON |
6191 | nohz.next_balance = jiffies; | 6191 | nohz.next_balance = jiffies; |
6192 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); | 6192 | zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); |
6193 | cpu_notifier(sched_ilb_notifier, 0); | 6193 | cpu_notifier(sched_ilb_notifier, 0); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4c225c4c7111..24dc29897749 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/mutex.h> | 5 | #include <linux/mutex.h> |
6 | #include <linux/spinlock.h> | 6 | #include <linux/spinlock.h> |
7 | #include <linux/stop_machine.h> | 7 | #include <linux/stop_machine.h> |
8 | #include <linux/tick.h> | ||
8 | 9 | ||
9 | #include "cpupri.h" | 10 | #include "cpupri.h" |
10 | #include "cpuacct.h" | 11 | #include "cpuacct.h" |
@@ -405,7 +406,7 @@ struct rq { | |||
405 | #define CPU_LOAD_IDX_MAX 5 | 406 | #define CPU_LOAD_IDX_MAX 5 |
406 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 407 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
407 | unsigned long last_load_update_tick; | 408 | unsigned long last_load_update_tick; |
408 | #ifdef CONFIG_NO_HZ | 409 | #ifdef CONFIG_NO_HZ_COMMON |
409 | u64 nohz_stamp; | 410 | u64 nohz_stamp; |
410 | unsigned long nohz_flags; | 411 | unsigned long nohz_flags; |
411 | #endif | 412 | #endif |
@@ -1072,6 +1073,16 @@ static inline u64 steal_ticks(u64 steal) | |||
1072 | static inline void inc_nr_running(struct rq *rq) | 1073 | static inline void inc_nr_running(struct rq *rq) |
1073 | { | 1074 | { |
1074 | rq->nr_running++; | 1075 | rq->nr_running++; |
1076 | |||
1077 | #ifdef CONFIG_NO_HZ_FULL | ||
1078 | if (rq->nr_running == 2) { | ||
1079 | if (tick_nohz_full_cpu(rq->cpu)) { | ||
1080 | /* Order rq->nr_running write against the IPI */ | ||
1081 | smp_wmb(); | ||
1082 | smp_send_reschedule(rq->cpu); | ||
1083 | } | ||
1084 | } | ||
1085 | #endif | ||
1075 | } | 1086 | } |
1076 | 1087 | ||
1077 | static inline void dec_nr_running(struct rq *rq) | 1088 | static inline void dec_nr_running(struct rq *rq) |
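Note that the kick above is edge-triggered: only the transition from one runnable task to two sends the IPI, so a CPU whose tick is already known to be needed is not re-interrupted for every further enqueue. A toy illustration of that idea (not kernel code):

    #include <stdio.h>

    static int nr_running = 1;

    static void kick_cpu(int cpu)
    {
            printf("kick cpu%d (nr_running=%d)\n", cpu, nr_running);
    }

    static void inc_nr_running_model(int cpu)
    {
            nr_running++;
            if (nr_running == 2)    /* only the 1 -> 2 transition kicks */
                    kick_cpu(cpu);
    }

    int main(void)
    {
            for (int i = 0; i < 3; i++)
                    inc_nr_running_model(1);   /* only the first call kicks */
            return 0;
    }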
@@ -1299,7 +1310,7 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); | |||
1299 | 1310 | ||
1300 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); | 1311 | extern void account_cfs_bandwidth_used(int enabled, int was_enabled); |
1301 | 1312 | ||
1302 | #ifdef CONFIG_NO_HZ | 1313 | #ifdef CONFIG_NO_HZ_COMMON |
1303 | enum rq_nohz_flag_bits { | 1314 | enum rq_nohz_flag_bits { |
1304 | NOHZ_TICK_STOPPED, | 1315 | NOHZ_TICK_STOPPED, |
1305 | NOHZ_BALANCE_KICK, | 1316 | NOHZ_BALANCE_KICK, |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 14d7758074aa..51a09d56e78b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -329,6 +329,19 @@ static inline void invoke_softirq(void) | |||
329 | wakeup_softirqd(); | 329 | wakeup_softirqd(); |
330 | } | 330 | } |
331 | 331 | ||
332 | static inline void tick_irq_exit(void) | ||
333 | { | ||
334 | #ifdef CONFIG_NO_HZ_COMMON | ||
335 | int cpu = smp_processor_id(); | ||
336 | |||
337 | /* Make sure that timer wheel updates are propagated */ | ||
338 | if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { | ||
339 | if (!in_interrupt()) | ||
340 | tick_nohz_irq_exit(); | ||
341 | } | ||
342 | #endif | ||
343 | } | ||
344 | |||
332 | /* | 345 | /* |
333 | * Exit an interrupt context. Process softirqs if needed and possible: | 346 | * Exit an interrupt context. Process softirqs if needed and possible: |
334 | */ | 347 | */ |
@@ -346,11 +359,7 @@ void irq_exit(void) | |||
346 | if (!in_interrupt() && local_softirq_pending()) | 359 | if (!in_interrupt() && local_softirq_pending()) |
347 | invoke_softirq(); | 360 | invoke_softirq(); |
348 | 361 | ||
349 | #ifdef CONFIG_NO_HZ | 362 | tick_irq_exit(); |
350 | /* Make sure that timer wheel updates are propagated */ | ||
351 | if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) | ||
352 | tick_nohz_irq_exit(); | ||
353 | #endif | ||
354 | rcu_irq_exit(); | 363 | rcu_irq_exit(); |
355 | } | 364 | } |
356 | 365 | ||
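The NO_HZ-specific work on interrupt exit is factored out of irq_exit() into tick_irq_exit(), and besides idle CPUs it now also covers full-dynticks CPUs, so a busy tickless CPU re-evaluates its tick whenever an interrupt fires on it. The condition reduces to roughly the following predicate (a simplified restatement for illustration, not the kernel function itself):

    #include <stdbool.h>
    #include <stdio.h>

    /* Re-evaluate the tick on irq exit when the CPU is idle with nothing
     * to do, or when it is a full-dynticks CPU -- never from a nested
     * interrupt. */
    static bool reevaluate_tick(bool cpu_idle, bool need_resched,
                                bool nohz_full_cpu, bool nested_irq)
    {
            if (nested_irq)
                    return false;
            return (cpu_idle && !need_resched) || nohz_full_cpu;
    }

    int main(void)
    {
            printf("idle, nothing pending: %d\n", reevaluate_tick(true,  false, false, false));
            printf("busy nohz_full CPU   : %d\n", reevaluate_tick(false, false, true,  false));
            printf("busy ordinary CPU    : %d\n", reevaluate_tick(false, false, false, false));
            return 0;
    }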
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 24510d84efd7..a2ddd650cb92 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
@@ -64,20 +64,89 @@ config GENERIC_CMOS_UPDATE | |||
64 | if GENERIC_CLOCKEVENTS | 64 | if GENERIC_CLOCKEVENTS |
65 | menu "Timers subsystem" | 65 | menu "Timers subsystem" |
66 | 66 | ||
67 | # Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is | 67 | # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is |
68 | # only related to the tick functionality. Oneshot clockevent devices | 68 | # only related to the tick functionality. Oneshot clockevent devices |
69 | # are supported independ of this. | 69 | # are supported independ of this. |
70 | config TICK_ONESHOT | 70 | config TICK_ONESHOT |
71 | bool | 71 | bool |
72 | 72 | ||
73 | config NO_HZ | 73 | config NO_HZ_COMMON |
74 | bool "Tickless System (Dynamic Ticks)" | 74 | bool |
75 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | 75 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS |
76 | select TICK_ONESHOT | 76 | select TICK_ONESHOT |
77 | |||
78 | choice | ||
79 | prompt "Timer tick handling" | ||
80 | default NO_HZ_IDLE if NO_HZ | ||
81 | |||
82 | config HZ_PERIODIC | ||
83 | bool "Periodic timer ticks (constant rate, no dynticks)" | ||
84 | help | ||
85 | This option keeps the tick running periodically at a constant | ||
86 | rate, even when the CPU doesn't need it. | ||
87 | |||
88 | config NO_HZ_IDLE | ||
89 | bool "Idle dynticks system (tickless idle)" | ||
90 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | ||
91 | select NO_HZ_COMMON | ||
92 | help | ||
93 | This option enables a tickless idle system: timer interrupts | ||
94 | will only trigger on an as-needed basis when the system is idle. | ||
95 | This is usually interesting for energy saving. | ||
96 | |||
97 | Most of the time you want to say Y here. | ||
98 | |||
99 | config NO_HZ_FULL | ||
100 | bool "Full dynticks system (tickless)" | ||
101 | # NO_HZ_COMMON dependency | ||
102 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | ||
103 | # We need at least one periodic CPU for timekeeping | ||
104 | depends on SMP | ||
105 | # RCU_USER_QS dependency | ||
106 | depends on HAVE_CONTEXT_TRACKING | ||
107 | # VIRT_CPU_ACCOUNTING_GEN dependency | ||
108 | depends on 64BIT | ||
109 | select NO_HZ_COMMON | ||
110 | select RCU_USER_QS | ||
111 | select RCU_NOCB_CPU | ||
112 | select RCU_NOCB_CPU_ALL | ||
113 | select VIRT_CPU_ACCOUNTING_GEN | ||
114 | select CONTEXT_TRACKING_FORCE | ||
115 | select IRQ_WORK | ||
116 | help | ||
117 | Adaptively try to shut down the tick whenever possible, even when | ||
118 | the CPU is running tasks. Typically this requires running a single | ||
119 | task on the CPU. Chances for running tickless are maximized when | ||
120 | the task mostly runs in userspace and has little kernel activity. | ||
121 | |||
122 | You need to fill in the nohz_full boot parameter with the | ||
123 | desired range of dynticks CPUs. | ||
124 | |||
125 | This is implemented at the expense of some overhead in user <-> kernel | ||
126 | transitions: syscalls, exceptions and interrupts; this overhead | ||
127 | remains even when full dynticks is dynamically off. | ||
128 | |||
129 | Say N. | ||
130 | |||
131 | endchoice | ||
132 | |||
133 | config NO_HZ_FULL_ALL | ||
134 | bool "Full dynticks system on all CPUs by default" | ||
135 | depends on NO_HZ_FULL | ||
136 | help | ||
137 | If the user doesn't pass the nohz_full boot option to | ||
138 | define the range of full dynticks CPUs, consider that all | ||
139 | CPUs in the system are full dynticks by default. | ||
140 | Note the boot CPU will still be kept outside the range to | ||
141 | handle the timekeeping duty. | ||
142 | |||
143 | config NO_HZ | ||
144 | bool "Old Idle dynticks config" | ||
145 | depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS | ||
77 | help | 146 | help |
78 | This option enables a tickless system: timer interrupts will | 147 | This is the old config entry that enables dynticks idle. |
79 | only trigger on an as-needed basis both when the system is | 148 | We keep it around for a little while to enforce backward |
80 | busy and when the system is idle. | 149 | compatibility with older config files. |
81 | 150 | ||
82 | config HIGH_RES_TIMERS | 151 | config HIGH_RES_TIMERS |
83 | bool "High Resolution Timer Support" | 152 | bool "High Resolution Timer Support" |
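The old NO_HZ bool becomes a three-way choice: HZ_PERIODIC keeps a constant tick, NO_HZ_IDLE is the previous tickless-idle behaviour, and NO_HZ_FULL layers the adaptive-tick mode on top of it, while the legacy NO_HZ entry survives only so that old .config files still resolve. For illustration only (the CPU numbers are arbitrary and assume an 8-CPU machine), a full-dynticks setup could look like this:

    # .config fragment; NO_HZ_FULL selects NO_HZ_COMMON, RCU_USER_QS,
    # RCU_NOCB_CPU, RCU_NOCB_CPU_ALL, VIRT_CPU_ACCOUNTING_GEN, ...
    CONFIG_NO_HZ_FULL=y
    # CONFIG_NO_HZ_FULL_ALL is not set

    # kernel command line: CPUs 1-7 run tickless whenever possible,
    # CPU 0 keeps its tick and the timekeeping duty
    nohz_full=1-7

Because RCU_NOCB_CPU_ALL is selected above, the nohz_full CPUs automatically satisfy the RCU no-callbacks requirement that tick_nohz_init() checks further down.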
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 7f32fe0e52cd..40c10502c9e9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -574,7 +574,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | |||
574 | bc->event_handler = tick_handle_oneshot_broadcast; | 574 | bc->event_handler = tick_handle_oneshot_broadcast; |
575 | 575 | ||
576 | /* Take the do_timer update */ | 576 | /* Take the do_timer update */ |
577 | tick_do_timer_cpu = cpu; | 577 | if (!tick_nohz_full_cpu(cpu)) |
578 | tick_do_timer_cpu = cpu; | ||
578 | 579 | ||
579 | /* | 580 | /* |
580 | * We must be careful here. There might be other CPUs | 581 | * We must be careful here. There might be other CPUs |
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b1600a6973f4..83f2bd967161 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c | |||
@@ -163,7 +163,10 @@ static void tick_setup_device(struct tick_device *td, | |||
163 | * this cpu: | 163 | * this cpu: |
164 | */ | 164 | */ |
165 | if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { | 165 | if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { |
166 | tick_do_timer_cpu = cpu; | 166 | if (!tick_nohz_full_cpu(cpu)) |
167 | tick_do_timer_cpu = cpu; | ||
168 | else | ||
169 | tick_do_timer_cpu = TICK_DO_TIMER_NONE; | ||
167 | tick_next_period = ktime_get(); | 170 | tick_next_period = ktime_get(); |
168 | tick_period = ktime_set(0, NSEC_PER_SEC / HZ); | 171 | tick_period = ktime_set(0, NSEC_PER_SEC / HZ); |
169 | } | 172 | } |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a19a39952c1b..1c9f53b2ddb7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -21,11 +21,15 @@ | |||
21 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | #include <linux/irq_work.h> | 23 | #include <linux/irq_work.h> |
24 | #include <linux/posix-timers.h> | ||
25 | #include <linux/perf_event.h> | ||
24 | 26 | ||
25 | #include <asm/irq_regs.h> | 27 | #include <asm/irq_regs.h> |
26 | 28 | ||
27 | #include "tick-internal.h" | 29 | #include "tick-internal.h" |
28 | 30 | ||
31 | #include <trace/events/timer.h> | ||
32 | |||
29 | /* | 33 | /* |
30 | * Per cpu nohz control structure | 34 | * Per cpu nohz control structure |
31 | */ | 35 | */ |
@@ -104,7 +108,7 @@ static void tick_sched_do_timer(ktime_t now) | |||
104 | { | 108 | { |
105 | int cpu = smp_processor_id(); | 109 | int cpu = smp_processor_id(); |
106 | 110 | ||
107 | #ifdef CONFIG_NO_HZ | 111 | #ifdef CONFIG_NO_HZ_COMMON |
108 | /* | 112 | /* |
109 | * Check if the do_timer duty was dropped. We don't care about | 113 | * Check if the do_timer duty was dropped. We don't care about |
110 | * concurrency: This happens only when the cpu in charge went | 114 | * concurrency: This happens only when the cpu in charge went |
@@ -112,7 +116,8 @@ static void tick_sched_do_timer(ktime_t now) | |||
112 | * this duty, then the jiffies update is still serialized by | 116 | * this duty, then the jiffies update is still serialized by |
113 | * jiffies_lock. | 117 | * jiffies_lock. |
114 | */ | 118 | */ |
115 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) | 119 | if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) |
120 | && !tick_nohz_full_cpu(cpu)) | ||
116 | tick_do_timer_cpu = cpu; | 121 | tick_do_timer_cpu = cpu; |
117 | #endif | 122 | #endif |
118 | 123 | ||
@@ -123,7 +128,7 @@ static void tick_sched_do_timer(ktime_t now) | |||
123 | 128 | ||
124 | static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | 129 | static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) |
125 | { | 130 | { |
126 | #ifdef CONFIG_NO_HZ | 131 | #ifdef CONFIG_NO_HZ_COMMON |
127 | /* | 132 | /* |
128 | * When we are idle and the tick is stopped, we have to touch | 133 | * When we are idle and the tick is stopped, we have to touch |
129 | * the watchdog as we might not schedule for a really long | 134 | * the watchdog as we might not schedule for a really long |
@@ -142,10 +147,226 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) | |||
142 | profile_tick(CPU_PROFILING); | 147 | profile_tick(CPU_PROFILING); |
143 | } | 148 | } |
144 | 149 | ||
150 | #ifdef CONFIG_NO_HZ_FULL | ||
151 | static cpumask_var_t nohz_full_mask; | ||
152 | bool have_nohz_full_mask; | ||
153 | |||
154 | static bool can_stop_full_tick(void) | ||
155 | { | ||
156 | WARN_ON_ONCE(!irqs_disabled()); | ||
157 | |||
158 | if (!sched_can_stop_tick()) { | ||
159 | trace_tick_stop(0, "more than 1 task in runqueue\n"); | ||
160 | return false; | ||
161 | } | ||
162 | |||
163 | if (!posix_cpu_timers_can_stop_tick(current)) { | ||
164 | trace_tick_stop(0, "posix timers running\n"); | ||
165 | return false; | ||
166 | } | ||
167 | |||
168 | if (!perf_event_can_stop_tick()) { | ||
169 | trace_tick_stop(0, "perf events running\n"); | ||
170 | return false; | ||
171 | } | ||
172 | |||
173 | /* sched_clock_tick() needs us? */ | ||
174 | #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK | ||
175 | /* | ||
176 | * TODO: kick full dynticks CPUs when | ||
177 | * sched_clock_stable is set. | ||
178 | */ | ||
179 | if (!sched_clock_stable) { | ||
180 | trace_tick_stop(0, "unstable sched clock\n"); | ||
181 | return false; | ||
182 | } | ||
183 | #endif | ||
184 | |||
185 | return true; | ||
186 | } | ||
187 | |||
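can_stop_full_tick() is the veto list for a busy CPU: more than one runnable task, armed posix CPU timers, perf events that need the tick, or an unstable sched clock each keep the tick running, and every refusal is reported through the new tick_stop tracepoint. A compact restatement of this "first blocking dependency wins" structure, where the stub predicates and their return values are made up purely for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    struct tick_dependency {
            const char *reason;
            bool (*blocked)(void);
    };

    static bool more_than_one_task(void)   { return false; }
    static bool posix_cpu_timers(void)     { return false; }
    static bool perf_events(void)          { return true;  }
    static bool unstable_sched_clock(void) { return false; }

    static const struct tick_dependency deps[] = {
            { "more than 1 task in runqueue", more_than_one_task },
            { "posix timers running",         posix_cpu_timers },
            { "perf events running",          perf_events },
            { "unstable sched clock",         unstable_sched_clock },
    };

    int main(void)
    {
            for (unsigned int i = 0; i < sizeof(deps) / sizeof(deps[0]); i++) {
                    if (deps[i].blocked()) {
                            printf("tick kept: %s\n", deps[i].reason);
                            return 0;
                    }
            }
            printf("tick can be stopped\n");
            return 0;
    }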
188 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); | ||
189 | |||
190 | /* | ||
191 | * Re-evaluate the need for the tick on the current CPU | ||
192 | * and restart it if necessary. | ||
193 | */ | ||
194 | void tick_nohz_full_check(void) | ||
195 | { | ||
196 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | ||
197 | |||
198 | if (tick_nohz_full_cpu(smp_processor_id())) { | ||
199 | if (ts->tick_stopped && !is_idle_task(current)) { | ||
200 | if (!can_stop_full_tick()) | ||
201 | tick_nohz_restart_sched_tick(ts, ktime_get()); | ||
202 | } | ||
203 | } | ||
204 | } | ||
205 | |||
206 | static void nohz_full_kick_work_func(struct irq_work *work) | ||
207 | { | ||
208 | tick_nohz_full_check(); | ||
209 | } | ||
210 | |||
211 | static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { | ||
212 | .func = nohz_full_kick_work_func, | ||
213 | }; | ||
214 | |||
215 | /* | ||
216 | * Kick the current CPU if it's full dynticks in order to force it to | ||
217 | * re-evaluate its dependency on the tick and restart it if necessary. | ||
218 | */ | ||
219 | void tick_nohz_full_kick(void) | ||
220 | { | ||
221 | if (tick_nohz_full_cpu(smp_processor_id())) | ||
222 | irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); | ||
223 | } | ||
224 | |||
225 | static void nohz_full_kick_ipi(void *info) | ||
226 | { | ||
227 | tick_nohz_full_check(); | ||
228 | } | ||
229 | |||
230 | /* | ||
231 | * Kick all full dynticks CPUs in order to force these to re-evaluate | ||
232 | * their dependency on the tick and restart it if necessary. | ||
233 | */ | ||
234 | void tick_nohz_full_kick_all(void) | ||
235 | { | ||
236 | if (!have_nohz_full_mask) | ||
237 | return; | ||
238 | |||
239 | preempt_disable(); | ||
240 | smp_call_function_many(nohz_full_mask, | ||
241 | nohz_full_kick_ipi, NULL, false); | ||
242 | preempt_enable(); | ||
243 | } | ||
244 | |||
245 | /* | ||
246 | * Re-evaluate the need for the tick as we switch the current task. | ||
247 | * It might need the tick due to per task/process properties: | ||
248 | * perf events, posix cpu timers, ... | ||
249 | */ | ||
250 | void tick_nohz_task_switch(struct task_struct *tsk) | ||
251 | { | ||
252 | unsigned long flags; | ||
253 | |||
254 | local_irq_save(flags); | ||
255 | |||
256 | if (!tick_nohz_full_cpu(smp_processor_id())) | ||
257 | goto out; | ||
258 | |||
259 | if (tick_nohz_tick_stopped() && !can_stop_full_tick()) | ||
260 | tick_nohz_full_kick(); | ||
261 | |||
262 | out: | ||
263 | local_irq_restore(flags); | ||
264 | } | ||
265 | |||
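The tick's per-task dependencies (posix CPU timers, perf events) can change across a context switch, which is why finish_task_switch() in the kernel/sched/core.c hunk above now calls tick_nohz_task_switch(): if the tick is already stopped but the incoming task re-introduces a dependency, the CPU kicks itself through tick_nohz_full_kick() and the tick is restarted from irq_work context.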
266 | int tick_nohz_full_cpu(int cpu) | ||
267 | { | ||
268 | if (!have_nohz_full_mask) | ||
269 | return 0; | ||
270 | |||
271 | return cpumask_test_cpu(cpu, nohz_full_mask); | ||
272 | } | ||
273 | |||
274 | /* Parse the boot-time nohz CPU list from the kernel parameters. */ | ||
275 | static int __init tick_nohz_full_setup(char *str) | ||
276 | { | ||
277 | int cpu; | ||
278 | |||
279 | alloc_bootmem_cpumask_var(&nohz_full_mask); | ||
280 | if (cpulist_parse(str, nohz_full_mask) < 0) { | ||
281 | pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); | ||
282 | return 1; | ||
283 | } | ||
284 | |||
285 | cpu = smp_processor_id(); | ||
286 | if (cpumask_test_cpu(cpu, nohz_full_mask)) { | ||
287 | pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); | ||
288 | cpumask_clear_cpu(cpu, nohz_full_mask); | ||
289 | } | ||
290 | have_nohz_full_mask = true; | ||
291 | |||
292 | return 1; | ||
293 | } | ||
294 | __setup("nohz_full=", tick_nohz_full_setup); | ||
295 | |||
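nohz_full= takes an ordinary cpulist (comma-separated CPUs and ranges), and the CPU parsing it during early boot, normally CPU 0, is always removed from the mask so that one CPU keeps ticking for timekeeping. Purely illustrative examples of the syntax:

    nohz_full=1-3,5     # CPUs 1, 2, 3 and 5 become full dynticks
    nohz_full=0-7       # the boot CPU is cleared again with a warning;
                        # CPUs 1-7 remain in the mask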
296 | static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, | ||
297 | unsigned long action, | ||
298 | void *hcpu) | ||
299 | { | ||
300 | unsigned int cpu = (unsigned long)hcpu; | ||
301 | |||
302 | switch (action & ~CPU_TASKS_FROZEN) { | ||
303 | case CPU_DOWN_PREPARE: | ||
304 | /* | ||
305 | * If we handle the timekeeping duty for full dynticks CPUs, | ||
306 | * we can't safely shut down that CPU. | ||
307 | */ | ||
308 | if (have_nohz_full_mask && tick_do_timer_cpu == cpu) | ||
309 | return -EINVAL; | ||
310 | break; | ||
311 | } | ||
312 | return NOTIFY_OK; | ||
313 | } | ||
314 | |||
315 | /* | ||
316 | * The worst-case string length occurs when the CPU ranges come in | ||
317 | * chunks separated by steps of two: 0,2,4,6,... | ||
318 | * This needs NR_CPUS + sizeof('\0') bytes. | ||
319 | */ | ||
320 | static char __initdata nohz_full_buf[NR_CPUS + 1]; | ||
321 | |||
322 | static int tick_nohz_init_all(void) | ||
323 | { | ||
324 | int err = -1; | ||
325 | |||
326 | #ifdef CONFIG_NO_HZ_FULL_ALL | ||
327 | if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { | ||
328 | pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); | ||
329 | return err; | ||
330 | } | ||
331 | err = 0; | ||
332 | cpumask_setall(nohz_full_mask); | ||
333 | cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); | ||
334 | have_nohz_full_mask = true; | ||
335 | #endif | ||
336 | return err; | ||
337 | } | ||
338 | |||
339 | void __init tick_nohz_init(void) | ||
340 | { | ||
341 | int cpu; | ||
342 | |||
343 | if (!have_nohz_full_mask) { | ||
344 | if (tick_nohz_init_all() < 0) | ||
345 | return; | ||
346 | } | ||
347 | |||
348 | cpu_notifier(tick_nohz_cpu_down_callback, 0); | ||
349 | |||
350 | /* Make sure full dynticks CPUs are also RCU nocbs */ | ||
351 | for_each_cpu(cpu, nohz_full_mask) { | ||
352 | if (!rcu_is_nocb_cpu(cpu)) { | ||
353 | pr_warning("NO_HZ: CPU %d is not RCU nocb: " | ||
354 | "cleared from nohz_full range", cpu); | ||
355 | cpumask_clear_cpu(cpu, nohz_full_mask); | ||
356 | } | ||
357 | } | ||
358 | |||
359 | cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); | ||
360 | pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); | ||
361 | } | ||
362 | #else | ||
363 | #define have_nohz_full_mask (0) | ||
364 | #endif | ||
365 | |||
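tick_nohz_init() completes the setup: with NO_HZ_FULL_ALL the mask is built as every CPU except the boot CPU, a CPU-hotplug notifier refuses to offline whichever CPU currently carries the timekeeping duty, and every full-dynticks CPU is verified to be an RCU no-callbacks CPU, since otherwise its RCU callback processing would still depend on the tick; offenders are dropped from the mask with a warning.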
145 | /* | 366 | /* |
146 | * NOHZ - aka dynamic tick functionality | 367 | * NOHZ - aka dynamic tick functionality |
147 | */ | 368 | */ |
148 | #ifdef CONFIG_NO_HZ | 369 | #ifdef CONFIG_NO_HZ_COMMON |
149 | /* | 370 | /* |
150 | * NO HZ enabled ? | 371 | * NO HZ enabled ? |
151 | */ | 372 | */ |
@@ -345,11 +566,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
345 | delta_jiffies = rcu_delta_jiffies; | 566 | delta_jiffies = rcu_delta_jiffies; |
346 | } | 567 | } |
347 | } | 568 | } |
569 | |||
348 | /* | 570 | /* |
349 | * Do not stop the tick, if we are only one off | 571 | * Do not stop the tick, if we are only one off (or less) |
350 | * or if the cpu is required for rcu | 572 | * or if the cpu is required for RCU: |
351 | */ | 573 | */ |
352 | if (!ts->tick_stopped && delta_jiffies == 1) | 574 | if (!ts->tick_stopped && delta_jiffies <= 1) |
353 | goto out; | 575 | goto out; |
354 | 576 | ||
355 | /* Schedule the tick, if we are at least one jiffie off */ | 577 | /* Schedule the tick, if we are at least one jiffie off */ |
@@ -421,6 +643,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
421 | 643 | ||
422 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); | 644 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
423 | ts->tick_stopped = 1; | 645 | ts->tick_stopped = 1; |
646 | trace_tick_stop(1, " "); | ||
424 | } | 647 | } |
425 | 648 | ||
426 | /* | 649 | /* |
@@ -457,6 +680,24 @@ out: | |||
457 | return ret; | 680 | return ret; |
458 | } | 681 | } |
459 | 682 | ||
683 | static void tick_nohz_full_stop_tick(struct tick_sched *ts) | ||
684 | { | ||
685 | #ifdef CONFIG_NO_HZ_FULL | ||
686 | int cpu = smp_processor_id(); | ||
687 | |||
688 | if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) | ||
689 | return; | ||
690 | |||
691 | if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) | ||
692 | return; | ||
693 | |||
694 | if (!can_stop_full_tick()) | ||
695 | return; | ||
696 | |||
697 | tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); | ||
698 | #endif | ||
699 | } | ||
700 | |||
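tick_nohz_full_stop_tick() is the counterpart of the idle path for busy CPUs: on interrupt exit it stops (or re-programs) the tick of a full-dynticks CPU, but only if the CPU is in the nohz_full set, is not running the idle task (the existing idle path handles that case), has its nohz machinery active or its tick already stopped, and passes can_stop_full_tick(); the actual work is then delegated to the existing tick_nohz_stop_sched_tick().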
460 | static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | 701 | static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) |
461 | { | 702 | { |
462 | /* | 703 | /* |
@@ -489,6 +730,21 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | |||
489 | return false; | 730 | return false; |
490 | } | 731 | } |
491 | 732 | ||
733 | if (have_nohz_full_mask) { | ||
734 | /* | ||
735 | * Keep the tick alive to guarantee timekeeping progression | ||
736 | * if there are full dynticks CPUs around | ||
737 | */ | ||
738 | if (tick_do_timer_cpu == cpu) | ||
739 | return false; | ||
740 | /* | ||
741 | * Boot safety: make sure the timekeeping duty has been | ||
742 | * assigned before entering dyntick-idle mode, | ||
743 | */ | ||
744 | if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) | ||
745 | return false; | ||
746 | } | ||
747 | |||
492 | return true; | 748 | return true; |
493 | } | 749 | } |
494 | 750 | ||
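Once full-dynticks CPUs exist, even the idle path is constrained: the CPU currently holding the timekeeping duty must keep its tick so that jiffies and the timekeeping keep advancing for the tickless CPUs, and no CPU may enter dyntick-idle while the duty is still unassigned, since tick_setup_device() above no longer hands it to a nohz_full CPU and may leave tick_do_timer_cpu at TICK_DO_TIMER_NONE early during boot.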
@@ -568,12 +824,13 @@ void tick_nohz_irq_exit(void) | |||
568 | { | 824 | { |
569 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); | 825 | struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); |
570 | 826 | ||
571 | if (!ts->inidle) | 827 | if (ts->inidle) { |
572 | return; | 828 | /* Cancel the timer because CPU already waken up from the C-states*/ |
573 | 829 | menu_hrtimer_cancel(); | |
574 | /* Cancel the timer because CPU already waken up from the C-states*/ | 830 | __tick_nohz_idle_enter(ts); |
575 | menu_hrtimer_cancel(); | 831 | } else { |
576 | __tick_nohz_idle_enter(ts); | 832 | tick_nohz_full_stop_tick(ts); |
833 | } | ||
577 | } | 834 | } |
578 | 835 | ||
579 | /** | 836 | /** |
@@ -802,7 +1059,7 @@ static inline void tick_check_nohz(int cpu) | |||
802 | static inline void tick_nohz_switch_to_nohz(void) { } | 1059 | static inline void tick_nohz_switch_to_nohz(void) { } |
803 | static inline void tick_check_nohz(int cpu) { } | 1060 | static inline void tick_check_nohz(int cpu) { } |
804 | 1061 | ||
805 | #endif /* NO_HZ */ | 1062 | #endif /* CONFIG_NO_HZ_COMMON */ |
806 | 1063 | ||
807 | /* | 1064 | /* |
808 | * Called from irq_enter to notify about the possible interruption of idle() | 1065 | * Called from irq_enter to notify about the possible interruption of idle() |
@@ -887,14 +1144,14 @@ void tick_setup_sched_timer(void) | |||
887 | now = ktime_get(); | 1144 | now = ktime_get(); |
888 | } | 1145 | } |
889 | 1146 | ||
890 | #ifdef CONFIG_NO_HZ | 1147 | #ifdef CONFIG_NO_HZ_COMMON |
891 | if (tick_nohz_enabled) | 1148 | if (tick_nohz_enabled) |
892 | ts->nohz_mode = NOHZ_MODE_HIGHRES; | 1149 | ts->nohz_mode = NOHZ_MODE_HIGHRES; |
893 | #endif | 1150 | #endif |
894 | } | 1151 | } |
895 | #endif /* HIGH_RES_TIMERS */ | 1152 | #endif /* HIGH_RES_TIMERS */ |
896 | 1153 | ||
897 | #if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS | 1154 | #if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS |
898 | void tick_cancel_sched_timer(int cpu) | 1155 | void tick_cancel_sched_timer(int cpu) |
899 | { | 1156 | { |
900 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 1157 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
diff --git a/kernel/timer.c b/kernel/timer.c index dbf7a78a1ef1..1b7489fdea41 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -738,7 +738,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, | |||
738 | 738 | ||
739 | cpu = smp_processor_id(); | 739 | cpu = smp_processor_id(); |
740 | 740 | ||
741 | #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) | 741 | #if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) |
742 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) | 742 | if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) |
743 | cpu = get_nohz_timer_target(); | 743 | cpu = get_nohz_timer_target(); |
744 | #endif | 744 | #endif |
@@ -930,14 +930,14 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
930 | debug_activate(timer, timer->expires); | 930 | debug_activate(timer, timer->expires); |
931 | internal_add_timer(base, timer); | 931 | internal_add_timer(base, timer); |
932 | /* | 932 | /* |
933 | * Check whether the other CPU is idle and needs to be | 933 | * Check whether the other CPU is in dynticks mode and needs |
934 | * triggered to reevaluate the timer wheel when nohz is | 934 | * to be triggered to reevaluate the timer wheel. |
935 | * active. We are protected against the other CPU fiddling | 935 | * We are protected against the other CPU fiddling |
936 | * with the timer by holding the timer base lock. This also | 936 | * with the timer by holding the timer base lock. This also |
937 | * makes sure that a CPU on the way to idle can not evaluate | 937 | * makes sure that a CPU on the way to stop its tick can not |
938 | * the timer wheel. | 938 | * evaluate the timer wheel. |
939 | */ | 939 | */ |
940 | wake_up_idle_cpu(cpu); | 940 | wake_up_nohz_cpu(cpu); |
941 | spin_unlock_irqrestore(&base->lock, flags); | 941 | spin_unlock_irqrestore(&base->lock, flags); |
942 | } | 942 | } |
943 | EXPORT_SYMBOL_GPL(add_timer_on); | 943 | EXPORT_SYMBOL_GPL(add_timer_on); |
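This is where the wake_up_nohz_cpu() split from kernel/sched/core.c is consumed: when a timer is armed on another CPU's wheel, that CPU must re-evaluate its next event whether it is dyntick-idle or a busy full-dynticks CPU with its tick stopped, hence the more general helper and the updated comment.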
@@ -1188,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base) | |||
1188 | spin_unlock_irq(&base->lock); | 1188 | spin_unlock_irq(&base->lock); |
1189 | } | 1189 | } |
1190 | 1190 | ||
1191 | #ifdef CONFIG_NO_HZ | 1191 | #ifdef CONFIG_NO_HZ_COMMON |
1192 | /* | 1192 | /* |
1193 | * Find out when the next timer event is due to happen. This | 1193 | * Find out when the next timer event is due to happen. This |
1194 | * is used on S/390 to stop all activity when a CPU is idle. | 1194 | * is used on S/390 to stop all activity when a CPU is idle. |