-rw-r--r--  Documentation/RCU/stallwarn.txt          |   2
-rw-r--r--  Documentation/cpu-freq/governors.txt     |   4
-rw-r--r--  Documentation/kernel-parameters.txt      |   8
-rw-r--r--  Documentation/timers/NO_HZ.txt           | 273
-rw-r--r--  arch/um/include/shared/common-offsets.h  |   4
-rw-r--r--  arch/um/os-Linux/time.c                  |   2
-rw-r--r--  include/asm-generic/cputime_nsecs.h      |  28
-rw-r--r--  include/linux/perf_event.h               |   6
-rw-r--r--  include/linux/posix-timers.h             |   2
-rw-r--r--  include/linux/rcupdate.h                 |   7
-rw-r--r--  include/linux/sched.h                    |  18
-rw-r--r--  include/linux/tick.h                     |  25
-rw-r--r--  include/trace/events/timer.h             |  21
-rw-r--r--  init/Kconfig                             |   8
-rw-r--r--  init/main.c                              |   1
-rw-r--r--  kernel/events/core.c                     |  17
-rw-r--r--  kernel/hrtimer.c                         |   4
-rw-r--r--  kernel/posix-cpu-timers.c                |  76
-rw-r--r--  kernel/rcutree.c                         |  16
-rw-r--r--  kernel/rcutree.h                         |   2
-rw-r--r--  kernel/rcutree_plugin.h                  |  29
-rw-r--r--  kernel/sched/core.c                      |  62
-rw-r--r--  kernel/sched/fair.c                      |  10
-rw-r--r--  kernel/sched/sched.h                     |  15
-rw-r--r--  kernel/softirq.c                         |  19
-rw-r--r--  kernel/time/Kconfig                      |  81
-rw-r--r--  kernel/time/tick-broadcast.c             |   3
-rw-r--r--  kernel/time/tick-common.c                |   5
-rw-r--r--  kernel/time/tick-sched.c                 | 289
-rw-r--r--  kernel/timer.c                           |  16
30 files changed, 938 insertions, 115 deletions
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index e38b8df3d727..8e9359de1d28 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -191,7 +191,7 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
191o A hardware or software issue shuts off the scheduler-clock 191o A hardware or software issue shuts off the scheduler-clock
192 interrupt on a CPU that is not in dyntick-idle mode. This 192 interrupt on a CPU that is not in dyntick-idle mode. This
193 problem really has happened, and seems to be most likely to 193 problem really has happened, and seems to be most likely to
194 result in RCU CPU stall warnings for CONFIG_NO_HZ=n kernels. 194 result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
195 195
196o A bug in the RCU implementation. 196o A bug in the RCU implementation.
197 197
diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt
index c7a2eb8450c2..e3e5d9ae50cd 100644
--- a/Documentation/cpu-freq/governors.txt
+++ b/Documentation/cpu-freq/governors.txt
@@ -131,8 +131,8 @@ sampling_rate_min:
131The sampling rate is limited by the HW transition latency: 131The sampling rate is limited by the HW transition latency:
132transition_latency * 100 132transition_latency * 100
133Or by kernel restrictions: 133Or by kernel restrictions:
134If CONFIG_NO_HZ is set, the limit is 10ms fixed. 134If CONFIG_NO_HZ_COMMON is set, the limit is 10ms fixed.
135If CONFIG_NO_HZ is not set or nohz=off boot parameter is used, the 135If CONFIG_NO_HZ_COMMON is not set or nohz=off boot parameter is used, the
136limits depend on the CONFIG_HZ option: 136limits depend on the CONFIG_HZ option:
137HZ=1000: min=20000us (20ms) 137HZ=1000: min=20000us (20ms)
138HZ=250: min=80000us (80ms) 138HZ=250: min=80000us (80ms)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index de12397b60a9..7d55ebb5660c 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1951,6 +1951,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
1951 Valid arguments: on, off 1951 Valid arguments: on, off
1952 Default: on 1952 Default: on
1953 1953
1954 nohz_full= [KNL,BOOT]
1955 In kernels built with CONFIG_NO_HZ_FULL=y, set
1956 the specified list of CPUs whose tick will be stopped
1957 whenever possible. The boot CPU will be forced outside
1958 the range to maintain the timekeeping.
1959 The CPUs in this range must also be included in the
1960 rcu_nocbs= set.
1961
1954 noiotrap [SH] Disables trapped I/O port accesses. 1962 noiotrap [SH] Disables trapped I/O port accesses.
1955 1963
1956 noirqdebug [X86-32] Disables the code which attempts to detect and 1964 noirqdebug [X86-32] Disables the code which attempts to detect and
diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt
new file mode 100644
index 000000000000..5b5322024067
--- /dev/null
+++ b/Documentation/timers/NO_HZ.txt
@@ -0,0 +1,273 @@
1 NO_HZ: Reducing Scheduling-Clock Ticks
2
3
4This document describes Kconfig options and boot parameters that can
5reduce the number of scheduling-clock interrupts, thereby improving energy
6efficiency and reducing OS jitter. Reducing OS jitter is important for
7some types of computationally intensive high-performance computing (HPC)
8applications and for real-time applications.
9
10There are two main contexts in which the number of scheduling-clock
11interrupts can be reduced compared to the old-school approach of sending
12a scheduling-clock interrupt to all CPUs every jiffy whether they need
13it or not (CONFIG_HZ_PERIODIC=y or CONFIG_NO_HZ=n for older kernels):
14
151. Idle CPUs (CONFIG_NO_HZ_IDLE=y or CONFIG_NO_HZ=y for older kernels).
16
172. CPUs having only one runnable task (CONFIG_NO_HZ_FULL=y).
18
19These two cases are described in the following two sections, followed
20by a third section on RCU-specific considerations and a fourth and final
21section listing known issues.
22
23
24IDLE CPUs
25
26If a CPU is idle, there is little point in sending it a scheduling-clock
27interrupt. After all, the primary purpose of a scheduling-clock interrupt
28is to force a busy CPU to shift its attention among multiple duties,
29and an idle CPU has no duties to shift its attention among.
30
31The CONFIG_NO_HZ_IDLE=y Kconfig option causes the kernel to avoid sending
32scheduling-clock interrupts to idle CPUs, which is critically important
33both to battery-powered devices and to highly virtualized mainframes.
34A battery-powered device running a CONFIG_HZ_PERIODIC=y kernel would
35drain its battery very quickly, easily 2-3 times as fast as would the
36same device running a CONFIG_NO_HZ_IDLE=y kernel. A mainframe running
371,500 OS instances might find that half of its CPU time was consumed by
38unnecessary scheduling-clock interrupts. In these situations, there
39is strong motivation to avoid sending scheduling-clock interrupts to
40idle CPUs. That said, dyntick-idle mode is not free:
41
421. It increases the number of instructions executed on the path
43 to and from the idle loop.
44
452. On many architectures, dyntick-idle mode also increases the
46 number of expensive clock-reprogramming operations.
47
48Therefore, systems with aggressive real-time response constraints often
49run CONFIG_HZ_PERIODIC=y kernels (or CONFIG_NO_HZ=n for older kernels)
50in order to avoid degrading from-idle transition latencies.
51
52An idle CPU that is not receiving scheduling-clock interrupts is said to
53be "dyntick-idle", "in dyntick-idle mode", "in nohz mode", or "running
54tickless". The remainder of this document will use "dyntick-idle mode".
55
56There is also a boot parameter "nohz=" that can be used to disable
57dyntick-idle mode in CONFIG_NO_HZ_IDLE=y kernels by specifying "nohz=off".
58By default, CONFIG_NO_HZ_IDLE=y kernels boot with "nohz=on", enabling
59dyntick-idle mode.
60
61
62CPUs WITH ONLY ONE RUNNABLE TASK
63
64If a CPU has only one runnable task, there is little point in sending it
65a scheduling-clock interrupt because there is no other task to switch to.
66
67The CONFIG_NO_HZ_FULL=y Kconfig option causes the kernel to avoid
68sending scheduling-clock interrupts to CPUs with a single runnable task,
69and such CPUs are said to be "adaptive-ticks CPUs". This is important
70for applications with aggressive real-time response constraints because
71it allows them to improve their worst-case response times by the maximum
72duration of a scheduling-clock interrupt. It is also important for
73computationally intensive short-iteration workloads: If any CPU is
74delayed during a given iteration, all the other CPUs will be forced to
75wait idle while the delayed CPU finishes. Thus, the delay is multiplied
76by one less than the number of CPUs. In these situations, there is
77again strong motivation to avoid sending scheduling-clock interrupts.
78
79By default, no CPU will be an adaptive-ticks CPU. The "nohz_full="
80boot parameter specifies the adaptive-ticks CPUs. For example,
81"nohz_full=1,6-8" says that CPUs 1, 6, 7, and 8 are to be adaptive-ticks
82CPUs. Note that you are prohibited from marking all of the CPUs as
83adaptive-tick CPUs: At least one non-adaptive-tick CPU must remain
84online to handle timekeeping tasks in order to ensure that system calls
 85 like gettimeofday() return accurate values on adaptive-tick CPUs.
86(This is not an issue for CONFIG_NO_HZ_IDLE=y because there are no
87running user processes to observe slight drifts in clock rate.)
88Therefore, the boot CPU is prohibited from entering adaptive-ticks
89mode. Specifying a "nohz_full=" mask that includes the boot CPU will
90result in a boot-time error message, and the boot CPU will be removed
91from the mask.
92
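For example, on a hypothetical eight-CPU system (the right CPU list
depends entirely on how the workload is partitioned), booting with

	nohz_full=1-7 rcu_nocbs=1-7

leaves CPU 0 as the timekeeping/housekeeping CPU and makes CPUs 1-7
adaptive-ticks CPUs with their RCU callbacks offloaded (see the "RCU
IMPLICATIONS" section below).
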
93Alternatively, the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter specifies
94that all CPUs other than the boot CPU are adaptive-ticks CPUs. This
95Kconfig parameter will be overridden by the "nohz_full=" boot parameter,
96so that if both the CONFIG_NO_HZ_FULL_ALL=y Kconfig parameter and
97the "nohz_full=1" boot parameter is specified, the boot parameter will
98prevail so that only CPU 1 will be an adaptive-ticks CPU.
99
100Finally, adaptive-ticks CPUs must have their RCU callbacks offloaded.
101This is covered in the "RCU IMPLICATIONS" section below.
102
103Normally, a CPU remains in adaptive-ticks mode as long as possible.
104In particular, transitioning to kernel mode does not automatically change
105the mode. Instead, the CPU will exit adaptive-ticks mode only if needed,
106for example, if that CPU enqueues an RCU callback.
107
108Just as with dyntick-idle mode, the benefits of adaptive-tick mode do
109not come for free:
110
1111. CONFIG_NO_HZ_FULL selects CONFIG_NO_HZ_COMMON, so you cannot run
112 adaptive ticks without also running dyntick idle. This dependency
113 extends down into the implementation, so that all of the costs
114 of CONFIG_NO_HZ_IDLE are also incurred by CONFIG_NO_HZ_FULL.
115
1162. The user/kernel transitions are slightly more expensive due
117 to the need to inform kernel subsystems (such as RCU) about
118 the change in mode.
119
1203. POSIX CPU timers on adaptive-tick CPUs may miss their deadlines
121 (perhaps indefinitely) because they currently rely on
122 scheduling-tick interrupts. This will likely be fixed in
123 one of two ways: (1) Prevent CPUs with POSIX CPU timers from
124 entering adaptive-tick mode, or (2) Use hrtimers or other
125 adaptive-ticks-immune mechanism to cause the POSIX CPU timer to
126 fire properly.
127
1284. If there are more perf events pending than the hardware can
129 accommodate, they are normally round-robined so as to collect
130 all of them over time. Adaptive-tick mode may prevent this
131 round-robining from happening. This will likely be fixed by
132 preventing CPUs with large numbers of perf events pending from
133 entering adaptive-tick mode.
134
1355. Scheduler statistics for adaptive-tick CPUs may be computed
136 slightly differently than those for non-adaptive-tick CPUs.
137 This might in turn perturb load-balancing of real-time tasks.
138
1396. The LB_BIAS scheduler feature is disabled by adaptive ticks.
140
141Although improvements are expected over time, adaptive ticks is quite
142useful for many types of real-time and compute-intensive applications.
143However, the drawbacks listed above mean that adaptive ticks should not
144(yet) be enabled by default.
145
146
147RCU IMPLICATIONS
148
149There are situations in which idle CPUs cannot be permitted to
150enter either dyntick-idle mode or adaptive-tick mode, the most
151common being when that CPU has RCU callbacks pending.
152
153The CONFIG_RCU_FAST_NO_HZ=y Kconfig option may be used to cause such CPUs
154to enter dyntick-idle mode or adaptive-tick mode anyway. In this case,
155a timer will awaken these CPUs every four jiffies in order to ensure
156that the RCU callbacks are processed in a timely fashion.
157
158Another approach is to offload RCU callback processing to "rcuo" kthreads
159using the CONFIG_RCU_NOCB_CPU=y Kconfig option. The specific CPUs to
160offload may be selected via several methods:
161
1621. One of three mutually exclusive Kconfig options specifies a
163 build-time default for the CPUs to offload:
164
165 a. The CONFIG_RCU_NOCB_CPU_NONE=y Kconfig option results in
166 no CPUs being offloaded.
167
168 b. The CONFIG_RCU_NOCB_CPU_ZERO=y Kconfig option causes
169 CPU 0 to be offloaded.
170
171 c. The CONFIG_RCU_NOCB_CPU_ALL=y Kconfig option causes all
172 CPUs to be offloaded. Note that the callbacks will be
173 offloaded to "rcuo" kthreads, and that those kthreads
174 will in fact run on some CPU. However, this approach
175 gives fine-grained control on exactly which CPUs the
176 callbacks run on, along with their scheduling priority
177 (including the default of SCHED_OTHER), and it further
178 allows this control to be varied dynamically at runtime.
179
1802. The "rcu_nocbs=" kernel boot parameter, which takes a comma-separated
181 list of CPUs and CPU ranges, for example, "1,3-5" selects CPUs 1,
182 3, 4, and 5. The specified CPUs will be offloaded in addition to
183 any CPUs specified as offloaded by CONFIG_RCU_NOCB_CPU_ZERO=y or
184 CONFIG_RCU_NOCB_CPU_ALL=y. This means that the "rcu_nocbs=" boot
185 parameter has no effect for kernels built with RCU_NOCB_CPU_ALL=y.
186
187The offloaded CPUs will never queue RCU callbacks, and therefore RCU
188never prevents offloaded CPUs from entering either dyntick-idle mode
189or adaptive-tick mode. That said, note that it is up to userspace to
190pin the "rcuo" kthreads to specific CPUs if desired. Otherwise, the
191scheduler will decide where to run them, which might or might not be
192where you want them to run.
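
As a minimal userspace sketch of the pinning mentioned above (not part
of this patch series; the kthread PID is a placeholder discovered via
"ps" or /proc), sched_setaffinity() can be used to bind an "rcuo"
kthread to a housekeeping CPU:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(int argc, char **argv)
	{
		cpu_set_t set;
		pid_t pid;

		if (argc < 2) {
			fprintf(stderr, "usage: %s <rcuo-kthread-pid>\n", argv[0]);
			return 1;
		}
		pid = (pid_t)atoi(argv[1]);	/* PID of the rcuo kthread */

		CPU_ZERO(&set);
		CPU_SET(0, &set);		/* allow housekeeping CPU 0 only */
		if (sched_setaffinity(pid, sizeof(set), &set)) {
			perror("sched_setaffinity");
			return 1;
		}
		return 0;
	}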
193
194
195KNOWN ISSUES
196
197o Dyntick-idle slows transitions to and from idle slightly.
198 In practice, this has not been a problem except for the most
199 aggressive real-time workloads, which have the option of disabling
200 dyntick-idle mode, an option that most of them take. However,
201 some workloads will no doubt want to use adaptive ticks to
202 eliminate scheduling-clock interrupt latencies. Here are some
203 options for these workloads:
204
 205 a. Use PMQOS from userspace to inform the kernel of your latency
 206 requirements (preferred); see the sketch after this list.
207
208 b. On x86 systems, use the "idle=mwait" boot parameter.
209
 210 c. On x86 systems, use the "intel_idle.max_cstate=" boot
 211 parameter to limit the maximum C-state depth.
212
213 d. On x86 systems, use the "idle=poll" boot parameter.
214 However, please note that use of this parameter can cause
215 your CPU to overheat, which may cause thermal throttling
216 to degrade your latencies -- and that this degradation can
217 be even worse than that of dyntick-idle. Furthermore,
218 this parameter effectively disables Turbo Mode on Intel
219 CPUs, which can significantly reduce maximum performance.
220
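	Regarding option (a): a minimal sketch of a userspace PM QoS
	request, assuming the standard /dev/cpu_dma_latency interface
	(not something introduced by this patch series):

	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>

	int main(void)
	{
		int32_t target_us = 20;	/* example latency target */
		int fd = open("/dev/cpu_dma_latency", O_WRONLY);

		if (fd < 0)
			return 1;
		/* Write a 32-bit latency value in microseconds... */
		if (write(fd, &target_us, sizeof(target_us)) != sizeof(target_us))
			return 1;
		/* ...and keep the fd open for as long as the constraint
		   is needed; closing it drops the request. */
		pause();
		return 0;
	}
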
221o Adaptive-ticks slows user/kernel transitions slightly.
222 This is not expected to be a problem for computationally intensive
223 workloads, which have few such transitions. Careful benchmarking
224 will be required to determine whether or not other workloads
225 are significantly affected by this effect.
226
227o Adaptive-ticks does not do anything unless there is only one
228 runnable task for a given CPU, even though there are a number
229 of other situations where the scheduling-clock tick is not
230 needed. To give but one example, consider a CPU that has one
231 runnable high-priority SCHED_FIFO task and an arbitrary number
232 of low-priority SCHED_OTHER tasks. In this case, the CPU is
233 required to run the SCHED_FIFO task until it either blocks or
234 some other higher-priority task awakens on (or is assigned to)
235 this CPU, so there is no point in sending a scheduling-clock
236 interrupt to this CPU. However, the current implementation
237 nevertheless sends scheduling-clock interrupts to CPUs having a
238 single runnable SCHED_FIFO task and multiple runnable SCHED_OTHER
239 tasks, even though these interrupts are unnecessary.
240
241 Better handling of these sorts of situations is future work.
242
243o A reboot is required to reconfigure both adaptive idle and RCU
244 callback offloading. Runtime reconfiguration could be provided
245 if needed, however, due to the complexity of reconfiguring RCU at
246 runtime, there would need to be an earthshakingly good reason.
247 Especially given that you have the straightforward option of
248 simply offloading RCU callbacks from all CPUs and pinning them
249 where you want them whenever you want them pinned.
250
251o Additional configuration is required to deal with other sources
252 of OS jitter, including interrupts and system-utility tasks
253 and processes. This configuration normally involves binding
254 interrupts and tasks to particular CPUs.
255
256o Some sources of OS jitter can currently be eliminated only by
257 constraining the workload. For example, the only way to eliminate
258 OS jitter due to global TLB shootdowns is to avoid the unmapping
259 operations (such as kernel module unload operations) that
260 result in these shootdowns. For another example, page faults
261 and TLB misses can be reduced (and in some cases eliminated) by
262 using huge pages and by constraining the amount of memory used
263 by the application. Pre-faulting the working set can also be
264 helpful, especially when combined with the mlock() and mlockall()
265 system calls.
266
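	As a small illustration of the pre-faulting technique mentioned
	in the previous item (a generic userspace sketch, not something
	defined by this patch series; the working-set size is made up):

	#include <stdlib.h>
	#include <string.h>
	#include <sys/mman.h>

	#define WORKING_SET_BYTES (64UL * 1024 * 1024)	/* example size */

	int main(void)
	{
		char *buf;

		/* Lock current and future mappings into RAM. */
		if (mlockall(MCL_CURRENT | MCL_FUTURE))
			return 1;

		buf = malloc(WORKING_SET_BYTES);
		if (!buf)
			return 1;

		/*
		 * Touch every page now so that the page faults (and the
		 * OS jitter they cause) happen before the latency-sensitive
		 * phase of the application begins.
		 */
		memset(buf, 1, WORKING_SET_BYTES);

		/* ... latency-sensitive work runs here ... */
		return 0;
	}
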
267o Unless all CPUs are idle, at least one CPU must keep the
268 scheduling-clock interrupt going in order to support accurate
269 timekeeping.
270
271o If there are adaptive-ticks CPUs, there will be at least one
272 CPU keeping the scheduling-clock interrupt going, even if all
273 CPUs are otherwise idle.
diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h
index 2df313b6a586..c92306809029 100644
--- a/arch/um/include/shared/common-offsets.h
+++ b/arch/um/include/shared/common-offsets.h
@@ -30,8 +30,8 @@ DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC);
30#ifdef CONFIG_PRINTK 30#ifdef CONFIG_PRINTK
31DEFINE(UML_CONFIG_PRINTK, CONFIG_PRINTK); 31DEFINE(UML_CONFIG_PRINTK, CONFIG_PRINTK);
32#endif 32#endif
33#ifdef CONFIG_NO_HZ 33#ifdef CONFIG_NO_HZ_COMMON
34DEFINE(UML_CONFIG_NO_HZ, CONFIG_NO_HZ); 34DEFINE(UML_CONFIG_NO_HZ_COMMON, CONFIG_NO_HZ_COMMON);
35#endif 35#endif
36#ifdef CONFIG_UML_X86 36#ifdef CONFIG_UML_X86
37DEFINE(UML_CONFIG_UML_X86, CONFIG_UML_X86); 37DEFINE(UML_CONFIG_UML_X86, CONFIG_UML_X86);
diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c
index fac388cb464f..e9824d5dd7d5 100644
--- a/arch/um/os-Linux/time.c
+++ b/arch/um/os-Linux/time.c
@@ -79,7 +79,7 @@ long long os_nsecs(void)
79 return timeval_to_ns(&tv); 79 return timeval_to_ns(&tv);
80} 80}
81 81
82#ifdef UML_CONFIG_NO_HZ 82#ifdef UML_CONFIG_NO_HZ_COMMON
83static int after_sleep_interval(struct timespec *ts) 83static int after_sleep_interval(struct timespec *ts)
84{ 84{
85 return 0; 85 return 0;
diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h
index a8ece9a33aef..2c9e62c2bfd0 100644
--- a/include/asm-generic/cputime_nsecs.h
+++ b/include/asm-generic/cputime_nsecs.h
@@ -16,21 +16,27 @@
16#ifndef _ASM_GENERIC_CPUTIME_NSECS_H 16#ifndef _ASM_GENERIC_CPUTIME_NSECS_H
17#define _ASM_GENERIC_CPUTIME_NSECS_H 17#define _ASM_GENERIC_CPUTIME_NSECS_H
18 18
19#include <linux/math64.h>
20
19typedef u64 __nocast cputime_t; 21typedef u64 __nocast cputime_t;
20typedef u64 __nocast cputime64_t; 22typedef u64 __nocast cputime64_t;
21 23
22#define cputime_one_jiffy jiffies_to_cputime(1) 24#define cputime_one_jiffy jiffies_to_cputime(1)
23 25
26#define cputime_div(__ct, divisor) div_u64((__force u64)__ct, divisor)
27#define cputime_div_rem(__ct, divisor, remainder) \
28 div_u64_rem((__force u64)__ct, divisor, remainder);
29
24/* 30/*
25 * Convert cputime <-> jiffies (HZ) 31 * Convert cputime <-> jiffies (HZ)
26 */ 32 */
27#define cputime_to_jiffies(__ct) \ 33#define cputime_to_jiffies(__ct) \
28 ((__force u64)(__ct) / (NSEC_PER_SEC / HZ)) 34 cputime_div(__ct, NSEC_PER_SEC / HZ)
29#define cputime_to_scaled(__ct) (__ct) 35#define cputime_to_scaled(__ct) (__ct)
30#define jiffies_to_cputime(__jif) \ 36#define jiffies_to_cputime(__jif) \
31 (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ)) 37 (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ))
32#define cputime64_to_jiffies64(__ct) \ 38#define cputime64_to_jiffies64(__ct) \
33 ((__force u64)(__ct) / (NSEC_PER_SEC / HZ)) 39 cputime_div(__ct, NSEC_PER_SEC / HZ)
34#define jiffies64_to_cputime64(__jif) \ 40#define jiffies64_to_cputime64(__jif) \
35 (__force cputime64_t)((__jif) * (NSEC_PER_SEC / HZ)) 41 (__force cputime64_t)((__jif) * (NSEC_PER_SEC / HZ))
36 42
@@ -45,7 +51,7 @@ typedef u64 __nocast cputime64_t;
45 * Convert cputime <-> microseconds 51 * Convert cputime <-> microseconds
46 */ 52 */
47#define cputime_to_usecs(__ct) \ 53#define cputime_to_usecs(__ct) \
48 ((__force u64)(__ct) / NSEC_PER_USEC) 54 cputime_div(__ct, NSEC_PER_USEC)
49#define usecs_to_cputime(__usecs) \ 55#define usecs_to_cputime(__usecs) \
50 (__force cputime_t)((__usecs) * NSEC_PER_USEC) 56 (__force cputime_t)((__usecs) * NSEC_PER_USEC)
51#define usecs_to_cputime64(__usecs) \ 57#define usecs_to_cputime64(__usecs) \
@@ -55,7 +61,7 @@ typedef u64 __nocast cputime64_t;
55 * Convert cputime <-> seconds 61 * Convert cputime <-> seconds
56 */ 62 */
57#define cputime_to_secs(__ct) \ 63#define cputime_to_secs(__ct) \
58 ((__force u64)(__ct) / NSEC_PER_SEC) 64 cputime_div(__ct, NSEC_PER_SEC)
59#define secs_to_cputime(__secs) \ 65#define secs_to_cputime(__secs) \
60 (__force cputime_t)((__secs) * NSEC_PER_SEC) 66 (__force cputime_t)((__secs) * NSEC_PER_SEC)
61 67
@@ -69,8 +75,10 @@ static inline cputime_t timespec_to_cputime(const struct timespec *val)
69} 75}
70static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val) 76static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val)
71{ 77{
72 val->tv_sec = (__force u64) ct / NSEC_PER_SEC; 78 u32 rem;
73 val->tv_nsec = (__force u64) ct % NSEC_PER_SEC; 79
80 val->tv_sec = cputime_div_rem(ct, NSEC_PER_SEC, &rem);
81 val->tv_nsec = rem;
74} 82}
75 83
76/* 84/*
@@ -83,15 +91,17 @@ static inline cputime_t timeval_to_cputime(const struct timeval *val)
83} 91}
84static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val) 92static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val)
85{ 93{
86 val->tv_sec = (__force u64) ct / NSEC_PER_SEC; 94 u32 rem;
87 val->tv_usec = ((__force u64) ct % NSEC_PER_SEC) / NSEC_PER_USEC; 95
96 val->tv_sec = cputime_div_rem(ct, NSEC_PER_SEC, &rem);
97 val->tv_usec = rem / NSEC_PER_USEC;
88} 98}
89 99
90/* 100/*
91 * Convert cputime <-> clock (USER_HZ) 101 * Convert cputime <-> clock (USER_HZ)
92 */ 102 */
93#define cputime_to_clock_t(__ct) \ 103#define cputime_to_clock_t(__ct) \
94 ((__force u64)(__ct) / (NSEC_PER_SEC / USER_HZ)) 104 cputime_div(__ct, (NSEC_PER_SEC / USER_HZ))
95#define clock_t_to_cputime(__x) \ 105#define clock_t_to_cputime(__x) \
96 (__force cputime_t)((__x) * (NSEC_PER_SEC / USER_HZ)) 106 (__force cputime_t)((__x) * (NSEC_PER_SEC / USER_HZ))
97 107
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e0373d26c244..f463a46424e2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -788,6 +788,12 @@ static inline int __perf_event_disable(void *info) { return -1; }
788static inline void perf_event_task_tick(void) { } 788static inline void perf_event_task_tick(void) { }
789#endif 789#endif
790 790
791#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL)
792extern bool perf_event_can_stop_tick(void);
793#else
794static inline bool perf_event_can_stop_tick(void) { return true; }
795#endif
796
791#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) 797#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
792extern void perf_restore_debug_store(void); 798extern void perf_restore_debug_store(void);
793#else 799#else
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 042058fdb0af..3698d9d08978 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -122,6 +122,8 @@ void run_posix_cpu_timers(struct task_struct *task);
122void posix_cpu_timers_exit(struct task_struct *task); 122void posix_cpu_timers_exit(struct task_struct *task);
123void posix_cpu_timers_exit_group(struct task_struct *task); 123void posix_cpu_timers_exit_group(struct task_struct *task);
124 124
125bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk);
126
125void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, 127void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
126 cputime_t *newval, cputime_t *oldval); 128 cputime_t *newval, cputime_t *oldval);
127 129
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 9ed2c9a4de45..4ccd68e49b00 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1000,4 +1000,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
1000#define kfree_rcu(ptr, rcu_head) \ 1000#define kfree_rcu(ptr, rcu_head) \
1001 __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) 1001 __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
1002 1002
1003#ifdef CONFIG_RCU_NOCB_CPU
1004extern bool rcu_is_nocb_cpu(int cpu);
1005#else
1006static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
1007#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
1008
1009
1003#endif /* __LINUX_RCUPDATE_H */ 1010#endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 981ab6887259..ebf7095158a9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -231,7 +231,7 @@ extern void init_idle_bootup_task(struct task_struct *idle);
231 231
232extern int runqueue_is_locked(int cpu); 232extern int runqueue_is_locked(int cpu);
233 233
234#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 234#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
235extern void nohz_balance_enter_idle(int cpu); 235extern void nohz_balance_enter_idle(int cpu);
236extern void set_cpu_sd_state_idle(void); 236extern void set_cpu_sd_state_idle(void);
237extern int get_nohz_timer_target(void); 237extern int get_nohz_timer_target(void);
@@ -1762,13 +1762,13 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
1762} 1762}
1763#endif 1763#endif
1764 1764
1765#ifdef CONFIG_NO_HZ 1765#ifdef CONFIG_NO_HZ_COMMON
1766void calc_load_enter_idle(void); 1766void calc_load_enter_idle(void);
1767void calc_load_exit_idle(void); 1767void calc_load_exit_idle(void);
1768#else 1768#else
1769static inline void calc_load_enter_idle(void) { } 1769static inline void calc_load_enter_idle(void) { }
1770static inline void calc_load_exit_idle(void) { } 1770static inline void calc_load_exit_idle(void) { }
1771#endif /* CONFIG_NO_HZ */ 1771#endif /* CONFIG_NO_HZ_COMMON */
1772 1772
1773#ifndef CONFIG_CPUMASK_OFFSTACK 1773#ifndef CONFIG_CPUMASK_OFFSTACK
1774static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 1774static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
@@ -1854,10 +1854,16 @@ extern void idle_task_exit(void);
1854static inline void idle_task_exit(void) {} 1854static inline void idle_task_exit(void) {}
1855#endif 1855#endif
1856 1856
1857#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 1857#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
1858extern void wake_up_idle_cpu(int cpu); 1858extern void wake_up_nohz_cpu(int cpu);
1859#else 1859#else
1860static inline void wake_up_idle_cpu(int cpu) { } 1860static inline void wake_up_nohz_cpu(int cpu) { }
1861#endif
1862
1863#ifdef CONFIG_NO_HZ_FULL
1864extern bool sched_can_stop_tick(void);
1865#else
1866static inline bool sched_can_stop_tick(void) { return false; }
1861#endif 1867#endif
1862 1868
1863#ifdef CONFIG_SCHED_AUTOGROUP 1869#ifdef CONFIG_SCHED_AUTOGROUP
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 553272e6af55..9180f4b85e6d 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -82,7 +82,7 @@ extern int tick_program_event(ktime_t expires, int force);
82extern void tick_setup_sched_timer(void); 82extern void tick_setup_sched_timer(void);
83# endif 83# endif
84 84
85# if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS 85# if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
86extern void tick_cancel_sched_timer(int cpu); 86extern void tick_cancel_sched_timer(int cpu);
87# else 87# else
88static inline void tick_cancel_sched_timer(int cpu) { } 88static inline void tick_cancel_sched_timer(int cpu) { }
@@ -123,7 +123,7 @@ static inline void tick_check_idle(int cpu) { }
123static inline int tick_oneshot_mode_active(void) { return 0; } 123static inline int tick_oneshot_mode_active(void) { return 0; }
124#endif /* !CONFIG_GENERIC_CLOCKEVENTS */ 124#endif /* !CONFIG_GENERIC_CLOCKEVENTS */
125 125
126# ifdef CONFIG_NO_HZ 126# ifdef CONFIG_NO_HZ_COMMON
127DECLARE_PER_CPU(struct tick_sched, tick_cpu_sched); 127DECLARE_PER_CPU(struct tick_sched, tick_cpu_sched);
128 128
129static inline int tick_nohz_tick_stopped(void) 129static inline int tick_nohz_tick_stopped(void)
@@ -138,7 +138,7 @@ extern ktime_t tick_nohz_get_sleep_length(void);
138extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); 138extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
139extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); 139extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
140 140
141# else /* !CONFIG_NO_HZ */ 141# else /* !CONFIG_NO_HZ_COMMON */
142static inline int tick_nohz_tick_stopped(void) 142static inline int tick_nohz_tick_stopped(void)
143{ 143{
144 return 0; 144 return 0;
@@ -155,7 +155,24 @@ static inline ktime_t tick_nohz_get_sleep_length(void)
155} 155}
156static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } 156static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
157static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } 157static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
158# endif /* !NO_HZ */ 158# endif /* !CONFIG_NO_HZ_COMMON */
159
160#ifdef CONFIG_NO_HZ_FULL
161extern void tick_nohz_init(void);
162extern int tick_nohz_full_cpu(int cpu);
163extern void tick_nohz_full_check(void);
164extern void tick_nohz_full_kick(void);
165extern void tick_nohz_full_kick_all(void);
166extern void tick_nohz_task_switch(struct task_struct *tsk);
167#else
168static inline void tick_nohz_init(void) { }
169static inline int tick_nohz_full_cpu(int cpu) { return 0; }
170static inline void tick_nohz_full_check(void) { }
171static inline void tick_nohz_full_kick(void) { }
172static inline void tick_nohz_full_kick_all(void) { }
173static inline void tick_nohz_task_switch(struct task_struct *tsk) { }
174#endif
175
159 176
160# ifdef CONFIG_CPU_IDLE_GOV_MENU 177# ifdef CONFIG_CPU_IDLE_GOV_MENU
161extern void menu_hrtimer_cancel(void); 178extern void menu_hrtimer_cancel(void);
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 425bcfe56c62..e967dd8a34c6 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -323,6 +323,27 @@ TRACE_EVENT(itimer_expire,
323 (int) __entry->pid, (unsigned long long)__entry->now) 323 (int) __entry->pid, (unsigned long long)__entry->now)
324); 324);
325 325
326#ifdef CONFIG_NO_HZ_COMMON
327TRACE_EVENT(tick_stop,
328
329 TP_PROTO(int success, char *error_msg),
330
331 TP_ARGS(success, error_msg),
332
333 TP_STRUCT__entry(
334 __field( int , success )
335 __string( msg, error_msg )
336 ),
337
338 TP_fast_assign(
339 __entry->success = success;
340 __assign_str(msg, error_msg);
341 ),
342
343 TP_printk("success=%s msg=%s", __entry->success ? "yes" : "no", __get_str(msg))
344);
345#endif
346
326#endif /* _TRACE_TIMER_H */ 347#endif /* _TRACE_TIMER_H */
327 348
328/* This part must be outside protection */ 349/* This part must be outside protection */
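
The tick_stop event added above can be enabled like any other trace
event (assuming tracefs/debugfs is mounted at the usual location; the
event belongs to the "timer" group defined by this header):

	echo 1 > /sys/kernel/debug/tracing/events/timer/tick_stop/enable
	cat /sys/kernel/debug/tracing/trace_pipe

Each record then reports "success=yes" or "success=no" together with the
msg string supplied by the caller, per the TP_printk() format above.
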
diff --git a/init/Kconfig b/init/Kconfig
index 4367e1379002..66f67afad4fa 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -302,7 +302,7 @@ choice
302# Kind of a stub config for the pure tick based cputime accounting 302# Kind of a stub config for the pure tick based cputime accounting
303config TICK_CPU_ACCOUNTING 303config TICK_CPU_ACCOUNTING
304 bool "Simple tick based cputime accounting" 304 bool "Simple tick based cputime accounting"
305 depends on !S390 305 depends on !S390 && !NO_HZ_FULL
306 help 306 help
307 This is the basic tick based cputime accounting that maintains 307 This is the basic tick based cputime accounting that maintains
308 statistics about user, system and idle time spent on per jiffies 308 statistics about user, system and idle time spent on per jiffies
@@ -312,7 +312,7 @@ config TICK_CPU_ACCOUNTING
312 312
313config VIRT_CPU_ACCOUNTING_NATIVE 313config VIRT_CPU_ACCOUNTING_NATIVE
314 bool "Deterministic task and CPU time accounting" 314 bool "Deterministic task and CPU time accounting"
315 depends on HAVE_VIRT_CPU_ACCOUNTING 315 depends on HAVE_VIRT_CPU_ACCOUNTING && !NO_HZ_FULL
316 select VIRT_CPU_ACCOUNTING 316 select VIRT_CPU_ACCOUNTING
317 help 317 help
318 Select this option to enable more accurate task and CPU time 318 Select this option to enable more accurate task and CPU time
@@ -342,7 +342,7 @@ config VIRT_CPU_ACCOUNTING_GEN
342 342
343config IRQ_TIME_ACCOUNTING 343config IRQ_TIME_ACCOUNTING
344 bool "Fine granularity task level IRQ time accounting" 344 bool "Fine granularity task level IRQ time accounting"
345 depends on HAVE_IRQ_TIME_ACCOUNTING 345 depends on HAVE_IRQ_TIME_ACCOUNTING && !NO_HZ_FULL
346 help 346 help
347 Select this option to enable fine granularity task irq time 347 Select this option to enable fine granularity task irq time
348 accounting. This is done by reading a timestamp on each 348 accounting. This is done by reading a timestamp on each
@@ -576,7 +576,7 @@ config RCU_FANOUT_EXACT
576 576
577config RCU_FAST_NO_HZ 577config RCU_FAST_NO_HZ
578 bool "Accelerate last non-dyntick-idle CPU's grace periods" 578 bool "Accelerate last non-dyntick-idle CPU's grace periods"
579 depends on NO_HZ && SMP 579 depends on NO_HZ_COMMON && SMP
580 default n 580 default n
581 help 581 help
582 This option permits CPUs to enter dynticks-idle state even if 582 This option permits CPUs to enter dynticks-idle state even if
diff --git a/init/main.c b/init/main.c
index 12c366944dbd..1952bf2f6875 100644
--- a/init/main.c
+++ b/init/main.c
@@ -545,6 +545,7 @@ asmlinkage void __init start_kernel(void)
545 idr_init_cache(); 545 idr_init_cache();
546 perf_event_init(); 546 perf_event_init();
547 rcu_init(); 547 rcu_init();
548 tick_nohz_init();
548 radix_tree_init(); 549 radix_tree_init();
549 /* init some links before init_ISA_irqs() */ 550 /* init some links before init_ISA_irqs() */
550 early_irq_init(); 551 early_irq_init();
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3820e3cefbae..6b41c1899a8b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -18,6 +18,7 @@
18#include <linux/poll.h> 18#include <linux/poll.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/hash.h> 20#include <linux/hash.h>
21#include <linux/tick.h>
21#include <linux/sysfs.h> 22#include <linux/sysfs.h>
22#include <linux/dcache.h> 23#include <linux/dcache.h>
23#include <linux/percpu.h> 24#include <linux/percpu.h>
@@ -685,8 +686,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
685 686
686 WARN_ON(!irqs_disabled()); 687 WARN_ON(!irqs_disabled());
687 688
688 if (list_empty(&cpuctx->rotation_list)) 689 if (list_empty(&cpuctx->rotation_list)) {
690 int was_empty = list_empty(head);
689 list_add(&cpuctx->rotation_list, head); 691 list_add(&cpuctx->rotation_list, head);
692 if (was_empty)
693 tick_nohz_full_kick();
694 }
690} 695}
691 696
692static void get_ctx(struct perf_event_context *ctx) 697static void get_ctx(struct perf_event_context *ctx)
@@ -2591,6 +2596,16 @@ done:
2591 list_del_init(&cpuctx->rotation_list); 2596 list_del_init(&cpuctx->rotation_list);
2592} 2597}
2593 2598
2599#ifdef CONFIG_NO_HZ_FULL
2600bool perf_event_can_stop_tick(void)
2601{
2602 if (list_empty(&__get_cpu_var(rotation_list)))
2603 return true;
2604 else
2605 return false;
2606}
2607#endif
2608
2594void perf_event_task_tick(void) 2609void perf_event_task_tick(void)
2595{ 2610{
2596 struct list_head *head = &__get_cpu_var(rotation_list); 2611 struct list_head *head = &__get_cpu_var(rotation_list);
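
Taken together, perf_event_can_stop_tick() above, sched_can_stop_tick()
and posix_cpu_timers_can_stop_tick() (declared in the header hunks above
and implemented further down in this series) are meant to be consulted
before the tick is stopped on a full-dynticks CPU. The actual gate lives
in kernel/time/tick-sched.c, which is part of this series but not shown
in this excerpt; a simplified sketch of the idea, with the exact
structure assumed rather than quoted:

	/* Sketch: may this CPU's tick be stopped right now? */
	static bool can_stop_full_tick(void)
	{
		if (!sched_can_stop_tick())
			return false;	/* more than one runnable task */

		if (!posix_cpu_timers_can_stop_tick(current))
			return false;	/* CPU timers still need the tick */

		if (!perf_event_can_stop_tick())
			return false;	/* pending events need the rotation tick */

		return true;
	}

The tick_stop trace event defined in include/trace/events/timer.h earlier
in this series is the natural place for such a gate to record why the
tick could not be stopped.
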
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 14be27feda49..abfd89d687ac 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -161,7 +161,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
161 */ 161 */
162static int hrtimer_get_target(int this_cpu, int pinned) 162static int hrtimer_get_target(int this_cpu, int pinned)
163{ 163{
164#ifdef CONFIG_NO_HZ 164#ifdef CONFIG_NO_HZ_COMMON
165 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) 165 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
166 return get_nohz_timer_target(); 166 return get_nohz_timer_target();
167#endif 167#endif
@@ -1107,7 +1107,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1107} 1107}
1108EXPORT_SYMBOL_GPL(hrtimer_get_remaining); 1108EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1109 1109
1110#ifdef CONFIG_NO_HZ 1110#ifdef CONFIG_NO_HZ_COMMON
1111/** 1111/**
1112 * hrtimer_get_next_event - get the time until next expiry event 1112 * hrtimer_get_next_event - get the time until next expiry event
1113 * 1113 *
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 8fd709c9bb58..42670e9b44e0 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -10,6 +10,8 @@
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12#include <linux/random.h> 12#include <linux/random.h>
13#include <linux/tick.h>
14#include <linux/workqueue.h>
13 15
14/* 16/*
15 * Called after updating RLIMIT_CPU to run cpu timer and update 17 * Called after updating RLIMIT_CPU to run cpu timer and update
@@ -153,6 +155,21 @@ static void bump_cpu_timer(struct k_itimer *timer,
153 } 155 }
154} 156}
155 157
158/**
159 * task_cputime_zero - Check a task_cputime struct for all zero fields.
160 *
161 * @cputime: The struct to compare.
162 *
163 * Checks @cputime to see if all fields are zero. Returns true if all fields
164 * are zero, false if any field is nonzero.
165 */
166static inline int task_cputime_zero(const struct task_cputime *cputime)
167{
168 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
169 return 1;
170 return 0;
171}
172
156static inline cputime_t prof_ticks(struct task_struct *p) 173static inline cputime_t prof_ticks(struct task_struct *p)
157{ 174{
158 cputime_t utime, stime; 175 cputime_t utime, stime;
@@ -636,6 +653,37 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
636 return 0; 653 return 0;
637} 654}
638 655
656#ifdef CONFIG_NO_HZ_FULL
657static void nohz_kick_work_fn(struct work_struct *work)
658{
659 tick_nohz_full_kick_all();
660}
661
662static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
663
664/*
665 * We need the IPIs to be sent from sane process context.
666 * The posix cpu timers are always set with irqs disabled.
667 */
668static void posix_cpu_timer_kick_nohz(void)
669{
670 schedule_work(&nohz_kick_work);
671}
672
673bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
674{
675 if (!task_cputime_zero(&tsk->cputime_expires))
676 return false;
677
678 if (tsk->signal->cputimer.running)
679 return false;
680
681 return true;
682}
683#else
684static inline void posix_cpu_timer_kick_nohz(void) { }
685#endif
686
639/* 687/*
640 * Guts of sys_timer_settime for CPU timers. 688 * Guts of sys_timer_settime for CPU timers.
641 * This is called with the timer locked and interrupts disabled. 689 * This is called with the timer locked and interrupts disabled.
@@ -794,6 +842,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
794 sample_to_timespec(timer->it_clock, 842 sample_to_timespec(timer->it_clock,
795 old_incr, &old->it_interval); 843 old_incr, &old->it_interval);
796 } 844 }
845 if (!ret)
846 posix_cpu_timer_kick_nohz();
797 return ret; 847 return ret;
798} 848}
799 849
@@ -1008,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1008 } 1058 }
1009} 1059}
1010 1060
1011/**
1012 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1013 *
1014 * @cputime: The struct to compare.
1015 *
1016 * Checks @cputime to see if all fields are zero. Returns true if all fields
1017 * are zero, false if any field is nonzero.
1018 */
1019static inline int task_cputime_zero(const struct task_cputime *cputime)
1020{
1021 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
1022 return 1;
1023 return 0;
1024}
1025
1026/* 1061/*
1027 * Check for any per-thread CPU timers that have fired and move them 1062 * Check for any per-thread CPU timers that have fired and move them
1028 * off the tsk->*_timers list onto the firing list. Per-thread timers 1063 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1336,6 +1371,13 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1336 cpu_timer_fire(timer); 1371 cpu_timer_fire(timer);
1337 spin_unlock(&timer->it_lock); 1372 spin_unlock(&timer->it_lock);
1338 } 1373 }
1374
1375 /*
1376 * In case some timers were rescheduled after the queue got emptied,
1377 * wake up full dynticks CPUs.
1378 */
1379 if (tsk->signal->cputimer.running)
1380 posix_cpu_timer_kick_nohz();
1339} 1381}
1340 1382
1341/* 1383/*
@@ -1366,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1366 } 1408 }
1367 1409
1368 if (!*newval) 1410 if (!*newval)
1369 return; 1411 goto out;
1370 *newval += now.cpu; 1412 *newval += now.cpu;
1371 } 1413 }
1372 1414
@@ -1384,6 +1426,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1384 tsk->signal->cputime_expires.virt_exp = *newval; 1426 tsk->signal->cputime_expires.virt_exp = *newval;
1385 break; 1427 break;
1386 } 1428 }
1429out:
1430 posix_cpu_timer_kick_nohz();
1387} 1431}
1388 1432
1389static int do_cpu_nanosleep(const clockid_t which_clock, int flags, 1433static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d8534308fd05..16ea67925015 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -799,6 +799,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
799 rdp->offline_fqs++; 799 rdp->offline_fqs++;
800 return 1; 800 return 1;
801 } 801 }
802
803 /*
804 * There is a possibility that a CPU in adaptive-ticks state
805 * might run in the kernel with the scheduling-clock tick disabled
806 * for an extended time period. Invoke rcu_kick_nohz_cpu() to
 807 * force the CPU to restart the scheduling-clock tick if this
 808 * CPU is in this state.
809 */
810 rcu_kick_nohz_cpu(rdp->cpu);
811
802 return 0; 812 return 0;
803} 813}
804 814
@@ -1820,7 +1830,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1820 struct rcu_node *rnp, struct rcu_data *rdp) 1830 struct rcu_node *rnp, struct rcu_data *rdp)
1821{ 1831{
1822 /* No-CBs CPUs do not have orphanable callbacks. */ 1832 /* No-CBs CPUs do not have orphanable callbacks. */
1823 if (is_nocb_cpu(rdp->cpu)) 1833 if (rcu_is_nocb_cpu(rdp->cpu))
1824 return; 1834 return;
1825 1835
1826 /* 1836 /*
@@ -2892,10 +2902,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
2892 * corresponding CPU's preceding callbacks have been invoked. 2902 * corresponding CPU's preceding callbacks have been invoked.
2893 */ 2903 */
2894 for_each_possible_cpu(cpu) { 2904 for_each_possible_cpu(cpu) {
2895 if (!cpu_online(cpu) && !is_nocb_cpu(cpu)) 2905 if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
2896 continue; 2906 continue;
2897 rdp = per_cpu_ptr(rsp->rda, cpu); 2907 rdp = per_cpu_ptr(rsp->rda, cpu);
2898 if (is_nocb_cpu(cpu)) { 2908 if (rcu_is_nocb_cpu(cpu)) {
2899 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 2909 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
2900 rsp->n_barrier_done); 2910 rsp->n_barrier_done);
2901 atomic_inc(&rsp->barrier_cpu_count); 2911 atomic_inc(&rsp->barrier_cpu_count);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14ee40795d6f..da77a8f57ff9 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -530,13 +530,13 @@ static int rcu_nocb_needs_gp(struct rcu_state *rsp);
530static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); 530static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
531static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 531static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
532static void rcu_init_one_nocb(struct rcu_node *rnp); 532static void rcu_init_one_nocb(struct rcu_node *rnp);
533static bool is_nocb_cpu(int cpu);
534static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 533static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
535 bool lazy); 534 bool lazy);
536static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 535static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
537 struct rcu_data *rdp); 536 struct rcu_data *rdp);
538static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 537static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
539static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 538static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
539static void rcu_kick_nohz_cpu(int cpu);
540static bool init_nocb_callback_list(struct rcu_data *rdp); 540static bool init_nocb_callback_list(struct rcu_data *rdp);
541 541
542#endif /* #ifndef RCU_TREE_NONCORE */ 542#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index d084ae3f281c..71bd7337d0cc 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,6 +28,7 @@
28#include <linux/gfp.h> 28#include <linux/gfp.h>
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include <linux/tick.h>
31 32
32#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
33 34
@@ -2052,7 +2053,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2052} 2053}
2053 2054
2054/* Is the specified CPU a no-CPUs CPU? */ 2055/* Is the specified CPU a no-CPUs CPU? */
2055static bool is_nocb_cpu(int cpu) 2056bool rcu_is_nocb_cpu(int cpu)
2056{ 2057{
2057 if (have_rcu_nocb_mask) 2058 if (have_rcu_nocb_mask)
2058 return cpumask_test_cpu(cpu, rcu_nocb_mask); 2059 return cpumask_test_cpu(cpu, rcu_nocb_mask);
@@ -2110,7 +2111,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2110 bool lazy) 2111 bool lazy)
2111{ 2112{
2112 2113
2113 if (!is_nocb_cpu(rdp->cpu)) 2114 if (!rcu_is_nocb_cpu(rdp->cpu))
2114 return 0; 2115 return 0;
2115 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); 2116 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2116 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2117 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
@@ -2134,7 +2135,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2134 long qll = rsp->qlen_lazy; 2135 long qll = rsp->qlen_lazy;
2135 2136
2136 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ 2137 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2137 if (!is_nocb_cpu(smp_processor_id())) 2138 if (!rcu_is_nocb_cpu(smp_processor_id()))
2138 return 0; 2139 return 0;
2139 rsp->qlen = 0; 2140 rsp->qlen = 0;
2140 rsp->qlen_lazy = 0; 2141 rsp->qlen_lazy = 0;
@@ -2306,11 +2307,6 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2306{ 2307{
2307} 2308}
2308 2309
2309static bool is_nocb_cpu(int cpu)
2310{
2311 return false;
2312}
2313
2314static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2310static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2315 bool lazy) 2311 bool lazy)
2316{ 2312{
@@ -2337,3 +2333,20 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2337} 2333}
2338 2334
2339#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 2335#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
2336
2337/*
2338 * An adaptive-ticks CPU can potentially execute in kernel mode for an
2339 * arbitrarily long period of time with the scheduling-clock tick turned
2340 * off. RCU will be paying attention to this CPU because it is in the
2341 * kernel, but the CPU cannot be guaranteed to be executing the RCU state
2342 * machine because the scheduling-clock tick has been disabled. Therefore,
2343 * if an adaptive-ticks CPU is failing to respond to the current grace
 2344 * period and has not been idle from an RCU perspective, kick it.
2345 */
2346static void rcu_kick_nohz_cpu(int cpu)
2347{
2348#ifdef CONFIG_NO_HZ_FULL
2349 if (tick_nohz_full_cpu(cpu))
2350 smp_send_reschedule(cpu);
2351#endif /* #ifdef CONFIG_NO_HZ_FULL */
2352}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c70a8814a767..e94842d4400c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -544,7 +544,7 @@ void resched_cpu(int cpu)
544 raw_spin_unlock_irqrestore(&rq->lock, flags); 544 raw_spin_unlock_irqrestore(&rq->lock, flags);
545} 545}
546 546
547#ifdef CONFIG_NO_HZ 547#ifdef CONFIG_NO_HZ_COMMON
548/* 548/*
549 * In the semi idle case, use the nearest busy cpu for migrating timers 549 * In the semi idle case, use the nearest busy cpu for migrating timers
550 * from an idle cpu. This is good for power-savings. 550 * from an idle cpu. This is good for power-savings.
@@ -582,7 +582,7 @@ unlock:
582 * account when the CPU goes back to idle and evaluates the timer 582 * account when the CPU goes back to idle and evaluates the timer
583 * wheel for the next timer event. 583 * wheel for the next timer event.
584 */ 584 */
585void wake_up_idle_cpu(int cpu) 585static void wake_up_idle_cpu(int cpu)
586{ 586{
587 struct rq *rq = cpu_rq(cpu); 587 struct rq *rq = cpu_rq(cpu);
588 588
@@ -612,20 +612,56 @@ void wake_up_idle_cpu(int cpu)
612 smp_send_reschedule(cpu); 612 smp_send_reschedule(cpu);
613} 613}
614 614
615static bool wake_up_full_nohz_cpu(int cpu)
616{
617 if (tick_nohz_full_cpu(cpu)) {
618 if (cpu != smp_processor_id() ||
619 tick_nohz_tick_stopped())
620 smp_send_reschedule(cpu);
621 return true;
622 }
623
624 return false;
625}
626
627void wake_up_nohz_cpu(int cpu)
628{
629 if (!wake_up_full_nohz_cpu(cpu))
630 wake_up_idle_cpu(cpu);
631}
632
615static inline bool got_nohz_idle_kick(void) 633static inline bool got_nohz_idle_kick(void)
616{ 634{
617 int cpu = smp_processor_id(); 635 int cpu = smp_processor_id();
618 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 636 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
619} 637}
620 638
621#else /* CONFIG_NO_HZ */ 639#else /* CONFIG_NO_HZ_COMMON */
622 640
623static inline bool got_nohz_idle_kick(void) 641static inline bool got_nohz_idle_kick(void)
624{ 642{
625 return false; 643 return false;
626} 644}
627 645
628#endif /* CONFIG_NO_HZ */ 646#endif /* CONFIG_NO_HZ_COMMON */
647
648#ifdef CONFIG_NO_HZ_FULL
649bool sched_can_stop_tick(void)
650{
651 struct rq *rq;
652
653 rq = this_rq();
654
655 /* Make sure rq->nr_running update is visible after the IPI */
656 smp_rmb();
657
 658 /* More than one running task needs preemption */
659 if (rq->nr_running > 1)
660 return false;
661
662 return true;
663}
664#endif /* CONFIG_NO_HZ_FULL */
629 665
630void sched_avg_update(struct rq *rq) 666void sched_avg_update(struct rq *rq)
631{ 667{
@@ -1357,7 +1393,8 @@ static void sched_ttwu_pending(void)
1357 1393
1358void scheduler_ipi(void) 1394void scheduler_ipi(void)
1359{ 1395{
1360 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 1396 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
1397 && !tick_nohz_full_cpu(smp_processor_id()))
1361 return; 1398 return;
1362 1399
1363 /* 1400 /*
@@ -1374,6 +1411,7 @@ void scheduler_ipi(void)
1374 * somewhat pessimize the simple resched case. 1411 * somewhat pessimize the simple resched case.
1375 */ 1412 */
1376 irq_enter(); 1413 irq_enter();
1414 tick_nohz_full_check();
1377 sched_ttwu_pending(); 1415 sched_ttwu_pending();
1378 1416
1379 /* 1417 /*
@@ -1855,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1855 kprobe_flush_task(prev); 1893 kprobe_flush_task(prev);
1856 put_task_struct(prev); 1894 put_task_struct(prev);
1857 } 1895 }
1896
1897 tick_nohz_task_switch(current);
1858} 1898}
1859 1899
1860#ifdef CONFIG_SMP 1900#ifdef CONFIG_SMP
@@ -2118,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2118 return load >> FSHIFT; 2158 return load >> FSHIFT;
2119} 2159}
2120 2160
2121#ifdef CONFIG_NO_HZ 2161#ifdef CONFIG_NO_HZ_COMMON
2122/* 2162/*
2123 * Handle NO_HZ for the global load-average. 2163 * Handle NO_HZ for the global load-average.
2124 * 2164 *
@@ -2344,12 +2384,12 @@ static void calc_global_nohz(void)
2344 smp_wmb(); 2384 smp_wmb();
2345 calc_load_idx++; 2385 calc_load_idx++;
2346} 2386}
2347#else /* !CONFIG_NO_HZ */ 2387#else /* !CONFIG_NO_HZ_COMMON */
2348 2388
2349static inline long calc_load_fold_idle(void) { return 0; } 2389static inline long calc_load_fold_idle(void) { return 0; }
2350static inline void calc_global_nohz(void) { } 2390static inline void calc_global_nohz(void) { }
2351 2391
2352#endif /* CONFIG_NO_HZ */ 2392#endif /* CONFIG_NO_HZ_COMMON */
2353 2393
2354/* 2394/*
2355 * calc_load - update the avenrun load estimates 10 ticks after the 2395 * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2509,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2509 sched_avg_update(this_rq); 2549 sched_avg_update(this_rq);
2510} 2550}
2511 2551
2512#ifdef CONFIG_NO_HZ 2552#ifdef CONFIG_NO_HZ_COMMON
2513/* 2553/*
2514 * There is no sane way to deal with nohz on smp when using jiffies because the 2554 * There is no sane way to deal with nohz on smp when using jiffies because the
2515 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 2555 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -2569,7 +2609,7 @@ void update_cpu_load_nohz(void)
2569 } 2609 }
2570 raw_spin_unlock(&this_rq->lock); 2610 raw_spin_unlock(&this_rq->lock);
2571} 2611}
2572#endif /* CONFIG_NO_HZ */ 2612#endif /* CONFIG_NO_HZ_COMMON */
2573 2613
2574/* 2614/*
2575 * Called from scheduler_tick() 2615 * Called from scheduler_tick()
@@ -6950,7 +6990,7 @@ void __init sched_init(void)
6950 INIT_LIST_HEAD(&rq->cfs_tasks); 6990 INIT_LIST_HEAD(&rq->cfs_tasks);
6951 6991
6952 rq_attach_root(rq, &def_root_domain); 6992 rq_attach_root(rq, &def_root_domain);
6953#ifdef CONFIG_NO_HZ 6993#ifdef CONFIG_NO_HZ_COMMON
6954 rq->nohz_flags = 0; 6994 rq->nohz_flags = 0;
6955#endif 6995#endif
6956#endif 6996#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8bf7081b1ec5..c61a614465c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5355,7 +5355,7 @@ out_unlock:
5355 return 0; 5355 return 0;
5356} 5356}
5357 5357
5358#ifdef CONFIG_NO_HZ 5358#ifdef CONFIG_NO_HZ_COMMON
5359/* 5359/*
5360 * idle load balancing details 5360 * idle load balancing details
5361 * - When one of the busy CPUs notice that there may be an idle rebalancing 5361 * - When one of the busy CPUs notice that there may be an idle rebalancing
@@ -5572,9 +5572,9 @@ out:
5572 rq->next_balance = next_balance; 5572 rq->next_balance = next_balance;
5573} 5573}
5574 5574
5575#ifdef CONFIG_NO_HZ 5575#ifdef CONFIG_NO_HZ_COMMON
5576/* 5576/*
5577 * In CONFIG_NO_HZ case, the idle balance kickee will do the 5577 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
5578 * rebalancing for all the cpus for whom scheduler ticks are stopped. 5578 * rebalancing for all the cpus for whom scheduler ticks are stopped.
5579 */ 5579 */
5580static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) 5580static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
@@ -5717,7 +5717,7 @@ void trigger_load_balance(struct rq *rq, int cpu)
5717 if (time_after_eq(jiffies, rq->next_balance) && 5717 if (time_after_eq(jiffies, rq->next_balance) &&
5718 likely(!on_null_domain(cpu))) 5718 likely(!on_null_domain(cpu)))
5719 raise_softirq(SCHED_SOFTIRQ); 5719 raise_softirq(SCHED_SOFTIRQ);
5720#ifdef CONFIG_NO_HZ 5720#ifdef CONFIG_NO_HZ_COMMON
5721 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 5721 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
5722 nohz_balancer_kick(cpu); 5722 nohz_balancer_kick(cpu);
5723#endif 5723#endif
@@ -6187,7 +6187,7 @@ __init void init_sched_fair_class(void)
6187#ifdef CONFIG_SMP 6187#ifdef CONFIG_SMP
6188 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 6188 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
6189 6189
6190#ifdef CONFIG_NO_HZ 6190#ifdef CONFIG_NO_HZ_COMMON
6191 nohz.next_balance = jiffies; 6191 nohz.next_balance = jiffies;
6192 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 6192 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
6193 cpu_notifier(sched_ilb_notifier, 0); 6193 cpu_notifier(sched_ilb_notifier, 0);
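The kernel/sched/fair.c hunks keep the existing idle load balancing scheme, now under CONFIG_NO_HZ_COMMON: a busy CPU that notices an imbalance kicks one tickless-idle CPU, and that CPU rebalances on behalf of all CPUs whose tick is stopped. The "pick one idle CPU to kick" step can be modelled in plain userspace C, with a 64-bit mask standing in for nohz.idle_cpus_mask (a sketch, not the kernel's implementation):

#include <stdint.h>
#include <stdio.h>

/* Return the first idle CPU in the mask; it will run the idle load
 * balancing for everyone else. -1 means nobody is idle, so no kick. */
static int find_new_ilb(uint64_t idle_cpus_mask)
{
        for (int cpu = 0; cpu < 64; cpu++)
                if (idle_cpus_mask & (UINT64_C(1) << cpu))
                        return cpu;
        return -1;
}

int main(void)
{
        printf("kick CPU %d\n", find_new_ilb(0xf0));    /* prints "kick CPU 4" */
        return 0;
}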
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 4c225c4c7111..24dc29897749 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,6 +5,7 @@
5#include <linux/mutex.h> 5#include <linux/mutex.h>
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8#include <linux/tick.h>
8 9
9#include "cpupri.h" 10#include "cpupri.h"
10#include "cpuacct.h" 11#include "cpuacct.h"
@@ -405,7 +406,7 @@ struct rq {
405 #define CPU_LOAD_IDX_MAX 5 406 #define CPU_LOAD_IDX_MAX 5
406 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 407 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
407 unsigned long last_load_update_tick; 408 unsigned long last_load_update_tick;
408#ifdef CONFIG_NO_HZ 409#ifdef CONFIG_NO_HZ_COMMON
409 u64 nohz_stamp; 410 u64 nohz_stamp;
410 unsigned long nohz_flags; 411 unsigned long nohz_flags;
411#endif 412#endif
@@ -1072,6 +1073,16 @@ static inline u64 steal_ticks(u64 steal)
1072static inline void inc_nr_running(struct rq *rq) 1073static inline void inc_nr_running(struct rq *rq)
1073{ 1074{
1074 rq->nr_running++; 1075 rq->nr_running++;
1076
1077#ifdef CONFIG_NO_HZ_FULL
1078 if (rq->nr_running == 2) {
1079 if (tick_nohz_full_cpu(rq->cpu)) {
1080 /* Order rq->nr_running write against the IPI */
1081 smp_wmb();
1082 smp_send_reschedule(rq->cpu);
1083 }
1084 }
1085#endif
1075} 1086}
1076 1087
1077static inline void dec_nr_running(struct rq *rq) 1088static inline void dec_nr_running(struct rq *rq)
@@ -1299,7 +1310,7 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1299 1310
1300extern void account_cfs_bandwidth_used(int enabled, int was_enabled); 1311extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1301 1312
1302#ifdef CONFIG_NO_HZ 1313#ifdef CONFIG_NO_HZ_COMMON
1303enum rq_nohz_flag_bits { 1314enum rq_nohz_flag_bits {
1304 NOHZ_TICK_STOPPED, 1315 NOHZ_TICK_STOPPED,
1305 NOHZ_BALANCE_KICK, 1316 NOHZ_BALANCE_KICK,
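The inc_nr_running() hunk above is the scheduler half of full dynticks: a single runnable task can run without the tick, but a second one needs it back for preemption, so the 1 -> 2 transition sends a reschedule IPI to the nohz_full CPU, which then restarts its tick. A minimal userspace model of that policy (helper names are stand-ins; CPU 0 is assumed to be the housekeeping CPU):

#include <stdbool.h>
#include <stdio.h>

struct rq_model { int cpu; int nr_running; };

static bool cpu_is_nohz_full(int cpu)
{
        return cpu != 0;        /* assumption: only CPU 0 keeps housekeeping */
}

/* Returns true when the enqueue must kick the target CPU back to ticking. */
static bool enqueue_needs_ipi(struct rq_model *rq)
{
        rq->nr_running++;
        /* exactly the 1 -> 2 transition: preemption now needs the tick */
        return rq->nr_running == 2 && cpu_is_nohz_full(rq->cpu);
}

int main(void)
{
        struct rq_model rq = { .cpu = 3, .nr_running = 1 };
        printf("%d\n", enqueue_needs_ipi(&rq));         /* prints 1 */
        return 0;
}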
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 14d7758074aa..51a09d56e78b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -329,6 +329,19 @@ static inline void invoke_softirq(void)
329 wakeup_softirqd(); 329 wakeup_softirqd();
330} 330}
331 331
332static inline void tick_irq_exit(void)
333{
334#ifdef CONFIG_NO_HZ_COMMON
335 int cpu = smp_processor_id();
336
337 /* Make sure that timer wheel updates are propagated */
338 if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
339 if (!in_interrupt())
340 tick_nohz_irq_exit();
341 }
342#endif
343}
344
332/* 345/*
333 * Exit an interrupt context. Process softirqs if needed and possible: 346 * Exit an interrupt context. Process softirqs if needed and possible:
334 */ 347 */
@@ -346,11 +359,7 @@ void irq_exit(void)
346 if (!in_interrupt() && local_softirq_pending()) 359 if (!in_interrupt() && local_softirq_pending())
347 invoke_softirq(); 360 invoke_softirq();
348 361
349#ifdef CONFIG_NO_HZ 362 tick_irq_exit();
350 /* Make sure that timer wheel updates are propagated */
351 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
352 tick_nohz_irq_exit();
353#endif
354 rcu_irq_exit(); 363 rcu_irq_exit();
355} 364}
356 365
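The softirq.c change folds the old open-coded test into tick_irq_exit() and widens it: on interrupt exit the tick is re-evaluated not only for a classically idle CPU but also for a busy nohz_full CPU. The decision can be modelled in plain C, with the parameters standing in for idle_cpu(), need_resched(), tick_nohz_full_cpu() and in_interrupt():

#include <stdbool.h>
#include <stdio.h>

static bool reevaluate_tick_on_irq_exit(bool cpu_idle, bool need_resched,
                                        bool nohz_full_cpu, bool in_interrupt)
{
        if (in_interrupt)
                return false;   /* nested interrupt: the outermost exit decides */
        return (cpu_idle && !need_resched) || nohz_full_cpu;
}

int main(void)
{
        /* busy nohz_full CPU: with this patch the tick is re-evaluated here too */
        printf("%d\n", reevaluate_tick_on_irq_exit(false, false, true, false));
        return 0;
}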
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 24510d84efd7..a2ddd650cb92 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -64,20 +64,89 @@ config GENERIC_CMOS_UPDATE
64if GENERIC_CLOCKEVENTS 64if GENERIC_CLOCKEVENTS
65menu "Timers subsystem" 65menu "Timers subsystem"
66 66
67# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is 67# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
68# only related to the tick functionality. Oneshot clockevent devices 68# only related to the tick functionality. Oneshot clockevent devices
69# are supported independent of this. 69# are supported independent of this.
70config TICK_ONESHOT 70config TICK_ONESHOT
71 bool 71 bool
72 72
73config NO_HZ 73config NO_HZ_COMMON
74 bool "Tickless System (Dynamic Ticks)" 74 bool
75 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS 75 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
76 select TICK_ONESHOT 76 select TICK_ONESHOT
77
78choice
79 prompt "Timer tick handling"
80 default NO_HZ_IDLE if NO_HZ
81
82config HZ_PERIODIC
83 bool "Periodic timer ticks (constant rate, no dynticks)"
84 help
85 This option keeps the tick running periodically at a constant
86 rate, even when the CPU doesn't need it.
87
88config NO_HZ_IDLE
89 bool "Idle dynticks system (tickless idle)"
90 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
91 select NO_HZ_COMMON
92 help
93 This option enables a tickless idle system: timer interrupts
94 will only trigger on an as-needed basis when the system is idle.
95 This is usually interesting for energy saving.
96
97 Most of the time you want to say Y here.
98
99config NO_HZ_FULL
100 bool "Full dynticks system (tickless)"
101 # NO_HZ_COMMON dependency
102 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
103 # We need at least one periodic CPU for timekeeping
104 depends on SMP
105 # RCU_USER_QS dependency
106 depends on HAVE_CONTEXT_TRACKING
107 # VIRT_CPU_ACCOUNTING_GEN dependency
108 depends on 64BIT
109 select NO_HZ_COMMON
110 select RCU_USER_QS
111 select RCU_NOCB_CPU
112 select RCU_NOCB_CPU_ALL
113 select VIRT_CPU_ACCOUNTING_GEN
114 select CONTEXT_TRACKING_FORCE
115 select IRQ_WORK
116 help
117 Adaptively try to shut down the tick whenever possible, even when
118 the CPU is running tasks. Typically this requires running a single
119 task on the CPU. Chances for running tickless are maximized when
120 the task mostly runs in userspace and has little kernel activity.
121
122 You need to fill up the nohz_full boot parameter with the
123 desired range of dynticks CPUs.
124
125 This is implemented at the expense of some overhead in user <-> kernel
126 transitions: syscalls, exceptions and interrupts, even when the
127 tick is dynamically off.
128
129 Say N.
130
131endchoice
132
133config NO_HZ_FULL_ALL
134 bool "Full dynticks system on all CPUs by default"
135 depends on NO_HZ_FULL
136 help
137 If the user doesn't pass the nohz_full boot option to
138 define the range of full dynticks CPUs, all CPUs in the
139 system are treated as full dynticks by default.
140 Note the boot CPU will still be kept outside the range to
141 handle the timekeeping duty.
142
143config NO_HZ
144 bool "Old Idle dynticks config"
145 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
77 help 146 help
78 This option enables a tickless system: timer interrupts will 147 This is the old config entry that enables dynticks idle.
79 only trigger on an as-needed basis both when the system is 148 We keep it around for a little while to preserve backward
80 busy and when the system is idle. 149 compatibility with older config files.
81 150
82config HIGH_RES_TIMERS 151config HIGH_RES_TIMERS
83 bool "High Resolution Timer Support" 152 bool "High Resolution Timer Support"
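Taken together, the new choice replaces the single NO_HZ bool: HZ_PERIODIC keeps the tick always running, NO_HZ_IDLE is the previous behaviour (and what old CONFIG_NO_HZ=y configs default to through the compatibility entry), and NO_HZ_FULL extends tick stopping to busy CPUs. As an illustration only (an example 8-CPU machine, not a recommendation), a full dynticks setup ends up with

    CONFIG_NO_HZ_COMMON=y
    CONFIG_NO_HZ_FULL=y
    # plus the selected RCU_USER_QS, RCU_NOCB_CPU, RCU_NOCB_CPU_ALL,
    # VIRT_CPU_ACCOUNTING_GEN, CONTEXT_TRACKING_FORCE and IRQ_WORK options

and is booted with something like nohz_full=1-7, leaving CPU 0 as the housekeeping and timekeeping CPU. NO_HZ_FULL_ALL gives the same default without the boot parameter, still keeping the boot CPU out of the range.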
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 7f32fe0e52cd..40c10502c9e9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -574,7 +574,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
574 bc->event_handler = tick_handle_oneshot_broadcast; 574 bc->event_handler = tick_handle_oneshot_broadcast;
575 575
576 /* Take the do_timer update */ 576 /* Take the do_timer update */
577 tick_do_timer_cpu = cpu; 577 if (!tick_nohz_full_cpu(cpu))
578 tick_do_timer_cpu = cpu;
578 579
579 /* 580 /*
580 * We must be careful here. There might be other CPUs 581 * We must be careful here. There might be other CPUs
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b1600a6973f4..83f2bd967161 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -163,7 +163,10 @@ static void tick_setup_device(struct tick_device *td,
163 * this cpu: 163 * this cpu:
164 */ 164 */
165 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { 165 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
166 tick_do_timer_cpu = cpu; 166 if (!tick_nohz_full_cpu(cpu))
167 tick_do_timer_cpu = cpu;
168 else
169 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
167 tick_next_period = ktime_get(); 170 tick_next_period = ktime_get();
168 tick_period = ktime_set(0, NSEC_PER_SEC / HZ); 171 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
169 } 172 }
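Both the tick-broadcast.c and tick-common.c hunks encode the same rule: a CPU in the nohz_full set must never take the do_timer (jiffies/timekeeping) duty; if it would have been elected, the duty is left unassigned until a housekeeping CPU picks it up. As a tiny model (the TICK_DO_TIMER_NONE value here is a stand-in for the kernel's constant):

#include <stdbool.h>
#include <stdio.h>

#define TICK_DO_TIMER_NONE (-1)         /* stand-in value for the sketch */

static int elect_do_timer_cpu(int cpu, bool cpu_is_nohz_full)
{
        return cpu_is_nohz_full ? TICK_DO_TIMER_NONE : cpu;
}

int main(void)
{
        /* CPU 1 is full dynticks: it defers the duty instead of taking it */
        printf("%d\n", elect_do_timer_cpu(1, true));    /* prints -1 */
        return 0;
}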
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a19a39952c1b..1c9f53b2ddb7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -21,11 +21,15 @@
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/irq_work.h> 23#include <linux/irq_work.h>
24#include <linux/posix-timers.h>
25#include <linux/perf_event.h>
24 26
25#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
26 28
27#include "tick-internal.h" 29#include "tick-internal.h"
28 30
31#include <trace/events/timer.h>
32
29/* 33/*
30 * Per cpu nohz control structure 34 * Per cpu nohz control structure
31 */ 35 */
@@ -104,7 +108,7 @@ static void tick_sched_do_timer(ktime_t now)
104{ 108{
105 int cpu = smp_processor_id(); 109 int cpu = smp_processor_id();
106 110
107#ifdef CONFIG_NO_HZ 111#ifdef CONFIG_NO_HZ_COMMON
108 /* 112 /*
109 * Check if the do_timer duty was dropped. We don't care about 113 * Check if the do_timer duty was dropped. We don't care about
110 * concurrency: This happens only when the cpu in charge went 114 * concurrency: This happens only when the cpu in charge went
@@ -112,7 +116,8 @@ static void tick_sched_do_timer(ktime_t now)
112 * this duty, then the jiffies update is still serialized by 116 * this duty, then the jiffies update is still serialized by
113 * jiffies_lock. 117 * jiffies_lock.
114 */ 118 */
115 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) 119 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
120 && !tick_nohz_full_cpu(cpu))
116 tick_do_timer_cpu = cpu; 121 tick_do_timer_cpu = cpu;
117#endif 122#endif
118 123
@@ -123,7 +128,7 @@ static void tick_sched_do_timer(ktime_t now)
123 128
124static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) 129static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
125{ 130{
126#ifdef CONFIG_NO_HZ 131#ifdef CONFIG_NO_HZ_COMMON
127 /* 132 /*
128 * When we are idle and the tick is stopped, we have to touch 133 * When we are idle and the tick is stopped, we have to touch
129 * the watchdog as we might not schedule for a really long 134 * the watchdog as we might not schedule for a really long
@@ -142,10 +147,226 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
142 profile_tick(CPU_PROFILING); 147 profile_tick(CPU_PROFILING);
143} 148}
144 149
150#ifdef CONFIG_NO_HZ_FULL
151static cpumask_var_t nohz_full_mask;
152bool have_nohz_full_mask;
153
154static bool can_stop_full_tick(void)
155{
156 WARN_ON_ONCE(!irqs_disabled());
157
158 if (!sched_can_stop_tick()) {
159 trace_tick_stop(0, "more than 1 task in runqueue\n");
160 return false;
161 }
162
163 if (!posix_cpu_timers_can_stop_tick(current)) {
164 trace_tick_stop(0, "posix timers running\n");
165 return false;
166 }
167
168 if (!perf_event_can_stop_tick()) {
169 trace_tick_stop(0, "perf events running\n");
170 return false;
171 }
172
173 /* sched_clock_tick() needs us? */
174#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
175 /*
176 * TODO: kick full dynticks CPUs when
177 * sched_clock_stable is set.
178 */
179 if (!sched_clock_stable) {
180 trace_tick_stop(0, "unstable sched clock\n");
181 return false;
182 }
183#endif
184
185 return true;
186}
187
188static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
189
190/*
191 * Re-evaluate the need for the tick on the current CPU
192 * and restart it if necessary.
193 */
194void tick_nohz_full_check(void)
195{
196 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
197
198 if (tick_nohz_full_cpu(smp_processor_id())) {
199 if (ts->tick_stopped && !is_idle_task(current)) {
200 if (!can_stop_full_tick())
201 tick_nohz_restart_sched_tick(ts, ktime_get());
202 }
203 }
204}
205
206static void nohz_full_kick_work_func(struct irq_work *work)
207{
208 tick_nohz_full_check();
209}
210
211static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
212 .func = nohz_full_kick_work_func,
213};
214
215/*
216 * Kick the current CPU if it's full dynticks in order to force it to
217 * re-evaluate its dependency on the tick and restart it if necessary.
218 */
219void tick_nohz_full_kick(void)
220{
221 if (tick_nohz_full_cpu(smp_processor_id()))
222 irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
223}
224
225static void nohz_full_kick_ipi(void *info)
226{
227 tick_nohz_full_check();
228}
229
230/*
231 * Kick all full dynticks CPUs in order to force these to re-evaluate
232 * their dependency on the tick and restart it if necessary.
233 */
234void tick_nohz_full_kick_all(void)
235{
236 if (!have_nohz_full_mask)
237 return;
238
239 preempt_disable();
240 smp_call_function_many(nohz_full_mask,
241 nohz_full_kick_ipi, NULL, false);
242 preempt_enable();
243}
244
245/*
246 * Re-evaluate the need for the tick as we switch the current task.
247 * It might need the tick due to per task/process properties:
248 * perf events, posix cpu timers, ...
249 */
250void tick_nohz_task_switch(struct task_struct *tsk)
251{
252 unsigned long flags;
253
254 local_irq_save(flags);
255
256 if (!tick_nohz_full_cpu(smp_processor_id()))
257 goto out;
258
259 if (tick_nohz_tick_stopped() && !can_stop_full_tick())
260 tick_nohz_full_kick();
261
262out:
263 local_irq_restore(flags);
264}
265
266int tick_nohz_full_cpu(int cpu)
267{
268 if (!have_nohz_full_mask)
269 return 0;
270
271 return cpumask_test_cpu(cpu, nohz_full_mask);
272}
273
274/* Parse the boot-time nohz CPU list from the kernel parameters. */
275static int __init tick_nohz_full_setup(char *str)
276{
277 int cpu;
278
279 alloc_bootmem_cpumask_var(&nohz_full_mask);
280 if (cpulist_parse(str, nohz_full_mask) < 0) {
281 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
282 return 1;
283 }
284
285 cpu = smp_processor_id();
286 if (cpumask_test_cpu(cpu, nohz_full_mask)) {
287 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
288 cpumask_clear_cpu(cpu, nohz_full_mask);
289 }
290 have_nohz_full_mask = true;
291
292 return 1;
293}
294__setup("nohz_full=", tick_nohz_full_setup);
295
296static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
297 unsigned long action,
298 void *hcpu)
299{
300 unsigned int cpu = (unsigned long)hcpu;
301
302 switch (action & ~CPU_TASKS_FROZEN) {
303 case CPU_DOWN_PREPARE:
304 /*
305 * If we handle the timekeeping duty for full dynticks CPUs,
306 * we can't safely shutdown that CPU.
307 */
308 if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
309 return -EINVAL;
310 break;
311 }
312 return NOTIFY_OK;
313}
314
315/*
316 * Worst case string length in chunks of CPU range seems 2 steps
317 * separations: 0,2,4,6,...
318 * This is NR_CPUS + sizeof('\0')
319 */
320static char __initdata nohz_full_buf[NR_CPUS + 1];
321
322static int tick_nohz_init_all(void)
323{
324 int err = -1;
325
326#ifdef CONFIG_NO_HZ_FULL_ALL
327 if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) {
328 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
329 return err;
330 }
331 err = 0;
332 cpumask_setall(nohz_full_mask);
333 cpumask_clear_cpu(smp_processor_id(), nohz_full_mask);
334 have_nohz_full_mask = true;
335#endif
336 return err;
337}
338
339void __init tick_nohz_init(void)
340{
341 int cpu;
342
343 if (!have_nohz_full_mask) {
344 if (tick_nohz_init_all() < 0)
345 return;
346 }
347
348 cpu_notifier(tick_nohz_cpu_down_callback, 0);
349
350 /* Make sure full dynticks CPUs are also RCU nocbs */
351 for_each_cpu(cpu, nohz_full_mask) {
352 if (!rcu_is_nocb_cpu(cpu)) {
353 pr_warning("NO_HZ: CPU %d is not RCU nocb: "
354 "cleared from nohz_full range", cpu);
355 cpumask_clear_cpu(cpu, nohz_full_mask);
356 }
357 }
358
359 cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
360 pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
361}
362#else
363#define have_nohz_full_mask (0)
364#endif
365
145/* 366/*
146 * NOHZ - aka dynamic tick functionality 367 * NOHZ - aka dynamic tick functionality
147 */ 368 */
148#ifdef CONFIG_NO_HZ 369#ifdef CONFIG_NO_HZ_COMMON
149/* 370/*
150 * NO HZ enabled ? 371 * NO HZ enabled ?
151 */ 372 */
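The new CONFIG_NO_HZ_FULL block in the hunk above is the core of the machinery: can_stop_full_tick() enumerates the remaining tick dependencies (more than one runnable task, armed posix CPU timers, active perf events, an unstable sched_clock), and the irq_work/IPI kick paths force a CPU to re-check them and restart its tick as soon as one reappears. The veto logic, as a self-contained model with illustrative field names:

#include <stdbool.h>
#include <stdio.h>

struct tick_deps {
        bool more_than_one_task;        /* scheduler still needs preemption */
        bool posix_cpu_timers;          /* per-task CPU timers are armed */
        bool perf_events;               /* perf needs the tick */
        bool sched_clock_unstable;      /* sched_clock_tick() still required */
};

/* The tick may be stopped only if every dependency is absent. */
static bool can_stop_full_tick_model(const struct tick_deps *d)
{
        return !d->more_than_one_task && !d->posix_cpu_timers &&
               !d->perf_events && !d->sched_clock_unstable;
}

int main(void)
{
        struct tick_deps d = { .perf_events = true };
        printf("%d\n", can_stop_full_tick_model(&d));   /* prints 0 */
        return 0;
}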
@@ -345,11 +566,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
345 delta_jiffies = rcu_delta_jiffies; 566 delta_jiffies = rcu_delta_jiffies;
346 } 567 }
347 } 568 }
569
348 /* 570 /*
349 * Do not stop the tick, if we are only one off 571 * Do not stop the tick, if we are only one off (or less)
350 * or if the cpu is required for rcu 572 * or if the cpu is required for RCU:
351 */ 573 */
352 if (!ts->tick_stopped && delta_jiffies == 1) 574 if (!ts->tick_stopped && delta_jiffies <= 1)
353 goto out; 575 goto out;
354 576
355 /* Schedule the tick, if we are at least one jiffie off */ 577 /* Schedule the tick, if we are at least one jiffie off */
@@ -421,6 +643,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
421 643
422 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 644 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
423 ts->tick_stopped = 1; 645 ts->tick_stopped = 1;
646 trace_tick_stop(1, " ");
424 } 647 }
425 648
426 /* 649 /*
@@ -457,6 +680,24 @@ out:
457 return ret; 680 return ret;
458} 681}
459 682
683static void tick_nohz_full_stop_tick(struct tick_sched *ts)
684{
685#ifdef CONFIG_NO_HZ_FULL
686 int cpu = smp_processor_id();
687
688 if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
689 return;
690
691 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
692 return;
693
694 if (!can_stop_full_tick())
695 return;
696
697 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
698#endif
699}
700
460static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) 701static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
461{ 702{
462 /* 703 /*
@@ -489,6 +730,21 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
489 return false; 730 return false;
490 } 731 }
491 732
733 if (have_nohz_full_mask) {
734 /*
735 * Keep the tick alive to guarantee timekeeping progression
736 * if there are full dynticks CPUs around
737 */
738 if (tick_do_timer_cpu == cpu)
739 return false;
740 /*
741 * Boot safety: make sure the timekeeping duty has been
742 * assigned before entering dyntick-idle mode,
743 */
744 if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
745 return false;
746 }
747
492 return true; 748 return true;
493} 749}
494 750
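The checks added to can_stop_idle_tick() above protect timekeeping once full dynticks CPUs exist: the CPU holding the do_timer duty keeps ticking on behalf of everyone, and no CPU goes tickless-idle before that duty has been assigned at boot. Modelled standalone (again with a stand-in TICK_DO_TIMER_NONE value):

#include <stdbool.h>
#include <stdio.h>

#define TICK_DO_TIMER_NONE (-1)         /* stand-in value for the sketch */

static bool idle_tick_may_stop(int cpu, int tick_do_timer_cpu,
                               bool have_nohz_full_mask)
{
        if (!have_nohz_full_mask)
                return true;            /* no full dynticks CPUs: old behaviour */
        if (tick_do_timer_cpu == cpu)
                return false;           /* we carry the timekeeping duty */
        if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
                return false;           /* boot safety: duty not assigned yet */
        return true;
}

int main(void)
{
        /* CPU 0 holds the do_timer duty here, so it must keep its tick */
        printf("%d\n", idle_tick_may_stop(0, 0, true)); /* prints 0 */
        return 0;
}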
@@ -568,12 +824,13 @@ void tick_nohz_irq_exit(void)
568{ 824{
569 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 825 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
570 826
571 if (!ts->inidle) 827 if (ts->inidle) {
572 return; 828 /* Cancel the timer because the CPU has already woken up from the C-states */
573 829 menu_hrtimer_cancel();
574 /* Cancel the timer because the CPU has already woken up from the C-states */ 830 __tick_nohz_idle_enter(ts);
575 menu_hrtimer_cancel(); 831 } else {
576 __tick_nohz_idle_enter(ts); 832 tick_nohz_full_stop_tick(ts);
833 }
577} 834}
578 835
579/** 836/**
@@ -802,7 +1059,7 @@ static inline void tick_check_nohz(int cpu)
802static inline void tick_nohz_switch_to_nohz(void) { } 1059static inline void tick_nohz_switch_to_nohz(void) { }
803static inline void tick_check_nohz(int cpu) { } 1060static inline void tick_check_nohz(int cpu) { }
804 1061
805#endif /* NO_HZ */ 1062#endif /* CONFIG_NO_HZ_COMMON */
806 1063
807/* 1064/*
808 * Called from irq_enter to notify about the possible interruption of idle() 1065 * Called from irq_enter to notify about the possible interruption of idle()
@@ -887,14 +1144,14 @@ void tick_setup_sched_timer(void)
887 now = ktime_get(); 1144 now = ktime_get();
888 } 1145 }
889 1146
890#ifdef CONFIG_NO_HZ 1147#ifdef CONFIG_NO_HZ_COMMON
891 if (tick_nohz_enabled) 1148 if (tick_nohz_enabled)
892 ts->nohz_mode = NOHZ_MODE_HIGHRES; 1149 ts->nohz_mode = NOHZ_MODE_HIGHRES;
893#endif 1150#endif
894} 1151}
895#endif /* HIGH_RES_TIMERS */ 1152#endif /* HIGH_RES_TIMERS */
896 1153
897#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS 1154#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
898void tick_cancel_sched_timer(int cpu) 1155void tick_cancel_sched_timer(int cpu)
899{ 1156{
900 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 1157 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
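For reference, tick_nohz_full_setup() earlier in this file's diff hands the nohz_full= string to cpulist_parse(), whose syntax is comma-separated CPU numbers and ranges. A toy userspace parser for the same format (not the kernel's implementation, capped at 64 CPUs for the sketch) shows what the boot parameter expands to:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t parse_cpulist(const char *str)
{
        uint64_t mask = 0;
        char buf[128];

        strncpy(buf, str, sizeof(buf) - 1);
        buf[sizeof(buf) - 1] = '\0';

        for (char *tok = strtok(buf, ","); tok; tok = strtok(NULL, ",")) {
                int lo, hi;

                if (sscanf(tok, "%d-%d", &lo, &hi) < 2)
                        hi = lo;                /* single CPU, not a range */
                for (int cpu = lo; cpu <= hi; cpu++)
                        mask |= UINT64_C(1) << cpu;
        }
        return mask;
}

int main(void)
{
        /* "1-3,5" -> CPUs 1,2,3,5 -> 0x2e */
        printf("%#llx\n", (unsigned long long)parse_cpulist("1-3,5"));
        return 0;
}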
diff --git a/kernel/timer.c b/kernel/timer.c
index dbf7a78a1ef1..1b7489fdea41 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -738,7 +738,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
738 738
739 cpu = smp_processor_id(); 739 cpu = smp_processor_id();
740 740
741#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 741#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
742 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) 742 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
743 cpu = get_nohz_timer_target(); 743 cpu = get_nohz_timer_target();
744#endif 744#endif
@@ -930,14 +930,14 @@ void add_timer_on(struct timer_list *timer, int cpu)
930 debug_activate(timer, timer->expires); 930 debug_activate(timer, timer->expires);
931 internal_add_timer(base, timer); 931 internal_add_timer(base, timer);
932 /* 932 /*
933 * Check whether the other CPU is idle and needs to be 933 * Check whether the other CPU is in dynticks mode and needs
934 * triggered to reevaluate the timer wheel when nohz is 934 * to be triggered to reevaluate the timer wheel.
935 * active. We are protected against the other CPU fiddling 935 * We are protected against the other CPU fiddling
936 * with the timer by holding the timer base lock. This also 936 * with the timer by holding the timer base lock. This also
937 * makes sure that a CPU on the way to idle can not evaluate 937 * makes sure that a CPU on the way to stop its tick can not
938 * the timer wheel. 938 * evaluate the timer wheel.
939 */ 939 */
940 wake_up_idle_cpu(cpu); 940 wake_up_nohz_cpu(cpu);
941 spin_unlock_irqrestore(&base->lock, flags); 941 spin_unlock_irqrestore(&base->lock, flags);
942} 942}
943EXPORT_SYMBOL_GPL(add_timer_on); 943EXPORT_SYMBOL_GPL(add_timer_on);
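The kernel/timer.c hunks adapt timer placement and remote wakeups to dynticks: an unpinned timer armed from an idle CPU may be migrated to a busy CPU via get_nohz_timer_target(), and add_timer_on() now calls wake_up_nohz_cpu() so a stopped-tick target re-evaluates its timer wheel. The placement policy, as a small model with stand-in parameters:

#include <stdbool.h>
#include <stdio.h>

static int pick_timer_cpu(int this_cpu, bool pinned, bool timer_migration,
                          bool this_cpu_idle, int busy_cpu)
{
        if (!pinned && timer_migration && this_cpu_idle)
                return busy_cpu;        /* kernel: get_nohz_timer_target() */
        return this_cpu;                /* otherwise the timer stays local */
}

int main(void)
{
        /* an unpinned timer armed from idle CPU 2 lands on busy CPU 0 */
        printf("%d\n", pick_timer_cpu(2, false, true, true, 0));  /* prints 0 */
        return 0;
}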
@@ -1188,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base)
1188 spin_unlock_irq(&base->lock); 1188 spin_unlock_irq(&base->lock);
1189} 1189}
1190 1190
1191#ifdef CONFIG_NO_HZ 1191#ifdef CONFIG_NO_HZ_COMMON
1192/* 1192/*
1193 * Find out when the next timer event is due to happen. This 1193 * Find out when the next timer event is due to happen. This
1194 * is used on S/390 to stop all activity when a CPU is idle. 1194 * is used on S/390 to stop all activity when a CPU is idle.