diff options
-rw-r--r-- | Documentation/kernel-per-CPU-kthreads.txt | 202 | ||||
-rw-r--r-- | arch/Kconfig | 3 | ||||
-rw-r--r-- | kernel/cpu/idle.c | 2 |
3 files changed, 207 insertions, 0 deletions
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt new file mode 100644 index 000000000000..cbf7ae412da4 --- /dev/null +++ b/Documentation/kernel-per-CPU-kthreads.txt | |||
@@ -0,0 +1,202 @@ | |||
1 | REDUCING OS JITTER DUE TO PER-CPU KTHREADS | ||
2 | |||
3 | This document lists per-CPU kthreads in the Linux kernel and presents | ||
4 | options to control their OS jitter. Note that non-per-CPU kthreads are | ||
5 | not listed here. To reduce OS jitter from non-per-CPU kthreads, bind | ||
6 | them to a "housekeeping" CPU dedicated to such work. | ||
7 | |||
8 | |||
9 | REFERENCES | ||
10 | |||
11 | o Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs. | ||
12 | |||
13 | o Documentation/cgroups: Using cgroups to bind tasks to sets of CPUs. | ||
14 | |||
15 | o man taskset: Using the taskset command to bind tasks to sets | ||
16 | of CPUs. | ||
17 | |||
18 | o man sched_setaffinity: Using the sched_setaffinity() system | ||
19 | call to bind tasks to sets of CPUs. | ||
20 | |||
21 | o /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state, | ||
22 | writing "0" to offline and "1" to online. | ||
23 | |||
24 | o In order to locate kernel-generated OS jitter on CPU N: | ||
25 | |||
26 | cd /sys/kernel/debug/tracing | ||
27 | echo 1 > max_graph_depth # Increase the "1" for more detail | ||
28 | echo function_graph > current_tracer | ||
29 | # run workload | ||
30 | cat per_cpu/cpuN/trace | ||
31 | |||
32 | |||
33 | KTHREADS | ||
34 | |||
35 | Name: ehca_comp/%u | ||
36 | Purpose: Periodically process Infiniband-related work. | ||
37 | To reduce its OS jitter, do any of the following: | ||
38 | 1. Don't use eHCA Infiniband hardware, instead choosing hardware | ||
39 | that does not require per-CPU kthreads. This will prevent these | ||
40 | kthreads from being created in the first place. (This will | ||
41 | work for most people, as this hardware, though important, is | ||
42 | relatively old and is produced in relatively low unit volumes.) | ||
43 | 2. Do all eHCA-Infiniband-related work on other CPUs, including | ||
44 | interrupts. | ||
45 | 3. Rework the eHCA driver so that its per-CPU kthreads are | ||
46 | provisioned only on selected CPUs. | ||
47 | |||
48 | |||
49 | Name: irq/%d-%s | ||
50 | Purpose: Handle threaded interrupts. | ||
51 | To reduce its OS jitter, do the following: | ||
52 | 1. Use irq affinity to force the irq threads to execute on | ||
53 | some other CPU. | ||
54 | |||
55 | Name: kcmtpd_ctr_%d | ||
56 | Purpose: Handle Bluetooth work. | ||
57 | To reduce its OS jitter, do one of the following: | ||
58 | 1. Don't use Bluetooth, in which case these kthreads won't be | ||
59 | created in the first place. | ||
60 | 2. Use irq affinity to force Bluetooth-related interrupts to | ||
61 | occur on some other CPU and furthermore initiate all | ||
62 | Bluetooth activity on some other CPU. | ||
63 | |||
64 | Name: ksoftirqd/%u | ||
65 | Purpose: Execute softirq handlers when threaded or when under heavy load. | ||
66 | To reduce its OS jitter, each softirq vector must be handled | ||
67 | separately as follows: | ||
68 | TIMER_SOFTIRQ: Do all of the following: | ||
69 | 1. To the extent possible, keep the CPU out of the kernel when it | ||
70 | is non-idle, for example, by avoiding system calls and by forcing | ||
71 | both kernel threads and interrupts to execute elsewhere. | ||
72 | 2. Build with CONFIG_HOTPLUG_CPU=y. After boot completes, force | ||
73 | the CPU offline, then bring it back online. This forces | ||
74 | recurring timers to migrate elsewhere. If you are concerned | ||
75 | with multiple CPUs, force them all offline before bringing the | ||
76 | first one back online. Once you have onlined the CPUs in question, | ||
77 | do not offline any other CPUs, because doing so could force the | ||
78 | timer back onto one of the CPUs in question. | ||
79 | NET_TX_SOFTIRQ and NET_RX_SOFTIRQ: Do all of the following: | ||
80 | 1. Force networking interrupts onto other CPUs. | ||
81 | 2. Initiate any network I/O on other CPUs. | ||
82 | 3. Once your application has started, prevent CPU-hotplug operations | ||
83 | from being initiated from tasks that might run on the CPU to | ||
84 | be de-jittered. (It is OK to force this CPU offline and then | ||
85 | bring it back online before you start your application.) | ||
86 | BLOCK_SOFTIRQ: Do all of the following: | ||
87 | 1. Force block-device interrupts onto some other CPU. | ||
88 | 2. Initiate any block I/O on other CPUs. | ||
89 | 3. Once your application has started, prevent CPU-hotplug operations | ||
90 | from being initiated from tasks that might run on the CPU to | ||
91 | be de-jittered. (It is OK to force this CPU offline and then | ||
92 | bring it back online before you start your application.) | ||
93 | BLOCK_IOPOLL_SOFTIRQ: Do all of the following: | ||
94 | 1. Force block-device interrupts onto some other CPU. | ||
95 | 2. Initiate any block I/O and block-I/O polling on other CPUs. | ||
96 | 3. Once your application has started, prevent CPU-hotplug operations | ||
97 | from being initiated from tasks that might run on the CPU to | ||
98 | be de-jittered. (It is OK to force this CPU offline and then | ||
99 | bring it back online before you start your application.) | ||
100 | TASKLET_SOFTIRQ: Do one or more of the following: | ||
101 | 1. Avoid use of drivers that use tasklets. (Such drivers will contain | ||
102 | calls to things like tasklet_schedule().) | ||
103 | 2. Convert all drivers that you must use from tasklets to workqueues. | ||
104 | 3. Force interrupts for drivers using tasklets onto other CPUs, | ||
105 | and also do I/O involving these drivers on other CPUs. | ||
106 | SCHED_SOFTIRQ: Do all of the following: | ||
107 | 1. Avoid sending scheduler IPIs to the CPU to be de-jittered, | ||
108 | for example, ensure that at most one runnable kthread is present | ||
109 | on that CPU. If a thread that expects to run on the de-jittered | ||
110 | CPU awakens, the scheduler will send an IPI that can result in | ||
111 | a subsequent SCHED_SOFTIRQ. | ||
112 | 2. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y, | ||
113 | CONFIG_NO_HZ_FULL=y, and, in addition, ensure that the CPU | ||
114 | to be de-jittered is marked as an adaptive-ticks CPU using the | ||
115 | "nohz_full=" boot parameter. This reduces the number of | ||
116 | scheduler-clock interrupts that the de-jittered CPU receives, | ||
117 | minimizing its chances of being selected to do the load balancing | ||
118 | work that runs in SCHED_SOFTIRQ context. | ||
119 | 3. To the extent possible, keep the CPU out of the kernel when it | ||
120 | is non-idle, for example, by avoiding system calls and by | ||
121 | forcing both kernel threads and interrupts to execute elsewhere. | ||
122 | This further reduces the number of scheduler-clock interrupts | ||
123 | received by the de-jittered CPU. | ||
124 | HRTIMER_SOFTIRQ: Do all of the following: | ||
125 | 1. To the extent possible, keep the CPU out of the kernel when it | ||
126 | is non-idle. For example, avoid system calls and force both | ||
127 | kernel threads and interrupts to execute elsewhere. | ||
128 | 2. Build with CONFIG_HOTPLUG_CPU=y. Once boot completes, force the | ||
129 | CPU offline, then bring it back online. This forces recurring | ||
130 | timers to migrate elsewhere. If you are concerned with multiple | ||
131 | CPUs, force them all offline before bringing the first one | ||
132 | back online. Once you have onlined the CPUs in question, do not | ||
133 | offline any other CPUs, because doing so could force the timer | ||
134 | back onto one of the CPUs in question. | ||
135 | RCU_SOFTIRQ: Do at least one of the following: | ||
136 | 1. Offload callbacks and keep the CPU in either dyntick-idle or | ||
137 | adaptive-ticks state by doing all of the following: | ||
138 | a. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y, | ||
139 | CONFIG_NO_HZ_FULL=y, and, in addition ensure that the CPU | ||
140 | to be de-jittered is marked as an adaptive-ticks CPU using | ||
141 | the "nohz_full=" boot parameter. Bind the rcuo kthreads | ||
142 | to housekeeping CPUs, which can tolerate OS jitter. | ||
143 | b. To the extent possible, keep the CPU out of the kernel | ||
144 | when it is non-idle, for example, by avoiding system | ||
145 | calls and by forcing both kernel threads and interrupts | ||
146 | to execute elsewhere. | ||
147 | 2. Enable RCU to do its processing remotely via dyntick-idle by | ||
148 | doing all of the following: | ||
149 | a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y. | ||
150 | b. Ensure that the CPU goes idle frequently, allowing other | ||
151 | CPUs to detect that it has passed through an RCU quiescent | ||
152 | state. If the kernel is built with CONFIG_NO_HZ_FULL=y, | ||
153 | userspace execution also allows other CPUs to detect that | ||
154 | the CPU in question has passed through a quiescent state. | ||
155 | c. To the extent possible, keep the CPU out of the kernel | ||
156 | when it is non-idle, for example, by avoiding system | ||
157 | calls and by forcing both kernel threads and interrupts | ||
158 | to execute elsewhere. | ||
159 | |||
160 | Name: rcuc/%u | ||
161 | Purpose: Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels. | ||
162 | To reduce its OS jitter, do at least one of the following: | ||
163 | 1. Build the kernel with CONFIG_PREEMPT=n. This prevents these | ||
164 | kthreads from being created in the first place, and also obviates | ||
165 | the need for RCU priority boosting. This approach is feasible | ||
166 | for workloads that do not require high degrees of responsiveness. | ||
167 | 2. Build the kernel with CONFIG_RCU_BOOST=n. This prevents these | ||
168 | kthreads from being created in the first place. This approach | ||
169 | is feasible only if your workload never requires RCU priority | ||
170 | boosting, for example, if you ensure frequent idle time on all | ||
171 | CPUs that might execute within the kernel. | ||
172 | 3. Build with CONFIG_RCU_NOCB_CPU=y and CONFIG_RCU_NOCB_CPU_ALL=y, | ||
173 | which offloads all RCU callbacks to kthreads that can be moved | ||
174 | off of CPUs susceptible to OS jitter. This approach prevents the | ||
175 | rcuc/%u kthreads from having any work to do, so that they are | ||
176 | never awakened. | ||
177 | 4. Ensure that the CPU never enters the kernel, and, in particular, | ||
178 | avoid initiating any CPU hotplug operations on this CPU. This is | ||
179 | another way of preventing any callbacks from being queued on the | ||
180 | CPU, again preventing the rcuc/%u kthreads from having any work | ||
181 | to do. | ||
182 | |||
183 | Name: rcuob/%d, rcuop/%d, and rcuos/%d | ||
184 | Purpose: Offload RCU callbacks from the corresponding CPU. | ||
185 | To reduce its OS jitter, do at least one of the following: | ||
186 | 1. Use affinity, cgroups, or other mechanism to force these kthreads | ||
187 | to execute on some other CPU. | ||
188 | 2. Build with CONFIG_RCU_NOCB_CPUS=n, which will prevent these | ||
189 | kthreads from being created in the first place. However, please | ||
190 | note that this will not eliminate OS jitter, but will instead | ||
191 | shift it to RCU_SOFTIRQ. | ||
192 | |||
193 | Name: watchdog/%u | ||
194 | Purpose: Detect software lockups on each CPU. | ||
195 | To reduce its OS jitter, do at least one of the following: | ||
196 | 1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these | ||
197 | kthreads from being created in the first place. | ||
198 | 2. Echo a zero to /proc/sys/kernel/watchdog to disable the | ||
199 | watchdog timer. | ||
200 | 3. Echo a large number of /proc/sys/kernel/watchdog_thresh in | ||
201 | order to reduce the frequency of OS jitter due to the watchdog | ||
202 | timer down to a level that is acceptable for your workload. | ||
diff --git a/arch/Kconfig b/arch/Kconfig index dd0e8eb8042f..a4429bcd609e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig | |||
@@ -213,6 +213,9 @@ config USE_GENERIC_SMP_HELPERS | |||
213 | config GENERIC_SMP_IDLE_THREAD | 213 | config GENERIC_SMP_IDLE_THREAD |
214 | bool | 214 | bool |
215 | 215 | ||
216 | config GENERIC_IDLE_POLL_SETUP | ||
217 | bool | ||
218 | |||
216 | # Select if arch init_task initializer is different to init/init_task.c | 219 | # Select if arch init_task initializer is different to init/init_task.c |
217 | config ARCH_INIT_TASK | 220 | config ARCH_INIT_TASK |
218 | bool | 221 | bool |
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 8b86c0c68edf..d5585f5e038e 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c | |||
@@ -40,11 +40,13 @@ __setup("hlt", cpu_idle_nopoll_setup); | |||
40 | 40 | ||
41 | static inline int cpu_idle_poll(void) | 41 | static inline int cpu_idle_poll(void) |
42 | { | 42 | { |
43 | rcu_idle_enter(); | ||
43 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 44 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
44 | local_irq_enable(); | 45 | local_irq_enable(); |
45 | while (!need_resched()) | 46 | while (!need_resched()) |
46 | cpu_relax(); | 47 | cpu_relax(); |
47 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | 48 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
49 | rcu_idle_exit(); | ||
48 | return 1; | 50 | return 1; |
49 | } | 51 | } |
50 | 52 | ||