-rw-r--r--  Documentation/RCU/checklist.txt            |  26
-rw-r--r--  Documentation/RCU/lockdep.txt              |   5
-rw-r--r--  Documentation/RCU/rcubarrier.txt           |  15
-rw-r--r--  Documentation/RCU/stallwarn.txt            |  33
-rw-r--r--  Documentation/RCU/whatisRCU.txt            |   4
-rw-r--r--  Documentation/kernel-parameters.txt        |  35
-rw-r--r--  Documentation/kernel-per-CPU-kthreads.txt  | 202
-rw-r--r--  include/linux/list_bl.h                    |   5
-rw-r--r--  include/linux/rculist_bl.h                 |   2
-rw-r--r--  include/linux/rcupdate.h                   |   1
-rw-r--r--  include/trace/events/rcu.h                 |  55
-rw-r--r--  init/Kconfig                               |  73
-rw-r--r--  kernel/rcutree.c                           | 260
-rw-r--r--  kernel/rcutree.h                           |  41
-rw-r--r--  kernel/rcutree_plugin.h                    | 601
-rw-r--r--  kernel/rcutree_trace.c                     |   2
16 files changed, 842 insertions, 518 deletions
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 31ef8fe07f82..79e789b8b8ea 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -217,9 +217,14 @@ over a rather long period of time, but improvements are always welcome! | |||
217 | whether the increased speed is worth it. | 217 | whether the increased speed is worth it. |
218 | 218 | ||
219 | 8. Although synchronize_rcu() is slower than is call_rcu(), it | 219 | 8. Although synchronize_rcu() is slower than is call_rcu(), it |
220 | usually results in simpler code. So, unless update performance | 220 | usually results in simpler code. So, unless update performance is |
221 | is critically important or the updaters cannot block, | 221 | critically important, the updaters cannot block, or the latency of |
222 | synchronize_rcu() should be used in preference to call_rcu(). | 222 | synchronize_rcu() is visible from userspace, synchronize_rcu() |
223 | should be used in preference to call_rcu(). Furthermore, | ||
224 | kfree_rcu() usually results in even simpler code than does | ||
225 | synchronize_rcu() without synchronize_rcu()'s multi-millisecond | ||
226 | latency. So please take advantage of kfree_rcu()'s "fire and | ||
227 | forget" memory-freeing capabilities where it applies. | ||
223 | 228 | ||
224 | An especially important property of the synchronize_rcu() | 229 | An especially important property of the synchronize_rcu() |
225 | primitive is that it automatically self-limits: if grace periods | 230 | primitive is that it automatically self-limits: if grace periods |
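To make the kfree_rcu() guidance in the hunk above concrete, here is a minimal sketch; struct foo, its rcu field, and the helper functions are made-up names for illustration, not part of the patch:

        #include <linux/rcupdate.h>
        #include <linux/slab.h>

        struct foo {
                int a;
                struct rcu_head rcu;
        };

        /* Open-coded form: the callback does nothing but free the structure. */
        static void foo_reclaim(struct rcu_head *rhp)
        {
                kfree(container_of(rhp, struct foo, rcu));
        }

        static void foo_release_call_rcu(struct foo *fp)
        {
                call_rcu(&fp->rcu, foo_reclaim);
        }

        /* Equivalent "fire and forget" form: no callback function needed. */
        static void foo_release_kfree_rcu(struct foo *fp)
        {
                kfree_rcu(fp, rcu);
        }

Either form defers the kfree() until after a grace period; kfree_rcu() simply removes the boilerplate when freeing is all the callback would do.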
@@ -268,7 +273,8 @@ over a rather long period of time, but improvements are always welcome! | |||
268 | e. Periodically invoke synchronize_rcu(), permitting a limited | 273 | e. Periodically invoke synchronize_rcu(), permitting a limited |
269 | number of updates per grace period. | 274 | number of updates per grace period. |
270 | 275 | ||
271 | The same cautions apply to call_rcu_bh() and call_rcu_sched(). | 276 | The same cautions apply to call_rcu_bh(), call_rcu_sched(), |
277 | call_srcu(), and kfree_rcu(). | ||
272 | 278 | ||
273 | 9. All RCU list-traversal primitives, which include | 279 | 9. All RCU list-traversal primitives, which include |
274 | rcu_dereference(), list_for_each_entry_rcu(), and | 280 | rcu_dereference(), list_for_each_entry_rcu(), and |
@@ -296,9 +302,9 @@ over a rather long period of time, but improvements are always welcome! | |||
296 | all currently executing rcu_read_lock()-protected RCU read-side | 302 | all currently executing rcu_read_lock()-protected RCU read-side |
297 | critical sections complete. It does -not- necessarily guarantee | 303 | critical sections complete. It does -not- necessarily guarantee |
298 | that all currently running interrupts, NMIs, preempt_disable() | 304 | that all currently running interrupts, NMIs, preempt_disable() |
299 | code, or idle loops will complete. Therefore, if you do not have | 305 | code, or idle loops will complete. Therefore, if your |
300 | rcu_read_lock()-protected read-side critical sections, do -not- | 306 | read-side critical sections are protected by something other |
301 | use synchronize_rcu(). | 307 | than rcu_read_lock(), do -not- use synchronize_rcu(). |
302 | 308 | ||
303 | Similarly, disabling preemption is not an acceptable substitute | 309 | Similarly, disabling preemption is not an acceptable substitute |
304 | for rcu_read_lock(). Code that attempts to use preemption | 310 | for rcu_read_lock(). Code that attempts to use preemption |
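As a concrete illustration of the flavor-matching rule above, consider the two hypothetical readers below (gp and do_something_with() are placeholders): the first must be paired with synchronize_rcu(), while the second must be paired with synchronize_sched():

        /* Reader protected by rcu_read_lock(): pair with synchronize_rcu(). */
        rcu_read_lock();
        p = rcu_dereference(gp);
        if (p)
                do_something_with(p);
        rcu_read_unlock();

        /*
         * Reader relying on preempt_disable(): pair with synchronize_sched(),
         * which also waits for hardware-irq and NMI handlers to complete.
         */
        preempt_disable();
        p = rcu_dereference_sched(gp);
        if (p)
                do_something_with(p);
        preempt_enable();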
@@ -401,9 +407,9 @@ over a rather long period of time, but improvements are always welcome! | |||
401 | read-side critical sections. It is the responsibility of the | 407 | read-side critical sections. It is the responsibility of the |
402 | RCU update-side primitives to deal with this. | 408 | RCU update-side primitives to deal with this. |
403 | 409 | ||
404 | 17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and | 410 | 17. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the |
405 | the __rcu sparse checks to validate your RCU code. These | 411 | __rcu sparse checks (enabled by CONFIG_SPARSE_RCU_POINTER) to |
406 | can help find problems as follows: | 412 | validate your RCU code. These can help find problems as follows: |
407 | 413 | ||
408 | CONFIG_PROVE_RCU: check that accesses to RCU-protected data | 414 | CONFIG_PROVE_RCU: check that accesses to RCU-protected data |
409 | structures are carried out under the proper RCU | 415 | structures are carried out under the proper RCU |
diff --git a/Documentation/RCU/lockdep.txt b/Documentation/RCU/lockdep.txt
index a102d4b3724b..cd83d2348fef 100644
--- a/Documentation/RCU/lockdep.txt
+++ b/Documentation/RCU/lockdep.txt
@@ -64,6 +64,11 @@ checking of rcu_dereference() primitives: | |||
64 | but retain the compiler constraints that prevent duplicating | 64 | but retain the compiler constraints that prevent duplicating |
65 | or coalescing. This is useful when testing the | 65 | or coalescing. This is useful when testing the |
66 | value of the pointer itself, for example, against NULL. | 66 | value of the pointer itself, for example, against NULL. |
67 | rcu_access_index(idx): | ||
68 | Return the value of the index and omit all barriers, but | ||
69 | retain the compiler constraints that prevent duplicating | ||
70 | or coalescing. This is useful when testing the | ||
71 | value of the index itself, for example, against -1. | ||
67 | 72 | ||
68 | The rcu_dereference_check() check expression can be any boolean | 73 | The rcu_dereference_check() check expression can be any boolean |
69 | expression, but would normally include a lockdep expression. However, | 74 | expression, but would normally include a lockdep expression. However, |
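For example, a short sketch of the pointer-test use case described above, where gp is a hypothetical __rcu-annotated pointer and do_something_with() a placeholder:

        /* Testing only the pointer's value: no read-side critical section needed. */
        if (!rcu_access_pointer(gp))
                return;

        /* Dereferencing it still requires rcu_read_lock() and rcu_dereference(). */
        rcu_read_lock();
        p = rcu_dereference(gp);
        if (p)
                do_something_with(p);
        rcu_read_unlock();

rcu_access_index() plays the same role when the RCU-protected quantity is an array index rather than a pointer, for example when comparing it against -1.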
diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt
index 38428c125135..2e319d1b9ef2 100644
--- a/Documentation/RCU/rcubarrier.txt
+++ b/Documentation/RCU/rcubarrier.txt
@@ -79,7 +79,20 @@ complete. Pseudo-code using rcu_barrier() is as follows: | |||
79 | 2. Execute rcu_barrier(). | 79 | 2. Execute rcu_barrier(). |
80 | 3. Allow the module to be unloaded. | 80 | 3. Allow the module to be unloaded. |
81 | 81 | ||
82 | The rcutorture module makes use of rcu_barrier in its exit function | 82 | There are also rcu_barrier_bh(), rcu_barrier_sched(), and srcu_barrier() |
83 | functions for the other flavors of RCU, and you of course must match | ||
84 | the flavor of rcu_barrier() with that of call_rcu(). If your module | ||
85 | uses multiple flavors of call_rcu(), then it must also use multiple | ||
86 | flavors of rcu_barrier() when unloading that module. For example, if | ||
87 | it uses call_rcu_bh(), call_srcu() on srcu_struct_1, and call_srcu() on | ||
88 | srcu_struct_2, then the following three lines of code will be required | ||
89 | when unloading: | ||
90 | |||
91 | 1 rcu_barrier_bh(); | ||
92 | 2 srcu_barrier(&srcu_struct_1); | ||
93 | 3 srcu_barrier(&srcu_struct_2); | ||
94 | |||
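Expanding those three lines into a fuller sketch of a module-exit path, under the stated assumption that the module posts callbacks with call_rcu_bh() and with call_srcu() on two srcu_struct instances (all names below are hypothetical):

        static struct srcu_struct srcu_struct_1;
        static struct srcu_struct srcu_struct_2;

        static void __exit mymodule_exit(void)
        {
                /* First prevent any new callbacks from being posted, then: */
                rcu_barrier_bh();               /* wait for call_rcu_bh() callbacks */
                srcu_barrier(&srcu_struct_1);   /* wait for call_srcu() on srcu_struct_1 */
                srcu_barrier(&srcu_struct_2);   /* wait for call_srcu() on srcu_struct_2 */
                cleanup_srcu_struct(&srcu_struct_1);
                cleanup_srcu_struct(&srcu_struct_2);
        }
        module_exit(mymodule_exit);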
95 | The rcutorture module makes use of rcu_barrier() in its exit function | ||
83 | as follows: | 96 | as follows: |
84 | 97 | ||
85 | 1 static void | 98 | 1 static void |
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 1927151b386b..e38b8df3d727 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -92,14 +92,14 @@ If the CONFIG_RCU_CPU_STALL_INFO kernel configuration parameter is set, | |||
92 | more information is printed with the stall-warning message, for example: | 92 | more information is printed with the stall-warning message, for example: |
93 | 93 | ||
94 | INFO: rcu_preempt detected stall on CPU | 94 | INFO: rcu_preempt detected stall on CPU |
95 | 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 | 95 | 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 softirq=82/543 |
96 | (t=65000 jiffies) | 96 | (t=65000 jiffies) |
97 | 97 | ||
98 | In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is | 98 | In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is |
99 | printed: | 99 | printed: |
100 | 100 | ||
101 | INFO: rcu_preempt detected stall on CPU | 101 | INFO: rcu_preempt detected stall on CPU |
102 | 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 drain=0 . timer not pending | 102 | 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D |
103 | (t=65000 jiffies) | 103 | (t=65000 jiffies) |
104 | 104 | ||
105 | The "(64628 ticks this GP)" indicates that this CPU has taken more | 105 | The "(64628 ticks this GP)" indicates that this CPU has taken more |
@@ -116,13 +116,28 @@ number between the two "/"s is the value of the nesting, which will | |||
116 | be a small positive number if in the idle loop and a very large positive | 116 | be a small positive number if in the idle loop and a very large positive |
117 | number (as shown above) otherwise. | 117 | number (as shown above) otherwise. |
118 | 118 | ||
119 | For CONFIG_RCU_FAST_NO_HZ kernels, the "drain=0" indicates that the CPU is | 119 | The "softirq=" portion of the message tracks the number of RCU softirq |
120 | not in the process of trying to force itself into dyntick-idle state, the | 120 | handlers that the stalled CPU has executed. The number before the "/" |
121 | "." indicates that the CPU has not given up forcing RCU into dyntick-idle | 121 | is the number that had executed since boot at the time that this CPU |
122 | mode (it would be "H" otherwise), and the "timer not pending" indicates | 122 | last noted the beginning of a grace period, which might be the current |
123 | that the CPU has not recently forced RCU into dyntick-idle mode (it | 123 | (stalled) grace period, or it might be some earlier grace period (for |
124 | would otherwise indicate the number of microseconds remaining in this | 124 | example, if the CPU might have been in dyntick-idle mode for an extended |
125 | forced state). | 125 | time period). The number after the "/" is the number that have executed |
126 | since boot until the current time. If this latter number stays constant | ||
127 | across repeated stall-warning messages, it is possible that RCU's softirq | ||
128 | handlers are no longer able to execute on this CPU. This can happen if | ||
129 | the stalled CPU is spinning with interrupts disabled, or, in -rt | ||
130 | kernels, if a high-priority process is starving RCU's softirq handler. | ||
131 | |||
132 | For CONFIG_RCU_FAST_NO_HZ kernels, the "last_accelerate:" prints the | ||
133 | low-order 16 bits (in hex) of the jiffies counter when this CPU last | ||
134 | invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked | ||
135 | rcu_accelerate_cbs() from rcu_prepare_for_idle(). The "nonlazy_posted:" | ||
136 | prints the number of non-lazy callbacks posted since the last call to | ||
137 | rcu_needs_cpu(). Finally, an "L" indicates that there are currently | ||
138 | no non-lazy callbacks ("." is printed otherwise, as shown above) and | ||
139 | "D" indicates that dyntick-idle processing is enabled ("." is printed | ||
140 | otherwise, for example, if disabled via the "nohz=" kernel boot parameter). | ||
126 | 141 | ||
127 | 142 | ||
128 | Multiple Warnings From One Stall | 143 | Multiple Warnings From One Stall |
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 0cc7820967f4..10df0b82f459 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -265,9 +265,9 @@ rcu_dereference() | |||
265 | rcu_read_lock(); | 265 | rcu_read_lock(); |
266 | p = rcu_dereference(head.next); | 266 | p = rcu_dereference(head.next); |
267 | rcu_read_unlock(); | 267 | rcu_read_unlock(); |
268 | x = p->address; | 268 | x = p->address; /* BUG!!! */ |
269 | rcu_read_lock(); | 269 | rcu_read_lock(); |
270 | y = p->data; | 270 | y = p->data; /* BUG!!! */ |
271 | rcu_read_unlock(); | 271 | rcu_read_unlock(); |
272 | 272 | ||
273 | Holding a reference from one RCU read-side critical section | 273 | Holding a reference from one RCU read-side critical section |
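One corrected form of the example above keeps every use of p inside a single read-side critical section, so the pointed-to structure cannot be freed between the two accesses:

        rcu_read_lock();
        p = rcu_dereference(head.next);
        if (p) {
                x = p->address; /* OK: same critical section as rcu_dereference(). */
                y = p->data;    /* OK: cannot be freed before rcu_read_unlock(). */
        }
        rcu_read_unlock();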
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8ccbf27aead4..52ecc9b84673 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2484,9 +2484,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
2484 | In kernels built with CONFIG_RCU_NOCB_CPU=y, set | 2484 | In kernels built with CONFIG_RCU_NOCB_CPU=y, set |
2485 | the specified list of CPUs to be no-callback CPUs. | 2485 | the specified list of CPUs to be no-callback CPUs. |
2486 | Invocation of these CPUs' RCU callbacks will | 2486 | Invocation of these CPUs' RCU callbacks will |
2487 | be offloaded to "rcuoN" kthreads created for | 2487 | be offloaded to "rcuox/N" kthreads created for |
2488 | that purpose. This reduces OS jitter on the | 2488 | that purpose, where "x" is "b" for RCU-bh, "p" |
2489 | for RCU-preempt, and "s" for RCU-sched, and "N" | ||
2490 | is the CPU number. This reduces OS jitter on the | ||
2489 | offloaded CPUs, which can be useful for HPC and | 2491 | offloaded CPUs, which can be useful for HPC and |
2492 | |||
2490 | real-time workloads. It can also improve energy | 2493 | real-time workloads. It can also improve energy |
2491 | efficiency for asymmetric multiprocessors. | 2494 | efficiency for asymmetric multiprocessors. |
2492 | 2495 | ||
@@ -2510,6 +2513,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
2510 | leaf rcu_node structure. Useful for very large | 2513 | leaf rcu_node structure. Useful for very large |
2511 | systems. | 2514 | systems. |
2512 | 2515 | ||
2516 | rcutree.jiffies_till_first_fqs= [KNL,BOOT] | ||
2517 | Set delay from grace-period initialization to | ||
2518 | first attempt to force quiescent states. | ||
2519 | Units are jiffies, minimum value is zero, | ||
2520 | and maximum value is HZ. | ||
2521 | |||
2522 | rcutree.jiffies_till_next_fqs= [KNL,BOOT] | ||
2523 | Set delay between subsequent attempts to force | ||
2524 | quiescent states. Units are jiffies, minimum | ||
2525 | value is one, and maximum value is HZ. | ||
2526 | |||
2513 | rcutree.qhimark= [KNL,BOOT] | 2527 | rcutree.qhimark= [KNL,BOOT] |
2514 | Set threshold of queued | 2528 | Set threshold of queued |
2515 | RCU callbacks over which batch limiting is disabled. | 2529 | RCU callbacks over which batch limiting is disabled. |
@@ -2524,16 +2538,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
2524 | rcutree.rcu_cpu_stall_timeout= [KNL,BOOT] | 2538 | rcutree.rcu_cpu_stall_timeout= [KNL,BOOT] |
2525 | Set timeout for RCU CPU stall warning messages. | 2539 | Set timeout for RCU CPU stall warning messages. |
2526 | 2540 | ||
2527 | rcutree.jiffies_till_first_fqs= [KNL,BOOT] | 2541 | rcutree.rcu_idle_gp_delay= [KNL,BOOT] |
2528 | Set delay from grace-period initialization to | 2542 | Set wakeup interval for idle CPUs that have |
2529 | first attempt to force quiescent states. | 2543 | RCU callbacks (RCU_FAST_NO_HZ=y). |
2530 | Units are jiffies, minimum value is zero, | ||
2531 | and maximum value is HZ. | ||
2532 | 2544 | ||
2533 | rcutree.jiffies_till_next_fqs= [KNL,BOOT] | 2545 | rcutree.rcu_idle_lazy_gp_delay= [KNL,BOOT] |
2534 | Set delay between subsequent attempts to force | 2546 | Set wakeup interval for idle CPUs that have |
2535 | quiescent states. Units are jiffies, minimum | 2547 | only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y). |
2536 | value is one, and maximum value is HZ. | 2548 | Lazy RCU callbacks are those which RCU can |
2549 | prove do nothing more than free memory. | ||
2537 | 2550 | ||
2538 | rcutorture.fqs_duration= [KNL,BOOT] | 2551 | rcutorture.fqs_duration= [KNL,BOOT] |
2539 | Set duration of force_quiescent_state bursts. | 2552 | Set duration of force_quiescent_state bursts. |
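The rcutree.* entries above are ordinary module parameters defined in kernel/rcutree.c. As a simplified sketch (the two fqs variables below also appear in the rcutree.c hunk later in this diff; everything else is elided):

        #include <linux/moduleparam.h>

        static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
        static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;

        module_param(jiffies_till_first_fqs, ulong, 0644);
        module_param(jiffies_till_next_fqs, ulong, 0644);

Because the permissions are 0644, these values can typically also be inspected and adjusted at run time under /sys/module/rcutree/parameters/, in addition to being set at boot via rcutree.jiffies_till_first_fqs= and rcutree.jiffies_till_next_fqs=.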
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
new file mode 100644
index 000000000000..cbf7ae412da4
--- /dev/null
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -0,0 +1,202 @@ | |||
1 | REDUCING OS JITTER DUE TO PER-CPU KTHREADS | ||
2 | |||
3 | This document lists per-CPU kthreads in the Linux kernel and presents | ||
4 | options to control their OS jitter. Note that non-per-CPU kthreads are | ||
5 | not listed here. To reduce OS jitter from non-per-CPU kthreads, bind | ||
6 | them to a "housekeeping" CPU dedicated to such work. | ||
7 | |||
8 | |||
9 | REFERENCES | ||
10 | |||
11 | o Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs. | ||
12 | |||
13 | o Documentation/cgroups: Using cgroups to bind tasks to sets of CPUs. | ||
14 | |||
15 | o man taskset: Using the taskset command to bind tasks to sets | ||
16 | of CPUs. | ||
17 | |||
18 | o man sched_setaffinity: Using the sched_setaffinity() system | ||
19 | call to bind tasks to sets of CPUs. | ||
20 | |||
21 | o /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state, | ||
22 | writing "0" to offline and "1" to online. | ||
23 | |||
24 | o In order to locate kernel-generated OS jitter on CPU N: | ||
25 | |||
26 | cd /sys/kernel/debug/tracing | ||
27 | echo 1 > max_graph_depth # Increase the "1" for more detail | ||
28 | echo function_graph > current_tracer | ||
29 | # run workload | ||
30 | cat per_cpu/cpuN/trace | ||
31 | |||
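To complement the taskset and sched_setaffinity() references above, here is a minimal user-space sketch that binds a task to a housekeeping CPU; the choice of CPU 0 and the helper name are arbitrary:

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>
        #include <sys/types.h>

        /* Bind the task identified by pid (0 means the calling task) to CPU 0. */
        static int bind_to_housekeeping_cpu(pid_t pid)
        {
                cpu_set_t set;

                CPU_ZERO(&set);
                CPU_SET(0, &set);       /* assume CPU 0 is the housekeeping CPU */
                if (sched_setaffinity(pid, sizeof(set), &set) != 0) {
                        perror("sched_setaffinity");
                        return -1;
                }
                return 0;
        }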
32 | |||
33 | KTHREADS | ||
34 | |||
35 | Name: ehca_comp/%u | ||
36 | Purpose: Periodically process Infiniband-related work. | ||
37 | To reduce its OS jitter, do any of the following: | ||
38 | 1. Don't use eHCA Infiniband hardware, instead choosing hardware | ||
39 | that does not require per-CPU kthreads. This will prevent these | ||
40 | kthreads from being created in the first place. (This will | ||
41 | work for most people, as this hardware, though important, is | ||
42 | relatively old and is produced in relatively low unit volumes.) | ||
43 | 2. Do all eHCA-Infiniband-related work on other CPUs, including | ||
44 | interrupts. | ||
45 | 3. Rework the eHCA driver so that its per-CPU kthreads are | ||
46 | provisioned only on selected CPUs. | ||
47 | |||
48 | |||
49 | Name: irq/%d-%s | ||
50 | Purpose: Handle threaded interrupts. | ||
51 | To reduce its OS jitter, do the following: | ||
52 | 1. Use irq affinity to force the irq threads to execute on | ||
53 | some other CPU. | ||
54 | |||
55 | Name: kcmtpd_ctr_%d | ||
56 | Purpose: Handle Bluetooth work. | ||
57 | To reduce its OS jitter, do one of the following: | ||
58 | 1. Don't use Bluetooth, in which case these kthreads won't be | ||
59 | created in the first place. | ||
60 | 2. Use irq affinity to force Bluetooth-related interrupts to | ||
61 | occur on some other CPU and furthermore initiate all | ||
62 | Bluetooth activity on some other CPU. | ||
63 | |||
64 | Name: ksoftirqd/%u | ||
65 | Purpose: Execute softirq handlers when threaded or when under heavy load. | ||
66 | To reduce its OS jitter, each softirq vector must be handled | ||
67 | separately as follows: | ||
68 | TIMER_SOFTIRQ: Do all of the following: | ||
69 | 1. To the extent possible, keep the CPU out of the kernel when it | ||
70 | is non-idle, for example, by avoiding system calls and by forcing | ||
71 | both kernel threads and interrupts to execute elsewhere. | ||
72 | 2. Build with CONFIG_HOTPLUG_CPU=y. After boot completes, force | ||
73 | the CPU offline, then bring it back online. This forces | ||
74 | recurring timers to migrate elsewhere. If you are concerned | ||
75 | with multiple CPUs, force them all offline before bringing the | ||
76 | first one back online. Once you have onlined the CPUs in question, | ||
77 | do not offline any other CPUs, because doing so could force the | ||
78 | timer back onto one of the CPUs in question. | ||
79 | NET_TX_SOFTIRQ and NET_RX_SOFTIRQ: Do all of the following: | ||
80 | 1. Force networking interrupts onto other CPUs. | ||
81 | 2. Initiate any network I/O on other CPUs. | ||
82 | 3. Once your application has started, prevent CPU-hotplug operations | ||
83 | from being initiated from tasks that might run on the CPU to | ||
84 | be de-jittered. (It is OK to force this CPU offline and then | ||
85 | bring it back online before you start your application.) | ||
86 | BLOCK_SOFTIRQ: Do all of the following: | ||
87 | 1. Force block-device interrupts onto some other CPU. | ||
88 | 2. Initiate any block I/O on other CPUs. | ||
89 | 3. Once your application has started, prevent CPU-hotplug operations | ||
90 | from being initiated from tasks that might run on the CPU to | ||
91 | be de-jittered. (It is OK to force this CPU offline and then | ||
92 | bring it back online before you start your application.) | ||
93 | BLOCK_IOPOLL_SOFTIRQ: Do all of the following: | ||
94 | 1. Force block-device interrupts onto some other CPU. | ||
95 | 2. Initiate any block I/O and block-I/O polling on other CPUs. | ||
96 | 3. Once your application has started, prevent CPU-hotplug operations | ||
97 | from being initiated from tasks that might run on the CPU to | ||
98 | be de-jittered. (It is OK to force this CPU offline and then | ||
99 | bring it back online before you start your application.) | ||
100 | TASKLET_SOFTIRQ: Do one or more of the following: | ||
101 | 1. Avoid use of drivers that use tasklets. (Such drivers will contain | ||
102 | calls to things like tasklet_schedule().) | ||
103 | 2. Convert all drivers that you must use from tasklets to workqueues. | ||
104 | 3. Force interrupts for drivers using tasklets onto other CPUs, | ||
105 | and also do I/O involving these drivers on other CPUs. | ||
106 | SCHED_SOFTIRQ: Do all of the following: | ||
107 | 1. Avoid sending scheduler IPIs to the CPU to be de-jittered, | ||
108 | for example, ensure that at most one runnable kthread is present | ||
109 | on that CPU. If a thread that expects to run on the de-jittered | ||
110 | CPU awakens, the scheduler will send an IPI that can result in | ||
111 | a subsequent SCHED_SOFTIRQ. | ||
112 | 2. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y, | ||
113 | CONFIG_NO_HZ_FULL=y, and, in addition, ensure that the CPU | ||
114 | to be de-jittered is marked as an adaptive-ticks CPU using the | ||
115 | "nohz_full=" boot parameter. This reduces the number of | ||
116 | scheduler-clock interrupts that the de-jittered CPU receives, | ||
117 | minimizing its chances of being selected to do the load balancing | ||
118 | work that runs in SCHED_SOFTIRQ context. | ||
119 | 3. To the extent possible, keep the CPU out of the kernel when it | ||
120 | is non-idle, for example, by avoiding system calls and by | ||
121 | forcing both kernel threads and interrupts to execute elsewhere. | ||
122 | This further reduces the number of scheduler-clock interrupts | ||
123 | received by the de-jittered CPU. | ||
124 | HRTIMER_SOFTIRQ: Do all of the following: | ||
125 | 1. To the extent possible, keep the CPU out of the kernel when it | ||
126 | is non-idle. For example, avoid system calls and force both | ||
127 | kernel threads and interrupts to execute elsewhere. | ||
128 | 2. Build with CONFIG_HOTPLUG_CPU=y. Once boot completes, force the | ||
129 | CPU offline, then bring it back online. This forces recurring | ||
130 | timers to migrate elsewhere. If you are concerned with multiple | ||
131 | CPUs, force them all offline before bringing the first one | ||
132 | back online. Once you have onlined the CPUs in question, do not | ||
133 | offline any other CPUs, because doing so could force the timer | ||
134 | back onto one of the CPUs in question. | ||
135 | RCU_SOFTIRQ: Do at least one of the following: | ||
136 | 1. Offload callbacks and keep the CPU in either dyntick-idle or | ||
137 | adaptive-ticks state by doing all of the following: | ||
138 | a. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y, | ||
139 | CONFIG_NO_HZ_FULL=y, and, in addition, ensure that the CPU | ||
140 | to be de-jittered is marked as an adaptive-ticks CPU using | ||
141 | the "nohz_full=" boot parameter. Bind the rcuo kthreads | ||
142 | to housekeeping CPUs, which can tolerate OS jitter. | ||
143 | b. To the extent possible, keep the CPU out of the kernel | ||
144 | when it is non-idle, for example, by avoiding system | ||
145 | calls and by forcing both kernel threads and interrupts | ||
146 | to execute elsewhere. | ||
147 | 2. Enable RCU to do its processing remotely via dyntick-idle by | ||
148 | doing all of the following: | ||
149 | a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y. | ||
150 | b. Ensure that the CPU goes idle frequently, allowing other | ||
151 | CPUs to detect that it has passed through an RCU quiescent | ||
152 | state. If the kernel is built with CONFIG_NO_HZ_FULL=y, | ||
153 | userspace execution also allows other CPUs to detect that | ||
154 | the CPU in question has passed through a quiescent state. | ||
155 | c. To the extent possible, keep the CPU out of the kernel | ||
156 | when it is non-idle, for example, by avoiding system | ||
157 | calls and by forcing both kernel threads and interrupts | ||
158 | to execute elsewhere. | ||
159 | |||
160 | Name: rcuc/%u | ||
161 | Purpose: Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels. | ||
162 | To reduce its OS jitter, do at least one of the following: | ||
163 | 1. Build the kernel with CONFIG_PREEMPT=n. This prevents these | ||
164 | kthreads from being created in the first place, and also obviates | ||
165 | the need for RCU priority boosting. This approach is feasible | ||
166 | for workloads that do not require high degrees of responsiveness. | ||
167 | 2. Build the kernel with CONFIG_RCU_BOOST=n. This prevents these | ||
168 | kthreads from being created in the first place. This approach | ||
169 | is feasible only if your workload never requires RCU priority | ||
170 | boosting, for example, if you ensure frequent idle time on all | ||
171 | CPUs that might execute within the kernel. | ||
172 | 3. Build with CONFIG_RCU_NOCB_CPU=y and CONFIG_RCU_NOCB_CPU_ALL=y, | ||
173 | which offloads all RCU callbacks to kthreads that can be moved | ||
174 | off of CPUs susceptible to OS jitter. This approach prevents the | ||
175 | rcuc/%u kthreads from having any work to do, so that they are | ||
176 | never awakened. | ||
177 | 4. Ensure that the CPU never enters the kernel, and, in particular, | ||
178 | avoid initiating any CPU hotplug operations on this CPU. This is | ||
179 | another way of preventing any callbacks from being queued on the | ||
180 | CPU, again preventing the rcuc/%u kthreads from having any work | ||
181 | to do. | ||
182 | |||
183 | Name: rcuob/%d, rcuop/%d, and rcuos/%d | ||
184 | Purpose: Offload RCU callbacks from the corresponding CPU. | ||
185 | To reduce its OS jitter, do at least one of the following: | ||
186 | 1. Use affinity, cgroups, or other mechanism to force these kthreads | ||
187 | to execute on some other CPU. | ||
188 | 2. Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these | ||
189 | kthreads from being created in the first place. However, please | ||
190 | note that this will not eliminate OS jitter, but will instead | ||
191 | shift it to RCU_SOFTIRQ. | ||
192 | |||
193 | Name: watchdog/%u | ||
194 | Purpose: Detect software lockups on each CPU. | ||
195 | To reduce its OS jitter, do at least one of the following: | ||
196 | 1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these | ||
197 | kthreads from being created in the first place. | ||
198 | 2. Echo a zero to /proc/sys/kernel/watchdog to disable the | ||
199 | watchdog timer. | ||
200 | 3. Echo a large number to /proc/sys/kernel/watchdog_thresh in | ||
201 | order to reduce the frequency of OS jitter due to the watchdog | ||
202 | timer down to a level that is acceptable for your workload. | ||
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
index 31f9d75adc5b..2eb88556c5c5 100644
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -125,6 +125,11 @@ static inline void hlist_bl_unlock(struct hlist_bl_head *b) | |||
125 | __bit_spin_unlock(0, (unsigned long *)b); | 125 | __bit_spin_unlock(0, (unsigned long *)b); |
126 | } | 126 | } |
127 | 127 | ||
128 | static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) | ||
129 | { | ||
130 | return bit_spin_is_locked(0, (unsigned long *)b); | ||
131 | } | ||
132 | |||
128 | /** | 133 | /** |
129 | * hlist_bl_for_each_entry - iterate over list of given type | 134 | * hlist_bl_for_each_entry - iterate over list of given type |
130 | * @tpos: the type * to use as a loop cursor. | 135 | * @tpos: the type * to use as a loop cursor. |
diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h
index cf1244fbf3b6..4f216c59e7db 100644
--- a/include/linux/rculist_bl.h
+++ b/include/linux/rculist_bl.h
@@ -20,7 +20,7 @@ static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, | |||
20 | static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) | 20 | static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) |
21 | { | 21 | { |
22 | return (struct hlist_bl_node *) | 22 | return (struct hlist_bl_node *) |
23 | ((unsigned long)rcu_dereference(h->first) & ~LIST_BL_LOCKMASK); | 23 | ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); |
24 | } | 24 | } |
25 | 25 | ||
26 | /** | 26 | /** |
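The hlist_bl_is_locked() check added above lets update-side code that holds the list head's bit lock call hlist_bl_first_rcu() without a lockdep splat. A hedged sketch of such a caller (the function name is made up):

        static bool mylist_nonempty(struct hlist_bl_head *head)
        {
                bool nonempty;

                hlist_bl_lock(head);    /* takes bit 0 of head->first as a spinlock */
                /*
                 * No rcu_read_lock() needed here: the rcu_dereference_check()
                 * inside hlist_bl_first_rcu() now accepts hlist_bl_is_locked(head)
                 * as sufficient protection.
                 */
                nonempty = hlist_bl_first_rcu(head) != NULL;
                hlist_bl_unlock(head);
                return nonempty;
        }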
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index b758ce17b309..9ed2c9a4de45 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -80,6 +80,7 @@ extern void do_trace_rcu_torture_read(char *rcutorturename, | |||
80 | #define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) | 80 | #define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) |
81 | #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) | 81 | #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) |
82 | #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) | 82 | #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) |
83 | #define ulong2long(a) (*(long *)(&(a))) | ||
83 | 84 | ||
84 | /* Exported common interfaces */ | 85 | /* Exported common interfaces */ |
85 | 86 | ||
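The new ulong2long() macro joins the ULONG_CMP_*() macros above in handling free-running counters that may wrap; a small illustrative sketch (the function and printout are hypothetical):

        static void counter_wrap_example(void)
        {
                unsigned long snap = ULONG_MAX - 1;     /* counter sampled earlier */
                unsigned long cur = snap + 3;           /* has since wrapped around to 1 */
                unsigned long delta = cur - snap;       /* modular subtraction still yields 3 */

                /*
                 * "cur > snap" is false here even though cur logically follows
                 * snap; the wrap-tolerant macro reports the intended ordering.
                 */
                WARN_ON(!ULONG_CMP_GE(cur, snap));

                /* ulong2long() gives a signed view, handy when printing deltas. */
                pr_info("counter advanced by %ld\n", ulong2long(delta));
        }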
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 1918e832da4f..59ebcc89f148 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -72,6 +72,58 @@ TRACE_EVENT(rcu_grace_period, | |||
72 | ); | 72 | ); |
73 | 73 | ||
74 | /* | 74 | /* |
75 | * Tracepoint for future grace-period events, including those for no-callbacks | ||
76 | * CPUs. The caller should pull the data from the rcu_node structure, | ||
77 | * other than rcuname, which comes from the rcu_state structure, and event, | ||
78 | * which is one of the following: | ||
79 | * | ||
80 | * "Startleaf": Request a nocb grace period based on leaf-node data. | ||
81 | * "Startedleaf": Leaf-node start proved sufficient. | ||
82 | * "Startedleafroot": Leaf-node start proved sufficient after checking root. | ||
83 | * "Startedroot": Requested a nocb grace period based on root-node data. | ||
84 | * "StartWait": Start waiting for the requested grace period. | ||
85 | * "ResumeWait": Resume waiting after signal. | ||
86 | * "EndWait": Complete wait. | ||
87 | * "Cleanup": Clean up rcu_node structure after previous GP. | ||
88 | * "CleanupMore": Clean up, and another no-CB GP is needed. | ||
89 | */ | ||
90 | TRACE_EVENT(rcu_future_grace_period, | ||
91 | |||
92 | TP_PROTO(char *rcuname, unsigned long gpnum, unsigned long completed, | ||
93 | unsigned long c, u8 level, int grplo, int grphi, | ||
94 | char *gpevent), | ||
95 | |||
96 | TP_ARGS(rcuname, gpnum, completed, c, level, grplo, grphi, gpevent), | ||
97 | |||
98 | TP_STRUCT__entry( | ||
99 | __field(char *, rcuname) | ||
100 | __field(unsigned long, gpnum) | ||
101 | __field(unsigned long, completed) | ||
102 | __field(unsigned long, c) | ||
103 | __field(u8, level) | ||
104 | __field(int, grplo) | ||
105 | __field(int, grphi) | ||
106 | __field(char *, gpevent) | ||
107 | ), | ||
108 | |||
109 | TP_fast_assign( | ||
110 | __entry->rcuname = rcuname; | ||
111 | __entry->gpnum = gpnum; | ||
112 | __entry->completed = completed; | ||
113 | __entry->c = c; | ||
114 | __entry->level = level; | ||
115 | __entry->grplo = grplo; | ||
116 | __entry->grphi = grphi; | ||
117 | __entry->gpevent = gpevent; | ||
118 | ), | ||
119 | |||
120 | TP_printk("%s %lu %lu %lu %u %d %d %s", | ||
121 | __entry->rcuname, __entry->gpnum, __entry->completed, | ||
122 | __entry->c, __entry->level, __entry->grplo, __entry->grphi, | ||
123 | __entry->gpevent) | ||
124 | ); | ||
125 | |||
126 | /* | ||
75 | * Tracepoint for grace-period-initialization events. These are | 127 | * Tracepoint for grace-period-initialization events. These are |
76 | * distinguished by the type of RCU, the new grace-period number, the | 128 | * distinguished by the type of RCU, the new grace-period number, the |
77 | * rcu_node structure level, the starting and ending CPU covered by the | 129 | * rcu_node structure level, the starting and ending CPU covered by the |
@@ -601,6 +653,9 @@ TRACE_EVENT(rcu_barrier, | |||
601 | #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) | 653 | #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) |
602 | #define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ | 654 | #define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ |
603 | qsmask) do { } while (0) | 655 | qsmask) do { } while (0) |
656 | #define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \ | ||
657 | level, grplo, grphi, event) \ | ||
658 | do { } while (0) | ||
604 | #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) | 659 | #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) |
605 | #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) | 660 | #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) |
606 | #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ | 661 | #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ |
diff --git a/init/Kconfig b/init/Kconfig
index 5341d7232c3a..71bb9e73011a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -578,13 +578,16 @@ config RCU_FAST_NO_HZ | |||
578 | depends on NO_HZ && SMP | 578 | depends on NO_HZ && SMP |
579 | default n | 579 | default n |
580 | help | 580 | help |
581 | This option causes RCU to attempt to accelerate grace periods in | 581 | This option permits CPUs to enter dynticks-idle state even if |
582 | order to allow CPUs to enter dynticks-idle state more quickly. | 582 | they have RCU callbacks queued, and prevents RCU from waking |
583 | On the other hand, this option increases the overhead of the | 583 | these CPUs up more than roughly once every four jiffies (by |
584 | dynticks-idle checking, thus degrading scheduling latency. | 584 | default, you can adjust this using the rcutree.rcu_idle_gp_delay |
585 | parameter), thus improving energy efficiency. On the other | ||
586 | hand, this option increases the duration of RCU grace periods, | ||
587 | for example, slowing down synchronize_rcu(). | ||
585 | 588 | ||
586 | Say Y if energy efficiency is critically important, and you don't | 589 | Say Y if energy efficiency is critically important, and you |
587 | care about real-time response. | 590 | don't care about increased grace-period durations. |
588 | 591 | ||
589 | Say N if you are unsure. | 592 | Say N if you are unsure. |
590 | 593 | ||
@@ -651,7 +654,7 @@ config RCU_BOOST_DELAY | |||
651 | Accept the default if unsure. | 654 | Accept the default if unsure. |
652 | 655 | ||
653 | config RCU_NOCB_CPU | 656 | config RCU_NOCB_CPU |
654 | bool "Offload RCU callback processing from boot-selected CPUs" | 657 | bool "Offload RCU callback processing from boot-selected CPUs (EXPERIMENTAL)" |
655 | depends on TREE_RCU || TREE_PREEMPT_RCU | 658 | depends on TREE_RCU || TREE_PREEMPT_RCU |
656 | default n | 659 | default n |
657 | help | 660 | help |
@@ -662,16 +665,56 @@ config RCU_NOCB_CPU | |||
662 | 665 | ||
663 | This option offloads callback invocation from the set of | 666 | This option offloads callback invocation from the set of |
664 | CPUs specified at boot time by the rcu_nocbs parameter. | 667 | CPUs specified at boot time by the rcu_nocbs parameter. |
665 | For each such CPU, a kthread ("rcuoN") will be created to | 668 | For each such CPU, a kthread ("rcuox/N") will be created to |
666 | invoke callbacks, where the "N" is the CPU being offloaded. | 669 | invoke callbacks, where the "N" is the CPU being offloaded, |
667 | Nothing prevents this kthread from running on the specified | 670 | and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and |
668 | CPUs, but (1) the kthreads may be preempted between each | 671 | "s" for RCU-sched. Nothing prevents this kthread from running |
669 | callback, and (2) affinity or cgroups can be used to force | 672 | on the specified CPUs, but (1) the kthreads may be preempted |
670 | the kthreads to run on whatever set of CPUs is desired. | 673 | between each callback, and (2) affinity or cgroups can be used |
671 | 674 | to force the kthreads to run on whatever set of CPUs is desired. | |
672 | Say Y here if you want reduced OS jitter on selected CPUs. | 675 | |
676 | Say Y here if you want to help debug reduced OS jitter. | ||
673 | Say N here if you are unsure. | 677 | Say N here if you are unsure. |
674 | 678 | ||
679 | choice | ||
680 | prompt "Build-forced no-CBs CPUs" | ||
681 | default RCU_NOCB_CPU_NONE | ||
682 | help | ||
683 | This option allows no-CBs CPUs to be specified at build time. | ||
684 | Additional no-CBs CPUs may be specified by the rcu_nocbs= | ||
685 | boot parameter. | ||
686 | |||
687 | config RCU_NOCB_CPU_NONE | ||
688 | bool "No build-forced no-CBs CPUs" | ||
689 | depends on RCU_NOCB_CPU | ||
690 | help | ||
691 | This option does not force any of the CPUs to be no-CBs CPUs. | ||
692 | Only CPUs designated by the rcu_nocbs= boot parameter will be | ||
693 | no-CBs CPUs. | ||
694 | |||
695 | config RCU_NOCB_CPU_ZERO | ||
696 | bool "CPU 0 is a build-forced no-CBs CPU" | ||
697 | depends on RCU_NOCB_CPU | ||
698 | help | ||
699 | This option forces CPU 0 to be a no-CBs CPU. Additional CPUs | ||
700 | may be designated as no-CBs CPUs using the rcu_nocbs= boot | ||
701 | parameter. | ||
702 | |||
703 | Select this if CPU 0 needs to be a no-CBs CPU for real-time | ||
704 | or energy-efficiency reasons. | ||
705 | |||
706 | config RCU_NOCB_CPU_ALL | ||
707 | bool "All CPUs are build-forced no-CBs CPUs" | ||
708 | depends on RCU_NOCB_CPU | ||
709 | help | ||
710 | This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs= | ||
711 | boot parameter will be ignored. | ||
712 | |||
713 | Select this if all CPUs need to be no-CBs CPUs for real-time | ||
714 | or energy-efficiency reasons. | ||
715 | |||
716 | endchoice | ||
717 | |||
675 | endmenu # "RCU Subsystem" | 718 | endmenu # "RCU Subsystem" |
676 | 719 | ||
677 | config IKCONFIG | 720 | config IKCONFIG |
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 5b8ad827fd86..2d5f94c1c7fb 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -64,7 +64,7 @@ | |||
64 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | 64 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
65 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | 65 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; |
66 | 66 | ||
67 | #define RCU_STATE_INITIALIZER(sname, cr) { \ | 67 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ |
68 | .level = { &sname##_state.node[0] }, \ | 68 | .level = { &sname##_state.node[0] }, \ |
69 | .call = cr, \ | 69 | .call = cr, \ |
70 | .fqs_state = RCU_GP_IDLE, \ | 70 | .fqs_state = RCU_GP_IDLE, \ |
@@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | |||
76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
77 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ | 77 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ |
78 | .name = #sname, \ | 78 | .name = #sname, \ |
79 | .abbr = sabbr, \ | ||
79 | } | 80 | } |
80 | 81 | ||
81 | struct rcu_state rcu_sched_state = | 82 | struct rcu_state rcu_sched_state = |
82 | RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); | 83 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); |
83 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); | 84 | DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); |
84 | 85 | ||
85 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); | 86 | struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); |
86 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); | 87 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); |
87 | 88 | ||
88 | static struct rcu_state *rcu_state; | 89 | static struct rcu_state *rcu_state; |
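The new sabbr argument and ->abbr field carry the one-character flavor tag ('s', 'b', or 'p') that appears in the "rcuox/N" kthread names described in the kernel-parameters.txt and Kconfig hunks above. A hedged sketch of how such a name can be built from it; nocb_kthread_fn stands in for the real callback-offload thread function in kernel/rcutree_plugin.h, which this excerpt does not show:

        struct task_struct *t;

        /* Produces names such as "rcuop/3" for the RCU-preempt flavor on CPU 3. */
        t = kthread_run(nocb_kthread_fn, rdp, "rcuo%c/%d", rsp->abbr, cpu);
        BUG_ON(IS_ERR(t));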
@@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; | |||
223 | module_param(jiffies_till_first_fqs, ulong, 0644); | 224 | module_param(jiffies_till_first_fqs, ulong, 0644); |
224 | module_param(jiffies_till_next_fqs, ulong, 0644); | 225 | module_param(jiffies_till_next_fqs, ulong, 0644); |
225 | 226 | ||
227 | static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | ||
228 | struct rcu_data *rdp); | ||
226 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); | 229 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); |
227 | static void force_quiescent_state(struct rcu_state *rsp); | 230 | static void force_quiescent_state(struct rcu_state *rsp); |
228 | static int rcu_pending(int cpu); | 231 | static int rcu_pending(int cpu); |
@@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | |||
310 | 313 | ||
311 | if (rcu_gp_in_progress(rsp)) | 314 | if (rcu_gp_in_progress(rsp)) |
312 | return 0; /* No, a grace period is already in progress. */ | 315 | return 0; /* No, a grace period is already in progress. */ |
316 | if (rcu_nocb_needs_gp(rsp)) | ||
317 | return 1; /* Yes, a no-CBs CPU needs one. */ | ||
313 | if (!rdp->nxttail[RCU_NEXT_TAIL]) | 318 | if (!rdp->nxttail[RCU_NEXT_TAIL]) |
314 | return 0; /* No, this is a no-CBs (or offline) CPU. */ | 319 | return 0; /* No, this is a no-CBs (or offline) CPU. */ |
315 | if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) | 320 | if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) |
@@ -1035,10 +1040,11 @@ static void init_callback_list(struct rcu_data *rdp) | |||
1035 | { | 1040 | { |
1036 | int i; | 1041 | int i; |
1037 | 1042 | ||
1043 | if (init_nocb_callback_list(rdp)) | ||
1044 | return; | ||
1038 | rdp->nxtlist = NULL; | 1045 | rdp->nxtlist = NULL; |
1039 | for (i = 0; i < RCU_NEXT_SIZE; i++) | 1046 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1040 | rdp->nxttail[i] = &rdp->nxtlist; | 1047 | rdp->nxttail[i] = &rdp->nxtlist; |
1041 | init_nocb_callback_list(rdp); | ||
1042 | } | 1048 | } |
1043 | 1049 | ||
1044 | /* | 1050 | /* |
@@ -1071,6 +1077,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, | |||
1071 | } | 1077 | } |
1072 | 1078 | ||
1073 | /* | 1079 | /* |
1080 | * Trace-event helper function for rcu_start_future_gp() and | ||
1081 | * rcu_nocb_wait_gp(). | ||
1082 | */ | ||
1083 | static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | ||
1084 | unsigned long c, char *s) | ||
1085 | { | ||
1086 | trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, | ||
1087 | rnp->completed, c, rnp->level, | ||
1088 | rnp->grplo, rnp->grphi, s); | ||
1089 | } | ||
1090 | |||
1091 | /* | ||
1092 | * Start some future grace period, as needed to handle newly arrived | ||
1093 | * callbacks. The required future grace periods are recorded in each | ||
1094 | * rcu_node structure's ->need_future_gp field. | ||
1095 | * | ||
1096 | * The caller must hold the specified rcu_node structure's ->lock. | ||
1097 | */ | ||
1098 | static unsigned long __maybe_unused | ||
1099 | rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | ||
1100 | { | ||
1101 | unsigned long c; | ||
1102 | int i; | ||
1103 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); | ||
1104 | |||
1105 | /* | ||
1106 | * Pick up grace-period number for new callbacks. If this | ||
1107 | * grace period is already marked as needed, return to the caller. | ||
1108 | */ | ||
1109 | c = rcu_cbs_completed(rdp->rsp, rnp); | ||
1110 | trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); | ||
1111 | if (rnp->need_future_gp[c & 0x1]) { | ||
1112 | trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); | ||
1113 | return c; | ||
1114 | } | ||
1115 | |||
1116 | /* | ||
1117 | * If either this rcu_node structure or the root rcu_node structure | ||
1118 | * believe that a grace period is in progress, then we must wait | ||
1119 | * for the one following, which is in "c". Because our request | ||
1120 | * will be noticed at the end of the current grace period, we don't | ||
1121 | * need to explicitly start one. | ||
1122 | */ | ||
1123 | if (rnp->gpnum != rnp->completed || | ||
1124 | ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { | ||
1125 | rnp->need_future_gp[c & 0x1]++; | ||
1126 | trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); | ||
1127 | return c; | ||
1128 | } | ||
1129 | |||
1130 | /* | ||
1131 | * There might be no grace period in progress. If we don't already | ||
1132 | * hold it, acquire the root rcu_node structure's lock in order to | ||
1133 | * start one (if needed). | ||
1134 | */ | ||
1135 | if (rnp != rnp_root) | ||
1136 | raw_spin_lock(&rnp_root->lock); | ||
1137 | |||
1138 | /* | ||
1139 | * Get a new grace-period number. If there really is no grace | ||
1140 | * period in progress, it will be smaller than the one we obtained | ||
1141 | * earlier. Adjust callbacks as needed. Note that even no-CBs | ||
1142 | * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. | ||
1143 | */ | ||
1144 | c = rcu_cbs_completed(rdp->rsp, rnp_root); | ||
1145 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) | ||
1146 | if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) | ||
1147 | rdp->nxtcompleted[i] = c; | ||
1148 | |||
1149 | /* | ||
1150 | * If the need for the required grace period is already | ||
1151 | * recorded, trace and leave. | ||
1152 | */ | ||
1153 | if (rnp_root->need_future_gp[c & 0x1]) { | ||
1154 | trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); | ||
1155 | goto unlock_out; | ||
1156 | } | ||
1157 | |||
1158 | /* Record the need for the future grace period. */ | ||
1159 | rnp_root->need_future_gp[c & 0x1]++; | ||
1160 | |||
1161 | /* If a grace period is not already in progress, start one. */ | ||
1162 | if (rnp_root->gpnum != rnp_root->completed) { | ||
1163 | trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); | ||
1164 | } else { | ||
1165 | trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); | ||
1166 | rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); | ||
1167 | } | ||
1168 | unlock_out: | ||
1169 | if (rnp != rnp_root) | ||
1170 | raw_spin_unlock(&rnp_root->lock); | ||
1171 | return c; | ||
1172 | } | ||
1173 | |||
1174 | /* | ||
1175 | * Clean up any old requests for the just-ended grace period. Also return | ||
1176 | * whether any additional grace periods have been requested. Also invoke | ||
1177 | * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads | ||
1178 | * waiting for this grace period to complete. | ||
1179 | */ | ||
1180 | static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | ||
1181 | { | ||
1182 | int c = rnp->completed; | ||
1183 | int needmore; | ||
1184 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
1185 | |||
1186 | rcu_nocb_gp_cleanup(rsp, rnp); | ||
1187 | rnp->need_future_gp[c & 0x1] = 0; | ||
1188 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; | ||
1189 | trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); | ||
1190 | return needmore; | ||
1191 | } | ||
1192 | |||
1193 | /* | ||
1074 | * If there is room, assign a ->completed number to any callbacks on | 1194 | * If there is room, assign a ->completed number to any callbacks on |
1075 | * this CPU that have not already been assigned. Also accelerate any | 1195 | * this CPU that have not already been assigned. Also accelerate any |
1076 | * callbacks that were previously assigned a ->completed number that has | 1196 | * callbacks that were previously assigned a ->completed number that has |
@@ -1129,6 +1249,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | |||
1129 | rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; | 1249 | rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; |
1130 | rdp->nxtcompleted[i] = c; | 1250 | rdp->nxtcompleted[i] = c; |
1131 | } | 1251 | } |
1252 | /* Record any needed additional grace periods. */ | ||
1253 | rcu_start_future_gp(rnp, rdp); | ||
1132 | 1254 | ||
1133 | /* Trace depending on how much we were able to accelerate. */ | 1255 | /* Trace depending on how much we were able to accelerate. */ |
1134 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) | 1256 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) |
@@ -1308,9 +1430,9 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1308 | rdp = this_cpu_ptr(rsp->rda); | 1430 | rdp = this_cpu_ptr(rsp->rda); |
1309 | rcu_preempt_check_blocked_tasks(rnp); | 1431 | rcu_preempt_check_blocked_tasks(rnp); |
1310 | rnp->qsmask = rnp->qsmaskinit; | 1432 | rnp->qsmask = rnp->qsmaskinit; |
1311 | rnp->gpnum = rsp->gpnum; | 1433 | ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; |
1312 | WARN_ON_ONCE(rnp->completed != rsp->completed); | 1434 | WARN_ON_ONCE(rnp->completed != rsp->completed); |
1313 | rnp->completed = rsp->completed; | 1435 | ACCESS_ONCE(rnp->completed) = rsp->completed; |
1314 | if (rnp == rdp->mynode) | 1436 | if (rnp == rdp->mynode) |
1315 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 1437 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
1316 | rcu_preempt_boost_start_gp(rnp); | 1438 | rcu_preempt_boost_start_gp(rnp); |
@@ -1319,7 +1441,8 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1319 | rnp->grphi, rnp->qsmask); | 1441 | rnp->grphi, rnp->qsmask); |
1320 | raw_spin_unlock_irq(&rnp->lock); | 1442 | raw_spin_unlock_irq(&rnp->lock); |
1321 | #ifdef CONFIG_PROVE_RCU_DELAY | 1443 | #ifdef CONFIG_PROVE_RCU_DELAY |
1322 | if ((random32() % (rcu_num_nodes * 8)) == 0) | 1444 | if ((random32() % (rcu_num_nodes * 8)) == 0 && |
1445 | system_state == SYSTEM_RUNNING) | ||
1323 | schedule_timeout_uninterruptible(2); | 1446 | schedule_timeout_uninterruptible(2); |
1324 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ | 1447 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ |
1325 | cond_resched(); | 1448 | cond_resched(); |
@@ -1361,6 +1484,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
1361 | static void rcu_gp_cleanup(struct rcu_state *rsp) | 1484 | static void rcu_gp_cleanup(struct rcu_state *rsp) |
1362 | { | 1485 | { |
1363 | unsigned long gp_duration; | 1486 | unsigned long gp_duration; |
1487 | int nocb = 0; | ||
1364 | struct rcu_data *rdp; | 1488 | struct rcu_data *rdp; |
1365 | struct rcu_node *rnp = rcu_get_root(rsp); | 1489 | struct rcu_node *rnp = rcu_get_root(rsp); |
1366 | 1490 | ||
@@ -1390,17 +1514,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1390 | */ | 1514 | */ |
1391 | rcu_for_each_node_breadth_first(rsp, rnp) { | 1515 | rcu_for_each_node_breadth_first(rsp, rnp) { |
1392 | raw_spin_lock_irq(&rnp->lock); | 1516 | raw_spin_lock_irq(&rnp->lock); |
1393 | rnp->completed = rsp->gpnum; | 1517 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; |
1518 | rdp = this_cpu_ptr(rsp->rda); | ||
1519 | if (rnp == rdp->mynode) | ||
1520 | __rcu_process_gp_end(rsp, rnp, rdp); | ||
1521 | nocb += rcu_future_gp_cleanup(rsp, rnp); | ||
1394 | raw_spin_unlock_irq(&rnp->lock); | 1522 | raw_spin_unlock_irq(&rnp->lock); |
1395 | cond_resched(); | 1523 | cond_resched(); |
1396 | } | 1524 | } |
1397 | rnp = rcu_get_root(rsp); | 1525 | rnp = rcu_get_root(rsp); |
1398 | raw_spin_lock_irq(&rnp->lock); | 1526 | raw_spin_lock_irq(&rnp->lock); |
1527 | rcu_nocb_gp_set(rnp, nocb); | ||
1399 | 1528 | ||
1400 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ | 1529 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ |
1401 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); | 1530 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); |
1402 | rsp->fqs_state = RCU_GP_IDLE; | 1531 | rsp->fqs_state = RCU_GP_IDLE; |
1403 | rdp = this_cpu_ptr(rsp->rda); | 1532 | rdp = this_cpu_ptr(rsp->rda); |
1533 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ | ||
1404 | if (cpu_needs_another_gp(rsp, rdp)) | 1534 | if (cpu_needs_another_gp(rsp, rdp)) |
1405 | rsp->gp_flags = 1; | 1535 | rsp->gp_flags = 1; |
1406 | raw_spin_unlock_irq(&rnp->lock); | 1536 | raw_spin_unlock_irq(&rnp->lock); |
@@ -1476,57 +1606,62 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1476 | /* | 1606 | /* |
1477 | * Start a new RCU grace period if warranted, re-initializing the hierarchy | 1607 | * Start a new RCU grace period if warranted, re-initializing the hierarchy |
1478 | * in preparation for detecting the next grace period. The caller must hold | 1608 | * in preparation for detecting the next grace period. The caller must hold |
1479 | * the root node's ->lock, which is released before return. Hard irqs must | 1609 | * the root node's ->lock and hard irqs must be disabled. |
1480 | * be disabled. | ||
1481 | * | 1610 | * |
1482 | * Note that it is legal for a dying CPU (which is marked as offline) to | 1611 | * Note that it is legal for a dying CPU (which is marked as offline) to |
1483 | * invoke this function. This can happen when the dying CPU reports its | 1612 | * invoke this function. This can happen when the dying CPU reports its |
1484 | * quiescent state. | 1613 | * quiescent state. |
1485 | */ | 1614 | */ |
1486 | static void | 1615 | static void |
1487 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | 1616 | rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
1488 | __releases(rcu_get_root(rsp)->lock) | 1617 | struct rcu_data *rdp) |
1489 | { | 1618 | { |
1490 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 1619 | if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { |
1491 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
1492 | |||
1493 | if (!rsp->gp_kthread || | ||
1494 | !cpu_needs_another_gp(rsp, rdp)) { | ||
1495 | /* | 1620 | /* |
1496 | * Either we have not yet spawned the grace-period | 1621 | * Either we have not yet spawned the grace-period |
1497 | * task, this CPU does not need another grace period, | 1622 | * task, this CPU does not need another grace period, |
1498 | * or a grace period is already in progress. | 1623 | * or a grace period is already in progress. |
1499 | * Either way, don't start a new grace period. | 1624 | * Either way, don't start a new grace period. |
1500 | */ | 1625 | */ |
1501 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1502 | return; | 1626 | return; |
1503 | } | 1627 | } |
1504 | |||
1505 | /* | ||
1506 | * Because there is no grace period in progress right now, | ||
1507 | * any callbacks we have up to this point will be satisfied | ||
1508 | * by the next grace period. So this is a good place to | ||
1509 | * assign a grace period number to recently posted callbacks. | ||
1510 | */ | ||
1511 | rcu_accelerate_cbs(rsp, rnp, rdp); | ||
1512 | |||
1513 | rsp->gp_flags = RCU_GP_FLAG_INIT; | 1628 | rsp->gp_flags = RCU_GP_FLAG_INIT; |
1514 | raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ | ||
1515 | |||
1516 | /* Ensure that CPU is aware of completion of last grace period. */ | ||
1517 | rcu_process_gp_end(rsp, rdp); | ||
1518 | local_irq_restore(flags); | ||
1519 | 1629 | ||
1520 | /* Wake up rcu_gp_kthread() to start the grace period. */ | 1630 | /* Wake up rcu_gp_kthread() to start the grace period. */ |
1521 | wake_up(&rsp->gp_wq); | 1631 | wake_up(&rsp->gp_wq); |
1522 | } | 1632 | } |
1523 | 1633 | ||
1524 | /* | 1634 | /* |
1635 | * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's | ||
1636 | * callbacks. Note that rcu_start_gp_advanced() cannot do this because it | ||
1637 | * is invoked indirectly from rcu_advance_cbs(), which would result in | ||
1638 | * endless recursion -- or would do so if it wasn't for the self-deadlock | ||
1639 | * that is encountered beforehand. | ||
1640 | */ | ||
1641 | static void | ||
1642 | rcu_start_gp(struct rcu_state *rsp) | ||
1643 | { | ||
1644 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
1645 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
1646 | |||
1647 | /* | ||
1648 | * If there is no grace period in progress right now, any | ||
1649 | * callbacks we have up to this point will be satisfied by the | ||
1650 | * next grace period. Also, advancing the callbacks reduces the | ||
1651 | * probability of false positives from cpu_needs_another_gp() | ||
1652 | * resulting in pointless grace periods. So, advance callbacks | ||
1653 | * then start the grace period! | ||
1654 | */ | ||
1655 | rcu_advance_cbs(rsp, rnp, rdp); | ||
1656 | rcu_start_gp_advanced(rsp, rnp, rdp); | ||
1657 | } | ||
1658 | |||
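The comment above the new rcu_start_gp() describes a call-graph constraint: rcu_advance_cbs() can itself conclude that another grace period is needed and (indirectly) reach rcu_start_gp_advanced(), so the advanced variant must not advance callbacks or the two would call each other without bound. The toy model below is plain user-space C with invented names; it only illustrates the layering, not the kernel's locking or the self-deadlock mentioned in the comment.

    #include <stdio.h>

    static int gp_needed;

    /* Low-level helper: reachable from advance_callbacks(), so it must
     * not advance callbacks itself. */
    static void start_gp_advanced(void)
    {
        if (!gp_needed)
            return;
        gp_needed = 0;
        printf("starting a new grace period\n");
    }

    /* Advancing callbacks can discover that another grace period is
     * needed and kick the low-level helper. */
    static void advance_callbacks(void)
    {
        gp_needed = 1;
        start_gp_advanced();
    }

    /* High-level entry point: advance first, then start.  Folding the
     * advance into start_gp_advanced() would create mutual recursion. */
    static void start_gp(void)
    {
        advance_callbacks();
        start_gp_advanced();
    }

    int main(void)
    {
        start_gp();
        return 0;
    }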
1659 | /* | ||
1525 | * Report a full set of quiescent states to the specified rcu_state | 1660 | * Report a full set of quiescent states to the specified rcu_state |
1526 | * data structure. This involves cleaning up after the prior grace | 1661 | * data structure. This involves cleaning up after the prior grace |
1527 | * period and letting rcu_start_gp() start up the next grace period | 1662 | * period and letting rcu_start_gp() start up the next grace period |
1528 | * if one is needed. Note that the caller must hold rnp->lock, as | 1663 | * if one is needed. Note that the caller must hold rnp->lock, which |
1529 | * required by rcu_start_gp(), which will release it. | 1664 | * is released before return. |
1530 | */ | 1665 | */ |
1531 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | 1666 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) |
1532 | __releases(rcu_get_root(rsp)->lock) | 1667 | __releases(rcu_get_root(rsp)->lock) |
@@ -2124,7 +2259,8 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
2124 | local_irq_save(flags); | 2259 | local_irq_save(flags); |
2125 | if (cpu_needs_another_gp(rsp, rdp)) { | 2260 | if (cpu_needs_another_gp(rsp, rdp)) { |
2126 | raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ | 2261 | raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ |
2127 | rcu_start_gp(rsp, flags); /* releases above lock */ | 2262 | rcu_start_gp(rsp); |
2263 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); | ||
2128 | } else { | 2264 | } else { |
2129 | local_irq_restore(flags); | 2265 | local_irq_restore(flags); |
2130 | } | 2266 | } |
@@ -2169,7 +2305,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | |||
2169 | 2305 | ||
2170 | static void invoke_rcu_core(void) | 2306 | static void invoke_rcu_core(void) |
2171 | { | 2307 | { |
2172 | raise_softirq(RCU_SOFTIRQ); | 2308 | if (cpu_online(smp_processor_id())) |
2309 | raise_softirq(RCU_SOFTIRQ); | ||
2173 | } | 2310 | } |
2174 | 2311 | ||
2175 | /* | 2312 | /* |
@@ -2204,11 +2341,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
2204 | 2341 | ||
2205 | /* Start a new grace period if one not already started. */ | 2342 | /* Start a new grace period if one not already started. */ |
2206 | if (!rcu_gp_in_progress(rsp)) { | 2343 | if (!rcu_gp_in_progress(rsp)) { |
2207 | unsigned long nestflag; | ||
2208 | struct rcu_node *rnp_root = rcu_get_root(rsp); | 2344 | struct rcu_node *rnp_root = rcu_get_root(rsp); |
2209 | 2345 | ||
2210 | raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | 2346 | raw_spin_lock(&rnp_root->lock); |
2211 | rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ | 2347 | rcu_start_gp(rsp); |
2348 | raw_spin_unlock(&rnp_root->lock); | ||
2212 | } else { | 2349 | } else { |
2213 | /* Give the grace period a kick. */ | 2350 | /* Give the grace period a kick. */ |
2214 | rdp->blimit = LONG_MAX; | 2351 | rdp->blimit = LONG_MAX; |
@@ -2628,19 +2765,27 @@ static int rcu_pending(int cpu) | |||
2628 | } | 2765 | } |
2629 | 2766 | ||
2630 | /* | 2767 | /* |
2631 | * Check to see if any future RCU-related work will need to be done | 2768 | * Return true if the specified CPU has any callback. If all_lazy is |
2632 | * by the current CPU, even if none need be done immediately, returning | 2769 | * non-NULL, store an indication of whether all callbacks are lazy. |
2633 | * 1 if so. | 2770 | * (If there are no callbacks, all of them are deemed to be lazy.) |
2634 | */ | 2771 | */ |
2635 | static int rcu_cpu_has_callbacks(int cpu) | 2772 | static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) |
2636 | { | 2773 | { |
2774 | bool al = true; | ||
2775 | bool hc = false; | ||
2776 | struct rcu_data *rdp; | ||
2637 | struct rcu_state *rsp; | 2777 | struct rcu_state *rsp; |
2638 | 2778 | ||
2639 | /* RCU callbacks either ready or pending? */ | 2779 | for_each_rcu_flavor(rsp) { |
2640 | for_each_rcu_flavor(rsp) | 2780 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2641 | if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) | 2781 | if (rdp->qlen != rdp->qlen_lazy) |
2642 | return 1; | 2782 | al = false; |
2643 | return 0; | 2783 | if (rdp->nxtlist) |
2784 | hc = true; | ||
2785 | } | ||
2786 | if (all_lazy) | ||
2787 | *all_lazy = al; | ||
2788 | return hc; | ||
2644 | } | 2789 | } |
2645 | 2790 | ||
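The rewritten rcu_cpu_has_callbacks() answers two questions in one pass over the flavors: does this CPU have any callbacks at all, and are all of them lazy (kfree_rcu()-style)? The stand-alone sketch below models the same any/all aggregation over an array of per-flavor counters; the structure and names are illustrative, not kernel API.

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative stand-in for one flavor's per-CPU callback counts. */
    struct flavor_counts {
        long qlen;       /* total queued callbacks      */
        long qlen_lazy;  /* lazy (kfree_rcu-style) part */
    };

    /* Any callbacks at all?  If all_lazy is non-NULL, also report whether
     * every queued callback is lazy (vacuously true when there are none). */
    static bool cpu_has_callbacks(const struct flavor_counts *f, int n,
                                  bool *all_lazy)
    {
        bool al = true;
        bool hc = false;

        for (int i = 0; i < n; i++) {
            if (f[i].qlen != f[i].qlen_lazy)
                al = false;        /* at least one non-lazy callback    */
            if (f[i].qlen)
                hc = true;         /* at least one callback of any kind */
        }
        if (all_lazy)
            *all_lazy = al;
        return hc;
    }

    int main(void)
    {
        struct flavor_counts f[] = { { 3, 3 }, { 2, 2 }, { 0, 0 } };
        bool lazy;
        bool has = cpu_has_callbacks(f, 3, &lazy);

        printf("has=%d all_lazy=%d\n", has, lazy);
        return 0;
    }

The kernel version checks rdp->nxtlist rather than ->qlen to decide whether any callbacks exist, but the aggregation logic is the same.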
2646 | /* | 2791 | /* |
@@ -2859,7 +3004,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2859 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 3004 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
2860 | atomic_set(&rdp->dynticks->dynticks, | 3005 | atomic_set(&rdp->dynticks->dynticks, |
2861 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 3006 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); |
2862 | rcu_prepare_for_idle_init(cpu); | ||
2863 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 3007 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
2864 | 3008 | ||
2865 | /* Add CPU to rcu_node bitmasks. */ | 3009 | /* Add CPU to rcu_node bitmasks. */ |
@@ -2909,7 +3053,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2909 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | 3053 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); |
2910 | struct rcu_node *rnp = rdp->mynode; | 3054 | struct rcu_node *rnp = rdp->mynode; |
2911 | struct rcu_state *rsp; | 3055 | struct rcu_state *rsp; |
2912 | int ret = NOTIFY_OK; | ||
2913 | 3056 | ||
2914 | trace_rcu_utilization("Start CPU hotplug"); | 3057 | trace_rcu_utilization("Start CPU hotplug"); |
2915 | switch (action) { | 3058 | switch (action) { |
@@ -2923,21 +3066,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2923 | rcu_boost_kthread_setaffinity(rnp, -1); | 3066 | rcu_boost_kthread_setaffinity(rnp, -1); |
2924 | break; | 3067 | break; |
2925 | case CPU_DOWN_PREPARE: | 3068 | case CPU_DOWN_PREPARE: |
2926 | if (nocb_cpu_expendable(cpu)) | 3069 | rcu_boost_kthread_setaffinity(rnp, cpu); |
2927 | rcu_boost_kthread_setaffinity(rnp, cpu); | ||
2928 | else | ||
2929 | ret = NOTIFY_BAD; | ||
2930 | break; | 3070 | break; |
2931 | case CPU_DYING: | 3071 | case CPU_DYING: |
2932 | case CPU_DYING_FROZEN: | 3072 | case CPU_DYING_FROZEN: |
2933 | /* | ||
2934 | * The whole machine is "stopped" except this CPU, so we can | ||
2935 | * touch any data without introducing corruption. We send the | ||
2936 | * dying CPU's callbacks to an arbitrarily chosen online CPU. | ||
2937 | */ | ||
2938 | for_each_rcu_flavor(rsp) | 3073 | for_each_rcu_flavor(rsp) |
2939 | rcu_cleanup_dying_cpu(rsp); | 3074 | rcu_cleanup_dying_cpu(rsp); |
2940 | rcu_cleanup_after_idle(cpu); | ||
2941 | break; | 3075 | break; |
2942 | case CPU_DEAD: | 3076 | case CPU_DEAD: |
2943 | case CPU_DEAD_FROZEN: | 3077 | case CPU_DEAD_FROZEN: |
@@ -2950,7 +3084,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2950 | break; | 3084 | break; |
2951 | } | 3085 | } |
2952 | trace_rcu_utilization("End CPU hotplug"); | 3086 | trace_rcu_utilization("End CPU hotplug"); |
2953 | return ret; | 3087 | return NOTIFY_OK; |
2954 | } | 3088 | } |
2955 | 3089 | ||
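With nocb_cpu_expendable() gone, the CPU_DOWN_PREPARE leg no longer vetoes an offline by returning NOTIFY_BAD, which is why the notifier can end with an unconditional NOTIFY_OK; apparently the new rcu_nocb_wait_gp() path lets no-CBs CPUs get grace periods started on their own, so the last non-no-CBs CPU no longer needs protecting. As a reminder of what such a veto does, here is a user-space model (not the kernel notifier API) of a prepare-stage chain in which any callback returning a bad status aborts the operation before it begins.

    #include <stdbool.h>
    #include <stdio.h>

    enum status { OK_TO_PROCEED, VETO };

    typedef enum status (*prepare_fn)(int cpu);

    /* The operation goes ahead only if every registered callback agrees. */
    static bool down_prepare(prepare_fn *chain, int n, int cpu)
    {
        for (int i = 0; i < n; i++)
            if (chain[i](cpu) == VETO)
                return false;      /* abort the offline attempt */
        return true;
    }

    /* Before this patch the RCU callback could veto offlining the last
     * CPU able to start grace periods on behalf of no-CBs CPUs; now it
     * always agrees. */
    static enum status rcu_prepare(int cpu)
    {
        (void)cpu;
        return OK_TO_PROCEED;
    }

    int main(void)
    {
        prepare_fn chain[] = { rcu_prepare };

        printf("offline CPU 3: %s\n",
               down_prepare(chain, 1, 3) ? "proceed" : "vetoed");
        return 0;
    }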
2956 | /* | 3090 | /* |
@@ -3085,6 +3219,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
3085 | } | 3219 | } |
3086 | rnp->level = i; | 3220 | rnp->level = i; |
3087 | INIT_LIST_HEAD(&rnp->blkd_tasks); | 3221 | INIT_LIST_HEAD(&rnp->blkd_tasks); |
3222 | rcu_init_one_nocb(rnp); | ||
3088 | } | 3223 | } |
3089 | } | 3224 | } |
3090 | 3225 | ||
@@ -3170,8 +3305,7 @@ void __init rcu_init(void) | |||
3170 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | 3305 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); |
3171 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | 3306 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); |
3172 | __rcu_init_preempt(); | 3307 | __rcu_init_preempt(); |
3173 | rcu_init_nocb(); | 3308 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); |
3174 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | ||
3175 | 3309 | ||
3176 | /* | 3310 | /* |
3177 | * We don't need protection against CPU-hotplug here because | 3311 | * We don't need protection against CPU-hotplug here because |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index c896b5045d9d..14ee40795d6f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -88,18 +88,13 @@ struct rcu_dynticks { | |||
88 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | 88 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ |
89 | atomic_t dynticks; /* Even value for idle, else odd. */ | 89 | atomic_t dynticks; /* Even value for idle, else odd. */ |
90 | #ifdef CONFIG_RCU_FAST_NO_HZ | 90 | #ifdef CONFIG_RCU_FAST_NO_HZ |
91 | int dyntick_drain; /* Prepare-for-idle state variable. */ | 91 | bool all_lazy; /* Are all CPU's CBs lazy? */ |
92 | unsigned long dyntick_holdoff; | ||
93 | /* No retries for the jiffy of failure. */ | ||
94 | struct timer_list idle_gp_timer; | ||
95 | /* Wake up CPU sleeping with callbacks. */ | ||
96 | unsigned long idle_gp_timer_expires; | ||
97 | /* When to wake up CPU (for repost). */ | ||
98 | bool idle_first_pass; /* First pass of attempt to go idle? */ | ||
99 | unsigned long nonlazy_posted; | 92 | unsigned long nonlazy_posted; |
100 | /* # times non-lazy CBs posted to CPU. */ | 93 | /* # times non-lazy CBs posted to CPU. */ |
101 | unsigned long nonlazy_posted_snap; | 94 | unsigned long nonlazy_posted_snap; |
102 | /* idle-period nonlazy_posted snapshot. */ | 95 | /* idle-period nonlazy_posted snapshot. */ |
96 | unsigned long last_accelerate; | ||
97 | /* Last jiffy CBs were accelerated. */ | ||
103 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ | 98 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ |
104 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 99 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
105 | }; | 100 | }; |
@@ -134,9 +129,6 @@ struct rcu_node { | |||
134 | /* elements that need to drain to allow the */ | 129 | /* elements that need to drain to allow the */ |
135 | /* current expedited grace period to */ | 130 | /* current expedited grace period to */ |
136 | /* complete (only for TREE_PREEMPT_RCU). */ | 131 | /* complete (only for TREE_PREEMPT_RCU). */ |
137 | atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */ | ||
138 | /* Since this has meaning only for leaf */ | ||
139 | /* rcu_node structures, 32 bits suffices. */ | ||
140 | unsigned long qsmaskinit; | 132 | unsigned long qsmaskinit; |
141 | /* Per-GP initial value for qsmask & expmask. */ | 133 | /* Per-GP initial value for qsmask & expmask. */ |
142 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | 134 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ |
@@ -196,6 +188,12 @@ struct rcu_node { | |||
196 | /* Refused to boost: not sure why, though. */ | 188 | /* Refused to boost: not sure why, though. */ |
197 | /* This can happen due to race conditions. */ | 189 | /* This can happen due to race conditions. */ |
198 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 190 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
191 | #ifdef CONFIG_RCU_NOCB_CPU | ||
192 | wait_queue_head_t nocb_gp_wq[2]; | ||
193 | /* Place for rcu_nocb_kthread() to wait GP. */ | ||
194 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
195 | int need_future_gp[2]; | ||
196 | /* Counts of upcoming no-CB GP requests. */ | ||
199 | raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; | 197 | raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; |
200 | } ____cacheline_internodealigned_in_smp; | 198 | } ____cacheline_internodealigned_in_smp; |
201 | 199 | ||
@@ -328,6 +326,11 @@ struct rcu_data { | |||
328 | struct task_struct *nocb_kthread; | 326 | struct task_struct *nocb_kthread; |
329 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | 327 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ |
330 | 328 | ||
329 | /* 8) RCU CPU stall data. */ | ||
330 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
331 | unsigned int softirq_snap; /* Snapshot of softirq activity. */ | ||
332 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
333 | |||
331 | int cpu; | 334 | int cpu; |
332 | struct rcu_state *rsp; | 335 | struct rcu_state *rsp; |
333 | }; | 336 | }; |
@@ -375,12 +378,6 @@ struct rcu_state { | |||
375 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ | 378 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ |
376 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ | 379 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ |
377 | void (*func)(struct rcu_head *head)); | 380 | void (*func)(struct rcu_head *head)); |
378 | #ifdef CONFIG_RCU_NOCB_CPU | ||
379 | void (*call_remote)(struct rcu_head *head, | ||
380 | void (*func)(struct rcu_head *head)); | ||
381 | /* call_rcu() flavor, but for */ | ||
382 | /* placing on remote CPU. */ | ||
383 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
384 | 381 | ||
385 | /* The following fields are guarded by the root rcu_node's lock. */ | 382 | /* The following fields are guarded by the root rcu_node's lock. */ |
386 | 383 | ||
@@ -443,6 +440,7 @@ struct rcu_state { | |||
443 | unsigned long gp_max; /* Maximum GP duration in */ | 440 | unsigned long gp_max; /* Maximum GP duration in */ |
444 | /* jiffies. */ | 441 | /* jiffies. */ |
445 | char *name; /* Name of structure. */ | 442 | char *name; /* Name of structure. */ |
443 | char abbr; /* Abbreviated name. */ | ||
446 | struct list_head flavors; /* List of RCU flavors. */ | 444 | struct list_head flavors; /* List of RCU flavors. */ |
447 | }; | 445 | }; |
448 | 446 | ||
@@ -520,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
520 | struct rcu_node *rnp); | 518 | struct rcu_node *rnp); |
521 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 519 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
522 | static void __cpuinit rcu_prepare_kthreads(int cpu); | 520 | static void __cpuinit rcu_prepare_kthreads(int cpu); |
523 | static void rcu_prepare_for_idle_init(int cpu); | ||
524 | static void rcu_cleanup_after_idle(int cpu); | 521 | static void rcu_cleanup_after_idle(int cpu); |
525 | static void rcu_prepare_for_idle(int cpu); | 522 | static void rcu_prepare_for_idle(int cpu); |
526 | static void rcu_idle_count_callbacks_posted(void); | 523 | static void rcu_idle_count_callbacks_posted(void); |
@@ -529,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | |||
529 | static void print_cpu_stall_info_end(void); | 526 | static void print_cpu_stall_info_end(void); |
530 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); | 527 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); |
531 | static void increment_cpu_stall_ticks(void); | 528 | static void increment_cpu_stall_ticks(void); |
529 | static int rcu_nocb_needs_gp(struct rcu_state *rsp); | ||
530 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); | ||
531 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); | ||
532 | static void rcu_init_one_nocb(struct rcu_node *rnp); | ||
532 | static bool is_nocb_cpu(int cpu); | 533 | static bool is_nocb_cpu(int cpu); |
533 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | 534 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, |
534 | bool lazy); | 535 | bool lazy); |
535 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | 536 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, |
536 | struct rcu_data *rdp); | 537 | struct rcu_data *rdp); |
537 | static bool nocb_cpu_expendable(int cpu); | ||
538 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | 538 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); |
539 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | 539 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); |
540 | static void init_nocb_callback_list(struct rcu_data *rdp); | 540 | static bool init_nocb_callback_list(struct rcu_data *rdp); |
541 | static void __init rcu_init_nocb(void); | ||
542 | 541 | ||
543 | #endif /* #ifndef RCU_TREE_NONCORE */ | 542 | #endif /* #ifndef RCU_TREE_NONCORE */ |
544 | 543 | ||
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c1cc7e17ff9d..d084ae3f281c 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
@@ -85,11 +85,21 @@ static void __init rcu_bootup_announce_oddness(void) | |||
85 | if (nr_cpu_ids != NR_CPUS) | 85 | if (nr_cpu_ids != NR_CPUS) |
86 | printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | 86 | printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); |
87 | #ifdef CONFIG_RCU_NOCB_CPU | 87 | #ifdef CONFIG_RCU_NOCB_CPU |
88 | #ifndef CONFIG_RCU_NOCB_CPU_NONE | ||
89 | if (!have_rcu_nocb_mask) { | ||
90 | alloc_bootmem_cpumask_var(&rcu_nocb_mask); | ||
91 | have_rcu_nocb_mask = true; | ||
92 | } | ||
93 | #ifdef CONFIG_RCU_NOCB_CPU_ZERO | ||
94 | pr_info("\tExperimental no-CBs CPU 0\n"); | ||
95 | cpumask_set_cpu(0, rcu_nocb_mask); | ||
96 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ | ||
97 | #ifdef CONFIG_RCU_NOCB_CPU_ALL | ||
98 | pr_info("\tExperimental no-CBs for all CPUs\n"); | ||
99 | cpumask_setall(rcu_nocb_mask); | ||
100 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ | ||
101 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ | ||
88 | if (have_rcu_nocb_mask) { | 102 | if (have_rcu_nocb_mask) { |
89 | if (cpumask_test_cpu(0, rcu_nocb_mask)) { | ||
90 | cpumask_clear_cpu(0, rcu_nocb_mask); | ||
91 | pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n"); | ||
92 | } | ||
93 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | 103 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); |
94 | pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); | 104 | pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); |
95 | if (rcu_nocb_poll) | 105 | if (rcu_nocb_poll) |
@@ -101,7 +111,7 @@ static void __init rcu_bootup_announce_oddness(void) | |||
101 | #ifdef CONFIG_TREE_PREEMPT_RCU | 111 | #ifdef CONFIG_TREE_PREEMPT_RCU |
102 | 112 | ||
103 | struct rcu_state rcu_preempt_state = | 113 | struct rcu_state rcu_preempt_state = |
104 | RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); | 114 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); |
105 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); | 115 | DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); |
106 | static struct rcu_state *rcu_state = &rcu_preempt_state; | 116 | static struct rcu_state *rcu_state = &rcu_preempt_state; |
107 | 117 | ||
@@ -1533,14 +1543,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) | |||
1533 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | 1543 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) |
1534 | { | 1544 | { |
1535 | *delta_jiffies = ULONG_MAX; | 1545 | *delta_jiffies = ULONG_MAX; |
1536 | return rcu_cpu_has_callbacks(cpu); | 1546 | return rcu_cpu_has_callbacks(cpu, NULL); |
1537 | } | ||
1538 | |||
1539 | /* | ||
1540 | * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. | ||
1541 | */ | ||
1542 | static void rcu_prepare_for_idle_init(int cpu) | ||
1543 | { | ||
1544 | } | 1547 | } |
1545 | 1548 | ||
1546 | /* | 1549 | /* |
@@ -1577,16 +1580,6 @@ static void rcu_idle_count_callbacks_posted(void) | |||
1577 | * | 1580 | * |
1578 | * The following three preprocessor symbols control this state machine: | 1581 | * The following three preprocessor symbols control this state machine: |
1579 | * | 1582 | * |
1580 | * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt | ||
1581 | * to satisfy RCU. Beyond this point, it is better to incur a periodic | ||
1582 | * scheduling-clock interrupt than to loop through the state machine | ||
1583 | * at full power. | ||
1584 | * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are | ||
1585 | * optional if RCU does not need anything immediately from this | ||
1586 | * CPU, even if this CPU still has RCU callbacks queued. The first | ||
1587 | * times through the state machine are mandatory: we need to give | ||
1588 | * the state machine a chance to communicate a quiescent state | ||
1589 | * to the RCU core. | ||
1590 | * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted | 1583 | * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted |
1591 | * to sleep in dyntick-idle mode with RCU callbacks pending. This | 1584 | * to sleep in dyntick-idle mode with RCU callbacks pending. This |
1592 | * is sized to be roughly one RCU grace period. Those energy-efficiency | 1585 | * is sized to be roughly one RCU grace period. Those energy-efficiency |
@@ -1602,186 +1595,108 @@ static void rcu_idle_count_callbacks_posted(void) | |||
1602 | * adjustment, they can be converted into kernel config parameters, though | 1595 | * adjustment, they can be converted into kernel config parameters, though |
1603 | * making the state machine smarter might be a better option. | 1596 | * making the state machine smarter might be a better option. |
1604 | */ | 1597 | */ |
1605 | #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ | ||
1606 | #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ | ||
1607 | #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ | 1598 | #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ |
1608 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | 1599 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ |
1609 | 1600 | ||
1610 | extern int tick_nohz_enabled; | 1601 | static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; |
1611 | 1602 | module_param(rcu_idle_gp_delay, int, 0644); | |
1612 | /* | 1603 | static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; |
1613 | * Does the specified flavor of RCU have non-lazy callbacks pending on | 1604 | module_param(rcu_idle_lazy_gp_delay, int, 0644); |
1614 | * the specified CPU? Both RCU flavor and CPU are specified by the | ||
1615 | * rcu_data structure. | ||
1616 | */ | ||
1617 | static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) | ||
1618 | { | ||
1619 | return rdp->qlen != rdp->qlen_lazy; | ||
1620 | } | ||
1621 | 1605 | ||
1622 | #ifdef CONFIG_TREE_PREEMPT_RCU | 1606 | extern int tick_nohz_enabled; |
1623 | 1607 | ||
1624 | /* | 1608 | /* |
1625 | * Are there non-lazy RCU-preempt callbacks? (There cannot be if there | 1609 | * Try to advance callbacks for all flavors of RCU on the current CPU. |
1626 | * is no RCU-preempt in the kernel.) | 1610 | * Afterwards, if there are any callbacks ready for immediate invocation, |
1611 | * return true. | ||
1627 | */ | 1612 | */ |
1628 | static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) | 1613 | static bool rcu_try_advance_all_cbs(void) |
1629 | { | 1614 | { |
1630 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | 1615 | bool cbs_ready = false; |
1631 | 1616 | struct rcu_data *rdp; | |
1632 | return __rcu_cpu_has_nonlazy_callbacks(rdp); | 1617 | struct rcu_node *rnp; |
1633 | } | 1618 | struct rcu_state *rsp; |
1634 | |||
1635 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
1636 | 1619 | ||
1637 | static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) | 1620 | for_each_rcu_flavor(rsp) { |
1638 | { | 1621 | rdp = this_cpu_ptr(rsp->rda); |
1639 | return 0; | 1622 | rnp = rdp->mynode; |
1640 | } | ||
1641 | 1623 | ||
1642 | #endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1624 | /* |
1625 | * Don't bother checking unless a grace period has | ||
1626 | * completed since we last checked and there are | ||
1627 | * callbacks not yet ready to invoke. | ||
1628 | */ | ||
1629 | if (rdp->completed != rnp->completed && | ||
1630 | rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) | ||
1631 | rcu_process_gp_end(rsp, rdp); | ||
1643 | 1632 | ||
1644 | /* | 1633 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
1645 | * Does any flavor of RCU have non-lazy callbacks on the specified CPU? | 1634 | cbs_ready = true; |
1646 | */ | 1635 | } |
1647 | static bool rcu_cpu_has_nonlazy_callbacks(int cpu) | 1636 | return cbs_ready; |
1648 | { | ||
1649 | return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) || | ||
1650 | __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) || | ||
1651 | rcu_preempt_cpu_has_nonlazy_callbacks(cpu); | ||
1652 | } | 1637 | } |
1653 | 1638 | ||
1654 | /* | 1639 | /* |
1655 | * Allow the CPU to enter dyntick-idle mode if either: (1) There are no | 1640 | * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready |
1656 | * callbacks on this CPU, (2) this CPU has not yet attempted to enter | 1641 | * to invoke. If the CPU has callbacks, try to advance them. Tell the |
1657 | * dyntick-idle mode, or (3) this CPU is in the process of attempting to | 1642 | * caller to set the timeout based on whether or not there are non-lazy |
1658 | * enter dyntick-idle mode. Otherwise, if we have recently tried and failed | 1643 | * callbacks. |
1659 | * to enter dyntick-idle mode, we refuse to try to enter it. After all, | ||
1660 | * it is better to incur scheduling-clock interrupts than to spin | ||
1661 | * continuously for the same time duration! | ||
1662 | * | 1644 | * |
1663 | * The delta_jiffies argument is used to store the time when RCU is | 1645 | * The caller must have disabled interrupts. |
1664 | * going to need the CPU again if it still has callbacks. The reason | ||
1665 | * for this is that rcu_prepare_for_idle() might need to post a timer, | ||
1666 | * but if so, it will do so after tick_nohz_stop_sched_tick() has set | ||
1667 | * the wakeup time for this CPU. This means that RCU's timer can be | ||
1668 | * delayed until the wakeup time, which defeats the purpose of posting | ||
1669 | * a timer. | ||
1670 | */ | 1646 | */ |
1671 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | 1647 | int rcu_needs_cpu(int cpu, unsigned long *dj) |
1672 | { | 1648 | { |
1673 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1649 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
1674 | 1650 | ||
1675 | /* Flag a new idle sojourn to the idle-entry state machine. */ | 1651 | /* Snapshot to detect later posting of non-lazy callback. */ |
1676 | rdtp->idle_first_pass = 1; | 1652 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; |
1653 | |||
1677 | /* If no callbacks, RCU doesn't need the CPU. */ | 1654 | /* If no callbacks, RCU doesn't need the CPU. */ |
1678 | if (!rcu_cpu_has_callbacks(cpu)) { | 1655 | if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { |
1679 | *delta_jiffies = ULONG_MAX; | 1656 | *dj = ULONG_MAX; |
1680 | return 0; | 1657 | return 0; |
1681 | } | 1658 | } |
1682 | if (rdtp->dyntick_holdoff == jiffies) { | 1659 | |
1683 | /* RCU recently tried and failed, so don't try again. */ | 1660 | /* Attempt to advance callbacks. */ |
1684 | *delta_jiffies = 1; | 1661 | if (rcu_try_advance_all_cbs()) { |
1662 | /* Some ready to invoke, so initiate later invocation. */ | ||
1663 | invoke_rcu_core(); | ||
1685 | return 1; | 1664 | return 1; |
1686 | } | 1665 | } |
1687 | /* Set up for the possibility that RCU will post a timer. */ | 1666 | rdtp->last_accelerate = jiffies; |
1688 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { | 1667 | |
1689 | *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, | 1668 | /* Request timer delay depending on laziness, and round. */ |
1690 | RCU_IDLE_GP_DELAY) - jiffies; | 1669 | if (rdtp->all_lazy) { |
1670 | *dj = round_up(rcu_idle_gp_delay + jiffies, | ||
1671 | rcu_idle_gp_delay) - jiffies; | ||
1691 | } else { | 1672 | } else { |
1692 | *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; | 1673 | *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; |
1693 | *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; | ||
1694 | } | 1674 | } |
1695 | return 0; | 1675 | return 0; |
1696 | } | 1676 | } |
1697 | 1677 | ||
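The timeout returned through *dj is deliberately rounded: when non-lazy callbacks are present the wakeup is rounded up to the next multiple of rcu_idle_gp_delay, which tends to land sleeping CPUs on the same future jiffy so their wakeups can be batched, and when only lazy callbacks are present round_jiffies() pushes the roughly six-second wakeup onto a whole-second boundary. The stand-alone program below reproduces only the round-up arithmetic with plain integers; the values are samples, not anything measured.

    #include <stdio.h>

    /* Round x up to the next multiple of y (a power of two in the RCU
     * case, though the arithmetic below works for any positive y). */
    static unsigned long round_up_to(unsigned long x, unsigned long y)
    {
        return ((x + y - 1) / y) * y;
    }

    int main(void)
    {
        unsigned long jiffies = 10007;   /* sample "current time"        */
        unsigned long gp_delay = 4;      /* rcu_idle_gp_delay-like value */

        /* Sleep time as rcu_needs_cpu() would report it for a CPU whose
         * callbacks include at least one non-lazy one: */
        unsigned long dj = round_up_to(jiffies + gp_delay, gp_delay) - jiffies;

        printf("sleep %lu jiffies, waking at %lu (a multiple of %lu)\n",
               dj, jiffies + dj, gp_delay);
        return 0;
    }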
1698 | /* | 1678 | /* |
1699 | * Handler for smp_call_function_single(). The only point of this | 1679 | * Prepare a CPU for idle from an RCU perspective. The first major task |
1700 | * handler is to wake the CPU up, so the handler does only tracing. | 1680 | * is to sense whether nohz mode has been enabled or disabled via sysfs. |
1701 | */ | 1681 | * The second major task is to check to see if a non-lazy callback has |
1702 | void rcu_idle_demigrate(void *unused) | 1682 | * arrived at a CPU that previously had only lazy callbacks. The third |
1703 | { | 1683 | * major task is to accelerate (that is, assign grace-period numbers to) |
1704 | trace_rcu_prep_idle("Demigrate"); | 1684 | * any recently arrived callbacks. |
1705 | } | ||
1706 | |||
1707 | /* | ||
1708 | * Timer handler used to force CPU to start pushing its remaining RCU | ||
1709 | * callbacks in the case where it entered dyntick-idle mode with callbacks | ||
1710 | * pending. The hander doesn't really need to do anything because the | ||
1711 | * real work is done upon re-entry to idle, or by the next scheduling-clock | ||
1712 | * interrupt should idle not be re-entered. | ||
1713 | * | ||
1714 | * One special case: the timer gets migrated without awakening the CPU | ||
1715 | * on which the timer was scheduled on. In this case, we must wake up | ||
1716 | * that CPU. We do so with smp_call_function_single(). | ||
1717 | */ | ||
1718 | static void rcu_idle_gp_timer_func(unsigned long cpu_in) | ||
1719 | { | ||
1720 | int cpu = (int)cpu_in; | ||
1721 | |||
1722 | trace_rcu_prep_idle("Timer"); | ||
1723 | if (cpu != smp_processor_id()) | ||
1724 | smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); | ||
1725 | else | ||
1726 | WARN_ON_ONCE(1); /* Getting here can hang the system... */ | ||
1727 | } | ||
1728 | |||
1729 | /* | ||
1730 | * Initialize the timer used to pull CPUs out of dyntick-idle mode. | ||
1731 | */ | ||
1732 | static void rcu_prepare_for_idle_init(int cpu) | ||
1733 | { | ||
1734 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
1735 | |||
1736 | rdtp->dyntick_holdoff = jiffies - 1; | ||
1737 | setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); | ||
1738 | rdtp->idle_gp_timer_expires = jiffies - 1; | ||
1739 | rdtp->idle_first_pass = 1; | ||
1740 | } | ||
1741 | |||
1742 | /* | ||
1743 | * Clean up for exit from idle. Because we are exiting from idle, there | ||
1744 | * is no longer any point to ->idle_gp_timer, so cancel it. This will | ||
1745 | * do nothing if this timer is not active, so just cancel it unconditionally. | ||
1746 | */ | ||
1747 | static void rcu_cleanup_after_idle(int cpu) | ||
1748 | { | ||
1749 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
1750 | |||
1751 | del_timer(&rdtp->idle_gp_timer); | ||
1752 | trace_rcu_prep_idle("Cleanup after idle"); | ||
1753 | rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); | ||
1754 | } | ||
1755 | |||
1756 | /* | ||
1757 | * Check to see if any RCU-related work can be done by the current CPU, | ||
1758 | * and if so, schedule a softirq to get it done. This function is part | ||
1759 | * of the RCU implementation; it is -not- an exported member of the RCU API. | ||
1760 | * | ||
1761 | * The idea is for the current CPU to clear out all work required by the | ||
1762 | * RCU core for the current grace period, so that this CPU can be permitted | ||
1763 | * to enter dyntick-idle mode. In some cases, it will need to be awakened | ||
1764 | * at the end of the grace period by whatever CPU ends the grace period. | ||
1765 | * This allows CPUs to go dyntick-idle more quickly, and to reduce the | ||
1766 | * number of wakeups by a modest integer factor. | ||
1767 | * | ||
1768 | * Because it is not legal to invoke rcu_process_callbacks() with irqs | ||
1769 | * disabled, we do one pass of force_quiescent_state(), then do a | ||
1770 | * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked | ||
1771 | * later. The ->dyntick_drain field controls the sequencing. | ||
1772 | * | 1685 | * |
1773 | * The caller must have disabled interrupts. | 1686 | * The caller must have disabled interrupts. |
1774 | */ | 1687 | */ |
1775 | static void rcu_prepare_for_idle(int cpu) | 1688 | static void rcu_prepare_for_idle(int cpu) |
1776 | { | 1689 | { |
1777 | struct timer_list *tp; | 1690 | struct rcu_data *rdp; |
1778 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1691 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
1692 | struct rcu_node *rnp; | ||
1693 | struct rcu_state *rsp; | ||
1779 | int tne; | 1694 | int tne; |
1780 | 1695 | ||
1781 | /* Handle nohz enablement switches conservatively. */ | 1696 | /* Handle nohz enablement switches conservatively. */ |
1782 | tne = ACCESS_ONCE(tick_nohz_enabled); | 1697 | tne = ACCESS_ONCE(tick_nohz_enabled); |
1783 | if (tne != rdtp->tick_nohz_enabled_snap) { | 1698 | if (tne != rdtp->tick_nohz_enabled_snap) { |
1784 | if (rcu_cpu_has_callbacks(cpu)) | 1699 | if (rcu_cpu_has_callbacks(cpu, NULL)) |
1785 | invoke_rcu_core(); /* force nohz to see update. */ | 1700 | invoke_rcu_core(); /* force nohz to see update. */ |
1786 | rdtp->tick_nohz_enabled_snap = tne; | 1701 | rdtp->tick_nohz_enabled_snap = tne; |
1787 | return; | 1702 | return; |
@@ -1789,125 +1704,56 @@ static void rcu_prepare_for_idle(int cpu) | |||
1789 | if (!tne) | 1704 | if (!tne) |
1790 | return; | 1705 | return; |
1791 | 1706 | ||
1792 | /* Adaptive-tick mode, where usermode execution is idle to RCU. */ | 1707 | /* If this is a no-CBs CPU, no callbacks, just return. */ |
1793 | if (!is_idle_task(current)) { | 1708 | if (is_nocb_cpu(cpu)) |
1794 | rdtp->dyntick_holdoff = jiffies - 1; | ||
1795 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { | ||
1796 | trace_rcu_prep_idle("User dyntick with callbacks"); | ||
1797 | rdtp->idle_gp_timer_expires = | ||
1798 | round_up(jiffies + RCU_IDLE_GP_DELAY, | ||
1799 | RCU_IDLE_GP_DELAY); | ||
1800 | } else if (rcu_cpu_has_callbacks(cpu)) { | ||
1801 | rdtp->idle_gp_timer_expires = | ||
1802 | round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); | ||
1803 | trace_rcu_prep_idle("User dyntick with lazy callbacks"); | ||
1804 | } else { | ||
1805 | return; | ||
1806 | } | ||
1807 | tp = &rdtp->idle_gp_timer; | ||
1808 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); | ||
1809 | return; | 1709 | return; |
1810 | } | ||
1811 | 1710 | ||
1812 | /* | 1711 | /* |
1813 | * If this is an idle re-entry, for example, due to use of | 1712 | * If a non-lazy callback arrived at a CPU having only lazy |
1814 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | 1713 | * callbacks, invoke RCU core for the side-effect of recalculating |
1815 | * loop, then don't take any state-machine actions, unless the | 1714 | * idle duration on re-entry to idle. |
1816 | * momentary exit from idle queued additional non-lazy callbacks. | ||
1817 | * Instead, repost the ->idle_gp_timer if this CPU has callbacks | ||
1818 | * pending. | ||
1819 | */ | 1715 | */ |
1820 | if (!rdtp->idle_first_pass && | 1716 | if (rdtp->all_lazy && |
1821 | (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { | 1717 | rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { |
1822 | if (rcu_cpu_has_callbacks(cpu)) { | 1718 | invoke_rcu_core(); |
1823 | tp = &rdtp->idle_gp_timer; | ||
1824 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); | ||
1825 | } | ||
1826 | return; | 1719 | return; |
1827 | } | 1720 | } |
1828 | rdtp->idle_first_pass = 0; | ||
1829 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; | ||
1830 | 1721 | ||
1831 | /* | 1722 | /* |
1832 | * If there are no callbacks on this CPU, enter dyntick-idle mode. | 1723 | * If we have not yet accelerated this jiffy, accelerate all |
1833 | * Also reset state to avoid prejudicing later attempts. | 1724 | * callbacks on this CPU. |
1834 | */ | 1725 | */ |
1835 | if (!rcu_cpu_has_callbacks(cpu)) { | 1726 | if (rdtp->last_accelerate == jiffies) |
1836 | rdtp->dyntick_holdoff = jiffies - 1; | ||
1837 | rdtp->dyntick_drain = 0; | ||
1838 | trace_rcu_prep_idle("No callbacks"); | ||
1839 | return; | 1727 | return; |
1728 | rdtp->last_accelerate = jiffies; | ||
1729 | for_each_rcu_flavor(rsp) { | ||
1730 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
1731 | if (!*rdp->nxttail[RCU_DONE_TAIL]) | ||
1732 | continue; | ||
1733 | rnp = rdp->mynode; | ||
1734 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
1735 | rcu_accelerate_cbs(rsp, rnp, rdp); | ||
1736 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
1840 | } | 1737 | } |
1738 | } | ||
1841 | 1739 | ||
1842 | /* | 1740 | /* |
1843 | * If in holdoff mode, just return. We will presumably have | 1741 | * Clean up for exit from idle. Attempt to advance callbacks based on |
1844 | * refrained from disabling the scheduling-clock tick. | 1742 | * any grace periods that elapsed while the CPU was idle, and if any |
1845 | */ | 1743 | * callbacks are now ready to invoke, initiate invocation. |
1846 | if (rdtp->dyntick_holdoff == jiffies) { | 1744 | */ |
1847 | trace_rcu_prep_idle("In holdoff"); | 1745 | static void rcu_cleanup_after_idle(int cpu) |
1848 | return; | 1746 | { |
1849 | } | 1747 | struct rcu_data *rdp; |
1748 | struct rcu_state *rsp; | ||
1850 | 1749 | ||
1851 | /* Check and update the ->dyntick_drain sequencing. */ | 1750 | if (is_nocb_cpu(cpu)) |
1852 | if (rdtp->dyntick_drain <= 0) { | ||
1853 | /* First time through, initialize the counter. */ | ||
1854 | rdtp->dyntick_drain = RCU_IDLE_FLUSHES; | ||
1855 | } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && | ||
1856 | !rcu_pending(cpu) && | ||
1857 | !local_softirq_pending()) { | ||
1858 | /* Can we go dyntick-idle despite still having callbacks? */ | ||
1859 | rdtp->dyntick_drain = 0; | ||
1860 | rdtp->dyntick_holdoff = jiffies; | ||
1861 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { | ||
1862 | trace_rcu_prep_idle("Dyntick with callbacks"); | ||
1863 | rdtp->idle_gp_timer_expires = | ||
1864 | round_up(jiffies + RCU_IDLE_GP_DELAY, | ||
1865 | RCU_IDLE_GP_DELAY); | ||
1866 | } else { | ||
1867 | rdtp->idle_gp_timer_expires = | ||
1868 | round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); | ||
1869 | trace_rcu_prep_idle("Dyntick with lazy callbacks"); | ||
1870 | } | ||
1871 | tp = &rdtp->idle_gp_timer; | ||
1872 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); | ||
1873 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
1874 | return; /* Nothing more to do immediately. */ | ||
1875 | } else if (--(rdtp->dyntick_drain) <= 0) { | ||
1876 | /* We have hit the limit, so time to give up. */ | ||
1877 | rdtp->dyntick_holdoff = jiffies; | ||
1878 | trace_rcu_prep_idle("Begin holdoff"); | ||
1879 | invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ | ||
1880 | return; | 1751 | return; |
1881 | } | 1752 | rcu_try_advance_all_cbs(); |
1882 | 1753 | for_each_rcu_flavor(rsp) { | |
1883 | /* | 1754 | rdp = per_cpu_ptr(rsp->rda, cpu); |
1884 | * Do one step of pushing the remaining RCU callbacks through | 1755 | if (cpu_has_callbacks_ready_to_invoke(rdp)) |
1885 | * the RCU core state machine. | 1756 | invoke_rcu_core(); |
1886 | */ | ||
1887 | #ifdef CONFIG_TREE_PREEMPT_RCU | ||
1888 | if (per_cpu(rcu_preempt_data, cpu).nxtlist) { | ||
1889 | rcu_preempt_qs(cpu); | ||
1890 | force_quiescent_state(&rcu_preempt_state); | ||
1891 | } | ||
1892 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
1893 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { | ||
1894 | rcu_sched_qs(cpu); | ||
1895 | force_quiescent_state(&rcu_sched_state); | ||
1896 | } | ||
1897 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { | ||
1898 | rcu_bh_qs(cpu); | ||
1899 | force_quiescent_state(&rcu_bh_state); | ||
1900 | } | ||
1901 | |||
1902 | /* | ||
1903 | * If RCU callbacks are still pending, RCU still needs this CPU. | ||
1904 | * So try forcing the callbacks through the grace period. | ||
1905 | */ | ||
1906 | if (rcu_cpu_has_callbacks(cpu)) { | ||
1907 | trace_rcu_prep_idle("More callbacks"); | ||
1908 | invoke_rcu_core(); | ||
1909 | } else { | ||
1910 | trace_rcu_prep_idle("Callbacks drained"); | ||
1911 | } | 1757 | } |
1912 | } | 1758 | } |
1913 | 1759 | ||
@@ -2015,16 +1861,13 @@ early_initcall(rcu_register_oom_notifier); | |||
2015 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | 1861 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) |
2016 | { | 1862 | { |
2017 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1863 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
2018 | struct timer_list *tltp = &rdtp->idle_gp_timer; | 1864 | unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; |
2019 | char c; | ||
2020 | 1865 | ||
2021 | c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; | 1866 | sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", |
2022 | if (timer_pending(tltp)) | 1867 | rdtp->last_accelerate & 0xffff, jiffies & 0xffff, |
2023 | sprintf(cp, "drain=%d %c timer=%lu", | 1868 | ulong2long(nlpd), |
2024 | rdtp->dyntick_drain, c, tltp->expires - jiffies); | 1869 | rdtp->all_lazy ? 'L' : '.', |
2025 | else | 1870 | rdtp->tick_nohz_enabled_snap ? '.' : 'D'); |
2026 | sprintf(cp, "drain=%d %c timer not pending", | ||
2027 | rdtp->dyntick_drain, c); | ||
2028 | } | 1871 | } |
2029 | 1872 | ||
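The RCU_FAST_NO_HZ stall annotation now reports the low 16 bits of ->last_accelerate and of jiffies, the (signed) number of non-lazy callbacks posted since the idle-entry snapshot, an 'L' when every queued callback is lazy, and a 'D' when nohz has been disabled. The tiny program below merely exercises the same format string with made-up values so the field layout is visible; none of the numbers come from a real stall report.

    #include <stdio.h>

    int main(void)
    {
        char cp[96];

        /* Sample values only. */
        unsigned long last_accelerate = 0x1a2b3c;
        unsigned long jiffies = 0x1a2f00;
        long nlpd = 7;                 /* nonlazy_posted - snapshot */
        int all_lazy = 0;
        int tick_nohz_enabled_snap = 1;

        sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
                last_accelerate & 0xffff, jiffies & 0xffff,
                nlpd,
                all_lazy ? 'L' : '.',
                tick_nohz_enabled_snap ? '.' : 'D');
        printf("%s\n", cp);   /* last_accelerate: 2b3c/2f00, nonlazy_posted: 7, .. */
        return 0;
    }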
2030 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 1873 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
@@ -2070,10 +1913,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | |||
2070 | ticks_value = rsp->gpnum - rdp->gpnum; | 1913 | ticks_value = rsp->gpnum - rdp->gpnum; |
2071 | } | 1914 | } |
2072 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); | 1915 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); |
2073 | printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", | 1916 | printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", |
2074 | cpu, ticks_value, ticks_title, | 1917 | cpu, ticks_value, ticks_title, |
2075 | atomic_read(&rdtp->dynticks) & 0xfff, | 1918 | atomic_read(&rdtp->dynticks) & 0xfff, |
2076 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, | 1919 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, |
1920 | rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), | ||
2077 | fast_no_hz); | 1921 | fast_no_hz); |
2078 | } | 1922 | } |
2079 | 1923 | ||
@@ -2087,6 +1931,7 @@ static void print_cpu_stall_info_end(void) | |||
2087 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) | 1931 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) |
2088 | { | 1932 | { |
2089 | rdp->ticks_this_gp = 0; | 1933 | rdp->ticks_this_gp = 0; |
1934 | rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); | ||
2090 | } | 1935 | } |
2091 | 1936 | ||
2092 | /* Increment ->ticks_this_gp for all flavors of RCU. */ | 1937 | /* Increment ->ticks_this_gp for all flavors of RCU. */ |
@@ -2165,6 +2010,47 @@ static int __init parse_rcu_nocb_poll(char *arg) | |||
2165 | } | 2010 | } |
2166 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); | 2011 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); |
2167 | 2012 | ||
2013 | /* | ||
2014 | * Do any no-CBs CPUs need another grace period? | ||
2015 | * | ||
2016 | * Interrupts must be disabled. If the caller does not hold the root | ||
2017 | * rcu_node structure's ->lock, the results are advisory only. | ||
2018 | */ | ||
2019 | static int rcu_nocb_needs_gp(struct rcu_state *rsp) | ||
2020 | { | ||
2021 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
2022 | |||
2023 | return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; | ||
2024 | } | ||
2025 | |||
2026 | /* | ||
2027 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended | ||
2028 | * grace period. | ||
2029 | */ | ||
2030 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | ||
2031 | { | ||
2032 | wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); | ||
2033 | } | ||
2034 | |||
2035 | /* | ||
2036 | * Set the root rcu_node structure's ->need_future_gp field | ||
2037 | * based on the sum of those of all rcu_node structures. This does | ||
2038 | * double-count the root rcu_node structure's requests, but this | ||
2039 | * is necessary to handle the possibility of a rcu_nocb_kthread() | ||
2040 | * having awakened during the time that the rcu_node structures | ||
2041 | * were being updated for the end of the previous grace period. | ||
2042 | */ | ||
2043 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) | ||
2044 | { | ||
2045 | rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; | ||
2046 | } | ||
2047 | |||
2048 | static void rcu_init_one_nocb(struct rcu_node *rnp) | ||
2049 | { | ||
2050 | init_waitqueue_head(&rnp->nocb_gp_wq[0]); | ||
2051 | init_waitqueue_head(&rnp->nocb_gp_wq[1]); | ||
2052 | } | ||
2053 | |||
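The no-CBs bookkeeping added here keys everything off the low bit of the grace-period number: a request for the grace period after the one numbered completed is counted in ->need_future_gp[(completed + 1) & 0x1], and when a grace period finishes, sleepers on ->nocb_gp_wq[completed & 0x1] are woken, so the slot being requested and the slot being retired never collide. The user-space simulation below walks through the slot arithmetic; the names are illustrative and the wait queues are reduced to a printout.

    #include <stdio.h>

    static unsigned long completed;   /* number of the last finished GP */
    static int need_future_gp[2];     /* requests, indexed by GP# & 1   */

    /* A no-CBs CPU asks for the grace period after "completed". */
    static void request_future_gp(void)
    {
        need_future_gp[(completed + 1) & 0x1]++;
    }

    /* End of a grace period: drain the slot for the GP that just
     * finished; this is where nocb_gp_wq[completed & 1] would be woken. */
    static void gp_cleanup(void)
    {
        completed++;
        printf("GP %lu done: waking %d request(s) from slot %lu\n",
               completed, need_future_gp[completed & 0x1],
               completed & 0x1);
        need_future_gp[completed & 0x1] = 0;
    }

    int main(void)
    {
        request_future_gp();   /* two CPUs want the next grace period  */
        request_future_gp();
        gp_cleanup();          /* it completes, slot 1 drains          */
        request_future_gp();   /* one CPU wants the grace period after */
        gp_cleanup();          /* it completes, slot 0 drains          */
        return 0;
    }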
2168 | /* Is the specified CPU a no-CBs CPU? */ | 2054 | /* Is the specified CPU a no-CBs CPU? */ |
2169 | static bool is_nocb_cpu(int cpu) | 2055 | static bool is_nocb_cpu(int cpu) |
2170 | { | 2056 | { |
@@ -2227,6 +2113,13 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | |||
2227 | if (!is_nocb_cpu(rdp->cpu)) | 2113 | if (!is_nocb_cpu(rdp->cpu)) |
2228 | return 0; | 2114 | return 0; |
2229 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); | 2115 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); |
2116 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) | ||
2117 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, | ||
2118 | (unsigned long)rhp->func, | ||
2119 | rdp->qlen_lazy, rdp->qlen); | ||
2120 | else | ||
2121 | trace_rcu_callback(rdp->rsp->name, rhp, | ||
2122 | rdp->qlen_lazy, rdp->qlen); | ||
2230 | return 1; | 2123 | return 1; |
2231 | } | 2124 | } |
2232 | 2125 | ||
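The tracepoints added above distinguish kfree_rcu() callbacks by the fact that their "function pointer" is really the byte offset of the rcu_head inside the enclosing object, which is what __is_kfree_rcu_offset() tests for. The fragment below illustrates the encoding in user space; the 4096-byte bound is recalled from the rcupdate.h of this era rather than taken from this patch, and the structure names are invented.

    #include <stddef.h>
    #include <stdio.h>

    struct rcu_head_demo { void *next; void *func; };

    /* An object freed via kfree_rcu(obj, rh) records offsetof(struct
     * widget, rh) in the func field instead of a real code pointer. */
    struct widget {
        int payload[8];
        struct rcu_head_demo rh;
    };

    /* Offsets are tiny compared with kernel text addresses, so a simple
     * magnitude test tells the two encodings apart. */
    static int is_kfree_offset(unsigned long f)
    {
        return f < 4096;
    }

    int main(void)
    {
        unsigned long off = offsetof(struct widget, rh);

        printf("offset=%lu -> %s\n", off,
               is_kfree_offset(off) ? "kfree_rcu-style callback"
                                    : "ordinary callback function");
        return 0;
    }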
@@ -2265,95 +2158,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
2265 | } | 2158 | } |
2266 | 2159 | ||
2267 | /* | 2160 | /* |
2268 | * There must be at least one non-no-CBs CPU in operation at any given | 2161 | * If necessary, kick off a new grace period, and either way wait |
2269 | * time, because no-CBs CPUs are not capable of initiating grace periods | 2162 | * for a subsequent grace period to complete. |
2270 | * independently. This function therefore complains if the specified | ||
2271 | * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to | ||
2272 | * avoid offlining the last such CPU. (Recursion is a wonderful thing, | ||
2273 | * but you have to have a base case!) | ||
2274 | */ | 2163 | */ |
2275 | static bool nocb_cpu_expendable(int cpu) | 2164 | static void rcu_nocb_wait_gp(struct rcu_data *rdp) |
2276 | { | 2165 | { |
2277 | cpumask_var_t non_nocb_cpus; | 2166 | unsigned long c; |
2278 | int ret; | 2167 | bool d; |
2168 | unsigned long flags; | ||
2169 | struct rcu_node *rnp = rdp->mynode; | ||
2170 | |||
2171 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
2172 | c = rcu_start_future_gp(rnp, rdp); | ||
2173 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
2279 | 2174 | ||
2280 | /* | 2175 | /* |
2281 | * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, | 2176 | * Wait for the grace period. Do so interruptibly to avoid messing |
2282 | * then offlining this CPU is harmless. Let it happen. | 2177 | * up the load average. |
2283 | */ | 2178 | */ |
2284 | if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) | 2179 | trace_rcu_future_gp(rnp, rdp, c, "StartWait"); |
2285 | return 1; | 2180 | for (;;) { |
2286 | 2181 | wait_event_interruptible( | |
2287 | /* If no memory, play it safe and keep the CPU around. */ | 2182 | rnp->nocb_gp_wq[c & 0x1], |
2288 | if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) | 2183 | (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); |
2289 | return 0; | 2184 | if (likely(d)) |
2290 | cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); | 2185 | break; |
2291 | cpumask_clear_cpu(cpu, non_nocb_cpus); | 2186 | flush_signals(current); |
2292 | ret = !cpumask_empty(non_nocb_cpus); | 2187 | trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); |
2293 | free_cpumask_var(non_nocb_cpus); | 2188 | } |
2294 | return ret; | 2189 | trace_rcu_future_gp(rnp, rdp, c, "EndWait"); |
2295 | } | 2190 | smp_mb(); /* Ensure that CB invocation happens after GP end. */ |
2296 | |||
2297 | /* | ||
2298 | * Helper structure for remote registry of RCU callbacks. | ||
2299 | * This is needed for when a no-CBs CPU needs to start a grace period. | ||
2300 | * If it just invokes call_rcu(), the resulting callback will be queued, | ||
2301 | * which can result in deadlock. | ||
2302 | */ | ||
2303 | struct rcu_head_remote { | ||
2304 | struct rcu_head *rhp; | ||
2305 | call_rcu_func_t *crf; | ||
2306 | void (*func)(struct rcu_head *rhp); | ||
2307 | }; | ||
2308 | |||
2309 | /* | ||
2310 | * Register a callback as specified by the rcu_head_remote struct. | ||
2311 | * This function is intended to be invoked via smp_call_function_single(). | ||
2312 | */ | ||
2313 | static void call_rcu_local(void *arg) | ||
2314 | { | ||
2315 | struct rcu_head_remote *rhrp = | ||
2316 | container_of(arg, struct rcu_head_remote, rhp); | ||
2317 | |||
2318 | rhrp->crf(rhrp->rhp, rhrp->func); | ||
2319 | } | ||
2320 | |||
2321 | /* | ||
2322 | * Set up an rcu_head_remote structure and the invoke call_rcu_local() | ||
2323 | * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via | ||
2324 | * smp_call_function_single(). | ||
2325 | */ | ||
2326 | static void invoke_crf_remote(struct rcu_head *rhp, | ||
2327 | void (*func)(struct rcu_head *rhp), | ||
2328 | call_rcu_func_t crf) | ||
2329 | { | ||
2330 | struct rcu_head_remote rhr; | ||
2331 | |||
2332 | rhr.rhp = rhp; | ||
2333 | rhr.crf = crf; | ||
2334 | rhr.func = func; | ||
2335 | smp_call_function_single(0, call_rcu_local, &rhr, 1); | ||
2336 | } | ||
2337 | |||
2338 | /* | ||
2339 | * Helper functions to be passed to wait_rcu_gp(), each of which | ||
2340 | * invokes invoke_crf_remote() to register a callback appropriately. | ||
2341 | */ | ||
2342 | static void __maybe_unused | ||
2343 | call_rcu_preempt_remote(struct rcu_head *rhp, | ||
2344 | void (*func)(struct rcu_head *rhp)) | ||
2345 | { | ||
2346 | invoke_crf_remote(rhp, func, call_rcu); | ||
2347 | } | ||
2348 | static void call_rcu_bh_remote(struct rcu_head *rhp, | ||
2349 | void (*func)(struct rcu_head *rhp)) | ||
2350 | { | ||
2351 | invoke_crf_remote(rhp, func, call_rcu_bh); | ||
2352 | } | ||
2353 | static void call_rcu_sched_remote(struct rcu_head *rhp, | ||
2354 | void (*func)(struct rcu_head *rhp)) | ||
2355 | { | ||
2356 | invoke_crf_remote(rhp, func, call_rcu_sched); | ||
2357 | } | 2191 | } |
2358 | 2192 | ||
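rcu_nocb_wait_gp() boils down to "sleep until the completed counter reaches c, tolerating spurious wakeups and signals". Below is a rough pthreads analogue of that loop, with a condition variable standing in for the wait queue, no signal handling, and no counter wraparound (which the kernel's ULONG_CMP_GE() deals with); all names are invented.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t gp_ended = PTHREAD_COND_INITIALIZER;
    static unsigned long completed;    /* last finished grace period */

    /* Grace-period thread: a GP ends, wake everyone (wake_up_all()). */
    static void gp_cleanup(void)
    {
        pthread_mutex_lock(&lock);
        completed++;
        pthread_cond_broadcast(&gp_ended);
        pthread_mutex_unlock(&lock);
    }

    /* Callback-offload thread: wait until grace period number c is done. */
    static void wait_for_gp(unsigned long c)
    {
        pthread_mutex_lock(&lock);
        while (completed < c)              /* re-check after every wakeup */
            pthread_cond_wait(&gp_ended, &lock);
        pthread_mutex_unlock(&lock);
        /* The kernel follows this with a memory barrier so that callback
         * invocation is ordered after the grace-period computation. */
    }

    static void *gp_thread(void *arg)
    {
        (void)arg;
        gp_cleanup();
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, gp_thread, NULL);
        wait_for_gp(1);                    /* blocks until gp_cleanup() runs */
        pthread_join(t, NULL);
        printf("grace period 1 complete\n");
        return 0;
    }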
2359 | /* | 2193 | /* |
@@ -2390,7 +2224,7 @@ static int rcu_nocb_kthread(void *arg) | |||
2390 | cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); | 2224 | cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); |
2391 | ACCESS_ONCE(rdp->nocb_p_count) += c; | 2225 | ACCESS_ONCE(rdp->nocb_p_count) += c; |
2392 | ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; | 2226 | ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; |
2393 | wait_rcu_gp(rdp->rsp->call_remote); | 2227 | rcu_nocb_wait_gp(rdp); |
2394 | 2228 | ||
2395 | /* Each pass through the following loop invokes a callback. */ | 2229 | /* Each pass through the following loop invokes a callback. */ |
2396 | trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); | 2230 | trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); |
@@ -2436,32 +2270,41 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | |||
2436 | return; | 2270 | return; |
2437 | for_each_cpu(cpu, rcu_nocb_mask) { | 2271 | for_each_cpu(cpu, rcu_nocb_mask) { |
2438 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2272 | rdp = per_cpu_ptr(rsp->rda, cpu); |
2439 | t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); | 2273 | t = kthread_run(rcu_nocb_kthread, rdp, |
2274 | "rcuo%c/%d", rsp->abbr, cpu); | ||
2440 | BUG_ON(IS_ERR(t)); | 2275 | BUG_ON(IS_ERR(t)); |
2441 | ACCESS_ONCE(rdp->nocb_kthread) = t; | 2276 | ACCESS_ONCE(rdp->nocb_kthread) = t; |
2442 | } | 2277 | } |
2443 | } | 2278 | } |
2444 | 2279 | ||
2445 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ | 2280 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ |
2446 | static void init_nocb_callback_list(struct rcu_data *rdp) | 2281 | static bool init_nocb_callback_list(struct rcu_data *rdp) |
2447 | { | 2282 | { |
2448 | if (rcu_nocb_mask == NULL || | 2283 | if (rcu_nocb_mask == NULL || |
2449 | !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) | 2284 | !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) |
2450 | return; | 2285 | return false; |
2451 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | 2286 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; |
2287 | return true; | ||
2288 | } | ||
2289 | |||
2290 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
2291 | |||
2292 | static int rcu_nocb_needs_gp(struct rcu_state *rsp) | ||
2293 | { | ||
2294 | return 0; | ||
2452 | } | 2295 | } |
2453 | 2296 | ||
2454 | /* Initialize the ->call_remote fields in the rcu_state structures. */ | 2297 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) |
2455 | static void __init rcu_init_nocb(void) | ||
2456 | { | 2298 | { |
2457 | #ifdef CONFIG_PREEMPT_RCU | ||
2458 | rcu_preempt_state.call_remote = call_rcu_preempt_remote; | ||
2459 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | ||
2460 | rcu_bh_state.call_remote = call_rcu_bh_remote; | ||
2461 | rcu_sched_state.call_remote = call_rcu_sched_remote; | ||
2462 | } | 2299 | } |
2463 | 2300 | ||
2464 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | 2301 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) |
2302 | { | ||
2303 | } | ||
2304 | |||
2305 | static void rcu_init_one_nocb(struct rcu_node *rnp) | ||
2306 | { | ||
2307 | } | ||
2465 | 2308 | ||
2466 | static bool is_nocb_cpu(int cpu) | 2309 | static bool is_nocb_cpu(int cpu) |
2467 | { | 2310 | { |
@@ -2480,11 +2323,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | |||
2480 | return 0; | 2323 | return 0; |
2481 | } | 2324 | } |
2482 | 2325 | ||
2483 | static bool nocb_cpu_expendable(int cpu) | ||
2484 | { | ||
2485 | return 1; | ||
2486 | } | ||
2487 | |||
2488 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | 2326 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) |
2489 | { | 2327 | { |
2490 | } | 2328 | } |
@@ -2493,12 +2331,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | |||
2493 | { | 2331 | { |
2494 | } | 2332 | } |
2495 | 2333 | ||
2496 | static void init_nocb_callback_list(struct rcu_data *rdp) | 2334 | static bool init_nocb_callback_list(struct rcu_data *rdp) |
2497 | { | ||
2498 | } | ||
2499 | |||
2500 | static void __init rcu_init_nocb(void) | ||
2501 | { | 2335 | { |
2336 | return false; | ||
2502 | } | 2337 | } |
2503 | 2338 | ||
2504 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | 2339 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0d095dcaa670..49099e81c87b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
@@ -46,8 +46,6 @@ | |||
46 | #define RCU_TREE_NONCORE | 46 | #define RCU_TREE_NONCORE |
47 | #include "rcutree.h" | 47 | #include "rcutree.h" |
48 | 48 | ||
49 | #define ulong2long(a) (*(long *)(&(a))) | ||
50 | |||
51 | static int r_open(struct inode *inode, struct file *file, | 49 | static int r_open(struct inode *inode, struct file *file, |
52 | const struct seq_operations *op) | 50 | const struct seq_operations *op) |
53 | { | 51 | { |
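The ulong2long() helper deleted here reinterprets an unsigned long as a signed long, so that the small negative differences that show up when one wrapping counter is subtracted from another print as, say, -3 instead of an enormous positive number. The new stall printout above relies on it for the nonlazy_posted delta, so the macro has presumably moved to a shared header rather than gone away. A user-space demonstration:

    #include <stdio.h>

    #define ulong2long(a) (*(long *)(&(a)))

    int main(void)
    {
        /* A snapshot that raced ahead of the counter: the unsigned
         * subtraction wraps around. */
        unsigned long posted = 5;
        unsigned long snap = 8;
        unsigned long diff = posted - snap;

        printf("unsigned: %lu\n", diff);              /* 2^64 - 3 on 64-bit */
        printf("signed:   %ld\n", ulong2long(diff));  /* -3                 */
        return 0;
    }

Note that the macro needs an lvalue, which is why print_cpu_stall_fast_no_hz() above computes the delta into a local nlpd variable before passing it through ulong2long().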