aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2013-04-30 04:49:04 -0400
committerIngo Molnar <mingo@kernel.org>2013-04-30 04:49:04 -0400
commitfd29f424d458118f02e89596505c68a63dcb3007 (patch)
treeb52470ff7fe7a9f29260afe4a9f22a80fc900140
parentc1be5a5b1b355d40e6cf79cc979eb66dafa24ad1 (diff)
parent49717cb40410fe4b563968680ff7c513967504c6 (diff)
Merge branch 'rcu/doc' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into core/urgent
Pull RCU documentation update for reducing OS jitter due to per-CPU kthreads, from Paul McKenney. Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--Documentation/RCU/checklist.txt26
-rw-r--r--Documentation/RCU/lockdep.txt5
-rw-r--r--Documentation/RCU/rcubarrier.txt15
-rw-r--r--Documentation/RCU/stallwarn.txt33
-rw-r--r--Documentation/RCU/whatisRCU.txt4
-rw-r--r--Documentation/kernel-parameters.txt35
-rw-r--r--Documentation/kernel-per-CPU-kthreads.txt202
-rw-r--r--include/linux/list_bl.h5
-rw-r--r--include/linux/rculist_bl.h2
-rw-r--r--include/linux/rcupdate.h1
-rw-r--r--include/trace/events/rcu.h55
-rw-r--r--init/Kconfig73
-rw-r--r--kernel/rcutree.c260
-rw-r--r--kernel/rcutree.h41
-rw-r--r--kernel/rcutree_plugin.h601
-rw-r--r--kernel/rcutree_trace.c2
16 files changed, 842 insertions, 518 deletions
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 31ef8fe07f82..79e789b8b8ea 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -217,9 +217,14 @@ over a rather long period of time, but improvements are always welcome!
217 whether the increased speed is worth it. 217 whether the increased speed is worth it.
218 218
2198. Although synchronize_rcu() is slower than is call_rcu(), it 2198. Although synchronize_rcu() is slower than is call_rcu(), it
220 usually results in simpler code. So, unless update performance 220 usually results in simpler code. So, unless update performance is
221 is critically important or the updaters cannot block, 221 critically important, the updaters cannot block, or the latency of
222 synchronize_rcu() should be used in preference to call_rcu(). 222 synchronize_rcu() is visible from userspace, synchronize_rcu()
223 should be used in preference to call_rcu(). Furthermore,
224 kfree_rcu() usually results in even simpler code than does
225 synchronize_rcu() without synchronize_rcu()'s multi-millisecond
226 latency. So please take advantage of kfree_rcu()'s "fire and
227 forget" memory-freeing capabilities where it applies.
223 228
224 An especially important property of the synchronize_rcu() 229 An especially important property of the synchronize_rcu()
225 primitive is that it automatically self-limits: if grace periods 230 primitive is that it automatically self-limits: if grace periods
@@ -268,7 +273,8 @@ over a rather long period of time, but improvements are always welcome!
268 e. Periodically invoke synchronize_rcu(), permitting a limited 273 e. Periodically invoke synchronize_rcu(), permitting a limited
269 number of updates per grace period. 274 number of updates per grace period.
270 275
271 The same cautions apply to call_rcu_bh() and call_rcu_sched(). 276 The same cautions apply to call_rcu_bh(), call_rcu_sched(),
277 call_srcu(), and kfree_rcu().
272 278
2739. All RCU list-traversal primitives, which include 2799. All RCU list-traversal primitives, which include
274 rcu_dereference(), list_for_each_entry_rcu(), and 280 rcu_dereference(), list_for_each_entry_rcu(), and
@@ -296,9 +302,9 @@ over a rather long period of time, but improvements are always welcome!
296 all currently executing rcu_read_lock()-protected RCU read-side 302 all currently executing rcu_read_lock()-protected RCU read-side
297 critical sections complete. It does -not- necessarily guarantee 303 critical sections complete. It does -not- necessarily guarantee
298 that all currently running interrupts, NMIs, preempt_disable() 304 that all currently running interrupts, NMIs, preempt_disable()
299 code, or idle loops will complete. Therefore, if you do not have 305 code, or idle loops will complete. Therefore, if your
300 rcu_read_lock()-protected read-side critical sections, do -not- 306 read-side critical sections are protected by something other
301 use synchronize_rcu(). 307 than rcu_read_lock(), do -not- use synchronize_rcu().
302 308
303 Similarly, disabling preemption is not an acceptable substitute 309 Similarly, disabling preemption is not an acceptable substitute
304 for rcu_read_lock(). Code that attempts to use preemption 310 for rcu_read_lock(). Code that attempts to use preemption
@@ -401,9 +407,9 @@ over a rather long period of time, but improvements are always welcome!
401 read-side critical sections. It is the responsibility of the 407 read-side critical sections. It is the responsibility of the
402 RCU update-side primitives to deal with this. 408 RCU update-side primitives to deal with this.
403 409
40417. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and 41017. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
405 the __rcu sparse checks to validate your RCU code. These 411 __rcu sparse checks (enabled by CONFIG_SPARSE_RCU_POINTER) to
406 can help find problems as follows: 412 validate your RCU code. These can help find problems as follows:
407 413
408 CONFIG_PROVE_RCU: check that accesses to RCU-protected data 414 CONFIG_PROVE_RCU: check that accesses to RCU-protected data
409 structures are carried out under the proper RCU 415 structures are carried out under the proper RCU
diff --git a/Documentation/RCU/lockdep.txt b/Documentation/RCU/lockdep.txt
index a102d4b3724b..cd83d2348fef 100644
--- a/Documentation/RCU/lockdep.txt
+++ b/Documentation/RCU/lockdep.txt
@@ -64,6 +64,11 @@ checking of rcu_dereference() primitives:
64 but retain the compiler constraints that prevent duplicating 64 but retain the compiler constraints that prevent duplicating
65 or coalescing. This is useful when testing the 65 or coalescing. This is useful when testing the
66 value of the pointer itself, for example, against NULL. 66 value of the pointer itself, for example, against NULL.
67 rcu_access_index(idx):
68 Return the value of the index and omit all barriers, but
69 retain the compiler constraints that prevent duplicating
70 or coalescing. This is useful when testing the
71 value of the index itself, for example, against -1.
67 72
68The rcu_dereference_check() check expression can be any boolean 73The rcu_dereference_check() check expression can be any boolean
69expression, but would normally include a lockdep expression. However, 74expression, but would normally include a lockdep expression. However,
diff --git a/Documentation/RCU/rcubarrier.txt b/Documentation/RCU/rcubarrier.txt
index 38428c125135..2e319d1b9ef2 100644
--- a/Documentation/RCU/rcubarrier.txt
+++ b/Documentation/RCU/rcubarrier.txt
@@ -79,7 +79,20 @@ complete. Pseudo-code using rcu_barrier() is as follows:
79 2. Execute rcu_barrier(). 79 2. Execute rcu_barrier().
80 3. Allow the module to be unloaded. 80 3. Allow the module to be unloaded.
81 81
82The rcutorture module makes use of rcu_barrier in its exit function 82There are also rcu_barrier_bh(), rcu_barrier_sched(), and srcu_barrier()
83functions for the other flavors of RCU, and you of course must match
84the flavor of rcu_barrier() with that of call_rcu(). If your module
85uses multiple flavors of call_rcu(), then it must also use multiple
86flavors of rcu_barrier() when unloading that module. For example, if
87it uses call_rcu_bh(), call_srcu() on srcu_struct_1, and call_srcu() on
88srcu_struct_2, then the following three lines of code will be required
89when unloading:
90
91 1 rcu_barrier_bh();
92 2 srcu_barrier(&srcu_struct_1);
93 3 srcu_barrier(&srcu_struct_2);
94
95The rcutorture module makes use of rcu_barrier() in its exit function
83as follows: 96as follows:
84 97
85 1 static void 98 1 static void
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 1927151b386b..e38b8df3d727 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -92,14 +92,14 @@ If the CONFIG_RCU_CPU_STALL_INFO kernel configuration parameter is set,
92more information is printed with the stall-warning message, for example: 92more information is printed with the stall-warning message, for example:
93 93
94 INFO: rcu_preempt detected stall on CPU 94 INFO: rcu_preempt detected stall on CPU
95 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 95 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 softirq=82/543
96 (t=65000 jiffies) 96 (t=65000 jiffies)
97 97
98In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is 98In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is
99printed: 99printed:
100 100
101 INFO: rcu_preempt detected stall on CPU 101 INFO: rcu_preempt detected stall on CPU
102 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 drain=0 . timer not pending 102 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D
103 (t=65000 jiffies) 103 (t=65000 jiffies)
104 104
105The "(64628 ticks this GP)" indicates that this CPU has taken more 105The "(64628 ticks this GP)" indicates that this CPU has taken more
@@ -116,13 +116,28 @@ number between the two "/"s is the value of the nesting, which will
116be a small positive number if in the idle loop and a very large positive 116be a small positive number if in the idle loop and a very large positive
117number (as shown above) otherwise. 117number (as shown above) otherwise.
118 118
119For CONFIG_RCU_FAST_NO_HZ kernels, the "drain=0" indicates that the CPU is 119The "softirq=" portion of the message tracks the number of RCU softirq
120not in the process of trying to force itself into dyntick-idle state, the 120handlers that the stalled CPU has executed. The number before the "/"
121"." indicates that the CPU has not given up forcing RCU into dyntick-idle 121is the number that had executed since boot at the time that this CPU
122mode (it would be "H" otherwise), and the "timer not pending" indicates 122last noted the beginning of a grace period, which might be the current
123that the CPU has not recently forced RCU into dyntick-idle mode (it 123(stalled) grace period, or it might be some earlier grace period (for
124would otherwise indicate the number of microseconds remaining in this 124example, if the CPU might have been in dyntick-idle mode for an extended
125forced state). 125time period). The number after the "/" is the number that have executed
126since boot until the current time. If this latter number stays constant
127across repeated stall-warning messages, it is possible that RCU's softirq
128handlers are no longer able to execute on this CPU. This can happen if
129the stalled CPU is spinning with interrupts disabled, or, in -rt
130kernels, if a high-priority process is starving RCU's softirq handler.
131
132For CONFIG_RCU_FAST_NO_HZ kernels, the "last_accelerate:" prints the
133low-order 16 bits (in hex) of the jiffies counter when this CPU last
134invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked
135rcu_accelerate_cbs() from rcu_prepare_for_idle(). The "nonlazy_posted:"
136prints the number of non-lazy callbacks posted since the last call to
137rcu_needs_cpu(). Finally, an "L" indicates that there are currently
138no non-lazy callbacks ("." is printed otherwise, as shown above) and
139"D" indicates that dyntick-idle processing is enabled ("." is printed
140otherwise, for example, if disabled via the "nohz=" kernel boot parameter).
126 141
127 142
128Multiple Warnings From One Stall 143Multiple Warnings From One Stall
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 0cc7820967f4..10df0b82f459 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -265,9 +265,9 @@ rcu_dereference()
265 rcu_read_lock(); 265 rcu_read_lock();
266 p = rcu_dereference(head.next); 266 p = rcu_dereference(head.next);
267 rcu_read_unlock(); 267 rcu_read_unlock();
268 x = p->address; 268 x = p->address; /* BUG!!! */
269 rcu_read_lock(); 269 rcu_read_lock();
270 y = p->data; 270 y = p->data; /* BUG!!! */
271 rcu_read_unlock(); 271 rcu_read_unlock();
272 272
273 Holding a reference from one RCU read-side critical section 273 Holding a reference from one RCU read-side critical section
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8ccbf27aead4..52ecc9b84673 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2484,9 +2484,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2484 In kernels built with CONFIG_RCU_NOCB_CPU=y, set 2484 In kernels built with CONFIG_RCU_NOCB_CPU=y, set
2485 the specified list of CPUs to be no-callback CPUs. 2485 the specified list of CPUs to be no-callback CPUs.
2486 Invocation of these CPUs' RCU callbacks will 2486 Invocation of these CPUs' RCU callbacks will
2487 be offloaded to "rcuoN" kthreads created for 2487 be offloaded to "rcuox/N" kthreads created for
2488 that purpose. This reduces OS jitter on the 2488 that purpose, where "x" is "b" for RCU-bh, "p"
2489 for RCU-preempt, and "s" for RCU-sched, and "N"
2490 is the CPU number. This reduces OS jitter on the
2489 offloaded CPUs, which can be useful for HPC and 2491 offloaded CPUs, which can be useful for HPC and
2492
2490 real-time workloads. It can also improve energy 2493 real-time workloads. It can also improve energy
2491 efficiency for asymmetric multiprocessors. 2494 efficiency for asymmetric multiprocessors.
2492 2495
@@ -2510,6 +2513,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2510 leaf rcu_node structure. Useful for very large 2513 leaf rcu_node structure. Useful for very large
2511 systems. 2514 systems.
2512 2515
2516 rcutree.jiffies_till_first_fqs= [KNL,BOOT]
2517 Set delay from grace-period initialization to
2518 first attempt to force quiescent states.
2519 Units are jiffies, minimum value is zero,
2520 and maximum value is HZ.
2521
2522 rcutree.jiffies_till_next_fqs= [KNL,BOOT]
2523 Set delay between subsequent attempts to force
2524 quiescent states. Units are jiffies, minimum
2525 value is one, and maximum value is HZ.
2526
2513 rcutree.qhimark= [KNL,BOOT] 2527 rcutree.qhimark= [KNL,BOOT]
2514 Set threshold of queued 2528 Set threshold of queued
2515 RCU callbacks over which batch limiting is disabled. 2529 RCU callbacks over which batch limiting is disabled.
@@ -2524,16 +2538,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2524 rcutree.rcu_cpu_stall_timeout= [KNL,BOOT] 2538 rcutree.rcu_cpu_stall_timeout= [KNL,BOOT]
2525 Set timeout for RCU CPU stall warning messages. 2539 Set timeout for RCU CPU stall warning messages.
2526 2540
2527 rcutree.jiffies_till_first_fqs= [KNL,BOOT] 2541 rcutree.rcu_idle_gp_delay= [KNL,BOOT]
2528 Set delay from grace-period initialization to 2542 Set wakeup interval for idle CPUs that have
2529 first attempt to force quiescent states. 2543 RCU callbacks (RCU_FAST_NO_HZ=y).
2530 Units are jiffies, minimum value is zero,
2531 and maximum value is HZ.
2532 2544
2533 rcutree.jiffies_till_next_fqs= [KNL,BOOT] 2545 rcutree.rcu_idle_lazy_gp_delay= [KNL,BOOT]
2534 Set delay between subsequent attempts to force 2546 Set wakeup interval for idle CPUs that have
2535 quiescent states. Units are jiffies, minimum 2547 only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y).
2536 value is one, and maximum value is HZ. 2548 Lazy RCU callbacks are those which RCU can
2549 prove do nothing more than free memory.
2537 2550
2538 rcutorture.fqs_duration= [KNL,BOOT] 2551 rcutorture.fqs_duration= [KNL,BOOT]
2539 Set duration of force_quiescent_state bursts. 2552 Set duration of force_quiescent_state bursts.
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
new file mode 100644
index 000000000000..cbf7ae412da4
--- /dev/null
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -0,0 +1,202 @@
1REDUCING OS JITTER DUE TO PER-CPU KTHREADS
2
3This document lists per-CPU kthreads in the Linux kernel and presents
4options to control their OS jitter. Note that non-per-CPU kthreads are
5not listed here. To reduce OS jitter from non-per-CPU kthreads, bind
6them to a "housekeeping" CPU dedicated to such work.
7
8
9REFERENCES
10
11o Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs.
12
13o Documentation/cgroups: Using cgroups to bind tasks to sets of CPUs.
14
15o man taskset: Using the taskset command to bind tasks to sets
16 of CPUs.
17
18o man sched_setaffinity: Using the sched_setaffinity() system
19 call to bind tasks to sets of CPUs.
20
21o /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state,
22 writing "0" to offline and "1" to online.
23
24o In order to locate kernel-generated OS jitter on CPU N:
25
26 cd /sys/kernel/debug/tracing
27 echo 1 > max_graph_depth # Increase the "1" for more detail
28 echo function_graph > current_tracer
29 # run workload
30 cat per_cpu/cpuN/trace
31
32
33KTHREADS
34
35Name: ehca_comp/%u
36Purpose: Periodically process Infiniband-related work.
37To reduce its OS jitter, do any of the following:
381. Don't use eHCA Infiniband hardware, instead choosing hardware
39 that does not require per-CPU kthreads. This will prevent these
40 kthreads from being created in the first place. (This will
41 work for most people, as this hardware, though important, is
42 relatively old and is produced in relatively low unit volumes.)
432. Do all eHCA-Infiniband-related work on other CPUs, including
44 interrupts.
453. Rework the eHCA driver so that its per-CPU kthreads are
46 provisioned only on selected CPUs.
47
48
49Name: irq/%d-%s
50Purpose: Handle threaded interrupts.
51To reduce its OS jitter, do the following:
521. Use irq affinity to force the irq threads to execute on
53 some other CPU.
54
55Name: kcmtpd_ctr_%d
56Purpose: Handle Bluetooth work.
57To reduce its OS jitter, do one of the following:
581. Don't use Bluetooth, in which case these kthreads won't be
59 created in the first place.
602. Use irq affinity to force Bluetooth-related interrupts to
61 occur on some other CPU and furthermore initiate all
62 Bluetooth activity on some other CPU.
63
64Name: ksoftirqd/%u
65Purpose: Execute softirq handlers when threaded or when under heavy load.
66To reduce its OS jitter, each softirq vector must be handled
67separately as follows:
68TIMER_SOFTIRQ: Do all of the following:
691. To the extent possible, keep the CPU out of the kernel when it
70 is non-idle, for example, by avoiding system calls and by forcing
71 both kernel threads and interrupts to execute elsewhere.
722. Build with CONFIG_HOTPLUG_CPU=y. After boot completes, force
73 the CPU offline, then bring it back online. This forces
74 recurring timers to migrate elsewhere. If you are concerned
75 with multiple CPUs, force them all offline before bringing the
76 first one back online. Once you have onlined the CPUs in question,
77 do not offline any other CPUs, because doing so could force the
78 timer back onto one of the CPUs in question.
79NET_TX_SOFTIRQ and NET_RX_SOFTIRQ: Do all of the following:
801. Force networking interrupts onto other CPUs.
812. Initiate any network I/O on other CPUs.
823. Once your application has started, prevent CPU-hotplug operations
83 from being initiated from tasks that might run on the CPU to
84 be de-jittered. (It is OK to force this CPU offline and then
85 bring it back online before you start your application.)
86BLOCK_SOFTIRQ: Do all of the following:
871. Force block-device interrupts onto some other CPU.
882. Initiate any block I/O on other CPUs.
893. Once your application has started, prevent CPU-hotplug operations
90 from being initiated from tasks that might run on the CPU to
91 be de-jittered. (It is OK to force this CPU offline and then
92 bring it back online before you start your application.)
93BLOCK_IOPOLL_SOFTIRQ: Do all of the following:
941. Force block-device interrupts onto some other CPU.
952. Initiate any block I/O and block-I/O polling on other CPUs.
963. Once your application has started, prevent CPU-hotplug operations
97 from being initiated from tasks that might run on the CPU to
98 be de-jittered. (It is OK to force this CPU offline and then
99 bring it back online before you start your application.)
100TASKLET_SOFTIRQ: Do one or more of the following:
1011. Avoid use of drivers that use tasklets. (Such drivers will contain
102 calls to things like tasklet_schedule().)
1032. Convert all drivers that you must use from tasklets to workqueues.
1043. Force interrupts for drivers using tasklets onto other CPUs,
105 and also do I/O involving these drivers on other CPUs.
106SCHED_SOFTIRQ: Do all of the following:
1071. Avoid sending scheduler IPIs to the CPU to be de-jittered,
108 for example, ensure that at most one runnable kthread is present
109 on that CPU. If a thread that expects to run on the de-jittered
110 CPU awakens, the scheduler will send an IPI that can result in
111 a subsequent SCHED_SOFTIRQ.
1122. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y,
113 CONFIG_NO_HZ_FULL=y, and, in addition, ensure that the CPU
114 to be de-jittered is marked as an adaptive-ticks CPU using the
115 "nohz_full=" boot parameter. This reduces the number of
116 scheduler-clock interrupts that the de-jittered CPU receives,
117 minimizing its chances of being selected to do the load balancing
118 work that runs in SCHED_SOFTIRQ context.
1193. To the extent possible, keep the CPU out of the kernel when it
120 is non-idle, for example, by avoiding system calls and by
121 forcing both kernel threads and interrupts to execute elsewhere.
122 This further reduces the number of scheduler-clock interrupts
123 received by the de-jittered CPU.
124HRTIMER_SOFTIRQ: Do all of the following:
1251. To the extent possible, keep the CPU out of the kernel when it
126 is non-idle. For example, avoid system calls and force both
127 kernel threads and interrupts to execute elsewhere.
1282. Build with CONFIG_HOTPLUG_CPU=y. Once boot completes, force the
129 CPU offline, then bring it back online. This forces recurring
130 timers to migrate elsewhere. If you are concerned with multiple
131 CPUs, force them all offline before bringing the first one
132 back online. Once you have onlined the CPUs in question, do not
133 offline any other CPUs, because doing so could force the timer
134 back onto one of the CPUs in question.
135RCU_SOFTIRQ: Do at least one of the following:
1361. Offload callbacks and keep the CPU in either dyntick-idle or
137 adaptive-ticks state by doing all of the following:
138 a. Build with CONFIG_RCU_NOCB_CPU=y, CONFIG_RCU_NOCB_CPU_ALL=y,
139 CONFIG_NO_HZ_FULL=y, and, in addition, ensure that the CPU
140 to be de-jittered is marked as an adaptive-ticks CPU using
141 the "nohz_full=" boot parameter. Bind the rcuo kthreads
142 to housekeeping CPUs, which can tolerate OS jitter.
143 b. To the extent possible, keep the CPU out of the kernel
144 when it is non-idle, for example, by avoiding system
145 calls and by forcing both kernel threads and interrupts
146 to execute elsewhere.
1472. Enable RCU to do its processing remotely via dyntick-idle by
148 doing all of the following:
149 a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y.
150 b. Ensure that the CPU goes idle frequently, allowing other
151 CPUs to detect that it has passed through an RCU quiescent
152 state. If the kernel is built with CONFIG_NO_HZ_FULL=y,
153 userspace execution also allows other CPUs to detect that
154 the CPU in question has passed through a quiescent state.
155 c. To the extent possible, keep the CPU out of the kernel
156 when it is non-idle, for example, by avoiding system
157 calls and by forcing both kernel threads and interrupts
158 to execute elsewhere.
159
160Name: rcuc/%u
161Purpose: Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels.
162To reduce its OS jitter, do at least one of the following:
1631. Build the kernel with CONFIG_PREEMPT=n. This prevents these
164 kthreads from being created in the first place, and also obviates
165 the need for RCU priority boosting. This approach is feasible
166 for workloads that do not require high degrees of responsiveness.
1672. Build the kernel with CONFIG_RCU_BOOST=n. This prevents these
168 kthreads from being created in the first place. This approach
169 is feasible only if your workload never requires RCU priority
170 boosting, for example, if you ensure frequent idle time on all
171 CPUs that might execute within the kernel.
1723. Build with CONFIG_RCU_NOCB_CPU=y and CONFIG_RCU_NOCB_CPU_ALL=y,
173 which offloads all RCU callbacks to kthreads that can be moved
174 off of CPUs susceptible to OS jitter. This approach prevents the
175 rcuc/%u kthreads from having any work to do, so that they are
176 never awakened.
1774. Ensure that the CPU never enters the kernel, and, in particular,
178 avoid initiating any CPU hotplug operations on this CPU. This is
179 another way of preventing any callbacks from being queued on the
180 CPU, again preventing the rcuc/%u kthreads from having any work
181 to do.
182
183Name: rcuob/%d, rcuop/%d, and rcuos/%d
184Purpose: Offload RCU callbacks from the corresponding CPU.
185To reduce its OS jitter, do at least one of the following:
1861. Use affinity, cgroups, or other mechanism to force these kthreads
187 to execute on some other CPU.
1882. Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these
189 kthreads from being created in the first place. However, please
190 note that this will not eliminate OS jitter, but will instead
191 shift it to RCU_SOFTIRQ.
192
193Name: watchdog/%u
194Purpose: Detect software lockups on each CPU.
195To reduce its OS jitter, do at least one of the following:
1961. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
197 kthreads from being created in the first place.
1982. Echo a zero to /proc/sys/kernel/watchdog to disable the
199 watchdog timer.
2003. Echo a large number to /proc/sys/kernel/watchdog_thresh in
201 order to reduce the frequency of OS jitter due to the watchdog
202 timer down to a level that is acceptable for your workload.
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
index 31f9d75adc5b..2eb88556c5c5 100644
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -125,6 +125,11 @@ static inline void hlist_bl_unlock(struct hlist_bl_head *b)
125 __bit_spin_unlock(0, (unsigned long *)b); 125 __bit_spin_unlock(0, (unsigned long *)b);
126} 126}
127 127
128static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
129{
130 return bit_spin_is_locked(0, (unsigned long *)b);
131}
132
128/** 133/**
129 * hlist_bl_for_each_entry - iterate over list of given type 134 * hlist_bl_for_each_entry - iterate over list of given type
130 * @tpos: the type * to use as a loop cursor. 135 * @tpos: the type * to use as a loop cursor.
diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h
index cf1244fbf3b6..4f216c59e7db 100644
--- a/include/linux/rculist_bl.h
+++ b/include/linux/rculist_bl.h
@@ -20,7 +20,7 @@ static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
20static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) 20static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
21{ 21{
22 return (struct hlist_bl_node *) 22 return (struct hlist_bl_node *)
23 ((unsigned long)rcu_dereference(h->first) & ~LIST_BL_LOCKMASK); 23 ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK);
24} 24}
25 25
26/** 26/**
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index b758ce17b309..9ed2c9a4de45 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -80,6 +80,7 @@ extern void do_trace_rcu_torture_read(char *rcutorturename,
80#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) 80#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
81#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) 81#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
82#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) 82#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
83#define ulong2long(a) (*(long *)(&(a)))
83 84
84/* Exported common interfaces */ 85/* Exported common interfaces */
85 86
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 1918e832da4f..59ebcc89f148 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -72,6 +72,58 @@ TRACE_EVENT(rcu_grace_period,
72); 72);
73 73
74/* 74/*
75 * Tracepoint for future grace-period events, including those for no-callbacks
76 * CPUs. The caller should pull the data from the rcu_node structure,
77 * other than rcuname, which comes from the rcu_state structure, and event,
78 * which is one of the following:
79 *
80 * "Startleaf": Request a nocb grace period based on leaf-node data.
81 * "Startedleaf": Leaf-node start proved sufficient.
82 * "Startedleafroot": Leaf-node start proved sufficient after checking root.
83 * "Startedroot": Requested a nocb grace period based on root-node data.
84 * "StartWait": Start waiting for the requested grace period.
85 * "ResumeWait": Resume waiting after signal.
86 * "EndWait": Complete wait.
87 * "Cleanup": Clean up rcu_node structure after previous GP.
88 * "CleanupMore": Clean up, and another no-CB GP is needed.
89 */
90TRACE_EVENT(rcu_future_grace_period,
91
92 TP_PROTO(char *rcuname, unsigned long gpnum, unsigned long completed,
93 unsigned long c, u8 level, int grplo, int grphi,
94 char *gpevent),
95
96 TP_ARGS(rcuname, gpnum, completed, c, level, grplo, grphi, gpevent),
97
98 TP_STRUCT__entry(
99 __field(char *, rcuname)
100 __field(unsigned long, gpnum)
101 __field(unsigned long, completed)
102 __field(unsigned long, c)
103 __field(u8, level)
104 __field(int, grplo)
105 __field(int, grphi)
106 __field(char *, gpevent)
107 ),
108
109 TP_fast_assign(
110 __entry->rcuname = rcuname;
111 __entry->gpnum = gpnum;
112 __entry->completed = completed;
113 __entry->c = c;
114 __entry->level = level;
115 __entry->grplo = grplo;
116 __entry->grphi = grphi;
117 __entry->gpevent = gpevent;
118 ),
119
120 TP_printk("%s %lu %lu %lu %u %d %d %s",
121 __entry->rcuname, __entry->gpnum, __entry->completed,
122 __entry->c, __entry->level, __entry->grplo, __entry->grphi,
123 __entry->gpevent)
124);
125
126/*
75 * Tracepoint for grace-period-initialization events. These are 127 * Tracepoint for grace-period-initialization events. These are
76 * distinguished by the type of RCU, the new grace-period number, the 128 * distinguished by the type of RCU, the new grace-period number, the
77 * rcu_node structure level, the starting and ending CPU covered by the 129 * rcu_node structure level, the starting and ending CPU covered by the
@@ -601,6 +653,9 @@ TRACE_EVENT(rcu_barrier,
601#define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) 653#define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0)
602#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ 654#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \
603 qsmask) do { } while (0) 655 qsmask) do { } while (0)
656#define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \
657 level, grplo, grphi, event) \
658 do { } while (0)
604#define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) 659#define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0)
605#define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) 660#define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0)
606#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ 661#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \
diff --git a/init/Kconfig b/init/Kconfig
index 5341d7232c3a..71bb9e73011a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -578,13 +578,16 @@ config RCU_FAST_NO_HZ
578 depends on NO_HZ && SMP 578 depends on NO_HZ && SMP
579 default n 579 default n
580 help 580 help
581 This option causes RCU to attempt to accelerate grace periods in 581 This option permits CPUs to enter dynticks-idle state even if
582 order to allow CPUs to enter dynticks-idle state more quickly. 582 they have RCU callbacks queued, and prevents RCU from waking
583 On the other hand, this option increases the overhead of the 583 these CPUs up more than roughly once every four jiffies (by
584 dynticks-idle checking, thus degrading scheduling latency. 584 default; you can adjust this using the rcutree.rcu_idle_gp_delay
585 parameter), thus improving energy efficiency. On the other
586 hand, this option increases the duration of RCU grace periods,
587 for example, slowing down synchronize_rcu().
585 588
586 Say Y if energy efficiency is critically important, and you don't 589 Say Y if energy efficiency is critically important, and you
587 care about real-time response. 590 don't care about increased grace-period durations.
588 591
589 Say N if you are unsure. 592 Say N if you are unsure.
590 593
@@ -651,7 +654,7 @@ config RCU_BOOST_DELAY
651 Accept the default if unsure. 654 Accept the default if unsure.
652 655
653config RCU_NOCB_CPU 656config RCU_NOCB_CPU
654 bool "Offload RCU callback processing from boot-selected CPUs" 657 bool "Offload RCU callback processing from boot-selected CPUs (EXPERIMENTAL)"
655 depends on TREE_RCU || TREE_PREEMPT_RCU 658 depends on TREE_RCU || TREE_PREEMPT_RCU
656 default n 659 default n
657 help 660 help
@@ -662,16 +665,56 @@ config RCU_NOCB_CPU
662 665
663 This option offloads callback invocation from the set of 666 This option offloads callback invocation from the set of
664 CPUs specified at boot time by the rcu_nocbs parameter. 667 CPUs specified at boot time by the rcu_nocbs parameter.
665 For each such CPU, a kthread ("rcuoN") will be created to 668 For each such CPU, a kthread ("rcuox/N") will be created to
666 invoke callbacks, where the "N" is the CPU being offloaded. 669 invoke callbacks, where the "N" is the CPU being offloaded,
667 Nothing prevents this kthread from running on the specified 670 and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and
668 CPUs, but (1) the kthreads may be preempted between each 671 "s" for RCU-sched. Nothing prevents this kthread from running
669 callback, and (2) affinity or cgroups can be used to force 672 on the specified CPUs, but (1) the kthreads may be preempted
670 the kthreads to run on whatever set of CPUs is desired. 673 between each callback, and (2) affinity or cgroups can be used
671 674 to force the kthreads to run on whatever set of CPUs is desired.
672 Say Y here if you want reduced OS jitter on selected CPUs. 675
676 Say Y here if you want to help to debug reduced OS jitter.
673 Say N here if you are unsure. 677 Say N here if you are unsure.
674 678
679choice
680 prompt "Build-forced no-CBs CPUs"
681 default RCU_NOCB_CPU_NONE
682 help
683 This option allows no-CBs CPUs to be specified at build time.
684 Additional no-CBs CPUs may be specified by the rcu_nocbs=
685 boot parameter.
686
687config RCU_NOCB_CPU_NONE
688 bool "No build_forced no-CBs CPUs"
689 depends on RCU_NOCB_CPU
690 help
691 This option does not force any of the CPUs to be no-CBs CPUs.
692 Only CPUs designated by the rcu_nocbs= boot parameter will be
693 no-CBs CPUs.
694
695config RCU_NOCB_CPU_ZERO
696 bool "CPU 0 is a build_forced no-CBs CPU"
697 depends on RCU_NOCB_CPU
698 help
699 This option forces CPU 0 to be a no-CBs CPU. Additional CPUs
700 may be designated as no-CBs CPUs using the rcu_nocbs= boot
701 parameter.
702
703 Select this if CPU 0 needs to be a no-CBs CPU for real-time
704 or energy-efficiency reasons.
705
706config RCU_NOCB_CPU_ALL
707 bool "All CPUs are build_forced no-CBs CPUs"
708 depends on RCU_NOCB_CPU
709 help
710 This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs=
711 boot parameter will be ignored.
712
713 Select this if all CPUs need to be no-CBs CPUs for real-time
714 or energy-efficiency reasons.
715
716endchoice
717
675endmenu # "RCU Subsystem" 718endmenu # "RCU Subsystem"
676 719
677config IKCONFIG 720config IKCONFIG
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 5b8ad827fd86..2d5f94c1c7fb 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -64,7 +64,7 @@
64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 64static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 65static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
66 66
67#define RCU_STATE_INITIALIZER(sname, cr) { \ 67#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
68 .level = { &sname##_state.node[0] }, \ 68 .level = { &sname##_state.node[0] }, \
69 .call = cr, \ 69 .call = cr, \
70 .fqs_state = RCU_GP_IDLE, \ 70 .fqs_state = RCU_GP_IDLE, \
@@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
78 .name = #sname, \ 78 .name = #sname, \
79 .abbr = sabbr, \
79} 80}
80 81
81struct rcu_state rcu_sched_state = 82struct rcu_state rcu_sched_state =
82 RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); 83 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
83DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 84DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
84 85
85struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); 86struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
86DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 87DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
87 88
88static struct rcu_state *rcu_state; 89static struct rcu_state *rcu_state;
@@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
223module_param(jiffies_till_first_fqs, ulong, 0644); 224module_param(jiffies_till_first_fqs, ulong, 0644);
224module_param(jiffies_till_next_fqs, ulong, 0644); 225module_param(jiffies_till_next_fqs, ulong, 0644);
225 226
227static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
228 struct rcu_data *rdp);
226static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); 229static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
227static void force_quiescent_state(struct rcu_state *rsp); 230static void force_quiescent_state(struct rcu_state *rsp);
228static int rcu_pending(int cpu); 231static int rcu_pending(int cpu);
@@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
310 313
311 if (rcu_gp_in_progress(rsp)) 314 if (rcu_gp_in_progress(rsp))
312 return 0; /* No, a grace period is already in progress. */ 315 return 0; /* No, a grace period is already in progress. */
316 if (rcu_nocb_needs_gp(rsp))
317 return 1; /* Yes, a no-CBs CPU needs one. */
313 if (!rdp->nxttail[RCU_NEXT_TAIL]) 318 if (!rdp->nxttail[RCU_NEXT_TAIL])
314 return 0; /* No, this is a no-CBs (or offline) CPU. */ 319 return 0; /* No, this is a no-CBs (or offline) CPU. */
315 if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) 320 if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
@@ -1035,10 +1040,11 @@ static void init_callback_list(struct rcu_data *rdp)
1035{ 1040{
1036 int i; 1041 int i;
1037 1042
1043 if (init_nocb_callback_list(rdp))
1044 return;
1038 rdp->nxtlist = NULL; 1045 rdp->nxtlist = NULL;
1039 for (i = 0; i < RCU_NEXT_SIZE; i++) 1046 for (i = 0; i < RCU_NEXT_SIZE; i++)
1040 rdp->nxttail[i] = &rdp->nxtlist; 1047 rdp->nxttail[i] = &rdp->nxtlist;
1041 init_nocb_callback_list(rdp);
1042} 1048}
1043 1049
1044/* 1050/*
@@ -1071,6 +1077,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
1071} 1077}
1072 1078
1073/* 1079/*
1080 * Trace-event helper function for rcu_start_future_gp() and
1081 * rcu_nocb_wait_gp().
1082 */
1083static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
1084 unsigned long c, char *s)
1085{
1086 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
1087 rnp->completed, c, rnp->level,
1088 rnp->grplo, rnp->grphi, s);
1089}
1090
1091/*
1092 * Start some future grace period, as needed to handle newly arrived
1093 * callbacks. The required future grace periods are recorded in each
1094 * rcu_node structure's ->need_future_gp field.
1095 *
1096 * The caller must hold the specified rcu_node structure's ->lock.
1097 */
1098static unsigned long __maybe_unused
1099rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
1100{
1101 unsigned long c;
1102 int i;
1103 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
1104
1105 /*
1106 * Pick up grace-period number for new callbacks. If this
1107 * grace period is already marked as needed, return to the caller.
1108 */
1109 c = rcu_cbs_completed(rdp->rsp, rnp);
1110 trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
1111 if (rnp->need_future_gp[c & 0x1]) {
1112 trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
1113 return c;
1114 }
1115
1116 /*
1117 * If either this rcu_node structure or the root rcu_node structure
1118 * believes that a grace period is in progress, then we must wait
1119 * for the one following, which is in "c". Because our request
1120 * will be noticed at the end of the current grace period, we don't
1121 * need to explicitly start one.
1122 */
1123 if (rnp->gpnum != rnp->completed ||
1124 ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
1125 rnp->need_future_gp[c & 0x1]++;
1126 trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
1127 return c;
1128 }
1129
1130 /*
1131 * There might be no grace period in progress. If we don't already
1132 * hold it, acquire the root rcu_node structure's lock in order to
1133 * start one (if needed).
1134 */
1135 if (rnp != rnp_root)
1136 raw_spin_lock(&rnp_root->lock);
1137
1138 /*
1139 * Get a new grace-period number. If there really is no grace
1140 * period in progress, it will be smaller than the one we obtained
1141 * earlier. Adjust callbacks as needed. Note that even no-CBs
1142 * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
1143 */
1144 c = rcu_cbs_completed(rdp->rsp, rnp_root);
1145 for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
1146 if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
1147 rdp->nxtcompleted[i] = c;
1148
1149 /*
1150 * If the need for the required grace period is already
1151 * recorded, trace and leave.
1152 */
1153 if (rnp_root->need_future_gp[c & 0x1]) {
1154 trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
1155 goto unlock_out;
1156 }
1157
1158 /* Record the need for the future grace period. */
1159 rnp_root->need_future_gp[c & 0x1]++;
1160
1161 /* If a grace period is not already in progress, start one. */
1162 if (rnp_root->gpnum != rnp_root->completed) {
1163 trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
1164 } else {
1165 trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
1166 rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
1167 }
1168unlock_out:
1169 if (rnp != rnp_root)
1170 raw_spin_unlock(&rnp_root->lock);
1171 return c;
1172}
1173
1174/*
1175 * Clean up any old requests for the just-ended grace period. Also return
1176 * whether any additional grace periods have been requested. Also invoke
1177 * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
1178 * waiting for this grace period to complete.
1179 */
1180static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1181{
1182 int c = rnp->completed;
1183 int needmore;
1184 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1185
1186 rcu_nocb_gp_cleanup(rsp, rnp);
1187 rnp->need_future_gp[c & 0x1] = 0;
1188 needmore = rnp->need_future_gp[(c + 1) & 0x1];
1189 trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
1190 return needmore;
1191}
1192
1193/*
1074 * If there is room, assign a ->completed number to any callbacks on 1194 * If there is room, assign a ->completed number to any callbacks on
1075 * this CPU that have not already been assigned. Also accelerate any 1195 * this CPU that have not already been assigned. Also accelerate any
1076 * callbacks that were previously assigned a ->completed number that has 1196 * callbacks that were previously assigned a ->completed number that has
@@ -1129,6 +1249,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
1129 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; 1249 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
1130 rdp->nxtcompleted[i] = c; 1250 rdp->nxtcompleted[i] = c;
1131 } 1251 }
1252 /* Record any needed additional grace periods. */
1253 rcu_start_future_gp(rnp, rdp);
1132 1254
1133 /* Trace depending on how much we were able to accelerate. */ 1255 /* Trace depending on how much we were able to accelerate. */
1134 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1256 if (!*rdp->nxttail[RCU_WAIT_TAIL])
@@ -1308,9 +1430,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
1308 rdp = this_cpu_ptr(rsp->rda); 1430 rdp = this_cpu_ptr(rsp->rda);
1309 rcu_preempt_check_blocked_tasks(rnp); 1431 rcu_preempt_check_blocked_tasks(rnp);
1310 rnp->qsmask = rnp->qsmaskinit; 1432 rnp->qsmask = rnp->qsmaskinit;
1311 rnp->gpnum = rsp->gpnum; 1433 ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
1312 WARN_ON_ONCE(rnp->completed != rsp->completed); 1434 WARN_ON_ONCE(rnp->completed != rsp->completed);
1313 rnp->completed = rsp->completed; 1435 ACCESS_ONCE(rnp->completed) = rsp->completed;
1314 if (rnp == rdp->mynode) 1436 if (rnp == rdp->mynode)
1315 rcu_start_gp_per_cpu(rsp, rnp, rdp); 1437 rcu_start_gp_per_cpu(rsp, rnp, rdp);
1316 rcu_preempt_boost_start_gp(rnp); 1438 rcu_preempt_boost_start_gp(rnp);
@@ -1319,7 +1441,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
1319 rnp->grphi, rnp->qsmask); 1441 rnp->grphi, rnp->qsmask);
1320 raw_spin_unlock_irq(&rnp->lock); 1442 raw_spin_unlock_irq(&rnp->lock);
1321#ifdef CONFIG_PROVE_RCU_DELAY 1443#ifdef CONFIG_PROVE_RCU_DELAY
1322 if ((random32() % (rcu_num_nodes * 8)) == 0) 1444 if ((random32() % (rcu_num_nodes * 8)) == 0 &&
1445 system_state == SYSTEM_RUNNING)
1323 schedule_timeout_uninterruptible(2); 1446 schedule_timeout_uninterruptible(2);
1324#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ 1447#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
1325 cond_resched(); 1448 cond_resched();
@@ -1361,6 +1484,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1361static void rcu_gp_cleanup(struct rcu_state *rsp) 1484static void rcu_gp_cleanup(struct rcu_state *rsp)
1362{ 1485{
1363 unsigned long gp_duration; 1486 unsigned long gp_duration;
1487 int nocb = 0;
1364 struct rcu_data *rdp; 1488 struct rcu_data *rdp;
1365 struct rcu_node *rnp = rcu_get_root(rsp); 1489 struct rcu_node *rnp = rcu_get_root(rsp);
1366 1490
@@ -1390,17 +1514,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1390 */ 1514 */
1391 rcu_for_each_node_breadth_first(rsp, rnp) { 1515 rcu_for_each_node_breadth_first(rsp, rnp) {
1392 raw_spin_lock_irq(&rnp->lock); 1516 raw_spin_lock_irq(&rnp->lock);
1393 rnp->completed = rsp->gpnum; 1517 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1518 rdp = this_cpu_ptr(rsp->rda);
1519 if (rnp == rdp->mynode)
1520 __rcu_process_gp_end(rsp, rnp, rdp);
1521 nocb += rcu_future_gp_cleanup(rsp, rnp);
1394 raw_spin_unlock_irq(&rnp->lock); 1522 raw_spin_unlock_irq(&rnp->lock);
1395 cond_resched(); 1523 cond_resched();
1396 } 1524 }
1397 rnp = rcu_get_root(rsp); 1525 rnp = rcu_get_root(rsp);
1398 raw_spin_lock_irq(&rnp->lock); 1526 raw_spin_lock_irq(&rnp->lock);
1527 rcu_nocb_gp_set(rnp, nocb);
1399 1528
1400 rsp->completed = rsp->gpnum; /* Declare grace period done. */ 1529 rsp->completed = rsp->gpnum; /* Declare grace period done. */
1401 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1530 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
1402 rsp->fqs_state = RCU_GP_IDLE; 1531 rsp->fqs_state = RCU_GP_IDLE;
1403 rdp = this_cpu_ptr(rsp->rda); 1532 rdp = this_cpu_ptr(rsp->rda);
1533 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
1404 if (cpu_needs_another_gp(rsp, rdp)) 1534 if (cpu_needs_another_gp(rsp, rdp))
1405 rsp->gp_flags = 1; 1535 rsp->gp_flags = 1;
1406 raw_spin_unlock_irq(&rnp->lock); 1536 raw_spin_unlock_irq(&rnp->lock);
@@ -1476,57 +1606,62 @@ static int __noreturn rcu_gp_kthread(void *arg)
1476/* 1606/*
1477 * Start a new RCU grace period if warranted, re-initializing the hierarchy 1607 * Start a new RCU grace period if warranted, re-initializing the hierarchy
1478 * in preparation for detecting the next grace period. The caller must hold 1608 * in preparation for detecting the next grace period. The caller must hold
1479 * the root node's ->lock, which is released before return. Hard irqs must 1609 * the root node's ->lock and hard irqs must be disabled.
1480 * be disabled.
1481 * 1610 *
1482 * Note that it is legal for a dying CPU (which is marked as offline) to 1611 * Note that it is legal for a dying CPU (which is marked as offline) to
1483 * invoke this function. This can happen when the dying CPU reports its 1612 * invoke this function. This can happen when the dying CPU reports its
1484 * quiescent state. 1613 * quiescent state.
1485 */ 1614 */
1486static void 1615static void
1487rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 1616rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1488 __releases(rcu_get_root(rsp)->lock) 1617 struct rcu_data *rdp)
1489{ 1618{
1490 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1619 if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
1491 struct rcu_node *rnp = rcu_get_root(rsp);
1492
1493 if (!rsp->gp_kthread ||
1494 !cpu_needs_another_gp(rsp, rdp)) {
1495 /* 1620 /*
1496 * Either we have not yet spawned the grace-period 1621 * Either we have not yet spawned the grace-period
1497 * task, this CPU does not need another grace period, 1622 * task, this CPU does not need another grace period,
1498 * or a grace period is already in progress. 1623 * or a grace period is already in progress.
1499 * Either way, don't start a new grace period. 1624 * Either way, don't start a new grace period.
1500 */ 1625 */
1501 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1502 return; 1626 return;
1503 } 1627 }
1504
1505 /*
1506 * Because there is no grace period in progress right now,
1507 * any callbacks we have up to this point will be satisfied
1508 * by the next grace period. So this is a good place to
1509 * assign a grace period number to recently posted callbacks.
1510 */
1511 rcu_accelerate_cbs(rsp, rnp, rdp);
1512
1513 rsp->gp_flags = RCU_GP_FLAG_INIT; 1628 rsp->gp_flags = RCU_GP_FLAG_INIT;
1514 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
1515
1516 /* Ensure that CPU is aware of completion of last grace period. */
1517 rcu_process_gp_end(rsp, rdp);
1518 local_irq_restore(flags);
1519 1629
1520 /* Wake up rcu_gp_kthread() to start the grace period. */ 1630 /* Wake up rcu_gp_kthread() to start the grace period. */
1521 wake_up(&rsp->gp_wq); 1631 wake_up(&rsp->gp_wq);
1522} 1632}
1523 1633
1524/* 1634/*
1635 * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
1636 * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
1637 * is invoked indirectly from rcu_advance_cbs(), which would result in
1638 * endless recursion -- or would do so if it wasn't for the self-deadlock
1639 * that is encountered beforehand.
1640 */
1641static void
1642rcu_start_gp(struct rcu_state *rsp)
1643{
1644 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1645 struct rcu_node *rnp = rcu_get_root(rsp);
1646
1647 /*
1648 * If there is no grace period in progress right now, any
1649 * callbacks we have up to this point will be satisfied by the
1650 * next grace period. Also, advancing the callbacks reduces the
1651 * probability of false positives from cpu_needs_another_gp()
1652 * resulting in pointless grace periods. So, advance callbacks
1653 * then start the grace period!
1654 */
1655 rcu_advance_cbs(rsp, rnp, rdp);
1656 rcu_start_gp_advanced(rsp, rnp, rdp);
1657}
1658
1659/*
1525 * Report a full set of quiescent states to the specified rcu_state 1660 * Report a full set of quiescent states to the specified rcu_state
1526 * data structure. This involves cleaning up after the prior grace 1661 * data structure. This involves cleaning up after the prior grace
1527 * period and letting rcu_start_gp() start up the next grace period 1662 * period and letting rcu_start_gp() start up the next grace period
1528 * if one is needed. Note that the caller must hold rnp->lock, as 1663 * if one is needed. Note that the caller must hold rnp->lock, which
1529 * required by rcu_start_gp(), which will release it. 1664 * is released before return.
1530 */ 1665 */
1531static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 1666static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1532 __releases(rcu_get_root(rsp)->lock) 1667 __releases(rcu_get_root(rsp)->lock)
@@ -2124,7 +2259,8 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2124 local_irq_save(flags); 2259 local_irq_save(flags);
2125 if (cpu_needs_another_gp(rsp, rdp)) { 2260 if (cpu_needs_another_gp(rsp, rdp)) {
2126 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ 2261 raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
2127 rcu_start_gp(rsp, flags); /* releases above lock */ 2262 rcu_start_gp(rsp);
2263 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
2128 } else { 2264 } else {
2129 local_irq_restore(flags); 2265 local_irq_restore(flags);
2130 } 2266 }
@@ -2169,7 +2305,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
2169 2305
2170static void invoke_rcu_core(void) 2306static void invoke_rcu_core(void)
2171{ 2307{
2172 raise_softirq(RCU_SOFTIRQ); 2308 if (cpu_online(smp_processor_id()))
2309 raise_softirq(RCU_SOFTIRQ);
2173} 2310}
2174 2311
2175/* 2312/*
@@ -2204,11 +2341,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2204 2341
2205 /* Start a new grace period if one not already started. */ 2342 /* Start a new grace period if one not already started. */
2206 if (!rcu_gp_in_progress(rsp)) { 2343 if (!rcu_gp_in_progress(rsp)) {
2207 unsigned long nestflag;
2208 struct rcu_node *rnp_root = rcu_get_root(rsp); 2344 struct rcu_node *rnp_root = rcu_get_root(rsp);
2209 2345
2210 raw_spin_lock_irqsave(&rnp_root->lock, nestflag); 2346 raw_spin_lock(&rnp_root->lock);
2211 rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ 2347 rcu_start_gp(rsp);
2348 raw_spin_unlock(&rnp_root->lock);
2212 } else { 2349 } else {
2213 /* Give the grace period a kick. */ 2350 /* Give the grace period a kick. */
2214 rdp->blimit = LONG_MAX; 2351 rdp->blimit = LONG_MAX;
@@ -2628,19 +2765,27 @@ static int rcu_pending(int cpu)
2628} 2765}
2629 2766
2630/* 2767/*
2631 * Check to see if any future RCU-related work will need to be done 2768 * Return true if the specified CPU has any callback. If all_lazy is
2632 * by the current CPU, even if none need be done immediately, returning 2769 * non-NULL, store an indication of whether all callbacks are lazy.
2633 * 1 if so. 2770 * (If there are no callbacks, all of them are deemed to be lazy.)
2634 */ 2771 */
2635static int rcu_cpu_has_callbacks(int cpu) 2772static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2636{ 2773{
2774 bool al = true;
2775 bool hc = false;
2776 struct rcu_data *rdp;
2637 struct rcu_state *rsp; 2777 struct rcu_state *rsp;
2638 2778
2639 /* RCU callbacks either ready or pending? */ 2779 for_each_rcu_flavor(rsp) {
2640 for_each_rcu_flavor(rsp) 2780 rdp = per_cpu_ptr(rsp->rda, cpu);
2641 if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) 2781 if (rdp->qlen != rdp->qlen_lazy)
2642 return 1; 2782 al = false;
2643 return 0; 2783 if (rdp->nxtlist)
2784 hc = true;
2785 }
2786 if (all_lazy)
2787 *all_lazy = al;
2788 return hc;
2644} 2789}
2645 2790
2646/* 2791/*
@@ -2859,7 +3004,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2859 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3004 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2860 atomic_set(&rdp->dynticks->dynticks, 3005 atomic_set(&rdp->dynticks->dynticks,
2861 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 3006 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2862 rcu_prepare_for_idle_init(cpu);
2863 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 3007 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2864 3008
2865 /* Add CPU to rcu_node bitmasks. */ 3009 /* Add CPU to rcu_node bitmasks. */
@@ -2909,7 +3053,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2909 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 3053 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2910 struct rcu_node *rnp = rdp->mynode; 3054 struct rcu_node *rnp = rdp->mynode;
2911 struct rcu_state *rsp; 3055 struct rcu_state *rsp;
2912 int ret = NOTIFY_OK;
2913 3056
2914 trace_rcu_utilization("Start CPU hotplug"); 3057 trace_rcu_utilization("Start CPU hotplug");
2915 switch (action) { 3058 switch (action) {
@@ -2923,21 +3066,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2923 rcu_boost_kthread_setaffinity(rnp, -1); 3066 rcu_boost_kthread_setaffinity(rnp, -1);
2924 break; 3067 break;
2925 case CPU_DOWN_PREPARE: 3068 case CPU_DOWN_PREPARE:
2926 if (nocb_cpu_expendable(cpu)) 3069 rcu_boost_kthread_setaffinity(rnp, cpu);
2927 rcu_boost_kthread_setaffinity(rnp, cpu);
2928 else
2929 ret = NOTIFY_BAD;
2930 break; 3070 break;
2931 case CPU_DYING: 3071 case CPU_DYING:
2932 case CPU_DYING_FROZEN: 3072 case CPU_DYING_FROZEN:
2933 /*
2934 * The whole machine is "stopped" except this CPU, so we can
2935 * touch any data without introducing corruption. We send the
2936 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2937 */
2938 for_each_rcu_flavor(rsp) 3073 for_each_rcu_flavor(rsp)
2939 rcu_cleanup_dying_cpu(rsp); 3074 rcu_cleanup_dying_cpu(rsp);
2940 rcu_cleanup_after_idle(cpu);
2941 break; 3075 break;
2942 case CPU_DEAD: 3076 case CPU_DEAD:
2943 case CPU_DEAD_FROZEN: 3077 case CPU_DEAD_FROZEN:
@@ -2950,7 +3084,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2950 break; 3084 break;
2951 } 3085 }
2952 trace_rcu_utilization("End CPU hotplug"); 3086 trace_rcu_utilization("End CPU hotplug");
2953 return ret; 3087 return NOTIFY_OK;
2954} 3088}
2955 3089
2956/* 3090/*
@@ -3085,6 +3219,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3085 } 3219 }
3086 rnp->level = i; 3220 rnp->level = i;
3087 INIT_LIST_HEAD(&rnp->blkd_tasks); 3221 INIT_LIST_HEAD(&rnp->blkd_tasks);
3222 rcu_init_one_nocb(rnp);
3088 } 3223 }
3089 } 3224 }
3090 3225
@@ -3170,8 +3305,7 @@ void __init rcu_init(void)
3170 rcu_init_one(&rcu_sched_state, &rcu_sched_data); 3305 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3171 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3306 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3172 __rcu_init_preempt(); 3307 __rcu_init_preempt();
3173 rcu_init_nocb(); 3308 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3174 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3175 3309
3176 /* 3310 /*
3177 * We don't need protection against CPU-hotplug here because 3311 * We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index c896b5045d9d..14ee40795d6f 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -88,18 +88,13 @@ struct rcu_dynticks {
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
89 atomic_t dynticks; /* Even value for idle, else odd. */ 89 atomic_t dynticks; /* Even value for idle, else odd. */
90#ifdef CONFIG_RCU_FAST_NO_HZ 90#ifdef CONFIG_RCU_FAST_NO_HZ
91 int dyntick_drain; /* Prepare-for-idle state variable. */ 91 bool all_lazy; /* Are all CPU's CBs lazy? */
92 unsigned long dyntick_holdoff;
93 /* No retries for the jiffy of failure. */
94 struct timer_list idle_gp_timer;
95 /* Wake up CPU sleeping with callbacks. */
96 unsigned long idle_gp_timer_expires;
97 /* When to wake up CPU (for repost). */
98 bool idle_first_pass; /* First pass of attempt to go idle? */
99 unsigned long nonlazy_posted; 92 unsigned long nonlazy_posted;
100 /* # times non-lazy CBs posted to CPU. */ 93 /* # times non-lazy CBs posted to CPU. */
101 unsigned long nonlazy_posted_snap; 94 unsigned long nonlazy_posted_snap;
102 /* idle-period nonlazy_posted snapshot. */ 95 /* idle-period nonlazy_posted snapshot. */
96 unsigned long last_accelerate;
97 /* Last jiffy CBs were accelerated. */
103 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 98 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
104#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 99#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
105}; 100};
@@ -134,9 +129,6 @@ struct rcu_node {
134 /* elements that need to drain to allow the */ 129 /* elements that need to drain to allow the */
135 /* current expedited grace period to */ 130 /* current expedited grace period to */
136 /* complete (only for TREE_PREEMPT_RCU). */ 131 /* complete (only for TREE_PREEMPT_RCU). */
137 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
138 /* Since this has meaning only for leaf */
139 /* rcu_node structures, 32 bits suffices. */
140 unsigned long qsmaskinit; 132 unsigned long qsmaskinit;
141 /* Per-GP initial value for qsmask & expmask. */ 133 /* Per-GP initial value for qsmask & expmask. */
142 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 134 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -196,6 +188,12 @@ struct rcu_node {
196 /* Refused to boost: not sure why, though. */ 188 /* Refused to boost: not sure why, though. */
197 /* This can happen due to race conditions. */ 189 /* This can happen due to race conditions. */
198#endif /* #ifdef CONFIG_RCU_BOOST */ 190#endif /* #ifdef CONFIG_RCU_BOOST */
191#ifdef CONFIG_RCU_NOCB_CPU
192 wait_queue_head_t nocb_gp_wq[2];
193 /* Place for rcu_nocb_kthread() to wait GP. */
194#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
195 int need_future_gp[2];
196 /* Counts of upcoming no-CB GP requests. */
199 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; 197 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
200} ____cacheline_internodealigned_in_smp; 198} ____cacheline_internodealigned_in_smp;
201 199
@@ -328,6 +326,11 @@ struct rcu_data {
328 struct task_struct *nocb_kthread; 326 struct task_struct *nocb_kthread;
329#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 327#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
330 328
329 /* 8) RCU CPU stall data. */
330#ifdef CONFIG_RCU_CPU_STALL_INFO
331 unsigned int softirq_snap; /* Snapshot of softirq activity. */
332#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
333
331 int cpu; 334 int cpu;
332 struct rcu_state *rsp; 335 struct rcu_state *rsp;
333}; 336};
@@ -375,12 +378,6 @@ struct rcu_state {
375 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 378 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
376 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 379 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
377 void (*func)(struct rcu_head *head)); 380 void (*func)(struct rcu_head *head));
378#ifdef CONFIG_RCU_NOCB_CPU
379 void (*call_remote)(struct rcu_head *head,
380 void (*func)(struct rcu_head *head));
381 /* call_rcu() flavor, but for */
382 /* placing on remote CPU. */
383#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
384 381
385 /* The following fields are guarded by the root rcu_node's lock. */ 382 /* The following fields are guarded by the root rcu_node's lock. */
386 383
@@ -443,6 +440,7 @@ struct rcu_state {
443 unsigned long gp_max; /* Maximum GP duration in */ 440 unsigned long gp_max; /* Maximum GP duration in */
444 /* jiffies. */ 441 /* jiffies. */
445 char *name; /* Name of structure. */ 442 char *name; /* Name of structure. */
443 char abbr; /* Abbreviated name. */
446 struct list_head flavors; /* List of RCU flavors. */ 444 struct list_head flavors; /* List of RCU flavors. */
447}; 445};
448 446
@@ -520,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
520 struct rcu_node *rnp); 518 struct rcu_node *rnp);
521#endif /* #ifdef CONFIG_RCU_BOOST */ 519#endif /* #ifdef CONFIG_RCU_BOOST */
522static void __cpuinit rcu_prepare_kthreads(int cpu); 520static void __cpuinit rcu_prepare_kthreads(int cpu);
523static void rcu_prepare_for_idle_init(int cpu);
524static void rcu_cleanup_after_idle(int cpu); 521static void rcu_cleanup_after_idle(int cpu);
525static void rcu_prepare_for_idle(int cpu); 522static void rcu_prepare_for_idle(int cpu);
526static void rcu_idle_count_callbacks_posted(void); 523static void rcu_idle_count_callbacks_posted(void);
@@ -529,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
529static void print_cpu_stall_info_end(void); 526static void print_cpu_stall_info_end(void);
530static void zero_cpu_stall_ticks(struct rcu_data *rdp); 527static void zero_cpu_stall_ticks(struct rcu_data *rdp);
531static void increment_cpu_stall_ticks(void); 528static void increment_cpu_stall_ticks(void);
529static int rcu_nocb_needs_gp(struct rcu_state *rsp);
530static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
531static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
532static void rcu_init_one_nocb(struct rcu_node *rnp);
532static bool is_nocb_cpu(int cpu); 533static bool is_nocb_cpu(int cpu);
533static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 534static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
534 bool lazy); 535 bool lazy);
535static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 536static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
536 struct rcu_data *rdp); 537 struct rcu_data *rdp);
537static bool nocb_cpu_expendable(int cpu);
538static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 538static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
539static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 539static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
540static void init_nocb_callback_list(struct rcu_data *rdp); 540static bool init_nocb_callback_list(struct rcu_data *rdp);
541static void __init rcu_init_nocb(void);
542 541
543#endif /* #ifndef RCU_TREE_NONCORE */ 542#endif /* #ifndef RCU_TREE_NONCORE */
544 543
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c1cc7e17ff9d..d084ae3f281c 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -85,11 +85,21 @@ static void __init rcu_bootup_announce_oddness(void)
85 if (nr_cpu_ids != NR_CPUS) 85 if (nr_cpu_ids != NR_CPUS)
86 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 86 printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
87#ifdef CONFIG_RCU_NOCB_CPU 87#ifdef CONFIG_RCU_NOCB_CPU
88#ifndef CONFIG_RCU_NOCB_CPU_NONE
89 if (!have_rcu_nocb_mask) {
90 alloc_bootmem_cpumask_var(&rcu_nocb_mask);
91 have_rcu_nocb_mask = true;
92 }
93#ifdef CONFIG_RCU_NOCB_CPU_ZERO
94 pr_info("\tExperimental no-CBs CPU 0\n");
95 cpumask_set_cpu(0, rcu_nocb_mask);
96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
97#ifdef CONFIG_RCU_NOCB_CPU_ALL
98 pr_info("\tExperimental no-CBs for all CPUs\n");
99 cpumask_setall(rcu_nocb_mask);
100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
88 if (have_rcu_nocb_mask) { 102 if (have_rcu_nocb_mask) {
89 if (cpumask_test_cpu(0, rcu_nocb_mask)) {
90 cpumask_clear_cpu(0, rcu_nocb_mask);
91 pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
92 }
93 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
94 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); 104 pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
95 if (rcu_nocb_poll) 105 if (rcu_nocb_poll)
@@ -101,7 +111,7 @@ static void __init rcu_bootup_announce_oddness(void)
101#ifdef CONFIG_TREE_PREEMPT_RCU 111#ifdef CONFIG_TREE_PREEMPT_RCU
102 112
103struct rcu_state rcu_preempt_state = 113struct rcu_state rcu_preempt_state =
104 RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); 114 RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
105DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 115DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
106static struct rcu_state *rcu_state = &rcu_preempt_state; 116static struct rcu_state *rcu_state = &rcu_preempt_state;
107 117
@@ -1533,14 +1543,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1533int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1543int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
1534{ 1544{
1535 *delta_jiffies = ULONG_MAX; 1545 *delta_jiffies = ULONG_MAX;
1536 return rcu_cpu_has_callbacks(cpu); 1546 return rcu_cpu_has_callbacks(cpu, NULL);
1537}
1538
1539/*
1540 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1541 */
1542static void rcu_prepare_for_idle_init(int cpu)
1543{
1544} 1547}
1545 1548
1546/* 1549/*
@@ -1577,16 +1580,6 @@ static void rcu_idle_count_callbacks_posted(void)
1577 * 1580 *
1578 * The following three preprocessor symbols control this state machine: 1581 * The following three preprocessor symbols control this state machine:
1579 * 1582 *
1580 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
1581 * to satisfy RCU. Beyond this point, it is better to incur a periodic
1582 * scheduling-clock interrupt than to loop through the state machine
1583 * at full power.
1584 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
1585 * optional if RCU does not need anything immediately from this
1586 * CPU, even if this CPU still has RCU callbacks queued. The first
1587 * times through the state machine are mandatory: we need to give
1588 * the state machine a chance to communicate a quiescent state
1589 * to the RCU core.
1590 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted 1583 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
1591 * to sleep in dyntick-idle mode with RCU callbacks pending. This 1584 * to sleep in dyntick-idle mode with RCU callbacks pending. This
1592 * is sized to be roughly one RCU grace period. Those energy-efficiency 1585 * is sized to be roughly one RCU grace period. Those energy-efficiency
@@ -1602,186 +1595,108 @@ static void rcu_idle_count_callbacks_posted(void)
1602 * adjustment, they can be converted into kernel config parameters, though 1595 * adjustment, they can be converted into kernel config parameters, though
1603 * making the state machine smarter might be a better option. 1596 * making the state machine smarter might be a better option.
1604 */ 1597 */
1605#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
1606#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
1607#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ 1598#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
1608#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ 1599#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
1609 1600
1610extern int tick_nohz_enabled; 1601static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
1611 1602module_param(rcu_idle_gp_delay, int, 0644);
1612/* 1603static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
1613 * Does the specified flavor of RCU have non-lazy callbacks pending on 1604module_param(rcu_idle_lazy_gp_delay, int, 0644);
1614 * the specified CPU? Both RCU flavor and CPU are specified by the
1615 * rcu_data structure.
1616 */
1617static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
1618{
1619 return rdp->qlen != rdp->qlen_lazy;
1620}
1621 1605
1622#ifdef CONFIG_TREE_PREEMPT_RCU 1606extern int tick_nohz_enabled;
1623 1607
1624/* 1608/*
1625 * Are there non-lazy RCU-preempt callbacks? (There cannot be if there 1609 * Try to advance callbacks for all flavors of RCU on the current CPU.
1626 * is no RCU-preempt in the kernel.) 1610 * Afterwards, if there are any callbacks ready for immediate invocation,
1611 * return true.
1627 */ 1612 */
1628static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1613static bool rcu_try_advance_all_cbs(void)
1629{ 1614{
1630 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 1615 bool cbs_ready = false;
1631 1616 struct rcu_data *rdp;
1632 return __rcu_cpu_has_nonlazy_callbacks(rdp); 1617 struct rcu_node *rnp;
1633} 1618 struct rcu_state *rsp;
1634
1635#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1636 1619
1637static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) 1620 for_each_rcu_flavor(rsp) {
1638{ 1621 rdp = this_cpu_ptr(rsp->rda);
1639 return 0; 1622 rnp = rdp->mynode;
1640}
1641 1623
1642#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ 1624 /*
1625 * Don't bother checking unless a grace period has
1626 * completed since we last checked and there are
1627 * callbacks not yet ready to invoke.
1628 */
1629 if (rdp->completed != rnp->completed &&
1630 rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
1631 rcu_process_gp_end(rsp, rdp);
1643 1632
1644/* 1633 if (cpu_has_callbacks_ready_to_invoke(rdp))
1645 * Does any flavor of RCU have non-lazy callbacks on the specified CPU? 1634 cbs_ready = true;
1646 */ 1635 }
1647static bool rcu_cpu_has_nonlazy_callbacks(int cpu) 1636 return cbs_ready;
1648{
1649 return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
1650 __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
1651 rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
1652} 1637}
1653 1638
1654/* 1639/*
1655 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1640 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
1656 * callbacks on this CPU, (2) this CPU has not yet attempted to enter 1641 * to invoke. If the CPU has callbacks, try to advance them. Tell the
1657 * dyntick-idle mode, or (3) this CPU is in the process of attempting to 1642 * caller to set the timeout based on whether or not there are non-lazy
1658 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed 1643 * callbacks.
1659 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
1660 * it is better to incur scheduling-clock interrupts than to spin
1661 * continuously for the same time duration!
1662 * 1644 *
1663 * The delta_jiffies argument is used to store the time when RCU is 1645 * The caller must have disabled interrupts.
1664 * going to need the CPU again if it still has callbacks. The reason
1665 * for this is that rcu_prepare_for_idle() might need to post a timer,
1666 * but if so, it will do so after tick_nohz_stop_sched_tick() has set
1667 * the wakeup time for this CPU. This means that RCU's timer can be
1668 * delayed until the wakeup time, which defeats the purpose of posting
1669 * a timer.
1670 */ 1646 */
1671int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) 1647int rcu_needs_cpu(int cpu, unsigned long *dj)
1672{ 1648{
1673 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1649 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1674 1650
1675 /* Flag a new idle sojourn to the idle-entry state machine. */ 1651 /* Snapshot to detect later posting of non-lazy callback. */
1676 rdtp->idle_first_pass = 1; 1652 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1653
1677 /* If no callbacks, RCU doesn't need the CPU. */ 1654 /* If no callbacks, RCU doesn't need the CPU. */
1678 if (!rcu_cpu_has_callbacks(cpu)) { 1655 if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
1679 *delta_jiffies = ULONG_MAX; 1656 *dj = ULONG_MAX;
1680 return 0; 1657 return 0;
1681 } 1658 }
1682 if (rdtp->dyntick_holdoff == jiffies) { 1659
1683 /* RCU recently tried and failed, so don't try again. */ 1660 /* Attempt to advance callbacks. */
1684 *delta_jiffies = 1; 1661 if (rcu_try_advance_all_cbs()) {
1662 /* Some ready to invoke, so initiate later invocation. */
1663 invoke_rcu_core();
1685 return 1; 1664 return 1;
1686 } 1665 }
1687 /* Set up for the possibility that RCU will post a timer. */ 1666 rdtp->last_accelerate = jiffies;
1688 if (rcu_cpu_has_nonlazy_callbacks(cpu)) { 1667
1689 *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, 1668 /* Request timer delay depending on laziness, and round. */
1690 RCU_IDLE_GP_DELAY) - jiffies; 1669 if (rdtp->all_lazy) {
1670 *dj = round_up(rcu_idle_gp_delay + jiffies,
1671 rcu_idle_gp_delay) - jiffies;
1691 } else { 1672 } else {
1692 *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; 1673 *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
1693 *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
1694 } 1674 }
1695 return 0; 1675 return 0;
1696} 1676}
1697 1677
1698/* 1678/*
1699 * Handler for smp_call_function_single(). The only point of this 1679 * Prepare a CPU for idle from an RCU perspective. The first major task
1700 * handler is to wake the CPU up, so the handler does only tracing. 1680 * is to sense whether nohz mode has been enabled or disabled via sysfs.
1701 */ 1681 * The second major task is to check to see if a non-lazy callback has
1702void rcu_idle_demigrate(void *unused) 1682 * arrived at a CPU that previously had only lazy callbacks. The third
1703{ 1683 * major task is to accelerate (that is, assign grace-period numbers to)
1704 trace_rcu_prep_idle("Demigrate"); 1684 * any recently arrived callbacks.
1705}
1706
1707/*
1708 * Timer handler used to force CPU to start pushing its remaining RCU
1709 * callbacks in the case where it entered dyntick-idle mode with callbacks
1710 * pending. The hander doesn't really need to do anything because the
1711 * real work is done upon re-entry to idle, or by the next scheduling-clock
1712 * interrupt should idle not be re-entered.
1713 *
1714 * One special case: the timer gets migrated without awakening the CPU
1715 * on which the timer was scheduled on. In this case, we must wake up
1716 * that CPU. We do so with smp_call_function_single().
1717 */
1718static void rcu_idle_gp_timer_func(unsigned long cpu_in)
1719{
1720 int cpu = (int)cpu_in;
1721
1722 trace_rcu_prep_idle("Timer");
1723 if (cpu != smp_processor_id())
1724 smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
1725 else
1726 WARN_ON_ONCE(1); /* Getting here can hang the system... */
1727}
1728
1729/*
1730 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
1731 */
1732static void rcu_prepare_for_idle_init(int cpu)
1733{
1734 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1735
1736 rdtp->dyntick_holdoff = jiffies - 1;
1737 setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
1738 rdtp->idle_gp_timer_expires = jiffies - 1;
1739 rdtp->idle_first_pass = 1;
1740}
1741
1742/*
1743 * Clean up for exit from idle. Because we are exiting from idle, there
1744 * is no longer any point to ->idle_gp_timer, so cancel it. This will
1745 * do nothing if this timer is not active, so just cancel it unconditionally.
1746 */
1747static void rcu_cleanup_after_idle(int cpu)
1748{
1749 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1750
1751 del_timer(&rdtp->idle_gp_timer);
1752 trace_rcu_prep_idle("Cleanup after idle");
1753 rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
1754}
1755
1756/*
1757 * Check to see if any RCU-related work can be done by the current CPU,
1758 * and if so, schedule a softirq to get it done. This function is part
1759 * of the RCU implementation; it is -not- an exported member of the RCU API.
1760 *
1761 * The idea is for the current CPU to clear out all work required by the
1762 * RCU core for the current grace period, so that this CPU can be permitted
1763 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1764 * at the end of the grace period by whatever CPU ends the grace period.
1765 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
1766 * number of wakeups by a modest integer factor.
1767 *
1768 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1769 * disabled, we do one pass of force_quiescent_state(), then do a
1770 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1771 * later. The ->dyntick_drain field controls the sequencing.
1772 * 1685 *
1773 * The caller must have disabled interrupts. 1686 * The caller must have disabled interrupts.
1774 */ 1687 */
1775static void rcu_prepare_for_idle(int cpu) 1688static void rcu_prepare_for_idle(int cpu)
1776{ 1689{
1777 struct timer_list *tp; 1690 struct rcu_data *rdp;
1778 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1691 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1692 struct rcu_node *rnp;
1693 struct rcu_state *rsp;
1779 int tne; 1694 int tne;
1780 1695
1781 /* Handle nohz enablement switches conservatively. */ 1696 /* Handle nohz enablement switches conservatively. */
1782 tne = ACCESS_ONCE(tick_nohz_enabled); 1697 tne = ACCESS_ONCE(tick_nohz_enabled);
1783 if (tne != rdtp->tick_nohz_enabled_snap) { 1698 if (tne != rdtp->tick_nohz_enabled_snap) {
1784 if (rcu_cpu_has_callbacks(cpu)) 1699 if (rcu_cpu_has_callbacks(cpu, NULL))
1785 invoke_rcu_core(); /* force nohz to see update. */ 1700 invoke_rcu_core(); /* force nohz to see update. */
1786 rdtp->tick_nohz_enabled_snap = tne; 1701 rdtp->tick_nohz_enabled_snap = tne;
1787 return; 1702 return;
@@ -1789,125 +1704,56 @@ static void rcu_prepare_for_idle(int cpu)
1789 if (!tne) 1704 if (!tne)
1790 return; 1705 return;
1791 1706
1792 /* Adaptive-tick mode, where usermode execution is idle to RCU. */ 1707 /* If this is a no-CBs CPU, no callbacks, just return. */
1793 if (!is_idle_task(current)) { 1708 if (is_nocb_cpu(cpu))
1794 rdtp->dyntick_holdoff = jiffies - 1;
1795 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1796 trace_rcu_prep_idle("User dyntick with callbacks");
1797 rdtp->idle_gp_timer_expires =
1798 round_up(jiffies + RCU_IDLE_GP_DELAY,
1799 RCU_IDLE_GP_DELAY);
1800 } else if (rcu_cpu_has_callbacks(cpu)) {
1801 rdtp->idle_gp_timer_expires =
1802 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1803 trace_rcu_prep_idle("User dyntick with lazy callbacks");
1804 } else {
1805 return;
1806 }
1807 tp = &rdtp->idle_gp_timer;
1808 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1809 return; 1709 return;
1810 }
1811 1710
1812 /* 1711 /*
1813 * If this is an idle re-entry, for example, due to use of 1712 * If a non-lazy callback arrived at a CPU having only lazy
1814 * RCU_NONIDLE() or the new idle-loop tracing API within the idle 1713 * callbacks, invoke RCU core for the side-effect of recalculating
1815 * loop, then don't take any state-machine actions, unless the 1714 * idle duration on re-entry to idle.
1816 * momentary exit from idle queued additional non-lazy callbacks.
1817 * Instead, repost the ->idle_gp_timer if this CPU has callbacks
1818 * pending.
1819 */ 1715 */
1820 if (!rdtp->idle_first_pass && 1716 if (rdtp->all_lazy &&
1821 (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { 1717 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1822 if (rcu_cpu_has_callbacks(cpu)) { 1718 invoke_rcu_core();
1823 tp = &rdtp->idle_gp_timer;
1824 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1825 }
1826 return; 1719 return;
1827 } 1720 }
1828 rdtp->idle_first_pass = 0;
1829 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
1830 1721
1831 /* 1722 /*
1832 * If there are no callbacks on this CPU, enter dyntick-idle mode. 1723 * If we have not yet accelerated this jiffy, accelerate all
1833 * Also reset state to avoid prejudicing later attempts. 1724 * callbacks on this CPU.
1834 */ 1725 */
1835 if (!rcu_cpu_has_callbacks(cpu)) { 1726 if (rdtp->last_accelerate == jiffies)
1836 rdtp->dyntick_holdoff = jiffies - 1;
1837 rdtp->dyntick_drain = 0;
1838 trace_rcu_prep_idle("No callbacks");
1839 return; 1727 return;
1728 rdtp->last_accelerate = jiffies;
1729 for_each_rcu_flavor(rsp) {
1730 rdp = per_cpu_ptr(rsp->rda, cpu);
1731 if (!*rdp->nxttail[RCU_DONE_TAIL])
1732 continue;
1733 rnp = rdp->mynode;
1734 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1735 rcu_accelerate_cbs(rsp, rnp, rdp);
1736 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1840 } 1737 }
1738}
1841 1739
1842 /* 1740/*
1843 * If in holdoff mode, just return. We will presumably have 1741 * Clean up for exit from idle. Attempt to advance callbacks based on
1844 * refrained from disabling the scheduling-clock tick. 1742 * any grace periods that elapsed while the CPU was idle, and if any
1845 */ 1743 * callbacks are now ready to invoke, initiate invocation.
1846 if (rdtp->dyntick_holdoff == jiffies) { 1744 */
1847 trace_rcu_prep_idle("In holdoff"); 1745static void rcu_cleanup_after_idle(int cpu)
1848 return; 1746{
1849 } 1747 struct rcu_data *rdp;
1748 struct rcu_state *rsp;
1850 1749
1851 /* Check and update the ->dyntick_drain sequencing. */ 1750 if (is_nocb_cpu(cpu))
1852 if (rdtp->dyntick_drain <= 0) {
1853 /* First time through, initialize the counter. */
1854 rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
1855 } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
1856 !rcu_pending(cpu) &&
1857 !local_softirq_pending()) {
1858 /* Can we go dyntick-idle despite still having callbacks? */
1859 rdtp->dyntick_drain = 0;
1860 rdtp->dyntick_holdoff = jiffies;
1861 if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
1862 trace_rcu_prep_idle("Dyntick with callbacks");
1863 rdtp->idle_gp_timer_expires =
1864 round_up(jiffies + RCU_IDLE_GP_DELAY,
1865 RCU_IDLE_GP_DELAY);
1866 } else {
1867 rdtp->idle_gp_timer_expires =
1868 round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
1869 trace_rcu_prep_idle("Dyntick with lazy callbacks");
1870 }
1871 tp = &rdtp->idle_gp_timer;
1872 mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
1873 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1874 return; /* Nothing more to do immediately. */
1875 } else if (--(rdtp->dyntick_drain) <= 0) {
1876 /* We have hit the limit, so time to give up. */
1877 rdtp->dyntick_holdoff = jiffies;
1878 trace_rcu_prep_idle("Begin holdoff");
1879 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
1880 return; 1751 return;
1881 } 1752 rcu_try_advance_all_cbs();
1882 1753 for_each_rcu_flavor(rsp) {
1883 /* 1754 rdp = per_cpu_ptr(rsp->rda, cpu);
1884 * Do one step of pushing the remaining RCU callbacks through 1755 if (cpu_has_callbacks_ready_to_invoke(rdp))
1885 * the RCU core state machine. 1756 invoke_rcu_core();
1886 */
1887#ifdef CONFIG_TREE_PREEMPT_RCU
1888 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
1889 rcu_preempt_qs(cpu);
1890 force_quiescent_state(&rcu_preempt_state);
1891 }
1892#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1893 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1894 rcu_sched_qs(cpu);
1895 force_quiescent_state(&rcu_sched_state);
1896 }
1897 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1898 rcu_bh_qs(cpu);
1899 force_quiescent_state(&rcu_bh_state);
1900 }
1901
1902 /*
1903 * If RCU callbacks are still pending, RCU still needs this CPU.
1904 * So try forcing the callbacks through the grace period.
1905 */
1906 if (rcu_cpu_has_callbacks(cpu)) {
1907 trace_rcu_prep_idle("More callbacks");
1908 invoke_rcu_core();
1909 } else {
1910 trace_rcu_prep_idle("Callbacks drained");
1911 } 1757 }
1912} 1758}
1913 1759
@@ -2015,16 +1861,13 @@ early_initcall(rcu_register_oom_notifier);
2015static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 1861static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2016{ 1862{
2017 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); 1863 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
2018 struct timer_list *tltp = &rdtp->idle_gp_timer; 1864 unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
2019 char c;
2020 1865
2021 c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; 1866 sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
2022 if (timer_pending(tltp)) 1867 rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
2023 sprintf(cp, "drain=%d %c timer=%lu", 1868 ulong2long(nlpd),
2024 rdtp->dyntick_drain, c, tltp->expires - jiffies); 1869 rdtp->all_lazy ? 'L' : '.',
2025 else 1870 rdtp->tick_nohz_enabled_snap ? '.' : 'D');
2026 sprintf(cp, "drain=%d %c timer not pending",
2027 rdtp->dyntick_drain, c);
2028} 1871}
2029 1872
2030#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 1873#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -2070,10 +1913,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2070 ticks_value = rsp->gpnum - rdp->gpnum; 1913 ticks_value = rsp->gpnum - rdp->gpnum;
2071 } 1914 }
2072 print_cpu_stall_fast_no_hz(fast_no_hz, cpu); 1915 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2073 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", 1916 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
2074 cpu, ticks_value, ticks_title, 1917 cpu, ticks_value, ticks_title,
2075 atomic_read(&rdtp->dynticks) & 0xfff, 1918 atomic_read(&rdtp->dynticks) & 0xfff,
2076 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, 1919 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1920 rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
2077 fast_no_hz); 1921 fast_no_hz);
2078} 1922}
2079 1923
@@ -2087,6 +1931,7 @@ static void print_cpu_stall_info_end(void)
2087static void zero_cpu_stall_ticks(struct rcu_data *rdp) 1931static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2088{ 1932{
2089 rdp->ticks_this_gp = 0; 1933 rdp->ticks_this_gp = 0;
1934 rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
2090} 1935}
2091 1936
2092/* Increment ->ticks_this_gp for all flavors of RCU. */ 1937/* Increment ->ticks_this_gp for all flavors of RCU. */
@@ -2165,6 +2010,47 @@ static int __init parse_rcu_nocb_poll(char *arg)
2165} 2010}
2166early_param("rcu_nocb_poll", parse_rcu_nocb_poll); 2011early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
2167 2012
2013/*
2014 * Do any no-CBs CPUs need another grace period?
2015 *
2016 * Interrupts must be disabled. If the caller does not hold the root
2017 * rcu_node structure's ->lock, the results are advisory only.
2018 */
2019static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2020{
2021 struct rcu_node *rnp = rcu_get_root(rsp);
2022
2023 return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
2024}
2025
2026/*
2027 * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
2028 * grace period.
2029 */
2030static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2031{
2032 wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
2033}
2034
2035/*
2036 * Set the root rcu_node structure's ->need_future_gp field
2037 * based on the sum of those of all rcu_node structures. This does
2038 * double-count the root rcu_node structure's requests, but this
2039 * is necessary to handle the possibility of a rcu_nocb_kthread()
2040 * having awakened during the time that the rcu_node structures
2041 * were being updated for the end of the previous grace period.
2042 */
2043static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2044{
2045 rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
2046}
2047
2048static void rcu_init_one_nocb(struct rcu_node *rnp)
2049{
2050 init_waitqueue_head(&rnp->nocb_gp_wq[0]);
2051 init_waitqueue_head(&rnp->nocb_gp_wq[1]);
2052}
2053
2168/* Is the specified CPU a no-CBs CPU? */ 2054/* Is the specified CPU a no-CBs CPU? */
2169static bool is_nocb_cpu(int cpu) 2055static bool is_nocb_cpu(int cpu)
2170{ 2056{
@@ -2227,6 +2113,13 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2227 if (!is_nocb_cpu(rdp->cpu)) 2113 if (!is_nocb_cpu(rdp->cpu))
2228 return 0; 2114 return 0;
2229 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); 2115 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
2116 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2117 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2118 (unsigned long)rhp->func,
2119 rdp->qlen_lazy, rdp->qlen);
2120 else
2121 trace_rcu_callback(rdp->rsp->name, rhp,
2122 rdp->qlen_lazy, rdp->qlen);
2230 return 1; 2123 return 1;
2231} 2124}
2232 2125
@@ -2265,95 +2158,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2265} 2158}
2266 2159
2267/* 2160/*
2268 * There must be at least one non-no-CBs CPU in operation at any given 2161 * If necessary, kick off a new grace period, and either way wait
2269 * time, because no-CBs CPUs are not capable of initiating grace periods 2162 * for a subsequent grace period to complete.
2270 * independently. This function therefore complains if the specified
2271 * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
2272 * avoid offlining the last such CPU. (Recursion is a wonderful thing,
2273 * but you have to have a base case!)
2274 */ 2163 */
2275static bool nocb_cpu_expendable(int cpu) 2164static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2276{ 2165{
2277 cpumask_var_t non_nocb_cpus; 2166 unsigned long c;
2278 int ret; 2167 bool d;
2168 unsigned long flags;
2169 struct rcu_node *rnp = rdp->mynode;
2170
2171 raw_spin_lock_irqsave(&rnp->lock, flags);
2172 c = rcu_start_future_gp(rnp, rdp);
2173 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2279 2174
2280 /* 2175 /*
2281 * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, 2176 * Wait for the grace period. Do so interruptibly to avoid messing
2282 * then offlining this CPU is harmless. Let it happen. 2177 * up the load average.
2283 */ 2178 */
2284 if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) 2179 trace_rcu_future_gp(rnp, rdp, c, "StartWait");
2285 return 1; 2180 for (;;) {
2286 2181 wait_event_interruptible(
2287 /* If no memory, play it safe and keep the CPU around. */ 2182 rnp->nocb_gp_wq[c & 0x1],
2288 if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) 2183 (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
2289 return 0; 2184 if (likely(d))
2290 cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); 2185 break;
2291 cpumask_clear_cpu(cpu, non_nocb_cpus); 2186 flush_signals(current);
2292 ret = !cpumask_empty(non_nocb_cpus); 2187 trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
2293 free_cpumask_var(non_nocb_cpus); 2188 }
2294 return ret; 2189 trace_rcu_future_gp(rnp, rdp, c, "EndWait");
2295} 2190 smp_mb(); /* Ensure that CB invocation happens after GP end. */
2296
2297/*
2298 * Helper structure for remote registry of RCU callbacks.
2299 * This is needed for when a no-CBs CPU needs to start a grace period.
2300 * If it just invokes call_rcu(), the resulting callback will be queued,
2301 * which can result in deadlock.
2302 */
2303struct rcu_head_remote {
2304 struct rcu_head *rhp;
2305 call_rcu_func_t *crf;
2306 void (*func)(struct rcu_head *rhp);
2307};
2308
2309/*
2310 * Register a callback as specified by the rcu_head_remote struct.
2311 * This function is intended to be invoked via smp_call_function_single().
2312 */
2313static void call_rcu_local(void *arg)
2314{
2315 struct rcu_head_remote *rhrp =
2316 container_of(arg, struct rcu_head_remote, rhp);
2317
2318 rhrp->crf(rhrp->rhp, rhrp->func);
2319}
2320
2321/*
2322 * Set up an rcu_head_remote structure and the invoke call_rcu_local()
2323 * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
2324 * smp_call_function_single().
2325 */
2326static void invoke_crf_remote(struct rcu_head *rhp,
2327 void (*func)(struct rcu_head *rhp),
2328 call_rcu_func_t crf)
2329{
2330 struct rcu_head_remote rhr;
2331
2332 rhr.rhp = rhp;
2333 rhr.crf = crf;
2334 rhr.func = func;
2335 smp_call_function_single(0, call_rcu_local, &rhr, 1);
2336}
2337
2338/*
2339 * Helper functions to be passed to wait_rcu_gp(), each of which
2340 * invokes invoke_crf_remote() to register a callback appropriately.
2341 */
2342static void __maybe_unused
2343call_rcu_preempt_remote(struct rcu_head *rhp,
2344 void (*func)(struct rcu_head *rhp))
2345{
2346 invoke_crf_remote(rhp, func, call_rcu);
2347}
2348static void call_rcu_bh_remote(struct rcu_head *rhp,
2349 void (*func)(struct rcu_head *rhp))
2350{
2351 invoke_crf_remote(rhp, func, call_rcu_bh);
2352}
2353static void call_rcu_sched_remote(struct rcu_head *rhp,
2354 void (*func)(struct rcu_head *rhp))
2355{
2356 invoke_crf_remote(rhp, func, call_rcu_sched);
2357} 2191}
2358 2192
2359/* 2193/*
@@ -2390,7 +2224,7 @@ static int rcu_nocb_kthread(void *arg)
2390 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); 2224 cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
2391 ACCESS_ONCE(rdp->nocb_p_count) += c; 2225 ACCESS_ONCE(rdp->nocb_p_count) += c;
2392 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; 2226 ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
2393 wait_rcu_gp(rdp->rsp->call_remote); 2227 rcu_nocb_wait_gp(rdp);
2394 2228
2395 /* Each pass through the following loop invokes a callback. */ 2229 /* Each pass through the following loop invokes a callback. */
2396 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); 2230 trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2436,32 +2270,41 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2436 return; 2270 return;
2437 for_each_cpu(cpu, rcu_nocb_mask) { 2271 for_each_cpu(cpu, rcu_nocb_mask) {
2438 rdp = per_cpu_ptr(rsp->rda, cpu); 2272 rdp = per_cpu_ptr(rsp->rda, cpu);
2439 t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); 2273 t = kthread_run(rcu_nocb_kthread, rdp,
2274 "rcuo%c/%d", rsp->abbr, cpu);
2440 BUG_ON(IS_ERR(t)); 2275 BUG_ON(IS_ERR(t));
2441 ACCESS_ONCE(rdp->nocb_kthread) = t; 2276 ACCESS_ONCE(rdp->nocb_kthread) = t;
2442 } 2277 }
2443} 2278}
2444 2279
2445/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ 2280/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2446static void init_nocb_callback_list(struct rcu_data *rdp) 2281static bool init_nocb_callback_list(struct rcu_data *rdp)
2447{ 2282{
2448 if (rcu_nocb_mask == NULL || 2283 if (rcu_nocb_mask == NULL ||
2449 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) 2284 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2450 return; 2285 return false;
2451 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2286 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2287 return true;
2288}
2289
2290#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2291
2292static int rcu_nocb_needs_gp(struct rcu_state *rsp)
2293{
2294 return 0;
2452} 2295}
2453 2296
2454/* Initialize the ->call_remote fields in the rcu_state structures. */ 2297static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2455static void __init rcu_init_nocb(void)
2456{ 2298{
2457#ifdef CONFIG_PREEMPT_RCU
2458 rcu_preempt_state.call_remote = call_rcu_preempt_remote;
2459#endif /* #ifdef CONFIG_PREEMPT_RCU */
2460 rcu_bh_state.call_remote = call_rcu_bh_remote;
2461 rcu_sched_state.call_remote = call_rcu_sched_remote;
2462} 2299}
2463 2300
2464#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2301static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2302{
2303}
2304
2305static void rcu_init_one_nocb(struct rcu_node *rnp)
2306{
2307}
2465 2308
2466static bool is_nocb_cpu(int cpu) 2309static bool is_nocb_cpu(int cpu)
2467{ 2310{
@@ -2480,11 +2323,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2480 return 0; 2323 return 0;
2481} 2324}
2482 2325
2483static bool nocb_cpu_expendable(int cpu)
2484{
2485 return 1;
2486}
2487
2488static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2326static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2489{ 2327{
2490} 2328}
@@ -2493,12 +2331,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2493{ 2331{
2494} 2332}
2495 2333
2496static void init_nocb_callback_list(struct rcu_data *rdp) 2334static bool init_nocb_callback_list(struct rcu_data *rdp)
2497{
2498}
2499
2500static void __init rcu_init_nocb(void)
2501{ 2335{
2336 return false;
2502} 2337}
2503 2338
2504#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 2339#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0d095dcaa670..49099e81c87b 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,8 +46,6 @@
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "rcutree.h"
48 48
49#define ulong2long(a) (*(long *)(&(a)))
50
51static int r_open(struct inode *inode, struct file *file, 49static int r_open(struct inode *inode, struct file *file,
52 const struct seq_operations *op) 50 const struct seq_operations *op)
53{ 51{