-rw-r--r--  Documentation/RCU/checklist.txt  6
-rw-r--r--  Documentation/RCU/rcu.txt  10
-rw-r--r--  Documentation/RCU/stallwarn.txt  16
-rw-r--r--  Documentation/RCU/torture.txt  13
-rw-r--r--  Documentation/RCU/trace.txt  4
-rw-r--r--  Documentation/RCU/whatisRCU.txt  19
-rw-r--r--  Documentation/atomic_ops.txt  87
-rw-r--r--  Documentation/lockdep-design.txt  63
-rw-r--r--  arch/arm/kernel/process.c  6
-rw-r--r--  arch/avr32/kernel/process.c  6
-rw-r--r--  arch/blackfin/kernel/process.c  6
-rw-r--r--  arch/microblaze/kernel/process.c  6
-rw-r--r--  arch/mips/kernel/process.c  6
-rw-r--r--  arch/openrisc/kernel/idle.c  6
-rw-r--r--  arch/powerpc/kernel/idle.c  15
-rw-r--r--  arch/powerpc/platforms/iseries/setup.c  12
-rw-r--r--  arch/powerpc/platforms/pseries/lpar.c  4
-rw-r--r--  arch/s390/kernel/process.c  6
-rw-r--r--  arch/sh/kernel/idle.c  6
-rw-r--r--  arch/sparc/kernel/process_64.c  6
-rw-r--r--  arch/sparc/kernel/setup_32.c  2
-rw-r--r--  arch/tile/kernel/process.c  6
-rw-r--r--  arch/tile/mm/fault.c  4
-rw-r--r--  arch/um/kernel/process.c  6
-rw-r--r--  arch/unicore32/kernel/process.c  6
-rw-r--r--  arch/x86/kernel/apic/apic.c  6
-rw-r--r--  arch/x86/kernel/apic/io_apic.c  2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c  2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/threshold.c  2
-rw-r--r--  arch/x86/kernel/irq.c  6
-rw-r--r--  arch/x86/kernel/process_32.c  6
-rw-r--r--  arch/x86/kernel/process_64.c  10
-rw-r--r--  drivers/base/cpu.c  7
-rw-r--r--  include/linux/cpu.h  1
-rw-r--r--  include/linux/hardirq.h  21
-rw-r--r--  include/linux/rcupdate.h  115
-rw-r--r--  include/linux/sched.h  8
-rw-r--r--  include/linux/srcu.h  87
-rw-r--r--  include/linux/tick.h  11
-rw-r--r--  include/trace/events/rcu.h  122
-rw-r--r--  init/Kconfig  10
-rw-r--r--  kernel/cpu.c  1
-rw-r--r--  kernel/debug/kdb/kdb_support.c  2
-rw-r--r--  kernel/events/core.c  2
-rw-r--r--  kernel/lockdep.c  22
-rw-r--r--  kernel/rcu.h  7
-rw-r--r--  kernel/rcupdate.c  12
-rw-r--r--  kernel/rcutiny.c  149
-rw-r--r--  kernel/rcutiny_plugin.h  29
-rw-r--r--  kernel/rcutorture.c  225
-rw-r--r--  kernel/rcutree.c  290
-rw-r--r--  kernel/rcutree.h  26
-rw-r--r--  kernel/rcutree_plugin.h  289
-rw-r--r--  kernel/rcutree_trace.c  12
-rw-r--r--  kernel/rtmutex.c  8
-rw-r--r--  kernel/softirq.c  4
-rw-r--r--  kernel/time/tick-sched.c  97
-rw-r--r--  kernel/trace/trace.c  1
58 files changed, 1512 insertions, 407 deletions
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 0c134f8afc6f..bff2d8be1e18 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -328,6 +328,12 @@ over a rather long period of time, but improvements are always welcome!
328 RCU rather than SRCU, because RCU is almost always faster and 328 RCU rather than SRCU, because RCU is almost always faster and
329 easier to use than is SRCU. 329 easier to use than is SRCU.
330 330
331 If you need to enter your read-side critical section in a
332 hardirq or exception handler, and then exit that same read-side
333 critical section in the task that was interrupted, then you need
 334 to use srcu_read_lock_raw() and srcu_read_unlock_raw(), which avoid
 335 the lockdep checking that would otherwise make this practice illegal.
336
331 Also unlike other forms of RCU, explicit initialization 337 Also unlike other forms of RCU, explicit initialization
332 and cleanup is required via init_srcu_struct() and 338 and cleanup is required via init_srcu_struct() and
333 cleanup_srcu_struct(). These are passed a "struct srcu_struct" 339 cleanup_srcu_struct(). These are passed a "struct srcu_struct"
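
To make the new checklist guidance concrete, a minimal sketch of the
pattern it describes follows, using the srcu_read_lock_raw() and
srcu_read_unlock_raw() primitives added by this series. The srcu_struct,
handler, and task-side function names are hypothetical, and the index is
passed through a single global only to keep the sketch short:

	#include <linux/interrupt.h>
	#include <linux/srcu.h>

	/* init_srcu_struct(&my_srcu) must run before first use. */
	static struct srcu_struct my_srcu;	/* hypothetical SRCU domain */
	static int my_srcu_idx;			/* index handed from irq to task */

	/* Hardirq handler: enter the SRCU read-side critical section. */
	static irqreturn_t my_irq_handler(int irq, void *dev_id)
	{
		my_srcu_idx = srcu_read_lock_raw(&my_srcu);
		/* ... flag work for the interrupted task ... */
		return IRQ_HANDLED;
	}

	/* Interrupted task: exit that same read-side critical section. */
	static void my_task_side(void)
	{
		/* ... access the SRCU-protected data ... */
		srcu_read_unlock_raw(&my_srcu, my_srcu_idx);
	}
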
diff --git a/Documentation/RCU/rcu.txt b/Documentation/RCU/rcu.txt
index 31852705b586..bf778332a28f 100644
--- a/Documentation/RCU/rcu.txt
+++ b/Documentation/RCU/rcu.txt
@@ -38,11 +38,11 @@ o How can the updater tell when a grace period has completed
38 38
39 Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the 39 Preemptible variants of RCU (CONFIG_TREE_PREEMPT_RCU) get the
40 same effect, but require that the readers manipulate CPU-local 40 same effect, but require that the readers manipulate CPU-local
41 counters. These counters allow limited types of blocking 41 counters. These counters allow limited types of blocking within
42 within RCU read-side critical sections. SRCU also uses 42 RCU read-side critical sections. SRCU also uses CPU-local
43 CPU-local counters, and permits general blocking within 43 counters, and permits general blocking within RCU read-side
44 RCU read-side critical sections. These two variants of 44 critical sections. These variants of RCU detect grace periods
45 RCU detect grace periods by sampling these counters. 45 by sampling these counters.
46 46
47o If I am running on a uniprocessor kernel, which can only do one 47o If I am running on a uniprocessor kernel, which can only do one
48 thing at a time, why should I wait for a grace period? 48 thing at a time, why should I wait for a grace period?
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 4e959208f736..083d88cbc089 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -101,6 +101,11 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
101 CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning 101 CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning
102 messages. 102 messages.
103 103
104o A hardware or software issue shuts off the scheduler-clock
105 interrupt on a CPU that is not in dyntick-idle mode. This
106 problem really has happened, and seems to be most likely to
107 result in RCU CPU stall warnings for CONFIG_NO_HZ=n kernels.
108
104o A bug in the RCU implementation. 109o A bug in the RCU implementation.
105 110
106o A hardware failure. This is quite unlikely, but has occurred 111o A hardware failure. This is quite unlikely, but has occurred
@@ -109,12 +114,11 @@ o A hardware failure. This is quite unlikely, but has occurred
109 This resulted in a series of RCU CPU stall warnings, eventually 114 This resulted in a series of RCU CPU stall warnings, eventually
 110 leading to the realization that the CPU had failed. 115 leading to the realization that the CPU had failed.
111 116
 112The RCU, RCU-sched, and RCU-bh implementations have CPU stall 117The RCU, RCU-sched, and RCU-bh implementations have CPU stall warnings.
113warning. SRCU does not have its own CPU stall warnings, but its 118SRCU does not have its own CPU stall warnings, but its calls to
114calls to synchronize_sched() will result in RCU-sched detecting 119synchronize_sched() will result in RCU-sched detecting RCU-sched-related
115RCU-sched-related CPU stalls. Please note that RCU only detects 120CPU stalls. Please note that RCU only detects CPU stalls when there is
116CPU stalls when there is a grace period in progress. No grace period, 121a grace period in progress. No grace period, no CPU stall warnings.
117no CPU stall warnings.
118 122
119To diagnose the cause of the stall, inspect the stack traces. 123To diagnose the cause of the stall, inspect the stack traces.
120The offending function will usually be near the top of the stack. 124The offending function will usually be near the top of the stack.
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 783d6c134d3f..d67068d0d2b9 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -61,11 +61,24 @@ nreaders This is the number of RCU reading threads supported.
61 To properly exercise RCU implementations with preemptible 61 To properly exercise RCU implementations with preemptible
62 read-side critical sections. 62 read-side critical sections.
63 63
64onoff_interval
65 The number of seconds between each attempt to execute a
66 randomly selected CPU-hotplug operation. Defaults to
67 zero, which disables CPU hotplugging. In HOTPLUG_CPU=n
68 kernels, rcutorture will silently refuse to do any
69 CPU-hotplug operations regardless of what value is
70 specified for onoff_interval.
71
64shuffle_interval 72shuffle_interval
65 The number of seconds to keep the test threads affinitied 73 The number of seconds to keep the test threads affinitied
66 to a particular subset of the CPUs, defaults to 3 seconds. 74 to a particular subset of the CPUs, defaults to 3 seconds.
67 Used in conjunction with test_no_idle_hz. 75 Used in conjunction with test_no_idle_hz.
68 76
77shutdown_secs The number of seconds to run the test before terminating
78 the test and powering off the system. The default is
79 zero, which disables test termination and system shutdown.
80 This capability is useful for automated testing.
81
69stat_interval The number of seconds between output of torture 82stat_interval The number of seconds between output of torture
70 statistics (via printk()). Regardless of the interval, 83 statistics (via printk()). Regardless of the interval,
71 statistics are printed when the module is unloaded. 84 statistics are printed when the module is unloaded.
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index aaf65f6c6cd7..49587abfc2f7 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -105,14 +105,10 @@ o "dt" is the current value of the dyntick counter that is incremented
105 or one greater than the interrupt-nesting depth otherwise. 105 or one greater than the interrupt-nesting depth otherwise.
106 The number after the second "/" is the NMI nesting depth. 106 The number after the second "/" is the NMI nesting depth.
107 107
108 This field is displayed only for CONFIG_NO_HZ kernels.
109
110o "df" is the number of times that some other CPU has forced a 108o "df" is the number of times that some other CPU has forced a
111 quiescent state on behalf of this CPU due to this CPU being in 109 quiescent state on behalf of this CPU due to this CPU being in
112 dynticks-idle state. 110 dynticks-idle state.
113 111
114 This field is displayed only for CONFIG_NO_HZ kernels.
115
116o "of" is the number of times that some other CPU has forced a 112o "of" is the number of times that some other CPU has forced a
117 quiescent state on behalf of this CPU due to this CPU being 113 quiescent state on behalf of this CPU due to this CPU being
118 offline. In a perfect world, this might never happen, but it 114 offline. In a perfect world, this might never happen, but it
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 6ef692667e2f..6bbe8dcdc3da 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -4,6 +4,7 @@ to start learning about RCU:
41. What is RCU, Fundamentally? http://lwn.net/Articles/262464/ 41. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
52. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/ 52. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
63. RCU part 3: the RCU API http://lwn.net/Articles/264090/ 63. RCU part 3: the RCU API http://lwn.net/Articles/264090/
74. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
7 8
8 9
9What is RCU? 10What is RCU?
@@ -834,6 +835,8 @@ SRCU: Critical sections Grace period Barrier
834 835
835 srcu_read_lock synchronize_srcu N/A 836 srcu_read_lock synchronize_srcu N/A
836 srcu_read_unlock synchronize_srcu_expedited 837 srcu_read_unlock synchronize_srcu_expedited
838 srcu_read_lock_raw
839 srcu_read_unlock_raw
837 srcu_dereference 840 srcu_dereference
838 841
839SRCU: Initialization/cleanup 842SRCU: Initialization/cleanup
@@ -855,27 +858,33 @@ list can be helpful:
855 858
856a. Will readers need to block? If so, you need SRCU. 859a. Will readers need to block? If so, you need SRCU.
857 860
858b. What about the -rt patchset? If readers would need to block 861b. Is it necessary to start a read-side critical section in a
862 hardirq handler or exception handler, and then to complete
863 this read-side critical section in the task that was
864 interrupted? If so, you need SRCU's srcu_read_lock_raw() and
865 srcu_read_unlock_raw() primitives.
866
867c. What about the -rt patchset? If readers would need to block
859 in an non-rt kernel, you need SRCU. If readers would block 868 in an non-rt kernel, you need SRCU. If readers would block
860 in a -rt kernel, but not in a non-rt kernel, SRCU is not 869 in a -rt kernel, but not in a non-rt kernel, SRCU is not
861 necessary. 870 necessary.
862 871
863c. Do you need to treat NMI handlers, hardirq handlers, 872d. Do you need to treat NMI handlers, hardirq handlers,
864 and code segments with preemption disabled (whether 873 and code segments with preemption disabled (whether
865 via preempt_disable(), local_irq_save(), local_bh_disable(), 874 via preempt_disable(), local_irq_save(), local_bh_disable(),
866 or some other mechanism) as if they were explicit RCU readers? 875 or some other mechanism) as if they were explicit RCU readers?
867 If so, you need RCU-sched. 876 If so, you need RCU-sched.
868 877
869d. Do you need RCU grace periods to complete even in the face 878e. Do you need RCU grace periods to complete even in the face
870 of softirq monopolization of one or more of the CPUs? For 879 of softirq monopolization of one or more of the CPUs? For
871 example, is your code subject to network-based denial-of-service 880 example, is your code subject to network-based denial-of-service
872 attacks? If so, you need RCU-bh. 881 attacks? If so, you need RCU-bh.
873 882
874e. Is your workload too update-intensive for normal use of 883f. Is your workload too update-intensive for normal use of
875 RCU, but inappropriate for other synchronization mechanisms? 884 RCU, but inappropriate for other synchronization mechanisms?
876 If so, consider SLAB_DESTROY_BY_RCU. But please be careful! 885 If so, consider SLAB_DESTROY_BY_RCU. But please be careful!
877 886
878f. Otherwise, use RCU. 887g. Otherwise, use RCU.
879 888
880Of course, this all assumes that you have determined that RCU is in fact 889Of course, this all assumes that you have determined that RCU is in fact
881the right tool for your job. 890the right tool for your job.
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index 3bd585b44927..27f2b21a9d5c 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -84,6 +84,93 @@ compiler optimizes the section accessing atomic_t variables.
84 84
85*** YOU HAVE BEEN WARNED! *** 85*** YOU HAVE BEEN WARNED! ***
86 86
87Properly aligned pointers, longs, ints, and chars (and unsigned
88equivalents) may be atomically loaded from and stored to in the same
89sense as described for atomic_read() and atomic_set(). The ACCESS_ONCE()
90macro should be used to prevent the compiler from using optimizations
91that might otherwise optimize accesses out of existence on the one hand,
92or that might create unsolicited accesses on the other.
93
 94For example, consider the following code:
95
96 while (a > 0)
97 do_something();
98
99If the compiler can prove that do_something() does not store to the
100variable a, then the compiler is within its rights transforming this to
101the following:
102
103 tmp = a;
 104 if (tmp > 0)
105 for (;;)
106 do_something();
107
108If you don't want the compiler to do this (and you probably don't), then
109you should use something like the following:
110
 111 while (ACCESS_ONCE(a) > 0)
112 do_something();
113
114Alternatively, you could place a barrier() call in the loop.
115
116For another example, consider the following code:
117
118 tmp_a = a;
119 do_something_with(tmp_a);
120 do_something_else_with(tmp_a);
121
122If the compiler can prove that do_something_with() does not store to the
123variable a, then the compiler is within its rights to manufacture an
124additional load as follows:
125
126 tmp_a = a;
127 do_something_with(tmp_a);
128 tmp_a = a;
129 do_something_else_with(tmp_a);
130
131This could fatally confuse your code if it expected the same value
132to be passed to do_something_with() and do_something_else_with().
133
134The compiler would be likely to manufacture this additional load if
135do_something_with() was an inline function that made very heavy use
136of registers: reloading from variable a could save a flush to the
137stack and later reload. To prevent the compiler from attacking your
138code in this manner, write the following:
139
140 tmp_a = ACCESS_ONCE(a);
141 do_something_with(tmp_a);
142 do_something_else_with(tmp_a);
143
144For a final example, consider the following code, assuming that the
145variable a is set at boot time before the second CPU is brought online
146and never changed later, so that memory barriers are not needed:
147
148 if (a)
149 b = 9;
150 else
151 b = 42;
152
153The compiler is within its rights to manufacture an additional store
154by transforming the above code into the following:
155
156 b = 42;
157 if (a)
158 b = 9;
159
160This could come as a fatal surprise to other code running concurrently
161that expected b to never have the value 42 if a was zero. To prevent
162the compiler from doing this, write something like:
163
164 if (a)
165 ACCESS_ONCE(b) = 9;
166 else
167 ACCESS_ONCE(b) = 42;
168
169Don't even -think- about doing this without proper use of memory barriers,
170locks, or atomic operations if variable a can change at runtime!
171
172*** WARNING: ACCESS_ONCE() DOES NOT IMPLY A BARRIER! ***
173
87Now, we move onto the atomic operation interfaces typically implemented with 174Now, we move onto the atomic operation interfaces typically implemented with
88the help of assembly code. 175the help of assembly code.
89 176
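
For reference, the ACCESS_ONCE() macro used in the examples added above is
defined (in include/linux/compiler.h) as a volatile cast, which is what
forbids the compiler from merging, duplicating, or inventing the marked
accesses. A condensed view, reusing the illustrative variables from the
text, is:

	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

	/* Exactly one load of "a" per loop iteration. */
	while (ACCESS_ONCE(a) > 0)
		do_something();

	/* Exactly one store to "b"; no speculative "b = 42" is emitted. */
	if (a)
		ACCESS_ONCE(b) = 9;
	else
		ACCESS_ONCE(b) = 42;
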
diff --git a/Documentation/lockdep-design.txt b/Documentation/lockdep-design.txt
index abf768c681e2..5dbc99c04f6e 100644
--- a/Documentation/lockdep-design.txt
+++ b/Documentation/lockdep-design.txt
@@ -221,3 +221,66 @@ when the chain is validated for the first time, is then put into a hash
221table, which hash-table can be checked in a lockfree manner. If the 221table, which hash-table can be checked in a lockfree manner. If the
222locking chain occurs again later on, the hash table tells us that we 222locking chain occurs again later on, the hash table tells us that we
223dont have to validate the chain again. 223dont have to validate the chain again.
224
225Troubleshooting:
226----------------
227
 228The validator tracks a maximum of MAX_LOCKDEP_KEYS lock classes.
229Exceeding this number will trigger the following lockdep warning:
230
231 (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
232
233By default, MAX_LOCKDEP_KEYS is currently set to 8191, and typical
234desktop systems have less than 1,000 lock classes, so this warning
235normally results from lock-class leakage or failure to properly
236initialize locks. These two problems are illustrated below:
237
2381. Repeated module loading and unloading while running the validator
239 will result in lock-class leakage. The issue here is that each
240 load of the module will create a new set of lock classes for
241 that module's locks, but module unloading does not remove old
242 classes (see below discussion of reuse of lock classes for why).
243 Therefore, if that module is loaded and unloaded repeatedly,
244 the number of lock classes will eventually reach the maximum.
245
2462. Using structures such as arrays that have large numbers of
247 locks that are not explicitly initialized. For example,
248 a hash table with 8192 buckets where each bucket has its own
249 spinlock_t will consume 8192 lock classes -unless- each spinlock
250 is explicitly initialized at runtime, for example, using the
251 run-time spin_lock_init() as opposed to compile-time initializers
252 such as __SPIN_LOCK_UNLOCKED(). Failure to properly initialize
253 the per-bucket spinlocks would guarantee lock-class overflow.
254 In contrast, a loop that called spin_lock_init() on each lock
255 would place all 8192 locks into a single lock class.
256
257 The moral of this story is that you should always explicitly
258 initialize your locks.
259
260One might argue that the validator should be modified to allow
261lock classes to be reused. However, if you are tempted to make this
262argument, first review the code and think through the changes that would
263be required, keeping in mind that the lock classes to be removed are
264likely to be linked into the lock-dependency graph. This turns out to
265be harder to do than to say.
266
267Of course, if you do run out of lock classes, the next thing to do is
268to find the offending lock classes. First, the following command gives
269you the number of lock classes currently in use along with the maximum:
270
271 grep "lock-classes" /proc/lockdep_stats
272
273This command produces the following output on a modest system:
274
275 lock-classes: 748 [max: 8191]
276
277If the number allocated (748 above) increases continually over time,
278then there is likely a leak. The following command can be used to
279identify the leaking lock classes:
280
281 grep "BD" /proc/lockdep
282
283Run the command and save the output, then compare against the output from
284a later run of this command to identify the leakers. This same output
285can also help you find situations where runtime lock initialization has
286been omitted.
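
A minimal sketch of the hash-table case from item 2 above, assuming an
illustrative 8192-bucket table (the structure and function names are made
up for the example): because spin_lock_init() associates every lock it
initializes with the single static lockdep key of its call site, the loop
below leaves all of the bucket locks in one lock class instead of
consuming 8192 classes.

	#include <linux/init.h>
	#include <linux/list.h>
	#include <linux/spinlock.h>

	#define MY_HASH_BUCKETS 8192		/* illustrative size */

	struct my_bucket {
		spinlock_t lock;
		struct hlist_head chain;
	};

	static struct my_bucket my_hash[MY_HASH_BUCKETS];

	static void __init my_hash_init(void)
	{
		int i;

		for (i = 0; i < MY_HASH_BUCKETS; i++) {
			/*
			 * Run-time initialization: all bucket locks share
			 * the lockdep key of this spin_lock_init() call
			 * site, so the table costs one lock class.
			 */
			spin_lock_init(&my_hash[i].lock);
			INIT_HLIST_HEAD(&my_hash[i].chain);
		}
	}
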
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 3d0c6fb74ae4..e8e8fe505df1 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -183,7 +183,8 @@ void cpu_idle(void)
183 183
184 /* endless idle loop with no priority at all */ 184 /* endless idle loop with no priority at all */
185 while (1) { 185 while (1) {
186 tick_nohz_stop_sched_tick(1); 186 tick_nohz_idle_enter();
187 rcu_idle_enter();
187 leds_event(led_idle_start); 188 leds_event(led_idle_start);
188 while (!need_resched()) { 189 while (!need_resched()) {
189#ifdef CONFIG_HOTPLUG_CPU 190#ifdef CONFIG_HOTPLUG_CPU
@@ -213,7 +214,8 @@ void cpu_idle(void)
213 } 214 }
214 } 215 }
215 leds_event(led_idle_end); 216 leds_event(led_idle_end);
216 tick_nohz_restart_sched_tick(); 217 rcu_idle_exit();
218 tick_nohz_idle_exit();
217 preempt_enable_no_resched(); 219 preempt_enable_no_resched();
218 schedule(); 220 schedule();
219 preempt_disable(); 221 preempt_disable();
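
The remaining architecture patches below repeat this same conversion.
Stripped of per-architecture detail, the change to the idle loop looks
roughly as follows (a schematic sketch only; arch_idle() stands in for
whatever the architecture actually does to idle the CPU):

	/* Old ordering: the nohz tick code implicitly covered RCU. */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched())
			arch_idle();
		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}

	/* New ordering: RCU is told explicitly that this CPU is idle. */
	while (1) {
		tick_nohz_idle_enter();
		rcu_idle_enter();	/* RCU ignores this CPU from here... */
		while (!need_resched())
			arch_idle();
		rcu_idle_exit();	/* ...until here */
		tick_nohz_idle_exit();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
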
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index ef5a2a08fcca..ea3395750324 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -34,10 +34,12 @@ void cpu_idle(void)
34{ 34{
35 /* endless idle loop with no priority at all */ 35 /* endless idle loop with no priority at all */
36 while (1) { 36 while (1) {
37 tick_nohz_stop_sched_tick(1); 37 tick_nohz_idle_enter();
38 rcu_idle_enter();
38 while (!need_resched()) 39 while (!need_resched())
39 cpu_idle_sleep(); 40 cpu_idle_sleep();
40 tick_nohz_restart_sched_tick(); 41 rcu_idle_exit();
42 tick_nohz_idle_exit();
41 preempt_enable_no_resched(); 43 preempt_enable_no_resched();
42 schedule(); 44 schedule();
43 preempt_disable(); 45 preempt_disable();
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index 6a80a9e9fc4a..8dd0416673cb 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -88,10 +88,12 @@ void cpu_idle(void)
88#endif 88#endif
89 if (!idle) 89 if (!idle)
90 idle = default_idle; 90 idle = default_idle;
91 tick_nohz_stop_sched_tick(1); 91 tick_nohz_idle_enter();
92 rcu_idle_enter();
92 while (!need_resched()) 93 while (!need_resched())
93 idle(); 94 idle();
94 tick_nohz_restart_sched_tick(); 95 rcu_idle_exit();
96 tick_nohz_idle_exit();
95 preempt_enable_no_resched(); 97 preempt_enable_no_resched();
96 schedule(); 98 schedule();
97 preempt_disable(); 99 preempt_disable();
diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c
index 95cc295976a7..7dcb5bfffb75 100644
--- a/arch/microblaze/kernel/process.c
+++ b/arch/microblaze/kernel/process.c
@@ -103,10 +103,12 @@ void cpu_idle(void)
103 if (!idle) 103 if (!idle)
104 idle = default_idle; 104 idle = default_idle;
105 105
106 tick_nohz_stop_sched_tick(1); 106 tick_nohz_idle_enter();
107 rcu_idle_enter();
107 while (!need_resched()) 108 while (!need_resched())
108 idle(); 109 idle();
109 tick_nohz_restart_sched_tick(); 110 rcu_idle_exit();
111 tick_nohz_idle_exit();
110 112
111 preempt_enable_no_resched(); 113 preempt_enable_no_resched();
112 schedule(); 114 schedule();
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index c47f96e453c0..7955409051c4 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -56,7 +56,8 @@ void __noreturn cpu_idle(void)
56 56
57 /* endless idle loop with no priority at all */ 57 /* endless idle loop with no priority at all */
58 while (1) { 58 while (1) {
59 tick_nohz_stop_sched_tick(1); 59 tick_nohz_idle_enter();
60 rcu_idle_enter();
60 while (!need_resched() && cpu_online(cpu)) { 61 while (!need_resched() && cpu_online(cpu)) {
61#ifdef CONFIG_MIPS_MT_SMTC 62#ifdef CONFIG_MIPS_MT_SMTC
62 extern void smtc_idle_loop_hook(void); 63 extern void smtc_idle_loop_hook(void);
@@ -77,7 +78,8 @@ void __noreturn cpu_idle(void)
77 system_state == SYSTEM_BOOTING)) 78 system_state == SYSTEM_BOOTING))
78 play_dead(); 79 play_dead();
79#endif 80#endif
80 tick_nohz_restart_sched_tick(); 81 rcu_idle_exit();
82 tick_nohz_idle_exit();
81 preempt_enable_no_resched(); 83 preempt_enable_no_resched();
82 schedule(); 84 schedule();
83 preempt_disable(); 85 preempt_disable();
diff --git a/arch/openrisc/kernel/idle.c b/arch/openrisc/kernel/idle.c
index d5bc5f813e89..e5fc78877830 100644
--- a/arch/openrisc/kernel/idle.c
+++ b/arch/openrisc/kernel/idle.c
@@ -51,7 +51,8 @@ void cpu_idle(void)
51 51
52 /* endless idle loop with no priority at all */ 52 /* endless idle loop with no priority at all */
53 while (1) { 53 while (1) {
54 tick_nohz_stop_sched_tick(1); 54 tick_nohz_idle_enter();
55 rcu_idle_enter();
55 56
56 while (!need_resched()) { 57 while (!need_resched()) {
57 check_pgt_cache(); 58 check_pgt_cache();
@@ -69,7 +70,8 @@ void cpu_idle(void)
69 set_thread_flag(TIF_POLLING_NRFLAG); 70 set_thread_flag(TIF_POLLING_NRFLAG);
70 } 71 }
71 72
72 tick_nohz_restart_sched_tick(); 73 rcu_idle_exit();
74 tick_nohz_idle_exit();
73 preempt_enable_no_resched(); 75 preempt_enable_no_resched();
74 schedule(); 76 schedule();
75 preempt_disable(); 77 preempt_disable();
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 39a2baa6ad58..9c3cd490b1bd 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -46,6 +46,12 @@ static int __init powersave_off(char *arg)
46} 46}
47__setup("powersave=off", powersave_off); 47__setup("powersave=off", powersave_off);
48 48
49#if defined(CONFIG_PPC_PSERIES) && defined(CONFIG_TRACEPOINTS)
50static const bool idle_uses_rcu = 1;
51#else
52static const bool idle_uses_rcu;
53#endif
54
49/* 55/*
50 * The body of the idle task. 56 * The body of the idle task.
51 */ 57 */
@@ -56,7 +62,10 @@ void cpu_idle(void)
56 62
57 set_thread_flag(TIF_POLLING_NRFLAG); 63 set_thread_flag(TIF_POLLING_NRFLAG);
58 while (1) { 64 while (1) {
59 tick_nohz_stop_sched_tick(1); 65 tick_nohz_idle_enter();
66 if (!idle_uses_rcu)
67 rcu_idle_enter();
68
60 while (!need_resched() && !cpu_should_die()) { 69 while (!need_resched() && !cpu_should_die()) {
61 ppc64_runlatch_off(); 70 ppc64_runlatch_off();
62 71
@@ -93,7 +102,9 @@ void cpu_idle(void)
93 102
94 HMT_medium(); 103 HMT_medium();
95 ppc64_runlatch_on(); 104 ppc64_runlatch_on();
96 tick_nohz_restart_sched_tick(); 105 if (!idle_uses_rcu)
106 rcu_idle_exit();
107 tick_nohz_idle_exit();
97 preempt_enable_no_resched(); 108 preempt_enable_no_resched();
98 if (cpu_should_die()) 109 if (cpu_should_die())
99 cpu_die(); 110 cpu_die();
diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c
index ea0acbd8966d..8fc62586a973 100644
--- a/arch/powerpc/platforms/iseries/setup.c
+++ b/arch/powerpc/platforms/iseries/setup.c
@@ -563,7 +563,8 @@ static void yield_shared_processor(void)
563static void iseries_shared_idle(void) 563static void iseries_shared_idle(void)
564{ 564{
565 while (1) { 565 while (1) {
566 tick_nohz_stop_sched_tick(1); 566 tick_nohz_idle_enter();
567 rcu_idle_enter();
567 while (!need_resched() && !hvlpevent_is_pending()) { 568 while (!need_resched() && !hvlpevent_is_pending()) {
568 local_irq_disable(); 569 local_irq_disable();
569 ppc64_runlatch_off(); 570 ppc64_runlatch_off();
@@ -577,7 +578,8 @@ static void iseries_shared_idle(void)
577 } 578 }
578 579
579 ppc64_runlatch_on(); 580 ppc64_runlatch_on();
580 tick_nohz_restart_sched_tick(); 581 rcu_idle_exit();
582 tick_nohz_idle_exit();
581 583
582 if (hvlpevent_is_pending()) 584 if (hvlpevent_is_pending())
583 process_iSeries_events(); 585 process_iSeries_events();
@@ -593,7 +595,8 @@ static void iseries_dedicated_idle(void)
593 set_thread_flag(TIF_POLLING_NRFLAG); 595 set_thread_flag(TIF_POLLING_NRFLAG);
594 596
595 while (1) { 597 while (1) {
596 tick_nohz_stop_sched_tick(1); 598 tick_nohz_idle_enter();
599 rcu_idle_enter();
597 if (!need_resched()) { 600 if (!need_resched()) {
598 while (!need_resched()) { 601 while (!need_resched()) {
599 ppc64_runlatch_off(); 602 ppc64_runlatch_off();
@@ -610,7 +613,8 @@ static void iseries_dedicated_idle(void)
610 } 613 }
611 614
612 ppc64_runlatch_on(); 615 ppc64_runlatch_on();
613 tick_nohz_restart_sched_tick(); 616 rcu_idle_exit();
617 tick_nohz_idle_exit();
614 preempt_enable_no_resched(); 618 preempt_enable_no_resched();
615 schedule(); 619 schedule();
616 preempt_disable(); 620 preempt_disable();
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 27a49508b410..52d429be6c76 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -555,6 +555,8 @@ void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
555 555
556 (*depth)++; 556 (*depth)++;
557 trace_hcall_entry(opcode, args); 557 trace_hcall_entry(opcode, args);
558 if (opcode == H_CEDE)
559 rcu_idle_enter();
558 (*depth)--; 560 (*depth)--;
559 561
560out: 562out:
@@ -575,6 +577,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
575 goto out; 577 goto out;
576 578
577 (*depth)++; 579 (*depth)++;
580 if (opcode == H_CEDE)
581 rcu_idle_exit();
578 trace_hcall_exit(opcode, retval, retbuf); 582 trace_hcall_exit(opcode, retval, retbuf);
579 (*depth)--; 583 (*depth)--;
580 584
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 9451b210a1b4..3201ae447990 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -91,10 +91,12 @@ static void default_idle(void)
91void cpu_idle(void) 91void cpu_idle(void)
92{ 92{
93 for (;;) { 93 for (;;) {
94 tick_nohz_stop_sched_tick(1); 94 tick_nohz_idle_enter();
95 rcu_idle_enter();
95 while (!need_resched()) 96 while (!need_resched())
96 default_idle(); 97 default_idle();
97 tick_nohz_restart_sched_tick(); 98 rcu_idle_exit();
99 tick_nohz_idle_exit();
98 preempt_enable_no_resched(); 100 preempt_enable_no_resched();
99 schedule(); 101 schedule();
100 preempt_disable(); 102 preempt_disable();
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index db4ecd731a00..406508d4ce74 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -89,7 +89,8 @@ void cpu_idle(void)
89 89
90 /* endless idle loop with no priority at all */ 90 /* endless idle loop with no priority at all */
91 while (1) { 91 while (1) {
92 tick_nohz_stop_sched_tick(1); 92 tick_nohz_idle_enter();
93 rcu_idle_enter();
93 94
94 while (!need_resched()) { 95 while (!need_resched()) {
95 check_pgt_cache(); 96 check_pgt_cache();
@@ -111,7 +112,8 @@ void cpu_idle(void)
111 start_critical_timings(); 112 start_critical_timings();
112 } 113 }
113 114
114 tick_nohz_restart_sched_tick(); 115 rcu_idle_exit();
116 tick_nohz_idle_exit();
115 preempt_enable_no_resched(); 117 preempt_enable_no_resched();
116 schedule(); 118 schedule();
117 preempt_disable(); 119 preempt_disable();
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 3739a06a76cb..39d8b05201a2 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -95,12 +95,14 @@ void cpu_idle(void)
95 set_thread_flag(TIF_POLLING_NRFLAG); 95 set_thread_flag(TIF_POLLING_NRFLAG);
96 96
97 while(1) { 97 while(1) {
98 tick_nohz_stop_sched_tick(1); 98 tick_nohz_idle_enter();
99 rcu_idle_enter();
99 100
100 while (!need_resched() && !cpu_is_offline(cpu)) 101 while (!need_resched() && !cpu_is_offline(cpu))
101 sparc64_yield(cpu); 102 sparc64_yield(cpu);
102 103
103 tick_nohz_restart_sched_tick(); 104 rcu_idle_exit();
105 tick_nohz_idle_exit();
104 106
105 preempt_enable_no_resched(); 107 preempt_enable_no_resched();
106 108
diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c
index fe1e3fc31bc5..ffb883ddd0f0 100644
--- a/arch/sparc/kernel/setup_32.c
+++ b/arch/sparc/kernel/setup_32.c
@@ -84,7 +84,7 @@ static void prom_sync_me(void)
84 84
85 prom_printf("PROM SYNC COMMAND...\n"); 85 prom_printf("PROM SYNC COMMAND...\n");
86 show_free_areas(0); 86 show_free_areas(0);
87 if(current->pid != 0) { 87 if (!is_idle_task(current)) {
88 local_irq_enable(); 88 local_irq_enable();
89 sys_sync(); 89 sys_sync();
90 local_irq_disable(); 90 local_irq_disable();
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 9c45d8bbdf57..4c1ac6e5347a 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -85,7 +85,8 @@ void cpu_idle(void)
85 85
86 /* endless idle loop with no priority at all */ 86 /* endless idle loop with no priority at all */
87 while (1) { 87 while (1) {
88 tick_nohz_stop_sched_tick(1); 88 tick_nohz_idle_enter();
89 rcu_idle_enter();
89 while (!need_resched()) { 90 while (!need_resched()) {
90 if (cpu_is_offline(cpu)) 91 if (cpu_is_offline(cpu))
91 BUG(); /* no HOTPLUG_CPU */ 92 BUG(); /* no HOTPLUG_CPU */
@@ -105,7 +106,8 @@ void cpu_idle(void)
105 local_irq_enable(); 106 local_irq_enable();
106 current_thread_info()->status |= TS_POLLING; 107 current_thread_info()->status |= TS_POLLING;
107 } 108 }
108 tick_nohz_restart_sched_tick(); 109 rcu_idle_exit();
110 tick_nohz_idle_exit();
109 preempt_enable_no_resched(); 111 preempt_enable_no_resched();
110 schedule(); 112 schedule();
111 preempt_disable(); 113 preempt_disable();
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 25b7b90fd620..c1eaaa1fcc20 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -54,7 +54,7 @@ static noinline void force_sig_info_fault(const char *type, int si_signo,
54 if (unlikely(tsk->pid < 2)) { 54 if (unlikely(tsk->pid < 2)) {
55 panic("Signal %d (code %d) at %#lx sent to %s!", 55 panic("Signal %d (code %d) at %#lx sent to %s!",
56 si_signo, si_code & 0xffff, address, 56 si_signo, si_code & 0xffff, address,
57 tsk->pid ? "init" : "the idle task"); 57 is_idle_task(tsk) ? "the idle task" : "init");
58 } 58 }
59 59
60 info.si_signo = si_signo; 60 info.si_signo = si_signo;
@@ -515,7 +515,7 @@ no_context:
515 515
516 if (unlikely(tsk->pid < 2)) { 516 if (unlikely(tsk->pid < 2)) {
517 panic("Kernel page fault running %s!", 517 panic("Kernel page fault running %s!",
518 tsk->pid ? "init" : "the idle task"); 518 is_idle_task(tsk) ? "the idle task" : "init");
519 } 519 }
520 520
521 /* 521 /*
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index c5338351aecd..69f24905abdc 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -246,10 +246,12 @@ void default_idle(void)
246 if (need_resched()) 246 if (need_resched())
247 schedule(); 247 schedule();
248 248
249 tick_nohz_stop_sched_tick(1); 249 tick_nohz_idle_enter();
250 rcu_idle_enter();
250 nsecs = disable_timer(); 251 nsecs = disable_timer();
251 idle_sleep(nsecs); 252 idle_sleep(nsecs);
252 tick_nohz_restart_sched_tick(); 253 rcu_idle_exit();
254 tick_nohz_idle_exit();
253 } 255 }
254} 256}
255 257
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index ba401df971ed..52edc2b62873 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -55,7 +55,8 @@ void cpu_idle(void)
55{ 55{
56 /* endless idle loop with no priority at all */ 56 /* endless idle loop with no priority at all */
57 while (1) { 57 while (1) {
58 tick_nohz_stop_sched_tick(1); 58 tick_nohz_idle_enter();
59 rcu_idle_enter();
59 while (!need_resched()) { 60 while (!need_resched()) {
60 local_irq_disable(); 61 local_irq_disable();
61 stop_critical_timings(); 62 stop_critical_timings();
@@ -63,7 +64,8 @@ void cpu_idle(void)
63 local_irq_enable(); 64 local_irq_enable();
64 start_critical_timings(); 65 start_critical_timings();
65 } 66 }
66 tick_nohz_restart_sched_tick(); 67 rcu_idle_exit();
68 tick_nohz_idle_exit();
67 preempt_enable_no_resched(); 69 preempt_enable_no_resched();
68 schedule(); 70 schedule();
69 preempt_disable(); 71 preempt_disable();
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f98d84caf94c..2cd2d93643dc 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -876,8 +876,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
876 * Besides, if we don't timer interrupts ignore the global 876 * Besides, if we don't timer interrupts ignore the global
877 * interrupt lock, which is the WrongThing (tm) to do. 877 * interrupt lock, which is the WrongThing (tm) to do.
878 */ 878 */
879 exit_idle();
880 irq_enter(); 879 irq_enter();
880 exit_idle();
881 local_apic_timer_interrupt(); 881 local_apic_timer_interrupt();
882 irq_exit(); 882 irq_exit();
883 883
@@ -1809,8 +1809,8 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1809{ 1809{
1810 u32 v; 1810 u32 v;
1811 1811
1812 exit_idle();
1813 irq_enter(); 1812 irq_enter();
1813 exit_idle();
1814 /* 1814 /*
1815 * Check if this really is a spurious interrupt and ACK it 1815 * Check if this really is a spurious interrupt and ACK it
1816 * if it is a vectored one. Just in case... 1816 * if it is a vectored one. Just in case...
@@ -1846,8 +1846,8 @@ void smp_error_interrupt(struct pt_regs *regs)
1846 "Illegal register address", /* APIC Error Bit 7 */ 1846 "Illegal register address", /* APIC Error Bit 7 */
1847 }; 1847 };
1848 1848
1849 exit_idle();
1850 irq_enter(); 1849 irq_enter();
1850 exit_idle();
1851 /* First tickle the hardware, only then report what went on. -- REW */ 1851 /* First tickle the hardware, only then report what went on. -- REW */
1852 v0 = apic_read(APIC_ESR); 1852 v0 = apic_read(APIC_ESR);
1853 apic_write(APIC_ESR, 0); 1853 apic_write(APIC_ESR, 0);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 6d939d7847e2..898055585516 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2421,8 +2421,8 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2421 unsigned vector, me; 2421 unsigned vector, me;
2422 2422
2423 ack_APIC_irq(); 2423 ack_APIC_irq();
2424 exit_idle();
2425 irq_enter(); 2424 irq_enter();
2425 exit_idle();
2426 2426
2427 me = smp_processor_id(); 2427 me = smp_processor_id();
2428 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 2428 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 787e06c84ea6..ce215616d5b9 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -397,8 +397,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
397 397
398asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) 398asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
399{ 399{
400 exit_idle();
401 irq_enter(); 400 irq_enter();
401 exit_idle();
402 inc_irq_stat(irq_thermal_count); 402 inc_irq_stat(irq_thermal_count);
403 smp_thermal_vector(); 403 smp_thermal_vector();
404 irq_exit(); 404 irq_exit();
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index d746df2909c9..aa578cadb940 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19 19
20asmlinkage void smp_threshold_interrupt(void) 20asmlinkage void smp_threshold_interrupt(void)
21{ 21{
22 exit_idle();
23 irq_enter(); 22 irq_enter();
23 exit_idle();
24 inc_irq_stat(irq_threshold_count); 24 inc_irq_stat(irq_threshold_count);
25 mce_threshold_vector(); 25 mce_threshold_vector();
26 irq_exit(); 26 irq_exit();
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 429e0c92924e..5d31e5bdbf85 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -181,8 +181,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
181 unsigned vector = ~regs->orig_ax; 181 unsigned vector = ~regs->orig_ax;
182 unsigned irq; 182 unsigned irq;
183 183
184 exit_idle();
185 irq_enter(); 184 irq_enter();
185 exit_idle();
186 186
187 irq = __this_cpu_read(vector_irq[vector]); 187 irq = __this_cpu_read(vector_irq[vector]);
188 188
@@ -209,10 +209,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
209 209
210 ack_APIC_irq(); 210 ack_APIC_irq();
211 211
212 exit_idle();
213
214 irq_enter(); 212 irq_enter();
215 213
214 exit_idle();
215
216 inc_irq_stat(x86_platform_ipis); 216 inc_irq_stat(x86_platform_ipis);
217 217
218 if (x86_platform_ipi_callback) 218 if (x86_platform_ipi_callback)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 795b79f984c2..485204f58cda 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -99,7 +99,8 @@ void cpu_idle(void)
99 99
100 /* endless idle loop with no priority at all */ 100 /* endless idle loop with no priority at all */
101 while (1) { 101 while (1) {
102 tick_nohz_stop_sched_tick(1); 102 tick_nohz_idle_enter();
103 rcu_idle_enter();
103 while (!need_resched()) { 104 while (!need_resched()) {
104 105
105 check_pgt_cache(); 106 check_pgt_cache();
@@ -116,7 +117,8 @@ void cpu_idle(void)
116 pm_idle(); 117 pm_idle();
117 start_critical_timings(); 118 start_critical_timings();
118 } 119 }
119 tick_nohz_restart_sched_tick(); 120 rcu_idle_exit();
121 tick_nohz_idle_exit();
120 preempt_enable_no_resched(); 122 preempt_enable_no_resched();
121 schedule(); 123 schedule();
122 preempt_disable(); 124 preempt_disable();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3bd7e6eebf31..64e926c89a6f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -122,7 +122,7 @@ void cpu_idle(void)
122 122
123 /* endless idle loop with no priority at all */ 123 /* endless idle loop with no priority at all */
124 while (1) { 124 while (1) {
125 tick_nohz_stop_sched_tick(1); 125 tick_nohz_idle_enter();
126 while (!need_resched()) { 126 while (!need_resched()) {
127 127
128 rmb(); 128 rmb();
@@ -139,8 +139,14 @@ void cpu_idle(void)
139 enter_idle(); 139 enter_idle();
140 /* Don't trace irqs off for idle */ 140 /* Don't trace irqs off for idle */
141 stop_critical_timings(); 141 stop_critical_timings();
142
143 /* enter_idle() needs rcu for notifiers */
144 rcu_idle_enter();
145
142 if (cpuidle_idle_call()) 146 if (cpuidle_idle_call())
143 pm_idle(); 147 pm_idle();
148
149 rcu_idle_exit();
144 start_critical_timings(); 150 start_critical_timings();
145 151
146 /* In many cases the interrupt that ended idle 152 /* In many cases the interrupt that ended idle
@@ -149,7 +155,7 @@ void cpu_idle(void)
149 __exit_idle(); 155 __exit_idle();
150 } 156 }
151 157
152 tick_nohz_restart_sched_tick(); 158 tick_nohz_idle_exit();
153 preempt_enable_no_resched(); 159 preempt_enable_no_resched();
154 schedule(); 160 schedule();
155 preempt_disable(); 161 preempt_disable();
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 251acea3d359..3991502b21e5 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -247,6 +247,13 @@ struct sys_device *get_cpu_sysdev(unsigned cpu)
247} 247}
248EXPORT_SYMBOL_GPL(get_cpu_sysdev); 248EXPORT_SYMBOL_GPL(get_cpu_sysdev);
249 249
250bool cpu_is_hotpluggable(unsigned cpu)
251{
252 struct sys_device *dev = get_cpu_sysdev(cpu);
253 return dev && container_of(dev, struct cpu, sysdev)->hotpluggable;
254}
255EXPORT_SYMBOL_GPL(cpu_is_hotpluggable);
256
250int __init cpu_dev_init(void) 257int __init cpu_dev_init(void)
251{ 258{
252 int err; 259 int err;
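
The new cpu_is_hotpluggable() helper lets callers (presumably including
the CPU-hotplug torture testing added elsewhere in this series) skip CPUs
that the platform never allows to be offlined. A hypothetical caller,
assuming CONFIG_HOTPLUG_CPU and with locking and error reporting omitted,
might look like this:

	#include <linux/cpu.h>
	#include <linux/errno.h>

	/* Offline one CPU, but only if the platform permits it. */
	static int my_offline_one(unsigned int cpu)
	{
		if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))
			return -EINVAL;	/* already offline or not allowed */
		return cpu_down(cpu);
	}
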
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 6cb60fd2ea84..305c263021e7 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -27,6 +27,7 @@ struct cpu {
27 27
28extern int register_cpu(struct cpu *cpu, int num); 28extern int register_cpu(struct cpu *cpu, int num);
29extern struct sys_device *get_cpu_sysdev(unsigned cpu); 29extern struct sys_device *get_cpu_sysdev(unsigned cpu);
30extern bool cpu_is_hotpluggable(unsigned cpu);
30 31
31extern int cpu_add_sysdev_attr(struct sysdev_attribute *attr); 32extern int cpu_add_sysdev_attr(struct sysdev_attribute *attr);
32extern void cpu_remove_sysdev_attr(struct sysdev_attribute *attr); 33extern void cpu_remove_sysdev_attr(struct sysdev_attribute *attr);
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index f743883f769e..bb7f30971858 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -139,20 +139,7 @@ static inline void account_system_vtime(struct task_struct *tsk)
139extern void account_system_vtime(struct task_struct *tsk); 139extern void account_system_vtime(struct task_struct *tsk);
140#endif 140#endif
141 141
142#if defined(CONFIG_NO_HZ)
143#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) 142#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
144extern void rcu_enter_nohz(void);
145extern void rcu_exit_nohz(void);
146
147static inline void rcu_irq_enter(void)
148{
149 rcu_exit_nohz();
150}
151
152static inline void rcu_irq_exit(void)
153{
154 rcu_enter_nohz();
155}
156 143
157static inline void rcu_nmi_enter(void) 144static inline void rcu_nmi_enter(void)
158{ 145{
@@ -163,17 +150,9 @@ static inline void rcu_nmi_exit(void)
163} 150}
164 151
165#else 152#else
166extern void rcu_irq_enter(void);
167extern void rcu_irq_exit(void);
168extern void rcu_nmi_enter(void); 153extern void rcu_nmi_enter(void);
169extern void rcu_nmi_exit(void); 154extern void rcu_nmi_exit(void);
170#endif 155#endif
171#else
172# define rcu_irq_enter() do { } while (0)
173# define rcu_irq_exit() do { } while (0)
174# define rcu_nmi_enter() do { } while (0)
175# define rcu_nmi_exit() do { } while (0)
176#endif /* #if defined(CONFIG_NO_HZ) */
177 156
178/* 157/*
179 * It is safe to do non-atomic ops on ->hardirq_context, 158 * It is safe to do non-atomic ops on ->hardirq_context,
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 2cf4226ade7e..81c04f4348ec 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -51,6 +51,8 @@ extern int rcutorture_runnable; /* for sysctl */
51#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) 51#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
52extern void rcutorture_record_test_transition(void); 52extern void rcutorture_record_test_transition(void);
53extern void rcutorture_record_progress(unsigned long vernum); 53extern void rcutorture_record_progress(unsigned long vernum);
54extern void do_trace_rcu_torture_read(char *rcutorturename,
55 struct rcu_head *rhp);
54#else 56#else
55static inline void rcutorture_record_test_transition(void) 57static inline void rcutorture_record_test_transition(void)
56{ 58{
@@ -58,6 +60,12 @@ static inline void rcutorture_record_test_transition(void)
58static inline void rcutorture_record_progress(unsigned long vernum) 60static inline void rcutorture_record_progress(unsigned long vernum)
59{ 61{
60} 62}
63#ifdef CONFIG_RCU_TRACE
64extern void do_trace_rcu_torture_read(char *rcutorturename,
65 struct rcu_head *rhp);
66#else
67#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
68#endif
61#endif 69#endif
62 70
63#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) 71#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
@@ -177,23 +185,10 @@ extern void rcu_sched_qs(int cpu);
177extern void rcu_bh_qs(int cpu); 185extern void rcu_bh_qs(int cpu);
178extern void rcu_check_callbacks(int cpu, int user); 186extern void rcu_check_callbacks(int cpu, int user);
179struct notifier_block; 187struct notifier_block;
180 188extern void rcu_idle_enter(void);
181#ifdef CONFIG_NO_HZ 189extern void rcu_idle_exit(void);
182 190extern void rcu_irq_enter(void);
183extern void rcu_enter_nohz(void); 191extern void rcu_irq_exit(void);
184extern void rcu_exit_nohz(void);
185
186#else /* #ifdef CONFIG_NO_HZ */
187
188static inline void rcu_enter_nohz(void)
189{
190}
191
192static inline void rcu_exit_nohz(void)
193{
194}
195
196#endif /* #else #ifdef CONFIG_NO_HZ */
197 192
198/* 193/*
199 * Infrastructure to implement the synchronize_() primitives in 194 * Infrastructure to implement the synchronize_() primitives in
@@ -233,22 +228,30 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head)
233 228
234#ifdef CONFIG_DEBUG_LOCK_ALLOC 229#ifdef CONFIG_DEBUG_LOCK_ALLOC
235 230
236extern struct lockdep_map rcu_lock_map; 231#ifdef CONFIG_PROVE_RCU
237# define rcu_read_acquire() \ 232extern int rcu_is_cpu_idle(void);
238 lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) 233#else /* !CONFIG_PROVE_RCU */
239# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) 234static inline int rcu_is_cpu_idle(void)
235{
236 return 0;
237}
238#endif /* else !CONFIG_PROVE_RCU */
240 239
241extern struct lockdep_map rcu_bh_lock_map; 240static inline void rcu_lock_acquire(struct lockdep_map *map)
242# define rcu_read_acquire_bh() \ 241{
243 lock_acquire(&rcu_bh_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) 242 WARN_ON_ONCE(rcu_is_cpu_idle());
244# define rcu_read_release_bh() lock_release(&rcu_bh_lock_map, 1, _THIS_IP_) 243 lock_acquire(map, 0, 0, 2, 1, NULL, _THIS_IP_);
244}
245 245
246extern struct lockdep_map rcu_sched_lock_map; 246static inline void rcu_lock_release(struct lockdep_map *map)
247# define rcu_read_acquire_sched() \ 247{
248 lock_acquire(&rcu_sched_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) 248 WARN_ON_ONCE(rcu_is_cpu_idle());
249# define rcu_read_release_sched() \ 249 lock_release(map, 1, _THIS_IP_);
250 lock_release(&rcu_sched_lock_map, 1, _THIS_IP_) 250}
251 251
252extern struct lockdep_map rcu_lock_map;
253extern struct lockdep_map rcu_bh_lock_map;
254extern struct lockdep_map rcu_sched_lock_map;
252extern int debug_lockdep_rcu_enabled(void); 255extern int debug_lockdep_rcu_enabled(void);
253 256
254/** 257/**
@@ -262,11 +265,18 @@ extern int debug_lockdep_rcu_enabled(void);
262 * 265 *
263 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot 266 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
264 * and while lockdep is disabled. 267 * and while lockdep is disabled.
268 *
269 * Note that rcu_read_lock() and the matching rcu_read_unlock() must
270 * occur in the same context, for example, it is illegal to invoke
271 * rcu_read_unlock() in process context if the matching rcu_read_lock()
272 * was invoked from within an irq handler.
265 */ 273 */
266static inline int rcu_read_lock_held(void) 274static inline int rcu_read_lock_held(void)
267{ 275{
268 if (!debug_lockdep_rcu_enabled()) 276 if (!debug_lockdep_rcu_enabled())
269 return 1; 277 return 1;
278 if (rcu_is_cpu_idle())
279 return 0;
270 return lock_is_held(&rcu_lock_map); 280 return lock_is_held(&rcu_lock_map);
271} 281}
272 282
@@ -290,6 +300,19 @@ extern int rcu_read_lock_bh_held(void);
290 * 300 *
291 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot 301 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
292 * and while lockdep is disabled. 302 * and while lockdep is disabled.
303 *
304 * Note that if the CPU is in the idle loop from an RCU point of
305 * view (ie: that we are in the section between rcu_idle_enter() and
306 * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
307 * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs
308 * that are in such a section, considering these as in extended quiescent
309 * state, so such a CPU is effectively never in an RCU read-side critical
310 * section regardless of what RCU primitives it invokes. This state of
311 * affairs is required --- we need to keep an RCU-free window in idle
312 * where the CPU may possibly enter into low power mode. This way we can
313 * notice an extended quiescent state to other CPUs that started a grace
314 * period. Otherwise we would delay any grace period as long as we run in
315 * the idle task.
293 */ 316 */
294#ifdef CONFIG_PREEMPT_COUNT 317#ifdef CONFIG_PREEMPT_COUNT
295static inline int rcu_read_lock_sched_held(void) 318static inline int rcu_read_lock_sched_held(void)
@@ -298,6 +321,8 @@ static inline int rcu_read_lock_sched_held(void)
298 321
299 if (!debug_lockdep_rcu_enabled()) 322 if (!debug_lockdep_rcu_enabled())
300 return 1; 323 return 1;
324 if (rcu_is_cpu_idle())
325 return 0;
301 if (debug_locks) 326 if (debug_locks)
302 lockdep_opinion = lock_is_held(&rcu_sched_lock_map); 327 lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
303 return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); 328 return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
@@ -311,12 +336,8 @@ static inline int rcu_read_lock_sched_held(void)
311 336
312#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 337#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
313 338
314# define rcu_read_acquire() do { } while (0) 339# define rcu_lock_acquire(a) do { } while (0)
315# define rcu_read_release() do { } while (0) 340# define rcu_lock_release(a) do { } while (0)
316# define rcu_read_acquire_bh() do { } while (0)
317# define rcu_read_release_bh() do { } while (0)
318# define rcu_read_acquire_sched() do { } while (0)
319# define rcu_read_release_sched() do { } while (0)
320 341
321static inline int rcu_read_lock_held(void) 342static inline int rcu_read_lock_held(void)
322{ 343{
@@ -637,7 +658,7 @@ static inline void rcu_read_lock(void)
637{ 658{
638 __rcu_read_lock(); 659 __rcu_read_lock();
639 __acquire(RCU); 660 __acquire(RCU);
640 rcu_read_acquire(); 661 rcu_lock_acquire(&rcu_lock_map);
641} 662}
642 663
643/* 664/*
@@ -657,7 +678,7 @@ static inline void rcu_read_lock(void)
657 */ 678 */
658static inline void rcu_read_unlock(void) 679static inline void rcu_read_unlock(void)
659{ 680{
660 rcu_read_release(); 681 rcu_lock_release(&rcu_lock_map);
661 __release(RCU); 682 __release(RCU);
662 __rcu_read_unlock(); 683 __rcu_read_unlock();
663} 684}
@@ -673,12 +694,17 @@ static inline void rcu_read_unlock(void)
673 * critical sections in interrupt context can use just rcu_read_lock(), 694 * critical sections in interrupt context can use just rcu_read_lock(),
674 * though this should at least be commented to avoid confusing people 695 * though this should at least be commented to avoid confusing people
675 * reading the code. 696 * reading the code.
697 *
698 * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
699 * must occur in the same context, for example, it is illegal to invoke
700 * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh()
701 * was invoked from some other task.
676 */ 702 */
677static inline void rcu_read_lock_bh(void) 703static inline void rcu_read_lock_bh(void)
678{ 704{
679 local_bh_disable(); 705 local_bh_disable();
680 __acquire(RCU_BH); 706 __acquire(RCU_BH);
681 rcu_read_acquire_bh(); 707 rcu_lock_acquire(&rcu_bh_lock_map);
682} 708}
683 709
684/* 710/*
@@ -688,7 +714,7 @@ static inline void rcu_read_lock_bh(void)
688 */ 714 */
689static inline void rcu_read_unlock_bh(void) 715static inline void rcu_read_unlock_bh(void)
690{ 716{
691 rcu_read_release_bh(); 717 rcu_lock_release(&rcu_bh_lock_map);
692 __release(RCU_BH); 718 __release(RCU_BH);
693 local_bh_enable(); 719 local_bh_enable();
694} 720}
@@ -700,12 +726,17 @@ static inline void rcu_read_unlock_bh(void)
700 * are being done using call_rcu_sched() or synchronize_rcu_sched(). 726 * are being done using call_rcu_sched() or synchronize_rcu_sched().
701 * Read-side critical sections can also be introduced by anything that 727 * Read-side critical sections can also be introduced by anything that
702 * disables preemption, including local_irq_disable() and friends. 728 * disables preemption, including local_irq_disable() and friends.
729 *
730 * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
731 * must occur in the same context, for example, it is illegal to invoke
732 * rcu_read_unlock_sched() from process context if the matching
733 * rcu_read_lock_sched() was invoked from an NMI handler.
703 */ 734 */
704static inline void rcu_read_lock_sched(void) 735static inline void rcu_read_lock_sched(void)
705{ 736{
706 preempt_disable(); 737 preempt_disable();
707 __acquire(RCU_SCHED); 738 __acquire(RCU_SCHED);
708 rcu_read_acquire_sched(); 739 rcu_lock_acquire(&rcu_sched_lock_map);
709} 740}
710 741
711/* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ 742/* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */
@@ -722,7 +753,7 @@ static inline notrace void rcu_read_lock_sched_notrace(void)
722 */ 753 */
723static inline void rcu_read_unlock_sched(void) 754static inline void rcu_read_unlock_sched(void)
724{ 755{
725 rcu_read_release_sched(); 756 rcu_lock_release(&rcu_sched_lock_map);
726 __release(RCU_SCHED); 757 __release(RCU_SCHED);
727 preempt_enable(); 758 preempt_enable();
728} 759}
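
For illustration only (not part of this patch): the rcu_lock_acquire()/rcu_lock_release() helpers used above take the flavor's lockdep map as an argument. A hedged sketch of the CONFIG_DEBUG_LOCK_ALLOC side, assuming the same lock_acquire()/lock_release() argument pattern as the old srcu_read_acquire()/srcu_read_release() macros that the srcu.h hunk below removes:

#include <linux/lockdep.h>

/* Hedged sketch only; the real definitions are not shown in this hunk. */
static inline void rcu_lock_acquire(struct lockdep_map *map)
{
	lock_acquire(map, 0, 0, 2, 1, NULL, _THIS_IP_);	/* recursive-read acquire */
}

static inline void rcu_lock_release(struct lockdep_map *map)
{
	lock_release(map, 1, _THIS_IP_);
}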
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c4f3e9b9bc5..4a7e4d333a27 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2070,6 +2070,14 @@ extern int sched_setscheduler(struct task_struct *, int,
2070extern int sched_setscheduler_nocheck(struct task_struct *, int, 2070extern int sched_setscheduler_nocheck(struct task_struct *, int,
2071 const struct sched_param *); 2071 const struct sched_param *);
2072extern struct task_struct *idle_task(int cpu); 2072extern struct task_struct *idle_task(int cpu);
2073/**
2074 * is_idle_task - is the specified task an idle task?
2077 * @p: the task in question.
2076 */
2077static inline bool is_idle_task(struct task_struct *p)
2078{
2079 return p->pid == 0;
2080}
2073extern struct task_struct *curr_task(int cpu); 2081extern struct task_struct *curr_task(int cpu);
2074extern void set_curr_task(int cpu, struct task_struct *p); 2082extern void set_curr_task(int cpu, struct task_struct *p);
2075 2083
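
For illustration only (not part of this patch; the surrounding function is hypothetical): the new helper lets callers test for the idle task without open-coding the PID check.

/* Hypothetical caller: skip per-task accounting for the per-CPU idle task. */
static void account_tick(struct task_struct *p)
{
	if (is_idle_task(p))		/* true only for PID 0, the idle task */
		return;
	/* ... accounting for ordinary tasks would go here ... */
}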
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 58971e891f48..e1b005918bbb 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -28,6 +28,7 @@
28#define _LINUX_SRCU_H 28#define _LINUX_SRCU_H
29 29
30#include <linux/mutex.h> 30#include <linux/mutex.h>
31#include <linux/rcupdate.h>
31 32
32struct srcu_struct_array { 33struct srcu_struct_array {
33 int c[2]; 34 int c[2];
@@ -60,18 +61,10 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name,
60 __init_srcu_struct((sp), #sp, &__srcu_key); \ 61 __init_srcu_struct((sp), #sp, &__srcu_key); \
61}) 62})
62 63
63# define srcu_read_acquire(sp) \
64 lock_acquire(&(sp)->dep_map, 0, 0, 2, 1, NULL, _THIS_IP_)
65# define srcu_read_release(sp) \
66 lock_release(&(sp)->dep_map, 1, _THIS_IP_)
67
68#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 64#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
69 65
70int init_srcu_struct(struct srcu_struct *sp); 66int init_srcu_struct(struct srcu_struct *sp);
71 67
72# define srcu_read_acquire(sp) do { } while (0)
73# define srcu_read_release(sp) do { } while (0)
74
75#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 68#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
76 69
77void cleanup_srcu_struct(struct srcu_struct *sp); 70void cleanup_srcu_struct(struct srcu_struct *sp);
@@ -90,12 +83,32 @@ long srcu_batches_completed(struct srcu_struct *sp);
90 * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, 83 * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC,
91 * this assumes we are in an SRCU read-side critical section unless it can 84 * this assumes we are in an SRCU read-side critical section unless it can
92 * prove otherwise. 85 * prove otherwise.
86 *
87 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
88 * and while lockdep is disabled.
89 *
90 * Note that if the CPU is in the idle loop from an RCU point of view
 91 * (i.e., we are in the section between rcu_idle_enter() and
 92 * rcu_idle_exit()), then srcu_read_lock_held() returns false even if
 93 * the CPU did an srcu_read_lock(). The reason for this is that RCU
 94 * ignores CPUs that are in such a section, considering them to be in
 95 * an extended quiescent state, so such a CPU is effectively never in
 96 * an RCU read-side critical section regardless of what RCU primitives
 97 * it invokes. This state of affairs is required: we need to keep an
 98 * RCU-free window in idle where the CPU may possibly enter low-power
 99 * mode. This way, CPUs that have started a grace period can observe
100 * this CPU's extended quiescent state. Otherwise we would delay every
101 * grace period for as long as we run in the idle task.
93 */ 102 */
94static inline int srcu_read_lock_held(struct srcu_struct *sp) 103static inline int srcu_read_lock_held(struct srcu_struct *sp)
95{ 104{
96 if (debug_locks) 105 if (rcu_is_cpu_idle())
97 return lock_is_held(&sp->dep_map); 106 return 0;
98 return 1; 107
108 if (!debug_lockdep_rcu_enabled())
109 return 1;
110
111 return lock_is_held(&sp->dep_map);
99} 112}
100 113
101#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 114#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
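
For illustration only (not part of this patch; struct my_data and the function are hypothetical names): srcu_read_lock_held() is typically consumed as the lockdep condition guarding an SRCU-protected dereference, in the same way the rcutorture read-side code further below uses srcu_read_lock_held(&srcu_ctl) as part of its lockdep condition.

struct my_data;				/* hypothetical SRCU-protected type */

static struct my_data *get_my_data(struct my_data __rcu **slot,
				   struct srcu_struct *sp)
{
	/* Complain via lockdep-RCU unless inside srcu_read_lock(sp). */
	return rcu_dereference_check(*slot, srcu_read_lock_held(sp));
}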
@@ -145,12 +158,17 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp)
145 * one way to indirectly wait on an SRCU grace period is to acquire 158 * one way to indirectly wait on an SRCU grace period is to acquire
146 * a mutex that is held elsewhere while calling synchronize_srcu() or 159 * a mutex that is held elsewhere while calling synchronize_srcu() or
147 * synchronize_srcu_expedited(). 160 * synchronize_srcu_expedited().
161 *
162 * Note that srcu_read_lock() and the matching srcu_read_unlock() must
 163 * occur in the same context; for example, it is illegal to invoke
164 * srcu_read_unlock() in an irq handler if the matching srcu_read_lock()
165 * was invoked in process context.
148 */ 166 */
149static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) 167static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
150{ 168{
151 int retval = __srcu_read_lock(sp); 169 int retval = __srcu_read_lock(sp);
152 170
153 srcu_read_acquire(sp); 171 rcu_lock_acquire(&(sp)->dep_map);
154 return retval; 172 return retval;
155} 173}
156 174
@@ -164,8 +182,51 @@ static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
164static inline void srcu_read_unlock(struct srcu_struct *sp, int idx) 182static inline void srcu_read_unlock(struct srcu_struct *sp, int idx)
165 __releases(sp) 183 __releases(sp)
166{ 184{
167 srcu_read_release(sp); 185 rcu_lock_release(&(sp)->dep_map);
186 __srcu_read_unlock(sp, idx);
187}
188
189/**
190 * srcu_read_lock_raw - register a new reader for an SRCU-protected structure.
191 * @sp: srcu_struct in which to register the new reader.
192 *
193 * Enter an SRCU read-side critical section. Similar to srcu_read_lock(),
194 * but avoids the RCU-lockdep checking. This means that it is legal to
195 * use srcu_read_lock_raw() in one context, for example, in an exception
196 * handler, and then have the matching srcu_read_unlock_raw() in another
197 * context, for example in the task that took the exception.
198 *
199 * However, the entire SRCU read-side critical section must reside within a
200 * single task. For example, beware of using srcu_read_lock_raw() in
 201 * a device interrupt handler and the matching srcu_read_unlock_raw() in
 202 * the interrupted task: this will not work if interrupts are threaded.
203 */
204static inline int srcu_read_lock_raw(struct srcu_struct *sp)
205{
206 unsigned long flags;
207 int ret;
208
209 local_irq_save(flags);
210 ret = __srcu_read_lock(sp);
211 local_irq_restore(flags);
212 return ret;
213}
214
215/**
216 * srcu_read_unlock_raw - unregister reader from an SRCU-protected structure.
217 * @sp: srcu_struct in which to unregister the old reader.
218 * @idx: return value from corresponding srcu_read_lock_raw().
219 *
220 * Exit an SRCU read-side critical section without lockdep-RCU checking.
221 * See srcu_read_lock_raw() for more details.
222 */
223static inline void srcu_read_unlock_raw(struct srcu_struct *sp, int idx)
224{
225 unsigned long flags;
226
227 local_irq_save(flags);
168 __srcu_read_unlock(sp, idx); 228 __srcu_read_unlock(sp, idx);
229 local_irq_restore(flags);
169} 230}
170 231
171#endif 232#endif
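
For illustration only (not part of this patch; my_srcu and both functions are hypothetical, and the index hand-off is simplified): the asymmetric entry/exit that the _raw variants permit looks like this, with both ends running in the same task.

static struct srcu_struct my_srcu;	/* init_srcu_struct(&my_srcu) during init */
static int my_srcu_idx;			/* handed from the handler to the task */

/* Runs in a hardirq or exception handler. */
static void my_exception_entry(void)
{
	my_srcu_idx = srcu_read_lock_raw(&my_srcu);	/* no lockdep checking */
	/* ... flag work for the interrupted task ... */
}

/* Runs later in the interrupted task (same task, different context). */
static void my_exception_followup(void)
{
	/* ... access the SRCU-protected data ... */
	srcu_read_unlock_raw(&my_srcu, my_srcu_idx);
}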
diff --git a/include/linux/tick.h b/include/linux/tick.h
index b232ccc0ee29..ab8be90b5cc9 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -7,6 +7,7 @@
7#define _LINUX_TICK_H 7#define _LINUX_TICK_H
8 8
9#include <linux/clockchips.h> 9#include <linux/clockchips.h>
10#include <linux/irqflags.h>
10 11
11#ifdef CONFIG_GENERIC_CLOCKEVENTS 12#ifdef CONFIG_GENERIC_CLOCKEVENTS
12 13
@@ -121,14 +122,16 @@ static inline int tick_oneshot_mode_active(void) { return 0; }
121#endif /* !CONFIG_GENERIC_CLOCKEVENTS */ 122#endif /* !CONFIG_GENERIC_CLOCKEVENTS */
122 123
123# ifdef CONFIG_NO_HZ 124# ifdef CONFIG_NO_HZ
124extern void tick_nohz_stop_sched_tick(int inidle); 125extern void tick_nohz_idle_enter(void);
125extern void tick_nohz_restart_sched_tick(void); 126extern void tick_nohz_idle_exit(void);
127extern void tick_nohz_irq_exit(void);
126extern ktime_t tick_nohz_get_sleep_length(void); 128extern ktime_t tick_nohz_get_sleep_length(void);
127extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); 129extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
128extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); 130extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
129# else 131# else
130static inline void tick_nohz_stop_sched_tick(int inidle) { } 132static inline void tick_nohz_idle_enter(void) { }
131static inline void tick_nohz_restart_sched_tick(void) { } 133static inline void tick_nohz_idle_exit(void) { }
134
132static inline ktime_t tick_nohz_get_sleep_length(void) 135static inline ktime_t tick_nohz_get_sleep_length(void)
133{ 136{
134 ktime_t len = { .tv64 = NSEC_PER_SEC/HZ }; 137 ktime_t len = { .tv64 = NSEC_PER_SEC/HZ };
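
For illustration only (not part of this patch): a minimal sketch of how an architecture idle loop is expected to use the renamed hooks. The exact pairing with rcu_idle_enter()/rcu_idle_exit() and the cpu_relax() stand-in for the arch low-power wait are assumptions, not taken from this hunk.

void cpu_idle(void)
{
	for (;;) {
		tick_nohz_idle_enter();		/* was tick_nohz_stop_sched_tick(1) */
		rcu_idle_enter();		/* assumed: mark this CPU RCU-idle */
		while (!need_resched())
			cpu_relax();		/* stand-in for the arch low-power wait */
		rcu_idle_exit();
		tick_nohz_idle_exit();		/* was tick_nohz_restart_sched_tick() */
		schedule();
	}
}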
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 669fbd62ec25..d2d88bed891b 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -241,24 +241,73 @@ TRACE_EVENT(rcu_fqs,
241 241
242/* 242/*
243 * Tracepoint for dyntick-idle entry/exit events. These take a string 243 * Tracepoint for dyntick-idle entry/exit events. These take a string
244 * as argument: "Start" for entering dyntick-idle mode and "End" for 244 * as argument: "Start" for entering dyntick-idle mode, "End" for
245 * leaving it. 245 * leaving it, "--=" for events moving towards idle, and "++=" for events
246 * moving away from idle. "Error on entry: not idle task" and "Error on
247 * exit: not idle task" indicate that a non-idle task is erroneously
248 * toying with the idle loop.
249 *
250 * These events also take a pair of numbers, which indicate the nesting
251 * depth before and after the event of interest. Note that task-related
252 * events use the upper bits of each number, while interrupt-related
253 * events use the lower bits.
246 */ 254 */
247TRACE_EVENT(rcu_dyntick, 255TRACE_EVENT(rcu_dyntick,
248 256
249 TP_PROTO(char *polarity), 257 TP_PROTO(char *polarity, long long oldnesting, long long newnesting),
250 258
251 TP_ARGS(polarity), 259 TP_ARGS(polarity, oldnesting, newnesting),
252 260
253 TP_STRUCT__entry( 261 TP_STRUCT__entry(
254 __field(char *, polarity) 262 __field(char *, polarity)
263 __field(long long, oldnesting)
264 __field(long long, newnesting)
255 ), 265 ),
256 266
257 TP_fast_assign( 267 TP_fast_assign(
258 __entry->polarity = polarity; 268 __entry->polarity = polarity;
269 __entry->oldnesting = oldnesting;
270 __entry->newnesting = newnesting;
271 ),
272
273 TP_printk("%s %llx %llx", __entry->polarity,
274 __entry->oldnesting, __entry->newnesting)
275);
276
277/*
278 * Tracepoint for RCU preparation for idle, the goal being to get RCU
279 * processing done so that the current CPU can shut off its scheduling
280 * clock and enter dyntick-idle mode. One way to accomplish this is
281 * to drain all RCU callbacks from this CPU, and the other is to have
282 * done everything RCU requires for the current grace period. In this
283 * latter case, the CPU will be awakened at the end of the current grace
284 * period in order to process the remainder of its callbacks.
285 *
286 * These tracepoints take a string as argument:
287 *
288 * "No callbacks": Nothing to do, no callbacks on this CPU.
289 * "In holdoff": Nothing to do, holding off after unsuccessful attempt.
290 * "Begin holdoff": Attempt failed, don't retry until next jiffy.
291 * "Dyntick with callbacks": Entering dyntick-idle despite callbacks.
292 * "More callbacks": Still more callbacks, try again to clear them out.
293 * "Callbacks drained": All callbacks processed, off to dyntick idle!
294 * "Timer": Timer fired to cause CPU to continue processing callbacks.
295 */
296TRACE_EVENT(rcu_prep_idle,
297
298 TP_PROTO(char *reason),
299
300 TP_ARGS(reason),
301
302 TP_STRUCT__entry(
303 __field(char *, reason)
304 ),
305
306 TP_fast_assign(
307 __entry->reason = reason;
259 ), 308 ),
260 309
261 TP_printk("%s", __entry->polarity) 310 TP_printk("%s", __entry->reason)
262); 311);
263 312
264/* 313/*
@@ -412,27 +461,71 @@ TRACE_EVENT(rcu_invoke_kfree_callback,
412 461
413/* 462/*
414 * Tracepoint for exiting rcu_do_batch after RCU callbacks have been 463 * Tracepoint for exiting rcu_do_batch after RCU callbacks have been
415 * invoked. The first argument is the name of the RCU flavor and 464 * invoked. The first argument is the name of the RCU flavor,
416 * the second argument is number of callbacks actually invoked. 465 * the second argument is number of callbacks actually invoked,
466 * the third argument (cb) is whether or not any of the callbacks that
467 * were ready to invoke at the beginning of this batch are still
468 * queued, the fourth argument (nr) is the return value of need_resched(),
469 * the fifth argument (iit) is 1 if the current task is the idle task,
470 * and the sixth argument (risk) is the return value from
471 * rcu_is_callbacks_kthread().
417 */ 472 */
418TRACE_EVENT(rcu_batch_end, 473TRACE_EVENT(rcu_batch_end,
419 474
420 TP_PROTO(char *rcuname, int callbacks_invoked), 475 TP_PROTO(char *rcuname, int callbacks_invoked,
476 bool cb, bool nr, bool iit, bool risk),
421 477
422 TP_ARGS(rcuname, callbacks_invoked), 478 TP_ARGS(rcuname, callbacks_invoked, cb, nr, iit, risk),
423 479
424 TP_STRUCT__entry( 480 TP_STRUCT__entry(
425 __field(char *, rcuname) 481 __field(char *, rcuname)
426 __field(int, callbacks_invoked) 482 __field(int, callbacks_invoked)
483 __field(bool, cb)
484 __field(bool, nr)
485 __field(bool, iit)
486 __field(bool, risk)
427 ), 487 ),
428 488
429 TP_fast_assign( 489 TP_fast_assign(
430 __entry->rcuname = rcuname; 490 __entry->rcuname = rcuname;
431 __entry->callbacks_invoked = callbacks_invoked; 491 __entry->callbacks_invoked = callbacks_invoked;
492 __entry->cb = cb;
493 __entry->nr = nr;
494 __entry->iit = iit;
495 __entry->risk = risk;
496 ),
497
498 TP_printk("%s CBs-invoked=%d idle=%c%c%c%c",
499 __entry->rcuname, __entry->callbacks_invoked,
500 __entry->cb ? 'C' : '.',
501 __entry->nr ? 'S' : '.',
502 __entry->iit ? 'I' : '.',
503 __entry->risk ? 'R' : '.')
504);
505
506/*
507 * Tracepoint for rcutorture readers. The first argument is the name
508 * of the RCU flavor from rcutorture's viewpoint and the second argument
509 * is the callback address.
510 */
511TRACE_EVENT(rcu_torture_read,
512
513 TP_PROTO(char *rcutorturename, struct rcu_head *rhp),
514
515 TP_ARGS(rcutorturename, rhp),
516
517 TP_STRUCT__entry(
518 __field(char *, rcutorturename)
519 __field(struct rcu_head *, rhp)
520 ),
521
522 TP_fast_assign(
523 __entry->rcutorturename = rcutorturename;
524 __entry->rhp = rhp;
432 ), 525 ),
433 526
434 TP_printk("%s CBs-invoked=%d", 527 TP_printk("%s torture read %p",
435 __entry->rcuname, __entry->callbacks_invoked) 528 __entry->rcutorturename, __entry->rhp)
436); 529);
437 530
438#else /* #ifdef CONFIG_RCU_TRACE */ 531#else /* #ifdef CONFIG_RCU_TRACE */
@@ -443,13 +536,16 @@ TRACE_EVENT(rcu_batch_end,
443#define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) 536#define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0)
444#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) 537#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0)
445#define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) 538#define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0)
446#define trace_rcu_dyntick(polarity) do { } while (0) 539#define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0)
540#define trace_rcu_prep_idle(reason) do { } while (0)
447#define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) 541#define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0)
448#define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) 542#define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0)
449#define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) 543#define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0)
450#define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0) 544#define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0)
451#define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0) 545#define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0)
452#define trace_rcu_batch_end(rcuname, callbacks_invoked) do { } while (0) 546#define trace_rcu_batch_end(rcuname, callbacks_invoked, cb, nr, iit, risk) \
547 do { } while (0)
548#define trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
453 549
454#endif /* #else #ifdef CONFIG_RCU_TRACE */ 550#endif /* #else #ifdef CONFIG_RCU_TRACE */
455 551
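
For illustration only (not part of this patch): a stand-alone helper that mirrors the TP_printk() encoding of the new rcu_batch_end flags above, showing how its "idle=" column decodes.

#include <linux/types.h>

/* Mirrors the rcu_batch_end TP_printk() flag encoding. */
static void rcu_batch_end_flags(char buf[5], bool cb, bool nr, bool iit, bool risk)
{
	buf[0] = cb   ? 'C' : '.';	/* ready callbacks still queued */
	buf[1] = nr   ? 'S' : '.';	/* need_resched() was set */
	buf[2] = iit  ? 'I' : '.';	/* running in the idle task */
	buf[3] = risk ? 'R' : '.';	/* running in the RCU callbacks kthread */
	buf[4] = '\0';
}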
diff --git a/init/Kconfig b/init/Kconfig
index 43298f9810fb..82b6a4c675b2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -469,14 +469,14 @@ config RCU_FANOUT_EXACT
469 469
470config RCU_FAST_NO_HZ 470config RCU_FAST_NO_HZ
471 bool "Accelerate last non-dyntick-idle CPU's grace periods" 471 bool "Accelerate last non-dyntick-idle CPU's grace periods"
472 depends on TREE_RCU && NO_HZ && SMP 472 depends on NO_HZ && SMP
473 default n 473 default n
474 help 474 help
475 This option causes RCU to attempt to accelerate grace periods 475 This option causes RCU to attempt to accelerate grace periods
476 in order to allow the final CPU to enter dynticks-idle state 476 in order to allow CPUs to enter dynticks-idle state more
477 more quickly. On the other hand, this option increases the 477 quickly. On the other hand, this option increases the overhead
478 overhead of the dynticks-idle checking, particularly on systems 478 of the dynticks-idle checking, particularly on systems with
479 with large numbers of CPUs. 479 large numbers of CPUs.
480 480
481 Say Y if energy efficiency is critically important, particularly 481 Say Y if energy efficiency is critically important, particularly
482 if you have relatively few CPUs. 482 if you have relatively few CPUs.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 563f13609470..9d448ddb2247 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -380,6 +380,7 @@ out:
380 cpu_maps_update_done(); 380 cpu_maps_update_done();
381 return err; 381 return err;
382} 382}
383EXPORT_SYMBOL_GPL(cpu_up);
383 384
384#ifdef CONFIG_PM_SLEEP_SMP 385#ifdef CONFIG_PM_SLEEP_SMP
385static cpumask_var_t frozen_cpus; 386static cpumask_var_t frozen_cpus;
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 5532dd37aa86..7d6fb40d2188 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p)
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' : 636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' : 637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; 638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) { 639 if (is_idle_task(p)) {
640 /* Idle task. Is it really idle, apart from the kdb 640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */ 641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { 642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 58690af323e4..fc0e7ff11dda 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5366,7 +5366,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5366 regs = get_irq_regs(); 5366 regs = get_irq_regs();
5367 5367
5368 if (regs && !perf_exclude_event(event, regs)) { 5368 if (regs && !perf_exclude_event(event, regs)) {
5369 if (!(event->attr.exclude_idle && current->pid == 0)) 5369 if (!(event->attr.exclude_idle && is_idle_task(current)))
5370 if (perf_event_overflow(event, &data, regs)) 5370 if (perf_event_overflow(event, &data, regs))
5371 ret = HRTIMER_NORESTART; 5371 ret = HRTIMER_NORESTART;
5372 } 5372 }
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e69d633d6aa6..8fb755132322 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4181,6 +4181,28 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4181 printk("%s:%d %s!\n", file, line, s); 4181 printk("%s:%d %s!\n", file, line, s);
4182 printk("\nother info that might help us debug this:\n\n"); 4182 printk("\nother info that might help us debug this:\n\n");
4183 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4183 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
4184
4185 /*
4186	 * If a CPU is in the RCU-free window in idle (i.e., in the section
4187	 * between rcu_idle_enter() and rcu_idle_exit()), then RCU
4188 * considers that CPU to be in an "extended quiescent state",
4189 * which means that RCU will be completely ignoring that CPU.
4190 * Therefore, rcu_read_lock() and friends have absolutely no
4191 * effect on a CPU running in that state. In other words, even if
4192 * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
4193 * delete data structures out from under it. RCU really has no
4194 * choice here: we need to keep an RCU-free window in idle where
4195	 * the CPU may possibly enter into low-power mode. This way, CPUs
4196	 * that have started a grace period can observe this CPU's extended
4197	 * quiescent state. Otherwise we would delay every grace period for
4198	 * as long as we run in the idle task.
4199 *
4200 * So complain bitterly if someone does call rcu_read_lock(),
4201 * rcu_read_lock_bh() and so on from extended quiescent states.
4202 */
4203 if (rcu_is_cpu_idle())
4204 printk("RCU used illegally from extended quiescent state!\n");
4205
4184 lockdep_print_held_locks(curr); 4206 lockdep_print_held_locks(curr);
4185 printk("\nstack backtrace:\n"); 4207 printk("\nstack backtrace:\n");
4186 dump_stack(); 4208 dump_stack();
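
For illustration only (not part of this patch; everything except the RCU primitives is hypothetical): the kind of bug that the new diagnostic catches is an RCU read-side critical section opened inside the rcu_idle_enter()/rcu_idle_exit() window.

struct my_obj;					/* hypothetical */
static struct my_obj __rcu *my_obj_p;		/* hypothetical RCU-protected pointer */

static void buggy_idle_hook(void)
{
	struct my_obj *p;

	rcu_idle_enter();		/* RCU now treats this CPU as idle ... */
	rcu_read_lock();		/* ... so this read-side section is ignored */
	p = rcu_dereference(my_obj_p);	/* PROVE_RCU is expected to complain:
					   "RCU used illegally from extended
					   quiescent state!" */
	(void)p;
	rcu_read_unlock();
	rcu_idle_exit();
}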
diff --git a/kernel/rcu.h b/kernel/rcu.h
index f600868d550d..aa88baab5f78 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -30,6 +30,13 @@
30#endif /* #else #ifdef CONFIG_RCU_TRACE */ 30#endif /* #else #ifdef CONFIG_RCU_TRACE */
31 31
32/* 32/*
33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from
35 * process context.
36 */
37#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)
38
39/*
33 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 40 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
34 * by call_rcu() and rcu callback execution, and are therefore not part of the 41 * by call_rcu() and rcu callback execution, and are therefore not part of the
35 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. 42 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c5b98e565aee..2bc4e135ff23 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -93,6 +93,8 @@ int rcu_read_lock_bh_held(void)
93{ 93{
94 if (!debug_lockdep_rcu_enabled()) 94 if (!debug_lockdep_rcu_enabled())
95 return 1; 95 return 1;
96 if (rcu_is_cpu_idle())
97 return 0;
96 return in_softirq() || irqs_disabled(); 98 return in_softirq() || irqs_disabled();
97} 99}
98EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 100EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
@@ -316,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = {
316}; 318};
317EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 319EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
318#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 320#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
321
322#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
323void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
324{
325 trace_rcu_torture_read(rcutorturename, rhp);
326}
327EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
328#else
329#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
330#endif
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 636af6d9c6e5..977296dca0a4 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -53,31 +53,137 @@ static void __call_rcu(struct rcu_head *head,
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56#ifdef CONFIG_NO_HZ 56static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
57 57
58static long rcu_dynticks_nesting = 1; 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval)
60{
61 if (rcu_dynticks_nesting) {
62 RCU_TRACE(trace_rcu_dyntick("--=",
63 oldval, rcu_dynticks_nesting));
64 return;
65 }
66 RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting));
67 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id());
69
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
71 oldval, rcu_dynticks_nesting));
72 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
74 current->pid, current->comm,
75 idle->pid, idle->comm); /* must be idle task! */
76 }
77 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
78}
59 79
60/* 80/*
61 * Enter dynticks-idle mode, which is an extended quiescent state 81 * Enter idle, which is an extended quiescent state if we have fully
62 * if we have fully entered that mode (i.e., if the new value of 82 * entered that mode (i.e., if the new value of dynticks_nesting is zero).
63 * dynticks_nesting is zero).
64 */ 83 */
65void rcu_enter_nohz(void) 84void rcu_idle_enter(void)
66{ 85{
67 if (--rcu_dynticks_nesting == 0) 86 unsigned long flags;
68 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ 87 long long oldval;
88
89 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting;
91 rcu_dynticks_nesting = 0;
92 rcu_idle_enter_common(oldval);
93 local_irq_restore(flags);
69} 94}
70 95
71/* 96/*
72 * Exit dynticks-idle mode, so that we are no longer in an extended 97 * Exit an interrupt handler towards idle.
73 * quiescent state.
74 */ 98 */
75void rcu_exit_nohz(void) 99void rcu_irq_exit(void)
100{
101 unsigned long flags;
102 long long oldval;
103
104 local_irq_save(flags);
105 oldval = rcu_dynticks_nesting;
106 rcu_dynticks_nesting--;
107 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
108 rcu_idle_enter_common(oldval);
109 local_irq_restore(flags);
110}
111
112/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
113static void rcu_idle_exit_common(long long oldval)
76{ 114{
115 if (oldval) {
116 RCU_TRACE(trace_rcu_dyntick("++=",
117 oldval, rcu_dynticks_nesting));
118 return;
119 }
120 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
121 if (!is_idle_task(current)) {
122 struct task_struct *idle = idle_task(smp_processor_id());
123
124 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
125 oldval, rcu_dynticks_nesting));
126 ftrace_dump(DUMP_ALL);
127 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
128 current->pid, current->comm,
129 idle->pid, idle->comm); /* must be idle task! */
130 }
131}
132
133/*
134 * Exit idle, so that we are no longer in an extended quiescent state.
135 */
136void rcu_idle_exit(void)
137{
138 unsigned long flags;
139 long long oldval;
140
141 local_irq_save(flags);
142 oldval = rcu_dynticks_nesting;
143 WARN_ON_ONCE(oldval != 0);
144 rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
145 rcu_idle_exit_common(oldval);
146 local_irq_restore(flags);
147}
148
149/*
150 * Enter an interrupt handler, moving away from idle.
151 */
152void rcu_irq_enter(void)
153{
154 unsigned long flags;
155 long long oldval;
156
157 local_irq_save(flags);
158 oldval = rcu_dynticks_nesting;
77 rcu_dynticks_nesting++; 159 rcu_dynticks_nesting++;
160 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
161 rcu_idle_exit_common(oldval);
162 local_irq_restore(flags);
163}
164
165#ifdef CONFIG_PROVE_RCU
166
167/*
168 * Test whether RCU thinks that the current CPU is idle.
169 */
170int rcu_is_cpu_idle(void)
171{
172 return !rcu_dynticks_nesting;
78} 173}
174EXPORT_SYMBOL(rcu_is_cpu_idle);
175
176#endif /* #ifdef CONFIG_PROVE_RCU */
79 177
80#endif /* #ifdef CONFIG_NO_HZ */ 178/*
179 * Test whether the current CPU was interrupted from idle. Nested
 180 * interrupts don't count; we must be running at the first interrupt
181 * level.
182 */
183int rcu_is_cpu_rrupt_from_idle(void)
184{
185 return rcu_dynticks_nesting <= 0;
186}
81 187
82/* 188/*
83 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 189 * Helper function for rcu_sched_qs() and rcu_bh_qs().
@@ -126,14 +232,13 @@ void rcu_bh_qs(int cpu)
126 232
127/* 233/*
128 * Check to see if the scheduling-clock interrupt came from an extended 234 * Check to see if the scheduling-clock interrupt came from an extended
129 * quiescent state, and, if so, tell RCU about it. 235 * quiescent state, and, if so, tell RCU about it. This function must
236 * be called from hardirq context. It is normally called from the
237 * scheduling-clock interrupt.
130 */ 238 */
131void rcu_check_callbacks(int cpu, int user) 239void rcu_check_callbacks(int cpu, int user)
132{ 240{
133 if (user || 241 if (user || rcu_is_cpu_rrupt_from_idle())
134 (idle_cpu(cpu) &&
135 !in_softirq() &&
136 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
137 rcu_sched_qs(cpu); 242 rcu_sched_qs(cpu);
138 else if (!in_softirq()) 243 else if (!in_softirq())
139 rcu_bh_qs(cpu); 244 rcu_bh_qs(cpu);
@@ -154,7 +259,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
154 /* If no RCU callbacks ready to invoke, just return. */ 259 /* If no RCU callbacks ready to invoke, just return. */
155 if (&rcp->rcucblist == rcp->donetail) { 260 if (&rcp->rcucblist == rcp->donetail) {
156 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 261 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
157 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); 262 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
263 ACCESS_ONCE(rcp->rcucblist),
264 need_resched(),
265 is_idle_task(current),
266 rcu_is_callbacks_kthread()));
158 return; 267 return;
159 } 268 }
160 269
@@ -183,7 +292,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
183 RCU_TRACE(cb_count++); 292 RCU_TRACE(cb_count++);
184 } 293 }
185 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 294 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); 295 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
296 is_idle_task(current),
297 rcu_is_callbacks_kthread()));
187} 298}
188 299
189static void rcu_process_callbacks(struct softirq_action *unused) 300static void rcu_process_callbacks(struct softirq_action *unused)
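
For illustration only (not part of this patch): a stand-alone user-space model of the dynticks_nesting bookkeeping implemented above. Tasks run with the large DYNTICK_TASK_NESTING bias, rcu_idle_enter()/rcu_idle_exit() crowbar the count to zero and back, interrupts nest by one, and a count of zero means the CPU is RCU-idle.

#include <assert.h>
#include <limits.h>
#include <stdio.h>

#define DYNTICK_TASK_NESTING	(LLONG_MAX / 2 - 1)	/* as in kernel/rcu.h above */

static long long nesting = DYNTICK_TASK_NESTING;	/* models rcu_dynticks_nesting */

static void model_idle_enter(void) { nesting = 0; }
static void model_idle_exit(void)  { nesting = DYNTICK_TASK_NESTING; }
static void model_irq_enter(void)  { nesting++; }
static void model_irq_exit(void)   { nesting--; }
static int  model_rcu_idle(void)   { return nesting == 0; }	/* rcu_is_cpu_idle() */

int main(void)
{
	assert(!model_rcu_idle());	/* running in a task: never RCU-idle */
	model_idle_enter();		/* idle loop: extended quiescent state */
	assert(model_rcu_idle());
	model_irq_enter();		/* interrupt taken from idle */
	assert(!model_rcu_idle());
	model_irq_exit();		/* back to idle: extended QS again */
	assert(model_rcu_idle());
	model_idle_exit();		/* leaving idle, back to task level */
	assert(!model_rcu_idle());
	printf("dynticks_nesting model OK\n");
	return 0;
}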
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 2b0484a5dc28..9cb1ae4aabdd 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -312,8 +312,8 @@ static int rcu_boost(void)
312 rt_mutex_lock(&mtx); 312 rt_mutex_lock(&mtx);
313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
314 314
315 return rcu_preempt_ctrlblk.boost_tasks != NULL || 315 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
316 rcu_preempt_ctrlblk.exp_tasks != NULL; 316 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
317} 317}
318 318
319/* 319/*
@@ -885,6 +885,19 @@ static void invoke_rcu_callbacks(void)
885 wake_up(&rcu_kthread_wq); 885 wake_up(&rcu_kthread_wq);
886} 886}
887 887
888#ifdef CONFIG_RCU_TRACE
889
890/*
891 * Is the current CPU running the RCU-callbacks kthread?
892 * Caller must have preemption disabled.
893 */
894static bool rcu_is_callbacks_kthread(void)
895{
896 return rcu_kthread_task == current;
897}
898
899#endif /* #ifdef CONFIG_RCU_TRACE */
900
888/* 901/*
889 * This kthread invokes RCU callbacks whose grace periods have 902 * This kthread invokes RCU callbacks whose grace periods have
890 * elapsed. It is awakened as needed, and takes the place of the 903 * elapsed. It is awakened as needed, and takes the place of the
@@ -938,6 +951,18 @@ void invoke_rcu_callbacks(void)
938 raise_softirq(RCU_SOFTIRQ); 951 raise_softirq(RCU_SOFTIRQ);
939} 952}
940 953
954#ifdef CONFIG_RCU_TRACE
955
956/*
957 * There is no callback kthread, so this thread is never it.
958 */
959static bool rcu_is_callbacks_kthread(void)
960{
961 return false;
962}
963
964#endif /* #ifdef CONFIG_RCU_TRACE */
965
941void rcu_init(void) 966void rcu_init(void)
942{ 967{
943 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 968 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 764825c2685c..88f17b8a3b1d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,9 +61,11 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
67static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 69static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
68static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 70static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
69static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 71static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444);
91MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 93MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
92module_param(fqs_stutter, int, 0444); 94module_param(fqs_stutter, int, 0444);
93MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 95MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
96module_param(onoff_interval, int, 0444);
97MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
98module_param(shutdown_secs, int, 0444);
99MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
94module_param(test_boost, int, 0444); 100module_param(test_boost, int, 0444);
95MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 101MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
96module_param(test_boost_interval, int, 0444); 102module_param(test_boost_interval, int, 0444);
@@ -119,6 +125,10 @@ static struct task_struct *shuffler_task;
119static struct task_struct *stutter_task; 125static struct task_struct *stutter_task;
120static struct task_struct *fqs_task; 126static struct task_struct *fqs_task;
121static struct task_struct *boost_tasks[NR_CPUS]; 127static struct task_struct *boost_tasks[NR_CPUS];
128static struct task_struct *shutdown_task;
129#ifdef CONFIG_HOTPLUG_CPU
130static struct task_struct *onoff_task;
131#endif /* #ifdef CONFIG_HOTPLUG_CPU */
122 132
123#define RCU_TORTURE_PIPE_LEN 10 133#define RCU_TORTURE_PIPE_LEN 10
124 134
@@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_failure; 159static long n_rcu_torture_boost_failure;
150static long n_rcu_torture_boosts; 160static long n_rcu_torture_boosts;
151static long n_rcu_torture_timers; 161static long n_rcu_torture_timers;
162static long n_offline_attempts;
163static long n_offline_successes;
164static long n_online_attempts;
165static long n_online_successes;
152static struct list_head rcu_torture_removed; 166static struct list_head rcu_torture_removed;
153static cpumask_var_t shuffle_tmp_mask; 167static cpumask_var_t shuffle_tmp_mask;
154 168
@@ -160,6 +174,8 @@ static int stutter_pause_test;
160#define RCUTORTURE_RUNNABLE_INIT 0 174#define RCUTORTURE_RUNNABLE_INIT 0
161#endif 175#endif
162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 176int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
177module_param(rcutorture_runnable, int, 0444);
178MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
163 179
164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 180#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165#define rcu_can_boost() 1 181#define rcu_can_boost() 1
@@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
167#define rcu_can_boost() 0 183#define rcu_can_boost() 0
168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 184#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169 185
186static unsigned long shutdown_time; /* jiffies to system shutdown. */
170static unsigned long boost_starttime; /* jiffies of next boost test start. */ 187static unsigned long boost_starttime; /* jiffies of next boost test start. */
171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 188DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
172 /* and boost task create/destroy. */ 189 /* and boost task create/destroy. */
@@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD;
182 */ 199 */
183static DEFINE_MUTEX(fullstop_mutex); 200static DEFINE_MUTEX(fullstop_mutex);
184 201
202/* Forward reference. */
203static void rcu_torture_cleanup(void);
204
185/* 205/*
186 * Detect and respond to a system shutdown. 206 * Detect and respond to a system shutdown.
187 */ 207 */
@@ -612,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = {
612 .name = "srcu" 632 .name = "srcu"
613}; 633};
614 634
635static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
636{
637 return srcu_read_lock_raw(&srcu_ctl);
638}
639
640static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
641{
642 srcu_read_unlock_raw(&srcu_ctl, idx);
643}
644
645static struct rcu_torture_ops srcu_raw_ops = {
646 .init = srcu_torture_init,
647 .cleanup = srcu_torture_cleanup,
648 .readlock = srcu_torture_read_lock_raw,
649 .read_delay = srcu_read_delay,
650 .readunlock = srcu_torture_read_unlock_raw,
651 .completed = srcu_torture_completed,
652 .deferred_free = rcu_sync_torture_deferred_free,
653 .sync = srcu_torture_synchronize,
654 .cb_barrier = NULL,
655 .stats = srcu_torture_stats,
656 .name = "srcu_raw"
657};
658
615static void srcu_torture_synchronize_expedited(void) 659static void srcu_torture_synchronize_expedited(void)
616{ 660{
617 synchronize_srcu_expedited(&srcu_ctl); 661 synchronize_srcu_expedited(&srcu_ctl);
@@ -913,6 +957,18 @@ rcu_torture_fakewriter(void *arg)
913 return 0; 957 return 0;
914} 958}
915 959
960void rcutorture_trace_dump(void)
961{
962 static atomic_t beenhere = ATOMIC_INIT(0);
963
964 if (atomic_read(&beenhere))
965 return;
966 if (atomic_xchg(&beenhere, 1) != 0)
967 return;
968 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
969 ftrace_dump(DUMP_ALL);
970}
971
916/* 972/*
917 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 973 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
918 * incrementing the corresponding element of the pipeline array. The 974 * incrementing the corresponding element of the pipeline array. The
@@ -934,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused)
934 rcu_read_lock_bh_held() || 990 rcu_read_lock_bh_held() ||
935 rcu_read_lock_sched_held() || 991 rcu_read_lock_sched_held() ||
936 srcu_read_lock_held(&srcu_ctl)); 992 srcu_read_lock_held(&srcu_ctl));
993 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
937 if (p == NULL) { 994 if (p == NULL) {
938 /* Leave because rcu_torture_writer is not yet underway */ 995 /* Leave because rcu_torture_writer is not yet underway */
939 cur_ops->readunlock(idx); 996 cur_ops->readunlock(idx);
@@ -951,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused)
951 /* Should not happen, but... */ 1008 /* Should not happen, but... */
952 pipe_count = RCU_TORTURE_PIPE_LEN; 1009 pipe_count = RCU_TORTURE_PIPE_LEN;
953 } 1010 }
1011 if (pipe_count > 1)
1012 rcutorture_trace_dump();
954 __this_cpu_inc(rcu_torture_count[pipe_count]); 1013 __this_cpu_inc(rcu_torture_count[pipe_count]);
955 completed = cur_ops->completed() - completed; 1014 completed = cur_ops->completed() - completed;
956 if (completed > RCU_TORTURE_PIPE_LEN) { 1015 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -994,6 +1053,7 @@ rcu_torture_reader(void *arg)
994 rcu_read_lock_bh_held() || 1053 rcu_read_lock_bh_held() ||
995 rcu_read_lock_sched_held() || 1054 rcu_read_lock_sched_held() ||
996 srcu_read_lock_held(&srcu_ctl)); 1055 srcu_read_lock_held(&srcu_ctl));
1056 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
997 if (p == NULL) { 1057 if (p == NULL) {
998 /* Wait for rcu_torture_writer to get underway */ 1058 /* Wait for rcu_torture_writer to get underway */
999 cur_ops->readunlock(idx); 1059 cur_ops->readunlock(idx);
@@ -1009,6 +1069,8 @@ rcu_torture_reader(void *arg)
1009 /* Should not happen, but... */ 1069 /* Should not happen, but... */
1010 pipe_count = RCU_TORTURE_PIPE_LEN; 1070 pipe_count = RCU_TORTURE_PIPE_LEN;
1011 } 1071 }
1072 if (pipe_count > 1)
1073 rcutorture_trace_dump();
1012 __this_cpu_inc(rcu_torture_count[pipe_count]); 1074 __this_cpu_inc(rcu_torture_count[pipe_count]);
1013 completed = cur_ops->completed() - completed; 1075 completed = cur_ops->completed() - completed;
1014 if (completed > RCU_TORTURE_PIPE_LEN) { 1076 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1056,7 +1118,8 @@ rcu_torture_printk(char *page)
1056 cnt += sprintf(&page[cnt], 1118 cnt += sprintf(&page[cnt],
1057 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1119 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1058 "rtmbe: %d rtbke: %ld rtbre: %ld " 1120 "rtmbe: %d rtbke: %ld rtbre: %ld "
1059 "rtbf: %ld rtb: %ld nt: %ld", 1121 "rtbf: %ld rtb: %ld nt: %ld "
1122 "onoff: %ld/%ld:%ld/%ld",
1060 rcu_torture_current, 1123 rcu_torture_current,
1061 rcu_torture_current_version, 1124 rcu_torture_current_version,
1062 list_empty(&rcu_torture_freelist), 1125 list_empty(&rcu_torture_freelist),
@@ -1068,7 +1131,11 @@ rcu_torture_printk(char *page)
1068 n_rcu_torture_boost_rterror, 1131 n_rcu_torture_boost_rterror,
1069 n_rcu_torture_boost_failure, 1132 n_rcu_torture_boost_failure,
1070 n_rcu_torture_boosts, 1133 n_rcu_torture_boosts,
1071 n_rcu_torture_timers); 1134 n_rcu_torture_timers,
1135 n_online_successes,
1136 n_online_attempts,
1137 n_offline_successes,
1138 n_offline_attempts);
1072 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1139 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1073 n_rcu_torture_boost_ktrerror != 0 || 1140 n_rcu_torture_boost_ktrerror != 0 ||
1074 n_rcu_torture_boost_rterror != 0 || 1141 n_rcu_torture_boost_rterror != 0 ||
@@ -1232,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1232 "shuffle_interval=%d stutter=%d irqreader=%d " 1299 "shuffle_interval=%d stutter=%d irqreader=%d "
1233 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1300 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1234 "test_boost=%d/%d test_boost_interval=%d " 1301 "test_boost=%d/%d test_boost_interval=%d "
1235 "test_boost_duration=%d\n", 1302 "test_boost_duration=%d shutdown_secs=%d "
1303 "onoff_interval=%d\n",
1236 torture_type, tag, nrealreaders, nfakewriters, 1304 torture_type, tag, nrealreaders, nfakewriters,
1237 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1305 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1238 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1306 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1239 test_boost, cur_ops->can_boost, 1307 test_boost, cur_ops->can_boost,
1240 test_boost_interval, test_boost_duration); 1308 test_boost_interval, test_boost_duration, shutdown_secs,
1309 onoff_interval);
1241} 1310}
1242 1311
1243static struct notifier_block rcutorture_shutdown_nb = { 1312static struct notifier_block rcutorture_shutdown_nb = {
@@ -1287,6 +1356,131 @@ static int rcutorture_booster_init(int cpu)
1287 return 0; 1356 return 0;
1288} 1357}
1289 1358
1359/*
 1360 * Cause the rcutorture test to shut down the system after the test has
1361 * run for the time specified by the shutdown_secs module parameter.
1362 */
1363static int
1364rcu_torture_shutdown(void *arg)
1365{
1366 long delta;
1367 unsigned long jiffies_snap;
1368
1369 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
1370 jiffies_snap = ACCESS_ONCE(jiffies);
1371 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
1372 !kthread_should_stop()) {
1373 delta = shutdown_time - jiffies_snap;
1374 if (verbose)
1375 printk(KERN_ALERT "%s" TORTURE_FLAG
1376 "rcu_torture_shutdown task: %lu "
1377 "jiffies remaining\n",
1378 torture_type, delta);
1379 schedule_timeout_interruptible(delta);
1380 jiffies_snap = ACCESS_ONCE(jiffies);
1381 }
1382 if (kthread_should_stop()) {
1383 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
1384 return 0;
1385 }
1386
1387 /* OK, shut down the system. */
1388
1389 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
1390 shutdown_task = NULL; /* Avoid self-kill deadlock. */
1391 rcu_torture_cleanup(); /* Get the success/failure message. */
1392 kernel_power_off(); /* Shut down the system. */
1393 return 0;
1394}
1395
1396#ifdef CONFIG_HOTPLUG_CPU
1397
1398/*
1399 * Execute random CPU-hotplug operations at the interval specified
1400 * by the onoff_interval.
1401 */
1402static int
1403rcu_torture_onoff(void *arg)
1404{
1405 int cpu;
1406 int maxcpu = -1;
1407 DEFINE_RCU_RANDOM(rand);
1408
1409 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1410 for_each_online_cpu(cpu)
1411 maxcpu = cpu;
1412 WARN_ON(maxcpu < 0);
1413 while (!kthread_should_stop()) {
1414 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1415 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1416 if (verbose)
1417 printk(KERN_ALERT "%s" TORTURE_FLAG
1418 "rcu_torture_onoff task: offlining %d\n",
1419 torture_type, cpu);
1420 n_offline_attempts++;
1421 if (cpu_down(cpu) == 0) {
1422 if (verbose)
1423 printk(KERN_ALERT "%s" TORTURE_FLAG
1424 "rcu_torture_onoff task: "
1425 "offlined %d\n",
1426 torture_type, cpu);
1427 n_offline_successes++;
1428 }
1429 } else if (cpu_is_hotpluggable(cpu)) {
1430 if (verbose)
1431 printk(KERN_ALERT "%s" TORTURE_FLAG
1432 "rcu_torture_onoff task: onlining %d\n",
1433 torture_type, cpu);
1434 n_online_attempts++;
1435 if (cpu_up(cpu) == 0) {
1436 if (verbose)
1437 printk(KERN_ALERT "%s" TORTURE_FLAG
1438 "rcu_torture_onoff task: "
1439 "onlined %d\n",
1440 torture_type, cpu);
1441 n_online_successes++;
1442 }
1443 }
1444 schedule_timeout_interruptible(onoff_interval * HZ);
1445 }
1446 VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
1447 return 0;
1448}
1449
1450static int
1451rcu_torture_onoff_init(void)
1452{
1453 if (onoff_interval <= 0)
1454 return 0;
1455 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1456 if (IS_ERR(onoff_task)) {
1457 onoff_task = NULL;
1458 return PTR_ERR(onoff_task);
1459 }
1460 return 0;
1461}
1462
1463static void rcu_torture_onoff_cleanup(void)
1464{
1465 if (onoff_task == NULL)
1466 return;
1467 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1468 kthread_stop(onoff_task);
1469}
1470
1471#else /* #ifdef CONFIG_HOTPLUG_CPU */
1472
1473static void
1474rcu_torture_onoff_init(void)
1475{
1476}
1477
1478static void rcu_torture_onoff_cleanup(void)
1479{
1480}
1481
1482#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1483
1290static int rcutorture_cpu_notify(struct notifier_block *self, 1484static int rcutorture_cpu_notify(struct notifier_block *self,
1291 unsigned long action, void *hcpu) 1485 unsigned long action, void *hcpu)
1292{ 1486{
@@ -1391,6 +1585,11 @@ rcu_torture_cleanup(void)
1391 for_each_possible_cpu(i) 1585 for_each_possible_cpu(i)
1392 rcutorture_booster_cleanup(i); 1586 rcutorture_booster_cleanup(i);
1393 } 1587 }
1588 if (shutdown_task != NULL) {
1589 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1590 kthread_stop(shutdown_task);
1591 }
1592 rcu_torture_onoff_cleanup();
1394 1593
1395 /* Wait for all RCU callbacks to fire. */ 1594 /* Wait for all RCU callbacks to fire. */
1396 1595
@@ -1416,7 +1615,7 @@ rcu_torture_init(void)
1416 static struct rcu_torture_ops *torture_ops[] = 1615 static struct rcu_torture_ops *torture_ops[] =
1417 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1616 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1418 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1617 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1419 &srcu_ops, &srcu_expedited_ops, 1618 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
1420 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1619 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1421 1620
1422 mutex_lock(&fullstop_mutex); 1621 mutex_lock(&fullstop_mutex);
@@ -1607,6 +1806,18 @@ rcu_torture_init(void)
1607 } 1806 }
1608 } 1807 }
1609 } 1808 }
1809 if (shutdown_secs > 0) {
1810 shutdown_time = jiffies + shutdown_secs * HZ;
1811 shutdown_task = kthread_run(rcu_torture_shutdown, NULL,
1812 "rcu_torture_shutdown");
1813 if (IS_ERR(shutdown_task)) {
1814 firsterr = PTR_ERR(shutdown_task);
1815 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
1816 shutdown_task = NULL;
1817 goto unwind;
1818 }
1819 }
1820 rcu_torture_onoff_init();
1610 register_reboot_notifier(&rcutorture_shutdown_nb); 1821 register_reboot_notifier(&rcutorture_shutdown_nb);
1611 rcutorture_record_test_transition(); 1822 rcutorture_record_test_transition();
1612 mutex_unlock(&fullstop_mutex); 1823 mutex_unlock(&fullstop_mutex);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6b76d812740c..6c4a6722abfd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
69 NUM_RCU_LVL_3, \ 69 NUM_RCU_LVL_3, \
70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ 70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
71 }, \ 71 }, \
72 .signaled = RCU_GP_IDLE, \ 72 .fqs_state = RCU_GP_IDLE, \
73 .gpnum = -300, \ 73 .gpnum = -300, \
74 .completed = -300, \ 74 .completed = -300, \
75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
@@ -195,12 +195,10 @@ void rcu_note_context_switch(int cpu)
195} 195}
196EXPORT_SYMBOL_GPL(rcu_note_context_switch); 196EXPORT_SYMBOL_GPL(rcu_note_context_switch);
197 197
198#ifdef CONFIG_NO_HZ
199DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 198DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
200 .dynticks_nesting = 1, 199 .dynticks_nesting = DYNTICK_TASK_NESTING,
201 .dynticks = ATOMIC_INIT(1), 200 .dynticks = ATOMIC_INIT(1),
202}; 201};
203#endif /* #ifdef CONFIG_NO_HZ */
204 202
205static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 203static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
206static int qhimark = 10000; /* If this many pending, ignore blimit. */ 204static int qhimark = 10000; /* If this many pending, ignore blimit. */
@@ -328,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
328 return 1; 326 return 1;
329 } 327 }
330 328
331 /* If preemptible RCU, no point in sending reschedule IPI. */ 329 /*
332 if (rdp->preemptible) 330 * The CPU is online, so send it a reschedule IPI. This forces
333 return 0; 331 * it through the scheduler, and (inefficiently) also handles cases
334 332 * where idle loops fail to inform RCU about the CPU being idle.
335 /* The CPU is online, so send it a reschedule IPI. */ 333 */
336 if (rdp->cpu != smp_processor_id()) 334 if (rdp->cpu != smp_processor_id())
337 smp_send_reschedule(rdp->cpu); 335 smp_send_reschedule(rdp->cpu);
338 else 336 else
@@ -343,59 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
343 341
344#endif /* #ifdef CONFIG_SMP */ 342#endif /* #ifdef CONFIG_SMP */
345 343
346#ifdef CONFIG_NO_HZ 344/*
345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
346 *
347 * If the new value of the ->dynticks_nesting counter now is zero,
348 * we really have entered idle, and must do the appropriate accounting.
349 * The caller must have disabled interrupts.
350 */
351static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
352{
353 trace_rcu_dyntick("Start", oldval, 0);
354 if (!is_idle_task(current)) {
355 struct task_struct *idle = idle_task(smp_processor_id());
356
357 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
358 ftrace_dump(DUMP_ALL);
359 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
360 current->pid, current->comm,
361 idle->pid, idle->comm); /* must be idle task! */
362 }
363 rcu_prepare_for_idle(smp_processor_id());
364 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
365 smp_mb__before_atomic_inc(); /* See above. */
366 atomic_inc(&rdtp->dynticks);
367 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
368 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
369}
347 370
348/** 371/**
349 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 372 * rcu_idle_enter - inform RCU that current CPU is entering idle
350 * 373 *
351 * Enter nohz mode, in other words, -leave- the mode in which RCU 374 * Enter idle mode, in other words, -leave- the mode in which RCU
352 * read-side critical sections can occur. (Though RCU read-side 375 * read-side critical sections can occur. (Though RCU read-side
353 * critical sections can occur in irq handlers in nohz mode, a possibility 376 * critical sections can occur in irq handlers in idle, a possibility
354 * handled by rcu_irq_enter() and rcu_irq_exit()). 377 * handled by irq_enter() and irq_exit().)
378 *
379 * We crowbar the ->dynticks_nesting field to zero to allow for
380 * the possibility of usermode upcalls having messed up our count
381 * of interrupt nesting level during the prior busy period.
355 */ 382 */
356void rcu_enter_nohz(void) 383void rcu_idle_enter(void)
357{ 384{
358 unsigned long flags; 385 unsigned long flags;
386 long long oldval;
359 struct rcu_dynticks *rdtp; 387 struct rcu_dynticks *rdtp;
360 388
361 local_irq_save(flags); 389 local_irq_save(flags);
362 rdtp = &__get_cpu_var(rcu_dynticks); 390 rdtp = &__get_cpu_var(rcu_dynticks);
363 if (--rdtp->dynticks_nesting) { 391 oldval = rdtp->dynticks_nesting;
364 local_irq_restore(flags); 392 rdtp->dynticks_nesting = 0;
365 return; 393 rcu_idle_enter_common(rdtp, oldval);
366 }
367 trace_rcu_dyntick("Start");
368 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
369 smp_mb__before_atomic_inc(); /* See above. */
370 atomic_inc(&rdtp->dynticks);
371 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
372 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
373 local_irq_restore(flags); 394 local_irq_restore(flags);
374} 395}
375 396
376/* 397/**
377 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz 398 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
399 *
400 * Exit from an interrupt handler, which might possibly result in entering
401 * idle mode, in other words, leaving the mode in which read-side critical
402 * sections can occur.
378 * 403 *
379 * Exit nohz mode, in other words, -enter- the mode in which RCU 404 * This code assumes that the idle loop never does anything that might
380 * read-side critical sections normally occur. 405 * result in unbalanced calls to irq_enter() and irq_exit(). If your
406 * architecture violates this assumption, RCU will give you what you
407 * deserve, good and hard. But very infrequently and irreproducibly.
408 *
409 * Use things like work queues to work around this limitation.
410 *
411 * You have been warned.
381 */ 412 */
382void rcu_exit_nohz(void) 413void rcu_irq_exit(void)
383{ 414{
384 unsigned long flags; 415 unsigned long flags;
416 long long oldval;
385 struct rcu_dynticks *rdtp; 417 struct rcu_dynticks *rdtp;
386 418
387 local_irq_save(flags); 419 local_irq_save(flags);
388 rdtp = &__get_cpu_var(rcu_dynticks); 420 rdtp = &__get_cpu_var(rcu_dynticks);
389 if (rdtp->dynticks_nesting++) { 421 oldval = rdtp->dynticks_nesting;
390 local_irq_restore(flags); 422 rdtp->dynticks_nesting--;
391 return; 423 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
392 } 424 if (rdtp->dynticks_nesting)
425 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
426 else
427 rcu_idle_enter_common(rdtp, oldval);
428 local_irq_restore(flags);
429}
430
431/*
432 * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
433 *
434 * If the new value of the ->dynticks_nesting counter was previously zero,
435 * we really have exited idle, and must do the appropriate accounting.
436 * The caller must have disabled interrupts.
437 */
438static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
439{
393 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 440 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
394 atomic_inc(&rdtp->dynticks); 441 atomic_inc(&rdtp->dynticks);
395 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 442 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
396 smp_mb__after_atomic_inc(); /* See above. */ 443 smp_mb__after_atomic_inc(); /* See above. */
397 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 444 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
398 trace_rcu_dyntick("End"); 445 rcu_cleanup_after_idle(smp_processor_id());
446 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
447 if (!is_idle_task(current)) {
448 struct task_struct *idle = idle_task(smp_processor_id());
449
450 trace_rcu_dyntick("Error on exit: not idle task",
451 oldval, rdtp->dynticks_nesting);
452 ftrace_dump(DUMP_ALL);
453 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
454 current->pid, current->comm,
455 idle->pid, idle->comm); /* must be idle task! */
456 }
457}
458
459/**
460 * rcu_idle_exit - inform RCU that current CPU is leaving idle
461 *
462 * Exit idle mode, in other words, -enter- the mode in which RCU
463 * read-side critical sections can occur.
464 *
465 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to
466 * allow for the possibility of usermode upcalls messing up our count
467 * of interrupt nesting level during the busy period that is just
468 * now starting.
469 */
470void rcu_idle_exit(void)
471{
472 unsigned long flags;
473 struct rcu_dynticks *rdtp;
474 long long oldval;
475
476 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks);
478 oldval = rdtp->dynticks_nesting;
479 WARN_ON_ONCE(oldval != 0);
480 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING;
481 rcu_idle_exit_common(rdtp, oldval);
482 local_irq_restore(flags);
483}
484
485/**
486 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
487 *
488 * Enter an interrupt handler, which might possibly result in exiting
489 * idle mode, in other words, entering the mode in which read-side critical
490 * sections can occur.
491 *
492 * Note that the Linux kernel is fully capable of entering an interrupt
493 * handler that it never exits, for example when doing upcalls to
494 * user mode! This code assumes that the idle loop never does upcalls to
495 * user mode. If your architecture does do upcalls from the idle loop (or
496 * does anything else that results in unbalanced calls to the irq_enter()
497 * and irq_exit() functions), RCU will give you what you deserve, good
498 * and hard. But very infrequently and irreproducibly.
499 *
500 * Use things like work queues to work around this limitation.
501 *
502 * You have been warned.
503 */
504void rcu_irq_enter(void)
505{
506 unsigned long flags;
507 struct rcu_dynticks *rdtp;
508 long long oldval;
509
510 local_irq_save(flags);
511 rdtp = &__get_cpu_var(rcu_dynticks);
512 oldval = rdtp->dynticks_nesting;
513 rdtp->dynticks_nesting++;
514 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
515 if (oldval)
516 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
517 else
518 rcu_idle_exit_common(rdtp, oldval);
399 local_irq_restore(flags); 519 local_irq_restore(flags);
400} 520}
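
The four entry/exit paths above share one invariant: ->dynticks is even exactly when RCU regards the CPU as idle, and ->dynticks_nesting reaches zero exactly when the CPU really enters idle. The stand-alone user-space program below is a minimal sketch of that accounting (single-threaded, no memory barriers, MODEL_TASK_NESTING standing in for DYNTICK_TASK_NESTING); it is illustrative only, not kernel code.

#include <assert.h>
#include <limits.h>
#include <stdio.h>

struct model_dynticks {
	long long nesting;		/* 0 == idle from RCU's point of view */
	unsigned int dynticks;		/* even == idle, odd == RCU watching  */
};

#define MODEL_TASK_NESTING (LLONG_MAX / 2)	/* stand-in, see rcupdate.h */

static void model_idle_enter_common(struct model_dynticks *d)
{
	d->dynticks++;				/* odd -> even: RCU stops watching */
	assert((d->dynticks & 0x1) == 0);
}

static void model_idle_exit_common(struct model_dynticks *d)
{
	d->dynticks++;				/* even -> odd: RCU watching again */
	assert((d->dynticks & 0x1) == 1);
}

static void model_idle_enter(struct model_dynticks *d)
{
	d->nesting = 0;				/* crowbar, as in rcu_idle_enter() */
	model_idle_enter_common(d);
}

static void model_idle_exit(struct model_dynticks *d)
{
	assert(d->nesting == 0);
	d->nesting = MODEL_TASK_NESTING;
	model_idle_exit_common(d);
}

static void model_irq_enter(struct model_dynticks *d)
{
	if (d->nesting++ == 0)			/* first irq taken from idle */
		model_idle_exit_common(d);
}

static void model_irq_exit(struct model_dynticks *d)
{
	assert(d->nesting > 0);
	if (--d->nesting == 0)			/* last irq, back to idle */
		model_idle_enter_common(d);
}

int main(void)
{
	struct model_dynticks d = { .nesting = MODEL_TASK_NESTING, .dynticks = 1 };

	model_idle_enter(&d);		/* idle loop entered: counter goes even  */
	model_irq_enter(&d);		/* interrupt from idle: counter goes odd */
	model_irq_exit(&d);		/* interrupt done: even again            */
	model_idle_exit(&d);		/* idle loop left: odd again             */
	printf("nesting=%lld dynticks=%u\n", d.nesting, d.dynticks);
	return 0;
}
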
401 521
@@ -442,27 +562,37 @@ void rcu_nmi_exit(void)
442 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 562 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
443} 563}
444 564
565#ifdef CONFIG_PROVE_RCU
566
445/** 567/**
446 * rcu_irq_enter - inform RCU of entry to hard irq context 568 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
447 * 569 *
448 * If the CPU was idle with dynamic ticks active, this updates the 570 * If the current CPU is in its idle loop and is neither in an interrupt
449 * rdtp->dynticks to let the RCU handling know that the CPU is active. 571 * nor NMI handler, return true.
450 */ 572 */
451void rcu_irq_enter(void) 573int rcu_is_cpu_idle(void)
452{ 574{
453 rcu_exit_nohz(); 575 int ret;
576
577 preempt_disable();
578 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
579 preempt_enable();
580 return ret;
454} 581}
582EXPORT_SYMBOL(rcu_is_cpu_idle);
583
584#endif /* #ifdef CONFIG_PROVE_RCU */
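
One intended consumer of rcu_is_cpu_idle() under CONFIG_PROVE_RCU is debug checking that RCU read-side primitives are not used from the idle loop. The helper below is a hypothetical sketch of such a check; the name debug_assert_rcu_watching() is not part of this series.

/* Hypothetical debug check (sketch): using RCU while RCU thinks the
 * CPU is idle is a bug, because the grace-period machinery is free to
 * ignore this CPU entirely. */
static inline void debug_assert_rcu_watching(void)
{
	WARN_ON_ONCE(rcu_is_cpu_idle());
}
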
455 585
456/** 586/**
457 * rcu_irq_exit - inform RCU of exit from hard irq context 587 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
458 * 588 *
459 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks 589 * If the current CPU is idle or running at a first-level (not nested)
460 * to put let the RCU handling be aware that the CPU is going back to idle 590 * interrupt from idle, return true. The caller must have at least
461 * with no ticks. 591 * disabled preemption.
462 */ 592 */
463void rcu_irq_exit(void) 593int rcu_is_cpu_rrupt_from_idle(void)
464{ 594{
465 rcu_enter_nohz(); 595 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
466} 596}
467 597
468#ifdef CONFIG_SMP 598#ifdef CONFIG_SMP
@@ -475,7 +605,7 @@ void rcu_irq_exit(void)
475static int dyntick_save_progress_counter(struct rcu_data *rdp) 605static int dyntick_save_progress_counter(struct rcu_data *rdp)
476{ 606{
477 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 607 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
478 return 0; 608 return (rdp->dynticks_snap & 0x1) == 0;
479} 609}
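
The new return value encodes the parity convention directly: an even snapshot means the sampled CPU was in dyntick-idle at snapshot time, so force_quiescent_state() can credit it with a quiescent state without sending it a resched IPI. A trivial stand-alone illustration (sketch, not kernel code):

#include <assert.h>

/* even low-order bit == CPU was dyntick-idle when sampled */
static int counts_as_quiescent(int dynticks_snap)
{
	return (dynticks_snap & 0x1) == 0;
}

int main(void)
{
	assert(counts_as_quiescent(4));		/* idle CPU: implicit QS             */
	assert(!counts_as_quiescent(5));	/* busy CPU: must report its own QS  */
	return 0;
}
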
480 610
481/* 611/*
@@ -512,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
512 642
513#endif /* #ifdef CONFIG_SMP */ 643#endif /* #ifdef CONFIG_SMP */
514 644
515#else /* #ifdef CONFIG_NO_HZ */
516
517#ifdef CONFIG_SMP
518
519static int dyntick_save_progress_counter(struct rcu_data *rdp)
520{
521 return 0;
522}
523
524static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
525{
526 return rcu_implicit_offline_qs(rdp);
527}
528
529#endif /* #ifdef CONFIG_SMP */
530
531#endif /* #else #ifdef CONFIG_NO_HZ */
532
533int rcu_cpu_stall_suppress __read_mostly;
534
535static void record_gp_stall_check_time(struct rcu_state *rsp) 645static void record_gp_stall_check_time(struct rcu_state *rsp)
536{ 646{
537 rsp->gp_start = jiffies; 647 rsp->gp_start = jiffies;
@@ -866,8 +976,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
866 /* Advance to a new grace period and initialize state. */ 976 /* Advance to a new grace period and initialize state. */
867 rsp->gpnum++; 977 rsp->gpnum++;
868 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 978 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
869 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); 979 WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
870 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 980 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
871 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 981 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
872 record_gp_stall_check_time(rsp); 982 record_gp_stall_check_time(rsp);
873 983
@@ -877,7 +987,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
877 rnp->qsmask = rnp->qsmaskinit; 987 rnp->qsmask = rnp->qsmaskinit;
878 rnp->gpnum = rsp->gpnum; 988 rnp->gpnum = rsp->gpnum;
879 rnp->completed = rsp->completed; 989 rnp->completed = rsp->completed;
880 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 990 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
881 rcu_start_gp_per_cpu(rsp, rnp, rdp); 991 rcu_start_gp_per_cpu(rsp, rnp, rdp);
882 rcu_preempt_boost_start_gp(rnp); 992 rcu_preempt_boost_start_gp(rnp);
883 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 993 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
@@ -927,7 +1037,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
927 1037
928 rnp = rcu_get_root(rsp); 1038 rnp = rcu_get_root(rsp);
929 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1039 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
930 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 1040 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
931 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1041 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
932 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1042 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
933} 1043}
@@ -991,7 +1101,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
991 1101
992 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ 1102 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
993 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1103 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
994 rsp->signaled = RCU_GP_IDLE; 1104 rsp->fqs_state = RCU_GP_IDLE;
995 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 1105 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
996} 1106}
997 1107
@@ -1221,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1221 else 1331 else
1222 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1332 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1223 if (need_report & RCU_OFL_TASKS_EXP_GP) 1333 if (need_report & RCU_OFL_TASKS_EXP_GP)
1224 rcu_report_exp_rnp(rsp, rnp); 1334 rcu_report_exp_rnp(rsp, rnp, true);
1225 rcu_node_kthread_setaffinity(rnp, -1); 1335 rcu_node_kthread_setaffinity(rnp, -1);
1226} 1336}
1227 1337
@@ -1263,7 +1373,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1263 /* If no callbacks are ready, just return.*/ 1373 /* If no callbacks are ready, just return.*/
1264 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1374 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1265 trace_rcu_batch_start(rsp->name, 0, 0); 1375 trace_rcu_batch_start(rsp->name, 0, 0);
1266 trace_rcu_batch_end(rsp->name, 0); 1376 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1377 need_resched(), is_idle_task(current),
1378 rcu_is_callbacks_kthread());
1267 return; 1379 return;
1268 } 1380 }
1269 1381
@@ -1291,12 +1403,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1291 debug_rcu_head_unqueue(list); 1403 debug_rcu_head_unqueue(list);
1292 __rcu_reclaim(rsp->name, list); 1404 __rcu_reclaim(rsp->name, list);
1293 list = next; 1405 list = next;
1294 if (++count >= bl) 1406 /* Stop only if limit reached and CPU has something to do. */
1407 if (++count >= bl &&
1408 (need_resched() ||
1409 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
1295 break; 1410 break;
1296 } 1411 }
1297 1412
1298 local_irq_save(flags); 1413 local_irq_save(flags);
1299 trace_rcu_batch_end(rsp->name, count); 1414 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
1415 is_idle_task(current),
1416 rcu_is_callbacks_kthread());
1300 1417
1301 /* Update count, and requeue any remaining callbacks. */ 1418 /* Update count, and requeue any remaining callbacks. */
1302 rdp->qlen -= count; 1419 rdp->qlen -= count;
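
The new break condition lets an idle CPU or the RCU callbacks kthread keep draining past blimit as long as nothing else needs the CPU. Below is a small stand-alone model of just that test, a sketch of the condition added above rather than kernel code.

#include <assert.h>
#include <stdbool.h>

static bool stop_batch(long count, long bl, bool resched,
		       bool idle_task, bool cbs_kthread)
{
	/* Stop only if the limit was reached and the CPU has other work. */
	return count >= bl && (resched || (!idle_task && !cbs_kthread));
}

int main(void)
{
	/* Ordinary task context: stop at the limit as before. */
	assert(stop_batch(10, 10, false, false, false));
	/* Idle task, nothing else to run: keep draining callbacks. */
	assert(!stop_batch(10, 10, false, true, false));
	/* Callbacks kthread, but a reschedule is pending: stop. */
	assert(stop_batch(10, 10, true, false, true));
	return 0;
}
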
@@ -1334,16 +1451,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1334 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1451 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1335 * Also schedule RCU core processing. 1452 * Also schedule RCU core processing.
1336 * 1453 *
1337 * This function must be called with hardirqs disabled. It is normally 1454 * This function must be called from hardirq context. It is normally
1338 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1455 * invoked from the scheduling-clock interrupt. If rcu_pending returns
1339 * false, there is no point in invoking rcu_check_callbacks(). 1456 * false, there is no point in invoking rcu_check_callbacks().
1340 */ 1457 */
1341void rcu_check_callbacks(int cpu, int user) 1458void rcu_check_callbacks(int cpu, int user)
1342{ 1459{
1343 trace_rcu_utilization("Start scheduler-tick"); 1460 trace_rcu_utilization("Start scheduler-tick");
1344 if (user || 1461 if (user || rcu_is_cpu_rrupt_from_idle()) {
1345 (idle_cpu(cpu) && rcu_scheduler_active &&
1346 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
1347 1462
1348 /* 1463 /*
1349 * Get here if this CPU took its interrupt from user 1464 * Get here if this CPU took its interrupt from user
@@ -1457,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1457 goto unlock_fqs_ret; /* no GP in progress, time updated. */ 1572 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1458 } 1573 }
1459 rsp->fqs_active = 1; 1574 rsp->fqs_active = 1;
1460 switch (rsp->signaled) { 1575 switch (rsp->fqs_state) {
1461 case RCU_GP_IDLE: 1576 case RCU_GP_IDLE:
1462 case RCU_GP_INIT: 1577 case RCU_GP_INIT:
1463 1578
@@ -1473,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1473 force_qs_rnp(rsp, dyntick_save_progress_counter); 1588 force_qs_rnp(rsp, dyntick_save_progress_counter);
1474 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1589 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1475 if (rcu_gp_in_progress(rsp)) 1590 if (rcu_gp_in_progress(rsp))
1476 rsp->signaled = RCU_FORCE_QS; 1591 rsp->fqs_state = RCU_FORCE_QS;
1477 break; 1592 break;
1478 1593
1479 case RCU_FORCE_QS: 1594 case RCU_FORCE_QS:
@@ -1812,7 +1927,7 @@ static int rcu_pending(int cpu)
1812 * by the current CPU, even if none need be done immediately, returning 1927 * by the current CPU, even if none need be done immediately, returning
1813 * 1 if so. 1928 * 1 if so.
1814 */ 1929 */
1815static int rcu_needs_cpu_quick_check(int cpu) 1930static int rcu_cpu_has_callbacks(int cpu)
1816{ 1931{
1817 /* RCU callbacks either ready or pending? */ 1932 /* RCU callbacks either ready or pending? */
1818 return per_cpu(rcu_sched_data, cpu).nxtlist || 1933 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1913,9 +2028,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1913 for (i = 0; i < RCU_NEXT_SIZE; i++) 2028 for (i = 0; i < RCU_NEXT_SIZE; i++)
1914 rdp->nxttail[i] = &rdp->nxtlist; 2029 rdp->nxttail[i] = &rdp->nxtlist;
1915 rdp->qlen = 0; 2030 rdp->qlen = 0;
1916#ifdef CONFIG_NO_HZ
1917 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2031 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1918#endif /* #ifdef CONFIG_NO_HZ */ 2032 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
2033 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
1919 rdp->cpu = cpu; 2034 rdp->cpu = cpu;
1920 rdp->rsp = rsp; 2035 rdp->rsp = rsp;
1921 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2036 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1942,6 +2057,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1942 rdp->qlen_last_fqs_check = 0; 2057 rdp->qlen_last_fqs_check = 0;
1943 rdp->n_force_qs_snap = rsp->n_force_qs; 2058 rdp->n_force_qs_snap = rsp->n_force_qs;
1944 rdp->blimit = blimit; 2059 rdp->blimit = blimit;
2060 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING;
2061 atomic_set(&rdp->dynticks->dynticks,
2062 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2063 rcu_prepare_for_idle_init(cpu);
1945 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2064 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1946 2065
1947 /* 2066 /*
@@ -2023,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2023 rcu_send_cbs_to_online(&rcu_bh_state); 2142 rcu_send_cbs_to_online(&rcu_bh_state);
2024 rcu_send_cbs_to_online(&rcu_sched_state); 2143 rcu_send_cbs_to_online(&rcu_sched_state);
2025 rcu_preempt_send_cbs_to_online(); 2144 rcu_preempt_send_cbs_to_online();
2145 rcu_cleanup_after_idle(cpu);
2026 break; 2146 break;
2027 case CPU_DEAD: 2147 case CPU_DEAD:
2028 case CPU_DEAD_FROZEN: 2148 case CPU_DEAD_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 849ce9ec51fe..fddff92d6676 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,9 +84,10 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track irq/process nesting level. */ 87 long long dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 88 /* Process level is worth LLONG_MAX/2. */
89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */
90}; 91};
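
Widening ->dynticks_nesting to long long goes hand in hand with the very large process-level value mentioned in the comment: unbalanced irq_enter()/irq_exit() pairs caused by usermode upcalls can then never drag the count to zero and make RCU believe the CPU went idle. A stand-alone sketch of that property follows; LLONG_MAX/2 is used here as a stand-in for DYNTICK_TASK_NESTING, whose exact definition lives in rcupdate.h.

#include <assert.h>
#include <limits.h>

int main(void)
{
	long long nesting = LLONG_MAX / 2;	/* "process level", per the comment */
	int i;

	/* A burst of unbalanced irq_exit()s from usermode upcalls... */
	for (i = 0; i < 1000000; i++)
		nesting--;

	/* ...cannot reach zero, so RCU never falsely sees an idle entry. */
	assert(nesting > 0);

	/* Real idle entry does not trust the count; it crowbars it to 0. */
	nesting = 0;
	return 0;
}
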
91 92
92/* RCU's kthread states for tracing. */ 93/* RCU's kthread states for tracing. */
@@ -274,16 +275,12 @@ struct rcu_data {
274 /* did other CPU force QS recently? */ 275 /* did other CPU force QS recently? */
275 long blimit; /* Upper limit on a processed batch */ 276 long blimit; /* Upper limit on a processed batch */
276 277
277#ifdef CONFIG_NO_HZ
278 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
280 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
281#endif /* #ifdef CONFIG_NO_HZ */
282 281
283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
284#ifdef CONFIG_NO_HZ
285 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
286#endif /* #ifdef CONFIG_NO_HZ */
287 unsigned long offline_fqs; /* Kicked due to being offline. */ 284 unsigned long offline_fqs; /* Kicked due to being offline. */
288 unsigned long resched_ipi; /* Sent a resched IPI. */ 285 unsigned long resched_ipi; /* Sent a resched IPI. */
289 286
@@ -302,16 +299,12 @@ struct rcu_data {
302 struct rcu_state *rsp; 299 struct rcu_state *rsp;
303}; 300};
304 301
305/* Values for signaled field in struct rcu_state. */ 302/* Values for fqs_state field in struct rcu_state. */
306#define RCU_GP_IDLE 0 /* No grace period in progress. */ 303#define RCU_GP_IDLE 0 /* No grace period in progress. */
307#define RCU_GP_INIT 1 /* Grace period being initialized. */ 304#define RCU_GP_INIT 1 /* Grace period being initialized. */
308#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 305#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
309#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 306#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
310#ifdef CONFIG_NO_HZ
311#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 307#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
312#else /* #ifdef CONFIG_NO_HZ */
313#define RCU_SIGNAL_INIT RCU_FORCE_QS
314#endif /* #else #ifdef CONFIG_NO_HZ */
315 308
316#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 309#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
317 310
@@ -361,7 +354,7 @@ struct rcu_state {
361 354
362 /* The following fields are guarded by the root rcu_node's lock. */ 355 /* The following fields are guarded by the root rcu_node's lock. */
363 356
364 u8 signaled ____cacheline_internodealigned_in_smp; 357 u8 fqs_state ____cacheline_internodealigned_in_smp;
365 /* Force QS state. */ 358 /* Force QS state. */
366 u8 fqs_active; /* force_quiescent_state() */ 359 u8 fqs_active; /* force_quiescent_state() */
367 /* is running. */ 360 /* is running. */
@@ -451,7 +444,8 @@ static void rcu_preempt_check_callbacks(int cpu);
451static void rcu_preempt_process_callbacks(void); 444static void rcu_preempt_process_callbacks(void);
452void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
453#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 446#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
454static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); 447static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
448 bool wake);
455#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
456static int rcu_preempt_pending(int cpu); 450static int rcu_preempt_pending(int cpu);
457static int rcu_preempt_needs_cpu(int cpu); 451static int rcu_preempt_needs_cpu(int cpu);
@@ -461,6 +455,7 @@ static void __init __rcu_init_preempt(void);
461static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
462static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
463static void invoke_rcu_callbacks_kthread(void); 457static void invoke_rcu_callbacks_kthread(void);
458static bool rcu_is_callbacks_kthread(void);
464#ifdef CONFIG_RCU_BOOST 459#ifdef CONFIG_RCU_BOOST
465static void rcu_preempt_do_callbacks(void); 460static void rcu_preempt_do_callbacks(void);
466static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, 461static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
@@ -473,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
473#endif /* #ifdef CONFIG_RCU_BOOST */ 468#endif /* #ifdef CONFIG_RCU_BOOST */
474static void rcu_cpu_kthread_setrt(int cpu, int to_rt); 469static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
475static void __cpuinit rcu_prepare_kthreads(int cpu); 470static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu);
476 474
477#endif /* #ifndef RCU_TREE_NONCORE */ 475#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 4b9b9f8a4184..8bb35d73e1f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -312,6 +312,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
312{ 312{
313 int empty; 313 int empty;
314 int empty_exp; 314 int empty_exp;
315 int empty_exp_now;
315 unsigned long flags; 316 unsigned long flags;
316 struct list_head *np; 317 struct list_head *np;
317#ifdef CONFIG_RCU_BOOST 318#ifdef CONFIG_RCU_BOOST
@@ -382,8 +383,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
382 /* 383 /*
383 * If this was the last task on the current list, and if 384 * If this was the last task on the current list, and if
384 * we aren't waiting on any CPUs, report the quiescent state. 385 * we aren't waiting on any CPUs, report the quiescent state.
385 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 386 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
387 * so we must take a snapshot of the expedited state.
386 */ 388 */
389 empty_exp_now = !rcu_preempted_readers_exp(rnp);
387 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 390 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
388 trace_rcu_quiescent_state_report("preempt_rcu", 391 trace_rcu_quiescent_state_report("preempt_rcu",
389 rnp->gpnum, 392 rnp->gpnum,
@@ -406,8 +409,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
406 * If this was the last task on the expedited lists, 409 * If this was the last task on the expedited lists,
407 * then we need to report up the rcu_node hierarchy. 410 * then we need to report up the rcu_node hierarchy.
408 */ 411 */
409 if (!empty_exp && !rcu_preempted_readers_exp(rnp)) 412 if (!empty_exp && empty_exp_now)
410 rcu_report_exp_rnp(&rcu_preempt_state, rnp); 413 rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
411 } else { 414 } else {
412 local_irq_restore(flags); 415 local_irq_restore(flags);
413 } 416 }
@@ -729,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
729 * recursively up the tree. (Calm down, calm down, we do the recursion 732 * recursively up the tree. (Calm down, calm down, we do the recursion
730 * iteratively!) 733 * iteratively!)
731 * 734 *
735 * Most callers will set the "wake" flag, but the task initiating the
736 * expedited grace period need not wake itself.
737 *
732 * Caller must hold sync_rcu_preempt_exp_mutex. 738 * Caller must hold sync_rcu_preempt_exp_mutex.
733 */ 739 */
734static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 740static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
741 bool wake)
735{ 742{
736 unsigned long flags; 743 unsigned long flags;
737 unsigned long mask; 744 unsigned long mask;
@@ -744,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
744 } 751 }
745 if (rnp->parent == NULL) { 752 if (rnp->parent == NULL) {
746 raw_spin_unlock_irqrestore(&rnp->lock, flags); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
747 wake_up(&sync_rcu_preempt_exp_wq); 754 if (wake)
755 wake_up(&sync_rcu_preempt_exp_wq);
748 break; 756 break;
749 } 757 }
750 mask = rnp->grpmask; 758 mask = rnp->grpmask;
@@ -777,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
777 must_wait = 1; 785 must_wait = 1;
778 } 786 }
779 if (!must_wait) 787 if (!must_wait)
780 rcu_report_exp_rnp(rsp, rnp); 788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
781} 789}
782 790
783/* 791/*
@@ -1069,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1069 * report on tasks preempted in RCU read-side critical sections during 1077 * report on tasks preempted in RCU read-side critical sections during
1070 * expedited RCU grace periods. 1078 * expedited RCU grace periods.
1071 */ 1079 */
1072static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 1080static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1081 bool wake)
1073{ 1082{
1074 return;
1075} 1083}
1076 1084
1077#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1085#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1157,8 +1165,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1157 1165
1158#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1166#endif /* #else #ifdef CONFIG_RCU_TRACE */
1159 1167
1160static struct lock_class_key rcu_boost_class;
1161
1162/* 1168/*
1163 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1169 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1164 * or ->boost_tasks, advancing the pointer to the next task in the 1170 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1221,15 +1227,13 @@ static int rcu_boost(struct rcu_node *rnp)
1221 */ 1227 */
1222 t = container_of(tb, struct task_struct, rcu_node_entry); 1228 t = container_of(tb, struct task_struct, rcu_node_entry);
1223 rt_mutex_init_proxy_locked(&mtx, t); 1229 rt_mutex_init_proxy_locked(&mtx, t);
1224 /* Avoid lockdep false positives. This rt_mutex is its own thing. */
1225 lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
1226 "rcu_boost_mutex");
1227 t->rcu_boost_mutex = &mtx; 1230 t->rcu_boost_mutex = &mtx;
1228 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1231 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1229 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1232 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1230 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1233 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1231 1234
1232 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; 1235 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1236 ACCESS_ONCE(rnp->boost_tasks) != NULL;
1233} 1237}
1234 1238
1235/* 1239/*
@@ -1329,6 +1333,15 @@ static void invoke_rcu_callbacks_kthread(void)
1329} 1333}
1330 1334
1331/* 1335/*
1336 * Is the current CPU running the RCU-callbacks kthread?
1337 * Caller must have preemption disabled.
1338 */
1339static bool rcu_is_callbacks_kthread(void)
1340{
1341 return __get_cpu_var(rcu_cpu_kthread_task) == current;
1342}
1343
1344/*
1332 * Set the affinity of the boost kthread. The CPU-hotplug locks are 1345 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1333 * held, so no one should be messing with the existence of the boost 1346 * held, so no one should be messing with the existence of the boost
1334 * kthread. 1347 * kthread.
@@ -1772,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void)
1772 WARN_ON_ONCE(1); 1785 WARN_ON_ONCE(1);
1773} 1786}
1774 1787
1788static bool rcu_is_callbacks_kthread(void)
1789{
1790 return false;
1791}
1792
1775static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1793static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1776{ 1794{
1777} 1795}
@@ -1907,7 +1925,7 @@ void synchronize_sched_expedited(void)
1907 * grace period works for us. 1925 * grace period works for us.
1908 */ 1926 */
1909 get_online_cpus(); 1927 get_online_cpus();
1910 snap = atomic_read(&sync_sched_expedited_started) - 1; 1928 snap = atomic_read(&sync_sched_expedited_started);
1911 smp_mb(); /* ensure read is before try_stop_cpus(). */ 1929 smp_mb(); /* ensure read is before try_stop_cpus(). */
1912 } 1930 }
1913 1931
@@ -1939,88 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1939 * 1 if so. This function is part of the RCU implementation; it is -not- 1957 * 1 if so. This function is part of the RCU implementation; it is -not-
1940 * an exported member of the RCU API. 1958 * an exported member of the RCU API.
1941 * 1959 *
1942 * Because we have preemptible RCU, just check whether this CPU needs 1960 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
1943 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption 1961 * any flavor of RCU.
1944 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1945 */ 1962 */
1946int rcu_needs_cpu(int cpu) 1963int rcu_needs_cpu(int cpu)
1947{ 1964{
1948 return rcu_needs_cpu_quick_check(cpu); 1965 return rcu_cpu_has_callbacks(cpu);
1966}
1967
1968/*
1969 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1970 */
1971static void rcu_prepare_for_idle_init(int cpu)
1972{
1973}
1974
1975/*
1976 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1977 * after it.
1978 */
1979static void rcu_cleanup_after_idle(int cpu)
1980{
1981}
1982
1983/*
1984 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1985 * is nothing.
1986 */
1987static void rcu_prepare_for_idle(int cpu)
1988{
1949} 1989}
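
In both configurations, rcu_needs_cpu() is the question the dyntick-idle path asks before letting the tick stop. A hedged sketch of that consumer side follows; the real call site lives in kernel/time/tick-sched.c, outside the hunks shown in this diff, and also consults other subsystems.

/* Sketch only: may this CPU stop its scheduling-clock tick, as far as
 * RCU is concerned? */
static int rcu_allows_tick_stop(int cpu)
{
	return !rcu_needs_cpu(cpu);
}
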
1950 1990
1951#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1991#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1952 1992
1953#define RCU_NEEDS_CPU_FLUSHES 5 1993/*
1994 * This code is invoked when a CPU goes idle, at which point we want
1995 * to have the CPU do everything required for RCU so that it can enter
1996 * the energy-efficient dyntick-idle mode. This is handled by a
1997 * state machine implemented by rcu_prepare_for_idle() below.
1998 *
1999 * The following three preprocessor symbols control this state machine:
2000 *
2001 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
2002 * to satisfy RCU. Beyond this point, it is better to incur a periodic
2003 * scheduling-clock interrupt than to loop through the state machine
2004 * at full power.
2005 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
2006 * optional if RCU does not need anything immediately from this
2007 * CPU, even if this CPU still has RCU callbacks queued. The first
2008 * few times through the state machine are mandatory: we need to give
2009 * the state machine a chance to communicate a quiescent state
2010 * to the RCU core.
2011 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
2012 * to sleep in dyntick-idle mode with RCU callbacks pending. This
2013 * is sized to be roughly one RCU grace period. Those energy-efficiency
2014 * benchmarkers who might otherwise be tempted to set this to a large
2015 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
2016 * system. And if you are -that- concerned about energy efficiency,
2017 * just power the system down and be done with it!
2018 *
2019 * The values below work well in practice. If future workloads require
2020 * adjustment, they can be converted into kernel config parameters, though
2021 * making the state machine smarter might be a better option.
2022 */
2023#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
2024#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
2025#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
2026
1954static DEFINE_PER_CPU(int, rcu_dyntick_drain); 2027static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1955static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 2028static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
2029static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
2030static ktime_t rcu_idle_gp_wait;
1956 2031
1957/* 2032/*
1958 * Check to see if any future RCU-related work will need to be done 2033 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1959 * by the current CPU, even if none need be done immediately, returning 2034 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1960 * 1 if so. This function is part of the RCU implementation; it is -not- 2035 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1961 * an exported member of the RCU API. 2036 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
2037 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
2038 * it is better to incur scheduling-clock interrupts than to spin
2039 * continuously for the same time duration!
2040 */
2041int rcu_needs_cpu(int cpu)
2042{
2043 /* If no callbacks, RCU doesn't need the CPU. */
2044 if (!rcu_cpu_has_callbacks(cpu))
2045 return 0;
2046 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
2047 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
2048}
2049
2050/*
2051 * Timer handler used to force CPU to start pushing its remaining RCU
2052 * callbacks in the case where it entered dyntick-idle mode with callbacks
2053 * pending. The handler doesn't really need to do anything because the
2054 * real work is done upon re-entry to idle, or by the next scheduling-clock
2055 * interrupt should idle not be re-entered.
2056 */
2057static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2058{
2059 trace_rcu_prep_idle("Timer");
2060 return HRTIMER_NORESTART;
2061}
2062
2063/*
2064 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
2065 */
2066static void rcu_prepare_for_idle_init(int cpu)
2067{
2068 static int firsttime = 1;
2069 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
2070
2071 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2072 hrtp->function = rcu_idle_gp_timer_func;
2073 if (firsttime) {
2074 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2075
2076 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2077 firsttime = 0;
2078 }
2079}
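
The one-time conversion above turns RCU_IDLE_GP_DELAY from jiffies into a ktime interval for the hrtimer. As a worked example, assuming CONFIG_HZ=1000 (HZ is configuration dependent), 6 jiffies is 6000 us, i.e. a 6 ms timer:

#include <stdio.h>

#define HZ			1000	/* assumption: CONFIG_HZ=1000    */
#define RCU_IDLE_GP_DELAY	6	/* jiffies, from the code above  */

int main(void)
{
	/* mirrors: upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
	 *          rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); */
	unsigned int upj = RCU_IDLE_GP_DELAY * (1000000 / HZ);
	unsigned long long ns = (unsigned long long)upj * 1000;

	printf("rcu_idle_gp_wait: %u us = %llu ns (~%llu ms)\n",
	       upj, ns, ns / 1000000);
	return 0;
}
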
2080
2081/*
2082 * Clean up for exit from idle. Because we are exiting from idle, there
2083 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will
2084 * do nothing if this timer is not active, so just cancel it unconditionally.
2085 */
2086static void rcu_cleanup_after_idle(int cpu)
2087{
2088 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
2089}
2090
2091/*
2092 * Check to see if any RCU-related work can be done by the current CPU,
2093 * and if so, schedule a softirq to get it done. This function is part
2094 * of the RCU implementation; it is -not- an exported member of the RCU API.
1962 * 2095 *
1963 * Because we are not supporting preemptible RCU, attempt to accelerate 2096 * The idea is for the current CPU to clear out all work required by the
1964 * any current grace periods so that RCU no longer needs this CPU, but 2097 * RCU core for the current grace period, so that this CPU can be permitted
1965 * only if all other CPUs are already in dynticks-idle mode. This will 2098 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1966 * allow the CPU cores to be powered down immediately, as opposed to after 2099 * at the end of the grace period by whatever CPU ends the grace period.
1967 * waiting many milliseconds for grace periods to elapse. 2100 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
2101 * number of wakeups by a modest integer factor.
1968 * 2102 *
1969 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2103 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1970 * disabled, we do one pass of force_quiescent_state(), then do a 2104 * disabled, we do one pass of force_quiescent_state(), then do a
1971 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2105 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1972 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2106 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
2107 *
2108 * The caller must have disabled interrupts.
1973 */ 2109 */
1974int rcu_needs_cpu(int cpu) 2110static void rcu_prepare_for_idle(int cpu)
1975{ 2111{
1976 int c = 0; 2112 unsigned long flags;
1977 int snap; 2113
1978 int thatcpu; 2114 local_irq_save(flags);
1979 2115
1980 /* Check for being in the holdoff period. */ 2116 /*
1981 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) 2117 * If there are no callbacks on this CPU, enter dyntick-idle mode.
1982 return rcu_needs_cpu_quick_check(cpu); 2118 * Also reset state to avoid prejudicing later attempts.
1983 2119 */
1984 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 2120 if (!rcu_cpu_has_callbacks(cpu)) {
1985 for_each_online_cpu(thatcpu) { 2121 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1986 if (thatcpu == cpu) 2122 per_cpu(rcu_dyntick_drain, cpu) = 0;
1987 continue; 2123 local_irq_restore(flags);
1988 snap = atomic_add_return(0, &per_cpu(rcu_dynticks, 2124 trace_rcu_prep_idle("No callbacks");
1989 thatcpu).dynticks); 2125 return;
1990 smp_mb(); /* Order sampling of snap with end of grace period. */ 2126 }
1991 if ((snap & 0x1) != 0) { 2127
1992 per_cpu(rcu_dyntick_drain, cpu) = 0; 2128 /*
1993 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2129 * If in holdoff mode, just return. We will presumably have
1994 return rcu_needs_cpu_quick_check(cpu); 2130 * refrained from disabling the scheduling-clock tick.
1995 } 2131 */
2132 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
2133 local_irq_restore(flags);
2134 trace_rcu_prep_idle("In holdoff");
2135 return;
1996 } 2136 }
1997 2137
1998 /* Check and update the rcu_dyntick_drain sequencing. */ 2138 /* Check and update the rcu_dyntick_drain sequencing. */
1999 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2139 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2000 /* First time through, initialize the counter. */ 2140 /* First time through, initialize the counter. */
2001 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; 2141 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
2142 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
2143 !rcu_pending(cpu)) {
2144 /* Can we go dyntick-idle despite still having callbacks? */
2145 trace_rcu_prep_idle("Dyntick with callbacks");
2146 per_cpu(rcu_dyntick_drain, cpu) = 0;
2147 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2148 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2149 rcu_idle_gp_wait, HRTIMER_MODE_REL);
2150 return; /* Nothing more to do immediately. */
2002 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2151 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2003 /* We have hit the limit, so time to give up. */ 2152 /* We have hit the limit, so time to give up. */
2004 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2153 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2005 return rcu_needs_cpu_quick_check(cpu); 2154 local_irq_restore(flags);
2155 trace_rcu_prep_idle("Begin holdoff");
2156 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2157 return;
2006 } 2158 }
2007 2159
2008 /* Do one step pushing remaining RCU callbacks through. */ 2160 /*
2161 * Do one step of pushing the remaining RCU callbacks through
2162 * the RCU core state machine.
2163 */
2164#ifdef CONFIG_TREE_PREEMPT_RCU
2165 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2166 local_irq_restore(flags);
2167 rcu_preempt_qs(cpu);
2168 force_quiescent_state(&rcu_preempt_state, 0);
2169 local_irq_save(flags);
2170 }
2171#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2009 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 2172 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2173 local_irq_restore(flags);
2010 rcu_sched_qs(cpu); 2174 rcu_sched_qs(cpu);
2011 force_quiescent_state(&rcu_sched_state, 0); 2175 force_quiescent_state(&rcu_sched_state, 0);
2012 c = c || per_cpu(rcu_sched_data, cpu).nxtlist; 2176 local_irq_save(flags);
2013 } 2177 }
2014 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 2178 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2179 local_irq_restore(flags);
2015 rcu_bh_qs(cpu); 2180 rcu_bh_qs(cpu);
2016 force_quiescent_state(&rcu_bh_state, 0); 2181 force_quiescent_state(&rcu_bh_state, 0);
2017 c = c || per_cpu(rcu_bh_data, cpu).nxtlist; 2182 local_irq_save(flags);
2018 } 2183 }
2019 2184
2020 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 2185 /*
2021 if (c) 2186 * If RCU callbacks are still pending, RCU still needs this CPU.
2187 * So try forcing the callbacks through the grace period.
2188 */
2189 if (rcu_cpu_has_callbacks(cpu)) {
2190 local_irq_restore(flags);
2191 trace_rcu_prep_idle("More callbacks");
2022 invoke_rcu_core(); 2192 invoke_rcu_core();
2023 return c; 2193 } else {
2194 local_irq_restore(flags);
2195 trace_rcu_prep_idle("Callbacks drained");
2196 }
2024} 2197}
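
Pulling the branches above together, the per-CPU rcu_dyntick_drain / rcu_dyntick_holdoff pair implements a small decision procedure. The stand-alone program below is a simplified single-CPU sketch of that decision tree (holdoff updates and the actual callback pushing are omitted); it is not kernel code.

#include <stdbool.h>
#include <stdio.h>

#define RCU_IDLE_FLUSHES	5
#define RCU_IDLE_OPT_FLUSHES	3

enum idle_prep_action {
	IDLE_NO_CALLBACKS,	/* nothing queued: go dyntick-idle, reset state    */
	IDLE_IN_HOLDOFF,	/* recently gave up: keep the tick, do nothing     */
	IDLE_TIMED_SLEEP,	/* callbacks queued but none urgent: sleep with    */
				/* the rcu_idle_gp_timer armed                     */
	IDLE_BEGIN_HOLDOFF,	/* out of tries: punt to the RCU core, hold off    */
	IDLE_PUSH_CALLBACKS,	/* take one more pass at pushing callbacks through */
};

static enum idle_prep_action prepare_for_idle(bool has_callbacks, bool rcu_pending,
					      unsigned long holdoff, unsigned long now,
					      int *drain)
{
	if (!has_callbacks) {
		*drain = 0;
		return IDLE_NO_CALLBACKS;
	}
	if (holdoff == now)
		return IDLE_IN_HOLDOFF;
	if (*drain <= 0)
		*drain = RCU_IDLE_FLUSHES;	/* first attempt this idle period */
	else if (*drain <= RCU_IDLE_OPT_FLUSHES && !rcu_pending)
		return IDLE_TIMED_SLEEP;	/* dyntick-idle despite callbacks */
	else if (--*drain <= 0)
		return IDLE_BEGIN_HOLDOFF;	/* give up until the next jiffy   */
	return IDLE_PUSH_CALLBACKS;
}

int main(void)
{
	int drain = 0;
	int pass;
	unsigned long jiffies = 1000;

	/* While RCU still needs something, each pass pushes callbacks... */
	for (pass = 0; pass < 4; pass++)
		printf("pass %d -> action %d\n", pass,
		       prepare_for_idle(true, true, jiffies - 1, jiffies, &drain));
	/* ...and once only non-urgent callbacks remain, the CPU may sleep. */
	printf("final  -> action %d\n",
	       prepare_for_idle(true, false, jiffies - 1, jiffies, &drain));
	return 0;
}
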
2025 2198
2026#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2199#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9feffa4c0695..654cfe67f0d1 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -67,13 +67,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
67 rdp->completed, rdp->gpnum, 67 rdp->completed, rdp->gpnum,
68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
69 rdp->qs_pending); 69 rdp->qs_pending);
70#ifdef CONFIG_NO_HZ 70 seq_printf(m, " dt=%d/%llx/%d df=%lu",
71 seq_printf(m, " dt=%d/%d/%d df=%lu",
72 atomic_read(&rdp->dynticks->dynticks), 71 atomic_read(&rdp->dynticks->dynticks),
73 rdp->dynticks->dynticks_nesting, 72 rdp->dynticks->dynticks_nesting,
74 rdp->dynticks->dynticks_nmi_nesting, 73 rdp->dynticks->dynticks_nmi_nesting,
75 rdp->dynticks_fqs); 74 rdp->dynticks_fqs);
76#endif /* #ifdef CONFIG_NO_HZ */
77 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 75 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
78 seq_printf(m, " ql=%ld qs=%c%c%c%c", 76 seq_printf(m, " ql=%ld qs=%c%c%c%c",
79 rdp->qlen, 77 rdp->qlen,
@@ -141,13 +139,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
141 rdp->completed, rdp->gpnum, 139 rdp->completed, rdp->gpnum,
142 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 140 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
143 rdp->qs_pending); 141 rdp->qs_pending);
144#ifdef CONFIG_NO_HZ 142 seq_printf(m, ",%d,%llx,%d,%lu",
145 seq_printf(m, ",%d,%d,%d,%lu",
146 atomic_read(&rdp->dynticks->dynticks), 143 atomic_read(&rdp->dynticks->dynticks),
147 rdp->dynticks->dynticks_nesting, 144 rdp->dynticks->dynticks_nesting,
148 rdp->dynticks->dynticks_nmi_nesting, 145 rdp->dynticks->dynticks_nmi_nesting,
149 rdp->dynticks_fqs); 146 rdp->dynticks_fqs);
150#endif /* #ifdef CONFIG_NO_HZ */
151 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 147 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
152 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, 148 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
153 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
@@ -171,9 +167,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
171static int show_rcudata_csv(struct seq_file *m, void *unused) 167static int show_rcudata_csv(struct seq_file *m, void *unused)
172{ 168{
173 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
174#ifdef CONFIG_NO_HZ
175 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
176#endif /* #ifdef CONFIG_NO_HZ */
177 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); 171 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
178#ifdef CONFIG_RCU_BOOST 172#ifdef CONFIG_RCU_BOOST
179 seq_puts(m, "\"kt\",\"ktl\""); 173 seq_puts(m, "\"kt\",\"ktl\"");
@@ -278,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
278 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
279 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
280 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
281 rsp->completed, gpnum, rsp->signaled, 275 rsp->completed, gpnum, rsp->fqs_state,
282 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
283 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
284 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index f9d8482dd487..a242e691c993 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -579,7 +579,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
579 struct rt_mutex_waiter *waiter) 579 struct rt_mutex_waiter *waiter)
580{ 580{
581 int ret = 0; 581 int ret = 0;
582 int was_disabled;
583 582
584 for (;;) { 583 for (;;) {
585 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
@@ -602,17 +601,10 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
602 601
603 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
604 603
605 was_disabled = irqs_disabled();
606 if (was_disabled)
607 local_irq_enable();
608
609 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
610 605
611 schedule_rt_mutex(lock); 606 schedule_rt_mutex(lock);
612 607
613 if (was_disabled)
614 local_irq_disable();
615
616 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
617 set_current_state(state); 609 set_current_state(state);
618 } 610 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2c71d91efff0..4eb3a0fa351e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -347,12 +347,12 @@ void irq_exit(void)
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 348 invoke_softirq();
349 349
350 rcu_irq_exit();
351#ifdef CONFIG_NO_HZ 350#ifdef CONFIG_NO_HZ
352 /* Make sure that timer wheel updates are propagated */ 351 /* Make sure that timer wheel updates are propagated */
353 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
354 tick_nohz_stop_sched_tick(0); 353 tick_nohz_irq_exit();
355#endif 354#endif
355 rcu_irq_exit();
356 preempt_enable_no_resched(); 356 preempt_enable_no_resched();
357} 357}
358 358
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 40420644d0ba..0ec8b832ab6b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
275} 275}
276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
277 277
278/** 278static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
279 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
280 *
281 * When the next event is more than a tick into the future, stop the idle tick
282 * Called either from the idle loop or from irq_exit() when an idle period was
283 * just interrupted by an interrupt which did not cause a reschedule.
284 */
285void tick_nohz_stop_sched_tick(int inidle)
286{ 279{
287 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 280 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
288 struct tick_sched *ts;
289 ktime_t last_update, expires, now; 281 ktime_t last_update, expires, now;
290 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 282 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
291 u64 time_delta; 283 u64 time_delta;
292 int cpu; 284 int cpu;
293 285
294 local_irq_save(flags);
295
296 cpu = smp_processor_id(); 286 cpu = smp_processor_id();
297 ts = &per_cpu(tick_cpu_sched, cpu); 287 ts = &per_cpu(tick_cpu_sched, cpu);
298 288
299 /*
300 * Call to tick_nohz_start_idle stops the last_update_time from being
301 * updated. Thus, it must not be called in the event we are called from
302 * irq_exit() with the prior state different than idle.
303 */
304 if (!inidle && !ts->inidle)
305 goto end;
306
307 /*
308 * Set ts->inidle unconditionally. Even if the system did not
309 * switch to NOHZ mode the cpu frequency governers rely on the
310 * update of the idle time accounting in tick_nohz_start_idle().
311 */
312 ts->inidle = 1;
313
314 now = tick_nohz_start_idle(cpu, ts); 289 now = tick_nohz_start_idle(cpu, ts);
315 290
316 /* 291 /*
@@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle)
326 } 301 }
327 302
328 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 303 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
329 goto end; 304 return;
330 305
331 if (need_resched()) 306 if (need_resched())
332 goto end; 307 return;
333 308
334 if (unlikely(local_softirq_pending() && cpu_online(cpu))) { 309 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
335 static int ratelimit; 310 static int ratelimit;
@@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle)
339 (unsigned int) local_softirq_pending()); 314 (unsigned int) local_softirq_pending());
340 ratelimit++; 315 ratelimit++;
341 } 316 }
342 goto end; 317 return;
343 } 318 }
344 319
345 ts->idle_calls++; 320 ts->idle_calls++;
@@ -434,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle)
434 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 409 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
435 ts->tick_stopped = 1; 410 ts->tick_stopped = 1;
436 ts->idle_jiffies = last_jiffies; 411 ts->idle_jiffies = last_jiffies;
437 rcu_enter_nohz();
438 } 412 }
439 413
440 ts->idle_sleeps++; 414 ts->idle_sleeps++;
@@ -472,8 +446,56 @@ out:
472 ts->next_jiffies = next_jiffies; 446 ts->next_jiffies = next_jiffies;
473 ts->last_jiffies = last_jiffies; 447 ts->last_jiffies = last_jiffies;
474 ts->sleep_length = ktime_sub(dev->next_event, now); 448 ts->sleep_length = ktime_sub(dev->next_event, now);
475end: 449}
476 local_irq_restore(flags); 450
451/**
452 * tick_nohz_idle_enter - stop the idle tick from the idle task
453 *
454 * When the next event is more than a tick into the future, stop the idle tick
455 * Called when we start the idle loop.
456 *
457 * The arch is responsible for calling:
458 *
459 * - rcu_idle_enter() after its last use of RCU before the CPU is put
460 * to sleep.
461 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
462 */
463void tick_nohz_idle_enter(void)
464{
465 struct tick_sched *ts;
466
467 WARN_ON_ONCE(irqs_disabled());
468
469 local_irq_disable();
470
471 ts = &__get_cpu_var(tick_cpu_sched);
472 /*
473 * set ts->inidle unconditionally. even if the system did not
474 * switch to nohz mode the cpu frequency governors rely on the
475 * update of the idle time accounting in tick_nohz_start_idle().
476 */
477 ts->inidle = 1;
478 tick_nohz_stop_sched_tick(ts);
479
480 local_irq_enable();
481}
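
With RCU split out of the tick path, an architecture's idle loop is now expected to bracket its low-power wait with both pairs of calls, roughly as sketched below. This is a hedged composite of the arch/*/process.c and idle.c hunks listed in the diffstat; arch_cpu_sleep() is a hypothetical placeholder for the architecture's wait-for-interrupt, and details such as preemption handling are omitted.

/* Sketch only -- not any particular architecture's cpu_idle(). */
static void cpu_idle_sketch(void)
{
	for (;;) {
		tick_nohz_idle_enter();		/* may stop the tick            */
		rcu_idle_enter();		/* last use of RCU before sleep */
		while (!need_resched())
			arch_cpu_sleep();	/* hypothetical low-power wait  */
		rcu_idle_exit();		/* RCU is watching again        */
		tick_nohz_idle_exit();		/* restart the tick             */
		schedule();
	}
}
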
482
483/**
484 * tick_nohz_irq_exit - update next tick event from interrupt exit
485 *
486 * When an interrupt fires while we are idle and it doesn't cause
487 * a reschedule, it may still add, modify or delete a timer, enqueue
488 * an RCU callback, etc...
489 * So we need to re-calculate and reprogram the next tick event.
490 */
491void tick_nohz_irq_exit(void)
492{
493 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
494
495 if (!ts->inidle)
496 return;
497
498 tick_nohz_stop_sched_tick(ts);
477} 499}
478 500
479/** 501/**
@@ -515,11 +537,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
515} 537}
516 538
517/** 539/**
518 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task 540 * tick_nohz_idle_exit - restart the idle tick from the idle task
519 * 541 *
520 * Restart the idle tick when the CPU is woken up from idle 542 * Restart the idle tick when the CPU is woken up from idle
543 * This also exits the RCU extended quiescent state. The CPU
544 * can use RCU again after this function is called.
521 */ 545 */
522void tick_nohz_restart_sched_tick(void) 546void tick_nohz_idle_exit(void)
523{ 547{
524 int cpu = smp_processor_id(); 548 int cpu = smp_processor_id();
525 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 549 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -529,6 +553,7 @@ void tick_nohz_restart_sched_tick(void)
529 ktime_t now; 553 ktime_t now;
530 554
531 local_irq_disable(); 555 local_irq_disable();
556
532 if (ts->idle_active || (ts->inidle && ts->tick_stopped)) 557 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
533 now = ktime_get(); 558 now = ktime_get();
534 559
@@ -543,8 +568,6 @@ void tick_nohz_restart_sched_tick(void)
543 568
544 ts->inidle = 0; 569 ts->inidle = 0;
545 570
546 rcu_exit_nohz();
547
548 /* Update jiffies first */ 571 /* Update jiffies first */
549 select_nohz_load_balancer(0); 572 select_nohz_load_balancer(0);
550 tick_do_update_jiffies64(now); 573 tick_do_update_jiffies64(now);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f2bd275bb60f..a043d224adf6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4775,6 +4775,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4775{ 4775{
4776 __ftrace_dump(true, oops_dump_mode); 4776 __ftrace_dump(true, oops_dump_mode);
4777} 4777}
4778EXPORT_SYMBOL_GPL(ftrace_dump);
4778 4779
4779__init static int tracer_alloc_buffers(void) 4780__init static int tracer_alloc_buffers(void)
4780{ 4781{