Diffstat:
 -rw-r--r--  Documentation/atomic_ops.txt                              |  45
 -rw-r--r--  Documentation/kernel-parameters.txt                       |  20
 -rw-r--r--  Documentation/kernel-per-CPU-kthreads.txt                 |  34
 -rw-r--r--  Documentation/memory-barriers.txt                         |  42
 -rw-r--r--  Documentation/timers/NO_HZ.txt                            |  10
 -rw-r--r--  arch/blackfin/mach-common/smp.c                           |   6
 -rw-r--r--  arch/metag/kernel/smp.c                                   |   5
 -rw-r--r--  arch/x86/include/asm/cpu.h                                |   2
 -rw-r--r--  arch/x86/include/asm/smp.h                                |   2
 -rw-r--r--  arch/x86/kernel/smpboot.c                                 |  39
 -rw-r--r--  arch/x86/xen/smp.c                                        |  46
 -rw-r--r--  include/linux/cpu.h                                       |  20
 -rw-r--r--  include/linux/lockdep.h                                   |   7
 -rw-r--r--  include/linux/rcupdate.h                                  |  40
 -rw-r--r--  include/linux/srcu.h                                      |   2
 -rw-r--r--  init/Kconfig                                              |  13
 -rw-r--r--  init/main.c                                               |   1
 -rw-r--r--  kernel/cpu.c                                              |  38
 -rw-r--r--  kernel/rcu/rcutorture.c                                   |  27
 -rw-r--r--  kernel/rcu/srcu.c                                         |  19
 -rw-r--r--  kernel/rcu/tiny.c                                         |  14
 -rw-r--r--  kernel/rcu/tree.c                                         | 437
 -rw-r--r--  kernel/rcu/tree.h                                         |  11
 -rw-r--r--  kernel/rcu/tree_plugin.h                                  | 267
 -rw-r--r--  kernel/rcu/tree_trace.c                                   |   4
 -rw-r--r--  kernel/rcu/update.c                                       |  72
 -rw-r--r--  kernel/sched/idle.c                                       |   9
 -rw-r--r--  kernel/smpboot.c                                          | 156
 -rw-r--r--  lib/Kconfig.debug                                         |  35
 -rwxr-xr-x  tools/testing/selftests/rcutorture/bin/kvm.sh             |   2
 -rw-r--r--  tools/testing/selftests/rcutorture/configs/rcu/CFcommon   |   1

 31 files changed, 986 insertions(+), 440 deletions(-)
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index 183e41bdcb69..dab6da3382d9 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -201,11 +201,11 @@ These routines add 1 and subtract 1, respectively, from the given
201atomic_t and return the new counter value after the operation is 201atomic_t and return the new counter value after the operation is
202performed. 202performed.
203 203
204Unlike the above routines, it is required that explicit memory 204Unlike the above routines, it is required that these primitives
205barriers are performed before and after the operation. It must be 205include explicit memory barriers that are performed before and after
206done such that all memory operations before and after the atomic 206the operation. It must be done such that all memory operations before
207operation calls are strongly ordered with respect to the atomic 207and after the atomic operation calls are strongly ordered with respect
208operation itself. 208to the atomic operation itself.
209 209
210For example, it should behave as if a smp_mb() call existed both 210For example, it should behave as if a smp_mb() call existed both
211before and after the atomic operation. 211before and after the atomic operation.
@@ -233,21 +233,21 @@ These two routines increment and decrement by 1, respectively, the
233given atomic counter. They return a boolean indicating whether the 233given atomic counter. They return a boolean indicating whether the
234resulting counter value was zero or not. 234resulting counter value was zero or not.
235 235
236It requires explicit memory barrier semantics around the operation as 236Again, these primitives provide explicit memory barrier semantics around
237above. 237the atomic operation.
238 238
239 int atomic_sub_and_test(int i, atomic_t *v); 239 int atomic_sub_and_test(int i, atomic_t *v);
240 240
241This is identical to atomic_dec_and_test() except that an explicit 241This is identical to atomic_dec_and_test() except that an explicit
242decrement is given instead of the implicit "1". It requires explicit 242decrement is given instead of the implicit "1". This primitive must
243memory barrier semantics around the operation. 243provide explicit memory barrier semantics around the operation.
244 244
245 int atomic_add_negative(int i, atomic_t *v); 245 int atomic_add_negative(int i, atomic_t *v);
246 246
247The given increment is added to the given atomic counter value. A 247The given increment is added to the given atomic counter value. A boolean
248boolean is return which indicates whether the resulting counter value 248is return which indicates whether the resulting counter value is negative.
249is negative. It requires explicit memory barrier semantics around the 249This primitive must provide explicit memory barrier semantics around
250operation. 250the operation.
251 251
252Then: 252Then:
253 253
@@ -257,7 +257,7 @@ This performs an atomic exchange operation on the atomic variable v, setting
257the given new value. It returns the old value that the atomic variable v had 257the given new value. It returns the old value that the atomic variable v had
258just before the operation. 258just before the operation.
259 259
260atomic_xchg requires explicit memory barriers around the operation. 260atomic_xchg must provide explicit memory barriers around the operation.
261 261
262 int atomic_cmpxchg(atomic_t *v, int old, int new); 262 int atomic_cmpxchg(atomic_t *v, int old, int new);
263 263
@@ -266,7 +266,7 @@ with the given old and new values. Like all atomic_xxx operations,
266atomic_cmpxchg will only satisfy its atomicity semantics as long as all 266atomic_cmpxchg will only satisfy its atomicity semantics as long as all
267other accesses of *v are performed through atomic_xxx operations. 267other accesses of *v are performed through atomic_xxx operations.
268 268
269atomic_cmpxchg requires explicit memory barriers around the operation. 269atomic_cmpxchg must provide explicit memory barriers around the operation.
270 270
271The semantics for atomic_cmpxchg are the same as those defined for 'cas' 271The semantics for atomic_cmpxchg are the same as those defined for 'cas'
272below. 272below.
@@ -279,8 +279,8 @@ If the atomic value v is not equal to u, this function adds a to v, and
279returns non zero. If v is equal to u then it returns zero. This is done as 279returns non zero. If v is equal to u then it returns zero. This is done as
280an atomic operation. 280an atomic operation.
281 281
282atomic_add_unless requires explicit memory barriers around the operation 282atomic_add_unless must provide explicit memory barriers around the
283unless it fails (returns 0). 283operation unless it fails (returns 0).
284 284
285atomic_inc_not_zero, equivalent to atomic_add_unless(v, 1, 0) 285atomic_inc_not_zero, equivalent to atomic_add_unless(v, 1, 0)
286 286
@@ -460,9 +460,9 @@ the return value into an int. There are other places where things
460like this occur as well. 460like this occur as well.
461 461
462These routines, like the atomic_t counter operations returning values, 462These routines, like the atomic_t counter operations returning values,
463require explicit memory barrier semantics around their execution. All 463must provide explicit memory barrier semantics around their execution.
464memory operations before the atomic bit operation call must be made 464All memory operations before the atomic bit operation call must be
465visible globally before the atomic bit operation is made visible. 465made visible globally before the atomic bit operation is made visible.
466Likewise, the atomic bit operation must be visible globally before any 466Likewise, the atomic bit operation must be visible globally before any
467subsequent memory operation is made visible. For example: 467subsequent memory operation is made visible. For example:
468 468
@@ -536,8 +536,9 @@ except that two underscores are prefixed to the interface name.
536These non-atomic variants also do not require any special memory 536These non-atomic variants also do not require any special memory
537barrier semantics. 537barrier semantics.
538 538
539The routines xchg() and cmpxchg() need the same exact memory barriers 539The routines xchg() and cmpxchg() must provide the same exact
540as the atomic and bit operations returning values. 540memory-barrier semantics as the atomic and bit operations returning
541values.
541 542
542Spinlocks and rwlocks have memory barrier expectations as well. 543Spinlocks and rwlocks have memory barrier expectations as well.
543The rule to follow is simple: 544The rule to follow is simple:
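The full-barrier requirement documented above for value-returning atomics is what makes the
classic reference-count teardown idiom safe without any explicit smp_mb().  A minimal,
hypothetical sketch (the structure and function are invented for illustration, not taken
from this patch):

	#include <linux/atomic.h>
	#include <linux/slab.h>

	struct obj {
		int payload;
		atomic_t refcnt;
	};

	static void obj_put(struct obj *p)
	{
		p->payload = 0;		/* ordered before the decrement by the implied barrier */
		if (atomic_dec_and_test(&p->refcnt)) {
			/*
			 * The smp_mb() implied on each CPU's decrement ensures that
			 * all of that CPU's prior accesses to *p complete before its
			 * decrement, so the final decrementer may free the object.
			 */
			kfree(p);
		}
	}
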
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 01aa47d3b6ab..05c36118f8d7 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2969,6 +2969,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2969 Set maximum number of finished RCU callbacks to 2969 Set maximum number of finished RCU callbacks to
2970 process in one batch. 2970 process in one batch.
2971 2971
2972 rcutree.gp_init_delay= [KNL]
2973 Set the number of jiffies to delay each step of
2974 RCU grace-period initialization. This only has
2975 effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT is
2976 set.
2977
2972 rcutree.rcu_fanout_leaf= [KNL] 2978 rcutree.rcu_fanout_leaf= [KNL]
2973 Increase the number of CPUs assigned to each 2979 Increase the number of CPUs assigned to each
2974 leaf rcu_node structure. Useful for very large 2980 leaf rcu_node structure. Useful for very large
@@ -2992,11 +2998,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2992 value is one, and maximum value is HZ. 2998 value is one, and maximum value is HZ.
2993 2999
2994 rcutree.kthread_prio= [KNL,BOOT] 3000 rcutree.kthread_prio= [KNL,BOOT]
2995 Set the SCHED_FIFO priority of the RCU 3001 Set the SCHED_FIFO priority of the RCU per-CPU
2996 per-CPU kthreads (rcuc/N). This value is also 3002 kthreads (rcuc/N). This value is also used for
2997 used for the priority of the RCU boost threads 3003 the priority of the RCU boost threads (rcub/N)
2998 (rcub/N). Valid values are 1-99 and the default 3004 and for the RCU grace-period kthreads (rcu_bh,
2999 is 1 (the least-favored priority). 3005 rcu_preempt, and rcu_sched). If RCU_BOOST is
3006 set, valid values are 1-99 and the default is 1
3007 (the least-favored priority). Otherwise, when
3008 RCU_BOOST is not set, valid values are 0-99 and
3009 the default is zero (non-realtime operation).
3000 3010
3001 rcutree.rcu_nocb_leader_stride= [KNL] 3011 rcutree.rcu_nocb_leader_stride= [KNL]
3002 Set the number of NOCB kthread groups, which 3012 Set the number of NOCB kthread groups, which
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index f3cd299fcc41..f4cbfe0ba108 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -190,20 +190,24 @@ To reduce its OS jitter, do any of the following:
190 on each CPU, including cs_dbs_timer() and od_dbs_timer(). 190 on each CPU, including cs_dbs_timer() and od_dbs_timer().
191 WARNING: Please check your CPU specifications to 191 WARNING: Please check your CPU specifications to
192 make sure that this is safe on your particular system. 192 make sure that this is safe on your particular system.
193 d. It is not possible to entirely get rid of OS jitter 193 d. As of v3.18, Christoph Lameter's on-demand vmstat workers
194 from vmstat_update() on CONFIG_SMP=y systems, but you 194 commit prevents OS jitter due to vmstat_update() on
195 can decrease its frequency by writing a large value 195 CONFIG_SMP=y systems. Before v3.18, is not possible
196 to /proc/sys/vm/stat_interval. The default value is 196 to entirely get rid of the OS jitter, but you can
197 HZ, for an interval of one second. Of course, larger 197 decrease its frequency by writing a large value to
198 values will make your virtual-memory statistics update 198 /proc/sys/vm/stat_interval. The default value is HZ,
199 more slowly. Of course, you can also run your workload 199 for an interval of one second. Of course, larger values
200 at a real-time priority, thus preempting vmstat_update(), 200 will make your virtual-memory statistics update more
201 slowly. Of course, you can also run your workload at
202 a real-time priority, thus preempting vmstat_update(),
201 but if your workload is CPU-bound, this is a bad idea. 203 but if your workload is CPU-bound, this is a bad idea.
202 However, there is an RFC patch from Christoph Lameter 204 However, there is an RFC patch from Christoph Lameter
203 (based on an earlier one from Gilad Ben-Yossef) that 205 (based on an earlier one from Gilad Ben-Yossef) that
204 reduces or even eliminates vmstat overhead for some 206 reduces or even eliminates vmstat overhead for some
205 workloads at https://lkml.org/lkml/2013/9/4/379. 207 workloads at https://lkml.org/lkml/2013/9/4/379.
206 e. If running on high-end powerpc servers, build with 208 e. Boot with "elevator=noop" to avoid workqueue use by
209 the block layer.
210 f. If running on high-end powerpc servers, build with
207 CONFIG_PPC_RTAS_DAEMON=n. This prevents the RTAS 211 CONFIG_PPC_RTAS_DAEMON=n. This prevents the RTAS
208 daemon from running on each CPU every second or so. 212 daemon from running on each CPU every second or so.
209 (This will require editing Kconfig files and will defeat 213 (This will require editing Kconfig files and will defeat
@@ -211,12 +215,12 @@ To reduce its OS jitter, do any of the following:
211 due to the rtas_event_scan() function. 215 due to the rtas_event_scan() function.
212 WARNING: Please check your CPU specifications to 216 WARNING: Please check your CPU specifications to
213 make sure that this is safe on your particular system. 217 make sure that this is safe on your particular system.
214 f. If running on Cell Processor, build your kernel with 218 g. If running on Cell Processor, build your kernel with
215 CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from 219 CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from
216 spu_gov_work(). 220 spu_gov_work().
217 WARNING: Please check your CPU specifications to 221 WARNING: Please check your CPU specifications to
218 make sure that this is safe on your particular system. 222 make sure that this is safe on your particular system.
219 g. If running on PowerMAC, build your kernel with 223 h. If running on PowerMAC, build your kernel with
220 CONFIG_PMAC_RACKMETER=n to disable the CPU-meter, 224 CONFIG_PMAC_RACKMETER=n to disable the CPU-meter,
221 avoiding OS jitter from rackmeter_do_timer(). 225 avoiding OS jitter from rackmeter_do_timer().
222 226
@@ -258,8 +262,12 @@ Purpose: Detect software lockups on each CPU.
258To reduce its OS jitter, do at least one of the following: 262To reduce its OS jitter, do at least one of the following:
2591. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these 2631. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
260 kthreads from being created in the first place. 264 kthreads from being created in the first place.
2612. Echo a zero to /proc/sys/kernel/watchdog to disable the 2652. Boot with "nosoftlockup=0", which will also prevent these kthreads
266 from being created. Other related watchdog and softlockup boot
267 parameters may be found in Documentation/kernel-parameters.txt
268 and Documentation/watchdog/watchdog-parameters.txt.
2693. Echo a zero to /proc/sys/kernel/watchdog to disable the
262 watchdog timer. 270 watchdog timer.
2633. Echo a large number of /proc/sys/kernel/watchdog_thresh in 2714. Echo a large number of /proc/sys/kernel/watchdog_thresh in
264 order to reduce the frequency of OS jitter due to the watchdog 272 order to reduce the frequency of OS jitter due to the watchdog
265 timer down to a level that is acceptable for your workload. 273 timer down to a level that is acceptable for your workload.
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index ca2387ef27ab..6974f1c2b4e1 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -592,9 +592,9 @@ See also the subsection on "Cache Coherency" for a more thorough example.
592CONTROL DEPENDENCIES 592CONTROL DEPENDENCIES
593-------------------- 593--------------------
594 594
595A control dependency requires a full read memory barrier, not simply a data 595A load-load control dependency requires a full read memory barrier, not
596dependency barrier to make it work correctly. Consider the following bit of 596simply a data dependency barrier to make it work correctly. Consider the
597code: 597following bit of code:
598 598
599 q = ACCESS_ONCE(a); 599 q = ACCESS_ONCE(a);
600 if (q) { 600 if (q) {
@@ -615,14 +615,15 @@ case what's actually required is:
615 } 615 }
616 616
617However, stores are not speculated. This means that ordering -is- provided 617However, stores are not speculated. This means that ordering -is- provided
618in the following example: 618for load-store control dependencies, as in the following example:
619 619
620 q = ACCESS_ONCE(a); 620 q = ACCESS_ONCE(a);
621 if (q) { 621 if (q) {
622 ACCESS_ONCE(b) = p; 622 ACCESS_ONCE(b) = p;
623 } 623 }
624 624
625Please note that ACCESS_ONCE() is not optional! Without the 625Control dependencies pair normally with other types of barriers.
626That said, please note that ACCESS_ONCE() is not optional! Without the
626ACCESS_ONCE(), might combine the load from 'a' with other loads from 627ACCESS_ONCE(), might combine the load from 'a' with other loads from
627'a', and the store to 'b' with other stores to 'b', with possible highly 628'a', and the store to 'b' with other stores to 'b', with possible highly
628counterintuitive effects on ordering. 629counterintuitive effects on ordering.
@@ -813,6 +814,8 @@ In summary:
813 barrier() can help to preserve your control dependency. Please 814 barrier() can help to preserve your control dependency. Please
814 see the Compiler Barrier section for more information. 815 see the Compiler Barrier section for more information.
815 816
817 (*) Control dependencies pair normally with other types of barriers.
818
816 (*) Control dependencies do -not- provide transitivity. If you 819 (*) Control dependencies do -not- provide transitivity. If you
817 need transitivity, use smp_mb(). 820 need transitivity, use smp_mb().
818 821
@@ -823,14 +826,14 @@ SMP BARRIER PAIRING
823When dealing with CPU-CPU interactions, certain types of memory barrier should 826When dealing with CPU-CPU interactions, certain types of memory barrier should
824always be paired. A lack of appropriate pairing is almost certainly an error. 827always be paired. A lack of appropriate pairing is almost certainly an error.
825 828
826General barriers pair with each other, though they also pair with 829General barriers pair with each other, though they also pair with most
827most other types of barriers, albeit without transitivity. An acquire 830other types of barriers, albeit without transitivity. An acquire barrier
828barrier pairs with a release barrier, but both may also pair with other 831pairs with a release barrier, but both may also pair with other barriers,
829barriers, including of course general barriers. A write barrier pairs 832including of course general barriers. A write barrier pairs with a data
830with a data dependency barrier, an acquire barrier, a release barrier, 833dependency barrier, a control dependency, an acquire barrier, a release
831a read barrier, or a general barrier. Similarly a read barrier or a 834barrier, a read barrier, or a general barrier. Similarly a read barrier,
832data dependency barrier pairs with a write barrier, an acquire barrier, 835control dependency, or a data dependency barrier pairs with a write
833a release barrier, or a general barrier: 836barrier, an acquire barrier, a release barrier, or a general barrier:
834 837
835 CPU 1 CPU 2 838 CPU 1 CPU 2
836 =============== =============== 839 =============== ===============
@@ -850,6 +853,19 @@ Or:
850 <data dependency barrier> 853 <data dependency barrier>
851 y = *x; 854 y = *x;
852 855
856Or even:
857
858 CPU 1 CPU 2
859 =============== ===============================
860 r1 = ACCESS_ONCE(y);
861 <general barrier>
862 ACCESS_ONCE(y) = 1; if (r2 = ACCESS_ONCE(x)) {
863 <implicit control dependency>
864 ACCESS_ONCE(y) = 1;
865 }
866
867 assert(r1 == 0 || r2 == 0);
868
853Basically, the read barrier always has to be there, even though it can be of 869Basically, the read barrier always has to be there, even though it can be of
854the "weaker" type. 870the "weaker" type.
855 871
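To make the load-store control dependency discussed above concrete, here is a small
hypothetical helper (names invented for this sketch); no explicit barrier is needed because
the store is conditioned on the loaded value and stores are not speculated:

	#include <linux/compiler.h>

	void publish_if_ready(int *a, int *b, int p)
	{
		int q;

		q = ACCESS_ONCE(*a);
		if (q) {
			/*
			 * The CPU cannot make this store visible before the load
			 * from *a completes; ACCESS_ONCE() keeps the compiler from
			 * breaking the dependency by merging or hoisting accesses.
			 */
			ACCESS_ONCE(*b) = p;
		}
	}
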
diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt
index cca122f25120..6eaf576294f3 100644
--- a/Documentation/timers/NO_HZ.txt
+++ b/Documentation/timers/NO_HZ.txt
@@ -158,13 +158,9 @@ not come for free:
158 to the need to inform kernel subsystems (such as RCU) about 158 to the need to inform kernel subsystems (such as RCU) about
159 the change in mode. 159 the change in mode.
160 160
1613. POSIX CPU timers on adaptive-tick CPUs may miss their deadlines 1613. POSIX CPU timers prevent CPUs from entering adaptive-tick mode.
162 (perhaps indefinitely) because they currently rely on 162 Real-time applications needing to take actions based on CPU time
163 scheduling-tick interrupts. This will likely be fixed in 163 consumption need to use other means of doing so.
164 one of two ways: (1) Prevent CPUs with POSIX CPU timers from
165 entering adaptive-tick mode, or (2) Use hrtimers or other
166 adaptive-ticks-immune mechanism to cause the POSIX CPU timer to
167 fire properly.
168 164
1694. If there are more perf events pending than the hardware can 1654. If there are more perf events pending than the hardware can
170 accommodate, they are normally round-robined so as to collect 166 accommodate, they are normally round-robined so as to collect
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index 8ad3e90cc8fc..1c7259597395 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -413,16 +413,14 @@ int __cpu_disable(void)
413 return 0; 413 return 0;
414} 414}
415 415
416static DECLARE_COMPLETION(cpu_killed);
417
418int __cpu_die(unsigned int cpu) 416int __cpu_die(unsigned int cpu)
419{ 417{
420 return wait_for_completion_timeout(&cpu_killed, 5000); 418 return cpu_wait_death(cpu, 5);
421} 419}
422 420
423void cpu_die(void) 421void cpu_die(void)
424{ 422{
425 complete(&cpu_killed); 423 (void)cpu_report_death();
426 424
427 atomic_dec(&init_mm.mm_users); 425 atomic_dec(&init_mm.mm_users);
428 atomic_dec(&init_mm.mm_count); 426 atomic_dec(&init_mm.mm_count);
diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index f006d2276f40..ac3a199e33e7 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -261,7 +261,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
261} 261}
262 262
263#ifdef CONFIG_HOTPLUG_CPU 263#ifdef CONFIG_HOTPLUG_CPU
264static DECLARE_COMPLETION(cpu_killed);
265 264
266/* 265/*
267 * __cpu_disable runs on the processor to be shutdown. 266 * __cpu_disable runs on the processor to be shutdown.
@@ -299,7 +298,7 @@ int __cpu_disable(void)
299 */ 298 */
300void __cpu_die(unsigned int cpu) 299void __cpu_die(unsigned int cpu)
301{ 300{
302 if (!wait_for_completion_timeout(&cpu_killed, msecs_to_jiffies(1))) 301 if (!cpu_wait_death(cpu, 1))
303 pr_err("CPU%u: unable to kill\n", cpu); 302 pr_err("CPU%u: unable to kill\n", cpu);
304} 303}
305 304
@@ -314,7 +313,7 @@ void cpu_die(void)
314 local_irq_disable(); 313 local_irq_disable();
315 idle_task_exit(); 314 idle_task_exit();
316 315
317 complete(&cpu_killed); 316 (void)cpu_report_death();
318 317
319 asm ("XOR TXENABLE, D0Re0,D0Re0\n"); 318 asm ("XOR TXENABLE, D0Re0,D0Re0\n");
320} 319}
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index d2b12988d2ed..bf2caa1dedc5 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action);
34#endif 34#endif
35#endif 35#endif
36 36
37DECLARE_PER_CPU(int, cpu_state);
38
39int mwait_usable(const struct cpuinfo_x86 *); 37int mwait_usable(const struct cpuinfo_x86 *);
40 38
41#endif /* _ASM_X86_CPU_H */ 39#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 81d02fc7dafa..17a8dced12da 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,13 +150,13 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
150} 150}
151 151
152void cpu_disable_common(void); 152void cpu_disable_common(void);
153void cpu_die_common(unsigned int cpu);
154void native_smp_prepare_boot_cpu(void); 153void native_smp_prepare_boot_cpu(void);
155void native_smp_prepare_cpus(unsigned int max_cpus); 154void native_smp_prepare_cpus(unsigned int max_cpus);
156void native_smp_cpus_done(unsigned int max_cpus); 155void native_smp_cpus_done(unsigned int max_cpus);
157void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); 156void common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
158int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); 157int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
159int native_cpu_disable(void); 158int native_cpu_disable(void);
159int common_cpu_die(unsigned int cpu);
160void native_cpu_die(unsigned int cpu); 160void native_cpu_die(unsigned int cpu);
161void native_play_dead(void); 161void native_play_dead(void);
162void play_dead_common(void); 162void play_dead_common(void);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7035f6b21c3f..50e547eac8cd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,9 +77,6 @@
77#include <asm/realmode.h> 77#include <asm/realmode.h>
78#include <asm/misc.h> 78#include <asm/misc.h>
79 79
80/* State of each CPU */
81DEFINE_PER_CPU(int, cpu_state) = { 0 };
82
83/* Number of siblings per CPU package */ 80/* Number of siblings per CPU package */
84int smp_num_siblings = 1; 81int smp_num_siblings = 1;
85EXPORT_SYMBOL(smp_num_siblings); 82EXPORT_SYMBOL(smp_num_siblings);
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused)
257 lock_vector_lock(); 254 lock_vector_lock();
258 set_cpu_online(smp_processor_id(), true); 255 set_cpu_online(smp_processor_id(), true);
259 unlock_vector_lock(); 256 unlock_vector_lock();
260 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 257 cpu_set_state_online(smp_processor_id());
261 x86_platform.nmi_init(); 258 x86_platform.nmi_init();
262 259
263 /* enable local interrupts */ 260 /* enable local interrupts */
@@ -954,7 +951,10 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
954 */ 951 */
955 mtrr_save_state(); 952 mtrr_save_state();
956 953
957 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 954 /* x86 CPUs take themselves offline, so delayed offline is OK. */
955 err = cpu_check_up_prepare(cpu);
956 if (err && err != -EBUSY)
957 return err;
958 958
959 /* the FPU context is blank, nobody can own it */ 959 /* the FPU context is blank, nobody can own it */
960 __cpu_disable_lazy_restore(cpu); 960 __cpu_disable_lazy_restore(cpu);
@@ -1197,7 +1197,7 @@ void __init native_smp_prepare_boot_cpu(void)
1197 switch_to_new_gdt(me); 1197 switch_to_new_gdt(me);
1198 /* already set me in cpu_online_mask in boot_cpu_init() */ 1198 /* already set me in cpu_online_mask in boot_cpu_init() */
1199 cpumask_set_cpu(me, cpu_callout_mask); 1199 cpumask_set_cpu(me, cpu_callout_mask);
1200 per_cpu(cpu_state, me) = CPU_ONLINE; 1200 cpu_set_state_online(me);
1201} 1201}
1202 1202
1203void __init native_smp_cpus_done(unsigned int max_cpus) 1203void __init native_smp_cpus_done(unsigned int max_cpus)
@@ -1324,14 +1324,10 @@ static void __ref remove_cpu_from_maps(int cpu)
1324 numa_remove_cpu(cpu); 1324 numa_remove_cpu(cpu);
1325} 1325}
1326 1326
1327static DEFINE_PER_CPU(struct completion, die_complete);
1328
1329void cpu_disable_common(void) 1327void cpu_disable_common(void)
1330{ 1328{
1331 int cpu = smp_processor_id(); 1329 int cpu = smp_processor_id();
1332 1330
1333 init_completion(&per_cpu(die_complete, smp_processor_id()));
1334
1335 remove_siblinginfo(cpu); 1331 remove_siblinginfo(cpu);
1336 1332
1337 /* It's now safe to remove this processor from the online map */ 1333 /* It's now safe to remove this processor from the online map */
@@ -1355,24 +1351,27 @@ int native_cpu_disable(void)
1355 return 0; 1351 return 0;
1356} 1352}
1357 1353
1358void cpu_die_common(unsigned int cpu) 1354int common_cpu_die(unsigned int cpu)
1359{ 1355{
1360 wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); 1356 int ret = 0;
1361}
1362 1357
1363void native_cpu_die(unsigned int cpu)
1364{
1365 /* We don't do anything here: idle task is faking death itself. */ 1358 /* We don't do anything here: idle task is faking death itself. */
1366 1359
1367 cpu_die_common(cpu);
1368
1369 /* They ack this in play_dead() by setting CPU_DEAD */ 1360 /* They ack this in play_dead() by setting CPU_DEAD */
1370 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1361 if (cpu_wait_death(cpu, 5)) {
1371 if (system_state == SYSTEM_RUNNING) 1362 if (system_state == SYSTEM_RUNNING)
1372 pr_info("CPU %u is now offline\n", cpu); 1363 pr_info("CPU %u is now offline\n", cpu);
1373 } else { 1364 } else {
1374 pr_err("CPU %u didn't die...\n", cpu); 1365 pr_err("CPU %u didn't die...\n", cpu);
1366 ret = -1;
1375 } 1367 }
1368
1369 return ret;
1370}
1371
1372void native_cpu_die(unsigned int cpu)
1373{
1374 common_cpu_die(cpu);
1376} 1375}
1377 1376
1378void play_dead_common(void) 1377void play_dead_common(void)
@@ -1381,10 +1380,8 @@ void play_dead_common(void)
1381 reset_lazy_tlbstate(); 1380 reset_lazy_tlbstate();
1382 amd_e400_remove_cpu(raw_smp_processor_id()); 1381 amd_e400_remove_cpu(raw_smp_processor_id());
1383 1382
1384 mb();
1385 /* Ack it */ 1383 /* Ack it */
1386 __this_cpu_write(cpu_state, CPU_DEAD); 1384 (void)cpu_report_death();
1387 complete(&per_cpu(die_complete, smp_processor_id()));
1388 1385
1389 /* 1386 /*
1390 * With physical CPU hotplug, we should halt the cpu 1387 * With physical CPU hotplug, we should halt the cpu
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 7413ee3706d0..86484384492e 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -90,14 +90,10 @@ static void cpu_bringup(void)
90 90
91 set_cpu_online(cpu, true); 91 set_cpu_online(cpu, true);
92 92
93 this_cpu_write(cpu_state, CPU_ONLINE); 93 cpu_set_state_online(cpu); /* Implies full memory barrier. */
94
95 wmb();
96 94
97 /* We can take interrupts now: we're officially "up". */ 95 /* We can take interrupts now: we're officially "up". */
98 local_irq_enable(); 96 local_irq_enable();
99
100 wmb(); /* make sure everything is out */
101} 97}
102 98
103/* 99/*
@@ -451,7 +447,13 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
451 xen_setup_timer(cpu); 447 xen_setup_timer(cpu);
452 xen_init_lock_cpu(cpu); 448 xen_init_lock_cpu(cpu);
453 449
454 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 450 /*
451 * PV VCPUs are always successfully taken down (see 'while' loop
452 * in xen_cpu_die()), so -EBUSY is an error.
453 */
454 rc = cpu_check_up_prepare(cpu);
455 if (rc)
456 return rc;
455 457
456 /* make sure interrupts start blocked */ 458 /* make sure interrupts start blocked */
457 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 459 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -467,10 +469,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
467 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 469 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
468 BUG_ON(rc); 470 BUG_ON(rc);
469 471
470 while(per_cpu(cpu_state, cpu) != CPU_ONLINE) { 472 while (cpu_report_state(cpu) != CPU_ONLINE)
471 HYPERVISOR_sched_op(SCHEDOP_yield, NULL); 473 HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
472 barrier();
473 }
474 474
475 return 0; 475 return 0;
476} 476}
@@ -499,11 +499,11 @@ static void xen_cpu_die(unsigned int cpu)
499 schedule_timeout(HZ/10); 499 schedule_timeout(HZ/10);
500 } 500 }
501 501
502 cpu_die_common(cpu); 502 if (common_cpu_die(cpu) == 0) {
503 503 xen_smp_intr_free(cpu);
504 xen_smp_intr_free(cpu); 504 xen_uninit_lock_cpu(cpu);
505 xen_uninit_lock_cpu(cpu); 505 xen_teardown_timer(cpu);
506 xen_teardown_timer(cpu); 506 }
507} 507}
508 508
509static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ 509static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
@@ -735,6 +735,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
735static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) 735static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
736{ 736{
737 int rc; 737 int rc;
738
739 /*
740 * This can happen if CPU was offlined earlier and
741 * offlining timed out in common_cpu_die().
742 */
743 if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
744 xen_smp_intr_free(cpu);
745 xen_uninit_lock_cpu(cpu);
746 }
747
738 /* 748 /*
739 * xen_smp_intr_init() needs to run before native_cpu_up() 749 * xen_smp_intr_init() needs to run before native_cpu_up()
740 * so that IPI vectors are set up on the booting CPU before 750 * so that IPI vectors are set up on the booting CPU before
@@ -756,12 +766,6 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
756 return rc; 766 return rc;
757} 767}
758 768
759static void xen_hvm_cpu_die(unsigned int cpu)
760{
761 xen_cpu_die(cpu);
762 native_cpu_die(cpu);
763}
764
765void __init xen_hvm_smp_init(void) 769void __init xen_hvm_smp_init(void)
766{ 770{
767 if (!xen_have_vector_callback) 771 if (!xen_have_vector_callback)
@@ -769,7 +773,7 @@ void __init xen_hvm_smp_init(void)
769 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; 773 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
770 smp_ops.smp_send_reschedule = xen_smp_send_reschedule; 774 smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
771 smp_ops.cpu_up = xen_hvm_cpu_up; 775 smp_ops.cpu_up = xen_hvm_cpu_up;
772 smp_ops.cpu_die = xen_hvm_cpu_die; 776 smp_ops.cpu_die = xen_cpu_die;
773 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; 777 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
774 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; 778 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
775 smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; 779 smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 4260e8594bd7..c0fb6b1b4712 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -73,6 +73,7 @@ enum {
73 /* migration should happen before other stuff but after perf */ 73 /* migration should happen before other stuff but after perf */
74 CPU_PRI_PERF = 20, 74 CPU_PRI_PERF = 20,
75 CPU_PRI_MIGRATION = 10, 75 CPU_PRI_MIGRATION = 10,
76 CPU_PRI_SMPBOOT = 9,
76 /* bring up workqueues before normal notifiers and down after */ 77 /* bring up workqueues before normal notifiers and down after */
77 CPU_PRI_WORKQUEUE_UP = 5, 78 CPU_PRI_WORKQUEUE_UP = 5,
78 CPU_PRI_WORKQUEUE_DOWN = -5, 79 CPU_PRI_WORKQUEUE_DOWN = -5,
@@ -95,6 +96,10 @@ enum {
95 * Called on the new cpu, just before 96 * Called on the new cpu, just before
96 * enabling interrupts. Must not sleep, 97 * enabling interrupts. Must not sleep,
97 * must not fail */ 98 * must not fail */
99#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached
100 * idle loop. */
101#define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly,
102 * perhaps due to preemption. */
98 103
99/* Used for CPU hotplug events occurring while tasks are frozen due to a suspend 104/* Used for CPU hotplug events occurring while tasks are frozen due to a suspend
100 * operation in progress 105 * operation in progress
@@ -161,6 +166,7 @@ static inline void __unregister_cpu_notifier(struct notifier_block *nb)
161} 166}
162#endif 167#endif
163 168
169void smpboot_thread_init(void);
164int cpu_up(unsigned int cpu); 170int cpu_up(unsigned int cpu);
165void notify_cpu_starting(unsigned int cpu); 171void notify_cpu_starting(unsigned int cpu);
166extern void cpu_maps_update_begin(void); 172extern void cpu_maps_update_begin(void);
@@ -208,6 +214,10 @@ static inline void cpu_notifier_register_done(void)
208{ 214{
209} 215}
210 216
217static inline void smpboot_thread_init(void)
218{
219}
220
211#endif /* CONFIG_SMP */ 221#endif /* CONFIG_SMP */
212extern struct bus_type cpu_subsys; 222extern struct bus_type cpu_subsys;
213 223
@@ -271,4 +281,14 @@ void arch_cpu_idle_enter(void);
271void arch_cpu_idle_exit(void); 281void arch_cpu_idle_exit(void);
272void arch_cpu_idle_dead(void); 282void arch_cpu_idle_dead(void);
273 283
284DECLARE_PER_CPU(bool, cpu_dead_idle);
285
286int cpu_report_state(int cpu);
287int cpu_check_up_prepare(int cpu);
288void cpu_set_state_online(int cpu);
289#ifdef CONFIG_HOTPLUG_CPU
290bool cpu_wait_death(unsigned int cpu, int seconds);
291bool cpu_report_death(void);
292#endif /* #ifdef CONFIG_HOTPLUG_CPU */
293
274#endif /* _LINUX_CPU_H_ */ 294#endif /* _LINUX_CPU_H_ */
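A hedged sketch of how an architecture is expected to use the CPU-state helpers declared
above, modeled on the blackfin, metag, x86, and Xen conversions elsewhere in this patch;
the arch_* function names are placeholders, and real arch code has additional steps
omitted here:

	#include <linux/cpu.h>
	#include <linux/smp.h>

	/* CPU-up path, on the requesting CPU: */
	int arch_cpu_up_prepare(unsigned int cpu)
	{
		int err;

		err = cpu_check_up_prepare(cpu);	/* -EBUSY if a prior offline timed out */
		if (err && err != -EBUSY)
			return err;
		/* ... arch-specific bringup ... */
		return 0;
	}

	/* Secondary CPU, once it is fully up: */
	void arch_secondary_start(void)
	{
		cpu_set_state_online(smp_processor_id());	/* implies a full memory barrier */
	}

	/* CPU-down path, on the surviving CPU (__cpu_die()): */
	int arch_cpu_die(unsigned int cpu)
	{
		if (!cpu_wait_death(cpu, 5))	/* wait up to 5 seconds */
			return -1;		/* outgoing CPU never reported death */
		return 0;
	}

	/* CPU-down path, on the outgoing CPU itself: */
	void arch_play_dead(void)
	{
		(void)cpu_report_death();	/* lets cpu_wait_death() proceed */
		/* ... arch-specific halt or low-power loop ... */
	}
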
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 74ab23176e9b..066ba4157541 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -531,8 +531,13 @@ do { \
531# define might_lock_read(lock) do { } while (0) 531# define might_lock_read(lock) do { } while (0)
532#endif 532#endif
533 533
534#ifdef CONFIG_PROVE_RCU 534#ifdef CONFIG_LOCKDEP
535void lockdep_rcu_suspicious(const char *file, const int line, const char *s); 535void lockdep_rcu_suspicious(const char *file, const int line, const char *s);
536#else
537static inline void
538lockdep_rcu_suspicious(const char *file, const int line, const char *s)
539{
540}
536#endif 541#endif
537 542
538#endif /* __LINUX_LOCKDEP_H */ 543#endif /* __LINUX_LOCKDEP_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 78097491cd99..573a5afd5ed8 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -48,6 +48,26 @@
48 48
49extern int rcu_expedited; /* for sysctl */ 49extern int rcu_expedited; /* for sysctl */
50 50
51#ifdef CONFIG_TINY_RCU
52/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
53static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
54{
55 return false;
56}
57
58static inline void rcu_expedite_gp(void)
59{
60}
61
62static inline void rcu_unexpedite_gp(void)
63{
64}
65#else /* #ifdef CONFIG_TINY_RCU */
66bool rcu_gp_is_expedited(void); /* Internal RCU use. */
67void rcu_expedite_gp(void);
68void rcu_unexpedite_gp(void);
69#endif /* #else #ifdef CONFIG_TINY_RCU */
70
51enum rcutorture_type { 71enum rcutorture_type {
52 RCU_FLAVOR, 72 RCU_FLAVOR,
53 RCU_BH_FLAVOR, 73 RCU_BH_FLAVOR,
@@ -195,6 +215,15 @@ void call_rcu_sched(struct rcu_head *head,
195 215
196void synchronize_sched(void); 216void synchronize_sched(void);
197 217
218/*
219 * Structure allowing asynchronous waiting on RCU.
220 */
221struct rcu_synchronize {
222 struct rcu_head head;
223 struct completion completion;
224};
225void wakeme_after_rcu(struct rcu_head *head);
226
198/** 227/**
199 * call_rcu_tasks() - Queue an RCU for invocation task-based grace period 228 * call_rcu_tasks() - Queue an RCU for invocation task-based grace period
200 * @head: structure to be used for queueing the RCU updates. 229 * @head: structure to be used for queueing the RCU updates.
@@ -258,6 +287,7 @@ static inline int rcu_preempt_depth(void)
258 287
259/* Internal to kernel */ 288/* Internal to kernel */
260void rcu_init(void); 289void rcu_init(void);
290void rcu_end_inkernel_boot(void);
261void rcu_sched_qs(void); 291void rcu_sched_qs(void);
262void rcu_bh_qs(void); 292void rcu_bh_qs(void);
263void rcu_check_callbacks(int user); 293void rcu_check_callbacks(int user);
@@ -266,6 +296,8 @@ void rcu_idle_enter(void);
266void rcu_idle_exit(void); 296void rcu_idle_exit(void);
267void rcu_irq_enter(void); 297void rcu_irq_enter(void);
268void rcu_irq_exit(void); 298void rcu_irq_exit(void);
299int rcu_cpu_notify(struct notifier_block *self,
300 unsigned long action, void *hcpu);
269 301
270#ifdef CONFIG_RCU_STALL_COMMON 302#ifdef CONFIG_RCU_STALL_COMMON
271void rcu_sysrq_start(void); 303void rcu_sysrq_start(void);
@@ -720,7 +752,7 @@ static inline void rcu_preempt_sleep_check(void)
720 * annotated as __rcu. 752 * annotated as __rcu.
721 */ 753 */
722#define rcu_dereference_check(p, c) \ 754#define rcu_dereference_check(p, c) \
723 __rcu_dereference_check((p), rcu_read_lock_held() || (c), __rcu) 755 __rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu)
724 756
725/** 757/**
726 * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking 758 * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
@@ -730,7 +762,7 @@ static inline void rcu_preempt_sleep_check(void)
730 * This is the RCU-bh counterpart to rcu_dereference_check(). 762 * This is the RCU-bh counterpart to rcu_dereference_check().
731 */ 763 */
732#define rcu_dereference_bh_check(p, c) \ 764#define rcu_dereference_bh_check(p, c) \
733 __rcu_dereference_check((p), rcu_read_lock_bh_held() || (c), __rcu) 765 __rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu)
734 766
735/** 767/**
736 * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking 768 * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
@@ -740,7 +772,7 @@ static inline void rcu_preempt_sleep_check(void)
740 * This is the RCU-sched counterpart to rcu_dereference_check(). 772 * This is the RCU-sched counterpart to rcu_dereference_check().
741 */ 773 */
742#define rcu_dereference_sched_check(p, c) \ 774#define rcu_dereference_sched_check(p, c) \
743 __rcu_dereference_check((p), rcu_read_lock_sched_held() || (c), \ 775 __rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \
744 __rcu) 776 __rcu)
745 777
746#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/ 778#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/
@@ -933,9 +965,9 @@ static inline void rcu_read_unlock(void)
933{ 965{
934 rcu_lockdep_assert(rcu_is_watching(), 966 rcu_lockdep_assert(rcu_is_watching(),
935 "rcu_read_unlock() used illegally while idle"); 967 "rcu_read_unlock() used illegally while idle");
936 rcu_lock_release(&rcu_lock_map);
937 __release(RCU); 968 __release(RCU);
938 __rcu_read_unlock(); 969 __rcu_read_unlock();
970 rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
939} 971}
940 972
941/** 973/**
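Because struct rcu_synchronize and wakeme_after_rcu() are now declared here instead of
being private to kernel/rcu/srcu.c, code outside srcu.c can reuse the wait-for-grace-period
idiom.  A hypothetical sketch of that idiom, which is essentially what the synchronize_*()
primitives do internally:

	#include <linux/rcupdate.h>
	#include <linux/completion.h>

	static void wait_for_one_rcu_grace_period(void)
	{
		struct rcu_synchronize rcu;

		init_completion(&rcu.completion);
		/* wakeme_after_rcu() completes rcu.completion after a grace period. */
		call_rcu(&rcu.head, wakeme_after_rcu);
		wait_for_completion(&rcu.completion);
	}
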
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 9cfd9623fb03..bdeb4567b71e 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -182,7 +182,7 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp)
182 * lockdep_is_held() calls. 182 * lockdep_is_held() calls.
183 */ 183 */
184#define srcu_dereference_check(p, sp, c) \ 184#define srcu_dereference_check(p, sp, c) \
185 __rcu_dereference_check((p), srcu_read_lock_held(sp) || (c), __rcu) 185 __rcu_dereference_check((p), (c) || srcu_read_lock_held(sp), __rcu)
186 186
187/** 187/**
188 * srcu_dereference - fetch SRCU-protected pointer for later dereferencing 188 * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
diff --git a/init/Kconfig b/init/Kconfig
index f5dbc6d4261b..9a0592516f48 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -791,6 +791,19 @@ config RCU_NOCB_CPU_ALL
791 791
792endchoice 792endchoice
793 793
794config RCU_EXPEDITE_BOOT
795 bool
796 default n
797 help
798 This option enables expedited grace periods at boot time,
799 as if rcu_expedite_gp() had been invoked early in boot.
800 The corresponding rcu_unexpedite_gp() is invoked from
801 rcu_end_inkernel_boot(), which is intended to be invoked
802 at the end of the kernel-only boot sequence, just before
803 init is exec'ed.
804
805 Accept the default if unsure.
806
794endmenu # "RCU Subsystem" 807endmenu # "RCU Subsystem"
795 808
796config BUILD_BIN2C 809config BUILD_BIN2C
diff --git a/init/main.c b/init/main.c
index 54565bf57beb..e82171b99874 100644
--- a/init/main.c
+++ b/init/main.c
@@ -384,6 +384,7 @@ static noinline void __init_refok rest_init(void)
384 int pid; 384 int pid;
385 385
386 rcu_scheduler_starting(); 386 rcu_scheduler_starting();
387 smpboot_thread_init();
387 /* 388 /*
388 * We need to spawn init first so that it obtains pid 1, however 389 * We need to spawn init first so that it obtains pid 1, however
389 * the init task will end up wanting to create kthreads, which, if 390 * the init task will end up wanting to create kthreads, which, if
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 82eea9c5af61..94bbe4695232 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -411,8 +411,10 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
411 * 411 *
412 * Wait for the stop thread to go away. 412 * Wait for the stop thread to go away.
413 */ 413 */
414 while (!idle_cpu(cpu)) 414 while (!per_cpu(cpu_dead_idle, cpu))
415 cpu_relax(); 415 cpu_relax();
416 smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
417 per_cpu(cpu_dead_idle, cpu) = false;
416 418
417 hotplug_cpu__broadcast_tick_pull(cpu); 419 hotplug_cpu__broadcast_tick_pull(cpu);
418 /* This actually kills the CPU. */ 420 /* This actually kills the CPU. */
@@ -451,6 +453,37 @@ out:
451EXPORT_SYMBOL(cpu_down); 453EXPORT_SYMBOL(cpu_down);
452#endif /*CONFIG_HOTPLUG_CPU*/ 454#endif /*CONFIG_HOTPLUG_CPU*/
453 455
456/*
457 * Unpark per-CPU smpboot kthreads at CPU-online time.
458 */
459static int smpboot_thread_call(struct notifier_block *nfb,
460 unsigned long action, void *hcpu)
461{
462 int cpu = (long)hcpu;
463
464 switch (action & ~CPU_TASKS_FROZEN) {
465
466 case CPU_ONLINE:
467 smpboot_unpark_threads(cpu);
468 break;
469
470 default:
471 break;
472 }
473
474 return NOTIFY_OK;
475}
476
477static struct notifier_block smpboot_thread_notifier = {
478 .notifier_call = smpboot_thread_call,
479 .priority = CPU_PRI_SMPBOOT,
480};
481
482void __cpuinit smpboot_thread_init(void)
483{
484 register_cpu_notifier(&smpboot_thread_notifier);
485}
486
454/* Requires cpu_add_remove_lock to be held */ 487/* Requires cpu_add_remove_lock to be held */
455static int _cpu_up(unsigned int cpu, int tasks_frozen) 488static int _cpu_up(unsigned int cpu, int tasks_frozen)
456{ 489{
@@ -490,9 +523,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
490 goto out_notify; 523 goto out_notify;
491 BUG_ON(!cpu_online(cpu)); 524 BUG_ON(!cpu_online(cpu));
492 525
493 /* Wake the per cpu threads */
494 smpboot_unpark_threads(cpu);
495
496 /* Now call notifier in preparation. */ 526 /* Now call notifier in preparation. */
497 cpu_notify(CPU_ONLINE | mod, hcpu); 527 cpu_notify(CPU_ONLINE | mod, hcpu);
498 528
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 30d42aa55d83..8dbe27611ec3 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -853,6 +853,8 @@ rcu_torture_fqs(void *arg)
853static int 853static int
854rcu_torture_writer(void *arg) 854rcu_torture_writer(void *arg)
855{ 855{
856 bool can_expedite = !rcu_gp_is_expedited();
857 int expediting = 0;
856 unsigned long gp_snap; 858 unsigned long gp_snap;
857 bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; 859 bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
858 bool gp_sync1 = gp_sync; 860 bool gp_sync1 = gp_sync;
@@ -865,9 +867,15 @@ rcu_torture_writer(void *arg)
865 int nsynctypes = 0; 867 int nsynctypes = 0;
866 868
867 VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); 869 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
870 pr_alert("%s" TORTURE_FLAG
871 " Grace periods expedited from boot/sysfs for %s,\n",
872 torture_type, cur_ops->name);
873 pr_alert("%s" TORTURE_FLAG
874 " Testing of dynamic grace-period expediting diabled.\n",
875 torture_type);
868 876
869 /* Initialize synctype[] array. If none set, take default. */ 877 /* Initialize synctype[] array. If none set, take default. */
870 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) 878 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
871 gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; 879 gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
872 if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) 880 if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
873 synctype[nsynctypes++] = RTWS_COND_GET; 881 synctype[nsynctypes++] = RTWS_COND_GET;
@@ -949,9 +957,26 @@ rcu_torture_writer(void *arg)
949 } 957 }
950 } 958 }
951 rcutorture_record_progress(++rcu_torture_current_version); 959 rcutorture_record_progress(++rcu_torture_current_version);
960 /* Cycle through nesting levels of rcu_expedite_gp() calls. */
961 if (can_expedite &&
962 !(torture_random(&rand) & 0xff & (!!expediting - 1))) {
963 WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited());
964 if (expediting >= 0)
965 rcu_expedite_gp();
966 else
967 rcu_unexpedite_gp();
968 if (++expediting > 3)
969 expediting = -expediting;
970 }
952 rcu_torture_writer_state = RTWS_STUTTER; 971 rcu_torture_writer_state = RTWS_STUTTER;
953 stutter_wait("rcu_torture_writer"); 972 stutter_wait("rcu_torture_writer");
954 } while (!torture_must_stop()); 973 } while (!torture_must_stop());
974 /* Reset expediting back to unexpedited. */
975 if (expediting > 0)
976 expediting = -expediting;
977 while (can_expedite && expediting++ < 0)
978 rcu_unexpedite_gp();
979 WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited());
955 rcu_torture_writer_state = RTWS_STOPPING; 980 rcu_torture_writer_state = RTWS_STOPPING;
956 torture_kthread_stopping("rcu_torture_writer"); 981 torture_kthread_stopping("rcu_torture_writer");
957 return 0; 982 return 0;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 445bf8ffe3fb..cad76e76b4e7 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -402,23 +402,6 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
402} 402}
403EXPORT_SYMBOL_GPL(call_srcu); 403EXPORT_SYMBOL_GPL(call_srcu);
404 404
405struct rcu_synchronize {
406 struct rcu_head head;
407 struct completion completion;
408};
409
410/*
411 * Awaken the corresponding synchronize_srcu() instance now that a
412 * grace period has elapsed.
413 */
414static void wakeme_after_rcu(struct rcu_head *head)
415{
416 struct rcu_synchronize *rcu;
417
418 rcu = container_of(head, struct rcu_synchronize, head);
419 complete(&rcu->completion);
420}
421
422static void srcu_advance_batches(struct srcu_struct *sp, int trycount); 405static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
423static void srcu_reschedule(struct srcu_struct *sp); 406static void srcu_reschedule(struct srcu_struct *sp);
424 407
@@ -507,7 +490,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
507 */ 490 */
508void synchronize_srcu(struct srcu_struct *sp) 491void synchronize_srcu(struct srcu_struct *sp)
509{ 492{
510 __synchronize_srcu(sp, rcu_expedited 493 __synchronize_srcu(sp, rcu_gp_is_expedited()
511 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT 494 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
512 : SYNCHRONIZE_SRCU_TRYCOUNT); 495 : SYNCHRONIZE_SRCU_TRYCOUNT);
513} 496}
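The change above from the boot-time rcu_expedited flag to rcu_gp_is_expedited() means that
expediting can now be raised and lowered at run time through the nesting rcu_expedite_gp()
and rcu_unexpedite_gp() interface added to include/linux/rcupdate.h.  A hedged usage sketch
(the SRCU domain and function are hypothetical):

	#include <linux/rcupdate.h>
	#include <linux/srcu.h>

	DEFINE_STATIC_SRCU(example_srcu);	/* hypothetical SRCU domain */

	void do_urgent_srcu_update(void)
	{
		rcu_expedite_gp();			/* nests; must be balanced below */
		synchronize_srcu(&example_srcu);	/* now uses the expedited trycount */
		rcu_unexpedite_gp();
	}
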
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index cc9ceca7bde1..069742d61c68 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -103,8 +103,7 @@ EXPORT_SYMBOL(__rcu_is_watching);
103static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 103static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
104{ 104{
105 RCU_TRACE(reset_cpu_stall_ticks(rcp)); 105 RCU_TRACE(reset_cpu_stall_ticks(rcp));
106 if (rcp->rcucblist != NULL && 106 if (rcp->donetail != rcp->curtail) {
107 rcp->donetail != rcp->curtail) {
108 rcp->donetail = rcp->curtail; 107 rcp->donetail = rcp->curtail;
109 return 1; 108 return 1;
110 } 109 }
@@ -169,17 +168,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
169 unsigned long flags; 168 unsigned long flags;
170 RCU_TRACE(int cb_count = 0); 169 RCU_TRACE(int cb_count = 0);
171 170
172 /* If no RCU callbacks ready to invoke, just return. */
173 if (&rcp->rcucblist == rcp->donetail) {
174 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
175 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
176 !!ACCESS_ONCE(rcp->rcucblist),
177 need_resched(),
178 is_idle_task(current),
179 false));
180 return;
181 }
182
183 /* Move the ready-to-invoke callbacks to a local list. */ 171 /* Move the ready-to-invoke callbacks to a local list. */
184 local_irq_save(flags); 172 local_irq_save(flags);
185 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); 173 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 48d640ca1a05..233165da782f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -91,8 +91,10 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var
91 91
92#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ 92#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
93DEFINE_RCU_TPS(sname) \ 93DEFINE_RCU_TPS(sname) \
94DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
94struct rcu_state sname##_state = { \ 95struct rcu_state sname##_state = { \
95 .level = { &sname##_state.node[0] }, \ 96 .level = { &sname##_state.node[0] }, \
97 .rda = &sname##_data, \
96 .call = cr, \ 98 .call = cr, \
97 .fqs_state = RCU_GP_IDLE, \ 99 .fqs_state = RCU_GP_IDLE, \
98 .gpnum = 0UL - 300UL, \ 100 .gpnum = 0UL - 300UL, \
@@ -101,11 +103,9 @@ struct rcu_state sname##_state = { \
101 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 103 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
102 .orphan_donetail = &sname##_state.orphan_donelist, \ 104 .orphan_donetail = &sname##_state.orphan_donelist, \
103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 105 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
104 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
105 .name = RCU_STATE_NAME(sname), \ 106 .name = RCU_STATE_NAME(sname), \
106 .abbr = sabbr, \ 107 .abbr = sabbr, \
107}; \ 108}
108DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data)
109 109
110RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); 110RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
111RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 111RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
@@ -152,6 +152,8 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
152 */ 152 */
153static int rcu_scheduler_fully_active __read_mostly; 153static int rcu_scheduler_fully_active __read_mostly;
154 154
155static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
156static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
155static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 157static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
156static void invoke_rcu_core(void); 158static void invoke_rcu_core(void);
157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 159static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -160,6 +162,12 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
160static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; 162static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
161module_param(kthread_prio, int, 0644); 163module_param(kthread_prio, int, 0644);
162 164
165/* Delay in jiffies for grace-period initialization delays. */
166static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT)
167 ? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY
168 : 0;
169module_param(gp_init_delay, int, 0644);
170
163/* 171/*
164 * Track the rcutorture test sequence number and the update version 172 * Track the rcutorture test sequence number and the update version
165 * number within a given test. The rcutorture_testseq is incremented 173 * number within a given test. The rcutorture_testseq is incremented
@@ -173,6 +181,17 @@ unsigned long rcutorture_testseq;
173unsigned long rcutorture_vernum; 181unsigned long rcutorture_vernum;
174 182
175/* 183/*
184 * Compute the mask of online CPUs for the specified rcu_node structure.
185 * This will not be stable unless the rcu_node structure's ->lock is
186 * held, but the bit corresponding to the current CPU will be stable
187 * in most contexts.
188 */
189unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
190{
191 return ACCESS_ONCE(rnp->qsmaskinitnext);
192}
193
194/*
176 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 195 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
177 * permit this function to be invoked without holding the root rcu_node 196 * permit this function to be invoked without holding the root rcu_node
178 * structure's ->lock, but of course results can be subject to change. 197 * structure's ->lock, but of course results can be subject to change.
@@ -292,10 +311,10 @@ void rcu_note_context_switch(void)
292EXPORT_SYMBOL_GPL(rcu_note_context_switch); 311EXPORT_SYMBOL_GPL(rcu_note_context_switch);
293 312
294/* 313/*
295 * Register a quiesecent state for all RCU flavors. If there is an 314 * Register a quiescent state for all RCU flavors. If there is an
296 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight 315 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
297 * dyntick-idle quiescent state visible to other CPUs (but only for those 316 * dyntick-idle quiescent state visible to other CPUs (but only for those
298 * RCU flavors in desparate need of a quiescent state, which will normally 317 * RCU flavors in desperate need of a quiescent state, which will normally
299 * be none of them). Either way, do a lightweight quiescent state for 318 * be none of them). Either way, do a lightweight quiescent state for
300 * all RCU flavors. 319 * all RCU flavors.
301 */ 320 */
@@ -410,6 +429,15 @@ void rcu_bh_force_quiescent_state(void)
410EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 429EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
411 430
412/* 431/*
432 * Force a quiescent state for RCU-sched.
433 */
434void rcu_sched_force_quiescent_state(void)
435{
436 force_quiescent_state(&rcu_sched_state);
437}
438EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
439
440/*
413 * Show the state of the grace-period kthreads. 441 * Show the state of the grace-period kthreads.
414 */ 442 */
415void show_rcu_gp_kthreads(void) 443void show_rcu_gp_kthreads(void)
@@ -483,15 +511,6 @@ void rcutorture_record_progress(unsigned long vernum)
483EXPORT_SYMBOL_GPL(rcutorture_record_progress); 511EXPORT_SYMBOL_GPL(rcutorture_record_progress);
484 512
485/* 513/*
486 * Force a quiescent state for RCU-sched.
487 */
488void rcu_sched_force_quiescent_state(void)
489{
490 force_quiescent_state(&rcu_sched_state);
491}
492EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
493
494/*
495 * Does the CPU have callbacks ready to be invoked? 514 * Does the CPU have callbacks ready to be invoked?
496 */ 515 */
497static int 516static int
@@ -954,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void)
954 preempt_disable(); 973 preempt_disable();
955 rdp = this_cpu_ptr(&rcu_sched_data); 974 rdp = this_cpu_ptr(&rcu_sched_data);
956 rnp = rdp->mynode; 975 rnp = rdp->mynode;
957 ret = (rdp->grpmask & rnp->qsmaskinit) || 976 ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ||
958 !rcu_scheduler_fully_active; 977 !rcu_scheduler_fully_active;
959 preempt_enable(); 978 preempt_enable();
960 return ret; 979 return ret;
@@ -1196,9 +1215,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1196 } else { 1215 } else {
1197 j = jiffies; 1216 j = jiffies;
1198 gpa = ACCESS_ONCE(rsp->gp_activity); 1217 gpa = ACCESS_ONCE(rsp->gp_activity);
1199 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", 1218 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
1200 rsp->name, j - gpa, j, gpa, 1219 rsp->name, j - gpa, j, gpa,
1201 jiffies_till_next_fqs); 1220 jiffies_till_next_fqs,
1221 rcu_get_root(rsp)->qsmask);
1202 /* In this case, the current CPU might be at fault. */ 1222 /* In this case, the current CPU might be at fault. */
1203 sched_show_task(current); 1223 sched_show_task(current);
1204 } 1224 }
@@ -1328,20 +1348,30 @@ void rcu_cpu_stall_reset(void)
1328} 1348}
1329 1349
1330/* 1350/*
1331 * Initialize the specified rcu_data structure's callback list to empty. 1351 * Initialize the specified rcu_data structure's default callback list
1352 * to empty. The default callback list is the one that is not used by
1353 * no-callbacks CPUs.
1332 */ 1354 */
1333static void init_callback_list(struct rcu_data *rdp) 1355static void init_default_callback_list(struct rcu_data *rdp)
1334{ 1356{
1335 int i; 1357 int i;
1336 1358
1337 if (init_nocb_callback_list(rdp))
1338 return;
1339 rdp->nxtlist = NULL; 1359 rdp->nxtlist = NULL;
1340 for (i = 0; i < RCU_NEXT_SIZE; i++) 1360 for (i = 0; i < RCU_NEXT_SIZE; i++)
1341 rdp->nxttail[i] = &rdp->nxtlist; 1361 rdp->nxttail[i] = &rdp->nxtlist;
1342} 1362}
1343 1363
1344/* 1364/*
1365 * Initialize the specified rcu_data structure's callback list to empty.
1366 */
1367static void init_callback_list(struct rcu_data *rdp)
1368{
1369 if (init_nocb_callback_list(rdp))
1370 return;
1371 init_default_callback_list(rdp);
1372}
1373
1374/*
1345 * Determine the value that ->completed will have at the end of the 1375 * Determine the value that ->completed will have at the end of the
1346 * next subsequent grace period. This is used to tag callbacks so that 1376 * next subsequent grace period. This is used to tag callbacks so that
1347 * a CPU can invoke callbacks in a timely fashion even if that CPU has 1377 * a CPU can invoke callbacks in a timely fashion even if that CPU has
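
The split above leaves init_default_callback_list() doing nothing but resetting the segmented callback list to "empty", which for this data structure means every tail pointer aims back at the head pointer. A minimal userspace sketch of that invariant, using simplified stand-in fields rather than the real rcu_data layout, follows.

#include <stdio.h>
#include <stddef.h>

#define TOY_NEXT_SIZE 4

struct toy_cb {
        struct toy_cb *next;
};

struct toy_rdp {
        struct toy_cb *nxtlist;                 /* head of the list */
        struct toy_cb **nxttail[TOY_NEXT_SIZE]; /* per-segment tail pointers */
};

static void toy_init_callback_list(struct toy_rdp *rdp)
{
        int i;

        rdp->nxtlist = NULL;
        for (i = 0; i < TOY_NEXT_SIZE; i++)
                rdp->nxttail[i] = &rdp->nxtlist;        /* empty: all tails == &head */
}

int main(void)
{
        struct toy_rdp rdp;

        toy_init_callback_list(&rdp);
        printf("empty? %s\n",
               rdp.nxttail[TOY_NEXT_SIZE - 1] == &rdp.nxtlist ? "yes" : "no");
        return 0;
}
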
@@ -1703,11 +1733,11 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1703 */ 1733 */
1704static int rcu_gp_init(struct rcu_state *rsp) 1734static int rcu_gp_init(struct rcu_state *rsp)
1705{ 1735{
1736 unsigned long oldmask;
1706 struct rcu_data *rdp; 1737 struct rcu_data *rdp;
1707 struct rcu_node *rnp = rcu_get_root(rsp); 1738 struct rcu_node *rnp = rcu_get_root(rsp);
1708 1739
1709 ACCESS_ONCE(rsp->gp_activity) = jiffies; 1740 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1710 rcu_bind_gp_kthread();
1711 raw_spin_lock_irq(&rnp->lock); 1741 raw_spin_lock_irq(&rnp->lock);
1712 smp_mb__after_unlock_lock(); 1742 smp_mb__after_unlock_lock();
1713 if (!ACCESS_ONCE(rsp->gp_flags)) { 1743 if (!ACCESS_ONCE(rsp->gp_flags)) {
@@ -1733,9 +1763,54 @@ static int rcu_gp_init(struct rcu_state *rsp)
1733 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1763 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1734 raw_spin_unlock_irq(&rnp->lock); 1764 raw_spin_unlock_irq(&rnp->lock);
1735 1765
1736 /* Exclude any concurrent CPU-hotplug operations. */ 1766 /*
1737 mutex_lock(&rsp->onoff_mutex); 1767 * Apply per-leaf buffered online and offline operations to the
1738 smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ 1768 * rcu_node tree. Note that this new grace period need not wait
1769 * for subsequent online CPUs, and that quiescent-state forcing
1770 * will handle subsequent offline CPUs.
1771 */
1772 rcu_for_each_leaf_node(rsp, rnp) {
1773 raw_spin_lock_irq(&rnp->lock);
1774 smp_mb__after_unlock_lock();
1775 if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
1776 !rnp->wait_blkd_tasks) {
1777 /* Nothing to do on this leaf rcu_node structure. */
1778 raw_spin_unlock_irq(&rnp->lock);
1779 continue;
1780 }
1781
1782 /* Record old state, apply changes to ->qsmaskinit field. */
1783 oldmask = rnp->qsmaskinit;
1784 rnp->qsmaskinit = rnp->qsmaskinitnext;
1785
1786 /* If zero-ness of ->qsmaskinit changed, propagate up tree. */
1787 if (!oldmask != !rnp->qsmaskinit) {
1788 if (!oldmask) /* First online CPU for this rcu_node. */
1789 rcu_init_new_rnp(rnp);
1790 else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */
1791 rnp->wait_blkd_tasks = true;
1792 else /* Last offline CPU and can propagate. */
1793 rcu_cleanup_dead_rnp(rnp);
1794 }
1795
1796 /*
1797 * If all waited-on tasks from prior grace period are
1798 * done, and if all this rcu_node structure's CPUs are
1799 * still offline, propagate up the rcu_node tree and
1800 * clear ->wait_blkd_tasks. Otherwise, if one of this
1801 * rcu_node structure's CPUs has since come back online,
1802 * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp()
1803 * checks for this, so just call it unconditionally).
1804 */
1805 if (rnp->wait_blkd_tasks &&
1806 (!rcu_preempt_has_tasks(rnp) ||
1807 rnp->qsmaskinit)) {
1808 rnp->wait_blkd_tasks = false;
1809 rcu_cleanup_dead_rnp(rnp);
1810 }
1811
1812 raw_spin_unlock_irq(&rnp->lock);
1813 }
1739 1814
1740 /* 1815 /*
1741 * Set the quiescent-state-needed bits in all the rcu_node 1816 * Set the quiescent-state-needed bits in all the rcu_node
@@ -1757,8 +1832,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
1757 rcu_preempt_check_blocked_tasks(rnp); 1832 rcu_preempt_check_blocked_tasks(rnp);
1758 rnp->qsmask = rnp->qsmaskinit; 1833 rnp->qsmask = rnp->qsmaskinit;
1759 ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; 1834 ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
1760 WARN_ON_ONCE(rnp->completed != rsp->completed); 1835 if (WARN_ON_ONCE(rnp->completed != rsp->completed))
1761 ACCESS_ONCE(rnp->completed) = rsp->completed; 1836 ACCESS_ONCE(rnp->completed) = rsp->completed;
1762 if (rnp == rdp->mynode) 1837 if (rnp == rdp->mynode)
1763 (void)__note_gp_changes(rsp, rnp, rdp); 1838 (void)__note_gp_changes(rsp, rnp, rdp);
1764 rcu_preempt_boost_start_gp(rnp); 1839 rcu_preempt_boost_start_gp(rnp);
@@ -1768,9 +1843,12 @@ static int rcu_gp_init(struct rcu_state *rsp)
1768 raw_spin_unlock_irq(&rnp->lock); 1843 raw_spin_unlock_irq(&rnp->lock);
1769 cond_resched_rcu_qs(); 1844 cond_resched_rcu_qs();
1770 ACCESS_ONCE(rsp->gp_activity) = jiffies; 1845 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1846 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) &&
1847 gp_init_delay > 0 &&
1848 !(rsp->gpnum % (rcu_num_nodes * 10)))
1849 schedule_timeout_uninterruptible(gp_init_delay);
1771 } 1850 }
1772 1851
1773 mutex_unlock(&rsp->onoff_mutex);
1774 return 1; 1852 return 1;
1775} 1853}
1776 1854
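
The pivotal test in the new rcu_gp_init() loop is "!oldmask != !rnp->qsmaskinit": propagation up the tree is needed only when a leaf's set of online CPUs transitions between empty and non-empty. A tiny standalone sketch of that zero-ness comparison (not kernel code):

#include <stdbool.h>
#include <stdio.h>

static bool toy_zeroness_changed(unsigned long oldmask, unsigned long newmask)
{
        /* !x normalizes any nonzero mask to 0 and a zero mask to 1. */
        return !oldmask != !newmask;
}

int main(void)
{
        printf("%d\n", toy_zeroness_changed(0x0, 0x3)); /* 1: first CPU online  */
        printf("%d\n", toy_zeroness_changed(0x3, 0x1)); /* 0: still has CPUs    */
        printf("%d\n", toy_zeroness_changed(0x1, 0x0)); /* 1: last CPU offline  */
        return 0;
}
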
@@ -1798,7 +1876,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1798 fqs_state = RCU_FORCE_QS; 1876 fqs_state = RCU_FORCE_QS;
1799 } else { 1877 } else {
1800 /* Handle dyntick-idle and offline CPUs. */ 1878 /* Handle dyntick-idle and offline CPUs. */
1801 isidle = false; 1879 isidle = true;
1802 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); 1880 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
1803 } 1881 }
1804 /* Clear flag to prevent immediate re-entry. */ 1882 /* Clear flag to prevent immediate re-entry. */
@@ -1852,6 +1930,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1852 rcu_for_each_node_breadth_first(rsp, rnp) { 1930 rcu_for_each_node_breadth_first(rsp, rnp) {
1853 raw_spin_lock_irq(&rnp->lock); 1931 raw_spin_lock_irq(&rnp->lock);
1854 smp_mb__after_unlock_lock(); 1932 smp_mb__after_unlock_lock();
1933 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
1934 WARN_ON_ONCE(rnp->qsmask);
1855 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1935 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1856 rdp = this_cpu_ptr(rsp->rda); 1936 rdp = this_cpu_ptr(rsp->rda);
1857 if (rnp == rdp->mynode) 1937 if (rnp == rdp->mynode)
@@ -1895,6 +1975,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1895 struct rcu_state *rsp = arg; 1975 struct rcu_state *rsp = arg;
1896 struct rcu_node *rnp = rcu_get_root(rsp); 1976 struct rcu_node *rnp = rcu_get_root(rsp);
1897 1977
1978 rcu_bind_gp_kthread();
1898 for (;;) { 1979 for (;;) {
1899 1980
1900 /* Handle grace-period start. */ 1981 /* Handle grace-period start. */
@@ -2062,25 +2143,32 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
2062 * Similar to rcu_report_qs_rdp(), for which it is a helper function. 2143 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
2063 * Allows quiescent states for a group of CPUs to be reported at one go 2144 * Allows quiescent states for a group of CPUs to be reported at one go
2064 * to the specified rcu_node structure, though all the CPUs in the group 2145 * to the specified rcu_node structure, though all the CPUs in the group
2065 * must be represented by the same rcu_node structure (which need not be 2146 * must be represented by the same rcu_node structure (which need not be a
2066 * a leaf rcu_node structure, though it often will be). That structure's 2147 * leaf rcu_node structure, though it often will be). The gps parameter
2067 * lock must be held upon entry, and it is released before return. 2148 * is the grace-period snapshot, which means that the quiescent states
2149 * are valid only if rnp->gpnum is equal to gps. That structure's lock
2150 * must be held upon entry, and it is released before return.
2068 */ 2151 */
2069static void 2152static void
2070rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, 2153rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2071 struct rcu_node *rnp, unsigned long flags) 2154 struct rcu_node *rnp, unsigned long gps, unsigned long flags)
2072 __releases(rnp->lock) 2155 __releases(rnp->lock)
2073{ 2156{
2157 unsigned long oldmask = 0;
2074 struct rcu_node *rnp_c; 2158 struct rcu_node *rnp_c;
2075 2159
2076 /* Walk up the rcu_node hierarchy. */ 2160 /* Walk up the rcu_node hierarchy. */
2077 for (;;) { 2161 for (;;) {
2078 if (!(rnp->qsmask & mask)) { 2162 if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
2079 2163
2080 /* Our bit has already been cleared, so done. */ 2164 /*
2165 * Our bit has already been cleared, or the
2166 * relevant grace period is already over, so done.
2167 */
2081 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2168 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2082 return; 2169 return;
2083 } 2170 }
2171 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
2084 rnp->qsmask &= ~mask; 2172 rnp->qsmask &= ~mask;
2085 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, 2173 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
2086 mask, rnp->qsmask, rnp->level, 2174 mask, rnp->qsmask, rnp->level,
@@ -2104,7 +2192,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2104 rnp = rnp->parent; 2192 rnp = rnp->parent;
2105 raw_spin_lock_irqsave(&rnp->lock, flags); 2193 raw_spin_lock_irqsave(&rnp->lock, flags);
2106 smp_mb__after_unlock_lock(); 2194 smp_mb__after_unlock_lock();
2107 WARN_ON_ONCE(rnp_c->qsmask); 2195 oldmask = rnp_c->qsmask;
2108 } 2196 }
2109 2197
2110 /* 2198 /*
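
The new gps parameter turns rcu_report_qs_rnp() into a compare-and-report: a quiescent state gathered under an earlier grace period must be discarded once ->gpnum has moved on. The standalone sketch below (toy_ names, not the kernel code) captures just that check.

#include <stdbool.h>
#include <stdio.h>

struct toy_rnp {
        unsigned long gpnum;    /* current grace-period number   */
        unsigned long qsmask;   /* CPUs still needing to report  */
};

static bool toy_report_qs(struct toy_rnp *rnp, unsigned long mask,
                          unsigned long gps)
{
        if (!(rnp->qsmask & mask) || rnp->gpnum != gps)
                return false;   /* already cleared, or stale snapshot */
        rnp->qsmask &= ~mask;
        return true;
}

int main(void)
{
        struct toy_rnp rnp = { .gpnum = 42, .qsmask = 0x3 };

        printf("%d\n", toy_report_qs(&rnp, 0x1, 42));   /* 1: valid report */
        printf("%d\n", toy_report_qs(&rnp, 0x2, 41));   /* 0: stale gps    */
        return 0;
}
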
@@ -2116,6 +2204,46 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2116} 2204}
2117 2205
2118/* 2206/*
2207 * Record a quiescent state for all tasks that were previously queued
2208 * on the specified rcu_node structure and that were blocking the current
2209 * RCU grace period. The caller must hold the specified rnp->lock with
2210 * irqs disabled, and this lock is released upon return, but irqs remain
2211 * disabled.
2212 */
2213static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
2214 struct rcu_node *rnp, unsigned long flags)
2215 __releases(rnp->lock)
2216{
2217 unsigned long gps;
2218 unsigned long mask;
2219 struct rcu_node *rnp_p;
2220
2221 if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
2222 rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
2223 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2224 return; /* Still need more quiescent states! */
2225 }
2226
2227 rnp_p = rnp->parent;
2228 if (rnp_p == NULL) {
2229 /*
2230 * Only one rcu_node structure in the tree, so don't
2231 * try to report up to its nonexistent parent!
2232 */
2233 rcu_report_qs_rsp(rsp, flags);
2234 return;
2235 }
2236
2237 /* Report up the rest of the hierarchy, tracking current ->gpnum. */
2238 gps = rnp->gpnum;
2239 mask = rnp->grpmask;
2240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2241 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
2242 smp_mb__after_unlock_lock();
2243 rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
2244}
2245
2246/*
2119 * Record a quiescent state for the specified CPU to that CPU's rcu_data 2247 * Record a quiescent state for the specified CPU to that CPU's rcu_data
2120 * structure. This must be either called from the specified CPU, or 2248 * structure. This must be either called from the specified CPU, or
2121 * called when the specified CPU is known to be offline (and when it is 2249 * called when the specified CPU is known to be offline (and when it is
@@ -2163,7 +2291,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2163 */ 2291 */
2164 needwake = rcu_accelerate_cbs(rsp, rnp, rdp); 2292 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
2165 2293
2166 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 2294 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
2295 /* ^^^ Released rnp->lock */
2167 if (needwake) 2296 if (needwake)
2168 rcu_gp_kthread_wake(rsp); 2297 rcu_gp_kthread_wake(rsp);
2169 } 2298 }
@@ -2256,8 +2385,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2256 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; 2385 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
2257 } 2386 }
2258 2387
2259 /* Finally, initialize the rcu_data structure's list to empty. */ 2388 /*
2389 * Finally, initialize the rcu_data structure's list to empty and
2390 * disallow further callbacks on this CPU.
2391 */
2260 init_callback_list(rdp); 2392 init_callback_list(rdp);
2393 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2261} 2394}
2262 2395
2263/* 2396/*
@@ -2355,6 +2488,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2355 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2488 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
2356 smp_mb__after_unlock_lock(); /* GP memory ordering. */ 2489 smp_mb__after_unlock_lock(); /* GP memory ordering. */
2357 rnp->qsmaskinit &= ~mask; 2490 rnp->qsmaskinit &= ~mask;
2491 rnp->qsmask &= ~mask;
2358 if (rnp->qsmaskinit) { 2492 if (rnp->qsmaskinit) {
2359 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2493 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2360 return; 2494 return;
@@ -2364,6 +2498,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2364} 2498}
2365 2499
2366/* 2500/*
2501 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
2502 * function. We now remove it from the rcu_node tree's ->qsmaskinit
2503 * bit masks.
2504 */
2505static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
2506{
2507 unsigned long flags;
2508 unsigned long mask;
2509 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2510 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
2511
2512 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
2513 mask = rdp->grpmask;
2514 raw_spin_lock_irqsave(&rnp->lock, flags);
2515 smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
2516 rnp->qsmaskinitnext &= ~mask;
2517 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2518}
2519
2520/*
2367 * The CPU has been completely removed, and some other CPU is reporting 2521 * The CPU has been completely removed, and some other CPU is reporting
2368 * this fact from process context. Do the remainder of the cleanup, 2522 * this fact from process context. Do the remainder of the cleanup,
2369 * including orphaning the outgoing CPU's RCU callbacks, and also 2523 * including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2379,29 +2533,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2379 /* Adjust any no-longer-needed kthreads. */ 2533 /* Adjust any no-longer-needed kthreads. */
2380 rcu_boost_kthread_setaffinity(rnp, -1); 2534 rcu_boost_kthread_setaffinity(rnp, -1);
2381 2535
2382 /* Exclude any attempts to start a new grace period. */
2383 mutex_lock(&rsp->onoff_mutex);
2384 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
2385
2386 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2536 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
2537 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
2387 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2538 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
2388 rcu_adopt_orphan_cbs(rsp, flags); 2539 rcu_adopt_orphan_cbs(rsp, flags);
2389 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); 2540 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
2390 2541
2391 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
2392 raw_spin_lock_irqsave(&rnp->lock, flags);
2393 smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
2394 rnp->qsmaskinit &= ~rdp->grpmask;
2395 if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
2396 rcu_cleanup_dead_rnp(rnp);
2397 rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
2398 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2542 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
2399 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2543 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
2400 cpu, rdp->qlen, rdp->nxtlist); 2544 cpu, rdp->qlen, rdp->nxtlist);
2401 init_callback_list(rdp);
2402 /* Disallow further callbacks on this CPU. */
2403 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2404 mutex_unlock(&rsp->onoff_mutex);
2405} 2545}
2406 2546
2407#else /* #ifdef CONFIG_HOTPLUG_CPU */ 2547#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -2414,6 +2554,10 @@ static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2414{ 2554{
2415} 2555}
2416 2556
2557static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
2558{
2559}
2560
2417static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2561static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2418{ 2562{
2419} 2563}
@@ -2589,26 +2733,47 @@ static void force_qs_rnp(struct rcu_state *rsp,
2589 return; 2733 return;
2590 } 2734 }
2591 if (rnp->qsmask == 0) { 2735 if (rnp->qsmask == 0) {
2592 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 2736 if (rcu_state_p == &rcu_sched_state ||
2593 continue; 2737 rsp != rcu_state_p ||
2738 rcu_preempt_blocked_readers_cgp(rnp)) {
2739 /*
2740 * No point in scanning bits because they
2741 * are all zero. But we might need to
2742 * priority-boost blocked readers.
2743 */
2744 rcu_initiate_boost(rnp, flags);
2745 /* rcu_initiate_boost() releases rnp->lock */
2746 continue;
2747 }
2748 if (rnp->parent &&
2749 (rnp->parent->qsmask & rnp->grpmask)) {
2750 /*
2751 * Race between grace-period
2752 * initialization and task exiting RCU
2753 * read-side critical section: Report.
2754 */
2755 rcu_report_unblock_qs_rnp(rsp, rnp, flags);
2756 /* rcu_report_unblock_qs_rnp() rlses ->lock */
2757 continue;
2758 }
2594 } 2759 }
2595 cpu = rnp->grplo; 2760 cpu = rnp->grplo;
2596 bit = 1; 2761 bit = 1;
2597 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2762 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2598 if ((rnp->qsmask & bit) != 0) { 2763 if ((rnp->qsmask & bit) != 0) {
2599 if ((rnp->qsmaskinit & bit) != 0) 2764 if ((rnp->qsmaskinit & bit) == 0)
2600 *isidle = false; 2765 *isidle = false; /* Pending hotplug. */
2601 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) 2766 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
2602 mask |= bit; 2767 mask |= bit;
2603 } 2768 }
2604 } 2769 }
2605 if (mask != 0) { 2770 if (mask != 0) {
2606 2771			/* Idle/offline CPUs, report (releases rnp->lock). */
2607 /* rcu_report_qs_rnp() releases rnp->lock. */ 2772 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
2608 rcu_report_qs_rnp(mask, rsp, rnp, flags); 2773 } else {
2609 continue; 2774 /* Nothing to do here, so just drop the lock. */
2775 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2610 } 2776 }
2611 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2612 } 2777 }
2613} 2778}
2614 2779
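
Within force_qs_rnp(), each leaf is scanned CPU by CPU, with "bit" tracking the CPU's position in the leaf-relative mask. The following userspace sketch walks a leaf the same way and accumulates a reportable mask; the CPU range and mask values are invented for illustration.

#include <stdio.h>

int main(void)
{
        int grplo = 4, grphi = 7;       /* CPUs covered by this leaf        */
        unsigned long qsmask = 0x9;     /* leaf-relative bits: CPUs 4 and 7 */
        unsigned long mask = 0, bit = 1;
        int cpu;

        for (cpu = grplo; cpu <= grphi; cpu++, bit <<= 1)
                if (qsmask & bit)       /* CPU still owes a quiescent state? */
                        mask |= bit;    /* pretend the probe said "idle"     */

        printf("reportable mask: %#lx\n", mask);
        return 0;
}
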
@@ -2741,7 +2906,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2741 * If called from an extended quiescent state, invoke the RCU 2906 * If called from an extended quiescent state, invoke the RCU
2742 * core in order to force a re-evaluation of RCU's idleness. 2907 * core in order to force a re-evaluation of RCU's idleness.
2743 */ 2908 */
2744 if (!rcu_is_watching() && cpu_online(smp_processor_id())) 2909 if (!rcu_is_watching())
2745 invoke_rcu_core(); 2910 invoke_rcu_core();
2746 2911
2747 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 2912 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2827,11 +2992,22 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2827 2992
2828 if (cpu != -1) 2993 if (cpu != -1)
2829 rdp = per_cpu_ptr(rsp->rda, cpu); 2994 rdp = per_cpu_ptr(rsp->rda, cpu);
2830 offline = !__call_rcu_nocb(rdp, head, lazy, flags); 2995 if (likely(rdp->mynode)) {
2831 WARN_ON_ONCE(offline); 2996 /* Post-boot, so this should be for a no-CBs CPU. */
2832 /* _call_rcu() is illegal on offline CPU; leak the callback. */ 2997 offline = !__call_rcu_nocb(rdp, head, lazy, flags);
2833 local_irq_restore(flags); 2998 WARN_ON_ONCE(offline);
2834 return; 2999 /* Offline CPU, _call_rcu() illegal, leak callback. */
3000 local_irq_restore(flags);
3001 return;
3002 }
3003 /*
3004 * Very early boot, before rcu_init(). Initialize if needed
3005 * and then drop through to queue the callback.
3006 */
3007 BUG_ON(cpu != -1);
3008 WARN_ON_ONCE(!rcu_is_watching());
3009 if (!likely(rdp->nxtlist))
3010 init_default_callback_list(rdp);
2835 } 3011 }
2836 ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; 3012 ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
2837 if (lazy) 3013 if (lazy)
@@ -2954,7 +3130,7 @@ void synchronize_sched(void)
2954 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 3130 "Illegal synchronize_sched() in RCU-sched read-side critical section");
2955 if (rcu_blocking_is_gp()) 3131 if (rcu_blocking_is_gp())
2956 return; 3132 return;
2957 if (rcu_expedited) 3133 if (rcu_gp_is_expedited())
2958 synchronize_sched_expedited(); 3134 synchronize_sched_expedited();
2959 else 3135 else
2960 wait_rcu_gp(call_rcu_sched); 3136 wait_rcu_gp(call_rcu_sched);
@@ -2981,7 +3157,7 @@ void synchronize_rcu_bh(void)
2981 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 3157 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
2982 if (rcu_blocking_is_gp()) 3158 if (rcu_blocking_is_gp())
2983 return; 3159 return;
2984 if (rcu_expedited) 3160 if (rcu_gp_is_expedited())
2985 synchronize_rcu_bh_expedited(); 3161 synchronize_rcu_bh_expedited();
2986 else 3162 else
2987 wait_rcu_gp(call_rcu_bh); 3163 wait_rcu_gp(call_rcu_bh);
@@ -3518,6 +3694,28 @@ void rcu_barrier_sched(void)
3518EXPORT_SYMBOL_GPL(rcu_barrier_sched); 3694EXPORT_SYMBOL_GPL(rcu_barrier_sched);
3519 3695
3520/* 3696/*
3697 * Propagate ->qsmaskinit bits up the rcu_node tree to account for the
3698 * first CPU in a given leaf rcu_node structure coming online. The caller
3699 * must hold the corresponding leaf rcu_node ->lock with interrupts
3700 * disabled.
3701 */
3702static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
3703{
3704 long mask;
3705 struct rcu_node *rnp = rnp_leaf;
3706
3707 for (;;) {
3708 mask = rnp->grpmask;
3709 rnp = rnp->parent;
3710 if (rnp == NULL)
3711 return;
3712 raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
3713 rnp->qsmaskinit |= mask;
3714 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
3715 }
3716}
3717
3718/*
3521 * Do boot-time initialization of a CPU's per-CPU RCU data. 3719 * Do boot-time initialization of a CPU's per-CPU RCU data.
3522 */ 3720 */
3523static void __init 3721static void __init
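
rcu_init_new_rnp() is a plain upward walk: set this node's bit in its parent, then repeat from the parent until the root is passed. The sketch below models that walk over a made-up two-level tree; it is illustrative only, not the rcu_node implementation.

#include <stdio.h>

struct toy_rnp {
        unsigned long qsmaskinit;       /* children with online CPUs     */
        unsigned long grpmask;          /* this node's bit in its parent */
        struct toy_rnp *parent;
};

static void toy_init_new_rnp(struct toy_rnp *rnp_leaf)
{
        struct toy_rnp *rnp = rnp_leaf;
        unsigned long mask;

        for (;;) {
                mask = rnp->grpmask;
                rnp = rnp->parent;
                if (!rnp)
                        return;         /* walked past the root */
                rnp->qsmaskinit |= mask;
        }
}

int main(void)
{
        struct toy_rnp root = { .qsmaskinit = 0, .grpmask = 0, .parent = NULL };
        struct toy_rnp leaf = { .qsmaskinit = 0, .grpmask = 0x4, .parent = &root };

        toy_init_new_rnp(&leaf);
        printf("root qsmaskinit = %#lx\n", root.qsmaskinit);    /* 0x4 */
        return 0;
}
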
@@ -3553,49 +3751,37 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3553 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3751 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3554 struct rcu_node *rnp = rcu_get_root(rsp); 3752 struct rcu_node *rnp = rcu_get_root(rsp);
3555 3753
3556 /* Exclude new grace periods. */
3557 mutex_lock(&rsp->onoff_mutex);
3558
3559 /* Set up local state, ensuring consistent view of global state. */ 3754 /* Set up local state, ensuring consistent view of global state. */
3560 raw_spin_lock_irqsave(&rnp->lock, flags); 3755 raw_spin_lock_irqsave(&rnp->lock, flags);
3561 rdp->beenonline = 1; /* We have now been online. */ 3756 rdp->beenonline = 1; /* We have now been online. */
3562 rdp->qlen_last_fqs_check = 0; 3757 rdp->qlen_last_fqs_check = 0;
3563 rdp->n_force_qs_snap = rsp->n_force_qs; 3758 rdp->n_force_qs_snap = rsp->n_force_qs;
3564 rdp->blimit = blimit; 3759 rdp->blimit = blimit;
3565 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 3760 if (!rdp->nxtlist)
3761 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
3566 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3762 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
3567 rcu_sysidle_init_percpu_data(rdp->dynticks); 3763 rcu_sysidle_init_percpu_data(rdp->dynticks);
3568 atomic_set(&rdp->dynticks->dynticks, 3764 atomic_set(&rdp->dynticks->dynticks,
3569 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 3765 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
3570 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 3766 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
3571 3767
3572 /* Add CPU to rcu_node bitmasks. */ 3768 /*
3769 * Add CPU to leaf rcu_node pending-online bitmask. Any needed
3770 * propagation up the rcu_node tree will happen at the beginning
3771 * of the next grace period.
3772 */
3573 rnp = rdp->mynode; 3773 rnp = rdp->mynode;
3574 mask = rdp->grpmask; 3774 mask = rdp->grpmask;
3575 do { 3775 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
3576 /* Exclude any attempts to start a new GP on small systems. */ 3776 smp_mb__after_unlock_lock();
3577 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 3777 rnp->qsmaskinitnext |= mask;
3578 rnp->qsmaskinit |= mask; 3778 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
3579 mask = rnp->grpmask; 3779 rdp->completed = rnp->completed;
3580 if (rnp == rdp->mynode) { 3780 rdp->passed_quiesce = false;
3581 /* 3781 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
3582 * If there is a grace period in progress, we will 3782 rdp->qs_pending = false;
3583 * set up to wait for it next time we run the 3783 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3584 * RCU core code. 3784 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3585 */
3586 rdp->gpnum = rnp->completed;
3587 rdp->completed = rnp->completed;
3588 rdp->passed_quiesce = 0;
3589 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
3590 rdp->qs_pending = 0;
3591 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3592 }
3593 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
3594 rnp = rnp->parent;
3595 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
3596 local_irq_restore(flags);
3597
3598 mutex_unlock(&rsp->onoff_mutex);
3599} 3785}
3600 3786
3601static void rcu_prepare_cpu(int cpu) 3787static void rcu_prepare_cpu(int cpu)
@@ -3609,15 +3795,14 @@ static void rcu_prepare_cpu(int cpu)
3609/* 3795/*
3610 * Handle CPU online/offline notification events. 3796 * Handle CPU online/offline notification events.
3611 */ 3797 */
3612static int rcu_cpu_notify(struct notifier_block *self, 3798int rcu_cpu_notify(struct notifier_block *self,
3613 unsigned long action, void *hcpu) 3799 unsigned long action, void *hcpu)
3614{ 3800{
3615 long cpu = (long)hcpu; 3801 long cpu = (long)hcpu;
3616 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); 3802 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
3617 struct rcu_node *rnp = rdp->mynode; 3803 struct rcu_node *rnp = rdp->mynode;
3618 struct rcu_state *rsp; 3804 struct rcu_state *rsp;
3619 3805
3620 trace_rcu_utilization(TPS("Start CPU hotplug"));
3621 switch (action) { 3806 switch (action) {
3622 case CPU_UP_PREPARE: 3807 case CPU_UP_PREPARE:
3623 case CPU_UP_PREPARE_FROZEN: 3808 case CPU_UP_PREPARE_FROZEN:
@@ -3637,6 +3822,11 @@ static int rcu_cpu_notify(struct notifier_block *self,
3637 for_each_rcu_flavor(rsp) 3822 for_each_rcu_flavor(rsp)
3638 rcu_cleanup_dying_cpu(rsp); 3823 rcu_cleanup_dying_cpu(rsp);
3639 break; 3824 break;
3825 case CPU_DYING_IDLE:
3826 for_each_rcu_flavor(rsp) {
3827 rcu_cleanup_dying_idle_cpu(cpu, rsp);
3828 }
3829 break;
3640 case CPU_DEAD: 3830 case CPU_DEAD:
3641 case CPU_DEAD_FROZEN: 3831 case CPU_DEAD_FROZEN:
3642 case CPU_UP_CANCELED: 3832 case CPU_UP_CANCELED:
@@ -3649,7 +3839,6 @@ static int rcu_cpu_notify(struct notifier_block *self,
3649 default: 3839 default:
3650 break; 3840 break;
3651 } 3841 }
3652 trace_rcu_utilization(TPS("End CPU hotplug"));
3653 return NOTIFY_OK; 3842 return NOTIFY_OK;
3654} 3843}
3655 3844
@@ -3660,11 +3849,12 @@ static int rcu_pm_notify(struct notifier_block *self,
3660 case PM_HIBERNATION_PREPARE: 3849 case PM_HIBERNATION_PREPARE:
3661 case PM_SUSPEND_PREPARE: 3850 case PM_SUSPEND_PREPARE:
3662 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ 3851 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
3663 rcu_expedited = 1; 3852 rcu_expedite_gp();
3664 break; 3853 break;
3665 case PM_POST_HIBERNATION: 3854 case PM_POST_HIBERNATION:
3666 case PM_POST_SUSPEND: 3855 case PM_POST_SUSPEND:
3667 rcu_expedited = 0; 3856 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
3857 rcu_unexpedite_gp();
3668 break; 3858 break;
3669 default: 3859 default:
3670 break; 3860 break;
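
The PM notifier now brackets suspend with rcu_expedite_gp()/rcu_unexpedite_gp() rather than flipping the raw rcu_expedited flag. Those helpers live in kernel/rcu/update.c, which is not shown in this hunk; the sketch below is only a guess at the nesting-counter pattern such an interface implies, not the actual implementation.

#include <stdbool.h>
#include <stdio.h>

static int toy_expedited_nesting;

static void toy_expedite_gp(void)       { toy_expedited_nesting++; }
static void toy_unexpedite_gp(void)     { toy_expedited_nesting--; }
static bool toy_gp_is_expedited(void)   { return toy_expedited_nesting > 0; }

int main(void)
{
        toy_expedite_gp();              /* e.g. PM_SUSPEND_PREPARE */
        printf("%d\n", toy_gp_is_expedited());  /* 1 */
        toy_unexpedite_gp();            /* e.g. PM_POST_SUSPEND */
        printf("%d\n", toy_gp_is_expedited());  /* 0 */
        return 0;
}
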
@@ -3734,30 +3924,26 @@ void rcu_scheduler_starting(void)
3734 * Compute the per-level fanout, either using the exact fanout specified 3924 * Compute the per-level fanout, either using the exact fanout specified
3735 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 3925 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
3736 */ 3926 */
3737#ifdef CONFIG_RCU_FANOUT_EXACT
3738static void __init rcu_init_levelspread(struct rcu_state *rsp)
3739{
3740 int i;
3741
3742 rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
3743 for (i = rcu_num_lvls - 2; i >= 0; i--)
3744 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
3745}
3746#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
3747static void __init rcu_init_levelspread(struct rcu_state *rsp) 3927static void __init rcu_init_levelspread(struct rcu_state *rsp)
3748{ 3928{
3749 int ccur;
3750 int cprv;
3751 int i; 3929 int i;
3752 3930
3753 cprv = nr_cpu_ids; 3931 if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) {
3754 for (i = rcu_num_lvls - 1; i >= 0; i--) { 3932 rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
3755 ccur = rsp->levelcnt[i]; 3933 for (i = rcu_num_lvls - 2; i >= 0; i--)
3756 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 3934 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
3757 cprv = ccur; 3935 } else {
3936 int ccur;
3937 int cprv;
3938
3939 cprv = nr_cpu_ids;
3940 for (i = rcu_num_lvls - 1; i >= 0; i--) {
3941 ccur = rsp->levelcnt[i];
3942 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
3943 cprv = ccur;
3944 }
3758 } 3945 }
3759} 3946}
3760#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
3761 3947
3762/* 3948/*
3763 * Helper function for rcu_init() that initializes one rcu_state structure. 3949 * Helper function for rcu_init() that initializes one rcu_state structure.
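
With the #ifdef folded into IS_ENABLED(), the balanced branch of rcu_init_levelspread() is easier to read: each level's spread is the ceiling of the population below divided by the node count at that level. The standalone sketch below reproduces that arithmetic with invented level counts and CPU count.

#include <stdio.h>

#define TOY_LVLS 3

int main(void)
{
        int levelcnt[TOY_LVLS] = { 1, 4, 64 };  /* root, middle, leaves */
        int levelspread[TOY_LVLS];
        int cprv = 256;                         /* pretend nr_cpu_ids   */
        int i, ccur;

        for (i = TOY_LVLS - 1; i >= 0; i--) {
                ccur = levelcnt[i];
                levelspread[i] = (cprv + ccur - 1) / ccur;      /* ceiling division */
                cprv = ccur;
        }

        for (i = 0; i < TOY_LVLS; i++)
                printf("level %d: spread %d\n", i, levelspread[i]);
        return 0;
}
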
@@ -3833,7 +4019,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3833 } 4019 }
3834 } 4020 }
3835 4021
3836 rsp->rda = rda;
3837 init_waitqueue_head(&rsp->gp_wq); 4022 init_waitqueue_head(&rsp->gp_wq);
3838 rnp = rsp->level[rcu_num_lvls - 1]; 4023 rnp = rsp->level[rcu_num_lvls - 1];
3839 for_each_possible_cpu(i) { 4024 for_each_possible_cpu(i) {
@@ -3926,6 +4111,8 @@ void __init rcu_init(void)
3926{ 4111{
3927 int cpu; 4112 int cpu;
3928 4113
4114 rcu_early_boot_tests();
4115
3929 rcu_bootup_announce(); 4116 rcu_bootup_announce();
3930 rcu_init_geometry(); 4117 rcu_init_geometry();
3931 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 4118 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
@@ -3942,8 +4129,6 @@ void __init rcu_init(void)
3942 pm_notifier(rcu_pm_notify, 0); 4129 pm_notifier(rcu_pm_notify, 0);
3943 for_each_online_cpu(cpu) 4130 for_each_online_cpu(cpu)
3944 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 4131 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3945
3946 rcu_early_boot_tests();
3947} 4132}
3948 4133
3949#include "tree_plugin.h" 4134#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 119de399eb2f..a69d3dab2ec4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -141,12 +141,20 @@ struct rcu_node {
141 /* complete (only for PREEMPT_RCU). */ 141 /* complete (only for PREEMPT_RCU). */
142 unsigned long qsmaskinit; 142 unsigned long qsmaskinit;
143 /* Per-GP initial value for qsmask & expmask. */ 143 /* Per-GP initial value for qsmask & expmask. */
144 /* Initialized from ->qsmaskinitnext at the */
145 /* beginning of each grace period. */
146 unsigned long qsmaskinitnext;
147 /* Online CPUs for next grace period. */
144 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 148 unsigned long grpmask; /* Mask to apply to parent qsmask. */
145 /* Only one bit will be set in this mask. */ 149 /* Only one bit will be set in this mask. */
146 int grplo; /* lowest-numbered CPU or group here. */ 150 int grplo; /* lowest-numbered CPU or group here. */
147 int grphi; /* highest-numbered CPU or group here. */ 151 int grphi; /* highest-numbered CPU or group here. */
148 u8 grpnum; /* CPU/group number for next level up. */ 152 u8 grpnum; /* CPU/group number for next level up. */
149 u8 level; /* root is at level 0. */ 153 u8 level; /* root is at level 0. */
154 bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
155 /* exit RCU read-side critical sections */
156 /* before propagating offline up the */
157 /* rcu_node tree? */
150 struct rcu_node *parent; 158 struct rcu_node *parent;
151 struct list_head blkd_tasks; 159 struct list_head blkd_tasks;
152 /* Tasks blocked in RCU read-side critical */ 160 /* Tasks blocked in RCU read-side critical */
@@ -448,8 +456,6 @@ struct rcu_state {
448 long qlen; /* Total number of callbacks. */ 456 long qlen; /* Total number of callbacks. */
449 /* End of fields guarded by orphan_lock. */ 457 /* End of fields guarded by orphan_lock. */
450 458
451 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
452
453 struct mutex barrier_mutex; /* Guards barrier fields. */ 459 struct mutex barrier_mutex; /* Guards barrier fields. */
454 atomic_t barrier_cpu_count; /* # CPUs waiting on. */ 460 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
455 struct completion barrier_completion; /* Wake at barrier end. */ 461 struct completion barrier_completion; /* Wake at barrier end. */
@@ -559,6 +565,7 @@ static void rcu_prepare_kthreads(int cpu);
559static void rcu_cleanup_after_idle(void); 565static void rcu_cleanup_after_idle(void);
560static void rcu_prepare_for_idle(void); 566static void rcu_prepare_for_idle(void);
561static void rcu_idle_count_callbacks_posted(void); 567static void rcu_idle_count_callbacks_posted(void);
568static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
562static void print_cpu_stall_info_begin(void); 569static void print_cpu_stall_info_begin(void);
563static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); 570static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
564static void print_cpu_stall_info_end(void); 571static void print_cpu_stall_info_end(void);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a571e9a0f1d..8c0ec0f5a027 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -58,38 +58,33 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
58 */ 58 */
59static void __init rcu_bootup_announce_oddness(void) 59static void __init rcu_bootup_announce_oddness(void)
60{ 60{
61#ifdef CONFIG_RCU_TRACE 61 if (IS_ENABLED(CONFIG_RCU_TRACE))
62 pr_info("\tRCU debugfs-based tracing is enabled.\n"); 62 pr_info("\tRCU debugfs-based tracing is enabled.\n");
63#endif 63 if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
64#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) 64 (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
65 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", 65 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
66 CONFIG_RCU_FANOUT); 66 CONFIG_RCU_FANOUT);
67#endif 67 if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
68#ifdef CONFIG_RCU_FANOUT_EXACT 68 pr_info("\tHierarchical RCU autobalancing is disabled.\n");
69 pr_info("\tHierarchical RCU autobalancing is disabled.\n"); 69 if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
70#endif 70 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
71#ifdef CONFIG_RCU_FAST_NO_HZ 71 if (IS_ENABLED(CONFIG_PROVE_RCU))
72 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); 72 pr_info("\tRCU lockdep checking is enabled.\n");
73#endif 73 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
74#ifdef CONFIG_PROVE_RCU 74 pr_info("\tRCU torture testing starts during boot.\n");
75 pr_info("\tRCU lockdep checking is enabled.\n"); 75 if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO))
76#endif 76 pr_info("\tAdditional per-CPU info printed with stalls.\n");
77#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 77 if (NUM_RCU_LVL_4 != 0)
78 pr_info("\tRCU torture testing starts during boot.\n"); 78 pr_info("\tFour-level hierarchy is enabled.\n");
79#endif 79 if (CONFIG_RCU_FANOUT_LEAF != 16)
80#if defined(CONFIG_RCU_CPU_STALL_INFO) 80 pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
81 pr_info("\tAdditional per-CPU info printed with stalls.\n"); 81 CONFIG_RCU_FANOUT_LEAF);
82#endif
83#if NUM_RCU_LVL_4 != 0
84 pr_info("\tFour-level hierarchy is enabled.\n");
85#endif
86 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) 82 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
87 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 83 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
88 if (nr_cpu_ids != NR_CPUS) 84 if (nr_cpu_ids != NR_CPUS)
89 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 85 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
90#ifdef CONFIG_RCU_BOOST 86 if (IS_ENABLED(CONFIG_RCU_BOOST))
91 pr_info("\tRCU kthread priority: %d.\n", kthread_prio); 87 pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
92#endif
93} 88}
94 89
95#ifdef CONFIG_PREEMPT_RCU 90#ifdef CONFIG_PREEMPT_RCU
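
The rewrite above trades #ifdef blocks for IS_ENABLED() tests, so every branch is always compiled and type-checked and the disabled ones are discarded as dead code. A minimal userspace imitation of the idiom is below; the macro is a simplified stand-in for the kernel's kconfig.h version.

#include <stdio.h>

#define TOY_IS_ENABLED(opt)     (opt)   /* the kernel macro is more elaborate */

#define CONFIG_TOY_TRACE        1
#define CONFIG_TOY_BOOST        0

int main(void)
{
        if (TOY_IS_ENABLED(CONFIG_TOY_TRACE))
                printf("tracing enabled\n");
        if (TOY_IS_ENABLED(CONFIG_TOY_BOOST))
                printf("boosting enabled\n");   /* compiled, then eliminated */
        return 0;
}
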
@@ -180,7 +175,7 @@ static void rcu_preempt_note_context_switch(void)
180 * But first, note that the current CPU must still be 175 * But first, note that the current CPU must still be
181 * on line! 176 * on line!
182 */ 177 */
183 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 178 WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
184 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 179 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
185 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { 180 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
186 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); 181 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
@@ -233,43 +228,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
233} 228}
234 229
235/* 230/*
236 * Record a quiescent state for all tasks that were previously queued
237 * on the specified rcu_node structure and that were blocking the current
238 * RCU grace period. The caller must hold the specified rnp->lock with
239 * irqs disabled, and this lock is released upon return, but irqs remain
240 * disabled.
241 */
242static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
243 __releases(rnp->lock)
244{
245 unsigned long mask;
246 struct rcu_node *rnp_p;
247
248 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
249 raw_spin_unlock_irqrestore(&rnp->lock, flags);
250 return; /* Still need more quiescent states! */
251 }
252
253 rnp_p = rnp->parent;
254 if (rnp_p == NULL) {
255 /*
256 * Either there is only one rcu_node in the tree,
257 * or tasks were kicked up to root rcu_node due to
258 * CPUs going offline.
259 */
260 rcu_report_qs_rsp(&rcu_preempt_state, flags);
261 return;
262 }
263
264 /* Report up the rest of the hierarchy. */
265 mask = rnp->grpmask;
266 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
267 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
268 smp_mb__after_unlock_lock();
269 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
270}
271
272/*
273 * Advance a ->blkd_tasks-list pointer to the next entry, instead 231 * Advance a ->blkd_tasks-list pointer to the next entry, instead
274 * returning NULL if at the end of the list. 232 * returning NULL if at the end of the list.
275 */ 233 */
@@ -300,7 +258,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
300 */ 258 */
301void rcu_read_unlock_special(struct task_struct *t) 259void rcu_read_unlock_special(struct task_struct *t)
302{ 260{
303 bool empty;
304 bool empty_exp; 261 bool empty_exp;
305 bool empty_norm; 262 bool empty_norm;
306 bool empty_exp_now; 263 bool empty_exp_now;
@@ -334,7 +291,13 @@ void rcu_read_unlock_special(struct task_struct *t)
334 } 291 }
335 292
336 /* Hardware IRQ handlers cannot block, complain if they get here. */ 293 /* Hardware IRQ handlers cannot block, complain if they get here. */
337 if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { 294 if (in_irq() || in_serving_softirq()) {
295 lockdep_rcu_suspicious(__FILE__, __LINE__,
296 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
297 pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n",
298 t->rcu_read_unlock_special.s,
299 t->rcu_read_unlock_special.b.blocked,
300 t->rcu_read_unlock_special.b.need_qs);
338 local_irq_restore(flags); 301 local_irq_restore(flags);
339 return; 302 return;
340 } 303 }
@@ -356,7 +319,6 @@ void rcu_read_unlock_special(struct task_struct *t)
356 break; 319 break;
357 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 320 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
358 } 321 }
359 empty = !rcu_preempt_has_tasks(rnp);
360 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); 322 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
361 empty_exp = !rcu_preempted_readers_exp(rnp); 323 empty_exp = !rcu_preempted_readers_exp(rnp);
362 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 324 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -377,14 +339,6 @@ void rcu_read_unlock_special(struct task_struct *t)
377#endif /* #ifdef CONFIG_RCU_BOOST */ 339#endif /* #ifdef CONFIG_RCU_BOOST */
378 340
379 /* 341 /*
380 * If this was the last task on the list, go see if we
381 * need to propagate ->qsmaskinit bit clearing up the
382 * rcu_node tree.
383 */
384 if (!empty && !rcu_preempt_has_tasks(rnp))
385 rcu_cleanup_dead_rnp(rnp);
386
387 /*
388 * If this was the last task on the current list, and if 342 * If this was the last task on the current list, and if
389 * we aren't waiting on any CPUs, report the quiescent state. 343 * we aren't waiting on any CPUs, report the quiescent state.
390 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 344 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
@@ -399,7 +353,8 @@ void rcu_read_unlock_special(struct task_struct *t)
399 rnp->grplo, 353 rnp->grplo,
400 rnp->grphi, 354 rnp->grphi,
401 !!rnp->gp_tasks); 355 !!rnp->gp_tasks);
402 rcu_report_unblock_qs_rnp(rnp, flags); 356 rcu_report_unblock_qs_rnp(&rcu_preempt_state,
357 rnp, flags);
403 } else { 358 } else {
404 raw_spin_unlock_irqrestore(&rnp->lock, flags); 359 raw_spin_unlock_irqrestore(&rnp->lock, flags);
405 } 360 }
@@ -520,10 +475,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
520 WARN_ON_ONCE(rnp->qsmask); 475 WARN_ON_ONCE(rnp->qsmask);
521} 476}
522 477
523#ifdef CONFIG_HOTPLUG_CPU
524
525#endif /* #ifdef CONFIG_HOTPLUG_CPU */
526
527/* 478/*
528 * Check for a quiescent state from the current CPU. When a task blocks, 479 * Check for a quiescent state from the current CPU. When a task blocks,
529 * the task is recorded in the corresponding CPU's rcu_node structure, 480 * the task is recorded in the corresponding CPU's rcu_node structure,
@@ -585,7 +536,7 @@ void synchronize_rcu(void)
585 "Illegal synchronize_rcu() in RCU read-side critical section"); 536 "Illegal synchronize_rcu() in RCU read-side critical section");
586 if (!rcu_scheduler_active) 537 if (!rcu_scheduler_active)
587 return; 538 return;
588 if (rcu_expedited) 539 if (rcu_gp_is_expedited())
589 synchronize_rcu_expedited(); 540 synchronize_rcu_expedited();
590 else 541 else
591 wait_rcu_gp(call_rcu); 542 wait_rcu_gp(call_rcu);
@@ -630,9 +581,6 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
630 * recursively up the tree. (Calm down, calm down, we do the recursion 581 * recursively up the tree. (Calm down, calm down, we do the recursion
631 * iteratively!) 582 * iteratively!)
632 * 583 *
633 * Most callers will set the "wake" flag, but the task initiating the
634 * expedited grace period need not wake itself.
635 *
636 * Caller must hold sync_rcu_preempt_exp_mutex. 584 * Caller must hold sync_rcu_preempt_exp_mutex.
637 */ 585 */
638static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 586static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@ -667,29 +615,85 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
667 615
668/* 616/*
669 * Snapshot the tasks blocking the newly started preemptible-RCU expedited 617 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
670 * grace period for the specified rcu_node structure. If there are no such 618 * grace period for the specified rcu_node structure, phase 1. If there
671 * tasks, report it up the rcu_node hierarchy. 619 * are such tasks, set the ->expmask bits up the rcu_node tree and also
620 * set the ->expmask bits on the leaf rcu_node structures to tell phase 2
621 * that work is needed here.
672 * 622 *
673 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude 623 * Caller must hold sync_rcu_preempt_exp_mutex.
674 * CPU hotplug operations.
675 */ 624 */
676static void 625static void
677sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 626sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
678{ 627{
679 unsigned long flags; 628 unsigned long flags;
680 int must_wait = 0; 629 unsigned long mask;
630 struct rcu_node *rnp_up;
681 631
682 raw_spin_lock_irqsave(&rnp->lock, flags); 632 raw_spin_lock_irqsave(&rnp->lock, flags);
683 smp_mb__after_unlock_lock(); 633 smp_mb__after_unlock_lock();
634 WARN_ON_ONCE(rnp->expmask);
635 WARN_ON_ONCE(rnp->exp_tasks);
684 if (!rcu_preempt_has_tasks(rnp)) { 636 if (!rcu_preempt_has_tasks(rnp)) {
637 /* No blocked tasks, nothing to do. */
685 raw_spin_unlock_irqrestore(&rnp->lock, flags); 638 raw_spin_unlock_irqrestore(&rnp->lock, flags);
686 } else { 639 return;
640 }
641 /* Call for Phase 2 and propagate ->expmask bits up the tree. */
642 rnp->expmask = 1;
643 rnp_up = rnp;
644 while (rnp_up->parent) {
645 mask = rnp_up->grpmask;
646 rnp_up = rnp_up->parent;
647 if (rnp_up->expmask & mask)
648 break;
649 raw_spin_lock(&rnp_up->lock); /* irqs already off */
650 smp_mb__after_unlock_lock();
651 rnp_up->expmask |= mask;
652 raw_spin_unlock(&rnp_up->lock); /* irqs still off */
653 }
654 raw_spin_unlock_irqrestore(&rnp->lock, flags);
655}
656
657/*
658 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
659 * grace period for the specified rcu_node structure, phase 2. If the
660 * leaf rcu_node structure has its ->expmask field set, check for tasks.
661 * If there are some, clear ->expmask and set ->exp_tasks accordingly,
662 * then initiate RCU priority boosting. Otherwise, clear ->expmask and
663 * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
664 * enabling rcu_read_unlock_special() to do the bit-clearing.
665 *
666 * Caller must hold sync_rcu_preempt_exp_mutex.
667 */
668static void
669sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
670{
671 unsigned long flags;
672
673 raw_spin_lock_irqsave(&rnp->lock, flags);
674 smp_mb__after_unlock_lock();
675 if (!rnp->expmask) {
676 /* Phase 1 didn't do anything, so Phase 2 doesn't either. */
677 raw_spin_unlock_irqrestore(&rnp->lock, flags);
678 return;
679 }
680
681 /* Phase 1 is over. */
682 rnp->expmask = 0;
683
684 /*
685 * If there are still blocked tasks, set up ->exp_tasks so that
686 * rcu_read_unlock_special() will wake us and then boost them.
687 */
688 if (rcu_preempt_has_tasks(rnp)) {
687 rnp->exp_tasks = rnp->blkd_tasks.next; 689 rnp->exp_tasks = rnp->blkd_tasks.next;
688 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 690 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
689 must_wait = 1; 691 return;
690 } 692 }
691 if (!must_wait) 693
692 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ 694 /* No longer any blocked tasks, so undo bit setting. */
695 raw_spin_unlock_irqrestore(&rnp->lock, flags);
696 rcu_report_exp_rnp(rsp, rnp, false);
693} 697}
694 698
695/** 699/**
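
Splitting the expedited snapshot into init1/init2 means bits are only ever set in one pass and only ever cleared in the other. The standalone sketch below models that two-phase discipline on a single leaf; the toy_ structure is an illustration, not the rcu_node layout.

#include <stdbool.h>
#include <stdio.h>

struct toy_leaf {
        bool has_blocked_tasks; /* readers queued on this leaf     */
        bool expmask;           /* "phase 2 has work here" marker  */
        bool must_wait;         /* phase 2's final verdict         */
};

static void toy_phase1(struct toy_leaf *l)
{
        if (l->has_blocked_tasks)
                l->expmask = true;      /* set bits only */
}

static void toy_phase2(struct toy_leaf *l)
{
        if (!l->expmask)
                return;                 /* phase 1 found nothing here */
        l->expmask = false;             /* clear bits only            */
        l->must_wait = l->has_blocked_tasks;    /* may have drained since */
}

int main(void)
{
        struct toy_leaf a = { .has_blocked_tasks = true };
        struct toy_leaf b = { .has_blocked_tasks = false };

        toy_phase1(&a); toy_phase1(&b);
        toy_phase2(&a); toy_phase2(&b);
        printf("a waits: %d, b waits: %d\n", a.must_wait, b.must_wait);
        return 0;
}
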
@@ -706,7 +710,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
706 */ 710 */
707void synchronize_rcu_expedited(void) 711void synchronize_rcu_expedited(void)
708{ 712{
709 unsigned long flags;
710 struct rcu_node *rnp; 713 struct rcu_node *rnp;
711 struct rcu_state *rsp = &rcu_preempt_state; 714 struct rcu_state *rsp = &rcu_preempt_state;
712 unsigned long snap; 715 unsigned long snap;
@@ -757,19 +760,16 @@ void synchronize_rcu_expedited(void)
757 /* force all RCU readers onto ->blkd_tasks lists. */ 760 /* force all RCU readers onto ->blkd_tasks lists. */
758 synchronize_sched_expedited(); 761 synchronize_sched_expedited();
759 762
760 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 763 /*
761 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 764 * Snapshot current state of ->blkd_tasks lists into ->expmask.
762 raw_spin_lock_irqsave(&rnp->lock, flags); 765 * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special()
763 smp_mb__after_unlock_lock(); 766 * to start clearing them. Doing this in one phase leads to
764 rnp->expmask = rnp->qsmaskinit; 767 * strange races between setting and clearing bits, so just say "no"!
765 raw_spin_unlock_irqrestore(&rnp->lock, flags); 768 */
766 } 769 rcu_for_each_leaf_node(rsp, rnp)
767 770 sync_rcu_preempt_exp_init1(rsp, rnp);
768 /* Snapshot current state of ->blkd_tasks lists. */
769 rcu_for_each_leaf_node(rsp, rnp) 771 rcu_for_each_leaf_node(rsp, rnp)
770 sync_rcu_preempt_exp_init(rsp, rnp); 772 sync_rcu_preempt_exp_init2(rsp, rnp);
771 if (NUM_RCU_NODES > 1)
772 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
773 773
774 put_online_cpus(); 774 put_online_cpus();
775 775
@@ -859,8 +859,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
859 return 0; 859 return 0;
860} 860}
861 861
862#ifdef CONFIG_HOTPLUG_CPU
863
864/* 862/*
865 * Because there is no preemptible RCU, there can be no readers blocked. 863 * Because there is no preemptible RCU, there can be no readers blocked.
866 */ 864 */
@@ -869,8 +867,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
869 return false; 867 return false;
870} 868}
871 869
872#endif /* #ifdef CONFIG_HOTPLUG_CPU */
873
874/* 870/*
875 * Because preemptible RCU does not exist, we never have to check for 871 * Because preemptible RCU does not exist, we never have to check for
876 * tasks blocked within RCU read-side critical sections. 872 * tasks blocked within RCU read-side critical sections.
@@ -1170,7 +1166,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1170 * Returns zero if all is well, a negated errno otherwise. 1166 * Returns zero if all is well, a negated errno otherwise.
1171 */ 1167 */
1172static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1168static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1173 struct rcu_node *rnp) 1169 struct rcu_node *rnp)
1174{ 1170{
1175 int rnp_index = rnp - &rsp->node[0]; 1171 int rnp_index = rnp - &rsp->node[0];
1176 unsigned long flags; 1172 unsigned long flags;
@@ -1180,7 +1176,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1180 if (&rcu_preempt_state != rsp) 1176 if (&rcu_preempt_state != rsp)
1181 return 0; 1177 return 0;
1182 1178
1183 if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) 1179 if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
1184 return 0; 1180 return 0;
1185 1181
1186 rsp->boost = 1; 1182 rsp->boost = 1;
@@ -1273,7 +1269,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
1273static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1269static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1274{ 1270{
1275 struct task_struct *t = rnp->boost_kthread_task; 1271 struct task_struct *t = rnp->boost_kthread_task;
1276 unsigned long mask = rnp->qsmaskinit; 1272 unsigned long mask = rcu_rnp_online_cpus(rnp);
1277 cpumask_var_t cm; 1273 cpumask_var_t cm;
1278 int cpu; 1274 int cpu;
1279 1275
@@ -1945,7 +1941,8 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
1945 rhp = ACCESS_ONCE(rdp->nocb_follower_head); 1941 rhp = ACCESS_ONCE(rdp->nocb_follower_head);
1946 1942
1947 /* Having no rcuo kthread but CBs after scheduler starts is bad! */ 1943 /* Having no rcuo kthread but CBs after scheduler starts is bad! */
1948 if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { 1944 if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
1945 rcu_scheduler_fully_active) {
1949 /* RCU callback enqueued before CPU first came online??? */ 1946 /* RCU callback enqueued before CPU first came online??? */
1950 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", 1947 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
1951 cpu, rhp->func); 1948 cpu, rhp->func);
@@ -2392,18 +2389,8 @@ void __init rcu_init_nohz(void)
2392 pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); 2389 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2393 2390
2394 for_each_rcu_flavor(rsp) { 2391 for_each_rcu_flavor(rsp) {
2395 for_each_cpu(cpu, rcu_nocb_mask) { 2392 for_each_cpu(cpu, rcu_nocb_mask)
2396 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2393 init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu));
2397
2398 /*
2399 * If there are early callbacks, they will need
2400 * to be moved to the nocb lists.
2401 */
2402 WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] !=
2403 &rdp->nxtlist &&
2404 rdp->nxttail[RCU_NEXT_TAIL] != NULL);
2405 init_nocb_callback_list(rdp);
2406 }
2407 rcu_organize_nocb_kthreads(rsp); 2394 rcu_organize_nocb_kthreads(rsp);
2408 } 2395 }
2409} 2396}
@@ -2540,6 +2527,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2540 if (!rcu_is_nocb_cpu(rdp->cpu)) 2527 if (!rcu_is_nocb_cpu(rdp->cpu))
2541 return false; 2528 return false;
2542 2529
2530 /* If there are early-boot callbacks, move them to nocb lists. */
2531 if (rdp->nxtlist) {
2532 rdp->nocb_head = rdp->nxtlist;
2533 rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL];
2534 atomic_long_set(&rdp->nocb_q_count, rdp->qlen);
2535 atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy);
2536 rdp->nxtlist = NULL;
2537 rdp->qlen = 0;
2538 rdp->qlen_lazy = 0;
2539 }
2543 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2540 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2544 return true; 2541 return true;
2545} 2542}
@@ -2763,7 +2760,8 @@ static void rcu_sysidle_exit(int irq)
2763 2760
2764/* 2761/*
2765 * Check to see if the current CPU is idle. Note that usermode execution 2762 * Check to see if the current CPU is idle. Note that usermode execution
2766 * does not count as idle. The caller must have disabled interrupts. 2763 * does not count as idle. The caller must have disabled interrupts,
2764 * and must be running on tick_do_timer_cpu.
2767 */ 2765 */
2768static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, 2766static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2769 unsigned long *maxj) 2767 unsigned long *maxj)
@@ -2784,8 +2782,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2784 if (!*isidle || rdp->rsp != rcu_state_p || 2782 if (!*isidle || rdp->rsp != rcu_state_p ||
2785 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) 2783 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2786 return; 2784 return;
2787 if (rcu_gp_in_progress(rdp->rsp)) 2785 /* Verify affinity of current kthread. */
2788 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); 2786 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
2789 2787
2790 /* Pick up current idle and NMI-nesting counter and check. */ 2788 /* Pick up current idle and NMI-nesting counter and check. */
2791 cur = atomic_read(&rdtp->dynticks_idle); 2789 cur = atomic_read(&rdtp->dynticks_idle);
@@ -3068,11 +3066,10 @@ static void rcu_bind_gp_kthread(void)
3068 return; 3066 return;
3069#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 3067#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
3070 cpu = tick_do_timer_cpu; 3068 cpu = tick_do_timer_cpu;
3071 if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) 3069 if (cpu >= 0 && cpu < nr_cpu_ids)
3072 set_cpus_allowed_ptr(current, cpumask_of(cpu)); 3070 set_cpus_allowed_ptr(current, cpumask_of(cpu));
3073#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3071#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3074 if (!is_housekeeping_cpu(raw_smp_processor_id())) 3072 housekeeping_affine(current);
3075 housekeeping_affine(current);
3076#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3073#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3077} 3074}
3078 3075
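Several hunks above replace direct reads of rnp->qsmaskinit with the new rcu_rnp_online_cpus() accessor, and the tree_trace.c hunk below starts printing the ->qsmaskinitnext field that backs it. The accessor's definition is not part of this excerpt; a minimal sketch, assuming it simply samples that new bitmask:

/*
 * Hedged sketch of the accessor used above; the real definition lives
 * elsewhere in tree_plugin.h and may differ in detail.
 */
static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
{
	return ACCESS_ONCE(rnp->qsmaskinitnext);  /* CPUs currently online under this rcu_node. */
}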
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index fbb6240509ea..f92361efd0f5 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -283,8 +283,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
283 seq_puts(m, "\n"); 283 seq_puts(m, "\n");
284 level = rnp->level; 284 level = rnp->level;
285 } 285 }
286 seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", 286 seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
287 rnp->qsmask, rnp->qsmaskinit, 287 rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
288 ".G"[rnp->gp_tasks != NULL], 288 ".G"[rnp->gp_tasks != NULL],
289 ".E"[rnp->exp_tasks != NULL], 289 ".E"[rnp->exp_tasks != NULL],
290 ".T"[!list_empty(&rnp->blkd_tasks)], 290 ".T"[!list_empty(&rnp->blkd_tasks)],
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index e0d31a345ee6..1f133350da01 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,6 +62,63 @@ MODULE_ALIAS("rcupdate");
62 62
63module_param(rcu_expedited, int, 0); 63module_param(rcu_expedited, int, 0);
64 64
65#ifndef CONFIG_TINY_RCU
66
67static atomic_t rcu_expedited_nesting =
68 ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
69
70/*
71 * Should normal grace-period primitives be expedited? Intended for
72 * use within RCU. Note that this function takes the rcu_expedited
73 * sysfs/boot variable into account as well as the rcu_expedite_gp()
74 * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
75 * returns false is a -really- bad idea.
76 */
77bool rcu_gp_is_expedited(void)
78{
79 return rcu_expedited || atomic_read(&rcu_expedited_nesting);
80}
81EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
82
83/**
84 * rcu_expedite_gp - Expedite future RCU grace periods
85 *
86 * After a call to this function, future calls to synchronize_rcu() and
 87 * friends act as if the corresponding synchronize_rcu_expedited() function
88 * had instead been called.
89 */
90void rcu_expedite_gp(void)
91{
92 atomic_inc(&rcu_expedited_nesting);
93}
94EXPORT_SYMBOL_GPL(rcu_expedite_gp);
95
96/**
97 * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation
98 *
99 * Undo a prior call to rcu_expedite_gp(). If all prior calls to
100 * rcu_expedite_gp() are undone by subsequent calls to rcu_unexpedite_gp(),
101 * and if the rcu_expedited sysfs/boot parameter is not set, then all
102 * subsequent calls to synchronize_rcu() and friends will return to
103 * their normal non-expedited behavior.
104 */
105void rcu_unexpedite_gp(void)
106{
107 atomic_dec(&rcu_expedited_nesting);
108}
109EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
110
111#endif /* #ifndef CONFIG_TINY_RCU */
112
113/*
114 * Inform RCU of the end of the in-kernel boot sequence.
115 */
116void rcu_end_inkernel_boot(void)
117{
118 if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
119 rcu_unexpedite_gp();
120}
121
65#ifdef CONFIG_PREEMPT_RCU 122#ifdef CONFIG_PREEMPT_RCU
66 123
67/* 124/*
@@ -199,16 +256,13 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
199 256
200#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 257#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
201 258
202struct rcu_synchronize { 259/**
203 struct rcu_head head; 260 * wakeme_after_rcu() - Callback function to awaken a task after grace period
204 struct completion completion; 261 * @head: Pointer to rcu_head member within rcu_synchronize structure
205}; 262 *
206 263 * Awaken the corresponding task now that a grace period has elapsed.
207/*
208 * Awaken the corresponding synchronize_rcu() instance now that a
209 * grace period has elapsed.
210 */ 264 */
211static void wakeme_after_rcu(struct rcu_head *head) 265void wakeme_after_rcu(struct rcu_head *head)
212{ 266{
213 struct rcu_synchronize *rcu; 267 struct rcu_synchronize *rcu;
214 268
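The new rcu_expedite_gp()/rcu_unexpedite_gp() pair is reference-counted, so every expedite request must eventually be dropped. A minimal usage sketch; the example_* caller is hypothetical, shown only to illustrate the pairing:

/* Hypothetical caller; only the rcu_* functions come from this patch. */
static void example_latency_sensitive_section(void)
{
	rcu_expedite_gp();	/* All normal grace periods now run expedited. */
	synchronize_rcu();	/* Behaves like synchronize_rcu_expedited(). */
	rcu_unexpedite_gp();	/* Drop this caller's expedite request. */
}

Because the nesting count is an atomic_t, concurrent users compose correctly, and rcu_gp_is_expedited() reports true while either the counter or the rcu_expedited boot/sysfs flag is nonzero.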
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 4d207d2abcbd..deef1caa94c6 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -209,6 +209,8 @@ use_default:
209 goto exit_idle; 209 goto exit_idle;
210} 210}
211 211
212DEFINE_PER_CPU(bool, cpu_dead_idle);
213
212/* 214/*
213 * Generic idle loop implementation 215 * Generic idle loop implementation
214 * 216 *
@@ -233,8 +235,13 @@ static void cpu_idle_loop(void)
233 check_pgt_cache(); 235 check_pgt_cache();
234 rmb(); 236 rmb();
235 237
236 if (cpu_is_offline(smp_processor_id())) 238 if (cpu_is_offline(smp_processor_id())) {
239 rcu_cpu_notify(NULL, CPU_DYING_IDLE,
240 (void *)(long)smp_processor_id());
241 smp_mb(); /* all activity before dead. */
242 this_cpu_write(cpu_dead_idle, true);
237 arch_cpu_idle_dead(); 243 arch_cpu_idle_dead();
244 }
238 245
239 local_irq_disable(); 246 local_irq_disable();
240 arch_cpu_idle_enter(); 247 arch_cpu_idle_enter();
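The idle loop now tells RCU that the CPU is dying and then publishes cpu_dead_idle after a full barrier, so a surviving CPU can tell when the outgoing CPU has finished all scheduler and RCU activity. A hedged sketch of the kind of wait loop a surviving CPU might run; the example_* helper is a placeholder, not part of this patch:

/* Hypothetical waiter on a surviving CPU; the real one lives in the hotplug path. */
static void example_wait_for_dead_idle(int cpu)
{
	while (!per_cpu(cpu_dead_idle, cpu))
		cpu_relax();			/* Spin until the dying CPU sets the flag. */
	smp_mb();				/* Pair with the dying CPU's smp_mb(). */
	per_cpu(cpu_dead_idle, cpu) = false;	/* Re-arm for the next offline of this CPU. */
}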
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 40190f28db35..c697f73d82d6 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -4,6 +4,7 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/smp.h> 6#include <linux/smp.h>
7#include <linux/delay.h>
7#include <linux/init.h> 8#include <linux/init.h>
8#include <linux/list.h> 9#include <linux/list.h>
9#include <linux/slab.h> 10#include <linux/slab.h>
@@ -314,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
314 put_online_cpus(); 315 put_online_cpus();
315} 316}
316EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); 317EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
318
319static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
320
321/*
322 * Called to poll specified CPU's state, for example, when waiting for
323 * a CPU to come online.
324 */
325int cpu_report_state(int cpu)
326{
327 return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
328}
329
330/*
331 * If CPU has died properly, set its state to CPU_UP_PREPARE and
332 * return success. Otherwise, return -EBUSY if the CPU died after
333 * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN
334 * if cpu_wait_death() timed out and the CPU still hasn't gotten around
335 * to dying. In the latter two cases, the CPU might not be set up
336 * properly, but it is up to the arch-specific code to decide.
337 * Finally, -EIO indicates an unanticipated problem.
338 *
339 * Note that it is permissible to omit this call entirely, as is
340 * done in architectures that do no CPU-hotplug error checking.
341 */
342int cpu_check_up_prepare(int cpu)
343{
344 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
345 atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
346 return 0;
347 }
348
349 switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {
350
351 case CPU_POST_DEAD:
352
353 /* The CPU died properly, so just start it up again. */
354 atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
355 return 0;
356
357 case CPU_DEAD_FROZEN:
358
359 /*
360 * Timeout during CPU death, so let caller know.
361 * The outgoing CPU completed its processing, but only after
362 * cpu_wait_death() timed out and reported the error. The
363 * caller is free to proceed, in which case the state
364 * will be reset properly by cpu_set_state_online().
365 * Proceeding despite this -EBUSY return makes sense
366 * for systems where the outgoing CPUs take themselves
367 * offline, with no post-death manipulation required from
368 * a surviving CPU.
369 */
370 return -EBUSY;
371
372 case CPU_BROKEN:
373
374 /*
375 * The most likely reason we got here is that there was
376 * a timeout during CPU death, and the outgoing CPU never
377 * did complete its processing. This could happen on
378 * a virtualized system if the outgoing VCPU gets preempted
379 * for more than five seconds, and the user attempts to
380 * immediately online that same CPU. Trying again later
381 * might return -EBUSY above, hence -EAGAIN.
382 */
383 return -EAGAIN;
384
385 default:
386
387 /* Should not happen. Famous last words. */
388 return -EIO;
389 }
390}
391
392/*
393 * Mark the specified CPU online.
394 *
395 * Note that it is permissible to omit this call entirely, as is
396 * done in architectures that do no CPU-hotplug error checking.
397 */
398void cpu_set_state_online(int cpu)
399{
400 (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
401}
402
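Architectures that opt into this error checking would call cpu_check_up_prepare() before kicking the target CPU and cpu_set_state_online() once the incoming CPU is functional. A hedged sketch; the example_arch_* hooks are placeholders, not part of this patch:

/* Hypothetical arch bring-up path. */
static int example_arch_cpu_up(unsigned int cpu)
{
	int ret = cpu_check_up_prepare(cpu);

	/* -EBUSY means a prior offline timed out but did complete; proceeding is safe here. */
	if (ret && ret != -EBUSY)
		return ret;
	/* ... arch-specific code to start the target CPU ... */
	return 0;
}

/* Hypothetical hook run on the incoming CPU once it is up. */
static void example_arch_cpu_starting(unsigned int cpu)
{
	cpu_set_state_online(cpu);	/* Clears any lingering timeout state. */
}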
403#ifdef CONFIG_HOTPLUG_CPU
404
405/*
406 * Wait for the specified CPU to exit the idle loop and die.
407 */
408bool cpu_wait_death(unsigned int cpu, int seconds)
409{
410 int jf_left = seconds * HZ;
411 int oldstate;
412 bool ret = true;
413 int sleep_jf = 1;
414
415 might_sleep();
416
417 /* The outgoing CPU will normally get done quite quickly. */
418 if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
419 goto update_state;
420 udelay(5);
421
422 /* But if the outgoing CPU dawdles, wait increasingly long times. */
423 while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
424 schedule_timeout_uninterruptible(sleep_jf);
425 jf_left -= sleep_jf;
426 if (jf_left <= 0)
427 break;
428 sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
429 }
430update_state:
431 oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
432 if (oldstate == CPU_DEAD) {
433 /* Outgoing CPU died normally, update state. */
434 smp_mb(); /* atomic_read() before update. */
435 atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
436 } else {
437 /* Outgoing CPU still hasn't died, set state accordingly. */
438 if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
439 oldstate, CPU_BROKEN) != oldstate)
440 goto update_state;
441 ret = false;
442 }
443 return ret;
444}
445
446/*
447 * Called by the outgoing CPU to report its successful death. Return
448 * false if this report follows the surviving CPU's timing out.
449 *
450 * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
451 * timed out. This approach allows architectures to omit calls to
452 * cpu_check_up_prepare() and cpu_set_state_online() without defeating
453 * the next cpu_wait_death()'s polling loop.
454 */
455bool cpu_report_death(void)
456{
457 int oldstate;
458 int newstate;
459 int cpu = smp_processor_id();
460
461 do {
462 oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
463 if (oldstate != CPU_BROKEN)
464 newstate = CPU_DEAD;
465 else
466 newstate = CPU_DEAD_FROZEN;
467 } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
468 oldstate, newstate) != oldstate);
469 return newstate == CPU_DEAD;
470}
471
472#endif /* #ifdef CONFIG_HOTPLUG_CPU */
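Taken together, cpu_wait_death() and cpu_report_death() form a small handshake between the surviving CPU and the outgoing CPU. A hedged sketch of how an architecture might wire them up; the example_arch_* hooks are placeholders:

/* Surviving CPU: wait for the outgoing CPU to finish dying. */
static void example_arch_cpu_die(unsigned int cpu)
{
	if (!cpu_wait_death(cpu, 5))	/* Give the outgoing CPU up to five seconds. */
		pr_err("CPU %u did not report death in time\n", cpu);
	/* ... arch-specific teardown of the now-dead CPU ... */
}

/* Outgoing CPU: last words before parking itself. */
static void example_arch_play_dead(void)
{
	(void)cpu_report_death();	/* False here means the waiter already timed out. */
	/* ... halt, hlt loop, hypervisor call, or similar ... */
}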
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 36b6fa88ce5b..93967e634a1e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1193,16 +1193,7 @@ config DEBUG_CREDENTIALS
1193menu "RCU Debugging" 1193menu "RCU Debugging"
1194 1194
1195config PROVE_RCU 1195config PROVE_RCU
1196 bool "RCU debugging: prove RCU correctness" 1196 def_bool PROVE_LOCKING
1197 depends on PROVE_LOCKING
1198 default n
1199 help
1200 This feature enables lockdep extensions that check for correct
1201 use of RCU APIs. This is currently under development. Say Y
1202 if you want to debug RCU usage or help work on the PROVE_RCU
1203 feature.
1204
1205 Say N if you are unsure.
1206 1197
1207config PROVE_RCU_REPEATEDLY 1198config PROVE_RCU_REPEATEDLY
1208 bool "RCU debugging: don't disable PROVE_RCU on first splat" 1199 bool "RCU debugging: don't disable PROVE_RCU on first splat"
@@ -1270,6 +1261,30 @@ config RCU_TORTURE_TEST_RUNNABLE
1270 Say N here if you want the RCU torture tests to start only 1261 Say N here if you want the RCU torture tests to start only
1271 after being manually enabled via /proc. 1262 after being manually enabled via /proc.
1272 1263
1264config RCU_TORTURE_TEST_SLOW_INIT
1265 bool "Slow down RCU grace-period initialization to expose races"
1266 depends on RCU_TORTURE_TEST
1267 help
1268 This option makes grace-period initialization block for a
1269 few jiffies between initializing each pair of consecutive
1270 rcu_node structures. This helps to expose races involving
1271 grace-period initialization; in other words, it makes your
1272 kernel less stable. It can also greatly increase grace-period
1273 latency, especially on systems with large numbers of CPUs.
1274 This is useful when torture-testing RCU, but in almost no
1275 other circumstance.
1276
1277 Say Y here if you want your system to crash and hang more often.
1278 Say N if you want a sane system.
1279
1280config RCU_TORTURE_TEST_SLOW_INIT_DELAY
1281 int "How much to slow down RCU grace-period initialization"
1282 range 0 5
1283 default 3
1284 help
1285 This option specifies the number of jiffies to wait between
1286 each rcu_node structure initialization.
1287
1273config RCU_CPU_STALL_TIMEOUT 1288config RCU_CPU_STALL_TIMEOUT
1274 int "RCU CPU stall timeout in seconds" 1289 int "RCU CPU stall timeout in seconds"
1275 depends on RCU_STALL_COMMON 1290 depends on RCU_STALL_COMMON
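The new slow-init Kconfig options are consumed by the grace-period initialization loop in tree.c, which is not part of this excerpt. A hedged sketch of the sort of hook they enable; the example_* helper and its placement are assumptions:

/* Hypothetical pause inserted between rcu_node initializations in rcu_gp_init(). */
static void example_gp_init_pause(void)
{
#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
	if (CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY > 0)
		schedule_timeout_uninterruptible(
				CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY);
#endif
}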
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 368d64ac779e..dd2812ceb0ba 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -310,7 +310,7 @@ function dump(first, pastlast)
310 cfr[jn] = cf[j] "." cfrep[cf[j]]; 310 cfr[jn] = cf[j] "." cfrep[cf[j]];
311 } 311 }
312 if (cpusr[jn] > ncpus && ncpus != 0) 312 if (cpusr[jn] > ncpus && ncpus != 0)
313 ovf = "(!)"; 313 ovf = "-ovf";
314 else 314 else
315 ovf = ""; 315 ovf = "";
316 print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. `date`"; 316 print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. `date`";
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
index d2d2a86139db..49701218dc62 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
@@ -1,2 +1,3 @@
1CONFIG_RCU_TORTURE_TEST=y 1CONFIG_RCU_TORTURE_TEST=y
2CONFIG_PRINTK_TIME=y 2CONFIG_PRINTK_TIME=y
3CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y