-rw-r--r--Documentation/kernel-parameters.txt6
-rw-r--r--arch/blackfin/mach-common/smp.c6
-rw-r--r--arch/metag/kernel/smp.c5
-rw-r--r--arch/x86/include/asm/cpu.h2
-rw-r--r--arch/x86/include/asm/smp.h2
-rw-r--r--arch/x86/kernel/smpboot.c39
-rw-r--r--arch/x86/xen/smp.c46
-rw-r--r--include/linux/cpu.h14
-rw-r--r--include/linux/lockdep.h7
-rw-r--r--include/linux/rcupdate.h40
-rw-r--r--include/linux/srcu.h2
-rw-r--r--init/Kconfig13
-rw-r--r--kernel/cpu.c4
-rw-r--r--kernel/rcu/rcutorture.c27
-rw-r--r--kernel/rcu/srcu.c19
-rw-r--r--kernel/rcu/tiny.c14
-rw-r--r--kernel/rcu/tree.c437
-rw-r--r--kernel/rcu/tree.h11
-rw-r--r--kernel/rcu/tree_plugin.h267
-rw-r--r--kernel/rcu/tree_trace.c4
-rw-r--r--kernel/rcu/update.c72
-rw-r--r--kernel/sched/idle.c9
-rw-r--r--kernel/smpboot.c156
-rw-r--r--lib/Kconfig.debug35
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm.sh2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/CFcommon1
26 files changed, 863 insertions, 377 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index d913e3b4bf0d..5368ba701de2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2968,6 +2968,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2968 Set maximum number of finished RCU callbacks to 2968 Set maximum number of finished RCU callbacks to
2969 process in one batch. 2969 process in one batch.
2970 2970
2971 rcutree.gp_init_delay= [KNL]
2972 Set the number of jiffies to delay each step of
2973 RCU grace-period initialization. This only has
2974 effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT is
2975 set.
2976
2971 rcutree.rcu_fanout_leaf= [KNL] 2977 rcutree.rcu_fanout_leaf= [KNL]
2972 Increase the number of CPUs assigned to each 2978 Increase the number of CPUs assigned to each
2973 leaf rcu_node structure. Useful for very large 2979 leaf rcu_node structure. Useful for very large
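For kernels built with CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y, an illustrative boot line for the new parameter would be:

        rcutree.gp_init_delay=3

which asks for a three-jiffy delay at each step of grace-period initialization (the value shown is only an example; the compile-time default comes from CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY, see the kernel/rcu/tree.c hunk further down).
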
diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index 8ad3e90cc8fc..1c7259597395 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -413,16 +413,14 @@ int __cpu_disable(void)
413 return 0; 413 return 0;
414} 414}
415 415
416static DECLARE_COMPLETION(cpu_killed);
417
418int __cpu_die(unsigned int cpu) 416int __cpu_die(unsigned int cpu)
419{ 417{
420 return wait_for_completion_timeout(&cpu_killed, 5000); 418 return cpu_wait_death(cpu, 5);
421} 419}
422 420
423void cpu_die(void) 421void cpu_die(void)
424{ 422{
425 complete(&cpu_killed); 423 (void)cpu_report_death();
426 424
427 atomic_dec(&init_mm.mm_users); 425 atomic_dec(&init_mm.mm_users);
428 atomic_dec(&init_mm.mm_count); 426 atomic_dec(&init_mm.mm_count);
diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index f006d2276f40..ac3a199e33e7 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -261,7 +261,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
261} 261}
262 262
263#ifdef CONFIG_HOTPLUG_CPU 263#ifdef CONFIG_HOTPLUG_CPU
264static DECLARE_COMPLETION(cpu_killed);
265 264
266/* 265/*
267 * __cpu_disable runs on the processor to be shutdown. 266 * __cpu_disable runs on the processor to be shutdown.
@@ -299,7 +298,7 @@ int __cpu_disable(void)
299 */ 298 */
300void __cpu_die(unsigned int cpu) 299void __cpu_die(unsigned int cpu)
301{ 300{
302 if (!wait_for_completion_timeout(&cpu_killed, msecs_to_jiffies(1))) 301 if (!cpu_wait_death(cpu, 1))
303 pr_err("CPU%u: unable to kill\n", cpu); 302 pr_err("CPU%u: unable to kill\n", cpu);
304} 303}
305 304
@@ -314,7 +313,7 @@ void cpu_die(void)
314 local_irq_disable(); 313 local_irq_disable();
315 idle_task_exit(); 314 idle_task_exit();
316 315
317 complete(&cpu_killed); 316 (void)cpu_report_death();
318 317
319 asm ("XOR TXENABLE, D0Re0,D0Re0\n"); 318 asm ("XOR TXENABLE, D0Re0,D0Re0\n");
320} 319}
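Both conversions above (blackfin and metag) drop a per-architecture DECLARE_COMPLETION() in favor of the generic cpu_wait_death()/cpu_report_death() helpers declared in include/linux/cpu.h below and implemented in kernel/smpboot.c (see the diffstat). A rough sketch of what the death side of an architecture's hotplug code looks like after such a conversion; headers and the arch-specific shutdown details are indicative only, not taken from this patch:

#include <linux/cpu.h>
#include <linux/printk.h>
#include <linux/sched.h>

/* Runs on a surviving CPU: wait up to five seconds for @cpu to report death. */
void __cpu_die(unsigned int cpu)
{
        if (!cpu_wait_death(cpu, 5))
                pr_err("CPU%u: unable to kill\n", cpu);
}

/* Runs on the outgoing CPU, from the idle loop. */
void cpu_die(void)
{
        idle_task_exit();

        /* Tell the waiting CPU that this CPU is done. */
        (void)cpu_report_death();

        /* Arch-specific halt, low-power loop, or reset would follow here. */
}
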
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index d2b12988d2ed..bf2caa1dedc5 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action);
34#endif 34#endif
35#endif 35#endif
36 36
37DECLARE_PER_CPU(int, cpu_state);
38
39int mwait_usable(const struct cpuinfo_x86 *); 37int mwait_usable(const struct cpuinfo_x86 *);
40 38
41#endif /* _ASM_X86_CPU_H */ 39#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd1cc3bc835..a5cb4f6e9492 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,12 +150,12 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
150} 150}
151 151
152void cpu_disable_common(void); 152void cpu_disable_common(void);
153void cpu_die_common(unsigned int cpu);
154void native_smp_prepare_boot_cpu(void); 153void native_smp_prepare_boot_cpu(void);
155void native_smp_prepare_cpus(unsigned int max_cpus); 154void native_smp_prepare_cpus(unsigned int max_cpus);
156void native_smp_cpus_done(unsigned int max_cpus); 155void native_smp_cpus_done(unsigned int max_cpus);
157int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); 156int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
158int native_cpu_disable(void); 157int native_cpu_disable(void);
158int common_cpu_die(unsigned int cpu);
159void native_cpu_die(unsigned int cpu); 159void native_cpu_die(unsigned int cpu);
160void native_play_dead(void); 160void native_play_dead(void);
161void play_dead_common(void); 161void play_dead_common(void);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aabc72e..c8fa34963ead 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,9 +77,6 @@
77#include <asm/realmode.h> 77#include <asm/realmode.h>
78#include <asm/misc.h> 78#include <asm/misc.h>
79 79
80/* State of each CPU */
81DEFINE_PER_CPU(int, cpu_state) = { 0 };
82
83/* Number of siblings per CPU package */ 80/* Number of siblings per CPU package */
84int smp_num_siblings = 1; 81int smp_num_siblings = 1;
85EXPORT_SYMBOL(smp_num_siblings); 82EXPORT_SYMBOL(smp_num_siblings);
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused)
257 lock_vector_lock(); 254 lock_vector_lock();
258 set_cpu_online(smp_processor_id(), true); 255 set_cpu_online(smp_processor_id(), true);
259 unlock_vector_lock(); 256 unlock_vector_lock();
260 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 257 cpu_set_state_online(smp_processor_id());
261 x86_platform.nmi_init(); 258 x86_platform.nmi_init();
262 259
263 /* enable local interrupts */ 260 /* enable local interrupts */
@@ -948,7 +945,10 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
948 */ 945 */
949 mtrr_save_state(); 946 mtrr_save_state();
950 947
951 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 948 /* x86 CPUs take themselves offline, so delayed offline is OK. */
949 err = cpu_check_up_prepare(cpu);
950 if (err && err != -EBUSY)
951 return err;
952 952
953 /* the FPU context is blank, nobody can own it */ 953 /* the FPU context is blank, nobody can own it */
954 __cpu_disable_lazy_restore(cpu); 954 __cpu_disable_lazy_restore(cpu);
@@ -1191,7 +1191,7 @@ void __init native_smp_prepare_boot_cpu(void)
1191 switch_to_new_gdt(me); 1191 switch_to_new_gdt(me);
1192 /* already set me in cpu_online_mask in boot_cpu_init() */ 1192 /* already set me in cpu_online_mask in boot_cpu_init() */
1193 cpumask_set_cpu(me, cpu_callout_mask); 1193 cpumask_set_cpu(me, cpu_callout_mask);
1194 per_cpu(cpu_state, me) = CPU_ONLINE; 1194 cpu_set_state_online(me);
1195} 1195}
1196 1196
1197void __init native_smp_cpus_done(unsigned int max_cpus) 1197void __init native_smp_cpus_done(unsigned int max_cpus)
@@ -1318,14 +1318,10 @@ static void __ref remove_cpu_from_maps(int cpu)
1318 numa_remove_cpu(cpu); 1318 numa_remove_cpu(cpu);
1319} 1319}
1320 1320
1321static DEFINE_PER_CPU(struct completion, die_complete);
1322
1323void cpu_disable_common(void) 1321void cpu_disable_common(void)
1324{ 1322{
1325 int cpu = smp_processor_id(); 1323 int cpu = smp_processor_id();
1326 1324
1327 init_completion(&per_cpu(die_complete, smp_processor_id()));
1328
1329 remove_siblinginfo(cpu); 1325 remove_siblinginfo(cpu);
1330 1326
1331 /* It's now safe to remove this processor from the online map */ 1327 /* It's now safe to remove this processor from the online map */
@@ -1349,24 +1345,27 @@ int native_cpu_disable(void)
1349 return 0; 1345 return 0;
1350} 1346}
1351 1347
1352void cpu_die_common(unsigned int cpu) 1348int common_cpu_die(unsigned int cpu)
1353{ 1349{
1354 wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); 1350 int ret = 0;
1355}
1356 1351
1357void native_cpu_die(unsigned int cpu)
1358{
1359 /* We don't do anything here: idle task is faking death itself. */ 1352 /* We don't do anything here: idle task is faking death itself. */
1360 1353
1361 cpu_die_common(cpu);
1362
1363 /* They ack this in play_dead() by setting CPU_DEAD */ 1354 /* They ack this in play_dead() by setting CPU_DEAD */
1364 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1355 if (cpu_wait_death(cpu, 5)) {
1365 if (system_state == SYSTEM_RUNNING) 1356 if (system_state == SYSTEM_RUNNING)
1366 pr_info("CPU %u is now offline\n", cpu); 1357 pr_info("CPU %u is now offline\n", cpu);
1367 } else { 1358 } else {
1368 pr_err("CPU %u didn't die...\n", cpu); 1359 pr_err("CPU %u didn't die...\n", cpu);
1360 ret = -1;
1369 } 1361 }
1362
1363 return ret;
1364}
1365
1366void native_cpu_die(unsigned int cpu)
1367{
1368 common_cpu_die(cpu);
1370} 1369}
1371 1370
1372void play_dead_common(void) 1371void play_dead_common(void)
@@ -1375,10 +1374,8 @@ void play_dead_common(void)
1375 reset_lazy_tlbstate(); 1374 reset_lazy_tlbstate();
1376 amd_e400_remove_cpu(raw_smp_processor_id()); 1375 amd_e400_remove_cpu(raw_smp_processor_id());
1377 1376
1378 mb();
1379 /* Ack it */ 1377 /* Ack it */
1380 __this_cpu_write(cpu_state, CPU_DEAD); 1378 (void)cpu_report_death();
1381 complete(&per_cpu(die_complete, smp_processor_id()));
1382 1379
1383 /* 1380 /*
1384 * With physical CPU hotplug, we should halt the cpu 1381 * With physical CPU hotplug, we should halt the cpu
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 08e8489c47f1..1c5e760f34ca 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -90,14 +90,10 @@ static void cpu_bringup(void)
90 90
91 set_cpu_online(cpu, true); 91 set_cpu_online(cpu, true);
92 92
93 this_cpu_write(cpu_state, CPU_ONLINE); 93 cpu_set_state_online(cpu); /* Implies full memory barrier. */
94
95 wmb();
96 94
97 /* We can take interrupts now: we're officially "up". */ 95 /* We can take interrupts now: we're officially "up". */
98 local_irq_enable(); 96 local_irq_enable();
99
100 wmb(); /* make sure everything is out */
101} 97}
102 98
103/* 99/*
@@ -459,7 +455,13 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
459 xen_setup_timer(cpu); 455 xen_setup_timer(cpu);
460 xen_init_lock_cpu(cpu); 456 xen_init_lock_cpu(cpu);
461 457
462 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 458 /*
459 * PV VCPUs are always successfully taken down (see 'while' loop
460 * in xen_cpu_die()), so -EBUSY is an error.
461 */
462 rc = cpu_check_up_prepare(cpu);
463 if (rc)
464 return rc;
463 465
464 /* make sure interrupts start blocked */ 466 /* make sure interrupts start blocked */
465 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 467 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -479,10 +481,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
479 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 481 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
480 BUG_ON(rc); 482 BUG_ON(rc);
481 483
482 while(per_cpu(cpu_state, cpu) != CPU_ONLINE) { 484 while (cpu_report_state(cpu) != CPU_ONLINE)
483 HYPERVISOR_sched_op(SCHEDOP_yield, NULL); 485 HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
484 barrier();
485 }
486 486
487 return 0; 487 return 0;
488} 488}
@@ -511,11 +511,11 @@ static void xen_cpu_die(unsigned int cpu)
511 schedule_timeout(HZ/10); 511 schedule_timeout(HZ/10);
512 } 512 }
513 513
514 cpu_die_common(cpu); 514 if (common_cpu_die(cpu) == 0) {
515 515 xen_smp_intr_free(cpu);
516 xen_smp_intr_free(cpu); 516 xen_uninit_lock_cpu(cpu);
517 xen_uninit_lock_cpu(cpu); 517 xen_teardown_timer(cpu);
518 xen_teardown_timer(cpu); 518 }
519} 519}
520 520
521static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ 521static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
@@ -747,6 +747,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
747static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle) 747static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
748{ 748{
749 int rc; 749 int rc;
750
751 /*
752 * This can happen if CPU was offlined earlier and
753 * offlining timed out in common_cpu_die().
754 */
755 if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
756 xen_smp_intr_free(cpu);
757 xen_uninit_lock_cpu(cpu);
758 }
759
750 /* 760 /*
751 * xen_smp_intr_init() needs to run before native_cpu_up() 761 * xen_smp_intr_init() needs to run before native_cpu_up()
752 * so that IPI vectors are set up on the booting CPU before 762 * so that IPI vectors are set up on the booting CPU before
@@ -768,12 +778,6 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
768 return rc; 778 return rc;
769} 779}
770 780
771static void xen_hvm_cpu_die(unsigned int cpu)
772{
773 xen_cpu_die(cpu);
774 native_cpu_die(cpu);
775}
776
777void __init xen_hvm_smp_init(void) 781void __init xen_hvm_smp_init(void)
778{ 782{
779 if (!xen_have_vector_callback) 783 if (!xen_have_vector_callback)
@@ -781,7 +785,7 @@ void __init xen_hvm_smp_init(void)
781 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; 785 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
782 smp_ops.smp_send_reschedule = xen_smp_send_reschedule; 786 smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
783 smp_ops.cpu_up = xen_hvm_cpu_up; 787 smp_ops.cpu_up = xen_hvm_cpu_up;
784 smp_ops.cpu_die = xen_hvm_cpu_die; 788 smp_ops.cpu_die = xen_cpu_die;
785 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; 789 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
786 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; 790 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
787 smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; 791 smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 4260e8594bd7..d028721748d4 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -95,6 +95,10 @@ enum {
95 * Called on the new cpu, just before 95 * Called on the new cpu, just before
96 * enabling interrupts. Must not sleep, 96 * enabling interrupts. Must not sleep,
97 * must not fail */ 97 * must not fail */
98#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached
99 * idle loop. */
100#define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly,
101 * perhaps due to preemption. */
98 102
99/* Used for CPU hotplug events occurring while tasks are frozen due to a suspend 103/* Used for CPU hotplug events occurring while tasks are frozen due to a suspend
100 * operation in progress 104 * operation in progress
@@ -271,4 +275,14 @@ void arch_cpu_idle_enter(void);
271void arch_cpu_idle_exit(void); 275void arch_cpu_idle_exit(void);
272void arch_cpu_idle_dead(void); 276void arch_cpu_idle_dead(void);
273 277
278DECLARE_PER_CPU(bool, cpu_dead_idle);
279
280int cpu_report_state(int cpu);
281int cpu_check_up_prepare(int cpu);
282void cpu_set_state_online(int cpu);
283#ifdef CONFIG_HOTPLUG_CPU
284bool cpu_wait_death(unsigned int cpu, int seconds);
285bool cpu_report_death(void);
286#endif /* #ifdef CONFIG_HOTPLUG_CPU */
287
274#endif /* _LINUX_CPU_H_ */ 288#endif /* _LINUX_CPU_H_ */
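The declarations above (cpu_report_state(), cpu_check_up_prepare(), cpu_set_state_online(), cpu_wait_death(), cpu_report_death()) make up the new generic CPU-state interface backing the arch conversions in this series. A hedged sketch of the bring-up side, loosely modeled on the x86 and Xen hunks above; the example_* names are illustrative, the includes are approximate, and the -EBUSY case applies only where CPUs take themselves offline:

#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/smp.h>

/* Boot-CPU side: prepare to (re)start @cpu. */
static int example_cpu_up_prepare(unsigned int cpu)
{
        int ret;

        ret = cpu_check_up_prepare(cpu);
        /* -EBUSY means the previous offline has not finished yet. */
        if (ret && ret != -EBUSY)
                return ret;

        /* ... arch-specific kick of the secondary CPU goes here ... */
        return 0;
}

/* Incoming-CPU side: announce that this CPU is fully up. */
static void example_cpu_started(void)
{
        cpu_set_state_online(smp_processor_id()); /* Implies full memory barrier. */
}

/* Boot-CPU side: poll until the incoming CPU has reported itself online. */
static void example_wait_until_online(unsigned int cpu)
{
        while (cpu_report_state(cpu) != CPU_ONLINE)
                cpu_relax();
}
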
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 74ab23176e9b..066ba4157541 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -531,8 +531,13 @@ do { \
531# define might_lock_read(lock) do { } while (0) 531# define might_lock_read(lock) do { } while (0)
532#endif 532#endif
533 533
534#ifdef CONFIG_PROVE_RCU 534#ifdef CONFIG_LOCKDEP
535void lockdep_rcu_suspicious(const char *file, const int line, const char *s); 535void lockdep_rcu_suspicious(const char *file, const int line, const char *s);
536#else
537static inline void
538lockdep_rcu_suspicious(const char *file, const int line, const char *s)
539{
540}
536#endif 541#endif
537 542
538#endif /* __LINUX_LOCKDEP_H */ 543#endif /* __LINUX_LOCKDEP_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 78097491cd99..573a5afd5ed8 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -48,6 +48,26 @@
48 48
49extern int rcu_expedited; /* for sysctl */ 49extern int rcu_expedited; /* for sysctl */
50 50
51#ifdef CONFIG_TINY_RCU
52/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
53static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
54{
55 return false;
56}
57
58static inline void rcu_expedite_gp(void)
59{
60}
61
62static inline void rcu_unexpedite_gp(void)
63{
64}
65#else /* #ifdef CONFIG_TINY_RCU */
66bool rcu_gp_is_expedited(void); /* Internal RCU use. */
67void rcu_expedite_gp(void);
68void rcu_unexpedite_gp(void);
69#endif /* #else #ifdef CONFIG_TINY_RCU */
70
51enum rcutorture_type { 71enum rcutorture_type {
52 RCU_FLAVOR, 72 RCU_FLAVOR,
53 RCU_BH_FLAVOR, 73 RCU_BH_FLAVOR,
@@ -195,6 +215,15 @@ void call_rcu_sched(struct rcu_head *head,
195 215
196void synchronize_sched(void); 216void synchronize_sched(void);
197 217
218/*
219 * Structure allowing asynchronous waiting on RCU.
220 */
221struct rcu_synchronize {
222 struct rcu_head head;
223 struct completion completion;
224};
225void wakeme_after_rcu(struct rcu_head *head);
226
198/** 227/**
199 * call_rcu_tasks() - Queue an RCU for invocation task-based grace period 228 * call_rcu_tasks() - Queue an RCU for invocation task-based grace period
200 * @head: structure to be used for queueing the RCU updates. 229 * @head: structure to be used for queueing the RCU updates.
@@ -258,6 +287,7 @@ static inline int rcu_preempt_depth(void)
258 287
259/* Internal to kernel */ 288/* Internal to kernel */
260void rcu_init(void); 289void rcu_init(void);
290void rcu_end_inkernel_boot(void);
261void rcu_sched_qs(void); 291void rcu_sched_qs(void);
262void rcu_bh_qs(void); 292void rcu_bh_qs(void);
263void rcu_check_callbacks(int user); 293void rcu_check_callbacks(int user);
@@ -266,6 +296,8 @@ void rcu_idle_enter(void);
266void rcu_idle_exit(void); 296void rcu_idle_exit(void);
267void rcu_irq_enter(void); 297void rcu_irq_enter(void);
268void rcu_irq_exit(void); 298void rcu_irq_exit(void);
299int rcu_cpu_notify(struct notifier_block *self,
300 unsigned long action, void *hcpu);
269 301
270#ifdef CONFIG_RCU_STALL_COMMON 302#ifdef CONFIG_RCU_STALL_COMMON
271void rcu_sysrq_start(void); 303void rcu_sysrq_start(void);
@@ -720,7 +752,7 @@ static inline void rcu_preempt_sleep_check(void)
720 * annotated as __rcu. 752 * annotated as __rcu.
721 */ 753 */
722#define rcu_dereference_check(p, c) \ 754#define rcu_dereference_check(p, c) \
723 __rcu_dereference_check((p), rcu_read_lock_held() || (c), __rcu) 755 __rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu)
724 756
725/** 757/**
726 * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking 758 * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
@@ -730,7 +762,7 @@ static inline void rcu_preempt_sleep_check(void)
730 * This is the RCU-bh counterpart to rcu_dereference_check(). 762 * This is the RCU-bh counterpart to rcu_dereference_check().
731 */ 763 */
732#define rcu_dereference_bh_check(p, c) \ 764#define rcu_dereference_bh_check(p, c) \
733 __rcu_dereference_check((p), rcu_read_lock_bh_held() || (c), __rcu) 765 __rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu)
734 766
735/** 767/**
736 * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking 768 * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
@@ -740,7 +772,7 @@ static inline void rcu_preempt_sleep_check(void)
740 * This is the RCU-sched counterpart to rcu_dereference_check(). 772 * This is the RCU-sched counterpart to rcu_dereference_check().
741 */ 773 */
742#define rcu_dereference_sched_check(p, c) \ 774#define rcu_dereference_sched_check(p, c) \
743 __rcu_dereference_check((p), rcu_read_lock_sched_held() || (c), \ 775 __rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \
744 __rcu) 776 __rcu)
745 777
746#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/ 778#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/
@@ -933,9 +965,9 @@ static inline void rcu_read_unlock(void)
933{ 965{
934 rcu_lockdep_assert(rcu_is_watching(), 966 rcu_lockdep_assert(rcu_is_watching(),
935 "rcu_read_unlock() used illegally while idle"); 967 "rcu_read_unlock() used illegally while idle");
936 rcu_lock_release(&rcu_lock_map);
937 __release(RCU); 968 __release(RCU);
938 __rcu_read_unlock(); 969 __rcu_read_unlock();
970 rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
939} 971}
940 972
941/** 973/**
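The struct rcu_synchronize / wakeme_after_rcu() pair made public above is the building block used to turn an asynchronous call_rcu() into a synchronous wait, and is exactly the code that kernel/rcu/srcu.c stops open-coding in the hunk below. A minimal sketch of the pattern, assuming the normal RCU flavor:

#include <linux/completion.h>
#include <linux/rcupdate.h>

/* Block until a full grace period has elapsed, built from call_rcu(). */
static void example_synchronize(void)
{
        struct rcu_synchronize rcu;

        init_rcu_head_on_stack(&rcu.head);
        init_completion(&rcu.completion);
        /* wakeme_after_rcu() completes rcu.completion after the grace period. */
        call_rcu(&rcu.head, wakeme_after_rcu);
        wait_for_completion(&rcu.completion);
        destroy_rcu_head_on_stack(&rcu.head);
}
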
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 9cfd9623fb03..bdeb4567b71e 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -182,7 +182,7 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp)
182 * lockdep_is_held() calls. 182 * lockdep_is_held() calls.
183 */ 183 */
184#define srcu_dereference_check(p, sp, c) \ 184#define srcu_dereference_check(p, sp, c) \
185 __rcu_dereference_check((p), srcu_read_lock_held(sp) || (c), __rcu) 185 __rcu_dereference_check((p), (c) || srcu_read_lock_held(sp), __rcu)
186 186
187/** 187/**
188 * srcu_dereference - fetch SRCU-protected pointer for later dereferencing 188 * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
diff --git a/init/Kconfig b/init/Kconfig
index f5dbc6d4261b..9a0592516f48 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -791,6 +791,19 @@ config RCU_NOCB_CPU_ALL
791 791
792endchoice 792endchoice
793 793
794config RCU_EXPEDITE_BOOT
795 bool
796 default n
797 help
798 This option enables expedited grace periods at boot time,
799 as if rcu_expedite_gp() had been invoked early in boot.
800 The corresponding rcu_unexpedite_gp() is invoked from
801 rcu_end_inkernel_boot(), which is intended to be invoked
802 at the end of the kernel-only boot sequence, just before
803 init is exec'ed.
804
805 Accept the default if unsure.
806
794endmenu # "RCU Subsystem" 807endmenu # "RCU Subsystem"
795 808
796config BUILD_BIN2C 809config BUILD_BIN2C
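RCU_EXPEDITE_BOOT acts as if rcu_expedite_gp() had been called early in boot, with the matching rcu_unexpedite_gp() applied from rcu_end_inkernel_boot() just before init is exec'ed. The same pair can be nested at runtime; a hedged sketch of the intended usage (the surrounding function is hypothetical):

#include <linux/rcupdate.h>

static void example_latency_critical_section(void)
{
        rcu_expedite_gp();      /* Nestable: bumps the expedite count. */

        /* ... work whose grace-period waits should be fast ... */
        synchronize_sched();    /* Now maps onto synchronize_sched_expedited(). */

        rcu_unexpedite_gp();    /* Must balance the rcu_expedite_gp() above. */
}
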
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1972b161c61e..d46b4dae0ca0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -408,8 +408,10 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
408 * 408 *
409 * Wait for the stop thread to go away. 409 * Wait for the stop thread to go away.
410 */ 410 */
411 while (!idle_cpu(cpu)) 411 while (!per_cpu(cpu_dead_idle, cpu))
412 cpu_relax(); 412 cpu_relax();
413 smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
414 per_cpu(cpu_dead_idle, cpu) = false;
413 415
414 /* This actually kills the CPU. */ 416 /* This actually kills the CPU. */
415 __cpu_die(cpu); 417 __cpu_die(cpu);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 30d42aa55d83..8dbe27611ec3 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -853,6 +853,8 @@ rcu_torture_fqs(void *arg)
853static int 853static int
854rcu_torture_writer(void *arg) 854rcu_torture_writer(void *arg)
855{ 855{
856 bool can_expedite = !rcu_gp_is_expedited();
857 int expediting = 0;
856 unsigned long gp_snap; 858 unsigned long gp_snap;
857 bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; 859 bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
858 bool gp_sync1 = gp_sync; 860 bool gp_sync1 = gp_sync;
@@ -865,9 +867,15 @@ rcu_torture_writer(void *arg)
865 int nsynctypes = 0; 867 int nsynctypes = 0;
866 868
867 VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); 869 VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
870 pr_alert("%s" TORTURE_FLAG
871 " Grace periods expedited from boot/sysfs for %s,\n",
872 torture_type, cur_ops->name);
873 pr_alert("%s" TORTURE_FLAG
874 " Testing of dynamic grace-period expediting disabled.\n",
875 torture_type);
868 876
869 /* Initialize synctype[] array. If none set, take default. */ 877 /* Initialize synctype[] array. If none set, take default. */
870 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) 878 if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
871 gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; 879 gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
872 if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) 880 if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
873 synctype[nsynctypes++] = RTWS_COND_GET; 881 synctype[nsynctypes++] = RTWS_COND_GET;
@@ -949,9 +957,26 @@ rcu_torture_writer(void *arg)
949 } 957 }
950 } 958 }
951 rcutorture_record_progress(++rcu_torture_current_version); 959 rcutorture_record_progress(++rcu_torture_current_version);
960 /* Cycle through nesting levels of rcu_expedite_gp() calls. */
961 if (can_expedite &&
962 !(torture_random(&rand) & 0xff & (!!expediting - 1))) {
963 WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited());
964 if (expediting >= 0)
965 rcu_expedite_gp();
966 else
967 rcu_unexpedite_gp();
968 if (++expediting > 3)
969 expediting = -expediting;
970 }
952 rcu_torture_writer_state = RTWS_STUTTER; 971 rcu_torture_writer_state = RTWS_STUTTER;
953 stutter_wait("rcu_torture_writer"); 972 stutter_wait("rcu_torture_writer");
954 } while (!torture_must_stop()); 973 } while (!torture_must_stop());
974 /* Reset expediting back to unexpedited. */
975 if (expediting > 0)
976 expediting = -expediting;
977 while (can_expedite && expediting++ < 0)
978 rcu_unexpedite_gp();
979 WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited());
955 rcu_torture_writer_state = RTWS_STOPPING; 980 rcu_torture_writer_state = RTWS_STOPPING;
956 torture_kthread_stopping("rcu_torture_writer"); 981 torture_kthread_stopping("rcu_torture_writer");
957 return 0; 982 return 0;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 445bf8ffe3fb..cad76e76b4e7 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -402,23 +402,6 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
402} 402}
403EXPORT_SYMBOL_GPL(call_srcu); 403EXPORT_SYMBOL_GPL(call_srcu);
404 404
405struct rcu_synchronize {
406 struct rcu_head head;
407 struct completion completion;
408};
409
410/*
411 * Awaken the corresponding synchronize_srcu() instance now that a
412 * grace period has elapsed.
413 */
414static void wakeme_after_rcu(struct rcu_head *head)
415{
416 struct rcu_synchronize *rcu;
417
418 rcu = container_of(head, struct rcu_synchronize, head);
419 complete(&rcu->completion);
420}
421
422static void srcu_advance_batches(struct srcu_struct *sp, int trycount); 405static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
423static void srcu_reschedule(struct srcu_struct *sp); 406static void srcu_reschedule(struct srcu_struct *sp);
424 407
@@ -507,7 +490,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
507 */ 490 */
508void synchronize_srcu(struct srcu_struct *sp) 491void synchronize_srcu(struct srcu_struct *sp)
509{ 492{
510 __synchronize_srcu(sp, rcu_expedited 493 __synchronize_srcu(sp, rcu_gp_is_expedited()
511 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT 494 ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
512 : SYNCHRONIZE_SRCU_TRYCOUNT); 495 : SYNCHRONIZE_SRCU_TRYCOUNT);
513} 496}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index cc9ceca7bde1..069742d61c68 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -103,8 +103,7 @@ EXPORT_SYMBOL(__rcu_is_watching);
103static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) 103static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
104{ 104{
105 RCU_TRACE(reset_cpu_stall_ticks(rcp)); 105 RCU_TRACE(reset_cpu_stall_ticks(rcp));
106 if (rcp->rcucblist != NULL && 106 if (rcp->donetail != rcp->curtail) {
107 rcp->donetail != rcp->curtail) {
108 rcp->donetail = rcp->curtail; 107 rcp->donetail = rcp->curtail;
109 return 1; 108 return 1;
110 } 109 }
@@ -169,17 +168,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
169 unsigned long flags; 168 unsigned long flags;
170 RCU_TRACE(int cb_count = 0); 169 RCU_TRACE(int cb_count = 0);
171 170
172 /* If no RCU callbacks ready to invoke, just return. */
173 if (&rcp->rcucblist == rcp->donetail) {
174 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
175 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
176 !!ACCESS_ONCE(rcp->rcucblist),
177 need_resched(),
178 is_idle_task(current),
179 false));
180 return;
181 }
182
183 /* Move the ready-to-invoke callbacks to a local list. */ 171 /* Move the ready-to-invoke callbacks to a local list. */
184 local_irq_save(flags); 172 local_irq_save(flags);
185 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); 173 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 48d640ca1a05..233165da782f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -91,8 +91,10 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var
91 91
92#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ 92#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
93DEFINE_RCU_TPS(sname) \ 93DEFINE_RCU_TPS(sname) \
94DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
94struct rcu_state sname##_state = { \ 95struct rcu_state sname##_state = { \
95 .level = { &sname##_state.node[0] }, \ 96 .level = { &sname##_state.node[0] }, \
97 .rda = &sname##_data, \
96 .call = cr, \ 98 .call = cr, \
97 .fqs_state = RCU_GP_IDLE, \ 99 .fqs_state = RCU_GP_IDLE, \
98 .gpnum = 0UL - 300UL, \ 100 .gpnum = 0UL - 300UL, \
@@ -101,11 +103,9 @@ struct rcu_state sname##_state = { \
101 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 103 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
102 .orphan_donetail = &sname##_state.orphan_donelist, \ 104 .orphan_donetail = &sname##_state.orphan_donelist, \
103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 105 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
104 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
105 .name = RCU_STATE_NAME(sname), \ 106 .name = RCU_STATE_NAME(sname), \
106 .abbr = sabbr, \ 107 .abbr = sabbr, \
107}; \ 108}
108DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data)
109 109
110RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); 110RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
111RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 111RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
@@ -152,6 +152,8 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
152 */ 152 */
153static int rcu_scheduler_fully_active __read_mostly; 153static int rcu_scheduler_fully_active __read_mostly;
154 154
155static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
156static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
155static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); 157static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
156static void invoke_rcu_core(void); 158static void invoke_rcu_core(void);
157static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 159static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -160,6 +162,12 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
160static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; 162static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
161module_param(kthread_prio, int, 0644); 163module_param(kthread_prio, int, 0644);
162 164
165/* Delay in jiffies for grace-period initialization delays. */
166static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT)
167 ? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY
168 : 0;
169module_param(gp_init_delay, int, 0644);
170
163/* 171/*
164 * Track the rcutorture test sequence number and the update version 172 * Track the rcutorture test sequence number and the update version
165 * number within a given test. The rcutorture_testseq is incremented 173 * number within a given test. The rcutorture_testseq is incremented
@@ -173,6 +181,17 @@ unsigned long rcutorture_testseq;
173unsigned long rcutorture_vernum; 181unsigned long rcutorture_vernum;
174 182
175/* 183/*
184 * Compute the mask of online CPUs for the specified rcu_node structure.
185 * This will not be stable unless the rcu_node structure's ->lock is
186 * held, but the bit corresponding to the current CPU will be stable
187 * in most contexts.
188 */
189unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
190{
191 return ACCESS_ONCE(rnp->qsmaskinitnext);
192}
193
194/*
176 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 195 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
177 * permit this function to be invoked without holding the root rcu_node 196 * permit this function to be invoked without holding the root rcu_node
178 * structure's ->lock, but of course results can be subject to change. 197 * structure's ->lock, but of course results can be subject to change.
@@ -292,10 +311,10 @@ void rcu_note_context_switch(void)
292EXPORT_SYMBOL_GPL(rcu_note_context_switch); 311EXPORT_SYMBOL_GPL(rcu_note_context_switch);
293 312
294/* 313/*
295 * Register a quiesecent state for all RCU flavors. If there is an 314 * Register a quiescent state for all RCU flavors. If there is an
296 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight 315 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
297 * dyntick-idle quiescent state visible to other CPUs (but only for those 316 * dyntick-idle quiescent state visible to other CPUs (but only for those
298 * RCU flavors in desparate need of a quiescent state, which will normally 317 * RCU flavors in desperate need of a quiescent state, which will normally
299 * be none of them). Either way, do a lightweight quiescent state for 318 * be none of them). Either way, do a lightweight quiescent state for
300 * all RCU flavors. 319 * all RCU flavors.
301 */ 320 */
@@ -410,6 +429,15 @@ void rcu_bh_force_quiescent_state(void)
410EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 429EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
411 430
412/* 431/*
432 * Force a quiescent state for RCU-sched.
433 */
434void rcu_sched_force_quiescent_state(void)
435{
436 force_quiescent_state(&rcu_sched_state);
437}
438EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
439
440/*
413 * Show the state of the grace-period kthreads. 441 * Show the state of the grace-period kthreads.
414 */ 442 */
415void show_rcu_gp_kthreads(void) 443void show_rcu_gp_kthreads(void)
@@ -483,15 +511,6 @@ void rcutorture_record_progress(unsigned long vernum)
483EXPORT_SYMBOL_GPL(rcutorture_record_progress); 511EXPORT_SYMBOL_GPL(rcutorture_record_progress);
484 512
485/* 513/*
486 * Force a quiescent state for RCU-sched.
487 */
488void rcu_sched_force_quiescent_state(void)
489{
490 force_quiescent_state(&rcu_sched_state);
491}
492EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
493
494/*
495 * Does the CPU have callbacks ready to be invoked? 514 * Does the CPU have callbacks ready to be invoked?
496 */ 515 */
497static int 516static int
@@ -954,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void)
954 preempt_disable(); 973 preempt_disable();
955 rdp = this_cpu_ptr(&rcu_sched_data); 974 rdp = this_cpu_ptr(&rcu_sched_data);
956 rnp = rdp->mynode; 975 rnp = rdp->mynode;
957 ret = (rdp->grpmask & rnp->qsmaskinit) || 976 ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ||
958 !rcu_scheduler_fully_active; 977 !rcu_scheduler_fully_active;
959 preempt_enable(); 978 preempt_enable();
960 return ret; 979 return ret;
@@ -1196,9 +1215,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1196 } else { 1215 } else {
1197 j = jiffies; 1216 j = jiffies;
1198 gpa = ACCESS_ONCE(rsp->gp_activity); 1217 gpa = ACCESS_ONCE(rsp->gp_activity);
1199 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", 1218 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
1200 rsp->name, j - gpa, j, gpa, 1219 rsp->name, j - gpa, j, gpa,
1201 jiffies_till_next_fqs); 1220 jiffies_till_next_fqs,
1221 rcu_get_root(rsp)->qsmask);
1202 /* In this case, the current CPU might be at fault. */ 1222 /* In this case, the current CPU might be at fault. */
1203 sched_show_task(current); 1223 sched_show_task(current);
1204 } 1224 }
@@ -1328,20 +1348,30 @@ void rcu_cpu_stall_reset(void)
1328} 1348}
1329 1349
1330/* 1350/*
1331 * Initialize the specified rcu_data structure's callback list to empty. 1351 * Initialize the specified rcu_data structure's default callback list
1352 * to empty. The default callback list is the one that is not used by
1353 * no-callbacks CPUs.
1332 */ 1354 */
1333static void init_callback_list(struct rcu_data *rdp) 1355static void init_default_callback_list(struct rcu_data *rdp)
1334{ 1356{
1335 int i; 1357 int i;
1336 1358
1337 if (init_nocb_callback_list(rdp))
1338 return;
1339 rdp->nxtlist = NULL; 1359 rdp->nxtlist = NULL;
1340 for (i = 0; i < RCU_NEXT_SIZE; i++) 1360 for (i = 0; i < RCU_NEXT_SIZE; i++)
1341 rdp->nxttail[i] = &rdp->nxtlist; 1361 rdp->nxttail[i] = &rdp->nxtlist;
1342} 1362}
1343 1363
1344/* 1364/*
1365 * Initialize the specified rcu_data structure's callback list to empty.
1366 */
1367static void init_callback_list(struct rcu_data *rdp)
1368{
1369 if (init_nocb_callback_list(rdp))
1370 return;
1371 init_default_callback_list(rdp);
1372}
1373
1374/*
1345 * Determine the value that ->completed will have at the end of the 1375 * Determine the value that ->completed will have at the end of the
1346 * next subsequent grace period. This is used to tag callbacks so that 1376 * next subsequent grace period. This is used to tag callbacks so that
1347 * a CPU can invoke callbacks in a timely fashion even if that CPU has 1377 * a CPU can invoke callbacks in a timely fashion even if that CPU has
@@ -1703,11 +1733,11 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1703 */ 1733 */
1704static int rcu_gp_init(struct rcu_state *rsp) 1734static int rcu_gp_init(struct rcu_state *rsp)
1705{ 1735{
1736 unsigned long oldmask;
1706 struct rcu_data *rdp; 1737 struct rcu_data *rdp;
1707 struct rcu_node *rnp = rcu_get_root(rsp); 1738 struct rcu_node *rnp = rcu_get_root(rsp);
1708 1739
1709 ACCESS_ONCE(rsp->gp_activity) = jiffies; 1740 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1710 rcu_bind_gp_kthread();
1711 raw_spin_lock_irq(&rnp->lock); 1741 raw_spin_lock_irq(&rnp->lock);
1712 smp_mb__after_unlock_lock(); 1742 smp_mb__after_unlock_lock();
1713 if (!ACCESS_ONCE(rsp->gp_flags)) { 1743 if (!ACCESS_ONCE(rsp->gp_flags)) {
@@ -1733,9 +1763,54 @@ static int rcu_gp_init(struct rcu_state *rsp)
1733 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1763 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1734 raw_spin_unlock_irq(&rnp->lock); 1764 raw_spin_unlock_irq(&rnp->lock);
1735 1765
1736 /* Exclude any concurrent CPU-hotplug operations. */ 1766 /*
1737 mutex_lock(&rsp->onoff_mutex); 1767 * Apply per-leaf buffered online and offline operations to the
1738 smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ 1768 * rcu_node tree. Note that this new grace period need not wait
1769 * for subsequent online CPUs, and that quiescent-state forcing
1770 * will handle subsequent offline CPUs.
1771 */
1772 rcu_for_each_leaf_node(rsp, rnp) {
1773 raw_spin_lock_irq(&rnp->lock);
1774 smp_mb__after_unlock_lock();
1775 if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
1776 !rnp->wait_blkd_tasks) {
1777 /* Nothing to do on this leaf rcu_node structure. */
1778 raw_spin_unlock_irq(&rnp->lock);
1779 continue;
1780 }
1781
1782 /* Record old state, apply changes to ->qsmaskinit field. */
1783 oldmask = rnp->qsmaskinit;
1784 rnp->qsmaskinit = rnp->qsmaskinitnext;
1785
1786 /* If zero-ness of ->qsmaskinit changed, propagate up tree. */
1787 if (!oldmask != !rnp->qsmaskinit) {
1788 if (!oldmask) /* First online CPU for this rcu_node. */
1789 rcu_init_new_rnp(rnp);
1790 else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */
1791 rnp->wait_blkd_tasks = true;
1792 else /* Last offline CPU and can propagate. */
1793 rcu_cleanup_dead_rnp(rnp);
1794 }
1795
1796 /*
1797 * If all waited-on tasks from prior grace period are
1798 * done, and if all this rcu_node structure's CPUs are
1799 * still offline, propagate up the rcu_node tree and
1800 * clear ->wait_blkd_tasks. Otherwise, if one of this
1801 * rcu_node structure's CPUs has since come back online,
1802 * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp()
1803 * checks for this, so just call it unconditionally).
1804 */
1805 if (rnp->wait_blkd_tasks &&
1806 (!rcu_preempt_has_tasks(rnp) ||
1807 rnp->qsmaskinit)) {
1808 rnp->wait_blkd_tasks = false;
1809 rcu_cleanup_dead_rnp(rnp);
1810 }
1811
1812 raw_spin_unlock_irq(&rnp->lock);
1813 }
1739 1814
1740 /* 1815 /*
1741 * Set the quiescent-state-needed bits in all the rcu_node 1816 * Set the quiescent-state-needed bits in all the rcu_node
@@ -1757,8 +1832,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
1757 rcu_preempt_check_blocked_tasks(rnp); 1832 rcu_preempt_check_blocked_tasks(rnp);
1758 rnp->qsmask = rnp->qsmaskinit; 1833 rnp->qsmask = rnp->qsmaskinit;
1759 ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; 1834 ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
1760 WARN_ON_ONCE(rnp->completed != rsp->completed); 1835 if (WARN_ON_ONCE(rnp->completed != rsp->completed))
1761 ACCESS_ONCE(rnp->completed) = rsp->completed; 1836 ACCESS_ONCE(rnp->completed) = rsp->completed;
1762 if (rnp == rdp->mynode) 1837 if (rnp == rdp->mynode)
1763 (void)__note_gp_changes(rsp, rnp, rdp); 1838 (void)__note_gp_changes(rsp, rnp, rdp);
1764 rcu_preempt_boost_start_gp(rnp); 1839 rcu_preempt_boost_start_gp(rnp);
@@ -1768,9 +1843,12 @@ static int rcu_gp_init(struct rcu_state *rsp)
1768 raw_spin_unlock_irq(&rnp->lock); 1843 raw_spin_unlock_irq(&rnp->lock);
1769 cond_resched_rcu_qs(); 1844 cond_resched_rcu_qs();
1770 ACCESS_ONCE(rsp->gp_activity) = jiffies; 1845 ACCESS_ONCE(rsp->gp_activity) = jiffies;
1846 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) &&
1847 gp_init_delay > 0 &&
1848 !(rsp->gpnum % (rcu_num_nodes * 10)))
1849 schedule_timeout_uninterruptible(gp_init_delay);
1771 } 1850 }
1772 1851
1773 mutex_unlock(&rsp->onoff_mutex);
1774 return 1; 1852 return 1;
1775} 1853}
1776 1854
@@ -1798,7 +1876,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1798 fqs_state = RCU_FORCE_QS; 1876 fqs_state = RCU_FORCE_QS;
1799 } else { 1877 } else {
1800 /* Handle dyntick-idle and offline CPUs. */ 1878 /* Handle dyntick-idle and offline CPUs. */
1801 isidle = false; 1879 isidle = true;
1802 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); 1880 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
1803 } 1881 }
1804 /* Clear flag to prevent immediate re-entry. */ 1882 /* Clear flag to prevent immediate re-entry. */
@@ -1852,6 +1930,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1852 rcu_for_each_node_breadth_first(rsp, rnp) { 1930 rcu_for_each_node_breadth_first(rsp, rnp) {
1853 raw_spin_lock_irq(&rnp->lock); 1931 raw_spin_lock_irq(&rnp->lock);
1854 smp_mb__after_unlock_lock(); 1932 smp_mb__after_unlock_lock();
1933 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
1934 WARN_ON_ONCE(rnp->qsmask);
1855 ACCESS_ONCE(rnp->completed) = rsp->gpnum; 1935 ACCESS_ONCE(rnp->completed) = rsp->gpnum;
1856 rdp = this_cpu_ptr(rsp->rda); 1936 rdp = this_cpu_ptr(rsp->rda);
1857 if (rnp == rdp->mynode) 1937 if (rnp == rdp->mynode)
@@ -1895,6 +1975,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
1895 struct rcu_state *rsp = arg; 1975 struct rcu_state *rsp = arg;
1896 struct rcu_node *rnp = rcu_get_root(rsp); 1976 struct rcu_node *rnp = rcu_get_root(rsp);
1897 1977
1978 rcu_bind_gp_kthread();
1898 for (;;) { 1979 for (;;) {
1899 1980
1900 /* Handle grace-period start. */ 1981 /* Handle grace-period start. */
@@ -2062,25 +2143,32 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
2062 * Similar to rcu_report_qs_rdp(), for which it is a helper function. 2143 * Similar to rcu_report_qs_rdp(), for which it is a helper function.
2063 * Allows quiescent states for a group of CPUs to be reported at one go 2144 * Allows quiescent states for a group of CPUs to be reported at one go
2064 * to the specified rcu_node structure, though all the CPUs in the group 2145 * to the specified rcu_node structure, though all the CPUs in the group
2065 * must be represented by the same rcu_node structure (which need not be 2146 * must be represented by the same rcu_node structure (which need not be a
2066 * a leaf rcu_node structure, though it often will be). That structure's 2147 * leaf rcu_node structure, though it often will be). The gps parameter
2067 * lock must be held upon entry, and it is released before return. 2148 * is the grace-period snapshot, which means that the quiescent states
2149 * are valid only if rnp->gpnum is equal to gps. That structure's lock
2150 * must be held upon entry, and it is released before return.
2068 */ 2151 */
2069static void 2152static void
2070rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, 2153rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2071 struct rcu_node *rnp, unsigned long flags) 2154 struct rcu_node *rnp, unsigned long gps, unsigned long flags)
2072 __releases(rnp->lock) 2155 __releases(rnp->lock)
2073{ 2156{
2157 unsigned long oldmask = 0;
2074 struct rcu_node *rnp_c; 2158 struct rcu_node *rnp_c;
2075 2159
2076 /* Walk up the rcu_node hierarchy. */ 2160 /* Walk up the rcu_node hierarchy. */
2077 for (;;) { 2161 for (;;) {
2078 if (!(rnp->qsmask & mask)) { 2162 if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
2079 2163
2080 /* Our bit has already been cleared, so done. */ 2164 /*
2165 * Our bit has already been cleared, or the
2166 * relevant grace period is already over, so done.
2167 */
2081 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2168 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2082 return; 2169 return;
2083 } 2170 }
2171 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
2084 rnp->qsmask &= ~mask; 2172 rnp->qsmask &= ~mask;
2085 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, 2173 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
2086 mask, rnp->qsmask, rnp->level, 2174 mask, rnp->qsmask, rnp->level,
@@ -2104,7 +2192,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2104 rnp = rnp->parent; 2192 rnp = rnp->parent;
2105 raw_spin_lock_irqsave(&rnp->lock, flags); 2193 raw_spin_lock_irqsave(&rnp->lock, flags);
2106 smp_mb__after_unlock_lock(); 2194 smp_mb__after_unlock_lock();
2107 WARN_ON_ONCE(rnp_c->qsmask); 2195 oldmask = rnp_c->qsmask;
2108 } 2196 }
2109 2197
2110 /* 2198 /*
@@ -2116,6 +2204,46 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2116} 2204}
2117 2205
2118/* 2206/*
2207 * Record a quiescent state for all tasks that were previously queued
2208 * on the specified rcu_node structure and that were blocking the current
2209 * RCU grace period. The caller must hold the specified rnp->lock with
2210 * irqs disabled, and this lock is released upon return, but irqs remain
2211 * disabled.
2212 */
2213static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
2214 struct rcu_node *rnp, unsigned long flags)
2215 __releases(rnp->lock)
2216{
2217 unsigned long gps;
2218 unsigned long mask;
2219 struct rcu_node *rnp_p;
2220
2221 if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
2222 rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
2223 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2224 return; /* Still need more quiescent states! */
2225 }
2226
2227 rnp_p = rnp->parent;
2228 if (rnp_p == NULL) {
2229 /*
2230 * Only one rcu_node structure in the tree, so don't
2231 * try to report up to its nonexistent parent!
2232 */
2233 rcu_report_qs_rsp(rsp, flags);
2234 return;
2235 }
2236
2237 /* Report up the rest of the hierarchy, tracking current ->gpnum. */
2238 gps = rnp->gpnum;
2239 mask = rnp->grpmask;
2240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2241 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
2242 smp_mb__after_unlock_lock();
2243 rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
2244}
2245
2246/*
2119 * Record a quiescent state for the specified CPU to that CPU's rcu_data 2247 * Record a quiescent state for the specified CPU to that CPU's rcu_data
2120 * structure. This must be either called from the specified CPU, or 2248 * structure. This must be either called from the specified CPU, or
2121 * called when the specified CPU is known to be offline (and when it is 2249 * called when the specified CPU is known to be offline (and when it is
@@ -2163,7 +2291,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
2163 */ 2291 */
2164 needwake = rcu_accelerate_cbs(rsp, rnp, rdp); 2292 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
2165 2293
2166 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ 2294 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
2295 /* ^^^ Released rnp->lock */
2167 if (needwake) 2296 if (needwake)
2168 rcu_gp_kthread_wake(rsp); 2297 rcu_gp_kthread_wake(rsp);
2169 } 2298 }
@@ -2256,8 +2385,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
2256 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; 2385 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
2257 } 2386 }
2258 2387
2259 /* Finally, initialize the rcu_data structure's list to empty. */ 2388 /*
2389 * Finally, initialize the rcu_data structure's list to empty and
2390 * disallow further callbacks on this CPU.
2391 */
2260 init_callback_list(rdp); 2392 init_callback_list(rdp);
2393 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2261} 2394}
2262 2395
2263/* 2396/*
@@ -2355,6 +2488,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2355 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 2488 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
2356 smp_mb__after_unlock_lock(); /* GP memory ordering. */ 2489 smp_mb__after_unlock_lock(); /* GP memory ordering. */
2357 rnp->qsmaskinit &= ~mask; 2490 rnp->qsmaskinit &= ~mask;
2491 rnp->qsmask &= ~mask;
2358 if (rnp->qsmaskinit) { 2492 if (rnp->qsmaskinit) {
2359 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2493 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2360 return; 2494 return;
@@ -2364,6 +2498,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2364} 2498}
2365 2499
2366/* 2500/*
2501 * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
2502 * function. We now remove it from the rcu_node tree's ->qsmaskinit
2503 * bit masks.
2504 */
2505static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
2506{
2507 unsigned long flags;
2508 unsigned long mask;
2509 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2510 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
2511
2512 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
2513 mask = rdp->grpmask;
2514 raw_spin_lock_irqsave(&rnp->lock, flags);
2515 smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
2516 rnp->qsmaskinitnext &= ~mask;
2517 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2518}
2519
2520/*
2367 * The CPU has been completely removed, and some other CPU is reporting 2521 * The CPU has been completely removed, and some other CPU is reporting
2368 * this fact from process context. Do the remainder of the cleanup, 2522 * this fact from process context. Do the remainder of the cleanup,
2369 * including orphaning the outgoing CPU's RCU callbacks, and also 2523 * including orphaning the outgoing CPU's RCU callbacks, and also
@@ -2379,29 +2533,15 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2379 /* Adjust any no-longer-needed kthreads. */ 2533 /* Adjust any no-longer-needed kthreads. */
2380 rcu_boost_kthread_setaffinity(rnp, -1); 2534 rcu_boost_kthread_setaffinity(rnp, -1);
2381 2535
2382 /* Exclude any attempts to start a new grace period. */
2383 mutex_lock(&rsp->onoff_mutex);
2384 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
2385
2386 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2536 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
2537 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
2387 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2538 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
2388 rcu_adopt_orphan_cbs(rsp, flags); 2539 rcu_adopt_orphan_cbs(rsp, flags);
2389 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); 2540 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
2390 2541
2391 /* Remove outgoing CPU from mask in the leaf rcu_node structure. */
2392 raw_spin_lock_irqsave(&rnp->lock, flags);
2393 smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */
2394 rnp->qsmaskinit &= ~rdp->grpmask;
2395 if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
2396 rcu_cleanup_dead_rnp(rnp);
2397 rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
2398 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2542 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
2399 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2543 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
2400 cpu, rdp->qlen, rdp->nxtlist); 2544 cpu, rdp->qlen, rdp->nxtlist);
2401 init_callback_list(rdp);
2402 /* Disallow further callbacks on this CPU. */
2403 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2404 mutex_unlock(&rsp->onoff_mutex);
2405} 2545}
2406 2546
2407#else /* #ifdef CONFIG_HOTPLUG_CPU */ 2547#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -2414,6 +2554,10 @@ static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
2414{ 2554{
2415} 2555}
2416 2556
2557static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
2558{
2559}
2560
2417static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2561static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2418{ 2562{
2419} 2563}
@@ -2589,26 +2733,47 @@ static void force_qs_rnp(struct rcu_state *rsp,
2589 return; 2733 return;
2590 } 2734 }
2591 if (rnp->qsmask == 0) { 2735 if (rnp->qsmask == 0) {
2592 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 2736 if (rcu_state_p == &rcu_sched_state ||
2593 continue; 2737 rsp != rcu_state_p ||
2738 rcu_preempt_blocked_readers_cgp(rnp)) {
2739 /*
2740 * No point in scanning bits because they
2741 * are all zero. But we might need to
2742 * priority-boost blocked readers.
2743 */
2744 rcu_initiate_boost(rnp, flags);
2745 /* rcu_initiate_boost() releases rnp->lock */
2746 continue;
2747 }
2748 if (rnp->parent &&
2749 (rnp->parent->qsmask & rnp->grpmask)) {
2750 /*
2751 * Race between grace-period
2752 * initialization and task exiting RCU
2753 * read-side critical section: Report.
2754 */
2755 rcu_report_unblock_qs_rnp(rsp, rnp, flags);
2756 /* rcu_report_unblock_qs_rnp() rlses ->lock */
2757 continue;
2758 }
2594 } 2759 }
2595 cpu = rnp->grplo; 2760 cpu = rnp->grplo;
2596 bit = 1; 2761 bit = 1;
2597 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2762 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2598 if ((rnp->qsmask & bit) != 0) { 2763 if ((rnp->qsmask & bit) != 0) {
2599 if ((rnp->qsmaskinit & bit) != 0) 2764 if ((rnp->qsmaskinit & bit) == 0)
2600 *isidle = false; 2765 *isidle = false; /* Pending hotplug. */
2601 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) 2766 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
2602 mask |= bit; 2767 mask |= bit;
2603 } 2768 }
2604 } 2769 }
2605 if (mask != 0) { 2770 if (mask != 0) {
2606 2771 /* Idle/offline CPUs, report (releases rnp->lock). */
2607 /* rcu_report_qs_rnp() releases rnp->lock. */ 2772 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
2608 rcu_report_qs_rnp(mask, rsp, rnp, flags); 2773 } else {
2609 continue; 2774 /* Nothing to do here, so just drop the lock. */
2775 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2610 } 2776 }
2611 raw_spin_unlock_irqrestore(&rnp->lock, flags);
2612 } 2777 }
2613} 2778}
2614 2779
@@ -2741,7 +2906,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2741 * If called from an extended quiescent state, invoke the RCU 2906 * If called from an extended quiescent state, invoke the RCU
2742 * core in order to force a re-evaluation of RCU's idleness. 2907 * core in order to force a re-evaluation of RCU's idleness.
2743 */ 2908 */
2744 if (!rcu_is_watching() && cpu_online(smp_processor_id())) 2909 if (!rcu_is_watching())
2745 invoke_rcu_core(); 2910 invoke_rcu_core();
2746 2911
2747 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 2912 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2827,11 +2992,22 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2827 2992
2828 if (cpu != -1) 2993 if (cpu != -1)
2829 rdp = per_cpu_ptr(rsp->rda, cpu); 2994 rdp = per_cpu_ptr(rsp->rda, cpu);
2830 offline = !__call_rcu_nocb(rdp, head, lazy, flags); 2995 if (likely(rdp->mynode)) {
2831 WARN_ON_ONCE(offline); 2996 /* Post-boot, so this should be for a no-CBs CPU. */
2832 /* _call_rcu() is illegal on offline CPU; leak the callback. */ 2997 offline = !__call_rcu_nocb(rdp, head, lazy, flags);
2833 local_irq_restore(flags); 2998 WARN_ON_ONCE(offline);
2834 return; 2999 /* Offline CPU, _call_rcu() illegal, leak callback. */
3000 local_irq_restore(flags);
3001 return;
3002 }
3003 /*
3004 * Very early boot, before rcu_init(). Initialize if needed
3005 * and then drop through to queue the callback.
3006 */
3007 BUG_ON(cpu != -1);
3008 WARN_ON_ONCE(!rcu_is_watching());
3009 if (!likely(rdp->nxtlist))
3010 init_default_callback_list(rdp);
2835 } 3011 }
2836 ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; 3012 ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
2837 if (lazy) 3013 if (lazy)
@@ -2954,7 +3130,7 @@ void synchronize_sched(void)
2954 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 3130 "Illegal synchronize_sched() in RCU-sched read-side critical section");
2955 if (rcu_blocking_is_gp()) 3131 if (rcu_blocking_is_gp())
2956 return; 3132 return;
2957 if (rcu_expedited) 3133 if (rcu_gp_is_expedited())
2958 synchronize_sched_expedited(); 3134 synchronize_sched_expedited();
2959 else 3135 else
2960 wait_rcu_gp(call_rcu_sched); 3136 wait_rcu_gp(call_rcu_sched);
@@ -2981,7 +3157,7 @@ void synchronize_rcu_bh(void)
2981 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 3157 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
2982 if (rcu_blocking_is_gp()) 3158 if (rcu_blocking_is_gp())
2983 return; 3159 return;
2984 if (rcu_expedited) 3160 if (rcu_gp_is_expedited())
2985 synchronize_rcu_bh_expedited(); 3161 synchronize_rcu_bh_expedited();
2986 else 3162 else
2987 wait_rcu_gp(call_rcu_bh); 3163 wait_rcu_gp(call_rcu_bh);
@@ -3518,6 +3694,28 @@ void rcu_barrier_sched(void)
3518EXPORT_SYMBOL_GPL(rcu_barrier_sched); 3694EXPORT_SYMBOL_GPL(rcu_barrier_sched);
3519 3695
3520/* 3696/*
3697 * Propagate ->qsmaskinit bits up the rcu_node tree to account for the
3698 * first CPU in a given leaf rcu_node structure coming online. The caller
3699 * must hold the corresponding leaf rcu_node ->lock with interrupts
3700 * disabled.
3701 */
3702static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
3703{
3704 long mask;
3705 struct rcu_node *rnp = rnp_leaf;
3706
3707 for (;;) {
3708 mask = rnp->grpmask;
3709 rnp = rnp->parent;
3710 if (rnp == NULL)
3711 return;
3712 raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
3713 rnp->qsmaskinit |= mask;
3714 raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
3715 }
3716}
3717
3718/*
3521 * Do boot-time initialization of a CPU's per-CPU RCU data. 3719 * Do boot-time initialization of a CPU's per-CPU RCU data.
3522 */ 3720 */
3523static void __init 3721static void __init
@@ -3553,49 +3751,37 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3553 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3751 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3554 struct rcu_node *rnp = rcu_get_root(rsp); 3752 struct rcu_node *rnp = rcu_get_root(rsp);
3555 3753
3556 /* Exclude new grace periods. */
3557 mutex_lock(&rsp->onoff_mutex);
3558
3559 /* Set up local state, ensuring consistent view of global state. */ 3754 /* Set up local state, ensuring consistent view of global state. */
3560 raw_spin_lock_irqsave(&rnp->lock, flags); 3755 raw_spin_lock_irqsave(&rnp->lock, flags);
3561 rdp->beenonline = 1; /* We have now been online. */ 3756 rdp->beenonline = 1; /* We have now been online. */
3562 rdp->qlen_last_fqs_check = 0; 3757 rdp->qlen_last_fqs_check = 0;
3563 rdp->n_force_qs_snap = rsp->n_force_qs; 3758 rdp->n_force_qs_snap = rsp->n_force_qs;
3564 rdp->blimit = blimit; 3759 rdp->blimit = blimit;
3565 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 3760 if (!rdp->nxtlist)
3761 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
3566 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3762 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
3567 rcu_sysidle_init_percpu_data(rdp->dynticks); 3763 rcu_sysidle_init_percpu_data(rdp->dynticks);
3568 atomic_set(&rdp->dynticks->dynticks, 3764 atomic_set(&rdp->dynticks->dynticks,
3569 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 3765 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
3570 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 3766 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
3571 3767
3572 /* Add CPU to rcu_node bitmasks. */ 3768 /*
3769 * Add CPU to leaf rcu_node pending-online bitmask. Any needed
3770 * propagation up the rcu_node tree will happen at the beginning
3771 * of the next grace period.
3772 */
3573 rnp = rdp->mynode; 3773 rnp = rdp->mynode;
3574 mask = rdp->grpmask; 3774 mask = rdp->grpmask;
3575 do { 3775 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
3576 /* Exclude any attempts to start a new GP on small systems. */ 3776 smp_mb__after_unlock_lock();
3577 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 3777 rnp->qsmaskinitnext |= mask;
3578 rnp->qsmaskinit |= mask; 3778 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
3579 mask = rnp->grpmask; 3779 rdp->completed = rnp->completed;
3580 if (rnp == rdp->mynode) { 3780 rdp->passed_quiesce = false;
3581 /* 3781 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
3582 * If there is a grace period in progress, we will 3782 rdp->qs_pending = false;
3583 * set up to wait for it next time we run the 3783 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3584 * RCU core code. 3784 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3585 */
3586 rdp->gpnum = rnp->completed;
3587 rdp->completed = rnp->completed;
3588 rdp->passed_quiesce = 0;
3589 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
3590 rdp->qs_pending = 0;
3591 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
3592 }
3593 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
3594 rnp = rnp->parent;
3595 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
3596 local_irq_restore(flags);
3597
3598 mutex_unlock(&rsp->onoff_mutex);
3599} 3785}
3600 3786
3601static void rcu_prepare_cpu(int cpu) 3787static void rcu_prepare_cpu(int cpu)
@@ -3609,15 +3795,14 @@ static void rcu_prepare_cpu(int cpu)
3609/* 3795/*
3610 * Handle CPU online/offline notification events. 3796 * Handle CPU online/offline notification events.
3611 */ 3797 */
3612static int rcu_cpu_notify(struct notifier_block *self, 3798int rcu_cpu_notify(struct notifier_block *self,
3613 unsigned long action, void *hcpu) 3799 unsigned long action, void *hcpu)
3614{ 3800{
3615 long cpu = (long)hcpu; 3801 long cpu = (long)hcpu;
3616 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); 3802 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
3617 struct rcu_node *rnp = rdp->mynode; 3803 struct rcu_node *rnp = rdp->mynode;
3618 struct rcu_state *rsp; 3804 struct rcu_state *rsp;
3619 3805
3620 trace_rcu_utilization(TPS("Start CPU hotplug"));
3621 switch (action) { 3806 switch (action) {
3622 case CPU_UP_PREPARE: 3807 case CPU_UP_PREPARE:
3623 case CPU_UP_PREPARE_FROZEN: 3808 case CPU_UP_PREPARE_FROZEN:
@@ -3637,6 +3822,11 @@ static int rcu_cpu_notify(struct notifier_block *self,
3637 for_each_rcu_flavor(rsp) 3822 for_each_rcu_flavor(rsp)
3638 rcu_cleanup_dying_cpu(rsp); 3823 rcu_cleanup_dying_cpu(rsp);
3639 break; 3824 break;
3825 case CPU_DYING_IDLE:
3826 for_each_rcu_flavor(rsp) {
3827 rcu_cleanup_dying_idle_cpu(cpu, rsp);
3828 }
3829 break;
3640 case CPU_DEAD: 3830 case CPU_DEAD:
3641 case CPU_DEAD_FROZEN: 3831 case CPU_DEAD_FROZEN:
3642 case CPU_UP_CANCELED: 3832 case CPU_UP_CANCELED:
@@ -3649,7 +3839,6 @@ static int rcu_cpu_notify(struct notifier_block *self,
3649 default: 3839 default:
3650 break; 3840 break;
3651 } 3841 }
3652 trace_rcu_utilization(TPS("End CPU hotplug"));
3653 return NOTIFY_OK; 3842 return NOTIFY_OK;
3654} 3843}
3655 3844
@@ -3660,11 +3849,12 @@ static int rcu_pm_notify(struct notifier_block *self,
3660 case PM_HIBERNATION_PREPARE: 3849 case PM_HIBERNATION_PREPARE:
3661 case PM_SUSPEND_PREPARE: 3850 case PM_SUSPEND_PREPARE:
3662 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ 3851 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
3663 rcu_expedited = 1; 3852 rcu_expedite_gp();
3664 break; 3853 break;
3665 case PM_POST_HIBERNATION: 3854 case PM_POST_HIBERNATION:
3666 case PM_POST_SUSPEND: 3855 case PM_POST_SUSPEND:
3667 rcu_expedited = 0; 3856 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
3857 rcu_unexpedite_gp();
3668 break; 3858 break;
3669 default: 3859 default:
3670 break; 3860 break;
@@ -3734,30 +3924,26 @@ void rcu_scheduler_starting(void)
3734 * Compute the per-level fanout, either using the exact fanout specified 3924 * Compute the per-level fanout, either using the exact fanout specified
3735 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 3925 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
3736 */ 3926 */
3737#ifdef CONFIG_RCU_FANOUT_EXACT
3738static void __init rcu_init_levelspread(struct rcu_state *rsp)
3739{
3740 int i;
3741
3742 rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
3743 for (i = rcu_num_lvls - 2; i >= 0; i--)
3744 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
3745}
3746#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
3747static void __init rcu_init_levelspread(struct rcu_state *rsp) 3927static void __init rcu_init_levelspread(struct rcu_state *rsp)
3748{ 3928{
3749 int ccur;
3750 int cprv;
3751 int i; 3929 int i;
3752 3930
3753 cprv = nr_cpu_ids; 3931 if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) {
3754 for (i = rcu_num_lvls - 1; i >= 0; i--) { 3932 rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
3755 ccur = rsp->levelcnt[i]; 3933 for (i = rcu_num_lvls - 2; i >= 0; i--)
3756 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 3934 rsp->levelspread[i] = CONFIG_RCU_FANOUT;
3757 cprv = ccur; 3935 } else {
3936 int ccur;
3937 int cprv;
3938
3939 cprv = nr_cpu_ids;
3940 for (i = rcu_num_lvls - 1; i >= 0; i--) {
3941 ccur = rsp->levelcnt[i];
3942 rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
3943 cprv = ccur;
3944 }
3758 } 3945 }
3759} 3946}
3760#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
3761 3947
3762/* 3948/*
3763 * Helper function for rcu_init() that initializes one rcu_state structure. 3949 * Helper function for rcu_init() that initializes one rcu_state structure.
@@ -3833,7 +4019,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3833 } 4019 }
3834 } 4020 }
3835 4021
3836 rsp->rda = rda;
3837 init_waitqueue_head(&rsp->gp_wq); 4022 init_waitqueue_head(&rsp->gp_wq);
3838 rnp = rsp->level[rcu_num_lvls - 1]; 4023 rnp = rsp->level[rcu_num_lvls - 1];
3839 for_each_possible_cpu(i) { 4024 for_each_possible_cpu(i) {
@@ -3926,6 +4111,8 @@ void __init rcu_init(void)
3926{ 4111{
3927 int cpu; 4112 int cpu;
3928 4113
4114 rcu_early_boot_tests();
4115
3929 rcu_bootup_announce(); 4116 rcu_bootup_announce();
3930 rcu_init_geometry(); 4117 rcu_init_geometry();
3931 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 4118 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
@@ -3942,8 +4129,6 @@ void __init rcu_init(void)
3942 pm_notifier(rcu_pm_notify, 0); 4129 pm_notifier(rcu_pm_notify, 0);
3943 for_each_online_cpu(cpu) 4130 for_each_online_cpu(cpu)
3944 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 4131 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3945
3946 rcu_early_boot_tests();
3947} 4132}
3948 4133
3949#include "tree_plugin.h" 4134#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 119de399eb2f..a69d3dab2ec4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -141,12 +141,20 @@ struct rcu_node {
141 /* complete (only for PREEMPT_RCU). */ 141 /* complete (only for PREEMPT_RCU). */
142 unsigned long qsmaskinit; 142 unsigned long qsmaskinit;
143 /* Per-GP initial value for qsmask & expmask. */ 143 /* Per-GP initial value for qsmask & expmask. */
144 /* Initialized from ->qsmaskinitnext at the */
145 /* beginning of each grace period. */
146 unsigned long qsmaskinitnext;
147 /* Online CPUs for next grace period. */
144 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 148 unsigned long grpmask; /* Mask to apply to parent qsmask. */
145 /* Only one bit will be set in this mask. */ 149 /* Only one bit will be set in this mask. */
146 int grplo; /* lowest-numbered CPU or group here. */ 150 int grplo; /* lowest-numbered CPU or group here. */
147 int grphi; /* highest-numbered CPU or group here. */ 151 int grphi; /* highest-numbered CPU or group here. */
148 u8 grpnum; /* CPU/group number for next level up. */ 152 u8 grpnum; /* CPU/group number for next level up. */
149 u8 level; /* root is at level 0. */ 153 u8 level; /* root is at level 0. */
154 bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
155 /* exit RCU read-side critical sections */
156 /* before propagating offline up the */
157 /* rcu_node tree? */
150 struct rcu_node *parent; 158 struct rcu_node *parent;
151 struct list_head blkd_tasks; 159 struct list_head blkd_tasks;
152 /* Tasks blocked in RCU read-side critical */ 160 /* Tasks blocked in RCU read-side critical */
@@ -448,8 +456,6 @@ struct rcu_state {
448 long qlen; /* Total number of callbacks. */ 456 long qlen; /* Total number of callbacks. */
449 /* End of fields guarded by orphan_lock. */ 457 /* End of fields guarded by orphan_lock. */
450 458
451 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
452
453 struct mutex barrier_mutex; /* Guards barrier fields. */ 459 struct mutex barrier_mutex; /* Guards barrier fields. */
454 atomic_t barrier_cpu_count; /* # CPUs waiting on. */ 460 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
455 struct completion barrier_completion; /* Wake at barrier end. */ 461 struct completion barrier_completion; /* Wake at barrier end. */
@@ -559,6 +565,7 @@ static void rcu_prepare_kthreads(int cpu);
559static void rcu_cleanup_after_idle(void); 565static void rcu_cleanup_after_idle(void);
560static void rcu_prepare_for_idle(void); 566static void rcu_prepare_for_idle(void);
561static void rcu_idle_count_callbacks_posted(void); 567static void rcu_idle_count_callbacks_posted(void);
568static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
562static void print_cpu_stall_info_begin(void); 569static void print_cpu_stall_info_begin(void);
563static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); 570static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
564static void print_cpu_stall_info_end(void); 571static void print_cpu_stall_info_end(void);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a571e9a0f1d..8c0ec0f5a027 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -58,38 +58,33 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
58 */ 58 */
59static void __init rcu_bootup_announce_oddness(void) 59static void __init rcu_bootup_announce_oddness(void)
60{ 60{
61#ifdef CONFIG_RCU_TRACE 61 if (IS_ENABLED(CONFIG_RCU_TRACE))
62 pr_info("\tRCU debugfs-based tracing is enabled.\n"); 62 pr_info("\tRCU debugfs-based tracing is enabled.\n");
63#endif 63 if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
64#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) 64 (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
65 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", 65 pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
66 CONFIG_RCU_FANOUT); 66 CONFIG_RCU_FANOUT);
67#endif 67 if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
68#ifdef CONFIG_RCU_FANOUT_EXACT 68 pr_info("\tHierarchical RCU autobalancing is disabled.\n");
69 pr_info("\tHierarchical RCU autobalancing is disabled.\n"); 69 if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
70#endif 70 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
71#ifdef CONFIG_RCU_FAST_NO_HZ 71 if (IS_ENABLED(CONFIG_PROVE_RCU))
72 pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); 72 pr_info("\tRCU lockdep checking is enabled.\n");
73#endif 73 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
74#ifdef CONFIG_PROVE_RCU 74 pr_info("\tRCU torture testing starts during boot.\n");
75 pr_info("\tRCU lockdep checking is enabled.\n"); 75 if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO))
76#endif 76 pr_info("\tAdditional per-CPU info printed with stalls.\n");
77#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE 77 if (NUM_RCU_LVL_4 != 0)
78 pr_info("\tRCU torture testing starts during boot.\n"); 78 pr_info("\tFour-level hierarchy is enabled.\n");
79#endif 79 if (CONFIG_RCU_FANOUT_LEAF != 16)
80#if defined(CONFIG_RCU_CPU_STALL_INFO) 80 pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
81 pr_info("\tAdditional per-CPU info printed with stalls.\n"); 81 CONFIG_RCU_FANOUT_LEAF);
82#endif
83#if NUM_RCU_LVL_4 != 0
84 pr_info("\tFour-level hierarchy is enabled.\n");
85#endif
86 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) 82 if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
87 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 83 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
88 if (nr_cpu_ids != NR_CPUS) 84 if (nr_cpu_ids != NR_CPUS)
89 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 85 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
90#ifdef CONFIG_RCU_BOOST 86 if (IS_ENABLED(CONFIG_RCU_BOOST))
91 pr_info("\tRCU kthread priority: %d.\n", kthread_prio); 87 pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
92#endif
93} 88}
94 89
95#ifdef CONFIG_PREEMPT_RCU 90#ifdef CONFIG_PREEMPT_RCU
@@ -180,7 +175,7 @@ static void rcu_preempt_note_context_switch(void)
180 * But first, note that the current CPU must still be 175 * But first, note that the current CPU must still be
181 * on line! 176 * on line!
182 */ 177 */
183 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); 178 WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
184 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 179 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
185 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { 180 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
186 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); 181 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
@@ -233,43 +228,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
233} 228}
234 229
235/* 230/*
236 * Record a quiescent state for all tasks that were previously queued
237 * on the specified rcu_node structure and that were blocking the current
238 * RCU grace period. The caller must hold the specified rnp->lock with
239 * irqs disabled, and this lock is released upon return, but irqs remain
240 * disabled.
241 */
242static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
243 __releases(rnp->lock)
244{
245 unsigned long mask;
246 struct rcu_node *rnp_p;
247
248 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
249 raw_spin_unlock_irqrestore(&rnp->lock, flags);
250 return; /* Still need more quiescent states! */
251 }
252
253 rnp_p = rnp->parent;
254 if (rnp_p == NULL) {
255 /*
256 * Either there is only one rcu_node in the tree,
257 * or tasks were kicked up to root rcu_node due to
258 * CPUs going offline.
259 */
260 rcu_report_qs_rsp(&rcu_preempt_state, flags);
261 return;
262 }
263
264 /* Report up the rest of the hierarchy. */
265 mask = rnp->grpmask;
266 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
267 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
268 smp_mb__after_unlock_lock();
269 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
270}
271
272/*
273 * Advance a ->blkd_tasks-list pointer to the next entry, instead 231 * Advance a ->blkd_tasks-list pointer to the next entry, instead
274 * returning NULL if at the end of the list. 232 * returning NULL if at the end of the list.
275 */ 233 */
@@ -300,7 +258,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
300 */ 258 */
301void rcu_read_unlock_special(struct task_struct *t) 259void rcu_read_unlock_special(struct task_struct *t)
302{ 260{
303 bool empty;
304 bool empty_exp; 261 bool empty_exp;
305 bool empty_norm; 262 bool empty_norm;
306 bool empty_exp_now; 263 bool empty_exp_now;
@@ -334,7 +291,13 @@ void rcu_read_unlock_special(struct task_struct *t)
334 } 291 }
335 292
336 /* Hardware IRQ handlers cannot block, complain if they get here. */ 293 /* Hardware IRQ handlers cannot block, complain if they get here. */
337 if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { 294 if (in_irq() || in_serving_softirq()) {
295 lockdep_rcu_suspicious(__FILE__, __LINE__,
296 "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
297 pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n",
298 t->rcu_read_unlock_special.s,
299 t->rcu_read_unlock_special.b.blocked,
300 t->rcu_read_unlock_special.b.need_qs);
338 local_irq_restore(flags); 301 local_irq_restore(flags);
339 return; 302 return;
340 } 303 }
@@ -356,7 +319,6 @@ void rcu_read_unlock_special(struct task_struct *t)
356 break; 319 break;
357 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 320 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
358 } 321 }
359 empty = !rcu_preempt_has_tasks(rnp);
360 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); 322 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
361 empty_exp = !rcu_preempted_readers_exp(rnp); 323 empty_exp = !rcu_preempted_readers_exp(rnp);
362 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 324 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -377,14 +339,6 @@ void rcu_read_unlock_special(struct task_struct *t)
377#endif /* #ifdef CONFIG_RCU_BOOST */ 339#endif /* #ifdef CONFIG_RCU_BOOST */
378 340
379 /* 341 /*
380 * If this was the last task on the list, go see if we
381 * need to propagate ->qsmaskinit bit clearing up the
382 * rcu_node tree.
383 */
384 if (!empty && !rcu_preempt_has_tasks(rnp))
385 rcu_cleanup_dead_rnp(rnp);
386
387 /*
388 * If this was the last task on the current list, and if 342 * If this was the last task on the current list, and if
389 * we aren't waiting on any CPUs, report the quiescent state. 343 * we aren't waiting on any CPUs, report the quiescent state.
390 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, 344 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
@@ -399,7 +353,8 @@ void rcu_read_unlock_special(struct task_struct *t)
399 rnp->grplo, 353 rnp->grplo,
400 rnp->grphi, 354 rnp->grphi,
401 !!rnp->gp_tasks); 355 !!rnp->gp_tasks);
402 rcu_report_unblock_qs_rnp(rnp, flags); 356 rcu_report_unblock_qs_rnp(&rcu_preempt_state,
357 rnp, flags);
403 } else { 358 } else {
404 raw_spin_unlock_irqrestore(&rnp->lock, flags); 359 raw_spin_unlock_irqrestore(&rnp->lock, flags);
405 } 360 }
@@ -520,10 +475,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
520 WARN_ON_ONCE(rnp->qsmask); 475 WARN_ON_ONCE(rnp->qsmask);
521} 476}
522 477
523#ifdef CONFIG_HOTPLUG_CPU
524
525#endif /* #ifdef CONFIG_HOTPLUG_CPU */
526
527/* 478/*
528 * Check for a quiescent state from the current CPU. When a task blocks, 479 * Check for a quiescent state from the current CPU. When a task blocks,
529 * the task is recorded in the corresponding CPU's rcu_node structure, 480 * the task is recorded in the corresponding CPU's rcu_node structure,
@@ -585,7 +536,7 @@ void synchronize_rcu(void)
585 "Illegal synchronize_rcu() in RCU read-side critical section"); 536 "Illegal synchronize_rcu() in RCU read-side critical section");
586 if (!rcu_scheduler_active) 537 if (!rcu_scheduler_active)
587 return; 538 return;
588 if (rcu_expedited) 539 if (rcu_gp_is_expedited())
589 synchronize_rcu_expedited(); 540 synchronize_rcu_expedited();
590 else 541 else
591 wait_rcu_gp(call_rcu); 542 wait_rcu_gp(call_rcu);
@@ -630,9 +581,6 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
630 * recursively up the tree. (Calm down, calm down, we do the recursion 581 * recursively up the tree. (Calm down, calm down, we do the recursion
631 * iteratively!) 582 * iteratively!)
632 * 583 *
633 * Most callers will set the "wake" flag, but the task initiating the
634 * expedited grace period need not wake itself.
635 *
636 * Caller must hold sync_rcu_preempt_exp_mutex. 584 * Caller must hold sync_rcu_preempt_exp_mutex.
637 */ 585 */
638static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 586static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@ -667,29 +615,85 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
667 615
668/* 616/*
669 * Snapshot the tasks blocking the newly started preemptible-RCU expedited 617 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
670 * grace period for the specified rcu_node structure. If there are no such 618 * grace period for the specified rcu_node structure, phase 1. If there
671 * tasks, report it up the rcu_node hierarchy. 619 * are such tasks, set the ->expmask bits up the rcu_node tree and also
620 * set the ->expmask bits on the leaf rcu_node structures to tell phase 2
621 * that work is needed here.
672 * 622 *
673 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude 623 * Caller must hold sync_rcu_preempt_exp_mutex.
674 * CPU hotplug operations.
675 */ 624 */
676static void 625static void
677sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) 626sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
678{ 627{
679 unsigned long flags; 628 unsigned long flags;
680 int must_wait = 0; 629 unsigned long mask;
630 struct rcu_node *rnp_up;
681 631
682 raw_spin_lock_irqsave(&rnp->lock, flags); 632 raw_spin_lock_irqsave(&rnp->lock, flags);
683 smp_mb__after_unlock_lock(); 633 smp_mb__after_unlock_lock();
634 WARN_ON_ONCE(rnp->expmask);
635 WARN_ON_ONCE(rnp->exp_tasks);
684 if (!rcu_preempt_has_tasks(rnp)) { 636 if (!rcu_preempt_has_tasks(rnp)) {
637 /* No blocked tasks, nothing to do. */
685 raw_spin_unlock_irqrestore(&rnp->lock, flags); 638 raw_spin_unlock_irqrestore(&rnp->lock, flags);
686 } else { 639 return;
640 }
641 /* Call for Phase 2 and propagate ->expmask bits up the tree. */
642 rnp->expmask = 1;
643 rnp_up = rnp;
644 while (rnp_up->parent) {
645 mask = rnp_up->grpmask;
646 rnp_up = rnp_up->parent;
647 if (rnp_up->expmask & mask)
648 break;
649 raw_spin_lock(&rnp_up->lock); /* irqs already off */
650 smp_mb__after_unlock_lock();
651 rnp_up->expmask |= mask;
652 raw_spin_unlock(&rnp_up->lock); /* irqs still off */
653 }
654 raw_spin_unlock_irqrestore(&rnp->lock, flags);
655}
656
657/*
658 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
659 * grace period for the specified rcu_node structure, phase 2. If the
660 * leaf rcu_node structure has its ->expmask field set, check for tasks.
661 * If there are some, clear ->expmask and set ->exp_tasks accordingly,
662 * then initiate RCU priority boosting. Otherwise, clear ->expmask and
663 * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
664 * enabling rcu_read_unlock_special() to do the bit-clearing.
665 *
666 * Caller must hold sync_rcu_preempt_exp_mutex.
667 */
668static void
669sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
670{
671 unsigned long flags;
672
673 raw_spin_lock_irqsave(&rnp->lock, flags);
674 smp_mb__after_unlock_lock();
675 if (!rnp->expmask) {
676 /* Phase 1 didn't do anything, so Phase 2 doesn't either. */
677 raw_spin_unlock_irqrestore(&rnp->lock, flags);
678 return;
679 }
680
681 /* Phase 1 is over. */
682 rnp->expmask = 0;
683
684 /*
685 * If there are still blocked tasks, set up ->exp_tasks so that
686 * rcu_read_unlock_special() will wake us and then boost them.
687 */
688 if (rcu_preempt_has_tasks(rnp)) {
687 rnp->exp_tasks = rnp->blkd_tasks.next; 689 rnp->exp_tasks = rnp->blkd_tasks.next;
688 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ 690 rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
689 must_wait = 1; 691 return;
690 } 692 }
691 if (!must_wait) 693
692 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ 694 /* No longer any blocked tasks, so undo bit setting. */
695 raw_spin_unlock_irqrestore(&rnp->lock, flags);
696 rcu_report_exp_rnp(rsp, rnp, false);
693} 697}
694 698
695/** 699/**
@@ -706,7 +710,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
706 */ 710 */
707void synchronize_rcu_expedited(void) 711void synchronize_rcu_expedited(void)
708{ 712{
709 unsigned long flags;
710 struct rcu_node *rnp; 713 struct rcu_node *rnp;
711 struct rcu_state *rsp = &rcu_preempt_state; 714 struct rcu_state *rsp = &rcu_preempt_state;
712 unsigned long snap; 715 unsigned long snap;
@@ -757,19 +760,16 @@ void synchronize_rcu_expedited(void)
757 /* force all RCU readers onto ->blkd_tasks lists. */ 760 /* force all RCU readers onto ->blkd_tasks lists. */
758 synchronize_sched_expedited(); 761 synchronize_sched_expedited();
759 762
760 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 763 /*
761 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 764 * Snapshot current state of ->blkd_tasks lists into ->expmask.
762 raw_spin_lock_irqsave(&rnp->lock, flags); 765 * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special()
763 smp_mb__after_unlock_lock(); 766 * to start clearing them. Doing this in one phase leads to
764 rnp->expmask = rnp->qsmaskinit; 767 * strange races between setting and clearing bits, so just say "no"!
765 raw_spin_unlock_irqrestore(&rnp->lock, flags); 768 */
766 } 769 rcu_for_each_leaf_node(rsp, rnp)
767 770 sync_rcu_preempt_exp_init1(rsp, rnp);
768 /* Snapshot current state of ->blkd_tasks lists. */
769 rcu_for_each_leaf_node(rsp, rnp) 771 rcu_for_each_leaf_node(rsp, rnp)
770 sync_rcu_preempt_exp_init(rsp, rnp); 772 sync_rcu_preempt_exp_init2(rsp, rnp);
771 if (NUM_RCU_NODES > 1)
772 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
773 773
774 put_online_cpus(); 774 put_online_cpus();
775 775
@@ -859,8 +859,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
859 return 0; 859 return 0;
860} 860}
861 861
862#ifdef CONFIG_HOTPLUG_CPU
863
864/* 862/*
865 * Because there is no preemptible RCU, there can be no readers blocked. 863 * Because there is no preemptible RCU, there can be no readers blocked.
866 */ 864 */
@@ -869,8 +867,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
869 return false; 867 return false;
870} 868}
871 869
872#endif /* #ifdef CONFIG_HOTPLUG_CPU */
873
874/* 870/*
875 * Because preemptible RCU does not exist, we never have to check for 871 * Because preemptible RCU does not exist, we never have to check for
876 * tasks blocked within RCU read-side critical sections. 872 * tasks blocked within RCU read-side critical sections.
@@ -1170,7 +1166,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1170 * Returns zero if all is well, a negated errno otherwise. 1166 * Returns zero if all is well, a negated errno otherwise.
1171 */ 1167 */
1172static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1168static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1173 struct rcu_node *rnp) 1169 struct rcu_node *rnp)
1174{ 1170{
1175 int rnp_index = rnp - &rsp->node[0]; 1171 int rnp_index = rnp - &rsp->node[0];
1176 unsigned long flags; 1172 unsigned long flags;
@@ -1180,7 +1176,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1180 if (&rcu_preempt_state != rsp) 1176 if (&rcu_preempt_state != rsp)
1181 return 0; 1177 return 0;
1182 1178
1183 if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) 1179 if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
1184 return 0; 1180 return 0;
1185 1181
1186 rsp->boost = 1; 1182 rsp->boost = 1;
@@ -1273,7 +1269,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
1273static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) 1269static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1274{ 1270{
1275 struct task_struct *t = rnp->boost_kthread_task; 1271 struct task_struct *t = rnp->boost_kthread_task;
1276 unsigned long mask = rnp->qsmaskinit; 1272 unsigned long mask = rcu_rnp_online_cpus(rnp);
1277 cpumask_var_t cm; 1273 cpumask_var_t cm;
1278 int cpu; 1274 int cpu;
1279 1275
@@ -1945,7 +1941,8 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
1945 rhp = ACCESS_ONCE(rdp->nocb_follower_head); 1941 rhp = ACCESS_ONCE(rdp->nocb_follower_head);
1946 1942
1947 /* Having no rcuo kthread but CBs after scheduler starts is bad! */ 1943 /* Having no rcuo kthread but CBs after scheduler starts is bad! */
1948 if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { 1944 if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
1945 rcu_scheduler_fully_active) {
1949 /* RCU callback enqueued before CPU first came online??? */ 1946 /* RCU callback enqueued before CPU first came online??? */
1950 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", 1947 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
1951 cpu, rhp->func); 1948 cpu, rhp->func);
@@ -2392,18 +2389,8 @@ void __init rcu_init_nohz(void)
2392 pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); 2389 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2393 2390
2394 for_each_rcu_flavor(rsp) { 2391 for_each_rcu_flavor(rsp) {
2395 for_each_cpu(cpu, rcu_nocb_mask) { 2392 for_each_cpu(cpu, rcu_nocb_mask)
2396 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2393 init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu));
2397
2398 /*
2399 * If there are early callbacks, they will need
2400 * to be moved to the nocb lists.
2401 */
2402 WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] !=
2403 &rdp->nxtlist &&
2404 rdp->nxttail[RCU_NEXT_TAIL] != NULL);
2405 init_nocb_callback_list(rdp);
2406 }
2407 rcu_organize_nocb_kthreads(rsp); 2394 rcu_organize_nocb_kthreads(rsp);
2408 } 2395 }
2409} 2396}
@@ -2540,6 +2527,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
2540 if (!rcu_is_nocb_cpu(rdp->cpu)) 2527 if (!rcu_is_nocb_cpu(rdp->cpu))
2541 return false; 2528 return false;
2542 2529
2530 /* If there are early-boot callbacks, move them to nocb lists. */
2531 if (rdp->nxtlist) {
2532 rdp->nocb_head = rdp->nxtlist;
2533 rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL];
2534 atomic_long_set(&rdp->nocb_q_count, rdp->qlen);
2535 atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy);
2536 rdp->nxtlist = NULL;
2537 rdp->qlen = 0;
2538 rdp->qlen_lazy = 0;
2539 }
2543 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2540 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2544 return true; 2541 return true;
2545} 2542}
@@ -2763,7 +2760,8 @@ static void rcu_sysidle_exit(int irq)
2763 2760
2764/* 2761/*
2765 * Check to see if the current CPU is idle. Note that usermode execution 2762 * Check to see if the current CPU is idle. Note that usermode execution
2766 * does not count as idle. The caller must have disabled interrupts. 2763 * does not count as idle. The caller must have disabled interrupts,
2764 * and must be running on tick_do_timer_cpu.
2767 */ 2765 */
2768static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, 2766static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2769 unsigned long *maxj) 2767 unsigned long *maxj)
@@ -2784,8 +2782,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2784 if (!*isidle || rdp->rsp != rcu_state_p || 2782 if (!*isidle || rdp->rsp != rcu_state_p ||
2785 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) 2783 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2786 return; 2784 return;
2787 if (rcu_gp_in_progress(rdp->rsp)) 2785 /* Verify affinity of current kthread. */
2788 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); 2786 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
2789 2787
2790 /* Pick up current idle and NMI-nesting counter and check. */ 2788 /* Pick up current idle and NMI-nesting counter and check. */
2791 cur = atomic_read(&rdtp->dynticks_idle); 2789 cur = atomic_read(&rdtp->dynticks_idle);
@@ -3068,11 +3066,10 @@ static void rcu_bind_gp_kthread(void)
3068 return; 3066 return;
3069#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 3067#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
3070 cpu = tick_do_timer_cpu; 3068 cpu = tick_do_timer_cpu;
3071 if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) 3069 if (cpu >= 0 && cpu < nr_cpu_ids)
3072 set_cpus_allowed_ptr(current, cpumask_of(cpu)); 3070 set_cpus_allowed_ptr(current, cpumask_of(cpu));
3073#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3071#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3074 if (!is_housekeeping_cpu(raw_smp_processor_id())) 3072 housekeeping_affine(current);
3075 housekeeping_affine(current);
3076#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3073#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3077} 3074}
3078 3075
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index fbb6240509ea..f92361efd0f5 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -283,8 +283,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
283 seq_puts(m, "\n"); 283 seq_puts(m, "\n");
284 level = rnp->level; 284 level = rnp->level;
285 } 285 }
286 seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", 286 seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ",
287 rnp->qsmask, rnp->qsmaskinit, 287 rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
288 ".G"[rnp->gp_tasks != NULL], 288 ".G"[rnp->gp_tasks != NULL],
289 ".E"[rnp->exp_tasks != NULL], 289 ".E"[rnp->exp_tasks != NULL],
290 ".T"[!list_empty(&rnp->blkd_tasks)], 290 ".T"[!list_empty(&rnp->blkd_tasks)],
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index e0d31a345ee6..1f133350da01 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,6 +62,63 @@ MODULE_ALIAS("rcupdate");
62 62
63module_param(rcu_expedited, int, 0); 63module_param(rcu_expedited, int, 0);
64 64
65#ifndef CONFIG_TINY_RCU
66
67static atomic_t rcu_expedited_nesting =
68 ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
69
70/*
71 * Should normal grace-period primitives be expedited? Intended for
72 * use within RCU. Note that this function takes the rcu_expedited
73 * sysfs/boot variable into account as well as the rcu_expedite_gp()
74 * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
75 * returns false is a -really- bad idea.
76 */
77bool rcu_gp_is_expedited(void)
78{
79 return rcu_expedited || atomic_read(&rcu_expedited_nesting);
80}
81EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
82
83/**
84 * rcu_expedite_gp - Expedite future RCU grace periods
85 *
86 * After a call to this function, future calls to synchronize_rcu() and
87 * friends act as if the corresponding synchronize_rcu_expedited() function
88 * had instead been called.
89 */
90void rcu_expedite_gp(void)
91{
92 atomic_inc(&rcu_expedited_nesting);
93}
94EXPORT_SYMBOL_GPL(rcu_expedite_gp);
95
96/**
97 * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation
98 *
99 * Undo a prior call to rcu_expedite_gp(). If all prior calls to
100 * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(),
101 * and if the rcu_expedited sysfs/boot parameter is not set, then all
102 * subsequent calls to synchronize_rcu() and friends will return to
103 * their normal non-expedited behavior.
104 */
105void rcu_unexpedite_gp(void)
106{
107 atomic_dec(&rcu_expedited_nesting);
108}
109EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
110
111#endif /* #ifndef CONFIG_TINY_RCU */
112
113/*
114 * Inform RCU of the end of the in-kernel boot sequence.
115 */
116void rcu_end_inkernel_boot(void)
117{
118 if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
119 rcu_unexpedite_gp();
120}
121
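
As a usage illustration only (not part of this patch), a latency-sensitive caller could bracket its work with the new nesting API; the example_* name below is hypothetical:

	/*
	 * Hypothetical caller, for illustration: while the bracket is in
	 * effect, synchronize_rcu() and friends behave like their
	 * _expedited() counterparts.  Calls nest, so each rcu_expedite_gp()
	 * needs a matching rcu_unexpedite_gp().
	 */
	static void example_latency_sensitive_reconfig(void)
	{
		rcu_expedite_gp();	/* start expediting grace periods */
		synchronize_rcu();	/* acts like synchronize_rcu_expedited() */
		rcu_unexpedite_gp();	/* back to normal grace periods */
	}
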
65#ifdef CONFIG_PREEMPT_RCU 122#ifdef CONFIG_PREEMPT_RCU
66 123
67/* 124/*
@@ -199,16 +256,13 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
199 256
200#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 257#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
201 258
202struct rcu_synchronize { 259/**
203 struct rcu_head head; 260 * wakeme_after_rcu() - Callback function to awaken a task after grace period
204 struct completion completion; 261 * @head: Pointer to rcu_head member within rcu_synchronize structure
205}; 262 *
206 263 * Awaken the corresponding task now that a grace period has elapsed.
207/*
208 * Awaken the corresponding synchronize_rcu() instance now that a
209 * grace period has elapsed.
210 */ 264 */
211static void wakeme_after_rcu(struct rcu_head *head) 265void wakeme_after_rcu(struct rcu_head *head)
212{ 266{
213 struct rcu_synchronize *rcu; 267 struct rcu_synchronize *rcu;
214 268
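
For illustration, here is roughly how a caller could use the now-exported wakeme_after_rcu() together with struct rcu_synchronize to wait for a grace period by hand, which is essentially what wait_rcu_gp() does internally; the example_* name is hypothetical:

	static void example_wait_for_one_gp(void)
	{
		struct rcu_synchronize rcu;

		init_rcu_head_on_stack(&rcu.head);	/* debug-objects bookkeeping */
		init_completion(&rcu.completion);
		call_rcu(&rcu.head, wakeme_after_rcu);	/* completes after a GP */
		wait_for_completion(&rcu.completion);	/* sleep until then */
		destroy_rcu_head_on_stack(&rcu.head);
	}
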
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 94b2d7b88a27..b0090accfb5b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -198,6 +198,8 @@ exit_idle:
198 start_critical_timings(); 198 start_critical_timings();
199} 199}
200 200
201DEFINE_PER_CPU(bool, cpu_dead_idle);
202
201/* 203/*
202 * Generic idle loop implementation 204 * Generic idle loop implementation
203 * 205 *
@@ -222,8 +224,13 @@ static void cpu_idle_loop(void)
222 check_pgt_cache(); 224 check_pgt_cache();
223 rmb(); 225 rmb();
224 226
225 if (cpu_is_offline(smp_processor_id())) 227 if (cpu_is_offline(smp_processor_id())) {
228 rcu_cpu_notify(NULL, CPU_DYING_IDLE,
229 (void *)(long)smp_processor_id());
230 smp_mb(); /* all activity before dead. */
231 this_cpu_write(cpu_dead_idle, true);
226 arch_cpu_idle_dead(); 232 arch_cpu_idle_dead();
233 }
227 234
228 local_irq_disable(); 235 local_irq_disable();
229 arch_cpu_idle_enter(); 236 arch_cpu_idle_enter();
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 40190f28db35..c697f73d82d6 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -4,6 +4,7 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/smp.h> 6#include <linux/smp.h>
7#include <linux/delay.h>
7#include <linux/init.h> 8#include <linux/init.h>
8#include <linux/list.h> 9#include <linux/list.h>
9#include <linux/slab.h> 10#include <linux/slab.h>
@@ -314,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
314 put_online_cpus(); 315 put_online_cpus();
315} 316}
316EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); 317EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
318
319static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
320
321/*
322 * Called to poll specified CPU's state, for example, when waiting for
323 * a CPU to come online.
324 */
325int cpu_report_state(int cpu)
326{
327 return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
328}
329
330/*
331 * If CPU has died properly, set its state to CPU_UP_PREPARE and
332 * return success. Otherwise, return -EBUSY if the CPU died after
333 * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN
334 * if cpu_wait_death() timed out and the CPU still hasn't gotten around
335 * to dying. In the latter two cases, the CPU might not be set up
336 * properly, but it is up to the arch-specific code to decide.
337 * Finally, -EIO indicates an unanticipated problem.
338 *
339 * Note that it is permissible to omit this call entirely, as is
340 * done in architectures that do no CPU-hotplug error checking.
341 */
342int cpu_check_up_prepare(int cpu)
343{
344 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
345 atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
346 return 0;
347 }
348
349 switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {
350
351 case CPU_POST_DEAD:
352
353 /* The CPU died properly, so just start it up again. */
354 atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
355 return 0;
356
357 case CPU_DEAD_FROZEN:
358
359 /*
360 * Timeout during CPU death, so let caller know.
361	 * The outgoing CPU completed its processing, but only after
362	 * cpu_wait_death() had timed out and reported the error. The
363 * caller is free to proceed, in which case the state
364 * will be reset properly by cpu_set_state_online().
365 * Proceeding despite this -EBUSY return makes sense
366 * for systems where the outgoing CPUs take themselves
367 * offline, with no post-death manipulation required from
368 * a surviving CPU.
369 */
370 return -EBUSY;
371
372 case CPU_BROKEN:
373
374 /*
375 * The most likely reason we got here is that there was
376 * a timeout during CPU death, and the outgoing CPU never
377 * did complete its processing. This could happen on
378 * a virtualized system if the outgoing VCPU gets preempted
379 * for more than five seconds, and the user attempts to
380 * immediately online that same CPU. Trying again later
381 * might return -EBUSY above, hence -EAGAIN.
382 */
383 return -EAGAIN;
384
385 default:
386
387 /* Should not happen. Famous last words. */
388 return -EIO;
389 }
390}
391
392/*
393 * Mark the specified CPU online.
394 *
395 * Note that it is permissible to omit this call entirely, as is
396 * done in architectures that do no CPU-hotplug error checking.
397 */
398void cpu_set_state_online(int cpu)
399{
400 (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
401}
402
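
A hypothetical arch-side sketch of the bring-up half of this state machine (only cpu_check_up_prepare() and cpu_set_state_online() come from this patch; the example_* helpers stand in for arch-specific code):

	int example_arch_cpu_up(unsigned int cpu, struct task_struct *idle)
	{
		int ret = cpu_check_up_prepare(cpu);

		if (ret && ret != -EBUSY)	/* -EAGAIN/-EIO: leave this CPU alone */
			return ret;
		/* -EBUSY may be ignored if outgoing CPUs offline themselves. */
		return example_kick_secondary(cpu, idle);	/* arch-specific boot */
	}

	void example_arch_secondary_start(void)
	{
		/* Runs on the incoming CPU once it is alive. */
		cpu_set_state_online(smp_processor_id());
		/* ... remainder of the usual secondary-CPU startup ... */
	}
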
403#ifdef CONFIG_HOTPLUG_CPU
404
405/*
406 * Wait for the specified CPU to exit the idle loop and die.
407 */
408bool cpu_wait_death(unsigned int cpu, int seconds)
409{
410 int jf_left = seconds * HZ;
411 int oldstate;
412 bool ret = true;
413 int sleep_jf = 1;
414
415 might_sleep();
416
417 /* The outgoing CPU will normally get done quite quickly. */
418 if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
419 goto update_state;
420 udelay(5);
421
422 /* But if the outgoing CPU dawdles, wait increasingly long times. */
423 while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
424 schedule_timeout_uninterruptible(sleep_jf);
425 jf_left -= sleep_jf;
426 if (jf_left <= 0)
427 break;
428 sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
429 }
430update_state:
431 oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
432 if (oldstate == CPU_DEAD) {
433 /* Outgoing CPU died normally, update state. */
434 smp_mb(); /* atomic_read() before update. */
435 atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
436 } else {
437 /* Outgoing CPU still hasn't died, set state accordingly. */
438 if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
439 oldstate, CPU_BROKEN) != oldstate)
440 goto update_state;
441 ret = false;
442 }
443 return ret;
444}
445
446/*
447 * Called by the outgoing CPU to report its successful death. Return
448 * false if this report follows the surviving CPU's timing out.
449 *
450 * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
451 * timed out. This approach allows architectures to omit calls to
452 * cpu_check_up_prepare() and cpu_set_state_online() without defeating
453 * the next cpu_wait_death()'s polling loop.
454 */
455bool cpu_report_death(void)
456{
457 int oldstate;
458 int newstate;
459 int cpu = smp_processor_id();
460
461 do {
462 oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
463 if (oldstate != CPU_BROKEN)
464 newstate = CPU_DEAD;
465 else
466 newstate = CPU_DEAD_FROZEN;
467 } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
468 oldstate, newstate) != oldstate);
469 return newstate == CPU_DEAD;
470}
471
472#endif /* #ifdef CONFIG_HOTPLUG_CPU */
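
And the tear-down half, again as a hypothetical arch-side sketch (only cpu_wait_death() and cpu_report_death() come from this patch; the rest is illustrative):

	int example_arch_cpu_die(unsigned int cpu)	/* runs on a surviving CPU */
	{
		if (!cpu_wait_death(cpu, 5)) {		/* wait up to five seconds */
			pr_err("CPU%u: failed to report death\n", cpu);
			return -EBUSY;
		}
		return 0;
	}

	void example_arch_play_dead(void)		/* runs on the outgoing CPU */
	{
		(void)cpu_report_death();	/* tell the survivor we are done */
		/* ... arch-specific low-power loop or power-off ... */
	}
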
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c5cefb3c009c..1ad74c0df01f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1180,16 +1180,7 @@ config DEBUG_CREDENTIALS
1180menu "RCU Debugging" 1180menu "RCU Debugging"
1181 1181
1182config PROVE_RCU 1182config PROVE_RCU
1183 bool "RCU debugging: prove RCU correctness" 1183 def_bool PROVE_LOCKING
1184 depends on PROVE_LOCKING
1185 default n
1186 help
1187 This feature enables lockdep extensions that check for correct
1188 use of RCU APIs. This is currently under development. Say Y
1189 if you want to debug RCU usage or help work on the PROVE_RCU
1190 feature.
1191
1192 Say N if you are unsure.
1193 1184
1194config PROVE_RCU_REPEATEDLY 1185config PROVE_RCU_REPEATEDLY
1195 bool "RCU debugging: don't disable PROVE_RCU on first splat" 1186 bool "RCU debugging: don't disable PROVE_RCU on first splat"
@@ -1257,6 +1248,30 @@ config RCU_TORTURE_TEST_RUNNABLE
1257 Say N here if you want the RCU torture tests to start only 1248 Say N here if you want the RCU torture tests to start only
1258 after being manually enabled via /proc. 1249 after being manually enabled via /proc.
1259 1250
1251config RCU_TORTURE_TEST_SLOW_INIT
1252 bool "Slow down RCU grace-period initialization to expose races"
1253 depends on RCU_TORTURE_TEST
1254 help
1255 This option makes grace-period initialization block for a
1256 few jiffies between initializing each pair of consecutive
1257 rcu_node structures. This helps to expose races involving
1258 grace-period initialization, in other words, it makes your
1259 kernel less stable. It can also greatly increase grace-period
1260 latency, especially on systems with large numbers of CPUs.
1261 This is useful when torture-testing RCU, but in almost no
1262 other circumstance.
1263
1264 Say Y here if you want your system to crash and hang more often.
1265 Say N if you want a sane system.
1266
1267config RCU_TORTURE_TEST_SLOW_INIT_DELAY
1268 int "How much to slow down RCU grace-period initialization"
1269 range 0 5
1270 default 3
1271 help
1272 This option specifies the number of jiffies to wait between
1273 each rcu_node structure initialization.
1274
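
For illustration, the kind of per-step delay this option enables might look like the sketch below; the helper name is hypothetical and the exact guard used elsewhere in this series may differ:

	/* Illustrative only: one delay step during grace-period init. */
	static void example_gp_init_delay_step(void)
	{
	#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
		/* Block for the Kconfig-selected number of jiffies. */
		schedule_timeout_uninterruptible(
				CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY);
	#endif
	}
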
1260config RCU_CPU_STALL_TIMEOUT 1275config RCU_CPU_STALL_TIMEOUT
1261 int "RCU CPU stall timeout in seconds" 1276 int "RCU CPU stall timeout in seconds"
1262 depends on RCU_STALL_COMMON 1277 depends on RCU_STALL_COMMON
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 368d64ac779e..dd2812ceb0ba 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -310,7 +310,7 @@ function dump(first, pastlast)
310 cfr[jn] = cf[j] "." cfrep[cf[j]]; 310 cfr[jn] = cf[j] "." cfrep[cf[j]];
311 } 311 }
312 if (cpusr[jn] > ncpus && ncpus != 0) 312 if (cpusr[jn] > ncpus && ncpus != 0)
313 ovf = "(!)"; 313 ovf = "-ovf";
314 else 314 else
315 ovf = ""; 315 ovf = "";
316 print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. `date`"; 316 print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. `date`";
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
index d2d2a86139db..49701218dc62 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
@@ -1,2 +1,3 @@
1CONFIG_RCU_TORTURE_TEST=y 1CONFIG_RCU_TORTURE_TEST=y
2CONFIG_PRINTK_TIME=y 2CONFIG_PRINTK_TIME=y
3CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y