author     Ingo Molnar <mingo@kernel.org>  2017-01-16 01:45:44 -0500
committer  Ingo Molnar <mingo@kernel.org>  2017-01-16 01:45:44 -0500
commit     3e4f7a4956e54143f7fc15c636158ad4166d219d (patch)
tree       c286358c42c47328a12f523878e8cc9fa022f9f9
parent     f4d3935e4f4884ba80561db5549394afb8eef8f7 (diff)
parent     52d7e48b86fc108e45a656d8e53e4237993c481d (diff)
Merge branch 'rcu/urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu into rcu/urgent
Pull an urgent RCU fix from Paul E. McKenney:

 "This series contains a pair of commits that permit RCU synchronous grace periods (synchronize_rcu() and friends) to work correctly throughout boot. This eliminates the current "dead time" starting when the scheduler spawns its first task and ending when the last of RCU's kthreads is spawned (this last happens during early_initcall() time). Although RCU's synchronous grace periods have long been documented as not working during this time, prior to 4.9, the expedited grace periods worked by accident, and some ACPI code came to rely on this unintentional behavior. (Note that this unintentional behavior was -not- reliable. For example, failures from ACPI could occur on !SMP systems and on systems booting with the rcu_normal kernel boot parameter.)

 Either way, there is a bug that needs fixing, and the 4.9 switch of RCU's expedited grace periods to workqueues could be considered to have caused a regression. This series therefore makes RCU's expedited grace periods operate correctly throughout the boot process. This has been demonstrated to fix the problems ACPI was encountering, and has the added longer-term benefit of simplifying RCU's behavior."

Signed-off-by: Ingo Molnar <mingo@kernel.org>
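[Illustrative note, not part of the patch] The series below amounts to a three-phase state machine in rcu_scheduler_active: RCU_SCHEDULER_INACTIVE (only one task exists, so a synchronous grace period is a no-op), RCU_SCHEDULER_INIT (mid-boot, the requesting task drives an expedited grace period itself), and RCU_SCHEDULER_RUNNING (normal workqueue-based operation). The stand-alone C sketch that follows models only that dispatch decision; the names drive_expedited_gp(), queue_expedited_gp(), and synchronize_rcu_expedited_sketch() are placeholders invented for this illustration, not kernel functions.

#include <stdio.h>

#define RCU_SCHEDULER_INACTIVE	0	/* Before the first task is spawned. */
#define RCU_SCHEDULER_INIT	1	/* Mid-boot: caller drives the GP.   */
#define RCU_SCHEDULER_RUNNING	2	/* Full runtime: use a workqueue.    */

static int rcu_scheduler_active = RCU_SCHEDULER_INACTIVE;

/* Placeholder standing in for the patch's rcu_exp_sel_wait_wake() path. */
static void drive_expedited_gp(void)
{
	printf("  mid-boot: requesting task drives the expedited GP itself\n");
}

/* Placeholder standing in for INIT_WORK_ONSTACK()/schedule_work(). */
static void queue_expedited_gp(void)
{
	printf("  runtime: expedited GP handed off to a workqueue\n");
}

static void synchronize_rcu_expedited_sketch(void)
{
	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
		return;			/* Single task: grace period is a no-op. */
	if (rcu_scheduler_active == RCU_SCHEDULER_INIT)
		drive_expedited_gp();	/* Workqueues/kthreads not yet usable.   */
	else
		queue_expedited_gp();	/* Normal post-boot path.                */
}

int main(void)
{
	static const char *const phase[] = { "INACTIVE", "INIT", "RUNNING" };

	for (int p = RCU_SCHEDULER_INACTIVE; p <= RCU_SCHEDULER_RUNNING; p++) {
		rcu_scheduler_active = p;
		printf("phase RCU_SCHEDULER_%s:\n", phase[p]);
		synchronize_rcu_expedited_sketch();
	}
	return 0;
}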
-rw-r--r--  include/linux/rcupdate.h  |  4
-rw-r--r--  kernel/rcu/rcu.h          |  1
-rw-r--r--  kernel/rcu/tiny.c         |  4
-rw-r--r--  kernel/rcu/tiny_plugin.h  |  9
-rw-r--r--  kernel/rcu/tree.c         | 33
-rw-r--r--  kernel/rcu/tree_exp.h     | 52
-rw-r--r--  kernel/rcu/tree_plugin.h  |  2
-rw-r--r--  kernel/rcu/update.c       | 38
8 files changed, 104 insertions, 39 deletions
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 321f9ed552a9..01f71e1d2e94 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -444,6 +444,10 @@ bool __rcu_is_watching(void);
 #error "Unknown RCU implementation specified to kernel configuration"
 #endif
 
+#define RCU_SCHEDULER_INACTIVE	0
+#define RCU_SCHEDULER_INIT	1
+#define RCU_SCHEDULER_RUNNING	2
+
 /*
  * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
  * initialization and destruction of rcu_head on the stack. rcu_head structures
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 80adef7d4c3d..0d6ff3e471be 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -136,6 +136,7 @@ int rcu_jiffies_till_stall_check(void);
 #define TPS(x)	tracepoint_string(x)
 
 void rcu_early_boot_tests(void);
+void rcu_test_sync_prims(void);
 
 /*
  * This function really isn't for public consumption, but RCU is special in
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 1898559e6b60..b23a4d076f3d 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -185,9 +185,6 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused
  * benefits of doing might_sleep() to reduce latency.)
  *
  * Cool, huh? (Due to Josh Triplett.)
- *
- * But we want to make this a static inline later. The cond_resched()
- * currently makes this problematic.
  */
 void synchronize_sched(void)
 {
@@ -195,7 +192,6 @@ void synchronize_sched(void)
 			 lock_is_held(&rcu_lock_map) ||
 			 lock_is_held(&rcu_sched_lock_map),
 			 "Illegal synchronize_sched() in RCU read-side critical section");
-	cond_resched();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched);
 
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 196f0302e2f4..c64b827ecbca 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -60,12 +60,17 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
 
 /*
  * During boot, we forgive RCU lockdep issues. After this function is
- * invoked, we start taking RCU lockdep issues seriously.
+ * invoked, we start taking RCU lockdep issues seriously. Note that unlike
+ * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
+ * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
+ * The reason for this is that Tiny RCU does not need kthreads, so does
+ * not have to care about the fact that the scheduler is half-initialized
+ * at a certain phase of the boot process.
  */
 void __init rcu_scheduler_starting(void)
 {
 	WARN_ON(nr_context_switches() > 0);
-	rcu_scheduler_active = 1;
+	rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
 }
 
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 96c52e43f7ca..cb4e2056ccf3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -127,13 +127,16 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
 int sysctl_panic_on_rcu_stall __read_mostly;
 
 /*
- * The rcu_scheduler_active variable transitions from zero to one just
- * before the first task is spawned. So when this variable is zero, RCU
- * can assume that there is but one task, allowing RCU to (for example)
+ * The rcu_scheduler_active variable is initialized to the value
+ * RCU_SCHEDULER_INACTIVE and transitions RCU_SCHEDULER_INIT just before the
+ * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE,
+ * RCU can assume that there is but one task, allowing RCU to (for example)
  * optimize synchronize_rcu() to a simple barrier(). When this variable
- * is one, RCU must actually do all the hard work required to detect real
- * grace periods. This variable is also used to suppress boot-time false
- * positives from lockdep-RCU error checking.
+ * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
+ * to detect real grace periods. This variable is also used to suppress
+ * boot-time false positives from lockdep-RCU error checking. Finally, it
+ * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
+ * is fully initialized, including all of its kthreads having been spawned.
  */
 int rcu_scheduler_active __read_mostly;
 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
@@ -3980,18 +3983,22 @@ static int __init rcu_spawn_gp_kthread(void)
 early_initcall(rcu_spawn_gp_kthread);
 
 /*
- * This function is invoked towards the end of the scheduler's initialization
- * process. Before this is called, the idle task might contain
- * RCU read-side critical sections (during which time, this idle
- * task is booting the system). After this function is called, the
- * idle tasks are prohibited from containing RCU read-side critical
- * sections. This function also enables RCU lockdep checking.
+ * This function is invoked towards the end of the scheduler's
+ * initialization process. Before this is called, the idle task might
+ * contain synchronous grace-period primitives (during which time, this idle
+ * task is booting the system, and such primitives are no-ops). After this
+ * function is called, any synchronous grace-period primitives are run as
+ * expedited, with the requesting task driving the grace period forward.
+ * A later core_initcall() rcu_exp_runtime_mode() will switch to full
+ * runtime RCU functionality.
  */
 void rcu_scheduler_starting(void)
 {
 	WARN_ON(num_online_cpus() != 1);
 	WARN_ON(nr_context_switches() > 0);
-	rcu_scheduler_active = 1;
+	rcu_test_sync_prims();
+	rcu_scheduler_active = RCU_SCHEDULER_INIT;
+	rcu_test_sync_prims();
 }
 
 /*
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index d3053e99fdb6..e59e1849b89a 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -532,18 +532,28 @@ struct rcu_exp_work {
 };
 
 /*
+ * Common code to drive an expedited grace period forward, used by
+ * workqueues and mid-boot-time tasks.
+ */
+static void rcu_exp_sel_wait_wake(struct rcu_state *rsp,
+				  smp_call_func_t func, unsigned long s)
+{
+	/* Initialize the rcu_node tree in preparation for the wait. */
+	sync_rcu_exp_select_cpus(rsp, func);
+
+	/* Wait and clean up, including waking everyone. */
+	rcu_exp_wait_wake(rsp, s);
+}
+
+/*
  * Work-queue handler to drive an expedited grace period forward.
  */
 static void wait_rcu_exp_gp(struct work_struct *wp)
 {
 	struct rcu_exp_work *rewp;
 
-	/* Initialize the rcu_node tree in preparation for the wait. */
 	rewp = container_of(wp, struct rcu_exp_work, rew_work);
-	sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func);
-
-	/* Wait and clean up, including waking everyone. */
-	rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s);
+	rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s);
 }
 
 /*
@@ -569,12 +579,18 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
 	if (exp_funnel_lock(rsp, s))
 		return; /* Someone else did our work for us. */
 
-	/* Marshall arguments and schedule the expedited grace period. */
-	rew.rew_func = func;
-	rew.rew_rsp = rsp;
-	rew.rew_s = s;
-	INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
-	schedule_work(&rew.rew_work);
+	/* Ensure that load happens before action based on it. */
+	if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
+		/* Direct call during scheduler init and early_initcalls(). */
+		rcu_exp_sel_wait_wake(rsp, func, s);
+	} else {
+		/* Marshall arguments & schedule the expedited grace period. */
+		rew.rew_func = func;
+		rew.rew_rsp = rsp;
+		rew.rew_s = s;
+		INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
+		schedule_work(&rew.rew_work);
+	}
 
 	/* Wait for expedited grace period to complete. */
 	rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
@@ -676,6 +692,8 @@ void synchronize_rcu_expedited(void)
 {
 	struct rcu_state *rsp = rcu_state_p;
 
+	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+		return;
 	_synchronize_rcu_expedited(rsp, sync_rcu_exp_handler);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
@@ -693,3 +711,15 @@ void synchronize_rcu_expedited(void)
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+/*
+ * Switch to run-time mode once Tree RCU has fully initialized.
+ */
+static int __init rcu_exp_runtime_mode(void)
+{
+	rcu_test_sync_prims();
+	rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+	rcu_test_sync_prims();
+	return 0;
+}
+core_initcall(rcu_exp_runtime_mode);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 85c5a883c6e3..56583e764ebf 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -670,7 +670,7 @@ void synchronize_rcu(void)
 			 lock_is_held(&rcu_lock_map) ||
 			 lock_is_held(&rcu_sched_lock_map),
 			 "Illegal synchronize_rcu() in RCU read-side critical section");
-	if (!rcu_scheduler_active)
+	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
 		return;
 	if (rcu_gp_is_expedited())
 		synchronize_rcu_expedited();
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f19271dce0a9..4f6db7e6a117 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -121,11 +121,14 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
  * Should expedited grace-period primitives always fall back to their
  * non-expedited counterparts? Intended for use within RCU. Note
  * that if the user specifies both rcu_expedited and rcu_normal, then
- * rcu_normal wins.
+ * rcu_normal wins. (Except during the time period during boot from
+ * when the first task is spawned until the rcu_exp_runtime_mode()
+ * core_initcall() is invoked, at which point everything is expedited.)
  */
 bool rcu_gp_is_normal(void)
 {
-	return READ_ONCE(rcu_normal);
+	return READ_ONCE(rcu_normal) &&
+	       rcu_scheduler_active != RCU_SCHEDULER_INIT;
 }
 EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
 
@@ -135,13 +138,14 @@ static atomic_t rcu_expedited_nesting =
 /*
  * Should normal grace-period primitives be expedited? Intended for
  * use within RCU. Note that this function takes the rcu_expedited
- * sysfs/boot variable into account as well as the rcu_expedite_gp()
- * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
- * returns false is a -really- bad idea.
+ * sysfs/boot variable and rcu_scheduler_active into account as well
+ * as the rcu_expedite_gp() nesting. So looping on rcu_unexpedite_gp()
+ * until rcu_gp_is_expedited() returns false is a -really- bad idea.
  */
 bool rcu_gp_is_expedited(void)
 {
-	return rcu_expedited || atomic_read(&rcu_expedited_nesting);
+	return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
+	       rcu_scheduler_active == RCU_SCHEDULER_INIT;
 }
 EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
 
@@ -257,7 +261,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
 
 int notrace debug_lockdep_rcu_enabled(void)
 {
-	return rcu_scheduler_active && debug_locks &&
+	return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
 	       current->lockdep_recursion == 0;
 }
 EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
@@ -591,7 +595,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
 void synchronize_rcu_tasks(void)
 {
 	/* Complain if the scheduler has not started. */
-	RCU_LOCKDEP_WARN(!rcu_scheduler_active,
+	RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
 			 "synchronize_rcu_tasks called too soon");
 
 	/* Wait for the grace period. */
@@ -813,6 +817,23 @@ static void rcu_spawn_tasks_kthread(void)
 
 #endif /* #ifdef CONFIG_TASKS_RCU */
 
+/*
+ * Test each non-SRCU synchronous grace-period wait API. This is
+ * useful just after a change in mode for these primitives, and
+ * during early boot.
+ */
+void rcu_test_sync_prims(void)
+{
+	if (!IS_ENABLED(CONFIG_PROVE_RCU))
+		return;
+	synchronize_rcu();
+	synchronize_rcu_bh();
+	synchronize_sched();
+	synchronize_rcu_expedited();
+	synchronize_rcu_bh_expedited();
+	synchronize_sched_expedited();
+}
+
 #ifdef CONFIG_PROVE_RCU
 
 /*
@@ -865,6 +886,7 @@ void rcu_early_boot_tests(void)
 		early_boot_test_call_rcu_bh();
 	if (rcu_self_test_sched)
 		early_boot_test_call_rcu_sched();
+	rcu_test_sync_prims();
 }
 
 static int rcu_verify_early_boot_tests(void)