path: root/kernel/rcu
author     Paul E. McKenney <paulmck@linux.vnet.ibm.com>    2017-01-10 05:28:26 -0500
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>  2017-01-26 02:24:37 -0500
commit     90687fc3c8c386a16326089d68cf616b8049440f (patch)
tree       0ab423adffc0ff7731fbc1e858b0cf20ac537ce8 /kernel/rcu
parent     bdeaa468e22171497b289c2f31d20ab6b318d53c (diff)
rcu: Narrow early boot window of illegal synchronous grace periods
commit 52d7e48b86fc108e45a656d8e53e4237993c481d upstream.

The current preemptible RCU implementation goes through three phases during
bootup.  In the first phase, there is only one CPU that is running with
preemption disabled, so that a no-op is a synchronous grace period.  In the
second mid-boot phase, the scheduler is running, but RCU has not yet gotten
its kthreads spawned (and, for expedited grace periods, workqueues are not
yet running).  During this time, any attempt to do a synchronous grace period
will hang the system (or complain bitterly, depending).  In the third and
final phase, RCU is fully operational and everything works normally.

This has been OK for some time, but there have recently been some synchronous
grace periods showing up during the second mid-boot phase.  This code worked
"by accident" for a while, but started failing as soon as expedited RCU grace
periods switched over to workqueues in commit 8b355e3bc140 ("rcu: Drive
expedited grace periods from workqueue").  Note that the code was buggy even
before this commit, as it was subject to failure on real-time systems that
forced all expedited grace periods to run as normal grace periods (for
example, using the rcu_normal ksysfs parameter).  The callchain from the
failure case is as follows:

early_amd_iommu_init()
|-> acpi_put_table(ivrs_base);
|-> acpi_tb_put_table(table_desc);
|-> acpi_tb_invalidate_table(table_desc);
|-> acpi_tb_release_table(...)
|-> acpi_os_unmap_memory
|-> acpi_os_unmap_iomem
|-> acpi_os_map_cleanup
|-> synchronize_rcu_expedited

The kernel showing this callchain was built with CONFIG_PREEMPT_RCU=y, which
caused the code to try using workqueues before they were initialized, which
did not go well.

This commit therefore reworks RCU to permit synchronous grace periods to
proceed during this mid-boot phase.  This commit is therefore a fix to a
regression introduced in v4.9, and is therefore being put forward
post-merge-window in v4.10.

This commit sets a flag from the existing rcu_scheduler_starting() function
which causes all synchronous grace periods to take the expedited path.  The
expedited path now checks this flag, using the requesting task to drive the
expedited grace period forward during the mid-boot phase.  Finally, this flag
is updated by a core_initcall() function named rcu_exp_runtime_mode(), which
causes the runtime codepaths to be used.

Note that this arrangement assumes that tasks are not sent POSIX signals (or
anything similar) from the time that the first task is spawned through
core_initcall() time.

Fixes: 8b355e3bc140 ("rcu: Drive expedited grace periods from workqueue")
Reported-by: "Zheng, Lv" <lv.zheng@intel.com>
Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Stan Kain <stan.kain@gmail.com>
Tested-by: Ivan <waffolz@hotmail.com>
Tested-by: Emanuel Castelo <emanuel.castelo@gmail.com>
Tested-by: Bruno Pesavento <bpesavento@infinito.it>
Tested-by: Borislav Petkov <bp@suse.de>
Tested-by: Frederic Bezies <fredbezies@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
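For orientation before the hunks below, the sketch that follows condenses how a
synchronous grace-period request is dispatched in each of the three boot phases
once this patch is applied.  It is an illustrative reading of the diff, not
additional kernel code; the numeric values of the RCU_SCHEDULER_* constants are
assumptions (they come from include/linux/rcupdate.h, not from this diff).

/* Assumed phase constants (defined in include/linux/rcupdate.h): */
#define RCU_SCHEDULER_INACTIVE  0  /* Phase 1: one task, preemption disabled.  */
#define RCU_SCHEDULER_INIT      1  /* Phase 2: scheduler up, no kthreads/wqs.  */
#define RCU_SCHEDULER_RUNNING   2  /* Phase 3: RCU fully operational.          */

/* Condensed dispatch, as implemented by the tree_exp.h hunks below: */
void synchronize_rcu_expedited(void)            /* CONFIG_PREEMPT_RCU=y build */
{
        if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
                return;         /* Phase 1: a no-op is a grace period. */
        _synchronize_rcu_expedited(rcu_state_p, sync_rcu_exp_handler);
}

/* ...and inside _synchronize_rcu_expedited(), after exp_funnel_lock(): */
        if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
                /* Phase 2: workqueues are not ready, so the requesting
                 * task drives the expedited grace period itself. */
                rcu_exp_sel_wait_wake(rsp, func, s);
        } else {
                /* Phase 3: hand off to a workqueue, as before this patch. */
                INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
                schedule_work(&rew.rew_work);
        }

The key design point is that the mid-boot path reuses the same
rcu_exp_sel_wait_wake() helper that the workqueue handler uses, so the only
difference between phase 2 and phase 3 is which task drives the grace period.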
Diffstat (limited to 'kernel/rcu')
-rw-r--r--  kernel/rcu/rcu.h           1
-rw-r--r--  kernel/rcu/tiny_plugin.h   9
-rw-r--r--  kernel/rcu/tree.c         33
-rw-r--r--  kernel/rcu/tree_exp.h     52
-rw-r--r--  kernel/rcu/tree_plugin.h   2
-rw-r--r--  kernel/rcu/update.c       38
6 files changed, 100 insertions, 35 deletions
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 80adef7d4c3d..0d6ff3e471be 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -136,6 +136,7 @@ int rcu_jiffies_till_stall_check(void);
 #define TPS(x) tracepoint_string(x)
 
 void rcu_early_boot_tests(void);
+void rcu_test_sync_prims(void);
 
 /*
  * This function really isn't for public consumption, but RCU is special in
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 196f0302e2f4..c64b827ecbca 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -60,12 +60,17 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
 
 /*
  * During boot, we forgive RCU lockdep issues.  After this function is
- * invoked, we start taking RCU lockdep issues seriously.
+ * invoked, we start taking RCU lockdep issues seriously.  Note that unlike
+ * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
+ * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
+ * The reason for this is that Tiny RCU does not need kthreads, so does
+ * not have to care about the fact that the scheduler is half-initialized
+ * at a certain phase of the boot process.
  */
 void __init rcu_scheduler_starting(void)
 {
        WARN_ON(nr_context_switches() > 0);
-       rcu_scheduler_active = 1;
+       rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
 }
 
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 69a5611a7e7c..10f62c6f48e7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -127,13 +127,16 @@ int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
 int sysctl_panic_on_rcu_stall __read_mostly;
 
 /*
- * The rcu_scheduler_active variable transitions from zero to one just
- * before the first task is spawned.  So when this variable is zero, RCU
- * can assume that there is but one task, allowing RCU to (for example)
+ * The rcu_scheduler_active variable is initialized to the value
+ * RCU_SCHEDULER_INACTIVE and transitions RCU_SCHEDULER_INIT just before the
+ * first task is spawned.  So when this variable is RCU_SCHEDULER_INACTIVE,
+ * RCU can assume that there is but one task, allowing RCU to (for example)
  * optimize synchronize_rcu() to a simple barrier().  When this variable
- * is one, RCU must actually do all the hard work required to detect real
- * grace periods.  This variable is also used to suppress boot-time false
- * positives from lockdep-RCU error checking.
+ * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required
+ * to detect real grace periods.  This variable is also used to suppress
+ * boot-time false positives from lockdep-RCU error checking.  Finally, it
+ * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU
+ * is fully initialized, including all of its kthreads having been spawned.
  */
 int rcu_scheduler_active __read_mostly;
 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
@@ -3985,18 +3988,22 @@ static int __init rcu_spawn_gp_kthread(void)
 early_initcall(rcu_spawn_gp_kthread);
 
 /*
- * This function is invoked towards the end of the scheduler's initialization
- * process.  Before this is called, the idle task might contain
- * RCU read-side critical sections (during which time, this idle
- * task is booting the system).  After this function is called, the
- * idle tasks are prohibited from containing RCU read-side critical
- * sections.  This function also enables RCU lockdep checking.
+ * This function is invoked towards the end of the scheduler's
+ * initialization process.  Before this is called, the idle task might
+ * contain synchronous grace-period primitives (during which time, this idle
+ * task is booting the system, and such primitives are no-ops).  After this
+ * function is called, any synchronous grace-period primitives are run as
+ * expedited, with the requesting task driving the grace period forward.
+ * A later core_initcall() rcu_exp_runtime_mode() will switch to full
+ * runtime RCU functionality.
  */
 void rcu_scheduler_starting(void)
 {
        WARN_ON(num_online_cpus() != 1);
        WARN_ON(nr_context_switches() > 0);
-       rcu_scheduler_active = 1;
+       rcu_test_sync_prims();
+       rcu_scheduler_active = RCU_SCHEDULER_INIT;
+       rcu_test_sync_prims();
 }
 
 /*
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 24343eb87b58..78eba4120d46 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -522,18 +522,28 @@ struct rcu_exp_work {
 };
 
 /*
+ * Common code to drive an expedited grace period forward, used by
+ * workqueues and mid-boot-time tasks.
+ */
+static void rcu_exp_sel_wait_wake(struct rcu_state *rsp,
+                                  smp_call_func_t func, unsigned long s)
+{
+       /* Initialize the rcu_node tree in preparation for the wait. */
+       sync_rcu_exp_select_cpus(rsp, func);
+
+       /* Wait and clean up, including waking everyone. */
+       rcu_exp_wait_wake(rsp, s);
+}
+
+/*
  * Work-queue handler to drive an expedited grace period forward.
  */
 static void wait_rcu_exp_gp(struct work_struct *wp)
 {
        struct rcu_exp_work *rewp;
 
-       /* Initialize the rcu_node tree in preparation for the wait. */
        rewp = container_of(wp, struct rcu_exp_work, rew_work);
-       sync_rcu_exp_select_cpus(rewp->rew_rsp, rewp->rew_func);
-
-       /* Wait and clean up, including waking everyone. */
-       rcu_exp_wait_wake(rewp->rew_rsp, rewp->rew_s);
+       rcu_exp_sel_wait_wake(rewp->rew_rsp, rewp->rew_func, rewp->rew_s);
 }
 
 /*
@@ -559,12 +569,18 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp,
        if (exp_funnel_lock(rsp, s))
                return;  /* Someone else did our work for us. */
 
-       /* Marshall arguments and schedule the expedited grace period. */
-       rew.rew_func = func;
-       rew.rew_rsp = rsp;
-       rew.rew_s = s;
-       INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
-       schedule_work(&rew.rew_work);
+       /* Ensure that load happens before action based on it. */
+       if (unlikely(rcu_scheduler_active == RCU_SCHEDULER_INIT)) {
+               /* Direct call during scheduler init and early_initcalls(). */
+               rcu_exp_sel_wait_wake(rsp, func, s);
+       } else {
+               /* Marshall arguments & schedule the expedited grace period. */
+               rew.rew_func = func;
+               rew.rew_rsp = rsp;
+               rew.rew_s = s;
+               INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
+               schedule_work(&rew.rew_work);
+       }
 
        /* Wait for expedited grace period to complete. */
        rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
@@ -666,6 +682,8 @@ void synchronize_rcu_expedited(void)
 {
        struct rcu_state *rsp = rcu_state_p;
 
+       if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+               return;
        _synchronize_rcu_expedited(rsp, sync_rcu_exp_handler);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
@@ -683,3 +701,15 @@ void synchronize_rcu_expedited(void)
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+
+/*
+ * Switch to run-time mode once Tree RCU has fully initialized.
+ */
+static int __init rcu_exp_runtime_mode(void)
+{
+       rcu_test_sync_prims();
+       rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+       rcu_test_sync_prims();
+       return 0;
+}
+core_initcall(rcu_exp_runtime_mode);
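Taken together with the rcu_scheduler_starting() change in tree.c above, the
boot-time lifecycle of rcu_scheduler_active for Tree RCU now looks roughly like
the summarizing sketch below.  The initcall levels are taken from
early_initcall(rcu_spawn_gp_kthread) and core_initcall(rcu_exp_runtime_mode) in
the hunks; the rest is an assumed reading of the patch, not literal kernel code.

/*
 * Assumed Tree RCU boot timeline after this patch:
 *
 *   rcu_scheduler_starting()      just before the first task is spawned:
 *     INACTIVE -> INIT            synchronous GPs now run expedited, with
 *                                 the requesting task driving them directly
 *   rcu_spawn_gp_kthread()        early_initcall(): GP kthreads created
 *   rcu_exp_runtime_mode()        core_initcall():
 *     INIT -> RUNNING             expedited GPs handed back to workqueues
 *
 * rcu_test_sync_prims() runs at both transitions (and from
 * rcu_early_boot_tests()) so that, under CONFIG_PROVE_RCU, every synchronous
 * wait primitive is exercised immediately after a mode change.
 */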
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 85c5a883c6e3..56583e764ebf 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -670,7 +670,7 @@ void synchronize_rcu(void)
                         lock_is_held(&rcu_lock_map) ||
                         lock_is_held(&rcu_sched_lock_map),
                         "Illegal synchronize_rcu() in RCU read-side critical section");
-       if (!rcu_scheduler_active)
+       if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
                return;
        if (rcu_gp_is_expedited())
                synchronize_rcu_expedited();
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f19271dce0a9..4f6db7e6a117 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -121,11 +121,14 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
  * Should expedited grace-period primitives always fall back to their
  * non-expedited counterparts?  Intended for use within RCU.  Note
  * that if the user specifies both rcu_expedited and rcu_normal, then
- * rcu_normal wins.
+ * rcu_normal wins.  (Except during the time period during boot from
+ * when the first task is spawned until the rcu_exp_runtime_mode()
+ * core_initcall() is invoked, at which point everything is expedited.)
  */
 bool rcu_gp_is_normal(void)
 {
-       return READ_ONCE(rcu_normal);
+       return READ_ONCE(rcu_normal) &&
+              rcu_scheduler_active != RCU_SCHEDULER_INIT;
 }
 EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
 
@@ -135,13 +138,14 @@ static atomic_t rcu_expedited_nesting =
 /*
  * Should normal grace-period primitives be expedited?  Intended for
  * use within RCU.  Note that this function takes the rcu_expedited
- * sysfs/boot variable into account as well as the rcu_expedite_gp()
- * nesting.  So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
- * returns false is a -really- bad idea.
+ * sysfs/boot variable and rcu_scheduler_active into account as well
+ * as the rcu_expedite_gp() nesting.  So looping on rcu_unexpedite_gp()
+ * until rcu_gp_is_expedited() returns false is a -really- bad idea.
  */
 bool rcu_gp_is_expedited(void)
 {
-       return rcu_expedited || atomic_read(&rcu_expedited_nesting);
+       return rcu_expedited || atomic_read(&rcu_expedited_nesting) ||
+              rcu_scheduler_active == RCU_SCHEDULER_INIT;
 }
 EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
 
@@ -257,7 +261,7 @@ EXPORT_SYMBOL_GPL(rcu_callback_map);
 
 int notrace debug_lockdep_rcu_enabled(void)
 {
-       return rcu_scheduler_active && debug_locks &&
+       return rcu_scheduler_active != RCU_SCHEDULER_INACTIVE && debug_locks &&
               current->lockdep_recursion == 0;
 }
 EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
@@ -591,7 +595,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
 void synchronize_rcu_tasks(void)
 {
        /* Complain if the scheduler has not started.  */
-       RCU_LOCKDEP_WARN(!rcu_scheduler_active,
+       RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE,
                         "synchronize_rcu_tasks called too soon");
 
        /* Wait for the grace period. */
@@ -813,6 +817,23 @@ static void rcu_spawn_tasks_kthread(void)
 
 #endif /* #ifdef CONFIG_TASKS_RCU */
 
+/*
+ * Test each non-SRCU synchronous grace-period wait API.  This is
+ * useful just after a change in mode for these primitives, and
+ * during early boot.
+ */
+void rcu_test_sync_prims(void)
+{
+       if (!IS_ENABLED(CONFIG_PROVE_RCU))
+               return;
+       synchronize_rcu();
+       synchronize_rcu_bh();
+       synchronize_sched();
+       synchronize_rcu_expedited();
+       synchronize_rcu_bh_expedited();
+       synchronize_sched_expedited();
+}
+
 #ifdef CONFIG_PROVE_RCU
 
 /*
@@ -865,6 +886,7 @@ void rcu_early_boot_tests(void)
        early_boot_test_call_rcu_bh();
        if (rcu_self_test_sched)
                early_boot_test_call_rcu_sched();
+       rcu_test_sync_prims();
 }
 
 static int rcu_verify_early_boot_tests(void)