author		Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2018-05-25 22:23:09 -0400
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2018-07-12 18:39:04 -0400
commit		1e64b15a4b102e1cd059d4d798b7a78f93341333 (patch)
tree		47057a34ee2231117fb7ce6281d5f13fc66dc2b7 /kernel/rcu/tree.c
parent		ec2c29765a4ab12c236ac5a89b89660222ff6b01 (diff)
rcu: Fix grace-period hangs due to race with CPU offline
Without special fail-safe quiescent-state-propagation checks, grace-period
hangs can result from the following scenario:
1.	CPU 1 goes offline.
2.	Because CPU 1 is the only CPU in the system blocking the current
	grace period, the grace period ends as soon as
	rcu_cleanup_dying_idle_cpu()'s call to rcu_report_qs_rnp()
	returns.
3.	At this point, the leaf rcu_node structure's ->lock is no longer
	held: rcu_report_qs_rnp() has released it, as it must in order
	to awaken the RCU grace-period kthread.
4.	At this point, that same leaf rcu_node structure's ->qsmaskinitnext
	field still records CPU 1 as being online.  This is absolutely
	necessary because the scheduler uses RCU (in this case on the
	wake-up path while awakening RCU's grace-period kthread), and
	->qsmaskinitnext contains RCU's idea as to which CPUs are online.
	Therefore, invoking rcu_report_qs_rnp() after clearing CPU 1's
	bit from ->qsmaskinitnext would result in a lockdep-RCU splat
	due to RCU being used from an offline CPU.
5.	RCU's grace-period kthread awakens, sees that the old grace period
	has completed and that a new one is needed.  It therefore starts
	a new grace period, but because CPU 1's leaf rcu_node structure's
	->qsmaskinitnext field still shows CPU 1 as being online, this new
	grace period is initialized to wait for a quiescent state from the
	now-offline CPU 1.
6.	Without the fail-safe force-quiescent-state checks, there would
	be no quiescent state from the now-offline CPU 1, which would
	eventually result in RCU CPU stall warnings and memory exhaustion.
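In code form, the window described in steps 3-5 looks roughly like the
following simplified sketch of the pre-patch offline path (argument lists
and unrelated details are elided; this is not the verbatim kernel source):

	/* Simplified pre-patch rcu_cleanup_dying_idle_cpu(), CPU 1 going offline. */
	mask = rdp->grpmask;
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	if (rnp->qsmask & mask) {		/* RCU waiting on outgoing CPU? */
		/*
		 * Reporting the last needed quiescent state ends the grace
		 * period and releases ->lock so the GP kthread can be woken.
		 */
		rcu_report_qs_rnp(...);
		/*
		 * Window: until ->lock is re-acquired below, the GP kthread
		 * can run rcu_gp_init(), which still sees CPU 1's bit set in
		 * ->qsmaskinitnext and so makes the new grace period wait on
		 * the now-offline CPU.
		 */
		raw_spin_lock_irqsave_rcu_node(rnp, flags);
	}
	rnp->qsmaskinitnext &= ~mask;		/* Only now is CPU 1 marked offline. */
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);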
It would be good to get rid of the special fail-safe quiescent-state
propagation checks, and thus it would be good to fix things so that
the above scenario cannot happen.  This commit therefore adds a new
->ofl_lock to the rcu_state structure.  This lock is held by rcu_gp_init()
across the applying of buffered online and offline operations to the
rcu_node tree, and it is also held by rcu_cleanup_dying_idle_cpu()
when buffering a new offline operation.  This prevents rcu_gp_init()
from acquiring the leaf rcu_node structure's ->lock during the interval
between when rcu_cleanup_dying_idle_cpu() invokes rcu_report_qs_rnp(),
which releases ->lock, and the re-acquisition of that same lock.
This in turn prevents the failure scenario outlined above, and will
hopefully eventually allow removal of the offline-CPU checks from the
force-quiescent-state code path.
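The resulting locking pattern is sketched in simplified form below (details
elided; see the diff for the actual change).  Because ->ofl_lock is acquired
before the leaf rcu_node structure's ->lock on both paths, the offline path's
update of ->qsmaskinitnext and rcu_gp_init()'s scan of it are mutually
exclusive:

	/* Offline path (rcu_cleanup_dying_idle_cpu), simplified: */
	spin_lock(&rsp->ofl_lock);
	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	...					/* report QS, clear CPU's ->qsmaskinitnext bit */
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
	spin_unlock(&rsp->ofl_lock);

	/* Grace-period initialization (rcu_gp_init), simplified: */
	rcu_for_each_leaf_node(rsp, rnp) {
		spin_lock(&rsp->ofl_lock);
		raw_spin_lock_irq_rcu_node(rnp);
		...				/* apply buffered online/offline operations */
		raw_spin_unlock_irq_rcu_node(rnp);
		spin_unlock(&rsp->ofl_lock);
	}

Even while rcu_report_qs_rnp() has momentarily dropped ->lock inside the
offline path, ->ofl_lock remains held, so rcu_gp_init() cannot observe the
stale ->qsmaskinitnext value during that window.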
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r--	kernel/rcu/tree.c	6
1 files changed, 6 insertions, 0 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7f872721c54e..50e4f7ebf8cf 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -101,6 +101,7 @@ struct rcu_state sname##_state = { \
 	.abbr = sabbr, \
 	.exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \
 	.exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \
+	.ofl_lock = __SPIN_LOCK_UNLOCKED(sname##_state.ofl_lock), \
 }
 
 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
@@ -1900,11 +1901,13 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 	 */
 	rcu_for_each_leaf_node(rsp, rnp) {
 		rcu_gp_slow(rsp, gp_preinit_delay);
+		spin_lock(&rsp->ofl_lock);
 		raw_spin_lock_irq_rcu_node(rnp);
 		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
 		    !rnp->wait_blkd_tasks) {
 			/* Nothing to do on this leaf rcu_node structure. */
 			raw_spin_unlock_irq_rcu_node(rnp);
+			spin_unlock(&rsp->ofl_lock);
 			continue;
 		}
 
@@ -1940,6 +1943,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 		}
 
 		raw_spin_unlock_irq_rcu_node(rnp);
+		spin_unlock(&rsp->ofl_lock);
 	}
 
 	/*
@@ -3749,6 +3753,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
 
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
 	mask = rdp->grpmask;
+	spin_lock(&rsp->ofl_lock);
 	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
 	if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
 		/* Report quiescent state -before- changing ->qsmaskinitnext! */
@@ -3757,6 +3762,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
 	}
 	rnp->qsmaskinitnext &= ~mask;
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	spin_unlock(&rsp->ofl_lock);
 }
 
 /*
