author    Paul E. McKenney <paulmck@linux.vnet.ibm.com>    2018-05-25 22:23:09 -0400
committer Paul E. McKenney <paulmck@linux.vnet.ibm.com>    2018-07-12 18:39:04 -0400
commit    1e64b15a4b102e1cd059d4d798b7a78f93341333 (patch)
tree      47057a34ee2231117fb7ce6281d5f13fc66dc2b7 /kernel/rcu/tree.c
parent    ec2c29765a4ab12c236ac5a89b89660222ff6b01 (diff)
rcu: Fix grace-period hangs due to race with CPU offline
Without special fail-safe quiescent-state-propagation checks, grace-period hangs can result from the following scenario:

1. CPU 1 goes offline.

2. Because CPU 1 is the only CPU in the system blocking the current grace period, the grace period ends as soon as rcu_cleanup_dying_idle_cpu()'s call to rcu_report_qs_rnp() returns.

3. At this point, the leaf rcu_node structure's ->lock is no longer held: rcu_report_qs_rnp() has released it, as it must in order to awaken the RCU grace-period kthread.

4. At this point, that same leaf rcu_node structure's ->qsmaskinitnext field still records CPU 1 as being online. This is absolutely necessary because the scheduler uses RCU (in this case on the wake-up path while awakening RCU's grace-period kthread), and ->qsmaskinitnext contains RCU's idea as to which CPUs are online. Therefore, invoking rcu_report_qs_rnp() after clearing CPU 1's bit from ->qsmaskinitnext would result in a lockdep-RCU splat due to RCU being used from an offline CPU.

5. RCU's grace-period kthread awakens, sees that the old grace period has completed and that a new one is needed. It therefore starts a new grace period, but because CPU 1's leaf rcu_node structure's ->qsmaskinitnext field still shows CPU 1 as being online, this new grace period is initialized to wait for a quiescent state from the now-offline CPU 1.

6. Without the fail-safe force-quiescent-state checks, there would be no quiescent state from the now-offline CPU 1, which would eventually result in RCU CPU stall warnings and memory exhaustion.

It would be good to get rid of the special fail-safe quiescent-state propagation checks, and thus it would be good to fix things so that the above scenario cannot happen. This commit therefore adds a new ->ofl_lock to the rcu_state structure. This lock is held by rcu_gp_init() across the applying of buffered online and offline operations to the rcu_node tree, and it is also held by rcu_cleanup_dying_idle_cpu() when buffering a new offline operation. This prevents rcu_gp_init() from acquiring the leaf rcu_node structure's lock during the interval between when rcu_cleanup_dying_idle_cpu() invokes rcu_report_qs_rnp(), which releases ->lock, and the re-acquisition of that same lock. This in turn prevents the failure scenario outlined above, and will hopefully eventually allow removal of the offline-CPU checks from the force-quiescent-state code path.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
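As a rough illustration of the locking pattern the commit message describes (not the kernel code itself), the minimal user-space sketch below has both the grace-period-initialization path and the CPU-offline path take a single global lock, named ofl_lock after the new rcu_state field, before touching the leaf state. All names in the sketch (fake_rcu_state, fake_leaf, offline_cpu(), gp_init()) are hypothetical stand-ins; the real code is the patch to kernel/rcu/tree.c shown below.

/*
 * Minimal sketch of the ->ofl_lock pattern described above.
 * Hypothetical names; not the kernel implementation.
 */
#include <pthread.h>
#include <stdio.h>

struct fake_leaf {
	pthread_mutex_t lock;		/* stand-in for the leaf rcu_node ->lock */
	unsigned long qsmaskinitnext;	/* which CPUs RCU believes are online */
};

struct fake_rcu_state {
	pthread_mutex_t ofl_lock;	/* stand-in for the new rcu_state ->ofl_lock */
	struct fake_leaf leaf;
};

static struct fake_rcu_state rsp = {
	.ofl_lock = PTHREAD_MUTEX_INITIALIZER,
	.leaf = { .lock = PTHREAD_MUTEX_INITIALIZER, .qsmaskinitnext = 0x3 },
};

/* CPU-offline path, loosely modelled on rcu_cleanup_dying_idle_cpu(). */
static void *offline_cpu(void *arg)
{
	unsigned long mask = *(unsigned long *)arg;

	pthread_mutex_lock(&rsp.ofl_lock);	/* lock added by this commit */
	pthread_mutex_lock(&rsp.leaf.lock);
	/* Reporting the quiescent state may drop and retake leaf.lock here. */
	rsp.leaf.qsmaskinitnext &= ~mask;	/* buffer the offline operation */
	pthread_mutex_unlock(&rsp.leaf.lock);
	pthread_mutex_unlock(&rsp.ofl_lock);
	return NULL;
}

/* Grace-period start path, loosely modelled on rcu_gp_init(). */
static void *gp_init(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&rsp.ofl_lock);	/* lock added by this commit */
	pthread_mutex_lock(&rsp.leaf.lock);
	/* Apply buffered online/offline operations to the (single) leaf. */
	printf("gp_init sees online mask 0x%lx\n", rsp.leaf.qsmaskinitnext);
	pthread_mutex_unlock(&rsp.leaf.lock);
	pthread_mutex_unlock(&rsp.ofl_lock);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;
	unsigned long cpu1_mask = 0x2;

	/* ofl_lock serializes the two paths, closing the race window. */
	pthread_create(&t1, NULL, offline_cpu, &cpu1_mask);
	pthread_create(&t2, NULL, gp_init, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}

The point of the sketch is that ofl_lock is acquired before the leaf lock on both paths, so even when the offline path transiently drops the leaf lock (as rcu_report_qs_rnp() does in the kernel), the grace-period path cannot slip in and observe the leaf in that half-updated state.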
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r--    kernel/rcu/tree.c    6
1 file changed, 6 insertions, 0 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 7f872721c54e..50e4f7ebf8cf 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -101,6 +101,7 @@ struct rcu_state sname##_state = { \
 	.abbr = sabbr, \
 	.exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \
 	.exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \
+	.ofl_lock = __SPIN_LOCK_UNLOCKED(sname##_state.ofl_lock), \
 }
 
 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
@@ -1900,11 +1901,13 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 	 */
 	rcu_for_each_leaf_node(rsp, rnp) {
 		rcu_gp_slow(rsp, gp_preinit_delay);
+		spin_lock(&rsp->ofl_lock);
 		raw_spin_lock_irq_rcu_node(rnp);
 		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
 		    !rnp->wait_blkd_tasks) {
 			/* Nothing to do on this leaf rcu_node structure. */
 			raw_spin_unlock_irq_rcu_node(rnp);
+			spin_unlock(&rsp->ofl_lock);
 			continue;
 		}
 
@@ -1940,6 +1943,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 		}
 
 		raw_spin_unlock_irq_rcu_node(rnp);
+		spin_unlock(&rsp->ofl_lock);
 	}
 
 	/*
@@ -3749,6 +3753,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
 
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
 	mask = rdp->grpmask;
+	spin_lock(&rsp->ofl_lock);
 	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
 	if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
 		/* Report quiescent state -before- changing ->qsmaskinitnext! */
@@ -3757,6 +3762,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
 	}
 	rnp->qsmaskinitnext &= ~mask;
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	spin_unlock(&rsp->ofl_lock);
 }
 
 /*