author		Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2014-12-08 23:26:55 -0500
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2015-01-06 14:05:28 -0500
commit		e3663b1024d1f94688e5233440ad67a9bc10b94e (patch)
tree		429ec7e23feb1193df7b45d8633a822adb77905b
parent		6ccd2ecd422644277b7d8b37222e3af3f43ea9ae (diff)
rcu: Handle gpnum/completed wrap while dyntick idle
Subtle race conditions can result if a CPU stays in dyntick-idle mode long
enough for the ->gpnum and ->completed fields to wrap.  For example, consider
the following sequence of events:

o	CPU 1 encounters a quiescent state while waiting for grace period
	5 to complete, but then enters dyntick-idle mode.

o	While CPU 1 is in dyntick-idle mode, the grace-period counters
	wrap around so that the grace period number is now 4.

o	Just as CPU 1 exits dyntick-idle mode, grace period 4 completes
	and grace period 5 begins.

o	The quiescent state that CPU 1 passed through during the old
	grace period 5 looks like it applies to the new grace period 5.
	Therefore, the new grace period 5 completes without CPU 1 having
	passed through a quiescent state.

This could clearly be a fatal surprise to any long-running RCU read-side
critical section that happened to be running on CPU 1 at the time.  At one
time, this was not a problem, given that it takes significant time for the
grace-period counters to overflow even on 32-bit systems.  However, with the
advent of NO_HZ_FULL and SMP embedded systems, arbitrarily long idle periods
are now becoming quite feasible.  It is therefore time to close this race.

This commit therefore avoids the race by having the quiescent-state forcing
code detect when a CPU is falling too far behind and set a new rcu_data
field, ->gpwrap, when this happens.  Whenever ->gpwrap is set, the CPU's
->gpnum and ->completed fields are known to be untrustworthy and can be
ignored, along with any associated quiescent states.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
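The wrap detection itself boils down to one modular-arithmetic comparison: a
CPU is flagged as possibly wrapped once the rcu_node's ->gpnum has run more
than ULONG_MAX / 4 grace periods ahead of the CPU's last-seen value, and
because the comparison is done in unsigned modular arithmetic it keeps working
even after the counters themselves wrap.  The following is a minimal
user-space sketch of that comparison only; the helper name
gpnum_may_have_wrapped() and the sample values are illustrative, not taken
from the kernel, and ULONG_CMP_LT() is reproduced here on the assumption that
it matches the kernel's wrap-tolerant definition.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/* Wrap-tolerant "a < b" for unsigned long counters: if a is modularly
 * behind b, then a - b wraps around to a value larger than ULONG_MAX / 2. */
#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))

/*
 * Illustrative stand-in for the test added to
 * dyntick_save_progress_counter(): cpu_gpnum is the CPU's stale snapshot
 * of the grace-period number, node_gpnum is the current value in its
 * rcu_node.  The CPU is treated as possibly wrapped once the node is
 * more than ULONG_MAX / 4 grace periods ahead.
 */
static bool gpnum_may_have_wrapped(unsigned long cpu_gpnum,
                                   unsigned long node_gpnum)
{
        return ULONG_CMP_LT(cpu_gpnum + ULONG_MAX / 4, node_gpnum);
}

int main(void)
{
        /* Slightly behind: no wrap suspected (prints 0). */
        printf("%d\n", gpnum_may_have_wrapped(100, 105));

        /* Very far behind, across a counter wrap: wrap suspected (prints 1). */
        printf("%d\n", gpnum_may_have_wrapped(ULONG_MAX - 10, ULONG_MAX / 2));
        return 0;
}

The ULONG_MAX / 4 guard band is deliberately conservative: it trips long
before the counters could actually alias, while still tolerating the ordinary
case of a CPU that is merely a few grace periods behind.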
-rw-r--r--	kernel/rcu/tree.c	17
-rw-r--r--	kernel/rcu/tree.h	1
-rw-r--r--	kernel/rcu/tree_plugin.h	3
3 files changed, 15 insertions, 6 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a2ceb66bcd67..5987fdc85fc4 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -930,6 +930,9 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
 		return 1;
 	} else {
+		if (ULONG_CMP_LT(ACCESS_ONCE(rdp->gpnum) + ULONG_MAX / 4,
+				 rdp->mynode->gpnum))
+			ACCESS_ONCE(rdp->gpwrap) = true;
 		return 0;
 	}
 }
@@ -1577,7 +1580,8 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 	bool ret;
 
 	/* Handle the ends of any preceding grace periods first. */
-	if (rdp->completed == rnp->completed) {
+	if (rdp->completed == rnp->completed &&
+	    !unlikely(ACCESS_ONCE(rdp->gpwrap))) {
 
 		/* No grace period end, so just accelerate recent callbacks. */
 		ret = rcu_accelerate_cbs(rsp, rnp, rdp);
@@ -1592,7 +1596,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 		trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend"));
 	}
 
-	if (rdp->gpnum != rnp->gpnum) {
+	if (rdp->gpnum != rnp->gpnum || unlikely(ACCESS_ONCE(rdp->gpwrap))) {
 		/*
 		 * If the current grace period is waiting for this CPU,
 		 * set up to detect a quiescent state, otherwise don't
@@ -1603,6 +1607,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 		rdp->passed_quiesce = 0;
 		rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
 		zero_cpu_stall_ticks(rdp);
+		ACCESS_ONCE(rdp->gpwrap) = false;
 	}
 	return ret;
 }
@@ -1616,7 +1621,8 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
 	local_irq_save(flags);
 	rnp = rdp->mynode;
 	if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) &&
-	     rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */
+	     rdp->completed == ACCESS_ONCE(rnp->completed) &&
+	     !unlikely(ACCESS_ONCE(rdp->gpwrap))) || /* w/out lock. */
 	    !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
 		local_irq_restore(flags);
 		return;
@@ -2066,7 +2072,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	smp_mb__after_unlock_lock();
 	if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
-	    rnp->completed == rnp->gpnum) {
+	    rnp->completed == rnp->gpnum || rdp->gpwrap) {
 
 		/*
 		 * The grace period in which this quiescent state was
@@ -3190,7 +3196,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 
 	/* Has a new RCU grace period started? */
-	if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */
+	if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum ||
+	    unlikely(ACCESS_ONCE(rdp->gpwrap))) { /* outside lock */
 		rdp->n_rp_gp_started++;
 		return 1;
 	}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 5ec81cf938fd..7472ff388d55 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -260,6 +260,7 @@ struct rcu_data {
 	bool passed_quiesce;	/* User-mode/idle loop etc. */
 	bool qs_pending;	/* Core waits for quiesc state. */
 	bool beenonline;	/* CPU online at least once. */
+	bool gpwrap;		/* Possible gpnum/completed wrap. */
 	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
 	unsigned long grpmask;	/* Mask to apply to leaf qsmask. */
 #ifdef CONFIG_RCU_CPU_STALL_INFO
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 769384d77437..81ff8b9a5a39 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1605,7 +1605,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
 		 * completed since we last checked and there are
 		 * callbacks not yet ready to invoke.
 		 */
-		if (rdp->completed != rnp->completed &&
+		if ((rdp->completed != rnp->completed ||
+		     unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
 		    rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
 			note_gp_changes(rsp, rdp);
 