author		Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2014-03-11 16:02:16 -0400
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2014-04-29 11:44:07 -0400
commit		48a7639ce80cf279834d0d44865e49ecd714f37d
tree		78ef128affd547e83363a6e2258b915fb0c73ab4	/kernel/rcu/tree.c
parent		4fc5b75537d4f56577ad00355b4cd09627deb3c3
rcu: Make callers awaken grace-period kthread
The rcu_start_gp_advanced() function currently uses irq_work_queue() to defer wakeups of the RCU grace-period kthread. This deferring is necessary to avoid RCU-scheduler deadlocks involving the rcu_node structure's lock, meaning that RCU cannot call any of the scheduler's wake-up functions while holding one of these locks.

Unfortunately, the second and subsequent calls to irq_work_queue() are ignored, and the first call will be ignored (aside from queuing the work item) if the scheduler-clock tick is turned off. This is OK for many uses, especially those where irq_work_queue() is called from an interrupt or softirq handler, because in those cases the scheduler-clock-tick state will be re-evaluated, which will turn the scheduler-clock tick back on. On the next tick, any deferred work will then be processed.

However, this strategy does not always work for RCU, which can be invoked at process level from idle CPUs. In this case, the tick might never be turned back on, indefinitely deferring a grace-period start request. Note that the RCU CPU stall detector cannot see this condition, because there is no RCU grace period in progress. Therefore, we can (and do!) see long tens-of-seconds stalls in grace-period handling. In theory, we could see a full grace-period hang, but rcutorture testing to date has seen only the tens-of-seconds stalls. Event tracing demonstrates that irq_work_queue() is being called repeatedly to no effect during these stalls: The "newreq" event appears repeatedly from a task that is not one of the grace-period kthreads.

In theory, irq_work_queue() might be fixed to avoid this sort of issue, but RCU's requirements are unusual and it is quite straightforward to pass wake-up responsibility up through RCU's call chain, so that the wakeup happens when the offending locks are released.

This commit therefore makes this change. The rcu_start_gp_advanced(), rcu_start_future_gp(), rcu_accelerate_cbs(), rcu_advance_cbs(), __note_gp_changes(), and rcu_start_gp() functions now return a boolean which indicates when a wake-up is needed. A new rcu_gp_kthread_wake() does the wakeup when it is necessary and safe to do so: No self-wakes, no wake-ups if the ->gp_flags field indicates there is no need (as in someone else did the wake-up before we got around to it), and no wake-ups before the grace-period kthread has been created.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
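The resulting pattern is simple: any function that requests a grace period while holding an rcu_node lock returns a needwake flag, and the caller invokes rcu_gp_kthread_wake() only after dropping the lock. Below is a minimal user-space sketch of that pattern, assuming pthreads primitives stand in for the kernel's spinlocks and wait queues; all names here (gp_state, start_gp_locked, gp_kthread_wake, request_gp) are illustrative, not code from tree.c.

/*
 * Sketch of the "return needwake, wake after unlock" pattern.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct gp_state {
	pthread_mutex_t lock;		/* stands in for the rcu_node ->lock */
	pthread_cond_t gp_wq;		/* stands in for rsp->gp_wq */
	pthread_t gp_kthread;		/* the grace-period kthread */
	bool gp_kthread_valid;		/* has the kthread been created? */
	int gp_flags;			/* stands in for rsp->gp_flags */
};

/*
 * Record a grace-period request.  Must be called with sp->lock held.
 * Returns true if the caller must do the wakeup -- after unlocking.
 */
static bool start_gp_locked(struct gp_state *sp)
{
	if (sp->gp_flags)
		return false;		/* Request already pending. */
	sp->gp_flags = 1;
	return true;
}

/*
 * Awaken the kthread, mirroring rcu_gp_kthread_wake()'s three guards:
 * no wakeup before the kthread exists, no self-wakeup, and no wakeup
 * when there is nothing for the kthread to do.
 */
static void gp_kthread_wake(struct gp_state *sp)
{
	if (!sp->gp_kthread_valid ||
	    pthread_equal(pthread_self(), sp->gp_kthread) ||
	    !sp->gp_flags)		/* Lockless check, as in the kernel. */
		return;
	pthread_cond_signal(&sp->gp_wq);
}

/* The caller pattern: collect needwake under the lock, wake after unlock. */
static void request_gp(struct gp_state *sp)
{
	bool needwake;

	pthread_mutex_lock(&sp->lock);
	needwake = start_gp_locked(sp);
	pthread_mutex_unlock(&sp->lock);
	if (needwake)
		gp_kthread_wake(sp);	/* Safe: no locks held here. */
}

/* Stand-in for rcu_gp_kthread(): wait for a request, then "start" a GP. */
static void *gp_kthread_fn(void *arg)
{
	struct gp_state *sp = arg;

	pthread_mutex_lock(&sp->lock);
	while (!sp->gp_flags)
		pthread_cond_wait(&sp->gp_wq, &sp->lock);
	sp->gp_flags = 0;		/* Consume the request. */
	pthread_mutex_unlock(&sp->lock);
	printf("grace period started\n");
	return NULL;
}

int main(void)
{
	static struct gp_state sp = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.gp_wq = PTHREAD_COND_INITIALIZER,
	};

	pthread_create(&sp.gp_kthread, NULL, gp_kthread_fn, &sp);
	sp.gp_kthread_valid = true;
	request_gp(&sp);
	pthread_join(sp.gp_kthread, NULL);
	return 0;
}

As in the kernel version, gp_kthread_wake() checks the request flag without holding the lock (the kernel reads ACCESS_ONCE(rsp->gp_flags)); finding the flag already clear just means the kthread has already awakened and consumed the request, so skipping the signal is harmless.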
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r--	kernel/rcu/tree.c	137
1 file changed, 86 insertions(+), 51 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c624415f8386..fca911b6b29c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -243,7 +243,7 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
 module_param(jiffies_till_first_fqs, ulong, 0644);
 module_param(jiffies_till_next_fqs, ulong, 0644);
 
-static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 				  struct rcu_data *rdp);
 static void force_qs_rnp(struct rcu_state *rsp,
 			 int (*f)(struct rcu_data *rsp, bool *isidle,
@@ -1138,15 +1138,18 @@ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
 /*
  * Start some future grace period, as needed to handle newly arrived
  * callbacks.  The required future grace periods are recorded in each
- * rcu_node structure's ->need_future_gp field.
+ * rcu_node structure's ->need_future_gp field.  Returns true if there
+ * is reason to awaken the grace-period kthread.
  *
  * The caller must hold the specified rcu_node structure's ->lock.
  */
-static unsigned long __maybe_unused
-rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
+static bool __maybe_unused
+rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+		    unsigned long *c_out)
 {
 	unsigned long c;
 	int i;
+	bool ret = false;
 	struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
 
 	/*
@@ -1157,7 +1160,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
 	trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf"));
 	if (rnp->need_future_gp[c & 0x1]) {
 		trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf"));
-		return c;
+		goto out;
 	}
 
 	/*
@@ -1171,7 +1174,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
 	    ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
 		rnp->need_future_gp[c & 0x1]++;
 		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
-		return c;
+		goto out;
 	}
 
 	/*
@@ -1212,12 +1215,15 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
 		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot"));
 	} else {
 		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot"));
-		rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+		ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
 	}
 unlock_out:
 	if (rnp != rnp_root)
 		raw_spin_unlock(&rnp_root->lock);
-	return c;
+out:
+	if (c_out != NULL)
+		*c_out = c;
+	return ret;
 }
 
 /*
@@ -1241,25 +1247,43 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
 }
 
 /*
+ * Awaken the grace-period kthread for the specified flavor of RCU.
+ * Don't do a self-awaken, and don't bother awakening when there is
+ * nothing for the grace-period kthread to do (as in several CPUs
+ * raced to awaken, and we lost), and finally don't try to awaken
+ * a kthread that has not yet been created.
+ */
+static void rcu_gp_kthread_wake(struct rcu_state *rsp)
+{
+	if (current == rsp->gp_kthread ||
+	    !ACCESS_ONCE(rsp->gp_flags) ||
+	    !rsp->gp_kthread)
+		return;
+	wake_up(&rsp->gp_wq);
+}
+
+/*
  * If there is room, assign a ->completed number to any callbacks on
  * this CPU that have not already been assigned.  Also accelerate any
  * callbacks that were previously assigned a ->completed number that has
  * since proven to be too conservative, which can happen if callbacks get
  * assigned a ->completed number while RCU is idle, but with reference to
  * a non-root rcu_node structure.  This function is idempotent, so it does
- * not hurt to call it repeatedly.
+ * not hurt to call it repeatedly.  Returns a flag saying that we should
+ * awaken the RCU grace-period kthread.
  *
  * The caller must hold rnp->lock with interrupts disabled.
  */
-static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 			       struct rcu_data *rdp)
 {
 	unsigned long c;
 	int i;
+	bool ret;
 
 	/* If the CPU has no callbacks, nothing to do. */
 	if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
-		return;
+		return false;
 
 	/*
 	 * Starting from the sublist containing the callbacks most
@@ -1288,7 +1312,7 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 	 * be grouped into.
 	 */
 	if (++i >= RCU_NEXT_TAIL)
-		return;
+		return false;
 
 	/*
 	 * Assign all subsequent callbacks' ->completed number to the next
@@ -1300,13 +1324,14 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 		rdp->nxtcompleted[i] = c;
 	}
 	/* Record any needed additional grace periods. */
-	rcu_start_future_gp(rnp, rdp);
+	ret = rcu_start_future_gp(rnp, rdp, NULL);
 
 	/* Trace depending on how much we were able to accelerate. */
 	if (!*rdp->nxttail[RCU_WAIT_TAIL])
 		trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB"));
 	else
 		trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB"));
+	return ret;
 }
 
 /*
@@ -1315,17 +1340,18 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
  * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
  * sublist.  This function is idempotent, so it does not hurt to
  * invoke it repeatedly.  As long as it is not invoked -too- often...
+ * Returns true if the RCU grace-period kthread needs to be awakened.
  *
  * The caller must hold rnp->lock with interrupts disabled.
  */
-static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 			    struct rcu_data *rdp)
 {
 	int i, j;
 
 	/* If the CPU has no callbacks, nothing to do. */
 	if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
-		return;
+		return false;
 
 	/*
 	 * Find all callbacks whose ->completed numbers indicate that they
@@ -1349,26 +1375,30 @@ static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
 	}
 
 	/* Classify any remaining callbacks. */
-	rcu_accelerate_cbs(rsp, rnp, rdp);
+	return rcu_accelerate_cbs(rsp, rnp, rdp);
 }
 
 /*
  * Update CPU-local rcu_data state to record the beginnings and ends of
  * grace periods.  The caller must hold the ->lock of the leaf rcu_node
  * structure corresponding to the current CPU, and must have irqs disabled.
+ * Returns true if the grace-period kthread needs to be awakened.
  */
-static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
+			      struct rcu_data *rdp)
 {
+	bool ret;
+
 	/* Handle the ends of any preceding grace periods first. */
 	if (rdp->completed == rnp->completed) {
 
 		/* No grace period end, so just accelerate recent callbacks. */
-		rcu_accelerate_cbs(rsp, rnp, rdp);
+		ret = rcu_accelerate_cbs(rsp, rnp, rdp);
 
 	} else {
 
 		/* Advance callbacks. */
-		rcu_advance_cbs(rsp, rnp, rdp);
+		ret = rcu_advance_cbs(rsp, rnp, rdp);
 
 		/* Remember that we saw this grace-period completion. */
 		rdp->completed = rnp->completed;
@@ -1387,11 +1417,13 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc
 		rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
 		zero_cpu_stall_ticks(rdp);
 	}
+	return ret;
 }
 
 static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	unsigned long flags;
+	bool needwake;
 	struct rcu_node *rnp;
 
 	local_irq_save(flags);
@@ -1403,8 +1435,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
 		return;
 	}
 	smp_mb__after_unlock_lock();
-	__note_gp_changes(rsp, rnp, rdp);
+	needwake = __note_gp_changes(rsp, rnp, rdp);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
+	if (needwake)
+		rcu_gp_kthread_wake(rsp);
 }
 
 /*
@@ -1468,7 +1502,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
 		WARN_ON_ONCE(rnp->completed != rsp->completed);
 		ACCESS_ONCE(rnp->completed) = rsp->completed;
 		if (rnp == rdp->mynode)
-			__note_gp_changes(rsp, rnp, rdp);
+			(void)__note_gp_changes(rsp, rnp, rdp);
 		rcu_preempt_boost_start_gp(rnp);
 		trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
 					    rnp->level, rnp->grplo,
@@ -1528,6 +1562,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 static void rcu_gp_cleanup(struct rcu_state *rsp)
 {
 	unsigned long gp_duration;
+	bool needgp = false;
 	int nocb = 0;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root(rsp);
@@ -1563,7 +1598,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 		ACCESS_ONCE(rnp->completed) = rsp->gpnum;
 		rdp = this_cpu_ptr(rsp->rda);
 		if (rnp == rdp->mynode)
-			__note_gp_changes(rsp, rnp, rdp);
+			needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
 		/* smp_mb() provided by prior unlock-lock pair. */
 		nocb += rcu_future_gp_cleanup(rsp, rnp);
 		raw_spin_unlock_irq(&rnp->lock);
@@ -1579,8 +1614,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 	trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
 	rsp->fqs_state = RCU_GP_IDLE;
 	rdp = this_cpu_ptr(rsp->rda);
-	rcu_advance_cbs(rsp, rnp, rdp);  /* Reduce false positives below. */
-	if (cpu_needs_another_gp(rsp, rdp)) {
+	/* Advance CBs to reduce false positives below. */
+	needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp;
+	if (needgp || cpu_needs_another_gp(rsp, rdp)) {
 		ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
 		trace_rcu_grace_period(rsp->name,
 				       ACCESS_ONCE(rsp->gpnum),
@@ -1680,16 +1716,6 @@ static int __noreturn rcu_gp_kthread(void *arg)
 	}
 }
 
-static void rsp_wakeup(struct irq_work *work)
-{
-	struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
-
-	/* Wake up rcu_gp_kthread() to start the grace period. */
-	wake_up(&rsp->gp_wq);
-	trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
-			       "Workqueuewoken");
-}
-
 /*
  * Start a new RCU grace period if warranted, re-initializing the hierarchy
  * in preparation for detecting the next grace period.  The caller must hold
@@ -1698,8 +1724,10 @@ static void rsp_wakeup(struct irq_work *work)
  * Note that it is legal for a dying CPU (which is marked as offline) to
  * invoke this function.  This can happen when the dying CPU reports its
  * quiescent state.
+ *
+ * Returns true if the grace-period kthread must be awakened.
  */
-static void
+static bool
 rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 		      struct rcu_data *rdp)
 {
@@ -1710,7 +1738,7 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 		 * or a grace period is already in progress.
 		 * Either way, don't start a new grace period.
 		 */
-		return;
+		return false;
 	}
 	ACCESS_ONCE(rsp->gp_flags) = RCU_GP_FLAG_INIT;
 	trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
@@ -1719,14 +1747,9 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 	/*
 	 * We can't do wakeups while holding the rnp->lock, as that
 	 * could cause possible deadlocks with the rq->lock. Defer
-	 * the wakeup to interrupt context.  And don't bother waking
-	 * up the running kthread.
+	 * the wakeup to our caller.
 	 */
-	if (current != rsp->gp_kthread) {
-		trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
-				       "Workqueuewake");
-		irq_work_queue(&rsp->wakeup_work);
-	}
+	return true;
 }
 
 /*
@@ -1735,12 +1758,14 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
  * is invoked indirectly from rcu_advance_cbs(), which would result in
  * endless recursion -- or would do so if it wasn't for the self-deadlock
  * that is encountered beforehand.
+ *
+ * Returns true if the grace-period kthread needs to be awakened.
  */
-static void
-rcu_start_gp(struct rcu_state *rsp)
+static bool rcu_start_gp(struct rcu_state *rsp)
 {
 	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 	struct rcu_node *rnp = rcu_get_root(rsp);
+	bool ret = false;
 
 	/*
 	 * If there is no grace period in progress right now, any
@@ -1750,8 +1775,9 @@ rcu_start_gp(struct rcu_state *rsp)
 	 * resulting in pointless grace periods.  So, advance callbacks
 	 * then start the grace period!
 	 */
-	rcu_advance_cbs(rsp, rnp, rdp);
-	rcu_start_gp_advanced(rsp, rnp, rdp);
+	ret = rcu_advance_cbs(rsp, rnp, rdp) || ret;
+	ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret;
+	return ret;
 }
 
 /*
@@ -1840,6 +1866,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	unsigned long flags;
 	unsigned long mask;
+	bool needwake;
 	struct rcu_node *rnp;
 
 	rnp = rdp->mynode;
@@ -1868,9 +1895,11 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 		 * This GP can't end until cpu checks in, so all of our
 		 * callbacks can be processed during the next GP.
 		 */
-		rcu_accelerate_cbs(rsp, rnp, rdp);
+		needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
 
 		rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
+		if (needwake)
+			rcu_gp_kthread_wake(rsp);
 	}
 }
 
@@ -2354,6 +2383,7 @@ static void
 __rcu_process_callbacks(struct rcu_state *rsp)
 {
 	unsigned long flags;
+	bool needwake;
 	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
 
 	WARN_ON_ONCE(rdp->beenonline == 0);
@@ -2365,8 +2395,10 @@ __rcu_process_callbacks(struct rcu_state *rsp)
 	local_irq_save(flags);
 	if (cpu_needs_another_gp(rsp, rdp)) {
 		raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
-		rcu_start_gp(rsp);
+		needwake = rcu_start_gp(rsp);
 		raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
+		if (needwake)
+			rcu_gp_kthread_wake(rsp);
 	} else {
 		local_irq_restore(flags);
 	}
@@ -2424,6 +2456,8 @@ static void invoke_rcu_core(void)
 static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
 			    struct rcu_head *head, unsigned long flags)
 {
+	bool needwake;
+
 	/*
 	 * If called from an extended quiescent state, invoke the RCU
 	 * core in order to force a re-evaluation of RCU's idleness.
@@ -2453,8 +2487,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
 
 			raw_spin_lock(&rnp_root->lock);
 			smp_mb__after_unlock_lock();
-			rcu_start_gp(rsp);
+			needwake = rcu_start_gp(rsp);
 			raw_spin_unlock(&rnp_root->lock);
+			if (needwake)
+				rcu_gp_kthread_wake(rsp);
 		} else {
 			/* Give the grace period a kick. */
 			rdp->blimit = LONG_MAX;
@@ -3440,7 +3476,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 
 	rsp->rda = rda;
 	init_waitqueue_head(&rsp->gp_wq);
-	init_irq_work(&rsp->wakeup_work, rsp_wakeup);
 	rnp = rsp->level[rcu_num_lvls - 1];
 	for_each_possible_cpu(i) {
 		while (i > rnp->grphi)