diff options
| author | Paul E. McKenney <paul.mckenney@linaro.org> | 2012-03-01 16:18:08 -0500 |
|---|---|---|
| committer | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2012-05-09 17:27:54 -0400 |
| commit | b1420f1c8bfc30ecf6380a31d0f686884834b599 (patch) | |
| tree | 56eb378bcd64175a7302a1031b388ecf569a30bb /kernel | |
| parent | 98248a0e24327bc64eb7518145c44bff7bebebc3 (diff) | |
rcu: Make rcu_barrier() less disruptive
The rcu_barrier() primitive interrupts each and every CPU, registering
a callback on every CPU. Once all of these callbacks have been invoked,
rcu_barrier() knows that every callback that was registered before
the call to rcu_barrier() has also been invoked.
However, there is no point in registering a callback on a CPU that
currently has no callbacks, most especially if that CPU is in a
deep idle state. This commit therefore makes rcu_barrier() avoid
interrupting CPUs that have no callbacks. Doing this requires reworking
the handling of orphaned callbacks; otherwise, callbacks could slip through
rcu_barrier()'s net by being orphaned from a CPU that rcu_barrier() had
not yet interrupted to a CPU that rcu_barrier() had already interrupted.
This reworking was needed anyway to take a first step towards weaning
RCU from the CPU_DYING notifier's use of stop_machine().
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/rcutree.c | 295 | ||||
| -rw-r--r-- | kernel/rcutree.h | 11 | ||||
| -rw-r--r-- | kernel/rcutree_trace.c | 4 |
3 files changed, 222 insertions, 88 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 403306b86e7..e578dd327c6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
| @@ -75,6 +75,8 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; | |||
| 75 | .gpnum = -300, \ | 75 | .gpnum = -300, \ |
| 76 | .completed = -300, \ | 76 | .completed = -300, \ |
| 77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ | 77 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ |
| 78 | .orphan_nxttail = &structname##_state.orphan_nxtlist, \ | ||
| 79 | .orphan_donetail = &structname##_state.orphan_donelist, \ | ||
| 78 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ | 80 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ |
| 79 | .n_force_qs = 0, \ | 81 | .n_force_qs = 0, \ |
| 80 | .n_force_qs_ngp = 0, \ | 82 | .n_force_qs_ngp = 0, \ |
| @@ -145,6 +147,13 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | |||
| 145 | unsigned long rcutorture_testseq; | 147 | unsigned long rcutorture_testseq; |
| 146 | unsigned long rcutorture_vernum; | 148 | unsigned long rcutorture_vernum; |
| 147 | 149 | ||
| 150 | /* State information for rcu_barrier() and friends. */ | ||
| 151 | |||
| 152 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | ||
| 153 | static atomic_t rcu_barrier_cpu_count; | ||
| 154 | static DEFINE_MUTEX(rcu_barrier_mutex); | ||
| 155 | static struct completion rcu_barrier_completion; | ||
| 156 | |||
| 148 | /* | 157 | /* |
| 149 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | 158 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s |
| 150 | * permit this function to be invoked without holding the root rcu_node | 159 | * permit this function to be invoked without holding the root rcu_node |
| @@ -1311,95 +1320,133 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1311 | #ifdef CONFIG_HOTPLUG_CPU | 1320 | #ifdef CONFIG_HOTPLUG_CPU |
| 1312 | 1321 | ||
| 1313 | /* | 1322 | /* |
| 1314 | * Move a dying CPU's RCU callbacks to online CPU's callback list. | 1323 | * Send the specified CPU's RCU callbacks to the orphanage. The |
| 1315 | * Also record a quiescent state for this CPU for the current grace period. | 1324 | * specified CPU must be offline, and the caller must hold the |
| 1316 | * Synchronization and interrupt disabling are not required because | 1325 | * ->onofflock. |
| 1317 | * this function executes in stop_machine() context. Therefore, cleanup | ||
| 1318 | * operations that might block must be done later from the CPU_DEAD | ||
| 1319 | * notifier. | ||
| 1320 | * | ||
| 1321 | * Note that the outgoing CPU's bit has already been cleared in the | ||
| 1322 | * cpu_online_mask. This allows us to randomly pick a callback | ||
| 1323 | * destination from the bits set in that mask. | ||
| 1324 | */ | 1326 | */ |
| 1325 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1327 | static void |
| 1328 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | ||
| 1329 | struct rcu_node *rnp, struct rcu_data *rdp) | ||
| 1326 | { | 1330 | { |
| 1327 | int i; | 1331 | int i; |
| 1328 | unsigned long mask; | ||
| 1329 | int receive_cpu = cpumask_any(cpu_online_mask); | ||
| 1330 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
| 1331 | struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); | ||
| 1332 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ | ||
| 1333 | 1332 | ||
| 1334 | /* First, adjust the counts. */ | 1333 | /* |
| 1334 | * Orphan the callbacks. First adjust the counts. This is safe | ||
| 1335 | * because ->onofflock excludes _rcu_barrier()'s adoption of | ||
| 1336 | * the callbacks, thus no memory barrier is required. | ||
| 1337 | */ | ||
| 1335 | if (rdp->nxtlist != NULL) { | 1338 | if (rdp->nxtlist != NULL) { |
| 1336 | receive_rdp->qlen_lazy += rdp->qlen_lazy; | 1339 | rsp->qlen_lazy += rdp->qlen_lazy; |
| 1337 | receive_rdp->qlen += rdp->qlen; | 1340 | rsp->qlen += rdp->qlen; |
| 1341 | rdp->n_cbs_orphaned += rdp->qlen; | ||
| 1338 | rdp->qlen_lazy = 0; | 1342 | rdp->qlen_lazy = 0; |
| 1339 | rdp->qlen = 0; | 1343 | rdp->qlen = 0; |
| 1340 | } | 1344 | } |
| 1341 | 1345 | ||
| 1342 | /* | 1346 | /* |
| 1343 | * Next, move ready-to-invoke callbacks to be invoked on some | 1347 | * Next, move those callbacks still needing a grace period to |
| 1344 | * other CPU. These will not be required to pass through another | 1348 | * the orphanage, where some other CPU will pick them up. |
| 1345 | * grace period: They are done, regardless of CPU. | 1349 | * Some of the callbacks might have gone partway through a grace |
| 1350 | * period, but that is too bad. They get to start over because we | ||
| 1351 | * cannot assume that grace periods are synchronized across CPUs. | ||
| 1352 | * We don't bother updating the ->nxttail[] array yet, instead | ||
| 1353 | * we just reset the whole thing later on. | ||
| 1346 | */ | 1354 | */ |
| 1347 | if (rdp->nxtlist != NULL && | 1355 | if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { |
| 1348 | rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { | 1356 | *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; |
| 1349 | struct rcu_head *oldhead; | 1357 | rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; |
| 1350 | struct rcu_head **oldtail; | 1358 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; |
| 1351 | struct rcu_head **newtail; | ||
| 1352 | |||
| 1353 | oldhead = rdp->nxtlist; | ||
| 1354 | oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; | ||
| 1355 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | ||
| 1356 | *rdp->nxttail[RCU_DONE_TAIL] = *oldtail; | ||
| 1357 | *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; | ||
| 1358 | newtail = rdp->nxttail[RCU_DONE_TAIL]; | ||
| 1359 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { | ||
| 1360 | if (receive_rdp->nxttail[i] == oldtail) | ||
| 1361 | receive_rdp->nxttail[i] = newtail; | ||
| 1362 | if (rdp->nxttail[i] == newtail) | ||
| 1363 | rdp->nxttail[i] = &rdp->nxtlist; | ||
| 1364 | } | ||
| 1365 | } | 1359 | } |
| 1366 | 1360 | ||
| 1367 | /* | 1361 | /* |
| 1368 | * Finally, put the rest of the callbacks at the end of the list. | 1362 | * Then move the ready-to-invoke callbacks to the orphanage, |
| 1369 | * The ones that made it partway through get to start over: We | 1363 | * where some other CPU will pick them up. These will not be |
| 1370 | * cannot assume that grace periods are synchronized across CPUs. | 1364 | * required to pass through another grace period: They are done. |
| 1371 | * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but | ||
| 1372 | * this does not seem compelling. Not yet, anyway.) | ||
| 1373 | */ | 1365 | */ |
| 1374 | if (rdp->nxtlist != NULL) { | 1366 | if (rdp->nxtlist != NULL) { |
| 1375 | *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; | 1367 | *rsp->orphan_donetail = rdp->nxtlist; |
| 1376 | receive_rdp->nxttail[RCU_NEXT_TAIL] = | 1368 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; |
| 1377 | rdp->nxttail[RCU_NEXT_TAIL]; | ||
| 1378 | receive_rdp->n_cbs_adopted += rdp->qlen; | ||
| 1379 | rdp->n_cbs_orphaned += rdp->qlen; | ||
| 1380 | |||
| 1381 | rdp->nxtlist = NULL; | ||
| 1382 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
| 1383 | rdp->nxttail[i] = &rdp->nxtlist; | ||
| 1384 | } | 1369 | } |
| 1385 | 1370 | ||
| 1371 | /* Finally, initialize the rcu_data structure's list to empty. */ | ||
| 1372 | rdp->nxtlist = NULL; | ||
| 1373 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
| 1374 | rdp->nxttail[i] = &rdp->nxtlist; | ||
| 1375 | } | ||
| 1376 | |||
| 1377 | /* | ||
| 1378 | * Adopt the RCU callbacks from the specified rcu_state structure's | ||
| 1379 | * orphanage. The caller must hold the ->onofflock. | ||
| 1380 | */ | ||
| 1381 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
| 1382 | { | ||
| 1383 | int i; | ||
| 1384 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | ||
| 1385 | |||
| 1386 | /* | 1386 | /* |
| 1387 | * Record a quiescent state for the dying CPU. This is safe | 1387 | * If there is an rcu_barrier() operation in progress, then |
| 1388 | * only because we have already cleared out the callbacks. | 1388 | * only the task doing that operation is permitted to adopt |
| 1389 | * (Otherwise, the RCU core might try to schedule the invocation | 1389 | * callbacks. To do otherwise breaks rcu_barrier() and friends |
| 1390 | * of callbacks on this now-offline CPU, which would be bad.) | 1390 | * by causing them to fail to wait for the callbacks in the |
| 1391 | * orphanage. | ||
| 1391 | */ | 1392 | */ |
| 1392 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1393 | if (rsp->rcu_barrier_in_progress && |
| 1394 | rsp->rcu_barrier_in_progress != current) | ||
| 1395 | return; | ||
| 1396 | |||
| 1397 | /* Do the accounting first. */ | ||
| 1398 | rdp->qlen_lazy += rsp->qlen_lazy; | ||
| 1399 | rdp->qlen += rsp->qlen; | ||
| 1400 | rdp->n_cbs_adopted += rsp->qlen; | ||
| 1401 | rsp->qlen_lazy = 0; | ||
| 1402 | rsp->qlen = 0; | ||
| 1403 | |||
| 1404 | /* | ||
| 1405 | * We do not need a memory barrier here because the only way we | ||
| 1406 | * can get here if there is an rcu_barrier() in flight is if | ||
| 1407 | * we are the task doing the rcu_barrier(). | ||
| 1408 | */ | ||
| 1409 | |||
| 1410 | /* First adopt the ready-to-invoke callbacks. */ | ||
| 1411 | if (rsp->orphan_donelist != NULL) { | ||
| 1412 | *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; | ||
| 1413 | *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; | ||
| 1414 | for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) | ||
| 1415 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | ||
| 1416 | rdp->nxttail[i] = rsp->orphan_donetail; | ||
| 1417 | rsp->orphan_donelist = NULL; | ||
| 1418 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
| 1419 | } | ||
| 1420 | |||
| 1421 | /* And then adopt the callbacks that still need a grace period. */ | ||
| 1422 | if (rsp->orphan_nxtlist != NULL) { | ||
| 1423 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; | ||
| 1424 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; | ||
| 1425 | rsp->orphan_nxtlist = NULL; | ||
| 1426 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
| 1427 | } | ||
| 1428 | } | ||
| 1429 | |||
| 1430 | /* | ||
| 1431 | * Trace the fact that this CPU is going offline. | ||
| 1432 | */ | ||
| 1433 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | ||
| 1434 | { | ||
| 1435 | RCU_TRACE(unsigned long mask); | ||
| 1436 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); | ||
| 1437 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); | ||
| 1438 | |||
| 1439 | RCU_TRACE(mask = rdp->grpmask); | ||
| 1393 | trace_rcu_grace_period(rsp->name, | 1440 | trace_rcu_grace_period(rsp->name, |
| 1394 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | 1441 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), |
| 1395 | "cpuofl"); | 1442 | "cpuofl"); |
| 1396 | rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); | ||
| 1397 | /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ | ||
| 1398 | } | 1443 | } |
| 1399 | 1444 | ||
| 1400 | /* | 1445 | /* |
| 1401 | * The CPU has been completely removed, and some other CPU is reporting | 1446 | * The CPU has been completely removed, and some other CPU is reporting |
| 1402 | * this fact from process context. Do the remainder of the cleanup. | 1447 | * this fact from process context. Do the remainder of the cleanup, |
| 1448 | * including orphaning the outgoing CPU's RCU callbacks, and also | ||
| 1449 | * adopting them, if there is no _rcu_barrier() instance running. | ||
| 1403 | * There can only be one CPU hotplug operation at a time, so no other | 1450 | * There can only be one CPU hotplug operation at a time, so no other |
| 1404 | * CPU can be attempting to update rcu_cpu_kthread_task. | 1451 | * CPU can be attempting to update rcu_cpu_kthread_task. |
| 1405 | */ | 1452 | */ |
| @@ -1409,17 +1456,21 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
| 1409 | unsigned long mask; | 1456 | unsigned long mask; |
| 1410 | int need_report = 0; | 1457 | int need_report = 0; |
| 1411 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 1458 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
| 1412 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ | 1459 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ |
| 1413 | 1460 | ||
| 1414 | /* Adjust any no-longer-needed kthreads. */ | 1461 | /* Adjust any no-longer-needed kthreads. */ |
| 1415 | rcu_stop_cpu_kthread(cpu); | 1462 | rcu_stop_cpu_kthread(cpu); |
| 1416 | rcu_node_kthread_setaffinity(rnp, -1); | 1463 | rcu_node_kthread_setaffinity(rnp, -1); |
| 1417 | 1464 | ||
| 1418 | /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ | 1465 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ |
| 1419 | 1466 | ||
| 1420 | /* Exclude any attempts to start a new grace period. */ | 1467 | /* Exclude any attempts to start a new grace period. */ |
| 1421 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1468 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
| 1422 | 1469 | ||
| 1470 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | ||
| 1471 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | ||
| 1472 | rcu_adopt_orphan_cbs(rsp); | ||
| 1473 | |||
| 1423 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | 1474 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ |
| 1424 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | 1475 | mask = rdp->grpmask; /* rnp->grplo is constant. */ |
| 1425 | do { | 1476 | do { |
| @@ -1456,6 +1507,10 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
| 1456 | 1507 | ||
| 1457 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1508 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 1458 | 1509 | ||
| 1510 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
| 1511 | { | ||
| 1512 | } | ||
| 1513 | |||
| 1459 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1514 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
| 1460 | { | 1515 | { |
| 1461 | } | 1516 | } |
| @@ -1524,9 +1579,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1524 | rcu_is_callbacks_kthread()); | 1579 | rcu_is_callbacks_kthread()); |
| 1525 | 1580 | ||
| 1526 | /* Update count, and requeue any remaining callbacks. */ | 1581 | /* Update count, and requeue any remaining callbacks. */ |
| 1527 | rdp->qlen_lazy -= count_lazy; | ||
| 1528 | rdp->qlen -= count; | ||
| 1529 | rdp->n_cbs_invoked += count; | ||
| 1530 | if (list != NULL) { | 1582 | if (list != NULL) { |
| 1531 | *tail = rdp->nxtlist; | 1583 | *tail = rdp->nxtlist; |
| 1532 | rdp->nxtlist = list; | 1584 | rdp->nxtlist = list; |
| @@ -1536,6 +1588,10 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1536 | else | 1588 | else |
| 1537 | break; | 1589 | break; |
| 1538 | } | 1590 | } |
| 1591 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | ||
| 1592 | rdp->qlen_lazy -= count_lazy; | ||
| 1593 | rdp->qlen -= count; | ||
| 1594 | rdp->n_cbs_invoked += count; | ||
| 1539 | 1595 | ||
| 1540 | /* Reinstate batch limit if we have worked down the excess. */ | 1596 | /* Reinstate batch limit if we have worked down the excess. */ |
| 1541 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) | 1597 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) |
| @@ -1824,13 +1880,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 1824 | rdp = this_cpu_ptr(rsp->rda); | 1880 | rdp = this_cpu_ptr(rsp->rda); |
| 1825 | 1881 | ||
| 1826 | /* Add the callback to our list. */ | 1882 | /* Add the callback to our list. */ |
| 1827 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
| 1828 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
| 1829 | rdp->qlen++; | 1883 | rdp->qlen++; |
| 1830 | if (lazy) | 1884 | if (lazy) |
| 1831 | rdp->qlen_lazy++; | 1885 | rdp->qlen_lazy++; |
| 1832 | else | 1886 | else |
| 1833 | rcu_idle_count_callbacks_posted(); | 1887 | rcu_idle_count_callbacks_posted(); |
| 1888 | smp_mb(); /* Count before adding callback for rcu_barrier(). */ | ||
| 1889 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
| 1890 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
| 1834 | 1891 | ||
| 1835 | if (__is_kfree_rcu_offset((unsigned long)func)) | 1892 | if (__is_kfree_rcu_offset((unsigned long)func)) |
| 1836 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | 1893 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, |
| @@ -2169,11 +2226,10 @@ static int rcu_cpu_has_callbacks(int cpu) | |||
| 2169 | rcu_preempt_cpu_has_callbacks(cpu); | 2226 | rcu_preempt_cpu_has_callbacks(cpu); |
| 2170 | } | 2227 | } |
| 2171 | 2228 | ||
| 2172 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; | 2229 | /* |
| 2173 | static atomic_t rcu_barrier_cpu_count; | 2230 | * RCU callback function for _rcu_barrier(). If we are last, wake |
| 2174 | static DEFINE_MUTEX(rcu_barrier_mutex); | 2231 | * up the task executing _rcu_barrier(). |
| 2175 | static struct completion rcu_barrier_completion; | 2232 | */ |
| 2176 | |||
| 2177 | static void rcu_barrier_callback(struct rcu_head *notused) | 2233 | static void rcu_barrier_callback(struct rcu_head *notused) |
| 2178 | { | 2234 | { |
| 2179 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2235 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
| @@ -2203,27 +2259,94 @@ static void _rcu_barrier(struct rcu_state *rsp, | |||
| 2203 | void (*call_rcu_func)(struct rcu_head *head, | 2259 | void (*call_rcu_func)(struct rcu_head *head, |
| 2204 | void (*func)(struct rcu_head *head))) | 2260 | void (*func)(struct rcu_head *head))) |
| 2205 | { | 2261 | { |
| 2206 | BUG_ON(in_interrupt()); | 2262 | int cpu; |
| 2263 | unsigned long flags; | ||
| 2264 | struct rcu_data *rdp; | ||
| 2265 | struct rcu_head rh; | ||
| 2266 | |||
| 2267 | init_rcu_head_on_stack(&rh); | ||
| 2268 | |||
| 2207 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ | 2269 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ |
| 2208 | mutex_lock(&rcu_barrier_mutex); | 2270 | mutex_lock(&rcu_barrier_mutex); |
| 2209 | init_completion(&rcu_barrier_completion); | 2271 | |
| 2272 | smp_mb(); /* Prevent any prior operations from leaking in. */ | ||
| 2273 | |||
| 2210 | /* | 2274 | /* |
| 2211 | * Initialize rcu_barrier_cpu_count to 1, then invoke | 2275 | * Initialize the count to one rather than to zero in order to |
| 2212 | * rcu_barrier_func() on each CPU, so that each CPU also has | 2276 | * avoid a too-soon return to zero in case of a short grace period |
| 2213 | * incremented rcu_barrier_cpu_count. Only then is it safe to | 2277 | * (or preemption of this task). Also flag this task as doing |
| 2214 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU | 2278 | * an rcu_barrier(). This will prevent anyone else from adopting |
| 2215 | * might complete its grace period before all of the other CPUs | 2279 | * orphaned callbacks, which could otherwise cause failure if a |
| 2216 | * did their increment, causing this function to return too | 2280 | * CPU went offline and quickly came back online. To see this, |
| 2217 | * early. Note that on_each_cpu() disables irqs, which prevents | 2281 | * consider the following sequence of events: |
| 2218 | * any CPUs from coming online or going offline until each online | 2282 | * |
| 2219 | * CPU has queued its RCU-barrier callback. | 2283 | * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. |
| 2284 | * 2. CPU 1 goes offline, orphaning its callbacks. | ||
| 2285 | * 3. CPU 0 adopts CPU 1's orphaned callbacks. | ||
| 2286 | * 4. CPU 1 comes back online. | ||
| 2287 | * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. | ||
| 2288 | * 6. Both rcu_barrier_callback() callbacks are invoked, awakening | ||
| 2289 | * us -- but before CPU 1's orphaned callbacks are invoked!!! | ||
| 2220 | */ | 2290 | */ |
| 2291 | init_completion(&rcu_barrier_completion); | ||
| 2221 | atomic_set(&rcu_barrier_cpu_count, 1); | 2292 | atomic_set(&rcu_barrier_cpu_count, 1); |
| 2222 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); | 2293 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
| 2294 | rsp->rcu_barrier_in_progress = current; | ||
| 2295 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
| 2296 | |||
| 2297 | /* | ||
| 2298 | * Force every CPU with callbacks to register a new callback | ||
| 2299 | * that will tell us when all the preceding callbacks have | ||
| 2300 | * been invoked. If an offline CPU has callbacks, wait for | ||
| 2301 | * it to either come back online or to finish orphaning those | ||
| 2302 | * callbacks. | ||
| 2303 | */ | ||
| 2304 | for_each_possible_cpu(cpu) { | ||
| 2305 | preempt_disable(); | ||
| 2306 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 2307 | if (cpu_is_offline(cpu)) { | ||
| 2308 | preempt_enable(); | ||
| 2309 | while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) | ||
| 2310 | schedule_timeout_interruptible(1); | ||
| 2311 | } else if (ACCESS_ONCE(rdp->qlen)) { | ||
| 2312 | smp_call_function_single(cpu, rcu_barrier_func, | ||
| 2313 | (void *)call_rcu_func, 1); | ||
| 2314 | preempt_enable(); | ||
| 2315 | } else { | ||
| 2316 | preempt_enable(); | ||
| 2317 | } | ||
| 2318 | } | ||
| 2319 | |||
| 2320 | /* | ||
| 2321 | * Now that all online CPUs have rcu_barrier_callback() callbacks | ||
| 2322 | * posted, we can adopt all of the orphaned callbacks and place | ||
| 2323 | * an rcu_barrier_callback() callback after them. When that is done, | ||
| 2324 | * we are guaranteed to have an rcu_barrier_callback() callback | ||
| 2325 | * following every callback that could possibly have been | ||
| 2326 | * registered before _rcu_barrier() was called. | ||
| 2327 | */ | ||
| 2328 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
| 2329 | rcu_adopt_orphan_cbs(rsp); | ||
| 2330 | rsp->rcu_barrier_in_progress = NULL; | ||
| 2331 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
| 2332 | atomic_inc(&rcu_barrier_cpu_count); | ||
| 2333 | smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ | ||
| 2334 | call_rcu_func(&rh, rcu_barrier_callback); | ||
| 2335 | |||
| 2336 | /* | ||
| 2337 | * Now that we have an rcu_barrier_callback() callback on each | ||
| 2338 | * CPU, and thus each counted, remove the initial count. | ||
| 2339 | */ | ||
| 2223 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 2340 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
| 2224 | complete(&rcu_barrier_completion); | 2341 | complete(&rcu_barrier_completion); |
| 2342 | |||
| 2343 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ | ||
| 2225 | wait_for_completion(&rcu_barrier_completion); | 2344 | wait_for_completion(&rcu_barrier_completion); |
| 2345 | |||
| 2346 | /* Other rcu_barrier() invocations can now safely proceed. */ | ||
| 2226 | mutex_unlock(&rcu_barrier_mutex); | 2347 | mutex_unlock(&rcu_barrier_mutex); |
| 2348 | |||
| 2349 | destroy_rcu_head_on_stack(&rh); | ||
| 2227 | } | 2350 | } |
| 2228 | 2351 | ||
| 2229 | /** | 2352 | /** |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 36ca28ecedc..1e49c568596 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
| @@ -371,6 +371,17 @@ struct rcu_state { | |||
| 371 | 371 | ||
| 372 | raw_spinlock_t onofflock; /* exclude on/offline and */ | 372 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
| 373 | /* starting new GP. */ | 373 | /* starting new GP. */ |
| 374 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | ||
| 375 | /* need a grace period. */ | ||
| 376 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | ||
| 377 | struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ | ||
| 378 | /* are ready to invoke. */ | ||
| 379 | struct rcu_head **orphan_donetail; /* Tail of above. */ | ||
| 380 | long qlen_lazy; /* Number of lazy callbacks. */ | ||
| 381 | long qlen; /* Total number of callbacks. */ | ||
| 382 | struct task_struct *rcu_barrier_in_progress; | ||
| 383 | /* Task doing rcu_barrier(), */ | ||
| 384 | /* or NULL if no barrier. */ | ||
| 374 | raw_spinlock_t fqslock; /* Only one task forcing */ | 385 | raw_spinlock_t fqslock; /* Only one task forcing */ |
| 375 | /* quiescent states. */ | 386 | /* quiescent states. */ |
| 376 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 387 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index ed459edeff4..d4bc16ddd1d 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
| @@ -271,13 +271,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | |||
| 271 | 271 | ||
| 272 | gpnum = rsp->gpnum; | 272 | gpnum = rsp->gpnum; |
| 273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " | 273 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
| 274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", | 274 | "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", |
| 275 | rsp->completed, gpnum, rsp->fqs_state, | 275 | rsp->completed, gpnum, rsp->fqs_state, |
| 276 | (long)(rsp->jiffies_force_qs - jiffies), | 276 | (long)(rsp->jiffies_force_qs - jiffies), |
| 277 | (int)(jiffies & 0xffff), | 277 | (int)(jiffies & 0xffff), |
| 278 | rsp->n_force_qs, rsp->n_force_qs_ngp, | 278 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
| 279 | rsp->n_force_qs - rsp->n_force_qs_ngp, | 279 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
| 280 | rsp->n_force_qs_lh); | 280 | rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); |
| 281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { | 281 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { |
| 282 | if (rnp->level != level) { | 282 | if (rnp->level != level) { |
| 283 | seq_puts(m, "\n"); | 283 | seq_puts(m, "\n"); |
