author	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2015-01-24 00:52:37 -0500
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2015-03-12 18:19:37 -0400
commit	0aa04b055e71bd3b8040dd71a126126c66b6f01e (patch)
tree	7f4269bf77d46611b380d1ea2e98360b8c02ce35 /kernel/rcu/tree.c
parent	cc99a310caf811aebbd0986f433d824e4a5e7ce5 (diff)
rcu: Process offlining and onlining only at grace-period start
Races between CPU hotplug and grace periods can be difficult to resolve, so the ->onoff_mutex is used to exclude the two events. Unfortunately, this means that it is impossible for an outgoing CPU to perform the last bits of its offlining from its last pass through the idle loop, because sleeplocks cannot be acquired in that context.

This commit avoids these problems by buffering online and offline events in a new ->qsmaskinitnext field in the leaf rcu_node structures. When a grace period starts, the events accumulated in this mask are applied to the ->qsmaskinit field, and, if needed, up the rcu_node tree.

The special case of all CPUs corresponding to a given leaf rcu_node structure being offline while there are still elements in that structure's ->blkd_tasks list is handled using a new ->wait_blkd_tasks field. In this case, propagating the offline bits up the tree is deferred until the beginning of the grace period after all of the tasks have exited their RCU read-side critical sections and removed themselves from the list, at which point the ->wait_blkd_tasks flag is cleared. If one of that leaf rcu_node structure's CPUs comes back online before the list empties, then the ->wait_blkd_tasks flag is simply cleared.

This of course means that RCU's notion of which CPUs are offline can be out of date. This is OK because RCU need only wait on CPUs that were online at the time that the grace period started. In addition, RCU's force-quiescent-state actions will handle the case where a CPU goes offline after the grace period starts.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
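To make the buffering scheme concrete, here is a minimal user-space sketch of the idea. It is not the kernel code: the struct and the cpu_online()/cpu_offline()/gp_start() helpers are invented for illustration, and locking, the rcu_node tree, and the ->blkd_tasks list are reduced to a single leaf and one boolean.

/*
 * Toy model of the ->qsmaskinitnext buffering, not the kernel code.
 * One leaf, no locking, ->blkd_tasks reduced to a boolean.
 */
#include <stdbool.h>
#include <stdio.h>

struct node {
	unsigned long qsmaskinit;	/* CPUs the current grace period waits on. */
	unsigned long qsmaskinitnext;	/* Buffered online/offline events. */
	bool blkd_tasks;		/* Stand-in for a non-empty ->blkd_tasks list. */
	bool wait_blkd_tasks;		/* Offline propagation deferred? */
};

/* Hotplug events only touch the buffer; no grace-period state changes here. */
static void cpu_online(struct node *n, int cpu)  { n->qsmaskinitnext |=  (1UL << cpu); }
static void cpu_offline(struct node *n, int cpu) { n->qsmaskinitnext &= ~(1UL << cpu); }

/* At grace-period start, fold the buffered events into ->qsmaskinit. */
static void gp_start(struct node *n)
{
	unsigned long oldmask = n->qsmaskinit;

	n->qsmaskinit = n->qsmaskinitnext;

	/* Zero-ness changed: first CPU online or last CPU offline. */
	if (!oldmask != !n->qsmaskinit) {
		if (!oldmask)
			printf("first CPU online: propagate bit up the tree\n");
		else if (n->blkd_tasks)
			n->wait_blkd_tasks = true;	/* Defer offline propagation. */
		else
			printf("last CPU offline: clear bit up the tree\n");
	}

	/* Deferred offline: propagate once the blocked tasks are gone,
	 * or stop deferring if a CPU has come back online meanwhile. */
	if (n->wait_blkd_tasks && (!n->blkd_tasks || n->qsmaskinit)) {
		n->wait_blkd_tasks = false;
		if (!n->qsmaskinit)
			printf("blocked tasks gone: clear bit up the tree\n");
	}
}

int main(void)
{
	struct node n = { .blkd_tasks = true };

	cpu_online(&n, 0);	/* Buffered only. */
	gp_start(&n);		/* Now visible to the grace period. */
	cpu_offline(&n, 0);	/* Buffered; blocked tasks defer propagation. */
	gp_start(&n);
	n.blkd_tasks = false;	/* Tasks leave their read-side critical sections. */
	gp_start(&n);		/* Deferred offline finally propagates. */
	return 0;
}

The point of the model is the ordering: hotplug events only ever touch the buffered mask, and only the start of a grace period folds them into the mask the grace period actually waits on, which is why the outgoing CPU no longer needs a sleeplock in the idle loop.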
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r--	kernel/rcu/tree.c	154
1 file changed, 121 insertions(+), 33 deletions(-)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5b5cb1ff73ed..f0f4d3510d24 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -152,6 +152,8 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
  */
 static int rcu_scheduler_fully_active __read_mostly;
 
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
 static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -179,6 +181,17 @@ unsigned long rcutorture_testseq;
 unsigned long rcutorture_vernum;
 
 /*
+ * Compute the mask of online CPUs for the specified rcu_node structure.
+ * This will not be stable unless the rcu_node structure's ->lock is
+ * held, but the bit corresponding to the current CPU will be stable
+ * in most contexts.
+ */
+unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+{
+	return ACCESS_ONCE(rnp->qsmaskinitnext);
+}
+
+/*
  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
  * permit this function to be invoked without holding the root rcu_node
  * structure's ->lock, but of course results can be subject to change.
@@ -960,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void)
 	preempt_disable();
 	rdp = this_cpu_ptr(&rcu_sched_data);
 	rnp = rdp->mynode;
-	ret = (rdp->grpmask & rnp->qsmaskinit) ||
+	ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ||
 	      !rcu_scheduler_fully_active;
 	preempt_enable();
 	return ret;
@@ -1710,6 +1723,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
  */
 static int rcu_gp_init(struct rcu_state *rsp)
 {
+	unsigned long oldmask;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
@@ -1745,6 +1759,55 @@ static int rcu_gp_init(struct rcu_state *rsp)
 	smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
 
 	/*
+	 * Apply per-leaf buffered online and offline operations to the
+	 * rcu_node tree.  Note that this new grace period need not wait
+	 * for subsequent online CPUs, and that quiescent-state forcing
+	 * will handle subsequent offline CPUs.
+	 */
+	rcu_for_each_leaf_node(rsp, rnp) {
+		raw_spin_lock_irq(&rnp->lock);
+		smp_mb__after_unlock_lock();
+		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
+		    !rnp->wait_blkd_tasks) {
+			/* Nothing to do on this leaf rcu_node structure. */
+			raw_spin_unlock_irq(&rnp->lock);
+			continue;
+		}
+
+		/* Record old state, apply changes to ->qsmaskinit field. */
+		oldmask = rnp->qsmaskinit;
+		rnp->qsmaskinit = rnp->qsmaskinitnext;
+
+		/* If zero-ness of ->qsmaskinit changed, propagate up tree. */
+		if (!oldmask != !rnp->qsmaskinit) {
+			if (!oldmask) /* First online CPU for this rcu_node. */
+				rcu_init_new_rnp(rnp);
+			else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */
+				rnp->wait_blkd_tasks = true;
+			else /* Last offline CPU and can propagate. */
+				rcu_cleanup_dead_rnp(rnp);
+		}
+
+		/*
+		 * If all waited-on tasks from prior grace period are
+		 * done, and if all this rcu_node structure's CPUs are
+		 * still offline, propagate up the rcu_node tree and
+		 * clear ->wait_blkd_tasks.  Otherwise, if one of this
+		 * rcu_node structure's CPUs has since come back online,
+		 * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp()
+		 * checks for this, so just call it unconditionally).
+		 */
+		if (rnp->wait_blkd_tasks &&
+		    (!rcu_preempt_has_tasks(rnp) ||
+		     rnp->qsmaskinit)) {
+			rnp->wait_blkd_tasks = false;
+			rcu_cleanup_dead_rnp(rnp);
+		}
+
+		raw_spin_unlock_irq(&rnp->lock);
+	}
+
+	/*
 	 * Set the quiescent-state-needed bits in all the rcu_node
 	 * structures for all currently online CPUs in breadth-first order,
 	 * starting from the root rcu_node structure, relying on the layout
@@ -2133,7 +2196,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
  * irqs disabled, and this lock is released upon return, but irqs remain
  * disabled.
  */
-static void __maybe_unused rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
+static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
 					    struct rcu_node *rnp, unsigned long flags)
 	__releases(rnp->lock)
 {
@@ -2409,6 +2472,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 		smp_mb__after_unlock_lock(); /* GP memory ordering. */
 		rnp->qsmaskinit &= ~mask;
+		rnp->qsmask &= ~mask;
 		if (rnp->qsmaskinit) {
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 			return;
@@ -2427,6 +2491,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
+	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
@@ -2443,12 +2508,12 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
 
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+	mask = rdp->grpmask;
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	smp_mb__after_unlock_lock();	/* Enforce GP memory-order guarantee. */
-	rnp->qsmaskinit &= ~rdp->grpmask;
-	if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
-		rcu_cleanup_dead_rnp(rnp);
-	rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
+	rnp->qsmaskinitnext &= ~mask;
+	raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
 	WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
 		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
 		  cpu, rdp->qlen, rdp->nxtlist);
@@ -2654,12 +2719,21 @@ static void force_qs_rnp(struct rcu_state *rsp,
 			}
 		}
 		if (mask != 0) {
-
-			/* rcu_report_qs_rnp() releases rnp->lock. */
+			/* Idle/offline CPUs, report. */
 			rcu_report_qs_rnp(mask, rsp, rnp, flags);
-			continue;
+		} else if (rnp->parent &&
+			   list_empty(&rnp->blkd_tasks) &&
+			   !rnp->qsmask &&
+			   (rnp->parent->qsmask & rnp->grpmask)) {
+			/*
+			 * Race between grace-period initialization and task
+			 * exiting RCU read-side critical section, report.
+			 */
+			rcu_report_unblock_qs_rnp(rsp, rnp, flags);
+		} else {
+			/* Nothing to do here, so just drop the lock. */
+			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		}
-		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	}
 }
 
@@ -3569,6 +3643,28 @@ void rcu_barrier_sched(void)
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
 /*
+ * Propagate ->qsinitmask bits up the rcu_node tree to account for the
+ * first CPU in a given leaf rcu_node structure coming online.  The caller
+ * must hold the corresponding leaf rcu_node ->lock with interrupts
+ * disabled.
+ */
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
+{
+	long mask;
+	struct rcu_node *rnp = rnp_leaf;
+
+	for (;;) {
+		mask = rnp->grpmask;
+		rnp = rnp->parent;
+		if (rnp == NULL)
+			return;
+		raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
+		rnp->qsmaskinit |= mask;
+		raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
+	}
+}
+
+/*
  * Do boot-time initialization of a CPU's per-CPU RCU data.
  */
 static void __init
@@ -3620,31 +3716,23 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 		   (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
 	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
 
-	/* Add CPU to rcu_node bitmasks. */
+	/*
+	 * Add CPU to leaf rcu_node pending-online bitmask.  Any needed
+	 * propagation up the rcu_node tree will happen at the beginning
+	 * of the next grace period.
+	 */
 	rnp = rdp->mynode;
 	mask = rdp->grpmask;
-	do {
-		/* Exclude any attempts to start a new GP on small systems. */
-		raw_spin_lock(&rnp->lock);	/* irqs already disabled. */
-		rnp->qsmaskinit |= mask;
-		mask = rnp->grpmask;
-		if (rnp == rdp->mynode) {
-			/*
-			 * If there is a grace period in progress, we will
-			 * set up to wait for it next time we run the
-			 * RCU core code.
-			 */
-			rdp->gpnum = rnp->completed;
-			rdp->completed = rnp->completed;
-			rdp->passed_quiesce = 0;
-			rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
-			rdp->qs_pending = 0;
-			trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
-		}
-		raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
-		rnp = rnp->parent;
-	} while (rnp != NULL && !(rnp->qsmaskinit & mask));
-	local_irq_restore(flags);
+	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
+	smp_mb__after_unlock_lock();
+	rnp->qsmaskinitnext |= mask;
+	rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
+	rdp->completed = rnp->completed;
+	rdp->passed_quiesce = false;
+	rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+	rdp->qs_pending = false;
+	trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
+	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 	mutex_unlock(&rsp->onoff_mutex);
 }