rcu: Move propagation of ->completed from rcu_start_gp() to rcu_report_qs_rsp()

It is possible for the CPU that noted the end of the prior grace period to not need a new one, and therefore to decide to propagate ->completed throughout the rcu_node tree without starting another grace period. However, in so doing, it releases the root rcu_node structure's lock, which can allow some other CPU to start another grace period. The first CPU will be propagating ->completed in parallel with the second CPU initializing the rcu_node tree for the new grace period. In theory this is harmless, but in practice we need to keep things simple.

This commit therefore moves the propagation of ->completed to rcu_report_qs_rsp(), and refrains from marking the old grace period as having been completed until it has finished doing this. This prevents anyone from starting a new grace period concurrently with marking the old grace period as having been completed.

Of course, the optimization where a CPU needing a new grace period doesn't bother marking the old one completed is still in effect: In that case, the marking happens implicitly as part of initializing the new grace period.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 2011-08-24 19:52:09 -0400
committer: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 2011-09-29 00:38:49 -0400
commit: afe24b122eb6edb5f1cb942570ac8d766105c7fc (patch)
tree: c1f9e2fcbcf2d374f36ee3bfc45babf576cb6246 /kernel/rcutree.c
parent: e90c53d3e238dd0b7b02964370e8fece1778df96 (diff)
1 files changed, 51 insertions, 20 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e75df0c93abd..e234eb92a177 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -842,28 +842,24 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
        struct rcu_node *rnp = rcu_get_root(rsp);
        if (!rcu_scheduler_fully_active ||
-            !cpu_needs_another_gp(rsp, rdp) ||
+            !cpu_needs_another_gp(rsp, rdp)) {
-            rsp->fqs_active) {
+                /*
-                if (rcu_scheduler_fully_active &&
+                 * Either the scheduler hasn't yet spawned the first
-                    cpu_needs_another_gp(rsp, rdp))
+                 * non-idle task or this CPU does not need another
-                        rsp->fqs_need_gp = 1;
+                 * grace period.  Either way, don't start a new grace
-                if (rnp->completed == rsp->completed) {
+                 * period.
-                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
+                 */
-                        return;
+                raw_spin_unlock_irqrestore(&rnp->lock, flags);
-                }
+                return;
-                raw_spin_unlock(&rnp->lock);     /* irqs remain disabled. */
+        }
+        if (rsp->fqs_active) {
                /*
-                 * Propagate new ->completed value to rcu_node structures
+                 * This CPU needs a grace period, but force_quiescent_state()
-                 * so that other CPUs don't have to wait until the start
+                 * is running.  Tell it to start one on this CPU's behalf.
-                 * of the next grace period to process their callbacks.
                 */
-                rcu_for_each_node_breadth_first(rsp, rnp) {
+                rsp->fqs_need_gp = 1;
-                        raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+                raw_spin_unlock_irqrestore(&rnp->lock, flags);
-                        rnp->completed = rsp->completed;
-                        raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-                }
-                local_irq_restore(flags);
                return;
        }
@@ -947,6 +943,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
        __releases(rcu_get_root(rsp)->lock)
 {
        unsigned long gp_duration;
+        struct rcu_node *rnp = rcu_get_root(rsp);
+        struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
        WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
@@ -958,7 +956,40 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
        gp_duration = jiffies - rsp->gp_start;
        if (gp_duration > rsp->gp_max)
                rsp->gp_max = gp_duration;
-        rsp->completed = rsp->gpnum;
+        /*
+         * We know the grace period is complete, but to everyone else
+         * it appears to still be ongoing.  But it is also the case
+         * that to everyone else it looks like there is nothing that
+         * they can do to advance the grace period.  It is therefore
+         * safe for us to drop the lock in order to mark the grace
+         * period as completed in all of the rcu_node structures.
+         *
+         * But if this CPU needs another grace period, it will take
+         * care of this while initializing the next grace period.
+         * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
+         * because the callbacks have not yet been advanced: Those
+         * callbacks are waiting on the grace period that just now
+         * completed.
+         */
+        if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
+                raw_spin_unlock(&rnp->lock);     /* irqs remain disabled. */
+                /*
+                 * Propagate new ->completed value to rcu_node structures
+                 * so that other CPUs don't have to wait until the start
+                 * of the next grace period to process their callbacks.
+                 */
+                rcu_for_each_node_breadth_first(rsp, rnp) {
+                        raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+                        rnp->completed = rsp->gpnum;
+                        raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+                }
+                rnp = rcu_get_root(rsp);
+                raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+        }
+        rsp->completed = rsp->gpnum;  /* Declare the grace period complete. */
        trace_rcu_grace_period(rsp->name, rsp->completed, "end");
        rsp->signaled = RCU_GP_IDLE;
        rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
author	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2011-08-24 19:52:09 -0400
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2011-09-29 00:38:49 -0400
commit	afe24b122eb6edb5f1cb942570ac8d766105c7fc (patch)
tree	c1f9e2fcbcf2d374f36ee3bfc45babf576cb6246 /kernel/rcutree.c
parent	e90c53d3e238dd0b7b02964370e8fece1778df96 (diff)

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e75df0c93abd..e234eb92a177 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -842,28 +842,24 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
842	struct rcu_node *rnp = rcu_get_root(rsp);	842	struct rcu_node *rnp = rcu_get_root(rsp);
843		843
844	if (!rcu_scheduler_fully_active ||	844	if (!rcu_scheduler_fully_active ||
845	!cpu_needs_another_gp(rsp, rdp) ||	845	!cpu_needs_another_gp(rsp, rdp)) {
846	rsp->fqs_active) {	846	/*
847	if (rcu_scheduler_fully_active &&	847	* Either the scheduler hasn't yet spawned the first
848	cpu_needs_another_gp(rsp, rdp))	848	* non-idle task or this CPU does not need another
849	rsp->fqs_need_gp = 1;	849	* grace period. Either way, don't start a new grace
850	if (rnp->completed == rsp->completed) {	850	* period.
851	raw_spin_unlock_irqrestore(&rnp->lock, flags);	851	*/
852	return;	852	raw_spin_unlock_irqrestore(&rnp->lock, flags);
853	}	853	return;
854	raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */	854	}
855		855
		856	if (rsp->fqs_active) {
856	/*	857	/*
857	* Propagate new ->completed value to rcu_node structures	858	* This CPU needs a grace period, but force_quiescent_state()
858	* so that other CPUs don't have to wait until the start	859	* is running. Tell it to start one on this CPU's behalf.
859	* of the next grace period to process their callbacks.
860	*/	860	*/
861	rcu_for_each_node_breadth_first(rsp, rnp) {	861	rsp->fqs_need_gp = 1;
862	raw_spin_lock(&rnp->lock); /* irqs already disabled. */	862	raw_spin_unlock_irqrestore(&rnp->lock, flags);
863	rnp->completed = rsp->completed;
864	raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
865	}
866	local_irq_restore(flags);
867	return;	863	return;
868	}	864	}
869		865
@@ -947,6 +943,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
947	__releases(rcu_get_root(rsp)->lock)	943	__releases(rcu_get_root(rsp)->lock)
948	{	944	{
949	unsigned long gp_duration;	945	unsigned long gp_duration;
		946	struct rcu_node *rnp = rcu_get_root(rsp);
		947	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
950		948
951	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));	949	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
952		950
@@ -958,7 +956,40 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
958	gp_duration = jiffies - rsp->gp_start;	956	gp_duration = jiffies - rsp->gp_start;
959	if (gp_duration > rsp->gp_max)	957	if (gp_duration > rsp->gp_max)
960	rsp->gp_max = gp_duration;	958	rsp->gp_max = gp_duration;
961	rsp->completed = rsp->gpnum;	959
		960	/*
		961	* We know the grace period is complete, but to everyone else
		962	* it appears to still be ongoing. But it is also the case
		963	* that to everyone else it looks like there is nothing that
		964	* they can do to advance the grace period. It is therefore
		965	* safe for us to drop the lock in order to mark the grace
		966	* period as completed in all of the rcu_node structures.
		967	*
		968	* But if this CPU needs another grace period, it will take
		969	* care of this while initializing the next grace period.
		970	* We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
		971	* because the callbacks have not yet been advanced: Those
		972	* callbacks are waiting on the grace period that just now
		973	* completed.
		974	*/
		975	if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
		976	raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
		977
		978	/*
		979	* Propagate new ->completed value to rcu_node structures
		980	* so that other CPUs don't have to wait until the start
		981	* of the next grace period to process their callbacks.
		982	*/
		983	rcu_for_each_node_breadth_first(rsp, rnp) {
		984	raw_spin_lock(&rnp->lock); /* irqs already disabled. */
		985	rnp->completed = rsp->gpnum;
		986	raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
		987	}
		988	rnp = rcu_get_root(rsp);
		989	raw_spin_lock(&rnp->lock); /* irqs already disabled. */
		990	}
		991
		992	rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
962	trace_rcu_grace_period(rsp->name, rsp->completed, "end");	993	trace_rcu_grace_period(rsp->name, rsp->completed, "end");
963	rsp->signaled = RCU_GP_IDLE;	994	rsp->signaled = RCU_GP_IDLE;
964	rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */	995	rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */