author		Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2012-06-20 20:07:14 -0400
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2012-09-23 10:41:52 -0400
commit		cabc49c1ff51baaf1958d501a7a616ce91245c93 (patch)
tree		9d99237196fb45d01b38fb9235815b32fd1995d7 /kernel/rcutree.c
parent		755609a9087fa983f567dc5452b2fa7b089b591f (diff)
rcu: Move RCU grace-period cleanup into kthread
As a first step towards allowing grace-period cleanup to be preemptible,
this commit moves the RCU grace-period cleanup into the same kthread
that is now used to initialize grace periods. This is needed to keep
scheduling latency down to a dull roar.
[ paulmck: Get rid of stray spin_lock_irqsave() calls. ]
Reported-by: Mike Galbraith <mgalbraith@suse.de>
Reported-by: Dimitri Sivanich <sivanich@sgi.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Diffstat (limited to 'kernel/rcutree.c')
-rw-r--r--	kernel/rcutree.c	112
1 file changed, 62 insertions(+), 50 deletions(-)
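Before the diff itself, a minimal, self-contained sketch of the kthread wait-loop idiom that the new grace-period-end code below relies on. This is illustration only, not code from the patch: the demo_* names are invented, and ACCESS_ONCE() is the era's annotation for reads racing with updates.

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_cond;	/* Set elsewhere, followed by wake_up(&demo_wq). */

static int demo_kthread(void *unused)
{
	for (;;) {
		/* Sleep until woken; may also return early on a signal. */
		wait_event_interruptible(demo_wq, ACCESS_ONCE(demo_cond));
		if (ACCESS_ONCE(demo_cond))
			break;			/* Condition really holds. */
		flush_signals(current);		/* Signal, not a real wakeup. */
	}
	/* The (now preemptible) cleanup work would run here. */
	return 0;
}

The recheck after wait_event_interruptible() is what makes signal-induced early returns harmless: the loop only exits once the condition is actually true.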
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 781e5f0b7b17..52c3102dc5f7 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1032,6 +1032,7 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
  */
 static int __noreturn rcu_gp_kthread(void *arg)
 {
+	unsigned long gp_duration;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 	struct rcu_state *rsp = arg;
@@ -1116,6 +1117,65 @@ static int __noreturn rcu_gp_kthread(void *arg)
 		rsp->fqs_state = RCU_SIGNAL_INIT;
 		raw_spin_unlock_irq(&rnp->lock);
 		put_online_cpus();
+
+		/* Handle grace-period end. */
+		rnp = rcu_get_root(rsp);
+		for (;;) {
+			wait_event_interruptible(rsp->gp_wq,
+						 !ACCESS_ONCE(rnp->qsmask) &&
+						 !rcu_preempt_blocked_readers_cgp(rnp));
+			if (!ACCESS_ONCE(rnp->qsmask) &&
+			    !rcu_preempt_blocked_readers_cgp(rnp))
+				break;
+			flush_signals(current);
+		}
+
+		raw_spin_lock_irq(&rnp->lock);
+		gp_duration = jiffies - rsp->gp_start;
+		if (gp_duration > rsp->gp_max)
+			rsp->gp_max = gp_duration;
+
+		/*
+		 * We know the grace period is complete, but to everyone else
+		 * it appears to still be ongoing.  But it is also the case
+		 * that to everyone else it looks like there is nothing that
+		 * they can do to advance the grace period.  It is therefore
+		 * safe for us to drop the lock in order to mark the grace
+		 * period as completed in all of the rcu_node structures.
+		 *
+		 * But if this CPU needs another grace period, it will take
+		 * care of this while initializing the next grace period.
+		 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
+		 * because the callbacks have not yet been advanced: Those
+		 * callbacks are waiting on the grace period that just now
+		 * completed.
+		 */
+		if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
+			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+
+			/*
+			 * Propagate new ->completed value to rcu_node
+			 * structures so that other CPUs don't have to
+			 * wait until the start of the next grace period
+			 * to process their callbacks.
+			 */
+			rcu_for_each_node_breadth_first(rsp, rnp) {
+				/* irqs already disabled. */
+				raw_spin_lock(&rnp->lock);
+				rnp->completed = rsp->gpnum;
+				/* irqs remain disabled. */
+				raw_spin_unlock(&rnp->lock);
+			}
+			rnp = rcu_get_root(rsp);
+			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+		}
+
+		rsp->completed = rsp->gpnum; /* Declare grace period done. */
+		trace_rcu_grace_period(rsp->name, rsp->completed, "end");
+		rsp->fqs_state = RCU_GP_IDLE;
+		if (cpu_needs_another_gp(rsp, rdp))
+			rsp->gp_flags = 1;
+		raw_spin_unlock_irq(&rnp->lock);
 	}
 }
 
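The locking choreography added above is easier to follow outside diff form. A condensed paraphrase, for illustration only (identifiers as in the kernel source, comments mine, not a drop-in replacement):

raw_spin_lock_irq(&rnp->lock);			/* rnp is the root rcu_node. */
if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {	/* No next GP needed by this
						   CPU (see comment above). */
	raw_spin_unlock(&rnp->lock);		/* irqs stay disabled. */
	rcu_for_each_node_breadth_first(rsp, rnp) {
		raw_spin_lock(&rnp->lock);
		rnp->completed = rsp->gpnum;	/* Publish GP end per node. */
		raw_spin_unlock(&rnp->lock);
	}
	rnp = rcu_get_root(rsp);
	raw_spin_lock(&rnp->lock);		/* Retake the root lock. */
}
rsp->completed = rsp->gpnum;			/* GP now globally complete. */
raw_spin_unlock_irq(&rnp->lock);

Dropping the root lock midway is safe because, at this point, only this kthread knows the grace period is complete, so no other CPU can act to advance it.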
@@ -1162,57 +1222,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	__releases(rcu_get_root(rsp)->lock)
 {
-	unsigned long gp_duration;
-	struct rcu_node *rnp = rcu_get_root(rsp);
-	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
-
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
-
-	/*
-	 * Ensure that all grace-period and pre-grace-period activity
-	 * is seen before the assignment to rsp->completed.
-	 */
-	smp_mb(); /* See above block comment. */
-	gp_duration = jiffies - rsp->gp_start;
-	if (gp_duration > rsp->gp_max)
-		rsp->gp_max = gp_duration;
-
-	/*
-	 * We know the grace period is complete, but to everyone else
-	 * it appears to still be ongoing.  But it is also the case
-	 * that to everyone else it looks like there is nothing that
-	 * they can do to advance the grace period.  It is therefore
-	 * safe for us to drop the lock in order to mark the grace
-	 * period as completed in all of the rcu_node structures.
-	 *
-	 * But if this CPU needs another grace period, it will take
-	 * care of this while initializing the next grace period.
-	 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
-	 * because the callbacks have not yet been advanced: Those
-	 * callbacks are waiting on the grace period that just now
-	 * completed.
-	 */
-	if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
-		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-
-		/*
-		 * Propagate new ->completed value to rcu_node structures
-		 * so that other CPUs don't have to wait until the start
-		 * of the next grace period to process their callbacks.
-		 */
-		rcu_for_each_node_breadth_first(rsp, rnp) {
-			raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-			rnp->completed = rsp->gpnum;
-			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
-		}
-		rnp = rcu_get_root(rsp);
-		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
-	}
-
-	rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
-	trace_rcu_grace_period(rsp->name, rsp->completed, "end");
-	rsp->fqs_state = RCU_GP_IDLE;
-	rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
+	raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
+	wake_up(&rsp->gp_wq);  /* Memory barrier implied by wake_up() path. */
 }
 
 /*
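For readability, here is rcu_report_qs_rsp() reassembled from the hunk above as it reads after this patch. All cleanup now runs in the kthread; the reporter merely drops the root lock and wakes the kthread, with the in-tree comment noting that the wake_up() path supplies an implied memory barrier where the removed smp_mb() used to order grace-period activity:

static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
	__releases(rcu_get_root(rsp)->lock)
{
	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
	raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
	wake_up(&rsp->gp_wq);  /* Memory barrier implied by wake_up() path. */
}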