author    Paul E. McKenney <paulmck@linux.vnet.ibm.com>    2018-02-02 01:05:38 -0500
committer Paul E. McKenney <paulmck@linux.vnet.ibm.com>    2018-05-15 13:25:44 -0400
commit    25f3d7effab632eb10d145f1a5aebf6515a04b98 (patch)
tree      8f088c805ed43c898543f1e74d21e808605143fc
parent    60cc43fc888428bb2f18f08997432d426a243338 (diff)
rcu: Parallelize expedited grace-period initialization
The latency of RCU expedited grace periods grows with increasing numbers of
CPUs, eventually failing to be all that expedited.  Much of the growth in
latency is in the initialization phase, so this commit uses workqueues to
carry out this initialization concurrently on a rcu_node-by-rcu_node basis.

This change makes use of a new rcu_par_gp_wq because flushing a work item
from another work item running from the same workqueue can result in
deadlock.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Nicholas Piggin <npiggin@gmail.com>
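Editor's note: for readers skimming the diff below, the heart of the change is a fan-out/flush pattern: per-node initialization is queued as one work item per leaf rcu_node on a dedicated workqueue, then each queued item is flushed. The following is a minimal standalone sketch of that pattern, not the patch's code; "my_node", "my_node_init", "init_all_nodes", and "init_wq" are hypothetical names, while the workqueue calls themselves (alloc_workqueue(), INIT_WORK(), queue_work_on(), flush_work(), container_of()) are the same ones the patch uses. The dedicated queue matters because flushing a work item from a work item on the same workqueue can deadlock, which is why the patch introduces rcu_par_gp_wq.

/*
 * Illustrative sketch only -- not part of the patch.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *init_wq;	/* analogous to rcu_par_gp_wq */

struct my_node {
	struct work_struct work;	/* per-node work item */
	bool need_flush;		/* was work queued for this node? */
	int cpu;			/* preferred CPU for this node's work */
};

/* Runs on init_wq, potentially on many CPUs concurrently. */
static void my_node_init(struct work_struct *wp)
{
	struct my_node *np = container_of(wp, struct my_node, work);

	/* ... per-node initialization would go here ... */
	(void)np;
}

static void init_all_nodes(struct my_node *nodes, int n)
{
	int i;

	/* Fan the initialization out, one work item per node. */
	for (i = 0; i < n; i++) {
		INIT_WORK(&nodes[i].work, my_node_init);
		queue_work_on(nodes[i].cpu, init_wq, &nodes[i].work);
		nodes[i].need_flush = true;
	}

	/* Wait for every queued work item to finish. */
	for (i = 0; i < n; i++)
		if (nodes[i].need_flush)
			flush_work(&nodes[i].work);
}

static int __init my_init(void)
{
	/* Separate queue so flushing from another workqueue cannot deadlock. */
	init_wq = alloc_workqueue("init_wq", WQ_MEM_RECLAIM, 0);
	return init_wq ? 0 : -ENOMEM;
}

The patch does the same thing per leaf rcu_node in sync_rcu_exp_select_cpus(), with an additional fallback to a direct function call when the workqueue does not yet exist during early boot.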
-rw-r--r--  kernel/rcu/rcu.h      |   1
-rw-r--r--  kernel/rcu/tree.c     |   3
-rw-r--r--  kernel/rcu/tree.h     |  10
-rw-r--r--  kernel/rcu/tree_exp.h | 184
4 files changed, 120 insertions(+), 78 deletions(-)
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 7a693e31184a..976019d6fa06 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -486,6 +486,7 @@ void rcu_force_quiescent_state(void);
 void rcu_bh_force_quiescent_state(void);
 void rcu_sched_force_quiescent_state(void);
 extern struct workqueue_struct *rcu_gp_wq;
+extern struct workqueue_struct *rcu_par_gp_wq;
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 #ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2a734692a581..23781fc90830 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4168,6 +4168,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
 }
 
 struct workqueue_struct *rcu_gp_wq;
+struct workqueue_struct *rcu_par_gp_wq;
 
 void __init rcu_init(void)
 {
@@ -4199,6 +4200,8 @@ void __init rcu_init(void)
 	/* Create workqueue for expedited GPs and for Tree SRCU. */
 	rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
 	WARN_ON(!rcu_gp_wq);
+	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+	WARN_ON(!rcu_par_gp_wq);
 }
 
 #include "tree_exp.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index f491ab4f2e8e..98d33902b65c 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -58,6 +58,14 @@ struct rcu_dynticks {
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 };
 
+/* Communicate arguments to a workqueue handler. */
+struct rcu_exp_work {
+	smp_call_func_t rew_func;
+	struct rcu_state *rew_rsp;
+	unsigned long rew_s;
+	struct work_struct rew_work;
+};
+
 /* RCU's kthread states for tracing. */
 #define RCU_KTHREAD_STOPPED 0
 #define RCU_KTHREAD_RUNNING 1
@@ -157,6 +165,8 @@ struct rcu_node {
 	spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
 	unsigned long exp_seq_rq;
 	wait_queue_head_t exp_wq[4];
+	struct rcu_exp_work rew;
+	bool exp_need_flush;	/* Need to flush workitem? */
 } ____cacheline_internodealigned_in_smp;
 
 /*
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index f72eefab8543..73e1d3dca5b1 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -362,93 +362,129 @@ static void sync_sched_exp_online_cleanup(int cpu)
 }
 
 /*
- * Select the nodes that the upcoming expedited grace period needs
- * to wait for.
+ * Select the CPUs within the specified rcu_node that the upcoming
+ * expedited grace period needs to wait for.
  */
-static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
-				     smp_call_func_t func)
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 {
 	int cpu;
 	unsigned long flags;
+	smp_call_func_t func;
 	unsigned long mask_ofl_test;
 	unsigned long mask_ofl_ipi;
 	int ret;
-	struct rcu_node *rnp;
+	struct rcu_exp_work *rewp =
+		container_of(wp, struct rcu_exp_work, rew_work);
+	struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
+	struct rcu_state *rsp = rewp->rew_rsp;
 
-	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
-	sync_exp_reset_tree(rsp);
-	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
-	rcu_for_each_leaf_node(rsp, rnp) {
-		raw_spin_lock_irqsave_rcu_node(rnp, flags);
-
-		/* Each pass checks a CPU for identity, offline, and idle. */
-		mask_ofl_test = 0;
-		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
-			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-			struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
-			int snap;
+	func = rewp->rew_func;
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 
-			if (raw_smp_processor_id() == cpu ||
-			    !(rnp->qsmaskinitnext & mask)) {
+	/* Each pass checks a CPU for identity, offline, and idle. */
+	mask_ofl_test = 0;
+	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+		struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+		struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
+		int snap;
+
+		if (raw_smp_processor_id() == cpu ||
+		    !(rnp->qsmaskinitnext & mask)) {
+			mask_ofl_test |= mask;
+		} else {
+			snap = rcu_dynticks_snap(rdtp);
+			if (rcu_dynticks_in_eqs(snap))
 				mask_ofl_test |= mask;
-			} else {
-				snap = rcu_dynticks_snap(rdtp);
-				if (rcu_dynticks_in_eqs(snap))
-					mask_ofl_test |= mask;
-				else
-					rdp->exp_dynticks_snap = snap;
-			}
+			else
+				rdp->exp_dynticks_snap = snap;
 		}
-		mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
-
-		/*
-		 * Need to wait for any blocked tasks as well.  Note that
-		 * additional blocking tasks will also block the expedited
-		 * GP until such time as the ->expmask bits are cleared.
-		 */
-		if (rcu_preempt_has_tasks(rnp))
-			rnp->exp_tasks = rnp->blkd_tasks.next;
-		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	}
+	mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
 
-		/* IPI the remaining CPUs for expedited quiescent state. */
-		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
-			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+	/*
+	 * Need to wait for any blocked tasks as well.  Note that
+	 * additional blocking tasks will also block the expedited GP
+	 * until such time as the ->expmask bits are cleared.
+	 */
+	if (rcu_preempt_has_tasks(rnp))
+		rnp->exp_tasks = rnp->blkd_tasks.next;
+	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+	/* IPI the remaining CPUs for expedited quiescent state. */
+	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+		struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 
-			if (!(mask_ofl_ipi & mask))
-				continue;
+		if (!(mask_ofl_ipi & mask))
+			continue;
 retry_ipi:
-			if (rcu_dynticks_in_eqs_since(rdp->dynticks,
-						      rdp->exp_dynticks_snap)) {
-				mask_ofl_test |= mask;
-				continue;
-			}
-			ret = smp_call_function_single(cpu, func, rsp, 0);
-			if (!ret) {
-				mask_ofl_ipi &= ~mask;
-				continue;
-			}
-			/* Failed, raced with CPU hotplug operation. */
-			raw_spin_lock_irqsave_rcu_node(rnp, flags);
-			if ((rnp->qsmaskinitnext & mask) &&
-			    (rnp->expmask & mask)) {
-				/* Online, so delay for a bit and try again. */
+		if (rcu_dynticks_in_eqs_since(rdp->dynticks,
+					      rdp->exp_dynticks_snap)) {
+			mask_ofl_test |= mask;
+			continue;
+		}
+		ret = smp_call_function_single(cpu, func, rsp, 0);
+		if (!ret) {
+			mask_ofl_ipi &= ~mask;
+			continue;
+		}
+		/* Failed, raced with CPU hotplug operation. */
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+		if ((rnp->qsmaskinitnext & mask) &&
+		    (rnp->expmask & mask)) {
+			/* Online, so delay for a bit and try again. */
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+			trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
+			schedule_timeout_uninterruptible(1);
+			goto retry_ipi;
+		}
+		/* CPU really is offline, so we can ignore it. */
+		if (!(rnp->expmask & mask))
+			mask_ofl_ipi &= ~mask;
+		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	}
+	/* Report quiescent states for those that went offline. */
+	mask_ofl_test |= mask_ofl_ipi;
+	if (mask_ofl_test)
+		rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+				     smp_call_func_t func)
+{
+	struct rcu_node *rnp;
+
+	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
+	sync_exp_reset_tree(rsp);
+	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
+
+	/* Schedule work for each leaf rcu_node structure. */
+	rcu_for_each_leaf_node(rsp, rnp) {
+		rnp->exp_need_flush = false;
+		if (!READ_ONCE(rnp->expmask))
+			continue; /* Avoid early boot non-existent wq. */
+		rnp->rew.rew_func = func;
+		rnp->rew.rew_rsp = rsp;
+		if (!READ_ONCE(rcu_par_gp_wq) ||
+		    rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+			/* No workqueues yet. */
+			sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
+			continue;
 		}
-		/* Report quiescent states for those that went offline. */
-		mask_ofl_test |= mask_ofl_ipi;
-		if (mask_ofl_test)
-			rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+		INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+		queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work);
+		rnp->exp_need_flush = true;
 	}
+
+	/* Wait for workqueue jobs (if any) to complete. */
+	rcu_for_each_leaf_node(rsp, rnp)
+		if (rnp->exp_need_flush)
+			flush_work(&rnp->rew.rew_work);
 }
 
 static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
@@ -560,14 +596,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
 	mutex_unlock(&rsp->exp_wake_mutex);
 }
 
-/* Let the workqueue handler know what it is supposed to do. */
-struct rcu_exp_work {
-	smp_call_func_t rew_func;
-	struct rcu_state *rew_rsp;
-	unsigned long rew_s;
-	struct work_struct rew_work;
-};
-
 /*
  * Common code to drive an expedited grace period forward, used by
  * workqueues and mid-boot-time tasks.