author	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2016-01-30 20:57:35 -0500
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2016-03-31 16:34:08 -0400
commit	f6a12f34a448cc8a624070fd365c29c890138a48 (patch)
tree	c7fc5c50f1bf0c5af3b6a7d5f2dc61f43a4cfca3
parent	d40a4f09a448382961fa9b1a2f7d4f34813f0273 (diff)
rcu: Enforce expedited-GP fairness via funnel wait queue
The current mutex-based funnel-locking approach used by expedited grace
periods is subject to severe unfairness.  The problem arises when a
few tasks, making a path from leaves to root, all wake up before other
tasks do.  A new task can then follow this path all the way to the root,
which needlessly delays tasks whose grace period is done, but who do
not happen to acquire the lock quickly enough.

This commit avoids this problem by maintaining per-rcu_node wait queues,
along with a per-rcu_node counter that tracks the latest grace period
sought by an earlier task to visit this node.  If that grace period
would satisfy the current task, instead of proceeding up the tree, it
waits on the current rcu_node structure using a pair of wait queues
provided for that purpose.  This decouples awakening of old tasks from
the arrival of new tasks.

If the wakeups prove to be a bottleneck, additional kthreads can be
brought to bear for that purpose.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
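For orientation before the diff: each rcu_node now carries a spinlock-protected
->exp_seq_rq counter and a pair of wait queues, and a task climbing the funnel
parks on the wait queue selected by bit 1 of its sequence snapshot as soon as it
finds a node whose recorded request already covers it.  The condensed sketch
below (the helper name is hypothetical; tracing and the workdone statistics are
omitted) follows the exp_funnel_lock() logic added by this patch:

/* Condensed sketch of the funnel-wait path; not the literal kernel code. */
static bool exp_funnel_lock_sketch(struct rcu_state *rsp, unsigned long s)
{
	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
	struct rcu_node *rnp;

	for (rnp = rdp->mynode; rnp != NULL; rnp = rnp->parent) {
		if (rcu_exp_gp_seq_done(rsp, s))
			return true;	/* Our grace period already elapsed. */
		spin_lock(&rnp->exp_lock);
		if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
			/* An earlier task requested >= our GP, so wait here. */
			spin_unlock(&rnp->exp_lock);
			wait_event(rnp->exp_wq[(s >> 1) & 0x1],
				   rcu_exp_gp_seq_done(rsp, s));
			return true;
		}
		rnp->exp_seq_rq = s;	/* Later arrivals can wait on us. */
		spin_unlock(&rnp->exp_lock);
	}
	mutex_lock(&rsp->exp_mutex);	/* We must run the expedited GP ourselves. */
	return false;
}

Two wait queues per node are used because expedited sequence numbers advance by
two per grace period, so (s >> 1) & 0x1 alternates between consecutive grace
periods; rcu_exp_wake() can then wake only the cohort whose grace period just
ended without disturbing tasks already queued for the next one.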
-rw-r--r--	include/trace/events/rcu.h	5
-rw-r--r--	kernel/rcu/tree.c	155
-rw-r--r--	kernel/rcu/tree.h	10
-rw-r--r--	kernel/rcu/tree_plugin.h	16
4 files changed, 93 insertions(+), 93 deletions(-)
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index aacc172eba7e..d3e756539d44 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -179,6 +179,7 @@ TRACE_EVENT(rcu_grace_period_init,
  * "snap": Captured snapshot of expedited grace period sequence number.
  * "start": Started a real expedited grace period.
  * "end": Ended a real expedited grace period.
+ * "endwake": Woke piggybackers up.
  * "done": Someone else did the expedited grace period for us.
  */
 TRACE_EVENT(rcu_exp_grace_period,
@@ -210,8 +211,8 @@ TRACE_EVENT(rcu_exp_grace_period,
  * and highest-numbered CPU associated with the current rcu_node structure,
  * and a string. identifying the grace-period-related event as follows:
  *
- * "acq": Acquired a level of funnel lock
- * "rel": Released a level of funnel lock
+ * "nxtlvl": Advance to next level of rcu_node funnel
+ * "wait": Wait for someone else to do expedited GP
  */
 TRACE_EVENT(rcu_exp_funnel_lock,
 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 89f028767765..bd2658edce00 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -102,6 +102,7 @@ struct rcu_state sname##_state = { \
 	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
 	.name = RCU_STATE_NAME(sname), \
 	.abbr = sabbr, \
+	.exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \
 }
 
 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
@@ -3484,7 +3485,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
  * for the current expedited grace period.  Works only for preemptible
  * RCU -- other RCU implementation use other means.
  *
- * Caller must hold the root rcu_node's exp_funnel_mutex.
+ * Caller must hold the rcu_state's exp_mutex.
  */
 static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 {
@@ -3500,8 +3501,8 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
  * recursively up the tree.  (Calm down, calm down, we do the recursion
  * iteratively!)
  *
- * Caller must hold the root rcu_node's exp_funnel_mutex and the
- * specified rcu_node structure's ->lock.
+ * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
+ * structure's ->lock.
  */
 static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 				 bool wake, unsigned long flags)
@@ -3538,7 +3539,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
  * Report expedited quiescent state for specified node.  This is a
  * lock-acquisition wrapper function for __rcu_report_exp_rnp().
  *
- * Caller must hold the root rcu_node's exp_funnel_mutex.
+ * Caller must hold the rcu_state's exp_mutex.
  */
 static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
 					      struct rcu_node *rnp, bool wake)
@@ -3551,8 +3552,8 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
 
 /*
  * Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure.  Caller must hold the root
- * rcu_node's exp_funnel_mutex.
+ * specified leaf rcu_node structure.  Caller must hold the rcu_state's
+ * exp_mutex.
  */
 static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
 				    unsigned long mask, bool wake)
@@ -3570,7 +3571,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
 
 /*
  * Report expedited quiescent state for specified rcu_data (CPU).
- * Caller must hold the root rcu_node's exp_funnel_mutex.
  */
 static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
 			       bool wake)
@@ -3579,24 +3579,11 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
 }
 
 /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
-static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
-			       struct rcu_data *rdp,
-			       atomic_long_t *stat, unsigned long s)
+static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
+			       unsigned long s)
 {
 	if (rcu_exp_gp_seq_done(rsp, s)) {
 		trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
-		if (rnp) {
-			trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
-						  rnp->grplo, rnp->grphi,
-						  TPS("rel"));
-			mutex_unlock(&rnp->exp_funnel_mutex);
-		} else if (rdp) {
-			trace_rcu_exp_funnel_lock(rsp->name,
-						  rdp->mynode->level + 1,
-						  rdp->cpu, rdp->cpu,
-						  TPS("rel"));
-			mutex_unlock(&rdp->exp_funnel_mutex);
-		}
 		/* Ensure test happens before caller kfree(). */
 		smp_mb__before_atomic(); /* ^^^ */
 		atomic_long_inc(stat);
@@ -3606,53 +3593,53 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
 }
 
 /*
- * Funnel-lock acquisition for expedited grace periods.  Returns a
- * pointer to the root rcu_node structure, or NULL if some other
- * task did the expedited grace period for us.
+ * Funnel-lock acquisition for expedited grace periods.  Returns true
+ * if some other task completed an expedited grace period that this task
+ * can piggy-back on, and with no mutex held.  Otherwise, returns false
+ * with the mutex held, indicating that the caller must actually do the
+ * expedited grace period.
  */
-static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
+static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 {
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
-	struct rcu_node *rnp0;
-	struct rcu_node *rnp1 = NULL;
+	struct rcu_node *rnp = rdp->mynode;
 
 	/*
-	 * Each pass through the following loop works its way
-	 * up the rcu_node tree, returning if others have done the
-	 * work or otherwise falls through holding the root rnp's
-	 * ->exp_funnel_mutex.  The mapping from CPU to rcu_node structure
-	 * can be inexact, as it is just promoting locality and is not
-	 * strictly needed for correctness.
+	 * Each pass through the following loop works its way up
+	 * the rcu_node tree, returning if others have done the work or
+	 * otherwise falls through to acquire rsp->exp_mutex.  The mapping
+	 * from CPU to rcu_node structure can be inexact, as it is just
+	 * promoting locality and is not strictly needed for correctness.
 	 */
-	if (sync_exp_work_done(rsp, NULL, NULL, &rdp->exp_workdone1, s))
-		return NULL;
-	mutex_lock(&rdp->exp_funnel_mutex);
-	trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1,
-				  rdp->cpu, rdp->cpu, TPS("acq"));
-	rnp0 = rdp->mynode;
-	for (; rnp0 != NULL; rnp0 = rnp0->parent) {
-		if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone2, s))
-			return NULL;
-		mutex_lock(&rnp0->exp_funnel_mutex);
-		trace_rcu_exp_funnel_lock(rsp->name, rnp0->level,
-					  rnp0->grplo, rnp0->grphi, TPS("acq"));
-		if (rnp1) {
-			trace_rcu_exp_funnel_lock(rsp->name, rnp1->level,
-						  rnp1->grplo, rnp1->grphi,
-						  TPS("rel"));
-			mutex_unlock(&rnp1->exp_funnel_mutex);
-		} else {
-			trace_rcu_exp_funnel_lock(rsp->name,
-						  rdp->mynode->level + 1,
-						  rdp->cpu, rdp->cpu,
-						  TPS("rel"));
-			mutex_unlock(&rdp->exp_funnel_mutex);
+	for (; rnp != NULL; rnp = rnp->parent) {
+		if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
+			return true;
+
+		/* Work not done, either wait here or go up. */
+		spin_lock(&rnp->exp_lock);
+		if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
+
+			/* Someone else doing GP, so wait for them. */
+			spin_unlock(&rnp->exp_lock);
+			trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
+						  rnp->grplo, rnp->grphi,
+						  TPS("wait"));
+			wait_event(rnp->exp_wq[(s >> 1) & 0x1],
+				   sync_exp_work_done(rsp,
+						      &rdp->exp_workdone2, s));
+			return true;
 		}
-		rnp1 = rnp0;
+		rnp->exp_seq_rq = s; /* Followers can wait on us. */
+		spin_unlock(&rnp->exp_lock);
+		trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
+					  rnp->grphi, TPS("nxtlvl"));
 	}
-	if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone3, s))
-		return NULL;
-	return rnp1;
+	mutex_lock(&rsp->exp_mutex);
+	if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
+		mutex_unlock(&rsp->exp_mutex);
+		return true;
+	}
+	return false;
 }
 
 /* Invoked on each online non-idle CPU for expedited quiescent state. */
@@ -3841,6 +3828,27 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 	}
 }
 
+/*
+ * Wake up everyone who piggybacked on the just-completed expedited
+ * grace period.  Also update all the ->exp_seq_rq counters as needed
+ * in order to avoid counter-wrap problems.
+ */
+static void rcu_exp_wake(struct rcu_state *rsp, unsigned long s)
+{
+	struct rcu_node *rnp;
+
+	rcu_for_each_node_breadth_first(rsp, rnp) {
+		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
+			spin_lock(&rnp->exp_lock);
+			/* Recheck, avoid hang in case someone just arrived. */
+			if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
+				rnp->exp_seq_rq = s;
+			spin_unlock(&rnp->exp_lock);
+		}
+		wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x1]);
+	}
+}
+
 /**
  * synchronize_sched_expedited - Brute-force RCU-sched grace period
  *
@@ -3860,7 +3868,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 void synchronize_sched_expedited(void)
 {
 	unsigned long s;
-	struct rcu_node *rnp;
 	struct rcu_state *rsp = &rcu_sched_state;
 
 	/* If only one CPU, this is automatically a grace period. */
@@ -3877,20 +3884,23 @@ void synchronize_sched_expedited(void)
 	s = rcu_exp_gp_seq_snap(rsp);
 	trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
 
-	rnp = exp_funnel_lock(rsp, s);
-	if (rnp == NULL)
+	if (exp_funnel_lock(rsp, s))
 		return;  /* Someone else did our work for us. */
 
 	rcu_exp_gp_seq_start(rsp);
 	trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
+
+	/* Initialize the rcu_node tree in preparation for the wait. */
 	sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
-	synchronize_sched_expedited_wait(rsp);
 
+	/* Wait and clean up, including waking everyone. */
+	synchronize_sched_expedited_wait(rsp);
 	rcu_exp_gp_seq_end(rsp);
 	trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
-	trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
-				  rnp->grplo, rnp->grphi, TPS("rel"));
-	mutex_unlock(&rnp->exp_funnel_mutex);
+	rcu_exp_wake(rsp, s);
+
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
+	mutex_unlock(&rsp->exp_mutex);
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
@@ -4190,7 +4200,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 	WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
 	rdp->cpu = cpu;
 	rdp->rsp = rsp;
-	mutex_init(&rdp->exp_funnel_mutex);
 	rcu_boot_init_nocb_percpu_data(rdp);
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
@@ -4448,10 +4457,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 {
 	static const char * const buf[] = RCU_NODE_NAME_INIT;
 	static const char * const fqs[] = RCU_FQS_NAME_INIT;
-	static const char * const exp[] = RCU_EXP_NAME_INIT;
 	static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
 	static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
-	static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
 	static u8 fl_mask = 0x1;
 
 	int levelcnt[RCU_NUM_LVLS];		/* # nodes in each level. */
@@ -4510,9 +4517,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 			rnp->level = i;
 			INIT_LIST_HEAD(&rnp->blkd_tasks);
 			rcu_init_one_nocb(rnp);
-			mutex_init(&rnp->exp_funnel_mutex);
-			lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
-						   &rcu_exp_class[i], exp[i]);
+			init_waitqueue_head(&rnp->exp_wq[0]);
+			init_waitqueue_head(&rnp->exp_wq[1]);
+			spin_lock_init(&rnp->exp_lock);
 		}
 	}
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 6a8f09446924..f9d4fbb1e014 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -70,7 +70,6 @@
 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
 # define RCU_NODE_NAME_INIT { "rcu_node_0" }
 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
-# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
 #elif NR_CPUS <= RCU_FANOUT_2
 # define RCU_NUM_LVLS 2
 # define NUM_RCU_LVL_0 1
@@ -79,7 +78,6 @@
 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
-# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
 #elif NR_CPUS <= RCU_FANOUT_3
 # define RCU_NUM_LVLS 3
 # define NUM_RCU_LVL_0 1
@@ -89,7 +87,6 @@
 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
-# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
 #elif NR_CPUS <= RCU_FANOUT_4
 # define RCU_NUM_LVLS 4
 # define NUM_RCU_LVL_0 1
@@ -100,7 +97,6 @@
 # define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
 # define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
 # define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
-# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
 #else
 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
 #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
@@ -252,7 +248,9 @@ struct rcu_node {
 				/* Counts of upcoming no-CB GP requests. */
 	raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
 
-	struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
+	spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
+	unsigned long exp_seq_rq;
+	wait_queue_head_t exp_wq[2];
 } ____cacheline_internodealigned_in_smp;
 
 /*
@@ -387,7 +385,6 @@ struct rcu_data {
 #ifdef CONFIG_RCU_FAST_NO_HZ
 	struct rcu_head oom_head;
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-	struct mutex exp_funnel_mutex;
 	atomic_long_t exp_workdone1;	/* # done by others #1. */
 	atomic_long_t exp_workdone2;	/* # done by others #2. */
 	atomic_long_t exp_workdone3;	/* # done by others #3. */
@@ -504,6 +501,7 @@ struct rcu_state {
 						/* _rcu_barrier(). */
 	/* End of fields guarded by barrier_mutex. */
 
+	struct mutex exp_mutex;			/* Serialize expedited GP. */
 	unsigned long expedited_sequence;	/* Take a ticket. */
 	atomic_long_t expedited_normal;		/* # fallbacks to normal. */
 	atomic_t expedited_need_qs;		/* # CPUs left to check in. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 36e94aed38a7..c82c3640493f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -738,8 +738,6 @@ static void sync_rcu_exp_handler(void *info)
  */
 void synchronize_rcu_expedited(void)
 {
-	struct rcu_node *rnp;
-	struct rcu_node *rnp_unlock;
 	struct rcu_state *rsp = rcu_state_p;
 	unsigned long s;
 
@@ -752,8 +750,7 @@ void synchronize_rcu_expedited(void)
 	s = rcu_exp_gp_seq_snap(rsp);
 	trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
 
-	rnp_unlock = exp_funnel_lock(rsp, s);
-	if (rnp_unlock == NULL)
+	if (exp_funnel_lock(rsp, s))
 		return;  /* Someone else did our work for us. */
 
 	rcu_exp_gp_seq_start(rsp);
@@ -763,16 +760,13 @@ void synchronize_rcu_expedited(void)
 	sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
 
 	/* Wait for snapshotted ->blkd_tasks lists to drain. */
-	rnp = rcu_get_root(rsp);
 	synchronize_sched_expedited_wait(rsp);
-
-	/* Clean up and exit. */
 	rcu_exp_gp_seq_end(rsp);
 	trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
-	mutex_unlock(&rnp_unlock->exp_funnel_mutex);
-	trace_rcu_exp_funnel_lock(rsp->name, rnp_unlock->level,
-				  rnp_unlock->grplo, rnp_unlock->grphi,
-				  TPS("rel"));
+	rcu_exp_wake(rsp, s);
+
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
+	mutex_unlock(&rsp->exp_mutex);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 