author    Paul E. McKenney <paulmck@linux.vnet.ibm.com>  2015-10-07 19:05:21 -0400
committer Paul E. McKenney <paulmck@linux.vnet.ibm.com>  2015-10-07 19:05:21 -0400
commit    d2856b046d2ce2bfb664727cb8671ad0e371bd6c (patch)
tree      cb9056e8fb6a3038db6629781dfefbac8387d0c2 /kernel/rcu/tree.c
parent    7f5f873c6a0772970d5fee1f364231207051ecd8 (diff)
parent    338b0f760e84676130c6e4d8268cb8c923b38c8c (diff)
Merge branches 'fixes.2015.10.06a' and 'exp.2015.10.07a' into HEAD
exp.2015.10.07a: Reduce OS jitter of RCU-sched expedited grace periods.
fixes.2015.10.06a: Miscellaneous fixes.
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r--  kernel/rcu/tree.c | 457
1 file changed, 364 insertions(+), 93 deletions(-)
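
For context: the cpu_no_qs.b.norm / cpu_no_qs.b.exp / cpu_no_qs.s accesses throughout this diff depend on a small per-CPU union that this series introduces in kernel/rcu/tree.h. Below is a minimal standalone sketch of that layout, with field names chosen to mirror the diff; the exact in-tree definition is not part of this file and may differ.

#include <stdint.h>
#include <stdio.h>

/* Sketch only: the kernel keeps this as a field of the per-CPU rcu_data. */
union rcu_noqs {
	struct {
		uint8_t norm;	/* Normal quiescent state still needed. */
		uint8_t exp;	/* Expedited quiescent state still needed. */
	} b;			/* Individual bits. */
	uint16_t s;		/* Both bits read as one value. */
};

int main(void)
{
	union rcu_noqs cpu_no_qs = { .b = { .norm = 1, .exp = 0 } };

	/* rcu_sched_qs()-style fastpath check: is any QS still owed? */
	if (cpu_no_qs.s)
		printf("QS needed: norm=%u exp=%u\n",
		       (unsigned)cpu_no_qs.b.norm, (unsigned)cpu_no_qs.b.exp);

	cpu_no_qs.b.norm = 0;	/* Report the normal quiescent state. */
	printf("after report: s=%u\n", (unsigned)cpu_no_qs.s);
	return 0;
}

This is why rcu_sched_qs() below can test cpu_no_qs.s once to cover both the normal and expedited cases before touching either bit individually.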
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4d296b0fb987..f07343b54fe5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -71,7 +71,6 @@ MODULE_ALIAS("rcutree");
 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
 static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
-static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS];
 
 /*
  * In order to export the rcu_state name to the tracing tools, it
@@ -161,6 +160,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
 static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
+static void rcu_report_exp_rdp(struct rcu_state *rsp,
+			       struct rcu_data *rdp, bool wake);
 
 /* rcuc/rcub kthread realtime priority */
 #ifdef CONFIG_RCU_KTHREAD_PRIO
@@ -245,21 +246,33 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
  */
 void rcu_sched_qs(void)
 {
-	if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) {
+	unsigned long flags;
+
+	if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
 		trace_rcu_grace_period(TPS("rcu_sched"),
 				       __this_cpu_read(rcu_sched_data.gpnum),
 				       TPS("cpuqs"));
-		__this_cpu_write(rcu_sched_data.passed_quiesce, 1);
+		__this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
+		if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+			return;
+		local_irq_save(flags);
+		if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
+			__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
+			rcu_report_exp_rdp(&rcu_sched_state,
+					   this_cpu_ptr(&rcu_sched_data),
+					   true);
+		}
+		local_irq_restore(flags);
 	}
 }
 
 void rcu_bh_qs(void)
 {
-	if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) {
+	if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
 		trace_rcu_grace_period(TPS("rcu_bh"),
 				       __this_cpu_read(rcu_bh_data.gpnum),
 				       TPS("cpuqs"));
-		__this_cpu_write(rcu_bh_data.passed_quiesce, 1);
+		__this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
 	}
 }
 
@@ -1753,9 +1766,9 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 		 */
 		rdp->gpnum = rnp->gpnum;
 		trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart"));
-		rdp->passed_quiesce = 0;
+		rdp->cpu_no_qs.b.norm = true;
 		rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
-		rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
+		rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
 		zero_cpu_stall_ticks(rdp);
 		WRITE_ONCE(rdp->gpwrap, false);
 	}
@@ -2344,7 +2357,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 	rnp = rdp->mynode;
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	smp_mb__after_unlock_lock();
-	if ((rdp->passed_quiesce == 0 &&
+	if ((rdp->cpu_no_qs.b.norm &&
 	     rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
 	    rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
 	    rdp->gpwrap) {
@@ -2355,7 +2368,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 		 * We will instead need a new quiescent state that lies
 		 * within the current grace period.
 		 */
-		rdp->passed_quiesce = 0;	/* need qs for new gp. */
+		rdp->cpu_no_qs.b.norm = true;	/* need qs for new gp. */
 		rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
@@ -2364,7 +2377,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 	if ((rnp->qsmask & mask) == 0) {
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	} else {
-		rdp->qs_pending = 0;
+		rdp->core_needs_qs = 0;
 
 		/*
 		 * This GP can't end until cpu checks in, so all of our
@@ -2395,14 +2408,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 	 * Does this CPU still need to do its part for current grace period?
 	 * If no, return and let the other CPUs do their part as well.
 	 */
-	if (!rdp->qs_pending)
+	if (!rdp->core_needs_qs)
 		return;
 
 	/*
 	 * Was there a quiescent state since the beginning of the grace
	 * period? If no, then exit and wait for the next call.
 	 */
-	if (!rdp->passed_quiesce &&
+	if (rdp->cpu_no_qs.b.norm &&
 	    rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr))
 		return;
 
@@ -3386,6 +3399,191 @@ static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
 	return rcu_seq_done(&rsp->expedited_sequence, s);
 }
 
+/*
+ * Reset the ->expmaskinit values in the rcu_node tree to reflect any
+ * recent CPU-online activity.  Note that these masks are not cleared
+ * when CPUs go offline, so they reflect the union of all CPUs that have
+ * ever been online.  This means that this function normally takes its
+ * no-work-to-do fastpath.
+ */
+static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
+{
+	bool done;
+	unsigned long flags;
+	unsigned long mask;
+	unsigned long oldmask;
+	int ncpus = READ_ONCE(rsp->ncpus);
+	struct rcu_node *rnp;
+	struct rcu_node *rnp_up;
+
+	/* If no new CPUs onlined since last time, nothing to do. */
+	if (likely(ncpus == rsp->ncpus_snap))
+		return;
+	rsp->ncpus_snap = ncpus;
+
+	/*
+	 * Each pass through the following loop propagates newly onlined
+	 * CPUs for the current rcu_node structure up the rcu_node tree.
+	 */
+	rcu_for_each_leaf_node(rsp, rnp) {
+		raw_spin_lock_irqsave(&rnp->lock, flags);
+		smp_mb__after_unlock_lock();
+		if (rnp->expmaskinit == rnp->expmaskinitnext) {
+			raw_spin_unlock_irqrestore(&rnp->lock, flags);
+			continue;  /* No new CPUs, nothing to do. */
+		}
+
+		/* Update this node's mask, track old value for propagation. */
+		oldmask = rnp->expmaskinit;
+		rnp->expmaskinit = rnp->expmaskinitnext;
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+		/* If was already nonzero, nothing to propagate. */
+		if (oldmask)
+			continue;
+
+		/* Propagate the new CPU up the tree. */
+		mask = rnp->grpmask;
+		rnp_up = rnp->parent;
+		done = false;
+		while (rnp_up) {
+			raw_spin_lock_irqsave(&rnp_up->lock, flags);
+			smp_mb__after_unlock_lock();
+			if (rnp_up->expmaskinit)
+				done = true;
+			rnp_up->expmaskinit |= mask;
+			raw_spin_unlock_irqrestore(&rnp_up->lock, flags);
+			if (done)
+				break;
+			mask = rnp_up->grpmask;
+			rnp_up = rnp_up->parent;
+		}
+	}
+}
+
+/*
+ * Reset the ->expmask values in the rcu_node tree in preparation for
+ * a new expedited grace period.
+ */
+static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_node *rnp;
+
+	sync_exp_reset_tree_hotplug(rsp);
+	rcu_for_each_node_breadth_first(rsp, rnp) {
+		raw_spin_lock_irqsave(&rnp->lock, flags);
+		smp_mb__after_unlock_lock();
+		WARN_ON_ONCE(rnp->expmask);
+		rnp->expmask = rnp->expmaskinit;
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+	}
+}
+
+/*
+ * Return non-zero if there is no RCU expedited grace period in progress
+ * for the specified rcu_node structure, in other words, if all CPUs and
+ * tasks covered by the specified rcu_node structure have done their bit
+ * for the current expedited grace period.  Works only for preemptible
+ * RCU -- other RCU implementation use other means.
+ *
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
+ */
+static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
+{
+	return rnp->exp_tasks == NULL &&
+	       READ_ONCE(rnp->expmask) == 0;
+}
+
+/*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+ * grace period.  This event is reported either to the rcu_node structure on
+ * which the task was queued or to one of that rcu_node structure's ancestors,
+ * recursively up the tree.  (Calm down, calm down, we do the recursion
+ * iteratively!)
+ *
+ * Caller must hold the root rcu_node's exp_funnel_mutex and the
+ * specified rcu_node structure's ->lock.
+ */
+static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+				 bool wake, unsigned long flags)
+	__releases(rnp->lock)
+{
+	unsigned long mask;
+
+	for (;;) {
+		if (!sync_rcu_preempt_exp_done(rnp)) {
+			if (!rnp->expmask)
+				rcu_initiate_boost(rnp, flags);
+			else
+				raw_spin_unlock_irqrestore(&rnp->lock, flags);
+			break;
+		}
+		if (rnp->parent == NULL) {
+			raw_spin_unlock_irqrestore(&rnp->lock, flags);
+			if (wake) {
+				smp_mb(); /* EGP done before wake_up(). */
+				wake_up(&rsp->expedited_wq);
+			}
+			break;
+		}
+		mask = rnp->grpmask;
+		raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
+		rnp = rnp->parent;
+		raw_spin_lock(&rnp->lock); /* irqs already disabled */
+		smp_mb__after_unlock_lock();
+		WARN_ON_ONCE(!(rnp->expmask & mask));
+		rnp->expmask &= ~mask;
+	}
+}
+
+/*
+ * Report expedited quiescent state for specified node.  This is a
+ * lock-acquisition wrapper function for __rcu_report_exp_rnp().
+ *
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
+ */
+static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
+					      struct rcu_node *rnp, bool wake)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rnp->lock, flags);
+	smp_mb__after_unlock_lock();
+	__rcu_report_exp_rnp(rsp, rnp, wake, flags);
+}
+
+/*
+ * Report expedited quiescent state for multiple CPUs, all covered by the
+ * specified leaf rcu_node structure.  Caller must hold the root
+ * rcu_node's exp_funnel_mutex.
+ */
+static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
+				    unsigned long mask, bool wake)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rnp->lock, flags);
+	smp_mb__after_unlock_lock();
+	if (!(rnp->expmask & mask)) {
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	rnp->expmask &= ~mask;
+	__rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */
+}
+
+/*
+ * Report expedited quiescent state for specified rcu_data (CPU).
+ * Caller must hold the root rcu_node's exp_funnel_mutex.
+ */
+static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
+			       bool wake)
+{
+	rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake);
+}
+
 /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
 static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
 			       struct rcu_data *rdp,
@@ -3462,16 +3660,111 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 }
 
 /* Invoked on each online non-idle CPU for expedited quiescent state. */
-static int synchronize_sched_expedited_cpu_stop(void *data)
+static void sync_sched_exp_handler(void *data)
 {
-	struct rcu_data *rdp = data;
-	struct rcu_state *rsp = rdp->rsp;
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+	struct rcu_state *rsp = data;
 
-	/* We are here: If we are last, do the wakeup. */
-	rdp->exp_done = true;
-	if (atomic_dec_and_test(&rsp->expedited_need_qs))
-		wake_up(&rsp->expedited_wq);
-	return 0;
+	rdp = this_cpu_ptr(rsp->rda);
+	rnp = rdp->mynode;
+	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
+	    __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
+		return;
+	__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
+	resched_cpu(smp_processor_id());
+}
+
+/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
+static void sync_sched_exp_online_cleanup(int cpu)
+{
+	struct rcu_data *rdp;
+	int ret;
+	struct rcu_node *rnp;
+	struct rcu_state *rsp = &rcu_sched_state;
+
+	rdp = per_cpu_ptr(rsp->rda, cpu);
+	rnp = rdp->mynode;
+	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
+		return;
+	ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0);
+	WARN_ON_ONCE(ret);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+				     smp_call_func_t func)
+{
+	int cpu;
+	unsigned long flags;
+	unsigned long mask;
+	unsigned long mask_ofl_test;
+	unsigned long mask_ofl_ipi;
+	int ret;
+	struct rcu_node *rnp;
+
+	sync_exp_reset_tree(rsp);
+	rcu_for_each_leaf_node(rsp, rnp) {
+		raw_spin_lock_irqsave(&rnp->lock, flags);
+		smp_mb__after_unlock_lock();
+
+		/* Each pass checks a CPU for identity, offline, and idle. */
+		mask_ofl_test = 0;
+		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
+			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+			struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+			if (raw_smp_processor_id() == cpu ||
+			    !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+				mask_ofl_test |= rdp->grpmask;
+		}
+		mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
+
+		/*
+		 * Need to wait for any blocked tasks as well.  Note that
+		 * additional blocking tasks will also block the expedited
+		 * GP until such time as the ->expmask bits are cleared.
+		 */
+		if (rcu_preempt_has_tasks(rnp))
+			rnp->exp_tasks = rnp->blkd_tasks.next;
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
+		/* IPI the remaining CPUs for expedited quiescent state. */
+		mask = 1;
+		for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+			if (!(mask_ofl_ipi & mask))
+				continue;
+retry_ipi:
+			ret = smp_call_function_single(cpu, func, rsp, 0);
+			if (!ret) {
+				mask_ofl_ipi &= ~mask;
+			} else {
+				/* Failed, raced with offline. */
+				raw_spin_lock_irqsave(&rnp->lock, flags);
+				if (cpu_online(cpu) &&
+				    (rnp->expmask & mask)) {
+					raw_spin_unlock_irqrestore(&rnp->lock,
+								   flags);
+					schedule_timeout_uninterruptible(1);
+					if (cpu_online(cpu) &&
+					    (rnp->expmask & mask))
+						goto retry_ipi;
+					raw_spin_lock_irqsave(&rnp->lock,
+							      flags);
+				}
+				if (!(rnp->expmask & mask))
+					mask_ofl_ipi &= ~mask;
+				raw_spin_unlock_irqrestore(&rnp->lock, flags);
+			}
+		}
+		/* Report quiescent states for those that went offline. */
+		mask_ofl_test |= mask_ofl_ipi;
+		if (mask_ofl_test)
+			rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+	}
 }
 
 static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
@@ -3479,7 +3772,9 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 	int cpu;
 	unsigned long jiffies_stall;
 	unsigned long jiffies_start;
-	struct rcu_data *rdp;
+	unsigned long mask;
+	struct rcu_node *rnp;
+	struct rcu_node *rnp_root = rcu_get_root(rsp);
 	int ret;
 
 	jiffies_stall = rcu_jiffies_till_stall_check();
@@ -3488,33 +3783,43 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 	for (;;) {
 		ret = wait_event_interruptible_timeout(
 				rsp->expedited_wq,
-				!atomic_read(&rsp->expedited_need_qs),
+				sync_rcu_preempt_exp_done(rnp_root),
 				jiffies_stall);
 		if (ret > 0)
 			return;
 		if (ret < 0) {
 			/* Hit a signal, disable CPU stall warnings. */
 			wait_event(rsp->expedited_wq,
-				   !atomic_read(&rsp->expedited_need_qs));
+				   sync_rcu_preempt_exp_done(rnp_root));
 			return;
 		}
-		pr_err("INFO: %s detected expedited stalls on CPUs: {",
+		pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
 		       rsp->name);
-		for_each_online_cpu(cpu) {
-			rdp = per_cpu_ptr(rsp->rda, cpu);
-
-			if (rdp->exp_done)
-				continue;
-			pr_cont(" %d", cpu);
+		rcu_for_each_leaf_node(rsp, rnp) {
+			(void)rcu_print_task_exp_stall(rnp);
+			mask = 1;
+			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+				struct rcu_data *rdp;
+
+				if (!(rnp->expmask & mask))
+					continue;
+				rdp = per_cpu_ptr(rsp->rda, cpu);
+				pr_cont(" %d-%c%c%c", cpu,
+					"O."[cpu_online(cpu)],
+					"o."[!!(rdp->grpmask & rnp->expmaskinit)],
+					"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
+			}
+			mask <<= 1;
 		}
 		pr_cont(" } %lu jiffies s: %lu\n",
 			jiffies - jiffies_start, rsp->expedited_sequence);
-		for_each_online_cpu(cpu) {
-			rdp = per_cpu_ptr(rsp->rda, cpu);
-
-			if (rdp->exp_done)
-				continue;
-			dump_cpu_task(cpu);
+		rcu_for_each_leaf_node(rsp, rnp) {
+			mask = 1;
+			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
+				if (!(rnp->expmask & mask))
+					continue;
+				dump_cpu_task(cpu);
+			}
 		}
 		jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
 	}
@@ -3538,7 +3843,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
  */
 void synchronize_sched_expedited(void)
 {
-	int cpu;
 	unsigned long s;
 	struct rcu_node *rnp;
 	struct rcu_state *rsp = &rcu_sched_state;
@@ -3546,48 +3850,16 @@ void synchronize_sched_expedited(void)
 	/* Take a snapshot of the sequence number. */
 	s = rcu_exp_gp_seq_snap(rsp);
 
-	if (!try_get_online_cpus()) {
-		/* CPU hotplug operation in flight, fall back to normal GP. */
-		wait_rcu_gp(call_rcu_sched);
-		atomic_long_inc(&rsp->expedited_normal);
-		return;
-	}
-	WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
-
 	rnp = exp_funnel_lock(rsp, s);
-	if (rnp == NULL) {
-		put_online_cpus();
+	if (rnp == NULL)
 		return; /* Someone else did our work for us. */
-	}
 
 	rcu_exp_gp_seq_start(rsp);
-
-	/* Stop each CPU that is online, non-idle, and not us. */
-	init_waitqueue_head(&rsp->expedited_wq);
-	atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */
-	for_each_online_cpu(cpu) {
-		struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-		struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
-		rdp->exp_done = false;
-
-		/* Skip our CPU and any idle CPUs. */
-		if (raw_smp_processor_id() == cpu ||
-		    !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
-			continue;
-		atomic_inc(&rsp->expedited_need_qs);
-		stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
-				    rdp, &rdp->exp_stop_work);
-	}
-
-	/* Remove extra count and, if necessary, wait for CPUs to stop. */
-	if (!atomic_dec_and_test(&rsp->expedited_need_qs))
-		synchronize_sched_expedited_wait(rsp);
+	sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
+	synchronize_sched_expedited_wait(rsp);
 
 	rcu_exp_gp_seq_end(rsp);
 	mutex_unlock(&rnp->exp_funnel_mutex);
-
-	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
@@ -3613,11 +3885,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 
 	/* Is the RCU core waiting for a quiescent state from this CPU? */
 	if (rcu_scheduler_fully_active &&
-	    rdp->qs_pending && !rdp->passed_quiesce &&
+	    rdp->core_needs_qs && rdp->cpu_no_qs.b.norm &&
 	    rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) {
-		rdp->n_rp_qs_pending++;
-	} else if (rdp->qs_pending &&
-		   (rdp->passed_quiesce ||
+		rdp->n_rp_core_needs_qs++;
+	} else if (rdp->core_needs_qs &&
+		   (!rdp->cpu_no_qs.b.norm ||
 		    rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) {
 		rdp->n_rp_report_qs++;
 		return 1;
@@ -3875,7 +4147,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
 static void __init
 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
-	static struct lock_class_key rcu_exp_sched_rdp_class;
 	unsigned long flags;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
@@ -3891,10 +4162,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 	mutex_init(&rdp->exp_funnel_mutex);
 	rcu_boot_init_nocb_percpu_data(rdp);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
-	if (rsp == &rcu_sched_state)
-		lockdep_set_class_and_name(&rdp->exp_funnel_mutex,
-					   &rcu_exp_sched_rdp_class,
-					   "rcu_data_exp_sched");
 }
 
 /*
@@ -3913,7 +4180,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	rdp->beenonline = 1;	 /* We have now been online. */
 	rdp->qlen_last_fqs_check = 0;
 	rdp->n_force_qs_snap = rsp->n_force_qs;
 	rdp->blimit = blimit;
@@ -3935,11 +4201,15 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	raw_spin_lock(&rnp->lock);	/* irqs already disabled. */
 	smp_mb__after_unlock_lock();
 	rnp->qsmaskinitnext |= mask;
+	rnp->expmaskinitnext |= mask;
+	if (!rdp->beenonline)
+		WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
+	rdp->beenonline = true;	 /* We have now been online. */
 	rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
 	rdp->completed = rnp->completed;
-	rdp->passed_quiesce = false;
+	rdp->cpu_no_qs.b.norm = true;
 	rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
-	rdp->qs_pending = false;
+	rdp->core_needs_qs = false;
 	trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
@@ -3972,6 +4242,7 @@ int rcu_cpu_notify(struct notifier_block *self,
 		break;
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
+		sync_sched_exp_online_cleanup(cpu);
 		rcu_boost_kthread_setaffinity(rnp, -1);
 		break;
 	case CPU_DOWN_PREPARE:
@@ -3983,6 +4254,12 @@ int rcu_cpu_notify(struct notifier_block *self,
 			rcu_cleanup_dying_cpu(rsp);
 		break;
 	case CPU_DYING_IDLE:
+		/* QS for any half-done expedited RCU-sched GP. */
+		preempt_disable();
+		rcu_report_exp_rdp(&rcu_sched_state,
+				   this_cpu_ptr(rcu_sched_state.rda), true);
+		preempt_enable();
+
 		for_each_rcu_flavor(rsp) {
 			rcu_cleanup_dying_idle_cpu(cpu, rsp);
 		}
@@ -4114,7 +4391,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 	static const char * const buf[] = RCU_NODE_NAME_INIT;
 	static const char * const fqs[] = RCU_FQS_NAME_INIT;
 	static const char * const exp[] = RCU_EXP_NAME_INIT;
-	static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT;
 	static u8 fl_mask = 0x1;
 
 	int levelcnt[RCU_NUM_LVLS];	/* # nodes in each level. */
@@ -4174,18 +4450,13 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 			INIT_LIST_HEAD(&rnp->blkd_tasks);
 			rcu_init_one_nocb(rnp);
 			mutex_init(&rnp->exp_funnel_mutex);
-			if (rsp == &rcu_sched_state)
-				lockdep_set_class_and_name(
-					&rnp->exp_funnel_mutex,
-					&rcu_exp_sched_class[i], exp_sched[i]);
-			else
-				lockdep_set_class_and_name(
-					&rnp->exp_funnel_mutex,
-					&rcu_exp_class[i], exp[i]);
+			lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
+						   &rcu_exp_class[i], exp[i]);
 		}
 	}
 
 	init_waitqueue_head(&rsp->gp_wq);
+	init_waitqueue_head(&rsp->expedited_wq);
 	rnp = rsp->level[rcu_num_lvls - 1];
 	for_each_possible_cpu(i) {
 		while (i > rnp->grphi)