author    Paul E. McKenney <paulmck@linux.vnet.ibm.com>    2018-02-02 01:05:38 -0500
committer Paul E. McKenney <paulmck@linux.vnet.ibm.com>    2018-05-15 13:25:44 -0400
commit    25f3d7effab632eb10d145f1a5aebf6515a04b98 (patch)
tree      8f088c805ed43c898543f1e74d21e808605143fc
parent    60cc43fc888428bb2f18f08997432d426a243338 (diff)
rcu: Parallelize expedited grace-period initialization
The latency of RCU expedited grace periods grows with increasing numbers of
CPUs, eventually failing to be all that expedited.  Much of the growth in
latency is in the initialization phase, so this commit uses workqueues to
carry out this initialization concurrently on a rcu_node-by-rcu_node basis.

This change makes use of a new rcu_par_gp_wq because flushing a work item
from another work item running from the same workqueue can result in
deadlock.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Nicholas Piggin <npiggin@gmail.com>
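Editor's note: for readers skimming the diff below, the heart of the change is a fan-out/flush pattern: per-node initialization is queued as one work item per leaf rcu_node on a dedicated workqueue, then each queued item is flushed. The following is a minimal standalone sketch of that pattern, not the patch's code; "my_node", "my_node_init", "init_all_nodes", and "init_wq" are hypothetical names, while the workqueue calls themselves (alloc_workqueue(), INIT_WORK(), queue_work_on(), flush_work(), container_of()) are the same ones the patch uses. The dedicated queue matters because flushing a work item from a work item on the same workqueue can deadlock, which is why the patch introduces rcu_par_gp_wq.

/*
 * Illustrative sketch only -- not part of the patch.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *init_wq;	/* analogous to rcu_par_gp_wq */

struct my_node {
	struct work_struct work;	/* per-node work item */
	bool need_flush;		/* was work queued for this node? */
	int cpu;			/* preferred CPU for this node's work */
};

/* Runs on init_wq, potentially on many CPUs concurrently. */
static void my_node_init(struct work_struct *wp)
{
	struct my_node *np = container_of(wp, struct my_node, work);

	/* ... per-node initialization would go here ... */
	(void)np;
}

static void init_all_nodes(struct my_node *nodes, int n)
{
	int i;

	/* Fan the initialization out, one work item per node. */
	for (i = 0; i < n; i++) {
		INIT_WORK(&nodes[i].work, my_node_init);
		queue_work_on(nodes[i].cpu, init_wq, &nodes[i].work);
		nodes[i].need_flush = true;
	}

	/* Wait for every queued work item to finish. */
	for (i = 0; i < n; i++)
		if (nodes[i].need_flush)
			flush_work(&nodes[i].work);
}

static int __init my_init(void)
{
	/* Separate queue so flushing from another workqueue cannot deadlock. */
	init_wq = alloc_workqueue("init_wq", WQ_MEM_RECLAIM, 0);
	return init_wq ? 0 : -ENOMEM;
}

The patch does the same thing per leaf rcu_node in sync_rcu_exp_select_cpus(), with an additional fallback to a direct function call when the workqueue does not yet exist during early boot.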
-rw-r--r--  kernel/rcu/rcu.h      |   1
-rw-r--r--  kernel/rcu/tree.c     |   3
-rw-r--r--  kernel/rcu/tree.h     |  10
-rw-r--r--  kernel/rcu/tree_exp.h | 184
4 files changed, 120 insertions(+), 78 deletions(-)
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 7a693e31184a..976019d6fa06 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -486,6 +486,7 @@ void rcu_force_quiescent_state(void);
 void rcu_bh_force_quiescent_state(void);
 void rcu_sched_force_quiescent_state(void);
 extern struct workqueue_struct *rcu_gp_wq;
+extern struct workqueue_struct *rcu_par_gp_wq;
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 #ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2a734692a581..23781fc90830 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4168,6 +4168,7 @@ static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp)
 }
 
 struct workqueue_struct *rcu_gp_wq;
+struct workqueue_struct *rcu_par_gp_wq;
 
 void __init rcu_init(void)
 {
@@ -4199,6 +4200,8 @@ void __init rcu_init(void)
 	/* Create workqueue for expedited GPs and for Tree SRCU. */
 	rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
 	WARN_ON(!rcu_gp_wq);
+	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
+	WARN_ON(!rcu_par_gp_wq);
 }
 
 #include "tree_exp.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index f491ab4f2e8e..98d33902b65c 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -58,6 +58,14 @@ struct rcu_dynticks {
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 };
 
+/* Communicate arguments to a workqueue handler. */
+struct rcu_exp_work {
+	smp_call_func_t rew_func;
+	struct rcu_state *rew_rsp;
+	unsigned long rew_s;
+	struct work_struct rew_work;
+};
+
 /* RCU's kthread states for tracing. */
 #define RCU_KTHREAD_STOPPED 0
 #define RCU_KTHREAD_RUNNING 1
@@ -157,6 +165,8 @@ struct rcu_node {
 	spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
 	unsigned long exp_seq_rq;
 	wait_queue_head_t exp_wq[4];
+	struct rcu_exp_work rew;
+	bool exp_need_flush;	/* Need to flush workitem? */
 } ____cacheline_internodealigned_in_smp;
 
 /*
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index f72eefab8543..73e1d3dca5b1 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -362,93 +362,129 @@ static void sync_sched_exp_online_cleanup(int cpu)
 }
 
 /*
- * Select the nodes that the upcoming expedited grace period needs
- * to wait for.
+ * Select the CPUs within the specified rcu_node that the upcoming
+ * expedited grace period needs to wait for.
  */
-static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
-				     smp_call_func_t func)
+static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
 {
 	int cpu;
 	unsigned long flags;
+	smp_call_func_t func;
 	unsigned long mask_ofl_test;
 	unsigned long mask_ofl_ipi;
 	int ret;
-	struct rcu_node *rnp;
+	struct rcu_exp_work *rewp =
+		container_of(wp, struct rcu_exp_work, rew_work);
+	struct rcu_node *rnp = container_of(rewp, struct rcu_node, rew);
+	struct rcu_state *rsp = rewp->rew_rsp;
 
-	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
-	sync_exp_reset_tree(rsp);
-	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
-	rcu_for_each_leaf_node(rsp, rnp) {
-		raw_spin_lock_irqsave_rcu_node(rnp, flags);
-
-		/* Each pass checks a CPU for identity, offline, and idle. */
-		mask_ofl_test = 0;
-		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
-			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-			struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
-			int snap;
+	func = rewp->rew_func;
+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 
-			if (raw_smp_processor_id() == cpu ||
-			    !(rnp->qsmaskinitnext & mask)) {
+	/* Each pass checks a CPU for identity, offline, and idle. */
+	mask_ofl_test = 0;
+	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+		struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+		struct rcu_dynticks *rdtp = per_cpu_ptr(&rcu_dynticks, cpu);
+		int snap;
+
+		if (raw_smp_processor_id() == cpu ||
+		    !(rnp->qsmaskinitnext & mask)) {
+			mask_ofl_test |= mask;
+		} else {
+			snap = rcu_dynticks_snap(rdtp);
+			if (rcu_dynticks_in_eqs(snap))
 				mask_ofl_test |= mask;
-			} else {
-				snap = rcu_dynticks_snap(rdtp);
-				if (rcu_dynticks_in_eqs(snap))
-					mask_ofl_test |= mask;
-				else
-					rdp->exp_dynticks_snap = snap;
-			}
+			else
+				rdp->exp_dynticks_snap = snap;
 		}
-		mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
-
-		/*
-		 * Need to wait for any blocked tasks as well.  Note that
-		 * additional blocking tasks will also block the expedited
-		 * GP until such time as the ->expmask bits are cleared.
-		 */
-		if (rcu_preempt_has_tasks(rnp))
-			rnp->exp_tasks = rnp->blkd_tasks.next;
-		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	}
+	mask_ofl_ipi = rnp->expmask & ~mask_ofl_test;
 
-		/* IPI the remaining CPUs for expedited quiescent state. */
-		for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
-			unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
-			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+	/*
+	 * Need to wait for any blocked tasks as well.  Note that
+	 * additional blocking tasks will also block the expedited GP
+	 * until such time as the ->expmask bits are cleared.
+	 */
+	if (rcu_preempt_has_tasks(rnp))
+		rnp->exp_tasks = rnp->blkd_tasks.next;
+	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
+	/* IPI the remaining CPUs for expedited quiescent state. */
+	for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+		unsigned long mask = leaf_node_cpu_bit(rnp, cpu);
+		struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 
-			if (!(mask_ofl_ipi & mask))
-				continue;
+		if (!(mask_ofl_ipi & mask))
+			continue;
 retry_ipi:
-			if (rcu_dynticks_in_eqs_since(rdp->dynticks,
-						      rdp->exp_dynticks_snap)) {
-				mask_ofl_test |= mask;
-				continue;
-			}
-			ret = smp_call_function_single(cpu, func, rsp, 0);
-			if (!ret) {
-				mask_ofl_ipi &= ~mask;
-				continue;
-			}
-			/* Failed, raced with CPU hotplug operation. */
-			raw_spin_lock_irqsave_rcu_node(rnp, flags);
-			if ((rnp->qsmaskinitnext & mask) &&
-			    (rnp->expmask & mask)) {
-				/* Online, so delay for a bit and try again. */
+		if (rcu_dynticks_in_eqs_since(rdp->dynticks,
+					      rdp->exp_dynticks_snap)) {
+			mask_ofl_test |= mask;
+			continue;
+		}
+		ret = smp_call_function_single(cpu, func, rsp, 0);
+		if (!ret) {
+			mask_ofl_ipi &= ~mask;
+			continue;
+		}
+		/* Failed, raced with CPU hotplug operation. */
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+		if ((rnp->qsmaskinitnext & mask) &&
+		    (rnp->expmask & mask)) {
+			/* Online, so delay for a bit and try again. */
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+			trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("selectofl"));
+			schedule_timeout_uninterruptible(1);
+			goto retry_ipi;
+		}
+		/* CPU really is offline, so we can ignore it. */
+		if (!(rnp->expmask & mask))
+			mask_ofl_ipi &= ~mask;
+		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	}
+	/* Report quiescent states for those that went offline. */
+	mask_ofl_test |= mask_ofl_ipi;
+	if (mask_ofl_test)
+		rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+}
+
+/*
+ * Select the nodes that the upcoming expedited grace period needs
+ * to wait for.
+ */
+static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
+				     smp_call_func_t func)
+{
+	struct rcu_node *rnp;
+
+	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("reset"));
+	sync_exp_reset_tree(rsp);
+	trace_rcu_exp_grace_period(rsp->name, rcu_exp_gp_seq_endval(rsp), TPS("select"));
+
+	/* Schedule work for each leaf rcu_node structure. */
+	rcu_for_each_leaf_node(rsp, rnp) {
+		rnp->exp_need_flush = false;
+		if (!READ_ONCE(rnp->expmask))
+			continue; /* Avoid early boot non-existent wq. */
+		rnp->rew.rew_func = func;
+		rnp->rew.rew_rsp = rsp;
+		if (!READ_ONCE(rcu_par_gp_wq) ||
+		    rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+			/* No workqueues yet. */
+			sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
+			continue;
 		}
-		/* Report quiescent states for those that went offline. */
-		mask_ofl_test |= mask_ofl_ipi;
-		if (mask_ofl_test)
-			rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false);
+		INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+		queue_work_on(rnp->grplo, rcu_par_gp_wq, &rnp->rew.rew_work);
+		rnp->exp_need_flush = true;
 	}
+
+	/* Wait for workqueue jobs (if any) to complete. */
+	rcu_for_each_leaf_node(rsp, rnp)
+		if (rnp->exp_need_flush)
+			flush_work(&rnp->rew.rew_work);
 }
 
 static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
@@ -560,14 +596,6 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
 	mutex_unlock(&rsp->exp_wake_mutex);
 }
 
-/* Let the workqueue handler know what it is supposed to do. */
-struct rcu_exp_work {
-	smp_call_func_t rew_func;
-	struct rcu_state *rew_rsp;
-	unsigned long rew_s;
-	struct work_struct rew_work;
-};
-
 /*
  * Common code to drive an expedited grace period forward, used by
  * workqueues and mid-boot-time tasks.