diff options
author | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2012-06-18 21:36:08 -0400 |
---|---|---|
committer | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2012-09-23 10:41:52 -0400 |
commit | b3dbec76e5334fbb063987dea14e7b255602d7e4 (patch) | |
tree | 5b75e737856baf09b2303bebf7d9893b8279c84e | |
parent | a10d206ef1a83121ab7430cb196e0376a7145b22 (diff) |
rcu: Move RCU grace-period initialization into a kthread
As the first step towards allowing grace-period initialization to be
preemptible, this commit moves the RCU grace-period initialization
into its own kthread. This is needed to keep large-system scheduling
latency at reasonable levels.
Also change raw_spin_lock_irqsave() to raw_spin_lock_irq() as suggested
by Peter Zijlstra in review comments.
Reported-by: Mike Galbraith <mgalbraith@suse.de>
Reported-by: Dimitri Sivanich <sivanich@sgi.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
-rw-r--r-- | kernel/rcutree.c | 190 | ||||
-rw-r--r-- | kernel/rcutree.h | 3 |
2 files changed, 129 insertions, 64 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f7bcd9e6c054..4792f1642bf2 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1042,6 +1042,102 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat | |||
1042 | } | 1042 | } |
1043 | 1043 | ||
1044 | /* | 1044 | /* |
1045 | * Body of kthread that handles grace periods. | ||
1046 | */ | ||
1047 | static int rcu_gp_kthread(void *arg) | ||
1048 | { | ||
1049 | struct rcu_data *rdp; | ||
1050 | struct rcu_node *rnp; | ||
1051 | struct rcu_state *rsp = arg; | ||
1052 | |||
1053 | for (;;) { | ||
1054 | |||
1055 | /* Handle grace-period start. */ | ||
1056 | rnp = rcu_get_root(rsp); | ||
1057 | for (;;) { | ||
1058 | wait_event_interruptible(rsp->gp_wq, rsp->gp_flags); | ||
1059 | if (rsp->gp_flags) | ||
1060 | break; | ||
1061 | flush_signals(current); | ||
1062 | } | ||
1063 | raw_spin_lock_irq(&rnp->lock); | ||
1064 | rsp->gp_flags = 0; | ||
1065 | rdp = this_cpu_ptr(rsp->rda); | ||
1066 | |||
1067 | if (rcu_gp_in_progress(rsp)) { | ||
1068 | /* | ||
1069 | * A grace period is already in progress, so | ||
1070 | * don't start another one. | ||
1071 | */ | ||
1072 | raw_spin_unlock_irq(&rnp->lock); | ||
1073 | continue; | ||
1074 | } | ||
1075 | |||
1076 | if (rsp->fqs_active) { | ||
1077 | /* | ||
1078 | * We need a grace period, but force_quiescent_state() | ||
1079 | * is running. Tell it to start one on our behalf. | ||
1080 | */ | ||
1081 | rsp->fqs_need_gp = 1; | ||
1082 | raw_spin_unlock_irq(&rnp->lock); | ||
1083 | continue; | ||
1084 | } | ||
1085 | |||
1086 | /* Advance to a new grace period and initialize state. */ | ||
1087 | rsp->gpnum++; | ||
1088 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); | ||
1089 | WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); | ||
1090 | rsp->fqs_state = RCU_GP_INIT; /* Stop force_quiescent_state. */ | ||
1091 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | ||
1092 | record_gp_stall_check_time(rsp); | ||
1093 | raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ | ||
1094 | |||
1095 | /* Exclude any concurrent CPU-hotplug operations. */ | ||
1096 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ | ||
1097 | |||
1098 | /* | ||
1099 | * Set the quiescent-state-needed bits in all the rcu_node | ||
1100 | * structures for all currently online CPUs in breadth-first | ||
1101 | * order, starting from the root rcu_node structure. | ||
1102 | * This operation relies on the layout of the hierarchy | ||
1103 | * within the rsp->node[] array. Note that other CPUs will | ||
1104 | * access only the leaves of the hierarchy, which still | ||
1105 | * indicate that no grace period is in progress, at least | ||
1106 | * until the corresponding leaf node has been initialized. | ||
1107 | * In addition, we have excluded CPU-hotplug operations. | ||
1108 | * | ||
1109 | * Note that the grace period cannot complete until | ||
1110 | * we finish the initialization process, as there will | ||
1111 | * be at least one qsmask bit set in the root node until | ||
1112 | * that time, namely the one corresponding to this CPU, | ||
1113 | * due to the fact that we have irqs disabled. | ||
1114 | */ | ||
1115 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
1116 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
1117 | rcu_preempt_check_blocked_tasks(rnp); | ||
1118 | rnp->qsmask = rnp->qsmaskinit; | ||
1119 | rnp->gpnum = rsp->gpnum; | ||
1120 | rnp->completed = rsp->completed; | ||
1121 | if (rnp == rdp->mynode) | ||
1122 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | ||
1123 | rcu_preempt_boost_start_gp(rnp); | ||
1124 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | ||
1125 | rnp->level, rnp->grplo, | ||
1126 | rnp->grphi, rnp->qsmask); | ||
1127 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
1128 | } | ||
1129 | |||
1130 | rnp = rcu_get_root(rsp); | ||
1131 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
1132 | /* force_quiescent_state() now OK. */ | ||
1133 | rsp->fqs_state = RCU_SIGNAL_INIT; | ||
1134 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
1135 | raw_spin_unlock_irq(&rsp->onofflock); | ||
1136 | } | ||
1137 | return 0; | ||
1138 | } | ||
1139 | |||
1140 | /* | ||
1045 | * Start a new RCU grace period if warranted, re-initializing the hierarchy | 1141 | * Start a new RCU grace period if warranted, re-initializing the hierarchy |
1046 | * in preparation for detecting the next grace period. The caller must hold | 1142 | * in preparation for detecting the next grace period. The caller must hold |
1047 | * the root node's ->lock, which is released before return. Hard irqs must | 1143 | * the root node's ->lock, which is released before return. Hard irqs must |
@@ -1058,77 +1154,20 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
1058 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 1154 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
1059 | struct rcu_node *rnp = rcu_get_root(rsp); | 1155 | struct rcu_node *rnp = rcu_get_root(rsp); |
1060 | 1156 | ||
1061 | if (!rcu_scheduler_fully_active || | 1157 | if (!rsp->gp_kthread || |
1062 | !cpu_needs_another_gp(rsp, rdp)) { | 1158 | !cpu_needs_another_gp(rsp, rdp)) { |
1063 | /* | 1159 | /* |
1064 | * Either the scheduler hasn't yet spawned the first | 1160 | * Either we have not yet spawned the grace-period |
1065 | * non-idle task or this CPU does not need another | 1161 | * task or this CPU does not need another grace period. |
1066 | * grace period. Either way, don't start a new grace | 1162 | * Either way, don't start a new grace period. |
1067 | * period. | ||
1068 | */ | 1163 | */ |
1069 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1164 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1070 | return; | 1165 | return; |
1071 | } | 1166 | } |
1072 | 1167 | ||
1073 | if (rsp->fqs_active) { | 1168 | rsp->gp_flags = 1; |
1074 | /* | 1169 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1075 | * This CPU needs a grace period, but force_quiescent_state() | 1170 | wake_up(&rsp->gp_wq); |
1076 | * is running. Tell it to start one on this CPU's behalf. | ||
1077 | */ | ||
1078 | rsp->fqs_need_gp = 1; | ||
1079 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1080 | return; | ||
1081 | } | ||
1082 | |||
1083 | /* Advance to a new grace period and initialize state. */ | ||
1084 | rsp->gpnum++; | ||
1085 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); | ||
1086 | WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); | ||
1087 | rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ | ||
1088 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | ||
1089 | record_gp_stall_check_time(rsp); | ||
1090 | raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ | ||
1091 | |||
1092 | /* Exclude any concurrent CPU-hotplug operations. */ | ||
1093 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ | ||
1094 | |||
1095 | /* | ||
1096 | * Set the quiescent-state-needed bits in all the rcu_node | ||
1097 | * structures for all currently online CPUs in breadth-first | ||
1098 | * order, starting from the root rcu_node structure. This | ||
1099 | * operation relies on the layout of the hierarchy within the | ||
1100 | * rsp->node[] array. Note that other CPUs will access only | ||
1101 | * the leaves of the hierarchy, which still indicate that no | ||
1102 | * grace period is in progress, at least until the corresponding | ||
1103 | * leaf node has been initialized. In addition, we have excluded | ||
1104 | * CPU-hotplug operations. | ||
1105 | * | ||
1106 | * Note that the grace period cannot complete until we finish | ||
1107 | * the initialization process, as there will be at least one | ||
1108 | * qsmask bit set in the root node until that time, namely the | ||
1109 | * one corresponding to this CPU, due to the fact that we have | ||
1110 | * irqs disabled. | ||
1111 | */ | ||
1112 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
1113 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
1114 | rcu_preempt_check_blocked_tasks(rnp); | ||
1115 | rnp->qsmask = rnp->qsmaskinit; | ||
1116 | rnp->gpnum = rsp->gpnum; | ||
1117 | rnp->completed = rsp->completed; | ||
1118 | if (rnp == rdp->mynode) | ||
1119 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | ||
1120 | rcu_preempt_boost_start_gp(rnp); | ||
1121 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | ||
1122 | rnp->level, rnp->grplo, | ||
1123 | rnp->grphi, rnp->qsmask); | ||
1124 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
1125 | } | ||
1126 | |||
1127 | rnp = rcu_get_root(rsp); | ||
1128 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
1129 | rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ | ||
1130 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
1131 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
1132 | } | 1171 | } |
1133 | 1172 | ||
1134 | /* | 1173 | /* |
@@ -2629,6 +2668,28 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
2629 | } | 2668 | } |
2630 | 2669 | ||
2631 | /* | 2670 | /* |
2671 | * Spawn the kthread that handles this RCU flavor's grace periods. | ||
2672 | */ | ||
2673 | static int __init rcu_spawn_gp_kthread(void) | ||
2674 | { | ||
2675 | unsigned long flags; | ||
2676 | struct rcu_node *rnp; | ||
2677 | struct rcu_state *rsp; | ||
2678 | struct task_struct *t; | ||
2679 | |||
2680 | for_each_rcu_flavor(rsp) { | ||
2681 | t = kthread_run(rcu_gp_kthread, rsp, rsp->name); | ||
2682 | BUG_ON(IS_ERR(t)); | ||
2683 | rnp = rcu_get_root(rsp); | ||
2684 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
2685 | rsp->gp_kthread = t; | ||
2686 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
2687 | } | ||
2688 | return 0; | ||
2689 | } | ||
2690 | early_initcall(rcu_spawn_gp_kthread); | ||
2691 | |||
2692 | /* | ||
2632 | * This function is invoked towards the end of the scheduler's initialization | 2693 | * This function is invoked towards the end of the scheduler's initialization |
2633 | * process. Before this is called, the idle task might contain | 2694 | * process. Before this is called, the idle task might contain |
2634 | * RCU read-side critical sections (during which time, this idle | 2695 | * RCU read-side critical sections (during which time, this idle |
@@ -2729,6 +2790,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
2729 | } | 2790 | } |
2730 | 2791 | ||
2731 | rsp->rda = rda; | 2792 | rsp->rda = rda; |
2793 | init_waitqueue_head(&rsp->gp_wq); | ||
2732 | rnp = rsp->level[rcu_num_lvls - 1]; | 2794 | rnp = rsp->level[rcu_num_lvls - 1]; |
2733 | for_each_possible_cpu(i) { | 2795 | for_each_possible_cpu(i) { |
2734 | while (i > rnp->grphi) | 2796 | while (i > rnp->grphi) |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4d29169f2124..117a15019e99 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -385,6 +385,9 @@ struct rcu_state { | |||
385 | u8 boost; /* Subject to priority boost. */ | 385 | u8 boost; /* Subject to priority boost. */ |
386 | unsigned long gpnum; /* Current gp number. */ | 386 | unsigned long gpnum; /* Current gp number. */ |
387 | unsigned long completed; /* # of last completed gp. */ | 387 | unsigned long completed; /* # of last completed gp. */ |
388 | struct task_struct *gp_kthread; /* Task for grace periods. */ | ||
389 | wait_queue_head_t gp_wq; /* Where GP task waits. */ | ||
390 | int gp_flags; /* Commands for GP task. */ | ||
388 | 391 | ||
389 | /* End of fields guarded by root rcu_node's lock. */ | 392 | /* End of fields guarded by root rcu_node's lock. */ |
390 | 393 | ||