diff options
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r-- | kernel/rcu/tree.c | 140 |
1 file changed, 112 insertions, 28 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f1ba77363fbb..625d0b0cd75a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu) | |||
206 | rdp->passed_quiesce = 1; | 206 | rdp->passed_quiesce = 1; |
207 | } | 207 | } |
208 | 208 | ||
209 | static DEFINE_PER_CPU(int, rcu_sched_qs_mask); | ||
210 | |||
211 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | ||
212 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | ||
213 | .dynticks = ATOMIC_INIT(1), | ||
214 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
215 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | ||
216 | .dynticks_idle = ATOMIC_INIT(1), | ||
217 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
218 | }; | ||
219 | |||
220 | /* | ||
221 | * Let the RCU core know that this CPU has gone through the scheduler, | ||
222 | * which is a quiescent state. This is called when the need for a | ||
223 | * quiescent state is urgent, so we burn an atomic operation and full | ||
224 | * memory barriers to let the RCU core know about it, regardless of what | ||
225 | * this CPU might (or might not) do in the near future. | ||
226 | * | ||
227 | * We inform the RCU core by emulating a zero-duration dyntick-idle | ||
228 | * period, which we in turn do by incrementing the ->dynticks counter | ||
229 | * by two. | ||
230 | */ | ||
231 | static void rcu_momentary_dyntick_idle(void) | ||
232 | { | ||
233 | unsigned long flags; | ||
234 | struct rcu_data *rdp; | ||
235 | struct rcu_dynticks *rdtp; | ||
236 | int resched_mask; | ||
237 | struct rcu_state *rsp; | ||
238 | |||
239 | local_irq_save(flags); | ||
240 | |||
241 | /* | ||
242 | * Yes, we can lose flag-setting operations. This is OK, because | ||
243 | * the flag will be set again after some delay. | ||
244 | */ | ||
245 | resched_mask = raw_cpu_read(rcu_sched_qs_mask); | ||
246 | raw_cpu_write(rcu_sched_qs_mask, 0); | ||
247 | |||
248 | /* Find the flavor that needs a quiescent state. */ | ||
249 | for_each_rcu_flavor(rsp) { | ||
250 | rdp = raw_cpu_ptr(rsp->rda); | ||
251 | if (!(resched_mask & rsp->flavor_mask)) | ||
252 | continue; | ||
253 | smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ | ||
254 | if (ACCESS_ONCE(rdp->mynode->completed) != | ||
255 | ACCESS_ONCE(rdp->cond_resched_completed)) | ||
256 | continue; | ||
257 | |||
258 | /* | ||
259 | * Pretend to be momentarily idle for the quiescent state. | ||
260 | * This allows the grace-period kthread to record the | ||
261 | * quiescent state, with no need for this CPU to do anything | ||
262 | * further. | ||
263 | */ | ||
264 | rdtp = this_cpu_ptr(&rcu_dynticks); | ||
265 | smp_mb__before_atomic(); /* Earlier stuff before QS. */ | ||
266 | atomic_add(2, &rdtp->dynticks); /* QS. */ | ||
267 | smp_mb__after_atomic(); /* Later stuff after QS. */ | ||
268 | break; | ||
269 | } | ||
270 | local_irq_restore(flags); | ||
271 | } | ||
272 | |||
209 | /* | 273 | /* |
210 | * Note a context switch. This is a quiescent state for RCU-sched, | 274 | * Note a context switch. This is a quiescent state for RCU-sched, |
211 | * and requires special handling for preemptible RCU. | 275 | * and requires special handling for preemptible RCU. |
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu) | |||
216 | trace_rcu_utilization(TPS("Start context switch")); | 280 | trace_rcu_utilization(TPS("Start context switch")); |
217 | rcu_sched_qs(cpu); | 281 | rcu_sched_qs(cpu); |
218 | rcu_preempt_note_context_switch(cpu); | 282 | rcu_preempt_note_context_switch(cpu); |
283 | if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) | ||
284 | rcu_momentary_dyntick_idle(); | ||
219 | trace_rcu_utilization(TPS("End context switch")); | 285 | trace_rcu_utilization(TPS("End context switch")); |
220 | } | 286 | } |
221 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | 287 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); |
222 | 288 | ||
223 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | ||
224 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | ||
225 | .dynticks = ATOMIC_INIT(1), | ||
226 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
227 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | ||
228 | .dynticks_idle = ATOMIC_INIT(1), | ||
229 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
230 | }; | ||
231 | |||
232 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 289 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
233 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ | 290 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ |
234 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ | 291 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ |
@@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX; | |||
243 | module_param(jiffies_till_first_fqs, ulong, 0644); | 300 | module_param(jiffies_till_first_fqs, ulong, 0644); |
244 | module_param(jiffies_till_next_fqs, ulong, 0644); | 301 | module_param(jiffies_till_next_fqs, ulong, 0644); |
245 | 302 | ||
303 | /* | ||
304 | * How long the grace period must be before we start recruiting | ||
305 | * quiescent-state help from rcu_note_context_switch(). | ||
306 | */ | ||
307 | static ulong jiffies_till_sched_qs = HZ / 20; | ||
308 | module_param(jiffies_till_sched_qs, ulong, 0644); | ||
309 | |||
246 | static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | 310 | static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, |
247 | struct rcu_data *rdp); | 311 | struct rcu_data *rdp); |
248 | static void force_qs_rnp(struct rcu_state *rsp, | 312 | static void force_qs_rnp(struct rcu_state *rsp, |
@@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
853 | bool *isidle, unsigned long *maxj) | 917 | bool *isidle, unsigned long *maxj) |
854 | { | 918 | { |
855 | unsigned int curr; | 919 | unsigned int curr; |
920 | int *rcrmp; | ||
856 | unsigned int snap; | 921 | unsigned int snap; |
857 | 922 | ||
858 | curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); | 923 | curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); |
@@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | |||
893 | } | 958 | } |
894 | 959 | ||
895 | /* | 960 | /* |
896 | * There is a possibility that a CPU in adaptive-ticks state | 961 | * A CPU running for an extended time within the kernel can |
897 | * might run in the kernel with the scheduling-clock tick disabled | 962 | * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, |
898 | * for an extended time period. Invoke rcu_kick_nohz_cpu() to | 963 | * even context-switching back and forth between a pair of |
899 | * force the CPU to restart the scheduling-clock tick in this | 964 | * in-kernel CPU-bound tasks cannot advance grace periods. |
900 | * CPU is in this state. | 965 | * So if the grace period is old enough, make the CPU pay attention. |
901 | */ | 966 | * Note that the unsynchronized assignments to the per-CPU |
902 | rcu_kick_nohz_cpu(rdp->cpu); | 967 | * rcu_sched_qs_mask variable are safe. Yes, setting of |
903 | 968 | * bits can be lost, but they will be set again on the next | |
904 | /* | 969 | * force-quiescent-state pass. So lost bit sets do not result |
905 | * Alternatively, the CPU might be running in the kernel | 970 | * in incorrect behavior, merely in a grace period lasting |
906 | * for an extended period of time without a quiescent state. | 971 | * a few jiffies longer than it might otherwise. Because |
907 | * Attempt to force the CPU through the scheduler to gain the | 972 | * there are at most four threads involved, and because the |
908 | * needed quiescent state, but only if the grace period has gone | 973 | * updates are only once every few jiffies, the probability of |
909 | * on for an uncommonly long time. If there are many stuck CPUs, | 974 | * lossage (and thus of slight grace-period extension) is |
910 | * we will beat on the first one until it gets unstuck, then move | 975 | * quite low. |
911 | * to the next. Only do this for the primary flavor of RCU. | 976 | * |
977 | * Note that if the jiffies_till_sched_qs boot/sysfs parameter | ||
978 | * is set too high, we override with half of the RCU CPU stall | ||
979 | * warning delay. | ||
912 | */ | 980 | */ |
913 | if (rdp->rsp == rcu_state_p && | 981 | rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); |
982 | if (ULONG_CMP_GE(jiffies, | ||
983 | rdp->rsp->gp_start + jiffies_till_sched_qs) || | ||
914 | ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { | 984 | ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { |
915 | rdp->rsp->jiffies_resched += 5; | 985 | if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { |
916 | resched_cpu(rdp->cpu); | 986 | ACCESS_ONCE(rdp->cond_resched_completed) = |
987 | ACCESS_ONCE(rdp->mynode->completed); | ||
988 | smp_mb(); /* ->cond_resched_completed before *rcrmp. */ | ||
989 | ACCESS_ONCE(*rcrmp) = | ||
990 | ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask; | ||
991 | resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ | ||
992 | rdp->rsp->jiffies_resched += 5; /* Enable beating. */ | ||
993 | } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { | ||
994 | /* Time to beat on that CPU again! */ | ||
995 | resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ | ||
996 | rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ | ||
997 | } | ||
917 | } | 998 | } |
918 | 999 | ||
919 | return 0; | 1000 | return 0; |
@@ -3491,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
3491 | "rcu_node_fqs_1", | 3572 | "rcu_node_fqs_1", |
3492 | "rcu_node_fqs_2", | 3573 | "rcu_node_fqs_2", |
3493 | "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ | 3574 | "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ |
3575 | static u8 fl_mask = 0x1; | ||
3494 | int cpustride = 1; | 3576 | int cpustride = 1; |
3495 | int i; | 3577 | int i; |
3496 | int j; | 3578 | int j; |
@@ -3509,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
3509 | for (i = 1; i < rcu_num_lvls; i++) | 3591 | for (i = 1; i < rcu_num_lvls; i++) |
3510 | rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; | 3592 | rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; |
3511 | rcu_init_levelspread(rsp); | 3593 | rcu_init_levelspread(rsp); |
3594 | rsp->flavor_mask = fl_mask; | ||
3595 | fl_mask <<= 1; | ||
3512 | 3596 | ||
3513 | /* Initialize the elements themselves, starting from the leaves. */ | 3597 | /* Initialize the elements themselves, starting from the leaves. */ |
3514 | 3598 | ||