author | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2013-10-08 23:23:47 -0400
committer | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2013-10-15 15:53:31 -0400
commit | 4102adab9189c8ea2f0cdd2f88345fd25d2790f1
tree | 235964cfd9c09a5c642a2d0d8745a651a0d4bcfa /kernel/rcu/tree_plugin.h
parent | 252997330908cb8ee3d5714539ed967b977c2eae
rcu: Move RCU-related source code to kernel/rcu directory
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/rcu/tree_plugin.h')
-rw-r--r-- | kernel/rcu/tree_plugin.h | 2831
1 file changed, 2831 insertions, 0 deletions
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
new file mode 100644
index 000000000000..3822ac0c4b27
--- /dev/null
+++ b/kernel/rcu/tree_plugin.h
@@ -0,0 +1,2831 @@
1 | /* | ||
2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) | ||
3 | * Internal non-public definitions that provide either classic | ||
4 | * or preemptible semantics. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
19 | * | ||
20 | * Copyright Red Hat, 2009 | ||
21 | * Copyright IBM Corporation, 2009 | ||
22 | * | ||
23 | * Author: Ingo Molnar <mingo@elte.hu> | ||
24 | * Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
25 | */ | ||
26 | |||
27 | #include <linux/delay.h> | ||
28 | #include <linux/gfp.h> | ||
29 | #include <linux/oom.h> | ||
30 | #include <linux/smpboot.h> | ||
31 | #include "../time/tick-internal.h" | ||
32 | |||
33 | #define RCU_KTHREAD_PRIO 1 | ||
34 | |||
35 | #ifdef CONFIG_RCU_BOOST | ||
36 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | ||
37 | #else | ||
38 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO | ||
39 | #endif | ||
40 | |||
41 | #ifdef CONFIG_RCU_NOCB_CPU | ||
42 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ | ||
43 | static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ | ||
44 | static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */ | ||
45 | static char __initdata nocb_buf[NR_CPUS * 5]; | ||
46 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
47 | |||
48 | /* | ||
49 | * Check the RCU kernel configuration parameters and print informative | ||
50 | * messages about anything out of the ordinary. If you like #ifdef, you | ||
51 | * will love this function. | ||
52 | */ | ||
53 | static void __init rcu_bootup_announce_oddness(void) | ||
54 | { | ||
55 | #ifdef CONFIG_RCU_TRACE | ||
56 | pr_info("\tRCU debugfs-based tracing is enabled.\n"); | ||
57 | #endif | ||
58 | #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) | ||
59 | pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", | ||
60 | CONFIG_RCU_FANOUT); | ||
61 | #endif | ||
62 | #ifdef CONFIG_RCU_FANOUT_EXACT | ||
63 | pr_info("\tHierarchical RCU autobalancing is disabled.\n"); | ||
64 | #endif | ||
65 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
66 | pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); | ||
67 | #endif | ||
68 | #ifdef CONFIG_PROVE_RCU | ||
69 | pr_info("\tRCU lockdep checking is enabled.\n"); | ||
70 | #endif | ||
71 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE | ||
72 | pr_info("\tRCU torture testing starts during boot.\n"); | ||
73 | #endif | ||
74 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) | ||
75 | pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); | ||
76 | #endif | ||
77 | #if defined(CONFIG_RCU_CPU_STALL_INFO) | ||
78 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); | ||
79 | #endif | ||
80 | #if NUM_RCU_LVL_4 != 0 | ||
81 | pr_info("\tFour-level hierarchy is enabled.\n"); | ||
82 | #endif | ||
83 | if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) | ||
84 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | ||
85 | if (nr_cpu_ids != NR_CPUS) | ||
86 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | ||
87 | #ifdef CONFIG_RCU_NOCB_CPU | ||
88 | #ifndef CONFIG_RCU_NOCB_CPU_NONE | ||
89 | if (!have_rcu_nocb_mask) { | ||
90 | zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); | ||
91 | have_rcu_nocb_mask = true; | ||
92 | } | ||
93 | #ifdef CONFIG_RCU_NOCB_CPU_ZERO | ||
94 | pr_info("\tOffload RCU callbacks from CPU 0\n"); | ||
95 | cpumask_set_cpu(0, rcu_nocb_mask); | ||
96 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ | ||
97 | #ifdef CONFIG_RCU_NOCB_CPU_ALL | ||
98 | pr_info("\tOffload RCU callbacks from all CPUs\n"); | ||
99 | cpumask_copy(rcu_nocb_mask, cpu_possible_mask); | ||
100 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ | ||
101 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ | ||
102 | if (have_rcu_nocb_mask) { | ||
103 | if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { | ||
104 | pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); | ||
105 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, | ||
106 | rcu_nocb_mask); | ||
107 | } | ||
108 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | ||
109 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); | ||
110 | if (rcu_nocb_poll) | ||
111 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | ||
112 | } | ||
113 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
114 | } | ||
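For reference, the no-CBs settings reported above are chosen on the kernel command line; the following is only a sketch with illustrative CPU numbers, assuming a CONFIG_RCU_NOCB_CPU=y build:

        # Offload RCU callbacks from CPUs 1-3, and have the offload ("rcuo")
        # kthreads poll for new callbacks instead of waiting to be woken:
        rcu_nocbs=1-3 rcu_nocb_poll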
115 | |||
116 | #ifdef CONFIG_TREE_PREEMPT_RCU | ||
117 | |||
118 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); | ||
119 | static struct rcu_state *rcu_state = &rcu_preempt_state; | ||
120 | |||
121 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | ||
122 | |||
123 | /* | ||
124 | * Tell them what RCU they are running. | ||
125 | */ | ||
126 | static void __init rcu_bootup_announce(void) | ||
127 | { | ||
128 | pr_info("Preemptible hierarchical RCU implementation.\n"); | ||
129 | rcu_bootup_announce_oddness(); | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * Return the number of RCU-preempt batches processed thus far | ||
134 | * for debug and statistics. | ||
135 | */ | ||
136 | long rcu_batches_completed_preempt(void) | ||
137 | { | ||
138 | return rcu_preempt_state.completed; | ||
139 | } | ||
140 | EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); | ||
141 | |||
142 | /* | ||
143 | * Return the number of RCU batches processed thus far for debug & stats. | ||
144 | */ | ||
145 | long rcu_batches_completed(void) | ||
146 | { | ||
147 | return rcu_batches_completed_preempt(); | ||
148 | } | ||
149 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
150 | |||
151 | /* | ||
152 | * Force a quiescent state for preemptible RCU. | ||
153 | */ | ||
154 | void rcu_force_quiescent_state(void) | ||
155 | { | ||
156 | force_quiescent_state(&rcu_preempt_state); | ||
157 | } | ||
158 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
159 | |||
160 | /* | ||
161 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | ||
162 | * that this just means that the task currently running on the CPU is | ||
163 | * not in a quiescent state. There might be any number of tasks blocked | ||
164 | * while in an RCU read-side critical section. | ||
165 | * | ||
166 | * Unlike the other rcu_*_qs() functions, callers to this function | ||
167 | * must disable irqs in order to protect the assignment to | ||
168 | * ->rcu_read_unlock_special. | ||
169 | */ | ||
170 | static void rcu_preempt_qs(int cpu) | ||
171 | { | ||
172 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | ||
173 | |||
174 | if (rdp->passed_quiesce == 0) | ||
175 | trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); | ||
176 | rdp->passed_quiesce = 1; | ||
177 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | ||
178 | } | ||
179 | |||
180 | /* | ||
181 | * We have entered the scheduler, and the current task might soon be | ||
182 | * context-switched away from. If this task is in an RCU read-side | ||
183 | * critical section, we will no longer be able to rely on the CPU to | ||
184 | * record that fact, so we enqueue the task on the blkd_tasks list. | ||
185 | * The task will dequeue itself when it exits the outermost enclosing | ||
186 | * RCU read-side critical section. Therefore, the current grace period | ||
187 | * cannot be permitted to complete until the blkd_tasks list entries | ||
188 | * predating the current grace period drain, in other words, until | ||
189 | * rnp->gp_tasks becomes NULL. | ||
190 | * | ||
191 | * Caller must disable preemption. | ||
192 | */ | ||
193 | static void rcu_preempt_note_context_switch(int cpu) | ||
194 | { | ||
195 | struct task_struct *t = current; | ||
196 | unsigned long flags; | ||
197 | struct rcu_data *rdp; | ||
198 | struct rcu_node *rnp; | ||
199 | |||
200 | if (t->rcu_read_lock_nesting > 0 && | ||
201 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | ||
202 | |||
203 | /* Possibly blocking in an RCU read-side critical section. */ | ||
204 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); | ||
205 | rnp = rdp->mynode; | ||
206 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
207 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | ||
208 | t->rcu_blocked_node = rnp; | ||
209 | |||
210 | /* | ||
211 | * If this CPU has already checked in, then this task | ||
212 | * will hold up the next grace period rather than the | ||
213 | * current grace period. Queue the task accordingly. | ||
214 | * If the task is queued for the current grace period | ||
215 | * (i.e., this CPU has not yet passed through a quiescent | ||
216 | * state for the current grace period), then as long | ||
217 | * as that task remains queued, the current grace period | ||
218 | * cannot end. Note that there is some uncertainty as | ||
219 | * to exactly when the current grace period started. | ||
220 | * We take a conservative approach, which can result | ||
221 | * in unnecessarily waiting on tasks that started very | ||
222 | * slightly after the current grace period began. C'est | ||
223 | * la vie!!! | ||
224 | * | ||
225 | * But first, note that the current CPU must still be | ||
226 | * online! | ||
227 | */ | ||
228 | WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); | ||
229 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | ||
230 | if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { | ||
231 | list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); | ||
232 | rnp->gp_tasks = &t->rcu_node_entry; | ||
233 | #ifdef CONFIG_RCU_BOOST | ||
234 | if (rnp->boost_tasks != NULL) | ||
235 | rnp->boost_tasks = rnp->gp_tasks; | ||
236 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
237 | } else { | ||
238 | list_add(&t->rcu_node_entry, &rnp->blkd_tasks); | ||
239 | if (rnp->qsmask & rdp->grpmask) | ||
240 | rnp->gp_tasks = &t->rcu_node_entry; | ||
241 | } | ||
242 | trace_rcu_preempt_task(rdp->rsp->name, | ||
243 | t->pid, | ||
244 | (rnp->qsmask & rdp->grpmask) | ||
245 | ? rnp->gpnum | ||
246 | : rnp->gpnum + 1); | ||
247 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
248 | } else if (t->rcu_read_lock_nesting < 0 && | ||
249 | t->rcu_read_unlock_special) { | ||
250 | |||
251 | /* | ||
252 | * Complete exit from RCU read-side critical section on | ||
253 | * behalf of preempted instance of __rcu_read_unlock(). | ||
254 | */ | ||
255 | rcu_read_unlock_special(t); | ||
256 | } | ||
257 | |||
258 | /* | ||
259 | * Either we were not in an RCU read-side critical section to | ||
260 | * begin with, or we have now recorded that critical section | ||
261 | * globally. Either way, we can now note a quiescent state | ||
262 | * for this CPU. Again, if we were in an RCU read-side critical | ||
263 | * section, and if that critical section was blocking the current | ||
264 | * grace period, then the fact that the task has been enqueued | ||
265 | * means that we continue to block the current grace period. | ||
266 | */ | ||
267 | local_irq_save(flags); | ||
268 | rcu_preempt_qs(cpu); | ||
269 | local_irq_restore(flags); | ||
270 | } | ||
271 | |||
272 | /* | ||
273 | * Check for preempted RCU readers blocking the current grace period | ||
274 | * for the specified rcu_node structure. If the caller needs a reliable | ||
275 | * answer, it must hold the rcu_node's ->lock. | ||
276 | */ | ||
277 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) | ||
278 | { | ||
279 | return rnp->gp_tasks != NULL; | ||
280 | } | ||
281 | |||
282 | /* | ||
283 | * Record a quiescent state for all tasks that were previously queued | ||
284 | * on the specified rcu_node structure and that were blocking the current | ||
285 | * RCU grace period. The caller must hold the specified rnp->lock with | ||
286 | * irqs disabled, and this lock is released upon return, but irqs remain | ||
287 | * disabled. | ||
288 | */ | ||
289 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | ||
290 | __releases(rnp->lock) | ||
291 | { | ||
292 | unsigned long mask; | ||
293 | struct rcu_node *rnp_p; | ||
294 | |||
295 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | ||
296 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
297 | return; /* Still need more quiescent states! */ | ||
298 | } | ||
299 | |||
300 | rnp_p = rnp->parent; | ||
301 | if (rnp_p == NULL) { | ||
302 | /* | ||
303 | * Either there is only one rcu_node in the tree, | ||
304 | * or tasks were kicked up to root rcu_node due to | ||
305 | * CPUs going offline. | ||
306 | */ | ||
307 | rcu_report_qs_rsp(&rcu_preempt_state, flags); | ||
308 | return; | ||
309 | } | ||
310 | |||
311 | /* Report up the rest of the hierarchy. */ | ||
312 | mask = rnp->grpmask; | ||
313 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
314 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ | ||
315 | rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); | ||
316 | } | ||
317 | |||
318 | /* | ||
319 | * Advance a ->blkd_tasks-list pointer to the next entry, instead | ||
320 | * returning NULL if at the end of the list. | ||
321 | */ | ||
322 | static struct list_head *rcu_next_node_entry(struct task_struct *t, | ||
323 | struct rcu_node *rnp) | ||
324 | { | ||
325 | struct list_head *np; | ||
326 | |||
327 | np = t->rcu_node_entry.next; | ||
328 | if (np == &rnp->blkd_tasks) | ||
329 | np = NULL; | ||
330 | return np; | ||
331 | } | ||
332 | |||
333 | /* | ||
334 | * Handle special cases during rcu_read_unlock(), such as needing to | ||
335 | * notify RCU core processing or task having blocked during the RCU | ||
336 | * read-side critical section. | ||
337 | */ | ||
338 | void rcu_read_unlock_special(struct task_struct *t) | ||
339 | { | ||
340 | int empty; | ||
341 | int empty_exp; | ||
342 | int empty_exp_now; | ||
343 | unsigned long flags; | ||
344 | struct list_head *np; | ||
345 | #ifdef CONFIG_RCU_BOOST | ||
346 | struct rt_mutex *rbmp = NULL; | ||
347 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
348 | struct rcu_node *rnp; | ||
349 | int special; | ||
350 | |||
351 | /* NMI handlers cannot block and cannot safely manipulate state. */ | ||
352 | if (in_nmi()) | ||
353 | return; | ||
354 | |||
355 | local_irq_save(flags); | ||
356 | |||
357 | /* | ||
358 | * If RCU core is waiting for this CPU to exit critical section, | ||
359 | * let it know that we have done so. | ||
360 | */ | ||
361 | special = t->rcu_read_unlock_special; | ||
362 | if (special & RCU_READ_UNLOCK_NEED_QS) { | ||
363 | rcu_preempt_qs(smp_processor_id()); | ||
364 | } | ||
365 | |||
366 | /* Hardware IRQ handlers cannot block. */ | ||
367 | if (in_irq() || in_serving_softirq()) { | ||
368 | local_irq_restore(flags); | ||
369 | return; | ||
370 | } | ||
371 | |||
372 | /* Clean up if blocked during RCU read-side critical section. */ | ||
373 | if (special & RCU_READ_UNLOCK_BLOCKED) { | ||
374 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; | ||
375 | |||
376 | /* | ||
377 | * Remove this task from the list it blocked on. The | ||
378 | * task can migrate while we acquire the lock, but at | ||
379 | * most one time. So at most two passes through the loop. | ||
380 | */ | ||
381 | for (;;) { | ||
382 | rnp = t->rcu_blocked_node; | ||
383 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
384 | if (rnp == t->rcu_blocked_node) | ||
385 | break; | ||
386 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
387 | } | ||
388 | empty = !rcu_preempt_blocked_readers_cgp(rnp); | ||
389 | empty_exp = !rcu_preempted_readers_exp(rnp); | ||
390 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | ||
391 | np = rcu_next_node_entry(t, rnp); | ||
392 | list_del_init(&t->rcu_node_entry); | ||
393 | t->rcu_blocked_node = NULL; | ||
394 | trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), | ||
395 | rnp->gpnum, t->pid); | ||
396 | if (&t->rcu_node_entry == rnp->gp_tasks) | ||
397 | rnp->gp_tasks = np; | ||
398 | if (&t->rcu_node_entry == rnp->exp_tasks) | ||
399 | rnp->exp_tasks = np; | ||
400 | #ifdef CONFIG_RCU_BOOST | ||
401 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
402 | rnp->boost_tasks = np; | ||
403 | /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ | ||
404 | if (t->rcu_boost_mutex) { | ||
405 | rbmp = t->rcu_boost_mutex; | ||
406 | t->rcu_boost_mutex = NULL; | ||
407 | } | ||
408 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
409 | |||
410 | /* | ||
411 | * If this was the last task on the current list, and if | ||
412 | * we aren't waiting on any CPUs, report the quiescent state. | ||
413 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, | ||
414 | * so we must take a snapshot of the expedited state. | ||
415 | */ | ||
416 | empty_exp_now = !rcu_preempted_readers_exp(rnp); | ||
417 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { | ||
418 | trace_rcu_quiescent_state_report(TPS("preempt_rcu"), | ||
419 | rnp->gpnum, | ||
420 | 0, rnp->qsmask, | ||
421 | rnp->level, | ||
422 | rnp->grplo, | ||
423 | rnp->grphi, | ||
424 | !!rnp->gp_tasks); | ||
425 | rcu_report_unblock_qs_rnp(rnp, flags); | ||
426 | } else { | ||
427 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
428 | } | ||
429 | |||
430 | #ifdef CONFIG_RCU_BOOST | ||
431 | /* Unboost if we were boosted. */ | ||
432 | if (rbmp) | ||
433 | rt_mutex_unlock(rbmp); | ||
434 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
435 | |||
436 | /* | ||
437 | * If this was the last task on the expedited lists, | ||
438 | * then we need to report up the rcu_node hierarchy. | ||
439 | */ | ||
440 | if (!empty_exp && empty_exp_now) | ||
441 | rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); | ||
442 | } else { | ||
443 | local_irq_restore(flags); | ||
444 | } | ||
445 | } | ||
446 | |||
447 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE | ||
448 | |||
449 | /* | ||
450 | * Dump detailed information for all tasks blocking the current RCU | ||
451 | * grace period on the specified rcu_node structure. | ||
452 | */ | ||
453 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | ||
454 | { | ||
455 | unsigned long flags; | ||
456 | struct task_struct *t; | ||
457 | |||
458 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
459 | if (!rcu_preempt_blocked_readers_cgp(rnp)) { | ||
460 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
461 | return; | ||
462 | } | ||
463 | t = list_entry(rnp->gp_tasks, | ||
464 | struct task_struct, rcu_node_entry); | ||
465 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) | ||
466 | sched_show_task(t); | ||
467 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
468 | } | ||
469 | |||
470 | /* | ||
471 | * Dump detailed information for all tasks blocking the current RCU | ||
472 | * grace period. | ||
473 | */ | ||
474 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
475 | { | ||
476 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
477 | |||
478 | rcu_print_detail_task_stall_rnp(rnp); | ||
479 | rcu_for_each_leaf_node(rsp, rnp) | ||
480 | rcu_print_detail_task_stall_rnp(rnp); | ||
481 | } | ||
482 | |||
483 | #else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
484 | |||
485 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
486 | { | ||
487 | } | ||
488 | |||
489 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
490 | |||
491 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
492 | |||
493 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) | ||
494 | { | ||
495 | pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", | ||
496 | rnp->level, rnp->grplo, rnp->grphi); | ||
497 | } | ||
498 | |||
499 | static void rcu_print_task_stall_end(void) | ||
500 | { | ||
501 | pr_cont("\n"); | ||
502 | } | ||
503 | |||
504 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
505 | |||
506 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) | ||
507 | { | ||
508 | } | ||
509 | |||
510 | static void rcu_print_task_stall_end(void) | ||
511 | { | ||
512 | } | ||
513 | |||
514 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
515 | |||
516 | /* | ||
517 | * Scan the current list of tasks blocked within RCU read-side critical | ||
518 | * sections, printing out the tid of each. | ||
519 | */ | ||
520 | static int rcu_print_task_stall(struct rcu_node *rnp) | ||
521 | { | ||
522 | struct task_struct *t; | ||
523 | int ndetected = 0; | ||
524 | |||
525 | if (!rcu_preempt_blocked_readers_cgp(rnp)) | ||
526 | return 0; | ||
527 | rcu_print_task_stall_begin(rnp); | ||
528 | t = list_entry(rnp->gp_tasks, | ||
529 | struct task_struct, rcu_node_entry); | ||
530 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { | ||
531 | pr_cont(" P%d", t->pid); | ||
532 | ndetected++; | ||
533 | } | ||
534 | rcu_print_task_stall_end(); | ||
535 | return ndetected; | ||
536 | } | ||
537 | |||
538 | /* | ||
539 | * Check that the list of blocked tasks for the newly completed grace | ||
540 | * period is in fact empty. It is a serious bug to complete a grace | ||
541 | * period that still has RCU readers blocked! This function must be | ||
542 | * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock | ||
543 | * must be held by the caller. | ||
544 | * | ||
545 | * Also, if there are blocked tasks on the list, they automatically | ||
546 | * block the newly created grace period, so set up ->gp_tasks accordingly. | ||
547 | */ | ||
548 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | ||
549 | { | ||
550 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); | ||
551 | if (!list_empty(&rnp->blkd_tasks)) | ||
552 | rnp->gp_tasks = rnp->blkd_tasks.next; | ||
553 | WARN_ON_ONCE(rnp->qsmask); | ||
554 | } | ||
555 | |||
556 | #ifdef CONFIG_HOTPLUG_CPU | ||
557 | |||
558 | /* | ||
559 | * Handle tasklist migration for case in which all CPUs covered by the | ||
560 | * specified rcu_node have gone offline. Move them up to the root | ||
561 | * rcu_node. The reason for not just moving them to the immediate | ||
562 | * parent is to remove the need for rcu_read_unlock_special() to | ||
563 | * make more than two attempts to acquire the target rcu_node's lock. | ||
564 | * Returns true if there were tasks blocking the current RCU grace | ||
565 | * period. | ||
566 | * | ||
567 | * Returns 1 if there was previously a task blocking the current grace | ||
568 | * period on the specified rcu_node structure. | ||
569 | * | ||
570 | * The caller must hold rnp->lock with irqs disabled. | ||
571 | */ | ||
572 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | ||
573 | struct rcu_node *rnp, | ||
574 | struct rcu_data *rdp) | ||
575 | { | ||
576 | struct list_head *lp; | ||
577 | struct list_head *lp_root; | ||
578 | int retval = 0; | ||
579 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
580 | struct task_struct *t; | ||
581 | |||
582 | if (rnp == rnp_root) { | ||
583 | WARN_ONCE(1, "Last CPU thought to be offlined?"); | ||
584 | return 0; /* Shouldn't happen: at least one CPU online. */ | ||
585 | } | ||
586 | |||
587 | /* If we are on an internal node, complain bitterly. */ | ||
588 | WARN_ON_ONCE(rnp != rdp->mynode); | ||
589 | |||
590 | /* | ||
591 | * Move tasks up to root rcu_node. Don't try to get fancy for | ||
592 | * this corner-case operation -- just put this node's tasks | ||
593 | * at the head of the root node's list, and update the root node's | ||
594 | * ->gp_tasks and ->exp_tasks pointers to those of this node's, | ||
595 | * if non-NULL. This might result in waiting for more tasks than | ||
596 | * absolutely necessary, but this is a good performance/complexity | ||
597 | * tradeoff. | ||
598 | */ | ||
599 | if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) | ||
600 | retval |= RCU_OFL_TASKS_NORM_GP; | ||
601 | if (rcu_preempted_readers_exp(rnp)) | ||
602 | retval |= RCU_OFL_TASKS_EXP_GP; | ||
603 | lp = &rnp->blkd_tasks; | ||
604 | lp_root = &rnp_root->blkd_tasks; | ||
605 | while (!list_empty(lp)) { | ||
606 | t = list_entry(lp->next, typeof(*t), rcu_node_entry); | ||
607 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | ||
608 | list_del(&t->rcu_node_entry); | ||
609 | t->rcu_blocked_node = rnp_root; | ||
610 | list_add(&t->rcu_node_entry, lp_root); | ||
611 | if (&t->rcu_node_entry == rnp->gp_tasks) | ||
612 | rnp_root->gp_tasks = rnp->gp_tasks; | ||
613 | if (&t->rcu_node_entry == rnp->exp_tasks) | ||
614 | rnp_root->exp_tasks = rnp->exp_tasks; | ||
615 | #ifdef CONFIG_RCU_BOOST | ||
616 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
617 | rnp_root->boost_tasks = rnp->boost_tasks; | ||
618 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
619 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
620 | } | ||
621 | |||
622 | rnp->gp_tasks = NULL; | ||
623 | rnp->exp_tasks = NULL; | ||
624 | #ifdef CONFIG_RCU_BOOST | ||
625 | rnp->boost_tasks = NULL; | ||
626 | /* | ||
627 | * The root might already be undergoing boosting while this leaf was | ||
628 | * not. In that case, make sure that we boost the tasks blocking the | ||
629 | * current grace period. | ||
630 | */ | ||
631 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | ||
632 | if (rnp_root->boost_tasks != NULL && | ||
633 | rnp_root->boost_tasks != rnp_root->gp_tasks && | ||
634 | rnp_root->boost_tasks != rnp_root->exp_tasks) | ||
635 | rnp_root->boost_tasks = rnp_root->gp_tasks; | ||
636 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
637 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
638 | |||
639 | return retval; | ||
640 | } | ||
641 | |||
642 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
643 | |||
644 | /* | ||
645 | * Check for a quiescent state from the current CPU. When a task blocks, | ||
646 | * the task is recorded in the corresponding CPU's rcu_node structure, | ||
647 | * which is checked elsewhere. | ||
648 | * | ||
649 | * Caller must disable hard irqs. | ||
650 | */ | ||
651 | static void rcu_preempt_check_callbacks(int cpu) | ||
652 | { | ||
653 | struct task_struct *t = current; | ||
654 | |||
655 | if (t->rcu_read_lock_nesting == 0) { | ||
656 | rcu_preempt_qs(cpu); | ||
657 | return; | ||
658 | } | ||
659 | if (t->rcu_read_lock_nesting > 0 && | ||
660 | per_cpu(rcu_preempt_data, cpu).qs_pending) | ||
661 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | ||
662 | } | ||
663 | |||
664 | #ifdef CONFIG_RCU_BOOST | ||
665 | |||
666 | static void rcu_preempt_do_callbacks(void) | ||
667 | { | ||
668 | rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); | ||
669 | } | ||
670 | |||
671 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
672 | |||
673 | /* | ||
674 | * Queue a preemptible-RCU callback for invocation after a grace period. | ||
675 | */ | ||
676 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
677 | { | ||
678 | __call_rcu(head, func, &rcu_preempt_state, -1, 0); | ||
679 | } | ||
680 | EXPORT_SYMBOL_GPL(call_rcu); | ||
681 | |||
682 | /* | ||
683 | * Queue an RCU callback for lazy invocation after a grace period. | ||
684 | * This will likely be later named something like "call_rcu_lazy()", | ||
685 | * but this change will require some way of tagging the lazy RCU | ||
686 | * callbacks in the list of pending callbacks. Until then, this | ||
687 | * function may only be called from __kfree_rcu(). | ||
688 | */ | ||
689 | void kfree_call_rcu(struct rcu_head *head, | ||
690 | void (*func)(struct rcu_head *rcu)) | ||
691 | { | ||
692 | __call_rcu(head, func, &rcu_preempt_state, -1, 1); | ||
693 | } | ||
694 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | ||
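For context, the only intended route into this function is the kfree_rcu() macro (via __kfree_rcu()), as the comment above notes. A minimal sketch, with a hypothetical struct foo supplying the required rcu_head:

        struct foo {
                struct list_head list;
                int data;
                struct rcu_head rcu;    /* storage kfree_rcu() uses to queue the callback */
        };

        /* Unlink fp from an RCU-protected list, then free it after a grace period. */
        list_del_rcu(&fp->list);
        kfree_rcu(fp, rcu);             /* reaches kfree_call_rcu() through __kfree_rcu() */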
695 | |||
696 | /** | ||
697 | * synchronize_rcu - wait until a grace period has elapsed. | ||
698 | * | ||
699 | * Control will return to the caller some time after a full grace | ||
700 | * period has elapsed, in other words after all currently executing RCU | ||
701 | * read-side critical sections have completed. Note, however, that | ||
702 | * upon return from synchronize_rcu(), the caller might well be executing | ||
703 | * concurrently with new RCU read-side critical sections that began while | ||
704 | * synchronize_rcu() was waiting. RCU read-side critical sections are | ||
705 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. | ||
706 | * | ||
707 | * See the description of synchronize_sched() for more detailed information | ||
708 | * on memory ordering guarantees. | ||
709 | */ | ||
710 | void synchronize_rcu(void) | ||
711 | { | ||
712 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | ||
713 | !lock_is_held(&rcu_lock_map) && | ||
714 | !lock_is_held(&rcu_sched_lock_map), | ||
715 | "Illegal synchronize_rcu() in RCU read-side critical section"); | ||
716 | if (!rcu_scheduler_active) | ||
717 | return; | ||
718 | if (rcu_expedited) | ||
719 | synchronize_rcu_expedited(); | ||
720 | else | ||
721 | wait_rcu_gp(call_rcu); | ||
722 | } | ||
723 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
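A minimal reader/updater pairing for the API documented above, kept as a sketch: gp, newp, oldp, and do_something_with() are hypothetical names, and the updater is assumed to hold whatever lock serializes updates.

        /* Reader, delimited by rcu_read_lock()/rcu_read_unlock(): */
        rcu_read_lock();
        p = rcu_dereference(gp);
        if (p)
                do_something_with(p);
        rcu_read_unlock();

        /* Updater: publish the new version, wait out pre-existing readers, free. */
        oldp = gp;
        rcu_assign_pointer(gp, newp);
        synchronize_rcu();      /* all readers that might still see oldp have finished */
        kfree(oldp);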
724 | |||
725 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); | ||
726 | static unsigned long sync_rcu_preempt_exp_count; | ||
727 | static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | ||
728 | |||
729 | /* | ||
730 | * Return non-zero if there are any tasks in RCU read-side critical | ||
731 | * sections blocking the current preemptible-RCU expedited grace period. | ||
732 | * If there is no preemptible-RCU expedited grace period currently in | ||
733 | * progress, returns zero unconditionally. | ||
734 | */ | ||
735 | static int rcu_preempted_readers_exp(struct rcu_node *rnp) | ||
736 | { | ||
737 | return rnp->exp_tasks != NULL; | ||
738 | } | ||
739 | |||
740 | /* | ||
741 | * Return non-zero if there is no RCU expedited grace period in progress | ||
742 | * for the specified rcu_node structure, in other words, if all CPUs and | ||
743 | * tasks covered by the specified rcu_node structure have done their bit | ||
744 | * for the current expedited grace period. Works only for preemptible | ||
745 | * RCU -- other RCU implementations use other means. | ||
746 | * | ||
747 | * Caller must hold sync_rcu_preempt_exp_mutex. | ||
748 | */ | ||
749 | static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | ||
750 | { | ||
751 | return !rcu_preempted_readers_exp(rnp) && | ||
752 | ACCESS_ONCE(rnp->expmask) == 0; | ||
753 | } | ||
754 | |||
755 | /* | ||
756 | * Report the exit from RCU read-side critical section for the last task | ||
757 | * that queued itself during or before the current expedited preemptible-RCU | ||
758 | * grace period. This event is reported either to the rcu_node structure on | ||
759 | * which the task was queued or to one of that rcu_node structure's ancestors, | ||
760 | * recursively up the tree. (Calm down, calm down, we do the recursion | ||
761 | * iteratively!) | ||
762 | * | ||
763 | * Most callers will set the "wake" flag, but the task initiating the | ||
764 | * expedited grace period need not wake itself. | ||
765 | * | ||
766 | * Caller must hold sync_rcu_preempt_exp_mutex. | ||
767 | */ | ||
768 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
769 | bool wake) | ||
770 | { | ||
771 | unsigned long flags; | ||
772 | unsigned long mask; | ||
773 | |||
774 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
775 | for (;;) { | ||
776 | if (!sync_rcu_preempt_exp_done(rnp)) { | ||
777 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
778 | break; | ||
779 | } | ||
780 | if (rnp->parent == NULL) { | ||
781 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
782 | if (wake) | ||
783 | wake_up(&sync_rcu_preempt_exp_wq); | ||
784 | break; | ||
785 | } | ||
786 | mask = rnp->grpmask; | ||
787 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
788 | rnp = rnp->parent; | ||
789 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | ||
790 | rnp->expmask &= ~mask; | ||
791 | } | ||
792 | } | ||
793 | |||
794 | /* | ||
795 | * Snapshot the tasks blocking the newly started preemptible-RCU expedited | ||
796 | * grace period for the specified rcu_node structure. If there are no such | ||
797 | * tasks, report it up the rcu_node hierarchy. | ||
798 | * | ||
799 | * Caller must hold sync_rcu_preempt_exp_mutex and must exclude | ||
800 | * CPU hotplug operations. | ||
801 | */ | ||
802 | static void | ||
803 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | ||
804 | { | ||
805 | unsigned long flags; | ||
806 | int must_wait = 0; | ||
807 | |||
808 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
809 | if (list_empty(&rnp->blkd_tasks)) { | ||
810 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
811 | } else { | ||
812 | rnp->exp_tasks = rnp->blkd_tasks.next; | ||
813 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | ||
814 | must_wait = 1; | ||
815 | } | ||
816 | if (!must_wait) | ||
817 | rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ | ||
818 | } | ||
819 | |||
820 | /** | ||
821 | * synchronize_rcu_expedited - Brute-force RCU grace period | ||
822 | * | ||
823 | * Wait for an RCU-preempt grace period, but expedite it. The basic | ||
824 | * idea is to invoke synchronize_sched_expedited() to push all the tasks to | ||
825 | * the ->blkd_tasks lists and wait for those lists to drain. This consumes | ||
826 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
827 | * so it is not recommended for any sort of common-case code. | ||
828 | * In fact, if you are using synchronize_rcu_expedited() in a loop, | ||
829 | * please restructure your code to batch your updates, and then use a | ||
830 | * single synchronize_rcu() instead. | ||
831 | * | ||
832 | * Note that it is illegal to call this function while holding any lock | ||
833 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | ||
834 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
835 | * these restrictions will result in deadlock. | ||
836 | */ | ||
837 | void synchronize_rcu_expedited(void) | ||
838 | { | ||
839 | unsigned long flags; | ||
840 | struct rcu_node *rnp; | ||
841 | struct rcu_state *rsp = &rcu_preempt_state; | ||
842 | unsigned long snap; | ||
843 | int trycount = 0; | ||
844 | |||
845 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ | ||
846 | snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; | ||
847 | smp_mb(); /* Above access cannot bleed into critical section. */ | ||
848 | |||
849 | /* | ||
850 | * Block CPU-hotplug operations. This means that any CPU-hotplug | ||
851 | * operation that finds an rcu_node structure with tasks in the | ||
852 | * process of being boosted will know that all tasks blocking | ||
853 | * this expedited grace period will already be in the process of | ||
854 | * being boosted. This simplifies the process of moving tasks | ||
855 | * from leaf to root rcu_node structures. | ||
856 | */ | ||
857 | get_online_cpus(); | ||
858 | |||
859 | /* | ||
860 | * Acquire lock, falling back to synchronize_rcu() if too many | ||
861 | * lock-acquisition failures. Of course, if someone does the | ||
862 | * expedited grace period for us, just leave. | ||
863 | */ | ||
864 | while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { | ||
865 | if (ULONG_CMP_LT(snap, | ||
866 | ACCESS_ONCE(sync_rcu_preempt_exp_count))) { | ||
867 | put_online_cpus(); | ||
868 | goto mb_ret; /* Others did our work for us. */ | ||
869 | } | ||
870 | if (trycount++ < 10) { | ||
871 | udelay(trycount * num_online_cpus()); | ||
872 | } else { | ||
873 | put_online_cpus(); | ||
874 | wait_rcu_gp(call_rcu); | ||
875 | return; | ||
876 | } | ||
877 | } | ||
878 | if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { | ||
879 | put_online_cpus(); | ||
880 | goto unlock_mb_ret; /* Others did our work for us. */ | ||
881 | } | ||
882 | |||
883 | /* Force all RCU readers onto ->blkd_tasks lists. */ | ||
884 | synchronize_sched_expedited(); | ||
885 | |||
886 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ | ||
887 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { | ||
888 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
889 | rnp->expmask = rnp->qsmaskinit; | ||
890 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
891 | } | ||
892 | |||
893 | /* Snapshot current state of ->blkd_tasks lists. */ | ||
894 | rcu_for_each_leaf_node(rsp, rnp) | ||
895 | sync_rcu_preempt_exp_init(rsp, rnp); | ||
896 | if (NUM_RCU_NODES > 1) | ||
897 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); | ||
898 | |||
899 | put_online_cpus(); | ||
900 | |||
901 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ | ||
902 | rnp = rcu_get_root(rsp); | ||
903 | wait_event(sync_rcu_preempt_exp_wq, | ||
904 | sync_rcu_preempt_exp_done(rnp)); | ||
905 | |||
906 | /* Clean up and exit. */ | ||
907 | smp_mb(); /* ensure expedited GP seen before counter increment. */ | ||
908 | ACCESS_ONCE(sync_rcu_preempt_exp_count)++; | ||
909 | unlock_mb_ret: | ||
910 | mutex_unlock(&sync_rcu_preempt_exp_mutex); | ||
911 | mb_ret: | ||
912 | smp_mb(); /* ensure subsequent action seen after grace period. */ | ||
913 | } | ||
914 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
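To illustrate the batching advice in the header comment, compare the two update loops below; remove_one(), tbl[], i, and n are hypothetical:

        /* Unfriendly: one expedited grace period per update. */
        for (i = 0; i < n; i++) {
                remove_one(&tbl[i]);
                synchronize_rcu_expedited();
        }

        /* Preferred: batch the removals, then wait for a single grace period. */
        for (i = 0; i < n; i++)
                remove_one(&tbl[i]);
        synchronize_rcu();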
915 | |||
916 | /** | ||
917 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. | ||
918 | * | ||
919 | * Note that this primitive does not necessarily wait for an RCU grace period | ||
920 | * to complete. For example, if there are no RCU callbacks queued anywhere | ||
921 | * in the system, then rcu_barrier() is within its rights to return | ||
922 | * immediately, without waiting for anything, much less an RCU grace period. | ||
923 | */ | ||
924 | void rcu_barrier(void) | ||
925 | { | ||
926 | _rcu_barrier(&rcu_preempt_state); | ||
927 | } | ||
928 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
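The classic use case is a module-exit path whose call_rcu() callbacks point at functions in the module itself; my_exit() and stop_posting_callbacks() are hypothetical:

        static void __exit my_exit(void)
        {
                stop_posting_callbacks();  /* ensure no new call_rcu() invocations */
                rcu_barrier();             /* wait for already-queued callbacks to finish */
                /* Only now is it safe to let the module text go away. */
        }
        module_exit(my_exit);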
929 | |||
930 | /* | ||
931 | * Initialize preemptible RCU's state structures. | ||
932 | */ | ||
933 | static void __init __rcu_init_preempt(void) | ||
934 | { | ||
935 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); | ||
936 | } | ||
937 | |||
938 | /* | ||
939 | * Check for a task exiting while in a preemptible-RCU read-side | ||
940 | * critical section, clean up if so. No need to issue warnings, | ||
941 | * as debug_check_no_locks_held() already does this if lockdep | ||
942 | * is enabled. | ||
943 | */ | ||
944 | void exit_rcu(void) | ||
945 | { | ||
946 | struct task_struct *t = current; | ||
947 | |||
948 | if (likely(list_empty(¤t->rcu_node_entry))) | ||
949 | return; | ||
950 | t->rcu_read_lock_nesting = 1; | ||
951 | barrier(); | ||
952 | t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; | ||
953 | __rcu_read_unlock(); | ||
954 | } | ||
955 | |||
956 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
957 | |||
958 | static struct rcu_state *rcu_state = &rcu_sched_state; | ||
959 | |||
960 | /* | ||
961 | * Tell them what RCU they are running. | ||
962 | */ | ||
963 | static void __init rcu_bootup_announce(void) | ||
964 | { | ||
965 | pr_info("Hierarchical RCU implementation.\n"); | ||
966 | rcu_bootup_announce_oddness(); | ||
967 | } | ||
968 | |||
969 | /* | ||
970 | * Return the number of RCU batches processed thus far for debug & stats. | ||
971 | */ | ||
972 | long rcu_batches_completed(void) | ||
973 | { | ||
974 | return rcu_batches_completed_sched(); | ||
975 | } | ||
976 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
977 | |||
978 | /* | ||
979 | * Force a quiescent state for RCU, which, because there is no preemptible | ||
980 | * RCU, becomes the same as rcu-sched. | ||
981 | */ | ||
982 | void rcu_force_quiescent_state(void) | ||
983 | { | ||
984 | rcu_sched_force_quiescent_state(); | ||
985 | } | ||
986 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
987 | |||
988 | /* | ||
989 | * Because preemptible RCU does not exist, we never have to check for | ||
990 | * CPUs being in quiescent states. | ||
991 | */ | ||
992 | static void rcu_preempt_note_context_switch(int cpu) | ||
993 | { | ||
994 | } | ||
995 | |||
996 | /* | ||
997 | * Because preemptible RCU does not exist, there are never any preempted | ||
998 | * RCU readers. | ||
999 | */ | ||
1000 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) | ||
1001 | { | ||
1002 | return 0; | ||
1003 | } | ||
1004 | |||
1005 | #ifdef CONFIG_HOTPLUG_CPU | ||
1006 | |||
1007 | /* Because preemptible RCU does not exist, no quieting of tasks. */ | ||
1008 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | ||
1009 | { | ||
1010 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1011 | } | ||
1012 | |||
1013 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1014 | |||
1015 | /* | ||
1016 | * Because preemptible RCU does not exist, we never have to check for | ||
1017 | * tasks blocked within RCU read-side critical sections. | ||
1018 | */ | ||
1019 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
1020 | { | ||
1021 | } | ||
1022 | |||
1023 | /* | ||
1024 | * Because preemptible RCU does not exist, we never have to check for | ||
1025 | * tasks blocked within RCU read-side critical sections. | ||
1026 | */ | ||
1027 | static int rcu_print_task_stall(struct rcu_node *rnp) | ||
1028 | { | ||
1029 | return 0; | ||
1030 | } | ||
1031 | |||
1032 | /* | ||
1033 | * Because there is no preemptible RCU, there can be no readers blocked, | ||
1035 | * so there is no need to check for blocked tasks. Check only for | ||
1035 | * bogus qsmask values. | ||
1036 | */ | ||
1037 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | ||
1038 | { | ||
1039 | WARN_ON_ONCE(rnp->qsmask); | ||
1040 | } | ||
1041 | |||
1042 | #ifdef CONFIG_HOTPLUG_CPU | ||
1043 | |||
1044 | /* | ||
1045 | * Because preemptible RCU does not exist, it never needs to migrate | ||
1046 | * tasks that were blocked within RCU read-side critical sections, and | ||
1047 | * such non-existent tasks cannot possibly have been blocking the current | ||
1048 | * grace period. | ||
1049 | */ | ||
1050 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | ||
1051 | struct rcu_node *rnp, | ||
1052 | struct rcu_data *rdp) | ||
1053 | { | ||
1054 | return 0; | ||
1055 | } | ||
1056 | |||
1057 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1058 | |||
1059 | /* | ||
1060 | * Because preemptible RCU does not exist, it never has any callbacks | ||
1061 | * to check. | ||
1062 | */ | ||
1063 | static void rcu_preempt_check_callbacks(int cpu) | ||
1064 | { | ||
1065 | } | ||
1066 | |||
1067 | /* | ||
1068 | * Queue an RCU callback for lazy invocation after a grace period. | ||
1069 | * This will likely be later named something like "call_rcu_lazy()", | ||
1070 | * but this change will require some way of tagging the lazy RCU | ||
1071 | * callbacks in the list of pending callbacks. Until then, this | ||
1072 | * function may only be called from __kfree_rcu(). | ||
1073 | * | ||
1074 | * Because there is no preemptible RCU, we use RCU-sched instead. | ||
1075 | */ | ||
1076 | void kfree_call_rcu(struct rcu_head *head, | ||
1077 | void (*func)(struct rcu_head *rcu)) | ||
1078 | { | ||
1079 | __call_rcu(head, func, &rcu_sched_state, -1, 1); | ||
1080 | } | ||
1081 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | ||
1082 | |||
1083 | /* | ||
1084 | * Wait for an rcu-preempt grace period, but make it happen quickly. | ||
1085 | * But because preemptible RCU does not exist, map to rcu-sched. | ||
1086 | */ | ||
1087 | void synchronize_rcu_expedited(void) | ||
1088 | { | ||
1089 | synchronize_sched_expedited(); | ||
1090 | } | ||
1091 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
1092 | |||
1093 | #ifdef CONFIG_HOTPLUG_CPU | ||
1094 | |||
1095 | /* | ||
1096 | * Because preemptible RCU does not exist, there is never any need to | ||
1097 | * report on tasks preempted in RCU read-side critical sections during | ||
1098 | * expedited RCU grace periods. | ||
1099 | */ | ||
1100 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
1101 | bool wake) | ||
1102 | { | ||
1103 | } | ||
1104 | |||
1105 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1106 | |||
1107 | /* | ||
1108 | * Because preemptible RCU does not exist, rcu_barrier() is just | ||
1109 | * another name for rcu_barrier_sched(). | ||
1110 | */ | ||
1111 | void rcu_barrier(void) | ||
1112 | { | ||
1113 | rcu_barrier_sched(); | ||
1114 | } | ||
1115 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
1116 | |||
1117 | /* | ||
1118 | * Because preemptible RCU does not exist, it need not be initialized. | ||
1119 | */ | ||
1120 | static void __init __rcu_init_preempt(void) | ||
1121 | { | ||
1122 | } | ||
1123 | |||
1124 | /* | ||
1125 | * Because preemptible RCU does not exist, tasks cannot possibly exit | ||
1126 | * while in preemptible RCU read-side critical sections. | ||
1127 | */ | ||
1128 | void exit_rcu(void) | ||
1129 | { | ||
1130 | } | ||
1131 | |||
1132 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
1133 | |||
1134 | #ifdef CONFIG_RCU_BOOST | ||
1135 | |||
1136 | #include "../rtmutex_common.h" | ||
1137 | |||
1138 | #ifdef CONFIG_RCU_TRACE | ||
1139 | |||
1140 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
1141 | { | ||
1142 | if (list_empty(&rnp->blkd_tasks)) | ||
1143 | rnp->n_balk_blkd_tasks++; | ||
1144 | else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) | ||
1145 | rnp->n_balk_exp_gp_tasks++; | ||
1146 | else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) | ||
1147 | rnp->n_balk_boost_tasks++; | ||
1148 | else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) | ||
1149 | rnp->n_balk_notblocked++; | ||
1150 | else if (rnp->gp_tasks != NULL && | ||
1151 | ULONG_CMP_LT(jiffies, rnp->boost_time)) | ||
1152 | rnp->n_balk_notyet++; | ||
1153 | else | ||
1154 | rnp->n_balk_nos++; | ||
1155 | } | ||
1156 | |||
1157 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
1158 | |||
1159 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
1160 | { | ||
1161 | } | ||
1162 | |||
1163 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
1164 | |||
1165 | static void rcu_wake_cond(struct task_struct *t, int status) | ||
1166 | { | ||
1167 | /* | ||
1168 | * If the thread is yielding, only wake it when this | ||
1169 | * is invoked from the idle task. | ||
1170 | */ | ||
1171 | if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) | ||
1172 | wake_up_process(t); | ||
1173 | } | ||
1174 | |||
1175 | /* | ||
1176 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks | ||
1177 | * or ->boost_tasks, advancing the pointer to the next task in the | ||
1178 | * ->blkd_tasks list. | ||
1179 | * | ||
1180 | * Note that irqs must be enabled: boosting the task can block. | ||
1181 | * Returns 1 if there are more tasks needing to be boosted. | ||
1182 | */ | ||
1183 | static int rcu_boost(struct rcu_node *rnp) | ||
1184 | { | ||
1185 | unsigned long flags; | ||
1186 | struct rt_mutex mtx; | ||
1187 | struct task_struct *t; | ||
1188 | struct list_head *tb; | ||
1189 | |||
1190 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) | ||
1191 | return 0; /* Nothing left to boost. */ | ||
1192 | |||
1193 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1194 | |||
1195 | /* | ||
1196 | * Recheck under the lock: all tasks in need of boosting | ||
1197 | * might exit their RCU read-side critical sections on their own. | ||
1198 | */ | ||
1199 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { | ||
1200 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1201 | return 0; | ||
1202 | } | ||
1203 | |||
1204 | /* | ||
1205 | * Preferentially boost tasks blocking expedited grace periods. | ||
1206 | * This cannot starve the normal grace periods because a second | ||
1207 | * expedited grace period must boost all blocked tasks, including | ||
1208 | * those blocking the pre-existing normal grace period. | ||
1209 | */ | ||
1210 | if (rnp->exp_tasks != NULL) { | ||
1211 | tb = rnp->exp_tasks; | ||
1212 | rnp->n_exp_boosts++; | ||
1213 | } else { | ||
1214 | tb = rnp->boost_tasks; | ||
1215 | rnp->n_normal_boosts++; | ||
1216 | } | ||
1217 | rnp->n_tasks_boosted++; | ||
1218 | |||
1219 | /* | ||
1220 | * We boost task t by manufacturing an rt_mutex that appears to | ||
1221 | * be held by task t. We leave a pointer to that rt_mutex where | ||
1222 | * task t can find it, and task t will release the mutex when it | ||
1223 | * exits its outermost RCU read-side critical section. Then | ||
1224 | * simply acquiring this artificial rt_mutex will boost task | ||
1225 | * t's priority. (Thanks to tglx for suggesting this approach!) | ||
1226 | * | ||
1227 | * Note that task t must acquire rnp->lock to remove itself from | ||
1228 | * the ->blkd_tasks list, which it will do from exit() if from | ||
1229 | * nowhere else. We therefore are guaranteed that task t will | ||
1230 | * stay around at least until we drop rnp->lock. Note that | ||
1231 | * rnp->lock also resolves races between our priority boosting | ||
1232 | * and task t's exiting its outermost RCU read-side critical | ||
1233 | * section. | ||
1234 | */ | ||
1235 | t = container_of(tb, struct task_struct, rcu_node_entry); | ||
1236 | rt_mutex_init_proxy_locked(&mtx, t); | ||
1237 | t->rcu_boost_mutex = &mtx; | ||
1238 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1239 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ | ||
1240 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | ||
1241 | |||
1242 | return ACCESS_ONCE(rnp->exp_tasks) != NULL || | ||
1243 | ACCESS_ONCE(rnp->boost_tasks) != NULL; | ||
1244 | } | ||
1245 | |||
1246 | /* | ||
1247 | * Priority-boosting kthread. One per leaf rcu_node and one for the | ||
1248 | * root rcu_node. | ||
1249 | */ | ||
1250 | static int rcu_boost_kthread(void *arg) | ||
1251 | { | ||
1252 | struct rcu_node *rnp = (struct rcu_node *)arg; | ||
1253 | int spincnt = 0; | ||
1254 | int more2boost; | ||
1255 | |||
1256 | trace_rcu_utilization(TPS("Start boost kthread@init")); | ||
1257 | for (;;) { | ||
1258 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; | ||
1259 | trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); | ||
1260 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); | ||
1261 | trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); | ||
1262 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; | ||
1263 | more2boost = rcu_boost(rnp); | ||
1264 | if (more2boost) | ||
1265 | spincnt++; | ||
1266 | else | ||
1267 | spincnt = 0; | ||
1268 | if (spincnt > 10) { | ||
1269 | rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; | ||
1270 | trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); | ||
1271 | schedule_timeout_interruptible(2); | ||
1272 | trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); | ||
1273 | spincnt = 0; | ||
1274 | } | ||
1275 | } | ||
1276 | /* NOTREACHED */ | ||
1277 | trace_rcu_utilization(TPS("End boost kthread@notreached")); | ||
1278 | return 0; | ||
1279 | } | ||
1280 | |||
1281 | /* | ||
1282 | * Check to see if it is time to start boosting RCU readers that are | ||
1283 | * blocking the current grace period, and, if so, tell the per-rcu_node | ||
1284 | * kthread to start boosting them. If there is an expedited grace | ||
1285 | * period in progress, it is always time to boost. | ||
1286 | * | ||
1287 | * The caller must hold rnp->lock, which this function releases. | ||
1288 | * The ->boost_kthread_task is immortal, so we don't need to worry | ||
1289 | * about it going away. | ||
1290 | */ | ||
1291 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | ||
1292 | { | ||
1293 | struct task_struct *t; | ||
1294 | |||
1295 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { | ||
1296 | rnp->n_balk_exp_gp_tasks++; | ||
1297 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1298 | return; | ||
1299 | } | ||
1300 | if (rnp->exp_tasks != NULL || | ||
1301 | (rnp->gp_tasks != NULL && | ||
1302 | rnp->boost_tasks == NULL && | ||
1303 | rnp->qsmask == 0 && | ||
1304 | ULONG_CMP_GE(jiffies, rnp->boost_time))) { | ||
1305 | if (rnp->exp_tasks == NULL) | ||
1306 | rnp->boost_tasks = rnp->gp_tasks; | ||
1307 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1308 | t = rnp->boost_kthread_task; | ||
1309 | if (t) | ||
1310 | rcu_wake_cond(t, rnp->boost_kthread_status); | ||
1311 | } else { | ||
1312 | rcu_initiate_boost_trace(rnp); | ||
1313 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1314 | } | ||
1315 | } | ||
1316 | |||
1317 | /* | ||
1318 | * Wake up the per-CPU kthread to invoke RCU callbacks. | ||
1319 | */ | ||
1320 | static void invoke_rcu_callbacks_kthread(void) | ||
1321 | { | ||
1322 | unsigned long flags; | ||
1323 | |||
1324 | local_irq_save(flags); | ||
1325 | __this_cpu_write(rcu_cpu_has_work, 1); | ||
1326 | if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && | ||
1327 | current != __this_cpu_read(rcu_cpu_kthread_task)) { | ||
1328 | rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), | ||
1329 | __this_cpu_read(rcu_cpu_kthread_status)); | ||
1330 | } | ||
1331 | local_irq_restore(flags); | ||
1332 | } | ||
1333 | |||
1334 | /* | ||
1335 | * Is the current CPU running the RCU-callbacks kthread? | ||
1336 | * Caller must have preemption disabled. | ||
1337 | */ | ||
1338 | static bool rcu_is_callbacks_kthread(void) | ||
1339 | { | ||
1340 | return __this_cpu_read(rcu_cpu_kthread_task) == current; | ||
1341 | } | ||
1342 | |||
1343 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | ||
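For example, assuming the default CONFIG_RCU_BOOST_DELAY of 500 milliseconds and HZ=250, this evaluates to DIV_ROUND_UP(500 * 250, 1000) = 125 jiffies, so boosting is not initiated until a grace period is at least half a second old.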
1344 | |||
1345 | /* | ||
1346 | * Do priority-boost accounting for the start of a new grace period. | ||
1347 | */ | ||
1348 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | ||
1349 | { | ||
1350 | rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | ||
1351 | } | ||
1352 | |||
1353 | /* | ||
1354 | * Create an RCU-boost kthread for the specified node if one does not | ||
1355 | * already exist. We only create this kthread for preemptible RCU. | ||
1356 | * Returns zero if all is well, a negated errno otherwise. | ||
1357 | */ | ||
1358 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | ||
1359 | struct rcu_node *rnp) | ||
1360 | { | ||
1361 | int rnp_index = rnp - &rsp->node[0]; | ||
1362 | unsigned long flags; | ||
1363 | struct sched_param sp; | ||
1364 | struct task_struct *t; | ||
1365 | |||
1366 | if (&rcu_preempt_state != rsp) | ||
1367 | return 0; | ||
1368 | |||
1369 | if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) | ||
1370 | return 0; | ||
1371 | |||
1372 | rsp->boost = 1; | ||
1373 | if (rnp->boost_kthread_task != NULL) | ||
1374 | return 0; | ||
1375 | t = kthread_create(rcu_boost_kthread, (void *)rnp, | ||
1376 | "rcub/%d", rnp_index); | ||
1377 | if (IS_ERR(t)) | ||
1378 | return PTR_ERR(t); | ||
1379 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
1380 | rnp->boost_kthread_task = t; | ||
1381 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1382 | sp.sched_priority = RCU_BOOST_PRIO; | ||
1383 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
1384 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | ||
1385 | return 0; | ||
1386 | } | ||
1387 | |||
1388 | static void rcu_kthread_do_work(void) | ||
1389 | { | ||
1390 | rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); | ||
1391 | rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); | ||
1392 | rcu_preempt_do_callbacks(); | ||
1393 | } | ||
1394 | |||
1395 | static void rcu_cpu_kthread_setup(unsigned int cpu) | ||
1396 | { | ||
1397 | struct sched_param sp; | ||
1398 | |||
1399 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
1400 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | ||
1401 | } | ||
1402 | |||
1403 | static void rcu_cpu_kthread_park(unsigned int cpu) | ||
1404 | { | ||
1405 | per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; | ||
1406 | } | ||
1407 | |||
1408 | static int rcu_cpu_kthread_should_run(unsigned int cpu) | ||
1409 | { | ||
1410 | return __this_cpu_read(rcu_cpu_has_work); | ||
1411 | } | ||
1412 | |||
1413 | /* | ||
1414 | * Per-CPU kernel thread that invokes RCU callbacks. This replaces the | ||
1415 | * RCU softirq used in flavors and configurations of RCU that do not | ||
1416 | * support RCU priority boosting. | ||
1417 | */ | ||
1418 | static void rcu_cpu_kthread(unsigned int cpu) | ||
1419 | { | ||
1420 | unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); | ||
1421 | char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); | ||
1422 | int spincnt; | ||
1423 | |||
1424 | for (spincnt = 0; spincnt < 10; spincnt++) { | ||
1425 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); | ||
1426 | local_bh_disable(); | ||
1427 | *statusp = RCU_KTHREAD_RUNNING; | ||
1428 | this_cpu_inc(rcu_cpu_kthread_loops); | ||
1429 | local_irq_disable(); | ||
1430 | work = *workp; | ||
1431 | *workp = 0; | ||
1432 | local_irq_enable(); | ||
1433 | if (work) | ||
1434 | rcu_kthread_do_work(); | ||
1435 | local_bh_enable(); | ||
1436 | if (*workp == 0) { | ||
1437 | trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); | ||
1438 | *statusp = RCU_KTHREAD_WAITING; | ||
1439 | return; | ||
1440 | } | ||
1441 | } | ||
1442 | *statusp = RCU_KTHREAD_YIELDING; | ||
1443 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); | ||
1444 | schedule_timeout_interruptible(2); | ||
1445 | trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); | ||
1446 | *statusp = RCU_KTHREAD_WAITING; | ||
1447 | } | ||
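
The kthread above services work for at most ten passes and then briefly sleeps in the YIELDING state, giving other tasks a chance to run even though the kthread is SCHED_FIFO. A minimal user-space sketch of that bounded-spin-then-yield shape, with a plain atomic flag standing in for rcu_cpu_has_work and sched_yield() standing in for the timed yield (illustrative only, not kernel code):

#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int pending_work;			/* stand-in for rcu_cpu_has_work */

static void do_one_batch(void)
{
	/* Stand-in for rcu_kthread_do_work(): invoke one batch of callbacks. */
}

/* Service up to 'budget' batches, then yield, mirroring the spincnt loop. */
static void service_with_budget(int budget)
{
	for (int spincnt = 0; spincnt < budget; spincnt++) {
		int work = atomic_exchange(&pending_work, 0);

		if (work)
			do_one_batch();
		if (atomic_load(&pending_work) == 0)
			return;			/* drained: go back to waiting */
	}
	sched_yield();				/* budget spent: let others run */
}

int main(void)
{
	atomic_store(&pending_work, 1);
	service_with_budget(10);
	puts("done");
	return 0;
}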
1448 | |||
1449 | /* | ||
1450 | * Set the per-rcu_node kthread's affinity to cover all CPUs that are | ||
1451 | * served by the rcu_node in question. The CPU hotplug lock is still | ||
1452 | * held, so the value of rnp->qsmaskinit will be stable. | ||
1453 | * | ||
1454 | * We don't include outgoingcpu in the affinity set; use -1 if there is | ||
1455 | * no outgoing CPU. If there are no CPUs left in the affinity set, | ||
1456 | * this function lets the kthread run on any CPU not served by this rcu_node. | ||
1457 | */ | ||
1458 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | ||
1459 | { | ||
1460 | struct task_struct *t = rnp->boost_kthread_task; | ||
1461 | unsigned long mask = rnp->qsmaskinit; | ||
1462 | cpumask_var_t cm; | ||
1463 | int cpu; | ||
1464 | |||
1465 | if (!t) | ||
1466 | return; | ||
1467 | if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) | ||
1468 | return; | ||
1469 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) | ||
1470 | if ((mask & 0x1) && cpu != outgoingcpu) | ||
1471 | cpumask_set_cpu(cpu, cm); | ||
1472 | if (cpumask_weight(cm) == 0) { | ||
1473 | cpumask_setall(cm); | ||
1474 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) | ||
1475 | cpumask_clear_cpu(cpu, cm); | ||
1476 | WARN_ON_ONCE(cpumask_weight(cm) == 0); | ||
1477 | } | ||
1478 | set_cpus_allowed_ptr(t, cm); | ||
1479 | free_cpumask_var(cm); | ||
1480 | } | ||
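
A user-space sketch of the mask construction above, using a plain 64-bit integer in place of a cpumask; the grplo/grphi/qsmaskinit values in the example are made up for illustration:

#include <stdint.h>
#include <stdio.h>

/*
 * Build an allowed-CPU mask for CPUs grplo..grphi whose bit is set in
 * qsmaskinit, excluding outgoingcpu (-1 means no CPU is going away).
 * A zero result tells the caller to fall back to a wider mask, mirroring
 * the cpumask_setall()/cpumask_clear_cpu() path above.
 */
static uint64_t boost_affinity_mask(int grplo, int grphi,
				    uint64_t qsmaskinit, int outgoingcpu)
{
	uint64_t cm = 0;
	uint64_t mask = qsmaskinit;
	int cpu;

	for (cpu = grplo; cpu <= grphi; cpu++, mask >>= 1)
		if ((mask & 0x1) && cpu != outgoingcpu)
			cm |= 1ULL << cpu;
	return cm;
}

int main(void)
{
	/* CPUs 0-3 on the node, all online (0xf), CPU 2 going offline. */
	printf("affinity mask = %#llx\n",
	       (unsigned long long)boost_affinity_mask(0, 3, 0xf, 2));
	return 0;
}

With those inputs the result is 0xb, that is, CPUs 0, 1, and 3.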
1481 | |||
1482 | static struct smp_hotplug_thread rcu_cpu_thread_spec = { | ||
1483 | .store = &rcu_cpu_kthread_task, | ||
1484 | .thread_should_run = rcu_cpu_kthread_should_run, | ||
1485 | .thread_fn = rcu_cpu_kthread, | ||
1486 | .thread_comm = "rcuc/%u", | ||
1487 | .setup = rcu_cpu_kthread_setup, | ||
1488 | .park = rcu_cpu_kthread_park, | ||
1489 | }; | ||
1490 | |||
1491 | /* | ||
1492 | * Spawn all kthreads -- called as soon as the scheduler is running. | ||
1493 | */ | ||
1494 | static int __init rcu_spawn_kthreads(void) | ||
1495 | { | ||
1496 | struct rcu_node *rnp; | ||
1497 | int cpu; | ||
1498 | |||
1499 | rcu_scheduler_fully_active = 1; | ||
1500 | for_each_possible_cpu(cpu) | ||
1501 | per_cpu(rcu_cpu_has_work, cpu) = 0; | ||
1502 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); | ||
1503 | rnp = rcu_get_root(rcu_state); | ||
1504 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); | ||
1505 | if (NUM_RCU_NODES > 1) { | ||
1506 | rcu_for_each_leaf_node(rcu_state, rnp) | ||
1507 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); | ||
1508 | } | ||
1509 | return 0; | ||
1510 | } | ||
1511 | early_initcall(rcu_spawn_kthreads); | ||
1512 | |||
1513 | static void rcu_prepare_kthreads(int cpu) | ||
1514 | { | ||
1515 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | ||
1516 | struct rcu_node *rnp = rdp->mynode; | ||
1517 | |||
1518 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ | ||
1519 | if (rcu_scheduler_fully_active) | ||
1520 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); | ||
1521 | } | ||
1522 | |||
1523 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
1524 | |||
1525 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | ||
1526 | { | ||
1527 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
1528 | } | ||
1529 | |||
1530 | static void invoke_rcu_callbacks_kthread(void) | ||
1531 | { | ||
1532 | WARN_ON_ONCE(1); | ||
1533 | } | ||
1534 | |||
1535 | static bool rcu_is_callbacks_kthread(void) | ||
1536 | { | ||
1537 | return false; | ||
1538 | } | ||
1539 | |||
1540 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | ||
1541 | { | ||
1542 | } | ||
1543 | |||
1544 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | ||
1545 | { | ||
1546 | } | ||
1547 | |||
1548 | static int __init rcu_scheduler_really_started(void) | ||
1549 | { | ||
1550 | rcu_scheduler_fully_active = 1; | ||
1551 | return 0; | ||
1552 | } | ||
1553 | early_initcall(rcu_scheduler_really_started); | ||
1554 | |||
1555 | static void rcu_prepare_kthreads(int cpu) | ||
1556 | { | ||
1557 | } | ||
1558 | |||
1559 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
1560 | |||
1561 | #if !defined(CONFIG_RCU_FAST_NO_HZ) | ||
1562 | |||
1563 | /* | ||
1564 | * Check to see if any future RCU-related work will need to be done | ||
1565 | * by the current CPU, even if none need be done immediately, returning | ||
1566 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
1567 | * an exported member of the RCU API. | ||
1568 | * | ||
1569 | * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs | ||
1570 | * any flavor of RCU. | ||
1571 | */ | ||
1572 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | ||
1573 | { | ||
1574 | *delta_jiffies = ULONG_MAX; | ||
1575 | return rcu_cpu_has_callbacks(cpu, NULL); | ||
1576 | } | ||
1577 | |||
1578 | /* | ||
1579 | * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up | ||
1580 | * after it. | ||
1581 | */ | ||
1582 | static void rcu_cleanup_after_idle(int cpu) | ||
1583 | { | ||
1584 | } | ||
1585 | |||
1586 | /* | ||
1587 | * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, | ||
1588 | * is nothing. | ||
1589 | */ | ||
1590 | static void rcu_prepare_for_idle(int cpu) | ||
1591 | { | ||
1592 | } | ||
1593 | |||
1594 | /* | ||
1595 | * Don't bother keeping a running count of the number of RCU callbacks | ||
1596 | * posted because CONFIG_RCU_FAST_NO_HZ=n. | ||
1597 | */ | ||
1598 | static void rcu_idle_count_callbacks_posted(void) | ||
1599 | { | ||
1600 | } | ||
1601 | |||
1602 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | ||
1603 | |||
1604 | /* | ||
1605 | * This code is invoked when a CPU goes idle, at which point we want | ||
1606 | * to have the CPU do everything required for RCU so that it can enter | ||
1607 | * the energy-efficient dyntick-idle mode. This is handled by a | ||
1608 | * state machine implemented by rcu_prepare_for_idle() below. | ||
1609 | * | ||
1610 | * The following two preprocessor symbols control this state machine: | ||
1611 | * | ||
1612 | * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted | ||
1613 | * to sleep in dyntick-idle mode with RCU callbacks pending. This | ||
1614 | * is sized to be roughly one RCU grace period. Those energy-efficiency | ||
1615 | * benchmarkers who might otherwise be tempted to set this to a large | ||
1616 | * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your | ||
1617 | * system. And if you are -that- concerned about energy efficiency, | ||
1618 | * just power the system down and be done with it! | ||
1619 | * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is | ||
1620 | * permitted to sleep in dyntick-idle mode with only lazy RCU | ||
1621 | * callbacks pending. Setting this too high can OOM your system. | ||
1622 | * | ||
1623 | * The values below work well in practice. If future workloads require | ||
1624 | * adjustment, they can be converted into kernel config parameters, though | ||
1625 | * making the state machine smarter might be a better option. | ||
1626 | */ | ||
1627 | #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ | ||
1628 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | ||
1629 | |||
1630 | static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; | ||
1631 | module_param(rcu_idle_gp_delay, int, 0644); | ||
1632 | static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; | ||
1633 | module_param(rcu_idle_lazy_gp_delay, int, 0644); | ||
1634 | |||
1635 | extern int tick_nohz_enabled; | ||
1636 | |||
1637 | /* | ||
1638 | * Try to advance callbacks for all flavors of RCU on the current CPU, but | ||
1639 | * only if it has been a while since the last time we did so. Afterwards, | ||
1640 | * if there are any callbacks ready for immediate invocation, return true. | ||
1641 | */ | ||
1642 | static bool rcu_try_advance_all_cbs(void) | ||
1643 | { | ||
1644 | bool cbs_ready = false; | ||
1645 | struct rcu_data *rdp; | ||
1646 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
1647 | struct rcu_node *rnp; | ||
1648 | struct rcu_state *rsp; | ||
1649 | |||
1650 | /* Exit early if we advanced recently. */ | ||
1651 | if (jiffies == rdtp->last_advance_all) | ||
1652 | return 0; | ||
1653 | rdtp->last_advance_all = jiffies; | ||
1654 | |||
1655 | for_each_rcu_flavor(rsp) { | ||
1656 | rdp = this_cpu_ptr(rsp->rda); | ||
1657 | rnp = rdp->mynode; | ||
1658 | |||
1659 | /* | ||
1660 | * Don't bother checking unless a grace period has | ||
1661 | * completed since we last checked and there are | ||
1662 | * callbacks not yet ready to invoke. | ||
1663 | */ | ||
1664 | if (rdp->completed != rnp->completed && | ||
1665 | rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) | ||
1666 | note_gp_changes(rsp, rdp); | ||
1667 | |||
1668 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | ||
1669 | cbs_ready = true; | ||
1670 | } | ||
1671 | return cbs_ready; | ||
1672 | } | ||
1673 | |||
1674 | /* | ||
1675 | * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready | ||
1676 | * to invoke. If the CPU has callbacks, try to advance them. Tell the | ||
1677 | * caller to set the timeout based on whether or not there are non-lazy | ||
1678 | * callbacks. | ||
1679 | * | ||
1680 | * The caller must have disabled interrupts. | ||
1681 | */ | ||
1682 | int rcu_needs_cpu(int cpu, unsigned long *dj) | ||
1683 | { | ||
1684 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
1685 | |||
1686 | /* Snapshot to detect later posting of non-lazy callback. */ | ||
1687 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
1688 | |||
1689 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
1690 | if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { | ||
1691 | *dj = ULONG_MAX; | ||
1692 | return 0; | ||
1693 | } | ||
1694 | |||
1695 | /* Attempt to advance callbacks. */ | ||
1696 | if (rcu_try_advance_all_cbs()) { | ||
1697 | /* Some ready to invoke, so initiate later invocation. */ | ||
1698 | invoke_rcu_core(); | ||
1699 | return 1; | ||
1700 | } | ||
1701 | rdtp->last_accelerate = jiffies; | ||
1702 | |||
1703 | /* Request timer delay depending on laziness, and round. */ | ||
1704 | if (!rdtp->all_lazy) { | ||
1705 | *dj = round_up(rcu_idle_gp_delay + jiffies, | ||
1706 | rcu_idle_gp_delay) - jiffies; | ||
1707 | } else { | ||
1708 | *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; | ||
1709 | } | ||
1710 | return 0; | ||
1711 | } | ||
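
The two branches above pick very different timeouts: with only lazy callbacks the CPU may sleep for roughly rcu_idle_lazy_gp_delay (about six seconds by default), while with non-lazy callbacks the wakeup is rounded up to a multiple of rcu_idle_gp_delay, which tends to batch the wakeups of different idle CPUs onto the same jiffy. A worked example of the non-lazy rounding with the default delay of 4 and a hypothetical jiffies value (a generic round-up is shown; the kernel's round_up() additionally requires a power-of-two multiple):

#include <stdio.h>

#define ROUND_UP(x, y)	((((x) + (y) - 1) / (y)) * (y))	/* generic form */

int main(void)
{
	unsigned long jiffies = 1000003;	/* hypothetical current time */
	unsigned long gp_delay = 4;		/* RCU_IDLE_GP_DELAY default */
	unsigned long dj;

	/* Wake at the first multiple of gp_delay at or after jiffies + 4. */
	dj = ROUND_UP(jiffies + gp_delay, gp_delay) - jiffies;
	printf("sleep %lu jiffies, waking at %lu\n", dj, jiffies + dj);
	return 0;
}

Here dj comes out as 5, waking at jiffy 1000008, a multiple of 4 shared with any other CPU that did the same computation around the same time.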
1712 | |||
1713 | /* | ||
1714 | * Prepare a CPU for idle from an RCU perspective. The first major task | ||
1715 | * is to sense whether nohz mode has been enabled or disabled via sysfs. | ||
1716 | * The second major task is to check to see if a non-lazy callback has | ||
1717 | * arrived at a CPU that previously had only lazy callbacks. The third | ||
1718 | * major task is to accelerate (that is, assign grace-period numbers to) | ||
1719 | * any recently arrived callbacks. | ||
1720 | * | ||
1721 | * The caller must have disabled interrupts. | ||
1722 | */ | ||
1723 | static void rcu_prepare_for_idle(int cpu) | ||
1724 | { | ||
1725 | struct rcu_data *rdp; | ||
1726 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
1727 | struct rcu_node *rnp; | ||
1728 | struct rcu_state *rsp; | ||
1729 | int tne; | ||
1730 | |||
1731 | /* Handle nohz enablement switches conservatively. */ | ||
1732 | tne = ACCESS_ONCE(tick_nohz_enabled); | ||
1733 | if (tne != rdtp->tick_nohz_enabled_snap) { | ||
1734 | if (rcu_cpu_has_callbacks(cpu, NULL)) | ||
1735 | invoke_rcu_core(); /* force nohz to see update. */ | ||
1736 | rdtp->tick_nohz_enabled_snap = tne; | ||
1737 | return; | ||
1738 | } | ||
1739 | if (!tne) | ||
1740 | return; | ||
1741 | |||
1742 | /* If this is a no-CBs CPU, no callbacks, just return. */ | ||
1743 | if (rcu_is_nocb_cpu(cpu)) | ||
1744 | return; | ||
1745 | |||
1746 | /* | ||
1747 | * If a non-lazy callback arrived at a CPU having only lazy | ||
1748 | * callbacks, invoke RCU core for the side-effect of recalculating | ||
1749 | * idle duration on re-entry to idle. | ||
1750 | */ | ||
1751 | if (rdtp->all_lazy && | ||
1752 | rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { | ||
1753 | rdtp->all_lazy = false; | ||
1754 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
1755 | invoke_rcu_core(); | ||
1756 | return; | ||
1757 | } | ||
1758 | |||
1759 | /* | ||
1760 | * If we have not yet accelerated this jiffy, accelerate all | ||
1761 | * callbacks on this CPU. | ||
1762 | */ | ||
1763 | if (rdtp->last_accelerate == jiffies) | ||
1764 | return; | ||
1765 | rdtp->last_accelerate = jiffies; | ||
1766 | for_each_rcu_flavor(rsp) { | ||
1767 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
1768 | if (!*rdp->nxttail[RCU_DONE_TAIL]) | ||
1769 | continue; | ||
1770 | rnp = rdp->mynode; | ||
1771 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
1772 | rcu_accelerate_cbs(rsp, rnp, rdp); | ||
1773 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
1774 | } | ||
1775 | } | ||
1776 | |||
1777 | /* | ||
1778 | * Clean up for exit from idle. Attempt to advance callbacks based on | ||
1779 | * any grace periods that elapsed while the CPU was idle, and if any | ||
1780 | * callbacks are now ready to invoke, initiate invocation. | ||
1781 | */ | ||
1782 | static void rcu_cleanup_after_idle(int cpu) | ||
1783 | { | ||
1784 | |||
1785 | if (rcu_is_nocb_cpu(cpu)) | ||
1786 | return; | ||
1787 | if (rcu_try_advance_all_cbs()) | ||
1788 | invoke_rcu_core(); | ||
1789 | } | ||
1790 | |||
1791 | /* | ||
1792 | * Keep a running count of the number of non-lazy callbacks posted | ||
1793 | * on this CPU. This running counter (which is never decremented) allows | ||
1794 | * rcu_prepare_for_idle() to detect when something out of the idle loop | ||
1795 | * posts a callback, even if an equal number of callbacks are invoked. | ||
1796 | * Of course, callbacks should only be posted from within a trace event | ||
1797 | * designed to be called from idle or from within RCU_NONIDLE(). | ||
1798 | */ | ||
1799 | static void rcu_idle_count_callbacks_posted(void) | ||
1800 | { | ||
1801 | __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); | ||
1802 | } | ||
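
The counter above is deliberately never decremented; rcu_needs_cpu() snapshots it into nonlazy_posted_snap at idle entry, and rcu_prepare_for_idle() treats any later difference as proof that fresh non-lazy work arrived, even if just as many callbacks were invoked in the meantime. A tiny sketch of that snapshot-and-compare idiom (user-space, illustrative only):

#include <stdbool.h>
#include <stdio.h>

static unsigned long nonlazy_posted;		/* only ever incremented */
static unsigned long nonlazy_posted_snap;	/* snapshot at idle entry */

static void post_nonlazy_callback(void)
{
	nonlazy_posted++;
}

static void prepare_for_idle(void)
{
	nonlazy_posted_snap = nonlazy_posted;
}

/* True iff a non-lazy callback was posted since prepare_for_idle(). */
static bool nonlazy_arrived_since_idle(void)
{
	return nonlazy_posted != nonlazy_posted_snap;
}

int main(void)
{
	prepare_for_idle();
	post_nonlazy_callback();
	printf("new non-lazy work: %s\n",
	       nonlazy_arrived_since_idle() ? "yes" : "no");
	return 0;
}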
1803 | |||
1804 | /* | ||
1805 | * Data for flushing lazy RCU callbacks at OOM time. | ||
1806 | */ | ||
1807 | static atomic_t oom_callback_count; | ||
1808 | static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); | ||
1809 | |||
1810 | /* | ||
1811 | * RCU OOM callback -- decrement the outstanding count and deliver the | ||
1812 | * wake-up if we are the last one. | ||
1813 | */ | ||
1814 | static void rcu_oom_callback(struct rcu_head *rhp) | ||
1815 | { | ||
1816 | if (atomic_dec_and_test(&oom_callback_count)) | ||
1817 | wake_up(&oom_callback_wq); | ||
1818 | } | ||
1819 | |||
1820 | /* | ||
1821 | * Post an rcu_oom_notify callback on the current CPU if it has at | ||
1822 | * least one lazy callback. This will unnecessarily post callbacks | ||
1823 | * to CPUs that already have a non-lazy callback at the end of their | ||
1824 | * callback list, but this is an infrequent operation, so accept some | ||
1825 | * extra overhead to keep things simple. | ||
1826 | */ | ||
1827 | static void rcu_oom_notify_cpu(void *unused) | ||
1828 | { | ||
1829 | struct rcu_state *rsp; | ||
1830 | struct rcu_data *rdp; | ||
1831 | |||
1832 | for_each_rcu_flavor(rsp) { | ||
1833 | rdp = __this_cpu_ptr(rsp->rda); | ||
1834 | if (rdp->qlen_lazy != 0) { | ||
1835 | atomic_inc(&oom_callback_count); | ||
1836 | rsp->call(&rdp->oom_head, rcu_oom_callback); | ||
1837 | } | ||
1838 | } | ||
1839 | } | ||
1840 | |||
1841 | /* | ||
1842 | * If low on memory, ensure that each CPU has a non-lazy callback. | ||
1843 | * This will wake up CPUs that have only lazy callbacks, in turn | ||
1844 | * ensuring that they free up the corresponding memory in a timely manner. | ||
1845 | * Because an uncertain amount of memory will be freed in some uncertain | ||
1846 | * timeframe, we do not claim to have freed anything. | ||
1847 | */ | ||
1848 | static int rcu_oom_notify(struct notifier_block *self, | ||
1849 | unsigned long notused, void *nfreed) | ||
1850 | { | ||
1851 | int cpu; | ||
1852 | |||
1853 | /* Wait for callbacks from earlier instance to complete. */ | ||
1854 | wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); | ||
1855 | |||
1856 | /* | ||
1857 | * Prevent premature wakeup: ensure that all increments happen | ||
1858 | * before there is a chance of the counter reaching zero. | ||
1859 | */ | ||
1860 | atomic_set(&oom_callback_count, 1); | ||
1861 | |||
1862 | get_online_cpus(); | ||
1863 | for_each_online_cpu(cpu) { | ||
1864 | smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); | ||
1865 | cond_resched(); | ||
1866 | } | ||
1867 | put_online_cpus(); | ||
1868 | |||
1869 | /* Unconditionally decrement: no need to wake ourselves up. */ | ||
1870 | atomic_dec(&oom_callback_count); | ||
1871 | |||
1872 | return NOTIFY_OK; | ||
1873 | } | ||
1874 | |||
1875 | static struct notifier_block rcu_oom_nb = { | ||
1876 | .notifier_call = rcu_oom_notify | ||
1877 | }; | ||
1878 | |||
1879 | static int __init rcu_register_oom_notifier(void) | ||
1880 | { | ||
1881 | register_oom_notifier(&rcu_oom_nb); | ||
1882 | return 0; | ||
1883 | } | ||
1884 | early_initcall(rcu_register_oom_notifier); | ||
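
rcu_oom_notify() pairs an atomic outstanding-callback count with a waitqueue and pre-biases the count to 1 so that early completions cannot reach zero while later callbacks are still being posted; the final unconditional decrement drops the bias. A user-space sketch of that bias-then-release counting (illustrative only; the printf stands in for the wake_up()):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int outstanding;		/* analogue of oom_callback_count */

/* Analogue of rcu_oom_callback(): the last completion does the wakeup. */
static void one_callback_done(void)
{
	if (atomic_fetch_sub(&outstanding, 1) == 1)
		printf("count hit zero: wake the waiter\n");
}

int main(void)
{
	int posted = 3;			/* hypothetical CPUs with lazy CBs */
	int i;

	/* Bias to 1 before posting so the count cannot hit zero early. */
	atomic_store(&outstanding, 1);
	for (i = 0; i < posted; i++)
		atomic_fetch_add(&outstanding, 1);

	/* The posted callbacks complete in some order... */
	for (i = 0; i < posted; i++)
		one_callback_done();

	/* ...and only dropping the bias lets the count reach zero. */
	one_callback_done();
	return 0;
}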
1885 | |||
1886 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | ||
1887 | |||
1888 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
1889 | |||
1890 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
1891 | |||
1892 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | ||
1893 | { | ||
1894 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
1895 | unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; | ||
1896 | |||
1897 | sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", | ||
1898 | rdtp->last_accelerate & 0xffff, jiffies & 0xffff, | ||
1899 | ulong2long(nlpd), | ||
1900 | rdtp->all_lazy ? 'L' : '.', | ||
1901 | rdtp->tick_nohz_enabled_snap ? '.' : 'D'); | ||
1902 | } | ||
1903 | |||
1904 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
1905 | |||
1906 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | ||
1907 | { | ||
1908 | *cp = '\0'; | ||
1909 | } | ||
1910 | |||
1911 | #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
1912 | |||
1913 | /* Initiate the stall-info list. */ | ||
1914 | static void print_cpu_stall_info_begin(void) | ||
1915 | { | ||
1916 | pr_cont("\n"); | ||
1917 | } | ||
1918 | |||
1919 | /* | ||
1920 | * Print out diagnostic information for the specified stalled CPU. | ||
1921 | * | ||
1922 | * If the specified CPU is aware of the current RCU grace period | ||
1923 | * (flavor specified by rsp), then print the number of scheduling | ||
1924 | * clock interrupts the CPU has taken during the time that it has | ||
1925 | * been aware. Otherwise, print the number of RCU grace periods | ||
1926 | * that this CPU is ignorant of, for example, "1" if the CPU was | ||
1927 | * aware of the previous grace period. | ||
1928 | * | ||
1929 | * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. | ||
1930 | */ | ||
1931 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | ||
1932 | { | ||
1933 | char fast_no_hz[72]; | ||
1934 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
1935 | struct rcu_dynticks *rdtp = rdp->dynticks; | ||
1936 | char *ticks_title; | ||
1937 | unsigned long ticks_value; | ||
1938 | |||
1939 | if (rsp->gpnum == rdp->gpnum) { | ||
1940 | ticks_title = "ticks this GP"; | ||
1941 | ticks_value = rdp->ticks_this_gp; | ||
1942 | } else { | ||
1943 | ticks_title = "GPs behind"; | ||
1944 | ticks_value = rsp->gpnum - rdp->gpnum; | ||
1945 | } | ||
1946 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); | ||
1947 | pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", | ||
1948 | cpu, ticks_value, ticks_title, | ||
1949 | atomic_read(&rdtp->dynticks) & 0xfff, | ||
1950 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, | ||
1951 | rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), | ||
1952 | fast_no_hz); | ||
1953 | } | ||
1954 | |||
1955 | /* Terminate the stall-info list. */ | ||
1956 | static void print_cpu_stall_info_end(void) | ||
1957 | { | ||
1958 | pr_err("\t"); | ||
1959 | } | ||
1960 | |||
1961 | /* Zero ->ticks_this_gp for all flavors of RCU. */ | ||
1962 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) | ||
1963 | { | ||
1964 | rdp->ticks_this_gp = 0; | ||
1965 | rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); | ||
1966 | } | ||
1967 | |||
1968 | /* Increment ->ticks_this_gp for all flavors of RCU. */ | ||
1969 | static void increment_cpu_stall_ticks(void) | ||
1970 | { | ||
1971 | struct rcu_state *rsp; | ||
1972 | |||
1973 | for_each_rcu_flavor(rsp) | ||
1974 | __this_cpu_ptr(rsp->rda)->ticks_this_gp++; | ||
1975 | } | ||
1976 | |||
1977 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
1978 | |||
1979 | static void print_cpu_stall_info_begin(void) | ||
1980 | { | ||
1981 | pr_cont(" {"); | ||
1982 | } | ||
1983 | |||
1984 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | ||
1985 | { | ||
1986 | pr_cont(" %d", cpu); | ||
1987 | } | ||
1988 | |||
1989 | static void print_cpu_stall_info_end(void) | ||
1990 | { | ||
1991 | pr_cont("} "); | ||
1992 | } | ||
1993 | |||
1994 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) | ||
1995 | { | ||
1996 | } | ||
1997 | |||
1998 | static void increment_cpu_stall_ticks(void) | ||
1999 | { | ||
2000 | } | ||
2001 | |||
2002 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
2003 | |||
2004 | #ifdef CONFIG_RCU_NOCB_CPU | ||
2005 | |||
2006 | /* | ||
2007 | * Offload callback processing from the boot-time-specified set of CPUs | ||
2008 | * specified by rcu_nocb_mask. For each CPU in the set, there is a | ||
2009 | * kthread created that pulls the callbacks from the corresponding CPU, | ||
2010 | * waits for a grace period to elapse, and invokes the callbacks. | ||
2011 | * The no-CBs CPUs do a wake_up() on their kthread when they insert | ||
2012 | * a callback into any empty list, unless the rcu_nocb_poll boot parameter | ||
2013 | * has been specified, in which case each kthread actively polls its | ||
2014 | * CPU. (Which isn't so great for energy efficiency, but which does | ||
2015 | * reduce RCU's overhead on that CPU.) | ||
2016 | * | ||
2017 | * This is intended to be used in conjunction with Frederic Weisbecker's | ||
2018 | * adaptive-idle work, which would seriously reduce OS jitter on CPUs | ||
2019 | * running CPU-bound user-mode computations. | ||
2020 | * | ||
2021 | * Offloading of callback processing could also in theory be used as | ||
2022 | * an energy-efficiency measure because CPUs with no RCU callbacks | ||
2023 | * queued are more aggressive about entering dyntick-idle mode. | ||
2024 | */ | ||
2025 | |||
2026 | |||
2027 | /* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ | ||
2028 | static int __init rcu_nocb_setup(char *str) | ||
2029 | { | ||
2030 | alloc_bootmem_cpumask_var(&rcu_nocb_mask); | ||
2031 | have_rcu_nocb_mask = true; | ||
2032 | cpulist_parse(str, rcu_nocb_mask); | ||
2033 | return 1; | ||
2034 | } | ||
2035 | __setup("rcu_nocbs=", rcu_nocb_setup); | ||
2036 | |||
2037 | static int __init parse_rcu_nocb_poll(char *arg) | ||
2038 | { | ||
2039 | rcu_nocb_poll = 1; | ||
2040 | return 0; | ||
2041 | } | ||
2042 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); | ||
2043 | |||
2044 | /* | ||
2045 | * Do any no-CBs CPUs need another grace period? | ||
2046 | * | ||
2047 | * Interrupts must be disabled. If the caller does not hold the root | ||
2048 | * rcu_node structure's ->lock, the results are advisory only. | ||
2049 | */ | ||
2050 | static int rcu_nocb_needs_gp(struct rcu_state *rsp) | ||
2051 | { | ||
2052 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
2053 | |||
2054 | return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; | ||
2055 | } | ||
2056 | |||
2057 | /* | ||
2058 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended | ||
2059 | * grace period. | ||
2060 | */ | ||
2061 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | ||
2062 | { | ||
2063 | wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); | ||
2064 | } | ||
2065 | |||
2066 | /* | ||
2067 | * Set the root rcu_node structure's ->need_future_gp field | ||
2068 | * based on the sum of those of all rcu_node structures. This does | ||
2069 | * double-count the root rcu_node structure's requests, but this | ||
2070 | * is necessary to handle the possibility of a rcu_nocb_kthread() | ||
2071 | * having awakened during the time that the rcu_node structures | ||
2072 | * were being updated for the end of the previous grace period. | ||
2073 | */ | ||
2074 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) | ||
2075 | { | ||
2076 | rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; | ||
2077 | } | ||
2078 | |||
2079 | static void rcu_init_one_nocb(struct rcu_node *rnp) | ||
2080 | { | ||
2081 | init_waitqueue_head(&rnp->nocb_gp_wq[0]); | ||
2082 | init_waitqueue_head(&rnp->nocb_gp_wq[1]); | ||
2083 | } | ||
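
need_future_gp[] and nocb_gp_wq[] are both two-slot arrays indexed by the low bit of a grace-period number: rcu_nocb_gp_set() adds requests to slot (rnp->completed + 1) & 0x1 and rcu_nocb_needs_gp() reads that same slot, while rcu_nocb_gp_cleanup() wakes the waitqueue at index rnp->completed & 0x1 for the grace period that just ended. A small sketch of the parity-indexed double buffer (illustrative only; a plain counter in place of the waitqueues):

#include <stdio.h>

static unsigned long completed;		/* last completed grace period */
static int need_future_gp[2];		/* requests, indexed by GP parity */

/* Record a request for the grace period after the current one. */
static void request_future_gp(void)
{
	need_future_gp[(completed + 1) & 0x1]++;
}

/* End of a grace period: drain the slot that was collecting for it. */
static void gp_cleanup(void)
{
	completed++;
	printf("GP %lu done, %d waiter(s) to wake\n",
	       completed, need_future_gp[completed & 0x1]);
	need_future_gp[completed & 0x1] = 0;
	/* The other slot keeps collecting requests for the next GP. */
}

int main(void)
{
	request_future_gp();		/* lands in slot 1 while GP 1 runs */
	request_future_gp();
	gp_cleanup();			/* completes GP 1, drains slot 1 */
	return 0;
}

Indexing by parity keeps requests for two adjacent grace-period numbers in separate slots without tracking the numbers themselves.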
2084 | |||
2085 | /* Is the specified CPU a no-CBs CPU? */ | ||
2086 | bool rcu_is_nocb_cpu(int cpu) | ||
2087 | { | ||
2088 | if (have_rcu_nocb_mask) | ||
2089 | return cpumask_test_cpu(cpu, rcu_nocb_mask); | ||
2090 | return false; | ||
2091 | } | ||
2092 | |||
2093 | /* | ||
2094 | * Enqueue the specified string of rcu_head structures onto the specified | ||
2095 | * CPU's no-CBs lists. The CPU is specified by rdp, the head of the | ||
2096 | * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy | ||
2097 | * counts are supplied by rhcount and rhcount_lazy. | ||
2098 | * | ||
2099 | * If warranted, also wake up the kthread servicing this CPU's queues. | ||
2100 | */ | ||
2101 | static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | ||
2102 | struct rcu_head *rhp, | ||
2103 | struct rcu_head **rhtp, | ||
2104 | int rhcount, int rhcount_lazy) | ||
2105 | { | ||
2106 | int len; | ||
2107 | struct rcu_head **old_rhpp; | ||
2108 | struct task_struct *t; | ||
2109 | |||
2110 | /* Enqueue the callback on the nocb list and update counts. */ | ||
2111 | old_rhpp = xchg(&rdp->nocb_tail, rhtp); | ||
2112 | ACCESS_ONCE(*old_rhpp) = rhp; | ||
2113 | atomic_long_add(rhcount, &rdp->nocb_q_count); | ||
2114 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); | ||
2115 | |||
2116 | /* If we are not being polled and there is a kthread, awaken it ... */ | ||
2117 | t = ACCESS_ONCE(rdp->nocb_kthread); | ||
2118 | if (rcu_nocb_poll || !t) { | ||
2119 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2120 | TPS("WakeNotPoll")); | ||
2121 | return; | ||
2122 | } | ||
2123 | len = atomic_long_read(&rdp->nocb_q_count); | ||
2124 | if (old_rhpp == &rdp->nocb_head) { | ||
2125 | wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ | ||
2126 | rdp->qlen_last_fqs_check = 0; | ||
2127 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); | ||
2128 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { | ||
2129 | wake_up_process(t); /* ... or if many callbacks queued. */ | ||
2130 | rdp->qlen_last_fqs_check = LONG_MAX / 2; | ||
2131 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); | ||
2132 | } else { | ||
2133 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); | ||
2134 | } | ||
2135 | return; | ||
2136 | } | ||
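
The enqueue above is a lock-free multi-producer append: xchg() atomically swings rdp->nocb_tail to the new element's ->next field and hands back the previous tail slot, which the producer then fills in. A consumer that finds a NULL ->next before reaching the saved tail (as rcu_nocb_kthread() does below) knows a producer is mid-append and waits briefly. A self-contained user-space sketch of the producer side with C11 atomics (illustrative only; one global list instead of per-CPU rcu_data):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct cb {
	struct cb *next;
	int id;
};

static struct cb *head;			/* list of queued callbacks */
static _Atomic(struct cb **) tail;	/* points at the last ->next slot */

/* Multi-producer append: claim the tail slot first, then link into it. */
static void enqueue(struct cb *e)
{
	struct cb **old_tail;

	e->next = NULL;
	/* Atomically advertise our ->next as the new tail... */
	old_tail = atomic_exchange(&tail, &e->next);
	/* ...then link the predecessor to us; a reader may briefly see NULL. */
	*old_tail = e;
}

int main(void)
{
	struct cb a = { .id = 1 }, b = { .id = 2 };
	struct cb *p;

	atomic_init(&tail, &head);
	enqueue(&a);
	enqueue(&b);
	for (p = head; p; p = p->next)
		printf("cb %d\n", p->id);
	return 0;
}

Swinging the tail before writing the link is what makes concurrent producers safe: each gets a private predecessor slot to fill in, at the cost of a transient NULL that consumers must tolerate.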
2137 | |||
2138 | /* | ||
2139 | * This is a helper for __call_rcu(), which invokes this when the normal | ||
2140 | * callback queue is inoperable. If this is not a no-CBs CPU, this | ||
2141 | * function returns failure back to __call_rcu(), which can complain | ||
2142 | * appropriately. | ||
2143 | * | ||
2144 | * Otherwise, this function queues the callback where the corresponding | ||
2145 | * "rcuo" kthread can find it. | ||
2146 | */ | ||
2147 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
2148 | bool lazy) | ||
2149 | { | ||
2150 | |||
2151 | if (!rcu_is_nocb_cpu(rdp->cpu)) | ||
2152 | return 0; | ||
2153 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); | ||
2154 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) | ||
2155 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, | ||
2156 | (unsigned long)rhp->func, | ||
2157 | -atomic_long_read(&rdp->nocb_q_count_lazy), | ||
2158 | -atomic_long_read(&rdp->nocb_q_count)); | ||
2159 | else | ||
2160 | trace_rcu_callback(rdp->rsp->name, rhp, | ||
2161 | -atomic_long_read(&rdp->nocb_q_count_lazy), | ||
2162 | -atomic_long_read(&rdp->nocb_q_count)); | ||
2163 | return 1; | ||
2164 | } | ||
2165 | |||
2166 | /* | ||
2167 | * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is | ||
2168 | * not a no-CBs CPU. | ||
2169 | */ | ||
2170 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
2171 | struct rcu_data *rdp) | ||
2172 | { | ||
2173 | long ql = rsp->qlen; | ||
2174 | long qll = rsp->qlen_lazy; | ||
2175 | |||
2176 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ | ||
2177 | if (!rcu_is_nocb_cpu(smp_processor_id())) | ||
2178 | return 0; | ||
2179 | rsp->qlen = 0; | ||
2180 | rsp->qlen_lazy = 0; | ||
2181 | |||
2182 | /* First, enqueue the donelist, if any. This preserves CB ordering. */ | ||
2183 | if (rsp->orphan_donelist != NULL) { | ||
2184 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, | ||
2185 | rsp->orphan_donetail, ql, qll); | ||
2186 | ql = qll = 0; | ||
2187 | rsp->orphan_donelist = NULL; | ||
2188 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
2189 | } | ||
2190 | if (rsp->orphan_nxtlist != NULL) { | ||
2191 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, | ||
2192 | rsp->orphan_nxttail, ql, qll); | ||
2193 | ql = qll = 0; | ||
2194 | rsp->orphan_nxtlist = NULL; | ||
2195 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
2196 | } | ||
2197 | return 1; | ||
2198 | } | ||
2199 | |||
2200 | /* | ||
2201 | * If necessary, kick off a new grace period, and either way wait | ||
2202 | * for a subsequent grace period to complete. | ||
2203 | */ | ||
2204 | static void rcu_nocb_wait_gp(struct rcu_data *rdp) | ||
2205 | { | ||
2206 | unsigned long c; | ||
2207 | bool d; | ||
2208 | unsigned long flags; | ||
2209 | struct rcu_node *rnp = rdp->mynode; | ||
2210 | |||
2211 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
2212 | c = rcu_start_future_gp(rnp, rdp); | ||
2213 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
2214 | |||
2215 | /* | ||
2216 | * Wait for the grace period. Do so interruptibly to avoid messing | ||
2217 | * up the load average. | ||
2218 | */ | ||
2219 | trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); | ||
2220 | for (;;) { | ||
2221 | wait_event_interruptible( | ||
2222 | rnp->nocb_gp_wq[c & 0x1], | ||
2223 | (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); | ||
2224 | if (likely(d)) | ||
2225 | break; | ||
2226 | flush_signals(current); | ||
2227 | trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); | ||
2228 | } | ||
2229 | trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); | ||
2230 | smp_mb(); /* Ensure that CB invocation happens after GP end. */ | ||
2231 | } | ||
2232 | |||
2233 | /* | ||
2234 | * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes | ||
2235 | * callbacks queued by the corresponding no-CBs CPU. | ||
2236 | */ | ||
2237 | static int rcu_nocb_kthread(void *arg) | ||
2238 | { | ||
2239 | int c, cl; | ||
2240 | bool firsttime = 1; | ||
2241 | struct rcu_head *list; | ||
2242 | struct rcu_head *next; | ||
2243 | struct rcu_head **tail; | ||
2244 | struct rcu_data *rdp = arg; | ||
2245 | |||
2246 | /* Each pass through this loop invokes one batch of callbacks */ | ||
2247 | for (;;) { | ||
2248 | /* If not polling, wait for next batch of callbacks. */ | ||
2249 | if (!rcu_nocb_poll) { | ||
2250 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2251 | TPS("Sleep")); | ||
2252 | wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); | ||
2253 | } else if (firsttime) { | ||
2254 | firsttime = 0; | ||
2255 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2256 | TPS("Poll")); | ||
2257 | } | ||
2258 | list = ACCESS_ONCE(rdp->nocb_head); | ||
2259 | if (!list) { | ||
2260 | if (!rcu_nocb_poll) | ||
2261 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2262 | TPS("WokeEmpty")); | ||
2263 | schedule_timeout_interruptible(1); | ||
2264 | flush_signals(current); | ||
2265 | continue; | ||
2266 | } | ||
2267 | firsttime = 1; | ||
2268 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2269 | TPS("WokeNonEmpty")); | ||
2270 | |||
2271 | /* | ||
2272 | * Extract queued callbacks, update counts, and wait | ||
2273 | * for a grace period to elapse. | ||
2274 | */ | ||
2275 | ACCESS_ONCE(rdp->nocb_head) = NULL; | ||
2276 | tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); | ||
2277 | c = atomic_long_xchg(&rdp->nocb_q_count, 0); | ||
2278 | cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); | ||
2279 | ACCESS_ONCE(rdp->nocb_p_count) += c; | ||
2280 | ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; | ||
2281 | rcu_nocb_wait_gp(rdp); | ||
2282 | |||
2283 | /* Each pass through the following loop invokes a callback. */ | ||
2284 | trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); | ||
2285 | c = cl = 0; | ||
2286 | while (list) { | ||
2287 | next = list->next; | ||
2288 | /* Wait for enqueuing to complete, if needed. */ | ||
2289 | while (next == NULL && &list->next != tail) { | ||
2290 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2291 | TPS("WaitQueue")); | ||
2292 | schedule_timeout_interruptible(1); | ||
2293 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
2294 | TPS("WokeQueue")); | ||
2295 | next = list->next; | ||
2296 | } | ||
2297 | debug_rcu_head_unqueue(list); | ||
2298 | local_bh_disable(); | ||
2299 | if (__rcu_reclaim(rdp->rsp->name, list)) | ||
2300 | cl++; | ||
2301 | c++; | ||
2302 | local_bh_enable(); | ||
2303 | list = next; | ||
2304 | } | ||
2305 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); | ||
2306 | ACCESS_ONCE(rdp->nocb_p_count) -= c; | ||
2307 | ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; | ||
2308 | rdp->n_nocbs_invoked += c; | ||
2309 | } | ||
2310 | return 0; | ||
2311 | } | ||
2312 | |||
2313 | /* Initialize per-rcu_data variables for no-CBs CPUs. */ | ||
2314 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | ||
2315 | { | ||
2316 | rdp->nocb_tail = &rdp->nocb_head; | ||
2317 | init_waitqueue_head(&rdp->nocb_wq); | ||
2318 | } | ||
2319 | |||
2320 | /* Create a kthread for each RCU flavor for each no-CBs CPU. */ | ||
2321 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | ||
2322 | { | ||
2323 | int cpu; | ||
2324 | struct rcu_data *rdp; | ||
2325 | struct task_struct *t; | ||
2326 | |||
2327 | if (rcu_nocb_mask == NULL) | ||
2328 | return; | ||
2329 | for_each_cpu(cpu, rcu_nocb_mask) { | ||
2330 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
2331 | t = kthread_run(rcu_nocb_kthread, rdp, | ||
2332 | "rcuo%c/%d", rsp->abbr, cpu); | ||
2333 | BUG_ON(IS_ERR(t)); | ||
2334 | ACCESS_ONCE(rdp->nocb_kthread) = t; | ||
2335 | } | ||
2336 | } | ||
2337 | |||
2338 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ | ||
2339 | static bool init_nocb_callback_list(struct rcu_data *rdp) | ||
2340 | { | ||
2341 | if (rcu_nocb_mask == NULL || | ||
2342 | !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) | ||
2343 | return false; | ||
2344 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
2345 | return true; | ||
2346 | } | ||
2347 | |||
2348 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
2349 | |||
2350 | static int rcu_nocb_needs_gp(struct rcu_state *rsp) | ||
2351 | { | ||
2352 | return 0; | ||
2353 | } | ||
2354 | |||
2355 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | ||
2356 | { | ||
2357 | } | ||
2358 | |||
2359 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) | ||
2360 | { | ||
2361 | } | ||
2362 | |||
2363 | static void rcu_init_one_nocb(struct rcu_node *rnp) | ||
2364 | { | ||
2365 | } | ||
2366 | |||
2367 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
2368 | bool lazy) | ||
2369 | { | ||
2370 | return 0; | ||
2371 | } | ||
2372 | |||
2373 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
2374 | struct rcu_data *rdp) | ||
2375 | { | ||
2376 | return 0; | ||
2377 | } | ||
2378 | |||
2379 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | ||
2380 | { | ||
2381 | } | ||
2382 | |||
2383 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | ||
2384 | { | ||
2385 | } | ||
2386 | |||
2387 | static bool init_nocb_callback_list(struct rcu_data *rdp) | ||
2388 | { | ||
2389 | return false; | ||
2390 | } | ||
2391 | |||
2392 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | ||
2393 | |||
2394 | /* | ||
2395 | * An adaptive-ticks CPU can potentially execute in kernel mode for an | ||
2396 | * arbitrarily long period of time with the scheduling-clock tick turned | ||
2397 | * off. RCU will be paying attention to this CPU because it is in the | ||
2398 | * kernel, but the CPU cannot be guaranteed to be executing the RCU state | ||
2399 | * machine because the scheduling-clock tick has been disabled. Therefore, | ||
2400 | * if an adaptive-ticks CPU is failing to respond to the current grace | ||
2401 | * period and has not been idle from an RCU perspective, kick it. | ||
2402 | */ | ||
2403 | static void rcu_kick_nohz_cpu(int cpu) | ||
2404 | { | ||
2405 | #ifdef CONFIG_NO_HZ_FULL | ||
2406 | if (tick_nohz_full_cpu(cpu)) | ||
2407 | smp_send_reschedule(cpu); | ||
2408 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | ||
2409 | } | ||
2410 | |||
2411 | |||
2412 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
2413 | |||
2414 | /* | ||
2415 | * Define RCU flavor that holds sysidle state. This needs to be the | ||
2416 | * most active flavor of RCU. | ||
2417 | */ | ||
2418 | #ifdef CONFIG_PREEMPT_RCU | ||
2419 | static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; | ||
2420 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
2421 | static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; | ||
2422 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
2423 | |||
2424 | static int full_sysidle_state; /* Current system-idle state. */ | ||
2425 | #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ | ||
2426 | #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ | ||
2427 | #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ | ||
2428 | #define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ | ||
2429 | #define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ | ||
2430 | |||
2431 | /* | ||
2432 | * Invoked to note exit from irq or task transition to idle. Note that | ||
2433 | * usermode execution does -not- count as idle here! After all, we want | ||
2434 | * to detect full-system idle states, not RCU quiescent states and grace | ||
2435 | * periods. The caller must have disabled interrupts. | ||
2436 | */ | ||
2437 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | ||
2438 | { | ||
2439 | unsigned long j; | ||
2440 | |||
2441 | /* Adjust nesting, check for fully idle. */ | ||
2442 | if (irq) { | ||
2443 | rdtp->dynticks_idle_nesting--; | ||
2444 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
2445 | if (rdtp->dynticks_idle_nesting != 0) | ||
2446 | return; /* Still not fully idle. */ | ||
2447 | } else { | ||
2448 | if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == | ||
2449 | DYNTICK_TASK_NEST_VALUE) { | ||
2450 | rdtp->dynticks_idle_nesting = 0; | ||
2451 | } else { | ||
2452 | rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
2453 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
2454 | return; /* Still not fully idle. */ | ||
2455 | } | ||
2456 | } | ||
2457 | |||
2458 | /* Record start of fully idle period. */ | ||
2459 | j = jiffies; | ||
2460 | ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; | ||
2461 | smp_mb__before_atomic_inc(); | ||
2462 | atomic_inc(&rdtp->dynticks_idle); | ||
2463 | smp_mb__after_atomic_inc(); | ||
2464 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); | ||
2465 | } | ||
2466 | |||
2467 | /* | ||
2468 | * Unconditionally force exit from full system-idle state. This is | ||
2469 | * invoked when a normal CPU exits idle, but must be called separately | ||
2470 | * for the timekeeping CPU (tick_do_timer_cpu). The reason for this | ||
2471 | * is that the timekeeping CPU is permitted to take scheduling-clock | ||
2472 | * interrupts while the system is in system-idle state, and of course | ||
2473 | * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock | ||
2474 | * interrupt from any other type of interrupt. | ||
2475 | */ | ||
2476 | void rcu_sysidle_force_exit(void) | ||
2477 | { | ||
2478 | int oldstate = ACCESS_ONCE(full_sysidle_state); | ||
2479 | int newoldstate; | ||
2480 | |||
2481 | /* | ||
2482 | * Each pass through the following loop attempts to exit full | ||
2483 | * system-idle state. If contention proves to be a problem, | ||
2484 | * a trylock-based contention tree could be used here. | ||
2485 | */ | ||
2486 | while (oldstate > RCU_SYSIDLE_SHORT) { | ||
2487 | newoldstate = cmpxchg(&full_sysidle_state, | ||
2488 | oldstate, RCU_SYSIDLE_NOT); | ||
2489 | if (oldstate == newoldstate && | ||
2490 | oldstate == RCU_SYSIDLE_FULL_NOTED) { | ||
2491 | rcu_kick_nohz_cpu(tick_do_timer_cpu); | ||
2492 | return; /* We cleared it, done! */ | ||
2493 | } | ||
2494 | oldstate = newoldstate; | ||
2495 | } | ||
2496 | smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ | ||
2497 | } | ||
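
Stripped of the kick of the timekeeping CPU, the loop above is the standard compare-and-swap retry idiom: re-read the state after every failed exchange and stop once the swap succeeds or the state no longer needs clearing. A simplified user-space sketch with C11 compare-exchange (illustrative only):

#include <stdatomic.h>
#include <stdio.h>

enum {
	SYSIDLE_NOT,		/* some CPU is not idle */
	SYSIDLE_SHORT,
	SYSIDLE_LONG,
	SYSIDLE_FULL,
	SYSIDLE_FULL_NOTED,
};

static atomic_int state;

/* Knock the state back to SYSIDLE_NOT if it had advanced past SHORT. */
static void force_exit(void)
{
	int old = atomic_load(&state);

	while (old > SYSIDLE_SHORT) {
		/* On failure, 'old' is refreshed with the current value. */
		if (atomic_compare_exchange_weak(&state, &old, SYSIDLE_NOT)) {
			if (old == SYSIDLE_FULL_NOTED)
				printf("cleared a noted full-idle period\n");
			return;
		}
	}
}

int main(void)
{
	atomic_store(&state, SYSIDLE_FULL_NOTED);
	force_exit();
	printf("state is now %d\n", atomic_load(&state));
	return 0;
}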
2498 | |||
2499 | /* | ||
2500 | * Invoked to note entry to irq or task transition from idle. Note that | ||
2501 | * usermode execution does -not- count as idle here! The caller must | ||
2502 | * have disabled interrupts. | ||
2503 | */ | ||
2504 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | ||
2505 | { | ||
2506 | /* Adjust nesting, check for already non-idle. */ | ||
2507 | if (irq) { | ||
2508 | rdtp->dynticks_idle_nesting++; | ||
2509 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
2510 | if (rdtp->dynticks_idle_nesting != 1) | ||
2511 | return; /* Already non-idle. */ | ||
2512 | } else { | ||
2513 | /* | ||
2514 | * Allow for irq misnesting. Yes, it really is possible | ||
2515 | * to enter an irq handler then never leave it, and maybe | ||
2516 | * also vice versa. Handle both possibilities. | ||
2517 | */ | ||
2518 | if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { | ||
2519 | rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; | ||
2520 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
2521 | return; /* Already non-idle. */ | ||
2522 | } else { | ||
2523 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
2524 | } | ||
2525 | } | ||
2526 | |||
2527 | /* Record end of idle period. */ | ||
2528 | smp_mb__before_atomic_inc(); | ||
2529 | atomic_inc(&rdtp->dynticks_idle); | ||
2530 | smp_mb__after_atomic_inc(); | ||
2531 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); | ||
2532 | |||
2533 | /* | ||
2534 | * If we are the timekeeping CPU, we are permitted to be non-idle | ||
2535 | * during a system-idle state. This must be the case, because | ||
2536 | * the timekeeping CPU has to take scheduling-clock interrupts | ||
2537 | * during the time that the system is transitioning to full | ||
2538 | * system-idle state. This means that the timekeeping CPU must | ||
2539 | * invoke rcu_sysidle_force_exit() directly if it does anything | ||
2540 | * more than take a scheduling-clock interrupt. | ||
2541 | */ | ||
2542 | if (smp_processor_id() == tick_do_timer_cpu) | ||
2543 | return; | ||
2544 | |||
2545 | /* Update system-idle state: We are clearly no longer fully idle! */ | ||
2546 | rcu_sysidle_force_exit(); | ||
2547 | } | ||
2548 | |||
2549 | /* | ||
2550 | * Check to see if the current CPU is idle. Note that usermode execution | ||
2551 | * does not count as idle. The caller must have disabled interrupts. | ||
2552 | */ | ||
2553 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
2554 | unsigned long *maxj) | ||
2555 | { | ||
2556 | int cur; | ||
2557 | unsigned long j; | ||
2558 | struct rcu_dynticks *rdtp = rdp->dynticks; | ||
2559 | |||
2560 | /* | ||
2561 | * If some other CPU has already reported non-idle, if this is | ||
2562 | * not the flavor of RCU that tracks sysidle state, or if this | ||
2563 | * is an offline or the timekeeping CPU, nothing to do. | ||
2564 | */ | ||
2565 | if (!*isidle || rdp->rsp != rcu_sysidle_state || | ||
2566 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | ||
2567 | return; | ||
2568 | if (rcu_gp_in_progress(rdp->rsp)) | ||
2569 | WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); | ||
2570 | |||
2571 | /* Pick up current idle and NMI-nesting counter and check. */ | ||
2572 | cur = atomic_read(&rdtp->dynticks_idle); | ||
2573 | if (cur & 0x1) { | ||
2574 | *isidle = false; /* We are not idle! */ | ||
2575 | return; | ||
2576 | } | ||
2577 | smp_mb(); /* Read counters before timestamps. */ | ||
2578 | |||
2579 | /* Pick up timestamps. */ | ||
2580 | j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); | ||
2581 | /* If this CPU entered idle more recently, update maxj timestamp. */ | ||
2582 | if (ULONG_CMP_LT(*maxj, j)) | ||
2583 | *maxj = j; | ||
2584 | } | ||
2585 | |||
2586 | /* | ||
2587 | * Is this the flavor of RCU that is handling full-system idle? | ||
2588 | */ | ||
2589 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
2590 | { | ||
2591 | return rsp == rcu_sysidle_state; | ||
2592 | } | ||
2593 | |||
2594 | /* | ||
2595 | * Bind the grace-period kthread for the sysidle flavor of RCU to the | ||
2596 | * timekeeping CPU. | ||
2597 | */ | ||
2598 | static void rcu_bind_gp_kthread(void) | ||
2599 | { | ||
2600 | int cpu = ACCESS_ONCE(tick_do_timer_cpu); | ||
2601 | |||
2602 | if (cpu < 0 || cpu >= nr_cpu_ids) | ||
2603 | return; | ||
2604 | if (raw_smp_processor_id() != cpu) | ||
2605 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
2606 | } | ||
2607 | |||
2608 | /* | ||
2609 | * Return a delay in jiffies based on the number of CPUs, rcu_node | ||
2610 | * leaf fanout, and jiffies tick rate. The idea is to allow larger | ||
2611 | * systems more time to transition to full-idle state in order to | ||
2612 | * avoid the cache thrashing that otherwise occurs on the state variable. | ||
2613 | * Really small systems (fewer than a couple of tens of CPUs) should | ||
2614 | * instead use a single global atomically incremented counter, and later | ||
2615 | * versions of this will automatically reconfigure themselves accordingly. | ||
2616 | */ | ||
2617 | static unsigned long rcu_sysidle_delay(void) | ||
2618 | { | ||
2619 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
2620 | return 0; | ||
2621 | return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); | ||
2622 | } | ||
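
For systems larger than CONFIG_NO_HZ_FULL_SYSIDLE_SMALL the delay grows with the CPU count and shrinks with the leaf fanout. A worked example with hypothetical values (nr_cpu_ids = 256, rcu_fanout_leaf = 16, HZ = 1000):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	int nr_cpu_ids = 256, rcu_fanout_leaf = 16, hz = 1000;	/* hypothetical */

	/* DIV_ROUND_UP(256 * 1000, 16 * 1000) == 16 jiffies of settling time. */
	printf("sysidle delay = %d jiffies\n",
	       DIV_ROUND_UP(nr_cpu_ids * hz, rcu_fanout_leaf * 1000));
	return 0;
}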
2623 | |||
2624 | /* | ||
2625 | * Advance the full-system-idle state. This is invoked when all of | ||
2626 | * the non-timekeeping CPUs are idle. | ||
2627 | */ | ||
2628 | static void rcu_sysidle(unsigned long j) | ||
2629 | { | ||
2630 | /* Check the current state. */ | ||
2631 | switch (ACCESS_ONCE(full_sysidle_state)) { | ||
2632 | case RCU_SYSIDLE_NOT: | ||
2633 | |||
2634 | /* First time all are idle, so note a short idle period. */ | ||
2635 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; | ||
2636 | break; | ||
2637 | |||
2638 | case RCU_SYSIDLE_SHORT: | ||
2639 | |||
2640 | /* | ||
2641 | * Idle for a bit, time to advance to next state? | ||
2642 | * cmpxchg failure means race with non-idle, let them win. | ||
2643 | */ | ||
2644 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
2645 | (void)cmpxchg(&full_sysidle_state, | ||
2646 | RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); | ||
2647 | break; | ||
2648 | |||
2649 | case RCU_SYSIDLE_LONG: | ||
2650 | |||
2651 | /* | ||
2652 | * Do an additional check pass before advancing to full. | ||
2653 | * cmpxchg failure means race with non-idle, let them win. | ||
2654 | */ | ||
2655 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
2656 | (void)cmpxchg(&full_sysidle_state, | ||
2657 | RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); | ||
2658 | break; | ||
2659 | |||
2660 | default: | ||
2661 | break; | ||
2662 | } | ||
2663 | } | ||
2664 | |||
2665 | /* | ||
2666 | * Found a non-idle non-timekeeping CPU, so kick the system-idle state | ||
2667 | * back to the beginning. | ||
2668 | */ | ||
2669 | static void rcu_sysidle_cancel(void) | ||
2670 | { | ||
2671 | smp_mb(); | ||
2672 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; | ||
2673 | } | ||
2674 | |||
2675 | /* | ||
2676 | * Update the sysidle state based on the results of a force-quiescent-state | ||
2677 | * scan of the CPUs' dyntick-idle state. | ||
2678 | */ | ||
2679 | static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, | ||
2680 | unsigned long maxj, bool gpkt) | ||
2681 | { | ||
2682 | if (rsp != rcu_sysidle_state) | ||
2683 | return; /* Wrong flavor, ignore. */ | ||
2684 | if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
2685 | return; /* Running state machine from timekeeping CPU. */ | ||
2686 | if (isidle) | ||
2687 | rcu_sysidle(maxj); /* More idle! */ | ||
2688 | else | ||
2689 | rcu_sysidle_cancel(); /* Idle is over. */ | ||
2690 | } | ||
2691 | |||
2692 | /* | ||
2693 | * Wrapper for rcu_sysidle_report() when called from the grace-period | ||
2694 | * kthread's context. | ||
2695 | */ | ||
2696 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
2697 | unsigned long maxj) | ||
2698 | { | ||
2699 | rcu_sysidle_report(rsp, isidle, maxj, true); | ||
2700 | } | ||
2701 | |||
2702 | /* Callback and function for forcing an RCU grace period. */ | ||
2703 | struct rcu_sysidle_head { | ||
2704 | struct rcu_head rh; | ||
2705 | int inuse; | ||
2706 | }; | ||
2707 | |||
2708 | static void rcu_sysidle_cb(struct rcu_head *rhp) | ||
2709 | { | ||
2710 | struct rcu_sysidle_head *rshp; | ||
2711 | |||
2712 | /* | ||
2713 | * The following memory barrier is needed to replace the | ||
2714 | * memory barriers that would normally be in the memory | ||
2715 | * allocator. | ||
2716 | */ | ||
2717 | smp_mb(); /* grace period precedes setting inuse. */ | ||
2718 | |||
2719 | rshp = container_of(rhp, struct rcu_sysidle_head, rh); | ||
2720 | ACCESS_ONCE(rshp->inuse) = 0; | ||
2721 | } | ||
2722 | |||
2723 | /* | ||
2724 | * Check to see if the system is fully idle, other than the timekeeping CPU. | ||
2725 | * The caller must have disabled interrupts. | ||
2726 | */ | ||
2727 | bool rcu_sys_is_idle(void) | ||
2728 | { | ||
2729 | static struct rcu_sysidle_head rsh; | ||
2730 | int rss = ACCESS_ONCE(full_sysidle_state); | ||
2731 | |||
2732 | if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) | ||
2733 | return false; | ||
2734 | |||
2735 | /* Handle small-system case by doing a full scan of CPUs. */ | ||
2736 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { | ||
2737 | int oldrss = rss - 1; | ||
2738 | |||
2739 | /* | ||
2740 | * One pass to advance to each state up to _FULL. | ||
2741 | * Give up if any pass fails to advance the state. | ||
2742 | */ | ||
2743 | while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { | ||
2744 | int cpu; | ||
2745 | bool isidle = true; | ||
2746 | unsigned long maxj = jiffies - ULONG_MAX / 4; | ||
2747 | struct rcu_data *rdp; | ||
2748 | |||
2749 | /* Scan all the CPUs looking for nonidle CPUs. */ | ||
2750 | for_each_possible_cpu(cpu) { | ||
2751 | rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); | ||
2752 | rcu_sysidle_check_cpu(rdp, &isidle, &maxj); | ||
2753 | if (!isidle) | ||
2754 | break; | ||
2755 | } | ||
2756 | rcu_sysidle_report(rcu_sysidle_state, | ||
2757 | isidle, maxj, false); | ||
2758 | oldrss = rss; | ||
2759 | rss = ACCESS_ONCE(full_sysidle_state); | ||
2760 | } | ||
2761 | } | ||
2762 | |||
2763 | /* If this is the first observation of an idle period, record it. */ | ||
2764 | if (rss == RCU_SYSIDLE_FULL) { | ||
2765 | rss = cmpxchg(&full_sysidle_state, | ||
2766 | RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); | ||
2767 | return rss == RCU_SYSIDLE_FULL; | ||
2768 | } | ||
2769 | |||
2770 | smp_mb(); /* ensure rss load happens before later caller actions. */ | ||
2771 | |||
2772 | /* If already fully idle, tell the caller (in case of races). */ | ||
2773 | if (rss == RCU_SYSIDLE_FULL_NOTED) | ||
2774 | return true; | ||
2775 | |||
2776 | /* | ||
2777 | * If we aren't there yet, and a grace period is not in flight, | ||
2778 | * initiate a grace period. Either way, tell the caller that | ||
2779 | * we are not there yet. We use an xchg() rather than an assignment | ||
2780 | * to make up for the memory barriers that would otherwise be | ||
2781 | * provided by the memory allocator. | ||
2782 | */ | ||
2783 | if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && | ||
2784 | !rcu_gp_in_progress(rcu_sysidle_state) && | ||
2785 | !rsh.inuse && xchg(&rsh.inuse, 1) == 0) | ||
2786 | call_rcu(&rsh.rh, rcu_sysidle_cb); | ||
2787 | return false; | ||
2788 | } | ||
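
The static rcu_sysidle_head and its inuse flag act as a one-shot guard: the xchg() lets exactly one caller claim the embedded rcu_head and post the grace-period callback, and rcu_sysidle_cb() clears inuse once that grace period ends so the slot can be reused. A user-space sketch of the claim-with-xchg idiom (illustrative only):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int inuse;	/* 0: slot free, 1: callback in flight */

/* Try to claim the single shared slot; at most one caller wins. */
static bool try_post(void)
{
	if (atomic_load(&inuse) || atomic_exchange(&inuse, 1))
		return false;		/* someone else already posted */
	/* ...post the callback here... */
	return true;
}

/* Completion handler: release the slot for the next user. */
static void on_complete(void)
{
	atomic_store(&inuse, 0);
}

int main(void)
{
	printf("first:  %d\n", try_post());	/* 1: we won the slot */
	printf("second: %d\n", try_post());	/* 0: still in flight */
	on_complete();
	printf("third:  %d\n", try_post());	/* 1: slot free again */
	return 0;
}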
2789 | |||
2790 | /* | ||
2791 | * Initialize dynticks sysidle state for CPUs coming online. | ||
2792 | */ | ||
2793 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
2794 | { | ||
2795 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; | ||
2796 | } | ||
2797 | |||
2798 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
2799 | |||
2800 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | ||
2801 | { | ||
2802 | } | ||
2803 | |||
2804 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | ||
2805 | { | ||
2806 | } | ||
2807 | |||
2808 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
2809 | unsigned long *maxj) | ||
2810 | { | ||
2811 | } | ||
2812 | |||
2813 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
2814 | { | ||
2815 | return false; | ||
2816 | } | ||
2817 | |||
2818 | static void rcu_bind_gp_kthread(void) | ||
2819 | { | ||
2820 | } | ||
2821 | |||
2822 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
2823 | unsigned long maxj) | ||
2824 | { | ||
2825 | } | ||
2826 | |||
2827 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
2828 | { | ||
2829 | } | ||
2830 | |||
2831 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||