diff options
Diffstat (limited to 'kernel/rcupreempt.c')
-rw-r--r-- | kernel/rcupreempt.c | 1518 |
1 files changed, 0 insertions, 1518 deletions
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c deleted file mode 100644 index 0053ce56e326..000000000000 --- a/kernel/rcupreempt.c +++ /dev/null | |||
@@ -1,1518 +0,0 @@ | |||
1 | /* | ||
2 | * Read-Copy Update mechanism for mutual exclusion, realtime implementation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2006 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | ||
21 | * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar | ||
22 | * for pushing me away from locks and towards counters, and | ||
23 | * to Suparna Bhattacharya for pushing me completely away | ||
24 | * from atomic instructions on the read side. | ||
25 | * | ||
26 | * - Added handling of Dynamic Ticks | ||
27 | * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com> | ||
28 | * - Steven Rostedt <srostedt@redhat.com> | ||
29 | * | ||
30 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
31 | * | ||
32 | * Design Document: http://lwn.net/Articles/253651/ | ||
33 | * | ||
34 | * For detailed explanation of Read-Copy Update mechanism see - | ||
35 | * Documentation/RCU/ *.txt | ||
36 | * | ||
37 | */ | ||
38 | #include <linux/types.h> | ||
39 | #include <linux/kernel.h> | ||
40 | #include <linux/init.h> | ||
41 | #include <linux/spinlock.h> | ||
42 | #include <linux/smp.h> | ||
43 | #include <linux/rcupdate.h> | ||
44 | #include <linux/interrupt.h> | ||
45 | #include <linux/sched.h> | ||
46 | #include <asm/atomic.h> | ||
47 | #include <linux/bitops.h> | ||
48 | #include <linux/module.h> | ||
49 | #include <linux/kthread.h> | ||
50 | #include <linux/completion.h> | ||
51 | #include <linux/moduleparam.h> | ||
52 | #include <linux/percpu.h> | ||
53 | #include <linux/notifier.h> | ||
54 | #include <linux/cpu.h> | ||
55 | #include <linux/random.h> | ||
56 | #include <linux/delay.h> | ||
57 | #include <linux/cpumask.h> | ||
58 | #include <linux/rcupreempt_trace.h> | ||
59 | #include <asm/byteorder.h> | ||
60 | |||
61 | /* | ||
62 | * PREEMPT_RCU data structures. | ||
63 | */ | ||
64 | |||
65 | /* | ||
66 | * GP_STAGES specifies the number of times the state machine has | ||
67 | * to go through all the rcu_try_flip_states (see below) | ||
68 | * in a single Grace Period. | ||
69 | * | ||
70 | * GP in GP_STAGES stands for Grace Period ;) | ||
71 | */ | ||
72 | #define GP_STAGES 2 | ||
73 | struct rcu_data { | ||
74 | spinlock_t lock; /* Protect rcu_data fields. */ | ||
75 | long completed; /* Number of last completed batch. */ | ||
76 | int waitlistcount; | ||
77 | struct rcu_head *nextlist; | ||
78 | struct rcu_head **nexttail; | ||
79 | struct rcu_head *waitlist[GP_STAGES]; | ||
80 | struct rcu_head **waittail[GP_STAGES]; | ||
81 | struct rcu_head *donelist; /* from waitlist & waitschedlist */ | ||
82 | struct rcu_head **donetail; | ||
83 | long rcu_flipctr[2]; | ||
84 | struct rcu_head *nextschedlist; | ||
85 | struct rcu_head **nextschedtail; | ||
86 | struct rcu_head *waitschedlist; | ||
87 | struct rcu_head **waitschedtail; | ||
88 | int rcu_sched_sleeping; | ||
89 | #ifdef CONFIG_RCU_TRACE | ||
90 | struct rcupreempt_trace trace; | ||
91 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
92 | }; | ||
93 | |||
/*
 * States for rcu_try_flip() and friends.  The single-letter tags
 * ("I", "A", "Z", "M") are the shorthand used for each state.
 */
enum rcu_try_flip_states {
	/*
	 * "I": stay here if nothing is happening.  Flip the counter if
	 * something starts happening.
	 */
	rcu_try_flip_idle_state,

	/*
	 * "A": wait here for all CPUs to notice that the counter has
	 * flipped.  This prevents the old set of counters from ever being
	 * incremented once we leave this state, which in turn is necessary
	 * because we cannot test any individual counter for zero -- we
	 * can only check the sum.
	 */
	rcu_try_flip_waitack_state,

	/*
	 * "Z": wait here for the sum of the old per-CPU counters to
	 * reach zero.
	 */
	rcu_try_flip_waitzero_state,

	/*
	 * "M": wait here for each of the other CPUs to execute a memory
	 * barrier.  This is necessary to ensure that these other CPUs
	 * really have completed executing their RCU read-side critical
	 * sections, despite their CPUs wildly reordering memory.
	 */
	rcu_try_flip_waitmb_state,
};
129 | |||
/*
 * States for rcu_ctrlblk.sched_sleep.
 */
enum rcu_sched_sleep_states {
	/* Not sleeping, callbacks need GP. */
	rcu_sched_not_sleeping,
	/* Thinking of sleeping, rechecking. */
	rcu_sched_sleep_prep,
	/* Sleeping, awaken if GP needed. */
	rcu_sched_sleeping,
};
139 | |||
140 | struct rcu_ctrlblk { | ||
141 | spinlock_t fliplock; /* Protect state-machine transitions. */ | ||
142 | long completed; /* Number of last completed batch. */ | ||
143 | enum rcu_try_flip_states rcu_try_flip_state; /* The current state of | ||
144 | the rcu state machine */ | ||
145 | spinlock_t schedlock; /* Protect rcu_sched sleep state. */ | ||
146 | enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */ | ||
147 | wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ | ||
148 | }; | ||
149 | |||
150 | struct rcu_dyntick_sched { | ||
151 | int dynticks; | ||
152 | int dynticks_snap; | ||
153 | int sched_qs; | ||
154 | int sched_qs_snap; | ||
155 | int sched_dynticks_snap; | ||
156 | }; | ||
157 | |||
158 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { | ||
159 | .dynticks = 1, | ||
160 | }; | ||
161 | |||
162 | static int rcu_pending(int cpu); | ||
163 | |||
164 | void rcu_sched_qs(int cpu) | ||
165 | { | ||
166 | struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); | ||
167 | |||
168 | rdssp->sched_qs++; | ||
169 | } | ||
170 | |||
171 | #ifdef CONFIG_NO_HZ | ||
172 | |||
173 | void rcu_enter_nohz(void) | ||
174 | { | ||
175 | static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); | ||
176 | |||
177 | smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ | ||
178 | __get_cpu_var(rcu_dyntick_sched).dynticks++; | ||
179 | WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs); | ||
180 | } | ||
181 | |||
182 | void rcu_exit_nohz(void) | ||
183 | { | ||
184 | static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); | ||
185 | |||
186 | __get_cpu_var(rcu_dyntick_sched).dynticks++; | ||
187 | smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ | ||
188 | WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1), | ||
189 | &rs); | ||
190 | } | ||
191 | |||
192 | #endif /* CONFIG_NO_HZ */ | ||
193 | |||
194 | |||
195 | static DEFINE_PER_CPU(struct rcu_data, rcu_data); | ||
196 | |||
197 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
198 | .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), | ||
199 | .completed = 0, | ||
200 | .rcu_try_flip_state = rcu_try_flip_idle_state, | ||
201 | .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock), | ||
202 | .sched_sleep = rcu_sched_not_sleeping, | ||
203 | .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq), | ||
204 | }; | ||
205 | |||
206 | static struct task_struct *rcu_sched_grace_period_task; | ||
207 | |||
#ifdef CONFIG_RCU_TRACE
/* Printable names, in the same order as enum rcu_try_flip_states. */
static char *rcu_try_flip_state_names[] = {
	"idle",
	"waitack",
	"waitzero",
	"waitmb"
};
#endif /* #ifdef CONFIG_RCU_TRACE */
212 | |||
213 | static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly | ||
214 | = CPU_BITS_NONE; | ||
215 | |||
216 | /* | ||
217 | * Enum and per-CPU flag to determine when each CPU has seen | ||
218 | * the most recent counter flip. | ||
219 | */ | ||
220 | |||
221 | enum rcu_flip_flag_values { | ||
222 | rcu_flip_seen, /* Steady/initial state, last flip seen. */ | ||
223 | /* Only GP detector can update. */ | ||
224 | rcu_flipped /* Flip just completed, need confirmation. */ | ||
225 | /* Only corresponding CPU can update. */ | ||
226 | }; | ||
227 | static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag) | ||
228 | = rcu_flip_seen; | ||
229 | |||
230 | /* | ||
231 | * Enum and per-CPU flag to determine when each CPU has executed the | ||
232 | * needed memory barrier to fence in memory references from its last RCU | ||
233 | * read-side critical section in the just-completed grace period. | ||
234 | */ | ||
235 | |||
236 | enum rcu_mb_flag_values { | ||
237 | rcu_mb_done, /* Steady/initial state, no mb()s required. */ | ||
238 | /* Only GP detector can update. */ | ||
239 | rcu_mb_needed /* Flip just completed, need an mb(). */ | ||
240 | /* Only corresponding CPU can update. */ | ||
241 | }; | ||
242 | static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) | ||
243 | = rcu_mb_done; | ||
244 | |||
/*
 * Accessors for the per-CPU rcu_data structure:
 *   RCU_DATA_ME()     - the current CPU's instance.
 *   RCU_DATA_CPU(cpu) - the specified CPU's instance.
 */
#define RCU_DATA_ME()		(&__get_cpu_var(rcu_data))
#define RCU_DATA_CPU(cpu)	(&per_cpu(rcu_data, cpu))

/*
 * Tracing helpers.  Each forwards to RCU_TRACE() with a pointer to the
 * ->trace member of the relevant rcu_data:
 *   RCU_TRACE_CPU(f, cpu) - only the CPU number is cached locally.
 *   RCU_TRACE_ME(f)       - nothing is cached; use the current CPU.
 *   RCU_TRACE_RDP(f, rdp) - a local variable already points at rcu_data.
 *
 * NOTE(review): each expansion ends in its own ';', so these are only
 * safe when used as a complete statement (never as the unbraced body
 * of an if that has an else).
 */
#define RCU_TRACE_CPU(f, cpu)	RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
#define RCU_TRACE_ME(f)		RCU_TRACE(f, &(RCU_DATA_ME()->trace));
#define RCU_TRACE_RDP(f, rdp)	RCU_TRACE(f, &((rdp)->trace));

/* One fiftieth of a second, expressed in jiffies. */
#define RCU_SCHED_BATCH_TIME	(HZ / 50)
271 | |||
272 | /* | ||
273 | * Return the number of RCU batches processed thus far. Useful | ||
274 | * for debug and statistics. | ||
275 | */ | ||
276 | long rcu_batches_completed(void) | ||
277 | { | ||
278 | return rcu_ctrlblk.completed; | ||
279 | } | ||
280 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
281 | |||
282 | void __rcu_read_lock(void) | ||
283 | { | ||
284 | int idx; | ||
285 | struct task_struct *t = current; | ||
286 | int nesting; | ||
287 | |||
288 | nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
289 | if (nesting != 0) { | ||
290 | |||
291 | /* An earlier rcu_read_lock() covers us, just count it. */ | ||
292 | |||
293 | t->rcu_read_lock_nesting = nesting + 1; | ||
294 | |||
295 | } else { | ||
296 | unsigned long flags; | ||
297 | |||
298 | /* | ||
299 | * We disable interrupts for the following reasons: | ||
300 | * - If we get scheduling clock interrupt here, and we | ||
301 | * end up acking the counter flip, it's like a promise | ||
302 | * that we will never increment the old counter again. | ||
303 | * Thus we will break that promise if that | ||
304 | * scheduling clock interrupt happens between the time | ||
305 | * we pick the .completed field and the time that we | ||
306 | * increment our counter. | ||
307 | * | ||
308 | * - We don't want to be preempted out here. | ||
309 | * | ||
310 | * NMIs can still occur, of course, and might themselves | ||
311 | * contain rcu_read_lock(). | ||
312 | */ | ||
313 | |||
314 | local_irq_save(flags); | ||
315 | |||
316 | /* | ||
317 | * Outermost nesting of rcu_read_lock(), so increment | ||
318 | * the current counter for the current CPU. Use volatile | ||
319 | * casts to prevent the compiler from reordering. | ||
320 | */ | ||
321 | |||
322 | idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1; | ||
323 | ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++; | ||
324 | |||
325 | /* | ||
326 | * Now that the per-CPU counter has been incremented, we | ||
327 | * are protected from races with rcu_read_lock() invoked | ||
328 | * from NMI handlers on this CPU. We can therefore safely | ||
329 | * increment the nesting counter, relieving further NMIs | ||
330 | * of the need to increment the per-CPU counter. | ||
331 | */ | ||
332 | |||
333 | ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1; | ||
334 | |||
335 | /* | ||
336 | * Now that we have preventing any NMIs from storing | ||
337 | * to the ->rcu_flipctr_idx, we can safely use it to | ||
338 | * remember which counter to decrement in the matching | ||
339 | * rcu_read_unlock(). | ||
340 | */ | ||
341 | |||
342 | ACCESS_ONCE(t->rcu_flipctr_idx) = idx; | ||
343 | local_irq_restore(flags); | ||
344 | } | ||
345 | } | ||
346 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
347 | |||
348 | void __rcu_read_unlock(void) | ||
349 | { | ||
350 | int idx; | ||
351 | struct task_struct *t = current; | ||
352 | int nesting; | ||
353 | |||
354 | nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
355 | if (nesting > 1) { | ||
356 | |||
357 | /* | ||
358 | * We are still protected by the enclosing rcu_read_lock(), | ||
359 | * so simply decrement the counter. | ||
360 | */ | ||
361 | |||
362 | t->rcu_read_lock_nesting = nesting - 1; | ||
363 | |||
364 | } else { | ||
365 | unsigned long flags; | ||
366 | |||
367 | /* | ||
368 | * Disable local interrupts to prevent the grace-period | ||
369 | * detection state machine from seeing us half-done. | ||
370 | * NMIs can still occur, of course, and might themselves | ||
371 | * contain rcu_read_lock() and rcu_read_unlock(). | ||
372 | */ | ||
373 | |||
374 | local_irq_save(flags); | ||
375 | |||
376 | /* | ||
377 | * Outermost nesting of rcu_read_unlock(), so we must | ||
378 | * decrement the current counter for the current CPU. | ||
379 | * This must be done carefully, because NMIs can | ||
380 | * occur at any point in this code, and any rcu_read_lock() | ||
381 | * and rcu_read_unlock() pairs in the NMI handlers | ||
382 | * must interact non-destructively with this code. | ||
383 | * Lots of volatile casts, and -very- careful ordering. | ||
384 | * | ||
385 | * Changes to this code, including this one, must be | ||
386 | * inspected, validated, and tested extremely carefully!!! | ||
387 | */ | ||
388 | |||
389 | /* | ||
390 | * First, pick up the index. | ||
391 | */ | ||
392 | |||
393 | idx = ACCESS_ONCE(t->rcu_flipctr_idx); | ||
394 | |||
395 | /* | ||
396 | * Now that we have fetched the counter index, it is | ||
397 | * safe to decrement the per-task RCU nesting counter. | ||
398 | * After this, any interrupts or NMIs will increment and | ||
399 | * decrement the per-CPU counters. | ||
400 | */ | ||
401 | ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1; | ||
402 | |||
403 | /* | ||
404 | * It is now safe to decrement this task's nesting count. | ||
405 | * NMIs that occur after this statement will route their | ||
406 | * rcu_read_lock() calls through this "else" clause, and | ||
407 | * will thus start incrementing the per-CPU counter on | ||
408 | * their own. They will also clobber ->rcu_flipctr_idx, | ||
409 | * but that is OK, since we have already fetched it. | ||
410 | */ | ||
411 | |||
412 | ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--; | ||
413 | local_irq_restore(flags); | ||
414 | } | ||
415 | } | ||
416 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
417 | |||
418 | /* | ||
419 | * If a global counter flip has occurred since the last time that we | ||
420 | * advanced callbacks, advance them. Hardware interrupts must be | ||
421 | * disabled when calling this function. | ||
422 | */ | ||
423 | static void __rcu_advance_callbacks(struct rcu_data *rdp) | ||
424 | { | ||
425 | int cpu; | ||
426 | int i; | ||
427 | int wlc = 0; | ||
428 | |||
429 | if (rdp->completed != rcu_ctrlblk.completed) { | ||
430 | if (rdp->waitlist[GP_STAGES - 1] != NULL) { | ||
431 | *rdp->donetail = rdp->waitlist[GP_STAGES - 1]; | ||
432 | rdp->donetail = rdp->waittail[GP_STAGES - 1]; | ||
433 | RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp); | ||
434 | } | ||
435 | for (i = GP_STAGES - 2; i >= 0; i--) { | ||
436 | if (rdp->waitlist[i] != NULL) { | ||
437 | rdp->waitlist[i + 1] = rdp->waitlist[i]; | ||
438 | rdp->waittail[i + 1] = rdp->waittail[i]; | ||
439 | wlc++; | ||
440 | } else { | ||
441 | rdp->waitlist[i + 1] = NULL; | ||
442 | rdp->waittail[i + 1] = | ||
443 | &rdp->waitlist[i + 1]; | ||
444 | } | ||
445 | } | ||
446 | if (rdp->nextlist != NULL) { | ||
447 | rdp->waitlist[0] = rdp->nextlist; | ||
448 | rdp->waittail[0] = rdp->nexttail; | ||
449 | wlc++; | ||
450 | rdp->nextlist = NULL; | ||
451 | rdp->nexttail = &rdp->nextlist; | ||
452 | RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp); | ||
453 | } else { | ||
454 | rdp->waitlist[0] = NULL; | ||
455 | rdp->waittail[0] = &rdp->waitlist[0]; | ||
456 | } | ||
457 | rdp->waitlistcount = wlc; | ||
458 | rdp->completed = rcu_ctrlblk.completed; | ||
459 | } | ||
460 | |||
461 | /* | ||
462 | * Check to see if this CPU needs to report that it has seen | ||
463 | * the most recent counter flip, thereby declaring that all | ||
464 | * subsequent rcu_read_lock() invocations will respect this flip. | ||
465 | */ | ||
466 | |||
467 | cpu = raw_smp_processor_id(); | ||
468 | if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { | ||
469 | smp_mb(); /* Subsequent counter accesses must see new value */ | ||
470 | per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; | ||
471 | smp_mb(); /* Subsequent RCU read-side critical sections */ | ||
472 | /* seen -after- acknowledgement. */ | ||
473 | } | ||
474 | } | ||
475 | |||
476 | #ifdef CONFIG_NO_HZ | ||
477 | static DEFINE_PER_CPU(int, rcu_update_flag); | ||
478 | |||
479 | /** | ||
480 | * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. | ||
481 | * | ||
482 | * If the CPU was idle with dynamic ticks active, this updates the | ||
483 | * rcu_dyntick_sched.dynticks to let the RCU handling know that the | ||
484 | * CPU is active. | ||
485 | */ | ||
486 | void rcu_irq_enter(void) | ||
487 | { | ||
488 | int cpu = smp_processor_id(); | ||
489 | struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); | ||
490 | |||
491 | if (per_cpu(rcu_update_flag, cpu)) | ||
492 | per_cpu(rcu_update_flag, cpu)++; | ||
493 | |||
494 | /* | ||
495 | * Only update if we are coming from a stopped ticks mode | ||
496 | * (rcu_dyntick_sched.dynticks is even). | ||
497 | */ | ||
498 | if (!in_interrupt() && | ||
499 | (rdssp->dynticks & 0x1) == 0) { | ||
500 | /* | ||
501 | * The following might seem like we could have a race | ||
502 | * with NMI/SMIs. But this really isn't a problem. | ||
503 | * Here we do a read/modify/write, and the race happens | ||
504 | * when an NMI/SMI comes in after the read and before | ||
505 | * the write. But NMI/SMIs will increment this counter | ||
506 | * twice before returning, so the zero bit will not | ||
507 | * be corrupted by the NMI/SMI which is the most important | ||
508 | * part. | ||
509 | * | ||
510 | * The only thing is that we would bring back the counter | ||
511 | * to a postion that it was in during the NMI/SMI. | ||
512 | * But the zero bit would be set, so the rest of the | ||
513 | * counter would again be ignored. | ||
514 | * | ||
515 | * On return from the IRQ, the counter may have the zero | ||
516 | * bit be 0 and the counter the same as the return from | ||
517 | * the NMI/SMI. If the state machine was so unlucky to | ||
518 | * see that, it still doesn't matter, since all | ||
519 | * RCU read-side critical sections on this CPU would | ||
520 | * have already completed. | ||
521 | */ | ||
522 | rdssp->dynticks++; | ||
523 | /* | ||
524 | * The following memory barrier ensures that any | ||
525 | * rcu_read_lock() primitives in the irq handler | ||
526 | * are seen by other CPUs to follow the above | ||
527 | * increment to rcu_dyntick_sched.dynticks. This is | ||
528 | * required in order for other CPUs to correctly | ||
529 | * determine when it is safe to advance the RCU | ||
530 | * grace-period state machine. | ||
531 | */ | ||
532 | smp_mb(); /* see above block comment. */ | ||
533 | /* | ||
534 | * Since we can't determine the dynamic tick mode from | ||
535 | * the rcu_dyntick_sched.dynticks after this routine, | ||
536 | * we use a second flag to acknowledge that we came | ||
537 | * from an idle state with ticks stopped. | ||
538 | */ | ||
539 | per_cpu(rcu_update_flag, cpu)++; | ||
540 | /* | ||
541 | * If we take an NMI/SMI now, they will also increment | ||
542 | * the rcu_update_flag, and will not update the | ||
543 | * rcu_dyntick_sched.dynticks on exit. That is for | ||
544 | * this IRQ to do. | ||
545 | */ | ||
546 | } | ||
547 | } | ||
548 | |||
549 | /** | ||
550 | * rcu_irq_exit - Called from exiting Hard irq context. | ||
551 | * | ||
552 | * If the CPU was idle with dynamic ticks active, update the | ||
553 | * rcu_dyntick_sched.dynticks to let the RCU handling be | ||
554 | * aware that the CPU is going back to idle with no ticks. | ||
555 | */ | ||
556 | void rcu_irq_exit(void) | ||
557 | { | ||
558 | int cpu = smp_processor_id(); | ||
559 | struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); | ||
560 | |||
561 | /* | ||
562 | * rcu_update_flag is set if we interrupted the CPU | ||
563 | * when it was idle with ticks stopped. | ||
564 | * Once this occurs, we keep track of interrupt nesting | ||
565 | * because a NMI/SMI could also come in, and we still | ||
566 | * only want the IRQ that started the increment of the | ||
567 | * rcu_dyntick_sched.dynticks to be the one that modifies | ||
568 | * it on exit. | ||
569 | */ | ||
570 | if (per_cpu(rcu_update_flag, cpu)) { | ||
571 | if (--per_cpu(rcu_update_flag, cpu)) | ||
572 | return; | ||
573 | |||
574 | /* This must match the interrupt nesting */ | ||
575 | WARN_ON(in_interrupt()); | ||
576 | |||
577 | /* | ||
578 | * If an NMI/SMI happens now we are still | ||
579 | * protected by the rcu_dyntick_sched.dynticks being odd. | ||
580 | */ | ||
581 | |||
582 | /* | ||
583 | * The following memory barrier ensures that any | ||
584 | * rcu_read_unlock() primitives in the irq handler | ||
585 | * are seen by other CPUs to preceed the following | ||
586 | * increment to rcu_dyntick_sched.dynticks. This | ||
587 | * is required in order for other CPUs to determine | ||
588 | * when it is safe to advance the RCU grace-period | ||
589 | * state machine. | ||
590 | */ | ||
591 | smp_mb(); /* see above block comment. */ | ||
592 | rdssp->dynticks++; | ||
593 | WARN_ON(rdssp->dynticks & 0x1); | ||
594 | } | ||
595 | } | ||
596 | |||
/* NMI entry hook: handled exactly like hard-irq entry. */
void rcu_nmi_enter(void)
{
	rcu_irq_enter();
}
601 | |||
/* NMI exit hook: handled exactly like hard-irq exit. */
void rcu_nmi_exit(void)
{
	rcu_irq_exit();
}
606 | |||
607 | static void dyntick_save_progress_counter(int cpu) | ||
608 | { | ||
609 | struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); | ||
610 | |||
611 | rdssp->dynticks_snap = rdssp->dynticks; | ||
612 | } | ||
613 | |||
614 | static inline int | ||
615 | rcu_try_flip_waitack_needed(int cpu) | ||
616 | { | ||
617 | long curr; | ||
618 | long snap; | ||
619 | struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); | ||
620 | |||
621 | curr = rdssp->dynticks; | ||
622 | snap = rdssp->dynticks_snap; | ||
623 | smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ | ||
624 | |||
625 | /* | ||
626 | * If the CPU remained in dynticks mode for the entire time | ||
627 | * and didn't take any interrupts, NMIs, SMIs, or whatever, | ||
628 | * then it cannot be in the middle of an rcu_read_lock(), so | ||
629 | * the next rcu_read_lock() it executes must use the new value | ||
630 | * of the counter. So we can safely pretend that this CPU | ||
631 | * already acknowledged the counter. | ||
632 | */ | ||
633 | |||
634 | if ((curr == snap) && ((curr & 0x1) == 0)) | ||
635 | return 0; | ||
636 | |||
637 | /* | ||
638 | * If the CPU passed through or entered a dynticks idle phase with | ||
639 | * no active irq handlers, then, as above, we can safely pretend | ||
640 | * that this CPU already acknowledged the counter. | ||
641 | */ | ||
642 | |||
643 | if ((curr - snap) > 2 || (curr & 0x1) == 0) | ||
644 | return 0; | ||
645 | |||
646 | /* We need this CPU to explicitly acknowledge the counter flip. */ | ||
647 | |||
648 | return 1; | ||
649 | } | ||
650 | |||
651 | static inline int | ||
652 | rcu_try_flip_waitmb_needed(int cpu) | ||
653 | { | ||
654 | long curr; | ||
655 | long snap; | ||
656 | struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); | ||
657 | |||
658 | curr = rdssp->dynticks; | ||
659 | snap = rdssp->dynticks_snap; | ||
660 | smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ | ||
661 | |||
662 | /* | ||
663 | * If the CPU remained in dynticks mode for the entire time | ||
664 | * and didn't take any interrupts, NMIs, SMIs, or whatever, | ||
665 | * then it cannot have executed an RCU read-side critical section | ||
666 | * during that time, so there is no need for it to execute a | ||
667 | * memory barrier. | ||
668 | */ | ||
669 | |||
670 | if ((curr == snap) && ((curr & 0x1) == 0)) | ||
671 | return 0; | ||
672 | |||
673 | /* | ||
674 | * If the CPU either entered or exited an outermost interrupt, | ||
675 | * SMI, NMI, or whatever handler, then we know that it executed | ||
676 | * a memory barrier when doing so. So we don't need another one. | ||
677 | */ | ||
678 | if (curr != snap) | ||
679 | return 0; | ||
680 | |||
681 | /* We need the CPU to execute a memory barrier. */ | ||
682 | |||
683 | return 1; | ||
684 | } | ||
685 | |||
686 | static void dyntick_save_progress_counter_sched(int cpu) | ||
687 | { | ||
688 | struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); | ||
689 | |||
690 | rdssp->sched_dynticks_snap = rdssp->dynticks; | ||
691 | } | ||
692 | |||
693 | static int rcu_qsctr_inc_needed_dyntick(int cpu) | ||
694 | { | ||
695 | long curr; | ||
696 | long snap; | ||
697 | struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); | ||
698 | |||
699 | curr = rdssp->dynticks; | ||
700 | snap = rdssp->sched_dynticks_snap; | ||
701 | smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ | ||
702 | |||
703 | /* | ||
704 | * If the CPU remained in dynticks mode for the entire time | ||
705 | * and didn't take any interrupts, NMIs, SMIs, or whatever, | ||
706 | * then it cannot be in the middle of an rcu_read_lock(), so | ||
707 | * the next rcu_read_lock() it executes must use the new value | ||
708 | * of the counter. Therefore, this CPU has been in a quiescent | ||
709 | * state the entire time, and we don't need to wait for it. | ||
710 | */ | ||
711 | |||
712 | if ((curr == snap) && ((curr & 0x1) == 0)) | ||
713 | return 0; | ||
714 | |||
715 | /* | ||
716 | * If the CPU passed through or entered a dynticks idle phase with | ||
717 | * no active irq handlers, then, as above, this CPU has already | ||
718 | * passed through a quiescent state. | ||
719 | */ | ||
720 | |||
721 | if ((curr - snap) > 2 || (snap & 0x1) == 0) | ||
722 | return 0; | ||
723 | |||
724 | /* We need this CPU to go through a quiescent state. */ | ||
725 | |||
726 | return 1; | ||
727 | } | ||
728 | |||
729 | #else /* !CONFIG_NO_HZ */ | ||
730 | |||
/*
 * Without CONFIG_NO_HZ there is no dynticks state to consult: the
 * snapshot helpers do nothing and every "needed" check answers yes.
 */
# define dyntick_save_progress_counter(cpu)		do { } while (0)
# define rcu_try_flip_waitack_needed(cpu)		(1)
# define rcu_try_flip_waitmb_needed(cpu)		(1)

# define dyntick_save_progress_counter_sched(cpu)	do { } while (0)
# define rcu_qsctr_inc_needed_dyntick(cpu)		(1)
737 | |||
738 | #endif /* CONFIG_NO_HZ */ | ||
739 | |||
740 | static void save_qsctr_sched(int cpu) | ||
741 | { | ||
742 | struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); | ||
743 | |||
744 | rdssp->sched_qs_snap = rdssp->sched_qs; | ||
745 | } | ||
746 | |||
747 | static inline int rcu_qsctr_inc_needed(int cpu) | ||
748 | { | ||
749 | struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); | ||
750 | |||
751 | /* | ||
752 | * If there has been a quiescent state, no more need to wait | ||
753 | * on this CPU. | ||
754 | */ | ||
755 | |||
756 | if (rdssp->sched_qs != rdssp->sched_qs_snap) { | ||
757 | smp_mb(); /* force ordering with cpu entering schedule(). */ | ||
758 | return 0; | ||
759 | } | ||
760 | |||
761 | /* We need this CPU to go through a quiescent state. */ | ||
762 | |||
763 | return 1; | ||
764 | } | ||
765 | |||
766 | /* | ||
767 | * Get here when RCU is idle. Decide whether we need to | ||
768 | * move out of idle state, and return non-zero if so. | ||
769 | * "Straightforward" approach for the moment, might later | ||
770 | * use callback-list lengths, grace-period duration, or | ||
771 | * some such to determine when to exit idle state. | ||
772 | * Might also need a pre-idle test that does not acquire | ||
773 | * the lock, but let's get the simple case working first... | ||
774 | */ | ||
775 | |||
776 | static int | ||
777 | rcu_try_flip_idle(void) | ||
778 | { | ||
779 | int cpu; | ||
780 | |||
781 | RCU_TRACE_ME(rcupreempt_trace_try_flip_i1); | ||
782 | if (!rcu_pending(smp_processor_id())) { | ||
783 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1); | ||
784 | return 0; | ||
785 | } | ||
786 | |||
787 | /* | ||
788 | * Do the flip. | ||
789 | */ | ||
790 | |||
791 | RCU_TRACE_ME(rcupreempt_trace_try_flip_g1); | ||
792 | rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */ | ||
793 | |||
794 | /* | ||
795 | * Need a memory barrier so that other CPUs see the new | ||
796 | * counter value before they see the subsequent change of all | ||
797 | * the rcu_flip_flag instances to rcu_flipped. | ||
798 | */ | ||
799 | |||
800 | smp_mb(); /* see above block comment. */ | ||
801 | |||
802 | /* Now ask each CPU for acknowledgement of the flip. */ | ||
803 | |||
804 | for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { | ||
805 | per_cpu(rcu_flip_flag, cpu) = rcu_flipped; | ||
806 | dyntick_save_progress_counter(cpu); | ||
807 | } | ||
808 | |||
809 | return 1; | ||
810 | } | ||
811 | |||
812 | /* | ||
813 | * Wait for CPUs to acknowledge the flip. | ||
814 | */ | ||
815 | |||
816 | static int | ||
817 | rcu_try_flip_waitack(void) | ||
818 | { | ||
819 | int cpu; | ||
820 | |||
821 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); | ||
822 | for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) | ||
823 | if (rcu_try_flip_waitack_needed(cpu) && | ||
824 | per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { | ||
825 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); | ||
826 | return 0; | ||
827 | } | ||
828 | |||
829 | /* | ||
830 | * Make sure our checks above don't bleed into subsequent | ||
831 | * waiting for the sum of the counters to reach zero. | ||
832 | */ | ||
833 | |||
834 | smp_mb(); /* see above block comment. */ | ||
835 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a2); | ||
836 | return 1; | ||
837 | } | ||
838 | |||
839 | /* | ||
840 | * Wait for collective ``last'' counter to reach zero, | ||
841 | * then tell all CPUs to do an end-of-grace-period memory barrier. | ||
842 | */ | ||
843 | |||
844 | static int | ||
845 | rcu_try_flip_waitzero(void) | ||
846 | { | ||
847 | int cpu; | ||
848 | int lastidx = !(rcu_ctrlblk.completed & 0x1); | ||
849 | int sum = 0; | ||
850 | |||
851 | /* Check to see if the sum of the "last" counters is zero. */ | ||
852 | |||
853 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); | ||
854 | for_each_possible_cpu(cpu) | ||
855 | sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; | ||
856 | if (sum != 0) { | ||
857 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); | ||
858 | return 0; | ||
859 | } | ||
860 | |||
861 | /* | ||
862 | * This ensures that the other CPUs see the call for | ||
863 | * memory barriers -after- the sum to zero has been | ||
864 | * detected here | ||
865 | */ | ||
866 | smp_mb(); /* ^^^^^^^^^^^^ */ | ||
867 | |||
868 | /* Call for a memory barrier from each CPU. */ | ||
869 | for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) { | ||
870 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; | ||
871 | dyntick_save_progress_counter(cpu); | ||
872 | } | ||
873 | |||
874 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); | ||
875 | return 1; | ||
876 | } | ||
877 | |||
878 | /* | ||
879 | * Wait for all CPUs to do their end-of-grace-period memory barrier. | ||
880 | * Return 0 once all CPUs have done so. | ||
881 | */ | ||
882 | |||
883 | static int | ||
884 | rcu_try_flip_waitmb(void) | ||
885 | { | ||
886 | int cpu; | ||
887 | |||
888 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); | ||
889 | for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) | ||
890 | if (rcu_try_flip_waitmb_needed(cpu) && | ||
891 | per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { | ||
892 | RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); | ||
893 | return 0; | ||
894 | } | ||
895 | |||
896 | smp_mb(); /* Ensure that the above checks precede any following flip. */ | ||
897 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m2); | ||
898 | return 1; | ||
899 | } | ||
900 | |||
901 | /* | ||
902 | * Attempt a single flip of the counters. Remember, a single flip does | ||
903 | * -not- constitute a grace period. Instead, the interval between | ||
904 | * at least GP_STAGES consecutive flips is a grace period. | ||
905 | * | ||
906 | * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation | ||
907 | * on a large SMP, they might want to use a hierarchical organization of | ||
908 | * the per-CPU-counter pairs. | ||
909 | */ | ||
910 | static void rcu_try_flip(void) | ||
911 | { | ||
912 | unsigned long flags; | ||
913 | |||
914 | RCU_TRACE_ME(rcupreempt_trace_try_flip_1); | ||
915 | if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) { | ||
916 | RCU_TRACE_ME(rcupreempt_trace_try_flip_e1); | ||
917 | return; | ||
918 | } | ||
919 | |||
920 | /* | ||
921 | * Take the next transition(s) through the RCU grace-period | ||
922 | * flip-counter state machine. | ||
923 | */ | ||
924 | |||
925 | switch (rcu_ctrlblk.rcu_try_flip_state) { | ||
926 | case rcu_try_flip_idle_state: | ||
927 | if (rcu_try_flip_idle()) | ||
928 | rcu_ctrlblk.rcu_try_flip_state = | ||
929 | rcu_try_flip_waitack_state; | ||
930 | break; | ||
931 | case rcu_try_flip_waitack_state: | ||
932 | if (rcu_try_flip_waitack()) | ||
933 | rcu_ctrlblk.rcu_try_flip_state = | ||
934 | rcu_try_flip_waitzero_state; | ||
935 | break; | ||
936 | case rcu_try_flip_waitzero_state: | ||
937 | if (rcu_try_flip_waitzero()) | ||
938 | rcu_ctrlblk.rcu_try_flip_state = | ||
939 | rcu_try_flip_waitmb_state; | ||
940 | break; | ||
941 | case rcu_try_flip_waitmb_state: | ||
942 | if (rcu_try_flip_waitmb()) | ||
943 | rcu_ctrlblk.rcu_try_flip_state = | ||
944 | rcu_try_flip_idle_state; | ||
945 | } | ||
946 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
947 | } | ||
948 | |||
949 | /* | ||
950 | * Check to see if this CPU needs to do a memory barrier in order to | ||
951 | * ensure that any prior RCU read-side critical sections have committed | ||
952 | * their counter manipulations and critical-section memory references | ||
953 | * before declaring the grace period to be completed. | ||
954 | */ | ||
955 | static void rcu_check_mb(int cpu) | ||
956 | { | ||
957 | if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) { | ||
958 | smp_mb(); /* Ensure RCU read-side accesses are visible. */ | ||
959 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_done; | ||
960 | } | ||
961 | } | ||
962 | |||
963 | void rcu_check_callbacks(int cpu, int user) | ||
964 | { | ||
965 | unsigned long flags; | ||
966 | struct rcu_data *rdp; | ||
967 | |||
968 | if (!rcu_pending(cpu)) | ||
969 | return; /* if nothing for RCU to do. */ | ||
970 | |||
971 | /* | ||
972 | * If this CPU took its interrupt from user mode or from the | ||
973 | * idle loop, and this is not a nested interrupt, then | ||
974 | * this CPU has to have exited all prior preept-disable | ||
975 | * sections of code. So invoke rcu_sched_qs() to note this. | ||
976 | * | ||
977 | * The memory barrier is needed to handle the case where | ||
978 | * writes from a preempt-disable section of code get reordered | ||
979 | * into schedule() by this CPU's write buffer. So the memory | ||
980 | * barrier makes sure that the rcu_sched_qs() is seen by other | ||
981 | * CPUs to happen after any such write. | ||
982 | */ | ||
983 | |||
984 | rdp = RCU_DATA_CPU(cpu); | ||
985 | if (user || | ||
986 | (idle_cpu(cpu) && !in_softirq() && | ||
987 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
988 | smp_mb(); /* Guard against aggressive schedule(). */ | ||
989 | rcu_sched_qs(cpu); | ||
990 | } | ||
991 | |||
992 | rcu_check_mb(cpu); | ||
993 | if (rcu_ctrlblk.completed == rdp->completed) | ||
994 | rcu_try_flip(); | ||
995 | spin_lock_irqsave(&rdp->lock, flags); | ||
996 | RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); | ||
997 | __rcu_advance_callbacks(rdp); | ||
998 | if (rdp->donelist == NULL) { | ||
999 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
1000 | } else { | ||
1001 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
1002 | raise_softirq(RCU_SOFTIRQ); | ||
1003 | } | ||
1004 | } | ||
1005 | |||
1006 | /* | ||
1007 | * Needed by dynticks, to make sure all RCU processing has finished | ||
1008 | * when we go idle: | ||
1009 | */ | ||
1010 | void rcu_advance_callbacks(int cpu, int user) | ||
1011 | { | ||
1012 | unsigned long flags; | ||
1013 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
1014 | |||
1015 | if (rcu_ctrlblk.completed == rdp->completed) { | ||
1016 | rcu_try_flip(); | ||
1017 | if (rcu_ctrlblk.completed == rdp->completed) | ||
1018 | return; | ||
1019 | } | ||
1020 | spin_lock_irqsave(&rdp->lock, flags); | ||
1021 | RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); | ||
1022 | __rcu_advance_callbacks(rdp); | ||
1023 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
1024 | } | ||
1025 | |||
1026 | #ifdef CONFIG_HOTPLUG_CPU | ||
/*
 * Append the list headed by @srclist (whose tail pointer is @srctail)
 * to the list whose tail pointer is @dsttail, then reset the source to
 * the empty state (@srclist == NULL, @srctail == &@srclist).
 *
 * @srclist, @srctail and @dsttail must be modifiable lvalues: the macro
 * assigns to them and evaluates them more than once, so arguments must
 * not have side effects.  @dstlist is accepted for symmetry but is not
 * referenced by the expansion.
 *
 * When @srclist is NULL only "*@dsttail = NULL" is performed and the
 * tail pointers are left untouched.
 *
 * Every argument is parenthesized in the expansion so that an argument
 * expression cannot regroup with the surrounding operators.
 */
#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
		*(dsttail) = (srclist); \
		if ((srclist) != NULL) { \
			(dsttail) = (srctail); \
			(srclist) = NULL; \
			(srctail) = &(srclist); \
		} \
	} while (0)
1035 | |||
1036 | void rcu_offline_cpu(int cpu) | ||
1037 | { | ||
1038 | int i; | ||
1039 | struct rcu_head *list = NULL; | ||
1040 | unsigned long flags; | ||
1041 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
1042 | struct rcu_head *schedlist = NULL; | ||
1043 | struct rcu_head **schedtail = &schedlist; | ||
1044 | struct rcu_head **tail = &list; | ||
1045 | |||
1046 | /* | ||
1047 | * Remove all callbacks from the newly dead CPU, retaining order. | ||
1048 | * Otherwise rcu_barrier() will fail | ||
1049 | */ | ||
1050 | |||
1051 | spin_lock_irqsave(&rdp->lock, flags); | ||
1052 | rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail); | ||
1053 | for (i = GP_STAGES - 1; i >= 0; i--) | ||
1054 | rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], | ||
1055 | list, tail); | ||
1056 | rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); | ||
1057 | rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail, | ||
1058 | schedlist, schedtail); | ||
1059 | rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail, | ||
1060 | schedlist, schedtail); | ||
1061 | rdp->rcu_sched_sleeping = 0; | ||
1062 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
1063 | rdp->waitlistcount = 0; | ||
1064 | |||
1065 | /* Disengage the newly dead CPU from the grace-period computation. */ | ||
1066 | |||
1067 | spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); | ||
1068 | rcu_check_mb(cpu); | ||
1069 | if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { | ||
1070 | smp_mb(); /* Subsequent counter accesses must see new value */ | ||
1071 | per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; | ||
1072 | smp_mb(); /* Subsequent RCU read-side critical sections */ | ||
1073 | /* seen -after- acknowledgement. */ | ||
1074 | } | ||
1075 | |||
1076 | cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map)); | ||
1077 | |||
1078 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
1079 | |||
1080 | /* | ||
1081 | * Place the removed callbacks on the current CPU's queue. | ||
1082 | * Make them all start a new grace period: simple approach, | ||
1083 | * in theory could starve a given set of callbacks, but | ||
1084 | * you would need to be doing some serious CPU hotplugging | ||
1085 | * to make this happen. If this becomes a problem, adding | ||
1086 | * a synchronize_rcu() to the hotplug path would be a simple | ||
1087 | * fix. | ||
1088 | */ | ||
1089 | |||
1090 | local_irq_save(flags); /* disable preempt till we know what lock. */ | ||
1091 | rdp = RCU_DATA_ME(); | ||
1092 | spin_lock(&rdp->lock); | ||
1093 | *rdp->nexttail = list; | ||
1094 | if (list) | ||
1095 | rdp->nexttail = tail; | ||
1096 | *rdp->nextschedtail = schedlist; | ||
1097 | if (schedlist) | ||
1098 | rdp->nextschedtail = schedtail; | ||
1099 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
1100 | } | ||
1101 | |||
1102 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
1103 | |||
void rcu_offline_cpu(int cpu)
{
	/* Nothing to do when CONFIG_HOTPLUG_CPU is not set. */
}
1107 | |||
1108 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | ||
1109 | |||
1110 | void __cpuinit rcu_online_cpu(int cpu) | ||
1111 | { | ||
1112 | unsigned long flags; | ||
1113 | struct rcu_data *rdp; | ||
1114 | |||
1115 | spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); | ||
1116 | cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map)); | ||
1117 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
1118 | |||
1119 | /* | ||
1120 | * The rcu_sched grace-period processing might have bypassed | ||
1121 | * this CPU, given that it was not in the rcu_cpu_online_map | ||
1122 | * when the grace-period scan started. This means that the | ||
1123 | * grace-period task might sleep. So make sure that if this | ||
1124 | * should happen, the first callback posted to this CPU will | ||
1125 | * wake up the grace-period task if need be. | ||
1126 | */ | ||
1127 | |||
1128 | rdp = RCU_DATA_CPU(cpu); | ||
1129 | spin_lock_irqsave(&rdp->lock, flags); | ||
1130 | rdp->rcu_sched_sleeping = 1; | ||
1131 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
1132 | } | ||
1133 | |||
1134 | static void rcu_process_callbacks(struct softirq_action *unused) | ||
1135 | { | ||
1136 | unsigned long flags; | ||
1137 | struct rcu_head *next, *list; | ||
1138 | struct rcu_data *rdp; | ||
1139 | |||
1140 | local_irq_save(flags); | ||
1141 | rdp = RCU_DATA_ME(); | ||
1142 | spin_lock(&rdp->lock); | ||
1143 | list = rdp->donelist; | ||
1144 | if (list == NULL) { | ||
1145 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
1146 | return; | ||
1147 | } | ||
1148 | rdp->donelist = NULL; | ||
1149 | rdp->donetail = &rdp->donelist; | ||
1150 | RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp); | ||
1151 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
1152 | while (list) { | ||
1153 | next = list->next; | ||
1154 | list->func(list); | ||
1155 | list = next; | ||
1156 | RCU_TRACE_ME(rcupreempt_trace_invoke); | ||
1157 | } | ||
1158 | } | ||
1159 | |||
1160 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
1161 | { | ||
1162 | unsigned long flags; | ||
1163 | struct rcu_data *rdp; | ||
1164 | |||
1165 | head->func = func; | ||
1166 | head->next = NULL; | ||
1167 | local_irq_save(flags); | ||
1168 | rdp = RCU_DATA_ME(); | ||
1169 | spin_lock(&rdp->lock); | ||
1170 | __rcu_advance_callbacks(rdp); | ||
1171 | *rdp->nexttail = head; | ||
1172 | rdp->nexttail = &head->next; | ||
1173 | RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); | ||
1174 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
1175 | } | ||
1176 | EXPORT_SYMBOL_GPL(call_rcu); | ||
1177 | |||
1178 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
1179 | { | ||
1180 | unsigned long flags; | ||
1181 | struct rcu_data *rdp; | ||
1182 | int wake_gp = 0; | ||
1183 | |||
1184 | head->func = func; | ||
1185 | head->next = NULL; | ||
1186 | local_irq_save(flags); | ||
1187 | rdp = RCU_DATA_ME(); | ||
1188 | spin_lock(&rdp->lock); | ||
1189 | *rdp->nextschedtail = head; | ||
1190 | rdp->nextschedtail = &head->next; | ||
1191 | if (rdp->rcu_sched_sleeping) { | ||
1192 | |||
1193 | /* Grace-period processing might be sleeping... */ | ||
1194 | |||
1195 | rdp->rcu_sched_sleeping = 0; | ||
1196 | wake_gp = 1; | ||
1197 | } | ||
1198 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
1199 | if (wake_gp) { | ||
1200 | |||
1201 | /* Wake up grace-period processing, unless someone beat us. */ | ||
1202 | |||
1203 | spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); | ||
1204 | if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping) | ||
1205 | wake_gp = 0; | ||
1206 | rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping; | ||
1207 | spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); | ||
1208 | if (wake_gp) | ||
1209 | wake_up_interruptible(&rcu_ctrlblk.sched_wq); | ||
1210 | } | ||
1211 | } | ||
1212 | EXPORT_SYMBOL_GPL(call_rcu_sched); | ||
1213 | |||
1214 | /* | ||
1215 | * Wait until all currently running preempt_disable() code segments | ||
1216 | * (including hardware-irq-disable segments) complete. Note that | ||
1217 | * in -rt this does -not- necessarily result in all currently executing | ||
1218 | * interrupt -handlers- having completed. | ||
1219 | */ | ||
1220 | void __synchronize_sched(void) | ||
1221 | { | ||
1222 | struct rcu_synchronize rcu; | ||
1223 | |||
1224 | if (num_online_cpus() == 1) | ||
1225 | return; /* blocking is gp if only one CPU! */ | ||
1226 | |||
1227 | init_completion(&rcu.completion); | ||
1228 | /* Will wake me after RCU finished. */ | ||
1229 | call_rcu_sched(&rcu.head, wakeme_after_rcu); | ||
1230 | /* Wait for it. */ | ||
1231 | wait_for_completion(&rcu.completion); | ||
1232 | } | ||
1233 | EXPORT_SYMBOL_GPL(__synchronize_sched); | ||
1234 | |||
1235 | /* | ||
1236 | * kthread function that manages call_rcu_sched grace periods. | ||
1237 | */ | ||
1238 | static int rcu_sched_grace_period(void *arg) | ||
1239 | { | ||
1240 | int couldsleep; /* might sleep after current pass. */ | ||
1241 | int couldsleepnext = 0; /* might sleep after next pass. */ | ||
1242 | int cpu; | ||
1243 | unsigned long flags; | ||
1244 | struct rcu_data *rdp; | ||
1245 | int ret; | ||
1246 | |||
1247 | /* | ||
1248 | * Each pass through the following loop handles one | ||
1249 | * rcu_sched grace period cycle. | ||
1250 | */ | ||
1251 | do { | ||
1252 | /* Save each CPU's current state. */ | ||
1253 | |||
1254 | for_each_online_cpu(cpu) { | ||
1255 | dyntick_save_progress_counter_sched(cpu); | ||
1256 | save_qsctr_sched(cpu); | ||
1257 | } | ||
1258 | |||
1259 | /* | ||
1260 | * Sleep for about an RCU grace-period's worth to | ||
1261 | * allow better batching and to consume less CPU. | ||
1262 | */ | ||
1263 | schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME); | ||
1264 | |||
1265 | /* | ||
1266 | * If there was nothing to do last time, prepare to | ||
1267 | * sleep at the end of the current grace period cycle. | ||
1268 | */ | ||
1269 | couldsleep = couldsleepnext; | ||
1270 | couldsleepnext = 1; | ||
1271 | if (couldsleep) { | ||
1272 | spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); | ||
1273 | rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep; | ||
1274 | spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); | ||
1275 | } | ||
1276 | |||
1277 | /* | ||
1278 | * Wait on each CPU in turn to have either visited | ||
1279 | * a quiescent state or been in dynticks-idle mode. | ||
1280 | */ | ||
1281 | for_each_online_cpu(cpu) { | ||
1282 | while (rcu_qsctr_inc_needed(cpu) && | ||
1283 | rcu_qsctr_inc_needed_dyntick(cpu)) { | ||
1284 | /* resched_cpu(cpu); @@@ */ | ||
1285 | schedule_timeout_interruptible(1); | ||
1286 | } | ||
1287 | } | ||
1288 | |||
1289 | /* Advance callbacks for each CPU. */ | ||
1290 | |||
1291 | for_each_online_cpu(cpu) { | ||
1292 | |||
1293 | rdp = RCU_DATA_CPU(cpu); | ||
1294 | spin_lock_irqsave(&rdp->lock, flags); | ||
1295 | |||
1296 | /* | ||
1297 | * We are running on this CPU irq-disabled, so no | ||
1298 | * CPU can go offline until we re-enable irqs. | ||
1299 | * The current CPU might have already gone | ||
1300 | * offline (between the for_each_offline_cpu and | ||
1301 | * the spin_lock_irqsave), but in that case all its | ||
1302 | * callback lists will be empty, so no harm done. | ||
1303 | * | ||
1304 | * Advance the callbacks! We share normal RCU's | ||
1305 | * donelist, since callbacks are invoked the | ||
1306 | * same way in either case. | ||
1307 | */ | ||
1308 | if (rdp->waitschedlist != NULL) { | ||
1309 | *rdp->donetail = rdp->waitschedlist; | ||
1310 | rdp->donetail = rdp->waitschedtail; | ||
1311 | |||
1312 | /* | ||
1313 | * Next rcu_check_callbacks() will | ||
1314 | * do the required raise_softirq(). | ||
1315 | */ | ||
1316 | } | ||
1317 | if (rdp->nextschedlist != NULL) { | ||
1318 | rdp->waitschedlist = rdp->nextschedlist; | ||
1319 | rdp->waitschedtail = rdp->nextschedtail; | ||
1320 | couldsleep = 0; | ||
1321 | couldsleepnext = 0; | ||
1322 | } else { | ||
1323 | rdp->waitschedlist = NULL; | ||
1324 | rdp->waitschedtail = &rdp->waitschedlist; | ||
1325 | } | ||
1326 | rdp->nextschedlist = NULL; | ||
1327 | rdp->nextschedtail = &rdp->nextschedlist; | ||
1328 | |||
1329 | /* Mark sleep intention. */ | ||
1330 | |||
1331 | rdp->rcu_sched_sleeping = couldsleep; | ||
1332 | |||
1333 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
1334 | } | ||
1335 | |||
1336 | /* If we saw callbacks on the last scan, go deal with them. */ | ||
1337 | |||
1338 | if (!couldsleep) | ||
1339 | continue; | ||
1340 | |||
1341 | /* Attempt to block... */ | ||
1342 | |||
1343 | spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags); | ||
1344 | if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) { | ||
1345 | |||
1346 | /* | ||
1347 | * Someone posted a callback after we scanned. | ||
1348 | * Go take care of it. | ||
1349 | */ | ||
1350 | spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); | ||
1351 | couldsleepnext = 0; | ||
1352 | continue; | ||
1353 | } | ||
1354 | |||
1355 | /* Block until the next person posts a callback. */ | ||
1356 | |||
1357 | rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; | ||
1358 | spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); | ||
1359 | ret = 0; /* unused */ | ||
1360 | __wait_event_interruptible(rcu_ctrlblk.sched_wq, | ||
1361 | rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, | ||
1362 | ret); | ||
1363 | |||
1364 | couldsleepnext = 0; | ||
1365 | |||
1366 | } while (!kthread_should_stop()); | ||
1367 | |||
1368 | return (0); | ||
1369 | } | ||
1370 | |||
1371 | /* | ||
1372 | * Check to see if any future RCU-related work will need to be done | ||
1373 | * by the current CPU, even if none need be done immediately, returning | ||
1374 | * 1 if so. Assumes that notifiers would take care of handling any | ||
1375 | * outstanding requests from the RCU core. | ||
1376 | * | ||
1377 | * This function is part of the RCU implementation; it is -not- | ||
1378 | * an exported member of the RCU API. | ||
1379 | */ | ||
1380 | int rcu_needs_cpu(int cpu) | ||
1381 | { | ||
1382 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
1383 | |||
1384 | return (rdp->donelist != NULL || | ||
1385 | !!rdp->waitlistcount || | ||
1386 | rdp->nextlist != NULL || | ||
1387 | rdp->nextschedlist != NULL || | ||
1388 | rdp->waitschedlist != NULL); | ||
1389 | } | ||
1390 | |||
1391 | static int rcu_pending(int cpu) | ||
1392 | { | ||
1393 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
1394 | |||
1395 | /* The CPU has at least one callback queued somewhere. */ | ||
1396 | |||
1397 | if (rdp->donelist != NULL || | ||
1398 | !!rdp->waitlistcount || | ||
1399 | rdp->nextlist != NULL || | ||
1400 | rdp->nextschedlist != NULL || | ||
1401 | rdp->waitschedlist != NULL) | ||
1402 | return 1; | ||
1403 | |||
1404 | /* The RCU core needs an acknowledgement from this CPU. */ | ||
1405 | |||
1406 | if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) || | ||
1407 | (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed)) | ||
1408 | return 1; | ||
1409 | |||
1410 | /* This CPU has fallen behind the global grace-period number. */ | ||
1411 | |||
1412 | if (rdp->completed != rcu_ctrlblk.completed) | ||
1413 | return 1; | ||
1414 | |||
1415 | /* Nothing needed from this CPU. */ | ||
1416 | |||
1417 | return 0; | ||
1418 | } | ||
1419 | |||
1420 | int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
1421 | unsigned long action, void *hcpu) | ||
1422 | { | ||
1423 | long cpu = (long)hcpu; | ||
1424 | |||
1425 | switch (action) { | ||
1426 | case CPU_UP_PREPARE: | ||
1427 | case CPU_UP_PREPARE_FROZEN: | ||
1428 | rcu_online_cpu(cpu); | ||
1429 | break; | ||
1430 | case CPU_UP_CANCELED: | ||
1431 | case CPU_UP_CANCELED_FROZEN: | ||
1432 | case CPU_DEAD: | ||
1433 | case CPU_DEAD_FROZEN: | ||
1434 | rcu_offline_cpu(cpu); | ||
1435 | break; | ||
1436 | default: | ||
1437 | break; | ||
1438 | } | ||
1439 | return NOTIFY_OK; | ||
1440 | } | ||
1441 | |||
1442 | void __init __rcu_init(void) | ||
1443 | { | ||
1444 | int cpu; | ||
1445 | int i; | ||
1446 | struct rcu_data *rdp; | ||
1447 | |||
1448 | printk(KERN_NOTICE "Preemptible RCU implementation.\n"); | ||
1449 | for_each_possible_cpu(cpu) { | ||
1450 | rdp = RCU_DATA_CPU(cpu); | ||
1451 | spin_lock_init(&rdp->lock); | ||
1452 | rdp->completed = 0; | ||
1453 | rdp->waitlistcount = 0; | ||
1454 | rdp->nextlist = NULL; | ||
1455 | rdp->nexttail = &rdp->nextlist; | ||
1456 | for (i = 0; i < GP_STAGES; i++) { | ||
1457 | rdp->waitlist[i] = NULL; | ||
1458 | rdp->waittail[i] = &rdp->waitlist[i]; | ||
1459 | } | ||
1460 | rdp->donelist = NULL; | ||
1461 | rdp->donetail = &rdp->donelist; | ||
1462 | rdp->rcu_flipctr[0] = 0; | ||
1463 | rdp->rcu_flipctr[1] = 0; | ||
1464 | rdp->nextschedlist = NULL; | ||
1465 | rdp->nextschedtail = &rdp->nextschedlist; | ||
1466 | rdp->waitschedlist = NULL; | ||
1467 | rdp->waitschedtail = &rdp->waitschedlist; | ||
1468 | rdp->rcu_sched_sleeping = 0; | ||
1469 | } | ||
1470 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | ||
1471 | } | ||
1472 | |||
1473 | /* | ||
1474 | * Late-boot-time RCU initialization that must wait until after scheduler | ||
1475 | * has been initialized. | ||
1476 | */ | ||
1477 | void __init rcu_init_sched(void) | ||
1478 | { | ||
1479 | rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period, | ||
1480 | NULL, | ||
1481 | "rcu_sched_grace_period"); | ||
1482 | WARN_ON(IS_ERR(rcu_sched_grace_period_task)); | ||
1483 | } | ||
1484 | |||
1485 | #ifdef CONFIG_RCU_TRACE | ||
1486 | long *rcupreempt_flipctr(int cpu) | ||
1487 | { | ||
1488 | return &RCU_DATA_CPU(cpu)->rcu_flipctr[0]; | ||
1489 | } | ||
1490 | EXPORT_SYMBOL_GPL(rcupreempt_flipctr); | ||
1491 | |||
1492 | int rcupreempt_flip_flag(int cpu) | ||
1493 | { | ||
1494 | return per_cpu(rcu_flip_flag, cpu); | ||
1495 | } | ||
1496 | EXPORT_SYMBOL_GPL(rcupreempt_flip_flag); | ||
1497 | |||
1498 | int rcupreempt_mb_flag(int cpu) | ||
1499 | { | ||
1500 | return per_cpu(rcu_mb_flag, cpu); | ||
1501 | } | ||
1502 | EXPORT_SYMBOL_GPL(rcupreempt_mb_flag); | ||
1503 | |||
1504 | char *rcupreempt_try_flip_state_name(void) | ||
1505 | { | ||
1506 | return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state]; | ||
1507 | } | ||
1508 | EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name); | ||
1509 | |||
1510 | struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu) | ||
1511 | { | ||
1512 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
1513 | |||
1514 | return &rdp->trace; | ||
1515 | } | ||
1516 | EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu); | ||
1517 | |||
1518 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||