diff options
Diffstat (limited to 'kernel/rcupreempt.c')
-rw-r--r-- | kernel/rcupreempt.c | 953 |
1 files changed, 953 insertions, 0 deletions
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c new file mode 100644 index 000000000000..987cfb7ade89 --- /dev/null +++ b/kernel/rcupreempt.c | |||
@@ -0,0 +1,953 @@ | |||
1 | /* | ||
2 | * Read-Copy Update mechanism for mutual exclusion, realtime implementation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2006 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | ||
21 | * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar | ||
22 | * for pushing me away from locks and towards counters, and | ||
23 | * to Suparna Bhattacharya for pushing me completely away | ||
24 | * from atomic instructions on the read side. | ||
25 | * | ||
26 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
27 | * | ||
28 | * Design Document: http://lwn.net/Articles/253651/ | ||
29 | * | ||
30 | * For detailed explanation of Read-Copy Update mechanism see - | ||
31 | * Documentation/RCU/ *.txt | ||
32 | * | ||
33 | */ | ||
34 | #include <linux/types.h> | ||
35 | #include <linux/kernel.h> | ||
36 | #include <linux/init.h> | ||
37 | #include <linux/spinlock.h> | ||
38 | #include <linux/smp.h> | ||
39 | #include <linux/rcupdate.h> | ||
40 | #include <linux/interrupt.h> | ||
41 | #include <linux/sched.h> | ||
42 | #include <asm/atomic.h> | ||
43 | #include <linux/bitops.h> | ||
44 | #include <linux/module.h> | ||
45 | #include <linux/completion.h> | ||
46 | #include <linux/moduleparam.h> | ||
47 | #include <linux/percpu.h> | ||
48 | #include <linux/notifier.h> | ||
49 | #include <linux/rcupdate.h> | ||
50 | #include <linux/cpu.h> | ||
51 | #include <linux/random.h> | ||
52 | #include <linux/delay.h> | ||
53 | #include <linux/byteorder/swabb.h> | ||
54 | #include <linux/cpumask.h> | ||
55 | #include <linux/rcupreempt_trace.h> | ||
56 | |||
57 | /* | ||
58 | * Macro that prevents the compiler from reordering accesses, but does | ||
59 | * absolutely -nothing- to prevent CPUs from reordering. This is used | ||
60 | * only to mediate communication between mainline code and hardware | ||
61 | * interrupt and NMI handlers. | ||
62 | */ | ||
63 | #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) | ||
64 | |||
65 | /* | ||
66 | * PREEMPT_RCU data structures. | ||
67 | */ | ||
68 | |||
69 | /* | ||
70 | * GP_STAGES specifies the number of times the state machine has | ||
71 | * to go through the all the rcu_try_flip_states (see below) | ||
72 | * in a single Grace Period. | ||
73 | * | ||
74 | * GP in GP_STAGES stands for Grace Period ;) | ||
75 | */ | ||
76 | #define GP_STAGES 2 | ||
77 | struct rcu_data { | ||
78 | spinlock_t lock; /* Protect rcu_data fields. */ | ||
79 | long completed; /* Number of last completed batch. */ | ||
80 | int waitlistcount; | ||
81 | struct tasklet_struct rcu_tasklet; | ||
82 | struct rcu_head *nextlist; | ||
83 | struct rcu_head **nexttail; | ||
84 | struct rcu_head *waitlist[GP_STAGES]; | ||
85 | struct rcu_head **waittail[GP_STAGES]; | ||
86 | struct rcu_head *donelist; | ||
87 | struct rcu_head **donetail; | ||
88 | long rcu_flipctr[2]; | ||
89 | #ifdef CONFIG_RCU_TRACE | ||
90 | struct rcupreempt_trace trace; | ||
91 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
92 | }; | ||
93 | |||
94 | /* | ||
95 | * States for rcu_try_flip() and friends. | ||
96 | */ | ||
97 | |||
98 | enum rcu_try_flip_states { | ||
99 | |||
100 | /* | ||
101 | * Stay here if nothing is happening. Flip the counter if somthing | ||
102 | * starts happening. Denoted by "I" | ||
103 | */ | ||
104 | rcu_try_flip_idle_state, | ||
105 | |||
106 | /* | ||
107 | * Wait here for all CPUs to notice that the counter has flipped. This | ||
108 | * prevents the old set of counters from ever being incremented once | ||
109 | * we leave this state, which in turn is necessary because we cannot | ||
110 | * test any individual counter for zero -- we can only check the sum. | ||
111 | * Denoted by "A". | ||
112 | */ | ||
113 | rcu_try_flip_waitack_state, | ||
114 | |||
115 | /* | ||
116 | * Wait here for the sum of the old per-CPU counters to reach zero. | ||
117 | * Denoted by "Z". | ||
118 | */ | ||
119 | rcu_try_flip_waitzero_state, | ||
120 | |||
121 | /* | ||
122 | * Wait here for each of the other CPUs to execute a memory barrier. | ||
123 | * This is necessary to ensure that these other CPUs really have | ||
124 | * completed executing their RCU read-side critical sections, despite | ||
125 | * their CPUs wildly reordering memory. Denoted by "M". | ||
126 | */ | ||
127 | rcu_try_flip_waitmb_state, | ||
128 | }; | ||
129 | |||
130 | struct rcu_ctrlblk { | ||
131 | spinlock_t fliplock; /* Protect state-machine transitions. */ | ||
132 | long completed; /* Number of last completed batch. */ | ||
133 | enum rcu_try_flip_states rcu_try_flip_state; /* The current state of | ||
134 | the rcu state machine */ | ||
135 | }; | ||
136 | |||
137 | static DEFINE_PER_CPU(struct rcu_data, rcu_data); | ||
138 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
139 | .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), | ||
140 | .completed = 0, | ||
141 | .rcu_try_flip_state = rcu_try_flip_idle_state, | ||
142 | }; | ||
143 | |||
144 | |||
145 | #ifdef CONFIG_RCU_TRACE | ||
146 | static char *rcu_try_flip_state_names[] = | ||
147 | { "idle", "waitack", "waitzero", "waitmb" }; | ||
148 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
149 | |||
150 | static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; | ||
151 | |||
152 | /* | ||
153 | * Enum and per-CPU flag to determine when each CPU has seen | ||
154 | * the most recent counter flip. | ||
155 | */ | ||
156 | |||
157 | enum rcu_flip_flag_values { | ||
158 | rcu_flip_seen, /* Steady/initial state, last flip seen. */ | ||
159 | /* Only GP detector can update. */ | ||
160 | rcu_flipped /* Flip just completed, need confirmation. */ | ||
161 | /* Only corresponding CPU can update. */ | ||
162 | }; | ||
163 | static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag) | ||
164 | = rcu_flip_seen; | ||
165 | |||
166 | /* | ||
167 | * Enum and per-CPU flag to determine when each CPU has executed the | ||
168 | * needed memory barrier to fence in memory references from its last RCU | ||
169 | * read-side critical section in the just-completed grace period. | ||
170 | */ | ||
171 | |||
172 | enum rcu_mb_flag_values { | ||
173 | rcu_mb_done, /* Steady/initial state, no mb()s required. */ | ||
174 | /* Only GP detector can update. */ | ||
175 | rcu_mb_needed /* Flip just completed, need an mb(). */ | ||
176 | /* Only corresponding CPU can update. */ | ||
177 | }; | ||
178 | static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) | ||
179 | = rcu_mb_done; | ||
180 | |||
181 | /* | ||
182 | * RCU_DATA_ME: find the current CPU's rcu_data structure. | ||
183 | * RCU_DATA_CPU: find the specified CPU's rcu_data structure. | ||
184 | */ | ||
185 | #define RCU_DATA_ME() (&__get_cpu_var(rcu_data)) | ||
186 | #define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu)) | ||
187 | |||
188 | /* | ||
189 | * Helper macro for tracing when the appropriate rcu_data is not | ||
190 | * cached in a local variable, but where the CPU number is so cached. | ||
191 | */ | ||
192 | #define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace)); | ||
193 | |||
194 | /* | ||
195 | * Helper macro for tracing when the appropriate rcu_data is not | ||
196 | * cached in a local variable. | ||
197 | */ | ||
198 | #define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace)); | ||
199 | |||
200 | /* | ||
201 | * Helper macro for tracing when the appropriate rcu_data is pointed | ||
202 | * to by a local variable. | ||
203 | */ | ||
204 | #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); | ||
205 | |||
206 | /* | ||
207 | * Return the number of RCU batches processed thus far. Useful | ||
208 | * for debug and statistics. | ||
209 | */ | ||
210 | long rcu_batches_completed(void) | ||
211 | { | ||
212 | return rcu_ctrlblk.completed; | ||
213 | } | ||
214 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
215 | |||
216 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
217 | |||
218 | void __rcu_read_lock(void) | ||
219 | { | ||
220 | int idx; | ||
221 | struct task_struct *t = current; | ||
222 | int nesting; | ||
223 | |||
224 | nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
225 | if (nesting != 0) { | ||
226 | |||
227 | /* An earlier rcu_read_lock() covers us, just count it. */ | ||
228 | |||
229 | t->rcu_read_lock_nesting = nesting + 1; | ||
230 | |||
231 | } else { | ||
232 | unsigned long flags; | ||
233 | |||
234 | /* | ||
235 | * We disable interrupts for the following reasons: | ||
236 | * - If we get scheduling clock interrupt here, and we | ||
237 | * end up acking the counter flip, it's like a promise | ||
238 | * that we will never increment the old counter again. | ||
239 | * Thus we will break that promise if that | ||
240 | * scheduling clock interrupt happens between the time | ||
241 | * we pick the .completed field and the time that we | ||
242 | * increment our counter. | ||
243 | * | ||
244 | * - We don't want to be preempted out here. | ||
245 | * | ||
246 | * NMIs can still occur, of course, and might themselves | ||
247 | * contain rcu_read_lock(). | ||
248 | */ | ||
249 | |||
250 | local_irq_save(flags); | ||
251 | |||
252 | /* | ||
253 | * Outermost nesting of rcu_read_lock(), so increment | ||
254 | * the current counter for the current CPU. Use volatile | ||
255 | * casts to prevent the compiler from reordering. | ||
256 | */ | ||
257 | |||
258 | idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1; | ||
259 | ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++; | ||
260 | |||
261 | /* | ||
262 | * Now that the per-CPU counter has been incremented, we | ||
263 | * are protected from races with rcu_read_lock() invoked | ||
264 | * from NMI handlers on this CPU. We can therefore safely | ||
265 | * increment the nesting counter, relieving further NMIs | ||
266 | * of the need to increment the per-CPU counter. | ||
267 | */ | ||
268 | |||
269 | ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1; | ||
270 | |||
271 | /* | ||
272 | * Now that we have preventing any NMIs from storing | ||
273 | * to the ->rcu_flipctr_idx, we can safely use it to | ||
274 | * remember which counter to decrement in the matching | ||
275 | * rcu_read_unlock(). | ||
276 | */ | ||
277 | |||
278 | ACCESS_ONCE(t->rcu_flipctr_idx) = idx; | ||
279 | local_irq_restore(flags); | ||
280 | } | ||
281 | } | ||
282 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
283 | |||
284 | void __rcu_read_unlock(void) | ||
285 | { | ||
286 | int idx; | ||
287 | struct task_struct *t = current; | ||
288 | int nesting; | ||
289 | |||
290 | nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
291 | if (nesting > 1) { | ||
292 | |||
293 | /* | ||
294 | * We are still protected by the enclosing rcu_read_lock(), | ||
295 | * so simply decrement the counter. | ||
296 | */ | ||
297 | |||
298 | t->rcu_read_lock_nesting = nesting - 1; | ||
299 | |||
300 | } else { | ||
301 | unsigned long flags; | ||
302 | |||
303 | /* | ||
304 | * Disable local interrupts to prevent the grace-period | ||
305 | * detection state machine from seeing us half-done. | ||
306 | * NMIs can still occur, of course, and might themselves | ||
307 | * contain rcu_read_lock() and rcu_read_unlock(). | ||
308 | */ | ||
309 | |||
310 | local_irq_save(flags); | ||
311 | |||
312 | /* | ||
313 | * Outermost nesting of rcu_read_unlock(), so we must | ||
314 | * decrement the current counter for the current CPU. | ||
315 | * This must be done carefully, because NMIs can | ||
316 | * occur at any point in this code, and any rcu_read_lock() | ||
317 | * and rcu_read_unlock() pairs in the NMI handlers | ||
318 | * must interact non-destructively with this code. | ||
319 | * Lots of volatile casts, and -very- careful ordering. | ||
320 | * | ||
321 | * Changes to this code, including this one, must be | ||
322 | * inspected, validated, and tested extremely carefully!!! | ||
323 | */ | ||
324 | |||
325 | /* | ||
326 | * First, pick up the index. | ||
327 | */ | ||
328 | |||
329 | idx = ACCESS_ONCE(t->rcu_flipctr_idx); | ||
330 | |||
331 | /* | ||
332 | * Now that we have fetched the counter index, it is | ||
333 | * safe to decrement the per-task RCU nesting counter. | ||
334 | * After this, any interrupts or NMIs will increment and | ||
335 | * decrement the per-CPU counters. | ||
336 | */ | ||
337 | ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1; | ||
338 | |||
339 | /* | ||
340 | * It is now safe to decrement this task's nesting count. | ||
341 | * NMIs that occur after this statement will route their | ||
342 | * rcu_read_lock() calls through this "else" clause, and | ||
343 | * will thus start incrementing the per-CPU counter on | ||
344 | * their own. They will also clobber ->rcu_flipctr_idx, | ||
345 | * but that is OK, since we have already fetched it. | ||
346 | */ | ||
347 | |||
348 | ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--; | ||
349 | local_irq_restore(flags); | ||
350 | } | ||
351 | } | ||
352 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
353 | |||
354 | /* | ||
355 | * If a global counter flip has occurred since the last time that we | ||
356 | * advanced callbacks, advance them. Hardware interrupts must be | ||
357 | * disabled when calling this function. | ||
358 | */ | ||
359 | static void __rcu_advance_callbacks(struct rcu_data *rdp) | ||
360 | { | ||
361 | int cpu; | ||
362 | int i; | ||
363 | int wlc = 0; | ||
364 | |||
365 | if (rdp->completed != rcu_ctrlblk.completed) { | ||
366 | if (rdp->waitlist[GP_STAGES - 1] != NULL) { | ||
367 | *rdp->donetail = rdp->waitlist[GP_STAGES - 1]; | ||
368 | rdp->donetail = rdp->waittail[GP_STAGES - 1]; | ||
369 | RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp); | ||
370 | } | ||
371 | for (i = GP_STAGES - 2; i >= 0; i--) { | ||
372 | if (rdp->waitlist[i] != NULL) { | ||
373 | rdp->waitlist[i + 1] = rdp->waitlist[i]; | ||
374 | rdp->waittail[i + 1] = rdp->waittail[i]; | ||
375 | wlc++; | ||
376 | } else { | ||
377 | rdp->waitlist[i + 1] = NULL; | ||
378 | rdp->waittail[i + 1] = | ||
379 | &rdp->waitlist[i + 1]; | ||
380 | } | ||
381 | } | ||
382 | if (rdp->nextlist != NULL) { | ||
383 | rdp->waitlist[0] = rdp->nextlist; | ||
384 | rdp->waittail[0] = rdp->nexttail; | ||
385 | wlc++; | ||
386 | rdp->nextlist = NULL; | ||
387 | rdp->nexttail = &rdp->nextlist; | ||
388 | RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp); | ||
389 | } else { | ||
390 | rdp->waitlist[0] = NULL; | ||
391 | rdp->waittail[0] = &rdp->waitlist[0]; | ||
392 | } | ||
393 | rdp->waitlistcount = wlc; | ||
394 | rdp->completed = rcu_ctrlblk.completed; | ||
395 | } | ||
396 | |||
397 | /* | ||
398 | * Check to see if this CPU needs to report that it has seen | ||
399 | * the most recent counter flip, thereby declaring that all | ||
400 | * subsequent rcu_read_lock() invocations will respect this flip. | ||
401 | */ | ||
402 | |||
403 | cpu = raw_smp_processor_id(); | ||
404 | if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { | ||
405 | smp_mb(); /* Subsequent counter accesses must see new value */ | ||
406 | per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; | ||
407 | smp_mb(); /* Subsequent RCU read-side critical sections */ | ||
408 | /* seen -after- acknowledgement. */ | ||
409 | } | ||
410 | } | ||
411 | |||
412 | /* | ||
413 | * Get here when RCU is idle. Decide whether we need to | ||
414 | * move out of idle state, and return non-zero if so. | ||
415 | * "Straightforward" approach for the moment, might later | ||
416 | * use callback-list lengths, grace-period duration, or | ||
417 | * some such to determine when to exit idle state. | ||
418 | * Might also need a pre-idle test that does not acquire | ||
419 | * the lock, but let's get the simple case working first... | ||
420 | */ | ||
421 | |||
422 | static int | ||
423 | rcu_try_flip_idle(void) | ||
424 | { | ||
425 | int cpu; | ||
426 | |||
427 | RCU_TRACE_ME(rcupreempt_trace_try_flip_i1); | ||
428 | if (!rcu_pending(smp_processor_id())) { | ||
429 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1); | ||
430 | return 0; | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * Do the flip. | ||
435 | */ | ||
436 | |||
437 | RCU_TRACE_ME(rcupreempt_trace_try_flip_g1); | ||
438 | rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */ | ||
439 | |||
440 | /* | ||
441 | * Need a memory barrier so that other CPUs see the new | ||
442 | * counter value before they see the subsequent change of all | ||
443 | * the rcu_flip_flag instances to rcu_flipped. | ||
444 | */ | ||
445 | |||
446 | smp_mb(); /* see above block comment. */ | ||
447 | |||
448 | /* Now ask each CPU for acknowledgement of the flip. */ | ||
449 | |||
450 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
451 | per_cpu(rcu_flip_flag, cpu) = rcu_flipped; | ||
452 | |||
453 | return 1; | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * Wait for CPUs to acknowledge the flip. | ||
458 | */ | ||
459 | |||
460 | static int | ||
461 | rcu_try_flip_waitack(void) | ||
462 | { | ||
463 | int cpu; | ||
464 | |||
465 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); | ||
466 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
467 | if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { | ||
468 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); | ||
469 | return 0; | ||
470 | } | ||
471 | |||
472 | /* | ||
473 | * Make sure our checks above don't bleed into subsequent | ||
474 | * waiting for the sum of the counters to reach zero. | ||
475 | */ | ||
476 | |||
477 | smp_mb(); /* see above block comment. */ | ||
478 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a2); | ||
479 | return 1; | ||
480 | } | ||
481 | |||
482 | /* | ||
483 | * Wait for collective ``last'' counter to reach zero, | ||
484 | * then tell all CPUs to do an end-of-grace-period memory barrier. | ||
485 | */ | ||
486 | |||
487 | static int | ||
488 | rcu_try_flip_waitzero(void) | ||
489 | { | ||
490 | int cpu; | ||
491 | int lastidx = !(rcu_ctrlblk.completed & 0x1); | ||
492 | int sum = 0; | ||
493 | |||
494 | /* Check to see if the sum of the "last" counters is zero. */ | ||
495 | |||
496 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); | ||
497 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
498 | sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; | ||
499 | if (sum != 0) { | ||
500 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); | ||
501 | return 0; | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * This ensures that the other CPUs see the call for | ||
506 | * memory barriers -after- the sum to zero has been | ||
507 | * detected here | ||
508 | */ | ||
509 | smp_mb(); /* ^^^^^^^^^^^^ */ | ||
510 | |||
511 | /* Call for a memory barrier from each CPU. */ | ||
512 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
513 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; | ||
514 | |||
515 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); | ||
516 | return 1; | ||
517 | } | ||
518 | |||
519 | /* | ||
520 | * Wait for all CPUs to do their end-of-grace-period memory barrier. | ||
521 | * Return 0 once all CPUs have done so. | ||
522 | */ | ||
523 | |||
524 | static int | ||
525 | rcu_try_flip_waitmb(void) | ||
526 | { | ||
527 | int cpu; | ||
528 | |||
529 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); | ||
530 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
531 | if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { | ||
532 | RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); | ||
533 | return 0; | ||
534 | } | ||
535 | |||
536 | smp_mb(); /* Ensure that the above checks precede any following flip. */ | ||
537 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m2); | ||
538 | return 1; | ||
539 | } | ||
540 | |||
541 | /* | ||
542 | * Attempt a single flip of the counters. Remember, a single flip does | ||
543 | * -not- constitute a grace period. Instead, the interval between | ||
544 | * at least GP_STAGES consecutive flips is a grace period. | ||
545 | * | ||
546 | * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation | ||
547 | * on a large SMP, they might want to use a hierarchical organization of | ||
548 | * the per-CPU-counter pairs. | ||
549 | */ | ||
550 | static void rcu_try_flip(void) | ||
551 | { | ||
552 | unsigned long flags; | ||
553 | |||
554 | RCU_TRACE_ME(rcupreempt_trace_try_flip_1); | ||
555 | if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) { | ||
556 | RCU_TRACE_ME(rcupreempt_trace_try_flip_e1); | ||
557 | return; | ||
558 | } | ||
559 | |||
560 | /* | ||
561 | * Take the next transition(s) through the RCU grace-period | ||
562 | * flip-counter state machine. | ||
563 | */ | ||
564 | |||
565 | switch (rcu_ctrlblk.rcu_try_flip_state) { | ||
566 | case rcu_try_flip_idle_state: | ||
567 | if (rcu_try_flip_idle()) | ||
568 | rcu_ctrlblk.rcu_try_flip_state = | ||
569 | rcu_try_flip_waitack_state; | ||
570 | break; | ||
571 | case rcu_try_flip_waitack_state: | ||
572 | if (rcu_try_flip_waitack()) | ||
573 | rcu_ctrlblk.rcu_try_flip_state = | ||
574 | rcu_try_flip_waitzero_state; | ||
575 | break; | ||
576 | case rcu_try_flip_waitzero_state: | ||
577 | if (rcu_try_flip_waitzero()) | ||
578 | rcu_ctrlblk.rcu_try_flip_state = | ||
579 | rcu_try_flip_waitmb_state; | ||
580 | break; | ||
581 | case rcu_try_flip_waitmb_state: | ||
582 | if (rcu_try_flip_waitmb()) | ||
583 | rcu_ctrlblk.rcu_try_flip_state = | ||
584 | rcu_try_flip_idle_state; | ||
585 | } | ||
586 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
587 | } | ||
588 | |||
589 | /* | ||
590 | * Check to see if this CPU needs to do a memory barrier in order to | ||
591 | * ensure that any prior RCU read-side critical sections have committed | ||
592 | * their counter manipulations and critical-section memory references | ||
593 | * before declaring the grace period to be completed. | ||
594 | */ | ||
595 | static void rcu_check_mb(int cpu) | ||
596 | { | ||
597 | if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) { | ||
598 | smp_mb(); /* Ensure RCU read-side accesses are visible. */ | ||
599 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_done; | ||
600 | } | ||
601 | } | ||
602 | |||
603 | void rcu_check_callbacks(int cpu, int user) | ||
604 | { | ||
605 | unsigned long flags; | ||
606 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
607 | |||
608 | rcu_check_mb(cpu); | ||
609 | if (rcu_ctrlblk.completed == rdp->completed) | ||
610 | rcu_try_flip(); | ||
611 | spin_lock_irqsave(&rdp->lock, flags); | ||
612 | RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); | ||
613 | __rcu_advance_callbacks(rdp); | ||
614 | if (rdp->donelist == NULL) { | ||
615 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
616 | } else { | ||
617 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
618 | raise_softirq(RCU_SOFTIRQ); | ||
619 | } | ||
620 | } | ||
621 | |||
622 | /* | ||
623 | * Needed by dynticks, to make sure all RCU processing has finished | ||
624 | * when we go idle: | ||
625 | */ | ||
626 | void rcu_advance_callbacks(int cpu, int user) | ||
627 | { | ||
628 | unsigned long flags; | ||
629 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
630 | |||
631 | if (rcu_ctrlblk.completed == rdp->completed) { | ||
632 | rcu_try_flip(); | ||
633 | if (rcu_ctrlblk.completed == rdp->completed) | ||
634 | return; | ||
635 | } | ||
636 | spin_lock_irqsave(&rdp->lock, flags); | ||
637 | RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); | ||
638 | __rcu_advance_callbacks(rdp); | ||
639 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
640 | } | ||
641 | |||
642 | #ifdef CONFIG_HOTPLUG_CPU | ||
643 | #define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \ | ||
644 | *dsttail = srclist; \ | ||
645 | if (srclist != NULL) { \ | ||
646 | dsttail = srctail; \ | ||
647 | srclist = NULL; \ | ||
648 | srctail = &srclist;\ | ||
649 | } \ | ||
650 | } while (0) | ||
651 | |||
652 | void rcu_offline_cpu(int cpu) | ||
653 | { | ||
654 | int i; | ||
655 | struct rcu_head *list = NULL; | ||
656 | unsigned long flags; | ||
657 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
658 | struct rcu_head **tail = &list; | ||
659 | |||
660 | /* | ||
661 | * Remove all callbacks from the newly dead CPU, retaining order. | ||
662 | * Otherwise rcu_barrier() will fail | ||
663 | */ | ||
664 | |||
665 | spin_lock_irqsave(&rdp->lock, flags); | ||
666 | rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail); | ||
667 | for (i = GP_STAGES - 1; i >= 0; i--) | ||
668 | rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], | ||
669 | list, tail); | ||
670 | rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); | ||
671 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
672 | rdp->waitlistcount = 0; | ||
673 | |||
674 | /* Disengage the newly dead CPU from the grace-period computation. */ | ||
675 | |||
676 | spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); | ||
677 | rcu_check_mb(cpu); | ||
678 | if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { | ||
679 | smp_mb(); /* Subsequent counter accesses must see new value */ | ||
680 | per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; | ||
681 | smp_mb(); /* Subsequent RCU read-side critical sections */ | ||
682 | /* seen -after- acknowledgement. */ | ||
683 | } | ||
684 | |||
685 | RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0]; | ||
686 | RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1]; | ||
687 | |||
688 | RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; | ||
689 | RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; | ||
690 | |||
691 | cpu_clear(cpu, rcu_cpu_online_map); | ||
692 | |||
693 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
694 | |||
695 | /* | ||
696 | * Place the removed callbacks on the current CPU's queue. | ||
697 | * Make them all start a new grace period: simple approach, | ||
698 | * in theory could starve a given set of callbacks, but | ||
699 | * you would need to be doing some serious CPU hotplugging | ||
700 | * to make this happen. If this becomes a problem, adding | ||
701 | * a synchronize_rcu() to the hotplug path would be a simple | ||
702 | * fix. | ||
703 | */ | ||
704 | |||
705 | rdp = RCU_DATA_ME(); | ||
706 | spin_lock_irqsave(&rdp->lock, flags); | ||
707 | *rdp->nexttail = list; | ||
708 | if (list) | ||
709 | rdp->nexttail = tail; | ||
710 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
711 | } | ||
712 | |||
713 | void __devinit rcu_online_cpu(int cpu) | ||
714 | { | ||
715 | unsigned long flags; | ||
716 | |||
717 | spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); | ||
718 | cpu_set(cpu, rcu_cpu_online_map); | ||
719 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
720 | } | ||
721 | |||
722 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
723 | |||
724 | void rcu_offline_cpu(int cpu) | ||
725 | { | ||
726 | } | ||
727 | |||
728 | void __devinit rcu_online_cpu(int cpu) | ||
729 | { | ||
730 | } | ||
731 | |||
732 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | ||
733 | |||
734 | static void rcu_process_callbacks(struct softirq_action *unused) | ||
735 | { | ||
736 | unsigned long flags; | ||
737 | struct rcu_head *next, *list; | ||
738 | struct rcu_data *rdp = RCU_DATA_ME(); | ||
739 | |||
740 | spin_lock_irqsave(&rdp->lock, flags); | ||
741 | list = rdp->donelist; | ||
742 | if (list == NULL) { | ||
743 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
744 | return; | ||
745 | } | ||
746 | rdp->donelist = NULL; | ||
747 | rdp->donetail = &rdp->donelist; | ||
748 | RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp); | ||
749 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
750 | while (list) { | ||
751 | next = list->next; | ||
752 | list->func(list); | ||
753 | list = next; | ||
754 | RCU_TRACE_ME(rcupreempt_trace_invoke); | ||
755 | } | ||
756 | } | ||
757 | |||
758 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
759 | { | ||
760 | unsigned long flags; | ||
761 | struct rcu_data *rdp; | ||
762 | |||
763 | head->func = func; | ||
764 | head->next = NULL; | ||
765 | local_irq_save(flags); | ||
766 | rdp = RCU_DATA_ME(); | ||
767 | spin_lock(&rdp->lock); | ||
768 | __rcu_advance_callbacks(rdp); | ||
769 | *rdp->nexttail = head; | ||
770 | rdp->nexttail = &head->next; | ||
771 | RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); | ||
772 | spin_unlock(&rdp->lock); | ||
773 | local_irq_restore(flags); | ||
774 | } | ||
775 | EXPORT_SYMBOL_GPL(call_rcu); | ||
776 | |||
777 | /* | ||
778 | * Wait until all currently running preempt_disable() code segments | ||
779 | * (including hardware-irq-disable segments) complete. Note that | ||
780 | * in -rt this does -not- necessarily result in all currently executing | ||
781 | * interrupt -handlers- having completed. | ||
782 | */ | ||
783 | void __synchronize_sched(void) | ||
784 | { | ||
785 | cpumask_t oldmask; | ||
786 | int cpu; | ||
787 | |||
788 | if (sched_getaffinity(0, &oldmask) < 0) | ||
789 | oldmask = cpu_possible_map; | ||
790 | for_each_online_cpu(cpu) { | ||
791 | sched_setaffinity(0, cpumask_of_cpu(cpu)); | ||
792 | schedule(); | ||
793 | } | ||
794 | sched_setaffinity(0, oldmask); | ||
795 | } | ||
796 | EXPORT_SYMBOL_GPL(__synchronize_sched); | ||
797 | |||
798 | /* | ||
799 | * Check to see if any future RCU-related work will need to be done | ||
800 | * by the current CPU, even if none need be done immediately, returning | ||
801 | * 1 if so. Assumes that notifiers would take care of handling any | ||
802 | * outstanding requests from the RCU core. | ||
803 | * | ||
804 | * This function is part of the RCU implementation; it is -not- | ||
805 | * an exported member of the RCU API. | ||
806 | */ | ||
807 | int rcu_needs_cpu(int cpu) | ||
808 | { | ||
809 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
810 | |||
811 | return (rdp->donelist != NULL || | ||
812 | !!rdp->waitlistcount || | ||
813 | rdp->nextlist != NULL); | ||
814 | } | ||
815 | |||
816 | int rcu_pending(int cpu) | ||
817 | { | ||
818 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
819 | |||
820 | /* The CPU has at least one callback queued somewhere. */ | ||
821 | |||
822 | if (rdp->donelist != NULL || | ||
823 | !!rdp->waitlistcount || | ||
824 | rdp->nextlist != NULL) | ||
825 | return 1; | ||
826 | |||
827 | /* The RCU core needs an acknowledgement from this CPU. */ | ||
828 | |||
829 | if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) || | ||
830 | (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed)) | ||
831 | return 1; | ||
832 | |||
833 | /* This CPU has fallen behind the global grace-period number. */ | ||
834 | |||
835 | if (rdp->completed != rcu_ctrlblk.completed) | ||
836 | return 1; | ||
837 | |||
838 | /* Nothing needed from this CPU. */ | ||
839 | |||
840 | return 0; | ||
841 | } | ||
842 | |||
843 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
844 | unsigned long action, void *hcpu) | ||
845 | { | ||
846 | long cpu = (long)hcpu; | ||
847 | |||
848 | switch (action) { | ||
849 | case CPU_UP_PREPARE: | ||
850 | case CPU_UP_PREPARE_FROZEN: | ||
851 | rcu_online_cpu(cpu); | ||
852 | break; | ||
853 | case CPU_UP_CANCELED: | ||
854 | case CPU_UP_CANCELED_FROZEN: | ||
855 | case CPU_DEAD: | ||
856 | case CPU_DEAD_FROZEN: | ||
857 | rcu_offline_cpu(cpu); | ||
858 | break; | ||
859 | default: | ||
860 | break; | ||
861 | } | ||
862 | return NOTIFY_OK; | ||
863 | } | ||
864 | |||
865 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
866 | .notifier_call = rcu_cpu_notify, | ||
867 | }; | ||
868 | |||
869 | void __init __rcu_init(void) | ||
870 | { | ||
871 | int cpu; | ||
872 | int i; | ||
873 | struct rcu_data *rdp; | ||
874 | |||
875 | printk(KERN_NOTICE "Preemptible RCU implementation.\n"); | ||
876 | for_each_possible_cpu(cpu) { | ||
877 | rdp = RCU_DATA_CPU(cpu); | ||
878 | spin_lock_init(&rdp->lock); | ||
879 | rdp->completed = 0; | ||
880 | rdp->waitlistcount = 0; | ||
881 | rdp->nextlist = NULL; | ||
882 | rdp->nexttail = &rdp->nextlist; | ||
883 | for (i = 0; i < GP_STAGES; i++) { | ||
884 | rdp->waitlist[i] = NULL; | ||
885 | rdp->waittail[i] = &rdp->waitlist[i]; | ||
886 | } | ||
887 | rdp->donelist = NULL; | ||
888 | rdp->donetail = &rdp->donelist; | ||
889 | rdp->rcu_flipctr[0] = 0; | ||
890 | rdp->rcu_flipctr[1] = 0; | ||
891 | } | ||
892 | register_cpu_notifier(&rcu_nb); | ||
893 | |||
894 | /* | ||
895 | * We don't need protection against CPU-Hotplug here | ||
896 | * since | ||
897 | * a) If a CPU comes online while we are iterating over the | ||
898 | * cpu_online_map below, we would only end up making a | ||
899 | * duplicate call to rcu_online_cpu() which sets the corresponding | ||
900 | * CPU's mask in the rcu_cpu_online_map. | ||
901 | * | ||
902 | * b) A CPU cannot go offline at this point in time since the user | ||
903 | * does not have access to the sysfs interface, nor do we | ||
904 | * suspend the system. | ||
905 | */ | ||
906 | for_each_online_cpu(cpu) | ||
907 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); | ||
908 | |||
909 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); | ||
910 | } | ||
911 | |||
912 | /* | ||
913 | * Deprecated, use synchronize_rcu() or synchronize_sched() instead. | ||
914 | */ | ||
915 | void synchronize_kernel(void) | ||
916 | { | ||
917 | synchronize_rcu(); | ||
918 | } | ||
919 | |||
920 | #ifdef CONFIG_RCU_TRACE | ||
921 | long *rcupreempt_flipctr(int cpu) | ||
922 | { | ||
923 | return &RCU_DATA_CPU(cpu)->rcu_flipctr[0]; | ||
924 | } | ||
925 | EXPORT_SYMBOL_GPL(rcupreempt_flipctr); | ||
926 | |||
927 | int rcupreempt_flip_flag(int cpu) | ||
928 | { | ||
929 | return per_cpu(rcu_flip_flag, cpu); | ||
930 | } | ||
931 | EXPORT_SYMBOL_GPL(rcupreempt_flip_flag); | ||
932 | |||
933 | int rcupreempt_mb_flag(int cpu) | ||
934 | { | ||
935 | return per_cpu(rcu_mb_flag, cpu); | ||
936 | } | ||
937 | EXPORT_SYMBOL_GPL(rcupreempt_mb_flag); | ||
938 | |||
939 | char *rcupreempt_try_flip_state_name(void) | ||
940 | { | ||
941 | return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state]; | ||
942 | } | ||
943 | EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name); | ||
944 | |||
945 | struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu) | ||
946 | { | ||
947 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
948 | |||
949 | return &rdp->trace; | ||
950 | } | ||
951 | EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu); | ||
952 | |||
953 | #endif /* #ifdef RCU_TRACE */ | ||