author		Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2008-01-25 15:08:24 -0500
committer	Ingo Molnar <mingo@elte.hu>			2008-01-25 15:08:24 -0500
commit		e260be673a15b6125068270e0216a3bfbfc12f87 (patch)
tree		f50760606d395bf6faa9e865f814761a3c88d32c /kernel/rcupreempt.c
parent		e0ecfa7917cafe72f4a75f87e8bb5d8d51dc534f (diff)
Preempt-RCU: implementation
This patch implements a new version of RCU which allows its read-side
critical sections to be preempted. It uses a set of counter pairs
to keep track of the read-side critical sections and flips them
once all tasks have exited their read-side critical sections. The
details of this implementation can be found in this paper:

	http://www.rdrop.com/users/paulmck/RCU/OLSrtRCU.2006.08.11a.pdf

and in this article:

	http://lwn.net/Articles/253651/

This patch was developed as part of the -rt kernel effort and is meant
to provide better latencies, since read-side critical sections of RCU
no longer disable preemption. As a consequence of keeping track of RCU
readers, the readers incur a slight overhead (possible optimizations
are discussed in the paper). This implementation co-exists with the
"classic" RCU implementation and can be selected at compile time.

It also includes RCU tracing, summarized in debugfs.
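
For orientation, a minimal usage sketch (the "gp" pointer, "new"/"old",
and use() are hypothetical; the API itself is the standard RCU API from
include/linux/rcupdate.h):

	/* Reader: may now be preempted inside the critical section. */
	rcu_read_lock();
	p = rcu_dereference(gp);
	if (p)
		use(p->data);
	rcu_read_unlock();

	/* Updater: publish the new version, then wait out old readers. */
	old = gp;
	rcu_assign_pointer(gp, new);
	synchronize_rcu();
	kfree(old);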
[ akpm@linux-foundation.org: build fixes on non-preempt architectures ]
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@us.ibm.com>
Reviewed-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/rcupreempt.c')
-rw-r--r--	kernel/rcupreempt.c	816
1 file changed, 816 insertions(+), 0 deletions(-)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
new file mode 100644
index 000000000000..a5aabb1677f8
--- /dev/null
+++ b/kernel/rcupreempt.c
@@ -0,0 +1,816 @@
/*
 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright IBM Corporation, 2006
 *
 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
 *		With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
 *		for pushing me away from locks and towards counters, and
 *		to Suparna Bhattacharya for pushing me completely away
 *		from atomic instructions on the read side.
 *
 * Papers:  http://www.rdrop.com/users/paulmck/RCU
 *
 * Design Document: http://lwn.net/Articles/253651/
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *		Documentation/RCU/ *.txt
 *
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/rcupdate.h>
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <asm/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/random.h>
#include <linux/delay.h>
#include <linux/byteorder/swabb.h>
#include <linux/cpumask.h>
#include <linux/rcupreempt_trace.h>

/*
 * Macro that prevents the compiler from reordering accesses, but does
 * absolutely -nothing- to prevent CPUs from reordering.  This is used
 * only to mediate communication between mainline code and hardware
 * interrupt and NMI handlers.
 */
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
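
/*
 * For example (illustrative sketch only; "flag" is a hypothetical
 * variable shared with an interrupt handler), a polling loop such as
 *
 *	while (ACCESS_ONCE(flag) == 0)
 *		cpu_relax();
 *
 * forces the compiler to reload "flag" on each pass instead of hoisting
 * the load out of the loop.  It does nothing to order the access with
 * respect to other CPUs; that is what smp_mb() and friends are for.
 */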

/*
 * PREEMPT_RCU data structures.
 */

/*
 * GP_STAGES specifies the number of times the state machine has
 * to go through all the rcu_try_flip_states (see below)
 * in a single Grace Period.
 *
 * GP in GP_STAGES stands for Grace Period ;)
 */
#define GP_STAGES 2
struct rcu_data {
	spinlock_t	lock;		/* Protect rcu_data fields. */
	long		completed;	/* Number of last completed batch. */
	int		waitlistcount;
	struct tasklet_struct rcu_tasklet;
	struct rcu_head *nextlist;
	struct rcu_head **nexttail;
	struct rcu_head *waitlist[GP_STAGES];
	struct rcu_head **waittail[GP_STAGES];
	struct rcu_head *donelist;
	struct rcu_head **donetail;
	long		rcu_flipctr[2];
#ifdef CONFIG_RCU_TRACE
	struct rcupreempt_trace trace;
#endif /* #ifdef CONFIG_RCU_TRACE */
};
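
/*
 * Overview of the callback pipeline (illustrative, with GP_STAGES == 2):
 * call_rcu() appends a callback to ->nextlist; each time this CPU
 * notices a counter flip, __rcu_advance_callbacks() shifts the lists
 * one stage:
 *
 *	call_rcu() -> nextlist -> waitlist[0] -> waitlist[1] -> donelist
 *
 * and rcu_process_callbacks() finally invokes everything on ->donelist.
 */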

/*
 * States for rcu_try_flip() and friends.
 */

enum rcu_try_flip_states {

	/*
	 * Stay here if nothing is happening. Flip the counter if something
	 * starts happening. Denoted by "I"
	 */
	rcu_try_flip_idle_state,

	/*
	 * Wait here for all CPUs to notice that the counter has flipped. This
	 * prevents the old set of counters from ever being incremented once
	 * we leave this state, which in turn is necessary because we cannot
	 * test any individual counter for zero -- we can only check the sum.
	 * Denoted by "A".
	 */
	rcu_try_flip_waitack_state,

	/*
	 * Wait here for the sum of the old per-CPU counters to reach zero.
	 * Denoted by "Z".
	 */
	rcu_try_flip_waitzero_state,

	/*
	 * Wait here for each of the other CPUs to execute a memory barrier.
	 * This is necessary to ensure that these other CPUs really have
	 * completed executing their RCU read-side critical sections, despite
	 * their CPUs wildly reordering memory. Denoted by "M".
	 */
	rcu_try_flip_waitmb_state,
};
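
/*
 * In other words, the state machine cycles as follows (summary sketch):
 *
 *	"I" idle -> "A" waitack -> "Z" waitzero -> "M" waitmb -> "I" ...
 *
 * Each complete pass through this cycle is one counter flip; a grace
 * period requires GP_STAGES such passes, as noted above.
 */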

struct rcu_ctrlblk {
	spinlock_t	fliplock;	/* Protect state-machine transitions. */
	long		completed;	/* Number of last completed batch. */
	enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
							the rcu state machine */
};

static DEFINE_PER_CPU(struct rcu_data, rcu_data);
static struct rcu_ctrlblk rcu_ctrlblk = {
	.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
	.completed = 0,
	.rcu_try_flip_state = rcu_try_flip_idle_state,
};


#ifdef CONFIG_RCU_TRACE
static char *rcu_try_flip_state_names[] =
	{ "idle", "waitack", "waitzero", "waitmb" };
#endif /* #ifdef CONFIG_RCU_TRACE */

/*
 * Enum and per-CPU flag to determine when each CPU has seen
 * the most recent counter flip.
 */

enum rcu_flip_flag_values {
	rcu_flip_seen,		/* Steady/initial state, last flip seen. */
				/* Only GP detector can update. */
	rcu_flipped		/* Flip just completed, need confirmation. */
				/* Only corresponding CPU can update. */
};
static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
								= rcu_flip_seen;

/*
 * Enum and per-CPU flag to determine when each CPU has executed the
 * needed memory barrier to fence in memory references from its last RCU
 * read-side critical section in the just-completed grace period.
 */

enum rcu_mb_flag_values {
	rcu_mb_done,		/* Steady/initial state, no mb()s required. */
				/* Only GP detector can update. */
	rcu_mb_needed		/* Flip just completed, need an mb(). */
				/* Only corresponding CPU can update. */
};
static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
								= rcu_mb_done;

/*
 * RCU_DATA_ME: find the current CPU's rcu_data structure.
 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
 */
#define RCU_DATA_ME()		(&__get_cpu_var(rcu_data))
#define RCU_DATA_CPU(cpu)	(&per_cpu(rcu_data, cpu))

/*
 * Helper macro for tracing when the appropriate rcu_data is not
 * cached in a local variable, but where the CPU number is so cached.
 */
#define RCU_TRACE_CPU(f, cpu)	RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));

/*
 * Helper macro for tracing when the appropriate rcu_data is not
 * cached in a local variable.
 */
#define RCU_TRACE_ME(f)		RCU_TRACE(f, &(RCU_DATA_ME()->trace));

/*
 * Helper macro for tracing when the appropriate rcu_data is pointed
 * to by a local variable.
 */
#define RCU_TRACE_RDP(f, rdp)	RCU_TRACE(f, &((rdp)->trace));

/*
 * Return the number of RCU batches processed thus far.  Useful
 * for debug and statistics.
 */
long rcu_batches_completed(void)
{
	return rcu_ctrlblk.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed);

EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);

void __rcu_read_lock(void)
{
	int idx;
	struct task_struct *t = current;
	int nesting;

	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
	if (nesting != 0) {

		/* An earlier rcu_read_lock() covers us, just count it. */

		t->rcu_read_lock_nesting = nesting + 1;

	} else {
		unsigned long flags;

		/*
		 * We disable interrupts for the following reasons:
		 * - If we get scheduling clock interrupt here, and we
		 *   end up acking the counter flip, it's like a promise
		 *   that we will never increment the old counter again.
		 *   Thus we will break that promise if that
		 *   scheduling clock interrupt happens between the time
		 *   we pick the .completed field and the time that we
		 *   increment our counter.
		 *
		 * - We don't want to be preempted out here.
		 *
		 * NMIs can still occur, of course, and might themselves
		 * contain rcu_read_lock().
		 */

		local_irq_save(flags);

		/*
		 * Outermost nesting of rcu_read_lock(), so increment
		 * the current counter for the current CPU.  Use volatile
		 * casts to prevent the compiler from reordering.
		 */

		idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;

		/*
		 * Now that the per-CPU counter has been incremented, we
		 * are protected from races with rcu_read_lock() invoked
		 * from NMI handlers on this CPU.  We can therefore safely
		 * increment the nesting counter, relieving further NMIs
		 * of the need to increment the per-CPU counter.
		 */

		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;

		/*
		 * Now that we have prevented any NMIs from storing
		 * to the ->rcu_flipctr_idx, we can safely use it to
		 * remember which counter to decrement in the matching
		 * rcu_read_unlock().
		 */

		ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL_GPL(__rcu_read_lock);
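
/*
 * Worked example (illustrative): with rcu_ctrlblk.completed == 5, an
 * outermost rcu_read_lock() on CPU 2 increments
 * per_cpu(rcu_data, 2).rcu_flipctr[1] (because 5 & 0x1 == 1), records
 * idx = 1 in ->rcu_flipctr_idx, and sets ->rcu_read_lock_nesting to 1.
 * A nested rcu_read_lock() in the same task merely bumps the nesting
 * count to 2; the per-CPU counters are untouched until the outermost
 * rcu_read_unlock() runs.
 */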

void __rcu_read_unlock(void)
{
	int idx;
	struct task_struct *t = current;
	int nesting;

	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
	if (nesting > 1) {

		/*
		 * We are still protected by the enclosing rcu_read_lock(),
		 * so simply decrement the counter.
		 */

		t->rcu_read_lock_nesting = nesting - 1;

	} else {
		unsigned long flags;

		/*
		 * Disable local interrupts to prevent the grace-period
		 * detection state machine from seeing us half-done.
		 * NMIs can still occur, of course, and might themselves
		 * contain rcu_read_lock() and rcu_read_unlock().
		 */

		local_irq_save(flags);

		/*
		 * Outermost nesting of rcu_read_unlock(), so we must
		 * decrement the current counter for the current CPU.
		 * This must be done carefully, because NMIs can
		 * occur at any point in this code, and any rcu_read_lock()
		 * and rcu_read_unlock() pairs in the NMI handlers
		 * must interact non-destructively with this code.
		 * Lots of volatile casts, and -very- careful ordering.
		 *
		 * Changes to this code, including this one, must be
		 * inspected, validated, and tested extremely carefully!!!
		 */

		/*
		 * First, pick up the index.
		 */

		idx = ACCESS_ONCE(t->rcu_flipctr_idx);

		/*
		 * Now that we have fetched the counter index, it is
		 * safe to decrement the per-task RCU nesting counter.
		 * After this, any interrupts or NMIs will increment and
		 * decrement the per-CPU counters.
		 */
		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;

		/*
		 * It is now safe to decrement this task's nesting count.
		 * NMIs that occur after this statement will route their
		 * rcu_read_lock() calls through this "else" clause, and
		 * will thus start incrementing the per-CPU counter on
		 * their own.  They will also clobber ->rcu_flipctr_idx,
		 * but that is OK, since we have already fetched it.
		 */

		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
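
/*
 * Note (illustrative): because the task may have been preempted and
 * migrated inside its critical section, this decrement can land on a
 * different CPU than the matching increment in __rcu_read_lock().  A
 * given CPU's rcu_flipctr[idx] can therefore go negative; only the sum
 * across all CPUs is meaningful, which is exactly what
 * rcu_try_flip_waitzero() checks.
 */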

/*
 * If a global counter flip has occurred since the last time that we
 * advanced callbacks, advance them.  Hardware interrupts must be
 * disabled when calling this function.
 */
static void __rcu_advance_callbacks(struct rcu_data *rdp)
{
	int cpu;
	int i;
	int wlc = 0;

	if (rdp->completed != rcu_ctrlblk.completed) {
		if (rdp->waitlist[GP_STAGES - 1] != NULL) {
			*rdp->donetail = rdp->waitlist[GP_STAGES - 1];
			rdp->donetail = rdp->waittail[GP_STAGES - 1];
			RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
		}
		for (i = GP_STAGES - 2; i >= 0; i--) {
			if (rdp->waitlist[i] != NULL) {
				rdp->waitlist[i + 1] = rdp->waitlist[i];
				rdp->waittail[i + 1] = rdp->waittail[i];
				wlc++;
			} else {
				rdp->waitlist[i + 1] = NULL;
				rdp->waittail[i + 1] =
					&rdp->waitlist[i + 1];
			}
		}
		if (rdp->nextlist != NULL) {
			rdp->waitlist[0] = rdp->nextlist;
			rdp->waittail[0] = rdp->nexttail;
			wlc++;
			rdp->nextlist = NULL;
			rdp->nexttail = &rdp->nextlist;
			RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
		} else {
			rdp->waitlist[0] = NULL;
			rdp->waittail[0] = &rdp->waitlist[0];
		}
		rdp->waitlistcount = wlc;
		rdp->completed = rcu_ctrlblk.completed;
	}

	/*
	 * Check to see if this CPU needs to report that it has seen
	 * the most recent counter flip, thereby declaring that all
	 * subsequent rcu_read_lock() invocations will respect this flip.
	 */

	cpu = raw_smp_processor_id();
	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
		smp_mb();  /* Subsequent counter accesses must see new value */
		per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
		smp_mb();  /* Subsequent RCU read-side critical sections */
			   /*  seen -after- acknowledgement. */
	}
}
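
/*
 * Timeline for a single callback (illustrative, with GP_STAGES == 2):
 * a callback queued by call_rcu() sits on ->nextlist; after the first
 * flip this CPU observes, it moves to ->waitlist[0]; after the second,
 * to ->waitlist[1]; after the third, to ->donelist, where
 * rcu_process_callbacks() finally invokes it.  Each callback therefore
 * waits through GP_STAGES + 1 observed flips before invocation.
 */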

/*
 * Get here when RCU is idle.  Decide whether we need to
 * move out of idle state, and return non-zero if so.
 * "Straightforward" approach for the moment, might later
 * use callback-list lengths, grace-period duration, or
 * some such to determine when to exit idle state.
 * Might also need a pre-idle test that does not acquire
 * the lock, but let's get the simple case working first...
 */

static int
rcu_try_flip_idle(void)
{
	int cpu;

	RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
	if (!rcu_pending(smp_processor_id())) {
		RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
		return 0;
	}

	/*
	 * Do the flip.
	 */

	RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
	rcu_ctrlblk.completed++;  /* stands in for rcu_try_flip_g2 */

	/*
	 * Need a memory barrier so that other CPUs see the new
	 * counter value before they see the subsequent change of all
	 * the rcu_flip_flag instances to rcu_flipped.
	 */

	smp_mb();	/* see above block comment. */

	/* Now ask each CPU for acknowledgement of the flip. */

	for_each_possible_cpu(cpu)
		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;

	return 1;
}

/*
 * Wait for CPUs to acknowledge the flip.
 */

static int
rcu_try_flip_waitack(void)
{
	int cpu;

	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
	for_each_possible_cpu(cpu)
		if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
			return 0;
		}

	/*
	 * Make sure our checks above don't bleed into subsequent
	 * waiting for the sum of the counters to reach zero.
	 */

	smp_mb();	/* see above block comment. */
	RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
	return 1;
}

/*
 * Wait for collective ``last'' counter to reach zero,
 * then tell all CPUs to do an end-of-grace-period memory barrier.
 */

static int
rcu_try_flip_waitzero(void)
{
	int cpu;
	int lastidx = !(rcu_ctrlblk.completed & 0x1);
	int sum = 0;

	/* Check to see if the sum of the "last" counters is zero. */

	RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
	for_each_possible_cpu(cpu)
		sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
	if (sum != 0) {
		RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
		return 0;
	}

	/*
	 * This ensures that the other CPUs see the call for
	 * memory barriers -after- the sum to zero has been
	 * detected here
	 */
	smp_mb();  /* ^^^^^^^^^^^^ */

	/* Call for a memory barrier from each CPU. */
	for_each_possible_cpu(cpu)
		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;

	RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
	return 1;
}
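
/*
 * Why the sum converges (illustrative): once every CPU has acknowledged
 * the flip, new readers increment only rcu_flipctr[!lastidx], so
 * rcu_flipctr[lastidx] receives only the decrements from pre-flip
 * readers finishing up.  Individual per-CPU entries may be negative
 * (a reader that migrated decrements on its new CPU), but the sum
 * counts exactly the outstanding pre-flip readers and must reach zero.
 */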

/*
 * Wait for all CPUs to do their end-of-grace-period memory barrier.
 * Return 1 once all CPUs have done so.
 */

static int
rcu_try_flip_waitmb(void)
{
	int cpu;

	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
	for_each_possible_cpu(cpu)
		if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
			return 0;
		}

	smp_mb();  /* Ensure that the above checks precede any following flip. */
	RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
	return 1;
}

/*
 * Attempt a single flip of the counters.  Remember, a single flip does
 * -not- constitute a grace period.  Instead, the interval between
 * at least GP_STAGES consecutive flips is a grace period.
 *
 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
 * on a large SMP, they might want to use a hierarchical organization of
 * the per-CPU-counter pairs.
 */
static void rcu_try_flip(void)
{
	unsigned long flags;

	RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
	if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
		RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
		return;
	}

	/*
	 * Take the next transition(s) through the RCU grace-period
	 * flip-counter state machine.
	 */

	switch (rcu_ctrlblk.rcu_try_flip_state) {
	case rcu_try_flip_idle_state:
		if (rcu_try_flip_idle())
			rcu_ctrlblk.rcu_try_flip_state =
				rcu_try_flip_waitack_state;
		break;
	case rcu_try_flip_waitack_state:
		if (rcu_try_flip_waitack())
			rcu_ctrlblk.rcu_try_flip_state =
				rcu_try_flip_waitzero_state;
		break;
	case rcu_try_flip_waitzero_state:
		if (rcu_try_flip_waitzero())
			rcu_ctrlblk.rcu_try_flip_state =
				rcu_try_flip_waitmb_state;
		break;
	case rcu_try_flip_waitmb_state:
		if (rcu_try_flip_waitmb())
			rcu_ctrlblk.rcu_try_flip_state =
				rcu_try_flip_idle_state;
	}
	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
}

/*
 * Check to see if this CPU needs to do a memory barrier in order to
 * ensure that any prior RCU read-side critical sections have committed
 * their counter manipulations and critical-section memory references
 * before declaring the grace period to be completed.
 */
static void rcu_check_mb(int cpu)
{
	if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
		smp_mb();  /* Ensure RCU read-side accesses are visible. */
		per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
	}
}

void rcu_check_callbacks(int cpu, int user)
{
	unsigned long flags;
	struct rcu_data *rdp = RCU_DATA_CPU(cpu);

	rcu_check_mb(cpu);
	if (rcu_ctrlblk.completed == rdp->completed)
		rcu_try_flip();
	spin_lock_irqsave(&rdp->lock, flags);
	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
	__rcu_advance_callbacks(rdp);
	if (rdp->donelist == NULL) {
		spin_unlock_irqrestore(&rdp->lock, flags);
	} else {
		spin_unlock_irqrestore(&rdp->lock, flags);
		raise_softirq(RCU_SOFTIRQ);
	}
}

/*
 * Needed by dynticks, to make sure all RCU processing has finished
 * when we go idle:
 */
void rcu_advance_callbacks(int cpu, int user)
{
	unsigned long flags;
	struct rcu_data *rdp = RCU_DATA_CPU(cpu);

	if (rcu_ctrlblk.completed == rdp->completed) {
		rcu_try_flip();
		if (rcu_ctrlblk.completed == rdp->completed)
			return;
	}
	spin_lock_irqsave(&rdp->lock, flags);
	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
	__rcu_advance_callbacks(rdp);
	spin_unlock_irqrestore(&rdp->lock, flags);
}

static void rcu_process_callbacks(struct softirq_action *unused)
{
	unsigned long flags;
	struct rcu_head *next, *list;
	struct rcu_data *rdp = RCU_DATA_ME();

	spin_lock_irqsave(&rdp->lock, flags);
	list = rdp->donelist;
	if (list == NULL) {
		spin_unlock_irqrestore(&rdp->lock, flags);
		return;
	}
	rdp->donelist = NULL;
	rdp->donetail = &rdp->donelist;
	RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
	spin_unlock_irqrestore(&rdp->lock, flags);
	while (list) {
		next = list->next;
		list->func(list);
		list = next;
		RCU_TRACE_ME(rcupreempt_trace_invoke);
	}
}

void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
	unsigned long flags;
	struct rcu_data *rdp;

	head->func = func;
	head->next = NULL;
	local_irq_save(flags);
	rdp = RCU_DATA_ME();
	spin_lock(&rdp->lock);
	__rcu_advance_callbacks(rdp);
	*rdp->nexttail = head;
	rdp->nexttail = &head->next;
	RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
	spin_unlock(&rdp->lock);
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(call_rcu);
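
/*
 * Typical call_rcu() usage (illustrative sketch; "struct foo" and
 * foo_reclaim() are hypothetical):
 *
 *	struct foo {
 *		int data;
 *		struct rcu_head rcu;
 *	};
 *
 *	static void foo_reclaim(struct rcu_head *head)
 *	{
 *		kfree(container_of(head, struct foo, rcu));
 *	}
 *
 *	...
 *	call_rcu(&old_foo->rcu, foo_reclaim);
 *
 * foo_reclaim() then runs from RCU_SOFTIRQ context once a full grace
 * period has elapsed, after all pre-existing readers have finished.
 */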

/*
 * Wait until all currently running preempt_disable() code segments
 * (including hardware-irq-disable segments) complete.  Note that
 * in -rt this does -not- necessarily result in all currently executing
 * interrupt -handlers- having completed.  This works by migrating the
 * current task onto each online CPU in turn: by the time it has run on
 * a given CPU, any preempt_disable() segment that was executing there
 * when we started must have completed.
 */
void __synchronize_sched(void)
{
	cpumask_t oldmask;
	int cpu;

	if (sched_getaffinity(0, &oldmask) < 0)
		oldmask = cpu_possible_map;
	for_each_online_cpu(cpu) {
		sched_setaffinity(0, cpumask_of_cpu(cpu));
		schedule();
	}
	sched_setaffinity(0, oldmask);
}
EXPORT_SYMBOL_GPL(__synchronize_sched);

/*
 * Check to see if any future RCU-related work will need to be done
 * by the current CPU, even if none need be done immediately, returning
 * 1 if so.  Assumes that notifiers would take care of handling any
 * outstanding requests from the RCU core.
 *
 * This function is part of the RCU implementation; it is -not-
 * an exported member of the RCU API.
 */
int rcu_needs_cpu(int cpu)
{
	struct rcu_data *rdp = RCU_DATA_CPU(cpu);

	return (rdp->donelist != NULL ||
		!!rdp->waitlistcount ||
		rdp->nextlist != NULL);
}

int rcu_pending(int cpu)
{
	struct rcu_data *rdp = RCU_DATA_CPU(cpu);

	/* The CPU has at least one callback queued somewhere. */

	if (rdp->donelist != NULL ||
	    !!rdp->waitlistcount ||
	    rdp->nextlist != NULL)
		return 1;

	/* The RCU core needs an acknowledgement from this CPU. */

	if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
	    (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
		return 1;

	/* This CPU has fallen behind the global grace-period number. */

	if (rdp->completed != rcu_ctrlblk.completed)
		return 1;

	/* Nothing needed from this CPU. */

	return 0;
}

void __init __rcu_init(void)
{
	int cpu;
	int i;
	struct rcu_data *rdp;

	printk(KERN_NOTICE "Preemptible RCU implementation.\n");
	for_each_possible_cpu(cpu) {
		rdp = RCU_DATA_CPU(cpu);
		spin_lock_init(&rdp->lock);
		rdp->completed = 0;
		rdp->waitlistcount = 0;
		rdp->nextlist = NULL;
		rdp->nexttail = &rdp->nextlist;
		for (i = 0; i < GP_STAGES; i++) {
			rdp->waitlist[i] = NULL;
			rdp->waittail[i] = &rdp->waitlist[i];
		}
		rdp->donelist = NULL;
		rdp->donetail = &rdp->donelist;
		rdp->rcu_flipctr[0] = 0;
		rdp->rcu_flipctr[1] = 0;
	}
	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
}

/*
 * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
 */
void synchronize_kernel(void)
{
	synchronize_rcu();
}

#ifdef CONFIG_RCU_TRACE
long *rcupreempt_flipctr(int cpu)
{
	return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
}
EXPORT_SYMBOL_GPL(rcupreempt_flipctr);

int rcupreempt_flip_flag(int cpu)
{
	return per_cpu(rcu_flip_flag, cpu);
}
EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);

int rcupreempt_mb_flag(int cpu)
{
	return per_cpu(rcu_mb_flag, cpu);
}
EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);

char *rcupreempt_try_flip_state_name(void)
{
	return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
}
EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);

struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
{
	struct rcu_data *rdp = RCU_DATA_CPU(cpu);

	return &rdp->trace;
}
EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);

#endif /* #ifdef CONFIG_RCU_TRACE */