path: root/kernel
author		Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2010-09-07 13:38:22 -0400
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2011-05-06 02:16:54 -0400
commit		e59fb3120becfb36b22ddb8bd27d065d3cdca499 (patch)
tree		37eaadfe112b64caae943fc7469274bc96553d92 /kernel
parent		a00e0d714fbded07a7a2254391ce9ed5a5cb9d82 (diff)
rcu: Decrease memory-barrier usage based on semi-formal proof
Commit d09b62d fixed grace-period synchronization, but left some smp_mb()
invocations in rcu_process_callbacks() that are no longer needed, but
sheer paranoia prevented them from being removed.  This commit removes
them and provides a proof of correctness in their absence.  It also adds
a memory barrier to rcu_report_qs_rsp() immediately before the update to
rsp->completed in order to handle the theoretical possibility that the
compiler or CPU might move massive quantities of code into a lock-based
critical section.  This also proves that the sheer paranoia was not
entirely unjustified, at least from a theoretical point of view.

In addition, the old dyntick-idle synchronization depended on the fact
that grace periods were many milliseconds in duration, so that it could
be assumed that no dyntick-idle CPU could reorder a memory reference
across an entire grace period.  Unfortunately for this design, the
addition of expedited grace periods breaks this assumption, which has
the unfortunate side-effect of requiring atomic operations in the
functions that track dyntick-idle state for RCU.  (There is some hope
that the algorithms used in user-level RCU might be applied here, but
some work is required to handle the NMIs that user-space applications
can happily ignore.  For the short term, better safe than sorry.)

This proof assumes that neither compiler nor CPU will allow a lock
acquisition and release to be reordered, as doing so can result in
deadlock.  The proof is as follows:

1.	A given CPU declares a quiescent state under the protection of
	its leaf rcu_node's lock.

2.	If there is more than one level of rcu_node hierarchy, the last
	CPU to declare a quiescent state will also acquire the ->lock of
	the next rcu_node up in the hierarchy, but only after releasing
	the lower level's lock.  The acquisition of this lock clearly
	cannot occur prior to the acquisition of the leaf node's lock.

3.	Step 2 repeats until we reach the root rcu_node structure.
	Please note again that only one lock is held at a time through
	this process.  The acquisition of the root rcu_node's ->lock
	must occur after the release of that of the leaf rcu_node.

4.	At this point, we set the ->completed field in the rcu_state
	structure in rcu_report_qs_rsp().  However, if the rcu_node
	hierarchy contains only one rcu_node, then in theory the code
	preceding the quiescent state could leak into the critical
	section.  We therefore precede the update of ->completed with a
	memory barrier.  All CPUs will therefore agree that any updates
	preceding any report of a quiescent state will have happened
	before the update of ->completed.

5.	Regardless of whether a new grace period is needed, rcu_start_gp()
	will propagate the new value of ->completed to all of the leaf
	rcu_node structures, under the protection of each rcu_node's
	->lock.  If a new grace period is needed immediately, this
	propagation will occur in the same critical section that
	->completed was set in, but courtesy of the memory barrier in
	#4 above, is still seen to follow any pre-quiescent-state
	activity.

6.	When a given CPU invokes __rcu_process_gp_end(), it becomes
	aware of the end of the old grace period and therefore makes
	any RCU callbacks that were waiting on that grace period
	eligible for invocation.

	If this CPU is the same one that detected the end of the grace
	period, and if there is but a single rcu_node in the hierarchy,
	we will still be in the single critical section.  In this case,
	the memory barrier in step #4 guarantees that all callbacks will
	be seen to execute after each CPU's quiescent state.

	On the other hand, if this is a different CPU, it will acquire
	the leaf rcu_node's ->lock, and will again be serialized after
	each CPU's quiescent state for the old grace period.

On the strength of this proof, this commit therefore removes the memory
barriers from rcu_process_callbacks() and adds one to rcu_report_qs_rsp().
The effect is to reduce the number of memory barriers by one and to
reduce the frequency of execution from about once per scheduling tick
per CPU to once per grace period.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
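For readers who want the counter protocol in isolation: the following is a
minimal user-space sketch (an illustration only, not kernel code) of how an
even/odd atomic counter lets one CPU judge another CPU's dyntick-idle
passage.  The watcher snapshots the counter, then later declares a quiescent
state if the counter is even (idle now) or has advanced by at least two
since the snapshot (idle was visited).  C11 atomics stand in for the
kernel's atomic_t and smp_mb__{before,after}_atomic_inc(); all names here
(dynticks_sketch, enter_idle, exit_idle, snapshot, in_quiescent_state) are
made up for this example.

/* Illustration only -- NOT kernel code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct dynticks_sketch {
	atomic_uint dynticks;	/* Even: dyntick-idle; odd: non-idle. */
};

/* "CPU" enters dyntick-idle: counter becomes even.  The seq_cst RMW
 * orders prior read-side critical sections before the transition. */
static void enter_idle(struct dynticks_sketch *d)
{
	atomic_fetch_add_explicit(&d->dynticks, 1, memory_order_seq_cst);
}

/* "CPU" exits dyntick-idle: counter becomes odd; later read-side
 * critical sections are ordered after the transition. */
static void exit_idle(struct dynticks_sketch *d)
{
	atomic_fetch_add_explicit(&d->dynticks, 1, memory_order_seq_cst);
}

/* Remote snapshot, analogous to dyntick_save_progress_counter(). */
static unsigned int snapshot(struct dynticks_sketch *d)
{
	return atomic_load_explicit(&d->dynticks, memory_order_seq_cst);
}

/*
 * Later recheck, analogous to rcu_implicit_dynticks_qs(): quiescent if
 * idle now (even counter) or if the counter advanced by at least 2 since
 * the snapshot.  The signed-difference test models the kernel's
 * wraparound-safe ULONG_CMP_GE(curr, snap + 2).
 */
static bool in_quiescent_state(struct dynticks_sketch *d, unsigned int snap)
{
	unsigned int curr = atomic_load_explicit(&d->dynticks,
						 memory_order_seq_cst);

	return (curr & 0x1) == 0 || (int)(curr - snap) >= 2;
}

int main(void)
{
	struct dynticks_sketch d;
	unsigned int snap;

	atomic_init(&d.dynticks, 1);	/* Odd: the "CPU" starts out busy. */
	snap = snapshot(&d);
	enter_idle(&d);			/* Counter now even (idle). */
	exit_idle(&d);			/* Busy again, but idle was visited. */
	printf("quiescent: %d\n", in_quiescent_state(&d, snap));  /* 1 */
	return 0;
}

The diff below plays the same game with atomic_inc()/atomic_add_return(0, ...)
plus explicit smp_mb__before/after_atomic_inc() barriers, and folds NMI
handling into the same counter by tracking NMI nesting separately.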
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/rcutree.c	130
-rw-r--r--	kernel/rcutree.h	9
-rw-r--r--	kernel/rcutree_plugin.h	7
-rw-r--r--	kernel/rcutree_trace.c	12
4 files changed, 68 insertions, 90 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 18f7a593d4c7..90104a19c564 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -128,7 +128,7 @@ void rcu_note_context_switch(int cpu)
 #ifdef CONFIG_NO_HZ
 DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 	.dynticks_nesting = 1,
-	.dynticks = 1,
+	.dynticks = ATOMIC_INIT(1),
 };
 #endif /* #ifdef CONFIG_NO_HZ */
 
@@ -262,13 +262,25 @@ void rcu_enter_nohz(void)
 	unsigned long flags;
 	struct rcu_dynticks *rdtp;
 
-	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
 	local_irq_save(flags);
 	rdtp = &__get_cpu_var(rcu_dynticks);
-	rdtp->dynticks++;
-	rdtp->dynticks_nesting--;
-	WARN_ON_ONCE(rdtp->dynticks & 0x1);
+	if (--rdtp->dynticks_nesting) {
+		local_irq_restore(flags);
+		return;
+	}
+	/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+	smp_mb__before_atomic_inc();  /* See above. */
+	atomic_inc(&rdtp->dynticks);
+	smp_mb__after_atomic_inc();  /* Force ordering with next sojourn. */
+	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
 	local_irq_restore(flags);
+
+	/* If the interrupt queued a callback, get out of dyntick mode. */
+	if (in_irq() &&
+	    (__get_cpu_var(rcu_sched_data).nxtlist ||
+	     __get_cpu_var(rcu_bh_data).nxtlist ||
+	     rcu_preempt_needs_cpu(smp_processor_id())))
+		set_need_resched();
 }
 
 /*
@@ -284,11 +296,16 @@ void rcu_exit_nohz(void)
 
 	local_irq_save(flags);
 	rdtp = &__get_cpu_var(rcu_dynticks);
-	rdtp->dynticks++;
-	rdtp->dynticks_nesting++;
-	WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
+	if (rdtp->dynticks_nesting++) {
+		local_irq_restore(flags);
+		return;
+	}
+	smp_mb__before_atomic_inc();  /* Force ordering w/previous sojourn. */
+	atomic_inc(&rdtp->dynticks);
+	/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+	smp_mb__after_atomic_inc();  /* See above. */
+	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
 	local_irq_restore(flags);
-	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
 }
 
 /**
@@ -302,11 +319,15 @@ void rcu_nmi_enter(void)
 {
 	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
 
-	if (rdtp->dynticks & 0x1)
+	if (rdtp->dynticks_nmi_nesting == 0 &&
+	    (atomic_read(&rdtp->dynticks) & 0x1))
 		return;
-	rdtp->dynticks_nmi++;
-	WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
-	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+	rdtp->dynticks_nmi_nesting++;
+	smp_mb__before_atomic_inc();  /* Force delay from prior write. */
+	atomic_inc(&rdtp->dynticks);
+	/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+	smp_mb__after_atomic_inc();  /* See above. */
+	WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
 }
 
 /**
@@ -320,11 +341,14 @@ void rcu_nmi_exit(void)
 {
 	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
 
-	if (rdtp->dynticks & 0x1)
+	if (rdtp->dynticks_nmi_nesting == 0 ||
+	    --rdtp->dynticks_nmi_nesting != 0)
 		return;
-	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
-	rdtp->dynticks_nmi++;
-	WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
+	/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+	smp_mb__before_atomic_inc();  /* See above. */
+	atomic_inc(&rdtp->dynticks);
+	smp_mb__after_atomic_inc();  /* Force delay to next write. */
+	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
 }
 
 /**
@@ -335,13 +359,7 @@ void rcu_nmi_exit(void)
  */
 void rcu_irq_enter(void)
 {
-	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
-
-	if (rdtp->dynticks_nesting++)
-		return;
-	rdtp->dynticks++;
-	WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
-	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+	rcu_exit_nohz();
 }
 
 /**
@@ -353,18 +371,7 @@ void rcu_irq_enter(void)
  */
 void rcu_irq_exit(void)
 {
-	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
-
-	if (--rdtp->dynticks_nesting)
-		return;
-	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
-	rdtp->dynticks++;
-	WARN_ON_ONCE(rdtp->dynticks & 0x1);
-
-	/* If the interrupt queued a callback, get out of dyntick mode. */
-	if (__this_cpu_read(rcu_sched_data.nxtlist) ||
-	    __this_cpu_read(rcu_bh_data.nxtlist))
-		set_need_resched();
+	rcu_enter_nohz();
 }
 
 #ifdef CONFIG_SMP
@@ -376,19 +383,8 @@ void rcu_irq_exit(void)
  */
 static int dyntick_save_progress_counter(struct rcu_data *rdp)
 {
-	int ret;
-	int snap;
-	int snap_nmi;
-
-	snap = rdp->dynticks->dynticks;
-	snap_nmi = rdp->dynticks->dynticks_nmi;
-	smp_mb();  /* Order sampling of snap with end of grace period. */
-	rdp->dynticks_snap = snap;
-	rdp->dynticks_nmi_snap = snap_nmi;
-	ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
-	if (ret)
-		rdp->dynticks_fqs++;
-	return ret;
+	rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+	return 0;
 }
 
 /*
@@ -399,16 +395,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
  */
 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 {
-	long curr;
-	long curr_nmi;
-	long snap;
-	long snap_nmi;
+	unsigned long curr;
+	unsigned long snap;
 
-	curr = rdp->dynticks->dynticks;
-	snap = rdp->dynticks_snap;
-	curr_nmi = rdp->dynticks->dynticks_nmi;
-	snap_nmi = rdp->dynticks_nmi_snap;
-	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+	curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
+	snap = (unsigned long)rdp->dynticks_snap;
 
 	/*
 	 * If the CPU passed through or entered a dynticks idle phase with
@@ -418,8 +409,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	 * read-side critical section that started before the beginning
 	 * of the current RCU grace period.
 	 */
-	if ((curr != snap || (curr & 0x1) == 0) &&
-	    (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
+	if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
 		rdp->dynticks_fqs++;
 		return 1;
 	}
@@ -841,6 +831,12 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
 	__releases(rcu_get_root(rsp)->lock)
 {
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+
+	/*
+	 * Ensure that all grace-period and pre-grace-period activity
+	 * is seen before the assignment to rsp->completed.
+	 */
+	smp_mb(); /* See above block comment. */
 	rsp->completed = rsp->gpnum;
 	rsp->signaled = RCU_GP_IDLE;
 	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
@@ -1367,25 +1363,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
  */
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
-	/*
-	 * Memory references from any prior RCU read-side critical sections
-	 * executed by the interrupted code must be seen before any RCU
-	 * grace-period manipulations below.
-	 */
-	smp_mb(); /* See above block comment. */
-
 	__rcu_process_callbacks(&rcu_sched_state,
 				&__get_cpu_var(rcu_sched_data));
 	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
 	rcu_preempt_process_callbacks();
 
-	/*
-	 * Memory references from any later RCU read-side critical sections
-	 * executed by the interrupted code must be seen after any RCU
-	 * grace-period manipulations above.
-	 */
-	smp_mb(); /* See above block comment. */
-
 	/* If we are last CPU on way to dyntick-idle mode, accelerate it. */
 	rcu_needs_cpu_flush();
 }
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index e1a6663c0155..bd891def3303 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,11 +84,9 @@
  * Dynticks per-CPU state.
  */
 struct rcu_dynticks {
-	int dynticks_nesting;	/* Track nesting level, sort of. */
-	int dynticks;		/* Even value for dynticks-idle, else odd. */
-	int dynticks_nmi;	/* Even value for either dynticks-idle or */
-				/*  not in nmi handler, else odd. So this */
-				/*  remains even for nmi from irq handler. */
+	int dynticks_nesting;	/* Track irq/process nesting level. */
+	int dynticks_nmi_nesting; /* Track NMI nesting level. */
+	atomic_t dynticks;	/* Even value for dynticks-idle, else odd. */
 };
 
 /*
@@ -218,7 +216,6 @@ struct rcu_data {
 	/* 3) dynticks interface. */
 	struct rcu_dynticks *dynticks;	/* Shared per-CPU dynticks state. */
 	int dynticks_snap;		/* Per-GP tracking for dynticks. */
-	int dynticks_nmi_snap;		/* Per-GP tracking for dynticks_nmi. */
 #endif /* #ifdef CONFIG_NO_HZ */
 
 	/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 38426ef1bcd6..764b5fcc7c56 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1182,7 +1182,6 @@ int rcu_needs_cpu(int cpu)
 {
 	int c = 0;
 	int snap;
-	int snap_nmi;
 	int thatcpu;
 
 	/* Check for being in the holdoff period. */
@@ -1193,10 +1192,10 @@ int rcu_needs_cpu(int cpu)
 	for_each_online_cpu(thatcpu) {
 		if (thatcpu == cpu)
 			continue;
-		snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
-		snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
+		snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
+						     thatcpu).dynticks);
 		smp_mb(); /* Order sampling of snap with end of grace period. */
-		if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
+		if ((snap & 0x1) != 0) {
 			per_cpu(rcu_dyntick_drain, cpu) = 0;
 			per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
 			return rcu_needs_cpu_quick_check(cpu);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c8e97853b970..4a21ca55ef7c 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -57,10 +57,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
 		   rdp->passed_quiesc, rdp->passed_quiesc_completed,
 		   rdp->qs_pending);
 #ifdef CONFIG_NO_HZ
-	seq_printf(m, " dt=%d/%d dn=%d df=%lu",
-		   rdp->dynticks->dynticks,
+	seq_printf(m, " dt=%d/%d/%d df=%lu",
+		   atomic_read(&rdp->dynticks->dynticks),
 		   rdp->dynticks->dynticks_nesting,
-		   rdp->dynticks->dynticks_nmi,
+		   rdp->dynticks->dynticks_nmi_nesting,
 		   rdp->dynticks_fqs);
 #endif /* #ifdef CONFIG_NO_HZ */
 	seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
@@ -115,9 +115,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
 		   rdp->qs_pending);
 #ifdef CONFIG_NO_HZ
 	seq_printf(m, ",%d,%d,%d,%lu",
-		   rdp->dynticks->dynticks,
+		   atomic_read(&rdp->dynticks->dynticks),
 		   rdp->dynticks->dynticks_nesting,
-		   rdp->dynticks->dynticks_nmi,
+		   rdp->dynticks->dynticks_nmi_nesting,
 		   rdp->dynticks_fqs);
 #endif /* #ifdef CONFIG_NO_HZ */
 	seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
@@ -130,7 +130,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
 {
 	seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
 #ifdef CONFIG_NO_HZ
-	seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
+	seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
 #endif /* #ifdef CONFIG_NO_HZ */
 	seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
 #ifdef CONFIG_TREE_PREEMPT_RCU