diff options
author | Steven Rostedt <rostedt@goodmis.org> | 2008-02-29 12:46:50 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-02-29 12:46:50 -0500 |
commit | 2232c2d8e0a6a31061dec311f3d1cf7624bc14f1 (patch) | |
tree | 1d90ec0b8bd4e3c154e386f005ef596ee25fa53f /kernel | |
parent | c0f4133b8f70769bc8dda977feb9a29109d6ccca (diff) |
rcu: add support for dynamic ticks and preempt rcu
The PREEMPT-RCU can get stuck if a CPU goes idle and NO_HZ is set. The
idle CPU will not progress the RCU through its grace period and a
synchronize_rcu may get stuck. Without this patch I have a box that will
not boot when PREEMPT_RCU and NO_HZ are set. That same box boots fine
with this patch.
This patch comes from the -rt kernel where it has been tested for
several months.
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/rcupreempt.c | 224 | ||||
-rw-r--r-- | kernel/softirq.c | 1 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 3 |
3 files changed, 224 insertions, 4 deletions
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 987cfb7ade89..c7c52096df48 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c | |||
@@ -23,6 +23,10 @@ | |||
23 | * to Suparna Bhattacharya for pushing me completely away | 23 | * to Suparna Bhattacharya for pushing me completely away |
24 | * from atomic instructions on the read side. | 24 | * from atomic instructions on the read side. |
25 | * | 25 | * |
26 | * - Added handling of Dynamic Ticks | ||
27 | * Copyright 2007 - Paul E. McKenney <paulmck@us.ibm.com> | ||
28 | * - Steven Rostedt <srostedt@redhat.com> | ||
29 | * | ||
26 | * Papers: http://www.rdrop.com/users/paulmck/RCU | 30 | * Papers: http://www.rdrop.com/users/paulmck/RCU |
27 | * | 31 | * |
28 | * Design Document: http://lwn.net/Articles/253651/ | 32 | * Design Document: http://lwn.net/Articles/253651/ |
@@ -409,6 +413,212 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) | |||
409 | } | 413 | } |
410 | } | 414 | } |
411 | 415 | ||
416 | #ifdef CONFIG_NO_HZ | ||
417 | |||
418 | DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; | ||
419 | static DEFINE_PER_CPU(long, rcu_dyntick_snapshot); | ||
420 | static DEFINE_PER_CPU(int, rcu_update_flag); | ||
421 | |||
422 | /** | ||
423 | * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. | ||
424 | * | ||
425 | * If the CPU was idle with dynamic ticks active, this updates the | ||
426 | * dynticks_progress_counter to let the RCU handling know that the | ||
427 | * CPU is active. | ||
428 | */ | ||
429 | void rcu_irq_enter(void) | ||
430 | { | ||
431 | int cpu = smp_processor_id(); | ||
432 | |||
433 | if (per_cpu(rcu_update_flag, cpu)) | ||
434 | per_cpu(rcu_update_flag, cpu)++; | ||
435 | |||
436 | /* | ||
437 | * Only update if we are coming from a stopped ticks mode | ||
438 | * (dynticks_progress_counter is even). | ||
439 | */ | ||
440 | if (!in_interrupt() && | ||
441 | (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { | ||
442 | /* | ||
443 | * The following might seem like we could have a race | ||
444 | * with NMI/SMIs. But this really isn't a problem. | ||
445 | * Here we do a read/modify/write, and the race happens | ||
446 | * when an NMI/SMI comes in after the read and before | ||
447 | * the write. But NMI/SMIs will increment this counter | ||
448 | * twice before returning, so the zero bit will not | ||
449 | * be corrupted by the NMI/SMI which is the most important | ||
450 | * part. | ||
451 | * | ||
452 | * The only thing is that we would bring back the counter | ||
453 | * to a position that it was in during the NMI/SMI. | ||
454 | * But the zero bit would be set, so the rest of the | ||
455 | * counter would again be ignored. | ||
456 | * | ||
457 | * On return from the IRQ, the counter may have the zero | ||
458 | * bit be 0 and the counter the same as the return from | ||
459 | * the NMI/SMI. If the state machine was so unlucky to | ||
460 | * see that, it still doesn't matter, since all | ||
461 | * RCU read-side critical sections on this CPU would | ||
462 | * have already completed. | ||
463 | */ | ||
464 | per_cpu(dynticks_progress_counter, cpu)++; | ||
465 | /* | ||
466 | * The following memory barrier ensures that any | ||
467 | * rcu_read_lock() primitives in the irq handler | ||
468 | * are seen by other CPUs to follow the above | ||
469 | * increment to dynticks_progress_counter. This is | ||
470 | * required in order for other CPUs to correctly | ||
471 | * determine when it is safe to advance the RCU | ||
472 | * grace-period state machine. | ||
473 | */ | ||
474 | smp_mb(); /* see above block comment. */ | ||
475 | /* | ||
476 | * Since we can't determine the dynamic tick mode from | ||
477 | * the dynticks_progress_counter after this routine, | ||
478 | * we use a second flag to acknowledge that we came | ||
479 | * from an idle state with ticks stopped. | ||
480 | */ | ||
481 | per_cpu(rcu_update_flag, cpu)++; | ||
482 | /* | ||
483 | * If we take an NMI/SMI now, they will also increment | ||
484 | * the rcu_update_flag, and will not update the | ||
485 | * dynticks_progress_counter on exit. That is for | ||
486 | * this IRQ to do. | ||
487 | */ | ||
488 | } | ||
489 | } | ||
490 | |||
491 | /** | ||
492 | * rcu_irq_exit - Called from exiting Hard irq context. | ||
493 | * | ||
494 | * If the CPU was idle with dynamic ticks active, update the | ||
495 | * dynticks_progress_counter to let the RCU handling be | ||
496 | * aware that the CPU is going back to idle with no ticks. | ||
497 | */ | ||
498 | void rcu_irq_exit(void) | ||
499 | { | ||
500 | int cpu = smp_processor_id(); | ||
501 | |||
502 | /* | ||
503 | * rcu_update_flag is set if we interrupted the CPU | ||
504 | * when it was idle with ticks stopped. | ||
505 | * Once this occurs, we keep track of interrupt nesting | ||
506 | * because a NMI/SMI could also come in, and we still | ||
507 | * only want the IRQ that started the increment of the | ||
508 | * dynticks_progress_counter to be the one that modifies | ||
509 | * it on exit. | ||
510 | */ | ||
511 | if (per_cpu(rcu_update_flag, cpu)) { | ||
512 | if (--per_cpu(rcu_update_flag, cpu)) | ||
513 | return; | ||
514 | |||
515 | /* This must match the interrupt nesting */ | ||
516 | WARN_ON(in_interrupt()); | ||
517 | |||
518 | /* | ||
519 | * If an NMI/SMI happens now we are still | ||
520 | * protected by the dynticks_progress_counter being odd. | ||
521 | */ | ||
522 | |||
523 | /* | ||
524 | * The following memory barrier ensures that any | ||
525 | * rcu_read_unlock() primitives in the irq handler | ||
526 | * are seen by other CPUs to precede the following | ||
527 | * increment to dynticks_progress_counter. This | ||
528 | * is required in order for other CPUs to determine | ||
529 | * when it is safe to advance the RCU grace-period | ||
530 | * state machine. | ||
531 | */ | ||
532 | smp_mb(); /* see above block comment. */ | ||
533 | per_cpu(dynticks_progress_counter, cpu)++; | ||
534 | WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); | ||
535 | } | ||
536 | } | ||
537 | |||
538 | static void dyntick_save_progress_counter(int cpu) | ||
539 | { | ||
540 | per_cpu(rcu_dyntick_snapshot, cpu) = | ||
541 | per_cpu(dynticks_progress_counter, cpu); | ||
542 | } | ||
543 | |||
544 | static inline int | ||
545 | rcu_try_flip_waitack_needed(int cpu) | ||
546 | { | ||
547 | long curr; | ||
548 | long snap; | ||
549 | |||
550 | curr = per_cpu(dynticks_progress_counter, cpu); | ||
551 | snap = per_cpu(rcu_dyntick_snapshot, cpu); | ||
552 | smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ | ||
553 | |||
554 | /* | ||
555 | * If the CPU remained in dynticks mode for the entire time | ||
556 | * and didn't take any interrupts, NMIs, SMIs, or whatever, | ||
557 | * then it cannot be in the middle of an rcu_read_lock(), so | ||
558 | * the next rcu_read_lock() it executes must use the new value | ||
559 | * of the counter. So we can safely pretend that this CPU | ||
560 | * already acknowledged the counter. | ||
561 | */ | ||
562 | |||
563 | if ((curr == snap) && ((curr & 0x1) == 0)) | ||
564 | return 0; | ||
565 | |||
566 | /* | ||
567 | * If the CPU passed through or entered a dynticks idle phase with | ||
568 | * no active irq handlers, then, as above, we can safely pretend | ||
569 | * that this CPU already acknowledged the counter. | ||
570 | */ | ||
571 | |||
572 | if ((curr - snap) > 2 || (snap & 0x1) == 0) | ||
573 | return 0; | ||
574 | |||
575 | /* We need this CPU to explicitly acknowledge the counter flip. */ | ||
576 | |||
577 | return 1; | ||
578 | } | ||
579 | |||
580 | static inline int | ||
581 | rcu_try_flip_waitmb_needed(int cpu) | ||
582 | { | ||
583 | long curr; | ||
584 | long snap; | ||
585 | |||
586 | curr = per_cpu(dynticks_progress_counter, cpu); | ||
587 | snap = per_cpu(rcu_dyntick_snapshot, cpu); | ||
588 | smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ | ||
589 | |||
590 | /* | ||
591 | * If the CPU remained in dynticks mode for the entire time | ||
592 | * and didn't take any interrupts, NMIs, SMIs, or whatever, | ||
593 | * then it cannot have executed an RCU read-side critical section | ||
594 | * during that time, so there is no need for it to execute a | ||
595 | * memory barrier. | ||
596 | */ | ||
597 | |||
598 | if ((curr == snap) && ((curr & 0x1) == 0)) | ||
599 | return 0; | ||
600 | |||
601 | /* | ||
602 | * If the CPU either entered or exited an outermost interrupt, | ||
603 | * SMI, NMI, or whatever handler, then we know that it executed | ||
604 | * a memory barrier when doing so. So we don't need another one. | ||
605 | */ | ||
606 | if (curr != snap) | ||
607 | return 0; | ||
608 | |||
609 | /* We need the CPU to execute a memory barrier. */ | ||
610 | |||
611 | return 1; | ||
612 | } | ||
613 | |||
614 | #else /* !CONFIG_NO_HZ */ | ||
615 | |||
616 | # define dyntick_save_progress_counter(cpu) do { } while (0) | ||
617 | # define rcu_try_flip_waitack_needed(cpu) (1) | ||
618 | # define rcu_try_flip_waitmb_needed(cpu) (1) | ||
619 | |||
620 | #endif /* CONFIG_NO_HZ */ | ||
621 | |||
412 | /* | 622 | /* |
413 | * Get here when RCU is idle. Decide whether we need to | 623 | * Get here when RCU is idle. Decide whether we need to |
414 | * move out of idle state, and return non-zero if so. | 624 | * move out of idle state, and return non-zero if so. |
@@ -447,8 +657,10 @@ rcu_try_flip_idle(void) | |||
447 | 657 | ||
448 | /* Now ask each CPU for acknowledgement of the flip. */ | 658 | /* Now ask each CPU for acknowledgement of the flip. */ |
449 | 659 | ||
450 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | 660 | for_each_cpu_mask(cpu, rcu_cpu_online_map) { |
451 | per_cpu(rcu_flip_flag, cpu) = rcu_flipped; | 661 | per_cpu(rcu_flip_flag, cpu) = rcu_flipped; |
662 | dyntick_save_progress_counter(cpu); | ||
663 | } | ||
452 | 664 | ||
453 | return 1; | 665 | return 1; |
454 | } | 666 | } |
@@ -464,7 +676,8 @@ rcu_try_flip_waitack(void) | |||
464 | 676 | ||
465 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); | 677 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); |
466 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | 678 | for_each_cpu_mask(cpu, rcu_cpu_online_map) |
467 | if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { | 679 | if (rcu_try_flip_waitack_needed(cpu) && |
680 | per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { | ||
468 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); | 681 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); |
469 | return 0; | 682 | return 0; |
470 | } | 683 | } |
@@ -509,8 +722,10 @@ rcu_try_flip_waitzero(void) | |||
509 | smp_mb(); /* ^^^^^^^^^^^^ */ | 722 | smp_mb(); /* ^^^^^^^^^^^^ */ |
510 | 723 | ||
511 | /* Call for a memory barrier from each CPU. */ | 724 | /* Call for a memory barrier from each CPU. */ |
512 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | 725 | for_each_cpu_mask(cpu, rcu_cpu_online_map) { |
513 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; | 726 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; |
727 | dyntick_save_progress_counter(cpu); | ||
728 | } | ||
514 | 729 | ||
515 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); | 730 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); |
516 | return 1; | 731 | return 1; |
@@ -528,7 +743,8 @@ rcu_try_flip_waitmb(void) | |||
528 | 743 | ||
529 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); | 744 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); |
530 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | 745 | for_each_cpu_mask(cpu, rcu_cpu_online_map) |
531 | if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { | 746 | if (rcu_try_flip_waitmb_needed(cpu) && |
747 | per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { | ||
532 | RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); | 748 | RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); |
533 | return 0; | 749 | return 0; |
534 | } | 750 | } |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 5b3aea5f471e..31e9f2a47928 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -313,6 +313,7 @@ void irq_exit(void) | |||
313 | /* Make sure that timer wheel updates are propagated */ | 313 | /* Make sure that timer wheel updates are propagated */ |
314 | if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) | 314 | if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) |
315 | tick_nohz_stop_sched_tick(); | 315 | tick_nohz_stop_sched_tick(); |
316 | rcu_irq_exit(); | ||
316 | #endif | 317 | #endif |
317 | preempt_enable_no_resched(); | 318 | preempt_enable_no_resched(); |
318 | } | 319 | } |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa9bb73dbdb4..2968298f8f36 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -282,6 +282,7 @@ void tick_nohz_stop_sched_tick(void) | |||
282 | ts->idle_tick = ts->sched_timer.expires; | 282 | ts->idle_tick = ts->sched_timer.expires; |
283 | ts->tick_stopped = 1; | 283 | ts->tick_stopped = 1; |
284 | ts->idle_jiffies = last_jiffies; | 284 | ts->idle_jiffies = last_jiffies; |
285 | rcu_enter_nohz(); | ||
285 | } | 286 | } |
286 | 287 | ||
287 | /* | 288 | /* |
@@ -375,6 +376,8 @@ void tick_nohz_restart_sched_tick(void) | |||
375 | return; | 376 | return; |
376 | } | 377 | } |
377 | 378 | ||
379 | rcu_exit_nohz(); | ||
380 | |||
378 | /* Update jiffies first */ | 381 | /* Update jiffies first */ |
379 | select_nohz_load_balancer(0); | 382 | select_nohz_load_balancer(0); |
380 | now = ktime_get(); | 383 | now = ktime_get(); |