| author | Steven Rostedt <rostedt@goodmis.org> | 2008-02-29 12:46:50 -0500 |
|---|---|---|
| committer | Ingo Molnar <mingo@elte.hu> | 2008-02-29 12:46:50 -0500 |
| commit | 2232c2d8e0a6a31061dec311f3d1cf7624bc14f1 (patch) | |
| tree | 1d90ec0b8bd4e3c154e386f005ef596ee25fa53f /kernel | |
| parent | c0f4133b8f70769bc8dda977feb9a29109d6ccca (diff) | |
rcu: add support for dynamic ticks and preempt rcu
The PREEMPT-RCU can get stuck if a CPU goes idle and NO_HZ is set. The
idle CPU will not progress RCU through its grace period, and a
synchronize_rcu() may get stuck. Without this patch I have a box that
will not boot when PREEMPT_RCU and NO_HZ are set. That same box boots
fine with this patch.
This patch comes from the -rt kernel where it has been tested for
several months.
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
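The core of the patch is a per-CPU counter protocol: dynticks_progress_counter is odd while the CPU is active and even while it is dyntick-idle, and the grace-period state machine snapshots it per CPU to decide which idle CPUs it may safely skip. Below is a minimal user-space C model of that protocol, written for this page as an illustration; the names mirror the diff, but this is a sketch, not the kernel code (which appears in the hunks below).

```c
/* User-space model of the dynticks_progress_counter protocol.
 * Illustrative only -- the real (per-CPU, barrier-ordered) code is in
 * the kernel/rcupreempt.c hunks below. */
#include <stdio.h>

static long counter = 1;	/* odd: CPU active; even: dyntick-idle */
static long snapshot;		/* models per-CPU rcu_dyntick_snapshot */

static void enter_idle(void) { counter++; }	/* odd -> even */
static void exit_idle(void)  { counter++; }	/* even -> odd */

static void save_snapshot(void) { snapshot = counter; }

/* Mirrors rcu_try_flip_waitack_needed(): must this CPU explicitly
 * acknowledge the counter flip? */
static int ack_needed(void)
{
	long curr = counter, snap = snapshot;

	/* Idle the whole time with no interrupts: it cannot be inside
	 * an RCU read-side critical section. */
	if (curr == snap && (curr & 0x1) == 0)
		return 0;
	/* Passed through idle since the snapshot, or was idle when
	 * the snapshot was taken. */
	if ((curr - snap) > 2 || (snap & 0x1) == 0)
		return 0;
	return 1;
}

int main(void)
{
	save_snapshot();		/* counter == 1: CPU active */
	printf("%d\n", ack_needed());	/* 1: explicit ack required */

	enter_idle();			/* counter == 2 */
	save_snapshot();
	printf("%d\n", ack_needed());	/* 0: idle for the whole interval */

	exit_idle();			/* took an interrupt... */
	enter_idle();			/* ...and went idle again (== 4) */
	printf("%d\n", ack_needed());	/* 0: snapshot was taken while idle */
	return 0;
}
```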
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/rcupreempt.c | 224 |
| -rw-r--r-- | kernel/softirq.c | 1 |
| -rw-r--r-- | kernel/time/tick-sched.c | 3 |
3 files changed, 224 insertions, 4 deletions
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 987cfb7ade..c7c52096df 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
| @@ -23,6 +23,10 @@ | |||
| 23 | * to Suparna Bhattacharya for pushing me completely away | 23 | * to Suparna Bhattacharya for pushing me completely away |
| 24 | * from atomic instructions on the read side. | 24 | * from atomic instructions on the read side. |
| 25 | * | 25 | * |
| 26 | * - Added handling of Dynamic Ticks | ||
| 27 | * Copyright 2007 - Paul E. McKenney <paulmck@us.ibm.com> | ||
| 28 | * - Steven Rostedt <srostedt@redhat.com> | ||
| 29 | * | ||
| 26 | * Papers: http://www.rdrop.com/users/paulmck/RCU | 30 | * Papers: http://www.rdrop.com/users/paulmck/RCU |
| 27 | * | 31 | * |
| 28 | * Design Document: http://lwn.net/Articles/253651/ | 32 | * Design Document: http://lwn.net/Articles/253651/ |
| @@ -409,6 +413,212 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) | |||
| 409 | } | 413 | } |
| 410 | } | 414 | } |
| 411 | 415 | ||
| 416 | #ifdef CONFIG_NO_HZ | ||
| 417 | |||
| 418 | DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; | ||
| 419 | static DEFINE_PER_CPU(long, rcu_dyntick_snapshot); | ||
| 420 | static DEFINE_PER_CPU(int, rcu_update_flag); | ||
| 421 | |||
| 422 | /** | ||
| 423 | * rcu_irq_enter - Called from hard-irq handlers and NMI/SMI. | ||
| 424 | * | ||
| 425 | * If the CPU was idle with dynamic ticks active, this updates the | ||
| 426 | * dynticks_progress_counter to let the RCU handling know that the | ||
| 427 | * CPU is active. | ||
| 428 | */ | ||
| 429 | void rcu_irq_enter(void) | ||
| 430 | { | ||
| 431 | int cpu = smp_processor_id(); | ||
| 432 | |||
| 433 | if (per_cpu(rcu_update_flag, cpu)) | ||
| 434 | per_cpu(rcu_update_flag, cpu)++; | ||
| 435 | |||
| 436 | /* | ||
| 437 | * Only update if we are coming from a stopped ticks mode | ||
| 438 | * (dynticks_progress_counter is even). | ||
| 439 | */ | ||
| 440 | if (!in_interrupt() && | ||
| 441 | (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { | ||
| 442 | /* | ||
| 443 | * The following might seem like we could have a race | ||
| 444 | * with NMI/SMIs. But this really isn't a problem. | ||
| 445 | * Here we do a read/modify/write, and the race happens | ||
| 446 | * when an NMI/SMI comes in after the read and before | ||
| 447 | * the write. But NMI/SMIs will increment this counter | ||
| 448 | * twice before returning, so the zero bit will not | ||
| 449 | * be corrupted by the NMI/SMI which is the most important | ||
| 450 | * part. | ||
| 451 | * | ||
| 452 | * The only thing is that we would bring back the counter | ||
| 453 | * to a position it was in during the NMI/SMI. | ||
| 454 | * But the zero bit would be set, so the rest of the | ||
| 455 | * counter would again be ignored. | ||
| 456 | * | ||
| 457 | * On return from the IRQ, the counter's zero bit may be | ||
| 458 | * 0 and its value the same as on return from the | ||
| 459 | * NMI/SMI. If the state machine were unlucky enough to | ||
| 460 | * see that, it still would not matter, since all | ||
| 461 | * RCU read-side critical sections on this CPU would | ||
| 462 | * have already completed. | ||
| 463 | */ | ||
| 464 | per_cpu(dynticks_progress_counter, cpu)++; | ||
| 465 | /* | ||
| 466 | * The following memory barrier ensures that any | ||
| 467 | * rcu_read_lock() primitives in the irq handler | ||
| 468 | * are seen by other CPUs to follow the above | ||
| 469 | * increment to dynticks_progress_counter. This is | ||
| 470 | * required in order for other CPUs to correctly | ||
| 471 | * determine when it is safe to advance the RCU | ||
| 472 | * grace-period state machine. | ||
| 473 | */ | ||
| 474 | smp_mb(); /* see above block comment. */ | ||
| 475 | /* | ||
| 476 | * Since we can't determine the dynamic tick mode from | ||
| 477 | * the dynticks_progress_counter after this routine, | ||
| 478 | * we use a second flag to acknowledge that we came | ||
| 479 | * from an idle state with ticks stopped. | ||
| 480 | */ | ||
| 481 | per_cpu(rcu_update_flag, cpu)++; | ||
| 482 | /* | ||
| 483 | * If we take an NMI/SMI now, they will also increment | ||
| 484 | * the rcu_update_flag, and will not update the | ||
| 485 | * dynticks_progress_counter on exit. That is for | ||
| 486 | * this IRQ to do. | ||
| 487 | */ | ||
| 488 | } | ||
| 489 | } | ||
| 490 | |||
| 491 | /** | ||
| 492 | * rcu_irq_exit - Called when exiting hard-irq context. | ||
| 493 | * | ||
| 494 | * If the CPU was idle with dynamic ticks active, update the | ||
| 495 | * dynticks_progress_counter to let the RCU handling know | ||
| 496 | * that the CPU is going back to idle with no ticks. | ||
| 497 | */ | ||
| 498 | void rcu_irq_exit(void) | ||
| 499 | { | ||
| 500 | int cpu = smp_processor_id(); | ||
| 501 | |||
| 502 | /* | ||
| 503 | * rcu_update_flag is set if we interrupted the CPU | ||
| 504 | * when it was idle with ticks stopped. | ||
| 505 | * Once this occurs, we keep track of interrupt nesting | ||
| 506 | * because an NMI/SMI could also come in, and we still | ||
| 507 | * only want the IRQ that started the increment of the | ||
| 508 | * dynticks_progress_counter to be the one that modifies | ||
| 509 | * it on exit. | ||
| 510 | */ | ||
| 511 | if (per_cpu(rcu_update_flag, cpu)) { | ||
| 512 | if (--per_cpu(rcu_update_flag, cpu)) | ||
| 513 | return; | ||
| 514 | |||
| 515 | /* This must match the interrupt nesting */ | ||
| 516 | WARN_ON(in_interrupt()); | ||
| 517 | |||
| 518 | /* | ||
| 519 | * If an NMI/SMI happens now we are still | ||
| 520 | * protected by the dynticks_progress_counter being odd. | ||
| 521 | */ | ||
| 522 | |||
| 523 | /* | ||
| 524 | * The following memory barrier ensures that any | ||
| 525 | * rcu_read_unlock() primitives in the irq handler | ||
| 526 | * are seen by other CPUs to precede the following | ||
| 527 | * increment to dynticks_progress_counter. This | ||
| 528 | * is required in order for other CPUs to determine | ||
| 529 | * when it is safe to advance the RCU grace-period | ||
| 530 | * state machine. | ||
| 531 | */ | ||
| 532 | smp_mb(); /* see above block comment. */ | ||
| 533 | per_cpu(dynticks_progress_counter, cpu)++; | ||
| 534 | WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); | ||
| 535 | } | ||
| 536 | } | ||
| 537 | |||
| 538 | static void dyntick_save_progress_counter(int cpu) | ||
| 539 | { | ||
| 540 | per_cpu(rcu_dyntick_snapshot, cpu) = | ||
| 541 | per_cpu(dynticks_progress_counter, cpu); | ||
| 542 | } | ||
| 543 | |||
| 544 | static inline int | ||
| 545 | rcu_try_flip_waitack_needed(int cpu) | ||
| 546 | { | ||
| 547 | long curr; | ||
| 548 | long snap; | ||
| 549 | |||
| 550 | curr = per_cpu(dynticks_progress_counter, cpu); | ||
| 551 | snap = per_cpu(rcu_dyntick_snapshot, cpu); | ||
| 552 | smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ | ||
| 553 | |||
| 554 | /* | ||
| 555 | * If the CPU remained in dynticks mode for the entire time | ||
| 556 | * and didn't take any interrupts, NMIs, SMIs, or whatever, | ||
| 557 | * then it cannot be in the middle of an rcu_read_lock(), so | ||
| 558 | * the next rcu_read_lock() it executes must use the new value | ||
| 559 | * of the counter. So we can safely pretend that this CPU | ||
| 560 | * already acknowledged the counter. | ||
| 561 | */ | ||
| 562 | |||
| 563 | if ((curr == snap) && ((curr & 0x1) == 0)) | ||
| 564 | return 0; | ||
| 565 | |||
| 566 | /* | ||
| 567 | * If the CPU passed through or entered a dynticks idle phase with | ||
| 568 | * no active irq handlers, then, as above, we can safely pretend | ||
| 569 | * that this CPU already acknowledged the counter. | ||
| 570 | */ | ||
| 571 | |||
| 572 | if ((curr - snap) > 2 || (snap & 0x1) == 0) | ||
| 573 | return 0; | ||
| 574 | |||
| 575 | /* We need this CPU to explicitly acknowledge the counter flip. */ | ||
| 576 | |||
| 577 | return 1; | ||
| 578 | } | ||
| 579 | |||
| 580 | static inline int | ||
| 581 | rcu_try_flip_waitmb_needed(int cpu) | ||
| 582 | { | ||
| 583 | long curr; | ||
| 584 | long snap; | ||
| 585 | |||
| 586 | curr = per_cpu(dynticks_progress_counter, cpu); | ||
| 587 | snap = per_cpu(rcu_dyntick_snapshot, cpu); | ||
| 588 | smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ | ||
| 589 | |||
| 590 | /* | ||
| 591 | * If the CPU remained in dynticks mode for the entire time | ||
| 592 | * and didn't take any interrupts, NMIs, SMIs, or whatever, | ||
| 593 | * then it cannot have executed an RCU read-side critical section | ||
| 594 | * during that time, so there is no need for it to execute a | ||
| 595 | * memory barrier. | ||
| 596 | */ | ||
| 597 | |||
| 598 | if ((curr == snap) && ((curr & 0x1) == 0)) | ||
| 599 | return 0; | ||
| 600 | |||
| 601 | /* | ||
| 602 | * If the CPU either entered or exited an outermost interrupt, | ||
| 603 | * SMI, NMI, or whatever handler, then we know that it executed | ||
| 604 | * a memory barrier when doing so. So we don't need another one. | ||
| 605 | */ | ||
| 606 | if (curr != snap) | ||
| 607 | return 0; | ||
| 608 | |||
| 609 | /* We need the CPU to execute a memory barrier. */ | ||
| 610 | |||
| 611 | return 1; | ||
| 612 | } | ||
| 613 | |||
| 614 | #else /* !CONFIG_NO_HZ */ | ||
| 615 | |||
| 616 | # define dyntick_save_progress_counter(cpu) do { } while (0) | ||
| 617 | # define rcu_try_flip_waitack_needed(cpu) (1) | ||
| 618 | # define rcu_try_flip_waitmb_needed(cpu) (1) | ||
| 619 | |||
| 620 | #endif /* CONFIG_NO_HZ */ | ||
| 621 | |||
| 412 | /* | 622 | /* |
| 413 | * Get here when RCU is idle. Decide whether we need to | 623 | * Get here when RCU is idle. Decide whether we need to |
| 414 | * move out of idle state, and return non-zero if so. | 624 | * move out of idle state, and return non-zero if so. |
| @@ -447,8 +657,10 @@ rcu_try_flip_idle(void) | |||
| 447 | 657 | ||
| 448 | /* Now ask each CPU for acknowledgement of the flip. */ | 658 | /* Now ask each CPU for acknowledgement of the flip. */ |
| 449 | 659 | ||
| 450 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | 660 | for_each_cpu_mask(cpu, rcu_cpu_online_map) { |
| 451 | per_cpu(rcu_flip_flag, cpu) = rcu_flipped; | 661 | per_cpu(rcu_flip_flag, cpu) = rcu_flipped; |
| 662 | dyntick_save_progress_counter(cpu); | ||
| 663 | } | ||
| 452 | 664 | ||
| 453 | return 1; | 665 | return 1; |
| 454 | } | 666 | } |
| @@ -464,7 +676,8 @@ rcu_try_flip_waitack(void) | |||
| 464 | 676 | ||
| 465 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); | 677 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); |
| 466 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | 678 | for_each_cpu_mask(cpu, rcu_cpu_online_map) |
| 467 | if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { | 679 | if (rcu_try_flip_waitack_needed(cpu) && |
| 680 | per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { | ||
| 468 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); | 681 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); |
| 469 | return 0; | 682 | return 0; |
| 470 | } | 683 | } |
| @@ -509,8 +722,10 @@ rcu_try_flip_waitzero(void) | |||
| 509 | smp_mb(); /* ^^^^^^^^^^^^ */ | 722 | smp_mb(); /* ^^^^^^^^^^^^ */ |
| 510 | 723 | ||
| 511 | /* Call for a memory barrier from each CPU. */ | 724 | /* Call for a memory barrier from each CPU. */ |
| 512 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | 725 | for_each_cpu_mask(cpu, rcu_cpu_online_map) { |
| 513 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; | 726 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; |
| 727 | dyntick_save_progress_counter(cpu); | ||
| 728 | } | ||
| 514 | 729 | ||
| 515 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); | 730 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); |
| 516 | return 1; | 731 | return 1; |
| @@ -528,7 +743,8 @@ rcu_try_flip_waitmb(void) | |||
| 528 | 743 | ||
| 529 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); | 744 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); |
| 530 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | 745 | for_each_cpu_mask(cpu, rcu_cpu_online_map) |
| 531 | if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { | 746 | if (rcu_try_flip_waitmb_needed(cpu) && |
| 747 | per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { | ||
| 532 | RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); | 748 | RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); |
| 533 | return 0; | 749 | return 0; |
| 534 | } | 750 | } |
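A point worth calling out in the rcupreempt.c hunks above: rcu_update_flag is a nesting count, not a boolean, because an NMI or SMI can arrive between an IRQ's rcu_irq_enter() and rcu_irq_exit(), and only the outermost handler (the one that made the counter odd) may make it even again on exit. The small user-space model below, with in_interrupt() approximated by a depth counter, illustrates that invariant; it is a sketch of the logic, not the kernel code.

```c
/* User-space model of rcu_irq_enter()/rcu_irq_exit() nesting.
 * Illustrative only; irq_depth stands in for in_interrupt(). */
#include <assert.h>

static long dynticks = 2;	/* even: CPU is dyntick-idle */
static int update_flag;		/* nesting since the entry-side bump */
static int irq_depth;		/* models in_interrupt() */

static void model_irq_enter(void)
{
	if (update_flag)
		update_flag++;		/* nested NMI/SMI: just count it */
	if (irq_depth == 0 && (dynticks & 0x1) == 0) {
		dynticks++;		/* even -> odd: CPU now visible */
		update_flag++;		/* this handler owns the exit bump */
	}
	irq_depth++;
}

static void model_irq_exit(void)
{
	irq_depth--;
	if (update_flag && --update_flag == 0) {
		assert(irq_depth == 0);	/* only the outermost gets here */
		dynticks++;		/* odd -> even: back to idle */
	}
}

int main(void)
{
	model_irq_enter();	/* IRQ hits idle CPU: dynticks 2 -> 3 */
	model_irq_enter();	/* nested NMI: flag 1 -> 2, counter untouched */
	model_irq_exit();	/* NMI returns: flag 2 -> 1, counter untouched */
	model_irq_exit();	/* IRQ returns: dynticks 3 -> 4 */
	assert(dynticks == 4 && (dynticks & 0x1) == 0);
	return 0;
}
```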
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5b3aea5f47..31e9f2a479 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
| @@ -313,6 +313,7 @@ void irq_exit(void) | |||
| 313 | /* Make sure that timer wheel updates are propagated */ | 313 | /* Make sure that timer wheel updates are propagated */ |
| 314 | if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) | 314 | if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) |
| 315 | tick_nohz_stop_sched_tick(); | 315 | tick_nohz_stop_sched_tick(); |
| 316 | rcu_irq_exit(); | ||
| 316 | #endif | 317 | #endif |
| 317 | preempt_enable_no_resched(); | 318 | preempt_enable_no_resched(); |
| 318 | } | 319 | } |
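Only the exit-side hook appears in this hunk because the diffstat is limited to kernel/; irq_enter() was a macro in a header at the time, so the matching rcu_irq_enter() call is not visible here. A hypothetical sketch of that pairing follows. The exact placement is an assumption, but the ordering constraint is real: rcu_irq_enter() must run before the hard-irq preempt count is raised, since it uses !in_interrupt() to detect the outermost entry.

```c
/* Hypothetical sketch of the entry-side pairing; the actual change is
 * outside kernel/ and not shown in this filtered diff. rcu_irq_enter()
 * must precede add_preempt_count(HARDIRQ_OFFSET) so that its
 * in_interrupt() test still sees the outermost hard-irq entry. */
#define irq_enter()					\
	do {						\
		rcu_irq_enter();			\
		account_system_vtime(current);		\
		add_preempt_count(HARDIRQ_OFFSET);	\
	} while (0)
```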
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index fa9bb73dbd..2968298f8f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
| @@ -282,6 +282,7 @@ void tick_nohz_stop_sched_tick(void) | |||
| 282 | ts->idle_tick = ts->sched_timer.expires; | 282 | ts->idle_tick = ts->sched_timer.expires; |
| 283 | ts->tick_stopped = 1; | 283 | ts->tick_stopped = 1; |
| 284 | ts->idle_jiffies = last_jiffies; | 284 | ts->idle_jiffies = last_jiffies; |
| 285 | rcu_enter_nohz(); | ||
| 285 | } | 286 | } |
| 286 | 287 | ||
| 287 | /* | 288 | /* |
| @@ -375,6 +376,8 @@ void tick_nohz_restart_sched_tick(void) | |||
| 375 | return; | 376 | return; |
| 376 | } | 377 | } |
| 377 | 378 | ||
| 379 | rcu_exit_nohz(); | ||
| 380 | |||
| 378 | /* Update jiffies first */ | 381 | /* Update jiffies first */ |
| 379 | select_nohz_load_balancer(0); | 382 | select_nohz_load_balancer(0); |
| 380 | now = ktime_get(); | 383 | now = ktime_get(); |
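Likewise, rcu_enter_nohz() and rcu_exit_nohz(), called from the tick-sched.c hunks above, are defined outside kernel/ and therefore absent from this filtered diff. Consistent with the even/odd counter protocol, they would look roughly like the sketch below; treat it as an assumption-labeled reconstruction, not the committed code.

```c
/* Sketch of the nohz hooks, consistent with the counter protocol above.
 * Illustrative only; the committed definitions live outside kernel/. */
static inline void rcu_enter_nohz(void)
{
	smp_mb(); /* CPUs seeing the increment see prior read-side sections. */
	__get_cpu_var(dynticks_progress_counter)++;	/* odd -> even */
	WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1);
}

static inline void rcu_exit_nohz(void)
{
	__get_cpu_var(dynticks_progress_counter)++;	/* even -> odd */
	smp_mb(); /* CPUs seeing the increment see later read-side sections. */
	WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1));
}
```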
