author		Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2009-11-02 16:52:28 -0500
committer	Ingo Molnar <mingo@elte.hu>			2009-11-09 22:11:54 -0500
commit		d09b62dfa336447c52a5ec9bb88adbc479b0f3b8 (patch)
tree		70a002fed2e0471def01ea3c166137449cb1527d /kernel
parent		281d150c5f8892f158747594ab49ce2823fd8b8c (diff)
rcu: Fix synchronization for rcu_process_gp_end() uses of ->completed counter
Impose a clear locking design on the rcu_process_gp_end()
function's use of the ->completed counter. This is done by
creating a ->completed field in the rcu_node structure, which
can safely be accessed under the protection of that structure's
lock. Performance and scalability are maintained by using a
form of double-checked locking, so that rcu_process_gp_end()
only acquires the leaf rcu_node structure's ->lock if a grace
period has recently ended.
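In other words, an unlocked read decides whether the lock is worth
taking, and the decisive comparison is repeated under the lock.  A
rough standalone sketch of that shape follows; the types and the
maybe_advance() name are illustrative only, not kernel APIs, and the
real code is in the rcutree.c hunks below:

	struct node {
		spinlock_t lock;
		long completed;		/* written only under ->lock */
	};

	struct percpu_data {
		long completed;		/* CPU-local snapshot, no lock needed */
	};

	static void maybe_advance(struct node *np, struct percpu_data *dp)
	{
		/* Fast path: racy read; skip the lock if nothing seems new. */
		if (dp->completed == ACCESS_ONCE(np->completed))
			return;

		spin_lock(&np->lock);
		/* Slow path: re-check now that ->completed cannot change. */
		if (dp->completed != np->completed) {
			/* ...advance callbacks here, safely... */
			dp->completed = np->completed;
		}
		spin_unlock(&np->lock);
	}

The fast path costs one read per invocation, so the lock is contended
only in the brief window after a grace period ends.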
This fix reduces rcutorture failure rate by at least two orders
of magnitude under heavy stress with force_quiescent_state()
being invoked artificially often. Without this fix,
unsynchronized access to the ->completed field can cause
rcu_process_gp_end() to advance callbacks whose grace period has
not yet expired. (Bad idea!)
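For context, "advancing callbacks" means sliding the tail pointers of
the per-CPU segmented callback list, as the nxttail[] assignments in
the patch below do.  A simplified model (names shortened from the
kernel's RCU_*_TAIL indices; this is an illustration, not the actual
struct rcu_data layout):

	enum { DONE_TAIL, WAIT_TAIL, NEXT_READY_TAIL, NEXT_TAIL, NUM_TAILS };

	struct cb;				/* opaque callback record */

	struct cb_list {
		struct cb *head;		/* single linked list of callbacks */
		struct cb **tail[NUM_TAILS];	/* segment boundaries in the list */
	};

	/*
	 * On grace-period completion, each segment moves up one stage:
	 * WAIT becomes DONE (invocable), NEXT_READY becomes WAIT, and
	 * NEXT becomes NEXT_READY.
	 */
	static void advance_on_gp_end(struct cb_list *cl)
	{
		cl->tail[DONE_TAIL] = cl->tail[WAIT_TAIL];
		cl->tail[WAIT_TAIL] = cl->tail[NEXT_READY_TAIL];
		cl->tail[NEXT_READY_TAIL] = cl->tail[NEXT_TAIL];
	}

Run before the grace period has actually ended, this moves still-unsafe
callbacks into the DONE segment, where they can be invoked while readers
may yet hold references -- exactly the failure mode described above.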
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
Cc: <stable@kernel.org> # .32.x
LKML-Reference: <12571987494069-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/rcutree.c	128
-rw-r--r--	kernel/rcutree.h	3
2 files changed, 83 insertions, 48 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 26249abf24dc..9e068d112153 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -570,6 +570,76 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
 }
 
 /*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended. This may be called only from the CPU to whom the rdp
+ * belongs. In addition, the corresponding leaf rcu_node structure's
+ * ->lock must be held by the caller, with irqs disabled.
+ */
+static void
+__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+{
+	/* Did another grace period end? */
+	if (rdp->completed != rnp->completed) {
+
+		/* Advance callbacks. No harm if list empty. */
+		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		/* Remember that we saw this grace-period completion. */
+		rdp->completed = rnp->completed;
+	}
+}
+
+/*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended. This may be called only from the CPU to whom the rdp
+ * belongs.
+ */
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+	struct rcu_node *rnp;
+
+	local_irq_save(flags);
+	rnp = rdp->mynode;
+	if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
+	    !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
+		local_irq_restore(flags);
+		return;
+	}
+	__rcu_process_gp_end(rsp, rnp, rdp);
+	spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Do per-CPU grace-period initialization for running CPU. The caller
+ * must hold the lock of the leaf rcu_node structure corresponding to
+ * this CPU.
+ */
+static void
+rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+{
+	/* Prior grace period ended, so advance callbacks for current CPU. */
+	__rcu_process_gp_end(rsp, rnp, rdp);
+
+	/*
+	 * Because this CPU just now started the new grace period, we know
+	 * that all of its callbacks will be covered by this upcoming grace
+	 * period, even the ones that were registered arbitrarily recently.
+	 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
+	 *
+	 * Other CPUs cannot be sure exactly when the grace period started.
+	 * Therefore, their recently registered callbacks must pass through
+	 * an additional RCU_NEXT_READY stage, so that they will be handled
+	 * by the next RCU grace period.
+	 */
+	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+}
+
+/*
  * Start a new RCU grace period if warranted, re-initializing the hierarchy
  * in preparation for detecting the next grace period. The caller must hold
  * the root node's ->lock, which is released before return. Hard irqs must
@@ -596,26 +666,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	dyntick_record_completed(rsp, rsp->completed - 1);
 	note_new_gpnum(rsp, rdp);
 
-	/*
-	 * Because this CPU just now started the new grace period, we know
-	 * that all of its callbacks will be covered by this upcoming grace
-	 * period, even the ones that were registered arbitrarily recently.
-	 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
-	 *
-	 * Other CPUs cannot be sure exactly when the grace period started.
-	 * Therefore, their recently registered callbacks must pass through
-	 * an additional RCU_NEXT_READY stage, so that they will be handled
-	 * by the next RCU grace period.
-	 */
-	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-
 	/* Special-case the common single-level case. */
 	if (NUM_RCU_NODES == 1) {
 		rcu_preempt_check_blocked_tasks(rnp);
 		rnp->qsmask = rnp->qsmaskinit;
 		rnp->gpnum = rsp->gpnum;
+		rnp->completed = rsp->completed;
 		rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
+		rcu_start_gp_per_cpu(rsp, rnp, rdp);
 		spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
@@ -648,6 +706,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 		rcu_preempt_check_blocked_tasks(rnp);
 		rnp->qsmask = rnp->qsmaskinit;
 		rnp->gpnum = rsp->gpnum;
+		rnp->completed = rsp->completed;
+		if (rnp == rdp->mynode)
+			rcu_start_gp_per_cpu(rsp, rnp, rdp);
 		spin_unlock(&rnp->lock); /* irqs remain disabled. */
 	}
 
@@ -659,34 +720,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 }
 
 /*
- * Advance this CPU's callbacks, but only if the current grace period
- * has ended. This may be called only from the CPU to whom the rdp
- * belongs.
- */
-static void
-rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
-{
-	long completed_snap;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
-
-	/* Did another grace period end? */
-	if (rdp->completed != completed_snap) {
-
-		/* Advance callbacks. No harm if list empty. */
-		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
-		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
-		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-
-		/* Remember that we saw this grace-period completion. */
-		rdp->completed = completed_snap;
-	}
-	local_irq_restore(flags);
-}
-
-/*
  * Clean up after the prior grace period and let rcu_start_gp() start up
  * the next grace period if one is needed. Note that the caller must
  * hold rnp->lock, as required by rcu_start_gp(), which will release it.
@@ -697,7 +730,6 @@ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 	rsp->completed = rsp->gpnum;
 	rsp->signaled = RCU_GP_IDLE;
-	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
 	rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
 }
 
@@ -1539,21 +1571,16 @@ static void __cpuinit
 rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
 {
 	unsigned long flags;
-	long lastcomp;
 	unsigned long mask;
 	struct rcu_data *rdp = rsp->rda[cpu];
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	/* Set up local state, ensuring consistent view of global state. */
 	spin_lock_irqsave(&rnp->lock, flags);
-	lastcomp = rsp->completed;
-	rdp->completed = lastcomp;
-	rdp->gpnum = lastcomp;
 	rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
 	rdp->qs_pending = 1;	 /* so set up to respond to current GP. */
 	rdp->beenonline = 1;	 /* We have now been online. */
 	rdp->preemptable = preemptable;
-	rdp->passed_quiesc_completed = lastcomp - 1;
 	rdp->qlen_last_fqs_check = 0;
 	rdp->n_force_qs_snap = rsp->n_force_qs;
 	rdp->blimit = blimit;
@@ -1575,6 +1602,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
 		spin_lock(&rnp->lock); /* irqs already disabled. */
 		rnp->qsmaskinit |= mask;
 		mask = rnp->grpmask;
+		if (rnp == rdp->mynode) {
+			rdp->gpnum = rnp->completed; /* if GP in progress... */
+			rdp->completed = rnp->completed;
+			rdp->passed_quiesc_completed = rnp->completed - 1;
+		}
 		spin_unlock(&rnp->lock); /* irqs already disabled. */
 		rnp = rnp->parent;
 	} while (rnp != NULL && !(rnp->qsmaskinit & mask));
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 8a4c1650ad8d..c1891c3cae63 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,6 +84,9 @@ struct rcu_node {
 	long	gpnum;		/* Current grace period for this node. */
 				/*  This will either be equal to or one */
 				/*  behind the root rcu_node's gpnum. */
+	long	completed;	/* Last grace period completed for this node. */
+				/*  This will either be equal to or one */
+				/*  behind the root rcu_node's gpnum. */
 	unsigned long qsmask;	/* CPUs or groups that need to switch in */
 				/*  order for current grace period to proceed.*/
 				/*  In leaf rcu_node, each bit corresponds to */