author     Paul E. McKenney <paulmck@linux.vnet.ibm.com>  2009-11-02 16:52:28 -0500
committer  Ingo Molnar <mingo@elte.hu>  2009-11-09 22:11:54 -0500
commit     d09b62dfa336447c52a5ec9bb88adbc479b0f3b8 (patch)
tree       70a002fed2e0471def01ea3c166137449cb1527d /kernel
parent     281d150c5f8892f158747594ab49ce2823fd8b8c (diff)
rcu: Fix synchronization for rcu_process_gp_end() uses of ->completed counter
Impose a clear locking design on the rcu_process_gp_end() function's use of the ->completed counter. This is done by creating a ->completed field in the rcu_node structure, which can safely be accessed under the protection of that structure's lock. Performance and scalability are maintained by using a form of double-checked locking, so that rcu_process_gp_end() only acquires the leaf rcu_node structure's ->lock if a grace period has recently ended.

This fix reduces rcutorture failure rate by at least two orders of magnitude under heavy stress with force_quiescent_state() being invoked artificially often. Without this fix, unsynchronized access to the ->completed field can cause rcu_process_gp_end() to advance callbacks whose grace period has not yet expired. (Bad idea!)

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
Cc: <stable@kernel.org> # .32.x
LKML-Reference: <12571987494069-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
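For readers unfamiliar with the pattern, the following is a minimal user-space sketch of the double-checked-locking idea described above: a cheap unlocked comparison of a completion counter decides whether the per-node lock needs to be taken at all, and the authoritative comparison is then repeated under that lock. All names here (fake_node, fake_cpu, maybe_catch_up) are hypothetical illustrations, not code from this patch.

#include <pthread.h>
#include <stdio.h>

struct fake_node {
        pthread_mutex_t lock;
        long completed;         /* advanced when a "grace period" ends */
};

struct fake_cpu {
        long completed;         /* last completion this CPU has seen */
};

/* Authoritative check; caller must hold node->lock. */
static void catch_up_locked(struct fake_node *node, struct fake_cpu *cpu)
{
        if (cpu->completed != node->completed) {
                /* ... advance callbacks here ... */
                cpu->completed = node->completed;
        }
}

/* Double-checked locking: take the lock only if the unlocked snapshot says so. */
static void maybe_catch_up(struct fake_node *node, struct fake_cpu *cpu)
{
        if (cpu->completed == node->completed ||          /* quick check, no lock */
            pthread_mutex_trylock(&node->lock) != 0)      /* contended: retry later */
                return;
        catch_up_locked(node, cpu);                       /* re-check under the lock */
        pthread_mutex_unlock(&node->lock);
}

int main(void)
{
        struct fake_node node = { PTHREAD_MUTEX_INITIALIZER, 1 };
        struct fake_cpu cpu = { 0 };

        maybe_catch_up(&node, &cpu);
        printf("cpu caught up to completion %ld\n", cpu.completed);
        return 0;
}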
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/rcutree.c  128
-rw-r--r--  kernel/rcutree.h    3
2 files changed, 83 insertions(+), 48 deletions(-)
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 26249abf24dc..9e068d112153 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -570,6 +570,76 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
 }
 
 /*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended. This may be called only from the CPU to whom the rdp
+ * belongs. In addition, the corresponding leaf rcu_node structure's
+ * ->lock must be held by the caller, with irqs disabled.
+ */
+static void
+__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+{
+        /* Did another grace period end? */
+        if (rdp->completed != rnp->completed) {
+
+                /* Advance callbacks. No harm if list empty. */
+                rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+                rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+                rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+                /* Remember that we saw this grace-period completion. */
+                rdp->completed = rnp->completed;
+        }
+}
+
+/*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended. This may be called only from the CPU to whom the rdp
+ * belongs.
+ */
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+        unsigned long flags;
+        struct rcu_node *rnp;
+
+        local_irq_save(flags);
+        rnp = rdp->mynode;
+        if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
+            !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */
+                local_irq_restore(flags);
+                return;
+        }
+        __rcu_process_gp_end(rsp, rnp, rdp);
+        spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Do per-CPU grace-period initialization for running CPU. The caller
+ * must hold the lock of the leaf rcu_node structure corresponding to
+ * this CPU.
+ */
+static void
+rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
+{
+        /* Prior grace period ended, so advance callbacks for current CPU. */
+        __rcu_process_gp_end(rsp, rnp, rdp);
+
+        /*
+         * Because this CPU just now started the new grace period, we know
+         * that all of its callbacks will be covered by this upcoming grace
+         * period, even the ones that were registered arbitrarily recently.
+         * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
+         *
+         * Other CPUs cannot be sure exactly when the grace period started.
+         * Therefore, their recently registered callbacks must pass through
+         * an additional RCU_NEXT_READY stage, so that they will be handled
+         * by the next RCU grace period.
+         */
+        rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+        rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+}
+
+/*
  * Start a new RCU grace period if warranted, re-initializing the hierarchy
  * in preparation for detecting the next grace period. The caller must hold
  * the root node's ->lock, which is released before return. Hard irqs must
@@ -596,26 +666,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
         dyntick_record_completed(rsp, rsp->completed - 1);
         note_new_gpnum(rsp, rdp);
 
-        /*
-         * Because this CPU just now started the new grace period, we know
-         * that all of its callbacks will be covered by this upcoming grace
-         * period, even the ones that were registered arbitrarily recently.
-         * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
-         *
-         * Other CPUs cannot be sure exactly when the grace period started.
-         * Therefore, their recently registered callbacks must pass through
-         * an additional RCU_NEXT_READY stage, so that they will be handled
-         * by the next RCU grace period.
-         */
-        rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-        rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-
         /* Special-case the common single-level case. */
         if (NUM_RCU_NODES == 1) {
                 rcu_preempt_check_blocked_tasks(rnp);
                 rnp->qsmask = rnp->qsmaskinit;
                 rnp->gpnum = rsp->gpnum;
+                rnp->completed = rsp->completed;
                 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
+                rcu_start_gp_per_cpu(rsp, rnp, rdp);
                 spin_unlock_irqrestore(&rnp->lock, flags);
                 return;
         }
@@ -648,6 +706,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
                 rcu_preempt_check_blocked_tasks(rnp);
                 rnp->qsmask = rnp->qsmaskinit;
                 rnp->gpnum = rsp->gpnum;
+                rnp->completed = rsp->completed;
+                if (rnp == rdp->mynode)
+                        rcu_start_gp_per_cpu(rsp, rnp, rdp);
                 spin_unlock(&rnp->lock); /* irqs remain disabled. */
         }
 
@@ -659,34 +720,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 }
 
 /*
- * Advance this CPU's callbacks, but only if the current grace period
- * has ended. This may be called only from the CPU to whom the rdp
- * belongs.
- */
-static void
-rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
-{
-        long completed_snap;
-        unsigned long flags;
-
-        local_irq_save(flags);
-        completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock. */
-
-        /* Did another grace period end? */
-        if (rdp->completed != completed_snap) {
-
-                /* Advance callbacks. No harm if list empty. */
-                rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
-                rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
-                rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-
-                /* Remember that we saw this grace-period completion. */
-                rdp->completed = completed_snap;
-        }
-        local_irq_restore(flags);
-}
-
-/*
  * Clean up after the prior grace period and let rcu_start_gp() start up
  * the next grace period if one is needed. Note that the caller must
  * hold rnp->lock, as required by rcu_start_gp(), which will release it.
@@ -697,7 +730,6 @@ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
         WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
         rsp->completed = rsp->gpnum;
         rsp->signaled = RCU_GP_IDLE;
-        rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
         rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
 }
 
@@ -1539,21 +1571,16 @@ static void __cpuinit
 rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
 {
         unsigned long flags;
-        long lastcomp;
         unsigned long mask;
         struct rcu_data *rdp = rsp->rda[cpu];
         struct rcu_node *rnp = rcu_get_root(rsp);
 
         /* Set up local state, ensuring consistent view of global state. */
         spin_lock_irqsave(&rnp->lock, flags);
-        lastcomp = rsp->completed;
-        rdp->completed = lastcomp;
-        rdp->gpnum = lastcomp;
         rdp->passed_quiesc = 0; /* We could be racing with new GP, */
         rdp->qs_pending = 1;    /* so set up to respond to current GP. */
         rdp->beenonline = 1;    /* We have now been online. */
         rdp->preemptable = preemptable;
-        rdp->passed_quiesc_completed = lastcomp - 1;
         rdp->qlen_last_fqs_check = 0;
         rdp->n_force_qs_snap = rsp->n_force_qs;
         rdp->blimit = blimit;
@@ -1575,6 +1602,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
                 spin_lock(&rnp->lock); /* irqs already disabled. */
                 rnp->qsmaskinit |= mask;
                 mask = rnp->grpmask;
+                if (rnp == rdp->mynode) {
+                        rdp->gpnum = rnp->completed; /* if GP in progress... */
+                        rdp->completed = rnp->completed;
+                        rdp->passed_quiesc_completed = rnp->completed - 1;
+                }
                 spin_unlock(&rnp->lock); /* irqs already disabled. */
                 rnp = rnp->parent;
         } while (rnp != NULL && !(rnp->qsmaskinit & mask));
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 8a4c1650ad8d..c1891c3cae63 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,6 +84,9 @@ struct rcu_node {
         long    gpnum;          /* Current grace period for this node. */
                                 /*  This will either be equal to or one */
                                 /*  behind the root rcu_node's gpnum. */
+        long    completed;      /* Last grace period completed for this node. */
+                                /*  This will either be equal to or one */
+                                /*  behind the root rcu_node's gpnum. */
         unsigned long qsmask;   /* CPUs or groups that need to switch in */
                                 /*  order for current grace period to proceed.*/
                                 /*  In leaf rcu_node, each bit corresponds to */