From 29494be71afe2a16ad04e344306a620d7cc22d06 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 20 Oct 2010 14:13:06 +0800 Subject: rcu,cleanup: simplify the code when cpu is dying When we handle the CPU_DYING notifier, the whole system is stopped except for the current CPU. We therefore need no synchronization with the other CPUs. This allows us to move any orphaned RCU callbacks directly to the list of any online CPU without needing to run them through the global orphan lists. These global orphan lists can therefore be dispensed with. This commit makes thes changes, though currently victimizes CPU 0 @@@. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 81 +++++++++++++++----------------------------------------- 1 file changed, 21 insertions(+), 60 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ccdc04c47981..669d7fe049d1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -67,9 +67,6 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; .gpnum = -300, \ .completed = -300, \ .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ - .orphan_cbs_list = NULL, \ - .orphan_cbs_tail = &structname.orphan_cbs_list, \ - .orphan_qlen = 0, \ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -984,53 +981,31 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) #ifdef CONFIG_HOTPLUG_CPU /* - * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the - * specified flavor of RCU. The callbacks will be adopted by the next - * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever - * comes first. Because this is invoked from the CPU_DYING notifier, - * irqs are already disabled. + * Move a dying CPU's RCU callbacks to online CPU's callback list. + * Synchronization is not required because this function executes + * in stop_machine() context. */ -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +static void rcu_send_cbs_to_online(struct rcu_state *rsp) { int i; + /* current DYING CPU is cleared in the cpu_online_mask */ + int receive_cpu = cpumask_any(cpu_online_mask); struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); if (rdp->nxtlist == NULL) return; /* irqs disabled, so comparison is stable. */ - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ - *rsp->orphan_cbs_tail = rdp->nxtlist; - rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; + + *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; + receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + receive_rdp->qlen += rdp->qlen; + receive_rdp->n_cbs_adopted += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; + rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; - rsp->orphan_qlen += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen = 0; - raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ -} - -/* - * Adopt previously orphaned RCU callbacks. - */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_data *rdp; - - raw_spin_lock_irqsave(&rsp->onofflock, flags); - rdp = this_cpu_ptr(rsp->rda); - if (rsp->orphan_cbs_list == NULL) { - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); - return; - } - *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; - rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; - rdp->qlen += rsp->orphan_qlen; - rdp->n_cbs_adopted += rsp->orphan_qlen; - rsp->orphan_cbs_list = NULL; - rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; - rsp->orphan_qlen = 0; - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } /* @@ -1081,8 +1056,6 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp); - - rcu_adopt_orphan_cbs(rsp); } /* @@ -1100,11 +1073,7 @@ static void rcu_offline_cpu(int cpu) #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) -{ -} - -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +static void rcu_send_cbs_to_online(struct rcu_state *rsp) { } @@ -1702,10 +1671,7 @@ static void _rcu_barrier(struct rcu_state *rsp, * early. */ atomic_set(&rcu_barrier_cpu_count, 1); - preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ - rcu_adopt_orphan_cbs(rsp); on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); - preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ if (atomic_dec_and_test(&rcu_barrier_cpu_count)) complete(&rcu_barrier_completion); wait_for_completion(&rcu_barrier_completion); @@ -1831,18 +1797,13 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_DYING: case CPU_DYING_FROZEN: /* - * preempt_disable() in _rcu_barrier() prevents stop_machine(), - * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" - * returns, all online cpus have queued rcu_barrier_func(). - * The dying CPU clears its cpu_online_mask bit and - * moves all of its RCU callbacks to ->orphan_cbs_list - * in the context of stop_machine(), so subsequent calls - * to _rcu_barrier() will adopt these callbacks and only - * then queue rcu_barrier_func() on all remaining CPUs. + * The whole machine is "stopped" except this cpu, so we can + * touch any data without introducing corruption. And we send + * the callbacks to an attribute chosen online cpu. */ - rcu_send_cbs_to_orphanage(&rcu_bh_state); - rcu_send_cbs_to_orphanage(&rcu_sched_state); - rcu_preempt_send_cbs_to_orphanage(); + rcu_send_cbs_to_online(&rcu_bh_state); + rcu_send_cbs_to_online(&rcu_sched_state); + rcu_preempt_send_cbs_to_online(); break; case CPU_DEAD: case CPU_DEAD_FROZEN: -- cgit v1.2.2 From 2d999e03b7c8305b4385dd20992e4ed3e827177b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Oct 2010 12:06:18 -0700 Subject: rcu: update documentation/comments for Lai's adoption patch Lai's RCU-callback immediate-adoption patch changes the RCU tracing output, so update tracing.txt. Also update a few comments to clarify the synchronization design. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 669d7fe049d1..120820ffc657 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1668,7 +1668,9 @@ static void _rcu_barrier(struct rcu_state *rsp, * decrement rcu_barrier_cpu_count -- otherwise the first CPU * might complete its grace period before all of the other CPUs * did their increment, causing this function to return too - * early. + * early. Note that on_each_cpu() disables irqs, which prevents + * any CPUs from coming online or going offline until each online + * CPU has queued its RCU-barrier callback. */ atomic_set(&rcu_barrier_cpu_count, 1); on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); @@ -1797,9 +1799,9 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_DYING: case CPU_DYING_FROZEN: /* - * The whole machine is "stopped" except this cpu, so we can - * touch any data without introducing corruption. And we send - * the callbacks to an attribute chosen online cpu. + * The whole machine is "stopped" except this CPU, so we can + * touch any data without introducing corruption. We send the + * dying CPU's callbacks to an arbitrarily chosen online CPU. */ rcu_send_cbs_to_online(&rcu_bh_state); rcu_send_cbs_to_online(&rcu_sched_state); -- cgit v1.2.2 From 909ea96468096b07fbb41aaf69be060d92bd9271 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 8 Dec 2010 16:22:55 +0100 Subject: core: Replace __get_cpu_var with __this_cpu_read if not used for an address. __get_cpu_var() can be replaced with this_cpu_read and will then use a single read instruction with implied address calculation to access the correct per cpu instance. However, the address of a per cpu variable passed to __this_cpu_read() cannot be determined (since it's an implied address conversion through segment prefixes). Therefore apply this only to uses of __get_cpu_var where the address of the variable is not used. Cc: Pekka Enberg Cc: Hugh Dickins Cc: Thomas Gleixner Acked-by: H. Peter Anvin Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- kernel/rcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ccdc04c47981..aeebf772d6a2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -367,8 +367,8 @@ void rcu_irq_exit(void) WARN_ON_ONCE(rdtp->dynticks & 0x1); /* If the interrupt queued a callback, get out of dyntick mode. */ - if (__get_cpu_var(rcu_sched_data).nxtlist || - __get_cpu_var(rcu_bh_data).nxtlist) + if (__this_cpu_read(rcu_sched_data.nxtlist) || + __this_cpu_read(rcu_bh_data.nxtlist)) set_need_resched(); } -- cgit v1.2.2 From 20377f32dcb77941d450728da18cce5b1a7faec5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 10 Dec 2010 22:11:10 +0100 Subject: rcu: Stop chasing QS if another CPU did it for us When a CPU is idle and others CPUs handled its extended quiescent state to complete grace periods on its behalf, it will catch up with completed grace periods numbers when it wakes up. But at this point there might be no more grace period to complete, but still the woken CPU always keeps its stale qs_pending value and will then continue to chase quiescent states even if its not needed anymore. This results in clusters of spurious softirqs until a new real grace period is started. Because if we continue to chase quiescent states but we have completed every grace periods, rcu_report_qs_rdp() is puzzled and makes that state run into infinite loops. As suggested by Lai Jiangshan, just reset qs_pending if someone completed every grace periods on our behalf. Suggested-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Steven Rostedt Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 120820ffc657..916f42b39f1e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -678,6 +678,14 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; + + /* + * If another CPU handled our extended quiescent states and + * we have no more grace period to complete yet, then stop + * chasing quiescent states. + */ + if (rdp->completed == rnp->gpnum) + rdp->qs_pending = 0; } } -- cgit v1.2.2 From 5ff8e6f0535fe730e921ca347bc38dcb9e01791a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 10 Dec 2010 22:11:11 +0100 Subject: rcu: Keep gpnum and completed fields synchronized When a CPU that was in an extended quiescent state wakes up and catches up with grace periods that remote CPUs completed on its behalf, we update the completed field but not the gpnum that keeps a stale value of a backward grace period ID. Later, note_new_gpnum() will interpret the shift between the local CPU and the node grace period ID as some new grace period to handle and will then start to hunt quiescent state. But if every grace periods have already been completed, this interpretation becomes broken. And we'll be stuck in clusters of spurious softirqs because rcu_report_qs_rdp() will make this broken state run into infinite loop. The solution, as suggested by Lai Jiangshan, is to ensure that the gpnum and completed fields are well synchronized when we catch up with completed grace periods on their behalf by other cpus. This way we won't start noting spurious new grace periods. Suggested-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Steven Rostedt --- kernel/rcutree.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 916f42b39f1e..8105271fc10e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -679,6 +679,15 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; + /* + * If we were in an extended quiescent state, we may have + * missed some grace periods that others CPUs took care on + * our behalf. Catch up with this state to avoid noting + * spurious new grace periods. + */ + if (rdp->completed > rdp->gpnum) + rdp->gpnum = rdp->completed; + /* * If another CPU handled our extended quiescent states and * we have no more grace period to complete yet, then stop -- cgit v1.2.2 From 121dfc4b3eba9e2f3c42d35205a3510cc65b9931 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Dec 2010 15:02:47 -0800 Subject: rcu: fine-tune grace-period begin/end checks Use the CPU's bit in rnp->qsmask to determine whether or not the CPU should try to report a quiescent state. Handle overflow in the check for rdp->gpnum having fallen behind. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8105271fc10e..c39ec5b4ae82 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -617,9 +617,17 @@ static void __init check_cpu_stall_init(void) static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { if (rdp->gpnum != rnp->gpnum) { - rdp->qs_pending = 1; - rdp->passed_quiesc = 0; + /* + * If the current grace period is waiting for this CPU, + * set up to detect a quiescent state, otherwise don't + * go looking for one. + */ rdp->gpnum = rnp->gpnum; + if (rnp->qsmask & rdp->grpmask) { + rdp->qs_pending = 1; + rdp->passed_quiesc = 0; + } else + rdp->qs_pending = 0; } } @@ -681,19 +689,20 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* * If we were in an extended quiescent state, we may have - * missed some grace periods that others CPUs took care on + * missed some grace periods that others CPUs handled on * our behalf. Catch up with this state to avoid noting - * spurious new grace periods. + * spurious new grace periods. If another grace period + * has started, then rnp->gpnum will have advanced, so + * we will detect this later on. */ - if (rdp->completed > rdp->gpnum) + if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) rdp->gpnum = rdp->completed; /* - * If another CPU handled our extended quiescent states and - * we have no more grace period to complete yet, then stop - * chasing quiescent states. + * If RCU does not need a quiescent state from this CPU, + * then make sure that this CPU doesn't go looking for one. */ - if (rdp->completed == rnp->gpnum) + if ((rnp->qsmask & rdp->grpmask) == 0) rdp->qs_pending = 0; } } -- cgit v1.2.2 From 0209f6490b030f35349a2bb71294f3fd75b0f36d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Dec 2010 16:07:52 -0800 Subject: rcu: limit rcu_node leaf-level fanout Some recent benchmarks have indicated possible lock contention on the leaf-level rcu_node locks. This commit therefore limits the number of CPUs per leaf-level rcu_node structure to 16, in other words, there can be at most 16 rcu_data structures fanning into a given rcu_node structure. Prior to this, the limit was 32 on 32-bit systems and 64 on 64-bit systems. Note that the fanout of non-leaf rcu_node structures is unchanged. The organization of accesses to the rcu_node tree is such that references to non-leaf rcu_node structures are much less frequent than to the leaf structures. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index c39ec5b4ae82..01c8ad33c510 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1869,8 +1869,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) + for (i = NUM_RCU_LVLS - 1; i > 0; i--) rsp->levelspread[i] = CONFIG_RCU_FANOUT; + rsp->levelspread[0] = RCU_FANOUT_LEAF; } #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ static void __init rcu_init_levelspread(struct rcu_state *rsp) -- cgit v1.2.2 From b52573d2796274f7f31cfeff7185c320adcd4f12 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Dec 2010 17:36:02 -0800 Subject: rcu: reduce __call_rcu()-induced contention on rcu_node structures When the current __call_rcu() function was written, the expedited APIs did not exist. The __call_rcu() implementation therefore went to great lengths to detect the end of old grace periods and to start new ones, all in the name of reducing grace-period latency. Now the expedited APIs do exist, and the usage of __call_rcu() has increased considerably. This commit therefore causes __call_rcu() to avoid worrying about grace periods unless there are a large number of RCU callbacks stacked up on the current CPU. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) (limited to 'kernel/rcutree.c') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 01c8ad33c510..d0ddfea6579d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1435,22 +1435,11 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), */ local_irq_save(flags); rdp = this_cpu_ptr(rsp->rda); - rcu_process_gp_end(rsp, rdp); - check_for_new_grace_period(rsp, rdp); /* Add the callback to our list. */ *rdp->nxttail[RCU_NEXT_TAIL] = head; rdp->nxttail[RCU_NEXT_TAIL] = &head->next; - /* Start a new grace period if one not already started. */ - if (!rcu_gp_in_progress(rsp)) { - unsigned long nestflag; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); - rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ - } - /* * Force the grace period if too many callbacks or too long waiting. * Enforce hysteresis, and don't invoke force_quiescent_state() @@ -1459,12 +1448,27 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * is the only one waiting for a grace period to complete. */ if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { - rdp->blimit = LONG_MAX; - if (rsp->n_force_qs == rdp->n_force_qs_snap && - *rdp->nxttail[RCU_DONE_TAIL] != head) - force_quiescent_state(rsp, 0); - rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->qlen_last_fqs_check = rdp->qlen; + + /* Are we ignoring a completed grace period? */ + rcu_process_gp_end(rsp, rdp); + check_for_new_grace_period(rsp, rdp); + + /* Start a new grace period if one not already started. */ + if (!rcu_gp_in_progress(rsp)) { + unsigned long nestflag; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + raw_spin_lock_irqsave(&rnp_root->lock, nestflag); + rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ + } else { + /* Give the grace period a kick. */ + rdp->blimit = LONG_MAX; + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; + } } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) force_quiescent_state(rsp, 1); local_irq_restore(flags); -- cgit v1.2.2