aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author Paul E. McKenney <paulmck@linux.vnet.ibm.com> 2012-10-07 11:36:12 -0400
committer Paul E. McKenney <paulmck@linux.vnet.ibm.com> 2012-10-08 12:06:38 -0400
commit a4fbe35a124526e6759be07bd9c7ea796ba1e00d (patch)
tree cb5c5a1608fcff588ed9a204ea67d5891adb18fb /kernel
parent cb349ca95407cbc11424d5e9fc7c8e700709041b (diff)
rcu: Grace-period initialization excludes only RCU notifier
Kirill noted the following deadlock cycle on shutdown involving padata:

> With commit 755609a9087fa983f567dc5452b2fa7b089b591f I've got deadlock on
> poweroff.
>
> I guess it happens because of race for cpu_hotplug.lock:
>
> CPU A                                   CPU B
> disable_nonboot_cpus()
> _cpu_down()
> cpu_hotplug_begin()
>  mutex_lock(&cpu_hotplug.lock);
> __cpu_notify()
> padata_cpu_callback()
> __padata_remove_cpu()
> padata_replace()
> synchronize_rcu()
>                                         rcu_gp_kthread()
>                                         get_online_cpus();
>                                         mutex_lock(&cpu_hotplug.lock);

It would of course be good to eliminate grace-period delays from
CPU-hotplug notifiers, but that is a separate issue. Deadlock is
not an appropriate diagnostic for excessive CPU-hotplug latency.

Fortunately, grace-period initialization does not actually need to
exclude all of the CPU-hotplug operation, but rather only RCU's own
CPU_UP_PREPARE and CPU_DEAD CPU-hotplug notifiers. This commit therefore
introduces a new per-rcu_state onoff_mutex that provides the required
concurrency control in place of the get_online_cpus() that was previously
in rcu_gp_init().

Reported-by: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Kirill A. Shutemov <kirill@shutemov.name>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/rcutree.c | 21
-rw-r--r-- kernel/rcutree.h | 6
2 files changed, 16 insertions, 11 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 4fb2376ddf06..74df86bd9204 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -74,6 +74,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 74 .orphan_nxttail = &sname##_state.orphan_nxtlist, \
75 .orphan_donetail = &sname##_state.orphan_donelist, \ 75 .orphan_donetail = &sname##_state.orphan_donelist, \
76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 76 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
77 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
77 .name = #sname, \ 78 .name = #sname, \
78} 79}
79 80
@@ -1197,7 +1198,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1197 raw_spin_unlock_irq(&rnp->lock); 1198 raw_spin_unlock_irq(&rnp->lock);
1198 1199
1199 /* Exclude any concurrent CPU-hotplug operations. */ 1200 /* Exclude any concurrent CPU-hotplug operations. */
1200 get_online_cpus(); 1201 mutex_lock(&rsp->onoff_mutex);
1201 1202
1202 /* 1203 /*
1203 * Set the quiescent-state-needed bits in all the rcu_node 1204 * Set the quiescent-state-needed bits in all the rcu_node
@@ -1234,7 +1235,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1234 cond_resched(); 1235 cond_resched();
1235 } 1236 }
1236 1237
1237 put_online_cpus(); 1238 mutex_unlock(&rsp->onoff_mutex);
1238 return 1; 1239 return 1;
1239} 1240}
1240 1241
@@ -1700,6 +1701,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1700 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ 1701 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
1701 1702
1702 /* Exclude any attempts to start a new grace period. */ 1703 /* Exclude any attempts to start a new grace period. */
1704 mutex_lock(&rsp->onoff_mutex);
1703 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1705 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1704 1706
1705 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 1707 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
@@ -1744,6 +1746,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1744 init_callback_list(rdp); 1746 init_callback_list(rdp);
1745 /* Disallow further callbacks on this CPU. */ 1747 /* Disallow further callbacks on this CPU. */
1746 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 1748 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
1749 mutex_unlock(&rsp->onoff_mutex);
1747} 1750}
1748 1751
1749#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1752#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -2648,6 +2651,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2648 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2651 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2649 struct rcu_node *rnp = rcu_get_root(rsp); 2652 struct rcu_node *rnp = rcu_get_root(rsp);
2650 2653
2654 /* Exclude new grace periods. */
2655 mutex_lock(&rsp->onoff_mutex);
2656
2651 /* Set up local state, ensuring consistent view of global state. */ 2657 /* Set up local state, ensuring consistent view of global state. */
2652 raw_spin_lock_irqsave(&rnp->lock, flags); 2658 raw_spin_lock_irqsave(&rnp->lock, flags);
2653 rdp->beenonline = 1; /* We have now been online. */ 2659 rdp->beenonline = 1; /* We have now been online. */
@@ -2662,14 +2668,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2662 rcu_prepare_for_idle_init(cpu); 2668 rcu_prepare_for_idle_init(cpu);
2663 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2669 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
2664 2670
2665 /*
2666 * A new grace period might start here. If so, we won't be part
2667 * of it, but that is OK, as we are currently in a quiescent state.
2668 */
2669
2670 /* Exclude any attempts to start a new GP on large systems. */
2671 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
2672
2673 /* Add CPU to rcu_node bitmasks. */ 2671 /* Add CPU to rcu_node bitmasks. */
2674 rnp = rdp->mynode; 2672 rnp = rdp->mynode;
2675 mask = rdp->grpmask; 2673 mask = rdp->grpmask;
@@ -2693,8 +2691,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2693 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 2691 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
2694 rnp = rnp->parent; 2692 rnp = rnp->parent;
2695 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 2693 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
2694 local_irq_restore(flags);
2696 2695
2697 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2696 mutex_unlock(&rsp->onoff_mutex);
2698} 2697}
2699 2698
2700static void __cpuinit rcu_prepare_cpu(int cpu) 2699static void __cpuinit rcu_prepare_cpu(int cpu)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5faf05d68326..a240f032848e 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -394,11 +394,17 @@ struct rcu_state {
394 struct rcu_head **orphan_donetail; /* Tail of above. */ 394 struct rcu_head **orphan_donetail; /* Tail of above. */
395 long qlen_lazy; /* Number of lazy callbacks. */ 395 long qlen_lazy; /* Number of lazy callbacks. */
396 long qlen; /* Total number of callbacks. */ 396 long qlen; /* Total number of callbacks. */
397 /* End of fields guarded by onofflock. */
398
399 struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */
400
397 struct mutex barrier_mutex; /* Guards barrier fields. */ 401 struct mutex barrier_mutex; /* Guards barrier fields. */
398 atomic_t barrier_cpu_count; /* # CPUs waiting on. */ 402 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
399 struct completion barrier_completion; /* Wake at barrier end. */ 403 struct completion barrier_completion; /* Wake at barrier end. */
400 unsigned long n_barrier_done; /* ++ at start and end of */ 404 unsigned long n_barrier_done; /* ++ at start and end of */
401 /* _rcu_barrier(). */ 405 /* _rcu_barrier(). */
406 /* End of fields guarded by barrier_mutex. */
407
402 unsigned long jiffies_force_qs; /* Time at which to invoke */ 408 unsigned long jiffies_force_qs; /* Time at which to invoke */
403 /* force_quiescent_state(). */ 409 /* force_quiescent_state(). */
404 unsigned long n_force_qs; /* Number of calls to */ 410 unsigned long n_force_qs; /* Number of calls to */