diff options
author | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2012-10-07 11:36:12 -0400 |
---|---|---|
committer | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2012-10-08 12:06:38 -0400 |
commit | a4fbe35a124526e6759be07bd9c7ea796ba1e00d (patch) | |
tree | cb5c5a1608fcff588ed9a204ea67d5891adb18fb /kernel | |
parent | cb349ca95407cbc11424d5e9fc7c8e700709041b (diff) |
rcu: Grace-period initialization excludes only RCU notifier
Kirill noted the following deadlock cycle on shutdown involving padata:
> With commit 755609a9087fa983f567dc5452b2fa7b089b591f I've got deadlock on
> poweroff.
>
> I guess it happens because of race for cpu_hotplug.lock:
>
> CPU A                                   CPU B
> disable_nonboot_cpus()
> _cpu_down()
> cpu_hotplug_begin()
>  mutex_lock(&cpu_hotplug.lock);
> __cpu_notify()
> padata_cpu_callback()
> __padata_remove_cpu()
> padata_replace()
> synchronize_rcu()
>                                         rcu_gp_kthread()
>                                         get_online_cpus();
>                                         mutex_lock(&cpu_hotplug.lock);
It would of course be good to eliminate grace-period delays from
CPU-hotplug notifiers, but that is a separate issue. Deadlock is
not an appropriate diagnostic for excessive CPU-hotplug latency.
Fortunately, grace-period initialization does not actually need to
exclude all of the CPU-hotplug operation, but rather only RCU's own
CPU_UP_PREPARE and CPU_DEAD CPU-hotplug notifiers. This commit therefore
introduces a new per-rcu_state onoff_mutex that provides the required
concurrency control in place of the get_online_cpus() that was previously
in rcu_gp_init().
Reported-by: "Kirill A. Shutemov" <kirill@shutemov.name>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Kirill A. Shutemov <kirill@shutemov.name>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/rcutree.c | 21 | ||||
-rw-r--r-- | kernel/rcutree.h | 6 |
2 files changed, 16 insertions, 11 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4fb2376ddf06..74df86bd9204 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
@@ -74,6 +74,7 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | |||
74 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | 74 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ |
75 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 75 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
77 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ | ||
77 | .name = #sname, \ | 78 | .name = #sname, \ |
78 | } | 79 | } |
79 | 80 | ||
@@ -1197,7 +1198,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1197 | raw_spin_unlock_irq(&rnp->lock); | 1198 | raw_spin_unlock_irq(&rnp->lock); |
1198 | 1199 | ||
1199 | /* Exclude any concurrent CPU-hotplug operations. */ | 1200 | /* Exclude any concurrent CPU-hotplug operations. */ |
1200 | get_online_cpus(); | 1201 | mutex_lock(&rsp->onoff_mutex); |
1201 | 1202 | ||
1202 | /* | 1203 | /* |
1203 | * Set the quiescent-state-needed bits in all the rcu_node | 1204 | * Set the quiescent-state-needed bits in all the rcu_node |
@@ -1234,7 +1235,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1234 | cond_resched(); | 1235 | cond_resched(); |
1235 | } | 1236 | } |
1236 | 1237 | ||
1237 | put_online_cpus(); | 1238 | mutex_unlock(&rsp->onoff_mutex); |
1238 | return 1; | 1239 | return 1; |
1239 | } | 1240 | } |
1240 | 1241 | ||
@@ -1700,6 +1701,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1700 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ | 1701 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ |
1701 | 1702 | ||
1702 | /* Exclude any attempts to start a new grace period. */ | 1703 | /* Exclude any attempts to start a new grace period. */ |
1704 | mutex_lock(&rsp->onoff_mutex); | ||
1703 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 1705 | raw_spin_lock_irqsave(&rsp->onofflock, flags); |
1704 | 1706 | ||
1705 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | 1707 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ |
@@ -1744,6 +1746,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
1744 | init_callback_list(rdp); | 1746 | init_callback_list(rdp); |
1745 | /* Disallow further callbacks on this CPU. */ | 1747 | /* Disallow further callbacks on this CPU. */ |
1746 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | 1748 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; |
1749 | mutex_unlock(&rsp->onoff_mutex); | ||
1747 | } | 1750 | } |
1748 | 1751 | ||
1749 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1752 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
@@ -2648,6 +2651,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2648 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | 2651 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); |
2649 | struct rcu_node *rnp = rcu_get_root(rsp); | 2652 | struct rcu_node *rnp = rcu_get_root(rsp); |
2650 | 2653 | ||
2654 | /* Exclude new grace periods. */ | ||
2655 | mutex_lock(&rsp->onoff_mutex); | ||
2656 | |||
2651 | /* Set up local state, ensuring consistent view of global state. */ | 2657 | /* Set up local state, ensuring consistent view of global state. */ |
2652 | raw_spin_lock_irqsave(&rnp->lock, flags); | 2658 | raw_spin_lock_irqsave(&rnp->lock, flags); |
2653 | rdp->beenonline = 1; /* We have now been online. */ | 2659 | rdp->beenonline = 1; /* We have now been online. */ |
@@ -2662,14 +2668,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2662 | rcu_prepare_for_idle_init(cpu); | 2668 | rcu_prepare_for_idle_init(cpu); |
2663 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 2669 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ |
2664 | 2670 | ||
2665 | /* | ||
2666 | * A new grace period might start here. If so, we won't be part | ||
2667 | * of it, but that is OK, as we are currently in a quiescent state. | ||
2668 | */ | ||
2669 | |||
2670 | /* Exclude any attempts to start a new GP on large systems. */ | ||
2671 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ | ||
2672 | |||
2673 | /* Add CPU to rcu_node bitmasks. */ | 2671 | /* Add CPU to rcu_node bitmasks. */ |
2674 | rnp = rdp->mynode; | 2672 | rnp = rdp->mynode; |
2675 | mask = rdp->grpmask; | 2673 | mask = rdp->grpmask; |
@@ -2693,8 +2691,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
2693 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ | 2691 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ |
2694 | rnp = rnp->parent; | 2692 | rnp = rnp->parent; |
2695 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); | 2693 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); |
2694 | local_irq_restore(flags); | ||
2696 | 2695 | ||
2697 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 2696 | mutex_unlock(&rsp->onoff_mutex); |
2698 | } | 2697 | } |
2699 | 2698 | ||
2700 | static void __cpuinit rcu_prepare_cpu(int cpu) | 2699 | static void __cpuinit rcu_prepare_cpu(int cpu) |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 5faf05d68326..a240f032848e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
@@ -394,11 +394,17 @@ struct rcu_state { | |||
394 | struct rcu_head **orphan_donetail; /* Tail of above. */ | 394 | struct rcu_head **orphan_donetail; /* Tail of above. */ |
395 | long qlen_lazy; /* Number of lazy callbacks. */ | 395 | long qlen_lazy; /* Number of lazy callbacks. */ |
396 | long qlen; /* Total number of callbacks. */ | 396 | long qlen; /* Total number of callbacks. */ |
397 | /* End of fields guarded by onofflock. */ | ||
398 | |||
399 | struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ | ||
400 | |||
397 | struct mutex barrier_mutex; /* Guards barrier fields. */ | 401 | struct mutex barrier_mutex; /* Guards barrier fields. */ |
398 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ | 402 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ |
399 | struct completion barrier_completion; /* Wake at barrier end. */ | 403 | struct completion barrier_completion; /* Wake at barrier end. */ |
400 | unsigned long n_barrier_done; /* ++ at start and end of */ | 404 | unsigned long n_barrier_done; /* ++ at start and end of */ |
401 | /* _rcu_barrier(). */ | 405 | /* _rcu_barrier(). */ |
406 | /* End of fields guarded by barrier_mutex. */ | ||
407 | |||
402 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 408 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
403 | /* force_quiescent_state(). */ | 409 | /* force_quiescent_state(). */ |
404 | unsigned long n_force_qs; /* Number of calls to */ | 410 | unsigned long n_force_qs; /* Number of calls to */ |