aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBen Greear <greearb@candelatech.com>2013-06-06 17:29:49 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2013-06-10 20:46:57 -0400
commit34376a50fb1fa095b9d0636fa41ed2e73125f214 (patch)
treed6bfe7027fd066762dcc0e66820f707ce18e1371
parent1b79821fe7855c34030bc2f177660874e0a0ab62 (diff)
Fix lockup related to stop_machine being stuck in __do_softirq.
The stop machine logic can lock up if all but one of the migration threads make it through the disable-irq step and the one remaining thread gets stuck in __do_softirq. The reason __do_softirq can hang is that it has a bail-out based on jiffies timeout, but in the lockup case, jiffies itself is not incremented. To work around this, re-add the max_restart counter in __do_irq and stop processing irqs after 10 restarts. Thanks to Tejun Heo and Rusty Russell and others for helping me track this down. This was introduced in 3.9 by commit c10d73671ad3 ("softirq: reduce latencies"). It may be worth looking into ath9k to see if it has issues with its irq handler at a later date. The hang stack traces look something like this: ------------[ cut here ]------------ WARNING: at kernel/watchdog.c:245 watchdog_overflow_callback+0x9c/0xa7() Watchdog detected hard LOCKUP on cpu 2 Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen lockd sunrpc] Pid: 23, comm: migration/2 Tainted: G C 3.9.4+ #11 Call Trace: <NMI> warn_slowpath_common+0x85/0x9f warn_slowpath_fmt+0x46/0x48 watchdog_overflow_callback+0x9c/0xa7 __perf_event_overflow+0x137/0x1cb perf_event_overflow+0x14/0x16 intel_pmu_handle_irq+0x2dc/0x359 perf_event_nmi_handler+0x19/0x1b nmi_handle+0x7f/0xc2 do_nmi+0xbc/0x304 end_repeat_nmi+0x1e/0x2e <<EOE>> cpu_stopper_thread+0xae/0x162 smpboot_thread_fn+0x258/0x260 kthread+0xc7/0xcf ret_from_fork+0x7c/0xb0 ---[ end trace 4947dfa9b0a4cec3 ]--- BUG: soft lockup - CPU#1 stuck for 22s! [migration/1:17] Modules linked in: ath9k ath9k_common ath9k_hw ath mac80211 cfg80211 nfsv4 auth_rpcgss nfs fscache nf_nat_ipv4 nf_nat veth 8021q garp stp mrp llc pktgen lockd sunrpc] irq event stamp: 835637905 hardirqs last enabled at (835637904): __do_softirq+0x9f/0x257 hardirqs last disabled at (835637905): apic_timer_interrupt+0x6d/0x80 softirqs last enabled at (5654720): __do_softirq+0x1ff/0x257 softirqs last disabled at (5654725): irq_exit+0x5f/0xbb CPU 1 Pid: 17, comm: migration/1 Tainted: G WC 3.9.4+ #11 To be filled by O.E.M. To be filled by O.E.M./To be filled by O.E.M. RIP: tasklet_hi_action+0xf0/0xf0 Process migration/1 Call Trace: <IRQ> __do_softirq+0x117/0x257 irq_exit+0x5f/0xbb smp_apic_timer_interrupt+0x8a/0x98 apic_timer_interrupt+0x72/0x80 <EOI> printk+0x4d/0x4f stop_machine_cpu_stop+0x22c/0x274 cpu_stopper_thread+0xae/0x162 smpboot_thread_fn+0x258/0x260 kthread+0xc7/0xcf ret_from_fork+0x7c/0xb0 Signed-off-by: Ben Greear <greearb@candelatech.com> Acked-by: Tejun Heo <tj@kernel.org> Acked-by: Pekka Riikonen <priikone@iki.fi> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: stable@kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--kernel/softirq.c13
1 files changed, 10 insertions, 3 deletions
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b5197dcb0dad..3d6833f125d3 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,8 +195,12 @@ void local_bh_enable_ip(unsigned long ip)
195EXPORT_SYMBOL(local_bh_enable_ip); 195EXPORT_SYMBOL(local_bh_enable_ip);
196 196
197/* 197/*
198 * We restart softirq processing for at most 2 ms, 198 * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
199 * and if need_resched() is not set. 199 * but break the loop if need_resched() is set or after 2 ms.
200 * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in
201 * certain cases, such as stop_machine(), jiffies may cease to
202 * increment and so we need the MAX_SOFTIRQ_RESTART limit as
203 * well to make sure we eventually return from this method.
200 * 204 *
201 * These limits have been established via experimentation. 205 * These limits have been established via experimentation.
202 * The two things to balance is latency against fairness - 206 * The two things to balance is latency against fairness -
@@ -204,6 +208,7 @@ EXPORT_SYMBOL(local_bh_enable_ip);
204 * should not be able to lock up the box. 208 * should not be able to lock up the box.
205 */ 209 */
206#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2) 210#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
211#define MAX_SOFTIRQ_RESTART 10
207 212
208asmlinkage void __do_softirq(void) 213asmlinkage void __do_softirq(void)
209{ 214{
@@ -212,6 +217,7 @@ asmlinkage void __do_softirq(void)
212 unsigned long end = jiffies + MAX_SOFTIRQ_TIME; 217 unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
213 int cpu; 218 int cpu;
214 unsigned long old_flags = current->flags; 219 unsigned long old_flags = current->flags;
220 int max_restart = MAX_SOFTIRQ_RESTART;
215 221
216 /* 222 /*
217 * Mask out PF_MEMALLOC s current task context is borrowed for the 223 * Mask out PF_MEMALLOC s current task context is borrowed for the
@@ -265,7 +271,8 @@ restart:
265 271
266 pending = local_softirq_pending(); 272 pending = local_softirq_pending();
267 if (pending) { 273 if (pending) {
268 if (time_before(jiffies, end) && !need_resched()) 274 if (time_before(jiffies, end) && !need_resched() &&
275 --max_restart)
269 goto restart; 276 goto restart;
270 277
271 wakeup_softirqd(); 278 wakeup_softirqd();