summaryrefslogtreecommitdiffstats
path: root/kernel/rcu/tree.c
diff options
context:
space:
mode:
authorPaul E. McKenney <paulmck@linux.vnet.ibm.com>2013-10-04 17:33:34 -0400
committerPaul E. McKenney <paulmck@linux.vnet.ibm.com>2013-12-03 13:10:18 -0500
commit96d3fd0d315a949e30adc80f086031c5cdf070d1 (patch)
tree0fe7013d59b4d69a91bf031c0a53e8d279413e4a /kernel/rcu/tree.c
parent78e4bc34e5d966cfd95f1238565afc399d56225c (diff)
rcu: Break call_rcu() deadlock involving scheduler and perf
Dave Jones got the following lockdep splat: > ====================================================== > [ INFO: possible circular locking dependency detected ] > 3.12.0-rc3+ #92 Not tainted > ------------------------------------------------------- > trinity-child2/15191 is trying to acquire lock: > (&rdp->nocb_wq){......}, at: [<ffffffff8108ff43>] __wake_up+0x23/0x50 > > but task is already holding lock: > (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230 > > which lock already depends on the new lock. > > > the existing dependency chain (in reverse order) is: > > -> #3 (&ctx->lock){-.-...}: > [<ffffffff810cc243>] lock_acquire+0x93/0x200 > [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80 > [<ffffffff811500ff>] __perf_event_task_sched_out+0x2df/0x5e0 > [<ffffffff81091b83>] perf_event_task_sched_out+0x93/0xa0 > [<ffffffff81732052>] __schedule+0x1d2/0xa20 > [<ffffffff81732f30>] preempt_schedule_irq+0x50/0xb0 > [<ffffffff817352b6>] retint_kernel+0x26/0x30 > [<ffffffff813eed04>] tty_flip_buffer_push+0x34/0x50 > [<ffffffff813f0504>] pty_write+0x54/0x60 > [<ffffffff813e900d>] n_tty_write+0x32d/0x4e0 > [<ffffffff813e5838>] tty_write+0x158/0x2d0 > [<ffffffff811c4850>] vfs_write+0xc0/0x1f0 > [<ffffffff811c52cc>] SyS_write+0x4c/0xa0 > [<ffffffff8173d4e4>] tracesys+0xdd/0xe2 > > -> #2 (&rq->lock){-.-.-.}: > [<ffffffff810cc243>] lock_acquire+0x93/0x200 > [<ffffffff81733f90>] _raw_spin_lock+0x40/0x80 > [<ffffffff810980b2>] wake_up_new_task+0xc2/0x2e0 > [<ffffffff81054336>] do_fork+0x126/0x460 > [<ffffffff81054696>] kernel_thread+0x26/0x30 > [<ffffffff8171ff93>] rest_init+0x23/0x140 > [<ffffffff81ee1e4b>] start_kernel+0x3f6/0x403 > [<ffffffff81ee1571>] x86_64_start_reservations+0x2a/0x2c > [<ffffffff81ee1664>] x86_64_start_kernel+0xf1/0xf4 > > -> #1 (&p->pi_lock){-.-.-.}: > [<ffffffff810cc243>] lock_acquire+0x93/0x200 > [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90 > [<ffffffff810979d1>] try_to_wake_up+0x31/0x350 > [<ffffffff81097d62>] default_wake_function+0x12/0x20 > [<ffffffff81084af8>] autoremove_wake_function+0x18/0x40 > [<ffffffff8108ea38>] __wake_up_common+0x58/0x90 > [<ffffffff8108ff59>] __wake_up+0x39/0x50 > [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0 > [<ffffffff81111450>] __call_rcu+0x140/0x820 > [<ffffffff81111b8d>] call_rcu+0x1d/0x20 > [<ffffffff81093697>] cpu_attach_domain+0x287/0x360 > [<ffffffff81099d7e>] build_sched_domains+0xe5e/0x10a0 > [<ffffffff81efa7fc>] sched_init_smp+0x3b7/0x47a > [<ffffffff81ee1f4e>] kernel_init_freeable+0xf6/0x202 > [<ffffffff817200be>] kernel_init+0xe/0x190 > [<ffffffff8173d22c>] ret_from_fork+0x7c/0xb0 > > -> #0 (&rdp->nocb_wq){......}: > [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0 > [<ffffffff810cc243>] lock_acquire+0x93/0x200 > [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90 > [<ffffffff8108ff43>] __wake_up+0x23/0x50 > [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0 > [<ffffffff81111450>] __call_rcu+0x140/0x820 > [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30 > [<ffffffff81149abf>] put_ctx+0x4f/0x70 > [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230 > [<ffffffff81056b8d>] do_exit+0x30d/0xcc0 > [<ffffffff8105893c>] do_group_exit+0x4c/0xc0 > [<ffffffff810589c4>] SyS_exit_group+0x14/0x20 > [<ffffffff8173d4e4>] tracesys+0xdd/0xe2 > > other info that might help us debug this: > > Chain exists of: > &rdp->nocb_wq --> &rq->lock --> &ctx->lock > > Possible unsafe locking scenario: > > CPU0 CPU1 > ---- ---- > lock(&ctx->lock); > lock(&rq->lock); > lock(&ctx->lock); > lock(&rdp->nocb_wq); > > *** DEADLOCK *** > > 1 lock held by trinity-child2/15191: > #0: (&ctx->lock){-.-...}, at: [<ffffffff81154c19>] perf_event_exit_task+0x109/0x230 > > stack backtrace: > CPU: 2 PID: 15191 Comm: trinity-child2 Not tainted 3.12.0-rc3+ #92 > ffffffff82565b70 ffff880070c2dbf8 ffffffff8172a363 ffffffff824edf40 > ffff880070c2dc38 ffffffff81726741 ffff880070c2dc90 ffff88022383b1c0 > ffff88022383aac0 0000000000000000 ffff88022383b188 ffff88022383b1c0 > Call Trace: > [<ffffffff8172a363>] dump_stack+0x4e/0x82 > [<ffffffff81726741>] print_circular_bug+0x200/0x20f > [<ffffffff810cb7ca>] __lock_acquire+0x191a/0x1be0 > [<ffffffff810c6439>] ? get_lock_stats+0x19/0x60 > [<ffffffff8100b2f4>] ? native_sched_clock+0x24/0x80 > [<ffffffff810cc243>] lock_acquire+0x93/0x200 > [<ffffffff8108ff43>] ? __wake_up+0x23/0x50 > [<ffffffff8173419b>] _raw_spin_lock_irqsave+0x4b/0x90 > [<ffffffff8108ff43>] ? __wake_up+0x23/0x50 > [<ffffffff8108ff43>] __wake_up+0x23/0x50 > [<ffffffff8110d4f8>] __call_rcu_nocb_enqueue+0xa8/0xc0 > [<ffffffff81111450>] __call_rcu+0x140/0x820 > [<ffffffff8109bc8f>] ? local_clock+0x3f/0x50 > [<ffffffff81111bb0>] kfree_call_rcu+0x20/0x30 > [<ffffffff81149abf>] put_ctx+0x4f/0x70 > [<ffffffff81154c3e>] perf_event_exit_task+0x12e/0x230 > [<ffffffff81056b8d>] do_exit+0x30d/0xcc0 > [<ffffffff810c9af5>] ? trace_hardirqs_on_caller+0x115/0x1e0 > [<ffffffff810c9bcd>] ? trace_hardirqs_on+0xd/0x10 > [<ffffffff8105893c>] do_group_exit+0x4c/0xc0 > [<ffffffff810589c4>] SyS_exit_group+0x14/0x20 > [<ffffffff8173d4e4>] tracesys+0xdd/0xe2 The underlying problem is that perf is invoking call_rcu() with the scheduler locks held, but in NOCB mode, call_rcu() will with high probability invoke the scheduler -- which just might want to use its locks. The reason that call_rcu() needs to invoke the scheduler is to wake up the corresponding rcuo callback-offload kthread, which does the job of starting up a grace period and invoking the callbacks afterwards. One solution (championed on a related problem by Lai Jiangshan) is to simply defer the wakeup to some point where scheduler locks are no longer held. Since we don't want to unnecessarily incur the cost of such deferral, the task before us is threefold: 1. Determine when it is likely that a relevant scheduler lock is held. 2. Defer the wakeup in such cases. 3. Ensure that all deferred wakeups eventually happen, preferably sooner rather than later. We use irqs_disabled_flags() as a proxy for relevant scheduler locks being held. This works because the relevant locks are always acquired with interrupts disabled. We may defer more often than needed, but that is at least safe. The wakeup deferral is tracked via a new field in the per-CPU and per-RCU-flavor rcu_data structure, namely ->nocb_defer_wakeup. This flag is checked by the RCU core processing. The __rcu_pending() function now checks this flag, which causes rcu_check_callbacks() to initiate RCU core processing at each scheduling-clock interrupt where this flag is set. Of course this is not sufficient because scheduling-clock interrupts are often turned off (the things we used to be able to count on!). So the flags are also checked on entry to any state that RCU considers to be idle, which includes both NO_HZ_IDLE idle state and NO_HZ_FULL user-mode-execution state. This approach should allow call_rcu() to be invoked regardless of what locks you might be holding, the key word being "should". Reported-by: Dave Jones <davej@redhat.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org>
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r--kernel/rcu/tree.c24
1 files changed, 20 insertions, 4 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index abef9c358d47..264f0284c0bd 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -369,6 +369,9 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
369static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, 369static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
370 bool user) 370 bool user)
371{ 371{
372 struct rcu_state *rsp;
373 struct rcu_data *rdp;
374
372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 375 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
373 if (!user && !is_idle_task(current)) { 376 if (!user && !is_idle_task(current)) {
374 struct task_struct *idle __maybe_unused = 377 struct task_struct *idle __maybe_unused =
@@ -380,6 +383,10 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
380 current->pid, current->comm, 383 current->pid, current->comm,
381 idle->pid, idle->comm); /* must be idle task! */ 384 idle->pid, idle->comm); /* must be idle task! */
382 } 385 }
386 for_each_rcu_flavor(rsp) {
387 rdp = this_cpu_ptr(rsp->rda);
388 do_nocb_deferred_wakeup(rdp);
389 }
383 rcu_prepare_for_idle(smp_processor_id()); 390 rcu_prepare_for_idle(smp_processor_id());
384 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 391 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
385 smp_mb__before_atomic_inc(); /* See above. */ 392 smp_mb__before_atomic_inc(); /* See above. */
@@ -1928,13 +1935,13 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
1928 * Adopt the RCU callbacks from the specified rcu_state structure's 1935 * Adopt the RCU callbacks from the specified rcu_state structure's
1929 * orphanage. The caller must hold the ->orphan_lock. 1936 * orphanage. The caller must hold the ->orphan_lock.
1930 */ 1937 */
1931static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) 1938static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
1932{ 1939{
1933 int i; 1940 int i;
1934 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); 1941 struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
1935 1942
1936 /* No-CBs CPUs are handled specially. */ 1943 /* No-CBs CPUs are handled specially. */
1937 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) 1944 if (rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
1938 return; 1945 return;
1939 1946
1940 /* Do the accounting first. */ 1947 /* Do the accounting first. */
@@ -2013,7 +2020,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2013 2020
2014 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2021 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
2015 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2022 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
2016 rcu_adopt_orphan_cbs(rsp); 2023 rcu_adopt_orphan_cbs(rsp, flags);
2017 2024
2018 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 2025 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
2019 mask = rdp->grpmask; /* rnp->grplo is constant. */ 2026 mask = rdp->grpmask; /* rnp->grplo is constant. */
@@ -2330,6 +2337,9 @@ __rcu_process_callbacks(struct rcu_state *rsp)
2330 /* If there are callbacks ready, invoke them. */ 2337 /* If there are callbacks ready, invoke them. */
2331 if (cpu_has_callbacks_ready_to_invoke(rdp)) 2338 if (cpu_has_callbacks_ready_to_invoke(rdp))
2332 invoke_rcu_callbacks(rsp, rdp); 2339 invoke_rcu_callbacks(rsp, rdp);
2340
2341 /* Do any needed deferred wakeups of rcuo kthreads. */
2342 do_nocb_deferred_wakeup(rdp);
2333} 2343}
2334 2344
2335/* 2345/*
@@ -2464,7 +2474,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
2464 2474
2465 if (cpu != -1) 2475 if (cpu != -1)
2466 rdp = per_cpu_ptr(rsp->rda, cpu); 2476 rdp = per_cpu_ptr(rsp->rda, cpu);
2467 offline = !__call_rcu_nocb(rdp, head, lazy); 2477 offline = !__call_rcu_nocb(rdp, head, lazy, flags);
2468 WARN_ON_ONCE(offline); 2478 WARN_ON_ONCE(offline);
2469 /* _call_rcu() is illegal on offline CPU; leak the callback. */ 2479 /* _call_rcu() is illegal on offline CPU; leak the callback. */
2470 local_irq_restore(flags); 2480 local_irq_restore(flags);
@@ -2817,6 +2827,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
2817 return 1; 2827 return 1;
2818 } 2828 }
2819 2829
2830 /* Does this CPU need a deferred NOCB wakeup? */
2831 if (rcu_nocb_need_deferred_wakeup(rdp)) {
2832 rdp->n_rp_nocb_defer_wakeup++;
2833 return 1;
2834 }
2835
2820 /* nothing to do */ 2836 /* nothing to do */
2821 rdp->n_rp_need_nothing++; 2837 rdp->n_rp_need_nothing++;
2822 return 0; 2838 return 0;