aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/trace
diff options
context:
space:
mode:
authorSteven Rostedt <srostedt@redhat.com>2009-02-05 18:43:07 -0500
committerSteven Rostedt <srostedt@redhat.com>2009-02-07 20:00:17 -0500
commit78d904b46a72fcf15ea6a39672bbef92953876b5 (patch)
tree69f56f7bbd1866491517d902bdf18ab973f0eb5f /kernel/trace
parent1830b52d0de8c60c4f5dfbac134aa8f69d815801 (diff)
ring-buffer: add NMI protection for spinlocks
Impact: prevent deadlock in NMI The ring buffers are not yet totally lockless with writing to the buffer. When a writer crosses a page, it grabs a per cpu spinlock to protect against a reader. The spinlocks taken by a writer are not to protect against other writers, since a writer can only write to its own per cpu buffer. The spinlocks protect against readers that can touch any cpu buffer. The writers are made to be reentrant with the spinlocks disabling interrupts. The problem arises when an NMI writes to the buffer, and that write crosses a page boundary. If it grabs a spinlock, it can be racing with another writer (since disabling interrupts does not protect against NMIs) or with a reader on the same CPU. Luckily, most of the users are not reentrant and protects against this issue. But if a user of the ring buffer becomes reentrant (which is what the ring buffers do allow), if the NMI also writes to the ring buffer then we risk the chance of a deadlock. This patch moves the ftrace_nmi_enter called by nmi_enter() to the ring buffer code. It replaces the current ftrace_nmi_enter that is used by arch specific code to arch_ftrace_nmi_enter and updates the Kconfig to handle it. When an NMI is called, it will set a per cpu variable in the ring buffer code and will clear it when the NMI exits. If a write to the ring buffer crosses page boundaries inside an NMI, a trylock is used on the spin lock instead. If the spinlock fails to be acquired, then the entry is discarded. This bug appeared in the ftrace work in the RT tree, where event tracing is reentrant. This workaround solved the deadlocks that appeared there. Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig8
-rw-r--r--kernel/trace/ring_buffer.c48
2 files changed, 54 insertions, 2 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 28f2644484d9..25131a5d5e4f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT
9config NOP_TRACER 9config NOP_TRACER
10 bool 10 bool
11 11
12config HAVE_FTRACE_NMI_ENTER
13 bool
14
12config HAVE_FUNCTION_TRACER 15config HAVE_FUNCTION_TRACER
13 bool 16 bool
14 17
@@ -37,6 +40,11 @@ config TRACER_MAX_TRACE
37config RING_BUFFER 40config RING_BUFFER
38 bool 41 bool
39 42
43config FTRACE_NMI_ENTER
44 bool
45 depends on HAVE_FTRACE_NMI_ENTER
46 default y
47
40config TRACING 48config TRACING
41 bool 49 bool
42 select DEBUG_FS 50 select DEBUG_FS
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b36d7374ceef..a60a6a852f42 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/ftrace_irq.h>
7#include <linux/spinlock.h> 8#include <linux/spinlock.h>
8#include <linux/debugfs.h> 9#include <linux/debugfs.h>
9#include <linux/uaccess.h> 10#include <linux/uaccess.h>
@@ -19,6 +20,35 @@
19#include "trace.h" 20#include "trace.h"
20 21
21/* 22/*
23 * Since the write to the buffer is still not fully lockless,
24 * we must be careful with NMIs. The locks in the writers
25 * are taken when a write crosses to a new page. The locks
26 * protect against races with the readers (this will soon
27 * be fixed with a lockless solution).
28 *
29 * Because we can not protect against NMIs, and we want to
30 * keep traces reentrant, we need to manage what happens
31 * when we are in an NMI.
32 */
33static DEFINE_PER_CPU(int, rb_in_nmi);
34
35void ftrace_nmi_enter(void)
36{
37 __get_cpu_var(rb_in_nmi)++;
38 /* call arch specific handler too */
39 arch_ftrace_nmi_enter();
40}
41
42void ftrace_nmi_exit(void)
43{
44 arch_ftrace_nmi_exit();
45 __get_cpu_var(rb_in_nmi)--;
46 /* NMIs are not recursive */
47 WARN_ON_ONCE(__get_cpu_var(rb_in_nmi));
48}
49
50
51/*
22 * A fast way to enable or disable all ring buffers is to 52 * A fast way to enable or disable all ring buffers is to
23 * call tracing_on or tracing_off. Turning off the ring buffers 53 * call tracing_on or tracing_off. Turning off the ring buffers
24 * prevents all ring buffers from being recorded to. 54 * prevents all ring buffers from being recorded to.
@@ -982,6 +1012,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
982 struct ring_buffer *buffer = cpu_buffer->buffer; 1012 struct ring_buffer *buffer = cpu_buffer->buffer;
983 struct ring_buffer_event *event; 1013 struct ring_buffer_event *event;
984 unsigned long flags; 1014 unsigned long flags;
1015 bool lock_taken = false;
985 1016
986 commit_page = cpu_buffer->commit_page; 1017 commit_page = cpu_buffer->commit_page;
987 /* we just need to protect against interrupts */ 1018 /* we just need to protect against interrupts */
@@ -995,7 +1026,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
995 struct buffer_page *next_page = tail_page; 1026 struct buffer_page *next_page = tail_page;
996 1027
997 local_irq_save(flags); 1028 local_irq_save(flags);
998 __raw_spin_lock(&cpu_buffer->lock); 1029 /*
1030 * NMIs can happen after we take the lock.
1031 * If we are in an NMI, only take the lock
1032 * if it is not already taken. Otherwise
1033 * simply fail.
1034 */
1035 if (unlikely(__get_cpu_var(rb_in_nmi))) {
1036 if (!__raw_spin_trylock(&cpu_buffer->lock))
1037 goto out_unlock;
1038 } else
1039 __raw_spin_lock(&cpu_buffer->lock);
1040
1041 lock_taken = true;
999 1042
1000 rb_inc_page(cpu_buffer, &next_page); 1043 rb_inc_page(cpu_buffer, &next_page);
1001 1044
@@ -1097,7 +1140,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1097 if (tail <= BUF_PAGE_SIZE) 1140 if (tail <= BUF_PAGE_SIZE)
1098 local_set(&tail_page->write, tail); 1141 local_set(&tail_page->write, tail);
1099 1142
1100 __raw_spin_unlock(&cpu_buffer->lock); 1143 if (likely(lock_taken))
1144 __raw_spin_unlock(&cpu_buffer->lock);
1101 local_irq_restore(flags); 1145 local_irq_restore(flags);
1102 return NULL; 1146 return NULL;
1103} 1147}