author		Steven Rostedt <srostedt@redhat.com>	2009-02-05 18:43:07 -0500
committer	Steven Rostedt <srostedt@redhat.com>	2009-02-07 20:00:17 -0500
commit		78d904b46a72fcf15ea6a39672bbef92953876b5
tree		69f56f7bbd1866491517d902bdf18ab973f0eb5f
parent		1830b52d0de8c60c4f5dfbac134aa8f69d815801
ring-buffer: add NMI protection for spinlocks
Impact: prevent deadlock in NMI
Writes to the ring buffer are not yet fully lockless. When a writer
crosses a page boundary, it grabs a per-CPU spinlock to protect against
a reader. The spinlock taken by a writer does not protect against other
writers, since a writer can only write to its own per-CPU buffer; it
protects against readers, which may touch any CPU's buffer. Writers are
kept reentrant with respect to the spinlock by disabling interrupts
while it is held.
The problem arises when an NMI writes to the buffer and that write
crosses a page boundary. If the NMI grabs the spinlock, it can race
with another writer (disabling interrupts does not protect against
NMIs) or with a reader on the same CPU. Luckily, most users are not
reentrant and so avoid this issue. But if a ring buffer user is
reentrant (which the ring buffer does allow) and an NMI also writes to
the ring buffer, we risk a deadlock.
This patch moves the ftrace_nmi_enter() called by nmi_enter() into the
ring buffer code. The ftrace_nmi_enter()/ftrace_nmi_exit() currently
used by arch-specific code are renamed to arch_ftrace_nmi_enter()/
arch_ftrace_nmi_exit(), and the Kconfig is updated to handle it.
When an NMI fires, a per-CPU variable in the ring buffer code is set,
and it is cleared when the NMI exits. If a write to the ring buffer
crosses a page boundary inside an NMI, a trylock is used on the
spinlock instead. If the spinlock cannot be acquired, the entry is
discarded (a standalone sketch of this policy follows below).
This bug appeared in the ftrace work in the RT tree, where event tracing
is reentrant. This workaround solved the deadlocks that appeared there.
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
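
To make the policy concrete, here is a minimal standalone sketch in
plain C11 (user-space, not kernel code). The names page_lock,
reserve_across_page, nmi_enter/nmi_exit and the spinlock helpers are
invented for illustration; they stand in for cpu_buffer->lock, the
page-crossing path in __rb_reserve_next(), and the real NMI hooks.

/*
 * Standalone illustration of the policy described above: a per-context
 * "in NMI" counter, and a writer that trylocks and discards instead of
 * spinning when it runs inside an NMI.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag page_lock = ATOMIC_FLAG_INIT;  /* stands in for cpu_buffer->lock */
static _Thread_local int in_nmi;                  /* stands in for the per-cpu NMI counter */

static bool spin_trylock(atomic_flag *l) { return !atomic_flag_test_and_set(l); }
static void spin_lock(atomic_flag *l)    { while (atomic_flag_test_and_set(l)) ; }
static void spin_unlock(atomic_flag *l)  { atomic_flag_clear(l); }

static void nmi_enter(void) { in_nmi++; }
static void nmi_exit(void)  { in_nmi--; }

/* Returns false when the event has to be discarded. */
static bool reserve_across_page(void)
{
	bool lock_taken = false;

	if (in_nmi) {
		/*
		 * Never spin in NMI context: the current lock holder may be
		 * the very code this NMI interrupted, which would deadlock.
		 */
		if (!spin_trylock(&page_lock))
			goto out;                 /* drop the event instead */
	} else {
		spin_lock(&page_lock);
	}
	lock_taken = true;

	/* ... advance the writer to the next page under the lock ... */

 out:
	if (lock_taken)
		spin_unlock(&page_lock);
	return lock_taken;
}

int main(void)
{
	nmi_enter();
	printf("inside NMI, reserved: %d\n", reserve_across_page());
	nmi_exit();
	printf("normal context, reserved: %d\n", reserve_across_page());
	return 0;
}

In the kernel patch below, the same idea appears as the rb_in_nmi
per-CPU counter and the __raw_spin_trylock() fallback in
__rb_reserve_next(): dropping the event under contention trades one
lost trace entry for guaranteed forward progress in NMI context.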
-rw-r--r--	arch/x86/Kconfig            |  1
-rw-r--r--	arch/x86/kernel/ftrace.c    |  8
-rw-r--r--	include/linux/ftrace_irq.h  | 10
-rw-r--r--	kernel/trace/Kconfig        |  8
-rw-r--r--	kernel/trace/ring_buffer.c  | 48
5 files changed, 68 insertions, 7 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 73f7fe8fd4d1..a6be725cb049 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -34,6 +34,7 @@ config X86
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE || FUNCTION_GRAPH_TRACER
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 	select HAVE_ARCH_TRACEHOOK
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 4d33224c055f..4c683587055b 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -113,7 +113,7 @@ static void ftrace_mod_code(void)
 			     MCOUNT_INSN_SIZE);
 }
 
-void ftrace_nmi_enter(void)
+void arch_ftrace_nmi_enter(void)
 {
 	atomic_inc(&in_nmi);
 	/* Must have in_nmi seen before reading write flag */
@@ -124,7 +124,7 @@ void ftrace_nmi_enter(void)
 	}
 }
 
-void ftrace_nmi_exit(void)
+void arch_ftrace_nmi_exit(void)
 {
 	/* Finish all executions before clearing in_nmi */
 	smp_wmb();
@@ -376,12 +376,12 @@ int ftrace_disable_ftrace_graph_caller(void)
  */
 static atomic_t in_nmi;
 
-void ftrace_nmi_enter(void)
+void arch_ftrace_nmi_enter(void)
 {
 	atomic_inc(&in_nmi);
 }
 
-void ftrace_nmi_exit(void)
+void arch_ftrace_nmi_exit(void)
 {
 	atomic_dec(&in_nmi);
 }
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
index 366a054d0b05..29de6779a963 100644
--- a/include/linux/ftrace_irq.h
+++ b/include/linux/ftrace_irq.h
@@ -2,7 +2,15 @@
 #define _LINUX_FTRACE_IRQ_H
 
 
-#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER)
+#ifdef CONFIG_FTRACE_NMI_ENTER
+extern void arch_ftrace_nmi_enter(void);
+extern void arch_ftrace_nmi_exit(void);
+#else
+static inline void arch_ftrace_nmi_enter(void) { }
+static inline void arch_ftrace_nmi_exit(void) { }
+#endif
+
+#ifdef CONFIG_RING_BUFFER
 extern void ftrace_nmi_enter(void);
 extern void ftrace_nmi_exit(void);
 #else
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 28f2644484d9..25131a5d5e4f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT
 config NOP_TRACER
 	bool
 
+config HAVE_FTRACE_NMI_ENTER
+	bool
+
 config HAVE_FUNCTION_TRACER
 	bool
 
@@ -37,6 +40,11 @@ config TRACER_MAX_TRACE
 config RING_BUFFER
 	bool
 
+config FTRACE_NMI_ENTER
+	bool
+	depends on HAVE_FTRACE_NMI_ENTER
+	default y
+
 config TRACING
 	bool
 	select DEBUG_FS
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b36d7374ceef..a60a6a852f42 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
  */
 #include <linux/ring_buffer.h>
+#include <linux/ftrace_irq.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
@@ -19,6 +20,35 @@
 #include "trace.h"
 
 /*
+ * Since the write to the buffer is still not fully lockless,
+ * we must be careful with NMIs. The locks in the writers
+ * are taken when a write crosses to a new page. The locks
+ * protect against races with the readers (this will soon
+ * be fixed with a lockless solution).
+ *
+ * Because we can not protect against NMIs, and we want to
+ * keep traces reentrant, we need to manage what happens
+ * when we are in an NMI.
+ */
+static DEFINE_PER_CPU(int, rb_in_nmi);
+
+void ftrace_nmi_enter(void)
+{
+	__get_cpu_var(rb_in_nmi)++;
+	/* call arch specific handler too */
+	arch_ftrace_nmi_enter();
+}
+
+void ftrace_nmi_exit(void)
+{
+	arch_ftrace_nmi_exit();
+	__get_cpu_var(rb_in_nmi)--;
+	/* NMIs are not recursive */
+	WARN_ON_ONCE(__get_cpu_var(rb_in_nmi));
+}
+
+
+/*
  * A fast way to enable or disable all ring buffers is to
  * call tracing_on or tracing_off. Turning off the ring buffers
  * prevents all ring buffers from being recorded to.
@@ -982,6 +1012,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
 	unsigned long flags;
+	bool lock_taken = false;
 
 	commit_page = cpu_buffer->commit_page;
 	/* we just need to protect against interrupts */
@@ -995,7 +1026,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		struct buffer_page *next_page = tail_page;
 
 		local_irq_save(flags);
-		__raw_spin_lock(&cpu_buffer->lock);
+		/*
+		 * NMIs can happen after we take the lock.
+		 * If we are in an NMI, only take the lock
+		 * if it is not already taken. Otherwise
+		 * simply fail.
+		 */
+		if (unlikely(__get_cpu_var(rb_in_nmi))) {
+			if (!__raw_spin_trylock(&cpu_buffer->lock))
+				goto out_unlock;
+		} else
+			__raw_spin_lock(&cpu_buffer->lock);
+
+		lock_taken = true;
 
 		rb_inc_page(cpu_buffer, &next_page);
 
@@ -1097,7 +1140,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	if (tail <= BUF_PAGE_SIZE)
 		local_set(&tail_page->write, tail);
 
-	__raw_spin_unlock(&cpu_buffer->lock);
+	if (likely(lock_taken))
+		__raw_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 	return NULL;
 }