author    Steven Rostedt <srostedt@redhat.com>    2009-02-05 18:43:07 -0500
committer Steven Rostedt <srostedt@redhat.com>    2009-02-07 20:00:17 -0500
commit    78d904b46a72fcf15ea6a39672bbef92953876b5
tree      69f56f7bbd1866491517d902bdf18ab973f0eb5f
parent    1830b52d0de8c60c4f5dfbac134aa8f69d815801
ring-buffer: add NMI protection for spinlocks
Impact: prevent deadlock in NMI

The ring buffers are not yet totally lockless with writing to the buffer. When a writer crosses a page, it grabs a per-cpu spinlock to protect against a reader. The spinlocks taken by a writer are not to protect against other writers, since a writer can only write to its own per-cpu buffer. The spinlocks protect against readers that can touch any cpu buffer. The writers are made reentrant by having the spinlocks disable interrupts.

The problem arises when an NMI writes to the buffer, and that write crosses a page boundary. If it grabs a spinlock, it can be racing with another writer (since disabling interrupts does not protect against NMIs) or with a reader on the same CPU. Luckily, most of the users are not reentrant, which protects against this issue. But if a user of the ring buffer becomes reentrant (which the ring buffers do allow), and the NMI also writes to the ring buffer, then we risk a deadlock.

This patch moves the ftrace_nmi_enter() called by nmi_enter() into the ring buffer code. The ftrace_nmi_enter() used by arch-specific code is renamed to arch_ftrace_nmi_enter(), and the Kconfig is updated to handle it. When an NMI is entered, a per-cpu variable is set in the ring buffer code and cleared when the NMI exits. If a write to the ring buffer crosses a page boundary inside an NMI, a trylock is used on the spinlock instead. If the spinlock cannot be acquired, the entry is discarded.

This bug appeared in the ftrace work in the RT tree, where event tracing is reentrant. This workaround solved the deadlocks that appeared there.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
-rw-r--r--  arch/x86/Kconfig            |  1
-rw-r--r--  arch/x86/kernel/ftrace.c    |  8
-rw-r--r--  include/linux/ftrace_irq.h  | 10
-rw-r--r--  kernel/trace/Kconfig        |  8
-rw-r--r--  kernel/trace/ring_buffer.c  | 48
5 files changed, 68 insertions(+), 7 deletions(-)
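Before the per-file hunks, here is a minimal standalone C model of the bookkeeping the commit message describes: a per-cpu variable is bumped on NMI entry, dropped on exit, and the arch hook is chained on either side. Every name here (rb_in_nmi, the arch stubs) is an illustrative stand-in written for this sketch, not the kernel code the patch adds, and the thread-local variable only approximates a per-cpu one.

#include <stdio.h>

static _Thread_local int rb_in_nmi;	/* stand-in for the per-cpu flag */

static void arch_ftrace_nmi_enter(void) { /* arch-specific work, if any */ }
static void arch_ftrace_nmi_exit(void)  { /* arch-specific work, if any */ }

static void ftrace_nmi_enter(void)
{
	rb_in_nmi++;			/* ring-buffer writers must not spin now */
	arch_ftrace_nmi_enter();	/* chain to the arch handler */
}

static void ftrace_nmi_exit(void)
{
	arch_ftrace_nmi_exit();
	rb_in_nmi--;
	if (rb_in_nmi)			/* NMIs are not expected to nest */
		fprintf(stderr, "unbalanced NMI enter/exit\n");
}

int main(void)
{
	ftrace_nmi_enter();
	/* ... the NMI handler body, which may try to trace, runs here ... */
	ftrace_nmi_exit();
	printf("rb_in_nmi after exit: %d\n", rb_in_nmi);
	return 0;
}

The real code keys the slow-path locking decision off this flag, as the ring_buffer.c hunks at the end of the patch show.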
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 73f7fe8fd4d1..a6be725cb049 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -34,6 +34,7 @@ config X86
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE || FUNCTION_GRAPH_TRACER
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 	select HAVE_ARCH_TRACEHOOK
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 4d33224c055f..4c683587055b 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -113,7 +113,7 @@ static void ftrace_mod_code(void)
 		      MCOUNT_INSN_SIZE);
 }
 
-void ftrace_nmi_enter(void)
+void arch_ftrace_nmi_enter(void)
 {
 	atomic_inc(&in_nmi);
 	/* Must have in_nmi seen before reading write flag */
@@ -124,7 +124,7 @@ void ftrace_nmi_enter(void)
 	}
 }
 
-void ftrace_nmi_exit(void)
+void arch_ftrace_nmi_exit(void)
 {
 	/* Finish all executions before clearing in_nmi */
 	smp_wmb();
@@ -376,12 +376,12 @@ int ftrace_disable_ftrace_graph_caller(void)
  */
 static atomic_t in_nmi;
 
-void ftrace_nmi_enter(void)
+void arch_ftrace_nmi_enter(void)
 {
 	atomic_inc(&in_nmi);
 }
 
-void ftrace_nmi_exit(void)
+void arch_ftrace_nmi_exit(void)
 {
 	atomic_dec(&in_nmi);
 }
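The comments in the hunks above ("Must have in_nmi seen before reading write flag", "Finish all executions before clearing in_nmi") describe a store/load handshake between the NMI path and the code patcher. The following is a hedged C11 sketch of that ordering only; mod_code_write and do_pending_code_patch() are hypothetical stand-ins, since the patcher side sits outside the lines shown in this hunk.

#include <stdatomic.h>
#include <stdbool.h>

atomic_int  in_nmi;		/* how many NMIs are currently executing */
atomic_bool mod_code_write;	/* assumed "a code write is pending" flag */

static void do_pending_code_patch(void)
{
	/* stand-in for whatever work must be completed safely from NMI context */
}

void arch_ftrace_nmi_enter(void)
{
	atomic_fetch_add(&in_nmi, 1);
	/* the in_nmi increment must be visible before the write flag is read */
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load(&mod_code_write))
		do_pending_code_patch();
}

void arch_ftrace_nmi_exit(void)
{
	/* everything done in the NMI must be visible before the count drops */
	atomic_thread_fence(memory_order_seq_cst);
	atomic_fetch_sub(&in_nmi, 1);
}

A full fence is used here as a conservative approximation of the smp_mb()/smp_wmb() pairing the kernel code relies on.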
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
index 366a054d0b05..29de6779a963 100644
--- a/include/linux/ftrace_irq.h
+++ b/include/linux/ftrace_irq.h
@@ -2,7 +2,15 @@
 #define _LINUX_FTRACE_IRQ_H
 
 
-#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER)
+#ifdef CONFIG_FTRACE_NMI_ENTER
+extern void arch_ftrace_nmi_enter(void);
+extern void arch_ftrace_nmi_exit(void);
+#else
+static inline void arch_ftrace_nmi_enter(void) { }
+static inline void arch_ftrace_nmi_exit(void) { }
+#endif
+
+#ifdef CONFIG_RING_BUFFER
 extern void ftrace_nmi_enter(void);
 extern void ftrace_nmi_exit(void);
 #else
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 28f2644484d9..25131a5d5e4f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT
 config NOP_TRACER
 	bool
 
+config HAVE_FTRACE_NMI_ENTER
+	bool
+
 config HAVE_FUNCTION_TRACER
 	bool
 
@@ -37,6 +40,11 @@ config TRACER_MAX_TRACE
 config RING_BUFFER
 	bool
 
+config FTRACE_NMI_ENTER
+	bool
+	depends on HAVE_FTRACE_NMI_ENTER
+	default y
+
 config TRACING
 	bool
 	select DEBUG_FS
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b36d7374ceef..a60a6a852f42 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
  */
 #include <linux/ring_buffer.h>
+#include <linux/ftrace_irq.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
@@ -19,6 +20,35 @@
 #include "trace.h"
 
 /*
+ * Since the write to the buffer is still not fully lockless,
+ * we must be careful with NMIs. The locks in the writers
+ * are taken when a write crosses to a new page. The locks
+ * protect against races with the readers (this will soon
+ * be fixed with a lockless solution).
+ *
+ * Because we can not protect against NMIs, and we want to
+ * keep traces reentrant, we need to manage what happens
+ * when we are in an NMI.
+ */
+static DEFINE_PER_CPU(int, rb_in_nmi);
+
+void ftrace_nmi_enter(void)
+{
+	__get_cpu_var(rb_in_nmi)++;
+	/* call arch specific handler too */
+	arch_ftrace_nmi_enter();
+}
+
+void ftrace_nmi_exit(void)
+{
+	arch_ftrace_nmi_exit();
+	__get_cpu_var(rb_in_nmi)--;
+	/* NMIs are not recursive */
+	WARN_ON_ONCE(__get_cpu_var(rb_in_nmi));
+}
+
+
+/*
  * A fast way to enable or disable all ring buffers is to
  * call tracing_on or tracing_off. Turning off the ring buffers
  * prevents all ring buffers from being recorded to.
@@ -982,6 +1012,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
 	unsigned long flags;
+	bool lock_taken = false;
 
 	commit_page = cpu_buffer->commit_page;
 	/* we just need to protect against interrupts */
@@ -995,7 +1026,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		struct buffer_page *next_page = tail_page;
 
 		local_irq_save(flags);
-		__raw_spin_lock(&cpu_buffer->lock);
+		/*
+		 * NMIs can happen after we take the lock.
+		 * If we are in an NMI, only take the lock
+		 * if it is not already taken. Otherwise
+		 * simply fail.
+		 */
+		if (unlikely(__get_cpu_var(rb_in_nmi))) {
+			if (!__raw_spin_trylock(&cpu_buffer->lock))
+				goto out_unlock;
+		} else
+			__raw_spin_lock(&cpu_buffer->lock);
+
+		lock_taken = true;
 
 		rb_inc_page(cpu_buffer, &next_page);
 
@@ -1097,7 +1140,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	if (tail <= BUF_PAGE_SIZE)
 		local_set(&tail_page->write, tail);
 
-	__raw_spin_unlock(&cpu_buffer->lock);
+	if (likely(lock_taken))
+		__raw_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 	return NULL;
 }
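To close, a compilable userspace model of the slow-path decision this last pair of hunks implements: inside an NMI the per-cpu reader lock is only try-locked and the entry is dropped on failure, and the final unlock is guarded because that path can now be reached without the lock held. The names here (rb_lock, in_nmi, reserve_next_page) are hypothetical stand-ins for this sketch, not the kernel code above.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag rb_lock = ATOMIC_FLAG_INIT;	/* stands in for cpu_buffer->lock */
static int in_nmi;				/* stands in for the per-cpu rb_in_nmi */

static bool rb_trylock(void) { return !atomic_flag_test_and_set(&rb_lock); }
static void rb_spinlock(void) { while (atomic_flag_test_and_set(&rb_lock)) ; }
static void rb_unlock(void)   { atomic_flag_clear(&rb_lock); }

/* Returns false when the write must be discarded (try-lock failed in NMI). */
static bool reserve_next_page(void)
{
	bool lock_taken = false;
	bool ok = false;

	if (in_nmi) {
		if (!rb_trylock())
			goto out_unlock;	/* never spin inside an NMI: drop the entry */
	} else {
		rb_spinlock();
	}
	lock_taken = true;

	/* ... advancing tail_page/commit_page would happen here ... */
	ok = true;

out_unlock:
	if (lock_taken)			/* this label is reachable without the lock */
		rb_unlock();
	return ok;
}

int main(void)
{
	/* simulate an NMI arriving while the interrupted writer holds the lock */
	rb_spinlock();
	in_nmi = 1;
	printf("NMI write reserved? %s\n",
	       reserve_next_page() ? "yes" : "no (discarded)");
	return 0;
}

Run as written, the simulated NMI loses the trylock and its entry is discarded instead of spinning on a lock its own CPU already holds, which is exactly the deadlock the patch sets out to avoid.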