author     Steven Rostedt <srostedt@redhat.com>    2009-02-05 18:43:07 -0500
committer  Steven Rostedt <srostedt@redhat.com>    2009-02-07 20:00:17 -0500
commit     78d904b46a72fcf15ea6a39672bbef92953876b5 (patch)
tree       69f56f7bbd1866491517d902bdf18ab973f0eb5f /kernel/trace
parent     1830b52d0de8c60c4f5dfbac134aa8f69d815801 (diff)
ring-buffer: add NMI protection for spinlocks
Impact: prevent deadlock in NMI
Writes to the ring buffer are not yet fully lockless. When a writer
crosses a page boundary, it grabs a per-CPU spinlock to protect against
a reader. The spinlocks taken by a writer are not there to protect
against other writers, since a writer can only write to its own per-CPU
buffer; they protect against readers, which can touch any CPU's buffer.
Writers remain reentrant because interrupts are disabled while the
spinlock is held.
The problem arises when an NMI writes to the buffer and that write
crosses a page boundary. If the NMI grabs the spinlock, it can race
with another writer (disabling interrupts does not protect against
NMIs) or with a reader on the same CPU; if the interrupted context
already holds the lock, the NMI spins on it forever. Luckily, most
users are not reentrant and are therefore protected from this issue.
But if a user of the ring buffer becomes reentrant (which the ring
buffers do allow) and an NMI also writes to the ring buffer, we risk
a deadlock.
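
To make the failure concrete, here is a minimal sketch of the pre-patch
locking pattern and how an NMI can deadlock on it. This is illustrative
only, not code from the kernel: the helper names writer_cross_page() and
nmi_write_event() are made up, while the locking primitives are the ones
the ring buffer actually uses.

    #include <linux/spinlock.h>
    #include <linux/irqflags.h>

    /* The lock stands in for cpu_buffer->lock in ring_buffer.c. */
    static void writer_cross_page(raw_spinlock_t *lock)
    {
            unsigned long flags;

            /* Disabling interrupts keeps ordinary interrupt-context
             * writers out of this section... */
            local_irq_save(flags);
            __raw_spin_lock(lock);

            /* ...but an NMI can still fire right here, on this CPU. */

            /* move the tail to the next page, sync with the reader, ... */

            __raw_spin_unlock(lock);
            local_irq_restore(flags);
    }

    /* Runs in NMI context.  Before this patch, a page-crossing write in
     * the NMI would call __raw_spin_lock() on the very lock that the
     * interrupted writer (or a reader on this CPU) may already hold.
     * The NMI then spins forever and the lock holder can never resume:
     * a deadlock. */
    static void nmi_write_event(raw_spinlock_t *lock)
    {
            writer_cross_page(lock);
    }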
This patch moves the ftrace_nmi_enter() called by nmi_enter() into the
ring buffer code. The ftrace_nmi_enter() currently provided by arch
specific code is renamed to arch_ftrace_nmi_enter(), and the Kconfig is
updated to handle it.
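
The diffstat below is limited to kernel/trace, so the renamed arch-side
hook and the header wiring are not shown here. Roughly, the split ends
up looking like the declarations below; this is a sketch of the intent,
not the literal contents of <linux/ftrace_irq.h>.

    /* Provided by arch code (e.g. x86's dynamic-ftrace NMI handling)
     * when the architecture selects HAVE_FTRACE_NMI_ENTER; otherwise
     * compiled away. */
    #ifdef CONFIG_FTRACE_NMI_ENTER
    extern void arch_ftrace_nmi_enter(void);
    extern void arch_ftrace_nmi_exit(void);
    #else
    static inline void arch_ftrace_nmi_enter(void) { }
    static inline void arch_ftrace_nmi_exit(void) { }
    #endif

    /* Provided by kernel/trace/ring_buffer.c (see the diff below);
     * called from nmi_enter()/nmi_exit(), flags rb_in_nmi and chains
     * to the arch hooks. */
    extern void ftrace_nmi_enter(void);
    extern void ftrace_nmi_exit(void);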
When an NMI fires, it sets a per-CPU variable in the ring buffer code
and clears it when the NMI exits. If a write to the ring buffer crosses
a page boundary inside an NMI, a trylock is used on the spinlock
instead. If the spinlock cannot be acquired, the entry is discarded.
This bug appeared in the ftrace work in the RT tree, where event tracing
is reentrant. This workaround solved the deadlocks that appeared there.
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Diffstat (limited to 'kernel/trace')
-rw-r--r--   kernel/trace/Kconfig        |  8
-rw-r--r--   kernel/trace/ring_buffer.c  | 48
2 files changed, 54 insertions, 2 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 28f2644484d9..25131a5d5e4f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT
 config NOP_TRACER
         bool

+config HAVE_FTRACE_NMI_ENTER
+        bool
+
 config HAVE_FUNCTION_TRACER
         bool

@@ -37,6 +40,11 @@ config TRACER_MAX_TRACE
 config RING_BUFFER
         bool

+config FTRACE_NMI_ENTER
+        bool
+        depends on HAVE_FTRACE_NMI_ENTER
+        default y
+
 config TRACING
         bool
         select DEBUG_FS
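
The new options only take effect once an architecture advertises that it
implements the arch hooks. That arch-side change is outside this
diffstat; as an illustration (an assumption about the arch patch, not
shown on this page), x86 would enable it from its own Kconfig along
these lines:

    # arch/x86/Kconfig (illustrative -- not part of the kernel/trace
    # diff shown here)
    config X86
            select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE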
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b36d7374ceef..a60a6a852f42 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
  */
 #include <linux/ring_buffer.h>
+#include <linux/ftrace_irq.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
@@ -19,6 +20,35 @@
 #include "trace.h"

 /*
+ * Since the write to the buffer is still not fully lockless,
+ * we must be careful with NMIs. The locks in the writers
+ * are taken when a write crosses to a new page. The locks
+ * protect against races with the readers (this will soon
+ * be fixed with a lockless solution).
+ *
+ * Because we can not protect against NMIs, and we want to
+ * keep traces reentrant, we need to manage what happens
+ * when we are in an NMI.
+ */
+static DEFINE_PER_CPU(int, rb_in_nmi);
+
+void ftrace_nmi_enter(void)
+{
+        __get_cpu_var(rb_in_nmi)++;
+        /* call arch specific handler too */
+        arch_ftrace_nmi_enter();
+}
+
+void ftrace_nmi_exit(void)
+{
+        arch_ftrace_nmi_exit();
+        __get_cpu_var(rb_in_nmi)--;
+        /* NMIs are not recursive */
+        WARN_ON_ONCE(__get_cpu_var(rb_in_nmi));
+}
+
+
+/*
  * A fast way to enable or disable all ring buffers is to
  * call tracing_on or tracing_off. Turning off the ring buffers
  * prevents all ring buffers from being recorded to.
@@ -982,6 +1012,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
         struct ring_buffer *buffer = cpu_buffer->buffer;
         struct ring_buffer_event *event;
         unsigned long flags;
+        bool lock_taken = false;

         commit_page = cpu_buffer->commit_page;
         /* we just need to protect against interrupts */
@@ -995,7 +1026,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                 struct buffer_page *next_page = tail_page;

                 local_irq_save(flags);
-                __raw_spin_lock(&cpu_buffer->lock);
+                /*
+                 * NMIs can happen after we take the lock.
+                 * If we are in an NMI, only take the lock
+                 * if it is not already taken. Otherwise
+                 * simply fail.
+                 */
+                if (unlikely(__get_cpu_var(rb_in_nmi))) {
+                        if (!__raw_spin_trylock(&cpu_buffer->lock))
+                                goto out_unlock;
+                } else
+                        __raw_spin_lock(&cpu_buffer->lock);
+
+                lock_taken = true;

                 rb_inc_page(cpu_buffer, &next_page);

@@ -1097,7 +1140,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                 if (tail <= BUF_PAGE_SIZE)
                         local_set(&tail_page->write, tail);

-                __raw_spin_unlock(&cpu_buffer->lock);
+                if (likely(lock_taken))
+                        __raw_spin_unlock(&cpu_buffer->lock);
                 local_irq_restore(flags);
                 return NULL;
         }
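
Note the lock_taken flag added above: the new failure path (goto
out_unlock) ends in the common unlock/irq-restore code shown in the last
hunk, and the flag keeps that path from releasing a lock the NMI never
managed to take. For context on where the new entry points are invoked,
nmi_enter()/nmi_exit() in include/linux/hardirq.h bracket every NMI
handler and call ftrace_nmi_enter()/ftrace_nmi_exit(); a simplified
sketch of those macros (the real ones also manage the preempt count,
lockdep and RCU) looks like this:

    /* Simplified sketch of include/linux/hardirq.h, for orientation
     * only -- the real macros do more bookkeeping. */
    #define nmi_enter()                                             \
            do {                                                    \
                    ftrace_nmi_enter();     /* sets rb_in_nmi */    \
                    /* enter NMI context: preempt count, lockdep,   \
                     * RCU, ... */                                  \
            } while (0)

    #define nmi_exit()                                              \
            do {                                                    \
                    /* leave NMI context */                         \
                    ftrace_nmi_exit();      /* clears rb_in_nmi */  \
            } while (0)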