aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/trace
diff options
context:
space:
mode:
authorPeter Zijlstra (Intel) <peterz@infradead.org>2014-12-16 06:47:34 -0500
committerIngo Molnar <mingo@kernel.org>2015-01-14 09:11:45 -0500
commit86038c5ea81b519a8a1fcfcd5e4599aab0cdd119 (patch)
tree52885be83b5062b95923e16c02f7137c45609a61 /kernel/trace
parent188c901941efd43cbf21e8f4f9e9a276536b989c (diff)
perf: Avoid horrible stack usage
Both Linus (most recent) and Steve (a while ago) reported that perf related callbacks have massive stack bloat. The problem is that software events need a pt_regs in order to properly report the event location and unwind stack. And because we could not assume one was present we allocated one on stack and filled it with minimal bits required for operation. Now, pt_regs is quite large, so this is undesirable. Furthermore it turns out that most sites actually have a pt_regs pointer available, making this even more onerous, as the stack space is pointless waste. This patch addresses the problem by observing that software events have well defined nesting semantics, therefore we can use static per-cpu storage instead of on-stack. Linus made the further observation that all but the scheduler callers of perf_sw_event() have a pt_regs available, so we change the regular perf_sw_event() to require a valid pt_regs (where it used to be optional) and add perf_sw_event_sched() for the scheduler. We have a scheduler specific call instead of a more generic _noregs() like construct because we can assume non-recursion from the scheduler and thereby simplify the code further (_noregs would have to put the recursion context call inline in order to assertain which __perf_regs element to use). One last note on the implementation of perf_trace_buf_prepare(); we allow .regs = NULL for those cases where we already have a pt_regs pointer available and do not need another. Reported-by: Linus Torvalds <torvalds@linux-foundation.org> Reported-by: Steven Rostedt <rostedt@goodmis.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@kernel.org> Cc: Javi Merino <javi.merino@arm.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Petr Mladek <pmladek@suse.cz> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Tom Zanussi <tom.zanussi@linux.intel.com> Cc: Vaibhav Nagarnaik <vnagarnaik@google.com> Link: http://lkml.kernel.org/r/20141216115041.GW3337@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/trace_event_perf.c4
-rw-r--r--kernel/trace/trace_kprobe.c4
-rw-r--r--kernel/trace/trace_syscalls.c4
-rw-r--r--kernel/trace/trace_uprobe.c2
4 files changed, 8 insertions, 6 deletions
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4b9c114ee9de..6fa484de2ba1 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags)
261} 261}
262 262
263void *perf_trace_buf_prepare(int size, unsigned short type, 263void *perf_trace_buf_prepare(int size, unsigned short type,
264 struct pt_regs *regs, int *rctxp) 264 struct pt_regs **regs, int *rctxp)
265{ 265{
266 struct trace_entry *entry; 266 struct trace_entry *entry;
267 unsigned long flags; 267 unsigned long flags;
@@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type,
280 if (*rctxp < 0) 280 if (*rctxp < 0)
281 return NULL; 281 return NULL;
282 282
283 if (regs)
284 *regs = this_cpu_ptr(&__perf_regs[*rctxp]);
283 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); 285 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
284 286
285 /* zero the dead bytes from align to not leak stack to user */ 287 /* zero the dead bytes from align to not leak stack to user */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5edb518be345..296079ae6583 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1148 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1148 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1149 size -= sizeof(u32); 1149 size -= sizeof(u32);
1150 1150
1151 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1151 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1152 if (!entry) 1152 if (!entry)
1153 return; 1153 return;
1154 1154
@@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
1179 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1179 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1180 size -= sizeof(u32); 1180 size -= sizeof(u32);
1181 1181
1182 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1182 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1183 if (!entry) 1183 if (!entry)
1184 return; 1184 return;
1185 1185
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6ee36fcbf90..f97f6e3a676c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
574 size -= sizeof(u32); 574 size -= sizeof(u32);
575 575
576 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 576 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
577 sys_data->enter_event->event.type, regs, &rctx); 577 sys_data->enter_event->event.type, NULL, &rctx);
578 if (!rec) 578 if (!rec)
579 return; 579 return;
580 580
@@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
647 size -= sizeof(u32); 647 size -= sizeof(u32);
648 648
649 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 649 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
650 sys_data->exit_event->event.type, regs, &rctx); 650 sys_data->exit_event->event.type, NULL, &rctx);
651 if (!rec) 651 if (!rec)
652 return; 652 return;
653 653
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8520acc34b18..b11441321e7a 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
1111 if (hlist_empty(head)) 1111 if (hlist_empty(head))
1112 goto out; 1112 goto out;
1113 1113
1114 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); 1114 entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
1115 if (!entry) 1115 if (!entry)
1116 goto out; 1116 goto out;
1117 1117