commit	86038c5ea81b519a8a1fcfcd5e4599aab0cdd119 (patch)
author	Peter Zijlstra (Intel) <peterz@infradead.org>	2014-12-16 06:47:34 -0500
committer	Ingo Molnar <mingo@kernel.org>	2015-01-14 09:11:45 -0500
tree	52885be83b5062b95923e16c02f7137c45609a61
parent	188c901941efd43cbf21e8f4f9e9a276536b989c (diff)
perf: Avoid horrible stack usage
Both Linus (most recent) and Steve (a while ago) reported that perf
related callbacks have massive stack bloat.

The problem is that software events need a pt_regs in order to
properly report the event location and unwind stack. And because we
could not assume one was present we allocated one on stack and filled
it with minimal bits required for operation.

Now, pt_regs is quite large, so this is undesirable. Furthermore it
turns out that most sites actually have a pt_regs pointer available,
making this even more onerous, as the stack space is pointless waste.

This patch addresses the problem by observing that software events
have well-defined nesting semantics, therefore we can use static
per-cpu storage instead of on-stack.

Linus made the further observation that all but the scheduler callers
of perf_sw_event() have a pt_regs available, so we change the regular
perf_sw_event() to require a valid pt_regs (where it used to be
optional) and add perf_sw_event_sched() for the scheduler.

We have a scheduler specific call instead of a more generic _noregs()
like construct because we can assume non-recursion from the scheduler
and thereby simplify the code further (_noregs would have to put the
recursion context call inline in order to ascertain which __perf_regs
element to use).

One last note on the implementation of perf_trace_buf_prepare(); we
allow .regs = NULL for those cases where we already have a pt_regs
pointer available and do not need another.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Javi Merino <javi.merino@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Petr Mladek <pmladek@suse.cz>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tom.zanussi@linux.intel.com>
Cc: Vaibhav Nagarnaik <vnagarnaik@google.com>
Link: http://lkml.kernel.org/r/20141216115041.GW3337@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
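
[Editor's note: the "well-defined nesting semantics" above refer to the
swevent recursion machinery: software events nest at most four deep per
CPU (task, softirq, hardirq, NMI), and
perf_swevent_get_recursion_context() hands out the index of the current
level, so a per-cpu array of four pt_regs slots can never be clobbered
by a nested event. A minimal user-space sketch of that slot discipline,
with hypothetical stand-in names, not kernel source:

#include <stdio.h>

#define NR_CTX 4	/* task, softirq, hardirq, NMI */

struct pt_regs { unsigned long ip, sp; };

/* stand-ins for DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]) and
 * swhash->recursion; a single CPU is shown for simplicity */
static struct pt_regs perf_regs[NR_CTX];
static int in_use[NR_CTX];

/* stand-in for perf_swevent_get_recursion_context(): each nesting
 * level owns exactly one slot, so re-entry at the same level bails */
static int get_recursion_context(int level)
{
	if (in_use[level])
		return -1;
	in_use[level] = 1;
	return level;		/* index into perf_regs[] */
}

static void put_recursion_context(int rctx)
{
	in_use[rctx] = 0;
}

int main(void)
{
	int task = get_recursion_context(0);	/* task context */
	int nmi  = get_recursion_context(3);	/* "NMI" arrives, nests */

	perf_regs[task].ip = 0x1000;	/* distinct slots: the nested */
	perf_regs[nmi].ip  = 0x2000;	/* event cannot clobber ours  */

	printf("task slot %d, nmi slot %d\n", task, nmi);
	put_recursion_context(nmi);
	put_recursion_context(task);
	return 0;
}
]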
-rw-r--r--	include/linux/ftrace_event.h	|  2
-rw-r--r--	include/linux/perf_event.h	| 28
-rw-r--r--	include/trace/ftrace.h		|  7
-rw-r--r--	kernel/events/core.c		| 23
-rw-r--r--	kernel/sched/core.c		|  2
-rw-r--r--	kernel/trace/trace_event_perf.c	|  4
-rw-r--r--	kernel/trace/trace_kprobe.c	|  4
-rw-r--r--	kernel/trace/trace_syscalls.c	|  4
-rw-r--r--	kernel/trace/trace_uprobe.c	|  2
9 files changed, 52 insertions(+), 24 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 0bebb5c348b8..d36f68b08acc 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -595,7 +595,7 @@ extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
 extern void ftrace_profile_free_filter(struct perf_event *event);
 extern void *perf_trace_buf_prepare(int size, unsigned short type,
-				    struct pt_regs *regs, int *rctxp);
+				    struct pt_regs **regs, int *rctxp);
 
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4f7a61ca4b39..3a7bd80b4db8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -665,6 +665,7 @@ static inline int is_software_event(struct perf_event *event)
 
 extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
+extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
 extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);
 
 #ifndef perf_arch_fetch_caller_regs
@@ -689,14 +690,25 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 static __always_inline void
 perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
-	struct pt_regs hot_regs;
+	if (static_key_false(&perf_swevent_enabled[event_id]))
+		__perf_sw_event(event_id, nr, regs, addr);
+}
+
+DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);
 
+/*
+ * 'Special' version for the scheduler, it hard assumes no recursion,
+ * which is guaranteed by us not actually scheduling inside other swevents
+ * because those disable preemption.
+ */
+static __always_inline void
+perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
+{
 	if (static_key_false(&perf_swevent_enabled[event_id])) {
-		if (!regs) {
-			perf_fetch_caller_regs(&hot_regs);
-			regs = &hot_regs;
-		}
-		__perf_sw_event(event_id, nr, regs, addr);
+		struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]);
+
+		perf_fetch_caller_regs(regs);
+		___perf_sw_event(event_id, nr, regs, addr);
 	}
 }
 
@@ -712,7 +724,7 @@ static inline void perf_event_task_sched_in(struct task_struct *prev,
 static inline void perf_event_task_sched_out(struct task_struct *prev,
 					     struct task_struct *next)
 {
-	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0);
+	perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
 
 	if (static_key_false(&perf_sched_events.key))
 		__perf_event_task_sched_out(prev, next);
@@ -823,6 +835,8 @@ static inline int perf_event_refresh(struct perf_event *event, int refresh)
 static inline void
 perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)	{ }
 static inline void
+perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)			{ }
+static inline void
 perf_bp_event(struct perf_event *event, void *data)			{ }
 
 static inline int perf_register_guest_info_callbacks
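
[Editor's note: to make the new contract concrete, here is a hedged
sketch of the two kinds of call sites. The handle_demo_fault and
demo_sched_hook functions are hypothetical and not part of this patch;
the event ids are the real PERF_COUNT_SW_* enums:

#include <linux/perf_event.h>

/* perf_sw_event() now hard-requires a valid regs pointer;
 * perf_sw_event_sched() is for scheduler paths only, where preemption
 * is off and recursion into another swevent is therefore impossible. */
static void handle_demo_fault(struct pt_regs *regs, unsigned long address)
{
	/* a pt_regs is in hand, as at nearly every call site */
	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
}

static void demo_sched_hook(void)
{
	/* scheduler context, no regs available: use the _sched variant,
	 * which borrows __perf_regs[0] and fetches caller regs itself */
	perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
}
]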
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 139b5067345b..27609dfcce25 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -763,7 +763,7 @@ perf_trace_##call(void *__data, proto) \
 	struct ftrace_event_call *event_call = __data;		\
 	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
 	struct ftrace_raw_##call *entry;			\
-	struct pt_regs __regs;					\
+	struct pt_regs *__regs;					\
 	u64 __addr = 0, __count = 1;				\
 	struct task_struct *__task = NULL;			\
 	struct hlist_head *head;				\
@@ -782,18 +782,19 @@ perf_trace_##call(void *__data, proto) \
 			     sizeof(u64));			\
 	__entry_size -= sizeof(u32);				\
 								\
-	perf_fetch_caller_regs(&__regs);			\
 	entry = perf_trace_buf_prepare(__entry_size,		\
 			event_call->event.type, &__regs, &rctx);	\
 	if (!entry)						\
 		return;						\
 								\
+	perf_fetch_caller_regs(__regs);				\
+								\
 	tstruct							\
 								\
 	{ assign; }						\
 								\
 	perf_trace_buf_submit(entry, __entry_size, rctx, __addr,\
-		__count, &__regs, head, __task);		\
+		__count, __regs, head, __task);			\
 }
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 882f835a0d85..c10124b772c4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5889,6 +5889,8 @@ end:
 	rcu_read_unlock();
 }
 
+DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
+
 int perf_swevent_get_recursion_context(void)
 {
 	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
@@ -5904,21 +5906,30 @@ inline void perf_swevent_put_recursion_context(int rctx)
 	put_recursion_context(swhash->recursion, rctx);
 }
 
-void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
 	struct perf_sample_data data;
-	int rctx;
 
-	preempt_disable_notrace();
-	rctx = perf_swevent_get_recursion_context();
-	if (rctx < 0)
+	if (WARN_ON_ONCE(!regs))
 		return;
 
 	perf_sample_data_init(&data, addr, 0);
-
 	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
+}
+
+void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
+{
+	int rctx;
+
+	preempt_disable_notrace();
+	rctx = perf_swevent_get_recursion_context();
+	if (unlikely(rctx < 0))
+		goto fail;
+
+	___perf_sw_event(event_id, nr, regs, addr);
 
 	perf_swevent_put_recursion_context(rctx);
+fail:
 	preempt_enable_notrace();
 }
 
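
[Editor's note: a condensed view of the layering that results from the
split above; illustrative, not literal source:

/*
 *   perf_sw_event(id, nr, regs, addr)	regs now mandatory
 *     -> __perf_sw_event()		pins a recursion context,
 *          -> ___perf_sw_event()	raw emit: sample init +
 *					do_perf_sw_event()
 *
 *   perf_sw_event_sched(id, nr, addr)	scheduler only; skips the
 *     -> ___perf_sw_event()		guard, uses __perf_regs[0]
 */
]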
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c0accc00566e..d22fb16a7153 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1082,7 +1082,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
-		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+		perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
 	}
 
 	__set_task_cpu(p, new_cpu);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4b9c114ee9de..6fa484de2ba1 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags)
 }
 
 void *perf_trace_buf_prepare(int size, unsigned short type,
-			     struct pt_regs *regs, int *rctxp)
+			     struct pt_regs **regs, int *rctxp)
 {
 	struct trace_entry *entry;
 	unsigned long flags;
@@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type,
 	if (*rctxp < 0)
 		return NULL;
 
+	if (regs)
+		*regs = this_cpu_ptr(&__perf_regs[*rctxp]);
 	raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
 
 	/* zero the dead bytes from align to not leak stack to user */
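
[Editor's note: with the double-pointer API, a consumer that needs regs
receives the per-cpu slot keyed to the recursion context it was just
handed, while one that already has regs passes NULL, as the kprobe,
syscall and uprobe hunks below do. A hedged sketch of the two calling
patterns; the demo_* names and record layout are hypothetical:

#include <linux/ftrace_event.h>

struct demo_trace_entry {		/* hypothetical record layout */
	struct trace_entry	ent;
	unsigned long		ip;
};

static void demo_tracepoint_func(struct ftrace_event_call *call,
				 struct hlist_head *head, u64 addr)
{
	struct demo_trace_entry *entry;
	struct pt_regs *regs;
	int size = ALIGN(sizeof(*entry) + sizeof(u32), sizeof(u64)) - sizeof(u32);
	int rctx;

	/* no regs in hand: request a per-cpu slot via the double pointer */
	entry = perf_trace_buf_prepare(size, call->event.type, &regs, &rctx);
	if (!entry)
		return;

	/* the slot can only be filled *after* prepare() picked rctx,
	 * because rctx selects which __perf_regs[] element we got */
	perf_fetch_caller_regs(regs);
	/* ... assign entry fields ... */
	perf_trace_buf_submit(entry, size, rctx, addr, 1, regs, head, NULL);
}

static void demo_kprobe_func(struct ftrace_event_call *call,
			     struct hlist_head *head, struct pt_regs *regs)
{
	struct demo_trace_entry *entry;
	int size = ALIGN(sizeof(*entry) + sizeof(u32), sizeof(u64)) - sizeof(u32);
	int rctx;

	/* regs already available: pass NULL, no second set is needed */
	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
	if (!entry)
		return;

	/* ... assign entry fields ... */
	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
}
]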
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5edb518be345..296079ae6583 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
 	if (!entry)
 		return;
 
@@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
 	if (!entry)
 		return;
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c6ee36fcbf90..f97f6e3a676c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	size -= sizeof(u32);
 
 	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
-			sys_data->enter_event->event.type, regs, &rctx);
+			sys_data->enter_event->event.type, NULL, &rctx);
 	if (!rec)
 		return;
 
@@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	size -= sizeof(u32);
 
 	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
-			sys_data->exit_event->event.type, regs, &rctx);
+			sys_data->exit_event->event.type, NULL, &rctx);
 	if (!rec)
 		return;
 
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8520acc34b18..b11441321e7a 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 	if (hlist_empty(head))
 		goto out;
 
-	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
+	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
 	if (!entry)
 		goto out;
 