diff options
author | Peter Zijlstra (Intel) <peterz@infradead.org> | 2014-12-16 06:47:34 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2015-01-14 09:11:45 -0500 |
commit | 86038c5ea81b519a8a1fcfcd5e4599aab0cdd119 (patch) | |
tree | 52885be83b5062b95923e16c02f7137c45609a61 | |
parent | 188c901941efd43cbf21e8f4f9e9a276536b989c (diff) |
perf: Avoid horrible stack usage
Both Linus (most recent) and Steve (a while ago) reported that perf
related callbacks have massive stack bloat.
The problem is that software events need a pt_regs in order to
properly report the event location and unwind stack. And because we
could not assume one was present we allocated one on stack and filled
it with minimal bits required for operation.
Now, pt_regs is quite large, so this is undesirable. Furthermore it
turns out that most sites actually have a pt_regs pointer available,
making this even more onerous, as the stack space is pointless waste.
This patch addresses the problem by observing that software events
have well defined nesting semantics, therefore we can use static
per-cpu storage instead of on-stack.
Linus made the further observation that all but the scheduler callers
of perf_sw_event() have a pt_regs available, so we change the regular
perf_sw_event() to require a valid pt_regs (where it used to be
optional) and add perf_sw_event_sched() for the scheduler.
We have a scheduler specific call instead of a more generic _noregs()
like construct because we can assume non-recursion from the scheduler
and thereby simplify the code further (_noregs would have to put the
recursion context call inline in order to ascertain which __perf_regs
element to use).
One last note on the implementation of perf_trace_buf_prepare(); we
allow .regs = NULL for those cases where we already have a pt_regs
pointer available and do not need another.
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Javi Merino <javi.merino@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Petr Mladek <pmladek@suse.cz>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tom.zanussi@linux.intel.com>
Cc: Vaibhav Nagarnaik <vnagarnaik@google.com>
Link: http://lkml.kernel.org/r/20141216115041.GW3337@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- | include/linux/ftrace_event.h | 2 | ||||
-rw-r--r-- | include/linux/perf_event.h | 28 | ||||
-rw-r--r-- | include/trace/ftrace.h | 7 | ||||
-rw-r--r-- | kernel/events/core.c | 23 | ||||
-rw-r--r-- | kernel/sched/core.c | 2 | ||||
-rw-r--r-- | kernel/trace/trace_event_perf.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_kprobe.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_syscalls.c | 4 | ||||
-rw-r--r-- | kernel/trace/trace_uprobe.c | 2 |
9 files changed, 52 insertions, 24 deletions
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0bebb5c348b8..d36f68b08acc 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h | |||
@@ -595,7 +595,7 @@ extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
595 | char *filter_str); | 595 | char *filter_str); |
596 | extern void ftrace_profile_free_filter(struct perf_event *event); | 596 | extern void ftrace_profile_free_filter(struct perf_event *event); |
597 | extern void *perf_trace_buf_prepare(int size, unsigned short type, | 597 | extern void *perf_trace_buf_prepare(int size, unsigned short type, |
598 | struct pt_regs *regs, int *rctxp); | 598 | struct pt_regs **regs, int *rctxp); |
599 | 599 | ||
600 | static inline void | 600 | static inline void |
601 | perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, | 601 | perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, |
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f7a61ca4b39..3a7bd80b4db8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h | |||
@@ -665,6 +665,7 @@ static inline int is_software_event(struct perf_event *event) | |||
665 | 665 | ||
666 | extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; | 666 | extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; |
667 | 667 | ||
668 | extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); | ||
668 | extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); | 669 | extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); |
669 | 670 | ||
670 | #ifndef perf_arch_fetch_caller_regs | 671 | #ifndef perf_arch_fetch_caller_regs |
@@ -689,14 +690,25 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs) | |||
689 | static __always_inline void | 690 | static __always_inline void |
690 | perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | 691 | perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) |
691 | { | 692 | { |
692 | struct pt_regs hot_regs; | 693 | if (static_key_false(&perf_swevent_enabled[event_id])) |
694 | __perf_sw_event(event_id, nr, regs, addr); | ||
695 | } | ||
696 | |||
697 | DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); | ||
693 | 698 | ||
699 | /* | ||
700 | * 'Special' version for the scheduler, it hard assumes no recursion, | ||
701 | * which is guaranteed by us not actually scheduling inside other swevents | ||
702 | * because those disable preemption. | ||
703 | */ | ||
704 | static __always_inline void | ||
705 | perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) | ||
706 | { | ||
694 | if (static_key_false(&perf_swevent_enabled[event_id])) { | 707 | if (static_key_false(&perf_swevent_enabled[event_id])) { |
695 | if (!regs) { | 708 | struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); |
696 | perf_fetch_caller_regs(&hot_regs); | 709 | |
697 | regs = &hot_regs; | 710 | perf_fetch_caller_regs(regs); |
698 | } | 711 | ___perf_sw_event(event_id, nr, regs, addr); |
699 | __perf_sw_event(event_id, nr, regs, addr); | ||
700 | } | 712 | } |
701 | } | 713 | } |
702 | 714 | ||
@@ -712,7 +724,7 @@ static inline void perf_event_task_sched_in(struct task_struct *prev, | |||
712 | static inline void perf_event_task_sched_out(struct task_struct *prev, | 724 | static inline void perf_event_task_sched_out(struct task_struct *prev, |
713 | struct task_struct *next) | 725 | struct task_struct *next) |
714 | { | 726 | { |
715 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); | 727 | perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); |
716 | 728 | ||
717 | if (static_key_false(&perf_sched_events.key)) | 729 | if (static_key_false(&perf_sched_events.key)) |
718 | __perf_event_task_sched_out(prev, next); | 730 | __perf_event_task_sched_out(prev, next); |
@@ -823,6 +835,8 @@ static inline int perf_event_refresh(struct perf_event *event, int refresh) | |||
823 | static inline void | 835 | static inline void |
824 | perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } | 836 | perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } |
825 | static inline void | 837 | static inline void |
838 | perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { } | ||
839 | static inline void | ||
826 | perf_bp_event(struct perf_event *event, void *data) { } | 840 | perf_bp_event(struct perf_event *event, void *data) { } |
827 | 841 | ||
828 | static inline int perf_register_guest_info_callbacks | 842 | static inline int perf_register_guest_info_callbacks |
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 139b5067345b..27609dfcce25 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h | |||
@@ -763,7 +763,7 @@ perf_trace_##call(void *__data, proto) \ | |||
763 | struct ftrace_event_call *event_call = __data; \ | 763 | struct ftrace_event_call *event_call = __data; \ |
764 | struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ | 764 | struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ |
765 | struct ftrace_raw_##call *entry; \ | 765 | struct ftrace_raw_##call *entry; \ |
766 | struct pt_regs __regs; \ | 766 | struct pt_regs *__regs; \ |
767 | u64 __addr = 0, __count = 1; \ | 767 | u64 __addr = 0, __count = 1; \ |
768 | struct task_struct *__task = NULL; \ | 768 | struct task_struct *__task = NULL; \ |
769 | struct hlist_head *head; \ | 769 | struct hlist_head *head; \ |
@@ -782,18 +782,19 @@ perf_trace_##call(void *__data, proto) \ | |||
782 | sizeof(u64)); \ | 782 | sizeof(u64)); \ |
783 | __entry_size -= sizeof(u32); \ | 783 | __entry_size -= sizeof(u32); \ |
784 | \ | 784 | \ |
785 | perf_fetch_caller_regs(&__regs); \ | ||
786 | entry = perf_trace_buf_prepare(__entry_size, \ | 785 | entry = perf_trace_buf_prepare(__entry_size, \ |
787 | event_call->event.type, &__regs, &rctx); \ | 786 | event_call->event.type, &__regs, &rctx); \ |
788 | if (!entry) \ | 787 | if (!entry) \ |
789 | return; \ | 788 | return; \ |
790 | \ | 789 | \ |
790 | perf_fetch_caller_regs(__regs); \ | ||
791 | \ | ||
791 | tstruct \ | 792 | tstruct \ |
792 | \ | 793 | \ |
793 | { assign; } \ | 794 | { assign; } \ |
794 | \ | 795 | \ |
795 | perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \ | 796 | perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \ |
796 | __count, &__regs, head, __task); \ | 797 | __count, __regs, head, __task); \ |
797 | } | 798 | } |
798 | 799 | ||
799 | /* | 800 | /* |
diff --git a/kernel/events/core.c b/kernel/events/core.c index 882f835a0d85..c10124b772c4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
@@ -5889,6 +5889,8 @@ end: | |||
5889 | rcu_read_unlock(); | 5889 | rcu_read_unlock(); |
5890 | } | 5890 | } |
5891 | 5891 | ||
5892 | DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); | ||
5893 | |||
5892 | int perf_swevent_get_recursion_context(void) | 5894 | int perf_swevent_get_recursion_context(void) |
5893 | { | 5895 | { |
5894 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); | 5896 | struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); |
@@ -5904,21 +5906,30 @@ inline void perf_swevent_put_recursion_context(int rctx) | |||
5904 | put_recursion_context(swhash->recursion, rctx); | 5906 | put_recursion_context(swhash->recursion, rctx); |
5905 | } | 5907 | } |
5906 | 5908 | ||
5907 | void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | 5909 | void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) |
5908 | { | 5910 | { |
5909 | struct perf_sample_data data; | 5911 | struct perf_sample_data data; |
5910 | int rctx; | ||
5911 | 5912 | ||
5912 | preempt_disable_notrace(); | 5913 | if (WARN_ON_ONCE(!regs)) |
5913 | rctx = perf_swevent_get_recursion_context(); | ||
5914 | if (rctx < 0) | ||
5915 | return; | 5914 | return; |
5916 | 5915 | ||
5917 | perf_sample_data_init(&data, addr, 0); | 5916 | perf_sample_data_init(&data, addr, 0); |
5918 | |||
5919 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); | 5917 | do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); |
5918 | } | ||
5919 | |||
5920 | void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) | ||
5921 | { | ||
5922 | int rctx; | ||
5923 | |||
5924 | preempt_disable_notrace(); | ||
5925 | rctx = perf_swevent_get_recursion_context(); | ||
5926 | if (unlikely(rctx < 0)) | ||
5927 | goto fail; | ||
5928 | |||
5929 | ___perf_sw_event(event_id, nr, regs, addr); | ||
5920 | 5930 | ||
5921 | perf_swevent_put_recursion_context(rctx); | 5931 | perf_swevent_put_recursion_context(rctx); |
5932 | fail: | ||
5922 | preempt_enable_notrace(); | 5933 | preempt_enable_notrace(); |
5923 | } | 5934 | } |
5924 | 5935 | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c0accc00566e..d22fb16a7153 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1082,7 +1082,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1082 | if (p->sched_class->migrate_task_rq) | 1082 | if (p->sched_class->migrate_task_rq) |
1083 | p->sched_class->migrate_task_rq(p, new_cpu); | 1083 | p->sched_class->migrate_task_rq(p, new_cpu); |
1084 | p->se.nr_migrations++; | 1084 | p->se.nr_migrations++; |
1085 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); | 1085 | perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); |
1086 | } | 1086 | } |
1087 | 1087 | ||
1088 | __set_task_cpu(p, new_cpu); | 1088 | __set_task_cpu(p, new_cpu); |
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 4b9c114ee9de..6fa484de2ba1 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
@@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags) | |||
261 | } | 261 | } |
262 | 262 | ||
263 | void *perf_trace_buf_prepare(int size, unsigned short type, | 263 | void *perf_trace_buf_prepare(int size, unsigned short type, |
264 | struct pt_regs *regs, int *rctxp) | 264 | struct pt_regs **regs, int *rctxp) |
265 | { | 265 | { |
266 | struct trace_entry *entry; | 266 | struct trace_entry *entry; |
267 | unsigned long flags; | 267 | unsigned long flags; |
@@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type, | |||
280 | if (*rctxp < 0) | 280 | if (*rctxp < 0) |
281 | return NULL; | 281 | return NULL; |
282 | 282 | ||
283 | if (regs) | ||
284 | *regs = this_cpu_ptr(&__perf_regs[*rctxp]); | ||
283 | raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); | 285 | raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); |
284 | 286 | ||
285 | /* zero the dead bytes from align to not leak stack to user */ | 287 | /* zero the dead bytes from align to not leak stack to user */ |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5edb518be345..296079ae6583 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
@@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) | |||
1148 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1148 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
1149 | size -= sizeof(u32); | 1149 | size -= sizeof(u32); |
1150 | 1150 | ||
1151 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1151 | entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); |
1152 | if (!entry) | 1152 | if (!entry) |
1153 | return; | 1153 | return; |
1154 | 1154 | ||
@@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, | |||
1179 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | 1179 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); |
1180 | size -= sizeof(u32); | 1180 | size -= sizeof(u32); |
1181 | 1181 | ||
1182 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1182 | entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); |
1183 | if (!entry) | 1183 | if (!entry) |
1184 | return; | 1184 | return; |
1185 | 1185 | ||
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c6ee36fcbf90..f97f6e3a676c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
574 | size -= sizeof(u32); | 574 | size -= sizeof(u32); |
575 | 575 | ||
576 | rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, | 576 | rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, |
577 | sys_data->enter_event->event.type, regs, &rctx); | 577 | sys_data->enter_event->event.type, NULL, &rctx); |
578 | if (!rec) | 578 | if (!rec) |
579 | return; | 579 | return; |
580 | 580 | ||
@@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
647 | size -= sizeof(u32); | 647 | size -= sizeof(u32); |
648 | 648 | ||
649 | rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, | 649 | rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, |
650 | sys_data->exit_event->event.type, regs, &rctx); | 650 | sys_data->exit_event->event.type, NULL, &rctx); |
651 | if (!rec) | 651 | if (!rec) |
652 | return; | 652 | return; |
653 | 653 | ||
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8520acc34b18..b11441321e7a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
@@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, | |||
1111 | if (hlist_empty(head)) | 1111 | if (hlist_empty(head)) |
1112 | goto out; | 1112 | goto out; |
1113 | 1113 | ||
1114 | entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); | 1114 | entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); |
1115 | if (!entry) | 1115 | if (!entry) |
1116 | goto out; | 1116 | goto out; |
1117 | 1117 | ||