author     Frederic Weisbecker <fweisbec@gmail.com>    2009-08-06 19:25:54 -0400
committer  Ingo Molnar <mingo@elte.hu>                 2009-08-09 06:53:48 -0400
commit     f413cdb80ce00ec1a4d0ab949b5d96c81cae7f75
tree       08a9621cb1318f73a37faeed14c4e728408551ad
parent     3a6593050fbd8bbcaed3a44d01c31d907315c86c
perf_counter: Fix/complete ftrace event records sampling
This patch implements the kernel-side support for ftrace event record sampling.

A new counter sampling attribute is added:

    PERF_SAMPLE_TP_RECORD

which requests ftrace event record sampling. In this case, if a
PERF_TYPE_TRACEPOINT counter is active and a tracepoint fires, we emit the
tracepoint binary record to the perf counter event buffer, as a sample.
(A hedged sketch of requesting this attribute from userspace follows the
changelog below.)

Result, after setting the PERF_SAMPLE_TP_RECORD attribute from perf record:

    perf record -f -F 1 -a -e workqueue:workqueue_execution
    perf report -D

    0x21e18 [0x48]: event: 9
    .
    . ... raw event: size 72 bytes
    .  0000:  09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff  ......H........
    .  0010:  0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00  ........!......
    .  0020:  2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e  +...........eve
    .  0030:  74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00  ts/1...........
    .  0040:  e0 b1 31 81 ff ff ff ff                          .......
    .
    0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33

The raw ftrace binary record starts at offset 0020. Translation:

    struct trace_entry {
        type          = 0x2b = 43;
        flags         = 1;
        preempt_count = 2;
        pid           = 0xa = 10;
        tgid          = 0xa = 10;
    }

    thread_comm = "events/1"
    thread_pid  = 0xa = 10
    func        = 0xffffffff8131b1e0 = flush_to_ldisc()

What will come next?

 - Userspace support ('perf trace'), a 'flight data recorder' mode for
   perf trace, etc.

 - The unconditional copy from the profiling callback has some cost even
   when nobody wants such sampling; this needs to be fixed in the future.
   For that we need instant access to the perf counter attribute, which is
   a matter of adding a flag to struct ftrace_event.

 - Take care of event recursion! Don't ever try to record a lock event,
   for example: some locking is used in the profiling fast path and leads
   to tracing recursion. That will be fixed using a raw spinlock or
   recursion protection.

 - [...]

 - Profit! :-)

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
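For illustration only, not part of this patch: a minimal, hedged sketch of how a
userspace tool might request the new sampling mode, assuming the perf_counter ABI
of this kernel series (struct perf_counter_attr, the perf_counter_open syscall)
and a tracepoint id read from debugfs. The helper name is hypothetical and the
exact header/syscall names are assumptions of that era, not code from this commit.

/*
 * Hypothetical helper (not from this patch): open a PERF_TYPE_TRACEPOINT
 * counter and request the raw ftrace record via PERF_SAMPLE_TP_RECORD.
 * The tracepoint id is assumed to come from debugfs, e.g.
 * /sys/kernel/debug/tracing/events/workqueue/workqueue_execution/id.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

static int open_tracepoint_counter(uint64_t tracepoint_id, int cpu)
{
	struct perf_counter_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size          = sizeof(attr);
	attr.type          = PERF_TYPE_TRACEPOINT;
	attr.config        = tracepoint_id;	/* event id from debugfs */
	attr.sample_period = 1;			/* sample every occurrence */
	attr.sample_type   = PERF_SAMPLE_IP | PERF_SAMPLE_TID |
			     PERF_SAMPLE_PERIOD | PERF_SAMPLE_TP_RECORD;

	/* pid = -1, group_fd = -1, flags = 0: all tasks on the given cpu */
	return syscall(__NR_perf_counter_open, &attr, -1, cpu, -1, 0);
}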
-rw-r--r--  include/linux/ftrace_event.h  |   4
-rw-r--r--  include/linux/perf_counter.h  |   9
-rw-r--r--  include/trace/ftrace.h        | 130
-rw-r--r--  kernel/perf_counter.c         |  18
-rw-r--r--  kernel/trace/trace.c          |   1
-rw-r--r--  kernel/trace/trace.h          |   4
-rw-r--r--  tools/perf/builtin-record.c   |   1
7 files changed, 126 insertions(+), 41 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index d7cd193c2277..a81170de7f6b 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -89,7 +89,9 @@ enum print_line_t {
 	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
 };
 
-
+void tracing_generic_entry_update(struct trace_entry *entry,
+				  unsigned long flags,
+				  int pc);
 struct ring_buffer_event *
 trace_current_buffer_lock_reserve(int type, unsigned long len,
 				  unsigned long flags, int pc);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e604e6ef72dd..a67dd5c5b6d3 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -121,8 +121,9 @@ enum perf_counter_sample_format {
 	PERF_SAMPLE_CPU				= 1U << 7,
 	PERF_SAMPLE_PERIOD			= 1U << 8,
 	PERF_SAMPLE_STREAM_ID			= 1U << 9,
+	PERF_SAMPLE_TP_RECORD			= 1U << 10,
 
-	PERF_SAMPLE_MAX = 1U << 10,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 11,		/* non-ABI */
 };
 
 /*
@@ -413,6 +414,11 @@ struct perf_callchain_entry {
 	__u64				ip[PERF_MAX_STACK_DEPTH];
 };
 
+struct perf_tracepoint_record {
+	int				size;
+	char				*record;
+};
+
 struct task_struct;
 
 /**
@@ -681,6 +687,7 @@ struct perf_sample_data {
 	struct pt_regs			*regs;
 	u64				addr;
 	u64				period;
+	void				*private;
 };
 
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index fec71f8dbc48..7fb16d90e7b1 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -353,15 +353,7 @@ static inline int ftrace_get_offsets_##call( \
 /*
  * Generate the functions needed for tracepoint perf_counter support.
  *
- * static void ftrace_profile_<call>(proto)
- * {
- *	extern void perf_tpcounter_event(int, u64, u64);
- *	u64 __addr = 0, __count = 1;
- *
- *	<assign>  <-- here we expand the TP_perf_assign() macro
- *
- *	perf_tpcounter_event(event_<call>.id, __addr, __count);
- * }
+ * NOTE: The insertion profile callback (ftrace_profile_<call>) is defined later
  *
  * static int ftrace_profile_enable_<call>(struct ftrace_event_call *event_call)
  * {
@@ -381,28 +373,10 @@ static inline int ftrace_get_offsets_##call( \
  *
  */
 
-#undef TP_fast_assign
-#define TP_fast_assign(args...)
-
-#undef TP_perf_assign
-#define TP_perf_assign(args...) args
-
-#undef __perf_addr
-#define __perf_addr(a) __addr = (a)
-
-#undef __perf_count
-#define __perf_count(c) __count = (c)
-
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 									\
-static void ftrace_profile_##call(proto)				\
-{									\
-	extern void perf_tpcounter_event(int, u64, u64);		\
-	u64 __addr = 0, __count = 1;					\
-	{ assign; }							\
-	perf_tpcounter_event(event_##call.id, __addr, __count);	\
-}									\
+static void ftrace_profile_##call(proto);				\
 									\
 static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
 {									\
@@ -422,12 +396,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
-#undef TP_fast_assign
-#define TP_fast_assign(args...) args
-
-#undef TP_perf_assign
-#define TP_perf_assign(args...)
-
 #endif
 
 /*
@@ -647,5 +615,99 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
+/*
+ * Define the insertion callback to profile events
+ *
+ * The job is very similar to ftrace_raw_event_<call> except that we don't
+ * insert in the ring buffer but in a perf counter.
+ *
+ * static void ftrace_profile_<call>(proto)
+ * {
+ *	struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
+ *	struct ftrace_event_call *event_call = &event_<call>;
+ *	extern void perf_tpcounter_event(int, u64, u64, void *, int);
+ *	struct ftrace_raw_##call *entry;
+ *	u64 __addr = 0, __count = 1;
+ *	unsigned long irq_flags;
+ *	int __entry_size;
+ *	int __data_size;
+ *	int pc;
+ *
+ *	local_save_flags(irq_flags);
+ *	pc = preempt_count();
+ *
+ *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
+ *	__entry_size = __data_size + sizeof(*entry);
+ *
+ *	do {
+ *		char raw_data[__entry_size]; <- allocate our sample in the stack
+ *		struct trace_entry *ent;
+ *
+ *		entry = (struct ftrace_raw_<call> *)raw_data;
+ *		ent = &entry->ent;
+ *		tracing_generic_entry_update(ent, irq_flags, pc);
+ *		ent->type = event_call->id;
+ *
+ *		<tstruct> <- do some jobs with dynamic arrays
+ *
+ *		<assign>  <- affect our values
+ *
+ *		perf_tpcounter_event(event_call->id, __addr, __count, entry,
+ *			     __entry_size);  <- submit them to perf counter
+ *	} while (0);
+ *
+ * }
+ */
+
+#ifdef CONFIG_EVENT_PROFILE
+
+#undef __perf_addr
+#define __perf_addr(a) __addr = (a)
+
+#undef __perf_count
+#define __perf_count(c) __count = (c)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+static void ftrace_profile_##call(proto)				\
+{									\
+	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	extern void perf_tpcounter_event(int, u64, u64, void *, int);	\
+	struct ftrace_raw_##call *entry;				\
+	u64 __addr = 0, __count = 1;					\
+	unsigned long irq_flags;					\
+	int __entry_size;						\
+	int __data_size;						\
+	int pc;								\
+									\
+	local_save_flags(irq_flags);					\
+	pc = preempt_count();						\
+									\
+	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
+	__entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\
+									\
+	do {								\
+		char raw_data[__entry_size];				\
+		struct trace_entry *ent;				\
+									\
+		entry = (struct ftrace_raw_##call *)raw_data;		\
+		ent = &entry->ent;					\
+		tracing_generic_entry_update(ent, irq_flags, pc);	\
+		ent->type = event_call->id;				\
+									\
+		tstruct							\
+									\
+		{ assign; }						\
+									\
+		perf_tpcounter_event(event_call->id, __addr, __count, entry,\
+			     __entry_size);				\
+	} while (0);							\
+									\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+#endif /* CONFIG_EVENT_PROFILE */
+
 #undef _TRACE_PROFILE_INIT
 
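One detail worth flagging in the hunk above: the pseudo-expansion in the comment
computes __entry_size as __data_size + sizeof(*entry), while the real macro rounds
that sum up with ALIGN(..., sizeof(u64)), so the record size added to the perf
sample stays a multiple of u64 for the output copy. The following is a standalone
worked example of that rounding with hypothetical sizes, not kernel code.

#include <stdio.h>

/* Same power-of-two round-up the kernel's ALIGN() performs. */
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long data_size  = 9;	/* hypothetical dynamic-array payload  */
	unsigned long entry_size = 24;	/* hypothetical sizeof(ftrace_raw_...) */

	/* 9 + 24 = 33 bytes, rounded up to 40 so the record is u64-aligned */
	printf("%lu\n", (unsigned long)ALIGN(data_size + entry_size,
					     sizeof(unsigned long long)));
	return 0;
}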
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 52eb4b68d34f..868102172aa4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2646,6 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		u64			counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
+	struct perf_tracepoint_record *tp;
 	int callchain_size = 0;
 	u64 time;
 	struct {
@@ -2714,6 +2715,11 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		header.size += sizeof(u64);
 	}
 
+	if (sample_type & PERF_SAMPLE_TP_RECORD) {
+		tp = data->private;
+		header.size += tp->size;
+	}
+
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
 	if (ret)
 		return;
@@ -2777,6 +2783,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		}
 	}
 
+	if (sample_type & PERF_SAMPLE_TP_RECORD)
+		perf_output_copy(&handle, tp->record, tp->size);
+
 	perf_output_end(&handle);
 }
 
@@ -3703,11 +3712,18 @@ static const struct pmu perf_ops_task_clock = {
 };
 
 #ifdef CONFIG_EVENT_PROFILE
-void perf_tpcounter_event(int event_id, u64 addr, u64 count)
+void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
+			  int entry_size)
 {
+	struct perf_tracepoint_record tp = {
+		.size = entry_size,
+		.record = record,
+	};
+
 	struct perf_sample_data data = {
 		.regs = get_irq_regs(),
 		.addr = addr,
+		.private = &tp,
 	};
 
 	if (!data.regs)
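As a reader-side illustration, not part of this patch: the bytes that
perf_output_copy() appends above are exactly the raw record that 'perf report -D'
dumps in the changelog, starting at offset 0020. Below is a hedged sketch of
decoding its leading common header; the field widths are assumptions inferred from
the byte-level translation given there, not taken from a kernel header.

#include <stdint.h>
#include <stdio.h>

/*
 * Assumed layout of the common tracing header, matching the translation in
 * the changelog (type as u16, two u8 fields, then pid/tgid as s32).
 */
struct sample_trace_entry {
	uint16_t type;
	uint8_t  flags;
	uint8_t  preempt_count;
	int32_t  pid;
	int32_t  tgid;
} __attribute__((packed));

int main(void)
{
	/* Bytes from offset 0020 of the dump in the changelog. */
	const uint8_t raw[] = {
		0x2b, 0x00, 0x01, 0x02, 0x0a, 0x00, 0x00, 0x00,
		0x0a, 0x00, 0x00, 0x00,
	};
	const struct sample_trace_entry *ent = (const void *)raw;

	/* Prints: type=43 flags=1 preempt_count=2 pid=10 tgid=10 */
	printf("type=%u flags=%u preempt_count=%u pid=%d tgid=%d\n",
	       (unsigned)ent->type, (unsigned)ent->flags,
	       (unsigned)ent->preempt_count, ent->pid, ent->tgid);
	return 0;
}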
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8930e39b9d8c..c22b40f8f576 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
 		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
 }
+EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
 						    int type,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3548ae5cc780..8b9f4f6e9559 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
 					  int *ent_cpu, u64 *ent_ts);
 
-void tracing_generic_entry_update(struct trace_entry *entry,
-				  unsigned long flags,
-				  int pc);
-
 void default_wait_pipe(struct trace_iterator *iter);
 void poll_wait_pipe(struct trace_iterator *iter);
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 6da09928130f..90c98082af10 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -412,6 +412,7 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	if (call_graph)
 		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
 
+
 	attr->mmap		= track;
 	attr->comm		= track;
 	attr->inherit		= (cpu < 0) && inherit;