about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Frederic Weisbecker <fweisbec@gmail.com> 2009-09-18 00:10:28 -0400
committer: Frederic Weisbecker <fweisbec@gmail.com> 2009-09-18 01:25:44 -0400
commit: 20ab4425a77a1f34028cc6ce57053c22c184ba5f (patch)
tree: ca821b19593c3821fa13a520201537ad35e4c98d
parent: e5e25cf47b0bdd1f7e9b8bb6368ee48e16de0c87 (diff)
tracing: Allocate the ftrace event profile buffer dynamically
Currently the trace event profile buffer is allocated on the stack. But this may be too much for the stack, as the events can have large statically defined field sizes and can also grow with dynamic arrays. Allocate two per-cpu buffers shared by all profiled events. The first per-cpu buffer hosts every non-NMI context trace. It is protected by disabling interrupts while writing and committing the trace. The second buffer is reserved for NMI context, so that there is no race between NMI traces and the first buffer. The whole write/commit section is RCU protected because we release these buffers when deactivating the last profiling trace event. v2: Move the buffers from trace_event to be global, as pointed out by Steven Rostedt. v3: Fix the syscall events to handle the profiling buffer races by disabling interrupts, now that the buffers are global. Suggested-by: Steven Rostedt <rostedt@goodmis.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Jason Baron <jbaron@redhat.com> Cc: Masami Hiramatsu <mhiramat@redhat.com> Cc: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- include/linux/ftrace_event.h | 6
-rw-r--r-- include/trace/ftrace.h | 83
-rw-r--r-- kernel/trace/trace_event_profile.c | 61
-rw-r--r-- kernel/trace/trace_syscalls.c | 97
4 files changed, 199 insertions, 48 deletions
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index bc103d7b1ca8..4ec5e67e18cf 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -4,6 +4,7 @@
4#include <linux/ring_buffer.h> 4#include <linux/ring_buffer.h>
5#include <linux/trace_seq.h> 5#include <linux/trace_seq.h>
6#include <linux/percpu.h> 6#include <linux/percpu.h>
7#include <linux/hardirq.h>
7 8
8struct trace_array; 9struct trace_array;
9struct tracer; 10struct tracer;
@@ -134,6 +135,11 @@ struct ftrace_event_call {
134 void (*profile_disable)(void); 135 void (*profile_disable)(void);
135}; 136};
136 137
138#define FTRACE_MAX_PROFILE_SIZE 2048
139
140extern char *trace_profile_buf;
141extern char *trace_profile_buf_nmi;
142
137#define MAX_FILTER_PRED 32 143#define MAX_FILTER_PRED 32
138#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ 144#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */
139 145
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index a822087857e9..a0361cb69769 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -648,11 +648,12 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
648 * struct ftrace_raw_##call *entry; 648 * struct ftrace_raw_##call *entry;
649 * u64 __addr = 0, __count = 1; 649 * u64 __addr = 0, __count = 1;
650 * unsigned long irq_flags; 650 * unsigned long irq_flags;
651 * struct trace_entry *ent;
651 * int __entry_size; 652 * int __entry_size;
652 * int __data_size; 653 * int __data_size;
654 * int __cpu;
653 * int pc; 655 * int pc;
654 * 656 *
655 * local_save_flags(irq_flags);
656 * pc = preempt_count(); 657 * pc = preempt_count();
657 * 658 *
658 * __data_size = ftrace_get_offsets_<call>(&__data_offsets, args); 659 * __data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
@@ -663,25 +664,34 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
663 * sizeof(u64)); 664 * sizeof(u64));
664 * __entry_size -= sizeof(u32); 665 * __entry_size -= sizeof(u32);
665 * 666 *
666 * do { 667 * // Protect the non nmi buffer
667 * char raw_data[__entry_size]; <- allocate our sample in the stack 668 * // This also protects the rcu read side
668 * struct trace_entry *ent; 669 * local_irq_save(irq_flags);
670 * __cpu = smp_processor_id();
671 *
672 * if (in_nmi())
673 * raw_data = rcu_dereference(trace_profile_buf_nmi);
674 * else
675 * raw_data = rcu_dereference(trace_profile_buf);
676 *
677 * if (!raw_data)
678 * goto end;
669 * 679 *
670 * zero dead bytes from alignment to avoid stack leak to userspace: 680 * raw_data = per_cpu_ptr(raw_data, __cpu);
671 * 681 *
672 * *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; 682 * //zero dead bytes from alignment to avoid stack leak to userspace:
673 * entry = (struct ftrace_raw_<call> *)raw_data; 683 * *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
674 * ent = &entry->ent; 684 * entry = (struct ftrace_raw_<call> *)raw_data;
675 * tracing_generic_entry_update(ent, irq_flags, pc); 685 * ent = &entry->ent;
676 * ent->type = event_call->id; 686 * tracing_generic_entry_update(ent, irq_flags, pc);
687 * ent->type = event_call->id;
677 * 688 *
678 * <tstruct> <- do some jobs with dynamic arrays 689 * <tstruct> <- do some jobs with dynamic arrays
679 * 690 *
680 * <assign> <- affect our values 691 * <assign> <- affect our values
681 * 692 *
682 * perf_tpcounter_event(event_call->id, __addr, __count, entry, 693 * perf_tpcounter_event(event_call->id, __addr, __count, entry,
683 * __entry_size); <- submit them to perf counter 694 * __entry_size); <- submit them to perf counter
684 * } while (0);
685 * 695 *
686 * } 696 * }
687 */ 697 */
@@ -704,11 +714,13 @@ static void ftrace_profile_##call(proto) \
704 struct ftrace_raw_##call *entry; \ 714 struct ftrace_raw_##call *entry; \
705 u64 __addr = 0, __count = 1; \ 715 u64 __addr = 0, __count = 1; \
706 unsigned long irq_flags; \ 716 unsigned long irq_flags; \
717 struct trace_entry *ent; \
707 int __entry_size; \ 718 int __entry_size; \
708 int __data_size; \ 719 int __data_size; \
720 char *raw_data; \
721 int __cpu; \
709 int pc; \ 722 int pc; \
710 \ 723 \
711 local_save_flags(irq_flags); \
712 pc = preempt_count(); \ 724 pc = preempt_count(); \
713 \ 725 \
714 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ 726 __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
@@ -716,23 +728,38 @@ static void ftrace_profile_##call(proto) \
716 sizeof(u64)); \ 728 sizeof(u64)); \
717 __entry_size -= sizeof(u32); \ 729 __entry_size -= sizeof(u32); \
718 \ 730 \
719 do { \ 731 if (WARN_ONCE(__entry_size > FTRACE_MAX_PROFILE_SIZE, \
720 char raw_data[__entry_size]; \ 732 "profile buffer not large enough")) \
721 struct trace_entry *ent; \ 733 return; \
734 \
735 local_irq_save(irq_flags); \
736 __cpu = smp_processor_id(); \
722 \ 737 \
723 *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \ 738 if (in_nmi()) \
724 entry = (struct ftrace_raw_##call *)raw_data; \ 739 raw_data = rcu_dereference(trace_profile_buf_nmi); \
725 ent = &entry->ent; \ 740 else \
726 tracing_generic_entry_update(ent, irq_flags, pc); \ 741 raw_data = rcu_dereference(trace_profile_buf); \
727 ent->type = event_call->id; \
728 \ 742 \
729 tstruct \ 743 if (!raw_data) \
744 goto end; \
730 \ 745 \
731 { assign; } \ 746 raw_data = per_cpu_ptr(raw_data, __cpu); \
732 \ 747 \
733 perf_tpcounter_event(event_call->id, __addr, __count, entry,\ 748 *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \
749 entry = (struct ftrace_raw_##call *)raw_data; \
750 ent = &entry->ent; \
751 tracing_generic_entry_update(ent, irq_flags, pc); \
752 ent->type = event_call->id; \
753 \
754 tstruct \
755 \
756 { assign; } \
757 \
758 perf_tpcounter_event(event_call->id, __addr, __count, entry, \
734 __entry_size); \ 759 __entry_size); \
735 } while (0); \ 760 \
761end: \
762 local_irq_restore(irq_flags); \
736 \ 763 \
737} 764}
738 765
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index df4a74efd50c..3aaa77c3309b 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,12 +8,52 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include "trace.h" 9#include "trace.h"
10 10
11/*
12 * We can't use a size but a type in alloc_percpu()
13 * So let's create a dummy type that matches the desired size
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16
17char *trace_profile_buf;
18char *trace_profile_buf_nmi;
19
20/* Count the events in use (per event id, not per instance) */
21static int total_profile_count;
22
11static int ftrace_profile_enable_event(struct ftrace_event_call *event) 23static int ftrace_profile_enable_event(struct ftrace_event_call *event)
12{ 24{
25 char *buf;
26 int ret = -ENOMEM;
27
13 if (atomic_inc_return(&event->profile_count)) 28 if (atomic_inc_return(&event->profile_count))
14 return 0; 29 return 0;
15 30
16 return event->profile_enable(); 31 if (!total_profile_count++) {
32 buf = (char *)alloc_percpu(profile_buf_t);
33 if (!buf)
34 goto fail_buf;
35
36 rcu_assign_pointer(trace_profile_buf, buf);
37
38 buf = (char *)alloc_percpu(profile_buf_t);
39 if (!buf)
40 goto fail_buf_nmi;
41
42 rcu_assign_pointer(trace_profile_buf_nmi, buf);
43 }
44
45 ret = event->profile_enable();
46 if (!ret)
47 return 0;
48
49 kfree(trace_profile_buf_nmi);
50fail_buf_nmi:
51 kfree(trace_profile_buf);
52fail_buf:
53 total_profile_count--;
54 atomic_dec(&event->profile_count);
55
56 return ret;
17} 57}
18 58
19int ftrace_profile_enable(int event_id) 59int ftrace_profile_enable(int event_id)
@@ -36,10 +76,29 @@ int ftrace_profile_enable(int event_id)
36 76
37static void ftrace_profile_disable_event(struct ftrace_event_call *event) 77static void ftrace_profile_disable_event(struct ftrace_event_call *event)
38{ 78{
79 char *buf, *nmi_buf;
80
39 if (!atomic_add_negative(-1, &event->profile_count)) 81 if (!atomic_add_negative(-1, &event->profile_count))
40 return; 82 return;
41 83
42 event->profile_disable(); 84 event->profile_disable();
85
86 if (!--total_profile_count) {
87 buf = trace_profile_buf;
88 rcu_assign_pointer(trace_profile_buf, NULL);
89
90 nmi_buf = trace_profile_buf_nmi;
91 rcu_assign_pointer(trace_profile_buf_nmi, NULL);
92
93 /*
94 * Ensure every events in profiling have finished before
95 * releasing the buffers
96 */
97 synchronize_sched();
98
99 free_percpu(buf);
100 free_percpu(nmi_buf);
101 }
43} 102}
44 103
45void ftrace_profile_disable(int event_id) 104void ftrace_profile_disable(int event_id)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8712ce3c6a0e..7a3550cf2597 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -384,10 +384,13 @@ static int sys_prof_refcount_exit;
384 384
385static void prof_syscall_enter(struct pt_regs *regs, long id) 385static void prof_syscall_enter(struct pt_regs *regs, long id)
386{ 386{
387 struct syscall_trace_enter *rec;
388 struct syscall_metadata *sys_data; 387 struct syscall_metadata *sys_data;
388 struct syscall_trace_enter *rec;
389 unsigned long flags;
390 char *raw_data;
389 int syscall_nr; 391 int syscall_nr;
390 int size; 392 int size;
393 int cpu;
391 394
392 syscall_nr = syscall_get_nr(current, regs); 395 syscall_nr = syscall_get_nr(current, regs);
393 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 396 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
402 size = ALIGN(size + sizeof(u32), sizeof(u64)); 405 size = ALIGN(size + sizeof(u32), sizeof(u64));
403 size -= sizeof(u32); 406 size -= sizeof(u32);
404 407
405 do { 408 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
406 char raw_data[size]; 409 "profile buffer not large enough"))
410 return;
411
412 /* Protect the per cpu buffer, begin the rcu read side */
413 local_irq_save(flags);
407 414
408 /* zero the dead bytes from align to not leak stack to user */ 415 cpu = smp_processor_id();
409 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 416
417 if (in_nmi())
418 raw_data = rcu_dereference(trace_profile_buf_nmi);
419 else
420 raw_data = rcu_dereference(trace_profile_buf);
421
422 if (!raw_data)
423 goto end;
410 424
411 rec = (struct syscall_trace_enter *) raw_data; 425 raw_data = per_cpu_ptr(raw_data, cpu);
412 tracing_generic_entry_update(&rec->ent, 0, 0); 426
413 rec->ent.type = sys_data->enter_id; 427 /* zero the dead bytes from align to not leak stack to user */
414 rec->nr = syscall_nr; 428 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
415 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 429
416 (unsigned long *)&rec->args); 430 rec = (struct syscall_trace_enter *) raw_data;
417 perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); 431 tracing_generic_entry_update(&rec->ent, 0, 0);
418 } while(0); 432 rec->ent.type = sys_data->enter_id;
433 rec->nr = syscall_nr;
434 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
435 (unsigned long *)&rec->args);
436 perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
437
438end:
439 local_irq_restore(flags);
419} 440}
420 441
421int reg_prof_syscall_enter(char *name) 442int reg_prof_syscall_enter(char *name)
@@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name)
460static void prof_syscall_exit(struct pt_regs *regs, long ret) 481static void prof_syscall_exit(struct pt_regs *regs, long ret)
461{ 482{
462 struct syscall_metadata *sys_data; 483 struct syscall_metadata *sys_data;
463 struct syscall_trace_exit rec; 484 struct syscall_trace_exit *rec;
485 unsigned long flags;
464 int syscall_nr; 486 int syscall_nr;
487 char *raw_data;
488 int size;
489 int cpu;
465 490
466 syscall_nr = syscall_get_nr(current, regs); 491 syscall_nr = syscall_get_nr(current, regs);
467 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 492 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
471 if (!sys_data) 496 if (!sys_data)
472 return; 497 return;
473 498
474 tracing_generic_entry_update(&rec.ent, 0, 0); 499 /* We can probably do that at build time */
475 rec.ent.type = sys_data->exit_id; 500 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
476 rec.nr = syscall_nr; 501 size -= sizeof(u32);
477 rec.ret = syscall_get_return_value(current, regs);
478 502
479 perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); 503 /*
504 * Impossible, but be paranoid with the future
505 * How to put this check outside runtime?
506 */
507 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
508 "exit event has grown above profile buffer size"))
509 return;
510
511 /* Protect the per cpu buffer, begin the rcu read side */
512 local_irq_save(flags);
513 cpu = smp_processor_id();
514
515 if (in_nmi())
516 raw_data = rcu_dereference(trace_profile_buf_nmi);
517 else
518 raw_data = rcu_dereference(trace_profile_buf);
519
520 if (!raw_data)
521 goto end;
522
523 raw_data = per_cpu_ptr(raw_data, cpu);
524
525 /* zero the dead bytes from align to not leak stack to user */
526 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
527
528 rec = (struct syscall_trace_exit *)raw_data;
529
530 tracing_generic_entry_update(&rec->ent, 0, 0);
531 rec->ent.type = sys_data->exit_id;
532 rec->nr = syscall_nr;
533 rec->ret = syscall_get_return_value(current, regs);
534
535 perf_tpcounter_event(sys_data->exit_id, 0, 1, rec, size);
536
537end:
538 local_irq_restore(flags);
480} 539}
481 540
482int reg_prof_syscall_exit(char *name) 541int reg_prof_syscall_exit(char *name)