-rw-r--r--	include/linux/ftrace_event.h       |  6
-rw-r--r--	include/trace/ftrace.h             | 83
-rw-r--r--	kernel/trace/trace_event_profile.c | 61
-rw-r--r--	kernel/trace/trace_syscalls.c      | 97
4 files changed, 199 insertions(+), 48 deletions(-)
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index bc103d7b1ca8..4ec5e67e18cf 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -4,6 +4,7 @@
 #include <linux/ring_buffer.h>
 #include <linux/trace_seq.h>
 #include <linux/percpu.h>
+#include <linux/hardirq.h>
 
 struct trace_array;
 struct tracer;
@@ -134,6 +135,11 @@ struct ftrace_event_call {
 	void			(*profile_disable)(void);
 };
 
+#define FTRACE_MAX_PROFILE_SIZE	2048
+
+extern char *trace_profile_buf;
+extern char *trace_profile_buf_nmi;
+
 #define MAX_FILTER_PRED		32
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
 
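Note on the interface added above: trace_profile_buf and trace_profile_buf_nmi are RCU-managed pointers to per-cpu buffers of FTRACE_MAX_PROFILE_SIZE bytes, and every profiling handler in this patch repeats the same lookup to reach its CPU's slot. A minimal sketch of that lookup as a standalone helper (get_profile_buf() is a hypothetical name, not part of the patch; the caller must already have interrupts disabled):

#include <linux/ftrace_event.h>
#include <linux/hardirq.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>

/*
 * Fetch this CPU's profile buffer. A separate NMI buffer is needed
 * because an NMI can fire while this CPU is still writing the normal
 * buffer; two writers on one buffer would corrupt the record.
 * Returns NULL when profiling is not active.
 */
static char *get_profile_buf(void)
{
	char *buf;

	if (in_nmi())
		buf = rcu_dereference(trace_profile_buf_nmi);
	else
		buf = rcu_dereference(trace_profile_buf);
	if (!buf)
		return NULL;

	return per_cpu_ptr(buf, smp_processor_id());
}
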
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index a822087857e9..a0361cb69769 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -648,11 +648,12 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
  *	struct ftrace_raw_##call *entry;
  *	u64 __addr = 0, __count = 1;
  *	unsigned long irq_flags;
+ *	struct trace_entry *ent;
  *	int __entry_size;
  *	int __data_size;
+ *	int __cpu;
  *	int pc;
  *
- *	local_save_flags(irq_flags);
  *	pc = preempt_count();
  *
  *	__data_size = ftrace_get_offsets_<call>(&__data_offsets, args);
@@ -663,25 +664,34 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
  *			     sizeof(u64));
  *	__entry_size -= sizeof(u32);
  *
- *	do {
- *		char raw_data[__entry_size]; <- allocate our sample in the stack
- *		struct trace_entry *ent;
+ *	// Protect the non-NMI buffer
+ *	// This also protects the rcu read side
+ *	local_irq_save(irq_flags);
+ *	__cpu = smp_processor_id();
+ *
+ *	if (in_nmi())
+ *		raw_data = rcu_dereference(trace_profile_buf_nmi);
+ *	else
+ *		raw_data = rcu_dereference(trace_profile_buf);
+ *
+ *	if (!raw_data)
+ *		goto end;
  *
- *		zero dead bytes from alignment to avoid stack leak to userspace:
+ *	raw_data = per_cpu_ptr(raw_data, __cpu);
  *
- *		*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
- *		entry = (struct ftrace_raw_<call> *)raw_data;
- *		ent = &entry->ent;
- *		tracing_generic_entry_update(ent, irq_flags, pc);
- *		ent->type = event_call->id;
+ *	// zero dead bytes from alignment to avoid stack leak to userspace
+ *	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;
+ *	entry = (struct ftrace_raw_<call> *)raw_data;
+ *	ent = &entry->ent;
+ *	tracing_generic_entry_update(ent, irq_flags, pc);
+ *	ent->type = event_call->id;
  *
  *	<tstruct> <- do some jobs with dynamic arrays
  *
  *	<assign>  <- affect our values
  *
  *	perf_tpcounter_event(event_call->id, __addr, __count, entry,
  *			     __entry_size);  <- submit them to perf counter
- *	} while (0);
  *
  * }
  */
@@ -704,11 +714,13 @@ static void ftrace_profile_##call(proto)			\
 	struct ftrace_raw_##call *entry;				\
 	u64 __addr = 0, __count = 1;					\
 	unsigned long irq_flags;					\
+	struct trace_entry *ent;					\
 	int __entry_size;						\
 	int __data_size;						\
+	char *raw_data;							\
+	int __cpu;							\
 	int pc;								\
 									\
-	local_save_flags(irq_flags);					\
 	pc = preempt_count();						\
 									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
@@ -716,23 +728,38 @@ static void ftrace_profile_##call(proto)			\
 			     sizeof(u64));				\
 	__entry_size -= sizeof(u32);					\
 									\
-	do {								\
-		char raw_data[__entry_size];				\
-		struct trace_entry *ent;				\
+	if (WARN_ONCE(__entry_size > FTRACE_MAX_PROFILE_SIZE,		\
+		      "profile buffer not large enough"))		\
+		return;							\
+									\
+	local_irq_save(irq_flags);					\
+	__cpu = smp_processor_id();					\
 									\
-		*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \
-		entry = (struct ftrace_raw_##call *)raw_data;		\
-		ent = &entry->ent;					\
-		tracing_generic_entry_update(ent, irq_flags, pc);	\
-		ent->type = event_call->id;				\
+	if (in_nmi())							\
+		raw_data = rcu_dereference(trace_profile_buf_nmi);	\
+	else								\
+		raw_data = rcu_dereference(trace_profile_buf);		\
 									\
-		tstruct							\
+	if (!raw_data)							\
+		goto end;						\
 									\
-		{ assign; }						\
+	raw_data = per_cpu_ptr(raw_data, __cpu);			\
 									\
-		perf_tpcounter_event(event_call->id, __addr, __count, entry,\
+	*(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL;	\
+	entry = (struct ftrace_raw_##call *)raw_data;			\
+	ent = &entry->ent;						\
+	tracing_generic_entry_update(ent, irq_flags, pc);		\
+	ent->type = event_call->id;					\
+									\
+	tstruct								\
+									\
+	{ assign; }							\
+									\
+	perf_tpcounter_event(event_call->id, __addr, __count, entry,	\
 			     __entry_size);				\
-	} while (0);							\
+									\
+end:									\
+	local_irq_restore(irq_flags);					\
 									\
 }
 
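Why local_irq_save() is the only lock here: disabling interrupts pins the handler to its CPU (keeping per_cpu_ptr() stable), makes the non-NMI buffer single-writer per CPU, and doubles as a sched-RCU read-side critical section, which is what the synchronize_sched() in trace_event_profile.c below pairs with. A condensed sketch of what the generated handler looks like after this patch, for a hypothetical fixed-size event foo carrying one u64, reusing the hypothetical get_profile_buf() helper above (illustrative only; the real expansion also handles dynamic arrays through <tstruct>):

/* Hypothetical record layout, standing in for struct ftrace_raw_<call>. */
struct foo_raw {
	struct trace_entry	ent;
	u64			value;
};

static void ftrace_profile_foo(int event_id, u64 value)
{
	unsigned long irq_flags;
	struct foo_raw *entry;
	char *raw_data;
	int entry_size;

	/* room for perf's u32 size header, rounded up to u64 alignment */
	entry_size = ALIGN(sizeof(*entry) + sizeof(u32), sizeof(u64));
	entry_size -= sizeof(u32);

	if (WARN_ONCE(entry_size > FTRACE_MAX_PROFILE_SIZE,
		      "profile buffer not large enough"))
		return;

	local_irq_save(irq_flags);		/* sched-RCU read side */

	raw_data = get_profile_buf();		/* hypothetical helper */
	if (!raw_data)
		goto end;			/* profiling not active */

	/* zero the alignment padding so no stale bytes leak to userspace */
	*(u64 *)(&raw_data[entry_size - sizeof(u64)]) = 0ULL;

	entry = (struct foo_raw *)raw_data;
	tracing_generic_entry_update(&entry->ent, irq_flags, preempt_count());
	entry->ent.type = event_id;
	entry->value = value;

	perf_tpcounter_event(event_id, 0, 1, entry, entry_size);
end:
	local_irq_restore(irq_flags);
}
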
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index df4a74efd50c..3aaa77c3309b 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,12 +8,52 @@
 #include <linux/module.h>
 #include "trace.h"
 
+/*
+ * alloc_percpu() takes a type rather than a size, so create a dummy
+ * type whose size matches the desired buffer size.
+ */
+typedef struct { char buf[FTRACE_MAX_PROFILE_SIZE]; } profile_buf_t;
+
+char *trace_profile_buf;
+char *trace_profile_buf_nmi;
+
+/* Count the events in use (per event id, not per instance) */
+static int	total_profile_count;
+
 static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 {
+	char *buf;
+	int ret = -ENOMEM;
+
 	if (atomic_inc_return(&event->profile_count))
 		return 0;
 
-	return event->profile_enable();
+	if (!total_profile_count++) {
+		buf = (char *)alloc_percpu(profile_buf_t);
+		if (!buf)
+			goto fail_buf;
+
+		rcu_assign_pointer(trace_profile_buf, buf);
+
+		buf = (char *)alloc_percpu(profile_buf_t);
+		if (!buf)
+			goto fail_buf_nmi;
+
+		rcu_assign_pointer(trace_profile_buf_nmi, buf);
+	}
+
+	ret = event->profile_enable();
+	if (!ret)
+		return 0;
+
+	free_percpu(trace_profile_buf_nmi);
+fail_buf_nmi:
+	free_percpu(trace_profile_buf);
+fail_buf:
+	total_profile_count--;
+	atomic_dec(&event->profile_count);
+
+	return ret;
 }
 
 int ftrace_profile_enable(int event_id)
@@ -36,10 +76,29 @@ int ftrace_profile_enable(int event_id)
 
 static void ftrace_profile_disable_event(struct ftrace_event_call *event)
 {
+	char *buf, *nmi_buf;
+
 	if (!atomic_add_negative(-1, &event->profile_count))
 		return;
 
 	event->profile_disable();
+
+	if (!--total_profile_count) {
+		buf = trace_profile_buf;
+		rcu_assign_pointer(trace_profile_buf, NULL);
+
+		nmi_buf = trace_profile_buf_nmi;
+		rcu_assign_pointer(trace_profile_buf_nmi, NULL);
+
+		/*
+		 * Ensure all profiling events have finished before
+		 * releasing the buffers.
+		 */
+		synchronize_sched();
+
+		free_percpu(buf);
+		free_percpu(nmi_buf);
+	}
 }
 
 void ftrace_profile_disable(int event_id)
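The buffers are allocated lazily by the first event that enables profiling and torn down by the last one to disable it, with total_profile_count as the subsystem-wide refcount. The teardown above is the standard RCU removal sequence: unpublish, wait, free. A minimal sketch of that sequence with a single hypothetical pointer (my_buf and both function names are illustrative, not part of the patch):

#include <linux/percpu.h>
#include <linux/rcupdate.h>

static char *my_buf;	/* stands in for trace_profile_buf{,_nmi} */

static int my_buf_setup(void)
{
	char *buf = (char *)alloc_percpu(profile_buf_t);

	if (!buf)
		return -ENOMEM;

	/* publish: pairs with rcu_dereference() in the handlers */
	rcu_assign_pointer(my_buf, buf);
	return 0;
}

static void my_buf_teardown(void)
{
	char *buf = my_buf;

	/* 1. unpublish: handlers arriving after this see NULL and bail */
	rcu_assign_pointer(my_buf, NULL);

	/*
	 * 2. wait: handlers run with IRQs off, i.e. inside a sched-RCU
	 * read-side section, so once synchronize_sched() returns no CPU
	 * can still hold the old pointer.
	 */
	synchronize_sched();

	/* 3. free: nothing references the buffer anymore */
	free_percpu(buf);
}
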
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8712ce3c6a0e..7a3550cf2597 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -384,10 +384,13 @@ static int sys_prof_refcount_exit;
 
 static void prof_syscall_enter(struct pt_regs *regs, long id)
 {
-	struct syscall_trace_enter *rec;
 	struct syscall_metadata *sys_data;
+	struct syscall_trace_enter *rec;
+	unsigned long flags;
+	char *raw_data;
 	int syscall_nr;
 	int size;
+	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 	if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -402,20 +405,38 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	do {
-		char raw_data[size];
+	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+		      "profile buffer not large enough"))
+		return;
+
+	/* Protect the per cpu buffer, begin the rcu read side */
+	local_irq_save(flags);
 
-		/* zero the dead bytes from align to not leak stack to user */
-		*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+	cpu = smp_processor_id();
+
+	if (in_nmi())
+		raw_data = rcu_dereference(trace_profile_buf_nmi);
+	else
+		raw_data = rcu_dereference(trace_profile_buf);
+
+	if (!raw_data)
+		goto end;
 
-		rec = (struct syscall_trace_enter *) raw_data;
-		tracing_generic_entry_update(&rec->ent, 0, 0);
-		rec->ent.type = sys_data->enter_id;
-		rec->nr = syscall_nr;
-		syscall_get_arguments(current, regs, 0, sys_data->nb_args,
-			       (unsigned long *)&rec->args);
-		perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
-	} while (0);
+	raw_data = per_cpu_ptr(raw_data, cpu);
+
+	/* zero the dead bytes from align to not leak stack to user */
+	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+	rec = (struct syscall_trace_enter *) raw_data;
+	tracing_generic_entry_update(&rec->ent, 0, 0);
+	rec->ent.type = sys_data->enter_id;
+	rec->nr = syscall_nr;
+	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+			      (unsigned long *)&rec->args);
+	perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size);
+
+end:
+	local_irq_restore(flags);
 }
 
 int reg_prof_syscall_enter(char *name)
@@ -460,8 +481,12 @@ void unreg_prof_syscall_enter(char *name)
 static void prof_syscall_exit(struct pt_regs *regs, long ret)
 {
 	struct syscall_metadata *sys_data;
-	struct syscall_trace_exit rec;
+	struct syscall_trace_exit *rec;
+	unsigned long flags;
 	int syscall_nr;
+	char *raw_data;
+	int size;
+	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 	if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -471,12 +496,46 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	tracing_generic_entry_update(&rec.ent, 0, 0);
-	rec.ent.type = sys_data->exit_id;
-	rec.nr = syscall_nr;
-	rec.ret = syscall_get_return_value(current, regs);
+	/* We can probably do that at build time */
+	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
 
-	perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec));
+	/*
+	 * Impossible, but be paranoid about the future.
+	 * How do we put this check outside of runtime?
+	 */
+	if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
+		      "exit event has grown above profile buffer size"))
+		return;
+
+	/* Protect the per cpu buffer, begin the rcu read side */
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+
+	if (in_nmi())
+		raw_data = rcu_dereference(trace_profile_buf_nmi);
+	else
+		raw_data = rcu_dereference(trace_profile_buf);
+
+	if (!raw_data)
+		goto end;
+
+	raw_data = per_cpu_ptr(raw_data, cpu);
+
+	/* zero the dead bytes from align to not leak stack to user */
+	*(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
+
+	rec = (struct syscall_trace_exit *)raw_data;
+
+	tracing_generic_entry_update(&rec->ent, 0, 0);
+	rec->ent.type = sys_data->exit_id;
+	rec->nr = syscall_nr;
+	rec->ret = syscall_get_return_value(current, regs);
+
+	perf_tpcounter_event(sys_data->exit_id, 0, 1, rec, size);
+
+end:
+	local_irq_restore(flags);
 }
 
 int reg_prof_syscall_exit(char *name)
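
On the "How do we put this check outside of runtime?" comment in prof_syscall_exit(): the exit record has a fixed layout, so its size is a compile-time constant and BUILD_BUG_ON() could replace the WARN_ONCE(), e.g. at the top of the function. This is a possible follow-up, not part of this patch; the enter event cannot get the same treatment since its size depends on sys_data->nb_args at runtime.

	/*
	 * Fails the build, instead of warning at runtime, if the exit
	 * record ever outgrows the profile buffers.
	 */
	BUILD_BUG_ON(ALIGN(sizeof(struct syscall_trace_exit) + sizeof(u32),
			   sizeof(u64)) - sizeof(u32) >
		     FTRACE_MAX_PROFILE_SIZE);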