diff options
| author | Yonghong Song <yhs@fb.com> | 2017-08-04 19:00:09 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2017-08-07 17:09:48 -0400 |
| commit | cf5f5cea270655dd49370760576c64b228583b79 (patch) | |
| tree | 5758e9d56b94542d082d40ed66f8d8effa6287f9 /kernel/trace | |
| parent | d226a2b84d0528da7e35e7e19e052293889cdd21 (diff) | |
bpf: add support for sys_enter_* and sys_exit_* tracepoints
Currently, bpf programs cannot be attached to sys_enter_* and sys_exit_*
style tracepoints. The iovisor/bcc issue #748
(https://github.com/iovisor/bcc/issues/748) documents this issue.
For example, if you try to attach a bpf program to tracepoints
syscalls/sys_enter_newfstat, you will get the following error:
# ./tools/trace.py t:syscalls:sys_enter_newfstat
Ioctl(PERF_EVENT_IOC_SET_BPF): Invalid argument
Failed to attach BPF to tracepoint
The main reason is that syscalls/sys_enter_* and syscalls/sys_exit_*
tracepoints are treated differently from other tracepoints and there
is no bpf hook for them.
This patch adds bpf support for these syscall tracepoints by
. permitting bpf attachment in ioctl PERF_EVENT_IOC_SET_BPF
. calling bpf programs in perf_syscall_enter and perf_syscall_exit
The legality of bpf program ctx access is also checked.
The function trace_event_get_offsets returns the correct max offset for
each specific syscall tracepoint, which is compared against the maximum
offset accessed by the bpf program.
Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel/trace')
| -rw-r--r-- | kernel/trace/trace_syscalls.c | 53 |
1 files changed, 51 insertions, 2 deletions
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e10395da88e..7a1a92036563 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -559,11 +559,29 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); | |||
| 559 | static int sys_perf_refcount_enter; | 559 | static int sys_perf_refcount_enter; |
| 560 | static int sys_perf_refcount_exit; | 560 | static int sys_perf_refcount_exit; |
| 561 | 561 | ||
| 562 | static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, | ||
| 563 | struct syscall_metadata *sys_data, | ||
| 564 | struct syscall_trace_enter *rec) { | ||
| 565 | struct syscall_tp_t { | ||
| 566 | unsigned long long regs; | ||
| 567 | unsigned long syscall_nr; | ||
| 568 | unsigned long args[sys_data->nb_args]; | ||
| 569 | } param; | ||
| 570 | int i; | ||
| 571 | |||
| 572 | *(struct pt_regs **)&param = regs; | ||
| 573 | param.syscall_nr = rec->nr; | ||
| 574 | for (i = 0; i < sys_data->nb_args; i++) | ||
| 575 | param.args[i] = rec->args[i]; | ||
| 576 | return trace_call_bpf(prog, &param); | ||
| 577 | } | ||
| 578 | |||
| 562 | static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | 579 | static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) |
| 563 | { | 580 | { |
| 564 | struct syscall_metadata *sys_data; | 581 | struct syscall_metadata *sys_data; |
| 565 | struct syscall_trace_enter *rec; | 582 | struct syscall_trace_enter *rec; |
| 566 | struct hlist_head *head; | 583 | struct hlist_head *head; |
| 584 | struct bpf_prog *prog; | ||
| 567 | int syscall_nr; | 585 | int syscall_nr; |
| 568 | int rctx; | 586 | int rctx; |
| 569 | int size; | 587 | int size; |
| @@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
| 578 | if (!sys_data) | 596 | if (!sys_data) |
| 579 | return; | 597 | return; |
| 580 | 598 | ||
| 599 | prog = READ_ONCE(sys_data->enter_event->prog); | ||
| 581 | head = this_cpu_ptr(sys_data->enter_event->perf_events); | 600 | head = this_cpu_ptr(sys_data->enter_event->perf_events); |
| 582 | if (hlist_empty(head)) | 601 | if (!prog && hlist_empty(head)) |
| 583 | return; | 602 | return; |
| 584 | 603 | ||
| 585 | /* get the size after alignment with the u32 buffer size field */ | 604 | /* get the size after alignment with the u32 buffer size field */ |
| @@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) | |||
| 594 | rec->nr = syscall_nr; | 613 | rec->nr = syscall_nr; |
| 595 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, | 614 | syscall_get_arguments(current, regs, 0, sys_data->nb_args, |
| 596 | (unsigned long *)&rec->args); | 615 | (unsigned long *)&rec->args); |
| 616 | |||
| 617 | if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) || | ||
| 618 | hlist_empty(head)) { | ||
| 619 | perf_swevent_put_recursion_context(rctx); | ||
| 620 | return; | ||
| 621 | } | ||
| 622 | |||
| 597 | perf_trace_buf_submit(rec, size, rctx, | 623 | perf_trace_buf_submit(rec, size, rctx, |
| 598 | sys_data->enter_event->event.type, 1, regs, | 624 | sys_data->enter_event->event.type, 1, regs, |
| 599 | head, NULL); | 625 | head, NULL); |
| @@ -633,11 +659,26 @@ static void perf_sysenter_disable(struct trace_event_call *call) | |||
| 633 | mutex_unlock(&syscall_trace_lock); | 659 | mutex_unlock(&syscall_trace_lock); |
| 634 | } | 660 | } |
| 635 | 661 | ||
| 662 | static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs, | ||
| 663 | struct syscall_trace_exit *rec) { | ||
| 664 | struct syscall_tp_t { | ||
| 665 | unsigned long long regs; | ||
| 666 | unsigned long syscall_nr; | ||
| 667 | unsigned long ret; | ||
| 668 | } param; | ||
| 669 | |||
| 670 | *(struct pt_regs **)&param = regs; | ||
| 671 | param.syscall_nr = rec->nr; | ||
| 672 | param.ret = rec->ret; | ||
| 673 | return trace_call_bpf(prog, &param); | ||
| 674 | } | ||
| 675 | |||
| 636 | static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | 676 | static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) |
| 637 | { | 677 | { |
| 638 | struct syscall_metadata *sys_data; | 678 | struct syscall_metadata *sys_data; |
| 639 | struct syscall_trace_exit *rec; | 679 | struct syscall_trace_exit *rec; |
| 640 | struct hlist_head *head; | 680 | struct hlist_head *head; |
| 681 | struct bpf_prog *prog; | ||
| 641 | int syscall_nr; | 682 | int syscall_nr; |
| 642 | int rctx; | 683 | int rctx; |
| 643 | int size; | 684 | int size; |
| @@ -652,8 +693,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
| 652 | if (!sys_data) | 693 | if (!sys_data) |
| 653 | return; | 694 | return; |
| 654 | 695 | ||
| 696 | prog = READ_ONCE(sys_data->exit_event->prog); | ||
| 655 | head = this_cpu_ptr(sys_data->exit_event->perf_events); | 697 | head = this_cpu_ptr(sys_data->exit_event->perf_events); |
| 656 | if (hlist_empty(head)) | 698 | if (!prog && hlist_empty(head)) |
| 657 | return; | 699 | return; |
| 658 | 700 | ||
| 659 | /* We can probably do that at build time */ | 701 | /* We can probably do that at build time */ |
| @@ -666,6 +708,13 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) | |||
| 666 | 708 | ||
| 667 | rec->nr = syscall_nr; | 709 | rec->nr = syscall_nr; |
| 668 | rec->ret = syscall_get_return_value(current, regs); | 710 | rec->ret = syscall_get_return_value(current, regs); |
| 711 | |||
| 712 | if ((prog && !perf_call_bpf_exit(prog, regs, rec)) || | ||
| 713 | hlist_empty(head)) { | ||
| 714 | perf_swevent_put_recursion_context(rctx); | ||
| 715 | return; | ||
| 716 | } | ||
| 717 | |||
| 669 | perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, | 718 | perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, |
| 670 | 1, regs, head, NULL); | 719 | 1, regs, head, NULL); |
| 671 | } | 720 | } |
