author		David S. Miller <davem@davemloft.net>	2016-04-07 21:04:27 -0400
committer	David S. Miller <davem@davemloft.net>	2016-04-07 21:04:27 -0400
commit		f8711655f862eabc0cb03e2bccd871069399c53e (patch)
tree		c7516296b2081373659aa0dfe39e61fe7598b56e
parent		b33b0a1bf69faff89693df49519fa7b459f5d807 (diff)
parent		e3edfdec04d43aa6276db639d3721e073161d2c2 (diff)
Merge branch 'bpf-tracepoints'
Alexei Starovoitov says:

====================
allow bpf attach to tracepoints

Hi Steven, Peter,

v1->v2: addressed Peter's comments:
- fixed wording in patch 1, added ack
- refactored 2nd patch into 3:
  2/10 remove unused __perf_addr macro, which frees up an argument in
       perf_trace_buf_submit
  3/10 split perf_trace_buf_prepare into alloc and update parts, so that
       bpf programs don't have to pay a performance penalty for the update
       of struct trace_entry, which is not going to be accessed by bpf
  4/10 the actual addition of the bpf filter to the perf tracepoint handler
       is now trivial, and a bpf prog can be used as a proper filter of
       tracepoints

v1 cover:
Last time we discussed bpf+tracepoints it was a year ago [1], and the reason
we didn't proceed with that approach was that bpf would expose the arguments
arg1, arg2 of the trace_xx(arg1, arg2) call to the bpf program, which was
considered an unnecessary extension of the abi. Back then I wanted to avoid
the cost of the buffer alloc and field-assign part in all of the tracepoints,
but it looks like, once optimized, the cost is acceptable.
So this new approach doesn't expose any new abi to the bpf program.
The program looks at tracepoint fields after they were copied by
perf_trace_xx() and described in /sys/kernel/debug/tracing/events/xxx/format

We made a tool [2] that takes arguments from /sys/.../format and works as:
$ tplist.py -v random:urandom_read
    int got_bits;
    int pool_left;
    int input_left;

Then these fields can be copy-pasted into a bpf program like:
struct urandom_read {
	__u64 hidden_pad;
	int got_bits;
	int pool_left;
	int input_left;
};

and the program can use it:
SEC("tracepoint/random/urandom_read")
int bpf_prog(struct urandom_read *ctx)
{
	return ctx->pool_left > 0 ? 1 : 0;
}

This way the program can access tracepoint fields faster than an equivalent
bpf+kprobe program, which is the main goal of these patches.

Patches 1-4 are simple changes on the perf core side; please review.
I'd like to take the whole set via the net-next tree, since the rest of the
patches might conflict with other bpf work going on in net-next, and we want
to avoid cross-tree merge conflicts.
Alternatively we can put patches 1-4 into both tip and net-next.

Patch 9 is an example of access to tracepoint fields from a bpf prog.
Patch 10 is a micro benchmark for bpf+kprobe vs bpf+tracepoint.

Note that for actual tracing tools the user doesn't need to run tplist.py
and copy-paste fields manually; the tools do it automatically.
For example, the argdist tool [3] can be used as:
$ argdist -H 't:block:block_rq_complete():u32:nr_sector'
where 'nr_sector' is the name of a tracepoint field taken from
/sys/kernel/debug/tracing/events/block/block_rq_complete/format
and the appropriate bpf program is generated on the fly.

[1] http://thread.gmane.org/gmane.linux.kernel.api/8127/focus=8165
[2] https://github.com/iovisor/bcc/blob/master/tools/tplist.py
[3] https://github.com/iovisor/bcc/blob/master/tools/argdist.py
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
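For context, a minimal userspace sketch of how such a tracepoint program can be
attached once this series is applied; it mirrors what samples/bpf/bpf_load.c in
this set does. The helper name attach_tp_prog and the tp_path argument are
illustrative only, and prog_fd is assumed to come from a prior
bpf(BPF_PROG_LOAD, ...) call; perf_event_open and PERF_EVENT_IOC_SET_BPF are
existing kernel interfaces.

/* hedged sketch, not part of the series */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <asm/unistd.h>
#include <linux/perf_event.h>

static int attach_tp_prog(const char *tp_path, int prog_fd)
{
	struct perf_event_attr attr = {};
	char buf[64] = {};
	FILE *f;
	int efd;

	/* e.g. tp_path = "/sys/kernel/debug/tracing/events/random/urandom_read/id" */
	f = fopen(tp_path, "r");
	if (!f)
		return -1;
	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		return -1;
	}
	fclose(f);

	attr.type = PERF_TYPE_TRACEPOINT;
	attr.config = atoi(buf);	/* tracepoint id read from .../id */
	attr.size = sizeof(attr);
	attr.sample_period = 1;
	attr.wakeup_events = 1;

	efd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu */,
		      -1 /* group_fd */, 0);
	if (efd < 0)
		return -1;

	/* route the tracepoint into the bpf program added by this series */
	if (ioctl(efd, PERF_EVENT_IOC_ENABLE, 0) ||
	    ioctl(efd, PERF_EVENT_IOC_SET_BPF, prog_fd)) {
		close(efd);
		return -1;
	}
	return efd;
}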
-rw-r--r--  include/linux/bpf.h                      |   2
-rw-r--r--  include/linux/perf_event.h               |   4
-rw-r--r--  include/linux/trace_events.h             |   9
-rw-r--r--  include/trace/perf.h                     |  23
-rw-r--r--  include/trace/trace_events.h             |   3
-rw-r--r--  include/uapi/linux/bpf.h                 |   1
-rw-r--r--  kernel/bpf/stackmap.c                    |   2
-rw-r--r--  kernel/bpf/verifier.c                    |   6
-rw-r--r--  kernel/events/core.c                     |  27
-rw-r--r--  kernel/trace/bpf_trace.c                 |  85
-rw-r--r--  kernel/trace/trace_event_perf.c          |  40
-rw-r--r--  kernel/trace/trace_events.c              |  18
-rw-r--r--  kernel/trace/trace_kprobe.c              |  10
-rw-r--r--  kernel/trace/trace_syscalls.c            |  13
-rw-r--r--  kernel/trace/trace_uprobe.c              |   5
-rw-r--r--  samples/bpf/Makefile                     |   5
-rw-r--r--  samples/bpf/bpf_load.c                   |  26
-rw-r--r--  samples/bpf/offwaketime_kern.c           |  26
-rw-r--r--  samples/bpf/test_overhead_kprobe_kern.c  |  41
-rw-r--r--  samples/bpf/test_overhead_tp_kern.c      |  36
-rw-r--r--  samples/bpf/test_overhead_user.c         | 162
21 files changed, 475 insertions, 69 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 21ee41b92e8a..b2365a6eba3d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -131,6 +131,7 @@ struct bpf_prog_type_list {
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
+	u32 max_ctx_offset;
 	const struct bpf_verifier_ops *ops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
@@ -160,6 +161,7 @@ struct bpf_array {
 #define MAX_TAIL_CALL_CNT 32
 
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f291275ffd71..eb41b535ef38 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -882,8 +882,6 @@ static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned lo
  */
 static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 {
-	memset(regs, 0, sizeof(*regs));
-
 	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
 }
 
@@ -1018,7 +1016,7 @@ static inline bool perf_paranoid_kernel(void)
 }
 
 extern void perf_event_init(void);
-extern void perf_tp_event(u64 addr, u64 count, void *record,
+extern void perf_tp_event(u16 event_type, u64 count, void *record,
 			  int entry_size, struct pt_regs *regs,
 			  struct hlist_head *head, int rctx,
 			  struct task_struct *task);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 0810f81b6db2..fe6441203b59 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -569,6 +569,7 @@ extern int trace_define_field(struct trace_event_call *call, const char *type,
 			  int is_signed, int filter_type);
 extern int trace_add_event_call(struct trace_event_call *call);
 extern int trace_remove_event_call(struct trace_event_call *call);
+extern int trace_event_get_offsets(struct trace_event_call *call);
 
 #define is_signed_type(type) (((type)(-1)) < (type)1)
 
@@ -605,15 +606,15 @@ extern void perf_trace_del(struct perf_event *event, int flags);
 extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
 extern void ftrace_profile_free_filter(struct perf_event *event);
-extern void *perf_trace_buf_prepare(int size, unsigned short type,
-				    struct pt_regs **regs, int *rctxp);
+void perf_trace_buf_update(void *record, u16 type);
+void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
 static inline void
-perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
+perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 		      u64 count, struct pt_regs *regs, void *head,
 		      struct task_struct *task)
 {
-	perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
+	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task);
 }
 #endif
 
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 26486fcd74ce..a182306eefd7 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -20,9 +20,6 @@
 #undef __get_bitmask
 #define __get_bitmask(field) (char *)__get_dynamic_array(field)
 
-#undef __perf_addr
-#define __perf_addr(a) (__addr = (a))
-
 #undef __perf_count
 #define __perf_count(c) (__count = (c))
 
@@ -37,8 +34,9 @@ perf_trace_##call(void *__data, proto) \
 	struct trace_event_call *event_call = __data; \
 	struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
 	struct trace_event_raw_##call *entry; \
+	struct bpf_prog *prog = event_call->prog; \
 	struct pt_regs *__regs; \
-	u64 __addr = 0, __count = 1; \
+	u64 __count = 1; \
 	struct task_struct *__task = NULL; \
 	struct hlist_head *head; \
 	int __entry_size; \
@@ -48,7 +46,7 @@ perf_trace_##call(void *__data, proto) \
 	__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
 \
 	head = this_cpu_ptr(event_call->perf_events); \
-	if (__builtin_constant_p(!__task) && !__task && \
+	if (!prog && __builtin_constant_p(!__task) && !__task && \
 			hlist_empty(head)) \
 		return; \
 \
@@ -56,8 +54,7 @@ perf_trace_##call(void *__data, proto) \
 			     sizeof(u64)); \
 	__entry_size -= sizeof(u32); \
 \
-	entry = perf_trace_buf_prepare(__entry_size, \
-		event_call->event.type, &__regs, &rctx); \
+	entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx); \
 	if (!entry) \
 		return; \
 \
@@ -67,8 +64,16 @@ perf_trace_##call(void *__data, proto) \
 \
 	{ assign; } \
 \
-	perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
-		__count, __regs, head, __task); \
+	if (prog) { \
+		*(struct pt_regs **)entry = __regs; \
+		if (!trace_call_bpf(prog, entry) || hlist_empty(head)) { \
+			perf_swevent_put_recursion_context(rctx); \
+			return; \
+		} \
+	} \
+	perf_trace_buf_submit(entry, __entry_size, rctx, \
+			      event_call->event.type, __count, __regs, \
+			      head, __task); \
 }
 
 /*
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 170c93bbdbb7..80679a9fae65 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -652,9 +652,6 @@ static inline notrace int trace_event_get_offsets_##call( \
 #undef TP_fast_assign
 #define TP_fast_assign(args...) args
 
-#undef __perf_addr
-#define __perf_addr(a) (a)
-
 #undef __perf_count
 #define __perf_count(c) (c)
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 23917bb47bf3..70eda5aeb304 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -92,6 +92,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_KPROBE,
 	BPF_PROG_TYPE_SCHED_CLS,
 	BPF_PROG_TYPE_SCHED_ACT,
+	BPF_PROG_TYPE_TRACEPOINT,
 };
 
 #define BPF_PSEUDO_MAP_FD	1
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..35114725cf30 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -116,7 +116,7 @@ free_smap:
 	return ERR_PTR(err);
 }
 
-static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 {
 	struct pt_regs *regs = (struct pt_regs *) (long) r1;
 	struct bpf_map *map = (struct bpf_map *) (long) r2;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2e08f8e9b771..58792fed5678 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -652,8 +652,12 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
 			    enum bpf_access_type t)
 {
 	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t))
+	    env->prog->aux->ops->is_valid_access(off, size, t)) {
+		/* remember the offset of last byte accessed in ctx */
+		if (env->prog->aux->max_ctx_offset < off + size)
+			env->prog->aux->max_ctx_offset = off + size;
 		return 0;
+	}
 
 	verbose("invalid bpf_context access off=%d size=%d\n", off, size);
 	return -EACCES;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index de24fbce5277..9a01019ff7c8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6725,12 +6725,13 @@ int perf_swevent_get_recursion_context(void)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
-inline void perf_swevent_put_recursion_context(int rctx)
+void perf_swevent_put_recursion_context(int rctx)
 {
 	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
 
 	put_recursion_context(swhash->recursion, rctx);
 }
+EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
@@ -6987,7 +6988,7 @@ static int perf_tp_event_match(struct perf_event *event,
 	return 1;
 }
 
-void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		   struct pt_regs *regs, struct hlist_head *head, int rctx,
 		   struct task_struct *task)
 {
@@ -6999,9 +7000,11 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 		.data = record,
 	};
 
-	perf_sample_data_init(&data, addr, 0);
+	perf_sample_data_init(&data, 0, 0);
 	data.raw = &raw;
 
+	perf_trace_buf_update(record, event_type);
+
 	hlist_for_each_entry_rcu(event, head, hlist_entry) {
 		if (perf_tp_event_match(event, &data, regs))
 			perf_swevent_event(event, count, &data, regs);
@@ -7104,6 +7107,7 @@ static void perf_event_free_filter(struct perf_event *event)
 
 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 {
+	bool is_kprobe, is_tracepoint;
 	struct bpf_prog *prog;
 
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
@@ -7112,20 +7116,31 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 	if (event->tp_event->prog)
 		return -EEXIST;
 
-	if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
-		/* bpf programs can only be attached to u/kprobes */
+	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
+	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
+	if (!is_kprobe && !is_tracepoint)
+		/* bpf programs can only be attached to u/kprobe or tracepoint */
 		return -EINVAL;
 
 	prog = bpf_prog_get(prog_fd);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	if (prog->type != BPF_PROG_TYPE_KPROBE) {
+	if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
+	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
 		/* valid fd, but invalid bpf program type */
 		bpf_prog_put(prog);
 		return -EINVAL;
 	}
 
+	if (is_tracepoint) {
+		int off = trace_event_get_offsets(event->tp_event);
+
+		if (prog->aux->max_ctx_offset > off) {
+			bpf_prog_put(prog);
+			return -EACCES;
+		}
+	}
 	event->tp_event->prog = prog;
 
 	return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3e4ffb3ace5f..413ec5614180 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -268,7 +268,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.arg5_type = ARG_CONST_STACK_SIZE,
 };
 
-static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_map_lookup_elem:
@@ -295,12 +295,20 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
+	default:
+		return NULL;
+	}
+}
+
+static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
 	default:
-		return NULL;
+		return tracing_func_proto(func_id);
 	}
 }
 
@@ -332,9 +340,82 @@ static struct bpf_prog_type_list kprobe_tl = {
 	.type = BPF_PROG_TYPE_KPROBE,
 };
 
+static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+{
+	/*
+	 * r1 points to perf tracepoint buffer where first 8 bytes are hidden
+	 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
+	 * from there and call the same bpf_perf_event_output() helper
+	 */
+	u64 ctx = *(long *)r1;
+
+	return bpf_perf_event_output(ctx, r2, index, r4, size);
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
+	.func = bpf_perf_event_output_tp,
+	.gpl_only = true,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_CONST_MAP_PTR,
+	.arg3_type = ARG_ANYTHING,
+	.arg4_type = ARG_PTR_TO_STACK,
+	.arg5_type = ARG_CONST_STACK_SIZE,
+};
+
+static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	u64 ctx = *(long *)r1;
+
+	return bpf_get_stackid(ctx, r2, r3, r4, r5);
+}
+
+static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
+	.func = bpf_get_stackid_tp,
+	.gpl_only = true,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_PTR_TO_CTX,
+	.arg2_type = ARG_CONST_MAP_PTR,
+	.arg3_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_perf_event_output:
+		return &bpf_perf_event_output_proto_tp;
+	case BPF_FUNC_get_stackid:
+		return &bpf_get_stackid_proto_tp;
+	default:
+		return tracing_func_proto(func_id);
+	}
+}
+
+static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
+		return false;
+	if (type != BPF_READ)
+		return false;
+	if (off % size != 0)
+		return false;
+	return true;
+}
+
+static const struct bpf_verifier_ops tracepoint_prog_ops = {
+	.get_func_proto = tp_prog_func_proto,
+	.is_valid_access = tp_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list tracepoint_tl = {
+	.ops = &tracepoint_prog_ops,
+	.type = BPF_PROG_TYPE_TRACEPOINT,
+};
+
 static int __init register_kprobe_prog_ops(void)
 {
 	bpf_register_prog_type(&kprobe_tl);
+	bpf_register_prog_type(&tracepoint_tl);
 	return 0;
 }
 late_initcall(register_kprobe_prog_ops);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 00df25fd86ef..5a927075977f 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -260,42 +260,43 @@ void perf_trace_del(struct perf_event *p_event, int flags)
 	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
 }
 
-void *perf_trace_buf_prepare(int size, unsigned short type,
-			     struct pt_regs **regs, int *rctxp)
+void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
 {
-	struct trace_entry *entry;
-	unsigned long flags;
 	char *raw_data;
-	int pc;
+	int rctx;
 
 	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
 
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
 		      "perf buffer not large enough"))
 		return NULL;
 
-	pc = preempt_count();
-
-	*rctxp = perf_swevent_get_recursion_context();
-	if (*rctxp < 0)
+	*rctxp = rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
 		return NULL;
 
 	if (regs)
-		*regs = this_cpu_ptr(&__perf_regs[*rctxp]);
-	raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
+		*regs = this_cpu_ptr(&__perf_regs[rctx]);
+	raw_data = this_cpu_ptr(perf_trace_buf[rctx]);
 
 	/* zero the dead bytes from align to not leak stack to user */
 	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
+	return raw_data;
+}
+EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
+NOKPROBE_SYMBOL(perf_trace_buf_alloc);
+
+void perf_trace_buf_update(void *record, u16 type)
+{
+	struct trace_entry *entry = record;
+	int pc = preempt_count();
+	unsigned long flags;
 
-	entry = (struct trace_entry *)raw_data;
 	local_save_flags(flags);
 	tracing_generic_entry_update(entry, flags, pc);
 	entry->type = type;
-
-	return raw_data;
 }
-EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
-NOKPROBE_SYMBOL(perf_trace_buf_prepare);
+NOKPROBE_SYMBOL(perf_trace_buf_update);
 
 #ifdef CONFIG_FUNCTION_TRACER
 static void
@@ -316,15 +317,16 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 
 	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
 
+	memset(&regs, 0, sizeof(regs));
 	perf_fetch_caller_regs(&regs);
 
-	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
+	entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
-	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
+	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
 			      1, &regs, head, NULL);
 
 #undef ENTRY_SIZE
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05ddc0820771..ced963049e0a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -204,6 +204,24 @@ static void trace_destroy_fields(struct trace_event_call *call)
 	}
 }
 
+/*
+ * run-time version of trace_event_get_offsets_<call>() that returns the last
+ * accessible offset of trace fields excluding __dynamic_array bytes
+ */
+int trace_event_get_offsets(struct trace_event_call *call)
+{
+	struct ftrace_event_field *tail;
+	struct list_head *head;
+
+	head = trace_get_fields(call);
+	/*
+	 * head->next points to the last field with the largest offset,
+	 * since it was added last by trace_define_field()
+	 */
+	tail = list_first_entry(head, struct ftrace_event_field, link);
+	return tail->offset + tail->size;
+}
+
 int trace_event_raw_init(struct trace_event_call *call)
 {
 	int id;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 919e0ddd8fcc..5546eec0505f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1149,14 +1149,15 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1184,14 +1185,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->func = (unsigned long)tk->rp.kp.addr;
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
 #endif	/* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index e78f364cc192..b2b6efc083a4 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -587,15 +587,16 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
-			sys_data->enter_event->event.type, NULL, &rctx);
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!rec)
 		return;
 
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(rec, size, rctx,
+			      sys_data->enter_event->event.type, 1, regs,
+			      head, NULL);
 }
 
 static int perf_sysenter_enable(struct trace_event_call *call)
@@ -660,14 +661,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
-			sys_data->exit_event->event.type, NULL, &rctx);
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!rec)
 		return;
 
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
+			      1, regs, head, NULL);
 }
 
 static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 7915142c89e4..c53485441c88 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1131,7 +1131,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 	if (hlist_empty(head))
 		goto out;
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		goto out;
 
@@ -1152,7 +1152,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 		memset(data + len, 0, size - esize - len);
 	}
 
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
 out:
 	preempt_enable();
 }
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 502c9fc8db85..9959771bf808 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -19,6 +19,7 @@ hostprogs-y += lathist
 hostprogs-y += offwaketime
 hostprogs-y += spintest
 hostprogs-y += map_perf_test
+hostprogs-y += test_overhead
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -38,6 +39,7 @@ lathist-objs := bpf_load.o libbpf.o lathist_user.o
 offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o
 spintest-objs := bpf_load.o libbpf.o spintest_user.o
 map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
+test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -56,6 +58,8 @@ always += lathist_kern.o
 always += offwaketime_kern.o
 always += spintest_kern.o
 always += map_perf_test_kern.o
+always += test_overhead_tp_kern.o
+always += test_overhead_kprobe_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -75,6 +79,7 @@ HOSTLOADLIBES_lathist += -lelf
 HOSTLOADLIBES_offwaketime += -lelf
 HOSTLOADLIBES_spintest += -lelf
 HOSTLOADLIBES_map_perf_test += -lelf -lrt
+HOSTLOADLIBES_test_overhead += -lelf -lrt
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 58f86bd11b3d..022af71c2bb5 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -49,6 +49,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	bool is_socket = strncmp(event, "socket", 6) == 0;
 	bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
 	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
+	bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
 	enum bpf_prog_type prog_type;
 	char buf[256];
 	int fd, efd, err, id;
@@ -63,6 +64,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
 	} else if (is_kprobe || is_kretprobe) {
 		prog_type = BPF_PROG_TYPE_KPROBE;
+	} else if (is_tracepoint) {
+		prog_type = BPF_PROG_TYPE_TRACEPOINT;
 	} else {
 		printf("Unknown event '%s'\n", event);
 		return -1;
@@ -111,12 +114,23 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 			       event, strerror(errno));
 			return -1;
 		}
-	}
 
-	strcpy(buf, DEBUGFS);
-	strcat(buf, "events/kprobes/");
-	strcat(buf, event);
-	strcat(buf, "/id");
+		strcpy(buf, DEBUGFS);
+		strcat(buf, "events/kprobes/");
+		strcat(buf, event);
+		strcat(buf, "/id");
+	} else if (is_tracepoint) {
+		event += 11;
+
+		if (*event == 0) {
+			printf("event name cannot be empty\n");
+			return -1;
+		}
+		strcpy(buf, DEBUGFS);
+		strcat(buf, "events/");
+		strcat(buf, event);
+		strcat(buf, "/id");
+	}
 
 	efd = open(buf, O_RDONLY, 0);
 	if (efd < 0) {
@@ -304,6 +318,7 @@ int load_bpf_file(char *path)
 
 		if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
 		    memcmp(shname_prog, "kretprobe/", 10) == 0 ||
+		    memcmp(shname_prog, "tracepoint/", 11) == 0 ||
 		    memcmp(shname_prog, "socket", 6) == 0)
 			load_and_attach(shname_prog, insns, data_prog->d_size);
 	}
@@ -320,6 +335,7 @@ int load_bpf_file(char *path)
 
 		if (memcmp(shname, "kprobe/", 7) == 0 ||
 		    memcmp(shname, "kretprobe/", 10) == 0 ||
+		    memcmp(shname, "tracepoint/", 11) == 0 ||
 		    memcmp(shname, "socket", 6) == 0)
 			load_and_attach(shname, data->d_buf, data->d_size);
 	}
diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c
index c0aa5a9b9c48..983629a31c79 100644
--- a/samples/bpf/offwaketime_kern.c
+++ b/samples/bpf/offwaketime_kern.c
@@ -73,7 +73,7 @@ int waker(struct pt_regs *ctx)
 	return 0;
 }
 
-static inline int update_counts(struct pt_regs *ctx, u32 pid, u64 delta)
+static inline int update_counts(void *ctx, u32 pid, u64 delta)
 {
 	struct key_t key = {};
 	struct wokeby_t *woke;
@@ -100,15 +100,33 @@ static inline int update_counts(struct pt_regs *ctx, u32 pid, u64 delta)
 	return 0;
 }
 
+#if 1
+/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
+struct sched_switch_args {
+	unsigned long long pad;
+	char prev_comm[16];
+	int prev_pid;
+	int prev_prio;
+	long long prev_state;
+	char next_comm[16];
+	int next_pid;
+	int next_prio;
+};
+SEC("tracepoint/sched/sched_switch")
+int oncpu(struct sched_switch_args *ctx)
+{
+	/* record previous thread sleep time */
+	u32 pid = ctx->prev_pid;
+#else
 SEC("kprobe/finish_task_switch")
 int oncpu(struct pt_regs *ctx)
 {
 	struct task_struct *p = (void *) PT_REGS_PARM1(ctx);
+	/* record previous thread sleep time */
+	u32 pid = _(p->pid);
+#endif
 	u64 delta, ts, *tsp;
-	u32 pid;
 
-	/* record previous thread sleep time */
-	pid = _(p->pid);
 	ts = bpf_ktime_get_ns();
 	bpf_map_update_elem(&start, &pid, &ts, BPF_ANY);
 
diff --git a/samples/bpf/test_overhead_kprobe_kern.c b/samples/bpf/test_overhead_kprobe_kern.c
new file mode 100644
index 000000000000..468a66a92ef9
--- /dev/null
+++ b/samples/bpf/test_overhead_kprobe_kern.c
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
+
+SEC("kprobe/__set_task_comm")
+int prog(struct pt_regs *ctx)
+{
+	struct signal_struct *signal;
+	struct task_struct *tsk;
+	char oldcomm[16] = {};
+	char newcomm[16] = {};
+	u16 oom_score_adj;
+	u32 pid;
+
+	tsk = (void *)PT_REGS_PARM1(ctx);
+
+	pid = _(tsk->pid);
+	bpf_probe_read(oldcomm, sizeof(oldcomm), &tsk->comm);
+	bpf_probe_read(newcomm, sizeof(newcomm), (void *)PT_REGS_PARM2(ctx));
+	signal = _(tsk->signal);
+	oom_score_adj = _(signal->oom_score_adj);
+	return 0;
+}
+
+SEC("kprobe/urandom_read")
+int prog2(struct pt_regs *ctx)
+{
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_overhead_tp_kern.c b/samples/bpf/test_overhead_tp_kern.c
new file mode 100644
index 000000000000..38f5c0b9da9f
--- /dev/null
+++ b/samples/bpf/test_overhead_tp_kern.c
@@ -0,0 +1,36 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* from /sys/kernel/debug/tracing/events/task/task_rename/format */
+struct task_rename {
+	__u64 pad;
+	__u32 pid;
+	char oldcomm[16];
+	char newcomm[16];
+	__u16 oom_score_adj;
+};
+SEC("tracepoint/task/task_rename")
+int prog(struct task_rename *ctx)
+{
+	return 0;
+}
+
+/* from /sys/kernel/debug/tracing/events/random/urandom_read/format */
+struct urandom_read {
+	__u64 pad;
+	int got_bits;
+	int pool_left;
+	int input_left;
+};
+SEC("tracepoint/random/urandom_read")
+int prog2(struct urandom_read *ctx)
+{
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c
new file mode 100644
index 000000000000..d291167fd3c7
--- /dev/null
+++ b/samples/bpf/test_overhead_user.c
@@ -0,0 +1,162 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <asm/unistd.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include <string.h>
+#include <time.h>
+#include <sys/resource.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_CNT 1000000
+
+static __u64 time_get_ns(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * 1000000000ull + ts.tv_nsec;
+}
+
+static void test_task_rename(int cpu)
+{
+	__u64 start_time;
+	char buf[] = "test\n";
+	int i, fd;
+
+	fd = open("/proc/self/comm", O_WRONLY|O_TRUNC);
+	if (fd < 0) {
+		printf("couldn't open /proc\n");
+		exit(1);
+	}
+	start_time = time_get_ns();
+	for (i = 0; i < MAX_CNT; i++)
+		write(fd, buf, sizeof(buf));
+	printf("task_rename:%d: %lld events per sec\n",
+	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+	close(fd);
+}
+
+static void test_urandom_read(int cpu)
+{
+	__u64 start_time;
+	char buf[4];
+	int i, fd;
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0) {
+		printf("couldn't open /dev/urandom\n");
+		exit(1);
+	}
+	start_time = time_get_ns();
+	for (i = 0; i < MAX_CNT; i++)
+		read(fd, buf, sizeof(buf));
+	printf("urandom_read:%d: %lld events per sec\n",
+	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
+	close(fd);
+}
+
+static void loop(int cpu, int flags)
+{
+	cpu_set_t cpuset;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+	sched_setaffinity(0, sizeof(cpuset), &cpuset);
+
+	if (flags & 1)
+		test_task_rename(cpu);
+	if (flags & 2)
+		test_urandom_read(cpu);
+}
+
+static void run_perf_test(int tasks, int flags)
+{
+	pid_t pid[tasks];
+	int i;
+
+	for (i = 0; i < tasks; i++) {
+		pid[i] = fork();
+		if (pid[i] == 0) {
+			loop(i, flags);
+			exit(0);
+		} else if (pid[i] == -1) {
+			printf("couldn't spawn #%d process\n", i);
+			exit(1);
+		}
+	}
+	for (i = 0; i < tasks; i++) {
+		int status;
+
+		assert(waitpid(pid[i], &status, 0) == pid[i]);
+		assert(status == 0);
+	}
+}
+
+static void unload_progs(void)
+{
+	close(prog_fd[0]);
+	close(prog_fd[1]);
+	close(event_fd[0]);
+	close(event_fd[1]);
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	char filename[256];
+	int num_cpu = 8;
+	int test_flags = ~0;
+
+	setrlimit(RLIMIT_MEMLOCK, &r);
+
+	if (argc > 1)
+		test_flags = atoi(argv[1]) ? : test_flags;
+	if (argc > 2)
+		num_cpu = atoi(argv[2]) ? : num_cpu;
+
+	if (test_flags & 0x3) {
+		printf("BASE\n");
+		run_perf_test(num_cpu, test_flags);
+	}
+
+	if (test_flags & 0xC) {
+		snprintf(filename, sizeof(filename),
+			 "%s_kprobe_kern.o", argv[0]);
+		if (load_bpf_file(filename)) {
+			printf("%s", bpf_log_buf);
+			return 1;
+		}
+		printf("w/KPROBE\n");
+		run_perf_test(num_cpu, test_flags >> 2);
+		unload_progs();
+	}
+
+	if (test_flags & 0x30) {
+		snprintf(filename, sizeof(filename),
+			 "%s_tp_kern.o", argv[0]);
+		if (load_bpf_file(filename)) {
+			printf("%s", bpf_log_buf);
+			return 1;
+		}
+		printf("w/TRACEPOINT\n");
+		run_perf_test(num_cpu, test_flags >> 4);
+		unload_progs();
+	}
+
+	return 0;
+}