Diffstat (limited to 'kernel/events')
-rw-r--r--  kernel/events/core.c            217
-rw-r--r--  kernel/events/hw_breakpoint.c     6
2 files changed, 205 insertions, 18 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 94afe5b91c6a..a6a9ec4cd8f5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
 			       PERF_FLAG_FD_OUTPUT  |\
 			       PERF_FLAG_PID_CGROUP)
 
+/*
+ * branch priv levels that need permission checks
+ */
+#define PERF_SAMPLE_BRANCH_PERM_PLM \
+	(PERF_SAMPLE_BRANCH_KERNEL |\
+	 PERF_SAMPLE_BRANCH_HV)
+
 enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
@@ -128,8 +135,9 @@ enum event_type_t {
  * perf_sched_events : >0 events exist
  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
-struct jump_label_key_deferred perf_sched_events __read_mostly;
+struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -881,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (is_cgroup_event(event))
 		ctx->nr_cgroups++;
 
+	if (has_branch_stack(event))
+		ctx->nr_branch_stack++;
+
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	if (!ctx->nr_events)
 		perf_pmu_rotate_start(ctx->pmu);
@@ -1020,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 			cpuctx->cgrp = NULL;
 	}
 
+	if (has_branch_stack(event))
+		ctx->nr_branch_stack--;
+
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -2195,6 +2209,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 }
 
 /*
+ * When sampling the branch stack in system-wide mode, it may be necessary
+ * to flush the stack on context switch. This happens when the branch
+ * stack does not tag its entries with the pid of the current task.
+ * Otherwise it becomes impossible to associate a branch entry with a
+ * task. This ambiguity is more likely to appear when the branch stack
+ * supports priv level filtering and the user sets it to monitor only
+ * at the user level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch stack with
+ * branches from multiple tasks. Flushing may mean dropping the existing
+ * entries or stashing them somewhere in the PMU specific code layer.
+ *
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when there is at least one system-wide context
+ * with at least one active event using taken branch sampling.
+ */
+static void perf_branch_stack_sched_in(struct task_struct *prev,
+				       struct task_struct *task)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	/* no need to flush branch stack if not changing task */
+	if (prev == task)
+		return;
+
+	local_irq_save(flags);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		/*
+		 * check if the context has at least one
+		 * event using PERF_SAMPLE_BRANCH_STACK
+		 */
+		if (cpuctx->ctx.nr_branch_stack > 0
+		    && pmu->flush_branch_stack) {
+
+			pmu = cpuctx->ctx.pmu;
+
+			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+			perf_pmu_disable(pmu);
+
+			pmu->flush_branch_stack();
+
+			perf_pmu_enable(pmu);
+
+			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		}
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
+/*
  * Called from scheduler to add the events of the current task
  * with interrupts disabled.
  *
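
For orientation, the flush callback invoked above is the flush_branch_stack() hook this series adds to struct pmu (declared in the matching perf_event.h). A minimal sketch of how a PMU driver that cannot tag branch entries per task might publish it; the mypmu_* names are hypothetical placeholders, and only the .flush_branch_stack field comes from this series:

	#include <linux/perf_event.h>

	/*
	 * Hypothetical driver-side hook: discard (or stash) whatever branch
	 * entries the hardware has buffered, so entries recorded for the
	 * previous task cannot be attributed to the incoming one.
	 */
	static void mypmu_flush_branch_stack(void)
	{
		/* reset/drain the hardware branch buffer here */
	}

	static struct pmu mypmu = {
		/* event_init, add, del, start, stop, read, ... omitted */
		.flush_branch_stack	= mypmu_flush_branch_stack,
	};
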
@@ -2225,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	 */
 	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
 		perf_cgroup_sched_in(prev, task);
+
+	/* check for system-wide branch_stack events */
+	if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
+		perf_branch_stack_sched_in(prev, task);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2778,7 +2856,7 @@ static void free_event(struct perf_event *event)
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_dec_deferred(&perf_sched_events);
+			static_key_slow_dec_deferred(&perf_sched_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_dec(&nr_mmap_events);
 		if (event->attr.comm)
@@ -2789,7 +2867,15 @@ static void free_event(struct perf_event *event)
 			put_callchain_buffers();
 		if (is_cgroup_event(event)) {
 			atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
-			jump_label_dec_deferred(&perf_sched_events);
+			static_key_slow_dec_deferred(&perf_sched_events);
+		}
+
+		if (has_branch_stack(event)) {
+			static_key_slow_dec_deferred(&perf_sched_events);
+			/* is system-wide event */
+			if (!(event->attach_state & PERF_ATTACH_TASK))
+				atomic_dec(&per_cpu(perf_branch_stack_events,
+						    event->cpu));
 		}
 	}
 
@@ -3262,7 +3348,7 @@ static void calc_timer_values(struct perf_event *event,
 	*running = ctx_time - event->tstamp_running;
 }
 
-void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 {
 }
 
@@ -3312,7 +3398,7 @@ void perf_event_update_userpage(struct perf_event *event)
 	userpg->time_running = running +
 			atomic64_read(&event->child_total_time_running);
 
-	perf_update_user_clock(userpg, now);
+	arch_perf_update_userpage(userpg, now);
 
 	barrier();
 	++userpg->lock;
@@ -3907,6 +3993,24 @@ void perf_output_sample(struct perf_output_handle *handle,
 			}
 		}
 	}
+
+	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+		if (data->br_stack) {
+			size_t size;
+
+			size = data->br_stack->nr
+			     * sizeof(struct perf_branch_entry);
+
+			perf_output_put(handle, data->br_stack->nr);
+			perf_output_copy(handle, data->br_stack->entries, size);
+		} else {
+			/*
+			 * we always store at least the value of nr
+			 */
+			u64 nr = 0;
+			perf_output_put(handle, nr);
+		}
+	}
 }
 
 void perf_prepare_sample(struct perf_event_header *header,
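
The record layout appended here is a u64 count followed by that many struct perf_branch_entry records (a count of 0 is emitted when no branch data is available). A userspace sketch of walking that portion of a sample record; the parse_branch_stack() helper and the assumption that p already points at the branch-stack section are ours, while struct perf_branch_entry comes from the matching perf_event.h:

	#include <stdio.h>
	#include <string.h>
	#include <linux/perf_event.h>	/* struct perf_branch_entry */

	/* p points at the PERF_SAMPLE_BRANCH_STACK section of a sample */
	static const char *parse_branch_stack(const char *p)
	{
		const struct perf_branch_entry *br;
		__u64 nr, i;

		memcpy(&nr, p, sizeof(nr));	/* leading 'nr' word */
		br = (const struct perf_branch_entry *)(p + sizeof(nr));

		for (i = 0; i < nr; i++)
			printf("branch %llu: %#llx -> %#llx%s\n",
			       (unsigned long long)i,
			       (unsigned long long)br[i].from,
			       (unsigned long long)br[i].to,
			       br[i].mispred ? " (mispredicted)" : "");

		return (const char *)(br + nr);	/* next section */
	}
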
@@ -3949,6 +4053,15 @@ void perf_prepare_sample(struct perf_event_header *header,
 		WARN_ON_ONCE(size & (sizeof(u64)-1));
 		header->size += size;
 	}
+
+	if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+		int size = sizeof(u64); /* nr */
+		if (data->br_stack) {
+			size += data->br_stack->nr
+			      * sizeof(struct perf_branch_entry);
+		}
+		header->size += size;
+	}
 }
 
 static void perf_event_output(struct perf_event *event,
@@ -4991,7 +5104,7 @@ fail:
 	return err;
 }
 
-struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
 static void sw_perf_event_destroy(struct perf_event *event)
 {
@@ -4999,7 +5112,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 
 	WARN_ON(event->parent);
 
-	jump_label_dec(&perf_swevent_enabled[event_id]);
+	static_key_slow_dec(&perf_swevent_enabled[event_id]);
 	swevent_hlist_put(event);
 }
 
@@ -5010,6 +5123,12 @@ static int perf_swevent_init(struct perf_event *event)
 	if (event->attr.type != PERF_TYPE_SOFTWARE)
 		return -ENOENT;
 
+	/*
+	 * no branch sampling for software events
+	 */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	switch (event_id) {
 	case PERF_COUNT_SW_CPU_CLOCK:
 	case PERF_COUNT_SW_TASK_CLOCK:
@@ -5029,7 +5148,7 @@ static int perf_swevent_init(struct perf_event *event)
 		if (err)
 			return err;
 
-		jump_label_inc(&perf_swevent_enabled[event_id]);
+		static_key_slow_inc(&perf_swevent_enabled[event_id]);
 		event->destroy = sw_perf_event_destroy;
 	}
 
@@ -5120,6 +5239,12 @@ static int perf_tp_event_init(struct perf_event *event)
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return -ENOENT;
 
+	/*
+	 * no branch sampling for tracepoint events
+	 */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	err = perf_trace_init(event);
 	if (err)
 		return err;
@@ -5345,6 +5470,12 @@ static int cpu_clock_event_init(struct perf_event *event)
 	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
 		return -ENOENT;
 
+	/*
+	 * no branch sampling for software events
+	 */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	perf_swevent_init_hrtimer(event);
 
 	return 0;
@@ -5419,6 +5550,12 @@ static int task_clock_event_init(struct perf_event *event)
 	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
 		return -ENOENT;
 
+	/*
+	 * no branch sampling for software events
+	 */
+	if (has_branch_stack(event))
+		return -EOPNOTSUPP;
+
 	perf_swevent_init_hrtimer(event);
 
 	return 0;
@@ -5852,7 +5989,7 @@ done:
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_inc(&perf_sched_events.key);
+			static_key_slow_inc(&perf_sched_events.key);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_inc(&nr_mmap_events);
 		if (event->attr.comm)
@@ -5866,6 +6003,12 @@ done:
 				return ERR_PTR(err);
 			}
 		}
+		if (has_branch_stack(event)) {
+			static_key_slow_inc(&perf_sched_events.key);
+			if (!(event->attach_state & PERF_ATTACH_TASK))
+				atomic_inc(&per_cpu(perf_branch_stack_events,
+						    event->cpu));
+		}
 	}
 
 	return event;
@@ -5935,6 +6078,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 	if (attr->read_format & ~(PERF_FORMAT_MAX-1))
 		return -EINVAL;
 
+	if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
+		u64 mask = attr->branch_sample_type;
+
+		/* only using defined bits */
+		if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
+			return -EINVAL;
+
+		/* at least one branch bit must be set */
+		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
+			return -EINVAL;
+
+		/* kernel level capture: check permissions */
+		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		/* propagate priv level, when not set for branch */
+		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
+
+			/* exclude_kernel checked on syscall entry */
+			if (!attr->exclude_kernel)
+				mask |= PERF_SAMPLE_BRANCH_KERNEL;
+
+			if (!attr->exclude_user)
+				mask |= PERF_SAMPLE_BRANCH_USER;
+
+			if (!attr->exclude_hv)
+				mask |= PERF_SAMPLE_BRANCH_HV;
+			/*
+			 * adjust user setting (for HW filter setup)
+			 */
+			attr->branch_sample_type = mask;
+		}
+	}
 out:
 	return ret;
 
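
Seen from userspace, the checks above mean: branch_sample_type may only use defined bits, must request at least one branch type, kernel/hv capture is privilege-checked, and the priv level defaults to the event's exclude_* bits when none is given. A sketch of an attr setup exercising this path; the PERF_SAMPLE_BRANCH_* constants are the ones this series adds to perf_event.h, and the particular event and period chosen here are only an example:

	#include <string.h>
	#include <linux/perf_event.h>

	static void setup_branch_sampling(struct perf_event_attr *attr)
	{
		memset(attr, 0, sizeof(*attr));
		attr->size		= sizeof(*attr);
		attr->type		= PERF_TYPE_HARDWARE;
		attr->config		= PERF_COUNT_HW_CPU_CYCLES;
		attr->sample_period	= 100000;
		attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;

		/*
		 * Any taken branch, user level only.  Because a priv level is
		 * set explicitly, perf_copy_attr() does not propagate the
		 * exclude_* bits into branch_sample_type.
		 */
		attr->branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
					   PERF_SAMPLE_BRANCH_USER;

		/*
		 * With PERF_SAMPLE_BRANCH_ANY alone the priv level would be
		 * inherited from exclude_user/exclude_kernel/exclude_hv, and
		 * requesting PERF_SAMPLE_BRANCH_KERNEL or _HV is subject to
		 * the perf_paranoid_kernel()/CAP_SYS_ADMIN check above.
		 */
	}
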
@@ -6090,7 +6267,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		 * - that may need work on context switch
 		 */
 		atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
-		jump_label_inc(&perf_sched_events.key);
+		static_key_slow_inc(&perf_sched_events.key);
 	}
 
 	/*
@@ -6939,6 +7116,13 @@ void __init perf_event_init(void)
 
 	/* do not patch jump label more than once per second */
 	jump_label_rate_limit(&perf_sched_events, HZ);
+
+	/*
+	 * Build time assertion that we keep the data_head at the intended
+	 * location. IOW, validation we got the __reserved[] size right.
+	 */
+	BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
+		     != 1024);
 }
 
 static int __init perf_event_sysfs_init(void)
@@ -6970,8 +7154,7 @@ unlock:
 device_initcall(perf_event_sysfs_init);
 
 #ifdef CONFIG_CGROUP_PERF
-static struct cgroup_subsys_state *perf_cgroup_create(
-	struct cgroup_subsys *ss, struct cgroup *cont)
+static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
 {
 	struct perf_cgroup *jc;
 
@@ -6988,8 +7171,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(
 	return &jc->css;
 }
 
-static void perf_cgroup_destroy(struct cgroup_subsys *ss,
-				struct cgroup *cont)
+static void perf_cgroup_destroy(struct cgroup *cont)
 {
 	struct perf_cgroup *jc;
 	jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7005,8 +7187,7 @@ static int __perf_cgroup_move(void *info)
 	return 0;
 }
 
-static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-			       struct cgroup_taskset *tset)
+static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
 	struct task_struct *task;
 
@@ -7014,8 +7195,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 		task_function_call(task, __perf_cgroup_move, task);
 }
 
-static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
-			     struct cgroup *old_cgrp, struct task_struct *task)
+static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
+			     struct task_struct *task)
 {
 	/*
 	 * cgroup_exit() is called in the copy_process() failure path.
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 3330022a7ac1..bb38c4d3ee12 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
 	if (bp->attr.type != PERF_TYPE_BREAKPOINT)
 		return -ENOENT;
 
+	/*
+	 * no branch sampling for breakpoint events
+	 */
+	if (has_branch_stack(bp))
+		return -EOPNOTSUPP;
+
 	err = register_perf_hw_breakpoint(bp);
 	if (err)
 		return err;