path: root/kernel/perf_event.c
Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--	kernel/perf_event.c	736
1 file changed, 522 insertions(+), 214 deletions(-)
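For reference, the read_format sizing that the new perf_event__read_size() helper in this patch precomputes can be mirrored from user space. The sketch below is illustrative only: it uses the PERF_FORMAT_* flags from the perf ABI header, and the function and parameter names (read_buffer_size, nr_siblings) are chosen here for the example, not taken from the patch.

#include <stdio.h>
#include <linux/perf_event.h>	/* PERF_FORMAT_* flags */

/*
 * Mirror of the kernel's sizing rule: one u64 "value" per counter
 * (plus an optional id per counter), while the optional enabled/running
 * times and the group-member count are shared by the whole read buffer.
 */
static size_t read_buffer_size(__u64 read_format, int nr_siblings)
{
	size_t entry = sizeof(__u64);	/* value */
	size_t size = 0;
	int nr = 1;

	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		size += sizeof(__u64);
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		size += sizeof(__u64);
	if (read_format & PERF_FORMAT_ID)
		entry += sizeof(__u64);
	if (read_format & PERF_FORMAT_GROUP) {
		nr += nr_siblings;
		size += sizeof(__u64);	/* the "nr" field of the group read */
	}
	return size + entry * nr;
}

int main(void)
{
	__u64 fmt = PERF_FORMAT_GROUP | PERF_FORMAT_ID |
		    PERF_FORMAT_TOTAL_TIME_ENABLED;

	/* e.g. a group leader with two siblings */
	printf("%zu bytes\n", read_buffer_size(fmt, 2));
	return 0;
}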
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2870feee81dd..999835b6112b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -13,6 +13,7 @@
 #include <linux/mm.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
+#include <linux/idr.h>
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
@@ -21,7 +22,9 @@
 #include <linux/dcache.h>
 #include <linux/percpu.h>
 #include <linux/ptrace.h>
+#include <linux/reboot.h>
 #include <linux/vmstat.h>
+#include <linux/device.h>
 #include <linux/vmalloc.h>
 #include <linux/hardirq.h>
 #include <linux/rculist.h>
@@ -35,6 +38,12 @@
 
 #include <asm/irq_regs.h>
 
+enum event_type_t {
+	EVENT_FLEXIBLE = 0x1,
+	EVENT_PINNED = 0x2,
+	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
 atomic_t perf_task_events __read_mostly;
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -62,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
 
 static atomic64_t perf_event_id;
 
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+			      enum event_type_t event_type);
+
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+			     enum event_type_t event_type);
+
 void __weak perf_event_print_debug(void) { }
 
 extern __weak const char *perf_pmu_name(void)
@@ -69,6 +84,11 @@ extern __weak const char *perf_pmu_name(void)
 	return "pmu";
 }
 
+static inline u64 perf_clock(void)
+{
+	return local_clock();
+}
+
 void perf_pmu_disable(struct pmu *pmu)
 {
 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -133,6 +153,28 @@ static void unclone_ctx(struct perf_event_context *ctx)
 	}
 }
 
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+{
+	/*
+	 * only top level events have the pid namespace they were created in
+	 */
+	if (event->parent)
+		event = event->parent;
+
+	return task_tgid_nr_ns(p, event->ns);
+}
+
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+	/*
+	 * only top level events have the pid namespace they were created in
+	 */
+	if (event->parent)
+		event = event->parent;
+
+	return task_pid_nr_ns(p, event->ns);
+}
+
 /*
  * If we inherit events we want to return the parent event id
  * to userspace.
@@ -215,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 	put_ctx(ctx);
 }
 
-static inline u64 perf_clock(void)
-{
-	return local_clock();
-}
-
 /*
  * Update the record of the current time in a context.
  */
@@ -231,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx)
 	ctx->timestamp = now;
 }
 
+static u64 perf_event_time(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	return ctx ? ctx->time : 0;
+}
+
 /*
  * Update the total_time_enabled and total_time_running fields for a event.
  */
@@ -244,7 +287,7 @@ static void update_event_times(struct perf_event *event)
 		return;
 
 	if (ctx->is_active)
-		run_end = ctx->time;
+		run_end = perf_event_time(event);
 	else
 		run_end = event->tstamp_stopped;
 
@@ -253,7 +296,7 @@ static void update_event_times(struct perf_event *event)
 	if (event->state == PERF_EVENT_STATE_INACTIVE)
 		run_end = event->tstamp_stopped;
 	else
-		run_end = ctx->time;
+		run_end = perf_event_time(event);
 
 	event->total_time_running = run_end - event->tstamp_running;
 }
@@ -312,9 +355,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	ctx->nr_stat++;
 }
 
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__read_size(struct perf_event *event)
+{
+	int entry = sizeof(u64); /* value */
+	int size = 0;
+	int nr = 1;
+
+	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		size += sizeof(u64);
+
+	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		size += sizeof(u64);
+
+	if (event->attr.read_format & PERF_FORMAT_ID)
+		entry += sizeof(u64);
+
+	if (event->attr.read_format & PERF_FORMAT_GROUP) {
+		nr += event->group_leader->nr_siblings;
+		size += sizeof(u64);
+	}
+
+	size += entry * nr;
+	event->read_size = size;
+}
+
+static void perf_event__header_size(struct perf_event *event)
+{
+	struct perf_sample_data *data;
+	u64 sample_type = event->attr.sample_type;
+	u16 size = 0;
+
+	perf_event__read_size(event);
+
+	if (sample_type & PERF_SAMPLE_IP)
+		size += sizeof(data->ip);
+
+	if (sample_type & PERF_SAMPLE_ADDR)
+		size += sizeof(data->addr);
+
+	if (sample_type & PERF_SAMPLE_PERIOD)
+		size += sizeof(data->period);
+
+	if (sample_type & PERF_SAMPLE_READ)
+		size += event->read_size;
+
+	event->header_size = size;
+}
+
+static void perf_event__id_header_size(struct perf_event *event)
+{
+	struct perf_sample_data *data;
+	u64 sample_type = event->attr.sample_type;
+	u16 size = 0;
+
+	if (sample_type & PERF_SAMPLE_TID)
+		size += sizeof(data->tid_entry);
+
+	if (sample_type & PERF_SAMPLE_TIME)
+		size += sizeof(data->time);
+
+	if (sample_type & PERF_SAMPLE_ID)
+		size += sizeof(data->id);
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
+		size += sizeof(data->stream_id);
+
+	if (sample_type & PERF_SAMPLE_CPU)
+		size += sizeof(data->cpu_entry);
+
+	event->id_header_size = size;
+}
+
 static void perf_group_attach(struct perf_event *event)
 {
-	struct perf_event *group_leader = event->group_leader;
+	struct perf_event *group_leader = event->group_leader, *pos;
 
 	/*
 	 * We can have double attach due to group movement in perf_event_open.
@@ -333,6 +451,11 @@ static void perf_group_attach(struct perf_event *event)
 
 	list_add_tail(&event->group_entry, &group_leader->sibling_list);
 	group_leader->nr_siblings++;
+
+	perf_event__header_size(group_leader);
+
+	list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+		perf_event__header_size(pos);
 }
 
 /*
@@ -391,7 +514,7 @@ static void perf_group_detach(struct perf_event *event)
 	if (event->group_leader != event) {
 		list_del_init(&event->group_entry);
 		event->group_leader->nr_siblings--;
-		return;
+		goto out;
 	}
 
 	if (!list_empty(&event->group_entry))
@@ -410,6 +533,12 @@ static void perf_group_detach(struct perf_event *event)
 		/* Inherit group flags from the previous leader */
 		sibling->group_flags = event->group_flags;
 	}
+
+out:
+	perf_event__header_size(event->group_leader);
+
+	list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+		perf_event__header_size(tmp);
 }
 
 static inline int
@@ -423,6 +552,7 @@ event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
+	u64 tstamp = perf_event_time(event);
 	u64 delta;
 	/*
 	 * An event which could not be activated because of
@@ -434,7 +564,7 @@ event_sched_out(struct perf_event *event,
 	    && !event_filter_match(event)) {
 		delta = ctx->time - event->tstamp_stopped;
 		event->tstamp_running += delta;
-		event->tstamp_stopped = ctx->time;
+		event->tstamp_stopped = tstamp;
 	}
 
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -445,7 +575,7 @@ event_sched_out(struct perf_event *event,
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-	event->tstamp_stopped = ctx->time;
+	event->tstamp_stopped = tstamp;
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
@@ -657,6 +787,8 @@ event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
 		 struct perf_event_context *ctx)
 {
+	u64 tstamp = perf_event_time(event);
+
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
@@ -673,9 +805,9 @@ event_sched_in(struct perf_event *event,
 		return -EAGAIN;
 	}
 
-	event->tstamp_running += ctx->time - event->tstamp_stopped;
+	event->tstamp_running += tstamp - event->tstamp_stopped;
 
-	event->shadow_ctx_time = ctx->time - ctx->timestamp;
+	event->shadow_ctx_time = tstamp - ctx->timestamp;
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
@@ -787,11 +919,13 @@ static int group_can_go_on(struct perf_event *event,
 static void add_event_to_ctx(struct perf_event *event,
 			       struct perf_event_context *ctx)
 {
+	u64 tstamp = perf_event_time(event);
+
 	list_add_event(event, ctx);
 	perf_group_attach(event);
-	event->tstamp_enabled = ctx->time;
-	event->tstamp_running = ctx->time;
-	event->tstamp_stopped = ctx->time;
+	event->tstamp_enabled = tstamp;
+	event->tstamp_running = tstamp;
+	event->tstamp_stopped = tstamp;
 }
 
 /*
@@ -826,7 +960,7 @@ static void __perf_install_in_context(void *info)
 
 	add_event_to_ctx(event, ctx);
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event))
 		goto unlock;
 
 	/*
@@ -931,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 					struct perf_event_context *ctx)
 {
 	struct perf_event *sub;
+	u64 tstamp = perf_event_time(event);
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
-	event->tstamp_enabled = ctx->time - event->total_time_enabled;
+	event->tstamp_enabled = tstamp - event->total_time_enabled;
 	list_for_each_entry(sub, &event->sibling_list, group_entry) {
-		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
-			sub->tstamp_enabled =
-				ctx->time - sub->total_time_enabled;
-		}
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
 	}
 }
 
@@ -971,7 +1104,7 @@ static void __perf_event_enable(void *info)
 		goto unlock;
 	__perf_event_mark_enabled(event, ctx);
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event))
 		goto unlock;
 
 	/*
@@ -1073,7 +1206,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
 	/*
 	 * not supported on inherited events
 	 */
-	if (event->attr.inherit)
+	if (event->attr.inherit || !is_sampling_event(event))
 		return -EINVAL;
 
 	atomic_add(refresh, &event->event_limit);
@@ -1082,12 +1215,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
 	return 0;
 }
 
-enum event_type_t {
-	EVENT_FLEXIBLE = 0x1,
-	EVENT_PINNED = 0x2,
-	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
 static void ctx_sched_out(struct perf_event_context *ctx,
 			  struct perf_cpu_context *cpuctx,
 			  enum event_type_t event_type)
@@ -1324,7 +1451,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
-		if (event->cpu != -1 && event->cpu != smp_processor_id())
+		if (!event_filter_match(event))
 			continue;
 
 		if (group_can_go_on(event, cpuctx, 1))
@@ -1356,7 +1483,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of events:
 		 */
-		if (event->cpu != -1 && event->cpu != smp_processor_id())
+		if (!event_filter_match(event))
 			continue;
 
 		if (group_can_go_on(event, cpuctx, can_add_hw)) {
@@ -1583,7 +1710,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
 
-		if (event->cpu != -1 && event->cpu != smp_processor_id())
+		if (!event_filter_match(event))
 			continue;
 
 		hwc = &event->hw;
@@ -1774,11 +1901,12 @@ static void __perf_event_read(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	update_context_time(ctx);
+	if (ctx->is_active)
+		update_context_time(ctx);
 	update_event_times(event);
+	if (event->state == PERF_EVENT_STATE_ACTIVE)
+		event->pmu->read(event);
 	raw_spin_unlock(&ctx->lock);
-
-	event->pmu->read(event);
 }
 
 static inline u64 perf_event_count(struct perf_event *event)
@@ -1872,8 +2000,7 @@ static int alloc_callchain_buffers(void)
 	 * accessed from NMI. Use a temporary manual per cpu allocation
 	 * until that gets sorted out.
 	 */
-	size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
-		num_possible_cpus();
+	size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
 
 	entries = kzalloc(size, GFP_KERNEL);
 	if (!entries)
@@ -2074,13 +2201,6 @@ find_lively_task_by_vpid(pid_t vpid)
 	if (!task)
 		return ERR_PTR(-ESRCH);
 
-	/*
-	 * Can't attach events to a dying task.
-	 */
-	err = -ESRCH;
-	if (task->flags & PF_EXITING)
-		goto errout;
-
 	/* Reuse ptrace permission checks for now. */
 	err = -EACCES;
 	if (!ptrace_may_access(task, PTRACE_MODE_READ))
@@ -2101,14 +2221,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
 	unsigned long flags;
 	int ctxn, err;
 
-	if (!task && cpu != -1) {
+	if (!task) {
 		/* Must be root to operate on a CPU event: */
 		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
 			return ERR_PTR(-EACCES);
 
-		if (cpu < 0 || cpu >= nr_cpumask_bits)
-			return ERR_PTR(-EINVAL);
-
 		/*
 		 * We could be clever and allow to attach a event to an
 		 * offline CPU and activate it when the CPU comes up, but
@@ -2144,14 +2261,27 @@ retry:
 
 		get_ctx(ctx);
 
-		if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
-			/*
-			 * We raced with some other task; use
-			 * the context they set.
-			 */
+		err = 0;
+		mutex_lock(&task->perf_event_mutex);
+		/*
+		 * If it has already passed perf_event_exit_task().
+		 * we must see PF_EXITING, it takes this mutex too.
+		 */
+		if (task->flags & PF_EXITING)
+			err = -ESRCH;
+		else if (task->perf_event_ctxp[ctxn])
+			err = -EAGAIN;
+		else
+			rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+		mutex_unlock(&task->perf_event_mutex);
+
+		if (unlikely(err)) {
 			put_task_struct(task);
 			kfree(ctx);
-			goto retry;
+
+			if (err == -EAGAIN)
+				goto retry;
+			goto errout;
 		}
 	}
 
@@ -2289,31 +2419,6 @@ static int perf_release(struct inode *inode, struct file *file)
 	return perf_event_release_kernel(event);
 }
 
-static int perf_event_read_size(struct perf_event *event)
-{
-	int entry = sizeof(u64); /* value */
-	int size = 0;
-	int nr = 1;
-
-	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-		size += sizeof(u64);
-
-	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-		size += sizeof(u64);
-
-	if (event->attr.read_format & PERF_FORMAT_ID)
-		entry += sizeof(u64);
-
-	if (event->attr.read_format & PERF_FORMAT_GROUP) {
-		nr += event->group_leader->nr_siblings;
-		size += sizeof(u64);
-	}
-
-	size += entry * nr;
-
-	return size;
-}
-
 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 {
 	struct perf_event *child;
@@ -2428,7 +2533,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
 	if (event->state == PERF_EVENT_STATE_ERROR)
 		return 0;
 
-	if (count < perf_event_read_size(event))
+	if (count < event->read_size)
 		return -ENOSPC;
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2514,7 +2619,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
 	int ret = 0;
 	u64 value;
 
-	if (!event->attr.sample_period)
+	if (!is_sampling_event(event))
 		return -EINVAL;
 
 	if (copy_from_user(&value, arg, sizeof(value)))
@@ -3305,6 +3410,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
 	} while (len);
 }
 
+static void __perf_event_header__init_id(struct perf_event_header *header,
+					 struct perf_sample_data *data,
+					 struct perf_event *event)
+{
+	u64 sample_type = event->attr.sample_type;
+
+	data->type = sample_type;
+	header->size += event->id_header_size;
+
+	if (sample_type & PERF_SAMPLE_TID) {
+		/* namespace issues */
+		data->tid_entry.pid = perf_event_pid(event, current);
+		data->tid_entry.tid = perf_event_tid(event, current);
+	}
+
+	if (sample_type & PERF_SAMPLE_TIME)
+		data->time = perf_clock();
+
+	if (sample_type & PERF_SAMPLE_ID)
+		data->id = primary_event_id(event);
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
+		data->stream_id = event->id;
+
+	if (sample_type & PERF_SAMPLE_CPU) {
+		data->cpu_entry.cpu = raw_smp_processor_id();
+		data->cpu_entry.reserved = 0;
+	}
+}
+
+static void perf_event_header__init_id(struct perf_event_header *header,
+				       struct perf_sample_data *data,
+				       struct perf_event *event)
+{
+	if (event->attr.sample_id_all)
+		__perf_event_header__init_id(header, data, event);
+}
+
+static void __perf_event__output_id_sample(struct perf_output_handle *handle,
+					   struct perf_sample_data *data)
+{
+	u64 sample_type = data->type;
+
+	if (sample_type & PERF_SAMPLE_TID)
+		perf_output_put(handle, data->tid_entry);
+
+	if (sample_type & PERF_SAMPLE_TIME)
+		perf_output_put(handle, data->time);
+
+	if (sample_type & PERF_SAMPLE_ID)
+		perf_output_put(handle, data->id);
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
+		perf_output_put(handle, data->stream_id);
+
+	if (sample_type & PERF_SAMPLE_CPU)
+		perf_output_put(handle, data->cpu_entry);
+}
+
+static void perf_event__output_id_sample(struct perf_event *event,
+					 struct perf_output_handle *handle,
+					 struct perf_sample_data *sample)
+{
+	if (event->attr.sample_id_all)
+		__perf_event__output_id_sample(handle, sample);
+}
+
 int perf_output_begin(struct perf_output_handle *handle,
 		      struct perf_event *event, unsigned int size,
 		      int nmi, int sample)
@@ -3312,6 +3484,7 @@ int perf_output_begin(struct perf_output_handle *handle,
 	struct perf_buffer *buffer;
 	unsigned long tail, offset, head;
 	int have_lost;
+	struct perf_sample_data sample_data;
 	struct {
 		struct perf_event_header header;
 		u64			 id;
@@ -3338,8 +3511,12 @@ int perf_output_begin(struct perf_output_handle *handle,
 		goto out;
 
 	have_lost = local_read(&buffer->lost);
-	if (have_lost)
-		size += sizeof(lost_event);
+	if (have_lost) {
+		lost_event.header.size = sizeof(lost_event);
+		perf_event_header__init_id(&lost_event.header, &sample_data,
+					   event);
+		size += lost_event.header.size;
+	}
 
 	perf_output_get_handle(handle);
 
@@ -3370,11 +3547,11 @@ int perf_output_begin(struct perf_output_handle *handle,
 	if (have_lost) {
 		lost_event.header.type = PERF_RECORD_LOST;
 		lost_event.header.misc = 0;
-		lost_event.header.size = sizeof(lost_event);
 		lost_event.id = event->id;
 		lost_event.lost = local_xchg(&buffer->lost, 0);
 
 		perf_output_put(handle, lost_event);
+		perf_event__output_id_sample(event, handle, &sample_data);
 	}
 
 	return 0;
@@ -3407,28 +3584,6 @@ void perf_output_end(struct perf_output_handle *handle)
 	rcu_read_unlock();
 }
 
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
-{
-	/*
-	 * only top level events have the pid namespace they were created in
-	 */
-	if (event->parent)
-		event = event->parent;
-
-	return task_tgid_nr_ns(p, event->ns);
-}
-
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
-{
-	/*
-	 * only top level events have the pid namespace they were created in
-	 */
-	if (event->parent)
-		event = event->parent;
-
-	return task_pid_nr_ns(p, event->ns);
-}
-
 static void perf_output_read_one(struct perf_output_handle *handle,
 				 struct perf_event *event,
 				 u64 enabled, u64 running)
@@ -3603,61 +3758,16 @@ void perf_prepare_sample(struct perf_event_header *header,
 {
 	u64 sample_type = event->attr.sample_type;
 
-	data->type = sample_type;
-
 	header->type = PERF_RECORD_SAMPLE;
-	header->size = sizeof(*header);
+	header->size = sizeof(*header) + event->header_size;
 
 	header->misc = 0;
 	header->misc |= perf_misc_flags(regs);
 
-	if (sample_type & PERF_SAMPLE_IP) {
-		data->ip = perf_instruction_pointer(regs);
-
-		header->size += sizeof(data->ip);
-	}
-
-	if (sample_type & PERF_SAMPLE_TID) {
-		/* namespace issues */
-		data->tid_entry.pid = perf_event_pid(event, current);
-		data->tid_entry.tid = perf_event_tid(event, current);
-
-		header->size += sizeof(data->tid_entry);
-	}
-
-	if (sample_type & PERF_SAMPLE_TIME) {
-		data->time = perf_clock();
-
-		header->size += sizeof(data->time);
-	}
-
-	if (sample_type & PERF_SAMPLE_ADDR)
-		header->size += sizeof(data->addr);
-
-	if (sample_type & PERF_SAMPLE_ID) {
-		data->id = primary_event_id(event);
-
-		header->size += sizeof(data->id);
-	}
-
-	if (sample_type & PERF_SAMPLE_STREAM_ID) {
-		data->stream_id = event->id;
-
-		header->size += sizeof(data->stream_id);
-	}
-
-	if (sample_type & PERF_SAMPLE_CPU) {
-		data->cpu_entry.cpu = raw_smp_processor_id();
-		data->cpu_entry.reserved = 0;
-
-		header->size += sizeof(data->cpu_entry);
-	}
-
-	if (sample_type & PERF_SAMPLE_PERIOD)
-		header->size += sizeof(data->period);
+	__perf_event_header__init_id(header, data, event);
 
-	if (sample_type & PERF_SAMPLE_READ)
-		header->size += perf_event_read_size(event);
+	if (sample_type & PERF_SAMPLE_IP)
+		data->ip = perf_instruction_pointer(regs);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		int size = 1;
@@ -3722,23 +3832,26 @@ perf_event_read_event(struct perf_event *event,
 			struct task_struct *task)
 {
 	struct perf_output_handle handle;
+	struct perf_sample_data sample;
 	struct perf_read_event read_event = {
 		.header = {
 			.type = PERF_RECORD_READ,
 			.misc = 0,
-			.size = sizeof(read_event) + perf_event_read_size(event),
+			.size = sizeof(read_event) + event->read_size,
 		},
 		.pid = perf_event_pid(event, task),
 		.tid = perf_event_tid(event, task),
 	};
 	int ret;
 
+	perf_event_header__init_id(&read_event.header, &sample, event);
 	ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
 	if (ret)
 		return;
 
 	perf_output_put(&handle, read_event);
 	perf_output_read(&handle, event);
+	perf_event__output_id_sample(event, &handle, &sample);
 
 	perf_output_end(&handle);
 }
@@ -3768,14 +3881,16 @@ static void perf_event_task_output(struct perf_event *event,
 				  struct perf_task_event *task_event)
 {
 	struct perf_output_handle handle;
+	struct perf_sample_data	sample;
 	struct task_struct *task = task_event->task;
-	int size, ret;
+	int ret, size = task_event->event_id.header.size;
 
-	size = task_event->event_id.header.size;
-	ret = perf_output_begin(&handle, event, size, 0, 0);
+	perf_event_header__init_id(&task_event->event_id.header, &sample, event);
 
+	ret = perf_output_begin(&handle, event,
+				task_event->event_id.header.size, 0, 0);
 	if (ret)
-		return;
+		goto out;
 
 	task_event->event_id.pid = perf_event_pid(event, task);
 	task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3785,7 +3900,11 @@ static void perf_event_task_output(struct perf_event *event,
 
 	perf_output_put(&handle, task_event->event_id);
 
+	perf_event__output_id_sample(event, &handle, &sample);
+
 	perf_output_end(&handle);
+out:
+	task_event->event_id.header.size = size;
 }
 
 static int perf_event_task_match(struct perf_event *event)
@@ -3793,7 +3912,7 @@ static int perf_event_task_match(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event))
 		return 0;
 
 	if (event->attr.comm || event->attr.mmap ||
@@ -3900,11 +4019,16 @@ static void perf_event_comm_output(struct perf_event *event,
 				  struct perf_comm_event *comm_event)
 {
 	struct perf_output_handle handle;
+	struct perf_sample_data sample;
 	int size = comm_event->event_id.header.size;
-	int ret = perf_output_begin(&handle, event, size, 0, 0);
+	int ret;
+
+	perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
+	ret = perf_output_begin(&handle, event,
+				comm_event->event_id.header.size, 0, 0);
 
 	if (ret)
-		return;
+		goto out;
 
 	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
 	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
@@ -3912,7 +4036,12 @@ static void perf_event_comm_output(struct perf_event *event,
 	perf_output_put(&handle, comm_event->event_id);
 	perf_output_copy(&handle, comm_event->comm,
 				   comm_event->comm_size);
+
+	perf_event__output_id_sample(event, &handle, &sample);
+
 	perf_output_end(&handle);
+out:
+	comm_event->event_id.header.size = size;
 }
 
 static int perf_event_comm_match(struct perf_event *event)
@@ -3920,7 +4049,7 @@ static int perf_event_comm_match(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event))
 		return 0;
 
 	if (event->attr.comm)
@@ -3957,7 +4086,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 	comm_event->comm_size = size;
 
 	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
-
 	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
@@ -4038,11 +4166,15 @@ static void perf_event_mmap_output(struct perf_event *event,
 				   struct perf_mmap_event *mmap_event)
 {
 	struct perf_output_handle handle;
+	struct perf_sample_data sample;
 	int size = mmap_event->event_id.header.size;
-	int ret = perf_output_begin(&handle, event, size, 0, 0);
+	int ret;
 
+	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
+	ret = perf_output_begin(&handle, event,
+				mmap_event->event_id.header.size, 0, 0);
 	if (ret)
-		return;
+		goto out;
 
 	mmap_event->event_id.pid = perf_event_pid(event, current);
 	mmap_event->event_id.tid = perf_event_tid(event, current);
@@ -4050,7 +4182,12 @@ static void perf_event_mmap_output(struct perf_event *event,
 	perf_output_put(&handle, mmap_event->event_id);
 	perf_output_copy(&handle, mmap_event->file_name,
 				   mmap_event->file_size);
+
+	perf_event__output_id_sample(event, &handle, &sample);
+
 	perf_output_end(&handle);
+out:
+	mmap_event->event_id.header.size = size;
 }
 
 static int perf_event_mmap_match(struct perf_event *event,
@@ -4060,7 +4197,7 @@ static int perf_event_mmap_match(struct perf_event *event,
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event))
 		return 0;
 
 	if ((!executable && event->attr.mmap_data) ||
@@ -4205,6 +4342,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
 static void perf_log_throttle(struct perf_event *event, int enable)
 {
 	struct perf_output_handle handle;
+	struct perf_sample_data sample;
 	int ret;
 
 	struct {
@@ -4226,11 +4364,15 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 	if (enable)
 		throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
 
-	ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
+	perf_event_header__init_id(&throttle_event.header, &sample, event);
+
+	ret = perf_output_begin(&handle, event,
+				throttle_event.header.size, 1, 0);
 	if (ret)
 		return;
 
 	perf_output_put(&handle, throttle_event);
+	perf_event__output_id_sample(event, &handle, &sample);
 	perf_output_end(&handle);
 }
 
@@ -4246,6 +4388,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 	struct hw_perf_event *hwc = &event->hw;
 	int ret = 0;
 
+	/*
+	 * Non-sampling counters might still use the PMI to fold short
+	 * hardware counters, ignore those.
+	 */
+	if (unlikely(!is_sampling_event(event)))
+		return 0;
+
 	if (!throttle) {
 		hwc->interrupts++;
 	} else {
@@ -4391,7 +4540,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
 	if (!regs)
 		return;
 
-	if (!hwc->sample_period)
+	if (!is_sampling_event(event))
 		return;
 
 	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
@@ -4518,7 +4667,7 @@ int perf_swevent_get_recursion_context(void)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
-void inline perf_swevent_put_recursion_context(int rctx)
+inline void perf_swevent_put_recursion_context(int rctx)
 {
 	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
 
@@ -4554,7 +4703,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
 	struct hw_perf_event *hwc = &event->hw;
 	struct hlist_head *head;
 
-	if (hwc->sample_period) {
+	if (is_sampling_event(event)) {
 		hwc->last_period = hwc->sample_period;
 		perf_swevent_set_period(event);
 	}
@@ -4811,15 +4960,6 @@ static int perf_tp_event_init(struct perf_event *event)
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return -ENOENT;
 
-	/*
-	 * Raw tracepoint data is a severe data leak, only allow root to
-	 * have these.
-	 */
-	if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
-			perf_paranoid_tracepoint_raw() &&
-			!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	err = perf_trace_init(event);
 	if (err)
 		return err;
@@ -4842,7 +4982,7 @@ static struct pmu perf_tracepoint = {
 
 static inline void perf_tp_register(void)
 {
-	perf_pmu_register(&perf_tracepoint);
+	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4932,31 +5072,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 static void perf_swevent_start_hrtimer(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
+	s64 period;
+
+	if (!is_sampling_event(event))
+		return;
 
 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hwc->hrtimer.function = perf_swevent_hrtimer;
-	if (hwc->sample_period) {
-		s64 period = local64_read(&hwc->period_left);
 
-		if (period) {
-			if (period < 0)
-				period = 10000;
+	period = local64_read(&hwc->period_left);
+	if (period) {
+		if (period < 0)
+			period = 10000;
 
-			local64_set(&hwc->period_left, 0);
-		} else {
-			period = max_t(u64, 10000, hwc->sample_period);
-		}
-		__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(period), 0,
-				HRTIMER_MODE_REL_PINNED, 0);
-	}
+		local64_set(&hwc->period_left, 0);
+	} else {
+		period = max_t(u64, 10000, hwc->sample_period);
+	}
+	__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(period), 0,
+				HRTIMER_MODE_REL_PINNED, 0);
 }
 
 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
-	if (hwc->sample_period) {
+	if (is_sampling_event(event)) {
 		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
 		local64_set(&hwc->period_left, ktime_to_ns(remaining));
 
@@ -5184,8 +5326,63 @@ static void free_pmu_context(struct pmu *pmu)
 out:
 	mutex_unlock(&pmus_lock);
 }
+static struct idr pmu_idr;
+
+static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+
+	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+}
+
+static struct device_attribute pmu_dev_attrs[] = {
+	__ATTR_RO(type),
+	__ATTR_NULL,
+};
+
+static int pmu_bus_running;
+static struct bus_type pmu_bus = {
+	.name		= "event_source",
+	.dev_attrs	= pmu_dev_attrs,
+};
+
+static void pmu_dev_release(struct device *dev)
+{
+	kfree(dev);
+}
 
-int perf_pmu_register(struct pmu *pmu)
+static int pmu_dev_alloc(struct pmu *pmu)
+{
+	int ret = -ENOMEM;
+
+	pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
+	if (!pmu->dev)
+		goto out;
+
+	device_initialize(pmu->dev);
+	ret = dev_set_name(pmu->dev, "%s", pmu->name);
+	if (ret)
+		goto free_dev;
+
+	dev_set_drvdata(pmu->dev, pmu);
+	pmu->dev->bus = &pmu_bus;
+	pmu->dev->release = pmu_dev_release;
+	ret = device_add(pmu->dev);
+	if (ret)
+		goto free_dev;
+
+out:
+	return ret;
+
+free_dev:
+	put_device(pmu->dev);
+	goto out;
+}
+
+static struct lock_class_key cpuctx_mutex;
+
+int perf_pmu_register(struct pmu *pmu, char *name, int type)
 {
 	int cpu, ret;
 
@@ -5195,19 +5392,45 @@ int perf_pmu_register(struct pmu *pmu)
 	if (!pmu->pmu_disable_count)
 		goto unlock;
 
+	pmu->type = -1;
+	if (!name)
+		goto skip_type;
+	pmu->name = name;
+
+	if (type < 0) {
+		int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
+		if (!err)
+			goto free_pdc;
+
+		err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
+		if (err) {
+			ret = err;
+			goto free_pdc;
+		}
+	}
+	pmu->type = type;
+
+	if (pmu_bus_running) {
+		ret = pmu_dev_alloc(pmu);
+		if (ret)
+			goto free_idr;
+	}
+
+skip_type:
 	pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
 	if (pmu->pmu_cpu_context)
 		goto got_cpu_context;
 
 	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
 	if (!pmu->pmu_cpu_context)
-		goto free_pdc;
+		goto free_dev;
 
 	for_each_possible_cpu(cpu) {
 		struct perf_cpu_context *cpuctx;
 
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 		__perf_event_init_context(&cpuctx->ctx);
+		lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
 		cpuctx->ctx.type = cpu_context;
 		cpuctx->ctx.pmu = pmu;
 		cpuctx->jiffies_interval = 1;
@@ -5245,6 +5468,14 @@ unlock:
 
 	return ret;
 
+free_dev:
+	device_del(pmu->dev);
+	put_device(pmu->dev);
+
+free_idr:
+	if (pmu->type >= PERF_TYPE_MAX)
+		idr_remove(&pmu_idr, pmu->type);
+
 free_pdc:
 	free_percpu(pmu->pmu_disable_count);
 	goto unlock;
@@ -5264,6 +5495,10 @@ void perf_pmu_unregister(struct pmu *pmu)
 	synchronize_rcu();
 
 	free_percpu(pmu->pmu_disable_count);
+	if (pmu->type >= PERF_TYPE_MAX)
+		idr_remove(&pmu_idr, pmu->type);
+	device_del(pmu->dev);
+	put_device(pmu->dev);
 	free_pmu_context(pmu);
 }
 
@@ -5273,6 +5508,13 @@ struct pmu *perf_init_event(struct perf_event *event)
 	int idx;
 
 	idx = srcu_read_lock(&pmus_srcu);
+
+	rcu_read_lock();
+	pmu = idr_find(&pmu_idr, event->attr.type);
+	rcu_read_unlock();
+	if (pmu)
+		goto unlock;
+
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		int ret = pmu->event_init(event);
 		if (!ret)
@@ -5305,6 +5547,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	struct hw_perf_event *hwc;
 	long err;
 
+	if ((unsigned)cpu >= nr_cpu_ids) {
+		if (!task || cpu != -1)
+			return ERR_PTR(-EINVAL);
+	}
+
 	event = kzalloc(sizeof(*event), GFP_KERNEL);
 	if (!event)
 		return ERR_PTR(-ENOMEM);
@@ -5353,7 +5600,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	if (!overflow_handler && parent_event)
 		overflow_handler = parent_event->overflow_handler;
- 
+
 	event->overflow_handler = overflow_handler;
 
 	if (attr->disabled)
@@ -5738,6 +5985,12 @@ SYSCALL_DEFINE5(perf_event_open,
 	mutex_unlock(&current->perf_event_mutex);
 
 	/*
+	 * Precalculate sample_data sizes
+	 */
+	perf_event__header_size(event);
+	perf_event__id_header_size(event);
+
+	/*
 	 * Drop the reference on the group_event after placing the
 	 * new event on the sibling_list. This ensures destruction
 	 * of the group leader will find the pointer to itself in
@@ -5883,7 +6136,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	 * scheduled, so we are now safe from rescheduling changing
 	 * our context.
 	 */
-	child_ctx = child->perf_event_ctxp[ctxn];
+	child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
 	task_ctx_sched_out(child_ctx, EVENT_ALL);
 
 	/*
@@ -6090,6 +6343,12 @@ inherit_event(struct perf_event *parent_event,
 	child_event->overflow_handler = parent_event->overflow_handler;
 
 	/*
+	 * Precalculate sample_data sizes
+	 */
+	perf_event__header_size(child_event);
+	perf_event__id_header_size(child_event);
+
+	/*
 	 * Link it up in the child's context:
 	 */
 	raw_spin_lock_irqsave(&child_ctx->lock, flags);
@@ -6190,11 +6449,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 	unsigned long flags;
 	int ret = 0;
 
-	child->perf_event_ctxp[ctxn] = NULL;
-
-	mutex_init(&child->perf_event_mutex);
-	INIT_LIST_HEAD(&child->perf_event_list);
-
 	if (likely(!parent->perf_event_ctxp[ctxn]))
 		return 0;
 
@@ -6246,7 +6500,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 
 	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
 	parent_ctx->rotate_disable = 0;
-	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
 
 	child_ctx = child->perf_event_ctxp[ctxn];
 
@@ -6254,12 +6507,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 	/*
 	 * Mark the child context as a clone of the parent
 	 * context, or of whatever the parent is a clone of.
-	 * Note that if the parent is a clone, it could get
-	 * uncloned at any point, but that doesn't matter
-	 * because the list of events and the generation
-	 * count can't have changed since we took the mutex.
+	 *
+	 * Note that if the parent is a clone, the holding of
+	 * parent_ctx->lock avoids it from being uncloned.
 	 */
-	cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+	cloned_ctx = parent_ctx->parent_ctx;
 	if (cloned_ctx) {
 		child_ctx->parent_ctx = cloned_ctx;
 		child_ctx->parent_gen = parent_ctx->parent_gen;
@@ -6270,6 +6522,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 		get_ctx(child_ctx->parent_ctx);
 	}
 
+	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
 	mutex_unlock(&parent_ctx->mutex);
 
 	perf_unpin_context(parent_ctx);
@@ -6284,6 +6537,10 @@ int perf_event_init_task(struct task_struct *child)
 {
 	int ctxn, ret;
 
+	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+	mutex_init(&child->perf_event_mutex);
+	INIT_LIST_HEAD(&child->perf_event_list);
+
 	for_each_task_context_nr(ctxn) {
 		ret = perf_event_init_context(child, ctxn);
 		if (ret)
@@ -6320,7 +6577,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 	mutex_unlock(&swhash->hlist_mutex);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
 static void perf_pmu_rotate_stop(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -6374,6 +6631,26 @@ static void perf_event_exit_cpu(int cpu)
 static inline void perf_event_exit_cpu(int cpu) { }
 #endif
 
+static int
+perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		perf_event_exit_cpu(cpu);
+
+	return NOTIFY_OK;
+}
+
+/*
+ * Run the perf reboot notifier at the very last possible moment so that
+ * the generic watchdog code runs as long as possible.
+ */
+static struct notifier_block perf_reboot_notifier = {
+	.notifier_call = perf_reboot,
+	.priority = INT_MIN,
+};
+
 static int __cpuinit
 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 {
@@ -6402,14 +6679,45 @@ void __init perf_event_init(void)
 {
 	int ret;
 
+	idr_init(&pmu_idr);
+
 	perf_event_init_all_cpus();
 	init_srcu_struct(&pmus_srcu);
-	perf_pmu_register(&perf_swevent);
-	perf_pmu_register(&perf_cpu_clock);
-	perf_pmu_register(&perf_task_clock);
+	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
+	perf_pmu_register(&perf_cpu_clock, NULL, -1);
+	perf_pmu_register(&perf_task_clock, NULL, -1);
 	perf_tp_register();
 	perf_cpu_notifier(perf_cpu_notify);
+	register_reboot_notifier(&perf_reboot_notifier);
 
 	ret = init_hw_breakpoint();
 	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
 }
+
+static int __init perf_event_sysfs_init(void)
+{
+	struct pmu *pmu;
+	int ret;
+
+	mutex_lock(&pmus_lock);
+
+	ret = bus_register(&pmu_bus);
+	if (ret)
+		goto unlock;
+
+	list_for_each_entry(pmu, &pmus, entry) {
+		if (!pmu->name || pmu->type < 0)
+			continue;
+
+		ret = pmu_dev_alloc(pmu);
+		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
+	}
+	pmu_bus_running = 1;
+	ret = 0;
+
+unlock:
+	mutex_unlock(&pmus_lock);
+
+	return ret;
+}
+device_initcall(perf_event_sysfs_init);
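As a usage sketch: once a PMU is registered with a name, its type id is exported through the event_source bus and type_show() attribute added above, and user space can feed that value straight into perf_event_attr.type for perf_event_open(). The sysfs path layout follows the standard bus/device convention, and the "software" name plus PERF_COUNT_SW_CPU_CLOCK config below are illustrative choices, not mandated by the patch.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* Read /sys/bus/event_source/devices/<pmu>/type, the value produced
 * by the type_show() device attribute registered in this patch. */
static int read_pmu_type(const char *pmu)
{
	char path[256], buf[16];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/%s/type", pmu);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		return -1;
	}
	fclose(f);
	return atoi(buf);
}

int main(void)
{
	struct perf_event_attr attr;
	int type, fd;

	type = read_pmu_type("software");	/* name registered in perf_event_init() */
	if (type < 0)
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;			/* builtin or idr-allocated type id */
	attr.config = PERF_COUNT_SW_CPU_CLOCK;	/* config meaning is PMU-specific */

	/* count CPU clock for the calling task on any CPU */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;
	close(fd);
	return 0;
}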