Diffstat (limited to 'kernel/perf_event.c')
 -rw-r--r--  kernel/perf_event.c  890
 1 file changed, 654 insertions(+), 236 deletions(-)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 517d827f4982..126a302c481c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -13,6 +13,7 @@
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/idr.h>
16#include <linux/file.h> 17#include <linux/file.h>
17#include <linux/poll.h> 18#include <linux/poll.h>
18#include <linux/slab.h> 19#include <linux/slab.h>
@@ -21,7 +22,9 @@
21#include <linux/dcache.h> 22#include <linux/dcache.h>
22#include <linux/percpu.h> 23#include <linux/percpu.h>
23#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/reboot.h>
24#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h>
25#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
26#include <linux/hardirq.h> 29#include <linux/hardirq.h>
27#include <linux/rculist.h> 30#include <linux/rculist.h>
@@ -31,9 +34,16 @@
31#include <linux/kernel_stat.h> 34#include <linux/kernel_stat.h>
32#include <linux/perf_event.h> 35#include <linux/perf_event.h>
33#include <linux/ftrace_event.h> 36#include <linux/ftrace_event.h>
37#include <linux/hw_breakpoint.h>
34 38
35#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
36 40
41enum event_type_t {
42 EVENT_FLEXIBLE = 0x1,
43 EVENT_PINNED = 0x2,
44 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
45};
46
37atomic_t perf_task_events __read_mostly; 47atomic_t perf_task_events __read_mostly;
38static atomic_t nr_mmap_events __read_mostly; 48static atomic_t nr_mmap_events __read_mostly;
39static atomic_t nr_comm_events __read_mostly; 49static atomic_t nr_comm_events __read_mostly;
@@ -61,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
61 71
62static atomic64_t perf_event_id; 72static atomic64_t perf_event_id;
63 73
74static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type);
76
77static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
78 enum event_type_t event_type);
79
64void __weak perf_event_print_debug(void) { } 80void __weak perf_event_print_debug(void) { }
65 81
66extern __weak const char *perf_pmu_name(void) 82extern __weak const char *perf_pmu_name(void)
@@ -68,6 +84,11 @@ extern __weak const char *perf_pmu_name(void)
68 return "pmu"; 84 return "pmu";
69} 85}
70 86
87static inline u64 perf_clock(void)
88{
89 return local_clock();
90}
91
71void perf_pmu_disable(struct pmu *pmu) 92void perf_pmu_disable(struct pmu *pmu)
72{ 93{
73 int *count = this_cpu_ptr(pmu->pmu_disable_count); 94 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -132,6 +153,28 @@ static void unclone_ctx(struct perf_event_context *ctx)
132 } 153 }
133} 154}
134 155
156static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
157{
158 /*
159 * only top level events have the pid namespace they were created in
160 */
161 if (event->parent)
162 event = event->parent;
163
164 return task_tgid_nr_ns(p, event->ns);
165}
166
167static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
168{
169 /*
170 * only top level events have the pid namespace they were created in
171 */
172 if (event->parent)
173 event = event->parent;
174
175 return task_pid_nr_ns(p, event->ns);
176}
177
135/* 178/*
136 * If we inherit events we want to return the parent event id 179 * If we inherit events we want to return the parent event id
137 * to userspace. 180 * to userspace.
@@ -214,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
214 put_ctx(ctx); 257 put_ctx(ctx);
215} 258}
216 259
217static inline u64 perf_clock(void)
218{
219 return local_clock();
220}
221
222/* 260/*
223 * Update the record of the current time in a context. 261 * Update the record of the current time in a context.
224 */ 262 */
@@ -230,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx)
230 ctx->timestamp = now; 268 ctx->timestamp = now;
231} 269}
232 270
271static u64 perf_event_time(struct perf_event *event)
272{
273 struct perf_event_context *ctx = event->ctx;
274 return ctx ? ctx->time : 0;
275}
276
233/* 277/*
234 * Update the total_time_enabled and total_time_running fields for a event. 278 * Update the total_time_enabled and total_time_running fields for a event.
235 */ 279 */
@@ -243,7 +287,7 @@ static void update_event_times(struct perf_event *event)
243 return; 287 return;
244 288
245 if (ctx->is_active) 289 if (ctx->is_active)
246 run_end = ctx->time; 290 run_end = perf_event_time(event);
247 else 291 else
248 run_end = event->tstamp_stopped; 292 run_end = event->tstamp_stopped;
249 293
@@ -252,7 +296,7 @@ static void update_event_times(struct perf_event *event)
252 if (event->state == PERF_EVENT_STATE_INACTIVE) 296 if (event->state == PERF_EVENT_STATE_INACTIVE)
253 run_end = event->tstamp_stopped; 297 run_end = event->tstamp_stopped;
254 else 298 else
255 run_end = ctx->time; 299 run_end = perf_event_time(event);
256 300
257 event->total_time_running = run_end - event->tstamp_running; 301 event->total_time_running = run_end - event->tstamp_running;
258} 302}
@@ -311,9 +355,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
311 ctx->nr_stat++; 355 ctx->nr_stat++;
312} 356}
313 357
358/*
359 * Called at perf_event creation and when events are attached/detached from a
360 * group.
361 */
362static void perf_event__read_size(struct perf_event *event)
363{
364 int entry = sizeof(u64); /* value */
365 int size = 0;
366 int nr = 1;
367
368 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
369 size += sizeof(u64);
370
371 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
372 size += sizeof(u64);
373
374 if (event->attr.read_format & PERF_FORMAT_ID)
375 entry += sizeof(u64);
376
377 if (event->attr.read_format & PERF_FORMAT_GROUP) {
378 nr += event->group_leader->nr_siblings;
379 size += sizeof(u64);
380 }
381
382 size += entry * nr;
383 event->read_size = size;
384}
385
386static void perf_event__header_size(struct perf_event *event)
387{
388 struct perf_sample_data *data;
389 u64 sample_type = event->attr.sample_type;
390 u16 size = 0;
391
392 perf_event__read_size(event);
393
394 if (sample_type & PERF_SAMPLE_IP)
395 size += sizeof(data->ip);
396
397 if (sample_type & PERF_SAMPLE_ADDR)
398 size += sizeof(data->addr);
399
400 if (sample_type & PERF_SAMPLE_PERIOD)
401 size += sizeof(data->period);
402
403 if (sample_type & PERF_SAMPLE_READ)
404 size += event->read_size;
405
406 event->header_size = size;
407}
408
409static void perf_event__id_header_size(struct perf_event *event)
410{
411 struct perf_sample_data *data;
412 u64 sample_type = event->attr.sample_type;
413 u16 size = 0;
414
415 if (sample_type & PERF_SAMPLE_TID)
416 size += sizeof(data->tid_entry);
417
418 if (sample_type & PERF_SAMPLE_TIME)
419 size += sizeof(data->time);
420
421 if (sample_type & PERF_SAMPLE_ID)
422 size += sizeof(data->id);
423
424 if (sample_type & PERF_SAMPLE_STREAM_ID)
425 size += sizeof(data->stream_id);
426
427 if (sample_type & PERF_SAMPLE_CPU)
428 size += sizeof(data->cpu_entry);
429
430 event->id_header_size = size;
431}
432
314static void perf_group_attach(struct perf_event *event) 433static void perf_group_attach(struct perf_event *event)
315{ 434{
316 struct perf_event *group_leader = event->group_leader; 435 struct perf_event *group_leader = event->group_leader, *pos;
317 436
318 /* 437 /*
319 * We can have double attach due to group movement in perf_event_open. 438 * We can have double attach due to group movement in perf_event_open.
@@ -332,6 +451,11 @@ static void perf_group_attach(struct perf_event *event)
332 451
333 list_add_tail(&event->group_entry, &group_leader->sibling_list); 452 list_add_tail(&event->group_entry, &group_leader->sibling_list);
334 group_leader->nr_siblings++; 453 group_leader->nr_siblings++;
454
455 perf_event__header_size(group_leader);
456
457 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
458 perf_event__header_size(pos);
335} 459}
336 460
337/* 461/*
@@ -390,7 +514,7 @@ static void perf_group_detach(struct perf_event *event)
390 if (event->group_leader != event) { 514 if (event->group_leader != event) {
391 list_del_init(&event->group_entry); 515 list_del_init(&event->group_entry);
392 event->group_leader->nr_siblings--; 516 event->group_leader->nr_siblings--;
393 return; 517 goto out;
394 } 518 }
395 519
396 if (!list_empty(&event->group_entry)) 520 if (!list_empty(&event->group_entry))
@@ -409,6 +533,12 @@ static void perf_group_detach(struct perf_event *event)
409 /* Inherit group flags from the previous leader */ 533 /* Inherit group flags from the previous leader */
410 sibling->group_flags = event->group_flags; 534 sibling->group_flags = event->group_flags;
411 } 535 }
536
537out:
538 perf_event__header_size(event->group_leader);
539
540 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
541 perf_event__header_size(tmp);
412} 542}
413 543
414static inline int 544static inline int
@@ -422,6 +552,7 @@ event_sched_out(struct perf_event *event,
422 struct perf_cpu_context *cpuctx, 552 struct perf_cpu_context *cpuctx,
423 struct perf_event_context *ctx) 553 struct perf_event_context *ctx)
424{ 554{
555 u64 tstamp = perf_event_time(event);
425 u64 delta; 556 u64 delta;
426 /* 557 /*
427 * An event which could not be activated because of 558 * An event which could not be activated because of
@@ -433,7 +564,7 @@ event_sched_out(struct perf_event *event,
433 && !event_filter_match(event)) { 564 && !event_filter_match(event)) {
434 delta = ctx->time - event->tstamp_stopped; 565 delta = ctx->time - event->tstamp_stopped;
435 event->tstamp_running += delta; 566 event->tstamp_running += delta;
436 event->tstamp_stopped = ctx->time; 567 event->tstamp_stopped = tstamp;
437 } 568 }
438 569
439 if (event->state != PERF_EVENT_STATE_ACTIVE) 570 if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -444,7 +575,7 @@ event_sched_out(struct perf_event *event,
444 event->pending_disable = 0; 575 event->pending_disable = 0;
445 event->state = PERF_EVENT_STATE_OFF; 576 event->state = PERF_EVENT_STATE_OFF;
446 } 577 }
447 event->tstamp_stopped = ctx->time; 578 event->tstamp_stopped = tstamp;
448 event->pmu->del(event, 0); 579 event->pmu->del(event, 0);
449 event->oncpu = -1; 580 event->oncpu = -1;
450 581
@@ -656,6 +787,8 @@ event_sched_in(struct perf_event *event,
656 struct perf_cpu_context *cpuctx, 787 struct perf_cpu_context *cpuctx,
657 struct perf_event_context *ctx) 788 struct perf_event_context *ctx)
658{ 789{
790 u64 tstamp = perf_event_time(event);
791
659 if (event->state <= PERF_EVENT_STATE_OFF) 792 if (event->state <= PERF_EVENT_STATE_OFF)
660 return 0; 793 return 0;
661 794
@@ -672,7 +805,9 @@ event_sched_in(struct perf_event *event,
672 return -EAGAIN; 805 return -EAGAIN;
673 } 806 }
674 807
675 event->tstamp_running += ctx->time - event->tstamp_stopped; 808 event->tstamp_running += tstamp - event->tstamp_stopped;
809
810 event->shadow_ctx_time = tstamp - ctx->timestamp;
676 811
677 if (!is_software_event(event)) 812 if (!is_software_event(event))
678 cpuctx->active_oncpu++; 813 cpuctx->active_oncpu++;
@@ -784,11 +919,13 @@ static int group_can_go_on(struct perf_event *event,
784static void add_event_to_ctx(struct perf_event *event, 919static void add_event_to_ctx(struct perf_event *event,
785 struct perf_event_context *ctx) 920 struct perf_event_context *ctx)
786{ 921{
922 u64 tstamp = perf_event_time(event);
923
787 list_add_event(event, ctx); 924 list_add_event(event, ctx);
788 perf_group_attach(event); 925 perf_group_attach(event);
789 event->tstamp_enabled = ctx->time; 926 event->tstamp_enabled = tstamp;
790 event->tstamp_running = ctx->time; 927 event->tstamp_running = tstamp;
791 event->tstamp_stopped = ctx->time; 928 event->tstamp_stopped = tstamp;
792} 929}
793 930
794/* 931/*
@@ -823,7 +960,7 @@ static void __perf_install_in_context(void *info)
823 960
824 add_event_to_ctx(event, ctx); 961 add_event_to_ctx(event, ctx);
825 962
826 if (event->cpu != -1 && event->cpu != smp_processor_id()) 963 if (!event_filter_match(event))
827 goto unlock; 964 goto unlock;
828 965
829 /* 966 /*
@@ -928,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event,
928 struct perf_event_context *ctx) 1065 struct perf_event_context *ctx)
929{ 1066{
930 struct perf_event *sub; 1067 struct perf_event *sub;
1068 u64 tstamp = perf_event_time(event);
931 1069
932 event->state = PERF_EVENT_STATE_INACTIVE; 1070 event->state = PERF_EVENT_STATE_INACTIVE;
933 event->tstamp_enabled = ctx->time - event->total_time_enabled; 1071 event->tstamp_enabled = tstamp - event->total_time_enabled;
934 list_for_each_entry(sub, &event->sibling_list, group_entry) { 1072 list_for_each_entry(sub, &event->sibling_list, group_entry) {
935 if (sub->state >= PERF_EVENT_STATE_INACTIVE) { 1073 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
936 sub->tstamp_enabled = 1074 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
937 ctx->time - sub->total_time_enabled;
938 }
939 } 1075 }
940} 1076}
941 1077
@@ -968,7 +1104,7 @@ static void __perf_event_enable(void *info)
968 goto unlock; 1104 goto unlock;
969 __perf_event_mark_enabled(event, ctx); 1105 __perf_event_mark_enabled(event, ctx);
970 1106
971 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1107 if (!event_filter_match(event))
972 goto unlock; 1108 goto unlock;
973 1109
974 /* 1110 /*
@@ -1070,7 +1206,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1070 /* 1206 /*
1071 * not supported on inherited events 1207 * not supported on inherited events
1072 */ 1208 */
1073 if (event->attr.inherit) 1209 if (event->attr.inherit || !is_sampling_event(event))
1074 return -EINVAL; 1210 return -EINVAL;
1075 1211
1076 atomic_add(refresh, &event->event_limit); 1212 atomic_add(refresh, &event->event_limit);
@@ -1079,12 +1215,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1079 return 0; 1215 return 0;
1080} 1216}
1081 1217
1082enum event_type_t {
1083 EVENT_FLEXIBLE = 0x1,
1084 EVENT_PINNED = 0x2,
1085 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1086};
1087
1088static void ctx_sched_out(struct perf_event_context *ctx, 1218static void ctx_sched_out(struct perf_event_context *ctx,
1089 struct perf_cpu_context *cpuctx, 1219 struct perf_cpu_context *cpuctx,
1090 enum event_type_t event_type) 1220 enum event_type_t event_type)
@@ -1284,8 +1414,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
1284{ 1414{
1285 int ctxn; 1415 int ctxn;
1286 1416
1287 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1288
1289 for_each_task_context_nr(ctxn) 1417 for_each_task_context_nr(ctxn)
1290 perf_event_context_sched_out(task, ctxn, next); 1418 perf_event_context_sched_out(task, ctxn, next);
1291} 1419}
@@ -1323,7 +1451,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1323 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 1451 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1324 if (event->state <= PERF_EVENT_STATE_OFF) 1452 if (event->state <= PERF_EVENT_STATE_OFF)
1325 continue; 1453 continue;
1326 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1454 if (!event_filter_match(event))
1327 continue; 1455 continue;
1328 1456
1329 if (group_can_go_on(event, cpuctx, 1)) 1457 if (group_can_go_on(event, cpuctx, 1))
@@ -1355,7 +1483,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1355 * Listen to the 'cpu' scheduling filter constraint 1483 * Listen to the 'cpu' scheduling filter constraint
1356 * of events: 1484 * of events:
1357 */ 1485 */
1358 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1486 if (!event_filter_match(event))
1359 continue; 1487 continue;
1360 1488
1361 if (group_can_go_on(event, cpuctx, can_add_hw)) { 1489 if (group_can_go_on(event, cpuctx, can_add_hw)) {
@@ -1582,7 +1710,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
1582 if (event->state != PERF_EVENT_STATE_ACTIVE) 1710 if (event->state != PERF_EVENT_STATE_ACTIVE)
1583 continue; 1711 continue;
1584 1712
1585 if (event->cpu != -1 && event->cpu != smp_processor_id()) 1713 if (!event_filter_match(event))
1586 continue; 1714 continue;
1587 1715
1588 hwc = &event->hw; 1716 hwc = &event->hw;
@@ -1619,8 +1747,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
1619{ 1747{
1620 raw_spin_lock(&ctx->lock); 1748 raw_spin_lock(&ctx->lock);
1621 1749
1622 /* Rotate the first entry last of non-pinned groups */ 1750 /*
1623 list_rotate_left(&ctx->flexible_groups); 1751 * Rotate the first entry last of non-pinned groups. Rotation might be
1752 * disabled by the inheritance code.
1753 */
1754 if (!ctx->rotate_disable)
1755 list_rotate_left(&ctx->flexible_groups);
1624 1756
1625 raw_spin_unlock(&ctx->lock); 1757 raw_spin_unlock(&ctx->lock);
1626} 1758}
@@ -2069,13 +2201,6 @@ find_lively_task_by_vpid(pid_t vpid)
2069 if (!task) 2201 if (!task)
2070 return ERR_PTR(-ESRCH); 2202 return ERR_PTR(-ESRCH);
2071 2203
2072 /*
2073 * Can't attach events to a dying task.
2074 */
2075 err = -ESRCH;
2076 if (task->flags & PF_EXITING)
2077 goto errout;
2078
2079 /* Reuse ptrace permission checks for now. */ 2204 /* Reuse ptrace permission checks for now. */
2080 err = -EACCES; 2205 err = -EACCES;
2081 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2206 if (!ptrace_may_access(task, PTRACE_MODE_READ))
@@ -2096,14 +2221,11 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2096 unsigned long flags; 2221 unsigned long flags;
2097 int ctxn, err; 2222 int ctxn, err;
2098 2223
2099 if (!task && cpu != -1) { 2224 if (!task) {
2100 /* Must be root to operate on a CPU event: */ 2225 /* Must be root to operate on a CPU event: */
2101 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 2226 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2102 return ERR_PTR(-EACCES); 2227 return ERR_PTR(-EACCES);
2103 2228
2104 if (cpu < 0 || cpu >= nr_cpumask_bits)
2105 return ERR_PTR(-EINVAL);
2106
2107 /* 2229 /*
2108 * We could be clever and allow to attach a event to an 2230 * We could be clever and allow to attach a event to an
2109 * offline CPU and activate it when the CPU comes up, but 2231 * offline CPU and activate it when the CPU comes up, but
@@ -2139,14 +2261,27 @@ retry:
2139 2261
2140 get_ctx(ctx); 2262 get_ctx(ctx);
2141 2263
2142 if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { 2264 err = 0;
2143 /* 2265 mutex_lock(&task->perf_event_mutex);
2144 * We raced with some other task; use 2266 /*
2145 * the context they set. 2267 * If it has already passed perf_event_exit_task().
2146 */ 2268 * we must see PF_EXITING, it takes this mutex too.
2269 */
2270 if (task->flags & PF_EXITING)
2271 err = -ESRCH;
2272 else if (task->perf_event_ctxp[ctxn])
2273 err = -EAGAIN;
2274 else
2275 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2276 mutex_unlock(&task->perf_event_mutex);
2277
2278 if (unlikely(err)) {
2147 put_task_struct(task); 2279 put_task_struct(task);
2148 kfree(ctx); 2280 kfree(ctx);
2149 goto retry; 2281
2282 if (err == -EAGAIN)
2283 goto retry;
2284 goto errout;
2150 } 2285 }
2151 } 2286 }
2152 2287
@@ -2232,11 +2367,6 @@ int perf_event_release_kernel(struct perf_event *event)
2232 raw_spin_unlock_irq(&ctx->lock); 2367 raw_spin_unlock_irq(&ctx->lock);
2233 mutex_unlock(&ctx->mutex); 2368 mutex_unlock(&ctx->mutex);
2234 2369
2235 mutex_lock(&event->owner->perf_event_mutex);
2236 list_del_init(&event->owner_entry);
2237 mutex_unlock(&event->owner->perf_event_mutex);
2238 put_task_struct(event->owner);
2239
2240 free_event(event); 2370 free_event(event);
2241 2371
2242 return 0; 2372 return 0;
@@ -2249,35 +2379,44 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
2249static int perf_release(struct inode *inode, struct file *file) 2379static int perf_release(struct inode *inode, struct file *file)
2250{ 2380{
2251 struct perf_event *event = file->private_data; 2381 struct perf_event *event = file->private_data;
2382 struct task_struct *owner;
2252 2383
2253 file->private_data = NULL; 2384 file->private_data = NULL;
2254 2385
2255 return perf_event_release_kernel(event); 2386 rcu_read_lock();
2256} 2387 owner = ACCESS_ONCE(event->owner);
2257 2388 /*
2258static int perf_event_read_size(struct perf_event *event) 2389 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
2259{ 2390 * !owner it means the list deletion is complete and we can indeed
2260 int entry = sizeof(u64); /* value */ 2391 * free this event, otherwise we need to serialize on
2261 int size = 0; 2392 * owner->perf_event_mutex.
2262 int nr = 1; 2393 */
2263 2394 smp_read_barrier_depends();
2264 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 2395 if (owner) {
2265 size += sizeof(u64); 2396 /*
2266 2397 * Since delayed_put_task_struct() also drops the last
2267 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 2398 * task reference we can safely take a new reference
2268 size += sizeof(u64); 2399 * while holding the rcu_read_lock().
2269 2400 */
2270 if (event->attr.read_format & PERF_FORMAT_ID) 2401 get_task_struct(owner);
2271 entry += sizeof(u64);
2272
2273 if (event->attr.read_format & PERF_FORMAT_GROUP) {
2274 nr += event->group_leader->nr_siblings;
2275 size += sizeof(u64);
2276 } 2402 }
2403 rcu_read_unlock();
2277 2404
2278 size += entry * nr; 2405 if (owner) {
2406 mutex_lock(&owner->perf_event_mutex);
2407 /*
2408 * We have to re-check the event->owner field, if it is cleared
2409 * we raced with perf_event_exit_task(), acquiring the mutex
2410 * ensured they're done, and we can proceed with freeing the
2411 * event.
2412 */
2413 if (event->owner)
2414 list_del_init(&event->owner_entry);
2415 mutex_unlock(&owner->perf_event_mutex);
2416 put_task_struct(owner);
2417 }
2279 2418
2280 return size; 2419 return perf_event_release_kernel(event);
2281} 2420}
2282 2421
2283u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 2422u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -2394,7 +2533,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
2394 if (event->state == PERF_EVENT_STATE_ERROR) 2533 if (event->state == PERF_EVENT_STATE_ERROR)
2395 return 0; 2534 return 0;
2396 2535
2397 if (count < perf_event_read_size(event)) 2536 if (count < event->read_size)
2398 return -ENOSPC; 2537 return -ENOSPC;
2399 2538
2400 WARN_ON_ONCE(event->ctx->parent_ctx); 2539 WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2480,7 +2619,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2480 int ret = 0; 2619 int ret = 0;
2481 u64 value; 2620 u64 value;
2482 2621
2483 if (!event->attr.sample_period) 2622 if (!is_sampling_event(event))
2484 return -EINVAL; 2623 return -EINVAL;
2485 2624
2486 if (copy_from_user(&value, arg, sizeof(value))) 2625 if (copy_from_user(&value, arg, sizeof(value)))
@@ -3271,6 +3410,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
3271 } while (len); 3410 } while (len);
3272} 3411}
3273 3412
3413static void __perf_event_header__init_id(struct perf_event_header *header,
3414 struct perf_sample_data *data,
3415 struct perf_event *event)
3416{
3417 u64 sample_type = event->attr.sample_type;
3418
3419 data->type = sample_type;
3420 header->size += event->id_header_size;
3421
3422 if (sample_type & PERF_SAMPLE_TID) {
3423 /* namespace issues */
3424 data->tid_entry.pid = perf_event_pid(event, current);
3425 data->tid_entry.tid = perf_event_tid(event, current);
3426 }
3427
3428 if (sample_type & PERF_SAMPLE_TIME)
3429 data->time = perf_clock();
3430
3431 if (sample_type & PERF_SAMPLE_ID)
3432 data->id = primary_event_id(event);
3433
3434 if (sample_type & PERF_SAMPLE_STREAM_ID)
3435 data->stream_id = event->id;
3436
3437 if (sample_type & PERF_SAMPLE_CPU) {
3438 data->cpu_entry.cpu = raw_smp_processor_id();
3439 data->cpu_entry.reserved = 0;
3440 }
3441}
3442
3443static void perf_event_header__init_id(struct perf_event_header *header,
3444 struct perf_sample_data *data,
3445 struct perf_event *event)
3446{
3447 if (event->attr.sample_id_all)
3448 __perf_event_header__init_id(header, data, event);
3449}
3450
3451static void __perf_event__output_id_sample(struct perf_output_handle *handle,
3452 struct perf_sample_data *data)
3453{
3454 u64 sample_type = data->type;
3455
3456 if (sample_type & PERF_SAMPLE_TID)
3457 perf_output_put(handle, data->tid_entry);
3458
3459 if (sample_type & PERF_SAMPLE_TIME)
3460 perf_output_put(handle, data->time);
3461
3462 if (sample_type & PERF_SAMPLE_ID)
3463 perf_output_put(handle, data->id);
3464
3465 if (sample_type & PERF_SAMPLE_STREAM_ID)
3466 perf_output_put(handle, data->stream_id);
3467
3468 if (sample_type & PERF_SAMPLE_CPU)
3469 perf_output_put(handle, data->cpu_entry);
3470}
3471
3472static void perf_event__output_id_sample(struct perf_event *event,
3473 struct perf_output_handle *handle,
3474 struct perf_sample_data *sample)
3475{
3476 if (event->attr.sample_id_all)
3477 __perf_event__output_id_sample(handle, sample);
3478}
3479
3274int perf_output_begin(struct perf_output_handle *handle, 3480int perf_output_begin(struct perf_output_handle *handle,
3275 struct perf_event *event, unsigned int size, 3481 struct perf_event *event, unsigned int size,
3276 int nmi, int sample) 3482 int nmi, int sample)
@@ -3278,6 +3484,7 @@ int perf_output_begin(struct perf_output_handle *handle,
3278 struct perf_buffer *buffer; 3484 struct perf_buffer *buffer;
3279 unsigned long tail, offset, head; 3485 unsigned long tail, offset, head;
3280 int have_lost; 3486 int have_lost;
3487 struct perf_sample_data sample_data;
3281 struct { 3488 struct {
3282 struct perf_event_header header; 3489 struct perf_event_header header;
3283 u64 id; 3490 u64 id;
@@ -3304,8 +3511,12 @@ int perf_output_begin(struct perf_output_handle *handle,
3304 goto out; 3511 goto out;
3305 3512
3306 have_lost = local_read(&buffer->lost); 3513 have_lost = local_read(&buffer->lost);
3307 if (have_lost) 3514 if (have_lost) {
3308 size += sizeof(lost_event); 3515 lost_event.header.size = sizeof(lost_event);
3516 perf_event_header__init_id(&lost_event.header, &sample_data,
3517 event);
3518 size += lost_event.header.size;
3519 }
3309 3520
3310 perf_output_get_handle(handle); 3521 perf_output_get_handle(handle);
3311 3522
@@ -3336,11 +3547,11 @@ int perf_output_begin(struct perf_output_handle *handle,
3336 if (have_lost) { 3547 if (have_lost) {
3337 lost_event.header.type = PERF_RECORD_LOST; 3548 lost_event.header.type = PERF_RECORD_LOST;
3338 lost_event.header.misc = 0; 3549 lost_event.header.misc = 0;
3339 lost_event.header.size = sizeof(lost_event);
3340 lost_event.id = event->id; 3550 lost_event.id = event->id;
3341 lost_event.lost = local_xchg(&buffer->lost, 0); 3551 lost_event.lost = local_xchg(&buffer->lost, 0);
3342 3552
3343 perf_output_put(handle, lost_event); 3553 perf_output_put(handle, lost_event);
3554 perf_event__output_id_sample(event, handle, &sample_data);
3344 } 3555 }
3345 3556
3346 return 0; 3557 return 0;
@@ -3373,30 +3584,9 @@ void perf_output_end(struct perf_output_handle *handle)
3373 rcu_read_unlock(); 3584 rcu_read_unlock();
3374} 3585}
3375 3586
3376static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
3377{
3378 /*
3379 * only top level events have the pid namespace they were created in
3380 */
3381 if (event->parent)
3382 event = event->parent;
3383
3384 return task_tgid_nr_ns(p, event->ns);
3385}
3386
3387static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
3388{
3389 /*
3390 * only top level events have the pid namespace they were created in
3391 */
3392 if (event->parent)
3393 event = event->parent;
3394
3395 return task_pid_nr_ns(p, event->ns);
3396}
3397
3398static void perf_output_read_one(struct perf_output_handle *handle, 3587static void perf_output_read_one(struct perf_output_handle *handle,
3399 struct perf_event *event) 3588 struct perf_event *event,
3589 u64 enabled, u64 running)
3400{ 3590{
3401 u64 read_format = event->attr.read_format; 3591 u64 read_format = event->attr.read_format;
3402 u64 values[4]; 3592 u64 values[4];
@@ -3404,11 +3594,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3404 3594
3405 values[n++] = perf_event_count(event); 3595 values[n++] = perf_event_count(event);
3406 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 3596 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
3407 values[n++] = event->total_time_enabled + 3597 values[n++] = enabled +
3408 atomic64_read(&event->child_total_time_enabled); 3598 atomic64_read(&event->child_total_time_enabled);
3409 } 3599 }
3410 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 3600 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
3411 values[n++] = event->total_time_running + 3601 values[n++] = running +
3412 atomic64_read(&event->child_total_time_running); 3602 atomic64_read(&event->child_total_time_running);
3413 } 3603 }
3414 if (read_format & PERF_FORMAT_ID) 3604 if (read_format & PERF_FORMAT_ID)
@@ -3421,7 +3611,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
3421 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. 3611 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
3422 */ 3612 */
3423static void perf_output_read_group(struct perf_output_handle *handle, 3613static void perf_output_read_group(struct perf_output_handle *handle,
3424 struct perf_event *event) 3614 struct perf_event *event,
3615 u64 enabled, u64 running)
3425{ 3616{
3426 struct perf_event *leader = event->group_leader, *sub; 3617 struct perf_event *leader = event->group_leader, *sub;
3427 u64 read_format = event->attr.read_format; 3618 u64 read_format = event->attr.read_format;
@@ -3431,10 +3622,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3431 values[n++] = 1 + leader->nr_siblings; 3622 values[n++] = 1 + leader->nr_siblings;
3432 3623
3433 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) 3624 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3434 values[n++] = leader->total_time_enabled; 3625 values[n++] = enabled;
3435 3626
3436 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) 3627 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3437 values[n++] = leader->total_time_running; 3628 values[n++] = running;
3438 3629
3439 if (leader != event) 3630 if (leader != event)
3440 leader->pmu->read(leader); 3631 leader->pmu->read(leader);
@@ -3459,13 +3650,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3459 } 3650 }
3460} 3651}
3461 3652
3653#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
3654 PERF_FORMAT_TOTAL_TIME_RUNNING)
3655
3462static void perf_output_read(struct perf_output_handle *handle, 3656static void perf_output_read(struct perf_output_handle *handle,
3463 struct perf_event *event) 3657 struct perf_event *event)
3464{ 3658{
3659 u64 enabled = 0, running = 0, now, ctx_time;
3660 u64 read_format = event->attr.read_format;
3661
3662 /*
3663 * compute total_time_enabled, total_time_running
3664 * based on snapshot values taken when the event
3665 * was last scheduled in.
3666 *
3667 * we cannot simply called update_context_time()
3668 * because of locking issue as we are called in
3669 * NMI context
3670 */
3671 if (read_format & PERF_FORMAT_TOTAL_TIMES) {
3672 now = perf_clock();
3673 ctx_time = event->shadow_ctx_time + now;
3674 enabled = ctx_time - event->tstamp_enabled;
3675 running = ctx_time - event->tstamp_running;
3676 }
3677
3465 if (event->attr.read_format & PERF_FORMAT_GROUP) 3678 if (event->attr.read_format & PERF_FORMAT_GROUP)
3466 perf_output_read_group(handle, event); 3679 perf_output_read_group(handle, event, enabled, running);
3467 else 3680 else
3468 perf_output_read_one(handle, event); 3681 perf_output_read_one(handle, event, enabled, running);
3469} 3682}
3470 3683
3471void perf_output_sample(struct perf_output_handle *handle, 3684void perf_output_sample(struct perf_output_handle *handle,
@@ -3545,61 +3758,16 @@ void perf_prepare_sample(struct perf_event_header *header,
3545{ 3758{
3546 u64 sample_type = event->attr.sample_type; 3759 u64 sample_type = event->attr.sample_type;
3547 3760
3548 data->type = sample_type;
3549
3550 header->type = PERF_RECORD_SAMPLE; 3761 header->type = PERF_RECORD_SAMPLE;
3551 header->size = sizeof(*header); 3762 header->size = sizeof(*header) + event->header_size;
3552 3763
3553 header->misc = 0; 3764 header->misc = 0;
3554 header->misc |= perf_misc_flags(regs); 3765 header->misc |= perf_misc_flags(regs);
3555 3766
3556 if (sample_type & PERF_SAMPLE_IP) { 3767 __perf_event_header__init_id(header, data, event);
3557 data->ip = perf_instruction_pointer(regs);
3558
3559 header->size += sizeof(data->ip);
3560 }
3561
3562 if (sample_type & PERF_SAMPLE_TID) {
3563 /* namespace issues */
3564 data->tid_entry.pid = perf_event_pid(event, current);
3565 data->tid_entry.tid = perf_event_tid(event, current);
3566
3567 header->size += sizeof(data->tid_entry);
3568 }
3569
3570 if (sample_type & PERF_SAMPLE_TIME) {
3571 data->time = perf_clock();
3572
3573 header->size += sizeof(data->time);
3574 }
3575
3576 if (sample_type & PERF_SAMPLE_ADDR)
3577 header->size += sizeof(data->addr);
3578
3579 if (sample_type & PERF_SAMPLE_ID) {
3580 data->id = primary_event_id(event);
3581
3582 header->size += sizeof(data->id);
3583 }
3584 3768
3585 if (sample_type & PERF_SAMPLE_STREAM_ID) { 3769 if (sample_type & PERF_SAMPLE_IP)
3586 data->stream_id = event->id; 3770 data->ip = perf_instruction_pointer(regs);
3587
3588 header->size += sizeof(data->stream_id);
3589 }
3590
3591 if (sample_type & PERF_SAMPLE_CPU) {
3592 data->cpu_entry.cpu = raw_smp_processor_id();
3593 data->cpu_entry.reserved = 0;
3594
3595 header->size += sizeof(data->cpu_entry);
3596 }
3597
3598 if (sample_type & PERF_SAMPLE_PERIOD)
3599 header->size += sizeof(data->period);
3600
3601 if (sample_type & PERF_SAMPLE_READ)
3602 header->size += perf_event_read_size(event);
3603 3771
3604 if (sample_type & PERF_SAMPLE_CALLCHAIN) { 3772 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
3605 int size = 1; 3773 int size = 1;
@@ -3664,23 +3832,26 @@ perf_event_read_event(struct perf_event *event,
3664 struct task_struct *task) 3832 struct task_struct *task)
3665{ 3833{
3666 struct perf_output_handle handle; 3834 struct perf_output_handle handle;
3835 struct perf_sample_data sample;
3667 struct perf_read_event read_event = { 3836 struct perf_read_event read_event = {
3668 .header = { 3837 .header = {
3669 .type = PERF_RECORD_READ, 3838 .type = PERF_RECORD_READ,
3670 .misc = 0, 3839 .misc = 0,
3671 .size = sizeof(read_event) + perf_event_read_size(event), 3840 .size = sizeof(read_event) + event->read_size,
3672 }, 3841 },
3673 .pid = perf_event_pid(event, task), 3842 .pid = perf_event_pid(event, task),
3674 .tid = perf_event_tid(event, task), 3843 .tid = perf_event_tid(event, task),
3675 }; 3844 };
3676 int ret; 3845 int ret;
3677 3846
3847 perf_event_header__init_id(&read_event.header, &sample, event);
3678 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); 3848 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3679 if (ret) 3849 if (ret)
3680 return; 3850 return;
3681 3851
3682 perf_output_put(&handle, read_event); 3852 perf_output_put(&handle, read_event);
3683 perf_output_read(&handle, event); 3853 perf_output_read(&handle, event);
3854 perf_event__output_id_sample(event, &handle, &sample);
3684 3855
3685 perf_output_end(&handle); 3856 perf_output_end(&handle);
3686} 3857}
@@ -3710,14 +3881,16 @@ static void perf_event_task_output(struct perf_event *event,
3710 struct perf_task_event *task_event) 3881 struct perf_task_event *task_event)
3711{ 3882{
3712 struct perf_output_handle handle; 3883 struct perf_output_handle handle;
3884 struct perf_sample_data sample;
3713 struct task_struct *task = task_event->task; 3885 struct task_struct *task = task_event->task;
3714 int size, ret; 3886 int ret, size = task_event->event_id.header.size;
3715 3887
3716 size = task_event->event_id.header.size; 3888 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
3717 ret = perf_output_begin(&handle, event, size, 0, 0);
3718 3889
3890 ret = perf_output_begin(&handle, event,
3891 task_event->event_id.header.size, 0, 0);
3719 if (ret) 3892 if (ret)
3720 return; 3893 goto out;
3721 3894
3722 task_event->event_id.pid = perf_event_pid(event, task); 3895 task_event->event_id.pid = perf_event_pid(event, task);
3723 task_event->event_id.ppid = perf_event_pid(event, current); 3896 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3727,7 +3900,11 @@ static void perf_event_task_output(struct perf_event *event,
3727 3900
3728 perf_output_put(&handle, task_event->event_id); 3901 perf_output_put(&handle, task_event->event_id);
3729 3902
3903 perf_event__output_id_sample(event, &handle, &sample);
3904
3730 perf_output_end(&handle); 3905 perf_output_end(&handle);
3906out:
3907 task_event->event_id.header.size = size;
3731} 3908}
3732 3909
3733static int perf_event_task_match(struct perf_event *event) 3910static int perf_event_task_match(struct perf_event *event)
@@ -3735,7 +3912,7 @@ static int perf_event_task_match(struct perf_event *event)
3735 if (event->state < PERF_EVENT_STATE_INACTIVE) 3912 if (event->state < PERF_EVENT_STATE_INACTIVE)
3736 return 0; 3913 return 0;
3737 3914
3738 if (event->cpu != -1 && event->cpu != smp_processor_id()) 3915 if (!event_filter_match(event))
3739 return 0; 3916 return 0;
3740 3917
3741 if (event->attr.comm || event->attr.mmap || 3918 if (event->attr.comm || event->attr.mmap ||
@@ -3766,6 +3943,8 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3766 rcu_read_lock(); 3943 rcu_read_lock();
3767 list_for_each_entry_rcu(pmu, &pmus, entry) { 3944 list_for_each_entry_rcu(pmu, &pmus, entry) {
3768 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 3945 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
3946 if (cpuctx->active_pmu != pmu)
3947 goto next;
3769 perf_event_task_ctx(&cpuctx->ctx, task_event); 3948 perf_event_task_ctx(&cpuctx->ctx, task_event);
3770 3949
3771 ctx = task_event->task_ctx; 3950 ctx = task_event->task_ctx;
@@ -3840,11 +4019,16 @@ static void perf_event_comm_output(struct perf_event *event,
3840 struct perf_comm_event *comm_event) 4019 struct perf_comm_event *comm_event)
3841{ 4020{
3842 struct perf_output_handle handle; 4021 struct perf_output_handle handle;
4022 struct perf_sample_data sample;
3843 int size = comm_event->event_id.header.size; 4023 int size = comm_event->event_id.header.size;
3844 int ret = perf_output_begin(&handle, event, size, 0, 0); 4024 int ret;
4025
4026 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4027 ret = perf_output_begin(&handle, event,
4028 comm_event->event_id.header.size, 0, 0);
3845 4029
3846 if (ret) 4030 if (ret)
3847 return; 4031 goto out;
3848 4032
3849 comm_event->event_id.pid = perf_event_pid(event, comm_event->task); 4033 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3850 comm_event->event_id.tid = perf_event_tid(event, comm_event->task); 4034 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
@@ -3852,7 +4036,12 @@ static void perf_event_comm_output(struct perf_event *event,
3852 perf_output_put(&handle, comm_event->event_id); 4036 perf_output_put(&handle, comm_event->event_id);
3853 perf_output_copy(&handle, comm_event->comm, 4037 perf_output_copy(&handle, comm_event->comm,
3854 comm_event->comm_size); 4038 comm_event->comm_size);
4039
4040 perf_event__output_id_sample(event, &handle, &sample);
4041
3855 perf_output_end(&handle); 4042 perf_output_end(&handle);
4043out:
4044 comm_event->event_id.header.size = size;
3856} 4045}
3857 4046
3858static int perf_event_comm_match(struct perf_event *event) 4047static int perf_event_comm_match(struct perf_event *event)
@@ -3860,7 +4049,7 @@ static int perf_event_comm_match(struct perf_event *event)
3860 if (event->state < PERF_EVENT_STATE_INACTIVE) 4049 if (event->state < PERF_EVENT_STATE_INACTIVE)
3861 return 0; 4050 return 0;
3862 4051
3863 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4052 if (!event_filter_match(event))
3864 return 0; 4053 return 0;
3865 4054
3866 if (event->attr.comm) 4055 if (event->attr.comm)
@@ -3897,10 +4086,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3897 comm_event->comm_size = size; 4086 comm_event->comm_size = size;
3898 4087
3899 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 4088 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3900
3901 rcu_read_lock(); 4089 rcu_read_lock();
3902 list_for_each_entry_rcu(pmu, &pmus, entry) { 4090 list_for_each_entry_rcu(pmu, &pmus, entry) {
3903 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4091 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4092 if (cpuctx->active_pmu != pmu)
4093 goto next;
3904 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 4094 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3905 4095
3906 ctxn = pmu->task_ctx_nr; 4096 ctxn = pmu->task_ctx_nr;
@@ -3976,11 +4166,15 @@ static void perf_event_mmap_output(struct perf_event *event,
3976 struct perf_mmap_event *mmap_event) 4166 struct perf_mmap_event *mmap_event)
3977{ 4167{
3978 struct perf_output_handle handle; 4168 struct perf_output_handle handle;
4169 struct perf_sample_data sample;
3979 int size = mmap_event->event_id.header.size; 4170 int size = mmap_event->event_id.header.size;
3980 int ret = perf_output_begin(&handle, event, size, 0, 0); 4171 int ret;
3981 4172
4173 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4174 ret = perf_output_begin(&handle, event,
4175 mmap_event->event_id.header.size, 0, 0);
3982 if (ret) 4176 if (ret)
3983 return; 4177 goto out;
3984 4178
3985 mmap_event->event_id.pid = perf_event_pid(event, current); 4179 mmap_event->event_id.pid = perf_event_pid(event, current);
3986 mmap_event->event_id.tid = perf_event_tid(event, current); 4180 mmap_event->event_id.tid = perf_event_tid(event, current);
@@ -3988,7 +4182,12 @@ static void perf_event_mmap_output(struct perf_event *event,
3988 perf_output_put(&handle, mmap_event->event_id); 4182 perf_output_put(&handle, mmap_event->event_id);
3989 perf_output_copy(&handle, mmap_event->file_name, 4183 perf_output_copy(&handle, mmap_event->file_name,
3990 mmap_event->file_size); 4184 mmap_event->file_size);
4185
4186 perf_event__output_id_sample(event, &handle, &sample);
4187
3991 perf_output_end(&handle); 4188 perf_output_end(&handle);
4189out:
4190 mmap_event->event_id.header.size = size;
3992} 4191}
3993 4192
3994static int perf_event_mmap_match(struct perf_event *event, 4193static int perf_event_mmap_match(struct perf_event *event,
@@ -3998,7 +4197,7 @@ static int perf_event_mmap_match(struct perf_event *event,
3998 if (event->state < PERF_EVENT_STATE_INACTIVE) 4197 if (event->state < PERF_EVENT_STATE_INACTIVE)
3999 return 0; 4198 return 0;
4000 4199
4001 if (event->cpu != -1 && event->cpu != smp_processor_id()) 4200 if (!event_filter_match(event))
4002 return 0; 4201 return 0;
4003 4202
4004 if ((!executable && event->attr.mmap_data) || 4203 if ((!executable && event->attr.mmap_data) ||
@@ -4086,6 +4285,8 @@ got_name:
4086 rcu_read_lock(); 4285 rcu_read_lock();
4087 list_for_each_entry_rcu(pmu, &pmus, entry) { 4286 list_for_each_entry_rcu(pmu, &pmus, entry) {
4088 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); 4287 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4288 if (cpuctx->active_pmu != pmu)
4289 goto next;
4089 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, 4290 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
4090 vma->vm_flags & VM_EXEC); 4291 vma->vm_flags & VM_EXEC);
4091 4292
@@ -4141,6 +4342,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
4141static void perf_log_throttle(struct perf_event *event, int enable) 4342static void perf_log_throttle(struct perf_event *event, int enable)
4142{ 4343{
4143 struct perf_output_handle handle; 4344 struct perf_output_handle handle;
4345 struct perf_sample_data sample;
4144 int ret; 4346 int ret;
4145 4347
4146 struct { 4348 struct {
@@ -4162,11 +4364,15 @@ static void perf_log_throttle(struct perf_event *event, int enable)
4162 if (enable) 4364 if (enable)
4163 throttle_event.header.type = PERF_RECORD_UNTHROTTLE; 4365 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
4164 4366
4165 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); 4367 perf_event_header__init_id(&throttle_event.header, &sample, event);
4368
4369 ret = perf_output_begin(&handle, event,
4370 throttle_event.header.size, 1, 0);
4166 if (ret) 4371 if (ret)
4167 return; 4372 return;
4168 4373
4169 perf_output_put(&handle, throttle_event); 4374 perf_output_put(&handle, throttle_event);
4375 perf_event__output_id_sample(event, &handle, &sample);
4170 perf_output_end(&handle); 4376 perf_output_end(&handle);
4171} 4377}
4172 4378
@@ -4182,6 +4388,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4182 struct hw_perf_event *hwc = &event->hw; 4388 struct hw_perf_event *hwc = &event->hw;
4183 int ret = 0; 4389 int ret = 0;
4184 4390
4391 /*
4392 * Non-sampling counters might still use the PMI to fold short
4393 * hardware counters, ignore those.
4394 */
4395 if (unlikely(!is_sampling_event(event)))
4396 return 0;
4397
4185 if (!throttle) { 4398 if (!throttle) {
4186 hwc->interrupts++; 4399 hwc->interrupts++;
4187 } else { 4400 } else {
@@ -4327,7 +4540,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4327 if (!regs) 4540 if (!regs)
4328 return; 4541 return;
4329 4542
4330 if (!hwc->sample_period) 4543 if (!is_sampling_event(event))
4331 return; 4544 return;
4332 4545
4333 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4546 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
@@ -4454,7 +4667,7 @@ int perf_swevent_get_recursion_context(void)
4454} 4667}
4455EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); 4668EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
4456 4669
4457void inline perf_swevent_put_recursion_context(int rctx) 4670inline void perf_swevent_put_recursion_context(int rctx)
4458{ 4671{
4459 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 4672 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
4460 4673
@@ -4490,7 +4703,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
4490 struct hw_perf_event *hwc = &event->hw; 4703 struct hw_perf_event *hwc = &event->hw;
4491 struct hlist_head *head; 4704 struct hlist_head *head;
4492 4705
4493 if (hwc->sample_period) { 4706 if (is_sampling_event(event)) {
4494 hwc->last_period = hwc->sample_period; 4707 hwc->last_period = hwc->sample_period;
4495 perf_swevent_set_period(event); 4708 perf_swevent_set_period(event);
4496 } 4709 }
@@ -4655,7 +4868,7 @@ static int perf_swevent_init(struct perf_event *event)
4655 break; 4868 break;
4656 } 4869 }
4657 4870
4658 if (event_id > PERF_COUNT_SW_MAX) 4871 if (event_id >= PERF_COUNT_SW_MAX)
4659 return -ENOENT; 4872 return -ENOENT;
4660 4873
4661 if (!event->parent) { 4874 if (!event->parent) {
@@ -4747,15 +4960,6 @@ static int perf_tp_event_init(struct perf_event *event)
4747 if (event->attr.type != PERF_TYPE_TRACEPOINT) 4960 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4748 return -ENOENT; 4961 return -ENOENT;
4749 4962
4750 /*
4751 * Raw tracepoint data is a severe data leak, only allow root to
4752 * have these.
4753 */
4754 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4755 perf_paranoid_tracepoint_raw() &&
4756 !capable(CAP_SYS_ADMIN))
4757 return -EPERM;
4758
4759 err = perf_trace_init(event); 4963 err = perf_trace_init(event);
4760 if (err) 4964 if (err)
4761 return err; 4965 return err;
@@ -4778,7 +4982,7 @@ static struct pmu perf_tracepoint = {
4778 4982
4779static inline void perf_tp_register(void) 4983static inline void perf_tp_register(void)
4780{ 4984{
4781 perf_pmu_register(&perf_tracepoint); 4985 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
4782} 4986}
4783 4987
4784static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4988static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4868,31 +5072,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4868static void perf_swevent_start_hrtimer(struct perf_event *event) 5072static void perf_swevent_start_hrtimer(struct perf_event *event)
4869{ 5073{
4870 struct hw_perf_event *hwc = &event->hw; 5074 struct hw_perf_event *hwc = &event->hw;
5075 s64 period;
5076
5077 if (!is_sampling_event(event))
5078 return;
4871 5079
4872 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 5080 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4873 hwc->hrtimer.function = perf_swevent_hrtimer; 5081 hwc->hrtimer.function = perf_swevent_hrtimer;
4874 if (hwc->sample_period) {
4875 s64 period = local64_read(&hwc->period_left);
4876 5082
4877 if (period) { 5083 period = local64_read(&hwc->period_left);
4878 if (period < 0) 5084 if (period) {
4879 period = 10000; 5085 if (period < 0)
5086 period = 10000;
4880 5087
4881 local64_set(&hwc->period_left, 0); 5088 local64_set(&hwc->period_left, 0);
4882 } else { 5089 } else {
4883 period = max_t(u64, 10000, hwc->sample_period); 5090 period = max_t(u64, 10000, hwc->sample_period);
4884 } 5091 }
4885 __hrtimer_start_range_ns(&hwc->hrtimer, 5092 __hrtimer_start_range_ns(&hwc->hrtimer,
4886 ns_to_ktime(period), 0, 5093 ns_to_ktime(period), 0,
4887 HRTIMER_MODE_REL_PINNED, 0); 5094 HRTIMER_MODE_REL_PINNED, 0);
4888 }
4889} 5095}
4890 5096
4891static void perf_swevent_cancel_hrtimer(struct perf_event *event) 5097static void perf_swevent_cancel_hrtimer(struct perf_event *event)
4892{ 5098{
4893 struct hw_perf_event *hwc = &event->hw; 5099 struct hw_perf_event *hwc = &event->hw;
4894 5100
4895 if (hwc->sample_period) { 5101 if (is_sampling_event(event)) {
4896 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); 5102 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
4897 local64_set(&hwc->period_left, ktime_to_ns(remaining)); 5103 local64_set(&hwc->period_left, ktime_to_ns(remaining));
4898 5104
@@ -5087,25 +5293,96 @@ static void *find_pmu_context(int ctxn)
5087 return NULL; 5293 return NULL;
5088} 5294}
5089 5295
5090static void free_pmu_context(void * __percpu cpu_context) 5296static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
5091{ 5297{
5092 struct pmu *pmu; 5298 int cpu;
5299
5300 for_each_possible_cpu(cpu) {
5301 struct perf_cpu_context *cpuctx;
5302
5303 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5304
5305 if (cpuctx->active_pmu == old_pmu)
5306 cpuctx->active_pmu = pmu;
5307 }
5308}
5309
5310static void free_pmu_context(struct pmu *pmu)
5311{
5312 struct pmu *i;
5093 5313
5094 mutex_lock(&pmus_lock); 5314 mutex_lock(&pmus_lock);
5095 /* 5315 /*
5096 * Like a real lame refcount. 5316 * Like a real lame refcount.
5097 */ 5317 */
5098 list_for_each_entry(pmu, &pmus, entry) { 5318 list_for_each_entry(i, &pmus, entry) {
5099 if (pmu->pmu_cpu_context == cpu_context) 5319 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
5320 update_pmu_context(i, pmu);
5100 goto out; 5321 goto out;
5322 }
5101 } 5323 }
5102 5324
5103 free_percpu(cpu_context); 5325 free_percpu(pmu->pmu_cpu_context);
5104out: 5326out:
5105 mutex_unlock(&pmus_lock); 5327 mutex_unlock(&pmus_lock);
5106} 5328}
5329static struct idr pmu_idr;
5107 5330
5108int perf_pmu_register(struct pmu *pmu) 5331static ssize_t
5332type_show(struct device *dev, struct device_attribute *attr, char *page)
5333{
5334 struct pmu *pmu = dev_get_drvdata(dev);
5335
5336 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
5337}
5338
5339static struct device_attribute pmu_dev_attrs[] = {
5340 __ATTR_RO(type),
5341 __ATTR_NULL,
5342};
5343
5344static int pmu_bus_running;
5345static struct bus_type pmu_bus = {
5346 .name = "event_source",
5347 .dev_attrs = pmu_dev_attrs,
5348};
5349
5350static void pmu_dev_release(struct device *dev)
5351{
5352 kfree(dev);
5353}
5354
5355static int pmu_dev_alloc(struct pmu *pmu)
5356{
5357 int ret = -ENOMEM;
5358
5359 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
5360 if (!pmu->dev)
5361 goto out;
5362
5363 device_initialize(pmu->dev);
5364 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5365 if (ret)
5366 goto free_dev;
5367
5368 dev_set_drvdata(pmu->dev, pmu);
5369 pmu->dev->bus = &pmu_bus;
5370 pmu->dev->release = pmu_dev_release;
5371 ret = device_add(pmu->dev);
5372 if (ret)
5373 goto free_dev;
5374
5375out:
5376 return ret;
5377
5378free_dev:
5379 put_device(pmu->dev);
5380 goto out;
5381}
5382
5383static struct lock_class_key cpuctx_mutex;
5384
5385int perf_pmu_register(struct pmu *pmu, char *name, int type)
5109{ 5386{
5110 int cpu, ret; 5387 int cpu, ret;
5111 5388
@@ -5115,23 +5392,50 @@ int perf_pmu_register(struct pmu *pmu)
5115 if (!pmu->pmu_disable_count) 5392 if (!pmu->pmu_disable_count)
5116 goto unlock; 5393 goto unlock;
5117 5394
5395 pmu->type = -1;
5396 if (!name)
5397 goto skip_type;
5398 pmu->name = name;
5399
5400 if (type < 0) {
5401 int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
5402 if (!err)
5403 goto free_pdc;
5404
5405 err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
5406 if (err) {
5407 ret = err;
5408 goto free_pdc;
5409 }
5410 }
5411 pmu->type = type;
5412
5413 if (pmu_bus_running) {
5414 ret = pmu_dev_alloc(pmu);
5415 if (ret)
5416 goto free_idr;
5417 }
5418
5419skip_type:
5118 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); 5420 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
5119 if (pmu->pmu_cpu_context) 5421 if (pmu->pmu_cpu_context)
5120 goto got_cpu_context; 5422 goto got_cpu_context;
5121 5423
5122 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); 5424 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
5123 if (!pmu->pmu_cpu_context) 5425 if (!pmu->pmu_cpu_context)
5124 goto free_pdc; 5426 goto free_dev;
5125 5427
5126 for_each_possible_cpu(cpu) { 5428 for_each_possible_cpu(cpu) {
5127 struct perf_cpu_context *cpuctx; 5429 struct perf_cpu_context *cpuctx;
5128 5430
5129 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5431 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
5130 __perf_event_init_context(&cpuctx->ctx); 5432 __perf_event_init_context(&cpuctx->ctx);
5433 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5131 cpuctx->ctx.type = cpu_context; 5434 cpuctx->ctx.type = cpu_context;
5132 cpuctx->ctx.pmu = pmu; 5435 cpuctx->ctx.pmu = pmu;
5133 cpuctx->jiffies_interval = 1; 5436 cpuctx->jiffies_interval = 1;
5134 INIT_LIST_HEAD(&cpuctx->rotation_list); 5437 INIT_LIST_HEAD(&cpuctx->rotation_list);
5438 cpuctx->active_pmu = pmu;
5135 } 5439 }
5136 5440
5137got_cpu_context: 5441got_cpu_context:
@@ -5164,6 +5468,14 @@ unlock:
5164 5468
5165 return ret; 5469 return ret;
5166 5470
5471free_dev:
5472 device_del(pmu->dev);
5473 put_device(pmu->dev);
5474
5475free_idr:
5476 if (pmu->type >= PERF_TYPE_MAX)
5477 idr_remove(&pmu_idr, pmu->type);
5478
5167free_pdc: 5479free_pdc:
5168 free_percpu(pmu->pmu_disable_count); 5480 free_percpu(pmu->pmu_disable_count);
5169 goto unlock; 5481 goto unlock;
@@ -5183,7 +5495,11 @@ void perf_pmu_unregister(struct pmu *pmu)
5183 synchronize_rcu(); 5495 synchronize_rcu();
5184 5496
5185 free_percpu(pmu->pmu_disable_count); 5497 free_percpu(pmu->pmu_disable_count);
5186 free_pmu_context(pmu->pmu_cpu_context); 5498 if (pmu->type >= PERF_TYPE_MAX)
5499 idr_remove(&pmu_idr, pmu->type);
5500 device_del(pmu->dev);
5501 put_device(pmu->dev);
5502 free_pmu_context(pmu);
5187} 5503}
5188 5504
5189struct pmu *perf_init_event(struct perf_event *event) 5505struct pmu *perf_init_event(struct perf_event *event)
@@ -5192,6 +5508,13 @@ struct pmu *perf_init_event(struct perf_event *event)
5192 int idx; 5508 int idx;
5193 5509
5194 idx = srcu_read_lock(&pmus_srcu); 5510 idx = srcu_read_lock(&pmus_srcu);
5511
5512 rcu_read_lock();
5513 pmu = idr_find(&pmu_idr, event->attr.type);
5514 rcu_read_unlock();
5515 if (pmu)
5516 goto unlock;
5517
5195 list_for_each_entry_rcu(pmu, &pmus, entry) { 5518 list_for_each_entry_rcu(pmu, &pmus, entry) {
5196 int ret = pmu->event_init(event); 5519 int ret = pmu->event_init(event);
5197 if (!ret) 5520 if (!ret)
@@ -5224,6 +5547,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5224 struct hw_perf_event *hwc; 5547 struct hw_perf_event *hwc;
5225 long err; 5548 long err;
5226 5549
5550 if ((unsigned)cpu >= nr_cpu_ids) {
5551 if (!task || cpu != -1)
5552 return ERR_PTR(-EINVAL);
5553 }
5554
5227 event = kzalloc(sizeof(*event), GFP_KERNEL); 5555 event = kzalloc(sizeof(*event), GFP_KERNEL);
5228 if (!event) 5556 if (!event)
5229 return ERR_PTR(-ENOMEM); 5557 return ERR_PTR(-ENOMEM);
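The new cpu check at the top of perf_event_alloc() folds the negative case into a single unsigned comparison. A stand-alone restatement of the same test, with an invented helper name, purely for illustration:

/* Illustrative only: the cpu-argument check above, written long-hand. */
static bool example_cpu_arg_valid(int cpu, struct task_struct *task)
{
	if ((unsigned int)cpu >= nr_cpu_ids) {
		/*
		 * Negative values wrap around, so this branch catches both
		 * "cpu too large" and "cpu < 0".  The only negative value
		 * accepted is -1, and only when a task was supplied.
		 */
		return task && cpu == -1;
	}
	return true;
}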
@@ -5272,7 +5600,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
5272 5600
5273 if (!overflow_handler && parent_event) 5601 if (!overflow_handler && parent_event)
5274 overflow_handler = parent_event->overflow_handler; 5602 overflow_handler = parent_event->overflow_handler;
5275 5603
5276 event->overflow_handler = overflow_handler; 5604 event->overflow_handler = overflow_handler;
5277 5605
5278 if (attr->disabled) 5606 if (attr->disabled)
@@ -5651,12 +5979,18 @@ SYSCALL_DEFINE5(perf_event_open,
5651 mutex_unlock(&ctx->mutex); 5979 mutex_unlock(&ctx->mutex);
5652 5980
5653 event->owner = current; 5981 event->owner = current;
5654 get_task_struct(current); 5982
5655 mutex_lock(&current->perf_event_mutex); 5983 mutex_lock(&current->perf_event_mutex);
5656 list_add_tail(&event->owner_entry, &current->perf_event_list); 5984 list_add_tail(&event->owner_entry, &current->perf_event_list);
5657 mutex_unlock(&current->perf_event_mutex); 5985 mutex_unlock(&current->perf_event_mutex);
5658 5986
5659 /* 5987 /*
5988 * Precalculate sample_data sizes
5989 */
5990 perf_event__header_size(event);
5991 perf_event__id_header_size(event);
5992
5993 /*
5660 * Drop the reference on the group_event after placing the 5994 * Drop the reference on the group_event after placing the
5661 * new event on the sibling_list. This ensures destruction 5995 * new event on the sibling_list. This ensures destruction
5662 * of the group leader will find the pointer to itself in 5996 * of the group leader will find the pointer to itself in
@@ -5719,12 +6053,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
5719 ++ctx->generation; 6053 ++ctx->generation;
5720 mutex_unlock(&ctx->mutex); 6054 mutex_unlock(&ctx->mutex);
5721 6055
5722 event->owner = current;
5723 get_task_struct(current);
5724 mutex_lock(&current->perf_event_mutex);
5725 list_add_tail(&event->owner_entry, &current->perf_event_list);
5726 mutex_unlock(&current->perf_event_mutex);
5727
5728 return event; 6056 return event;
5729 6057
5730err_free: 6058err_free:
@@ -5808,7 +6136,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
5808 * scheduled, so we are now safe from rescheduling changing 6136 * scheduled, so we are now safe from rescheduling changing
5809 * our context. 6137 * our context.
5810 */ 6138 */
5811 child_ctx = child->perf_event_ctxp[ctxn]; 6139 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
5812 task_ctx_sched_out(child_ctx, EVENT_ALL); 6140 task_ctx_sched_out(child_ctx, EVENT_ALL);
5813 6141
5814 /* 6142 /*
@@ -5875,8 +6203,24 @@ again:
5875 */ 6203 */
5876void perf_event_exit_task(struct task_struct *child) 6204void perf_event_exit_task(struct task_struct *child)
5877{ 6205{
6206 struct perf_event *event, *tmp;
5878 int ctxn; 6207 int ctxn;
5879 6208
6209 mutex_lock(&child->perf_event_mutex);
6210 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
6211 owner_entry) {
6212 list_del_init(&event->owner_entry);
6213
6214 /*
6215 * Ensure the list deletion is visible before we clear
6216 * the owner; this closes a race against perf_release(), where
6217 * we need to serialize on owner->perf_event_mutex.
6218 */
6219 smp_wmb();
6220 event->owner = NULL;
6221 }
6222 mutex_unlock(&child->perf_event_mutex);
6223
5880 for_each_task_context_nr(ctxn) 6224 for_each_task_context_nr(ctxn)
5881 perf_event_exit_task_context(child, ctxn); 6225 perf_event_exit_task_context(child, ctxn);
5882} 6226}
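The smp_wmb() in the owner-list teardown above is one half of a store/load pairing. The sketch below shows only the generic guarantee; it is not the perf_release() side of the race, which lies outside this hunk, and the function names are invented.

/* Illustrative only -- generic message-passing pattern. */
static void example_writer(struct perf_event *event)
{
	list_del_init(&event->owner_entry);	/* store A */
	smp_wmb();				/* order A before B */
	event->owner = NULL;			/* store B */
}

static bool example_reader(struct perf_event *event)
{
	struct task_struct *owner = ACCESS_ONCE(event->owner);	/* load B */

	smp_rmb();	/* pairs with the smp_wmb() in example_writer() */

	/*
	 * If owner was read as NULL (the new value of store B), the earlier
	 * list_del_init() (store A) is guaranteed to be visible to any
	 * access made after the smp_rmb().
	 */
	return owner != NULL;
}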
@@ -5999,6 +6343,12 @@ inherit_event(struct perf_event *parent_event,
5999 child_event->overflow_handler = parent_event->overflow_handler; 6343 child_event->overflow_handler = parent_event->overflow_handler;
6000 6344
6001 /* 6345 /*
6346 * Precalculate sample_data sizes
6347 */
6348 perf_event__header_size(child_event);
6349 perf_event__id_header_size(child_event);
6350
6351 /*
6002 * Link it up in the child's context: 6352 * Link it up in the child's context:
6003 */ 6353 */
6004 raw_spin_lock_irqsave(&child_ctx->lock, flags); 6354 raw_spin_lock_irqsave(&child_ctx->lock, flags);
@@ -6096,13 +6446,9 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6096 struct perf_event *event; 6446 struct perf_event *event;
6097 struct task_struct *parent = current; 6447 struct task_struct *parent = current;
6098 int inherited_all = 1; 6448 int inherited_all = 1;
6449 unsigned long flags;
6099 int ret = 0; 6450 int ret = 0;
6100 6451
6101 child->perf_event_ctxp[ctxn] = NULL;
6102
6103 mutex_init(&child->perf_event_mutex);
6104 INIT_LIST_HEAD(&child->perf_event_list);
6105
6106 if (likely(!parent->perf_event_ctxp[ctxn])) 6452 if (likely(!parent->perf_event_ctxp[ctxn]))
6107 return 0; 6453 return 0;
6108 6454
@@ -6136,6 +6482,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6136 break; 6482 break;
6137 } 6483 }
6138 6484
6485 /*
6486 * We can't hold ctx->lock when iterating the ->flexible_groups list due
6487 * to allocations, but we need to prevent rotation because
6488 * rotate_ctx() will change the list from interrupt context.
6489 */
6490 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6491 parent_ctx->rotate_disable = 1;
6492 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6493
6139 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { 6494 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
6140 ret = inherit_task_group(event, parent, parent_ctx, 6495 ret = inherit_task_group(event, parent, parent_ctx,
6141 child, ctxn, &inherited_all); 6496 child, ctxn, &inherited_all);
@@ -6143,18 +6498,20 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6143 break; 6498 break;
6144 } 6499 }
6145 6500
6501 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
6502 parent_ctx->rotate_disable = 0;
6503
6146 child_ctx = child->perf_event_ctxp[ctxn]; 6504 child_ctx = child->perf_event_ctxp[ctxn];
6147 6505
6148 if (child_ctx && inherited_all) { 6506 if (child_ctx && inherited_all) {
6149 /* 6507 /*
6150 * Mark the child context as a clone of the parent 6508 * Mark the child context as a clone of the parent
6151 * context, or of whatever the parent is a clone of. 6509 * context, or of whatever the parent is a clone of.
6152 * Note that if the parent is a clone, it could get 6510 *
6153 * uncloned at any point, but that doesn't matter 6511 * Note that if the parent is a clone, the holding of
5154 * because the list of events and the generation 6512 * parent_ctx->lock prevents it from being uncloned.
6155 * count can't have changed since we took the mutex.
6156 */ 6513 */
6157 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); 6514 cloned_ctx = parent_ctx->parent_ctx;
6158 if (cloned_ctx) { 6515 if (cloned_ctx) {
6159 child_ctx->parent_ctx = cloned_ctx; 6516 child_ctx->parent_ctx = cloned_ctx;
6160 child_ctx->parent_gen = parent_ctx->parent_gen; 6517 child_ctx->parent_gen = parent_ctx->parent_gen;
@@ -6165,6 +6522,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6165 get_ctx(child_ctx->parent_ctx); 6522 get_ctx(child_ctx->parent_ctx);
6166 } 6523 }
6167 6524
6525 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
6168 mutex_unlock(&parent_ctx->mutex); 6526 mutex_unlock(&parent_ctx->mutex);
6169 6527
6170 perf_unpin_context(parent_ctx); 6528 perf_unpin_context(parent_ctx);
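The rotate_disable dance above is worth isolating: the lock is taken only long enough to flip a flag that the interrupt-time rotation code honours, and the potentially sleeping list walk happens outside it. A hedged stand-alone sketch follows; the walker name is invented and the caller is assumed to hold ctx->mutex so events cannot be added or removed underneath it.

/* Illustrative only -- not part of the patch. */
static void example_walk_flexible_groups(struct perf_event_context *ctx)
{
	struct perf_event *event;
	unsigned long flags;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	ctx->rotate_disable = 1;	/* ask rotate_ctx() to leave the list alone */
	raw_spin_unlock_irqrestore(&ctx->lock, flags);

	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
		/* free to sleep or allocate here; the list will not be rotated */
	}

	raw_spin_lock_irqsave(&ctx->lock, flags);
	ctx->rotate_disable = 0;
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
}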
@@ -6179,6 +6537,10 @@ int perf_event_init_task(struct task_struct *child)
6179{ 6537{
6180 int ctxn, ret; 6538 int ctxn, ret;
6181 6539
6540 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
6541 mutex_init(&child->perf_event_mutex);
6542 INIT_LIST_HEAD(&child->perf_event_list);
6543
6182 for_each_task_context_nr(ctxn) { 6544 for_each_task_context_nr(ctxn) {
6183 ret = perf_event_init_context(child, ctxn); 6545 ret = perf_event_init_context(child, ctxn);
6184 if (ret) 6546 if (ret)
@@ -6215,7 +6577,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
6215 mutex_unlock(&swhash->hlist_mutex); 6577 mutex_unlock(&swhash->hlist_mutex);
6216} 6578}
6217 6579
6218#ifdef CONFIG_HOTPLUG_CPU 6580#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
6219static void perf_pmu_rotate_stop(struct pmu *pmu) 6581static void perf_pmu_rotate_stop(struct pmu *pmu)
6220{ 6582{
6221 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 6583 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -6269,6 +6631,26 @@ static void perf_event_exit_cpu(int cpu)
6269static inline void perf_event_exit_cpu(int cpu) { } 6631static inline void perf_event_exit_cpu(int cpu) { }
6270#endif 6632#endif
6271 6633
6634static int
6635perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
6636{
6637 int cpu;
6638
6639 for_each_online_cpu(cpu)
6640 perf_event_exit_cpu(cpu);
6641
6642 return NOTIFY_OK;
6643}
6644
6645/*
6646 * Run the perf reboot notifier at the very last possible moment so that
6647 * the generic watchdog code runs as long as possible.
6648 */
6649static struct notifier_block perf_reboot_notifier = {
6650 .notifier_call = perf_reboot,
6651 .priority = INT_MIN,
6652};
6653
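For reference, the reboot-notifier pattern used above in isolation; the priority of INT_MIN is what makes every other notifier on the chain run first. The names below are invented and the body is a stub.

/* Illustrative only; needs <linux/notifier.h> and <linux/reboot.h>. */
static int example_reboot(struct notifier_block *nb, unsigned long val, void *v)
{
	/* val is SYS_DOWN (restart), SYS_HALT or SYS_POWER_OFF */
	return NOTIFY_OK;
}

static struct notifier_block example_reboot_nb = {
	.notifier_call	= example_reboot,
	.priority	= INT_MIN,	/* lowest priority: run last */
};

/* from some __init path: register_reboot_notifier(&example_reboot_nb); */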
6272static int __cpuinit 6654static int __cpuinit
6273perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) 6655perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6274{ 6656{
@@ -6295,11 +6677,47 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
6295 6677
6296void __init perf_event_init(void) 6678void __init perf_event_init(void)
6297{ 6679{
6680 int ret;
6681
6682 idr_init(&pmu_idr);
6683
6298 perf_event_init_all_cpus(); 6684 perf_event_init_all_cpus();
6299 init_srcu_struct(&pmus_srcu); 6685 init_srcu_struct(&pmus_srcu);
6300 perf_pmu_register(&perf_swevent); 6686 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
6301 perf_pmu_register(&perf_cpu_clock); 6687 perf_pmu_register(&perf_cpu_clock, NULL, -1);
6302 perf_pmu_register(&perf_task_clock); 6688 perf_pmu_register(&perf_task_clock, NULL, -1);
6303 perf_tp_register(); 6689 perf_tp_register();
6304 perf_cpu_notifier(perf_cpu_notify); 6690 perf_cpu_notifier(perf_cpu_notify);
6691 register_reboot_notifier(&perf_reboot_notifier);
6692
6693 ret = init_hw_breakpoint();
6694 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6695}
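perf_pmu_register() now takes a sysfs name and a type, as the three calls above show: the software PMU claims the fixed PERF_TYPE_SOFTWARE slot, while passing NULL and -1 keeps a PMU unnamed and lets its type come out of pmu_idr. A hedged sketch of a caller, with the PMU's callbacks elided and all names invented:

/* Illustrative only; example_pmu's callbacks are omitted. */
static struct pmu example_pmu;

static int __init example_pmu_init(void)
{
	/*
	 * A named PMU with a fixed, well-known type would instead pass a
	 * name string and that type as the last two arguments.
	 */
	return perf_pmu_register(&example_pmu, NULL, -1);	/* dynamic type */
}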
6696
6697static int __init perf_event_sysfs_init(void)
6698{
6699 struct pmu *pmu;
6700 int ret;
6701
6702 mutex_lock(&pmus_lock);
6703
6704 ret = bus_register(&pmu_bus);
6705 if (ret)
6706 goto unlock;
6707
6708 list_for_each_entry(pmu, &pmus, entry) {
6709 if (!pmu->name || pmu->type < 0)
6710 continue;
6711
6712 ret = pmu_dev_alloc(pmu);
6713 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
6714 }
6715 pmu_bus_running = 1;
6716 ret = 0;
6717
6718unlock:
6719 mutex_unlock(&pmus_lock);
6720
6721 return ret;
6305} 6722}
6723device_initcall(perf_event_sysfs_init);
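Read together with the pmu_bus_running test in perf_pmu_register() earlier in this diff, the initcall above gives roughly the following life cycle for a PMU's sysfs device (an illustrative summary, not code from the patch):

/*
 *   core boot:   perf_pmu_register()          pmu_bus_running == 0
 *                  -> PMU is registered, no device created yet
 *   initcall:    perf_event_sysfs_init()
 *                  -> bus_register(&pmu_bus)
 *                  -> pmu_dev_alloc() for every already-registered
 *                     PMU that has a name and a valid type
 *                  -> pmu_bus_running = 1
 *   afterwards:  perf_pmu_register()          pmu_bus_running == 1
 *                  -> pmu_dev_alloc() called directly at registration
 */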