Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--  kernel/perf_event.c  699
1 file changed, 533 insertions, 166 deletions
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index cb6c0d2af68..11847bf1e8c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -13,6 +13,7 @@
 #include <linux/mm.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
+#include <linux/idr.h>
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/slab.h>
@@ -21,7 +22,9 @@
 #include <linux/dcache.h>
 #include <linux/percpu.h>
 #include <linux/ptrace.h>
+#include <linux/reboot.h>
 #include <linux/vmstat.h>
+#include <linux/device.h>
 #include <linux/vmalloc.h>
 #include <linux/hardirq.h>
 #include <linux/rculist.h>
@@ -31,6 +34,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
+#include <linux/hw_breakpoint.h>
 
 #include <asm/irq_regs.h>
 
@@ -132,6 +136,28 @@ static void unclone_ctx(struct perf_event_context *ctx)
 	}
 }
 
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+{
+	/*
+	 * only top level events have the pid namespace they were created in
+	 */
+	if (event->parent)
+		event = event->parent;
+
+	return task_tgid_nr_ns(p, event->ns);
+}
+
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+	/*
+	 * only top level events have the pid namespace they were created in
+	 */
+	if (event->parent)
+		event = event->parent;
+
+	return task_pid_nr_ns(p, event->ns);
+}
+
 /*
  * If we inherit events we want to return the parent event id
  * to userspace.
@@ -311,9 +337,84 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		ctx->nr_stat++;
 }
 
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__read_size(struct perf_event *event)
+{
+	int entry = sizeof(u64); /* value */
+	int size = 0;
+	int nr = 1;
+
+	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		size += sizeof(u64);
+
+	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		size += sizeof(u64);
+
+	if (event->attr.read_format & PERF_FORMAT_ID)
+		entry += sizeof(u64);
+
+	if (event->attr.read_format & PERF_FORMAT_GROUP) {
+		nr += event->group_leader->nr_siblings;
+		size += sizeof(u64);
+	}
+
+	size += entry * nr;
+	event->read_size = size;
+}
+
+static void perf_event__header_size(struct perf_event *event)
+{
+	struct perf_sample_data *data;
+	u64 sample_type = event->attr.sample_type;
+	u16 size = 0;
+
+	perf_event__read_size(event);
+
+	if (sample_type & PERF_SAMPLE_IP)
+		size += sizeof(data->ip);
+
+	if (sample_type & PERF_SAMPLE_ADDR)
+		size += sizeof(data->addr);
+
+	if (sample_type & PERF_SAMPLE_PERIOD)
+		size += sizeof(data->period);
+
+	if (sample_type & PERF_SAMPLE_READ)
+		size += event->read_size;
+
+	event->header_size = size;
+}
+
+static void perf_event__id_header_size(struct perf_event *event)
+{
+	struct perf_sample_data *data;
+	u64 sample_type = event->attr.sample_type;
+	u16 size = 0;
+
+	if (sample_type & PERF_SAMPLE_TID)
+		size += sizeof(data->tid_entry);
+
+	if (sample_type & PERF_SAMPLE_TIME)
+		size += sizeof(data->time);
+
+	if (sample_type & PERF_SAMPLE_ID)
+		size += sizeof(data->id);
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
+		size += sizeof(data->stream_id);
+
+	if (sample_type & PERF_SAMPLE_CPU)
+		size += sizeof(data->cpu_entry);
+
+	event->id_header_size = size;
+}
+
 static void perf_group_attach(struct perf_event *event)
 {
-	struct perf_event *group_leader = event->group_leader;
+	struct perf_event *group_leader = event->group_leader, *pos;
 
 	/*
 	 * We can have double attach due to group movement in perf_event_open.
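The three helpers above cache, on the event itself, sizes that were previously recomputed for every emitted record (event->read_size, event->header_size, event->id_header_size). For reference, a minimal userspace sketch, illustrative only and not part of the patch, that mirrors the perf_event__read_size() arithmetic and shows how large a read() buffer must be for a given read_format; the PERF_FORMAT_* flags come from the uapi <linux/perf_event.h>, and the function name read_format_size is invented for this sketch:

/* Illustrative only: a userspace mirror of perf_event__read_size() above,
 * useful for sizing the buffer passed to read() on a perf event fd. */
#include <stdio.h>
#include <linux/perf_event.h>

static size_t read_format_size(unsigned long long read_format, int nr_siblings)
{
	size_t entry = sizeof(unsigned long long); /* value */
	size_t size = 0;
	int nr = 1;

	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		size += sizeof(unsigned long long);
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		size += sizeof(unsigned long long);
	if (read_format & PERF_FORMAT_ID)
		entry += sizeof(unsigned long long);	/* each entry grows by an id */
	if (read_format & PERF_FORMAT_GROUP) {
		nr += nr_siblings;			/* one entry per group member */
		size += sizeof(unsigned long long);	/* the leading "nr" field */
	}
	return size + entry * nr;
}

int main(void)
{
	/* A group leader with 2 siblings and TOTAL_TIME_ENABLED|ID|GROUP:
	 * 8 (time_enabled) + 8 (nr) + 3 * 16 (value + id) = 64 bytes. */
	printf("%zu\n", read_format_size(PERF_FORMAT_TOTAL_TIME_ENABLED |
					 PERF_FORMAT_ID | PERF_FORMAT_GROUP, 2));
	return 0;
}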
@@ -332,6 +433,11 @@ static void perf_group_attach(struct perf_event *event)
 
 	list_add_tail(&event->group_entry, &group_leader->sibling_list);
 	group_leader->nr_siblings++;
+
+	perf_event__header_size(group_leader);
+
+	list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
+		perf_event__header_size(pos);
 }
 
 /*
@@ -390,7 +496,7 @@ static void perf_group_detach(struct perf_event *event)
 	if (event->group_leader != event) {
 		list_del_init(&event->group_entry);
 		event->group_leader->nr_siblings--;
-		return;
+		goto out;
 	}
 
 	if (!list_empty(&event->group_entry))
@@ -409,6 +515,12 @@ static void perf_group_detach(struct perf_event *event)
 		/* Inherit group flags from the previous leader */
 		sibling->group_flags = event->group_flags;
 	}
+
+out:
+	perf_event__header_size(event->group_leader);
+
+	list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
+		perf_event__header_size(tmp);
 }
 
 static inline int
@@ -1072,7 +1184,7 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
 	/*
 	 * not supported on inherited events
 	 */
-	if (event->attr.inherit)
+	if (event->attr.inherit || !is_sampling_event(event))
 		return -EINVAL;
 
 	atomic_add(refresh, &event->event_limit);
@@ -1286,8 +1398,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
 	int ctxn;
 
-	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
-
 	for_each_task_context_nr(ctxn)
 		perf_event_context_sched_out(task, ctxn, next);
 }
@@ -1621,8 +1731,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
 {
 	raw_spin_lock(&ctx->lock);
 
-	/* Rotate the first entry last of non-pinned groups */
-	list_rotate_left(&ctx->flexible_groups);
+	/*
+	 * Rotate the first entry last of non-pinned groups. Rotation might be
+	 * disabled by the inheritance code.
+	 */
+	if (!ctx->rotate_disable)
+		list_rotate_left(&ctx->flexible_groups);
 
 	raw_spin_unlock(&ctx->lock);
 }
@@ -2234,11 +2348,6 @@ int perf_event_release_kernel(struct perf_event *event)
 	raw_spin_unlock_irq(&ctx->lock);
 	mutex_unlock(&ctx->mutex);
 
-	mutex_lock(&event->owner->perf_event_mutex);
-	list_del_init(&event->owner_entry);
-	mutex_unlock(&event->owner->perf_event_mutex);
-	put_task_struct(event->owner);
-
 	free_event(event);
 
 	return 0;
@@ -2251,35 +2360,44 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
 static int perf_release(struct inode *inode, struct file *file)
 {
 	struct perf_event *event = file->private_data;
+	struct task_struct *owner;
 
 	file->private_data = NULL;
 
-	return perf_event_release_kernel(event);
-}
-
-static int perf_event_read_size(struct perf_event *event)
-{
-	int entry = sizeof(u64); /* value */
-	int size = 0;
-	int nr = 1;
-
-	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-		size += sizeof(u64);
-
-	if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-		size += sizeof(u64);
-
-	if (event->attr.read_format & PERF_FORMAT_ID)
-		entry += sizeof(u64);
-
-	if (event->attr.read_format & PERF_FORMAT_GROUP) {
-		nr += event->group_leader->nr_siblings;
-		size += sizeof(u64);
+	rcu_read_lock();
+	owner = ACCESS_ONCE(event->owner);
+	/*
+	 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
+	 * !owner it means the list deletion is complete and we can indeed
+	 * free this event, otherwise we need to serialize on
+	 * owner->perf_event_mutex.
+	 */
+	smp_read_barrier_depends();
+	if (owner) {
+		/*
+		 * Since delayed_put_task_struct() also drops the last
+		 * task reference we can safely take a new reference
+		 * while holding the rcu_read_lock().
+		 */
+		get_task_struct(owner);
 	}
+	rcu_read_unlock();
 
-	size += entry * nr;
+	if (owner) {
+		mutex_lock(&owner->perf_event_mutex);
+		/*
+		 * We have to re-check the event->owner field, if it is cleared
+		 * we raced with perf_event_exit_task(), acquiring the mutex
+		 * ensured they're done, and we can proceed with freeing the
+		 * event.
+		 */
+		if (event->owner)
+			list_del_init(&event->owner_entry);
+		mutex_unlock(&owner->perf_event_mutex);
+		put_task_struct(owner);
+	}
 
-	return size;
+	return perf_event_release_kernel(event);
 }
 
 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -2396,7 +2514,7 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
 	if (event->state == PERF_EVENT_STATE_ERROR)
 		return 0;
 
-	if (count < perf_event_read_size(event))
+	if (count < event->read_size)
 		return -ENOSPC;
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
@@ -2482,7 +2600,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
 	int ret = 0;
 	u64 value;
 
-	if (!event->attr.sample_period)
+	if (!is_sampling_event(event))
 		return -EINVAL;
 
 	if (copy_from_user(&value, arg, sizeof(value)))
@@ -3273,6 +3391,73 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle,
 	} while (len);
 }
 
+static void __perf_event_header__init_id(struct perf_event_header *header,
+					 struct perf_sample_data *data,
+					 struct perf_event *event)
+{
+	u64 sample_type = event->attr.sample_type;
+
+	data->type = sample_type;
+	header->size += event->id_header_size;
+
+	if (sample_type & PERF_SAMPLE_TID) {
+		/* namespace issues */
+		data->tid_entry.pid = perf_event_pid(event, current);
+		data->tid_entry.tid = perf_event_tid(event, current);
+	}
+
+	if (sample_type & PERF_SAMPLE_TIME)
+		data->time = perf_clock();
+
+	if (sample_type & PERF_SAMPLE_ID)
+		data->id = primary_event_id(event);
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
+		data->stream_id = event->id;
+
+	if (sample_type & PERF_SAMPLE_CPU) {
+		data->cpu_entry.cpu = raw_smp_processor_id();
+		data->cpu_entry.reserved = 0;
+	}
+}
+
+static void perf_event_header__init_id(struct perf_event_header *header,
+				       struct perf_sample_data *data,
+				       struct perf_event *event)
+{
+	if (event->attr.sample_id_all)
+		__perf_event_header__init_id(header, data, event);
+}
+
+static void __perf_event__output_id_sample(struct perf_output_handle *handle,
+					   struct perf_sample_data *data)
+{
+	u64 sample_type = data->type;
+
+	if (sample_type & PERF_SAMPLE_TID)
+		perf_output_put(handle, data->tid_entry);
+
+	if (sample_type & PERF_SAMPLE_TIME)
+		perf_output_put(handle, data->time);
+
+	if (sample_type & PERF_SAMPLE_ID)
+		perf_output_put(handle, data->id);
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
+		perf_output_put(handle, data->stream_id);
+
+	if (sample_type & PERF_SAMPLE_CPU)
+		perf_output_put(handle, data->cpu_entry);
+}
+
+static void perf_event__output_id_sample(struct perf_event *event,
+					 struct perf_output_handle *handle,
+					 struct perf_sample_data *sample)
+{
+	if (event->attr.sample_id_all)
+		__perf_event__output_id_sample(handle, sample);
+}
+
 int perf_output_begin(struct perf_output_handle *handle,
 		      struct perf_event *event, unsigned int size,
 		      int nmi, int sample)
@@ -3280,6 +3465,7 @@ int perf_output_begin(struct perf_output_handle *handle,
 	struct perf_buffer *buffer;
 	unsigned long tail, offset, head;
 	int have_lost;
+	struct perf_sample_data sample_data;
 	struct {
 		struct perf_event_header header;
 		u64 id;
@@ -3306,8 +3492,12 @@ int perf_output_begin(struct perf_output_handle *handle,
 		goto out;
 
 	have_lost = local_read(&buffer->lost);
-	if (have_lost)
-		size += sizeof(lost_event);
+	if (have_lost) {
+		lost_event.header.size = sizeof(lost_event);
+		perf_event_header__init_id(&lost_event.header, &sample_data,
+					   event);
+		size += lost_event.header.size;
+	}
 
 	perf_output_get_handle(handle);
 
@@ -3338,11 +3528,11 @@ int perf_output_begin(struct perf_output_handle *handle,
 	if (have_lost) {
 		lost_event.header.type = PERF_RECORD_LOST;
 		lost_event.header.misc = 0;
-		lost_event.header.size = sizeof(lost_event);
 		lost_event.id = event->id;
 		lost_event.lost = local_xchg(&buffer->lost, 0);
 
 		perf_output_put(handle, lost_event);
+		perf_event__output_id_sample(event, handle, &sample_data);
 	}
 
 	return 0;
@@ -3375,28 +3565,6 @@ void perf_output_end(struct perf_output_handle *handle)
 	rcu_read_unlock();
 }
 
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
-{
-	/*
-	 * only top level events have the pid namespace they were created in
-	 */
-	if (event->parent)
-		event = event->parent;
-
-	return task_tgid_nr_ns(p, event->ns);
-}
-
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
-{
-	/*
-	 * only top level events have the pid namespace they were created in
-	 */
-	if (event->parent)
-		event = event->parent;
-
-	return task_pid_nr_ns(p, event->ns);
-}
-
 static void perf_output_read_one(struct perf_output_handle *handle,
 				 struct perf_event *event,
 				 u64 enabled, u64 running)
@@ -3571,61 +3739,16 @@ void perf_prepare_sample(struct perf_event_header *header,
 {
 	u64 sample_type = event->attr.sample_type;
 
-	data->type = sample_type;
-
 	header->type = PERF_RECORD_SAMPLE;
-	header->size = sizeof(*header);
+	header->size = sizeof(*header) + event->header_size;
 
 	header->misc = 0;
 	header->misc |= perf_misc_flags(regs);
 
-	if (sample_type & PERF_SAMPLE_IP) {
-		data->ip = perf_instruction_pointer(regs);
-
-		header->size += sizeof(data->ip);
-	}
-
-	if (sample_type & PERF_SAMPLE_TID) {
-		/* namespace issues */
-		data->tid_entry.pid = perf_event_pid(event, current);
-		data->tid_entry.tid = perf_event_tid(event, current);
-
-		header->size += sizeof(data->tid_entry);
-	}
-
-	if (sample_type & PERF_SAMPLE_TIME) {
-		data->time = perf_clock();
-
-		header->size += sizeof(data->time);
-	}
-
-	if (sample_type & PERF_SAMPLE_ADDR)
-		header->size += sizeof(data->addr);
-
-	if (sample_type & PERF_SAMPLE_ID) {
-		data->id = primary_event_id(event);
-
-		header->size += sizeof(data->id);
-	}
-
-	if (sample_type & PERF_SAMPLE_STREAM_ID) {
-		data->stream_id = event->id;
-
-		header->size += sizeof(data->stream_id);
-	}
-
-	if (sample_type & PERF_SAMPLE_CPU) {
-		data->cpu_entry.cpu = raw_smp_processor_id();
-		data->cpu_entry.reserved = 0;
-
-		header->size += sizeof(data->cpu_entry);
-	}
-
-	if (sample_type & PERF_SAMPLE_PERIOD)
-		header->size += sizeof(data->period);
+	__perf_event_header__init_id(header, data, event);
 
-	if (sample_type & PERF_SAMPLE_READ)
-		header->size += perf_event_read_size(event);
+	if (sample_type & PERF_SAMPLE_IP)
+		data->ip = perf_instruction_pointer(regs);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		int size = 1;
@@ -3690,23 +3813,26 @@ perf_event_read_event(struct perf_event *event,
 			struct task_struct *task)
 {
 	struct perf_output_handle handle;
+	struct perf_sample_data sample;
 	struct perf_read_event read_event = {
 		.header = {
 			.type = PERF_RECORD_READ,
 			.misc = 0,
-			.size = sizeof(read_event) + perf_event_read_size(event),
+			.size = sizeof(read_event) + event->read_size,
 		},
 		.pid = perf_event_pid(event, task),
 		.tid = perf_event_tid(event, task),
 	};
 	int ret;
 
+	perf_event_header__init_id(&read_event.header, &sample, event);
 	ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
 	if (ret)
 		return;
 
 	perf_output_put(&handle, read_event);
 	perf_output_read(&handle, event);
+	perf_event__output_id_sample(event, &handle, &sample);
 
 	perf_output_end(&handle);
 }
@@ -3736,14 +3862,16 @@ static void perf_event_task_output(struct perf_event *event,
 				  struct perf_task_event *task_event)
 {
 	struct perf_output_handle handle;
+	struct perf_sample_data sample;
 	struct task_struct *task = task_event->task;
-	int size, ret;
+	int ret, size = task_event->event_id.header.size;
 
-	size = task_event->event_id.header.size;
-	ret = perf_output_begin(&handle, event, size, 0, 0);
+	perf_event_header__init_id(&task_event->event_id.header, &sample, event);
 
+	ret = perf_output_begin(&handle, event,
+				task_event->event_id.header.size, 0, 0);
 	if (ret)
-		return;
+		goto out;
 
 	task_event->event_id.pid = perf_event_pid(event, task);
 	task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3753,7 +3881,11 @@ static void perf_event_task_output(struct perf_event *event,
 
 	perf_output_put(&handle, task_event->event_id);
 
+	perf_event__output_id_sample(event, &handle, &sample);
+
 	perf_output_end(&handle);
+out:
+	task_event->event_id.header.size = size;
 }
 
 static int perf_event_task_match(struct perf_event *event)
@@ -3792,6 +3924,8 @@ static void perf_event_task_event(struct perf_task_event *task_event)
 	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+		if (cpuctx->active_pmu != pmu)
+			goto next;
 		perf_event_task_ctx(&cpuctx->ctx, task_event);
 
 		ctx = task_event->task_ctx;
@@ -3866,11 +4000,16 @@ static void perf_event_comm_output(struct perf_event *event,
 				  struct perf_comm_event *comm_event)
 {
 	struct perf_output_handle handle;
+	struct perf_sample_data sample;
 	int size = comm_event->event_id.header.size;
-	int ret = perf_output_begin(&handle, event, size, 0, 0);
+	int ret;
+
+	perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
+	ret = perf_output_begin(&handle, event,
+				comm_event->event_id.header.size, 0, 0);
 
 	if (ret)
-		return;
+		goto out;
 
 	comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
 	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
@@ -3878,7 +4017,12 @@ static void perf_event_comm_output(struct perf_event *event,
 	perf_output_put(&handle, comm_event->event_id);
 	perf_output_copy(&handle, comm_event->comm,
 			   comm_event->comm_size);
+
+	perf_event__output_id_sample(event, &handle, &sample);
+
 	perf_output_end(&handle);
+out:
+	comm_event->event_id.header.size = size;
 }
 
 static int perf_event_comm_match(struct perf_event *event)
@@ -3923,10 +4067,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 	comm_event->comm_size = size;
 
 	comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
-
 	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+		if (cpuctx->active_pmu != pmu)
+			goto next;
 		perf_event_comm_ctx(&cpuctx->ctx, comm_event);
 
 		ctxn = pmu->task_ctx_nr;
@@ -4002,11 +4147,15 @@ static void perf_event_mmap_output(struct perf_event *event,
 				  struct perf_mmap_event *mmap_event)
 {
 	struct perf_output_handle handle;
+	struct perf_sample_data sample;
 	int size = mmap_event->event_id.header.size;
-	int ret = perf_output_begin(&handle, event, size, 0, 0);
+	int ret;
 
+	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
+	ret = perf_output_begin(&handle, event,
+				mmap_event->event_id.header.size, 0, 0);
 	if (ret)
-		return;
+		goto out;
 
 	mmap_event->event_id.pid = perf_event_pid(event, current);
 	mmap_event->event_id.tid = perf_event_tid(event, current);
@@ -4014,7 +4163,12 @@ static void perf_event_mmap_output(struct perf_event *event,
 	perf_output_put(&handle, mmap_event->event_id);
 	perf_output_copy(&handle, mmap_event->file_name,
 			   mmap_event->file_size);
+
+	perf_event__output_id_sample(event, &handle, &sample);
+
 	perf_output_end(&handle);
+out:
+	mmap_event->event_id.header.size = size;
 }
 
 static int perf_event_mmap_match(struct perf_event *event,
@@ -4112,6 +4266,8 @@ got_name:
 	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+		if (cpuctx->active_pmu != pmu)
+			goto next;
 		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
 					vma->vm_flags & VM_EXEC);
 
@@ -4167,6 +4323,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
 static void perf_log_throttle(struct perf_event *event, int enable)
 {
 	struct perf_output_handle handle;
+	struct perf_sample_data sample;
 	int ret;
 
 	struct {
@@ -4188,11 +4345,15 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 	if (enable)
 		throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
 
-	ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
+	perf_event_header__init_id(&throttle_event.header, &sample, event);
+
+	ret = perf_output_begin(&handle, event,
+				throttle_event.header.size, 1, 0);
 	if (ret)
 		return;
 
 	perf_output_put(&handle, throttle_event);
+	perf_event__output_id_sample(event, &handle, &sample);
 	perf_output_end(&handle);
 }
 
@@ -4208,6 +4369,13 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 	struct hw_perf_event *hwc = &event->hw;
 	int ret = 0;
 
+	/*
+	 * Non-sampling counters might still use the PMI to fold short
+	 * hardware counters, ignore those.
+	 */
+	if (unlikely(!is_sampling_event(event)))
+		return 0;
+
 	if (!throttle) {
 		hwc->interrupts++;
 	} else {
@@ -4353,7 +4521,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
 	if (!regs)
 		return;
 
-	if (!hwc->sample_period)
+	if (!is_sampling_event(event))
 		return;
 
 	if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
@@ -4516,7 +4684,7 @@ static int perf_swevent_add(struct perf_event *event, int flags)
 	struct hw_perf_event *hwc = &event->hw;
 	struct hlist_head *head;
 
-	if (hwc->sample_period) {
+	if (is_sampling_event(event)) {
 		hwc->last_period = hwc->sample_period;
 		perf_swevent_set_period(event);
 	}
@@ -4681,7 +4849,7 @@ static int perf_swevent_init(struct perf_event *event)
 		break;
 	}
 
-	if (event_id > PERF_COUNT_SW_MAX)
+	if (event_id >= PERF_COUNT_SW_MAX)
 		return -ENOENT;
 
 	if (!event->parent) {
@@ -4773,15 +4941,6 @@ static int perf_tp_event_init(struct perf_event *event)
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return -ENOENT;
 
-	/*
-	 * Raw tracepoint data is a severe data leak, only allow root to
-	 * have these.
-	 */
-	if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
-			perf_paranoid_tracepoint_raw() &&
-			!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
 	err = perf_trace_init(event);
 	if (err)
 		return err;
@@ -4804,7 +4963,7 @@ static struct pmu perf_tracepoint = {
 
 static inline void perf_tp_register(void)
 {
-	perf_pmu_register(&perf_tracepoint);
+	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
 }
 
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4894,31 +5053,33 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 static void perf_swevent_start_hrtimer(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
+	s64 period;
+
+	if (!is_sampling_event(event))
+		return;
 
 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hwc->hrtimer.function = perf_swevent_hrtimer;
-	if (hwc->sample_period) {
-		s64 period = local64_read(&hwc->period_left);
 
-		if (period) {
-			if (period < 0)
-				period = 10000;
+	period = local64_read(&hwc->period_left);
+	if (period) {
+		if (period < 0)
+			period = 10000;
 
 		local64_set(&hwc->period_left, 0);
 	} else {
 		period = max_t(u64, 10000, hwc->sample_period);
 	}
 	__hrtimer_start_range_ns(&hwc->hrtimer,
 				ns_to_ktime(period), 0,
 				HRTIMER_MODE_REL_PINNED, 0);
-	}
 }
 
 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
-	if (hwc->sample_period) {
+	if (is_sampling_event(event)) {
 		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
 		local64_set(&hwc->period_left, ktime_to_ns(remaining));
 
@@ -5113,25 +5274,94 @@ static void *find_pmu_context(int ctxn)
 	return NULL;
 }
 
-static void free_pmu_context(void * __percpu cpu_context)
+static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
 {
-	struct pmu *pmu;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct perf_cpu_context *cpuctx;
+
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+
+		if (cpuctx->active_pmu == old_pmu)
+			cpuctx->active_pmu = pmu;
+	}
+}
+
+static void free_pmu_context(struct pmu *pmu)
+{
+	struct pmu *i;
 
 	mutex_lock(&pmus_lock);
 	/*
 	 * Like a real lame refcount.
 	 */
-	list_for_each_entry(pmu, &pmus, entry) {
-		if (pmu->pmu_cpu_context == cpu_context)
+	list_for_each_entry(i, &pmus, entry) {
+		if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
+			update_pmu_context(i, pmu);
 			goto out;
+		}
 	}
 
-	free_percpu(cpu_context);
+	free_percpu(pmu->pmu_cpu_context);
 out:
 	mutex_unlock(&pmus_lock);
 }
+static struct idr pmu_idr;
+
+static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+
+	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+}
+
+static struct device_attribute pmu_dev_attrs[] = {
+	__ATTR_RO(type),
+	__ATTR_NULL,
+};
+
+static int pmu_bus_running;
+static struct bus_type pmu_bus = {
+	.name = "event_source",
+	.dev_attrs = pmu_dev_attrs,
+};
+
+static void pmu_dev_release(struct device *dev)
+{
+	kfree(dev);
+}
+
+static int pmu_dev_alloc(struct pmu *pmu)
+{
+	int ret = -ENOMEM;
+
+	pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
+	if (!pmu->dev)
+		goto out;
+
+	device_initialize(pmu->dev);
+	ret = dev_set_name(pmu->dev, "%s", pmu->name);
+	if (ret)
+		goto free_dev;
+
+	dev_set_drvdata(pmu->dev, pmu);
+	pmu->dev->bus = &pmu_bus;
+	pmu->dev->release = pmu_dev_release;
+	ret = device_add(pmu->dev);
+	if (ret)
+		goto free_dev;
+
+out:
+	return ret;
+
+free_dev:
+	put_device(pmu->dev);
+	goto out;
+}
 
-int perf_pmu_register(struct pmu *pmu)
+int perf_pmu_register(struct pmu *pmu, char *name, int type)
 {
 	int cpu, ret;
 
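pmu_dev_alloc() above publishes every named PMU on the new "event_source" bus, and the type attribute exported by type_show() is the value userspace should place in perf_event_attr.type. A hedged userspace sketch of that round trip follows; error handling is abbreviated, "software" is simply the PMU name registered later in this diff, read_pmu_type() is an invented helper, and perf_event_open() is invoked through syscall() since glibc provides no wrapper:

/* Illustrative sketch: resolve a dynamic PMU type from the new
 * /sys/bus/event_source hierarchy and open a counter with it. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int read_pmu_type(const char *pmu)
{
	char path[256];
	FILE *f;
	int type = -1;

	snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/type", pmu);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (fscanf(f, "%d", &type) != 1)
		type = -1;
	fclose(f);
	return type;
}

int main(void)
{
	struct perf_event_attr attr;
	int type, fd;

	type = read_pmu_type("software");	/* same value as PERF_TYPE_SOFTWARE here */
	if (type < 0)
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		perror("perf_event_open");
	else
		close(fd);
	return 0;
}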
@@ -5141,13 +5371,38 @@ int perf_pmu_register(struct pmu *pmu)
 	if (!pmu->pmu_disable_count)
 		goto unlock;
 
+	pmu->type = -1;
+	if (!name)
+		goto skip_type;
+	pmu->name = name;
+
+	if (type < 0) {
+		int err = idr_pre_get(&pmu_idr, GFP_KERNEL);
+		if (!err)
+			goto free_pdc;
+
+		err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type);
+		if (err) {
+			ret = err;
+			goto free_pdc;
+		}
+	}
+	pmu->type = type;
+
+	if (pmu_bus_running) {
+		ret = pmu_dev_alloc(pmu);
+		if (ret)
+			goto free_idr;
+	}
+
+skip_type:
 	pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
 	if (pmu->pmu_cpu_context)
 		goto got_cpu_context;
 
 	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
 	if (!pmu->pmu_cpu_context)
-		goto free_pdc;
+		goto free_dev;
 
 	for_each_possible_cpu(cpu) {
 		struct perf_cpu_context *cpuctx;
@@ -5158,6 +5413,7 @@ int perf_pmu_register(struct pmu *pmu)
 		cpuctx->ctx.pmu = pmu;
 		cpuctx->jiffies_interval = 1;
 		INIT_LIST_HEAD(&cpuctx->rotation_list);
+		cpuctx->active_pmu = pmu;
 	}
 
 got_cpu_context:
@@ -5190,6 +5446,14 @@ unlock:
 
 	return ret;
 
+free_dev:
+	device_del(pmu->dev);
+	put_device(pmu->dev);
+
+free_idr:
+	if (pmu->type >= PERF_TYPE_MAX)
+		idr_remove(&pmu_idr, pmu->type);
+
 free_pdc:
 	free_percpu(pmu->pmu_disable_count);
 	goto unlock;
@@ -5209,7 +5473,11 @@ void perf_pmu_unregister(struct pmu *pmu)
 	synchronize_rcu();
 
 	free_percpu(pmu->pmu_disable_count);
-	free_pmu_context(pmu->pmu_cpu_context);
+	if (pmu->type >= PERF_TYPE_MAX)
+		idr_remove(&pmu_idr, pmu->type);
+	device_del(pmu->dev);
+	put_device(pmu->dev);
+	free_pmu_context(pmu);
 }
 
 struct pmu *perf_init_event(struct perf_event *event)
@@ -5218,6 +5486,13 @@ struct pmu *perf_init_event(struct perf_event *event)
 	int idx;
 
 	idx = srcu_read_lock(&pmus_srcu);
+
+	rcu_read_lock();
+	pmu = idr_find(&pmu_idr, event->attr.type);
+	rcu_read_unlock();
+	if (pmu)
+		goto unlock;
+
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		int ret = pmu->event_init(event);
 		if (!ret)
@@ -5677,12 +5952,18 @@ SYSCALL_DEFINE5(perf_event_open,
 	mutex_unlock(&ctx->mutex);
 
 	event->owner = current;
-	get_task_struct(current);
+
 	mutex_lock(&current->perf_event_mutex);
 	list_add_tail(&event->owner_entry, &current->perf_event_list);
 	mutex_unlock(&current->perf_event_mutex);
 
 	/*
+	 * Precalculate sample_data sizes
+	 */
+	perf_event__header_size(event);
+	perf_event__id_header_size(event);
+
+	/*
 	 * Drop the reference on the group_event after placing the
 	 * new event on the sibling_list. This ensures destruction
 	 * of the group leader will find the pointer to itself in
@@ -5745,12 +6026,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	++ctx->generation;
 	mutex_unlock(&ctx->mutex);
 
-	event->owner = current;
-	get_task_struct(current);
-	mutex_lock(&current->perf_event_mutex);
-	list_add_tail(&event->owner_entry, &current->perf_event_list);
-	mutex_unlock(&current->perf_event_mutex);
-
 	return event;
 
 err_free:
@@ -5901,8 +6176,24 @@ again:
  */
 void perf_event_exit_task(struct task_struct *child)
 {
+	struct perf_event *event, *tmp;
 	int ctxn;
 
+	mutex_lock(&child->perf_event_mutex);
+	list_for_each_entry_safe(event, tmp, &child->perf_event_list,
+				 owner_entry) {
+		list_del_init(&event->owner_entry);
+
+		/*
+		 * Ensure the list deletion is visible before we clear
+		 * the owner, closes a race against perf_release() where
+		 * we need to serialize on the owner->perf_event_mutex.
+		 */
+		smp_wmb();
+		event->owner = NULL;
+	}
+	mutex_unlock(&child->perf_event_mutex);
+
 	for_each_task_context_nr(ctxn)
 		perf_event_exit_task_context(child, ctxn);
 }
@@ -6025,6 +6316,12 @@ inherit_event(struct perf_event *parent_event,
 	child_event->overflow_handler = parent_event->overflow_handler;
 
 	/*
+	 * Precalculate sample_data sizes
+	 */
+	perf_event__header_size(child_event);
+	perf_event__id_header_size(child_event);
+
+	/*
 	 * Link it up in the child's context:
 	 */
 	raw_spin_lock_irqsave(&child_ctx->lock, flags);
@@ -6122,6 +6419,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 	struct perf_event *event;
 	struct task_struct *parent = current;
 	int inherited_all = 1;
+	unsigned long flags;
 	int ret = 0;
 
 	child->perf_event_ctxp[ctxn] = NULL;
@@ -6162,6 +6460,15 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 			break;
 	}
 
+	/*
+	 * We can't hold ctx->lock when iterating the ->flexible_group list due
+	 * to allocations, but we need to prevent rotation because
+	 * rotate_ctx() will change the list from interrupt context.
+	 */
+	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
+	parent_ctx->rotate_disable = 1;
+	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+
 	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
 		ret = inherit_task_group(event, parent, parent_ctx,
 					 child, ctxn, &inherited_all);
@@ -6169,6 +6476,10 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 			break;
 	}
 
+	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
+	parent_ctx->rotate_disable = 0;
+	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+
 	child_ctx = child->perf_event_ctxp[ctxn];
 
 	if (child_ctx && inherited_all) {
@@ -6241,7 +6552,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
 	mutex_unlock(&swhash->hlist_mutex);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
 static void perf_pmu_rotate_stop(struct pmu *pmu)
 {
 	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -6295,6 +6606,26 @@ static void perf_event_exit_cpu(int cpu)
 static inline void perf_event_exit_cpu(int cpu) { }
 #endif
 
+static int
+perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		perf_event_exit_cpu(cpu);
+
+	return NOTIFY_OK;
+}
+
+/*
+ * Run the perf reboot notifier at the very last possible moment so that
+ * the generic watchdog code runs as long as possible.
+ */
+static struct notifier_block perf_reboot_notifier = {
+	.notifier_call = perf_reboot,
+	.priority = INT_MIN,
+};
+
 static int __cpuinit
 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 {
@@ -6321,11 +6652,47 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 
 void __init perf_event_init(void)
 {
+	int ret;
+
+	idr_init(&pmu_idr);
+
 	perf_event_init_all_cpus();
 	init_srcu_struct(&pmus_srcu);
-	perf_pmu_register(&perf_swevent);
-	perf_pmu_register(&perf_cpu_clock);
-	perf_pmu_register(&perf_task_clock);
+	perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
+	perf_pmu_register(&perf_cpu_clock, NULL, -1);
+	perf_pmu_register(&perf_task_clock, NULL, -1);
 	perf_tp_register();
 	perf_cpu_notifier(perf_cpu_notify);
+	register_reboot_notifier(&perf_reboot_notifier);
+
+	ret = init_hw_breakpoint();
+	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
+}
+
+static int __init perf_event_sysfs_init(void)
+{
+	struct pmu *pmu;
+	int ret;
+
+	mutex_lock(&pmus_lock);
+
+	ret = bus_register(&pmu_bus);
+	if (ret)
+		goto unlock;
+
+	list_for_each_entry(pmu, &pmus, entry) {
+		if (!pmu->name || pmu->type < 0)
+			continue;
+
+		ret = pmu_dev_alloc(pmu);
+		WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
+	}
+	pmu_bus_running = 1;
+	ret = 0;
+
+unlock:
+	mutex_unlock(&pmus_lock);
+
+	return ret;
 }
+device_initcall(perf_event_sysfs_init);
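Taken together, the perf_event_header__init_id()/perf_event__output_id_sample() pairs added throughout this diff mean that, when perf_event_attr.sample_id_all is set, every non-sample record (MMAP, COMM, TASK, READ, THROTTLE and the synthesized LOST records) carries the selected identification fields at its tail, in the fixed order written by __perf_event__output_id_sample(). A sketch of the layout a ring-buffer reader has to account for; the struct name below is illustrative, each field is present only when the matching PERF_SAMPLE_* bit is set in sample_type, and event->id_header_size is the sum of the fields actually selected:

#include <linux/types.h>

/* Illustrative layout of the identification block appended to the tail of
 * non-sample records when perf_event_attr.sample_id_all is set. Fields
 * appear in this order, each only if its PERF_SAMPLE_* bit is selected. */
struct sample_id_tail {
	__u32 pid, tid;		/* PERF_SAMPLE_TID */
	__u64 time;		/* PERF_SAMPLE_TIME */
	__u64 id;		/* PERF_SAMPLE_ID */
	__u64 stream_id;	/* PERF_SAMPLE_STREAM_ID */
	__u32 cpu, res;		/* PERF_SAMPLE_CPU */
};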