author     Andrea Bastoni <bastoni@cs.unc.edu>   2010-05-30 19:16:45 -0400
committer  Andrea Bastoni <bastoni@cs.unc.edu>   2010-05-30 19:16:45 -0400
commit     ada47b5fe13d89735805b566185f4885f5a3f750 (patch)
tree       644b88f8a71896307d71438e9b3af49126ffb22b /kernel/perf_event.c
parent     43e98717ad40a4ae64545b5ba047c7b86aa44f4f (diff)
parent     3280f21d43ee541f97f8cda5792150d2dbec20d5 (diff)
Merge branch 'wip-2.6.34' into old-private-master  (archived-private-master)
Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--  kernel/perf_event.c  1537
1 files changed, 986 insertions, 551 deletions
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 7f29643c8985..3d1552d3c12b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -15,6 +15,7 @@
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/poll.h> 17#include <linux/poll.h>
18#include <linux/slab.h>
18#include <linux/sysfs.h> 19#include <linux/sysfs.h>
19#include <linux/dcache.h> 20#include <linux/dcache.h>
20#include <linux/percpu.h> 21#include <linux/percpu.h>
@@ -28,13 +29,15 @@
28#include <linux/anon_inodes.h> 29#include <linux/anon_inodes.h>
29#include <linux/kernel_stat.h> 30#include <linux/kernel_stat.h>
30#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/ftrace_event.h>
33#include <linux/hw_breakpoint.h>
31 34
32#include <asm/irq_regs.h> 35#include <asm/irq_regs.h>
33 36
34/* 37/*
35 * Each CPU has a list of per CPU events: 38 * Each CPU has a list of per CPU events:
36 */ 39 */
37DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); 40static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
38 41
39int perf_max_events __read_mostly = 1; 42int perf_max_events __read_mostly = 1;
40static int perf_reserved_percpu __read_mostly; 43static int perf_reserved_percpu __read_mostly;
@@ -54,21 +57,6 @@ static atomic_t nr_task_events __read_mostly;
54 */ 57 */
55int sysctl_perf_event_paranoid __read_mostly = 1; 58int sysctl_perf_event_paranoid __read_mostly = 1;
56 59
57static inline bool perf_paranoid_tracepoint_raw(void)
58{
59 return sysctl_perf_event_paranoid > -1;
60}
61
62static inline bool perf_paranoid_cpu(void)
63{
64 return sysctl_perf_event_paranoid > 0;
65}
66
67static inline bool perf_paranoid_kernel(void)
68{
69 return sysctl_perf_event_paranoid > 1;
70}
71
72int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 60int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
73 61
74/* 62/*
@@ -94,13 +82,10 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
94void __weak hw_perf_disable(void) { barrier(); } 82void __weak hw_perf_disable(void) { barrier(); }
95void __weak hw_perf_enable(void) { barrier(); } 83void __weak hw_perf_enable(void) { barrier(); }
96 84
97void __weak hw_perf_event_setup(int cpu) { barrier(); }
98void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
99
100int __weak 85int __weak
101hw_perf_group_sched_in(struct perf_event *group_leader, 86hw_perf_group_sched_in(struct perf_event *group_leader,
102 struct perf_cpu_context *cpuctx, 87 struct perf_cpu_context *cpuctx,
103 struct perf_event_context *ctx, int cpu) 88 struct perf_event_context *ctx)
104{ 89{
105 return 0; 90 return 0;
106} 91}
@@ -109,25 +94,15 @@ void __weak perf_event_print_debug(void) { }
109 94
110static DEFINE_PER_CPU(int, perf_disable_count); 95static DEFINE_PER_CPU(int, perf_disable_count);
111 96
112void __perf_disable(void)
113{
114 __get_cpu_var(perf_disable_count)++;
115}
116
117bool __perf_enable(void)
118{
119 return !--__get_cpu_var(perf_disable_count);
120}
121
122void perf_disable(void) 97void perf_disable(void)
123{ 98{
124 __perf_disable(); 99 if (!__get_cpu_var(perf_disable_count)++)
125 hw_perf_disable(); 100 hw_perf_disable();
126} 101}
127 102
128void perf_enable(void) 103void perf_enable(void)
129{ 104{
130 if (__perf_enable()) 105 if (!--__get_cpu_var(perf_disable_count))
131 hw_perf_enable(); 106 hw_perf_enable();
132} 107}
133 108
@@ -201,14 +176,14 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
201 * if so. If we locked the right context, then it 176 * if so. If we locked the right context, then it
202 * can't get swapped on us any more. 177 * can't get swapped on us any more.
203 */ 178 */
204 spin_lock_irqsave(&ctx->lock, *flags); 179 raw_spin_lock_irqsave(&ctx->lock, *flags);
205 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 180 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
206 spin_unlock_irqrestore(&ctx->lock, *flags); 181 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
207 goto retry; 182 goto retry;
208 } 183 }
209 184
210 if (!atomic_inc_not_zero(&ctx->refcount)) { 185 if (!atomic_inc_not_zero(&ctx->refcount)) {
211 spin_unlock_irqrestore(&ctx->lock, *flags); 186 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
212 ctx = NULL; 187 ctx = NULL;
213 } 188 }
214 } 189 }
@@ -229,7 +204,7 @@ static struct perf_event_context *perf_pin_task_context(struct task_struct *task
229 ctx = perf_lock_task_context(task, &flags); 204 ctx = perf_lock_task_context(task, &flags);
230 if (ctx) { 205 if (ctx) {
231 ++ctx->pin_count; 206 ++ctx->pin_count;
232 spin_unlock_irqrestore(&ctx->lock, flags); 207 raw_spin_unlock_irqrestore(&ctx->lock, flags);
233 } 208 }
234 return ctx; 209 return ctx;
235} 210}
@@ -238,12 +213,64 @@ static void perf_unpin_context(struct perf_event_context *ctx)
238{ 213{
239 unsigned long flags; 214 unsigned long flags;
240 215
241 spin_lock_irqsave(&ctx->lock, flags); 216 raw_spin_lock_irqsave(&ctx->lock, flags);
242 --ctx->pin_count; 217 --ctx->pin_count;
243 spin_unlock_irqrestore(&ctx->lock, flags); 218 raw_spin_unlock_irqrestore(&ctx->lock, flags);
244 put_ctx(ctx); 219 put_ctx(ctx);
245} 220}
246 221
222static inline u64 perf_clock(void)
223{
224 return cpu_clock(raw_smp_processor_id());
225}
226
227/*
228 * Update the record of the current time in a context.
229 */
230static void update_context_time(struct perf_event_context *ctx)
231{
232 u64 now = perf_clock();
233
234 ctx->time += now - ctx->timestamp;
235 ctx->timestamp = now;
236}
237
238/*
239 * Update the total_time_enabled and total_time_running fields for a event.
240 */
241static void update_event_times(struct perf_event *event)
242{
243 struct perf_event_context *ctx = event->ctx;
244 u64 run_end;
245
246 if (event->state < PERF_EVENT_STATE_INACTIVE ||
247 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
248 return;
249
250 if (ctx->is_active)
251 run_end = ctx->time;
252 else
253 run_end = event->tstamp_stopped;
254
255 event->total_time_enabled = run_end - event->tstamp_enabled;
256
257 if (event->state == PERF_EVENT_STATE_INACTIVE)
258 run_end = event->tstamp_stopped;
259 else
260 run_end = ctx->time;
261
262 event->total_time_running = run_end - event->tstamp_running;
263}
264
265static struct list_head *
266ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
267{
268 if (event->attr.pinned)
269 return &ctx->pinned_groups;
270 else
271 return &ctx->flexible_groups;
272}
273
247/* 274/*
248 * Add a event from the lists for its context. 275 * Add a event from the lists for its context.
249 * Must be called with ctx->mutex and ctx->lock held. 276 * Must be called with ctx->mutex and ctx->lock held.
@@ -258,9 +285,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
258 * add it straight to the context's event list, or to the group 285 * add it straight to the context's event list, or to the group
259 * leader's sibling list: 286 * leader's sibling list:
260 */ 287 */
261 if (group_leader == event) 288 if (group_leader == event) {
262 list_add_tail(&event->group_entry, &ctx->group_list); 289 struct list_head *list;
263 else { 290
291 if (is_software_event(event))
292 event->group_flags |= PERF_GROUP_SOFTWARE;
293
294 list = ctx_group_list(event, ctx);
295 list_add_tail(&event->group_entry, list);
296 } else {
297 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
298 !is_software_event(event))
299 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
300
264 list_add_tail(&event->group_entry, &group_leader->sibling_list); 301 list_add_tail(&event->group_entry, &group_leader->sibling_list);
265 group_leader->nr_siblings++; 302 group_leader->nr_siblings++;
266 } 303 }
@@ -292,15 +329,32 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
292 if (event->group_leader != event) 329 if (event->group_leader != event)
293 event->group_leader->nr_siblings--; 330 event->group_leader->nr_siblings--;
294 331
332 update_event_times(event);
333
334 /*
335 * If event was in error state, then keep it
336 * that way, otherwise bogus counts will be
337 * returned on read(). The only way to get out
338 * of error state is by explicit re-enabling
339 * of the event
340 */
341 if (event->state > PERF_EVENT_STATE_OFF)
342 event->state = PERF_EVENT_STATE_OFF;
343
295 /* 344 /*
296 * If this was a group event with sibling events then 345 * If this was a group event with sibling events then
297 * upgrade the siblings to singleton events by adding them 346 * upgrade the siblings to singleton events by adding them
298 * to the context list directly: 347 * to the context list directly:
299 */ 348 */
300 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 349 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
350 struct list_head *list;
301 351
302 list_move_tail(&sibling->group_entry, &ctx->group_list); 352 list = ctx_group_list(event, ctx);
353 list_move_tail(&sibling->group_entry, list);
303 sibling->group_leader = sibling; 354 sibling->group_leader = sibling;
355
356 /* Inherit group flags from the previous leader */
357 sibling->group_flags = event->group_flags;
304 } 358 }
305} 359}
306 360
@@ -370,7 +424,7 @@ static void __perf_event_remove_from_context(void *info)
370 if (ctx->task && cpuctx->task_ctx != ctx) 424 if (ctx->task && cpuctx->task_ctx != ctx)
371 return; 425 return;
372 426
373 spin_lock(&ctx->lock); 427 raw_spin_lock(&ctx->lock);
374 /* 428 /*
375 * Protect the list operation against NMI by disabling the 429 * Protect the list operation against NMI by disabling the
376 * events on a global level. 430 * events on a global level.
@@ -392,7 +446,7 @@ static void __perf_event_remove_from_context(void *info)
392 } 446 }
393 447
394 perf_enable(); 448 perf_enable();
395 spin_unlock(&ctx->lock); 449 raw_spin_unlock(&ctx->lock);
396} 450}
397 451
398 452
@@ -419,7 +473,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
419 if (!task) { 473 if (!task) {
420 /* 474 /*
421 * Per cpu events are removed via an smp call and 475 * Per cpu events are removed via an smp call and
422 * the removal is always sucessful. 476 * the removal is always successful.
423 */ 477 */
424 smp_call_function_single(event->cpu, 478 smp_call_function_single(event->cpu,
425 __perf_event_remove_from_context, 479 __perf_event_remove_from_context,
@@ -431,12 +485,12 @@ retry:
431 task_oncpu_function_call(task, __perf_event_remove_from_context, 485 task_oncpu_function_call(task, __perf_event_remove_from_context,
432 event); 486 event);
433 487
434 spin_lock_irq(&ctx->lock); 488 raw_spin_lock_irq(&ctx->lock);
435 /* 489 /*
436 * If the context is active we need to retry the smp call. 490 * If the context is active we need to retry the smp call.
437 */ 491 */
438 if (ctx->nr_active && !list_empty(&event->group_entry)) { 492 if (ctx->nr_active && !list_empty(&event->group_entry)) {
439 spin_unlock_irq(&ctx->lock); 493 raw_spin_unlock_irq(&ctx->lock);
440 goto retry; 494 goto retry;
441 } 495 }
442 496
@@ -445,48 +499,9 @@ retry:
445 * can remove the event safely, if the call above did not 499 * can remove the event safely, if the call above did not
446 * succeed. 500 * succeed.
447 */ 501 */
448 if (!list_empty(&event->group_entry)) { 502 if (!list_empty(&event->group_entry))
449 list_del_event(event, ctx); 503 list_del_event(event, ctx);
450 } 504 raw_spin_unlock_irq(&ctx->lock);
451 spin_unlock_irq(&ctx->lock);
452}
453
454static inline u64 perf_clock(void)
455{
456 return cpu_clock(smp_processor_id());
457}
458
459/*
460 * Update the record of the current time in a context.
461 */
462static void update_context_time(struct perf_event_context *ctx)
463{
464 u64 now = perf_clock();
465
466 ctx->time += now - ctx->timestamp;
467 ctx->timestamp = now;
468}
469
470/*
471 * Update the total_time_enabled and total_time_running fields for a event.
472 */
473static void update_event_times(struct perf_event *event)
474{
475 struct perf_event_context *ctx = event->ctx;
476 u64 run_end;
477
478 if (event->state < PERF_EVENT_STATE_INACTIVE ||
479 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
480 return;
481
482 event->total_time_enabled = ctx->time - event->tstamp_enabled;
483
484 if (event->state == PERF_EVENT_STATE_INACTIVE)
485 run_end = event->tstamp_stopped;
486 else
487 run_end = ctx->time;
488
489 event->total_time_running = run_end - event->tstamp_running;
490} 505}
491 506
492/* 507/*
@@ -517,7 +532,7 @@ static void __perf_event_disable(void *info)
517 if (ctx->task && cpuctx->task_ctx != ctx) 532 if (ctx->task && cpuctx->task_ctx != ctx)
518 return; 533 return;
519 534
520 spin_lock(&ctx->lock); 535 raw_spin_lock(&ctx->lock);
521 536
522 /* 537 /*
523 * If the event is on, turn it off. 538 * If the event is on, turn it off.
@@ -533,7 +548,7 @@ static void __perf_event_disable(void *info)
533 event->state = PERF_EVENT_STATE_OFF; 548 event->state = PERF_EVENT_STATE_OFF;
534 } 549 }
535 550
536 spin_unlock(&ctx->lock); 551 raw_spin_unlock(&ctx->lock);
537} 552}
538 553
539/* 554/*
@@ -549,7 +564,7 @@ static void __perf_event_disable(void *info)
549 * is the current context on this CPU and preemption is disabled, 564 * is the current context on this CPU and preemption is disabled,
550 * hence we can't get into perf_event_task_sched_out for this context. 565 * hence we can't get into perf_event_task_sched_out for this context.
551 */ 566 */
552static void perf_event_disable(struct perf_event *event) 567void perf_event_disable(struct perf_event *event)
553{ 568{
554 struct perf_event_context *ctx = event->ctx; 569 struct perf_event_context *ctx = event->ctx;
555 struct task_struct *task = ctx->task; 570 struct task_struct *task = ctx->task;
@@ -566,12 +581,12 @@ static void perf_event_disable(struct perf_event *event)
566 retry: 581 retry:
567 task_oncpu_function_call(task, __perf_event_disable, event); 582 task_oncpu_function_call(task, __perf_event_disable, event);
568 583
569 spin_lock_irq(&ctx->lock); 584 raw_spin_lock_irq(&ctx->lock);
570 /* 585 /*
571 * If the event is still active, we need to retry the cross-call. 586 * If the event is still active, we need to retry the cross-call.
572 */ 587 */
573 if (event->state == PERF_EVENT_STATE_ACTIVE) { 588 if (event->state == PERF_EVENT_STATE_ACTIVE) {
574 spin_unlock_irq(&ctx->lock); 589 raw_spin_unlock_irq(&ctx->lock);
575 goto retry; 590 goto retry;
576 } 591 }
577 592
@@ -584,20 +599,19 @@ static void perf_event_disable(struct perf_event *event)
584 event->state = PERF_EVENT_STATE_OFF; 599 event->state = PERF_EVENT_STATE_OFF;
585 } 600 }
586 601
587 spin_unlock_irq(&ctx->lock); 602 raw_spin_unlock_irq(&ctx->lock);
588} 603}
589 604
590static int 605static int
591event_sched_in(struct perf_event *event, 606event_sched_in(struct perf_event *event,
592 struct perf_cpu_context *cpuctx, 607 struct perf_cpu_context *cpuctx,
593 struct perf_event_context *ctx, 608 struct perf_event_context *ctx)
594 int cpu)
595{ 609{
596 if (event->state <= PERF_EVENT_STATE_OFF) 610 if (event->state <= PERF_EVENT_STATE_OFF)
597 return 0; 611 return 0;
598 612
599 event->state = PERF_EVENT_STATE_ACTIVE; 613 event->state = PERF_EVENT_STATE_ACTIVE;
600 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ 614 event->oncpu = smp_processor_id();
601 /* 615 /*
602 * The new state must be visible before we turn it on in the hardware: 616 * The new state must be visible before we turn it on in the hardware:
603 */ 617 */
@@ -624,8 +638,7 @@ event_sched_in(struct perf_event *event,
624static int 638static int
625group_sched_in(struct perf_event *group_event, 639group_sched_in(struct perf_event *group_event,
626 struct perf_cpu_context *cpuctx, 640 struct perf_cpu_context *cpuctx,
627 struct perf_event_context *ctx, 641 struct perf_event_context *ctx)
628 int cpu)
629{ 642{
630 struct perf_event *event, *partial_group; 643 struct perf_event *event, *partial_group;
631 int ret; 644 int ret;
@@ -633,18 +646,18 @@ group_sched_in(struct perf_event *group_event,
633 if (group_event->state == PERF_EVENT_STATE_OFF) 646 if (group_event->state == PERF_EVENT_STATE_OFF)
634 return 0; 647 return 0;
635 648
636 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); 649 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
637 if (ret) 650 if (ret)
638 return ret < 0 ? ret : 0; 651 return ret < 0 ? ret : 0;
639 652
640 if (event_sched_in(group_event, cpuctx, ctx, cpu)) 653 if (event_sched_in(group_event, cpuctx, ctx))
641 return -EAGAIN; 654 return -EAGAIN;
642 655
643 /* 656 /*
644 * Schedule in siblings as one group (if any): 657 * Schedule in siblings as one group (if any):
645 */ 658 */
646 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 659 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
647 if (event_sched_in(event, cpuctx, ctx, cpu)) { 660 if (event_sched_in(event, cpuctx, ctx)) {
648 partial_group = event; 661 partial_group = event;
649 goto group_error; 662 goto group_error;
650 } 663 }
@@ -668,24 +681,6 @@ group_error:
668} 681}
669 682
670/* 683/*
671 * Return 1 for a group consisting entirely of software events,
672 * 0 if the group contains any hardware events.
673 */
674static int is_software_only_group(struct perf_event *leader)
675{
676 struct perf_event *event;
677
678 if (!is_software_event(leader))
679 return 0;
680
681 list_for_each_entry(event, &leader->sibling_list, group_entry)
682 if (!is_software_event(event))
683 return 0;
684
685 return 1;
686}
687
688/*
689 * Work out whether we can put this event group on the CPU now. 684 * Work out whether we can put this event group on the CPU now.
690 */ 685 */
691static int group_can_go_on(struct perf_event *event, 686static int group_can_go_on(struct perf_event *event,
@@ -695,7 +690,7 @@ static int group_can_go_on(struct perf_event *event,
695 /* 690 /*
696 * Groups consisting entirely of software events can always go on. 691 * Groups consisting entirely of software events can always go on.
697 */ 692 */
698 if (is_software_only_group(event)) 693 if (event->group_flags & PERF_GROUP_SOFTWARE)
699 return 1; 694 return 1;
700 /* 695 /*
701 * If an exclusive group is already on, no other hardware 696 * If an exclusive group is already on, no other hardware
@@ -736,7 +731,6 @@ static void __perf_install_in_context(void *info)
736 struct perf_event *event = info; 731 struct perf_event *event = info;
737 struct perf_event_context *ctx = event->ctx; 732 struct perf_event_context *ctx = event->ctx;
738 struct perf_event *leader = event->group_leader; 733 struct perf_event *leader = event->group_leader;
739 int cpu = smp_processor_id();
740 int err; 734 int err;
741 735
742 /* 736 /*
@@ -752,7 +746,7 @@ static void __perf_install_in_context(void *info)
752 cpuctx->task_ctx = ctx; 746 cpuctx->task_ctx = ctx;
753 } 747 }
754 748
755 spin_lock(&ctx->lock); 749 raw_spin_lock(&ctx->lock);
756 ctx->is_active = 1; 750 ctx->is_active = 1;
757 update_context_time(ctx); 751 update_context_time(ctx);
758 752
@@ -764,6 +758,9 @@ static void __perf_install_in_context(void *info)
764 758
765 add_event_to_ctx(event, ctx); 759 add_event_to_ctx(event, ctx);
766 760
761 if (event->cpu != -1 && event->cpu != smp_processor_id())
762 goto unlock;
763
767 /* 764 /*
768 * Don't put the event on if it is disabled or if 765 * Don't put the event on if it is disabled or if
769 * it is in a group and the group isn't on. 766 * it is in a group and the group isn't on.
@@ -780,7 +777,7 @@ static void __perf_install_in_context(void *info)
780 if (!group_can_go_on(event, cpuctx, 1)) 777 if (!group_can_go_on(event, cpuctx, 1))
781 err = -EEXIST; 778 err = -EEXIST;
782 else 779 else
783 err = event_sched_in(event, cpuctx, ctx, cpu); 780 err = event_sched_in(event, cpuctx, ctx);
784 781
785 if (err) { 782 if (err) {
786 /* 783 /*
@@ -802,7 +799,7 @@ static void __perf_install_in_context(void *info)
802 unlock: 799 unlock:
803 perf_enable(); 800 perf_enable();
804 801
805 spin_unlock(&ctx->lock); 802 raw_spin_unlock(&ctx->lock);
806} 803}
807 804
808/* 805/*
@@ -827,7 +824,7 @@ perf_install_in_context(struct perf_event_context *ctx,
827 if (!task) { 824 if (!task) {
828 /* 825 /*
829 * Per cpu events are installed via an smp call and 826 * Per cpu events are installed via an smp call and
830 * the install is always sucessful. 827 * the install is always successful.
831 */ 828 */
832 smp_call_function_single(cpu, __perf_install_in_context, 829 smp_call_function_single(cpu, __perf_install_in_context,
833 event, 1); 830 event, 1);
@@ -838,12 +835,12 @@ retry:
838 task_oncpu_function_call(task, __perf_install_in_context, 835 task_oncpu_function_call(task, __perf_install_in_context,
839 event); 836 event);
840 837
841 spin_lock_irq(&ctx->lock); 838 raw_spin_lock_irq(&ctx->lock);
842 /* 839 /*
843 * we need to retry the smp call. 840 * we need to retry the smp call.
844 */ 841 */
845 if (ctx->is_active && list_empty(&event->group_entry)) { 842 if (ctx->is_active && list_empty(&event->group_entry)) {
846 spin_unlock_irq(&ctx->lock); 843 raw_spin_unlock_irq(&ctx->lock);
847 goto retry; 844 goto retry;
848 } 845 }
849 846
@@ -854,7 +851,7 @@ retry:
854 */ 851 */
855 if (list_empty(&event->group_entry)) 852 if (list_empty(&event->group_entry))
856 add_event_to_ctx(event, ctx); 853 add_event_to_ctx(event, ctx);
857 spin_unlock_irq(&ctx->lock); 854 raw_spin_unlock_irq(&ctx->lock);
858} 855}
859 856
860/* 857/*
@@ -899,7 +896,7 @@ static void __perf_event_enable(void *info)
899 cpuctx->task_ctx = ctx; 896 cpuctx->task_ctx = ctx;
900 } 897 }
901 898
902 spin_lock(&ctx->lock); 899 raw_spin_lock(&ctx->lock);
903 ctx->is_active = 1; 900 ctx->is_active = 1;
904 update_context_time(ctx); 901 update_context_time(ctx);
905 902
@@ -907,6 +904,9 @@ static void __perf_event_enable(void *info)
907 goto unlock; 904 goto unlock;
908 __perf_event_mark_enabled(event, ctx); 905 __perf_event_mark_enabled(event, ctx);
909 906
907 if (event->cpu != -1 && event->cpu != smp_processor_id())
908 goto unlock;
909
910 /* 910 /*
911 * If the event is in a group and isn't the group leader, 911 * If the event is in a group and isn't the group leader,
912 * then don't put it on unless the group is on. 912 * then don't put it on unless the group is on.
@@ -919,11 +919,9 @@ static void __perf_event_enable(void *info)
919 } else { 919 } else {
920 perf_disable(); 920 perf_disable();
921 if (event == leader) 921 if (event == leader)
922 err = group_sched_in(event, cpuctx, ctx, 922 err = group_sched_in(event, cpuctx, ctx);
923 smp_processor_id());
924 else 923 else
925 err = event_sched_in(event, cpuctx, ctx, 924 err = event_sched_in(event, cpuctx, ctx);
926 smp_processor_id());
927 perf_enable(); 925 perf_enable();
928 } 926 }
929 927
@@ -941,7 +939,7 @@ static void __perf_event_enable(void *info)
941 } 939 }
942 940
943 unlock: 941 unlock:
944 spin_unlock(&ctx->lock); 942 raw_spin_unlock(&ctx->lock);
945} 943}
946 944
947/* 945/*
@@ -953,7 +951,7 @@ static void __perf_event_enable(void *info)
953 * perf_event_for_each_child or perf_event_for_each as described 951 * perf_event_for_each_child or perf_event_for_each as described
954 * for perf_event_disable. 952 * for perf_event_disable.
955 */ 953 */
956static void perf_event_enable(struct perf_event *event) 954void perf_event_enable(struct perf_event *event)
957{ 955{
958 struct perf_event_context *ctx = event->ctx; 956 struct perf_event_context *ctx = event->ctx;
959 struct task_struct *task = ctx->task; 957 struct task_struct *task = ctx->task;
@@ -967,7 +965,7 @@ static void perf_event_enable(struct perf_event *event)
967 return; 965 return;
968 } 966 }
969 967
970 spin_lock_irq(&ctx->lock); 968 raw_spin_lock_irq(&ctx->lock);
971 if (event->state >= PERF_EVENT_STATE_INACTIVE) 969 if (event->state >= PERF_EVENT_STATE_INACTIVE)
972 goto out; 970 goto out;
973 971
@@ -982,10 +980,10 @@ static void perf_event_enable(struct perf_event *event)
982 event->state = PERF_EVENT_STATE_OFF; 980 event->state = PERF_EVENT_STATE_OFF;
983 981
984 retry: 982 retry:
985 spin_unlock_irq(&ctx->lock); 983 raw_spin_unlock_irq(&ctx->lock);
986 task_oncpu_function_call(task, __perf_event_enable, event); 984 task_oncpu_function_call(task, __perf_event_enable, event);
987 985
988 spin_lock_irq(&ctx->lock); 986 raw_spin_lock_irq(&ctx->lock);
989 987
990 /* 988 /*
991 * If the context is active and the event is still off, 989 * If the context is active and the event is still off,
@@ -1002,7 +1000,7 @@ static void perf_event_enable(struct perf_event *event)
1002 __perf_event_mark_enabled(event, ctx); 1000 __perf_event_mark_enabled(event, ctx);
1003 1001
1004 out: 1002 out:
1005 spin_unlock_irq(&ctx->lock); 1003 raw_spin_unlock_irq(&ctx->lock);
1006} 1004}
1007 1005
1008static int perf_event_refresh(struct perf_event *event, int refresh) 1006static int perf_event_refresh(struct perf_event *event, int refresh)
@@ -1019,25 +1017,40 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1019 return 0; 1017 return 0;
1020} 1018}
1021 1019
1022void __perf_event_sched_out(struct perf_event_context *ctx, 1020enum event_type_t {
1023 struct perf_cpu_context *cpuctx) 1021 EVENT_FLEXIBLE = 0x1,
1022 EVENT_PINNED = 0x2,
1023 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1024};
1025
1026static void ctx_sched_out(struct perf_event_context *ctx,
1027 struct perf_cpu_context *cpuctx,
1028 enum event_type_t event_type)
1024{ 1029{
1025 struct perf_event *event; 1030 struct perf_event *event;
1026 1031
1027 spin_lock(&ctx->lock); 1032 raw_spin_lock(&ctx->lock);
1028 ctx->is_active = 0; 1033 ctx->is_active = 0;
1029 if (likely(!ctx->nr_events)) 1034 if (likely(!ctx->nr_events))
1030 goto out; 1035 goto out;
1031 update_context_time(ctx); 1036 update_context_time(ctx);
1032 1037
1033 perf_disable(); 1038 perf_disable();
1034 if (ctx->nr_active) 1039 if (!ctx->nr_active)
1035 list_for_each_entry(event, &ctx->group_list, group_entry) 1040 goto out_enable;
1041
1042 if (event_type & EVENT_PINNED)
1043 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1036 group_sched_out(event, cpuctx, ctx); 1044 group_sched_out(event, cpuctx, ctx);
1037 1045
1046 if (event_type & EVENT_FLEXIBLE)
1047 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1048 group_sched_out(event, cpuctx, ctx);
1049
1050 out_enable:
1038 perf_enable(); 1051 perf_enable();
1039 out: 1052 out:
1040 spin_unlock(&ctx->lock); 1053 raw_spin_unlock(&ctx->lock);
1041} 1054}
1042 1055
1043/* 1056/*
@@ -1059,8 +1072,6 @@ static int context_equiv(struct perf_event_context *ctx1,
1059 && !ctx1->pin_count && !ctx2->pin_count; 1072 && !ctx1->pin_count && !ctx2->pin_count;
1060} 1073}
1061 1074
1062static void __perf_event_read(void *event);
1063
1064static void __perf_event_sync_stat(struct perf_event *event, 1075static void __perf_event_sync_stat(struct perf_event *event,
1065 struct perf_event *next_event) 1076 struct perf_event *next_event)
1066{ 1077{
@@ -1078,8 +1089,8 @@ static void __perf_event_sync_stat(struct perf_event *event,
1078 */ 1089 */
1079 switch (event->state) { 1090 switch (event->state) {
1080 case PERF_EVENT_STATE_ACTIVE: 1091 case PERF_EVENT_STATE_ACTIVE:
1081 __perf_event_read(event); 1092 event->pmu->read(event);
1082 break; 1093 /* fall-through */
1083 1094
1084 case PERF_EVENT_STATE_INACTIVE: 1095 case PERF_EVENT_STATE_INACTIVE:
1085 update_event_times(event); 1096 update_event_times(event);
@@ -1118,6 +1129,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1118 if (!ctx->nr_stat) 1129 if (!ctx->nr_stat)
1119 return; 1130 return;
1120 1131
1132 update_context_time(ctx);
1133
1121 event = list_first_entry(&ctx->event_list, 1134 event = list_first_entry(&ctx->event_list,
1122 struct perf_event, event_entry); 1135 struct perf_event, event_entry);
1123 1136
@@ -1146,23 +1159,19 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1146 * not restart the event. 1159 * not restart the event.
1147 */ 1160 */
1148void perf_event_task_sched_out(struct task_struct *task, 1161void perf_event_task_sched_out(struct task_struct *task,
1149 struct task_struct *next, int cpu) 1162 struct task_struct *next)
1150{ 1163{
1151 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1164 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1152 struct perf_event_context *ctx = task->perf_event_ctxp; 1165 struct perf_event_context *ctx = task->perf_event_ctxp;
1153 struct perf_event_context *next_ctx; 1166 struct perf_event_context *next_ctx;
1154 struct perf_event_context *parent; 1167 struct perf_event_context *parent;
1155 struct pt_regs *regs;
1156 int do_switch = 1; 1168 int do_switch = 1;
1157 1169
1158 regs = task_pt_regs(task); 1170 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1159 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1160 1171
1161 if (likely(!ctx || !cpuctx->task_ctx)) 1172 if (likely(!ctx || !cpuctx->task_ctx))
1162 return; 1173 return;
1163 1174
1164 update_context_time(ctx);
1165
1166 rcu_read_lock(); 1175 rcu_read_lock();
1167 parent = rcu_dereference(ctx->parent_ctx); 1176 parent = rcu_dereference(ctx->parent_ctx);
1168 next_ctx = next->perf_event_ctxp; 1177 next_ctx = next->perf_event_ctxp;
@@ -1177,8 +1186,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1177 * order we take the locks because no other cpu could 1186 * order we take the locks because no other cpu could
1178 * be trying to lock both of these tasks. 1187 * be trying to lock both of these tasks.
1179 */ 1188 */
1180 spin_lock(&ctx->lock); 1189 raw_spin_lock(&ctx->lock);
1181 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 1190 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1182 if (context_equiv(ctx, next_ctx)) { 1191 if (context_equiv(ctx, next_ctx)) {
1183 /* 1192 /*
1184 * XXX do we need a memory barrier of sorts 1193 * XXX do we need a memory barrier of sorts
@@ -1192,21 +1201,19 @@ void perf_event_task_sched_out(struct task_struct *task,
1192 1201
1193 perf_event_sync_stat(ctx, next_ctx); 1202 perf_event_sync_stat(ctx, next_ctx);
1194 } 1203 }
1195 spin_unlock(&next_ctx->lock); 1204 raw_spin_unlock(&next_ctx->lock);
1196 spin_unlock(&ctx->lock); 1205 raw_spin_unlock(&ctx->lock);
1197 } 1206 }
1198 rcu_read_unlock(); 1207 rcu_read_unlock();
1199 1208
1200 if (do_switch) { 1209 if (do_switch) {
1201 __perf_event_sched_out(ctx, cpuctx); 1210 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1202 cpuctx->task_ctx = NULL; 1211 cpuctx->task_ctx = NULL;
1203 } 1212 }
1204} 1213}
1205 1214
1206/* 1215static void task_ctx_sched_out(struct perf_event_context *ctx,
1207 * Called with IRQs disabled 1216 enum event_type_t event_type)
1208 */
1209static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1210{ 1217{
1211 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1218 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1212 1219
@@ -1216,47 +1223,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1216 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 1223 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1217 return; 1224 return;
1218 1225
1219 __perf_event_sched_out(ctx, cpuctx); 1226 ctx_sched_out(ctx, cpuctx, event_type);
1220 cpuctx->task_ctx = NULL; 1227 cpuctx->task_ctx = NULL;
1221} 1228}
1222 1229
1223/* 1230/*
1224 * Called with IRQs disabled 1231 * Called with IRQs disabled
1225 */ 1232 */
1226static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) 1233static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1234{
1235 task_ctx_sched_out(ctx, EVENT_ALL);
1236}
1237
1238/*
1239 * Called with IRQs disabled
1240 */
1241static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1242 enum event_type_t event_type)
1227{ 1243{
1228 __perf_event_sched_out(&cpuctx->ctx, cpuctx); 1244 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1229} 1245}
1230 1246
1231static void 1247static void
1232__perf_event_sched_in(struct perf_event_context *ctx, 1248ctx_pinned_sched_in(struct perf_event_context *ctx,
1233 struct perf_cpu_context *cpuctx, int cpu) 1249 struct perf_cpu_context *cpuctx)
1234{ 1250{
1235 struct perf_event *event; 1251 struct perf_event *event;
1236 int can_add_hw = 1;
1237
1238 spin_lock(&ctx->lock);
1239 ctx->is_active = 1;
1240 if (likely(!ctx->nr_events))
1241 goto out;
1242 1252
1243 ctx->timestamp = perf_clock(); 1253 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1244 1254 if (event->state <= PERF_EVENT_STATE_OFF)
1245 perf_disable();
1246
1247 /*
1248 * First go through the list and put on any pinned groups
1249 * in order to give them the best chance of going on.
1250 */
1251 list_for_each_entry(event, &ctx->group_list, group_entry) {
1252 if (event->state <= PERF_EVENT_STATE_OFF ||
1253 !event->attr.pinned)
1254 continue; 1255 continue;
1255 if (event->cpu != -1 && event->cpu != cpu) 1256 if (event->cpu != -1 && event->cpu != smp_processor_id())
1256 continue; 1257 continue;
1257 1258
1258 if (group_can_go_on(event, cpuctx, 1)) 1259 if (group_can_go_on(event, cpuctx, 1))
1259 group_sched_in(event, cpuctx, ctx, cpu); 1260 group_sched_in(event, cpuctx, ctx);
1260 1261
1261 /* 1262 /*
1262 * If this pinned group hasn't been scheduled, 1263 * If this pinned group hasn't been scheduled,
@@ -1267,32 +1268,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1267 event->state = PERF_EVENT_STATE_ERROR; 1268 event->state = PERF_EVENT_STATE_ERROR;
1268 } 1269 }
1269 } 1270 }
1271}
1270 1272
1271 list_for_each_entry(event, &ctx->group_list, group_entry) { 1273static void
1272 /* 1274ctx_flexible_sched_in(struct perf_event_context *ctx,
1273 * Ignore events in OFF or ERROR state, and 1275 struct perf_cpu_context *cpuctx)
1274 * ignore pinned events since we did them already. 1276{
1275 */ 1277 struct perf_event *event;
1276 if (event->state <= PERF_EVENT_STATE_OFF || 1278 int can_add_hw = 1;
1277 event->attr.pinned)
1278 continue;
1279 1279
1280 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1281 /* Ignore events in OFF or ERROR state */
1282 if (event->state <= PERF_EVENT_STATE_OFF)
1283 continue;
1280 /* 1284 /*
1281 * Listen to the 'cpu' scheduling filter constraint 1285 * Listen to the 'cpu' scheduling filter constraint
1282 * of events: 1286 * of events:
1283 */ 1287 */
1284 if (event->cpu != -1 && event->cpu != cpu) 1288 if (event->cpu != -1 && event->cpu != smp_processor_id())
1285 continue; 1289 continue;
1286 1290
1287 if (group_can_go_on(event, cpuctx, can_add_hw)) 1291 if (group_can_go_on(event, cpuctx, can_add_hw))
1288 if (group_sched_in(event, cpuctx, ctx, cpu)) 1292 if (group_sched_in(event, cpuctx, ctx))
1289 can_add_hw = 0; 1293 can_add_hw = 0;
1290 } 1294 }
1295}
1296
1297static void
1298ctx_sched_in(struct perf_event_context *ctx,
1299 struct perf_cpu_context *cpuctx,
1300 enum event_type_t event_type)
1301{
1302 raw_spin_lock(&ctx->lock);
1303 ctx->is_active = 1;
1304 if (likely(!ctx->nr_events))
1305 goto out;
1306
1307 ctx->timestamp = perf_clock();
1308
1309 perf_disable();
1310
1311 /*
1312 * First go through the list and put on any pinned groups
1313 * in order to give them the best chance of going on.
1314 */
1315 if (event_type & EVENT_PINNED)
1316 ctx_pinned_sched_in(ctx, cpuctx);
1317
1318 /* Then walk through the lower prio flexible groups */
1319 if (event_type & EVENT_FLEXIBLE)
1320 ctx_flexible_sched_in(ctx, cpuctx);
1321
1291 perf_enable(); 1322 perf_enable();
1292 out: 1323 out:
1293 spin_unlock(&ctx->lock); 1324 raw_spin_unlock(&ctx->lock);
1294} 1325}
1295 1326
1327static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1328 enum event_type_t event_type)
1329{
1330 struct perf_event_context *ctx = &cpuctx->ctx;
1331
1332 ctx_sched_in(ctx, cpuctx, event_type);
1333}
1334
1335static void task_ctx_sched_in(struct task_struct *task,
1336 enum event_type_t event_type)
1337{
1338 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1339 struct perf_event_context *ctx = task->perf_event_ctxp;
1340
1341 if (likely(!ctx))
1342 return;
1343 if (cpuctx->task_ctx == ctx)
1344 return;
1345 ctx_sched_in(ctx, cpuctx, event_type);
1346 cpuctx->task_ctx = ctx;
1347}
1296/* 1348/*
1297 * Called from scheduler to add the events of the current task 1349 * Called from scheduler to add the events of the current task
1298 * with interrupts disabled. 1350 * with interrupts disabled.
@@ -1304,38 +1356,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1304 * accessing the event control register. If a NMI hits, then it will 1356 * accessing the event control register. If a NMI hits, then it will
1305 * keep the event running. 1357 * keep the event running.
1306 */ 1358 */
1307void perf_event_task_sched_in(struct task_struct *task, int cpu) 1359void perf_event_task_sched_in(struct task_struct *task)
1308{ 1360{
1309 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1361 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1310 struct perf_event_context *ctx = task->perf_event_ctxp; 1362 struct perf_event_context *ctx = task->perf_event_ctxp;
1311 1363
1312 if (likely(!ctx)) 1364 if (likely(!ctx))
1313 return; 1365 return;
1366
1314 if (cpuctx->task_ctx == ctx) 1367 if (cpuctx->task_ctx == ctx)
1315 return; 1368 return;
1316 __perf_event_sched_in(ctx, cpuctx, cpu); 1369
1370 /*
1371 * We want to keep the following priority order:
1372 * cpu pinned (that don't need to move), task pinned,
1373 * cpu flexible, task flexible.
1374 */
1375 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1376
1377 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1378 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1379 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1380
1317 cpuctx->task_ctx = ctx; 1381 cpuctx->task_ctx = ctx;
1318} 1382}
1319 1383
1320static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) 1384#define MAX_INTERRUPTS (~0ULL)
1385
1386static void perf_log_throttle(struct perf_event *event, int enable);
1387
1388static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1321{ 1389{
1322 struct perf_event_context *ctx = &cpuctx->ctx; 1390 u64 frequency = event->attr.sample_freq;
1391 u64 sec = NSEC_PER_SEC;
1392 u64 divisor, dividend;
1393
1394 int count_fls, nsec_fls, frequency_fls, sec_fls;
1395
1396 count_fls = fls64(count);
1397 nsec_fls = fls64(nsec);
1398 frequency_fls = fls64(frequency);
1399 sec_fls = 30;
1400
1401 /*
1402 * We got @count in @nsec, with a target of sample_freq HZ
1403 * the target period becomes:
1404 *
1405 * @count * 10^9
1406 * period = -------------------
1407 * @nsec * sample_freq
1408 *
1409 */
1410
1411 /*
1412 * Reduce accuracy by one bit such that @a and @b converge
1413 * to a similar magnitude.
1414 */
1415#define REDUCE_FLS(a, b) \
1416do { \
1417 if (a##_fls > b##_fls) { \
1418 a >>= 1; \
1419 a##_fls--; \
1420 } else { \
1421 b >>= 1; \
1422 b##_fls--; \
1423 } \
1424} while (0)
1425
1426 /*
1427 * Reduce accuracy until either term fits in a u64, then proceed with
1428 * the other, so that finally we can do a u64/u64 division.
1429 */
1430 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1431 REDUCE_FLS(nsec, frequency);
1432 REDUCE_FLS(sec, count);
1433 }
1323 1434
1324 __perf_event_sched_in(ctx, cpuctx, cpu); 1435 if (count_fls + sec_fls > 64) {
1436 divisor = nsec * frequency;
1437
1438 while (count_fls + sec_fls > 64) {
1439 REDUCE_FLS(count, sec);
1440 divisor >>= 1;
1441 }
1442
1443 dividend = count * sec;
1444 } else {
1445 dividend = count * sec;
1446
1447 while (nsec_fls + frequency_fls > 64) {
1448 REDUCE_FLS(nsec, frequency);
1449 dividend >>= 1;
1450 }
1451
1452 divisor = nsec * frequency;
1453 }
1454
1455 return div64_u64(dividend, divisor);
1325} 1456}
1326 1457
1327#define MAX_INTERRUPTS (~0ULL) 1458static void perf_event_stop(struct perf_event *event)
1459{
1460 if (!event->pmu->stop)
1461 return event->pmu->disable(event);
1328 1462
1329static void perf_log_throttle(struct perf_event *event, int enable); 1463 return event->pmu->stop(event);
1464}
1465
1466static int perf_event_start(struct perf_event *event)
1467{
1468 if (!event->pmu->start)
1469 return event->pmu->enable(event);
1470
1471 return event->pmu->start(event);
1472}
1330 1473
1331static void perf_adjust_period(struct perf_event *event, u64 events) 1474static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1332{ 1475{
1333 struct hw_perf_event *hwc = &event->hw; 1476 struct hw_perf_event *hwc = &event->hw;
1334 u64 period, sample_period; 1477 u64 period, sample_period;
1335 s64 delta; 1478 s64 delta;
1336 1479
1337 events *= hwc->sample_period; 1480 period = perf_calculate_period(event, nsec, count);
1338 period = div64_u64(events, event->attr.sample_freq);
1339 1481
1340 delta = (s64)(period - hwc->sample_period); 1482 delta = (s64)(period - hwc->sample_period);
1341 delta = (delta + 7) / 8; /* low pass filter */ 1483 delta = (delta + 7) / 8; /* low pass filter */
@@ -1346,19 +1488,31 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
1346 sample_period = 1; 1488 sample_period = 1;
1347 1489
1348 hwc->sample_period = sample_period; 1490 hwc->sample_period = sample_period;
1491
1492 if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1493 perf_disable();
1494 perf_event_stop(event);
1495 atomic64_set(&hwc->period_left, 0);
1496 perf_event_start(event);
1497 perf_enable();
1498 }
1349} 1499}
1350 1500
1351static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1501static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1352{ 1502{
1353 struct perf_event *event; 1503 struct perf_event *event;
1354 struct hw_perf_event *hwc; 1504 struct hw_perf_event *hwc;
1355 u64 interrupts, freq; 1505 u64 interrupts, now;
1506 s64 delta;
1356 1507
1357 spin_lock(&ctx->lock); 1508 raw_spin_lock(&ctx->lock);
1358 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1509 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1359 if (event->state != PERF_EVENT_STATE_ACTIVE) 1510 if (event->state != PERF_EVENT_STATE_ACTIVE)
1360 continue; 1511 continue;
1361 1512
1513 if (event->cpu != -1 && event->cpu != smp_processor_id())
1514 continue;
1515
1362 hwc = &event->hw; 1516 hwc = &event->hw;
1363 1517
1364 interrupts = hwc->interrupts; 1518 interrupts = hwc->interrupts;
@@ -1369,47 +1523,25 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1369 */ 1523 */
1370 if (interrupts == MAX_INTERRUPTS) { 1524 if (interrupts == MAX_INTERRUPTS) {
1371 perf_log_throttle(event, 1); 1525 perf_log_throttle(event, 1);
1526 perf_disable();
1372 event->pmu->unthrottle(event); 1527 event->pmu->unthrottle(event);
1373 interrupts = 2*sysctl_perf_event_sample_rate/HZ; 1528 perf_enable();
1374 } 1529 }
1375 1530
1376 if (!event->attr.freq || !event->attr.sample_freq) 1531 if (!event->attr.freq || !event->attr.sample_freq)
1377 continue; 1532 continue;
1378 1533
1379 /* 1534 perf_disable();
1380 * if the specified freq < HZ then we need to skip ticks 1535 event->pmu->read(event);
1381 */ 1536 now = atomic64_read(&event->count);
1382 if (event->attr.sample_freq < HZ) { 1537 delta = now - hwc->freq_count_stamp;
1383 freq = event->attr.sample_freq; 1538 hwc->freq_count_stamp = now;
1384
1385 hwc->freq_count += freq;
1386 hwc->freq_interrupts += interrupts;
1387
1388 if (hwc->freq_count < HZ)
1389 continue;
1390
1391 interrupts = hwc->freq_interrupts;
1392 hwc->freq_interrupts = 0;
1393 hwc->freq_count -= HZ;
1394 } else
1395 freq = HZ;
1396
1397 perf_adjust_period(event, freq * interrupts);
1398 1539
1399 /* 1540 if (delta > 0)
1400 * In order to avoid being stalled by an (accidental) huge 1541 perf_adjust_period(event, TICK_NSEC, delta);
1401 * sample period, force reset the sample period if we didn't 1542 perf_enable();
1402 * get any events in this freq period.
1403 */
1404 if (!interrupts) {
1405 perf_disable();
1406 event->pmu->disable(event);
1407 atomic64_set(&hwc->period_left, 0);
1408 event->pmu->enable(event);
1409 perf_enable();
1410 }
1411 } 1543 }
1412 spin_unlock(&ctx->lock); 1544 raw_spin_unlock(&ctx->lock);
1413} 1545}
1414 1546
1415/* 1547/*
@@ -1417,51 +1549,67 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1417 */ 1549 */
1418static void rotate_ctx(struct perf_event_context *ctx) 1550static void rotate_ctx(struct perf_event_context *ctx)
1419{ 1551{
1420 struct perf_event *event; 1552 raw_spin_lock(&ctx->lock);
1421 1553
1422 if (!ctx->nr_events) 1554 /* Rotate the first entry last of non-pinned groups */
1423 return; 1555 list_rotate_left(&ctx->flexible_groups);
1424
1425 spin_lock(&ctx->lock);
1426 /*
1427 * Rotate the first entry last (works just fine for group events too):
1428 */
1429 perf_disable();
1430 list_for_each_entry(event, &ctx->group_list, group_entry) {
1431 list_move_tail(&event->group_entry, &ctx->group_list);
1432 break;
1433 }
1434 perf_enable();
1435 1556
1436 spin_unlock(&ctx->lock); 1557 raw_spin_unlock(&ctx->lock);
1437} 1558}
1438 1559
1439void perf_event_task_tick(struct task_struct *curr, int cpu) 1560void perf_event_task_tick(struct task_struct *curr)
1440{ 1561{
1441 struct perf_cpu_context *cpuctx; 1562 struct perf_cpu_context *cpuctx;
1442 struct perf_event_context *ctx; 1563 struct perf_event_context *ctx;
1564 int rotate = 0;
1443 1565
1444 if (!atomic_read(&nr_events)) 1566 if (!atomic_read(&nr_events))
1445 return; 1567 return;
1446 1568
1447 cpuctx = &per_cpu(perf_cpu_context, cpu); 1569 cpuctx = &__get_cpu_var(perf_cpu_context);
1570 if (cpuctx->ctx.nr_events &&
1571 cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
1572 rotate = 1;
1573
1448 ctx = curr->perf_event_ctxp; 1574 ctx = curr->perf_event_ctxp;
1575 if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
1576 rotate = 1;
1449 1577
1450 perf_ctx_adjust_freq(&cpuctx->ctx); 1578 perf_ctx_adjust_freq(&cpuctx->ctx);
1451 if (ctx) 1579 if (ctx)
1452 perf_ctx_adjust_freq(ctx); 1580 perf_ctx_adjust_freq(ctx);
1453 1581
1454 perf_event_cpu_sched_out(cpuctx); 1582 if (!rotate)
1583 return;
1584
1585 perf_disable();
1586 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1455 if (ctx) 1587 if (ctx)
1456 __perf_event_task_sched_out(ctx); 1588 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1457 1589
1458 rotate_ctx(&cpuctx->ctx); 1590 rotate_ctx(&cpuctx->ctx);
1459 if (ctx) 1591 if (ctx)
1460 rotate_ctx(ctx); 1592 rotate_ctx(ctx);
1461 1593
1462 perf_event_cpu_sched_in(cpuctx, cpu); 1594 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1463 if (ctx) 1595 if (ctx)
1464 perf_event_task_sched_in(curr, cpu); 1596 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1597 perf_enable();
1598}
1599
1600static int event_enable_on_exec(struct perf_event *event,
1601 struct perf_event_context *ctx)
1602{
1603 if (!event->attr.enable_on_exec)
1604 return 0;
1605
1606 event->attr.enable_on_exec = 0;
1607 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1608 return 0;
1609
1610 __perf_event_mark_enabled(event, ctx);
1611
1612 return 1;
1465} 1613}
1466 1614
1467/* 1615/*
@@ -1474,6 +1622,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1474 struct perf_event *event; 1622 struct perf_event *event;
1475 unsigned long flags; 1623 unsigned long flags;
1476 int enabled = 0; 1624 int enabled = 0;
1625 int ret;
1477 1626
1478 local_irq_save(flags); 1627 local_irq_save(flags);
1479 ctx = task->perf_event_ctxp; 1628 ctx = task->perf_event_ctxp;
@@ -1482,16 +1631,18 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1482 1631
1483 __perf_event_task_sched_out(ctx); 1632 __perf_event_task_sched_out(ctx);
1484 1633
1485 spin_lock(&ctx->lock); 1634 raw_spin_lock(&ctx->lock);
1486 1635
1487 list_for_each_entry(event, &ctx->group_list, group_entry) { 1636 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1488 if (!event->attr.enable_on_exec) 1637 ret = event_enable_on_exec(event, ctx);
1489 continue; 1638 if (ret)
1490 event->attr.enable_on_exec = 0; 1639 enabled = 1;
1491 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1640 }
1492 continue; 1641
1493 __perf_event_mark_enabled(event, ctx); 1642 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1494 enabled = 1; 1643 ret = event_enable_on_exec(event, ctx);
1644 if (ret)
1645 enabled = 1;
1495 } 1646 }
1496 1647
1497 /* 1648 /*
@@ -1500,9 +1651,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1500 if (enabled) 1651 if (enabled)
1501 unclone_ctx(ctx); 1652 unclone_ctx(ctx);
1502 1653
1503 spin_unlock(&ctx->lock); 1654 raw_spin_unlock(&ctx->lock);
1504 1655
1505 perf_event_task_sched_in(task, smp_processor_id()); 1656 perf_event_task_sched_in(task);
1506 out: 1657 out:
1507 local_irq_restore(flags); 1658 local_irq_restore(flags);
1508} 1659}
@@ -1515,7 +1666,6 @@ static void __perf_event_read(void *info)
1515 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1666 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1516 struct perf_event *event = info; 1667 struct perf_event *event = info;
1517 struct perf_event_context *ctx = event->ctx; 1668 struct perf_event_context *ctx = event->ctx;
1518 unsigned long flags;
1519 1669
1520 /* 1670 /*
1521 * If this is a task context, we need to check whether it is 1671 * If this is a task context, we need to check whether it is
@@ -1527,12 +1677,12 @@ static void __perf_event_read(void *info)
1527 if (ctx->task && cpuctx->task_ctx != ctx) 1677 if (ctx->task && cpuctx->task_ctx != ctx)
1528 return; 1678 return;
1529 1679
1530 local_irq_save(flags); 1680 raw_spin_lock(&ctx->lock);
1531 if (ctx->is_active) 1681 update_context_time(ctx);
1532 update_context_time(ctx);
1533 event->pmu->read(event);
1534 update_event_times(event); 1682 update_event_times(event);
1535 local_irq_restore(flags); 1683 raw_spin_unlock(&ctx->lock);
1684
1685 event->pmu->read(event);
1536} 1686}
1537 1687
1538static u64 perf_event_read(struct perf_event *event) 1688static u64 perf_event_read(struct perf_event *event)
@@ -1545,7 +1695,13 @@ static u64 perf_event_read(struct perf_event *event)
1545 smp_call_function_single(event->oncpu, 1695 smp_call_function_single(event->oncpu,
1546 __perf_event_read, event, 1); 1696 __perf_event_read, event, 1);
1547 } else if (event->state == PERF_EVENT_STATE_INACTIVE) { 1697 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1698 struct perf_event_context *ctx = event->ctx;
1699 unsigned long flags;
1700
1701 raw_spin_lock_irqsave(&ctx->lock, flags);
1702 update_context_time(ctx);
1548 update_event_times(event); 1703 update_event_times(event);
1704 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1549 } 1705 }
1550 1706
1551 return atomic64_read(&event->count); 1707 return atomic64_read(&event->count);
@@ -1558,10 +1714,10 @@ static void
1558__perf_event_init_context(struct perf_event_context *ctx, 1714__perf_event_init_context(struct perf_event_context *ctx,
1559 struct task_struct *task) 1715 struct task_struct *task)
1560{ 1716{
1561 memset(ctx, 0, sizeof(*ctx)); 1717 raw_spin_lock_init(&ctx->lock);
1562 spin_lock_init(&ctx->lock);
1563 mutex_init(&ctx->mutex); 1718 mutex_init(&ctx->mutex);
1564 INIT_LIST_HEAD(&ctx->group_list); 1719 INIT_LIST_HEAD(&ctx->pinned_groups);
1720 INIT_LIST_HEAD(&ctx->flexible_groups);
1565 INIT_LIST_HEAD(&ctx->event_list); 1721 INIT_LIST_HEAD(&ctx->event_list);
1566 atomic_set(&ctx->refcount, 1); 1722 atomic_set(&ctx->refcount, 1);
1567 ctx->task = task; 1723 ctx->task = task;
@@ -1575,15 +1731,12 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1575 unsigned long flags; 1731 unsigned long flags;
1576 int err; 1732 int err;
1577 1733
1578 /* 1734 if (pid == -1 && cpu != -1) {
1579 * If cpu is not a wildcard then this is a percpu event:
1580 */
1581 if (cpu != -1) {
1582 /* Must be root to operate on a CPU event: */ 1735 /* Must be root to operate on a CPU event: */
1583 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 1736 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1584 return ERR_PTR(-EACCES); 1737 return ERR_PTR(-EACCES);
1585 1738
1586 if (cpu < 0 || cpu > num_possible_cpus()) 1739 if (cpu < 0 || cpu >= nr_cpumask_bits)
1587 return ERR_PTR(-EINVAL); 1740 return ERR_PTR(-EINVAL);
1588 1741
1589 /* 1742 /*
@@ -1591,7 +1744,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1591 * offline CPU and activate it when the CPU comes up, but 1744 * offline CPU and activate it when the CPU comes up, but
1592 * that's for later. 1745 * that's for later.
1593 */ 1746 */
1594 if (!cpu_isset(cpu, cpu_online_map)) 1747 if (!cpu_online(cpu))
1595 return ERR_PTR(-ENODEV); 1748 return ERR_PTR(-ENODEV);
1596 1749
1597 cpuctx = &per_cpu(perf_cpu_context, cpu); 1750 cpuctx = &per_cpu(perf_cpu_context, cpu);
@@ -1629,11 +1782,11 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1629 ctx = perf_lock_task_context(task, &flags); 1782 ctx = perf_lock_task_context(task, &flags);
1630 if (ctx) { 1783 if (ctx) {
1631 unclone_ctx(ctx); 1784 unclone_ctx(ctx);
1632 spin_unlock_irqrestore(&ctx->lock, flags); 1785 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1633 } 1786 }
1634 1787
1635 if (!ctx) { 1788 if (!ctx) {
1636 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); 1789 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1637 err = -ENOMEM; 1790 err = -ENOMEM;
1638 if (!ctx) 1791 if (!ctx)
1639 goto errout; 1792 goto errout;
@@ -1658,6 +1811,8 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1658 return ERR_PTR(err); 1811 return ERR_PTR(err);
1659} 1812}
1660 1813
1814static void perf_event_free_filter(struct perf_event *event);
1815
1661static void free_event_rcu(struct rcu_head *head) 1816static void free_event_rcu(struct rcu_head *head)
1662{ 1817{
1663 struct perf_event *event; 1818 struct perf_event *event;
@@ -1665,6 +1820,7 @@ static void free_event_rcu(struct rcu_head *head)
1665 event = container_of(head, struct perf_event, rcu_head); 1820 event = container_of(head, struct perf_event, rcu_head);
1666 if (event->ns) 1821 if (event->ns)
1667 put_pid_ns(event->ns); 1822 put_pid_ns(event->ns);
1823 perf_event_free_filter(event);
1668 kfree(event); 1824 kfree(event);
1669} 1825}
1670 1826
@@ -1696,16 +1852,10 @@ static void free_event(struct perf_event *event)
1696 call_rcu(&event->rcu_head, free_event_rcu); 1852 call_rcu(&event->rcu_head, free_event_rcu);
1697} 1853}
1698 1854
1699/* 1855int perf_event_release_kernel(struct perf_event *event)
1700 * Called when the last reference to the file is gone.
1701 */
1702static int perf_release(struct inode *inode, struct file *file)
1703{ 1856{
1704 struct perf_event *event = file->private_data;
1705 struct perf_event_context *ctx = event->ctx; 1857 struct perf_event_context *ctx = event->ctx;
1706 1858
1707 file->private_data = NULL;
1708
1709 WARN_ON_ONCE(ctx->parent_ctx); 1859 WARN_ON_ONCE(ctx->parent_ctx);
1710 mutex_lock(&ctx->mutex); 1860 mutex_lock(&ctx->mutex);
1711 perf_event_remove_from_context(event); 1861 perf_event_remove_from_context(event);
@@ -1720,6 +1870,19 @@ static int perf_release(struct inode *inode, struct file *file)
1720 1870
1721 return 0; 1871 return 0;
1722} 1872}
1873EXPORT_SYMBOL_GPL(perf_event_release_kernel);
1874
1875/*
1876 * Called when the last reference to the file is gone.
1877 */
1878static int perf_release(struct inode *inode, struct file *file)
1879{
1880 struct perf_event *event = file->private_data;
1881
1882 file->private_data = NULL;
1883
1884 return perf_event_release_kernel(event);
1885}
1723 1886
1724static int perf_event_read_size(struct perf_event *event) 1887static int perf_event_read_size(struct perf_event *event)
1725{ 1888{
@@ -1746,91 +1909,94 @@ static int perf_event_read_size(struct perf_event *event)
1746 return size; 1909 return size;
1747} 1910}
1748 1911
1749static u64 perf_event_read_value(struct perf_event *event) 1912u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
1750{ 1913{
1751 struct perf_event *child; 1914 struct perf_event *child;
1752 u64 total = 0; 1915 u64 total = 0;
1753 1916
1917 *enabled = 0;
1918 *running = 0;
1919
1920 mutex_lock(&event->child_mutex);
1754 total += perf_event_read(event); 1921 total += perf_event_read(event);
1755 list_for_each_entry(child, &event->child_list, child_list) 1922 *enabled += event->total_time_enabled +
1923 atomic64_read(&event->child_total_time_enabled);
1924 *running += event->total_time_running +
1925 atomic64_read(&event->child_total_time_running);
1926
1927 list_for_each_entry(child, &event->child_list, child_list) {
1756 total += perf_event_read(child); 1928 total += perf_event_read(child);
1929 *enabled += child->total_time_enabled;
1930 *running += child->total_time_running;
1931 }
1932 mutex_unlock(&event->child_mutex);
1757 1933
1758 return total; 1934 return total;
1759} 1935}
1760 1936EXPORT_SYMBOL_GPL(perf_event_read_value);
1761static int perf_event_read_entry(struct perf_event *event,
1762 u64 read_format, char __user *buf)
1763{
1764 int n = 0, count = 0;
1765 u64 values[2];
1766
1767 values[n++] = perf_event_read_value(event);
1768 if (read_format & PERF_FORMAT_ID)
1769 values[n++] = primary_event_id(event);
1770
1771 count = n * sizeof(u64);
1772
1773 if (copy_to_user(buf, values, count))
1774 return -EFAULT;
1775
1776 return count;
1777}
1778 1937
1779static int perf_event_read_group(struct perf_event *event, 1938static int perf_event_read_group(struct perf_event *event,
1780 u64 read_format, char __user *buf) 1939 u64 read_format, char __user *buf)
1781{ 1940{
1782 struct perf_event *leader = event->group_leader, *sub; 1941 struct perf_event *leader = event->group_leader, *sub;
1783 int n = 0, size = 0, err = -EFAULT; 1942 int n = 0, size = 0, ret = -EFAULT;
1784 u64 values[3]; 1943 struct perf_event_context *ctx = leader->ctx;
1944 u64 values[5];
1945 u64 count, enabled, running;
1946
1947 mutex_lock(&ctx->mutex);
1948 count = perf_event_read_value(leader, &enabled, &running);
1785 1949
1786 values[n++] = 1 + leader->nr_siblings; 1950 values[n++] = 1 + leader->nr_siblings;
1787 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1951 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1788 values[n++] = leader->total_time_enabled + 1952 values[n++] = enabled;
1789 atomic64_read(&leader->child_total_time_enabled); 1953 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1790 } 1954 values[n++] = running;
1791 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { 1955 values[n++] = count;
1792 values[n++] = leader->total_time_running + 1956 if (read_format & PERF_FORMAT_ID)
1793 atomic64_read(&leader->child_total_time_running); 1957 values[n++] = primary_event_id(leader);
1794 }
1795 1958
1796 size = n * sizeof(u64); 1959 size = n * sizeof(u64);
1797 1960
1798 if (copy_to_user(buf, values, size)) 1961 if (copy_to_user(buf, values, size))
1799 return -EFAULT; 1962 goto unlock;
1800
1801 err = perf_event_read_entry(leader, read_format, buf + size);
1802 if (err < 0)
1803 return err;
1804 1963
1805 size += err; 1964 ret = size;
1806 1965
1807 list_for_each_entry(sub, &leader->sibling_list, group_entry) { 1966 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1808 err = perf_event_read_entry(sub, read_format, 1967 n = 0;
1809 buf + size); 1968
1810 if (err < 0) 1969 values[n++] = perf_event_read_value(sub, &enabled, &running);
1811 return err; 1970 if (read_format & PERF_FORMAT_ID)
1971 values[n++] = primary_event_id(sub);
1812 1972
1813 size += err; 1973 size = n * sizeof(u64);
1974
1975 if (copy_to_user(buf + ret, values, size)) {
1976 ret = -EFAULT;
1977 goto unlock;
1978 }
1979
1980 ret += size;
1814 } 1981 }
1982unlock:
1983 mutex_unlock(&ctx->mutex);
1815 1984
1816 return size; 1985 return ret;
1817} 1986}
1818 1987
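For reference, this is what the PERF_FORMAT_GROUP layout produced by perf_event_read_group() looks like from userspace: a header of nr, time_enabled and time_running, followed by one { value, id } pair per group member, leader first. A minimal sketch, assuming this kernel's linux/perf_event.h and the usual raw syscall wrapper (glibc has no perf_event_open wrapper); the counter choice is illustrative only.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static int open_counter(uint64_t config, int group_fd, uint64_t read_format)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_HARDWARE;
        attr.size = sizeof(attr);
        attr.config = config;
        attr.read_format = read_format;
        return perf_event_open(&attr, 0, -1, group_fd, 0); /* this task, any CPU */
}

int main(void)
{
        uint64_t buf[3 + 2 * 2];        /* nr, enabled, running, 2 x { value, id } */
        uint64_t nr, i;
        int leader, sibling;

        leader = open_counter(PERF_COUNT_HW_CPU_CYCLES, -1,
                              PERF_FORMAT_GROUP | PERF_FORMAT_ID |
                              PERF_FORMAT_TOTAL_TIME_ENABLED |
                              PERF_FORMAT_TOTAL_TIME_RUNNING);
        sibling = open_counter(PERF_COUNT_HW_INSTRUCTIONS, leader, 0);
        if (leader < 0 || sibling < 0)
                return 1;

        /* ... run the workload being measured ... */

        if (read(leader, buf, sizeof(buf)) != sizeof(buf))
                return 1;

        nr = buf[0];                    /* 1 + nr_siblings, leader entry first */
        for (i = 0; i < nr; i++)
                printf("id %llu: %llu\n",
                       (unsigned long long)buf[3 + 2 * i + 1],  /* PERF_FORMAT_ID */
                       (unsigned long long)buf[3 + 2 * i]);     /* value */
        return 0;
}

The ctx->mutex taken above is what keeps the sibling list stable while the whole group is walked, so the read comes back as one consistent snapshot.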
1819static int perf_event_read_one(struct perf_event *event, 1988static int perf_event_read_one(struct perf_event *event,
1820 u64 read_format, char __user *buf) 1989 u64 read_format, char __user *buf)
1821{ 1990{
1991 u64 enabled, running;
1822 u64 values[4]; 1992 u64 values[4];
1823 int n = 0; 1993 int n = 0;
1824 1994
1825 values[n++] = perf_event_read_value(event); 1995 values[n++] = perf_event_read_value(event, &enabled, &running);
1826 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { 1996 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1827 values[n++] = event->total_time_enabled + 1997 values[n++] = enabled;
1828 atomic64_read(&event->child_total_time_enabled); 1998 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1829 } 1999 values[n++] = running;
1830 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1831 values[n++] = event->total_time_running +
1832 atomic64_read(&event->child_total_time_running);
1833 }
1834 if (read_format & PERF_FORMAT_ID) 2000 if (read_format & PERF_FORMAT_ID)
1835 values[n++] = primary_event_id(event); 2001 values[n++] = primary_event_id(event);
1836 2002
@@ -1861,12 +2027,10 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1861 return -ENOSPC; 2027 return -ENOSPC;
1862 2028
1863 WARN_ON_ONCE(event->ctx->parent_ctx); 2029 WARN_ON_ONCE(event->ctx->parent_ctx);
1864 mutex_lock(&event->child_mutex);
1865 if (read_format & PERF_FORMAT_GROUP) 2030 if (read_format & PERF_FORMAT_GROUP)
1866 ret = perf_event_read_group(event, read_format, buf); 2031 ret = perf_event_read_group(event, read_format, buf);
1867 else 2032 else
1868 ret = perf_event_read_one(event, read_format, buf); 2033 ret = perf_event_read_one(event, read_format, buf);
1869 mutex_unlock(&event->child_mutex);
1870 2034
1871 return ret; 2035 return ret;
1872} 2036}
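The single-counter path has the same shape: perf_event_read_one() now reports value, time_enabled and time_running straight from perf_event_read_value(), which is what lets userspace scale a counter that was multiplexed off the PMU. A minimal sketch under the same assumptions as the previous example.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        uint64_t buf[3];                /* value, time_enabled, time_running */
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_HARDWARE;
        attr.size = sizeof(attr);
        attr.config = PERF_COUNT_HW_CPU_CYCLES;
        attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
                           PERF_FORMAT_TOTAL_TIME_RUNNING;

        fd = perf_event_open(&attr, 0, -1, -1, 0);      /* this task, any CPU */
        if (fd < 0)
                return 1;

        /* ... run the workload being measured ... */

        if (read(fd, buf, sizeof(buf)) != sizeof(buf))
                return 1;

        /* Scale for multiplexing: estimate = count * enabled / running. */
        if (buf[2])
                printf("cycles: %llu (scaled)\n",
                       (unsigned long long)(buf[0] * (double)buf[1] / buf[2]));
        close(fd);
        return 0;
}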
@@ -1956,7 +2120,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
1956 if (!value) 2120 if (!value)
1957 return -EINVAL; 2121 return -EINVAL;
1958 2122
1959 spin_lock_irq(&ctx->lock); 2123 raw_spin_lock_irq(&ctx->lock);
1960 if (event->attr.freq) { 2124 if (event->attr.freq) {
1961 if (value > sysctl_perf_event_sample_rate) { 2125 if (value > sysctl_perf_event_sample_rate) {
1962 ret = -EINVAL; 2126 ret = -EINVAL;
@@ -1969,12 +2133,13 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
1969 event->hw.sample_period = value; 2133 event->hw.sample_period = value;
1970 } 2134 }
1971unlock: 2135unlock:
1972 spin_unlock_irq(&ctx->lock); 2136 raw_spin_unlock_irq(&ctx->lock);
1973 2137
1974 return ret; 2138 return ret;
1975} 2139}
1976 2140
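perf_event_period() above is reached through the PERF_EVENT_IOC_PERIOD ioctl, which takes a pointer to the new 64-bit value. A small hedged sketch; fd is assumed to be an already-open perf event descriptor.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* Update the sample period of an open event.  Per the code above, 0 is
 * rejected, and for attr.freq events the value is treated as a frequency
 * and capped by sysctl_perf_event_sample_rate. */
static int set_sample_period(int fd, uint64_t period)
{
        return ioctl(fd, PERF_EVENT_IOC_PERIOD, &period);
}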
1977int perf_event_set_output(struct perf_event *event, int output_fd); 2141static int perf_event_set_output(struct perf_event *event, int output_fd);
2142static int perf_event_set_filter(struct perf_event *event, void __user *arg);
1978 2143
1979static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2144static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1980{ 2145{
@@ -2002,6 +2167,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2002 case PERF_EVENT_IOC_SET_OUTPUT: 2167 case PERF_EVENT_IOC_SET_OUTPUT:
2003 return perf_event_set_output(event, arg); 2168 return perf_event_set_output(event, arg);
2004 2169
2170 case PERF_EVENT_IOC_SET_FILTER:
2171 return perf_event_set_filter(event, (void __user *)arg);
2172
2005 default: 2173 default:
2006 return -ENOTTY; 2174 return -ENOTTY;
2007 } 2175 }
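The new PERF_EVENT_IOC_SET_FILTER command takes a filter string in the ftrace event-filter syntax; perf_event_set_filter(), defined further down, only accepts it on PERF_TYPE_TRACEPOINT events and rejects everything else. A sketch, assuming fd is an open tracepoint event such as kmem:kmalloc.

#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int set_event_filter(int fd, const char *filter)
{
        /* e.g. filter = "bytes_req > 256" on a kmem:kmalloc event */
        return ioctl(fd, PERF_EVENT_IOC_SET_FILTER, filter);
}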
@@ -2174,6 +2342,7 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2174 perf_mmap_free_page((unsigned long)data->user_page); 2342 perf_mmap_free_page((unsigned long)data->user_page);
2175 for (i = 0; i < data->nr_pages; i++) 2343 for (i = 0; i < data->nr_pages; i++)
2176 perf_mmap_free_page((unsigned long)data->data_pages[i]); 2344 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2345 kfree(data);
2177} 2346}
2178 2347
2179#else 2348#else
@@ -2214,6 +2383,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
2214 perf_mmap_unmark_page(base + (i * PAGE_SIZE)); 2383 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
2215 2384
2216 vfree(base); 2385 vfree(base);
2386 kfree(data);
2217} 2387}
2218 2388
2219static void perf_mmap_data_free(struct perf_mmap_data *data) 2389static void perf_mmap_data_free(struct perf_mmap_data *data)
@@ -2307,7 +2477,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2307 } 2477 }
2308 2478
2309 if (!data->watermark) 2479 if (!data->watermark)
2310 data->watermark = max_t(long, PAGE_SIZE, max_size / 2); 2480 data->watermark = max_size / 2;
2311 2481
2312 2482
2313 rcu_assign_pointer(event->data, data); 2483 rcu_assign_pointer(event->data, data);
@@ -2319,7 +2489,6 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2319 2489
2320 data = container_of(rcu_head, struct perf_mmap_data, rcu_head); 2490 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2321 perf_mmap_data_free(data); 2491 perf_mmap_data_free(data);
2322 kfree(data);
2323} 2492}
2324 2493
2325static void perf_mmap_data_release(struct perf_event *event) 2494static void perf_mmap_data_release(struct perf_event *event)
@@ -2420,7 +2589,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2420 if (user_locked > user_lock_limit) 2589 if (user_locked > user_lock_limit)
2421 extra = user_locked - user_lock_limit; 2590 extra = user_locked - user_lock_limit;
2422 2591
2423 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 2592 lock_limit = rlimit(RLIMIT_MEMLOCK);
2424 lock_limit >>= PAGE_SHIFT; 2593 lock_limit >>= PAGE_SHIFT;
2425 locked = vma->vm_mm->locked_vm + extra; 2594 locked = vma->vm_mm->locked_vm + extra;
2426 2595
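Seen from the other side of this accounting: userspace maps one control page plus a power-of-two number of data pages, and whatever exceeds the per-user perf_event_mlock_kb budget is charged against RLIMIT_MEMLOCK, which rlimit() now reads. A minimal sketch with error handling trimmed; the page count is the caller's choice.

#include <unistd.h>
#include <sys/mman.h>
#include <linux/perf_event.h>

static struct perf_event_mmap_page *map_ring(int fd, unsigned int data_pages)
{
        /* data_pages must be a power of two; the +1 is the control page */
        size_t len = (size_t)(1 + data_pages) * sysconf(_SC_PAGESIZE);
        void *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

        return base == MAP_FAILED ? NULL : base;
}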
@@ -2616,6 +2785,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2616 return NULL; 2785 return NULL;
2617} 2786}
2618 2787
2788__weak
2789void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2790{
2791}
2792
2793
2619/* 2794/*
2620 * Output 2795 * Output
2621 */ 2796 */
@@ -2666,20 +2841,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2666static void perf_output_lock(struct perf_output_handle *handle) 2841static void perf_output_lock(struct perf_output_handle *handle)
2667{ 2842{
2668 struct perf_mmap_data *data = handle->data; 2843 struct perf_mmap_data *data = handle->data;
2669 int cpu; 2844 int cur, cpu = get_cpu();
2670 2845
2671 handle->locked = 0; 2846 handle->locked = 0;
2672 2847
2673 local_irq_save(handle->flags); 2848 for (;;) {
2674 cpu = smp_processor_id(); 2849 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2675 2850 if (cur == -1) {
2676 if (in_nmi() && atomic_read(&data->lock) == cpu) 2851 handle->locked = 1;
2677 return; 2852 break;
2853 }
2854 if (cur == cpu)
2855 break;
2678 2856
2679 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2680 cpu_relax(); 2857 cpu_relax();
2681 2858 }
2682 handle->locked = 1;
2683} 2859}
2684 2860
2685static void perf_output_unlock(struct perf_output_handle *handle) 2861static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2725,7 +2901,7 @@ again:
2725 if (atomic_xchg(&data->wakeup, 0)) 2901 if (atomic_xchg(&data->wakeup, 0))
2726 perf_output_wakeup(handle); 2902 perf_output_wakeup(handle);
2727out: 2903out:
2728 local_irq_restore(handle->flags); 2904 put_cpu();
2729} 2905}
2730 2906
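A userspace analogue of the ownership scheme the rewritten perf_output_lock() uses (an illustration only, not kernel code): the lock word holds -1 when free or the owner's id when held, and re-entry by the current owner (the NMI hitting the lock holder) falls through without taking ownership, so only the outermost acquisition releases in the unlock path.

#include <stdio.h>
#include <stdatomic.h>

static atomic_int lock_owner = -1;      /* -1 means free, else owner id */

/* Returns 1 if this call took ownership and must unlock later. */
static int output_lock(int self)
{
        int cur;

        for (;;) {
                cur = -1;
                if (atomic_compare_exchange_strong(&lock_owner, &cur, self))
                        return 1;       /* was free: we own it now */
                if (cur == self)
                        return 0;       /* nested on the owner: pass through */
                /* someone else owns it: spin */
        }
}

static void output_unlock(int self, int locked)
{
        if (locked)
                atomic_store(&lock_owner, -1);
}

int main(void)
{
        int outer = output_lock(0);     /* takes the lock */
        int inner = output_lock(0);     /* nested entry passes straight through */

        printf("outer owns: %d, inner owns: %d\n", outer, inner);
        output_unlock(0, inner);        /* no-op */
        output_unlock(0, outer);        /* releases */
        return 0;
}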
2731void perf_output_copy(struct perf_output_handle *handle, 2907void perf_output_copy(struct perf_output_handle *handle,
@@ -3200,15 +3376,23 @@ static void perf_event_task_output(struct perf_event *event,
3200 struct perf_task_event *task_event) 3376 struct perf_task_event *task_event)
3201{ 3377{
3202 struct perf_output_handle handle; 3378 struct perf_output_handle handle;
3203 int size;
3204 struct task_struct *task = task_event->task; 3379 struct task_struct *task = task_event->task;
3205 int ret; 3380 unsigned long flags;
3381 int size, ret;
3382
3383 /*
3384 * If this CPU attempts to acquire an rq lock held by a CPU spinning
3385 * in perf_output_lock() from interrupt context, it's game over.
3386 */
3387 local_irq_save(flags);
3206 3388
3207 size = task_event->event_id.header.size; 3389 size = task_event->event_id.header.size;
3208 ret = perf_output_begin(&handle, event, size, 0, 0); 3390 ret = perf_output_begin(&handle, event, size, 0, 0);
3209 3391
3210 if (ret) 3392 if (ret) {
3393 local_irq_restore(flags);
3211 return; 3394 return;
3395 }
3212 3396
3213 task_event->event_id.pid = perf_event_pid(event, task); 3397 task_event->event_id.pid = perf_event_pid(event, task);
3214 task_event->event_id.ppid = perf_event_pid(event, current); 3398 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3216,15 +3400,20 @@ static void perf_event_task_output(struct perf_event *event,
3216 task_event->event_id.tid = perf_event_tid(event, task); 3400 task_event->event_id.tid = perf_event_tid(event, task);
3217 task_event->event_id.ptid = perf_event_tid(event, current); 3401 task_event->event_id.ptid = perf_event_tid(event, current);
3218 3402
3219 task_event->event_id.time = perf_clock();
3220
3221 perf_output_put(&handle, task_event->event_id); 3403 perf_output_put(&handle, task_event->event_id);
3222 3404
3223 perf_output_end(&handle); 3405 perf_output_end(&handle);
3406 local_irq_restore(flags);
3224} 3407}
3225 3408
3226static int perf_event_task_match(struct perf_event *event) 3409static int perf_event_task_match(struct perf_event *event)
3227{ 3410{
3411 if (event->state < PERF_EVENT_STATE_INACTIVE)
3412 return 0;
3413
3414 if (event->cpu != -1 && event->cpu != smp_processor_id())
3415 return 0;
3416
3228 if (event->attr.comm || event->attr.mmap || event->attr.task) 3417 if (event->attr.comm || event->attr.mmap || event->attr.task)
3229 return 1; 3418 return 1;
3230 3419
@@ -3236,15 +3425,10 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
3236{ 3425{
3237 struct perf_event *event; 3426 struct perf_event *event;
3238 3427
3239 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3240 return;
3241
3242 rcu_read_lock();
3243 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3428 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3244 if (perf_event_task_match(event)) 3429 if (perf_event_task_match(event))
3245 perf_event_task_output(event, task_event); 3430 perf_event_task_output(event, task_event);
3246 } 3431 }
3247 rcu_read_unlock();
3248} 3432}
3249 3433
3250static void perf_event_task_event(struct perf_task_event *task_event) 3434static void perf_event_task_event(struct perf_task_event *task_event)
@@ -3252,15 +3436,14 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3252 struct perf_cpu_context *cpuctx; 3436 struct perf_cpu_context *cpuctx;
3253 struct perf_event_context *ctx = task_event->task_ctx; 3437 struct perf_event_context *ctx = task_event->task_ctx;
3254 3438
3439 rcu_read_lock();
3255 cpuctx = &get_cpu_var(perf_cpu_context); 3440 cpuctx = &get_cpu_var(perf_cpu_context);
3256 perf_event_task_ctx(&cpuctx->ctx, task_event); 3441 perf_event_task_ctx(&cpuctx->ctx, task_event);
3257 put_cpu_var(perf_cpu_context);
3258
3259 rcu_read_lock();
3260 if (!ctx) 3442 if (!ctx)
3261 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3443 ctx = rcu_dereference(current->perf_event_ctxp);
3262 if (ctx) 3444 if (ctx)
3263 perf_event_task_ctx(ctx, task_event); 3445 perf_event_task_ctx(ctx, task_event);
3446 put_cpu_var(perf_cpu_context);
3264 rcu_read_unlock(); 3447 rcu_read_unlock();
3265} 3448}
3266 3449
@@ -3288,6 +3471,7 @@ static void perf_event_task(struct task_struct *task,
3288 /* .ppid */ 3471 /* .ppid */
3289 /* .tid */ 3472 /* .tid */
3290 /* .ptid */ 3473 /* .ptid */
3474 .time = perf_clock(),
3291 }, 3475 },
3292 }; 3476 };
3293 3477
@@ -3337,6 +3521,12 @@ static void perf_event_comm_output(struct perf_event *event,
3337 3521
3338static int perf_event_comm_match(struct perf_event *event) 3522static int perf_event_comm_match(struct perf_event *event)
3339{ 3523{
3524 if (event->state < PERF_EVENT_STATE_INACTIVE)
3525 return 0;
3526
3527 if (event->cpu != -1 && event->cpu != smp_processor_id())
3528 return 0;
3529
3340 if (event->attr.comm) 3530 if (event->attr.comm)
3341 return 1; 3531 return 1;
3342 3532
@@ -3348,15 +3538,10 @@ static void perf_event_comm_ctx(struct perf_event_context *ctx,
3348{ 3538{
3349 struct perf_event *event; 3539 struct perf_event *event;
3350 3540
3351 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3352 return;
3353
3354 rcu_read_lock();
3355 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3541 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3356 if (perf_event_comm_match(event)) 3542 if (perf_event_comm_match(event))
3357 perf_event_comm_output(event, comm_event); 3543 perf_event_comm_output(event, comm_event);
3358 } 3544 }
3359 rcu_read_unlock();
3360} 3545}
3361 3546
3362static void perf_event_comm_event(struct perf_comm_event *comm_event) 3547static void perf_event_comm_event(struct perf_comm_event *comm_event)
@@ -3367,7 +3552,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3367 char comm[TASK_COMM_LEN]; 3552 char comm[TASK_COMM_LEN];
3368 3553
3369 memset(comm, 0, sizeof(comm)); 3554 memset(comm, 0, sizeof(comm));
3370 strncpy(comm, comm_event->task->comm, sizeof(comm)); 3555 strlcpy(comm, comm_event->task->comm, sizeof(comm));
3371 size = ALIGN(strlen(comm)+1, sizeof(u64)); 3556 size = ALIGN(strlen(comm)+1, sizeof(u64));
3372 3557
3373 comm_event->comm = comm; 3558 comm_event->comm = comm;
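The strncpy() to strlcpy() switch matters because strncpy() leaves the destination unterminated when the source fills it, and the strlen(comm) on the next line would then read past the buffer; strlcpy() always terminates. An illustrative userspace demo (strlcpy is not in glibc, so a local equivalent is used, and TASK_COMM_LEN is assumed to be 16).

#include <stdio.h>
#include <string.h>

static size_t my_strlcpy(char *dst, const char *src, size_t size)
{
        size_t len = strlen(src);

        if (size) {
                size_t copy = len >= size ? size - 1 : len;

                memcpy(dst, src, copy);
                dst[copy] = '\0';       /* always terminated */
        }
        return len;
}

int main(void)
{
        char comm[16];                  /* TASK_COMM_LEN */

        strncpy(comm, "a-task-name-longer-than-15-bytes", sizeof(comm));
        /* comm is NOT NUL-terminated here; strlen(comm) would overrun. */

        my_strlcpy(comm, "a-task-name-longer-than-15-bytes", sizeof(comm));
        printf("%zu bytes: \"%s\"\n", strlen(comm), comm);      /* 15 bytes */
        return 0;
}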
@@ -3375,18 +3560,13 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3375 3560
3376 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; 3561 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3377 3562
3563 rcu_read_lock();
3378 cpuctx = &get_cpu_var(perf_cpu_context); 3564 cpuctx = &get_cpu_var(perf_cpu_context);
3379 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3565 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3380 put_cpu_var(perf_cpu_context);
3381
3382 rcu_read_lock();
3383 /*
3384 * doesn't really matter which of the child contexts the
3385 * events ends up in.
3386 */
3387 ctx = rcu_dereference(current->perf_event_ctxp); 3566 ctx = rcu_dereference(current->perf_event_ctxp);
3388 if (ctx) 3567 if (ctx)
3389 perf_event_comm_ctx(ctx, comm_event); 3568 perf_event_comm_ctx(ctx, comm_event);
3569 put_cpu_var(perf_cpu_context);
3390 rcu_read_unlock(); 3570 rcu_read_unlock();
3391} 3571}
3392 3572
@@ -3461,6 +3641,12 @@ static void perf_event_mmap_output(struct perf_event *event,
3461static int perf_event_mmap_match(struct perf_event *event, 3641static int perf_event_mmap_match(struct perf_event *event,
3462 struct perf_mmap_event *mmap_event) 3642 struct perf_mmap_event *mmap_event)
3463{ 3643{
3644 if (event->state < PERF_EVENT_STATE_INACTIVE)
3645 return 0;
3646
3647 if (event->cpu != -1 && event->cpu != smp_processor_id())
3648 return 0;
3649
3464 if (event->attr.mmap) 3650 if (event->attr.mmap)
3465 return 1; 3651 return 1;
3466 3652
@@ -3472,15 +3658,10 @@ static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3472{ 3658{
3473 struct perf_event *event; 3659 struct perf_event *event;
3474 3660
3475 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3476 return;
3477
3478 rcu_read_lock();
3479 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 3661 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3480 if (perf_event_mmap_match(event, mmap_event)) 3662 if (perf_event_mmap_match(event, mmap_event))
3481 perf_event_mmap_output(event, mmap_event); 3663 perf_event_mmap_output(event, mmap_event);
3482 } 3664 }
3483 rcu_read_unlock();
3484} 3665}
3485 3666
3486static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) 3667static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -3536,18 +3717,13 @@ got_name:
3536 3717
3537 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; 3718 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3538 3719
3720 rcu_read_lock();
3539 cpuctx = &get_cpu_var(perf_cpu_context); 3721 cpuctx = &get_cpu_var(perf_cpu_context);
3540 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3722 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3541 put_cpu_var(perf_cpu_context);
3542
3543 rcu_read_lock();
3544 /*
3545 * doesn't really matter which of the child contexts the
3546 * events ends up in.
3547 */
3548 ctx = rcu_dereference(current->perf_event_ctxp); 3723 ctx = rcu_dereference(current->perf_event_ctxp);
3549 if (ctx) 3724 if (ctx)
3550 perf_event_mmap_ctx(ctx, mmap_event); 3725 perf_event_mmap_ctx(ctx, mmap_event);
3726 put_cpu_var(perf_cpu_context);
3551 rcu_read_unlock(); 3727 rcu_read_unlock();
3552 3728
3553 kfree(buf); 3729 kfree(buf);
@@ -3574,7 +3750,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3574 /* .tid */ 3750 /* .tid */
3575 .start = vma->vm_start, 3751 .start = vma->vm_start,
3576 .len = vma->vm_end - vma->vm_start, 3752 .len = vma->vm_end - vma->vm_start,
3577 .pgoff = vma->vm_pgoff, 3753 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
3578 }, 3754 },
3579 }; 3755 };
3580 3756
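The .pgoff change above reports a byte offset instead of a raw page index; vm_pgoff is an unsigned long, so on 32-bit kernels the cast to u64 is what keeps the shift from truncating offsets beyond 4 GiB. A minimal illustration (PAGE_SHIFT assumed to be 12).

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        unsigned long vm_pgoff = 0x300000;      /* page index of a 12 GiB offset */
        unsigned int page_shift = 12;

        printf("wrong:   %llu\n",               /* truncates where long is 32-bit */
               (unsigned long long)(vm_pgoff << page_shift));
        printf("correct: %llu\n",
               (unsigned long long)((uint64_t)vm_pgoff << page_shift));
        return 0;
}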
@@ -3654,12 +3830,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3654 3830
3655 if (event->attr.freq) { 3831 if (event->attr.freq) {
3656 u64 now = perf_clock(); 3832 u64 now = perf_clock();
3657 s64 delta = now - hwc->freq_stamp; 3833 s64 delta = now - hwc->freq_time_stamp;
3658 3834
3659 hwc->freq_stamp = now; 3835 hwc->freq_time_stamp = now;
3660 3836
3661 if (delta > 0 && delta < TICK_NSEC) 3837 if (delta > 0 && delta < 2*TICK_NSEC)
3662 perf_adjust_period(event, NSEC_PER_SEC / (int)delta); 3838 perf_adjust_period(event, delta, hwc->last_period);
3663 } 3839 }
3664 3840
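The idea behind this freq path, as a simplified illustration rather than the exact perf_adjust_period() arithmetic: roughly hwc->last_period events elapsed over delta nanoseconds, so the period is resized to aim for attr.sample_freq overflows per second, and the 2*TICK_NSEC bound keeps the adjustment to reasonably short, fresh windows.

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

static uint64_t adjust_period(uint64_t events, uint64_t delta_ns,
                              uint64_t sample_freq)
{
        /* event rate observed in this window, in events per second */
        uint64_t rate = events * NSEC_PER_SEC / delta_ns;

        /* one overflow every 'period' events gives ~sample_freq overflows/s */
        return rate / sample_freq ? rate / sample_freq : 1;
}

int main(void)
{
        /* ~3 GHz cycle counter sampled at 1000 Hz -> period near 3,000,000 */
        printf("new period: %llu\n",
               (unsigned long long)adjust_period(3000000, 1000000, 1000));
        return 0;
}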
3665 /* 3841 /*
@@ -3679,7 +3855,11 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3679 perf_event_disable(event); 3855 perf_event_disable(event);
3680 } 3856 }
3681 3857
3682 perf_event_output(event, nmi, data, regs); 3858 if (event->overflow_handler)
3859 event->overflow_handler(event, nmi, data, regs);
3860 else
3861 perf_event_output(event, nmi, data, regs);
3862
3683 return ret; 3863 return ret;
3684} 3864}
3685 3865
@@ -3724,16 +3904,16 @@ again:
3724 return nr; 3904 return nr;
3725} 3905}
3726 3906
3727static void perf_swevent_overflow(struct perf_event *event, 3907static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3728 int nmi, struct perf_sample_data *data, 3908 int nmi, struct perf_sample_data *data,
3729 struct pt_regs *regs) 3909 struct pt_regs *regs)
3730{ 3910{
3731 struct hw_perf_event *hwc = &event->hw; 3911 struct hw_perf_event *hwc = &event->hw;
3732 int throttle = 0; 3912 int throttle = 0;
3733 u64 overflow;
3734 3913
3735 data->period = event->hw.last_period; 3914 data->period = event->hw.last_period;
3736 overflow = perf_swevent_set_period(event); 3915 if (!overflow)
3916 overflow = perf_swevent_set_period(event);
3737 3917
3738 if (hwc->interrupts == MAX_INTERRUPTS) 3918 if (hwc->interrupts == MAX_INTERRUPTS)
3739 return; 3919 return;
@@ -3766,14 +3946,19 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3766 3946
3767 atomic64_add(nr, &event->count); 3947 atomic64_add(nr, &event->count);
3768 3948
3949 if (!regs)
3950 return;
3951
3769 if (!hwc->sample_period) 3952 if (!hwc->sample_period)
3770 return; 3953 return;
3771 3954
3772 if (!regs) 3955 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
3956 return perf_swevent_overflow(event, 1, nmi, data, regs);
3957
3958 if (atomic64_add_negative(nr, &hwc->period_left))
3773 return; 3959 return;
3774 3960
3775 if (!atomic64_add_negative(nr, &hwc->period_left)) 3961 perf_swevent_overflow(event, 0, nmi, data, regs);
3776 perf_swevent_overflow(event, nmi, data, regs);
3777} 3962}
3778 3963
3779static int perf_swevent_is_counting(struct perf_event *event) 3964static int perf_swevent_is_counting(struct perf_event *event)
@@ -3806,25 +3991,47 @@ static int perf_swevent_is_counting(struct perf_event *event)
3806 return 1; 3991 return 1;
3807} 3992}
3808 3993
3994static int perf_tp_event_match(struct perf_event *event,
3995 struct perf_sample_data *data);
3996
3997static int perf_exclude_event(struct perf_event *event,
3998 struct pt_regs *regs)
3999{
4000 if (regs) {
4001 if (event->attr.exclude_user && user_mode(regs))
4002 return 1;
4003
4004 if (event->attr.exclude_kernel && !user_mode(regs))
4005 return 1;
4006 }
4007
4008 return 0;
4009}
4010
3809static int perf_swevent_match(struct perf_event *event, 4011static int perf_swevent_match(struct perf_event *event,
3810 enum perf_type_id type, 4012 enum perf_type_id type,
3811 u32 event_id, struct pt_regs *regs) 4013 u32 event_id,
4014 struct perf_sample_data *data,
4015 struct pt_regs *regs)
3812{ 4016{
4017 if (event->cpu != -1 && event->cpu != smp_processor_id())
4018 return 0;
4019
3813 if (!perf_swevent_is_counting(event)) 4020 if (!perf_swevent_is_counting(event))
3814 return 0; 4021 return 0;
3815 4022
3816 if (event->attr.type != type) 4023 if (event->attr.type != type)
3817 return 0; 4024 return 0;
4025
3818 if (event->attr.config != event_id) 4026 if (event->attr.config != event_id)
3819 return 0; 4027 return 0;
3820 4028
3821 if (regs) { 4029 if (perf_exclude_event(event, regs))
3822 if (event->attr.exclude_user && user_mode(regs)) 4030 return 0;
3823 return 0;
3824 4031
3825 if (event->attr.exclude_kernel && !user_mode(regs)) 4032 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
3826 return 0; 4033 !perf_tp_event_match(event, data))
3827 } 4034 return 0;
3828 4035
3829 return 1; 4036 return 1;
3830} 4037}
@@ -3837,49 +4044,59 @@ static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3837{ 4044{
3838 struct perf_event *event; 4045 struct perf_event *event;
3839 4046
3840 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3841 return;
3842
3843 rcu_read_lock();
3844 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4047 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3845 if (perf_swevent_match(event, type, event_id, regs)) 4048 if (perf_swevent_match(event, type, event_id, data, regs))
3846 perf_swevent_add(event, nr, nmi, data, regs); 4049 perf_swevent_add(event, nr, nmi, data, regs);
3847 } 4050 }
3848 rcu_read_unlock();
3849} 4051}
3850 4052
3851static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) 4053int perf_swevent_get_recursion_context(void)
3852{ 4054{
4055 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
4056 int rctx;
4057
3853 if (in_nmi()) 4058 if (in_nmi())
3854 return &cpuctx->recursion[3]; 4059 rctx = 3;
4060 else if (in_irq())
4061 rctx = 2;
4062 else if (in_softirq())
4063 rctx = 1;
4064 else
4065 rctx = 0;
4066
4067 if (cpuctx->recursion[rctx]) {
4068 put_cpu_var(perf_cpu_context);
4069 return -1;
4070 }
3855 4071
3856 if (in_irq()) 4072 cpuctx->recursion[rctx]++;
3857 return &cpuctx->recursion[2]; 4073 barrier();
3858 4074
3859 if (in_softirq()) 4075 return rctx;
3860 return &cpuctx->recursion[1]; 4076}
4077EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
3861 4078
3862 return &cpuctx->recursion[0]; 4079void perf_swevent_put_recursion_context(int rctx)
4080{
4081 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4082 barrier();
4083 cpuctx->recursion[rctx]--;
4084 put_cpu_var(perf_cpu_context);
3863} 4085}
4086EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
3864 4087
3865static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 4088static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3866 u64 nr, int nmi, 4089 u64 nr, int nmi,
3867 struct perf_sample_data *data, 4090 struct perf_sample_data *data,
3868 struct pt_regs *regs) 4091 struct pt_regs *regs)
3869{ 4092{
3870 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 4093 struct perf_cpu_context *cpuctx;
3871 int *recursion = perf_swevent_recursion_context(cpuctx);
3872 struct perf_event_context *ctx; 4094 struct perf_event_context *ctx;
3873 4095
3874 if (*recursion) 4096 cpuctx = &__get_cpu_var(perf_cpu_context);
3875 goto out; 4097 rcu_read_lock();
3876
3877 (*recursion)++;
3878 barrier();
3879
3880 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, 4098 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3881 nr, nmi, data, regs); 4099 nr, nmi, data, regs);
3882 rcu_read_lock();
3883 /* 4100 /*
3884 * doesn't really matter which of the child contexts the 4101 * doesn't really matter which of the child contexts the
3885 * events ends up in. 4102 * events ends up in.
@@ -3888,23 +4105,23 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3888 if (ctx) 4105 if (ctx)
3889 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); 4106 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3890 rcu_read_unlock(); 4107 rcu_read_unlock();
3891
3892 barrier();
3893 (*recursion)--;
3894
3895out:
3896 put_cpu_var(perf_cpu_context);
3897} 4108}
3898 4109
3899void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4110void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3900 struct pt_regs *regs, u64 addr) 4111 struct pt_regs *regs, u64 addr)
3901{ 4112{
3902 struct perf_sample_data data = { 4113 struct perf_sample_data data;
3903 .addr = addr, 4114 int rctx;
3904 };
3905 4115
3906 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, 4116 rctx = perf_swevent_get_recursion_context();
3907 &data, regs); 4117 if (rctx < 0)
4118 return;
4119
4120 perf_sample_data_init(&data, addr);
4121
4122 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4123
4124 perf_swevent_put_recursion_context(rctx);
3908} 4125}
3909 4126
3910static void perf_swevent_read(struct perf_event *event) 4127static void perf_swevent_read(struct perf_event *event)
@@ -3945,10 +4162,11 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3945 struct perf_event *event; 4162 struct perf_event *event;
3946 u64 period; 4163 u64 period;
3947 4164
3948 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 4165 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
3949 event->pmu->read(event); 4166 event->pmu->read(event);
3950 4167
3951 data.addr = 0; 4168 perf_sample_data_init(&data, 0);
4169 data.period = event->hw.last_period;
3952 regs = get_irq_regs(); 4170 regs = get_irq_regs();
3953 /* 4171 /*
3954 * In case we exclude kernel IPs or are somehow not in interrupt 4172 * In case we exclude kernel IPs or are somehow not in interrupt
@@ -4017,8 +4235,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
4017 u64 now; 4235 u64 now;
4018 4236
4019 now = cpu_clock(cpu); 4237 now = cpu_clock(cpu);
4020 prev = atomic64_read(&event->hw.prev_count); 4238 prev = atomic64_xchg(&event->hw.prev_count, now);
4021 atomic64_set(&event->hw.prev_count, now);
4022 atomic64_add(now - prev, &event->count); 4239 atomic64_add(now - prev, &event->count);
4023} 4240}
4024 4241
@@ -4107,36 +4324,39 @@ static const struct pmu perf_ops_task_clock = {
4107 .read = task_clock_perf_event_read, 4324 .read = task_clock_perf_event_read,
4108}; 4325};
4109 4326
4110#ifdef CONFIG_EVENT_PROFILE 4327#ifdef CONFIG_EVENT_TRACING
4328
4111void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4329void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4112 int entry_size) 4330 int entry_size, struct pt_regs *regs)
4113{ 4331{
4332 struct perf_sample_data data;
4114 struct perf_raw_record raw = { 4333 struct perf_raw_record raw = {
4115 .size = entry_size, 4334 .size = entry_size,
4116 .data = record, 4335 .data = record,
4117 }; 4336 };
4118 4337
4119 struct perf_sample_data data = { 4338 perf_sample_data_init(&data, addr);
4120 .addr = addr, 4339 data.raw = &raw;
4121 .raw = &raw,
4122 };
4123
4124 struct pt_regs *regs = get_irq_regs();
4125
4126 if (!regs)
4127 regs = task_pt_regs(current);
4128 4340
4341 /* Trace events already protected against recursion */
4129 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4342 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
4130 &data, regs); 4343 &data, regs);
4131} 4344}
4132EXPORT_SYMBOL_GPL(perf_tp_event); 4345EXPORT_SYMBOL_GPL(perf_tp_event);
4133 4346
4134extern int ftrace_profile_enable(int); 4347static int perf_tp_event_match(struct perf_event *event,
4135extern void ftrace_profile_disable(int); 4348 struct perf_sample_data *data)
4349{
4350 void *record = data->raw->data;
4351
4352 if (likely(!event->filter) || filter_match_preds(event->filter, record))
4353 return 1;
4354 return 0;
4355}
4136 4356
4137static void tp_perf_event_destroy(struct perf_event *event) 4357static void tp_perf_event_destroy(struct perf_event *event)
4138{ 4358{
4139 ftrace_profile_disable(event->attr.config); 4359 perf_trace_disable(event->attr.config);
4140} 4360}
4141 4361
4142static const struct pmu *tp_perf_event_init(struct perf_event *event) 4362static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4150,18 +4370,99 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4150 !capable(CAP_SYS_ADMIN)) 4370 !capable(CAP_SYS_ADMIN))
4151 return ERR_PTR(-EPERM); 4371 return ERR_PTR(-EPERM);
4152 4372
4153 if (ftrace_profile_enable(event->attr.config)) 4373 if (perf_trace_enable(event->attr.config))
4154 return NULL; 4374 return NULL;
4155 4375
4156 event->destroy = tp_perf_event_destroy; 4376 event->destroy = tp_perf_event_destroy;
4157 4377
4158 return &perf_ops_generic; 4378 return &perf_ops_generic;
4159} 4379}
4380
4381static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4382{
4383 char *filter_str;
4384 int ret;
4385
4386 if (event->attr.type != PERF_TYPE_TRACEPOINT)
4387 return -EINVAL;
4388
4389 filter_str = strndup_user(arg, PAGE_SIZE);
4390 if (IS_ERR(filter_str))
4391 return PTR_ERR(filter_str);
4392
4393 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
4394
4395 kfree(filter_str);
4396 return ret;
4397}
4398
4399static void perf_event_free_filter(struct perf_event *event)
4400{
4401 ftrace_profile_free_filter(event);
4402}
4403
4160#else 4404#else
4405
4406static int perf_tp_event_match(struct perf_event *event,
4407 struct perf_sample_data *data)
4408{
4409 return 1;
4410}
4411
4161static const struct pmu *tp_perf_event_init(struct perf_event *event) 4412static const struct pmu *tp_perf_event_init(struct perf_event *event)
4162{ 4413{
4163 return NULL; 4414 return NULL;
4164} 4415}
4416
4417static int perf_event_set_filter(struct perf_event *event, void __user *arg)
4418{
4419 return -ENOENT;
4420}
4421
4422static void perf_event_free_filter(struct perf_event *event)
4423{
4424}
4425
4426#endif /* CONFIG_EVENT_TRACING */
4427
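Creating the event that tp_perf_event_init() services, from userspace: attr.type is PERF_TYPE_TRACEPOINT and attr.config is the numeric id the tracing code exports through debugfs. A sketch, assuming debugfs is mounted at /sys/kernel/debug and reusing the raw syscall wrapper from the earlier examples.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        unsigned long long id = 0;
        FILE *f;
        int fd;

        /* e.g. the sched:sched_switch tracepoint */
        f = fopen("/sys/kernel/debug/tracing/events/sched/sched_switch/id", "r");
        if (!f || fscanf(f, "%llu", &id) != 1)
                return 1;
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_TRACEPOINT;
        attr.size = sizeof(attr);
        attr.config = id;
        attr.sample_period = 1;

        fd = perf_event_open(&attr, 0, -1, -1, 0);      /* this task, any CPU */
        if (fd < 0)
                return 1;
        /* PERF_EVENT_IOC_SET_FILTER can now be applied to fd (see above). */
        close(fd);
        return 0;
}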
4428#ifdef CONFIG_HAVE_HW_BREAKPOINT
4429static void bp_perf_event_destroy(struct perf_event *event)
4430{
4431 release_bp_slot(event);
4432}
4433
4434static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4435{
4436 int err;
4437
4438 err = register_perf_hw_breakpoint(bp);
4439 if (err)
4440 return ERR_PTR(err);
4441
4442 bp->destroy = bp_perf_event_destroy;
4443
4444 return &perf_ops_bp;
4445}
4446
4447void perf_bp_event(struct perf_event *bp, void *data)
4448{
4449 struct perf_sample_data sample;
4450 struct pt_regs *regs = data;
4451
4452 perf_sample_data_init(&sample, bp->attr.bp_addr);
4453
4454 if (!perf_exclude_event(bp, regs))
4455 perf_swevent_add(bp, 1, 1, &sample, regs);
4456}
4457#else
4458static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4459{
4460 return NULL;
4461}
4462
4463void perf_bp_event(struct perf_event *bp, void *regs)
4464{
4465}
4165#endif 4466#endif
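The matching userspace view of the new PERF_TYPE_BREAKPOINT type handled by bp_perf_event_init(): count writes to one data address. The bp_* attr fields and HW_BREAKPOINT_* constants are taken from this kernel's headers, and whether the open succeeds depends on CONFIG_HAVE_HW_BREAKPOINT and a free debug register, so treat this as a sketch.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static int watched;                     /* the variable being watched */

int main(void)
{
        struct perf_event_attr attr;
        unsigned long long hits = 0;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_BREAKPOINT;
        attr.size = sizeof(attr);
        attr.bp_type = HW_BREAKPOINT_W;
        attr.bp_addr = (unsigned long)&watched;
        attr.bp_len = HW_BREAKPOINT_LEN_4;

        fd = perf_event_open(&attr, 0, -1, -1, 0);
        if (fd < 0)
                return 1;

        watched = 42;                   /* should bump the counter */

        read(fd, &hits, sizeof(hits));
        printf("write hits: %llu\n", hits);
        close(fd);
        return 0;
}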
4166 4467
4167atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; 4468atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
@@ -4208,6 +4509,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4208 case PERF_COUNT_SW_PAGE_FAULTS_MAJ: 4509 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4209 case PERF_COUNT_SW_CONTEXT_SWITCHES: 4510 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4210 case PERF_COUNT_SW_CPU_MIGRATIONS: 4511 case PERF_COUNT_SW_CPU_MIGRATIONS:
4512 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4513 case PERF_COUNT_SW_EMULATION_FAULTS:
4211 if (!event->parent) { 4514 if (!event->parent) {
4212 atomic_inc(&perf_swevent_enabled[event_id]); 4515 atomic_inc(&perf_swevent_enabled[event_id]);
4213 event->destroy = sw_perf_event_destroy; 4516 event->destroy = sw_perf_event_destroy;
@@ -4228,6 +4531,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4228 struct perf_event_context *ctx, 4531 struct perf_event_context *ctx,
4229 struct perf_event *group_leader, 4532 struct perf_event *group_leader,
4230 struct perf_event *parent_event, 4533 struct perf_event *parent_event,
4534 perf_overflow_handler_t overflow_handler,
4231 gfp_t gfpflags) 4535 gfp_t gfpflags)
4232{ 4536{
4233 const struct pmu *pmu; 4537 const struct pmu *pmu;
@@ -4270,6 +4574,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4270 4574
4271 event->state = PERF_EVENT_STATE_INACTIVE; 4575 event->state = PERF_EVENT_STATE_INACTIVE;
4272 4576
4577 if (!overflow_handler && parent_event)
4578 overflow_handler = parent_event->overflow_handler;
4579
4580 event->overflow_handler = overflow_handler;
4581
4273 if (attr->disabled) 4582 if (attr->disabled)
4274 event->state = PERF_EVENT_STATE_OFF; 4583 event->state = PERF_EVENT_STATE_OFF;
4275 4584
@@ -4304,6 +4613,11 @@ perf_event_alloc(struct perf_event_attr *attr,
4304 pmu = tp_perf_event_init(event); 4613 pmu = tp_perf_event_init(event);
4305 break; 4614 break;
4306 4615
4616 case PERF_TYPE_BREAKPOINT:
4617 pmu = bp_perf_event_init(event);
4618 break;
4619
4620
4307 default: 4621 default:
4308 break; 4622 break;
4309 } 4623 }
@@ -4398,7 +4712,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
4398 if (attr->type >= PERF_TYPE_MAX) 4712 if (attr->type >= PERF_TYPE_MAX)
4399 return -EINVAL; 4713 return -EINVAL;
4400 4714
4401 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) 4715 if (attr->__reserved_1)
4402 return -EINVAL; 4716 return -EINVAL;
4403 4717
4404 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 4718 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -4416,7 +4730,7 @@ err_size:
4416 goto out; 4730 goto out;
4417} 4731}
4418 4732
4419int perf_event_set_output(struct perf_event *event, int output_fd) 4733static int perf_event_set_output(struct perf_event *event, int output_fd)
4420{ 4734{
4421 struct perf_event *output_event = NULL; 4735 struct perf_event *output_event = NULL;
4422 struct file *output_file = NULL; 4736 struct file *output_file = NULL;
@@ -4546,12 +4860,12 @@ SYSCALL_DEFINE5(perf_event_open,
4546 } 4860 }
4547 4861
4548 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 4862 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4549 NULL, GFP_KERNEL); 4863 NULL, NULL, GFP_KERNEL);
4550 err = PTR_ERR(event); 4864 err = PTR_ERR(event);
4551 if (IS_ERR(event)) 4865 if (IS_ERR(event))
4552 goto err_put_context; 4866 goto err_put_context;
4553 4867
4554 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); 4868 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
4555 if (err < 0) 4869 if (err < 0)
4556 goto err_free_put_context; 4870 goto err_free_put_context;
4557 4871
@@ -4583,7 +4897,7 @@ err_fput_free_put_context:
4583 4897
4584err_free_put_context: 4898err_free_put_context:
4585 if (err < 0) 4899 if (err < 0)
4586 kfree(event); 4900 free_event(event);
4587 4901
4588err_put_context: 4902err_put_context:
4589 if (err < 0) 4903 if (err < 0)
@@ -4594,6 +4908,61 @@ err_put_context:
4594 return err; 4908 return err;
4595} 4909}
4596 4910
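The switch to O_RDWR above is what lets the ring buffer be mapped PROT_WRITE, which a consumer needs in order to acknowledge data by storing to data_tail in the control page. A skeleton of that consumer side, assuming fd came from perf_event_open() with sampling configured and N_PAGES a power of two; record parsing is elided.

#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>
#include <linux/perf_event.h>

#define N_PAGES 8                       /* data pages, power of two */

static void drain(int fd)
{
        size_t page = sysconf(_SC_PAGESIZE);
        size_t len = (1 + N_PAGES) * page;
        struct perf_event_mmap_page *mp;
        uint64_t head;

        mp = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (mp == MAP_FAILED)
                return;

        head = mp->data_head;
        __sync_synchronize();           /* read the data only after data_head */

        /* struct perf_event_header records between mp->data_tail and head
         * live at (char *)mp + page, wrapping modulo N_PAGES * page */

        mp->data_tail = head;           /* tell the kernel everything was consumed */
        munmap(mp, len);
}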
4911/**
4912 * perf_event_create_kernel_counter
4913 *
4914 * @attr: attributes of the counter to create
 4915 * @cpu: cpu on which the counter is bound
4916 * @pid: task to profile
4917 */
4918struct perf_event *
4919perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4920 pid_t pid,
4921 perf_overflow_handler_t overflow_handler)
4922{
4923 struct perf_event *event;
4924 struct perf_event_context *ctx;
4925 int err;
4926
4927 /*
4928 * Get the target context (task or percpu):
4929 */
4930
4931 ctx = find_get_context(pid, cpu);
4932 if (IS_ERR(ctx)) {
4933 err = PTR_ERR(ctx);
4934 goto err_exit;
4935 }
4936
4937 event = perf_event_alloc(attr, cpu, ctx, NULL,
4938 NULL, overflow_handler, GFP_KERNEL);
4939 if (IS_ERR(event)) {
4940 err = PTR_ERR(event);
4941 goto err_put_context;
4942 }
4943
4944 event->filp = NULL;
4945 WARN_ON_ONCE(ctx->parent_ctx);
4946 mutex_lock(&ctx->mutex);
4947 perf_install_in_context(ctx, event, cpu);
4948 ++ctx->generation;
4949 mutex_unlock(&ctx->mutex);
4950
4951 event->owner = current;
4952 get_task_struct(current);
4953 mutex_lock(&current->perf_event_mutex);
4954 list_add_tail(&event->owner_entry, &current->perf_event_list);
4955 mutex_unlock(&current->perf_event_mutex);
4956
4957 return event;
4958
4959 err_put_context:
4960 put_ctx(ctx);
4961 err_exit:
4962 return ERR_PTR(err);
4963}
4964EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
4965
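A sketch of how an in-kernel user ties together the two interfaces exported in this patch: build an attr, create a per-CPU counter with perf_event_create_kernel_counter(), receive callbacks through the overflow_handler instead of perf_event_output(), and tear it down with perf_event_release_kernel(). The attr values and the "watchdog" names are purely illustrative.

#include <linux/err.h>
#include <linux/perf_event.h>

static struct perf_event *watchdog_event;

static void watchdog_overflow(struct perf_event *event, int nmi,
                              struct perf_sample_data *data,
                              struct pt_regs *regs)
{
        /* runs from the overflow path in place of perf_event_output() */
}

static int watchdog_start(int cpu)
{
        struct perf_event_attr attr = {
                .type           = PERF_TYPE_HARDWARE,
                .config         = PERF_COUNT_HW_CPU_CYCLES,
                .size           = sizeof(attr),
                .sample_period  = 100000000,
                .pinned         = 1,
        };

        /* pid == -1, cpu >= 0: a per-CPU counter via find_get_context() */
        watchdog_event = perf_event_create_kernel_counter(&attr, cpu, -1,
                                                          watchdog_overflow);
        if (IS_ERR(watchdog_event))
                return PTR_ERR(watchdog_event);
        return 0;
}

static void watchdog_stop(void)
{
        perf_event_release_kernel(watchdog_event);
}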
4597/* 4966/*
4598 * inherit a event from parent task to child task: 4967 * inherit a event from parent task to child task:
4599 */ 4968 */
@@ -4619,7 +4988,7 @@ inherit_event(struct perf_event *parent_event,
4619 child_event = perf_event_alloc(&parent_event->attr, 4988 child_event = perf_event_alloc(&parent_event->attr,
4620 parent_event->cpu, child_ctx, 4989 parent_event->cpu, child_ctx,
4621 group_leader, parent_event, 4990 group_leader, parent_event,
4622 GFP_KERNEL); 4991 NULL, GFP_KERNEL);
4623 if (IS_ERR(child_event)) 4992 if (IS_ERR(child_event))
4624 return child_event; 4993 return child_event;
4625 get_ctx(child_ctx); 4994 get_ctx(child_ctx);
@@ -4634,8 +5003,17 @@ inherit_event(struct perf_event *parent_event,
4634 else 5003 else
4635 child_event->state = PERF_EVENT_STATE_OFF; 5004 child_event->state = PERF_EVENT_STATE_OFF;
4636 5005
4637 if (parent_event->attr.freq) 5006 if (parent_event->attr.freq) {
4638 child_event->hw.sample_period = parent_event->hw.sample_period; 5007 u64 sample_period = parent_event->hw.sample_period;
5008 struct hw_perf_event *hwc = &child_event->hw;
5009
5010 hwc->sample_period = sample_period;
5011 hwc->last_period = sample_period;
5012
5013 atomic64_set(&hwc->period_left, sample_period);
5014 }
5015
5016 child_event->overflow_handler = parent_event->overflow_handler;
4639 5017
4640 /* 5018 /*
4641 * Link it up in the child's context: 5019 * Link it up in the child's context:
@@ -4726,7 +5104,6 @@ __perf_event_exit_task(struct perf_event *child_event,
4726{ 5104{
4727 struct perf_event *parent_event; 5105 struct perf_event *parent_event;
4728 5106
4729 update_event_times(child_event);
4730 perf_event_remove_from_context(child_event); 5107 perf_event_remove_from_context(child_event);
4731 5108
4732 parent_event = child_event->parent; 5109 parent_event = child_event->parent;
@@ -4770,7 +5147,7 @@ void perf_event_exit_task(struct task_struct *child)
4770 * reading child->perf_event_ctxp, we wait until it has 5147 * reading child->perf_event_ctxp, we wait until it has
4771 * incremented the context's refcount before we do put_ctx below. 5148 * incremented the context's refcount before we do put_ctx below.
4772 */ 5149 */
4773 spin_lock(&child_ctx->lock); 5150 raw_spin_lock(&child_ctx->lock);
4774 child->perf_event_ctxp = NULL; 5151 child->perf_event_ctxp = NULL;
4775 /* 5152 /*
4776 * If this context is a clone; unclone it so it can't get 5153 * If this context is a clone; unclone it so it can't get
@@ -4778,7 +5155,8 @@ void perf_event_exit_task(struct task_struct *child)
4778 * the events from it. 5155 * the events from it.
4779 */ 5156 */
4780 unclone_ctx(child_ctx); 5157 unclone_ctx(child_ctx);
4781 spin_unlock_irqrestore(&child_ctx->lock, flags); 5158 update_context_time(child_ctx);
5159 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
4782 5160
4783 /* 5161 /*
4784 * Report the task dead after unscheduling the events so that we 5162 * Report the task dead after unscheduling the events so that we
@@ -4801,7 +5179,11 @@ void perf_event_exit_task(struct task_struct *child)
4801 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5179 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4802 5180
4803again: 5181again:
4804 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, 5182 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5183 group_entry)
5184 __perf_event_exit_task(child_event, child_ctx, child);
5185
5186 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
4805 group_entry) 5187 group_entry)
4806 __perf_event_exit_task(child_event, child_ctx, child); 5188 __perf_event_exit_task(child_event, child_ctx, child);
4807 5189
@@ -4810,7 +5192,8 @@ again:
4810 * its siblings to the list, but we obtained 'tmp' before that which 5192 * its siblings to the list, but we obtained 'tmp' before that which
4811 * will still point to the list head terminating the iteration. 5193 * will still point to the list head terminating the iteration.
4812 */ 5194 */
4813 if (!list_empty(&child_ctx->group_list)) 5195 if (!list_empty(&child_ctx->pinned_groups) ||
5196 !list_empty(&child_ctx->flexible_groups))
4814 goto again; 5197 goto again;
4815 5198
4816 mutex_unlock(&child_ctx->mutex); 5199 mutex_unlock(&child_ctx->mutex);
@@ -4818,6 +5201,24 @@ again:
4818 put_ctx(child_ctx); 5201 put_ctx(child_ctx);
4819} 5202}
4820 5203
5204static void perf_free_event(struct perf_event *event,
5205 struct perf_event_context *ctx)
5206{
5207 struct perf_event *parent = event->parent;
5208
5209 if (WARN_ON_ONCE(!parent))
5210 return;
5211
5212 mutex_lock(&parent->child_mutex);
5213 list_del_init(&event->child_list);
5214 mutex_unlock(&parent->child_mutex);
5215
5216 fput(parent->filp);
5217
5218 list_del_event(event, ctx);
5219 free_event(event);
5220}
5221
4821/* 5222/*
4822 * free an unexposed, unused context as created by inheritance by 5223 * free an unexposed, unused context as created by inheritance by
4823 * init_task below, used by fork() in case of fail. 5224 * init_task below, used by fork() in case of fail.
@@ -4832,30 +5233,64 @@ void perf_event_free_task(struct task_struct *task)
4832 5233
4833 mutex_lock(&ctx->mutex); 5234 mutex_lock(&ctx->mutex);
4834again: 5235again:
4835 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { 5236 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
4836 struct perf_event *parent = event->parent; 5237 perf_free_event(event, ctx);
4837 5238
4838 if (WARN_ON_ONCE(!parent)) 5239 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
4839 continue; 5240 group_entry)
5241 perf_free_event(event, ctx);
4840 5242
4841 mutex_lock(&parent->child_mutex); 5243 if (!list_empty(&ctx->pinned_groups) ||
4842 list_del_init(&event->child_list); 5244 !list_empty(&ctx->flexible_groups))
4843 mutex_unlock(&parent->child_mutex); 5245 goto again;
4844 5246
4845 fput(parent->filp); 5247 mutex_unlock(&ctx->mutex);
4846 5248
4847 list_del_event(event, ctx); 5249 put_ctx(ctx);
4848 free_event(event); 5250}
5251
5252static int
5253inherit_task_group(struct perf_event *event, struct task_struct *parent,
5254 struct perf_event_context *parent_ctx,
5255 struct task_struct *child,
5256 int *inherited_all)
5257{
5258 int ret;
5259 struct perf_event_context *child_ctx = child->perf_event_ctxp;
5260
5261 if (!event->attr.inherit) {
5262 *inherited_all = 0;
5263 return 0;
4849 } 5264 }
4850 5265
4851 if (!list_empty(&ctx->group_list)) 5266 if (!child_ctx) {
4852 goto again; 5267 /*
5268 * This is executed from the parent task context, so
5269 * inherit events that have been marked for cloning.
5270 * First allocate and initialize a context for the
5271 * child.
5272 */
4853 5273
4854 mutex_unlock(&ctx->mutex); 5274 child_ctx = kzalloc(sizeof(struct perf_event_context),
5275 GFP_KERNEL);
5276 if (!child_ctx)
5277 return -ENOMEM;
4855 5278
4856 put_ctx(ctx); 5279 __perf_event_init_context(child_ctx, child);
5280 child->perf_event_ctxp = child_ctx;
5281 get_task_struct(child);
5282 }
5283
5284 ret = inherit_group(event, parent, parent_ctx,
5285 child, child_ctx);
5286
5287 if (ret)
5288 *inherited_all = 0;
5289
5290 return ret;
4857} 5291}
4858 5292
5293
4859/* 5294/*
4860 * Initialize the perf_event context in task_struct 5295 * Initialize the perf_event context in task_struct
4861 */ 5296 */
@@ -4877,20 +5312,6 @@ int perf_event_init_task(struct task_struct *child)
4877 return 0; 5312 return 0;
4878 5313
4879 /* 5314 /*
4880 * This is executed from the parent task context, so inherit
4881 * events that have been marked for cloning.
4882 * First allocate and initialize a context for the child.
4883 */
4884
4885 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4886 if (!child_ctx)
4887 return -ENOMEM;
4888
4889 __perf_event_init_context(child_ctx, child);
4890 child->perf_event_ctxp = child_ctx;
4891 get_task_struct(child);
4892
4893 /*
4894 * If the parent's context is a clone, pin it so it won't get 5315 * If the parent's context is a clone, pin it so it won't get
4895 * swapped under us. 5316 * swapped under us.
4896 */ 5317 */
@@ -4913,22 +5334,23 @@ int perf_event_init_task(struct task_struct *child)
4913 * We dont have to disable NMIs - we are only looking at 5334 * We dont have to disable NMIs - we are only looking at
4914 * the list, not manipulating it: 5335 * the list, not manipulating it:
4915 */ 5336 */
4916 list_for_each_entry(event, &parent_ctx->group_list, group_entry) { 5337 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
4917 5338 ret = inherit_task_group(event, parent, parent_ctx, child,
4918 if (!event->attr.inherit) { 5339 &inherited_all);
4919 inherited_all = 0; 5340 if (ret)
4920 continue; 5341 break;
4921 } 5342 }
4922 5343
4923 ret = inherit_group(event, parent, parent_ctx, 5344 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
4924 child, child_ctx); 5345 ret = inherit_task_group(event, parent, parent_ctx, child,
4925 if (ret) { 5346 &inherited_all);
4926 inherited_all = 0; 5347 if (ret)
4927 break; 5348 break;
4928 }
4929 } 5349 }
4930 5350
4931 if (inherited_all) { 5351 child_ctx = child->perf_event_ctxp;
5352
5353 if (child_ctx && inherited_all) {
4932 /* 5354 /*
4933 * Mark the child context as a clone of the parent 5355 * Mark the child context as a clone of the parent
4934 * context, or of whatever the parent is a clone of. 5356 * context, or of whatever the parent is a clone of.
@@ -4955,18 +5377,26 @@ int perf_event_init_task(struct task_struct *child)
4955 return ret; 5377 return ret;
4956} 5378}
4957 5379
5380static void __init perf_event_init_all_cpus(void)
5381{
5382 int cpu;
5383 struct perf_cpu_context *cpuctx;
5384
5385 for_each_possible_cpu(cpu) {
5386 cpuctx = &per_cpu(perf_cpu_context, cpu);
5387 __perf_event_init_context(&cpuctx->ctx, NULL);
5388 }
5389}
5390
4958static void __cpuinit perf_event_init_cpu(int cpu) 5391static void __cpuinit perf_event_init_cpu(int cpu)
4959{ 5392{
4960 struct perf_cpu_context *cpuctx; 5393 struct perf_cpu_context *cpuctx;
4961 5394
4962 cpuctx = &per_cpu(perf_cpu_context, cpu); 5395 cpuctx = &per_cpu(perf_cpu_context, cpu);
4963 __perf_event_init_context(&cpuctx->ctx, NULL);
4964 5396
4965 spin_lock(&perf_resource_lock); 5397 spin_lock(&perf_resource_lock);
4966 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5398 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
4967 spin_unlock(&perf_resource_lock); 5399 spin_unlock(&perf_resource_lock);
4968
4969 hw_perf_event_setup(cpu);
4970} 5400}
4971 5401
4972#ifdef CONFIG_HOTPLUG_CPU 5402#ifdef CONFIG_HOTPLUG_CPU
@@ -4976,7 +5406,9 @@ static void __perf_event_exit_cpu(void *info)
4976 struct perf_event_context *ctx = &cpuctx->ctx; 5406 struct perf_event_context *ctx = &cpuctx->ctx;
4977 struct perf_event *event, *tmp; 5407 struct perf_event *event, *tmp;
4978 5408
4979 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) 5409 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5410 __perf_event_remove_from_context(event);
5411 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
4980 __perf_event_remove_from_context(event); 5412 __perf_event_remove_from_context(event);
4981} 5413}
4982static void perf_event_exit_cpu(int cpu) 5414static void perf_event_exit_cpu(int cpu)
@@ -5004,11 +5436,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5004 perf_event_init_cpu(cpu); 5436 perf_event_init_cpu(cpu);
5005 break; 5437 break;
5006 5438
5007 case CPU_ONLINE:
5008 case CPU_ONLINE_FROZEN:
5009 hw_perf_event_setup_online(cpu);
5010 break;
5011
5012 case CPU_DOWN_PREPARE: 5439 case CPU_DOWN_PREPARE:
5013 case CPU_DOWN_PREPARE_FROZEN: 5440 case CPU_DOWN_PREPARE_FROZEN:
5014 perf_event_exit_cpu(cpu); 5441 perf_event_exit_cpu(cpu);
@@ -5031,6 +5458,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
5031 5458
5032void __init perf_event_init(void) 5459void __init perf_event_init(void)
5033{ 5460{
5461 perf_event_init_all_cpus();
5034 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, 5462 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
5035 (void *)(long)smp_processor_id()); 5463 (void *)(long)smp_processor_id());
5036 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, 5464 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
@@ -5038,13 +5466,16 @@ void __init perf_event_init(void)
5038 register_cpu_notifier(&perf_cpu_nb); 5466 register_cpu_notifier(&perf_cpu_nb);
5039} 5467}
5040 5468
5041static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) 5469static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
5470 struct sysdev_class_attribute *attr,
5471 char *buf)
5042{ 5472{
5043 return sprintf(buf, "%d\n", perf_reserved_percpu); 5473 return sprintf(buf, "%d\n", perf_reserved_percpu);
5044} 5474}
5045 5475
5046static ssize_t 5476static ssize_t
5047perf_set_reserve_percpu(struct sysdev_class *class, 5477perf_set_reserve_percpu(struct sysdev_class *class,
5478 struct sysdev_class_attribute *attr,
5048 const char *buf, 5479 const char *buf,
5049 size_t count) 5480 size_t count)
5050{ 5481{
@@ -5062,24 +5493,28 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5062 perf_reserved_percpu = val; 5493 perf_reserved_percpu = val;
5063 for_each_online_cpu(cpu) { 5494 for_each_online_cpu(cpu) {
5064 cpuctx = &per_cpu(perf_cpu_context, cpu); 5495 cpuctx = &per_cpu(perf_cpu_context, cpu);
5065 spin_lock_irq(&cpuctx->ctx.lock); 5496 raw_spin_lock_irq(&cpuctx->ctx.lock);
5066 mpt = min(perf_max_events - cpuctx->ctx.nr_events, 5497 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5067 perf_max_events - perf_reserved_percpu); 5498 perf_max_events - perf_reserved_percpu);
5068 cpuctx->max_pertask = mpt; 5499 cpuctx->max_pertask = mpt;
5069 spin_unlock_irq(&cpuctx->ctx.lock); 5500 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5070 } 5501 }
5071 spin_unlock(&perf_resource_lock); 5502 spin_unlock(&perf_resource_lock);
5072 5503
5073 return count; 5504 return count;
5074} 5505}
5075 5506
5076static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) 5507static ssize_t perf_show_overcommit(struct sysdev_class *class,
5508 struct sysdev_class_attribute *attr,
5509 char *buf)
5077{ 5510{
5078 return sprintf(buf, "%d\n", perf_overcommit); 5511 return sprintf(buf, "%d\n", perf_overcommit);
5079} 5512}
5080 5513
5081static ssize_t 5514static ssize_t
5082perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) 5515perf_set_overcommit(struct sysdev_class *class,
5516 struct sysdev_class_attribute *attr,
5517 const char *buf, size_t count)
5083{ 5518{
5084 unsigned long val; 5519 unsigned long val;
5085 int err; 5520 int err;