author		Thomas Gleixner <tglx@linutronix.de>	2011-05-14 06:06:36 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2011-05-14 06:06:36 -0400
commit		a18f22a968de17b29f2310cdb7ba69163e65ec15 (patch)
tree		a7d56d88fad5e444d7661484109758a2f436129e /kernel/perf_event.c
parent		a1c57e0fec53defe745e64417eacdbd3618c3e66 (diff)
parent		798778b8653f64b7b2162ac70eca10367cff6ce8 (diff)
Merge branch 'consolidate-clksrc-i8253' of master.kernel.org:~rmk/linux-2.6-arm into timers/clocksource
Conflicts:
	arch/ia64/kernel/cyclone.c
	arch/mips/kernel/i8253.c
	arch/x86/kernel/i8253.c

Reason: Resolve conflicts so further cleanups do not conflict further

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--	kernel/perf_event.c	1070
1 files changed, 901 insertions, 169 deletions
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 999835b6112b..8e81a9860a0d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,13 +38,96 @@
38 38
39#include <asm/irq_regs.h> 39#include <asm/irq_regs.h>
40 40
41struct remote_function_call {
42 struct task_struct *p;
43 int (*func)(void *info);
44 void *info;
45 int ret;
46};
47
48static void remote_function(void *data)
49{
50 struct remote_function_call *tfc = data;
51 struct task_struct *p = tfc->p;
52
53 if (p) {
54 tfc->ret = -EAGAIN;
55 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
56 return;
57 }
58
59 tfc->ret = tfc->func(tfc->info);
60}
61
62/**
63 * task_function_call - call a function on the cpu on which a task runs
64 * @p: the task to evaluate
65 * @func: the function to be called
66 * @info: the function call argument
67 *
68 * Calls the function @func when the task is currently running. This might
 69 * be on the current CPU, in which case the function is called directly.
70 *
71 * returns: @func return value, or
72 * -ESRCH - when the process isn't running
73 * -EAGAIN - when the process moved away
74 */
75static int
76task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
77{
78 struct remote_function_call data = {
79 .p = p,
80 .func = func,
81 .info = info,
82 .ret = -ESRCH, /* No such (running) process */
83 };
84
85 if (task_curr(p))
86 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
87
88 return data.ret;
89}
90
91/**
 92 * cpu_function_call - call a function on a given cpu
93 * @func: the function to be called
94 * @info: the function call argument
95 *
96 * Calls the function @func on the remote cpu.
97 *
98 * returns: @func return value or -ENXIO when the cpu is offline
99 */
100static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
101{
102 struct remote_function_call data = {
103 .p = NULL,
104 .func = func,
105 .info = info,
106 .ret = -ENXIO, /* No such CPU */
107 };
108
109 smp_call_function_single(cpu, remote_function, &data, 1);
110
111 return data.ret;
112}
113
114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
41enum event_type_t { 118enum event_type_t {
42 EVENT_FLEXIBLE = 0x1, 119 EVENT_FLEXIBLE = 0x1,
43 EVENT_PINNED = 0x2, 120 EVENT_PINNED = 0x2,
44 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
45}; 122};
46 123
47atomic_t perf_task_events __read_mostly; 124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128atomic_t perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130
48static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
49static atomic_t nr_comm_events __read_mostly; 132static atomic_t nr_comm_events __read_mostly;
50static atomic_t nr_task_events __read_mostly; 133static atomic_t nr_task_events __read_mostly;
@@ -62,12 +145,30 @@ static struct srcu_struct pmus_srcu;
62 */ 145 */
63int sysctl_perf_event_paranoid __read_mostly = 1; 146int sysctl_perf_event_paranoid __read_mostly = 1;
64 147
65int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 148/* Minimum for 512 kiB + 1 user control page */
149int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
66 150
67/* 151/*
68 * max perf event sample rate 152 * max perf event sample rate
69 */ 153 */
70int sysctl_perf_event_sample_rate __read_mostly = 100000; 154#define DEFAULT_MAX_SAMPLE_RATE 100000
155int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
156static int max_samples_per_tick __read_mostly =
157 DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
158
159int perf_proc_update_handler(struct ctl_table *table, int write,
160 void __user *buffer, size_t *lenp,
161 loff_t *ppos)
162{
163 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
164
165 if (ret || !write)
166 return ret;
167
168 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
169
170 return 0;
171}
71 172
72static atomic64_t perf_event_id; 173static atomic64_t perf_event_id;
73 174
@@ -75,7 +176,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
75 enum event_type_t event_type); 176 enum event_type_t event_type);
76 177
77static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 178static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
78 enum event_type_t event_type); 179 enum event_type_t event_type,
180 struct task_struct *task);
181
182static void update_context_time(struct perf_event_context *ctx);
183static u64 perf_event_time(struct perf_event *event);
79 184
80void __weak perf_event_print_debug(void) { } 185void __weak perf_event_print_debug(void) { }
81 186
@@ -89,6 +194,361 @@ static inline u64 perf_clock(void)
89 return local_clock(); 194 return local_clock();
90} 195}
91 196
197static inline struct perf_cpu_context *
198__get_cpu_context(struct perf_event_context *ctx)
199{
200 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
201}
202
203#ifdef CONFIG_CGROUP_PERF
204
205/*
206 * Must ensure cgroup is pinned (css_get) before calling
207 * this function. In other words, we cannot call this function
208 * if there is no cgroup event for the current CPU context.
209 */
210static inline struct perf_cgroup *
211perf_cgroup_from_task(struct task_struct *task)
212{
213 return container_of(task_subsys_state(task, perf_subsys_id),
214 struct perf_cgroup, css);
215}
216
217static inline bool
218perf_cgroup_match(struct perf_event *event)
219{
220 struct perf_event_context *ctx = event->ctx;
221 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
222
223 return !event->cgrp || event->cgrp == cpuctx->cgrp;
224}
225
226static inline void perf_get_cgroup(struct perf_event *event)
227{
228 css_get(&event->cgrp->css);
229}
230
231static inline void perf_put_cgroup(struct perf_event *event)
232{
233 css_put(&event->cgrp->css);
234}
235
236static inline void perf_detach_cgroup(struct perf_event *event)
237{
238 perf_put_cgroup(event);
239 event->cgrp = NULL;
240}
241
242static inline int is_cgroup_event(struct perf_event *event)
243{
244 return event->cgrp != NULL;
245}
246
247static inline u64 perf_cgroup_event_time(struct perf_event *event)
248{
249 struct perf_cgroup_info *t;
250
251 t = per_cpu_ptr(event->cgrp->info, event->cpu);
252 return t->time;
253}
254
255static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
256{
257 struct perf_cgroup_info *info;
258 u64 now;
259
260 now = perf_clock();
261
262 info = this_cpu_ptr(cgrp->info);
263
264 info->time += now - info->timestamp;
265 info->timestamp = now;
266}
267
268static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
269{
270 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
271 if (cgrp_out)
272 __update_cgrp_time(cgrp_out);
273}
274
275static inline void update_cgrp_time_from_event(struct perf_event *event)
276{
277 struct perf_cgroup *cgrp;
278
279 /*
280 * ensure we access cgroup data only when needed and
281 * when we know the cgroup is pinned (css_get)
282 */
283 if (!is_cgroup_event(event))
284 return;
285
286 cgrp = perf_cgroup_from_task(current);
287 /*
288 * Do not update time when cgroup is not active
289 */
290 if (cgrp == event->cgrp)
291 __update_cgrp_time(event->cgrp);
292}
293
294static inline void
295perf_cgroup_set_timestamp(struct task_struct *task,
296 struct perf_event_context *ctx)
297{
298 struct perf_cgroup *cgrp;
299 struct perf_cgroup_info *info;
300
301 /*
302 * ctx->lock held by caller
303 * ensure we do not access cgroup data
304 * unless we have the cgroup pinned (css_get)
305 */
306 if (!task || !ctx->nr_cgroups)
307 return;
308
309 cgrp = perf_cgroup_from_task(task);
310 info = this_cpu_ptr(cgrp->info);
311 info->timestamp = ctx->timestamp;
312}
313
314#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
315#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
316
317/*
318 * reschedule events based on the cgroup constraint of task.
319 *
320 * mode SWOUT : schedule out everything
321 * mode SWIN : schedule in based on cgroup for next
322 */
323void perf_cgroup_switch(struct task_struct *task, int mode)
324{
325 struct perf_cpu_context *cpuctx;
326 struct pmu *pmu;
327 unsigned long flags;
328
329 /*
 330 * disable interrupts to avoid getting nr_cgroup
331 * changes via __perf_event_disable(). Also
332 * avoids preemption.
333 */
334 local_irq_save(flags);
335
336 /*
337 * we reschedule only in the presence of cgroup
338 * constrained events.
339 */
340 rcu_read_lock();
341
342 list_for_each_entry_rcu(pmu, &pmus, entry) {
343
344 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
345
346 perf_pmu_disable(cpuctx->ctx.pmu);
347
348 /*
349 * perf_cgroup_events says at least one
350 * context on this CPU has cgroup events.
351 *
352 * ctx->nr_cgroups reports the number of cgroup
353 * events for a context.
354 */
355 if (cpuctx->ctx.nr_cgroups > 0) {
356
357 if (mode & PERF_CGROUP_SWOUT) {
358 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
359 /*
360 * must not be done before ctxswout due
361 * to event_filter_match() in event_sched_out()
362 */
363 cpuctx->cgrp = NULL;
364 }
365
366 if (mode & PERF_CGROUP_SWIN) {
367 WARN_ON_ONCE(cpuctx->cgrp);
368 /* set cgrp before ctxsw in to
369 * allow event_filter_match() to not
370 * have to pass task around
371 */
372 cpuctx->cgrp = perf_cgroup_from_task(task);
373 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
374 }
375 }
376
377 perf_pmu_enable(cpuctx->ctx.pmu);
378 }
379
380 rcu_read_unlock();
381
382 local_irq_restore(flags);
383}
384
385static inline void perf_cgroup_sched_out(struct task_struct *task)
386{
387 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
388}
389
390static inline void perf_cgroup_sched_in(struct task_struct *task)
391{
392 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
393}
394
395static inline int perf_cgroup_connect(int fd, struct perf_event *event,
396 struct perf_event_attr *attr,
397 struct perf_event *group_leader)
398{
399 struct perf_cgroup *cgrp;
400 struct cgroup_subsys_state *css;
401 struct file *file;
402 int ret = 0, fput_needed;
403
404 file = fget_light(fd, &fput_needed);
405 if (!file)
406 return -EBADF;
407
408 css = cgroup_css_from_dir(file, perf_subsys_id);
409 if (IS_ERR(css)) {
410 ret = PTR_ERR(css);
411 goto out;
412 }
413
414 cgrp = container_of(css, struct perf_cgroup, css);
415 event->cgrp = cgrp;
416
417 /* must be done before we fput() the file */
418 perf_get_cgroup(event);
419
420 /*
421 * all events in a group must monitor
422 * the same cgroup because a task belongs
423 * to only one perf cgroup at a time
424 */
425 if (group_leader && group_leader->cgrp != cgrp) {
426 perf_detach_cgroup(event);
427 ret = -EINVAL;
428 }
429out:
430 fput_light(file, fput_needed);
431 return ret;
432}
433
434static inline void
435perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
436{
437 struct perf_cgroup_info *t;
438 t = per_cpu_ptr(event->cgrp->info, event->cpu);
439 event->shadow_ctx_time = now - t->timestamp;
440}
441
442static inline void
443perf_cgroup_defer_enabled(struct perf_event *event)
444{
445 /*
446 * when the current task's perf cgroup does not match
447 * the event's, we need to remember to call the
 448 * perf_cgroup_mark_enabled() function the first time a task with
449 * a matching perf cgroup is scheduled in.
450 */
451 if (is_cgroup_event(event) && !perf_cgroup_match(event))
452 event->cgrp_defer_enabled = 1;
453}
454
455static inline void
456perf_cgroup_mark_enabled(struct perf_event *event,
457 struct perf_event_context *ctx)
458{
459 struct perf_event *sub;
460 u64 tstamp = perf_event_time(event);
461
462 if (!event->cgrp_defer_enabled)
463 return;
464
465 event->cgrp_defer_enabled = 0;
466
467 event->tstamp_enabled = tstamp - event->total_time_enabled;
468 list_for_each_entry(sub, &event->sibling_list, group_entry) {
469 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
470 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
471 sub->cgrp_defer_enabled = 0;
472 }
473 }
474}
475#else /* !CONFIG_CGROUP_PERF */
476
477static inline bool
478perf_cgroup_match(struct perf_event *event)
479{
480 return true;
481}
482
483static inline void perf_detach_cgroup(struct perf_event *event)
484{}
485
486static inline int is_cgroup_event(struct perf_event *event)
487{
488 return 0;
489}
490
491static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
492{
493 return 0;
494}
495
496static inline void update_cgrp_time_from_event(struct perf_event *event)
497{
498}
499
500static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
501{
502}
503
504static inline void perf_cgroup_sched_out(struct task_struct *task)
505{
506}
507
508static inline void perf_cgroup_sched_in(struct task_struct *task)
509{
510}
511
512static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
513 struct perf_event_attr *attr,
514 struct perf_event *group_leader)
515{
516 return -EINVAL;
517}
518
519static inline void
520perf_cgroup_set_timestamp(struct task_struct *task,
521 struct perf_event_context *ctx)
522{
523}
524
525void
526perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
527{
528}
529
530static inline void
531perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
532{
533}
534
535static inline u64 perf_cgroup_event_time(struct perf_event *event)
536{
537 return 0;
538}
539
540static inline void
541perf_cgroup_defer_enabled(struct perf_event *event)
542{
543}
544
545static inline void
546perf_cgroup_mark_enabled(struct perf_event *event,
547 struct perf_event_context *ctx)
548{
549}
550#endif
551
92void perf_pmu_disable(struct pmu *pmu) 552void perf_pmu_disable(struct pmu *pmu)
93{ 553{
94 int *count = this_cpu_ptr(pmu->pmu_disable_count); 554 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -254,7 +714,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
254 raw_spin_lock_irqsave(&ctx->lock, flags); 714 raw_spin_lock_irqsave(&ctx->lock, flags);
255 --ctx->pin_count; 715 --ctx->pin_count;
256 raw_spin_unlock_irqrestore(&ctx->lock, flags); 716 raw_spin_unlock_irqrestore(&ctx->lock, flags);
257 put_ctx(ctx);
258} 717}
259 718
260/* 719/*
@@ -271,6 +730,10 @@ static void update_context_time(struct perf_event_context *ctx)
271static u64 perf_event_time(struct perf_event *event) 730static u64 perf_event_time(struct perf_event *event)
272{ 731{
273 struct perf_event_context *ctx = event->ctx; 732 struct perf_event_context *ctx = event->ctx;
733
734 if (is_cgroup_event(event))
735 return perf_cgroup_event_time(event);
736
274 return ctx ? ctx->time : 0; 737 return ctx ? ctx->time : 0;
275} 738}
276 739
@@ -285,9 +748,20 @@ static void update_event_times(struct perf_event *event)
285 if (event->state < PERF_EVENT_STATE_INACTIVE || 748 if (event->state < PERF_EVENT_STATE_INACTIVE ||
286 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 749 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
287 return; 750 return;
288 751 /*
289 if (ctx->is_active) 752 * in cgroup mode, time_enabled represents
753 * the time the event was enabled AND active
754 * tasks were in the monitored cgroup. This is
755 * independent of the activity of the context as
756 * there may be a mix of cgroup and non-cgroup events.
757 *
758 * That is why we treat cgroup events differently
759 * here.
760 */
761 if (is_cgroup_event(event))
290 run_end = perf_event_time(event); 762 run_end = perf_event_time(event);
763 else if (ctx->is_active)
764 run_end = ctx->time;
291 else 765 else
292 run_end = event->tstamp_stopped; 766 run_end = event->tstamp_stopped;
293 767
@@ -299,6 +773,7 @@ static void update_event_times(struct perf_event *event)
299 run_end = perf_event_time(event); 773 run_end = perf_event_time(event);
300 774
301 event->total_time_running = run_end - event->tstamp_running; 775 event->total_time_running = run_end - event->tstamp_running;
776
302} 777}
303 778
304/* 779/*
@@ -347,6 +822,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
347 list_add_tail(&event->group_entry, list); 822 list_add_tail(&event->group_entry, list);
348 } 823 }
349 824
825 if (is_cgroup_event(event))
826 ctx->nr_cgroups++;
827
350 list_add_rcu(&event->event_entry, &ctx->event_list); 828 list_add_rcu(&event->event_entry, &ctx->event_list);
351 if (!ctx->nr_events) 829 if (!ctx->nr_events)
352 perf_pmu_rotate_start(ctx->pmu); 830 perf_pmu_rotate_start(ctx->pmu);
@@ -465,6 +943,7 @@ static void perf_group_attach(struct perf_event *event)
465static void 943static void
466list_del_event(struct perf_event *event, struct perf_event_context *ctx) 944list_del_event(struct perf_event *event, struct perf_event_context *ctx)
467{ 945{
946 struct perf_cpu_context *cpuctx;
468 /* 947 /*
469 * We can have double detach due to exit/hot-unplug + close. 948 * We can have double detach due to exit/hot-unplug + close.
470 */ 949 */
@@ -473,6 +952,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
473 952
474 event->attach_state &= ~PERF_ATTACH_CONTEXT; 953 event->attach_state &= ~PERF_ATTACH_CONTEXT;
475 954
955 if (is_cgroup_event(event)) {
956 ctx->nr_cgroups--;
957 cpuctx = __get_cpu_context(ctx);
958 /*
959 * if there are no more cgroup events
 960 * then clear cgrp to avoid a stale pointer
961 * in update_cgrp_time_from_cpuctx()
962 */
963 if (!ctx->nr_cgroups)
964 cpuctx->cgrp = NULL;
965 }
966
476 ctx->nr_events--; 967 ctx->nr_events--;
477 if (event->attr.inherit_stat) 968 if (event->attr.inherit_stat)
478 ctx->nr_stat--; 969 ctx->nr_stat--;
@@ -544,7 +1035,8 @@ out:
544static inline int 1035static inline int
545event_filter_match(struct perf_event *event) 1036event_filter_match(struct perf_event *event)
546{ 1037{
547 return event->cpu == -1 || event->cpu == smp_processor_id(); 1038 return (event->cpu == -1 || event->cpu == smp_processor_id())
1039 && perf_cgroup_match(event);
548} 1040}
549 1041
550static void 1042static void
@@ -562,7 +1054,7 @@ event_sched_out(struct perf_event *event,
562 */ 1054 */
563 if (event->state == PERF_EVENT_STATE_INACTIVE 1055 if (event->state == PERF_EVENT_STATE_INACTIVE
564 && !event_filter_match(event)) { 1056 && !event_filter_match(event)) {
565 delta = ctx->time - event->tstamp_stopped; 1057 delta = tstamp - event->tstamp_stopped;
566 event->tstamp_running += delta; 1058 event->tstamp_running += delta;
567 event->tstamp_stopped = tstamp; 1059 event->tstamp_stopped = tstamp;
568 } 1060 }
@@ -606,47 +1098,30 @@ group_sched_out(struct perf_event *group_event,
606 cpuctx->exclusive = 0; 1098 cpuctx->exclusive = 0;
607} 1099}
608 1100
609static inline struct perf_cpu_context *
610__get_cpu_context(struct perf_event_context *ctx)
611{
612 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
613}
614
615/* 1101/*
616 * Cross CPU call to remove a performance event 1102 * Cross CPU call to remove a performance event
617 * 1103 *
618 * We disable the event on the hardware level first. After that we 1104 * We disable the event on the hardware level first. After that we
619 * remove it from the context list. 1105 * remove it from the context list.
620 */ 1106 */
621static void __perf_event_remove_from_context(void *info) 1107static int __perf_remove_from_context(void *info)
622{ 1108{
623 struct perf_event *event = info; 1109 struct perf_event *event = info;
624 struct perf_event_context *ctx = event->ctx; 1110 struct perf_event_context *ctx = event->ctx;
625 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1111 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
626 1112
627 /*
628 * If this is a task context, we need to check whether it is
629 * the current task context of this cpu. If not it has been
630 * scheduled out before the smp call arrived.
631 */
632 if (ctx->task && cpuctx->task_ctx != ctx)
633 return;
634
635 raw_spin_lock(&ctx->lock); 1113 raw_spin_lock(&ctx->lock);
636
637 event_sched_out(event, cpuctx, ctx); 1114 event_sched_out(event, cpuctx, ctx);
638
639 list_del_event(event, ctx); 1115 list_del_event(event, ctx);
640
641 raw_spin_unlock(&ctx->lock); 1116 raw_spin_unlock(&ctx->lock);
1117
1118 return 0;
642} 1119}
643 1120
644 1121
645/* 1122/*
646 * Remove the event from a task's (or a CPU's) list of events. 1123 * Remove the event from a task's (or a CPU's) list of events.
647 * 1124 *
648 * Must be called with ctx->mutex held.
649 *
650 * CPU events are removed with a smp call. For task events we only 1125 * CPU events are removed with a smp call. For task events we only
651 * call when the task is on a CPU. 1126 * call when the task is on a CPU.
652 * 1127 *
@@ -657,49 +1132,48 @@ static void __perf_event_remove_from_context(void *info)
657 * When called from perf_event_exit_task, it's OK because the 1132 * When called from perf_event_exit_task, it's OK because the
658 * context has been detached from its task. 1133 * context has been detached from its task.
659 */ 1134 */
660static void perf_event_remove_from_context(struct perf_event *event) 1135static void perf_remove_from_context(struct perf_event *event)
661{ 1136{
662 struct perf_event_context *ctx = event->ctx; 1137 struct perf_event_context *ctx = event->ctx;
663 struct task_struct *task = ctx->task; 1138 struct task_struct *task = ctx->task;
664 1139
1140 lockdep_assert_held(&ctx->mutex);
1141
665 if (!task) { 1142 if (!task) {
666 /* 1143 /*
667 * Per cpu events are removed via an smp call and 1144 * Per cpu events are removed via an smp call and
668 * the removal is always successful. 1145 * the removal is always successful.
669 */ 1146 */
670 smp_call_function_single(event->cpu, 1147 cpu_function_call(event->cpu, __perf_remove_from_context, event);
671 __perf_event_remove_from_context,
672 event, 1);
673 return; 1148 return;
674 } 1149 }
675 1150
676retry: 1151retry:
677 task_oncpu_function_call(task, __perf_event_remove_from_context, 1152 if (!task_function_call(task, __perf_remove_from_context, event))
678 event); 1153 return;
679 1154
680 raw_spin_lock_irq(&ctx->lock); 1155 raw_spin_lock_irq(&ctx->lock);
681 /* 1156 /*
682 * If the context is active we need to retry the smp call. 1157 * If we failed to find a running task, but find the context active now
1158 * that we've acquired the ctx->lock, retry.
683 */ 1159 */
684 if (ctx->nr_active && !list_empty(&event->group_entry)) { 1160 if (ctx->is_active) {
685 raw_spin_unlock_irq(&ctx->lock); 1161 raw_spin_unlock_irq(&ctx->lock);
686 goto retry; 1162 goto retry;
687 } 1163 }
688 1164
689 /* 1165 /*
 690 * The lock prevents that this context is scheduled in so we 1166 * Since the task isn't running, it's safe to remove the event; our
691 * can remove the event safely, if the call above did not 1167 * holding the ctx->lock ensures the task won't get scheduled in.
692 * succeed.
693 */ 1168 */
694 if (!list_empty(&event->group_entry)) 1169 list_del_event(event, ctx);
695 list_del_event(event, ctx);
696 raw_spin_unlock_irq(&ctx->lock); 1170 raw_spin_unlock_irq(&ctx->lock);
697} 1171}
698 1172
699/* 1173/*
700 * Cross CPU call to disable a performance event 1174 * Cross CPU call to disable a performance event
701 */ 1175 */
702static void __perf_event_disable(void *info) 1176static int __perf_event_disable(void *info)
703{ 1177{
704 struct perf_event *event = info; 1178 struct perf_event *event = info;
705 struct perf_event_context *ctx = event->ctx; 1179 struct perf_event_context *ctx = event->ctx;
@@ -708,9 +1182,12 @@ static void __perf_event_disable(void *info)
708 /* 1182 /*
709 * If this is a per-task event, need to check whether this 1183 * If this is a per-task event, need to check whether this
710 * event's task is the current task on this cpu. 1184 * event's task is the current task on this cpu.
1185 *
1186 * Can trigger due to concurrent perf_event_context_sched_out()
1187 * flipping contexts around.
711 */ 1188 */
712 if (ctx->task && cpuctx->task_ctx != ctx) 1189 if (ctx->task && cpuctx->task_ctx != ctx)
713 return; 1190 return -EINVAL;
714 1191
715 raw_spin_lock(&ctx->lock); 1192 raw_spin_lock(&ctx->lock);
716 1193
@@ -720,6 +1197,7 @@ static void __perf_event_disable(void *info)
720 */ 1197 */
721 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1198 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
722 update_context_time(ctx); 1199 update_context_time(ctx);
1200 update_cgrp_time_from_event(event);
723 update_group_times(event); 1201 update_group_times(event);
724 if (event == event->group_leader) 1202 if (event == event->group_leader)
725 group_sched_out(event, cpuctx, ctx); 1203 group_sched_out(event, cpuctx, ctx);
@@ -729,6 +1207,8 @@ static void __perf_event_disable(void *info)
729 } 1207 }
730 1208
731 raw_spin_unlock(&ctx->lock); 1209 raw_spin_unlock(&ctx->lock);
1210
1211 return 0;
732} 1212}
733 1213
734/* 1214/*
@@ -753,13 +1233,13 @@ void perf_event_disable(struct perf_event *event)
753 /* 1233 /*
754 * Disable the event on the cpu that it's on 1234 * Disable the event on the cpu that it's on
755 */ 1235 */
756 smp_call_function_single(event->cpu, __perf_event_disable, 1236 cpu_function_call(event->cpu, __perf_event_disable, event);
757 event, 1);
758 return; 1237 return;
759 } 1238 }
760 1239
761retry: 1240retry:
762 task_oncpu_function_call(task, __perf_event_disable, event); 1241 if (!task_function_call(task, __perf_event_disable, event))
1242 return;
763 1243
764 raw_spin_lock_irq(&ctx->lock); 1244 raw_spin_lock_irq(&ctx->lock);
765 /* 1245 /*
@@ -767,6 +1247,11 @@ retry:
767 */ 1247 */
768 if (event->state == PERF_EVENT_STATE_ACTIVE) { 1248 if (event->state == PERF_EVENT_STATE_ACTIVE) {
769 raw_spin_unlock_irq(&ctx->lock); 1249 raw_spin_unlock_irq(&ctx->lock);
1250 /*
1251 * Reload the task pointer, it might have been changed by
1252 * a concurrent perf_event_context_sched_out().
1253 */
1254 task = ctx->task;
770 goto retry; 1255 goto retry;
771 } 1256 }
772 1257
@@ -778,10 +1263,48 @@ retry:
778 update_group_times(event); 1263 update_group_times(event);
779 event->state = PERF_EVENT_STATE_OFF; 1264 event->state = PERF_EVENT_STATE_OFF;
780 } 1265 }
781
782 raw_spin_unlock_irq(&ctx->lock); 1266 raw_spin_unlock_irq(&ctx->lock);
783} 1267}
784 1268
1269static void perf_set_shadow_time(struct perf_event *event,
1270 struct perf_event_context *ctx,
1271 u64 tstamp)
1272{
1273 /*
1274 * use the correct time source for the time snapshot
1275 *
1276 * We could get by without this by leveraging the
1277 * fact that to get to this function, the caller
1278 * has most likely already called update_context_time()
 1279 * and update_cgrp_time_xx() and thus both timestamps
 1280 * are identical (or very close). Given that tstamp is
 1281 * already adjusted for cgroup, we could say that:
1282 * tstamp - ctx->timestamp
1283 * is equivalent to
1284 * tstamp - cgrp->timestamp.
1285 *
1286 * Then, in perf_output_read(), the calculation would
1287 * work with no changes because:
1288 * - event is guaranteed scheduled in
1289 * - no scheduled out in between
1290 * - thus the timestamp would be the same
1291 *
1292 * But this is a bit hairy.
1293 *
1294 * So instead, we have an explicit cgroup call to remain
 1295 * within the time source all along. We believe it
1296 * is cleaner and simpler to understand.
1297 */
1298 if (is_cgroup_event(event))
1299 perf_cgroup_set_shadow_time(event, tstamp);
1300 else
1301 event->shadow_ctx_time = tstamp - ctx->timestamp;
1302}
1303
1304#define MAX_INTERRUPTS (~0ULL)
1305
1306static void perf_log_throttle(struct perf_event *event, int enable);
1307
785static int 1308static int
786event_sched_in(struct perf_event *event, 1309event_sched_in(struct perf_event *event,
787 struct perf_cpu_context *cpuctx, 1310 struct perf_cpu_context *cpuctx,
@@ -794,6 +1317,17 @@ event_sched_in(struct perf_event *event,
794 1317
795 event->state = PERF_EVENT_STATE_ACTIVE; 1318 event->state = PERF_EVENT_STATE_ACTIVE;
796 event->oncpu = smp_processor_id(); 1319 event->oncpu = smp_processor_id();
1320
1321 /*
 1322 * Unthrottle events: since we just scheduled in, we might have missed
 1323 * several ticks already; also, for a heavily scheduling task there is
 1324 * little guarantee it'll get a tick in a timely manner.
1325 */
1326 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1327 perf_log_throttle(event, 1);
1328 event->hw.interrupts = 0;
1329 }
1330
797 /* 1331 /*
798 * The new state must be visible before we turn it on in the hardware: 1332 * The new state must be visible before we turn it on in the hardware:
799 */ 1333 */
@@ -807,7 +1341,7 @@ event_sched_in(struct perf_event *event,
807 1341
808 event->tstamp_running += tstamp - event->tstamp_stopped; 1342 event->tstamp_running += tstamp - event->tstamp_stopped;
809 1343
810 event->shadow_ctx_time = tstamp - ctx->timestamp; 1344 perf_set_shadow_time(event, ctx, tstamp);
811 1345
812 if (!is_software_event(event)) 1346 if (!is_software_event(event))
813 cpuctx->active_oncpu++; 1347 cpuctx->active_oncpu++;
@@ -928,12 +1462,15 @@ static void add_event_to_ctx(struct perf_event *event,
928 event->tstamp_stopped = tstamp; 1462 event->tstamp_stopped = tstamp;
929} 1463}
930 1464
1465static void perf_event_context_sched_in(struct perf_event_context *ctx,
1466 struct task_struct *tsk);
1467
931/* 1468/*
932 * Cross CPU call to install and enable a performance event 1469 * Cross CPU call to install and enable a performance event
933 * 1470 *
934 * Must be called with ctx->mutex held 1471 * Must be called with ctx->mutex held
935 */ 1472 */
936static void __perf_install_in_context(void *info) 1473static int __perf_install_in_context(void *info)
937{ 1474{
938 struct perf_event *event = info; 1475 struct perf_event *event = info;
939 struct perf_event_context *ctx = event->ctx; 1476 struct perf_event_context *ctx = event->ctx;
@@ -942,21 +1479,22 @@ static void __perf_install_in_context(void *info)
942 int err; 1479 int err;
943 1480
944 /* 1481 /*
945 * If this is a task context, we need to check whether it is 1482 * In case we're installing a new context to an already running task,
946 * the current task context of this cpu. If not it has been 1483 * could also happen before perf_event_task_sched_in() on architectures
947 * scheduled out before the smp call arrived. 1484 * which do context switches with IRQs enabled.
948 * Or possibly this is the right context but it isn't
949 * on this cpu because it had no events.
950 */ 1485 */
951 if (ctx->task && cpuctx->task_ctx != ctx) { 1486 if (ctx->task && !cpuctx->task_ctx)
952 if (cpuctx->task_ctx || ctx->task != current) 1487 perf_event_context_sched_in(ctx, ctx->task);
953 return;
954 cpuctx->task_ctx = ctx;
955 }
956 1488
957 raw_spin_lock(&ctx->lock); 1489 raw_spin_lock(&ctx->lock);
958 ctx->is_active = 1; 1490 ctx->is_active = 1;
959 update_context_time(ctx); 1491 update_context_time(ctx);
1492 /*
1493 * update cgrp time only if current cgrp
1494 * matches event->cgrp. Must be done before
1495 * calling add_event_to_ctx()
1496 */
1497 update_cgrp_time_from_event(event);
960 1498
961 add_event_to_ctx(event, ctx); 1499 add_event_to_ctx(event, ctx);
962 1500
@@ -997,6 +1535,8 @@ static void __perf_install_in_context(void *info)
997 1535
998unlock: 1536unlock:
999 raw_spin_unlock(&ctx->lock); 1537 raw_spin_unlock(&ctx->lock);
1538
1539 return 0;
1000} 1540}
1001 1541
1002/* 1542/*
@@ -1008,8 +1548,6 @@ unlock:
1008 * If the event is attached to a task which is on a CPU we use a smp 1548 * If the event is attached to a task which is on a CPU we use a smp
1009 * call to enable it in the task context. The task might have been 1549 * call to enable it in the task context. The task might have been
1010 * scheduled away, but we check this in the smp call again. 1550 * scheduled away, but we check this in the smp call again.
1011 *
1012 * Must be called with ctx->mutex held.
1013 */ 1551 */
1014static void 1552static void
1015perf_install_in_context(struct perf_event_context *ctx, 1553perf_install_in_context(struct perf_event_context *ctx,
@@ -1018,6 +1556,8 @@ perf_install_in_context(struct perf_event_context *ctx,
1018{ 1556{
1019 struct task_struct *task = ctx->task; 1557 struct task_struct *task = ctx->task;
1020 1558
1559 lockdep_assert_held(&ctx->mutex);
1560
1021 event->ctx = ctx; 1561 event->ctx = ctx;
1022 1562
1023 if (!task) { 1563 if (!task) {
@@ -1025,31 +1565,29 @@ perf_install_in_context(struct perf_event_context *ctx,
1025 * Per cpu events are installed via an smp call and 1565 * Per cpu events are installed via an smp call and
1026 * the install is always successful. 1566 * the install is always successful.
1027 */ 1567 */
1028 smp_call_function_single(cpu, __perf_install_in_context, 1568 cpu_function_call(cpu, __perf_install_in_context, event);
1029 event, 1);
1030 return; 1569 return;
1031 } 1570 }
1032 1571
1033retry: 1572retry:
1034 task_oncpu_function_call(task, __perf_install_in_context, 1573 if (!task_function_call(task, __perf_install_in_context, event))
1035 event); 1574 return;
1036 1575
1037 raw_spin_lock_irq(&ctx->lock); 1576 raw_spin_lock_irq(&ctx->lock);
1038 /* 1577 /*
1039 * we need to retry the smp call. 1578 * If we failed to find a running task, but find the context active now
1579 * that we've acquired the ctx->lock, retry.
1040 */ 1580 */
1041 if (ctx->is_active && list_empty(&event->group_entry)) { 1581 if (ctx->is_active) {
1042 raw_spin_unlock_irq(&ctx->lock); 1582 raw_spin_unlock_irq(&ctx->lock);
1043 goto retry; 1583 goto retry;
1044 } 1584 }
1045 1585
1046 /* 1586 /*
 1047 * The lock prevents that this context is scheduled in so we 1587 * Since the task isn't running, it's safe to add the event; our holding
1048 * can add the event safely, if it the call above did not 1588 * the ctx->lock ensures the task won't get scheduled in.
1049 * succeed.
1050 */ 1589 */
1051 if (list_empty(&event->group_entry)) 1590 add_event_to_ctx(event, ctx);
1052 add_event_to_ctx(event, ctx);
1053 raw_spin_unlock_irq(&ctx->lock); 1591 raw_spin_unlock_irq(&ctx->lock);
1054} 1592}
1055 1593
@@ -1078,7 +1616,7 @@ static void __perf_event_mark_enabled(struct perf_event *event,
1078/* 1616/*
1079 * Cross CPU call to enable a performance event 1617 * Cross CPU call to enable a performance event
1080 */ 1618 */
1081static void __perf_event_enable(void *info) 1619static int __perf_event_enable(void *info)
1082{ 1620{
1083 struct perf_event *event = info; 1621 struct perf_event *event = info;
1084 struct perf_event_context *ctx = event->ctx; 1622 struct perf_event_context *ctx = event->ctx;
@@ -1086,26 +1624,27 @@ static void __perf_event_enable(void *info)
1086 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); 1624 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1087 int err; 1625 int err;
1088 1626
1089 /* 1627 if (WARN_ON_ONCE(!ctx->is_active))
1090 * If this is a per-task event, need to check whether this 1628 return -EINVAL;
1091 * event's task is the current task on this cpu.
1092 */
1093 if (ctx->task && cpuctx->task_ctx != ctx) {
1094 if (cpuctx->task_ctx || ctx->task != current)
1095 return;
1096 cpuctx->task_ctx = ctx;
1097 }
1098 1629
1099 raw_spin_lock(&ctx->lock); 1630 raw_spin_lock(&ctx->lock);
1100 ctx->is_active = 1;
1101 update_context_time(ctx); 1631 update_context_time(ctx);
1102 1632
1103 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1633 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1104 goto unlock; 1634 goto unlock;
1635
1636 /*
1637 * set current task's cgroup time reference point
1638 */
1639 perf_cgroup_set_timestamp(current, ctx);
1640
1105 __perf_event_mark_enabled(event, ctx); 1641 __perf_event_mark_enabled(event, ctx);
1106 1642
1107 if (!event_filter_match(event)) 1643 if (!event_filter_match(event)) {
1644 if (is_cgroup_event(event))
1645 perf_cgroup_defer_enabled(event);
1108 goto unlock; 1646 goto unlock;
1647 }
1109 1648
1110 /* 1649 /*
1111 * If the event is in a group and isn't the group leader, 1650 * If the event is in a group and isn't the group leader,
@@ -1138,6 +1677,8 @@ static void __perf_event_enable(void *info)
1138 1677
1139unlock: 1678unlock:
1140 raw_spin_unlock(&ctx->lock); 1679 raw_spin_unlock(&ctx->lock);
1680
1681 return 0;
1141} 1682}
1142 1683
1143/* 1684/*
@@ -1158,8 +1699,7 @@ void perf_event_enable(struct perf_event *event)
1158 /* 1699 /*
1159 * Enable the event on the cpu that it's on 1700 * Enable the event on the cpu that it's on
1160 */ 1701 */
1161 smp_call_function_single(event->cpu, __perf_event_enable, 1702 cpu_function_call(event->cpu, __perf_event_enable, event);
1162 event, 1);
1163 return; 1703 return;
1164 } 1704 }
1165 1705
@@ -1178,8 +1718,15 @@ void perf_event_enable(struct perf_event *event)
1178 event->state = PERF_EVENT_STATE_OFF; 1718 event->state = PERF_EVENT_STATE_OFF;
1179 1719
1180retry: 1720retry:
1721 if (!ctx->is_active) {
1722 __perf_event_mark_enabled(event, ctx);
1723 goto out;
1724 }
1725
1181 raw_spin_unlock_irq(&ctx->lock); 1726 raw_spin_unlock_irq(&ctx->lock);
1182 task_oncpu_function_call(task, __perf_event_enable, event); 1727
1728 if (!task_function_call(task, __perf_event_enable, event))
1729 return;
1183 1730
1184 raw_spin_lock_irq(&ctx->lock); 1731 raw_spin_lock_irq(&ctx->lock);
1185 1732
@@ -1187,15 +1734,14 @@ retry:
1187 * If the context is active and the event is still off, 1734 * If the context is active and the event is still off,
1188 * we need to retry the cross-call. 1735 * we need to retry the cross-call.
1189 */ 1736 */
1190 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) 1737 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
1738 /*
1739 * task could have been flipped by a concurrent
1740 * perf_event_context_sched_out()
1741 */
1742 task = ctx->task;
1191 goto retry; 1743 goto retry;
1192 1744 }
1193 /*
1194 * Since we have the lock this context can't be scheduled
1195 * in, so we can change the state safely.
1196 */
1197 if (event->state == PERF_EVENT_STATE_OFF)
1198 __perf_event_mark_enabled(event, ctx);
1199 1745
1200out: 1746out:
1201 raw_spin_unlock_irq(&ctx->lock); 1747 raw_spin_unlock_irq(&ctx->lock);
@@ -1227,6 +1773,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1227 if (likely(!ctx->nr_events)) 1773 if (likely(!ctx->nr_events))
1228 goto out; 1774 goto out;
1229 update_context_time(ctx); 1775 update_context_time(ctx);
1776 update_cgrp_time_from_cpuctx(cpuctx);
1230 1777
1231 if (!ctx->nr_active) 1778 if (!ctx->nr_active)
1232 goto out; 1779 goto out;
@@ -1339,8 +1886,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1339 } 1886 }
1340} 1887}
1341 1888
1342void perf_event_context_sched_out(struct task_struct *task, int ctxn, 1889static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
1343 struct task_struct *next) 1890 struct task_struct *next)
1344{ 1891{
1345 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 1892 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
1346 struct perf_event_context *next_ctx; 1893 struct perf_event_context *next_ctx;
@@ -1416,6 +1963,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
1416 1963
1417 for_each_task_context_nr(ctxn) 1964 for_each_task_context_nr(ctxn)
1418 perf_event_context_sched_out(task, ctxn, next); 1965 perf_event_context_sched_out(task, ctxn, next);
1966
1967 /*
1968 * if cgroup events exist on this CPU, then we need
1969 * to check if we have to switch out PMU state.
 1970 * cgroup events are system-wide mode only
1971 */
1972 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1973 perf_cgroup_sched_out(task);
1419} 1974}
1420 1975
1421static void task_ctx_sched_out(struct perf_event_context *ctx, 1976static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1454,6 +2009,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1454 if (!event_filter_match(event)) 2009 if (!event_filter_match(event))
1455 continue; 2010 continue;
1456 2011
2012 /* may need to reset tstamp_enabled */
2013 if (is_cgroup_event(event))
2014 perf_cgroup_mark_enabled(event, ctx);
2015
1457 if (group_can_go_on(event, cpuctx, 1)) 2016 if (group_can_go_on(event, cpuctx, 1))
1458 group_sched_in(event, cpuctx, ctx); 2017 group_sched_in(event, cpuctx, ctx);
1459 2018
@@ -1486,6 +2045,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1486 if (!event_filter_match(event)) 2045 if (!event_filter_match(event))
1487 continue; 2046 continue;
1488 2047
2048 /* may need to reset tstamp_enabled */
2049 if (is_cgroup_event(event))
2050 perf_cgroup_mark_enabled(event, ctx);
2051
1489 if (group_can_go_on(event, cpuctx, can_add_hw)) { 2052 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1490 if (group_sched_in(event, cpuctx, ctx)) 2053 if (group_sched_in(event, cpuctx, ctx))
1491 can_add_hw = 0; 2054 can_add_hw = 0;
@@ -1496,15 +2059,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1496static void 2059static void
1497ctx_sched_in(struct perf_event_context *ctx, 2060ctx_sched_in(struct perf_event_context *ctx,
1498 struct perf_cpu_context *cpuctx, 2061 struct perf_cpu_context *cpuctx,
1499 enum event_type_t event_type) 2062 enum event_type_t event_type,
2063 struct task_struct *task)
1500{ 2064{
2065 u64 now;
2066
1501 raw_spin_lock(&ctx->lock); 2067 raw_spin_lock(&ctx->lock);
1502 ctx->is_active = 1; 2068 ctx->is_active = 1;
1503 if (likely(!ctx->nr_events)) 2069 if (likely(!ctx->nr_events))
1504 goto out; 2070 goto out;
1505 2071
1506 ctx->timestamp = perf_clock(); 2072 now = perf_clock();
1507 2073 ctx->timestamp = now;
2074 perf_cgroup_set_timestamp(task, ctx);
1508 /* 2075 /*
1509 * First go through the list and put on any pinned groups 2076 * First go through the list and put on any pinned groups
1510 * in order to give them the best chance of going on. 2077 * in order to give them the best chance of going on.
@@ -1521,11 +2088,12 @@ out:
1521} 2088}
1522 2089
1523static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2090static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1524 enum event_type_t event_type) 2091 enum event_type_t event_type,
2092 struct task_struct *task)
1525{ 2093{
1526 struct perf_event_context *ctx = &cpuctx->ctx; 2094 struct perf_event_context *ctx = &cpuctx->ctx;
1527 2095
1528 ctx_sched_in(ctx, cpuctx, event_type); 2096 ctx_sched_in(ctx, cpuctx, event_type, task);
1529} 2097}
1530 2098
1531static void task_ctx_sched_in(struct perf_event_context *ctx, 2099static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1533,15 +2101,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
1533{ 2101{
1534 struct perf_cpu_context *cpuctx; 2102 struct perf_cpu_context *cpuctx;
1535 2103
1536 cpuctx = __get_cpu_context(ctx); 2104 cpuctx = __get_cpu_context(ctx);
1537 if (cpuctx->task_ctx == ctx) 2105 if (cpuctx->task_ctx == ctx)
1538 return; 2106 return;
1539 2107
1540 ctx_sched_in(ctx, cpuctx, event_type); 2108 ctx_sched_in(ctx, cpuctx, event_type, NULL);
1541 cpuctx->task_ctx = ctx; 2109 cpuctx->task_ctx = ctx;
1542} 2110}
1543 2111
1544void perf_event_context_sched_in(struct perf_event_context *ctx) 2112static void perf_event_context_sched_in(struct perf_event_context *ctx,
2113 struct task_struct *task)
1545{ 2114{
1546 struct perf_cpu_context *cpuctx; 2115 struct perf_cpu_context *cpuctx;
1547 2116
@@ -1557,9 +2126,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
1557 */ 2126 */
1558 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2127 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1559 2128
1560 ctx_sched_in(ctx, cpuctx, EVENT_PINNED); 2129 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1561 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2130 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1562 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 2131 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1563 2132
1564 cpuctx->task_ctx = ctx; 2133 cpuctx->task_ctx = ctx;
1565 2134
@@ -1592,14 +2161,17 @@ void __perf_event_task_sched_in(struct task_struct *task)
1592 if (likely(!ctx)) 2161 if (likely(!ctx))
1593 continue; 2162 continue;
1594 2163
1595 perf_event_context_sched_in(ctx); 2164 perf_event_context_sched_in(ctx, task);
1596 } 2165 }
2166 /*
2167 * if cgroup events exist on this CPU, then we need
2168 * to check if we have to switch in PMU state.
 2169 * cgroup events are system-wide mode only
2170 */
2171 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2172 perf_cgroup_sched_in(task);
1597} 2173}
1598 2174
1599#define MAX_INTERRUPTS (~0ULL)
1600
1601static void perf_log_throttle(struct perf_event *event, int enable);
1602
1603static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2175static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1604{ 2176{
1605 u64 frequency = event->attr.sample_freq; 2177 u64 frequency = event->attr.sample_freq;
@@ -1627,7 +2199,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1627 * Reduce accuracy by one bit such that @a and @b converge 2199 * Reduce accuracy by one bit such that @a and @b converge
1628 * to a similar magnitude. 2200 * to a similar magnitude.
1629 */ 2201 */
1630#define REDUCE_FLS(a, b) \ 2202#define REDUCE_FLS(a, b) \
1631do { \ 2203do { \
1632 if (a##_fls > b##_fls) { \ 2204 if (a##_fls > b##_fls) { \
1633 a >>= 1; \ 2205 a >>= 1; \
@@ -1797,7 +2369,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1797 if (ctx) 2369 if (ctx)
1798 rotate_ctx(ctx); 2370 rotate_ctx(ctx);
1799 2371
1800 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2372 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
1801 if (ctx) 2373 if (ctx)
1802 task_ctx_sched_in(ctx, EVENT_FLEXIBLE); 2374 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1803 2375
@@ -1852,6 +2424,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1852 if (!ctx || !ctx->nr_events) 2424 if (!ctx || !ctx->nr_events)
1853 goto out; 2425 goto out;
1854 2426
2427 /*
2428 * We must ctxsw out cgroup events to avoid conflict
 2429 * when invoking perf_event_context_sched_in() later on
2430 * in this function. Otherwise we end up trying to
2431 * ctxswin cgroup events which are already scheduled
2432 * in.
2433 */
2434 perf_cgroup_sched_out(current);
1855 task_ctx_sched_out(ctx, EVENT_ALL); 2435 task_ctx_sched_out(ctx, EVENT_ALL);
1856 2436
1857 raw_spin_lock(&ctx->lock); 2437 raw_spin_lock(&ctx->lock);
@@ -1876,7 +2456,10 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1876 2456
1877 raw_spin_unlock(&ctx->lock); 2457 raw_spin_unlock(&ctx->lock);
1878 2458
1879 perf_event_context_sched_in(ctx); 2459 /*
2460 * Also calls ctxswin for cgroup events, if any:
2461 */
2462 perf_event_context_sched_in(ctx, ctx->task);
1880out: 2463out:
1881 local_irq_restore(flags); 2464 local_irq_restore(flags);
1882} 2465}
@@ -1901,8 +2484,10 @@ static void __perf_event_read(void *info)
1901 return; 2484 return;
1902 2485
1903 raw_spin_lock(&ctx->lock); 2486 raw_spin_lock(&ctx->lock);
1904 if (ctx->is_active) 2487 if (ctx->is_active) {
1905 update_context_time(ctx); 2488 update_context_time(ctx);
2489 update_cgrp_time_from_event(event);
2490 }
1906 update_event_times(event); 2491 update_event_times(event);
1907 if (event->state == PERF_EVENT_STATE_ACTIVE) 2492 if (event->state == PERF_EVENT_STATE_ACTIVE)
1908 event->pmu->read(event); 2493 event->pmu->read(event);
@@ -1933,8 +2518,10 @@ static u64 perf_event_read(struct perf_event *event)
1933 * (e.g., thread is blocked), in that case 2518 * (e.g., thread is blocked), in that case
1934 * we cannot update context time 2519 * we cannot update context time
1935 */ 2520 */
1936 if (ctx->is_active) 2521 if (ctx->is_active) {
1937 update_context_time(ctx); 2522 update_context_time(ctx);
2523 update_cgrp_time_from_event(event);
2524 }
1938 update_event_times(event); 2525 update_event_times(event);
1939 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2526 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1940 } 2527 }
@@ -2213,6 +2800,9 @@ errout:
2213 2800
2214} 2801}
2215 2802
2803/*
2804 * Returns a matching context with refcount and pincount.
2805 */
2216static struct perf_event_context * 2806static struct perf_event_context *
2217find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 2807find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2218{ 2808{
@@ -2237,6 +2827,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2237 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 2827 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2238 ctx = &cpuctx->ctx; 2828 ctx = &cpuctx->ctx;
2239 get_ctx(ctx); 2829 get_ctx(ctx);
2830 ++ctx->pin_count;
2240 2831
2241 return ctx; 2832 return ctx;
2242 } 2833 }
@@ -2250,6 +2841,7 @@ retry:
2250 ctx = perf_lock_task_context(task, ctxn, &flags); 2841 ctx = perf_lock_task_context(task, ctxn, &flags);
2251 if (ctx) { 2842 if (ctx) {
2252 unclone_ctx(ctx); 2843 unclone_ctx(ctx);
2844 ++ctx->pin_count;
2253 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2845 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2254 } 2846 }
2255 2847
@@ -2271,8 +2863,10 @@ retry:
2271 err = -ESRCH; 2863 err = -ESRCH;
2272 else if (task->perf_event_ctxp[ctxn]) 2864 else if (task->perf_event_ctxp[ctxn])
2273 err = -EAGAIN; 2865 err = -EAGAIN;
2274 else 2866 else {
2867 ++ctx->pin_count;
2275 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); 2868 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
2869 }
2276 mutex_unlock(&task->perf_event_mutex); 2870 mutex_unlock(&task->perf_event_mutex);
2277 2871
2278 if (unlikely(err)) { 2872 if (unlikely(err)) {
@@ -2312,7 +2906,7 @@ static void free_event(struct perf_event *event)
2312 2906
2313 if (!event->parent) { 2907 if (!event->parent) {
2314 if (event->attach_state & PERF_ATTACH_TASK) 2908 if (event->attach_state & PERF_ATTACH_TASK)
2315 jump_label_dec(&perf_task_events); 2909 jump_label_dec(&perf_sched_events);
2316 if (event->attr.mmap || event->attr.mmap_data) 2910 if (event->attr.mmap || event->attr.mmap_data)
2317 atomic_dec(&nr_mmap_events); 2911 atomic_dec(&nr_mmap_events);
2318 if (event->attr.comm) 2912 if (event->attr.comm)
@@ -2321,6 +2915,10 @@ static void free_event(struct perf_event *event)
2321 atomic_dec(&nr_task_events); 2915 atomic_dec(&nr_task_events);
2322 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) 2916 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
2323 put_callchain_buffers(); 2917 put_callchain_buffers();
2918 if (is_cgroup_event(event)) {
2919 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2920 jump_label_dec(&perf_sched_events);
2921 }
2324 } 2922 }
2325 2923
2326 if (event->buffer) { 2924 if (event->buffer) {
@@ -2328,6 +2926,9 @@ static void free_event(struct perf_event *event)
2328 event->buffer = NULL; 2926 event->buffer = NULL;
2329 } 2927 }
2330 2928
2929 if (is_cgroup_event(event))
2930 perf_detach_cgroup(event);
2931
2331 if (event->destroy) 2932 if (event->destroy)
2332 event->destroy(event); 2933 event->destroy(event);
2333 2934
@@ -4395,26 +4996,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
4395 if (unlikely(!is_sampling_event(event))) 4996 if (unlikely(!is_sampling_event(event)))
4396 return 0; 4997 return 0;
4397 4998
4398 if (!throttle) { 4999 if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
4399 hwc->interrupts++; 5000 if (throttle) {
4400 } else { 5001 hwc->interrupts = MAX_INTERRUPTS;
4401 if (hwc->interrupts != MAX_INTERRUPTS) { 5002 perf_log_throttle(event, 0);
4402 hwc->interrupts++;
4403 if (HZ * hwc->interrupts >
4404 (u64)sysctl_perf_event_sample_rate) {
4405 hwc->interrupts = MAX_INTERRUPTS;
4406 perf_log_throttle(event, 0);
4407 ret = 1;
4408 }
4409 } else {
4410 /*
4411 * Keep re-disabling events even though on the previous
4412 * pass we disabled it - just in case we raced with a
4413 * sched-in and the event got enabled again:
4414 */
4415 ret = 1; 5003 ret = 1;
4416 } 5004 }
4417 } 5005 } else
5006 hwc->interrupts++;
4418 5007
4419 if (event->attr.freq) { 5008 if (event->attr.freq) {
4420 u64 now = perf_clock(); 5009 u64 now = perf_clock();
@@ -4556,7 +5145,7 @@ static int perf_exclude_event(struct perf_event *event,
4556 struct pt_regs *regs) 5145 struct pt_regs *regs)
4557{ 5146{
4558 if (event->hw.state & PERF_HES_STOPPED) 5147 if (event->hw.state & PERF_HES_STOPPED)
4559 return 0; 5148 return 1;
4560 5149
4561 if (regs) { 5150 if (regs) {
4562 if (event->attr.exclude_user && user_mode(regs)) 5151 if (event->attr.exclude_user && user_mode(regs))
@@ -4912,6 +5501,8 @@ static int perf_tp_event_match(struct perf_event *event,
4912 struct perf_sample_data *data, 5501 struct perf_sample_data *data,
4913 struct pt_regs *regs) 5502 struct pt_regs *regs)
4914{ 5503{
5504 if (event->hw.state & PERF_HES_STOPPED)
5505 return 0;
4915 /* 5506 /*
4916 * All tracepoints are from kernel-space. 5507 * All tracepoints are from kernel-space.
4917 */ 5508 */
@@ -5051,6 +5642,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5051 u64 period; 5642 u64 period;
5052 5643
5053 event = container_of(hrtimer, struct perf_event, hw.hrtimer); 5644 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5645
5646 if (event->state != PERF_EVENT_STATE_ACTIVE)
5647 return HRTIMER_NORESTART;
5648
5054 event->pmu->read(event); 5649 event->pmu->read(event);
5055 5650
5056 perf_sample_data_init(&data, 0); 5651 perf_sample_data_init(&data, 0);
@@ -5077,9 +5672,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
5077 if (!is_sampling_event(event)) 5672 if (!is_sampling_event(event))
5078 return; 5673 return;
5079 5674
5080 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5081 hwc->hrtimer.function = perf_swevent_hrtimer;
5082
5083 period = local64_read(&hwc->period_left); 5675 period = local64_read(&hwc->period_left);
5084 if (period) { 5676 if (period) {
5085 if (period < 0) 5677 if (period < 0)
@@ -5106,6 +5698,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5106 } 5698 }
5107} 5699}
5108 5700
5701static void perf_swevent_init_hrtimer(struct perf_event *event)
5702{
5703 struct hw_perf_event *hwc = &event->hw;
5704
5705 if (!is_sampling_event(event))
5706 return;
5707
5708 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5709 hwc->hrtimer.function = perf_swevent_hrtimer;
5710
5711 /*
5712 * Since hrtimers have a fixed rate, we can do a static freq->period
5713 * mapping and avoid the whole period adjust feedback stuff.
5714 */
5715 if (event->attr.freq) {
5716 long freq = event->attr.sample_freq;
5717
5718 event->attr.sample_period = NSEC_PER_SEC / freq;
5719 hwc->sample_period = event->attr.sample_period;
5720 local64_set(&hwc->period_left, hwc->sample_period);
5721 event->attr.freq = 0;
5722 }
5723}
5724
5109/* 5725/*
5110 * Software event: cpu wall time clock 5726 * Software event: cpu wall time clock
5111 */ 5727 */
@@ -5158,6 +5774,8 @@ static int cpu_clock_event_init(struct perf_event *event)
5158 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5774 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5159 return -ENOENT; 5775 return -ENOENT;
5160 5776
5777 perf_swevent_init_hrtimer(event);
5778
5161 return 0; 5779 return 0;
5162} 5780}
5163 5781
@@ -5213,16 +5831,9 @@ static void task_clock_event_del(struct perf_event *event, int flags)
5213 5831
5214static void task_clock_event_read(struct perf_event *event) 5832static void task_clock_event_read(struct perf_event *event)
5215{ 5833{
5216 u64 time; 5834 u64 now = perf_clock();
5217 5835 u64 delta = now - event->ctx->timestamp;
5218 if (!in_nmi()) { 5836 u64 time = event->ctx->time + delta;
5219 update_context_time(event->ctx);
5220 time = event->ctx->time;
5221 } else {
5222 u64 now = perf_clock();
5223 u64 delta = now - event->ctx->timestamp;
5224 time = event->ctx->time + delta;
5225 }
5226 5837
5227 task_clock_event_update(event, time); 5838 task_clock_event_update(event, time);
5228} 5839}
@@ -5235,6 +5846,8 @@ static int task_clock_event_init(struct perf_event *event)
5235 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5846 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5236 return -ENOENT; 5847 return -ENOENT;
5237 5848
5849 perf_swevent_init_hrtimer(event);
5850
5238 return 0; 5851 return 0;
5239} 5852}
5240 5853
@@ -5506,17 +6119,22 @@ struct pmu *perf_init_event(struct perf_event *event)
5506{ 6119{
5507 struct pmu *pmu = NULL; 6120 struct pmu *pmu = NULL;
5508 int idx; 6121 int idx;
6122 int ret;
5509 6123
5510 idx = srcu_read_lock(&pmus_srcu); 6124 idx = srcu_read_lock(&pmus_srcu);
5511 6125
5512 rcu_read_lock(); 6126 rcu_read_lock();
5513 pmu = idr_find(&pmu_idr, event->attr.type); 6127 pmu = idr_find(&pmu_idr, event->attr.type);
5514 rcu_read_unlock(); 6128 rcu_read_unlock();
5515 if (pmu) 6129 if (pmu) {
6130 ret = pmu->event_init(event);
6131 if (ret)
6132 pmu = ERR_PTR(ret);
5516 goto unlock; 6133 goto unlock;
6134 }
5517 6135
5518 list_for_each_entry_rcu(pmu, &pmus, entry) { 6136 list_for_each_entry_rcu(pmu, &pmus, entry) {
5519 int ret = pmu->event_init(event); 6137 ret = pmu->event_init(event);
5520 if (!ret) 6138 if (!ret)
5521 goto unlock; 6139 goto unlock;
5522 6140
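After this hunk, a pmu found through the pmu_idr fast path gets its ->event_init() called just like one found by walking the pmus list, so both lookup routes behave the same. A minimal sketch of how a pmu ends up in that idr in the first place, assuming the perf_pmu_register() interface defined elsewhere in this file (all names and the refuse-everything behaviour are illustrative):

static int example_event_init(struct perf_event *event)
{
	/* a real pmu would validate event->attr here; refuse everything */
	return -ENOENT;
}

static struct pmu example_pmu = {
	.event_init	= example_event_init,
};

static int __init example_pmu_setup(void)
{
	/*
	 * A negative type asks for a dynamically allocated one; either way
	 * the pmu lands in pmu_idr, so perf_init_event() can find it with
	 * idr_find() and, after this change, also call ->event_init() on
	 * that fast path.
	 */
	return perf_pmu_register(&example_pmu, "example", -1);
}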
@@ -5642,7 +6260,7 @@ done:
5642 6260
5643 if (!event->parent) { 6261 if (!event->parent) {
5644 if (event->attach_state & PERF_ATTACH_TASK) 6262 if (event->attach_state & PERF_ATTACH_TASK)
5645 jump_label_inc(&perf_task_events); 6263 jump_label_inc(&perf_sched_events);
5646 if (event->attr.mmap || event->attr.mmap_data) 6264 if (event->attr.mmap || event->attr.mmap_data)
5647 atomic_inc(&nr_mmap_events); 6265 atomic_inc(&nr_mmap_events);
5648 if (event->attr.comm) 6266 if (event->attr.comm)
@@ -5817,7 +6435,7 @@ SYSCALL_DEFINE5(perf_event_open,
5817 int err; 6435 int err;
5818 6436
5819 /* for future expandability... */ 6437 /* for future expandability... */
5820 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 6438 if (flags & ~PERF_FLAG_ALL)
5821 return -EINVAL; 6439 return -EINVAL;
5822 6440
5823 err = perf_copy_attr(attr_uptr, &attr); 6441 err = perf_copy_attr(attr_uptr, &attr);
@@ -5834,6 +6452,15 @@ SYSCALL_DEFINE5(perf_event_open,
5834 return -EINVAL; 6452 return -EINVAL;
5835 } 6453 }
5836 6454
6455 /*
6456 * In cgroup mode, the pid argument is used to pass the fd
6457 * opened to the cgroup directory in cgroupfs. The cpu argument
6458 * designates the cpu on which to monitor threads from that
6459 * cgroup.
6460 */
6461 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6462 return -EINVAL;
6463
5837 event_fd = get_unused_fd_flags(O_RDWR); 6464 event_fd = get_unused_fd_flags(O_RDWR);
5838 if (event_fd < 0) 6465 if (event_fd < 0)
5839 return event_fd; 6466 return event_fd;
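From userspace, the cgroup mode described in the comment above is selected with PERF_FLAG_PID_CGROUP: the pid argument carries an fd open on the cgroup directory and cpu must name a real CPU (both pid == -1 and cpu == -1 are rejected). A minimal sketch, assuming the perf_event cgroup hierarchy is mounted at /sys/fs/cgroup/perf_event and that the installed linux/perf_event.h already carries the new flag:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* count cycles for every task in "mygroup" while it runs on CPU 0 */
static int open_cgroup_counter(void)
{
	struct perf_event_attr attr;
	int cgrp_fd, ev_fd;

	cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
	if (cgrp_fd < 0)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* pid carries the cgroup fd; cpu must be a specific CPU, not -1 */
	ev_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0, -1,
			PERF_FLAG_PID_CGROUP);
	close(cgrp_fd);
	return ev_fd;
}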
@@ -5851,7 +6478,7 @@ SYSCALL_DEFINE5(perf_event_open,
5851 group_leader = NULL; 6478 group_leader = NULL;
5852 } 6479 }
5853 6480
5854 if (pid != -1) { 6481 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
5855 task = find_lively_task_by_vpid(pid); 6482 task = find_lively_task_by_vpid(pid);
5856 if (IS_ERR(task)) { 6483 if (IS_ERR(task)) {
5857 err = PTR_ERR(task); 6484 err = PTR_ERR(task);
@@ -5865,6 +6492,19 @@ SYSCALL_DEFINE5(perf_event_open,
5865 goto err_task; 6492 goto err_task;
5866 } 6493 }
5867 6494
6495 if (flags & PERF_FLAG_PID_CGROUP) {
6496 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6497 if (err)
6498 goto err_alloc;
6499 /*
6500 * one more event:
6501 * - that has cgroup constraint on event->cpu
6502 * - that may need work on context switch
6503 */
6504 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6505 jump_label_inc(&perf_sched_events);
6506 }
6507
5868 /* 6508 /*
5869 * Special case software events and allow them to be part of 6509 * Special case software events and allow them to be part of
5870 * any hardware group. 6510 * any hardware group.
@@ -5903,6 +6543,11 @@ SYSCALL_DEFINE5(perf_event_open,
5903 goto err_alloc; 6543 goto err_alloc;
5904 } 6544 }
5905 6545
6546 if (task) {
6547 put_task_struct(task);
6548 task = NULL;
6549 }
6550
5906 /* 6551 /*
5907 * Look up the group leader (we will attach this event to it): 6552 * Look up the group leader (we will attach this event to it):
5908 */ 6553 */
@@ -5950,10 +6595,10 @@ SYSCALL_DEFINE5(perf_event_open,
5950 struct perf_event_context *gctx = group_leader->ctx; 6595 struct perf_event_context *gctx = group_leader->ctx;
5951 6596
5952 mutex_lock(&gctx->mutex); 6597 mutex_lock(&gctx->mutex);
5953 perf_event_remove_from_context(group_leader); 6598 perf_remove_from_context(group_leader);
5954 list_for_each_entry(sibling, &group_leader->sibling_list, 6599 list_for_each_entry(sibling, &group_leader->sibling_list,
5955 group_entry) { 6600 group_entry) {
5956 perf_event_remove_from_context(sibling); 6601 perf_remove_from_context(sibling);
5957 put_ctx(gctx); 6602 put_ctx(gctx);
5958 } 6603 }
5959 mutex_unlock(&gctx->mutex); 6604 mutex_unlock(&gctx->mutex);
@@ -5976,6 +6621,7 @@ SYSCALL_DEFINE5(perf_event_open,
5976 6621
5977 perf_install_in_context(ctx, event, cpu); 6622 perf_install_in_context(ctx, event, cpu);
5978 ++ctx->generation; 6623 ++ctx->generation;
6624 perf_unpin_context(ctx);
5979 mutex_unlock(&ctx->mutex); 6625 mutex_unlock(&ctx->mutex);
5980 6626
5981 event->owner = current; 6627 event->owner = current;
@@ -6001,6 +6647,7 @@ SYSCALL_DEFINE5(perf_event_open,
6001 return event_fd; 6647 return event_fd;
6002 6648
6003err_context: 6649err_context:
6650 perf_unpin_context(ctx);
6004 put_ctx(ctx); 6651 put_ctx(ctx);
6005err_alloc: 6652err_alloc:
6006 free_event(event); 6653 free_event(event);
@@ -6051,6 +6698,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6051 mutex_lock(&ctx->mutex); 6698 mutex_lock(&ctx->mutex);
6052 perf_install_in_context(ctx, event, cpu); 6699 perf_install_in_context(ctx, event, cpu);
6053 ++ctx->generation; 6700 ++ctx->generation;
6701 perf_unpin_context(ctx);
6054 mutex_unlock(&ctx->mutex); 6702 mutex_unlock(&ctx->mutex);
6055 6703
6056 return event; 6704 return event;
@@ -6102,17 +6750,20 @@ __perf_event_exit_task(struct perf_event *child_event,
6102 struct perf_event_context *child_ctx, 6750 struct perf_event_context *child_ctx,
6103 struct task_struct *child) 6751 struct task_struct *child)
6104{ 6752{
6105 struct perf_event *parent_event; 6753 if (child_event->parent) {
6754 raw_spin_lock_irq(&child_ctx->lock);
6755 perf_group_detach(child_event);
6756 raw_spin_unlock_irq(&child_ctx->lock);
6757 }
6106 6758
6107 perf_event_remove_from_context(child_event); 6759 perf_remove_from_context(child_event);
6108 6760
6109 parent_event = child_event->parent;
6110 /* 6761 /*
6111 * It can happen that parent exits first, and has events 6762 * It can happen that the parent exits first, and has events
6112 * that are still around due to the child reference. These 6763 * that are still around due to the child reference. These
6113 * events need to be zapped - but otherwise linger. 6764 * events need to be zapped.
6114 */ 6765 */
6115 if (parent_event) { 6766 if (child_event->parent) {
6116 sync_child_event(child_event, child); 6767 sync_child_event(child_event, child);
6117 free_event(child_event); 6768 free_event(child_event);
6118 } 6769 }
@@ -6411,7 +7062,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
6411 return 0; 7062 return 0;
6412 } 7063 }
6413 7064
6414 child_ctx = child->perf_event_ctxp[ctxn]; 7065 child_ctx = child->perf_event_ctxp[ctxn];
6415 if (!child_ctx) { 7066 if (!child_ctx) {
6416 /* 7067 /*
6417 * This is executed from the parent task context, so 7068 * This is executed from the parent task context, so
@@ -6526,6 +7177,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
6526 mutex_unlock(&parent_ctx->mutex); 7177 mutex_unlock(&parent_ctx->mutex);
6527 7178
6528 perf_unpin_context(parent_ctx); 7179 perf_unpin_context(parent_ctx);
7180 put_ctx(parent_ctx);
6529 7181
6530 return ret; 7182 return ret;
6531} 7183}
@@ -6595,9 +7247,9 @@ static void __perf_event_exit_context(void *__info)
6595 perf_pmu_rotate_stop(ctx->pmu); 7247 perf_pmu_rotate_stop(ctx->pmu);
6596 7248
6597 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) 7249 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
6598 __perf_event_remove_from_context(event); 7250 __perf_remove_from_context(event);
6599 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) 7251 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
6600 __perf_event_remove_from_context(event); 7252 __perf_remove_from_context(event);
6601} 7253}
6602 7254
6603static void perf_event_exit_cpu_context(int cpu) 7255static void perf_event_exit_cpu_context(int cpu)
@@ -6721,3 +7373,83 @@ unlock:
6721 return ret; 7373 return ret;
6722} 7374}
6723device_initcall(perf_event_sysfs_init); 7375device_initcall(perf_event_sysfs_init);
7376
7377#ifdef CONFIG_CGROUP_PERF
7378static struct cgroup_subsys_state *perf_cgroup_create(
7379 struct cgroup_subsys *ss, struct cgroup *cont)
7380{
7381 struct perf_cgroup *jc;
7382
7383 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
7384 if (!jc)
7385 return ERR_PTR(-ENOMEM);
7386
7387 jc->info = alloc_percpu(struct perf_cgroup_info);
7388 if (!jc->info) {
7389 kfree(jc);
7390 return ERR_PTR(-ENOMEM);
7391 }
7392
7393 return &jc->css;
7394}
7395
7396static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7397 struct cgroup *cont)
7398{
7399 struct perf_cgroup *jc;
7400 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7401 struct perf_cgroup, css);
7402 free_percpu(jc->info);
7403 kfree(jc);
7404}
7405
7406static int __perf_cgroup_move(void *info)
7407{
7408 struct task_struct *task = info;
7409 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7410 return 0;
7411}
7412
7413static void perf_cgroup_move(struct task_struct *task)
7414{
7415 task_function_call(task, __perf_cgroup_move, task);
7416}
7417
7418static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7419 struct cgroup *old_cgrp, struct task_struct *task,
7420 bool threadgroup)
7421{
7422 perf_cgroup_move(task);
7423 if (threadgroup) {
7424 struct task_struct *c;
7425 rcu_read_lock();
7426 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7427 perf_cgroup_move(c);
7428 }
7429 rcu_read_unlock();
7430 }
7431}
7432
7433static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7434 struct cgroup *old_cgrp, struct task_struct *task)
7435{
7436 /*
7437 * cgroup_exit() is called in the copy_process() failure path.
7439 * Ignore this case since the task hasn't run yet; this avoids
7440 * trying to poke half-freed task state from generic code.
7440 */
7441 if (!(task->flags & PF_EXITING))
7442 return;
7443
7444 perf_cgroup_move(task);
7445}
7446
7447struct cgroup_subsys perf_subsys = {
7448 .name = "perf_event",
7449 .subsys_id = perf_subsys_id,
7450 .create = perf_cgroup_create,
7451 .destroy = perf_cgroup_destroy,
7452 .exit = perf_cgroup_exit,
7453 .attach = perf_cgroup_attach,
7454};
7455#endif /* CONFIG_CGROUP_PERF */
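The .attach and .exit callbacks registered above do the actual event switching when a task enters or leaves a perf_event cgroup; userspace only moves the task in the cgroup filesystem. A minimal sketch, assuming the same /sys/fs/cgroup/perf_event mount as in the earlier example (paths and the helper name are illustrative):

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

/*
 * Create "mygroup" and move one task into it; the cgroup core then invokes
 * perf_cgroup_attach(), which uses task_function_call() to switch cgroup
 * events in on the CPU where that task is currently running.
 */
static int move_task_to_perf_cgroup(pid_t pid)
{
	FILE *f;

	if (mkdir("/sys/fs/cgroup/perf_event/mygroup", 0755) && errno != EEXIST)
		return -1;

	f = fopen("/sys/fs/cgroup/perf_event/mygroup/tasks", "w");
	if (!f)
		return -1;

	fprintf(f, "%d\n", (int)pid);
	return fclose(f);
}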