path: root/kernel/perf_event.c
Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--  kernel/perf_event.c | 1004
1 file changed, 846 insertions(+), 158 deletions(-)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 656222fcf767..ed253aa24ba4 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,13 +38,96 @@
 
 #include <asm/irq_regs.h>
 
+struct remote_function_call {
+	struct task_struct *p;
+	int (*func)(void *info);
+	void *info;
+	int ret;
+};
+
+static void remote_function(void *data)
+{
+	struct remote_function_call *tfc = data;
+	struct task_struct *p = tfc->p;
+
+	if (p) {
+		tfc->ret = -EAGAIN;
+		if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+			return;
+	}
+
+	tfc->ret = tfc->func(tfc->info);
+}
+
+/**
+ * task_function_call - call a function on the cpu on which a task runs
+ * @p: the task to evaluate
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, in which case the function is called directly.
+ *
+ * returns: @func return value, or
+ *	    -ESRCH  - when the process isn't running
+ *	    -EAGAIN - when the process moved away
+ */
+static int
+task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+{
+	struct remote_function_call data = {
+		.p = p,
+		.func = func,
+		.info = info,
+		.ret = -ESRCH, /* No such (running) process */
+	};
+
+	if (task_curr(p))
+		smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+
+	return data.ret;
+}
+
+/**
+ * cpu_function_call - call a function on a given cpu
+ * @cpu: the cpu to run on
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func on the remote cpu.
+ *
+ * returns: @func return value or -ENXIO when the cpu is offline
+ */
+static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+{
+	struct remote_function_call data = {
+		.p = NULL,
+		.func = func,
+		.info = info,
+		.ret = -ENXIO, /* No such CPU */
+	};
+
+	smp_call_function_single(cpu, remote_function, &data, 1);
+
+	return data.ret;
+}
+
+#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
+		       PERF_FLAG_FD_OUTPUT  |\
+		       PERF_FLAG_PID_CGROUP)
+
 enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
-atomic_t perf_task_events __read_mostly;
+/*
+ * perf_sched_events : >0 events exist
+ * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
+ */
+atomic_t perf_sched_events __read_mostly;
+static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
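
These two helpers replace task_oncpu_function_call() with variants that report whether the function actually ran, which is what drives the lock-and-recheck retry loops introduced throughout this patch. A hypothetical sketch of that calling pattern (my_callback() and my_caller() are illustrative names, not part of the patch):

static int my_callback(void *info)
{
	/* runs on the task's CPU, with the task guaranteed current */
	return 0;
}

static void my_caller(struct task_struct *task, struct perf_event_context *ctx)
{
retry:
	if (!task_function_call(task, my_callback, NULL))
		return;	/* ran remotely; -ESRCH/-EAGAIN fall through */

	raw_spin_lock_irq(&ctx->lock);
	if (ctx->is_active) {
		/* the task snuck back in; try the cross-call again */
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}
	/* not running and, under ctx->lock, unable to be scheduled in */
	raw_spin_unlock_irq(&ctx->lock);
}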
@@ -67,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
 /*
  * max perf event sample rate
  */
-int sysctl_perf_event_sample_rate __read_mostly = 100000;
+#define DEFAULT_MAX_SAMPLE_RATE 100000
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+static int max_samples_per_tick __read_mostly =
+	DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+
+int perf_proc_update_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+
+	return 0;
+}
 
 static atomic64_t perf_event_id;
 
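
With the new handler, writing the sysctl recomputes the per-tick budget; at the default rate of 100000 samples/sec and HZ=1000 that is 100 samples per tick. A standalone C illustration of the same round-up division (userspace demo, not kernel code):

#include <stdio.h>

/* same rounding-up integer division as the kernel's DIV_ROUND_UP() */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	const int rate = 100000;	/* DEFAULT_MAX_SAMPLE_RATE */
	const int hz[] = { 100, 250, 300, 1000 };

	for (int i = 0; i < 4; i++)
		printf("HZ=%4d -> max_samples_per_tick=%d\n",
		       hz[i], DIV_ROUND_UP(rate, hz[i]));
	return 0;
}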
@@ -75,7 +175,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 			      enum event_type_t event_type);
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type);
+			     enum event_type_t event_type,
+			     struct task_struct *task);
+
+static void update_context_time(struct perf_event_context *ctx);
+static u64 perf_event_time(struct perf_event *event);
 
 void __weak perf_event_print_debug(void) { }
 
@@ -89,6 +193,360 @@ static inline u64 perf_clock(void)
 	return local_clock();
 }
 
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+#ifdef CONFIG_CGROUP_PERF
+
+/*
+ * Must ensure cgroup is pinned (css_get) before calling
+ * this function. In other words, we cannot call this function
+ * if there is no cgroup event for the current CPU context.
+ */
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+	return container_of(task_subsys_state(task, perf_subsys_id),
+			struct perf_cgroup, css);
+}
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+	return !event->cgrp || event->cgrp == cpuctx->cgrp;
+}
+
+static inline void perf_get_cgroup(struct perf_event *event)
+{
+	css_get(&event->cgrp->css);
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{
+	css_put(&event->cgrp->css);
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{
+	perf_put_cgroup(event);
+	event->cgrp = NULL;
+}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+	return event->cgrp != NULL;
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+	struct perf_cgroup_info *t;
+
+	t = per_cpu_ptr(event->cgrp->info, event->cpu);
+	return t->time;
+}
+
+static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+{
+	struct perf_cgroup_info *info;
+	u64 now;
+
+	now = perf_clock();
+
+	info = this_cpu_ptr(cgrp->info);
+
+	info->time += now - info->timestamp;
+	info->timestamp = now;
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
+	if (cgrp_out)
+		__update_cgrp_time(cgrp_out);
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+	struct perf_cgroup *cgrp;
+
+	/*
+	 * ensure we access cgroup data only when needed and
+	 * when we know the cgroup is pinned (css_get)
+	 */
+	if (!is_cgroup_event(event))
+		return;
+
+	cgrp = perf_cgroup_from_task(current);
+	/*
+	 * Do not update time when cgroup is not active
+	 */
+	if (cgrp == event->cgrp)
+		__update_cgrp_time(event->cgrp);
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task,
+			  struct perf_event_context *ctx)
+{
+	struct perf_cgroup *cgrp;
+	struct perf_cgroup_info *info;
+
+	/*
+	 * ctx->lock held by caller
+	 * ensure we do not access cgroup data
+	 * unless we have the cgroup pinned (css_get)
+	 */
+	if (!task || !ctx->nr_cgroups)
+		return;
+
+	cgrp = perf_cgroup_from_task(task);
+	info = this_cpu_ptr(cgrp->info);
+	info->timestamp = ctx->timestamp;
+}
+
+#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
+#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
+
+/*
+ * reschedule events based on the cgroup constraint of task.
+ *
+ * mode SWOUT : schedule out everything
+ * mode SWIN : schedule in based on cgroup for next
+ */
+void perf_cgroup_switch(struct task_struct *task, int mode)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	/*
+	 * disable interrupts to avoid getting nr_cgroups
+	 * changes via __perf_event_disable(). Also
+	 * avoids preemption.
+	 */
+	local_irq_save(flags);
+
+	/*
+	 * we reschedule only in the presence of cgroup
+	 * constrained events.
+	 */
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		perf_pmu_disable(cpuctx->ctx.pmu);
+
+		/*
+		 * perf_cgroup_events says at least one
+		 * context on this CPU has cgroup events.
+		 *
+		 * ctx->nr_cgroups reports the number of cgroup
+		 * events for a context.
+		 */
+		if (cpuctx->ctx.nr_cgroups > 0) {
+
+			if (mode & PERF_CGROUP_SWOUT) {
+				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+				/*
+				 * must not be done before ctxswout due
+				 * to event_filter_match() in event_sched_out()
+				 */
+				cpuctx->cgrp = NULL;
+			}
+
+			if (mode & PERF_CGROUP_SWIN) {
+				/*
+				 * set cgrp before ctxsw in to
+				 * allow event_filter_match() to not
+				 * have to pass task around
+				 */
+				cpuctx->cgrp = perf_cgroup_from_task(task);
+				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+			}
+		}
+
+		perf_pmu_enable(cpuctx->ctx.pmu);
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+	perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+	perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+}
+
+static inline int perf_cgroup_connect(int fd, struct perf_event *event,
+				      struct perf_event_attr *attr,
+				      struct perf_event *group_leader)
+{
+	struct perf_cgroup *cgrp;
+	struct cgroup_subsys_state *css;
+	struct file *file;
+	int ret = 0, fput_needed;
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -EBADF;
+
+	css = cgroup_css_from_dir(file, perf_subsys_id);
+	if (IS_ERR(css)) {
+		ret = PTR_ERR(css);
+		goto out;
+	}
+
+	cgrp = container_of(css, struct perf_cgroup, css);
+	event->cgrp = cgrp;
+
+	/* must be done before we fput() the file */
+	perf_get_cgroup(event);
+
+	/*
+	 * all events in a group must monitor
+	 * the same cgroup because a task belongs
+	 * to only one perf cgroup at a time
+	 */
+	if (group_leader && group_leader->cgrp != cgrp) {
+		perf_detach_cgroup(event);
+		ret = -EINVAL;
+	}
+out:
+	fput_light(file, fput_needed);
+	return ret;
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+	struct perf_cgroup_info *t;
+	t = per_cpu_ptr(event->cgrp->info, event->cpu);
+	event->shadow_ctx_time = now - t->timestamp;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+	/*
+	 * when the current task's perf cgroup does not match
+	 * the event's, we need to remember to call the
+	 * perf_cgroup_mark_enabled() function the first time a task with
+	 * a matching perf cgroup is scheduled in.
+	 */
+	if (is_cgroup_event(event) && !perf_cgroup_match(event))
+		event->cgrp_defer_enabled = 1;
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+			 struct perf_event_context *ctx)
+{
+	struct perf_event *sub;
+	u64 tstamp = perf_event_time(event);
+
+	if (!event->cgrp_defer_enabled)
+		return;
+
+	event->cgrp_defer_enabled = 0;
+
+	event->tstamp_enabled = tstamp - event->total_time_enabled;
+	list_for_each_entry(sub, &event->sibling_list, group_entry) {
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+			sub->cgrp_defer_enabled = 0;
+		}
+	}
+}
+#else /* !CONFIG_CGROUP_PERF */
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+	return true;
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+}
+
+static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
+				      struct perf_event_attr *attr,
+				      struct perf_event *group_leader)
+{
+	return -EINVAL;
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task,
+			  struct perf_event_context *ctx)
+{
+}
+
+void
+perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+{
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+			 struct perf_event_context *ctx)
+{
+}
+#endif
+
 void perf_pmu_disable(struct pmu *pmu)
 {
 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
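
The cgroup clock above uses the standard delta-accumulation idiom: info->time only advances by the interval since info->timestamp, which is re-stamped whenever the cgroup becomes active on the CPU. A standalone sketch of the idiom (struct cgrp_time and the clock values are illustrative, not the kernel's perf_cgroup_info):

#include <stdio.h>
#include <stdint.h>

struct cgrp_time {
	uint64_t time;		/* accumulated active nanoseconds */
	uint64_t timestamp;	/* clock reading at the last update */
};

/* mirrors __update_cgrp_time(): fold the elapsed delta into ->time */
static void update_time(struct cgrp_time *t, uint64_t now)
{
	t->time += now - t->timestamp;
	t->timestamp = now;
}

int main(void)
{
	struct cgrp_time t = { .time = 0, .timestamp = 1000 };

	update_time(&t, 1500);	/* 500 ns while scheduled in */
	update_time(&t, 1700);	/* 200 ns more */
	printf("accumulated: %llu ns\n", (unsigned long long)t.time);
	return 0;
}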
@@ -254,7 +712,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 	raw_spin_lock_irqsave(&ctx->lock, flags);
 	--ctx->pin_count;
 	raw_spin_unlock_irqrestore(&ctx->lock, flags);
-	put_ctx(ctx);
 }
 
 /*
@@ -271,6 +728,10 @@ static void update_context_time(struct perf_event_context *ctx)
 static u64 perf_event_time(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
+
+	if (is_cgroup_event(event))
+		return perf_cgroup_event_time(event);
+
 	return ctx ? ctx->time : 0;
 }
 
@@ -285,9 +746,20 @@ static void update_event_times(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE ||
 	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 		return;
-
-	if (ctx->is_active)
+	/*
+	 * in cgroup mode, time_enabled represents
+	 * the time the event was enabled AND active
+	 * tasks were in the monitored cgroup. This is
+	 * independent of the activity of the context as
+	 * there may be a mix of cgroup and non-cgroup events.
+	 *
+	 * That is why we treat cgroup events differently
+	 * here.
+	 */
+	if (is_cgroup_event(event))
 		run_end = perf_event_time(event);
+	else if (ctx->is_active)
+		run_end = ctx->time;
 	else
 		run_end = event->tstamp_stopped;
 
@@ -299,6 +771,7 @@ static void update_event_times(struct perf_event *event)
 		run_end = perf_event_time(event);
 
 	event->total_time_running = run_end - event->tstamp_running;
+
 }
 
 /*
@@ -347,6 +820,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		list_add_tail(&event->group_entry, list);
 	}
 
+	if (is_cgroup_event(event))
+		ctx->nr_cgroups++;
+
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	if (!ctx->nr_events)
 		perf_pmu_rotate_start(ctx->pmu);
@@ -473,6 +949,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
+	if (is_cgroup_event(event))
+		ctx->nr_cgroups--;
+
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -544,7 +1023,8 @@ out:
 static inline int
 event_filter_match(struct perf_event *event)
 {
-	return event->cpu == -1 || event->cpu == smp_processor_id();
+	return (event->cpu == -1 || event->cpu == smp_processor_id())
+	    && perf_cgroup_match(event);
 }
 
 static void
@@ -562,7 +1042,7 @@ event_sched_out(struct perf_event *event,
 	 */
 	if (event->state == PERF_EVENT_STATE_INACTIVE
 	    && !event_filter_match(event)) {
-		delta = ctx->time - event->tstamp_stopped;
+		delta = tstamp - event->tstamp_stopped;
 		event->tstamp_running += delta;
 		event->tstamp_stopped = tstamp;
 	}
@@ -606,47 +1086,30 @@ group_sched_out(struct perf_event *group_event,
 	cpuctx->exclusive = 0;
 }
 
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
-	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
 /*
  * Cross CPU call to remove a performance event
  *
  * We disable the event on the hardware level first. After that we
  * remove it from the context list.
  */
-static void __perf_event_remove_from_context(void *info)
+static int __perf_remove_from_context(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
-	/*
-	 * If this is a task context, we need to check whether it is
-	 * the current task context of this cpu. If not it has been
-	 * scheduled out before the smp call arrived.
-	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
-		return;
-
 	raw_spin_lock(&ctx->lock);
-
 	event_sched_out(event, cpuctx, ctx);
-
 	list_del_event(event, ctx);
-
 	raw_spin_unlock(&ctx->lock);
+
+	return 0;
 }
 
 
 /*
  * Remove the event from a task's (or a CPU's) list of events.
  *
- * Must be called with ctx->mutex held.
- *
  * CPU events are removed with a smp call. For task events we only
  * call when the task is on a CPU.
  *
@@ -657,49 +1120,48 @@ static void __perf_event_remove_from_context(void *info)
  * When called from perf_event_exit_task, it's OK because the
  * context has been detached from its task.
  */
-static void perf_event_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 	struct task_struct *task = ctx->task;
 
+	lockdep_assert_held(&ctx->mutex);
+
 	if (!task) {
 		/*
 		 * Per cpu events are removed via an smp call and
 		 * the removal is always successful.
 		 */
-		smp_call_function_single(event->cpu,
-					 __perf_event_remove_from_context,
-					 event, 1);
+		cpu_function_call(event->cpu, __perf_remove_from_context, event);
 		return;
 	}
 
 retry:
-	task_oncpu_function_call(task, __perf_event_remove_from_context,
-				 event);
+	if (!task_function_call(task, __perf_remove_from_context, event))
+		return;
 
 	raw_spin_lock_irq(&ctx->lock);
 	/*
-	 * If the context is active we need to retry the smp call.
+	 * If we failed to find a running task, but find the context active now
+	 * that we've acquired the ctx->lock, retry.
 	 */
-	if (ctx->nr_active && !list_empty(&event->group_entry)) {
+	if (ctx->is_active) {
 		raw_spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
 
 	/*
-	 * The lock prevents that this context is scheduled in so we
-	 * can remove the event safely, if the call above did not
-	 * succeed.
+	 * Since the task isn't running, it's safe to remove the event; our
+	 * holding the ctx->lock ensures the task won't get scheduled in.
 	 */
-	if (!list_empty(&event->group_entry))
-		list_del_event(event, ctx);
+	list_del_event(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
 /*
  * Cross CPU call to disable a performance event
  */
-static void __perf_event_disable(void *info)
+static int __perf_event_disable(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
@@ -708,9 +1170,12 @@ static void __perf_event_disable(void *info)
 	/*
 	 * If this is a per-task event, need to check whether this
 	 * event's task is the current task on this cpu.
+	 *
+	 * Can trigger due to concurrent perf_event_context_sched_out()
+	 * flipping contexts around.
 	 */
 	if (ctx->task && cpuctx->task_ctx != ctx)
-		return;
+		return -EINVAL;
 
 	raw_spin_lock(&ctx->lock);
 
@@ -720,6 +1185,7 @@ static void __perf_event_disable(void *info)
 	 */
 	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
 		update_group_times(event);
 		if (event == event->group_leader)
 			group_sched_out(event, cpuctx, ctx);
@@ -729,6 +1195,8 @@ static void __perf_event_disable(void *info)
 	}
 
 	raw_spin_unlock(&ctx->lock);
+
+	return 0;
 }
 
 /*
@@ -753,13 +1221,13 @@ void perf_event_disable(struct perf_event *event)
 		/*
 		 * Disable the event on the cpu that it's on
 		 */
-		smp_call_function_single(event->cpu, __perf_event_disable,
-					 event, 1);
+		cpu_function_call(event->cpu, __perf_event_disable, event);
 		return;
 	}
 
 retry:
-	task_oncpu_function_call(task, __perf_event_disable, event);
+	if (!task_function_call(task, __perf_event_disable, event))
+		return;
 
 	raw_spin_lock_irq(&ctx->lock);
 	/*
@@ -767,6 +1235,11 @@ retry:
 	 */
 	if (event->state == PERF_EVENT_STATE_ACTIVE) {
 		raw_spin_unlock_irq(&ctx->lock);
+		/*
+		 * Reload the task pointer, it might have been changed by
+		 * a concurrent perf_event_context_sched_out().
+		 */
+		task = ctx->task;
 		goto retry;
 	}
 
@@ -778,10 +1251,44 @@ retry:
 		update_group_times(event);
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
+static void perf_set_shadow_time(struct perf_event *event,
+				 struct perf_event_context *ctx,
+				 u64 tstamp)
+{
+	/*
+	 * use the correct time source for the time snapshot
+	 *
+	 * We could get by without this by leveraging the
+	 * fact that to get to this function, the caller
+	 * has most likely already called update_context_time()
+	 * and update_cgrp_time_xx() and thus both timestamps
+	 * are identical (or very close). Given that tstamp is
+	 * already adjusted for cgroup, we could say that:
+	 *	tstamp - ctx->timestamp
+	 * is equivalent to
+	 *	tstamp - cgrp->timestamp.
+	 *
+	 * Then, in perf_output_read(), the calculation would
+	 * work with no changes because:
+	 * - event is guaranteed scheduled in
+	 * - no scheduled out in between
+	 * - thus the timestamp would be the same
+	 *
+	 * But this is a bit hairy.
+	 *
+	 * So instead, we have an explicit cgroup call to remain
+	 * within the same time source all along. We believe it
+	 * is cleaner and simpler to understand.
+	 */
+	if (is_cgroup_event(event))
+		perf_cgroup_set_shadow_time(event, tstamp);
+	else
+		event->shadow_ctx_time = tstamp - ctx->timestamp;
+}
+
 #define MAX_INTERRUPTS (~0ULL)
 
 static void perf_log_throttle(struct perf_event *event, int enable);
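
perf_set_shadow_time() stores an offset rather than a timestamp: adding a later raw perf_clock() reading to shadow_ctx_time reconstructs the context (or cgroup) time without taking ctx->lock, which is the property the comment above relies on for perf_output_read(). A toy recomputation with made-up clock values, assuming that reading scheme:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t ctx_time = 5000;	/* ctx->time at schedule-in */
	uint64_t raw = 4200;		/* perf_clock() at that instant */
	uint64_t shadow = ctx_time - raw;	/* stored shadow_ctx_time */

	uint64_t raw_later = 4900;	/* perf_clock() from NMI context */
	/* 800 + 4900 = 5700: context time 700 ns later, lock-free */
	printf("context time now: %llu\n",
	       (unsigned long long)(shadow + raw_later));
	return 0;
}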
@@ -822,7 +1329,7 @@ event_sched_in(struct perf_event *event,
 
 	event->tstamp_running += tstamp - event->tstamp_stopped;
 
-	event->shadow_ctx_time = tstamp - ctx->timestamp;
+	perf_set_shadow_time(event, ctx, tstamp);
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
@@ -943,12 +1450,15 @@ static void add_event_to_ctx(struct perf_event *event,
 	event->tstamp_stopped = tstamp;
 }
 
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+					struct task_struct *tsk);
+
 /*
  * Cross CPU call to install and enable a performance event
  *
  * Must be called with ctx->mutex held
 */
-static void __perf_install_in_context(void *info)
+static int __perf_install_in_context(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
@@ -957,21 +1467,22 @@ static void __perf_install_in_context(void *info)
 	int err;
 
 	/*
-	 * If this is a task context, we need to check whether it is
-	 * the current task context of this cpu. If not it has been
-	 * scheduled out before the smp call arrived.
-	 * Or possibly this is the right context but it isn't
-	 * on this cpu because it had no events.
+	 * In case we're installing a new context to an already running task,
+	 * this could also happen before perf_event_task_sched_in() on
+	 * architectures which do context switches with IRQs enabled.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current)
-			return;
-		cpuctx->task_ctx = ctx;
-	}
+	if (ctx->task && !cpuctx->task_ctx)
+		perf_event_context_sched_in(ctx, ctx->task);
 
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	update_context_time(ctx);
+	/*
+	 * update cgrp time only if current cgrp
+	 * matches event->cgrp. Must be done before
+	 * calling add_event_to_ctx()
+	 */
+	update_cgrp_time_from_event(event);
 
 	add_event_to_ctx(event, ctx);
 
@@ -1012,6 +1523,8 @@
 
 unlock:
 	raw_spin_unlock(&ctx->lock);
+
+	return 0;
 }
 
 /*
@@ -1023,8 +1536,6 @@ unlock:
  * If the event is attached to a task which is on a CPU we use a smp
  * call to enable it in the task context. The task might have been
  * scheduled away, but we check this in the smp call again.
- *
- * Must be called with ctx->mutex held.
  */
 static void
 perf_install_in_context(struct perf_event_context *ctx,
@@ -1033,6 +1544,8 @@ perf_install_in_context(struct perf_event_context *ctx,
 {
 	struct task_struct *task = ctx->task;
 
+	lockdep_assert_held(&ctx->mutex);
+
 	event->ctx = ctx;
 
 	if (!task) {
@@ -1040,31 +1553,29 @@ perf_install_in_context(struct perf_event_context *ctx,
 		 * Per cpu events are installed via an smp call and
 		 * the install is always successful.
 		 */
-		smp_call_function_single(cpu, __perf_install_in_context,
-					 event, 1);
+		cpu_function_call(cpu, __perf_install_in_context, event);
 		return;
 	}
 
 retry:
-	task_oncpu_function_call(task, __perf_install_in_context,
-				 event);
+	if (!task_function_call(task, __perf_install_in_context, event))
+		return;
 
 	raw_spin_lock_irq(&ctx->lock);
 	/*
-	 * we need to retry the smp call.
+	 * If we failed to find a running task, but find the context active now
+	 * that we've acquired the ctx->lock, retry.
 	 */
-	if (ctx->is_active && list_empty(&event->group_entry)) {
+	if (ctx->is_active) {
 		raw_spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
 
 	/*
-	 * The lock prevents that this context is scheduled in so we
-	 * can add the event safely, if it the call above did not
-	 * succeed.
+	 * Since the task isn't running, it's safe to add the event; our
+	 * holding the ctx->lock ensures the task won't get scheduled in.
 	 */
-	if (list_empty(&event->group_entry))
-		add_event_to_ctx(event, ctx);
+	add_event_to_ctx(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
@@ -1093,7 +1604,7 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 /*
  * Cross CPU call to enable a performance event
  */
-static void __perf_event_enable(void *info)
+static int __perf_event_enable(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
@@ -1101,26 +1612,27 @@ static void __perf_event_enable(void *info)
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	int err;
 
-	/*
-	 * If this is a per-task event, need to check whether this
-	 * event's task is the current task on this cpu.
-	 */
-	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current)
-			return;
-		cpuctx->task_ctx = ctx;
-	}
+	if (WARN_ON_ONCE(!ctx->is_active))
+		return -EINVAL;
 
 	raw_spin_lock(&ctx->lock);
-	ctx->is_active = 1;
 	update_context_time(ctx);
 
 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
 		goto unlock;
+
+	/*
+	 * set current task's cgroup time reference point
+	 */
+	perf_cgroup_set_timestamp(current, ctx);
+
 	__perf_event_mark_enabled(event, ctx);
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event)) {
+		if (is_cgroup_event(event))
+			perf_cgroup_defer_enabled(event);
 		goto unlock;
+	}
 
 	/*
 	 * If the event is in a group and isn't the group leader,
@@ -1153,6 +1665,8 @@ static void __perf_event_enable(void *info)
 
 unlock:
 	raw_spin_unlock(&ctx->lock);
+
+	return 0;
 }
 
 /*
@@ -1173,8 +1687,7 @@ void perf_event_enable(struct perf_event *event)
 		/*
 		 * Enable the event on the cpu that it's on
 		 */
-		smp_call_function_single(event->cpu, __perf_event_enable,
-					 event, 1);
+		cpu_function_call(event->cpu, __perf_event_enable, event);
 		return;
 	}
 
@@ -1193,8 +1706,15 @@ void perf_event_enable(struct perf_event *event)
 		event->state = PERF_EVENT_STATE_OFF;
 
 retry:
+	if (!ctx->is_active) {
+		__perf_event_mark_enabled(event, ctx);
+		goto out;
+	}
+
 	raw_spin_unlock_irq(&ctx->lock);
-	task_oncpu_function_call(task, __perf_event_enable, event);
+
+	if (!task_function_call(task, __perf_event_enable, event))
+		return;
 
 	raw_spin_lock_irq(&ctx->lock);
 
@@ -1202,15 +1722,14 @@ retry:
 	 * If the context is active and the event is still off,
 	 * we need to retry the cross-call.
 	 */
-	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
+	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
+		/*
+		 * task could have been flipped by a concurrent
+		 * perf_event_context_sched_out()
+		 */
+		task = ctx->task;
 		goto retry;
-
-	/*
-	 * Since we have the lock this context can't be scheduled
-	 * in, so we can change the state safely.
-	 */
-	if (event->state == PERF_EVENT_STATE_OFF)
-		__perf_event_mark_enabled(event, ctx);
+	}
 
 out:
 	raw_spin_unlock_irq(&ctx->lock);
@@ -1242,6 +1761,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	if (likely(!ctx->nr_events))
 		goto out;
 	update_context_time(ctx);
+	update_cgrp_time_from_cpuctx(cpuctx);
 
 	if (!ctx->nr_active)
 		goto out;
@@ -1354,8 +1874,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
 	}
 }
 
-void perf_event_context_sched_out(struct task_struct *task, int ctxn,
-				  struct task_struct *next)
+static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+					 struct task_struct *next)
 {
 	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
 	struct perf_event_context *next_ctx;
@@ -1431,6 +1951,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
 
 	for_each_task_context_nr(ctxn)
 		perf_event_context_sched_out(task, ctxn, next);
+
+	/*
+	 * if cgroup events exist on this CPU, then we need
+	 * to check if we have to switch out PMU state.
+	 * cgroup events are system-wide mode only
+	 */
+	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+		perf_cgroup_sched_out(task);
 }
 
 static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1469,6 +1997,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 		if (!event_filter_match(event))
 			continue;
 
+		/* may need to reset tstamp_enabled */
+		if (is_cgroup_event(event))
+			perf_cgroup_mark_enabled(event, ctx);
+
 		if (group_can_go_on(event, cpuctx, 1))
 			group_sched_in(event, cpuctx, ctx);
 
@@ -1501,6 +2033,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		if (!event_filter_match(event))
 			continue;
 
+		/* may need to reset tstamp_enabled */
+		if (is_cgroup_event(event))
+			perf_cgroup_mark_enabled(event, ctx);
+
 		if (group_can_go_on(event, cpuctx, can_add_hw)) {
 			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
@@ -1511,15 +2047,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
-	     enum event_type_t event_type)
+	     enum event_type_t event_type,
+	     struct task_struct *task)
 {
+	u64 now;
+
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	if (likely(!ctx->nr_events))
 		goto out;
 
-	ctx->timestamp = perf_clock();
-
+	now = perf_clock();
+	ctx->timestamp = now;
+	perf_cgroup_set_timestamp(task, ctx);
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
@@ -1536,11 +2076,12 @@ out:
 }
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type)
+			     enum event_type_t event_type,
+			     struct task_struct *task)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, task);
 }
 
 static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1548,15 +2089,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
 {
 	struct perf_cpu_context *cpuctx;
 
 	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx)
 		return;
 
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, NULL);
 	cpuctx->task_ctx = ctx;
 }
 
-void perf_event_context_sched_in(struct perf_event_context *ctx)
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+					struct task_struct *task)
 {
 	struct perf_cpu_context *cpuctx;
 
@@ -1572,9 +2114,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
 	 */
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
-	ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
-	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+	ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
 
 	cpuctx->task_ctx = ctx;
 
@@ -1607,8 +2149,15 @@ void __perf_event_task_sched_in(struct task_struct *task)
 		if (likely(!ctx))
 			continue;
 
-		perf_event_context_sched_in(ctx);
+		perf_event_context_sched_in(ctx, task);
 	}
+	/*
+	 * if cgroup events exist on this CPU, then we need
+	 * to check if we have to switch in PMU state.
+	 * cgroup events are system-wide mode only
+	 */
+	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+		perf_cgroup_sched_in(task);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -1638,7 +2187,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
  * Reduce accuracy by one bit such that @a and @b converge
  * to a similar magnitude.
  */
-#define REDUCE_FLS(a, b) \
+#define REDUCE_FLS(a, b)		\
 do { \
 	if (a##_fls > b##_fls) { \
 		a >>= 1; \
@@ -1808,7 +2357,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 	if (ctx)
 		rotate_ctx(ctx);
 
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
 	if (ctx)
 		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
 
@@ -1887,7 +2436,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 	raw_spin_unlock(&ctx->lock);
 
-	perf_event_context_sched_in(ctx);
+	perf_event_context_sched_in(ctx, ctx->task);
 out:
 	local_irq_restore(flags);
 }
@@ -1912,8 +2461,10 @@ static void __perf_event_read(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	if (ctx->is_active)
+	if (ctx->is_active) {
 		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
+	}
 	update_event_times(event);
 	if (event->state == PERF_EVENT_STATE_ACTIVE)
 		event->pmu->read(event);
@@ -1944,8 +2495,10 @@ static u64 perf_event_read(struct perf_event *event)
 		 * (e.g., thread is blocked), in that case
 		 * we cannot update context time
 		 */
-		if (ctx->is_active)
+		if (ctx->is_active) {
 			update_context_time(ctx);
+			update_cgrp_time_from_event(event);
+		}
 		update_event_times(event);
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 	}
@@ -2224,6 +2777,9 @@ errout:
 
 }
 
+/*
+ * Returns a matching context with refcount and pincount.
+ */
 static struct perf_event_context *
 find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
 {
@@ -2248,6 +2804,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 		ctx = &cpuctx->ctx;
 		get_ctx(ctx);
+		++ctx->pin_count;
 
 		return ctx;
 	}
@@ -2261,6 +2818,7 @@ retry:
 	ctx = perf_lock_task_context(task, ctxn, &flags);
 	if (ctx) {
 		unclone_ctx(ctx);
+		++ctx->pin_count;
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 
@@ -2282,8 +2840,10 @@ retry:
 			err = -ESRCH;
 		else if (task->perf_event_ctxp[ctxn])
 			err = -EAGAIN;
-		else
+		else {
+			++ctx->pin_count;
 			rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+		}
 		mutex_unlock(&task->perf_event_mutex);
 
 		if (unlikely(err)) {
@@ -2323,7 +2883,7 @@ static void free_event(struct perf_event *event)
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_dec(&perf_task_events);
+			jump_label_dec(&perf_sched_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_dec(&nr_mmap_events);
 		if (event->attr.comm)
@@ -2332,6 +2892,10 @@ static void free_event(struct perf_event *event)
 			atomic_dec(&nr_task_events);
 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
 			put_callchain_buffers();
+		if (is_cgroup_event(event)) {
+			atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
+			jump_label_dec(&perf_sched_events);
+		}
 	}
 
 	if (event->buffer) {
@@ -2339,6 +2903,9 @@ static void free_event(struct perf_event *event)
 		event->buffer = NULL;
 	}
 
+	if (is_cgroup_event(event))
+		perf_detach_cgroup(event);
+
 	if (event->destroy)
 		event->destroy(event);
 
@@ -4406,26 +4973,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 	if (unlikely(!is_sampling_event(event)))
 		return 0;
 
-	if (!throttle) {
-		hwc->interrupts++;
-	} else {
-		if (hwc->interrupts != MAX_INTERRUPTS) {
-			hwc->interrupts++;
-			if (HZ * hwc->interrupts >
-					(u64)sysctl_perf_event_sample_rate) {
-				hwc->interrupts = MAX_INTERRUPTS;
-				perf_log_throttle(event, 0);
-				ret = 1;
-			}
-		} else {
-			/*
-			 * Keep re-disabling events even though on the previous
-			 * pass we disabled it - just in case we raced with a
-			 * sched-in and the event got enabled again:
-			 */
-			ret = 1;
-		}
-	}
+	if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
+		if (throttle) {
+			hwc->interrupts = MAX_INTERRUPTS;
+			perf_log_throttle(event, 0);
+			ret = 1;
+		}
+	} else
+		hwc->interrupts++;
 
 	if (event->attr.freq) {
 		u64 now = perf_clock();
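
The rewritten check replaces the per-overflow multiply (HZ * hwc->interrupts against the sysctl) with one compare against the precomputed max_samples_per_tick. A standalone sketch of the budget logic with illustrative numbers (100 samples per tick, as at HZ=1000):

#include <stdio.h>

#define MAX_INTERRUPTS (~0ULL)

int main(void)
{
	unsigned long long interrupts = 0;
	const unsigned long long max_samples_per_tick = 100;
	int throttled = 0;

	/* 150 overflows arriving within a single tick */
	for (int i = 0; i < 150; i++) {
		if (interrupts >= max_samples_per_tick) {
			interrupts = MAX_INTERRUPTS;	/* marks throttling */
			throttled = 1;
			break;
		}
		interrupts++;
	}
	printf("throttled: %d\n", throttled);
	return 0;
}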
@@ -5062,6 +5617,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 	u64 period;
 
 	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+	if (event->state != PERF_EVENT_STATE_ACTIVE)
+		return HRTIMER_NORESTART;
+
 	event->pmu->read(event);
 
 	perf_sample_data_init(&data, 0);
@@ -5088,9 +5647,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
 	if (!is_sampling_event(event))
 		return;
 
-	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hwc->hrtimer.function = perf_swevent_hrtimer;
-
 	period = local64_read(&hwc->period_left);
 	if (period) {
 		if (period < 0)
@@ -5117,6 +5673,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
 	}
 }
 
+static void perf_swevent_init_hrtimer(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (!is_sampling_event(event))
+		return;
+
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swevent_hrtimer;
+
+	/*
+	 * Since hrtimers have a fixed rate, we can do a static freq->period
+	 * mapping and avoid the whole period adjust feedback stuff.
+	 */
+	if (event->attr.freq) {
+		long freq = event->attr.sample_freq;
+
+		event->attr.sample_period = NSEC_PER_SEC / freq;
+		hwc->sample_period = event->attr.sample_period;
+		local64_set(&hwc->period_left, hwc->sample_period);
+		event->attr.freq = 0;
+	}
+}
+
 /*
  * Software event: cpu wall time clock
  */
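
Because the hrtimer fires at a programmable fixed interval, a frequency request can be folded into a period once at init, sidestepping the adaptive period-adjustment feedback that hardware PMUs need. The mapping is simply NSEC_PER_SEC / sample_freq; a standalone illustration:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	const long freqs[] = { 1, 100, 1000, 4000 };

	for (int i = 0; i < 4; i++)
		printf("sample_freq=%4ld Hz -> sample_period=%llu ns\n",
		       freqs[i], NSEC_PER_SEC / (unsigned long long)freqs[i]);
	return 0;
}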
@@ -5169,6 +5749,8 @@ static int cpu_clock_event_init(struct perf_event *event)
 	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
 		return -ENOENT;
 
+	perf_swevent_init_hrtimer(event);
+
 	return 0;
 }
 
@@ -5224,16 +5806,9 @@ static void task_clock_event_del(struct perf_event *event, int flags)
 
 static void task_clock_event_read(struct perf_event *event)
 {
-	u64 time;
-
-	if (!in_nmi()) {
-		update_context_time(event->ctx);
-		time = event->ctx->time;
-	} else {
-		u64 now = perf_clock();
-		u64 delta = now - event->ctx->timestamp;
-		time = event->ctx->time + delta;
-	}
+	u64 now = perf_clock();
+	u64 delta = now - event->ctx->timestamp;
+	u64 time = event->ctx->time + delta;
 
 	task_clock_event_update(event, time);
 }
@@ -5246,6 +5821,8 @@ static int task_clock_event_init(struct perf_event *event)
 	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
 		return -ENOENT;
 
+	perf_swevent_init_hrtimer(event);
+
 	return 0;
 }
 
@@ -5517,17 +6094,22 @@ struct pmu *perf_init_event(struct perf_event *event)
 {
 	struct pmu *pmu = NULL;
 	int idx;
+	int ret;
 
 	idx = srcu_read_lock(&pmus_srcu);
 
 	rcu_read_lock();
 	pmu = idr_find(&pmu_idr, event->attr.type);
 	rcu_read_unlock();
-	if (pmu)
+	if (pmu) {
+		ret = pmu->event_init(event);
+		if (ret)
+			pmu = ERR_PTR(ret);
 		goto unlock;
+	}
 
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		int ret = pmu->event_init(event);
+		ret = pmu->event_init(event);
 		if (!ret)
 			goto unlock;
 
@@ -5653,7 +6235,7 @@ done:
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_inc(&perf_task_events);
+			jump_label_inc(&perf_sched_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_inc(&nr_mmap_events);
 		if (event->attr.comm)
@@ -5828,7 +6410,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	int err;
 
 	/* for future expandability... */
-	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
+	if (flags & ~PERF_FLAG_ALL)
 		return -EINVAL;
 
 	err = perf_copy_attr(attr_uptr, &attr);
@@ -5845,6 +6427,15 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
+	/*
+	 * In cgroup mode, the pid argument is used to pass the fd
+	 * opened to the cgroup directory in cgroupfs. The cpu argument
+	 * designates the cpu on which to monitor threads from that
+	 * cgroup.
+	 */
+	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
+		return -EINVAL;
+
 	event_fd = get_unused_fd_flags(O_RDWR);
 	if (event_fd < 0)
 		return event_fd;
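
For userspace, PERF_FLAG_PID_CGROUP repurposes the pid argument of perf_event_open() to carry an fd on a cgroupfs directory, and cpu must name a real CPU. A hedged sketch of a caller (the attr setup is illustrative; the flag value mirrors the uapi definition that accompanies this patch in include/linux/perf_event.h, which is not part of this diff):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

#ifndef PERF_FLAG_PID_CGROUP
#define PERF_FLAG_PID_CGROUP (1U << 2)
#endif

int open_cgroup_counter(const char *cgrp_dir, int cpu)
{
	struct perf_event_attr attr;
	int cgrp_fd, ev_fd;

	/* pid argument = fd of the cgroup directory in cgroupfs */
	cgrp_fd = open(cgrp_dir, O_RDONLY);
	if (cgrp_fd < 0)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* cpu must be >= 0: cgroup events are per-cpu, system-wide */
	ev_fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu,
			-1 /* group_fd */, PERF_FLAG_PID_CGROUP);
	close(cgrp_fd);
	return ev_fd;
}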
@@ -5862,7 +6453,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		group_leader = NULL;
 	}
 
-	if (pid != -1) {
+	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
 		task = find_lively_task_by_vpid(pid);
 		if (IS_ERR(task)) {
 			err = PTR_ERR(task);
@@ -5876,6 +6467,19 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_task;
 	}
 
+	if (flags & PERF_FLAG_PID_CGROUP) {
+		err = perf_cgroup_connect(pid, event, &attr, group_leader);
+		if (err)
+			goto err_alloc;
+		/*
+		 * one more event:
+		 * - that has cgroup constraint on event->cpu
+		 * - that may need work on context switch
+		 */
+		atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
+		jump_label_inc(&perf_sched_events);
+	}
+
 	/*
 	 * Special case software events and allow them to be part of
 	 * any hardware group.
@@ -5961,10 +6565,10 @@ SYSCALL_DEFINE5(perf_event_open,
 		struct perf_event_context *gctx = group_leader->ctx;
 
 		mutex_lock(&gctx->mutex);
-		perf_event_remove_from_context(group_leader);
+		perf_remove_from_context(group_leader);
 		list_for_each_entry(sibling, &group_leader->sibling_list,
 				    group_entry) {
-			perf_event_remove_from_context(sibling);
+			perf_remove_from_context(sibling);
 			put_ctx(gctx);
 		}
 		mutex_unlock(&gctx->mutex);
@@ -5987,6 +6591,7 @@ SYSCALL_DEFINE5(perf_event_open,
 
 	perf_install_in_context(ctx, event, cpu);
 	++ctx->generation;
+	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
 	event->owner = current;
@@ -6012,6 +6617,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	return event_fd;
 
 err_context:
+	perf_unpin_context(ctx);
 	put_ctx(ctx);
 err_alloc:
 	free_event(event);
@@ -6062,6 +6668,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, event, cpu);
 	++ctx->generation;
+	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
 	return event;
@@ -6115,7 +6722,7 @@ __perf_event_exit_task(struct perf_event *child_event,
 {
 	struct perf_event *parent_event;
 
-	perf_event_remove_from_context(child_event);
+	perf_remove_from_context(child_event);
 
 	parent_event = child_event->parent;
 	/*
@@ -6422,7 +7029,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 		return 0;
 	}
 
-        child_ctx = child->perf_event_ctxp[ctxn];
+	child_ctx = child->perf_event_ctxp[ctxn];
 	if (!child_ctx) {
 		/*
 		 * This is executed from the parent task context, so
@@ -6537,6 +7144,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 	mutex_unlock(&parent_ctx->mutex);
 
 	perf_unpin_context(parent_ctx);
+	put_ctx(parent_ctx);
 
 	return ret;
 }
@@ -6606,9 +7214,9 @@ static void __perf_event_exit_context(void *__info)
 	perf_pmu_rotate_stop(ctx->pmu);
 
 	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-		__perf_event_remove_from_context(event);
+		__perf_remove_from_context(event);
 	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
-		__perf_event_remove_from_context(event);
+		__perf_remove_from_context(event);
 }
 
 static void perf_event_exit_cpu_context(int cpu)
@@ -6732,3 +7340,83 @@ unlock:
 	return ret;
 }
 device_initcall(perf_event_sysfs_init);
+
+#ifdef CONFIG_CGROUP_PERF
+static struct cgroup_subsys_state *perf_cgroup_create(
+	struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	struct perf_cgroup *jc;
+
+	jc = kzalloc(sizeof(*jc), GFP_KERNEL);
+	if (!jc)
+		return ERR_PTR(-ENOMEM);
+
+	jc->info = alloc_percpu(struct perf_cgroup_info);
+	if (!jc->info) {
+		kfree(jc);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return &jc->css;
+}
+
+static void perf_cgroup_destroy(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+	struct perf_cgroup *jc;
+	jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
+			  struct perf_cgroup, css);
+	free_percpu(jc->info);
+	kfree(jc);
+}
+
+static int __perf_cgroup_move(void *info)
+{
+	struct task_struct *task = info;
+	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+	return 0;
+}
+
+static void perf_cgroup_move(struct task_struct *task)
+{
+	task_function_call(task, __perf_cgroup_move, task);
+}
+
+static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task,
+		bool threadgroup)
+{
+	perf_cgroup_move(task);
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+			perf_cgroup_move(c);
+		}
+		rcu_read_unlock();
+	}
+}
+
+static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task)
+{
+	/*
+	 * cgroup_exit() is called in the copy_process() failure path.
+	 * Ignore this case since the task hasn't run yet, this avoids
+	 * trying to poke a half-freed task state from generic code.
+	 */
+	if (!(task->flags & PF_EXITING))
+		return;
+
+	perf_cgroup_move(task);
+}
+
+struct cgroup_subsys perf_subsys = {
+	.name		= "perf_event",
+	.subsys_id	= perf_subsys_id,
+	.create		= perf_cgroup_create,
+	.destroy	= perf_cgroup_destroy,
+	.exit		= perf_cgroup_exit,
+	.attach		= perf_cgroup_attach,
+};
+#endif /* CONFIG_CGROUP_PERF */