author    Stephane Eranian <eranian@google.com>      2011-02-14 04:20:01 -0500
committer Ingo Molnar <mingo@elte.hu>                2011-02-16 07:30:48 -0500
commit    e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25 (patch)
tree      5862b4cddb7c88e0513e503cb3f46c60da2eeb6f
parent    d41d5a01631af821d3a3447e6613a316f5ee6c25 (diff)
perf: Add cgroup support
This kernel patch adds the ability to filter monitoring based on
container groups (cgroups). This is for use in per-cpu mode only.

The cgroup to monitor is passed as a file descriptor in the pid
argument to the syscall. The file descriptor must be opened to
the cgroup name in the cgroup filesystem. For instance, if the
cgroup name is foo and cgroupfs is mounted in /cgroup, then the
file descriptor is opened to /cgroup/foo. Cgroup mode is
activated by passing PERF_FLAG_PID_CGROUP in the flags argument
to the syscall.

For instance to measure in cgroup foo on CPU1 assuming
cgroupfs is mounted under /cgroup:

struct perf_event_attr attr;
int cgroup_fd, fd;

cgroup_fd = open("/cgroup/foo", O_RDONLY);
fd = perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP);
close(cgroup_fd);

Signed-off-by: Stephane Eranian <eranian@google.com>
[ added perf_cgroup_{exit,attach} ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4d590250.114ddf0a.689e.4482@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
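[Editor's note: a fuller user-space sketch of the mode described above, not part of the patch. It assumes cgroupfs is mounted at /cgroup as in the example, that the kernel was built with CONFIG_CGROUP_PERF=y, and that the headers in use already define PERF_FLAG_PID_CGROUP; the raw-syscall wrapper, the choice of the cycle counter and the one-second window are illustrative assumptions, not anything the interface mandates.]

/* Illustrative sketch only; see assumptions above. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* perf_event_open() has no glibc wrapper; invoke it via syscall(2) */
static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
                               int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        uint64_t count;
        int cgroup_fd, fd;

        memset(&attr, 0, sizeof(attr));
        attr.size   = sizeof(attr);
        attr.type   = PERF_TYPE_HARDWARE;       /* cycle counter: arbitrary choice */
        attr.config = PERF_COUNT_HW_CPU_CYCLES;

        cgroup_fd = open("/cgroup/foo", O_RDONLY);
        if (cgroup_fd < 0)
                return 1;

        /* pid argument carries the cgroup fd, cpu selects the monitored CPU */
        fd = sys_perf_event_open(&attr, cgroup_fd, 1, -1, PERF_FLAG_PID_CGROUP);
        close(cgroup_fd);
        if (fd < 0)
                return 1;

        sleep(1);       /* arbitrary measurement window */
        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("cycles in cgroup foo on CPU1: %llu\n",
                       (unsigned long long)count);
        close(fd);
        return 0;
}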
-rw-r--r--  include/linux/cgroup.h         |   1
-rw-r--r--  include/linux/cgroup_subsys.h  |   4
-rw-r--r--  include/linux/perf_event.h     |  33
-rw-r--r--  init/Kconfig                   |  10
-rw-r--r--  kernel/cgroup.c                |  23
-rw-r--r--  kernel/perf_event.c            | 638
6 files changed, 671 insertions(+), 38 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 38117d937332..e654fa239916 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -627,6 +627,7 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
 /* Get id and depth of css */
 unsigned short css_id(struct cgroup_subsys_state *css);
 unsigned short css_depth(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
 
 #else /* !CONFIG_CGROUPS */
 
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ccefff02b6cb..cdbfcb8780ec 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -65,4 +65,8 @@ SUBSYS(net_cls)
 SUBSYS(blkio)
 #endif
 
+#ifdef CONFIG_CGROUP_PERF
+SUBSYS(perf)
+#endif
+
 /* */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dda5b0a3ff60..38c8b2554842 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -464,6 +464,7 @@ enum perf_callchain_context {
 
 #define PERF_FLAG_FD_NO_GROUP	(1U << 0)
 #define PERF_FLAG_FD_OUTPUT	(1U << 1)
+#define PERF_FLAG_PID_CGROUP	(1U << 2) /* pid=cgroup id, per-cpu mode only */
 
 #ifdef __KERNEL__
 /*
@@ -471,6 +472,7 @@ enum perf_callchain_context {
  */
 
 #ifdef CONFIG_PERF_EVENTS
+# include <linux/cgroup.h>
 # include <asm/perf_event.h>
 # include <asm/local64.h>
 #endif
@@ -716,6 +718,22 @@ struct swevent_hlist {
 #define PERF_ATTACH_GROUP	0x02
 #define PERF_ATTACH_TASK	0x04
 
+#ifdef CONFIG_CGROUP_PERF
+/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+	u64 time;
+	u64 timestamp;
+};
+
+struct perf_cgroup {
+	struct cgroup_subsys_state css;
+	struct perf_cgroup_info *info;	/* timing info, one per cpu */
+};
+#endif
+
 /**
  * struct perf_event - performance event kernel representation:
  */
@@ -832,6 +850,11 @@ struct perf_event {
 	struct event_filter		*filter;
 #endif
 
+#ifdef CONFIG_CGROUP_PERF
+	struct perf_cgroup		*cgrp; /* cgroup event is attach to */
+	int				cgrp_defer_enabled;
+#endif
+
 #endif /* CONFIG_PERF_EVENTS */
 };
 
@@ -886,6 +909,7 @@ struct perf_event_context {
 	u64				generation;
 	int				pin_count;
 	struct rcu_head			rcu_head;
+	int				nr_cgroups; /* cgroup events present */
 };
 
 /*
@@ -905,6 +929,9 @@ struct perf_cpu_context {
 	struct list_head		rotation_list;
 	int				jiffies_interval;
 	struct pmu			*active_pmu;
+#ifdef CONFIG_CGROUP_PERF
+	struct perf_cgroup		*cgrp;
+#endif
 };
 
 struct perf_output_handle {
@@ -1040,11 +1067,11 @@ have_event:
 	__perf_sw_event(event_id, nr, nmi, regs, addr);
 }
 
-extern atomic_t perf_task_events;
+extern atomic_t perf_sched_events;
 
 static inline void perf_event_task_sched_in(struct task_struct *task)
 {
-	COND_STMT(&perf_task_events, __perf_event_task_sched_in(task));
+	COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task));
 }
 
 static inline
@@ -1052,7 +1079,7 @@ void perf_event_task_sched_out(struct task_struct *task, struct task_struct *nex
 {
 	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
 
-	COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next));
+	COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next));
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
diff --git a/init/Kconfig b/init/Kconfig
index be788c0957d4..20d6bd919b8d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -683,6 +683,16 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
 	  select this option (if, for some reason, they need to disable it
 	  then noswapaccount does the trick).
 
+config CGROUP_PERF
+	bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
+	depends on PERF_EVENTS && CGROUPS
+	help
+	  This option extends the per-cpu mode to restrict monitoring to
+	  threads which belong to the cgroup specificied and run on the
+	  designated cpu.
+
+	  Say N if unsure.
+
 menuconfig CGROUP_SCHED
 	bool "Group CPU scheduler"
 	depends on EXPERIMENTAL
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f6495f33a355..95362d15128c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4818,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
 	return ret;
 }
 
+/*
+ * get corresponding css from file open on cgroupfs directory
+ */
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
+{
+	struct cgroup *cgrp;
+	struct inode *inode;
+	struct cgroup_subsys_state *css;
+
+	inode = f->f_dentry->d_inode;
+	/* check in cgroup filesystem dir */
+	if (inode->i_op != &cgroup_dir_inode_operations)
+		return ERR_PTR(-EBADF);
+
+	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+		return ERR_PTR(-EINVAL);
+
+	/* get cgroup */
+	cgrp = __d_cgrp(f->f_dentry);
+	css = cgrp->subsys[id];
+	return css ? css : ERR_PTR(-ENOENT);
+}
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
 						   struct cgroup *cont)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3d3f282fa50e..65dcdc76d709 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -111,13 +111,23 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
 	return data.ret;
 }
 
+#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
+		       PERF_FLAG_FD_OUTPUT  |\
+		       PERF_FLAG_PID_CGROUP)
+
 enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
-atomic_t perf_task_events __read_mostly;
+/*
+ * perf_sched_events : >0 events exist
+ * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
+ */
+atomic_t perf_sched_events __read_mostly;
+static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
@@ -148,7 +158,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 			      enum event_type_t event_type);
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type);
+			     enum event_type_t event_type,
+			     struct task_struct *task);
+
+static void update_context_time(struct perf_event_context *ctx);
+static u64 perf_event_time(struct perf_event *event);
 
 void __weak perf_event_print_debug(void) { }
 
@@ -162,6 +176,338 @@ static inline u64 perf_clock(void)
 	return local_clock();
 }
 
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+#ifdef CONFIG_CGROUP_PERF
+
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+	return container_of(task_subsys_state(task, perf_subsys_id),
+			struct perf_cgroup, css);
+}
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+	return !event->cgrp || event->cgrp == cpuctx->cgrp;
+}
+
+static inline void perf_get_cgroup(struct perf_event *event)
+{
+	css_get(&event->cgrp->css);
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{
+	css_put(&event->cgrp->css);
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{
+	perf_put_cgroup(event);
+	event->cgrp = NULL;
+}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+	return event->cgrp != NULL;
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+	struct perf_cgroup_info *t;
+
+	t = per_cpu_ptr(event->cgrp->info, event->cpu);
+	return t->time;
+}
+
+static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+{
+	struct perf_cgroup_info *info;
+	u64 now;
+
+	now = perf_clock();
+
+	info = this_cpu_ptr(cgrp->info);
+
+	info->time += now - info->timestamp;
+	info->timestamp = now;
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+	struct perf_cgroup *cgrp_out = cpuctx->cgrp;
+	if (cgrp_out)
+		__update_cgrp_time(cgrp_out);
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+	struct perf_cgroup *cgrp = perf_cgroup_from_task(current);
+	/*
+	 * do not update time when cgroup is not active
+	 */
+	if (!event->cgrp || cgrp != event->cgrp)
+		return;
+
+	__update_cgrp_time(event->cgrp);
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
+{
+	struct perf_cgroup *cgrp;
+	struct perf_cgroup_info *info;
+
+	if (!task)
+		return;
+
+	cgrp = perf_cgroup_from_task(task);
+	info = this_cpu_ptr(cgrp->info);
+	info->timestamp = now;
+}
+
+#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
+#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
+
+/*
+ * reschedule events based on the cgroup constraint of task.
+ *
+ * mode SWOUT : schedule out everything
+ * mode SWIN : schedule in based on cgroup for next
+ */
+void perf_cgroup_switch(struct task_struct *task, int mode)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	/*
+	 * disable interrupts to avoid geting nr_cgroup
+	 * changes via __perf_event_disable(). Also
+	 * avoids preemption.
+	 */
+	local_irq_save(flags);
+
+	/*
+	 * we reschedule only in the presence of cgroup
+	 * constrained events.
+	 */
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		perf_pmu_disable(cpuctx->ctx.pmu);
+
+		/*
+		 * perf_cgroup_events says at least one
+		 * context on this CPU has cgroup events.
+		 *
+		 * ctx->nr_cgroups reports the number of cgroup
+		 * events for a context.
+		 */
+		if (cpuctx->ctx.nr_cgroups > 0) {
+
+			if (mode & PERF_CGROUP_SWOUT) {
+				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+				/*
+				 * must not be done before ctxswout due
+				 * to event_filter_match() in event_sched_out()
+				 */
+				cpuctx->cgrp = NULL;
+			}
+
+			if (mode & PERF_CGROUP_SWIN) {
+				/* set cgrp before ctxsw in to
+				 * allow event_filter_match() to not
+				 * have to pass task around
+				 */
+				cpuctx->cgrp = perf_cgroup_from_task(task);
+				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+			}
+		}
+
+		perf_pmu_enable(cpuctx->ctx.pmu);
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+	perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+	perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+}
+
+static inline int perf_cgroup_connect(int fd, struct perf_event *event,
+				      struct perf_event_attr *attr,
+				      struct perf_event *group_leader)
+{
+	struct perf_cgroup *cgrp;
+	struct cgroup_subsys_state *css;
+	struct file *file;
+	int ret = 0, fput_needed;
+
+	file = fget_light(fd, &fput_needed);
+	if (!file)
+		return -EBADF;
+
+	css = cgroup_css_from_dir(file, perf_subsys_id);
+	if (IS_ERR(css))
+		return PTR_ERR(css);
+
+	cgrp = container_of(css, struct perf_cgroup, css);
+	event->cgrp = cgrp;
+
+	/*
+	 * all events in a group must monitor
+	 * the same cgroup because a task belongs
+	 * to only one perf cgroup at a time
+	 */
+	if (group_leader && group_leader->cgrp != cgrp) {
+		perf_detach_cgroup(event);
+		ret = -EINVAL;
+	} else {
+		/* must be done before we fput() the file */
+		perf_get_cgroup(event);
+	}
+	fput_light(file, fput_needed);
+	return ret;
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+	struct perf_cgroup_info *t;
+	t = per_cpu_ptr(event->cgrp->info, event->cpu);
+	event->shadow_ctx_time = now - t->timestamp;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+	/*
+	 * when the current task's perf cgroup does not match
+	 * the event's, we need to remember to call the
+	 * perf_mark_enable() function the first time a task with
+	 * a matching perf cgroup is scheduled in.
+	 */
+	if (is_cgroup_event(event) && !perf_cgroup_match(event))
+		event->cgrp_defer_enabled = 1;
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+			 struct perf_event_context *ctx)
+{
+	struct perf_event *sub;
+	u64 tstamp = perf_event_time(event);
+
+	if (!event->cgrp_defer_enabled)
+		return;
+
+	event->cgrp_defer_enabled = 0;
+
+	event->tstamp_enabled = tstamp - event->total_time_enabled;
+	list_for_each_entry(sub, &event->sibling_list, group_entry) {
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+			sub->cgrp_defer_enabled = 0;
+		}
+	}
+}
+#else /* !CONFIG_CGROUP_PERF */
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+	return true;
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+}
+
+static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
+				      struct perf_event_attr *attr,
+				      struct perf_event *group_leader)
+{
+	return -EINVAL;
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
+{
+}
+
+void
+perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+{
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+	return 0;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+			 struct perf_event_context *ctx)
+{
+}
+#endif
+
 void perf_pmu_disable(struct pmu *pmu)
 {
 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -343,6 +689,10 @@ static void update_context_time(struct perf_event_context *ctx)
 static u64 perf_event_time(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
+
+	if (is_cgroup_event(event))
+		return perf_cgroup_event_time(event);
+
 	return ctx ? ctx->time : 0;
 }
 
@@ -357,9 +707,20 @@ static void update_event_times(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE ||
 	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 		return;
-
-	if (ctx->is_active)
+	/*
+	 * in cgroup mode, time_enabled represents
+	 * the time the event was enabled AND active
+	 * tasks were in the monitored cgroup. This is
+	 * independent of the activity of the context as
+	 * there may be a mix of cgroup and non-cgroup events.
+	 *
+	 * That is why we treat cgroup events differently
+	 * here.
+	 */
+	if (is_cgroup_event(event))
 		run_end = perf_event_time(event);
+	else if (ctx->is_active)
+		run_end = ctx->time;
 	else
 		run_end = event->tstamp_stopped;
 
@@ -371,6 +732,7 @@ static void update_event_times(struct perf_event *event)
 		run_end = perf_event_time(event);
 
 	event->total_time_running = run_end - event->tstamp_running;
+
 }
 
 /*
@@ -419,6 +781,17 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		list_add_tail(&event->group_entry, list);
 	}
 
+	if (is_cgroup_event(event)) {
+		ctx->nr_cgroups++;
+		/*
+		 * one more event:
+		 * - that has cgroup constraint on event->cpu
+		 * - that may need work on context switch
+		 */
+		atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
+		jump_label_inc(&perf_sched_events);
+	}
+
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	if (!ctx->nr_events)
 		perf_pmu_rotate_start(ctx->pmu);
@@ -545,6 +918,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
+	if (is_cgroup_event(event)) {
+		ctx->nr_cgroups--;
+		atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
+		jump_label_dec(&perf_sched_events);
+	}
+
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -616,7 +995,8 @@ out:
 static inline int
 event_filter_match(struct perf_event *event)
 {
-	return event->cpu == -1 || event->cpu == smp_processor_id();
+	return (event->cpu == -1 || event->cpu == smp_processor_id())
+	    && perf_cgroup_match(event);
 }
 
 static void
@@ -634,7 +1014,7 @@ event_sched_out(struct perf_event *event,
 	 */
 	if (event->state == PERF_EVENT_STATE_INACTIVE
 	    && !event_filter_match(event)) {
-		delta = ctx->time - event->tstamp_stopped;
+		delta = tstamp - event->tstamp_stopped;
 		event->tstamp_running += delta;
 		event->tstamp_stopped = tstamp;
 	}
@@ -678,12 +1058,6 @@ group_sched_out(struct perf_event *group_event,
 	cpuctx->exclusive = 0;
 }
 
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
-	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
 /*
  * Cross CPU call to remove a performance event
  *
@@ -783,6 +1157,7 @@ static int __perf_event_disable(void *info)
 	 */
 	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
 		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
 		update_group_times(event);
 		if (event == event->group_leader)
 			group_sched_out(event, cpuctx, ctx);
@@ -851,6 +1226,41 @@ retry:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
+static void perf_set_shadow_time(struct perf_event *event,
+				 struct perf_event_context *ctx,
+				 u64 tstamp)
+{
+	/*
+	 * use the correct time source for the time snapshot
+	 *
+	 * We could get by without this by leveraging the
+	 * fact that to get to this function, the caller
+	 * has most likely already called update_context_time()
+	 * and update_cgrp_time_xx() and thus both timestamp
+	 * are identical (or very close). Given that tstamp is,
+	 * already adjusted for cgroup, we could say that:
+	 *    tstamp - ctx->timestamp
+	 * is equivalent to
+	 *    tstamp - cgrp->timestamp.
+	 *
+	 * Then, in perf_output_read(), the calculation would
+	 * work with no changes because:
+	 * - event is guaranteed scheduled in
+	 * - no scheduled out in between
+	 * - thus the timestamp would be the same
+	 *
+	 * But this is a bit hairy.
+	 *
+	 * So instead, we have an explicit cgroup call to remain
+	 * within the time time source all along. We believe it
+	 * is cleaner and simpler to understand.
+	 */
+	if (is_cgroup_event(event))
+		perf_cgroup_set_shadow_time(event, tstamp);
+	else
+		event->shadow_ctx_time = tstamp - ctx->timestamp;
+}
+
 #define MAX_INTERRUPTS (~0ULL)
 
 static void perf_log_throttle(struct perf_event *event, int enable);
@@ -891,7 +1301,7 @@ event_sched_in(struct perf_event *event,
 
 	event->tstamp_running += tstamp - event->tstamp_stopped;
 
-	event->shadow_ctx_time = tstamp - ctx->timestamp;
+	perf_set_shadow_time(event, ctx, tstamp);
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
@@ -1012,7 +1422,8 @@ static void add_event_to_ctx(struct perf_event *event,
 		event->tstamp_stopped = tstamp;
 }
 
-static void perf_event_context_sched_in(struct perf_event_context *ctx);
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+					struct task_struct *tsk);
 
 /*
  * Cross CPU call to install and enable a performance event
@@ -1033,11 +1444,17 @@ static int __perf_install_in_context(void *info)
 	 * which do context switches with IRQs enabled.
 	 */
 	if (ctx->task && !cpuctx->task_ctx)
-		perf_event_context_sched_in(ctx);
+		perf_event_context_sched_in(ctx, ctx->task);
 
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	update_context_time(ctx);
+	/*
+	 * update cgrp time only if current cgrp
+	 * matches event->cgrp. Must be done before
+	 * calling add_event_to_ctx()
+	 */
+	update_cgrp_time_from_event(event);
 
 	add_event_to_ctx(event, ctx);
 
@@ -1175,10 +1592,19 @@ static int __perf_event_enable(void *info)
 
 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
 		goto unlock;
+
+	/*
+	 * set current task's cgroup time reference point
+	 */
+	perf_cgroup_set_timestamp(current, perf_clock());
+
 	__perf_event_mark_enabled(event, ctx);
 
-	if (!event_filter_match(event))
+	if (!event_filter_match(event)) {
+		if (is_cgroup_event(event))
+			perf_cgroup_defer_enabled(event);
 		goto unlock;
+	}
 
 	/*
 	 * If the event is in a group and isn't the group leader,
@@ -1307,6 +1733,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	if (likely(!ctx->nr_events))
 		goto out;
 	update_context_time(ctx);
+	update_cgrp_time_from_cpuctx(cpuctx);
 
 	if (!ctx->nr_active)
 		goto out;
@@ -1496,6 +1923,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
 
 	for_each_task_context_nr(ctxn)
 		perf_event_context_sched_out(task, ctxn, next);
+
+	/*
+	 * if cgroup events exist on this CPU, then we need
+	 * to check if we have to switch out PMU state.
+	 * cgroup event are system-wide mode only
+	 */
+	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+		perf_cgroup_sched_out(task);
 }
 
 static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1534,6 +1969,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 		if (!event_filter_match(event))
 			continue;
 
+		/* may need to reset tstamp_enabled */
+		if (is_cgroup_event(event))
+			perf_cgroup_mark_enabled(event, ctx);
+
 		if (group_can_go_on(event, cpuctx, 1))
 			group_sched_in(event, cpuctx, ctx);
 
@@ -1566,6 +2005,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		if (!event_filter_match(event))
 			continue;
 
+		/* may need to reset tstamp_enabled */
+		if (is_cgroup_event(event))
+			perf_cgroup_mark_enabled(event, ctx);
+
 		if (group_can_go_on(event, cpuctx, can_add_hw)) {
 			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
@@ -1576,15 +2019,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
-	     enum event_type_t event_type)
+	     enum event_type_t event_type,
+	     struct task_struct *task)
 {
+	u64 now;
+
 	raw_spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	if (likely(!ctx->nr_events))
 		goto out;
 
-	ctx->timestamp = perf_clock();
-
+	now = perf_clock();
+	ctx->timestamp = now;
+	perf_cgroup_set_timestamp(task, now);
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
@@ -1601,11 +2048,12 @@ out:
 }
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
-			     enum event_type_t event_type)
+			     enum event_type_t event_type,
+			     struct task_struct *task)
 {
 	struct perf_event_context *ctx = &cpuctx->ctx;
 
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, task);
 }
 
 static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1617,11 +2065,12 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
 	if (cpuctx->task_ctx == ctx)
 		return;
 
-	ctx_sched_in(ctx, cpuctx, event_type);
+	ctx_sched_in(ctx, cpuctx, event_type, NULL);
 	cpuctx->task_ctx = ctx;
 }
 
-static void perf_event_context_sched_in(struct perf_event_context *ctx)
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+					struct task_struct *task)
 {
 	struct perf_cpu_context *cpuctx;
 
@@ -1637,9 +2086,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx)
 	 */
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
-	ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
-	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+	ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
 
 	cpuctx->task_ctx = ctx;
 
@@ -1672,8 +2121,15 @@ void __perf_event_task_sched_in(struct task_struct *task)
 		if (likely(!ctx))
 			continue;
 
-		perf_event_context_sched_in(ctx);
+		perf_event_context_sched_in(ctx, task);
 	}
+	/*
+	 * if cgroup events exist on this CPU, then we need
+	 * to check if we have to switch in PMU state.
+	 * cgroup event are system-wide mode only
+	 */
+	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+		perf_cgroup_sched_in(task);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -1873,7 +2329,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 	if (ctx)
 		rotate_ctx(ctx);
 
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
 	if (ctx)
 		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
 
@@ -1952,7 +2408,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 
 	raw_spin_unlock(&ctx->lock);
 
-	perf_event_context_sched_in(ctx);
+	perf_event_context_sched_in(ctx, ctx->task);
 out:
 	local_irq_restore(flags);
 }
@@ -1977,8 +2433,10 @@ static void __perf_event_read(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	if (ctx->is_active)
+	if (ctx->is_active) {
 		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
+	}
 	update_event_times(event);
 	if (event->state == PERF_EVENT_STATE_ACTIVE)
 		event->pmu->read(event);
@@ -2009,8 +2467,10 @@ static u64 perf_event_read(struct perf_event *event)
 	 * (e.g., thread is blocked), in that case
 	 * we cannot update context time
 	 */
-	if (ctx->is_active)
+	if (ctx->is_active) {
 		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
+	}
 	update_event_times(event);
 	raw_spin_unlock_irqrestore(&ctx->lock, flags);
 	}
@@ -2395,7 +2855,7 @@ static void free_event(struct perf_event *event)
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_dec(&perf_task_events);
+			jump_label_dec(&perf_sched_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_dec(&nr_mmap_events);
 		if (event->attr.comm)
@@ -2411,6 +2871,9 @@ static void free_event(struct perf_event *event)
 		event->buffer = NULL;
 	}
 
+	if (is_cgroup_event(event))
+		perf_detach_cgroup(event);
+
 	if (event->destroy)
 		event->destroy(event);
 
@@ -5300,6 +5763,7 @@ static void task_clock_event_read(struct perf_event *event)
 
 	if (!in_nmi()) {
 		update_context_time(event->ctx);
+		update_cgrp_time_from_event(event);
 		time = event->ctx->time;
 	} else {
 		u64 now = perf_clock();
@@ -5725,7 +6189,7 @@ done:
 
 	if (!event->parent) {
 		if (event->attach_state & PERF_ATTACH_TASK)
-			jump_label_inc(&perf_task_events);
+			jump_label_inc(&perf_sched_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_inc(&nr_mmap_events);
 		if (event->attr.comm)
@@ -5900,7 +6364,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	int err;
 
 	/* for future expandability... */
-	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
+	if (flags & ~PERF_FLAG_ALL)
 		return -EINVAL;
 
 	err = perf_copy_attr(attr_uptr, &attr);
@@ -5917,6 +6381,15 @@ SYSCALL_DEFINE5(perf_event_open,
 		return -EINVAL;
 	}
 
+	/*
+	 * In cgroup mode, the pid argument is used to pass the fd
+	 * opened to the cgroup directory in cgroupfs. The cpu argument
+	 * designates the cpu on which to monitor threads from that
+	 * cgroup.
+	 */
+	if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
+		return -EINVAL;
+
 	event_fd = get_unused_fd_flags(O_RDWR);
 	if (event_fd < 0)
 		return event_fd;
@@ -5934,7 +6407,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		group_leader = NULL;
 	}
 
-	if (pid != -1) {
+	if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
 		task = find_lively_task_by_vpid(pid);
 		if (IS_ERR(task)) {
 			err = PTR_ERR(task);
@@ -5948,6 +6421,12 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_task;
 	}
 
+	if (flags & PERF_FLAG_PID_CGROUP) {
+		err = perf_cgroup_connect(pid, event, &attr, group_leader);
+		if (err)
+			goto err_alloc;
+	}
+
 	/*
 	 * Special case software events and allow them to be part of
 	 * any hardware group.
@@ -6808,3 +7287,92 @@ unlock:
 	return ret;
 }
 device_initcall(perf_event_sysfs_init);
+
+#ifdef CONFIG_CGROUP_PERF
+static struct cgroup_subsys_state *perf_cgroup_create(
+	struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	struct perf_cgroup *jc;
+	struct perf_cgroup_info *t;
+	int c;
+
+	jc = kmalloc(sizeof(*jc), GFP_KERNEL);
+	if (!jc)
+		return ERR_PTR(-ENOMEM);
+
+	memset(jc, 0, sizeof(*jc));
+
+	jc->info = alloc_percpu(struct perf_cgroup_info);
+	if (!jc->info) {
+		kfree(jc);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	for_each_possible_cpu(c) {
+		t = per_cpu_ptr(jc->info, c);
+		t->time = 0;
+		t->timestamp = 0;
+	}
+	return &jc->css;
+}
+
+static void perf_cgroup_destroy(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+	struct perf_cgroup *jc;
+	jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
+			  struct perf_cgroup, css);
+	free_percpu(jc->info);
+	kfree(jc);
+}
+
+static int __perf_cgroup_move(void *info)
+{
+	struct task_struct *task = info;
+	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+	return 0;
+}
+
+static void perf_cgroup_move(struct task_struct *task)
+{
+	task_function_call(task, __perf_cgroup_move, task);
+}
+
+static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task,
+		bool threadgroup)
+{
+	perf_cgroup_move(task);
+	if (threadgroup) {
+		struct task_struct *c;
+		rcu_read_lock();
+		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+			perf_cgroup_move(c);
+		}
+		rcu_read_unlock();
+	}
+}
+
+static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task)
+{
+	/*
+	 * cgroup_exit() is called in the copy_process() failure path.
+	 * Ignore this case since the task hasn't ran yet, this avoids
+	 * trying to poke a half freed task state from generic code.
+	 */
+	if (!(task->flags & PF_EXITING))
+		return;
+
+	perf_cgroup_move(task);
+}
+
+struct cgroup_subsys perf_subsys = {
+	.name = "perf_event",
+	.subsys_id = perf_subsys_id,
+	.create = perf_cgroup_create,
+	.destroy = perf_cgroup_destroy,
+	.exit = perf_cgroup_exit,
+	.attach = perf_cgroup_attach,
+};
+#endif /* CONFIG_CGROUP_PERF */