-rw-r--r--  include/linux/cgroup.h        |   1
-rw-r--r--  include/linux/cgroup_subsys.h |   4
-rw-r--r--  include/linux/perf_event.h    |  33
-rw-r--r--  init/Kconfig                  |  10
-rw-r--r--  kernel/cgroup.c               |  23
-rw-r--r--  kernel/perf_event.c           | 638
6 files changed, 671 insertions, 38 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 38117d937332..e654fa239916 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -627,6 +627,7 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg, | |||
627 | /* Get id and depth of css */ | 627 | /* Get id and depth of css */ |
628 | unsigned short css_id(struct cgroup_subsys_state *css); | 628 | unsigned short css_id(struct cgroup_subsys_state *css); |
629 | unsigned short css_depth(struct cgroup_subsys_state *css); | 629 | unsigned short css_depth(struct cgroup_subsys_state *css); |
630 | struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id); | ||
630 | 631 | ||
631 | #else /* !CONFIG_CGROUPS */ | 632 | #else /* !CONFIG_CGROUPS */ |
632 | 633 | ||
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ccefff02b6cb..cdbfcb8780ec 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -65,4 +65,8 @@ SUBSYS(net_cls) | |||
65 | SUBSYS(blkio) | 65 | SUBSYS(blkio) |
66 | #endif | 66 | #endif |
67 | 67 | ||
68 | #ifdef CONFIG_CGROUP_PERF | ||
69 | SUBSYS(perf) | ||
70 | #endif | ||
71 | |||
68 | /* */ | 72 | /* */ |
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dda5b0a3ff60..38c8b2554842 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -464,6 +464,7 @@ enum perf_callchain_context { | |||
464 | 464 | ||
465 | #define PERF_FLAG_FD_NO_GROUP (1U << 0) | 465 | #define PERF_FLAG_FD_NO_GROUP (1U << 0) |
466 | #define PERF_FLAG_FD_OUTPUT (1U << 1) | 466 | #define PERF_FLAG_FD_OUTPUT (1U << 1) |
467 | #define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=cgroup id, per-cpu mode only */ | ||
467 | 468 | ||
468 | #ifdef __KERNEL__ | 469 | #ifdef __KERNEL__ |
469 | /* | 470 | /* |
@@ -471,6 +472,7 @@ enum perf_callchain_context { | |||
471 | */ | 472 | */ |
472 | 473 | ||
473 | #ifdef CONFIG_PERF_EVENTS | 474 | #ifdef CONFIG_PERF_EVENTS |
475 | # include <linux/cgroup.h> | ||
474 | # include <asm/perf_event.h> | 476 | # include <asm/perf_event.h> |
475 | # include <asm/local64.h> | 477 | # include <asm/local64.h> |
476 | #endif | 478 | #endif |
@@ -716,6 +718,22 @@ struct swevent_hlist { | |||
716 | #define PERF_ATTACH_GROUP 0x02 | 718 | #define PERF_ATTACH_GROUP 0x02 |
717 | #define PERF_ATTACH_TASK 0x04 | 719 | #define PERF_ATTACH_TASK 0x04 |
718 | 720 | ||
721 | #ifdef CONFIG_CGROUP_PERF | ||
722 | /* | ||
723 | * perf_cgroup_info keeps track of time_enabled for a cgroup. | ||
724 | * This is a per-cpu dynamically allocated data structure. | ||
725 | */ | ||
726 | struct perf_cgroup_info { | ||
727 | u64 time; | ||
728 | u64 timestamp; | ||
729 | }; | ||
730 | |||
731 | struct perf_cgroup { | ||
732 | struct cgroup_subsys_state css; | ||
733 | struct perf_cgroup_info *info; /* timing info, one per cpu */ | ||
734 | }; | ||
735 | #endif | ||
736 | |||
719 | /** | 737 | /** |
720 | * struct perf_event - performance event kernel representation: | 738 | * struct perf_event - performance event kernel representation: |
721 | */ | 739 | */ |
@@ -832,6 +850,11 @@ struct perf_event { | |||
832 | struct event_filter *filter; | 850 | struct event_filter *filter; |
833 | #endif | 851 | #endif |
834 | 852 | ||
853 | #ifdef CONFIG_CGROUP_PERF | ||
854 | struct perf_cgroup *cgrp; /* cgroup the event is attached to */ | ||
855 | int cgrp_defer_enabled; | ||
856 | #endif | ||
857 | |||
835 | #endif /* CONFIG_PERF_EVENTS */ | 858 | #endif /* CONFIG_PERF_EVENTS */ |
836 | }; | 859 | }; |
837 | 860 | ||
@@ -886,6 +909,7 @@ struct perf_event_context { | |||
886 | u64 generation; | 909 | u64 generation; |
887 | int pin_count; | 910 | int pin_count; |
888 | struct rcu_head rcu_head; | 911 | struct rcu_head rcu_head; |
912 | int nr_cgroups; /* cgroup events present */ | ||
889 | }; | 913 | }; |
890 | 914 | ||
891 | /* | 915 | /* |
@@ -905,6 +929,9 @@ struct perf_cpu_context { | |||
905 | struct list_head rotation_list; | 929 | struct list_head rotation_list; |
906 | int jiffies_interval; | 930 | int jiffies_interval; |
907 | struct pmu *active_pmu; | 931 | struct pmu *active_pmu; |
932 | #ifdef CONFIG_CGROUP_PERF | ||
933 | struct perf_cgroup *cgrp; | ||
934 | #endif | ||
908 | }; | 935 | }; |
909 | 936 | ||
910 | struct perf_output_handle { | 937 | struct perf_output_handle { |
@@ -1040,11 +1067,11 @@ have_event: | |||
1040 | __perf_sw_event(event_id, nr, nmi, regs, addr); | 1067 | __perf_sw_event(event_id, nr, nmi, regs, addr); |
1041 | } | 1068 | } |
1042 | 1069 | ||
1043 | extern atomic_t perf_task_events; | 1070 | extern atomic_t perf_sched_events; |
1044 | 1071 | ||
1045 | static inline void perf_event_task_sched_in(struct task_struct *task) | 1072 | static inline void perf_event_task_sched_in(struct task_struct *task) |
1046 | { | 1073 | { |
1047 | COND_STMT(&perf_task_events, __perf_event_task_sched_in(task)); | 1074 | COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task)); |
1048 | } | 1075 | } |
1049 | 1076 | ||
1050 | static inline | 1077 | static inline |
@@ -1052,7 +1079,7 @@ void perf_event_task_sched_out(struct task_struct *task, struct task_struct *nex | |||
1052 | { | 1079 | { |
1053 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); | 1080 | perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); |
1054 | 1081 | ||
1055 | COND_STMT(&perf_task_events, __perf_event_task_sched_out(task, next)); | 1082 | COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next)); |
1056 | } | 1083 | } |
1057 | 1084 | ||
1058 | extern void perf_event_mmap(struct vm_area_struct *vma); | 1085 | extern void perf_event_mmap(struct vm_area_struct *vma); |
diff --git a/init/Kconfig b/init/Kconfig
index be788c0957d4..20d6bd919b8d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -683,6 +683,16 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED | |||
683 | select this option (if, for some reason, they need to disable it | 683 | select this option (if, for some reason, they need to disable it |
684 | then noswapaccount does the trick). | 684 | then noswapaccount does the trick). |
685 | 685 | ||
686 | config CGROUP_PERF | ||
687 | bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" | ||
688 | depends on PERF_EVENTS && CGROUPS | ||
689 | help | ||
690 | This option extends the per-cpu mode to restrict monitoring to | ||
691 | threads which belong to the cgroup specified and run on the | ||
692 | designated CPU. | ||
693 | |||
694 | Say N if unsure. | ||
695 | |||
686 | menuconfig CGROUP_SCHED | 696 | menuconfig CGROUP_SCHED |
687 | bool "Group CPU scheduler" | 697 | bool "Group CPU scheduler" |
688 | depends on EXPERIMENTAL | 698 | depends on EXPERIMENTAL |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f6495f33a355..95362d15128c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4818,6 +4818,29 @@ css_get_next(struct cgroup_subsys *ss, int id, | |||
4818 | return ret; | 4818 | return ret; |
4819 | } | 4819 | } |
4820 | 4820 | ||
4821 | /* | ||
4822 | * get the corresponding css from a file opened on a cgroupfs directory | ||
4823 | */ | ||
4824 | struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) | ||
4825 | { | ||
4826 | struct cgroup *cgrp; | ||
4827 | struct inode *inode; | ||
4828 | struct cgroup_subsys_state *css; | ||
4829 | |||
4830 | inode = f->f_dentry->d_inode; | ||
4831 | /* check in cgroup filesystem dir */ | ||
4832 | if (inode->i_op != &cgroup_dir_inode_operations) | ||
4833 | return ERR_PTR(-EBADF); | ||
4834 | |||
4835 | if (id < 0 || id >= CGROUP_SUBSYS_COUNT) | ||
4836 | return ERR_PTR(-EINVAL); | ||
4837 | |||
4838 | /* get cgroup */ | ||
4839 | cgrp = __d_cgrp(f->f_dentry); | ||
4840 | css = cgrp->subsys[id]; | ||
4841 | return css ? css : ERR_PTR(-ENOENT); | ||
4842 | } | ||
4843 | |||
4821 | #ifdef CONFIG_CGROUP_DEBUG | 4844 | #ifdef CONFIG_CGROUP_DEBUG |
4822 | static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, | 4845 | static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, |
4823 | struct cgroup *cont) | 4846 | struct cgroup *cont) |
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3d3f282fa50e..65dcdc76d709 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -111,13 +111,23 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) | |||
111 | return data.ret; | 111 | return data.ret; |
112 | } | 112 | } |
113 | 113 | ||
114 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ | ||
115 | PERF_FLAG_FD_OUTPUT |\ | ||
116 | PERF_FLAG_PID_CGROUP) | ||
117 | |||
114 | enum event_type_t { | 118 | enum event_type_t { |
115 | EVENT_FLEXIBLE = 0x1, | 119 | EVENT_FLEXIBLE = 0x1, |
116 | EVENT_PINNED = 0x2, | 120 | EVENT_PINNED = 0x2, |
117 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, | 121 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, |
118 | }; | 122 | }; |
119 | 123 | ||
120 | atomic_t perf_task_events __read_mostly; | 124 | /* |
125 | * perf_sched_events : >0 events exist | ||
126 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu | ||
127 | */ | ||
128 | atomic_t perf_sched_events __read_mostly; | ||
129 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | ||
130 | |||
121 | static atomic_t nr_mmap_events __read_mostly; | 131 | static atomic_t nr_mmap_events __read_mostly; |
122 | static atomic_t nr_comm_events __read_mostly; | 132 | static atomic_t nr_comm_events __read_mostly; |
123 | static atomic_t nr_task_events __read_mostly; | 133 | static atomic_t nr_task_events __read_mostly; |
@@ -148,7 +158,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | |||
148 | enum event_type_t event_type); | 158 | enum event_type_t event_type); |
149 | 159 | ||
150 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | 160 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, |
151 | enum event_type_t event_type); | 161 | enum event_type_t event_type, |
162 | struct task_struct *task); | ||
163 | |||
164 | static void update_context_time(struct perf_event_context *ctx); | ||
165 | static u64 perf_event_time(struct perf_event *event); | ||
152 | 166 | ||
153 | void __weak perf_event_print_debug(void) { } | 167 | void __weak perf_event_print_debug(void) { } |
154 | 168 | ||
@@ -162,6 +176,338 @@ static inline u64 perf_clock(void) | |||
162 | return local_clock(); | 176 | return local_clock(); |
163 | } | 177 | } |
164 | 178 | ||
179 | static inline struct perf_cpu_context * | ||
180 | __get_cpu_context(struct perf_event_context *ctx) | ||
181 | { | ||
182 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
183 | } | ||
184 | |||
185 | #ifdef CONFIG_CGROUP_PERF | ||
186 | |||
187 | static inline struct perf_cgroup * | ||
188 | perf_cgroup_from_task(struct task_struct *task) | ||
189 | { | ||
190 | return container_of(task_subsys_state(task, perf_subsys_id), | ||
191 | struct perf_cgroup, css); | ||
192 | } | ||
193 | |||
194 | static inline bool | ||
195 | perf_cgroup_match(struct perf_event *event) | ||
196 | { | ||
197 | struct perf_event_context *ctx = event->ctx; | ||
198 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
199 | |||
200 | return !event->cgrp || event->cgrp == cpuctx->cgrp; | ||
201 | } | ||
202 | |||
203 | static inline void perf_get_cgroup(struct perf_event *event) | ||
204 | { | ||
205 | css_get(&event->cgrp->css); | ||
206 | } | ||
207 | |||
208 | static inline void perf_put_cgroup(struct perf_event *event) | ||
209 | { | ||
210 | css_put(&event->cgrp->css); | ||
211 | } | ||
212 | |||
213 | static inline void perf_detach_cgroup(struct perf_event *event) | ||
214 | { | ||
215 | perf_put_cgroup(event); | ||
216 | event->cgrp = NULL; | ||
217 | } | ||
218 | |||
219 | static inline int is_cgroup_event(struct perf_event *event) | ||
220 | { | ||
221 | return event->cgrp != NULL; | ||
222 | } | ||
223 | |||
224 | static inline u64 perf_cgroup_event_time(struct perf_event *event) | ||
225 | { | ||
226 | struct perf_cgroup_info *t; | ||
227 | |||
228 | t = per_cpu_ptr(event->cgrp->info, event->cpu); | ||
229 | return t->time; | ||
230 | } | ||
231 | |||
232 | static inline void __update_cgrp_time(struct perf_cgroup *cgrp) | ||
233 | { | ||
234 | struct perf_cgroup_info *info; | ||
235 | u64 now; | ||
236 | |||
237 | now = perf_clock(); | ||
238 | |||
239 | info = this_cpu_ptr(cgrp->info); | ||
240 | |||
241 | info->time += now - info->timestamp; | ||
242 | info->timestamp = now; | ||
243 | } | ||
244 | |||
245 | static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) | ||
246 | { | ||
247 | struct perf_cgroup *cgrp_out = cpuctx->cgrp; | ||
248 | if (cgrp_out) | ||
249 | __update_cgrp_time(cgrp_out); | ||
250 | } | ||
251 | |||
252 | static inline void update_cgrp_time_from_event(struct perf_event *event) | ||
253 | { | ||
254 | struct perf_cgroup *cgrp = perf_cgroup_from_task(current); | ||
255 | /* | ||
256 | * do not update time when cgroup is not active | ||
257 | */ | ||
258 | if (!event->cgrp || cgrp != event->cgrp) | ||
259 | return; | ||
260 | |||
261 | __update_cgrp_time(event->cgrp); | ||
262 | } | ||
263 | |||
264 | static inline void | ||
265 | perf_cgroup_set_timestamp(struct task_struct *task, u64 now) | ||
266 | { | ||
267 | struct perf_cgroup *cgrp; | ||
268 | struct perf_cgroup_info *info; | ||
269 | |||
270 | if (!task) | ||
271 | return; | ||
272 | |||
273 | cgrp = perf_cgroup_from_task(task); | ||
274 | info = this_cpu_ptr(cgrp->info); | ||
275 | info->timestamp = now; | ||
276 | } | ||
277 | |||
278 | #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ | ||
279 | #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ | ||
280 | |||
281 | /* | ||
282 | * reschedule events based on the cgroup constraint of task. | ||
283 | * | ||
284 | * mode SWOUT : schedule out everything | ||
285 | * mode SWIN : schedule in based on cgroup for next | ||
286 | */ | ||
287 | void perf_cgroup_switch(struct task_struct *task, int mode) | ||
288 | { | ||
289 | struct perf_cpu_context *cpuctx; | ||
290 | struct pmu *pmu; | ||
291 | unsigned long flags; | ||
292 | |||
293 | /* | ||
294 | * disable interrupts to avoid getting nr_cgroups | ||
295 | * changes via __perf_event_disable(). Also | ||
296 | * avoids preemption. | ||
297 | */ | ||
298 | local_irq_save(flags); | ||
299 | |||
300 | /* | ||
301 | * we reschedule only in the presence of cgroup | ||
302 | * constrained events. | ||
303 | */ | ||
304 | rcu_read_lock(); | ||
305 | |||
306 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
307 | |||
308 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
309 | |||
310 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
311 | |||
312 | /* | ||
313 | * perf_cgroup_events says at least one | ||
314 | * context on this CPU has cgroup events. | ||
315 | * | ||
316 | * ctx->nr_cgroups reports the number of cgroup | ||
317 | * events for a context. | ||
318 | */ | ||
319 | if (cpuctx->ctx.nr_cgroups > 0) { | ||
320 | |||
321 | if (mode & PERF_CGROUP_SWOUT) { | ||
322 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); | ||
323 | /* | ||
324 | * must not be done before ctxswout due | ||
325 | * to event_filter_match() in event_sched_out() | ||
326 | */ | ||
327 | cpuctx->cgrp = NULL; | ||
328 | } | ||
329 | |||
330 | if (mode & PERF_CGROUP_SWIN) { | ||
331 | /* set cgrp before ctxsw in to | ||
332 | * allow event_filter_match() to not | ||
333 | * have to pass task around | ||
334 | */ | ||
335 | cpuctx->cgrp = perf_cgroup_from_task(task); | ||
336 | cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); | ||
337 | } | ||
338 | } | ||
339 | |||
340 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
341 | } | ||
342 | |||
343 | rcu_read_unlock(); | ||
344 | |||
345 | local_irq_restore(flags); | ||
346 | } | ||
347 | |||
348 | static inline void perf_cgroup_sched_out(struct task_struct *task) | ||
349 | { | ||
350 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT); | ||
351 | } | ||
352 | |||
353 | static inline void perf_cgroup_sched_in(struct task_struct *task) | ||
354 | { | ||
355 | perf_cgroup_switch(task, PERF_CGROUP_SWIN); | ||
356 | } | ||
357 | |||
358 | static inline int perf_cgroup_connect(int fd, struct perf_event *event, | ||
359 | struct perf_event_attr *attr, | ||
360 | struct perf_event *group_leader) | ||
361 | { | ||
362 | struct perf_cgroup *cgrp; | ||
363 | struct cgroup_subsys_state *css; | ||
364 | struct file *file; | ||
365 | int ret = 0, fput_needed; | ||
366 | |||
367 | file = fget_light(fd, &fput_needed); | ||
368 | if (!file) | ||
369 | return -EBADF; | ||
370 | |||
371 | css = cgroup_css_from_dir(file, perf_subsys_id); | ||
372 | if (IS_ERR(css)) | ||
373 | return PTR_ERR(css); | ||
374 | |||
375 | cgrp = container_of(css, struct perf_cgroup, css); | ||
376 | event->cgrp = cgrp; | ||
377 | |||
378 | /* | ||
379 | * all events in a group must monitor | ||
380 | * the same cgroup because a task belongs | ||
381 | * to only one perf cgroup at a time | ||
382 | */ | ||
383 | if (group_leader && group_leader->cgrp != cgrp) { | ||
384 | perf_detach_cgroup(event); | ||
385 | ret = -EINVAL; | ||
386 | } else { | ||
387 | /* must be done before we fput() the file */ | ||
388 | perf_get_cgroup(event); | ||
389 | } | ||
390 | fput_light(file, fput_needed); | ||
391 | return ret; | ||
392 | } | ||
393 | |||
394 | static inline void | ||
395 | perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) | ||
396 | { | ||
397 | struct perf_cgroup_info *t; | ||
398 | t = per_cpu_ptr(event->cgrp->info, event->cpu); | ||
399 | event->shadow_ctx_time = now - t->timestamp; | ||
400 | } | ||
401 | |||
402 | static inline void | ||
403 | perf_cgroup_defer_enabled(struct perf_event *event) | ||
404 | { | ||
405 | /* | ||
406 | * when the current task's perf cgroup does not match | ||
407 | * the event's, we need to remember to call the | ||
408 | * perf_cgroup_mark_enabled() function the first time a task with | ||
409 | * a matching perf cgroup is scheduled in. | ||
410 | */ | ||
411 | if (is_cgroup_event(event) && !perf_cgroup_match(event)) | ||
412 | event->cgrp_defer_enabled = 1; | ||
413 | } | ||
414 | |||
415 | static inline void | ||
416 | perf_cgroup_mark_enabled(struct perf_event *event, | ||
417 | struct perf_event_context *ctx) | ||
418 | { | ||
419 | struct perf_event *sub; | ||
420 | u64 tstamp = perf_event_time(event); | ||
421 | |||
422 | if (!event->cgrp_defer_enabled) | ||
423 | return; | ||
424 | |||
425 | event->cgrp_defer_enabled = 0; | ||
426 | |||
427 | event->tstamp_enabled = tstamp - event->total_time_enabled; | ||
428 | list_for_each_entry(sub, &event->sibling_list, group_entry) { | ||
429 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { | ||
430 | sub->tstamp_enabled = tstamp - sub->total_time_enabled; | ||
431 | sub->cgrp_defer_enabled = 0; | ||
432 | } | ||
433 | } | ||
434 | } | ||
435 | #else /* !CONFIG_CGROUP_PERF */ | ||
436 | |||
437 | static inline bool | ||
438 | perf_cgroup_match(struct perf_event *event) | ||
439 | { | ||
440 | return true; | ||
441 | } | ||
442 | |||
443 | static inline void perf_detach_cgroup(struct perf_event *event) | ||
444 | {} | ||
445 | |||
446 | static inline int is_cgroup_event(struct perf_event *event) | ||
447 | { | ||
448 | return 0; | ||
449 | } | ||
450 | |||
451 | static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) | ||
452 | { | ||
453 | return 0; | ||
454 | } | ||
455 | |||
456 | static inline void update_cgrp_time_from_event(struct perf_event *event) | ||
457 | { | ||
458 | } | ||
459 | |||
460 | static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) | ||
461 | { | ||
462 | } | ||
463 | |||
464 | static inline void perf_cgroup_sched_out(struct task_struct *task) | ||
465 | { | ||
466 | } | ||
467 | |||
468 | static inline void perf_cgroup_sched_in(struct task_struct *task) | ||
469 | { | ||
470 | } | ||
471 | |||
472 | static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, | ||
473 | struct perf_event_attr *attr, | ||
474 | struct perf_event *group_leader) | ||
475 | { | ||
476 | return -EINVAL; | ||
477 | } | ||
478 | |||
479 | static inline void | ||
480 | perf_cgroup_set_timestamp(struct task_struct *task, u64 now) | ||
481 | { | ||
482 | } | ||
483 | |||
484 | void | ||
485 | perf_cgroup_switch(struct task_struct *task, struct task_struct *next) | ||
486 | { | ||
487 | } | ||
488 | |||
489 | static inline void | ||
490 | perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) | ||
491 | { | ||
492 | } | ||
493 | |||
494 | static inline u64 perf_cgroup_event_time(struct perf_event *event) | ||
495 | { | ||
496 | return 0; | ||
497 | } | ||
498 | |||
499 | static inline void | ||
500 | perf_cgroup_defer_enabled(struct perf_event *event) | ||
501 | { | ||
502 | } | ||
503 | |||
504 | static inline void | ||
505 | perf_cgroup_mark_enabled(struct perf_event *event, | ||
506 | struct perf_event_context *ctx) | ||
507 | { | ||
508 | } | ||
509 | #endif | ||
510 | |||
165 | void perf_pmu_disable(struct pmu *pmu) | 511 | void perf_pmu_disable(struct pmu *pmu) |
166 | { | 512 | { |
167 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | 513 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
@@ -343,6 +689,10 @@ static void update_context_time(struct perf_event_context *ctx) | |||
343 | static u64 perf_event_time(struct perf_event *event) | 689 | static u64 perf_event_time(struct perf_event *event) |
344 | { | 690 | { |
345 | struct perf_event_context *ctx = event->ctx; | 691 | struct perf_event_context *ctx = event->ctx; |
692 | |||
693 | if (is_cgroup_event(event)) | ||
694 | return perf_cgroup_event_time(event); | ||
695 | |||
346 | return ctx ? ctx->time : 0; | 696 | return ctx ? ctx->time : 0; |
347 | } | 697 | } |
348 | 698 | ||
@@ -357,9 +707,20 @@ static void update_event_times(struct perf_event *event) | |||
357 | if (event->state < PERF_EVENT_STATE_INACTIVE || | 707 | if (event->state < PERF_EVENT_STATE_INACTIVE || |
358 | event->group_leader->state < PERF_EVENT_STATE_INACTIVE) | 708 | event->group_leader->state < PERF_EVENT_STATE_INACTIVE) |
359 | return; | 709 | return; |
360 | 710 | /* | |
361 | if (ctx->is_active) | 711 | * in cgroup mode, time_enabled represents |
712 | * the time the event was enabled AND active | ||
713 | * tasks were in the monitored cgroup. This is | ||
714 | * independent of the activity of the context as | ||
715 | * there may be a mix of cgroup and non-cgroup events. | ||
716 | * | ||
717 | * That is why we treat cgroup events differently | ||
718 | * here. | ||
719 | */ | ||
720 | if (is_cgroup_event(event)) | ||
362 | run_end = perf_event_time(event); | 721 | run_end = perf_event_time(event); |
722 | else if (ctx->is_active) | ||
723 | run_end = ctx->time; | ||
363 | else | 724 | else |
364 | run_end = event->tstamp_stopped; | 725 | run_end = event->tstamp_stopped; |
365 | 726 | ||
@@ -371,6 +732,7 @@ static void update_event_times(struct perf_event *event) | |||
371 | run_end = perf_event_time(event); | 732 | run_end = perf_event_time(event); |
372 | 733 | ||
373 | event->total_time_running = run_end - event->tstamp_running; | 734 | event->total_time_running = run_end - event->tstamp_running; |
735 | |||
374 | } | 736 | } |
375 | 737 | ||
376 | /* | 738 | /* |
@@ -419,6 +781,17 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
419 | list_add_tail(&event->group_entry, list); | 781 | list_add_tail(&event->group_entry, list); |
420 | } | 782 | } |
421 | 783 | ||
784 | if (is_cgroup_event(event)) { | ||
785 | ctx->nr_cgroups++; | ||
786 | /* | ||
787 | * one more event: | ||
788 | * - that has cgroup constraint on event->cpu | ||
789 | * - that may need work on context switch | ||
790 | */ | ||
791 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | ||
792 | jump_label_inc(&perf_sched_events); | ||
793 | } | ||
794 | |||
422 | list_add_rcu(&event->event_entry, &ctx->event_list); | 795 | list_add_rcu(&event->event_entry, &ctx->event_list); |
423 | if (!ctx->nr_events) | 796 | if (!ctx->nr_events) |
424 | perf_pmu_rotate_start(ctx->pmu); | 797 | perf_pmu_rotate_start(ctx->pmu); |
@@ -545,6 +918,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
545 | 918 | ||
546 | event->attach_state &= ~PERF_ATTACH_CONTEXT; | 919 | event->attach_state &= ~PERF_ATTACH_CONTEXT; |
547 | 920 | ||
921 | if (is_cgroup_event(event)) { | ||
922 | ctx->nr_cgroups--; | ||
923 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | ||
924 | jump_label_dec(&perf_sched_events); | ||
925 | } | ||
926 | |||
548 | ctx->nr_events--; | 927 | ctx->nr_events--; |
549 | if (event->attr.inherit_stat) | 928 | if (event->attr.inherit_stat) |
550 | ctx->nr_stat--; | 929 | ctx->nr_stat--; |
@@ -616,7 +995,8 @@ out: | |||
616 | static inline int | 995 | static inline int |
617 | event_filter_match(struct perf_event *event) | 996 | event_filter_match(struct perf_event *event) |
618 | { | 997 | { |
619 | return event->cpu == -1 || event->cpu == smp_processor_id(); | 998 | return (event->cpu == -1 || event->cpu == smp_processor_id()) |
999 | && perf_cgroup_match(event); | ||
620 | } | 1000 | } |
621 | 1001 | ||
622 | static void | 1002 | static void |
@@ -634,7 +1014,7 @@ event_sched_out(struct perf_event *event, | |||
634 | */ | 1014 | */ |
635 | if (event->state == PERF_EVENT_STATE_INACTIVE | 1015 | if (event->state == PERF_EVENT_STATE_INACTIVE |
636 | && !event_filter_match(event)) { | 1016 | && !event_filter_match(event)) { |
637 | delta = ctx->time - event->tstamp_stopped; | 1017 | delta = tstamp - event->tstamp_stopped; |
638 | event->tstamp_running += delta; | 1018 | event->tstamp_running += delta; |
639 | event->tstamp_stopped = tstamp; | 1019 | event->tstamp_stopped = tstamp; |
640 | } | 1020 | } |
@@ -678,12 +1058,6 @@ group_sched_out(struct perf_event *group_event, | |||
678 | cpuctx->exclusive = 0; | 1058 | cpuctx->exclusive = 0; |
679 | } | 1059 | } |
680 | 1060 | ||
681 | static inline struct perf_cpu_context * | ||
682 | __get_cpu_context(struct perf_event_context *ctx) | ||
683 | { | ||
684 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
685 | } | ||
686 | |||
687 | /* | 1061 | /* |
688 | * Cross CPU call to remove a performance event | 1062 | * Cross CPU call to remove a performance event |
689 | * | 1063 | * |
@@ -783,6 +1157,7 @@ static int __perf_event_disable(void *info) | |||
783 | */ | 1157 | */ |
784 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { | 1158 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { |
785 | update_context_time(ctx); | 1159 | update_context_time(ctx); |
1160 | update_cgrp_time_from_event(event); | ||
786 | update_group_times(event); | 1161 | update_group_times(event); |
787 | if (event == event->group_leader) | 1162 | if (event == event->group_leader) |
788 | group_sched_out(event, cpuctx, ctx); | 1163 | group_sched_out(event, cpuctx, ctx); |
@@ -851,6 +1226,41 @@ retry: | |||
851 | raw_spin_unlock_irq(&ctx->lock); | 1226 | raw_spin_unlock_irq(&ctx->lock); |
852 | } | 1227 | } |
853 | 1228 | ||
1229 | static void perf_set_shadow_time(struct perf_event *event, | ||
1230 | struct perf_event_context *ctx, | ||
1231 | u64 tstamp) | ||
1232 | { | ||
1233 | /* | ||
1234 | * use the correct time source for the time snapshot | ||
1235 | * | ||
1236 | * We could get by without this by leveraging the | ||
1237 | * fact that to get to this function, the caller | ||
1238 | * has most likely already called update_context_time() | ||
1239 | * and update_cgrp_time_xx() and thus both timestamps | ||
1240 | * are identical (or very close). Given that tstamp is | ||
1241 | * already adjusted for cgroup, we could say that: | ||
1242 | * tstamp - ctx->timestamp | ||
1243 | * is equivalent to | ||
1244 | * tstamp - cgrp->timestamp. | ||
1245 | * | ||
1246 | * Then, in perf_output_read(), the calculation would | ||
1247 | * work with no changes because: | ||
1248 | * - event is guaranteed scheduled in | ||
1249 | * - no scheduled out in between | ||
1250 | * - thus the timestamp would be the same | ||
1251 | * | ||
1252 | * But this is a bit hairy. | ||
1253 | * | ||
1254 | * So instead, we have an explicit cgroup call to remain | ||
1255 | * within the time source all along. We believe it | ||
1256 | * is cleaner and simpler to understand. | ||
1257 | */ | ||
1258 | if (is_cgroup_event(event)) | ||
1259 | perf_cgroup_set_shadow_time(event, tstamp); | ||
1260 | else | ||
1261 | event->shadow_ctx_time = tstamp - ctx->timestamp; | ||
1262 | } | ||
1263 | |||
854 | #define MAX_INTERRUPTS (~0ULL) | 1264 | #define MAX_INTERRUPTS (~0ULL) |
855 | 1265 | ||
856 | static void perf_log_throttle(struct perf_event *event, int enable); | 1266 | static void perf_log_throttle(struct perf_event *event, int enable); |
@@ -891,7 +1301,7 @@ event_sched_in(struct perf_event *event, | |||
891 | 1301 | ||
892 | event->tstamp_running += tstamp - event->tstamp_stopped; | 1302 | event->tstamp_running += tstamp - event->tstamp_stopped; |
893 | 1303 | ||
894 | event->shadow_ctx_time = tstamp - ctx->timestamp; | 1304 | perf_set_shadow_time(event, ctx, tstamp); |
895 | 1305 | ||
896 | if (!is_software_event(event)) | 1306 | if (!is_software_event(event)) |
897 | cpuctx->active_oncpu++; | 1307 | cpuctx->active_oncpu++; |
@@ -1012,7 +1422,8 @@ static void add_event_to_ctx(struct perf_event *event, | |||
1012 | event->tstamp_stopped = tstamp; | 1422 | event->tstamp_stopped = tstamp; |
1013 | } | 1423 | } |
1014 | 1424 | ||
1015 | static void perf_event_context_sched_in(struct perf_event_context *ctx); | 1425 | static void perf_event_context_sched_in(struct perf_event_context *ctx, |
1426 | struct task_struct *tsk); | ||
1016 | 1427 | ||
1017 | /* | 1428 | /* |
1018 | * Cross CPU call to install and enable a performance event | 1429 | * Cross CPU call to install and enable a performance event |
@@ -1033,11 +1444,17 @@ static int __perf_install_in_context(void *info) | |||
1033 | * which do context switches with IRQs enabled. | 1444 | * which do context switches with IRQs enabled. |
1034 | */ | 1445 | */ |
1035 | if (ctx->task && !cpuctx->task_ctx) | 1446 | if (ctx->task && !cpuctx->task_ctx) |
1036 | perf_event_context_sched_in(ctx); | 1447 | perf_event_context_sched_in(ctx, ctx->task); |
1037 | 1448 | ||
1038 | raw_spin_lock(&ctx->lock); | 1449 | raw_spin_lock(&ctx->lock); |
1039 | ctx->is_active = 1; | 1450 | ctx->is_active = 1; |
1040 | update_context_time(ctx); | 1451 | update_context_time(ctx); |
1452 | /* | ||
1453 | * update cgrp time only if current cgrp | ||
1454 | * matches event->cgrp. Must be done before | ||
1455 | * calling add_event_to_ctx() | ||
1456 | */ | ||
1457 | update_cgrp_time_from_event(event); | ||
1041 | 1458 | ||
1042 | add_event_to_ctx(event, ctx); | 1459 | add_event_to_ctx(event, ctx); |
1043 | 1460 | ||
@@ -1175,10 +1592,19 @@ static int __perf_event_enable(void *info) | |||
1175 | 1592 | ||
1176 | if (event->state >= PERF_EVENT_STATE_INACTIVE) | 1593 | if (event->state >= PERF_EVENT_STATE_INACTIVE) |
1177 | goto unlock; | 1594 | goto unlock; |
1595 | |||
1596 | /* | ||
1597 | * set current task's cgroup time reference point | ||
1598 | */ | ||
1599 | perf_cgroup_set_timestamp(current, perf_clock()); | ||
1600 | |||
1178 | __perf_event_mark_enabled(event, ctx); | 1601 | __perf_event_mark_enabled(event, ctx); |
1179 | 1602 | ||
1180 | if (!event_filter_match(event)) | 1603 | if (!event_filter_match(event)) { |
1604 | if (is_cgroup_event(event)) | ||
1605 | perf_cgroup_defer_enabled(event); | ||
1181 | goto unlock; | 1606 | goto unlock; |
1607 | } | ||
1182 | 1608 | ||
1183 | /* | 1609 | /* |
1184 | * If the event is in a group and isn't the group leader, | 1610 | * If the event is in a group and isn't the group leader, |
@@ -1307,6 +1733,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
1307 | if (likely(!ctx->nr_events)) | 1733 | if (likely(!ctx->nr_events)) |
1308 | goto out; | 1734 | goto out; |
1309 | update_context_time(ctx); | 1735 | update_context_time(ctx); |
1736 | update_cgrp_time_from_cpuctx(cpuctx); | ||
1310 | 1737 | ||
1311 | if (!ctx->nr_active) | 1738 | if (!ctx->nr_active) |
1312 | goto out; | 1739 | goto out; |
@@ -1496,6 +1923,14 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
1496 | 1923 | ||
1497 | for_each_task_context_nr(ctxn) | 1924 | for_each_task_context_nr(ctxn) |
1498 | perf_event_context_sched_out(task, ctxn, next); | 1925 | perf_event_context_sched_out(task, ctxn, next); |
1926 | |||
1927 | /* | ||
1928 | * if cgroup events exist on this CPU, then we need | ||
1929 | * to check if we have to switch out PMU state. | ||
1930 | * cgroup events are system-wide mode only | ||
1931 | */ | ||
1932 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | ||
1933 | perf_cgroup_sched_out(task); | ||
1499 | } | 1934 | } |
1500 | 1935 | ||
1501 | static void task_ctx_sched_out(struct perf_event_context *ctx, | 1936 | static void task_ctx_sched_out(struct perf_event_context *ctx, |
@@ -1534,6 +1969,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, | |||
1534 | if (!event_filter_match(event)) | 1969 | if (!event_filter_match(event)) |
1535 | continue; | 1970 | continue; |
1536 | 1971 | ||
1972 | /* may need to reset tstamp_enabled */ | ||
1973 | if (is_cgroup_event(event)) | ||
1974 | perf_cgroup_mark_enabled(event, ctx); | ||
1975 | |||
1537 | if (group_can_go_on(event, cpuctx, 1)) | 1976 | if (group_can_go_on(event, cpuctx, 1)) |
1538 | group_sched_in(event, cpuctx, ctx); | 1977 | group_sched_in(event, cpuctx, ctx); |
1539 | 1978 | ||
@@ -1566,6 +2005,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1566 | if (!event_filter_match(event)) | 2005 | if (!event_filter_match(event)) |
1567 | continue; | 2006 | continue; |
1568 | 2007 | ||
2008 | /* may need to reset tstamp_enabled */ | ||
2009 | if (is_cgroup_event(event)) | ||
2010 | perf_cgroup_mark_enabled(event, ctx); | ||
2011 | |||
1569 | if (group_can_go_on(event, cpuctx, can_add_hw)) { | 2012 | if (group_can_go_on(event, cpuctx, can_add_hw)) { |
1570 | if (group_sched_in(event, cpuctx, ctx)) | 2013 | if (group_sched_in(event, cpuctx, ctx)) |
1571 | can_add_hw = 0; | 2014 | can_add_hw = 0; |
@@ -1576,15 +2019,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1576 | static void | 2019 | static void |
1577 | ctx_sched_in(struct perf_event_context *ctx, | 2020 | ctx_sched_in(struct perf_event_context *ctx, |
1578 | struct perf_cpu_context *cpuctx, | 2021 | struct perf_cpu_context *cpuctx, |
1579 | enum event_type_t event_type) | 2022 | enum event_type_t event_type, |
2023 | struct task_struct *task) | ||
1580 | { | 2024 | { |
2025 | u64 now; | ||
2026 | |||
1581 | raw_spin_lock(&ctx->lock); | 2027 | raw_spin_lock(&ctx->lock); |
1582 | ctx->is_active = 1; | 2028 | ctx->is_active = 1; |
1583 | if (likely(!ctx->nr_events)) | 2029 | if (likely(!ctx->nr_events)) |
1584 | goto out; | 2030 | goto out; |
1585 | 2031 | ||
1586 | ctx->timestamp = perf_clock(); | 2032 | now = perf_clock(); |
1587 | 2033 | ctx->timestamp = now; | |
2034 | perf_cgroup_set_timestamp(task, now); | ||
1588 | /* | 2035 | /* |
1589 | * First go through the list and put on any pinned groups | 2036 | * First go through the list and put on any pinned groups |
1590 | * in order to give them the best chance of going on. | 2037 | * in order to give them the best chance of going on. |
@@ -1601,11 +2048,12 @@ out: | |||
1601 | } | 2048 | } |
1602 | 2049 | ||
1603 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | 2050 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, |
1604 | enum event_type_t event_type) | 2051 | enum event_type_t event_type, |
2052 | struct task_struct *task) | ||
1605 | { | 2053 | { |
1606 | struct perf_event_context *ctx = &cpuctx->ctx; | 2054 | struct perf_event_context *ctx = &cpuctx->ctx; |
1607 | 2055 | ||
1608 | ctx_sched_in(ctx, cpuctx, event_type); | 2056 | ctx_sched_in(ctx, cpuctx, event_type, task); |
1609 | } | 2057 | } |
1610 | 2058 | ||
1611 | static void task_ctx_sched_in(struct perf_event_context *ctx, | 2059 | static void task_ctx_sched_in(struct perf_event_context *ctx, |
@@ -1617,11 +2065,12 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, | |||
1617 | if (cpuctx->task_ctx == ctx) | 2065 | if (cpuctx->task_ctx == ctx) |
1618 | return; | 2066 | return; |
1619 | 2067 | ||
1620 | ctx_sched_in(ctx, cpuctx, event_type); | 2068 | ctx_sched_in(ctx, cpuctx, event_type, NULL); |
1621 | cpuctx->task_ctx = ctx; | 2069 | cpuctx->task_ctx = ctx; |
1622 | } | 2070 | } |
1623 | 2071 | ||
1624 | static void perf_event_context_sched_in(struct perf_event_context *ctx) | 2072 | static void perf_event_context_sched_in(struct perf_event_context *ctx, |
2073 | struct task_struct *task) | ||
1625 | { | 2074 | { |
1626 | struct perf_cpu_context *cpuctx; | 2075 | struct perf_cpu_context *cpuctx; |
1627 | 2076 | ||
@@ -1637,9 +2086,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx) | |||
1637 | */ | 2086 | */ |
1638 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2087 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
1639 | 2088 | ||
1640 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED); | 2089 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); |
1641 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 2090 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); |
1642 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); | 2091 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); |
1643 | 2092 | ||
1644 | cpuctx->task_ctx = ctx; | 2093 | cpuctx->task_ctx = ctx; |
1645 | 2094 | ||
@@ -1672,8 +2121,15 @@ void __perf_event_task_sched_in(struct task_struct *task) | |||
1672 | if (likely(!ctx)) | 2121 | if (likely(!ctx)) |
1673 | continue; | 2122 | continue; |
1674 | 2123 | ||
1675 | perf_event_context_sched_in(ctx); | 2124 | perf_event_context_sched_in(ctx, task); |
1676 | } | 2125 | } |
2126 | /* | ||
2127 | * if cgroup events exist on this CPU, then we need | ||
2128 | * to check if we have to switch in PMU state. | ||
2129 | * cgroup events are system-wide mode only | ||
2130 | */ | ||
2131 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | ||
2132 | perf_cgroup_sched_in(task); | ||
1677 | } | 2133 | } |
1678 | 2134 | ||
1679 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 2135 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) |
@@ -1873,7 +2329,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
1873 | if (ctx) | 2329 | if (ctx) |
1874 | rotate_ctx(ctx); | 2330 | rotate_ctx(ctx); |
1875 | 2331 | ||
1876 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 2332 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); |
1877 | if (ctx) | 2333 | if (ctx) |
1878 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); | 2334 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); |
1879 | 2335 | ||
@@ -1952,7 +2408,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
1952 | 2408 | ||
1953 | raw_spin_unlock(&ctx->lock); | 2409 | raw_spin_unlock(&ctx->lock); |
1954 | 2410 | ||
1955 | perf_event_context_sched_in(ctx); | 2411 | perf_event_context_sched_in(ctx, ctx->task); |
1956 | out: | 2412 | out: |
1957 | local_irq_restore(flags); | 2413 | local_irq_restore(flags); |
1958 | } | 2414 | } |
@@ -1977,8 +2433,10 @@ static void __perf_event_read(void *info) | |||
1977 | return; | 2433 | return; |
1978 | 2434 | ||
1979 | raw_spin_lock(&ctx->lock); | 2435 | raw_spin_lock(&ctx->lock); |
1980 | if (ctx->is_active) | 2436 | if (ctx->is_active) { |
1981 | update_context_time(ctx); | 2437 | update_context_time(ctx); |
2438 | update_cgrp_time_from_event(event); | ||
2439 | } | ||
1982 | update_event_times(event); | 2440 | update_event_times(event); |
1983 | if (event->state == PERF_EVENT_STATE_ACTIVE) | 2441 | if (event->state == PERF_EVENT_STATE_ACTIVE) |
1984 | event->pmu->read(event); | 2442 | event->pmu->read(event); |
@@ -2009,8 +2467,10 @@ static u64 perf_event_read(struct perf_event *event) | |||
2009 | * (e.g., thread is blocked), in that case | 2467 | * (e.g., thread is blocked), in that case |
2010 | * we cannot update context time | 2468 | * we cannot update context time |
2011 | */ | 2469 | */ |
2012 | if (ctx->is_active) | 2470 | if (ctx->is_active) { |
2013 | update_context_time(ctx); | 2471 | update_context_time(ctx); |
2472 | update_cgrp_time_from_event(event); | ||
2473 | } | ||
2014 | update_event_times(event); | 2474 | update_event_times(event); |
2015 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2475 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
2016 | } | 2476 | } |
@@ -2395,7 +2855,7 @@ static void free_event(struct perf_event *event) | |||
2395 | 2855 | ||
2396 | if (!event->parent) { | 2856 | if (!event->parent) { |
2397 | if (event->attach_state & PERF_ATTACH_TASK) | 2857 | if (event->attach_state & PERF_ATTACH_TASK) |
2398 | jump_label_dec(&perf_task_events); | 2858 | jump_label_dec(&perf_sched_events); |
2399 | if (event->attr.mmap || event->attr.mmap_data) | 2859 | if (event->attr.mmap || event->attr.mmap_data) |
2400 | atomic_dec(&nr_mmap_events); | 2860 | atomic_dec(&nr_mmap_events); |
2401 | if (event->attr.comm) | 2861 | if (event->attr.comm) |
@@ -2411,6 +2871,9 @@ static void free_event(struct perf_event *event) | |||
2411 | event->buffer = NULL; | 2871 | event->buffer = NULL; |
2412 | } | 2872 | } |
2413 | 2873 | ||
2874 | if (is_cgroup_event(event)) | ||
2875 | perf_detach_cgroup(event); | ||
2876 | |||
2414 | if (event->destroy) | 2877 | if (event->destroy) |
2415 | event->destroy(event); | 2878 | event->destroy(event); |
2416 | 2879 | ||
@@ -5300,6 +5763,7 @@ static void task_clock_event_read(struct perf_event *event) | |||
5300 | 5763 | ||
5301 | if (!in_nmi()) { | 5764 | if (!in_nmi()) { |
5302 | update_context_time(event->ctx); | 5765 | update_context_time(event->ctx); |
5766 | update_cgrp_time_from_event(event); | ||
5303 | time = event->ctx->time; | 5767 | time = event->ctx->time; |
5304 | } else { | 5768 | } else { |
5305 | u64 now = perf_clock(); | 5769 | u64 now = perf_clock(); |
@@ -5725,7 +6189,7 @@ done: | |||
5725 | 6189 | ||
5726 | if (!event->parent) { | 6190 | if (!event->parent) { |
5727 | if (event->attach_state & PERF_ATTACH_TASK) | 6191 | if (event->attach_state & PERF_ATTACH_TASK) |
5728 | jump_label_inc(&perf_task_events); | 6192 | jump_label_inc(&perf_sched_events); |
5729 | if (event->attr.mmap || event->attr.mmap_data) | 6193 | if (event->attr.mmap || event->attr.mmap_data) |
5730 | atomic_inc(&nr_mmap_events); | 6194 | atomic_inc(&nr_mmap_events); |
5731 | if (event->attr.comm) | 6195 | if (event->attr.comm) |
@@ -5900,7 +6364,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5900 | int err; | 6364 | int err; |
5901 | 6365 | ||
5902 | /* for future expandability... */ | 6366 | /* for future expandability... */ |
5903 | if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) | 6367 | if (flags & ~PERF_FLAG_ALL) |
5904 | return -EINVAL; | 6368 | return -EINVAL; |
5905 | 6369 | ||
5906 | err = perf_copy_attr(attr_uptr, &attr); | 6370 | err = perf_copy_attr(attr_uptr, &attr); |
@@ -5917,6 +6381,15 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5917 | return -EINVAL; | 6381 | return -EINVAL; |
5918 | } | 6382 | } |
5919 | 6383 | ||
6384 | /* | ||
6385 | * In cgroup mode, the pid argument is used to pass the fd | ||
6386 | * opened to the cgroup directory in cgroupfs. The cpu argument | ||
6387 | * designates the cpu on which to monitor threads from that | ||
6388 | * cgroup. | ||
6389 | */ | ||
6390 | if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) | ||
6391 | return -EINVAL; | ||
6392 | |||
5920 | event_fd = get_unused_fd_flags(O_RDWR); | 6393 | event_fd = get_unused_fd_flags(O_RDWR); |
5921 | if (event_fd < 0) | 6394 | if (event_fd < 0) |
5922 | return event_fd; | 6395 | return event_fd; |
@@ -5934,7 +6407,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5934 | group_leader = NULL; | 6407 | group_leader = NULL; |
5935 | } | 6408 | } |
5936 | 6409 | ||
5937 | if (pid != -1) { | 6410 | if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { |
5938 | task = find_lively_task_by_vpid(pid); | 6411 | task = find_lively_task_by_vpid(pid); |
5939 | if (IS_ERR(task)) { | 6412 | if (IS_ERR(task)) { |
5940 | err = PTR_ERR(task); | 6413 | err = PTR_ERR(task); |
@@ -5948,6 +6421,12 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5948 | goto err_task; | 6421 | goto err_task; |
5949 | } | 6422 | } |
5950 | 6423 | ||
6424 | if (flags & PERF_FLAG_PID_CGROUP) { | ||
6425 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | ||
6426 | if (err) | ||
6427 | goto err_alloc; | ||
6428 | } | ||
6429 | |||
5951 | /* | 6430 | /* |
5952 | * Special case software events and allow them to be part of | 6431 | * Special case software events and allow them to be part of |
5953 | * any hardware group. | 6432 | * any hardware group. |
@@ -6808,3 +7287,92 @@ unlock: | |||
6808 | return ret; | 7287 | return ret; |
6809 | } | 7288 | } |
6810 | device_initcall(perf_event_sysfs_init); | 7289 | device_initcall(perf_event_sysfs_init); |
7290 | |||
7291 | #ifdef CONFIG_CGROUP_PERF | ||
7292 | static struct cgroup_subsys_state *perf_cgroup_create( | ||
7293 | struct cgroup_subsys *ss, struct cgroup *cont) | ||
7294 | { | ||
7295 | struct perf_cgroup *jc; | ||
7296 | struct perf_cgroup_info *t; | ||
7297 | int c; | ||
7298 | |||
7299 | jc = kmalloc(sizeof(*jc), GFP_KERNEL); | ||
7300 | if (!jc) | ||
7301 | return ERR_PTR(-ENOMEM); | ||
7302 | |||
7303 | memset(jc, 0, sizeof(*jc)); | ||
7304 | |||
7305 | jc->info = alloc_percpu(struct perf_cgroup_info); | ||
7306 | if (!jc->info) { | ||
7307 | kfree(jc); | ||
7308 | return ERR_PTR(-ENOMEM); | ||
7309 | } | ||
7310 | |||
7311 | for_each_possible_cpu(c) { | ||
7312 | t = per_cpu_ptr(jc->info, c); | ||
7313 | t->time = 0; | ||
7314 | t->timestamp = 0; | ||
7315 | } | ||
7316 | return &jc->css; | ||
7317 | } | ||
7318 | |||
7319 | static void perf_cgroup_destroy(struct cgroup_subsys *ss, | ||
7320 | struct cgroup *cont) | ||
7321 | { | ||
7322 | struct perf_cgroup *jc; | ||
7323 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | ||
7324 | struct perf_cgroup, css); | ||
7325 | free_percpu(jc->info); | ||
7326 | kfree(jc); | ||
7327 | } | ||
7328 | |||
7329 | static int __perf_cgroup_move(void *info) | ||
7330 | { | ||
7331 | struct task_struct *task = info; | ||
7332 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); | ||
7333 | return 0; | ||
7334 | } | ||
7335 | |||
7336 | static void perf_cgroup_move(struct task_struct *task) | ||
7337 | { | ||
7338 | task_function_call(task, __perf_cgroup_move, task); | ||
7339 | } | ||
7340 | |||
7341 | static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
7342 | struct cgroup *old_cgrp, struct task_struct *task, | ||
7343 | bool threadgroup) | ||
7344 | { | ||
7345 | perf_cgroup_move(task); | ||
7346 | if (threadgroup) { | ||
7347 | struct task_struct *c; | ||
7348 | rcu_read_lock(); | ||
7349 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | ||
7350 | perf_cgroup_move(c); | ||
7351 | } | ||
7352 | rcu_read_unlock(); | ||
7353 | } | ||
7354 | } | ||
7355 | |||
7356 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
7357 | struct cgroup *old_cgrp, struct task_struct *task) | ||
7358 | { | ||
7359 | /* | ||
7360 | * cgroup_exit() is called in the copy_process() failure path. | ||
7361 | * Ignore this case since the task hasn't run yet; this avoids | ||
7362 | * trying to poke a half-freed task state from generic code. | ||
7363 | */ | ||
7364 | if (!(task->flags & PF_EXITING)) | ||
7365 | return; | ||
7366 | |||
7367 | perf_cgroup_move(task); | ||
7368 | } | ||
7369 | |||
7370 | struct cgroup_subsys perf_subsys = { | ||
7371 | .name = "perf_event", | ||
7372 | .subsys_id = perf_subsys_id, | ||
7373 | .create = perf_cgroup_create, | ||
7374 | .destroy = perf_cgroup_destroy, | ||
7375 | .exit = perf_cgroup_exit, | ||
7376 | .attach = perf_cgroup_attach, | ||
7377 | }; | ||
7378 | #endif /* CONFIG_CGROUP_PERF */ | ||
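For intuition only, a small self-contained userspace model (an illustrative sketch with invented names, not kernel code) of the per-cpu bookkeeping done by struct perf_cgroup_info introduced in the include/linux/perf_event.h hunk: perf_cgroup_set_timestamp() records when the monitored cgroup becomes active on a CPU, and __update_cgrp_time() folds the elapsed interval into the accumulated time, so a cgroup event's time_enabled only grows while a task of that cgroup runs on the monitored CPU.

#include <stdio.h>

/* simplified stand-in for struct perf_cgroup_info (one instance per CPU) */
struct cgrp_time_model {
	unsigned long long time;      /* accumulated active time */
	unsigned long long timestamp; /* start of the current active interval */
};

/* models perf_cgroup_set_timestamp(): cgroup becomes active on this CPU */
static void model_switch_in(struct cgrp_time_model *t, unsigned long long now)
{
	t->timestamp = now;
}

/* models __update_cgrp_time(): accumulate the interval since switch-in */
static void model_update(struct cgrp_time_model *t, unsigned long long now)
{
	t->time += now - t->timestamp;
	t->timestamp = now;
}

int main(void)
{
	struct cgrp_time_model t = { 0, 0 };

	model_switch_in(&t, 1000);	/* cgroup task scheduled in at t=1000 */
	model_update(&t, 1500);		/* scheduled out at t=1500 */
	/* nothing from the cgroup runs on this CPU between 1500 and 4000 */
	model_switch_in(&t, 4000);
	model_update(&t, 4200);

	printf("accumulated time = %llu\n", t.time);	/* 500 + 200 = 700 */
	return 0;
}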