Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--  kernel/perf_event.c  638
1 files changed, 603 insertions, 35 deletions
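
The patch below adds cgroup-aware monitoring (CONFIG_CGROUP_PERF) to the perf core: events can be constrained to a cgroup, per-cgroup time is accounted per CPU, and perf_event_open() gains a PERF_FLAG_PID_CGROUP mode in which the pid argument carries a file descriptor opened on a cgroup directory while the cpu argument must name a CPU (cgroup events only exist in per-cpu, system-wide mode). As an illustration of the user-facing side, the sketch below is not part of the patch; it assumes the perf_event cgroup subsystem is mounted at /sys/fs/cgroup/perf_event, that a cgroup named "test" already exists there, and that the caller is privileged enough for system-wide events. The fallback #define uses the flag value from the matching include/linux/perf_event.h change.

/*
 * Illustrative sketch (not from the patch): count CPU cycles for tasks
 * of one cgroup while they run on CPU 0, using PERF_FLAG_PID_CGROUP.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

#ifndef PERF_FLAG_PID_CGROUP
#define PERF_FLAG_PID_CGROUP	(1U << 2)	/* pid carries a cgroup fd */
#endif

static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
			       int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int cgrp_fd, ev_fd;

	/* in cgroup mode, "pid" is an fd to the cgroup directory */
	cgrp_fd = open("/sys/fs/cgroup/perf_event/test", O_RDONLY);
	if (cgrp_fd < 0)
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	/* cpu must be >= 0: the kernel rejects cpu == -1 for cgroup events */
	ev_fd = sys_perf_event_open(&attr, cgrp_fd, 0, -1, PERF_FLAG_PID_CGROUP);
	if (ev_fd < 0)
		return 1;

	sleep(1);	/* let the cgroup's tasks run for a while */
	read(ev_fd, &count, sizeof(count));
	printf("cycles on cpu0 for cgroup: %llu\n", (unsigned long long)count);

	close(ev_fd);
	close(cgrp_fd);
	return 0;
}

Covering the whole machine this way means opening one such event per CPU; the example pins to CPU 0 only because the syscall refuses cpu == -1 in cgroup mode.
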
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3d3f282fa50e..65dcdc76d709 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -111,13 +111,23 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
111 return data.ret; 111 return data.ret;
112} 112}
113 113
114#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
115 PERF_FLAG_FD_OUTPUT |\
116 PERF_FLAG_PID_CGROUP)
117
114enum event_type_t { 118enum event_type_t {
115 EVENT_FLEXIBLE = 0x1, 119 EVENT_FLEXIBLE = 0x1,
116 EVENT_PINNED = 0x2, 120 EVENT_PINNED = 0x2,
117 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 121 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
118}; 122};
119 123
120atomic_t perf_task_events __read_mostly; 124/*
125 * perf_sched_events : >0 events exist
126 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
127 */
128atomic_t perf_sched_events __read_mostly;
129static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
130
121static atomic_t nr_mmap_events __read_mostly; 131static atomic_t nr_mmap_events __read_mostly;
122static atomic_t nr_comm_events __read_mostly; 132static atomic_t nr_comm_events __read_mostly;
123static atomic_t nr_task_events __read_mostly; 133static atomic_t nr_task_events __read_mostly;
@@ -148,7 +158,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
148 enum event_type_t event_type); 158 enum event_type_t event_type);
149 159
150static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 160static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
151 enum event_type_t event_type); 161 enum event_type_t event_type,
162 struct task_struct *task);
163
164static void update_context_time(struct perf_event_context *ctx);
165static u64 perf_event_time(struct perf_event *event);
152 166
153void __weak perf_event_print_debug(void) { } 167void __weak perf_event_print_debug(void) { }
154 168
@@ -162,6 +176,338 @@ static inline u64 perf_clock(void)
162 return local_clock(); 176 return local_clock();
163} 177}
164 178
179static inline struct perf_cpu_context *
180__get_cpu_context(struct perf_event_context *ctx)
181{
182 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
183}
184
185#ifdef CONFIG_CGROUP_PERF
186
187static inline struct perf_cgroup *
188perf_cgroup_from_task(struct task_struct *task)
189{
190 return container_of(task_subsys_state(task, perf_subsys_id),
191 struct perf_cgroup, css);
192}
193
194static inline bool
195perf_cgroup_match(struct perf_event *event)
196{
197 struct perf_event_context *ctx = event->ctx;
198 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
199
200 return !event->cgrp || event->cgrp == cpuctx->cgrp;
201}
202
203static inline void perf_get_cgroup(struct perf_event *event)
204{
205 css_get(&event->cgrp->css);
206}
207
208static inline void perf_put_cgroup(struct perf_event *event)
209{
210 css_put(&event->cgrp->css);
211}
212
213static inline void perf_detach_cgroup(struct perf_event *event)
214{
215 perf_put_cgroup(event);
216 event->cgrp = NULL;
217}
218
219static inline int is_cgroup_event(struct perf_event *event)
220{
221 return event->cgrp != NULL;
222}
223
224static inline u64 perf_cgroup_event_time(struct perf_event *event)
225{
226 struct perf_cgroup_info *t;
227
228 t = per_cpu_ptr(event->cgrp->info, event->cpu);
229 return t->time;
230}
231
232static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
233{
234 struct perf_cgroup_info *info;
235 u64 now;
236
237 now = perf_clock();
238
239 info = this_cpu_ptr(cgrp->info);
240
241 info->time += now - info->timestamp;
242 info->timestamp = now;
243}
244
245static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
246{
247 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
248 if (cgrp_out)
249 __update_cgrp_time(cgrp_out);
250}
251
252static inline void update_cgrp_time_from_event(struct perf_event *event)
253{
254 struct perf_cgroup *cgrp = perf_cgroup_from_task(current);
255 /*
256 * do not update time when cgroup is not active
257 */
258 if (!event->cgrp || cgrp != event->cgrp)
259 return;
260
261 __update_cgrp_time(event->cgrp);
262}
263
264static inline void
265perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
266{
267 struct perf_cgroup *cgrp;
268 struct perf_cgroup_info *info;
269
270 if (!task)
271 return;
272
273 cgrp = perf_cgroup_from_task(task);
274 info = this_cpu_ptr(cgrp->info);
275 info->timestamp = now;
276}
277
278#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
279#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
280
281/*
282 * reschedule events based on the cgroup constraint of the task.
283 *
284 * mode SWOUT : schedule out everything
285 * mode SWIN : schedule in based on the cgroup of the next task
286 */
287void perf_cgroup_switch(struct task_struct *task, int mode)
288{
289 struct perf_cpu_context *cpuctx;
290 struct pmu *pmu;
291 unsigned long flags;
292
293 /*
294 * disable interrupts to avoid getting nr_cgroup
295 * changes via __perf_event_disable(). Also
296 * avoids preemption.
297 */
298 local_irq_save(flags);
299
300 /*
301 * we reschedule only in the presence of cgroup
302 * constrained events.
303 */
304 rcu_read_lock();
305
306 list_for_each_entry_rcu(pmu, &pmus, entry) {
307
308 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
309
310 perf_pmu_disable(cpuctx->ctx.pmu);
311
312 /*
313 * perf_cgroup_events says at least one
314 * context on this CPU has cgroup events.
315 *
316 * ctx->nr_cgroups reports the number of cgroup
317 * events for a context.
318 */
319 if (cpuctx->ctx.nr_cgroups > 0) {
320
321 if (mode & PERF_CGROUP_SWOUT) {
322 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
323 /*
324 * must not be done before ctxswout due
325 * to event_filter_match() in event_sched_out()
326 */
327 cpuctx->cgrp = NULL;
328 }
329
330 if (mode & PERF_CGROUP_SWIN) {
331 /* set cgrp before ctxsw in to
332 * allow event_filter_match() to not
333 * have to pass task around
334 */
335 cpuctx->cgrp = perf_cgroup_from_task(task);
336 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
337 }
338 }
339
340 perf_pmu_enable(cpuctx->ctx.pmu);
341 }
342
343 rcu_read_unlock();
344
345 local_irq_restore(flags);
346}
347
348static inline void perf_cgroup_sched_out(struct task_struct *task)
349{
350 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
351}
352
353static inline void perf_cgroup_sched_in(struct task_struct *task)
354{
355 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
356}
357
358static inline int perf_cgroup_connect(int fd, struct perf_event *event,
359 struct perf_event_attr *attr,
360 struct perf_event *group_leader)
361{
362 struct perf_cgroup *cgrp;
363 struct cgroup_subsys_state *css;
364 struct file *file;
365 int ret = 0, fput_needed;
366
367 file = fget_light(fd, &fput_needed);
368 if (!file)
369 return -EBADF;
370
371 css = cgroup_css_from_dir(file, perf_subsys_id);
372 if (IS_ERR(css))
373 return PTR_ERR(css);
374
375 cgrp = container_of(css, struct perf_cgroup, css);
376 event->cgrp = cgrp;
377
378 /*
379 * all events in a group must monitor
380 * the same cgroup because a task belongs
381 * to only one perf cgroup at a time
382 */
383 if (group_leader && group_leader->cgrp != cgrp) {
384 perf_detach_cgroup(event);
385 ret = -EINVAL;
386 } else {
387 /* must be done before we fput() the file */
388 perf_get_cgroup(event);
389 }
390 fput_light(file, fput_needed);
391 return ret;
392}
393
394static inline void
395perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
396{
397 struct perf_cgroup_info *t;
398 t = per_cpu_ptr(event->cgrp->info, event->cpu);
399 event->shadow_ctx_time = now - t->timestamp;
400}
401
402static inline void
403perf_cgroup_defer_enabled(struct perf_event *event)
404{
405 /*
406 * when the current task's perf cgroup does not match
407 * the event's, we need to remember to call the
408 * perf_cgroup_mark_enabled() function the first time a task with
409 * a matching perf cgroup is scheduled in.
410 */
411 if (is_cgroup_event(event) && !perf_cgroup_match(event))
412 event->cgrp_defer_enabled = 1;
413}
414
415static inline void
416perf_cgroup_mark_enabled(struct perf_event *event,
417 struct perf_event_context *ctx)
418{
419 struct perf_event *sub;
420 u64 tstamp = perf_event_time(event);
421
422 if (!event->cgrp_defer_enabled)
423 return;
424
425 event->cgrp_defer_enabled = 0;
426
427 event->tstamp_enabled = tstamp - event->total_time_enabled;
428 list_for_each_entry(sub, &event->sibling_list, group_entry) {
429 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
430 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
431 sub->cgrp_defer_enabled = 0;
432 }
433 }
434}
435#else /* !CONFIG_CGROUP_PERF */
436
437static inline bool
438perf_cgroup_match(struct perf_event *event)
439{
440 return true;
441}
442
443static inline void perf_detach_cgroup(struct perf_event *event)
444{}
445
446static inline int is_cgroup_event(struct perf_event *event)
447{
448 return 0;
449}
450
451static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
452{
453 return 0;
454}
455
456static inline void update_cgrp_time_from_event(struct perf_event *event)
457{
458}
459
460static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
461{
462}
463
464static inline void perf_cgroup_sched_out(struct task_struct *task)
465{
466}
467
468static inline void perf_cgroup_sched_in(struct task_struct *task)
469{
470}
471
472static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
473 struct perf_event_attr *attr,
474 struct perf_event *group_leader)
475{
476 return -EINVAL;
477}
478
479static inline void
480perf_cgroup_set_timestamp(struct task_struct *task, u64 now)
481{
482}
483
484void
485perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
486{
487}
488
489static inline void
490perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
491{
492}
493
494static inline u64 perf_cgroup_event_time(struct perf_event *event)
495{
496 return 0;
497}
498
499static inline void
500perf_cgroup_defer_enabled(struct perf_event *event)
501{
502}
503
504static inline void
505perf_cgroup_mark_enabled(struct perf_event *event,
506 struct perf_event_context *ctx)
507{
508}
509#endif
510
165void perf_pmu_disable(struct pmu *pmu) 511void perf_pmu_disable(struct pmu *pmu)
166{ 512{
167 int *count = this_cpu_ptr(pmu->pmu_disable_count); 513 int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -343,6 +689,10 @@ static void update_context_time(struct perf_event_context *ctx)
343static u64 perf_event_time(struct perf_event *event) 689static u64 perf_event_time(struct perf_event *event)
344{ 690{
345 struct perf_event_context *ctx = event->ctx; 691 struct perf_event_context *ctx = event->ctx;
692
693 if (is_cgroup_event(event))
694 return perf_cgroup_event_time(event);
695
346 return ctx ? ctx->time : 0; 696 return ctx ? ctx->time : 0;
347} 697}
348 698
@@ -357,9 +707,20 @@ static void update_event_times(struct perf_event *event)
357 if (event->state < PERF_EVENT_STATE_INACTIVE || 707 if (event->state < PERF_EVENT_STATE_INACTIVE ||
358 event->group_leader->state < PERF_EVENT_STATE_INACTIVE) 708 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
359 return; 709 return;
360 710 /*
361 if (ctx->is_active) 711 * in cgroup mode, time_enabled represents
712 * the time the event was enabled AND active
713 * tasks were in the monitored cgroup. This is
714 * independent of the activity of the context as
715 * there may be a mix of cgroup and non-cgroup events.
716 *
717 * That is why we treat cgroup events differently
718 * here.
719 */
720 if (is_cgroup_event(event))
362 run_end = perf_event_time(event); 721 run_end = perf_event_time(event);
722 else if (ctx->is_active)
723 run_end = ctx->time;
363 else 724 else
364 run_end = event->tstamp_stopped; 725 run_end = event->tstamp_stopped;
365 726
@@ -371,6 +732,7 @@ static void update_event_times(struct perf_event *event)
371 run_end = perf_event_time(event); 732 run_end = perf_event_time(event);
372 733
373 event->total_time_running = run_end - event->tstamp_running; 734 event->total_time_running = run_end - event->tstamp_running;
735
374} 736}
375 737
376/* 738/*
@@ -419,6 +781,17 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
419 list_add_tail(&event->group_entry, list); 781 list_add_tail(&event->group_entry, list);
420 } 782 }
421 783
784 if (is_cgroup_event(event)) {
785 ctx->nr_cgroups++;
786 /*
787 * one more event:
788 * - that has cgroup constraint on event->cpu
789 * - that may need work on context switch
790 */
791 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
792 jump_label_inc(&perf_sched_events);
793 }
794
422 list_add_rcu(&event->event_entry, &ctx->event_list); 795 list_add_rcu(&event->event_entry, &ctx->event_list);
423 if (!ctx->nr_events) 796 if (!ctx->nr_events)
424 perf_pmu_rotate_start(ctx->pmu); 797 perf_pmu_rotate_start(ctx->pmu);
@@ -545,6 +918,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
545 918
546 event->attach_state &= ~PERF_ATTACH_CONTEXT; 919 event->attach_state &= ~PERF_ATTACH_CONTEXT;
547 920
921 if (is_cgroup_event(event)) {
922 ctx->nr_cgroups--;
923 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
924 jump_label_dec(&perf_sched_events);
925 }
926
548 ctx->nr_events--; 927 ctx->nr_events--;
549 if (event->attr.inherit_stat) 928 if (event->attr.inherit_stat)
550 ctx->nr_stat--; 929 ctx->nr_stat--;
@@ -616,7 +995,8 @@ out:
616static inline int 995static inline int
617event_filter_match(struct perf_event *event) 996event_filter_match(struct perf_event *event)
618{ 997{
619 return event->cpu == -1 || event->cpu == smp_processor_id(); 998 return (event->cpu == -1 || event->cpu == smp_processor_id())
999 && perf_cgroup_match(event);
620} 1000}
621 1001
622static void 1002static void
@@ -634,7 +1014,7 @@ event_sched_out(struct perf_event *event,
634 */ 1014 */
635 if (event->state == PERF_EVENT_STATE_INACTIVE 1015 if (event->state == PERF_EVENT_STATE_INACTIVE
636 && !event_filter_match(event)) { 1016 && !event_filter_match(event)) {
637 delta = ctx->time - event->tstamp_stopped; 1017 delta = tstamp - event->tstamp_stopped;
638 event->tstamp_running += delta; 1018 event->tstamp_running += delta;
639 event->tstamp_stopped = tstamp; 1019 event->tstamp_stopped = tstamp;
640 } 1020 }
@@ -678,12 +1058,6 @@ group_sched_out(struct perf_event *group_event,
678 cpuctx->exclusive = 0; 1058 cpuctx->exclusive = 0;
679} 1059}
680 1060
681static inline struct perf_cpu_context *
682__get_cpu_context(struct perf_event_context *ctx)
683{
684 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
685}
686
687/* 1061/*
688 * Cross CPU call to remove a performance event 1062 * Cross CPU call to remove a performance event
689 * 1063 *
@@ -783,6 +1157,7 @@ static int __perf_event_disable(void *info)
783 */ 1157 */
784 if (event->state >= PERF_EVENT_STATE_INACTIVE) { 1158 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
785 update_context_time(ctx); 1159 update_context_time(ctx);
1160 update_cgrp_time_from_event(event);
786 update_group_times(event); 1161 update_group_times(event);
787 if (event == event->group_leader) 1162 if (event == event->group_leader)
788 group_sched_out(event, cpuctx, ctx); 1163 group_sched_out(event, cpuctx, ctx);
@@ -851,6 +1226,41 @@ retry:
851 raw_spin_unlock_irq(&ctx->lock); 1226 raw_spin_unlock_irq(&ctx->lock);
852} 1227}
853 1228
1229static void perf_set_shadow_time(struct perf_event *event,
1230 struct perf_event_context *ctx,
1231 u64 tstamp)
1232{
1233 /*
1234 * use the correct time source for the time snapshot
1235 *
1236 * We could get by without this by leveraging the
1237 * fact that to get to this function, the caller
1238 * has most likely already called update_context_time()
1239 * and update_cgrp_time_xx() and thus both timestamps
1240 * are identical (or very close). Given that tstamp is
1241 * already adjusted for cgroup, we could say that:
1242 * tstamp - ctx->timestamp
1243 * is equivalent to
1244 * tstamp - cgrp->timestamp.
1245 *
1246 * Then, in perf_output_read(), the calculation would
1247 * work with no changes because:
1248 * - event is guaranteed scheduled in
1249 * - it is not scheduled out in between
1250 * - thus the timestamp would be the same
1251 *
1252 * But this is a bit hairy.
1253 *
1254 * So instead, we have an explicit cgroup call to remain
1255 * within the same time source all along. We believe it
1256 * is cleaner and simpler to understand.
1257 */
1258 if (is_cgroup_event(event))
1259 perf_cgroup_set_shadow_time(event, tstamp);
1260 else
1261 event->shadow_ctx_time = tstamp - ctx->timestamp;
1262}
1263
854#define MAX_INTERRUPTS (~0ULL) 1264#define MAX_INTERRUPTS (~0ULL)
855 1265
856static void perf_log_throttle(struct perf_event *event, int enable); 1266static void perf_log_throttle(struct perf_event *event, int enable);
@@ -891,7 +1301,7 @@ event_sched_in(struct perf_event *event,
891 1301
892 event->tstamp_running += tstamp - event->tstamp_stopped; 1302 event->tstamp_running += tstamp - event->tstamp_stopped;
893 1303
894 event->shadow_ctx_time = tstamp - ctx->timestamp; 1304 perf_set_shadow_time(event, ctx, tstamp);
895 1305
896 if (!is_software_event(event)) 1306 if (!is_software_event(event))
897 cpuctx->active_oncpu++; 1307 cpuctx->active_oncpu++;
@@ -1012,7 +1422,8 @@ static void add_event_to_ctx(struct perf_event *event,
1012 event->tstamp_stopped = tstamp; 1422 event->tstamp_stopped = tstamp;
1013} 1423}
1014 1424
1015static void perf_event_context_sched_in(struct perf_event_context *ctx); 1425static void perf_event_context_sched_in(struct perf_event_context *ctx,
1426 struct task_struct *tsk);
1016 1427
1017/* 1428/*
1018 * Cross CPU call to install and enable a performance event 1429 * Cross CPU call to install and enable a performance event
@@ -1033,11 +1444,17 @@ static int __perf_install_in_context(void *info)
1033 * which do context switches with IRQs enabled. 1444 * which do context switches with IRQs enabled.
1034 */ 1445 */
1035 if (ctx->task && !cpuctx->task_ctx) 1446 if (ctx->task && !cpuctx->task_ctx)
1036 perf_event_context_sched_in(ctx); 1447 perf_event_context_sched_in(ctx, ctx->task);
1037 1448
1038 raw_spin_lock(&ctx->lock); 1449 raw_spin_lock(&ctx->lock);
1039 ctx->is_active = 1; 1450 ctx->is_active = 1;
1040 update_context_time(ctx); 1451 update_context_time(ctx);
1452 /*
1453 * update cgrp time only if current cgrp
1454 * matches event->cgrp. Must be done before
1455 * calling add_event_to_ctx()
1456 */
1457 update_cgrp_time_from_event(event);
1041 1458
1042 add_event_to_ctx(event, ctx); 1459 add_event_to_ctx(event, ctx);
1043 1460
@@ -1175,10 +1592,19 @@ static int __perf_event_enable(void *info)
1175 1592
1176 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1593 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1177 goto unlock; 1594 goto unlock;
1595
1596 /*
1597 * set current task's cgroup time reference point
1598 */
1599 perf_cgroup_set_timestamp(current, perf_clock());
1600
1178 __perf_event_mark_enabled(event, ctx); 1601 __perf_event_mark_enabled(event, ctx);
1179 1602
1180 if (!event_filter_match(event)) 1603 if (!event_filter_match(event)) {
1604 if (is_cgroup_event(event))
1605 perf_cgroup_defer_enabled(event);
1181 goto unlock; 1606 goto unlock;
1607 }
1182 1608
1183 /* 1609 /*
1184 * If the event is in a group and isn't the group leader, 1610 * If the event is in a group and isn't the group leader,
@@ -1307,6 +1733,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
1307 if (likely(!ctx->nr_events)) 1733 if (likely(!ctx->nr_events))
1308 goto out; 1734 goto out;
1309 update_context_time(ctx); 1735 update_context_time(ctx);
1736 update_cgrp_time_from_cpuctx(cpuctx);
1310 1737
1311 if (!ctx->nr_active) 1738 if (!ctx->nr_active)
1312 goto out; 1739 goto out;
@@ -1496,6 +1923,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
1496 1923
1497 for_each_task_context_nr(ctxn) 1924 for_each_task_context_nr(ctxn)
1498 perf_event_context_sched_out(task, ctxn, next); 1925 perf_event_context_sched_out(task, ctxn, next);
1926
1927 /*
1928 * if cgroup events exist on this CPU, then we need
1929 * to check if we have to switch out PMU state.
1930 * cgroup events are system-wide mode only
1931 */
1932 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1933 perf_cgroup_sched_out(task);
1499} 1934}
1500 1935
1501static void task_ctx_sched_out(struct perf_event_context *ctx, 1936static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1534,6 +1969,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
1534 if (!event_filter_match(event)) 1969 if (!event_filter_match(event))
1535 continue; 1970 continue;
1536 1971
1972 /* may need to reset tstamp_enabled */
1973 if (is_cgroup_event(event))
1974 perf_cgroup_mark_enabled(event, ctx);
1975
1537 if (group_can_go_on(event, cpuctx, 1)) 1976 if (group_can_go_on(event, cpuctx, 1))
1538 group_sched_in(event, cpuctx, ctx); 1977 group_sched_in(event, cpuctx, ctx);
1539 1978
@@ -1566,6 +2005,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1566 if (!event_filter_match(event)) 2005 if (!event_filter_match(event))
1567 continue; 2006 continue;
1568 2007
2008 /* may need to reset tstamp_enabled */
2009 if (is_cgroup_event(event))
2010 perf_cgroup_mark_enabled(event, ctx);
2011
1569 if (group_can_go_on(event, cpuctx, can_add_hw)) { 2012 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1570 if (group_sched_in(event, cpuctx, ctx)) 2013 if (group_sched_in(event, cpuctx, ctx))
1571 can_add_hw = 0; 2014 can_add_hw = 0;
@@ -1576,15 +2019,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
1576static void 2019static void
1577ctx_sched_in(struct perf_event_context *ctx, 2020ctx_sched_in(struct perf_event_context *ctx,
1578 struct perf_cpu_context *cpuctx, 2021 struct perf_cpu_context *cpuctx,
1579 enum event_type_t event_type) 2022 enum event_type_t event_type,
2023 struct task_struct *task)
1580{ 2024{
2025 u64 now;
2026
1581 raw_spin_lock(&ctx->lock); 2027 raw_spin_lock(&ctx->lock);
1582 ctx->is_active = 1; 2028 ctx->is_active = 1;
1583 if (likely(!ctx->nr_events)) 2029 if (likely(!ctx->nr_events))
1584 goto out; 2030 goto out;
1585 2031
1586 ctx->timestamp = perf_clock(); 2032 now = perf_clock();
1587 2033 ctx->timestamp = now;
2034 perf_cgroup_set_timestamp(task, now);
1588 /* 2035 /*
1589 * First go through the list and put on any pinned groups 2036 * First go through the list and put on any pinned groups
1590 * in order to give them the best chance of going on. 2037 * in order to give them the best chance of going on.
@@ -1601,11 +2048,12 @@ out:
1601} 2048}
1602 2049
1603static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, 2050static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1604 enum event_type_t event_type) 2051 enum event_type_t event_type,
2052 struct task_struct *task)
1605{ 2053{
1606 struct perf_event_context *ctx = &cpuctx->ctx; 2054 struct perf_event_context *ctx = &cpuctx->ctx;
1607 2055
1608 ctx_sched_in(ctx, cpuctx, event_type); 2056 ctx_sched_in(ctx, cpuctx, event_type, task);
1609} 2057}
1610 2058
1611static void task_ctx_sched_in(struct perf_event_context *ctx, 2059static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1617,11 +2065,12 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
1617 if (cpuctx->task_ctx == ctx) 2065 if (cpuctx->task_ctx == ctx)
1618 return; 2066 return;
1619 2067
1620 ctx_sched_in(ctx, cpuctx, event_type); 2068 ctx_sched_in(ctx, cpuctx, event_type, NULL);
1621 cpuctx->task_ctx = ctx; 2069 cpuctx->task_ctx = ctx;
1622} 2070}
1623 2071
1624static void perf_event_context_sched_in(struct perf_event_context *ctx) 2072static void perf_event_context_sched_in(struct perf_event_context *ctx,
2073 struct task_struct *task)
1625{ 2074{
1626 struct perf_cpu_context *cpuctx; 2075 struct perf_cpu_context *cpuctx;
1627 2076
@@ -1637,9 +2086,9 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx)
1637 */ 2086 */
1638 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2087 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1639 2088
1640 ctx_sched_in(ctx, cpuctx, EVENT_PINNED); 2089 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1641 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2090 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1642 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 2091 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1643 2092
1644 cpuctx->task_ctx = ctx; 2093 cpuctx->task_ctx = ctx;
1645 2094
@@ -1672,8 +2121,15 @@ void __perf_event_task_sched_in(struct task_struct *task)
1672 if (likely(!ctx)) 2121 if (likely(!ctx))
1673 continue; 2122 continue;
1674 2123
1675 perf_event_context_sched_in(ctx); 2124 perf_event_context_sched_in(ctx, task);
1676 } 2125 }
2126 /*
2127 * if cgroup events exist on this CPU, then we need
2128 * to check if we have to switch in PMU state.
2129 * cgroup events are system-wide mode only
2130 */
2131 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2132 perf_cgroup_sched_in(task);
1677} 2133}
1678 2134
1679static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2135static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -1873,7 +2329,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
1873 if (ctx) 2329 if (ctx)
1874 rotate_ctx(ctx); 2330 rotate_ctx(ctx);
1875 2331
1876 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); 2332 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
1877 if (ctx) 2333 if (ctx)
1878 task_ctx_sched_in(ctx, EVENT_FLEXIBLE); 2334 task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
1879 2335
@@ -1952,7 +2408,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
1952 2408
1953 raw_spin_unlock(&ctx->lock); 2409 raw_spin_unlock(&ctx->lock);
1954 2410
1955 perf_event_context_sched_in(ctx); 2411 perf_event_context_sched_in(ctx, ctx->task);
1956out: 2412out:
1957 local_irq_restore(flags); 2413 local_irq_restore(flags);
1958} 2414}
@@ -1977,8 +2433,10 @@ static void __perf_event_read(void *info)
1977 return; 2433 return;
1978 2434
1979 raw_spin_lock(&ctx->lock); 2435 raw_spin_lock(&ctx->lock);
1980 if (ctx->is_active) 2436 if (ctx->is_active) {
1981 update_context_time(ctx); 2437 update_context_time(ctx);
2438 update_cgrp_time_from_event(event);
2439 }
1982 update_event_times(event); 2440 update_event_times(event);
1983 if (event->state == PERF_EVENT_STATE_ACTIVE) 2441 if (event->state == PERF_EVENT_STATE_ACTIVE)
1984 event->pmu->read(event); 2442 event->pmu->read(event);
@@ -2009,8 +2467,10 @@ static u64 perf_event_read(struct perf_event *event)
2009 * (e.g., thread is blocked), in that case 2467 * (e.g., thread is blocked), in that case
2010 * we cannot update context time 2468 * we cannot update context time
2011 */ 2469 */
2012 if (ctx->is_active) 2470 if (ctx->is_active) {
2013 update_context_time(ctx); 2471 update_context_time(ctx);
2472 update_cgrp_time_from_event(event);
2473 }
2014 update_event_times(event); 2474 update_event_times(event);
2015 raw_spin_unlock_irqrestore(&ctx->lock, flags); 2475 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2016 } 2476 }
@@ -2395,7 +2855,7 @@ static void free_event(struct perf_event *event)
2395 2855
2396 if (!event->parent) { 2856 if (!event->parent) {
2397 if (event->attach_state & PERF_ATTACH_TASK) 2857 if (event->attach_state & PERF_ATTACH_TASK)
2398 jump_label_dec(&perf_task_events); 2858 jump_label_dec(&perf_sched_events);
2399 if (event->attr.mmap || event->attr.mmap_data) 2859 if (event->attr.mmap || event->attr.mmap_data)
2400 atomic_dec(&nr_mmap_events); 2860 atomic_dec(&nr_mmap_events);
2401 if (event->attr.comm) 2861 if (event->attr.comm)
@@ -2411,6 +2871,9 @@ static void free_event(struct perf_event *event)
2411 event->buffer = NULL; 2871 event->buffer = NULL;
2412 } 2872 }
2413 2873
2874 if (is_cgroup_event(event))
2875 perf_detach_cgroup(event);
2876
2414 if (event->destroy) 2877 if (event->destroy)
2415 event->destroy(event); 2878 event->destroy(event);
2416 2879
@@ -5300,6 +5763,7 @@ static void task_clock_event_read(struct perf_event *event)
5300 5763
5301 if (!in_nmi()) { 5764 if (!in_nmi()) {
5302 update_context_time(event->ctx); 5765 update_context_time(event->ctx);
5766 update_cgrp_time_from_event(event);
5303 time = event->ctx->time; 5767 time = event->ctx->time;
5304 } else { 5768 } else {
5305 u64 now = perf_clock(); 5769 u64 now = perf_clock();
@@ -5725,7 +6189,7 @@ done:
5725 6189
5726 if (!event->parent) { 6190 if (!event->parent) {
5727 if (event->attach_state & PERF_ATTACH_TASK) 6191 if (event->attach_state & PERF_ATTACH_TASK)
5728 jump_label_inc(&perf_task_events); 6192 jump_label_inc(&perf_sched_events);
5729 if (event->attr.mmap || event->attr.mmap_data) 6193 if (event->attr.mmap || event->attr.mmap_data)
5730 atomic_inc(&nr_mmap_events); 6194 atomic_inc(&nr_mmap_events);
5731 if (event->attr.comm) 6195 if (event->attr.comm)
@@ -5900,7 +6364,7 @@ SYSCALL_DEFINE5(perf_event_open,
5900 int err; 6364 int err;
5901 6365
5902 /* for future expandability... */ 6366 /* for future expandability... */
5903 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) 6367 if (flags & ~PERF_FLAG_ALL)
5904 return -EINVAL; 6368 return -EINVAL;
5905 6369
5906 err = perf_copy_attr(attr_uptr, &attr); 6370 err = perf_copy_attr(attr_uptr, &attr);
@@ -5917,6 +6381,15 @@ SYSCALL_DEFINE5(perf_event_open,
5917 return -EINVAL; 6381 return -EINVAL;
5918 } 6382 }
5919 6383
6384 /*
6385 * In cgroup mode, the pid argument is used to pass the fd
6386 * opened to the cgroup directory in cgroupfs. The cpu argument
6387 * designates the cpu on which to monitor threads from that
6388 * cgroup.
6389 */
6390 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6391 return -EINVAL;
6392
5920 event_fd = get_unused_fd_flags(O_RDWR); 6393 event_fd = get_unused_fd_flags(O_RDWR);
5921 if (event_fd < 0) 6394 if (event_fd < 0)
5922 return event_fd; 6395 return event_fd;
@@ -5934,7 +6407,7 @@ SYSCALL_DEFINE5(perf_event_open,
5934 group_leader = NULL; 6407 group_leader = NULL;
5935 } 6408 }
5936 6409
5937 if (pid != -1) { 6410 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
5938 task = find_lively_task_by_vpid(pid); 6411 task = find_lively_task_by_vpid(pid);
5939 if (IS_ERR(task)) { 6412 if (IS_ERR(task)) {
5940 err = PTR_ERR(task); 6413 err = PTR_ERR(task);
@@ -5948,6 +6421,12 @@ SYSCALL_DEFINE5(perf_event_open,
5948 goto err_task; 6421 goto err_task;
5949 } 6422 }
5950 6423
6424 if (flags & PERF_FLAG_PID_CGROUP) {
6425 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6426 if (err)
6427 goto err_alloc;
6428 }
6429
5951 /* 6430 /*
5952 * Special case software events and allow them to be part of 6431 * Special case software events and allow them to be part of
5953 * any hardware group. 6432 * any hardware group.
@@ -6808,3 +7287,92 @@ unlock:
6808 return ret; 7287 return ret;
6809} 7288}
6810device_initcall(perf_event_sysfs_init); 7289device_initcall(perf_event_sysfs_init);
7290
7291#ifdef CONFIG_CGROUP_PERF
7292static struct cgroup_subsys_state *perf_cgroup_create(
7293 struct cgroup_subsys *ss, struct cgroup *cont)
7294{
7295 struct perf_cgroup *jc;
7296 struct perf_cgroup_info *t;
7297 int c;
7298
7299 jc = kmalloc(sizeof(*jc), GFP_KERNEL);
7300 if (!jc)
7301 return ERR_PTR(-ENOMEM);
7302
7303 memset(jc, 0, sizeof(*jc));
7304
7305 jc->info = alloc_percpu(struct perf_cgroup_info);
7306 if (!jc->info) {
7307 kfree(jc);
7308 return ERR_PTR(-ENOMEM);
7309 }
7310
7311 for_each_possible_cpu(c) {
7312 t = per_cpu_ptr(jc->info, c);
7313 t->time = 0;
7314 t->timestamp = 0;
7315 }
7316 return &jc->css;
7317}
7318
7319static void perf_cgroup_destroy(struct cgroup_subsys *ss,
7320 struct cgroup *cont)
7321{
7322 struct perf_cgroup *jc;
7323 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7324 struct perf_cgroup, css);
7325 free_percpu(jc->info);
7326 kfree(jc);
7327}
7328
7329static int __perf_cgroup_move(void *info)
7330{
7331 struct task_struct *task = info;
7332 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7333 return 0;
7334}
7335
7336static void perf_cgroup_move(struct task_struct *task)
7337{
7338 task_function_call(task, __perf_cgroup_move, task);
7339}
7340
7341static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7342 struct cgroup *old_cgrp, struct task_struct *task,
7343 bool threadgroup)
7344{
7345 perf_cgroup_move(task);
7346 if (threadgroup) {
7347 struct task_struct *c;
7348 rcu_read_lock();
7349 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7350 perf_cgroup_move(c);
7351 }
7352 rcu_read_unlock();
7353 }
7354}
7355
7356static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7357 struct cgroup *old_cgrp, struct task_struct *task)
7358{
7359 /*
7360 * cgroup_exit() is called in the copy_process() failure path.
7361 * Ignore this case since the task hasn't run yet; this avoids
7362 * trying to poke a half freed task state from generic code.
7363 */
7364 if (!(task->flags & PF_EXITING))
7365 return;
7366
7367 perf_cgroup_move(task);
7368}
7369
7370struct cgroup_subsys perf_subsys = {
7371 .name = "perf_event",
7372 .subsys_id = perf_subsys_id,
7373 .create = perf_cgroup_create,
7374 .destroy = perf_cgroup_destroy,
7375 .exit = perf_cgroup_exit,
7376 .attach = perf_cgroup_attach,
7377};
7378#endif /* CONFIG_CGROUP_PERF */
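
The per-cgroup timing added above keeps a per-cpu { time, timestamp } pair for each perf cgroup: perf_cgroup_set_timestamp() records a reference point for the incoming task's cgroup when a context is scheduled in, and __update_cgrp_time() folds now - timestamp into the accumulated time, so time_enabled for a cgroup event only advances while tasks of that cgroup actually run on the monitored CPU. The stand-alone C model below (not kernel code; the clock values are invented purely for illustration) walks through that accounting.

/*
 * User-space model of the per-cgroup time accounting above.
 * now_ns stands in for perf_clock(); the two helpers mirror
 * perf_cgroup_set_timestamp() and __update_cgrp_time().
 */
#include <stdio.h>

struct cgroup_time {
	unsigned long long time;	/* accumulated monitored time, ns */
	unsigned long long timestamp;	/* last reference point, ns */
};

static unsigned long long now_ns;	/* fake perf_clock() */

static void cgroup_sched_in(struct cgroup_time *t)
{
	t->timestamp = now_ns;			/* perf_cgroup_set_timestamp() */
}

static void cgroup_update_time(struct cgroup_time *t)
{
	t->time += now_ns - t->timestamp;	/* __update_cgrp_time() */
	t->timestamp = now_ns;
}

int main(void)
{
	struct cgroup_time cg = { 0, 0 };

	now_ns = 1000; cgroup_sched_in(&cg);	/* cgroup task scheduled in */
	now_ns = 4000; cgroup_update_time(&cg);	/* switched out: +3000 ns */

	now_ns = 9000; cgroup_sched_in(&cg);	/* 5000 ns off-cgroup gap is skipped */
	now_ns = 10000; cgroup_update_time(&cg);/* +1000 ns */

	printf("cgroup time: %llu ns\n", cg.time);	/* prints 4000 */
	return 0;
}

The 5000 ns stretch during which no task of the cgroup was on the CPU never enters the total; that is the property update_event_times() relies on when it takes perf_cgroup_event_time() instead of ctx->time for cgroup events.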