Diffstat (limited to 'kernel/perf_event.c')
 kernel/perf_event.c | 1004
 1 file changed, 846 insertions, 158 deletions

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 656222fcf767..ed253aa24ba4 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,13 +38,96 @@ | |||
38 | 38 | ||
39 | #include <asm/irq_regs.h> | 39 | #include <asm/irq_regs.h> |
40 | 40 | ||
41 | struct remote_function_call { | ||
42 | struct task_struct *p; | ||
43 | int (*func)(void *info); | ||
44 | void *info; | ||
45 | int ret; | ||
46 | }; | ||
47 | |||
48 | static void remote_function(void *data) | ||
49 | { | ||
50 | struct remote_function_call *tfc = data; | ||
51 | struct task_struct *p = tfc->p; | ||
52 | |||
53 | if (p) { | ||
54 | tfc->ret = -EAGAIN; | ||
55 | if (task_cpu(p) != smp_processor_id() || !task_curr(p)) | ||
56 | return; | ||
57 | } | ||
58 | |||
59 | tfc->ret = tfc->func(tfc->info); | ||
60 | } | ||
61 | |||
62 | /** | ||
63 | * task_function_call - call a function on the cpu on which a task runs | ||
64 | * @p: the task to evaluate | ||
65 | * @func: the function to be called | ||
66 | * @info: the function call argument | ||
67 | * | ||
68 | * Calls the function @func when the task is currently running. This might | ||
69 | * be on the current CPU, in which case the function is called directly. | ||
70 | * | ||
71 | * returns: @func return value, or | ||
72 | * -ESRCH - when the process isn't running | ||
73 | * -EAGAIN - when the process moved away | ||
74 | */ | ||
75 | static int | ||
76 | task_function_call(struct task_struct *p, int (*func) (void *info), void *info) | ||
77 | { | ||
78 | struct remote_function_call data = { | ||
79 | .p = p, | ||
80 | .func = func, | ||
81 | .info = info, | ||
82 | .ret = -ESRCH, /* No such (running) process */ | ||
83 | }; | ||
84 | |||
85 | if (task_curr(p)) | ||
86 | smp_call_function_single(task_cpu(p), remote_function, &data, 1); | ||
87 | |||
88 | return data.ret; | ||
89 | } | ||
90 | |||
91 | /** | ||
92 | * cpu_function_call - call a function on the cpu | ||
93 | * @func: the function to be called | ||
94 | * @info: the function call argument | ||
95 | * | ||
96 | * Calls the function @func on the remote cpu. | ||
97 | * | ||
98 | * returns: @func return value or -ENXIO when the cpu is offline | ||
99 | */ | ||
100 | static int cpu_function_call(int cpu, int (*func) (void *info), void *info) | ||
101 | { | ||
102 | struct remote_function_call data = { | ||
103 | .p = NULL, | ||
104 | .func = func, | ||
105 | .info = info, | ||
106 | .ret = -ENXIO, /* No such CPU */ | ||
107 | }; | ||
108 | |||
109 | smp_call_function_single(cpu, remote_function, &data, 1); | ||
110 | |||
111 | return data.ret; | ||
112 | } | ||
113 | |||
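The contract these two helpers establish is what the rest of the patch leans on: task_function_call() returns the cross-call's result when the task was caught running on its CPU, -ESRCH when it was not running at all, and -EAGAIN when it migrated away before the IPI landed, so callers fall back to working under ctx->lock and retrying. A minimal caller sketch of that pattern, the same shape perf_remove_from_context() and perf_install_in_context() take further down; __example_cross_call() and example_modify_ctx() are placeholders, not functions from this patch:

	static int __example_cross_call(void *info)
	{
		/* runs on the task's CPU while that task is current */
		return 0;
	}

	static void example_modify_ctx(struct perf_event *event)
	{
		struct perf_event_context *ctx = event->ctx;
		struct task_struct *task = ctx->task;

	retry:
		if (!task_function_call(task, __example_cross_call, event))
			return;		/* ran on the task's CPU, done */

		raw_spin_lock_irq(&ctx->lock);
		if (ctx->is_active) {
			/* raced with a schedule-in; send the IPI again */
			raw_spin_unlock_irq(&ctx->lock);
			goto retry;
		}
		/* task is not running: modify ctx directly under ctx->lock */
		raw_spin_unlock_irq(&ctx->lock);
	}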
114 | #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ | ||
115 | PERF_FLAG_FD_OUTPUT |\ | ||
116 | PERF_FLAG_PID_CGROUP) | ||
117 | |||
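PERF_FLAG_PID_CGROUP is the user-visible side of the cgroup support: instead of a pid, userspace passes a file descriptor for a directory in the perf_event cgroup filesystem, and the event counts only while tasks of that cgroup run on the chosen CPU. A hedged userspace sketch, assuming a kernel with this patch and headers exporting the flag; the cgroupfs mount point and group name are made up for the example:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	static int open_cgroup_cycles(int cpu)
	{
		struct perf_event_attr attr = {
			.type	= PERF_TYPE_HARDWARE,
			.size	= sizeof(attr),
			.config	= PERF_COUNT_HW_CPU_CYCLES,
		};
		/* assumed mount point; any perf_event cgroup directory fd works */
		int cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);

		if (cgrp_fd < 0)
			return -1;

		/* cgroup events are per-cpu: cpu must be >= 0, the fd rides in "pid" */
		return syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu,
			       -1, PERF_FLAG_PID_CGROUP);
	}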
41 | enum event_type_t { | 118 | enum event_type_t { |
42 | EVENT_FLEXIBLE = 0x1, | 119 | EVENT_FLEXIBLE = 0x1, |
43 | EVENT_PINNED = 0x2, | 120 | EVENT_PINNED = 0x2, |
44 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, | 121 | EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, |
45 | }; | 122 | }; |
46 | 123 | ||
47 | atomic_t perf_task_events __read_mostly; | 124 | /* |
125 | * perf_sched_events : >0 events exist | ||
126 | * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu | ||
127 | */ | ||
128 | atomic_t perf_sched_events __read_mostly; | ||
129 | static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); | ||
130 | |||
48 | static atomic_t nr_mmap_events __read_mostly; | 131 | static atomic_t nr_mmap_events __read_mostly; |
49 | static atomic_t nr_comm_events __read_mostly; | 132 | static atomic_t nr_comm_events __read_mostly; |
50 | static atomic_t nr_task_events __read_mostly; | 133 | static atomic_t nr_task_events __read_mostly; |
@@ -67,7 +150,24 @@ int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ | |||
67 | /* | 150 | /* |
68 | * max perf event sample rate | 151 | * max perf event sample rate |
69 | */ | 152 | */ |
70 | int sysctl_perf_event_sample_rate __read_mostly = 100000; | 153 | #define DEFAULT_MAX_SAMPLE_RATE 100000 |
154 | int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | ||
155 | static int max_samples_per_tick __read_mostly = | ||
156 | DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | ||
157 | |||
158 | int perf_proc_update_handler(struct ctl_table *table, int write, | ||
159 | void __user *buffer, size_t *lenp, | ||
160 | loff_t *ppos) | ||
161 | { | ||
162 | int ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
163 | |||
164 | if (ret || !write) | ||
165 | return ret; | ||
166 | |||
167 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); | ||
168 | |||
169 | return 0; | ||
170 | } | ||
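perf_proc_update_handler() only does something once a sysctl entry points at it; that wiring lives in kernel/sysctl.c and is not part of this diff, so the table below is a sketch of what it plausibly looks like rather than a quote of it. With the default rate of 100000 samples/sec and, for example, HZ=1000, a write to the sysctl recomputes max_samples_per_tick as DIV_ROUND_UP(100000, 1000) = 100:

	/* hypothetical table; the real perf_event_max_sample_rate entry
	 * lives in kernel/sysctl.c, outside this diff */
	static struct ctl_table perf_sample_rate_sysctl[] = {
		{
			.procname	= "perf_event_max_sample_rate",
			.data		= &sysctl_perf_event_sample_rate,
			.maxlen		= sizeof(sysctl_perf_event_sample_rate),
			.mode		= 0644,
			.proc_handler	= perf_proc_update_handler,
		},
		{ }
	};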
71 | 171 | ||
72 | static atomic64_t perf_event_id; | 172 | static atomic64_t perf_event_id; |
73 | 173 | ||
@@ -75,7 +175,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | |||
75 | enum event_type_t event_type); | 175 | enum event_type_t event_type); |
76 | 176 | ||
77 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | 177 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, |
78 | enum event_type_t event_type); | 178 | enum event_type_t event_type, |
179 | struct task_struct *task); | ||
180 | |||
181 | static void update_context_time(struct perf_event_context *ctx); | ||
182 | static u64 perf_event_time(struct perf_event *event); | ||
79 | 183 | ||
80 | void __weak perf_event_print_debug(void) { } | 184 | void __weak perf_event_print_debug(void) { } |
81 | 185 | ||
@@ -89,6 +193,360 @@ static inline u64 perf_clock(void) | |||
89 | return local_clock(); | 193 | return local_clock(); |
90 | } | 194 | } |
91 | 195 | ||
196 | static inline struct perf_cpu_context * | ||
197 | __get_cpu_context(struct perf_event_context *ctx) | ||
198 | { | ||
199 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
200 | } | ||
201 | |||
202 | #ifdef CONFIG_CGROUP_PERF | ||
203 | |||
204 | /* | ||
205 | * Must ensure cgroup is pinned (css_get) before calling | ||
206 | * this function. In other words, we cannot call this function | ||
207 | * if there is no cgroup event for the current CPU context. | ||
208 | */ | ||
209 | static inline struct perf_cgroup * | ||
210 | perf_cgroup_from_task(struct task_struct *task) | ||
211 | { | ||
212 | return container_of(task_subsys_state(task, perf_subsys_id), | ||
213 | struct perf_cgroup, css); | ||
214 | } | ||
215 | |||
216 | static inline bool | ||
217 | perf_cgroup_match(struct perf_event *event) | ||
218 | { | ||
219 | struct perf_event_context *ctx = event->ctx; | ||
220 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | ||
221 | |||
222 | return !event->cgrp || event->cgrp == cpuctx->cgrp; | ||
223 | } | ||
224 | |||
225 | static inline void perf_get_cgroup(struct perf_event *event) | ||
226 | { | ||
227 | css_get(&event->cgrp->css); | ||
228 | } | ||
229 | |||
230 | static inline void perf_put_cgroup(struct perf_event *event) | ||
231 | { | ||
232 | css_put(&event->cgrp->css); | ||
233 | } | ||
234 | |||
235 | static inline void perf_detach_cgroup(struct perf_event *event) | ||
236 | { | ||
237 | perf_put_cgroup(event); | ||
238 | event->cgrp = NULL; | ||
239 | } | ||
240 | |||
241 | static inline int is_cgroup_event(struct perf_event *event) | ||
242 | { | ||
243 | return event->cgrp != NULL; | ||
244 | } | ||
245 | |||
246 | static inline u64 perf_cgroup_event_time(struct perf_event *event) | ||
247 | { | ||
248 | struct perf_cgroup_info *t; | ||
249 | |||
250 | t = per_cpu_ptr(event->cgrp->info, event->cpu); | ||
251 | return t->time; | ||
252 | } | ||
253 | |||
254 | static inline void __update_cgrp_time(struct perf_cgroup *cgrp) | ||
255 | { | ||
256 | struct perf_cgroup_info *info; | ||
257 | u64 now; | ||
258 | |||
259 | now = perf_clock(); | ||
260 | |||
261 | info = this_cpu_ptr(cgrp->info); | ||
262 | |||
263 | info->time += now - info->timestamp; | ||
264 | info->timestamp = now; | ||
265 | } | ||
266 | |||
267 | static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) | ||
268 | { | ||
269 | struct perf_cgroup *cgrp_out = cpuctx->cgrp; | ||
270 | if (cgrp_out) | ||
271 | __update_cgrp_time(cgrp_out); | ||
272 | } | ||
273 | |||
274 | static inline void update_cgrp_time_from_event(struct perf_event *event) | ||
275 | { | ||
276 | struct perf_cgroup *cgrp; | ||
277 | |||
278 | /* | ||
279 | * ensure we access cgroup data only when needed and | ||
280 | * when we know the cgroup is pinned (css_get) | ||
281 | */ | ||
282 | if (!is_cgroup_event(event)) | ||
283 | return; | ||
284 | |||
285 | cgrp = perf_cgroup_from_task(current); | ||
286 | /* | ||
287 | * Do not update time when cgroup is not active | ||
288 | */ | ||
289 | if (cgrp == event->cgrp) | ||
290 | __update_cgrp_time(event->cgrp); | ||
291 | } | ||
292 | |||
293 | static inline void | ||
294 | perf_cgroup_set_timestamp(struct task_struct *task, | ||
295 | struct perf_event_context *ctx) | ||
296 | { | ||
297 | struct perf_cgroup *cgrp; | ||
298 | struct perf_cgroup_info *info; | ||
299 | |||
300 | /* | ||
301 | * ctx->lock held by caller | ||
302 | * ensure we do not access cgroup data | ||
303 | * unless we have the cgroup pinned (css_get) | ||
304 | */ | ||
305 | if (!task || !ctx->nr_cgroups) | ||
306 | return; | ||
307 | |||
308 | cgrp = perf_cgroup_from_task(task); | ||
309 | info = this_cpu_ptr(cgrp->info); | ||
310 | info->timestamp = ctx->timestamp; | ||
311 | } | ||
312 | |||
313 | #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ | ||
314 | #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ | ||
315 | |||
316 | /* | ||
317 | * reschedule events based on the cgroup constraint of task. | ||
318 | * | ||
319 | * mode SWOUT : schedule out everything | ||
320 | * mode SWIN : schedule in based on cgroup for the next task | ||
321 | */ | ||
322 | void perf_cgroup_switch(struct task_struct *task, int mode) | ||
323 | { | ||
324 | struct perf_cpu_context *cpuctx; | ||
325 | struct pmu *pmu; | ||
326 | unsigned long flags; | ||
327 | |||
328 | /* | ||
329 | * disable interrupts to avoid getting nr_cgroup | ||
330 | * changes via __perf_event_disable(). Also | ||
331 | * avoids preemption. | ||
332 | */ | ||
333 | local_irq_save(flags); | ||
334 | |||
335 | /* | ||
336 | * we reschedule only in the presence of cgroup | ||
337 | * constrained events. | ||
338 | */ | ||
339 | rcu_read_lock(); | ||
340 | |||
341 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
342 | |||
343 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
344 | |||
345 | perf_pmu_disable(cpuctx->ctx.pmu); | ||
346 | |||
347 | /* | ||
348 | * perf_cgroup_events says at least one | ||
349 | * context on this CPU has cgroup events. | ||
350 | * | ||
351 | * ctx->nr_cgroups reports the number of cgroup | ||
352 | * events for a context. | ||
353 | */ | ||
354 | if (cpuctx->ctx.nr_cgroups > 0) { | ||
355 | |||
356 | if (mode & PERF_CGROUP_SWOUT) { | ||
357 | cpu_ctx_sched_out(cpuctx, EVENT_ALL); | ||
358 | /* | ||
359 | * must not be done before ctxswout due | ||
360 | * to event_filter_match() in event_sched_out() | ||
361 | */ | ||
362 | cpuctx->cgrp = NULL; | ||
363 | } | ||
364 | |||
365 | if (mode & PERF_CGROUP_SWIN) { | ||
366 | /* set cgrp before ctxsw in to | ||
367 | * allow event_filter_match() to not | ||
368 | * have to pass task around | ||
369 | */ | ||
370 | cpuctx->cgrp = perf_cgroup_from_task(task); | ||
371 | cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); | ||
372 | } | ||
373 | } | ||
374 | |||
375 | perf_pmu_enable(cpuctx->ctx.pmu); | ||
376 | } | ||
377 | |||
378 | rcu_read_unlock(); | ||
379 | |||
380 | local_irq_restore(flags); | ||
381 | } | ||
382 | |||
383 | static inline void perf_cgroup_sched_out(struct task_struct *task) | ||
384 | { | ||
385 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT); | ||
386 | } | ||
387 | |||
388 | static inline void perf_cgroup_sched_in(struct task_struct *task) | ||
389 | { | ||
390 | perf_cgroup_switch(task, PERF_CGROUP_SWIN); | ||
391 | } | ||
392 | |||
393 | static inline int perf_cgroup_connect(int fd, struct perf_event *event, | ||
394 | struct perf_event_attr *attr, | ||
395 | struct perf_event *group_leader) | ||
396 | { | ||
397 | struct perf_cgroup *cgrp; | ||
398 | struct cgroup_subsys_state *css; | ||
399 | struct file *file; | ||
400 | int ret = 0, fput_needed; | ||
401 | |||
402 | file = fget_light(fd, &fput_needed); | ||
403 | if (!file) | ||
404 | return -EBADF; | ||
405 | |||
406 | css = cgroup_css_from_dir(file, perf_subsys_id); | ||
407 | if (IS_ERR(css)) { | ||
408 | ret = PTR_ERR(css); | ||
409 | goto out; | ||
410 | } | ||
411 | |||
412 | cgrp = container_of(css, struct perf_cgroup, css); | ||
413 | event->cgrp = cgrp; | ||
414 | |||
415 | /* must be done before we fput() the file */ | ||
416 | perf_get_cgroup(event); | ||
417 | |||
418 | /* | ||
419 | * all events in a group must monitor | ||
420 | * the same cgroup because a task belongs | ||
421 | * to only one perf cgroup at a time | ||
422 | */ | ||
423 | if (group_leader && group_leader->cgrp != cgrp) { | ||
424 | perf_detach_cgroup(event); | ||
425 | ret = -EINVAL; | ||
426 | } | ||
427 | out: | ||
428 | fput_light(file, fput_needed); | ||
429 | return ret; | ||
430 | } | ||
431 | |||
432 | static inline void | ||
433 | perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) | ||
434 | { | ||
435 | struct perf_cgroup_info *t; | ||
436 | t = per_cpu_ptr(event->cgrp->info, event->cpu); | ||
437 | event->shadow_ctx_time = now - t->timestamp; | ||
438 | } | ||
439 | |||
440 | static inline void | ||
441 | perf_cgroup_defer_enabled(struct perf_event *event) | ||
442 | { | ||
443 | /* | ||
444 | * when the current task's perf cgroup does not match | ||
445 | * the event's, we need to remember to call the | ||
446 | * perf_mark_enable() function the first time a task with | ||
447 | * a matching perf cgroup is scheduled in. | ||
448 | */ | ||
449 | if (is_cgroup_event(event) && !perf_cgroup_match(event)) | ||
450 | event->cgrp_defer_enabled = 1; | ||
451 | } | ||
452 | |||
453 | static inline void | ||
454 | perf_cgroup_mark_enabled(struct perf_event *event, | ||
455 | struct perf_event_context *ctx) | ||
456 | { | ||
457 | struct perf_event *sub; | ||
458 | u64 tstamp = perf_event_time(event); | ||
459 | |||
460 | if (!event->cgrp_defer_enabled) | ||
461 | return; | ||
462 | |||
463 | event->cgrp_defer_enabled = 0; | ||
464 | |||
465 | event->tstamp_enabled = tstamp - event->total_time_enabled; | ||
466 | list_for_each_entry(sub, &event->sibling_list, group_entry) { | ||
467 | if (sub->state >= PERF_EVENT_STATE_INACTIVE) { | ||
468 | sub->tstamp_enabled = tstamp - sub->total_time_enabled; | ||
469 | sub->cgrp_defer_enabled = 0; | ||
470 | } | ||
471 | } | ||
472 | } | ||
473 | #else /* !CONFIG_CGROUP_PERF */ | ||
474 | |||
475 | static inline bool | ||
476 | perf_cgroup_match(struct perf_event *event) | ||
477 | { | ||
478 | return true; | ||
479 | } | ||
480 | |||
481 | static inline void perf_detach_cgroup(struct perf_event *event) | ||
482 | {} | ||
483 | |||
484 | static inline int is_cgroup_event(struct perf_event *event) | ||
485 | { | ||
486 | return 0; | ||
487 | } | ||
488 | |||
489 | static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) | ||
490 | { | ||
491 | return 0; | ||
492 | } | ||
493 | |||
494 | static inline void update_cgrp_time_from_event(struct perf_event *event) | ||
495 | { | ||
496 | } | ||
497 | |||
498 | static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) | ||
499 | { | ||
500 | } | ||
501 | |||
502 | static inline void perf_cgroup_sched_out(struct task_struct *task) | ||
503 | { | ||
504 | } | ||
505 | |||
506 | static inline void perf_cgroup_sched_in(struct task_struct *task) | ||
507 | { | ||
508 | } | ||
509 | |||
510 | static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, | ||
511 | struct perf_event_attr *attr, | ||
512 | struct perf_event *group_leader) | ||
513 | { | ||
514 | return -EINVAL; | ||
515 | } | ||
516 | |||
517 | static inline void | ||
518 | perf_cgroup_set_timestamp(struct task_struct *task, | ||
519 | struct perf_event_context *ctx) | ||
520 | { | ||
521 | } | ||
522 | |||
523 | void | ||
524 | perf_cgroup_switch(struct task_struct *task, struct task_struct *next) | ||
525 | { | ||
526 | } | ||
527 | |||
528 | static inline void | ||
529 | perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) | ||
530 | { | ||
531 | } | ||
532 | |||
533 | static inline u64 perf_cgroup_event_time(struct perf_event *event) | ||
534 | { | ||
535 | return 0; | ||
536 | } | ||
537 | |||
538 | static inline void | ||
539 | perf_cgroup_defer_enabled(struct perf_event *event) | ||
540 | { | ||
541 | } | ||
542 | |||
543 | static inline void | ||
544 | perf_cgroup_mark_enabled(struct perf_event *event, | ||
545 | struct perf_event_context *ctx) | ||
546 | { | ||
547 | } | ||
548 | #endif | ||
549 | |||
92 | void perf_pmu_disable(struct pmu *pmu) | 550 | void perf_pmu_disable(struct pmu *pmu) |
93 | { | 551 | { |
94 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | 552 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
@@ -254,7 +712,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) | |||
254 | raw_spin_lock_irqsave(&ctx->lock, flags); | 712 | raw_spin_lock_irqsave(&ctx->lock, flags); |
255 | --ctx->pin_count; | 713 | --ctx->pin_count; |
256 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 714 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
257 | put_ctx(ctx); | ||
258 | } | 715 | } |
259 | 716 | ||
260 | /* | 717 | /* |
@@ -271,6 +728,10 @@ static void update_context_time(struct perf_event_context *ctx) | |||
271 | static u64 perf_event_time(struct perf_event *event) | 728 | static u64 perf_event_time(struct perf_event *event) |
272 | { | 729 | { |
273 | struct perf_event_context *ctx = event->ctx; | 730 | struct perf_event_context *ctx = event->ctx; |
731 | |||
732 | if (is_cgroup_event(event)) | ||
733 | return perf_cgroup_event_time(event); | ||
734 | |||
274 | return ctx ? ctx->time : 0; | 735 | return ctx ? ctx->time : 0; |
275 | } | 736 | } |
276 | 737 | ||
@@ -285,9 +746,20 @@ static void update_event_times(struct perf_event *event) | |||
285 | if (event->state < PERF_EVENT_STATE_INACTIVE || | 746 | if (event->state < PERF_EVENT_STATE_INACTIVE || |
286 | event->group_leader->state < PERF_EVENT_STATE_INACTIVE) | 747 | event->group_leader->state < PERF_EVENT_STATE_INACTIVE) |
287 | return; | 748 | return; |
288 | 749 | /* | |
289 | if (ctx->is_active) | 750 | * in cgroup mode, time_enabled represents |
751 | * the time the event was enabled AND active | ||
752 | * tasks were in the monitored cgroup. This is | ||
753 | * independent of the activity of the context as | ||
754 | * there may be a mix of cgroup and non-cgroup events. | ||
755 | * | ||
756 | * That is why we treat cgroup events differently | ||
757 | * here. | ||
758 | */ | ||
759 | if (is_cgroup_event(event)) | ||
290 | run_end = perf_event_time(event); | 760 | run_end = perf_event_time(event); |
761 | else if (ctx->is_active) | ||
762 | run_end = ctx->time; | ||
291 | else | 763 | else |
292 | run_end = event->tstamp_stopped; | 764 | run_end = event->tstamp_stopped; |
293 | 765 | ||
@@ -299,6 +771,7 @@ static void update_event_times(struct perf_event *event) | |||
299 | run_end = perf_event_time(event); | 771 | run_end = perf_event_time(event); |
300 | 772 | ||
301 | event->total_time_running = run_end - event->tstamp_running; | 773 | event->total_time_running = run_end - event->tstamp_running; |
774 | |||
302 | } | 775 | } |
303 | 776 | ||
304 | /* | 777 | /* |
@@ -347,6 +820,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) | |||
347 | list_add_tail(&event->group_entry, list); | 820 | list_add_tail(&event->group_entry, list); |
348 | } | 821 | } |
349 | 822 | ||
823 | if (is_cgroup_event(event)) | ||
824 | ctx->nr_cgroups++; | ||
825 | |||
350 | list_add_rcu(&event->event_entry, &ctx->event_list); | 826 | list_add_rcu(&event->event_entry, &ctx->event_list); |
351 | if (!ctx->nr_events) | 827 | if (!ctx->nr_events) |
352 | perf_pmu_rotate_start(ctx->pmu); | 828 | perf_pmu_rotate_start(ctx->pmu); |
@@ -473,6 +949,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) | |||
473 | 949 | ||
474 | event->attach_state &= ~PERF_ATTACH_CONTEXT; | 950 | event->attach_state &= ~PERF_ATTACH_CONTEXT; |
475 | 951 | ||
952 | if (is_cgroup_event(event)) | ||
953 | ctx->nr_cgroups--; | ||
954 | |||
476 | ctx->nr_events--; | 955 | ctx->nr_events--; |
477 | if (event->attr.inherit_stat) | 956 | if (event->attr.inherit_stat) |
478 | ctx->nr_stat--; | 957 | ctx->nr_stat--; |
@@ -544,7 +1023,8 @@ out: | |||
544 | static inline int | 1023 | static inline int |
545 | event_filter_match(struct perf_event *event) | 1024 | event_filter_match(struct perf_event *event) |
546 | { | 1025 | { |
547 | return event->cpu == -1 || event->cpu == smp_processor_id(); | 1026 | return (event->cpu == -1 || event->cpu == smp_processor_id()) |
1027 | && perf_cgroup_match(event); | ||
548 | } | 1028 | } |
549 | 1029 | ||
550 | static void | 1030 | static void |
@@ -562,7 +1042,7 @@ event_sched_out(struct perf_event *event, | |||
562 | */ | 1042 | */ |
563 | if (event->state == PERF_EVENT_STATE_INACTIVE | 1043 | if (event->state == PERF_EVENT_STATE_INACTIVE |
564 | && !event_filter_match(event)) { | 1044 | && !event_filter_match(event)) { |
565 | delta = ctx->time - event->tstamp_stopped; | 1045 | delta = tstamp - event->tstamp_stopped; |
566 | event->tstamp_running += delta; | 1046 | event->tstamp_running += delta; |
567 | event->tstamp_stopped = tstamp; | 1047 | event->tstamp_stopped = tstamp; |
568 | } | 1048 | } |
@@ -606,47 +1086,30 @@ group_sched_out(struct perf_event *group_event, | |||
606 | cpuctx->exclusive = 0; | 1086 | cpuctx->exclusive = 0; |
607 | } | 1087 | } |
608 | 1088 | ||
609 | static inline struct perf_cpu_context * | ||
610 | __get_cpu_context(struct perf_event_context *ctx) | ||
611 | { | ||
612 | return this_cpu_ptr(ctx->pmu->pmu_cpu_context); | ||
613 | } | ||
614 | |||
615 | /* | 1089 | /* |
616 | * Cross CPU call to remove a performance event | 1090 | * Cross CPU call to remove a performance event |
617 | * | 1091 | * |
618 | * We disable the event on the hardware level first. After that we | 1092 | * We disable the event on the hardware level first. After that we |
619 | * remove it from the context list. | 1093 | * remove it from the context list. |
620 | */ | 1094 | */ |
621 | static void __perf_event_remove_from_context(void *info) | 1095 | static int __perf_remove_from_context(void *info) |
622 | { | 1096 | { |
623 | struct perf_event *event = info; | 1097 | struct perf_event *event = info; |
624 | struct perf_event_context *ctx = event->ctx; | 1098 | struct perf_event_context *ctx = event->ctx; |
625 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 1099 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
626 | 1100 | ||
627 | /* | ||
628 | * If this is a task context, we need to check whether it is | ||
629 | * the current task context of this cpu. If not it has been | ||
630 | * scheduled out before the smp call arrived. | ||
631 | */ | ||
632 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
633 | return; | ||
634 | |||
635 | raw_spin_lock(&ctx->lock); | 1101 | raw_spin_lock(&ctx->lock); |
636 | |||
637 | event_sched_out(event, cpuctx, ctx); | 1102 | event_sched_out(event, cpuctx, ctx); |
638 | |||
639 | list_del_event(event, ctx); | 1103 | list_del_event(event, ctx); |
640 | |||
641 | raw_spin_unlock(&ctx->lock); | 1104 | raw_spin_unlock(&ctx->lock); |
1105 | |||
1106 | return 0; | ||
642 | } | 1107 | } |
643 | 1108 | ||
644 | 1109 | ||
645 | /* | 1110 | /* |
646 | * Remove the event from a task's (or a CPU's) list of events. | 1111 | * Remove the event from a task's (or a CPU's) list of events. |
647 | * | 1112 | * |
648 | * Must be called with ctx->mutex held. | ||
649 | * | ||
650 | * CPU events are removed with a smp call. For task events we only | 1113 | * CPU events are removed with a smp call. For task events we only |
651 | * call when the task is on a CPU. | 1114 | * call when the task is on a CPU. |
652 | * | 1115 | * |
@@ -657,49 +1120,48 @@ static void __perf_event_remove_from_context(void *info) | |||
657 | * When called from perf_event_exit_task, it's OK because the | 1120 | * When called from perf_event_exit_task, it's OK because the |
658 | * context has been detached from its task. | 1121 | * context has been detached from its task. |
659 | */ | 1122 | */ |
660 | static void perf_event_remove_from_context(struct perf_event *event) | 1123 | static void perf_remove_from_context(struct perf_event *event) |
661 | { | 1124 | { |
662 | struct perf_event_context *ctx = event->ctx; | 1125 | struct perf_event_context *ctx = event->ctx; |
663 | struct task_struct *task = ctx->task; | 1126 | struct task_struct *task = ctx->task; |
664 | 1127 | ||
1128 | lockdep_assert_held(&ctx->mutex); | ||
1129 | |||
665 | if (!task) { | 1130 | if (!task) { |
666 | /* | 1131 | /* |
667 | * Per cpu events are removed via an smp call and | 1132 | * Per cpu events are removed via an smp call and |
668 | * the removal is always successful. | 1133 | * the removal is always successful. |
669 | */ | 1134 | */ |
670 | smp_call_function_single(event->cpu, | 1135 | cpu_function_call(event->cpu, __perf_remove_from_context, event); |
671 | __perf_event_remove_from_context, | ||
672 | event, 1); | ||
673 | return; | 1136 | return; |
674 | } | 1137 | } |
675 | 1138 | ||
676 | retry: | 1139 | retry: |
677 | task_oncpu_function_call(task, __perf_event_remove_from_context, | 1140 | if (!task_function_call(task, __perf_remove_from_context, event)) |
678 | event); | 1141 | return; |
679 | 1142 | ||
680 | raw_spin_lock_irq(&ctx->lock); | 1143 | raw_spin_lock_irq(&ctx->lock); |
681 | /* | 1144 | /* |
682 | * If the context is active we need to retry the smp call. | 1145 | * If we failed to find a running task, but find the context active now |
1146 | * that we've acquired the ctx->lock, retry. | ||
683 | */ | 1147 | */ |
684 | if (ctx->nr_active && !list_empty(&event->group_entry)) { | 1148 | if (ctx->is_active) { |
685 | raw_spin_unlock_irq(&ctx->lock); | 1149 | raw_spin_unlock_irq(&ctx->lock); |
686 | goto retry; | 1150 | goto retry; |
687 | } | 1151 | } |
688 | 1152 | ||
689 | /* | 1153 | /* |
690 | * The lock prevents that this context is scheduled in so we | 1154 | * Since the task isn't running, it's safe to remove the event; our |
691 | * can remove the event safely, if the call above did not | 1155 | * holding the ctx->lock ensures the task won't get scheduled in. |
692 | * succeed. | ||
693 | */ | 1156 | */ |
694 | if (!list_empty(&event->group_entry)) | 1157 | list_del_event(event, ctx); |
695 | list_del_event(event, ctx); | ||
696 | raw_spin_unlock_irq(&ctx->lock); | 1158 | raw_spin_unlock_irq(&ctx->lock); |
697 | } | 1159 | } |
698 | 1160 | ||
699 | /* | 1161 | /* |
700 | * Cross CPU call to disable a performance event | 1162 | * Cross CPU call to disable a performance event |
701 | */ | 1163 | */ |
702 | static void __perf_event_disable(void *info) | 1164 | static int __perf_event_disable(void *info) |
703 | { | 1165 | { |
704 | struct perf_event *event = info; | 1166 | struct perf_event *event = info; |
705 | struct perf_event_context *ctx = event->ctx; | 1167 | struct perf_event_context *ctx = event->ctx; |
@@ -708,9 +1170,12 @@ static void __perf_event_disable(void *info) | |||
708 | /* | 1170 | /* |
709 | * If this is a per-task event, need to check whether this | 1171 | * If this is a per-task event, need to check whether this |
710 | * event's task is the current task on this cpu. | 1172 | * event's task is the current task on this cpu. |
1173 | * | ||
1174 | * Can trigger due to concurrent perf_event_context_sched_out() | ||
1175 | * flipping contexts around. | ||
711 | */ | 1176 | */ |
712 | if (ctx->task && cpuctx->task_ctx != ctx) | 1177 | if (ctx->task && cpuctx->task_ctx != ctx) |
713 | return; | 1178 | return -EINVAL; |
714 | 1179 | ||
715 | raw_spin_lock(&ctx->lock); | 1180 | raw_spin_lock(&ctx->lock); |
716 | 1181 | ||
@@ -720,6 +1185,7 @@ static void __perf_event_disable(void *info) | |||
720 | */ | 1185 | */ |
721 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { | 1186 | if (event->state >= PERF_EVENT_STATE_INACTIVE) { |
722 | update_context_time(ctx); | 1187 | update_context_time(ctx); |
1188 | update_cgrp_time_from_event(event); | ||
723 | update_group_times(event); | 1189 | update_group_times(event); |
724 | if (event == event->group_leader) | 1190 | if (event == event->group_leader) |
725 | group_sched_out(event, cpuctx, ctx); | 1191 | group_sched_out(event, cpuctx, ctx); |
@@ -729,6 +1195,8 @@ static void __perf_event_disable(void *info) | |||
729 | } | 1195 | } |
730 | 1196 | ||
731 | raw_spin_unlock(&ctx->lock); | 1197 | raw_spin_unlock(&ctx->lock); |
1198 | |||
1199 | return 0; | ||
732 | } | 1200 | } |
733 | 1201 | ||
734 | /* | 1202 | /* |
@@ -753,13 +1221,13 @@ void perf_event_disable(struct perf_event *event) | |||
753 | /* | 1221 | /* |
754 | * Disable the event on the cpu that it's on | 1222 | * Disable the event on the cpu that it's on |
755 | */ | 1223 | */ |
756 | smp_call_function_single(event->cpu, __perf_event_disable, | 1224 | cpu_function_call(event->cpu, __perf_event_disable, event); |
757 | event, 1); | ||
758 | return; | 1225 | return; |
759 | } | 1226 | } |
760 | 1227 | ||
761 | retry: | 1228 | retry: |
762 | task_oncpu_function_call(task, __perf_event_disable, event); | 1229 | if (!task_function_call(task, __perf_event_disable, event)) |
1230 | return; | ||
763 | 1231 | ||
764 | raw_spin_lock_irq(&ctx->lock); | 1232 | raw_spin_lock_irq(&ctx->lock); |
765 | /* | 1233 | /* |
@@ -767,6 +1235,11 @@ retry: | |||
767 | */ | 1235 | */ |
768 | if (event->state == PERF_EVENT_STATE_ACTIVE) { | 1236 | if (event->state == PERF_EVENT_STATE_ACTIVE) { |
769 | raw_spin_unlock_irq(&ctx->lock); | 1237 | raw_spin_unlock_irq(&ctx->lock); |
1238 | /* | ||
1239 | * Reload the task pointer, it might have been changed by | ||
1240 | * a concurrent perf_event_context_sched_out(). | ||
1241 | */ | ||
1242 | task = ctx->task; | ||
770 | goto retry; | 1243 | goto retry; |
771 | } | 1244 | } |
772 | 1245 | ||
@@ -778,10 +1251,44 @@ retry: | |||
778 | update_group_times(event); | 1251 | update_group_times(event); |
779 | event->state = PERF_EVENT_STATE_OFF; | 1252 | event->state = PERF_EVENT_STATE_OFF; |
780 | } | 1253 | } |
781 | |||
782 | raw_spin_unlock_irq(&ctx->lock); | 1254 | raw_spin_unlock_irq(&ctx->lock); |
783 | } | 1255 | } |
784 | 1256 | ||
1257 | static void perf_set_shadow_time(struct perf_event *event, | ||
1258 | struct perf_event_context *ctx, | ||
1259 | u64 tstamp) | ||
1260 | { | ||
1261 | /* | ||
1262 | * use the correct time source for the time snapshot | ||
1263 | * | ||
1264 | * We could get by without this by leveraging the | ||
1265 | * fact that to get to this function, the caller | ||
1266 | * has most likely already called update_context_time() | ||
1267 | * and update_cgrp_time_xx() and thus both timestamps | ||
1268 | * are identical (or very close). Given that tstamp is | ||
1269 | * already adjusted for cgroup, we could say that: | ||
1270 | * tstamp - ctx->timestamp | ||
1271 | * is equivalent to | ||
1272 | * tstamp - cgrp->timestamp. | ||
1273 | * | ||
1274 | * Then, in perf_output_read(), the calculation would | ||
1275 | * work with no changes because: | ||
1276 | * - event is guaranteed scheduled in | ||
1277 | * - no scheduled out in between | ||
1278 | * - thus the timestamp would be the same | ||
1279 | * | ||
1280 | * But this is a bit hairy. | ||
1281 | * | ||
1282 | * So instead, we have an explicit cgroup call to remain | ||
1283 | * within the time source all along. We believe it | ||
1284 | * is cleaner and simpler to understand. | ||
1285 | */ | ||
1286 | if (is_cgroup_event(event)) | ||
1287 | perf_cgroup_set_shadow_time(event, tstamp); | ||
1288 | else | ||
1289 | event->shadow_ctx_time = tstamp - ctx->timestamp; | ||
1290 | } | ||
1291 | |||
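The value stashed here is consumed on the read side: shadow_ctx_time is the offset between the (cgroup- or context-relative) clock and perf_clock() at schedule-in, which lets an NMI-safe reader reconstruct a context-relative "now" without taking ctx->lock. Roughly, the consumer in perf_output_read() does something like the sketch below; example_read_times() is a placeholder name, not a function from this file:

	static void example_read_times(struct perf_event *event,
				       u64 *enabled, u64 *running)
	{
		/* NMI-safe: no ctx->lock, just the offset cached at sched-in */
		u64 ctx_time = event->shadow_ctx_time + perf_clock();

		*enabled = ctx_time - event->tstamp_enabled;
		*running = ctx_time - event->tstamp_running;
	}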
785 | #define MAX_INTERRUPTS (~0ULL) | 1292 | #define MAX_INTERRUPTS (~0ULL) |
786 | 1293 | ||
787 | static void perf_log_throttle(struct perf_event *event, int enable); | 1294 | static void perf_log_throttle(struct perf_event *event, int enable); |
@@ -822,7 +1329,7 @@ event_sched_in(struct perf_event *event, | |||
822 | 1329 | ||
823 | event->tstamp_running += tstamp - event->tstamp_stopped; | 1330 | event->tstamp_running += tstamp - event->tstamp_stopped; |
824 | 1331 | ||
825 | event->shadow_ctx_time = tstamp - ctx->timestamp; | 1332 | perf_set_shadow_time(event, ctx, tstamp); |
826 | 1333 | ||
827 | if (!is_software_event(event)) | 1334 | if (!is_software_event(event)) |
828 | cpuctx->active_oncpu++; | 1335 | cpuctx->active_oncpu++; |
@@ -943,12 +1450,15 @@ static void add_event_to_ctx(struct perf_event *event, | |||
943 | event->tstamp_stopped = tstamp; | 1450 | event->tstamp_stopped = tstamp; |
944 | } | 1451 | } |
945 | 1452 | ||
1453 | static void perf_event_context_sched_in(struct perf_event_context *ctx, | ||
1454 | struct task_struct *tsk); | ||
1455 | |||
946 | /* | 1456 | /* |
947 | * Cross CPU call to install and enable a performance event | 1457 | * Cross CPU call to install and enable a performance event |
948 | * | 1458 | * |
949 | * Must be called with ctx->mutex held | 1459 | * Must be called with ctx->mutex held |
950 | */ | 1460 | */ |
951 | static void __perf_install_in_context(void *info) | 1461 | static int __perf_install_in_context(void *info) |
952 | { | 1462 | { |
953 | struct perf_event *event = info; | 1463 | struct perf_event *event = info; |
954 | struct perf_event_context *ctx = event->ctx; | 1464 | struct perf_event_context *ctx = event->ctx; |
@@ -957,21 +1467,22 @@ static void __perf_install_in_context(void *info) | |||
957 | int err; | 1467 | int err; |
958 | 1468 | ||
959 | /* | 1469 | /* |
960 | * If this is a task context, we need to check whether it is | 1470 | * In case we're installing a new context to an already running task, |
961 | * the current task context of this cpu. If not it has been | 1471 | * this could also happen before perf_event_task_sched_in() on architectures |
962 | * scheduled out before the smp call arrived. | 1472 | * which do context switches with IRQs enabled. |
963 | * Or possibly this is the right context but it isn't | ||
964 | * on this cpu because it had no events. | ||
965 | */ | 1473 | */ |
966 | if (ctx->task && cpuctx->task_ctx != ctx) { | 1474 | if (ctx->task && !cpuctx->task_ctx) |
967 | if (cpuctx->task_ctx || ctx->task != current) | 1475 | perf_event_context_sched_in(ctx, ctx->task); |
968 | return; | ||
969 | cpuctx->task_ctx = ctx; | ||
970 | } | ||
971 | 1476 | ||
972 | raw_spin_lock(&ctx->lock); | 1477 | raw_spin_lock(&ctx->lock); |
973 | ctx->is_active = 1; | 1478 | ctx->is_active = 1; |
974 | update_context_time(ctx); | 1479 | update_context_time(ctx); |
1480 | /* | ||
1481 | * update cgrp time only if current cgrp | ||
1482 | * matches event->cgrp. Must be done before | ||
1483 | * calling add_event_to_ctx() | ||
1484 | */ | ||
1485 | update_cgrp_time_from_event(event); | ||
975 | 1486 | ||
976 | add_event_to_ctx(event, ctx); | 1487 | add_event_to_ctx(event, ctx); |
977 | 1488 | ||
@@ -1012,6 +1523,8 @@ static void __perf_install_in_context(void *info) | |||
1012 | 1523 | ||
1013 | unlock: | 1524 | unlock: |
1014 | raw_spin_unlock(&ctx->lock); | 1525 | raw_spin_unlock(&ctx->lock); |
1526 | |||
1527 | return 0; | ||
1015 | } | 1528 | } |
1016 | 1529 | ||
1017 | /* | 1530 | /* |
@@ -1023,8 +1536,6 @@ unlock: | |||
1023 | * If the event is attached to a task which is on a CPU we use a smp | 1536 | * If the event is attached to a task which is on a CPU we use a smp |
1024 | * call to enable it in the task context. The task might have been | 1537 | * call to enable it in the task context. The task might have been |
1025 | * scheduled away, but we check this in the smp call again. | 1538 | * scheduled away, but we check this in the smp call again. |
1026 | * | ||
1027 | * Must be called with ctx->mutex held. | ||
1028 | */ | 1539 | */ |
1029 | static void | 1540 | static void |
1030 | perf_install_in_context(struct perf_event_context *ctx, | 1541 | perf_install_in_context(struct perf_event_context *ctx, |
@@ -1033,6 +1544,8 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
1033 | { | 1544 | { |
1034 | struct task_struct *task = ctx->task; | 1545 | struct task_struct *task = ctx->task; |
1035 | 1546 | ||
1547 | lockdep_assert_held(&ctx->mutex); | ||
1548 | |||
1036 | event->ctx = ctx; | 1549 | event->ctx = ctx; |
1037 | 1550 | ||
1038 | if (!task) { | 1551 | if (!task) { |
@@ -1040,31 +1553,29 @@ perf_install_in_context(struct perf_event_context *ctx, | |||
1040 | * Per cpu events are installed via an smp call and | 1553 | * Per cpu events are installed via an smp call and |
1041 | * the install is always successful. | 1554 | * the install is always successful. |
1042 | */ | 1555 | */ |
1043 | smp_call_function_single(cpu, __perf_install_in_context, | 1556 | cpu_function_call(cpu, __perf_install_in_context, event); |
1044 | event, 1); | ||
1045 | return; | 1557 | return; |
1046 | } | 1558 | } |
1047 | 1559 | ||
1048 | retry: | 1560 | retry: |
1049 | task_oncpu_function_call(task, __perf_install_in_context, | 1561 | if (!task_function_call(task, __perf_install_in_context, event)) |
1050 | event); | 1562 | return; |
1051 | 1563 | ||
1052 | raw_spin_lock_irq(&ctx->lock); | 1564 | raw_spin_lock_irq(&ctx->lock); |
1053 | /* | 1565 | /* |
1054 | * we need to retry the smp call. | 1566 | * If we failed to find a running task, but find the context active now |
1567 | * that we've acquired the ctx->lock, retry. | ||
1055 | */ | 1568 | */ |
1056 | if (ctx->is_active && list_empty(&event->group_entry)) { | 1569 | if (ctx->is_active) { |
1057 | raw_spin_unlock_irq(&ctx->lock); | 1570 | raw_spin_unlock_irq(&ctx->lock); |
1058 | goto retry; | 1571 | goto retry; |
1059 | } | 1572 | } |
1060 | 1573 | ||
1061 | /* | 1574 | /* |
1062 | * The lock prevents that this context is scheduled in so we | 1575 | * Since the task isn't running, it's safe to add the event; our holding |
1063 | * can add the event safely, if it the call above did not | 1576 | * the ctx->lock ensures the task won't get scheduled in. |
1064 | * succeed. | ||
1065 | */ | 1577 | */ |
1066 | if (list_empty(&event->group_entry)) | 1578 | add_event_to_ctx(event, ctx); |
1067 | add_event_to_ctx(event, ctx); | ||
1068 | raw_spin_unlock_irq(&ctx->lock); | 1579 | raw_spin_unlock_irq(&ctx->lock); |
1069 | } | 1580 | } |
1070 | 1581 | ||
@@ -1093,7 +1604,7 @@ static void __perf_event_mark_enabled(struct perf_event *event, | |||
1093 | /* | 1604 | /* |
1094 | * Cross CPU call to enable a performance event | 1605 | * Cross CPU call to enable a performance event |
1095 | */ | 1606 | */ |
1096 | static void __perf_event_enable(void *info) | 1607 | static int __perf_event_enable(void *info) |
1097 | { | 1608 | { |
1098 | struct perf_event *event = info; | 1609 | struct perf_event *event = info; |
1099 | struct perf_event_context *ctx = event->ctx; | 1610 | struct perf_event_context *ctx = event->ctx; |
@@ -1101,26 +1612,27 @@ static void __perf_event_enable(void *info) | |||
1101 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); | 1612 | struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); |
1102 | int err; | 1613 | int err; |
1103 | 1614 | ||
1104 | /* | 1615 | if (WARN_ON_ONCE(!ctx->is_active)) |
1105 | * If this is a per-task event, need to check whether this | 1616 | return -EINVAL; |
1106 | * event's task is the current task on this cpu. | ||
1107 | */ | ||
1108 | if (ctx->task && cpuctx->task_ctx != ctx) { | ||
1109 | if (cpuctx->task_ctx || ctx->task != current) | ||
1110 | return; | ||
1111 | cpuctx->task_ctx = ctx; | ||
1112 | } | ||
1113 | 1617 | ||
1114 | raw_spin_lock(&ctx->lock); | 1618 | raw_spin_lock(&ctx->lock); |
1115 | ctx->is_active = 1; | ||
1116 | update_context_time(ctx); | 1619 | update_context_time(ctx); |
1117 | 1620 | ||
1118 | if (event->state >= PERF_EVENT_STATE_INACTIVE) | 1621 | if (event->state >= PERF_EVENT_STATE_INACTIVE) |
1119 | goto unlock; | 1622 | goto unlock; |
1623 | |||
1624 | /* | ||
1625 | * set current task's cgroup time reference point | ||
1626 | */ | ||
1627 | perf_cgroup_set_timestamp(current, ctx); | ||
1628 | |||
1120 | __perf_event_mark_enabled(event, ctx); | 1629 | __perf_event_mark_enabled(event, ctx); |
1121 | 1630 | ||
1122 | if (!event_filter_match(event)) | 1631 | if (!event_filter_match(event)) { |
1632 | if (is_cgroup_event(event)) | ||
1633 | perf_cgroup_defer_enabled(event); | ||
1123 | goto unlock; | 1634 | goto unlock; |
1635 | } | ||
1124 | 1636 | ||
1125 | /* | 1637 | /* |
1126 | * If the event is in a group and isn't the group leader, | 1638 | * If the event is in a group and isn't the group leader, |
@@ -1153,6 +1665,8 @@ static void __perf_event_enable(void *info) | |||
1153 | 1665 | ||
1154 | unlock: | 1666 | unlock: |
1155 | raw_spin_unlock(&ctx->lock); | 1667 | raw_spin_unlock(&ctx->lock); |
1668 | |||
1669 | return 0; | ||
1156 | } | 1670 | } |
1157 | 1671 | ||
1158 | /* | 1672 | /* |
@@ -1173,8 +1687,7 @@ void perf_event_enable(struct perf_event *event) | |||
1173 | /* | 1687 | /* |
1174 | * Enable the event on the cpu that it's on | 1688 | * Enable the event on the cpu that it's on |
1175 | */ | 1689 | */ |
1176 | smp_call_function_single(event->cpu, __perf_event_enable, | 1690 | cpu_function_call(event->cpu, __perf_event_enable, event); |
1177 | event, 1); | ||
1178 | return; | 1691 | return; |
1179 | } | 1692 | } |
1180 | 1693 | ||
@@ -1193,8 +1706,15 @@ void perf_event_enable(struct perf_event *event) | |||
1193 | event->state = PERF_EVENT_STATE_OFF; | 1706 | event->state = PERF_EVENT_STATE_OFF; |
1194 | 1707 | ||
1195 | retry: | 1708 | retry: |
1709 | if (!ctx->is_active) { | ||
1710 | __perf_event_mark_enabled(event, ctx); | ||
1711 | goto out; | ||
1712 | } | ||
1713 | |||
1196 | raw_spin_unlock_irq(&ctx->lock); | 1714 | raw_spin_unlock_irq(&ctx->lock); |
1197 | task_oncpu_function_call(task, __perf_event_enable, event); | 1715 | |
1716 | if (!task_function_call(task, __perf_event_enable, event)) | ||
1717 | return; | ||
1198 | 1718 | ||
1199 | raw_spin_lock_irq(&ctx->lock); | 1719 | raw_spin_lock_irq(&ctx->lock); |
1200 | 1720 | ||
@@ -1202,15 +1722,14 @@ retry: | |||
1202 | * If the context is active and the event is still off, | 1722 | * If the context is active and the event is still off, |
1203 | * we need to retry the cross-call. | 1723 | * we need to retry the cross-call. |
1204 | */ | 1724 | */ |
1205 | if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) | 1725 | if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { |
1726 | /* | ||
1727 | * task could have been flipped by a concurrent | ||
1728 | * perf_event_context_sched_out() | ||
1729 | */ | ||
1730 | task = ctx->task; | ||
1206 | goto retry; | 1731 | goto retry; |
1207 | 1732 | } | |
1208 | /* | ||
1209 | * Since we have the lock this context can't be scheduled | ||
1210 | * in, so we can change the state safely. | ||
1211 | */ | ||
1212 | if (event->state == PERF_EVENT_STATE_OFF) | ||
1213 | __perf_event_mark_enabled(event, ctx); | ||
1214 | 1733 | ||
1215 | out: | 1734 | out: |
1216 | raw_spin_unlock_irq(&ctx->lock); | 1735 | raw_spin_unlock_irq(&ctx->lock); |
@@ -1242,6 +1761,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
1242 | if (likely(!ctx->nr_events)) | 1761 | if (likely(!ctx->nr_events)) |
1243 | goto out; | 1762 | goto out; |
1244 | update_context_time(ctx); | 1763 | update_context_time(ctx); |
1764 | update_cgrp_time_from_cpuctx(cpuctx); | ||
1245 | 1765 | ||
1246 | if (!ctx->nr_active) | 1766 | if (!ctx->nr_active) |
1247 | goto out; | 1767 | goto out; |
@@ -1354,8 +1874,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, | |||
1354 | } | 1874 | } |
1355 | } | 1875 | } |
1356 | 1876 | ||
1357 | void perf_event_context_sched_out(struct task_struct *task, int ctxn, | 1877 | static void perf_event_context_sched_out(struct task_struct *task, int ctxn, |
1358 | struct task_struct *next) | 1878 | struct task_struct *next) |
1359 | { | 1879 | { |
1360 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; | 1880 | struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; |
1361 | struct perf_event_context *next_ctx; | 1881 | struct perf_event_context *next_ctx; |
@@ -1431,6 +1951,14 @@ void __perf_event_task_sched_out(struct task_struct *task, | |||
1431 | 1951 | ||
1432 | for_each_task_context_nr(ctxn) | 1952 | for_each_task_context_nr(ctxn) |
1433 | perf_event_context_sched_out(task, ctxn, next); | 1953 | perf_event_context_sched_out(task, ctxn, next); |
1954 | |||
1955 | /* | ||
1956 | * if cgroup events exist on this CPU, then we need | ||
1957 | * to check if we have to switch out PMU state. | ||
1958 | * cgroup events are system-wide mode only | ||
1959 | */ | ||
1960 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | ||
1961 | perf_cgroup_sched_out(task); | ||
1434 | } | 1962 | } |
1435 | 1963 | ||
1436 | static void task_ctx_sched_out(struct perf_event_context *ctx, | 1964 | static void task_ctx_sched_out(struct perf_event_context *ctx, |
@@ -1469,6 +1997,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, | |||
1469 | if (!event_filter_match(event)) | 1997 | if (!event_filter_match(event)) |
1470 | continue; | 1998 | continue; |
1471 | 1999 | ||
2000 | /* may need to reset tstamp_enabled */ | ||
2001 | if (is_cgroup_event(event)) | ||
2002 | perf_cgroup_mark_enabled(event, ctx); | ||
2003 | |||
1472 | if (group_can_go_on(event, cpuctx, 1)) | 2004 | if (group_can_go_on(event, cpuctx, 1)) |
1473 | group_sched_in(event, cpuctx, ctx); | 2005 | group_sched_in(event, cpuctx, ctx); |
1474 | 2006 | ||
@@ -1501,6 +2033,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1501 | if (!event_filter_match(event)) | 2033 | if (!event_filter_match(event)) |
1502 | continue; | 2034 | continue; |
1503 | 2035 | ||
2036 | /* may need to reset tstamp_enabled */ | ||
2037 | if (is_cgroup_event(event)) | ||
2038 | perf_cgroup_mark_enabled(event, ctx); | ||
2039 | |||
1504 | if (group_can_go_on(event, cpuctx, can_add_hw)) { | 2040 | if (group_can_go_on(event, cpuctx, can_add_hw)) { |
1505 | if (group_sched_in(event, cpuctx, ctx)) | 2041 | if (group_sched_in(event, cpuctx, ctx)) |
1506 | can_add_hw = 0; | 2042 | can_add_hw = 0; |
@@ -1511,15 +2047,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, | |||
1511 | static void | 2047 | static void |
1512 | ctx_sched_in(struct perf_event_context *ctx, | 2048 | ctx_sched_in(struct perf_event_context *ctx, |
1513 | struct perf_cpu_context *cpuctx, | 2049 | struct perf_cpu_context *cpuctx, |
1514 | enum event_type_t event_type) | 2050 | enum event_type_t event_type, |
2051 | struct task_struct *task) | ||
1515 | { | 2052 | { |
2053 | u64 now; | ||
2054 | |||
1516 | raw_spin_lock(&ctx->lock); | 2055 | raw_spin_lock(&ctx->lock); |
1517 | ctx->is_active = 1; | 2056 | ctx->is_active = 1; |
1518 | if (likely(!ctx->nr_events)) | 2057 | if (likely(!ctx->nr_events)) |
1519 | goto out; | 2058 | goto out; |
1520 | 2059 | ||
1521 | ctx->timestamp = perf_clock(); | 2060 | now = perf_clock(); |
1522 | 2061 | ctx->timestamp = now; | |
2062 | perf_cgroup_set_timestamp(task, ctx); | ||
1523 | /* | 2063 | /* |
1524 | * First go through the list and put on any pinned groups | 2064 | * First go through the list and put on any pinned groups |
1525 | * in order to give them the best chance of going on. | 2065 | * in order to give them the best chance of going on. |
@@ -1536,11 +2076,12 @@ out: | |||
1536 | } | 2076 | } |
1537 | 2077 | ||
1538 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | 2078 | static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, |
1539 | enum event_type_t event_type) | 2079 | enum event_type_t event_type, |
2080 | struct task_struct *task) | ||
1540 | { | 2081 | { |
1541 | struct perf_event_context *ctx = &cpuctx->ctx; | 2082 | struct perf_event_context *ctx = &cpuctx->ctx; |
1542 | 2083 | ||
1543 | ctx_sched_in(ctx, cpuctx, event_type); | 2084 | ctx_sched_in(ctx, cpuctx, event_type, task); |
1544 | } | 2085 | } |
1545 | 2086 | ||
1546 | static void task_ctx_sched_in(struct perf_event_context *ctx, | 2087 | static void task_ctx_sched_in(struct perf_event_context *ctx, |
@@ -1548,15 +2089,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx, | |||
1548 | { | 2089 | { |
1549 | struct perf_cpu_context *cpuctx; | 2090 | struct perf_cpu_context *cpuctx; |
1550 | 2091 | ||
1551 | cpuctx = __get_cpu_context(ctx); | 2092 | cpuctx = __get_cpu_context(ctx); |
1552 | if (cpuctx->task_ctx == ctx) | 2093 | if (cpuctx->task_ctx == ctx) |
1553 | return; | 2094 | return; |
1554 | 2095 | ||
1555 | ctx_sched_in(ctx, cpuctx, event_type); | 2096 | ctx_sched_in(ctx, cpuctx, event_type, NULL); |
1556 | cpuctx->task_ctx = ctx; | 2097 | cpuctx->task_ctx = ctx; |
1557 | } | 2098 | } |
1558 | 2099 | ||
1559 | void perf_event_context_sched_in(struct perf_event_context *ctx) | 2100 | static void perf_event_context_sched_in(struct perf_event_context *ctx, |
2101 | struct task_struct *task) | ||
1560 | { | 2102 | { |
1561 | struct perf_cpu_context *cpuctx; | 2103 | struct perf_cpu_context *cpuctx; |
1562 | 2104 | ||
@@ -1572,9 +2114,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx) | |||
1572 | */ | 2114 | */ |
1573 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 2115 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
1574 | 2116 | ||
1575 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED); | 2117 | ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); |
1576 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 2118 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); |
1577 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); | 2119 | ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); |
1578 | 2120 | ||
1579 | cpuctx->task_ctx = ctx; | 2121 | cpuctx->task_ctx = ctx; |
1580 | 2122 | ||
@@ -1607,8 +2149,15 @@ void __perf_event_task_sched_in(struct task_struct *task) | |||
1607 | if (likely(!ctx)) | 2149 | if (likely(!ctx)) |
1608 | continue; | 2150 | continue; |
1609 | 2151 | ||
1610 | perf_event_context_sched_in(ctx); | 2152 | perf_event_context_sched_in(ctx, task); |
1611 | } | 2153 | } |
2154 | /* | ||
2155 | * if cgroup events exist on this CPU, then we need | ||
2156 | * to check if we have to switch in PMU state. | ||
2157 | * cgroup events are system-wide mode only | ||
2158 | */ | ||
2159 | if (atomic_read(&__get_cpu_var(perf_cgroup_events))) | ||
2160 | perf_cgroup_sched_in(task); | ||
1612 | } | 2161 | } |
1613 | 2162 | ||
1614 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | 2163 | static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) |
@@ -1638,7 +2187,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) | |||
1638 | * Reduce accuracy by one bit such that @a and @b converge | 2187 | * Reduce accuracy by one bit such that @a and @b converge |
1639 | * to a similar magnitude. | 2188 | * to a similar magnitude. |
1640 | */ | 2189 | */ |
1641 | #define REDUCE_FLS(a, b) \ | 2190 | #define REDUCE_FLS(a, b) \ |
1642 | do { \ | 2191 | do { \ |
1643 | if (a##_fls > b##_fls) { \ | 2192 | if (a##_fls > b##_fls) { \ |
1644 | a >>= 1; \ | 2193 | a >>= 1; \ |
@@ -1808,7 +2357,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
1808 | if (ctx) | 2357 | if (ctx) |
1809 | rotate_ctx(ctx); | 2358 | rotate_ctx(ctx); |
1810 | 2359 | ||
1811 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); | 2360 | cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current); |
1812 | if (ctx) | 2361 | if (ctx) |
1813 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); | 2362 | task_ctx_sched_in(ctx, EVENT_FLEXIBLE); |
1814 | 2363 | ||
@@ -1887,7 +2436,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) | |||
1887 | 2436 | ||
1888 | raw_spin_unlock(&ctx->lock); | 2437 | raw_spin_unlock(&ctx->lock); |
1889 | 2438 | ||
1890 | perf_event_context_sched_in(ctx); | 2439 | perf_event_context_sched_in(ctx, ctx->task); |
1891 | out: | 2440 | out: |
1892 | local_irq_restore(flags); | 2441 | local_irq_restore(flags); |
1893 | } | 2442 | } |
@@ -1912,8 +2461,10 @@ static void __perf_event_read(void *info) | |||
1912 | return; | 2461 | return; |
1913 | 2462 | ||
1914 | raw_spin_lock(&ctx->lock); | 2463 | raw_spin_lock(&ctx->lock); |
1915 | if (ctx->is_active) | 2464 | if (ctx->is_active) { |
1916 | update_context_time(ctx); | 2465 | update_context_time(ctx); |
2466 | update_cgrp_time_from_event(event); | ||
2467 | } | ||
1917 | update_event_times(event); | 2468 | update_event_times(event); |
1918 | if (event->state == PERF_EVENT_STATE_ACTIVE) | 2469 | if (event->state == PERF_EVENT_STATE_ACTIVE) |
1919 | event->pmu->read(event); | 2470 | event->pmu->read(event); |
@@ -1944,8 +2495,10 @@ static u64 perf_event_read(struct perf_event *event) | |||
1944 | * (e.g., thread is blocked), in that case | 2495 | * (e.g., thread is blocked), in that case |
1945 | * we cannot update context time | 2496 | * we cannot update context time |
1946 | */ | 2497 | */ |
1947 | if (ctx->is_active) | 2498 | if (ctx->is_active) { |
1948 | update_context_time(ctx); | 2499 | update_context_time(ctx); |
2500 | update_cgrp_time_from_event(event); | ||
2501 | } | ||
1949 | update_event_times(event); | 2502 | update_event_times(event); |
1950 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2503 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
1951 | } | 2504 | } |
@@ -2224,6 +2777,9 @@ errout: | |||
2224 | 2777 | ||
2225 | } | 2778 | } |
2226 | 2779 | ||
2780 | /* | ||
2781 | * Returns a matching context with refcount and pincount. | ||
2782 | */ | ||
2227 | static struct perf_event_context * | 2783 | static struct perf_event_context * |
2228 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | 2784 | find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) |
2229 | { | 2785 | { |
@@ -2248,6 +2804,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) | |||
2248 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | 2804 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); |
2249 | ctx = &cpuctx->ctx; | 2805 | ctx = &cpuctx->ctx; |
2250 | get_ctx(ctx); | 2806 | get_ctx(ctx); |
2807 | ++ctx->pin_count; | ||
2251 | 2808 | ||
2252 | return ctx; | 2809 | return ctx; |
2253 | } | 2810 | } |
@@ -2261,6 +2818,7 @@ retry: | |||
2261 | ctx = perf_lock_task_context(task, ctxn, &flags); | 2818 | ctx = perf_lock_task_context(task, ctxn, &flags); |
2262 | if (ctx) { | 2819 | if (ctx) { |
2263 | unclone_ctx(ctx); | 2820 | unclone_ctx(ctx); |
2821 | ++ctx->pin_count; | ||
2264 | raw_spin_unlock_irqrestore(&ctx->lock, flags); | 2822 | raw_spin_unlock_irqrestore(&ctx->lock, flags); |
2265 | } | 2823 | } |
2266 | 2824 | ||
@@ -2282,8 +2840,10 @@ retry: | |||
2282 | err = -ESRCH; | 2840 | err = -ESRCH; |
2283 | else if (task->perf_event_ctxp[ctxn]) | 2841 | else if (task->perf_event_ctxp[ctxn]) |
2284 | err = -EAGAIN; | 2842 | err = -EAGAIN; |
2285 | else | 2843 | else { |
2844 | ++ctx->pin_count; | ||
2286 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); | 2845 | rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); |
2846 | } | ||
2287 | mutex_unlock(&task->perf_event_mutex); | 2847 | mutex_unlock(&task->perf_event_mutex); |
2288 | 2848 | ||
2289 | if (unlikely(err)) { | 2849 | if (unlikely(err)) { |
@@ -2323,7 +2883,7 @@ static void free_event(struct perf_event *event) | |||
2323 | 2883 | ||
2324 | if (!event->parent) { | 2884 | if (!event->parent) { |
2325 | if (event->attach_state & PERF_ATTACH_TASK) | 2885 | if (event->attach_state & PERF_ATTACH_TASK) |
2326 | jump_label_dec(&perf_task_events); | 2886 | jump_label_dec(&perf_sched_events); |
2327 | if (event->attr.mmap || event->attr.mmap_data) | 2887 | if (event->attr.mmap || event->attr.mmap_data) |
2328 | atomic_dec(&nr_mmap_events); | 2888 | atomic_dec(&nr_mmap_events); |
2329 | if (event->attr.comm) | 2889 | if (event->attr.comm) |
@@ -2332,6 +2892,10 @@ static void free_event(struct perf_event *event) | |||
2332 | atomic_dec(&nr_task_events); | 2892 | atomic_dec(&nr_task_events); |
2333 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) | 2893 | if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) |
2334 | put_callchain_buffers(); | 2894 | put_callchain_buffers(); |
2895 | if (is_cgroup_event(event)) { | ||
2896 | atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); | ||
2897 | jump_label_dec(&perf_sched_events); | ||
2898 | } | ||
2335 | } | 2899 | } |
2336 | 2900 | ||
2337 | if (event->buffer) { | 2901 | if (event->buffer) { |
@@ -2339,6 +2903,9 @@ static void free_event(struct perf_event *event) | |||
2339 | event->buffer = NULL; | 2903 | event->buffer = NULL; |
2340 | } | 2904 | } |
2341 | 2905 | ||
2906 | if (is_cgroup_event(event)) | ||
2907 | perf_detach_cgroup(event); | ||
2908 | |||
2342 | if (event->destroy) | 2909 | if (event->destroy) |
2343 | event->destroy(event); | 2910 | event->destroy(event); |
2344 | 2911 | ||
@@ -4406,26 +4973,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, | |||
4406 | if (unlikely(!is_sampling_event(event))) | 4973 | if (unlikely(!is_sampling_event(event))) |
4407 | return 0; | 4974 | return 0; |
4408 | 4975 | ||
4409 | if (!throttle) { | 4976 | if (unlikely(hwc->interrupts >= max_samples_per_tick)) { |
4410 | hwc->interrupts++; | 4977 | if (throttle) { |
4411 | } else { | 4978 | hwc->interrupts = MAX_INTERRUPTS; |
4412 | if (hwc->interrupts != MAX_INTERRUPTS) { | 4979 | perf_log_throttle(event, 0); |
4413 | hwc->interrupts++; | ||
4414 | if (HZ * hwc->interrupts > | ||
4415 | (u64)sysctl_perf_event_sample_rate) { | ||
4416 | hwc->interrupts = MAX_INTERRUPTS; | ||
4417 | perf_log_throttle(event, 0); | ||
4418 | ret = 1; | ||
4419 | } | ||
4420 | } else { | ||
4421 | /* | ||
4422 | * Keep re-disabling events even though on the previous | ||
4423 | * pass we disabled it - just in case we raced with a | ||
4424 | * sched-in and the event got enabled again: | ||
4425 | */ | ||
4426 | ret = 1; | 4980 | ret = 1; |
4427 | } | 4981 | } |
4428 | } | 4982 | } else |
4983 | hwc->interrupts++; | ||
4429 | 4984 | ||
4430 | if (event->attr.freq) { | 4985 | if (event->attr.freq) { |
4431 | u64 now = perf_clock(); | 4986 | u64 now = perf_clock(); |
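The rewritten throttle check above replaces the per-event "HZ * hwc->interrupts > sysctl_perf_event_sample_rate" arithmetic with one comparison against max_samples_per_tick. A minimal userspace sketch of the new shape of the logic (field and limit names are copied from the hunk; the harness around them is hypothetical):

	#include <stdio.h>
	#include <stdbool.h>

	#define MAX_INTERRUPTS (~0)	/* sentinel value, as in the kernel's hw_perf_event */

	static unsigned int max_samples_per_tick = 5;	/* stand-in for the sysctl-derived limit */

	struct toy_hwc { unsigned int interrupts; };

	/* Mirrors the simplified branch in __perf_event_overflow(): once the
	 * per-tick budget is used up, mark the event throttled and report it. */
	static bool overflow_throttle(struct toy_hwc *hwc, bool throttle)
	{
		bool throttled = false;

		if (hwc->interrupts >= max_samples_per_tick) {
			if (throttle) {
				hwc->interrupts = MAX_INTERRUPTS;
				throttled = true;	/* perf_log_throttle(event, 0); ret = 1 */
			}
		} else {
			hwc->interrupts++;
		}
		return throttled;
	}

	int main(void)
	{
		struct toy_hwc hwc = { 0 };

		for (int i = 0; i < 8; i++)
			printf("overflow %d -> throttled=%d interrupts=%u\n",
			       i, overflow_throttle(&hwc, true), hwc.interrupts);
		return 0;
	}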
@@ -5062,6 +5617,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) | |||
5062 | u64 period; | 5617 | u64 period; |
5063 | 5618 | ||
5064 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); | 5619 | event = container_of(hrtimer, struct perf_event, hw.hrtimer); |
5620 | |||
5621 | if (event->state != PERF_EVENT_STATE_ACTIVE) | ||
5622 | return HRTIMER_NORESTART; | ||
5623 | |||
5065 | event->pmu->read(event); | 5624 | event->pmu->read(event); |
5066 | 5625 | ||
5067 | perf_sample_data_init(&data, 0); | 5626 | perf_sample_data_init(&data, 0); |
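The early PERF_EVENT_STATE_ACTIVE check makes the hrtimer callback stop rearming itself as soon as the software event has been disabled, rather than reading a dead event. A rough userspace analogue of that "bail out instead of rearming" pattern, built on timerfd (hypothetical demo, not the kernel path):

	#include <stdio.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/timerfd.h>

	static int event_active = 1;

	int main(void)
	{
		int tfd = timerfd_create(CLOCK_MONOTONIC, 0);
		struct itimerspec its = {
			.it_interval = { .tv_nsec = 100 * 1000 * 1000 },	/* 100ms period */
			.it_value    = { .tv_nsec = 100 * 1000 * 1000 },
		};
		uint64_t expirations;
		int ticks = 0;

		timerfd_settime(tfd, 0, &its, NULL);

		while (read(tfd, &expirations, sizeof(expirations)) == sizeof(expirations)) {
			if (!event_active)
				break;		/* analogue of returning HRTIMER_NORESTART */

			printf("sample %d\n", ++ticks);	/* analogue of pmu->read() + overflow */

			if (ticks == 5)
				event_active = 0;	/* someone disabled the event */
		}

		close(tfd);
		return 0;
	}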
@@ -5088,9 +5647,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) | |||
5088 | if (!is_sampling_event(event)) | 5647 | if (!is_sampling_event(event)) |
5089 | return; | 5648 | return; |
5090 | 5649 | ||
5091 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
5092 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
5093 | |||
5094 | period = local64_read(&hwc->period_left); | 5650 | period = local64_read(&hwc->period_left); |
5095 | if (period) { | 5651 | if (period) { |
5096 | if (period < 0) | 5652 | if (period < 0) |
@@ -5117,6 +5673,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) | |||
5117 | } | 5673 | } |
5118 | } | 5674 | } |
5119 | 5675 | ||
5676 | static void perf_swevent_init_hrtimer(struct perf_event *event) | ||
5677 | { | ||
5678 | struct hw_perf_event *hwc = &event->hw; | ||
5679 | |||
5680 | if (!is_sampling_event(event)) | ||
5681 | return; | ||
5682 | |||
5683 | hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
5684 | hwc->hrtimer.function = perf_swevent_hrtimer; | ||
5685 | |||
5686 | /* | ||
5687 | * Since hrtimers have a fixed rate, we can do a static freq->period | ||
5688 | * mapping and avoid the whole period adjust feedback stuff. | ||
5689 | */ | ||
5690 | if (event->attr.freq) { | ||
5691 | long freq = event->attr.sample_freq; | ||
5692 | |||
5693 | event->attr.sample_period = NSEC_PER_SEC / freq; | ||
5694 | hwc->sample_period = event->attr.sample_period; | ||
5695 | local64_set(&hwc->period_left, hwc->sample_period); | ||
5696 | event->attr.freq = 0; | ||
5697 | } | ||
5698 | } | ||
5699 | |||
5120 | /* | 5700 | /* |
5121 | * Software event: cpu wall time clock | 5701 | * Software event: cpu wall time clock |
5122 | */ | 5702 | */ |
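perf_swevent_init_hrtimer() also converts a frequency request into a fixed period up front: because the hrtimer fires at a constant rate, sample_period = NSEC_PER_SEC / sample_freq can be computed once, avoiding the adaptive period-adjustment feedback. A small worked example of that mapping (the frequency values are illustrative):

	#include <stdio.h>
	#include <stdint.h>

	#define NSEC_PER_SEC 1000000000ULL

	int main(void)
	{
		/* Illustrative sample_freq values a user might request. */
		long freqs[] = { 1, 100, 1000, 4000 };

		for (unsigned int i = 0; i < sizeof(freqs) / sizeof(freqs[0]); i++) {
			/* Static freq -> period mapping done once at init time. */
			uint64_t period = NSEC_PER_SEC / freqs[i];
			printf("sample_freq=%ld Hz -> sample_period=%llu ns\n",
			       freqs[i], (unsigned long long)period);
		}
		return 0;
	}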
@@ -5169,6 +5749,8 @@ static int cpu_clock_event_init(struct perf_event *event) | |||
5169 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) | 5749 | if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) |
5170 | return -ENOENT; | 5750 | return -ENOENT; |
5171 | 5751 | ||
5752 | perf_swevent_init_hrtimer(event); | ||
5753 | |||
5172 | return 0; | 5754 | return 0; |
5173 | } | 5755 | } |
5174 | 5756 | ||
@@ -5224,16 +5806,9 @@ static void task_clock_event_del(struct perf_event *event, int flags) | |||
5224 | 5806 | ||
5225 | static void task_clock_event_read(struct perf_event *event) | 5807 | static void task_clock_event_read(struct perf_event *event) |
5226 | { | 5808 | { |
5227 | u64 time; | 5809 | u64 now = perf_clock(); |
5228 | 5810 | u64 delta = now - event->ctx->timestamp; | |
5229 | if (!in_nmi()) { | 5811 | u64 time = event->ctx->time + delta; |
5230 | update_context_time(event->ctx); | ||
5231 | time = event->ctx->time; | ||
5232 | } else { | ||
5233 | u64 now = perf_clock(); | ||
5234 | u64 delta = now - event->ctx->timestamp; | ||
5235 | time = event->ctx->time + delta; | ||
5236 | } | ||
5237 | 5812 | ||
5238 | task_clock_event_update(event, time); | 5813 | task_clock_event_update(event, time); |
5239 | } | 5814 | } |
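task_clock_event_read() now always extrapolates the context time itself instead of branching on in_nmi(): the reported value is the last snapshot ctx->time plus however much wall clock has elapsed since ctx->timestamp was taken. A small userspace analogue of that extrapolation using CLOCK_MONOTONIC (variable names are hypothetical):

	#include <stdio.h>
	#include <stdint.h>
	#include <time.h>
	#include <unistd.h>

	static uint64_t now_ns(void)
	{
		struct timespec ts;
		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
	}

	int main(void)
	{
		/* Snapshot state from the last time the "context" was updated. */
		uint64_t ctx_time      = 123456789;	/* accumulated context time, ns */
		uint64_t ctx_timestamp = now_ns();	/* when that accumulation happened */

		usleep(50 * 1000);	/* time passes without another update */

		/* Same shape as the new task_clock_event_read():
		 * time = ctx->time + (perf_clock() - ctx->timestamp) */
		uint64_t delta = now_ns() - ctx_timestamp;
		uint64_t time  = ctx_time + delta;

		printf("extrapolated task clock: %llu ns (delta %llu ns)\n",
		       (unsigned long long)time, (unsigned long long)delta);
		return 0;
	}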
@@ -5246,6 +5821,8 @@ static int task_clock_event_init(struct perf_event *event) | |||
5246 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) | 5821 | if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) |
5247 | return -ENOENT; | 5822 | return -ENOENT; |
5248 | 5823 | ||
5824 | perf_swevent_init_hrtimer(event); | ||
5825 | |||
5249 | return 0; | 5826 | return 0; |
5250 | } | 5827 | } |
5251 | 5828 | ||
@@ -5517,17 +6094,22 @@ struct pmu *perf_init_event(struct perf_event *event) | |||
5517 | { | 6094 | { |
5518 | struct pmu *pmu = NULL; | 6095 | struct pmu *pmu = NULL; |
5519 | int idx; | 6096 | int idx; |
6097 | int ret; | ||
5520 | 6098 | ||
5521 | idx = srcu_read_lock(&pmus_srcu); | 6099 | idx = srcu_read_lock(&pmus_srcu); |
5522 | 6100 | ||
5523 | rcu_read_lock(); | 6101 | rcu_read_lock(); |
5524 | pmu = idr_find(&pmu_idr, event->attr.type); | 6102 | pmu = idr_find(&pmu_idr, event->attr.type); |
5525 | rcu_read_unlock(); | 6103 | rcu_read_unlock(); |
5526 | if (pmu) | 6104 | if (pmu) { |
6105 | ret = pmu->event_init(event); | ||
6106 | if (ret) | ||
6107 | pmu = ERR_PTR(ret); | ||
5527 | goto unlock; | 6108 | goto unlock; |
6109 | } | ||
5528 | 6110 | ||
5529 | list_for_each_entry_rcu(pmu, &pmus, entry) { | 6111 | list_for_each_entry_rcu(pmu, &pmus, entry) { |
5530 | int ret = pmu->event_init(event); | 6112 | ret = pmu->event_init(event); |
5531 | if (!ret) | 6113 | if (!ret) |
5532 | goto unlock; | 6114 | goto unlock; |
5533 | 6115 | ||
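With this change, a PMU found by exact attr.type in the IDR gets its event_init() error propagated directly (wrapped in ERR_PTR()), and only events with no registered type fall back to probing every PMU on the list. A compact userspace sketch of that lookup order (toy PMU types and return codes, not the kernel's):

	#include <stdio.h>
	#include <stddef.h>
	#include <errno.h>

	struct toy_pmu {
		int type;			/* registered type, or -1 for "scan only" */
		int (*event_init)(int type, int config);
	};

	static int cpu_init(int type, int config)
	{
		if (type != 0)
			return -ENOENT;		/* not my event type */
		return config < 10 ? 0 : -EINVAL;
	}

	static int sw_init(int type, int config)
	{
		return type == 1 ? 0 : -ENOENT;
	}

	static struct toy_pmu pmus[] = {
		{ .type = 0,  .event_init = cpu_init },	/* exact-type match, like the IDR hit */
		{ .type = -1, .event_init = sw_init },	/* fallback-only PMU */
	};

	/* Mirrors the new perf_init_event() flow: an exact match wins and its
	 * error is final; otherwise every PMU on the list gets a chance. */
	static int init_event(int type, int config)
	{
		for (size_t i = 0; i < sizeof(pmus) / sizeof(pmus[0]); i++)
			if (pmus[i].type == type)
				return pmus[i].event_init(type, config);

		for (size_t i = 0; i < sizeof(pmus) / sizeof(pmus[0]); i++) {
			int ret = pmus[i].event_init(type, config);
			if (ret != -ENOENT)	/* -ENOENT means "not mine, keep looking" */
				return ret;
		}
		return -ENOENT;
	}

	int main(void)
	{
		printf("type 0, config 5  -> %d\n", init_event(0, 5));	/* 0 */
		printf("type 0, config 99 -> %d\n", init_event(0, 99));	/* -EINVAL: exact-match error is final */
		printf("type 1, config 0  -> %d\n", init_event(1, 0));	/* 0: found by scanning the list */
		printf("type 7, config 0  -> %d\n", init_event(7, 0));	/* -ENOENT: nobody claims it */
		return 0;
	}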
@@ -5653,7 +6235,7 @@ done: | |||
5653 | 6235 | ||
5654 | if (!event->parent) { | 6236 | if (!event->parent) { |
5655 | if (event->attach_state & PERF_ATTACH_TASK) | 6237 | if (event->attach_state & PERF_ATTACH_TASK) |
5656 | jump_label_inc(&perf_task_events); | 6238 | jump_label_inc(&perf_sched_events); |
5657 | if (event->attr.mmap || event->attr.mmap_data) | 6239 | if (event->attr.mmap || event->attr.mmap_data) |
5658 | atomic_inc(&nr_mmap_events); | 6240 | atomic_inc(&nr_mmap_events); |
5659 | if (event->attr.comm) | 6241 | if (event->attr.comm) |
@@ -5828,7 +6410,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5828 | int err; | 6410 | int err; |
5829 | 6411 | ||
5830 | /* for future expandability... */ | 6412 | /* for future expandability... */ |
5831 | if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) | 6413 | if (flags & ~PERF_FLAG_ALL) |
5832 | return -EINVAL; | 6414 | return -EINVAL; |
5833 | 6415 | ||
5834 | err = perf_copy_attr(attr_uptr, &attr); | 6416 | err = perf_copy_attr(attr_uptr, &attr); |
@@ -5845,6 +6427,15 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5845 | return -EINVAL; | 6427 | return -EINVAL; |
5846 | } | 6428 | } |
5847 | 6429 | ||
6430 | /* | ||
6431 | * In cgroup mode, the pid argument is used to pass the fd | ||
6432 | * opened to the cgroup directory in cgroupfs. The cpu argument | ||
6433 | * designates the cpu on which to monitor threads from that | ||
6434 | * cgroup. | ||
6435 | */ | ||
6436 | if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) | ||
6437 | return -EINVAL; | ||
6438 | |||
5848 | event_fd = get_unused_fd_flags(O_RDWR); | 6439 | event_fd = get_unused_fd_flags(O_RDWR); |
5849 | if (event_fd < 0) | 6440 | if (event_fd < 0) |
5850 | return event_fd; | 6441 | return event_fd; |
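Per the comment above, a per-cgroup event is requested from userspace by passing an open file descriptor for the cgroup's directory as the pid argument, together with a concrete cpu and PERF_FLAG_PID_CGROUP. A hedged usage sketch (the cgroup path and the cycle-counting attr are illustrative; it assumes a kernel with this patch, CONFIG_CGROUP_PERF, headers that define PERF_FLAG_PID_CGROUP, and a mounted perf_event cgroup hierarchy):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/types.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	static int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu,
				   int group_fd, unsigned long flags)
	{
		return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
	}

	int main(void)
	{
		struct perf_event_attr attr;
		uint64_t count;
		int cgrp_fd, fd;

		/* The fd of the cgroup directory doubles as "pid" in cgroup mode. */
		cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
		if (cgrp_fd < 0) {
			perror("open cgroup dir");
			return 1;
		}

		memset(&attr, 0, sizeof(attr));
		attr.type = PERF_TYPE_HARDWARE;
		attr.size = sizeof(attr);
		attr.config = PERF_COUNT_HW_CPU_CYCLES;

		/* Count cycles for tasks of "mygroup" while they run on CPU 0. */
		fd = perf_event_open(&attr, cgrp_fd, 0, -1, PERF_FLAG_PID_CGROUP);
		if (fd < 0) {
			perror("perf_event_open");
			return 1;
		}

		sleep(1);
		if (read(fd, &count, sizeof(count)) == sizeof(count))
			printf("cycles on cpu0 for mygroup: %llu\n",
			       (unsigned long long)count);

		close(fd);
		close(cgrp_fd);
		return 0;
	}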
@@ -5862,7 +6453,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5862 | group_leader = NULL; | 6453 | group_leader = NULL; |
5863 | } | 6454 | } |
5864 | 6455 | ||
5865 | if (pid != -1) { | 6456 | if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { |
5866 | task = find_lively_task_by_vpid(pid); | 6457 | task = find_lively_task_by_vpid(pid); |
5867 | if (IS_ERR(task)) { | 6458 | if (IS_ERR(task)) { |
5868 | err = PTR_ERR(task); | 6459 | err = PTR_ERR(task); |
@@ -5876,6 +6467,19 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5876 | goto err_task; | 6467 | goto err_task; |
5877 | } | 6468 | } |
5878 | 6469 | ||
6470 | if (flags & PERF_FLAG_PID_CGROUP) { | ||
6471 | err = perf_cgroup_connect(pid, event, &attr, group_leader); | ||
6472 | if (err) | ||
6473 | goto err_alloc; | ||
6474 | /* | ||
6475 | * one more event: | ||
6476 | * - that has cgroup constraint on event->cpu | ||
6477 | * - that may need work on context switch | ||
6478 | */ | ||
6479 | atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); | ||
6480 | jump_label_inc(&perf_sched_events); | ||
6481 | } | ||
6482 | |||
5879 | /* | 6483 | /* |
5880 | * Special case software events and allow them to be part of | 6484 | * Special case software events and allow them to be part of |
5881 | * any hardware group. | 6485 | * any hardware group. |
@@ -5961,10 +6565,10 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5961 | struct perf_event_context *gctx = group_leader->ctx; | 6565 | struct perf_event_context *gctx = group_leader->ctx; |
5962 | 6566 | ||
5963 | mutex_lock(&gctx->mutex); | 6567 | mutex_lock(&gctx->mutex); |
5964 | perf_event_remove_from_context(group_leader); | 6568 | perf_remove_from_context(group_leader); |
5965 | list_for_each_entry(sibling, &group_leader->sibling_list, | 6569 | list_for_each_entry(sibling, &group_leader->sibling_list, |
5966 | group_entry) { | 6570 | group_entry) { |
5967 | perf_event_remove_from_context(sibling); | 6571 | perf_remove_from_context(sibling); |
5968 | put_ctx(gctx); | 6572 | put_ctx(gctx); |
5969 | } | 6573 | } |
5970 | mutex_unlock(&gctx->mutex); | 6574 | mutex_unlock(&gctx->mutex); |
@@ -5987,6 +6591,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
5987 | 6591 | ||
5988 | perf_install_in_context(ctx, event, cpu); | 6592 | perf_install_in_context(ctx, event, cpu); |
5989 | ++ctx->generation; | 6593 | ++ctx->generation; |
6594 | perf_unpin_context(ctx); | ||
5990 | mutex_unlock(&ctx->mutex); | 6595 | mutex_unlock(&ctx->mutex); |
5991 | 6596 | ||
5992 | event->owner = current; | 6597 | event->owner = current; |
@@ -6012,6 +6617,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
6012 | return event_fd; | 6617 | return event_fd; |
6013 | 6618 | ||
6014 | err_context: | 6619 | err_context: |
6620 | perf_unpin_context(ctx); | ||
6015 | put_ctx(ctx); | 6621 | put_ctx(ctx); |
6016 | err_alloc: | 6622 | err_alloc: |
6017 | free_event(event); | 6623 | free_event(event); |
@@ -6062,6 +6668,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, | |||
6062 | mutex_lock(&ctx->mutex); | 6668 | mutex_lock(&ctx->mutex); |
6063 | perf_install_in_context(ctx, event, cpu); | 6669 | perf_install_in_context(ctx, event, cpu); |
6064 | ++ctx->generation; | 6670 | ++ctx->generation; |
6671 | perf_unpin_context(ctx); | ||
6065 | mutex_unlock(&ctx->mutex); | 6672 | mutex_unlock(&ctx->mutex); |
6066 | 6673 | ||
6067 | return event; | 6674 | return event; |
@@ -6115,7 +6722,7 @@ __perf_event_exit_task(struct perf_event *child_event, | |||
6115 | { | 6722 | { |
6116 | struct perf_event *parent_event; | 6723 | struct perf_event *parent_event; |
6117 | 6724 | ||
6118 | perf_event_remove_from_context(child_event); | 6725 | perf_remove_from_context(child_event); |
6119 | 6726 | ||
6120 | parent_event = child_event->parent; | 6727 | parent_event = child_event->parent; |
6121 | /* | 6728 | /* |
@@ -6422,7 +7029,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, | |||
6422 | return 0; | 7029 | return 0; |
6423 | } | 7030 | } |
6424 | 7031 | ||
6425 | child_ctx = child->perf_event_ctxp[ctxn]; | 7032 | child_ctx = child->perf_event_ctxp[ctxn]; |
6426 | if (!child_ctx) { | 7033 | if (!child_ctx) { |
6427 | /* | 7034 | /* |
6428 | * This is executed from the parent task context, so | 7035 | * This is executed from the parent task context, so |
@@ -6537,6 +7144,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) | |||
6537 | mutex_unlock(&parent_ctx->mutex); | 7144 | mutex_unlock(&parent_ctx->mutex); |
6538 | 7145 | ||
6539 | perf_unpin_context(parent_ctx); | 7146 | perf_unpin_context(parent_ctx); |
7147 | put_ctx(parent_ctx); | ||
6540 | 7148 | ||
6541 | return ret; | 7149 | return ret; |
6542 | } | 7150 | } |
@@ -6606,9 +7214,9 @@ static void __perf_event_exit_context(void *__info) | |||
6606 | perf_pmu_rotate_stop(ctx->pmu); | 7214 | perf_pmu_rotate_stop(ctx->pmu); |
6607 | 7215 | ||
6608 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) | 7216 | list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) |
6609 | __perf_event_remove_from_context(event); | 7217 | __perf_remove_from_context(event); |
6610 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) | 7218 | list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) |
6611 | __perf_event_remove_from_context(event); | 7219 | __perf_remove_from_context(event); |
6612 | } | 7220 | } |
6613 | 7221 | ||
6614 | static void perf_event_exit_cpu_context(int cpu) | 7222 | static void perf_event_exit_cpu_context(int cpu) |
@@ -6732,3 +7340,83 @@ unlock: | |||
6732 | return ret; | 7340 | return ret; |
6733 | } | 7341 | } |
6734 | device_initcall(perf_event_sysfs_init); | 7342 | device_initcall(perf_event_sysfs_init); |
7343 | |||
7344 | #ifdef CONFIG_CGROUP_PERF | ||
7345 | static struct cgroup_subsys_state *perf_cgroup_create( | ||
7346 | struct cgroup_subsys *ss, struct cgroup *cont) | ||
7347 | { | ||
7348 | struct perf_cgroup *jc; | ||
7349 | |||
7350 | jc = kzalloc(sizeof(*jc), GFP_KERNEL); | ||
7351 | if (!jc) | ||
7352 | return ERR_PTR(-ENOMEM); | ||
7353 | |||
7354 | jc->info = alloc_percpu(struct perf_cgroup_info); | ||
7355 | if (!jc->info) { | ||
7356 | kfree(jc); | ||
7357 | return ERR_PTR(-ENOMEM); | ||
7358 | } | ||
7359 | |||
7360 | return &jc->css; | ||
7361 | } | ||
7362 | |||
7363 | static void perf_cgroup_destroy(struct cgroup_subsys *ss, | ||
7364 | struct cgroup *cont) | ||
7365 | { | ||
7366 | struct perf_cgroup *jc; | ||
7367 | jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), | ||
7368 | struct perf_cgroup, css); | ||
7369 | free_percpu(jc->info); | ||
7370 | kfree(jc); | ||
7371 | } | ||
7372 | |||
7373 | static int __perf_cgroup_move(void *info) | ||
7374 | { | ||
7375 | struct task_struct *task = info; | ||
7376 | perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); | ||
7377 | return 0; | ||
7378 | } | ||
7379 | |||
7380 | static void perf_cgroup_move(struct task_struct *task) | ||
7381 | { | ||
7382 | task_function_call(task, __perf_cgroup_move, task); | ||
7383 | } | ||
7384 | |||
7385 | static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
7386 | struct cgroup *old_cgrp, struct task_struct *task, | ||
7387 | bool threadgroup) | ||
7388 | { | ||
7389 | perf_cgroup_move(task); | ||
7390 | if (threadgroup) { | ||
7391 | struct task_struct *c; | ||
7392 | rcu_read_lock(); | ||
7393 | list_for_each_entry_rcu(c, &task->thread_group, thread_group) { | ||
7394 | perf_cgroup_move(c); | ||
7395 | } | ||
7396 | rcu_read_unlock(); | ||
7397 | } | ||
7398 | } | ||
7399 | |||
7400 | static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
7401 | struct cgroup *old_cgrp, struct task_struct *task) | ||
7402 | { | ||
7403 | /* | ||
7404 | * cgroup_exit() is called in the copy_process() failure path. | ||
7405 | * Ignore this case since the task hasn't run yet; this avoids | ||
7406 | * trying to poke at a half-freed task state from generic code. | ||
7407 | */ | ||
7408 | if (!(task->flags & PF_EXITING)) | ||
7409 | return; | ||
7410 | |||
7411 | perf_cgroup_move(task); | ||
7412 | } | ||
7413 | |||
7414 | struct cgroup_subsys perf_subsys = { | ||
7415 | .name = "perf_event", | ||
7416 | .subsys_id = perf_subsys_id, | ||
7417 | .create = perf_cgroup_create, | ||
7418 | .destroy = perf_cgroup_destroy, | ||
7419 | .exit = perf_cgroup_exit, | ||
7420 | .attach = perf_cgroup_attach, | ||
7421 | }; | ||
7422 | #endif /* CONFIG_CGROUP_PERF */ | ||