author		Stephane Eranian <eranian@google.com>	2011-08-25 09:58:03 -0400
committer	Ingo Molnar <mingo@elte.hu>		2011-08-29 06:28:33 -0400
commit		a8d757ef076f0f95f13a918808824058de25b3eb (patch)
tree		3c1151ef886d9b72d0a7b7b267d9f37c72d5f475 /kernel
parent		c6a389f123b9f68d605bb7e0f9b32ec1e3e14132 (diff)
perf events: Fix slow and broken cgroup context switch code
The current cgroup context switch code was incorrect, leading to bogus counts.
Furthermore, as soon as there was an active cgroup event on a CPU, the context
switch cost on that CPU would increase by a significant amount as demonstrated
by a simple ping/pong example:

 $ ./pong
 Both processes pinned to CPU1, running for 10s
 10684.51 ctxsw/s

Now start a cgroup perf stat:

 $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100

 $ ./pong
 Both processes pinned to CPU1, running for 10s
 6674.61 ctxsw/s

That's a 37% penalty. Note that pong is not even in the monitored cgroup.

The results shown by perf stat are bogus:

 $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100

 Performance counter stats for 'sleep 100':

 CPU1  <not counted>   cycles   test
 CPU1  16,984,189,138  cycles   # 0.000 GHz

The second 'cycles' event should report a count @ CPU clock (here 2.4GHz) as
it is counting across all cgroups.

The patch below fixes the bogus accounting and bypasses any cgroup switches
in case the outgoing and incoming tasks are in the same cgroup.

With this patch the same test now yields:

 $ ./pong
 Both processes pinned to CPU1, running for 10s
 10775.30 ctxsw/s

Start perf stat with cgroup:

 $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10

Run pong outside the cgroup:

 $ ./pong
 Both processes pinned to CPU1, running for 10s
 10687.80 ctxsw/s

The penalty is now less than 2%. And the results for perf stat are correct:

 $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10

 Performance counter stats for 'sleep 10':

 CPU1  <not counted>   cycles   test   # 0.000 GHz
 CPU1  23,933,981,448  cycles          # 0.000 GHz

Now perf stat reports the correct counts for the non-cgroup event.

If we run pong inside the cgroup, then we also get the correct counts:

 $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10

 Performance counter stats for 'sleep 10':

 CPU1  22,297,726,205  cycles   test   # 0.000 GHz
 CPU1  23,933,981,448  cycles          # 0.000 GHz

       10.001457237 seconds time elapsed

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110825135803.GA4697@quad
Signed-off-by: Ingo Molnar <mingo@elte.hu>
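The 'pong' micro-benchmark quoted in the commit message is not part of this
commit. For readers who want to reproduce the measurement, below is a minimal
sketch of such a ping/pong context-switch test; the file name, the 10s
duration and the CPU1 pinning are assumptions taken from the output above,
not the original tool. Two processes pinned to the same CPU bounce a byte over
a pair of pipes, so every round trip forces two context switches.

/*
 * pong.c - illustrative sketch only, not the tool used in the commit message.
 * Build with something like: gcc -O2 pong.c -o pong
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/wait.h>

static void pin_to_cpu(int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	if (sched_setaffinity(0, sizeof(set), &set))
		perror("sched_setaffinity");
}

int main(void)
{
	int ping[2], pong[2];
	char c = 'x';
	struct timespec start, now;
	unsigned long long rounds = 0;

	if (pipe(ping) || pipe(pong))
		return 1;

	if (fork() == 0) {
		/* child: echo every byte straight back to the parent */
		pin_to_cpu(1);
		while (read(ping[0], &c, 1) == 1)
			if (write(pong[1], &c, 1) != 1)
				break;
		_exit(0);
	}

	pin_to_cpu(1);
	clock_gettime(CLOCK_MONOTONIC, &start);
	do {
		if (write(ping[1], &c, 1) != 1 || read(pong[0], &c, 1) != 1)
			break;
		rounds++;		/* one round trip = two context switches */
		clock_gettime(CLOCK_MONOTONIC, &now);
	} while (now.tv_sec - start.tv_sec < 10);

	close(ping[1]);			/* EOF lets the child exit */
	wait(NULL);

	printf("Both processes pinned to CPU1, running for 10s\n");
	printf("%.2f ctxsw/s\n", rounds * 2 / 10.0);
	return 0;
}

Running it with and without an active cgroup event, as in the log above, shows
the per-context-switch overhead directly.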
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/events/core.c	63
-rw-r--r--	kernel/sched.c		2
2 files changed, 54 insertions, 11 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b8785e26ee1c..45847fbb599a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -399,14 +399,54 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
 	local_irq_restore(flags);
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task)
+static inline void perf_cgroup_sched_out(struct task_struct *task,
+					 struct task_struct *next)
 {
-	perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+	struct perf_cgroup *cgrp1;
+	struct perf_cgroup *cgrp2 = NULL;
+
+	/*
+	 * we come here when we know perf_cgroup_events > 0
+	 */
+	cgrp1 = perf_cgroup_from_task(task);
+
+	/*
+	 * next is NULL when called from perf_event_enable_on_exec()
+	 * that will systematically cause a cgroup_switch()
+	 */
+	if (next)
+		cgrp2 = perf_cgroup_from_task(next);
+
+	/*
+	 * only schedule out current cgroup events if we know
+	 * that we are switching to a different cgroup. Otherwise,
+	 * do no touch the cgroup events.
+	 */
+	if (cgrp1 != cgrp2)
+		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
 }
 
-static inline void perf_cgroup_sched_in(struct task_struct *task)
+static inline void perf_cgroup_sched_in(struct task_struct *prev,
+					struct task_struct *task)
 {
-	perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+	struct perf_cgroup *cgrp1;
+	struct perf_cgroup *cgrp2 = NULL;
+
+	/*
+	 * we come here when we know perf_cgroup_events > 0
+	 */
+	cgrp1 = perf_cgroup_from_task(task);
+
+	/* prev can never be NULL */
+	cgrp2 = perf_cgroup_from_task(prev);
+
+	/*
+	 * only need to schedule in cgroup events if we are changing
+	 * cgroup during ctxsw. Cgroup events were not scheduled
+	 * out of ctxsw out if that was not the case.
+	 */
+	if (cgrp1 != cgrp2)
+		perf_cgroup_switch(task, PERF_CGROUP_SWIN);
 }
 
 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
@@ -518,11 +558,13 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 {
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task)
+static inline void perf_cgroup_sched_out(struct task_struct *task,
+					 struct task_struct *next)
 {
 }
 
-static inline void perf_cgroup_sched_in(struct task_struct *task)
+static inline void perf_cgroup_sched_in(struct task_struct *prev,
+					struct task_struct *task)
 {
 }
 
@@ -1988,7 +2030,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
 	 * cgroup event are system-wide mode only
 	 */
 	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
-		perf_cgroup_sched_out(task);
+		perf_cgroup_sched_out(task, next);
 }
 
 static void task_ctx_sched_out(struct perf_event_context *ctx)
@@ -2153,7 +2195,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
  * accessing the event control register. If a NMI hits, then it will
  * keep the event running.
  */
-void __perf_event_task_sched_in(struct task_struct *task)
+void __perf_event_task_sched_in(struct task_struct *prev,
+				struct task_struct *task)
 {
 	struct perf_event_context *ctx;
 	int ctxn;
@@ -2171,7 +2214,7 @@ void __perf_event_task_sched_in(struct task_struct *task)
 	 * cgroup event are system-wide mode only
 	 */
 	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
-		perf_cgroup_sched_in(task);
+		perf_cgroup_sched_in(prev, task);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2427,7 +2470,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 	 * ctxswin cgroup events which are already scheduled
 	 * in.
 	 */
-	perf_cgroup_sched_out(current);
+	perf_cgroup_sched_out(current, NULL);
 
 	raw_spin_lock(&ctx->lock);
 	task_ctx_sched_out(ctx);
diff --git a/kernel/sched.c b/kernel/sched.c
index ccacdbdecf45..0408cdc6d572 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3065,7 +3065,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 	local_irq_disable();
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
-	perf_event_task_sched_in(current);
+	perf_event_task_sched_in(prev, current);
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 	local_irq_enable();
 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
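Note that the diffstat above is limited to 'kernel', so the matching update to
the inline wrapper in include/linux/perf_event.h is not shown here, even though
the sched.c hunk above already calls perf_event_task_sched_in() with two
arguments. A rough sketch of the caller-facing signature after this change,
with the gating the real header performs omitted for brevity, might look like:

/* sketch only: the real wrapper lives in include/linux/perf_event.h and
 * additionally checks whether any task/cgroup events exist at all. */
static inline void perf_event_task_sched_in(struct task_struct *prev,
					    struct task_struct *task)
{
	__perf_event_task_sched_in(prev, task);
}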