aboutsummaryrefslogtreecommitdiffstats
path: root/include/linux/perf_event.h
diff options
context:
space:
mode:
authorStephane Eranian <eranian@google.com>2011-08-25 09:58:03 -0400
committerIngo Molnar <mingo@elte.hu>2011-08-29 06:28:33 -0400
commita8d757ef076f0f95f13a918808824058de25b3eb (patch)
tree3c1151ef886d9b72d0a7b7b267d9f37c72d5f475 /include/linux/perf_event.h
parentc6a389f123b9f68d605bb7e0f9b32ec1e3e14132 (diff)
perf events: Fix slow and broken cgroup context switch code
The current cgroup context switch code was incorrect leading to bogus counts. Furthermore, as soon as there was an active cgroup event on a CPU, the context switch cost on that CPU would increase by a significant amount as demonstrated by a simple ping/pong example: $ ./pong Both processes pinned to CPU1, running for 10s 10684.51 ctxsw/s Now start a cgroup perf stat: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 $ ./pong Both processes pinned to CPU1, running for 10s 6674.61 ctxsw/s That's a 37% penalty. Note that pong is not even in the monitored cgroup. The results shown by perf stat are bogus: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 100 Performance counter stats for 'sleep 100': CPU1 <not counted> cycles test CPU1 16,984,189,138 cycles # 0.000 GHz The second 'cycles' event should report a count @ CPU clock (here 2.4GHz) as it is counting across all cgroups. The patch below fixes the bogus accounting and bypasses any cgroup switches in case the outgoing and incoming tasks are in the same cgroup. With this patch the same test now yields: $ ./pong Both processes pinned to CPU1, running for 10s 10775.30 ctxsw/s Start perf stat with cgroup: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Run pong outside the cgroup: $ /pong Both processes pinned to CPU1, running for 10s 10687.80 ctxsw/s The penalty is now less than 2%. And the results for perf stat are correct: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 <not counted> cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz Now perf stat reports the correct counts for for the non cgroup event. If we run pong inside the cgroup, then we also get the correct counts: $ perf stat -e cycles,cycles -A -a -G test -C 1 -- sleep 10 Performance counter stats for 'sleep 10': CPU1 22,297,726,205 cycles test # 0.000 GHz CPU1 23,933,981,448 cycles # 0.000 GHz 10.001457237 seconds time elapsed Signed-off-by: Stephane Eranian <eranian@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/20110825135803.GA4697@quad Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'include/linux/perf_event.h')
-rw-r--r--include/linux/perf_event.h24
1 files changed, 15 insertions, 9 deletions
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 245bafdafd5e..c816075c01ce 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -944,8 +944,10 @@ extern void perf_pmu_unregister(struct pmu *pmu);
944 944
945extern int perf_num_counters(void); 945extern int perf_num_counters(void);
946extern const char *perf_pmu_name(void); 946extern const char *perf_pmu_name(void);
947extern void __perf_event_task_sched_in(struct task_struct *task); 947extern void __perf_event_task_sched_in(struct task_struct *prev,
948extern void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next); 948 struct task_struct *task);
949extern void __perf_event_task_sched_out(struct task_struct *prev,
950 struct task_struct *next);
949extern int perf_event_init_task(struct task_struct *child); 951extern int perf_event_init_task(struct task_struct *child);
950extern void perf_event_exit_task(struct task_struct *child); 952extern void perf_event_exit_task(struct task_struct *child);
951extern void perf_event_free_task(struct task_struct *task); 953extern void perf_event_free_task(struct task_struct *task);
@@ -1059,17 +1061,20 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
1059 1061
1060extern struct jump_label_key perf_sched_events; 1062extern struct jump_label_key perf_sched_events;
1061 1063
1062static inline void perf_event_task_sched_in(struct task_struct *task) 1064static inline void perf_event_task_sched_in(struct task_struct *prev,
1065 struct task_struct *task)
1063{ 1066{
1064 if (static_branch(&perf_sched_events)) 1067 if (static_branch(&perf_sched_events))
1065 __perf_event_task_sched_in(task); 1068 __perf_event_task_sched_in(prev, task);
1066} 1069}
1067 1070
1068static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) 1071static inline void perf_event_task_sched_out(struct task_struct *prev,
1072 struct task_struct *next)
1069{ 1073{
1070 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); 1074 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0);
1071 1075
1072 __perf_event_task_sched_out(task, next); 1076 if (static_branch(&perf_sched_events))
1077 __perf_event_task_sched_out(prev, next);
1073} 1078}
1074 1079
1075extern void perf_event_mmap(struct vm_area_struct *vma); 1080extern void perf_event_mmap(struct vm_area_struct *vma);
@@ -1139,10 +1144,11 @@ extern void perf_event_disable(struct perf_event *event);
1139extern void perf_event_task_tick(void); 1144extern void perf_event_task_tick(void);
1140#else 1145#else
1141static inline void 1146static inline void
1142perf_event_task_sched_in(struct task_struct *task) { } 1147perf_event_task_sched_in(struct task_struct *prev,
1148 struct task_struct *task) { }
1143static inline void 1149static inline void
1144perf_event_task_sched_out(struct task_struct *task, 1150perf_event_task_sched_out(struct task_struct *prev,
1145 struct task_struct *next) { } 1151 struct task_struct *next) { }
1146static inline int perf_event_init_task(struct task_struct *child) { return 0; } 1152static inline int perf_event_init_task(struct task_struct *child) { return 0; }
1147static inline void perf_event_exit_task(struct task_struct *child) { } 1153static inline void perf_event_exit_task(struct task_struct *child) { }
1148static inline void perf_event_free_task(struct task_struct *task) { } 1154static inline void perf_event_free_task(struct task_struct *task) { }