author     Stephane Eranian <eranian@google.com>    2012-02-09 17:21:00 -0500
committer  Ingo Molnar <mingo@elte.hu>              2012-03-05 08:55:42 -0500
commit     d010b3326cf06b3406cdd88af16dcf4e4b6fec2e
tree       d0468d78582aeff6a603cb5d29b1a14310106896
parent     2481c5fa6db0237e4f0168f88913178b2b495b7c
perf: Add callback to flush branch_stack on context switch
With branch stack sampling, it is possible to filter by priv levels.

In system-wide mode, that means it is possible to capture only user
level branches. The builtin SW LBR filter needs to disassemble code
based on LBR captured addresses. For that, it needs to know the task
the addresses are associated with. Because of context switches, the
content of the branch stack buffer may contain addresses from
different tasks.

We need a callback on context switch to either flush the branch stack
or save it. This patch adds a new callback in struct pmu which is
called during context switches. The callback is called only when
necessary. That is when a system-wide context has, at least, one event
which uses PERF_SAMPLE_BRANCH_STACK. The callback is never called for
per-thread context.

In this version, the Intel x86 code simply flushes (resets) the LBR
on context switches (fills it with zeroes). Those zeroed branches are
then filtered out by the SW filter.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1328826068-11713-11-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c        21
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h         1
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c  13
-rw-r--r--  include/linux/perf_event.h               9
-rw-r--r--  kernel/events/core.c                    85
5 files changed, 121 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index cea567483274..0a18d16cb58d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1671,25 +1671,32 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
 	NULL,
 };
 
+static void x86_pmu_flush_branch_stack(void)
+{
+	if (x86_pmu.flush_branch_stack)
+		x86_pmu.flush_branch_stack();
+}
+
 static struct pmu pmu = {
 	.pmu_enable		= x86_pmu_enable,
 	.pmu_disable		= x86_pmu_disable,
 
 	.attr_groups		= x86_pmu_attr_groups,
 
 	.event_init		= x86_pmu_event_init,
 
 	.add			= x86_pmu_add,
 	.del			= x86_pmu_del,
 	.start			= x86_pmu_start,
 	.stop			= x86_pmu_stop,
 	.read			= x86_pmu_read,
 
 	.start_txn		= x86_pmu_start_txn,
 	.cancel_txn		= x86_pmu_cancel_txn,
 	.commit_txn		= x86_pmu_commit_txn,
 
 	.event_idx		= x86_pmu_event_idx,
+	.flush_branch_stack	= x86_pmu_flush_branch_stack,
 };
 
 void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index f104c054dc5c..74387c12dc72 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -324,6 +324,7 @@ struct x86_pmu {
 	void		(*cpu_starting)(int cpu);
 	void		(*cpu_dying)(int cpu);
 	void		(*cpu_dead)(int cpu);
+	void		(*flush_branch_stack)(void);
 
 	/*
 	 * Intel Arch Perfmon v2+
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 7cc1e2dcc4dd..6627089232a7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1539,6 +1539,18 @@ static void intel_pmu_cpu_dying(int cpu)
 	fini_debug_store_on_cpu(cpu);
 }
 
+static void intel_pmu_flush_branch_stack(void)
+{
+	/*
+	 * Intel LBR does not tag entries with the
+	 * PID of the current task, so we need to
+	 * flush it on ctxsw.
+	 * For now, we simply reset it.
+	 */
+	if (x86_pmu.lbr_nr)
+		intel_pmu_lbr_reset();
+}
+
 static __initconst const struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
@@ -1566,6 +1578,7 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.cpu_starting		= intel_pmu_cpu_starting,
 	.cpu_dying		= intel_pmu_cpu_dying,
 	.guest_get_msrs		= intel_guest_get_msrs,
+	.flush_branch_stack	= intel_pmu_flush_branch_stack,
 };
 
 static __init void intel_clovertown_quirk(void)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 5fc494f4a094..fbbf5e598368 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -746,6 +746,11 @@ struct pmu {
 	 * if no implementation is provided it will default to: event->hw.idx + 1.
 	 */
 	int (*event_idx)		(struct perf_event *event); /*optional */
+
+	/*
+	 * flush branch stack on context-switches (needed in cpu-wide mode)
+	 */
+	void (*flush_branch_stack)	(void);
 };
 
 /**
@@ -979,7 +984,8 @@ struct perf_event_context {
 	u64				parent_gen;
 	u64				generation;
 	int				pin_count;
-	int				nr_cgroups;	 /* cgroup events present */
+	int				nr_cgroups;	 /* cgroup evts */
+	int				nr_branch_stack; /* branch_stack evt */
 	struct rcu_head			rcu_head;
 };
 
@@ -1044,6 +1050,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
 
+
 struct perf_sample_data {
 	u64				type;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 242bb51c67f2..c61234b1a988 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -137,6 +137,7 @@ enum event_type_t {
  */
 struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -888,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (is_cgroup_event(event))
 		ctx->nr_cgroups++;
 
+	if (has_branch_stack(event))
+		ctx->nr_branch_stack++;
+
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	if (!ctx->nr_events)
 		perf_pmu_rotate_start(ctx->pmu);
@@ -1027,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 		cpuctx->cgrp = NULL;
 	}
 
+	if (has_branch_stack(event))
+		ctx->nr_branch_stack--;
+
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -2202,6 +2209,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 }
 
 /*
+ * When sampling the branch stack in system-wide mode, it may be necessary
+ * to flush the stack on context switch. This happens when the branch
+ * stack does not tag its entries with the pid of the current task.
+ * Otherwise it becomes impossible to associate a branch entry with a
+ * task. This ambiguity is more likely to appear when the branch stack
+ * supports priv level filtering and the user sets it to monitor only
+ * at the user level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch stack with
+ * branches from multiple tasks. Flushing may mean dropping the existing
+ * entries or stashing them somewhere in the PMU specific code layer.
+ *
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when there is at least one system-wide context
+ * with at least one active event using taken branch sampling.
+ */
+static void perf_branch_stack_sched_in(struct task_struct *prev,
+				       struct task_struct *task)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	/* no need to flush branch stack if not changing task */
+	if (prev == task)
+		return;
+
+	local_irq_save(flags);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		/*
+		 * check if the context has at least one
+		 * event using PERF_SAMPLE_BRANCH_STACK
+		 */
+		if (cpuctx->ctx.nr_branch_stack > 0
+		    && pmu->flush_branch_stack) {
+
+			pmu = cpuctx->ctx.pmu;
+
+			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+			perf_pmu_disable(pmu);
+
+			pmu->flush_branch_stack();
+
+			perf_pmu_enable(pmu);
+
+			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		}
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
+/*
  * Called from scheduler to add the events of the current task
  * with interrupts disabled.
  *
@@ -2232,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	 */
 	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
 		perf_cgroup_sched_in(prev, task);
+
+	/* check for system-wide branch_stack events */
+	if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
+		perf_branch_stack_sched_in(prev, task);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2798,6 +2869,14 @@ static void free_event(struct perf_event *event)
 			atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
 			static_key_slow_dec_deferred(&perf_sched_events);
 		}
+
+		if (has_branch_stack(event)) {
+			static_key_slow_dec_deferred(&perf_sched_events);
+			/* is system-wide event */
+			if (!(event->attach_state & PERF_ATTACH_TASK))
+				atomic_dec(&per_cpu(perf_branch_stack_events,
+						    event->cpu));
+		}
 	}
 
 	if (event->rb) {
@@ -5924,6 +6003,12 @@ done:
 				return ERR_PTR(err);
 			}
 		}
+		if (has_branch_stack(event)) {
+			static_key_slow_inc(&perf_sched_events.key);
+			if (!(event->attach_state & PERF_ATTACH_TASK))
+				atomic_inc(&per_cpu(perf_branch_stack_events,
+						    event->cpu));
+		}
 	}
 
 	return event;
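The changelog notes that flushing "may mean dropping the existing entries or
stashing them somewhere in the PMU specific code layer"; the Intel code above
takes the simple route and resets the LBR via intel_pmu_lbr_reset(). Below is
a hypothetical sketch of the stashing alternative for some other PMU driver.
Only the .flush_branch_stack member of struct pmu comes from this patch;
my_pmu, my_lbr_stash and the my_pmu_read_lbr()/my_pmu_reset_lbr() helpers are
invented stand-ins for driver-specific hardware access.

#include <linux/perf_event.h>
#include <linux/percpu.h>
#include <linux/types.h>
#include <linux/init.h>

#define MY_LBR_DEPTH	16

struct my_lbr_entry {
	u64	from;
	u64	to;
};

struct my_lbr_stash {
	struct my_lbr_entry	entries[MY_LBR_DEPTH];
	int			nr;
};

static DEFINE_PER_CPU(struct my_lbr_stash, my_lbr_stash);

/* Stand-ins for the driver's real MSR accessors (hypothetical). */
static int my_pmu_read_lbr(struct my_lbr_entry *buf, int max)
{
	/* copy up to @max hardware entries into @buf, return the count */
	return 0;
}

static void my_pmu_reset_lbr(void)
{
	/* clear the hardware branch stack */
}

static void my_pmu_flush_branch_stack(void)
{
	struct my_lbr_stash *stash = this_cpu_ptr(&my_lbr_stash);

	/*
	 * Invoked from perf_branch_stack_sched_in() with interrupts off,
	 * and only when a cpu-wide context on this CPU has at least one
	 * PERF_SAMPLE_BRANCH_STACK event.  Save the entries that belong
	 * to the outgoing task, then clear the hardware buffer so the
	 * next sample only contains branches of the incoming task.
	 */
	stash->nr = my_pmu_read_lbr(stash->entries, MY_LBR_DEPTH);
	my_pmu_reset_lbr();
}

static struct pmu my_pmu = {
	/* .event_init, .add, .del, .start, .stop, .read, ... as usual */
	.flush_branch_stack	= my_pmu_flush_branch_stack,
};

static int __init my_pmu_init(void)
{
	/* a real driver would fill in the mandatory callbacks first */
	return perf_pmu_register(&my_pmu, "my_pmu", -1);
}

Either way, the core only invokes the callback when a cpu-wide context on the
CPU has at least one PERF_SAMPLE_BRANCH_STACK event, so purely per-thread
users pay nothing on context switch.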