author    Yan, Zheng <zheng.z.yan@intel.com>    2015-05-06 15:33:50 -0400
committer Ingo Molnar <mingo@kernel.org>        2015-06-07 10:08:49 -0400
commit    3569c0d7c5440d6fd06b10e1ef9614588a049bc7 (patch)
tree      b8241d20699e63b2f18ca3def3720d6b857d534d
parent    21509084f999d7accd32e45961ef76853112e978 (diff)
perf/x86/intel: Implement batched PEBS interrupt handling (large PEBS interrupt threshold)
PEBS has always had the capability to log samples to its buffers without
an interrupt. Traditionally perf has not used this, but has always set
the PEBS threshold to one.

For frequently occurring events (like cycles or branches or load/store)
this in turn requires using a relatively high sampling period to avoid
overloading the system with PMI processing. This in turn increases the
sampling error. For the common cases we still need to use the PMI,
because the PEBS hardware has various limitations. The biggest one is
that it cannot supply a callgraph. It also requires setting a fixed
period, as the hardware does not support adaptive periods. Another issue
is that it cannot supply a time stamp and some other options. To supply
a TID it requires flushing on context switch. It can however supply the
IP, the load/store address, TSX information, registers, and some other
things.

So we can make PEBS work for some specific cases: basically, as long as
you can do without a callgraph and can set the period, you can use this
new PEBS mode.

The main benefit is the ability to support much lower sampling periods
(down to -c 1000) without excessive overhead. One use case is, for
example, to increase the resolution of the c2c tool. Another is double
checking when you suspect that standard sampling has too much sampling
error.

Some numbers on the overhead, using a cycle soak, comparing the elapsed
time of "kernbench -M -H" between plain (threshold set to one) and
multi (large threshold).

The test command for plain:
  "perf record --time -e cycles:p -c $period -- kernbench -M -H"

The test command for multi:
  "perf record --no-time -e cycles:p -c $period -- kernbench -M -H"

( The only difference between the multi and plain test commands is the
  time stamp option. Since the time stamp is not supported with a large
  PEBS threshold, it can be used as a flag to indicate whether the large
  threshold is enabled during the test. )

  period     plain (Sec)   multi (Sec)   Delta
  10003      32.7          16.5          16.2
  20003      30.2          16.2          14.0
  40003      18.6          14.1           4.5
  80003      16.8          14.6           2.2
  100003     16.9          14.1           2.8
  800003     15.4          15.7          -0.3
  1000003    15.3          15.2           0.2
  2000003    15.3          15.1           0.1

With periods below 100003, plain (threshold one) causes much more
overhead. With a 10003 sampling period, the elapsed time for plain is
about 2x that of multi.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1430940834-8964-5-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
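As a rough user-space illustration of the eligibility check the patch adds in intel_pmu_hw_config() (this sketch is not part of the patch; uses_large_pebs_threshold() is a made-up helper name, and the flag values come from the UAPI perf_event.h): an event qualifies for the large threshold only when it uses a fixed period and every requested sample_type bit falls inside PEBS_FREERUNNING_FLAGS.

/*
 * Minimal sketch, not part of the patch: mirrors the check used before
 * setting PERF_X86_EVENT_FREERUNNING.
 */
#include <stdio.h>
#include <stdint.h>
#include <linux/perf_event.h>

#define PEBS_FREERUNNING_FLAGS \
	(PERF_SAMPLE_IP | PERF_SAMPLE_ADDR | \
	 PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
	 PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
	 PERF_SAMPLE_TRANSACTION)

static int uses_large_pebs_threshold(uint64_t sample_type, int freq)
{
	/* a fixed period, and no sample_type bit outside the set above */
	return !freq && !(sample_type & ~(uint64_t)PEBS_FREERUNNING_FLAGS);
}

int main(void)
{
	/* IP + data address only (e.g. c2c-style sampling): qualifies */
	printf("%d\n", uses_large_pebs_threshold(
			PERF_SAMPLE_IP | PERF_SAMPLE_ADDR, 0));
	/* asking for a time stamp does not: falls back to threshold one */
	printf("%d\n", uses_large_pebs_threshold(
			PERF_SAMPLE_IP | PERF_SAMPLE_TIME, 0));
	return 0;
}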
-rw-r--r--  arch/x86/kernel/cpu/perf_event.h           | 11
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c     |  5
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c  | 27
3 files changed, 38 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 7a3f0fdd2fbd..a73dfc97226b 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -76,6 +76,7 @@ struct event_constraint {
 #define PERF_X86_EVENT_RDPMC_ALLOWED	0x0100 /* grant rdpmc permission */
 #define PERF_X86_EVENT_EXCL_ACCT	0x0200 /* accounted EXCL event */
 #define PERF_X86_EVENT_AUTO_RELOAD	0x0400 /* use PEBS auto-reload */
+#define PERF_X86_EVENT_FREERUNNING	0x0800 /* use freerunning PEBS */
 
 
 struct amd_nb {
@@ -89,6 +90,16 @@ struct amd_nb {
 #define MAX_PEBS_EVENTS		8
 
 /*
+ * Flags PEBS can handle without an PMI.
+ *
+ */
+#define PEBS_FREERUNNING_FLAGS \
+	(PERF_SAMPLE_IP | PERF_SAMPLE_ADDR | \
+	PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
+	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
+	PERF_SAMPLE_TRANSACTION)
+
+/*
  * A debug store configuration.
  *
  * We only support architectures that use 64bit fields.
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 17628930a80e..6985f43c5eb9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2261,8 +2261,11 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		return ret;
 
 	if (event->attr.precise_ip) {
-		if (!event->attr.freq)
+		if (!event->attr.freq) {
 			event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
+			if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS))
+				event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
+		}
 		if (x86_pmu.pebs_aliases)
 			x86_pmu.pebs_aliases(event);
 	}
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 72529c237e6e..0ce455d958b8 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -250,7 +250,7 @@ static int alloc_pebs_buffer(int cpu)
 {
 	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 	int node = cpu_to_node(cpu);
-	int max, thresh = 1; /* always use a single PEBS record */
+	int max;
 	void *buffer, *ibuffer;
 
 	if (!x86_pmu.pebs)
@@ -280,9 +280,6 @@ static int alloc_pebs_buffer(int cpu)
 	ds->pebs_absolute_maximum = ds->pebs_buffer_base +
 		max * x86_pmu.pebs_record_size;
 
-	ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
-		thresh * x86_pmu.pebs_record_size;
-
 	return 0;
 }
 
@@ -684,14 +681,22 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
 	return &emptyconstraint;
 }
 
+static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+{
+	return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+}
+
 void intel_pmu_pebs_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
 	struct debug_store *ds = cpuc->ds;
+	bool first_pebs;
+	u64 threshold;
 
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
+	first_pebs = !pebs_is_enabled(cpuc);
 	cpuc->pebs_enabled |= 1ULL << hwc->idx;
 
 	if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
@@ -699,11 +704,25 @@ void intel_pmu_pebs_enable(struct perf_event *event)
 	else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
 		cpuc->pebs_enabled |= 1ULL << 63;
 
+	/*
+	 * When the event is constrained enough we can use a larger
+	 * threshold and run the event with less frequent PMI.
+	 */
+	if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
+		threshold = ds->pebs_absolute_maximum -
+			x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+	} else {
+		threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+	}
+
 	/* Use auto-reload if possible to save a MSR write in the PMI */
 	if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
 		ds->pebs_event_reset[hwc->idx] =
 			(u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
 	}
+
+	if (first_pebs || ds->pebs_interrupt_threshold > threshold)
+		ds->pebs_interrupt_threshold = threshold;
 }
 
 void intel_pmu_pebs_disable(struct perf_event *event)
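As a back-of-the-envelope illustration of what the new threshold buys (not part of the patch; the 4KB buffer and 192-byte record size below are illustrative assumptions rather than values taken from this change, the kernel uses PEBS_BUFFER_SIZE and x86_pmu.pebs_record_size):

/*
 * Rough arithmetic behind the threshold chosen in intel_pmu_pebs_enable():
 * interrupt when only MAX_PEBS_EVENTS record slots remain, instead of
 * after every single record.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long buffer_size     = 4096;	/* assumed PEBS buffer size */
	const unsigned long record_size     = 192;	/* assumed PEBS record size */
	const unsigned long max_pebs_events = 8;	/* MAX_PEBS_EVENTS */

	unsigned long max = buffer_size / record_size;	/* records that fit */

	printf("records per buffer: %lu\n", max);
	printf("records per PMI:    %lu (was 1)\n", max - max_pebs_events);
	return 0;
}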