author     Andi Kleen <ak@linux.intel.com>            2015-02-17 21:18:06 -0500
committer  Ingo Molnar <mingo@kernel.org>             2015-03-27 04:14:03 -0400
commit     294fe0f52a44c6f207211de0686c369a961b5533 (patch)
tree       0802f465bd807ee3b0e9e6ecc4522033beb92522 /arch
parent     91f1b70582c62576f429cf78d53751c66677553d (diff)
perf/x86/intel: Add INST_RETIRED.ALL workarounds
On Broadwell, INST_RETIRED.ALL cannot be used with any period that
doesn't have the lowest 6 bits cleared, and the period must not be
smaller than 128.
This is erratum BDM11 and BDM55:
http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf
BDM11: When using a period < 100, we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bits 0-5 of the period are non-zero, we may get redundant
PEBS records on overflow.
Add a new callback to enforce this, and set it for Broadwell.
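In effect, any value programmed into the counter for this event gets
rounded up to at least 128 and then has its lowest 6 bits cleared. A
stripped-down sketch of that rule (illustration only; the helper name
is made up here, and the real callback in the diff below additionally
checks that the event really is INST_RETIRED.ALL):

static unsigned inst_retired_limit_period(unsigned left)
{
        if (left < 128)                 /* BDM11: never program a period below 128 */
                left = 128;
        return left & ~0x3fu;           /* BDM55: clear bits 0-5 */
}

For example, a requested period of 999999 would be programmed as 999936.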
How does this handle the case when an app requests a specific
period with some of the bottom bits set?
Short answer:
Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as a PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.
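For example, at a sampling period of 1,000,000 instructions a
worst-case error of 64 is 0.0064%.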
Long answer (by Peterz):
IFF we guarantee perf_event_attr::sample_period >= 128.
Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.
So while the individual interrupts are 'wrong', we get them with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.
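The walkthrough above can be checked with a few lines of user-space C
that model the same arithmetic (not part of the patch; it assumes the
simplified view that the counter always fires after exactly the
programmed number of instructions):

#include <stdio.h>

/* The Broadwell constraint: at least 128, lowest 6 bits cleared. */
static unsigned limit_period(unsigned left)
{
        if (left < 128)
                left = 128;
        return left & ~0x3fu;
}

int main(void)
{
        const unsigned sample_period = 192;     /* requested period */
        long period_left = sample_period;
        unsigned long n = 0, last = 0;
        int overflows = 0;

        while (overflows < 6) {
                /* program the counter with the limited remainder */
                unsigned left = limit_period((unsigned)period_left);

                n += left;              /* PMI fires after 'left' instructions */
                period_left -= left;

                if (period_left <= 0) { /* user-visible overflow */
                        printf("overflow at n=%lu, interval %lu\n", n, n - last);
                        last = n;
                        period_left += sample_period;
                        overflows++;
                }
        }
        return 0;
}

It prints overflow intervals of 256, 128, 256, 128, ... which average
out to the requested 192, matching the description above.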
So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.
So no need to 'fix' the tools; all we need to do is refuse to create
INST_RETIRED.ALL events with sample_period < 128.
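With the x86_pmu_hw_config() check below, user space sees that refusal
as EINVAL at event creation time. A minimal, untested illustration (not
part of the patch): opening a raw INST_RETIRED.ALL event (event=0xc0,
umask=0x01, i.e. raw config 0x1c0) with sample_period below 128 should
now fail on Broadwell.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_RAW;
        attr.config = 0x1c0;            /* event=0xc0, umask=0x01: INST_RETIRED.ALL */
        attr.sample_period = 64;        /* < 128, so limit_period() would raise it */

        if (syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0) < 0)
                perror("perf_event_open");      /* expect: Invalid argument */
        return 0;
}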
Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch')
-rw-r--r--   arch/x86/kernel/cpu/perf_event.c          9
-rw-r--r--   arch/x86/kernel/cpu/perf_event.h          1
-rw-r--r--   arch/x86/kernel/cpu/perf_event_intel.c   27

3 files changed, 37 insertions, 0 deletions
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e0dab5ce61e9..ec6e982fd464 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -451,6 +451,12 @@ int x86_pmu_hw_config(struct perf_event *event)
         if (event->attr.type == PERF_TYPE_RAW)
                 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
+        if (event->attr.sample_period && x86_pmu.limit_period) {
+                if (x86_pmu.limit_period(event, event->attr.sample_period) >
+                                event->attr.sample_period)
+                        return -EINVAL;
+        }
+
         return x86_setup_perfctr(event);
 }
 
@@ -988,6 +994,9 @@ int x86_perf_event_set_period(struct perf_event *event)
         if (left > x86_pmu.max_period)
                 left = x86_pmu.max_period;
 
+        if (x86_pmu.limit_period)
+                left = x86_pmu.limit_period(event, left);
+
         per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 
         /*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index a371d27d6795..87e5081f4cdc 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -451,6 +451,7 @@ struct x86_pmu {
         struct x86_pmu_quirk *quirks;
         int perfctr_second_write;
         bool late_ack;
+        unsigned (*limit_period)(struct perf_event *event, unsigned l);
 
         /*
          * sysfs attrs
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 28838536a9f7..fc6dbc46af4a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2096,6 +2096,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
         return c;
 }
 
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+        if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+                        X86_CONFIG(.event=0xc0, .umask=0x01)) {
+                if (left < 128)
+                        left = 128;
+                left &= ~0x3fu;
+        }
+        return left;
+}
+
 PMU_FORMAT_ATTR(event, "config:0-7" );
 PMU_FORMAT_ATTR(umask, "config:8-15" );
 PMU_FORMAT_ATTR(edge,  "config:18" );
@@ -2774,6 +2800,7 @@ __init int intel_pmu_init(void)
         x86_pmu.hw_config = hsw_hw_config;
         x86_pmu.get_event_constraints = hsw_get_event_constraints;
         x86_pmu.cpu_events = hsw_events_attrs;
+        x86_pmu.limit_period = bdw_limit_period;
         pr_cont("Broadwell events, ");
         break;
 