author		Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>	2014-11-17 14:06:58 -0500
committer	Ingo Molnar <mingo@kernel.org>				2015-04-02 11:33:12 -0400
commit		e979121b1b1556e184492e6fc149bbe188fc83e6 (patch)
tree		905c6df26aaf39ebd1a6bbbf8253283c5aaad760
parent		6f6539cad926f55d5eb6e79d05bbe99f0d54d56d (diff)
perf/x86/intel: Implement cross-HT corruption bug workaround
This patch implements a software workaround for a HW erratum on Intel SandyBridge, IvyBridge and Haswell processors with Hyperthreading enabled. The errata are documented for each processor in their respective specification update documents:

  - SandyBridge: BJ122
  - IvyBridge:   BV98
  - Haswell:     HSD29

The bug causes silent counter corruption across hyperthreads only when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3). Counters measuring those events may leak counts to the sibling counter. For instance, counter 0, thread 0 measuring event 0xd0 may leak to counter 0, thread 1, regardless of the event measured there. The size of the leak is not predictable. It all depends on the workload and the state of each sibling hyper-thread. The corrupting events do undercount as a consequence of the leak. The leak is compensated automatically only when the sibling counter measures the exact same corrupting event AND the workload on the two threads is the same. Given there is no way to guarantee this, a workaround is necessary. Furthermore, there is a serious problem if the leaked count is added to a low-occurrence event. In that case the corruption of the low-occurrence event can be very large, e.g., orders of magnitude. There is no HW or FW workaround for this problem.

The bug is very easy to reproduce on a loaded system. Here is an example on a Haswell client, where CPU0 and CPU4 are siblings. We load the CPUs with a simple triad app streaming large floating-point vectors. We use 0x81d0 as the corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and 0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not using the LBR, the 0x20cc event should be zero.

  $ taskset -c 0 triad &
  $ taskset -c 4 triad &
  $ perf stat -a -C 0 -e r81d0 sleep 100 &
  $ perf stat -a -C 4 -e r20cc sleep 10

   Performance counter stats for 'system wide':

        139 277 291      r20cc

       10,000969126 seconds time elapsed

In this example, 0x81d0 and 0x20cc are using sibling counters on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it from 0 to 139 million occurrences.

This patch provides a software workaround to this problem by modifying the way events are scheduled onto counters by the kernel. The patch forces cross-thread mutual exclusion between counters in case a corrupting event is measured by one of the hyper-threads. If thread 0, counter 0 is measuring event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting event is measured on any hyper-thread, event scheduling proceeds as before.

The same example run with the workaround enabled yields the correct answer:

  $ taskset -c 0 triad &
  $ taskset -c 4 triad &
  $ perf stat -a -C 0 -e r81d0 sleep 100 &
  $ perf stat -a -C 4 -e r20cc sleep 10

   Performance counter stats for 'system wide':

                  0      r20cc

       10,000969126 seconds time elapsed

The patch does provide correctness for all non-corrupting events. It does not "repatriate" the leaked counts back to the leaking counter. This is planned for a second patch series. This patch series makes the repatriation easier by guaranteeing that the sibling counter is not measuring any useful event.

The patch introduces dynamic constraints for events. That means that events which did not have constraints, i.e., could be measured on any counter, may now be constrained to a subset of the counters depending on what is going on in the sibling thread. The algorithm is similar to a cache coherency protocol. We call it XSU, in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU counter.
As a consequence of the workaround, users may see an increased amount of event multiplexing, even in situations where there are fewer events than counters measured on a CPU.

The patch has been tested on all three impacted processors. Note that when HT is off, there is no corruption. However, the workaround is still enabled, yet it does not cost too much. Adding dynamic detection of HT being enabled turned out to be complex and to require too much code to be justified.

This patch addresses the issue when PEBS is not used. A subsequent patch fixes the problem when PEBS is used.

Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
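[Editor's note] For readers new to the scheme, here is a minimal, standalone sketch of the XSU filtering rule described in the commit message: a counter drops out of an event's allowed set when the sibling thread's corresponding counter is Exclusive, and a corrupting (exclusive) event is additionally barred from counters whose sibling counter is merely Shared. This is plain user-space C with illustrative names (xsu_filter, NUM_COUNTERS), not kernel identifiers; the real implementation is intel_get_excl_constraints() in the diff below, which applies the same rule to a dynamically cloned event constraint and then recomputes its counter weight.

    /* Illustrative sketch of the XSU cross-HT filtering rule; not kernel code. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NUM_COUNTERS 4

    enum xsu_state { XSU_UNUSED, XSU_SHARED, XSU_EXCLUSIVE };

    /*
     * Given the sibling hyper-thread's per-counter XSU states, reduce the set
     * of counters (a bitmask) an event may be scheduled on.  "is_excl" marks a
     * corrupting event (0xd0-0xd3), which needs exclusive use of its counter
     * across the two hyper-threads.
     */
    static uint64_t xsu_filter(uint64_t idxmsk, const enum xsu_state sibling[],
                               bool is_excl)
    {
        for (int i = 0; i < NUM_COUNTERS; i++) {
            /* sibling runs a corrupting event here: counter unusable for us */
            if (sibling[i] == XSU_EXCLUSIVE)
                idxmsk &= ~(1ULL << i);
            /* we are the corrupting event: cannot share with the sibling */
            if (is_excl && sibling[i] == XSU_SHARED)
                idxmsk &= ~(1ULL << i);
        }
        return idxmsk;
    }

    int main(void)
    {
        /* sibling thread: counter 0 exclusive, counter 1 shared, 2-3 unused */
        enum xsu_state sibling[NUM_COUNTERS] = {
            XSU_EXCLUSIVE, XSU_SHARED, XSU_UNUSED, XSU_UNUSED
        };
        uint64_t any = (1ULL << NUM_COUNTERS) - 1; /* unconstrained event */

        /* non-corrupting event: loses only counter 0 -> prints 0xe */
        printf("non-excl event mask: %#llx\n",
               (unsigned long long)xsu_filter(any, sibling, false));
        /* corrupting event: loses counters 0 and 1 -> prints 0xc */
        printf("excl event mask:     %#llx\n",
               (unsigned long long)xsu_filter(any, sibling, true));
        return 0;
    }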
-rw-r--r--	arch/x86/kernel/cpu/perf_event.c	31
-rw-r--r--	arch/x86/kernel/cpu/perf_event.h	6
-rw-r--r--	arch/x86/kernel/cpu/perf_event_intel.c	307
3 files changed, 331 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 71755401476c..b8b7a1277d8d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -779,7 +779,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	struct event_constraint *c;
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	struct perf_event *e;
-	int i, wmin, wmax, num = 0;
+	int i, wmin, wmax, unsched = 0;
 	struct hw_perf_event *hwc;
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
@@ -822,14 +822,20 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 
 	/* slow path */
 	if (i != n)
-		num = perf_assign_events(cpuc->event_list, n, wmin,
-					 wmax, assign);
+		unsched = perf_assign_events(cpuc->event_list, n, wmin,
+					     wmax, assign);
 
 	/*
-	 * Mark the event as committed, so we do not put_constraint()
-	 * in case new events are added and fail scheduling.
+	 * In case of success (unsched = 0), mark events as committed,
+	 * so we do not put_constraint() in case new events are added
+	 * and fail to be scheduled
+	 *
+	 * We invoke the lower level commit callback to lock the resource
+	 *
+	 * We do not need to do all of this in case we are called to
+	 * validate an event group (assign == NULL)
 	 */
-	if (!num && assign) {
+	if (!unsched && assign) {
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
 			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
@@ -837,11 +843,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 			x86_pmu.commit_scheduling(cpuc, e, assign[i]);
 		}
 	}
-	/*
-	 * scheduling failed or is just a simulation,
-	 * free resources if necessary
-	 */
-	if (!assign || num) {
+
+	if (!assign || unsched) {
+
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
 			/*
@@ -851,6 +855,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 			if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
 				continue;
 
+			/*
+			 * release events that failed scheduling
+			 */
 			if (x86_pmu.put_event_constraints)
 				x86_pmu.put_event_constraints(cpuc, e);
 		}
@@ -859,7 +866,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	if (x86_pmu.stop_scheduling)
 		x86_pmu.stop_scheduling(cpuc);
 
-	return num ? -EINVAL : 0;
+	return unsched ? -EINVAL : 0;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index f31f90e2d859..236afee35587 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -72,6 +72,7 @@ struct event_constraint {
 #define PERF_X86_EVENT_PEBS_LD_HSW	0x10 /* haswell style datala, load */
 #define PERF_X86_EVENT_PEBS_NA_HSW	0x20 /* haswell style datala, unknown */
 #define PERF_X86_EVENT_EXCL		0x40 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC		0x80 /* dynamic alloc'd constraint */
 #define PERF_X86_EVENT_RDPMC_ALLOWED	0x40 /* grant rdpmc permission */
 
 
@@ -133,6 +134,7 @@ enum intel_excl_state_type {
 struct intel_excl_states {
 	enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
 	enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+	bool sched_started; /* true if scheduling has started */
 };
 
 struct intel_excl_cntrs {
@@ -296,6 +298,10 @@ struct cpu_hw_events {
 #define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
 
+#define INTEL_EXCLUEVT_CONSTRAINT(c, n)	\
+	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+			   HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
+
 #define INTEL_PLD_CONSTRAINT(c, n)	\
 	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
 			   HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 7f54000fd0f1..91cc7749d7ce 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1845,7 +1845,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 }
 
 static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 			    struct perf_event *event)
 {
 	struct event_constraint *c;
@@ -1866,6 +1866,254 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 }
 
 static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xl, *xlo;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid; /* sibling thread */
+
+	/*
+	 * nothing needed if in group validation mode
+	 */
+	if (cpuc->is_fake)
+		return;
+	/*
+	 * no exclusion needed
+	 */
+	if (!excl_cntrs)
+		return;
+
+	xlo = &excl_cntrs->states[o_tid];
+	xl = &excl_cntrs->states[tid];
+
+	xl->sched_started = true;
+
+	/*
+	 * lock shared state until we are done scheduling
+	 * in stop_event_scheduling()
+	 * makes scheduling appear as a transaction
+	 */
+	WARN_ON_ONCE(!irqs_disabled());
+	raw_spin_lock(&excl_cntrs->lock);
+
+	/*
+	 * save initial state of sibling thread
+	 */
+	memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xl, *xlo;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid; /* sibling thread */
+
+	/*
+	 * nothing needed if in group validation mode
+	 */
+	if (cpuc->is_fake)
+		return;
+	/*
+	 * no exclusion needed
+	 */
+	if (!excl_cntrs)
+		return;
+
+	xlo = &excl_cntrs->states[o_tid];
+	xl = &excl_cntrs->states[tid];
+
+	/*
+	 * make new sibling thread state visible
+	 */
+	memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
+
+	xl->sched_started = false;
+	/*
+	 * release shared state lock (acquired in intel_start_scheduling())
+	 */
+	raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+			   int idx, struct event_constraint *c)
+{
+	struct event_constraint *cx;
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xl, *xlo;
+	int is_excl, i;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid; /* alternate */
+
+	/*
+	 * validating a group does not require
+	 * enforcing cross-thread exclusion
+	 */
+	if (cpuc->is_fake)
+		return c;
+
+	/*
+	 * event requires exclusive counter access
+	 * across HT threads
+	 */
+	is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+	/*
+	 * xl = state of current HT
+	 * xlo = state of sibling HT
+	 */
+	xl = &excl_cntrs->states[tid];
+	xlo = &excl_cntrs->states[o_tid];
+
+	cx = c;
+
+	/*
+	 * because we modify the constraint, we need
+	 * to make a copy. Static constraints come
+	 * from static const tables.
+	 *
+	 * only needed when constraint has not yet
+	 * been cloned (marked dynamic)
+	 */
+	if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+
+		/* sanity check */
+		if (idx < 0)
+			return &emptyconstraint;
+
+		/*
+		 * grab pre-allocated constraint entry
+		 */
+		cx = &cpuc->constraint_list[idx];
+
+		/*
+		 * initialize dynamic constraint
+		 * with static constraint
+		 */
+		memcpy(cx, c, sizeof(*cx));
+
+		/*
+		 * mark constraint as dynamic, so we
+		 * can free it later on
+		 */
+		cx->flags |= PERF_X86_EVENT_DYNAMIC;
+	}
+
+	/*
+	 * From here on, the constraint is dynamic.
+	 * Either it was just allocated above, or it
+	 * was allocated during a earlier invocation
+	 * of this function
+	 */
+
+	/*
+	 * Modify static constraint with current dynamic
+	 * state of thread
+	 *
+	 * EXCLUSIVE: sibling counter measuring exclusive event
+	 * SHARED   : sibling counter measuring non-exclusive event
+	 * UNUSED   : sibling counter unused
+	 */
+	for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+		/*
+		 * exclusive event in sibling counter
+		 * our corresponding counter cannot be used
+		 * regardless of our event
+		 */
+		if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
+			__clear_bit(i, cx->idxmsk);
+		/*
+		 * if measuring an exclusive event, sibling
+		 * measuring non-exclusive, then counter cannot
+		 * be used
+		 */
+		if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
+			__clear_bit(i, cx->idxmsk);
+	}
+
+	/*
+	 * recompute actual bit weight for scheduling algorithm
+	 */
+	cx->weight = hweight64(cx->idxmsk64);
+
+	/*
+	 * if we return an empty mask, then switch
+	 * back to static empty constraint to avoid
+	 * the cost of freeing later on
+	 */
+	if (cx->weight == 0)
+		cx = &emptyconstraint;
+
+	return cx;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			    struct perf_event *event)
+{
+	struct event_constraint *c = event->hw.constraint;
+
+	/*
+	 * first time only
+	 * - static constraint: no change across incremental scheduling calls
+	 * - dynamic constraint: handled by intel_get_excl_constraints()
+	 */
+	if (!c)
+		c = __intel_get_event_constraints(cpuc, idx, event);
+
+	if (cpuc->excl_cntrs)
+		return intel_get_excl_constraints(cpuc, event, idx, c);
+
+	return c;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+		struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xlo, *xl;
+	unsigned long flags = 0; /* keep compiler happy */
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid;
+
+	/*
+	 * nothing needed if in group validation mode
+	 */
+	if (cpuc->is_fake)
+		return;
+
+	WARN_ON_ONCE(!excl_cntrs);
+
+	if (!excl_cntrs)
+		return;
+
+	xl = &excl_cntrs->states[tid];
+	xlo = &excl_cntrs->states[o_tid];
+
+	/*
+	 * put_constraint may be called from x86_schedule_events()
+	 * which already has the lock held so here make locking
+	 * conditional
+	 */
+	if (!xl->sched_started)
+		raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+
+	/*
+	 * if event was actually assigned, then mark the
+	 * counter state as unused now
+	 */
+	if (hwc->idx >= 0)
+		xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+	if (!xl->sched_started)
+		raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
+}
+
+static void
 intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 					struct perf_event *event)
 {
@@ -1883,7 +2131,57 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
 					struct perf_event *event)
 {
+	struct event_constraint *c = event->hw.constraint;
+
 	intel_put_shared_regs_event_constraints(cpuc, event);
+
+	/*
+	 * is PMU has exclusive counter restrictions, then
+	 * all events are subject to and must call the
+	 * put_excl_constraints() routine
+	 */
+	if (c && cpuc->excl_cntrs)
+		intel_put_excl_constraints(cpuc, event);
+
+	/* cleanup dynamic constraint */
+	if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
+		event->hw.constraint = NULL;
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
+				    struct perf_event *event, int cntr)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct event_constraint *c = event->hw.constraint;
+	struct intel_excl_states *xlo, *xl;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid;
+	int is_excl;
+
+	if (cpuc->is_fake || !c)
+		return;
+
+	is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+	if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+		return;
+
+	WARN_ON_ONCE(!excl_cntrs);
+
+	if (!excl_cntrs)
+		return;
+
+	xl = &excl_cntrs->states[tid];
+	xlo = &excl_cntrs->states[o_tid];
+
+	WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
+
+	if (cntr >= 0) {
+		if (is_excl)
+			xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
+		else
+			xlo->init_state[cntr] = INTEL_EXCL_SHARED;
+	}
 }
 
 static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -2349,6 +2647,13 @@ static void intel_pmu_cpu_dying(int cpu)
 		cpuc->constraint_list = NULL;
 	}
 
+	c = cpuc->excl_cntrs;
+	if (c) {
+		if (c->core_id == -1 || --c->refcnt == 0)
+			kfree(c);
+		cpuc->excl_cntrs = NULL;
+	}
+
 	fini_debug_store_on_cpu(cpu);
 }
 