author     Avi Kivity <avi@redhat.com>    2010-04-19 05:52:53 -0400
committer  Avi Kivity <avi@redhat.com>    2010-05-17 05:17:58 -0400
commit     9beeaa2d689842f7760aa16c512e6bb8182d38b6 (patch)
tree       62cea0772127c4b1c0b476e46dec6830d36809c1 /arch/x86/kernel
parent     3246af0ece6c61689847417977733f0b12dc4b6f (diff)
parent     a1645ce12adb6c9cc9e19d7695466204e3f017fe (diff)
Merge branch 'perf'
Signed-off-by: Avi Kivity <avi@redhat.com>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile                     |    2
-rw-r--r--  arch/x86/kernel/cpu/intel.c                  |    2
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c             |  552
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c         |   46
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c       |  354
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c    |  664
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_lbr.c   |  218
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c          |  834
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c          |   31
-rw-r--r--  arch/x86/kernel/ds.c                         | 1437
-rw-r--r--  arch/x86/kernel/ds_selftest.c                |  408
-rw-r--r--  arch/x86/kernel/ds_selftest.h                |   15
-rw-r--r--  arch/x86/kernel/dumpstack.c                  |    5
-rw-r--r--  arch/x86/kernel/kprobes.c                    |   16
-rw-r--r--  arch/x86/kernel/process.c                    |   18
-rw-r--r--  arch/x86/kernel/process_32.c                 |    8
-rw-r--r--  arch/x86/kernel/process_64.c                 |    8
-rw-r--r--  arch/x86/kernel/ptrace.c                     |  382
-rw-r--r--  arch/x86/kernel/step.c                       |   46
-rw-r--r--  arch/x86/kernel/traps.c                      |    4
20 files changed, 2264 insertions(+), 2786 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4c58352209e..e77b2208372 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
47obj-y += process.o 47obj-y += process.o
48obj-y += i387.o xsave.o 48obj-y += i387.o xsave.o
49obj-y += ptrace.o 49obj-y += ptrace.o
50obj-$(CONFIG_X86_DS) += ds.o
51obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
52obj-$(CONFIG_X86_32) += tls.o 50obj-$(CONFIG_X86_32) += tls.o
53obj-$(CONFIG_IA32_EMULATION) += tls.o 51obj-$(CONFIG_IA32_EMULATION) += tls.o
54obj-y += step.o 52obj-y += step.o
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7e1cca13af3..d72377c41c7 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -12,7 +12,6 @@
12#include <asm/processor.h> 12#include <asm/processor.h>
13#include <asm/pgtable.h> 13#include <asm/pgtable.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15#include <asm/ds.h>
16#include <asm/bugs.h> 15#include <asm/bugs.h>
17#include <asm/cpu.h> 16#include <asm/cpu.h>
18 17
@@ -367,7 +366,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
367 set_cpu_cap(c, X86_FEATURE_BTS); 366 set_cpu_cap(c, X86_FEATURE_BTS);
368 if (!(l1 & (1<<12))) 367 if (!(l1 & (1<<12)))
369 set_cpu_cap(c, X86_FEATURE_PEBS); 368 set_cpu_cap(c, X86_FEATURE_PEBS);
370 ds_init_intel(c);
371 } 369 }
372 370
373 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) 371 if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index db5bdc8addf..2ea78abf69d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,46 +31,51 @@
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33 33
34static u64 perf_event_mask __read_mostly; 34#if 0
35#undef wrmsrl
36#define wrmsrl(msr, val) \
37do { \
38 trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
39 (unsigned long)(val)); \
40 native_write_msr((msr), (u32)((u64)(val)), \
41 (u32)((u64)(val) >> 32)); \
42} while (0)
43#endif
35 44
36/* The maximal number of PEBS events: */ 45/*
37#define MAX_PEBS_EVENTS 4 46 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
47 */
48static unsigned long
49copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
50{
51 unsigned long offset, addr = (unsigned long)from;
52 int type = in_nmi() ? KM_NMI : KM_IRQ0;
53 unsigned long size, len = 0;
54 struct page *page;
55 void *map;
56 int ret;
38 57
39/* The size of a BTS record in bytes: */ 58 do {
40#define BTS_RECORD_SIZE 24 59 ret = __get_user_pages_fast(addr, 1, 0, &page);
60 if (!ret)
61 break;
41 62
42/* The size of a per-cpu BTS buffer in bytes: */ 63 offset = addr & (PAGE_SIZE - 1);
43#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) 64 size = min(PAGE_SIZE - offset, n - len);
44 65
45/* The BTS overflow threshold in bytes from the end of the buffer: */ 66 map = kmap_atomic(page, type);
46#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) 67 memcpy(to, map+offset, size);
68 kunmap_atomic(map, type);
69 put_page(page);
47 70
71 len += size;
72 to += size;
73 addr += size;
48 74
49/* 75 } while (len < n);
50 * Bits in the debugctlmsr controlling branch tracing.
51 */
52#define X86_DEBUGCTL_TR (1 << 6)
53#define X86_DEBUGCTL_BTS (1 << 7)
54#define X86_DEBUGCTL_BTINT (1 << 8)
55#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
56#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
57 76
58/* 77 return len;
59 * A debug store configuration. 78}
60 *
61 * We only support architectures that use 64bit fields.
62 */
63struct debug_store {
64 u64 bts_buffer_base;
65 u64 bts_index;
66 u64 bts_absolute_maximum;
67 u64 bts_interrupt_threshold;
68 u64 pebs_buffer_base;
69 u64 pebs_index;
70 u64 pebs_absolute_maximum;
71 u64 pebs_interrupt_threshold;
72 u64 pebs_event_reset[MAX_PEBS_EVENTS];
73};
74 79
75struct event_constraint { 80struct event_constraint {
76 union { 81 union {
@@ -89,18 +94,39 @@ struct amd_nb {
89 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 94 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
90}; 95};
91 96
97#define MAX_LBR_ENTRIES 16
98
92struct cpu_hw_events { 99struct cpu_hw_events {
100 /*
101 * Generic x86 PMC bits
102 */
93 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ 103 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
94 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 104 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
95 unsigned long interrupts;
96 int enabled; 105 int enabled;
97 struct debug_store *ds;
98 106
99 int n_events; 107 int n_events;
100 int n_added; 108 int n_added;
101 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ 109 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
102 u64 tags[X86_PMC_IDX_MAX]; 110 u64 tags[X86_PMC_IDX_MAX];
103 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ 111 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
112
113 /*
114 * Intel DebugStore bits
115 */
116 struct debug_store *ds;
117 u64 pebs_enabled;
118
119 /*
120 * Intel LBR bits
121 */
122 int lbr_users;
123 void *lbr_context;
124 struct perf_branch_stack lbr_stack;
125 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
126
127 /*
128 * AMD specific bits
129 */
104 struct amd_nb *amd_nb; 130 struct amd_nb *amd_nb;
105}; 131};
106 132
@@ -114,11 +140,31 @@ struct cpu_hw_events {
114#define EVENT_CONSTRAINT(c, n, m) \ 140#define EVENT_CONSTRAINT(c, n, m) \
115 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) 141 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
116 142
143/*
144 * Constraint on the Event code.
145 */
117#define INTEL_EVENT_CONSTRAINT(c, n) \ 146#define INTEL_EVENT_CONSTRAINT(c, n) \
118 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) 147 EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
119 148
149/*
150 * Constraint on the Event code + UMask + fixed-mask
151 *
152 * filter mask to validate fixed counter events.
153 * the following filters disqualify for fixed counters:
154 * - inv
155 * - edge
156 * - cnt-mask
157 * The other filters are supported by fixed counters.
158 * The any-thread option is supported starting with v3.
159 */
120#define FIXED_EVENT_CONSTRAINT(c, n) \ 160#define FIXED_EVENT_CONSTRAINT(c, n) \
121 EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) 161 EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
162
163/*
164 * Constraint on the Event code + UMask
165 */
166#define PEBS_EVENT_CONSTRAINT(c, n) \
167 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
122 168
123#define EVENT_CONSTRAINT_END \ 169#define EVENT_CONSTRAINT_END \
124 EVENT_CONSTRAINT(0, 0, 0) 170 EVENT_CONSTRAINT(0, 0, 0)
@@ -126,32 +172,43 @@ struct cpu_hw_events {
126#define for_each_event_constraint(e, c) \ 172#define for_each_event_constraint(e, c) \
127 for ((e) = (c); (e)->cmask; (e)++) 173 for ((e) = (c); (e)->cmask; (e)++)
128 174
175union perf_capabilities {
176 struct {
177 u64 lbr_format : 6;
178 u64 pebs_trap : 1;
179 u64 pebs_arch_reg : 1;
180 u64 pebs_format : 4;
181 u64 smm_freeze : 1;
182 };
183 u64 capabilities;
184};
185
129/* 186/*
130 * struct x86_pmu - generic x86 pmu 187 * struct x86_pmu - generic x86 pmu
131 */ 188 */
132struct x86_pmu { 189struct x86_pmu {
190 /*
191 * Generic x86 PMC bits
192 */
133 const char *name; 193 const char *name;
134 int version; 194 int version;
135 int (*handle_irq)(struct pt_regs *); 195 int (*handle_irq)(struct pt_regs *);
136 void (*disable_all)(void); 196 void (*disable_all)(void);
137 void (*enable_all)(void); 197 void (*enable_all)(int added);
138 void (*enable)(struct perf_event *); 198 void (*enable)(struct perf_event *);
139 void (*disable)(struct perf_event *); 199 void (*disable)(struct perf_event *);
200 int (*hw_config)(struct perf_event *event);
201 int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
140 unsigned eventsel; 202 unsigned eventsel;
141 unsigned perfctr; 203 unsigned perfctr;
142 u64 (*event_map)(int); 204 u64 (*event_map)(int);
143 u64 (*raw_event)(u64);
144 int max_events; 205 int max_events;
145 int num_events; 206 int num_counters;
146 int num_events_fixed; 207 int num_counters_fixed;
147 int event_bits; 208 int cntval_bits;
148 u64 event_mask; 209 u64 cntval_mask;
149 int apic; 210 int apic;
150 u64 max_period; 211 u64 max_period;
151 u64 intel_ctrl;
152 void (*enable_bts)(u64 config);
153 void (*disable_bts)(void);
154
155 struct event_constraint * 212 struct event_constraint *
156 (*get_event_constraints)(struct cpu_hw_events *cpuc, 213 (*get_event_constraints)(struct cpu_hw_events *cpuc,
157 struct perf_event *event); 214 struct perf_event *event);
@@ -159,11 +216,32 @@ struct x86_pmu {
159 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 216 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
160 struct perf_event *event); 217 struct perf_event *event);
161 struct event_constraint *event_constraints; 218 struct event_constraint *event_constraints;
219 void (*quirks)(void);
162 220
163 int (*cpu_prepare)(int cpu); 221 int (*cpu_prepare)(int cpu);
164 void (*cpu_starting)(int cpu); 222 void (*cpu_starting)(int cpu);
165 void (*cpu_dying)(int cpu); 223 void (*cpu_dying)(int cpu);
166 void (*cpu_dead)(int cpu); 224 void (*cpu_dead)(int cpu);
225
226 /*
227 * Intel Arch Perfmon v2+
228 */
229 u64 intel_ctrl;
230 union perf_capabilities intel_cap;
231
232 /*
233 * Intel DebugStore bits
234 */
235 int bts, pebs;
236 int pebs_record_size;
237 void (*drain_pebs)(struct pt_regs *regs);
238 struct event_constraint *pebs_constraints;
239
240 /*
241 * Intel LBR
242 */
243 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
244 int lbr_nr; /* hardware stack size */
167}; 245};
168 246
169static struct x86_pmu x86_pmu __read_mostly; 247static struct x86_pmu x86_pmu __read_mostly;
@@ -198,7 +276,7 @@ static u64
198x86_perf_event_update(struct perf_event *event) 276x86_perf_event_update(struct perf_event *event)
199{ 277{
200 struct hw_perf_event *hwc = &event->hw; 278 struct hw_perf_event *hwc = &event->hw;
201 int shift = 64 - x86_pmu.event_bits; 279 int shift = 64 - x86_pmu.cntval_bits;
202 u64 prev_raw_count, new_raw_count; 280 u64 prev_raw_count, new_raw_count;
203 int idx = hwc->idx; 281 int idx = hwc->idx;
204 s64 delta; 282 s64 delta;
@@ -241,33 +319,32 @@ again:
241static atomic_t active_events; 319static atomic_t active_events;
242static DEFINE_MUTEX(pmc_reserve_mutex); 320static DEFINE_MUTEX(pmc_reserve_mutex);
243 321
322#ifdef CONFIG_X86_LOCAL_APIC
323
244static bool reserve_pmc_hardware(void) 324static bool reserve_pmc_hardware(void)
245{ 325{
246#ifdef CONFIG_X86_LOCAL_APIC
247 int i; 326 int i;
248 327
249 if (nmi_watchdog == NMI_LOCAL_APIC) 328 if (nmi_watchdog == NMI_LOCAL_APIC)
250 disable_lapic_nmi_watchdog(); 329 disable_lapic_nmi_watchdog();
251 330
252 for (i = 0; i < x86_pmu.num_events; i++) { 331 for (i = 0; i < x86_pmu.num_counters; i++) {
253 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 332 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
254 goto perfctr_fail; 333 goto perfctr_fail;
255 } 334 }
256 335
257 for (i = 0; i < x86_pmu.num_events; i++) { 336 for (i = 0; i < x86_pmu.num_counters; i++) {
258 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 337 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
259 goto eventsel_fail; 338 goto eventsel_fail;
260 } 339 }
261#endif
262 340
263 return true; 341 return true;
264 342
265#ifdef CONFIG_X86_LOCAL_APIC
266eventsel_fail: 343eventsel_fail:
267 for (i--; i >= 0; i--) 344 for (i--; i >= 0; i--)
268 release_evntsel_nmi(x86_pmu.eventsel + i); 345 release_evntsel_nmi(x86_pmu.eventsel + i);
269 346
270 i = x86_pmu.num_events; 347 i = x86_pmu.num_counters;
271 348
272perfctr_fail: 349perfctr_fail:
273 for (i--; i >= 0; i--) 350 for (i--; i >= 0; i--)
@@ -277,128 +354,36 @@ perfctr_fail:
277 enable_lapic_nmi_watchdog(); 354 enable_lapic_nmi_watchdog();
278 355
279 return false; 356 return false;
280#endif
281} 357}
282 358
283static void release_pmc_hardware(void) 359static void release_pmc_hardware(void)
284{ 360{
285#ifdef CONFIG_X86_LOCAL_APIC
286 int i; 361 int i;
287 362
288 for (i = 0; i < x86_pmu.num_events; i++) { 363 for (i = 0; i < x86_pmu.num_counters; i++) {
289 release_perfctr_nmi(x86_pmu.perfctr + i); 364 release_perfctr_nmi(x86_pmu.perfctr + i);
290 release_evntsel_nmi(x86_pmu.eventsel + i); 365 release_evntsel_nmi(x86_pmu.eventsel + i);
291 } 366 }
292 367
293 if (nmi_watchdog == NMI_LOCAL_APIC) 368 if (nmi_watchdog == NMI_LOCAL_APIC)
294 enable_lapic_nmi_watchdog(); 369 enable_lapic_nmi_watchdog();
295#endif
296}
297
298static inline bool bts_available(void)
299{
300 return x86_pmu.enable_bts != NULL;
301}
302
303static void init_debug_store_on_cpu(int cpu)
304{
305 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
306
307 if (!ds)
308 return;
309
310 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
311 (u32)((u64)(unsigned long)ds),
312 (u32)((u64)(unsigned long)ds >> 32));
313}
314
315static void fini_debug_store_on_cpu(int cpu)
316{
317 if (!per_cpu(cpu_hw_events, cpu).ds)
318 return;
319
320 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
321}
322
323static void release_bts_hardware(void)
324{
325 int cpu;
326
327 if (!bts_available())
328 return;
329
330 get_online_cpus();
331
332 for_each_online_cpu(cpu)
333 fini_debug_store_on_cpu(cpu);
334
335 for_each_possible_cpu(cpu) {
336 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
337
338 if (!ds)
339 continue;
340
341 per_cpu(cpu_hw_events, cpu).ds = NULL;
342
343 kfree((void *)(unsigned long)ds->bts_buffer_base);
344 kfree(ds);
345 }
346
347 put_online_cpus();
348} 370}
349 371
350static int reserve_bts_hardware(void) 372#else
351{
352 int cpu, err = 0;
353
354 if (!bts_available())
355 return 0;
356
357 get_online_cpus();
358
359 for_each_possible_cpu(cpu) {
360 struct debug_store *ds;
361 void *buffer;
362
363 err = -ENOMEM;
364 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
365 if (unlikely(!buffer))
366 break;
367
368 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
369 if (unlikely(!ds)) {
370 kfree(buffer);
371 break;
372 }
373
374 ds->bts_buffer_base = (u64)(unsigned long)buffer;
375 ds->bts_index = ds->bts_buffer_base;
376 ds->bts_absolute_maximum =
377 ds->bts_buffer_base + BTS_BUFFER_SIZE;
378 ds->bts_interrupt_threshold =
379 ds->bts_absolute_maximum - BTS_OVFL_TH;
380 373
381 per_cpu(cpu_hw_events, cpu).ds = ds; 374static bool reserve_pmc_hardware(void) { return true; }
382 err = 0; 375static void release_pmc_hardware(void) {}
383 }
384 376
385 if (err) 377#endif
386 release_bts_hardware();
387 else {
388 for_each_online_cpu(cpu)
389 init_debug_store_on_cpu(cpu);
390 }
391
392 put_online_cpus();
393 378
394 return err; 379static int reserve_ds_buffers(void);
395} 380static void release_ds_buffers(void);
396 381
397static void hw_perf_event_destroy(struct perf_event *event) 382static void hw_perf_event_destroy(struct perf_event *event)
398{ 383{
399 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { 384 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
400 release_pmc_hardware(); 385 release_pmc_hardware();
401 release_bts_hardware(); 386 release_ds_buffers();
402 mutex_unlock(&pmc_reserve_mutex); 387 mutex_unlock(&pmc_reserve_mutex);
403 } 388 }
404} 389}
@@ -441,6 +426,28 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
441 return 0; 426 return 0;
442} 427}
443 428
429static int x86_pmu_hw_config(struct perf_event *event)
430{
431 /*
432 * Generate PMC IRQs:
433 * (keep 'enabled' bit clear for now)
434 */
435 event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
436
437 /*
438 * Count user and OS events unless requested not to
439 */
440 if (!event->attr.exclude_user)
441 event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
442 if (!event->attr.exclude_kernel)
443 event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
444
445 if (event->attr.type == PERF_TYPE_RAW)
446 event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
447
448 return 0;
449}
450
444/* 451/*
445 * Setup the hardware configuration for a given attr_type 452 * Setup the hardware configuration for a given attr_type
446 */ 453 */
@@ -460,8 +467,11 @@ static int __hw_perf_event_init(struct perf_event *event)
460 if (atomic_read(&active_events) == 0) { 467 if (atomic_read(&active_events) == 0) {
461 if (!reserve_pmc_hardware()) 468 if (!reserve_pmc_hardware())
462 err = -EBUSY; 469 err = -EBUSY;
463 else 470 else {
464 err = reserve_bts_hardware(); 471 err = reserve_ds_buffers();
472 if (err)
473 release_pmc_hardware();
474 }
465 } 475 }
466 if (!err) 476 if (!err)
467 atomic_inc(&active_events); 477 atomic_inc(&active_events);
@@ -472,23 +482,14 @@ static int __hw_perf_event_init(struct perf_event *event)
472 482
473 event->destroy = hw_perf_event_destroy; 483 event->destroy = hw_perf_event_destroy;
474 484
475 /*
476 * Generate PMC IRQs:
477 * (keep 'enabled' bit clear for now)
478 */
479 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
480
481 hwc->idx = -1; 485 hwc->idx = -1;
482 hwc->last_cpu = -1; 486 hwc->last_cpu = -1;
483 hwc->last_tag = ~0ULL; 487 hwc->last_tag = ~0ULL;
484 488
485 /* 489 /* Processor specifics */
486 * Count user and OS events unless requested not to. 490 err = x86_pmu.hw_config(event);
487 */ 491 if (err)
488 if (!attr->exclude_user) 492 return err;
489 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
490 if (!attr->exclude_kernel)
491 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
492 493
493 if (!hwc->sample_period) { 494 if (!hwc->sample_period) {
494 hwc->sample_period = x86_pmu.max_period; 495 hwc->sample_period = x86_pmu.max_period;
@@ -505,16 +506,8 @@ static int __hw_perf_event_init(struct perf_event *event)
505 return -EOPNOTSUPP; 506 return -EOPNOTSUPP;
506 } 507 }
507 508
508 /* 509 if (attr->type == PERF_TYPE_RAW)
509 * Raw hw_event type provide the config in the hw_event structure
510 */
511 if (attr->type == PERF_TYPE_RAW) {
512 hwc->config |= x86_pmu.raw_event(attr->config);
513 if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
514 perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
515 return -EACCES;
516 return 0; 510 return 0;
517 }
518 511
519 if (attr->type == PERF_TYPE_HW_CACHE) 512 if (attr->type == PERF_TYPE_HW_CACHE)
520 return set_ext_hw_attr(hwc, attr); 513 return set_ext_hw_attr(hwc, attr);
@@ -539,11 +532,11 @@ static int __hw_perf_event_init(struct perf_event *event)
539 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && 532 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
540 (hwc->sample_period == 1)) { 533 (hwc->sample_period == 1)) {
541 /* BTS is not supported by this architecture. */ 534 /* BTS is not supported by this architecture. */
542 if (!bts_available()) 535 if (!x86_pmu.bts)
543 return -EOPNOTSUPP; 536 return -EOPNOTSUPP;
544 537
545 /* BTS is currently only allowed for user-mode. */ 538 /* BTS is currently only allowed for user-mode. */
546 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) 539 if (!attr->exclude_kernel)
547 return -EOPNOTSUPP; 540 return -EOPNOTSUPP;
548 } 541 }
549 542
@@ -557,7 +550,7 @@ static void x86_pmu_disable_all(void)
557 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 550 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
558 int idx; 551 int idx;
559 552
560 for (idx = 0; idx < x86_pmu.num_events; idx++) { 553 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
561 u64 val; 554 u64 val;
562 555
563 if (!test_bit(idx, cpuc->active_mask)) 556 if (!test_bit(idx, cpuc->active_mask))
@@ -587,12 +580,12 @@ void hw_perf_disable(void)
587 x86_pmu.disable_all(); 580 x86_pmu.disable_all();
588} 581}
589 582
590static void x86_pmu_enable_all(void) 583static void x86_pmu_enable_all(int added)
591{ 584{
592 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 585 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
593 int idx; 586 int idx;
594 587
595 for (idx = 0; idx < x86_pmu.num_events; idx++) { 588 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
596 struct perf_event *event = cpuc->events[idx]; 589 struct perf_event *event = cpuc->events[idx];
597 u64 val; 590 u64 val;
598 591
@@ -667,14 +660,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
667 * assign events to counters starting with most 660 * assign events to counters starting with most
668 * constrained events. 661 * constrained events.
669 */ 662 */
670 wmax = x86_pmu.num_events; 663 wmax = x86_pmu.num_counters;
671 664
672 /* 665 /*
673 * when fixed event counters are present, 666 * when fixed event counters are present,
674 * wmax is incremented by 1 to account 667 * wmax is incremented by 1 to account
675 * for one more choice 668 * for one more choice
676 */ 669 */
677 if (x86_pmu.num_events_fixed) 670 if (x86_pmu.num_counters_fixed)
678 wmax++; 671 wmax++;
679 672
680 for (w = 1, num = n; num && w <= wmax; w++) { 673 for (w = 1, num = n; num && w <= wmax; w++) {
@@ -724,7 +717,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
724 struct perf_event *event; 717 struct perf_event *event;
725 int n, max_count; 718 int n, max_count;
726 719
727 max_count = x86_pmu.num_events + x86_pmu.num_events_fixed; 720 max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
728 721
729 /* current number of events already accepted */ 722 /* current number of events already accepted */
730 n = cpuc->n_events; 723 n = cpuc->n_events;
@@ -795,7 +788,7 @@ void hw_perf_enable(void)
795 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 788 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
796 struct perf_event *event; 789 struct perf_event *event;
797 struct hw_perf_event *hwc; 790 struct hw_perf_event *hwc;
798 int i; 791 int i, added = cpuc->n_added;
799 792
800 if (!x86_pmu_initialized()) 793 if (!x86_pmu_initialized())
801 return; 794 return;
@@ -847,19 +840,20 @@ void hw_perf_enable(void)
847 cpuc->enabled = 1; 840 cpuc->enabled = 1;
848 barrier(); 841 barrier();
849 842
850 x86_pmu.enable_all(); 843 x86_pmu.enable_all(added);
851} 844}
852 845
853static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc) 846static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
854{ 847{
855 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 848 wrmsrl(hwc->config_base + hwc->idx,
856 hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE); 849 hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
857} 850}
858 851
859static inline void x86_pmu_disable_event(struct perf_event *event) 852static inline void x86_pmu_disable_event(struct perf_event *event)
860{ 853{
861 struct hw_perf_event *hwc = &event->hw; 854 struct hw_perf_event *hwc = &event->hw;
862 (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config); 855
856 wrmsrl(hwc->config_base + hwc->idx, hwc->config);
863} 857}
864 858
865static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 859static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -874,7 +868,7 @@ x86_perf_event_set_period(struct perf_event *event)
874 struct hw_perf_event *hwc = &event->hw; 868 struct hw_perf_event *hwc = &event->hw;
875 s64 left = atomic64_read(&hwc->period_left); 869 s64 left = atomic64_read(&hwc->period_left);
876 s64 period = hwc->sample_period; 870 s64 period = hwc->sample_period;
877 int err, ret = 0, idx = hwc->idx; 871 int ret = 0, idx = hwc->idx;
878 872
879 if (idx == X86_PMC_IDX_FIXED_BTS) 873 if (idx == X86_PMC_IDX_FIXED_BTS)
880 return 0; 874 return 0;
@@ -912,8 +906,8 @@ x86_perf_event_set_period(struct perf_event *event)
912 */ 906 */
913 atomic64_set(&hwc->prev_count, (u64)-left); 907 atomic64_set(&hwc->prev_count, (u64)-left);
914 908
915 err = checking_wrmsrl(hwc->event_base + idx, 909 wrmsrl(hwc->event_base + idx,
916 (u64)(-left) & x86_pmu.event_mask); 910 (u64)(-left) & x86_pmu.cntval_mask);
917 911
918 perf_event_update_userpage(event); 912 perf_event_update_userpage(event);
919 913
@@ -950,7 +944,7 @@ static int x86_pmu_enable(struct perf_event *event)
950 if (n < 0) 944 if (n < 0)
951 return n; 945 return n;
952 946
953 ret = x86_schedule_events(cpuc, n, assign); 947 ret = x86_pmu.schedule_events(cpuc, n, assign);
954 if (ret) 948 if (ret)
955 return ret; 949 return ret;
956 /* 950 /*
@@ -991,11 +985,12 @@ static void x86_pmu_unthrottle(struct perf_event *event)
991void perf_event_print_debug(void) 985void perf_event_print_debug(void)
992{ 986{
993 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 987 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
988 u64 pebs;
994 struct cpu_hw_events *cpuc; 989 struct cpu_hw_events *cpuc;
995 unsigned long flags; 990 unsigned long flags;
996 int cpu, idx; 991 int cpu, idx;
997 992
998 if (!x86_pmu.num_events) 993 if (!x86_pmu.num_counters)
999 return; 994 return;
1000 995
1001 local_irq_save(flags); 996 local_irq_save(flags);
@@ -1008,16 +1003,18 @@ void perf_event_print_debug(void)
1008 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 1003 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
1009 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); 1004 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
1010 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); 1005 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
1006 rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
1011 1007
1012 pr_info("\n"); 1008 pr_info("\n");
1013 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); 1009 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
1014 pr_info("CPU#%d: status: %016llx\n", cpu, status); 1010 pr_info("CPU#%d: status: %016llx\n", cpu, status);
1015 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1011 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1016 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1012 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1013 pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
1017 } 1014 }
1018 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1015 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1019 1016
1020 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1017 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1021 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1018 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1022 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1019 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1023 1020
@@ -1030,7 +1027,7 @@ void perf_event_print_debug(void)
1030 pr_info("CPU#%d: gen-PMC%d left: %016llx\n", 1027 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1031 cpu, idx, prev_left); 1028 cpu, idx, prev_left);
1032 } 1029 }
1033 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { 1030 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
1034 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); 1031 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1035 1032
1036 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", 1033 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1095,7 +1092,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1095 1092
1096 cpuc = &__get_cpu_var(cpu_hw_events); 1093 cpuc = &__get_cpu_var(cpu_hw_events);
1097 1094
1098 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1095 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1099 if (!test_bit(idx, cpuc->active_mask)) 1096 if (!test_bit(idx, cpuc->active_mask))
1100 continue; 1097 continue;
1101 1098
@@ -1103,7 +1100,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1103 hwc = &event->hw; 1100 hwc = &event->hw;
1104 1101
1105 val = x86_perf_event_update(event); 1102 val = x86_perf_event_update(event);
1106 if (val & (1ULL << (x86_pmu.event_bits - 1))) 1103 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
1107 continue; 1104 continue;
1108 1105
1109 /* 1106 /*
@@ -1146,7 +1143,6 @@ void set_perf_event_pending(void)
1146 1143
1147void perf_events_lapic_init(void) 1144void perf_events_lapic_init(void)
1148{ 1145{
1149#ifdef CONFIG_X86_LOCAL_APIC
1150 if (!x86_pmu.apic || !x86_pmu_initialized()) 1146 if (!x86_pmu.apic || !x86_pmu_initialized())
1151 return; 1147 return;
1152 1148
@@ -1154,7 +1150,6 @@ void perf_events_lapic_init(void)
1154 * Always use NMI for PMU 1150 * Always use NMI for PMU
1155 */ 1151 */
1156 apic_write(APIC_LVTPC, APIC_DM_NMI); 1152 apic_write(APIC_LVTPC, APIC_DM_NMI);
1157#endif
1158} 1153}
1159 1154
1160static int __kprobes 1155static int __kprobes
@@ -1178,9 +1173,7 @@ perf_event_nmi_handler(struct notifier_block *self,
1178 1173
1179 regs = args->regs; 1174 regs = args->regs;
1180 1175
1181#ifdef CONFIG_X86_LOCAL_APIC
1182 apic_write(APIC_LVTPC, APIC_DM_NMI); 1176 apic_write(APIC_LVTPC, APIC_DM_NMI);
1183#endif
1184 /* 1177 /*
1185 * Can't rely on the handled return value to say it was our NMI, two 1178 * Can't rely on the handled return value to say it was our NMI, two
1186 * events could trigger 'simultaneously' raising two back-to-back NMIs. 1179 * events could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1274,12 +1267,15 @@ int hw_perf_group_sched_in(struct perf_event *leader,
1274 int assign[X86_PMC_IDX_MAX]; 1267 int assign[X86_PMC_IDX_MAX];
1275 int n0, n1, ret; 1268 int n0, n1, ret;
1276 1269
1270 if (!x86_pmu_initialized())
1271 return 0;
1272
1277 /* n0 = total number of events */ 1273 /* n0 = total number of events */
1278 n0 = collect_events(cpuc, leader, true); 1274 n0 = collect_events(cpuc, leader, true);
1279 if (n0 < 0) 1275 if (n0 < 0)
1280 return n0; 1276 return n0;
1281 1277
1282 ret = x86_schedule_events(cpuc, n0, assign); 1278 ret = x86_pmu.schedule_events(cpuc, n0, assign);
1283 if (ret) 1279 if (ret)
1284 return ret; 1280 return ret;
1285 1281
@@ -1329,6 +1325,9 @@ undo:
1329 1325
1330#include "perf_event_amd.c" 1326#include "perf_event_amd.c"
1331#include "perf_event_p6.c" 1327#include "perf_event_p6.c"
1328#include "perf_event_p4.c"
1329#include "perf_event_intel_lbr.c"
1330#include "perf_event_intel_ds.c"
1332#include "perf_event_intel.c" 1331#include "perf_event_intel.c"
1333 1332
1334static int __cpuinit 1333static int __cpuinit
@@ -1402,48 +1401,50 @@ void __init init_hw_perf_events(void)
1402 1401
1403 pr_cont("%s PMU driver.\n", x86_pmu.name); 1402 pr_cont("%s PMU driver.\n", x86_pmu.name);
1404 1403
1405 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 1404 if (x86_pmu.quirks)
1405 x86_pmu.quirks();
1406
1407 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1406 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", 1408 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1407 x86_pmu.num_events, X86_PMC_MAX_GENERIC); 1409 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1408 x86_pmu.num_events = X86_PMC_MAX_GENERIC; 1410 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1409 } 1411 }
1410 perf_event_mask = (1 << x86_pmu.num_events) - 1; 1412 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1411 perf_max_events = x86_pmu.num_events; 1413 perf_max_events = x86_pmu.num_counters;
1412 1414
1413 if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { 1415 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1414 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", 1416 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1415 x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); 1417 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1416 x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; 1418 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1417 } 1419 }
1418 1420
1419 perf_event_mask |= 1421 x86_pmu.intel_ctrl |=
1420 ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; 1422 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1421 x86_pmu.intel_ctrl = perf_event_mask;
1422 1423
1423 perf_events_lapic_init(); 1424 perf_events_lapic_init();
1424 register_die_notifier(&perf_event_nmi_notifier); 1425 register_die_notifier(&perf_event_nmi_notifier);
1425 1426
1426 unconstrained = (struct event_constraint) 1427 unconstrained = (struct event_constraint)
1427 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, 1428 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1428 0, x86_pmu.num_events); 1429 0, x86_pmu.num_counters);
1429 1430
1430 if (x86_pmu.event_constraints) { 1431 if (x86_pmu.event_constraints) {
1431 for_each_event_constraint(c, x86_pmu.event_constraints) { 1432 for_each_event_constraint(c, x86_pmu.event_constraints) {
1432 if (c->cmask != INTEL_ARCH_FIXED_MASK) 1433 if (c->cmask != X86_RAW_EVENT_MASK)
1433 continue; 1434 continue;
1434 1435
1435 c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; 1436 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1436 c->weight += x86_pmu.num_events; 1437 c->weight += x86_pmu.num_counters;
1437 } 1438 }
1438 } 1439 }
1439 1440
1440 pr_info("... version: %d\n", x86_pmu.version); 1441 pr_info("... version: %d\n", x86_pmu.version);
1441 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1442 pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
1442 pr_info("... generic registers: %d\n", x86_pmu.num_events); 1443 pr_info("... generic registers: %d\n", x86_pmu.num_counters);
1443 pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); 1444 pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
1444 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 1445 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1445 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); 1446 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1446 pr_info("... event mask: %016Lx\n", perf_event_mask); 1447 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1447 1448
1448 perf_cpu_notifier(x86_pmu_notifier); 1449 perf_cpu_notifier(x86_pmu_notifier);
1449} 1450}
@@ -1463,6 +1464,32 @@ static const struct pmu pmu = {
1463}; 1464};
1464 1465
1465/* 1466/*
1467 * validate that we can schedule this event
1468 */
1469static int validate_event(struct perf_event *event)
1470{
1471 struct cpu_hw_events *fake_cpuc;
1472 struct event_constraint *c;
1473 int ret = 0;
1474
1475 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1476 if (!fake_cpuc)
1477 return -ENOMEM;
1478
1479 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1480
1481 if (!c || !c->weight)
1482 ret = -ENOSPC;
1483
1484 if (x86_pmu.put_event_constraints)
1485 x86_pmu.put_event_constraints(fake_cpuc, event);
1486
1487 kfree(fake_cpuc);
1488
1489 return ret;
1490}
1491
1492/*
1466 * validate a single event group 1493 * validate a single event group
1467 * 1494 *
1468 * validation include: 1495 * validation include:
@@ -1502,7 +1529,7 @@ static int validate_group(struct perf_event *event)
1502 1529
1503 fake_cpuc->n_events = n; 1530 fake_cpuc->n_events = n;
1504 1531
1505 ret = x86_schedule_events(fake_cpuc, n, NULL); 1532 ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
1506 1533
1507out_free: 1534out_free:
1508 kfree(fake_cpuc); 1535 kfree(fake_cpuc);
@@ -1527,6 +1554,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
1527 1554
1528 if (event->group_leader != event) 1555 if (event->group_leader != event)
1529 err = validate_group(event); 1556 err = validate_group(event);
1557 else
1558 err = validate_event(event);
1530 1559
1531 event->pmu = tmp; 1560 event->pmu = tmp;
1532 } 1561 }
@@ -1574,8 +1603,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1574{ 1603{
1575 struct perf_callchain_entry *entry = data; 1604 struct perf_callchain_entry *entry = data;
1576 1605
1577 if (reliable) 1606 callchain_store(entry, addr);
1578 callchain_store(entry, addr);
1579} 1607}
1580 1608
1581static const struct stacktrace_ops backtrace_ops = { 1609static const struct stacktrace_ops backtrace_ops = {
@@ -1597,41 +1625,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1597 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); 1625 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
1598} 1626}
1599 1627
1600/*
1601 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
1602 */
1603static unsigned long
1604copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
1605{
1606 unsigned long offset, addr = (unsigned long)from;
1607 int type = in_nmi() ? KM_NMI : KM_IRQ0;
1608 unsigned long size, len = 0;
1609 struct page *page;
1610 void *map;
1611 int ret;
1612
1613 do {
1614 ret = __get_user_pages_fast(addr, 1, 0, &page);
1615 if (!ret)
1616 break;
1617
1618 offset = addr & (PAGE_SIZE - 1);
1619 size = min(PAGE_SIZE - offset, n - len);
1620
1621 map = kmap_atomic(page, type);
1622 memcpy(to, map+offset, size);
1623 kunmap_atomic(map, type);
1624 put_page(page);
1625
1626 len += size;
1627 to += size;
1628 addr += size;
1629
1630 } while (len < n);
1631
1632 return len;
1633}
1634
1635#ifdef CONFIG_COMPAT 1628#ifdef CONFIG_COMPAT
1636static inline int 1629static inline int
1637perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) 1630perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -1727,6 +1720,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1727{ 1720{
1728 struct perf_callchain_entry *entry; 1721 struct perf_callchain_entry *entry;
1729 1722
1723 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1724 /* TODO: We don't support guest os callchain now */
1725 return NULL;
1726 }
1727
1730 if (in_nmi()) 1728 if (in_nmi())
1731 entry = &__get_cpu_var(pmc_nmi_entry); 1729 entry = &__get_cpu_var(pmc_nmi_entry);
1732 else 1730 else
@@ -1750,3 +1748,29 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
1750 regs->cs = __KERNEL_CS; 1748 regs->cs = __KERNEL_CS;
1751 local_save_flags(regs->flags); 1749 local_save_flags(regs->flags);
1752} 1750}
1751
1752unsigned long perf_instruction_pointer(struct pt_regs *regs)
1753{
1754 unsigned long ip;
1755 if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
1756 ip = perf_guest_cbs->get_guest_ip();
1757 else
1758 ip = instruction_pointer(regs);
1759 return ip;
1760}
1761
1762unsigned long perf_misc_flags(struct pt_regs *regs)
1763{
1764 int misc = 0;
1765 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1766 misc |= perf_guest_cbs->is_user_mode() ?
1767 PERF_RECORD_MISC_GUEST_USER :
1768 PERF_RECORD_MISC_GUEST_KERNEL;
1769 } else
1770 misc |= user_mode(regs) ? PERF_RECORD_MISC_USER :
1771 PERF_RECORD_MISC_KERNEL;
1772 if (regs->flags & PERF_EFLAGS_EXACT)
1773 misc |= PERF_RECORD_MISC_EXACT;
1774
1775 return misc;
1776}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index db6f7d4056e..611df11ba15 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,7 +2,7 @@
2 2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock); 3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4 4
5static __initconst u64 amd_hw_cache_event_ids 5static __initconst const u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX] 6 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX] 7 [PERF_COUNT_HW_CACHE_OP_MAX]
8 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 8 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -111,22 +111,19 @@ static u64 amd_pmu_event_map(int hw_event)
111 return amd_perfmon_event_map[hw_event]; 111 return amd_perfmon_event_map[hw_event];
112} 112}
113 113
114static u64 amd_pmu_raw_event(u64 hw_event) 114static int amd_pmu_hw_config(struct perf_event *event)
115{ 115{
116#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL 116 int ret = x86_pmu_hw_config(event);
117#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL 117
118#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL 118 if (ret)
119#define K7_EVNTSEL_INV_MASK 0x000800000ULL 119 return ret;
120#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL 120
121 121 if (event->attr.type != PERF_TYPE_RAW)
122#define K7_EVNTSEL_MASK \ 122 return 0;
123 (K7_EVNTSEL_EVENT_MASK | \ 123
124 K7_EVNTSEL_UNIT_MASK | \ 124 event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
125 K7_EVNTSEL_EDGE_MASK | \ 125
126 K7_EVNTSEL_INV_MASK | \ 126 return 0;
127 K7_EVNTSEL_REG_MASK)
128
129 return hw_event & K7_EVNTSEL_MASK;
130} 127}
131 128
132/* 129/*
@@ -165,7 +162,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
165 * be removed on one CPU at a time AND PMU is disabled 162 * be removed on one CPU at a time AND PMU is disabled
166 * when we come here 163 * when we come here
167 */ 164 */
168 for (i = 0; i < x86_pmu.num_events; i++) { 165 for (i = 0; i < x86_pmu.num_counters; i++) {
169 if (nb->owners[i] == event) { 166 if (nb->owners[i] == event) {
170 cmpxchg(nb->owners+i, event, NULL); 167 cmpxchg(nb->owners+i, event, NULL);
171 break; 168 break;
@@ -215,7 +212,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
215 struct hw_perf_event *hwc = &event->hw; 212 struct hw_perf_event *hwc = &event->hw;
216 struct amd_nb *nb = cpuc->amd_nb; 213 struct amd_nb *nb = cpuc->amd_nb;
217 struct perf_event *old = NULL; 214 struct perf_event *old = NULL;
218 int max = x86_pmu.num_events; 215 int max = x86_pmu.num_counters;
219 int i, j, k = -1; 216 int i, j, k = -1;
220 217
221 /* 218 /*
@@ -293,7 +290,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
293 /* 290 /*
294 * initialize all possible NB constraints 291 * initialize all possible NB constraints
295 */ 292 */
296 for (i = 0; i < x86_pmu.num_events; i++) { 293 for (i = 0; i < x86_pmu.num_counters; i++) {
297 __set_bit(i, nb->event_constraints[i].idxmsk); 294 __set_bit(i, nb->event_constraints[i].idxmsk);
298 nb->event_constraints[i].weight = 1; 295 nb->event_constraints[i].weight = 1;
299 } 296 }
@@ -371,21 +368,22 @@ static void amd_pmu_cpu_dead(int cpu)
371 raw_spin_unlock(&amd_nb_lock); 368 raw_spin_unlock(&amd_nb_lock);
372} 369}
373 370
374static __initconst struct x86_pmu amd_pmu = { 371static __initconst const struct x86_pmu amd_pmu = {
375 .name = "AMD", 372 .name = "AMD",
376 .handle_irq = x86_pmu_handle_irq, 373 .handle_irq = x86_pmu_handle_irq,
377 .disable_all = x86_pmu_disable_all, 374 .disable_all = x86_pmu_disable_all,
378 .enable_all = x86_pmu_enable_all, 375 .enable_all = x86_pmu_enable_all,
379 .enable = x86_pmu_enable_event, 376 .enable = x86_pmu_enable_event,
380 .disable = x86_pmu_disable_event, 377 .disable = x86_pmu_disable_event,
378 .hw_config = amd_pmu_hw_config,
379 .schedule_events = x86_schedule_events,
381 .eventsel = MSR_K7_EVNTSEL0, 380 .eventsel = MSR_K7_EVNTSEL0,
382 .perfctr = MSR_K7_PERFCTR0, 381 .perfctr = MSR_K7_PERFCTR0,
383 .event_map = amd_pmu_event_map, 382 .event_map = amd_pmu_event_map,
384 .raw_event = amd_pmu_raw_event,
385 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 383 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
386 .num_events = 4, 384 .num_counters = 4,
387 .event_bits = 48, 385 .cntval_bits = 48,
388 .event_mask = (1ULL << 48) - 1, 386 .cntval_mask = (1ULL << 48) - 1,
389 .apic = 1, 387 .apic = 1,
390 /* use highest bit to detect overflow */ 388 /* use highest bit to detect overflow */
391 .max_period = (1ULL << 47) - 1, 389 .max_period = (1ULL << 47) - 1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9c794ac8783..a099df96f91 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -88,7 +88,7 @@ static u64 intel_pmu_event_map(int hw_event)
88 return intel_perfmon_event_map[hw_event]; 88 return intel_perfmon_event_map[hw_event];
89} 89}
90 90
91static __initconst u64 westmere_hw_cache_event_ids 91static __initconst const u64 westmere_hw_cache_event_ids
92 [PERF_COUNT_HW_CACHE_MAX] 92 [PERF_COUNT_HW_CACHE_MAX]
93 [PERF_COUNT_HW_CACHE_OP_MAX] 93 [PERF_COUNT_HW_CACHE_OP_MAX]
94 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 94 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -179,7 +179,7 @@ static __initconst u64 westmere_hw_cache_event_ids
179 }, 179 },
180}; 180};
181 181
182static __initconst u64 nehalem_hw_cache_event_ids 182static __initconst const u64 nehalem_hw_cache_event_ids
183 [PERF_COUNT_HW_CACHE_MAX] 183 [PERF_COUNT_HW_CACHE_MAX]
184 [PERF_COUNT_HW_CACHE_OP_MAX] 184 [PERF_COUNT_HW_CACHE_OP_MAX]
185 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 185 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -270,7 +270,7 @@ static __initconst u64 nehalem_hw_cache_event_ids
270 }, 270 },
271}; 271};
272 272
273static __initconst u64 core2_hw_cache_event_ids 273static __initconst const u64 core2_hw_cache_event_ids
274 [PERF_COUNT_HW_CACHE_MAX] 274 [PERF_COUNT_HW_CACHE_MAX]
275 [PERF_COUNT_HW_CACHE_OP_MAX] 275 [PERF_COUNT_HW_CACHE_OP_MAX]
276 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 276 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -361,7 +361,7 @@ static __initconst u64 core2_hw_cache_event_ids
361 }, 361 },
362}; 362};
363 363
364static __initconst u64 atom_hw_cache_event_ids 364static __initconst const u64 atom_hw_cache_event_ids
365 [PERF_COUNT_HW_CACHE_MAX] 365 [PERF_COUNT_HW_CACHE_MAX]
366 [PERF_COUNT_HW_CACHE_OP_MAX] 366 [PERF_COUNT_HW_CACHE_OP_MAX]
367 [PERF_COUNT_HW_CACHE_RESULT_MAX] = 367 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -452,60 +452,6 @@ static __initconst u64 atom_hw_cache_event_ids
452 }, 452 },
453}; 453};
454 454
455static u64 intel_pmu_raw_event(u64 hw_event)
456{
457#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
458#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
459#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
460#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
461#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
462
463#define CORE_EVNTSEL_MASK \
464 (INTEL_ARCH_EVTSEL_MASK | \
465 INTEL_ARCH_UNIT_MASK | \
466 INTEL_ARCH_EDGE_MASK | \
467 INTEL_ARCH_INV_MASK | \
468 INTEL_ARCH_CNT_MASK)
469
470 return hw_event & CORE_EVNTSEL_MASK;
471}
472
473static void intel_pmu_enable_bts(u64 config)
474{
475 unsigned long debugctlmsr;
476
477 debugctlmsr = get_debugctlmsr();
478
479 debugctlmsr |= X86_DEBUGCTL_TR;
480 debugctlmsr |= X86_DEBUGCTL_BTS;
481 debugctlmsr |= X86_DEBUGCTL_BTINT;
482
483 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
484 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
485
486 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
487 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
488
489 update_debugctlmsr(debugctlmsr);
490}
491
492static void intel_pmu_disable_bts(void)
493{
494 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
495 unsigned long debugctlmsr;
496
497 if (!cpuc->ds)
498 return;
499
500 debugctlmsr = get_debugctlmsr();
501
502 debugctlmsr &=
503 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
504 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
505
506 update_debugctlmsr(debugctlmsr);
507}
508
509static void intel_pmu_disable_all(void) 455static void intel_pmu_disable_all(void)
510{ 456{
511 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 457 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -514,12 +460,17 @@ static void intel_pmu_disable_all(void)
514 460
515 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) 461 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
516 intel_pmu_disable_bts(); 462 intel_pmu_disable_bts();
463
464 intel_pmu_pebs_disable_all();
465 intel_pmu_lbr_disable_all();
517} 466}
518 467
519static void intel_pmu_enable_all(void) 468static void intel_pmu_enable_all(int added)
520{ 469{
521 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 470 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
522 471
472 intel_pmu_pebs_enable_all();
473 intel_pmu_lbr_enable_all();
523 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 474 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
524 475
525 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 476 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
@@ -533,6 +484,41 @@ static void intel_pmu_enable_all(void)
533 } 484 }
534} 485}
535 486
487/*
488 * Workaround for:
489 * Intel Errata AAK100 (model 26)
490 * Intel Errata AAP53 (model 30)
491 * Intel Errata BD53 (model 44)
492 *
493 * These chips need to be 'reset' when adding counters by programming
494 * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
495 * either in sequence on the same PMC or on different PMCs.
496 */
497static void intel_pmu_nhm_enable_all(int added)
498{
499 if (added) {
500 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
501 int i;
502
503 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
504 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
505 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
506
507 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
508 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
509
510 for (i = 0; i < 3; i++) {
511 struct perf_event *event = cpuc->events[i];
512
513 if (!event)
514 continue;
515
516 __x86_pmu_enable_event(&event->hw);
517 }
518 }
519 intel_pmu_enable_all(added);
520}
521
536static inline u64 intel_pmu_get_status(void) 522static inline u64 intel_pmu_get_status(void)
537{ 523{
538 u64 status; 524 u64 status;
@@ -547,8 +533,7 @@ static inline void intel_pmu_ack_status(u64 ack)
547 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 533 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
548} 534}
549 535
550static inline void 536static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
551intel_pmu_disable_fixed(struct hw_perf_event *hwc)
552{ 537{
553 int idx = hwc->idx - X86_PMC_IDX_FIXED; 538 int idx = hwc->idx - X86_PMC_IDX_FIXED;
554 u64 ctrl_val, mask; 539 u64 ctrl_val, mask;
@@ -557,71 +542,10 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc)
557 542
558 rdmsrl(hwc->config_base, ctrl_val); 543 rdmsrl(hwc->config_base, ctrl_val);
559 ctrl_val &= ~mask; 544 ctrl_val &= ~mask;
560 (void)checking_wrmsrl(hwc->config_base, ctrl_val); 545 wrmsrl(hwc->config_base, ctrl_val);
561}
562
563static void intel_pmu_drain_bts_buffer(void)
564{
565 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
566 struct debug_store *ds = cpuc->ds;
567 struct bts_record {
568 u64 from;
569 u64 to;
570 u64 flags;
571 };
572 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
573 struct bts_record *at, *top;
574 struct perf_output_handle handle;
575 struct perf_event_header header;
576 struct perf_sample_data data;
577 struct pt_regs regs;
578
579 if (!event)
580 return;
581
582 if (!ds)
583 return;
584
585 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
586 top = (struct bts_record *)(unsigned long)ds->bts_index;
587
588 if (top <= at)
589 return;
590
591 ds->bts_index = ds->bts_buffer_base;
592
593 perf_sample_data_init(&data, 0);
594
595 data.period = event->hw.last_period;
596 regs.ip = 0;
597
598 /*
599 * Prepare a generic sample, i.e. fill in the invariant fields.
600 * We will overwrite the from and to address before we output
601 * the sample.
602 */
603 perf_prepare_sample(&header, &data, event, &regs);
604
605 if (perf_output_begin(&handle, event,
606 header.size * (top - at), 1, 1))
607 return;
608
609 for (; at < top; at++) {
610 data.ip = at->from;
611 data.addr = at->to;
612
613 perf_output_sample(&handle, &header, &data, event);
614 }
615
616 perf_output_end(&handle);
617
618 /* There's new data available. */
619 event->hw.interrupts++;
620 event->pending_kill = POLL_IN;
621} 546}
622 547
623static inline void 548static void intel_pmu_disable_event(struct perf_event *event)
624intel_pmu_disable_event(struct perf_event *event)
625{ 549{
626 struct hw_perf_event *hwc = &event->hw; 550 struct hw_perf_event *hwc = &event->hw;
627 551
@@ -637,14 +561,15 @@ intel_pmu_disable_event(struct perf_event *event)
637 } 561 }
638 562
639 x86_pmu_disable_event(event); 563 x86_pmu_disable_event(event);
564
565 if (unlikely(event->attr.precise))
566 intel_pmu_pebs_disable(event);
640} 567}
641 568
642static inline void 569static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
643intel_pmu_enable_fixed(struct hw_perf_event *hwc)
644{ 570{
645 int idx = hwc->idx - X86_PMC_IDX_FIXED; 571 int idx = hwc->idx - X86_PMC_IDX_FIXED;
646 u64 ctrl_val, bits, mask; 572 u64 ctrl_val, bits, mask;
647 int err;
648 573
649 /* 574 /*
650 * Enable IRQ generation (0x8), 575 * Enable IRQ generation (0x8),
@@ -669,7 +594,7 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc)
669 rdmsrl(hwc->config_base, ctrl_val); 594 rdmsrl(hwc->config_base, ctrl_val);
670 ctrl_val &= ~mask; 595 ctrl_val &= ~mask;
671 ctrl_val |= bits; 596 ctrl_val |= bits;
672 err = checking_wrmsrl(hwc->config_base, ctrl_val); 597 wrmsrl(hwc->config_base, ctrl_val);
673} 598}
674 599
675static void intel_pmu_enable_event(struct perf_event *event) 600static void intel_pmu_enable_event(struct perf_event *event)
@@ -689,6 +614,9 @@ static void intel_pmu_enable_event(struct perf_event *event)
689 return; 614 return;
690 } 615 }
691 616
617 if (unlikely(event->attr.precise))
618 intel_pmu_pebs_enable(event);
619
692 __x86_pmu_enable_event(hwc); 620 __x86_pmu_enable_event(hwc);
693} 621}
694 622
@@ -708,20 +636,20 @@ static void intel_pmu_reset(void)
708 unsigned long flags; 636 unsigned long flags;
709 int idx; 637 int idx;
710 638
711 if (!x86_pmu.num_events) 639 if (!x86_pmu.num_counters)
712 return; 640 return;
713 641
714 local_irq_save(flags); 642 local_irq_save(flags);
715 643
716 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 644 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
717 645
718 for (idx = 0; idx < x86_pmu.num_events; idx++) { 646 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
719 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 647 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
720 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 648 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
721 } 649 }
722 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { 650 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
723 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 651 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
724 } 652
725 if (ds) 653 if (ds)
726 ds->bts_index = ds->bts_buffer_base; 654 ds->bts_index = ds->bts_buffer_base;
727 655
@@ -747,7 +675,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
747 intel_pmu_drain_bts_buffer(); 675 intel_pmu_drain_bts_buffer();
748 status = intel_pmu_get_status(); 676 status = intel_pmu_get_status();
749 if (!status) { 677 if (!status) {
750 intel_pmu_enable_all(); 678 intel_pmu_enable_all(0);
751 return 0; 679 return 0;
752 } 680 }
753 681
@@ -762,6 +690,15 @@ again:
762 690
763 inc_irq_stat(apic_perf_irqs); 691 inc_irq_stat(apic_perf_irqs);
764 ack = status; 692 ack = status;
693
694 intel_pmu_lbr_read();
695
696 /*
697 * PEBS overflow sets bit 62 in the global status register
698 */
699 if (__test_and_clear_bit(62, (unsigned long *)&status))
700 x86_pmu.drain_pebs(regs);
701
765 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 702 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
766 struct perf_event *event = cpuc->events[bit]; 703 struct perf_event *event = cpuc->events[bit];
767 704
@@ -787,26 +724,22 @@ again:
787 goto again; 724 goto again;
788 725
789done: 726done:
790 intel_pmu_enable_all(); 727 intel_pmu_enable_all(0);
791 return 1; 728 return 1;
792} 729}
793 730
794static struct event_constraint bts_constraint =
795 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
796
797static struct event_constraint * 731static struct event_constraint *
798intel_special_constraints(struct perf_event *event) 732intel_bts_constraints(struct perf_event *event)
799{ 733{
800 unsigned int hw_event; 734 struct hw_perf_event *hwc = &event->hw;
801 735 unsigned int hw_event, bts_event;
802 hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
803 736
804 if (unlikely((hw_event == 737 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
805 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && 738 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
806 (event->hw.sample_period == 1))) {
807 739
740 if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
808 return &bts_constraint; 741 return &bts_constraint;
809 } 742
810 return NULL; 743 return NULL;
811} 744}
812 745
@@ -815,24 +748,53 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
815{ 748{
816 struct event_constraint *c; 749 struct event_constraint *c;
817 750
818 c = intel_special_constraints(event); 751 c = intel_bts_constraints(event);
752 if (c)
753 return c;
754
755 c = intel_pebs_constraints(event);
819 if (c) 756 if (c)
820 return c; 757 return c;
821 758
822 return x86_get_event_constraints(cpuc, event); 759 return x86_get_event_constraints(cpuc, event);
823} 760}
824 761
825static __initconst struct x86_pmu core_pmu = { 762static int intel_pmu_hw_config(struct perf_event *event)
763{
764 int ret = x86_pmu_hw_config(event);
765
766 if (ret)
767 return ret;
768
769 if (event->attr.type != PERF_TYPE_RAW)
770 return 0;
771
772 if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
773 return 0;
774
775 if (x86_pmu.version < 3)
776 return -EINVAL;
777
778 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
779 return -EACCES;
780
781 event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
782
783 return 0;
784}
785
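The hw_config hook above is what gates the AnyThread bit: raw events carrying it are rejected unless the PMU reports architectural perfmon v3 or later and the caller is privileged (or perf is not in paranoid mode). As a rough user-space illustration of what gets gated -- not part of this patch -- the sketch below requests a raw event with bit 21 set, which I take to be ARCH_PERFMON_EVENTSEL_ANY; the 0x3c event code is only an example.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_RAW;
	/* event 0x3c, umask 0x00, plus bit 21, which I assume to be
	 * the AnyThread (EVENTSEL_ANY) bit this hook checks for */
	attr.config = 0x3c | (1ULL << 21);
	attr.disabled = 1;

	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		perror("perf_event_open");	/* expect EACCES/EINVAL if rejected */
	return 0;
}

On a kernel with this check, an unprivileged open of such a config should come back with -EACCES, or -EINVAL on pre-v3 hardware.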
786static __initconst const struct x86_pmu core_pmu = {
826 .name = "core", 787 .name = "core",
827 .handle_irq = x86_pmu_handle_irq, 788 .handle_irq = x86_pmu_handle_irq,
828 .disable_all = x86_pmu_disable_all, 789 .disable_all = x86_pmu_disable_all,
829 .enable_all = x86_pmu_enable_all, 790 .enable_all = x86_pmu_enable_all,
830 .enable = x86_pmu_enable_event, 791 .enable = x86_pmu_enable_event,
831 .disable = x86_pmu_disable_event, 792 .disable = x86_pmu_disable_event,
793 .hw_config = x86_pmu_hw_config,
794 .schedule_events = x86_schedule_events,
832 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 795 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
833 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 796 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
834 .event_map = intel_pmu_event_map, 797 .event_map = intel_pmu_event_map,
835 .raw_event = intel_pmu_raw_event,
836 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 798 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
837 .apic = 1, 799 .apic = 1,
838 /* 800 /*
@@ -845,17 +807,32 @@ static __initconst struct x86_pmu core_pmu = {
845 .event_constraints = intel_core_event_constraints, 807 .event_constraints = intel_core_event_constraints,
846}; 808};
847 809
848static __initconst struct x86_pmu intel_pmu = { 810static void intel_pmu_cpu_starting(int cpu)
811{
812 init_debug_store_on_cpu(cpu);
813 /*
814 * Deal with CPUs that don't clear their LBRs on power-up.
815 */
816 intel_pmu_lbr_reset();
817}
818
819static void intel_pmu_cpu_dying(int cpu)
820{
821 fini_debug_store_on_cpu(cpu);
822}
823
824static __initconst const struct x86_pmu intel_pmu = {
849 .name = "Intel", 825 .name = "Intel",
850 .handle_irq = intel_pmu_handle_irq, 826 .handle_irq = intel_pmu_handle_irq,
851 .disable_all = intel_pmu_disable_all, 827 .disable_all = intel_pmu_disable_all,
852 .enable_all = intel_pmu_enable_all, 828 .enable_all = intel_pmu_enable_all,
853 .enable = intel_pmu_enable_event, 829 .enable = intel_pmu_enable_event,
854 .disable = intel_pmu_disable_event, 830 .disable = intel_pmu_disable_event,
831 .hw_config = intel_pmu_hw_config,
832 .schedule_events = x86_schedule_events,
855 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 833 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
856 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 834 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
857 .event_map = intel_pmu_event_map, 835 .event_map = intel_pmu_event_map,
858 .raw_event = intel_pmu_raw_event,
859 .max_events = ARRAY_SIZE(intel_perfmon_event_map), 836 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
860 .apic = 1, 837 .apic = 1,
861 /* 838 /*
@@ -864,14 +841,38 @@ static __initconst struct x86_pmu intel_pmu = {
864 * the generic event period: 841 * the generic event period:
865 */ 842 */
866 .max_period = (1ULL << 31) - 1, 843 .max_period = (1ULL << 31) - 1,
867 .enable_bts = intel_pmu_enable_bts,
868 .disable_bts = intel_pmu_disable_bts,
869 .get_event_constraints = intel_get_event_constraints, 844 .get_event_constraints = intel_get_event_constraints,
870 845
871 .cpu_starting = init_debug_store_on_cpu, 846 .cpu_starting = intel_pmu_cpu_starting,
872 .cpu_dying = fini_debug_store_on_cpu, 847 .cpu_dying = intel_pmu_cpu_dying,
873}; 848};
874 849
850static void intel_clovertown_quirks(void)
851{
852 /*
853 * PEBS is unreliable due to:
854 *
855 * AJ67 - PEBS may experience CPL leaks
856 * AJ68 - PEBS PMI may be delayed by one event
857 * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
858 * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
859 *
860 * AJ67 could be worked around by restricting the OS/USR flags.
861 * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
862 *
863 * AJ106 could possibly be worked around by not allowing LBR
864 * usage from PEBS, including the fixup.
865 * AJ68 could possibly be worked around by always programming
866 * a pebs_event_reset[0] value and coping with the lost events.
867 *
868 * But taken together it might just make sense to not enable PEBS on
869 * these chips.
870 */
871 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
872 x86_pmu.pebs = 0;
873 x86_pmu.pebs_constraints = NULL;
874}
875
875static __init int intel_pmu_init(void) 876static __init int intel_pmu_init(void)
876{ 877{
877 union cpuid10_edx edx; 878 union cpuid10_edx edx;
@@ -881,12 +882,13 @@ static __init int intel_pmu_init(void)
881 int version; 882 int version;
882 883
883 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { 884 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
884 /* check for P6 processor family */ 885 switch (boot_cpu_data.x86) {
885 if (boot_cpu_data.x86 == 6) { 886 case 0x6:
886 return p6_pmu_init(); 887 return p6_pmu_init();
887 } else { 888 case 0xf:
889 return p4_pmu_init();
890 }
888 return -ENODEV; 891 return -ENODEV;
889 }
890 } 892 }
891 893
892 /* 894 /*
@@ -904,16 +906,28 @@ static __init int intel_pmu_init(void)
904 x86_pmu = intel_pmu; 906 x86_pmu = intel_pmu;
905 907
906 x86_pmu.version = version; 908 x86_pmu.version = version;
907 x86_pmu.num_events = eax.split.num_events; 909 x86_pmu.num_counters = eax.split.num_counters;
908 x86_pmu.event_bits = eax.split.bit_width; 910 x86_pmu.cntval_bits = eax.split.bit_width;
909 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; 911 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
910 912
911 /* 913 /*
912 * Quirk: v2 perfmon does not report fixed-purpose events, so 914 * Quirk: v2 perfmon does not report fixed-purpose events, so
913 * assume at least 3 events: 915 * assume at least 3 events:
914 */ 916 */
915 if (version > 1) 917 if (version > 1)
916 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); 918 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
919
920 /*
921 * v2 and above have a perf capabilities MSR
922 */
923 if (version > 1) {
924 u64 capabilities;
925
926 rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
927 x86_pmu.intel_cap.capabilities = capabilities;
928 }
929
930 intel_ds_init();
917 931
918 /* 932 /*
919 * Install the hw-cache-events table: 933 * Install the hw-cache-events table:
@@ -924,12 +938,15 @@ static __init int intel_pmu_init(void)
924 break; 938 break;
925 939
926 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ 940 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
941 x86_pmu.quirks = intel_clovertown_quirks;
927 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ 942 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
928 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ 943 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
929 case 29: /* six-core 45 nm xeon "Dunnington" */ 944 case 29: /* six-core 45 nm xeon "Dunnington" */
930 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, 945 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
931 sizeof(hw_cache_event_ids)); 946 sizeof(hw_cache_event_ids));
932 947
948 intel_pmu_lbr_init_core();
949
933 x86_pmu.event_constraints = intel_core2_event_constraints; 950 x86_pmu.event_constraints = intel_core2_event_constraints;
934 pr_cont("Core2 events, "); 951 pr_cont("Core2 events, ");
935 break; 952 break;
@@ -940,13 +957,19 @@ static __init int intel_pmu_init(void)
940 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 957 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
941 sizeof(hw_cache_event_ids)); 958 sizeof(hw_cache_event_ids));
942 959
960 intel_pmu_lbr_init_nhm();
961
943 x86_pmu.event_constraints = intel_nehalem_event_constraints; 962 x86_pmu.event_constraints = intel_nehalem_event_constraints;
944 pr_cont("Nehalem/Corei7 events, "); 963 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
964 pr_cont("Nehalem events, ");
945 break; 965 break;
966
946 case 28: /* Atom */ 967 case 28: /* Atom */
947 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, 968 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
948 sizeof(hw_cache_event_ids)); 969 sizeof(hw_cache_event_ids));
949 970
971 intel_pmu_lbr_init_atom();
972
950 x86_pmu.event_constraints = intel_gen_event_constraints; 973 x86_pmu.event_constraints = intel_gen_event_constraints;
951 pr_cont("Atom events, "); 974 pr_cont("Atom events, ");
952 break; 975 break;
@@ -956,7 +979,10 @@ static __init int intel_pmu_init(void)
956 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 979 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
957 sizeof(hw_cache_event_ids)); 980 sizeof(hw_cache_event_ids));
958 981
982 intel_pmu_lbr_init_nhm();
983
959 x86_pmu.event_constraints = intel_westmere_event_constraints; 984 x86_pmu.event_constraints = intel_westmere_event_constraints;
985 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
960 pr_cont("Westmere events, "); 986 pr_cont("Westmere events, ");
961 break; 987 break;
962 988
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
new file mode 100644
index 00000000000..ec8b2e12e10
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -0,0 +1,664 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/* The maximal number of PEBS events: */
4#define MAX_PEBS_EVENTS 4
5
6/* The size of a BTS record in bytes: */
7#define BTS_RECORD_SIZE 24
8
9#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
10#define PEBS_BUFFER_SIZE PAGE_SIZE
11
12/*
13 * pebs_record_32 for p4 and core not supported
14
15struct pebs_record_32 {
16 u32 flags, ip;
17 u32 ax, bc, cx, dx;
18 u32 si, di, bp, sp;
19};
20
21 */
22
23struct pebs_record_core {
24 u64 flags, ip;
25 u64 ax, bx, cx, dx;
26 u64 si, di, bp, sp;
27 u64 r8, r9, r10, r11;
28 u64 r12, r13, r14, r15;
29};
30
31struct pebs_record_nhm {
32 u64 flags, ip;
33 u64 ax, bx, cx, dx;
34 u64 si, di, bp, sp;
35 u64 r8, r9, r10, r11;
36 u64 r12, r13, r14, r15;
37 u64 status, dla, dse, lat;
38};
39
40/*
41 * A debug store configuration.
42 *
43 * We only support architectures that use 64bit fields.
44 */
45struct debug_store {
46 u64 bts_buffer_base;
47 u64 bts_index;
48 u64 bts_absolute_maximum;
49 u64 bts_interrupt_threshold;
50 u64 pebs_buffer_base;
51 u64 pebs_index;
52 u64 pebs_absolute_maximum;
53 u64 pebs_interrupt_threshold;
54 u64 pebs_event_reset[MAX_PEBS_EVENTS];
55};
56
57static void init_debug_store_on_cpu(int cpu)
58{
59 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
60
61 if (!ds)
62 return;
63
64 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
65 (u32)((u64)(unsigned long)ds),
66 (u32)((u64)(unsigned long)ds >> 32));
67}
68
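init_debug_store_on_cpu() above publishes the per-cpu debug_store to the hardware by writing its address into MSR_IA32_DS_AREA, split into the two 32-bit halves that wrmsr_on_cpu() takes. A standalone sketch of just that split; the address is invented:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t ds = 0xffff880012345000ULL;	/* hypothetical DS area address */
	uint32_t lo = (uint32_t)ds;		/* low half, as passed to wrmsr */
	uint32_t hi = (uint32_t)(ds >> 32);	/* high half */

	printf("MSR_IA32_DS_AREA <- lo=%#x hi=%#x\n", lo, hi);
	return 0;
}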
69static void fini_debug_store_on_cpu(int cpu)
70{
71 if (!per_cpu(cpu_hw_events, cpu).ds)
72 return;
73
74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
75}
76
77static void release_ds_buffers(void)
78{
79 int cpu;
80
81 if (!x86_pmu.bts && !x86_pmu.pebs)
82 return;
83
84 get_online_cpus();
85
86 for_each_online_cpu(cpu)
87 fini_debug_store_on_cpu(cpu);
88
89 for_each_possible_cpu(cpu) {
90 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
91
92 if (!ds)
93 continue;
94
95 per_cpu(cpu_hw_events, cpu).ds = NULL;
96
97 kfree((void *)(unsigned long)ds->pebs_buffer_base);
98 kfree((void *)(unsigned long)ds->bts_buffer_base);
99 kfree(ds);
100 }
101
102 put_online_cpus();
103}
104
105static int reserve_ds_buffers(void)
106{
107 int cpu, err = 0;
108
109 if (!x86_pmu.bts && !x86_pmu.pebs)
110 return 0;
111
112 get_online_cpus();
113
114 for_each_possible_cpu(cpu) {
115 struct debug_store *ds;
116 void *buffer;
117 int max, thresh;
118
119 err = -ENOMEM;
120 ds = kzalloc(sizeof(*ds), GFP_KERNEL);
121 if (unlikely(!ds))
122 break;
123 per_cpu(cpu_hw_events, cpu).ds = ds;
124
125 if (x86_pmu.bts) {
126 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
127 if (unlikely(!buffer))
128 break;
129
130 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
131 thresh = max / 16;
132
133 ds->bts_buffer_base = (u64)(unsigned long)buffer;
134 ds->bts_index = ds->bts_buffer_base;
135 ds->bts_absolute_maximum = ds->bts_buffer_base +
136 max * BTS_RECORD_SIZE;
137 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
138 thresh * BTS_RECORD_SIZE;
139 }
140
141 if (x86_pmu.pebs) {
142 buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
143 if (unlikely(!buffer))
144 break;
145
146 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
147
148 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
149 ds->pebs_index = ds->pebs_buffer_base;
150 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
151 max * x86_pmu.pebs_record_size;
152 /*
153 * Always use single record PEBS
154 */
155 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
156 x86_pmu.pebs_record_size;
157 }
158
159 err = 0;
160 }
161
162 if (err)
163 release_ds_buffers();
164 else {
165 for_each_online_cpu(cpu)
166 init_debug_store_on_cpu(cpu);
167 }
168
169 put_online_cpus();
170
171 return err;
172}
173
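The sizing arithmetic in reserve_ds_buffers() is worth seeing with numbers plugged in. The sketch below assumes 4 KiB pages and reuses the Nehalem record layout from above; it only illustrates the formulas (records per buffer, BTS interrupt threshold at max/16, single-record PEBS threshold) and nothing here is taken from a real machine:

#include <stdint.h>
#include <stdio.h>

/* mirror of struct pebs_record_nhm above, only to get its size */
struct pebs_record_nhm {
	uint64_t flags, ip;
	uint64_t ax, bx, cx, dx;
	uint64_t si, di, bp, sp;
	uint64_t r8, r9, r10, r11;
	uint64_t r12, r13, r14, r15;
	uint64_t status, dla, dse, lat;
};

#define PAGE_SIZE_ASSUMED	4096			/* assumption: 4 KiB pages */
#define BTS_BUFFER_SIZE		(PAGE_SIZE_ASSUMED << 4)
#define BTS_RECORD_SIZE		24
#define PEBS_BUFFER_SIZE	PAGE_SIZE_ASSUMED

int main(void)
{
	int bts_max    = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;	/* 2730 records */
	int bts_thresh = bts_max / 16;				/* PMI ~170 records before the end */
	int pebs_max   = PEBS_BUFFER_SIZE / (int)sizeof(struct pebs_record_nhm);

	printf("bts:  max=%d records, threshold %d records early\n", bts_max, bts_thresh);
	printf("pebs: max=%d records, threshold after 1 record\n", pebs_max);
	return 0;
}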
174/*
175 * BTS
176 */
177
178static struct event_constraint bts_constraint =
179 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
180
181static void intel_pmu_enable_bts(u64 config)
182{
183 unsigned long debugctlmsr;
184
185 debugctlmsr = get_debugctlmsr();
186
187 debugctlmsr |= DEBUGCTLMSR_TR;
188 debugctlmsr |= DEBUGCTLMSR_BTS;
189 debugctlmsr |= DEBUGCTLMSR_BTINT;
190
191 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
192 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
193
194 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
195 debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
196
197 update_debugctlmsr(debugctlmsr);
198}
199
200static void intel_pmu_disable_bts(void)
201{
202 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
203 unsigned long debugctlmsr;
204
205 if (!cpuc->ds)
206 return;
207
208 debugctlmsr = get_debugctlmsr();
209
210 debugctlmsr &=
211 ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
212 DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
213
214 update_debugctlmsr(debugctlmsr);
215}
216
217static void intel_pmu_drain_bts_buffer(void)
218{
219 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
220 struct debug_store *ds = cpuc->ds;
221 struct bts_record {
222 u64 from;
223 u64 to;
224 u64 flags;
225 };
226 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
227 struct bts_record *at, *top;
228 struct perf_output_handle handle;
229 struct perf_event_header header;
230 struct perf_sample_data data;
231 struct pt_regs regs;
232
233 if (!event)
234 return;
235
236 if (!ds)
237 return;
238
239 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
240 top = (struct bts_record *)(unsigned long)ds->bts_index;
241
242 if (top <= at)
243 return;
244
245 ds->bts_index = ds->bts_buffer_base;
246
247 perf_sample_data_init(&data, 0);
248 data.period = event->hw.last_period;
249 regs.ip = 0;
250
251 /*
252 * Prepare a generic sample, i.e. fill in the invariant fields.
253 * We will overwrite the from and to address before we output
254 * the sample.
255 */
256 perf_prepare_sample(&header, &data, event, &regs);
257
258 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
259 return;
260
261 for (; at < top; at++) {
262 data.ip = at->from;
263 data.addr = at->to;
264
265 perf_output_sample(&handle, &header, &data, event);
266 }
267
268 perf_output_end(&handle);
269
270 /* There's new data available. */
271 event->hw.interrupts++;
272 event->pending_kill = POLL_IN;
273}
274
275/*
276 * PEBS
277 */
278
279static struct event_constraint intel_core_pebs_events[] = {
280 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
281 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
282 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
283	PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
284 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
285 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
286 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
287 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
288 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
289 EVENT_CONSTRAINT_END
290};
291
292static struct event_constraint intel_nehalem_pebs_events[] = {
293 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
294 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
295 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
296	PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETIRED.ANY */
297 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
298 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
299 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
300 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
301 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
302 EVENT_CONSTRAINT_END
303};
304
305static struct event_constraint *
306intel_pebs_constraints(struct perf_event *event)
307{
308 struct event_constraint *c;
309
310 if (!event->attr.precise)
311 return NULL;
312
313 if (x86_pmu.pebs_constraints) {
314 for_each_event_constraint(c, x86_pmu.pebs_constraints) {
315 if ((event->hw.config & c->cmask) == c->code)
316 return c;
317 }
318 }
319
320 return &emptyconstraint;
321}
322
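intel_pebs_constraints() matches an event the same way the generic constraint code does: mask the config with the constraint's cmask and compare against its code. A minimal sketch with a made-up table, under the assumption that the cmask used by PEBS_EVENT_CONSTRAINT() selects the event+umask bits (0xffff):

#include <stdint.h>
#include <stdio.h>

struct constraint { uint64_t code, cmask; };

/* hypothetical table mirroring PEBS_EVENT_CONSTRAINT(0x00c0, ...) entries */
static const struct constraint pebs_tbl[] = {
	{ 0x00c0, 0xffff },	/* INSTR_RETIRED.ANY */
	{ 0x00c5, 0xffff },	/* BR_INST_RETIRED.MISPRED */
};

int main(void)
{
	/* event 0xc0, umask 0, with control bits (USR/OS/INT/EN style) above bit 15 */
	uint64_t config = 0x5300c0;
	unsigned i;

	for (i = 0; i < sizeof(pebs_tbl) / sizeof(pebs_tbl[0]); i++) {
		if ((config & pebs_tbl[i].cmask) == pebs_tbl[i].code)
			printf("matches constraint entry %u\n", i);
	}
	return 0;
}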
323static void intel_pmu_pebs_enable(struct perf_event *event)
324{
325 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
326 struct hw_perf_event *hwc = &event->hw;
327
328 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
329
330 cpuc->pebs_enabled |= 1ULL << hwc->idx;
331 WARN_ON_ONCE(cpuc->enabled);
332
333 if (x86_pmu.intel_cap.pebs_trap)
334 intel_pmu_lbr_enable(event);
335}
336
337static void intel_pmu_pebs_disable(struct perf_event *event)
338{
339 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
340 struct hw_perf_event *hwc = &event->hw;
341
342 cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
343 if (cpuc->enabled)
344 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
345
346 hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
347
348 if (x86_pmu.intel_cap.pebs_trap)
349 intel_pmu_lbr_disable(event);
350}
351
352static void intel_pmu_pebs_enable_all(void)
353{
354 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
355
356 if (cpuc->pebs_enabled)
357 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
358}
359
360static void intel_pmu_pebs_disable_all(void)
361{
362 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
363
364 if (cpuc->pebs_enabled)
365 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
366}
367
368#include <asm/insn.h>
369
370static inline bool kernel_ip(unsigned long ip)
371{
372#ifdef CONFIG_X86_32
373 return ip > PAGE_OFFSET;
374#else
375 return (long)ip < 0;
376#endif
377}
378
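kernel_ip() relies on the 64-bit address-space split: kernel text lives in the upper canonical half, so bit 63 is set and the value is negative when viewed as a signed long. A tiny demonstration with invented addresses:

#include <stdint.h>
#include <stdio.h>

static int is_kernel_ip(uint64_t ip)
{
	return (int64_t)ip < 0;		/* the 64-bit branch of kernel_ip() */
}

int main(void)
{
	printf("%d\n", is_kernel_ip(0xffffffff81000000ULL));	/* 1: kernel text */
	printf("%d\n", is_kernel_ip(0x0000000000400000ULL));	/* 0: user space  */
	return 0;
}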
379static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
380{
381 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
382 unsigned long from = cpuc->lbr_entries[0].from;
383 unsigned long old_to, to = cpuc->lbr_entries[0].to;
384 unsigned long ip = regs->ip;
385
386 /*
387 * We don't need to fixup if the PEBS assist is fault like
388 */
389 if (!x86_pmu.intel_cap.pebs_trap)
390 return 1;
391
392 /*
393 * No LBR entry, no basic block, no rewinding
394 */
395 if (!cpuc->lbr_stack.nr || !from || !to)
396 return 0;
397
398 /*
399 * Basic blocks should never cross user/kernel boundaries
400 */
401 if (kernel_ip(ip) != kernel_ip(to))
402 return 0;
403
404 /*
405 * unsigned math, either ip is before the start (impossible) or
406 * the basic block is larger than 1 page (sanity)
407 */
408 if ((ip - to) > PAGE_SIZE)
409 return 0;
410
411 /*
412 * We sampled a branch insn, rewind using the LBR stack
413 */
414 if (ip == to) {
415 regs->ip = from;
416 return 1;
417 }
418
419 do {
420 struct insn insn;
421 u8 buf[MAX_INSN_SIZE];
422 void *kaddr;
423
424 old_to = to;
425 if (!kernel_ip(ip)) {
426 int bytes, size = MAX_INSN_SIZE;
427
428 bytes = copy_from_user_nmi(buf, (void __user *)to, size);
429 if (bytes != size)
430 return 0;
431
432 kaddr = buf;
433 } else
434 kaddr = (void *)to;
435
436 kernel_insn_init(&insn, kaddr);
437 insn_get_length(&insn);
438 to += insn.length;
439 } while (to < ip);
440
441 if (to == ip) {
442 regs->ip = old_to;
443 return 1;
444 }
445
446 /*
447 * Even though we decoded the basic block, the instruction stream
448 * never matched the given IP, either the TO or the IP got corrupted.
449 */
450 return 0;
451}
452
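The fixup above undoes the one-instruction skid of trap-like PEBS: LBR entry 0 records the last branch taken before the sampled instruction, so the code decodes forward from the branch target until it reaches the reported IP, and the previously decoded address is the instruction that actually caused the event. A much-simplified user-space sketch of that walk, with a stand-in length decoder instead of the kernel insn framework and invented addresses:

#include <stdio.h>

/* stand-in for kernel_insn_init()/insn_get_length(): pretend every
 * instruction in our fake basic block is 2 bytes long */
static int insn_length(unsigned long ip)
{
	(void)ip;
	return 2;
}

/* walk from the branch target 'to' towards the PEBS-reported 'ip' and
 * return the address of the instruction just before 'ip' */
static unsigned long rewind_ip(unsigned long to, unsigned long ip)
{
	unsigned long old_to = to;

	while (to < ip) {
		old_to = to;
		to += insn_length(to);
	}
	return (to == ip) ? old_to : ip;	/* give up if the decode drifted past */
}

int main(void)
{
	/* hypothetical: block starts at 0x1000, PEBS reported 0x1008 */
	printf("fixed-up ip = %#lx\n", rewind_ip(0x1000, 0x1008));
	/* prints: fixed-up ip = 0x1006 */
	return 0;
}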
453static int intel_pmu_save_and_restart(struct perf_event *event);
454
455static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
456{
457 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
458 struct debug_store *ds = cpuc->ds;
459 struct perf_event *event = cpuc->events[0]; /* PMC0 only */
460 struct pebs_record_core *at, *top;
461 struct perf_sample_data data;
462 struct perf_raw_record raw;
463 struct pt_regs regs;
464 int n;
465
466 if (!ds || !x86_pmu.pebs)
467 return;
468
469 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
470 top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
471
472 /*
473 * Whatever else happens, drain the thing
474 */
475 ds->pebs_index = ds->pebs_buffer_base;
476
477 if (!test_bit(0, cpuc->active_mask))
478 return;
479
480 WARN_ON_ONCE(!event);
481
482 if (!event->attr.precise)
483 return;
484
485 n = top - at;
486 if (n <= 0)
487 return;
488
489 if (!intel_pmu_save_and_restart(event))
490 return;
491
492 /*
493 * Should not happen, we program the threshold at 1 and do not
494 * set a reset value.
495 */
496 WARN_ON_ONCE(n > 1);
497 at += n - 1;
498
499 perf_sample_data_init(&data, 0);
500 data.period = event->hw.last_period;
501
502 if (event->attr.sample_type & PERF_SAMPLE_RAW) {
503 raw.size = x86_pmu.pebs_record_size;
504 raw.data = at;
505 data.raw = &raw;
506 }
507
508 /*
509 * We use the interrupt regs as a base because the PEBS record
510 * does not contain a full regs set, specifically it seems to
511 * lack segment descriptors, which get used by things like
512 * user_mode().
513 *
514 * In the simple case fix up only the IP and BP,SP regs, for
515 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
516 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
517 */
518 regs = *iregs;
519 regs.ip = at->ip;
520 regs.bp = at->bp;
521 regs.sp = at->sp;
522
523 if (intel_pmu_pebs_fixup_ip(&regs))
524 regs.flags |= PERF_EFLAGS_EXACT;
525 else
526 regs.flags &= ~PERF_EFLAGS_EXACT;
527
528 if (perf_event_overflow(event, 1, &data, &regs))
529 x86_pmu_stop(event);
530}
531
532static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
533{
534 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
535 struct debug_store *ds = cpuc->ds;
536 struct pebs_record_nhm *at, *top;
537 struct perf_sample_data data;
538 struct perf_event *event = NULL;
539 struct perf_raw_record raw;
540 struct pt_regs regs;
541 u64 status = 0;
542 int bit, n;
543
544 if (!ds || !x86_pmu.pebs)
545 return;
546
547 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
548 top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
549
550 ds->pebs_index = ds->pebs_buffer_base;
551
552 n = top - at;
553 if (n <= 0)
554 return;
555
556 /*
557 * Should not happen, we program the threshold at 1 and do not
558 * set a reset value.
559 */
560 WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
561
562 for ( ; at < top; at++) {
563 for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
564 event = cpuc->events[bit];
565 if (!test_bit(bit, cpuc->active_mask))
566 continue;
567
568 WARN_ON_ONCE(!event);
569
570 if (!event->attr.precise)
571 continue;
572
573 if (__test_and_set_bit(bit, (unsigned long *)&status))
574 continue;
575
576 break;
577 }
578
579 if (!event || bit >= MAX_PEBS_EVENTS)
580 continue;
581
582 if (!intel_pmu_save_and_restart(event))
583 continue;
584
585 perf_sample_data_init(&data, 0);
586 data.period = event->hw.last_period;
587
588 if (event->attr.sample_type & PERF_SAMPLE_RAW) {
589 raw.size = x86_pmu.pebs_record_size;
590 raw.data = at;
591 data.raw = &raw;
592 }
593
594 /*
595 * See the comment in intel_pmu_drain_pebs_core()
596 */
597 regs = *iregs;
598 regs.ip = at->ip;
599 regs.bp = at->bp;
600 regs.sp = at->sp;
601
602 if (intel_pmu_pebs_fixup_ip(&regs))
603 regs.flags |= PERF_EFLAGS_EXACT;
604 else
605 regs.flags &= ~PERF_EFLAGS_EXACT;
606
607 if (perf_event_overflow(event, 1, &data, &regs))
608 x86_pmu_stop(event);
609 }
610}
611
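On Nehalem each PEBS record carries a status word whose set bits identify the counters that were armed when the record was written; the loop above claims the first not-yet-used bit so each record is attributed to at most one event. A toy sketch of that bit walk with an invented status mask (for_each_set_bit() is kernel-only, so a plain loop stands in):

#include <stdint.h>
#include <stdio.h>

#define MAX_PEBS_EVENTS 4

int main(void)
{
	uint64_t status  = 0x5;		/* hypothetical: counters 0 and 2 flagged */
	uint64_t claimed = 0;
	int bit;

	for (bit = 0; bit < MAX_PEBS_EVENTS; bit++) {
		if (!(status & (1ULL << bit)))
			continue;
		if (claimed & (1ULL << bit))	/* already got a record this round */
			continue;
		claimed |= 1ULL << bit;
		printf("record attributed to counter %d\n", bit);
		break;				/* one counter per record, as in the kernel loop */
	}
	return 0;
}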
612/*
613 * BTS, PEBS probe and setup
614 */
615
616static void intel_ds_init(void)
617{
618 /*
619 * No support for 32bit formats
620 */
621 if (!boot_cpu_has(X86_FEATURE_DTES64))
622 return;
623
624 x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
625 x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
626 if (x86_pmu.pebs) {
627 char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
628 int format = x86_pmu.intel_cap.pebs_format;
629
630 switch (format) {
631 case 0:
632 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
633 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
634 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
635 x86_pmu.pebs_constraints = intel_core_pebs_events;
636 break;
637
638 case 1:
639 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
640 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
641 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
642 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
643 break;
644
645 default:
646 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
647 x86_pmu.pebs = 0;
648 break;
649 }
650 }
651}
652
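intel_ds_init() keys off the fields that intel_pmu_init() read out of MSR_IA32_PERF_CAPABILITIES into x86_pmu.intel_cap. A hedged sketch of that decode, using the bit positions as I read the perf_capabilities union (lbr_format bits 0-5, pebs_trap bit 6, pebs_format bits 8-11); the sample value is made up:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t caps = 0x1c2;			/* hypothetical MSR contents */
	unsigned lbr_format  = caps & 0x3f;
	unsigned pebs_trap   = (caps >> 6) & 1;
	unsigned pebs_format = (caps >> 8) & 0xf;

	printf("lbr_format=%u pebs%c fmt%u\n",
	       lbr_format, pebs_trap ? '+' : '-', pebs_format);
	/* prints: lbr_format=2 pebs+ fmt1 */
	return 0;
}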
653#else /* CONFIG_CPU_SUP_INTEL */
654
655static int reserve_ds_buffers(void)
656{
657 return 0;
658}
659
660static void release_ds_buffers(void)
661{
662}
663
664#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
new file mode 100644
index 00000000000..d202c1bece1
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -0,0 +1,218 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3enum {
4 LBR_FORMAT_32 = 0x00,
5 LBR_FORMAT_LIP = 0x01,
6 LBR_FORMAT_EIP = 0x02,
7 LBR_FORMAT_EIP_FLAGS = 0x03,
8};
9
10/*
11 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI;
12 * otherwise it becomes nearly impossible to get a reliable stack.
13 */
14
15static void __intel_pmu_lbr_enable(void)
16{
17 u64 debugctl;
18
19 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
20 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
21 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
22}
23
24static void __intel_pmu_lbr_disable(void)
25{
26 u64 debugctl;
27
28 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
29 debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
30 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
31}
32
33static void intel_pmu_lbr_reset_32(void)
34{
35 int i;
36
37 for (i = 0; i < x86_pmu.lbr_nr; i++)
38 wrmsrl(x86_pmu.lbr_from + i, 0);
39}
40
41static void intel_pmu_lbr_reset_64(void)
42{
43 int i;
44
45 for (i = 0; i < x86_pmu.lbr_nr; i++) {
46 wrmsrl(x86_pmu.lbr_from + i, 0);
47 wrmsrl(x86_pmu.lbr_to + i, 0);
48 }
49}
50
51static void intel_pmu_lbr_reset(void)
52{
53 if (!x86_pmu.lbr_nr)
54 return;
55
56 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
57 intel_pmu_lbr_reset_32();
58 else
59 intel_pmu_lbr_reset_64();
60}
61
62static void intel_pmu_lbr_enable(struct perf_event *event)
63{
64 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
65
66 if (!x86_pmu.lbr_nr)
67 return;
68
69 WARN_ON_ONCE(cpuc->enabled);
70
71 /*
72 * Reset the LBR stack if we changed task context to
73 * avoid data leaks.
74 */
75
76 if (event->ctx->task && cpuc->lbr_context != event->ctx) {
77 intel_pmu_lbr_reset();
78 cpuc->lbr_context = event->ctx;
79 }
80
81 cpuc->lbr_users++;
82}
83
84static void intel_pmu_lbr_disable(struct perf_event *event)
85{
86 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
87
88 if (!x86_pmu.lbr_nr)
89 return;
90
91 cpuc->lbr_users--;
92 WARN_ON_ONCE(cpuc->lbr_users < 0);
93
94 if (cpuc->enabled && !cpuc->lbr_users)
95 __intel_pmu_lbr_disable();
96}
97
98static void intel_pmu_lbr_enable_all(void)
99{
100 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
101
102 if (cpuc->lbr_users)
103 __intel_pmu_lbr_enable();
104}
105
106static void intel_pmu_lbr_disable_all(void)
107{
108 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
109
110 if (cpuc->lbr_users)
111 __intel_pmu_lbr_disable();
112}
113
114static inline u64 intel_pmu_lbr_tos(void)
115{
116 u64 tos;
117
118 rdmsrl(x86_pmu.lbr_tos, tos);
119
120 return tos;
121}
122
123static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
124{
125 unsigned long mask = x86_pmu.lbr_nr - 1;
126 u64 tos = intel_pmu_lbr_tos();
127 int i;
128
129 for (i = 0; i < x86_pmu.lbr_nr; i++) {
130 unsigned long lbr_idx = (tos - i) & mask;
131 union {
132 struct {
133 u32 from;
134 u32 to;
135 };
136 u64 lbr;
137 } msr_lastbranch;
138
139 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
140
141 cpuc->lbr_entries[i].from = msr_lastbranch.from;
142 cpuc->lbr_entries[i].to = msr_lastbranch.to;
143 cpuc->lbr_entries[i].flags = 0;
144 }
145 cpuc->lbr_stack.nr = i;
146}
147
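In the 32-bit LBR format a single MSR read yields both branch source and destination, which intel_pmu_lbr_read_32() splits with a union; on little-endian x86 the first member (from) maps to the low half. A standalone sketch with an invented MSR value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	union {
		struct { uint32_t from, to; };	/* little-endian: from = low 32 bits */
		uint64_t lbr;
	} rec;

	rec.lbr = 0xc0101234c0105678ULL;	/* hypothetical MSR contents */
	printf("from=%#x to=%#x\n", rec.from, rec.to);
	/* prints: from=0xc0105678 to=0xc0101234 */
	return 0;
}

The stack itself is treated as a ring: entry i lives at (tos - i) & (lbr_nr - 1), which is why every lbr_nr set below is a power of two.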
148#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
149
150/*
151 * Due to lack of segmentation in Linux the effective address (offset)
152 * is the same as the linear address, allowing us to merge the LIP and EIP
153 * LBR formats.
154 */
155static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
156{
157 unsigned long mask = x86_pmu.lbr_nr - 1;
158 int lbr_format = x86_pmu.intel_cap.lbr_format;
159 u64 tos = intel_pmu_lbr_tos();
160 int i;
161
162 for (i = 0; i < x86_pmu.lbr_nr; i++) {
163 unsigned long lbr_idx = (tos - i) & mask;
164 u64 from, to, flags = 0;
165
166 rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
167 rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
168
169 if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
170 flags = !!(from & LBR_FROM_FLAG_MISPRED);
171 from = (u64)((((s64)from) << 1) >> 1);
172 }
173
174 cpuc->lbr_entries[i].from = from;
175 cpuc->lbr_entries[i].to = to;
176 cpuc->lbr_entries[i].flags = flags;
177 }
178 cpuc->lbr_stack.nr = i;
179}
180
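For the EIP_FLAGS format, bit 63 of the FROM value is the mispredict flag and the real address is recovered by sign-extending bit 62, since canonical addresses have their top bits all equal. A worked example with an invented kernel address recorded with the flag clear; like the kernel code it relies on gcc/x86 doing an arithmetic right shift on signed values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* kernel address 0xffffffff8103abcd, stored with bit 63 = flag = 0 */
	uint64_t raw = 0x7fffffff8103abcdULL;
	int mispred = !!(raw & (1ULL << 63));
	uint64_t from = (uint64_t)(((int64_t)raw << 1) >> 1);	/* sign-extend bit 62 */

	printf("mispred=%d from=%#llx\n", mispred, (unsigned long long)from);
	/* prints: mispred=0 from=0xffffffff8103abcd */
	return 0;
}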
181static void intel_pmu_lbr_read(void)
182{
183 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
184
185 if (!cpuc->lbr_users)
186 return;
187
188 if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
189 intel_pmu_lbr_read_32(cpuc);
190 else
191 intel_pmu_lbr_read_64(cpuc);
192}
193
194static void intel_pmu_lbr_init_core(void)
195{
196 x86_pmu.lbr_nr = 4;
197 x86_pmu.lbr_tos = 0x01c9;
198 x86_pmu.lbr_from = 0x40;
199 x86_pmu.lbr_to = 0x60;
200}
201
202static void intel_pmu_lbr_init_nhm(void)
203{
204 x86_pmu.lbr_nr = 16;
205 x86_pmu.lbr_tos = 0x01c9;
206 x86_pmu.lbr_from = 0x680;
207 x86_pmu.lbr_to = 0x6c0;
208}
209
210static void intel_pmu_lbr_init_atom(void)
211{
212 x86_pmu.lbr_nr = 8;
213 x86_pmu.lbr_tos = 0x01c9;
214 x86_pmu.lbr_from = 0x40;
215 x86_pmu.lbr_to = 0x60;
216}
217
218#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
new file mode 100644
index 00000000000..15367cce66b
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -0,0 +1,834 @@
1/*
2 * Netburst Performance Events (P4, old Xeon)
3 *
4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
6 *
7 * For licencing details see kernel-base/COPYING
8 */
9
10#ifdef CONFIG_CPU_SUP_INTEL
11
12#include <asm/perf_event_p4.h>
13
14#define P4_CNTR_LIMIT 3
15/*
16 * array indices: 0,1 - HT threads, used with HT enabled cpu
17 */
18struct p4_event_bind {
19 unsigned int opcode; /* Event code and ESCR selector */
20 unsigned int escr_msr[2]; /* ESCR MSR for this event */
21	unsigned char cntr[2][P4_CNTR_LIMIT];	/* counter index (offset), -1 on absence */
22};
23
24struct p4_cache_event_bind {
25 unsigned int metric_pebs;
26 unsigned int metric_vert;
27};
28
29#define P4_GEN_CACHE_EVENT_BIND(name) \
30 [P4_CACHE__##name] = { \
31 .metric_pebs = P4_PEBS__##name, \
32 .metric_vert = P4_VERT__##name, \
33 }
34
35static struct p4_cache_event_bind p4_cache_event_bind_map[] = {
36 P4_GEN_CACHE_EVENT_BIND(1stl_cache_load_miss_retired),
37 P4_GEN_CACHE_EVENT_BIND(2ndl_cache_load_miss_retired),
38 P4_GEN_CACHE_EVENT_BIND(dtlb_load_miss_retired),
39 P4_GEN_CACHE_EVENT_BIND(dtlb_store_miss_retired),
40};
41
42/*
43 * Note that we don't use CCCR1 here; there is an
44 * exception for P4_BSQ_ALLOCATION but we simply have
45 * no workaround for it.
46 *
47 * Consider this binding as the resources which a particular
48 * event may borrow; it doesn't contain EventMask,
49 * Tags and friends -- they are left to the caller
50 */
51static struct p4_event_bind p4_event_bind_map[] = {
52 [P4_EVENT_TC_DELIVER_MODE] = {
53 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
54 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
55 .cntr = { {4, 5, -1}, {6, 7, -1} },
56 },
57 [P4_EVENT_BPU_FETCH_REQUEST] = {
58 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
59 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
60 .cntr = { {0, -1, -1}, {2, -1, -1} },
61 },
62 [P4_EVENT_ITLB_REFERENCE] = {
63 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
64 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
65 .cntr = { {0, -1, -1}, {2, -1, -1} },
66 },
67 [P4_EVENT_MEMORY_CANCEL] = {
68 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
69 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
70 .cntr = { {8, 9, -1}, {10, 11, -1} },
71 },
72 [P4_EVENT_MEMORY_COMPLETE] = {
73 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
74 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
75 .cntr = { {8, 9, -1}, {10, 11, -1} },
76 },
77 [P4_EVENT_LOAD_PORT_REPLAY] = {
78 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
79 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
80 .cntr = { {8, 9, -1}, {10, 11, -1} },
81 },
82 [P4_EVENT_STORE_PORT_REPLAY] = {
83 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
84 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
85 .cntr = { {8, 9, -1}, {10, 11, -1} },
86 },
87 [P4_EVENT_MOB_LOAD_REPLAY] = {
88 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
89 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
90 .cntr = { {0, -1, -1}, {2, -1, -1} },
91 },
92 [P4_EVENT_PAGE_WALK_TYPE] = {
93 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
94 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
95 .cntr = { {0, -1, -1}, {2, -1, -1} },
96 },
97 [P4_EVENT_BSQ_CACHE_REFERENCE] = {
98 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
99 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
100 .cntr = { {0, -1, -1}, {2, -1, -1} },
101 },
102 [P4_EVENT_IOQ_ALLOCATION] = {
103 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
104 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
105 .cntr = { {0, -1, -1}, {2, -1, -1} },
106 },
107 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
108 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
109 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
110 .cntr = { {2, -1, -1}, {3, -1, -1} },
111 },
112 [P4_EVENT_FSB_DATA_ACTIVITY] = {
113 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
114 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
115 .cntr = { {0, -1, -1}, {2, -1, -1} },
116 },
117 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
118 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
119 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
120 .cntr = { {0, -1, -1}, {1, -1, -1} },
121 },
122 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
123 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
124 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
125 .cntr = { {2, -1, -1}, {3, -1, -1} },
126 },
127 [P4_EVENT_SSE_INPUT_ASSIST] = {
128 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
129 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
130 .cntr = { {8, 9, -1}, {10, 11, -1} },
131 },
132 [P4_EVENT_PACKED_SP_UOP] = {
133 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
134 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
135 .cntr = { {8, 9, -1}, {10, 11, -1} },
136 },
137 [P4_EVENT_PACKED_DP_UOP] = {
138 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
139 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
140 .cntr = { {8, 9, -1}, {10, 11, -1} },
141 },
142 [P4_EVENT_SCALAR_SP_UOP] = {
143 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
144 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
145 .cntr = { {8, 9, -1}, {10, 11, -1} },
146 },
147 [P4_EVENT_SCALAR_DP_UOP] = {
148 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
149 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
150 .cntr = { {8, 9, -1}, {10, 11, -1} },
151 },
152 [P4_EVENT_64BIT_MMX_UOP] = {
153 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
154 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
155 .cntr = { {8, 9, -1}, {10, 11, -1} },
156 },
157 [P4_EVENT_128BIT_MMX_UOP] = {
158 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
159 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
160 .cntr = { {8, 9, -1}, {10, 11, -1} },
161 },
162 [P4_EVENT_X87_FP_UOP] = {
163 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
164 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
165 .cntr = { {8, 9, -1}, {10, 11, -1} },
166 },
167 [P4_EVENT_TC_MISC] = {
168 .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
169 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
170 .cntr = { {4, 5, -1}, {6, 7, -1} },
171 },
172 [P4_EVENT_GLOBAL_POWER_EVENTS] = {
173 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
174 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
175 .cntr = { {0, -1, -1}, {2, -1, -1} },
176 },
177 [P4_EVENT_TC_MS_XFER] = {
178 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
179 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
180 .cntr = { {4, 5, -1}, {6, 7, -1} },
181 },
182 [P4_EVENT_UOP_QUEUE_WRITES] = {
183 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
184 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
185 .cntr = { {4, 5, -1}, {6, 7, -1} },
186 },
187 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
188 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
189 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
190 .cntr = { {4, 5, -1}, {6, 7, -1} },
191 },
192 [P4_EVENT_RETIRED_BRANCH_TYPE] = {
193 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
194 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
195 .cntr = { {4, 5, -1}, {6, 7, -1} },
196 },
197 [P4_EVENT_RESOURCE_STALL] = {
198 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
199 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
200 .cntr = { {12, 13, 16}, {14, 15, 17} },
201 },
202 [P4_EVENT_WC_BUFFER] = {
203 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
204 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
205 .cntr = { {8, 9, -1}, {10, 11, -1} },
206 },
207 [P4_EVENT_B2B_CYCLES] = {
208 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
209 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
210 .cntr = { {0, -1, -1}, {2, -1, -1} },
211 },
212 [P4_EVENT_BNR] = {
213 .opcode = P4_OPCODE(P4_EVENT_BNR),
214 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
215 .cntr = { {0, -1, -1}, {2, -1, -1} },
216 },
217 [P4_EVENT_SNOOP] = {
218 .opcode = P4_OPCODE(P4_EVENT_SNOOP),
219 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
220 .cntr = { {0, -1, -1}, {2, -1, -1} },
221 },
222 [P4_EVENT_RESPONSE] = {
223 .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
224 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
225 .cntr = { {0, -1, -1}, {2, -1, -1} },
226 },
227 [P4_EVENT_FRONT_END_EVENT] = {
228 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
229 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
230 .cntr = { {12, 13, 16}, {14, 15, 17} },
231 },
232 [P4_EVENT_EXECUTION_EVENT] = {
233 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
234 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
235 .cntr = { {12, 13, 16}, {14, 15, 17} },
236 },
237 [P4_EVENT_REPLAY_EVENT] = {
238 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
239 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
240 .cntr = { {12, 13, 16}, {14, 15, 17} },
241 },
242 [P4_EVENT_INSTR_RETIRED] = {
243 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
244 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
245 .cntr = { {12, 13, 16}, {14, 15, 17} },
246 },
247 [P4_EVENT_UOPS_RETIRED] = {
248 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
249 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
250 .cntr = { {12, 13, 16}, {14, 15, 17} },
251 },
252 [P4_EVENT_UOP_TYPE] = {
253 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
254 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
255 .cntr = { {12, 13, 16}, {14, 15, 17} },
256 },
257 [P4_EVENT_BRANCH_RETIRED] = {
258 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
259 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
260 .cntr = { {12, 13, 16}, {14, 15, 17} },
261 },
262 [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
263 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
264 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
265 .cntr = { {12, 13, 16}, {14, 15, 17} },
266 },
267 [P4_EVENT_X87_ASSIST] = {
268 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
269 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
270 .cntr = { {12, 13, 16}, {14, 15, 17} },
271 },
272 [P4_EVENT_MACHINE_CLEAR] = {
273 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
274 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
275 .cntr = { {12, 13, 16}, {14, 15, 17} },
276 },
277 [P4_EVENT_INSTR_COMPLETED] = {
278 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
279 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
280 .cntr = { {12, 13, 16}, {14, 15, 17} },
281 },
282};
283
284#define P4_GEN_CACHE_EVENT(event, bit, cache_event) \
285 p4_config_pack_escr(P4_ESCR_EVENT(event) | \
286 P4_ESCR_EMASK_BIT(event, bit)) | \
287 p4_config_pack_cccr(cache_event | \
288 P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
289
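P4_GEN_CACHE_EVENT() builds on the p4_config_pack_escr()/p4_config_pack_cccr() helpers from perf_event_p4.h, which as I read them simply park the ESCR-destined bits and the CCCR-destined bits in opposite 32-bit halves of the u64 config. A hedged sketch of that packing; the half assignment and the register values below are assumptions, not taken from the header:

#include <stdint.h>
#include <stdio.h>

/* assumption: ESCR bits in the upper half, CCCR bits in the lower half */
static uint64_t pack_escr(uint32_t escr) { return (uint64_t)escr << 32; }
static uint64_t pack_cccr(uint32_t cccr) { return (uint64_t)cccr; }

int main(void)
{
	uint64_t config = pack_escr(0x0003b000) | pack_cccr(0x00039000);

	printf("config=%#llx escr=%#x cccr=%#x\n",
	       (unsigned long long)config,
	       (uint32_t)(config >> 32), (uint32_t)config);
	return 0;
}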
290static __initconst const u64 p4_hw_cache_event_ids
291 [PERF_COUNT_HW_CACHE_MAX]
292 [PERF_COUNT_HW_CACHE_OP_MAX]
293 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
294{
295 [ C(L1D ) ] = {
296 [ C(OP_READ) ] = {
297 [ C(RESULT_ACCESS) ] = 0x0,
298 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
299 P4_CACHE__1stl_cache_load_miss_retired),
300 },
301 },
302 [ C(LL ) ] = {
303 [ C(OP_READ) ] = {
304 [ C(RESULT_ACCESS) ] = 0x0,
305 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
306 P4_CACHE__2ndl_cache_load_miss_retired),
307 },
308},
309 [ C(DTLB) ] = {
310 [ C(OP_READ) ] = {
311 [ C(RESULT_ACCESS) ] = 0x0,
312 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
313 P4_CACHE__dtlb_load_miss_retired),
314 },
315 [ C(OP_WRITE) ] = {
316 [ C(RESULT_ACCESS) ] = 0x0,
317 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
318 P4_CACHE__dtlb_store_miss_retired),
319 },
320 },
321 [ C(ITLB) ] = {
322 [ C(OP_READ) ] = {
323 [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
324 P4_CACHE__itlb_reference_hit),
325 [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
326 P4_CACHE__itlb_reference_miss),
327 },
328 [ C(OP_WRITE) ] = {
329 [ C(RESULT_ACCESS) ] = -1,
330 [ C(RESULT_MISS) ] = -1,
331 },
332 [ C(OP_PREFETCH) ] = {
333 [ C(RESULT_ACCESS) ] = -1,
334 [ C(RESULT_MISS) ] = -1,
335 },
336 },
337};
338
339static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
340 /* non-halted CPU clocks */
341 [PERF_COUNT_HW_CPU_CYCLES] =
342 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
343 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
344
345 /*
346 * retired instructions
347	 * for the sake of simplicity we don't use the FSB tagging
348 */
349 [PERF_COUNT_HW_INSTRUCTIONS] =
350 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
351 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
352 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
353
354 /* cache hits */
355 [PERF_COUNT_HW_CACHE_REFERENCES] =
356 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
357 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
358 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
359 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
360 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
361 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
362 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
363
364 /* cache misses */
365 [PERF_COUNT_HW_CACHE_MISSES] =
366 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
367 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
368 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
369 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
370
371 /* branch instructions retired */
372 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
373 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) |
374 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
375 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
376 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
377 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
378
379 /* mispredicted branches retired */
380 [PERF_COUNT_HW_BRANCH_MISSES] =
381 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) |
382 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
383
384 /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */
385 [PERF_COUNT_HW_BUS_CYCLES] =
386 p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) |
387 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
388 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
389 p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
390};
391
392static struct p4_event_bind *p4_config_get_bind(u64 config)
393{
394 unsigned int evnt = p4_config_unpack_event(config);
395 struct p4_event_bind *bind = NULL;
396
397 if (evnt < ARRAY_SIZE(p4_event_bind_map))
398 bind = &p4_event_bind_map[evnt];
399
400 return bind;
401}
402
403static u64 p4_pmu_event_map(int hw_event)
404{
405 struct p4_event_bind *bind;
406 unsigned int esel;
407 u64 config;
408
409 if (hw_event > ARRAY_SIZE(p4_general_events)) {
410 printk_once(KERN_ERR "P4 PMU: Bad index: %i\n", hw_event);
411 return 0;
412 }
413
414 config = p4_general_events[hw_event];
415 bind = p4_config_get_bind(config);
416 esel = P4_OPCODE_ESEL(bind->opcode);
417 config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
418
419 return config;
420}
421
422static int p4_hw_config(struct perf_event *event)
423{
424 int cpu = raw_smp_processor_id();
425 u32 escr, cccr;
426
427 /*
428	 * the reason we use cpu this early is that if we get scheduled
429	 * on the same cpu for the first time we will not need to swap
430	 * thread-specific flags in config (and will save some cpu cycles)
431 */
432
433 cccr = p4_default_cccr_conf(cpu);
434 escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
435 event->attr.exclude_user);
436 event->hw.config = p4_config_pack_escr(escr) |
437 p4_config_pack_cccr(cccr);
438
439 if (p4_ht_active() && p4_ht_thread(cpu))
440 event->hw.config = p4_set_ht_bit(event->hw.config);
441
442 if (event->attr.type != PERF_TYPE_RAW)
443 return 0;
444
445 /*
446 * We don't control raw events so it's up to the caller
447 * to pass sane values (and we don't count the thread number
448	 * on an HT machine but allow HT-compatible specifics to be
449 * passed on)
450 *
451 * XXX: HT wide things should check perf_paranoid_cpu() &&
452 * CAP_SYS_ADMIN
453 */
454 event->hw.config |= event->attr.config &
455 (p4_config_pack_escr(P4_ESCR_MASK_HT) |
456 p4_config_pack_cccr(P4_CCCR_MASK_HT));
457
458 return 0;
459}
460
461static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
462{
463 unsigned long dummy;
464
465 rdmsrl(hwc->config_base + hwc->idx, dummy);
466 if (dummy & P4_CCCR_OVF) {
467 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
468 ((u64)dummy) & ~P4_CCCR_OVF);
469 }
470}
471
472static inline void p4_pmu_disable_event(struct perf_event *event)
473{
474 struct hw_perf_event *hwc = &event->hw;
475
476 /*
477 * If event gets disabled while counter is in overflowed
478 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
479 * asserted again and again
480 */
481 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
482 (u64)(p4_config_unpack_cccr(hwc->config)) &
483 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
484}
485
486static void p4_pmu_disable_all(void)
487{
488 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
489 int idx;
490
491 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
492 struct perf_event *event = cpuc->events[idx];
493 if (!test_bit(idx, cpuc->active_mask))
494 continue;
495 p4_pmu_disable_event(event);
496 }
497}
498
499static void p4_pmu_enable_event(struct perf_event *event)
500{
501 struct hw_perf_event *hwc = &event->hw;
502 int thread = p4_ht_config_thread(hwc->config);
503 u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
504 unsigned int idx = p4_config_unpack_event(hwc->config);
505 unsigned int idx_cache = p4_config_unpack_cache_event(hwc->config);
506 struct p4_event_bind *bind;
507 struct p4_cache_event_bind *bind_cache;
508 u64 escr_addr, cccr;
509
510 bind = &p4_event_bind_map[idx];
511 escr_addr = (u64)bind->escr_msr[thread];
512
513 /*
514	 * - we don't support cascaded counters yet
515 * - and counter 1 is broken (erratum)
516 */
517 WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
518 WARN_ON_ONCE(hwc->idx == 1);
519
520 /* we need a real Event value */
521 escr_conf &= ~P4_ESCR_EVENT_MASK;
522 escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
523
524 cccr = p4_config_unpack_cccr(hwc->config);
525
526 /*
527	 * it could be a cache event, so we need to
528	 * set metrics in additional MSRs
529 */
530 BUILD_BUG_ON(P4_CACHE__MAX > P4_CCCR_CACHE_OPS_MASK);
531 if (idx_cache > P4_CACHE__NONE &&
532 idx_cache < ARRAY_SIZE(p4_cache_event_bind_map)) {
533 bind_cache = &p4_cache_event_bind_map[idx_cache];
534 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind_cache->metric_pebs);
535 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind_cache->metric_vert);
536 }
537
538 (void)checking_wrmsrl(escr_addr, escr_conf);
539 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
540 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
541}
542
543static void p4_pmu_enable_all(int added)
544{
545 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
546 int idx;
547
548 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
549 struct perf_event *event = cpuc->events[idx];
550 if (!test_bit(idx, cpuc->active_mask))
551 continue;
552 p4_pmu_enable_event(event);
553 }
554}
555
556static int p4_pmu_handle_irq(struct pt_regs *regs)
557{
558 struct perf_sample_data data;
559 struct cpu_hw_events *cpuc;
560 struct perf_event *event;
561 struct hw_perf_event *hwc;
562 int idx, handled = 0;
563 u64 val;
564
565 data.addr = 0;
566 data.raw = NULL;
567
568 cpuc = &__get_cpu_var(cpu_hw_events);
569
570 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
571
572 if (!test_bit(idx, cpuc->active_mask))
573 continue;
574
575 event = cpuc->events[idx];
576 hwc = &event->hw;
577
578 WARN_ON_ONCE(hwc->idx != idx);
579
580 /*
581 * FIXME: Redundant call, actually not needed
582 * but just to check if we're screwed
583 */
584 p4_pmu_clear_cccr_ovf(hwc);
585
586 val = x86_perf_event_update(event);
587 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
588 continue;
589
590 /*
591 * event overflow
592 */
593 handled = 1;
594 data.period = event->hw.last_period;
595
596 if (!x86_perf_event_set_period(event))
597 continue;
598 if (perf_event_overflow(event, 1, &data, regs))
599 p4_pmu_disable_event(event);
600 }
601
602 if (handled) {
603 /* p4 quirk: unmask it again */
604 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
605 inc_irq_stat(apic_perf_irqs);
606 }
607
608 return handled;
609}
610
611/*
612 * swap thread specific fields according to a thread
613 * we are going to run on
614 */
615static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
616{
617 u32 escr, cccr;
618
619 /*
620	 * either we are lucky and continue on the same cpu, or there is no HT support
621 */
622 if (!p4_should_swap_ts(hwc->config, cpu))
623 return;
624
625 /*
626	 * the event is migrated from another logical
627	 * cpu, so we need to swap thread-specific flags
628 */
629
630 escr = p4_config_unpack_escr(hwc->config);
631 cccr = p4_config_unpack_cccr(hwc->config);
632
633 if (p4_ht_thread(cpu)) {
634 cccr &= ~P4_CCCR_OVF_PMI_T0;
635 cccr |= P4_CCCR_OVF_PMI_T1;
636 if (escr & P4_ESCR_T0_OS) {
637 escr &= ~P4_ESCR_T0_OS;
638 escr |= P4_ESCR_T1_OS;
639 }
640 if (escr & P4_ESCR_T0_USR) {
641 escr &= ~P4_ESCR_T0_USR;
642 escr |= P4_ESCR_T1_USR;
643 }
644 hwc->config = p4_config_pack_escr(escr);
645 hwc->config |= p4_config_pack_cccr(cccr);
646 hwc->config |= P4_CONFIG_HT;
647 } else {
648 cccr &= ~P4_CCCR_OVF_PMI_T1;
649 cccr |= P4_CCCR_OVF_PMI_T0;
650 if (escr & P4_ESCR_T1_OS) {
651 escr &= ~P4_ESCR_T1_OS;
652 escr |= P4_ESCR_T0_OS;
653 }
654 if (escr & P4_ESCR_T1_USR) {
655 escr &= ~P4_ESCR_T1_USR;
656 escr |= P4_ESCR_T0_USR;
657 }
658 hwc->config = p4_config_pack_escr(escr);
659 hwc->config |= p4_config_pack_cccr(cccr);
660 hwc->config &= ~P4_CONFIG_HT;
661 }
662}
663
664/* ESCRs are not sequential in memory so we need a map */
665static const unsigned int p4_escr_map[ARCH_P4_TOTAL_ESCR] = {
666 MSR_P4_ALF_ESCR0, /* 0 */
667 MSR_P4_ALF_ESCR1, /* 1 */
668 MSR_P4_BPU_ESCR0, /* 2 */
669 MSR_P4_BPU_ESCR1, /* 3 */
670 MSR_P4_BSU_ESCR0, /* 4 */
671 MSR_P4_BSU_ESCR1, /* 5 */
672 MSR_P4_CRU_ESCR0, /* 6 */
673 MSR_P4_CRU_ESCR1, /* 7 */
674 MSR_P4_CRU_ESCR2, /* 8 */
675 MSR_P4_CRU_ESCR3, /* 9 */
676 MSR_P4_CRU_ESCR4, /* 10 */
677 MSR_P4_CRU_ESCR5, /* 11 */
678 MSR_P4_DAC_ESCR0, /* 12 */
679 MSR_P4_DAC_ESCR1, /* 13 */
680 MSR_P4_FIRM_ESCR0, /* 14 */
681 MSR_P4_FIRM_ESCR1, /* 15 */
682 MSR_P4_FLAME_ESCR0, /* 16 */
683 MSR_P4_FLAME_ESCR1, /* 17 */
684 MSR_P4_FSB_ESCR0, /* 18 */
685 MSR_P4_FSB_ESCR1, /* 19 */
686 MSR_P4_IQ_ESCR0, /* 20 */
687 MSR_P4_IQ_ESCR1, /* 21 */
688 MSR_P4_IS_ESCR0, /* 22 */
689 MSR_P4_IS_ESCR1, /* 23 */
690 MSR_P4_ITLB_ESCR0, /* 24 */
691 MSR_P4_ITLB_ESCR1, /* 25 */
692 MSR_P4_IX_ESCR0, /* 26 */
693 MSR_P4_IX_ESCR1, /* 27 */
694 MSR_P4_MOB_ESCR0, /* 28 */
695 MSR_P4_MOB_ESCR1, /* 29 */
696 MSR_P4_MS_ESCR0, /* 30 */
697 MSR_P4_MS_ESCR1, /* 31 */
698 MSR_P4_PMH_ESCR0, /* 32 */
699 MSR_P4_PMH_ESCR1, /* 33 */
700 MSR_P4_RAT_ESCR0, /* 34 */
701 MSR_P4_RAT_ESCR1, /* 35 */
702 MSR_P4_SAAT_ESCR0, /* 36 */
703 MSR_P4_SAAT_ESCR1, /* 37 */
704 MSR_P4_SSU_ESCR0, /* 38 */
705 MSR_P4_SSU_ESCR1, /* 39 */
706 MSR_P4_TBPU_ESCR0, /* 40 */
707 MSR_P4_TBPU_ESCR1, /* 41 */
708 MSR_P4_TC_ESCR0, /* 42 */
709 MSR_P4_TC_ESCR1, /* 43 */
710 MSR_P4_U2L_ESCR0, /* 44 */
711 MSR_P4_U2L_ESCR1, /* 45 */
712};
713
714static int p4_get_escr_idx(unsigned int addr)
715{
716 unsigned int i;
717
718 for (i = 0; i < ARRAY_SIZE(p4_escr_map); i++) {
719 if (addr == p4_escr_map[i])
720 return i;
721 }
722
723 return -1;
724}
725
726static int p4_next_cntr(int thread, unsigned long *used_mask,
727 struct p4_event_bind *bind)
728{
729 int i = 0, j;
730
731 for (i = 0; i < P4_CNTR_LIMIT; i++) {
732 j = bind->cntr[thread][i++];
733 if (j == -1 || !test_bit(j, used_mask))
734 return j;
735 }
736
737 return -1;
738}
739
740static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
741{
742 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
743 unsigned long escr_mask[BITS_TO_LONGS(ARCH_P4_TOTAL_ESCR)];
744 int cpu = raw_smp_processor_id();
745 struct hw_perf_event *hwc;
746 struct p4_event_bind *bind;
747 unsigned int i, thread, num;
748 int cntr_idx, escr_idx;
749
750 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
751 bitmap_zero(escr_mask, ARCH_P4_TOTAL_ESCR);
752
753 for (i = 0, num = n; i < n; i++, num--) {
754
755 hwc = &cpuc->event_list[i]->hw;
756 thread = p4_ht_thread(cpu);
757 bind = p4_config_get_bind(hwc->config);
758 escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
759
760 if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
761 cntr_idx = hwc->idx;
762 if (assign)
763 assign[i] = hwc->idx;
764 goto reserve;
765 }
766
767 cntr_idx = p4_next_cntr(thread, used_mask, bind);
768 if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
769 goto done;
770
771 p4_pmu_swap_config_ts(hwc, cpu);
772 if (assign)
773 assign[i] = cntr_idx;
774reserve:
775 set_bit(cntr_idx, used_mask);
776 set_bit(escr_idx, escr_mask);
777 }
778
779done:
780 return num ? -ENOSPC : 0;
781}
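
p4_pmu_schedule_events() above walks the event list, keeps an already assigned counter when the HT thread did not change, and otherwise reserves both a free counter and the event's ESCR; two events that resolve to the same ESCR cannot run together. A minimal stand-alone model of that conflict check, with plain arrays standing in for the kernel bitmaps and purely hypothetical ESCR indices, might look like this:

/*
 * Hedged sketch, not kernel code: model the counter/ESCR reservation
 * done by p4_pmu_schedule_events(). Array sizes and ESCR numbers are
 * made up for illustration.
 */
#include <stdio.h>

#define NCOUNTERS 18
#define NESCRS    46

struct fake_event { const char *name; int escr; };

static int schedule(const struct fake_event *ev, int n, int *assign)
{
	char used_cntr[NCOUNTERS] = { 0 };
	char used_escr[NESCRS] = { 0 };
	int i, c;

	for (i = 0; i < n; i++) {
		if (used_escr[ev[i].escr])
			return -1;	/* ESCR already claimed, like -ENOSPC */
		for (c = 0; c < NCOUNTERS && used_cntr[c]; c++)
			;
		if (c == NCOUNTERS)
			return -1;	/* no free counter */
		used_cntr[c] = 1;
		used_escr[ev[i].escr] = 1;
		assign[i] = c;
	}
	return 0;
}

int main(void)
{
	const struct fake_event ev[] = {
		{ "event-a", 6 },	/* both events want ESCR 6 */
		{ "event-b", 6 },
	};
	int assign[2];

	printf("schedule: %d\n", schedule(ev, 2, assign));	/* prints -1 */
	return 0;
}
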
782
783static __initconst const struct x86_pmu p4_pmu = {
784 .name = "Netburst P4/Xeon",
785 .handle_irq = p4_pmu_handle_irq,
786 .disable_all = p4_pmu_disable_all,
787 .enable_all = p4_pmu_enable_all,
788 .enable = p4_pmu_enable_event,
789 .disable = p4_pmu_disable_event,
790 .eventsel = MSR_P4_BPU_CCCR0,
791 .perfctr = MSR_P4_BPU_PERFCTR0,
792 .event_map = p4_pmu_event_map,
793 .max_events = ARRAY_SIZE(p4_general_events),
794 .get_event_constraints = x86_get_event_constraints,
795 /*
796	 * If HT is disabled we may need to use all
797	 * ARCH_P4_MAX_CCCR counters simultaneously,
798	 * though leave it restricted for the moment,
799	 * assuming HT is on.
800 */
801 .num_counters = ARCH_P4_MAX_CCCR,
802 .apic = 1,
803 .cntval_bits = 40,
804 .cntval_mask = (1ULL << 40) - 1,
805 .max_period = (1ULL << 39) - 1,
806 .hw_config = p4_hw_config,
807 .schedule_events = p4_pmu_schedule_events,
808};
809
810static __init int p4_pmu_init(void)
811{
812 unsigned int low, high;
813
814	/* If we get stripped -- indexing fails */
815 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
816
817 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
818 if (!(low & (1 << 7))) {
819 pr_cont("unsupported Netburst CPU model %d ",
820 boot_cpu_data.x86_model);
821 return -ENODEV;
822 }
823
824 memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
825 sizeof(hw_cache_event_ids));
826
827 pr_cont("Netburst events, ");
828
829 x86_pmu = p4_pmu;
830
831 return 0;
832}
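
p4_pmu_init() refuses to run unless bit 7 of MSR_IA32_MISC_ENABLE ("Performance Monitoring Available") is set. The same bit can be inspected from user space through the msr driver; a hedged sketch, assuming the msr module is loaded and the program runs as root against cpu 0:

/*
 * Hedged sketch: read MSR_IA32_MISC_ENABLE (0x1a0) via /dev/cpu/0/msr
 * and test the "Performance Monitoring Available" bit. Requires the
 * msr kernel module and root; not part of this patch.
 */
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#define MSR_IA32_MISC_ENABLE 0x1a0

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0 || pread(fd, &val, sizeof(val), MSR_IA32_MISC_ENABLE) != sizeof(val)) {
		perror("msr");
		return 1;
	}
	printf("perfmon available: %s\n", (val & (1 << 7)) ? "yes" : "no");
	close(fd);
	return 0;
}
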
833
834#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index a330485d14d..34ba07be2cd 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event)
27 */ 27 */
28#define P6_NOP_EVENT 0x0000002EULL 28#define P6_NOP_EVENT 0x0000002EULL
29 29
30static u64 p6_pmu_raw_event(u64 hw_event)
31{
32#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
33#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
34#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
35#define P6_EVNTSEL_INV_MASK 0x00800000ULL
36#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
37
38#define P6_EVNTSEL_MASK \
39 (P6_EVNTSEL_EVENT_MASK | \
40 P6_EVNTSEL_UNIT_MASK | \
41 P6_EVNTSEL_EDGE_MASK | \
42 P6_EVNTSEL_INV_MASK | \
43 P6_EVNTSEL_REG_MASK)
44
45 return hw_event & P6_EVNTSEL_MASK;
46}
47
48static struct event_constraint p6_event_constraints[] = 30static struct event_constraint p6_event_constraints[] =
49{ 31{
50 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ 32 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
@@ -66,7 +48,7 @@ static void p6_pmu_disable_all(void)
66 wrmsrl(MSR_P6_EVNTSEL0, val); 48 wrmsrl(MSR_P6_EVNTSEL0, val);
67} 49}
68 50
69static void p6_pmu_enable_all(void) 51static void p6_pmu_enable_all(int added)
70{ 52{
71 unsigned long val; 53 unsigned long val;
72 54
@@ -102,22 +84,23 @@ static void p6_pmu_enable_event(struct perf_event *event)
102 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 84 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
103} 85}
104 86
105static __initconst struct x86_pmu p6_pmu = { 87static __initconst const struct x86_pmu p6_pmu = {
106 .name = "p6", 88 .name = "p6",
107 .handle_irq = x86_pmu_handle_irq, 89 .handle_irq = x86_pmu_handle_irq,
108 .disable_all = p6_pmu_disable_all, 90 .disable_all = p6_pmu_disable_all,
109 .enable_all = p6_pmu_enable_all, 91 .enable_all = p6_pmu_enable_all,
110 .enable = p6_pmu_enable_event, 92 .enable = p6_pmu_enable_event,
111 .disable = p6_pmu_disable_event, 93 .disable = p6_pmu_disable_event,
94 .hw_config = x86_pmu_hw_config,
95 .schedule_events = x86_schedule_events,
112 .eventsel = MSR_P6_EVNTSEL0, 96 .eventsel = MSR_P6_EVNTSEL0,
113 .perfctr = MSR_P6_PERFCTR0, 97 .perfctr = MSR_P6_PERFCTR0,
114 .event_map = p6_pmu_event_map, 98 .event_map = p6_pmu_event_map,
115 .raw_event = p6_pmu_raw_event,
116 .max_events = ARRAY_SIZE(p6_perfmon_event_map), 99 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
117 .apic = 1, 100 .apic = 1,
118 .max_period = (1ULL << 31) - 1, 101 .max_period = (1ULL << 31) - 1,
119 .version = 0, 102 .version = 0,
120 .num_events = 2, 103 .num_counters = 2,
121 /* 104 /*
122 * Events have 40 bits implemented. However they are designed such 105 * Events have 40 bits implemented. However they are designed such
123 * that bits [32-39] are sign extensions of bit 31. As such the 106 * that bits [32-39] are sign extensions of bit 31. As such the
@@ -125,8 +108,8 @@ static __initconst struct x86_pmu p6_pmu = {
125 * 108 *
126 * See IA-32 Intel Architecture Software developer manual Vol 3B 109 * See IA-32 Intel Architecture Software developer manual Vol 3B
127 */ 110 */
128 .event_bits = 32, 111 .cntval_bits = 32,
129 .event_mask = (1ULL << 32) - 1, 112 .cntval_mask = (1ULL << 32) - 1,
130 .get_event_constraints = x86_get_event_constraints, 113 .get_event_constraints = x86_get_event_constraints,
131 .event_constraints = p6_event_constraints, 114 .event_constraints = p6_event_constraints,
132}; 115};
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
deleted file mode 100644
index 1c47390dd0e..00000000000
--- a/arch/x86/kernel/ds.c
+++ /dev/null
@@ -1,1437 +0,0 @@
1/*
2 * Debug Store support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS).
7 *
8 * It manages:
9 * - DS and BTS hardware configuration
10 * - buffer overflow handling (to be done)
11 * - buffer access
12 *
13 * It does not do:
14 * - security checking (is the caller allowed to trace the task)
15 * - buffer allocation (memory accounting)
16 *
17 *
18 * Copyright (C) 2007-2009 Intel Corporation.
19 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
20 */
21
22#include <linux/kernel.h>
23#include <linux/string.h>
24#include <linux/errno.h>
25#include <linux/sched.h>
26#include <linux/slab.h>
27#include <linux/mm.h>
28#include <linux/trace_clock.h>
29
30#include <asm/ds.h>
31
32#include "ds_selftest.h"
33
34/*
35 * The configuration for a particular DS hardware implementation:
36 */
37struct ds_configuration {
38 /* The name of the configuration: */
39 const char *name;
40
41 /* The size of pointer-typed fields in DS, BTS, and PEBS: */
42 unsigned char sizeof_ptr_field;
43
44 /* The size of a BTS/PEBS record in bytes: */
45 unsigned char sizeof_rec[2];
46
47 /* The number of pebs counter reset values in the DS structure. */
48 unsigned char nr_counter_reset;
49
50 /* Control bit-masks indexed by enum ds_feature: */
51 unsigned long ctl[dsf_ctl_max];
52};
53static struct ds_configuration ds_cfg __read_mostly;
54
55
56/* Maximal size of a DS configuration: */
57#define MAX_SIZEOF_DS 0x80
58
59/* Maximal size of a BTS record: */
60#define MAX_SIZEOF_BTS (3 * 8)
61
62/* BTS and PEBS buffer alignment: */
63#define DS_ALIGNMENT (1 << 3)
64
65/* Number of buffer pointers in DS: */
66#define NUM_DS_PTR_FIELDS 8
67
68/* Size of a pebs reset value in DS: */
69#define PEBS_RESET_FIELD_SIZE 8
70
71/* Mask of control bits in the DS MSR register: */
72#define BTS_CONTROL \
73 ( ds_cfg.ctl[dsf_bts] | \
74 ds_cfg.ctl[dsf_bts_kernel] | \
75 ds_cfg.ctl[dsf_bts_user] | \
76 ds_cfg.ctl[dsf_bts_overflow] )
77
78/*
79 * A BTS or PEBS tracer.
80 *
81 * This holds the configuration of the tracer and serves as a handle
82 * to identify tracers.
83 */
84struct ds_tracer {
85 /* The DS context (partially) owned by this tracer. */
86 struct ds_context *context;
87 /* The buffer provided on ds_request() and its size in bytes. */
88 void *buffer;
89 size_t size;
90};
91
92struct bts_tracer {
93 /* The common DS part: */
94 struct ds_tracer ds;
95
96 /* The trace including the DS configuration: */
97 struct bts_trace trace;
98
99 /* Buffer overflow notification function: */
100 bts_ovfl_callback_t ovfl;
101
102 /* Active flags affecting trace collection. */
103 unsigned int flags;
104};
105
106struct pebs_tracer {
107 /* The common DS part: */
108 struct ds_tracer ds;
109
110 /* The trace including the DS configuration: */
111 struct pebs_trace trace;
112
113 /* Buffer overflow notification function: */
114 pebs_ovfl_callback_t ovfl;
115};
116
117/*
118 * Debug Store (DS) save area configuration (see Intel64 and IA32
119 * Architectures Software Developer's Manual, section 18.5)
120 *
121 * The DS configuration consists of the following fields; different
122 * architectures vary in the size of those fields.
123 *
124 * - double-word aligned base linear address of the BTS buffer
125 * - write pointer into the BTS buffer
126 * - end linear address of the BTS buffer (one byte beyond the end of
127 * the buffer)
128 * - interrupt pointer into BTS buffer
129 * (interrupt occurs when write pointer passes interrupt pointer)
130 * - double-word aligned base linear address of the PEBS buffer
131 * - write pointer into the PEBS buffer
132 * - end linear address of the PEBS buffer (one byte beyond the end of
133 * the buffer)
134 * - interrupt pointer into PEBS buffer
135 * (interrupt occurs when write pointer passes interrupt pointer)
136 * - value to which counter is reset following counter overflow
137 *
138 * Later architectures use 64bit pointers throughout, whereas earlier
139 * architectures use 32bit pointers in 32bit mode.
140 *
141 *
142 * We compute the base address for the first 8 fields based on:
143 * - the field size stored in the DS configuration
144 * - the relative field position
145 * - an offset giving the start of the respective region
146 *
147 * This offset is further used to index various arrays holding
148 * information for BTS and PEBS at the respective index.
149 *
150 * On later 32bit processors, we only access the lower 32bit of the
151 * 64bit pointer fields. The upper halves will be zeroed out.
152 */
153
154enum ds_field {
155 ds_buffer_base = 0,
156 ds_index,
157 ds_absolute_maximum,
158 ds_interrupt_threshold,
159};
160
161enum ds_qualifier {
162 ds_bts = 0,
163 ds_pebs
164};
165
166static inline unsigned long
167ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
168{
169 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
170 return *(unsigned long *)base;
171}
172
173static inline void
174ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
175 unsigned long value)
176{
177 base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
178 (*(unsigned long *)base) = value;
179}
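
The ds_get()/ds_set() helpers above address a field purely by arithmetic: four pointer-sized fields per qualifier, BTS first, then PEBS. A hedged stand-alone sketch of that offset computation, assuming the 64-bit layout (sizeof_ptr_field == 8):

/*
 * Hedged sketch of the ds_get()/ds_set() offset math, assuming
 * 8-byte pointer fields. Stand-alone illustration, not kernel code.
 */
#include <stdio.h>

enum ds_field { ds_buffer_base, ds_index, ds_absolute_maximum, ds_interrupt_threshold };
enum ds_qualifier { ds_bts, ds_pebs };

static unsigned long field_offset(enum ds_qualifier qual, enum ds_field field,
				  unsigned int sizeof_ptr_field)
{
	return sizeof_ptr_field * (field + 4 * qual);
}

int main(void)
{
	/* The PEBS index is the sixth field: (1 + 4 * 1) * 8 == 40 bytes in. */
	printf("PEBS ds_index offset: %lu\n", field_offset(ds_pebs, ds_index, 8));
	return 0;
}
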
180
181
182/*
183 * Locking is done only for allocating BTS or PEBS resources.
184 */
185static DEFINE_SPINLOCK(ds_lock);
186
187/*
188 * We either support (system-wide) per-cpu or per-thread allocation.
189 * We distinguish the two based on the task_struct pointer, where a
190 * NULL pointer indicates per-cpu allocation for the current cpu.
191 *
192 * Allocations are use-counted. As soon as resources are allocated,
193 * further allocations must be of the same type (per-cpu or
194 * per-thread). We model this by counting allocations (i.e. the number
195 * of tracers of a certain type) for one type negatively:
196 * =0 no tracers
197 * >0 number of per-thread tracers
198 * <0 number of per-cpu tracers
199 *
200 * The tracers count essentially gives the number of ds contexts for a certain
201 * type of allocation.
202 */
203static atomic_t tracers = ATOMIC_INIT(0);
204
205static inline int get_tracer(struct task_struct *task)
206{
207 int error;
208
209 spin_lock_irq(&ds_lock);
210
211 if (task) {
212 error = -EPERM;
213 if (atomic_read(&tracers) < 0)
214 goto out;
215 atomic_inc(&tracers);
216 } else {
217 error = -EPERM;
218 if (atomic_read(&tracers) > 0)
219 goto out;
220 atomic_dec(&tracers);
221 }
222
223 error = 0;
224out:
225 spin_unlock_irq(&ds_lock);
226 return error;
227}
228
229static inline void put_tracer(struct task_struct *task)
230{
231 if (task)
232 atomic_dec(&tracers);
233 else
234 atomic_inc(&tracers);
235}
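
A hedged model of the signed use count described above: task tracers push the count positive, cpu tracers push it negative, and a request of the other kind is refused while the count is non-zero (a plain int here instead of atomic_t, and no locking):

/*
 * Hedged stand-alone model of the signed 'tracers' count:
 * >0 per-thread tracers, <0 per-cpu tracers, mixing refused.
 */
#include <stdio.h>

static int tracers;

static int get_tracer(int per_thread)
{
	if (per_thread) {
		if (tracers < 0)
			return -1;	/* per-cpu tracing already active */
		tracers++;
	} else {
		if (tracers > 0)
			return -1;	/* per-thread tracing already active */
		tracers--;
	}
	return 0;
}

int main(void)
{
	/* Two task tracers succeed, a cpu tracer is then refused. */
	printf("%d %d %d\n", get_tracer(1), get_tracer(1), get_tracer(0));
	return 0;
}
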
236
237/*
238 * The DS context is either attached to a thread or to a cpu:
239 * - in the former case, the thread_struct contains a pointer to the
240 * attached context.
241 * - in the latter case, we use a static array of per-cpu context
242 * pointers.
243 *
244 * Contexts are use-counted. They are allocated on first access and
245 * deallocated when the last user puts the context.
246 */
247struct ds_context {
248 /* The DS configuration; goes into MSR_IA32_DS_AREA: */
249 unsigned char ds[MAX_SIZEOF_DS];
250
251 /* The owner of the BTS and PEBS configuration, respectively: */
252 struct bts_tracer *bts_master;
253 struct pebs_tracer *pebs_master;
254
255 /* Use count: */
256 unsigned long count;
257
258 /* Pointer to the context pointer field: */
259 struct ds_context **this;
260
261 /* The traced task; NULL for cpu tracing: */
262 struct task_struct *task;
263
264 /* The traced cpu; only valid if task is NULL: */
265 int cpu;
266};
267
268static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
269
270
271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
272{
273 struct ds_context **p_context =
274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
275 struct ds_context *context = NULL;
276 struct ds_context *new_context = NULL;
277
278 /* Chances are small that we already have a context. */
279 new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
280 if (!new_context)
281 return NULL;
282
283 spin_lock_irq(&ds_lock);
284
285 context = *p_context;
286 if (likely(!context)) {
287 context = new_context;
288
289 context->this = p_context;
290 context->task = task;
291 context->cpu = cpu;
292 context->count = 0;
293
294 *p_context = context;
295 }
296
297 context->count++;
298
299 spin_unlock_irq(&ds_lock);
300
301 if (context != new_context)
302 kfree(new_context);
303
304 return context;
305}
306
307static void ds_put_context(struct ds_context *context)
308{
309 struct task_struct *task;
310 unsigned long irq;
311
312 if (!context)
313 return;
314
315 spin_lock_irqsave(&ds_lock, irq);
316
317 if (--context->count) {
318 spin_unlock_irqrestore(&ds_lock, irq);
319 return;
320 }
321
322 *(context->this) = NULL;
323
324 task = context->task;
325
326 if (task)
327 clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
328
329 /*
330 * We leave the (now dangling) pointer to the DS configuration in
331 * the DS_AREA msr. This is as good or as bad as replacing it with
332 * NULL - the hardware would crash if we enabled tracing.
333 *
334 * This saves us some problems with having to write an msr on a
335 * different cpu while preventing others from doing the same for the
336 * next context for that same cpu.
337 */
338
339 spin_unlock_irqrestore(&ds_lock, irq);
340
341 /* The context might still be in use for context switching. */
342 if (task && (task != current))
343 wait_task_context_switch(task);
344
345 kfree(context);
346}
347
348static void ds_install_ds_area(struct ds_context *context)
349{
350 unsigned long ds;
351
352 ds = (unsigned long)context->ds;
353
354 /*
355 * There is a race between the bts master and the pebs master.
356 *
357 * The thread/cpu access is synchronized via get/put_cpu() for
358 * task tracing and via wrmsr_on_cpu for cpu tracing.
359 *
360 * If bts and pebs are collected for the same task or same cpu,
361	 * the same configuration is written twice.
362 */
363 if (context->task) {
364 get_cpu();
365 if (context->task == current)
366 wrmsrl(MSR_IA32_DS_AREA, ds);
367 set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
368 put_cpu();
369 } else
370 wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
371 (u32)((u64)ds), (u32)((u64)ds >> 32));
372}
373
374/*
375 * Call the tracer's callback on a buffer overflow.
376 *
377 * context: the ds context
378 * qual: the buffer type
379 */
380static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
381{
382 switch (qual) {
383 case ds_bts:
384 if (context->bts_master &&
385 context->bts_master->ovfl)
386 context->bts_master->ovfl(context->bts_master);
387 break;
388 case ds_pebs:
389 if (context->pebs_master &&
390 context->pebs_master->ovfl)
391 context->pebs_master->ovfl(context->pebs_master);
392 break;
393 }
394}
395
396
397/*
398 * Write raw data into the BTS or PEBS buffer.
399 *
400 * The remainder of any partially written record is zeroed out.
401 *
402 * context: the DS context
403 * qual: the buffer type
404 * record: the data to write
405 * size: the size of the data
406 */
407static int ds_write(struct ds_context *context, enum ds_qualifier qual,
408 const void *record, size_t size)
409{
410 int bytes_written = 0;
411
412 if (!record)
413 return -EINVAL;
414
415 while (size) {
416 unsigned long base, index, end, write_end, int_th;
417 unsigned long write_size, adj_write_size;
418
419 /*
420 * Write as much as possible without producing an
421 * overflow interrupt.
422 *
423 * Interrupt_threshold must either be
424 * - bigger than absolute_maximum or
425 * - point to a record between buffer_base and absolute_maximum
426 *
427 * Index points to a valid record.
428 */
429 base = ds_get(context->ds, qual, ds_buffer_base);
430 index = ds_get(context->ds, qual, ds_index);
431 end = ds_get(context->ds, qual, ds_absolute_maximum);
432 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
433
434 write_end = min(end, int_th);
435
436 /*
437 * If we are already beyond the interrupt threshold,
438 * we fill the entire buffer.
439 */
440 if (write_end <= index)
441 write_end = end;
442
443 if (write_end <= index)
444 break;
445
446 write_size = min((unsigned long) size, write_end - index);
447 memcpy((void *)index, record, write_size);
448
449 record = (const char *)record + write_size;
450 size -= write_size;
451 bytes_written += write_size;
452
453 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
454 adj_write_size *= ds_cfg.sizeof_rec[qual];
455
456 /* Zero out trailing bytes. */
457 memset((char *)index + write_size, 0,
458 adj_write_size - write_size);
459 index += adj_write_size;
460
461 if (index >= end)
462 index = base;
463 ds_set(context->ds, qual, ds_index, index);
464
465 if (index >= int_th)
466 ds_overflow(context, qual);
467 }
468
469 return bytes_written;
470}
471
472
473/*
474 * Branch Trace Store (BTS) uses the following format. Different
475 * architectures vary in the size of those fields.
476 * - source linear address
477 * - destination linear address
478 * - flags
479 *
480 * Later architectures use 64bit pointers throughout, whereas earlier
481 * architectures use 32bit pointers in 32bit mode.
482 *
483 * We compute the base address for the fields based on:
484 * - the field size stored in the DS configuration
485 * - the relative field position
486 *
487 * In order to store additional information in the BTS buffer, we use
488 * a special source address to indicate that the record requires
489 * special interpretation.
490 *
491 * Netburst indicated via a bit in the flags field whether the branch
492 * was predicted; this is ignored.
493 *
494 * We use two levels of abstraction:
495 * - the raw data level defined here
496 * - an arch-independent level defined in ds.h
497 */
498
499enum bts_field {
500 bts_from,
501 bts_to,
502 bts_flags,
503
504 bts_qual = bts_from,
505 bts_clock = bts_to,
506 bts_pid = bts_flags,
507
508 bts_qual_mask = (bts_qual_max - 1),
509 bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
510};
511
512static inline unsigned long bts_get(const char *base, unsigned long field)
513{
514 base += (ds_cfg.sizeof_ptr_field * field);
515 return *(unsigned long *)base;
516}
517
518static inline void bts_set(char *base, unsigned long field, unsigned long val)
519{
520 base += (ds_cfg.sizeof_ptr_field * field);
521 (*(unsigned long *)base) = val;
522}
523
524
525/*
526 * The raw BTS data is architecture dependent.
527 *
528 * For higher-level users, we give an arch-independent view.
529 * - ds.h defines struct bts_struct
530 * - bts_read translates one raw bts record into a bts_struct
531 * - bts_write translates one bts_struct into the raw format and
532 * writes it into the top of the parameter tracer's buffer.
533 *
534 * return: bytes read/written on success; -Eerrno, otherwise
535 */
536static int
537bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
538{
539 if (!tracer)
540 return -EINVAL;
541
542 if (at < tracer->trace.ds.begin)
543 return -EINVAL;
544
545 if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
546 return -EINVAL;
547
548 memset(out, 0, sizeof(*out));
549 if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
550 out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
551 out->variant.event.clock = bts_get(at, bts_clock);
552 out->variant.event.pid = bts_get(at, bts_pid);
553 } else {
554 out->qualifier = bts_branch;
555 out->variant.lbr.from = bts_get(at, bts_from);
556 out->variant.lbr.to = bts_get(at, bts_to);
557
558 if (!out->variant.lbr.from && !out->variant.lbr.to)
559 out->qualifier = bts_invalid;
560 }
561
562 return ds_cfg.sizeof_rec[ds_bts];
563}
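
bts_read() above distinguishes ordinary branch records from escape records that carry a timestamp and pid through the same three fields. A hedged sketch of the same decode on a hand-built record, assuming 8-byte fields and bts_qual_max <= 4:

/*
 * Hedged sketch of the bts_read() decode rules on a fabricated record.
 * Field width and escape mask are assumptions (8-byte fields,
 * bts_qual_max <= 4); the addresses are made up.
 */
#include <stdio.h>

struct raw_bts { unsigned long from, to, flags; };

#define QUAL_MASK	3UL
#define BTS_ESCAPE	(~0UL & ~QUAL_MASK)

static void decode(const struct raw_bts *rec)
{
	if ((rec->from & ~QUAL_MASK) == BTS_ESCAPE)
		printf("event %lu: clock=%lu pid=%lu\n",
		       rec->from & QUAL_MASK, rec->to, rec->flags);
	else if (rec->from || rec->to)
		printf("branch: %#lx -> %#lx\n", rec->from, rec->to);
	else
		printf("invalid record\n");
}

int main(void)
{
	struct raw_bts branch = { 0x1000, 0x2040, 0 };

	decode(&branch);
	return 0;
}
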
564
565static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
566{
567 unsigned char raw[MAX_SIZEOF_BTS];
568
569 if (!tracer)
570 return -EINVAL;
571
572 if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
573 return -EOVERFLOW;
574
575 switch (in->qualifier) {
576 case bts_invalid:
577 bts_set(raw, bts_from, 0);
578 bts_set(raw, bts_to, 0);
579 bts_set(raw, bts_flags, 0);
580 break;
581 case bts_branch:
582 bts_set(raw, bts_from, in->variant.lbr.from);
583 bts_set(raw, bts_to, in->variant.lbr.to);
584 bts_set(raw, bts_flags, 0);
585 break;
586 case bts_task_arrives:
587 case bts_task_departs:
588 bts_set(raw, bts_qual, (bts_escape | in->qualifier));
589 bts_set(raw, bts_clock, in->variant.event.clock);
590 bts_set(raw, bts_pid, in->variant.event.pid);
591 break;
592 default:
593 return -EINVAL;
594 }
595
596 return ds_write(tracer->ds.context, ds_bts, raw,
597 ds_cfg.sizeof_rec[ds_bts]);
598}
599
600
601static void ds_write_config(struct ds_context *context,
602 struct ds_trace *cfg, enum ds_qualifier qual)
603{
604 unsigned char *ds = context->ds;
605
606 ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
607 ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
608 ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
609 ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
610}
611
612static void ds_read_config(struct ds_context *context,
613 struct ds_trace *cfg, enum ds_qualifier qual)
614{
615 unsigned char *ds = context->ds;
616
617 cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
618 cfg->top = (void *)ds_get(ds, qual, ds_index);
619 cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
620 cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
621}
622
623static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
624 void *base, size_t size, size_t ith,
625 unsigned int flags) {
626 unsigned long buffer, adj;
627
628 /*
629 * Adjust the buffer address and size to meet alignment
630 * constraints:
631 * - buffer is double-word aligned
632 * - size is multiple of record size
633 *
634 * We checked the size at the very beginning; we have enough
635 * space to do the adjustment.
636 */
637 buffer = (unsigned long)base;
638
639 adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
640 buffer += adj;
641 size -= adj;
642
643 trace->n = size / ds_cfg.sizeof_rec[qual];
644 trace->size = ds_cfg.sizeof_rec[qual];
645
646 size = (trace->n * trace->size);
647
648 trace->begin = (void *)buffer;
649 trace->top = trace->begin;
650 trace->end = (void *)(buffer + size);
651 /*
652 * The value for 'no threshold' is -1, which will set the
653 * threshold outside of the buffer, just like we want it.
654 */
655 ith *= ds_cfg.sizeof_rec[qual];
656 trace->ith = (void *)(buffer + size - ith);
657
658 trace->flags = flags;
659}
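
With the selftest's numbers as hypothetical inputs (a 521-byte buffer, 24-byte BTS records, a base address ending in ...5), the adjustment above drops 3 leading bytes for alignment and 14 trailing bytes for record granularity, leaving 21 whole records. A hedged sketch of that arithmetic:

/*
 * Hedged worked example of the alignment/size trimming above, using a
 * hypothetical base address plus the 521-byte selftest buffer and the
 * 24-byte BTS record size as inputs.
 */
#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long base = 0x1005, size = 521, rec = 24;
	unsigned long adj = ALIGN_UP(base, 8) - base;	/* 3 */
	unsigned long n = (size - adj) / rec;		/* 21 records */

	printf("adj=%lu n=%lu usable=%lu\n", adj, n, n * rec);
	return 0;
}
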
660
661
662static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
663 enum ds_qualifier qual, struct task_struct *task,
664 int cpu, void *base, size_t size, size_t th)
665{
666 struct ds_context *context;
667 int error;
668 size_t req_size;
669
670 error = -EOPNOTSUPP;
671 if (!ds_cfg.sizeof_rec[qual])
672 goto out;
673
674 error = -EINVAL;
675 if (!base)
676 goto out;
677
678 req_size = ds_cfg.sizeof_rec[qual];
679 /* We might need space for alignment adjustments. */
680 if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
681 req_size += DS_ALIGNMENT;
682
683 error = -EINVAL;
684 if (size < req_size)
685 goto out;
686
687 if (th != (size_t)-1) {
688 th *= ds_cfg.sizeof_rec[qual];
689
690 error = -EINVAL;
691 if (size <= th)
692 goto out;
693 }
694
695 tracer->buffer = base;
696 tracer->size = size;
697
698 error = -ENOMEM;
699 context = ds_get_context(task, cpu);
700 if (!context)
701 goto out;
702 tracer->context = context;
703
704 /*
705 * Defer any tracer-specific initialization work for the context until
706 * context ownership has been clarified.
707 */
708
709 error = 0;
710 out:
711 return error;
712}
713
714static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
715 void *base, size_t size,
716 bts_ovfl_callback_t ovfl, size_t th,
717 unsigned int flags)
718{
719 struct bts_tracer *tracer;
720 int error;
721
722 /* Buffer overflow notification is not yet implemented. */
723 error = -EOPNOTSUPP;
724 if (ovfl)
725 goto out;
726
727 error = get_tracer(task);
728 if (error < 0)
729 goto out;
730
731 error = -ENOMEM;
732 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
733 if (!tracer)
734 goto out_put_tracer;
735 tracer->ovfl = ovfl;
736
737 /* Do some more error checking and acquire a tracing context. */
738 error = ds_request(&tracer->ds, &tracer->trace.ds,
739 ds_bts, task, cpu, base, size, th);
740 if (error < 0)
741 goto out_tracer;
742
743 /* Claim the bts part of the tracing context we acquired above. */
744 spin_lock_irq(&ds_lock);
745
746 error = -EPERM;
747 if (tracer->ds.context->bts_master)
748 goto out_unlock;
749 tracer->ds.context->bts_master = tracer;
750
751 spin_unlock_irq(&ds_lock);
752
753 /*
754 * Now that we own the bts part of the context, let's complete the
755 * initialization for that part.
756 */
757 ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
758 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
759 ds_install_ds_area(tracer->ds.context);
760
761 tracer->trace.read = bts_read;
762 tracer->trace.write = bts_write;
763
764 /* Start tracing. */
765 ds_resume_bts(tracer);
766
767 return tracer;
768
769 out_unlock:
770 spin_unlock_irq(&ds_lock);
771 ds_put_context(tracer->ds.context);
772 out_tracer:
773 kfree(tracer);
774 out_put_tracer:
775 put_tracer(task);
776 out:
777 return ERR_PTR(error);
778}
779
780struct bts_tracer *ds_request_bts_task(struct task_struct *task,
781 void *base, size_t size,
782 bts_ovfl_callback_t ovfl,
783 size_t th, unsigned int flags)
784{
785 return ds_request_bts(task, 0, base, size, ovfl, th, flags);
786}
787
788struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
789 bts_ovfl_callback_t ovfl,
790 size_t th, unsigned int flags)
791{
792 return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
793}
794
795static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
796 void *base, size_t size,
797 pebs_ovfl_callback_t ovfl, size_t th,
798 unsigned int flags)
799{
800 struct pebs_tracer *tracer;
801 int error;
802
803 /* Buffer overflow notification is not yet implemented. */
804 error = -EOPNOTSUPP;
805 if (ovfl)
806 goto out;
807
808 error = get_tracer(task);
809 if (error < 0)
810 goto out;
811
812 error = -ENOMEM;
813 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
814 if (!tracer)
815 goto out_put_tracer;
816 tracer->ovfl = ovfl;
817
818 /* Do some more error checking and acquire a tracing context. */
819 error = ds_request(&tracer->ds, &tracer->trace.ds,
820 ds_pebs, task, cpu, base, size, th);
821 if (error < 0)
822 goto out_tracer;
823
824 /* Claim the pebs part of the tracing context we acquired above. */
825 spin_lock_irq(&ds_lock);
826
827 error = -EPERM;
828 if (tracer->ds.context->pebs_master)
829 goto out_unlock;
830 tracer->ds.context->pebs_master = tracer;
831
832 spin_unlock_irq(&ds_lock);
833
834 /*
835 * Now that we own the pebs part of the context, let's complete the
836 * initialization for that part.
837 */
838 ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
839 ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
840 ds_install_ds_area(tracer->ds.context);
841
842 /* Start tracing. */
843 ds_resume_pebs(tracer);
844
845 return tracer;
846
847 out_unlock:
848 spin_unlock_irq(&ds_lock);
849 ds_put_context(tracer->ds.context);
850 out_tracer:
851 kfree(tracer);
852 out_put_tracer:
853 put_tracer(task);
854 out:
855 return ERR_PTR(error);
856}
857
858struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
859 void *base, size_t size,
860 pebs_ovfl_callback_t ovfl,
861 size_t th, unsigned int flags)
862{
863 return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
864}
865
866struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
867 pebs_ovfl_callback_t ovfl,
868 size_t th, unsigned int flags)
869{
870 return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
871}
872
873static void ds_free_bts(struct bts_tracer *tracer)
874{
875 struct task_struct *task;
876
877 task = tracer->ds.context->task;
878
879 WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
880 tracer->ds.context->bts_master = NULL;
881
882 /* Make sure tracing stopped and the tracer is not in use. */
883 if (task && (task != current))
884 wait_task_context_switch(task);
885
886 ds_put_context(tracer->ds.context);
887 put_tracer(task);
888
889 kfree(tracer);
890}
891
892void ds_release_bts(struct bts_tracer *tracer)
893{
894 might_sleep();
895
896 if (!tracer)
897 return;
898
899 ds_suspend_bts(tracer);
900 ds_free_bts(tracer);
901}
902
903int ds_release_bts_noirq(struct bts_tracer *tracer)
904{
905 struct task_struct *task;
906 unsigned long irq;
907 int error;
908
909 if (!tracer)
910 return 0;
911
912 task = tracer->ds.context->task;
913
914 local_irq_save(irq);
915
916 error = -EPERM;
917 if (!task &&
918 (tracer->ds.context->cpu != smp_processor_id()))
919 goto out;
920
921 error = -EPERM;
922 if (task && (task != current))
923 goto out;
924
925 ds_suspend_bts_noirq(tracer);
926 ds_free_bts(tracer);
927
928 error = 0;
929 out:
930 local_irq_restore(irq);
931 return error;
932}
933
934static void update_task_debugctlmsr(struct task_struct *task,
935 unsigned long debugctlmsr)
936{
937 task->thread.debugctlmsr = debugctlmsr;
938
939 get_cpu();
940 if (task == current)
941 update_debugctlmsr(debugctlmsr);
942 put_cpu();
943}
944
945void ds_suspend_bts(struct bts_tracer *tracer)
946{
947 struct task_struct *task;
948 unsigned long debugctlmsr;
949 int cpu;
950
951 if (!tracer)
952 return;
953
954 tracer->flags = 0;
955
956 task = tracer->ds.context->task;
957 cpu = tracer->ds.context->cpu;
958
959 WARN_ON(!task && irqs_disabled());
960
961 debugctlmsr = (task ?
962 task->thread.debugctlmsr :
963 get_debugctlmsr_on_cpu(cpu));
964 debugctlmsr &= ~BTS_CONTROL;
965
966 if (task)
967 update_task_debugctlmsr(task, debugctlmsr);
968 else
969 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
970}
971
972int ds_suspend_bts_noirq(struct bts_tracer *tracer)
973{
974 struct task_struct *task;
975 unsigned long debugctlmsr, irq;
976 int cpu, error = 0;
977
978 if (!tracer)
979 return 0;
980
981 tracer->flags = 0;
982
983 task = tracer->ds.context->task;
984 cpu = tracer->ds.context->cpu;
985
986 local_irq_save(irq);
987
988 error = -EPERM;
989 if (!task && (cpu != smp_processor_id()))
990 goto out;
991
992 debugctlmsr = (task ?
993 task->thread.debugctlmsr :
994 get_debugctlmsr());
995 debugctlmsr &= ~BTS_CONTROL;
996
997 if (task)
998 update_task_debugctlmsr(task, debugctlmsr);
999 else
1000 update_debugctlmsr(debugctlmsr);
1001
1002 error = 0;
1003 out:
1004 local_irq_restore(irq);
1005 return error;
1006}
1007
1008static unsigned long ds_bts_control(struct bts_tracer *tracer)
1009{
1010 unsigned long control;
1011
1012 control = ds_cfg.ctl[dsf_bts];
1013 if (!(tracer->trace.ds.flags & BTS_KERNEL))
1014 control |= ds_cfg.ctl[dsf_bts_kernel];
1015 if (!(tracer->trace.ds.flags & BTS_USER))
1016 control |= ds_cfg.ctl[dsf_bts_user];
1017
1018 return control;
1019}
1020
1021void ds_resume_bts(struct bts_tracer *tracer)
1022{
1023 struct task_struct *task;
1024 unsigned long debugctlmsr;
1025 int cpu;
1026
1027 if (!tracer)
1028 return;
1029
1030 tracer->flags = tracer->trace.ds.flags;
1031
1032 task = tracer->ds.context->task;
1033 cpu = tracer->ds.context->cpu;
1034
1035 WARN_ON(!task && irqs_disabled());
1036
1037 debugctlmsr = (task ?
1038 task->thread.debugctlmsr :
1039 get_debugctlmsr_on_cpu(cpu));
1040 debugctlmsr |= ds_bts_control(tracer);
1041
1042 if (task)
1043 update_task_debugctlmsr(task, debugctlmsr);
1044 else
1045 update_debugctlmsr_on_cpu(cpu, debugctlmsr);
1046}
1047
1048int ds_resume_bts_noirq(struct bts_tracer *tracer)
1049{
1050 struct task_struct *task;
1051 unsigned long debugctlmsr, irq;
1052 int cpu, error = 0;
1053
1054 if (!tracer)
1055 return 0;
1056
1057 tracer->flags = tracer->trace.ds.flags;
1058
1059 task = tracer->ds.context->task;
1060 cpu = tracer->ds.context->cpu;
1061
1062 local_irq_save(irq);
1063
1064 error = -EPERM;
1065 if (!task && (cpu != smp_processor_id()))
1066 goto out;
1067
1068 debugctlmsr = (task ?
1069 task->thread.debugctlmsr :
1070 get_debugctlmsr());
1071 debugctlmsr |= ds_bts_control(tracer);
1072
1073 if (task)
1074 update_task_debugctlmsr(task, debugctlmsr);
1075 else
1076 update_debugctlmsr(debugctlmsr);
1077
1078 error = 0;
1079 out:
1080 local_irq_restore(irq);
1081 return error;
1082}
1083
1084static void ds_free_pebs(struct pebs_tracer *tracer)
1085{
1086 struct task_struct *task;
1087
1088 task = tracer->ds.context->task;
1089
1090 WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
1091 tracer->ds.context->pebs_master = NULL;
1092
1093 ds_put_context(tracer->ds.context);
1094 put_tracer(task);
1095
1096 kfree(tracer);
1097}
1098
1099void ds_release_pebs(struct pebs_tracer *tracer)
1100{
1101 might_sleep();
1102
1103 if (!tracer)
1104 return;
1105
1106 ds_suspend_pebs(tracer);
1107 ds_free_pebs(tracer);
1108}
1109
1110int ds_release_pebs_noirq(struct pebs_tracer *tracer)
1111{
1112 struct task_struct *task;
1113 unsigned long irq;
1114 int error;
1115
1116 if (!tracer)
1117 return 0;
1118
1119 task = tracer->ds.context->task;
1120
1121 local_irq_save(irq);
1122
1123 error = -EPERM;
1124 if (!task &&
1125 (tracer->ds.context->cpu != smp_processor_id()))
1126 goto out;
1127
1128 error = -EPERM;
1129 if (task && (task != current))
1130 goto out;
1131
1132 ds_suspend_pebs_noirq(tracer);
1133 ds_free_pebs(tracer);
1134
1135 error = 0;
1136 out:
1137 local_irq_restore(irq);
1138 return error;
1139}
1140
1141void ds_suspend_pebs(struct pebs_tracer *tracer)
1142{
1143
1144}
1145
1146int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
1147{
1148 return 0;
1149}
1150
1151void ds_resume_pebs(struct pebs_tracer *tracer)
1152{
1153
1154}
1155
1156int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
1157{
1158 return 0;
1159}
1160
1161const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
1162{
1163 if (!tracer)
1164 return NULL;
1165
1166 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
1167 return &tracer->trace;
1168}
1169
1170const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
1171{
1172 if (!tracer)
1173 return NULL;
1174
1175 ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
1176
1177 tracer->trace.counters = ds_cfg.nr_counter_reset;
1178 memcpy(tracer->trace.counter_reset,
1179 tracer->ds.context->ds +
1180 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
1181 ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
1182
1183 return &tracer->trace;
1184}
1185
1186int ds_reset_bts(struct bts_tracer *tracer)
1187{
1188 if (!tracer)
1189 return -EINVAL;
1190
1191 tracer->trace.ds.top = tracer->trace.ds.begin;
1192
1193 ds_set(tracer->ds.context->ds, ds_bts, ds_index,
1194 (unsigned long)tracer->trace.ds.top);
1195
1196 return 0;
1197}
1198
1199int ds_reset_pebs(struct pebs_tracer *tracer)
1200{
1201 if (!tracer)
1202 return -EINVAL;
1203
1204 tracer->trace.ds.top = tracer->trace.ds.begin;
1205
1206 ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
1207 (unsigned long)tracer->trace.ds.top);
1208
1209 return 0;
1210}
1211
1212int ds_set_pebs_reset(struct pebs_tracer *tracer,
1213 unsigned int counter, u64 value)
1214{
1215 if (!tracer)
1216 return -EINVAL;
1217
1218 if (ds_cfg.nr_counter_reset < counter)
1219 return -EINVAL;
1220
1221 *(u64 *)(tracer->ds.context->ds +
1222 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
1223 (counter * PEBS_RESET_FIELD_SIZE)) = value;
1224
1225 return 0;
1226}
1227
1228static const struct ds_configuration ds_cfg_netburst = {
1229 .name = "Netburst",
1230 .ctl[dsf_bts] = (1 << 2) | (1 << 3),
1231 .ctl[dsf_bts_kernel] = (1 << 5),
1232 .ctl[dsf_bts_user] = (1 << 6),
1233 .nr_counter_reset = 1,
1234};
1235static const struct ds_configuration ds_cfg_pentium_m = {
1236 .name = "Pentium M",
1237 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1238 .nr_counter_reset = 1,
1239};
1240static const struct ds_configuration ds_cfg_core2_atom = {
1241 .name = "Core 2/Atom",
1242 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1243 .ctl[dsf_bts_kernel] = (1 << 9),
1244 .ctl[dsf_bts_user] = (1 << 10),
1245 .nr_counter_reset = 1,
1246};
1247static const struct ds_configuration ds_cfg_core_i7 = {
1248 .name = "Core i7",
1249 .ctl[dsf_bts] = (1 << 6) | (1 << 7),
1250 .ctl[dsf_bts_kernel] = (1 << 9),
1251 .ctl[dsf_bts_user] = (1 << 10),
1252 .nr_counter_reset = 4,
1253};
1254
1255static void
1256ds_configure(const struct ds_configuration *cfg,
1257 struct cpuinfo_x86 *cpu)
1258{
1259 unsigned long nr_pebs_fields = 0;
1260
1261 printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
1262
1263#ifdef __i386__
1264 nr_pebs_fields = 10;
1265#else
1266 nr_pebs_fields = 18;
1267#endif
1268
1269 /*
1270 * Starting with version 2, architectural performance
1271 * monitoring supports a format specifier.
1272 */
1273 if ((cpuid_eax(0xa) & 0xff) > 1) {
1274 unsigned long perf_capabilities, format;
1275
1276 rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
1277
1278 format = (perf_capabilities >> 8) & 0xf;
1279
1280 switch (format) {
1281 case 0:
1282 nr_pebs_fields = 18;
1283 break;
1284 case 1:
1285 nr_pebs_fields = 22;
1286 break;
1287 default:
1288 printk(KERN_INFO
1289 "[ds] unknown PEBS format: %lu\n", format);
1290 nr_pebs_fields = 0;
1291 break;
1292 }
1293 }
1294
1295 memset(&ds_cfg, 0, sizeof(ds_cfg));
1296 ds_cfg = *cfg;
1297
1298 ds_cfg.sizeof_ptr_field =
1299 (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
1300
1301 ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
1302 ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
1303
1304 if (!cpu_has(cpu, X86_FEATURE_BTS)) {
1305 ds_cfg.sizeof_rec[ds_bts] = 0;
1306 printk(KERN_INFO "[ds] bts not available\n");
1307 }
1308 if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
1309 ds_cfg.sizeof_rec[ds_pebs] = 0;
1310 printk(KERN_INFO "[ds] pebs not available\n");
1311 }
1312
1313 printk(KERN_INFO "[ds] sizes: address: %u bit, ",
1314 8 * ds_cfg.sizeof_ptr_field);
1315 printk("bts/pebs record: %u/%u bytes\n",
1316 ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
1317
1318 WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
1319}
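
ds_configure() keys the PEBS record size off the architectural perfmon version (cpuid leaf 0xa) and the format field of IA32_PERF_CAPABILITIES. The version check can be reproduced from user space with GCC's cpuid.h; a hedged sketch (the MSR read itself would still need the msr driver and root):

/*
 * Hedged sketch: query the architectural perfmon version leaf (0xa)
 * from user space, mirroring the (cpuid_eax(0xa) & 0xff) > 1 test.
 * Uses GCC's cpuid.h; not part of this patch.
 */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx))
		return 1;		/* leaf not supported */
	printf("arch perfmon version: %u\n", eax & 0xff);
	return 0;
}
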
1320
1321void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
1322{
1323 /* Only configure the first cpu. Others are identical. */
1324 if (ds_cfg.name)
1325 return;
1326
1327 switch (c->x86) {
1328 case 0x6:
1329 switch (c->x86_model) {
1330 case 0x9:
1331 case 0xd: /* Pentium M */
1332 ds_configure(&ds_cfg_pentium_m, c);
1333 break;
1334 case 0xf:
1335 case 0x17: /* Core2 */
1336 case 0x1c: /* Atom */
1337 ds_configure(&ds_cfg_core2_atom, c);
1338 break;
1339 case 0x1a: /* Core i7 */
1340 ds_configure(&ds_cfg_core_i7, c);
1341 break;
1342 default:
1343 /* Sorry, don't know about them. */
1344 break;
1345 }
1346 break;
1347 case 0xf:
1348 switch (c->x86_model) {
1349 case 0x0:
1350 case 0x1:
1351 case 0x2: /* Netburst */
1352 ds_configure(&ds_cfg_netburst, c);
1353 break;
1354 default:
1355 /* Sorry, don't know about them. */
1356 break;
1357 }
1358 break;
1359 default:
1360 /* Sorry, don't know about them. */
1361 break;
1362 }
1363}
1364
1365static inline void ds_take_timestamp(struct ds_context *context,
1366 enum bts_qualifier qualifier,
1367 struct task_struct *task)
1368{
1369 struct bts_tracer *tracer = context->bts_master;
1370 struct bts_struct ts;
1371
1372 /* Prevent compilers from reading the tracer pointer twice. */
1373 barrier();
1374
1375 if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
1376 return;
1377
1378 memset(&ts, 0, sizeof(ts));
1379 ts.qualifier = qualifier;
1380 ts.variant.event.clock = trace_clock_global();
1381 ts.variant.event.pid = task->pid;
1382
1383 bts_write(tracer, &ts);
1384}
1385
1386/*
1387 * Change the DS configuration from tracing prev to tracing next.
1388 */
1389void ds_switch_to(struct task_struct *prev, struct task_struct *next)
1390{
1391 struct ds_context *prev_ctx = prev->thread.ds_ctx;
1392 struct ds_context *next_ctx = next->thread.ds_ctx;
1393 unsigned long debugctlmsr = next->thread.debugctlmsr;
1394
1395 /* Make sure all data is read before we start. */
1396 barrier();
1397
1398 if (prev_ctx) {
1399 update_debugctlmsr(0);
1400
1401 ds_take_timestamp(prev_ctx, bts_task_departs, prev);
1402 }
1403
1404 if (next_ctx) {
1405 ds_take_timestamp(next_ctx, bts_task_arrives, next);
1406
1407 wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
1408 }
1409
1410 update_debugctlmsr(debugctlmsr);
1411}
1412
1413static __init int ds_selftest(void)
1414{
1415 if (ds_cfg.sizeof_rec[ds_bts]) {
1416 int error;
1417
1418 error = ds_selftest_bts();
1419 if (error) {
1420 WARN(1, "[ds] selftest failed. disabling bts.\n");
1421 ds_cfg.sizeof_rec[ds_bts] = 0;
1422 }
1423 }
1424
1425 if (ds_cfg.sizeof_rec[ds_pebs]) {
1426 int error;
1427
1428 error = ds_selftest_pebs();
1429 if (error) {
1430 WARN(1, "[ds] selftest failed. disabling pebs.\n");
1431 ds_cfg.sizeof_rec[ds_pebs] = 0;
1432 }
1433 }
1434
1435 return 0;
1436}
1437device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
deleted file mode 100644
index 6bc7c199ab9..00000000000
--- a/arch/x86/kernel/ds_selftest.c
+++ /dev/null
@@ -1,408 +0,0 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#include "ds_selftest.h"
10
11#include <linux/kernel.h>
12#include <linux/string.h>
13#include <linux/smp.h>
14#include <linux/cpu.h>
15
16#include <asm/ds.h>
17
18
19#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */
20#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */
21
22struct ds_selftest_bts_conf {
23 struct bts_tracer *tracer;
24 int error;
25 int (*suspend)(struct bts_tracer *);
26 int (*resume)(struct bts_tracer *);
27};
28
29static int ds_selftest_bts_consistency(const struct bts_trace *trace)
30{
31 int error = 0;
32
33 if (!trace) {
34 printk(KERN_CONT "failed to access trace...");
35 /* Bail out. Other tests are pointless. */
36 return -1;
37 }
38
39 if (!trace->read) {
40 printk(KERN_CONT "bts read not available...");
41 error = -1;
42 }
43
44 /* Do some sanity checks on the trace configuration. */
45 if (!trace->ds.n) {
46 printk(KERN_CONT "empty bts buffer...");
47 error = -1;
48 }
49 if (!trace->ds.size) {
50 printk(KERN_CONT "bad bts trace setup...");
51 error = -1;
52 }
53 if (trace->ds.end !=
54 (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
55 printk(KERN_CONT "bad bts buffer setup...");
56 error = -1;
57 }
58 /*
59 * We allow top in [begin; end], since it's not clear when the
60 * overflow adjustment happens: after the increment or before the
61 * write.
62 */
63 if ((trace->ds.top < trace->ds.begin) ||
64 (trace->ds.end < trace->ds.top)) {
65 printk(KERN_CONT "bts top out of bounds...");
66 error = -1;
67 }
68
69 return error;
70}
71
72static int ds_selftest_bts_read(struct bts_tracer *tracer,
73 const struct bts_trace *trace,
74 const void *from, const void *to)
75{
76 const unsigned char *at;
77
78 /*
79 * Check a few things which do not belong to this test.
80 * They should be covered by other tests.
81 */
82 if (!trace)
83 return -1;
84
85 if (!trace->read)
86 return -1;
87
88 if (to < from)
89 return -1;
90
91 if (from < trace->ds.begin)
92 return -1;
93
94 if (trace->ds.end < to)
95 return -1;
96
97 if (!trace->ds.size)
98 return -1;
99
100 /* Now to the test itself. */
101 for (at = from; (void *)at < to; at += trace->ds.size) {
102 struct bts_struct bts;
103 unsigned long index;
104 int error;
105
106 if (((void *)at - trace->ds.begin) % trace->ds.size) {
107 printk(KERN_CONT
108 "read from non-integer index...");
109 return -1;
110 }
111 index = ((void *)at - trace->ds.begin) / trace->ds.size;
112
113 memset(&bts, 0, sizeof(bts));
114 error = trace->read(tracer, at, &bts);
115 if (error < 0) {
116 printk(KERN_CONT
117 "error reading bts trace at [%lu] (0x%p)...",
118 index, at);
119 return error;
120 }
121
122 switch (bts.qualifier) {
123 case BTS_BRANCH:
124 break;
125 default:
126 printk(KERN_CONT
127 "unexpected bts entry %llu at [%lu] (0x%p)...",
128 bts.qualifier, index, at);
129 return -1;
130 }
131 }
132
133 return 0;
134}
135
136static void ds_selftest_bts_cpu(void *arg)
137{
138 struct ds_selftest_bts_conf *conf = arg;
139 const struct bts_trace *trace;
140 void *top;
141
142 if (IS_ERR(conf->tracer)) {
143 conf->error = PTR_ERR(conf->tracer);
144 conf->tracer = NULL;
145
146 printk(KERN_CONT
147 "initialization failed (err: %d)...", conf->error);
148 return;
149 }
150
151	/* By now we should have collected enough trace. */
152 conf->error = conf->suspend(conf->tracer);
153 if (conf->error < 0)
154 return;
155
156 /* Let's see if we can access the trace. */
157 trace = ds_read_bts(conf->tracer);
158
159 conf->error = ds_selftest_bts_consistency(trace);
160 if (conf->error < 0)
161 return;
162
163 /* If everything went well, we should have a few trace entries. */
164 if (trace->ds.top == trace->ds.begin) {
165 /*
166 * It is possible but highly unlikely that we got a
167 * buffer overflow and end up at exactly the same
168 * position we started from.
169 * Let's issue a warning, but continue.
170 */
171 printk(KERN_CONT "no trace/overflow...");
172 }
173
174 /* Let's try to read the trace we collected. */
175 conf->error =
176 ds_selftest_bts_read(conf->tracer, trace,
177 trace->ds.begin, trace->ds.top);
178 if (conf->error < 0)
179 return;
180
181 /*
182 * Let's read the trace again.
183 * Since we suspended tracing, we should get the same result.
184 */
185 top = trace->ds.top;
186
187 trace = ds_read_bts(conf->tracer);
188 conf->error = ds_selftest_bts_consistency(trace);
189 if (conf->error < 0)
190 return;
191
192 if (top != trace->ds.top) {
193 printk(KERN_CONT "suspend not working...");
194 conf->error = -1;
195 return;
196 }
197
198 /* Let's collect some more trace - see if resume is working. */
199 conf->error = conf->resume(conf->tracer);
200 if (conf->error < 0)
201 return;
202
203 conf->error = conf->suspend(conf->tracer);
204 if (conf->error < 0)
205 return;
206
207 trace = ds_read_bts(conf->tracer);
208
209 conf->error = ds_selftest_bts_consistency(trace);
210 if (conf->error < 0)
211 return;
212
213 if (trace->ds.top == top) {
214 /*
215 * It is possible but highly unlikely that we got a
216 * buffer overflow and end up at exactly the same
217 * position we started from.
218 * Let's issue a warning and check the full trace.
219 */
220 printk(KERN_CONT
221 "no resume progress/overflow...");
222
223 conf->error =
224 ds_selftest_bts_read(conf->tracer, trace,
225 trace->ds.begin, trace->ds.end);
226 } else if (trace->ds.top < top) {
227 /*
228 * We had a buffer overflow - the entire buffer should
229 * contain trace records.
230 */
231 conf->error =
232 ds_selftest_bts_read(conf->tracer, trace,
233 trace->ds.begin, trace->ds.end);
234 } else {
235 /*
236 * It is quite likely that the buffer did not overflow.
237 * Let's just check the delta trace.
238 */
239 conf->error =
240 ds_selftest_bts_read(conf->tracer, trace, top,
241 trace->ds.top);
242 }
243 if (conf->error < 0)
244 return;
245
246 conf->error = 0;
247}
248
249static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
250{
251 ds_suspend_bts(tracer);
252 return 0;
253}
254
255static int ds_resume_bts_wrap(struct bts_tracer *tracer)
256{
257 ds_resume_bts(tracer);
258 return 0;
259}
260
261static void ds_release_bts_noirq_wrap(void *tracer)
262{
263 (void)ds_release_bts_noirq(tracer);
264}
265
266static int ds_selftest_bts_bad_release_noirq(int cpu,
267 struct bts_tracer *tracer)
268{
269 int error = -EPERM;
270
271 /* Try to release the tracer on the wrong cpu. */
272 get_cpu();
273 if (cpu != smp_processor_id()) {
274 error = ds_release_bts_noirq(tracer);
275 if (error != -EPERM)
276 printk(KERN_CONT "release on wrong cpu...");
277 }
278 put_cpu();
279
280 return error ? 0 : -1;
281}
282
283static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
284{
285 struct bts_tracer *tracer;
286 int error;
287
288 /* Try to request cpu tracing while task tracing is active. */
289 tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
290 (size_t)-1, BTS_KERNEL);
291 error = PTR_ERR(tracer);
292 if (!IS_ERR(tracer)) {
293 ds_release_bts(tracer);
294 error = 0;
295 }
296
297 if (error != -EPERM)
298 printk(KERN_CONT "cpu/task tracing overlap...");
299
300 return error ? 0 : -1;
301}
302
303static int ds_selftest_bts_bad_request_task(void *buffer)
304{
305 struct bts_tracer *tracer;
306 int error;
307
308	/* Try to request task tracing while cpu tracing is active. */
309 tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
310 (size_t)-1, BTS_KERNEL);
311 error = PTR_ERR(tracer);
312 if (!IS_ERR(tracer)) {
313 error = 0;
314 ds_release_bts(tracer);
315 }
316
317 if (error != -EPERM)
318 printk(KERN_CONT "task/cpu tracing overlap...");
319
320 return error ? 0 : -1;
321}
322
323int ds_selftest_bts(void)
324{
325 struct ds_selftest_bts_conf conf;
326 unsigned char buffer[BUFFER_SIZE], *small_buffer;
327 unsigned long irq;
328 int cpu;
329
330 printk(KERN_INFO "[ds] bts selftest...");
331 conf.error = 0;
332
333 small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
334
335 get_online_cpus();
336 for_each_online_cpu(cpu) {
337 conf.suspend = ds_suspend_bts_wrap;
338 conf.resume = ds_resume_bts_wrap;
339 conf.tracer =
340 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
341 NULL, (size_t)-1, BTS_KERNEL);
342 ds_selftest_bts_cpu(&conf);
343 if (conf.error >= 0)
344 conf.error = ds_selftest_bts_bad_request_task(buffer);
345 ds_release_bts(conf.tracer);
346 if (conf.error < 0)
347 goto out;
348
349 conf.suspend = ds_suspend_bts_noirq;
350 conf.resume = ds_resume_bts_noirq;
351 conf.tracer =
352 ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
353 NULL, (size_t)-1, BTS_KERNEL);
354 smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
355 if (conf.error >= 0) {
356 conf.error =
357 ds_selftest_bts_bad_release_noirq(cpu,
358 conf.tracer);
359 /* We must not release the tracer twice. */
360 if (conf.error < 0)
361 conf.tracer = NULL;
362 }
363 if (conf.error >= 0)
364 conf.error = ds_selftest_bts_bad_request_task(buffer);
365 smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
366 conf.tracer, 1);
367 if (conf.error < 0)
368 goto out;
369 }
370
371 conf.suspend = ds_suspend_bts_wrap;
372 conf.resume = ds_resume_bts_wrap;
373 conf.tracer =
374 ds_request_bts_task(current, buffer, BUFFER_SIZE,
375 NULL, (size_t)-1, BTS_KERNEL);
376 ds_selftest_bts_cpu(&conf);
377 if (conf.error >= 0)
378 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
379 ds_release_bts(conf.tracer);
380 if (conf.error < 0)
381 goto out;
382
383 conf.suspend = ds_suspend_bts_noirq;
384 conf.resume = ds_resume_bts_noirq;
385 conf.tracer =
386 ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
387 NULL, (size_t)-1, BTS_KERNEL);
388 local_irq_save(irq);
389 ds_selftest_bts_cpu(&conf);
390 if (conf.error >= 0)
391 conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
392 ds_release_bts_noirq(conf.tracer);
393 local_irq_restore(irq);
394 if (conf.error < 0)
395 goto out;
396
397 conf.error = 0;
398 out:
399 put_online_cpus();
400 printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
401
402 return conf.error;
403}
404
405int ds_selftest_pebs(void)
406{
407 return 0;
408}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
deleted file mode 100644
index 2ba8745c666..00000000000
--- a/arch/x86/kernel/ds_selftest.h
+++ /dev/null
@@ -1,15 +0,0 @@
1/*
2 * Debug Store support - selftest
3 *
4 *
5 * Copyright (C) 2009 Intel Corporation.
6 * Markus Metzger <markus.t.metzger@intel.com>, 2009
7 */
8
9#ifdef CONFIG_X86_DS_SELFTEST
10extern int ds_selftest_bts(void);
11extern int ds_selftest_pebs(void);
12#else
13static inline int ds_selftest_bts(void) { return 0; }
14static inline int ds_selftest_pebs(void) { return 0; }
15#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6d817554780..c89a386930b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -224,11 +224,6 @@ unsigned __kprobes long oops_begin(void)
224 int cpu; 224 int cpu;
225 unsigned long flags; 225 unsigned long flags;
226 226
227 /* notify the hw-branch tracer so it may disable tracing and
228 add the last trace to the trace buffer -
229 the earlier this happens, the more useful the trace. */
230 trace_hw_branch_oops();
231
232 oops_enter(); 227 oops_enter();
233 228
234 /* racy, but better than risking deadlock. */ 229 /* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b43bbaebe2c..f2f56c0967b 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -422,14 +422,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
422 422
423static void __kprobes clear_btf(void) 423static void __kprobes clear_btf(void)
424{ 424{
425 if (test_thread_flag(TIF_DEBUGCTLMSR)) 425 if (test_thread_flag(TIF_BLOCKSTEP)) {
426 update_debugctlmsr(0); 426 unsigned long debugctl = get_debugctlmsr();
427
428 debugctl &= ~DEBUGCTLMSR_BTF;
429 update_debugctlmsr(debugctl);
430 }
427} 431}
428 432
429static void __kprobes restore_btf(void) 433static void __kprobes restore_btf(void)
430{ 434{
431 if (test_thread_flag(TIF_DEBUGCTLMSR)) 435 if (test_thread_flag(TIF_BLOCKSTEP)) {
432 update_debugctlmsr(current->thread.debugctlmsr); 436 unsigned long debugctl = get_debugctlmsr();
437
438 debugctl |= DEBUGCTLMSR_BTF;
439 update_debugctlmsr(debugctl);
440 }
433} 441}
434 442
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 443void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 28ad9f4d8b9..eccdb57094e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -20,7 +20,6 @@
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <asm/i387.h> 22#include <asm/i387.h>
23#include <asm/ds.h>
24#include <asm/debugreg.h> 23#include <asm/debugreg.h>
25 24
26unsigned long idle_halt; 25unsigned long idle_halt;
@@ -50,8 +49,6 @@ void free_thread_xstate(struct task_struct *tsk)
50 kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); 49 kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
51 tsk->thread.xstate = NULL; 50 tsk->thread.xstate = NULL;
52 } 51 }
53
54 WARN(tsk->thread.ds_ctx, "leaking DS context\n");
55} 52}
56 53
57void free_thread_info(struct thread_info *ti) 54void free_thread_info(struct thread_info *ti)
@@ -198,11 +195,16 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
198 prev = &prev_p->thread; 195 prev = &prev_p->thread;
199 next = &next_p->thread; 196 next = &next_p->thread;
200 197
201 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || 198 if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
202 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) 199 test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
203 ds_switch_to(prev_p, next_p); 200 unsigned long debugctl = get_debugctlmsr();
204 else if (next->debugctlmsr != prev->debugctlmsr) 201
205 update_debugctlmsr(next->debugctlmsr); 202 debugctl &= ~DEBUGCTLMSR_BTF;
203 if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
204 debugctl |= DEBUGCTLMSR_BTF;
205
206 update_debugctlmsr(debugctl);
207 }
206 208
207 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 209 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
208 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 210 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
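
Note: __switch_to_xtra() no longer reloads a saved thread.debugctlmsr; it reprograms the MSR only when prev and next disagree on TIF_BLOCKSTEP, using the same XOR-of-thread-flags test already applied to TIF_NOTSC just below. Reduced to its core (taken directly from the hunk), the idiom is:

        /* Touch the MSR only when the flag actually changes across the switch. */
        if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
            test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
                unsigned long debugctl = get_debugctlmsr();

                debugctl &= ~DEBUGCTLMSR_BTF;
                if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
                        debugctl |= DEBUGCTLMSR_BTF;
                update_debugctlmsr(debugctl);
        }

which avoids a wrmsr on every context switch in the common case where neither task is block-stepping.
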
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c62667e30..75090c589b7 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,7 +55,6 @@
55#include <asm/cpu.h> 55#include <asm/cpu.h>
56#include <asm/idle.h> 56#include <asm/idle.h>
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/ds.h>
59#include <asm/debugreg.h> 58#include <asm/debugreg.h>
60 59
61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
@@ -238,13 +237,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
238 kfree(p->thread.io_bitmap_ptr); 237 kfree(p->thread.io_bitmap_ptr);
239 p->thread.io_bitmap_max = 0; 238 p->thread.io_bitmap_max = 0;
240 } 239 }
241
242 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
243 p->thread.ds_ctx = NULL;
244
245 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
246 p->thread.debugctlmsr = 0;
247
248 return err; 240 return err;
249} 241}
250 242
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index dc9690b4c4c..cc4258f2beb 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,7 +49,6 @@
49#include <asm/ia32.h> 49#include <asm/ia32.h>
50#include <asm/idle.h> 50#include <asm/idle.h>
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/ds.h>
53#include <asm/debugreg.h> 52#include <asm/debugreg.h>
54 53
55asmlinkage extern void ret_from_fork(void); 54asmlinkage extern void ret_from_fork(void);
@@ -313,13 +312,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
313 if (err) 312 if (err)
314 goto out; 313 goto out;
315 } 314 }
316
317 clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
318 p->thread.ds_ctx = NULL;
319
320 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
321 p->thread.debugctlmsr = 0;
322
323 err = 0; 315 err = 0;
324out: 316out:
325 if (err && p->thread.io_bitmap_ptr) { 317 if (err && p->thread.io_bitmap_ptr) {
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 2e9b55027b7..055be0afd33 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -2,9 +2,6 @@
2/* 2/*
3 * Pentium III FXSR, SSE support 3 * Pentium III FXSR, SSE support
4 * Gareth Hughes <gareth@valinux.com>, May 2000 4 * Gareth Hughes <gareth@valinux.com>, May 2000
5 *
6 * BTS tracing
7 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
8 */ 5 */
9 6
10#include <linux/kernel.h> 7#include <linux/kernel.h>
@@ -22,7 +19,6 @@
22#include <linux/audit.h> 19#include <linux/audit.h>
23#include <linux/seccomp.h> 20#include <linux/seccomp.h>
24#include <linux/signal.h> 21#include <linux/signal.h>
25#include <linux/workqueue.h>
26#include <linux/perf_event.h> 22#include <linux/perf_event.h>
27#include <linux/hw_breakpoint.h> 23#include <linux/hw_breakpoint.h>
28 24
@@ -36,7 +32,6 @@
36#include <asm/desc.h> 32#include <asm/desc.h>
37#include <asm/prctl.h> 33#include <asm/prctl.h>
38#include <asm/proto.h> 34#include <asm/proto.h>
39#include <asm/ds.h>
40#include <asm/hw_breakpoint.h> 35#include <asm/hw_breakpoint.h>
41 36
42#include "tls.h" 37#include "tls.h"
@@ -789,342 +784,6 @@ static int ioperm_get(struct task_struct *target,
789 0, IO_BITMAP_BYTES); 784 0, IO_BITMAP_BYTES);
790} 785}
791 786
792#ifdef CONFIG_X86_PTRACE_BTS
793/*
794 * A branch trace store context.
795 *
796 * Contexts may only be installed by ptrace_bts_config() and only for
797 * ptraced tasks.
798 *
799 * Contexts are destroyed when the tracee is detached from the tracer.
800 * The actual destruction work requires interrupts enabled, so the
801 * work is deferred and will be scheduled during __ptrace_unlink().
802 *
803 * Contexts hold an additional task_struct reference on the traced
804 * task, as well as a reference on the tracer's mm.
805 *
806 * Ptrace already holds a task_struct for the duration of ptrace operations,
807 * but since destruction is deferred, it may be executed after both
808 * tracer and tracee exited.
809 */
810struct bts_context {
811 /* The branch trace handle. */
812 struct bts_tracer *tracer;
813
814 /* The buffer used to store the branch trace and its size. */
815 void *buffer;
816 unsigned int size;
817
818 /* The mm that paid for the above buffer. */
819 struct mm_struct *mm;
820
821 /* The task this context belongs to. */
822 struct task_struct *task;
823
824 /* The signal to send on a bts buffer overflow. */
825 unsigned int bts_ovfl_signal;
826
827 /* The work struct to destroy a context. */
828 struct work_struct work;
829};
830
831static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
832{
833 void *buffer = NULL;
834 int err = -ENOMEM;
835
836 err = account_locked_memory(current->mm, current->signal->rlim, size);
837 if (err < 0)
838 return err;
839
840 buffer = kzalloc(size, GFP_KERNEL);
841 if (!buffer)
842 goto out_refund;
843
844 context->buffer = buffer;
845 context->size = size;
846 context->mm = get_task_mm(current);
847
848 return 0;
849
850 out_refund:
851 refund_locked_memory(current->mm, size);
852 return err;
853}
854
855static inline void free_bts_buffer(struct bts_context *context)
856{
857 if (!context->buffer)
858 return;
859
860 kfree(context->buffer);
861 context->buffer = NULL;
862
863 refund_locked_memory(context->mm, context->size);
864 context->size = 0;
865
866 mmput(context->mm);
867 context->mm = NULL;
868}
869
870static void free_bts_context_work(struct work_struct *w)
871{
872 struct bts_context *context;
873
874 context = container_of(w, struct bts_context, work);
875
876 ds_release_bts(context->tracer);
877 put_task_struct(context->task);
878 free_bts_buffer(context);
879 kfree(context);
880}
881
882static inline void free_bts_context(struct bts_context *context)
883{
884 INIT_WORK(&context->work, free_bts_context_work);
885 schedule_work(&context->work);
886}
887
888static inline struct bts_context *alloc_bts_context(struct task_struct *task)
889{
890 struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
891 if (context) {
892 context->task = task;
893 task->bts = context;
894
895 get_task_struct(task);
896 }
897
898 return context;
899}
900
901static int ptrace_bts_read_record(struct task_struct *child, size_t index,
902 struct bts_struct __user *out)
903{
904 struct bts_context *context;
905 const struct bts_trace *trace;
906 struct bts_struct bts;
907 const unsigned char *at;
908 int error;
909
910 context = child->bts;
911 if (!context)
912 return -ESRCH;
913
914 trace = ds_read_bts(context->tracer);
915 if (!trace)
916 return -ESRCH;
917
918 at = trace->ds.top - ((index + 1) * trace->ds.size);
919 if ((void *)at < trace->ds.begin)
920 at += (trace->ds.n * trace->ds.size);
921
922 if (!trace->read)
923 return -EOPNOTSUPP;
924
925 error = trace->read(context->tracer, at, &bts);
926 if (error < 0)
927 return error;
928
929 if (copy_to_user(out, &bts, sizeof(bts)))
930 return -EFAULT;
931
932 return sizeof(bts);
933}
934
935static int ptrace_bts_drain(struct task_struct *child,
936 long size,
937 struct bts_struct __user *out)
938{
939 struct bts_context *context;
940 const struct bts_trace *trace;
941 const unsigned char *at;
942 int error, drained = 0;
943
944 context = child->bts;
945 if (!context)
946 return -ESRCH;
947
948 trace = ds_read_bts(context->tracer);
949 if (!trace)
950 return -ESRCH;
951
952 if (!trace->read)
953 return -EOPNOTSUPP;
954
955 if (size < (trace->ds.top - trace->ds.begin))
956 return -EIO;
957
958 for (at = trace->ds.begin; (void *)at < trace->ds.top;
959 out++, drained++, at += trace->ds.size) {
960 struct bts_struct bts;
961
962 error = trace->read(context->tracer, at, &bts);
963 if (error < 0)
964 return error;
965
966 if (copy_to_user(out, &bts, sizeof(bts)))
967 return -EFAULT;
968 }
969
970 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
971
972 error = ds_reset_bts(context->tracer);
973 if (error < 0)
974 return error;
975
976 return drained;
977}
978
979static int ptrace_bts_config(struct task_struct *child,
980 long cfg_size,
981 const struct ptrace_bts_config __user *ucfg)
982{
983 struct bts_context *context;
984 struct ptrace_bts_config cfg;
985 unsigned int flags = 0;
986
987 if (cfg_size < sizeof(cfg))
988 return -EIO;
989
990 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
991 return -EFAULT;
992
993 context = child->bts;
994 if (!context)
995 context = alloc_bts_context(child);
996 if (!context)
997 return -ENOMEM;
998
999 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
1000 if (!cfg.signal)
1001 return -EINVAL;
1002
1003 return -EOPNOTSUPP;
1004 context->bts_ovfl_signal = cfg.signal;
1005 }
1006
1007 ds_release_bts(context->tracer);
1008 context->tracer = NULL;
1009
1010 if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
1011 int err;
1012
1013 free_bts_buffer(context);
1014 if (!cfg.size)
1015 return 0;
1016
1017 err = alloc_bts_buffer(context, cfg.size);
1018 if (err < 0)
1019 return err;
1020 }
1021
1022 if (cfg.flags & PTRACE_BTS_O_TRACE)
1023 flags |= BTS_USER;
1024
1025 if (cfg.flags & PTRACE_BTS_O_SCHED)
1026 flags |= BTS_TIMESTAMPS;
1027
1028 context->tracer =
1029 ds_request_bts_task(child, context->buffer, context->size,
1030 NULL, (size_t)-1, flags);
1031 if (unlikely(IS_ERR(context->tracer))) {
1032 int error = PTR_ERR(context->tracer);
1033
1034 free_bts_buffer(context);
1035 context->tracer = NULL;
1036 return error;
1037 }
1038
1039 return sizeof(cfg);
1040}
1041
1042static int ptrace_bts_status(struct task_struct *child,
1043 long cfg_size,
1044 struct ptrace_bts_config __user *ucfg)
1045{
1046 struct bts_context *context;
1047 const struct bts_trace *trace;
1048 struct ptrace_bts_config cfg;
1049
1050 context = child->bts;
1051 if (!context)
1052 return -ESRCH;
1053
1054 if (cfg_size < sizeof(cfg))
1055 return -EIO;
1056
1057 trace = ds_read_bts(context->tracer);
1058 if (!trace)
1059 return -ESRCH;
1060
1061 memset(&cfg, 0, sizeof(cfg));
1062 cfg.size = trace->ds.end - trace->ds.begin;
1063 cfg.signal = context->bts_ovfl_signal;
1064 cfg.bts_size = sizeof(struct bts_struct);
1065
1066 if (cfg.signal)
1067 cfg.flags |= PTRACE_BTS_O_SIGNAL;
1068
1069 if (trace->ds.flags & BTS_USER)
1070 cfg.flags |= PTRACE_BTS_O_TRACE;
1071
1072 if (trace->ds.flags & BTS_TIMESTAMPS)
1073 cfg.flags |= PTRACE_BTS_O_SCHED;
1074
1075 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
1076 return -EFAULT;
1077
1078 return sizeof(cfg);
1079}
1080
1081static int ptrace_bts_clear(struct task_struct *child)
1082{
1083 struct bts_context *context;
1084 const struct bts_trace *trace;
1085
1086 context = child->bts;
1087 if (!context)
1088 return -ESRCH;
1089
1090 trace = ds_read_bts(context->tracer);
1091 if (!trace)
1092 return -ESRCH;
1093
1094 memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
1095
1096 return ds_reset_bts(context->tracer);
1097}
1098
1099static int ptrace_bts_size(struct task_struct *child)
1100{
1101 struct bts_context *context;
1102 const struct bts_trace *trace;
1103
1104 context = child->bts;
1105 if (!context)
1106 return -ESRCH;
1107
1108 trace = ds_read_bts(context->tracer);
1109 if (!trace)
1110 return -ESRCH;
1111
1112 return (trace->ds.top - trace->ds.begin) / trace->ds.size;
1113}
1114
1115/*
1116 * Called from __ptrace_unlink() after the child has been moved back
1117 * to its original parent.
1118 */
1119void ptrace_bts_untrace(struct task_struct *child)
1120{
1121 if (unlikely(child->bts)) {
1122 free_bts_context(child->bts);
1123 child->bts = NULL;
1124 }
1125}
1126#endif /* CONFIG_X86_PTRACE_BTS */
1127
1128/* 787/*
1129 * Called by kernel/ptrace.c when detaching.. 788 * Called by kernel/ptrace.c when detaching..
1130 * 789 *
@@ -1252,39 +911,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1252 break; 911 break;
1253#endif 912#endif
1254 913
1255 /*
1256 * These bits need more cooking - not enabled yet:
1257 */
1258#ifdef CONFIG_X86_PTRACE_BTS
1259 case PTRACE_BTS_CONFIG:
1260 ret = ptrace_bts_config
1261 (child, data, (struct ptrace_bts_config __user *)addr);
1262 break;
1263
1264 case PTRACE_BTS_STATUS:
1265 ret = ptrace_bts_status
1266 (child, data, (struct ptrace_bts_config __user *)addr);
1267 break;
1268
1269 case PTRACE_BTS_SIZE:
1270 ret = ptrace_bts_size(child);
1271 break;
1272
1273 case PTRACE_BTS_GET:
1274 ret = ptrace_bts_read_record
1275 (child, data, (struct bts_struct __user *) addr);
1276 break;
1277
1278 case PTRACE_BTS_CLEAR:
1279 ret = ptrace_bts_clear(child);
1280 break;
1281
1282 case PTRACE_BTS_DRAIN:
1283 ret = ptrace_bts_drain
1284 (child, data, (struct bts_struct __user *) addr);
1285 break;
1286#endif /* CONFIG_X86_PTRACE_BTS */
1287
1288 default: 914 default:
1289 ret = ptrace_request(child, request, addr, data); 915 ret = ptrace_request(child, request, addr, data);
1290 break; 916 break;
@@ -1544,14 +1170,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1544 1170
1545 case PTRACE_GET_THREAD_AREA: 1171 case PTRACE_GET_THREAD_AREA:
1546 case PTRACE_SET_THREAD_AREA: 1172 case PTRACE_SET_THREAD_AREA:
1547#ifdef CONFIG_X86_PTRACE_BTS
1548 case PTRACE_BTS_CONFIG:
1549 case PTRACE_BTS_STATUS:
1550 case PTRACE_BTS_SIZE:
1551 case PTRACE_BTS_GET:
1552 case PTRACE_BTS_CLEAR:
1553 case PTRACE_BTS_DRAIN:
1554#endif /* CONFIG_X86_PTRACE_BTS */
1555 return arch_ptrace(child, request, addr, data); 1173 return arch_ptrace(child, request, addr, data);
1556 1174
1557 default: 1175 default:
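
Note: one detail worth keeping from the removed ptrace_bts_read_record(): BTS records live in a circular buffer and are addressed backwards from ds.top, wrapping into [begin, begin + n*size). A stand-alone sketch of that indexing arithmetic with generic types instead of the kernel's bts_trace structures (bts_record() is a hypothetical name; the pointer arithmetic mirrors the removed code):

        #include <stddef.h>

        /*
         * Return a pointer to the record `index` entries behind `top`,
         * wrapping around to the end of the buffer when the computed
         * position falls before `begin`, exactly as the removed BTS code did.
         */
        static const unsigned char *bts_record(const unsigned char *begin,
                                               const unsigned char *top,
                                               size_t n, size_t size, size_t index)
        {
                const unsigned char *at = top - (index + 1) * size;

                if (at < begin)
                        at += n * size;

                return at;
        }
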
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 3149032ff10..58de45ee08b 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -158,22 +158,6 @@ static int enable_single_step(struct task_struct *child)
158} 158}
159 159
160/* 160/*
161 * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
162 */
163static void write_debugctlmsr(struct task_struct *child, unsigned long val)
164{
165 if (child->thread.debugctlmsr == val)
166 return;
167
168 child->thread.debugctlmsr = val;
169
170 if (child != current)
171 return;
172
173 update_debugctlmsr(val);
174}
175
176/*
177 * Enable single or block step. 161 * Enable single or block step.
178 */ 162 */
179static void enable_step(struct task_struct *child, bool block) 163static void enable_step(struct task_struct *child, bool block)
@@ -186,15 +170,17 @@ static void enable_step(struct task_struct *child, bool block)
186 * that uses user-mode single stepping itself. 170 * that uses user-mode single stepping itself.
187 */ 171 */
188 if (enable_single_step(child) && block) { 172 if (enable_single_step(child) && block) {
189 set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 173 unsigned long debugctl = get_debugctlmsr();
190 write_debugctlmsr(child, 174
191 child->thread.debugctlmsr | DEBUGCTLMSR_BTF); 175 debugctl |= DEBUGCTLMSR_BTF;
192 } else { 176 update_debugctlmsr(debugctl);
193 write_debugctlmsr(child, 177 set_tsk_thread_flag(child, TIF_BLOCKSTEP);
194 child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); 178 } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
195 179 unsigned long debugctl = get_debugctlmsr();
196 if (!child->thread.debugctlmsr) 180
197 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 181 debugctl &= ~DEBUGCTLMSR_BTF;
182 update_debugctlmsr(debugctl);
183 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
198 } 184 }
199} 185}
200 186
@@ -213,11 +199,13 @@ void user_disable_single_step(struct task_struct *child)
213 /* 199 /*
214 * Make sure block stepping (BTF) is disabled. 200 * Make sure block stepping (BTF) is disabled.
215 */ 201 */
216 write_debugctlmsr(child, 202 if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
217 child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); 203 unsigned long debugctl = get_debugctlmsr();
218 204
219 if (!child->thread.debugctlmsr) 205 debugctl &= ~DEBUGCTLMSR_BTF;
220 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 206 update_debugctlmsr(debugctl);
207 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
208 }
221 209
222 /* Always clear TIF_SINGLESTEP... */ 210 /* Always clear TIF_SINGLESTEP... */
223 clear_tsk_thread_flag(child, TIF_SINGLESTEP); 211 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
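
Note: with write_debugctlmsr() gone, block stepping is carried purely by TIF_BLOCKSTEP plus the live MSR, and it is driven from ptrace's x86-specific PTRACE_SINGLEBLOCK request. A minimal user-space tracer sketch (error handling trimmed; PTRACE_SINGLEBLOCK is 33 on x86 and may not be exposed by every libc header):

        #include <sys/ptrace.h>
        #include <sys/types.h>
        #include <sys/wait.h>
        #include <unistd.h>

        #ifndef PTRACE_SINGLEBLOCK
        #define PTRACE_SINGLEBLOCK 33           /* x86 value from asm/ptrace-abi.h */
        #endif

        int main(void)
        {
                pid_t pid = fork();

                if (pid == 0) {                         /* tracee */
                        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                        execl("/bin/true", "true", (char *)NULL);
                        _exit(1);
                }

                waitpid(pid, NULL, 0);                  /* stopped at exec's SIGTRAP */

                /*
                 * Resume one branch at a time: with BTF set, each SIGTRAP stop
                 * is delivered at the target of the next taken branch rather
                 * than at the next instruction.
                 */
                while (ptrace(PTRACE_SINGLEBLOCK, pid, NULL, NULL) == 0) {
                        int status;

                        if (waitpid(pid, &status, 0) < 0 || WIFEXITED(status))
                                break;
                }

                return 0;
        }
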
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e445418..36f1bd9f8e7 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -543,11 +543,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
543 543
544 /* DR6 may or may not be cleared by the CPU */ 544 /* DR6 may or may not be cleared by the CPU */
545 set_debugreg(0, 6); 545 set_debugreg(0, 6);
546
546 /* 547 /*
547 * The processor cleared BTF, so don't mark that we need it set. 548 * The processor cleared BTF, so don't mark that we need it set.
548 */ 549 */
549 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 550 clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
550 tsk->thread.debugctlmsr = 0;
551 551
552 /* Store the virtualized DR6 value */ 552 /* Store the virtualized DR6 value */
553 tsk->thread.debugreg6 = dr6; 553 tsk->thread.debugreg6 = dr6;