author		Alexander Shishkin <alexander.shishkin@linux.intel.com>	2015-01-30 05:39:52 -0500
committer	Ingo Molnar <mingo@kernel.org>				2015-04-02 11:14:20 -0400
commit		52ca9ced3f70779589e6ecc329baffe69d8f5f7a (patch)
tree		3e29af8a16f3ba9a8738480775cc58ac0aeb86b2
parent		4807034248bed58d49a4f9f450c024e3b0f58577 (diff)
perf/x86/intel/pt: Add Intel PT PMU driver
Add support for Intel Processor Trace (PT) to the kernel's perf events. PT
is an extension of Intel Architecture that collects information about
software execution, such as control flow, execution modes and timings, and
formats it into highly compressed binary packets. Even being compressed,
these packets are generated at hundreds of megabytes per second per core,
which makes it impractical to decode them on the fly in the kernel.

This driver exports trace data through the AUX space in the perf ring
buffer, which is zero-copy mapped into userspace for faster data retrieval.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1422614392-114498-1-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
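[ Illustration only, not part of this patch: a minimal sketch of how a
  userspace consumer might map and read the AUX area this driver fills.
  It assumes the AUX mmap interface introduced earlier in this series
  (aux_offset/aux_size/aux_head/aux_tail in struct perf_event_mmap_page)
  and the standard sysfs "type" attribute of the registered intel_pt PMU;
  the buffer sizes and most error handling are illustrative. ]

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <linux/perf_event.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	struct perf_event_mmap_page *pc;
	size_t page = sysconf(_SC_PAGESIZE);
	void *base, *aux;
	FILE *f;
	int fd, type;

	/* dynamic PMU type of the "intel_pt" event source registered by this driver */
	f = fopen("/sys/bus/event_source/devices/intel_pt/type", "r");
	if (!f || fscanf(f, "%d", &type) != 1)
		return 1;
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.disabled = 1;		/* enable only after buffers are mapped */
	attr.exclude_kernel = 1;	/* trace userspace only */

	fd = perf_event_open(&attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	/* regular perf ring buffer: one metadata page + 2^n data pages */
	base = mmap(NULL, (1 + 4) * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return 1;
	pc = base;

	/* describe the AUX area in the metadata page, then map it at that offset */
	pc->aux_offset = (1 + 4) * page;
	pc->aux_size   = 16 * page;
	aux = mmap(NULL, pc->aux_size, PROT_READ | PROT_WRITE, MAP_SHARED,
		   fd, pc->aux_offset);
	if (aux == MAP_FAILED)
		return 1;

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

	/*
	 * PT packet data now accumulates between pc->aux_tail and pc->aux_head
	 * (modulo aux_size); a reader consumes it and then advances
	 * pc->aux_tail, with the usual memory barriers around the head/tail
	 * accesses.
	 */
	sleep(1);
	printf("aux_head after 1s: %llu\n", (unsigned long long)pc->aux_head);

	return 0;
}

(Recording and decoding of the AUX data in the perf tool are handled by
separate patches.)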
-rw-r--r--	arch/x86/include/uapi/asm/msr-index.h		18
-rw-r--r--	arch/x86/kernel/cpu/Makefile			1
-rw-r--r--	arch/x86/kernel/cpu/intel_pt.h			131
-rw-r--r--	arch/x86/kernel/cpu/perf_event.h		2
-rw-r--r--	arch/x86/kernel/cpu/perf_event_intel.c		8
-rw-r--r--	arch/x86/kernel/cpu/perf_event_intel_pt.c	1096
6 files changed, 1256 insertions, 0 deletions
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 3ce079136c11..1a4eae695ca8 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -74,6 +74,24 @@
 #define MSR_IA32_PERF_CAPABILITIES	0x00000345
 #define MSR_PEBS_LD_LAT_THRESHOLD	0x000003f6
 
+#define MSR_IA32_RTIT_CTL		0x00000570
+#define RTIT_CTL_TRACEEN		BIT(0)
+#define RTIT_CTL_OS			BIT(2)
+#define RTIT_CTL_USR			BIT(3)
+#define RTIT_CTL_CR3EN			BIT(7)
+#define RTIT_CTL_TOPA			BIT(8)
+#define RTIT_CTL_TSC_EN			BIT(10)
+#define RTIT_CTL_DISRETC		BIT(11)
+#define RTIT_CTL_BRANCH_EN		BIT(13)
+#define MSR_IA32_RTIT_STATUS		0x00000571
+#define RTIT_STATUS_CONTEXTEN		BIT(1)
+#define RTIT_STATUS_TRIGGEREN		BIT(2)
+#define RTIT_STATUS_ERROR		BIT(4)
+#define RTIT_STATUS_STOPPED		BIT(5)
+#define MSR_IA32_RTIT_CR3_MATCH		0x00000572
+#define MSR_IA32_RTIT_OUTPUT_BASE	0x00000560
+#define MSR_IA32_RTIT_OUTPUT_MASK	0x00000561
+
 #define MSR_MTRRfix64K_00000		0x00000250
 #define MSR_MTRRfix16K_80000		0x00000258
 #define MSR_MTRRfix16K_A0000		0x00000259
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 6c1ca139f736..e6b353f10e09 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -40,6 +40,7 @@ endif
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o perf_event_intel_cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_pt.o
 
 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE)	+= perf_event_intel_uncore.o \
 					   perf_event_intel_uncore_snb.o \
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
new file mode 100644
index 000000000000..1c338b0eba05
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_pt.h
@@ -0,0 +1,131 @@
1/*
2 * Intel(R) Processor Trace PMU driver for perf
3 * Copyright (c) 2013-2014, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * Intel PT is specified in the Intel Architecture Instruction Set Extensions
15 * Programming Reference:
16 * http://software.intel.com/en-us/intel-isa-extensions
17 */
18
19#ifndef __INTEL_PT_H__
20#define __INTEL_PT_H__
21
22/*
23 * Single-entry ToPA: when the output gets this close to the region
24 * boundary, switch buffers to avoid losing data.
25 */
26#define TOPA_PMI_MARGIN 512
27
28/*
29 * Table of Physical Addresses bits
30 */
31enum topa_sz {
32 TOPA_4K = 0,
33 TOPA_8K,
34 TOPA_16K,
35 TOPA_32K,
36 TOPA_64K,
37 TOPA_128K,
38 TOPA_256K,
39 TOPA_512K,
40 TOPA_1MB,
41 TOPA_2MB,
42 TOPA_4MB,
43 TOPA_8MB,
44 TOPA_16MB,
45 TOPA_32MB,
46 TOPA_64MB,
47 TOPA_128MB,
48 TOPA_SZ_END,
49};
50
51static inline unsigned int sizes(enum topa_sz tsz)
52{
53 return 1 << (tsz + 12);
54};
55
56struct topa_entry {
57 u64 end : 1;
58 u64 rsvd0 : 1;
59 u64 intr : 1;
60 u64 rsvd1 : 1;
61 u64 stop : 1;
62 u64 rsvd2 : 1;
63 u64 size : 4;
64 u64 rsvd3 : 2;
65 u64 base : 36;
66 u64 rsvd4 : 16;
67};
68
69#define TOPA_SHIFT 12
70#define PT_CPUID_LEAVES 2
71
72enum pt_capabilities {
73 PT_CAP_max_subleaf = 0,
74 PT_CAP_cr3_filtering,
75 PT_CAP_topa_output,
76 PT_CAP_topa_multiple_entries,
77 PT_CAP_payloads_lip,
78};
79
80struct pt_pmu {
81 struct pmu pmu;
82 u32 caps[4 * PT_CPUID_LEAVES];
83};
84
85/**
86 * struct pt_buffer - buffer configuration; one buffer per task_struct or
87 * cpu, depending on perf event configuration
88 * @cpu: cpu for per-cpu allocation
89 * @tables: list of ToPA tables in this buffer
90 * @first: shorthand for first topa table
91 * @last: shorthand for last topa table
92 * @cur: current topa table
93 * @nr_pages: buffer size in pages
94 * @cur_idx: current output region's index within @cur table
95 * @output_off: offset within the current output region
96 * @data_size: running total of the amount of data in this buffer
97 * @lost: if data was lost/truncated
98 * @head: logical write offset inside the buffer
99 * @snapshot: if this is for a snapshot/overwrite counter
100 * @stop_pos: STOP topa entry in the buffer
101 * @intr_pos: INT topa entry in the buffer
102 * @data_pages: array of pages from perf
103 * @topa_index: table of topa entries indexed by page offset
104 */
105struct pt_buffer {
106 int cpu;
107 struct list_head tables;
108 struct topa *first, *last, *cur;
109 unsigned int cur_idx;
110 size_t output_off;
111 unsigned long nr_pages;
112 local_t data_size;
113 local_t lost;
114 local64_t head;
115 bool snapshot;
116 unsigned long stop_pos, intr_pos;
117 void **data_pages;
118 struct topa_entry *topa_index[0];
119};
120
121/**
122 * struct pt - per-cpu pt context
123 * @handle: perf output handle
124 * @handle_nmi: whether to handle PT PMIs on this cpu (set while there's an active event)
125 */
126struct pt {
127 struct perf_output_handle handle;
128 int handle_nmi;
129};
130
131#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 47499661e8d4..f04729ac3290 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -808,6 +808,8 @@ void intel_pmu_lbr_init_hsw(void);
 
 int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
+void intel_pt_interrupt(void);
+
 int p4_pmu_init(void);
 
 int p6_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index b7b3ff21c832..8eb22ce26303 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1590,6 +1590,14 @@ again:
 	}
 
 	/*
+	 * Intel PT
+	 */
+	if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+		handled++;
+		intel_pt_interrupt();
+	}
+
+	/*
 	 * Checkpointed counters can lead to 'spurious' PMIs because the
 	 * rollback caused by the PMI will have cleared the overflow status
 	 * bit. Therefore always force probe these counters.
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
new file mode 100644
index 000000000000..a9a1092cf836
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -0,0 +1,1096 @@
1/*
2 * Intel(R) Processor Trace PMU driver for perf
3 * Copyright (c) 2013-2014, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * Intel PT is specified in the Intel Architecture Instruction Set Extensions
15 * Programming Reference:
16 * http://software.intel.com/en-us/intel-isa-extensions
17 */
18
19#undef DEBUG
20
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22
23#include <linux/types.h>
24#include <linux/slab.h>
25#include <linux/device.h>
26
27#include <asm/perf_event.h>
28#include <asm/insn.h>
29#include <asm/io.h>
30
31#include "perf_event.h"
32#include "intel_pt.h"
33
34static DEFINE_PER_CPU(struct pt, pt_ctx);
35
36static struct pt_pmu pt_pmu;
37
38enum cpuid_regs {
39 CR_EAX = 0,
40 CR_ECX,
41 CR_EDX,
42 CR_EBX
43};
44
45/*
46 * Capabilities of Intel PT hardware, such as number of address bits or
47 * supported output schemes, are cached and exported to userspace as "caps"
48 * attribute group of pt pmu device
49 * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
50 * relevant bits together with intel_pt traces.
51 *
52 * These are necessary for both trace decoding (payloads_lip determines the address
53 * width encoded in IP-related packets) and event configuration (bitmasks with
54 * permitted values for certain bit fields).
55 */
56#define PT_CAP(_n, _l, _r, _m) \
57 [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \
58 .reg = _r, .mask = _m }
59
60static struct pt_cap_desc {
61 const char *name;
62 u32 leaf;
63 u8 reg;
64 u32 mask;
65} pt_caps[] = {
66 PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff),
67 PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)),
68 PT_CAP(topa_output, 0, CR_ECX, BIT(0)),
69 PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)),
70 PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)),
71};
72
73static u32 pt_cap_get(enum pt_capabilities cap)
74{
75 struct pt_cap_desc *cd = &pt_caps[cap];
76 u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
77 unsigned int shift = __ffs(cd->mask);
78
79 return (c & cd->mask) >> shift;
80}
81
82static ssize_t pt_cap_show(struct device *cdev,
83 struct device_attribute *attr,
84 char *buf)
85{
86 struct dev_ext_attribute *ea =
87 container_of(attr, struct dev_ext_attribute, attr);
88 enum pt_capabilities cap = (long)ea->var;
89
90 return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
91}
92
93static struct attribute_group pt_cap_group = {
94 .name = "caps",
95};
96
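/*
 * These "format" bits map straight onto RTIT_CTL: config:10 is
 * RTIT_CTL_TSC_EN and config:11 is RTIT_CTL_DISRETC (see PT_CONFIG_MASK
 * and pt_config() below).
 */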
97PMU_FORMAT_ATTR(tsc, "config:10" );
98PMU_FORMAT_ATTR(noretcomp, "config:11" );
99
100static struct attribute *pt_formats_attr[] = {
101 &format_attr_tsc.attr,
102 &format_attr_noretcomp.attr,
103 NULL,
104};
105
106static struct attribute_group pt_format_group = {
107 .name = "format",
108 .attrs = pt_formats_attr,
109};
110
111static const struct attribute_group *pt_attr_groups[] = {
112 &pt_cap_group,
113 &pt_format_group,
114 NULL,
115};
116
117static int __init pt_pmu_hw_init(void)
118{
119 struct dev_ext_attribute *de_attrs;
120 struct attribute **attrs;
121 size_t size;
122 long i;
123
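	/* CPUID leaf 0x14 (20 decimal) enumerates Intel PT capabilities */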
124 if (test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) {
125 for (i = 0; i < PT_CPUID_LEAVES; i++)
126 cpuid_count(20, i,
127 &pt_pmu.caps[CR_EAX + i * 4],
128 &pt_pmu.caps[CR_EBX + i * 4],
129 &pt_pmu.caps[CR_ECX + i * 4],
130 &pt_pmu.caps[CR_EDX + i * 4]);
131 } else {
132 return -ENODEV;
133 }
134
135 size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps) + 1);
136 attrs = kzalloc(size, GFP_KERNEL);
137 if (!attrs)
138 goto err_attrs;
139
140 size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps) + 1);
141 de_attrs = kzalloc(size, GFP_KERNEL);
142 if (!de_attrs)
143 goto err_de_attrs;
144
145 for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
146 de_attrs[i].attr.attr.name = pt_caps[i].name;
147
148 sysfs_attr_init(&de_attrs[i].attr.attr);
149 de_attrs[i].attr.attr.mode = S_IRUGO;
150 de_attrs[i].attr.show = pt_cap_show;
151 de_attrs[i].var = (void *)i;
152 attrs[i] = &de_attrs[i].attr.attr;
153 }
154
155 pt_cap_group.attrs = attrs;
156 return 0;
157
158err_de_attrs:
159 kfree(de_attrs);
160err_attrs:
161 kfree(attrs);
162
163 return -ENOMEM;
164}
165
166#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
167
168static bool pt_event_valid(struct perf_event *event)
169{
170 u64 config = event->attr.config;
171
172 if ((config & PT_CONFIG_MASK) != config)
173 return false;
174
175 return true;
176}
177
178/*
179 * PT configuration helpers
180 * These all are cpu affine and operate on a local PT
181 */
182
183static bool pt_is_running(void)
184{
185 u64 ctl;
186
187 rdmsrl(MSR_IA32_RTIT_CTL, ctl);
188
189 return !!(ctl & RTIT_CTL_TRACEEN);
190}
191
192static void pt_config(struct perf_event *event)
193{
194 u64 reg;
195
196 reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
197
198 if (!event->attr.exclude_kernel)
199 reg |= RTIT_CTL_OS;
200 if (!event->attr.exclude_user)
201 reg |= RTIT_CTL_USR;
202
203 reg |= (event->attr.config & PT_CONFIG_MASK);
204
205 wrmsrl(MSR_IA32_RTIT_CTL, reg);
206}
207
208static void pt_config_start(bool start)
209{
210 u64 ctl;
211
212 rdmsrl(MSR_IA32_RTIT_CTL, ctl);
213 if (start)
214 ctl |= RTIT_CTL_TRACEEN;
215 else
216 ctl &= ~RTIT_CTL_TRACEEN;
217 wrmsrl(MSR_IA32_RTIT_CTL, ctl);
218
219 /*
220 * A wrmsr that disables trace generation serializes other PT
221 * registers and causes all data packets to be written to memory,
222 * but a fence is required for the data to become globally visible.
223 *
224 * The below WMB, separating data store and aux_head store matches
225 * the consumer's RMB that separates aux_head load and data load.
226 */
227 if (!start)
228 wmb();
229}
230
231static void pt_config_buffer(void *buf, unsigned int topa_idx,
232 unsigned int output_off)
233{
234 u64 reg;
235
236 wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
237
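	/*
	 * IA32_RTIT_OUTPUT_MASK_PTRS layout: bits 6:0 are the lower mask
	 * (all ones in ToPA mode), bits 31:7 index the current ToPA entry,
	 * bits 63:32 hold the offset within the current output region.
	 */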
238 reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
239
240 wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
241}
242
243/*
244 * Keep ToPA table-related metadata on the same page as the actual table,
245 * taking up a few words from the top
246 */
247
248#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
249
250/**
251 * struct topa - page-sized ToPA table with metadata at the top
252 * @table: actual ToPA table entries, as understood by PT hardware
253 * @list: linkage to struct pt_buffer's list of tables
254 * @phys: physical address of this page
255 * @offset: offset of the first entry in this table in the buffer
256 * @size: total size of all entries in this table
257 * @last: index of the last initialized entry in this table
258 */
259struct topa {
260 struct topa_entry table[TENTS_PER_PAGE];
261 struct list_head list;
262 u64 phys;
263 u64 offset;
264 size_t size;
265 int last;
266};
267
268/* make -1 stand for the last table entry */
269#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
270
271/**
272 * topa_alloc() - allocate page-sized ToPA table
273 * @cpu: CPU on which to allocate.
274 * @gfp: Allocation flags.
275 *
276 * Return: On success, return the pointer to ToPA table page.
277 */
278static struct topa *topa_alloc(int cpu, gfp_t gfp)
279{
280 int node = cpu_to_node(cpu);
281 struct topa *topa;
282 struct page *p;
283
284 p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
285 if (!p)
286 return NULL;
287
288 topa = page_address(p);
289 topa->last = 0;
290 topa->phys = page_to_phys(p);
291
292 /*
293 * In case of single-entry ToPA, always put the self-referencing END
294 * link as the 2nd entry in the table
295 */
296 if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
297 TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
298 TOPA_ENTRY(topa, 1)->end = 1;
299 }
300
301 return topa;
302}
303
304/**
305 * topa_free() - free a page-sized ToPA table
306 * @topa: Table to deallocate.
307 */
308static void topa_free(struct topa *topa)
309{
310 free_page((unsigned long)topa);
311}
312
313/**
314 * topa_insert_table() - insert a ToPA table into a buffer
315 * @buf: PT buffer that's being extended.
316 * @topa: New topa table to be inserted.
317 *
318 * If it's the first table in this buffer, set up buffer's pointers
319 * accordingly; otherwise, add an END=1 link entry pointing to @topa in the
320 * current "last" table and adjust the last table pointer to @topa.
321 */
322static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
323{
324 struct topa *last = buf->last;
325
326 list_add_tail(&topa->list, &buf->tables);
327
328 if (!buf->first) {
329 buf->first = buf->last = buf->cur = topa;
330 return;
331 }
332
333 topa->offset = last->offset + last->size;
334 buf->last = topa;
335
336 if (!pt_cap_get(PT_CAP_topa_multiple_entries))
337 return;
338
339 BUG_ON(last->last != TENTS_PER_PAGE - 1);
340
341 TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
342 TOPA_ENTRY(last, -1)->end = 1;
343}
344
345/**
346 * topa_table_full() - check if a ToPA table is filled up
347 * @topa: ToPA table.
348 */
349static bool topa_table_full(struct topa *topa)
350{
351 /* single-entry ToPA is a special case */
352 if (!pt_cap_get(PT_CAP_topa_multiple_entries))
353 return !!topa->last;
354
355 return topa->last == TENTS_PER_PAGE - 1;
356}
357
358/**
359 * topa_insert_pages() - create a list of ToPA tables
360 * @buf: PT buffer being initialized.
361 * @gfp: Allocation flags.
362 *
363 * This initializes a list of ToPA tables with entries from
364 * the data_pages provided by rb_alloc_aux().
365 *
366 * Return: 0 on success or error code.
367 */
368static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
369{
370 struct topa *topa = buf->last;
371 int order = 0;
372 struct page *p;
373
374 p = virt_to_page(buf->data_pages[buf->nr_pages]);
375 if (PagePrivate(p))
376 order = page_private(p);
377
378 if (topa_table_full(topa)) {
379 topa = topa_alloc(buf->cpu, gfp);
380 if (!topa)
381 return -ENOMEM;
382
383 topa_insert_table(buf, topa);
384 }
385
386 TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
387 TOPA_ENTRY(topa, -1)->size = order;
388 if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
389 TOPA_ENTRY(topa, -1)->intr = 1;
390 TOPA_ENTRY(topa, -1)->stop = 1;
391 }
392
393 topa->last++;
394 topa->size += sizes(order);
395
396 buf->nr_pages += 1ul << order;
397
398 return 0;
399}
400
401/**
402 * pt_topa_dump() - print ToPA tables and their entries
403 * @buf: PT buffer.
404 */
405static void pt_topa_dump(struct pt_buffer *buf)
406{
407 struct topa *topa;
408
409 list_for_each_entry(topa, &buf->tables, list) {
410 int i;
411
412 pr_debug("# table @%p (%p), off %llx size %zx\n", topa->table,
413 (void *)topa->phys, topa->offset, topa->size);
414 for (i = 0; i < TENTS_PER_PAGE; i++) {
415 pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
416 &topa->table[i],
417 (unsigned long)topa->table[i].base << TOPA_SHIFT,
418 sizes(topa->table[i].size),
419 topa->table[i].end ? 'E' : ' ',
420 topa->table[i].intr ? 'I' : ' ',
421 topa->table[i].stop ? 'S' : ' ',
422 *(u64 *)&topa->table[i]);
423 if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
424 topa->table[i].stop) ||
425 topa->table[i].end)
426 break;
427 }
428 }
429}
430
431/**
432 * pt_buffer_advance() - advance to the next output region
433 * @buf: PT buffer.
434 *
435 * Advance the current pointers in the buffer to the next ToPA entry.
436 */
437static void pt_buffer_advance(struct pt_buffer *buf)
438{
439 buf->output_off = 0;
440 buf->cur_idx++;
441
442 if (buf->cur_idx == buf->cur->last) {
443 if (buf->cur == buf->last)
444 buf->cur = buf->first;
445 else
446 buf->cur = list_entry(buf->cur->list.next, struct topa,
447 list);
448 buf->cur_idx = 0;
449 }
450}
451
452/**
453 * pt_update_head() - calculate current offsets and sizes
454 * @pt: Per-cpu pt context.
455 *
456 * Update buffer's current write pointer position and data size.
457 */
458static void pt_update_head(struct pt *pt)
459{
460 struct pt_buffer *buf = perf_get_aux(&pt->handle);
461 u64 topa_idx, base, old;
462
463 /* offset of the first region in this table from the beginning of buf */
464 base = buf->cur->offset + buf->output_off;
465
466 /* offset of the current output region within this table */
467 for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
468 base += sizes(buf->cur->table[topa_idx].size);
469
470 if (buf->snapshot) {
471 local_set(&buf->data_size, base);
472 } else {
473 old = (local64_xchg(&buf->head, base) &
474 ((buf->nr_pages << PAGE_SHIFT) - 1));
475 if (base < old)
476 base += buf->nr_pages << PAGE_SHIFT;
477
478 local_add(base - old, &buf->data_size);
479 }
480}
481
482/**
483 * pt_buffer_region() - obtain current output region's address
484 * @buf: PT buffer.
485 */
486static void *pt_buffer_region(struct pt_buffer *buf)
487{
488 return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
489}
490
491/**
492 * pt_buffer_region_size() - obtain current output region's size
493 * @buf: PT buffer.
494 */
495static size_t pt_buffer_region_size(struct pt_buffer *buf)
496{
497 return sizes(buf->cur->table[buf->cur_idx].size);
498}
499
500/**
501 * pt_handle_status() - take care of possible status conditions
502 * @pt: Per-cpu pt context.
503 */
504static void pt_handle_status(struct pt *pt)
505{
506 struct pt_buffer *buf = perf_get_aux(&pt->handle);
507 int advance = 0;
508 u64 status;
509
510 rdmsrl(MSR_IA32_RTIT_STATUS, status);
511
512 if (status & RTIT_STATUS_ERROR) {
513 pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
514 pt_topa_dump(buf);
515 status &= ~RTIT_STATUS_ERROR;
516 }
517
518 if (status & RTIT_STATUS_STOPPED) {
519 status &= ~RTIT_STATUS_STOPPED;
520
521 /*
522 * On systems that only do single-entry ToPA, hitting STOP
523 * means we are already losing data; need to let the decoder
524 * know.
525 */
526 if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
527 buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
528 local_inc(&buf->lost);
529 advance++;
530 }
531 }
532
533 /*
534 * Also on single-entry ToPA implementations, interrupt will come
535 * before the output reaches its output region's boundary.
536 */
537 if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
538 pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
539 void *head = pt_buffer_region(buf);
540
541 /* everything within this margin needs to be zeroed out */
542 memset(head + buf->output_off, 0,
543 pt_buffer_region_size(buf) -
544 buf->output_off);
545 advance++;
546 }
547
548 if (advance)
549 pt_buffer_advance(buf);
550
551 wrmsrl(MSR_IA32_RTIT_STATUS, status);
552}
553
554/**
555 * pt_read_offset() - translate registers into buffer pointers
556 * @buf: PT buffer.
557 *
558 * Set buffer's output pointers from MSR values.
559 */
560static void pt_read_offset(struct pt_buffer *buf)
561{
562 u64 offset, base_topa;
563
564 rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
565 buf->cur = phys_to_virt(base_topa);
566
567 rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
568 /* offset within current output region */
569 buf->output_off = offset >> 32;
570 /* index of current output region within this table */
571 buf->cur_idx = (offset & 0xffffff80) >> 7;
572}
573
574/**
575 * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
576 * @buf: PT buffer.
577 * @pg: Page offset in the buffer.
578 *
579 * When advancing to the next output region (ToPA entry), given a page offset
580 * into the buffer, we need to find the offset of the first page in the next
581 * region.
582 */
583static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
584{
585 struct topa_entry *te = buf->topa_index[pg];
586
587 /* one region */
588 if (buf->first == buf->last && buf->first->last == 1)
589 return pg;
590
591 do {
592 pg++;
593 pg &= buf->nr_pages - 1;
594 } while (buf->topa_index[pg] == te);
595
596 return pg;
597}
598
599/**
600 * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
601 * @buf: PT buffer.
602 * @handle: Current output handle.
603 *
604 * Place INT and STOP marks to prevent overwriting old data that the consumer
605 * hasn't yet collected.
606 */
607static int pt_buffer_reset_markers(struct pt_buffer *buf,
608 struct perf_output_handle *handle)
609
610{
611 unsigned long idx, npages, end;
612
613 if (buf->snapshot)
614 return 0;
615
616 /* can't stop in the middle of an output region */
617 if (buf->output_off + handle->size + 1 <
618 sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
619 return -EINVAL;
620
621
622 /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
623 if (!pt_cap_get(PT_CAP_topa_multiple_entries))
624 return 0;
625
626 /* clear STOP and INT from current entry */
627 buf->topa_index[buf->stop_pos]->stop = 0;
628 buf->topa_index[buf->intr_pos]->intr = 0;
629
630 if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
631 npages = (handle->size + 1) >> PAGE_SHIFT;
632 end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages;
633 /*if (end > handle->wakeup >> PAGE_SHIFT)
634 end = handle->wakeup >> PAGE_SHIFT;*/
635 idx = end & (buf->nr_pages - 1);
636 buf->stop_pos = idx;
637 idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1;
638 idx &= buf->nr_pages - 1;
639 buf->intr_pos = idx;
640 }
641
642 buf->topa_index[buf->stop_pos]->stop = 1;
643 buf->topa_index[buf->intr_pos]->intr = 1;
644
645 return 0;
646}
647
648/**
649 * pt_buffer_setup_topa_index() - build topa_index[] table of regions
650 * @buf: PT buffer.
651 *
652 * topa_index[] references output regions indexed by offset into the
653 * buffer for purposes of quick reverse lookup.
654 */
655static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
656{
657 struct topa *cur = buf->first, *prev = buf->last;
658 struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
659 *te_prev = TOPA_ENTRY(prev, prev->last - 1);
660 int pg = 0, idx = 0, ntopa = 0;
661
662 while (pg < buf->nr_pages) {
663 int tidx;
664
665 /* pages within one topa entry */
666 for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
667 buf->topa_index[pg] = te_prev;
668
669 te_prev = te_cur;
670
671 if (idx == cur->last - 1) {
672 /* advance to next topa table */
673 idx = 0;
674 cur = list_entry(cur->list.next, struct topa, list);
675 ntopa++;
676 } else
677 idx++;
678 te_cur = TOPA_ENTRY(cur, idx);
679 }
680
681}
682
683/**
684 * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
685 * @buf: PT buffer.
686 * @head: Write pointer (aux_head) from AUX buffer.
687 *
688 * Find the ToPA table and entry corresponding to given @head and set buffer's
689 * "current" pointers accordingly.
690 */
691static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
692{
693 int pg;
694
695 if (buf->snapshot)
696 head &= (buf->nr_pages << PAGE_SHIFT) - 1;
697
698 pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
699 pg = pt_topa_next_entry(buf, pg);
700
701 buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
702 buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
703 (unsigned long)buf->cur) / sizeof(struct topa_entry);
704 buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
705
706 local64_set(&buf->head, head);
707 local_set(&buf->data_size, 0);
708}
709
710/**
711 * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
712 * @buf: PT buffer.
713 */
714static void pt_buffer_fini_topa(struct pt_buffer *buf)
715{
716 struct topa *topa, *iter;
717
718 list_for_each_entry_safe(topa, iter, &buf->tables, list) {
719 /*
720 * right now, this is in free_aux() path only, so
721 * no need to unlink this table from the list
722 */
723 topa_free(topa);
724 }
725}
726
727/**
728 * pt_buffer_init_topa() - initialize ToPA table for pt buffer
729 * @buf: PT buffer.
730 * @size: Total size of all regions within this ToPA.
731 * @gfp: Allocation flags.
732 */
733static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
734 gfp_t gfp)
735{
736 struct topa *topa;
737 int err;
738
739 topa = topa_alloc(buf->cpu, gfp);
740 if (!topa)
741 return -ENOMEM;
742
743 topa_insert_table(buf, topa);
744
745 while (buf->nr_pages < nr_pages) {
746 err = topa_insert_pages(buf, gfp);
747 if (err) {
748 pt_buffer_fini_topa(buf);
749 return -ENOMEM;
750 }
751 }
752
753 pt_buffer_setup_topa_index(buf);
754
755 /* link last table to the first one, unless we're double buffering */
756 if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
757 TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
758 TOPA_ENTRY(buf->last, -1)->end = 1;
759 }
760
761 pt_topa_dump(buf);
762 return 0;
763}
764
765/**
766 * pt_buffer_setup_aux() - set up topa tables for a PT buffer
767 * @cpu: Cpu on which to allocate, -1 means current.
768 * @pages: Array of pointers to buffer pages passed from perf core.
769 * @nr_pages: Number of pages in the buffer.
770 * @snapshot: If this is a snapshot/overwrite counter.
771 *
772 * This is a pmu::setup_aux callback that sets up ToPA tables and all the
773 * bookkeeping for an AUX buffer.
774 *
775 * Return: Our private PT buffer structure.
776 */
777static void *
778pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
779{
780 struct pt_buffer *buf;
781 int node, ret;
782
783 if (!nr_pages)
784 return NULL;
785
786 if (cpu == -1)
787 cpu = raw_smp_processor_id();
788 node = cpu_to_node(cpu);
789
790 buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
791 GFP_KERNEL, node);
792 if (!buf)
793 return NULL;
794
795 buf->cpu = cpu;
796 buf->snapshot = snapshot;
797 buf->data_pages = pages;
798
799 INIT_LIST_HEAD(&buf->tables);
800
801 ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
802 if (ret) {
803 kfree(buf);
804 return NULL;
805 }
806
807 return buf;
808}
809
810/**
811 * pt_buffer_free_aux() - perf AUX deallocation path callback
812 * @data: PT buffer.
813 */
814static void pt_buffer_free_aux(void *data)
815{
816 struct pt_buffer *buf = data;
817
818 pt_buffer_fini_topa(buf);
819 kfree(buf);
820}
821
822/**
823 * pt_buffer_is_full() - check if the buffer is full
824 * @buf: PT buffer.
825 * @pt: Per-cpu pt handle.
826 *
827 * If the user hasn't read data from the output region that aux_head
828 * points to, the buffer is considered full: the user needs to read at
829 * least this region and update aux_tail to point past it.
830 */
831static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
832{
833 if (buf->snapshot)
834 return false;
835
836 if (local_read(&buf->data_size) >= pt->handle.size)
837 return true;
838
839 return false;
840}
841
842/**
843 * intel_pt_interrupt() - PT PMI handler
844 */
845void intel_pt_interrupt(void)
846{
847 struct pt *pt = this_cpu_ptr(&pt_ctx);
848 struct pt_buffer *buf;
849 struct perf_event *event = pt->handle.event;
850
851 /*
852 * There may be a dangling PT bit in the interrupt status register
853 * after PT has been disabled by pt_event_stop(). Make sure we don't
854 * do anything (particularly, re-enable) for this event here.
855 */
856 if (!ACCESS_ONCE(pt->handle_nmi))
857 return;
858
859 pt_config_start(false);
860
861 if (!event)
862 return;
863
864 buf = perf_get_aux(&pt->handle);
865 if (!buf)
866 return;
867
868 pt_read_offset(buf);
869
870 pt_handle_status(pt);
871
872 pt_update_head(pt);
873
874 perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
875 local_xchg(&buf->lost, 0));
876
877 if (!event->hw.state) {
878 int ret;
879
880 buf = perf_aux_output_begin(&pt->handle, event);
881 if (!buf) {
882 event->hw.state = PERF_HES_STOPPED;
883 return;
884 }
885
886 pt_buffer_reset_offsets(buf, pt->handle.head);
887 ret = pt_buffer_reset_markers(buf, &pt->handle);
888 if (ret) {
889 perf_aux_output_end(&pt->handle, 0, true);
890 return;
891 }
892
893 pt_config_buffer(buf->cur->table, buf->cur_idx,
894 buf->output_off);
895 wrmsrl(MSR_IA32_RTIT_STATUS, 0);
896 pt_config(event);
897 }
898}
899
900/*
901 * PMU callbacks
902 */
903
904static void pt_event_start(struct perf_event *event, int mode)
905{
906 struct pt *pt = this_cpu_ptr(&pt_ctx);
907 struct pt_buffer *buf = perf_get_aux(&pt->handle);
908
909 if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
910 event->hw.state = PERF_HES_STOPPED;
911 return;
912 }
913
914 ACCESS_ONCE(pt->handle_nmi) = 1;
915 event->hw.state = 0;
916
917 pt_config_buffer(buf->cur->table, buf->cur_idx,
918 buf->output_off);
919 wrmsrl(MSR_IA32_RTIT_STATUS, 0);
920 pt_config(event);
921}
922
923static void pt_event_stop(struct perf_event *event, int mode)
924{
925 struct pt *pt = this_cpu_ptr(&pt_ctx);
926
927 /*
928 * Protect against the PMI racing with disabling wrmsr,
929 * see comment in intel_pt_interrupt().
930 */
931 ACCESS_ONCE(pt->handle_nmi) = 0;
932 pt_config_start(false);
933
934 if (event->hw.state == PERF_HES_STOPPED)
935 return;
936
937 event->hw.state = PERF_HES_STOPPED;
938
939 if (mode & PERF_EF_UPDATE) {
940 struct pt *pt = this_cpu_ptr(&pt_ctx);
941 struct pt_buffer *buf = perf_get_aux(&pt->handle);
942
943 if (!buf)
944 return;
945
946 if (WARN_ON_ONCE(pt->handle.event != event))
947 return;
948
949 pt_read_offset(buf);
950
951 pt_handle_status(pt);
952
953 pt_update_head(pt);
954 }
955}
956
957static void pt_event_del(struct perf_event *event, int mode)
958{
959 struct pt *pt = this_cpu_ptr(&pt_ctx);
960 struct pt_buffer *buf;
961
962 pt_event_stop(event, PERF_EF_UPDATE);
963
964 buf = perf_get_aux(&pt->handle);
965
966 if (buf) {
967 if (buf->snapshot)
968 pt->handle.head =
969 local_xchg(&buf->data_size,
970 buf->nr_pages << PAGE_SHIFT);
971 perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
972 local_xchg(&buf->lost, 0));
973 }
974}
975
976static int pt_event_add(struct perf_event *event, int mode)
977{
978 struct pt_buffer *buf;
979 struct pt *pt = this_cpu_ptr(&pt_ctx);
980 struct hw_perf_event *hwc = &event->hw;
981 int ret = -EBUSY;
982
983 if (pt->handle.event)
984 goto out;
985
986 buf = perf_aux_output_begin(&pt->handle, event);
987 if (!buf) {
988 ret = -EINVAL;
989 goto out;
990 }
991
992 pt_buffer_reset_offsets(buf, pt->handle.head);
993 if (!buf->snapshot) {
994 ret = pt_buffer_reset_markers(buf, &pt->handle);
995 if (ret) {
996 perf_aux_output_end(&pt->handle, 0, true);
997 goto out;
998 }
999 }
1000
1001 if (mode & PERF_EF_START) {
1002 pt_event_start(event, 0);
1003 if (hwc->state == PERF_HES_STOPPED) {
1004 pt_event_del(event, 0);
1005 ret = -EBUSY;
1006 }
1007 } else {
1008 hwc->state = PERF_HES_STOPPED;
1009 }
1010
1011 ret = 0;
1012out:
1013
1014 if (ret)
1015 hwc->state = PERF_HES_STOPPED;
1016
1017 return ret;
1018}
1019
1020static void pt_event_read(struct perf_event *event)
1021{
1022}
1023
1024static void pt_event_destroy(struct perf_event *event)
1025{
1026 x86_del_exclusive(x86_lbr_exclusive_pt);
1027}
1028
1029static int pt_event_init(struct perf_event *event)
1030{
1031 if (event->attr.type != pt_pmu.pmu.type)
1032 return -ENOENT;
1033
1034 if (!pt_event_valid(event))
1035 return -EINVAL;
1036
1037 if (x86_add_exclusive(x86_lbr_exclusive_pt))
1038 return -EBUSY;
1039
1040 event->destroy = pt_event_destroy;
1041
1042 return 0;
1043}
1044
1045static __init int pt_init(void)
1046{
1047 int ret, cpu, prior_warn = 0;
1048
1049 BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
1050 get_online_cpus();
1051 for_each_online_cpu(cpu) {
1052 u64 ctl;
1053
1054 ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
1055 if (!ret && (ctl & RTIT_CTL_TRACEEN))
1056 prior_warn++;
1057 }
1058 put_online_cpus();
1059
1060 if (prior_warn) {
1061 x86_add_exclusive(x86_lbr_exclusive_pt);
1062 pr_warn("PT is enabled at boot time, doing nothing\n");
1063
1064 return -EBUSY;
1065 }
1066
1067 ret = pt_pmu_hw_init();
1068 if (ret)
1069 return ret;
1070
1071 if (!pt_cap_get(PT_CAP_topa_output)) {
1072 pr_warn("ToPA output is not supported on this CPU\n");
1073 return -ENODEV;
1074 }
1075
1076 if (!pt_cap_get(PT_CAP_topa_multiple_entries))
1077 pt_pmu.pmu.capabilities =
1078 PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
1079
1080 pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
1081 pt_pmu.pmu.attr_groups = pt_attr_groups;
1082 pt_pmu.pmu.task_ctx_nr = perf_sw_context;
1083 pt_pmu.pmu.event_init = pt_event_init;
1084 pt_pmu.pmu.add = pt_event_add;
1085 pt_pmu.pmu.del = pt_event_del;
1086 pt_pmu.pmu.start = pt_event_start;
1087 pt_pmu.pmu.stop = pt_event_stop;
1088 pt_pmu.pmu.read = pt_event_read;
1089 pt_pmu.pmu.setup_aux = pt_buffer_setup_aux;
1090 pt_pmu.pmu.free_aux = pt_buffer_free_aux;
1091 ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
1092
1093 return ret;
1094}
1095
1096module_init(pt_init);