aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>2009-10-13 03:19:41 -0400
committerIngo Molnar <mingo@elte.hu>2009-10-13 03:43:38 -0400
commit8968f9d3dc23d9a1821d97c6f11e72a59382e56c (patch)
tree79272a0169556c1cf22ede10d5eedb59747c28a0
parent9dbdd6c41c12fb42ee7188eafa7e1917b192af3a (diff)
perf_event, x86, mce: Use TRACE_EVENT() for MCE logging
This approach is the first baby step towards solving many of the structural problems the x86 MCE logging code is having today: - It has a private ring-buffer implementation that has a number of limitations and has been historically fragile and buggy. - It is using a quirky /dev/mcelog ioctl driven ABI that is MCE specific. /dev/mcelog is not part of any larger logging framework and hence has remained on the fringes for many years. - The MCE logging code is still very unclean partly due to its ABI limitations. Fields are being reused for multiple purposes, and the whole message structure is limited and x86 specific to begin with. All in one, the x86 tree would like to move away from this private implementation of an event logging facility to a broader framework. By using perf events we gain the following advantages: - Multiple user-space agents can access MCE events. We can have an mcelog daemon running but also a system-wide tracer capturing important events in flight-recorder mode. - Sampling support: the kernel and the user-space call-chain of MCE events can be stored and analyzed as well. This way actual patterns of bad behavior can be matched to precisely what kind of activity happened in the kernel (and/or in the app) around that moment in time. - Coupling with other hardware and software events: the PMU can track a number of other anomalies - monitoring software might chose to monitor those plus the MCE events as well - in one coherent stream of events. - Discovery of MCE sources - tracepoints are enumerated and tools can act upon the existence (or non-existence) of various channels of MCE information. - Filtering support: we just subscribe to and act upon the events we are interested in. Then even on a per event source basis there's in-kernel filter expressions available that can restrict the amount of data that hits the event channel. - Arbitrary deep per cpu buffering of events - we can buffer 32 entries or we can buffer as much as we want, as long as we have the RAM. - An NMI-safe ring-buffer implementation - mappable to user-space. - Built-in support for timestamping of events, PID markers, CPU markers, etc. - A rich ABI accessible over system call interface. Per cpu, per task and per workload monitoring of MCE events can be done this way. The ABI itself has a nice, meaningful structure. - Extensible ABI: new fields can be added without breaking tooling. New tracepoints can be added as the hardware side evolves. There's various parsers that can be used. - Lots of scheduling/buffering/batching modes of operandi for MCE events. poll() support. mmap() support. read() support. You name it. - Rich tooling support: even without any MCE specific extensions added the 'perf' tool today offers various views of MCE data: perf report, perf stat, perf trace can all be used to view logged MCE events and perhaps correlate them to certain user-space usage patterns. But it can be used directly as well, for user-space agents and policy action in mcelog, etc. With this we hope to achieve significant code cleanup and feature improvements in the MCE code, and we hope to be able to drop the /dev/mcelog facility in the end. This patch is just a plain dumb dump of mce_log() records to the tracepoints / perf events framework - a first proof of concept step. Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Andi Kleen <ak@linux.intel.com> LKML-Reference: <4AD42A0D.7050104@jp.fujitsu.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c6
-rw-r--r--include/trace/events/mce.h69
2 files changed, 75 insertions, 0 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b1598a9436d0..39caea3d8bc3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -46,6 +46,9 @@
46 46
47#include "mce-internal.h" 47#include "mce-internal.h"
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/mce.h>
51
49int mce_disabled __read_mostly; 52int mce_disabled __read_mostly;
50 53
51#define MISC_MCELOG_MINOR 227 54#define MISC_MCELOG_MINOR 227
@@ -141,6 +144,9 @@ void mce_log(struct mce *mce)
141{ 144{
142 unsigned next, entry; 145 unsigned next, entry;
143 146
147 /* Emit the trace record: */
148 trace_mce_record(mce);
149
144 mce->finished = 0; 150 mce->finished = 0;
145 wmb(); 151 wmb();
146 for (;;) { 152 for (;;) {
diff --git a/include/trace/events/mce.h b/include/trace/events/mce.h
new file mode 100644
index 000000000000..7eee77895cb3
--- /dev/null
+++ b/include/trace/events/mce.h
@@ -0,0 +1,69 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM mce
3
4#if !defined(_TRACE_MCE_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_MCE_H
6
7#include <linux/ktime.h>
8#include <linux/tracepoint.h>
9#include <asm/mce.h>
10
11TRACE_EVENT(mce_record,
12
13 TP_PROTO(struct mce *m),
14
15 TP_ARGS(m),
16
17 TP_STRUCT__entry(
18 __field( u64, mcgcap )
19 __field( u64, mcgstatus )
20 __field( u8, bank )
21 __field( u64, status )
22 __field( u64, addr )
23 __field( u64, misc )
24 __field( u64, ip )
25 __field( u8, cs )
26 __field( u64, tsc )
27 __field( u64, walltime )
28 __field( u32, cpu )
29 __field( u32, cpuid )
30 __field( u32, apicid )
31 __field( u32, socketid )
32 __field( u8, cpuvendor )
33 ),
34
35 TP_fast_assign(
36 __entry->mcgcap = m->mcgcap;
37 __entry->mcgstatus = m->mcgstatus;
38 __entry->bank = m->bank;
39 __entry->status = m->status;
40 __entry->addr = m->addr;
41 __entry->misc = m->misc;
42 __entry->ip = m->ip;
43 __entry->cs = m->cs;
44 __entry->tsc = m->tsc;
45 __entry->walltime = m->time;
46 __entry->cpu = m->extcpu;
47 __entry->cpuid = m->cpuid;
48 __entry->apicid = m->apicid;
49 __entry->socketid = m->socketid;
50 __entry->cpuvendor = m->cpuvendor;
51 ),
52
53 TP_printk("CPU: %d, MCGc/s: %llx/%llx, MC%d: %016Lx, ADDR/MISC: %016Lx/%016Lx, RIP: %02x:<%016Lx>, TSC: %llx, PROCESSOR: %u:%x, TIME: %llu, SOCKET: %u, APIC: %x",
54 __entry->cpu,
55 __entry->mcgcap, __entry->mcgstatus,
56 __entry->bank, __entry->status,
57 __entry->addr, __entry->misc,
58 __entry->cs, __entry->ip,
59 __entry->tsc,
60 __entry->cpuvendor, __entry->cpuid,
61 __entry->walltime,
62 __entry->socketid,
63 __entry->apicid)
64);
65
66#endif /* _TRACE_MCE_H */
67
68/* This part must be outside protection */
69#include <trace/define_trace.h>