aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Xie XiuQi <xiexiuqi@huawei.com> 2015-06-24 19:57:36 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2015-06-24 20:49:43 -0400
commit: 97f0b13452198290799fd6780f05fbaa74f927d3 (patch)
tree: 06401ca906140b76ed4206968a412eb860bd65c0
parent: cc3e2af42e7b7e0457b93bf17c19b44c635cd40c (diff)
tracing: add trace event for memory-failure
RAS user space tools like rasdaemon, which are based on trace events, can receive the MCE error event, but no memory recovery result event. So, I want to add this event to make this scenario complete.

This patch adds an event to the ras group for memory-failure.

The output looks like below:

  # tracer: nop
  #
  # entries-in-buffer/entries-written: 2/2   #P:24
  #
  #                              _-----=> irqs-off
  #                             / _----=> need-resched
  #                            | / _---=> hardirq/softirq
  #                            || / _--=> preempt-depth
  #                            ||| /     delay
  #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
  #              | |       |   ||||       |         |
        mce-inject-13150 [001] ....   277.019359: memory_failure_event: pfn 0x19869: recovery action for free buddy page: Delayed

[xiexiuqi@huawei.com: fix build error]
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Chen Gong <gong.chen@linux.intel.com>
Cc: Jim Davis <jim.epost@gmail.com>
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/ras/ras_event.h 85
-rw-r--r-- mm/Kconfig 1
-rw-r--r-- mm/memory-failure.c 3
3 files changed, 89 insertions, 0 deletions
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 79abb9c71772..1443d79e4fe6 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -11,6 +11,7 @@
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/aer.h> 12#include <linux/aer.h>
13#include <linux/cper.h> 13#include <linux/cper.h>
14#include <linux/mm.h>
14 15
15/* 16/*
16 * MCE Extended Error Log trace event 17 * MCE Extended Error Log trace event
@@ -232,6 +233,90 @@ TRACE_EVENT(aer_event,
232 __print_flags(__entry->status, "|", aer_uncorrectable_errors)) 233 __print_flags(__entry->status, "|", aer_uncorrectable_errors))
233); 234);
234 235
236/*
237 * memory-failure recovery action result event
238 *
239 * unsigned long pfn - Page Frame Number of the corrupted page
240 * int type - Page type of the corrupted page
241 * int result - Result of recovery action
242 */
243
244#ifdef CONFIG_MEMORY_FAILURE
245#define MF_ACTION_RESULT \
246 EM ( MF_IGNORED, "Ignored" ) \
247 EM ( MF_FAILED, "Failed" ) \
248 EM ( MF_DELAYED, "Delayed" ) \
249 EMe ( MF_RECOVERED, "Recovered" )
250
251#define MF_PAGE_TYPE \
252 EM ( MF_MSG_KERNEL, "reserved kernel page" ) \
253 EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \
254 EM ( MF_MSG_SLAB, "kernel slab page" ) \
255 EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
256 EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \
257 EM ( MF_MSG_HUGE, "huge page" ) \
258 EM ( MF_MSG_FREE_HUGE, "free huge page" ) \
259 EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \
260 EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \
261 EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \
262 EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" ) \
263 EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" ) \
264 EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" ) \
265 EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" ) \
266 EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" ) \
267 EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \
268 EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \
269 EM ( MF_MSG_BUDDY, "free buddy page" ) \
270 EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \
271 EMe ( MF_MSG_UNKNOWN, "unknown page" )
272
273/*
274 * First define the enums in MF_ACTION_RESULT and MF_PAGE_TYPE to be
275 * exported to userspace via TRACE_DEFINE_ENUM().
276 */
277#undef EM
278#undef EMe
279#define EM(a, b) TRACE_DEFINE_ENUM(a);
280#define EMe(a, b) TRACE_DEFINE_ENUM(a);
281
282MF_ACTION_RESULT
283MF_PAGE_TYPE
284
285/*
286 * Now redefine the EM() and EMe() macros to map the enums to the strings
287 * that will be printed in the output.
288 */
289#undef EM
290#undef EMe
291#define EM(a, b) { a, b },
292#define EMe(a, b) { a, b }
293
294TRACE_EVENT(memory_failure_event,
295 TP_PROTO(unsigned long pfn,
296 int type,
297 int result),
298
299 TP_ARGS(pfn, type, result),
300
301 TP_STRUCT__entry(
302 __field(unsigned long, pfn)
303 __field(int, type)
304 __field(int, result)
305 ),
306
307 TP_fast_assign(
308 __entry->pfn = pfn;
309 __entry->type = type;
310 __entry->result = result;
311 ),
312
313 TP_printk("pfn %#lx: recovery action for %s: %s",
314 __entry->pfn,
315 __print_symbolic(__entry->type, MF_PAGE_TYPE),
316 __print_symbolic(__entry->result, MF_ACTION_RESULT)
317 )
318);
319#endif /* CONFIG_MEMORY_FAILURE */
235#endif /* _TRACE_HW_EVENT_MC_H */ 320#endif /* _TRACE_HW_EVENT_MC_H */
236 321
237/* This part must be outside protection */ 322/* This part must be outside protection */
diff --git a/mm/Kconfig b/mm/Kconfig
index 390214da4546..c180af880ed5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -368,6 +368,7 @@ config MEMORY_FAILURE
368 depends on ARCH_SUPPORTS_MEMORY_FAILURE 368 depends on ARCH_SUPPORTS_MEMORY_FAILURE
369 bool "Enable recovery from hardware memory errors" 369 bool "Enable recovery from hardware memory errors"
370 select MEMORY_ISOLATION 370 select MEMORY_ISOLATION
371 select RAS
371 help 372 help
372 Enables code to recover from some memory failures on systems 373 Enables code to recover from some memory failures on systems
373 with MCA recovery. This allows a system to continue running 374 with MCA recovery. This allows a system to continue running
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 15c0d5ab0893..c53543d89282 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -57,6 +57,7 @@
57#include <linux/mm_inline.h> 57#include <linux/mm_inline.h>
58#include <linux/kfifo.h> 58#include <linux/kfifo.h>
59#include "internal.h" 59#include "internal.h"
60#include "ras/ras_event.h"
60 61
61int sysctl_memory_failure_early_kill __read_mostly = 0; 62int sysctl_memory_failure_early_kill __read_mostly = 0;
62 63
@@ -855,6 +856,8 @@ static struct page_state {
855static void action_result(unsigned long pfn, enum mf_action_page_type type, 856static void action_result(unsigned long pfn, enum mf_action_page_type type,
856 enum mf_result result) 857 enum mf_result result)
857{ 858{
859 trace_memory_failure_event(pfn, type, result);
860
858 pr_err("MCE %#lx: recovery action for %s: %s\n", 861 pr_err("MCE %#lx: recovery action for %s: %s\n",
859 pfn, action_page_types[type], action_name[result]); 862 pfn, action_page_types[type], action_name[result]);
860} 863}