diff options
author | Xie XiuQi <xiexiuqi@huawei.com> | 2015-06-24 19:57:36 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-24 20:49:43 -0400 |
commit | 97f0b13452198290799fd6780f05fbaa74f927d3 (patch) | |
tree | 06401ca906140b76ed4206968a412eb860bd65c0 | |
parent | cc3e2af42e7b7e0457b93bf17c19b44c635cd40c (diff) |
tracing: add trace event for memory-failure
RAS user space tools such as rasdaemon, which are based on trace events, can
receive MCE error events, but there is no event reporting the result of memory
recovery. Add such an event to make this scenario complete.
This patch adds an event to the ras group for memory-failure.
The output looks like this:
# tracer: nop
#
# entries-in-buffer/entries-written: 2/2 #P:24
#
# _-----=> irqs-off
# / _----=> need-resched
# | / _---=> hardirq/softirq
# || / _--=> preempt-depth
# ||| / delay
# TASK-PID CPU# |||| TIMESTAMP FUNCTION
# | | | |||| | |
mce-inject-13150 [001] .... 277.019359: memory_failure_event: pfn 0x19869: recovery action for free buddy page: Delayed
[xiexiuqi@huawei.com: fix build error]
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Chen Gong <gong.chen@linux.intel.com>
Cc: Jim Davis <jim.epost@gmail.com>
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/ras/ras_event.h | 85 | ||||
-rw-r--r-- | mm/Kconfig | 1 | ||||
-rw-r--r-- | mm/memory-failure.c | 3 |
3 files changed, 89 insertions, 0 deletions
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 79abb9c71772..1443d79e4fe6 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/pci.h> | 11 | #include <linux/pci.h> |
12 | #include <linux/aer.h> | 12 | #include <linux/aer.h> |
13 | #include <linux/cper.h> | 13 | #include <linux/cper.h> |
14 | #include <linux/mm.h> | ||
14 | 15 | ||
15 | /* | 16 | /* |
16 | * MCE Extended Error Log trace event | 17 | * MCE Extended Error Log trace event |
@@ -232,6 +233,90 @@ TRACE_EVENT(aer_event, | |||
232 | __print_flags(__entry->status, "|", aer_uncorrectable_errors)) | 233 | __print_flags(__entry->status, "|", aer_uncorrectable_errors)) |
233 | ); | 234 | ); |
234 | 235 | ||
236 | /* | ||
237 | * memory-failure recovery action result event | ||
238 | * | ||
239 | * unsigned long pfn - Page Frame Number of the corrupted page | ||
240 | * int type - Page type of the corrupted page | ||
241 | * int result - Result of recovery action | ||
242 | */ | ||
243 | |||
244 | #ifdef CONFIG_MEMORY_FAILURE | ||
245 | #define MF_ACTION_RESULT \ | ||
246 | EM ( MF_IGNORED, "Ignored" ) \ | ||
247 | EM ( MF_FAILED, "Failed" ) \ | ||
248 | EM ( MF_DELAYED, "Delayed" ) \ | ||
249 | EMe ( MF_RECOVERED, "Recovered" ) | ||
250 | |||
251 | #define MF_PAGE_TYPE \ | ||
252 | EM ( MF_MSG_KERNEL, "reserved kernel page" ) \ | ||
253 | EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \ | ||
254 | EM ( MF_MSG_SLAB, "kernel slab page" ) \ | ||
255 | EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \ | ||
256 | EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \ | ||
257 | EM ( MF_MSG_HUGE, "huge page" ) \ | ||
258 | EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ | ||
259 | EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ | ||
260 | EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ | ||
261 | EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ | ||
262 | EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" ) \ | ||
263 | EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" ) \ | ||
264 | EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" ) \ | ||
265 | EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" ) \ | ||
266 | EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" ) \ | ||
267 | EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \ | ||
268 | EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ | ||
269 | EM ( MF_MSG_BUDDY, "free buddy page" ) \ | ||
270 | EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \ | ||
271 | EMe ( MF_MSG_UNKNOWN, "unknown page" ) | ||
272 | |||
273 | /* | ||
274 | * First define the enums in MF_ACTION_RESULT and MF_PAGE_TYPE to be exported to userspace | ||
275 | * via TRACE_DEFINE_ENUM(). | ||
276 | */ | ||
277 | #undef EM | ||
278 | #undef EMe | ||
279 | #define EM(a, b) TRACE_DEFINE_ENUM(a); | ||
280 | #define EMe(a, b) TRACE_DEFINE_ENUM(a); | ||
281 | |||
282 | MF_ACTION_RESULT | ||
283 | MF_PAGE_TYPE | ||
284 | |||
285 | /* | ||
286 | * Now redefine the EM() and EMe() macros to map the enums to the strings | ||
287 | * that will be printed in the output. | ||
288 | */ | ||
289 | #undef EM | ||
290 | #undef EMe | ||
291 | #define EM(a, b) { a, b }, | ||
292 | #define EMe(a, b) { a, b } | ||
293 | |||
294 | TRACE_EVENT(memory_failure_event, | ||
295 | TP_PROTO(unsigned long pfn, | ||
296 | int type, | ||
297 | int result), | ||
298 | |||
299 | TP_ARGS(pfn, type, result), | ||
300 | |||
301 | TP_STRUCT__entry( | ||
302 | __field(unsigned long, pfn) | ||
303 | __field(int, type) | ||
304 | __field(int, result) | ||
305 | ), | ||
306 | |||
307 | TP_fast_assign( | ||
308 | __entry->pfn = pfn; | ||
309 | __entry->type = type; | ||
310 | __entry->result = result; | ||
311 | ), | ||
312 | |||
313 | TP_printk("pfn %#lx: recovery action for %s: %s", | ||
314 | __entry->pfn, | ||
315 | __print_symbolic(__entry->type, MF_PAGE_TYPE), | ||
316 | __print_symbolic(__entry->result, MF_ACTION_RESULT) | ||
317 | ) | ||
318 | ); | ||
319 | #endif /* CONFIG_MEMORY_FAILURE */ | ||
235 | #endif /* _TRACE_HW_EVENT_MC_H */ | 320 | #endif /* _TRACE_HW_EVENT_MC_H */ |
236 | 321 | ||
237 | /* This part must be outside protection */ | 322 | /* This part must be outside protection */ |
diff --git a/mm/Kconfig b/mm/Kconfig index 390214da4546..c180af880ed5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -368,6 +368,7 @@ config MEMORY_FAILURE | |||
368 | depends on ARCH_SUPPORTS_MEMORY_FAILURE | 368 | depends on ARCH_SUPPORTS_MEMORY_FAILURE |
369 | bool "Enable recovery from hardware memory errors" | 369 | bool "Enable recovery from hardware memory errors" |
370 | select MEMORY_ISOLATION | 370 | select MEMORY_ISOLATION |
371 | select RAS | ||
371 | help | 372 | help |
372 | Enables code to recover from some memory failures on systems | 373 | Enables code to recover from some memory failures on systems |
373 | with MCA recovery. This allows a system to continue running | 374 | with MCA recovery. This allows a system to continue running |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 15c0d5ab0893..c53543d89282 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/mm_inline.h> | 57 | #include <linux/mm_inline.h> |
58 | #include <linux/kfifo.h> | 58 | #include <linux/kfifo.h> |
59 | #include "internal.h" | 59 | #include "internal.h" |
60 | #include "ras/ras_event.h" | ||
60 | 61 | ||
61 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 62 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
62 | 63 | ||
@@ -855,6 +856,8 @@ static struct page_state { | |||
855 | static void action_result(unsigned long pfn, enum mf_action_page_type type, | 856 | static void action_result(unsigned long pfn, enum mf_action_page_type type, |
856 | enum mf_result result) | 857 | enum mf_result result) |
857 | { | 858 | { |
859 | trace_memory_failure_event(pfn, type, result); | ||
860 | |||
858 | pr_err("MCE %#lx: recovery action for %s: %s\n", | 861 | pr_err("MCE %#lx: recovery action for %s: %s\n", |
859 | pfn, action_page_types[type], action_name[result]); | 862 | pfn, action_page_types[type], action_name[result]); |
860 | } | 863 | } |