aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDave Hansen <dave.hansen@linux.intel.com>2013-06-21 11:51:35 -0400
committerIngo Molnar <mingo@kernel.org>2013-06-23 05:52:56 -0400
commit2ab00456ea8a0d79acb1390659b98416111880b2 (patch)
tree5bda8b0f3f8a530431691e5ab309b6f2f056894e
parentbde96030f438b5eb6fb74f3bdd06d9f68bb3ba00 (diff)
x86: Warn when NMI handlers take large amounts of time
I have a system which is causing all kinds of problems. It has 8 NUMA nodes, and lots of cores that can fight over cachelines. If things are not working _perfectly_, then NMIs can take longer than expected. If we get too many of them backed up to each other, we can easily end up in a situation where we are doing nothing *but* running NMIs. The biggest problem, though, is that this happens _silently_. You might be lucky to get an hrtimer warning, but most of the time the system simply hangs. This patch should at least give us some warning before we fall off the cliff. The warnings look like this: nmi_handle: perf_event_nmi_handler() took: 26095071 ns The message is triggered whenever we notice the longest NMI we've seen to date. You can always view and reset this value via the debugfs interface if you like. Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: paulus@samba.org Cc: acme@ghostprotocols.net Cc: Dave Hansen <dave@sr71.net> Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--arch/x86/kernel/nmi.c29
1 files changed, 28 insertions, 1 deletions
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 60308053fdb2..e9bae4c2f2dd 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -14,6 +14,7 @@
14#include <linux/kprobes.h> 14#include <linux/kprobes.h>
15#include <linux/kdebug.h> 15#include <linux/kdebug.h>
16#include <linux/nmi.h> 16#include <linux/nmi.h>
17#include <linux/debugfs.h>
17#include <linux/delay.h> 18#include <linux/delay.h>
18#include <linux/hardirq.h> 19#include <linux/hardirq.h>
19#include <linux/slab.h> 20#include <linux/slab.h>
@@ -82,6 +83,15 @@ __setup("unknown_nmi_panic", setup_unknown_nmi_panic);
82 83
83#define nmi_to_desc(type) (&nmi_desc[type]) 84#define nmi_to_desc(type) (&nmi_desc[type])
84 85
86static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC;
87static int __init nmi_warning_debugfs(void)
88{
89 debugfs_create_u64("nmi_longest_ns", 0644,
90 arch_debugfs_dir, &nmi_longest_ns);
91 return 0;
92}
93fs_initcall(nmi_warning_debugfs);
94
85static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) 95static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
86{ 96{
87 struct nmi_desc *desc = nmi_to_desc(type); 97 struct nmi_desc *desc = nmi_to_desc(type);
@@ -96,8 +106,25 @@ static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2
96 * can be latched at any given time. Walk the whole list 106 * can be latched at any given time. Walk the whole list
97 * to handle those situations. 107 * to handle those situations.
98 */ 108 */
99 list_for_each_entry_rcu(a, &desc->head, list) 109 list_for_each_entry_rcu(a, &desc->head, list) {
110 u64 before, delta, whole_msecs;
111 int decimal_msecs;
112
113 before = local_clock();
100 handled += a->handler(type, regs); 114 handled += a->handler(type, regs);
115 delta = local_clock() - before;
116
117 if (delta < nmi_longest_ns)
118 continue;
119
120 nmi_longest_ns = delta;
121 whole_msecs = do_div(delta, (1000 * 1000));
122 decimal_msecs = do_div(delta, 1000) % 1000;
123 printk_ratelimited(KERN_INFO
124 "INFO: NMI handler (%ps) took too long to run: "
125 "%lld.%03d msecs\n", a->handler, whole_msecs,
126 decimal_msecs);
127 }
101 128
102 rcu_read_unlock(); 129 rcu_read_unlock();
103 130