aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Vishal Verma <vishal.l.verma@intel.com> 2017-04-18 14:42:35 -0400
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2017-04-27 03:10:39 -0400
commit f8bc0881fe95496ddf3f8a16808d9860db207941 (patch)
tree 72d8b3bd7ca801e389295120fc3bc3ff61d8b8f4
parent 6966a6579e1bc7b3486bed395e7d6875912a9a9b (diff)
x86/mce: Make the MCE notifier a blocking one
commit 0dc9c639e6553e39c13b2c0d54c8a1b098cb95e2 upstream.

The NFIT MCE handler callback (for handling media errors on NVDIMMs)
takes a mutex to add the location of a memory error to a list. But since
the notifier call chain for machine checks (x86_mce_decoder_chain) is
atomic, we get a lockdep splat like:

  BUG: sleeping function called from invalid context at kernel/locking/mutex.c:620
  in_atomic(): 1, irqs_disabled(): 0, pid: 4, name: kworker/0:0
  [..]
  Call Trace:
   dump_stack
   ___might_sleep
   __might_sleep
   mutex_lock_nested
   ? __lock_acquire
   nfit_handle_mce
   notifier_call_chain
   atomic_notifier_call_chain
   ? atomic_notifier_call_chain
   mce_gen_pool_process

Convert the notifier to a blocking one which gets to run only in process
context.

Boris: remove the notifier call in atomic context in print_mce(). For
now, let's print the MCE on the atomic path so that we can make sure
they go out and get logged at least.

Fixes: 6839a6d96f4e ("nfit: do an ARS scrub on hitting a latent media error")
Reported-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Cc: x86-ml <x86@kernel.org>
Link: http://lkml.kernel.org/r/20170411224457.24777-1-vishal.l.verma@intel.com
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-genpool.c 2
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-internal.h 2
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c 16
3 files changed, 5 insertions, 15 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/mce-genpool.c b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
index 93d824ec3120..040af1939460 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -85,7 +85,7 @@ void mce_gen_pool_process(void)
85 head = llist_reverse_order(head); 85 head = llist_reverse_order(head);
86 llist_for_each_entry_safe(node, tmp, head, llnode) { 86 llist_for_each_entry_safe(node, tmp, head, llnode) {
87 mce = &node->mce; 87 mce = &node->mce;
88 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); 88 blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
89 gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node)); 89 gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
90 } 90 }
91} 91}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index cd74a3f00aea..de20902ecf23 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -13,7 +13,7 @@ enum severity_level {
13 MCE_PANIC_SEVERITY, 13 MCE_PANIC_SEVERITY,
14}; 14};
15 15
16extern struct atomic_notifier_head x86_mce_decoder_chain; 16extern struct blocking_notifier_head x86_mce_decoder_chain;
17 17
18#define ATTR_LEN 16 18#define ATTR_LEN 16
19#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ 19#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a7fdf453d895..22cda29d654e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -120,7 +120,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
120 * CPU/chipset specific EDAC code can register a notifier call here to print 120 * CPU/chipset specific EDAC code can register a notifier call here to print
121 * MCE errors in a human-readable form. 121 * MCE errors in a human-readable form.
122 */ 122 */
123ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); 123BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
124 124
125/* Do initial initialization of a struct mce */ 125/* Do initial initialization of a struct mce */
126void mce_setup(struct mce *m) 126void mce_setup(struct mce *m)
@@ -213,13 +213,13 @@ void mce_register_decode_chain(struct notifier_block *nb)
213 if (nb != &mce_srao_nb && nb->priority == INT_MAX) 213 if (nb != &mce_srao_nb && nb->priority == INT_MAX)
214 nb->priority -= 1; 214 nb->priority -= 1;
215 215
216 atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); 216 blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
217} 217}
218EXPORT_SYMBOL_GPL(mce_register_decode_chain); 218EXPORT_SYMBOL_GPL(mce_register_decode_chain);
219 219
220void mce_unregister_decode_chain(struct notifier_block *nb) 220void mce_unregister_decode_chain(struct notifier_block *nb)
221{ 221{
222 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); 222 blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
223} 223}
224EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); 224EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
225 225
@@ -272,8 +272,6 @@ struct mca_msr_regs msr_ops = {
272 272
273static void print_mce(struct mce *m) 273static void print_mce(struct mce *m)
274{ 274{
275 int ret = 0;
276
277 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", 275 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
278 m->extcpu, m->mcgstatus, m->bank, m->status); 276 m->extcpu, m->mcgstatus, m->bank, m->status);
279 277
@@ -309,14 +307,6 @@ static void print_mce(struct mce *m)
309 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, 307 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
310 cpu_data(m->extcpu).microcode); 308 cpu_data(m->extcpu).microcode);
311 309
312 /*
313 * Print out human-readable details about the MCE error,
314 * (if the CPU has an implementation for that)
315 */
316 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
317 if (ret == NOTIFY_STOP)
318 return;
319
320 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); 310 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
321} 311}
322 312