author		Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>	2013-03-20 06:01:29 -0400
committer	Tony Luck <tony.luck@intel.com>	2013-04-02 17:04:01 -0400
commit		7a0c819d28f5c91955854e048766d6afef7c8a3d (patch)
tree		539295c0861de4e4e971f8895c13816df975fbe2
parent		07961ac7c0ee8b546658717034fe692fd12eefa9 (diff)
x86/mce: Rework cmci_rediscover() to play well with CPU hotplug
Dave Jones reports that offlining a CPU leads to this trace:

numa_remove_cpu cpu 1 node 0: mask now 0,2-3
smpboot: CPU 1 is now offline
BUG: using smp_processor_id() in preemptible [00000000] code: cpu-offline.sh/10591
caller is cmci_rediscover+0x6a/0xe0
Pid: 10591, comm: cpu-offline.sh Not tainted 3.9.0-rc3+ #2
Call Trace:
 [<ffffffff81333bbd>] debug_smp_processor_id+0xdd/0x100
 [<ffffffff8101edba>] cmci_rediscover+0x6a/0xe0
 [<ffffffff815f5b9f>] mce_cpu_callback+0x19d/0x1ae
 [<ffffffff8160ea66>] notifier_call_chain+0x66/0x150
 [<ffffffff8107ad7e>] __raw_notifier_call_chain+0xe/0x10
 [<ffffffff8104c2e3>] cpu_notify+0x23/0x50
 [<ffffffff8104c31e>] cpu_notify_nofail+0xe/0x20
 [<ffffffff815ef082>] _cpu_down+0x302/0x350
 [<ffffffff815ef106>] cpu_down+0x36/0x50
 [<ffffffff815f1c9d>] store_online+0x8d/0xd0
 [<ffffffff813edc48>] dev_attr_store+0x18/0x30
 [<ffffffff81226eeb>] sysfs_write_file+0xdb/0x150
 [<ffffffff811adfb2>] vfs_write+0xa2/0x170
 [<ffffffff811ae16c>] sys_write+0x4c/0xa0
 [<ffffffff81613019>] system_call_fastpath+0x16/0x1b

However, a look at cmci_rediscover shows that it can be simplified quite a bit,
apart from solving the above issue. It invokes functions that take spin locks
with interrupts disabled, and hence it can run in atomic context. Also, it is
run in the CPU_POST_DEAD phase, so the dying CPU is already dead and out of
the cpu_online_mask. So take these points into account and simplify the code,
and thereby also fix the above issue.

Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
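For illustration only (not part of the patch), a minimal sketch of the
on_each_cpu() pattern the rework relies on: the callback is run on every
online CPU in IPI context with interrupts disabled, so it may take
IRQ-disabled spinlocks but must not sleep, and a CPU that has already been
removed from cpu_online_mask (as in the CPU_POST_DEAD phase) is never
visited. Only on_each_cpu() and smp_processor_id() below are real kernel
APIs; the other names are placeholders.

#include <linux/smp.h>		/* on_each_cpu(), smp_processor_id() */
#include <linux/printk.h>	/* pr_info() */

/* Placeholder callback: runs on each online CPU with interrupts
 * disabled (IPI context), so it must not sleep. */
static void rediscover_on_this_cpu(void *info)
{
	pr_info("rediscover on CPU %d\n", smp_processor_id());
}

static void rediscover_all(void)
{
	/* wait == 1: return only after every online CPU has run the
	 * callback; offlined CPUs are not in cpu_online_mask and are
	 * therefore skipped automatically. */
	on_each_cpu(rediscover_on_this_cpu, NULL, 1);
}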
-rw-r--r--	arch/x86/include/asm/mce.h		4
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce.c	2
-rw-r--r--	arch/x86/kernel/cpu/mcheck/mce_intel.c	25
3 files changed, 8 insertions(+), 23 deletions(-)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f4076af1f4ed..fa5f71e021d5 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -146,13 +146,13 @@ DECLARE_PER_CPU(struct device *, mce_device);
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
 void cmci_clear(void);
 void cmci_reenable(void);
-void cmci_rediscover(int dying);
+void cmci_rediscover(void);
 void cmci_recheck(void);
 #else
 static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
 static inline void cmci_clear(void) {}
 static inline void cmci_reenable(void) {}
-static inline void cmci_rediscover(int dying) {}
+static inline void cmci_rediscover(void) {}
 static inline void cmci_recheck(void) {}
 #endif
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 7bc126346ace..9239504b41cb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2358,7 +2358,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 	if (action == CPU_POST_DEAD) {
 		/* intentionally ignoring frozen here */
-		cmci_rediscover(cpu);
+		cmci_rediscover();
 	}
 
 	return NOTIFY_OK;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 402c454fbff0..ae1697c2afe3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -285,39 +285,24 @@ void cmci_clear(void)
 	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
 }
 
-static long cmci_rediscover_work_func(void *arg)
+static void cmci_rediscover_work_func(void *arg)
 {
 	int banks;
 
 	/* Recheck banks in case CPUs don't all have the same */
 	if (cmci_supported(&banks))
 		cmci_discover(banks);
-
-	return 0;
 }
 
-/*
- * After a CPU went down cycle through all the others and rediscover
- * Must run in process context.
- */
-void cmci_rediscover(int dying)
+/* After a CPU went down cycle through all the others and rediscover */
+void cmci_rediscover(void)
 {
-	int cpu, banks;
+	int banks;
 
 	if (!cmci_supported(&banks))
 		return;
 
-	for_each_online_cpu(cpu) {
-		if (cpu == dying)
-			continue;
-
-		if (cpu == smp_processor_id()) {
-			cmci_rediscover_work_func(NULL);
-			continue;
-		}
-
-		work_on_cpu(cpu, cmci_rediscover_work_func, NULL);
-	}
+	on_each_cpu(cmci_rediscover_work_func, NULL, 1);
 }
 
 /*