aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndi Kleen <ak@suse.de>2007-02-13 07:26:23 -0500
committerAndi Kleen <andi@basil.nowhere.org>2007-02-13 07:26:23 -0500
commita98f0dd34d94ea0b5f3816196bea5dba467827bb (patch)
tree019235e4d668b95366dd98dc6474716139c1584b
parent24ce0e96f2dea558762c994d054ea2f3c01fa95a (diff)
[PATCH] x86-64: Allow to run a program when a machine check event is detected
When a machine check event is detected (including an AMD RevF threshold overflow event), allow a "trigger" program to be run. This allows user space to react to such events sooner. The trigger is configured using a new trigger entry in the machinecheck sysfs interface. It is currently shared between all CPUs. I also fixed the AMD threshold handler to run the machine check polling code immediately to actually log any events that might have caused the threshold interrupt. Also added some documentation for the mce sysfs interface. Signed-off-by: Andi Kleen <ak@suse.de>
-rw-r--r--Documentation/x86_64/machinecheck70
-rw-r--r--arch/x86_64/kernel/mce.c66
-rw-r--r--arch/x86_64/kernel/mce_amd.c4
-rw-r--r--include/asm-x86_64/mce.h2
-rw-r--r--kernel/kmod.c44
5 files changed, 160 insertions, 26 deletions
diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck
new file mode 100644
index 000000000000..068a6d9904b9
--- /dev/null
+++ b/Documentation/x86_64/machinecheck
@@ -0,0 +1,70 @@
1
2Configurable sysfs parameters for the x86-64 machine check code.
3
4Machine checks report internal hardware error conditions detected
5by the CPU. Uncorrected errors typically cause a machine check
6(often with panic), corrected ones cause a machine check log entry.
7
8Machine checks are organized in banks (normally associated with
9a hardware subsystem) and subevents in a bank. The exact meaning
10of the banks and subevents is CPU specific.
11
12mcelog knows how to decode them.
13
14When you see the "Machine check errors logged" message in the system
15log then mcelog should run to collect and decode machine check entries
16from /dev/mcelog. Normally mcelog should be run regularly from a cronjob.
17
18Each CPU has a directory in /sys/devices/system/machinecheck/machinecheckN
19(N = CPU number)
20
21The directory contains some configurable entries:
22
23Entries:
24
25bankNctl
26(N bank number)
27 64bit Hex bitmask enabling/disabling specific subevents for bank N
28 When a bit in the bitmask is zero then the respective
29 subevent will not be reported.
30 By default all events are enabled.
31 Note that the BIOS maintains another mask to disable specific events
32 per bank. This is not visible here.
33
34The following entries appear for each CPU, but they are truly shared
35between all CPUs.
36
37check_interval
38 How often to poll for corrected machine check errors, in seconds
39 (Note output is hexadecimal). Default 5 minutes.
40
41tolerant
42 Tolerance level. When a machine check exception occurs for a non
43 corrected machine check the kernel can take different actions.
44 Since machine check exceptions can happen any time it is sometimes
45 risky for the kernel to kill a process because it defies
46 normal kernel locking rules. The tolerance level configures
47 how hard the kernel tries to recover even at some risk of deadlock.
48
49 0: always panic,
50 1: panic if deadlock possible,
51 2: try to avoid panic,
52 3: never panic or exit (for testing only)
53
54 Default: 1
55
56 Note this only makes a difference if the CPU allows recovery
57 from a machine check exception. Current x86 CPUs generally do not.
58
59trigger
60 Program to run when a machine check event is detected.
61 This is an alternative to running mcelog regularly from cron
62 and allows events to be detected faster.
63
64TBD document entries for AMD threshold interrupt configuration
65
66For more details about the x86 machine check architecture
67see the Intel and AMD architecture manuals from their developer websites.
68
69For more details about the architecture see
70http://one.firstfloor.org/~andi/mce.pdf
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index bdb54a2c9f18..8011a8e1c7d4 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -19,6 +19,7 @@
19#include <linux/cpu.h> 19#include <linux/cpu.h>
20#include <linux/percpu.h> 20#include <linux/percpu.h>
21#include <linux/ctype.h> 21#include <linux/ctype.h>
22#include <linux/kmod.h>
22#include <asm/processor.h> 23#include <asm/processor.h>
23#include <asm/msr.h> 24#include <asm/msr.h>
24#include <asm/mce.h> 25#include <asm/mce.h>
@@ -42,6 +43,10 @@ static unsigned long console_logged;
42static int notify_user; 43static int notify_user;
43static int rip_msr; 44static int rip_msr;
44static int mce_bootlog = 1; 45static int mce_bootlog = 1;
46static atomic_t mce_events;
47
48static char trigger[128];
49static char *trigger_argv[2] = { trigger, NULL };
45 50
46/* 51/*
47 * Lockless MCE logging infrastructure. 52 * Lockless MCE logging infrastructure.
@@ -57,6 +62,7 @@ struct mce_log mcelog = {
57void mce_log(struct mce *mce) 62void mce_log(struct mce *mce)
58{ 63{
59 unsigned next, entry; 64 unsigned next, entry;
65 atomic_inc(&mce_events);
60 mce->finished = 0; 66 mce->finished = 0;
61 wmb(); 67 wmb();
62 for (;;) { 68 for (;;) {
@@ -161,6 +167,17 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
161 } 167 }
162} 168}
163 169
/*
 * Start the user-configured trigger program if machine check events have
 * been logged since the trigger was last started.
 *
 * mce_events is bumped by mce_log() for every logged event; mce_logged
 * remembers the counter value seen at the previous trigger start. Nothing
 * happens while no trigger path is configured (trigger[0] == 0).
 */
170static void do_mce_trigger(void)
171{
172 static atomic_t mce_logged;
173 int events = atomic_read(&mce_events);
174 if (events != atomic_read(&mce_logged) && trigger[0]) {
175 /* Small race window, but should be harmless. */
176 atomic_set(&mce_logged, events);
 /* Fire and forget: the return value is ignored. NOTE(review): the -1
    wait argument presumably selects the "don't wait at all" mode that
    this patch adds to call_usermodehelper_keys() - confirm. */
177 call_usermodehelper(trigger, trigger_argv, NULL, -1);
178 }
179}
180
164/* 181/*
165 * The actual machine check handler 182 * The actual machine check handler
166 */ 183 */
@@ -234,8 +251,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
234 } 251 }
235 252
236 /* Never do anything final in the polling timer */ 253 /* Never do anything final in the polling timer */
237 if (!regs) 254 if (!regs) {
255 /* Normal interrupt context here. Call trigger for any new
256 events. */
257 do_mce_trigger();
238 goto out; 258 goto out;
259 }
239 260
240 /* If we didn't find an uncorrectable error, pick 261 /* If we didn't find an uncorrectable error, pick
241 the last one (shouldn't happen, just being safe). */ 262 the last one (shouldn't happen, just being safe). */
@@ -606,17 +627,42 @@ DEFINE_PER_CPU(struct sys_device, device_mce);
606 } \ 627 } \
607 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); 628 static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
608 629
630/* TBD should generate these dynamically based on number of available banks */
609ACCESSOR(bank0ctl,bank[0],mce_restart()) 631ACCESSOR(bank0ctl,bank[0],mce_restart())
610ACCESSOR(bank1ctl,bank[1],mce_restart()) 632ACCESSOR(bank1ctl,bank[1],mce_restart())
611ACCESSOR(bank2ctl,bank[2],mce_restart()) 633ACCESSOR(bank2ctl,bank[2],mce_restart())
612ACCESSOR(bank3ctl,bank[3],mce_restart()) 634ACCESSOR(bank3ctl,bank[3],mce_restart())
613ACCESSOR(bank4ctl,bank[4],mce_restart()) 635ACCESSOR(bank4ctl,bank[4],mce_restart())
614ACCESSOR(bank5ctl,bank[5],mce_restart()) 636ACCESSOR(bank5ctl,bank[5],mce_restart())
615static struct sysdev_attribute * bank_attributes[NR_BANKS] = { 637
616 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, 638static ssize_t show_trigger(struct sys_device *s, char *buf)
617 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl}; 639{
640 strcpy(buf, trigger);
641 strcat(buf, "\n");
642 return strlen(trigger) + 1;
643}
644
/*
 * sysfs store handler for the "trigger" attribute.
 *
 * Copies at most sizeof(trigger)-1 bytes of buf into the shared trigger
 * path, forces NUL termination and cuts the string at the first newline,
 * if there is one.
 *
 * Returns the length of the stored string as measured before the newline
 * is stripped. s and siz are not used.
 */
645static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
646{
647 char *p;
648 int len;
649 strncpy(trigger, buf, sizeof(trigger));
 /* strncpy() leaves the array unterminated when buf fills it completely */
650 trigger[sizeof(trigger)-1] = 0;
651 len = strlen(trigger);
652 p = strchr(trigger, '\n');
 /* strchr() returns NULL when the string has no newline (e.g. an empty
    write); test the pointer itself rather than dereferencing it */
653 if (p) *p = 0;
654 return len;
655}
656
657static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
618ACCESSOR(tolerant,tolerant,) 658ACCESSOR(tolerant,tolerant,)
619ACCESSOR(check_interval,check_interval,mce_restart()) 659ACCESSOR(check_interval,check_interval,mce_restart())
/*
 * sysdev attributes created for (and removed from) each CPU's machinecheck
 * device. The list is NULL-terminated: the create and remove loops walk it
 * until they reach the NULL entry.
 */
660static struct sysdev_attribute *mce_attributes[] = {
661 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
662 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
663 &attr_tolerant, &attr_check_interval, &attr_trigger,
664 NULL
665};
620 666
621/* Per cpu sysdev init. All of the cpus still share the same ctl bank */ 667/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
622static __cpuinit int mce_create_device(unsigned int cpu) 668static __cpuinit int mce_create_device(unsigned int cpu)
@@ -632,11 +678,9 @@ static __cpuinit int mce_create_device(unsigned int cpu)
632 err = sysdev_register(&per_cpu(device_mce,cpu)); 678 err = sysdev_register(&per_cpu(device_mce,cpu));
633 679
634 if (!err) { 680 if (!err) {
635 for (i = 0; i < banks; i++) 681 for (i = 0; mce_attributes[i]; i++)
636 sysdev_create_file(&per_cpu(device_mce,cpu), 682 sysdev_create_file(&per_cpu(device_mce,cpu),
637 bank_attributes[i]); 683 mce_attributes[i]);
638 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant);
639 sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval);
640 } 684 }
641 return err; 685 return err;
642} 686}
@@ -645,11 +689,9 @@ static void mce_remove_device(unsigned int cpu)
645{ 689{
646 int i; 690 int i;
647 691
648 for (i = 0; i < banks; i++) 692 for (i = 0; mce_attributes[i]; i++)
649 sysdev_remove_file(&per_cpu(device_mce,cpu), 693 sysdev_remove_file(&per_cpu(device_mce,cpu),
650 bank_attributes[i]); 694 mce_attributes[i]);
651 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant);
652 sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval);
653 sysdev_unregister(&per_cpu(device_mce,cpu)); 695 sysdev_unregister(&per_cpu(device_mce,cpu));
654 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); 696 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
655} 697}
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c
index cd8dbe57b33a..d0bd5d66e103 100644
--- a/arch/x86_64/kernel/mce_amd.c
+++ b/arch/x86_64/kernel/mce_amd.c
@@ -220,6 +220,10 @@ asmlinkage void mce_threshold_interrupt(void)
220 (high & MASK_LOCKED_HI)) 220 (high & MASK_LOCKED_HI))
221 continue; 221 continue;
222 222
223 /* Log the machine check that caused the threshold
224 event. */
225 do_machine_check(NULL, 0);
226
223 if (high & MASK_OVERFLOW_HI) { 227 if (high & MASK_OVERFLOW_HI) {
224 rdmsrl(address, m.misc); 228 rdmsrl(address, m.misc);
225 rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, 229 rdmsrl(MSR_IA32_MC0_STATUS + bank * 4,
diff --git a/include/asm-x86_64/mce.h b/include/asm-x86_64/mce.h
index 5a11146d6d9c..177e92b4019b 100644
--- a/include/asm-x86_64/mce.h
+++ b/include/asm-x86_64/mce.h
@@ -103,6 +103,8 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
103 103
104extern atomic_t mce_entry; 104extern atomic_t mce_entry;
105 105
106extern void do_machine_check(struct pt_regs *, long);
107
106#endif 108#endif
107 109
108#endif 110#endif
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3a7379aa31ca..796276141e51 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -217,7 +217,10 @@ static int wait_for_helper(void *data)
217 sub_info->retval = ret; 217 sub_info->retval = ret;
218 } 218 }
219 219
220 complete(sub_info->complete); 220 if (sub_info->wait < 0)
221 kfree(sub_info);
222 else
223 complete(sub_info->complete);
221 return 0; 224 return 0;
222} 225}
223 226
@@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work)
239 pid = kernel_thread(____call_usermodehelper, sub_info, 242 pid = kernel_thread(____call_usermodehelper, sub_info,
240 CLONE_VFORK | SIGCHLD); 243 CLONE_VFORK | SIGCHLD);
241 244
245 if (wait < 0)
246 return;
247
242 if (pid < 0) { 248 if (pid < 0) {
243 sub_info->retval = pid; 249 sub_info->retval = pid;
244 complete(sub_info->complete); 250 complete(sub_info->complete);
@@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work)
253 * @envp: null-terminated environment list 259 * @envp: null-terminated environment list
254 * @session_keyring: session keyring for process (NULL for an empty keyring) 260 * @session_keyring: session keyring for process (NULL for an empty keyring)
255 * @wait: wait for the application to finish and return status. 261 * @wait: wait for the application to finish and return status.
262 * when -1 don't wait at all, but you get no useful error back when
263 * the program couldn't be exec'ed. This makes it safe to call
264 * from interrupt context.
256 * 265 *
257 * Runs a user-space application. The application is started 266 * Runs a user-space application. The application is started
258 * asynchronously if wait is not set, and runs as a child of keventd. 267 * asynchronously if wait is not set, and runs as a child of keventd.
@@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
265 struct key *session_keyring, int wait) 274 struct key *session_keyring, int wait)
266{ 275{
267 DECLARE_COMPLETION_ONSTACK(done); 276 DECLARE_COMPLETION_ONSTACK(done);
268 struct subprocess_info sub_info = { 277 struct subprocess_info *sub_info;
269 .work = __WORK_INITIALIZER(sub_info.work, 278 int retval;
270 __call_usermodehelper),
271 .complete = &done,
272 .path = path,
273 .argv = argv,
274 .envp = envp,
275 .ring = session_keyring,
276 .wait = wait,
277 .retval = 0,
278 };
279 279
280 if (!khelper_wq) 280 if (!khelper_wq)
281 return -EBUSY; 281 return -EBUSY;
@@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp,
283 if (path[0] == '\0') 283 if (path[0] == '\0')
284 return 0; 284 return 0;
285 285
286 queue_work(khelper_wq, &sub_info.work); 286 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC);
287 if (!sub_info)
288 return -ENOMEM;
289
290 INIT_WORK(&sub_info->work, __call_usermodehelper);
291 sub_info->complete = &done;
292 sub_info->path = path;
293 sub_info->argv = argv;
294 sub_info->envp = envp;
295 sub_info->ring = session_keyring;
296 sub_info->wait = wait;
297
298 queue_work(khelper_wq, &sub_info->work);
299 if (wait < 0) /* task has freed sub_info */
300 return 0;
287 wait_for_completion(&done); 301 wait_for_completion(&done);
288 return sub_info.retval; 302 retval = sub_info->retval;
303 kfree(sub_info);
304 return retval;
289} 305}
290EXPORT_SYMBOL(call_usermodehelper_keys); 306EXPORT_SYMBOL(call_usermodehelper_keys);
291 307