diff options
-rw-r--r-- | Documentation/x86_64/machinecheck | 70 | ||||
-rw-r--r-- | arch/x86_64/kernel/mce.c | 66 | ||||
-rw-r--r-- | arch/x86_64/kernel/mce_amd.c | 4 | ||||
-rw-r--r-- | include/asm-x86_64/mce.h | 2 | ||||
-rw-r--r-- | kernel/kmod.c | 44 |
5 files changed, 160 insertions, 26 deletions
diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck new file mode 100644 index 000000000000..068a6d9904b9 --- /dev/null +++ b/Documentation/x86_64/machinecheck | |||
@@ -0,0 +1,70 @@ | |||
1 | |||
2 | Configurable sysfs parameters for the x86-64 machine check code. | ||
3 | |||
4 | Machine checks report internal hardware error conditions detected | ||
5 | by the CPU. Uncorrected errors typically cause a machine check | ||
6 | (often with panic), corrected ones cause a machine check log entry. | ||
7 | |||
8 | Machine checks are organized in banks (normally associated with | ||
9 | a hardware subsystem) and subevents in a bank. The exact meaning | ||
10 | of the banks and subevents is CPU specific. | ||
11 | |||
12 | mcelog knows how to decode them. | ||
13 | |||
14 | When you see the "Machine check errors logged" message in the system | ||
15 | log then mcelog should run to collect and decode machine check entries | ||
16 | from /dev/mcelog. Normally mcelog should be run regularly from a cronjob. | ||
17 | |||
18 | Each CPU has a directory in /sys/devices/system/machinecheck/machinecheckN | ||
19 | (N = CPU number) | ||
20 | |||
21 | The directory contains some configurable entries: | ||
22 | |||
23 | Entries: | ||
24 | |||
25 | bankNctl | ||
26 | (N bank number) | ||
27 | 64bit Hex bitmask enabling/disabling specific subevents for bank N | ||
28 | When a bit in the bitmask is zero then the respective | ||
29 | subevent will not be reported. | ||
30 | By default all events are enabled. | ||
31 | Note that the BIOS maintains another mask to disable specific events | ||
32 | per bank. This is not visible here. | ||
33 | |||
34 | The following entries appear for each CPU, but they are truly shared | ||
35 | between all CPUs. | ||
36 | |||
37 | check_interval | ||
38 | How often to poll for corrected machine check errors, in seconds | ||
39 | (Note output is hexadecimal). Default 5 minutes. | ||
40 | |||
41 | tolerant | ||
42 | Tolerance level. When a machine check exception occurs for an | ||
43 | uncorrected machine check, the kernel can take different actions. | ||
44 | Since machine check exceptions can happen any time it is sometimes | ||
45 | risky for the kernel to kill a process because it defies | ||
46 | normal kernel locking rules. The tolerance level configures | ||
47 | how hard the kernel tries to recover even at some risk of deadlock. | ||
48 | |||
49 | 0: always panic, | ||
50 | 1: panic if deadlock possible, | ||
51 | 2: try to avoid panic, | ||
52 | 3: never panic or exit (for testing only) | ||
53 | |||
54 | Default: 1 | ||
55 | |||
56 | Note this only makes a difference if the CPU allows recovery | ||
57 | from a machine check exception. Current x86 CPUs generally do not. | ||
58 | |||
59 | trigger | ||
60 | Program to run when a machine check event is detected. | ||
61 | This is an alternative to running mcelog regularly from cron | ||
62 | and allows events to be detected faster. | ||
63 | |||
64 | TBD document entries for AMD threshold interrupt configuration | ||
65 | |||
66 | For more details about the x86 machine check architecture | ||
67 | see the Intel and AMD architecture manuals from their developer websites. | ||
68 | |||
69 | For more details about the architecture see | ||
70 | http://one.firstfloor.org/~andi/mce.pdf | ||
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index bdb54a2c9f18..8011a8e1c7d4 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/cpu.h> | 19 | #include <linux/cpu.h> |
20 | #include <linux/percpu.h> | 20 | #include <linux/percpu.h> |
21 | #include <linux/ctype.h> | 21 | #include <linux/ctype.h> |
22 | #include <linux/kmod.h> | ||
22 | #include <asm/processor.h> | 23 | #include <asm/processor.h> |
23 | #include <asm/msr.h> | 24 | #include <asm/msr.h> |
24 | #include <asm/mce.h> | 25 | #include <asm/mce.h> |
@@ -42,6 +43,10 @@ static unsigned long console_logged; | |||
42 | static int notify_user; | 43 | static int notify_user; |
43 | static int rip_msr; | 44 | static int rip_msr; |
44 | static int mce_bootlog = 1; | 45 | static int mce_bootlog = 1; |
46 | static atomic_t mce_events; | ||
47 | |||
48 | static char trigger[128]; | ||
49 | static char *trigger_argv[2] = { trigger, NULL }; | ||
45 | 50 | ||
46 | /* | 51 | /* |
47 | * Lockless MCE logging infrastructure. | 52 | * Lockless MCE logging infrastructure. |
@@ -57,6 +62,7 @@ struct mce_log mcelog = { | |||
57 | void mce_log(struct mce *mce) | 62 | void mce_log(struct mce *mce) |
58 | { | 63 | { |
59 | unsigned next, entry; | 64 | unsigned next, entry; |
65 | atomic_inc(&mce_events); | ||
60 | mce->finished = 0; | 66 | mce->finished = 0; |
61 | wmb(); | 67 | wmb(); |
62 | for (;;) { | 68 | for (;;) { |
@@ -161,6 +167,17 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) | |||
161 | } | 167 | } |
162 | } | 168 | } |
163 | 169 | ||
170 | static void do_mce_trigger(void) | ||
171 | { | ||
172 | static atomic_t mce_logged; | ||
173 | int events = atomic_read(&mce_events); | ||
174 | if (events != atomic_read(&mce_logged) && trigger[0]) { | ||
175 | /* Small race window, but should be harmless. */ | ||
176 | atomic_set(&mce_logged, events); | ||
177 | call_usermodehelper(trigger, trigger_argv, NULL, -1); | ||
178 | } | ||
179 | } | ||
180 | |||
164 | /* | 181 | /* |
165 | * The actual machine check handler | 182 | * The actual machine check handler |
166 | */ | 183 | */ |
@@ -234,8 +251,12 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
234 | } | 251 | } |
235 | 252 | ||
236 | /* Never do anything final in the polling timer */ | 253 | /* Never do anything final in the polling timer */ |
237 | if (!regs) | 254 | if (!regs) { |
255 | /* Normal interrupt context here. Call trigger for any new | ||
256 | events. */ | ||
257 | do_mce_trigger(); | ||
238 | goto out; | 258 | goto out; |
259 | } | ||
239 | 260 | ||
240 | /* If we didn't find an uncorrectable error, pick | 261 | /* If we didn't find an uncorrectable error, pick |
241 | the last one (shouldn't happen, just being safe). */ | 262 | the last one (shouldn't happen, just being safe). */ |
@@ -606,17 +627,42 @@ DEFINE_PER_CPU(struct sys_device, device_mce); | |||
606 | } \ | 627 | } \ |
607 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); | 628 | static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name); |
608 | 629 | ||
630 | /* TBD should generate these dynamically based on number of available banks */ | ||
609 | ACCESSOR(bank0ctl,bank[0],mce_restart()) | 631 | ACCESSOR(bank0ctl,bank[0],mce_restart()) |
610 | ACCESSOR(bank1ctl,bank[1],mce_restart()) | 632 | ACCESSOR(bank1ctl,bank[1],mce_restart()) |
611 | ACCESSOR(bank2ctl,bank[2],mce_restart()) | 633 | ACCESSOR(bank2ctl,bank[2],mce_restart()) |
612 | ACCESSOR(bank3ctl,bank[3],mce_restart()) | 634 | ACCESSOR(bank3ctl,bank[3],mce_restart()) |
613 | ACCESSOR(bank4ctl,bank[4],mce_restart()) | 635 | ACCESSOR(bank4ctl,bank[4],mce_restart()) |
614 | ACCESSOR(bank5ctl,bank[5],mce_restart()) | 636 | ACCESSOR(bank5ctl,bank[5],mce_restart()) |
615 | static struct sysdev_attribute * bank_attributes[NR_BANKS] = { | 637 | |
616 | &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, | 638 | static ssize_t show_trigger(struct sys_device *s, char *buf) |
617 | &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl}; | 639 | { |
640 | strcpy(buf, trigger); | ||
641 | strcat(buf, "\n"); | ||
642 | return strlen(trigger) + 1; | ||
643 | } | ||
644 | |||
645 | static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) | ||
646 | { | ||
647 | char *p; | ||
648 | int len; | ||
649 | strncpy(trigger, buf, sizeof(trigger)); | ||
650 | trigger[sizeof(trigger)-1] = 0; | ||
651 | len = strlen(trigger); | ||
652 | p = strchr(trigger, '\n'); | ||
653 | if (*p) *p = 0; | ||
654 | return len; | ||
655 | } | ||
656 | |||
657 | static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); | ||
618 | ACCESSOR(tolerant,tolerant,) | 658 | ACCESSOR(tolerant,tolerant,) |
619 | ACCESSOR(check_interval,check_interval,mce_restart()) | 659 | ACCESSOR(check_interval,check_interval,mce_restart()) |
660 | static struct sysdev_attribute *mce_attributes[] = { | ||
661 | &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, | ||
662 | &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, | ||
663 | &attr_tolerant, &attr_check_interval, &attr_trigger, | ||
664 | NULL | ||
665 | }; | ||
620 | 666 | ||
621 | /* Per cpu sysdev init. All of the cpus still share the same ctl bank */ | 667 | /* Per cpu sysdev init. All of the cpus still share the same ctl bank */ |
622 | static __cpuinit int mce_create_device(unsigned int cpu) | 668 | static __cpuinit int mce_create_device(unsigned int cpu) |
@@ -632,11 +678,9 @@ static __cpuinit int mce_create_device(unsigned int cpu) | |||
632 | err = sysdev_register(&per_cpu(device_mce,cpu)); | 678 | err = sysdev_register(&per_cpu(device_mce,cpu)); |
633 | 679 | ||
634 | if (!err) { | 680 | if (!err) { |
635 | for (i = 0; i < banks; i++) | 681 | for (i = 0; mce_attributes[i]; i++) |
636 | sysdev_create_file(&per_cpu(device_mce,cpu), | 682 | sysdev_create_file(&per_cpu(device_mce,cpu), |
637 | bank_attributes[i]); | 683 | mce_attributes[i]); |
638 | sysdev_create_file(&per_cpu(device_mce,cpu), &attr_tolerant); | ||
639 | sysdev_create_file(&per_cpu(device_mce,cpu), &attr_check_interval); | ||
640 | } | 684 | } |
641 | return err; | 685 | return err; |
642 | } | 686 | } |
@@ -645,11 +689,9 @@ static void mce_remove_device(unsigned int cpu) | |||
645 | { | 689 | { |
646 | int i; | 690 | int i; |
647 | 691 | ||
648 | for (i = 0; i < banks; i++) | 692 | for (i = 0; mce_attributes[i]; i++) |
649 | sysdev_remove_file(&per_cpu(device_mce,cpu), | 693 | sysdev_remove_file(&per_cpu(device_mce,cpu), |
650 | bank_attributes[i]); | 694 | mce_attributes[i]); |
651 | sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); | ||
652 | sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); | ||
653 | sysdev_unregister(&per_cpu(device_mce,cpu)); | 695 | sysdev_unregister(&per_cpu(device_mce,cpu)); |
654 | memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); | 696 | memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); |
655 | } | 697 | } |
diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index cd8dbe57b33a..d0bd5d66e103 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c | |||
@@ -220,6 +220,10 @@ asmlinkage void mce_threshold_interrupt(void) | |||
220 | (high & MASK_LOCKED_HI)) | 220 | (high & MASK_LOCKED_HI)) |
221 | continue; | 221 | continue; |
222 | 222 | ||
223 | /* Log the machine check that caused the threshold | ||
224 | event. */ | ||
225 | do_machine_check(NULL, 0); | ||
226 | |||
223 | if (high & MASK_OVERFLOW_HI) { | 227 | if (high & MASK_OVERFLOW_HI) { |
224 | rdmsrl(address, m.misc); | 228 | rdmsrl(address, m.misc); |
225 | rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, | 229 | rdmsrl(MSR_IA32_MC0_STATUS + bank * 4, |
diff --git a/include/asm-x86_64/mce.h b/include/asm-x86_64/mce.h index 5a11146d6d9c..177e92b4019b 100644 --- a/include/asm-x86_64/mce.h +++ b/include/asm-x86_64/mce.h | |||
@@ -103,6 +103,8 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status); | |||
103 | 103 | ||
104 | extern atomic_t mce_entry; | 104 | extern atomic_t mce_entry; |
105 | 105 | ||
106 | extern void do_machine_check(struct pt_regs *, long); | ||
107 | |||
106 | #endif | 108 | #endif |
107 | 109 | ||
108 | #endif | 110 | #endif |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 3a7379aa31ca..796276141e51 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -217,7 +217,10 @@ static int wait_for_helper(void *data) | |||
217 | sub_info->retval = ret; | 217 | sub_info->retval = ret; |
218 | } | 218 | } |
219 | 219 | ||
220 | complete(sub_info->complete); | 220 | if (sub_info->wait < 0) |
221 | kfree(sub_info); | ||
222 | else | ||
223 | complete(sub_info->complete); | ||
221 | return 0; | 224 | return 0; |
222 | } | 225 | } |
223 | 226 | ||
@@ -239,6 +242,9 @@ static void __call_usermodehelper(struct work_struct *work) | |||
239 | pid = kernel_thread(____call_usermodehelper, sub_info, | 242 | pid = kernel_thread(____call_usermodehelper, sub_info, |
240 | CLONE_VFORK | SIGCHLD); | 243 | CLONE_VFORK | SIGCHLD); |
241 | 244 | ||
245 | if (wait < 0) | ||
246 | return; | ||
247 | |||
242 | if (pid < 0) { | 248 | if (pid < 0) { |
243 | sub_info->retval = pid; | 249 | sub_info->retval = pid; |
244 | complete(sub_info->complete); | 250 | complete(sub_info->complete); |
@@ -253,6 +259,9 @@ static void __call_usermodehelper(struct work_struct *work) | |||
253 | * @envp: null-terminated environment list | 259 | * @envp: null-terminated environment list |
254 | * @session_keyring: session keyring for process (NULL for an empty keyring) | 260 | * @session_keyring: session keyring for process (NULL for an empty keyring) |
255 | * @wait: wait for the application to finish and return status. | 261 | * @wait: wait for the application to finish and return status. |
262 | * when -1 don't wait at all, but you get no useful error back when | ||
263 | * the program couldn't be exec'ed. This makes it safe to call | ||
264 | * from interrupt context. | ||
256 | * | 265 | * |
257 | * Runs a user-space application. The application is started | 266 | * Runs a user-space application. The application is started |
258 | * asynchronously if wait is not set, and runs as a child of keventd. | 267 | * asynchronously if wait is not set, and runs as a child of keventd. |
@@ -265,17 +274,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, | |||
265 | struct key *session_keyring, int wait) | 274 | struct key *session_keyring, int wait) |
266 | { | 275 | { |
267 | DECLARE_COMPLETION_ONSTACK(done); | 276 | DECLARE_COMPLETION_ONSTACK(done); |
268 | struct subprocess_info sub_info = { | 277 | struct subprocess_info *sub_info; |
269 | .work = __WORK_INITIALIZER(sub_info.work, | 278 | int retval; |
270 | __call_usermodehelper), | ||
271 | .complete = &done, | ||
272 | .path = path, | ||
273 | .argv = argv, | ||
274 | .envp = envp, | ||
275 | .ring = session_keyring, | ||
276 | .wait = wait, | ||
277 | .retval = 0, | ||
278 | }; | ||
279 | 279 | ||
280 | if (!khelper_wq) | 280 | if (!khelper_wq) |
281 | return -EBUSY; | 281 | return -EBUSY; |
@@ -283,9 +283,25 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, | |||
283 | if (path[0] == '\0') | 283 | if (path[0] == '\0') |
284 | return 0; | 284 | return 0; |
285 | 285 | ||
286 | queue_work(khelper_wq, &sub_info.work); | 286 | sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); |
287 | if (!sub_info) | ||
288 | return -ENOMEM; | ||
289 | |||
290 | INIT_WORK(&sub_info->work, __call_usermodehelper); | ||
291 | sub_info->complete = &done; | ||
292 | sub_info->path = path; | ||
293 | sub_info->argv = argv; | ||
294 | sub_info->envp = envp; | ||
295 | sub_info->ring = session_keyring; | ||
296 | sub_info->wait = wait; | ||
297 | |||
298 | queue_work(khelper_wq, &sub_info->work); | ||
299 | if (wait < 0) /* task has freed sub_info */ | ||
300 | return 0; | ||
287 | wait_for_completion(&done); | 301 | wait_for_completion(&done); |
288 | return sub_info.retval; | 302 | retval = sub_info->retval; |
303 | kfree(sub_info); | ||
304 | return retval; | ||
289 | } | 305 | } |
290 | EXPORT_SYMBOL(call_usermodehelper_keys); | 306 | EXPORT_SYMBOL(call_usermodehelper_keys); |
291 | 307 | ||