diff options
-rw-r--r-- | Documentation/x86_64/boot-options.txt | 8 | ||||
-rw-r--r-- | Documentation/x86_64/machinecheck | 14 | ||||
-rw-r--r-- | arch/x86_64/kernel/mce.c | 101 |
3 files changed, 80 insertions, 43 deletions
diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index a4595b22e092..945311840a10 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt | |||
@@ -14,9 +14,11 @@ Machine check | |||
14 | mce=nobootlog | 14 | mce=nobootlog |
15 | Disable boot machine check logging. | 15 | Disable boot machine check logging. |
16 | mce=tolerancelevel (number) | 16 | mce=tolerancelevel (number) |
17 | 0: always panic, 1: panic if deadlock possible, | 17 | 0: always panic on uncorrected errors, log corrected errors |
18 | 2: try to avoid panic, 3: never panic or exit (for testing) | 18 | 1: panic or SIGBUS on uncorrected errors, log corrected errors |
19 | default is 1 | 19 | 2: SIGBUS or log uncorrected errors, log corrected errors |
20 | 3: never panic or SIGBUS, log all errors (for testing only) | ||
21 | Default is 1 | ||
20 | Can also be set using sysfs, which is preferable. | 22 | Can also be set using sysfs, which is preferable. |
21 | 23 | ||
22 | nomce (for compatibility with i386): same as mce=off | 24 | nomce (for compatibility with i386): same as mce=off |
diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck index feaeaf6f6e4d..a05e58e7b159 100644 --- a/Documentation/x86_64/machinecheck +++ b/Documentation/x86_64/machinecheck | |||
@@ -49,12 +49,14 @@ tolerant | |||
49 | Since machine check exceptions can happen any time it is sometimes | 49 | Since machine check exceptions can happen any time it is sometimes |
50 | risky for the kernel to kill a process because it defies | 50 | risky for the kernel to kill a process because it defies |
51 | normal kernel locking rules. The tolerance level configures | 51 | normal kernel locking rules. The tolerance level configures |
52 | how hard the kernel tries to recover even at some risk of deadlock. | 52 | how hard the kernel tries to recover even at some risk of |
53 | 53 | deadlock. Higher tolerant values trade potentially better uptime | |
54 | 0: always panic, | 54 | against the risk of a crash or even corruption (for tolerant >= 3). |
55 | 1: panic if deadlock possible, | 55 | |
56 | 2: try to avoid panic, | 56 | 0: always panic on uncorrected errors, log corrected errors |
57 | 3: never panic or exit (for testing only) | 57 | 1: panic or SIGBUS on uncorrected errors, log corrected errors |
58 | 2: SIGBUS or log uncorrected errors, log corrected errors | ||
59 | 3: never panic or SIGBUS, log all errors (for testing only) | ||
58 | 60 | ||
59 | Default: 1 | 61 | Default: 1 |
60 | 62 | ||
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 968613572b9a..7c8ab423abe3 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c | |||
@@ -37,8 +37,13 @@ atomic_t mce_entry; | |||
37 | 37 | ||
38 | static int mce_dont_init; | 38 | static int mce_dont_init; |
39 | 39 | ||
40 | /* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, | 40 | /* |
41 | 3: never panic or exit (for testing only) */ | 41 | * Tolerant levels: |
42 | * 0: always panic on uncorrected errors, log corrected errors | ||
43 | * 1: panic or SIGBUS on uncorrected errors, log corrected errors | ||
44 | * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors | ||
45 | * 3: never panic or SIGBUS, log all errors (for testing only) | ||
46 | */ | ||
42 | static int tolerant = 1; | 47 | static int tolerant = 1; |
43 | static int banks; | 48 | static int banks; |
44 | static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; | 49 | static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; |
@@ -132,9 +137,6 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) | |||
132 | { | 137 | { |
133 | int i; | 138 | int i; |
134 | 139 | ||
135 | if (tolerant >= 3) | ||
136 | return; | ||
137 | |||
138 | oops_begin(); | 140 | oops_begin(); |
139 | for (i = 0; i < MCE_LOG_LEN; i++) { | 141 | for (i = 0; i < MCE_LOG_LEN; i++) { |
140 | unsigned long tsc = mcelog.entry[i].tsc; | 142 | unsigned long tsc = mcelog.entry[i].tsc; |
@@ -178,11 +180,19 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) | |||
178 | void do_machine_check(struct pt_regs * regs, long error_code) | 180 | void do_machine_check(struct pt_regs * regs, long error_code) |
179 | { | 181 | { |
180 | struct mce m, panicm; | 182 | struct mce m, panicm; |
181 | int nowayout = (tolerant < 1); | ||
182 | int kill_it = 0; | ||
183 | u64 mcestart = 0; | 183 | u64 mcestart = 0; |
184 | int i; | 184 | int i; |
185 | int panicm_found = 0; | 185 | int panicm_found = 0; |
186 | /* | ||
187 | * If no_way_out gets set, there is no safe way to recover from this | ||
188 | * MCE. If tolerant is cranked up, we'll try anyway. | ||
189 | */ | ||
190 | int no_way_out = 0; | ||
191 | /* | ||
192 | * If kill_it gets set, there might be a way to recover from this | ||
193 | * error. | ||
194 | */ | ||
195 | int kill_it = 0; | ||
186 | 196 | ||
187 | atomic_inc(&mce_entry); | 197 | atomic_inc(&mce_entry); |
188 | 198 | ||
@@ -194,8 +204,9 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
194 | memset(&m, 0, sizeof(struct mce)); | 204 | memset(&m, 0, sizeof(struct mce)); |
195 | m.cpu = smp_processor_id(); | 205 | m.cpu = smp_processor_id(); |
196 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); | 206 | rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); |
207 | /* if the restart IP is not valid, we're done for */ | ||
197 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) | 208 | if (!(m.mcgstatus & MCG_STATUS_RIPV)) |
198 | kill_it = 1; | 209 | no_way_out = 1; |
199 | 210 | ||
200 | rdtscll(mcestart); | 211 | rdtscll(mcestart); |
201 | barrier(); | 212 | barrier(); |
@@ -214,10 +225,18 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
214 | continue; | 225 | continue; |
215 | 226 | ||
216 | if (m.status & MCI_STATUS_EN) { | 227 | if (m.status & MCI_STATUS_EN) { |
217 | /* In theory _OVER could be a nowayout too, but | 228 | /* if PCC was set, there's no way out */ |
218 | assume any overflowed errors were no fatal. */ | 229 | no_way_out |= !!(m.status & MCI_STATUS_PCC); |
219 | nowayout |= !!(m.status & MCI_STATUS_PCC); | 230 | /* |
220 | kill_it |= !!(m.status & MCI_STATUS_UC); | 231 | * If this error was uncorrectable and there was |
232 | * an overflow, we're in trouble. If no overflow, | ||
233 | * we might get away with just killing a task. | ||
234 | */ | ||
235 | if (m.status & MCI_STATUS_UC) { | ||
236 | if (tolerant < 1 || m.status & MCI_STATUS_OVER) | ||
237 | no_way_out = 1; | ||
238 | kill_it = 1; | ||
239 | } | ||
221 | } | 240 | } |
222 | 241 | ||
223 | if (m.status & MCI_STATUS_MISCV) | 242 | if (m.status & MCI_STATUS_MISCV) |
@@ -228,7 +247,6 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
228 | mce_get_rip(&m, regs); | 247 | mce_get_rip(&m, regs); |
229 | if (error_code >= 0) | 248 | if (error_code >= 0) |
230 | rdtscll(m.tsc); | 249 | rdtscll(m.tsc); |
231 | wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); | ||
232 | if (error_code != -2) | 250 | if (error_code != -2) |
233 | mce_log(&m); | 251 | mce_log(&m); |
234 | 252 | ||
@@ -251,37 +269,52 @@ void do_machine_check(struct pt_regs * regs, long error_code) | |||
251 | the last one (shouldn't happen, just being safe). */ | 269 | the last one (shouldn't happen, just being safe). */ |
252 | if (!panicm_found) | 270 | if (!panicm_found) |
253 | panicm = m; | 271 | panicm = m; |
254 | if (nowayout) | 272 | |
273 | /* | ||
274 | * If we have decided that we just CAN'T continue, and the user | ||
275 | * has not set tolerant to an insane level, give up and die. | ||
276 | */ | ||
277 | if (no_way_out && tolerant < 3) | ||
255 | mce_panic("Machine check", &panicm, mcestart); | 278 | mce_panic("Machine check", &panicm, mcestart); |
256 | if (kill_it) { | 279 | |
280 | /* | ||
281 | * If the error seems to be unrecoverable, something should be | ||
282 | * done. Try to kill as little as possible. If we can kill just | ||
283 | * one task, do that. If the user has set the tolerance very | ||
284 | * high, don't try to do anything at all. | ||
285 | */ | ||
286 | if (kill_it && tolerant < 3) { | ||
257 | int user_space = 0; | 287 | int user_space = 0; |
258 | 288 | ||
259 | if (m.mcgstatus & MCG_STATUS_RIPV) | 289 | /* |
290 | * If the EIPV bit is set, the saved IP points to the | ||
291 | * instruction which caused the MCE. | ||
292 | */ | ||
293 | if (m.mcgstatus & MCG_STATUS_EIPV) | ||
260 | user_space = panicm.rip && (panicm.cs & 3); | 294 | user_space = panicm.rip && (panicm.cs & 3); |
261 | 295 | ||
262 | /* When the machine was in user space and the CPU didn't get | 296 | /* |
263 | confused it's normally not necessary to panic, unless you | 297 | * If we know that the error was in user space, send a |
264 | are paranoid (tolerant == 0) | 298 | * SIGBUS. Otherwise, panic if tolerance is low. |
265 | 299 | * | |
266 | RED-PEN could be more tolerant for MCEs in idle, | 300 | * do_exit() takes an awful lot of locks and has a slight |
267 | but most likely they occur at boot anyways, where | 301 | * risk of deadlocking. |
268 | it is best to just halt the machine. */ | 302 | */ |
269 | if ((!user_space && (panic_on_oops || tolerant < 2)) || | 303 | if (user_space) { |
270 | (unsigned)current->pid <= 1) | ||
271 | mce_panic("Uncorrected machine check", &panicm, mcestart); | ||
272 | |||
273 | /* do_exit takes an awful lot of locks and has as | ||
274 | slight risk of deadlocking. If you don't want that | ||
275 | don't set tolerant >= 2 */ | ||
276 | if (tolerant < 3) | ||
277 | do_exit(SIGBUS); | 304 | do_exit(SIGBUS); |
305 | } else if (panic_on_oops || tolerant < 2) { | ||
306 | mce_panic("Uncorrected machine check", | ||
307 | &panicm, mcestart); | ||
308 | } | ||
278 | } | 309 | } |
279 | 310 | ||
280 | /* notify userspace ASAP */ | 311 | /* notify userspace ASAP */ |
281 | set_thread_flag(TIF_MCE_NOTIFY); | 312 | set_thread_flag(TIF_MCE_NOTIFY); |
282 | 313 | ||
283 | out: | 314 | out: |
284 | /* Last thing done in the machine check exception to clear state. */ | 315 | /* the last thing we do is clear state */ |
316 | for (i = 0; i < banks; i++) | ||
317 | wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); | ||
285 | wrmsrl(MSR_IA32_MCG_STATUS, 0); | 318 | wrmsrl(MSR_IA32_MCG_STATUS, 0); |
286 | out2: | 319 | out2: |
287 | atomic_dec(&mce_entry); | 320 | atomic_dec(&mce_entry); |
@@ -506,7 +539,7 @@ static int mce_open(struct inode *inode, struct file *file) | |||
506 | 539 | ||
507 | spin_unlock(&mce_state_lock); | 540 | spin_unlock(&mce_state_lock); |
508 | 541 | ||
509 | return 0; | 542 | return nonseekable_open(inode, file); |
510 | } | 543 | } |
511 | 544 | ||
512 | static int mce_release(struct inode *inode, struct file *file) | 545 | static int mce_release(struct inode *inode, struct file *file) |