aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorHidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>2006-09-26 17:44:37 -0400
committerTony Luck <tony.luck@intel.com>2006-09-26 17:44:37 -0400
commit43ed3baf623410b3fa6ca14a9d3f6deca3493c56 (patch)
treeb086b18adff2af6b2633e239e9d1b26d764ae333
parent816add4e986499145135c4014a7c8a8857f9f3c3 (diff)
[IA64] printing support for MCA/INIT
Printing messages to the console from the MCA/INIT handlers is useful; however, setting oops_in_progress = 1 in them is bound to break something in the kernel. It is especially ugly if the system misbehaves after returning from a recoverable MCA. This patch adds the ia64_mca_printk() function, which collects messages into a temporary, not-so-large message buffer while in the MCA/INIT environment and prints them out later, either after returning to normal context or when the handlers decide to bring the system down. This print function is also exported for use in extended MCA handlers, where it is useful for describing the details of a recovery. NOTE: I don't think it is sane to enlarge the temporary message buffer enough to hold whole stack dumps from INIT, so buffering is disabled during the stack dump from the INIT monarch (= default_monarch_init_process). Please fix this in the future. Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> Acked-by: Russ Anderson <rja@sgi.com> Signed-off-by: Tony Luck <tony.luck@intel.com>
-rw-r--r--arch/ia64/kernel/mca.c216
-rw-r--r--arch/ia64/kernel/mca_drv.c54
-rw-r--r--arch/ia64/kernel/mca_drv.h4
-rw-r--r--arch/ia64/kernel/salinfo.c4
4 files changed, 242 insertions, 36 deletions
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 2fbe4536fe18..98f3b26d7aff 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -54,6 +54,9 @@
54 * 54 *
55 * 2005-10-07 Keith Owens <kaos@sgi.com> 55 * 2005-10-07 Keith Owens <kaos@sgi.com>
56 * Add notify_die() hooks. 56 * Add notify_die() hooks.
57 *
58 * 2006-09-15 Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
59 * Add printing support for MCA/INIT.
57 */ 60 */
58#include <linux/types.h> 61#include <linux/types.h>
59#include <linux/init.h> 62#include <linux/init.h>
@@ -136,11 +139,175 @@ extern void salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe);
136 139
137static int mca_init __initdata; 140static int mca_init __initdata;
138 141
142/*
143 * limited & delayed printing support for MCA/INIT handler
144 */
145
146#define mprintk(fmt...) ia64_mca_printk(fmt)
147
148#define MLOGBUF_SIZE (512+256*NR_CPUS)
149#define MLOGBUF_MSGMAX 256
150static char mlogbuf[MLOGBUF_SIZE];
151static DEFINE_SPINLOCK(mlogbuf_wlock); /* mca context only */
152static DEFINE_SPINLOCK(mlogbuf_rlock); /* normal context only */
153static unsigned long mlogbuf_start;
154static unsigned long mlogbuf_end;
155static unsigned int mlogbuf_finished = 0;
156static unsigned long mlogbuf_timestamp = 0;
157
/* Console loglevel saved by BREAK_LOGLEVEL(); -1 means nothing is saved. */
static int loglevel_save = -1;

/*
 * BREAK_LOGLEVEL - switch printing to urgent mode.
 *
 * Sets oops_in_progress, remembers the current value of the given loglevel
 * variable (only if no value is saved yet, so nested calls keep the
 * outermost value) and raises the loglevel to 15.
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement and is safe as the body of an unbraced if/else.
 */
#define BREAK_LOGLEVEL(__console_loglevel)			\
	do {							\
		oops_in_progress = 1;				\
		if (loglevel_save < 0)				\
			loglevel_save = __console_loglevel;	\
		__console_loglevel = 15;			\
	} while (0)

/*
 * RESTORE_LOGLEVEL - undo BREAK_LOGLEVEL.
 *
 * Restores the saved loglevel (if any), marks nothing as saved, and clears
 * both mlogbuf_finished and oops_in_progress.
 */
#define RESTORE_LOGLEVEL(__console_loglevel)			\
	do {							\
		if (loglevel_save >= 0) {			\
			__console_loglevel = loglevel_save;	\
			loglevel_save = -1;			\
		}						\
		mlogbuf_finished = 0;				\
		oops_in_progress = 0;				\
	} while (0)
172
173/*
174 * Push messages into buffer, print them later if not urgent.
175 */
176void ia64_mca_printk(const char *fmt, ...)
177{
178 va_list args;
179 int printed_len;
180 char temp_buf[MLOGBUF_MSGMAX];
181 char *p;
182
183 va_start(args, fmt);
184 printed_len = vscnprintf(temp_buf, sizeof(temp_buf), fmt, args);
185 va_end(args);
186
187 /* Copy the output into mlogbuf */
188 if (oops_in_progress) {
189 /* mlogbuf was abandoned, use printk directly instead. */
190 printk(temp_buf);
191 } else {
192 spin_lock(&mlogbuf_wlock);
193 for (p = temp_buf; *p; p++) {
194 unsigned long next = (mlogbuf_end + 1) % MLOGBUF_SIZE;
195 if (next != mlogbuf_start) {
196 mlogbuf[mlogbuf_end] = *p;
197 mlogbuf_end = next;
198 } else {
199 /* buffer full */
200 break;
201 }
202 }
203 mlogbuf[mlogbuf_end] = '\0';
204 spin_unlock(&mlogbuf_wlock);
205 }
206}
207EXPORT_SYMBOL(ia64_mca_printk);
208
209/*
210 * Print buffered messages.
211 * NOTE: call this after returning normal context. (ex. from salinfod)
212 */
213void ia64_mlogbuf_dump(void)
214{
215 char temp_buf[MLOGBUF_MSGMAX];
216 char *p;
217 unsigned long index;
218 unsigned long flags;
219 unsigned int printed_len;
220
221 /* Get output from mlogbuf */
222 while (mlogbuf_start != mlogbuf_end) {
223 temp_buf[0] = '\0';
224 p = temp_buf;
225 printed_len = 0;
226
227 spin_lock_irqsave(&mlogbuf_rlock, flags);
228
229 index = mlogbuf_start;
230 while (index != mlogbuf_end) {
231 *p = mlogbuf[index];
232 index = (index + 1) % MLOGBUF_SIZE;
233 if (!*p)
234 break;
235 p++;
236 if (++printed_len >= MLOGBUF_MSGMAX - 1)
237 break;
238 }
239 *p = '\0';
240 if (temp_buf[0])
241 printk(temp_buf);
242 mlogbuf_start = index;
243
244 mlogbuf_timestamp = 0;
245 spin_unlock_irqrestore(&mlogbuf_rlock, flags);
246 }
247}
248EXPORT_SYMBOL(ia64_mlogbuf_dump);
249
250/*
251 * Call this if system is going to down or if immediate flushing messages to
252 * console is required. (ex. recovery was failed, crash dump is going to be
253 * invoked, long-wait rendezvous etc.)
254 * NOTE: this should be called from monarch.
255 */
256static void ia64_mlogbuf_finish(int wait)
257{
258 BREAK_LOGLEVEL(console_loglevel);
259
260 spin_lock_init(&mlogbuf_rlock);
261 ia64_mlogbuf_dump();
262 printk(KERN_EMERG "mlogbuf_finish: printing switched to urgent mode, "
263 "MCA/INIT might be dodgy or fail.\n");
264
265 if (!wait)
266 return;
267
268 /* wait for console */
269 printk("Delaying for 5 seconds...\n");
270 udelay(5*1000000);
271
272 mlogbuf_finished = 1;
273}
274EXPORT_SYMBOL(ia64_mlogbuf_finish);
275
276/*
277 * Print buffered messages from INIT context.
278 */
279static void ia64_mlogbuf_dump_from_init(void)
280{
281 if (mlogbuf_finished)
282 return;
283
284 if (mlogbuf_timestamp && (mlogbuf_timestamp + 30*HZ > jiffies)) {
285 printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT "
286 " and the system seems to be messed up.\n");
287 ia64_mlogbuf_finish(0);
288 return;
289 }
290
291 if (!spin_trylock(&mlogbuf_rlock)) {
292 printk(KERN_ERR "INIT: mlogbuf_dump is interrupted by INIT. "
293 "Generated messages other than stack dump will be "
294 "buffered to mlogbuf and will be printed later.\n");
295 printk(KERN_ERR "INIT: If messages would not printed after "
296 "this INIT, wait 30sec and assert INIT again.\n");
297 if (!mlogbuf_timestamp)
298 mlogbuf_timestamp = jiffies;
299 return;
300 }
301 spin_unlock(&mlogbuf_rlock);
302 ia64_mlogbuf_dump();
303}
139 304
140static void inline 305static void inline
141ia64_mca_spin(const char *func) 306ia64_mca_spin(const char *func)
142{ 307{
143 printk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func); 308 if (monarch_cpu == smp_processor_id())
309 ia64_mlogbuf_finish(0);
310 mprintk(KERN_EMERG "%s: spinning here, not returning to SAL\n", func);
144 while (1) 311 while (1)
145 cpu_relax(); 312 cpu_relax();
146} 313}
@@ -988,18 +1155,22 @@ ia64_wait_for_slaves(int monarch, const char *type)
988 } 1155 }
989 if (!missing) 1156 if (!missing)
990 goto all_in; 1157 goto all_in;
991 printk(KERN_INFO "OS %s slave did not rendezvous on cpu", type); 1158 /*
1159 * Maybe slave(s) dead. Print buffered messages immediately.
1160 */
1161 ia64_mlogbuf_finish(0);
1162 mprintk(KERN_INFO "OS %s slave did not rendezvous on cpu", type);
992 for_each_online_cpu(c) { 1163 for_each_online_cpu(c) {
993 if (c == monarch) 1164 if (c == monarch)
994 continue; 1165 continue;
995 if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) 1166 if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE)
996 printk(" %d", c); 1167 mprintk(" %d", c);
997 } 1168 }
998 printk("\n"); 1169 mprintk("\n");
999 return; 1170 return;
1000 1171
1001all_in: 1172all_in:
1002 printk(KERN_INFO "All OS %s slaves have reached rendezvous\n", type); 1173 mprintk(KERN_INFO "All OS %s slaves have reached rendezvous\n", type);
1003 return; 1174 return;
1004} 1175}
1005 1176
@@ -1027,10 +1198,8 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
1027 struct ia64_mca_notify_die nd = 1198 struct ia64_mca_notify_die nd =
1028 { .sos = sos, .monarch_cpu = &monarch_cpu }; 1199 { .sos = sos, .monarch_cpu = &monarch_cpu };
1029 1200
1030 oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */ 1201 mprintk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d "
1031 console_loglevel = 15; /* make sure printks make it to console */ 1202 "monarch=%ld\n", sos->proc_state_param, cpu, sos->monarch);
1032 printk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d monarch=%ld\n",
1033 sos->proc_state_param, cpu, sos->monarch);
1034 1203
1035 previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA"); 1204 previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA");
1036 monarch_cpu = cpu; 1205 monarch_cpu = cpu;
@@ -1066,6 +1235,9 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
1066 rh->severity = sal_log_severity_corrected; 1235 rh->severity = sal_log_severity_corrected;
1067 ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA); 1236 ia64_sal_clear_state_info(SAL_INFO_TYPE_MCA);
1068 sos->os_status = IA64_MCA_CORRECTED; 1237 sos->os_status = IA64_MCA_CORRECTED;
1238 } else {
1239 /* Dump buffered message to console */
1240 ia64_mlogbuf_finish(1);
1069 } 1241 }
1070 if (notify_die(DIE_MCA_MONARCH_LEAVE, "MCA", regs, (long)&nd, 0, recover) 1242 if (notify_die(DIE_MCA_MONARCH_LEAVE, "MCA", regs, (long)&nd, 0, recover)
1071 == NOTIFY_STOP) 1243 == NOTIFY_STOP)
@@ -1305,6 +1477,15 @@ default_monarch_init_process(struct notifier_block *self, unsigned long val, voi
1305 struct task_struct *g, *t; 1477 struct task_struct *g, *t;
1306 if (val != DIE_INIT_MONARCH_PROCESS) 1478 if (val != DIE_INIT_MONARCH_PROCESS)
1307 return NOTIFY_DONE; 1479 return NOTIFY_DONE;
1480
1481 /*
1482 * FIXME: mlogbuf will brim over with INIT stack dumps.
1483 * To enable show_stack from INIT, we use oops_in_progress which should
1484 * be used in real oops. This would cause something wrong after INIT.
1485 */
1486 BREAK_LOGLEVEL(console_loglevel);
1487 ia64_mlogbuf_dump_from_init();
1488
1308 printk(KERN_ERR "Processes interrupted by INIT -"); 1489 printk(KERN_ERR "Processes interrupted by INIT -");
1309 for_each_online_cpu(c) { 1490 for_each_online_cpu(c) {
1310 struct ia64_sal_os_state *s; 1491 struct ia64_sal_os_state *s;
@@ -1326,6 +1507,8 @@ default_monarch_init_process(struct notifier_block *self, unsigned long val, voi
1326 } while_each_thread (g, t); 1507 } while_each_thread (g, t);
1327 read_unlock(&tasklist_lock); 1508 read_unlock(&tasklist_lock);
1328 } 1509 }
1510 /* FIXME: This will not restore zapped printk locks. */
1511 RESTORE_LOGLEVEL(console_loglevel);
1329 return NOTIFY_DONE; 1512 return NOTIFY_DONE;
1330} 1513}
1331 1514
@@ -1357,12 +1540,9 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
1357 struct ia64_mca_notify_die nd = 1540 struct ia64_mca_notify_die nd =
1358 { .sos = sos, .monarch_cpu = &monarch_cpu }; 1541 { .sos = sos, .monarch_cpu = &monarch_cpu };
1359 1542
1360 oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */
1361 console_loglevel = 15; /* make sure printks make it to console */
1362
1363 (void) notify_die(DIE_INIT_ENTER, "INIT", regs, (long)&nd, 0, 0); 1543 (void) notify_die(DIE_INIT_ENTER, "INIT", regs, (long)&nd, 0, 0);
1364 1544
1365 printk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n", 1545 mprintk(KERN_INFO "Entered OS INIT handler. PSP=%lx cpu=%d monarch=%ld\n",
1366 sos->proc_state_param, cpu, sos->monarch); 1546 sos->proc_state_param, cpu, sos->monarch);
1367 salinfo_log_wakeup(SAL_INFO_TYPE_INIT, NULL, 0, 0); 1547 salinfo_log_wakeup(SAL_INFO_TYPE_INIT, NULL, 0, 0);
1368 1548
@@ -1375,7 +1555,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
1375 * fix their proms and get their customers updated. 1555 * fix their proms and get their customers updated.
1376 */ 1556 */
1377 if (!sos->monarch && atomic_add_return(1, &slaves) == num_online_cpus()) { 1557 if (!sos->monarch && atomic_add_return(1, &slaves) == num_online_cpus()) {
1378 printk(KERN_WARNING "%s: Promoting cpu %d to monarch.\n", 1558 mprintk(KERN_WARNING "%s: Promoting cpu %d to monarch.\n",
1379 __FUNCTION__, cpu); 1559 __FUNCTION__, cpu);
1380 atomic_dec(&slaves); 1560 atomic_dec(&slaves);
1381 sos->monarch = 1; 1561 sos->monarch = 1;
@@ -1387,7 +1567,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
1387 * fix their proms and get their customers updated. 1567 * fix their proms and get their customers updated.
1388 */ 1568 */
1389 if (sos->monarch && atomic_add_return(1, &monarchs) > 1) { 1569 if (sos->monarch && atomic_add_return(1, &monarchs) > 1) {
1390 printk(KERN_WARNING "%s: Demoting cpu %d to slave.\n", 1570 mprintk(KERN_WARNING "%s: Demoting cpu %d to slave.\n",
1391 __FUNCTION__, cpu); 1571 __FUNCTION__, cpu);
1392 atomic_dec(&monarchs); 1572 atomic_dec(&monarchs);
1393 sos->monarch = 0; 1573 sos->monarch = 0;
@@ -1408,7 +1588,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
1408 if (notify_die(DIE_INIT_SLAVE_LEAVE, "INIT", regs, (long)&nd, 0, 0) 1588 if (notify_die(DIE_INIT_SLAVE_LEAVE, "INIT", regs, (long)&nd, 0, 0)
1409 == NOTIFY_STOP) 1589 == NOTIFY_STOP)
1410 ia64_mca_spin(__FUNCTION__); 1590 ia64_mca_spin(__FUNCTION__);
1411 printk("Slave on cpu %d returning to normal service.\n", cpu); 1591 mprintk("Slave on cpu %d returning to normal service.\n", cpu);
1412 set_curr_task(cpu, previous_current); 1592 set_curr_task(cpu, previous_current);
1413 ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; 1593 ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
1414 atomic_dec(&slaves); 1594 atomic_dec(&slaves);
@@ -1426,7 +1606,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
1426 * same serial line, the user will need some time to switch out of the BMC before 1606 * same serial line, the user will need some time to switch out of the BMC before
1427 * the dump begins. 1607 * the dump begins.
1428 */ 1608 */
1429 printk("Delaying for 5 seconds...\n"); 1609 mprintk("Delaying for 5 seconds...\n");
1430 udelay(5*1000000); 1610 udelay(5*1000000);
1431 ia64_wait_for_slaves(cpu, "INIT"); 1611 ia64_wait_for_slaves(cpu, "INIT");
1432 /* If nobody intercepts DIE_INIT_MONARCH_PROCESS then we drop through 1612 /* If nobody intercepts DIE_INIT_MONARCH_PROCESS then we drop through
@@ -1439,7 +1619,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
1439 if (notify_die(DIE_INIT_MONARCH_LEAVE, "INIT", regs, (long)&nd, 0, 0) 1619 if (notify_die(DIE_INIT_MONARCH_LEAVE, "INIT", regs, (long)&nd, 0, 0)
1440 == NOTIFY_STOP) 1620 == NOTIFY_STOP)
1441 ia64_mca_spin(__FUNCTION__); 1621 ia64_mca_spin(__FUNCTION__);
1442 printk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu); 1622 mprintk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu);
1443 atomic_dec(&monarchs); 1623 atomic_dec(&monarchs);
1444 set_curr_task(cpu, previous_current); 1624 set_curr_task(cpu, previous_current);
1445 monarch_cpu = -1; 1625 monarch_cpu = -1;
diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c
index 8db6e0cedadc..a45009d2bc90 100644
--- a/arch/ia64/kernel/mca_drv.c
+++ b/arch/ia64/kernel/mca_drv.c
@@ -79,14 +79,30 @@ static int
79fatal_mca(const char *fmt, ...) 79fatal_mca(const char *fmt, ...)
80{ 80{
81 va_list args; 81 va_list args;
82 char buf[256];
82 83
83 va_start(args, fmt); 84 va_start(args, fmt);
84 vprintk(fmt, args); 85 vsnprintf(buf, sizeof(buf), fmt, args);
85 va_end(args); 86 va_end(args);
87 ia64_mca_printk(KERN_ALERT "MCA: %s\n", buf);
86 88
87 return MCA_NOT_RECOVERED; 89 return MCA_NOT_RECOVERED;
88} 90}
89 91
92static int
93mca_recovered(const char *fmt, ...)
94{
95 va_list args;
96 char buf[256];
97
98 va_start(args, fmt);
99 vsnprintf(buf, sizeof(buf), fmt, args);
100 va_end(args);
101 ia64_mca_printk(KERN_INFO "MCA: %s\n", buf);
102
103 return MCA_RECOVERED;
104}
105
90/** 106/**
91 * mca_page_isolate - isolate a poisoned page in order not to use it later 107 * mca_page_isolate - isolate a poisoned page in order not to use it later
92 * @paddr: poisoned memory location 108 * @paddr: poisoned memory location
@@ -140,6 +156,7 @@ mca_page_isolate(unsigned long paddr)
140void 156void
141mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr) 157mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
142{ 158{
159 ia64_mlogbuf_dump();
143 printk(KERN_ERR "OS_MCA: process [cpu %d, pid: %d, uid: %d, " 160 printk(KERN_ERR "OS_MCA: process [cpu %d, pid: %d, uid: %d, "
144 "iip: %p, psr: 0x%lx,paddr: 0x%lx](%s) encounters MCA.\n", 161 "iip: %p, psr: 0x%lx,paddr: 0x%lx](%s) encounters MCA.\n",
145 raw_smp_processor_id(), current->pid, current->uid, 162 raw_smp_processor_id(), current->pid, current->uid,
@@ -440,7 +457,7 @@ recover_from_read_error(slidx_table_t *slidx,
440 457
441 /* Is target address valid? */ 458 /* Is target address valid? */
442 if (!pbci->tv) 459 if (!pbci->tv)
443 return fatal_mca(KERN_ALERT "MCA: target address not valid\n"); 460 return fatal_mca("target address not valid");
444 461
445 /* 462 /*
446 * cpu read or memory-mapped io read 463 * cpu read or memory-mapped io read
@@ -458,7 +475,7 @@ recover_from_read_error(slidx_table_t *slidx,
458 475
459 /* Is minstate valid? */ 476 /* Is minstate valid? */
460 if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate)) 477 if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate))
461 return fatal_mca(KERN_ALERT "MCA: minstate not valid\n"); 478 return fatal_mca("minstate not valid");
462 psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr); 479 psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr);
463 psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr); 480 psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr);
464 481
@@ -492,13 +509,14 @@ recover_from_read_error(slidx_table_t *slidx,
492 psr2->bn = 1; 509 psr2->bn = 1;
493 psr2->i = 0; 510 psr2->i = 0;
494 511
495 return MCA_RECOVERED; 512 return mca_recovered("user memory corruption. "
513 "kill affected process - recovered.");
496 } 514 }
497 515
498 } 516 }
499 517
500 return fatal_mca(KERN_ALERT "MCA: kernel context not recovered," 518 return fatal_mca("kernel context not recovered, iip 0x%lx\n",
501 " iip 0x%lx\n", pmsa->pmsa_iip); 519 pmsa->pmsa_iip);
502} 520}
503 521
504/** 522/**
@@ -584,13 +602,13 @@ recover_from_processor_error(int platform, slidx_table_t *slidx,
584 * The machine check is corrected. 602 * The machine check is corrected.
585 */ 603 */
586 if (psp->cm == 1) 604 if (psp->cm == 1)
587 return MCA_RECOVERED; 605 return mca_recovered("machine check is already corrected.");
588 606
589 /* 607 /*
590 * The error was not contained. Software must be reset. 608 * The error was not contained. Software must be reset.
591 */ 609 */
592 if (psp->us || psp->ci == 0) 610 if (psp->us || psp->ci == 0)
593 return fatal_mca(KERN_ALERT "MCA: error not contained\n"); 611 return fatal_mca("error not contained");
594 612
595 /* 613 /*
596 * The cache check and bus check bits have four possible states 614 * The cache check and bus check bits have four possible states
@@ -601,22 +619,22 @@ recover_from_processor_error(int platform, slidx_table_t *slidx,
601 * 1 1 Memory error, attempt recovery 619 * 1 1 Memory error, attempt recovery
602 */ 620 */
603 if (psp->bc == 0 || pbci == NULL) 621 if (psp->bc == 0 || pbci == NULL)
604 return fatal_mca(KERN_ALERT "MCA: No bus check\n"); 622 return fatal_mca("No bus check");
605 623
606 /* 624 /*
607 * Sorry, we cannot handle so many. 625 * Sorry, we cannot handle so many.
608 */ 626 */
609 if (peidx_bus_check_num(peidx) > 1) 627 if (peidx_bus_check_num(peidx) > 1)
610 return fatal_mca(KERN_ALERT "MCA: Too many bus checks\n"); 628 return fatal_mca("Too many bus checks");
611 /* 629 /*
612 * Well, here is only one bus error. 630 * Well, here is only one bus error.
613 */ 631 */
614 if (pbci->ib) 632 if (pbci->ib)
615 return fatal_mca(KERN_ALERT "MCA: Internal Bus error\n"); 633 return fatal_mca("Internal Bus error");
616 if (pbci->cc) 634 if (pbci->cc)
617 return fatal_mca(KERN_ALERT "MCA: Cache-cache error\n"); 635 return fatal_mca("Cache-cache error");
618 if (pbci->eb && pbci->bsi > 0) 636 if (pbci->eb && pbci->bsi > 0)
619 return fatal_mca(KERN_ALERT "MCA: External bus check fatal status\n"); 637 return fatal_mca("External bus check fatal status");
620 638
621 /* 639 /*
622 * This is a local MCA and estimated as recoverble external bus error. 640 * This is a local MCA and estimated as recoverble external bus error.
@@ -628,7 +646,7 @@ recover_from_processor_error(int platform, slidx_table_t *slidx,
628 /* 646 /*
629 * On account of strange SAL error record, we cannot recover. 647 * On account of strange SAL error record, we cannot recover.
630 */ 648 */
631 return fatal_mca(KERN_ALERT "MCA: Strange SAL record\n"); 649 return fatal_mca("Strange SAL record");
632} 650}
633 651
634/** 652/**
@@ -657,10 +675,10 @@ mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos)
657 675
658 /* Now, OS can recover when there is one processor error section */ 676 /* Now, OS can recover when there is one processor error section */
659 if (n_proc_err > 1) 677 if (n_proc_err > 1)
660 return fatal_mca(KERN_ALERT "MCA: Too Many Errors\n"); 678 return fatal_mca("Too Many Errors");
661 else if (n_proc_err == 0) 679 else if (n_proc_err == 0)
662 /* Weird SAL record ... We need not to recover */ 680 /* Weird SAL record ... We can't do anything */
663 return fatal_mca(KERN_ALERT "MCA: Weird SAL record\n"); 681 return fatal_mca("Weird SAL record");
664 682
665 /* Make index of processor error section */ 683 /* Make index of processor error section */
666 mca_make_peidx((sal_log_processor_info_t*) 684 mca_make_peidx((sal_log_processor_info_t*)
@@ -671,7 +689,7 @@ mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos)
671 689
672 /* Check whether MCA is global or not */ 690 /* Check whether MCA is global or not */
673 if (is_mca_global(&peidx, &pbci, sos)) 691 if (is_mca_global(&peidx, &pbci, sos))
674 return fatal_mca(KERN_ALERT "MCA: global MCA\n"); 692 return fatal_mca("global MCA");
675 693
676 /* Try to recover a processor error */ 694 /* Try to recover a processor error */
677 return recover_from_processor_error(platform_err, &slidx, &peidx, 695 return recover_from_processor_error(platform_err, &slidx, &peidx,
diff --git a/arch/ia64/kernel/mca_drv.h b/arch/ia64/kernel/mca_drv.h
index 31a2e52bb16f..c85e943ba5fd 100644
--- a/arch/ia64/kernel/mca_drv.h
+++ b/arch/ia64/kernel/mca_drv.h
@@ -118,3 +118,7 @@ struct mca_table_entry {
118 118
119extern const struct mca_table_entry *search_mca_tables (unsigned long addr); 119extern const struct mca_table_entry *search_mca_tables (unsigned long addr);
120extern int mca_recover_range(unsigned long); 120extern int mca_recover_range(unsigned long);
121extern void ia64_mca_printk(const char * fmt, ...)
122 __attribute__ ((format (printf, 1, 2)));
123extern void ia64_mlogbuf_dump(void);
124
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index 9065f0f01ba3..e63b8ca5344a 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -266,6 +266,7 @@ salinfo_log_wakeup(int type, u8 *buffer, u64 size, int irqsafe)
266/* Check for outstanding MCA/INIT records every minute (arbitrary) */ 266/* Check for outstanding MCA/INIT records every minute (arbitrary) */
267#define SALINFO_TIMER_DELAY (60*HZ) 267#define SALINFO_TIMER_DELAY (60*HZ)
268static struct timer_list salinfo_timer; 268static struct timer_list salinfo_timer;
269extern void ia64_mlogbuf_dump(void);
269 270
270static void 271static void
271salinfo_timeout_check(struct salinfo_data *data) 272salinfo_timeout_check(struct salinfo_data *data)
@@ -283,6 +284,7 @@ salinfo_timeout_check(struct salinfo_data *data)
283static void 284static void
284salinfo_timeout (unsigned long arg) 285salinfo_timeout (unsigned long arg)
285{ 286{
287 ia64_mlogbuf_dump();
286 salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA); 288 salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_MCA);
287 salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_INIT); 289 salinfo_timeout_check(salinfo_data + SAL_INFO_TYPE_INIT);
288 salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY; 290 salinfo_timer.expires = jiffies + SALINFO_TIMER_DELAY;
@@ -332,6 +334,8 @@ retry:
332 if (cpu == -1) 334 if (cpu == -1)
333 goto retry; 335 goto retry;
334 336
337 ia64_mlogbuf_dump();
338
335 /* for next read, start checking at next CPU */ 339 /* for next read, start checking at next CPU */
336 data->cpu_check = cpu; 340 data->cpu_check = cpu;
337 if (++data->cpu_check == NR_CPUS) 341 if (++data->cpu_check == NR_CPUS)