diff options
author | Russ Anderson <rja@efs.americas.sgi.com> | 2006-04-27 11:07:08 -0400 |
---|---|---|
committer | Tony Luck <tony.luck@intel.com> | 2006-04-27 17:34:01 -0400 |
commit | 189979619f90fd2eb168fbb9c262569176160624 (patch) | |
tree | 8f274464ac1604d927351448d1e5148c199960b7 /arch | |
parent | cda3d4a069b915cf46e640bb6872a9d9aefeaabe (diff) |
[IA64] Add mca recovery failure messages
When the MCA recovery code encounters a condition that makes
the MCA non-recoverable, print the reason it could not recover.
This makes it easier to identify why a given MCA was not
recovered.
Signed-off-by: Russ Anderson <rja@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/ia64/kernel/mca_drv.c | 54 |
1 files changed, 36 insertions, 18 deletions
diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c index 37c88eb55873..ca6666b51ccb 100644 --- a/arch/ia64/kernel/mca_drv.c +++ b/arch/ia64/kernel/mca_drv.c | |||
@@ -62,6 +62,11 @@ typedef enum { | |||
62 | ISOLATE_NONE | 62 | ISOLATE_NONE |
63 | } isolate_status_t; | 63 | } isolate_status_t; |
64 | 64 | ||
65 | typedef enum { | ||
66 | MCA_NOT_RECOVERED = 0, | ||
67 | MCA_RECOVERED = 1 | ||
68 | } recovery_status_t; | ||
69 | |||
65 | /* | 70 | /* |
66 | * This pool keeps pointers to the section part of SAL error record | 71 | * This pool keeps pointers to the section part of SAL error record |
67 | */ | 72 | */ |
@@ -71,6 +76,18 @@ static struct { | |||
71 | int max_idx; /* Maximum index of section pointer list pool */ | 76 | int max_idx; /* Maximum index of section pointer list pool */ |
72 | } slidx_pool; | 77 | } slidx_pool; |
73 | 78 | ||
79 | static int | ||
80 | fatal_mca(const char *fmt, ...) | ||
81 | { | ||
82 | va_list args; | ||
83 | |||
84 | va_start(args, fmt); | ||
85 | vprintk(fmt, args); | ||
86 | va_end(args); | ||
87 | |||
88 | return MCA_NOT_RECOVERED; | ||
89 | } | ||
90 | |||
74 | /** | 91 | /** |
75 | * mca_page_isolate - isolate a poisoned page in order not to use it later | 92 | * mca_page_isolate - isolate a poisoned page in order not to use it later |
76 | * @paddr: poisoned memory location | 93 | * @paddr: poisoned memory location |
@@ -424,7 +441,7 @@ recover_from_read_error(slidx_table_t *slidx, | |||
424 | 441 | ||
425 | /* Is target address valid? */ | 442 | /* Is target address valid? */ |
426 | if (!pbci->tv) | 443 | if (!pbci->tv) |
427 | return 0; | 444 | return fatal_mca(KERN_ALERT "MCA: target address not valid\n"); |
428 | 445 | ||
429 | /* | 446 | /* |
430 | * cpu read or memory-mapped io read | 447 | * cpu read or memory-mapped io read |
@@ -442,7 +459,7 @@ recover_from_read_error(slidx_table_t *slidx, | |||
442 | 459 | ||
443 | /* Is minstate valid? */ | 460 | /* Is minstate valid? */ |
444 | if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate)) | 461 | if (!peidx_bottom(peidx) || !(peidx_bottom(peidx)->valid.minstate)) |
445 | return 0; | 462 | return fatal_mca(KERN_ALERT "MCA: minstate not valid\n"); |
446 | psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr); | 463 | psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr); |
447 | psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr); | 464 | psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr); |
448 | 465 | ||
@@ -476,12 +493,13 @@ recover_from_read_error(slidx_table_t *slidx, | |||
476 | psr2->bn = 1; | 493 | psr2->bn = 1; |
477 | psr2->i = 0; | 494 | psr2->i = 0; |
478 | 495 | ||
479 | return 1; | 496 | return MCA_RECOVERED; |
480 | } | 497 | } |
481 | 498 | ||
482 | } | 499 | } |
483 | 500 | ||
484 | return 0; | 501 | return fatal_mca(KERN_ALERT "MCA: kernel context not recovered," |
502 | " iip 0x%lx\n", pmsa->pmsa_iip); | ||
485 | } | 503 | } |
486 | 504 | ||
487 | /** | 505 | /** |
@@ -567,13 +585,13 @@ recover_from_processor_error(int platform, slidx_table_t *slidx, | |||
567 | * The machine check is corrected. | 585 | * The machine check is corrected. |
568 | */ | 586 | */ |
569 | if (psp->cm == 1) | 587 | if (psp->cm == 1) |
570 | return 1; | 588 | return MCA_RECOVERED; |
571 | 589 | ||
572 | /* | 590 | /* |
573 | * The error was not contained. Software must be reset. | 591 | * The error was not contained. Software must be reset. |
574 | */ | 592 | */ |
575 | if (psp->us || psp->ci == 0) | 593 | if (psp->us || psp->ci == 0) |
576 | return 0; | 594 | return fatal_mca(KERN_ALERT "MCA: error not contained\n"); |
577 | 595 | ||
578 | /* | 596 | /* |
579 | * The cache check and bus check bits have four possible states | 597 | * The cache check and bus check bits have four possible states |
@@ -584,20 +602,22 @@ recover_from_processor_error(int platform, slidx_table_t *slidx, | |||
584 | * 1 1 Memory error, attempt recovery | 602 | * 1 1 Memory error, attempt recovery |
585 | */ | 603 | */ |
586 | if (psp->bc == 0 || pbci == NULL) | 604 | if (psp->bc == 0 || pbci == NULL) |
587 | return 0; | 605 | return fatal_mca(KERN_ALERT "MCA: No bus check\n"); |
588 | 606 | ||
589 | /* | 607 | /* |
590 | * Sorry, we cannot handle so many. | 608 | * Sorry, we cannot handle so many. |
591 | */ | 609 | */ |
592 | if (peidx_bus_check_num(peidx) > 1) | 610 | if (peidx_bus_check_num(peidx) > 1) |
593 | return 0; | 611 | return fatal_mca(KERN_ALERT "MCA: Too many bus checks\n"); |
594 | /* | 612 | /* |
595 | * Well, here is only one bus error. | 613 | * Well, here is only one bus error. |
596 | */ | 614 | */ |
597 | if (pbci->ib || pbci->cc) | 615 | if (pbci->ib) |
598 | return 0; | 616 | return fatal_mca(KERN_ALERT "MCA: Internal Bus error\n"); |
617 | if (pbci->cc) | ||
618 | return fatal_mca(KERN_ALERT "MCA: Cache-cache error\n"); | ||
599 | if (pbci->eb && pbci->bsi > 0) | 619 | if (pbci->eb && pbci->bsi > 0) |
600 | return 0; | 620 | return fatal_mca(KERN_ALERT "MCA: External bus check fatal status\n"); |
601 | 621 | ||
602 | /* | 622 | /* |
603 | * This is a local MCA and estimated as recoverble external bus error. | 623 | * This is a local MCA and estimated as recoverble external bus error. |
@@ -609,7 +629,7 @@ recover_from_processor_error(int platform, slidx_table_t *slidx, | |||
609 | /* | 629 | /* |
610 | * On account of strange SAL error record, we cannot recover. | 630 | * On account of strange SAL error record, we cannot recover. |
611 | */ | 631 | */ |
612 | return 0; | 632 | return fatal_mca(KERN_ALERT "MCA: Strange SAL record\n"); |
613 | } | 633 | } |
614 | 634 | ||
615 | /** | 635 | /** |
@@ -638,12 +658,10 @@ mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos) | |||
638 | 658 | ||
639 | /* Now, OS can recover when there is one processor error section */ | 659 | /* Now, OS can recover when there is one processor error section */ |
640 | if (n_proc_err > 1) | 660 | if (n_proc_err > 1) |
641 | return 0; | 661 | return fatal_mca(KERN_ALERT "MCA: Too Many Errors\n"); |
642 | else if (n_proc_err == 0) { | 662 | else if (n_proc_err == 0) |
643 | /* Weird SAL record ... We need not to recover */ | 663 | /* Weird SAL record ... We need not to recover */ |
644 | 664 | return fatal_mca(KERN_ALERT "MCA: Weird SAL record\n"); | |
645 | return 1; | ||
646 | } | ||
647 | 665 | ||
648 | /* Make index of processor error section */ | 666 | /* Make index of processor error section */ |
649 | mca_make_peidx((sal_log_processor_info_t*) | 667 | mca_make_peidx((sal_log_processor_info_t*) |
@@ -654,7 +672,7 @@ mca_try_to_recover(void *rec, struct ia64_sal_os_state *sos) | |||
654 | 672 | ||
655 | /* Check whether MCA is global or not */ | 673 | /* Check whether MCA is global or not */ |
656 | if (is_mca_global(&peidx, &pbci, sos)) | 674 | if (is_mca_global(&peidx, &pbci, sos)) |
657 | return 0; | 675 | return fatal_mca(KERN_ALERT "MCA: global MCA\n"); |
658 | 676 | ||
659 | /* Try to recover a processor error */ | 677 | /* Try to recover a processor error */ |
660 | return recover_from_processor_error(platform_err, &slidx, &peidx, | 678 | return recover_from_processor_error(platform_err, &slidx, &peidx, |