aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGavin Shan <shangw@linux.vnet.ibm.com>2013-06-20 01:21:12 -0400
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2013-06-20 03:06:40 -0400
commit70f942db4669c4417b7bb4f3353b3eddf1179aae (patch)
tree73f2b88c1b8932ef5930114e179724e92133a449
parentbf90dfea2397fe136ce35bc896c3bc84133272c6 (diff)
powerpc/eeh: I/O chip next error
The patch implements the backend for the EEH core to retrieve the next EEH error to handle. Informational errors are handled by the platform without bothering the EEH core. Otherwise, the EEH core should take appropriate action depending on the return value:
  0 - No further errors detected
  1 - Frozen PE
  2 - Fenced PHB
  3 - Dead PHB
  4 - Dead IOC
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
-rw-r--r--arch/powerpc/platforms/powernv/eeh-ioda.c334
-rw-r--r--arch/powerpc/platforms/powernv/pci.h1
2 files changed, 333 insertions, 2 deletions
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 8d9c2d232d2a..a3eebd193dfe 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -34,6 +34,15 @@
34#include "powernv.h" 34#include "powernv.h"
35#include "pci.h" 35#include "pci.h"
36 36
37/*
 * Debugging option: when IODA_EEH_DBG_ON is defined, IODA_EEH_DBG()
 * forwards its arguments to pr_info(); otherwise it expands to nothing,
 * so its arguments are neither evaluated nor type-checked.
 */
38#ifdef IODA_EEH_DBG_ON
39#define IODA_EEH_DBG(args...) pr_info(args)
40#else
41#define IODA_EEH_DBG(args...)
42#endif
43
/*
 * Buffer for hub diag-data. ioda_eeh_post_init() allocates a single
 * zeroed page for it the first time an IODA1 PHB is initialised; it
 * stays NULL until then.
 */
44static char *hub_diag = NULL;
45
37/** 46/**
38 * ioda_eeh_post_init - Chip dependent post initialization 47 * ioda_eeh_post_init - Chip dependent post initialization
39 * @hose: PCI controller 48 * @hose: PCI controller
@@ -47,8 +56,19 @@ static int ioda_eeh_post_init(struct pci_controller *hose)
47 struct pnv_phb *phb = hose->private_data; 56 struct pnv_phb *phb = hose->private_data;
48 57
49 /* FIXME: Enable it for PHB3 later */ 58 /* FIXME: Enable it for PHB3 later */
50 if (phb->type == PNV_PHB_IODA1) 59 if (phb->type == PNV_PHB_IODA1) {
60 if (!hub_diag) {
61 hub_diag = (char *)__get_free_page(GFP_KERNEL |
62 __GFP_ZERO);
63 if (!hub_diag) {
64 pr_err("%s: Out of memory !\n",
65 __func__);
66 return -ENOMEM;
67 }
68 }
69
51 phb->eeh_enabled = 1; 70 phb->eeh_enabled = 1;
71 }
52 72
53 return 0; 73 return 0;
54} 74}
@@ -498,6 +518,316 @@ static int ioda_eeh_configure_bridge(struct eeh_pe *pe)
498 return 0; 518 return 0;
499} 519}
500 520
521static void ioda_eeh_hub_diag_common(struct OpalIoP7IOCErrorData *data)
522{
523 /* GEM */
524 pr_info(" GEM XFIR: %016llx\n", data->gemXfir);
525 pr_info(" GEM RFIR: %016llx\n", data->gemRfir);
526 pr_info(" GEM RIRQFIR: %016llx\n", data->gemRirqfir);
527 pr_info(" GEM Mask: %016llx\n", data->gemMask);
528 pr_info(" GEM RWOF: %016llx\n", data->gemRwof);
529
530 /* LEM */
531 pr_info(" LEM FIR: %016llx\n", data->lemFir);
532 pr_info(" LEM Error Mask: %016llx\n", data->lemErrMask);
533 pr_info(" LEM Action 0: %016llx\n", data->lemAction0);
534 pr_info(" LEM Action 1: %016llx\n", data->lemAction1);
535 pr_info(" LEM WOF: %016llx\n", data->lemWof);
536}
537
538static void ioda_eeh_hub_diag(struct pci_controller *hose)
539{
540 struct pnv_phb *phb = hose->private_data;
541 struct OpalIoP7IOCErrorData *data;
542 long rc;
543
544 data = (struct OpalIoP7IOCErrorData *)ioda_eeh_hub_diag;
545 rc = opal_pci_get_hub_diag_data(phb->hub_id, data, PAGE_SIZE);
546 if (rc != OPAL_SUCCESS) {
547 pr_warning("%s: Failed to get HUB#%llx diag-data (%ld)\n",
548 __func__, phb->hub_id, rc);
549 return;
550 }
551
552 switch (data->type) {
553 case OPAL_P7IOC_DIAG_TYPE_RGC:
554 pr_info("P7IOC diag-data for RGC\n\n");
555 ioda_eeh_hub_diag_common(data);
556 pr_info(" RGC Status: %016llx\n", data->rgc.rgcStatus);
557 pr_info(" RGC LDCP: %016llx\n", data->rgc.rgcLdcp);
558 break;
559 case OPAL_P7IOC_DIAG_TYPE_BI:
560 pr_info("P7IOC diag-data for BI %s\n\n",
561 data->bi.biDownbound ? "Downbound" : "Upbound");
562 ioda_eeh_hub_diag_common(data);
563 pr_info(" BI LDCP 0: %016llx\n", data->bi.biLdcp0);
564 pr_info(" BI LDCP 1: %016llx\n", data->bi.biLdcp1);
565 pr_info(" BI LDCP 2: %016llx\n", data->bi.biLdcp2);
566 pr_info(" BI Fence Status: %016llx\n", data->bi.biFenceStatus);
567 break;
568 case OPAL_P7IOC_DIAG_TYPE_CI:
569 pr_info("P7IOC diag-data for CI Port %d\\nn",
570 data->ci.ciPort);
571 ioda_eeh_hub_diag_common(data);
572 pr_info(" CI Port Status: %016llx\n", data->ci.ciPortStatus);
573 pr_info(" CI Port LDCP: %016llx\n", data->ci.ciPortLdcp);
574 break;
575 case OPAL_P7IOC_DIAG_TYPE_MISC:
576 pr_info("P7IOC diag-data for MISC\n\n");
577 ioda_eeh_hub_diag_common(data);
578 break;
579 case OPAL_P7IOC_DIAG_TYPE_I2C:
580 pr_info("P7IOC diag-data for I2C\n\n");
581 ioda_eeh_hub_diag_common(data);
582 break;
583 default:
584 pr_warning("%s: Invalid type of HUB#%llx diag-data (%d)\n",
585 __func__, phb->hub_id, data->type);
586 }
587}
588
589static void ioda_eeh_p7ioc_phb_diag(struct pci_controller *hose,
590 struct OpalIoPhbErrorCommon *common)
591{
592 struct OpalIoP7IOCPhbErrorData *data;
593 int i;
594
595 data = (struct OpalIoP7IOCPhbErrorData *)common;
596
597 pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n\n",
598 hose->global_number, common->version);
599
600 pr_info(" brdgCtl: %08x\n", data->brdgCtl);
601
602 pr_info(" portStatusReg: %08x\n", data->portStatusReg);
603 pr_info(" rootCmplxStatus: %08x\n", data->rootCmplxStatus);
604 pr_info(" busAgentStatus: %08x\n", data->busAgentStatus);
605
606 pr_info(" deviceStatus: %08x\n", data->deviceStatus);
607 pr_info(" slotStatus: %08x\n", data->slotStatus);
608 pr_info(" linkStatus: %08x\n", data->linkStatus);
609 pr_info(" devCmdStatus: %08x\n", data->devCmdStatus);
610 pr_info(" devSecStatus: %08x\n", data->devSecStatus);
611
612 pr_info(" rootErrorStatus: %08x\n", data->rootErrorStatus);
613 pr_info(" uncorrErrorStatus: %08x\n", data->uncorrErrorStatus);
614 pr_info(" corrErrorStatus: %08x\n", data->corrErrorStatus);
615 pr_info(" tlpHdr1: %08x\n", data->tlpHdr1);
616 pr_info(" tlpHdr2: %08x\n", data->tlpHdr2);
617 pr_info(" tlpHdr3: %08x\n", data->tlpHdr3);
618 pr_info(" tlpHdr4: %08x\n", data->tlpHdr4);
619 pr_info(" sourceId: %08x\n", data->sourceId);
620
621 pr_info(" errorClass: %016llx\n", data->errorClass);
622 pr_info(" correlator: %016llx\n", data->correlator);
623 pr_info(" p7iocPlssr: %016llx\n", data->p7iocPlssr);
624 pr_info(" p7iocCsr: %016llx\n", data->p7iocCsr);
625 pr_info(" lemFir: %016llx\n", data->lemFir);
626 pr_info(" lemErrorMask: %016llx\n", data->lemErrorMask);
627 pr_info(" lemWOF: %016llx\n", data->lemWOF);
628 pr_info(" phbErrorStatus: %016llx\n", data->phbErrorStatus);
629 pr_info(" phbFirstErrorStatus: %016llx\n", data->phbFirstErrorStatus);
630 pr_info(" phbErrorLog0: %016llx\n", data->phbErrorLog0);
631 pr_info(" phbErrorLog1: %016llx\n", data->phbErrorLog1);
632 pr_info(" mmioErrorStatus: %016llx\n", data->mmioErrorStatus);
633 pr_info(" mmioFirstErrorStatus: %016llx\n", data->mmioFirstErrorStatus);
634 pr_info(" mmioErrorLog0: %016llx\n", data->mmioErrorLog0);
635 pr_info(" mmioErrorLog1: %016llx\n", data->mmioErrorLog1);
636 pr_info(" dma0ErrorStatus: %016llx\n", data->dma0ErrorStatus);
637 pr_info(" dma0FirstErrorStatus: %016llx\n", data->dma0FirstErrorStatus);
638 pr_info(" dma0ErrorLog0: %016llx\n", data->dma0ErrorLog0);
639 pr_info(" dma0ErrorLog1: %016llx\n", data->dma0ErrorLog1);
640 pr_info(" dma1ErrorStatus: %016llx\n", data->dma1ErrorStatus);
641 pr_info(" dma1FirstErrorStatus: %016llx\n", data->dma1FirstErrorStatus);
642 pr_info(" dma1ErrorLog0: %016llx\n", data->dma1ErrorLog0);
643 pr_info(" dma1ErrorLog1: %016llx\n", data->dma1ErrorLog1);
644
645 for (i = 0; i < OPAL_P7IOC_NUM_PEST_REGS; i++) {
646 if ((data->pestA[i] >> 63) == 0 &&
647 (data->pestB[i] >> 63) == 0)
648 continue;
649
650 pr_info(" PE[%3d] PESTA: %016llx\n", i, data->pestA[i]);
651 pr_info(" PESTB: %016llx\n", data->pestB[i]);
652 }
653}
654
655static void ioda_eeh_phb_diag(struct pci_controller *hose)
656{
657 struct pnv_phb *phb = hose->private_data;
658 struct OpalIoPhbErrorCommon *common;
659 long rc;
660
661 common = (struct OpalIoPhbErrorCommon *)phb->diag.blob;
662 rc = opal_pci_get_phb_diag_data2(phb->opal_id, common, PAGE_SIZE);
663 if (rc != OPAL_SUCCESS) {
664 pr_warning("%s: Failed to get diag-data for PHB#%x (%ld)\n",
665 __func__, hose->global_number, rc);
666 return;
667 }
668
669 switch (common->ioType) {
670 case OPAL_PHB_ERROR_DATA_TYPE_P7IOC:
671 ioda_eeh_p7ioc_phb_diag(hose, common);
672 break;
673 default:
674 pr_warning("%s: Unrecognized I/O chip %d\n",
675 __func__, common->ioType);
676 }
677}
678
679static int ioda_eeh_get_phb_pe(struct pci_controller *hose,
680 struct eeh_pe **pe)
681{
682 struct eeh_pe *phb_pe;
683
684 phb_pe = eeh_phb_pe_get(hose);
685 if (!phb_pe) {
686 pr_warning("%s Can't find PE for PHB#%d\n",
687 __func__, hose->global_number);
688 return -EEXIST;
689 }
690
691 *pe = phb_pe;
692 return 0;
693}
694
695static int ioda_eeh_get_pe(struct pci_controller *hose,
696 u16 pe_no, struct eeh_pe **pe)
697{
698 struct eeh_pe *phb_pe, *dev_pe;
699 struct eeh_dev dev;
700
701 /* Find the PHB PE */
702 if (ioda_eeh_get_phb_pe(hose, &phb_pe))
703 return -EEXIST;
704
705 /* Find the PE according to PE# */
706 memset(&dev, 0, sizeof(struct eeh_dev));
707 dev.phb = hose;
708 dev.pe_config_addr = pe_no;
709 dev_pe = eeh_pe_get(&dev);
710 if (!dev_pe) {
711 pr_warning("%s: Can't find PE for PHB#%x - PE#%x\n",
712 __func__, hose->global_number, pe_no);
713 return -EEXIST;
714 }
715
716 *pe = dev_pe;
717 return 0;
718}
719
720/**
721 * ioda_eeh_next_error - Retrieve next error for EEH core to handle
722 * @pe: The affected PE
723 *
724 * The function is expected to be called by EEH core while it gets
725 * special EEH event (without binding PE). The function calls to
726 * OPAL APIs for next error to handle. The informational error is
727 * handled internally by platform. However, the dead IOC, dead PHB,
728 * fenced PHB and frozen PE should be handled by EEH core eventually.
729 */
730static int ioda_eeh_next_error(struct eeh_pe **pe)
731{
732 struct pci_controller *hose, *tmp;
733 struct pnv_phb *phb;
734 u64 frozen_pe_no;
735 u16 err_type, severity;
736 long rc;
737 int ret = 1;
738
739 /* While running here, it's safe to purge the event queue */
740 eeh_remove_event(NULL);
741
742 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
743 /*
744 * If the subordinate PCI buses of the PHB has been
745 * removed, we needn't take care of it any more.
746 */
747 phb = hose->private_data;
748 if (phb->removed)
749 continue;
750
751 rc = opal_pci_next_error(phb->opal_id,
752 &frozen_pe_no, &err_type, &severity);
753
754 /* If OPAL API returns error, we needn't proceed */
755 if (rc != OPAL_SUCCESS) {
756 IODA_EEH_DBG("%s: Invalid return value on "
757 "PHB#%x (0x%lx) from opal_pci_next_error",
758 __func__, hose->global_number, rc);
759 continue;
760 }
761
762 /* If the PHB doesn't have error, stop processing */
763 if (err_type == OPAL_EEH_NO_ERROR ||
764 severity == OPAL_EEH_SEV_NO_ERROR) {
765 IODA_EEH_DBG("%s: No error found on PHB#%x\n",
766 __func__, hose->global_number);
767 continue;
768 }
769
770 /*
771 * Processing the error. We're expecting the error with
772 * highest priority reported upon multiple errors on the
773 * specific PHB.
774 */
775 IODA_EEH_DBG("%s: Error (%d, %d, %d) on PHB#%x\n",
776 err_type, severity, pe_no, hose->global_number);
777 switch (err_type) {
778 case OPAL_EEH_IOC_ERROR:
779 if (severity == OPAL_EEH_SEV_IOC_DEAD) {
780 list_for_each_entry_safe(hose, tmp,
781 &hose_list, list_node) {
782 phb = hose->private_data;
783 phb->removed = 1;
784 }
785
786 WARN(1, "EEH: dead IOC detected\n");
787 ret = 4;
788 goto out;
789 } else if (severity == OPAL_EEH_SEV_INF)
790 ioda_eeh_hub_diag(hose);
791
792 break;
793 case OPAL_EEH_PHB_ERROR:
794 if (severity == OPAL_EEH_SEV_PHB_DEAD) {
795 if (ioda_eeh_get_phb_pe(hose, pe))
796 break;
797
798 WARN(1, "EEH: dead PHB#%x detected\n",
799 hose->global_number);
800 phb->removed = 1;
801 ret = 3;
802 goto out;
803 } else if (severity == OPAL_EEH_SEV_PHB_FENCED) {
804 if (ioda_eeh_get_phb_pe(hose, pe))
805 break;
806
807 WARN(1, "EEH: fenced PHB#%x detected\n",
808 hose->global_number);
809 ret = 2;
810 goto out;
811 } else if (severity == OPAL_EEH_SEV_INF)
812 ioda_eeh_phb_diag(hose);
813
814 break;
815 case OPAL_EEH_PE_ERROR:
816 if (ioda_eeh_get_pe(hose, frozen_pe_no, pe))
817 break;
818
819 WARN(1, "EEH: Frozen PE#%x on PHB#%x detected\n",
820 (*pe)->addr, (*pe)->phb->global_number);
821 ret = 1;
822 goto out;
823 }
824 }
825
826 ret = 0;
827out:
828 return ret;
829}
830
501struct pnv_eeh_ops ioda_eeh_ops = { 831struct pnv_eeh_ops ioda_eeh_ops = {
502 .post_init = ioda_eeh_post_init, 832 .post_init = ioda_eeh_post_init,
503 .set_option = ioda_eeh_set_option, 833 .set_option = ioda_eeh_set_option,
@@ -505,5 +835,5 @@ struct pnv_eeh_ops ioda_eeh_ops = {
505 .reset = ioda_eeh_reset, 835 .reset = ioda_eeh_reset,
506 .get_log = ioda_eeh_get_log, 836 .get_log = ioda_eeh_get_log,
507 .configure_bridge = ioda_eeh_configure_bridge, 837 .configure_bridge = ioda_eeh_configure_bridge,
508 .next_error = NULL 838 .next_error = ioda_eeh_next_error
509}; 839};
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 336c9dc1b314..3656a2409e9a 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -93,6 +93,7 @@ struct pnv_phb {
93#ifdef CONFIG_EEH 93#ifdef CONFIG_EEH
94 struct pnv_eeh_ops *eeh_ops; 94 struct pnv_eeh_ops *eeh_ops;
95 int eeh_enabled; 95 int eeh_enabled;
96 int removed;
96#endif 97#endif
97 98
98#ifdef CONFIG_PCI_MSI 99#ifdef CONFIG_PCI_MSI