aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/platforms/powernv
diff options
context:
space:
mode:
authorGavin Shan <shangw@linux.vnet.ibm.com>2014-01-15 00:16:11 -0500
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2014-01-15 01:18:58 -0500
commit7e4e7867b1e551b7b8f326da3604c47332972bc6 (patch)
treee071befc52d4aa431b7ebba8e04685de3bf85751 /arch/powerpc/platforms/powernv
parentfac515db45207718168cb55ca4d0a390e43b61af (diff)
powerpc/eeh: Handle multiple EEH errors
A single PCI-error-related OPAL event may correspond to multiple EEH errors, for example multiple frozen PEs detected on different PHBs. Unfortunately, we didn't cover that case. The patch enumerates the return values of eeh_ops::next_error() and changes eeh_handle_special_event() and eeh_ops::next_error() to handle all existing EEH errors. As Ben pointed out, we needn't use list_for_each_entry_safe() since we are not deleting any PHB from the hose_list, and the EEH serialized lock should be held while purging EEH events. The patch covers those suggestions as well. Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r--arch/powerpc/platforms/powernv/eeh-ioda.c39
1 files changed, 24 insertions, 15 deletions
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 007ac4989841..8dd16f4675a2 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -735,12 +735,12 @@ static int ioda_eeh_get_pe(struct pci_controller *hose,
735 */ 735 */
736static int ioda_eeh_next_error(struct eeh_pe **pe) 736static int ioda_eeh_next_error(struct eeh_pe **pe)
737{ 737{
738 struct pci_controller *hose, *tmp; 738 struct pci_controller *hose;
739 struct pnv_phb *phb; 739 struct pnv_phb *phb;
740 u64 frozen_pe_no; 740 u64 frozen_pe_no;
741 u16 err_type, severity; 741 u16 err_type, severity;
742 long rc; 742 long rc;
743 int ret = 1; 743 int ret = EEH_NEXT_ERR_NONE;
744 744
745 /* 745 /*
746 * While running here, it's safe to purge the event queue. 746 * While running here, it's safe to purge the event queue.
@@ -750,7 +750,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
750 eeh_remove_event(NULL); 750 eeh_remove_event(NULL);
751 opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); 751 opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
752 752
753 list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 753 list_for_each_entry(hose, &hose_list, list_node) {
754 /* 754 /*
755 * If the subordinate PCI buses of the PHB has been 755 * If the subordinate PCI buses of the PHB has been
756 * removed, we needn't take care of it any more. 756 * removed, we needn't take care of it any more.
@@ -789,19 +789,19 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
789 switch (err_type) { 789 switch (err_type) {
790 case OPAL_EEH_IOC_ERROR: 790 case OPAL_EEH_IOC_ERROR:
791 if (severity == OPAL_EEH_SEV_IOC_DEAD) { 791 if (severity == OPAL_EEH_SEV_IOC_DEAD) {
792 list_for_each_entry_safe(hose, tmp, 792 list_for_each_entry(hose, &hose_list,
793 &hose_list, list_node) { 793 list_node) {
794 phb = hose->private_data; 794 phb = hose->private_data;
795 phb->eeh_state |= PNV_EEH_STATE_REMOVED; 795 phb->eeh_state |= PNV_EEH_STATE_REMOVED;
796 } 796 }
797 797
798 pr_err("EEH: dead IOC detected\n"); 798 pr_err("EEH: dead IOC detected\n");
799 ret = 4; 799 ret = EEH_NEXT_ERR_DEAD_IOC;
800 goto out;
801 } else if (severity == OPAL_EEH_SEV_INF) { 800 } else if (severity == OPAL_EEH_SEV_INF) {
802 pr_info("EEH: IOC informative error " 801 pr_info("EEH: IOC informative error "
803 "detected\n"); 802 "detected\n");
804 ioda_eeh_hub_diag(hose); 803 ioda_eeh_hub_diag(hose);
804 ret = EEH_NEXT_ERR_NONE;
805 } 805 }
806 806
807 break; 807 break;
@@ -813,21 +813,20 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
813 pr_err("EEH: dead PHB#%x detected\n", 813 pr_err("EEH: dead PHB#%x detected\n",
814 hose->global_number); 814 hose->global_number);
815 phb->eeh_state |= PNV_EEH_STATE_REMOVED; 815 phb->eeh_state |= PNV_EEH_STATE_REMOVED;
816 ret = 3; 816 ret = EEH_NEXT_ERR_DEAD_PHB;
817 goto out;
818 } else if (severity == OPAL_EEH_SEV_PHB_FENCED) { 817 } else if (severity == OPAL_EEH_SEV_PHB_FENCED) {
819 if (ioda_eeh_get_phb_pe(hose, pe)) 818 if (ioda_eeh_get_phb_pe(hose, pe))
820 break; 819 break;
821 820
822 pr_err("EEH: fenced PHB#%x detected\n", 821 pr_err("EEH: fenced PHB#%x detected\n",
823 hose->global_number); 822 hose->global_number);
824 ret = 2; 823 ret = EEH_NEXT_ERR_FENCED_PHB;
825 goto out;
826 } else if (severity == OPAL_EEH_SEV_INF) { 824 } else if (severity == OPAL_EEH_SEV_INF) {
827 pr_info("EEH: PHB#%x informative error " 825 pr_info("EEH: PHB#%x informative error "
828 "detected\n", 826 "detected\n",
829 hose->global_number); 827 hose->global_number);
830 ioda_eeh_phb_diag(hose); 828 ioda_eeh_phb_diag(hose);
829 ret = EEH_NEXT_ERR_NONE;
831 } 830 }
832 831
833 break; 832 break;
@@ -837,13 +836,23 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
837 836
838 pr_err("EEH: Frozen PE#%x on PHB#%x detected\n", 837 pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
839 (*pe)->addr, (*pe)->phb->global_number); 838 (*pe)->addr, (*pe)->phb->global_number);
840 ret = 1; 839 ret = EEH_NEXT_ERR_FROZEN_PE;
841 goto out; 840 break;
841 default:
842 pr_warn("%s: Unexpected error type %d\n",
843 __func__, err_type);
842 } 844 }
845
846 /*
847 * If we have no errors on the specific PHB or only
848 * informative error there, we continue poking it.
849 * Otherwise, we need actions to be taken by upper
850 * layer.
851 */
852 if (ret > EEH_NEXT_ERR_INF)
853 break;
843 } 854 }
844 855
845 ret = 0;
846out:
847 return ret; 856 return ret;
848} 857}
849 858