diff options
author | Gavin Shan <shangw@linux.vnet.ibm.com> | 2014-01-15 00:16:11 -0500 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2014-01-15 01:18:58 -0500 |
commit | 7e4e7867b1e551b7b8f326da3604c47332972bc6 (patch) | |
tree | e071befc52d4aa431b7ebba8e04685de3bf85751 /arch/powerpc/platforms/powernv | |
parent | fac515db45207718168cb55ca4d0a390e43b61af (diff) |
powerpc/eeh: Handle multiple EEH errors
A single PCI-error-related OPAL event can correspond to multiple
EEH errors, for example multiple frozen PEs detected on
different PHBs. Unfortunately, we didn't cover that case. The patch
enumerates the return values of eeh_ops::next_error() and changes
eeh_handle_special_event() and eeh_ops::next_error() to handle all
existing EEH errors.
As Ben pointed out, we needn't use list_for_each_entry_safe() since we
are not deleting any PHB from the hose_list. He also pointed out that
the EEH serialized lock should be held while purging EEH events. The
patch covers both suggestions as well.
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r-- | arch/powerpc/platforms/powernv/eeh-ioda.c | 39 |
1 files changed, 24 insertions, 15 deletions
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index 007ac4989841..8dd16f4675a2 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c | |||
@@ -735,12 +735,12 @@ static int ioda_eeh_get_pe(struct pci_controller *hose, | |||
735 | */ | 735 | */ |
736 | static int ioda_eeh_next_error(struct eeh_pe **pe) | 736 | static int ioda_eeh_next_error(struct eeh_pe **pe) |
737 | { | 737 | { |
738 | struct pci_controller *hose, *tmp; | 738 | struct pci_controller *hose; |
739 | struct pnv_phb *phb; | 739 | struct pnv_phb *phb; |
740 | u64 frozen_pe_no; | 740 | u64 frozen_pe_no; |
741 | u16 err_type, severity; | 741 | u16 err_type, severity; |
742 | long rc; | 742 | long rc; |
743 | int ret = 1; | 743 | int ret = EEH_NEXT_ERR_NONE; |
744 | 744 | ||
745 | /* | 745 | /* |
746 | * While running here, it's safe to purge the event queue. | 746 | * While running here, it's safe to purge the event queue. |
@@ -750,7 +750,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) | |||
750 | eeh_remove_event(NULL); | 750 | eeh_remove_event(NULL); |
751 | opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); | 751 | opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); |
752 | 752 | ||
753 | list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { | 753 | list_for_each_entry(hose, &hose_list, list_node) { |
754 | /* | 754 | /* |
755 | * If the subordinate PCI buses of the PHB has been | 755 | * If the subordinate PCI buses of the PHB has been |
756 | * removed, we needn't take care of it any more. | 756 | * removed, we needn't take care of it any more. |
@@ -789,19 +789,19 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) | |||
789 | switch (err_type) { | 789 | switch (err_type) { |
790 | case OPAL_EEH_IOC_ERROR: | 790 | case OPAL_EEH_IOC_ERROR: |
791 | if (severity == OPAL_EEH_SEV_IOC_DEAD) { | 791 | if (severity == OPAL_EEH_SEV_IOC_DEAD) { |
792 | list_for_each_entry_safe(hose, tmp, | 792 | list_for_each_entry(hose, &hose_list, |
793 | &hose_list, list_node) { | 793 | list_node) { |
794 | phb = hose->private_data; | 794 | phb = hose->private_data; |
795 | phb->eeh_state |= PNV_EEH_STATE_REMOVED; | 795 | phb->eeh_state |= PNV_EEH_STATE_REMOVED; |
796 | } | 796 | } |
797 | 797 | ||
798 | pr_err("EEH: dead IOC detected\n"); | 798 | pr_err("EEH: dead IOC detected\n"); |
799 | ret = 4; | 799 | ret = EEH_NEXT_ERR_DEAD_IOC; |
800 | goto out; | ||
801 | } else if (severity == OPAL_EEH_SEV_INF) { | 800 | } else if (severity == OPAL_EEH_SEV_INF) { |
802 | pr_info("EEH: IOC informative error " | 801 | pr_info("EEH: IOC informative error " |
803 | "detected\n"); | 802 | "detected\n"); |
804 | ioda_eeh_hub_diag(hose); | 803 | ioda_eeh_hub_diag(hose); |
804 | ret = EEH_NEXT_ERR_NONE; | ||
805 | } | 805 | } |
806 | 806 | ||
807 | break; | 807 | break; |
@@ -813,21 +813,20 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) | |||
813 | pr_err("EEH: dead PHB#%x detected\n", | 813 | pr_err("EEH: dead PHB#%x detected\n", |
814 | hose->global_number); | 814 | hose->global_number); |
815 | phb->eeh_state |= PNV_EEH_STATE_REMOVED; | 815 | phb->eeh_state |= PNV_EEH_STATE_REMOVED; |
816 | ret = 3; | 816 | ret = EEH_NEXT_ERR_DEAD_PHB; |
817 | goto out; | ||
818 | } else if (severity == OPAL_EEH_SEV_PHB_FENCED) { | 817 | } else if (severity == OPAL_EEH_SEV_PHB_FENCED) { |
819 | if (ioda_eeh_get_phb_pe(hose, pe)) | 818 | if (ioda_eeh_get_phb_pe(hose, pe)) |
820 | break; | 819 | break; |
821 | 820 | ||
822 | pr_err("EEH: fenced PHB#%x detected\n", | 821 | pr_err("EEH: fenced PHB#%x detected\n", |
823 | hose->global_number); | 822 | hose->global_number); |
824 | ret = 2; | 823 | ret = EEH_NEXT_ERR_FENCED_PHB; |
825 | goto out; | ||
826 | } else if (severity == OPAL_EEH_SEV_INF) { | 824 | } else if (severity == OPAL_EEH_SEV_INF) { |
827 | pr_info("EEH: PHB#%x informative error " | 825 | pr_info("EEH: PHB#%x informative error " |
828 | "detected\n", | 826 | "detected\n", |
829 | hose->global_number); | 827 | hose->global_number); |
830 | ioda_eeh_phb_diag(hose); | 828 | ioda_eeh_phb_diag(hose); |
829 | ret = EEH_NEXT_ERR_NONE; | ||
831 | } | 830 | } |
832 | 831 | ||
833 | break; | 832 | break; |
@@ -837,13 +836,23 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) | |||
837 | 836 | ||
838 | pr_err("EEH: Frozen PE#%x on PHB#%x detected\n", | 837 | pr_err("EEH: Frozen PE#%x on PHB#%x detected\n", |
839 | (*pe)->addr, (*pe)->phb->global_number); | 838 | (*pe)->addr, (*pe)->phb->global_number); |
840 | ret = 1; | 839 | ret = EEH_NEXT_ERR_FROZEN_PE; |
841 | goto out; | 840 | break; |
841 | default: | ||
842 | pr_warn("%s: Unexpected error type %d\n", | ||
843 | __func__, err_type); | ||
842 | } | 844 | } |
845 | |||
846 | /* | ||
847 | * If we have no errors on the specific PHB or only | ||
848 | * informative error there, we continue poking it. | ||
849 | * Otherwise, we need actions to be taken by upper | ||
850 | * layer. | ||
851 | */ | ||
852 | if (ret > EEH_NEXT_ERR_INF) | ||
853 | break; | ||
843 | } | 854 | } |
844 | 855 | ||
845 | ret = 0; | ||
846 | out: | ||
847 | return ret; | 856 | return ret; |
848 | } | 857 | } |
849 | 858 | ||