aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc
diff options
context:
space:
mode:
authorGavin Shan <gwshan@linux.vnet.ibm.com>2014-06-04 03:31:52 -0400
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2014-06-11 03:04:33 -0400
commit5c7a35e3e25232aef8d7aee484436f8cbe3b9b94 (patch)
tree7bf01ffd5b9e4057089c2c2383fbcf492694fca9 /arch/powerpc
parent6e0fdf9af216887e0032c19d276889aad41cad00 (diff)
powerpc/powernv: Fix killed EEH event
On the PowerNV platform, EEH errors are reported either by IO accessors or by the poller, which is driven by an interrupt. After a PE has been isolated, we won't produce any further EEH events for that PE. With the current implementation, an EEH event can be lost in the following way: The interrupt handler queues one "special" event, which drives the poller. The EEH thread hasn't picked up the special event yet. An IO accessor kicks in, the frozen PE is marked as "isolated", and an EEH event is queued to the list. The EEH thread then runs because of the special event and purges all existing EEH events. However, we never produce another EEH event for the frozen PE. Eventually, the PE is left marked as "isolated" and we don't have an EEH event to recover it. The patch fixes the issue by keeping the EEH events for PEs that have been marked as "isolated", with the help of an additional "force" argument to eeh_remove_event(). Reported-by: Rolf Brudeseth <rolfb@us.ibm.com> Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc')
-rw-r--r--arch/powerpc/include/asm/eeh_event.h2
-rw-r--r--arch/powerpc/kernel/eeh_driver.c4
-rw-r--r--arch/powerpc/kernel/eeh_event.c21
-rw-r--r--arch/powerpc/platforms/powernv/eeh-ioda.c2
4 files changed, 19 insertions, 10 deletions
diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h
index 89d5670b2eeb..1e551a2d6f82 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -33,7 +33,7 @@ struct eeh_event {
33 33
34int eeh_event_init(void); 34int eeh_event_init(void);
35int eeh_send_failure_event(struct eeh_pe *pe); 35int eeh_send_failure_event(struct eeh_pe *pe);
36void eeh_remove_event(struct eeh_pe *pe); 36void eeh_remove_event(struct eeh_pe *pe, bool force);
37void eeh_handle_event(struct eeh_pe *pe); 37void eeh_handle_event(struct eeh_pe *pe);
38 38
39#endif /* __KERNEL__ */ 39#endif /* __KERNEL__ */
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 8bb40e7cdeb6..420da61d4ce0 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -770,7 +770,7 @@ static void eeh_handle_special_event(void)
770 eeh_serialize_lock(&flags); 770 eeh_serialize_lock(&flags);
771 771
772 /* Purge all events */ 772 /* Purge all events */
773 eeh_remove_event(NULL); 773 eeh_remove_event(NULL, true);
774 774
775 list_for_each_entry(hose, &hose_list, list_node) { 775 list_for_each_entry(hose, &hose_list, list_node) {
776 phb_pe = eeh_phb_pe_get(hose); 776 phb_pe = eeh_phb_pe_get(hose);
@@ -789,7 +789,7 @@ static void eeh_handle_special_event(void)
789 eeh_serialize_lock(&flags); 789 eeh_serialize_lock(&flags);
790 790
791 /* Purge all events of the PHB */ 791 /* Purge all events of the PHB */
792 eeh_remove_event(pe); 792 eeh_remove_event(pe, true);
793 793
794 if (rc == EEH_NEXT_ERR_DEAD_PHB) 794 if (rc == EEH_NEXT_ERR_DEAD_PHB)
795 eeh_pe_state_mark(pe, EEH_PE_ISOLATED); 795 eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index 72d748b56c86..4eefb6e34dbb 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -152,24 +152,33 @@ int eeh_send_failure_event(struct eeh_pe *pe)
152/** 152/**
153 * eeh_remove_event - Remove EEH event from the queue 153 * eeh_remove_event - Remove EEH event from the queue
154 * @pe: Event binding to the PE 154 * @pe: Event binding to the PE
155 * @force: Event will be removed unconditionally
155 * 156 *
156 * On PowerNV platform, we might have subsequent coming events 157 * On PowerNV platform, we might have subsequent coming events
157 * is part of the former one. For that case, those subsequent 158 * is part of the former one. For that case, those subsequent
158 * coming events are totally duplicated and unnecessary, thus 159 * coming events are totally duplicated and unnecessary, thus
159 * they should be removed. 160 * they should be removed.
160 */ 161 */
161void eeh_remove_event(struct eeh_pe *pe) 162void eeh_remove_event(struct eeh_pe *pe, bool force)
162{ 163{
163 unsigned long flags; 164 unsigned long flags;
164 struct eeh_event *event, *tmp; 165 struct eeh_event *event, *tmp;
165 166
167 /*
168 * If we have NULL PE passed in, we have dead IOC
169 * or we're sure we can report all existing errors
170 * by the caller.
171 *
172 * With "force", the event with associated PE that
173 * have been isolated, the event won't be removed
174 * to avoid event lost.
175 */
166 spin_lock_irqsave(&eeh_eventlist_lock, flags); 176 spin_lock_irqsave(&eeh_eventlist_lock, flags);
167 list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) { 177 list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) {
168 /* 178 if (!force && event->pe &&
169 * If we don't have valid PE passed in, that means 179 (event->pe->state & EEH_PE_ISOLATED))
170 * we already have event corresponding to dead IOC 180 continue;
171 * and all events should be purged. 181
172 */
173 if (!pe) { 182 if (!pe) {
174 list_del(&event->list); 183 list_del(&event->list);
175 kfree(event); 184 kfree(event);
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 5711f6f1fda6..9c002099f875 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -717,7 +717,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
717 * And we should keep the cached OPAL notifier event sychronized 717 * And we should keep the cached OPAL notifier event sychronized
718 * between the kernel and firmware. 718 * between the kernel and firmware.
719 */ 719 */
720 eeh_remove_event(NULL); 720 eeh_remove_event(NULL, false);
721 opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul); 721 opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
722 722
723 list_for_each_entry(hose, &hose_list, list_node) { 723 list_for_each_entry(hose, &hose_list, list_node) {