aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/platforms/pseries/eeh_event.c
diff options
context:
space:
mode:
authorLinas Vepstas <linas@linas.org>2005-11-03 19:50:04 -0500
committerPaul Mackerras <paulus@samba.org>2005-11-09 19:38:05 -0500
commit172ca9261800bacbbc7d320d9924d9b482dff8de (patch)
tree7abd6ddf1e6b9a147a0826c374f0d1bca80806d3 /arch/powerpc/platforms/pseries/eeh_event.c
parent7f79da7accd63a6adb84f4602f66779f6a701e7b (diff)
[PATCH] ppc64: PCI error event dispatcher
12-eeh-event-dispatcher.patch ppc64: EEH Recovery dispatcher thread This patch adds a mechanism to create recovery threads when an EEH event is received. Since an EEH freeze state may be detected within an interrupt context, we need to get out of the interrupt context before starting recovery. This dispatcher does this in two steps: first, it uses a workqueue to get out, and then launches a kernel thread, so that the recovery routine can sleep for extended periods without upsetting the keventd. A kernel thread is created with each EEH event, rather than having one long-running daemon started at boot time. This is because it is anticipated that EEH events will be very rare (very very rare, ideally) and so it's pointless to clutter the process tables with a daemon that will almost never run. Signed-off-by: Linas Vepstas <linas@austin.ibm.com> Signed-off-by: Paul Mackerras <paulus@samba.org>
Diffstat (limited to 'arch/powerpc/platforms/pseries/eeh_event.c')
-rw-r--r--arch/powerpc/platforms/pseries/eeh_event.c155
1 files changed, 155 insertions, 0 deletions
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
new file mode 100644
index 000000000000..92497333c2b6
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -0,0 +1,155 @@
1/*
2 * eeh_event.c
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2005 Linas Vepstas <linas@linas.org>
19 */
20
21#include <linux/list.h>
22#include <linux/pci.h>
23#include <asm/eeh_event.h>
24
25/** Overview:
26 * EEH error states may be detected within exception handlers;
27 * however, the recovery processing needs to occur asynchronously
28 * in a normal kernel context and not an interrupt context.
29 * This pair of routines creates an event and queues it onto a
30 * work-queue, where a worker thread can drive recovery.
31 */
32
/*
 * EEH event queue state.
 *
 * eeh_eventlist holds the pending struct eeh_event entries.  Every access
 * to it in this file is made with eeh_eventlist_lock held via
 * spin_lock_irqsave(), because events may be queued from interrupt
 * context.  eeh_event_wq is the work item whose callback is
 * eeh_thread_launcher().
 */
static DEFINE_SPINLOCK(eeh_eventlist_lock);
LIST_HEAD(eeh_eventlist);
static void eeh_thread_launcher(void *);
DECLARE_WORK(eeh_event_wq, eeh_thread_launcher, NULL);
38
39/**
40 * eeh_panic - call panic() for an eeh event that cannot be handled.
41 * The philosophy of this routine is that it is better to panic and
42 * halt the OS than it is to risk possible data corruption by
43 * oblivious device drivers that don't know better.
44 *
45 * @dev pci device that had an eeh event
46 * @reset_state current reset state of the device slot
47 */
48static void eeh_panic(struct pci_dev *dev, int reset_state)
49{
50 /*
51 * Since the panic_on_oops sysctl is used to halt the system
52 * in light of potential corruption, we can use it here.
53 */
54 if (panic_on_oops) {
55 panic("EEH: MMIO failure (%d) on device:%s\n", reset_state,
56 pci_name(dev));
57 }
58 else {
59 printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n",
60 reset_state, pci_name(dev));
61 }
62}
63
64/**
65 * eeh_event_handler - dispatch EEH events. The detection of a frozen
66 * slot can occur inside an interrupt, where it can be hard to do
67 * anything about it. The goal of this routine is to pull these
68 * detection events out of the context of the interrupt handler, and
69 * re-dispatch them for processing at a later time in a normal context.
70 *
71 * @dummy - unused
72 */
73static int eeh_event_handler(void * dummy)
74{
75 unsigned long flags;
76 struct eeh_event *event;
77
78 daemonize ("eehd");
79
80 while (1) {
81 set_current_state(TASK_INTERRUPTIBLE);
82
83 spin_lock_irqsave(&eeh_eventlist_lock, flags);
84 event = NULL;
85 if (!list_empty(&eeh_eventlist)) {
86 event = list_entry(eeh_eventlist.next, struct eeh_event, list);
87 list_del(&event->list);
88 }
89 spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
90 if (event == NULL)
91 break;
92
93 printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
94 pci_name(event->dev));
95
96 eeh_panic (event->dev, event->state);
97
98 kfree(event);
99 }
100
101 return 0;
102}
103
104/**
105 * eeh_thread_launcher
106 *
107 * @dummy - unused
108 */
109static void eeh_thread_launcher(void *dummy)
110{
111 if (kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL) < 0)
112 printk(KERN_ERR "Failed to start EEH daemon\n");
113}
114
115/**
116 * eeh_send_failure_event - generate a PCI error event
117 * @dev pci device
118 *
119 * This routine can be called within an interrupt context;
120 * the actual event will be delivered in a normal context
121 * (from a workqueue).
122 */
123int eeh_send_failure_event (struct device_node *dn,
124 struct pci_dev *dev,
125 int state,
126 int time_unavail)
127{
128 unsigned long flags;
129 struct eeh_event *event;
130
131 event = kmalloc(sizeof(*event), GFP_ATOMIC);
132 if (event == NULL) {
133 printk (KERN_ERR "EEH: out of memory, event not handled\n");
134 return 1;
135 }
136
137 if (dev)
138 pci_dev_get(dev);
139
140 event->dn = dn;
141 event->dev = dev;
142 event->state = state;
143 event->time_unavail = time_unavail;
144
145 /* We may or may not be called in an interrupt context */
146 spin_lock_irqsave(&eeh_eventlist_lock, flags);
147 list_add(&event->list, &eeh_eventlist);
148 spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
149
150 schedule_work(&eeh_event_wq);
151
152 return 0;
153}
154
155/********************** END OF FILE ******************************/