aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Gavin Shan <shangw@linux.vnet.ibm.com> 2012-09-17 00:34:27 -0400
committer: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2012-09-18 01:32:48 -0400
commit: feadf7c0a1a7c08c74bebb4a13b755f8c40e3bbc (patch)
tree: 00d8b8c42f57f566618dd0d0aea65190706ca4d8
parent: 8e9f69371536981a2a8c9ee4a49dbe3aa4946df4 (diff)
powerpc/eeh: Lock module while handling EEH event
The EEH core talks with the PCI device driver to determine the action to take (a plain reset, or PCI device removal). During that period, the driver might be unloaded, which in turn causes a kernel crash as follows:

EEH: Detected PCI bus error on PHB#4-PE#10000
EEH: This PCI device has failed 3 times in the last hour
lpfc 0004:01:00.0: 0:2710 PCI channel disable preparing for reset
Unable to handle kernel paging request for data at address 0x00000490
Faulting instruction address: 0xd00000000e682c90
cpu 0x1: Vector: 300 (Data Access) at [c000000fc75ffa20]
    pc: d00000000e682c90: .lpfc_io_error_detected+0x30/0x240 [lpfc]
    lr: d00000000e682c8c: .lpfc_io_error_detected+0x2c/0x240 [lpfc]
    sp: c000000fc75ffca0
   msr: 8000000000009032
   dar: 490
 dsisr: 40000000
  current = 0xc000000fc79b88b0
  paca = 0xc00000000edb0380 softe: 0 irq_happened: 0x00
    pid = 3386, comm = eehd
enter ? for help
[c000000fc75ffca0] c000000fc75ffd30 (unreliable)
[c000000fc75ffd30] c00000000004fd3c .eeh_report_error+0x7c/0xf0
[c000000fc75ffdc0] c00000000004ee00 .eeh_pe_dev_traverse+0xa0/0x180
[c000000fc75ffe70] c00000000004ffd8 .eeh_handle_event+0x68/0x300
[c000000fc75fff00] c0000000000503a0 .eeh_event_handler+0x130/0x1a0
[c000000fc75fff90] c000000000020138 .kernel_thread+0x54/0x70
1:mon>

The patch takes a reference on the corresponding driver module while the EEH core negotiates with the PCI device driver, so that the driver module can't be unloaded during that period and it is safe to call the driver's callbacks.

Cc: stable@vger.kernel.org
Reported-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
-rw-r--r--arch/powerpc/platforms/pseries/eeh_driver.c91
1 files changed, 70 insertions, 21 deletions
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
index 37c2cf743133..a3fefb61097c 100644
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
@@ -25,6 +25,7 @@
25#include <linux/delay.h> 25#include <linux/delay.h>
26#include <linux/interrupt.h> 26#include <linux/interrupt.h>
27#include <linux/irq.h> 27#include <linux/irq.h>
28#include <linux/module.h>
28#include <linux/pci.h> 29#include <linux/pci.h>
29#include <asm/eeh.h> 30#include <asm/eeh.h>
30#include <asm/eeh_event.h> 31#include <asm/eeh_event.h>
@@ -47,6 +48,41 @@ static inline const char *eeh_pcid_name(struct pci_dev *pdev)
47 return ""; 48 return "";
48} 49}
49 50
51/**
52 * eeh_pcid_get - Get the PCI device driver
53 * @pdev: PCI device
54 *
55 * The function is used to retrieve the PCI device driver for
56 * the indicated PCI device. Besides, we will increase the reference
57 * of the PCI device driver to prevent that being unloaded on
58 * the fly. Otherwise, kernel crash would be seen.
59 */
60static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
61{
62 if (!pdev || !pdev->driver)
63 return NULL;
64
65 if (!try_module_get(pdev->driver->driver.owner))
66 return NULL;
67
68 return pdev->driver;
69}
70
71/**
72 * eeh_pcid_put - Dereference on the PCI device driver
73 * @pdev: PCI device
74 *
75 * The function is called to do dereference on the PCI device
76 * driver of the indicated PCI device.
77 */
78static inline void eeh_pcid_put(struct pci_dev *pdev)
79{
80 if (!pdev || !pdev->driver)
81 return;
82
83 module_put(pdev->driver->driver.owner);
84}
85
50#if 0 86#if 0
51static void print_device_node_tree(struct pci_dn *pdn, int dent) 87static void print_device_node_tree(struct pci_dn *pdn, int dent)
52{ 88{
@@ -128,23 +164,24 @@ static void *eeh_report_error(void *data, void *userdata)
128 struct eeh_dev *edev = (struct eeh_dev *)data; 164 struct eeh_dev *edev = (struct eeh_dev *)data;
129 struct pci_dev *dev = eeh_dev_to_pci_dev(edev); 165 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
130 enum pci_ers_result rc, *res = userdata; 166 enum pci_ers_result rc, *res = userdata;
131 struct pci_driver *driver = dev->driver; 167 struct pci_driver *driver;
132 168
133 /* We might not have the associated PCI device, 169 /* We might not have the associated PCI device,
134 * then we should continue for next one. 170 * then we should continue for next one.
135 */ 171 */
136 if (!dev) return NULL; 172 if (!dev) return NULL;
137
138 dev->error_state = pci_channel_io_frozen; 173 dev->error_state = pci_channel_io_frozen;
139 174
140 if (!driver) 175 driver = eeh_pcid_get(dev);
141 return NULL; 176 if (!driver) return NULL;
142 177
143 eeh_disable_irq(dev); 178 eeh_disable_irq(dev);
144 179
145 if (!driver->err_handler || 180 if (!driver->err_handler ||
146 !driver->err_handler->error_detected) 181 !driver->err_handler->error_detected) {
182 eeh_pcid_put(dev);
147 return NULL; 183 return NULL;
184 }
148 185
149 rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); 186 rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen);
150 187
@@ -152,6 +189,7 @@ static void *eeh_report_error(void *data, void *userdata)
152 if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; 189 if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
153 if (*res == PCI_ERS_RESULT_NONE) *res = rc; 190 if (*res == PCI_ERS_RESULT_NONE) *res = rc;
154 191
192 eeh_pcid_put(dev);
155 return NULL; 193 return NULL;
156} 194}
157 195
@@ -171,12 +209,14 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
171 enum pci_ers_result rc, *res = userdata; 209 enum pci_ers_result rc, *res = userdata;
172 struct pci_driver *driver; 210 struct pci_driver *driver;
173 211
174 if (!dev) return NULL; 212 driver = eeh_pcid_get(dev);
213 if (!driver) return NULL;
175 214
176 if (!(driver = dev->driver) || 215 if (!driver->err_handler ||
177 !driver->err_handler || 216 !driver->err_handler->mmio_enabled) {
178 !driver->err_handler->mmio_enabled) 217 eeh_pcid_put(dev);
179 return NULL; 218 return NULL;
219 }
180 220
181 rc = driver->err_handler->mmio_enabled(dev); 221 rc = driver->err_handler->mmio_enabled(dev);
182 222
@@ -184,6 +224,7 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
184 if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; 224 if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
185 if (*res == PCI_ERS_RESULT_NONE) *res = rc; 225 if (*res == PCI_ERS_RESULT_NONE) *res = rc;
186 226
227 eeh_pcid_put(dev);
187 return NULL; 228 return NULL;
188} 229}
189 230
@@ -204,16 +245,19 @@ static void *eeh_report_reset(void *data, void *userdata)
204 enum pci_ers_result rc, *res = userdata; 245 enum pci_ers_result rc, *res = userdata;
205 struct pci_driver *driver; 246 struct pci_driver *driver;
206 247
207 if (!dev || !(driver = dev->driver)) 248 if (!dev) return NULL;
208 return NULL;
209
210 dev->error_state = pci_channel_io_normal; 249 dev->error_state = pci_channel_io_normal;
211 250
251 driver = eeh_pcid_get(dev);
252 if (!driver) return NULL;
253
212 eeh_enable_irq(dev); 254 eeh_enable_irq(dev);
213 255
214 if (!driver->err_handler || 256 if (!driver->err_handler ||
215 !driver->err_handler->slot_reset) 257 !driver->err_handler->slot_reset) {
258 eeh_pcid_put(dev);
216 return NULL; 259 return NULL;
260 }
217 261
218 rc = driver->err_handler->slot_reset(dev); 262 rc = driver->err_handler->slot_reset(dev);
219 if ((*res == PCI_ERS_RESULT_NONE) || 263 if ((*res == PCI_ERS_RESULT_NONE) ||
@@ -221,6 +265,7 @@ static void *eeh_report_reset(void *data, void *userdata)
221 if (*res == PCI_ERS_RESULT_DISCONNECT && 265 if (*res == PCI_ERS_RESULT_DISCONNECT &&
222 rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; 266 rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
223 267
268 eeh_pcid_put(dev);
224 return NULL; 269 return NULL;
225} 270}
226 271
@@ -240,20 +285,22 @@ static void *eeh_report_resume(void *data, void *userdata)
240 struct pci_driver *driver; 285 struct pci_driver *driver;
241 286
242 if (!dev) return NULL; 287 if (!dev) return NULL;
243
244 dev->error_state = pci_channel_io_normal; 288 dev->error_state = pci_channel_io_normal;
245 289
246 if (!(driver = dev->driver)) 290 driver = eeh_pcid_get(dev);
247 return NULL; 291 if (!driver) return NULL;
248 292
249 eeh_enable_irq(dev); 293 eeh_enable_irq(dev);
250 294
251 if (!driver->err_handler || 295 if (!driver->err_handler ||
252 !driver->err_handler->resume) 296 !driver->err_handler->resume) {
297 eeh_pcid_put(dev);
253 return NULL; 298 return NULL;
299 }
254 300
255 driver->err_handler->resume(dev); 301 driver->err_handler->resume(dev);
256 302
303 eeh_pcid_put(dev);
257 return NULL; 304 return NULL;
258} 305}
259 306
@@ -272,20 +319,22 @@ static void *eeh_report_failure(void *data, void *userdata)
272 struct pci_driver *driver; 319 struct pci_driver *driver;
273 320
274 if (!dev) return NULL; 321 if (!dev) return NULL;
275
276 dev->error_state = pci_channel_io_perm_failure; 322 dev->error_state = pci_channel_io_perm_failure;
277 323
278 if (!(driver = dev->driver)) 324 driver = eeh_pcid_get(dev);
279 return NULL; 325 if (!driver) return NULL;
280 326
281 eeh_disable_irq(dev); 327 eeh_disable_irq(dev);
282 328
283 if (!driver->err_handler || 329 if (!driver->err_handler ||
284 !driver->err_handler->error_detected) 330 !driver->err_handler->error_detected) {
331 eeh_pcid_put(dev);
285 return NULL; 332 return NULL;
333 }
286 334
287 driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); 335 driver->err_handler->error_detected(dev, pci_channel_io_perm_failure);
288 336
337 eeh_pcid_put(dev);
289 return NULL; 338 return NULL;
290} 339}
291 340