aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Keith Busch <keith.busch@intel.com> 2018-09-20 12:27:13 -0400
committer: Bjorn Helgaas <bhelgaas@google.com> 2018-09-26 15:23:15 -0400
commit: bfcb79fca19d267712e425af1dd48812c40dec0c (patch)
tree: f34cdd194d762c97f3e2b7424d3d2b59890bb31e
parent: bdb5ac85777de67c909c9ad4327f03f7648b543f (diff)
PCI/ERR: Run error recovery callbacks for all affected devices
If an Endpoint reported an error with ERR_FATAL, we previously ran driver error recovery callbacks only for the Endpoint's driver. But if we reset a Link to recover from the error, all downstream components are affected, including the Endpoint, any multi-function peers, and children of those peers.

Initiate the Link reset from the deepest Downstream Port that is reliable, and call the error recovery callbacks for all its children.

If a Downstream Port (including a Root Port) reports an error, we assume the Port itself is reliable and we need to reset its downstream Link. In all other cases (Switch Upstream Ports, Endpoints, Bridges, etc.), we assume the Link leading to the component needs to be reset, so we initiate the reset at the parent Downstream Port.

This allows two other clean-ups. First, we currently only use a Link reset, which can only be initiated using a Downstream Port, so we can remove checks for Endpoints. Second, the Downstream Port where we initiate the Link reset is reliable (unlike components downstream from it), so the special cases for error detect and resume are no longer necessary.

Signed-off-by: Keith Busch <keith.busch@intel.com>
[bhelgaas: changelog]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Sinan Kaya <okaya@kernel.org>
-rw-r--r-- drivers/pci/pcie/err.c 85
1 files changed, 21 insertions, 64 deletions
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 644f3f725ef0..0fa5e1417a4a 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -63,30 +63,12 @@ static int report_error_detected(struct pci_dev *dev, void *data)
63 if (!dev->driver || 63 if (!dev->driver ||
64 !dev->driver->err_handler || 64 !dev->driver->err_handler ||
65 !dev->driver->err_handler->error_detected) { 65 !dev->driver->err_handler->error_detected) {
66 if (result_data->state == pci_channel_io_frozen &&
67 dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
68 /*
69 * In case of fatal recovery, if one of down-
70 * stream device has no driver. We might be
71 * unable to recover because a later insmod
72 * of a driver for this device is unaware of
73 * its hw state.
74 */
75 pci_printk(KERN_DEBUG, dev, "device has %s\n",
76 dev->driver ?
77 "no AER-aware driver" : "no driver");
78 }
79
80 /* 66 /*
81 * If there's any device in the subtree that does not 67 * If any device in the subtree does not have an error_detected
82 * have an error_detected callback, returning 68 * callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent
83 * PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of 69 * error callbacks of "any" device in the subtree, and will
84 * the subsequent mmio_enabled/slot_reset/resume 70 * exit in the disconnected error state.
85 * callbacks of "any" device in the subtree. All the
86 * devices in the subtree are left in the error state
87 * without recovery.
88 */ 71 */
89
90 if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) 72 if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
91 vote = PCI_ERS_RESULT_NO_AER_DRIVER; 73 vote = PCI_ERS_RESULT_NO_AER_DRIVER;
92 else 74 else
@@ -184,34 +166,23 @@ static pci_ers_result_t default_reset_link(struct pci_dev *dev)
184 166
185static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service) 167static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
186{ 168{
187 struct pci_dev *udev;
188 pci_ers_result_t status; 169 pci_ers_result_t status;
189 struct pcie_port_service_driver *driver = NULL; 170 struct pcie_port_service_driver *driver = NULL;
190 171
191 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { 172 driver = pcie_port_find_service(dev, service);
192 /* Reset this port for all subordinates */
193 udev = dev;
194 } else {
195 /* Reset the upstream component (likely downstream port) */
196 udev = dev->bus->self;
197 }
198
199 /* Use the aer driver of the component firstly */
200 driver = pcie_port_find_service(udev, service);
201
202 if (driver && driver->reset_link) { 173 if (driver && driver->reset_link) {
203 status = driver->reset_link(udev); 174 status = driver->reset_link(dev);
204 } else if (udev->has_secondary_link) { 175 } else if (dev->has_secondary_link) {
205 status = default_reset_link(udev); 176 status = default_reset_link(dev);
206 } else { 177 } else {
207 pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n", 178 pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
208 pci_name(udev)); 179 pci_name(dev));
209 return PCI_ERS_RESULT_DISCONNECT; 180 return PCI_ERS_RESULT_DISCONNECT;
210 } 181 }
211 182
212 if (status != PCI_ERS_RESULT_RECOVERED) { 183 if (status != PCI_ERS_RESULT_RECOVERED) {
213 pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n", 184 pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n",
214 pci_name(udev)); 185 pci_name(dev));
215 return PCI_ERS_RESULT_DISCONNECT; 186 return PCI_ERS_RESULT_DISCONNECT;
216 } 187 }
217 188
@@ -243,31 +214,7 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
243 else 214 else
244 result_data.result = PCI_ERS_RESULT_RECOVERED; 215 result_data.result = PCI_ERS_RESULT_RECOVERED;
245 216
246 if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { 217 pci_walk_bus(dev->subordinate, cb, &result_data);
247 /*
248 * If the error is reported by a bridge, we think this error
249 * is related to the downstream link of the bridge, so we
250 * do error recovery on all subordinates of the bridge instead
251 * of the bridge and clear the error status of the bridge.
252 */
253 if (cb == report_error_detected)
254 dev->error_state = state;
255 pci_walk_bus(dev->subordinate, cb, &result_data);
256 if (cb == report_resume) {
257 pci_aer_clear_device_status(dev);
258 pci_cleanup_aer_uncorrect_error_status(dev);
259 dev->error_state = pci_channel_io_normal;
260 }
261 } else {
262 /*
263 * If the error is reported by an end point, we think this
264 * error is related to the upstream link of the end point.
265 * The error is non fatal so the bus is ok; just invoke
266 * the callback for the function that logged the error.
267 */
268 cb(dev, &result_data);
269 }
270
271 return result_data.result; 218 return result_data.result;
272} 219}
273 220
@@ -276,6 +223,14 @@ void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
276{ 223{
277 pci_ers_result_t status; 224 pci_ers_result_t status;
278 225
226 /*
227 * Error recovery runs on all subordinates of the first downstream port.
228 * If the downstream port detected the error, it is cleared at the end.
229 */
230 if (!(pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
231 pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM))
232 dev = dev->bus->self;
233
279 status = broadcast_error_message(dev, 234 status = broadcast_error_message(dev,
280 state, 235 state,
281 "error_detected", 236 "error_detected",
@@ -311,6 +266,8 @@ void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
311 "resume", 266 "resume",
312 report_resume); 267 report_resume);
313 268
269 pci_aer_clear_device_status(dev);
270 pci_cleanup_aer_uncorrect_error_status(dev);
314 pci_info(dev, "AER: Device recovery successful\n"); 271 pci_info(dev, "AER: Device recovery successful\n");
315 return; 272 return;
316 273