aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorWei Yang <weiyang@linux.vnet.ibm.com>2016-03-03 18:53:11 -0500
committerMichael Ellerman <mpe@ellerman.id.au>2016-03-08 17:58:23 -0500
commit67086e32b56481531ab1292b284e074b1a8d764c (patch)
tree67012baf5d2b62c7cc1d0f05600bee2c30f29c9b /arch
parent0dc2830e0a48d520c7db7cc55dbbd0edefac02f5 (diff)
powerpc/eeh: Support error recovery for VF PE
PFs are enumerated on the PCI bus, while VFs are created by the PF's driver. EEH recovery has two cases: 1. The device and driver are EEH aware, so the error handlers are called. 2. The device and driver are not EEH aware, so the device is un-plugged and then plugged again by enumerating it. The special handling is needed in the second case. For a PF, we can use the original PCI core to enumerate the bus, while for VFs we need to record the VFs which are un-plugged and then plug them again. Also, the patch caches the VF index in pci_dn, which can be used to calculate the VF's bus, device and function number. That information helps to locate the VF's PCI device instance when doing hotplug during EEH recovery if necessary. Signed-off-by: Wei Yang <weiyang@linux.vnet.ibm.com> Acked-by: Gavin Shan <gwshan@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Diffstat (limited to 'arch')
-rw-r--r--arch/powerpc/include/asm/eeh.h2
-rw-r--r--arch/powerpc/include/asm/pci-bridge.h1
-rw-r--r--arch/powerpc/kernel/eeh.c8
-rw-r--r--arch/powerpc/kernel/eeh_dev.c1
-rw-r--r--arch/powerpc/kernel/eeh_driver.c137
-rw-r--r--arch/powerpc/kernel/pci_dn.c4
6 files changed, 127 insertions, 26 deletions
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index b5b5f45dfb0a..fb9f376ae27b 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -140,9 +140,11 @@ struct eeh_dev {
140 int af_cap; /* Saved AF capability */ 140 int af_cap; /* Saved AF capability */
141 struct eeh_pe *pe; /* Associated PE */ 141 struct eeh_pe *pe; /* Associated PE */
142 struct list_head list; /* Form link list in the PE */ 142 struct list_head list; /* Form link list in the PE */
143 struct list_head rmv_list; /* Record the removed edevs */
143 struct pci_controller *phb; /* Associated PHB */ 144 struct pci_controller *phb; /* Associated PHB */
144 struct pci_dn *pdn; /* Associated PCI device node */ 145 struct pci_dn *pdn; /* Associated PCI device node */
145 struct pci_dev *pdev; /* Associated PCI device */ 146 struct pci_dev *pdev; /* Associated PCI device */
147 bool in_error; /* Error flag for edev */
146 struct pci_dev *physfn; /* Associated SRIOV PF */ 148 struct pci_dev *physfn; /* Associated SRIOV PF */
147 struct pci_bus *bus; /* PCI bus for partial hotplug */ 149 struct pci_bus *bus; /* PCI bus for partial hotplug */
148}; 150};
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index f4d17587fa31..9f165e8a77bf 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -212,6 +212,7 @@ struct pci_dn {
212#define IODA_INVALID_PE (-1) 212#define IODA_INVALID_PE (-1)
213#ifdef CONFIG_PPC_POWERNV 213#ifdef CONFIG_PPC_POWERNV
214 int pe_number; 214 int pe_number;
215 int vf_index; /* VF index in the PF */
215#ifdef CONFIG_PCI_IOV 216#ifdef CONFIG_PCI_IOV
216 u16 vfs_expanded; /* number of VFs IOV BAR expanded */ 217 u16 vfs_expanded; /* number of VFs IOV BAR expanded */
217 u16 num_vfs; /* number of VFs enabled*/ 218 u16 num_vfs; /* number of VFs enabled*/
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 0d724625662f..b7338a9426df 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1246,6 +1246,14 @@ void eeh_remove_device(struct pci_dev *dev)
1246 * from the parent PE during the BAR resotre. 1246 * from the parent PE during the BAR resotre.
1247 */ 1247 */
1248 edev->pdev = NULL; 1248 edev->pdev = NULL;
1249
1250 /*
1251 * The flag "in_error" is used to trace EEH devices for VFs
1252 * in error state or not. It's set in eeh_report_error(). If
1253 * it's not set, eeh_report_{reset,resume}() won't be called
1254 * for the VF EEH device.
1255 */
1256 edev->in_error = false;
1249 dev->dev.archdata.edev = NULL; 1257 dev->dev.archdata.edev = NULL;
1250 if (!(edev->pe->state & EEH_PE_KEEP)) 1258 if (!(edev->pe->state & EEH_PE_KEEP))
1251 eeh_rmv_from_parent_pe(edev); 1259 eeh_rmv_from_parent_pe(edev);
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
index aabba94ff9cb..7815095fe3d8 100644
--- a/arch/powerpc/kernel/eeh_dev.c
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -67,6 +67,7 @@ void *eeh_dev_init(struct pci_dn *pdn, void *data)
67 edev->pdn = pdn; 67 edev->pdn = pdn;
68 edev->phb = phb; 68 edev->phb = phb;
69 INIT_LIST_HEAD(&edev->list); 69 INIT_LIST_HEAD(&edev->list);
70 INIT_LIST_HEAD(&edev->rmv_list);
70 71
71 return NULL; 72 return NULL;
72} 73}
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 650cfb31ea3d..c0fe7a6be2c9 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -34,6 +34,11 @@
34#include <asm/prom.h> 34#include <asm/prom.h>
35#include <asm/rtas.h> 35#include <asm/rtas.h>
36 36
37struct eeh_rmv_data {
38 struct list_head edev_list;
39 int removed;
40};
41
37/** 42/**
38 * eeh_pcid_name - Retrieve name of PCI device driver 43 * eeh_pcid_name - Retrieve name of PCI device driver
39 * @pdev: PCI device 44 * @pdev: PCI device
@@ -211,6 +216,7 @@ static void *eeh_report_error(void *data, void *userdata)
211 if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; 216 if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
212 if (*res == PCI_ERS_RESULT_NONE) *res = rc; 217 if (*res == PCI_ERS_RESULT_NONE) *res = rc;
213 218
219 edev->in_error = true;
214 eeh_pcid_put(dev); 220 eeh_pcid_put(dev);
215 return NULL; 221 return NULL;
216} 222}
@@ -282,7 +288,8 @@ static void *eeh_report_reset(void *data, void *userdata)
282 288
283 if (!driver->err_handler || 289 if (!driver->err_handler ||
284 !driver->err_handler->slot_reset || 290 !driver->err_handler->slot_reset ||
285 (edev->mode & EEH_DEV_NO_HANDLER)) { 291 (edev->mode & EEH_DEV_NO_HANDLER) ||
292 (!edev->in_error)) {
286 eeh_pcid_put(dev); 293 eeh_pcid_put(dev);
287 return NULL; 294 return NULL;
288 } 295 }
@@ -326,6 +333,7 @@ static void *eeh_report_resume(void *data, void *userdata)
326{ 333{
327 struct eeh_dev *edev = (struct eeh_dev *)data; 334 struct eeh_dev *edev = (struct eeh_dev *)data;
328 struct pci_dev *dev = eeh_dev_to_pci_dev(edev); 335 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
336 bool was_in_error;
329 struct pci_driver *driver; 337 struct pci_driver *driver;
330 338
331 if (!dev || eeh_dev_removed(edev)) 339 if (!dev || eeh_dev_removed(edev))
@@ -335,11 +343,13 @@ static void *eeh_report_resume(void *data, void *userdata)
335 driver = eeh_pcid_get(dev); 343 driver = eeh_pcid_get(dev);
336 if (!driver) return NULL; 344 if (!driver) return NULL;
337 345
346 was_in_error = edev->in_error;
347 edev->in_error = false;
338 eeh_enable_irq(dev); 348 eeh_enable_irq(dev);
339 349
340 if (!driver->err_handler || 350 if (!driver->err_handler ||
341 !driver->err_handler->resume || 351 !driver->err_handler->resume ||
342 (edev->mode & EEH_DEV_NO_HANDLER)) { 352 (edev->mode & EEH_DEV_NO_HANDLER) || !was_in_error) {
343 edev->mode &= ~EEH_DEV_NO_HANDLER; 353 edev->mode &= ~EEH_DEV_NO_HANDLER;
344 eeh_pcid_put(dev); 354 eeh_pcid_put(dev);
345 return NULL; 355 return NULL;
@@ -386,12 +396,40 @@ static void *eeh_report_failure(void *data, void *userdata)
386 return NULL; 396 return NULL;
387} 397}
388 398
399static void *eeh_add_virt_device(void *data, void *userdata)
400{
401 struct pci_driver *driver;
402 struct eeh_dev *edev = (struct eeh_dev *)data;
403 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
404 struct pci_dn *pdn = eeh_dev_to_pdn(edev);
405
406 if (!(edev->physfn)) {
407 pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n",
408 __func__, edev->phb->global_number, pdn->busno,
409 PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
410 return NULL;
411 }
412
413 driver = eeh_pcid_get(dev);
414 if (driver) {
415 eeh_pcid_put(dev);
416 if (driver->err_handler)
417 return NULL;
418 }
419
420#ifdef CONFIG_PPC_POWERNV
421 pci_iov_add_virtfn(edev->physfn, pdn->vf_index, 0);
422#endif
423 return NULL;
424}
425
389static void *eeh_rmv_device(void *data, void *userdata) 426static void *eeh_rmv_device(void *data, void *userdata)
390{ 427{
391 struct pci_driver *driver; 428 struct pci_driver *driver;
392 struct eeh_dev *edev = (struct eeh_dev *)data; 429 struct eeh_dev *edev = (struct eeh_dev *)data;
393 struct pci_dev *dev = eeh_dev_to_pci_dev(edev); 430 struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
394 int *removed = (int *)userdata; 431 struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata;
432 int *removed = rmv_data ? &rmv_data->removed : NULL;
395 433
396 /* 434 /*
397 * Actually, we should remove the PCI bridges as well. 435 * Actually, we should remove the PCI bridges as well.
@@ -416,7 +454,8 @@ static void *eeh_rmv_device(void *data, void *userdata)
416 driver = eeh_pcid_get(dev); 454 driver = eeh_pcid_get(dev);
417 if (driver) { 455 if (driver) {
418 eeh_pcid_put(dev); 456 eeh_pcid_put(dev);
419 if (driver->err_handler && 457 if (removed &&
458 driver->err_handler &&
420 driver->err_handler->error_detected && 459 driver->err_handler->error_detected &&
421 driver->err_handler->slot_reset) 460 driver->err_handler->slot_reset)
422 return NULL; 461 return NULL;
@@ -427,11 +466,29 @@ static void *eeh_rmv_device(void *data, void *userdata)
427 pci_name(dev)); 466 pci_name(dev));
428 edev->bus = dev->bus; 467 edev->bus = dev->bus;
429 edev->mode |= EEH_DEV_DISCONNECTED; 468 edev->mode |= EEH_DEV_DISCONNECTED;
430 (*removed)++; 469 if (removed)
470 (*removed)++;
431 471
432 pci_lock_rescan_remove(); 472 if (edev->physfn) {
433 pci_stop_and_remove_bus_device(dev); 473#ifdef CONFIG_PPC_POWERNV
434 pci_unlock_rescan_remove(); 474 struct pci_dn *pdn = eeh_dev_to_pdn(edev);
475
476 pci_iov_remove_virtfn(edev->physfn, pdn->vf_index, 0);
477 edev->pdev = NULL;
478
479 /*
480 * We have to set the VF PE number to invalid one, which is
481 * required to plug the VF successfully.
482 */
483 pdn->pe_number = IODA_INVALID_PE;
484#endif
485 if (rmv_data)
486 list_add(&edev->rmv_list, &rmv_data->edev_list);
487 } else {
488 pci_lock_rescan_remove();
489 pci_stop_and_remove_bus_device(dev);
490 pci_unlock_rescan_remove();
491 }
435 492
436 return NULL; 493 return NULL;
437} 494}
@@ -545,11 +602,13 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
545 * During the reset, udev might be invoked because those affected 602 * During the reset, udev might be invoked because those affected
546 * PCI devices will be removed and then added. 603 * PCI devices will be removed and then added.
547 */ 604 */
548static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) 605static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
606 struct eeh_rmv_data *rmv_data)
549{ 607{
550 struct pci_bus *frozen_bus = eeh_pe_bus_get(pe); 608 struct pci_bus *frozen_bus = eeh_pe_bus_get(pe);
551 struct timeval tstamp; 609 struct timeval tstamp;
552 int cnt, rc, removed = 0; 610 int cnt, rc;
611 struct eeh_dev *edev;
553 612
554 /* pcibios will clear the counter; save the value */ 613 /* pcibios will clear the counter; save the value */
555 cnt = pe->freeze_count; 614 cnt = pe->freeze_count;
@@ -563,12 +622,16 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
563 */ 622 */
564 eeh_pe_state_mark(pe, EEH_PE_KEEP); 623 eeh_pe_state_mark(pe, EEH_PE_KEEP);
565 if (bus) { 624 if (bus) {
566 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); 625 if (pe->type & EEH_PE_VF) {
567 pci_lock_rescan_remove(); 626 eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
568 pcibios_remove_pci_devices(bus); 627 } else {
569 pci_unlock_rescan_remove(); 628 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
629 pci_lock_rescan_remove();
630 pcibios_remove_pci_devices(bus);
631 pci_unlock_rescan_remove();
632 }
570 } else if (frozen_bus) { 633 } else if (frozen_bus) {
571 eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed); 634 eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
572 } 635 }
573 636
574 /* 637 /*
@@ -610,14 +673,22 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
610 * PE. We should disconnect it so the binding can be 673 * PE. We should disconnect it so the binding can be
611 * rebuilt when adding PCI devices. 674 * rebuilt when adding PCI devices.
612 */ 675 */
676 edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
613 eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); 677 eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
614 pcibios_add_pci_devices(bus); 678 if (pe->type & EEH_PE_VF)
615 } else if (frozen_bus && removed) { 679 eeh_add_virt_device(edev, NULL);
680 else
681 pcibios_add_pci_devices(bus);
682 } else if (frozen_bus && rmv_data->removed) {
616 pr_info("EEH: Sleep 5s ahead of partial hotplug\n"); 683 pr_info("EEH: Sleep 5s ahead of partial hotplug\n");
617 ssleep(5); 684 ssleep(5);
618 685
686 edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
619 eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); 687 eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
620 pcibios_add_pci_devices(frozen_bus); 688 if (pe->type & EEH_PE_VF)
689 eeh_add_virt_device(edev, NULL);
690 else
691 pcibios_add_pci_devices(frozen_bus);
621 } 692 }
622 eeh_pe_state_clear(pe, EEH_PE_KEEP); 693 eeh_pe_state_clear(pe, EEH_PE_KEEP);
623 694
@@ -636,8 +707,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
636static void eeh_handle_normal_event(struct eeh_pe *pe) 707static void eeh_handle_normal_event(struct eeh_pe *pe)
637{ 708{
638 struct pci_bus *frozen_bus; 709 struct pci_bus *frozen_bus;
710 struct eeh_dev *edev, *tmp;
639 int rc = 0; 711 int rc = 0;
640 enum pci_ers_result result = PCI_ERS_RESULT_NONE; 712 enum pci_ers_result result = PCI_ERS_RESULT_NONE;
713 struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0};
641 714
642 frozen_bus = eeh_pe_bus_get(pe); 715 frozen_bus = eeh_pe_bus_get(pe);
643 if (!frozen_bus) { 716 if (!frozen_bus) {
@@ -692,7 +765,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
692 */ 765 */
693 if (result == PCI_ERS_RESULT_NONE) { 766 if (result == PCI_ERS_RESULT_NONE) {
694 pr_info("EEH: Reset with hotplug activity\n"); 767 pr_info("EEH: Reset with hotplug activity\n");
695 rc = eeh_reset_device(pe, frozen_bus); 768 rc = eeh_reset_device(pe, frozen_bus, NULL);
696 if (rc) { 769 if (rc) {
697 pr_warn("%s: Unable to reset, err=%d\n", 770 pr_warn("%s: Unable to reset, err=%d\n",
698 __func__, rc); 771 __func__, rc);
@@ -744,7 +817,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
744 /* If any device called out for a reset, then reset the slot */ 817 /* If any device called out for a reset, then reset the slot */
745 if (result == PCI_ERS_RESULT_NEED_RESET) { 818 if (result == PCI_ERS_RESULT_NEED_RESET) {
746 pr_info("EEH: Reset without hotplug activity\n"); 819 pr_info("EEH: Reset without hotplug activity\n");
747 rc = eeh_reset_device(pe, NULL); 820 rc = eeh_reset_device(pe, NULL, &rmv_data);
748 if (rc) { 821 if (rc) {
749 pr_warn("%s: Cannot reset, err=%d\n", 822 pr_warn("%s: Cannot reset, err=%d\n",
750 __func__, rc); 823 __func__, rc);
@@ -764,6 +837,15 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
764 goto hard_fail; 837 goto hard_fail;
765 } 838 }
766 839
840 /*
841 * For those hot removed VFs, we should add back them after PF get
842 * recovered properly.
843 */
844 list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) {
845 eeh_add_virt_device(edev, NULL);
846 list_del(&edev->rmv_list);
847 }
848
767 /* Tell all device drivers that they can resume operations */ 849 /* Tell all device drivers that they can resume operations */
768 pr_info("EEH: Notify device driver to resume\n"); 850 pr_info("EEH: Notify device driver to resume\n");
769 eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); 851 eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
@@ -803,12 +885,17 @@ perm_error:
803 * the their PCI config any more. 885 * the their PCI config any more.
804 */ 886 */
805 if (frozen_bus) { 887 if (frozen_bus) {
806 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); 888 if (pe->type & EEH_PE_VF) {
807 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); 889 eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
890 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
891 } else {
892 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
893 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
808 894
809 pci_lock_rescan_remove(); 895 pci_lock_rescan_remove();
810 pcibios_remove_pci_devices(frozen_bus); 896 pcibios_remove_pci_devices(frozen_bus);
811 pci_unlock_rescan_remove(); 897 pci_unlock_rescan_remove();
898 }
812 } 899 }
813} 900}
814 901
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index e23bdf786dab..38102cb9baa9 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -139,6 +139,7 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
139#ifdef CONFIG_PCI_IOV 139#ifdef CONFIG_PCI_IOV
140static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, 140static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
141 struct pci_dev *pdev, 141 struct pci_dev *pdev,
142 int vf_index,
142 int busno, int devfn) 143 int busno, int devfn)
143{ 144{
144 struct pci_dn *pdn; 145 struct pci_dn *pdn;
@@ -158,6 +159,7 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
158 pdn->busno = busno; 159 pdn->busno = busno;
159 pdn->devfn = devfn; 160 pdn->devfn = devfn;
160#ifdef CONFIG_PPC_POWERNV 161#ifdef CONFIG_PPC_POWERNV
162 pdn->vf_index = vf_index;
161 pdn->pe_number = IODA_INVALID_PE; 163 pdn->pe_number = IODA_INVALID_PE;
162#endif 164#endif
163 INIT_LIST_HEAD(&pdn->child_list); 165 INIT_LIST_HEAD(&pdn->child_list);
@@ -197,7 +199,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
197 return NULL; 199 return NULL;
198 200
199 for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { 201 for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
200 pdn = add_one_dev_pci_data(parent, NULL, 202 pdn = add_one_dev_pci_data(parent, NULL, i,
201 pci_iov_virtfn_bus(pdev, i), 203 pci_iov_virtfn_bus(pdev, i),
202 pci_iov_virtfn_devfn(pdev, i)); 204 pci_iov_virtfn_devfn(pdev, i));
203 if (!pdn) { 205 if (!pdn) {