about | summary | refs | log | tree | commit | diff | stats
path: root/drivers/misc
diff options
context:
space:
mode:
author: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com> 2014-06-04 09:57:51 -0400
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2014-07-09 17:14:27 -0400
commit: fb145456fa4f4311f90703aeee058bab3b274bf8 (patch)
tree: 980ae85a3c35b8b574dddeaf96bf84d41dbb83d3 /drivers/misc
parent: c1f732ad767e37bd1d41043cbdefc0874b4d05e5 (diff)
GenWQE: Add support for EEH error recovery
This patch implements the callbacks and functions necessary to have EEH recovery support. It adds a config option to enable or disable explicit calls to trigger platform-specific mechanisms on error recovery paths. This option is enabled by default only on PPC64 systems and can be overwritten via debugfs. If this option is enabled, on the error recovery path the driver will call pci_channel_offline() to check for an error condition and issue non-raw MMIO reads to trigger early EEH detection in case of hardware failures. This is necessary since the driver MMIO helper functions use raw accessors.
Signed-off-by: Kleber Sacilotto de Souza <klebers@linux.vnet.ibm.com>
Acked-by: Frank Haverkamp <haver@linux.vnet.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'drivers/misc')
-rw-r--r-- drivers/misc/genwqe/Kconfig | 6
-rw-r--r-- drivers/misc/genwqe/card_base.c | 79
-rw-r--r-- drivers/misc/genwqe/card_base.h | 2
-rw-r--r-- drivers/misc/genwqe/card_ddcb.c | 24
-rw-r--r-- drivers/misc/genwqe/card_debugfs.c | 7
-rw-r--r-- drivers/misc/genwqe/card_dev.c | 5
-rw-r--r-- drivers/misc/genwqe/card_utils.c | 10
7 files changed, 115 insertions, 18 deletions
diff --git a/drivers/misc/genwqe/Kconfig b/drivers/misc/genwqe/Kconfig
index 6069d8cd79d7..4c0a033cbfdb 100644
--- a/drivers/misc/genwqe/Kconfig
+++ b/drivers/misc/genwqe/Kconfig
@@ -11,3 +11,9 @@ menuconfig GENWQE
11 Enables PCIe card driver for IBM GenWQE accelerators. 11 Enables PCIe card driver for IBM GenWQE accelerators.
12 The user-space interface is described in 12 The user-space interface is described in
13 include/linux/genwqe/genwqe_card.h. 13 include/linux/genwqe/genwqe_card.h.
14
15config GENWQE_PLATFORM_ERROR_RECOVERY
16 int "Use platform recovery procedures (0=off, 1=on)"
17 depends on GENWQE
18 default 1 if PPC64
19 default 0
diff --git a/drivers/misc/genwqe/card_base.c b/drivers/misc/genwqe/card_base.c
index e6cc3e1e7326..87ebaba9b133 100644
--- a/drivers/misc/genwqe/card_base.c
+++ b/drivers/misc/genwqe/card_base.c
@@ -140,6 +140,12 @@ static struct genwqe_dev *genwqe_dev_alloc(void)
140 cd->class_genwqe = class_genwqe; 140 cd->class_genwqe = class_genwqe;
141 cd->debugfs_genwqe = debugfs_genwqe; 141 cd->debugfs_genwqe = debugfs_genwqe;
142 142
143 /*
144 * This comes from kernel config option and can be overwritten via
145 * debugfs.
146 */
147 cd->use_platform_recovery = CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY;
148
143 init_waitqueue_head(&cd->queue_waitq); 149 init_waitqueue_head(&cd->queue_waitq);
144 150
145 spin_lock_init(&cd->file_lock); 151 spin_lock_init(&cd->file_lock);
@@ -943,6 +949,19 @@ static int genwqe_health_thread(void *data)
943 return 0; 949 return 0;
944 950
945 fatal_error: 951 fatal_error:
952 if (cd->use_platform_recovery) {
953 /*
954 * Since we use raw accessors, EEH errors won't be detected
955 * by the platform until we do a non-raw MMIO or config space
956 * read
957 */
958 readq(cd->mmio + IO_SLC_CFGREG_GFIR);
959
960 /* We do nothing if the card is going over PCI recovery */
961 if (pci_channel_offline(pci_dev))
962 return -EIO;
963 }
964
946 dev_err(&pci_dev->dev, 965 dev_err(&pci_dev->dev,
947 "[%s] card unusable. Please trigger unbind!\n", __func__); 966 "[%s] card unusable. Please trigger unbind!\n", __func__);
948 967
@@ -1048,6 +1067,9 @@ static int genwqe_pci_setup(struct genwqe_dev *cd)
1048 pci_set_master(pci_dev); 1067 pci_set_master(pci_dev);
1049 pci_enable_pcie_error_reporting(pci_dev); 1068 pci_enable_pcie_error_reporting(pci_dev);
1050 1069
1070 /* EEH recovery requires PCIe fundamental reset */
1071 pci_dev->needs_freset = 1;
1072
1051 /* request complete BAR-0 space (length = 0) */ 1073 /* request complete BAR-0 space (length = 0) */
1052 cd->mmio_len = pci_resource_len(pci_dev, 0); 1074 cd->mmio_len = pci_resource_len(pci_dev, 0);
1053 cd->mmio = pci_iomap(pci_dev, 0, 0); 1075 cd->mmio = pci_iomap(pci_dev, 0, 0);
@@ -1186,23 +1208,40 @@ static pci_ers_result_t genwqe_err_error_detected(struct pci_dev *pci_dev,
1186 1208
1187 dev_err(&pci_dev->dev, "[%s] state=%d\n", __func__, state); 1209 dev_err(&pci_dev->dev, "[%s] state=%d\n", __func__, state);
1188 1210
1189 if (pci_dev == NULL)
1190 return PCI_ERS_RESULT_NEED_RESET;
1191
1192 cd = dev_get_drvdata(&pci_dev->dev); 1211 cd = dev_get_drvdata(&pci_dev->dev);
1193 if (cd == NULL) 1212 if (cd == NULL)
1194 return PCI_ERS_RESULT_NEED_RESET; 1213 return PCI_ERS_RESULT_DISCONNECT;
1195 1214
1196 switch (state) { 1215 /* Stop the card */
1197 case pci_channel_io_normal: 1216 genwqe_health_check_stop(cd);
1198 return PCI_ERS_RESULT_CAN_RECOVER; 1217 genwqe_stop(cd);
1199 case pci_channel_io_frozen: 1218
1200 return PCI_ERS_RESULT_NEED_RESET; 1219 /*
1201 case pci_channel_io_perm_failure: 1220 * On permanent failure, the PCI code will call device remove
1221 * after the return of this function.
1222 * genwqe_stop() can be called twice.
1223 */
1224 if (state == pci_channel_io_perm_failure) {
1202 return PCI_ERS_RESULT_DISCONNECT; 1225 return PCI_ERS_RESULT_DISCONNECT;
1226 } else {
1227 genwqe_pci_remove(cd);
1228 return PCI_ERS_RESULT_NEED_RESET;
1203 } 1229 }
1230}
1231
1232static pci_ers_result_t genwqe_err_slot_reset(struct pci_dev *pci_dev)
1233{
1234 int rc;
1235 struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
1204 1236
1205 return PCI_ERS_RESULT_NEED_RESET; 1237 rc = genwqe_pci_setup(cd);
1238 if (!rc) {
1239 return PCI_ERS_RESULT_RECOVERED;
1240 } else {
1241 dev_err(&pci_dev->dev,
1242 "err: problems with PCI setup (err=%d)\n", rc);
1243 return PCI_ERS_RESULT_DISCONNECT;
1244 }
1206} 1245}
1207 1246
1208static pci_ers_result_t genwqe_err_result_none(struct pci_dev *dev) 1247static pci_ers_result_t genwqe_err_result_none(struct pci_dev *dev)
@@ -1210,8 +1249,22 @@ static pci_ers_result_t genwqe_err_result_none(struct pci_dev *dev)
1210 return PCI_ERS_RESULT_NONE; 1249 return PCI_ERS_RESULT_NONE;
1211} 1250}
1212 1251
1213static void genwqe_err_resume(struct pci_dev *dev) 1252static void genwqe_err_resume(struct pci_dev *pci_dev)
1214{ 1253{
1254 int rc;
1255 struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
1256
1257 rc = genwqe_start(cd);
1258 if (!rc) {
1259 rc = genwqe_health_check_start(cd);
1260 if (rc)
1261 dev_err(&pci_dev->dev,
1262 "err: cannot start health checking! (err=%d)\n",
1263 rc);
1264 } else {
1265 dev_err(&pci_dev->dev,
1266 "err: cannot start card services! (err=%d)\n", rc);
1267 }
1215} 1268}
1216 1269
1217static int genwqe_sriov_configure(struct pci_dev *dev, int numvfs) 1270static int genwqe_sriov_configure(struct pci_dev *dev, int numvfs)
@@ -1234,7 +1287,7 @@ static struct pci_error_handlers genwqe_err_handler = {
1234 .error_detected = genwqe_err_error_detected, 1287 .error_detected = genwqe_err_error_detected,
1235 .mmio_enabled = genwqe_err_result_none, 1288 .mmio_enabled = genwqe_err_result_none,
1236 .link_reset = genwqe_err_result_none, 1289 .link_reset = genwqe_err_result_none,
1237 .slot_reset = genwqe_err_result_none, 1290 .slot_reset = genwqe_err_slot_reset,
1238 .resume = genwqe_err_resume, 1291 .resume = genwqe_err_resume,
1239}; 1292};
1240 1293
diff --git a/drivers/misc/genwqe/card_base.h b/drivers/misc/genwqe/card_base.h
index 0e608a288603..67abd8cb2247 100644
--- a/drivers/misc/genwqe/card_base.h
+++ b/drivers/misc/genwqe/card_base.h
@@ -291,6 +291,8 @@ struct genwqe_dev {
291 struct task_struct *health_thread; 291 struct task_struct *health_thread;
292 wait_queue_head_t health_waitq; 292 wait_queue_head_t health_waitq;
293 293
294 int use_platform_recovery; /* use platform recovery mechanisms */
295
294 /* char device */ 296 /* char device */
295 dev_t devnum_genwqe; /* major/minor num card */ 297 dev_t devnum_genwqe; /* major/minor num card */
296 struct class *class_genwqe; /* reference to class object */ 298 struct class *class_genwqe; /* reference to class object */
diff --git a/drivers/misc/genwqe/card_ddcb.c b/drivers/misc/genwqe/card_ddcb.c
index c8046db2d5a2..f0de6153bea2 100644
--- a/drivers/misc/genwqe/card_ddcb.c
+++ b/drivers/misc/genwqe/card_ddcb.c
@@ -1118,7 +1118,21 @@ static irqreturn_t genwqe_pf_isr(int irq, void *dev_id)
1118 * safer, but slower for the good-case ... See above. 1118 * safer, but slower for the good-case ... See above.
1119 */ 1119 */
1120 gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR); 1120 gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
1121 if ((gfir & GFIR_ERR_TRIGGER) != 0x0) { 1121 if (((gfir & GFIR_ERR_TRIGGER) != 0x0) &&
1122 !pci_channel_offline(pci_dev)) {
1123
1124 if (cd->use_platform_recovery) {
1125 /*
1126 * Since we use raw accessors, EEH errors won't be
1127 * detected by the platform until we do a non-raw
1128 * MMIO or config space read
1129 */
1130 readq(cd->mmio + IO_SLC_CFGREG_GFIR);
1131
1132 /* Don't do anything if the PCI channel is frozen */
1133 if (pci_channel_offline(pci_dev))
1134 goto exit;
1135 }
1122 1136
1123 wake_up_interruptible(&cd->health_waitq); 1137 wake_up_interruptible(&cd->health_waitq);
1124 1138
@@ -1126,12 +1140,12 @@ static irqreturn_t genwqe_pf_isr(int irq, void *dev_id)
1126 * By default GFIRs causes recovery actions. This 1140 * By default GFIRs causes recovery actions. This
1127 * count is just for debug when recovery is masked. 1141 * count is just for debug when recovery is masked.
1128 */ 1142 */
1129 printk_ratelimited(KERN_ERR 1143 dev_err_ratelimited(&pci_dev->dev,
1130 "%s %s: [%s] GFIR=%016llx\n", 1144 "[%s] GFIR=%016llx\n",
1131 GENWQE_DEVNAME, dev_name(&pci_dev->dev), 1145 __func__, gfir);
1132 __func__, gfir);
1133 } 1146 }
1134 1147
1148 exit:
1135 return IRQ_HANDLED; 1149 return IRQ_HANDLED;
1136} 1150}
1137 1151
diff --git a/drivers/misc/genwqe/card_debugfs.c b/drivers/misc/genwqe/card_debugfs.c
index 0a33ade64109..c9b4d6d0eb99 100644
--- a/drivers/misc/genwqe/card_debugfs.c
+++ b/drivers/misc/genwqe/card_debugfs.c
@@ -485,6 +485,13 @@ int genwqe_init_debugfs(struct genwqe_dev *cd)
485 goto err1; 485 goto err1;
486 } 486 }
487 487
488 file = debugfs_create_u32("use_platform_recovery", 0666, root,
489 &cd->use_platform_recovery);
490 if (!file) {
491 ret = -ENOMEM;
492 goto err1;
493 }
494
488 cd->debugfs_root = root; 495 cd->debugfs_root = root;
489 return 0; 496 return 0;
490err1: 497err1:
diff --git a/drivers/misc/genwqe/card_dev.c b/drivers/misc/genwqe/card_dev.c
index 1d2f163a1906..aae42555e2ca 100644
--- a/drivers/misc/genwqe/card_dev.c
+++ b/drivers/misc/genwqe/card_dev.c
@@ -1048,10 +1048,15 @@ static long genwqe_ioctl(struct file *filp, unsigned int cmd,
1048 int rc = 0; 1048 int rc = 0;
1049 struct genwqe_file *cfile = (struct genwqe_file *)filp->private_data; 1049 struct genwqe_file *cfile = (struct genwqe_file *)filp->private_data;
1050 struct genwqe_dev *cd = cfile->cd; 1050 struct genwqe_dev *cd = cfile->cd;
1051 struct pci_dev *pci_dev = cd->pci_dev;
1051 struct genwqe_reg_io __user *io; 1052 struct genwqe_reg_io __user *io;
1052 u64 val; 1053 u64 val;
1053 u32 reg_offs; 1054 u32 reg_offs;
1054 1055
1056 /* Return -EIO if card hit EEH */
1057 if (pci_channel_offline(pci_dev))
1058 return -EIO;
1059
1055 if (_IOC_TYPE(cmd) != GENWQE_IOC_CODE) 1060 if (_IOC_TYPE(cmd) != GENWQE_IOC_CODE)
1056 return -EINVAL; 1061 return -EINVAL;
1057 1062
diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c
index 62cc6bb3f62e..4a500582eef0 100644
--- a/drivers/misc/genwqe/card_utils.c
+++ b/drivers/misc/genwqe/card_utils.c
@@ -53,12 +53,17 @@
53 */ 53 */
54int __genwqe_writeq(struct genwqe_dev *cd, u64 byte_offs, u64 val) 54int __genwqe_writeq(struct genwqe_dev *cd, u64 byte_offs, u64 val)
55{ 55{
56 struct pci_dev *pci_dev = cd->pci_dev;
57
56 if (cd->err_inject & GENWQE_INJECT_HARDWARE_FAILURE) 58 if (cd->err_inject & GENWQE_INJECT_HARDWARE_FAILURE)
57 return -EIO; 59 return -EIO;
58 60
59 if (cd->mmio == NULL) 61 if (cd->mmio == NULL)
60 return -EIO; 62 return -EIO;
61 63
64 if (pci_channel_offline(pci_dev))
65 return -EIO;
66
62 __raw_writeq((__force u64)cpu_to_be64(val), cd->mmio + byte_offs); 67 __raw_writeq((__force u64)cpu_to_be64(val), cd->mmio + byte_offs);
63 return 0; 68 return 0;
64} 69}
@@ -99,12 +104,17 @@ u64 __genwqe_readq(struct genwqe_dev *cd, u64 byte_offs)
99 */ 104 */
100int __genwqe_writel(struct genwqe_dev *cd, u64 byte_offs, u32 val) 105int __genwqe_writel(struct genwqe_dev *cd, u64 byte_offs, u32 val)
101{ 106{
107 struct pci_dev *pci_dev = cd->pci_dev;
108
102 if (cd->err_inject & GENWQE_INJECT_HARDWARE_FAILURE) 109 if (cd->err_inject & GENWQE_INJECT_HARDWARE_FAILURE)
103 return -EIO; 110 return -EIO;
104 111
105 if (cd->mmio == NULL) 112 if (cd->mmio == NULL)
106 return -EIO; 113 return -EIO;
107 114
115 if (pci_channel_offline(pci_dev))
116 return -EIO;
117
108 __raw_writel((__force u32)cpu_to_be32(val), cd->mmio + byte_offs); 118 __raw_writel((__force u32)cpu_to_be32(val), cd->mmio + byte_offs);
109 return 0; 119 return 0;
110} 120}