aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJack Morgenstein <jackm@mellanox.co.il>2006-08-15 14:11:18 -0400
committerRoland Dreier <rolandd@cisco.com>2006-09-22 18:22:54 -0400
commitb3b30f5e8a0c50db3d76b6f7c7cc50245aeb57fd (patch)
tree02e7b931be922a71e82da5c9baf273ac5d2312d9
parent07eeec0627e93a1a753c4df004a97a4d0a7b9ceb (diff)
IB/mthca: Recover from catastrophic errors
Trigger device remove and then add when a catastrophic error is detected in hardware. This, in turn, will cause a device reset, which we hope will recover from the catastrophic condition. Since this might interfere with debugging the root cause, add a module option to suppress this behaviour. Signed-off-by: Jack Morgenstein <jackm@mellanox.co.il> Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il> Signed-off-by: Roland Dreier <rolandd@cisco.com>
-rw-r--r--drivers/infiniband/hw/mthca/mthca_catas.c62
-rw-r--r--drivers/infiniband/hw/mthca/mthca_dev.h7
-rw-r--r--drivers/infiniband/hw/mthca/mthca_main.c88
3 files changed, 136 insertions, 21 deletions
diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c
index c3bec7490f52..cd044ea2dfa4 100644
--- a/drivers/infiniband/hw/mthca/mthca_catas.c
+++ b/drivers/infiniband/hw/mthca/mthca_catas.c
@@ -34,6 +34,7 @@
34 34
35#include <linux/jiffies.h> 35#include <linux/jiffies.h>
36#include <linux/timer.h> 36#include <linux/timer.h>
37#include <linux/workqueue.h>
37 38
38#include "mthca_dev.h" 39#include "mthca_dev.h"
39 40
@@ -48,9 +49,41 @@ enum {
48 49
49static DEFINE_SPINLOCK(catas_lock); 50static DEFINE_SPINLOCK(catas_lock);
50 51
52static LIST_HEAD(catas_list);
53static struct workqueue_struct *catas_wq;
54static struct work_struct catas_work;
55
56static int catas_reset_disable;
57module_param_named(catas_reset_disable, catas_reset_disable, int, 0644);
58MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero");
59
60static void catas_reset(void *work_ptr)
61{
62 struct mthca_dev *dev, *tmpdev;
63 LIST_HEAD(tlist);
64 int ret;
65
66 mutex_lock(&mthca_device_mutex);
67
68 spin_lock_irq(&catas_lock);
69 list_splice_init(&catas_list, &tlist);
70 spin_unlock_irq(&catas_lock);
71
72 list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) {
73 ret = __mthca_restart_one(dev->pdev);
74 if (ret)
75 mthca_err(dev, "Reset failed (%d)\n", ret);
76 else
77 mthca_dbg(dev, "Reset succeeded\n");
78 }
79
80 mutex_unlock(&mthca_device_mutex);
81}
82
51static void handle_catas(struct mthca_dev *dev) 83static void handle_catas(struct mthca_dev *dev)
52{ 84{
53 struct ib_event event; 85 struct ib_event event;
86 unsigned long flags;
54 const char *type; 87 const char *type;
55 int i; 88 int i;
56 89
@@ -82,6 +115,14 @@ static void handle_catas(struct mthca_dev *dev)
82 for (i = 0; i < dev->catas_err.size; ++i) 115 for (i = 0; i < dev->catas_err.size; ++i)
83 mthca_err(dev, " buf[%02x]: %08x\n", 116 mthca_err(dev, " buf[%02x]: %08x\n",
84 i, swab32(readl(dev->catas_err.map + i))); 117 i, swab32(readl(dev->catas_err.map + i)));
118
119 if (catas_reset_disable)
120 return;
121
122 spin_lock_irqsave(&catas_lock, flags);
123 list_add(&dev->catas_err.list, &catas_list);
124 queue_work(catas_wq, &catas_work);
125 spin_unlock_irqrestore(&catas_lock, flags);
85} 126}
86 127
87static void poll_catas(unsigned long dev_ptr) 128static void poll_catas(unsigned long dev_ptr)
@@ -135,6 +176,7 @@ void mthca_start_catas_poll(struct mthca_dev *dev)
135 dev->catas_err.timer.data = (unsigned long) dev; 176 dev->catas_err.timer.data = (unsigned long) dev;
136 dev->catas_err.timer.function = poll_catas; 177 dev->catas_err.timer.function = poll_catas;
137 dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL; 178 dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL;
179 INIT_LIST_HEAD(&dev->catas_err.list);
138 add_timer(&dev->catas_err.timer); 180 add_timer(&dev->catas_err.timer);
139} 181}
140 182
@@ -153,4 +195,24 @@ void mthca_stop_catas_poll(struct mthca_dev *dev)
153 dev->catas_err.addr), 195 dev->catas_err.addr),
154 dev->catas_err.size * 4); 196 dev->catas_err.size * 4);
155 } 197 }
198
199 spin_lock_irq(&catas_lock);
200 list_del(&dev->catas_err.list);
201 spin_unlock_irq(&catas_lock);
202}
203
204int __init mthca_catas_init(void)
205{
206 INIT_WORK(&catas_work, catas_reset, NULL);
207
208 catas_wq = create_singlethread_workqueue("mthca_catas");
209 if (!catas_wq)
210 return -ENOMEM;
211
212 return 0;
213}
214
215void mthca_catas_cleanup(void)
216{
217 destroy_workqueue(catas_wq);
156} 218}
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index 33bd0b8bfd13..fe5cecf70fed 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -45,6 +45,7 @@
45#include <linux/dma-mapping.h> 45#include <linux/dma-mapping.h>
46#include <linux/timer.h> 46#include <linux/timer.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/list.h>
48 49
49#include <asm/semaphore.h> 50#include <asm/semaphore.h>
50 51
@@ -283,8 +284,11 @@ struct mthca_catas_err {
283 unsigned long stop; 284 unsigned long stop;
284 u32 size; 285 u32 size;
285 struct timer_list timer; 286 struct timer_list timer;
287 struct list_head list;
286}; 288};
287 289
290extern struct mutex mthca_device_mutex;
291
288struct mthca_dev { 292struct mthca_dev {
289 struct ib_device ib_dev; 293 struct ib_device ib_dev;
290 struct pci_dev *pdev; 294 struct pci_dev *pdev;
@@ -450,6 +454,9 @@ void mthca_unregister_device(struct mthca_dev *dev);
450 454
451void mthca_start_catas_poll(struct mthca_dev *dev); 455void mthca_start_catas_poll(struct mthca_dev *dev);
452void mthca_stop_catas_poll(struct mthca_dev *dev); 456void mthca_stop_catas_poll(struct mthca_dev *dev);
457int __mthca_restart_one(struct pci_dev *pdev);
458int mthca_catas_init(void);
459void mthca_catas_cleanup(void);
453 460
454int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); 461int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
455void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); 462void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index 7b82c1907f04..47ea02148368 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -80,6 +80,8 @@ static int tune_pci = 0;
80module_param(tune_pci, int, 0444); 80module_param(tune_pci, int, 0444);
81MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero"); 81MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero");
82 82
83struct mutex mthca_device_mutex;
84
83static const char mthca_version[] __devinitdata = 85static const char mthca_version[] __devinitdata =
84 DRV_NAME ": Mellanox InfiniBand HCA driver v" 86 DRV_NAME ": Mellanox InfiniBand HCA driver v"
85 DRV_VERSION " (" DRV_RELDATE ")\n"; 87 DRV_VERSION " (" DRV_RELDATE ")\n";
@@ -978,28 +980,15 @@ static struct {
978 MTHCA_FLAG_SINAI_OPT } 980 MTHCA_FLAG_SINAI_OPT }
979}; 981};
980 982
981static int __devinit mthca_init_one(struct pci_dev *pdev, 983static int __mthca_init_one(struct pci_dev *pdev, int hca_type)
982 const struct pci_device_id *id)
983{ 984{
984 static int mthca_version_printed = 0;
985 int ddr_hidden = 0; 985 int ddr_hidden = 0;
986 int err; 986 int err;
987 struct mthca_dev *mdev; 987 struct mthca_dev *mdev;
988 988
989 if (!mthca_version_printed) {
990 printk(KERN_INFO "%s", mthca_version);
991 ++mthca_version_printed;
992 }
993
994 printk(KERN_INFO PFX "Initializing %s\n", 989 printk(KERN_INFO PFX "Initializing %s\n",
995 pci_name(pdev)); 990 pci_name(pdev));
996 991
997 if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) {
998 printk(KERN_ERR PFX "%s has invalid driver data %lx\n",
999 pci_name(pdev), id->driver_data);
1000 return -ENODEV;
1001 }
1002
1003 err = pci_enable_device(pdev); 992 err = pci_enable_device(pdev);
1004 if (err) { 993 if (err) {
1005 dev_err(&pdev->dev, "Cannot enable PCI device, " 994 dev_err(&pdev->dev, "Cannot enable PCI device, "
@@ -1065,7 +1054,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev,
1065 1054
1066 mdev->pdev = pdev; 1055 mdev->pdev = pdev;
1067 1056
1068 mdev->mthca_flags = mthca_hca_table[id->driver_data].flags; 1057 mdev->mthca_flags = mthca_hca_table[hca_type].flags;
1069 if (ddr_hidden) 1058 if (ddr_hidden)
1070 mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN; 1059 mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN;
1071 1060
@@ -1099,13 +1088,13 @@ static int __devinit mthca_init_one(struct pci_dev *pdev,
1099 if (err) 1088 if (err)
1100 goto err_cmd; 1089 goto err_cmd;
1101 1090
1102 if (mdev->fw_ver < mthca_hca_table[id->driver_data].latest_fw) { 1091 if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) {
1103 mthca_warn(mdev, "HCA FW version %d.%d.%d is old (%d.%d.%d is current).\n", 1092 mthca_warn(mdev, "HCA FW version %d.%d.%d is old (%d.%d.%d is current).\n",
1104 (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff, 1093 (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff,
1105 (int) (mdev->fw_ver & 0xffff), 1094 (int) (mdev->fw_ver & 0xffff),
1106 (int) (mthca_hca_table[id->driver_data].latest_fw >> 32), 1095 (int) (mthca_hca_table[hca_type].latest_fw >> 32),
1107 (int) (mthca_hca_table[id->driver_data].latest_fw >> 16) & 0xffff, 1096 (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff,
1108 (int) (mthca_hca_table[id->driver_data].latest_fw & 0xffff)); 1097 (int) (mthca_hca_table[hca_type].latest_fw & 0xffff));
1109 mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n"); 1098 mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n");
1110 } 1099 }
1111 1100
@@ -1122,6 +1111,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev,
1122 goto err_unregister; 1111 goto err_unregister;
1123 1112
1124 pci_set_drvdata(pdev, mdev); 1113 pci_set_drvdata(pdev, mdev);
1114 mdev->hca_type = hca_type;
1125 1115
1126 return 0; 1116 return 0;
1127 1117
@@ -1166,7 +1156,7 @@ err_disable_pdev:
1166 return err; 1156 return err;
1167} 1157}
1168 1158
1169static void __devexit mthca_remove_one(struct pci_dev *pdev) 1159static void __mthca_remove_one(struct pci_dev *pdev)
1170{ 1160{
1171 struct mthca_dev *mdev = pci_get_drvdata(pdev); 1161 struct mthca_dev *mdev = pci_get_drvdata(pdev);
1172 u8 status; 1162 u8 status;
@@ -1211,6 +1201,51 @@ static void __devexit mthca_remove_one(struct pci_dev *pdev)
1211 } 1201 }
1212} 1202}
1213 1203
1204int __mthca_restart_one(struct pci_dev *pdev)
1205{
1206 struct mthca_dev *mdev;
1207
1208 mdev = pci_get_drvdata(pdev);
1209 if (!mdev)
1210 return -ENODEV;
1211 __mthca_remove_one(pdev);
1212 return __mthca_init_one(pdev, mdev->hca_type);
1213}
1214
1215static int __devinit mthca_init_one(struct pci_dev *pdev,
1216 const struct pci_device_id *id)
1217{
1218 static int mthca_version_printed = 0;
1219 int ret;
1220
1221 mutex_lock(&mthca_device_mutex);
1222
1223 if (!mthca_version_printed) {
1224 printk(KERN_INFO "%s", mthca_version);
1225 ++mthca_version_printed;
1226 }
1227
1228 if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) {
1229 printk(KERN_ERR PFX "%s has invalid driver data %lx\n",
1230 pci_name(pdev), id->driver_data);
1231 mutex_unlock(&mthca_device_mutex);
1232 return -ENODEV;
1233 }
1234
1235 ret = __mthca_init_one(pdev, id->driver_data);
1236
1237 mutex_unlock(&mthca_device_mutex);
1238
1239 return ret;
1240}
1241
1242static void __devexit mthca_remove_one(struct pci_dev *pdev)
1243{
1244 mutex_lock(&mthca_device_mutex);
1245 __mthca_remove_one(pdev);
1246 mutex_unlock(&mthca_device_mutex);
1247}
1248
1214static struct pci_device_id mthca_pci_table[] = { 1249static struct pci_device_id mthca_pci_table[] = {
1215 { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR), 1250 { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR),
1216 .driver_data = TAVOR }, 1251 .driver_data = TAVOR },
@@ -1248,13 +1283,24 @@ static int __init mthca_init(void)
1248{ 1283{
1249 int ret; 1284 int ret;
1250 1285
1286 mutex_init(&mthca_device_mutex);
1287 ret = mthca_catas_init();
1288 if (ret)
1289 return ret;
1290
1251 ret = pci_register_driver(&mthca_driver); 1291 ret = pci_register_driver(&mthca_driver);
1252 return ret < 0 ? ret : 0; 1292 if (ret < 0) {
1293 mthca_catas_cleanup();
1294 return ret;
1295 }
1296
1297 return 0;
1253} 1298}
1254 1299
1255static void __exit mthca_cleanup(void) 1300static void __exit mthca_cleanup(void)
1256{ 1301{
1257 pci_unregister_driver(&mthca_driver); 1302 pci_unregister_driver(&mthca_driver);
1303 mthca_catas_cleanup();
1258} 1304}
1259 1305
1260module_init(mthca_init); 1306module_init(mthca_init);