diff options
author | Jack Morgenstein <jackm@mellanox.co.il> | 2006-08-15 14:11:18 -0400 |
---|---|---|
committer | Roland Dreier <rolandd@cisco.com> | 2006-09-22 18:22:54 -0400 |
commit | b3b30f5e8a0c50db3d76b6f7c7cc50245aeb57fd (patch) | |
tree | 02e7b931be922a71e82da5c9baf273ac5d2312d9 | |
parent | 07eeec0627e93a1a753c4df004a97a4d0a7b9ceb (diff) |
IB/mthca: Recover from catastrophic errors
Trigger device remove and then add when a catastrophic error is
detected in hardware. This, in turn, will cause a device reset, which
we hope will recover from the catastrophic condition.
Since this might interfere with debugging the root cause, add a
module option to suppress this behaviour.
Signed-off-by: Jack Morgenstein <jackm@mellanox.co.il>
Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
-rw-r--r-- | drivers/infiniband/hw/mthca/mthca_catas.c | 62 | ||||
-rw-r--r-- | drivers/infiniband/hw/mthca/mthca_dev.h | 7 | ||||
-rw-r--r-- | drivers/infiniband/hw/mthca/mthca_main.c | 88 |
3 files changed, 136 insertions, 21 deletions
diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c index c3bec7490f52..cd044ea2dfa4 100644 --- a/drivers/infiniband/hw/mthca/mthca_catas.c +++ b/drivers/infiniband/hw/mthca/mthca_catas.c | |||
@@ -34,6 +34,7 @@ | |||
34 | 34 | ||
35 | #include <linux/jiffies.h> | 35 | #include <linux/jiffies.h> |
36 | #include <linux/timer.h> | 36 | #include <linux/timer.h> |
37 | #include <linux/workqueue.h> | ||
37 | 38 | ||
38 | #include "mthca_dev.h" | 39 | #include "mthca_dev.h" |
39 | 40 | ||
@@ -48,9 +49,41 @@ enum { | |||
48 | 49 | ||
49 | static DEFINE_SPINLOCK(catas_lock); | 50 | static DEFINE_SPINLOCK(catas_lock); |
50 | 51 | ||
52 | static LIST_HEAD(catas_list); | ||
53 | static struct workqueue_struct *catas_wq; | ||
54 | static struct work_struct catas_work; | ||
55 | |||
56 | static int catas_reset_disable; | ||
57 | module_param_named(catas_reset_disable, catas_reset_disable, int, 0644); | ||
58 | MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero"); | ||
59 | |||
60 | static void catas_reset(void *work_ptr) | ||
61 | { | ||
62 | struct mthca_dev *dev, *tmpdev; | ||
63 | LIST_HEAD(tlist); | ||
64 | int ret; | ||
65 | |||
66 | mutex_lock(&mthca_device_mutex); | ||
67 | |||
68 | spin_lock_irq(&catas_lock); | ||
69 | list_splice_init(&catas_list, &tlist); | ||
70 | spin_unlock_irq(&catas_lock); | ||
71 | |||
72 | list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) { | ||
73 | ret = __mthca_restart_one(dev->pdev); | ||
74 | if (ret) | ||
75 | mthca_err(dev, "Reset failed (%d)\n", ret); | ||
76 | else | ||
77 | mthca_dbg(dev, "Reset succeeded\n"); | ||
78 | } | ||
79 | |||
80 | mutex_unlock(&mthca_device_mutex); | ||
81 | } | ||
82 | |||
51 | static void handle_catas(struct mthca_dev *dev) | 83 | static void handle_catas(struct mthca_dev *dev) |
52 | { | 84 | { |
53 | struct ib_event event; | 85 | struct ib_event event; |
86 | unsigned long flags; | ||
54 | const char *type; | 87 | const char *type; |
55 | int i; | 88 | int i; |
56 | 89 | ||
@@ -82,6 +115,14 @@ static void handle_catas(struct mthca_dev *dev) | |||
82 | for (i = 0; i < dev->catas_err.size; ++i) | 115 | for (i = 0; i < dev->catas_err.size; ++i) |
83 | mthca_err(dev, " buf[%02x]: %08x\n", | 116 | mthca_err(dev, " buf[%02x]: %08x\n", |
84 | i, swab32(readl(dev->catas_err.map + i))); | 117 | i, swab32(readl(dev->catas_err.map + i))); |
118 | |||
119 | if (catas_reset_disable) | ||
120 | return; | ||
121 | |||
122 | spin_lock_irqsave(&catas_lock, flags); | ||
123 | list_add(&dev->catas_err.list, &catas_list); | ||
124 | queue_work(catas_wq, &catas_work); | ||
125 | spin_unlock_irqrestore(&catas_lock, flags); | ||
85 | } | 126 | } |
86 | 127 | ||
87 | static void poll_catas(unsigned long dev_ptr) | 128 | static void poll_catas(unsigned long dev_ptr) |
@@ -135,6 +176,7 @@ void mthca_start_catas_poll(struct mthca_dev *dev) | |||
135 | dev->catas_err.timer.data = (unsigned long) dev; | 176 | dev->catas_err.timer.data = (unsigned long) dev; |
136 | dev->catas_err.timer.function = poll_catas; | 177 | dev->catas_err.timer.function = poll_catas; |
137 | dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL; | 178 | dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL; |
179 | INIT_LIST_HEAD(&dev->catas_err.list); | ||
138 | add_timer(&dev->catas_err.timer); | 180 | add_timer(&dev->catas_err.timer); |
139 | } | 181 | } |
140 | 182 | ||
@@ -153,4 +195,24 @@ void mthca_stop_catas_poll(struct mthca_dev *dev) | |||
153 | dev->catas_err.addr), | 195 | dev->catas_err.addr), |
154 | dev->catas_err.size * 4); | 196 | dev->catas_err.size * 4); |
155 | } | 197 | } |
198 | |||
199 | spin_lock_irq(&catas_lock); | ||
200 | list_del(&dev->catas_err.list); | ||
201 | spin_unlock_irq(&catas_lock); | ||
202 | } | ||
203 | |||
204 | int __init mthca_catas_init(void) | ||
205 | { | ||
206 | INIT_WORK(&catas_work, catas_reset, NULL); | ||
207 | |||
208 | catas_wq = create_singlethread_workqueue("mthca_catas"); | ||
209 | if (!catas_wq) | ||
210 | return -ENOMEM; | ||
211 | |||
212 | return 0; | ||
213 | } | ||
214 | |||
215 | void mthca_catas_cleanup(void) | ||
216 | { | ||
217 | destroy_workqueue(catas_wq); | ||
156 | } | 218 | } |
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index 33bd0b8bfd13..fe5cecf70fed 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h | |||
@@ -45,6 +45,7 @@ | |||
45 | #include <linux/dma-mapping.h> | 45 | #include <linux/dma-mapping.h> |
46 | #include <linux/timer.h> | 46 | #include <linux/timer.h> |
47 | #include <linux/mutex.h> | 47 | #include <linux/mutex.h> |
48 | #include <linux/list.h> | ||
48 | 49 | ||
49 | #include <asm/semaphore.h> | 50 | #include <asm/semaphore.h> |
50 | 51 | ||
@@ -283,8 +284,11 @@ struct mthca_catas_err { | |||
283 | unsigned long stop; | 284 | unsigned long stop; |
284 | u32 size; | 285 | u32 size; |
285 | struct timer_list timer; | 286 | struct timer_list timer; |
287 | struct list_head list; | ||
286 | }; | 288 | }; |
287 | 289 | ||
290 | extern struct mutex mthca_device_mutex; | ||
291 | |||
288 | struct mthca_dev { | 292 | struct mthca_dev { |
289 | struct ib_device ib_dev; | 293 | struct ib_device ib_dev; |
290 | struct pci_dev *pdev; | 294 | struct pci_dev *pdev; |
@@ -450,6 +454,9 @@ void mthca_unregister_device(struct mthca_dev *dev); | |||
450 | 454 | ||
451 | void mthca_start_catas_poll(struct mthca_dev *dev); | 455 | void mthca_start_catas_poll(struct mthca_dev *dev); |
452 | void mthca_stop_catas_poll(struct mthca_dev *dev); | 456 | void mthca_stop_catas_poll(struct mthca_dev *dev); |
457 | int __mthca_restart_one(struct pci_dev *pdev); | ||
458 | int mthca_catas_init(void); | ||
459 | void mthca_catas_cleanup(void); | ||
453 | 460 | ||
454 | int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); | 461 | int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); |
455 | void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); | 462 | void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); |
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index 7b82c1907f04..47ea02148368 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c | |||
@@ -80,6 +80,8 @@ static int tune_pci = 0; | |||
80 | module_param(tune_pci, int, 0444); | 80 | module_param(tune_pci, int, 0444); |
81 | MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero"); | 81 | MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero"); |
82 | 82 | ||
83 | struct mutex mthca_device_mutex; | ||
84 | |||
83 | static const char mthca_version[] __devinitdata = | 85 | static const char mthca_version[] __devinitdata = |
84 | DRV_NAME ": Mellanox InfiniBand HCA driver v" | 86 | DRV_NAME ": Mellanox InfiniBand HCA driver v" |
85 | DRV_VERSION " (" DRV_RELDATE ")\n"; | 87 | DRV_VERSION " (" DRV_RELDATE ")\n"; |
@@ -978,28 +980,15 @@ static struct { | |||
978 | MTHCA_FLAG_SINAI_OPT } | 980 | MTHCA_FLAG_SINAI_OPT } |
979 | }; | 981 | }; |
980 | 982 | ||
981 | static int __devinit mthca_init_one(struct pci_dev *pdev, | 983 | static int __mthca_init_one(struct pci_dev *pdev, int hca_type) |
982 | const struct pci_device_id *id) | ||
983 | { | 984 | { |
984 | static int mthca_version_printed = 0; | ||
985 | int ddr_hidden = 0; | 985 | int ddr_hidden = 0; |
986 | int err; | 986 | int err; |
987 | struct mthca_dev *mdev; | 987 | struct mthca_dev *mdev; |
988 | 988 | ||
989 | if (!mthca_version_printed) { | ||
990 | printk(KERN_INFO "%s", mthca_version); | ||
991 | ++mthca_version_printed; | ||
992 | } | ||
993 | |||
994 | printk(KERN_INFO PFX "Initializing %s\n", | 989 | printk(KERN_INFO PFX "Initializing %s\n", |
995 | pci_name(pdev)); | 990 | pci_name(pdev)); |
996 | 991 | ||
997 | if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) { | ||
998 | printk(KERN_ERR PFX "%s has invalid driver data %lx\n", | ||
999 | pci_name(pdev), id->driver_data); | ||
1000 | return -ENODEV; | ||
1001 | } | ||
1002 | |||
1003 | err = pci_enable_device(pdev); | 992 | err = pci_enable_device(pdev); |
1004 | if (err) { | 993 | if (err) { |
1005 | dev_err(&pdev->dev, "Cannot enable PCI device, " | 994 | dev_err(&pdev->dev, "Cannot enable PCI device, " |
@@ -1065,7 +1054,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev, | |||
1065 | 1054 | ||
1066 | mdev->pdev = pdev; | 1055 | mdev->pdev = pdev; |
1067 | 1056 | ||
1068 | mdev->mthca_flags = mthca_hca_table[id->driver_data].flags; | 1057 | mdev->mthca_flags = mthca_hca_table[hca_type].flags; |
1069 | if (ddr_hidden) | 1058 | if (ddr_hidden) |
1070 | mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN; | 1059 | mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN; |
1071 | 1060 | ||
@@ -1099,13 +1088,13 @@ static int __devinit mthca_init_one(struct pci_dev *pdev, | |||
1099 | if (err) | 1088 | if (err) |
1100 | goto err_cmd; | 1089 | goto err_cmd; |
1101 | 1090 | ||
1102 | if (mdev->fw_ver < mthca_hca_table[id->driver_data].latest_fw) { | 1091 | if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) { |
1103 | mthca_warn(mdev, "HCA FW version %d.%d.%d is old (%d.%d.%d is current).\n", | 1092 | mthca_warn(mdev, "HCA FW version %d.%d.%d is old (%d.%d.%d is current).\n", |
1104 | (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff, | 1093 | (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff, |
1105 | (int) (mdev->fw_ver & 0xffff), | 1094 | (int) (mdev->fw_ver & 0xffff), |
1106 | (int) (mthca_hca_table[id->driver_data].latest_fw >> 32), | 1095 | (int) (mthca_hca_table[hca_type].latest_fw >> 32), |
1107 | (int) (mthca_hca_table[id->driver_data].latest_fw >> 16) & 0xffff, | 1096 | (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff, |
1108 | (int) (mthca_hca_table[id->driver_data].latest_fw & 0xffff)); | 1097 | (int) (mthca_hca_table[hca_type].latest_fw & 0xffff)); |
1109 | mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n"); | 1098 | mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n"); |
1110 | } | 1099 | } |
1111 | 1100 | ||
@@ -1122,6 +1111,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev, | |||
1122 | goto err_unregister; | 1111 | goto err_unregister; |
1123 | 1112 | ||
1124 | pci_set_drvdata(pdev, mdev); | 1113 | pci_set_drvdata(pdev, mdev); |
1114 | mdev->hca_type = hca_type; | ||
1125 | 1115 | ||
1126 | return 0; | 1116 | return 0; |
1127 | 1117 | ||
@@ -1166,7 +1156,7 @@ err_disable_pdev: | |||
1166 | return err; | 1156 | return err; |
1167 | } | 1157 | } |
1168 | 1158 | ||
1169 | static void __devexit mthca_remove_one(struct pci_dev *pdev) | 1159 | static void __mthca_remove_one(struct pci_dev *pdev) |
1170 | { | 1160 | { |
1171 | struct mthca_dev *mdev = pci_get_drvdata(pdev); | 1161 | struct mthca_dev *mdev = pci_get_drvdata(pdev); |
1172 | u8 status; | 1162 | u8 status; |
@@ -1211,6 +1201,51 @@ static void __devexit mthca_remove_one(struct pci_dev *pdev) | |||
1211 | } | 1201 | } |
1212 | } | 1202 | } |
1213 | 1203 | ||
1204 | int __mthca_restart_one(struct pci_dev *pdev) | ||
1205 | { | ||
1206 | struct mthca_dev *mdev; | ||
1207 | |||
1208 | mdev = pci_get_drvdata(pdev); | ||
1209 | if (!mdev) | ||
1210 | return -ENODEV; | ||
1211 | __mthca_remove_one(pdev); | ||
1212 | return __mthca_init_one(pdev, mdev->hca_type); | ||
1213 | } | ||
1214 | |||
1215 | static int __devinit mthca_init_one(struct pci_dev *pdev, | ||
1216 | const struct pci_device_id *id) | ||
1217 | { | ||
1218 | static int mthca_version_printed = 0; | ||
1219 | int ret; | ||
1220 | |||
1221 | mutex_lock(&mthca_device_mutex); | ||
1222 | |||
1223 | if (!mthca_version_printed) { | ||
1224 | printk(KERN_INFO "%s", mthca_version); | ||
1225 | ++mthca_version_printed; | ||
1226 | } | ||
1227 | |||
1228 | if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) { | ||
1229 | printk(KERN_ERR PFX "%s has invalid driver data %lx\n", | ||
1230 | pci_name(pdev), id->driver_data); | ||
1231 | mutex_unlock(&mthca_device_mutex); | ||
1232 | return -ENODEV; | ||
1233 | } | ||
1234 | |||
1235 | ret = __mthca_init_one(pdev, id->driver_data); | ||
1236 | |||
1237 | mutex_unlock(&mthca_device_mutex); | ||
1238 | |||
1239 | return ret; | ||
1240 | } | ||
1241 | |||
1242 | static void __devexit mthca_remove_one(struct pci_dev *pdev) | ||
1243 | { | ||
1244 | mutex_lock(&mthca_device_mutex); | ||
1245 | __mthca_remove_one(pdev); | ||
1246 | mutex_unlock(&mthca_device_mutex); | ||
1247 | } | ||
1248 | |||
1214 | static struct pci_device_id mthca_pci_table[] = { | 1249 | static struct pci_device_id mthca_pci_table[] = { |
1215 | { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR), | 1250 | { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR), |
1216 | .driver_data = TAVOR }, | 1251 | .driver_data = TAVOR }, |
@@ -1248,13 +1283,24 @@ static int __init mthca_init(void) | |||
1248 | { | 1283 | { |
1249 | int ret; | 1284 | int ret; |
1250 | 1285 | ||
1286 | mutex_init(&mthca_device_mutex); | ||
1287 | ret = mthca_catas_init(); | ||
1288 | if (ret) | ||
1289 | return ret; | ||
1290 | |||
1251 | ret = pci_register_driver(&mthca_driver); | 1291 | ret = pci_register_driver(&mthca_driver); |
1252 | return ret < 0 ? ret : 0; | 1292 | if (ret < 0) { |
1293 | mthca_catas_cleanup(); | ||
1294 | return ret; | ||
1295 | } | ||
1296 | |||
1297 | return 0; | ||
1253 | } | 1298 | } |
1254 | 1299 | ||
1255 | static void __exit mthca_cleanup(void) | 1300 | static void __exit mthca_cleanup(void) |
1256 | { | 1301 | { |
1257 | pci_unregister_driver(&mthca_driver); | 1302 | pci_unregister_driver(&mthca_driver); |
1303 | mthca_catas_cleanup(); | ||
1258 | } | 1304 | } |
1259 | 1305 | ||
1260 | module_init(mthca_init); | 1306 | module_init(mthca_init); |