diff options
author | Roland Dreier <rolandd@cisco.com> | 2005-10-27 14:03:38 -0400 |
---|---|---|
committer | Roland Dreier <rolandd@cisco.com> | 2005-10-27 14:03:38 -0400 |
commit | 3d155f8cd0d077938d271225d26ee52f8eb26082 (patch) | |
tree | 28f65ed2dc5e9f12884daf2a97f50367f145d8a0 /drivers/infiniband | |
parent | 7cc656efb560cda66b5ed48444cad7556ea4fe99 (diff) |
[IB] mthca: first pass at catastrophic error reporting
Add some initial support for detecting and reporting catastrophic
errors reported by Mellanox HCAs. We start a periodic timer which
polls the catastrophic error reporting buffer in device memory. If an
error is detected, we dump the contents of the buffer for post-mortem
debugging, and report a fatal asynchronous error to higher levels.
In the future we can try to recover from these errors by resetting the
device, but this will require some work in higher-level code as well.
Let's get this in now, so that we at least get catastrophic errors
reported in logs.
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband')
-rw-r--r-- | drivers/infiniband/hw/mthca/Makefile | 3 | ||||
-rw-r--r-- | drivers/infiniband/hw/mthca/mthca_catas.c | 153 | ||||
-rw-r--r-- | drivers/infiniband/hw/mthca/mthca_cmd.c | 5 | ||||
-rw-r--r-- | drivers/infiniband/hw/mthca/mthca_dev.h | 13 | ||||
-rw-r--r-- | drivers/infiniband/hw/mthca/mthca_provider.c | 3 |
5 files changed, 176 insertions, 1 deletions
diff --git a/drivers/infiniband/hw/mthca/Makefile b/drivers/infiniband/hw/mthca/Makefile index c44f7bae5424..47ec5a7cba0b 100644 --- a/drivers/infiniband/hw/mthca/Makefile +++ b/drivers/infiniband/hw/mthca/Makefile | |||
@@ -7,4 +7,5 @@ obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o | |||
7 | ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \ | 7 | ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \ |
8 | mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \ | 8 | mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \ |
9 | mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \ | 9 | mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \ |
10 | mthca_provider.o mthca_memfree.o mthca_uar.o mthca_srq.o | 10 | mthca_provider.o mthca_memfree.o mthca_uar.o mthca_srq.o \ |
11 | mthca_catas.o | ||
diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c new file mode 100644 index 000000000000..7ac52af43b99 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_catas.c | |||
@@ -0,0 +1,153 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2005 Cisco Systems. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | * | ||
32 | * $Id$ | ||
33 | */ | ||
34 | |||
35 | #include "mthca_dev.h" | ||
36 | |||
37 | enum { | ||
38 | MTHCA_CATAS_POLL_INTERVAL = 5 * HZ, /* delay, in jiffies, between polls of the error buffer */ | ||
39 | /* Error type codes: compared in handle_catas() against the top byte of the byte-swapped first buffer word. */ |||
40 | MTHCA_CATAS_TYPE_INTERNAL = 0, /* logged as "internal error" */ | ||
41 | MTHCA_CATAS_TYPE_UPLINK = 3, /* logged as "uplink bus error" */ | ||
42 | MTHCA_CATAS_TYPE_DDR = 4, /* logged as "DDR data error" */ | ||
43 | MTHCA_CATAS_TYPE_PARITY = 5, /* logged as "internal parity error" */ | ||
44 | }; | ||
45 | |||
46 | static DEFINE_SPINLOCK(catas_lock); /* file-wide lock: taken around writes to catas_err.stop and around the timer re-arm in poll_catas() */ | ||
47 | |||
48 | static void handle_catas(struct mthca_dev *dev) | ||
49 | { | ||
50 | struct ib_event event; | ||
51 | const char *type; | ||
52 | int i; | ||
53 | |||
54 | event.device = &dev->ib_dev; | ||
55 | event.event = IB_EVENT_DEVICE_FATAL; | ||
56 | event.element.port_num = 0; | ||
57 | |||
58 | ib_dispatch_event(&event); | ||
59 | |||
60 | switch (swab32(readl(dev->catas_err.map)) >> 24) { | ||
61 | case MTHCA_CATAS_TYPE_INTERNAL: | ||
62 | type = "internal error"; | ||
63 | break; | ||
64 | case MTHCA_CATAS_TYPE_UPLINK: | ||
65 | type = "uplink bus error"; | ||
66 | break; | ||
67 | case MTHCA_CATAS_TYPE_DDR: | ||
68 | type = "DDR data error"; | ||
69 | break; | ||
70 | case MTHCA_CATAS_TYPE_PARITY: | ||
71 | type = "internal parity error"; | ||
72 | break; | ||
73 | default: | ||
74 | type = "unknown error"; | ||
75 | break; | ||
76 | } | ||
77 | |||
78 | mthca_err(dev, "Catastrophic error detected: %s\n", type); | ||
79 | for (i = 0; i < dev->catas_err.size; ++i) | ||
80 | mthca_err(dev, " buf[%02x]: %08x\n", | ||
81 | i, swab32(readl(dev->catas_err.map + i))); | ||
82 | } | ||
83 | |||
84 | static void poll_catas(unsigned long dev_ptr) | ||
85 | { | ||
86 | struct mthca_dev *dev = (struct mthca_dev *) dev_ptr; | ||
87 | unsigned long flags; | ||
88 | int i; | ||
89 | |||
90 | for (i = 0; i < dev->catas_err.size; ++i) | ||
91 | if (readl(dev->catas_err.map + i)) { | ||
92 | handle_catas(dev); | ||
93 | return; | ||
94 | } | ||
95 | |||
96 | spin_lock_irqsave(&catas_lock, flags); | ||
97 | if (dev->catas_err.stop) | ||
98 | mod_timer(&dev->catas_err.timer, | ||
99 | jiffies + MTHCA_CATAS_POLL_INTERVAL); | ||
100 | spin_unlock_irqrestore(&catas_lock, flags); | ||
101 | |||
102 | return; | ||
103 | } | ||
104 | |||
105 | void mthca_start_catas_poll(struct mthca_dev *dev) | ||
106 | { | ||
107 | unsigned long addr; | ||
108 | |||
109 | init_timer(&dev->catas_err.timer); | ||
110 | dev->catas_err.stop = 0; | ||
111 | dev->catas_err.map = NULL; | ||
112 | |||
113 | addr = pci_resource_start(dev->pdev, 0) + | ||
114 | ((pci_resource_len(dev->pdev, 0) - 1) & | ||
115 | dev->catas_err.addr); | ||
116 | |||
117 | if (!request_mem_region(addr, dev->catas_err.size * 4, | ||
118 | DRV_NAME)) { | ||
119 | mthca_warn(dev, "couldn't request catastrophic error region " | ||
120 | "at 0x%lx/0x%x\n", addr, dev->catas_err.size * 4); | ||
121 | return; | ||
122 | } | ||
123 | |||
124 | dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4); | ||
125 | if (!dev->catas_err.map) { | ||
126 | mthca_warn(dev, "couldn't map catastrophic error region " | ||
127 | "at 0x%lx/0x%x\n", addr, dev->catas_err.size * 4); | ||
128 | release_mem_region(addr, dev->catas_err.size * 4); | ||
129 | return; | ||
130 | } | ||
131 | |||
132 | dev->catas_err.timer.data = (unsigned long) dev; | ||
133 | dev->catas_err.timer.function = poll_catas; | ||
134 | dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL; | ||
135 | add_timer(&dev->catas_err.timer); | ||
136 | } | ||
137 | |||
138 | void mthca_stop_catas_poll(struct mthca_dev *dev) | ||
139 | { | ||
140 | spin_lock_irq(&catas_lock); | ||
141 | dev->catas_err.stop = 1; | ||
142 | spin_unlock_irq(&catas_lock); | ||
143 | |||
144 | del_timer_sync(&dev->catas_err.timer); | ||
145 | |||
146 | if (dev->catas_err.map) { | ||
147 | iounmap(dev->catas_err.map); | ||
148 | release_mem_region(pci_resource_start(dev->pdev, 0) + | ||
149 | ((pci_resource_len(dev->pdev, 0) - 1) & | ||
150 | dev->catas_err.addr), | ||
151 | dev->catas_err.size * 4); | ||
152 | } | ||
153 | } | ||
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 1bd7dc8f778c..9220473dbfbd 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c | |||
@@ -1,6 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. | 2 | * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. |
3 | * Copyright (c) 2005 Mellanox Technologies. All rights reserved. | 3 | * Copyright (c) 2005 Mellanox Technologies. All rights reserved. |
4 | * Copyright (c) 2005 Cisco Systems. All rights reserved. | ||
4 | * | 5 | * |
5 | * This software is available to you under a choice of one of two | 6 | * This software is available to you under a choice of one of two |
6 | * licenses. You may choose to be licensed under the terms of the GNU | 7 | * licenses. You may choose to be licensed under the terms of the GNU |
@@ -706,9 +707,13 @@ int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status) | |||
706 | 707 | ||
707 | MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); | 708 | MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); |
708 | dev->cmd.max_cmds = 1 << lg; | 709 | dev->cmd.max_cmds = 1 << lg; |
710 | MTHCA_GET(dev->catas_err.addr, outbox, QUERY_FW_ERR_START_OFFSET); | ||
711 | MTHCA_GET(dev->catas_err.size, outbox, QUERY_FW_ERR_SIZE_OFFSET); | ||
709 | 712 | ||
710 | mthca_dbg(dev, "FW version %012llx, max commands %d\n", | 713 | mthca_dbg(dev, "FW version %012llx, max commands %d\n", |
711 | (unsigned long long) dev->fw_ver, dev->cmd.max_cmds); | 714 | (unsigned long long) dev->fw_ver, dev->cmd.max_cmds); |
715 | mthca_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x\n", | ||
716 | (unsigned long long) dev->catas_err.addr, dev->catas_err.size); | ||
712 | 717 | ||
713 | if (mthca_is_memfree(dev)) { | 718 | if (mthca_is_memfree(dev)) { |
714 | MTHCA_GET(dev->fw.arbel.fw_pages, outbox, QUERY_FW_SIZE_OFFSET); | 719 | MTHCA_GET(dev->fw.arbel.fw_pages, outbox, QUERY_FW_SIZE_OFFSET); |
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index f106bac0f925..7e68bd4a3780 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h | |||
@@ -258,6 +258,14 @@ struct mthca_mcg_table { | |||
258 | struct mthca_icm_table *table; | 258 | struct mthca_icm_table *table; |
259 | }; | 259 | }; |
260 | 260 | ||
261 | struct mthca_catas_err { /* per-device state for catastrophic error polling */ | ||
262 | u64 addr; /* error buffer location as read by QUERY_FW; masked with (BAR 0 length - 1) to get the offset inside BAR 0 */ | ||
263 | u32 __iomem *map; /* ioremap()ed error buffer, or NULL when it is not mapped */ | ||
264 | unsigned long stop; /* cleared when polling starts, set to 1 by mthca_stop_catas_poll() */ | ||
265 | u32 size; /* error buffer length in 32-bit words, as read by QUERY_FW */ | ||
266 | struct timer_list timer; /* timer whose callback is poll_catas() */ | ||
267 | }; | ||
261 | struct mthca_dev { | 269 | struct mthca_dev { |
262 | struct ib_device ib_dev; | 270 | struct ib_device ib_dev; |
263 | struct pci_dev *pdev; | 271 | struct pci_dev *pdev; |
@@ -318,6 +326,8 @@ struct mthca_dev { | |||
318 | struct mthca_av_table av_table; | 326 | struct mthca_av_table av_table; |
319 | struct mthca_mcg_table mcg_table; | 327 | struct mthca_mcg_table mcg_table; |
320 | 328 | ||
329 | struct mthca_catas_err catas_err; | ||
330 | |||
321 | struct mthca_uar driver_uar; | 331 | struct mthca_uar driver_uar; |
322 | struct mthca_db_table *db_tab; | 332 | struct mthca_db_table *db_tab; |
323 | struct mthca_pd driver_pd; | 333 | struct mthca_pd driver_pd; |
@@ -405,6 +415,9 @@ void mthca_cleanup_mcg_table(struct mthca_dev *dev); | |||
405 | int mthca_register_device(struct mthca_dev *dev); | 415 | int mthca_register_device(struct mthca_dev *dev); |
406 | void mthca_unregister_device(struct mthca_dev *dev); | 416 | void mthca_unregister_device(struct mthca_dev *dev); |
407 | 417 | ||
418 | void mthca_start_catas_poll(struct mthca_dev *dev); | ||
419 | void mthca_stop_catas_poll(struct mthca_dev *dev); | ||
420 | |||
408 | int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); | 421 | int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); |
409 | void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); | 422 | void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); |
410 | 423 | ||
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 9e911a1ea415..1b9477edbd7b 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c | |||
@@ -1175,10 +1175,13 @@ int mthca_register_device(struct mthca_dev *dev) | |||
1175 | } | 1175 | } |
1176 | } | 1176 | } |
1177 | 1177 | ||
1178 | mthca_start_catas_poll(dev); | ||
1179 | |||
1178 | return 0; | 1180 | return 0; |
1179 | } | 1181 | } |
1180 | 1182 | ||
1181 | void mthca_unregister_device(struct mthca_dev *dev) | 1183 | void mthca_unregister_device(struct mthca_dev *dev) |
1182 | { | 1184 | { |
1185 | mthca_stop_catas_poll(dev); /* stop error polling before the device is unregistered */
1183 | ib_unregister_device(&dev->ib_dev); | 1186 | ib_unregister_device(&dev->ib_dev); |
1184 | } | 1187 | } |