diff options
| author | Roland Dreier <rolandd@cisco.com> | 2005-10-27 14:03:38 -0400 |
|---|---|---|
| committer | Roland Dreier <rolandd@cisco.com> | 2005-10-27 14:03:38 -0400 |
| commit | 3d155f8cd0d077938d271225d26ee52f8eb26082 (patch) | |
| tree | 28f65ed2dc5e9f12884daf2a97f50367f145d8a0 | |
| parent | 7cc656efb560cda66b5ed48444cad7556ea4fe99 (diff) | |
[IB] mthca: first pass at catastrophic error reporting
Add some initial support for detecting and reporting catastrophic
errors reported by Mellanox HCAs. We start a periodic timer which
polls the catastrophic error reporting buffer in device memory. If an
error is detected, we dump the contents of the buffer for post-mortem
debugging, and report a fatal asynchronous error to higher levels.
In the future we can try to recover from these errors by resetting the
device, but this will require some work in higher-level code as well.
Let's get this in now, so that we at least get catastrophic errors
reported in logs.
Signed-off-by: Roland Dreier <rolandd@cisco.com>
| -rw-r--r-- | drivers/infiniband/hw/mthca/Makefile | 3 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mthca/mthca_catas.c | 153 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mthca/mthca_cmd.c | 5 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mthca/mthca_dev.h | 13 | ||||
| -rw-r--r-- | drivers/infiniband/hw/mthca/mthca_provider.c | 3 |
5 files changed, 176 insertions, 1 deletions
diff --git a/drivers/infiniband/hw/mthca/Makefile b/drivers/infiniband/hw/mthca/Makefile index c44f7bae5424..47ec5a7cba0b 100644 --- a/drivers/infiniband/hw/mthca/Makefile +++ b/drivers/infiniband/hw/mthca/Makefile | |||
| @@ -7,4 +7,5 @@ obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o | |||
| 7 | ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \ | 7 | ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \ |
| 8 | mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \ | 8 | mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \ |
| 9 | mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \ | 9 | mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \ |
| 10 | mthca_provider.o mthca_memfree.o mthca_uar.o mthca_srq.o | 10 | mthca_provider.o mthca_memfree.o mthca_uar.o mthca_srq.o \ |
| 11 | mthca_catas.o | ||
diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c new file mode 100644 index 000000000000..7ac52af43b99 --- /dev/null +++ b/drivers/infiniband/hw/mthca/mthca_catas.c | |||
| @@ -0,0 +1,153 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2005 Cisco Systems. All rights reserved. | ||
| 3 | * | ||
| 4 | * This software is available to you under a choice of one of two | ||
| 5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
| 6 | * General Public License (GPL) Version 2, available from the file | ||
| 7 | * COPYING in the main directory of this source tree, or the | ||
| 8 | * OpenIB.org BSD license below: | ||
| 9 | * | ||
| 10 | * Redistribution and use in source and binary forms, with or | ||
| 11 | * without modification, are permitted provided that the following | ||
| 12 | * conditions are met: | ||
| 13 | * | ||
| 14 | * - Redistributions of source code must retain the above | ||
| 15 | * copyright notice, this list of conditions and the following | ||
| 16 | * disclaimer. | ||
| 17 | * | ||
| 18 | * - Redistributions in binary form must reproduce the above | ||
| 19 | * copyright notice, this list of conditions and the following | ||
| 20 | * disclaimer in the documentation and/or other materials | ||
| 21 | * provided with the distribution. | ||
| 22 | * | ||
| 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
| 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
| 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
| 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
| 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 30 | * SOFTWARE. | ||
| 31 | * | ||
| 32 | * $Id$ | ||
| 33 | */ | ||
| 34 | |||
| 35 | #include "mthca_dev.h" | ||
| 36 | |||
| 37 | enum { | ||
| 38 | MTHCA_CATAS_POLL_INTERVAL = 5 * HZ, | ||
| 39 | |||
| 40 | MTHCA_CATAS_TYPE_INTERNAL = 0, | ||
| 41 | MTHCA_CATAS_TYPE_UPLINK = 3, | ||
| 42 | MTHCA_CATAS_TYPE_DDR = 4, | ||
| 43 | MTHCA_CATAS_TYPE_PARITY = 5, | ||
| 44 | }; | ||
| 45 | |||
| 46 | static DEFINE_SPINLOCK(catas_lock); | ||
| 47 | |||
| 48 | static void handle_catas(struct mthca_dev *dev) | ||
| 49 | { | ||
| 50 | struct ib_event event; | ||
| 51 | const char *type; | ||
| 52 | int i; | ||
| 53 | |||
| 54 | event.device = &dev->ib_dev; | ||
| 55 | event.event = IB_EVENT_DEVICE_FATAL; | ||
| 56 | event.element.port_num = 0; | ||
| 57 | |||
| 58 | ib_dispatch_event(&event); | ||
| 59 | |||
| 60 | switch (swab32(readl(dev->catas_err.map)) >> 24) { | ||
| 61 | case MTHCA_CATAS_TYPE_INTERNAL: | ||
| 62 | type = "internal error"; | ||
| 63 | break; | ||
| 64 | case MTHCA_CATAS_TYPE_UPLINK: | ||
| 65 | type = "uplink bus error"; | ||
| 66 | break; | ||
| 67 | case MTHCA_CATAS_TYPE_DDR: | ||
| 68 | type = "DDR data error"; | ||
| 69 | break; | ||
| 70 | case MTHCA_CATAS_TYPE_PARITY: | ||
| 71 | type = "internal parity error"; | ||
| 72 | break; | ||
| 73 | default: | ||
| 74 | type = "unknown error"; | ||
| 75 | break; | ||
| 76 | } | ||
| 77 | |||
| 78 | mthca_err(dev, "Catastrophic error detected: %s\n", type); | ||
| 79 | for (i = 0; i < dev->catas_err.size; ++i) | ||
| 80 | mthca_err(dev, " buf[%02x]: %08x\n", | ||
| 81 | i, swab32(readl(dev->catas_err.map + i))); | ||
| 82 | } | ||
| 83 | |||
| 84 | static void poll_catas(unsigned long dev_ptr) | ||
| 85 | { | ||
| 86 | struct mthca_dev *dev = (struct mthca_dev *) dev_ptr; | ||
| 87 | unsigned long flags; | ||
| 88 | int i; | ||
| 89 | |||
| 90 | for (i = 0; i < dev->catas_err.size; ++i) | ||
| 91 | if (readl(dev->catas_err.map + i)) { | ||
| 92 | handle_catas(dev); | ||
| 93 | return; | ||
| 94 | } | ||
| 95 | |||
| 96 | spin_lock_irqsave(&catas_lock, flags); | ||
| 97 | if (dev->catas_err.stop) | ||
| 98 | mod_timer(&dev->catas_err.timer, | ||
| 99 | jiffies + MTHCA_CATAS_POLL_INTERVAL); | ||
| 100 | spin_unlock_irqrestore(&catas_lock, flags); | ||
| 101 | |||
| 102 | return; | ||
| 103 | } | ||
| 104 | |||
| 105 | void mthca_start_catas_poll(struct mthca_dev *dev) | ||
| 106 | { | ||
| 107 | unsigned long addr; | ||
| 108 | |||
| 109 | init_timer(&dev->catas_err.timer); | ||
| 110 | dev->catas_err.stop = 0; | ||
| 111 | dev->catas_err.map = NULL; | ||
| 112 | |||
| 113 | addr = pci_resource_start(dev->pdev, 0) + | ||
| 114 | ((pci_resource_len(dev->pdev, 0) - 1) & | ||
| 115 | dev->catas_err.addr); | ||
| 116 | |||
| 117 | if (!request_mem_region(addr, dev->catas_err.size * 4, | ||
| 118 | DRV_NAME)) { | ||
| 119 | mthca_warn(dev, "couldn't request catastrophic error region " | ||
| 120 | "at 0x%lx/0x%x\n", addr, dev->catas_err.size * 4); | ||
| 121 | return; | ||
| 122 | } | ||
| 123 | |||
| 124 | dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4); | ||
| 125 | if (!dev->catas_err.map) { | ||
| 126 | mthca_warn(dev, "couldn't map catastrophic error region " | ||
| 127 | "at 0x%lx/0x%x\n", addr, dev->catas_err.size * 4); | ||
| 128 | release_mem_region(addr, dev->catas_err.size * 4); | ||
| 129 | return; | ||
| 130 | } | ||
| 131 | |||
| 132 | dev->catas_err.timer.data = (unsigned long) dev; | ||
| 133 | dev->catas_err.timer.function = poll_catas; | ||
| 134 | dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL; | ||
| 135 | add_timer(&dev->catas_err.timer); | ||
| 136 | } | ||
| 137 | |||
| 138 | void mthca_stop_catas_poll(struct mthca_dev *dev) | ||
| 139 | { | ||
| 140 | spin_lock_irq(&catas_lock); | ||
| 141 | dev->catas_err.stop = 1; | ||
| 142 | spin_unlock_irq(&catas_lock); | ||
| 143 | |||
| 144 | del_timer_sync(&dev->catas_err.timer); | ||
| 145 | |||
| 146 | if (dev->catas_err.map) { | ||
| 147 | iounmap(dev->catas_err.map); | ||
| 148 | release_mem_region(pci_resource_start(dev->pdev, 0) + | ||
| 149 | ((pci_resource_len(dev->pdev, 0) - 1) & | ||
| 150 | dev->catas_err.addr), | ||
| 151 | dev->catas_err.size * 4); | ||
| 152 | } | ||
| 153 | } | ||
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 1bd7dc8f778c..9220473dbfbd 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c | |||
| @@ -1,6 +1,7 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. | 2 | * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. |
| 3 | * Copyright (c) 2005 Mellanox Technologies. All rights reserved. | 3 | * Copyright (c) 2005 Mellanox Technologies. All rights reserved. |
| 4 | * Copyright (c) 2005 Cisco Systems. All rights reserved. | ||
| 4 | * | 5 | * |
| 5 | * This software is available to you under a choice of one of two | 6 | * This software is available to you under a choice of one of two |
| 6 | * licenses. You may choose to be licensed under the terms of the GNU | 7 | * licenses. You may choose to be licensed under the terms of the GNU |
| @@ -706,9 +707,13 @@ int mthca_QUERY_FW(struct mthca_dev *dev, u8 *status) | |||
| 706 | 707 | ||
| 707 | MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); | 708 | MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET); |
| 708 | dev->cmd.max_cmds = 1 << lg; | 709 | dev->cmd.max_cmds = 1 << lg; |
| 710 | MTHCA_GET(dev->catas_err.addr, outbox, QUERY_FW_ERR_START_OFFSET); | ||
| 711 | MTHCA_GET(dev->catas_err.size, outbox, QUERY_FW_ERR_SIZE_OFFSET); | ||
| 709 | 712 | ||
| 710 | mthca_dbg(dev, "FW version %012llx, max commands %d\n", | 713 | mthca_dbg(dev, "FW version %012llx, max commands %d\n", |
| 711 | (unsigned long long) dev->fw_ver, dev->cmd.max_cmds); | 714 | (unsigned long long) dev->fw_ver, dev->cmd.max_cmds); |
| 715 | mthca_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x\n", | ||
| 716 | (unsigned long long) dev->catas_err.addr, dev->catas_err.size); | ||
| 712 | 717 | ||
| 713 | if (mthca_is_memfree(dev)) { | 718 | if (mthca_is_memfree(dev)) { |
| 714 | MTHCA_GET(dev->fw.arbel.fw_pages, outbox, QUERY_FW_SIZE_OFFSET); | 719 | MTHCA_GET(dev->fw.arbel.fw_pages, outbox, QUERY_FW_SIZE_OFFSET); |
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index f106bac0f925..7e68bd4a3780 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h | |||
| @@ -258,6 +258,14 @@ struct mthca_mcg_table { | |||
| 258 | struct mthca_icm_table *table; | 258 | struct mthca_icm_table *table; |
| 259 | }; | 259 | }; |
| 260 | 260 | ||
| 261 | struct mthca_catas_err { | ||
| 262 | u64 addr; | ||
| 263 | u32 __iomem *map; | ||
| 264 | unsigned long stop; | ||
| 265 | u32 size; | ||
| 266 | struct timer_list timer; | ||
| 267 | }; | ||
| 268 | |||
| 261 | struct mthca_dev { | 269 | struct mthca_dev { |
| 262 | struct ib_device ib_dev; | 270 | struct ib_device ib_dev; |
| 263 | struct pci_dev *pdev; | 271 | struct pci_dev *pdev; |
| @@ -318,6 +326,8 @@ struct mthca_dev { | |||
| 318 | struct mthca_av_table av_table; | 326 | struct mthca_av_table av_table; |
| 319 | struct mthca_mcg_table mcg_table; | 327 | struct mthca_mcg_table mcg_table; |
| 320 | 328 | ||
| 329 | struct mthca_catas_err catas_err; | ||
| 330 | |||
| 321 | struct mthca_uar driver_uar; | 331 | struct mthca_uar driver_uar; |
| 322 | struct mthca_db_table *db_tab; | 332 | struct mthca_db_table *db_tab; |
| 323 | struct mthca_pd driver_pd; | 333 | struct mthca_pd driver_pd; |
| @@ -405,6 +415,9 @@ void mthca_cleanup_mcg_table(struct mthca_dev *dev); | |||
| 405 | int mthca_register_device(struct mthca_dev *dev); | 415 | int mthca_register_device(struct mthca_dev *dev); |
| 406 | void mthca_unregister_device(struct mthca_dev *dev); | 416 | void mthca_unregister_device(struct mthca_dev *dev); |
| 407 | 417 | ||
| 418 | void mthca_start_catas_poll(struct mthca_dev *dev); | ||
| 419 | void mthca_stop_catas_poll(struct mthca_dev *dev); | ||
| 420 | |||
| 408 | int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); | 421 | int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar); |
| 409 | void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); | 422 | void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar); |
| 410 | 423 | ||
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 9e911a1ea415..1b9477edbd7b 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c | |||
| @@ -1175,10 +1175,13 @@ int mthca_register_device(struct mthca_dev *dev) | |||
| 1175 | } | 1175 | } |
| 1176 | } | 1176 | } |
| 1177 | 1177 | ||
| 1178 | mthca_start_catas_poll(dev); | ||
| 1179 | |||
| 1178 | return 0; | 1180 | return 0; |
| 1179 | } | 1181 | } |
| 1180 | 1182 | ||
| 1181 | void mthca_unregister_device(struct mthca_dev *dev) | 1183 | void mthca_unregister_device(struct mthca_dev *dev) |
| 1182 | { | 1184 | { |
| 1185 | mthca_stop_catas_poll(dev); | ||
| 1183 | ib_unregister_device(&dev->ib_dev); | 1186 | ib_unregister_device(&dev->ib_dev); |
| 1184 | } | 1187 | } |
