diff options
author | Bryan O'Sullivan <bos@pathscale.com> | 2006-09-28 12:00:18 -0400 |
---|---|---|
committer | Roland Dreier <rolandd@cisco.com> | 2006-09-28 14:17:03 -0400 |
commit | 89d1e09b6a6d844ef327937f41658a426be42501 (patch) | |
tree | 5730241c737baf67b0b1ddf89ff38f6936d649c4 | |
parent | 510847750c9d26052a71631e0fcad9e7f7a5f369 (diff) |
IB/ipath: Fix and recover TXE piobuf and PBC parity errors
We can sometimes trigger parity errors due to processor speculative
reads to our write-combined memory (mostly seen on Woodcrest). Add a
stats counter for these.
Factor out the sendbuffererror buffer cancellation code so it can be
used in the new handling; suppress the likely subsequent armlaunch and
spktlen error messages if they occur within two jiffies of the cancellation.
Also restore two TXE lines that had been dropped from the hwe_bitsextant
initialization; the omission was noticed while debugging.
Signed-off-by: Bryan O'Sullivan <bryan.osullivan@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
-rw-r--r-- | drivers/infiniband/hw/ipath/ipath_common.h | 3 | ||||
-rw-r--r-- | drivers/infiniband/hw/ipath/ipath_iba6110.c | 32 | ||||
-rw-r--r-- | drivers/infiniband/hw/ipath/ipath_iba6120.c | 37 | ||||
-rw-r--r-- | drivers/infiniband/hw/ipath/ipath_intr.c | 98 | ||||
-rw-r--r-- | drivers/infiniband/hw/ipath/ipath_kernel.h | 6 |
5 files changed, 124 insertions, 52 deletions
diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h index 382956d2ea4b..a9b109a353bc 100644 --- a/drivers/infiniband/hw/ipath/ipath_common.h +++ b/drivers/infiniband/hw/ipath/ipath_common.h | |||
@@ -141,8 +141,9 @@ struct infinipath_stats { | |||
141 | * packets if ipath not configured, etc.) | 141 | * packets if ipath not configured, etc.) |
142 | */ | 142 | */ |
143 | __u64 sps_krdrops; | 143 | __u64 sps_krdrops; |
144 | __u64 sps_txeparity; /* PIO buffer parity error, recovered */ | ||
144 | /* pad for future growth */ | 145 | /* pad for future growth */ |
145 | __u64 __sps_pad[46]; | 146 | __u64 __sps_pad[45]; |
146 | }; | 147 | }; |
147 | 148 | ||
148 | /* | 149 | /* |
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c index fd49c9c32c68..9e4e8d4c6e20 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6110.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c | |||
@@ -451,7 +451,10 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, | |||
451 | * make sure we get this much out, unless told to be quiet, | 451 | * make sure we get this much out, unless told to be quiet, |
452 | * or it's occurred within the last 5 seconds | 452 | * or it's occurred within the last 5 seconds |
453 | */ | 453 | */ |
454 | if ((hwerrs & ~dd->ipath_lasthwerror) || | 454 | if ((hwerrs & ~(dd->ipath_lasthwerror | |
455 | ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | | ||
456 | INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) | ||
457 | << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) || | ||
455 | (ipath_debug & __IPATH_VERBDBG)) | 458 | (ipath_debug & __IPATH_VERBDBG)) |
456 | dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " | 459 | dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " |
457 | "(cleared)\n", (unsigned long long) hwerrs); | 460 | "(cleared)\n", (unsigned long long) hwerrs); |
@@ -464,6 +467,33 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg, | |||
464 | 467 | ||
465 | ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); | 468 | ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); |
466 | if (ctrl & INFINIPATH_C_FREEZEMODE) { | 469 | if (ctrl & INFINIPATH_C_FREEZEMODE) { |
470 | /* | ||
471 | * parity errors in send memory are recoverable, | ||
472 | * just cancel the send (if indicated in * sendbuffererror), | ||
473 | * count the occurrence, unfreeze (if no other handled | ||
474 | * hardware error bits are set), and continue. They can | ||
475 | * occur if a processor speculative read is done to the PIO | ||
476 | * buffer while we are sending a packet, for example. | ||
477 | */ | ||
478 | if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | | ||
479 | INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) | ||
480 | << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) { | ||
481 | ipath_stats.sps_txeparity++; | ||
482 | ipath_dbg("Recovering from TXE parity error (%llu), " | ||
483 | "hwerrstatus=%llx\n", | ||
484 | (unsigned long long) ipath_stats.sps_txeparity, | ||
485 | (unsigned long long) hwerrs); | ||
486 | ipath_disarm_senderrbufs(dd); | ||
487 | hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | | ||
488 | INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) | ||
489 | << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT); | ||
490 | if (!hwerrs) { /* else leave in freeze mode */ | ||
491 | ipath_write_kreg(dd, | ||
492 | dd->ipath_kregs->kr_control, | ||
493 | dd->ipath_control); | ||
494 | return; | ||
495 | } | ||
496 | } | ||
467 | if (hwerrs) { | 497 | if (hwerrs) { |
468 | /* | 498 | /* |
469 | * if any set that we aren't ignoring; only | 499 | * if any set that we aren't ignoring; only |
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c index 08a44dd9ed6f..024b6aa320f1 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba6120.c +++ b/drivers/infiniband/hw/ipath/ipath_iba6120.c | |||
@@ -370,7 +370,10 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg, | |||
370 | * make sure we get this much out, unless told to be quiet, | 370 | * make sure we get this much out, unless told to be quiet, |
371 | * or it's occurred within the last 5 seconds | 371 | * or it's occurred within the last 5 seconds |
372 | */ | 372 | */ |
373 | if ((hwerrs & ~dd->ipath_lasthwerror) || | 373 | if ((hwerrs & ~(dd->ipath_lasthwerror | |
374 | ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | | ||
375 | INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) | ||
376 | << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) || | ||
374 | (ipath_debug & __IPATH_VERBDBG)) | 377 | (ipath_debug & __IPATH_VERBDBG)) |
375 | dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " | 378 | dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx " |
376 | "(cleared)\n", (unsigned long long) hwerrs); | 379 | "(cleared)\n", (unsigned long long) hwerrs); |
@@ -383,6 +386,33 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg, | |||
383 | 386 | ||
384 | ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); | 387 | ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control); |
385 | if (ctrl & INFINIPATH_C_FREEZEMODE) { | 388 | if (ctrl & INFINIPATH_C_FREEZEMODE) { |
389 | /* | ||
390 | * parity errors in send memory are recoverable, | ||
391 | * just cancel the send (if indicated in * sendbuffererror), | ||
392 | * count the occurrence, unfreeze (if no other handled | ||
393 | * hardware error bits are set), and continue. They can | ||
394 | * occur if a processor speculative read is done to the PIO | ||
395 | * buffer while we are sending a packet, for example. | ||
396 | */ | ||
397 | if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | | ||
398 | INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) | ||
399 | << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) { | ||
400 | ipath_stats.sps_txeparity++; | ||
401 | ipath_dbg("Recovering from TXE parity error (%llu), " | ||
402 | "hwerrstatus=%llx\n", | ||
403 | (unsigned long long) ipath_stats.sps_txeparity, | ||
404 | (unsigned long long) hwerrs); | ||
405 | ipath_disarm_senderrbufs(dd); | ||
406 | hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | | ||
407 | INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) | ||
408 | << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT); | ||
409 | if (!hwerrs) { /* else leave in freeze mode */ | ||
410 | ipath_write_kreg(dd, | ||
411 | dd->ipath_kregs->kr_control, | ||
412 | dd->ipath_control); | ||
413 | return; | ||
414 | } | ||
415 | } | ||
386 | if (hwerrs) { | 416 | if (hwerrs) { |
387 | /* | 417 | /* |
388 | * if any set that we aren't ignoring only make the | 418 | * if any set that we aren't ignoring only make the |
@@ -406,9 +436,8 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg, | |||
406 | } else { | 436 | } else { |
407 | ipath_dbg("Clearing freezemode on ignored hardware " | 437 | ipath_dbg("Clearing freezemode on ignored hardware " |
408 | "error\n"); | 438 | "error\n"); |
409 | ctrl &= ~INFINIPATH_C_FREEZEMODE; | ||
410 | ipath_write_kreg(dd, dd->ipath_kregs->kr_control, | 439 | ipath_write_kreg(dd, dd->ipath_kregs->kr_control, |
411 | ctrl); | 440 | dd->ipath_control); |
412 | } | 441 | } |
413 | } | 442 | } |
414 | 443 | ||
@@ -880,6 +909,8 @@ static void ipath_init_pe_variables(struct ipath_devdata *dd) | |||
880 | dd->ipath_hwe_bitsextant = | 909 | dd->ipath_hwe_bitsextant = |
881 | (INFINIPATH_HWE_RXEMEMPARITYERR_MASK << | 910 | (INFINIPATH_HWE_RXEMEMPARITYERR_MASK << |
882 | INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) | | 911 | INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) | |
912 | (INFINIPATH_HWE_TXEMEMPARITYERR_MASK << | ||
913 | INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) | | ||
883 | (INFINIPATH_HWE_PCIEMEMPARITYERR_MASK << | 914 | (INFINIPATH_HWE_PCIEMEMPARITYERR_MASK << |
884 | INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) | | 915 | INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) | |
885 | INFINIPATH_HWE_PCIE1PLLFAILED | | 916 | INFINIPATH_HWE_PCIE1PLLFAILED | |
diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index f4d8aafc6306..6bee53ce5f33 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c | |||
@@ -37,6 +37,50 @@ | |||
37 | #include "ipath_verbs.h" | 37 | #include "ipath_verbs.h" |
38 | #include "ipath_common.h" | 38 | #include "ipath_common.h" |
39 | 39 | ||
40 | /* | ||
41 | * Called when we might have an error that is specific to a particular | ||
42 | * PIO buffer, and may need to cancel that buffer, so it can be re-used. | ||
43 | */ | ||
44 | void ipath_disarm_senderrbufs(struct ipath_devdata *dd) | ||
45 | { | ||
46 | u32 piobcnt; | ||
47 | unsigned long sbuf[4]; | ||
48 | /* | ||
49 | * it's possible that sendbuffererror could have bits set; might | ||
50 | * have already done this as a result of hardware error handling | ||
51 | */ | ||
52 | piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; | ||
53 | /* read these before writing errorclear */ | ||
54 | sbuf[0] = ipath_read_kreg64( | ||
55 | dd, dd->ipath_kregs->kr_sendbuffererror); | ||
56 | sbuf[1] = ipath_read_kreg64( | ||
57 | dd, dd->ipath_kregs->kr_sendbuffererror + 1); | ||
58 | if (piobcnt > 128) { | ||
59 | sbuf[2] = ipath_read_kreg64( | ||
60 | dd, dd->ipath_kregs->kr_sendbuffererror + 2); | ||
61 | sbuf[3] = ipath_read_kreg64( | ||
62 | dd, dd->ipath_kregs->kr_sendbuffererror + 3); | ||
63 | } | ||
64 | |||
65 | if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) { | ||
66 | int i; | ||
67 | if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG)) { | ||
68 | __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG, | ||
69 | "SendbufErrs %lx %lx", sbuf[0], | ||
70 | sbuf[1]); | ||
71 | if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128) | ||
72 | printk(" %lx %lx ", sbuf[2], sbuf[3]); | ||
73 | printk("\n"); | ||
74 | } | ||
75 | |||
76 | for (i = 0; i < piobcnt; i++) | ||
77 | if (test_bit(i, sbuf)) | ||
78 | ipath_disarm_piobufs(dd, i, 1); | ||
79 | dd->ipath_lastcancel = jiffies+3; /* no armlaunch for a bit */ | ||
80 | } | ||
81 | } | ||
82 | |||
83 | |||
40 | /* These are all rcv-related errors which we want to count for stats */ | 84 | /* These are all rcv-related errors which we want to count for stats */ |
41 | #define E_SUM_PKTERRS \ | 85 | #define E_SUM_PKTERRS \ |
42 | (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \ | 86 | (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \ |
@@ -68,53 +112,9 @@ | |||
68 | 112 | ||
69 | static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) | 113 | static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs) |
70 | { | 114 | { |
71 | unsigned long sbuf[4]; | ||
72 | u64 ignore_this_time = 0; | 115 | u64 ignore_this_time = 0; |
73 | u32 piobcnt; | ||
74 | |||
75 | /* if possible that sendbuffererror could be valid */ | ||
76 | piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k; | ||
77 | /* read these before writing errorclear */ | ||
78 | sbuf[0] = ipath_read_kreg64( | ||
79 | dd, dd->ipath_kregs->kr_sendbuffererror); | ||
80 | sbuf[1] = ipath_read_kreg64( | ||
81 | dd, dd->ipath_kregs->kr_sendbuffererror + 1); | ||
82 | if (piobcnt > 128) { | ||
83 | sbuf[2] = ipath_read_kreg64( | ||
84 | dd, dd->ipath_kregs->kr_sendbuffererror + 2); | ||
85 | sbuf[3] = ipath_read_kreg64( | ||
86 | dd, dd->ipath_kregs->kr_sendbuffererror + 3); | ||
87 | } | ||
88 | 116 | ||
89 | if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) { | 117 | ipath_disarm_senderrbufs(dd); |
90 | int i; | ||
91 | |||
92 | ipath_cdbg(PKT, "SendbufErrs %lx %lx ", sbuf[0], sbuf[1]); | ||
93 | if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128) | ||
94 | printk("%lx %lx ", sbuf[2], sbuf[3]); | ||
95 | for (i = 0; i < piobcnt; i++) { | ||
96 | if (test_bit(i, sbuf)) { | ||
97 | u32 __iomem *piobuf; | ||
98 | if (i < dd->ipath_piobcnt2k) | ||
99 | piobuf = (u32 __iomem *) | ||
100 | (dd->ipath_pio2kbase + | ||
101 | i * dd->ipath_palign); | ||
102 | else | ||
103 | piobuf = (u32 __iomem *) | ||
104 | (dd->ipath_pio4kbase + | ||
105 | (i - dd->ipath_piobcnt2k) * | ||
106 | dd->ipath_4kalign); | ||
107 | |||
108 | ipath_cdbg(PKT, | ||
109 | "PIObuf[%u] @%p pbc is %x; ", | ||
110 | i, piobuf, readl(piobuf)); | ||
111 | |||
112 | ipath_disarm_piobufs(dd, i, 1); | ||
113 | } | ||
114 | } | ||
115 | if (ipath_debug & __IPATH_PKTDBG) | ||
116 | printk("\n"); | ||
117 | } | ||
118 | if ((errs & E_SUM_LINK_PKTERRS) && | 118 | if ((errs & E_SUM_LINK_PKTERRS) && |
119 | !(dd->ipath_flags & IPATH_LINKACTIVE)) { | 119 | !(dd->ipath_flags & IPATH_LINKACTIVE)) { |
120 | /* | 120 | /* |
@@ -554,6 +554,14 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs) | |||
554 | ~(INFINIPATH_E_HARDWARE | | 554 | ~(INFINIPATH_E_HARDWARE | |
555 | INFINIPATH_E_IBSTATUSCHANGED); | 555 | INFINIPATH_E_IBSTATUSCHANGED); |
556 | } | 556 | } |
557 | |||
558 | /* likely due to cancel, so suppress */ | ||
559 | if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) && | ||
560 | dd->ipath_lastcancel > jiffies) { | ||
561 | ipath_dbg("Suppressed armlaunch/spktlen after error send cancel\n"); | ||
562 | errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN); | ||
563 | } | ||
564 | |||
557 | if (!errs) | 565 | if (!errs) |
558 | return 0; | 566 | return 0; |
559 | 567 | ||
diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h index 02134cb3e944..d7540b71b451 100644 --- a/drivers/infiniband/hw/ipath/ipath_kernel.h +++ b/drivers/infiniband/hw/ipath/ipath_kernel.h | |||
@@ -427,6 +427,9 @@ struct ipath_devdata { | |||
427 | unsigned long ipath_rcvctrl; | 427 | unsigned long ipath_rcvctrl; |
428 | /* shadow kr_sendctrl */ | 428 | /* shadow kr_sendctrl */ |
429 | unsigned long ipath_sendctrl; | 429 | unsigned long ipath_sendctrl; |
430 | /* ports waiting for PIOavail intr */ | ||
431 | unsigned long ipath_portpiowait; | ||
432 | unsigned long ipath_lastcancel; /* to not count armlaunch after cancel */ | ||
430 | 433 | ||
431 | /* value we put in kr_rcvhdrcnt */ | 434 | /* value we put in kr_rcvhdrcnt */ |
432 | u32 ipath_rcvhdrcnt; | 435 | u32 ipath_rcvhdrcnt; |
@@ -490,8 +493,6 @@ struct ipath_devdata { | |||
490 | u32 ipath_htwidth; | 493 | u32 ipath_htwidth; |
491 | /* HT speed (200,400,800,1000) from HT config */ | 494 | /* HT speed (200,400,800,1000) from HT config */ |
492 | u32 ipath_htspeed; | 495 | u32 ipath_htspeed; |
493 | /* ports waiting for PIOavail intr */ | ||
494 | unsigned long ipath_portpiowait; | ||
495 | /* | 496 | /* |
496 | * number of sequential ibcstatus change for polling active/quiet | 497 | * number of sequential ibcstatus change for polling active/quiet |
497 | * (i.e., link not coming up). | 498 | * (i.e., link not coming up). |
@@ -585,6 +586,7 @@ int ipath_enable_wc(struct ipath_devdata *dd); | |||
585 | void ipath_disable_wc(struct ipath_devdata *dd); | 586 | void ipath_disable_wc(struct ipath_devdata *dd); |
586 | int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp); | 587 | int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp); |
587 | void ipath_shutdown_device(struct ipath_devdata *); | 588 | void ipath_shutdown_device(struct ipath_devdata *); |
589 | void ipath_disarm_senderrbufs(struct ipath_devdata *); | ||
588 | 590 | ||
589 | struct file_operations; | 591 | struct file_operations; |
590 | int ipath_cdev_init(int minor, char *name, struct file_operations *fops, | 592 | int ipath_cdev_init(int minor, char *name, struct file_operations *fops, |