summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIan Munsie <imunsie@au1.ibm.com>2016-06-30 12:50:40 -0400
committerMichael Ellerman <mpe@ellerman.id.au>2016-07-08 08:11:18 -0400
commit5e7823c9bc44965c2e7d1d755b382109830c4916 (patch)
tree3ba0f08a052e088deca0e138fce3b1cc53ce6d80
parent2224b6719b09052a9fbf29422a0e9b4f42407c35 (diff)
cxl: Fix bug where AFU disable operation had no effect
The AFU disable operation has a bug where it will not clear the enable bit and therefore will have no effect. To date this has likely been masked by fact that we perform an AFU reset before the disable, which also has the effect of clearing the enable bit, making the following disable operation effectively a noop on most hardware. This patch modifies the afu_control function to take a parameter to clear from the AFU control register so that the disable operation can clear the appropriate bit. This bug was uncovered on the Mellanox CX4, which uses an XSL rather than a PSL. On the XSL the reset operation will not complete while the AFU is enabled, meaning the enable bit was still set at the start of the disable and as a result this bug was hit and the disable also timed out. Because of this difference in behaviour between the PSL and XSL, this patch now makes the reset dependent on the card using a PSL to avoid waiting for a timeout on the XSL. It is entirely possible that we may be able to drop the reset altogether if it turns out we only ever needed it due to this bug - however I am not willing to drop it without further regression testing and have added comments to the code explaining the background. This also fixes a small issue where the AFU_Cntl register was read outside of the lock that protects it. Signed-off-by: Ian Munsie <imunsie@au1.ibm.com> Reviewed-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
-rw-r--r--drivers/misc/cxl/cxl.h1
-rw-r--r--drivers/misc/cxl/native.c58
-rw-r--r--drivers/misc/cxl/pci.c1
3 files changed, 53 insertions, 7 deletions
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 27578fceda8a..aafffa8554e2 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -552,6 +552,7 @@ struct cxl_service_layer_ops {
552 void (*write_timebase_ctrl)(struct cxl *adapter); 552 void (*write_timebase_ctrl)(struct cxl *adapter);
553 u64 (*timebase_read)(struct cxl *adapter); 553 u64 (*timebase_read)(struct cxl *adapter);
554 int capi_mode; 554 int capi_mode;
555 bool needs_reset_before_disable;
555}; 556};
556 557
557struct cxl_native { 558struct cxl_native {
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 120c468ee9e0..e77450567f69 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -21,10 +21,10 @@
21#include "cxl.h" 21#include "cxl.h"
22#include "trace.h" 22#include "trace.h"
23 23
24static int afu_control(struct cxl_afu *afu, u64 command, 24static int afu_control(struct cxl_afu *afu, u64 command, u64 clear,
25 u64 result, u64 mask, bool enabled) 25 u64 result, u64 mask, bool enabled)
26{ 26{
27 u64 AFU_Cntl = cxl_p2n_read(afu, CXL_AFU_Cntl_An); 27 u64 AFU_Cntl;
28 unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT); 28 unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
29 int rc = 0; 29 int rc = 0;
30 30
@@ -33,7 +33,8 @@ static int afu_control(struct cxl_afu *afu, u64 command,
33 33
34 trace_cxl_afu_ctrl(afu, command); 34 trace_cxl_afu_ctrl(afu, command);
35 35
36 cxl_p2n_write(afu, CXL_AFU_Cntl_An, AFU_Cntl | command); 36 AFU_Cntl = cxl_p2n_read(afu, CXL_AFU_Cntl_An);
37 cxl_p2n_write(afu, CXL_AFU_Cntl_An, (AFU_Cntl & ~clear) | command);
37 38
38 AFU_Cntl = cxl_p2n_read(afu, CXL_AFU_Cntl_An); 39 AFU_Cntl = cxl_p2n_read(afu, CXL_AFU_Cntl_An);
39 while ((AFU_Cntl & mask) != result) { 40 while ((AFU_Cntl & mask) != result) {
@@ -67,7 +68,7 @@ static int afu_enable(struct cxl_afu *afu)
67{ 68{
68 pr_devel("AFU enable request\n"); 69 pr_devel("AFU enable request\n");
69 70
70 return afu_control(afu, CXL_AFU_Cntl_An_E, 71 return afu_control(afu, CXL_AFU_Cntl_An_E, 0,
71 CXL_AFU_Cntl_An_ES_Enabled, 72 CXL_AFU_Cntl_An_ES_Enabled,
72 CXL_AFU_Cntl_An_ES_MASK, true); 73 CXL_AFU_Cntl_An_ES_MASK, true);
73} 74}
@@ -76,7 +77,8 @@ int cxl_afu_disable(struct cxl_afu *afu)
76{ 77{
77 pr_devel("AFU disable request\n"); 78 pr_devel("AFU disable request\n");
78 79
79 return afu_control(afu, 0, CXL_AFU_Cntl_An_ES_Disabled, 80 return afu_control(afu, 0, CXL_AFU_Cntl_An_E,
81 CXL_AFU_Cntl_An_ES_Disabled,
80 CXL_AFU_Cntl_An_ES_MASK, false); 82 CXL_AFU_Cntl_An_ES_MASK, false);
81} 83}
82 84
@@ -85,7 +87,7 @@ static int native_afu_reset(struct cxl_afu *afu)
85{ 87{
86 pr_devel("AFU reset request\n"); 88 pr_devel("AFU reset request\n");
87 89
88 return afu_control(afu, CXL_AFU_Cntl_An_RA, 90 return afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
89 CXL_AFU_Cntl_An_RS_Complete | CXL_AFU_Cntl_An_ES_Disabled, 91 CXL_AFU_Cntl_An_RS_Complete | CXL_AFU_Cntl_An_ES_Disabled,
90 CXL_AFU_Cntl_An_RS_MASK | CXL_AFU_Cntl_An_ES_MASK, 92 CXL_AFU_Cntl_An_RS_MASK | CXL_AFU_Cntl_An_ES_MASK,
91 false); 93 false);
@@ -595,7 +597,33 @@ static int deactivate_afu_directed(struct cxl_afu *afu)
595 cxl_sysfs_afu_m_remove(afu); 597 cxl_sysfs_afu_m_remove(afu);
596 cxl_chardev_afu_remove(afu); 598 cxl_chardev_afu_remove(afu);
597 599
598 cxl_ops->afu_reset(afu); 600 /*
601 * The CAIA section 2.2.1 indicates that the procedure for starting and
602 * stopping an AFU in AFU directed mode is AFU specific, which is not
603 * ideal since this code is generic and with one exception has no
604 * knowledge of the AFU. This is in contrast to the procedure for
605 * disabling a dedicated process AFU, which is documented to just
606 * require a reset. The architecture does indicate that both an AFU
607 * reset and an AFU disable should result in the AFU being disabled and
608 * we do both followed by a PSL purge for safety.
609 *
610 * Notably we used to have some issues with the disable sequence on PSL
611 * cards, which is why we ended up using this heavy weight procedure in
612 * the first place, however a bug was discovered that had rendered the
613 * disable operation ineffective, so it is conceivable that was the
614 * sole explanation for those difficulties. Careful regression testing
615 * is recommended if anyone attempts to remove or reorder these
616 * operations.
617 *
618 * The XSL on the Mellanox CX4 behaves a little differently from the
619 * PSL based cards and will time out an AFU reset if the AFU is still
620 * enabled. That card is special in that we do have a means to identify
621 * it from this code, so in that case we skip the reset and just use a
622 * disable/purge to avoid the timeout and corresponding noise in the
623 * kernel log.
624 */
625 if (afu->adapter->native->sl_ops->needs_reset_before_disable)
626 cxl_ops->afu_reset(afu);
599 cxl_afu_disable(afu); 627 cxl_afu_disable(afu);
600 cxl_psl_purge(afu); 628 cxl_psl_purge(afu);
601 629
@@ -735,6 +763,22 @@ static int native_attach_process(struct cxl_context *ctx, bool kernel,
735 763
736static inline int detach_process_native_dedicated(struct cxl_context *ctx) 764static inline int detach_process_native_dedicated(struct cxl_context *ctx)
737{ 765{
766 /*
767 * The CAIA section 2.1.1 indicates that we need to do an AFU reset to
768 * stop the AFU in dedicated mode (we therefore do not make that
769 * optional like we do in the afu directed path). It does not indicate
770 * that we need to do an explicit disable (which should occur
771 * implicitly as part of the reset) or purge, but we do these as well
772 * to be on the safe side.
773 *
774 * Notably we used to have some issues with the disable sequence
775 * (before the sequence was spelled out in the architecture) which is
776 * why we were so heavy weight in the first place, however a bug was
777 * discovered that had rendered the disable operation ineffective, so
778 * it is conceivable that was the sole explanation for those
779 * difficulties. Point is, we should be careful and do some regression
780 * testing if we ever attempt to remove any part of this procedure.
781 */
738 cxl_ops->afu_reset(ctx->afu); 782 cxl_ops->afu_reset(ctx->afu);
739 cxl_afu_disable(ctx->afu); 783 cxl_afu_disable(ctx->afu);
740 cxl_psl_purge(ctx->afu); 784 cxl_psl_purge(ctx->afu);
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 58d7d8215964..b7f2e96c1528 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1309,6 +1309,7 @@ static const struct cxl_service_layer_ops psl_ops = {
1309 .write_timebase_ctrl = write_timebase_ctrl_psl, 1309 .write_timebase_ctrl = write_timebase_ctrl_psl,
1310 .timebase_read = timebase_read_psl, 1310 .timebase_read = timebase_read_psl,
1311 .capi_mode = OPAL_PHB_CAPI_MODE_CAPI, 1311 .capi_mode = OPAL_PHB_CAPI_MODE_CAPI,
1312 .needs_reset_before_disable = true,
1312}; 1313};
1313 1314
1314static const struct cxl_service_layer_ops xsl_ops = { 1315static const struct cxl_service_layer_ops xsl_ops = {