diff options
author | Chuck Lever <chuck.lever@oracle.com> | 2014-05-28 10:34:07 -0400 |
---|---|---|
committer | Anna Schumaker <Anna.Schumaker@Netapp.com> | 2014-06-04 08:56:47 -0400 |
commit | ec62f40d3505a643497d105c297093bb90afd44e (patch) | |
tree | 32cb9abf5da170e58d6855d78cd4d6b5016c3652 /net/sunrpc/xprtrdma | |
parent | 65866f8259851cea5e356d2fd46fc37a4e26330e (diff) |
xprtrdma: Ensure ia->ri_id->qp is not NULL when reconnecting
Devesh Sharma <Devesh.Sharma@Emulex.Com> reports that after a
disconnect, his HCA is failing to create a fresh QP, leaving
ia->ri_id->qp set to NULL. But xprtrdma still allows RPCs to
wake up and post LOCAL_INV as they exit, causing an oops.
rpcrdma_ep_connect() is allowing the wake-up by leaking the QP
creation error code (-EPERM in this case) to the RPC client's
generic layer. xprt_connect_status() does not recognize -EPERM, so
it kills pending RPC tasks immediately rather than retrying the
connect.
Re-arrange the QP creation logic so that when it fails on reconnect,
it leaves ->qp with the old QP rather than NULL. If pending RPC
tasks wake and exit, LOCAL_INV work requests will flush rather than
oops.
On initial connect, leaving ->qp == NULL is OK, since there are no
pending RPCs that might use ->qp. But be sure not to try to destroy
a NULL QP when rpcrdma_ep_connect() is retried.
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Diffstat (limited to 'net/sunrpc/xprtrdma')
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c | 29 |
1 files changed, 20 insertions, 9 deletions
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c80995af82de..54edf2ac48a1 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c | |||
@@ -867,6 +867,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) | |||
867 | if (ep->rep_connected != 0) { | 867 | if (ep->rep_connected != 0) { |
868 | struct rpcrdma_xprt *xprt; | 868 | struct rpcrdma_xprt *xprt; |
869 | retry: | 869 | retry: |
870 | dprintk("RPC: %s: reconnecting...\n", __func__); | ||
870 | rc = rpcrdma_ep_disconnect(ep, ia); | 871 | rc = rpcrdma_ep_disconnect(ep, ia); |
871 | if (rc && rc != -ENOTCONN) | 872 | if (rc && rc != -ENOTCONN) |
872 | dprintk("RPC: %s: rpcrdma_ep_disconnect" | 873 | dprintk("RPC: %s: rpcrdma_ep_disconnect" |
@@ -879,7 +880,7 @@ retry: | |||
879 | id = rpcrdma_create_id(xprt, ia, | 880 | id = rpcrdma_create_id(xprt, ia, |
880 | (struct sockaddr *)&xprt->rx_data.addr); | 881 | (struct sockaddr *)&xprt->rx_data.addr); |
881 | if (IS_ERR(id)) { | 882 | if (IS_ERR(id)) { |
882 | rc = PTR_ERR(id); | 883 | rc = -EHOSTUNREACH; |
883 | goto out; | 884 | goto out; |
884 | } | 885 | } |
885 | /* TEMP TEMP TEMP - fail if new device: | 886 | /* TEMP TEMP TEMP - fail if new device: |
@@ -893,20 +894,30 @@ retry: | |||
893 | printk("RPC: %s: can't reconnect on " | 894 | printk("RPC: %s: can't reconnect on " |
894 | "different device!\n", __func__); | 895 | "different device!\n", __func__); |
895 | rdma_destroy_id(id); | 896 | rdma_destroy_id(id); |
896 | rc = -ENETDOWN; | 897 | rc = -ENETUNREACH; |
897 | goto out; | 898 | goto out; |
898 | } | 899 | } |
899 | /* END TEMP */ | 900 | /* END TEMP */ |
901 | rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); | ||
902 | if (rc) { | ||
903 | dprintk("RPC: %s: rdma_create_qp failed %i\n", | ||
904 | __func__, rc); | ||
905 | rdma_destroy_id(id); | ||
906 | rc = -ENETUNREACH; | ||
907 | goto out; | ||
908 | } | ||
900 | rdma_destroy_qp(ia->ri_id); | 909 | rdma_destroy_qp(ia->ri_id); |
901 | rdma_destroy_id(ia->ri_id); | 910 | rdma_destroy_id(ia->ri_id); |
902 | ia->ri_id = id; | 911 | ia->ri_id = id; |
903 | } | 912 | } else { |
904 | 913 | dprintk("RPC: %s: connecting...\n", __func__); | |
905 | rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); | 914 | rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); |
906 | if (rc) { | 915 | if (rc) { |
907 | dprintk("RPC: %s: rdma_create_qp failed %i\n", | 916 | dprintk("RPC: %s: rdma_create_qp failed %i\n", |
908 | __func__, rc); | 917 | __func__, rc); |
909 | goto out; | 918 | /* do not update ep->rep_connected */ |
919 | return -ENETUNREACH; | ||
920 | } | ||
910 | } | 921 | } |
911 | 922 | ||
912 | /* XXX Tavor device performs badly with 2K MTU! */ | 923 | /* XXX Tavor device performs badly with 2K MTU! */ |