From c48cbb405c4f338ce3263c44d621eff41d9a95fc Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 11 Mar 2008 14:31:39 -0400 Subject: SVCRDMA: Add xprt refs to fix close/unmount crash RDMA connection shutdown on an SMP machine can cause a kernel crash due to the transport close path racing with the I/O tasklet. Additional transport references were added as follows: - A reference when on the DTO Q to avoid having the transport deleted while queued for I/O. - A reference while there is a QP able to generate events. - A reference until the DISCONNECTED event is received on the CM ID Signed-off-by: Tom Tucker Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 96 +++++++++++++++++++------------- 1 file changed, 58 insertions(+), 38 deletions(-) (limited to 'net/sunrpc/xprtrdma') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index f09444c451bc..16fd3f6718ff 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -54,7 +54,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, int flags); static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); static void svc_rdma_release_rqst(struct svc_rqst *); -static void rdma_destroy_xprt(struct svcxprt_rdma *xprt); static void dto_tasklet_func(unsigned long data); static void svc_rdma_detach(struct svc_xprt *xprt); static void svc_rdma_free(struct svc_xprt *xprt); @@ -247,6 +246,7 @@ static void dto_tasklet_func(unsigned long data) sq_cq_reap(xprt); } + svc_xprt_put(&xprt->sc_xprt); spin_lock_irqsave(&dto_lock, flags); } spin_unlock_irqrestore(&dto_lock, flags); @@ -275,8 +275,10 @@ static void rq_comp_handler(struct ib_cq *cq, void *cq_context) * add it */ spin_lock_irqsave(&dto_lock, flags); - if (list_empty(&xprt->sc_dto_q)) + if (list_empty(&xprt->sc_dto_q)) { + svc_xprt_get(&xprt->sc_xprt); list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); + } spin_unlock_irqrestore(&dto_lock, flags); /* Tasklet does all the work to avoid irqsave locks. */ @@ -386,8 +388,10 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context) * add it */ spin_lock_irqsave(&dto_lock, flags); - if (list_empty(&xprt->sc_dto_q)) + if (list_empty(&xprt->sc_dto_q)) { + svc_xprt_get(&xprt->sc_xprt); list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); + } spin_unlock_irqrestore(&dto_lock, flags); /* Tasklet does all the work to avoid irqsave locks. */ @@ -611,6 +615,7 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id, switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: /* Accept complete */ + svc_xprt_get(xprt); dprintk("svcrdma: Connection completed on DTO xprt=%p, " "cm_id=%p\n", xprt, cma_id); clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); @@ -661,15 +666,15 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP); if (IS_ERR(listen_id)) { - rdma_destroy_xprt(cma_xprt); + svc_xprt_put(&cma_xprt->sc_xprt); dprintk("svcrdma: rdma_create_id failed = %ld\n", PTR_ERR(listen_id)); return (void *)listen_id; } ret = rdma_bind_addr(listen_id, sa); if (ret) { - rdma_destroy_xprt(cma_xprt); rdma_destroy_id(listen_id); + svc_xprt_put(&cma_xprt->sc_xprt); dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); return ERR_PTR(ret); } @@ -678,8 +683,9 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); if (ret) { rdma_destroy_id(listen_id); - rdma_destroy_xprt(cma_xprt); + svc_xprt_put(&cma_xprt->sc_xprt); dprintk("svcrdma: rdma_listen failed = %d\n", ret); + return ERR_PTR(ret); } /* @@ -820,6 +826,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_sq_depth = qp_attr.cap.max_send_wr; newxprt->sc_max_requests = qp_attr.cap.max_recv_wr; } + svc_xprt_get(&newxprt->sc_xprt); newxprt->sc_qp = newxprt->sc_cm_id->qp; /* Register all of physical memory */ @@ -891,8 +898,15 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) errout: dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); + /* Take a reference in case the DTO handler runs */ + svc_xprt_get(&newxprt->sc_xprt); + if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) { + ib_destroy_qp(newxprt->sc_qp); + svc_xprt_put(&newxprt->sc_xprt); + } rdma_destroy_id(newxprt->sc_cm_id); - rdma_destroy_xprt(newxprt); + /* This call to put will destroy the transport */ + svc_xprt_put(&newxprt->sc_xprt); return NULL; } @@ -919,54 +933,60 @@ static void svc_rdma_release_rqst(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; } -/* Disable data ready events for this connection */ +/* + * When connected, an svc_xprt has at least three references: + * + * - A reference held by the QP. We still hold that here because this + * code deletes the QP and puts the reference. + * + * - A reference held by the cm_id between the ESTABLISHED and + * DISCONNECTED events. If the remote peer disconnected first, this + * reference could be gone. + * + * - A reference held by the svc_recv code that called this function + * as part of close processing. + * + * At a minimum two references should still be held. + */ static void svc_rdma_detach(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); - unsigned long flags; - dprintk("svc: svc_rdma_detach(%p)\n", xprt); - /* - * Shutdown the connection. This will ensure we don't get any - * more events from the provider. - */ + + /* Disconnect and flush posted WQE */ rdma_disconnect(rdma->sc_cm_id); - rdma_destroy_id(rdma->sc_cm_id); - /* We may already be on the DTO list */ - spin_lock_irqsave(&dto_lock, flags); - if (!list_empty(&rdma->sc_dto_q)) - list_del_init(&rdma->sc_dto_q); - spin_unlock_irqrestore(&dto_lock, flags); + /* Destroy the QP if present (not a listener) */ + if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) { + ib_destroy_qp(rdma->sc_qp); + svc_xprt_put(xprt); + } + + /* Destroy the CM ID */ + rdma_destroy_id(rdma->sc_cm_id); } static void svc_rdma_free(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt; dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); - rdma_destroy_xprt(rdma); - kfree(rdma); -} - -static void rdma_destroy_xprt(struct svcxprt_rdma *xprt) -{ - if (xprt->sc_qp && !IS_ERR(xprt->sc_qp)) - ib_destroy_qp(xprt->sc_qp); - - if (xprt->sc_sq_cq && !IS_ERR(xprt->sc_sq_cq)) - ib_destroy_cq(xprt->sc_sq_cq); + /* We should only be called from kref_put */ + BUG_ON(atomic_read(&xprt->xpt_ref.refcount) != 0); + if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) + ib_destroy_cq(rdma->sc_sq_cq); - if (xprt->sc_rq_cq && !IS_ERR(xprt->sc_rq_cq)) - ib_destroy_cq(xprt->sc_rq_cq); + if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq)) + ib_destroy_cq(rdma->sc_rq_cq); - if (xprt->sc_phys_mr && !IS_ERR(xprt->sc_phys_mr)) - ib_dereg_mr(xprt->sc_phys_mr); + if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr)) + ib_dereg_mr(rdma->sc_phys_mr); - if (xprt->sc_pd && !IS_ERR(xprt->sc_pd)) - ib_dealloc_pd(xprt->sc_pd); + if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) + ib_dealloc_pd(rdma->sc_pd); - destroy_context_cache(xprt->sc_ctxt_head); + destroy_context_cache(rdma->sc_ctxt_head); + kfree(rdma); } static int svc_rdma_has_wspace(struct svc_xprt *xprt) -- cgit v1.2.2 From 3fedb3c5a80595d94f7cbe47a6dba9184d869eb8 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 11 Mar 2008 14:31:40 -0400 Subject: SVCRDMA: Fix erroneous BUG_ON in send_write The assertion that checks for sge context overflow is incorrectly hard-coded to 32. This causes a kernel bug check when using big-data mounts. Changed the BUG_ON to use the computed value RPCSVC_MAXPAGES. Signed-off-by: Tom Tucker Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sunrpc/xprtrdma') diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 0598b229c11d..981f190c1b39 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -156,7 +156,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *ctxt; int ret = 0; - BUG_ON(sge_count >= 32); + BUG_ON(sge_count > RPCSVC_MAXPAGES); dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " "write_len=%d, xdr_sge=%p, sge_count=%d\n", rmr, (unsigned long long)to, xdr_off, -- cgit v1.2.2 From d3073779f8362d64b804882f5f41c208c4a5e11e Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 24 Mar 2008 12:03:03 -0400 Subject: SVCRDMA: Use only 1 RDMA read scatter entry for iWARP adapters The iWARP protocol limits RDMA read requests to a single scatter entry. NFS/RDMA has code in rdma_read_max_sge() that is supposed to limit the sge_count for RDMA read requests to 1, but the code to do that is inside an #ifdef RDMA_TRANSPORT_IWARP block. In the mainline kernel at least, RDMA_TRANSPORT_IWARP is an enum and not a preprocessor #define, so the #ifdef'ed code is never compiled. In my test of a kernel build with -j8 on an NFS/RDMA mount, this problem eventually leads to trouble starting with: svcrdma: Error posting send = -22 svcrdma : RDMA_READ error = -22 and things go downhill from there. The trivial fix is to delete the #ifdef guard. The check seems to be a remnant of when the NFS/RDMA code was not merged and needed to compile against multiple kernel versions, although I don't think it ever worked as intended. In any case now that the code is upstream there's no need to test whether the RDMA_TRANSPORT_IWARP constant is defined or not. Without this patch, my kernel build on an NFS/RDMA mount using NetEffect adapters quickly and 100% reproducibly failed with an error like: ld: final link failed: Software caused connection abort With the patch applied I was able to complete a kernel build on the same setup. (Tom Tucker says this is "actually an _ancient_ remnant when it had to compile against iWARP vs. non-iWARP enabled OFA trees.") Signed-off-by: Roland Dreier Acked-by: Tom Tucker Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/sunrpc/xprtrdma') diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index ab54a736486e..971271602dd0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -237,14 +237,12 @@ static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt, static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) { -#ifdef RDMA_TRANSPORT_IWARP if ((RDMA_TRANSPORT_IWARP == rdma_node_get_transport(xprt->sc_cm_id-> device->node_type)) && sge_count > 1) return 1; else -#endif return min_t(int, sge_count, xprt->sc_max_sge); } -- cgit v1.2.2 From c8237a5fcea9d49a73275b4c8f541dd42f8da1a4 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 25 Mar 2008 22:27:19 -0400 Subject: SVCRDMA: Check num_sge when setting LAST_CTXT bit The RDMACTXT_F_LAST_CTXT bit was getting set incorrectly when the last chunk in the read-list spanned multiple pages. This resulted in a kernel panic when the wrong context was used to build the RPC iovec page list. RDMA_READ is used to fetch RPC data from the client for NFS_WRITE requests. A scatter-gather is used to map the advertised client side buffer to the server-side iovec and associated page list. WR contexts are used to convey which scatter-gather entries are handled by each WR. When the write data is large, a single RPC may require multiple RDMA_READ requests so the contexts for a single RPC are chained together in a linked list. The last context in this list is marked with a bit RDMACTXT_F_LAST_CTXT so that when this WR completes, the CQ handler code can enqueue the RPC for processing. The code in rdma_read_xdr was setting this bit on the last two contexts on this list when the last read-list chunk spanned multiple pages. This caused the svc_rdma_recvfrom logic to incorrectly build the RPC and caused the kernel to crash because the second-to-last context doesn't contain the iovec page list. Modified the condition that sets this bit so that it correctly detects the last context for the RPC. Signed-off-by: Tom Tucker Tested-by: Roland Dreier Signed-off-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'net/sunrpc/xprtrdma') diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 971271602dd0..c22d6b6f2db4 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -322,15 +322,6 @@ next_sge: ctxt->direction = DMA_FROM_DEVICE; clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - if ((ch+1)->rc_discrim == 0) { - /* - * Checked in sq_cq_reap to see if we need to - * be enqueued - */ - set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - ctxt->next = hdr_ctxt; - hdr_ctxt->next = head; - } /* Prepare READ WR */ memset(&read_wr, 0, sizeof read_wr); @@ -348,7 +339,17 @@ next_sge: rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start], &sgl_offset, read_wr.num_sge); - + if (((ch+1)->rc_discrim == 0) && + (read_wr.num_sge == ch_sge_ary[ch_no].count)) { + /* + * Mark the last RDMA_READ with a bit to + * indicate all RPC data has been fetched from + * the client and the RPC needs to be enqueued. + */ + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + ctxt->next = hdr_ctxt; + hdr_ctxt->next = head; + } /* Post the read */ err = svc_rdma_send(xprt, &read_wr); if (err) { -- cgit v1.2.2