diff options
author | Tom Tucker <tom@ogc.us> | 2011-02-09 14:45:34 -0500 |
---|---|---|
committer | Trond Myklebust <Trond.Myklebust@netapp.com> | 2011-03-11 15:39:27 -0500 |
commit | 5c635e09cec0feeeb310968e51dad01040244851 (patch) | |
tree | 6f776276df4d20b221c02f776a677f8719f9a0aa /net/sunrpc/xprtrdma/verbs.c | |
parent | bd7ea31b9e8a342be76e0fe8d638343886c2d8c5 (diff) |
RPCRDMA: Fix FRMR registration/invalidate handling.
When the rpc_memreg_strategy is 5, FRMR are used to map RPC data.
This mode uses an FRMR to map the RPC data, then invalidates
(i.e. unregisers) the data in xprt_rdma_free. These FRMR are used
across connections on the same mount, i.e. if the connection goes
away on an idle timeout and reconnects later, the FRMR are not
destroyed and recreated.
This creates a problem for transport errors because the WR that
invalidate an FRMR may be flushed (i.e. fail) leaving the
FRMR valid. When the FRMR is later used to map an RPC it will fail,
tearing down the transport and starting over. Over time, more and
more of the FRMR pool end up in the wrong state resulting in
seemingly random disconnects.
This fix keeps track of the FRMR state explicitly by setting it's
state based on the successful completion of a reg/inv WR. If the FRMR
is ever used and found to be in the wrong state, an invalidate WR
is prepended, re-syncing the FRMR state and avoiding the connection loss.
Signed-off-by: Tom Tucker <tom@ogc.us>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Diffstat (limited to 'net/sunrpc/xprtrdma/verbs.c')
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c | 52 |
1 files changed, 44 insertions, 8 deletions
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 5f4c7b3bc711..570f08dc0b03 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c | |||
@@ -144,6 +144,7 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) | |||
144 | static inline | 144 | static inline |
145 | void rpcrdma_event_process(struct ib_wc *wc) | 145 | void rpcrdma_event_process(struct ib_wc *wc) |
146 | { | 146 | { |
147 | struct rpcrdma_mw *frmr; | ||
147 | struct rpcrdma_rep *rep = | 148 | struct rpcrdma_rep *rep = |
148 | (struct rpcrdma_rep *)(unsigned long) wc->wr_id; | 149 | (struct rpcrdma_rep *)(unsigned long) wc->wr_id; |
149 | 150 | ||
@@ -154,15 +155,23 @@ void rpcrdma_event_process(struct ib_wc *wc) | |||
154 | return; | 155 | return; |
155 | 156 | ||
156 | if (IB_WC_SUCCESS != wc->status) { | 157 | if (IB_WC_SUCCESS != wc->status) { |
157 | dprintk("RPC: %s: %s WC status %X, connection lost\n", | 158 | dprintk("RPC: %s: WC opcode %d status %X, connection lost\n", |
158 | __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send", | 159 | __func__, wc->opcode, wc->status); |
159 | wc->status); | ||
160 | rep->rr_len = ~0U; | 160 | rep->rr_len = ~0U; |
161 | rpcrdma_schedule_tasklet(rep); | 161 | if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV) |
162 | rpcrdma_schedule_tasklet(rep); | ||
162 | return; | 163 | return; |
163 | } | 164 | } |
164 | 165 | ||
165 | switch (wc->opcode) { | 166 | switch (wc->opcode) { |
167 | case IB_WC_FAST_REG_MR: | ||
168 | frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; | ||
169 | frmr->r.frmr.state = FRMR_IS_VALID; | ||
170 | break; | ||
171 | case IB_WC_LOCAL_INV: | ||
172 | frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; | ||
173 | frmr->r.frmr.state = FRMR_IS_INVALID; | ||
174 | break; | ||
166 | case IB_WC_RECV: | 175 | case IB_WC_RECV: |
167 | rep->rr_len = wc->byte_len; | 176 | rep->rr_len = wc->byte_len; |
168 | ib_dma_sync_single_for_cpu( | 177 | ib_dma_sync_single_for_cpu( |
@@ -1450,6 +1459,11 @@ rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) | |||
1450 | seg->mr_dma = ib_dma_map_single(ia->ri_id->device, | 1459 | seg->mr_dma = ib_dma_map_single(ia->ri_id->device, |
1451 | seg->mr_offset, | 1460 | seg->mr_offset, |
1452 | seg->mr_dmalen, seg->mr_dir); | 1461 | seg->mr_dmalen, seg->mr_dir); |
1462 | if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) { | ||
1463 | dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n", | ||
1464 | __func__, | ||
1465 | seg->mr_dma, seg->mr_offset, seg->mr_dmalen); | ||
1466 | } | ||
1453 | } | 1467 | } |
1454 | 1468 | ||
1455 | static void | 1469 | static void |
@@ -1469,7 +1483,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, | |||
1469 | struct rpcrdma_xprt *r_xprt) | 1483 | struct rpcrdma_xprt *r_xprt) |
1470 | { | 1484 | { |
1471 | struct rpcrdma_mr_seg *seg1 = seg; | 1485 | struct rpcrdma_mr_seg *seg1 = seg; |
1472 | struct ib_send_wr frmr_wr, *bad_wr; | 1486 | struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr; |
1487 | |||
1473 | u8 key; | 1488 | u8 key; |
1474 | int len, pageoff; | 1489 | int len, pageoff; |
1475 | int i, rc; | 1490 | int i, rc; |
@@ -1484,6 +1499,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, | |||
1484 | rpcrdma_map_one(ia, seg, writing); | 1499 | rpcrdma_map_one(ia, seg, writing); |
1485 | seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma; | 1500 | seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma; |
1486 | len += seg->mr_len; | 1501 | len += seg->mr_len; |
1502 | BUG_ON(seg->mr_len > PAGE_SIZE); | ||
1487 | ++seg; | 1503 | ++seg; |
1488 | ++i; | 1504 | ++i; |
1489 | /* Check for holes */ | 1505 | /* Check for holes */ |
@@ -1494,26 +1510,45 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, | |||
1494 | dprintk("RPC: %s: Using frmr %p to map %d segments\n", | 1510 | dprintk("RPC: %s: Using frmr %p to map %d segments\n", |
1495 | __func__, seg1->mr_chunk.rl_mw, i); | 1511 | __func__, seg1->mr_chunk.rl_mw, i); |
1496 | 1512 | ||
1513 | if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) { | ||
1514 | dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n", | ||
1515 | __func__, | ||
1516 | seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey); | ||
1517 | /* Invalidate before using. */ | ||
1518 | memset(&invalidate_wr, 0, sizeof invalidate_wr); | ||
1519 | invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; | ||
1520 | invalidate_wr.next = &frmr_wr; | ||
1521 | invalidate_wr.opcode = IB_WR_LOCAL_INV; | ||
1522 | invalidate_wr.send_flags = IB_SEND_SIGNALED; | ||
1523 | invalidate_wr.ex.invalidate_rkey = | ||
1524 | seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; | ||
1525 | DECR_CQCOUNT(&r_xprt->rx_ep); | ||
1526 | post_wr = &invalidate_wr; | ||
1527 | } else | ||
1528 | post_wr = &frmr_wr; | ||
1529 | |||
1497 | /* Bump the key */ | 1530 | /* Bump the key */ |
1498 | key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); | 1531 | key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); |
1499 | ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); | 1532 | ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); |
1500 | 1533 | ||
1501 | /* Prepare FRMR WR */ | 1534 | /* Prepare FRMR WR */ |
1502 | memset(&frmr_wr, 0, sizeof frmr_wr); | 1535 | memset(&frmr_wr, 0, sizeof frmr_wr); |
1536 | frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; | ||
1503 | frmr_wr.opcode = IB_WR_FAST_REG_MR; | 1537 | frmr_wr.opcode = IB_WR_FAST_REG_MR; |
1504 | frmr_wr.send_flags = 0; /* unsignaled */ | 1538 | frmr_wr.send_flags = IB_SEND_SIGNALED; |
1505 | frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; | 1539 | frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; |
1506 | frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; | 1540 | frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; |
1507 | frmr_wr.wr.fast_reg.page_list_len = i; | 1541 | frmr_wr.wr.fast_reg.page_list_len = i; |
1508 | frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; | 1542 | frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; |
1509 | frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT; | 1543 | frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT; |
1544 | BUG_ON(frmr_wr.wr.fast_reg.length < len); | ||
1510 | frmr_wr.wr.fast_reg.access_flags = (writing ? | 1545 | frmr_wr.wr.fast_reg.access_flags = (writing ? |
1511 | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : | 1546 | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : |
1512 | IB_ACCESS_REMOTE_READ); | 1547 | IB_ACCESS_REMOTE_READ); |
1513 | frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; | 1548 | frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; |
1514 | DECR_CQCOUNT(&r_xprt->rx_ep); | 1549 | DECR_CQCOUNT(&r_xprt->rx_ep); |
1515 | 1550 | ||
1516 | rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr); | 1551 | rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); |
1517 | 1552 | ||
1518 | if (rc) { | 1553 | if (rc) { |
1519 | dprintk("RPC: %s: failed ib_post_send for register," | 1554 | dprintk("RPC: %s: failed ib_post_send for register," |
@@ -1542,8 +1577,9 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, | |||
1542 | rpcrdma_unmap_one(ia, seg++); | 1577 | rpcrdma_unmap_one(ia, seg++); |
1543 | 1578 | ||
1544 | memset(&invalidate_wr, 0, sizeof invalidate_wr); | 1579 | memset(&invalidate_wr, 0, sizeof invalidate_wr); |
1580 | invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; | ||
1545 | invalidate_wr.opcode = IB_WR_LOCAL_INV; | 1581 | invalidate_wr.opcode = IB_WR_LOCAL_INV; |
1546 | invalidate_wr.send_flags = 0; /* unsignaled */ | 1582 | invalidate_wr.send_flags = IB_SEND_SIGNALED; |
1547 | invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; | 1583 | invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; |
1548 | DECR_CQCOUNT(&r_xprt->rx_ep); | 1584 | DECR_CQCOUNT(&r_xprt->rx_ep); |
1549 | 1585 | ||