Diffstat (limited to 'net/rds/ib_send.c')
 -rw-r--r--  net/rds/ib_send.c | 682
 1 file changed, 408 insertions(+), 274 deletions(-)
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 17fa80803ab0..71f373c421bc 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -36,11 +36,49 @@
| 36 | #include <linux/dmapool.h> | 36 | #include <linux/dmapool.h> |
| 37 | 37 | ||
| 38 | #include "rds.h" | 38 | #include "rds.h" |
| 39 | #include "rdma.h" | ||
| 40 | #include "ib.h" | 39 | #include "ib.h" |
| 41 | 40 | ||
| 42 | static void rds_ib_send_rdma_complete(struct rds_message *rm, | 41 | static char *rds_ib_wc_status_strings[] = { |
| 43 | int wc_status) | 42 | #define RDS_IB_WC_STATUS_STR(foo) \ |
| 43 | [IB_WC_##foo] = __stringify(IB_WC_##foo) | ||
| 44 | RDS_IB_WC_STATUS_STR(SUCCESS), | ||
| 45 | RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), | ||
| 46 | RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), | ||
| 47 | RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), | ||
| 48 | RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), | ||
| 49 | RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), | ||
| 50 | RDS_IB_WC_STATUS_STR(MW_BIND_ERR), | ||
| 51 | RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), | ||
| 52 | RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), | ||
| 53 | RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), | ||
| 54 | RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), | ||
| 55 | RDS_IB_WC_STATUS_STR(REM_OP_ERR), | ||
| 56 | RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), | ||
| 57 | RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), | ||
| 58 | RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), | ||
| 59 | RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), | ||
| 60 | RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), | ||
| 61 | RDS_IB_WC_STATUS_STR(INV_EECN_ERR), | ||
| 62 | RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), | ||
| 63 | RDS_IB_WC_STATUS_STR(FATAL_ERR), | ||
| 64 | RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), | ||
| 65 | RDS_IB_WC_STATUS_STR(GENERAL_ERR), | ||
| 66 | #undef RDS_IB_WC_STATUS_STR | ||
| 67 | }; | ||
| 68 | |||
| 69 | char *rds_ib_wc_status_str(enum ib_wc_status status) | ||
| 70 | { | ||
| 71 | return rds_str_array(rds_ib_wc_status_strings, | ||
| 72 | ARRAY_SIZE(rds_ib_wc_status_strings), status); | ||
| 73 | } | ||
| 74 | |||
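
rds_ib_wc_status_str() above only wraps rds_str_array(), which is defined elsewhere in RDS; as a rough sketch of its contract (an assumption, not part of this diff), such a helper is just a bounds-checked table lookup with a fixed fallback:

        /* Hypothetical stand-in for rds_str_array(); the real helper lives
         * outside this file.  Return the known name for an index, otherwise
         * a fallback string so error paths never print a NULL pointer. */
        static char *wc_status_lookup(char **array, size_t elements, size_t index)
        {
                if (index < elements && array[index])
                        return array[index];
                return "unknown";
        }
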
| 75 | /* | ||
| 76 | * Convert IB-specific error message to RDS error message and call core | ||
| 77 | * completion handler. | ||
| 78 | */ | ||
| 79 | static void rds_ib_send_complete(struct rds_message *rm, | ||
| 80 | int wc_status, | ||
| 81 | void (*complete)(struct rds_message *rm, int status)) | ||
| 44 | { | 82 | { |
| 45 | int notify_status; | 83 | int notify_status; |
| 46 | 84 | ||
@@ -60,69 +98,125 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm,
| 60 | notify_status = RDS_RDMA_OTHER_ERROR; | 98 | notify_status = RDS_RDMA_OTHER_ERROR; |
| 61 | break; | 99 | break; |
| 62 | } | 100 | } |
| 63 | rds_rdma_send_complete(rm, notify_status); | 101 | complete(rm, notify_status); |
| 102 | } | ||
| 103 | |||
| 104 | static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, | ||
| 105 | struct rm_data_op *op, | ||
| 106 | int wc_status) | ||
| 107 | { | ||
| 108 | if (op->op_nents) | ||
| 109 | ib_dma_unmap_sg(ic->i_cm_id->device, | ||
| 110 | op->op_sg, op->op_nents, | ||
| 111 | DMA_TO_DEVICE); | ||
| 64 | } | 112 | } |
| 65 | 113 | ||
| 66 | static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, | 114 | static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, |
| 67 | struct rds_rdma_op *op) | 115 | struct rm_rdma_op *op, |
| 116 | int wc_status) | ||
| 68 | { | 117 | { |
| 69 | if (op->r_mapped) { | 118 | if (op->op_mapped) { |
| 70 | ib_dma_unmap_sg(ic->i_cm_id->device, | 119 | ib_dma_unmap_sg(ic->i_cm_id->device, |
| 71 | op->r_sg, op->r_nents, | 120 | op->op_sg, op->op_nents, |
| 72 | op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); | 121 | op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); |
| 73 | op->r_mapped = 0; | 122 | op->op_mapped = 0; |
| 74 | } | 123 | } |
| 124 | |||
| 125 | /* If the user asked for a completion notification on this | ||
| 126 | * message, we can implement three different semantics: | ||
| 127 | * 1. Notify when we received the ACK on the RDS message | ||
| 128 | * that was queued with the RDMA. This provides reliable | ||
| 129 | * notification of RDMA status at the expense of a one-way | ||
| 130 | * packet delay. | ||
| 131 | * 2. Notify when the IB stack gives us the completion event for | ||
| 132 | * the RDMA operation. | ||
| 133 | * 3. Notify when the IB stack gives us the completion event for | ||
| 134 | * the accompanying RDS messages. | ||
| 135 | * Here, we implement approach #3. To implement approach #2, | ||
| 136 | * we would need to take an event for the rdma WR. To implement #1, | ||
| 137 | * don't call rds_rdma_send_complete at all, and fall back to the notify | ||
| 138 | * handling in the ACK processing code. | ||
| 139 | * | ||
| 140 | * Note: There's no need to explicitly sync any RDMA buffers using | ||
| 141 | * ib_dma_sync_sg_for_cpu - the completion for the RDMA | ||
| 142 | * operation itself unmapped the RDMA buffers, which takes care | ||
| 143 | * of synching. | ||
| 144 | */ | ||
| 145 | rds_ib_send_complete(container_of(op, struct rds_message, rdma), | ||
| 146 | wc_status, rds_rdma_send_complete); | ||
| 147 | |||
| 148 | if (op->op_write) | ||
| 149 | rds_stats_add(s_send_rdma_bytes, op->op_bytes); | ||
| 150 | else | ||
| 151 | rds_stats_add(s_recv_rdma_bytes, op->op_bytes); | ||
| 75 | } | 152 | } |
| 76 | 153 | ||
| 77 | static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, | 154 | static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, |
| 78 | struct rds_ib_send_work *send, | 155 | struct rm_atomic_op *op, |
| 79 | int wc_status) | 156 | int wc_status) |
| 80 | { | 157 | { |
| 81 | struct rds_message *rm = send->s_rm; | 158 | /* unmap atomic recvbuf */ |
| 82 | 159 | if (op->op_mapped) { | |
| 83 | rdsdebug("ic %p send %p rm %p\n", ic, send, rm); | 160 | ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, |
| 84 | 161 | DMA_FROM_DEVICE); | |
| 85 | ib_dma_unmap_sg(ic->i_cm_id->device, | 162 | op->op_mapped = 0; |
| 86 | rm->m_sg, rm->m_nents, | 163 | } |
| 87 | DMA_TO_DEVICE); | ||
| 88 | |||
| 89 | if (rm->m_rdma_op != NULL) { | ||
| 90 | rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); | ||
| 91 | |||
| 92 | /* If the user asked for a completion notification on this | ||
| 93 | * message, we can implement three different semantics: | ||
| 94 | * 1. Notify when we received the ACK on the RDS message | ||
| 95 | * that was queued with the RDMA. This provides reliable | ||
| 96 | * notification of RDMA status at the expense of a one-way | ||
| 97 | * packet delay. | ||
| 98 | * 2. Notify when the IB stack gives us the completion event for | ||
| 99 | * the RDMA operation. | ||
| 100 | * 3. Notify when the IB stack gives us the completion event for | ||
| 101 | * the accompanying RDS messages. | ||
| 102 | * Here, we implement approach #3. To implement approach #2, | ||
| 103 | * call rds_rdma_send_complete from the cq_handler. To implement #1, | ||
| 104 | * don't call rds_rdma_send_complete at all, and fall back to the notify | ||
| 105 | * handling in the ACK processing code. | ||
| 106 | * | ||
| 107 | * Note: There's no need to explicitly sync any RDMA buffers using | ||
| 108 | * ib_dma_sync_sg_for_cpu - the completion for the RDMA | ||
| 109 | * operation itself unmapped the RDMA buffers, which takes care | ||
| 110 | * of synching. | ||
| 111 | */ | ||
| 112 | rds_ib_send_rdma_complete(rm, wc_status); | ||
| 113 | 164 | ||
| 114 | if (rm->m_rdma_op->r_write) | 165 | rds_ib_send_complete(container_of(op, struct rds_message, atomic), |
| 115 | rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); | 166 | wc_status, rds_atomic_send_complete); |
| 116 | else | 167 | |
| 117 | rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); | 168 | if (op->op_type == RDS_ATOMIC_TYPE_CSWP) |
| 169 | rds_ib_stats_inc(s_ib_atomic_cswp); | ||
| 170 | else | ||
| 171 | rds_ib_stats_inc(s_ib_atomic_fadd); | ||
| 172 | } | ||
| 173 | |||
| 174 | /* | ||
| 175 | * Unmap the resources associated with a struct send_work. | ||
| 176 | * | ||
| 177 | * Returns the rm for no good reason other than it is unobtainable | ||
| 178 | * other than by switching on wr.opcode, currently, and the caller, | ||
| 179 | * the event handler, needs it. | ||
| 180 | */ | ||
| 181 | static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic, | ||
| 182 | struct rds_ib_send_work *send, | ||
| 183 | int wc_status) | ||
| 184 | { | ||
| 185 | struct rds_message *rm = NULL; | ||
| 186 | |||
| 187 | /* In the error case, wc.opcode sometimes contains garbage */ | ||
| 188 | switch (send->s_wr.opcode) { | ||
| 189 | case IB_WR_SEND: | ||
| 190 | if (send->s_op) { | ||
| 191 | rm = container_of(send->s_op, struct rds_message, data); | ||
| 192 | rds_ib_send_unmap_data(ic, send->s_op, wc_status); | ||
| 193 | } | ||
| 194 | break; | ||
| 195 | case IB_WR_RDMA_WRITE: | ||
| 196 | case IB_WR_RDMA_READ: | ||
| 197 | if (send->s_op) { | ||
| 198 | rm = container_of(send->s_op, struct rds_message, rdma); | ||
| 199 | rds_ib_send_unmap_rdma(ic, send->s_op, wc_status); | ||
| 200 | } | ||
| 201 | break; | ||
| 202 | case IB_WR_ATOMIC_FETCH_AND_ADD: | ||
| 203 | case IB_WR_ATOMIC_CMP_AND_SWP: | ||
| 204 | if (send->s_op) { | ||
| 205 | rm = container_of(send->s_op, struct rds_message, atomic); | ||
| 206 | rds_ib_send_unmap_atomic(ic, send->s_op, wc_status); | ||
| 207 | } | ||
| 208 | break; | ||
| 209 | default: | ||
| 210 | if (printk_ratelimit()) | ||
| 211 | printk(KERN_NOTICE | ||
| 212 | "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", | ||
| 213 | __func__, send->s_wr.opcode); | ||
| 214 | break; | ||
| 118 | } | 215 | } |
| 119 | 216 | ||
| 120 | /* If anyone waited for this message to get flushed out, wake | 217 | send->s_wr.opcode = 0xdead; |
| 121 | * them up now */ | ||
| 122 | rds_message_unmapped(rm); | ||
| 123 | 218 | ||
| 124 | rds_message_put(rm); | 219 | return rm; |
| 125 | send->s_rm = NULL; | ||
| 126 | } | 220 | } |
| 127 | 221 | ||
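
Each unmap helper above recovers the owning rds_message by stepping back from the embedded op with container_of(), which is why rds_ib_send_unmap_op() can hand the rm to the completion handler without storing a separate back-pointer. For reference, the kernel's container_of() of this period is essentially:

        /* Canonical kernel definition, shown for reference only: subtract the
         * member's offset from the member pointer to reach the enclosing
         * structure (e.g. from &rm->rdma back to the struct rds_message). */
        #define container_of(ptr, type, member) ({                      \
                const typeof(((type *)0)->member) *__mptr = (ptr);      \
                (type *)((char *)__mptr - offsetof(type, member)); })
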
| 128 | void rds_ib_send_init_ring(struct rds_ib_connection *ic) | 222 | void rds_ib_send_init_ring(struct rds_ib_connection *ic) |
@@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
| 133 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { | 227 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { |
| 134 | struct ib_sge *sge; | 228 | struct ib_sge *sge; |
| 135 | 229 | ||
| 136 | send->s_rm = NULL; | ||
| 137 | send->s_op = NULL; | 230 | send->s_op = NULL; |
| 138 | 231 | ||
| 139 | send->s_wr.wr_id = i; | 232 | send->s_wr.wr_id = i; |
| 140 | send->s_wr.sg_list = send->s_sge; | 233 | send->s_wr.sg_list = send->s_sge; |
| 141 | send->s_wr.num_sge = 1; | ||
| 142 | send->s_wr.opcode = IB_WR_SEND; | ||
| 143 | send->s_wr.send_flags = 0; | ||
| 144 | send->s_wr.ex.imm_data = 0; | 234 | send->s_wr.ex.imm_data = 0; |
| 145 | 235 | ||
| 146 | sge = rds_ib_data_sge(ic, send->s_sge); | 236 | sge = &send->s_sge[0]; |
| 147 | sge->lkey = ic->i_mr->lkey; | ||
| 148 | |||
| 149 | sge = rds_ib_header_sge(ic, send->s_sge); | ||
| 150 | sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); | 237 | sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); |
| 151 | sge->length = sizeof(struct rds_header); | 238 | sge->length = sizeof(struct rds_header); |
| 152 | sge->lkey = ic->i_mr->lkey; | 239 | sge->lkey = ic->i_mr->lkey; |
| 240 | |||
| 241 | send->s_sge[1].lkey = ic->i_mr->lkey; | ||
| 153 | } | 242 | } |
| 154 | } | 243 | } |
| 155 | 244 | ||
@@ -159,16 +248,24 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
| 159 | u32 i; | 248 | u32 i; |
| 160 | 249 | ||
| 161 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { | 250 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { |
| 162 | if (send->s_wr.opcode == 0xdead) | 251 | if (send->s_op && send->s_wr.opcode != 0xdead) |
| 163 | continue; | 252 | rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR); |
| 164 | if (send->s_rm) | ||
| 165 | rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); | ||
| 166 | if (send->s_op) | ||
| 167 | rds_ib_send_unmap_rdma(ic, send->s_op); | ||
| 168 | } | 253 | } |
| 169 | } | 254 | } |
| 170 | 255 | ||
| 171 | /* | 256 | /* |
| 257 | * The only fast path caller always has a non-zero nr, so we don't | ||
| 258 | * bother testing nr before performing the atomic sub. | ||
| 259 | */ | ||
| 260 | static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) | ||
| 261 | { | ||
| 262 | if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) && | ||
| 263 | waitqueue_active(&rds_ib_ring_empty_wait)) | ||
| 264 | wake_up(&rds_ib_ring_empty_wait); | ||
| 265 | BUG_ON(atomic_read(&ic->i_signaled_sends) < 0); | ||
| 266 | } | ||
| 267 | |||
| 268 | /* | ||
| 172 | * The _oldest/_free ring operations here race cleanly with the alloc/unalloc | 269 | * The _oldest/_free ring operations here race cleanly with the alloc/unalloc |
| 173 | * operations performed in the send path. As the sender allocs and potentially | 270 | * operations performed in the send path. As the sender allocs and potentially |
| 174 | * unallocs the next free entry in the ring it doesn't alter which is | 271 | * unallocs the next free entry in the ring it doesn't alter which is |
@@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
| 178 | { | 275 | { |
| 179 | struct rds_connection *conn = context; | 276 | struct rds_connection *conn = context; |
| 180 | struct rds_ib_connection *ic = conn->c_transport_data; | 277 | struct rds_ib_connection *ic = conn->c_transport_data; |
| 278 | struct rds_message *rm = NULL; | ||
| 181 | struct ib_wc wc; | 279 | struct ib_wc wc; |
| 182 | struct rds_ib_send_work *send; | 280 | struct rds_ib_send_work *send; |
| 183 | u32 completed; | 281 | u32 completed; |
| 184 | u32 oldest; | 282 | u32 oldest; |
| 185 | u32 i = 0; | 283 | u32 i = 0; |
| 186 | int ret; | 284 | int ret; |
| 285 | int nr_sig = 0; | ||
| 187 | 286 | ||
| 188 | rdsdebug("cq %p conn %p\n", cq, conn); | 287 | rdsdebug("cq %p conn %p\n", cq, conn); |
| 189 | rds_ib_stats_inc(s_ib_tx_cq_call); | 288 | rds_ib_stats_inc(s_ib_tx_cq_call); |
@@ -192,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
| 192 | rdsdebug("ib_req_notify_cq send failed: %d\n", ret); | 291 | rdsdebug("ib_req_notify_cq send failed: %d\n", ret); |
| 193 | 292 | ||
| 194 | while (ib_poll_cq(cq, 1, &wc) > 0) { | 293 | while (ib_poll_cq(cq, 1, &wc) > 0) { |
| 195 | rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", | 294 | rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", |
| 196 | (unsigned long long)wc.wr_id, wc.status, wc.byte_len, | 295 | (unsigned long long)wc.wr_id, wc.status, |
| 296 | rds_ib_wc_status_str(wc.status), wc.byte_len, | ||
| 197 | be32_to_cpu(wc.ex.imm_data)); | 297 | be32_to_cpu(wc.ex.imm_data)); |
| 198 | rds_ib_stats_inc(s_ib_tx_cq_event); | 298 | rds_ib_stats_inc(s_ib_tx_cq_event); |
| 199 | 299 | ||
@@ -210,51 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
| 210 | 310 | ||
| 211 | for (i = 0; i < completed; i++) { | 311 | for (i = 0; i < completed; i++) { |
| 212 | send = &ic->i_sends[oldest]; | 312 | send = &ic->i_sends[oldest]; |
| 313 | if (send->s_wr.send_flags & IB_SEND_SIGNALED) | ||
| 314 | nr_sig++; | ||
| 213 | 315 | ||
| 214 | /* In the error case, wc.opcode sometimes contains garbage */ | 316 | rm = rds_ib_send_unmap_op(ic, send, wc.status); |
| 215 | switch (send->s_wr.opcode) { | ||
| 216 | case IB_WR_SEND: | ||
| 217 | if (send->s_rm) | ||
| 218 | rds_ib_send_unmap_rm(ic, send, wc.status); | ||
| 219 | break; | ||
| 220 | case IB_WR_RDMA_WRITE: | ||
| 221 | case IB_WR_RDMA_READ: | ||
| 222 | /* Nothing to be done - the SG list will be unmapped | ||
| 223 | * when the SEND completes. */ | ||
| 224 | break; | ||
| 225 | default: | ||
| 226 | if (printk_ratelimit()) | ||
| 227 | printk(KERN_NOTICE | ||
| 228 | "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", | ||
| 229 | __func__, send->s_wr.opcode); | ||
| 230 | break; | ||
| 231 | } | ||
| 232 | 317 | ||
| 233 | send->s_wr.opcode = 0xdead; | ||
| 234 | send->s_wr.num_sge = 1; | ||
| 235 | if (send->s_queued + HZ/2 < jiffies) | 318 | if (send->s_queued + HZ/2 < jiffies) |
| 236 | rds_ib_stats_inc(s_ib_tx_stalled); | 319 | rds_ib_stats_inc(s_ib_tx_stalled); |
| 237 | 320 | ||
| 238 | /* If a RDMA operation produced an error, signal this right | 321 | if (send->s_op) { |
| 239 | * away. If we don't, the subsequent SEND that goes with this | 322 | if (send->s_op == rm->m_final_op) { |
| 240 | * RDMA will be canceled with ERR_WFLUSH, and the application | 323 | /* If anyone waited for this message to get flushed out, wake |
| 241 | * never learn that the RDMA failed. */ | 324 | * them up now */ |
| 242 | if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { | 325 | rds_message_unmapped(rm); |
| 243 | struct rds_message *rm; | ||
| 244 | |||
| 245 | rm = rds_send_get_message(conn, send->s_op); | ||
| 246 | if (rm) { | ||
| 247 | if (rm->m_rdma_op) | ||
| 248 | rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); | ||
| 249 | rds_ib_send_rdma_complete(rm, wc.status); | ||
| 250 | rds_message_put(rm); | ||
| 251 | } | 326 | } |
| 327 | rds_message_put(rm); | ||
| 328 | send->s_op = NULL; | ||
| 252 | } | 329 | } |
| 253 | 330 | ||
| 254 | oldest = (oldest + 1) % ic->i_send_ring.w_nr; | 331 | oldest = (oldest + 1) % ic->i_send_ring.w_nr; |
| 255 | } | 332 | } |
| 256 | 333 | ||
| 257 | rds_ib_ring_free(&ic->i_send_ring, completed); | 334 | rds_ib_ring_free(&ic->i_send_ring, completed); |
| 335 | rds_ib_sub_signaled(ic, nr_sig); | ||
| 336 | nr_sig = 0; | ||
| 258 | 337 | ||
| 259 | if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || | 338 | if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || |
| 260 | test_bit(0, &conn->c_map_queued)) | 339 | test_bit(0, &conn->c_map_queued)) |
@@ -262,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
| 262 | 341 | ||
| 263 | /* We expect errors as the qp is drained during shutdown */ | 342 | /* We expect errors as the qp is drained during shutdown */ |
| 264 | if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { | 343 | if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { |
| 265 | rds_ib_conn_error(conn, | 344 | rds_ib_conn_error(conn, "send completion on %pI4 had status " |
| 266 | "send completion on %pI4 " | 345 | "%u (%s), disconnecting and reconnecting\n", |
| 267 | "had status %u, disconnecting and reconnecting\n", | 346 | &conn->c_faddr, wc.status, |
| 268 | &conn->c_faddr, wc.status); | 347 | rds_ib_wc_status_str(wc.status)); |
| 269 | } | 348 | } |
| 270 | } | 349 | } |
| 271 | } | 350 | } |
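
The completion loop above counts how many of the reaped work requests were posted with IB_SEND_SIGNALED and feeds that number into rds_ib_sub_signaled(), the counterpart of the atomic_add() calls in the transmit paths below. The waiter it wakes is outside this file; a plausible shape for it (an assumption, not shown in this diff) is a shutdown path that refuses to tear the QP down until every signaled send has completed:

        /* Assumed consumer of rds_ib_ring_empty_wait; illustrative only,
         * the real wait lives in the connection shutdown code. */
        wait_event(rds_ib_ring_empty_wait,
                   atomic_read(&ic->i_signaled_sends) == 0);
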
@@ -294,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
| 294 | * credits (see rds_ib_send_add_credits below). | 373 | * credits (see rds_ib_send_add_credits below). |
| 295 | * | 374 | * |
| 296 | * The RDS send code is essentially single-threaded; rds_send_xmit | 375 | * The RDS send code is essentially single-threaded; rds_send_xmit |
| 297 | * grabs c_send_lock to ensure exclusive access to the send ring. | 376 | * sets RDS_IN_XMIT to ensure exclusive access to the send ring. |
| 298 | * However, the ACK sending code is independent and can race with | 377 | * However, the ACK sending code is independent and can race with |
| 299 | * message SENDs. | 378 | * message SENDs. |
| 300 | * | 379 | * |
@@ -413,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
| 413 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | 492 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); |
| 414 | } | 493 | } |
| 415 | 494 | ||
| 416 | static inline void | 495 | static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, |
| 417 | rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, | 496 | struct rds_ib_send_work *send, |
| 418 | struct rds_ib_send_work *send, unsigned int pos, | 497 | bool notify) |
| 419 | unsigned long buffer, unsigned int length, | ||
| 420 | int send_flags) | ||
| 421 | { | 498 | { |
| 422 | struct ib_sge *sge; | 499 | /* |
| 423 | 500 | * We want to delay signaling completions just enough to get | |
| 424 | WARN_ON(pos != send - ic->i_sends); | 501 | * the batching benefits but not so much that we create dead time |
| 425 | 502 | * on the wire. | |
| 426 | send->s_wr.send_flags = send_flags; | 503 | */ |
| 427 | send->s_wr.opcode = IB_WR_SEND; | 504 | if (ic->i_unsignaled_wrs-- == 0 || notify) { |
| 428 | send->s_wr.num_sge = 2; | 505 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; |
| 429 | send->s_wr.next = NULL; | 506 | send->s_wr.send_flags |= IB_SEND_SIGNALED; |
| 430 | send->s_queued = jiffies; | 507 | return 1; |
| 431 | send->s_op = NULL; | ||
| 432 | |||
| 433 | if (length != 0) { | ||
| 434 | sge = rds_ib_data_sge(ic, send->s_sge); | ||
| 435 | sge->addr = buffer; | ||
| 436 | sge->length = length; | ||
| 437 | sge->lkey = ic->i_mr->lkey; | ||
| 438 | |||
| 439 | sge = rds_ib_header_sge(ic, send->s_sge); | ||
| 440 | } else { | ||
| 441 | /* We're sending a packet with no payload. There is only | ||
| 442 | * one SGE */ | ||
| 443 | send->s_wr.num_sge = 1; | ||
| 444 | sge = &send->s_sge[0]; | ||
| 445 | } | 508 | } |
| 446 | 509 | return 0; | |
| 447 | sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); | ||
| 448 | sge->length = sizeof(struct rds_header); | ||
| 449 | sge->lkey = ic->i_mr->lkey; | ||
| 450 | } | 510 | } |
| 451 | 511 | ||
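
Callers use the return value of rds_ib_set_wr_signal_state() to keep the signaled-send count honest: roughly one work request per rds_ib_sysctl_max_unsig_wrs posts signaled, plus any request whose caller explicitly asked for a notification. A condensed usage sketch (the surrounding loop is elided):

        /* Sketch of the calling pattern used by the transmit paths below. */
        int nr_sig = 0;

        nr_sig += rds_ib_set_wr_signal_state(ic, send, 0);              /* data send   */
        nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);  /* rdma/atomic */
        if (nr_sig)
                atomic_add(nr_sig, &ic->i_signaled_sends);
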
| 452 | /* | 512 | /* |
@@ -475,13 +535,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
| 475 | u32 pos; | 535 | u32 pos; |
| 476 | u32 i; | 536 | u32 i; |
| 477 | u32 work_alloc; | 537 | u32 work_alloc; |
| 478 | u32 credit_alloc; | 538 | u32 credit_alloc = 0; |
| 479 | u32 posted; | 539 | u32 posted; |
| 480 | u32 adv_credits = 0; | 540 | u32 adv_credits = 0; |
| 481 | int send_flags = 0; | 541 | int send_flags = 0; |
| 482 | int sent; | 542 | int bytes_sent = 0; |
| 483 | int ret; | 543 | int ret; |
| 484 | int flow_controlled = 0; | 544 | int flow_controlled = 0; |
| 545 | int nr_sig = 0; | ||
| 485 | 546 | ||
| 486 | BUG_ON(off % RDS_FRAG_SIZE); | 547 | BUG_ON(off % RDS_FRAG_SIZE); |
| 487 | BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); | 548 | BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); |
@@ -507,14 +568,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
| 507 | goto out; | 568 | goto out; |
| 508 | } | 569 | } |
| 509 | 570 | ||
| 510 | credit_alloc = work_alloc; | ||
| 511 | if (ic->i_flowctl) { | 571 | if (ic->i_flowctl) { |
| 512 | credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); | 572 | credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); |
| 513 | adv_credits += posted; | 573 | adv_credits += posted; |
| 514 | if (credit_alloc < work_alloc) { | 574 | if (credit_alloc < work_alloc) { |
| 515 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); | 575 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); |
| 516 | work_alloc = credit_alloc; | 576 | work_alloc = credit_alloc; |
| 517 | flow_controlled++; | 577 | flow_controlled = 1; |
| 518 | } | 578 | } |
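
A worked example of the credit trim above, with hypothetical numbers: if the send ring yielded work_alloc == 8 slots but only 5 credits could be grabbed, the surplus 3 slots are handed back, work_alloc drops to 5 and flow_controlled is set; the send loop below then marks the last of those 5 requests SIGNALED|SOLICITED (see the flow_controlled test further down).
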
| 519 | if (work_alloc == 0) { | 579 | if (work_alloc == 0) { |
| 520 | set_bit(RDS_LL_SEND_FULL, &conn->c_flags); | 580 | set_bit(RDS_LL_SEND_FULL, &conn->c_flags); |
@@ -525,31 +585,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
| 525 | } | 585 | } |
| 526 | 586 | ||
| 527 | /* map the message the first time we see it */ | 587 | /* map the message the first time we see it */ |
| 528 | if (ic->i_rm == NULL) { | 588 | if (!ic->i_data_op) { |
| 529 | /* | 589 | if (rm->data.op_nents) { |
| 530 | printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", | 590 | rm->data.op_count = ib_dma_map_sg(dev, |
| 531 | be16_to_cpu(rm->m_inc.i_hdr.h_dport), | 591 | rm->data.op_sg, |
| 532 | rm->m_inc.i_hdr.h_flags, | 592 | rm->data.op_nents, |
| 533 | be32_to_cpu(rm->m_inc.i_hdr.h_len)); | 593 | DMA_TO_DEVICE); |
| 534 | */ | 594 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); |
| 535 | if (rm->m_nents) { | 595 | if (rm->data.op_count == 0) { |
| 536 | rm->m_count = ib_dma_map_sg(dev, | ||
| 537 | rm->m_sg, rm->m_nents, DMA_TO_DEVICE); | ||
| 538 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); | ||
| 539 | if (rm->m_count == 0) { | ||
| 540 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | 596 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); |
| 541 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | 597 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); |
| 542 | ret = -ENOMEM; /* XXX ? */ | 598 | ret = -ENOMEM; /* XXX ? */ |
| 543 | goto out; | 599 | goto out; |
| 544 | } | 600 | } |
| 545 | } else { | 601 | } else { |
| 546 | rm->m_count = 0; | 602 | rm->data.op_count = 0; |
| 547 | } | 603 | } |
| 548 | 604 | ||
| 549 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
| 550 | ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; | ||
| 551 | rds_message_addref(rm); | 605 | rds_message_addref(rm); |
| 552 | ic->i_rm = rm; | 606 | ic->i_data_op = &rm->data; |
| 553 | 607 | ||
| 554 | /* Finalize the header */ | 608 | /* Finalize the header */ |
| 555 | if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) | 609 | if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) |
@@ -559,10 +613,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
| 559 | 613 | ||
| 560 | /* If it has a RDMA op, tell the peer we did it. This is | 614 | /* If it has a RDMA op, tell the peer we did it. This is |
| 561 | * used by the peer to release use-once RDMA MRs. */ | 615 | * used by the peer to release use-once RDMA MRs. */ |
| 562 | if (rm->m_rdma_op) { | 616 | if (rm->rdma.op_active) { |
| 563 | struct rds_ext_header_rdma ext_hdr; | 617 | struct rds_ext_header_rdma ext_hdr; |
| 564 | 618 | ||
| 565 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); | 619 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); |
| 566 | rds_message_add_extension(&rm->m_inc.i_hdr, | 620 | rds_message_add_extension(&rm->m_inc.i_hdr, |
| 567 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); | 621 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); |
| 568 | } | 622 | } |
@@ -582,99 +636,77 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
| 582 | /* | 636 | /* |
| 583 | * Update adv_credits since we reset the ACK_REQUIRED bit. | 637 | * Update adv_credits since we reset the ACK_REQUIRED bit. |
| 584 | */ | 638 | */ |
| 585 | rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); | 639 | if (ic->i_flowctl) { |
| 586 | adv_credits += posted; | 640 | rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); |
| 587 | BUG_ON(adv_credits > 255); | 641 | adv_credits += posted; |
| 642 | BUG_ON(adv_credits > 255); | ||
| 643 | } | ||
| 588 | } | 644 | } |
| 589 | 645 | ||
| 590 | send = &ic->i_sends[pos]; | ||
| 591 | first = send; | ||
| 592 | prev = NULL; | ||
| 593 | scat = &rm->m_sg[sg]; | ||
| 594 | sent = 0; | ||
| 595 | i = 0; | ||
| 596 | |||
| 597 | /* Sometimes you want to put a fence between an RDMA | 646 | /* Sometimes you want to put a fence between an RDMA |
| 598 | * READ and the following SEND. | 647 | * READ and the following SEND. |
| 599 | * We could either do this all the time | 648 | * We could either do this all the time |
| 600 | * or when requested by the user. Right now, we let | 649 | * or when requested by the user. Right now, we let |
| 601 | * the application choose. | 650 | * the application choose. |
| 602 | */ | 651 | */ |
| 603 | if (rm->m_rdma_op && rm->m_rdma_op->r_fence) | 652 | if (rm->rdma.op_active && rm->rdma.op_fence) |
| 604 | send_flags = IB_SEND_FENCE; | 653 | send_flags = IB_SEND_FENCE; |
| 605 | 654 | ||
| 606 | /* | 655 | /* Each frag gets a header. Msgs may be 0 bytes */ |
| 607 | * We could be copying the header into the unused tail of the page. | 656 | send = &ic->i_sends[pos]; |
| 608 | * That would need to be changed in the future when those pages might | 657 | first = send; |
| 609 | * be mapped userspace pages or page cache pages. So instead we always | 658 | prev = NULL; |
| 610 | * use a second sge and our long-lived ring of mapped headers. We send | 659 | scat = &ic->i_data_op->op_sg[sg]; |
| 611 | * the header after the data so that the data payload can be aligned on | 660 | i = 0; |
| 612 | * the receiver. | 661 | do { |
| 613 | */ | 662 | unsigned int len = 0; |
| 614 | 663 | ||
| 615 | /* handle a 0-len message */ | 664 | /* Set up the header */ |
| 616 | if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { | 665 | send->s_wr.send_flags = send_flags; |
| 617 | rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); | 666 | send->s_wr.opcode = IB_WR_SEND; |
| 618 | goto add_header; | 667 | send->s_wr.num_sge = 1; |
| 619 | } | 668 | send->s_wr.next = NULL; |
| 669 | send->s_queued = jiffies; | ||
| 670 | send->s_op = NULL; | ||
| 620 | 671 | ||
| 621 | /* if there's data reference it with a chain of work reqs */ | 672 | send->s_sge[0].addr = ic->i_send_hdrs_dma |
| 622 | for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { | 673 | + (pos * sizeof(struct rds_header)); |
| 623 | unsigned int len; | 674 | send->s_sge[0].length = sizeof(struct rds_header); |
| 624 | 675 | ||
| 625 | send = &ic->i_sends[pos]; | 676 | memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); |
| 626 | 677 | ||
| 627 | len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); | 678 | /* Set up the data, if present */ |
| 628 | rds_ib_xmit_populate_wr(ic, send, pos, | 679 | if (i < work_alloc |
| 629 | ib_sg_dma_address(dev, scat) + off, len, | 680 | && scat != &rm->data.op_sg[rm->data.op_count]) { |
| 630 | send_flags); | 681 | len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); |
| 682 | send->s_wr.num_sge = 2; | ||
| 631 | 683 | ||
| 632 | /* | 684 | send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; |
| 633 | * We want to delay signaling completions just enough to get | 685 | send->s_sge[1].length = len; |
| 634 | * the batching benefits but not so much that we create dead time | ||
| 635 | * on the wire. | ||
| 636 | */ | ||
| 637 | if (ic->i_unsignaled_wrs-- == 0) { | ||
| 638 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
| 639 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
| 640 | } | ||
| 641 | 686 | ||
| 642 | ic->i_unsignaled_bytes -= len; | 687 | bytes_sent += len; |
| 643 | if (ic->i_unsignaled_bytes <= 0) { | 688 | off += len; |
| 644 | ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; | 689 | if (off == ib_sg_dma_len(dev, scat)) { |
| 645 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 690 | scat++; |
| 691 | off = 0; | ||
| 692 | } | ||
| 646 | } | 693 | } |
| 647 | 694 | ||
| 695 | rds_ib_set_wr_signal_state(ic, send, 0); | ||
| 696 | |||
| 648 | /* | 697 | /* |
| 649 | * Always signal the last one if we're stopping due to flow control. | 698 | * Always signal the last one if we're stopping due to flow control. |
| 650 | */ | 699 | */ |
| 651 | if (flow_controlled && i == (work_alloc-1)) | 700 | if (ic->i_flowctl && flow_controlled && i == (work_alloc-1)) |
| 652 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 701 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; |
| 653 | 702 | ||
| 703 | if (send->s_wr.send_flags & IB_SEND_SIGNALED) | ||
| 704 | nr_sig++; | ||
| 705 | |||
| 654 | rdsdebug("send %p wr %p num_sge %u next %p\n", send, | 706 | rdsdebug("send %p wr %p num_sge %u next %p\n", send, |
| 655 | &send->s_wr, send->s_wr.num_sge, send->s_wr.next); | 707 | &send->s_wr, send->s_wr.num_sge, send->s_wr.next); |
| 656 | 708 | ||
| 657 | sent += len; | 709 | if (ic->i_flowctl && adv_credits) { |
| 658 | off += len; | ||
| 659 | if (off == ib_sg_dma_len(dev, scat)) { | ||
| 660 | scat++; | ||
| 661 | off = 0; | ||
| 662 | } | ||
| 663 | |||
| 664 | add_header: | ||
| 665 | /* Tack on the header after the data. The header SGE should already | ||
| 666 | * have been set up to point to the right header buffer. */ | ||
| 667 | memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); | ||
| 668 | |||
| 669 | if (0) { | ||
| 670 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; | ||
| 671 | |||
| 672 | printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", | ||
| 673 | be16_to_cpu(hdr->h_dport), | ||
| 674 | hdr->h_flags, | ||
| 675 | be32_to_cpu(hdr->h_len)); | ||
| 676 | } | ||
| 677 | if (adv_credits) { | ||
| 678 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; | 710 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; |
| 679 | 711 | ||
| 680 | /* add credit and redo the header checksum */ | 712 | /* add credit and redo the header checksum */ |
@@ -689,20 +721,25 @@ add_header:
| 689 | prev = send; | 721 | prev = send; |
| 690 | 722 | ||
| 691 | pos = (pos + 1) % ic->i_send_ring.w_nr; | 723 | pos = (pos + 1) % ic->i_send_ring.w_nr; |
| 692 | } | 724 | send = &ic->i_sends[pos]; |
| 725 | i++; | ||
| 726 | |||
| 727 | } while (i < work_alloc | ||
| 728 | && scat != &rm->data.op_sg[rm->data.op_count]); | ||
| 693 | 729 | ||
| 694 | /* Account the RDS header in the number of bytes we sent, but just once. | 730 | /* Account the RDS header in the number of bytes we sent, but just once. |
| 695 | * The caller has no concept of fragmentation. */ | 731 | * The caller has no concept of fragmentation. */ |
| 696 | if (hdr_off == 0) | 732 | if (hdr_off == 0) |
| 697 | sent += sizeof(struct rds_header); | 733 | bytes_sent += sizeof(struct rds_header); |
| 698 | 734 | ||
| 699 | /* if we finished the message then send completion owns it */ | 735 | /* if we finished the message then send completion owns it */ |
| 700 | if (scat == &rm->m_sg[rm->m_count]) { | 736 | if (scat == &rm->data.op_sg[rm->data.op_count]) { |
| 701 | prev->s_rm = ic->i_rm; | 737 | prev->s_op = ic->i_data_op; |
| 702 | prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 738 | prev->s_wr.send_flags |= IB_SEND_SOLICITED; |
| 703 | ic->i_rm = NULL; | 739 | ic->i_data_op = NULL; |
| 704 | } | 740 | } |
| 705 | 741 | ||
| 742 | /* Put back wrs & credits we didn't use */ | ||
| 706 | if (i < work_alloc) { | 743 | if (i < work_alloc) { |
| 707 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); | 744 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); |
| 708 | work_alloc = i; | 745 | work_alloc = i; |
@@ -710,6 +747,9 @@ add_header:
| 710 | if (ic->i_flowctl && i < credit_alloc) | 747 | if (ic->i_flowctl && i < credit_alloc) |
| 711 | rds_ib_send_add_credits(conn, credit_alloc - i); | 748 | rds_ib_send_add_credits(conn, credit_alloc - i); |
| 712 | 749 | ||
| 750 | if (nr_sig) | ||
| 751 | atomic_add(nr_sig, &ic->i_signaled_sends); | ||
| 752 | |||
| 713 | /* XXX need to worry about failed_wr and partial sends. */ | 753 | /* XXX need to worry about failed_wr and partial sends. */ |
| 714 | failed_wr = &first->s_wr; | 754 | failed_wr = &first->s_wr; |
| 715 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); | 755 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); |
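
The whole chain goes to the hardware in a single ib_post_send() call: the work requests are linked through s_wr.next (the RDMA path below does this explicitly with prev->s_wr.next = &send->s_wr), and on failure the verb reports the first request it could not post through the last argument, here failed_wr. That is what the "XXX need to worry about failed_wr and partial sends" comment above is pointing at; a hypothetical sketch of what handling a partial post might look like:

        /* Illustrative only: everything from failed_wr onward was never
         * handed to the hardware, so its ring slots and signaled counts
         * would have to be unwound. */
        if (ret) {
                struct ib_send_wr *wr;
                unsigned int not_posted = 0;

                for (wr = failed_wr; wr; wr = wr->next)
                        not_posted++;
        }
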
@@ -720,32 +760,127 @@ add_header:
| 720 | printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " | 760 | printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " |
| 721 | "returned %d\n", &conn->c_faddr, ret); | 761 | "returned %d\n", &conn->c_faddr, ret); |
| 722 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | 762 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); |
| 723 | if (prev->s_rm) { | 763 | rds_ib_sub_signaled(ic, nr_sig); |
| 724 | ic->i_rm = prev->s_rm; | 764 | if (prev->s_op) { |
| 725 | prev->s_rm = NULL; | 765 | ic->i_data_op = prev->s_op; |
| 766 | prev->s_op = NULL; | ||
| 726 | } | 767 | } |
| 727 | 768 | ||
| 728 | rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); | 769 | rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); |
| 729 | goto out; | 770 | goto out; |
| 730 | } | 771 | } |
| 731 | 772 | ||
| 732 | ret = sent; | 773 | ret = bytes_sent; |
| 733 | out: | 774 | out: |
| 734 | BUG_ON(adv_credits); | 775 | BUG_ON(adv_credits); |
| 735 | return ret; | 776 | return ret; |
| 736 | } | 777 | } |
| 737 | 778 | ||
| 738 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | 779 | /* |
| 780 | * Issue atomic operation. | ||
| 781 | * A simplified version of the rdma case, we always map 1 SG, and | ||
| 782 | * only 8 bytes, for the return value from the atomic operation. | ||
| 783 | */ | ||
| 784 | int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) | ||
| 785 | { | ||
| 786 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
| 787 | struct rds_ib_send_work *send = NULL; | ||
| 788 | struct ib_send_wr *failed_wr; | ||
| 789 | struct rds_ib_device *rds_ibdev; | ||
| 790 | u32 pos; | ||
| 791 | u32 work_alloc; | ||
| 792 | int ret; | ||
| 793 | int nr_sig = 0; | ||
| 794 | |||
| 795 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | ||
| 796 | |||
| 797 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); | ||
| 798 | if (work_alloc != 1) { | ||
| 799 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
| 800 | rds_ib_stats_inc(s_ib_tx_ring_full); | ||
| 801 | ret = -ENOMEM; | ||
| 802 | goto out; | ||
| 803 | } | ||
| 804 | |||
| 805 | /* address of send request in ring */ | ||
| 806 | send = &ic->i_sends[pos]; | ||
| 807 | send->s_queued = jiffies; | ||
| 808 | |||
| 809 | if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { | ||
| 810 | send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; | ||
| 811 | send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare; | ||
| 812 | send->s_wr.wr.atomic.swap = op->op_m_cswp.swap; | ||
| 813 | send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask; | ||
| 814 | send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask; | ||
| 815 | } else { /* FADD */ | ||
| 816 | send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; | ||
| 817 | send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add; | ||
| 818 | send->s_wr.wr.atomic.swap = 0; | ||
| 819 | send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask; | ||
| 820 | send->s_wr.wr.atomic.swap_mask = 0; | ||
| 821 | } | ||
| 822 | nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); | ||
| 823 | send->s_wr.num_sge = 1; | ||
| 824 | send->s_wr.next = NULL; | ||
| 825 | send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; | ||
| 826 | send->s_wr.wr.atomic.rkey = op->op_rkey; | ||
| 827 | send->s_op = op; | ||
| 828 | rds_message_addref(container_of(send->s_op, struct rds_message, atomic)); | ||
| 829 | |||
| 830 | /* map 8 byte retval buffer to the device */ | ||
| 831 | ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); | ||
| 832 | rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret); | ||
| 833 | if (ret != 1) { | ||
| 834 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
| 835 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | ||
| 836 | ret = -ENOMEM; /* XXX ? */ | ||
| 837 | goto out; | ||
| 838 | } | ||
| 839 | |||
| 840 | /* Convert our struct scatterlist to struct ib_sge */ | ||
| 841 | send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); | ||
| 842 | send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); | ||
| 843 | send->s_sge[0].lkey = ic->i_mr->lkey; | ||
| 844 | |||
| 845 | rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, | ||
| 846 | send->s_sge[0].addr, send->s_sge[0].length); | ||
| 847 | |||
| 848 | if (nr_sig) | ||
| 849 | atomic_add(nr_sig, &ic->i_signaled_sends); | ||
| 850 | |||
| 851 | failed_wr = &send->s_wr; | ||
| 852 | ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr); | ||
| 853 | rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, | ||
| 854 | send, &send->s_wr, ret, failed_wr); | ||
| 855 | BUG_ON(failed_wr != &send->s_wr); | ||
| 856 | if (ret) { | ||
| 857 | printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " | ||
| 858 | "returned %d\n", &conn->c_faddr, ret); | ||
| 859 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
| 860 | rds_ib_sub_signaled(ic, nr_sig); | ||
| 861 | goto out; | ||
| 862 | } | ||
| 863 | |||
| 864 | if (unlikely(failed_wr != &send->s_wr)) { | ||
| 865 | printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret); | ||
| 866 | BUG_ON(failed_wr != &send->s_wr); | ||
| 867 | } | ||
| 868 | |||
| 869 | out: | ||
| 870 | return ret; | ||
| 871 | } | ||
| 872 | |||
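
rds_ib_xmit_atomic() above always issues the masked forms of the atomic verbs; a plain 64-bit compare-and-swap is simply the degenerate case in which every mask bit is set. A sketch of how a caller that does not care about masking might fill the op (field names from the diff, the all-ones masks are the assumption being illustrated):

        /* Plain CAS expressed through the masked interface. */
        op->op_type = RDS_ATOMIC_TYPE_CSWP;
        op->op_m_cswp.compare      = old_val;   /* expected value          */
        op->op_m_cswp.swap         = new_val;   /* value to install        */
        op->op_m_cswp.compare_mask = ~0ULL;     /* compare all 64 bits     */
        op->op_m_cswp.swap_mask    = ~0ULL;     /* replace all 64 bits     */
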
| 873 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) | ||
| 739 | { | 874 | { |
| 740 | struct rds_ib_connection *ic = conn->c_transport_data; | 875 | struct rds_ib_connection *ic = conn->c_transport_data; |
| 741 | struct rds_ib_send_work *send = NULL; | 876 | struct rds_ib_send_work *send = NULL; |
| 742 | struct rds_ib_send_work *first; | 877 | struct rds_ib_send_work *first; |
| 743 | struct rds_ib_send_work *prev; | 878 | struct rds_ib_send_work *prev; |
| 744 | struct ib_send_wr *failed_wr; | 879 | struct ib_send_wr *failed_wr; |
| 745 | struct rds_ib_device *rds_ibdev; | ||
| 746 | struct scatterlist *scat; | 880 | struct scatterlist *scat; |
| 747 | unsigned long len; | 881 | unsigned long len; |
| 748 | u64 remote_addr = op->r_remote_addr; | 882 | u64 remote_addr = op->op_remote_addr; |
| 883 | u32 max_sge = ic->rds_ibdev->max_sge; | ||
| 749 | u32 pos; | 884 | u32 pos; |
| 750 | u32 work_alloc; | 885 | u32 work_alloc; |
| 751 | u32 i; | 886 | u32 i; |
@@ -753,29 +888,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
| 753 | int sent; | 888 | int sent; |
| 754 | int ret; | 889 | int ret; |
| 755 | int num_sge; | 890 | int num_sge; |
| 756 | 891 | int nr_sig = 0; | |
| 757 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | 892 | |
| 758 | 893 | /* map the op the first time we see it */ | |
| 759 | /* map the message the first time we see it */ | 894 | if (!op->op_mapped) { |
| 760 | if (!op->r_mapped) { | 895 | op->op_count = ib_dma_map_sg(ic->i_cm_id->device, |
| 761 | op->r_count = ib_dma_map_sg(ic->i_cm_id->device, | 896 | op->op_sg, op->op_nents, (op->op_write) ? |
| 762 | op->r_sg, op->r_nents, (op->r_write) ? | 897 | DMA_TO_DEVICE : DMA_FROM_DEVICE); |
| 763 | DMA_TO_DEVICE : DMA_FROM_DEVICE); | 898 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); |
| 764 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); | 899 | if (op->op_count == 0) { |
| 765 | if (op->r_count == 0) { | ||
| 766 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | 900 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); |
| 767 | ret = -ENOMEM; /* XXX ? */ | 901 | ret = -ENOMEM; /* XXX ? */ |
| 768 | goto out; | 902 | goto out; |
| 769 | } | 903 | } |
| 770 | 904 | ||
| 771 | op->r_mapped = 1; | 905 | op->op_mapped = 1; |
| 772 | } | 906 | } |
| 773 | 907 | ||
| 774 | /* | 908 | /* |
| 775 | * Instead of knowing how to return a partial rdma read/write we insist that there | 909 | * Instead of knowing how to return a partial rdma read/write we insist that there |
| 776 | * be enough work requests to send the entire message. | 910 | * be enough work requests to send the entire message. |
| 777 | */ | 911 | */ |
| 778 | i = ceil(op->r_count, rds_ibdev->max_sge); | 912 | i = ceil(op->op_count, max_sge); |
| 779 | 913 | ||
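
The ceil() above sizes the request chain up front; as the preceding comment says, RDS insists on enough work requests for the entire message rather than returning a partial RDMA. A worked example with an assumed max_sge of 30:

        /* ceil() here is the usual round-up division, roughly
         *      #define ceil(x, y)  (((x) + (y) - 1) / (y))
         * so an op whose scatterlist mapped to 70 entries needs
         * ceil(70, 30) == 3 work requests: two carrying 30 SGEs each
         * and a final one carrying the remaining 10. */
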
| 780 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); | 914 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); |
| 781 | if (work_alloc != i) { | 915 | if (work_alloc != i) { |
@@ -788,30 +922,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
| 788 | send = &ic->i_sends[pos]; | 922 | send = &ic->i_sends[pos]; |
| 789 | first = send; | 923 | first = send; |
| 790 | prev = NULL; | 924 | prev = NULL; |
| 791 | scat = &op->r_sg[0]; | 925 | scat = &op->op_sg[0]; |
| 792 | sent = 0; | 926 | sent = 0; |
| 793 | num_sge = op->r_count; | 927 | num_sge = op->op_count; |
| 794 | 928 | ||
| 795 | for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { | 929 | for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { |
| 796 | send->s_wr.send_flags = 0; | 930 | send->s_wr.send_flags = 0; |
| 797 | send->s_queued = jiffies; | 931 | send->s_queued = jiffies; |
| 798 | /* | 932 | send->s_op = NULL; |
| 799 | * We want to delay signaling completions just enough to get | 933 | |
| 800 | * the batching benefits but not so much that we create dead time on the wire. | 934 | nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); |
| 801 | */ | ||
| 802 | if (ic->i_unsignaled_wrs-- == 0) { | ||
| 803 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
| 804 | send->s_wr.send_flags = IB_SEND_SIGNALED; | ||
| 805 | } | ||
| 806 | 935 | ||
| 807 | send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; | 936 | send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; |
| 808 | send->s_wr.wr.rdma.remote_addr = remote_addr; | 937 | send->s_wr.wr.rdma.remote_addr = remote_addr; |
| 809 | send->s_wr.wr.rdma.rkey = op->r_key; | 938 | send->s_wr.wr.rdma.rkey = op->op_rkey; |
| 810 | send->s_op = op; | ||
| 811 | 939 | ||
| 812 | if (num_sge > rds_ibdev->max_sge) { | 940 | if (num_sge > max_sge) { |
| 813 | send->s_wr.num_sge = rds_ibdev->max_sge; | 941 | send->s_wr.num_sge = max_sge; |
| 814 | num_sge -= rds_ibdev->max_sge; | 942 | num_sge -= max_sge; |
| 815 | } else { | 943 | } else { |
| 816 | send->s_wr.num_sge = num_sge; | 944 | send->s_wr.num_sge = num_sge; |
| 817 | } | 945 | } |
@@ -821,7 +949,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
| 821 | if (prev) | 949 | if (prev) |
| 822 | prev->s_wr.next = &send->s_wr; | 950 | prev->s_wr.next = &send->s_wr; |
| 823 | 951 | ||
| 824 | for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { | 952 | for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { |
| 825 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); | 953 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); |
| 826 | send->s_sge[j].addr = | 954 | send->s_sge[j].addr = |
| 827 | ib_sg_dma_address(ic->i_cm_id->device, scat); | 955 | ib_sg_dma_address(ic->i_cm_id->device, scat); |
@@ -843,15 +971,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
| 843 | send = ic->i_sends; | 971 | send = ic->i_sends; |
| 844 | } | 972 | } |
| 845 | 973 | ||
| 846 | /* if we finished the message then send completion owns it */ | 974 | /* give a reference to the last op */ |
| 847 | if (scat == &op->r_sg[op->r_count]) | 975 | if (scat == &op->op_sg[op->op_count]) { |
| 848 | prev->s_wr.send_flags = IB_SEND_SIGNALED; | 976 | prev->s_op = op; |
| 977 | rds_message_addref(container_of(op, struct rds_message, rdma)); | ||
| 978 | } | ||
| 849 | 979 | ||
| 850 | if (i < work_alloc) { | 980 | if (i < work_alloc) { |
| 851 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); | 981 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); |
| 852 | work_alloc = i; | 982 | work_alloc = i; |
| 853 | } | 983 | } |
| 854 | 984 | ||
| 985 | if (nr_sig) | ||
| 986 | atomic_add(nr_sig, &ic->i_signaled_sends); | ||
| 987 | |||
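
Giving only the last work request an s_op reference (the "give a reference to the last op" block above) means the rdma op's hold on its rds_message is dropped exactly once, when that final request completes. The matching completion side, condensed from the cq handler earlier in this diff:

        rm = rds_ib_send_unmap_op(ic, send, wc.status);
        if (send->s_op) {
                if (send->s_op == rm->m_final_op)
                        rds_message_unmapped(rm);
                rds_message_put(rm);    /* drops the addref taken at post time */
                send->s_op = NULL;
        }
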
| 855 | failed_wr = &first->s_wr; | 988 | failed_wr = &first->s_wr; |
| 856 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); | 989 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); |
| 857 | rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, | 990 | rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, |
@@ -861,6 +994,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
| 861 | printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " | 994 | printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " |
| 862 | "returned %d\n", &conn->c_faddr, ret); | 995 | "returned %d\n", &conn->c_faddr, ret); |
| 863 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | 996 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); |
| 997 | rds_ib_sub_signaled(ic, nr_sig); | ||
| 864 | goto out; | 998 | goto out; |
| 865 | } | 999 | } |
| 866 | 1000 | ||
