Diffstat (limited to 'net/rds/ib_send.c')
-rw-r--r-- | net/rds/ib_send.c | 682 |
1 file changed, 408 insertions(+), 274 deletions(-)
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 17fa80803ab0..71f373c421bc 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -36,11 +36,49 @@
36 | #include <linux/dmapool.h> | 36 | #include <linux/dmapool.h> |
37 | 37 | ||
38 | #include "rds.h" | 38 | #include "rds.h" |
39 | #include "rdma.h" | ||
40 | #include "ib.h" | 39 | #include "ib.h" |
41 | 40 | ||
42 | static void rds_ib_send_rdma_complete(struct rds_message *rm, | 41 | static char *rds_ib_wc_status_strings[] = { |
43 | int wc_status) | 42 | #define RDS_IB_WC_STATUS_STR(foo) \ |
43 | [IB_WC_##foo] = __stringify(IB_WC_##foo) | ||
44 | RDS_IB_WC_STATUS_STR(SUCCESS), | ||
45 | RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), | ||
46 | RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), | ||
47 | RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), | ||
48 | RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), | ||
49 | RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), | ||
50 | RDS_IB_WC_STATUS_STR(MW_BIND_ERR), | ||
51 | RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), | ||
52 | RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), | ||
53 | RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), | ||
54 | RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), | ||
55 | RDS_IB_WC_STATUS_STR(REM_OP_ERR), | ||
56 | RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), | ||
57 | RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), | ||
58 | RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), | ||
59 | RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), | ||
60 | RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), | ||
61 | RDS_IB_WC_STATUS_STR(INV_EECN_ERR), | ||
62 | RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), | ||
63 | RDS_IB_WC_STATUS_STR(FATAL_ERR), | ||
64 | RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), | ||
65 | RDS_IB_WC_STATUS_STR(GENERAL_ERR), | ||
66 | #undef RDS_IB_WC_STATUS_STR | ||
67 | }; | ||
68 | |||
69 | char *rds_ib_wc_status_str(enum ib_wc_status status) | ||
70 | { | ||
71 | return rds_str_array(rds_ib_wc_status_strings, | ||
72 | ARRAY_SIZE(rds_ib_wc_status_strings), status); | ||
73 | } | ||
74 | |||
75 | /* | ||
76 | * Convert IB-specific error message to RDS error message and call core | ||
77 | * completion handler. | ||
78 | */ | ||
79 | static void rds_ib_send_complete(struct rds_message *rm, | ||
80 | int wc_status, | ||
81 | void (*complete)(struct rds_message *rm, int status)) | ||
44 | { | 82 | { |
45 | int notify_status; | 83 | int notify_status; |
46 | 84 | ||
@@ -60,69 +98,125 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm,
60 | notify_status = RDS_RDMA_OTHER_ERROR; | 98 | notify_status = RDS_RDMA_OTHER_ERROR; |
61 | break; | 99 | break; |
62 | } | 100 | } |
63 | rds_rdma_send_complete(rm, notify_status); | 101 | complete(rm, notify_status); |
102 | } | ||
103 | |||
104 | static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, | ||
105 | struct rm_data_op *op, | ||
106 | int wc_status) | ||
107 | { | ||
108 | if (op->op_nents) | ||
109 | ib_dma_unmap_sg(ic->i_cm_id->device, | ||
110 | op->op_sg, op->op_nents, | ||
111 | DMA_TO_DEVICE); | ||
64 | } | 112 | } |
65 | 113 | ||
66 | static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, | 114 | static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, |
67 | struct rds_rdma_op *op) | 115 | struct rm_rdma_op *op, |
116 | int wc_status) | ||
68 | { | 117 | { |
69 | if (op->r_mapped) { | 118 | if (op->op_mapped) { |
70 | ib_dma_unmap_sg(ic->i_cm_id->device, | 119 | ib_dma_unmap_sg(ic->i_cm_id->device, |
71 | op->r_sg, op->r_nents, | 120 | op->op_sg, op->op_nents, |
72 | op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); | 121 | op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); |
73 | op->r_mapped = 0; | 122 | op->op_mapped = 0; |
74 | } | 123 | } |
124 | |||
125 | /* If the user asked for a completion notification on this | ||
126 | * message, we can implement three different semantics: | ||
127 | * 1. Notify when we received the ACK on the RDS message | ||
128 | * that was queued with the RDMA. This provides reliable | ||
129 | * notification of RDMA status at the expense of a one-way | ||
130 | * packet delay. | ||
131 | * 2. Notify when the IB stack gives us the completion event for | ||
132 | * the RDMA operation. | ||
133 | * 3. Notify when the IB stack gives us the completion event for | ||
134 | * the accompanying RDS messages. | ||
135 | * Here, we implement approach #3. To implement approach #2, | ||
136 | * we would need to take an event for the rdma WR. To implement #1, | ||
137 | * don't call rds_rdma_send_complete at all, and fall back to the notify | ||
138 | * handling in the ACK processing code. | ||
139 | * | ||
140 | * Note: There's no need to explicitly sync any RDMA buffers using | ||
141 | * ib_dma_sync_sg_for_cpu - the completion for the RDMA | ||
142 | * operation itself unmapped the RDMA buffers, which takes care | ||
143 | * of synching. | ||
144 | */ | ||
145 | rds_ib_send_complete(container_of(op, struct rds_message, rdma), | ||
146 | wc_status, rds_rdma_send_complete); | ||
147 | |||
148 | if (op->op_write) | ||
149 | rds_stats_add(s_send_rdma_bytes, op->op_bytes); | ||
150 | else | ||
151 | rds_stats_add(s_recv_rdma_bytes, op->op_bytes); | ||
75 | } | 152 | } |
76 | 153 | ||
77 | static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, | 154 | static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, |
78 | struct rds_ib_send_work *send, | 155 | struct rm_atomic_op *op, |
79 | int wc_status) | 156 | int wc_status) |
80 | { | 157 | { |
81 | struct rds_message *rm = send->s_rm; | 158 | /* unmap atomic recvbuf */ |
82 | 159 | if (op->op_mapped) { | |
83 | rdsdebug("ic %p send %p rm %p\n", ic, send, rm); | 160 | ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, |
84 | 161 | DMA_FROM_DEVICE); | |
85 | ib_dma_unmap_sg(ic->i_cm_id->device, | 162 | op->op_mapped = 0; |
86 | rm->m_sg, rm->m_nents, | 163 | } |
87 | DMA_TO_DEVICE); | ||
88 | |||
89 | if (rm->m_rdma_op != NULL) { | ||
90 | rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); | ||
91 | |||
92 | /* If the user asked for a completion notification on this | ||
93 | * message, we can implement three different semantics: | ||
94 | * 1. Notify when we received the ACK on the RDS message | ||
95 | * that was queued with the RDMA. This provides reliable | ||
96 | * notification of RDMA status at the expense of a one-way | ||
97 | * packet delay. | ||
98 | * 2. Notify when the IB stack gives us the completion event for | ||
99 | * the RDMA operation. | ||
100 | * 3. Notify when the IB stack gives us the completion event for | ||
101 | * the accompanying RDS messages. | ||
102 | * Here, we implement approach #3. To implement approach #2, | ||
103 | * call rds_rdma_send_complete from the cq_handler. To implement #1, | ||
104 | * don't call rds_rdma_send_complete at all, and fall back to the notify | ||
105 | * handling in the ACK processing code. | ||
106 | * | ||
107 | * Note: There's no need to explicitly sync any RDMA buffers using | ||
108 | * ib_dma_sync_sg_for_cpu - the completion for the RDMA | ||
109 | * operation itself unmapped the RDMA buffers, which takes care | ||
110 | * of synching. | ||
111 | */ | ||
112 | rds_ib_send_rdma_complete(rm, wc_status); | ||
113 | 164 | ||
114 | if (rm->m_rdma_op->r_write) | 165 | rds_ib_send_complete(container_of(op, struct rds_message, atomic), |
115 | rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); | 166 | wc_status, rds_atomic_send_complete); |
116 | else | 167 | |
117 | rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); | 168 | if (op->op_type == RDS_ATOMIC_TYPE_CSWP) |
169 | rds_ib_stats_inc(s_ib_atomic_cswp); | ||
170 | else | ||
171 | rds_ib_stats_inc(s_ib_atomic_fadd); | ||
172 | } | ||
173 | |||
174 | /* | ||
175 | * Unmap the resources associated with a struct send_work. | ||
176 | * | ||
177 | * Returns the rm for no good reason other than it is unobtainable | ||
178 | * other than by switching on wr.opcode, currently, and the caller, | ||
179 | * the event handler, needs it. | ||
180 | */ | ||
181 | static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic, | ||
182 | struct rds_ib_send_work *send, | ||
183 | int wc_status) | ||
184 | { | ||
185 | struct rds_message *rm = NULL; | ||
186 | |||
187 | /* In the error case, wc.opcode sometimes contains garbage */ | ||
188 | switch (send->s_wr.opcode) { | ||
189 | case IB_WR_SEND: | ||
190 | if (send->s_op) { | ||
191 | rm = container_of(send->s_op, struct rds_message, data); | ||
192 | rds_ib_send_unmap_data(ic, send->s_op, wc_status); | ||
193 | } | ||
194 | break; | ||
195 | case IB_WR_RDMA_WRITE: | ||
196 | case IB_WR_RDMA_READ: | ||
197 | if (send->s_op) { | ||
198 | rm = container_of(send->s_op, struct rds_message, rdma); | ||
199 | rds_ib_send_unmap_rdma(ic, send->s_op, wc_status); | ||
200 | } | ||
201 | break; | ||
202 | case IB_WR_ATOMIC_FETCH_AND_ADD: | ||
203 | case IB_WR_ATOMIC_CMP_AND_SWP: | ||
204 | if (send->s_op) { | ||
205 | rm = container_of(send->s_op, struct rds_message, atomic); | ||
206 | rds_ib_send_unmap_atomic(ic, send->s_op, wc_status); | ||
207 | } | ||
208 | break; | ||
209 | default: | ||
210 | if (printk_ratelimit()) | ||
211 | printk(KERN_NOTICE | ||
212 | "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", | ||
213 | __func__, send->s_wr.opcode); | ||
214 | break; | ||
118 | } | 215 | } |
119 | 216 | ||
120 | /* If anyone waited for this message to get flushed out, wake | 217 | send->s_wr.opcode = 0xdead; |
121 | * them up now */ | ||
122 | rds_message_unmapped(rm); | ||
123 | 218 | ||
124 | rds_message_put(rm); | 219 | return rm; |
125 | send->s_rm = NULL; | ||
126 | } | 220 | } |
127 | 221 | ||
128 | void rds_ib_send_init_ring(struct rds_ib_connection *ic) | 222 | void rds_ib_send_init_ring(struct rds_ib_connection *ic) |
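Editorial aside, not from the patch: in the new unmap path each rds_ib_send_work carries s_op, a pointer to the specific op it posted (data, rdma or atomic), and the owning rds_message is recovered with container_of() on that embedded member, switching on the WR opcode to pick the right member name. A small userspace sketch of the recovery idiom, with an abbreviated struct layout (the field names follow the patch, everything else is illustrative):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rm_rdma_op { int op_mapped; };
struct rm_data_op { int op_nents; };

struct rds_message {
	struct rm_rdma_op rdma;
	struct rm_data_op data;
};

int main(void)
{
	struct rds_message msg = { .rdma = { 0 }, .data = { 4 } };
	struct rm_data_op *op = &msg.data;

	/* same idiom as container_of(send->s_op, struct rds_message, data) */
	struct rds_message *rm = container_of(op, struct rds_message, data);

	printf("recovered rm, data.op_nents = %d\n", rm->data.op_nents);
	return 0;
}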
@@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
133 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { | 227 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { |
134 | struct ib_sge *sge; | 228 | struct ib_sge *sge; |
135 | 229 | ||
136 | send->s_rm = NULL; | ||
137 | send->s_op = NULL; | 230 | send->s_op = NULL; |
138 | 231 | ||
139 | send->s_wr.wr_id = i; | 232 | send->s_wr.wr_id = i; |
140 | send->s_wr.sg_list = send->s_sge; | 233 | send->s_wr.sg_list = send->s_sge; |
141 | send->s_wr.num_sge = 1; | ||
142 | send->s_wr.opcode = IB_WR_SEND; | ||
143 | send->s_wr.send_flags = 0; | ||
144 | send->s_wr.ex.imm_data = 0; | 234 | send->s_wr.ex.imm_data = 0; |
145 | 235 | ||
146 | sge = rds_ib_data_sge(ic, send->s_sge); | 236 | sge = &send->s_sge[0]; |
147 | sge->lkey = ic->i_mr->lkey; | ||
148 | |||
149 | sge = rds_ib_header_sge(ic, send->s_sge); | ||
150 | sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); | 237 | sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); |
151 | sge->length = sizeof(struct rds_header); | 238 | sge->length = sizeof(struct rds_header); |
152 | sge->lkey = ic->i_mr->lkey; | 239 | sge->lkey = ic->i_mr->lkey; |
240 | |||
241 | send->s_sge[1].lkey = ic->i_mr->lkey; | ||
153 | } | 242 | } |
154 | } | 243 | } |
155 | 244 | ||
@@ -159,16 +248,24 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
159 | u32 i; | 248 | u32 i; |
160 | 249 | ||
161 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { | 250 | for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { |
162 | if (send->s_wr.opcode == 0xdead) | 251 | if (send->s_op && send->s_wr.opcode != 0xdead) |
163 | continue; | 252 | rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR); |
164 | if (send->s_rm) | ||
165 | rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); | ||
166 | if (send->s_op) | ||
167 | rds_ib_send_unmap_rdma(ic, send->s_op); | ||
168 | } | 253 | } |
169 | } | 254 | } |
170 | 255 | ||
171 | /* | 256 | /* |
257 | * The only fast path caller always has a non-zero nr, so we don't | ||
258 | * bother testing nr before performing the atomic sub. | ||
259 | */ | ||
260 | static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) | ||
261 | { | ||
262 | if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) && | ||
263 | waitqueue_active(&rds_ib_ring_empty_wait)) | ||
264 | wake_up(&rds_ib_ring_empty_wait); | ||
265 | BUG_ON(atomic_read(&ic->i_signaled_sends) < 0); | ||
266 | } | ||
267 | |||
268 | /* | ||
172 | * The _oldest/_free ring operations here race cleanly with the alloc/unalloc | 269 | * The _oldest/_free ring operations here race cleanly with the alloc/unalloc |
173 | * operations performed in the send path. As the sender allocs and potentially | 270 | * operations performed in the send path. As the sender allocs and potentially |
174 | * unallocs the next free entry in the ring it doesn't alter which is | 271 | * unallocs the next free entry in the ring it doesn't alter which is |
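Aside, not part of the patch: rds_ib_sub_signaled() above is the release half of a simple counter protocol. The send paths later in this diff do atomic_add(nr_sig, &ic->i_signaled_sends) for every work request posted with IB_SEND_SIGNALED, and the completion and error paths subtract the same amount, waking anyone waiting on rds_ib_ring_empty_wait once the count reaches zero. A kernel-flavoured sketch of that pairing (illustrative names, not literal patch code):

#include <linux/atomic.h>
#include <linux/wait.h>

static atomic_t signaled_sends = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(ring_empty_wait);

/* send side: account for every WR posted with IB_SEND_SIGNALED */
static void track_signaled(int nr_sig)
{
	if (nr_sig)
		atomic_add(nr_sig, &signaled_sends);
}

/* completion or post-failure side: release the count, wake waiters on zero */
static void untrack_signaled(int nr_sig)
{
	if (atomic_sub_return(nr_sig, &signaled_sends) == 0 &&
	    waitqueue_active(&ring_empty_wait))
		wake_up(&ring_empty_wait);
}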
@@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
178 | { | 275 | { |
179 | struct rds_connection *conn = context; | 276 | struct rds_connection *conn = context; |
180 | struct rds_ib_connection *ic = conn->c_transport_data; | 277 | struct rds_ib_connection *ic = conn->c_transport_data; |
278 | struct rds_message *rm = NULL; | ||
181 | struct ib_wc wc; | 279 | struct ib_wc wc; |
182 | struct rds_ib_send_work *send; | 280 | struct rds_ib_send_work *send; |
183 | u32 completed; | 281 | u32 completed; |
184 | u32 oldest; | 282 | u32 oldest; |
185 | u32 i = 0; | 283 | u32 i = 0; |
186 | int ret; | 284 | int ret; |
285 | int nr_sig = 0; | ||
187 | 286 | ||
188 | rdsdebug("cq %p conn %p\n", cq, conn); | 287 | rdsdebug("cq %p conn %p\n", cq, conn); |
189 | rds_ib_stats_inc(s_ib_tx_cq_call); | 288 | rds_ib_stats_inc(s_ib_tx_cq_call); |
@@ -192,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
192 | rdsdebug("ib_req_notify_cq send failed: %d\n", ret); | 291 | rdsdebug("ib_req_notify_cq send failed: %d\n", ret); |
193 | 292 | ||
194 | while (ib_poll_cq(cq, 1, &wc) > 0) { | 293 | while (ib_poll_cq(cq, 1, &wc) > 0) { |
195 | rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", | 294 | rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", |
196 | (unsigned long long)wc.wr_id, wc.status, wc.byte_len, | 295 | (unsigned long long)wc.wr_id, wc.status, |
296 | rds_ib_wc_status_str(wc.status), wc.byte_len, | ||
197 | be32_to_cpu(wc.ex.imm_data)); | 297 | be32_to_cpu(wc.ex.imm_data)); |
198 | rds_ib_stats_inc(s_ib_tx_cq_event); | 298 | rds_ib_stats_inc(s_ib_tx_cq_event); |
199 | 299 | ||
@@ -210,51 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
210 | 310 | ||
211 | for (i = 0; i < completed; i++) { | 311 | for (i = 0; i < completed; i++) { |
212 | send = &ic->i_sends[oldest]; | 312 | send = &ic->i_sends[oldest]; |
313 | if (send->s_wr.send_flags & IB_SEND_SIGNALED) | ||
314 | nr_sig++; | ||
213 | 315 | ||
214 | /* In the error case, wc.opcode sometimes contains garbage */ | 316 | rm = rds_ib_send_unmap_op(ic, send, wc.status); |
215 | switch (send->s_wr.opcode) { | ||
216 | case IB_WR_SEND: | ||
217 | if (send->s_rm) | ||
218 | rds_ib_send_unmap_rm(ic, send, wc.status); | ||
219 | break; | ||
220 | case IB_WR_RDMA_WRITE: | ||
221 | case IB_WR_RDMA_READ: | ||
222 | /* Nothing to be done - the SG list will be unmapped | ||
223 | * when the SEND completes. */ | ||
224 | break; | ||
225 | default: | ||
226 | if (printk_ratelimit()) | ||
227 | printk(KERN_NOTICE | ||
228 | "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", | ||
229 | __func__, send->s_wr.opcode); | ||
230 | break; | ||
231 | } | ||
232 | 317 | ||
233 | send->s_wr.opcode = 0xdead; | ||
234 | send->s_wr.num_sge = 1; | ||
235 | if (send->s_queued + HZ/2 < jiffies) | 318 | if (send->s_queued + HZ/2 < jiffies) |
236 | rds_ib_stats_inc(s_ib_tx_stalled); | 319 | rds_ib_stats_inc(s_ib_tx_stalled); |
237 | 320 | ||
238 | /* If a RDMA operation produced an error, signal this right | 321 | if (send->s_op) { |
239 | * away. If we don't, the subsequent SEND that goes with this | 322 | if (send->s_op == rm->m_final_op) { |
240 | * RDMA will be canceled with ERR_WFLUSH, and the application | 323 | /* If anyone waited for this message to get flushed out, wake |
241 | * never learn that the RDMA failed. */ | 324 | * them up now */ |
242 | if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { | 325 | rds_message_unmapped(rm); |
243 | struct rds_message *rm; | ||
244 | |||
245 | rm = rds_send_get_message(conn, send->s_op); | ||
246 | if (rm) { | ||
247 | if (rm->m_rdma_op) | ||
248 | rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); | ||
249 | rds_ib_send_rdma_complete(rm, wc.status); | ||
250 | rds_message_put(rm); | ||
251 | } | 326 | } |
327 | rds_message_put(rm); | ||
328 | send->s_op = NULL; | ||
252 | } | 329 | } |
253 | 330 | ||
254 | oldest = (oldest + 1) % ic->i_send_ring.w_nr; | 331 | oldest = (oldest + 1) % ic->i_send_ring.w_nr; |
255 | } | 332 | } |
256 | 333 | ||
257 | rds_ib_ring_free(&ic->i_send_ring, completed); | 334 | rds_ib_ring_free(&ic->i_send_ring, completed); |
335 | rds_ib_sub_signaled(ic, nr_sig); | ||
336 | nr_sig = 0; | ||
258 | 337 | ||
259 | if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || | 338 | if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || |
260 | test_bit(0, &conn->c_map_queued)) | 339 | test_bit(0, &conn->c_map_queued)) |
@@ -262,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
262 | 341 | ||
263 | /* We expect errors as the qp is drained during shutdown */ | 342 | /* We expect errors as the qp is drained during shutdown */ |
264 | if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { | 343 | if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { |
265 | rds_ib_conn_error(conn, | 344 | rds_ib_conn_error(conn, "send completion on %pI4 had status " |
266 | "send completion on %pI4 " | 345 | "%u (%s), disconnecting and reconnecting\n", |
267 | "had status %u, disconnecting and reconnecting\n", | 346 | &conn->c_faddr, wc.status, |
268 | &conn->c_faddr, wc.status); | 347 | rds_ib_wc_status_str(wc.status)); |
269 | } | 348 | } |
270 | } | 349 | } |
271 | } | 350 | } |
@@ -294,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
294 | * credits (see rds_ib_send_add_credits below). | 373 | * credits (see rds_ib_send_add_credits below). |
295 | * | 374 | * |
296 | * The RDS send code is essentially single-threaded; rds_send_xmit | 375 | * The RDS send code is essentially single-threaded; rds_send_xmit |
297 | * grabs c_send_lock to ensure exclusive access to the send ring. | 376 | * sets RDS_IN_XMIT to ensure exclusive access to the send ring. |
298 | * However, the ACK sending code is independent and can race with | 377 | * However, the ACK sending code is independent and can race with |
299 | * message SENDs. | 378 | * message SENDs. |
300 | * | 379 | * |
@@ -413,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
413 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); | 492 | set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); |
414 | } | 493 | } |
415 | 494 | ||
416 | static inline void | 495 | static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, |
417 | rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, | 496 | struct rds_ib_send_work *send, |
418 | struct rds_ib_send_work *send, unsigned int pos, | 497 | bool notify) |
419 | unsigned long buffer, unsigned int length, | ||
420 | int send_flags) | ||
421 | { | 498 | { |
422 | struct ib_sge *sge; | 499 | /* |
423 | 500 | * We want to delay signaling completions just enough to get | |
424 | WARN_ON(pos != send - ic->i_sends); | 501 | * the batching benefits but not so much that we create dead time |
425 | 502 | * on the wire. | |
426 | send->s_wr.send_flags = send_flags; | 503 | */ |
427 | send->s_wr.opcode = IB_WR_SEND; | 504 | if (ic->i_unsignaled_wrs-- == 0 || notify) { |
428 | send->s_wr.num_sge = 2; | 505 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; |
429 | send->s_wr.next = NULL; | 506 | send->s_wr.send_flags |= IB_SEND_SIGNALED; |
430 | send->s_queued = jiffies; | 507 | return 1; |
431 | send->s_op = NULL; | ||
432 | |||
433 | if (length != 0) { | ||
434 | sge = rds_ib_data_sge(ic, send->s_sge); | ||
435 | sge->addr = buffer; | ||
436 | sge->length = length; | ||
437 | sge->lkey = ic->i_mr->lkey; | ||
438 | |||
439 | sge = rds_ib_header_sge(ic, send->s_sge); | ||
440 | } else { | ||
441 | /* We're sending a packet with no payload. There is only | ||
442 | * one SGE */ | ||
443 | send->s_wr.num_sge = 1; | ||
444 | sge = &send->s_sge[0]; | ||
445 | } | 508 | } |
446 | 509 | return 0; | |
447 | sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); | ||
448 | sge->length = sizeof(struct rds_header); | ||
449 | sge->lkey = ic->i_mr->lkey; | ||
450 | } | 510 | } |
451 | 511 | ||
452 | /* | 512 | /* |
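An aside (not from the patch): rds_ib_set_wr_signal_state() above centralises the completion-batching rule that used to be open-coded in the send loops — only every Nth work request, or one the caller explicitly wants notified, gets IB_SEND_SIGNALED, and the return value lets the caller tally how many signalled WRs it posted. A standalone sketch of that counting rule, assuming a fixed batch size in place of rds_ib_sysctl_max_unsig_wrs:

#include <stdbool.h>
#include <stdio.h>

#define MAX_UNSIGNALED	16	/* stand-in for rds_ib_sysctl_max_unsig_wrs */

static unsigned int unsignaled_wrs = MAX_UNSIGNALED;

/* returns 1 when the caller should set IB_SEND_SIGNALED and count the WR */
static int wr_should_signal(bool notify)
{
	if (unsignaled_wrs-- == 0 || notify) {
		unsignaled_wrs = MAX_UNSIGNALED;
		return 1;
	}
	return 0;
}

int main(void)
{
	int nr_sig = 0;

	for (int i = 0; i < 100; i++)
		nr_sig += wr_should_signal(false);

	/* roughly one in every MAX_UNSIGNALED + 1 posts gets signalled */
	printf("signalled %d of 100 WRs\n", nr_sig);
	return 0;
}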
@@ -475,13 +535,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
475 | u32 pos; | 535 | u32 pos; |
476 | u32 i; | 536 | u32 i; |
477 | u32 work_alloc; | 537 | u32 work_alloc; |
478 | u32 credit_alloc; | 538 | u32 credit_alloc = 0; |
479 | u32 posted; | 539 | u32 posted; |
480 | u32 adv_credits = 0; | 540 | u32 adv_credits = 0; |
481 | int send_flags = 0; | 541 | int send_flags = 0; |
482 | int sent; | 542 | int bytes_sent = 0; |
483 | int ret; | 543 | int ret; |
484 | int flow_controlled = 0; | 544 | int flow_controlled = 0; |
545 | int nr_sig = 0; | ||
485 | 546 | ||
486 | BUG_ON(off % RDS_FRAG_SIZE); | 547 | BUG_ON(off % RDS_FRAG_SIZE); |
487 | BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); | 548 | BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); |
@@ -507,14 +568,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
507 | goto out; | 568 | goto out; |
508 | } | 569 | } |
509 | 570 | ||
510 | credit_alloc = work_alloc; | ||
511 | if (ic->i_flowctl) { | 571 | if (ic->i_flowctl) { |
512 | credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); | 572 | credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); |
513 | adv_credits += posted; | 573 | adv_credits += posted; |
514 | if (credit_alloc < work_alloc) { | 574 | if (credit_alloc < work_alloc) { |
515 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); | 575 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); |
516 | work_alloc = credit_alloc; | 576 | work_alloc = credit_alloc; |
517 | flow_controlled++; | 577 | flow_controlled = 1; |
518 | } | 578 | } |
519 | if (work_alloc == 0) { | 579 | if (work_alloc == 0) { |
520 | set_bit(RDS_LL_SEND_FULL, &conn->c_flags); | 580 | set_bit(RDS_LL_SEND_FULL, &conn->c_flags); |
@@ -525,31 +585,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
525 | } | 585 | } |
526 | 586 | ||
527 | /* map the message the first time we see it */ | 587 | /* map the message the first time we see it */ |
528 | if (ic->i_rm == NULL) { | 588 | if (!ic->i_data_op) { |
529 | /* | 589 | if (rm->data.op_nents) { |
530 | printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", | 590 | rm->data.op_count = ib_dma_map_sg(dev, |
531 | be16_to_cpu(rm->m_inc.i_hdr.h_dport), | 591 | rm->data.op_sg, |
532 | rm->m_inc.i_hdr.h_flags, | 592 | rm->data.op_nents, |
533 | be32_to_cpu(rm->m_inc.i_hdr.h_len)); | 593 | DMA_TO_DEVICE); |
534 | */ | 594 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); |
535 | if (rm->m_nents) { | 595 | if (rm->data.op_count == 0) { |
536 | rm->m_count = ib_dma_map_sg(dev, | ||
537 | rm->m_sg, rm->m_nents, DMA_TO_DEVICE); | ||
538 | rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); | ||
539 | if (rm->m_count == 0) { | ||
540 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | 596 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); |
541 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | 597 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); |
542 | ret = -ENOMEM; /* XXX ? */ | 598 | ret = -ENOMEM; /* XXX ? */ |
543 | goto out; | 599 | goto out; |
544 | } | 600 | } |
545 | } else { | 601 | } else { |
546 | rm->m_count = 0; | 602 | rm->data.op_count = 0; |
547 | } | 603 | } |
548 | 604 | ||
549 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
550 | ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; | ||
551 | rds_message_addref(rm); | 605 | rds_message_addref(rm); |
552 | ic->i_rm = rm; | 606 | ic->i_data_op = &rm->data; |
553 | 607 | ||
554 | /* Finalize the header */ | 608 | /* Finalize the header */ |
555 | if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) | 609 | if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) |
@@ -559,10 +613,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
559 | 613 | ||
560 | /* If it has a RDMA op, tell the peer we did it. This is | 614 | /* If it has a RDMA op, tell the peer we did it. This is |
561 | * used by the peer to release use-once RDMA MRs. */ | 615 | * used by the peer to release use-once RDMA MRs. */ |
562 | if (rm->m_rdma_op) { | 616 | if (rm->rdma.op_active) { |
563 | struct rds_ext_header_rdma ext_hdr; | 617 | struct rds_ext_header_rdma ext_hdr; |
564 | 618 | ||
565 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); | 619 | ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); |
566 | rds_message_add_extension(&rm->m_inc.i_hdr, | 620 | rds_message_add_extension(&rm->m_inc.i_hdr, |
567 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); | 621 | RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); |
568 | } | 622 | } |
@@ -582,99 +636,77 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
582 | /* | 636 | /* |
583 | * Update adv_credits since we reset the ACK_REQUIRED bit. | 637 | * Update adv_credits since we reset the ACK_REQUIRED bit. |
584 | */ | 638 | */ |
585 | rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); | 639 | if (ic->i_flowctl) { |
586 | adv_credits += posted; | 640 | rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); |
587 | BUG_ON(adv_credits > 255); | 641 | adv_credits += posted; |
642 | BUG_ON(adv_credits > 255); | ||
643 | } | ||
588 | } | 644 | } |
589 | 645 | ||
590 | send = &ic->i_sends[pos]; | ||
591 | first = send; | ||
592 | prev = NULL; | ||
593 | scat = &rm->m_sg[sg]; | ||
594 | sent = 0; | ||
595 | i = 0; | ||
596 | |||
597 | /* Sometimes you want to put a fence between an RDMA | 646 | /* Sometimes you want to put a fence between an RDMA |
598 | * READ and the following SEND. | 647 | * READ and the following SEND. |
599 | * We could either do this all the time | 648 | * We could either do this all the time |
600 | * or when requested by the user. Right now, we let | 649 | * or when requested by the user. Right now, we let |
601 | * the application choose. | 650 | * the application choose. |
602 | */ | 651 | */ |
603 | if (rm->m_rdma_op && rm->m_rdma_op->r_fence) | 652 | if (rm->rdma.op_active && rm->rdma.op_fence) |
604 | send_flags = IB_SEND_FENCE; | 653 | send_flags = IB_SEND_FENCE; |
605 | 654 | ||
606 | /* | 655 | /* Each frag gets a header. Msgs may be 0 bytes */ |
607 | * We could be copying the header into the unused tail of the page. | 656 | send = &ic->i_sends[pos]; |
608 | * That would need to be changed in the future when those pages might | 657 | first = send; |
609 | * be mapped userspace pages or page cache pages. So instead we always | 658 | prev = NULL; |
610 | * use a second sge and our long-lived ring of mapped headers. We send | 659 | scat = &ic->i_data_op->op_sg[sg]; |
611 | * the header after the data so that the data payload can be aligned on | 660 | i = 0; |
612 | * the receiver. | 661 | do { |
613 | */ | 662 | unsigned int len = 0; |
614 | 663 | ||
615 | /* handle a 0-len message */ | 664 | /* Set up the header */ |
616 | if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { | 665 | send->s_wr.send_flags = send_flags; |
617 | rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); | 666 | send->s_wr.opcode = IB_WR_SEND; |
618 | goto add_header; | 667 | send->s_wr.num_sge = 1; |
619 | } | 668 | send->s_wr.next = NULL; |
669 | send->s_queued = jiffies; | ||
670 | send->s_op = NULL; | ||
620 | 671 | ||
621 | /* if there's data reference it with a chain of work reqs */ | 672 | send->s_sge[0].addr = ic->i_send_hdrs_dma |
622 | for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { | 673 | + (pos * sizeof(struct rds_header)); |
623 | unsigned int len; | 674 | send->s_sge[0].length = sizeof(struct rds_header); |
624 | 675 | ||
625 | send = &ic->i_sends[pos]; | 676 | memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); |
626 | 677 | ||
627 | len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); | 678 | /* Set up the data, if present */ |
628 | rds_ib_xmit_populate_wr(ic, send, pos, | 679 | if (i < work_alloc |
629 | ib_sg_dma_address(dev, scat) + off, len, | 680 | && scat != &rm->data.op_sg[rm->data.op_count]) { |
630 | send_flags); | 681 | len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); |
682 | send->s_wr.num_sge = 2; | ||
631 | 683 | ||
632 | /* | 684 | send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; |
633 | * We want to delay signaling completions just enough to get | 685 | send->s_sge[1].length = len; |
634 | * the batching benefits but not so much that we create dead time | ||
635 | * on the wire. | ||
636 | */ | ||
637 | if (ic->i_unsignaled_wrs-- == 0) { | ||
638 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
639 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | ||
640 | } | ||
641 | 686 | ||
642 | ic->i_unsignaled_bytes -= len; | 687 | bytes_sent += len; |
643 | if (ic->i_unsignaled_bytes <= 0) { | 688 | off += len; |
644 | ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; | 689 | if (off == ib_sg_dma_len(dev, scat)) { |
645 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 690 | scat++; |
691 | off = 0; | ||
692 | } | ||
646 | } | 693 | } |
647 | 694 | ||
695 | rds_ib_set_wr_signal_state(ic, send, 0); | ||
696 | |||
648 | /* | 697 | /* |
649 | * Always signal the last one if we're stopping due to flow control. | 698 | * Always signal the last one if we're stopping due to flow control. |
650 | */ | 699 | */ |
651 | if (flow_controlled && i == (work_alloc-1)) | 700 | if (ic->i_flowctl && flow_controlled && i == (work_alloc-1)) |
652 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 701 | send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; |
653 | 702 | ||
703 | if (send->s_wr.send_flags & IB_SEND_SIGNALED) | ||
704 | nr_sig++; | ||
705 | |||
654 | rdsdebug("send %p wr %p num_sge %u next %p\n", send, | 706 | rdsdebug("send %p wr %p num_sge %u next %p\n", send, |
655 | &send->s_wr, send->s_wr.num_sge, send->s_wr.next); | 707 | &send->s_wr, send->s_wr.num_sge, send->s_wr.next); |
656 | 708 | ||
657 | sent += len; | 709 | if (ic->i_flowctl && adv_credits) { |
658 | off += len; | ||
659 | if (off == ib_sg_dma_len(dev, scat)) { | ||
660 | scat++; | ||
661 | off = 0; | ||
662 | } | ||
663 | |||
664 | add_header: | ||
665 | /* Tack on the header after the data. The header SGE should already | ||
666 | * have been set up to point to the right header buffer. */ | ||
667 | memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); | ||
668 | |||
669 | if (0) { | ||
670 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; | ||
671 | |||
672 | printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", | ||
673 | be16_to_cpu(hdr->h_dport), | ||
674 | hdr->h_flags, | ||
675 | be32_to_cpu(hdr->h_len)); | ||
676 | } | ||
677 | if (adv_credits) { | ||
678 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; | 710 | struct rds_header *hdr = &ic->i_send_hdrs[pos]; |
679 | 711 | ||
680 | /* add credit and redo the header checksum */ | 712 | /* add credit and redo the header checksum */ |
@@ -689,20 +721,25 @@ add_header:
689 | prev = send; | 721 | prev = send; |
690 | 722 | ||
691 | pos = (pos + 1) % ic->i_send_ring.w_nr; | 723 | pos = (pos + 1) % ic->i_send_ring.w_nr; |
692 | } | 724 | send = &ic->i_sends[pos]; |
725 | i++; | ||
726 | |||
727 | } while (i < work_alloc | ||
728 | && scat != &rm->data.op_sg[rm->data.op_count]); | ||
693 | 729 | ||
694 | /* Account the RDS header in the number of bytes we sent, but just once. | 730 | /* Account the RDS header in the number of bytes we sent, but just once. |
695 | * The caller has no concept of fragmentation. */ | 731 | * The caller has no concept of fragmentation. */ |
696 | if (hdr_off == 0) | 732 | if (hdr_off == 0) |
697 | sent += sizeof(struct rds_header); | 733 | bytes_sent += sizeof(struct rds_header); |
698 | 734 | ||
699 | /* if we finished the message then send completion owns it */ | 735 | /* if we finished the message then send completion owns it */ |
700 | if (scat == &rm->m_sg[rm->m_count]) { | 736 | if (scat == &rm->data.op_sg[rm->data.op_count]) { |
701 | prev->s_rm = ic->i_rm; | 737 | prev->s_op = ic->i_data_op; |
702 | prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; | 738 | prev->s_wr.send_flags |= IB_SEND_SOLICITED; |
703 | ic->i_rm = NULL; | 739 | ic->i_data_op = NULL; |
704 | } | 740 | } |
705 | 741 | ||
742 | /* Put back wrs & credits we didn't use */ | ||
706 | if (i < work_alloc) { | 743 | if (i < work_alloc) { |
707 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); | 744 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); |
708 | work_alloc = i; | 745 | work_alloc = i; |
@@ -710,6 +747,9 @@ add_header:
710 | if (ic->i_flowctl && i < credit_alloc) | 747 | if (ic->i_flowctl && i < credit_alloc) |
711 | rds_ib_send_add_credits(conn, credit_alloc - i); | 748 | rds_ib_send_add_credits(conn, credit_alloc - i); |
712 | 749 | ||
750 | if (nr_sig) | ||
751 | atomic_add(nr_sig, &ic->i_signaled_sends); | ||
752 | |||
713 | /* XXX need to worry about failed_wr and partial sends. */ | 753 | /* XXX need to worry about failed_wr and partial sends. */ |
714 | failed_wr = &first->s_wr; | 754 | failed_wr = &first->s_wr; |
715 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); | 755 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); |
@@ -720,32 +760,127 @@ add_header:
720 | printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " | 760 | printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " |
721 | "returned %d\n", &conn->c_faddr, ret); | 761 | "returned %d\n", &conn->c_faddr, ret); |
722 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | 762 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); |
723 | if (prev->s_rm) { | 763 | rds_ib_sub_signaled(ic, nr_sig); |
724 | ic->i_rm = prev->s_rm; | 764 | if (prev->s_op) { |
725 | prev->s_rm = NULL; | 765 | ic->i_data_op = prev->s_op; |
766 | prev->s_op = NULL; | ||
726 | } | 767 | } |
727 | 768 | ||
728 | rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); | 769 | rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); |
729 | goto out; | 770 | goto out; |
730 | } | 771 | } |
731 | 772 | ||
732 | ret = sent; | 773 | ret = bytes_sent; |
733 | out: | 774 | out: |
734 | BUG_ON(adv_credits); | 775 | BUG_ON(adv_credits); |
735 | return ret; | 776 | return ret; |
736 | } | 777 | } |
737 | 778 | ||
738 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) | 779 | /* |
780 | * Issue atomic operation. | ||
781 | * A simplified version of the rdma case, we always map 1 SG, and | ||
782 | * only 8 bytes, for the return value from the atomic operation. | ||
783 | */ | ||
784 | int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) | ||
785 | { | ||
786 | struct rds_ib_connection *ic = conn->c_transport_data; | ||
787 | struct rds_ib_send_work *send = NULL; | ||
788 | struct ib_send_wr *failed_wr; | ||
789 | struct rds_ib_device *rds_ibdev; | ||
790 | u32 pos; | ||
791 | u32 work_alloc; | ||
792 | int ret; | ||
793 | int nr_sig = 0; | ||
794 | |||
795 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | ||
796 | |||
797 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); | ||
798 | if (work_alloc != 1) { | ||
799 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
800 | rds_ib_stats_inc(s_ib_tx_ring_full); | ||
801 | ret = -ENOMEM; | ||
802 | goto out; | ||
803 | } | ||
804 | |||
805 | /* address of send request in ring */ | ||
806 | send = &ic->i_sends[pos]; | ||
807 | send->s_queued = jiffies; | ||
808 | |||
809 | if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { | ||
810 | send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; | ||
811 | send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare; | ||
812 | send->s_wr.wr.atomic.swap = op->op_m_cswp.swap; | ||
813 | send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask; | ||
814 | send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask; | ||
815 | } else { /* FADD */ | ||
816 | send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; | ||
817 | send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add; | ||
818 | send->s_wr.wr.atomic.swap = 0; | ||
819 | send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask; | ||
820 | send->s_wr.wr.atomic.swap_mask = 0; | ||
821 | } | ||
822 | nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); | ||
823 | send->s_wr.num_sge = 1; | ||
824 | send->s_wr.next = NULL; | ||
825 | send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; | ||
826 | send->s_wr.wr.atomic.rkey = op->op_rkey; | ||
827 | send->s_op = op; | ||
828 | rds_message_addref(container_of(send->s_op, struct rds_message, atomic)); | ||
829 | |||
830 | /* map 8 byte retval buffer to the device */ | ||
831 | ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); | ||
832 | rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret); | ||
833 | if (ret != 1) { | ||
834 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
835 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | ||
836 | ret = -ENOMEM; /* XXX ? */ | ||
837 | goto out; | ||
838 | } | ||
839 | |||
840 | /* Convert our struct scatterlist to struct ib_sge */ | ||
841 | send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); | ||
842 | send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); | ||
843 | send->s_sge[0].lkey = ic->i_mr->lkey; | ||
844 | |||
845 | rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, | ||
846 | send->s_sge[0].addr, send->s_sge[0].length); | ||
847 | |||
848 | if (nr_sig) | ||
849 | atomic_add(nr_sig, &ic->i_signaled_sends); | ||
850 | |||
851 | failed_wr = &send->s_wr; | ||
852 | ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr); | ||
853 | rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, | ||
854 | send, &send->s_wr, ret, failed_wr); | ||
855 | BUG_ON(failed_wr != &send->s_wr); | ||
856 | if (ret) { | ||
857 | printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " | ||
858 | "returned %d\n", &conn->c_faddr, ret); | ||
859 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | ||
860 | rds_ib_sub_signaled(ic, nr_sig); | ||
861 | goto out; | ||
862 | } | ||
863 | |||
864 | if (unlikely(failed_wr != &send->s_wr)) { | ||
865 | printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret); | ||
866 | BUG_ON(failed_wr != &send->s_wr); | ||
867 | } | ||
868 | |||
869 | out: | ||
870 | return ret; | ||
871 | } | ||
872 | |||
873 | int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) | ||
739 | { | 874 | { |
740 | struct rds_ib_connection *ic = conn->c_transport_data; | 875 | struct rds_ib_connection *ic = conn->c_transport_data; |
741 | struct rds_ib_send_work *send = NULL; | 876 | struct rds_ib_send_work *send = NULL; |
742 | struct rds_ib_send_work *first; | 877 | struct rds_ib_send_work *first; |
743 | struct rds_ib_send_work *prev; | 878 | struct rds_ib_send_work *prev; |
744 | struct ib_send_wr *failed_wr; | 879 | struct ib_send_wr *failed_wr; |
745 | struct rds_ib_device *rds_ibdev; | ||
746 | struct scatterlist *scat; | 880 | struct scatterlist *scat; |
747 | unsigned long len; | 881 | unsigned long len; |
748 | u64 remote_addr = op->r_remote_addr; | 882 | u64 remote_addr = op->op_remote_addr; |
883 | u32 max_sge = ic->rds_ibdev->max_sge; | ||
749 | u32 pos; | 884 | u32 pos; |
750 | u32 work_alloc; | 885 | u32 work_alloc; |
751 | u32 i; | 886 | u32 i; |
@@ -753,29 +888,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
753 | int sent; | 888 | int sent; |
754 | int ret; | 889 | int ret; |
755 | int num_sge; | 890 | int num_sge; |
756 | 891 | int nr_sig = 0; | |
757 | rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); | 892 | |
758 | 893 | /* map the op the first time we see it */ | |
759 | /* map the message the first time we see it */ | 894 | if (!op->op_mapped) { |
760 | if (!op->r_mapped) { | 895 | op->op_count = ib_dma_map_sg(ic->i_cm_id->device, |
761 | op->r_count = ib_dma_map_sg(ic->i_cm_id->device, | 896 | op->op_sg, op->op_nents, (op->op_write) ? |
762 | op->r_sg, op->r_nents, (op->r_write) ? | 897 | DMA_TO_DEVICE : DMA_FROM_DEVICE); |
763 | DMA_TO_DEVICE : DMA_FROM_DEVICE); | 898 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); |
764 | rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); | 899 | if (op->op_count == 0) { |
765 | if (op->r_count == 0) { | ||
766 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); | 900 | rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); |
767 | ret = -ENOMEM; /* XXX ? */ | 901 | ret = -ENOMEM; /* XXX ? */ |
768 | goto out; | 902 | goto out; |
769 | } | 903 | } |
770 | 904 | ||
771 | op->r_mapped = 1; | 905 | op->op_mapped = 1; |
772 | } | 906 | } |
773 | 907 | ||
774 | /* | 908 | /* |
775 | * Instead of knowing how to return a partial rdma read/write we insist that there | 909 | * Instead of knowing how to return a partial rdma read/write we insist that there |
776 | * be enough work requests to send the entire message. | 910 | * be enough work requests to send the entire message. |
777 | */ | 911 | */ |
778 | i = ceil(op->r_count, rds_ibdev->max_sge); | 912 | i = ceil(op->op_count, max_sge); |
779 | 913 | ||
780 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); | 914 | work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); |
781 | if (work_alloc != i) { | 915 | if (work_alloc != i) { |
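Editorial aside, not part of the commit: the RDMA path above sizes its work-request allocation as ceil(op->op_count, max_sge) — the number of mapped scatterlist entries divided by the device's SGE limit, rounded up, so the whole op always fits into whole WRs. A small sketch assuming the usual round-up integer division for the ceil() helper (the helper itself is defined elsewhere in RDS, not in this hunk):

#include <stdio.h>

/* assumed equivalent of the RDS ceil() helper: round-up integer division */
static unsigned int ceil_div(unsigned int x, unsigned int y)
{
	return (x + y - 1) / y;
}

int main(void)
{
	unsigned int op_count = 70;	/* example: mapped scatterlist entries */
	unsigned int max_sge = 32;	/* example: device max SGEs per WR */

	/* 70 SG entries at 32 SGEs per WR -> 3 RDMA work requests */
	printf("%u work requests\n", ceil_div(op_count, max_sge));
	return 0;
}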
@@ -788,30 +922,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
788 | send = &ic->i_sends[pos]; | 922 | send = &ic->i_sends[pos]; |
789 | first = send; | 923 | first = send; |
790 | prev = NULL; | 924 | prev = NULL; |
791 | scat = &op->r_sg[0]; | 925 | scat = &op->op_sg[0]; |
792 | sent = 0; | 926 | sent = 0; |
793 | num_sge = op->r_count; | 927 | num_sge = op->op_count; |
794 | 928 | ||
795 | for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { | 929 | for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { |
796 | send->s_wr.send_flags = 0; | 930 | send->s_wr.send_flags = 0; |
797 | send->s_queued = jiffies; | 931 | send->s_queued = jiffies; |
798 | /* | 932 | send->s_op = NULL; |
799 | * We want to delay signaling completions just enough to get | 933 | |
800 | * the batching benefits but not so much that we create dead time on the wire. | 934 | nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); |
801 | */ | ||
802 | if (ic->i_unsignaled_wrs-- == 0) { | ||
803 | ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; | ||
804 | send->s_wr.send_flags = IB_SEND_SIGNALED; | ||
805 | } | ||
806 | 935 | ||
807 | send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; | 936 | send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; |
808 | send->s_wr.wr.rdma.remote_addr = remote_addr; | 937 | send->s_wr.wr.rdma.remote_addr = remote_addr; |
809 | send->s_wr.wr.rdma.rkey = op->r_key; | 938 | send->s_wr.wr.rdma.rkey = op->op_rkey; |
810 | send->s_op = op; | ||
811 | 939 | ||
812 | if (num_sge > rds_ibdev->max_sge) { | 940 | if (num_sge > max_sge) { |
813 | send->s_wr.num_sge = rds_ibdev->max_sge; | 941 | send->s_wr.num_sge = max_sge; |
814 | num_sge -= rds_ibdev->max_sge; | 942 | num_sge -= max_sge; |
815 | } else { | 943 | } else { |
816 | send->s_wr.num_sge = num_sge; | 944 | send->s_wr.num_sge = num_sge; |
817 | } | 945 | } |
@@ -821,7 +949,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
821 | if (prev) | 949 | if (prev) |
822 | prev->s_wr.next = &send->s_wr; | 950 | prev->s_wr.next = &send->s_wr; |
823 | 951 | ||
824 | for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { | 952 | for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { |
825 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); | 953 | len = ib_sg_dma_len(ic->i_cm_id->device, scat); |
826 | send->s_sge[j].addr = | 954 | send->s_sge[j].addr = |
827 | ib_sg_dma_address(ic->i_cm_id->device, scat); | 955 | ib_sg_dma_address(ic->i_cm_id->device, scat); |
@@ -843,15 +971,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
843 | send = ic->i_sends; | 971 | send = ic->i_sends; |
844 | } | 972 | } |
845 | 973 | ||
846 | /* if we finished the message then send completion owns it */ | 974 | /* give a reference to the last op */ |
847 | if (scat == &op->r_sg[op->r_count]) | 975 | if (scat == &op->op_sg[op->op_count]) { |
848 | prev->s_wr.send_flags = IB_SEND_SIGNALED; | 976 | prev->s_op = op; |
977 | rds_message_addref(container_of(op, struct rds_message, rdma)); | ||
978 | } | ||
849 | 979 | ||
850 | if (i < work_alloc) { | 980 | if (i < work_alloc) { |
851 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); | 981 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); |
852 | work_alloc = i; | 982 | work_alloc = i; |
853 | } | 983 | } |
854 | 984 | ||
985 | if (nr_sig) | ||
986 | atomic_add(nr_sig, &ic->i_signaled_sends); | ||
987 | |||
855 | failed_wr = &first->s_wr; | 988 | failed_wr = &first->s_wr; |
856 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); | 989 | ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); |
857 | rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, | 990 | rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, |
@@ -861,6 +994,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
861 | printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " | 994 | printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " |
862 | "returned %d\n", &conn->c_faddr, ret); | 995 | "returned %d\n", &conn->c_faddr, ret); |
863 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); | 996 | rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); |
997 | rds_ib_sub_signaled(ic, nr_sig); | ||
864 | goto out; | 998 | goto out; |
865 | } | 999 | } |
866 | 1000 | ||