diff options
author | Jack Morgenstein <jackm@dev.mellanox.co.il> | 2008-01-28 03:40:59 -0500 |
---|---|---|
committer | Roland Dreier <rolandd@cisco.com> | 2008-02-08 16:30:02 -0500 |
commit | ea54b10c7773007e173da31fe7adcc049da33331 (patch) | |
tree | b13b77fb3807071a5b93ece7b564f6748d962bbc /drivers/infiniband/hw/mlx4/qp.c | |
parent | b57aacfa7a95328f469d0360e49289b023c47e9e (diff) |
IB/mlx4: Use multiple WQ blocks to post smaller send WQEs
ConnectX HCA supports shrinking WQEs, so that a single work request
can be made of multiple units of 1 << wqe_shift bytes. This way, WRs can differ
in size, and do not have to be a power of 2 in size, saving memory and
speeding up send WR posting. Unfortunately, if we do this then the
wqe_index field in CQEs can't be used to look up the WR ID anymore, so
our implementation does this only if selective signaling is off.
Further, on 32-bit platforms, we can't use vmap() to make the QP
buffer virtually contiguous. Thus we have to use constant-sized WRs to
make sure a WR is always fully within a single page-sized chunk.
Finally, we use WRs with the NOP opcode to avoid wrapping around the
queue buffer in the middle of posting a WR, and we set the
NoErrorCompletion bit to avoid getting completions with error for NOP
WRs. However, NEC is only supported starting with firmware 2.2.232,
so we use constant-sized WRs for older firmware. And, since MLX QPs
only support SEND, we use constant-sized WRs in this case.
When posting a NOP WQE, do the stamping only after setting the
NOP WQE valid bit.
Signed-off-by: Michael S. Tsirkin <mst@dev.mellanox.co.il>
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
Diffstat (limited to 'drivers/infiniband/hw/mlx4/qp.c')
-rw-r--r-- | drivers/infiniband/hw/mlx4/qp.c | 210 |
1 files changed, 176 insertions, 34 deletions
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 376db730bc75..958e205b6d7c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c | |||
@@ -30,6 +30,8 @@ | |||
30 | * SOFTWARE. | 30 | * SOFTWARE. |
31 | */ | 31 | */ |
32 | 32 | ||
33 | #include <linux/log2.h> | ||
34 | |||
33 | #include <rdma/ib_cache.h> | 35 | #include <rdma/ib_cache.h> |
34 | #include <rdma/ib_pack.h> | 36 | #include <rdma/ib_pack.h> |
35 | 37 | ||
@@ -111,16 +113,87 @@ static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) | |||
111 | 113 | ||
112 | /* | 114 | /* |
113 | * Stamp a SQ WQE so that it is invalid if prefetched by marking the | 115 | * Stamp a SQ WQE so that it is invalid if prefetched by marking the |
114 | * first four bytes of every 64 byte chunk with 0xffffffff, except for | 116 | * first four bytes of every 64 byte chunk with |
115 | * the very first chunk of the WQE. | 117 | * 0x7FFFFFFF | (invalid_ownership_value << 31). |
118 | * | ||
119 | * When the max work request size is less than or equal to the WQE | ||
120 | * basic block size, as an optimization, we can stamp all WQEs with | ||
121 | * 0xffffffff, and skip the very first chunk of each WQE. | ||
116 | */ | 122 | */ |
117 | static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n) | 123 | static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size) |
118 | { | 124 | { |
119 | u32 *wqe = get_send_wqe(qp, n); | 125 | u32 *wqe; |
120 | int i; | 126 | int i; |
127 | int s; | ||
128 | int ind; | ||
129 | void *buf; | ||
130 | __be32 stamp; | ||
131 | |||
132 | s = roundup(size, 1U << qp->sq.wqe_shift); | ||
133 | if (qp->sq_max_wqes_per_wr > 1) { | ||
134 | for (i = 0; i < s; i += 64) { | ||
135 | ind = (i >> qp->sq.wqe_shift) + n; | ||
136 | stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) : | ||
137 | cpu_to_be32(0xffffffff); | ||
138 | buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); | ||
139 | wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1)); | ||
140 | *wqe = stamp; | ||
141 | } | ||
142 | } else { | ||
143 | buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); | ||
144 | for (i = 64; i < s; i += 64) { | ||
145 | wqe = buf + i; | ||
146 | *wqe = 0xffffffff; | ||
147 | } | ||
148 | } | ||
149 | } | ||
150 | |||
151 | static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) | ||
152 | { | ||
153 | struct mlx4_wqe_ctrl_seg *ctrl; | ||
154 | struct mlx4_wqe_inline_seg *inl; | ||
155 | void *wqe; | ||
156 | int s; | ||
157 | |||
158 | ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); | ||
159 | s = sizeof(struct mlx4_wqe_ctrl_seg); | ||
160 | |||
161 | if (qp->ibqp.qp_type == IB_QPT_UD) { | ||
162 | struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl; | ||
163 | struct mlx4_av *av = (struct mlx4_av *)dgram->av; | ||
164 | memset(dgram, 0, sizeof *dgram); | ||
165 | av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn); | ||
166 | s += sizeof(struct mlx4_wqe_datagram_seg); | ||
167 | } | ||
168 | |||
169 | /* Pad the remainder of the WQE with an inline data segment. */ | ||
170 | if (size > s) { | ||
171 | inl = wqe + s; | ||
172 | inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl)); | ||
173 | } | ||
174 | ctrl->srcrb_flags = 0; | ||
175 | ctrl->fence_size = size / 16; | ||
176 | /* | ||
177 | * Make sure descriptor is fully written before setting ownership bit | ||
178 | * (because HW can start executing as soon as we do). | ||
179 | */ | ||
180 | wmb(); | ||
181 | |||
182 | ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) | | ||
183 | (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); | ||
121 | 184 | ||
122 | for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16) | 185 | stamp_send_wqe(qp, n + qp->sq_spare_wqes, size); |
123 | wqe[i] = 0xffffffff; | 186 | } |
187 | |||
188 | /* Post NOP WQE to prevent wrap-around in the middle of WR */ | ||
189 | static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) | ||
190 | { | ||
191 | unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1)); | ||
192 | if (unlikely(s < qp->sq_max_wqes_per_wr)) { | ||
193 | post_nop_wqe(qp, ind, s << qp->sq.wqe_shift); | ||
194 | ind += s; | ||
195 | } | ||
196 | return ind; | ||
124 | } | 197 | } |
125 | 198 | ||
126 | static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) | 199 | static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) |
@@ -237,6 +310,8 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, | |||
237 | static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, | 310 | static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, |
238 | enum ib_qp_type type, struct mlx4_ib_qp *qp) | 311 | enum ib_qp_type type, struct mlx4_ib_qp *qp) |
239 | { | 312 | { |
313 | int s; | ||
314 | |||
240 | /* Sanity check SQ size before proceeding */ | 315 | /* Sanity check SQ size before proceeding */ |
241 | if (cap->max_send_wr > dev->dev->caps.max_wqes || | 316 | if (cap->max_send_wr > dev->dev->caps.max_wqes || |
242 | cap->max_send_sge > dev->dev->caps.max_sq_sg || | 317 | cap->max_send_sge > dev->dev->caps.max_sq_sg || |
@@ -252,20 +327,74 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, | |||
252 | cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) | 327 | cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) |
253 | return -EINVAL; | 328 | return -EINVAL; |
254 | 329 | ||
255 | qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge * | 330 | s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg), |
256 | sizeof (struct mlx4_wqe_data_seg), | 331 | cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) + |
257 | cap->max_inline_data + | 332 | send_wqe_overhead(type); |
258 | sizeof (struct mlx4_wqe_inline_seg)) + | ||
259 | send_wqe_overhead(type))); | ||
260 | qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) / | ||
261 | sizeof (struct mlx4_wqe_data_seg); | ||
262 | 333 | ||
263 | /* | 334 | /* |
264 | * We need to leave 2 KB + 1 WQE of headroom in the SQ to | 335 | * Hermon supports shrinking WQEs, such that a single work |
265 | * allow HW to prefetch. | 336 | * request can include multiple units of 1 << wqe_shift. This |
337 | * way, work requests can differ in size, and do not have to | ||
338 | * be a power of 2 in size, saving memory and speeding up send | ||
339 | * WR posting. Unfortunately, if we do this then the | ||
340 | * wqe_index field in CQEs can't be used to look up the WR ID | ||
341 | * anymore, so we do this only if selective signaling is off. | ||
342 | * | ||
343 | * Further, on 32-bit platforms, we can't use vmap() to make | ||
344 | * the QP buffer virtually contiguous. Thus we have to use | |
345 | * constant-sized WRs to make sure a WR is always fully within | ||
346 | * a single page-sized chunk. | ||
347 | * | ||
348 | * Finally, we use NOP work requests to pad the end of the | ||
349 | * work queue, to avoid wrap-around in the middle of WR. We | ||
350 | * set NEC bit to avoid getting completions with error for | ||
351 | * these NOP WRs, but since NEC is only supported starting | ||
352 | * with firmware 2.2.232, we use constant-sized WRs for older | ||
353 | * firmware. | ||
354 | * | ||
355 | * And, since MLX QPs only support SEND, we use constant-sized | ||
356 | * WRs in this case. | ||
357 | * | ||
358 | * We look for the smallest value of wqe_shift such that the | ||
359 | * resulting number of wqes does not exceed device | ||
360 | * capabilities. | ||
361 | * | ||
362 | * We set WQE size to at least 64 bytes, this way stamping | ||
363 | * invalidates each WQE. | ||
266 | */ | 364 | */ |
267 | qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; | 365 | if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && |
268 | qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes); | 366 | qp->sq_signal_bits && BITS_PER_LONG == 64 && |
367 | type != IB_QPT_SMI && type != IB_QPT_GSI) | ||
368 | qp->sq.wqe_shift = ilog2(64); | ||
369 | else | ||
370 | qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); | ||
371 | |||
372 | for (;;) { | ||
373 | if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz) | ||
374 | return -EINVAL; | ||
375 | |||
376 | qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift); | ||
377 | |||
378 | /* | ||
379 | * We need to leave 2 KB + 1 WR of headroom in the SQ to | ||
380 | * allow HW to prefetch. | ||
381 | */ | ||
382 | qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr; | ||
383 | qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr * | ||
384 | qp->sq_max_wqes_per_wr + | ||
385 | qp->sq_spare_wqes); | ||
386 | |||
387 | if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes) | ||
388 | break; | ||
389 | |||
390 | if (qp->sq_max_wqes_per_wr <= 1) | ||
391 | return -EINVAL; | ||
392 | |||
393 | ++qp->sq.wqe_shift; | ||
394 | } | ||
395 | |||
396 | qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) - | ||
397 | send_wqe_overhead(type)) / sizeof (struct mlx4_wqe_data_seg); | ||
269 | 398 | ||
270 | qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + | 399 | qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + |
271 | (qp->sq.wqe_cnt << qp->sq.wqe_shift); | 400 | (qp->sq.wqe_cnt << qp->sq.wqe_shift); |
@@ -277,7 +406,8 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, | |||
277 | qp->sq.offset = 0; | 406 | qp->sq.offset = 0; |
278 | } | 407 | } |
279 | 408 | ||
280 | cap->max_send_wr = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes; | 409 | cap->max_send_wr = qp->sq.max_post = |
410 | (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr; | ||
281 | cap->max_send_sge = qp->sq.max_gs; | 411 | cap->max_send_sge = qp->sq.max_gs; |
282 | /* We don't support inline sends for kernel QPs (yet) */ | 412 | /* We don't support inline sends for kernel QPs (yet) */ |
283 | cap->max_inline_data = 0; | 413 | cap->max_inline_data = 0; |
@@ -323,6 +453,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, | |||
323 | qp->rq.tail = 0; | 453 | qp->rq.tail = 0; |
324 | qp->sq.head = 0; | 454 | qp->sq.head = 0; |
325 | qp->sq.tail = 0; | 455 | qp->sq.tail = 0; |
456 | qp->sq_next_wqe = 0; | ||
457 | |||
458 | if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) | ||
459 | qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); | ||
460 | else | ||
461 | qp->sq_signal_bits = 0; | ||
326 | 462 | ||
327 | err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp); | 463 | err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp); |
328 | if (err) | 464 | if (err) |
@@ -413,11 +549,6 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, | |||
413 | */ | 549 | */ |
414 | qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); | 550 | qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); |
415 | 551 | ||
416 | if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) | ||
417 | qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); | ||
418 | else | ||
419 | qp->sq_signal_bits = 0; | ||
420 | |||
421 | qp->mqp.event = mlx4_ib_qp_event; | 552 | qp->mqp.event = mlx4_ib_qp_event; |
422 | 553 | ||
423 | return 0; | 554 | return 0; |
@@ -912,7 +1043,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, | |||
912 | ctrl = get_send_wqe(qp, i); | 1043 | ctrl = get_send_wqe(qp, i); |
913 | ctrl->owner_opcode = cpu_to_be32(1 << 31); | 1044 | ctrl->owner_opcode = cpu_to_be32(1 << 31); |
914 | 1045 | ||
915 | stamp_send_wqe(qp, i); | 1046 | stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift); |
916 | } | 1047 | } |
917 | } | 1048 | } |
918 | 1049 | ||
@@ -965,6 +1096,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, | |||
965 | qp->rq.tail = 0; | 1096 | qp->rq.tail = 0; |
966 | qp->sq.head = 0; | 1097 | qp->sq.head = 0; |
967 | qp->sq.tail = 0; | 1098 | qp->sq.tail = 0; |
1099 | qp->sq_next_wqe = 0; | ||
968 | if (!ibqp->srq) | 1100 | if (!ibqp->srq) |
969 | *qp->db.db = 0; | 1101 | *qp->db.db = 0; |
970 | } | 1102 | } |
@@ -1274,13 +1406,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, | |||
1274 | unsigned long flags; | 1406 | unsigned long flags; |
1275 | int nreq; | 1407 | int nreq; |
1276 | int err = 0; | 1408 | int err = 0; |
1277 | int ind; | 1409 | unsigned ind; |
1278 | int size; | 1410 | int uninitialized_var(stamp); |
1411 | int uninitialized_var(size); | ||
1279 | int i; | 1412 | int i; |
1280 | 1413 | ||
1281 | spin_lock_irqsave(&qp->sq.lock, flags); | 1414 | spin_lock_irqsave(&qp->sq.lock, flags); |
1282 | 1415 | ||
1283 | ind = qp->sq.head; | 1416 | ind = qp->sq_next_wqe; |
1284 | 1417 | ||
1285 | for (nreq = 0; wr; ++nreq, wr = wr->next) { | 1418 | for (nreq = 0; wr; ++nreq, wr = wr->next) { |
1286 | if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { | 1419 | if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { |
@@ -1296,7 +1429,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, | |||
1296 | } | 1429 | } |
1297 | 1430 | ||
1298 | ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); | 1431 | ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); |
1299 | qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id; | 1432 | qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id; |
1300 | 1433 | ||
1301 | ctrl->srcrb_flags = | 1434 | ctrl->srcrb_flags = |
1302 | (wr->send_flags & IB_SEND_SIGNALED ? | 1435 | (wr->send_flags & IB_SEND_SIGNALED ? |
@@ -1409,16 +1542,23 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, | |||
1409 | ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | | 1542 | ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | |
1410 | (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); | 1543 | (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); |
1411 | 1544 | ||
1545 | stamp = ind + qp->sq_spare_wqes; | ||
1546 | ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); | ||
1547 | |||
1412 | /* | 1548 | /* |
1413 | * We can improve latency by not stamping the last | 1549 | * We can improve latency by not stamping the last |
1414 | * send queue WQE until after ringing the doorbell, so | 1550 | * send queue WQE until after ringing the doorbell, so |
1415 | * only stamp here if there are still more WQEs to post. | 1551 | * only stamp here if there are still more WQEs to post. |
1552 | * | ||
1553 | * Same optimization applies to padding with NOP wqe | ||
1554 | * in case of WQE shrinking (used to prevent wrap-around | ||
1555 | * in the middle of WR). | ||
1416 | */ | 1556 | */ |
1417 | if (wr->next) | 1557 | if (wr->next) { |
1418 | stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) & | 1558 | stamp_send_wqe(qp, stamp, size * 16); |
1419 | (qp->sq.wqe_cnt - 1)); | 1559 | ind = pad_wraparound(qp, ind); |
1560 | } | ||
1420 | 1561 | ||
1421 | ++ind; | ||
1422 | } | 1562 | } |
1423 | 1563 | ||
1424 | out: | 1564 | out: |
@@ -1440,8 +1580,10 @@ out: | |||
1440 | */ | 1580 | */ |
1441 | mmiowb(); | 1581 | mmiowb(); |
1442 | 1582 | ||
1443 | stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) & | 1583 | stamp_send_wqe(qp, stamp, size * 16); |
1444 | (qp->sq.wqe_cnt - 1)); | 1584 | |
1585 | ind = pad_wraparound(qp, ind); | ||
1586 | qp->sq_next_wqe = ind; | ||
1445 | } | 1587 | } |
1446 | 1588 | ||
1447 | spin_unlock_irqrestore(&qp->sq.lock, flags); | 1589 | spin_unlock_irqrestore(&qp->sq.lock, flags); |