aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'drivers')
-rw-r--r--drivers/infiniband/hw/mlx4/cq.c20
-rw-r--r--drivers/infiniband/hw/mlx4/mlx4_ib.h2
-rw-r--r--drivers/infiniband/hw/mlx4/qp.c216
-rw-r--r--drivers/infiniband/hw/mlx4/srq.c8
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib.h4
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_cm.c10
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_ib.c89
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_main.c4
-rw-r--r--drivers/infiniband/ulp/ipoib/ipoib_verbs.c10
-rw-r--r--drivers/net/mlx4/alloc.c48
-rw-r--r--drivers/net/mlx4/mr.c4
11 files changed, 311 insertions, 104 deletions
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 7950aa6e8184..7360bbafbe84 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -64,13 +64,7 @@ static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type)
64 64
65static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n) 65static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
66{ 66{
67 int offset = n * sizeof (struct mlx4_cqe); 67 return mlx4_buf_offset(&buf->buf, n * sizeof (struct mlx4_cqe));
68
69 if (buf->buf.nbufs == 1)
70 return buf->buf.u.direct.buf + offset;
71 else
72 return buf->buf.u.page_list[offset >> PAGE_SHIFT].buf +
73 (offset & (PAGE_SIZE - 1));
74} 68}
75 69
76static void *get_cqe(struct mlx4_ib_cq *cq, int n) 70static void *get_cqe(struct mlx4_ib_cq *cq, int n)
@@ -332,6 +326,12 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
332 is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == 326 is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
333 MLX4_CQE_OPCODE_ERROR; 327 MLX4_CQE_OPCODE_ERROR;
334 328
329 if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
330 is_send)) {
331 printk(KERN_WARNING "Completion for NOP opcode detected!\n");
332 return -EINVAL;
333 }
334
335 if (!*cur_qp || 335 if (!*cur_qp ||
336 (be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) { 336 (be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {
337 /* 337 /*
@@ -354,8 +354,10 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
354 354
355 if (is_send) { 355 if (is_send) {
356 wq = &(*cur_qp)->sq; 356 wq = &(*cur_qp)->sq;
357 wqe_ctr = be16_to_cpu(cqe->wqe_index); 357 if (!(*cur_qp)->sq_signal_bits) {
358 wq->tail += (u16) (wqe_ctr - (u16) wq->tail); 358 wqe_ctr = be16_to_cpu(cqe->wqe_index);
359 wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
360 }
359 wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; 361 wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
360 ++wq->tail; 362 ++wq->tail;
361 } else if ((*cur_qp)->ibqp.srq) { 363 } else if ((*cur_qp)->ibqp.srq) {
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 28697653a370..3726e451a327 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -120,6 +120,8 @@ struct mlx4_ib_qp {
120 120
121 u32 doorbell_qpn; 121 u32 doorbell_qpn;
122 __be32 sq_signal_bits; 122 __be32 sq_signal_bits;
123 unsigned sq_next_wqe;
124 int sq_max_wqes_per_wr;
123 int sq_spare_wqes; 125 int sq_spare_wqes;
124 struct mlx4_ib_wq sq; 126 struct mlx4_ib_wq sq;
125 127
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 8cba9c532e64..958e205b6d7c 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -30,6 +30,8 @@
30 * SOFTWARE. 30 * SOFTWARE.
31 */ 31 */
32 32
33#include <linux/log2.h>
34
33#include <rdma/ib_cache.h> 35#include <rdma/ib_cache.h>
34#include <rdma/ib_pack.h> 36#include <rdma/ib_pack.h>
35 37
@@ -96,11 +98,7 @@ static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
96 98
97static void *get_wqe(struct mlx4_ib_qp *qp, int offset) 99static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
98{ 100{
99 if (qp->buf.nbufs == 1) 101 return mlx4_buf_offset(&qp->buf, offset);
100 return qp->buf.u.direct.buf + offset;
101 else
102 return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf +
103 (offset & (PAGE_SIZE - 1));
104} 102}
105 103
106static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n) 104static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
@@ -115,16 +113,87 @@ static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
115 113
116/* 114/*
117 * Stamp a SQ WQE so that it is invalid if prefetched by marking the 115 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
118 * first four bytes of every 64 byte chunk with 0xffffffff, except for 116 * first four bytes of every 64 byte chunk with
119 * the very first chunk of the WQE. 117 * 0x7FFFFFFF | (invalid_ownership_value << 31).
118 *
119 * When the max work request size is less than or equal to the WQE
120 * basic block size, as an optimization, we can stamp all WQEs with
121 * 0xffffffff, and skip the very first chunk of each WQE.
120 */ 122 */
121static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n) 123static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
122{ 124{
123 u32 *wqe = get_send_wqe(qp, n); 125 u32 *wqe;
124 int i; 126 int i;
127 int s;
128 int ind;
129 void *buf;
130 __be32 stamp;
131
132 s = roundup(size, 1U << qp->sq.wqe_shift);
133 if (qp->sq_max_wqes_per_wr > 1) {
134 for (i = 0; i < s; i += 64) {
135 ind = (i >> qp->sq.wqe_shift) + n;
136 stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
137 cpu_to_be32(0xffffffff);
138 buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
139 wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
140 *wqe = stamp;
141 }
142 } else {
143 buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
144 for (i = 64; i < s; i += 64) {
145 wqe = buf + i;
146 *wqe = 0xffffffff;
147 }
148 }
149}
150
151static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
152{
153 struct mlx4_wqe_ctrl_seg *ctrl;
154 struct mlx4_wqe_inline_seg *inl;
155 void *wqe;
156 int s;
157
158 ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
159 s = sizeof(struct mlx4_wqe_ctrl_seg);
125 160
126 for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16) 161 if (qp->ibqp.qp_type == IB_QPT_UD) {
127 wqe[i] = 0xffffffff; 162 struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
163 struct mlx4_av *av = (struct mlx4_av *)dgram->av;
164 memset(dgram, 0, sizeof *dgram);
165 av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
166 s += sizeof(struct mlx4_wqe_datagram_seg);
167 }
168
169 /* Pad the remainder of the WQE with an inline data segment. */
170 if (size > s) {
171 inl = wqe + s;
172 inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
173 }
174 ctrl->srcrb_flags = 0;
175 ctrl->fence_size = size / 16;
176 /*
177 * Make sure descriptor is fully written before setting ownership bit
178 * (because HW can start executing as soon as we do).
179 */
180 wmb();
181
182 ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
183 (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
184
185 stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
186}
187
188/* Post NOP WQE to prevent wrap-around in the middle of WR */
189static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
190{
191 unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
192 if (unlikely(s < qp->sq_max_wqes_per_wr)) {
193 post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
194 ind += s;
195 }
196 return ind;
128} 197}
129 198
130static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) 199static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
@@ -241,6 +310,8 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
241static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, 310static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
242 enum ib_qp_type type, struct mlx4_ib_qp *qp) 311 enum ib_qp_type type, struct mlx4_ib_qp *qp)
243{ 312{
313 int s;
314
244 /* Sanity check SQ size before proceeding */ 315 /* Sanity check SQ size before proceeding */
245 if (cap->max_send_wr > dev->dev->caps.max_wqes || 316 if (cap->max_send_wr > dev->dev->caps.max_wqes ||
246 cap->max_send_sge > dev->dev->caps.max_sq_sg || 317 cap->max_send_sge > dev->dev->caps.max_sq_sg ||
@@ -256,20 +327,74 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
256 cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) 327 cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
257 return -EINVAL; 328 return -EINVAL;
258 329
259 qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge * 330 s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
260 sizeof (struct mlx4_wqe_data_seg), 331 cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
261 cap->max_inline_data + 332 send_wqe_overhead(type);
262 sizeof (struct mlx4_wqe_inline_seg)) +
263 send_wqe_overhead(type)));
264 qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
265 sizeof (struct mlx4_wqe_data_seg);
266 333
267 /* 334 /*
268 * We need to leave 2 KB + 1 WQE of headroom in the SQ to 335 * Hermon supports shrinking WQEs, such that a single work
269 * allow HW to prefetch. 336 * request can include multiple units of 1 << wqe_shift. This
337 * way, work requests can differ in size, and do not have to
338 * be a power of 2 in size, saving memory and speeding up send
339 * WR posting. Unfortunately, if we do this then the
340 * wqe_index field in CQEs can't be used to look up the WR ID
341 * anymore, so we do this only if selective signaling is off.
342 *
343 * Further, on 32-bit platforms, we can't use vmap() to make
344 * the QP buffer virtually contiguous. Thus we have to use
345 * constant-sized WRs to make sure a WR is always fully within
346 * a single page-sized chunk.
347 *
348 * Finally, we use NOP work requests to pad the end of the
349 * work queue, to avoid wrap-around in the middle of WR. We
350 * set NEC bit to avoid getting completions with error for
351 * these NOP WRs, but since NEC is only supported starting
352 * with firmware 2.2.232, we use constant-sized WRs for older
353 * firmware.
354 *
355 * And, since MLX QPs only support SEND, we use constant-sized
356 * WRs in this case.
357 *
358 * We look for the smallest value of wqe_shift such that the
359 * resulting number of wqes does not exceed device
360 * capabilities.
361 *
362 * We set WQE size to at least 64 bytes, this way stamping
363 * invalidates each WQE.
270 */ 364 */
271 qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; 365 if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
272 qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes); 366 qp->sq_signal_bits && BITS_PER_LONG == 64 &&
367 type != IB_QPT_SMI && type != IB_QPT_GSI)
368 qp->sq.wqe_shift = ilog2(64);
369 else
370 qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
371
372 for (;;) {
373 if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz)
374 return -EINVAL;
375
376 qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
377
378 /*
379 * We need to leave 2 KB + 1 WR of headroom in the SQ to
380 * allow HW to prefetch.
381 */
382 qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
383 qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
384 qp->sq_max_wqes_per_wr +
385 qp->sq_spare_wqes);
386
387 if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
388 break;
389
390 if (qp->sq_max_wqes_per_wr <= 1)
391 return -EINVAL;
392
393 ++qp->sq.wqe_shift;
394 }
395
396 qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) -
397 send_wqe_overhead(type)) / sizeof (struct mlx4_wqe_data_seg);
273 398
274 qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + 399 qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
275 (qp->sq.wqe_cnt << qp->sq.wqe_shift); 400 (qp->sq.wqe_cnt << qp->sq.wqe_shift);
@@ -281,7 +406,8 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
281 qp->sq.offset = 0; 406 qp->sq.offset = 0;
282 } 407 }
283 408
284 cap->max_send_wr = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes; 409 cap->max_send_wr = qp->sq.max_post =
410 (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
285 cap->max_send_sge = qp->sq.max_gs; 411 cap->max_send_sge = qp->sq.max_gs;
286 /* We don't support inline sends for kernel QPs (yet) */ 412 /* We don't support inline sends for kernel QPs (yet) */
287 cap->max_inline_data = 0; 413 cap->max_inline_data = 0;
@@ -327,6 +453,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
327 qp->rq.tail = 0; 453 qp->rq.tail = 0;
328 qp->sq.head = 0; 454 qp->sq.head = 0;
329 qp->sq.tail = 0; 455 qp->sq.tail = 0;
456 qp->sq_next_wqe = 0;
457
458 if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
459 qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
460 else
461 qp->sq_signal_bits = 0;
330 462
331 err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp); 463 err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
332 if (err) 464 if (err)
@@ -417,11 +549,6 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
417 */ 549 */
418 qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); 550 qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
419 551
420 if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
421 qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
422 else
423 qp->sq_signal_bits = 0;
424
425 qp->mqp.event = mlx4_ib_qp_event; 552 qp->mqp.event = mlx4_ib_qp_event;
426 553
427 return 0; 554 return 0;
@@ -916,7 +1043,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
916 ctrl = get_send_wqe(qp, i); 1043 ctrl = get_send_wqe(qp, i);
917 ctrl->owner_opcode = cpu_to_be32(1 << 31); 1044 ctrl->owner_opcode = cpu_to_be32(1 << 31);
918 1045
919 stamp_send_wqe(qp, i); 1046 stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
920 } 1047 }
921 } 1048 }
922 1049
@@ -969,6 +1096,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
969 qp->rq.tail = 0; 1096 qp->rq.tail = 0;
970 qp->sq.head = 0; 1097 qp->sq.head = 0;
971 qp->sq.tail = 0; 1098 qp->sq.tail = 0;
1099 qp->sq_next_wqe = 0;
972 if (!ibqp->srq) 1100 if (!ibqp->srq)
973 *qp->db.db = 0; 1101 *qp->db.db = 0;
974 } 1102 }
@@ -1278,13 +1406,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
1278 unsigned long flags; 1406 unsigned long flags;
1279 int nreq; 1407 int nreq;
1280 int err = 0; 1408 int err = 0;
1281 int ind; 1409 unsigned ind;
1282 int size; 1410 int uninitialized_var(stamp);
1411 int uninitialized_var(size);
1283 int i; 1412 int i;
1284 1413
1285 spin_lock_irqsave(&qp->sq.lock, flags); 1414 spin_lock_irqsave(&qp->sq.lock, flags);
1286 1415
1287 ind = qp->sq.head; 1416 ind = qp->sq_next_wqe;
1288 1417
1289 for (nreq = 0; wr; ++nreq, wr = wr->next) { 1418 for (nreq = 0; wr; ++nreq, wr = wr->next) {
1290 if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { 1419 if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
@@ -1300,7 +1429,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
1300 } 1429 }
1301 1430
1302 ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); 1431 ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
1303 qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id; 1432 qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
1304 1433
1305 ctrl->srcrb_flags = 1434 ctrl->srcrb_flags =
1306 (wr->send_flags & IB_SEND_SIGNALED ? 1435 (wr->send_flags & IB_SEND_SIGNALED ?
@@ -1413,16 +1542,23 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
1413 ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | 1542 ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
1414 (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); 1543 (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
1415 1544
1545 stamp = ind + qp->sq_spare_wqes;
1546 ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
1547
1416 /* 1548 /*
1417 * We can improve latency by not stamping the last 1549 * We can improve latency by not stamping the last
1418 * send queue WQE until after ringing the doorbell, so 1550 * send queue WQE until after ringing the doorbell, so
1419 * only stamp here if there are still more WQEs to post. 1551 * only stamp here if there are still more WQEs to post.
1552 *
1553 * Same optimization applies to padding with NOP wqe
1554 * in case of WQE shrinking (used to prevent wrap-around
1555 * in the middle of WR).
1420 */ 1556 */
1421 if (wr->next) 1557 if (wr->next) {
1422 stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) & 1558 stamp_send_wqe(qp, stamp, size * 16);
1423 (qp->sq.wqe_cnt - 1)); 1559 ind = pad_wraparound(qp, ind);
1560 }
1424 1561
1425 ++ind;
1426 } 1562 }
1427 1563
1428out: 1564out:
@@ -1444,8 +1580,10 @@ out:
1444 */ 1580 */
1445 mmiowb(); 1581 mmiowb();
1446 1582
1447 stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) & 1583 stamp_send_wqe(qp, stamp, size * 16);
1448 (qp->sq.wqe_cnt - 1)); 1584
1585 ind = pad_wraparound(qp, ind);
1586 qp->sq_next_wqe = ind;
1449 } 1587 }
1450 1588
1451 spin_unlock_irqrestore(&qp->sq.lock, flags); 1589 spin_unlock_irqrestore(&qp->sq.lock, flags);
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index e7e9a3d0dac3..beaa3b06cf58 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -38,13 +38,7 @@
38 38
39static void *get_wqe(struct mlx4_ib_srq *srq, int n) 39static void *get_wqe(struct mlx4_ib_srq *srq, int n)
40{ 40{
41 int offset = n << srq->msrq.wqe_shift; 41 return mlx4_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
42
43 if (srq->buf.nbufs == 1)
44 return srq->buf.u.direct.buf + offset;
45 else
46 return srq->buf.u.page_list[offset >> PAGE_SHIFT].buf +
47 (offset & (PAGE_SIZE - 1));
48} 42}
49 43
50static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type) 44static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index fe250c60607d..f9b7caa54143 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -143,7 +143,7 @@ struct ipoib_rx_buf {
143 143
144struct ipoib_tx_buf { 144struct ipoib_tx_buf {
145 struct sk_buff *skb; 145 struct sk_buff *skb;
146 u64 mapping; 146 u64 mapping[MAX_SKB_FRAGS + 1];
147}; 147};
148 148
149struct ib_cm_id; 149struct ib_cm_id;
@@ -296,7 +296,7 @@ struct ipoib_dev_priv {
296 struct ipoib_tx_buf *tx_ring; 296 struct ipoib_tx_buf *tx_ring;
297 unsigned tx_head; 297 unsigned tx_head;
298 unsigned tx_tail; 298 unsigned tx_tail;
299 struct ib_sge tx_sge; 299 struct ib_sge tx_sge[MAX_SKB_FRAGS + 1];
300 struct ib_send_wr tx_wr; 300 struct ib_send_wr tx_wr;
301 unsigned tx_outstanding; 301 unsigned tx_outstanding;
302 302
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 1818f958c250..7dd2ec473d24 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -634,8 +634,8 @@ static inline int post_send(struct ipoib_dev_priv *priv,
634{ 634{
635 struct ib_send_wr *bad_wr; 635 struct ib_send_wr *bad_wr;
636 636
637 priv->tx_sge.addr = addr; 637 priv->tx_sge[0].addr = addr;
638 priv->tx_sge.length = len; 638 priv->tx_sge[0].length = len;
639 639
640 priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; 640 priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
641 641
@@ -676,7 +676,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
676 return; 676 return;
677 } 677 }
678 678
679 tx_req->mapping = addr; 679 tx_req->mapping[0] = addr;
680 680
681 if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), 681 if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
682 addr, skb->len))) { 682 addr, skb->len))) {
@@ -715,7 +715,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
715 715
716 tx_req = &tx->tx_ring[wr_id]; 716 tx_req = &tx->tx_ring[wr_id];
717 717
718 ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE); 718 ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len, DMA_TO_DEVICE);
719 719
720 /* FIXME: is this right? Shouldn't we only increment on success? */ 720 /* FIXME: is this right? Shouldn't we only increment on success? */
721 ++dev->stats.tx_packets; 721 ++dev->stats.tx_packets;
@@ -1110,7 +1110,7 @@ timeout:
1110 1110
1111 while ((int) p->tx_tail - (int) p->tx_head < 0) { 1111 while ((int) p->tx_tail - (int) p->tx_head < 0) {
1112 tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; 1112 tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
1113 ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, 1113 ib_dma_unmap_single(priv->ca, tx_req->mapping[0], tx_req->skb->len,
1114 DMA_TO_DEVICE); 1114 DMA_TO_DEVICE);
1115 dev_kfree_skb_any(tx_req->skb); 1115 dev_kfree_skb_any(tx_req->skb);
1116 ++p->tx_tail; 1116 ++p->tx_tail;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 52bc2bd5799a..9d3e778dc56d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -239,6 +239,54 @@ repost:
239 "for buf %d\n", wr_id); 239 "for buf %d\n", wr_id);
240} 240}
241 241
242static int ipoib_dma_map_tx(struct ib_device *ca,
243 struct ipoib_tx_buf *tx_req)
244{
245 struct sk_buff *skb = tx_req->skb;
246 u64 *mapping = tx_req->mapping;
247 int i;
248
249 mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
250 DMA_TO_DEVICE);
251 if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
252 return -EIO;
253
254 for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
255 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
256 mapping[i + 1] = ib_dma_map_page(ca, frag->page,
257 frag->page_offset, frag->size,
258 DMA_TO_DEVICE);
259 if (unlikely(ib_dma_mapping_error(ca, mapping[i + 1])))
260 goto partial_error;
261 }
262 return 0;
263
264partial_error:
265 ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
266
267 for (; i > 0; --i) {
268 skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
269 ib_dma_unmap_page(ca, mapping[i], frag->size, DMA_TO_DEVICE);
270 }
271 return -EIO;
272}
273
274static void ipoib_dma_unmap_tx(struct ib_device *ca,
275 struct ipoib_tx_buf *tx_req)
276{
277 struct sk_buff *skb = tx_req->skb;
278 u64 *mapping = tx_req->mapping;
279 int i;
280
281 ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
282
283 for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
284 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
285 ib_dma_unmap_page(ca, mapping[i + 1], frag->size,
286 DMA_TO_DEVICE);
287 }
288}
289
242static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) 290static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
243{ 291{
244 struct ipoib_dev_priv *priv = netdev_priv(dev); 292 struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -257,8 +305,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
257 305
258 tx_req = &priv->tx_ring[wr_id]; 306 tx_req = &priv->tx_ring[wr_id];
259 307
260 ib_dma_unmap_single(priv->ca, tx_req->mapping, 308 ipoib_dma_unmap_tx(priv->ca, tx_req);
261 tx_req->skb->len, DMA_TO_DEVICE);
262 309
263 ++dev->stats.tx_packets; 310 ++dev->stats.tx_packets;
264 dev->stats.tx_bytes += tx_req->skb->len; 311 dev->stats.tx_bytes += tx_req->skb->len;
@@ -341,16 +388,23 @@ void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
341static inline int post_send(struct ipoib_dev_priv *priv, 388static inline int post_send(struct ipoib_dev_priv *priv,
342 unsigned int wr_id, 389 unsigned int wr_id,
343 struct ib_ah *address, u32 qpn, 390 struct ib_ah *address, u32 qpn,
344 u64 addr, int len) 391 u64 *mapping, int headlen,
392 skb_frag_t *frags,
393 int nr_frags)
345{ 394{
346 struct ib_send_wr *bad_wr; 395 struct ib_send_wr *bad_wr;
396 int i;
347 397
348 priv->tx_sge.addr = addr; 398 priv->tx_sge[0].addr = mapping[0];
349 priv->tx_sge.length = len; 399 priv->tx_sge[0].length = headlen;
350 400 for (i = 0; i < nr_frags; ++i) {
351 priv->tx_wr.wr_id = wr_id; 401 priv->tx_sge[i + 1].addr = mapping[i + 1];
352 priv->tx_wr.wr.ud.remote_qpn = qpn; 402 priv->tx_sge[i + 1].length = frags[i].size;
353 priv->tx_wr.wr.ud.ah = address; 403 }
404 priv->tx_wr.num_sge = nr_frags + 1;
405 priv->tx_wr.wr_id = wr_id;
406 priv->tx_wr.wr.ud.remote_qpn = qpn;
407 priv->tx_wr.wr.ud.ah = address;
354 408
355 return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr); 409 return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
356} 410}
@@ -360,7 +414,6 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
360{ 414{
361 struct ipoib_dev_priv *priv = netdev_priv(dev); 415 struct ipoib_dev_priv *priv = netdev_priv(dev);
362 struct ipoib_tx_buf *tx_req; 416 struct ipoib_tx_buf *tx_req;
363 u64 addr;
364 417
365 if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) { 418 if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
366 ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", 419 ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
@@ -383,20 +436,19 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
383 */ 436 */
384 tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)]; 437 tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
385 tx_req->skb = skb; 438 tx_req->skb = skb;
386 addr = ib_dma_map_single(priv->ca, skb->data, skb->len, 439 if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
387 DMA_TO_DEVICE);
388 if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
389 ++dev->stats.tx_errors; 440 ++dev->stats.tx_errors;
390 dev_kfree_skb_any(skb); 441 dev_kfree_skb_any(skb);
391 return; 442 return;
392 } 443 }
393 tx_req->mapping = addr;
394 444
395 if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), 445 if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
396 address->ah, qpn, addr, skb->len))) { 446 address->ah, qpn,
447 tx_req->mapping, skb_headlen(skb),
448 skb_shinfo(skb)->frags, skb_shinfo(skb)->nr_frags))) {
397 ipoib_warn(priv, "post_send failed\n"); 449 ipoib_warn(priv, "post_send failed\n");
398 ++dev->stats.tx_errors; 450 ++dev->stats.tx_errors;
399 ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE); 451 ipoib_dma_unmap_tx(priv->ca, tx_req);
400 dev_kfree_skb_any(skb); 452 dev_kfree_skb_any(skb);
401 } else { 453 } else {
402 dev->trans_start = jiffies; 454 dev->trans_start = jiffies;
@@ -615,10 +667,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
615 while ((int) priv->tx_tail - (int) priv->tx_head < 0) { 667 while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
616 tx_req = &priv->tx_ring[priv->tx_tail & 668 tx_req = &priv->tx_ring[priv->tx_tail &
617 (ipoib_sendq_size - 1)]; 669 (ipoib_sendq_size - 1)];
618 ib_dma_unmap_single(priv->ca, 670 ipoib_dma_unmap_tx(priv->ca, tx_req);
619 tx_req->mapping,
620 tx_req->skb->len,
621 DMA_TO_DEVICE);
622 dev_kfree_skb_any(tx_req->skb); 671 dev_kfree_skb_any(tx_req->skb);
623 ++priv->tx_tail; 672 ++priv->tx_tail;
624 --priv->tx_outstanding; 673 --priv->tx_outstanding;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 09f5371137a1..f96477a8ca5a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -965,7 +965,9 @@ static void ipoib_setup(struct net_device *dev)
965 dev->addr_len = INFINIBAND_ALEN; 965 dev->addr_len = INFINIBAND_ALEN;
966 dev->type = ARPHRD_INFINIBAND; 966 dev->type = ARPHRD_INFINIBAND;
967 dev->tx_queue_len = ipoib_sendq_size * 2; 967 dev->tx_queue_len = ipoib_sendq_size * 2;
968 dev->features = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX; 968 dev->features = (NETIF_F_VLAN_CHALLENGED |
969 NETIF_F_LLTX |
970 NETIF_F_HIGHDMA);
969 971
970 /* MTU will be reset when mcast join happens */ 972 /* MTU will be reset when mcast join happens */
971 dev->mtu = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN; 973 dev->mtu = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 433e99ac227b..a3aeb911f024 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -157,6 +157,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
157 }; 157 };
158 158
159 int ret, size; 159 int ret, size;
160 int i;
160 161
161 priv->pd = ib_alloc_pd(priv->ca); 162 priv->pd = ib_alloc_pd(priv->ca);
162 if (IS_ERR(priv->pd)) { 163 if (IS_ERR(priv->pd)) {
@@ -191,6 +192,9 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
191 init_attr.send_cq = priv->cq; 192 init_attr.send_cq = priv->cq;
192 init_attr.recv_cq = priv->cq; 193 init_attr.recv_cq = priv->cq;
193 194
195 if (dev->features & NETIF_F_SG)
196 init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
197
194 priv->qp = ib_create_qp(priv->pd, &init_attr); 198 priv->qp = ib_create_qp(priv->pd, &init_attr);
195 if (IS_ERR(priv->qp)) { 199 if (IS_ERR(priv->qp)) {
196 printk(KERN_WARNING "%s: failed to create QP\n", ca->name); 200 printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
@@ -201,11 +205,11 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
201 priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff; 205 priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff;
202 priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff; 206 priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff;
203 207
204 priv->tx_sge.lkey = priv->mr->lkey; 208 for (i = 0; i < MAX_SKB_FRAGS + 1; ++i)
209 priv->tx_sge[i].lkey = priv->mr->lkey;
205 210
206 priv->tx_wr.opcode = IB_WR_SEND; 211 priv->tx_wr.opcode = IB_WR_SEND;
207 priv->tx_wr.sg_list = &priv->tx_sge; 212 priv->tx_wr.sg_list = priv->tx_sge;
208 priv->tx_wr.num_sge = 1;
209 priv->tx_wr.send_flags = IB_SEND_SIGNALED; 213 priv->tx_wr.send_flags = IB_SEND_SIGNALED;
210 214
211 return 0; 215 return 0;
diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c
index b226e019bc8b..521dc0322ee4 100644
--- a/drivers/net/mlx4/alloc.c
+++ b/drivers/net/mlx4/alloc.c
@@ -116,40 +116,53 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct,
116 buf->nbufs = 1; 116 buf->nbufs = 1;
117 buf->npages = 1; 117 buf->npages = 1;
118 buf->page_shift = get_order(size) + PAGE_SHIFT; 118 buf->page_shift = get_order(size) + PAGE_SHIFT;
119 buf->u.direct.buf = dma_alloc_coherent(&dev->pdev->dev, 119 buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev,
120 size, &t, GFP_KERNEL); 120 size, &t, GFP_KERNEL);
121 if (!buf->u.direct.buf) 121 if (!buf->direct.buf)
122 return -ENOMEM; 122 return -ENOMEM;
123 123
124 buf->u.direct.map = t; 124 buf->direct.map = t;
125 125
126 while (t & ((1 << buf->page_shift) - 1)) { 126 while (t & ((1 << buf->page_shift) - 1)) {
127 --buf->page_shift; 127 --buf->page_shift;
128 buf->npages *= 2; 128 buf->npages *= 2;
129 } 129 }
130 130
131 memset(buf->u.direct.buf, 0, size); 131 memset(buf->direct.buf, 0, size);
132 } else { 132 } else {
133 int i; 133 int i;
134 134
135 buf->nbufs = (size + PAGE_SIZE - 1) / PAGE_SIZE; 135 buf->nbufs = (size + PAGE_SIZE - 1) / PAGE_SIZE;
136 buf->npages = buf->nbufs; 136 buf->npages = buf->nbufs;
137 buf->page_shift = PAGE_SHIFT; 137 buf->page_shift = PAGE_SHIFT;
138 buf->u.page_list = kzalloc(buf->nbufs * sizeof *buf->u.page_list, 138 buf->page_list = kzalloc(buf->nbufs * sizeof *buf->page_list,
139 GFP_KERNEL); 139 GFP_KERNEL);
140 if (!buf->u.page_list) 140 if (!buf->page_list)
141 return -ENOMEM; 141 return -ENOMEM;
142 142
143 for (i = 0; i < buf->nbufs; ++i) { 143 for (i = 0; i < buf->nbufs; ++i) {
144 buf->u.page_list[i].buf = 144 buf->page_list[i].buf =
145 dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE, 145 dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
146 &t, GFP_KERNEL); 146 &t, GFP_KERNEL);
147 if (!buf->u.page_list[i].buf) 147 if (!buf->page_list[i].buf)
148 goto err_free; 148 goto err_free;
149 149
150 buf->u.page_list[i].map = t; 150 buf->page_list[i].map = t;
151 151
152 memset(buf->u.page_list[i].buf, 0, PAGE_SIZE); 152 memset(buf->page_list[i].buf, 0, PAGE_SIZE);
153 }
154
155 if (BITS_PER_LONG == 64) {
156 struct page **pages;
157 pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
158 if (!pages)
159 goto err_free;
160 for (i = 0; i < buf->nbufs; ++i)
161 pages[i] = virt_to_page(buf->page_list[i].buf);
162 buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
163 kfree(pages);
164 if (!buf->direct.buf)
165 goto err_free;
153 } 166 }
154 } 167 }
155 168
@@ -167,15 +180,18 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf)
167 int i; 180 int i;
168 181
169 if (buf->nbufs == 1) 182 if (buf->nbufs == 1)
170 dma_free_coherent(&dev->pdev->dev, size, buf->u.direct.buf, 183 dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf,
171 buf->u.direct.map); 184 buf->direct.map);
172 else { 185 else {
186 if (BITS_PER_LONG == 64)
187 vunmap(buf->direct.buf);
188
173 for (i = 0; i < buf->nbufs; ++i) 189 for (i = 0; i < buf->nbufs; ++i)
174 if (buf->u.page_list[i].buf) 190 if (buf->page_list[i].buf)
175 dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, 191 dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
176 buf->u.page_list[i].buf, 192 buf->page_list[i].buf,
177 buf->u.page_list[i].map); 193 buf->page_list[i].map);
178 kfree(buf->u.page_list); 194 kfree(buf->page_list);
179 } 195 }
180} 196}
181EXPORT_SYMBOL_GPL(mlx4_buf_free); 197EXPORT_SYMBOL_GPL(mlx4_buf_free);
diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c
index 9c9e308d0917..679dfdb6807f 100644
--- a/drivers/net/mlx4/mr.c
+++ b/drivers/net/mlx4/mr.c
@@ -419,9 +419,9 @@ int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt,
419 419
420 for (i = 0; i < buf->npages; ++i) 420 for (i = 0; i < buf->npages; ++i)
421 if (buf->nbufs == 1) 421 if (buf->nbufs == 1)
422 page_list[i] = buf->u.direct.map + (i << buf->page_shift); 422 page_list[i] = buf->direct.map + (i << buf->page_shift);
423 else 423 else
424 page_list[i] = buf->u.page_list[i].map; 424 page_list[i] = buf->page_list[i].map;
425 425
426 err = mlx4_write_mtt(dev, mtt, 0, buf->npages, page_list); 426 err = mlx4_write_mtt(dev, mtt, 0, buf->npages, page_list);
427 427