IB/mlx4: Use multiple WQ blocks to post smaller send WQEs

ConnectX HCA supports shrinking WQEs, so that a single work request can be made of multiple units of wqe_shift. This way, WRs can differ in size, and do not have to be a power of 2 in size, saving memory and speeding up send WR posting. Unfortunately, if we do this then the wqe_index field in CQEs can't be used to look up the WR ID anymore, so our implementation does this only if selective signaling is off. Further, on 32-bit platforms, we can't use vmap() to make the QP buffer virtually contiguous. Thus we have to use constant-sized WRs to make sure a WR is always fully within a single page-sized chunk. Finally, we use WRs with the NOP opcode to avoid wrapping around the queue buffer in the middle of posting a WR, and we set the NoErrorCompletion bit to avoid getting completions with error for NOP WRs. However, NEC is only supported starting with firmware 2.2.232, so we use constant-sized WRs for older firmware. And, since MLX QPs only support SEND, we use constant-sized WRs in this case. When stamping during NOP posting, do stamping following setting of the NOP WQE valid bit. Signed-off-by: Michael S. Tsirkin <mst@dev.mellanox.co.il> Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il> Signed-off-by: Roland Dreier <rolandd@cisco.com>
author: Jack Morgenstein <jackm@dev.mellanox.co.il> 2008-01-28 03:40:59 -0500
committer: Roland Dreier <rolandd@cisco.com> 2008-02-08 16:30:02 -0500
commit: ea54b10c7773007e173da31fe7adcc049da33331 (patch)
tree: b13b77fb3807071a5b93ece7b564f6748d962bbc /drivers/infiniband
parent: b57aacfa7a95328f469d0360e49289b023c47e9e (diff)
3 files changed, 188 insertions, 36 deletions
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 8ac7b973f87..7360bbafbe8 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -326,6 +326,12 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
        is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
                MLX4_CQE_OPCODE_ERROR;
+        if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
+                     is_send)) {
+                printk(KERN_WARNING "Completion for NOP opcode detected!\n");
+                return -EINVAL;
+        }
        if (!*cur_qp ||
            (be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {
                /*
@@ -348,8 +354,10 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
        if (is_send) {
                wq = &(*cur_qp)->sq;
-                wqe_ctr = be16_to_cpu(cqe->wqe_index);
+                if (!(*cur_qp)->sq_signal_bits) {
-                wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+                        wqe_ctr = be16_to_cpu(cqe->wqe_index);
+                        wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+                }
                wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
                ++wq->tail;
        } else if ((*cur_qp)->ibqp.srq) {
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 28697653a37..3726e451a32 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -120,6 +120,8 @@ struct mlx4_ib_qp {
        u32                     doorbell_qpn;
        __be32                  sq_signal_bits;
+        unsigned                sq_next_wqe;
+        int                     sq_max_wqes_per_wr;
        int                     sq_spare_wqes;
        struct mlx4_ib_wq       sq;
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 376db730bc7..958e205b6d7 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -30,6 +30,8 @@
 * SOFTWARE.
 */
+#include <linux/log2.h>
 #include <rdma/ib_cache.h>
 #include <rdma/ib_pack.h>
@@ -111,16 +113,87 @@ static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
 /*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
- * first four bytes of every 64 byte chunk with 0xffffffff, except for
+ * first four bytes of every 64 byte chunk with
- * the very first chunk of the WQE.
+ *     0x7FFFFFFF | (invalid_ownership_value << 31).
+ *
+ * When the max work request size is less than or equal to the WQE
+ * basic block size, as an optimization, we can stamp all WQEs with
+ * 0xffffffff, and skip the very first chunk of each WQE.
 */
-static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
 {
-        u32 *wqe = get_send_wqe(qp, n);
+        u32 *wqe;
        int i;
+        int s;
+        int ind;
+        void *buf;
+        __be32 stamp;
+        s = roundup(size, 1U << qp->sq.wqe_shift);
+        if (qp->sq_max_wqes_per_wr > 1) {
+                for (i = 0; i < s; i += 64) {
+                        ind = (i >> qp->sq.wqe_shift) + n;
+                        stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
+                                                       cpu_to_be32(0xffffffff);
+                        buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+                        wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
+                        *wqe = stamp;
+                }
+        } else {
+                buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+                for (i = 64; i < s; i += 64) {
+                        wqe = buf + i;
+                        *wqe = 0xffffffff;
+                }
+        }
+}
+static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+        struct mlx4_wqe_ctrl_seg *ctrl;
+        struct mlx4_wqe_inline_seg *inl;
+        void *wqe;
+        int s;
+        ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+        s = sizeof(struct mlx4_wqe_ctrl_seg);
+        if (qp->ibqp.qp_type == IB_QPT_UD) {
+                struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
+                struct mlx4_av *av = (struct mlx4_av *)dgram->av;
+                memset(dgram, 0, sizeof *dgram);
+                av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
+                s += sizeof(struct mlx4_wqe_datagram_seg);
+        }
+        /* Pad the remainder of the WQE with an inline data segment. */
+        if (size > s) {
+                inl = wqe + s;
+                inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+        }
+        ctrl->srcrb_flags = 0;
+        ctrl->fence_size = size / 16;
+        /*
+         * Make sure descriptor is fully written before setting ownership bit
+         * (because HW can start executing as soon as we do).
+         */
+        wmb();
+        ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
+                (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
-        for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
+        stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
-                wqe[i] = 0xffffffff;
+}
+/* Post NOP WQE to prevent wrap-around in the middle of WR */
+static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
+{
+        unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
+        if (unlikely(s < qp->sq_max_wqes_per_wr)) {
+                post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
+                ind += s;
+        }
+        return ind;
 }
 static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
@@ -237,6 +310,8 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
                              enum ib_qp_type type, struct mlx4_ib_qp *qp)
 {
+        int s;
        /* Sanity check SQ size before proceeding */
        if (cap->max_send_wr     > dev->dev->caps.max_wqes  ||
            cap->max_send_sge    > dev->dev->caps.max_sq_sg ||
@@ -252,20 +327,74 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
            cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
                return -EINVAL;
-        qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
+        s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
-                                                        sizeof (struct mlx4_wqe_data_seg),
+                cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
-                                                        cap->max_inline_data +
+                send_wqe_overhead(type);
-                                                        sizeof (struct mlx4_wqe_inline_seg)) +
-                                                    send_wqe_overhead(type)));
-        qp->sq.max_gs    = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
-                sizeof (struct mlx4_wqe_data_seg);
        /*
-         * We need to leave 2 KB + 1 WQE of headroom in the SQ to
+         * Hermon supports shrinking WQEs, such that a single work
-         * allow HW to prefetch.
+         * request can include multiple units of 1 << wqe_shift.  This
+         * way, work requests can differ in size, and do not have to
+         * be a power of 2 in size, saving memory and speeding up send
+         * WR posting.  Unfortunately, if we do this then the
+         * wqe_index field in CQEs can't be used to look up the WR ID
+         * anymore, so we do this only if selective signaling is off.
+         *
+         * Further, on 32-bit platforms, we can't use vmap() to make
+         * the QP buffer virtually contiguous.  Thus we have to use
+         * constant-sized WRs to make sure a WR is always fully within
+         * a single page-sized chunk.
+         *
+         * Finally, we use NOP work requests to pad the end of the
+         * work queue, to avoid wrap-around in the middle of WR.  We
+         * set NEC bit to avoid getting completions with error for
+         * these NOP WRs, but since NEC is only supported starting
+         * with firmware 2.2.232, we use constant-sized WRs for older
+         * firmware.
+         *
+         * And, since MLX QPs only support SEND, we use constant-sized
+         * WRs in this case.
+         *
+         * We look for the smallest value of wqe_shift such that the
+         * resulting number of wqes does not exceed device
+         * capabilities.
+         *
+         * We set WQE size to at least 64 bytes, this way stamping
+         * invalidates each WQE.
         */
-        qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
+        if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
-        qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes);
+            qp->sq_signal_bits && BITS_PER_LONG == 64 &&
+            type != IB_QPT_SMI && type != IB_QPT_GSI)
+                qp->sq.wqe_shift = ilog2(64);
+        else
+                qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+        for (;;) {
+                if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz)
+                        return -EINVAL;
+                qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
+                /*
+                 * We need to leave 2 KB + 1 WR of headroom in the SQ to
+                 * allow HW to prefetch.
+                 */
+                qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
+                qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
+                                                    qp->sq_max_wqes_per_wr +
+                                                    qp->sq_spare_wqes);
+                if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
+                        break;
+                if (qp->sq_max_wqes_per_wr <= 1)
+                        return -EINVAL;
+                ++qp->sq.wqe_shift;
+        }
+        qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) -
+                         send_wqe_overhead(type)) / sizeof (struct mlx4_wqe_data_seg);
        qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
                (qp->sq.wqe_cnt << qp->sq.wqe_shift);
@@ -277,7 +406,8 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
                qp->sq.offset = 0;
        }
-        cap->max_send_wr  = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
+        cap->max_send_wr  = qp->sq.max_post =
+                (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
        cap->max_send_sge = qp->sq.max_gs;
        /* We don't support inline sends for kernel QPs (yet) */
        cap->max_inline_data = 0;
@@ -323,6 +453,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
        qp->rq.tail         = 0;
        qp->sq.head         = 0;
        qp->sq.tail         = 0;
+        qp->sq_next_wqe     = 0;
+        if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+                qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+        else
+                qp->sq_signal_bits = 0;
        err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
        if (err)
@@ -413,11 +549,6 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
         */
        qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
-        if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
-                qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
-        else
-                qp->sq_signal_bits = 0;
        qp->mqp.event = mlx4_ib_qp_event;
        return 0;
@@ -912,7 +1043,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
                        ctrl = get_send_wqe(qp, i);
                        ctrl->owner_opcode = cpu_to_be32(1 << 31);
-                        stamp_send_wqe(qp, i);
+                        stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
                }
        }
@@ -965,6 +1096,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
                qp->rq.tail = 0;
                qp->sq.head = 0;
                qp->sq.tail = 0;
+                qp->sq_next_wqe = 0;
                if (!ibqp->srq)
                        *qp->db.db  = 0;
        }
@@ -1274,13 +1406,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
        unsigned long flags;
        int nreq;
        int err = 0;
-        int ind;
+        unsigned ind;
-        int size;
+        int uninitialized_var(stamp);
+        int uninitialized_var(size);
        int i;
        spin_lock_irqsave(&qp->sq.lock, flags);
-        ind = qp->sq.head;
+        ind = qp->sq_next_wqe;
        for (nreq = 0; wr; ++nreq, wr = wr->next) {
                if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
@@ -1296,7 +1429,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                }
                ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
-                qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+                qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
                ctrl->srcrb_flags =
                        (wr->send_flags & IB_SEND_SIGNALED ?
@@ -1409,16 +1542,23 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
                ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
                        (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+                stamp = ind + qp->sq_spare_wqes;
+                ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
                /*
                 * We can improve latency by not stamping the last
                 * send queue WQE until after ringing the doorbell, so
                 * only stamp here if there are still more WQEs to post.
+                 *
+                 * Same optimization applies to padding with NOP wqe
+                 * in case of WQE shrinking (used to prevent wrap-around
+                 * in the middle of WR).
                 */
-                if (wr->next)
+                if (wr->next) {
-                        stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
+                        stamp_send_wqe(qp, stamp, size * 16);
-                                       (qp->sq.wqe_cnt - 1));
+                        ind = pad_wraparound(qp, ind);
+                }
-                ++ind;
        }
 out:
@@ -1440,8 +1580,10 @@ out:
                 */
                mmiowb();
-                stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
+                stamp_send_wqe(qp, stamp, size * 16);
-                               (qp->sq.wqe_cnt - 1));
+                ind = pad_wraparound(qp, ind);
+                qp->sq_next_wqe = ind;
        }
        spin_unlock_irqrestore(&qp->sq.lock, flags);
author	Jack Morgenstein <jackm@dev.mellanox.co.il>	2008-01-28 03:40:59 -0500
committer	Roland Dreier <rolandd@cisco.com>	2008-02-08 16:30:02 -0500
commit	ea54b10c7773007e173da31fe7adcc049da33331 (patch)
tree	b13b77fb3807071a5b93ece7b564f6748d962bbc /drivers/infiniband
parent	b57aacfa7a95328f469d0360e49289b023c47e9e (diff)

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 8ac7b973f87..7360bbafbe8 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -326,6 +326,12 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
326	is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==	326	is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
327	MLX4_CQE_OPCODE_ERROR;	327	MLX4_CQE_OPCODE_ERROR;
328		328
		329	if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
		330	is_send)) {
		331	printk(KERN_WARNING "Completion for NOP opcode detected!\n");
		332	return -EINVAL;
		333	}
		334
329	if (!*cur_qp \|\|	335	if (!*cur_qp \|\|
330	(be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {	336	(be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {
331	/*	337	/*
@@ -348,8 +354,10 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
348		354
349	if (is_send) {	355	if (is_send) {
350	wq = &(*cur_qp)->sq;	356	wq = &(*cur_qp)->sq;
351	wqe_ctr = be16_to_cpu(cqe->wqe_index);	357	if (!(*cur_qp)->sq_signal_bits) {
352	wq->tail += (u16) (wqe_ctr - (u16) wq->tail);	358	wqe_ctr = be16_to_cpu(cqe->wqe_index);
		359	wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
		360	}
353	wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];	361	wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
354	++wq->tail;	362	++wq->tail;
355	} else if ((*cur_qp)->ibqp.srq) {	363	} else if ((*cur_qp)->ibqp.srq) {


diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 28697653a37..3726e451a32 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -120,6 +120,8 @@ struct mlx4_ib_qp {
120		120
121	u32 doorbell_qpn;	121	u32 doorbell_qpn;
122	__be32 sq_signal_bits;	122	__be32 sq_signal_bits;
		123	unsigned sq_next_wqe;
		124	int sq_max_wqes_per_wr;
123	int sq_spare_wqes;	125	int sq_spare_wqes;
124	struct mlx4_ib_wq sq;	126	struct mlx4_ib_wq sq;
125		127


diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 376db730bc7..958e205b6d7 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -30,6 +30,8 @@
30	* SOFTWARE.	30	* SOFTWARE.
31	*/	31	*/
32		32
		33	#include <linux/log2.h>
		34
33	#include <rdma/ib_cache.h>	35	#include <rdma/ib_cache.h>
34	#include <rdma/ib_pack.h>	36	#include <rdma/ib_pack.h>
35		37
@@ -111,16 +113,87 @@ static void get_send_wqe(struct mlx4_ib_qp qp, int n)
111		113
112	/*	114	/*
113	* Stamp a SQ WQE so that it is invalid if prefetched by marking the	115	* Stamp a SQ WQE so that it is invalid if prefetched by marking the
114	* first four bytes of every 64 byte chunk with 0xffffffff, except for	116	* first four bytes of every 64 byte chunk with
115	* the very first chunk of the WQE.	117	* 0x7FFFFFF \| (invalid_ownership_value << 31).
		118	*
		119	* When the max work request size is less than or equal to the WQE
		120	* basic block size, as an optimization, we can stamp all WQEs with
		121	* 0xffffffff, and skip the very first chunk of each WQE.
116	*/	122	*/
117	static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)	123	static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
118	{	124	{
119	u32 *wqe = get_send_wqe(qp, n);	125	u32 *wqe;
120	int i;	126	int i;
		127	int s;
		128	int ind;
		129	void *buf;
		130	__be32 stamp;
		131
		132	s = roundup(size, 1U << qp->sq.wqe_shift);
		133	if (qp->sq_max_wqes_per_wr > 1) {
		134	for (i = 0; i < s; i += 64) {
		135	ind = (i >> qp->sq.wqe_shift) + n;
		136	stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
		137	cpu_to_be32(0xffffffff);
		138	buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		139	wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
		140	*wqe = stamp;
		141	}
		142	} else {
		143	buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
		144	for (i = 64; i < s; i += 64) {
		145	wqe = buf + i;
		146	*wqe = 0xffffffff;
		147	}
		148	}
		149	}
		150
		151	static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
		152	{
		153	struct mlx4_wqe_ctrl_seg *ctrl;
		154	struct mlx4_wqe_inline_seg *inl;
		155	void *wqe;
		156	int s;
		157
		158	ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
		159	s = sizeof(struct mlx4_wqe_ctrl_seg);
		160
		161	if (qp->ibqp.qp_type == IB_QPT_UD) {
		162	struct mlx4_wqe_datagram_seg dgram = wqe + sizeof ctrl;
		163	struct mlx4_av av = (struct mlx4_av )dgram->av;
		164	memset(dgram, 0, sizeof *dgram);
		165	av->port_pd = cpu_to_be32((qp->port << 24) \| to_mpd(qp->ibqp.pd)->pdn);
		166	s += sizeof(struct mlx4_wqe_datagram_seg);
		167	}
		168
		169	/* Pad the remainder of the WQE with an inline data segment. */
		170	if (size > s) {
		171	inl = wqe + s;
		172	inl->byte_count = cpu_to_be32(1 << 31 \| (size - s - sizeof *inl));
		173	}
		174	ctrl->srcrb_flags = 0;
		175	ctrl->fence_size = size / 16;
		176	/*
		177	* Make sure descriptor is fully written before setting ownership bit
		178	* (because HW can start executing as soon as we do).
		179	*/
		180	wmb();
		181
		182	ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP \| MLX4_WQE_CTRL_NEC) \|
		183	(n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
121		184
122	for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)	185	stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
123	wqe[i] = 0xffffffff;	186	}
		187
		188	/* Post NOP WQE to prevent wrap-around in the middle of WR */
		189	static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
		190	{
		191	unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
		192	if (unlikely(s < qp->sq_max_wqes_per_wr)) {
		193	post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
		194	ind += s;
		195	}
		196	return ind;
124	}	197	}
125		198
126	static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)	199	static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
@@ -237,6 +310,8 @@ static int set_rq_size(struct mlx4_ib_dev dev, struct ib_qp_cap cap,
237	static int set_kernel_sq_size(struct mlx4_ib_dev dev, struct ib_qp_cap cap,	310	static int set_kernel_sq_size(struct mlx4_ib_dev dev, struct ib_qp_cap cap,
238	enum ib_qp_type type, struct mlx4_ib_qp *qp)	311	enum ib_qp_type type, struct mlx4_ib_qp *qp)
239	{	312	{
		313	int s;
		314
240	/* Sanity check SQ size before proceeding */	315	/* Sanity check SQ size before proceeding */
241	if (cap->max_send_wr > dev->dev->caps.max_wqes \|\|	316	if (cap->max_send_wr > dev->dev->caps.max_wqes \|\|
242	cap->max_send_sge > dev->dev->caps.max_sq_sg \|\|	317	cap->max_send_sge > dev->dev->caps.max_sq_sg \|\|
@@ -252,20 +327,74 @@ static int set_kernel_sq_size(struct mlx4_ib_dev dev, struct ib_qp_cap cap,
252	cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)	327	cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
253	return -EINVAL;	328	return -EINVAL;
254		329
255	qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *	330	s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
256	sizeof (struct mlx4_wqe_data_seg),	331	cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
257	cap->max_inline_data +	332	send_wqe_overhead(type);
258	sizeof (struct mlx4_wqe_inline_seg)) +
259	send_wqe_overhead(type)));
260	qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
261	sizeof (struct mlx4_wqe_data_seg);
262		333
263	/*	334	/*
264	* We need to leave 2 KB + 1 WQE of headroom in the SQ to	335	* Hermon supports shrinking WQEs, such that a single work
265	* allow HW to prefetch.	336	* request can include multiple units of 1 << wqe_shift. This
		337	* way, work requests can differ in size, and do not have to
		338	* be a power of 2 in size, saving memory and speeding up send
		339	* WR posting. Unfortunately, if we do this then the
		340	* wqe_index field in CQEs can't be used to look up the WR ID
		341	* anymore, so we do this only if selective signaling is off.
		342	*
		343	* Further, on 32-bit platforms, we can't use vmap() to make
		344	* the QP buffer virtually contiguous. Thus we have to use
		345	* constant-sized WRs to make sure a WR is always fully within
		346	* a single page-sized chunk.
		347	*
		348	* Finally, we use NOP work requests to pad the end of the
		349	* work queue, to avoid wrap-around in the middle of WR. We
		350	* set NEC bit to avoid getting completions with error for
		351	* these NOP WRs, but since NEC is only supported starting
		352	* with firmware 2.2.232, we use constant-sized WRs for older
		353	* firmware.
		354	*
		355	* And, since MLX QPs only support SEND, we use constant-sized
		356	* WRs in this case.
		357	*
		358	* We look for the smallest value of wqe_shift such that the
		359	* resulting number of wqes does not exceed device
		360	* capabilities.
		361	*
		362	* We set WQE size to at least 64 bytes, this way stamping
		363	* invalidates each WQE.
266	*/	364	*/
267	qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;	365	if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
268	qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes);	366	qp->sq_signal_bits && BITS_PER_LONG == 64 &&
		367	type != IB_QPT_SMI && type != IB_QPT_GSI)
		368	qp->sq.wqe_shift = ilog2(64);
		369	else
		370	qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
		371
		372	for (;;) {
		373	if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz)
		374	return -EINVAL;
		375
		376	qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
		377
		378	/*
		379	* We need to leave 2 KB + 1 WR of headroom in the SQ to
		380	* allow HW to prefetch.
		381	*/
		382	qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
		383	qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
		384	qp->sq_max_wqes_per_wr +
		385	qp->sq_spare_wqes);
		386
		387	if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
		388	break;
		389
		390	if (qp->sq_max_wqes_per_wr <= 1)
		391	return -EINVAL;
		392
		393	++qp->sq.wqe_shift;
		394	}
		395
		396	qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) -
		397	send_wqe_overhead(type)) / sizeof (struct mlx4_wqe_data_seg);
269		398
270	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +	399	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
271	(qp->sq.wqe_cnt << qp->sq.wqe_shift);	400	(qp->sq.wqe_cnt << qp->sq.wqe_shift);
@@ -277,7 +406,8 @@ static int set_kernel_sq_size(struct mlx4_ib_dev dev, struct ib_qp_cap cap,
277	qp->sq.offset = 0;	406	qp->sq.offset = 0;
278	}	407	}
279		408
280	cap->max_send_wr = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;	409	cap->max_send_wr = qp->sq.max_post =
		410	(qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
281	cap->max_send_sge = qp->sq.max_gs;	411	cap->max_send_sge = qp->sq.max_gs;
282	/* We don't support inline sends for kernel QPs (yet) */	412	/* We don't support inline sends for kernel QPs (yet) */
283	cap->max_inline_data = 0;	413	cap->max_inline_data = 0;
@@ -323,6 +453,12 @@ static int create_qp_common(struct mlx4_ib_dev dev, struct ib_pd pd,
323	qp->rq.tail = 0;	453	qp->rq.tail = 0;
324	qp->sq.head = 0;	454	qp->sq.head = 0;
325	qp->sq.tail = 0;	455	qp->sq.tail = 0;
		456	qp->sq_next_wqe = 0;
		457
		458	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
		459	qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
		460	else
		461	qp->sq_signal_bits = 0;
326		462
327	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);	463	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
328	if (err)	464	if (err)
@@ -413,11 +549,6 @@ static int create_qp_common(struct mlx4_ib_dev dev, struct ib_pd pd,
413	*/	549	*/
414	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);	550	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
415		551
416	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
417	qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
418	else
419	qp->sq_signal_bits = 0;
420
421	qp->mqp.event = mlx4_ib_qp_event;	552	qp->mqp.event = mlx4_ib_qp_event;
422		553
423	return 0;	554	return 0;
@@ -912,7 +1043,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
912	ctrl = get_send_wqe(qp, i);	1043	ctrl = get_send_wqe(qp, i);
913	ctrl->owner_opcode = cpu_to_be32(1 << 31);	1044	ctrl->owner_opcode = cpu_to_be32(1 << 31);
914		1045
915	stamp_send_wqe(qp, i);	1046	stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
916	}	1047	}
917	}	1048	}
918		1049
@@ -965,6 +1096,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
965	qp->rq.tail = 0;	1096	qp->rq.tail = 0;
966	qp->sq.head = 0;	1097	qp->sq.head = 0;
967	qp->sq.tail = 0;	1098	qp->sq.tail = 0;
		1099	qp->sq_next_wqe = 0;
968	if (!ibqp->srq)	1100	if (!ibqp->srq)
969	*qp->db.db = 0;	1101	*qp->db.db = 0;
970	}	1102	}
@@ -1274,13 +1406,14 @@ int mlx4_ib_post_send(struct ib_qp ibqp, struct ib_send_wr wr,
1274	unsigned long flags;	1406	unsigned long flags;
1275	int nreq;	1407	int nreq;
1276	int err = 0;	1408	int err = 0;
1277	int ind;	1409	unsigned ind;
1278	int size;	1410	int uninitialized_var(stamp);
		1411	int uninitialized_var(size);
1279	int i;	1412	int i;
1280		1413
1281	spin_lock_irqsave(&qp->sq.lock, flags);	1414	spin_lock_irqsave(&qp->sq.lock, flags);
1282		1415
1283	ind = qp->sq.head;	1416	ind = qp->sq_next_wqe;
1284		1417
1285	for (nreq = 0; wr; ++nreq, wr = wr->next) {	1418	for (nreq = 0; wr; ++nreq, wr = wr->next) {
1286	if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {	1419	if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
@@ -1296,7 +1429,7 @@ int mlx4_ib_post_send(struct ib_qp ibqp, struct ib_send_wr wr,
1296	}	1429	}
1297		1430
1298	ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));	1431	ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
1299	qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;	1432	qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
1300		1433
1301	ctrl->srcrb_flags =	1434	ctrl->srcrb_flags =
1302	(wr->send_flags & IB_SEND_SIGNALED ?	1435	(wr->send_flags & IB_SEND_SIGNALED ?
@@ -1409,16 +1542,23 @@ int mlx4_ib_post_send(struct ib_qp ibqp, struct ib_send_wr wr,
1409	ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] \|	1542	ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] \|
1410	(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);	1543	(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
1411		1544
		1545	stamp = ind + qp->sq_spare_wqes;
		1546	ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
		1547
1412	/*	1548	/*
1413	* We can improve latency by not stamping the last	1549	* We can improve latency by not stamping the last
1414	* send queue WQE until after ringing the doorbell, so	1550	* send queue WQE until after ringing the doorbell, so
1415	* only stamp here if there are still more WQEs to post.	1551	* only stamp here if there are still more WQEs to post.
		1552	*
		1553	* Same optimization applies to padding with NOP wqe
		1554	* in case of WQE shrinking (used to prevent wrap-around
		1555	* in the middle of WR).
1416	*/	1556	*/
1417	if (wr->next)	1557	if (wr->next) {
1418	stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &	1558	stamp_send_wqe(qp, stamp, size * 16);
1419	(qp->sq.wqe_cnt - 1));	1559	ind = pad_wraparound(qp, ind);
		1560	}
1420		1561
1421	++ind;
1422	}	1562	}
1423		1563
1424	out:	1564	out:
@@ -1440,8 +1580,10 @@ out:
1440	*/	1580	*/
1441	mmiowb();	1581	mmiowb();
1442		1582
1443	stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &	1583	stamp_send_wqe(qp, stamp, size * 16);
1444	(qp->sq.wqe_cnt - 1));	1584
		1585	ind = pad_wraparound(qp, ind);
		1586	qp->sq_next_wqe = ind;
1445	}	1587	}
1446		1588
1447	spin_unlock_irqrestore(&qp->sq.lock, flags);	1589	spin_unlock_irqrestore(&qp->sq.lock, flags);