author    Or Gerlitz <ogerlitz@mellanox.com>    2012-10-21 10:59:24 -0400
committer Roland Dreier <roland@purestorage.com>    2012-11-26 13:19:17 -0500
commit    08ff32352d6ff7083533dc1c25618d42f92ec28e
tree      00cc99b260b03c7c3e66a017f9dafcfa56b53364 /drivers/infiniband
parent    f4a75d2eb7b1e2206094b901be09adb31ba63681
mlx4: 64-byte CQE/EQE support
ConnectX-3 devices can use either 64- or 32-byte completion queue entries (CQEs) and event queue entries (EQEs). Using 64-byte EQEs/CQEs performs better because each entry is aligned to a complete cacheline. This patch queries the HCA's capabilities, and if it supports 64-byte CQEs and EQEs, the driver configures the hardware to work in 64-byte mode.

The 32- vs. 64-byte mode is global per HCA, not per CQ or EQ. Since the mode is global, userspace (libmlx4) must be updated to work with the configured CQE size, and guests using SR-IOV virtual functions need to know both the EQE and CQE size.

If either of the 64-byte CQE/EQE capabilities is activated, the patch makes sure that older guest drivers that use the QUERY_DEV_FUNC command (e.g. as done in mlx4_core of Linux 3.3..3.6) notice that they need an update to be able to work with the PPF. This is done by changing the returned pf_context_behaviour to a non-zero value. If neither capability is activated, that value remains zero and older guest drivers can run unmodified.

The SR-IOV related flow is as follows:

1. The PPF detects the new capabilities using the QUERY_DEV_CAP command.
2. The PPF activates the new capabilities using INIT_HCA.
3. The VF detects whether the PPF activated the capabilities using QUERY_HCA, and if so, activates them for itself too.

Note that the VF detects that it must be aware of the new PF behaviour using QUERY_FUNC_CAP. Steps 1 and 2 also apply in native mode.

Userspace is notified through a new field introduced in struct mlx4_ib_alloc_ucontext_resp which holds the device capabilities for which userspace must take action. This changes the binary interface, so the ABI towards libmlx4 exposed through uverbs is bumped from 3 to 4, but only when needed, i.e. only when the driver actually uses 64-byte CQEs or future device capabilities that must be kept in sync with userspace. This allows unmodified libmlx4 to keep working with older devices (e.g. A0, B0) which don't support 64-byte CQEs.

To keep existing systems functional when they update to a kernel that contains these changes in the VF and userspace ABI, 64-byte mode is enabled only when the module parameter enable_64b_cqe_eqe is set; the default is currently false.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
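[Editor's note] For illustration only (not part of the patch): a sketch of how an updated userspace library such as libmlx4 might consume the new v4 response. The struct mirrors the mlx4_ib_alloc_ucontext_resp layout from this patch's user.h; negotiated_abi_ver and mlx4_cqe_size() are hypothetical names introduced here.

#include <stdint.h>

/* Mirrors struct mlx4_ib_alloc_ucontext_resp after this patch (ABI v4). */
struct mlx4_alloc_ucontext_resp {
	uint32_t dev_caps;		/* caps userspace must act on */
	uint32_t qp_tab_size;
	uint16_t bf_reg_size;
	uint16_t bf_regs_per_page;
	uint32_t cqe_size;		/* 32 or 64, global per HCA */
};

/* Hypothetical helper: an ABI-3 kernel sends no cqe_size, so assume the
 * historical 32-byte CQE; an ABI-4 kernel reports the configured size. */
static int mlx4_cqe_size(int negotiated_abi_ver,
			 const struct mlx4_alloc_ucontext_resp *resp)
{
	return negotiated_abi_ver == 3 ? 32 : (int) resp->cqe_size;
}

On the administrative side, the commit message places the global switch in mlx4_core, so enabling it would presumably look like "modprobe mlx4_core enable_64b_cqe_eqe=1" (default off).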
Diffstat (limited to 'drivers/infiniband')
-rw-r--r--  drivers/infiniband/hw/mlx4/cq.c       | 34
-rw-r--r--  drivers/infiniband/hw/mlx4/main.c     | 27
-rw-r--r--  drivers/infiniband/hw/mlx4/mlx4_ib.h  |  1
-rw-r--r--  drivers/infiniband/hw/mlx4/user.h     | 12
4 files changed, 60 insertions(+), 14 deletions(-)
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index c9eb6a6815ce..ae67df35dd4d 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -66,7 +66,7 @@ static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type)
 
 static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
 {
-	return mlx4_buf_offset(&buf->buf, n * sizeof (struct mlx4_cqe));
+	return mlx4_buf_offset(&buf->buf, n * buf->entry_size);
 }
 
 static void *get_cqe(struct mlx4_ib_cq *cq, int n)
@@ -77,8 +77,9 @@ static void *get_cqe(struct mlx4_ib_cq *cq, int n)
 static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
 {
 	struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+	struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe);
 
-	return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+	return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
 		!!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
 }
 
@@ -99,12 +100,13 @@ static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *
 {
 	int err;
 
-	err = mlx4_buf_alloc(dev->dev, nent * sizeof(struct mlx4_cqe),
+	err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size,
 			     PAGE_SIZE * 2, &buf->buf);
 
 	if (err)
 		goto out;
 
+	buf->entry_size = dev->dev->caps.cqe_size;
 	err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift,
 			    &buf->mtt);
 	if (err)
@@ -120,8 +122,7 @@ err_mtt:
 	mlx4_mtt_cleanup(dev->dev, &buf->mtt);
 
 err_buf:
-	mlx4_buf_free(dev->dev, nent * sizeof(struct mlx4_cqe),
-		      &buf->buf);
+	mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf);
 
 out:
 	return err;
@@ -129,7 +130,7 @@ out:
 
 static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe)
 {
-	mlx4_buf_free(dev->dev, (cqe + 1) * sizeof(struct mlx4_cqe), &buf->buf);
+	mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf);
 }
 
 static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context,
@@ -137,8 +138,9 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *cont
 			       u64 buf_addr, int cqe)
 {
 	int err;
+	int cqe_size = dev->dev->caps.cqe_size;
 
-	*umem = ib_umem_get(context, buf_addr, cqe * sizeof (struct mlx4_cqe),
+	*umem = ib_umem_get(context, buf_addr, cqe * cqe_size,
 			    IB_ACCESS_LOCAL_WRITE, 1);
 	if (IS_ERR(*umem))
 		return PTR_ERR(*umem);
@@ -331,16 +333,23 @@ static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq)
 {
 	struct mlx4_cqe *cqe, *new_cqe;
 	int i;
+	int cqe_size = cq->buf.entry_size;
+	int cqe_inc = cqe_size == 64 ? 1 : 0;
 
 	i = cq->mcq.cons_index;
 	cqe = get_cqe(cq, i & cq->ibcq.cqe);
+	cqe += cqe_inc;
+
 	while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) {
 		new_cqe = get_cqe_from_buf(&cq->resize_buf->buf,
 					   (i + 1) & cq->resize_buf->cqe);
-		memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), sizeof(struct mlx4_cqe));
+		memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size);
+		new_cqe += cqe_inc;
+
 		new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) |
 			(((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0);
 		cqe = get_cqe(cq, ++i & cq->ibcq.cqe);
+		cqe += cqe_inc;
 	}
 	++cq->mcq.cons_index;
 }
@@ -438,6 +447,7 @@ err_buf:
 
 out:
 	mutex_unlock(&cq->resize_mutex);
+
 	return err;
 }
 
@@ -586,6 +596,9 @@ repoll:
 	if (!cqe)
 		return -EAGAIN;
 
+	if (cq->buf.entry_size == 64)
+		cqe++;
+
 	++cq->mcq.cons_index;
 
 	/*
@@ -807,6 +820,7 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
 	int nfreed = 0;
 	struct mlx4_cqe *cqe, *dest;
 	u8 owner_bit;
+	int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0;
 
 	/*
 	 * First we need to find the current producer index, so we
@@ -825,12 +839,16 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
 	 */
 	while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
 		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+		cqe += cqe_inc;
+
 		if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) {
 			if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
 				mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index));
 			++nfreed;
 		} else if (nfreed) {
 			dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+			dest += cqe_inc;
+
 			owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
 			memcpy(dest, cqe, sizeof *cqe);
 			dest->owner_sr_opcode = owner_bit |
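[Editor's note] A reading aid for the recurring "cqe + 1" / "cqe += cqe_inc" pattern above (a sketch, not code from the patch): struct mlx4_cqe stays 32 bytes, and in 64-byte mode the hardware places the valid CQE fields, including the ownership bit, in the second half of each 64-byte slot, so advancing the 32-byte pointer by one entry lands on the live data.

/* Sketch: locate the valid 32 bytes of a CQE within a buffer entry. */
static inline struct mlx4_cqe *valid_cqe(struct mlx4_ib_cq_buf *buf,
					 struct mlx4_cqe *slot)
{
	return buf->entry_size == 64 ? slot + 1 : slot;
}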
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 718ec6b2bad2..e7d81c0d1ac5 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -563,15 +563,24 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
 {
 	struct mlx4_ib_dev *dev = to_mdev(ibdev);
 	struct mlx4_ib_ucontext *context;
+	struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3;
 	struct mlx4_ib_alloc_ucontext_resp resp;
 	int err;
 
 	if (!dev->ib_active)
 		return ERR_PTR(-EAGAIN);
 
-	resp.qp_tab_size      = dev->dev->caps.num_qps;
-	resp.bf_reg_size      = dev->dev->caps.bf_reg_size;
-	resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
+		resp_v3.qp_tab_size      = dev->dev->caps.num_qps;
+		resp_v3.bf_reg_size      = dev->dev->caps.bf_reg_size;
+		resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+	} else {
+		resp.dev_caps	      = dev->dev->caps.userspace_caps;
+		resp.qp_tab_size      = dev->dev->caps.num_qps;
+		resp.bf_reg_size      = dev->dev->caps.bf_reg_size;
+		resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+		resp.cqe_size	      = dev->dev->caps.cqe_size;
+	}
 
 	context = kmalloc(sizeof *context, GFP_KERNEL);
 	if (!context)
@@ -586,7 +595,11 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
 	INIT_LIST_HEAD(&context->db_page_list);
 	mutex_init(&context->db_page_mutex);
 
-	err = ib_copy_to_udata(udata, &resp, sizeof resp);
+	if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
+		err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3));
+	else
+		err = ib_copy_to_udata(udata, &resp, sizeof(resp));
+
 	if (err) {
 		mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar);
 		kfree(context);
@@ -1342,7 +1355,11 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.num_comp_vectors	= dev->caps.num_comp_vectors;
 	ibdev->ib_dev.dma_device	= &dev->pdev->dev;
 
-	ibdev->ib_dev.uverbs_abi_ver	= MLX4_IB_UVERBS_ABI_VERSION;
+	if (dev->caps.userspace_caps)
+		ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
+	else
+		ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION;
+
 	ibdev->ib_dev.uverbs_cmd_mask	=
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index e04cbc9a54a5..dcd845bc30f0 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -90,6 +90,7 @@ struct mlx4_ib_xrcd {
90struct mlx4_ib_cq_buf { 90struct mlx4_ib_cq_buf {
91 struct mlx4_buf buf; 91 struct mlx4_buf buf;
92 struct mlx4_mtt mtt; 92 struct mlx4_mtt mtt;
93 int entry_size;
93}; 94};
94 95
95struct mlx4_ib_cq_resize { 96struct mlx4_ib_cq_resize {
diff --git a/drivers/infiniband/hw/mlx4/user.h b/drivers/infiniband/hw/mlx4/user.h
index 13beedeeef9f..07e6769ef43b 100644
--- a/drivers/infiniband/hw/mlx4/user.h
+++ b/drivers/infiniband/hw/mlx4/user.h
@@ -40,7 +40,9 @@
  * Increment this value if any changes that break userspace ABI
  * compatibility are made.
  */
-#define MLX4_IB_UVERBS_ABI_VERSION	3
+
+#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION	3
+#define MLX4_IB_UVERBS_ABI_VERSION		4
 
 /*
  * Make sure that all structs defined in this file remain laid out so
@@ -50,10 +52,18 @@
  * instead.
  */
 
+struct mlx4_ib_alloc_ucontext_resp_v3 {
+	__u32	qp_tab_size;
+	__u16	bf_reg_size;
+	__u16	bf_regs_per_page;
+};
+
 struct mlx4_ib_alloc_ucontext_resp {
+	__u32	dev_caps;
 	__u32	qp_tab_size;
 	__u16	bf_reg_size;
 	__u16	bf_regs_per_page;
+	__u32	cqe_size;
 };
 
 struct mlx4_ib_alloc_pd_resp {
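[Editor's note] As a sanity check on why the ABI bump is unavoidable, a standalone sketch (field layouts copied from the structs above; C11 _Static_assert assumed available): the v3 response is 8 bytes while the v4 response is 16, so a v3 libmlx4 reading a v4 reply would misinterpret the layout, starting with dev_caps landing where it expects qp_tab_size.

#include <stdint.h>

struct resp_v3 {	/* mirrors mlx4_ib_alloc_ucontext_resp_v3 */
	uint32_t qp_tab_size;
	uint16_t bf_reg_size;
	uint16_t bf_regs_per_page;
};

struct resp_v4 {	/* mirrors mlx4_ib_alloc_ucontext_resp */
	uint32_t dev_caps;
	uint32_t qp_tab_size;
	uint16_t bf_reg_size;
	uint16_t bf_regs_per_page;
	uint32_t cqe_size;
};

_Static_assert(sizeof(struct resp_v3) == 8,  "v3 layout: 8 bytes");
_Static_assert(sizeof(struct resp_v4) == 16, "v4 adds dev_caps and cqe_size");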