author    Artemy Kovalyov <artemyko@mellanox.com>    2017-01-18 09:58:11 -0500
committer Doug Ledford <dledford@redhat.com>         2017-02-14 11:41:19 -0500
commit    81713d3788d2e6bc005f15ee1c59d0eb06050a6b (patch)
tree      6f34bfb4eef3525694b651ddff723dbd8ffa8c0d
parent    49780d42dfc9ec0f4090c32ca59688449da1a1cd (diff)
IB/mlx5: Add implicit MR support
Add an implicit MR covering the entire user address space. The MR is
implemented as an indirect KSM MR consisting of 1GB direct MRs. Pages and
direct MRs are added to and removed from the MR by ODP.

Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Doug Ledford <dledford@redhat.com>
-rw-r--r--  drivers/infiniband/hw/mlx5/main.c    |   2
-rw-r--r--  drivers/infiniband/hw/mlx5/mlx5_ib.h |  20
-rw-r--r--  drivers/infiniband/hw/mlx5/mr.c      |  33
-rw-r--r--  drivers/infiniband/hw/mlx5/odp.c     | 505
-rw-r--r--  include/linux/mlx5/driver.h          |   2
5 files changed, 513 insertions(+), 49 deletions(-)
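
For context on how the new code path is reached from userspace: the driver below
treats a registration with start == 0 and length == U64_MAX as a request for an
implicit ODP MR (see the CONFIG_INFINIBAND_ON_DEMAND_PAGING hunk in
mlx5_ib_reg_user_mr()). A minimal libibverbs sketch of such a registration
follows; the device selection, access flags and error handling are illustrative
assumptions and not part of this patch, and the NULL/SIZE_MAX convention is how
rdma-core is expected to express the 0/U64_MAX pair.

/*
 * Illustrative userspace sketch (not part of this patch): register an
 * implicit ODP MR covering the whole address space on an ODP-capable
 * mlx5 device.
 */
#include <stdio.h>
#include <stdint.h>
#include <infiniband/verbs.h>

int main(void)
{
	struct ibv_device **list = ibv_get_device_list(NULL);
	struct ibv_context *ctx = (list && list[0]) ? ibv_open_device(list[0]) : NULL;
	struct ibv_pd *pd = ctx ? ibv_alloc_pd(ctx) : NULL;
	struct ibv_mr *mr;

	if (!pd) {
		fprintf(stderr, "no usable RDMA device\n");
		return 1;
	}

	/*
	 * addr == NULL and length == SIZE_MAX reach the kernel as
	 * start == 0 and length == U64_MAX, which selects the implicit
	 * MR path added by this patch.
	 */
	mr = ibv_reg_mr(pd, NULL, SIZE_MAX,
			IBV_ACCESS_ON_DEMAND | IBV_ACCESS_LOCAL_WRITE);
	if (!mr) {
		perror("ibv_reg_mr(implicit ODP)");
		return 1;
	}

	/* Pages are faulted in lazily; nothing is pinned up front. */
	printf("implicit ODP MR registered, lkey=0x%x\n", mr->lkey);

	ibv_dereg_mr(mr);
	ibv_dealloc_pd(pd);
	ibv_close_device(ctx);
	ibv_free_device_list(list);
	return 0;
}

On the kernel side this lands in the !start && length == U64_MAX branch of
mlx5_ib_reg_user_mr() below, which requires both IB_ACCESS_ON_DEMAND and the
new IB_ODP_SUPPORT_IMPLICIT capability.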
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index fe37da2be26f..eb8719ca500e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3583,6 +3583,8 @@ static int __init mlx5_ib_init(void)
 {
 	int err;
 
+	mlx5_ib_odp_init();
+
 	err = mlx5_register_interface(&mlx5_ib_interface);
 
 	return err;
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index efc44de3c7d7..3cd064b5f0bf 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -202,6 +202,7 @@ struct mlx5_ib_flow_db {
 #define MLX5_IB_UPD_XLT_ADDR	      BIT(3)
 #define MLX5_IB_UPD_XLT_PD	      BIT(4)
 #define MLX5_IB_UPD_XLT_ACCESS	      BIT(5)
+#define MLX5_IB_UPD_XLT_INDIRECT      BIT(6)
 
 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
  *
@@ -503,6 +504,10 @@ struct mlx5_ib_mr {
 	int			live;
 	void			*descs_alloc;
 	int			access_flags; /* Needed for rereg MR */
+
+	struct mlx5_ib_mr      *parent;
+	atomic_t		num_leaf_free;
+	wait_queue_head_t	q_leaf_free;
 };
 
 struct mlx5_ib_mw {
@@ -637,6 +642,7 @@ struct mlx5_ib_dev {
 	 * being used by a page fault handler.
 	 */
 	struct srcu_struct	mr_srcu;
+	u32			null_mkey;
 #endif
 	struct mlx5_ib_flow_db	flow_db;
 	/* protect resources needed as part of reset flow */
@@ -789,6 +795,9 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
 int mlx5_ib_dealloc_mw(struct ib_mw *mw);
 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		       int page_shift, int flags);
+struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+					     int access_flags);
+void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 			  u64 length, u64 virt_addr, int access_flags,
 			  struct ib_pd *pd, struct ib_udata *udata);
@@ -868,6 +877,9 @@ int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end);
+void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
+void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
+			   size_t nentries, struct mlx5_ib_mr *mr, int flags);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
@@ -875,9 +887,13 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 }
 
 static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
 static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void) {}
+static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
+static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
+					 size_t nentries, struct mlx5_ib_mr *mr,
+					 int flags) {}
 
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 8f5b94d483e4..3c1f483d003f 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -469,7 +469,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
 		spin_unlock_irq(&ent->lock);
 
 		err = add_keys(dev, entry, 1);
-		if (err)
+		if (err && err != -EAGAIN)
 			return ERR_PTR(err);
 
 		wait_for_completion(&ent->compl);
@@ -669,8 +669,10 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 		queue_work(cache->wq, &ent->work);
 
-		if (i > MAX_UMR_CACHE_ENTRY)
+		if (i > MAX_UMR_CACHE_ENTRY) {
+			mlx5_odp_init_mr_cache_entry(ent);
 			continue;
+		}
 
 		if (!use_umr(dev, ent->order))
 			continue;
@@ -935,6 +937,10 @@ static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages,
 {
 	struct mlx5_ib_dev *dev = mr->dev;
 	struct ib_umem *umem = mr->umem;
+	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
+		mlx5_odp_populate_klm(xlt, idx, npages, mr, flags);
+		return npages;
+	}
 
 	npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx);
 
@@ -968,7 +974,9 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 	struct mlx5_umr_wr wr;
 	struct ib_sge sg;
 	int err = 0;
-	int desc_size = sizeof(struct mlx5_mtt);
+	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
+			? sizeof(struct mlx5_klm)
+			: sizeof(struct mlx5_mtt);
 	const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
 	const int page_mask = page_align - 1;
 	size_t pages_mapped = 0;
@@ -1186,6 +1194,18 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
 		    start, virt_addr, length, access_flags);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (!start && length == U64_MAX) {
+		if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
+		    !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+			return ERR_PTR(-EINVAL);
+
+		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
+		return &mr->ibmr;
+	}
+#endif
+
 	err = mr_umem_get(pd, start, length, access_flags, &umem, &npages,
 			  &page_shift, &ncont, &order);
 
@@ -1471,8 +1491,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
 		/* Wait for all running page-fault handlers to finish. */
 		synchronize_srcu(&dev->mr_srcu);
 		/* Destroy all page mappings */
-		mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
-					 ib_umem_end(umem));
+		if (umem->odp_data->page_list)
+			mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
+						 ib_umem_end(umem));
+		else
+			mlx5_ib_free_implicit_mr(mr);
 		/*
 		 * We kill the umem before the MR for ODP,
 		 * so that there will not be any invalidations in
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index e5bc267aca73..d7b12f0750e2 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -34,6 +34,7 @@
 #include <rdma/ib_umem_odp.h>
 
 #include "mlx5_ib.h"
+#include "cmd.h"
 
 #define MAX_PREFETCH_LEN (4*1024*1024U)
 
@@ -41,6 +42,140 @@
  * a pagefault. */
 #define MMU_NOTIFIER_TIMEOUT 1000
 
+#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
+#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
+#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
+#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
+#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
+
+#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
+
+static u64 mlx5_imr_ksm_entries;
+
+static int check_parent(struct ib_umem_odp *odp,
+			struct mlx5_ib_mr *parent)
+{
+	struct mlx5_ib_mr *mr = odp->private;
+
+	return mr && mr->parent == parent;
+}
+
+static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
+{
+	struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
+	struct ib_ucontext *ctx = odp->umem->context;
+	struct rb_node *rb;
+
+	down_read(&ctx->umem_rwsem);
+	while (1) {
+		rb = rb_next(&odp->interval_tree.rb);
+		if (!rb)
+			goto not_found;
+		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
+		if (check_parent(odp, parent))
+			goto end;
+	}
+not_found:
+	odp = NULL;
+end:
+	up_read(&ctx->umem_rwsem);
+	return odp;
+}
+
+static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx,
+				      u64 start, u64 length,
+				      struct mlx5_ib_mr *parent)
+{
+	struct ib_umem_odp *odp;
+	struct rb_node *rb;
+
+	down_read(&ctx->umem_rwsem);
+	odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length);
+	if (!odp)
+		goto end;
+
+	while (1) {
+		if (check_parent(odp, parent))
+			goto end;
+		rb = rb_next(&odp->interval_tree.rb);
+		if (!rb)
+			goto not_found;
+		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
+		if (ib_umem_start(odp->umem) > start + length)
+			goto not_found;
+	}
+not_found:
+	odp = NULL;
+end:
+	up_read(&ctx->umem_rwsem);
+	return odp;
+}
+
+void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
+			   size_t nentries, struct mlx5_ib_mr *mr, int flags)
+{
+	struct ib_pd *pd = mr->ibmr.pd;
+	struct ib_ucontext *ctx = pd->uobject->context;
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct ib_umem_odp *odp;
+	unsigned long va;
+	int i;
+
+	if (flags & MLX5_IB_UPD_XLT_ZAP) {
+		for (i = 0; i < nentries; i++, pklm++) {
+			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
+			pklm->key = cpu_to_be32(dev->null_mkey);
+			pklm->va = 0;
+		}
+		return;
+	}
+
+	odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE,
+			 nentries * MLX5_IMR_MTT_SIZE, mr);
+
+	for (i = 0; i < nentries; i++, pklm++) {
+		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
+		va = (offset + i) * MLX5_IMR_MTT_SIZE;
+		if (odp && odp->umem->address == va) {
+			struct mlx5_ib_mr *mtt = odp->private;
+
+			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
+			odp = odp_next(odp);
+		} else {
+			pklm->key = cpu_to_be32(dev->null_mkey);
+		}
+		mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
+			    i, va, be32_to_cpu(pklm->key));
+	}
+}
+
+static void mr_leaf_free_action(struct work_struct *work)
+{
+	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
+	int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT;
+	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+
+	mr->parent = NULL;
+	synchronize_srcu(&mr->dev->mr_srcu);
+
+	if (!READ_ONCE(odp->dying)) {
+		mr->parent = imr;
+		if (atomic_dec_and_test(&imr->num_leaf_free))
+			wake_up(&imr->q_leaf_free);
+		return;
+	}
+
+	ib_umem_release(odp->umem);
+	if (imr->live)
+		mlx5_ib_update_xlt(imr, idx, 1, 0,
+				   MLX5_IB_UPD_XLT_INDIRECT |
+				   MLX5_IB_UPD_XLT_ATOMIC);
+	mlx5_mr_cache_free(mr->dev, mr);
+
+	if (atomic_dec_and_test(&imr->num_leaf_free))
+		wake_up(&imr->q_leaf_free);
+}
+
 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end)
 {
@@ -111,6 +246,13 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 	 */
 
 	ib_umem_odp_unmap_dma_pages(umem, start, end);
+
+	if (unlikely(!umem->npages && mr->parent &&
+		     !umem->odp_data->dying)) {
+		WRITE_ONCE(umem->odp_data->dying, 1);
+		atomic_inc(&mr->parent->num_leaf_free);
+		schedule_work(&umem->odp_data->work);
+	}
 }
 
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
@@ -147,6 +289,11 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
 
+	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
+	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
+	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
+		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
+
 	return;
 }
 
@@ -184,6 +331,197 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
 				    wq_num);
 }
 
+static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
+					    struct ib_umem *umem,
+					    bool ksm, int access_flags)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_mr *mr;
+	int err;
+
+	mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
+					    MLX5_IMR_MTT_CACHE_ENTRY);
+
+	if (IS_ERR(mr))
+		return mr;
+
+	mr->ibmr.pd = pd;
+
+	mr->dev = dev;
+	mr->access_flags = access_flags;
+	mr->mmkey.iova = 0;
+	mr->umem = umem;
+
+	if (ksm) {
+		err = mlx5_ib_update_xlt(mr, 0,
+					 mlx5_imr_ksm_entries,
+					 MLX5_KSM_PAGE_SHIFT,
+					 MLX5_IB_UPD_XLT_INDIRECT |
+					 MLX5_IB_UPD_XLT_ZAP |
+					 MLX5_IB_UPD_XLT_ENABLE);
+
+	} else {
+		err = mlx5_ib_update_xlt(mr, 0,
+					 MLX5_IMR_MTT_ENTRIES,
+					 PAGE_SHIFT,
+					 MLX5_IB_UPD_XLT_ZAP |
+					 MLX5_IB_UPD_XLT_ENABLE |
+					 MLX5_IB_UPD_XLT_ATOMIC);
+	}
+
+	if (err)
+		goto fail;
+
+	mr->ibmr.lkey = mr->mmkey.key;
+	mr->ibmr.rkey = mr->mmkey.key;
+
+	mr->live = 1;
+
+	mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
+		    mr->mmkey.key, dev->mdev, mr);
+
+	return mr;
+
+fail:
+	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
+	mlx5_mr_cache_free(dev, mr);
+
+	return ERR_PTR(err);
+}
+
+static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
+						u64 io_virt, size_t bcnt)
+{
+	struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context;
+	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
+	struct ib_umem_odp *odp, *result = NULL;
+	u64 addr = io_virt & MLX5_IMR_MTT_MASK;
+	int nentries = 0, start_idx = 0, ret;
+	struct mlx5_ib_mr *mtt;
+	struct ib_umem *umem;
+
+	mutex_lock(&mr->umem->odp_data->umem_mutex);
+	odp = odp_lookup(ctx, addr, 1, mr);
+
+	mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
+		    io_virt, bcnt, addr, odp);
+
+next_mr:
+	if (likely(odp)) {
+		if (nentries)
+			nentries++;
+	} else {
+		umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE);
+		if (IS_ERR(umem)) {
+			mutex_unlock(&mr->umem->odp_data->umem_mutex);
+			return ERR_CAST(umem);
+		}
+
+		mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags);
+		if (IS_ERR(mtt)) {
+			mutex_unlock(&mr->umem->odp_data->umem_mutex);
+			ib_umem_release(umem);
+			return ERR_CAST(mtt);
+		}
+
+		odp = umem->odp_data;
+		odp->private = mtt;
+		mtt->umem = umem;
+		mtt->mmkey.iova = addr;
+		mtt->parent = mr;
+		INIT_WORK(&odp->work, mr_leaf_free_action);
+
+		if (!nentries)
+			start_idx = addr >> MLX5_IMR_MTT_SHIFT;
+		nentries++;
+	}
+
+	odp->dying = 0;
+
+	/* Return first odp if region not covered by single one */
+	if (likely(!result))
+		result = odp;
+
+	addr += MLX5_IMR_MTT_SIZE;
+	if (unlikely(addr < io_virt + bcnt)) {
+		odp = odp_next(odp);
+		if (odp && odp->umem->address != addr)
+			odp = NULL;
+		goto next_mr;
+	}
+
+	if (unlikely(nentries)) {
+		ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
+					 MLX5_IB_UPD_XLT_INDIRECT |
+					 MLX5_IB_UPD_XLT_ATOMIC);
+		if (ret) {
+			mlx5_ib_err(dev, "Failed to update PAS\n");
+			result = ERR_PTR(ret);
+		}
+	}
+
+	mutex_unlock(&mr->umem->odp_data->umem_mutex);
+	return result;
+}
+
+struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+					     int access_flags)
+{
+	struct ib_ucontext *ctx = pd->ibpd.uobject->context;
+	struct mlx5_ib_mr *imr;
+	struct ib_umem *umem;
+
+	umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
+	if (IS_ERR(umem))
+		return ERR_CAST(umem);
+
+	imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
+	if (IS_ERR(imr)) {
+		ib_umem_release(umem);
+		return ERR_CAST(imr);
+	}
+
+	imr->umem = umem;
+	init_waitqueue_head(&imr->q_leaf_free);
+	atomic_set(&imr->num_leaf_free, 0);
+
+	return imr;
+}
+
+static int mr_leaf_free(struct ib_umem *umem, u64 start,
+			u64 end, void *cookie)
+{
+	struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie;
+
+	if (mr->parent != imr)
+		return 0;
+
+	ib_umem_odp_unmap_dma_pages(umem,
+				    ib_umem_start(umem),
+				    ib_umem_end(umem));
+
+	if (umem->odp_data->dying)
+		return 0;
+
+	WRITE_ONCE(umem->odp_data->dying, 1);
+	atomic_inc(&imr->num_leaf_free);
+	schedule_work(&umem->odp_data->work);
+
+	return 0;
+}
+
+void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
+{
+	struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context;
+
+	down_read(&ctx->umem_rwsem);
+	rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
+				      mr_leaf_free, imr);
+	up_read(&ctx->umem_rwsem);
+
+	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
+}
+
 /*
  * Handle a single data segment in a page-fault WQE or RDMA region.
  *
@@ -195,47 +533,43 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
  * -EFAULT when there's an error mapping the requested pages. The caller will
  * abort the page fault handling.
  */
-static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev,
+static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 					 u32 key, u64 io_virt, size_t bcnt,
 					 u32 *bytes_committed,
 					 u32 *bytes_mapped)
 {
 	int srcu_key;
-	unsigned int current_seq;
+	unsigned int current_seq = 0;
 	u64 start_idx;
 	int npages = 0, ret = 0;
 	struct mlx5_ib_mr *mr;
 	u64 access_mask = ODP_READ_ALLOWED_BIT;
+	struct ib_umem_odp *odp;
+	int implicit = 0;
+	size_t size;
 
-	srcu_key = srcu_read_lock(&mib_dev->mr_srcu);
-	mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key);
+	srcu_key = srcu_read_lock(&dev->mr_srcu);
+	mr = mlx5_ib_odp_find_mr_lkey(dev, key);
 	/*
 	 * If we didn't find the MR, it means the MR was closed while we were
 	 * handling the ODP event. In this case we return -EFAULT so that the
 	 * QP will be closed.
 	 */
 	if (!mr || !mr->ibmr.pd) {
-		pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
+		mlx5_ib_dbg(dev, "Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
 		       key);
 		ret = -EFAULT;
 		goto srcu_unlock;
 	}
 	if (!mr->umem->odp_data) {
-		pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+		mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
 			 key);
 		if (bytes_mapped)
 			*bytes_mapped +=
 				(bcnt - *bytes_committed);
 		goto srcu_unlock;
 	}
 
-	current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
-	/*
-	 * Ensure the sequence number is valid for some time before we call
-	 * gup.
-	 */
-	smp_rmb();
-
 	/*
 	 * Avoid branches - this code will perform correctly
 	 * in all iterations (in iteration 2 and above,
@@ -244,63 +578,109 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev,
 	io_virt += *bytes_committed;
 	bcnt -= *bytes_committed;
 
+	if (!mr->umem->odp_data->page_list) {
+		odp = implicit_mr_get_data(mr, io_virt, bcnt);
+
+		if (IS_ERR(odp)) {
+			ret = PTR_ERR(odp);
+			goto srcu_unlock;
+		}
+		mr = odp->private;
+		implicit = 1;
+
+	} else {
+		odp = mr->umem->odp_data;
+	}
+
+next_mr:
+	current_seq = READ_ONCE(odp->notifiers_seq);
+	/*
+	 * Ensure the sequence number is valid for some time before we call
+	 * gup.
+	 */
+	smp_rmb();
+
+	size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt);
 	start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT;
 
 	if (mr->umem->writable)
 		access_mask |= ODP_WRITE_ALLOWED_BIT;
-	npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
-					   access_mask, current_seq);
-	if (npages < 0) {
-		ret = npages;
+
+	ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size,
+					access_mask, current_seq);
+
+	if (ret < 0)
 		goto srcu_unlock;
-	}
 
-	if (npages > 0) {
-		mutex_lock(&mr->umem->odp_data->umem_mutex);
+	if (ret > 0) {
+		int np = ret;
+
+		mutex_lock(&odp->umem_mutex);
 		if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
 			/*
 			 * No need to check whether the MTTs really belong to
 			 * this MR, since ib_umem_odp_map_dma_pages already
 			 * checks this.
 			 */
-			ret = mlx5_ib_update_xlt(mr, start_idx, npages,
+			ret = mlx5_ib_update_xlt(mr, start_idx, np,
 						 PAGE_SHIFT,
 						 MLX5_IB_UPD_XLT_ATOMIC);
 		} else {
 			ret = -EAGAIN;
 		}
-		mutex_unlock(&mr->umem->odp_data->umem_mutex);
+		mutex_unlock(&odp->umem_mutex);
 		if (ret < 0) {
 			if (ret != -EAGAIN)
-				pr_err("Failed to update mkey page tables\n");
+				mlx5_ib_err(dev, "Failed to update mkey page tables\n");
 			goto srcu_unlock;
 		}
 
 		if (bytes_mapped) {
-			u32 new_mappings = npages * PAGE_SIZE -
+			u32 new_mappings = np * PAGE_SIZE -
 				(io_virt - round_down(io_virt, PAGE_SIZE));
-			*bytes_mapped += min_t(u32, new_mappings, bcnt);
+			*bytes_mapped += min_t(u32, new_mappings, size);
 		}
+
+		npages += np;
+	}
+
+	bcnt -= size;
+	if (unlikely(bcnt)) {
+		struct ib_umem_odp *next;
+
+		io_virt += size;
+		next = odp_next(odp);
+		if (unlikely(!next || next->umem->address != io_virt)) {
+			mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
+				    io_virt, next);
+			ret = -EAGAIN;
+			goto srcu_unlock_no_wait;
+		}
+		odp = next;
+		mr = odp->private;
+		goto next_mr;
 	}
 
 srcu_unlock:
 	if (ret == -EAGAIN) {
-		if (!mr->umem->odp_data->dying) {
-			struct ib_umem_odp *odp_data = mr->umem->odp_data;
+		if (implicit || !odp->dying) {
 			unsigned long timeout =
 				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
 
 			if (!wait_for_completion_timeout(
-					&odp_data->notifier_completion,
+					&odp->notifier_completion,
 					timeout)) {
-				pr_warn("timeout waiting for mmu notifier completion\n");
+				mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n",
+					     current_seq, odp->notifiers_seq);
 			}
 		} else {
 			/* The MR is being killed, kill the QP as well. */
 			ret = -EFAULT;
 		}
 	}
-	srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
+
+srcu_unlock_no_wait:
+	srcu_read_unlock(&dev->mr_srcu, srcu_key);
 	*bytes_committed = 0;
 	return ret ? ret : npages;
 }
@@ -618,8 +998,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
 		goto resolve_page_fault;
 	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
 		if (ret != -ENOENT)
-			mlx5_ib_err(dev, "Error getting user pages for page fault. Error: %d\n",
-				    ret);
+			mlx5_ib_err(dev, "PAGE FAULT error: %d. QP 0x%x. type: 0x%x\n",
+				    ret, pfault->wqe.wq_num, pfault->type);
 		goto resolve_page_fault;
 	}
 
@@ -627,7 +1007,7 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
 resolve_page_fault:
 	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
 	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
-		    pfault->token, resume_with_error,
+		    pfault->wqe.wq_num, resume_with_error,
 		    pfault->type);
 	free_page((unsigned long)buffer);
 }
@@ -700,10 +1080,9 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
 		ret = pagefault_single_data_segment(dev, rkey, address,
 						    prefetch_len,
 						    &bytes_committed, NULL);
-		if (ret < 0) {
+		if (ret < 0 && ret != -EAGAIN) {
 			mlx5_ib_warn(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
-				     ret, pfault->token, address,
-				     prefetch_len);
+				     ret, pfault->token, address, prefetch_len);
 		}
 	}
 }
@@ -728,19 +1107,61 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
 	}
 }
 
-int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
+void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
+{
+	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+		return;
+
+	switch (ent->order - 2) {
+	case MLX5_IMR_MTT_CACHE_ENTRY:
+		ent->page = PAGE_SHIFT;
+		ent->xlt = MLX5_IMR_MTT_ENTRIES *
+			   sizeof(struct mlx5_mtt) /
+			   MLX5_IB_UMR_OCTOWORD;
+		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
+		ent->limit = 0;
+		break;
+
+	case MLX5_IMR_KSM_CACHE_ENTRY:
+		ent->page = MLX5_KSM_PAGE_SHIFT;
+		ent->xlt = mlx5_imr_ksm_entries *
+			   sizeof(struct mlx5_klm) /
+			   MLX5_IB_UMR_OCTOWORD;
+		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
+		ent->limit = 0;
+		break;
+	}
+}
+
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
 {
 	int ret;
 
-	ret = init_srcu_struct(&ibdev->mr_srcu);
+	ret = init_srcu_struct(&dev->mr_srcu);
 	if (ret)
 		return ret;
 
+	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
+		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
+		if (ret) {
+			mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
+			return ret;
+		}
+	}
+
 	return 0;
 }
 
-void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *dev)
+{
+	cleanup_srcu_struct(&dev->mr_srcu);
+}
+
+int mlx5_ib_odp_init(void)
 {
-	cleanup_srcu_struct(&ibdev->mr_srcu);
+	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
+				       MLX5_IMR_MTT_BITS);
+
+	return 0;
 }
 
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2534b8a0fd7b..886ff2b00500 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1053,6 +1053,8 @@ enum {
 
 enum {
 	MAX_UMR_CACHE_ENTRY	= 20,
+	MLX5_IMR_MTT_CACHE_ENTRY,
+	MLX5_IMR_KSM_CACHE_ENTRY,
 	MAX_MR_CACHE_ENTRIES
 };
 
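
As a closing note on the sizing used above: with the macros added to odp.c, each
leaf ("direct") MR covers MLX5_IMR_MTT_SIZE = 1 GiB, and mlx5_imr_ksm_entries is
the number of 1 GiB slots needed to span TASK_SIZE. A small standalone sketch of
that arithmetic, assuming 4 KiB pages and a 47-bit user address space as on
x86-64 (the values are illustrative and follow from PAGE_SHIFT and TASK_SIZE,
not fixed by the patch):

/*
 * Sizing sketch for the implicit MR hierarchy. Illustrative only:
 * assumes PAGE_SHIFT == 12 and a 47-bit user address space.
 */
#include <stdio.h>

#define PAGE_SHIFT		12
#define TASK_SIZE_BITS		47

#define MLX5_IMR_MTT_BITS	(30 - PAGE_SHIFT)		 /* 18 */
#define MLX5_IMR_MTT_SHIFT	(MLX5_IMR_MTT_BITS + PAGE_SHIFT) /* 30 */
#define MLX5_IMR_MTT_ENTRIES	(1ULL << MLX5_IMR_MTT_BITS)	 /* 262144 pages */
#define MLX5_IMR_MTT_SIZE	(1ULL << MLX5_IMR_MTT_SHIFT)	 /* 1 GiB */

int main(void)
{
	/* get_order(TASK_SIZE) == TASK_SIZE_BITS - PAGE_SHIFT in this setup */
	unsigned long long ksm_entries =
		1ULL << ((TASK_SIZE_BITS - PAGE_SHIFT) - MLX5_IMR_MTT_BITS);

	printf("leaf (direct) MR size  : %llu GiB\n", MLX5_IMR_MTT_SIZE >> 30);
	printf("pages per leaf MR      : %llu\n", MLX5_IMR_MTT_ENTRIES);
	printf("KSM entries in the IMR : %llu\n", ksm_entries);   /* 131072 */
	printf("address space covered  : %llu TiB\n",
	       (ksm_entries * MLX5_IMR_MTT_SIZE) >> 40);          /* 128 TiB */
	return 0;
}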