 drivers/infiniband/hw/mlx5/main.c    |   2 +
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  20 ++
 drivers/infiniband/hw/mlx5/mr.c      |  33 +-
 drivers/infiniband/hw/mlx5/odp.c     | 505 ++++++++++++++++++++++++++++++---
 include/linux/mlx5/driver.h          |   2 +
 5 files changed, 513 insertions(+), 49 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index fe37da2be26f..eb8719ca500e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3583,6 +3583,8 @@ static int __init mlx5_ib_init(void)
 {
 	int err;
 
+	mlx5_ib_odp_init();
+
 	err = mlx5_register_interface(&mlx5_ib_interface);
 
 	return err;
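
Ordering note: mlx5_ib_odp_init() runs once at module load, before mlx5_register_interface() can bind any device, so the global mlx5_imr_ksm_entries it computes (last odp.c hunk below) is ready before any per-device mlx5_ib_odp_init_one() call.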
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index efc44de3c7d7..3cd064b5f0bf 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -202,6 +202,7 @@ struct mlx5_ib_flow_db {
 #define MLX5_IB_UPD_XLT_ADDR	      BIT(3)
 #define MLX5_IB_UPD_XLT_PD	      BIT(4)
 #define MLX5_IB_UPD_XLT_ACCESS	      BIT(5)
+#define MLX5_IB_UPD_XLT_INDIRECT      BIT(6)
 
 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
  *
@@ -503,6 +504,10 @@ struct mlx5_ib_mr {
 	int			live;
 	void			*descs_alloc;
 	int			access_flags; /* Needed for rereg MR */
+
+	struct mlx5_ib_mr      *parent;
+	atomic_t		num_leaf_free;
+	wait_queue_head_t       q_leaf_free;
 };
 
 struct mlx5_ib_mw {
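
These three fields turn the MR into a two-level tree: each 1 GiB leaf MR points back at its implicit parent, and the parent can block until asynchronously freed leaves drain. A condensed sketch of that rendezvous, pieced together from the odp.c hunks below:

	/* leaf side, once its last page is invalidated: */
	WRITE_ONCE(odp->dying, 1);
	atomic_inc(&imr->num_leaf_free);
	schedule_work(&odp->work);		/* runs mr_leaf_free_action() */

	/* mr_leaf_free_action(), after releasing the leaf: */
	if (atomic_dec_and_test(&imr->num_leaf_free))
		wake_up(&imr->q_leaf_free);

	/* parent side, in mlx5_ib_free_implicit_mr(): */
	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));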
@@ -637,6 +642,7 @@ struct mlx5_ib_dev {
 	 * being used by a page fault handler.
 	 */
 	struct srcu_struct      mr_srcu;
+	u32			null_mkey;
 #endif
 	struct mlx5_ib_flow_db	flow_db;
 	/* protect resources needed as part of reset flow */
@@ -789,6 +795,9 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
 int mlx5_ib_dealloc_mw(struct ib_mw *mw);
 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		       int page_shift, int flags);
+struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+					     int access_flags);
+void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 			  u64 length, u64 virt_addr, int access_flags,
 			  struct ib_pd *pd, struct ib_udata *udata);
@@ -868,6 +877,9 @@ int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end);
+void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
+void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
+			   size_t nentries, struct mlx5_ib_mr *mr, int flags);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
@@ -875,9 +887,13 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 }
 
 static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
 static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {}
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void) {}
+static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
+static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
+					 size_t nentries, struct mlx5_ib_mr *mr,
+					 int flags) {}
 
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 8f5b94d483e4..3c1f483d003f 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -469,7 +469,7 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
 		spin_unlock_irq(&ent->lock);
 
 		err = add_keys(dev, entry, 1);
-		if (err)
+		if (err && err != -EAGAIN)
 			return ERR_PTR(err);
 
 		wait_for_completion(&ent->compl);
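
With implicit ODP the cache may legitimately report -EAGAIN from add_keys() (asynchronous key creation); the caller now treats it as "key under way" and still blocks on ent->compl rather than failing the allocation.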
@@ -669,8 +669,10 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 		queue_work(cache->wq, &ent->work);
 
-		if (i > MAX_UMR_CACHE_ENTRY)
+		if (i > MAX_UMR_CACHE_ENTRY) {
+			mlx5_odp_init_mr_cache_entry(ent);
 			continue;
+		}
 
 		if (!use_umr(dev, ent->order))
 			continue;
@@ -935,6 +937,10 @@ static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages,
 {
 	struct mlx5_ib_dev *dev = mr->dev;
 	struct ib_umem *umem = mr->umem;
+	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
+		mlx5_odp_populate_klm(xlt, idx, npages, mr, flags);
+		return npages;
+	}
 
 	npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx);
 
@@ -968,7 +974,9 @@ int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 	struct mlx5_umr_wr wr;
 	struct ib_sge sg;
 	int err = 0;
-	int desc_size = sizeof(struct mlx5_mtt);
+	int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT)
+			? sizeof(struct mlx5_klm)
+			: sizeof(struct mlx5_mtt);
 	const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size;
 	const int page_mask = page_align - 1;
 	size_t pages_mapped = 0;
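
One UMR path now emits either layout: direct MTT descriptors (struct mlx5_mtt, 8 bytes, one per page) or indirect KLM descriptors (struct mlx5_klm, 16 bytes, one per child mkey). Only desc_size differs; the alignment and chunking logic below is shared.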
@@ -1186,6 +1194,18 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
 		    start, virt_addr, length, access_flags);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (!start && length == U64_MAX) {
+		if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
+		    !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+			return ERR_PTR(-EINVAL);
+
+		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
+		return &mr->ibmr;
+	}
+#endif
+
 	err = mr_umem_get(pd, start, length, access_flags, &umem, &npages,
 			  &page_shift, &ncont, &order);
 
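For reference, a minimal userspace sketch of what reaches this branch, assuming a libibverbs build with ODP support; addr == NULL with length == SIZE_MAX is the implicit-ODP registration convention this check keys on:

	#include <stdint.h>
	#include <infiniband/verbs.h>

	/* Register the entire address space as one implicit ODP MR. */
	static struct ibv_mr *reg_implicit(struct ibv_pd *pd)
	{
		return ibv_reg_mr(pd, NULL, SIZE_MAX,
				  IBV_ACCESS_ON_DEMAND | IBV_ACCESS_LOCAL_WRITE);
	}
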
@@ -1471,8 +1491,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
 		/* Wait for all running page-fault handlers to finish. */
 		synchronize_srcu(&dev->mr_srcu);
 		/* Destroy all page mappings */
-		mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
-					 ib_umem_end(umem));
+		if (umem->odp_data->page_list)
+			mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
+						 ib_umem_end(umem));
+		else
+			mlx5_ib_free_implicit_mr(mr);
 		/*
 		 * We kill the umem before the MR for ODP,
 		 * so that there will not be any invalidations in
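
The discriminator is odp_data->page_list: an implicit parent's umem is created with zero length (see mlx5_ib_alloc_implicit_mr() below) and never holds pages itself, so a NULL page list means "tear down the leaf tree" rather than "invalidate my own mappings".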
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index e5bc267aca73..d7b12f0750e2 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -34,6 +34,7 @@
 #include <rdma/ib_umem_odp.h>
 
 #include "mlx5_ib.h"
+#include "cmd.h"
 
 #define MAX_PREFETCH_LEN (4*1024*1024U)
 
@@ -41,6 +42,140 @@
  * a pagefault. */
 #define MMU_NOTIFIER_TIMEOUT 1000
 
+#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
+#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
+#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
+#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
+#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
+
+#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
+
+static u64 mlx5_imr_ksm_entries;
+
+static int check_parent(struct ib_umem_odp *odp,
+			struct mlx5_ib_mr *parent)
+{
+	struct mlx5_ib_mr *mr = odp->private;
+
+	return mr && mr->parent == parent;
+}
+
+static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
+{
+	struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
+	struct ib_ucontext *ctx = odp->umem->context;
+	struct rb_node *rb;
+
+	down_read(&ctx->umem_rwsem);
+	while (1) {
+		rb = rb_next(&odp->interval_tree.rb);
+		if (!rb)
+			goto not_found;
+		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
+		if (check_parent(odp, parent))
+			goto end;
+	}
+not_found:
+	odp = NULL;
+end:
+	up_read(&ctx->umem_rwsem);
+	return odp;
+}
+
+static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx,
+				      u64 start, u64 length,
+				      struct mlx5_ib_mr *parent)
+{
+	struct ib_umem_odp *odp;
+	struct rb_node *rb;
+
+	down_read(&ctx->umem_rwsem);
+	odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length);
+	if (!odp)
+		goto end;
+
+	while (1) {
+		if (check_parent(odp, parent))
+			goto end;
+		rb = rb_next(&odp->interval_tree.rb);
+		if (!rb)
+			goto not_found;
+		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
+		if (ib_umem_start(odp->umem) > start + length)
+			goto not_found;
+	}
+not_found:
+	odp = NULL;
+end:
+	up_read(&ctx->umem_rwsem);
+	return odp;
+}
+
+void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
+			   size_t nentries, struct mlx5_ib_mr *mr, int flags)
+{
+	struct ib_pd *pd = mr->ibmr.pd;
+	struct ib_ucontext *ctx = pd->uobject->context;
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct ib_umem_odp *odp;
+	unsigned long va;
+	int i;
+
+	if (flags & MLX5_IB_UPD_XLT_ZAP) {
+		for (i = 0; i < nentries; i++, pklm++) {
+			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
+			pklm->key = cpu_to_be32(dev->null_mkey);
+			pklm->va = 0;
+		}
+		return;
+	}
+
+	odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE,
+			 nentries * MLX5_IMR_MTT_SIZE, mr);
+
+	for (i = 0; i < nentries; i++, pklm++) {
+		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
+		va = (offset + i) * MLX5_IMR_MTT_SIZE;
+		if (odp && odp->umem->address == va) {
+			struct mlx5_ib_mr *mtt = odp->private;
+
+			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
+			odp = odp_next(odp);
+		} else {
+			pklm->key = cpu_to_be32(dev->null_mkey);
+		}
+		mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
+			    i, va, be32_to_cpu(pklm->key));
+	}
+}
+
+static void mr_leaf_free_action(struct work_struct *work)
+{
+	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
+	int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT;
+	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+
+	mr->parent = NULL;
+	synchronize_srcu(&mr->dev->mr_srcu);
+
+	if (!READ_ONCE(odp->dying)) {
+		mr->parent = imr;
+		if (atomic_dec_and_test(&imr->num_leaf_free))
+			wake_up(&imr->q_leaf_free);
+		return;
+	}
+
+	ib_umem_release(odp->umem);
+	if (imr->live)
+		mlx5_ib_update_xlt(imr, idx, 1, 0,
+				   MLX5_IB_UPD_XLT_INDIRECT |
+				   MLX5_IB_UPD_XLT_ATOMIC);
+	mlx5_mr_cache_free(mr->dev, mr);
+
+	if (atomic_dec_and_test(&imr->num_leaf_free))
+		wake_up(&imr->q_leaf_free);
+}
+
 void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 			      unsigned long end)
 {
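
Worked geometry for the constants above, assuming PAGE_SHIFT = 12 (4 KiB pages):

	/*
	 * MLX5_IMR_MTT_BITS  = 30 - 12 = 18  ->  2^18 pages per leaf
	 * MLX5_IMR_MTT_SIZE  = 1 << 30       ->  each leaf MR covers 1 GiB
	 *
	 * KLM slot i of the parent thus maps va = (offset + i) << 30 and
	 * carries either the leaf's lkey or null_mkey where no leaf exists.
	 */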
@@ -111,6 +246,13 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
 	 */
 
 	ib_umem_odp_unmap_dma_pages(umem, start, end);
+
+	if (unlikely(!umem->npages && mr->parent &&
+		     !umem->odp_data->dying)) {
+		WRITE_ONCE(umem->odp_data->dying, 1);
+		atomic_inc(&mr->parent->num_leaf_free);
+		schedule_work(&umem->odp_data->work);
+	}
 }
 
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
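
An invalidation that empties a leaf cannot free it inline: this path runs in mmu-notifier context, while teardown needs synchronize_srcu() and ib_umem_release(). The leaf is therefore flagged dying and queued to mr_leaf_free_action() above; num_leaf_free keeps mlx5_ib_free_implicit_mr() aware of work items still in flight.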
@@ -147,6 +289,11 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
 
+	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
+	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
+	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
+		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
+
 	return;
 }
 
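Implicit ODP is advertised only when the HCA exposes all three capabilities; mlx5_ib_reg_user_mr() (mr.c hunk above) rejects the NULL/U64_MAX registration unless IB_ODP_SUPPORT_IMPLICIT made it into general_caps.
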
@@ -184,6 +331,197 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
 			    wq_num);
 }
 
+static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
+					    struct ib_umem *umem,
+					    bool ksm, int access_flags)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_mr *mr;
+	int err;
+
+	mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
+					    MLX5_IMR_MTT_CACHE_ENTRY);
+
+	if (IS_ERR(mr))
+		return mr;
+
+	mr->ibmr.pd = pd;
+
+	mr->dev = dev;
+	mr->access_flags = access_flags;
+	mr->mmkey.iova = 0;
+	mr->umem = umem;
+
+	if (ksm) {
+		err = mlx5_ib_update_xlt(mr, 0,
+					 mlx5_imr_ksm_entries,
+					 MLX5_KSM_PAGE_SHIFT,
+					 MLX5_IB_UPD_XLT_INDIRECT |
+					 MLX5_IB_UPD_XLT_ZAP |
+					 MLX5_IB_UPD_XLT_ENABLE);
+
+	} else {
+		err = mlx5_ib_update_xlt(mr, 0,
+					 MLX5_IMR_MTT_ENTRIES,
+					 PAGE_SHIFT,
+					 MLX5_IB_UPD_XLT_ZAP |
+					 MLX5_IB_UPD_XLT_ENABLE |
+					 MLX5_IB_UPD_XLT_ATOMIC);
+	}
+
+	if (err)
+		goto fail;
+
+	mr->ibmr.lkey = mr->mmkey.key;
+	mr->ibmr.rkey = mr->mmkey.key;
+
+	mr->live = 1;
+
+	mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
+		    mr->mmkey.key, dev->mdev, mr);
+
+	return mr;
+
+fail:
+	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
+	mlx5_mr_cache_free(dev, mr);
+
+	return ERR_PTR(err);
+}
+
+static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
+						u64 io_virt, size_t bcnt)
+{
+	struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context;
+	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
+	struct ib_umem_odp *odp, *result = NULL;
+	u64 addr = io_virt & MLX5_IMR_MTT_MASK;
+	int nentries = 0, start_idx = 0, ret;
+	struct mlx5_ib_mr *mtt;
+	struct ib_umem *umem;
+
+	mutex_lock(&mr->umem->odp_data->umem_mutex);
+	odp = odp_lookup(ctx, addr, 1, mr);
+
+	mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
+		    io_virt, bcnt, addr, odp);
+
+next_mr:
+	if (likely(odp)) {
+		if (nentries)
+			nentries++;
+	} else {
+		umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE);
+		if (IS_ERR(umem)) {
+			mutex_unlock(&mr->umem->odp_data->umem_mutex);
+			return ERR_CAST(umem);
+		}
+
+		mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags);
+		if (IS_ERR(mtt)) {
+			mutex_unlock(&mr->umem->odp_data->umem_mutex);
+			ib_umem_release(umem);
+			return ERR_CAST(mtt);
+		}
+
+		odp = umem->odp_data;
+		odp->private = mtt;
+		mtt->umem = umem;
+		mtt->mmkey.iova = addr;
+		mtt->parent = mr;
+		INIT_WORK(&odp->work, mr_leaf_free_action);
+
+		if (!nentries)
+			start_idx = addr >> MLX5_IMR_MTT_SHIFT;
+		nentries++;
+	}
+
+	odp->dying = 0;
+
+	/* Return first odp if region not covered by single one */
+	if (likely(!result))
+		result = odp;
+
+	addr += MLX5_IMR_MTT_SIZE;
+	if (unlikely(addr < io_virt + bcnt)) {
+		odp = odp_next(odp);
+		if (odp && odp->umem->address != addr)
+			odp = NULL;
+		goto next_mr;
+	}
+
+	if (unlikely(nentries)) {
+		ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
+					 MLX5_IB_UPD_XLT_INDIRECT |
+					 MLX5_IB_UPD_XLT_ATOMIC);
+		if (ret) {
+			mlx5_ib_err(dev, "Failed to update PAS\n");
+			result = ERR_PTR(ret);
+		}
+	}
+
+	mutex_unlock(&mr->umem->odp_data->umem_mutex);
+	return result;
+}
+
+struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
+					     int access_flags)
+{
+	struct ib_ucontext *ctx = pd->ibpd.uobject->context;
+	struct mlx5_ib_mr *imr;
+	struct ib_umem *umem;
+
+	umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0);
+	if (IS_ERR(umem))
+		return ERR_CAST(umem);
+
+	imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags);
+	if (IS_ERR(imr)) {
+		ib_umem_release(umem);
+		return ERR_CAST(imr);
+	}
+
+	imr->umem = umem;
+	init_waitqueue_head(&imr->q_leaf_free);
+	atomic_set(&imr->num_leaf_free, 0);
+
+	return imr;
+}
+
+static int mr_leaf_free(struct ib_umem *umem, u64 start,
+			u64 end, void *cookie)
+{
+	struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie;
+
+	if (mr->parent != imr)
+		return 0;
+
+	ib_umem_odp_unmap_dma_pages(umem,
+				    ib_umem_start(umem),
+				    ib_umem_end(umem));
+
+	if (umem->odp_data->dying)
+		return 0;
+
+	WRITE_ONCE(umem->odp_data->dying, 1);
+	atomic_inc(&imr->num_leaf_free);
+	schedule_work(&umem->odp_data->work);
+
+	return 0;
+}
+
+void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
+{
+	struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context;
+
+	down_read(&ctx->umem_rwsem);
+	rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
+				      mr_leaf_free, imr);
+	up_read(&ctx->umem_rwsem);
+
+	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
+}
+
 /*
  * Handle a single data segment in a page-fault WQE or RDMA region.
  *
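
implicit_mr_get_data() is the growth path: a fault that crosses leaf boundaries allocates one MTT leaf per missing 1 GiB slot and publishes all touched KLM slots with a single atomic UMR. A hypothetical walk: a fault at io_virt = 0x7f0040000000 spilling into the next gigabyte touches leaf indexes 0x1fc01 and 0x1fc02 (io_virt >> 30); whichever is absent gets allocated, and both end up wired into the parent's KLM table before the fault is resolved.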
@@ -195,47 +533,43 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
  * -EFAULT when there's an error mapping the requested pages. The caller will
  * abort the page fault handling.
  */
-static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev,
+static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 					 u32 key, u64 io_virt, size_t bcnt,
 					 u32 *bytes_committed,
 					 u32 *bytes_mapped)
 {
 	int srcu_key;
-	unsigned int current_seq;
+	unsigned int current_seq = 0;
 	u64 start_idx;
 	int npages = 0, ret = 0;
 	struct mlx5_ib_mr *mr;
 	u64 access_mask = ODP_READ_ALLOWED_BIT;
+	struct ib_umem_odp *odp;
+	int implicit = 0;
+	size_t size;
 
-	srcu_key = srcu_read_lock(&mib_dev->mr_srcu);
-	mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key);
+	srcu_key = srcu_read_lock(&dev->mr_srcu);
+	mr = mlx5_ib_odp_find_mr_lkey(dev, key);
 	/*
 	 * If we didn't find the MR, it means the MR was closed while we were
 	 * handling the ODP event. In this case we return -EFAULT so that the
 	 * QP will be closed.
 	 */
 	if (!mr || !mr->ibmr.pd) {
-		pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
-		       key);
+		mlx5_ib_dbg(dev, "Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
+			    key);
 		ret = -EFAULT;
 		goto srcu_unlock;
 	}
 	if (!mr->umem->odp_data) {
-		pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
-			 key);
+		mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+			    key);
 		if (bytes_mapped)
 			*bytes_mapped +=
 				(bcnt - *bytes_committed);
 		goto srcu_unlock;
 	}
 
-	current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
-	/*
-	 * Ensure the sequence number is valid for some time before we call
-	 * gup.
-	 */
-	smp_rmb();
-
 	/*
 	 * Avoid branches - this code will perform correctly
 	 * in all iterations (in iteration 2 and above,
@@ -244,63 +578,109 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *mib_dev,
 	io_virt += *bytes_committed;
 	bcnt -= *bytes_committed;
 
+	if (!mr->umem->odp_data->page_list) {
+		odp = implicit_mr_get_data(mr, io_virt, bcnt);
+
+		if (IS_ERR(odp)) {
+			ret = PTR_ERR(odp);
+			goto srcu_unlock;
+		}
+		mr = odp->private;
+		implicit = 1;
+
+	} else {
+		odp = mr->umem->odp_data;
+	}
+
+next_mr:
+	current_seq = READ_ONCE(odp->notifiers_seq);
+	/*
+	 * Ensure the sequence number is valid for some time before we call
+	 * gup.
+	 */
+	smp_rmb();
+
+	size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt);
 	start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT;
 
 	if (mr->umem->writable)
 		access_mask |= ODP_WRITE_ALLOWED_BIT;
-	npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
-					   access_mask, current_seq);
-	if (npages < 0) {
-		ret = npages;
+
+	ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size,
+					access_mask, current_seq);
+
+	if (ret < 0)
 		goto srcu_unlock;
-	}
 
-	if (npages > 0) {
-		mutex_lock(&mr->umem->odp_data->umem_mutex);
+	if (ret > 0) {
+		int np = ret;
+
+		mutex_lock(&odp->umem_mutex);
 		if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
 			/*
 			 * No need to check whether the MTTs really belong to
 			 * this MR, since ib_umem_odp_map_dma_pages already
 			 * checks this.
 			 */
-			ret = mlx5_ib_update_xlt(mr, start_idx, npages,
+			ret = mlx5_ib_update_xlt(mr, start_idx, np,
 						 PAGE_SHIFT,
 						 MLX5_IB_UPD_XLT_ATOMIC);
 		} else {
 			ret = -EAGAIN;
 		}
-		mutex_unlock(&mr->umem->odp_data->umem_mutex);
+		mutex_unlock(&odp->umem_mutex);
 		if (ret < 0) {
 			if (ret != -EAGAIN)
-				pr_err("Failed to update mkey page tables\n");
+				mlx5_ib_err(dev, "Failed to update mkey page tables\n");
 			goto srcu_unlock;
 		}
 
 		if (bytes_mapped) {
-			u32 new_mappings = npages * PAGE_SIZE -
+			u32 new_mappings = np * PAGE_SIZE -
 				(io_virt - round_down(io_virt, PAGE_SIZE));
-			*bytes_mapped += min_t(u32, new_mappings, bcnt);
+			*bytes_mapped += min_t(u32, new_mappings, size);
 		}
+
+		npages += np;
+	}
+
+	bcnt -= size;
+	if (unlikely(bcnt)) {
+		struct ib_umem_odp *next;
+
+		io_virt += size;
+		next = odp_next(odp);
+		if (unlikely(!next || next->umem->address != io_virt)) {
+			mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
+				    io_virt, next);
+			ret = -EAGAIN;
+			goto srcu_unlock_no_wait;
+		}
+		odp = next;
+		mr = odp->private;
+		goto next_mr;
 	}
 
 srcu_unlock:
 	if (ret == -EAGAIN) {
-		if (!mr->umem->odp_data->dying) {
-			struct ib_umem_odp *odp_data = mr->umem->odp_data;
+		if (implicit || !odp->dying) {
 			unsigned long timeout =
 				msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
 
 			if (!wait_for_completion_timeout(
-					&odp_data->notifier_completion,
+					&odp->notifier_completion,
 					timeout)) {
-				pr_warn("timeout waiting for mmu notifier completion\n");
+				mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n",
+					     current_seq, odp->notifiers_seq);
 			}
 		} else {
 			/* The MR is being killed, kill the QP as well. */
 			ret = -EFAULT;
 		}
 	}
-	srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
+
+srcu_unlock_no_wait:
+	srcu_read_unlock(&dev->mr_srcu, srcu_key);
 	*bytes_committed = 0;
 	return ret ? ret : npages;
 }
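
Two retry subtleties in the rework above: current_seq is re-read per leaf under the next_mr label because every leaf umem carries its own notifier sequence, and -EAGAIN now splits into two exits. srcu_unlock waits out the notifier for an ordinary MR or any implicit leaf (the implicit || !odp->dying test), while srcu_unlock_no_wait returns immediately when a leaf vanished mid-walk and the fault should simply be replayed.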
@@ -618,8 +998,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
 		goto resolve_page_fault;
 	} else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
 		if (ret != -ENOENT)
-			mlx5_ib_err(dev, "Error getting user pages for page fault. Error: %d\n",
-				    ret);
+			mlx5_ib_err(dev, "PAGE FAULT error: %d. QP 0x%x. type: 0x%x\n",
+				    ret, pfault->wqe.wq_num, pfault->type);
 		goto resolve_page_fault;
 	}
 
@@ -627,7 +1007,7 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
 resolve_page_fault:
 	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
 	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
-		    pfault->token, resume_with_error,
+		    pfault->wqe.wq_num, resume_with_error,
 		    pfault->type);
 	free_page((unsigned long)buffer);
 }
@@ -700,10 +1080,9 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
 		ret = pagefault_single_data_segment(dev, rkey, address,
 						    prefetch_len,
 						    &bytes_committed, NULL);
-		if (ret < 0) {
+		if (ret < 0 && ret != -EAGAIN) {
 			mlx5_ib_warn(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
-				     ret, pfault->token, address,
-				     prefetch_len);
+				     ret, pfault->token, address, prefetch_len);
 		}
 	}
 }
@@ -728,19 +1107,61 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context,
 	}
 }
 
-int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
+void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
+{
+	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+		return;
+
+	switch (ent->order - 2) {
+	case MLX5_IMR_MTT_CACHE_ENTRY:
+		ent->page = PAGE_SHIFT;
+		ent->xlt = MLX5_IMR_MTT_ENTRIES *
+			   sizeof(struct mlx5_mtt) /
+			   MLX5_IB_UMR_OCTOWORD;
+		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
+		ent->limit = 0;
+		break;
+
+	case MLX5_IMR_KSM_CACHE_ENTRY:
+		ent->page = MLX5_KSM_PAGE_SHIFT;
+		ent->xlt = mlx5_imr_ksm_entries *
+			   sizeof(struct mlx5_klm) /
+			   MLX5_IB_UMR_OCTOWORD;
+		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
+		ent->limit = 0;
+		break;
+	}
+}
+
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
 {
 	int ret;
 
-	ret = init_srcu_struct(&ibdev->mr_srcu);
+	ret = init_srcu_struct(&dev->mr_srcu);
 	if (ret)
 		return ret;
 
+	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
+		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
+		if (ret) {
+			mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
+			return ret;
+		}
+	}
+
 	return 0;
 }
 
-void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *dev)
+{
+	cleanup_srcu_struct(&dev->mr_srcu);
+}
+
+int mlx5_ib_odp_init(void)
 {
-	cleanup_srcu_struct(&ibdev->mr_srcu);
+	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
+				       MLX5_IMR_MTT_BITS);
+
+	return 0;
 }
 
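The sizing arithmetic, spelled out for one common configuration (an illustrative assumption: x86_64, 4 KiB pages, TASK_SIZE = 2^47):

	/*
	 * get_order(TASK_SIZE) = 47 - 12 = 35      (task size in pages, log2)
	 * MLX5_IMR_MTT_BITS    = 30 - 12 = 18      (pages per leaf, log2)
	 * mlx5_imr_ksm_entries = 1ULL << (35 - 18) = 128K KSM slots
	 * 128K slots * 1 GiB per slot = 128 TiB = the whole user address space
	 */

The ent->order - 2 in mlx5_odp_init_mr_cache_entry() undoes the ent->order = i + 2 assignment in mlx5_mr_cache_init(), recovering the cache-entry index.
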
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 2534b8a0fd7b..886ff2b00500 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1053,6 +1053,8 @@ enum {
 
 enum {
 	MAX_UMR_CACHE_ENTRY	= 20,
+	MLX5_IMR_MTT_CACHE_ENTRY,
+	MLX5_IMR_KSM_CACHE_ENTRY,
 	MAX_MR_CACHE_ENTRIES
 };
 
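
Placing the two new entries after MAX_UMR_CACHE_ENTRY is deliberate: mlx5_mr_cache_init() (mr.c hunk above) skips the generic UMR sizing for indexes beyond that bound and defers to mlx5_odp_init_mr_cache_entry(), and both pools start with limit = 0, so they cost nothing on devices without implicit ODP support.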