 drivers/infiniband/hw/mlx5/main.c    |   4
 drivers/infiniband/hw/mlx5/mlx5_ib.h |   3
 drivers/infiniband/hw/mlx5/mr.c      |  79
 drivers/infiniband/hw/mlx5/odp.c     | 128
 4 files changed, 198 insertions(+), 16 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index a801baa79c8e..8a87404e9c76 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -574,6 +574,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
                 goto out_count;
         }
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+        context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
+#endif
+
         INIT_LIST_HEAD(&context->db_page_list);
         mutex_init(&context->db_page_mutex);
 
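The hunk above only registers the hook: mlx5_ib_invalidate_range() (added in odp.c below) becomes the per-ucontext invalidate_range callback that the ib_umem ODP layer drives from its mmu_notifier when the kernel unmaps part of a registered virtual range. Conceptually the dispatch looks roughly like the sketch below; the helper name odp_for_each_umem_in_range() is made up for illustration and does not match the actual ib_umem_odp code, which performs an interval-tree walk and also accounts notifiers_seq/notifiers_count around the call so that racing page faults know to retry.

/* Illustrative sketch only -- not the real ib_umem_odp implementation. */
static void odp_notifier_invalidate(struct ib_ucontext *ctx,
                                    unsigned long start, unsigned long end)
{
        struct ib_umem *umem;

        odp_for_each_umem_in_range(ctx, start, end, umem) {
                /* Mark an invalidation in progress so page faults back off. */
                umem->odp_data->notifiers_count++;
                umem->odp_data->notifiers_seq++;

                /* Driver hook installed in mlx5_ib_alloc_ucontext() above. */
                ctx->invalidate_range(umem, start, end);
        }
}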
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index c6ceec3e3d6a..83f22fe297c8 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -325,6 +325,7 @@ struct mlx5_ib_mr {
         struct mlx5_ib_dev     *dev;
         struct mlx5_create_mkey_mbox_out out;
         struct mlx5_core_sig_ctx    *sig;
+        int                     live;
 };
 
 struct mlx5_ib_fast_reg_page_list {
@@ -629,6 +630,8 @@ int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
 void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
 void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
+void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+                              unsigned long end);
 
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 922ac85b7198..32a28bd50b20 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -37,6 +37,7 @@
 #include <linux/export.h>
 #include <linux/delay.h>
 #include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
 #include <rdma/ib_verbs.h>
 #include "mlx5_ib.h"
 
@@ -54,6 +55,18 @@ static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
 
 static int clean_mr(struct mlx5_ib_mr *mr);
 
+static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+{
+        int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+        /* Wait until all page fault handlers using the mr complete. */
+        synchronize_srcu(&dev->mr_srcu);
+#endif
+
+        return err;
+}
+
 static int order2idx(struct mlx5_ib_dev *dev, int order)
 {
         struct mlx5_mr_cache *cache = &dev->cache;
@@ -191,7 +204,7 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
                 ent->cur--;
                 ent->size--;
                 spin_unlock_irq(&ent->lock);
-                err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+                err = destroy_mkey(dev, mr);
                 if (err)
                         mlx5_ib_warn(dev, "failed destroy mkey\n");
                 else
@@ -482,7 +495,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
                 ent->cur--;
                 ent->size--;
                 spin_unlock_irq(&ent->lock);
-                err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+                err = destroy_mkey(dev, mr);
                 if (err)
                         mlx5_ib_warn(dev, "failed destroy mkey\n");
                 else
@@ -812,6 +825,8 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
         mr->mmr.size = len;
         mr->mmr.pd = to_mpd(pd)->pdn;
 
+        mr->live = 1;
+
 unmap_dma:
         up(&umrc->sem);
         dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
@@ -997,6 +1012,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
                 goto err_2;
         }
         mr->umem = umem;
+        mr->live = 1;
         kvfree(in);
 
         mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);
@@ -1074,10 +1090,47 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
         mr->ibmr.lkey = mr->mmr.key;
         mr->ibmr.rkey = mr->mmr.key;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+        if (umem->odp_data) {
+                /*
+                 * This barrier prevents the compiler from moving the
+                 * setting of umem->odp_data->private to point to our
+                 * MR, before reg_umr finished, to ensure that the MR
+                 * initialization has finished before we start to
+                 * handle invalidations.
+                 */
+                smp_wmb();
+                mr->umem->odp_data->private = mr;
+                /*
+                 * Make sure we will see the new
+                 * umem->odp_data->private value in the invalidation
+                 * routines, before we can get page faults on the
+                 * MR. Page faults can happen once we put the MR in
+                 * the tree, below this line. Without the barrier,
+                 * there can be a fault handling and an invalidation
+                 * before umem->odp_data->private == mr is visible to
+                 * the invalidation handler.
+                 */
+                smp_wmb();
+        }
+#endif
+
         return &mr->ibmr;
 
 error:
+        /*
+         * Destroy the umem *before* destroying the MR, to ensure we
+         * will not have any in-flight notifiers when destroying the
+         * MR.
+         *
+         * As the MR is completely invalid to begin with, and this
+         * error path is only taken if we can't push the mr entry into
+         * the pagefault tree, this is safe.
+         */
+
         ib_umem_release(umem);
+        /* Kill the MR, and return an error code. */
+        clean_mr(mr);
         return ERR_PTR(err);
 }
 
@@ -1121,7 +1174,7 @@ static int clean_mr(struct mlx5_ib_mr *mr)
         int err;
 
         if (!umred) {
-                err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+                err = destroy_mkey(dev, mr);
                 if (err) {
                         mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
                                      mr->mmr.key, err);
@@ -1150,9 +1203,25 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
         struct ib_umem *umem = mr->umem;
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-        if (umem)
+        if (umem && umem->odp_data) {
+                /* Prevent new page faults from succeeding */
+                mr->live = 0;
                 /* Wait for all running page-fault handlers to finish. */
                 synchronize_srcu(&dev->mr_srcu);
+                /* Destroy all page mappings */
+                mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
+                                         ib_umem_end(umem));
+                /*
+                 * We kill the umem before the MR for ODP,
+                 * so that there will not be any invalidations in
+                 * flight, looking at the *mr struct.
+                 */
+                ib_umem_release(umem);
+                atomic_sub(npages, &dev->mdev->priv.reg_pages);
+
+                /* Avoid double-freeing the umem. */
+                umem = NULL;
+        }
 #endif
 
         clean_mr(mr);
@@ -1269,7 +1338,7 @@ int mlx5_ib_destroy_mr(struct ib_mr *ibmr)
                 kfree(mr->sig);
         }
 
-        err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+        err = destroy_mkey(dev, mr);
         if (err) {
                 mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
                              mr->mmr.key, err);
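Taken together, the mr.c changes implement a small quiesce protocol between MR teardown and the ODP page-fault path: faults run under dev->mr_srcu and only use MRs whose live flag is set, while teardown clears live, waits for in-flight readers with synchronize_srcu(), and only then tears the mappings down. A condensed sketch of that protocol (kernel-style pseudo-code, not the literal driver code; find_live_mr() and lookup_mkey() are hypothetical stand-ins for the real lookup in odp.c):

/* Reader side: page-fault handler, runs inside srcu_read_lock(&dev->mr_srcu). */
static struct mlx5_ib_mr *find_live_mr(struct mlx5_ib_dev *dev, u32 key)
{
        struct mlx5_ib_mr *mr = lookup_mkey(dev, key);  /* hypothetical lookup */

        if (!mr || !mr->live)           /* teardown has already started */
                return NULL;
        return mr;                      /* safe to use until srcu_read_unlock() */
}

/* Writer side: mlx5_ib_dereg_mr() / destroy_mkey() ordering. */
static void teardown_odp_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
        mr->live = 0;                           /* new faults now miss the MR      */
        synchronize_srcu(&dev->mr_srcu);        /* wait out faults already running */
        mlx5_ib_invalidate_range(mr->umem, ib_umem_start(mr->umem),
                                 ib_umem_end(mr->umem)); /* zap HW MTTs, unmap pages */
        ib_umem_release(mr->umem);              /* no notifier can still see *mr   */
}

The same synchronize_srcu() call is what destroy_mkey() adds for the MR-cache paths (remove_keys() and clean_keys()), so a cached mkey is never freed while a page-fault handler may still be dereferencing it.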
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 936a6cd4ecc7..a2c541c4809a 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -37,8 +37,78 @@
 
 #define MAX_PREFETCH_LEN (4*1024*1024U)
 
+/* Timeout in ms to wait for an active mmu notifier to complete when handling
+ * a pagefault. */
+#define MMU_NOTIFIER_TIMEOUT 1000
+
 struct workqueue_struct *mlx5_ib_page_fault_wq;
 
+void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+                              unsigned long end)
+{
+        struct mlx5_ib_mr *mr;
+        const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
+        u64 idx = 0, blk_start_idx = 0;
+        int in_block = 0;
+        u64 addr;
+
+        if (!umem || !umem->odp_data) {
+                pr_err("invalidation called on NULL umem or non-ODP umem\n");
+                return;
+        }
+
+        mr = umem->odp_data->private;
+
+        if (!mr || !mr->ibmr.pd)
+                return;
+
+        start = max_t(u64, ib_umem_start(umem), start);
+        end = min_t(u64, ib_umem_end(umem), end);
+
+        /*
+         * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
+         * while we are doing the invalidation, no page fault will attempt to
+         * overwrite the same MTTs. Concurrent invalidations might race us,
+         * but they will write 0s as well, so no difference in the end result.
+         */
+
+        for (addr = start; addr < end; addr += (u64)umem->page_size) {
+                idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+                /*
+                 * Strive to write the MTTs in chunks, but avoid overwriting
+                 * non-existing MTTs. The heuristic here can be improved to
+                 * estimate the cost of another UMR vs. the cost of a bigger
+                 * UMR.
+                 */
+                if (umem->odp_data->dma_list[idx] &
+                    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+                        if (!in_block) {
+                                blk_start_idx = idx;
+                                in_block = 1;
+                        }
+                } else {
+                        u64 umr_offset = idx & umr_block_mask;
+
+                        if (in_block && umr_offset == 0) {
+                                mlx5_ib_update_mtt(mr, blk_start_idx,
+                                                   idx - blk_start_idx, 1);
+                                in_block = 0;
+                        }
+                }
+        }
+        if (in_block)
+                mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
+                                   1);
+
+        /*
+         * We are now sure that the device will not access the
+         * memory. We can safely unmap it, and mark it as dirty if
+         * needed.
+         */
+
+        ib_umem_odp_unmap_dma_pages(umem, start, end);
+}
+
 #define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do {        \
         if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name)         \
                 ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name;              \
@@ -59,9 +129,18 @@ int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
         if (err)
                 goto out;
 
-        /* At this point we would copy the capability bits that the driver
-         * supports from the hw_caps struct to the caps struct. However, no
-         * such capabilities are supported so far. */
+        caps->general_caps = IB_ODP_SUPPORT;
+        COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps,
+                               SEND);
+        COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                               SEND);
+        COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                               RECV);
+        COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                               WRITE);
+        COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+                               READ);
+
 out:
         return err;
 }
@@ -71,8 +150,9 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
 {
         u32 base_key = mlx5_base_mkey(key);
         struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
+        struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr);
 
-        if (!mmr || mmr->key != key)
+        if (!mmr || mmr->key != key || !mr->live)
                 return NULL;
 
         return container_of(mmr, struct mlx5_ib_mr, mmr);
@@ -143,6 +223,11 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
         }
 
         current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
+        /*
+         * Ensure the sequence number is valid for some time before we call
+         * gup.
+         */
+        smp_rmb();
 
         /*
          * Avoid branches - this code will perform correctly
@@ -165,15 +250,20 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
 
         if (npages > 0) {
                 mutex_lock(&mr->umem->odp_data->umem_mutex);
-                /*
-                 * No need to check whether the MTTs really belong to
-                 * this MR, since ib_umem_odp_map_dma_pages already
-                 * checks this.
-                 */
-                ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+                if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
+                        /*
+                         * No need to check whether the MTTs really belong to
+                         * this MR, since ib_umem_odp_map_dma_pages already
+                         * checks this.
+                         */
+                        ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+                } else {
+                        ret = -EAGAIN;
+                }
                 mutex_unlock(&mr->umem->odp_data->umem_mutex);
                 if (ret < 0) {
-                        pr_err("Failed to update mkey page tables\n");
+                        if (ret != -EAGAIN)
+                                pr_err("Failed to update mkey page tables\n");
                         goto srcu_unlock;
                 }
 
@@ -185,6 +275,22 @@ static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
         }
 
 srcu_unlock:
+        if (ret == -EAGAIN) {
+                if (!mr->umem->odp_data->dying) {
+                        struct ib_umem_odp *odp_data = mr->umem->odp_data;
+                        unsigned long timeout =
+                                msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
+
+                        if (!wait_for_completion_timeout(
+                                        &odp_data->notifier_completion,
+                                        timeout)) {
+                                pr_warn("timeout waiting for mmu notifier completion\n");
+                        }
+                } else {
+                        /* The MR is being killed, kill the QP as well. */
+                        ret = -EFAULT;
+                }
+        }
         srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
         pfault->mpfault.bytes_committed = 0;
         return ret ? ret : npages;
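The pagefault_single_data_segment() changes above follow the usual mmu-notifier sequence-count pattern: sample notifiers_seq before faulting pages in, and only program the HW translation if ib_umem_mmu_notifier_retry() confirms no invalidation ran in between; on a race, return -EAGAIN and wait (bounded by MMU_NOTIFIER_TIMEOUT) for the notifier to finish before the fault is retried. A simplified sketch of that control flow, with odp_fault_pages() standing in for the real ib_umem_odp_map_dma_pages() call and most error handling elided:

static int fault_and_map(struct mlx5_ib_mr *mr, u64 start_idx, size_t bcnt)
{
        struct ib_umem_odp *odp = mr->umem->odp_data;
        unsigned long current_seq;
        int npages, ret = 0;

        current_seq = ACCESS_ONCE(odp->notifiers_seq);
        smp_rmb();              /* read the sequence before touching page state */

        npages = odp_fault_pages(mr->umem, start_idx, bcnt);   /* gup + DMA map */
        if (npages <= 0)
                return npages;

        mutex_lock(&odp->umem_mutex);
        if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq))
                ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0); /* program MTTs */
        else
                ret = -EAGAIN;          /* an invalidation raced us */
        mutex_unlock(&odp->umem_mutex);

        if (ret == -EAGAIN && !odp->dying)
                wait_for_completion_timeout(&odp->notifier_completion,
                                            msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT));
        return ret ? ret : npages;
}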