author    Haggai Eran <haggaie@mellanox.com>       2014-12-11 10:04:23 -0500
committer Roland Dreier <roland@purestorage.com>   2014-12-15 21:19:03 -0500
commit    6aec21f6a8322fa8d43df3ea7f051dfd8967f1b9
tree      de572339272023be68f0d5c2512cd719fb033f40
parent    832a6b06ab5e13c228fc27e333ad360aa03ace6f
IB/mlx5: Page faults handling infrastructure
* Refactor MR registration and cleanup, and fix reg_pages accounting.
* Create a work queue to handle page fault events in a kthread context.
* Register a fault handler to get events from the core for each QP.
  The registered fault handler is empty in this patch, and only a later
  patch implements it.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
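The new infrastructure in odp.c boils down to one dispatch scheme: hardware page-fault events arrive in interrupt context, are copied into a per-QP work item (one per fault context), and are queued on a single-threaded work queue so the handler runs in a kthread that is allowed to sleep. A per-QP flag, checked under a spinlock, gates queuing; disabling faults sets the flag and then flushes the work queue to drain anything already queued. The stand-alone sketch below condenses that scheme with hypothetical names and no error handling; it illustrates the pattern only and is not the driver's actual structures.

/*
 * Illustrative sketch of the page-fault dispatch scheme this patch adds
 * (hypothetical names; the real code lives in drivers/infiniband/hw/mlx5/odp.c).
 */
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

#define NUM_FAULT_CONTEXTS 4		/* responder/requestor x read/write */

struct fault_work {
	struct work_struct work;	/* one in-flight fault per context */
};

struct qp_faults {
	spinlock_t lock;		/* protects 'disabled' */
	int disabled;			/* set while the QP must not fault */
	struct fault_work ctx[NUM_FAULT_CONTEXTS];
};

static struct workqueue_struct *fault_wq;	/* single-threaded, module-wide */

static void fault_work_fn(struct work_struct *work)
{
	/* Runs in kthread context, so the handler is allowed to sleep. */
}

/* Module init: one kthread services all page faults. */
static int fault_wq_init(void)
{
	fault_wq = create_singlethread_workqueue("fault_wq");
	return fault_wq ? 0 : -ENOMEM;
}

/* QP creation: start with faults disabled until the QP leaves RESET. */
static void qp_faults_init(struct qp_faults *qpf)
{
	int i;

	spin_lock_init(&qpf->lock);
	qpf->disabled = 1;
	for (i = 0; i < NUM_FAULT_CONTEXTS; i++)
		INIT_WORK(&qpf->ctx[i].work, fault_work_fn);
}

/* Interrupt context: queue the fault unless the QP is quiesced. */
static void fault_event(struct qp_faults *qpf, int context)
{
	spin_lock(&qpf->lock);
	if (!qpf->disabled)
		queue_work(fault_wq, &qpf->ctx[context].work);
	spin_unlock(&qpf->lock);
}

/* Before reset/destroy: forbid new work, then drain anything queued. */
static void qp_faults_disable(struct qp_faults *qpf)
{
	unsigned long flags;

	spin_lock_irqsave(&qpf->lock, flags);
	qpf->disabled = 1;
	spin_unlock_irqrestore(&qpf->lock, flags);

	flush_workqueue(fault_wq);
}

The real code additionally records which of the four fault contexts was hit and resumes the QP via mlx5_core_page_fault_resume once the (for now empty) handler is done.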
 drivers/infiniband/hw/mlx5/main.c    |  31
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  67
 drivers/infiniband/hw/mlx5/mr.c      |  45
 drivers/infiniband/hw/mlx5/odp.c     | 145
 drivers/infiniband/hw/mlx5/qp.c      |  26
 include/linux/mlx5/driver.h          |   2
 6 files changed, 294 insertions(+), 22 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index e6d775f2446d..a801baa79c8e 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -864,7 +864,7 @@ static ssize_t show_reg_pages(struct device *device,
 	struct mlx5_ib_dev *dev =
 		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
 
-	return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages);
+	return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
 }
 
 static ssize_t show_hca(struct device *device, struct device_attribute *attr,
@@ -1389,16 +1389,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 		goto err_eqs;
 
 	mutex_init(&dev->cap_mask_mutex);
-	spin_lock_init(&dev->mr_lock);
 
 	err = create_dev_resources(&dev->devr);
 	if (err)
 		goto err_eqs;
 
-	err = ib_register_device(&dev->ib_dev, NULL);
+	err = mlx5_ib_odp_init_one(dev);
 	if (err)
 		goto err_rsrc;
 
+	err = ib_register_device(&dev->ib_dev, NULL);
+	if (err)
+		goto err_odp;
+
 	err = create_umr_res(dev);
 	if (err)
 		goto err_dev;
@@ -1420,6 +1423,9 @@ err_umrc:
 err_dev:
 	ib_unregister_device(&dev->ib_dev);
 
+err_odp:
+	mlx5_ib_odp_remove_one(dev);
+
 err_rsrc:
 	destroy_dev_resources(&dev->devr);
 
@@ -1435,8 +1441,10 @@ err_dealloc:
 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
 {
 	struct mlx5_ib_dev *dev = context;
+
 	ib_unregister_device(&dev->ib_dev);
 	destroy_umrc_res(dev);
+	mlx5_ib_odp_remove_one(dev);
 	destroy_dev_resources(&dev->devr);
 	free_comp_eqs(dev);
 	ib_dealloc_device(&dev->ib_dev);
@@ -1450,15 +1458,30 @@ static struct mlx5_interface mlx5_ib_interface = {
 
 static int __init mlx5_ib_init(void)
 {
+	int err;
+
 	if (deprecated_prof_sel != 2)
 		pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
 
-	return mlx5_register_interface(&mlx5_ib_interface);
+	err = mlx5_ib_odp_init();
+	if (err)
+		return err;
+
+	err = mlx5_register_interface(&mlx5_ib_interface);
+	if (err)
+		goto clean_odp;
+
+	return err;
+
+clean_odp:
+	mlx5_ib_odp_cleanup();
+	return err;
 }
 
 static void __exit mlx5_ib_cleanup(void)
 {
 	mlx5_unregister_interface(&mlx5_ib_interface);
+	mlx5_ib_odp_cleanup();
 }
 
 module_init(mlx5_ib_init);
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 6856e27bfb6a..c6ceec3e3d6a 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -149,6 +149,29 @@ enum {
 	MLX5_QP_EMPTY
 };
 
+/*
+ * Connect-IB can trigger up to four concurrent pagefaults
+ * per-QP.
+ */
+enum mlx5_ib_pagefault_context {
+	MLX5_IB_PAGEFAULT_RESPONDER_READ,
+	MLX5_IB_PAGEFAULT_REQUESTOR_READ,
+	MLX5_IB_PAGEFAULT_RESPONDER_WRITE,
+	MLX5_IB_PAGEFAULT_REQUESTOR_WRITE,
+	MLX5_IB_PAGEFAULT_CONTEXTS
+};
+
+static inline enum mlx5_ib_pagefault_context
+	mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault)
+{
+	return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE);
+}
+
+struct mlx5_ib_pfault {
+	struct work_struct	work;
+	struct mlx5_pagefault	mpfault;
+};
+
 struct mlx5_ib_qp {
 	struct ib_qp		ibqp;
 	struct mlx5_core_qp	mqp;
@@ -194,6 +217,21 @@ struct mlx5_ib_qp {
 
 	/* Store signature errors */
 	bool			signature_en;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	/*
+	 * A flag that is true for QP's that are in a state that doesn't
+	 * allow page faults, and shouldn't schedule any more faults.
+	 */
+	int			disable_page_faults;
+	/*
+	 * The disable_page_faults_lock protects a QP's disable_page_faults
+	 * field, allowing for a thread to atomically check whether the QP
+	 * allows page faults, and if so schedule a page fault.
+	 */
+	spinlock_t		disable_page_faults_lock;
+	struct mlx5_ib_pfault	pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
+#endif
 };
 
 struct mlx5_ib_cq_buf {
@@ -392,13 +430,17 @@ struct mlx5_ib_dev {
 	struct umr_common		umrc;
 	/* sync used page count stats
 	 */
-	spinlock_t			mr_lock;
 	struct mlx5_ib_resources	devr;
 	struct mlx5_mr_cache		cache;
 	struct timer_list		delay_timer;
 	int				fill_delay;
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 	struct ib_odp_caps	odp_caps;
+	/*
+	 * Sleepable RCU that prevents destruction of MRs while they are still
+	 * being used by a page fault handler.
+	 */
+	struct srcu_struct	mr_srcu;
 #endif
 };
 
@@ -575,12 +617,33 @@ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 			    struct ib_mr_status *mr_status);
 
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+extern struct workqueue_struct *mlx5_ib_page_fault_wq;
+
 int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev);
-#else
+void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
+			       struct mlx5_ib_pfault *pfault);
+void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp);
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev);
+int __init mlx5_ib_odp_init(void);
+void mlx5_ib_odp_cleanup(void);
+void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
+void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
+
+#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
 {
 	return 0;
 }
+
+static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {}
+static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
+static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {}
+static inline int mlx5_ib_odp_init(void) { return 0; }
+static inline void mlx5_ib_odp_cleanup(void) {}
+static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {}
+static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {}
+
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
 static inline void init_query_mad(struct ib_smp *mad)
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 38b06267798e..922ac85b7198 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -52,6 +52,8 @@ static __be64 mlx5_ib_update_mtt_emergency_buffer[
 static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
 #endif
 
+static int clean_mr(struct mlx5_ib_mr *mr);
+
 static int order2idx(struct mlx5_ib_dev *dev, int order)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
@@ -1049,6 +1051,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 			mlx5_ib_dbg(dev, "cache empty for order %d", order);
 			mr = NULL;
 		}
+	} else if (access_flags & IB_ACCESS_ON_DEMAND) {
+		err = -EINVAL;
+		pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB");
+		goto error;
 	}
 
 	if (!mr)
@@ -1064,9 +1070,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
 	mr->umem = umem;
 	mr->npages = npages;
-	spin_lock(&dev->mr_lock);
-	dev->mdev->priv.reg_pages += npages;
-	spin_unlock(&dev->mr_lock);
+	atomic_add(npages, &dev->mdev->priv.reg_pages);
 	mr->ibmr.lkey = mr->mmr.key;
 	mr->ibmr.rkey = mr->mmr.key;
 
@@ -1110,12 +1114,9 @@ error:
 	return err;
 }
 
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+static int clean_mr(struct mlx5_ib_mr *mr)
 {
-	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
-	struct mlx5_ib_mr *mr = to_mmr(ibmr);
-	struct ib_umem *umem = mr->umem;
-	int npages = mr->npages;
+	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
 	int umred = mr->umred;
 	int err;
 
@@ -1135,16 +1136,32 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
 		free_cached_mr(dev, mr);
 	}
 
+	if (!umred)
+		kfree(mr);
+
+	return 0;
+}
+
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+	int npages = mr->npages;
+	struct ib_umem *umem = mr->umem;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	if (umem)
+		/* Wait for all running page-fault handlers to finish. */
+		synchronize_srcu(&dev->mr_srcu);
+#endif
+
+	clean_mr(mr);
+
 	if (umem) {
 		ib_umem_release(umem);
-		spin_lock(&dev->mr_lock);
-		dev->mdev->priv.reg_pages -= npages;
-		spin_unlock(&dev->mr_lock);
+		atomic_sub(npages, &dev->mdev->priv.reg_pages);
 	}
 
-	if (!umred)
-		kfree(mr);
-
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 66c39ee16aff..63bbdba396f1 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -32,6 +32,8 @@
 
 #include "mlx5_ib.h"
 
+struct workqueue_struct *mlx5_ib_page_fault_wq;
+
 #define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do {	\
 	if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name)	\
 		ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name;	\
@@ -58,3 +60,146 @@ int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
 out:
 	return err;
 }
+
+static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
+						   u32 key)
+{
+	u32 base_key = mlx5_base_mkey(key);
+	struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
+
+	if (!mmr || mmr->key != key)
+		return NULL;
+
+	return container_of(mmr, struct mlx5_ib_mr, mmr);
+}
+
+static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
+				      struct mlx5_ib_pfault *pfault,
+				      int error) {
+	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
+	int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn,
+					      pfault->mpfault.flags,
+					      error);
+	if (ret)
+		pr_err("Failed to resolve the page fault on QP 0x%x\n",
+		       qp->mqp.qpn);
+}
+
+void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
+			       struct mlx5_ib_pfault *pfault)
+{
+	u8 event_subtype = pfault->mpfault.event_subtype;
+
+	switch (event_subtype) {
+	default:
+		pr_warn("Invalid page fault event subtype: 0x%x\n",
+			event_subtype);
+		mlx5_ib_page_fault_resume(qp, pfault, 1);
+		break;
+	}
+}
+
+static void mlx5_ib_qp_pfault_action(struct work_struct *work)
+{
+	struct mlx5_ib_pfault *pfault = container_of(work,
+						     struct mlx5_ib_pfault,
+						     work);
+	enum mlx5_ib_pagefault_context context =
+		mlx5_ib_get_pagefault_context(&pfault->mpfault);
+	struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
+					     pagefaults[context]);
+	mlx5_ib_mr_pfault_handler(qp, pfault);
+}
+
+void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
+	qp->disable_page_faults = 1;
+	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
+
+	/*
+	 * Note that at this point, we are guarenteed that no more
+	 * work queue elements will be posted to the work queue with
+	 * the QP we are closing.
+	 */
+	flush_workqueue(mlx5_ib_page_fault_wq);
+}
+
+void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
+	qp->disable_page_faults = 0;
+	spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
+}
+
+static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
+				   struct mlx5_pagefault *pfault)
+{
+	/*
+	 * Note that we will only get one fault event per QP per context
+	 * (responder/initiator, read/write), until we resolve the page fault
+	 * with the mlx5_ib_page_fault_resume command. Since this function is
+	 * called from within the work element, there is no risk of missing
+	 * events.
+	 */
+	struct mlx5_ib_qp *mibqp = to_mibqp(qp);
+	enum mlx5_ib_pagefault_context context =
+		mlx5_ib_get_pagefault_context(pfault);
+	struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];
+
+	qp_pfault->mpfault = *pfault;
+
+	/* No need to stop interrupts here since we are in an interrupt */
+	spin_lock(&mibqp->disable_page_faults_lock);
+	if (!mibqp->disable_page_faults)
+		queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
+	spin_unlock(&mibqp->disable_page_faults_lock);
+}
+
+void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
+{
+	int i;
+
+	qp->disable_page_faults = 1;
+	spin_lock_init(&qp->disable_page_faults_lock);
+
+	qp->mqp.pfault_handler = mlx5_ib_pfault_handler;
+
+	for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
+		INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
+}
+
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
+{
+	int ret;
+
+	ret = init_srcu_struct(&ibdev->mr_srcu);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
+{
+	cleanup_srcu_struct(&ibdev->mr_srcu);
+}
+
+int __init mlx5_ib_odp_init(void)
+{
+	mlx5_ib_page_fault_wq =
+		create_singlethread_workqueue("mlx5_ib_page_faults");
+	if (!mlx5_ib_page_fault_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_ib_odp_cleanup(void)
+{
+	destroy_workqueue(mlx5_ib_page_fault_wq);
+}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 9783c3342dbf..be0cd358b080 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -876,6 +876,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	int inlen = sizeof(*in);
 	int err;
 
+	mlx5_ib_odp_create_qp(qp);
+
 	gen = &dev->mdev->caps.gen;
 	mutex_init(&qp->mutex);
 	spin_lock_init(&qp->sq.lock);
@@ -1160,11 +1162,13 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
 	in = kzalloc(sizeof(*in), GFP_KERNEL);
 	if (!in)
 		return;
-	if (qp->state != IB_QPS_RESET)
+	if (qp->state != IB_QPS_RESET) {
+		mlx5_ib_qp_disable_pagefaults(qp);
 		if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state),
 					MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp))
 			mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
 				     qp->mqp.qpn);
+	}
 
 	get_cqs(qp, &send_cq, &recv_cq);
 
@@ -1712,6 +1716,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	if (mlx5_st < 0)
 		goto out;
 
+	/* If moving to a reset or error state, we must disable page faults on
+	 * this QP and flush all current page faults. Otherwise a stale page
+	 * fault may attempt to work on this QP after it is reset and moved
+	 * again to RTS, and may cause the driver and the device to get out of
+	 * sync. */
+	if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
+	    (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+		mlx5_ib_qp_disable_pagefaults(qp);
+
 	optpar = ib_mask_to_mlx5_opt(attr_mask);
 	optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
 	in->optparam = cpu_to_be32(optpar);
@@ -1721,6 +1734,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
 	if (err)
 		goto out;
 
+	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+		mlx5_ib_qp_enable_pagefaults(qp);
+
 	qp->state = new_state;
 
 	if (attr_mask & IB_QP_ACCESS_FLAGS)
@@ -3026,6 +3042,14 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr
 	int mlx5_state;
 	int err = 0;
 
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	/*
+	 * Wait for any outstanding page faults, in case the user frees memory
+	 * based upon this query's result.
+	 */
+	flush_workqueue(mlx5_ib_page_fault_wq);
+#endif
+
 	mutex_lock(&qp->mutex);
 	outb = kzalloc(sizeof(*outb), GFP_KERNEL);
 	if (!outb) {
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 7088dcd19214..166d9315fe4b 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -474,7 +474,7 @@ struct mlx5_priv {
 	struct workqueue_struct *pg_wq;
 	struct rb_root		page_root;
 	int			fw_pages;
-	int			reg_pages;
+	atomic_t		reg_pages;
 	struct list_head	free_list;
 
 	struct mlx5_core_health health;
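
The reg_pages change visible in this last hunk (and in mr.c above) swaps a spinlock-protected int for an atomic_t, so the registration and deregistration paths can update the counter and the sysfs reg_pages attribute can read it without the now-removed dev->mr_lock. A minimal sketch of the same pattern, with hypothetical names:

#include <linux/atomic.h>

static atomic_t reg_pages = ATOMIC_INIT(0);

/* Registration and deregistration update the counter without a lock. */
static void account_mr_pages(int npages)
{
	atomic_add(npages, &reg_pages);
}

static void unaccount_mr_pages(int npages)
{
	atomic_sub(npages, &reg_pages);
}

/* The sysfs show() path reads it the same way. */
static int current_reg_pages(void)
{
	return atomic_read(&reg_pages);
}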