-rw-r--r--  drivers/infiniband/Kconfig  11
-rw-r--r--  drivers/infiniband/core/Makefile  1
-rw-r--r--  drivers/infiniband/core/umem.c  72
-rw-r--r--  drivers/infiniband/core/umem_odp.c  668
-rw-r--r--  drivers/infiniband/core/umem_rbtree.c  94
-rw-r--r--  drivers/infiniband/core/uverbs.h  1
-rw-r--r--  drivers/infiniband/core/uverbs_cmd.c  171
-rw-r--r--  drivers/infiniband/core/uverbs_main.c  5
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_provider.c  2
-rw-r--r--  drivers/infiniband/hw/cxgb4/cm.c  7
-rw-r--r--  drivers/infiniband/hw/cxgb4/device.c  2
-rw-r--r--  drivers/infiniband/hw/cxgb4/mem.c  28
-rw-r--r--  drivers/infiniband/hw/cxgb4/qp.c  2
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_mrmw.c  2
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_mr.c  2
-rw-r--r--  drivers/infiniband/hw/mlx4/mr.c  1
-rw-r--r--  drivers/infiniband/hw/mlx5/Makefile  1
-rw-r--r--  drivers/infiniband/hw/mlx5/main.c  45
-rw-r--r--  drivers/infiniband/hw/mlx5/mem.c  69
-rw-r--r--  drivers/infiniband/hw/mlx5/mlx5_ib.h  116
-rw-r--r--  drivers/infiniband/hw/mlx5/mr.c  323
-rw-r--r--  drivers/infiniband/hw/mlx5/odp.c  798
-rw-r--r--  drivers/infiniband/hw/mlx5/qp.c  197
-rw-r--r--  drivers/infiniband/hw/nes/nes_verbs.c  6
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma_ah.c  5
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma_verbs.c  4
-rw-r--r--  drivers/infiniband/hw/qib/qib_mr.c  2
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib.h  19
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_cm.c  18
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_ib.c  27
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_main.c  49
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_multicast.c  239
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_verbs.c  22
-rw-r--r--  drivers/infiniband/ulp/iser/iscsi_iser.c  104
-rw-r--r--  drivers/infiniband/ulp/iser/iscsi_iser.h  30
-rw-r--r--  drivers/infiniband/ulp/iser/iser_initiator.c  6
-rw-r--r--  drivers/infiniband/ulp/iser/iser_memory.c  102
-rw-r--r--  drivers/infiniband/ulp/iser/iser_verbs.c  91
-rw-r--r--  drivers/infiniband/ulp/srp/ib_srp.c  2
-rw-r--r--  drivers/net/ethernet/mellanox/mlx4/main.c  6
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/eq.c  13
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/fw.c  40
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/qp.c  119
-rw-r--r--  include/linux/mlx5/device.h  72
-rw-r--r--  include/linux/mlx5/driver.h  14
-rw-r--r--  include/linux/mlx5/qp.h  65
-rw-r--r--  include/rdma/ib_umem.h  34
-rw-r--r--  include/rdma/ib_umem_odp.h  160
-rw-r--r--  include/rdma/ib_verbs.h  54
-rw-r--r--  include/uapi/rdma/ib_user_verbs.h  29
50 files changed, 3499 insertions, 451 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 77089399359b..b899531498eb 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -38,6 +38,17 @@ config INFINIBAND_USER_MEM
38 depends on INFINIBAND_USER_ACCESS != n 38 depends on INFINIBAND_USER_ACCESS != n
39 default y 39 default y
40 40
41config INFINIBAND_ON_DEMAND_PAGING
42 bool "InfiniBand on-demand paging support"
43 depends on INFINIBAND_USER_MEM
44 select MMU_NOTIFIER
45 default y
46 ---help---
47 On demand paging support for the InfiniBand subsystem.
48 Together with driver support this allows registration of
49 memory regions without pinning their pages, fetching the
50 pages on demand instead.
51
41config INFINIBAND_ADDR_TRANS 52config INFINIBAND_ADDR_TRANS
42 bool 53 bool
43 depends on INFINIBAND 54 depends on INFINIBAND
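
As a concrete illustration of what this option enables, here is a minimal userspace sketch of registering an on-demand-paging memory region. It assumes a libibverbs that exposes IBV_ACCESS_ON_DEMAND as the userspace counterpart of the IB_ACCESS_ON_DEMAND flag added later in this series; the device selection, buffer size and error handling are illustrative only and are not part of the patch.

/* Sketch: register an MR whose pages are not pinned at registration time. */
#include <stdio.h>
#include <stdlib.h>
#include <infiniband/verbs.h>

int main(void)
{
        struct ibv_device **dev_list = ibv_get_device_list(NULL);
        struct ibv_context *ctx;
        struct ibv_pd *pd;
        struct ibv_mr *mr;
        size_t len = 1UL << 20;         /* 1 MB, never touched before use */
        void *buf;

        if (!dev_list || !dev_list[0])
                return 1;
        ctx = ibv_open_device(dev_list[0]);
        pd = ctx ? ibv_alloc_pd(ctx) : NULL;
        buf = malloc(len);
        if (!pd || !buf)
                return 1;

        /* With ODP the kernel does not pin these pages; the HCA driver
         * faults them in on first access instead. */
        mr = ibv_reg_mr(pd, buf, len,
                        IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
        if (!mr) {
                perror("ibv_reg_mr(ON_DEMAND)");
                return 1;
        }

        ibv_dereg_mr(mr);
        ibv_dealloc_pd(pd);
        ibv_close_device(ctx);
        free(buf);
        ibv_free_device_list(dev_list);
        return 0;
}
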
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index ffd0af6734af..acf736764445 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
11ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ 11ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
12 device.o fmr_pool.o cache.o netlink.o 12 device.o fmr_pool.o cache.o netlink.o
13ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o 13ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
14ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
14 15
15ib_mad-y := mad.o smi.o agent.o mad_rmpp.o 16ib_mad-y := mad.o smi.o agent.o mad_rmpp.o
16 17
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index df0c4f605a21..aec7a6aa2951 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -39,6 +39,7 @@
39#include <linux/hugetlb.h> 39#include <linux/hugetlb.h>
40#include <linux/dma-attrs.h> 40#include <linux/dma-attrs.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <rdma/ib_umem_odp.h>
42 43
43#include "uverbs.h" 44#include "uverbs.h"
44 45
@@ -69,6 +70,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
69 70
70/** 71/**
71 * ib_umem_get - Pin and DMA map userspace memory. 72 * ib_umem_get - Pin and DMA map userspace memory.
73 *
 74 * If access flags indicate ODP memory, avoid pinning. Instead, store
 75 * the mm for future page fault handling in conjunction with MMU notifiers.
76 *
72 * @context: userspace context to pin memory for 77 * @context: userspace context to pin memory for
73 * @addr: userspace virtual address to start at 78 * @addr: userspace virtual address to start at
74 * @size: length of region to pin 79 * @size: length of region to pin
@@ -103,17 +108,30 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
103 108
104 umem->context = context; 109 umem->context = context;
105 umem->length = size; 110 umem->length = size;
106 umem->offset = addr & ~PAGE_MASK; 111 umem->address = addr;
107 umem->page_size = PAGE_SIZE; 112 umem->page_size = PAGE_SIZE;
108 umem->pid = get_task_pid(current, PIDTYPE_PID); 113 umem->pid = get_task_pid(current, PIDTYPE_PID);
109 /* 114 /*
110 * We ask for writable memory if any access flags other than 115 * We ask for writable memory if any of the following
111 * "remote read" are set. "Local write" and "remote write" 116 * access flags are set. "Local write" and "remote write"
112 * obviously require write access. "Remote atomic" can do 117 * obviously require write access. "Remote atomic" can do
113 * things like fetch and add, which will modify memory, and 118 * things like fetch and add, which will modify memory, and
114 * "MW bind" can change permissions by binding a window. 119 * "MW bind" can change permissions by binding a window.
115 */ 120 */
116 umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); 121 umem->writable = !!(access &
122 (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
123 IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
124
125 if (access & IB_ACCESS_ON_DEMAND) {
126 ret = ib_umem_odp_get(context, umem);
127 if (ret) {
128 kfree(umem);
129 return ERR_PTR(ret);
130 }
131 return umem;
132 }
133
134 umem->odp_data = NULL;
117 135
118 /* We assume the memory is from hugetlb until proved otherwise */ 136 /* We assume the memory is from hugetlb until proved otherwise */
119 umem->hugetlb = 1; 137 umem->hugetlb = 1;
@@ -132,7 +150,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
132 if (!vma_list) 150 if (!vma_list)
133 umem->hugetlb = 0; 151 umem->hugetlb = 0;
134 152
135 npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT; 153 npages = ib_umem_num_pages(umem);
136 154
137 down_write(&current->mm->mmap_sem); 155 down_write(&current->mm->mmap_sem);
138 156
@@ -235,6 +253,11 @@ void ib_umem_release(struct ib_umem *umem)
235 struct task_struct *task; 253 struct task_struct *task;
236 unsigned long diff; 254 unsigned long diff;
237 255
256 if (umem->odp_data) {
257 ib_umem_odp_release(umem);
258 return;
259 }
260
238 __ib_umem_release(umem->context->device, umem, 1); 261 __ib_umem_release(umem->context->device, umem, 1);
239 262
240 task = get_pid_task(umem->pid, PIDTYPE_PID); 263 task = get_pid_task(umem->pid, PIDTYPE_PID);
@@ -246,7 +269,7 @@ void ib_umem_release(struct ib_umem *umem)
246 if (!mm) 269 if (!mm)
247 goto out; 270 goto out;
248 271
249 diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT; 272 diff = ib_umem_num_pages(umem);
250 273
251 /* 274 /*
252 * We may be called with the mm's mmap_sem already held. This 275 * We may be called with the mm's mmap_sem already held. This
@@ -283,6 +306,9 @@ int ib_umem_page_count(struct ib_umem *umem)
283 int n; 306 int n;
284 struct scatterlist *sg; 307 struct scatterlist *sg;
285 308
309 if (umem->odp_data)
310 return ib_umem_num_pages(umem);
311
286 shift = ilog2(umem->page_size); 312 shift = ilog2(umem->page_size);
287 313
288 n = 0; 314 n = 0;
@@ -292,3 +318,37 @@ int ib_umem_page_count(struct ib_umem *umem)
292 return n; 318 return n;
293} 319}
294EXPORT_SYMBOL(ib_umem_page_count); 320EXPORT_SYMBOL(ib_umem_page_count);
321
322/*
323 * Copy from the given ib_umem's pages to the given buffer.
324 *
325 * umem - the umem to copy from
326 * offset - offset to start copying from
327 * dst - destination buffer
328 * length - buffer length
329 *
330 * Returns 0 on success, or an error code.
331 */
332int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
333 size_t length)
334{
335 size_t end = offset + length;
336 int ret;
337
338 if (offset > umem->length || length > umem->length - offset) {
339 pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
340 offset, umem->length, end);
341 return -EINVAL;
342 }
343
344 ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length,
345 offset + ib_umem_offset(umem));
346
347 if (ret < 0)
348 return ret;
349 else if (ret != length)
350 return -EINVAL;
351 else
352 return 0;
353}
354EXPORT_SYMBOL(ib_umem_copy_from);
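
As a usage sketch for the new ib_umem_copy_from() helper, here is a hypothetical driver-side function (not part of this patch) that copies a header userspace placed at the start of an MR into a kernel buffer; the function name and the notion of a header are invented for illustration.

#include <linux/err.h>
#include <linux/slab.h>
#include <rdma/ib_umem.h>

/* Sketch: read the first hdr_len bytes of a user MR into kernel memory. */
static void *example_read_mr_header(struct ib_umem *umem, size_t hdr_len)
{
        void *hdr = kmalloc(hdr_len, GFP_KERNEL);
        int ret;

        if (!hdr)
                return ERR_PTR(-ENOMEM);

        /* Offset 0 is relative to the start of the umem; the helper adds
         * ib_umem_offset() itself and rejects out-of-range requests. */
        ret = ib_umem_copy_from(hdr, umem, 0, hdr_len);
        if (ret) {
                kfree(hdr);
                return ERR_PTR(ret);
        }
        return hdr;
}
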
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
new file mode 100644
index 000000000000..6095872549e7
--- /dev/null
+++ b/drivers/infiniband/core/umem_odp.c
@@ -0,0 +1,668 @@
1/*
2 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33#include <linux/types.h>
34#include <linux/sched.h>
35#include <linux/pid.h>
36#include <linux/slab.h>
37#include <linux/export.h>
38#include <linux/vmalloc.h>
39
40#include <rdma/ib_verbs.h>
41#include <rdma/ib_umem.h>
42#include <rdma/ib_umem_odp.h>
43
44static void ib_umem_notifier_start_account(struct ib_umem *item)
45{
46 mutex_lock(&item->odp_data->umem_mutex);
47
48 /* Only update private counters for this umem if it has them.
49 * Otherwise skip it. All page faults will be delayed for this umem. */
50 if (item->odp_data->mn_counters_active) {
51 int notifiers_count = item->odp_data->notifiers_count++;
52
53 if (notifiers_count == 0)
54 /* Initialize the completion object for waiting on
55 * notifiers. Since notifier_count is zero, no one
56 * should be waiting right now. */
57 reinit_completion(&item->odp_data->notifier_completion);
58 }
59 mutex_unlock(&item->odp_data->umem_mutex);
60}
61
62static void ib_umem_notifier_end_account(struct ib_umem *item)
63{
64 mutex_lock(&item->odp_data->umem_mutex);
65
66 /* Only update private counters for this umem if it has them.
67 * Otherwise skip it. All page faults will be delayed for this umem. */
68 if (item->odp_data->mn_counters_active) {
69 /*
 70 * This sequence increase will notify the QP page fault handler
 71 * that the page that is going to be mapped in the spte could have
72 * been freed.
73 */
74 ++item->odp_data->notifiers_seq;
75 if (--item->odp_data->notifiers_count == 0)
76 complete_all(&item->odp_data->notifier_completion);
77 }
78 mutex_unlock(&item->odp_data->umem_mutex);
79}
80
81/* Account for a new mmu notifier in an ib_ucontext. */
82static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
83{
84 atomic_inc(&context->notifier_count);
85}
86
87/* Account for a terminating mmu notifier in an ib_ucontext.
88 *
89 * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
90 * the function takes the semaphore itself. */
91static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
92{
93 int zero_notifiers = atomic_dec_and_test(&context->notifier_count);
94
95 if (zero_notifiers &&
96 !list_empty(&context->no_private_counters)) {
97 /* No currently running mmu notifiers. Now is the chance to
98 * add private accounting to all previously added umems. */
99 struct ib_umem_odp *odp_data, *next;
100
101 /* Prevent concurrent mmu notifiers from working on the
102 * no_private_counters list. */
103 down_write(&context->umem_rwsem);
104
105 /* Read the notifier_count again, with the umem_rwsem
106 * semaphore taken for write. */
107 if (!atomic_read(&context->notifier_count)) {
108 list_for_each_entry_safe(odp_data, next,
109 &context->no_private_counters,
110 no_private_counters) {
111 mutex_lock(&odp_data->umem_mutex);
112 odp_data->mn_counters_active = true;
113 list_del(&odp_data->no_private_counters);
114 complete_all(&odp_data->notifier_completion);
115 mutex_unlock(&odp_data->umem_mutex);
116 }
117 }
118
119 up_write(&context->umem_rwsem);
120 }
121}
122
123static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
124 u64 end, void *cookie) {
125 /*
126 * Increase the number of notifiers running, to
127 * prevent any further fault handling on this MR.
128 */
129 ib_umem_notifier_start_account(item);
130 item->odp_data->dying = 1;
131 /* Make sure that the fact the umem is dying is out before we release
132 * all pending page faults. */
133 smp_wmb();
134 complete_all(&item->odp_data->notifier_completion);
135 item->context->invalidate_range(item, ib_umem_start(item),
136 ib_umem_end(item));
137 return 0;
138}
139
140static void ib_umem_notifier_release(struct mmu_notifier *mn,
141 struct mm_struct *mm)
142{
143 struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
144
145 if (!context->invalidate_range)
146 return;
147
148 ib_ucontext_notifier_start_account(context);
149 down_read(&context->umem_rwsem);
150 rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
151 ULLONG_MAX,
152 ib_umem_notifier_release_trampoline,
153 NULL);
154 up_read(&context->umem_rwsem);
155}
156
157static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
158 u64 end, void *cookie)
159{
160 ib_umem_notifier_start_account(item);
161 item->context->invalidate_range(item, start, start + PAGE_SIZE);
162 ib_umem_notifier_end_account(item);
163 return 0;
164}
165
166static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
167 struct mm_struct *mm,
168 unsigned long address)
169{
170 struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
171
172 if (!context->invalidate_range)
173 return;
174
175 ib_ucontext_notifier_start_account(context);
176 down_read(&context->umem_rwsem);
177 rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
178 address + PAGE_SIZE,
179 invalidate_page_trampoline, NULL);
180 up_read(&context->umem_rwsem);
181 ib_ucontext_notifier_end_account(context);
182}
183
184static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
185 u64 end, void *cookie)
186{
187 ib_umem_notifier_start_account(item);
188 item->context->invalidate_range(item, start, end);
189 return 0;
190}
191
192static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
193 struct mm_struct *mm,
194 unsigned long start,
195 unsigned long end)
196{
197 struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
198
199 if (!context->invalidate_range)
200 return;
201
202 ib_ucontext_notifier_start_account(context);
203 down_read(&context->umem_rwsem);
204 rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
205 end,
206 invalidate_range_start_trampoline, NULL);
207 up_read(&context->umem_rwsem);
208}
209
210static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
211 u64 end, void *cookie)
212{
213 ib_umem_notifier_end_account(item);
214 return 0;
215}
216
217static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
218 struct mm_struct *mm,
219 unsigned long start,
220 unsigned long end)
221{
222 struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
223
224 if (!context->invalidate_range)
225 return;
226
227 down_read(&context->umem_rwsem);
228 rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
229 end,
230 invalidate_range_end_trampoline, NULL);
231 up_read(&context->umem_rwsem);
232 ib_ucontext_notifier_end_account(context);
233}
234
235static struct mmu_notifier_ops ib_umem_notifiers = {
236 .release = ib_umem_notifier_release,
237 .invalidate_page = ib_umem_notifier_invalidate_page,
238 .invalidate_range_start = ib_umem_notifier_invalidate_range_start,
239 .invalidate_range_end = ib_umem_notifier_invalidate_range_end,
240};
241
242int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
243{
244 int ret_val;
245 struct pid *our_pid;
246 struct mm_struct *mm = get_task_mm(current);
247
248 if (!mm)
249 return -EINVAL;
250
251 /* Prevent creating ODP MRs in child processes */
252 rcu_read_lock();
253 our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
254 rcu_read_unlock();
255 put_pid(our_pid);
256 if (context->tgid != our_pid) {
257 ret_val = -EINVAL;
258 goto out_mm;
259 }
260
261 umem->hugetlb = 0;
262 umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
263 if (!umem->odp_data) {
264 ret_val = -ENOMEM;
265 goto out_mm;
266 }
267 umem->odp_data->umem = umem;
268
269 mutex_init(&umem->odp_data->umem_mutex);
270
271 init_completion(&umem->odp_data->notifier_completion);
272
273 umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
274 sizeof(*umem->odp_data->page_list));
275 if (!umem->odp_data->page_list) {
276 ret_val = -ENOMEM;
277 goto out_odp_data;
278 }
279
280 umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) *
281 sizeof(*umem->odp_data->dma_list));
282 if (!umem->odp_data->dma_list) {
283 ret_val = -ENOMEM;
284 goto out_page_list;
285 }
286
287 /*
288 * When using MMU notifiers, we will get a
289 * notification before the "current" task (and MM) is
290 * destroyed. We use the umem_rwsem semaphore to synchronize.
291 */
292 down_write(&context->umem_rwsem);
293 context->odp_mrs_count++;
294 if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
295 rbt_ib_umem_insert(&umem->odp_data->interval_tree,
296 &context->umem_tree);
297 if (likely(!atomic_read(&context->notifier_count)))
298 umem->odp_data->mn_counters_active = true;
299 else
300 list_add(&umem->odp_data->no_private_counters,
301 &context->no_private_counters);
302 downgrade_write(&context->umem_rwsem);
303
304 if (context->odp_mrs_count == 1) {
305 /*
306 * Note that at this point, no MMU notifier is running
307 * for this context!
308 */
309 atomic_set(&context->notifier_count, 0);
310 INIT_HLIST_NODE(&context->mn.hlist);
311 context->mn.ops = &ib_umem_notifiers;
312 /*
313 * Lock-dep detects a false positive for mmap_sem vs.
314 * umem_rwsem, due to not grasping downgrade_write correctly.
315 */
316 lockdep_off();
317 ret_val = mmu_notifier_register(&context->mn, mm);
318 lockdep_on();
319 if (ret_val) {
320 pr_err("Failed to register mmu_notifier %d\n", ret_val);
321 ret_val = -EBUSY;
322 goto out_mutex;
323 }
324 }
325
326 up_read(&context->umem_rwsem);
327
328 /*
329 * Note that doing an mmput can cause a notifier for the relevant mm.
330 * If the notifier is called while we hold the umem_rwsem, this will
331 * cause a deadlock. Therefore, we release the reference only after we
332 * released the semaphore.
333 */
334 mmput(mm);
335 return 0;
336
337out_mutex:
338 up_read(&context->umem_rwsem);
339 vfree(umem->odp_data->dma_list);
340out_page_list:
341 vfree(umem->odp_data->page_list);
342out_odp_data:
343 kfree(umem->odp_data);
344out_mm:
345 mmput(mm);
346 return ret_val;
347}
348
349void ib_umem_odp_release(struct ib_umem *umem)
350{
351 struct ib_ucontext *context = umem->context;
352
353 /*
354 * Ensure that no more pages are mapped in the umem.
355 *
356 * It is the driver's responsibility to ensure, before calling us,
357 * that the hardware will not attempt to access the MR any more.
358 */
359 ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
360 ib_umem_end(umem));
361
362 down_write(&context->umem_rwsem);
363 if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
364 rbt_ib_umem_remove(&umem->odp_data->interval_tree,
365 &context->umem_tree);
366 context->odp_mrs_count--;
367 if (!umem->odp_data->mn_counters_active) {
368 list_del(&umem->odp_data->no_private_counters);
369 complete_all(&umem->odp_data->notifier_completion);
370 }
371
372 /*
373 * Downgrade the lock to a read lock. This ensures that the notifiers
374 * (who lock the mutex for reading) will be able to finish, and we
 375 * will be able to eventually obtain the mmu notifiers SRCU. Note
376 * that since we are doing it atomically, no other user could register
377 * and unregister while we do the check.
378 */
379 downgrade_write(&context->umem_rwsem);
380 if (!context->odp_mrs_count) {
381 struct task_struct *owning_process = NULL;
382 struct mm_struct *owning_mm = NULL;
383
384 owning_process = get_pid_task(context->tgid,
385 PIDTYPE_PID);
386 if (owning_process == NULL)
387 /*
 388 * The process is already dead, the notifiers were
 389 * already removed.
390 */
391 goto out;
392
393 owning_mm = get_task_mm(owning_process);
394 if (owning_mm == NULL)
395 /*
 396 * The process' mm is already dead, the notifiers
 397 * were already removed.
398 */
399 goto out_put_task;
400 mmu_notifier_unregister(&context->mn, owning_mm);
401
402 mmput(owning_mm);
403
404out_put_task:
405 put_task_struct(owning_process);
406 }
407out:
408 up_read(&context->umem_rwsem);
409
410 vfree(umem->odp_data->dma_list);
411 vfree(umem->odp_data->page_list);
412 kfree(umem->odp_data);
413 kfree(umem);
414}
415
416/*
417 * Map for DMA and insert a single page into the on-demand paging page tables.
418 *
419 * @umem: the umem to insert the page to.
420 * @page_index: index in the umem to add the page to.
421 * @page: the page struct to map and add.
422 * @access_mask: access permissions needed for this page.
423 * @current_seq: sequence number for synchronization with invalidations.
 424 * The sequence number is taken from
425 * umem->odp_data->notifiers_seq.
426 *
427 * The function returns -EFAULT if the DMA mapping operation fails. It returns
428 * -EAGAIN if a concurrent invalidation prevents us from updating the page.
429 *
430 * The page is released via put_page even if the operation failed. For
431 * on-demand pinning, the page is released whenever it isn't stored in the
432 * umem.
433 */
434static int ib_umem_odp_map_dma_single_page(
435 struct ib_umem *umem,
436 int page_index,
437 u64 base_virt_addr,
438 struct page *page,
439 u64 access_mask,
440 unsigned long current_seq)
441{
442 struct ib_device *dev = umem->context->device;
443 dma_addr_t dma_addr;
444 int stored_page = 0;
445 int remove_existing_mapping = 0;
446 int ret = 0;
447
448 mutex_lock(&umem->odp_data->umem_mutex);
449 /*
450 * Note: we avoid writing if seq is different from the initial seq, to
451 * handle case of a racing notifier. This check also allows us to bail
452 * early if we have a notifier running in parallel with us.
453 */
454 if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
455 ret = -EAGAIN;
456 goto out;
457 }
458 if (!(umem->odp_data->dma_list[page_index])) {
459 dma_addr = ib_dma_map_page(dev,
460 page,
461 0, PAGE_SIZE,
462 DMA_BIDIRECTIONAL);
463 if (ib_dma_mapping_error(dev, dma_addr)) {
464 ret = -EFAULT;
465 goto out;
466 }
467 umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
468 umem->odp_data->page_list[page_index] = page;
469 stored_page = 1;
470 } else if (umem->odp_data->page_list[page_index] == page) {
471 umem->odp_data->dma_list[page_index] |= access_mask;
472 } else {
473 pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
474 umem->odp_data->page_list[page_index], page);
475 /* Better remove the mapping now, to prevent any further
476 * damage. */
477 remove_existing_mapping = 1;
478 }
479
480out:
481 mutex_unlock(&umem->odp_data->umem_mutex);
482
483 /* On Demand Paging - avoid pinning the page */
484 if (umem->context->invalidate_range || !stored_page)
485 put_page(page);
486
487 if (remove_existing_mapping && umem->context->invalidate_range) {
488 invalidate_page_trampoline(
489 umem,
490 base_virt_addr + (page_index * PAGE_SIZE),
491 base_virt_addr + ((page_index+1)*PAGE_SIZE),
492 NULL);
493 ret = -EAGAIN;
494 }
495
496 return ret;
497}
498
499/**
500 * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
501 *
502 * Pins the range of pages passed in the argument, and maps them to
 503 * DMA addresses. The DMA addresses of the mapped pages are updated in
504 * umem->odp_data->dma_list.
505 *
 506 * Returns the number of pages mapped on success, or a negative error
 507 * code on failure.
508 * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
509 * the function from completing its task.
510 *
511 * @umem: the umem to map and pin
512 * @user_virt: the address from which we need to map.
513 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
514 * bigger due to alignment, and may also be smaller in case of an error
 515 * pinning or mapping a page. The actual number of pages mapped is
 516 * returned in the return value.
517 * @access_mask: bit mask of the requested access permissions for the given
518 * range.
 519 * @current_seq: the MMU notifiers sequence value for synchronization with
 520 * invalidations. The sequence number is read from
 521 * umem->odp_data->notifiers_seq before calling this function.
522 */
523int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
524 u64 access_mask, unsigned long current_seq)
525{
526 struct task_struct *owning_process = NULL;
527 struct mm_struct *owning_mm = NULL;
528 struct page **local_page_list = NULL;
529 u64 off;
530 int j, k, ret = 0, start_idx, npages = 0;
531 u64 base_virt_addr;
532
533 if (access_mask == 0)
534 return -EINVAL;
535
536 if (user_virt < ib_umem_start(umem) ||
537 user_virt + bcnt > ib_umem_end(umem))
538 return -EFAULT;
539
540 local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
541 if (!local_page_list)
542 return -ENOMEM;
543
544 off = user_virt & (~PAGE_MASK);
545 user_virt = user_virt & PAGE_MASK;
546 base_virt_addr = user_virt;
547 bcnt += off; /* Charge for the first page offset as well. */
548
549 owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
550 if (owning_process == NULL) {
551 ret = -EINVAL;
552 goto out_no_task;
553 }
554
555 owning_mm = get_task_mm(owning_process);
556 if (owning_mm == NULL) {
557 ret = -EINVAL;
558 goto out_put_task;
559 }
560
561 start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
562 k = start_idx;
563
564 while (bcnt > 0) {
565 const size_t gup_num_pages =
566 min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
567 PAGE_SIZE / sizeof(struct page *));
568
569 down_read(&owning_mm->mmap_sem);
570 /*
 571 * Note: this might result in redundant page getting. We can
 572 * avoid this by checking dma_list to be 0 before calling
 573 * get_user_pages. However, this makes the code much more
574 * complex (and doesn't gain us much performance in most use
575 * cases).
576 */
577 npages = get_user_pages(owning_process, owning_mm, user_virt,
578 gup_num_pages,
579 access_mask & ODP_WRITE_ALLOWED_BIT, 0,
580 local_page_list, NULL);
581 up_read(&owning_mm->mmap_sem);
582
583 if (npages < 0)
584 break;
585
586 bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
587 user_virt += npages << PAGE_SHIFT;
588 for (j = 0; j < npages; ++j) {
589 ret = ib_umem_odp_map_dma_single_page(
590 umem, k, base_virt_addr, local_page_list[j],
591 access_mask, current_seq);
592 if (ret < 0)
593 break;
594 k++;
595 }
596
597 if (ret < 0) {
598 /* Release left over pages when handling errors. */
599 for (++j; j < npages; ++j)
600 put_page(local_page_list[j]);
601 break;
602 }
603 }
604
605 if (ret >= 0) {
606 if (npages < 0 && k == start_idx)
607 ret = npages;
608 else
609 ret = k - start_idx;
610 }
611
612 mmput(owning_mm);
613out_put_task:
614 put_task_struct(owning_process);
615out_no_task:
616 free_page((unsigned long)local_page_list);
617 return ret;
618}
619EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
620
621void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
622 u64 bound)
623{
624 int idx;
625 u64 addr;
626 struct ib_device *dev = umem->context->device;
627
628 virt = max_t(u64, virt, ib_umem_start(umem));
629 bound = min_t(u64, bound, ib_umem_end(umem));
630 /* Note that during the run of this function, the
631 * notifiers_count of the MR is > 0, preventing any racing
 632 * faults from completing. We might be racing with other
633 * invalidations, so we must make sure we free each page only
634 * once. */
635 for (addr = virt; addr < bound; addr += (u64)umem->page_size) {
636 idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
637 mutex_lock(&umem->odp_data->umem_mutex);
638 if (umem->odp_data->page_list[idx]) {
639 struct page *page = umem->odp_data->page_list[idx];
640 struct page *head_page = compound_head(page);
641 dma_addr_t dma = umem->odp_data->dma_list[idx];
642 dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;
643
644 WARN_ON(!dma_addr);
645
646 ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
647 DMA_BIDIRECTIONAL);
648 if (dma & ODP_WRITE_ALLOWED_BIT)
649 /*
650 * set_page_dirty prefers being called with
651 * the page lock. However, MMU notifiers are
652 * called sometimes with and sometimes without
653 * the lock. We rely on the umem_mutex instead
654 * to prevent other mmu notifiers from
655 * continuing and allowing the page mapping to
656 * be removed.
657 */
658 set_page_dirty(head_page);
659 /* on demand pinning support */
660 if (!umem->context->invalidate_range)
661 put_page(page);
662 umem->odp_data->page_list[idx] = NULL;
663 umem->odp_data->dma_list[idx] = 0;
664 }
665 mutex_unlock(&umem->odp_data->umem_mutex);
666 }
667}
668EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
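
To show how the pieces above fit together, here is a hypothetical sketch of a hardware driver's page-fault path: sample the notifier sequence counter, attempt the mapping, and retry when a concurrent invalidation makes ib_umem_odp_map_dma_pages() return -EAGAIN. Only the helper and the odp_data fields come from this patch; fault_one_range() and the bare retry loop are illustrative (a real driver would typically also wait on notifier_completion rather than spin).

#include <linux/compiler.h>
#include <rdma/ib_umem_odp.h>

static int fault_one_range(struct ib_umem *umem, u64 user_virt, u64 bcnt,
                           u64 access_mask)
{
        unsigned long current_seq;
        int npages;

        do {
                /* Sample the sequence counter before mapping so that a
                 * racing invalidation is detected by the retry check
                 * inside ib_umem_odp_map_dma_single_page(). */
                current_seq = ACCESS_ONCE(umem->odp_data->notifiers_seq);
                smp_rmb();

                npages = ib_umem_odp_map_dma_pages(umem, user_virt, bcnt,
                                                   access_mask, current_seq);
        } while (npages == -EAGAIN);

        return npages;  /* pages mapped, or a negative errno */
}
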
diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c
new file mode 100644
index 000000000000..727d788448f5
--- /dev/null
+++ b/drivers/infiniband/core/umem_rbtree.c
@@ -0,0 +1,94 @@
1/*
2 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33#include <linux/kernel.h>
34#include <linux/module.h>
35#include <linux/interval_tree_generic.h>
36#include <linux/sched.h>
37#include <linux/gfp.h>
38#include <rdma/ib_umem_odp.h>
39
40/*
41 * The ib_umem list keeps track of memory regions for which the HW
 42 * device requested to receive notifications when the related memory
43 * mapping is changed.
44 *
45 * ib_umem_lock protects the list.
46 */
47
48static inline u64 node_start(struct umem_odp_node *n)
49{
50 struct ib_umem_odp *umem_odp =
51 container_of(n, struct ib_umem_odp, interval_tree);
52
53 return ib_umem_start(umem_odp->umem);
54}
55
56/* Note that the representation of the intervals in the interval tree
57 * considers the ending point as contained in the interval, while the
58 * function ib_umem_end returns the first address which is not contained
59 * in the umem.
60 */
61static inline u64 node_last(struct umem_odp_node *n)
62{
63 struct ib_umem_odp *umem_odp =
64 container_of(n, struct ib_umem_odp, interval_tree);
65
66 return ib_umem_end(umem_odp->umem) - 1;
67}
68
69INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
70 node_start, node_last, , rbt_ib_umem)
71
72/* @last is not a part of the interval. See comment for function
73 * node_last.
74 */
75int rbt_ib_umem_for_each_in_range(struct rb_root *root,
76 u64 start, u64 last,
77 umem_call_back cb,
78 void *cookie)
79{
80 int ret_val = 0;
81 struct umem_odp_node *node;
82 struct ib_umem_odp *umem;
83
84 if (unlikely(start == last))
85 return ret_val;
86
87 for (node = rbt_ib_umem_iter_first(root, start, last - 1); node;
88 node = rbt_ib_umem_iter_next(node, start, last - 1)) {
89 umem = container_of(node, struct ib_umem_odp, interval_tree);
90 ret_val = cb(umem->umem, start, last, cookie) || ret_val;
91 }
92
93 return ret_val;
94}
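
For illustration, here is a hypothetical caller of the iterator above that counts the ODP umems overlapping a virtual range (it assumes CONFIG_INFINIBAND_ON_DEMAND_PAGING, since umem_tree and umem_rwsem only exist in that configuration). The last argument is exclusive, matching ib_umem_end(), even though the tree stores inclusive interval ends internally; count_cb() and count_overlapping_umems() are made-up names.

#include <rdma/ib_umem_odp.h>

static int count_cb(struct ib_umem *item, u64 start, u64 end, void *cookie)
{
        (*(int *)cookie)++;
        return 0;
}

static int count_overlapping_umems(struct ib_ucontext *context,
                                   u64 start, u64 end)
{
        int count = 0;

        /* Hold umem_rwsem so the tree cannot change under the walk. */
        down_read(&context->umem_rwsem);
        rbt_ib_umem_for_each_in_range(&context->umem_tree, start, end,
                                      count_cb, &count);
        up_read(&context->umem_rwsem);

        return count;
}
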
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 643c08a025a5..b716b0815644 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -258,5 +258,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
258 258
259IB_UVERBS_DECLARE_EX_CMD(create_flow); 259IB_UVERBS_DECLARE_EX_CMD(create_flow);
260IB_UVERBS_DECLARE_EX_CMD(destroy_flow); 260IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
261IB_UVERBS_DECLARE_EX_CMD(query_device);
261 262
262#endif /* UVERBS_H */ 263#endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 5ba2a86aab6a..532d8eba8b02 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -36,6 +36,7 @@
36#include <linux/file.h> 36#include <linux/file.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <linux/sched.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41 42
@@ -288,6 +289,9 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
288 struct ib_uverbs_get_context_resp resp; 289 struct ib_uverbs_get_context_resp resp;
289 struct ib_udata udata; 290 struct ib_udata udata;
290 struct ib_device *ibdev = file->device->ib_dev; 291 struct ib_device *ibdev = file->device->ib_dev;
292#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
293 struct ib_device_attr dev_attr;
294#endif
291 struct ib_ucontext *ucontext; 295 struct ib_ucontext *ucontext;
292 struct file *filp; 296 struct file *filp;
293 int ret; 297 int ret;
@@ -325,8 +329,25 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
325 INIT_LIST_HEAD(&ucontext->ah_list); 329 INIT_LIST_HEAD(&ucontext->ah_list);
326 INIT_LIST_HEAD(&ucontext->xrcd_list); 330 INIT_LIST_HEAD(&ucontext->xrcd_list);
327 INIT_LIST_HEAD(&ucontext->rule_list); 331 INIT_LIST_HEAD(&ucontext->rule_list);
332 rcu_read_lock();
333 ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
334 rcu_read_unlock();
328 ucontext->closing = 0; 335 ucontext->closing = 0;
329 336
337#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
338 ucontext->umem_tree = RB_ROOT;
339 init_rwsem(&ucontext->umem_rwsem);
340 ucontext->odp_mrs_count = 0;
341 INIT_LIST_HEAD(&ucontext->no_private_counters);
342
343 ret = ib_query_device(ibdev, &dev_attr);
344 if (ret)
345 goto err_free;
346 if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
347 ucontext->invalidate_range = NULL;
348
349#endif
350
330 resp.num_comp_vectors = file->device->num_comp_vectors; 351 resp.num_comp_vectors = file->device->num_comp_vectors;
331 352
332 ret = get_unused_fd_flags(O_CLOEXEC); 353 ret = get_unused_fd_flags(O_CLOEXEC);
@@ -371,6 +392,7 @@ err_fd:
371 put_unused_fd(resp.async_fd); 392 put_unused_fd(resp.async_fd);
372 393
373err_free: 394err_free:
395 put_pid(ucontext->tgid);
374 ibdev->dealloc_ucontext(ucontext); 396 ibdev->dealloc_ucontext(ucontext);
375 397
376err: 398err:
@@ -378,6 +400,52 @@ err:
378 return ret; 400 return ret;
379} 401}
380 402
403static void copy_query_dev_fields(struct ib_uverbs_file *file,
404 struct ib_uverbs_query_device_resp *resp,
405 struct ib_device_attr *attr)
406{
407 resp->fw_ver = attr->fw_ver;
408 resp->node_guid = file->device->ib_dev->node_guid;
409 resp->sys_image_guid = attr->sys_image_guid;
410 resp->max_mr_size = attr->max_mr_size;
411 resp->page_size_cap = attr->page_size_cap;
412 resp->vendor_id = attr->vendor_id;
413 resp->vendor_part_id = attr->vendor_part_id;
414 resp->hw_ver = attr->hw_ver;
415 resp->max_qp = attr->max_qp;
416 resp->max_qp_wr = attr->max_qp_wr;
417 resp->device_cap_flags = attr->device_cap_flags;
418 resp->max_sge = attr->max_sge;
419 resp->max_sge_rd = attr->max_sge_rd;
420 resp->max_cq = attr->max_cq;
421 resp->max_cqe = attr->max_cqe;
422 resp->max_mr = attr->max_mr;
423 resp->max_pd = attr->max_pd;
424 resp->max_qp_rd_atom = attr->max_qp_rd_atom;
425 resp->max_ee_rd_atom = attr->max_ee_rd_atom;
426 resp->max_res_rd_atom = attr->max_res_rd_atom;
427 resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom;
428 resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom;
429 resp->atomic_cap = attr->atomic_cap;
430 resp->max_ee = attr->max_ee;
431 resp->max_rdd = attr->max_rdd;
432 resp->max_mw = attr->max_mw;
433 resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp;
434 resp->max_raw_ethy_qp = attr->max_raw_ethy_qp;
435 resp->max_mcast_grp = attr->max_mcast_grp;
436 resp->max_mcast_qp_attach = attr->max_mcast_qp_attach;
437 resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach;
438 resp->max_ah = attr->max_ah;
439 resp->max_fmr = attr->max_fmr;
440 resp->max_map_per_fmr = attr->max_map_per_fmr;
441 resp->max_srq = attr->max_srq;
442 resp->max_srq_wr = attr->max_srq_wr;
443 resp->max_srq_sge = attr->max_srq_sge;
444 resp->max_pkeys = attr->max_pkeys;
445 resp->local_ca_ack_delay = attr->local_ca_ack_delay;
446 resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt;
447}
448
381ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, 449ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
382 const char __user *buf, 450 const char __user *buf,
383 int in_len, int out_len) 451 int in_len, int out_len)
@@ -398,47 +466,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
398 return ret; 466 return ret;
399 467
400 memset(&resp, 0, sizeof resp); 468 memset(&resp, 0, sizeof resp);
401 469 copy_query_dev_fields(file, &resp, &attr);
402 resp.fw_ver = attr.fw_ver;
403 resp.node_guid = file->device->ib_dev->node_guid;
404 resp.sys_image_guid = attr.sys_image_guid;
405 resp.max_mr_size = attr.max_mr_size;
406 resp.page_size_cap = attr.page_size_cap;
407 resp.vendor_id = attr.vendor_id;
408 resp.vendor_part_id = attr.vendor_part_id;
409 resp.hw_ver = attr.hw_ver;
410 resp.max_qp = attr.max_qp;
411 resp.max_qp_wr = attr.max_qp_wr;
412 resp.device_cap_flags = attr.device_cap_flags;
413 resp.max_sge = attr.max_sge;
414 resp.max_sge_rd = attr.max_sge_rd;
415 resp.max_cq = attr.max_cq;
416 resp.max_cqe = attr.max_cqe;
417 resp.max_mr = attr.max_mr;
418 resp.max_pd = attr.max_pd;
419 resp.max_qp_rd_atom = attr.max_qp_rd_atom;
420 resp.max_ee_rd_atom = attr.max_ee_rd_atom;
421 resp.max_res_rd_atom = attr.max_res_rd_atom;
422 resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom;
423 resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom;
424 resp.atomic_cap = attr.atomic_cap;
425 resp.max_ee = attr.max_ee;
426 resp.max_rdd = attr.max_rdd;
427 resp.max_mw = attr.max_mw;
428 resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp;
429 resp.max_raw_ethy_qp = attr.max_raw_ethy_qp;
430 resp.max_mcast_grp = attr.max_mcast_grp;
431 resp.max_mcast_qp_attach = attr.max_mcast_qp_attach;
432 resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach;
433 resp.max_ah = attr.max_ah;
434 resp.max_fmr = attr.max_fmr;
435 resp.max_map_per_fmr = attr.max_map_per_fmr;
436 resp.max_srq = attr.max_srq;
437 resp.max_srq_wr = attr.max_srq_wr;
438 resp.max_srq_sge = attr.max_srq_sge;
439 resp.max_pkeys = attr.max_pkeys;
440 resp.local_ca_ack_delay = attr.local_ca_ack_delay;
441 resp.phys_port_cnt = file->device->ib_dev->phys_port_cnt;
442 470
443 if (copy_to_user((void __user *) (unsigned long) cmd.response, 471 if (copy_to_user((void __user *) (unsigned long) cmd.response,
444 &resp, sizeof resp)) 472 &resp, sizeof resp))
@@ -947,6 +975,18 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
947 goto err_free; 975 goto err_free;
948 } 976 }
949 977
978 if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
979 struct ib_device_attr attr;
980
981 ret = ib_query_device(pd->device, &attr);
982 if (ret || !(attr.device_cap_flags &
983 IB_DEVICE_ON_DEMAND_PAGING)) {
984 pr_debug("ODP support not available\n");
985 ret = -EINVAL;
986 goto err_put;
987 }
988 }
989
950 mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, 990 mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
951 cmd.access_flags, &udata); 991 cmd.access_flags, &udata);
952 if (IS_ERR(mr)) { 992 if (IS_ERR(mr)) {
@@ -3253,3 +3293,52 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
3253 3293
3254 return ret ? ret : in_len; 3294 return ret ? ret : in_len;
3255} 3295}
3296
3297int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
3298 struct ib_udata *ucore,
3299 struct ib_udata *uhw)
3300{
3301 struct ib_uverbs_ex_query_device_resp resp;
3302 struct ib_uverbs_ex_query_device cmd;
3303 struct ib_device_attr attr;
3304 struct ib_device *device;
3305 int err;
3306
3307 device = file->device->ib_dev;
3308 if (ucore->inlen < sizeof(cmd))
3309 return -EINVAL;
3310
3311 err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
3312 if (err)
3313 return err;
3314
3315 if (cmd.reserved)
3316 return -EINVAL;
3317
3318 err = device->query_device(device, &attr);
3319 if (err)
3320 return err;
3321
3322 memset(&resp, 0, sizeof(resp));
3323 copy_query_dev_fields(file, &resp.base, &attr);
3324 resp.comp_mask = 0;
3325
3326#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
3327 if (cmd.comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP) {
3328 resp.odp_caps.general_caps = attr.odp_caps.general_caps;
3329 resp.odp_caps.per_transport_caps.rc_odp_caps =
3330 attr.odp_caps.per_transport_caps.rc_odp_caps;
3331 resp.odp_caps.per_transport_caps.uc_odp_caps =
3332 attr.odp_caps.per_transport_caps.uc_odp_caps;
3333 resp.odp_caps.per_transport_caps.ud_odp_caps =
3334 attr.odp_caps.per_transport_caps.ud_odp_caps;
3335 resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP;
3336 }
3337#endif
3338
3339 err = ib_copy_to_udata(ucore, &resp, sizeof(resp));
3340 if (err)
3341 return err;
3342
3343 return 0;
3344}
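
From userspace, this extended verb is what an extended device query would invoke. Below is a hedged sketch of how an application could read the ODP caps reported here, assuming a libibverbs that exposes ibv_query_device_ex() and an IBV_ODP_SUPPORT flag as the userspace equivalents; those names are not defined by this kernel patch.

#include <stdio.h>
#include <infiniband/verbs.h>

/* Sketch: print whether the device advertises ODP and its per-transport caps. */
static void print_odp_caps(struct ibv_context *ctx)
{
        struct ibv_device_attr_ex attr;

        if (ibv_query_device_ex(ctx, NULL, &attr)) {
                fprintf(stderr, "ibv_query_device_ex failed\n");
                return;
        }

        if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT)
                printf("ODP supported: rc_odp_caps=0x%x ud_odp_caps=0x%x\n",
                       attr.odp_caps.per_transport_caps.rc_odp_caps,
                       attr.odp_caps.per_transport_caps.ud_odp_caps);
        else
                printf("ODP not supported by this device\n");
}
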
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 71ab83fde472..e6c23b9eab33 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -122,7 +122,8 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
122 struct ib_udata *ucore, 122 struct ib_udata *ucore,
123 struct ib_udata *uhw) = { 123 struct ib_udata *uhw) = {
124 [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, 124 [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow,
125 [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow 125 [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow,
126 [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device
126}; 127};
127 128
128static void ib_uverbs_add_one(struct ib_device *device); 129static void ib_uverbs_add_one(struct ib_device *device);
@@ -296,6 +297,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
296 kfree(uobj); 297 kfree(uobj);
297 } 298 }
298 299
300 put_pid(context->tgid);
301
299 return context->device->dealloc_ucontext(context); 302 return context->device->dealloc_ucontext(context);
300} 303}
301 304
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c
index 2d5cbf4363e4..bdf3507810cb 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -476,7 +476,7 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
476 c2mr->umem->page_size, 476 c2mr->umem->page_size,
477 i, 477 i,
478 length, 478 length,
479 c2mr->umem->offset, 479 ib_umem_offset(c2mr->umem),
480 &kva, 480 &kva,
481 c2_convert_access(acc), 481 c2_convert_access(acc),
482 c2mr); 482 c2mr);
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 4b8c6116c058..9edc200b311d 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -1640,7 +1640,8 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
1640 __state_set(&ep->com, MPA_REQ_RCVD); 1640 __state_set(&ep->com, MPA_REQ_RCVD);
1641 1641
1642 /* drive upcall */ 1642 /* drive upcall */
1643 mutex_lock(&ep->parent_ep->com.mutex); 1643 mutex_lock_nested(&ep->parent_ep->com.mutex,
1644 SINGLE_DEPTH_NESTING);
1644 if (ep->parent_ep->com.state != DEAD) { 1645 if (ep->parent_ep->com.state != DEAD) {
1645 if (connect_request_upcall(ep)) 1646 if (connect_request_upcall(ep))
1646 abort_connection(ep, skb, GFP_KERNEL); 1647 abort_connection(ep, skb, GFP_KERNEL);
@@ -3126,6 +3127,8 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
3126 err = c4iw_wait_for_reply(&ep->com.dev->rdev, 3127 err = c4iw_wait_for_reply(&ep->com.dev->rdev,
3127 &ep->com.wr_wait, 3128 &ep->com.wr_wait,
3128 0, 0, __func__); 3129 0, 0, __func__);
3130 else if (err > 0)
3131 err = net_xmit_errno(err);
3129 if (err) 3132 if (err)
3130 pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n", 3133 pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n",
3131 err, ep->stid, 3134 err, ep->stid,
@@ -3159,6 +3162,8 @@ static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
3159 err = c4iw_wait_for_reply(&ep->com.dev->rdev, 3162 err = c4iw_wait_for_reply(&ep->com.dev->rdev,
3160 &ep->com.wr_wait, 3163 &ep->com.wr_wait,
3161 0, 0, __func__); 3164 0, 0, __func__);
3165 else if (err > 0)
3166 err = net_xmit_errno(err);
3162 } 3167 }
3163 if (err) 3168 if (err)
3164 pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n" 3169 pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n"
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 72f1f052e88c..eb5df4e62703 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -670,7 +670,7 @@ static int ep_open(struct inode *inode, struct file *file)
670 idr_for_each(&epd->devp->stid_idr, count_idrs, &count); 670 idr_for_each(&epd->devp->stid_idr, count_idrs, &count);
671 spin_unlock_irq(&epd->devp->lock); 671 spin_unlock_irq(&epd->devp->lock);
672 672
673 epd->bufsize = count * 160; 673 epd->bufsize = count * 240;
674 epd->buf = vmalloc(epd->bufsize); 674 epd->buf = vmalloc(epd->bufsize);
675 if (!epd->buf) { 675 if (!epd->buf) {
676 ret = -ENOMEM; 676 ret = -ENOMEM;
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index 0744455cd88b..cb43c2299ac0 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -50,6 +50,13 @@ static int inline_threshold = C4IW_INLINE_THRESHOLD;
50module_param(inline_threshold, int, 0644); 50module_param(inline_threshold, int, 0644);
51MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)"); 51MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)");
52 52
53static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length)
54{
55 return (is_t4(dev->rdev.lldi.adapter_type) ||
56 is_t5(dev->rdev.lldi.adapter_type)) &&
57 length >= 8*1024*1024*1024ULL;
58}
59
53static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, 60static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
54 u32 len, dma_addr_t data, int wait) 61 u32 len, dma_addr_t data, int wait)
55{ 62{
@@ -369,9 +376,11 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
369 int ret; 376 int ret;
370 377
371 ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid, 378 ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid,
372 FW_RI_STAG_NSMR, mhp->attr.perms, 379 FW_RI_STAG_NSMR, mhp->attr.len ?
380 mhp->attr.perms : 0,
373 mhp->attr.mw_bind_enable, mhp->attr.zbva, 381 mhp->attr.mw_bind_enable, mhp->attr.zbva,
374 mhp->attr.va_fbo, mhp->attr.len, shift - 12, 382 mhp->attr.va_fbo, mhp->attr.len ?
383 mhp->attr.len : -1, shift - 12,
375 mhp->attr.pbl_size, mhp->attr.pbl_addr); 384 mhp->attr.pbl_size, mhp->attr.pbl_addr);
376 if (ret) 385 if (ret)
377 return ret; 386 return ret;
@@ -536,6 +545,11 @@ int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask,
536 return ret; 545 return ret;
537 } 546 }
538 547
548 if (mr_exceeds_hw_limits(rhp, total_size)) {
549 kfree(page_list);
550 return -EINVAL;
551 }
552
539 ret = reregister_mem(rhp, php, &mh, shift, npages); 553 ret = reregister_mem(rhp, php, &mh, shift, npages);
540 kfree(page_list); 554 kfree(page_list);
541 if (ret) 555 if (ret)
@@ -596,6 +610,12 @@ struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
596 if (ret) 610 if (ret)
597 goto err; 611 goto err;
598 612
613 if (mr_exceeds_hw_limits(rhp, total_size)) {
614 kfree(page_list);
615 ret = -EINVAL;
616 goto err;
617 }
618
599 ret = alloc_pbl(mhp, npages); 619 ret = alloc_pbl(mhp, npages);
600 if (ret) { 620 if (ret) {
601 kfree(page_list); 621 kfree(page_list);
@@ -699,6 +719,10 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
699 719
700 php = to_c4iw_pd(pd); 720 php = to_c4iw_pd(pd);
701 rhp = php->rhp; 721 rhp = php->rhp;
722
723 if (mr_exceeds_hw_limits(rhp, length))
724 return ERR_PTR(-EINVAL);
725
702 mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); 726 mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
703 if (!mhp) 727 if (!mhp)
704 return ERR_PTR(-ENOMEM); 728 return ERR_PTR(-ENOMEM);
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 2ed3ece2b2ee..bb85d479e66e 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -1538,9 +1538,9 @@ err:
1538 set_state(qhp, C4IW_QP_STATE_ERROR); 1538 set_state(qhp, C4IW_QP_STATE_ERROR);
1539 free = 1; 1539 free = 1;
1540 abort = 1; 1540 abort = 1;
1541 wake_up(&qhp->wait);
1542 BUG_ON(!ep); 1541 BUG_ON(!ep);
1543 flush_qp(qhp); 1542 flush_qp(qhp);
1543 wake_up(&qhp->wait);
1544out: 1544out:
1545 mutex_unlock(&qhp->mutex); 1545 mutex_unlock(&qhp->mutex);
1546 1546
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
index 3488e8c9fcb4..f914b30999f8 100644
--- a/drivers/infiniband/hw/ehca/ehca_mrmw.c
+++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c
@@ -399,7 +399,7 @@ reg_user_mr_fallback:
399 pginfo.num_kpages = num_kpages; 399 pginfo.num_kpages = num_kpages;
400 pginfo.num_hwpages = num_hwpages; 400 pginfo.num_hwpages = num_hwpages;
401 pginfo.u.usr.region = e_mr->umem; 401 pginfo.u.usr.region = e_mr->umem;
402 pginfo.next_hwpage = e_mr->umem->offset / hwpage_size; 402 pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size;
403 pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl; 403 pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl;
404 ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags, 404 ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
405 e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, 405 e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c
index 5e61e9bff697..c7278f6a8217 100644
--- a/drivers/infiniband/hw/ipath/ipath_mr.c
+++ b/drivers/infiniband/hw/ipath/ipath_mr.c
@@ -214,7 +214,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
214 mr->mr.user_base = start; 214 mr->mr.user_base = start;
215 mr->mr.iova = virt_addr; 215 mr->mr.iova = virt_addr;
216 mr->mr.length = length; 216 mr->mr.length = length;
217 mr->mr.offset = umem->offset; 217 mr->mr.offset = ib_umem_offset(umem);
218 mr->mr.access_flags = mr_access_flags; 218 mr->mr.access_flags = mr_access_flags;
219 mr->mr.max_segs = n; 219 mr->mr.max_segs = n;
220 mr->umem = umem; 220 mr->umem = umem;
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 8f9325cfc85d..c36ccbd9a644 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -223,7 +223,6 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
223 223
224 if (flags & IB_MR_REREG_TRANS) { 224 if (flags & IB_MR_REREG_TRANS) {
225 int shift; 225 int shift;
226 int err;
227 int n; 226 int n;
228 227
229 mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); 228 mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
index 4ea0135af484..27a70159e2ea 100644
--- a/drivers/infiniband/hw/mlx5/Makefile
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -1,3 +1,4 @@
1obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o 1obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o
2 2
3mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o 3mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o
4mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 1ba6c42e4df8..8a87404e9c76 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -244,6 +244,12 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
244 props->max_mcast_grp; 244 props->max_mcast_grp;
245 props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ 245 props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
246 246
247#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
248 if (dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)
249 props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
250 props->odp_caps = dev->odp_caps;
251#endif
252
247out: 253out:
248 kfree(in_mad); 254 kfree(in_mad);
249 kfree(out_mad); 255 kfree(out_mad);
@@ -568,6 +574,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
568 goto out_count; 574 goto out_count;
569 } 575 }
570 576
577#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
578 context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
579#endif
580
571 INIT_LIST_HEAD(&context->db_page_list); 581 INIT_LIST_HEAD(&context->db_page_list);
572 mutex_init(&context->db_page_mutex); 582 mutex_init(&context->db_page_mutex);
573 583
@@ -858,7 +868,7 @@ static ssize_t show_reg_pages(struct device *device,
858 struct mlx5_ib_dev *dev = 868 struct mlx5_ib_dev *dev =
859 container_of(device, struct mlx5_ib_dev, ib_dev.dev); 869 container_of(device, struct mlx5_ib_dev, ib_dev.dev);
860 870
861 return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages); 871 return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
862} 872}
863 873
864static ssize_t show_hca(struct device *device, struct device_attribute *attr, 874static ssize_t show_hca(struct device *device, struct device_attribute *attr,
@@ -1321,6 +1331,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
1321 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | 1331 (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
1322 (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | 1332 (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) |
1323 (1ull << IB_USER_VERBS_CMD_OPEN_QP); 1333 (1ull << IB_USER_VERBS_CMD_OPEN_QP);
1334 dev->ib_dev.uverbs_ex_cmd_mask =
1335 (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE);
1324 1336
1325 dev->ib_dev.query_device = mlx5_ib_query_device; 1337 dev->ib_dev.query_device = mlx5_ib_query_device;
1326 dev->ib_dev.query_port = mlx5_ib_query_port; 1338 dev->ib_dev.query_port = mlx5_ib_query_port;
@@ -1366,6 +1378,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
1366 dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; 1378 dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list;
1367 dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; 1379 dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status;
1368 1380
1381 mlx5_ib_internal_query_odp_caps(dev);
1382
1369 if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) { 1383 if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) {
1370 dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; 1384 dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
1371 dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; 1385 dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
@@ -1379,16 +1393,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
1379 goto err_eqs; 1393 goto err_eqs;
1380 1394
1381 mutex_init(&dev->cap_mask_mutex); 1395 mutex_init(&dev->cap_mask_mutex);
1382 spin_lock_init(&dev->mr_lock);
1383 1396
1384 err = create_dev_resources(&dev->devr); 1397 err = create_dev_resources(&dev->devr);
1385 if (err) 1398 if (err)
1386 goto err_eqs; 1399 goto err_eqs;
1387 1400
1388 err = ib_register_device(&dev->ib_dev, NULL); 1401 err = mlx5_ib_odp_init_one(dev);
1389 if (err) 1402 if (err)
1390 goto err_rsrc; 1403 goto err_rsrc;
1391 1404
1405 err = ib_register_device(&dev->ib_dev, NULL);
1406 if (err)
1407 goto err_odp;
1408
1392 err = create_umr_res(dev); 1409 err = create_umr_res(dev);
1393 if (err) 1410 if (err)
1394 goto err_dev; 1411 goto err_dev;
@@ -1410,6 +1427,9 @@ err_umrc:
1410err_dev: 1427err_dev:
1411 ib_unregister_device(&dev->ib_dev); 1428 ib_unregister_device(&dev->ib_dev);
1412 1429
1430err_odp:
1431 mlx5_ib_odp_remove_one(dev);
1432
1413err_rsrc: 1433err_rsrc:
1414 destroy_dev_resources(&dev->devr); 1434 destroy_dev_resources(&dev->devr);
1415 1435
@@ -1425,8 +1445,10 @@ err_dealloc:
1425static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) 1445static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
1426{ 1446{
1427 struct mlx5_ib_dev *dev = context; 1447 struct mlx5_ib_dev *dev = context;
1448
1428 ib_unregister_device(&dev->ib_dev); 1449 ib_unregister_device(&dev->ib_dev);
1429 destroy_umrc_res(dev); 1450 destroy_umrc_res(dev);
1451 mlx5_ib_odp_remove_one(dev);
1430 destroy_dev_resources(&dev->devr); 1452 destroy_dev_resources(&dev->devr);
1431 free_comp_eqs(dev); 1453 free_comp_eqs(dev);
1432 ib_dealloc_device(&dev->ib_dev); 1454 ib_dealloc_device(&dev->ib_dev);
@@ -1440,15 +1462,30 @@ static struct mlx5_interface mlx5_ib_interface = {
1440 1462
1441static int __init mlx5_ib_init(void) 1463static int __init mlx5_ib_init(void)
1442{ 1464{
1465 int err;
1466
1443 if (deprecated_prof_sel != 2) 1467 if (deprecated_prof_sel != 2)
1444 pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); 1468 pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
1445 1469
1446 return mlx5_register_interface(&mlx5_ib_interface); 1470 err = mlx5_ib_odp_init();
1471 if (err)
1472 return err;
1473
1474 err = mlx5_register_interface(&mlx5_ib_interface);
1475 if (err)
1476 goto clean_odp;
1477
1478 return err;
1479
1480clean_odp:
1481 mlx5_ib_odp_cleanup();
1482 return err;
1447} 1483}
1448 1484
1449static void __exit mlx5_ib_cleanup(void) 1485static void __exit mlx5_ib_cleanup(void)
1450{ 1486{
1451 mlx5_unregister_interface(&mlx5_ib_interface); 1487 mlx5_unregister_interface(&mlx5_ib_interface);
1488 mlx5_ib_odp_cleanup();
1452} 1489}
1453 1490
1454module_init(mlx5_ib_init); 1491module_init(mlx5_ib_init);
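
The init-path change above is an ordering fix as much as a feature: mlx5_ib_odp_init() must complete before mlx5_register_interface() can expose devices, and the cleanup paths unwind in the opposite order. A minimal user-space sketch of the same goto-unwind pattern, with hypothetical init_odp()/register_interface() stubs standing in for the real calls:

#include <stdio.h>

/* Hypothetical stand-ins for mlx5_ib_odp_init() and
 * mlx5_register_interface(); each returns 0 on success. */
static int init_odp(void)            { return 0; }
static void cleanup_odp(void)        { }
static int register_interface(void)  { return 0; }

static int module_init_sketch(void)
{
	int err;

	err = init_odp();               /* bring up ODP first */
	if (err)
		return err;

	err = register_interface();     /* only then expose the device */
	if (err)
		goto clean_odp;

	return 0;

clean_odp:
	cleanup_odp();                  /* unwind in reverse order */
	return err;
}

int main(void)
{
	printf("init: %d\n", module_init_sketch());
	return 0;
}
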
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index dae07eae9507..b56e4c5593ee 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -32,6 +32,7 @@
32 32
33#include <linux/module.h> 33#include <linux/module.h>
34#include <rdma/ib_umem.h> 34#include <rdma/ib_umem.h>
35#include <rdma/ib_umem_odp.h>
35#include "mlx5_ib.h" 36#include "mlx5_ib.h"
36 37
37/* @umem: umem object to scan 38/* @umem: umem object to scan
@@ -57,6 +58,17 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
57 int entry; 58 int entry;
58 unsigned long page_shift = ilog2(umem->page_size); 59 unsigned long page_shift = ilog2(umem->page_size);
59 60
61 /* With ODP we must always match OS page size. */
62 if (umem->odp_data) {
63 *count = ib_umem_page_count(umem);
64 *shift = PAGE_SHIFT;
65 *ncont = *count;
66 if (order)
67 *order = ilog2(roundup_pow_of_two(*count));
68
69 return;
70 }
71
60 addr = addr >> page_shift; 72 addr = addr >> page_shift;
61 tmp = (unsigned long)addr; 73 tmp = (unsigned long)addr;
62 m = find_first_bit(&tmp, sizeof(tmp)); 74 m = find_first_bit(&tmp, sizeof(tmp));
@@ -108,8 +120,36 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
108 *count = i; 120 *count = i;
109} 121}
110 122
111void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, 123#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
112 int page_shift, __be64 *pas, int umr) 124static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
125{
126 u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
127
128 if (umem_dma & ODP_READ_ALLOWED_BIT)
129 mtt_entry |= MLX5_IB_MTT_READ;
130 if (umem_dma & ODP_WRITE_ALLOWED_BIT)
131 mtt_entry |= MLX5_IB_MTT_WRITE;
132
133 return mtt_entry;
134}
135#endif
136
137/*
138 * Populate the given array with bus addresses from the umem.
139 *
140 * dev - mlx5_ib device
141 * umem - umem to use to fill the pages
142 * page_shift - determines the page size used in the resulting array
143 * offset - offset into the umem to start from,
144 * only implemented for ODP umems
145 * num_pages - total number of pages to fill
146 * pas - bus addresses array to fill
147 * access_flags - access flags to set on all present pages.
148 use enum mlx5_ib_mtt_access_flags for this.
149 */
150void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
151 int page_shift, size_t offset, size_t num_pages,
152 __be64 *pas, int access_flags)
113{ 153{
114 unsigned long umem_page_shift = ilog2(umem->page_size); 154 unsigned long umem_page_shift = ilog2(umem->page_size);
115 int shift = page_shift - umem_page_shift; 155 int shift = page_shift - umem_page_shift;
@@ -120,6 +160,21 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
120 int len; 160 int len;
121 struct scatterlist *sg; 161 struct scatterlist *sg;
122 int entry; 162 int entry;
163#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
164 const bool odp = umem->odp_data != NULL;
165
166 if (odp) {
167 WARN_ON(shift != 0);
168 WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE));
169
170 for (i = 0; i < num_pages; ++i) {
171 dma_addr_t pa = umem->odp_data->dma_list[offset + i];
172
173 pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
174 }
175 return;
176 }
177#endif
123 178
124 i = 0; 179 i = 0;
125 for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { 180 for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
@@ -128,8 +183,7 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
128 for (k = 0; k < len; k++) { 183 for (k = 0; k < len; k++) {
129 if (!(i & mask)) { 184 if (!(i & mask)) {
130 cur = base + (k << umem_page_shift); 185 cur = base + (k << umem_page_shift);
131 if (umr) 186 cur |= access_flags;
132 cur |= 3;
133 187
134 pas[i >> shift] = cpu_to_be64(cur); 188 pas[i >> shift] = cpu_to_be64(cur);
135 mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", 189 mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n",
@@ -142,6 +196,13 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
142 } 196 }
143} 197}
144 198
199void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
200 int page_shift, __be64 *pas, int access_flags)
201{
202 return __mlx5_ib_populate_pas(dev, umem, page_shift, 0,
203 ib_umem_num_pages(umem), pas,
204 access_flags);
205}
145int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) 206int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
146{ 207{
147 u64 page_size; 208 u64 page_size;
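
For ODP umems the per-page permissions travel in the low bits of each dma_list entry, and umem_dma_to_mtt() above re-encodes them as MTT access flags. A standalone sketch of that translation, assuming the bit layout ib_umem_odp.h gives ODP_READ_ALLOWED_BIT/ODP_WRITE_ALLOWED_BIT (bits 0 and 1) and the MLX5_IB_MTT_* values from the mlx5_ib.h hunk later in this patch:

#include <stdint.h>
#include <stdio.h>

/* Assumed layout: permission bits in the low bits of the DMA address
 * (as in ib_umem_odp.h of this series), MTT flags as in mlx5_ib.h. */
#define ODP_READ_ALLOWED_BIT   (1ULL << 0)
#define ODP_WRITE_ALLOWED_BIT  (1ULL << 1)
#define ODP_DMA_ADDR_MASK      (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))

#define MLX5_IB_MTT_READ   (1 << 0)
#define MLX5_IB_MTT_WRITE  (1 << 1)

static uint64_t umem_dma_to_mtt(uint64_t umem_dma)
{
	uint64_t mtt = umem_dma & ODP_DMA_ADDR_MASK;   /* keep the address */

	if (umem_dma & ODP_READ_ALLOWED_BIT)
		mtt |= MLX5_IB_MTT_READ;
	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
		mtt |= MLX5_IB_MTT_WRITE;
	return mtt;
}

int main(void)
{
	uint64_t dma = 0x12345000ULL | ODP_READ_ALLOWED_BIT;

	printf("mtt entry: 0x%llx\n", (unsigned long long)umem_dma_to_mtt(dma));
	return 0;
}
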
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 386780f0d1e1..83f22fe297c8 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -111,6 +111,8 @@ struct mlx5_ib_pd {
111 */ 111 */
112 112
113#define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START 113#define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START
114#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1)
115#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2)
114#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 116#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1
115#define MLX5_IB_WR_UMR IB_WR_RESERVED1 117#define MLX5_IB_WR_UMR IB_WR_RESERVED1
116 118
@@ -147,6 +149,29 @@ enum {
147 MLX5_QP_EMPTY 149 MLX5_QP_EMPTY
148}; 150};
149 151
152/*
153 * Connect-IB can trigger up to four concurrent pagefaults
 154 * per QP.
155 */
156enum mlx5_ib_pagefault_context {
157 MLX5_IB_PAGEFAULT_RESPONDER_READ,
158 MLX5_IB_PAGEFAULT_REQUESTOR_READ,
159 MLX5_IB_PAGEFAULT_RESPONDER_WRITE,
160 MLX5_IB_PAGEFAULT_REQUESTOR_WRITE,
161 MLX5_IB_PAGEFAULT_CONTEXTS
162};
163
164static inline enum mlx5_ib_pagefault_context
165 mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault)
166{
167 return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE);
168}
169
170struct mlx5_ib_pfault {
171 struct work_struct work;
172 struct mlx5_pagefault mpfault;
173};
174
150struct mlx5_ib_qp { 175struct mlx5_ib_qp {
151 struct ib_qp ibqp; 176 struct ib_qp ibqp;
152 struct mlx5_core_qp mqp; 177 struct mlx5_core_qp mqp;
@@ -192,6 +217,21 @@ struct mlx5_ib_qp {
192 217
193 /* Store signature errors */ 218 /* Store signature errors */
194 bool signature_en; 219 bool signature_en;
220
221#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
222 /*
 223 * A flag that is true for QPs that are in a state that doesn't
224 * allow page faults, and shouldn't schedule any more faults.
225 */
226 int disable_page_faults;
227 /*
228 * The disable_page_faults_lock protects a QP's disable_page_faults
229 * field, allowing for a thread to atomically check whether the QP
230 * allows page faults, and if so schedule a page fault.
231 */
232 spinlock_t disable_page_faults_lock;
233 struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
234#endif
195}; 235};
196 236
197struct mlx5_ib_cq_buf { 237struct mlx5_ib_cq_buf {
@@ -206,6 +246,19 @@ enum mlx5_ib_qp_flags {
206 MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, 246 MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1,
207}; 247};
208 248
249struct mlx5_umr_wr {
250 union {
251 u64 virt_addr;
252 u64 offset;
253 } target;
254 struct ib_pd *pd;
255 unsigned int page_shift;
256 unsigned int npages;
257 u32 length;
258 int access_flags;
259 u32 mkey;
260};
261
209struct mlx5_shared_mr_info { 262struct mlx5_shared_mr_info {
210 int mr_id; 263 int mr_id;
211 struct ib_umem *umem; 264 struct ib_umem *umem;
@@ -253,6 +306,13 @@ struct mlx5_ib_xrcd {
253 u32 xrcdn; 306 u32 xrcdn;
254}; 307};
255 308
309enum mlx5_ib_mtt_access_flags {
310 MLX5_IB_MTT_READ = (1 << 0),
311 MLX5_IB_MTT_WRITE = (1 << 1),
312};
313
314#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
315
256struct mlx5_ib_mr { 316struct mlx5_ib_mr {
257 struct ib_mr ibmr; 317 struct ib_mr ibmr;
258 struct mlx5_core_mr mmr; 318 struct mlx5_core_mr mmr;
@@ -261,12 +321,11 @@ struct mlx5_ib_mr {
261 struct list_head list; 321 struct list_head list;
262 int order; 322 int order;
263 int umred; 323 int umred;
264 __be64 *pas;
265 dma_addr_t dma;
266 int npages; 324 int npages;
267 struct mlx5_ib_dev *dev; 325 struct mlx5_ib_dev *dev;
268 struct mlx5_create_mkey_mbox_out out; 326 struct mlx5_create_mkey_mbox_out out;
269 struct mlx5_core_sig_ctx *sig; 327 struct mlx5_core_sig_ctx *sig;
328 int live;
270}; 329};
271 330
272struct mlx5_ib_fast_reg_page_list { 331struct mlx5_ib_fast_reg_page_list {
@@ -372,11 +431,18 @@ struct mlx5_ib_dev {
372 struct umr_common umrc; 431 struct umr_common umrc;
373 /* sync used page count stats 432 /* sync used page count stats
374 */ 433 */
375 spinlock_t mr_lock;
376 struct mlx5_ib_resources devr; 434 struct mlx5_ib_resources devr;
377 struct mlx5_mr_cache cache; 435 struct mlx5_mr_cache cache;
378 struct timer_list delay_timer; 436 struct timer_list delay_timer;
379 int fill_delay; 437 int fill_delay;
438#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
439 struct ib_odp_caps odp_caps;
440 /*
441 * Sleepable RCU that prevents destruction of MRs while they are still
442 * being used by a page fault handler.
443 */
444 struct srcu_struct mr_srcu;
445#endif
380}; 446};
381 447
382static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) 448static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -490,6 +556,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
490int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, 556int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
491 struct ib_recv_wr **bad_wr); 557 struct ib_recv_wr **bad_wr);
492void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); 558void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n);
559int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
560 void *buffer, u32 length);
493struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, 561struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,
494 int vector, struct ib_ucontext *context, 562 int vector, struct ib_ucontext *context,
495 struct ib_udata *udata); 563 struct ib_udata *udata);
@@ -502,6 +570,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
502struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, 570struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
503 u64 virt_addr, int access_flags, 571 u64 virt_addr, int access_flags,
504 struct ib_udata *udata); 572 struct ib_udata *udata);
573int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index,
574 int npages, int zap);
505int mlx5_ib_dereg_mr(struct ib_mr *ibmr); 575int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
506int mlx5_ib_destroy_mr(struct ib_mr *ibmr); 576int mlx5_ib_destroy_mr(struct ib_mr *ibmr);
507struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, 577struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
@@ -533,8 +603,11 @@ int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev);
533void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); 603void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev);
534void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, 604void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
535 int *ncont, int *order); 605 int *ncont, int *order);
606void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
607 int page_shift, size_t offset, size_t num_pages,
608 __be64 *pas, int access_flags);
536void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, 609void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
537 int page_shift, __be64 *pas, int umr); 610 int page_shift, __be64 *pas, int access_flags);
538void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); 611void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
539int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); 612int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq);
540int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); 613int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
@@ -544,6 +617,38 @@ void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context);
544int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, 617int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
545 struct ib_mr_status *mr_status); 618 struct ib_mr_status *mr_status);
546 619
620#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
621extern struct workqueue_struct *mlx5_ib_page_fault_wq;
622
623int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev);
624void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
625 struct mlx5_ib_pfault *pfault);
626void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp);
627int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
628void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev);
629int __init mlx5_ib_odp_init(void);
630void mlx5_ib_odp_cleanup(void);
631void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
632void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
633void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
634 unsigned long end);
635
636#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
637static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
638{
639 return 0;
640}
641
642static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {}
643static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
644static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {}
645static inline int mlx5_ib_odp_init(void) { return 0; }
646static inline void mlx5_ib_odp_cleanup(void) {}
647static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {}
648static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {}
649
650#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
651
547static inline void init_query_mad(struct ib_smp *mad) 652static inline void init_query_mad(struct ib_smp *mad)
548{ 653{
549 mad->base_version = 1; 654 mad->base_version = 1;
@@ -561,4 +666,7 @@ static inline u8 convert_access(int acc)
561 MLX5_PERM_LOCAL_READ; 666 MLX5_PERM_LOCAL_READ;
562} 667}
563 668
669#define MLX5_MAX_UMR_SHIFT 16
670#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT)
671
564#endif /* MLX5_IB_H */ 672#endif /* MLX5_IB_H */
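
The pagefault-context trick above relies on the enum values lining up with the event flag bits, so flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE) is directly usable as an index into qp->pagefaults[]. A small sketch of that mapping, assuming MLX5_PFAULT_REQUESTOR is bit 0 and MLX5_PFAULT_WRITE is bit 1, as the mlx5/qp.h side of this series defines them:

#include <stdio.h>

/* Assumed flag values from include/linux/mlx5/qp.h in this series. */
#define MLX5_PFAULT_REQUESTOR  (1 << 0)
#define MLX5_PFAULT_WRITE      (1 << 1)

enum pagefault_context {
	RESPONDER_READ,   /* 0: neither bit set     */
	REQUESTOR_READ,   /* 1: requestor           */
	RESPONDER_WRITE,  /* 2: write               */
	REQUESTOR_WRITE,  /* 3: requestor | write   */
	PAGEFAULT_CONTEXTS
};

static enum pagefault_context get_pagefault_context(int flags)
{
	/* The enum is laid out so the two flag bits are the index. */
	return flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE);
}

int main(void)
{
	printf("%d\n", get_pagefault_context(MLX5_PFAULT_REQUESTOR |
					     MLX5_PFAULT_WRITE));   /* 3 */
	return 0;
}
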
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 5a80dd993761..32a28bd50b20 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -37,21 +37,34 @@
37#include <linux/export.h> 37#include <linux/export.h>
38#include <linux/delay.h> 38#include <linux/delay.h>
39#include <rdma/ib_umem.h> 39#include <rdma/ib_umem.h>
40#include <rdma/ib_umem_odp.h>
41#include <rdma/ib_verbs.h>
40#include "mlx5_ib.h" 42#include "mlx5_ib.h"
41 43
42enum { 44enum {
43 MAX_PENDING_REG_MR = 8, 45 MAX_PENDING_REG_MR = 8,
44}; 46};
45 47
46enum { 48#define MLX5_UMR_ALIGN 2048
47 MLX5_UMR_ALIGN = 2048 49#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
48}; 50static __be64 mlx5_ib_update_mtt_emergency_buffer[
51 MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)]
52 __aligned(MLX5_UMR_ALIGN);
53static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
54#endif
55
56static int clean_mr(struct mlx5_ib_mr *mr);
49 57
50static __be64 *mr_align(__be64 *ptr, int align) 58static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
51{ 59{
52 unsigned long mask = align - 1; 60 int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
53 61
54 return (__be64 *)(((unsigned long)ptr + mask) & ~mask); 62#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
63 /* Wait until all page fault handlers using the mr complete. */
64 synchronize_srcu(&dev->mr_srcu);
65#endif
66
67 return err;
55} 68}
56 69
57static int order2idx(struct mlx5_ib_dev *dev, int order) 70static int order2idx(struct mlx5_ib_dev *dev, int order)
@@ -146,7 +159,7 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
146 mr->order = ent->order; 159 mr->order = ent->order;
147 mr->umred = 1; 160 mr->umred = 1;
148 mr->dev = dev; 161 mr->dev = dev;
149 in->seg.status = 1 << 6; 162 in->seg.status = MLX5_MKEY_STATUS_FREE;
150 in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); 163 in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);
151 in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); 164 in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
152 in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; 165 in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;
@@ -191,7 +204,7 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
191 ent->cur--; 204 ent->cur--;
192 ent->size--; 205 ent->size--;
193 spin_unlock_irq(&ent->lock); 206 spin_unlock_irq(&ent->lock);
194 err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); 207 err = destroy_mkey(dev, mr);
195 if (err) 208 if (err)
196 mlx5_ib_warn(dev, "failed destroy mkey\n"); 209 mlx5_ib_warn(dev, "failed destroy mkey\n");
197 else 210 else
@@ -482,7 +495,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
482 ent->cur--; 495 ent->cur--;
483 ent->size--; 496 ent->size--;
484 spin_unlock_irq(&ent->lock); 497 spin_unlock_irq(&ent->lock);
485 err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); 498 err = destroy_mkey(dev, mr);
486 if (err) 499 if (err)
487 mlx5_ib_warn(dev, "failed destroy mkey\n"); 500 mlx5_ib_warn(dev, "failed destroy mkey\n");
488 else 501 else
@@ -668,7 +681,7 @@ static int get_octo_len(u64 addr, u64 len, int page_size)
668 681
669static int use_umr(int order) 682static int use_umr(int order)
670{ 683{
671 return order <= 17; 684 return order <= MLX5_MAX_UMR_SHIFT;
672} 685}
673 686
674static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, 687static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
@@ -678,6 +691,7 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
678{ 691{
679 struct mlx5_ib_dev *dev = to_mdev(pd->device); 692 struct mlx5_ib_dev *dev = to_mdev(pd->device);
680 struct ib_mr *mr = dev->umrc.mr; 693 struct ib_mr *mr = dev->umrc.mr;
694 struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
681 695
682 sg->addr = dma; 696 sg->addr = dma;
683 sg->length = ALIGN(sizeof(u64) * n, 64); 697 sg->length = ALIGN(sizeof(u64) * n, 64);
@@ -692,21 +706,24 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
692 wr->num_sge = 0; 706 wr->num_sge = 0;
693 707
694 wr->opcode = MLX5_IB_WR_UMR; 708 wr->opcode = MLX5_IB_WR_UMR;
695 wr->wr.fast_reg.page_list_len = n; 709
696 wr->wr.fast_reg.page_shift = page_shift; 710 umrwr->npages = n;
697 wr->wr.fast_reg.rkey = key; 711 umrwr->page_shift = page_shift;
698 wr->wr.fast_reg.iova_start = virt_addr; 712 umrwr->mkey = key;
699 wr->wr.fast_reg.length = len; 713 umrwr->target.virt_addr = virt_addr;
700 wr->wr.fast_reg.access_flags = access_flags; 714 umrwr->length = len;
701 wr->wr.fast_reg.page_list = (struct ib_fast_reg_page_list *)pd; 715 umrwr->access_flags = access_flags;
716 umrwr->pd = pd;
702} 717}
703 718
704static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, 719static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
705 struct ib_send_wr *wr, u32 key) 720 struct ib_send_wr *wr, u32 key)
706{ 721{
707 wr->send_flags = MLX5_IB_SEND_UMR_UNREG; 722 struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
723
724 wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE;
708 wr->opcode = MLX5_IB_WR_UMR; 725 wr->opcode = MLX5_IB_WR_UMR;
709 wr->wr.fast_reg.rkey = key; 726 umrwr->mkey = key;
710} 727}
711 728
712void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) 729void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context)
@@ -742,7 +759,10 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
742 struct ib_send_wr wr, *bad; 759 struct ib_send_wr wr, *bad;
743 struct mlx5_ib_mr *mr; 760 struct mlx5_ib_mr *mr;
744 struct ib_sge sg; 761 struct ib_sge sg;
745 int size = sizeof(u64) * npages; 762 int size;
763 __be64 *mr_pas;
764 __be64 *pas;
765 dma_addr_t dma;
746 int err = 0; 766 int err = 0;
747 int i; 767 int i;
748 768
@@ -761,25 +781,31 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
761 if (!mr) 781 if (!mr)
762 return ERR_PTR(-EAGAIN); 782 return ERR_PTR(-EAGAIN);
763 783
764 mr->pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); 784 /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes.
765 if (!mr->pas) { 785 * To avoid copying garbage after the pas array, we allocate
786 * a little more. */
787 size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT);
788 mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
789 if (!mr_pas) {
766 err = -ENOMEM; 790 err = -ENOMEM;
767 goto free_mr; 791 goto free_mr;
768 } 792 }
769 793
770 mlx5_ib_populate_pas(dev, umem, page_shift, 794 pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN);
771 mr_align(mr->pas, MLX5_UMR_ALIGN), 1); 795 mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT);
796 /* Clear padding after the actual pages. */
797 memset(pas + npages, 0, size - npages * sizeof(u64));
772 798
773 mr->dma = dma_map_single(ddev, mr_align(mr->pas, MLX5_UMR_ALIGN), size, 799 dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
774 DMA_TO_DEVICE); 800 if (dma_mapping_error(ddev, dma)) {
775 if (dma_mapping_error(ddev, mr->dma)) {
776 err = -ENOMEM; 801 err = -ENOMEM;
777 goto free_pas; 802 goto free_pas;
778 } 803 }
779 804
780 memset(&wr, 0, sizeof(wr)); 805 memset(&wr, 0, sizeof(wr));
781 wr.wr_id = (u64)(unsigned long)&umr_context; 806 wr.wr_id = (u64)(unsigned long)&umr_context;
782 prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags); 807 prep_umr_reg_wqe(pd, &wr, &sg, dma, npages, mr->mmr.key, page_shift,
808 virt_addr, len, access_flags);
783 809
784 mlx5_ib_init_umr_context(&umr_context); 810 mlx5_ib_init_umr_context(&umr_context);
785 down(&umrc->sem); 811 down(&umrc->sem);
@@ -799,12 +825,14 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
799 mr->mmr.size = len; 825 mr->mmr.size = len;
800 mr->mmr.pd = to_mpd(pd)->pdn; 826 mr->mmr.pd = to_mpd(pd)->pdn;
801 827
828 mr->live = 1;
829
802unmap_dma: 830unmap_dma:
803 up(&umrc->sem); 831 up(&umrc->sem);
804 dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); 832 dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
805 833
806free_pas: 834free_pas:
807 kfree(mr->pas); 835 kfree(mr_pas);
808 836
809free_mr: 837free_mr:
810 if (err) { 838 if (err) {
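
reg_umr() no longer keeps the pas array in the MR; it allocates a scratch buffer rounded up to the UMR copy granularity, aligns it to 2 KB with PTR_ALIGN(), and clears the tail so the UMR never copies stale bytes. A user-space sketch of that allocation, assuming MLX5_UMR_MTT_ALIGNMENT is 64 bytes (the value comes from mlx5/device.h, not shown here):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define UMR_ALIGN           2048   /* MLX5_UMR_ALIGN, from the hunk above     */
#define UMR_MTT_ALIGNMENT     64   /* assumed value of MLX5_UMR_MTT_ALIGNMENT */

#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

/* Build a 2048-byte-aligned, padding-cleared MTT array for npages pages.
 * Returns the aligned pointer; *raw receives the pointer to free(). */
static uint64_t *alloc_pas(size_t npages, void **raw)
{
	size_t size = ALIGN_UP(npages * sizeof(uint64_t), UMR_MTT_ALIGNMENT);
	char *buf = malloc(size + UMR_ALIGN - 1);
	uint64_t *pas;

	if (!buf)
		return NULL;
	*raw = buf;
	pas = (uint64_t *)ALIGN_UP((uintptr_t)buf, UMR_ALIGN);

	/* Fill pas[0..npages-1] here, then clear the tail so the UMR
	 * does not copy stale bytes from the rounded-up region. */
	memset(pas + npages, 0, size - npages * sizeof(uint64_t));
	return pas;
}

int main(void)
{
	void *raw;
	uint64_t *pas = alloc_pas(5, &raw);

	if (pas)
		free(raw);
	return 0;
}
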
@@ -815,6 +843,128 @@ free_mr:
815 return mr; 843 return mr;
816} 844}
817 845
846#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
847int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
848 int zap)
849{
850 struct mlx5_ib_dev *dev = mr->dev;
851 struct device *ddev = dev->ib_dev.dma_device;
852 struct umr_common *umrc = &dev->umrc;
853 struct mlx5_ib_umr_context umr_context;
854 struct ib_umem *umem = mr->umem;
855 int size;
856 __be64 *pas;
857 dma_addr_t dma;
858 struct ib_send_wr wr, *bad;
859 struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr.wr.fast_reg;
860 struct ib_sge sg;
861 int err = 0;
862 const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64);
863 const int page_index_mask = page_index_alignment - 1;
864 size_t pages_mapped = 0;
865 size_t pages_to_map = 0;
866 size_t pages_iter = 0;
867 int use_emergency_buf = 0;
868
869 /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
870 * so we need to align the offset and length accordingly */
871 if (start_page_index & page_index_mask) {
872 npages += start_page_index & page_index_mask;
873 start_page_index &= ~page_index_mask;
874 }
875
876 pages_to_map = ALIGN(npages, page_index_alignment);
877
878 if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES)
879 return -EINVAL;
880
881 size = sizeof(u64) * pages_to_map;
882 size = min_t(int, PAGE_SIZE, size);
883 /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim
884 * code, when we are called from an invalidation. The pas buffer must
885 * be 2k-aligned for Connect-IB. */
886 pas = (__be64 *)get_zeroed_page(GFP_ATOMIC);
887 if (!pas) {
888 mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n");
889 pas = mlx5_ib_update_mtt_emergency_buffer;
890 size = MLX5_UMR_MTT_MIN_CHUNK_SIZE;
891 use_emergency_buf = 1;
892 mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
893 memset(pas, 0, size);
894 }
895 pages_iter = size / sizeof(u64);
896 dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
897 if (dma_mapping_error(ddev, dma)) {
898 mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");
899 err = -ENOMEM;
900 goto free_pas;
901 }
902
903 for (pages_mapped = 0;
904 pages_mapped < pages_to_map && !err;
905 pages_mapped += pages_iter, start_page_index += pages_iter) {
906 dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
907
908 npages = min_t(size_t,
909 pages_iter,
910 ib_umem_num_pages(umem) - start_page_index);
911
912 if (!zap) {
913 __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT,
914 start_page_index, npages, pas,
915 MLX5_IB_MTT_PRESENT);
916 /* Clear padding after the pages brought from the
917 * umem. */
918 memset(pas + npages, 0, size - npages * sizeof(u64));
919 }
920
921 dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
922
923 memset(&wr, 0, sizeof(wr));
924 wr.wr_id = (u64)(unsigned long)&umr_context;
925
926 sg.addr = dma;
927 sg.length = ALIGN(npages * sizeof(u64),
928 MLX5_UMR_MTT_ALIGNMENT);
929 sg.lkey = dev->umrc.mr->lkey;
930
931 wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
932 MLX5_IB_SEND_UMR_UPDATE_MTT;
933 wr.sg_list = &sg;
934 wr.num_sge = 1;
935 wr.opcode = MLX5_IB_WR_UMR;
936 umrwr->npages = sg.length / sizeof(u64);
937 umrwr->page_shift = PAGE_SHIFT;
938 umrwr->mkey = mr->mmr.key;
939 umrwr->target.offset = start_page_index;
940
941 mlx5_ib_init_umr_context(&umr_context);
942 down(&umrc->sem);
943 err = ib_post_send(umrc->qp, &wr, &bad);
944 if (err) {
945 mlx5_ib_err(dev, "UMR post send failed, err %d\n", err);
946 } else {
947 wait_for_completion(&umr_context.done);
948 if (umr_context.status != IB_WC_SUCCESS) {
949 mlx5_ib_err(dev, "UMR completion failed, code %d\n",
950 umr_context.status);
951 err = -EFAULT;
952 }
953 }
954 up(&umrc->sem);
955 }
956 dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
957
958free_pas:
959 if (!use_emergency_buf)
960 free_page((unsigned long)pas);
961 else
962 mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
963
964 return err;
965}
966#endif
967
818static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, 968static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
819 u64 length, struct ib_umem *umem, 969 u64 length, struct ib_umem *umem,
820 int npages, int page_shift, 970 int npages, int page_shift,
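
mlx5_ib_update_mtt() above can only write MTTs in aligned chunks, so it first aligns start_page_index down and rounds npages up to the UMR granularity, then walks the range one buffer-full at a time. A standalone sketch of just that boundary arithmetic, again assuming a 64-byte MLX5_UMR_MTT_ALIGNMENT (eight 8-byte entries per block) and a 4 KB scratch buffer:

#include <stdio.h>
#include <stddef.h>

#define UMR_MTT_ALIGNMENT  64                      /* assumed, in bytes */
#define ENTRIES_PER_BLOCK  (UMR_MTT_ALIGNMENT / sizeof(unsigned long long))
#define PAGE_SIZE_BYTES    4096
#define ENTRIES_PER_CHUNK  (PAGE_SIZE_BYTES / sizeof(unsigned long long))

/* Print the UMR chunks needed to update [start, start + npages) MTT
 * entries, mirroring the alignment logic of mlx5_ib_update_mtt(). */
static void plan_mtt_update(size_t start, size_t npages)
{
	size_t mask = ENTRIES_PER_BLOCK - 1;
	size_t mapped;

	if (start & mask) {                /* align the start down ...  */
		npages += start & mask;    /* ... and grow the range    */
		start &= ~mask;
	}
	npages = (npages + mask) & ~mask;  /* round the length up       */

	for (mapped = 0; mapped < npages;
	     mapped += ENTRIES_PER_CHUNK, start += ENTRIES_PER_CHUNK) {
		size_t n = npages - mapped;

		if (n > ENTRIES_PER_CHUNK)
			n = ENTRIES_PER_CHUNK;
		printf("UMR: offset %zu, %zu entries\n", start, n);
	}
}

int main(void)
{
	plan_mtt_update(13, 1000);
	return 0;
}
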
@@ -825,6 +975,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
825 struct mlx5_ib_mr *mr; 975 struct mlx5_ib_mr *mr;
826 int inlen; 976 int inlen;
827 int err; 977 int err;
978 bool pg_cap = !!(dev->mdev->caps.gen.flags &
979 MLX5_DEV_CAP_FLAG_ON_DMND_PG);
828 980
829 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 981 mr = kzalloc(sizeof(*mr), GFP_KERNEL);
830 if (!mr) 982 if (!mr)
@@ -836,8 +988,12 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
836 err = -ENOMEM; 988 err = -ENOMEM;
837 goto err_1; 989 goto err_1;
838 } 990 }
839 mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, 0); 991 mlx5_ib_populate_pas(dev, umem, page_shift, in->pas,
992 pg_cap ? MLX5_IB_MTT_PRESENT : 0);
840 993
994 /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags
995 * in the page list submitted with the command. */
996 in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0;
841 in->seg.flags = convert_access(access_flags) | 997 in->seg.flags = convert_access(access_flags) |
842 MLX5_ACCESS_MODE_MTT; 998 MLX5_ACCESS_MODE_MTT;
843 in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); 999 in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
@@ -856,6 +1012,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
856 goto err_2; 1012 goto err_2;
857 } 1013 }
858 mr->umem = umem; 1014 mr->umem = umem;
1015 mr->live = 1;
859 kvfree(in); 1016 kvfree(in);
860 1017
861 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); 1018 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);
@@ -910,6 +1067,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
910 mlx5_ib_dbg(dev, "cache empty for order %d", order); 1067 mlx5_ib_dbg(dev, "cache empty for order %d", order);
911 mr = NULL; 1068 mr = NULL;
912 } 1069 }
1070 } else if (access_flags & IB_ACCESS_ON_DEMAND) {
1071 err = -EINVAL;
1072 pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB");
1073 goto error;
913 } 1074 }
914 1075
915 if (!mr) 1076 if (!mr)
@@ -925,16 +1086,51 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
925 1086
926 mr->umem = umem; 1087 mr->umem = umem;
927 mr->npages = npages; 1088 mr->npages = npages;
928 spin_lock(&dev->mr_lock); 1089 atomic_add(npages, &dev->mdev->priv.reg_pages);
929 dev->mdev->priv.reg_pages += npages;
930 spin_unlock(&dev->mr_lock);
931 mr->ibmr.lkey = mr->mmr.key; 1090 mr->ibmr.lkey = mr->mmr.key;
932 mr->ibmr.rkey = mr->mmr.key; 1091 mr->ibmr.rkey = mr->mmr.key;
933 1092
1093#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1094 if (umem->odp_data) {
1095 /*
1096 * This barrier prevents the compiler from moving the
1097 * setting of umem->odp_data->private to point to our
1098 * MR, before reg_umr finished, to ensure that the MR
 1099 * initialization has finished before starting to
1100 * handle invalidations.
1101 */
1102 smp_wmb();
1103 mr->umem->odp_data->private = mr;
1104 /*
1105 * Make sure we will see the new
1106 * umem->odp_data->private value in the invalidation
1107 * routines, before we can get page faults on the
1108 * MR. Page faults can happen once we put the MR in
1109 * the tree, below this line. Without the barrier,
1110 * there can be a fault handling and an invalidation
1111 * before umem->odp_data->private == mr is visible to
1112 * the invalidation handler.
1113 */
1114 smp_wmb();
1115 }
1116#endif
1117
934 return &mr->ibmr; 1118 return &mr->ibmr;
935 1119
936error: 1120error:
1121 /*
1122 * Destroy the umem *before* destroying the MR, to ensure we
1123 * will not have any in-flight notifiers when destroying the
1124 * MR.
1125 *
1126 * As the MR is completely invalid to begin with, and this
1127 * error path is only taken if we can't push the mr entry into
1128 * the pagefault tree, this is safe.
1129 */
1130
937 ib_umem_release(umem); 1131 ib_umem_release(umem);
1132 /* Kill the MR, and return an error code. */
1133 clean_mr(mr);
938 return ERR_PTR(err); 1134 return ERR_PTR(err);
939} 1135}
940 1136
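
The pair of smp_wmb() calls above implement publish-after-init: the MR must be fully set up before umem->odp_data->private can be observed by the invalidation path, and the pointer must be visible before page faults can find the MR in the tree. A user-space analogue using C11 release/acquire atomics rather than the kernel barriers; this illustrates the ordering only, not the kernel API:

#include <stdatomic.h>
#include <stdio.h>

struct mr {
	int key;
	int live;
};

/* Shared slot the invalidation path reads; NULL means "no MR yet". */
static _Atomic(struct mr *) odp_private;

static void publish_mr(struct mr *mr)
{
	mr->key = 0x1234;
	mr->live = 1;
	/* Release ordering: all MR initialization above is visible
	 * before any reader can observe the pointer. */
	atomic_store_explicit(&odp_private, mr, memory_order_release);
}

static void invalidation_path(void)
{
	struct mr *mr = atomic_load_explicit(&odp_private,
					     memory_order_acquire);
	if (mr)                       /* fully initialized if non-NULL */
		printf("invalidating mkey 0x%x\n", mr->key);
}

int main(void)
{
	static struct mr mr;

	publish_mr(&mr);
	invalidation_path();
	return 0;
}
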
@@ -971,17 +1167,14 @@ error:
971 return err; 1167 return err;
972} 1168}
973 1169
974int mlx5_ib_dereg_mr(struct ib_mr *ibmr) 1170static int clean_mr(struct mlx5_ib_mr *mr)
975{ 1171{
976 struct mlx5_ib_dev *dev = to_mdev(ibmr->device); 1172 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
977 struct mlx5_ib_mr *mr = to_mmr(ibmr);
978 struct ib_umem *umem = mr->umem;
979 int npages = mr->npages;
980 int umred = mr->umred; 1173 int umred = mr->umred;
981 int err; 1174 int err;
982 1175
983 if (!umred) { 1176 if (!umred) {
984 err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); 1177 err = destroy_mkey(dev, mr);
985 if (err) { 1178 if (err) {
986 mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", 1179 mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
987 mr->mmr.key, err); 1180 mr->mmr.key, err);
@@ -996,15 +1189,47 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
996 free_cached_mr(dev, mr); 1189 free_cached_mr(dev, mr);
997 } 1190 }
998 1191
999 if (umem) { 1192 if (!umred)
1193 kfree(mr);
1194
1195 return 0;
1196}
1197
1198int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
1199{
1200 struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1201 struct mlx5_ib_mr *mr = to_mmr(ibmr);
1202 int npages = mr->npages;
1203 struct ib_umem *umem = mr->umem;
1204
1205#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1206 if (umem && umem->odp_data) {
1207 /* Prevent new page faults from succeeding */
1208 mr->live = 0;
1209 /* Wait for all running page-fault handlers to finish. */
1210 synchronize_srcu(&dev->mr_srcu);
1211 /* Destroy all page mappings */
1212 mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
1213 ib_umem_end(umem));
1214 /*
1215 * We kill the umem before the MR for ODP,
1216 * so that there will not be any invalidations in
1217 * flight, looking at the *mr struct.
1218 */
1000 ib_umem_release(umem); 1219 ib_umem_release(umem);
1001 spin_lock(&dev->mr_lock); 1220 atomic_sub(npages, &dev->mdev->priv.reg_pages);
1002 dev->mdev->priv.reg_pages -= npages; 1221
1003 spin_unlock(&dev->mr_lock); 1222 /* Avoid double-freeing the umem. */
1223 umem = NULL;
1004 } 1224 }
1225#endif
1005 1226
1006 if (!umred) 1227 clean_mr(mr);
1007 kfree(mr); 1228
1229 if (umem) {
1230 ib_umem_release(umem);
1231 atomic_sub(npages, &dev->mdev->priv.reg_pages);
1232 }
1008 1233
1009 return 0; 1234 return 0;
1010} 1235}
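
The ODP branch of mlx5_ib_dereg_mr() is all about teardown ordering: stop new faults, drain running handlers via SRCU, zap the device mappings, release the umem, and only then destroy the mkey. A sketch of that sequence with hypothetical stub helpers (wait_for_fault_handlers() and friends are stand-ins, not kernel functions):

#include <stdio.h>

/* Hypothetical stubs standing in for the kernel primitives used by
 * mlx5_ib_dereg_mr(); only the ordering is the point here. */
static int mr_live = 1;
static void wait_for_fault_handlers(void) { }  /* ~ synchronize_srcu()        */
static void zap_hw_mappings(void)         { }  /* ~ mlx5_ib_invalidate_range() */
static void release_umem(void)            { }  /* ~ ib_umem_release()          */
static void destroy_mkey_stub(void)       { }  /* ~ destroy_mkey()             */

static void odp_dereg_sketch(void)
{
	mr_live = 0;                /* 1. no new page faults may succeed   */
	wait_for_fault_handlers();  /* 2. drain handlers already running   */
	zap_hw_mappings();          /* 3. device can no longer touch pages */
	release_umem();             /* 4. umem goes before the MR, so no   */
	                            /*    invalidation can still see *mr   */
	destroy_mkey_stub();        /* 5. finally tear down the mkey       */
}

int main(void)
{
	odp_dereg_sketch();
	printf("live=%d\n", mr_live);
	return 0;
}
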
@@ -1028,7 +1253,7 @@ struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
1028 goto err_free; 1253 goto err_free;
1029 } 1254 }
1030 1255
1031 in->seg.status = 1 << 6; /* free */ 1256 in->seg.status = MLX5_MKEY_STATUS_FREE;
1032 in->seg.xlt_oct_size = cpu_to_be32(ndescs); 1257 in->seg.xlt_oct_size = cpu_to_be32(ndescs);
1033 in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); 1258 in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
1034 in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); 1259 in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
@@ -1113,7 +1338,7 @@ int mlx5_ib_destroy_mr(struct ib_mr *ibmr)
1113 kfree(mr->sig); 1338 kfree(mr->sig);
1114 } 1339 }
1115 1340
1116 err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); 1341 err = destroy_mkey(dev, mr);
1117 if (err) { 1342 if (err) {
1118 mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", 1343 mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
1119 mr->mmr.key, err); 1344 mr->mmr.key, err);
@@ -1143,7 +1368,7 @@ struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
1143 goto err_free; 1368 goto err_free;
1144 } 1369 }
1145 1370
1146 in->seg.status = 1 << 6; /* free */ 1371 in->seg.status = MLX5_MKEY_STATUS_FREE;
1147 in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2); 1372 in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2);
1148 in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); 1373 in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
1149 in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; 1374 in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT;
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
new file mode 100644
index 000000000000..a2c541c4809a
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -0,0 +1,798 @@
1/*
2 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33#include <rdma/ib_umem.h>
34#include <rdma/ib_umem_odp.h>
35
36#include "mlx5_ib.h"
37
38#define MAX_PREFETCH_LEN (4*1024*1024U)
39
40/* Timeout in ms to wait for an active mmu notifier to complete when handling
41 * a pagefault. */
42#define MMU_NOTIFIER_TIMEOUT 1000
43
44struct workqueue_struct *mlx5_ib_page_fault_wq;
45
46void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
47 unsigned long end)
48{
49 struct mlx5_ib_mr *mr;
50 const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
51 u64 idx = 0, blk_start_idx = 0;
52 int in_block = 0;
53 u64 addr;
54
55 if (!umem || !umem->odp_data) {
56 pr_err("invalidation called on NULL umem or non-ODP umem\n");
57 return;
58 }
59
60 mr = umem->odp_data->private;
61
62 if (!mr || !mr->ibmr.pd)
63 return;
64
65 start = max_t(u64, ib_umem_start(umem), start);
66 end = min_t(u64, ib_umem_end(umem), end);
67
68 /*
69 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
70 * while we are doing the invalidation, no page fault will attempt to
 71 * overwrite the same MTTs. Concurrent invalidations might race us,
72 * but they will write 0s as well, so no difference in the end result.
73 */
74
75 for (addr = start; addr < end; addr += (u64)umem->page_size) {
76 idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
77 /*
78 * Strive to write the MTTs in chunks, but avoid overwriting
 79 * non-existing MTTs. The heuristic here can be improved to
 80 * estimate the cost of another UMR vs. the cost of a bigger
81 * UMR.
82 */
83 if (umem->odp_data->dma_list[idx] &
84 (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
85 if (!in_block) {
86 blk_start_idx = idx;
87 in_block = 1;
88 }
89 } else {
90 u64 umr_offset = idx & umr_block_mask;
91
92 if (in_block && umr_offset == 0) {
93 mlx5_ib_update_mtt(mr, blk_start_idx,
94 idx - blk_start_idx, 1);
95 in_block = 0;
96 }
97 }
98 }
99 if (in_block)
100 mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
101 1);
102
103 /*
104 * We are now sure that the device will not access the
105 * memory. We can safely unmap it, and mark it as dirty if
106 * needed.
107 */
108
109 ib_umem_odp_unmap_dma_pages(umem, start, end);
110}
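
The invalidation loop above coalesces present entries into blocks and deliberately closes a run only at a UMR-aligned hole, so it never writes MTT entries that were never created. A standalone sketch of that coalescing over a toy present[] bitmap, assuming eight MTT entries per UMR block (64-byte MLX5_UMR_MTT_ALIGNMENT):

#include <stdio.h>

#define UMR_BLOCK_ENTRIES 8   /* assumed MLX5_UMR_MTT_ALIGNMENT / sizeof(u64) */

/* Coalesce present entries into zap requests, closing a run only on a
 * block-aligned hole, as mlx5_ib_invalidate_range() does above. */
static void zap_sketch(const int *present, int nentries)
{
	const int mask = UMR_BLOCK_ENTRIES - 1;
	int in_block = 0, blk_start = 0, idx;

	for (idx = 0; idx < nentries; idx++) {
		if (present[idx]) {
			if (!in_block) {
				blk_start = idx;
				in_block = 1;
			}
		} else if (in_block && (idx & mask) == 0) {
			printf("zap MTTs [%d, %d)\n", blk_start, idx);
			in_block = 0;
		}
	}
	if (in_block)
		printf("zap MTTs [%d, %d)\n", blk_start, idx);
}

int main(void)
{
	int present[20] = { 0, 1, 1, 1, 0, 0, 0, 0,   /* run closes at idx 8 */
			    0, 1, 1, 0, 0, 0, 0, 0,   /* hole spans a block  */
			    1, 1, 1, 1 };             /* run closes at end   */
	zap_sketch(present, 20);
	return 0;
}
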
111
112#define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do { \
113 if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name) \
114 ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name; \
115} while (0)
116
117int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
118{
119 int err;
120 struct mlx5_odp_caps hw_caps;
121 struct ib_odp_caps *caps = &dev->odp_caps;
122
123 memset(caps, 0, sizeof(*caps));
124
125 if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG))
126 return 0;
127
128 err = mlx5_query_odp_caps(dev->mdev, &hw_caps);
129 if (err)
130 goto out;
131
132 caps->general_caps = IB_ODP_SUPPORT;
133 COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps,
134 SEND);
135 COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
136 SEND);
137 COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
138 RECV);
139 COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
140 WRITE);
141 COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
142 READ);
143
144out:
145 return err;
146}
147
148static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
149 u32 key)
150{
151 u32 base_key = mlx5_base_mkey(key);
152 struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
153 struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr);
154
155 if (!mmr || mmr->key != key || !mr->live)
156 return NULL;
157
158 return container_of(mmr, struct mlx5_ib_mr, mmr);
159}
160
161static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
162 struct mlx5_ib_pfault *pfault,
163 int error) {
164 struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
165 int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn,
166 pfault->mpfault.flags,
167 error);
168 if (ret)
169 pr_err("Failed to resolve the page fault on QP 0x%x\n",
170 qp->mqp.qpn);
171}
172
173/*
174 * Handle a single data segment in a page-fault WQE.
175 *
176 * Returns number of pages retrieved on success. The caller will continue to
177 * the next data segment.
178 * Can return the following error codes:
179 * -EAGAIN to designate a temporary error. The caller will abort handling the
180 * page fault and resolve it.
181 * -EFAULT when there's an error mapping the requested pages. The caller will
182 * abort the page fault handling and possibly move the QP to an error state.
183 * On other errors the QP should also be closed with an error.
184 */
185static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
186 struct mlx5_ib_pfault *pfault,
187 u32 key, u64 io_virt, size_t bcnt,
188 u32 *bytes_mapped)
189{
190 struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device);
191 int srcu_key;
192 unsigned int current_seq;
193 u64 start_idx;
194 int npages = 0, ret = 0;
195 struct mlx5_ib_mr *mr;
196 u64 access_mask = ODP_READ_ALLOWED_BIT;
197
198 srcu_key = srcu_read_lock(&mib_dev->mr_srcu);
199 mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key);
200 /*
201 * If we didn't find the MR, it means the MR was closed while we were
202 * handling the ODP event. In this case we return -EFAULT so that the
203 * QP will be closed.
204 */
205 if (!mr || !mr->ibmr.pd) {
206 pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
207 key);
208 ret = -EFAULT;
209 goto srcu_unlock;
210 }
211 if (!mr->umem->odp_data) {
212 pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
213 key);
214 if (bytes_mapped)
215 *bytes_mapped +=
216 (bcnt - pfault->mpfault.bytes_committed);
217 goto srcu_unlock;
218 }
219 if (mr->ibmr.pd != qp->ibqp.pd) {
220 pr_err("Page-fault with different PDs for QP and MR.\n");
221 ret = -EFAULT;
222 goto srcu_unlock;
223 }
224
225 current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
226 /*
227 * Ensure the sequence number is valid for some time before we call
228 * gup.
229 */
230 smp_rmb();
231
232 /*
233 * Avoid branches - this code will perform correctly
234 * in all iterations (in iteration 2 and above,
235 * bytes_committed == 0).
236 */
237 io_virt += pfault->mpfault.bytes_committed;
238 bcnt -= pfault->mpfault.bytes_committed;
239
240 start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT;
241
242 if (mr->umem->writable)
243 access_mask |= ODP_WRITE_ALLOWED_BIT;
244 npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
245 access_mask, current_seq);
246 if (npages < 0) {
247 ret = npages;
248 goto srcu_unlock;
249 }
250
251 if (npages > 0) {
252 mutex_lock(&mr->umem->odp_data->umem_mutex);
253 if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
254 /*
255 * No need to check whether the MTTs really belong to
256 * this MR, since ib_umem_odp_map_dma_pages already
257 * checks this.
258 */
259 ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
260 } else {
261 ret = -EAGAIN;
262 }
263 mutex_unlock(&mr->umem->odp_data->umem_mutex);
264 if (ret < 0) {
265 if (ret != -EAGAIN)
266 pr_err("Failed to update mkey page tables\n");
267 goto srcu_unlock;
268 }
269
270 if (bytes_mapped) {
271 u32 new_mappings = npages * PAGE_SIZE -
272 (io_virt - round_down(io_virt, PAGE_SIZE));
273 *bytes_mapped += min_t(u32, new_mappings, bcnt);
274 }
275 }
276
277srcu_unlock:
278 if (ret == -EAGAIN) {
279 if (!mr->umem->odp_data->dying) {
280 struct ib_umem_odp *odp_data = mr->umem->odp_data;
281 unsigned long timeout =
282 msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
283
284 if (!wait_for_completion_timeout(
285 &odp_data->notifier_completion,
286 timeout)) {
287 pr_warn("timeout waiting for mmu notifier completion\n");
288 }
289 } else {
290 /* The MR is being killed, kill the QP as well. */
291 ret = -EFAULT;
292 }
293 }
294 srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
295 pfault->mpfault.bytes_committed = 0;
296 return ret ? ret : npages;
297}
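
pagefault_single_data_segment() samples notifiers_seq before faulting the pages in and commits the MTTs only if no invalidation ran in between; otherwise it returns -EAGAIN and retries. A much-simplified user-space analogue of that sequence check; the real test (ib_umem_mmu_notifier_retry()) also consults notifiers_count and runs under umem_mutex:

#include <stdatomic.h>
#include <stdio.h>

/* Toy analogue of notifiers_seq: an invalidation bumps the counter,
 * and the fault handler only commits if the counter is unchanged. */
static atomic_uint notifiers_seq;

static int try_install_pages(void)
{
	unsigned int seq = atomic_load_explicit(&notifiers_seq,
						memory_order_acquire);

	/* ... get_user_pages() and DMA-map the range here ... */

	if (atomic_load_explicit(&notifiers_seq,
				 memory_order_acquire) != seq)
		return -1;   /* an invalidation raced us: retry (-EAGAIN) */

	/* Safe to write the MTTs: no invalidation ran since we sampled seq. */
	return 0;
}

int main(void)
{
	printf("%d\n", try_install_pages());
	return 0;
}
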
298
299/**
300 * Parse a series of data segments for page fault handling.
301 *
302 * @qp the QP on which the fault occurred.
303 * @pfault contains page fault information.
304 * @wqe points at the first data segment in the WQE.
305 * @wqe_end points after the end of the WQE.
306 * @bytes_mapped receives the number of bytes that the function was able to
307 * map. This allows the caller to decide intelligently whether
308 * enough memory was mapped to resolve the page fault
309 * successfully (e.g. enough for the next MTU, or the entire
310 * WQE).
311 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
312 * the committed bytes).
313 *
314 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
315 * negative error code.
316 */
317static int pagefault_data_segments(struct mlx5_ib_qp *qp,
318 struct mlx5_ib_pfault *pfault, void *wqe,
319 void *wqe_end, u32 *bytes_mapped,
320 u32 *total_wqe_bytes, int receive_queue)
321{
322 int ret = 0, npages = 0;
323 u64 io_virt;
324 u32 key;
325 u32 byte_count;
326 size_t bcnt;
327 int inline_segment;
328
329 /* Skip SRQ next-WQE segment. */
330 if (receive_queue && qp->ibqp.srq)
331 wqe += sizeof(struct mlx5_wqe_srq_next_seg);
332
333 if (bytes_mapped)
334 *bytes_mapped = 0;
335 if (total_wqe_bytes)
336 *total_wqe_bytes = 0;
337
338 while (wqe < wqe_end) {
339 struct mlx5_wqe_data_seg *dseg = wqe;
340
341 io_virt = be64_to_cpu(dseg->addr);
342 key = be32_to_cpu(dseg->lkey);
343 byte_count = be32_to_cpu(dseg->byte_count);
344 inline_segment = !!(byte_count & MLX5_INLINE_SEG);
345 bcnt = byte_count & ~MLX5_INLINE_SEG;
346
347 if (inline_segment) {
348 bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
349 wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
350 16);
351 } else {
352 wqe += sizeof(*dseg);
353 }
354
355 /* receive WQE end of sg list. */
356 if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
357 io_virt == 0)
358 break;
359
360 if (!inline_segment && total_wqe_bytes) {
361 *total_wqe_bytes += bcnt - min_t(size_t, bcnt,
362 pfault->mpfault.bytes_committed);
363 }
364
365 /* A zero length data segment designates a length of 2GB. */
366 if (bcnt == 0)
367 bcnt = 1U << 31;
368
369 if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) {
370 pfault->mpfault.bytes_committed -=
371 min_t(size_t, bcnt,
372 pfault->mpfault.bytes_committed);
373 continue;
374 }
375
376 ret = pagefault_single_data_segment(qp, pfault, key, io_virt,
377 bcnt, bytes_mapped);
378 if (ret < 0)
379 break;
380 npages += ret;
381 }
382
383 return ret < 0 ? ret : npages;
384}
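
The WQE walk above decodes each data segment's byte_count word: the top bit marks an inline segment, and a zero length on a pointer segment means the full 2 GB. A small sketch of that decoding, with MLX5_INLINE_SEG and the inline byte-count mask written out as assumed values (they come from mlx5/qp.h, not this hunk):

#include <stdint.h>
#include <stdio.h>

#define MLX5_INLINE_SEG         0x80000000u   /* assumed value from mlx5/qp.h */
#define INLINE_BYTE_COUNT_MASK  0x3ffu        /* assumed inline byte-count mask */

/* Decode the byte_count word of a data segment the way
 * pagefault_data_segments() does: strip the inline flag, and treat a
 * zero length on a non-inline segment as 2 GB. */
static void decode_byte_count(uint32_t byte_count)
{
	int inline_seg = !!(byte_count & MLX5_INLINE_SEG);
	uint64_t bcnt = byte_count & ~MLX5_INLINE_SEG;

	if (inline_seg)
		bcnt &= INLINE_BYTE_COUNT_MASK;
	else if (bcnt == 0)
		bcnt = 1ULL << 31;      /* zero length designates 2 GB */

	printf("inline=%d len=%llu\n", inline_seg, (unsigned long long)bcnt);
}

int main(void)
{
	decode_byte_count(0);                      /* non-inline, 2 GB */
	decode_byte_count(MLX5_INLINE_SEG | 64);   /* 64-byte inline   */
	return 0;
}
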
385
386/*
387 * Parse initiator WQE. Advances the wqe pointer to point at the
 388 * scatter-gather list, and sets wqe_end to the end of the WQE.
389 */
390static int mlx5_ib_mr_initiator_pfault_handler(
391 struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
392 void **wqe, void **wqe_end, int wqe_length)
393{
394 struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
395 struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
396 u16 wqe_index = pfault->mpfault.wqe.wqe_index;
397 unsigned ds, opcode;
398#if defined(DEBUG)
399 u32 ctrl_wqe_index, ctrl_qpn;
400#endif
401
402 ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
403 if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
404 mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
405 ds, wqe_length);
406 return -EFAULT;
407 }
408
409 if (ds == 0) {
410 mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
411 wqe_index, qp->mqp.qpn);
412 return -EFAULT;
413 }
414
415#if defined(DEBUG)
416 ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
417 MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
418 MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
419 if (wqe_index != ctrl_wqe_index) {
420 mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
421 wqe_index, qp->mqp.qpn,
422 ctrl_wqe_index);
423 return -EFAULT;
424 }
425
426 ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
427 MLX5_WQE_CTRL_QPN_SHIFT;
428 if (qp->mqp.qpn != ctrl_qpn) {
429 mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
430 wqe_index, qp->mqp.qpn,
431 ctrl_qpn);
432 return -EFAULT;
433 }
434#endif /* DEBUG */
435
436 *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
437 *wqe += sizeof(*ctrl);
438
439 opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
440 MLX5_WQE_CTRL_OPCODE_MASK;
441 switch (qp->ibqp.qp_type) {
442 case IB_QPT_RC:
443 switch (opcode) {
444 case MLX5_OPCODE_SEND:
445 case MLX5_OPCODE_SEND_IMM:
446 case MLX5_OPCODE_SEND_INVAL:
447 if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
448 IB_ODP_SUPPORT_SEND))
449 goto invalid_transport_or_opcode;
450 break;
451 case MLX5_OPCODE_RDMA_WRITE:
452 case MLX5_OPCODE_RDMA_WRITE_IMM:
453 if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
454 IB_ODP_SUPPORT_WRITE))
455 goto invalid_transport_or_opcode;
456 *wqe += sizeof(struct mlx5_wqe_raddr_seg);
457 break;
458 case MLX5_OPCODE_RDMA_READ:
459 if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
460 IB_ODP_SUPPORT_READ))
461 goto invalid_transport_or_opcode;
462 *wqe += sizeof(struct mlx5_wqe_raddr_seg);
463 break;
464 default:
465 goto invalid_transport_or_opcode;
466 }
467 break;
468 case IB_QPT_UD:
469 switch (opcode) {
470 case MLX5_OPCODE_SEND:
471 case MLX5_OPCODE_SEND_IMM:
472 if (!(dev->odp_caps.per_transport_caps.ud_odp_caps &
473 IB_ODP_SUPPORT_SEND))
474 goto invalid_transport_or_opcode;
475 *wqe += sizeof(struct mlx5_wqe_datagram_seg);
476 break;
477 default:
478 goto invalid_transport_or_opcode;
479 }
480 break;
481 default:
482invalid_transport_or_opcode:
483 mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n",
484 qp->ibqp.qp_type, opcode);
485 return -EFAULT;
486 }
487
488 return 0;
489}
490
491/*
492 * Parse responder WQE. Advances the wqe pointer to point at the
 493 * scatter-gather list, and sets wqe_end to the end of the WQE.
494 */
495static int mlx5_ib_mr_responder_pfault_handler(
496 struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
497 void **wqe, void **wqe_end, int wqe_length)
498{
499 struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
500 struct mlx5_ib_wq *wq = &qp->rq;
501 int wqe_size = 1 << wq->wqe_shift;
502
503 if (qp->ibqp.srq) {
504 mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
505 return -EFAULT;
506 }
507
508 if (qp->wq_sig) {
509 mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
510 return -EFAULT;
511 }
512
513 if (wqe_size > wqe_length) {
514 mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
515 return -EFAULT;
516 }
517
518 switch (qp->ibqp.qp_type) {
519 case IB_QPT_RC:
520 if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
521 IB_ODP_SUPPORT_RECV))
522 goto invalid_transport_or_opcode;
523 break;
524 default:
525invalid_transport_or_opcode:
526 mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
527 qp->ibqp.qp_type);
528 return -EFAULT;
529 }
530
531 *wqe_end = *wqe + wqe_size;
532
533 return 0;
534}
535
536static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
537 struct mlx5_ib_pfault *pfault)
538{
539 struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
540 int ret;
541 void *wqe, *wqe_end;
542 u32 bytes_mapped, total_wqe_bytes;
543 char *buffer = NULL;
544 int resume_with_error = 0;
545 u16 wqe_index = pfault->mpfault.wqe.wqe_index;
546 int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR;
547
548 buffer = (char *)__get_free_page(GFP_KERNEL);
549 if (!buffer) {
550 mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
551 resume_with_error = 1;
552 goto resolve_page_fault;
553 }
554
555 ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
556 PAGE_SIZE);
557 if (ret < 0) {
558 mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n",
559 -ret, wqe_index, qp->mqp.qpn);
560 resume_with_error = 1;
561 goto resolve_page_fault;
562 }
563
564 wqe = buffer;
565 if (requestor)
566 ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe,
567 &wqe_end, ret);
568 else
569 ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe,
570 &wqe_end, ret);
571 if (ret < 0) {
572 resume_with_error = 1;
573 goto resolve_page_fault;
574 }
575
576 if (wqe >= wqe_end) {
577 mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
578 resume_with_error = 1;
579 goto resolve_page_fault;
580 }
581
582 ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped,
583 &total_wqe_bytes, !requestor);
584 if (ret == -EAGAIN) {
585 goto resolve_page_fault;
586 } else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
587 mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n",
588 -ret);
589 resume_with_error = 1;
590 goto resolve_page_fault;
591 }
592
593resolve_page_fault:
594 mlx5_ib_page_fault_resume(qp, pfault, resume_with_error);
595 mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n",
596 qp->mqp.qpn, resume_with_error, pfault->mpfault.flags);
597
598 free_page((unsigned long)buffer);
599}
600
601static int pages_in_range(u64 address, u32 length)
602{
603 return (ALIGN(address + length, PAGE_SIZE) -
604 (address & PAGE_MASK)) >> PAGE_SHIFT;
605}
606
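pages_in_range() counts how many pages the byte range [address, address + length) touches, by rounding the end up and the start down to page boundaries. A self-contained sketch of the same arithmetic, assuming 4 KiB pages; the SK_* names are local to the example:

#include <stdio.h>
#include <stdint.h>

/* User-space stand-ins for the kernel macros, assuming 4 KiB pages. */
#define SK_PAGE_SHIFT 12
#define SK_PAGE_SIZE  (1UL << SK_PAGE_SHIFT)
#define SK_PAGE_MASK  (~(SK_PAGE_SIZE - 1))
#define SK_ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

/* Number of pages touched by the byte range [address, address + length). */
static int pages_in_range(uint64_t address, uint32_t length)
{
	return (SK_ALIGN(address + length, SK_PAGE_SIZE) -
		(address & SK_PAGE_MASK)) >> SK_PAGE_SHIFT;
}

int main(void)
{
	/* 100 bytes starting near the end of a page straddle two pages. */
	printf("%d\n", pages_in_range(0x1fd0, 100));   /* prints 2 */
	/* A zero-length range touches no pages at all. */
	printf("%d\n", pages_in_range(0x2000, 0));     /* prints 0 */
	return 0;
}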
607static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
608 struct mlx5_ib_pfault *pfault)
609{
610 struct mlx5_pagefault *mpfault = &pfault->mpfault;
611 u64 address;
612 u32 length;
613 u32 prefetch_len = mpfault->bytes_committed;
614 int prefetch_activated = 0;
615 u32 rkey = mpfault->rdma.r_key;
616 int ret;
617
 618	/* The RDMA responder handler resolves the page fault in two parts.
619 * First it brings the necessary pages for the current packet
620 * (and uses the pfault context), and then (after resuming the QP)
621 * prefetches more pages. The second operation cannot use the pfault
622 * context and therefore uses the dummy_pfault context allocated on
623 * the stack */
624 struct mlx5_ib_pfault dummy_pfault = {};
625
626 dummy_pfault.mpfault.bytes_committed = 0;
627
628 mpfault->rdma.rdma_va += mpfault->bytes_committed;
629 mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed,
630 mpfault->rdma.rdma_op_len);
631 mpfault->bytes_committed = 0;
632
633 address = mpfault->rdma.rdma_va;
634 length = mpfault->rdma.rdma_op_len;
635
636 /* For some operations, the hardware cannot tell the exact message
637 * length, and in those cases it reports zero. Use prefetch
638 * logic. */
639 if (length == 0) {
640 prefetch_activated = 1;
641 length = mpfault->rdma.packet_size;
642 prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
643 }
644
645 ret = pagefault_single_data_segment(qp, pfault, rkey, address, length,
646 NULL);
647 if (ret == -EAGAIN) {
648 /* We're racing with an invalidation, don't prefetch */
649 prefetch_activated = 0;
650 } else if (ret < 0 || pages_in_range(address, length) > ret) {
651 mlx5_ib_page_fault_resume(qp, pfault, 1);
652 return;
653 }
654
655 mlx5_ib_page_fault_resume(qp, pfault, 0);
656
657 /* At this point, there might be a new pagefault already arriving in
 658	 * the eq, so switch to the dummy pagefault for the rest of the
659 * processing. We're still OK with the objects being alive as the
660 * work-queue is being fenced. */
661
662 if (prefetch_activated) {
663 ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey,
664 address,
665 prefetch_len,
666 NULL);
667 if (ret < 0) {
668 pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n",
669 ret, prefetch_activated,
670 qp->ibqp.qp_num, address, prefetch_len);
671 }
672 }
673}
674
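The handler above first skips whatever the HCA reports as already committed, and when the reported operation length is zero it faults in a single packet now and prefetches a capped amount after resuming the QP. A standalone sketch of just that arithmetic; the struct layout and the 4 MiB cap are illustrative assumptions, not quoted from the patch:

#include <stdio.h>
#include <stdint.h>

#define MAX_PREFETCH_LEN (4 * 1024 * 1024)   /* assumed cap, for illustration */

struct rdma_fault {               /* hypothetical, mirrors the rdma fault info */
	uint64_t rdma_va;
	uint32_t rdma_op_len;
	uint32_t packet_size;
	uint32_t bytes_committed;
};

/*
 * Skip the part of the RDMA operation the HCA already committed, and
 * decide how much to prefetch when the reported operation length is zero.
 */
static void split_fault(struct rdma_fault *f,
			uint64_t *addr, uint32_t *len, uint32_t *prefetch_len)
{
	*prefetch_len = f->bytes_committed;

	f->rdma_va += f->bytes_committed;
	f->rdma_op_len -= (f->bytes_committed < f->rdma_op_len ?
			   f->bytes_committed : f->rdma_op_len);
	f->bytes_committed = 0;

	*addr = f->rdma_va;
	*len = f->rdma_op_len;
	if (*len == 0) {
		/* Unknown total length: fault in one packet, prefetch the rest. */
		*len = f->packet_size;
		if (*prefetch_len > MAX_PREFETCH_LEN)
			*prefetch_len = MAX_PREFETCH_LEN;
	}
}

int main(void)
{
	struct rdma_fault f = { 0x10000, 0, 2048, 64 };
	uint64_t addr;
	uint32_t len, pf;

	split_fault(&f, &addr, &len, &pf);
	printf("fault at 0x%llx, len %u, prefetch %u\n",
	       (unsigned long long)addr, len, pf);   /* 0x10040, 2048, 64 */
	return 0;
}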
675void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
676 struct mlx5_ib_pfault *pfault)
677{
678 u8 event_subtype = pfault->mpfault.event_subtype;
679
680 switch (event_subtype) {
681 case MLX5_PFAULT_SUBTYPE_WQE:
682 mlx5_ib_mr_wqe_pfault_handler(qp, pfault);
683 break;
684 case MLX5_PFAULT_SUBTYPE_RDMA:
685 mlx5_ib_mr_rdma_pfault_handler(qp, pfault);
686 break;
687 default:
688 pr_warn("Invalid page fault event subtype: 0x%x\n",
689 event_subtype);
690 mlx5_ib_page_fault_resume(qp, pfault, 1);
691 break;
692 }
693}
694
695static void mlx5_ib_qp_pfault_action(struct work_struct *work)
696{
697 struct mlx5_ib_pfault *pfault = container_of(work,
698 struct mlx5_ib_pfault,
699 work);
700 enum mlx5_ib_pagefault_context context =
701 mlx5_ib_get_pagefault_context(&pfault->mpfault);
702 struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
703 pagefaults[context]);
704 mlx5_ib_mr_pfault_handler(qp, pfault);
705}
706
707void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
708{
709 unsigned long flags;
710
711 spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
712 qp->disable_page_faults = 1;
713 spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
714
715 /*
 716	 * Note that at this point, we are guaranteed that no more
717 * work queue elements will be posted to the work queue with
718 * the QP we are closing.
719 */
720 flush_workqueue(mlx5_ib_page_fault_wq);
721}
722
723void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
724{
725 unsigned long flags;
726
727 spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
728 qp->disable_page_faults = 0;
729 spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
730}
731
732static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
733 struct mlx5_pagefault *pfault)
734{
735 /*
736 * Note that we will only get one fault event per QP per context
737 * (responder/initiator, read/write), until we resolve the page fault
738 * with the mlx5_ib_page_fault_resume command. Since this function is
739 * called from within the work element, there is no risk of missing
740 * events.
741 */
742 struct mlx5_ib_qp *mibqp = to_mibqp(qp);
743 enum mlx5_ib_pagefault_context context =
744 mlx5_ib_get_pagefault_context(pfault);
745 struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];
746
747 qp_pfault->mpfault = *pfault;
748
 749	/* No need to disable interrupts here since we are in interrupt context */
750 spin_lock(&mibqp->disable_page_faults_lock);
751 if (!mibqp->disable_page_faults)
752 queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
753 spin_unlock(&mibqp->disable_page_faults_lock);
754}
755
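mlx5_ib_pfault_handler() together with the disable/enable pair above forms a simple quiescing gate: the interrupt path queues the handler only while the flag is clear, and disabling flips the flag under the lock before draining the workqueue, so no stale work can touch a QP being torn down. A user-space analogue of that pattern, with a pthread mutex standing in for the spinlock and a counter standing in for the real workqueue:

#include <pthread.h>
#include <stdio.h>

/* Minimal stand-in for the per-QP "disable_page_faults" gate. */
static pthread_mutex_t gate_lock = PTHREAD_MUTEX_INITIALIZER;
static int page_faults_disabled = 1;       /* starts disabled, like a new QP */
static int queued;                         /* pretend work-queue depth */

/* Interrupt path: only queue the handler while faults are enabled. */
static void fault_event(void)
{
	pthread_mutex_lock(&gate_lock);
	if (!page_faults_disabled)
		queued++;
	pthread_mutex_unlock(&gate_lock);
}

static void enable_pagefaults(void)
{
	pthread_mutex_lock(&gate_lock);
	page_faults_disabled = 0;
	pthread_mutex_unlock(&gate_lock);
}

static void disable_pagefaults(void)
{
	pthread_mutex_lock(&gate_lock);
	page_faults_disabled = 1;
	pthread_mutex_unlock(&gate_lock);
	/*
	 * Once the flag flips under the lock no new work can be queued, so
	 * draining what is already queued (flush_workqueue in the driver)
	 * leaves nothing referencing the QP being torn down.
	 */
	queued = 0;
}

int main(void)
{
	fault_event();            /* dropped: faults still disabled */
	enable_pagefaults();
	fault_event();            /* queued */
	printf("queued after enable: %d\n", queued);   /* 1 */
	disable_pagefaults();
	fault_event();            /* dropped again */
	printf("queued after disable: %d\n", queued);  /* 0 */
	return 0;
}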
756void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
757{
758 int i;
759
760 qp->disable_page_faults = 1;
761 spin_lock_init(&qp->disable_page_faults_lock);
762
763 qp->mqp.pfault_handler = mlx5_ib_pfault_handler;
764
765 for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
766 INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
767}
768
769int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
770{
771 int ret;
772
773 ret = init_srcu_struct(&ibdev->mr_srcu);
774 if (ret)
775 return ret;
776
777 return 0;
778}
779
780void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
781{
782 cleanup_srcu_struct(&ibdev->mr_srcu);
783}
784
785int __init mlx5_ib_odp_init(void)
786{
787 mlx5_ib_page_fault_wq =
788 create_singlethread_workqueue("mlx5_ib_page_faults");
789 if (!mlx5_ib_page_fault_wq)
790 return -ENOMEM;
791
792 return 0;
793}
794
795void mlx5_ib_odp_cleanup(void)
796{
797 destroy_workqueue(mlx5_ib_page_fault_wq);
798}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 1cae1c7132b4..be0cd358b080 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -70,15 +70,6 @@ static const u32 mlx5_ib_opcode[] = {
70 [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, 70 [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR,
71}; 71};
72 72
73struct umr_wr {
74 u64 virt_addr;
75 struct ib_pd *pd;
76 unsigned int page_shift;
77 unsigned int npages;
78 u32 length;
79 int access_flags;
80 u32 mkey;
81};
82 73
83static int is_qp0(enum ib_qp_type qp_type) 74static int is_qp0(enum ib_qp_type qp_type)
84{ 75{
@@ -110,6 +101,77 @@ void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n)
110 return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); 101 return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE));
111} 102}
112 103
104/**
105 * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space.
106 *
107 * @qp: QP to copy from.
108 * @send: copy from the send queue when non-zero, use the receive queue
109 * otherwise.
110 * @wqe_index: index to start copying from. For send work queues, the
111 * wqe_index is in units of MLX5_SEND_WQE_BB.
 112 * For receive work queues, it is the index of the work queue
 113 * element in the queue.
114 * @buffer: destination buffer.
115 * @length: maximum number of bytes to copy.
116 *
117 * Copies at least a single WQE, but may copy more data.
118 *
119 * Return: the number of bytes copied, or an error code.
120 */
121int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
122 void *buffer, u32 length)
123{
124 struct ib_device *ibdev = qp->ibqp.device;
125 struct mlx5_ib_dev *dev = to_mdev(ibdev);
126 struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq;
127 size_t offset;
128 size_t wq_end;
129 struct ib_umem *umem = qp->umem;
130 u32 first_copy_length;
131 int wqe_length;
132 int ret;
133
134 if (wq->wqe_cnt == 0) {
135 mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n",
136 qp->ibqp.qp_type);
137 return -EINVAL;
138 }
139
140 offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift);
141 wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift);
142
143 if (send && length < sizeof(struct mlx5_wqe_ctrl_seg))
144 return -EINVAL;
145
146 if (offset > umem->length ||
147 (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length))
148 return -EINVAL;
149
150 first_copy_length = min_t(u32, offset + length, wq_end) - offset;
151 ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length);
152 if (ret)
153 return ret;
154
155 if (send) {
156 struct mlx5_wqe_ctrl_seg *ctrl = buffer;
157 int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
158
159 wqe_length = ds * MLX5_WQE_DS_UNITS;
160 } else {
161 wqe_length = 1 << wq->wqe_shift;
162 }
163
164 if (wqe_length <= first_copy_length)
165 return first_copy_length;
166
167 ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset,
168 wqe_length - first_copy_length);
169 if (ret)
170 return ret;
171
172 return wqe_length;
173}
174
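mlx5_ib_read_user_wqe() treats the user work queue as a ring: it copies from the faulting WQE's offset up to the end of the queue, and if the WQE continues past that point it wraps and copies the remainder from the queue's start. A simplified, self-contained sketch of that two-step copy, using fixed-size WQEs and a plain array in place of the umem (the real function derives the send WQE length from the ctrl segment's DS field):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define WQE_CNT   4           /* hypothetical queue of 4 WQEs            */
#define WQE_SHIFT 4           /* 16-byte WQEs, purely for illustration   */
#define WQE_SIZE  (1u << WQE_SHIFT)
#define WQ_BYTES  (WQE_CNT * WQE_SIZE)

static unsigned char wq[WQ_BYTES];   /* stands in for the user umem */

/*
 * Copy `length` bytes starting at a WQE index out of a circular work
 * queue, wrapping to the start of the queue when the request runs past
 * the end -- the same two-step copy mlx5_ib_read_user_wqe() performs
 * with ib_umem_copy_from().
 */
static uint32_t read_wqe(int wqe_index, void *buffer, uint32_t length)
{
	uint32_t offset = (wqe_index % WQE_CNT) << WQE_SHIFT;
	uint32_t wq_end = WQ_BYTES;
	uint32_t first = (offset + length < wq_end ? offset + length : wq_end)
			 - offset;

	memcpy(buffer, wq + offset, first);
	if (length <= first)
		return first;
	/* Wrap around: the rest of the request comes from the queue start. */
	memcpy((unsigned char *)buffer + first, wq, length - first);
	return length;
}

int main(void)
{
	unsigned char buf[2 * WQE_SIZE];

	for (unsigned i = 0; i < WQ_BYTES; i++)
		wq[i] = (unsigned char)i;

	/* Reading two WQEs starting at the last slot wraps to slot 0. */
	uint32_t n = read_wqe(3, buf, 2 * WQE_SIZE);
	printf("copied %u bytes, last byte = %u (queue offset 15)\n",
	       n, (unsigned)buf[n - 1]);
	return 0;
}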
113static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) 175static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
114{ 176{
115 struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; 177 struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
@@ -814,6 +876,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
814 int inlen = sizeof(*in); 876 int inlen = sizeof(*in);
815 int err; 877 int err;
816 878
879 mlx5_ib_odp_create_qp(qp);
880
817 gen = &dev->mdev->caps.gen; 881 gen = &dev->mdev->caps.gen;
818 mutex_init(&qp->mutex); 882 mutex_init(&qp->mutex);
819 spin_lock_init(&qp->sq.lock); 883 spin_lock_init(&qp->sq.lock);
@@ -1098,11 +1162,13 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
1098 in = kzalloc(sizeof(*in), GFP_KERNEL); 1162 in = kzalloc(sizeof(*in), GFP_KERNEL);
1099 if (!in) 1163 if (!in)
1100 return; 1164 return;
1101 if (qp->state != IB_QPS_RESET) 1165 if (qp->state != IB_QPS_RESET) {
1166 mlx5_ib_qp_disable_pagefaults(qp);
1102 if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state), 1167 if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state),
1103 MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp)) 1168 MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp))
1104 mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", 1169 mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
1105 qp->mqp.qpn); 1170 qp->mqp.qpn);
1171 }
1106 1172
1107 get_cqs(qp, &send_cq, &recv_cq); 1173 get_cqs(qp, &send_cq, &recv_cq);
1108 1174
@@ -1650,6 +1716,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
1650 if (mlx5_st < 0) 1716 if (mlx5_st < 0)
1651 goto out; 1717 goto out;
1652 1718
1719 /* If moving to a reset or error state, we must disable page faults on
1720 * this QP and flush all current page faults. Otherwise a stale page
1721 * fault may attempt to work on this QP after it is reset and moved
1722 * again to RTS, and may cause the driver and the device to get out of
1723 * sync. */
1724 if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
1725 (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
1726 mlx5_ib_qp_disable_pagefaults(qp);
1727
1653 optpar = ib_mask_to_mlx5_opt(attr_mask); 1728 optpar = ib_mask_to_mlx5_opt(attr_mask);
1654 optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; 1729 optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
1655 in->optparam = cpu_to_be32(optpar); 1730 in->optparam = cpu_to_be32(optpar);
@@ -1659,6 +1734,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
1659 if (err) 1734 if (err)
1660 goto out; 1735 goto out;
1661 1736
1737 if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
1738 mlx5_ib_qp_enable_pagefaults(qp);
1739
1662 qp->state = new_state; 1740 qp->state = new_state;
1663 1741
1664 if (attr_mask & IB_QP_ACCESS_FLAGS) 1742 if (attr_mask & IB_QP_ACCESS_FLAGS)
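The two hooks added to __mlx5_ib_modify_qp() above encode a small rule: quiesce page faults before any transition from a live state into RESET or ERR, and re-arm them on the RESET to INIT transition. A tiny sketch of those two predicates; the enum names are illustrative stand-ins for the ib_qp_state values:

#include <stdio.h>

enum qp_state { QPS_RESET, QPS_INIT, QPS_RTR, QPS_RTS, QPS_ERR };

/* Should page faults be quiesced before this transition is applied? */
static int need_disable_pagefaults(enum qp_state cur, enum qp_state next)
{
	return cur != QPS_RESET && cur != QPS_ERR &&
	       (next == QPS_RESET || next == QPS_ERR);
}

/* Should page-fault handling be re-armed after the transition? */
static int need_enable_pagefaults(enum qp_state cur, enum qp_state next)
{
	return cur == QPS_RESET && next == QPS_INIT;
}

int main(void)
{
	printf("RTS->ERR    disable=%d\n",
	       need_disable_pagefaults(QPS_RTS, QPS_ERR));     /* 1 */
	printf("RESET->INIT enable=%d\n",
	       need_enable_pagefaults(QPS_RESET, QPS_INIT));   /* 1 */
	return 0;
}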
@@ -1848,37 +1926,70 @@ static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
1848 umr->mkey_mask = frwr_mkey_mask(); 1926 umr->mkey_mask = frwr_mkey_mask();
1849} 1927}
1850 1928
1929static __be64 get_umr_reg_mr_mask(void)
1930{
1931 u64 result;
1932
1933 result = MLX5_MKEY_MASK_LEN |
1934 MLX5_MKEY_MASK_PAGE_SIZE |
1935 MLX5_MKEY_MASK_START_ADDR |
1936 MLX5_MKEY_MASK_PD |
1937 MLX5_MKEY_MASK_LR |
1938 MLX5_MKEY_MASK_LW |
1939 MLX5_MKEY_MASK_KEY |
1940 MLX5_MKEY_MASK_RR |
1941 MLX5_MKEY_MASK_RW |
1942 MLX5_MKEY_MASK_A |
1943 MLX5_MKEY_MASK_FREE;
1944
1945 return cpu_to_be64(result);
1946}
1947
1948static __be64 get_umr_unreg_mr_mask(void)
1949{
1950 u64 result;
1951
1952 result = MLX5_MKEY_MASK_FREE;
1953
1954 return cpu_to_be64(result);
1955}
1956
1957static __be64 get_umr_update_mtt_mask(void)
1958{
1959 u64 result;
1960
1961 result = MLX5_MKEY_MASK_FREE;
1962
1963 return cpu_to_be64(result);
1964}
1965
1851static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, 1966static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
1852 struct ib_send_wr *wr) 1967 struct ib_send_wr *wr)
1853{ 1968{
1854 struct umr_wr *umrwr = (struct umr_wr *)&wr->wr.fast_reg; 1969 struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
1855 u64 mask;
1856 1970
1857 memset(umr, 0, sizeof(*umr)); 1971 memset(umr, 0, sizeof(*umr));
1858 1972
1973 if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE)
1974 umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */
1975 else
1976 umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */
1977
1859 if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { 1978 if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) {
1860 umr->flags = 1 << 5; /* fail if not free */
1861 umr->klm_octowords = get_klm_octo(umrwr->npages); 1979 umr->klm_octowords = get_klm_octo(umrwr->npages);
1862 mask = MLX5_MKEY_MASK_LEN | 1980 if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) {
1863 MLX5_MKEY_MASK_PAGE_SIZE | 1981 umr->mkey_mask = get_umr_update_mtt_mask();
1864 MLX5_MKEY_MASK_START_ADDR | 1982 umr->bsf_octowords = get_klm_octo(umrwr->target.offset);
1865 MLX5_MKEY_MASK_PD | 1983 umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
1866 MLX5_MKEY_MASK_LR | 1984 } else {
1867 MLX5_MKEY_MASK_LW | 1985 umr->mkey_mask = get_umr_reg_mr_mask();
1868 MLX5_MKEY_MASK_KEY | 1986 }
1869 MLX5_MKEY_MASK_RR |
1870 MLX5_MKEY_MASK_RW |
1871 MLX5_MKEY_MASK_A |
1872 MLX5_MKEY_MASK_FREE;
1873 umr->mkey_mask = cpu_to_be64(mask);
1874 } else { 1987 } else {
1875 umr->flags = 2 << 5; /* fail if free */ 1988 umr->mkey_mask = get_umr_unreg_mr_mask();
1876 mask = MLX5_MKEY_MASK_FREE;
1877 umr->mkey_mask = cpu_to_be64(mask);
1878 } 1989 }
1879 1990
1880 if (!wr->num_sge) 1991 if (!wr->num_sge)
1881 umr->flags |= (1 << 7); /* inline */ 1992 umr->flags |= MLX5_UMR_INLINE;
1882} 1993}
1883 1994
1884static u8 get_umr_flags(int acc) 1995static u8 get_umr_flags(int acc)
@@ -1895,7 +2006,7 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr,
1895{ 2006{
1896 memset(seg, 0, sizeof(*seg)); 2007 memset(seg, 0, sizeof(*seg));
1897 if (li) { 2008 if (li) {
1898 seg->status = 1 << 6; 2009 seg->status = MLX5_MKEY_STATUS_FREE;
1899 return; 2010 return;
1900 } 2011 }
1901 2012
@@ -1912,19 +2023,23 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr,
1912 2023
1913static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) 2024static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr)
1914{ 2025{
2026 struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
2027
1915 memset(seg, 0, sizeof(*seg)); 2028 memset(seg, 0, sizeof(*seg));
1916 if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { 2029 if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) {
1917 seg->status = 1 << 6; 2030 seg->status = MLX5_MKEY_STATUS_FREE;
1918 return; 2031 return;
1919 } 2032 }
1920 2033
1921 seg->flags = convert_access(wr->wr.fast_reg.access_flags); 2034 seg->flags = convert_access(umrwr->access_flags);
1922 seg->flags_pd = cpu_to_be32(to_mpd((struct ib_pd *)wr->wr.fast_reg.page_list)->pdn); 2035 if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) {
1923 seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); 2036 seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn);
1924 seg->len = cpu_to_be64(wr->wr.fast_reg.length); 2037 seg->start_addr = cpu_to_be64(umrwr->target.virt_addr);
1925 seg->log2_page_size = wr->wr.fast_reg.page_shift; 2038 }
2039 seg->len = cpu_to_be64(umrwr->length);
2040 seg->log2_page_size = umrwr->page_shift;
1926 seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | 2041 seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
1927 mlx5_mkey_variant(wr->wr.fast_reg.rkey)); 2042 mlx5_mkey_variant(umrwr->mkey));
1928} 2043}
1929 2044
1930static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, 2045static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg,
@@ -2927,6 +3042,14 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr
2927 int mlx5_state; 3042 int mlx5_state;
2928 int err = 0; 3043 int err = 0;
2929 3044
3045#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
3046 /*
3047 * Wait for any outstanding page faults, in case the user frees memory
3048 * based upon this query's result.
3049 */
3050 flush_workqueue(mlx5_ib_page_fault_wq);
3051#endif
3052
2930 mutex_lock(&qp->mutex); 3053 mutex_lock(&qp->mutex);
2931 outb = kzalloc(sizeof(*outb), GFP_KERNEL); 3054 outb = kzalloc(sizeof(*outb), GFP_KERNEL);
2932 if (!outb) { 3055 if (!outb) {
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index fef067c959fc..c0d0296e7a00 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -2341,9 +2341,9 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
2341 nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u," 2341 nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u,"
2342 " offset = %u, page size = %u.\n", 2342 " offset = %u, page size = %u.\n",
2343 (unsigned long int)start, (unsigned long int)virt, (u32)length, 2343 (unsigned long int)start, (unsigned long int)virt, (u32)length,
2344 region->offset, region->page_size); 2344 ib_umem_offset(region), region->page_size);
2345 2345
2346 skip_pages = ((u32)region->offset) >> 12; 2346 skip_pages = ((u32)ib_umem_offset(region)) >> 12;
2347 2347
2348 if (ib_copy_from_udata(&req, udata, sizeof(req))) { 2348 if (ib_copy_from_udata(&req, udata, sizeof(req))) {
2349 ib_umem_release(region); 2349 ib_umem_release(region);
@@ -2408,7 +2408,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
2408 region_length -= skip_pages << 12; 2408 region_length -= skip_pages << 12;
2409 for (page_index = skip_pages; page_index < chunk_pages; page_index++) { 2409 for (page_index = skip_pages; page_index < chunk_pages; page_index++) {
2410 skip_pages = 0; 2410 skip_pages = 0;
2411 if ((page_count != 0) && (page_count<<12)-(region->offset&(4096-1)) >= region->length) 2411 if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length)
2412 goto enough_pages; 2412 goto enough_pages;
2413 if ((page_count&0x01FF) == 0) { 2413 if ((page_count&0x01FF) == 0) {
2414 if (page_count >= 1024 * 512) { 2414 if (page_count >= 1024 * 512) {
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
index ac02ce4e8040..f3cc8c9e65ae 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
@@ -96,7 +96,6 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
96 struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); 96 struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
97 struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); 97 struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
98 union ib_gid sgid; 98 union ib_gid sgid;
99 u8 zmac[ETH_ALEN];
100 99
101 if (!(attr->ah_flags & IB_AH_GRH)) 100 if (!(attr->ah_flags & IB_AH_GRH))
102 return ERR_PTR(-EINVAL); 101 return ERR_PTR(-EINVAL);
@@ -118,9 +117,7 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
118 goto av_conf_err; 117 goto av_conf_err;
119 } 118 }
120 119
121 memset(&zmac, 0, ETH_ALEN); 120 if (pd->uctx) {
122 if (pd->uctx &&
123 memcmp(attr->dmac, &zmac, ETH_ALEN)) {
124 status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid, 121 status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid,
125 attr->dmac, &attr->vlan_id); 122 attr->dmac, &attr->vlan_id);
126 if (status) { 123 if (status) {
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 4c68305ee781..fb8d8c4dfbb9 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -805,7 +805,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
805 goto umem_err; 805 goto umem_err;
806 806
807 mr->hwmr.pbe_size = mr->umem->page_size; 807 mr->hwmr.pbe_size = mr->umem->page_size;
808 mr->hwmr.fbo = mr->umem->offset; 808 mr->hwmr.fbo = ib_umem_offset(mr->umem);
809 mr->hwmr.va = usr_addr; 809 mr->hwmr.va = usr_addr;
810 mr->hwmr.len = len; 810 mr->hwmr.len = len;
811 mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; 811 mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
@@ -1410,6 +1410,8 @@ int ocrdma_query_qp(struct ib_qp *ibqp,
1410 mutex_unlock(&dev->dev_lock); 1410 mutex_unlock(&dev->dev_lock);
1411 if (status) 1411 if (status)
1412 goto mbx_err; 1412 goto mbx_err;
1413 if (qp->qp_type == IB_QPT_UD)
1414 qp_attr->qkey = params.qkey;
1413 qp_attr->qp_state = get_ibqp_state(IB_QPS_INIT); 1415 qp_attr->qp_state = get_ibqp_state(IB_QPS_INIT);
1414 qp_attr->cur_qp_state = get_ibqp_state(IB_QPS_INIT); 1416 qp_attr->cur_qp_state = get_ibqp_state(IB_QPS_INIT);
1415 qp_attr->path_mtu = 1417 qp_attr->path_mtu =
diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c
index 9bbb55347cc1..a77fb4fb14e4 100644
--- a/drivers/infiniband/hw/qib/qib_mr.c
+++ b/drivers/infiniband/hw/qib/qib_mr.c
@@ -258,7 +258,7 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
258 mr->mr.user_base = start; 258 mr->mr.user_base = start;
259 mr->mr.iova = virt_addr; 259 mr->mr.iova = virt_addr;
260 mr->mr.length = length; 260 mr->mr.length = length;
261 mr->mr.offset = umem->offset; 261 mr->mr.offset = ib_umem_offset(umem);
262 mr->mr.access_flags = mr_access_flags; 262 mr->mr.access_flags = mr_access_flags;
263 mr->umem = umem; 263 mr->umem = umem;
264 264
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index d7562beb5423..8ba80a6d3a46 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -98,9 +98,15 @@ enum {
98 98
99 IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ 99 IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */
100 IPOIB_MCAST_FLAG_SENDONLY = 1, 100 IPOIB_MCAST_FLAG_SENDONLY = 1,
101 IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ 101 /*
102 * For IPOIB_MCAST_FLAG_BUSY
 103	 * When set, a join is in flight and mcast->mc is unreliable
 104	 * When clear and mcast->mc is IS_ERR_OR_NULL, the join needs to be
 105	 * restarted or hasn't started yet
 106	 * When clear and mcast->mc is a valid pointer, the join was successful
107 */
108 IPOIB_MCAST_FLAG_BUSY = 2,
102 IPOIB_MCAST_FLAG_ATTACHED = 3, 109 IPOIB_MCAST_FLAG_ATTACHED = 3,
103 IPOIB_MCAST_JOIN_STARTED = 4,
104 110
105 MAX_SEND_CQE = 16, 111 MAX_SEND_CQE = 16,
106 IPOIB_CM_COPYBREAK = 256, 112 IPOIB_CM_COPYBREAK = 256,
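The reworked IPOIB_MCAST_FLAG_BUSY comment above describes three states a multicast group can be in. A small sketch that classifies them the same way; is_err_or_null() is a user-space stand-in for the kernel's IS_ERR_OR_NULL(), and the flag value is illustrative:

#include <stdio.h>

#define MCAST_FLAG_BUSY 0x1   /* stand-in for the IPOIB_MCAST_FLAG_BUSY bit */

struct mcast_state {
	unsigned long flags;
	void *mc;                 /* NULL, an error cookie, or a valid join */
};

/* User-space stand-in for the kernel's IS_ERR_OR_NULL(). */
static int is_err_or_null(const void *p)
{
	return p == NULL || (unsigned long)p >= (unsigned long)-4095L;
}

/* The three states the comment describes. */
static const char *mcast_join_state(const struct mcast_state *m)
{
	if (m->flags & MCAST_FLAG_BUSY)
		return "join in flight, mcast->mc unreliable";
	if (is_err_or_null(m->mc))
		return "not started yet or needs a restart";
	return "join completed, mcast->mc valid";
}

int main(void)
{
	int token;
	struct mcast_state busy = { MCAST_FLAG_BUSY, NULL };
	struct mcast_state idle = { 0, NULL };
	struct mcast_state done = { 0, &token };

	printf("%s\n%s\n%s\n", mcast_join_state(&busy),
	       mcast_join_state(&idle), mcast_join_state(&done));
	return 0;
}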
@@ -317,6 +323,7 @@ struct ipoib_dev_priv {
317 struct list_head multicast_list; 323 struct list_head multicast_list;
318 struct rb_root multicast_tree; 324 struct rb_root multicast_tree;
319 325
326 struct workqueue_struct *wq;
320 struct delayed_work mcast_task; 327 struct delayed_work mcast_task;
321 struct work_struct carrier_on_task; 328 struct work_struct carrier_on_task;
322 struct work_struct flush_light; 329 struct work_struct flush_light;
@@ -477,10 +484,10 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work);
477void ipoib_pkey_event(struct work_struct *work); 484void ipoib_pkey_event(struct work_struct *work);
478void ipoib_ib_dev_cleanup(struct net_device *dev); 485void ipoib_ib_dev_cleanup(struct net_device *dev);
479 486
480int ipoib_ib_dev_open(struct net_device *dev, int flush); 487int ipoib_ib_dev_open(struct net_device *dev);
481int ipoib_ib_dev_up(struct net_device *dev); 488int ipoib_ib_dev_up(struct net_device *dev);
482int ipoib_ib_dev_down(struct net_device *dev, int flush); 489int ipoib_ib_dev_down(struct net_device *dev);
483int ipoib_ib_dev_stop(struct net_device *dev, int flush); 490int ipoib_ib_dev_stop(struct net_device *dev);
484void ipoib_pkey_dev_check_presence(struct net_device *dev); 491void ipoib_pkey_dev_check_presence(struct net_device *dev);
485 492
486int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); 493int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
@@ -492,7 +499,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);
492 499
493void ipoib_mcast_restart_task(struct work_struct *work); 500void ipoib_mcast_restart_task(struct work_struct *work);
494int ipoib_mcast_start_thread(struct net_device *dev); 501int ipoib_mcast_start_thread(struct net_device *dev);
495int ipoib_mcast_stop_thread(struct net_device *dev, int flush); 502int ipoib_mcast_stop_thread(struct net_device *dev);
496 503
497void ipoib_mcast_dev_down(struct net_device *dev); 504void ipoib_mcast_dev_down(struct net_device *dev);
498void ipoib_mcast_dev_flush(struct net_device *dev); 505void ipoib_mcast_dev_flush(struct net_device *dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 933efcea0d03..56959adb6c7d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -474,7 +474,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
474 } 474 }
475 475
476 spin_lock_irq(&priv->lock); 476 spin_lock_irq(&priv->lock);
477 queue_delayed_work(ipoib_workqueue, 477 queue_delayed_work(priv->wq,
478 &priv->cm.stale_task, IPOIB_CM_RX_DELAY); 478 &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
479 /* Add this entry to passive ids list head, but do not re-add it 479 /* Add this entry to passive ids list head, but do not re-add it
480 * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ 480 * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
@@ -576,7 +576,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
576 spin_lock_irqsave(&priv->lock, flags); 576 spin_lock_irqsave(&priv->lock, flags);
577 list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); 577 list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
578 ipoib_cm_start_rx_drain(priv); 578 ipoib_cm_start_rx_drain(priv);
579 queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); 579 queue_work(priv->wq, &priv->cm.rx_reap_task);
580 spin_unlock_irqrestore(&priv->lock, flags); 580 spin_unlock_irqrestore(&priv->lock, flags);
581 } else 581 } else
582 ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", 582 ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
@@ -603,7 +603,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
603 spin_lock_irqsave(&priv->lock, flags); 603 spin_lock_irqsave(&priv->lock, flags);
604 list_move(&p->list, &priv->cm.rx_reap_list); 604 list_move(&p->list, &priv->cm.rx_reap_list);
605 spin_unlock_irqrestore(&priv->lock, flags); 605 spin_unlock_irqrestore(&priv->lock, flags);
606 queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); 606 queue_work(priv->wq, &priv->cm.rx_reap_task);
607 } 607 }
608 return; 608 return;
609 } 609 }
@@ -827,7 +827,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
827 827
828 if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { 828 if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
829 list_move(&tx->list, &priv->cm.reap_list); 829 list_move(&tx->list, &priv->cm.reap_list);
830 queue_work(ipoib_workqueue, &priv->cm.reap_task); 830 queue_work(priv->wq, &priv->cm.reap_task);
831 } 831 }
832 832
833 clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); 833 clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
@@ -1255,7 +1255,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
1255 1255
1256 if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { 1256 if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
1257 list_move(&tx->list, &priv->cm.reap_list); 1257 list_move(&tx->list, &priv->cm.reap_list);
1258 queue_work(ipoib_workqueue, &priv->cm.reap_task); 1258 queue_work(priv->wq, &priv->cm.reap_task);
1259 } 1259 }
1260 1260
1261 spin_unlock_irqrestore(&priv->lock, flags); 1261 spin_unlock_irqrestore(&priv->lock, flags);
@@ -1284,7 +1284,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path
1284 tx->dev = dev; 1284 tx->dev = dev;
1285 list_add(&tx->list, &priv->cm.start_list); 1285 list_add(&tx->list, &priv->cm.start_list);
1286 set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); 1286 set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
1287 queue_work(ipoib_workqueue, &priv->cm.start_task); 1287 queue_work(priv->wq, &priv->cm.start_task);
1288 return tx; 1288 return tx;
1289} 1289}
1290 1290
@@ -1295,7 +1295,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
1295 if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { 1295 if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
1296 spin_lock_irqsave(&priv->lock, flags); 1296 spin_lock_irqsave(&priv->lock, flags);
1297 list_move(&tx->list, &priv->cm.reap_list); 1297 list_move(&tx->list, &priv->cm.reap_list);
1298 queue_work(ipoib_workqueue, &priv->cm.reap_task); 1298 queue_work(priv->wq, &priv->cm.reap_task);
1299 ipoib_dbg(priv, "Reap connection for gid %pI6\n", 1299 ipoib_dbg(priv, "Reap connection for gid %pI6\n",
1300 tx->neigh->daddr + 4); 1300 tx->neigh->daddr + 4);
1301 tx->neigh = NULL; 1301 tx->neigh = NULL;
@@ -1417,7 +1417,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
1417 1417
1418 skb_queue_tail(&priv->cm.skb_queue, skb); 1418 skb_queue_tail(&priv->cm.skb_queue, skb);
1419 if (e) 1419 if (e)
1420 queue_work(ipoib_workqueue, &priv->cm.skb_task); 1420 queue_work(priv->wq, &priv->cm.skb_task);
1421} 1421}
1422 1422
1423static void ipoib_cm_rx_reap(struct work_struct *work) 1423static void ipoib_cm_rx_reap(struct work_struct *work)
@@ -1450,7 +1450,7 @@ static void ipoib_cm_stale_task(struct work_struct *work)
1450 } 1450 }
1451 1451
1452 if (!list_empty(&priv->cm.passive_ids)) 1452 if (!list_empty(&priv->cm.passive_ids))
1453 queue_delayed_work(ipoib_workqueue, 1453 queue_delayed_work(priv->wq,
1454 &priv->cm.stale_task, IPOIB_CM_RX_DELAY); 1454 &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
1455 spin_unlock_irq(&priv->lock); 1455 spin_unlock_irq(&priv->lock);
1456} 1456}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 72626c348174..fe65abb5150c 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -655,7 +655,7 @@ void ipoib_reap_ah(struct work_struct *work)
655 __ipoib_reap_ah(dev); 655 __ipoib_reap_ah(dev);
656 656
657 if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) 657 if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
658 queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, 658 queue_delayed_work(priv->wq, &priv->ah_reap_task,
659 round_jiffies_relative(HZ)); 659 round_jiffies_relative(HZ));
660} 660}
661 661
@@ -664,7 +664,7 @@ static void ipoib_ib_tx_timer_func(unsigned long ctx)
664 drain_tx_cq((struct net_device *)ctx); 664 drain_tx_cq((struct net_device *)ctx);
665} 665}
666 666
667int ipoib_ib_dev_open(struct net_device *dev, int flush) 667int ipoib_ib_dev_open(struct net_device *dev)
668{ 668{
669 struct ipoib_dev_priv *priv = netdev_priv(dev); 669 struct ipoib_dev_priv *priv = netdev_priv(dev);
670 int ret; 670 int ret;
@@ -696,7 +696,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush)
696 } 696 }
697 697
698 clear_bit(IPOIB_STOP_REAPER, &priv->flags); 698 clear_bit(IPOIB_STOP_REAPER, &priv->flags);
699 queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, 699 queue_delayed_work(priv->wq, &priv->ah_reap_task,
700 round_jiffies_relative(HZ)); 700 round_jiffies_relative(HZ));
701 701
702 if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) 702 if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
@@ -706,7 +706,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush)
706dev_stop: 706dev_stop:
707 if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) 707 if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
708 napi_enable(&priv->napi); 708 napi_enable(&priv->napi);
709 ipoib_ib_dev_stop(dev, flush); 709 ipoib_ib_dev_stop(dev);
710 return -1; 710 return -1;
711} 711}
712 712
@@ -738,7 +738,7 @@ int ipoib_ib_dev_up(struct net_device *dev)
738 return ipoib_mcast_start_thread(dev); 738 return ipoib_mcast_start_thread(dev);
739} 739}
740 740
741int ipoib_ib_dev_down(struct net_device *dev, int flush) 741int ipoib_ib_dev_down(struct net_device *dev)
742{ 742{
743 struct ipoib_dev_priv *priv = netdev_priv(dev); 743 struct ipoib_dev_priv *priv = netdev_priv(dev);
744 744
@@ -747,7 +747,7 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush)
747 clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); 747 clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
748 netif_carrier_off(dev); 748 netif_carrier_off(dev);
749 749
750 ipoib_mcast_stop_thread(dev, flush); 750 ipoib_mcast_stop_thread(dev);
751 ipoib_mcast_dev_flush(dev); 751 ipoib_mcast_dev_flush(dev);
752 752
753 ipoib_flush_paths(dev); 753 ipoib_flush_paths(dev);
@@ -807,7 +807,7 @@ void ipoib_drain_cq(struct net_device *dev)
807 local_bh_enable(); 807 local_bh_enable();
808} 808}
809 809
810int ipoib_ib_dev_stop(struct net_device *dev, int flush) 810int ipoib_ib_dev_stop(struct net_device *dev)
811{ 811{
812 struct ipoib_dev_priv *priv = netdev_priv(dev); 812 struct ipoib_dev_priv *priv = netdev_priv(dev);
813 struct ib_qp_attr qp_attr; 813 struct ib_qp_attr qp_attr;
@@ -880,8 +880,7 @@ timeout:
880 /* Wait for all AHs to be reaped */ 880 /* Wait for all AHs to be reaped */
881 set_bit(IPOIB_STOP_REAPER, &priv->flags); 881 set_bit(IPOIB_STOP_REAPER, &priv->flags);
882 cancel_delayed_work(&priv->ah_reap_task); 882 cancel_delayed_work(&priv->ah_reap_task);
883 if (flush) 883 flush_workqueue(priv->wq);
884 flush_workqueue(ipoib_workqueue);
885 884
886 begin = jiffies; 885 begin = jiffies;
887 886
@@ -918,7 +917,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
918 (unsigned long) dev); 917 (unsigned long) dev);
919 918
920 if (dev->flags & IFF_UP) { 919 if (dev->flags & IFF_UP) {
921 if (ipoib_ib_dev_open(dev, 1)) { 920 if (ipoib_ib_dev_open(dev)) {
922 ipoib_transport_dev_cleanup(dev); 921 ipoib_transport_dev_cleanup(dev);
923 return -ENODEV; 922 return -ENODEV;
924 } 923 }
@@ -1040,12 +1039,12 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
1040 } 1039 }
1041 1040
1042 if (level >= IPOIB_FLUSH_NORMAL) 1041 if (level >= IPOIB_FLUSH_NORMAL)
1043 ipoib_ib_dev_down(dev, 0); 1042 ipoib_ib_dev_down(dev);
1044 1043
1045 if (level == IPOIB_FLUSH_HEAVY) { 1044 if (level == IPOIB_FLUSH_HEAVY) {
1046 if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) 1045 if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
1047 ipoib_ib_dev_stop(dev, 0); 1046 ipoib_ib_dev_stop(dev);
1048 if (ipoib_ib_dev_open(dev, 0) != 0) 1047 if (ipoib_ib_dev_open(dev) != 0)
1049 return; 1048 return;
1050 if (netif_queue_stopped(dev)) 1049 if (netif_queue_stopped(dev))
1051 netif_start_queue(dev); 1050 netif_start_queue(dev);
@@ -1097,7 +1096,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
1097 */ 1096 */
1098 ipoib_flush_paths(dev); 1097 ipoib_flush_paths(dev);
1099 1098
1100 ipoib_mcast_stop_thread(dev, 1); 1099 ipoib_mcast_stop_thread(dev);
1101 ipoib_mcast_dev_flush(dev); 1100 ipoib_mcast_dev_flush(dev);
1102 1101
1103 ipoib_transport_dev_cleanup(dev); 1102 ipoib_transport_dev_cleanup(dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 58b5aa3b6f2d..6bad17d4d588 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -108,7 +108,7 @@ int ipoib_open(struct net_device *dev)
108 108
109 set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); 109 set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
110 110
111 if (ipoib_ib_dev_open(dev, 1)) { 111 if (ipoib_ib_dev_open(dev)) {
112 if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) 112 if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
113 return 0; 113 return 0;
114 goto err_disable; 114 goto err_disable;
@@ -139,7 +139,7 @@ int ipoib_open(struct net_device *dev)
139 return 0; 139 return 0;
140 140
141err_stop: 141err_stop:
142 ipoib_ib_dev_stop(dev, 1); 142 ipoib_ib_dev_stop(dev);
143 143
144err_disable: 144err_disable:
145 clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); 145 clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
@@ -157,8 +157,8 @@ static int ipoib_stop(struct net_device *dev)
157 157
158 netif_stop_queue(dev); 158 netif_stop_queue(dev);
159 159
160 ipoib_ib_dev_down(dev, 1); 160 ipoib_ib_dev_down(dev);
161 ipoib_ib_dev_stop(dev, 0); 161 ipoib_ib_dev_stop(dev);
162 162
163 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { 163 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
164 struct ipoib_dev_priv *cpriv; 164 struct ipoib_dev_priv *cpriv;
@@ -839,7 +839,7 @@ static void ipoib_set_mcast_list(struct net_device *dev)
839 return; 839 return;
840 } 840 }
841 841
842 queue_work(ipoib_workqueue, &priv->restart_task); 842 queue_work(priv->wq, &priv->restart_task);
843} 843}
844 844
845static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) 845static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
@@ -954,7 +954,7 @@ static void ipoib_reap_neigh(struct work_struct *work)
954 __ipoib_reap_neigh(priv); 954 __ipoib_reap_neigh(priv);
955 955
956 if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) 956 if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
957 queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, 957 queue_delayed_work(priv->wq, &priv->neigh_reap_task,
958 arp_tbl.gc_interval); 958 arp_tbl.gc_interval);
959} 959}
960 960
@@ -1133,7 +1133,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
1133 1133
1134 /* start garbage collection */ 1134 /* start garbage collection */
1135 clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); 1135 clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1136 queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, 1136 queue_delayed_work(priv->wq, &priv->neigh_reap_task,
1137 arp_tbl.gc_interval); 1137 arp_tbl.gc_interval);
1138 1138
1139 return 0; 1139 return 0;
@@ -1262,15 +1262,13 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
1262{ 1262{
1263 struct ipoib_dev_priv *priv = netdev_priv(dev); 1263 struct ipoib_dev_priv *priv = netdev_priv(dev);
1264 1264
1265 if (ipoib_neigh_hash_init(priv) < 0)
1266 goto out;
1267 /* Allocate RX/TX "rings" to hold queued skbs */ 1265 /* Allocate RX/TX "rings" to hold queued skbs */
1268 priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, 1266 priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
1269 GFP_KERNEL); 1267 GFP_KERNEL);
1270 if (!priv->rx_ring) { 1268 if (!priv->rx_ring) {
1271 printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", 1269 printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
1272 ca->name, ipoib_recvq_size); 1270 ca->name, ipoib_recvq_size);
1273 goto out_neigh_hash_cleanup; 1271 goto out;
1274 } 1272 }
1275 1273
1276 priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); 1274 priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
@@ -1285,16 +1283,24 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
1285 if (ipoib_ib_dev_init(dev, ca, port)) 1283 if (ipoib_ib_dev_init(dev, ca, port))
1286 goto out_tx_ring_cleanup; 1284 goto out_tx_ring_cleanup;
1287 1285
1286 /*
1287 * Must be after ipoib_ib_dev_init so we can allocate a per
1288 * device wq there and use it here
1289 */
1290 if (ipoib_neigh_hash_init(priv) < 0)
1291 goto out_dev_uninit;
1292
1288 return 0; 1293 return 0;
1289 1294
1295out_dev_uninit:
1296 ipoib_ib_dev_cleanup(dev);
1297
1290out_tx_ring_cleanup: 1298out_tx_ring_cleanup:
1291 vfree(priv->tx_ring); 1299 vfree(priv->tx_ring);
1292 1300
1293out_rx_ring_cleanup: 1301out_rx_ring_cleanup:
1294 kfree(priv->rx_ring); 1302 kfree(priv->rx_ring);
1295 1303
1296out_neigh_hash_cleanup:
1297 ipoib_neigh_hash_uninit(dev);
1298out: 1304out:
1299 return -ENOMEM; 1305 return -ENOMEM;
1300} 1306}
@@ -1317,6 +1323,12 @@ void ipoib_dev_cleanup(struct net_device *dev)
1317 } 1323 }
1318 unregister_netdevice_many(&head); 1324 unregister_netdevice_many(&head);
1319 1325
1326 /*
1327 * Must be before ipoib_ib_dev_cleanup or we delete an in use
1328 * work queue
1329 */
1330 ipoib_neigh_hash_uninit(dev);
1331
1320 ipoib_ib_dev_cleanup(dev); 1332 ipoib_ib_dev_cleanup(dev);
1321 1333
1322 kfree(priv->rx_ring); 1334 kfree(priv->rx_ring);
@@ -1324,8 +1336,6 @@ void ipoib_dev_cleanup(struct net_device *dev)
1324 1336
1325 priv->rx_ring = NULL; 1337 priv->rx_ring = NULL;
1326 priv->tx_ring = NULL; 1338 priv->tx_ring = NULL;
1327
1328 ipoib_neigh_hash_uninit(dev);
1329} 1339}
1330 1340
1331static const struct header_ops ipoib_header_ops = { 1341static const struct header_ops ipoib_header_ops = {
@@ -1636,7 +1646,7 @@ register_failed:
1636 /* Stop GC if started before flush */ 1646 /* Stop GC if started before flush */
1637 set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); 1647 set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1638 cancel_delayed_work(&priv->neigh_reap_task); 1648 cancel_delayed_work(&priv->neigh_reap_task);
1639 flush_workqueue(ipoib_workqueue); 1649 flush_workqueue(priv->wq);
1640 1650
1641event_failed: 1651event_failed:
1642 ipoib_dev_cleanup(priv->dev); 1652 ipoib_dev_cleanup(priv->dev);
@@ -1707,7 +1717,7 @@ static void ipoib_remove_one(struct ib_device *device)
1707 /* Stop GC */ 1717 /* Stop GC */
1708 set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); 1718 set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1709 cancel_delayed_work(&priv->neigh_reap_task); 1719 cancel_delayed_work(&priv->neigh_reap_task);
1710 flush_workqueue(ipoib_workqueue); 1720 flush_workqueue(priv->wq);
1711 1721
1712 unregister_netdev(priv->dev); 1722 unregister_netdev(priv->dev);
1713 free_netdev(priv->dev); 1723 free_netdev(priv->dev);
@@ -1748,8 +1758,13 @@ static int __init ipoib_init_module(void)
1748 * unregister_netdev() and linkwatch_event take the rtnl lock, 1758 * unregister_netdev() and linkwatch_event take the rtnl lock,
1749 * so flush_scheduled_work() can deadlock during device 1759 * so flush_scheduled_work() can deadlock during device
1750 * removal. 1760 * removal.
1761 *
1762 * In addition, bringing one device up and another down at the
1763 * same time can deadlock a single workqueue, so we have this
1764 * global fallback workqueue, but we also attempt to open a
1765 * per device workqueue each time we bring an interface up
1751 */ 1766 */
1752 ipoib_workqueue = create_singlethread_workqueue("ipoib"); 1767 ipoib_workqueue = create_singlethread_workqueue("ipoib_flush");
1753 if (!ipoib_workqueue) { 1768 if (!ipoib_workqueue) {
1754 ret = -ENOMEM; 1769 ret = -ENOMEM;
1755 goto err_fs; 1770 goto err_fs;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index ffb83b5f7e80..bc50dd0d0e4d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -190,12 +190,6 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
190 spin_unlock_irq(&priv->lock); 190 spin_unlock_irq(&priv->lock);
191 priv->tx_wr.wr.ud.remote_qkey = priv->qkey; 191 priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
192 set_qkey = 1; 192 set_qkey = 1;
193
194 if (!ipoib_cm_admin_enabled(dev)) {
195 rtnl_lock();
196 dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu));
197 rtnl_unlock();
198 }
199 } 193 }
200 194
201 if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { 195 if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
@@ -277,16 +271,27 @@ ipoib_mcast_sendonly_join_complete(int status,
277 struct ipoib_mcast *mcast = multicast->context; 271 struct ipoib_mcast *mcast = multicast->context;
278 struct net_device *dev = mcast->dev; 272 struct net_device *dev = mcast->dev;
279 273
274 /*
275 * We have to take the mutex to force mcast_sendonly_join to
276 * return from ib_sa_multicast_join and set mcast->mc to a
 277	 * valid value. Otherwise we would be racing with ourselves:
 278	 * we might fail here, but get a valid return from
279 * ib_sa_multicast_join after we had cleared mcast->mc here,
280 * resulting in mis-matched joins and leaves and a deadlock
281 */
282 mutex_lock(&mcast_mutex);
283
280 /* We trap for port events ourselves. */ 284 /* We trap for port events ourselves. */
281 if (status == -ENETRESET) 285 if (status == -ENETRESET)
282 return 0; 286 goto out;
283 287
284 if (!status) 288 if (!status)
285 status = ipoib_mcast_join_finish(mcast, &multicast->rec); 289 status = ipoib_mcast_join_finish(mcast, &multicast->rec);
286 290
287 if (status) { 291 if (status) {
288 if (mcast->logcount++ < 20) 292 if (mcast->logcount++ < 20)
289 ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n", 293 ipoib_dbg_mcast(netdev_priv(dev), "sendonly multicast "
294 "join failed for %pI6, status %d\n",
290 mcast->mcmember.mgid.raw, status); 295 mcast->mcmember.mgid.raw, status);
291 296
292 /* Flush out any queued packets */ 297 /* Flush out any queued packets */
@@ -296,11 +301,15 @@ ipoib_mcast_sendonly_join_complete(int status,
296 dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); 301 dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
297 } 302 }
298 netif_tx_unlock_bh(dev); 303 netif_tx_unlock_bh(dev);
299
300 /* Clear the busy flag so we try again */
301 status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
302 &mcast->flags);
303 } 304 }
305out:
306 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
307 if (status)
308 mcast->mc = NULL;
309 complete(&mcast->done);
310 if (status == -ENETRESET)
311 status = 0;
312 mutex_unlock(&mcast_mutex);
304 return status; 313 return status;
305} 314}
306 315
@@ -318,12 +327,14 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
318 int ret = 0; 327 int ret = 0;
319 328
320 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { 329 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
321 ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); 330 ipoib_dbg_mcast(priv, "device shutting down, no sendonly "
331 "multicast joins\n");
322 return -ENODEV; 332 return -ENODEV;
323 } 333 }
324 334
325 if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { 335 if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
326 ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n"); 336 ipoib_dbg_mcast(priv, "multicast entry busy, skipping "
337 "sendonly join\n");
327 return -EBUSY; 338 return -EBUSY;
328 } 339 }
329 340
@@ -331,6 +342,9 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
331 rec.port_gid = priv->local_gid; 342 rec.port_gid = priv->local_gid;
332 rec.pkey = cpu_to_be16(priv->pkey); 343 rec.pkey = cpu_to_be16(priv->pkey);
333 344
345 mutex_lock(&mcast_mutex);
346 init_completion(&mcast->done);
347 set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
334 mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, 348 mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
335 priv->port, &rec, 349 priv->port, &rec,
336 IB_SA_MCMEMBER_REC_MGID | 350 IB_SA_MCMEMBER_REC_MGID |
@@ -343,12 +357,14 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
343 if (IS_ERR(mcast->mc)) { 357 if (IS_ERR(mcast->mc)) {
344 ret = PTR_ERR(mcast->mc); 358 ret = PTR_ERR(mcast->mc);
345 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); 359 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
346 ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", 360 complete(&mcast->done);
347 ret); 361 ipoib_warn(priv, "ib_sa_join_multicast for sendonly join "
362 "failed (ret = %d)\n", ret);
348 } else { 363 } else {
349 ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n", 364 ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting "
350 mcast->mcmember.mgid.raw); 365 "sendonly join\n", mcast->mcmember.mgid.raw);
351 } 366 }
367 mutex_unlock(&mcast_mutex);
352 368
353 return ret; 369 return ret;
354} 370}
@@ -359,18 +375,29 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work)
359 carrier_on_task); 375 carrier_on_task);
360 struct ib_port_attr attr; 376 struct ib_port_attr attr;
361 377
362 /*
363 * Take rtnl_lock to avoid racing with ipoib_stop() and
364 * turning the carrier back on while a device is being
365 * removed.
366 */
367 if (ib_query_port(priv->ca, priv->port, &attr) || 378 if (ib_query_port(priv->ca, priv->port, &attr) ||
368 attr.state != IB_PORT_ACTIVE) { 379 attr.state != IB_PORT_ACTIVE) {
369 ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); 380 ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
370 return; 381 return;
371 } 382 }
372 383
373 rtnl_lock(); 384 /*
385 * Take rtnl_lock to avoid racing with ipoib_stop() and
386 * turning the carrier back on while a device is being
387 * removed. However, ipoib_stop() will attempt to flush
388 * the workqueue while holding the rtnl lock, so loop
389 * on trylock until either we get the lock or we see
390 * FLAG_ADMIN_UP go away as that signals that we are bailing
391 * and can safely ignore the carrier on work.
392 */
393 while (!rtnl_trylock()) {
394 if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
395 return;
396 else
397 msleep(20);
398 }
399 if (!ipoib_cm_admin_enabled(priv->dev))
400 dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu));
374 netif_carrier_on(priv->dev); 401 netif_carrier_on(priv->dev);
375 rtnl_unlock(); 402 rtnl_unlock();
376} 403}
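The carrier-on task above cannot simply block on the rtnl lock: ipoib_stop() may hold it while flushing the very workqueue this task runs on, so the task loops on trylock and bails out once the interface is going down. A user-space analogue of that pattern, with a pthread mutex standing in for rtnl and an atomic flag for IPOIB_FLAG_ADMIN_UP:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/* Stand-ins for the rtnl lock and the ADMIN_UP flag. */
static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int admin_up = 1;

/*
 * Never block on the lock, because the lock holder may be waiting for
 * this very work item to finish. Poll with trylock instead, and give up
 * once the interface is being brought down.
 */
static void carrier_on_work(void)
{
	while (pthread_mutex_trylock(&cfg_lock) != 0) {
		if (!atomic_load(&admin_up))
			return;          /* device is stopping, nothing to do */
		usleep(20 * 1000);       /* roughly msleep(20) */
	}
	printf("carrier on\n");
	pthread_mutex_unlock(&cfg_lock);
}

int main(void)
{
	carrier_on_work();
	return 0;
}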
@@ -385,60 +412,63 @@ static int ipoib_mcast_join_complete(int status,
385 ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", 412 ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n",
386 mcast->mcmember.mgid.raw, status); 413 mcast->mcmember.mgid.raw, status);
387 414
415 /*
416 * We have to take the mutex to force mcast_join to
417 * return from ib_sa_multicast_join and set mcast->mc to a
 418	 * valid value. Otherwise we would be racing with ourselves:
 419	 * we might fail here, but get a valid return from
420 * ib_sa_multicast_join after we had cleared mcast->mc here,
421 * resulting in mis-matched joins and leaves and a deadlock
422 */
423 mutex_lock(&mcast_mutex);
424
388 /* We trap for port events ourselves. */ 425 /* We trap for port events ourselves. */
389 if (status == -ENETRESET) { 426 if (status == -ENETRESET)
390 status = 0;
391 goto out; 427 goto out;
392 }
393 428
394 if (!status) 429 if (!status)
395 status = ipoib_mcast_join_finish(mcast, &multicast->rec); 430 status = ipoib_mcast_join_finish(mcast, &multicast->rec);
396 431
397 if (!status) { 432 if (!status) {
398 mcast->backoff = 1; 433 mcast->backoff = 1;
399 mutex_lock(&mcast_mutex);
400 if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) 434 if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
401 queue_delayed_work(ipoib_workqueue, 435 queue_delayed_work(priv->wq, &priv->mcast_task, 0);
402 &priv->mcast_task, 0);
403 mutex_unlock(&mcast_mutex);
404 436
405 /* 437 /*
406 * Defer carrier on work to ipoib_workqueue to avoid a 438 * Defer carrier on work to priv->wq to avoid a
407 * deadlock on rtnl_lock here. 439 * deadlock on rtnl_lock here.
408 */ 440 */
409 if (mcast == priv->broadcast) 441 if (mcast == priv->broadcast)
410 queue_work(ipoib_workqueue, &priv->carrier_on_task); 442 queue_work(priv->wq, &priv->carrier_on_task);
411 443 } else {
412 status = 0; 444 if (mcast->logcount++ < 20) {
413 goto out; 445 if (status == -ETIMEDOUT || status == -EAGAIN) {
414 } 446 ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n",
415 447 mcast->mcmember.mgid.raw, status);
416 if (mcast->logcount++ < 20) { 448 } else {
417 if (status == -ETIMEDOUT || status == -EAGAIN) { 449 ipoib_warn(priv, "multicast join failed for %pI6, status %d\n",
418 ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", 450 mcast->mcmember.mgid.raw, status);
419 mcast->mcmember.mgid.raw, status); 451 }
420 } else {
421 ipoib_warn(priv, "multicast join failed for %pI6, status %d\n",
422 mcast->mcmember.mgid.raw, status);
423 } 452 }
424 }
425
426 mcast->backoff *= 2;
427 if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
428 mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
429 453
430 /* Clear the busy flag so we try again */ 454 mcast->backoff *= 2;
431 status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); 455 if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
432 456 mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
433 mutex_lock(&mcast_mutex); 457 }
458out:
434 spin_lock_irq(&priv->lock); 459 spin_lock_irq(&priv->lock);
435 if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) 460 clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
436 queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 461 if (status)
462 mcast->mc = NULL;
463 complete(&mcast->done);
464 if (status == -ENETRESET)
465 status = 0;
466 if (status && test_bit(IPOIB_MCAST_RUN, &priv->flags))
467 queue_delayed_work(priv->wq, &priv->mcast_task,
437 mcast->backoff * HZ); 468 mcast->backoff * HZ);
438 spin_unlock_irq(&priv->lock); 469 spin_unlock_irq(&priv->lock);
439 mutex_unlock(&mcast_mutex); 470 mutex_unlock(&mcast_mutex);
440out: 471
441 complete(&mcast->done);
442 return status; 472 return status;
443} 473}
444 474
@@ -487,10 +517,9 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
487 rec.hop_limit = priv->broadcast->mcmember.hop_limit; 517 rec.hop_limit = priv->broadcast->mcmember.hop_limit;
488 } 518 }
489 519
490 set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); 520 mutex_lock(&mcast_mutex);
491 init_completion(&mcast->done); 521 init_completion(&mcast->done);
492 set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); 522 set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
493
494 mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, 523 mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
495 &rec, comp_mask, GFP_KERNEL, 524 &rec, comp_mask, GFP_KERNEL,
496 ipoib_mcast_join_complete, mcast); 525 ipoib_mcast_join_complete, mcast);
@@ -504,13 +533,11 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
504 if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) 533 if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
505 mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; 534 mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
506 535
507 mutex_lock(&mcast_mutex);
508 if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) 536 if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
509 queue_delayed_work(ipoib_workqueue, 537 queue_delayed_work(priv->wq, &priv->mcast_task,
510 &priv->mcast_task,
511 mcast->backoff * HZ); 538 mcast->backoff * HZ);
512 mutex_unlock(&mcast_mutex);
513 } 539 }
540 mutex_unlock(&mcast_mutex);
514} 541}
515 542
516void ipoib_mcast_join_task(struct work_struct *work) 543void ipoib_mcast_join_task(struct work_struct *work)
@@ -547,8 +574,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
547 ipoib_warn(priv, "failed to allocate broadcast group\n"); 574 ipoib_warn(priv, "failed to allocate broadcast group\n");
548 mutex_lock(&mcast_mutex); 575 mutex_lock(&mcast_mutex);
549 if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) 576 if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
550 queue_delayed_work(ipoib_workqueue, 577 queue_delayed_work(priv->wq, &priv->mcast_task,
551 &priv->mcast_task, HZ); 578 HZ);
552 mutex_unlock(&mcast_mutex); 579 mutex_unlock(&mcast_mutex);
553 return; 580 return;
554 } 581 }
@@ -563,7 +590,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
563 } 590 }
564 591
565 if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { 592 if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
566 if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) 593 if (IS_ERR_OR_NULL(priv->broadcast->mc) &&
594 !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
567 ipoib_mcast_join(dev, priv->broadcast, 0); 595 ipoib_mcast_join(dev, priv->broadcast, 0);
568 return; 596 return;
569 } 597 }
@@ -571,23 +599,33 @@ void ipoib_mcast_join_task(struct work_struct *work)
571 while (1) { 599 while (1) {
572 struct ipoib_mcast *mcast = NULL; 600 struct ipoib_mcast *mcast = NULL;
573 601
602 /*
603 * Need the mutex so our flags are consistent, need the
604 * priv->lock so we don't race with list removals in either
605 * mcast_dev_flush or mcast_restart_task
606 */
607 mutex_lock(&mcast_mutex);
574 spin_lock_irq(&priv->lock); 608 spin_lock_irq(&priv->lock);
575 list_for_each_entry(mcast, &priv->multicast_list, list) { 609 list_for_each_entry(mcast, &priv->multicast_list, list) {
576 if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) 610 if (IS_ERR_OR_NULL(mcast->mc) &&
577 && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) 611 !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) &&
578 && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { 612 !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
579 /* Found the next unjoined group */ 613 /* Found the next unjoined group */
580 break; 614 break;
581 } 615 }
582 } 616 }
583 spin_unlock_irq(&priv->lock); 617 spin_unlock_irq(&priv->lock);
618 mutex_unlock(&mcast_mutex);
584 619
585 if (&mcast->list == &priv->multicast_list) { 620 if (&mcast->list == &priv->multicast_list) {
586 /* All done */ 621 /* All done */
587 break; 622 break;
588 } 623 }
589 624
590 ipoib_mcast_join(dev, mcast, 1); 625 if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
626 ipoib_mcast_sendonly_join(mcast);
627 else
628 ipoib_mcast_join(dev, mcast, 1);
591 return; 629 return;
592 } 630 }
593 631
@@ -604,13 +642,13 @@ int ipoib_mcast_start_thread(struct net_device *dev)
604 642
605 mutex_lock(&mcast_mutex); 643 mutex_lock(&mcast_mutex);
606 if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) 644 if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
607 queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); 645 queue_delayed_work(priv->wq, &priv->mcast_task, 0);
608 mutex_unlock(&mcast_mutex); 646 mutex_unlock(&mcast_mutex);
609 647
610 return 0; 648 return 0;
611} 649}
612 650
613int ipoib_mcast_stop_thread(struct net_device *dev, int flush) 651int ipoib_mcast_stop_thread(struct net_device *dev)
614{ 652{
615 struct ipoib_dev_priv *priv = netdev_priv(dev); 653 struct ipoib_dev_priv *priv = netdev_priv(dev);
616 654
@@ -621,8 +659,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
621 cancel_delayed_work(&priv->mcast_task); 659 cancel_delayed_work(&priv->mcast_task);
622 mutex_unlock(&mcast_mutex); 660 mutex_unlock(&mcast_mutex);
623 661
624 if (flush) 662 flush_workqueue(priv->wq);
625 flush_workqueue(ipoib_workqueue);
626 663
627 return 0; 664 return 0;
628} 665}
@@ -633,6 +670,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
633 int ret = 0; 670 int ret = 0;
634 671
635 if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) 672 if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
673 ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n");
674
675 if (!IS_ERR_OR_NULL(mcast->mc))
636 ib_sa_free_multicast(mcast->mc); 676 ib_sa_free_multicast(mcast->mc);
637 677
638 if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { 678 if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
@@ -685,6 +725,8 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
685 memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); 725 memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
686 __ipoib_mcast_add(dev, mcast); 726 __ipoib_mcast_add(dev, mcast);
687 list_add_tail(&mcast->list, &priv->multicast_list); 727 list_add_tail(&mcast->list, &priv->multicast_list);
728 if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
729 queue_delayed_work(priv->wq, &priv->mcast_task, 0);
688 } 730 }
689 731
690 if (!mcast->ah) { 732 if (!mcast->ah) {
@@ -698,8 +740,6 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
698 if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) 740 if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
699 ipoib_dbg_mcast(priv, "no address vector, " 741 ipoib_dbg_mcast(priv, "no address vector, "
700 "but multicast join already started\n"); 742 "but multicast join already started\n");
701 else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
702 ipoib_mcast_sendonly_join(mcast);
703 743
704 /* 744 /*
705 * If lookup completes between here and out:, don't 745 * If lookup completes between here and out:, don't
@@ -759,9 +799,12 @@ void ipoib_mcast_dev_flush(struct net_device *dev)
759 799
760 spin_unlock_irqrestore(&priv->lock, flags); 800 spin_unlock_irqrestore(&priv->lock, flags);
761 801
762 /* seperate between the wait to the leave*/ 802 /*
803 * make sure the in-flight joins have finished before we attempt
804 * to leave
805 */
763 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) 806 list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
764 if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) 807 if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
765 wait_for_completion(&mcast->done); 808 wait_for_completion(&mcast->done);
766 809
767 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { 810 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
@@ -794,8 +837,6 @@ void ipoib_mcast_restart_task(struct work_struct *work)
794 837
795 ipoib_dbg_mcast(priv, "restarting multicast task\n"); 838 ipoib_dbg_mcast(priv, "restarting multicast task\n");
796 839
797 ipoib_mcast_stop_thread(dev, 0);
798
799 local_irq_save(flags); 840 local_irq_save(flags);
800 netif_addr_lock(dev); 841 netif_addr_lock(dev);
801 spin_lock(&priv->lock); 842 spin_lock(&priv->lock);
@@ -880,14 +921,38 @@ void ipoib_mcast_restart_task(struct work_struct *work)
880 netif_addr_unlock(dev); 921 netif_addr_unlock(dev);
881 local_irq_restore(flags); 922 local_irq_restore(flags);
882 923
883 /* We have to cancel outside of the spinlock */ 924 /*
925 * make sure the in-flight joins have finished before we attempt
926 * to leave
927 */
928 list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
929 if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
930 wait_for_completion(&mcast->done);
931
932 /*
933 * We have to cancel outside of the spinlock, but we have to
934 * take the rtnl lock or else we race with the removal of
935 * entries from the remove list in mcast_dev_flush as part
936 * of ipoib_stop(). We detect the drop of the ADMIN_UP flag
937 * to signal that we have hit this particular race, and we
938 * return since we know we don't need to do anything else
939 * anyway.
940 */
941 while (!rtnl_trylock()) {
942 if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
943 return;
944 else
945 msleep(20);
946 }
884 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { 947 list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
885 ipoib_mcast_leave(mcast->dev, mcast); 948 ipoib_mcast_leave(mcast->dev, mcast);
886 ipoib_mcast_free(mcast); 949 ipoib_mcast_free(mcast);
887 } 950 }
888 951 /*
889 if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) 952 * Restart our join task if needed
890 ipoib_mcast_start_thread(dev); 953 */
954 ipoib_mcast_start_thread(dev);
955 rtnl_unlock();
891} 956}
892 957
893#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 958#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
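The restart task above cannot simply sleep on the rtnl lock, since ipoib_stop() may already hold it while flushing the same multicast entries; instead it polls with rtnl_trylock() and gives up once ADMIN_UP is cleared. Below is a user-space analogue of that try-lock-or-bail loop using pthreads; the lock, the flag and the 20 ms poll interval are stand-ins for the kernel objects, not IPoIB code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* stand-ins for the rtnl lock and the ADMIN_UP flag */
static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
static volatile bool admin_up = true;

/* Returns true with big_lock held, or false if the device went down
 * while we were waiting (mirrors the !ADMIN_UP early return). */
static bool lock_or_bail(void)
{
	while (pthread_mutex_trylock(&big_lock) != 0) {
		if (!admin_up)
			return false;   /* someone is tearing us down; give up */
		usleep(20 * 1000);      /* msleep(20) in the kernel version */
	}
	return true;
}

int main(void)
{
	if (lock_or_bail()) {
		printf("lock taken, safe to leave/free groups\n");
		pthread_mutex_unlock(&big_lock);
	} else {
		printf("device going down, nothing to do\n");
	}
	return 0;
}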
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index c56d5d44c53b..b72a753eb41d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -145,10 +145,20 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
145 int ret, size; 145 int ret, size;
146 int i; 146 int i;
147 147
148 /*
149 * the various IPoIB tasks assume they will never race against
150 * themselves, so always use a single thread workqueue
151 */
152 priv->wq = create_singlethread_workqueue("ipoib_wq");
153 if (!priv->wq) {
154 printk(KERN_WARNING "ipoib: failed to allocate device WQ\n");
155 return -ENODEV;
156 }
157
148 priv->pd = ib_alloc_pd(priv->ca); 158 priv->pd = ib_alloc_pd(priv->ca);
149 if (IS_ERR(priv->pd)) { 159 if (IS_ERR(priv->pd)) {
150 printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); 160 printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
151 return -ENODEV; 161 goto out_free_wq;
152 } 162 }
153 163
154 priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); 164 priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
@@ -242,6 +252,10 @@ out_free_mr:
242 252
243out_free_pd: 253out_free_pd:
244 ib_dealloc_pd(priv->pd); 254 ib_dealloc_pd(priv->pd);
255
256out_free_wq:
257 destroy_workqueue(priv->wq);
258 priv->wq = NULL;
245 return -ENODEV; 259 return -ENODEV;
246} 260}
247 261
@@ -270,6 +284,12 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
270 284
271 if (ib_dealloc_pd(priv->pd)) 285 if (ib_dealloc_pd(priv->pd))
272 ipoib_warn(priv, "ib_dealloc_pd failed\n"); 286 ipoib_warn(priv, "ib_dealloc_pd failed\n");
287
288 if (priv->wq) {
289 flush_workqueue(priv->wq);
290 destroy_workqueue(priv->wq);
291 priv->wq = NULL;
292 }
273} 293}
274 294
275void ipoib_event(struct ib_event_handler *handler, 295void ipoib_event(struct ib_event_handler *handler,
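ipoib_transport_dev_init() now creates a per-device single-threaded workqueue first and tears it down if any later allocation fails, extending the existing goto-based unwind. A compact user-space sketch of that allocate-then-unwind pattern; the resource names are placeholders rather than IPoIB symbols:

#include <stdio.h>
#include <stdlib.h>

struct dev_priv {
	void *wq;   /* stands in for priv->wq (the per-device workqueue) */
	void *pd;   /* stands in for priv->pd (the protection domain)    */
};

static int transport_init(struct dev_priv *priv)
{
	priv->wq = malloc(64);          /* create_singlethread_workqueue() */
	if (!priv->wq)
		return -1;

	priv->pd = malloc(64);          /* ib_alloc_pd() */
	if (!priv->pd)
		goto out_free_wq;       /* any later failure must unwind the wq too */

	return 0;

out_free_wq:
	free(priv->wq);                 /* destroy_workqueue() */
	priv->wq = NULL;
	return -1;
}

int main(void)
{
	struct dev_priv priv = { 0 };

	if (transport_init(&priv)) {
		fprintf(stderr, "init failed\n");
		return 1;
	}
	printf("init ok\n");
	free(priv.pd);
	free(priv.wq);
	return 0;
}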
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 20ca6a619476..6a594aac2290 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -97,7 +97,7 @@ module_param_named(pi_enable, iser_pi_enable, bool, 0644);
97MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); 97MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)");
98 98
99module_param_named(pi_guard, iser_pi_guard, int, 0644); 99module_param_named(pi_guard, iser_pi_guard, int, 0644);
100MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:IP_CSUM)"); 100MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]");
101 101
102static struct workqueue_struct *release_wq; 102static struct workqueue_struct *release_wq;
103struct iser_global ig; 103struct iser_global ig;
@@ -164,18 +164,42 @@ iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
164 return 0; 164 return 0;
165} 165}
166 166
167int iser_initialize_task_headers(struct iscsi_task *task, 167/**
168 struct iser_tx_desc *tx_desc) 168 * iser_initialize_task_headers() - Initialize task headers
169 * @task: iscsi task
170 * @tx_desc: iser tx descriptor
171 *
172 * Notes:
173 * This routine may race with iser teardown flow for scsi
174 * error handling TMFs. So for TMF we should acquire the
175 * state mutex to avoid dereferencing the IB device which
176 * may have already been terminated.
177 */
178int
179iser_initialize_task_headers(struct iscsi_task *task,
180 struct iser_tx_desc *tx_desc)
169{ 181{
170 struct iser_conn *iser_conn = task->conn->dd_data; 182 struct iser_conn *iser_conn = task->conn->dd_data;
171 struct iser_device *device = iser_conn->ib_conn.device; 183 struct iser_device *device = iser_conn->ib_conn.device;
172 struct iscsi_iser_task *iser_task = task->dd_data; 184 struct iscsi_iser_task *iser_task = task->dd_data;
173 u64 dma_addr; 185 u64 dma_addr;
186 const bool mgmt_task = !task->sc && !in_interrupt();
187 int ret = 0;
188
189 if (unlikely(mgmt_task))
190 mutex_lock(&iser_conn->state_mutex);
191
192 if (unlikely(iser_conn->state != ISER_CONN_UP)) {
193 ret = -ENODEV;
194 goto out;
195 }
174 196
175 dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc, 197 dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
176 ISER_HEADERS_LEN, DMA_TO_DEVICE); 198 ISER_HEADERS_LEN, DMA_TO_DEVICE);
177 if (ib_dma_mapping_error(device->ib_device, dma_addr)) 199 if (ib_dma_mapping_error(device->ib_device, dma_addr)) {
178 return -ENOMEM; 200 ret = -ENOMEM;
201 goto out;
202 }
179 203
180 tx_desc->dma_addr = dma_addr; 204 tx_desc->dma_addr = dma_addr;
181 tx_desc->tx_sg[0].addr = tx_desc->dma_addr; 205 tx_desc->tx_sg[0].addr = tx_desc->dma_addr;
@@ -183,7 +207,11 @@ int iser_initialize_task_headers(struct iscsi_task *task,
183 tx_desc->tx_sg[0].lkey = device->mr->lkey; 207 tx_desc->tx_sg[0].lkey = device->mr->lkey;
184 208
185 iser_task->iser_conn = iser_conn; 209 iser_task->iser_conn = iser_conn;
186 return 0; 210out:
211 if (unlikely(mgmt_task))
212 mutex_unlock(&iser_conn->state_mutex);
213
214 return ret;
187} 215}
188 216
189/** 217/**
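The kernel-doc above describes taking the connection state mutex only for management tasks, which can race with teardown, and refusing the work unless the connection is still up. The sketch below shows that conditional-lock-and-check shape in plain user-space C; conn_state and STATE_UP are invented stand-ins for the iser fields:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum conn_state { STATE_UP, STATE_TERMINATING };

static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;
static enum conn_state conn_state = STATE_UP;

static int init_task(bool mgmt_task)
{
	int ret = 0;

	if (mgmt_task)                       /* only TMF/mgmt work can race teardown */
		pthread_mutex_lock(&state_mutex);

	if (conn_state != STATE_UP) {        /* the device may already be gone */
		ret = -ENODEV;
		goto out;
	}

	/* ... map headers / touch the device here ... */

out:
	if (mgmt_task)
		pthread_mutex_unlock(&state_mutex);
	return ret;
}

int main(void)
{
	printf("mgmt task while up:       %d\n", init_task(true));
	conn_state = STATE_TERMINATING;
	printf("mgmt task after teardown: %d\n", init_task(true));
	return 0;
}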
@@ -199,9 +227,14 @@ static int
199iscsi_iser_task_init(struct iscsi_task *task) 227iscsi_iser_task_init(struct iscsi_task *task)
200{ 228{
201 struct iscsi_iser_task *iser_task = task->dd_data; 229 struct iscsi_iser_task *iser_task = task->dd_data;
230 int ret;
202 231
203 if (iser_initialize_task_headers(task, &iser_task->desc)) 232 ret = iser_initialize_task_headers(task, &iser_task->desc);
204 return -ENOMEM; 233 if (ret) {
234 iser_err("Failed to init task %p, err = %d\n",
235 iser_task, ret);
236 return ret;
237 }
205 238
206 /* mgmt task */ 239 /* mgmt task */
207 if (!task->sc) 240 if (!task->sc)
@@ -508,8 +541,8 @@ iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
508 */ 541 */
509 if (iser_conn) { 542 if (iser_conn) {
510 mutex_lock(&iser_conn->state_mutex); 543 mutex_lock(&iser_conn->state_mutex);
511 iscsi_conn_stop(cls_conn, flag);
512 iser_conn_terminate(iser_conn); 544 iser_conn_terminate(iser_conn);
545 iscsi_conn_stop(cls_conn, flag);
513 546
514 /* unbind */ 547 /* unbind */
515 iser_conn->iscsi_conn = NULL; 548 iser_conn->iscsi_conn = NULL;
@@ -541,12 +574,13 @@ iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session)
541static inline unsigned int 574static inline unsigned int
542iser_dif_prot_caps(int prot_caps) 575iser_dif_prot_caps(int prot_caps)
543{ 576{
544 return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? SHOST_DIF_TYPE1_PROTECTION | 577 return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ?
545 SHOST_DIX_TYPE1_PROTECTION : 0) | 578 SHOST_DIF_TYPE1_PROTECTION | SHOST_DIX_TYPE0_PROTECTION |
546 ((prot_caps & IB_PROT_T10DIF_TYPE_2) ? SHOST_DIF_TYPE2_PROTECTION | 579 SHOST_DIX_TYPE1_PROTECTION : 0) |
547 SHOST_DIX_TYPE2_PROTECTION : 0) | 580 ((prot_caps & IB_PROT_T10DIF_TYPE_2) ?
548 ((prot_caps & IB_PROT_T10DIF_TYPE_3) ? SHOST_DIF_TYPE3_PROTECTION | 581 SHOST_DIF_TYPE2_PROTECTION | SHOST_DIX_TYPE2_PROTECTION : 0) |
549 SHOST_DIX_TYPE3_PROTECTION : 0); 582 ((prot_caps & IB_PROT_T10DIF_TYPE_3) ?
583 SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE3_PROTECTION : 0);
550} 584}
551 585
552/** 586/**
@@ -569,6 +603,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
569 struct Scsi_Host *shost; 603 struct Scsi_Host *shost;
570 struct iser_conn *iser_conn = NULL; 604 struct iser_conn *iser_conn = NULL;
571 struct ib_conn *ib_conn; 605 struct ib_conn *ib_conn;
606 u16 max_cmds;
572 607
573 shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); 608 shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0);
574 if (!shost) 609 if (!shost)
@@ -586,26 +621,41 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep,
586 */ 621 */
587 if (ep) { 622 if (ep) {
588 iser_conn = ep->dd_data; 623 iser_conn = ep->dd_data;
624 max_cmds = iser_conn->max_cmds;
625
626 mutex_lock(&iser_conn->state_mutex);
627 if (iser_conn->state != ISER_CONN_UP) {
628 iser_err("iser conn %p already started teardown\n",
629 iser_conn);
630 mutex_unlock(&iser_conn->state_mutex);
631 goto free_host;
632 }
633
589 ib_conn = &iser_conn->ib_conn; 634 ib_conn = &iser_conn->ib_conn;
590 if (ib_conn->pi_support) { 635 if (ib_conn->pi_support) {
591 u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap; 636 u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap;
592 637
593 scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); 638 scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps));
594 if (iser_pi_guard) 639 scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP |
595 scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP); 640 SHOST_DIX_GUARD_CRC);
596 else
597 scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC);
598 } 641 }
599 }
600 642
601 if (iscsi_host_add(shost, ep ? 643 if (iscsi_host_add(shost,
602 ib_conn->device->ib_device->dma_device : NULL)) 644 ib_conn->device->ib_device->dma_device)) {
603 goto free_host; 645 mutex_unlock(&iser_conn->state_mutex);
646 goto free_host;
647 }
648 mutex_unlock(&iser_conn->state_mutex);
649 } else {
650 max_cmds = ISER_DEF_XMIT_CMDS_MAX;
651 if (iscsi_host_add(shost, NULL))
652 goto free_host;
653 }
604 654
605 if (cmds_max > ISER_DEF_XMIT_CMDS_MAX) { 655 if (cmds_max > max_cmds) {
606 iser_info("cmds_max changed from %u to %u\n", 656 iser_info("cmds_max changed from %u to %u\n",
607 cmds_max, ISER_DEF_XMIT_CMDS_MAX); 657 cmds_max, max_cmds);
608 cmds_max = ISER_DEF_XMIT_CMDS_MAX; 658 cmds_max = max_cmds;
609 } 659 }
610 660
611 cls_session = iscsi_session_setup(&iscsi_iser_transport, shost, 661 cls_session = iscsi_session_setup(&iscsi_iser_transport, shost,
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
index cd4174ca9a76..5ce26817e7e1 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -69,34 +69,31 @@
69 69
70#define DRV_NAME "iser" 70#define DRV_NAME "iser"
71#define PFX DRV_NAME ": " 71#define PFX DRV_NAME ": "
72#define DRV_VER "1.4.8" 72#define DRV_VER "1.5"
73 73
74#define iser_dbg(fmt, arg...) \ 74#define iser_dbg(fmt, arg...) \
75 do { \ 75 do { \
76 if (iser_debug_level > 2) \ 76 if (unlikely(iser_debug_level > 2)) \
77 printk(KERN_DEBUG PFX "%s: " fmt,\ 77 printk(KERN_DEBUG PFX "%s: " fmt,\
78 __func__ , ## arg); \ 78 __func__ , ## arg); \
79 } while (0) 79 } while (0)
80 80
81#define iser_warn(fmt, arg...) \ 81#define iser_warn(fmt, arg...) \
82 do { \ 82 do { \
83 if (iser_debug_level > 0) \ 83 if (unlikely(iser_debug_level > 0)) \
84 pr_warn(PFX "%s: " fmt, \ 84 pr_warn(PFX "%s: " fmt, \
85 __func__ , ## arg); \ 85 __func__ , ## arg); \
86 } while (0) 86 } while (0)
87 87
88#define iser_info(fmt, arg...) \ 88#define iser_info(fmt, arg...) \
89 do { \ 89 do { \
90 if (iser_debug_level > 1) \ 90 if (unlikely(iser_debug_level > 1)) \
91 pr_info(PFX "%s: " fmt, \ 91 pr_info(PFX "%s: " fmt, \
92 __func__ , ## arg); \ 92 __func__ , ## arg); \
93 } while (0) 93 } while (0)
94 94
95#define iser_err(fmt, arg...) \ 95#define iser_err(fmt, arg...) \
96 do { \ 96 pr_err(PFX "%s: " fmt, __func__ , ## arg)
97 printk(KERN_ERR PFX "%s: " fmt, \
98 __func__ , ## arg); \
99 } while (0)
100 97
101#define SHIFT_4K 12 98#define SHIFT_4K 12
102#define SIZE_4K (1ULL << SHIFT_4K) 99#define SIZE_4K (1ULL << SHIFT_4K)
@@ -144,6 +141,11 @@
144 ISER_MAX_TX_MISC_PDUS + \ 141 ISER_MAX_TX_MISC_PDUS + \
145 ISER_MAX_RX_MISC_PDUS) 142 ISER_MAX_RX_MISC_PDUS)
146 143
144#define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr \
145 - ISER_MAX_TX_MISC_PDUS \
146 - ISER_MAX_RX_MISC_PDUS) / \
147 (1 + ISER_INFLIGHT_DATAOUTS))
148
147#define ISER_WC_BATCH_COUNT 16 149#define ISER_WC_BATCH_COUNT 16
148#define ISER_SIGNAL_CMD_COUNT 32 150#define ISER_SIGNAL_CMD_COUNT 32
149 151
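ISER_GET_MAX_XMIT_CMDS() turns a send-queue depth into a command budget by reserving the miscellaneous TX/RX PDUs and dividing by the work requests a single command may consume (its own send plus in-flight Data-Outs). The worked example below uses made-up constant values purely to show the arithmetic; the real values live in the iser headers:

#include <stdio.h>

/* Illustrative values only; the driver defines the real ones. */
#define MAX_TX_MISC_PDUS   6
#define MAX_RX_MISC_PDUS   4
#define INFLIGHT_DATAOUTS  8

/* Same shape as ISER_GET_MAX_XMIT_CMDS(send_wr). */
static unsigned int max_xmit_cmds(unsigned int send_wr)
{
	return (send_wr - MAX_TX_MISC_PDUS - MAX_RX_MISC_PDUS) /
	       (1 + INFLIGHT_DATAOUTS);
}

int main(void)
{
	/* with these illustrative constants: (4096 - 10) / 9 = 454 commands */
	printf("send_wr=4096 -> max_cmds=%u\n", max_xmit_cmds(4096));
	printf("send_wr=512  -> max_cmds=%u\n", max_xmit_cmds(512));
	return 0;
}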
@@ -247,7 +249,6 @@ struct iscsi_endpoint;
247 * @va: MR start address (buffer va) 249 * @va: MR start address (buffer va)
248 * @len: MR length 250 * @len: MR length
249 * @mem_h: pointer to registration context (FMR/Fastreg) 251 * @mem_h: pointer to registration context (FMR/Fastreg)
250 * @is_mr: indicates weather we registered the buffer
251 */ 252 */
252struct iser_mem_reg { 253struct iser_mem_reg {
253 u32 lkey; 254 u32 lkey;
@@ -255,7 +256,6 @@ struct iser_mem_reg {
255 u64 va; 256 u64 va;
256 u64 len; 257 u64 len;
257 void *mem_h; 258 void *mem_h;
258 int is_mr;
259}; 259};
260 260
261/** 261/**
@@ -323,8 +323,6 @@ struct iser_rx_desc {
323 char pad[ISER_RX_PAD_SIZE]; 323 char pad[ISER_RX_PAD_SIZE];
324} __attribute__((packed)); 324} __attribute__((packed));
325 325
326#define ISER_MAX_CQ 4
327
328struct iser_conn; 326struct iser_conn;
329struct ib_conn; 327struct ib_conn;
330struct iscsi_iser_task; 328struct iscsi_iser_task;
@@ -375,7 +373,7 @@ struct iser_device {
375 struct list_head ig_list; 373 struct list_head ig_list;
376 int refcount; 374 int refcount;
377 int comps_used; 375 int comps_used;
378 struct iser_comp comps[ISER_MAX_CQ]; 376 struct iser_comp *comps;
379 int (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn, 377 int (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn,
380 unsigned cmds_max); 378 unsigned cmds_max);
381 void (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn); 379 void (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn);
@@ -432,6 +430,7 @@ struct fast_reg_descriptor {
432 * @cma_id: rdma_cm connection maneger handle 430 * @cma_id: rdma_cm connection maneger handle
433 * @qp: Connection Queue-pair 431 * @qp: Connection Queue-pair
434 * @post_recv_buf_count: post receive counter 432 * @post_recv_buf_count: post receive counter
433 * @sig_count: send work request signal count
435 * @rx_wr: receive work request for batch posts 434 * @rx_wr: receive work request for batch posts
436 * @device: reference to iser device 435 * @device: reference to iser device
437 * @comp: iser completion context 436 * @comp: iser completion context
@@ -452,6 +451,7 @@ struct ib_conn {
452 struct rdma_cm_id *cma_id; 451 struct rdma_cm_id *cma_id;
453 struct ib_qp *qp; 452 struct ib_qp *qp;
454 int post_recv_buf_count; 453 int post_recv_buf_count;
454 u8 sig_count;
455 struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; 455 struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX];
456 struct iser_device *device; 456 struct iser_device *device;
457 struct iser_comp *comp; 457 struct iser_comp *comp;
@@ -482,6 +482,7 @@ struct ib_conn {
482 * to max number of post recvs 482 * to max number of post recvs
483 * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1) 483 * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1)
484 * @min_posted_rx: (qp_max_recv_dtos >> 2) 484 * @min_posted_rx: (qp_max_recv_dtos >> 2)
485 * @max_cmds: maximum cmds allowed for this connection
485 * @name: connection peer portal 486 * @name: connection peer portal
486 * @release_work: deffered work for release job 487 * @release_work: deffered work for release job
487 * @state_mutex: protects iser onnection state 488 * @state_mutex: protects iser onnection state
@@ -507,6 +508,7 @@ struct iser_conn {
507 unsigned qp_max_recv_dtos; 508 unsigned qp_max_recv_dtos;
508 unsigned qp_max_recv_dtos_mask; 509 unsigned qp_max_recv_dtos_mask;
509 unsigned min_posted_rx; 510 unsigned min_posted_rx;
511 u16 max_cmds;
510 char name[ISER_OBJECT_NAME_SIZE]; 512 char name[ISER_OBJECT_NAME_SIZE];
511 struct work_struct release_work; 513 struct work_struct release_work;
512 struct mutex state_mutex; 514 struct mutex state_mutex;
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 5a489ea63732..3821633f1065 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -369,7 +369,7 @@ static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req)
369 return 0; 369 return 0;
370} 370}
371 371
372static inline bool iser_signal_comp(int sig_count) 372static inline bool iser_signal_comp(u8 sig_count)
373{ 373{
374 return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0); 374 return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0);
375} 375}
@@ -388,7 +388,7 @@ int iser_send_command(struct iscsi_conn *conn,
388 struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; 388 struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr;
389 struct scsi_cmnd *sc = task->sc; 389 struct scsi_cmnd *sc = task->sc;
390 struct iser_tx_desc *tx_desc = &iser_task->desc; 390 struct iser_tx_desc *tx_desc = &iser_task->desc;
391 static unsigned sig_count; 391 u8 sig_count = ++iser_conn->ib_conn.sig_count;
392 392
393 edtl = ntohl(hdr->data_length); 393 edtl = ntohl(hdr->data_length);
394 394
@@ -435,7 +435,7 @@ int iser_send_command(struct iscsi_conn *conn,
435 iser_task->status = ISER_TASK_STATUS_STARTED; 435 iser_task->status = ISER_TASK_STATUS_STARTED;
436 436
437 err = iser_post_send(&iser_conn->ib_conn, tx_desc, 437 err = iser_post_send(&iser_conn->ib_conn, tx_desc,
438 iser_signal_comp(++sig_count)); 438 iser_signal_comp(sig_count));
439 if (!err) 439 if (!err)
440 return 0; 440 return 0;
441 441
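Moving sig_count into the connection as a u8 keeps the signal-every-Nth-send policy per QP, and because 256 is a multiple of ISER_SIGNAL_CMD_COUNT (32, per the header change earlier in this diff) the u8 wrap-around does not disturb the cadence. A stand-alone check of that property:

#include <stdint.h>
#include <stdio.h>

#define ISER_SIGNAL_CMD_COUNT 32   /* value shown in the iscsi_iser.h hunk above */

/* Same predicate as iser_signal_comp(): ask for a signalled completion
 * only on every ISER_SIGNAL_CMD_COUNT-th send. */
static int signal_comp(uint8_t sig_count)
{
	return (sig_count % ISER_SIGNAL_CMD_COUNT) == 0;
}

int main(void)
{
	uint8_t sig_count = 0;
	unsigned long sends, signalled = 0;

	for (sends = 1; sends <= 1000; sends++)
		if (signal_comp(++sig_count))   /* u8 wraps at 256, a multiple of 32 */
			signalled++;

	printf("signalled %lu of %lu sends (1 in %d)\n",
	       signalled, sends - 1, ISER_SIGNAL_CMD_COUNT);
	return 0;
}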
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
index 6c5ce357fba6..abce9339333f 100644
--- a/drivers/infiniband/ulp/iser/iser_memory.c
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -73,7 +73,6 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
73 73
74 if (cmd_dir == ISER_DIR_OUT) { 74 if (cmd_dir == ISER_DIR_OUT) {
75 /* copy the unaligned sg the buffer which is used for RDMA */ 75 /* copy the unaligned sg the buffer which is used for RDMA */
76 int i;
77 char *p, *from; 76 char *p, *from;
78 77
79 sgl = (struct scatterlist *)data->buf; 78 sgl = (struct scatterlist *)data->buf;
@@ -409,7 +408,6 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
409 regd_buf->reg.rkey = device->mr->rkey; 408 regd_buf->reg.rkey = device->mr->rkey;
410 regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); 409 regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]);
411 regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); 410 regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]);
412 regd_buf->reg.is_mr = 0;
413 411
414 iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " 412 iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X "
415 "va: 0x%08lX sz: %ld]\n", 413 "va: 0x%08lX sz: %ld]\n",
@@ -440,13 +438,13 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
440 return 0; 438 return 0;
441} 439}
442 440
443static inline void 441static void
444iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, 442iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
445 struct ib_sig_domain *domain) 443 struct ib_sig_domain *domain)
446{ 444{
447 domain->sig_type = IB_SIG_TYPE_T10_DIF; 445 domain->sig_type = IB_SIG_TYPE_T10_DIF;
448 domain->sig.dif.pi_interval = sc->device->sector_size; 446 domain->sig.dif.pi_interval = scsi_prot_interval(sc);
449 domain->sig.dif.ref_tag = scsi_get_lba(sc) & 0xffffffff; 447 domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc);
450 /* 448 /*
451 * At the moment we hard code those, but in the future 449 * At the moment we hard code those, but in the future
452 * we will take them from sc. 450 * we will take them from sc.
@@ -454,8 +452,7 @@ iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
454 domain->sig.dif.apptag_check_mask = 0xffff; 452 domain->sig.dif.apptag_check_mask = 0xffff;
455 domain->sig.dif.app_escape = true; 453 domain->sig.dif.app_escape = true;
456 domain->sig.dif.ref_escape = true; 454 domain->sig.dif.ref_escape = true;
457 if (scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE1 || 455 if (sc->prot_flags & SCSI_PROT_REF_INCREMENT)
458 scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE2)
459 domain->sig.dif.ref_remap = true; 456 domain->sig.dif.ref_remap = true;
460}; 457};
461 458
@@ -473,26 +470,16 @@ iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs)
473 case SCSI_PROT_WRITE_STRIP: 470 case SCSI_PROT_WRITE_STRIP:
474 sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; 471 sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
475 iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); 472 iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
476 /* 473 sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ?
477 * At the moment we use this modparam to tell what is 474 IB_T10DIF_CSUM : IB_T10DIF_CRC;
478 * the memory bg_type, in the future we will take it
479 * from sc.
480 */
481 sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM :
482 IB_T10DIF_CRC;
483 break; 475 break;
484 case SCSI_PROT_READ_PASS: 476 case SCSI_PROT_READ_PASS:
485 case SCSI_PROT_WRITE_PASS: 477 case SCSI_PROT_WRITE_PASS:
486 iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire); 478 iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
487 sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; 479 sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
488 iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); 480 iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
489 /* 481 sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ?
490 * At the moment we use this modparam to tell what is 482 IB_T10DIF_CSUM : IB_T10DIF_CRC;
491 * the memory bg_type, in the future we will take it
492 * from sc.
493 */
494 sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM :
495 IB_T10DIF_CRC;
496 break; 483 break;
497 default: 484 default:
498 iser_err("Unsupported PI operation %d\n", 485 iser_err("Unsupported PI operation %d\n",
@@ -503,26 +490,28 @@ iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs)
503 return 0; 490 return 0;
504} 491}
505 492
506static int 493static inline void
507iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) 494iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
508{ 495{
509 switch (scsi_get_prot_type(sc)) { 496 *mask = 0;
510 case SCSI_PROT_DIF_TYPE0: 497 if (sc->prot_flags & SCSI_PROT_REF_CHECK)
511 break; 498 *mask |= ISER_CHECK_REFTAG;
512 case SCSI_PROT_DIF_TYPE1: 499 if (sc->prot_flags & SCSI_PROT_GUARD_CHECK)
513 case SCSI_PROT_DIF_TYPE2: 500 *mask |= ISER_CHECK_GUARD;
514 *mask = ISER_CHECK_GUARD | ISER_CHECK_REFTAG; 501}
515 break;
516 case SCSI_PROT_DIF_TYPE3:
517 *mask = ISER_CHECK_GUARD;
518 break;
519 default:
520 iser_err("Unsupported protection type %d\n",
521 scsi_get_prot_type(sc));
522 return -EINVAL;
523 }
524 502
525 return 0; 503static void
504iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
505{
506 u32 rkey;
507
508 memset(inv_wr, 0, sizeof(*inv_wr));
509 inv_wr->opcode = IB_WR_LOCAL_INV;
510 inv_wr->wr_id = ISER_FASTREG_LI_WRID;
511 inv_wr->ex.invalidate_rkey = mr->rkey;
512
513 rkey = ib_inc_rkey(mr->rkey);
514 ib_update_fast_reg_key(mr, rkey);
526} 515}
527 516
528static int 517static int
@@ -536,26 +525,17 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
536 struct ib_send_wr *bad_wr, *wr = NULL; 525 struct ib_send_wr *bad_wr, *wr = NULL;
537 struct ib_sig_attrs sig_attrs; 526 struct ib_sig_attrs sig_attrs;
538 int ret; 527 int ret;
539 u32 key;
540 528
541 memset(&sig_attrs, 0, sizeof(sig_attrs)); 529 memset(&sig_attrs, 0, sizeof(sig_attrs));
542 ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs); 530 ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs);
543 if (ret) 531 if (ret)
544 goto err; 532 goto err;
545 533
546 ret = iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask); 534 iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask);
547 if (ret)
548 goto err;
549 535
550 if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) { 536 if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) {
551 memset(&inv_wr, 0, sizeof(inv_wr)); 537 iser_inv_rkey(&inv_wr, pi_ctx->sig_mr);
552 inv_wr.opcode = IB_WR_LOCAL_INV;
553 inv_wr.wr_id = ISER_FASTREG_LI_WRID;
554 inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey;
555 wr = &inv_wr; 538 wr = &inv_wr;
556 /* Bump the key */
557 key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF);
558 ib_update_fast_reg_key(pi_ctx->sig_mr, ++key);
559 } 539 }
560 540
561 memset(&sig_wr, 0, sizeof(sig_wr)); 541 memset(&sig_wr, 0, sizeof(sig_wr));
@@ -585,12 +565,7 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
585 565
586 sig_sge->lkey = pi_ctx->sig_mr->lkey; 566 sig_sge->lkey = pi_ctx->sig_mr->lkey;
587 sig_sge->addr = 0; 567 sig_sge->addr = 0;
588 sig_sge->length = data_sge->length + prot_sge->length; 568 sig_sge->length = scsi_transfer_length(iser_task->sc);
589 if (scsi_get_prot_op(iser_task->sc) == SCSI_PROT_WRITE_INSERT ||
590 scsi_get_prot_op(iser_task->sc) == SCSI_PROT_READ_STRIP) {
591 sig_sge->length += (data_sge->length /
592 iser_task->sc->device->sector_size) * 8;
593 }
594 569
595 iser_dbg("sig_sge: addr: 0x%llx length: %u lkey: 0x%x\n", 570 iser_dbg("sig_sge: addr: 0x%llx length: %u lkey: 0x%x\n",
596 sig_sge->addr, sig_sge->length, 571 sig_sge->addr, sig_sge->length,
@@ -613,7 +588,6 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
613 struct ib_fast_reg_page_list *frpl; 588 struct ib_fast_reg_page_list *frpl;
614 struct ib_send_wr fastreg_wr, inv_wr; 589 struct ib_send_wr fastreg_wr, inv_wr;
615 struct ib_send_wr *bad_wr, *wr = NULL; 590 struct ib_send_wr *bad_wr, *wr = NULL;
616 u8 key;
617 int ret, offset, size, plen; 591 int ret, offset, size, plen;
618 592
619 /* if there a single dma entry, dma mr suffices */ 593 /* if there a single dma entry, dma mr suffices */
@@ -645,14 +619,8 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
645 } 619 }
646 620
647 if (!(desc->reg_indicators & ind)) { 621 if (!(desc->reg_indicators & ind)) {
648 memset(&inv_wr, 0, sizeof(inv_wr)); 622 iser_inv_rkey(&inv_wr, mr);
649 inv_wr.wr_id = ISER_FASTREG_LI_WRID;
650 inv_wr.opcode = IB_WR_LOCAL_INV;
651 inv_wr.ex.invalidate_rkey = mr->rkey;
652 wr = &inv_wr; 623 wr = &inv_wr;
653 /* Bump the key */
654 key = (u8)(mr->rkey & 0x000000FF);
655 ib_update_fast_reg_key(mr, ++key);
656 } 624 }
657 625
658 /* Prepare FASTREG WR */ 626 /* Prepare FASTREG WR */
@@ -770,15 +738,11 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
770 regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey; 738 regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey;
771 regd_buf->reg.va = sig_sge.addr; 739 regd_buf->reg.va = sig_sge.addr;
772 regd_buf->reg.len = sig_sge.length; 740 regd_buf->reg.len = sig_sge.length;
773 regd_buf->reg.is_mr = 1;
774 } else { 741 } else {
775 if (desc) { 742 if (desc)
776 regd_buf->reg.rkey = desc->data_mr->rkey; 743 regd_buf->reg.rkey = desc->data_mr->rkey;
777 regd_buf->reg.is_mr = 1; 744 else
778 } else {
779 regd_buf->reg.rkey = device->mr->rkey; 745 regd_buf->reg.rkey = device->mr->rkey;
780 regd_buf->reg.is_mr = 0;
781 }
782 746
783 regd_buf->reg.lkey = data_sge.lkey; 747 regd_buf->reg.lkey = data_sge.lkey;
784 regd_buf->reg.va = data_sge.addr; 748 regd_buf->reg.va = data_sge.addr;
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 67225bb82bb5..695a2704bd43 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -76,7 +76,7 @@ static void iser_event_handler(struct ib_event_handler *handler,
76static int iser_create_device_ib_res(struct iser_device *device) 76static int iser_create_device_ib_res(struct iser_device *device)
77{ 77{
78 struct ib_device_attr *dev_attr = &device->dev_attr; 78 struct ib_device_attr *dev_attr = &device->dev_attr;
79 int ret, i; 79 int ret, i, max_cqe;
80 80
81 ret = ib_query_device(device->ib_device, dev_attr); 81 ret = ib_query_device(device->ib_device, dev_attr);
82 if (ret) { 82 if (ret) {
@@ -104,11 +104,19 @@ static int iser_create_device_ib_res(struct iser_device *device)
104 return -1; 104 return -1;
105 } 105 }
106 106
107 device->comps_used = min(ISER_MAX_CQ, 107 device->comps_used = min_t(int, num_online_cpus(),
108 device->ib_device->num_comp_vectors); 108 device->ib_device->num_comp_vectors);
109 iser_info("using %d CQs, device %s supports %d vectors\n", 109
110 device->comps = kcalloc(device->comps_used, sizeof(*device->comps),
111 GFP_KERNEL);
112 if (!device->comps)
113 goto comps_err;
114
115 max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe);
116
117 iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n",
110 device->comps_used, device->ib_device->name, 118 device->comps_used, device->ib_device->name,
111 device->ib_device->num_comp_vectors); 119 device->ib_device->num_comp_vectors, max_cqe);
112 120
113 device->pd = ib_alloc_pd(device->ib_device); 121 device->pd = ib_alloc_pd(device->ib_device);
114 if (IS_ERR(device->pd)) 122 if (IS_ERR(device->pd))
@@ -122,7 +130,7 @@ static int iser_create_device_ib_res(struct iser_device *device)
122 iser_cq_callback, 130 iser_cq_callback,
123 iser_cq_event_callback, 131 iser_cq_event_callback,
124 (void *)comp, 132 (void *)comp,
125 ISER_MAX_CQ_LEN, i); 133 max_cqe, i);
126 if (IS_ERR(comp->cq)) { 134 if (IS_ERR(comp->cq)) {
127 comp->cq = NULL; 135 comp->cq = NULL;
128 goto cq_err; 136 goto cq_err;
@@ -162,6 +170,8 @@ cq_err:
162 } 170 }
163 ib_dealloc_pd(device->pd); 171 ib_dealloc_pd(device->pd);
164pd_err: 172pd_err:
173 kfree(device->comps);
174comps_err:
165 iser_err("failed to allocate an IB resource\n"); 175 iser_err("failed to allocate an IB resource\n");
166 return -1; 176 return -1;
167} 177}
@@ -187,6 +197,9 @@ static void iser_free_device_ib_res(struct iser_device *device)
187 (void)ib_dereg_mr(device->mr); 197 (void)ib_dereg_mr(device->mr);
188 (void)ib_dealloc_pd(device->pd); 198 (void)ib_dealloc_pd(device->pd);
189 199
200 kfree(device->comps);
201 device->comps = NULL;
202
190 device->mr = NULL; 203 device->mr = NULL;
191 device->pd = NULL; 204 device->pd = NULL;
192} 205}
@@ -425,7 +438,10 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn)
425 */ 438 */
426static int iser_create_ib_conn_res(struct ib_conn *ib_conn) 439static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
427{ 440{
441 struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
442 ib_conn);
428 struct iser_device *device; 443 struct iser_device *device;
444 struct ib_device_attr *dev_attr;
429 struct ib_qp_init_attr init_attr; 445 struct ib_qp_init_attr init_attr;
430 int ret = -ENOMEM; 446 int ret = -ENOMEM;
431 int index, min_index = 0; 447 int index, min_index = 0;
@@ -433,6 +449,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
433 BUG_ON(ib_conn->device == NULL); 449 BUG_ON(ib_conn->device == NULL);
434 450
435 device = ib_conn->device; 451 device = ib_conn->device;
452 dev_attr = &device->dev_attr;
436 453
437 memset(&init_attr, 0, sizeof init_attr); 454 memset(&init_attr, 0, sizeof init_attr);
438 455
@@ -460,8 +477,20 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
460 if (ib_conn->pi_support) { 477 if (ib_conn->pi_support) {
461 init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1; 478 init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
462 init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; 479 init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
480 iser_conn->max_cmds =
481 ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS);
463 } else { 482 } else {
464 init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1; 483 if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
484 init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1;
485 iser_conn->max_cmds =
486 ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
487 } else {
488 init_attr.cap.max_send_wr = dev_attr->max_qp_wr;
489 iser_conn->max_cmds =
490 ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr);
491 iser_dbg("device %s supports max_send_wr %d\n",
492 device->ib_device->name, dev_attr->max_qp_wr);
493 }
465 } 494 }
466 495
467 ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); 496 ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
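The QP sizing above now honours the device limit: use the iser default depth when dev_attr->max_qp_wr allows it, otherwise fall back to whatever the HCA offers and derive max_cmds from that smaller depth. A compact sketch of the selection; the ISER_QP_MAX_REQ_DTOS stand-in value is assumed for illustration:

#include <stdio.h>

#define QP_MAX_REQ_DTOS 4096   /* assumed stand-in for ISER_QP_MAX_REQ_DTOS */

/* Pick the send-queue depth: prefer the iser default, but never ask the
 * device for more work requests than dev_attr->max_qp_wr allows. */
static int pick_max_send_wr(int max_qp_wr)
{
	if (max_qp_wr > QP_MAX_REQ_DTOS)
		return QP_MAX_REQ_DTOS + 1;
	return max_qp_wr;   /* weaker HCA: the command budget shrinks with it */
}

int main(void)
{
	printf("max_qp_wr=16384 -> send_wr=%d\n", pick_max_send_wr(16384));
	printf("max_qp_wr=1024  -> send_wr=%d\n", pick_max_send_wr(1024));
	return 0;
}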
@@ -475,7 +504,11 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
475 return ret; 504 return ret;
476 505
477out_err: 506out_err:
507 mutex_lock(&ig.connlist_mutex);
508 ib_conn->comp->active_qps--;
509 mutex_unlock(&ig.connlist_mutex);
478 iser_err("unable to alloc mem or create resource, err %d\n", ret); 510 iser_err("unable to alloc mem or create resource, err %d\n", ret);
511
479 return ret; 512 return ret;
480} 513}
481 514
@@ -610,9 +643,11 @@ void iser_conn_release(struct iser_conn *iser_conn)
610 mutex_unlock(&ig.connlist_mutex); 643 mutex_unlock(&ig.connlist_mutex);
611 644
612 mutex_lock(&iser_conn->state_mutex); 645 mutex_lock(&iser_conn->state_mutex);
613 if (iser_conn->state != ISER_CONN_DOWN) 646 if (iser_conn->state != ISER_CONN_DOWN) {
614 iser_warn("iser conn %p state %d, expected state down.\n", 647 iser_warn("iser conn %p state %d, expected state down.\n",
615 iser_conn, iser_conn->state); 648 iser_conn, iser_conn->state);
649 iser_conn->state = ISER_CONN_DOWN;
650 }
616 /* 651 /*
617 * In case we never got to bind stage, we still need to 652 * In case we never got to bind stage, we still need to
618 * release IB resources (which is safe to call more than once). 653 * release IB resources (which is safe to call more than once).
@@ -662,8 +697,10 @@ int iser_conn_terminate(struct iser_conn *iser_conn)
662 697
663 /* post an indication that all flush errors were consumed */ 698 /* post an indication that all flush errors were consumed */
664 err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr); 699 err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr);
665 if (err) 700 if (err) {
666 iser_err("conn %p failed to post beacon", ib_conn); 701 iser_err("conn %p failed to post beacon", ib_conn);
702 return 1;
703 }
667 704
668 wait_for_completion(&ib_conn->flush_comp); 705 wait_for_completion(&ib_conn->flush_comp);
669 } 706 }
@@ -846,20 +883,21 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve
846 break; 883 break;
847 case RDMA_CM_EVENT_DISCONNECTED: 884 case RDMA_CM_EVENT_DISCONNECTED:
848 case RDMA_CM_EVENT_ADDR_CHANGE: 885 case RDMA_CM_EVENT_ADDR_CHANGE:
849 iser_disconnected_handler(cma_id); 886 case RDMA_CM_EVENT_TIMEWAIT_EXIT:
887 iser_cleanup_handler(cma_id, false);
850 break; 888 break;
851 case RDMA_CM_EVENT_DEVICE_REMOVAL: 889 case RDMA_CM_EVENT_DEVICE_REMOVAL:
852 /* 890 /*
853 * we *must* destroy the device as we cannot rely 891 * we *must* destroy the device as we cannot rely
854 * on iscsid to be around to initiate error handling. 892 * on iscsid to be around to initiate error handling.
855 * also implicitly destroy the cma_id. 893 * also if we are not in state DOWN implicitly destroy
894 * the cma_id.
856 */ 895 */
857 iser_cleanup_handler(cma_id, true); 896 iser_cleanup_handler(cma_id, true);
858 iser_conn->ib_conn.cma_id = NULL; 897 if (iser_conn->state != ISER_CONN_DOWN) {
859 ret = 1; 898 iser_conn->ib_conn.cma_id = NULL;
860 break; 899 ret = 1;
861 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 900 }
862 iser_cleanup_handler(cma_id, false);
863 break; 901 break;
864 default: 902 default:
865 iser_err("Unexpected RDMA CM event (%d)\n", event->event); 903 iser_err("Unexpected RDMA CM event (%d)\n", event->event);
@@ -981,7 +1019,6 @@ int iser_reg_page_vec(struct ib_conn *ib_conn,
981 mem_reg->rkey = mem->fmr->rkey; 1019 mem_reg->rkey = mem->fmr->rkey;
982 mem_reg->len = page_vec->length * SIZE_4K; 1020 mem_reg->len = page_vec->length * SIZE_4K;
983 mem_reg->va = io_addr; 1021 mem_reg->va = io_addr;
984 mem_reg->is_mr = 1;
985 mem_reg->mem_h = (void *)mem; 1022 mem_reg->mem_h = (void *)mem;
986 1023
987 mem_reg->va += page_vec->offset; 1024 mem_reg->va += page_vec->offset;
@@ -1008,7 +1045,7 @@ void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
1008 struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; 1045 struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
1009 int ret; 1046 int ret;
1010 1047
1011 if (!reg->is_mr) 1048 if (!reg->mem_h)
1012 return; 1049 return;
1013 1050
1014 iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); 1051 iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h);
@@ -1028,11 +1065,10 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
1028 struct ib_conn *ib_conn = &iser_conn->ib_conn; 1065 struct ib_conn *ib_conn = &iser_conn->ib_conn;
1029 struct fast_reg_descriptor *desc = reg->mem_h; 1066 struct fast_reg_descriptor *desc = reg->mem_h;
1030 1067
1031 if (!reg->is_mr) 1068 if (!desc)
1032 return; 1069 return;
1033 1070
1034 reg->mem_h = NULL; 1071 reg->mem_h = NULL;
1035 reg->is_mr = 0;
1036 spin_lock_bh(&ib_conn->lock); 1072 spin_lock_bh(&ib_conn->lock);
1037 list_add_tail(&desc->list, &ib_conn->fastreg.pool); 1073 list_add_tail(&desc->list, &ib_conn->fastreg.pool);
1038 spin_unlock_bh(&ib_conn->lock); 1074 spin_unlock_bh(&ib_conn->lock);
@@ -1049,7 +1085,7 @@ int iser_post_recvl(struct iser_conn *iser_conn)
1049 sge.length = ISER_RX_LOGIN_SIZE; 1085 sge.length = ISER_RX_LOGIN_SIZE;
1050 sge.lkey = ib_conn->device->mr->lkey; 1086 sge.lkey = ib_conn->device->mr->lkey;
1051 1087
1052 rx_wr.wr_id = (unsigned long)iser_conn->login_resp_buf; 1088 rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf;
1053 rx_wr.sg_list = &sge; 1089 rx_wr.sg_list = &sge;
1054 rx_wr.num_sge = 1; 1090 rx_wr.num_sge = 1;
1055 rx_wr.next = NULL; 1091 rx_wr.next = NULL;
@@ -1073,7 +1109,7 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count)
1073 1109
1074 for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { 1110 for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
1075 rx_desc = &iser_conn->rx_descs[my_rx_head]; 1111 rx_desc = &iser_conn->rx_descs[my_rx_head];
1076 rx_wr->wr_id = (unsigned long)rx_desc; 1112 rx_wr->wr_id = (uintptr_t)rx_desc;
1077 rx_wr->sg_list = &rx_desc->rx_sg; 1113 rx_wr->sg_list = &rx_desc->rx_sg;
1078 rx_wr->num_sge = 1; 1114 rx_wr->num_sge = 1;
1079 rx_wr->next = rx_wr + 1; 1115 rx_wr->next = rx_wr + 1;
@@ -1110,7 +1146,7 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
1110 DMA_TO_DEVICE); 1146 DMA_TO_DEVICE);
1111 1147
1112 send_wr.next = NULL; 1148 send_wr.next = NULL;
1113 send_wr.wr_id = (unsigned long)tx_desc; 1149 send_wr.wr_id = (uintptr_t)tx_desc;
1114 send_wr.sg_list = tx_desc->tx_sg; 1150 send_wr.sg_list = tx_desc->tx_sg;
1115 send_wr.num_sge = tx_desc->num_sge; 1151 send_wr.num_sge = tx_desc->num_sge;
1116 send_wr.opcode = IB_WR_SEND; 1152 send_wr.opcode = IB_WR_SEND;
@@ -1160,6 +1196,7 @@ static void
1160iser_handle_comp_error(struct ib_conn *ib_conn, 1196iser_handle_comp_error(struct ib_conn *ib_conn,
1161 struct ib_wc *wc) 1197 struct ib_wc *wc)
1162{ 1198{
1199 void *wr_id = (void *)(uintptr_t)wc->wr_id;
1163 struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, 1200 struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
1164 ib_conn); 1201 ib_conn);
1165 1202
@@ -1168,8 +1205,8 @@ iser_handle_comp_error(struct ib_conn *ib_conn,
1168 iscsi_conn_failure(iser_conn->iscsi_conn, 1205 iscsi_conn_failure(iser_conn->iscsi_conn,
1169 ISCSI_ERR_CONN_FAILED); 1206 ISCSI_ERR_CONN_FAILED);
1170 1207
1171 if (is_iser_tx_desc(iser_conn, (void *)wc->wr_id)) { 1208 if (is_iser_tx_desc(iser_conn, wr_id)) {
1172 struct iser_tx_desc *desc = (struct iser_tx_desc *)wc->wr_id; 1209 struct iser_tx_desc *desc = wr_id;
1173 1210
1174 if (desc->type == ISCSI_TX_DATAOUT) 1211 if (desc->type == ISCSI_TX_DATAOUT)
1175 kmem_cache_free(ig.desc_cache, desc); 1212 kmem_cache_free(ig.desc_cache, desc);
@@ -1193,14 +1230,14 @@ static void iser_handle_wc(struct ib_wc *wc)
1193 struct iser_rx_desc *rx_desc; 1230 struct iser_rx_desc *rx_desc;
1194 1231
1195 ib_conn = wc->qp->qp_context; 1232 ib_conn = wc->qp->qp_context;
1196 if (wc->status == IB_WC_SUCCESS) { 1233 if (likely(wc->status == IB_WC_SUCCESS)) {
1197 if (wc->opcode == IB_WC_RECV) { 1234 if (wc->opcode == IB_WC_RECV) {
1198 rx_desc = (struct iser_rx_desc *)wc->wr_id; 1235 rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
1199 iser_rcv_completion(rx_desc, wc->byte_len, 1236 iser_rcv_completion(rx_desc, wc->byte_len,
1200 ib_conn); 1237 ib_conn);
1201 } else 1238 } else
1202 if (wc->opcode == IB_WC_SEND) { 1239 if (wc->opcode == IB_WC_SEND) {
1203 tx_desc = (struct iser_tx_desc *)wc->wr_id; 1240 tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
1204 iser_snd_completion(tx_desc, ib_conn); 1241 iser_snd_completion(tx_desc, ib_conn);
1205 } else { 1242 } else {
1206 iser_err("Unknown wc opcode %d\n", wc->opcode); 1243 iser_err("Unknown wc opcode %d\n", wc->opcode);
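The wr_id handling above switches from casting through unsigned long to casting through uintptr_t, the portable way to stash a pointer in the 64-bit wr_id field and recover it on completion. A stand-alone illustration of that round trip:

#include <stdint.h>
#include <stdio.h>

struct tx_desc {
	int type;
};

int main(void)
{
	struct tx_desc desc = { .type = 7 };

	/* post path: pointer -> uintptr_t -> u64 (ib_send_wr.wr_id is a u64) */
	uint64_t wr_id = (uintptr_t)&desc;

	/* completion path: u64 -> uintptr_t -> pointer */
	struct tx_desc *back = (struct tx_desc *)(uintptr_t)wr_id;

	printf("round-tripped type = %d (same object: %s)\n",
	       back->type, back == &desc ? "yes" : "no");
	return 0;
}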
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 5461924c9f10..db3c8c851af1 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -2929,7 +2929,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)
2929 return -ENOMEM; 2929 return -ENOMEM;
2930 2930
2931 sep_opt = options; 2931 sep_opt = options;
2932 while ((p = strsep(&sep_opt, ",")) != NULL) { 2932 while ((p = strsep(&sep_opt, ",\n")) != NULL) {
2933 if (!*p) 2933 if (!*p)
2934 continue; 2934 continue;
2935 2935
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index e25436b24ce7..629f9f1435a5 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -171,9 +171,9 @@ int mlx4_check_port_params(struct mlx4_dev *dev,
171{ 171{
172 int i; 172 int i;
173 173
174 for (i = 0; i < dev->caps.num_ports - 1; i++) { 174 if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) {
175 if (port_type[i] != port_type[i + 1]) { 175 for (i = 0; i < dev->caps.num_ports - 1; i++) {
176 if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { 176 if (port_type[i] != port_type[i + 1]) {
177 mlx4_err(dev, "Only same port types supported on this HCA, aborting\n"); 177 mlx4_err(dev, "Only same port types supported on this HCA, aborting\n");
178 return -EINVAL; 178 return -EINVAL;
179 } 179 }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index ab684463780b..da82991239a8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -157,6 +157,8 @@ static const char *eqe_type_str(u8 type)
157 return "MLX5_EVENT_TYPE_CMD"; 157 return "MLX5_EVENT_TYPE_CMD";
158 case MLX5_EVENT_TYPE_PAGE_REQUEST: 158 case MLX5_EVENT_TYPE_PAGE_REQUEST:
159 return "MLX5_EVENT_TYPE_PAGE_REQUEST"; 159 return "MLX5_EVENT_TYPE_PAGE_REQUEST";
160 case MLX5_EVENT_TYPE_PAGE_FAULT:
161 return "MLX5_EVENT_TYPE_PAGE_FAULT";
160 default: 162 default:
161 return "Unrecognized event"; 163 return "Unrecognized event";
162 } 164 }
@@ -279,6 +281,11 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
279 } 281 }
280 break; 282 break;
281 283
284#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
285 case MLX5_EVENT_TYPE_PAGE_FAULT:
286 mlx5_eq_pagefault(dev, eqe);
287 break;
288#endif
282 289
283 default: 290 default:
284 mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", 291 mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n",
@@ -446,8 +453,12 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev)
446int mlx5_start_eqs(struct mlx5_core_dev *dev) 453int mlx5_start_eqs(struct mlx5_core_dev *dev)
447{ 454{
448 struct mlx5_eq_table *table = &dev->priv.eq_table; 455 struct mlx5_eq_table *table = &dev->priv.eq_table;
456 u32 async_event_mask = MLX5_ASYNC_EVENT_MASK;
449 int err; 457 int err;
450 458
459 if (dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)
460 async_event_mask |= (1ull << MLX5_EVENT_TYPE_PAGE_FAULT);
461
451 err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, 462 err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
452 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, 463 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
453 "mlx5_cmd_eq", &dev->priv.uuari.uars[0]); 464 "mlx5_cmd_eq", &dev->priv.uuari.uars[0]);
@@ -459,7 +470,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev)
459 mlx5_cmd_use_events(dev); 470 mlx5_cmd_use_events(dev);
460 471
461 err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC, 472 err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC,
462 MLX5_NUM_ASYNC_EQE, MLX5_ASYNC_EVENT_MASK, 473 MLX5_NUM_ASYNC_EQE, async_event_mask,
463 "mlx5_async_eq", &dev->priv.uuari.uars[0]); 474 "mlx5_async_eq", &dev->priv.uuari.uars[0]);
464 if (err) { 475 if (err) {
465 mlx5_core_warn(dev, "failed to create async EQ %d\n", err); 476 mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index 087c4c797deb..06f9036acd83 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -69,6 +69,46 @@ int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, struct mlx5_caps *caps)
69 return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR); 69 return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR);
70} 70}
71 71
72int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *caps)
73{
74 u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)];
75 int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
76 void *out;
77 int err;
78
79 if (!(dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG))
80 return -ENOTSUPP;
81
82 memset(in, 0, sizeof(in));
83 out = kzalloc(out_sz, GFP_KERNEL);
84 if (!out)
85 return -ENOMEM;
86 MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
87 MLX5_SET(query_hca_cap_in, in, op_mod, HCA_CAP_OPMOD_GET_ODP_CUR);
88 err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz);
89 if (err)
90 goto out;
91
92 err = mlx5_cmd_status_to_err_v2(out);
93 if (err) {
94 mlx5_core_warn(dev, "query cur hca ODP caps failed, %d\n", err);
95 goto out;
96 }
97
98 memcpy(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability_struct),
99 sizeof(*caps));
100
101 mlx5_core_dbg(dev, "on-demand paging capabilities:\nrc: %08x\nuc: %08x\nud: %08x\n",
102 be32_to_cpu(caps->per_transport_caps.rc_odp_caps),
103 be32_to_cpu(caps->per_transport_caps.uc_odp_caps),
104 be32_to_cpu(caps->per_transport_caps.ud_odp_caps));
105
106out:
107 kfree(out);
108 return err;
109}
110EXPORT_SYMBOL(mlx5_query_odp_caps);
111
72int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) 112int mlx5_cmd_init_hca(struct mlx5_core_dev *dev)
73{ 113{
74 struct mlx5_cmd_init_hca_mbox_in in; 114 struct mlx5_cmd_init_hca_mbox_in in;
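mlx5_query_odp_caps() returns three big-endian per-transport capability words (RC, UC, UD); a consumer byte-swaps each word and tests individual feature bits. The sketch below shows only that mechanical step, with a made-up bit position, since the real bit layout is defined in the mlx5 headers:

#include <arpa/inet.h>   /* ntohl()/htonl() stand in for be32_to_cpu()/cpu_to_be32() */
#include <stdint.h>
#include <stdio.h>

#define FAKE_ODP_SEND_BIT (1u << 0)   /* hypothetical bit, for illustration only */

struct odp_per_transport_caps {
	uint32_t rc_odp_caps;   /* big-endian in the mailbox */
	uint32_t uc_odp_caps;
	uint32_t ud_odp_caps;
};

int main(void)
{
	/* pretend the firmware reported bit 0 set for RC */
	struct odp_per_transport_caps caps = {
		.rc_odp_caps = htonl(FAKE_ODP_SEND_BIT),
	};

	uint32_t rc = ntohl(caps.rc_odp_caps);   /* be32_to_cpu() */

	printf("RC ODP caps: %08x, send supported: %s\n",
	       rc, (rc & FAKE_ODP_SEND_BIT) ? "yes" : "no");
	return 0;
}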
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
index 5261a2b0da43..575d853dbe05 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -88,6 +88,95 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type)
88 mlx5_core_put_rsc(common); 88 mlx5_core_put_rsc(common);
89} 89}
90 90
91#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
92void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe)
93{
94 struct mlx5_eqe_page_fault *pf_eqe = &eqe->data.page_fault;
95 int qpn = be32_to_cpu(pf_eqe->flags_qpn) & MLX5_QPN_MASK;
96 struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, qpn);
97 struct mlx5_core_qp *qp =
98 container_of(common, struct mlx5_core_qp, common);
99 struct mlx5_pagefault pfault;
100
101 if (!qp) {
102 mlx5_core_warn(dev, "ODP event for non-existent QP %06x\n",
103 qpn);
104 return;
105 }
106
107 pfault.event_subtype = eqe->sub_type;
108 pfault.flags = (be32_to_cpu(pf_eqe->flags_qpn) >> MLX5_QPN_BITS) &
109 (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE | MLX5_PFAULT_RDMA);
110 pfault.bytes_committed = be32_to_cpu(
111 pf_eqe->bytes_committed);
112
113 mlx5_core_dbg(dev,
114 "PAGE_FAULT: subtype: 0x%02x, flags: 0x%02x,\n",
115 eqe->sub_type, pfault.flags);
116
117 switch (eqe->sub_type) {
118 case MLX5_PFAULT_SUBTYPE_RDMA:
119 /* RDMA based event */
120 pfault.rdma.r_key =
121 be32_to_cpu(pf_eqe->rdma.r_key);
122 pfault.rdma.packet_size =
123 be16_to_cpu(pf_eqe->rdma.packet_length);
124 pfault.rdma.rdma_op_len =
125 be32_to_cpu(pf_eqe->rdma.rdma_op_len);
126 pfault.rdma.rdma_va =
127 be64_to_cpu(pf_eqe->rdma.rdma_va);
128 mlx5_core_dbg(dev,
129 "PAGE_FAULT: qpn: 0x%06x, r_key: 0x%08x,\n",
130 qpn, pfault.rdma.r_key);
131 mlx5_core_dbg(dev,
132 "PAGE_FAULT: rdma_op_len: 0x%08x,\n",
133 pfault.rdma.rdma_op_len);
134 mlx5_core_dbg(dev,
135 "PAGE_FAULT: rdma_va: 0x%016llx,\n",
136 pfault.rdma.rdma_va);
137 mlx5_core_dbg(dev,
138 "PAGE_FAULT: bytes_committed: 0x%06x\n",
139 pfault.bytes_committed);
140 break;
141
142 case MLX5_PFAULT_SUBTYPE_WQE:
143 /* WQE based event */
144 pfault.wqe.wqe_index =
145 be16_to_cpu(pf_eqe->wqe.wqe_index);
146 pfault.wqe.packet_size =
147 be16_to_cpu(pf_eqe->wqe.packet_length);
148 mlx5_core_dbg(dev,
149 "PAGE_FAULT: qpn: 0x%06x, wqe_index: 0x%04x,\n",
150 qpn, pfault.wqe.wqe_index);
151 mlx5_core_dbg(dev,
152 "PAGE_FAULT: bytes_committed: 0x%06x\n",
153 pfault.bytes_committed);
154 break;
155
156 default:
157 mlx5_core_warn(dev,
158 "Unsupported page fault event sub-type: 0x%02hhx, QP %06x\n",
159 eqe->sub_type, qpn);
160 /* Unsupported page faults should still be resolved by the
161 * page fault handler
162 */
163 }
164
165 if (qp->pfault_handler) {
166 qp->pfault_handler(qp, &pfault);
167 } else {
168 mlx5_core_err(dev,
169 "ODP event for QP %08x, without a fault handler in QP\n",
170 qpn);
171 /* Page fault will remain unresolved. QP will hang until it is
172 * destroyed
173 */
174 }
175
176 mlx5_core_put_rsc(common);
177}
178#endif
179
91int mlx5_core_create_qp(struct mlx5_core_dev *dev, 180int mlx5_core_create_qp(struct mlx5_core_dev *dev,
92 struct mlx5_core_qp *qp, 181 struct mlx5_core_qp *qp,
93 struct mlx5_create_qp_mbox_in *in, 182 struct mlx5_create_qp_mbox_in *in,
@@ -322,3 +411,33 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn)
322 return err; 411 return err;
323} 412}
324EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); 413EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc);
414
415#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
416int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
417 u8 flags, int error)
418{
419 struct mlx5_page_fault_resume_mbox_in in;
420 struct mlx5_page_fault_resume_mbox_out out;
421 int err;
422
423 memset(&in, 0, sizeof(in));
424 memset(&out, 0, sizeof(out));
425 in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_PAGE_FAULT_RESUME);
426 in.hdr.opmod = 0;
427 flags &= (MLX5_PAGE_FAULT_RESUME_REQUESTOR |
428 MLX5_PAGE_FAULT_RESUME_WRITE |
429 MLX5_PAGE_FAULT_RESUME_RDMA);
430 flags |= (error ? MLX5_PAGE_FAULT_RESUME_ERROR : 0);
431 in.flags_qpn = cpu_to_be32((qpn & MLX5_QPN_MASK) |
432 (flags << MLX5_QPN_BITS));
433 err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
434 if (err)
435 return err;
436
437 if (out.hdr.status)
438 err = mlx5_cmd_status_to_err(&out.hdr);
439
440 return err;
441}
442EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume);
443#endif
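mlx5_core_page_fault_resume() is the acknowledgement half of the protocol: once a consumer has made the faulting range present (or failed to), it resumes the QP with the same REQUESTOR/WRITE/RDMA flags that were reported, optionally setting the error flag. A sketch of a consumer wired through the new pfault_handler callback; the real handler lives in mlx5_ib's odp.c elsewhere in this series, and the example_* helpers below are hypothetical.

#include <linux/mlx5/driver.h>
#include <linux/mlx5/qp.h>

/* Hypothetical helpers supplied by the consumer. */
struct mlx5_core_dev *example_qp_to_dev(struct mlx5_core_qp *qp);
int example_resolve_pages(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp,
			  struct mlx5_pagefault *pfault);

static void example_pfault_handler(struct mlx5_core_qp *qp,
				   struct mlx5_pagefault *pfault)
{
	struct mlx5_core_dev *dev = example_qp_to_dev(qp);
	int err;

	/* Make the faulting range present on the device. */
	err = example_resolve_pages(dev, qp, pfault);

	/* Echo the reported flags back; a non-zero error asks the HW to
	 * move the QP to an error state instead of retrying the request. */
	mlx5_core_page_fault_resume(dev, qp->qpn, pfault->flags,
				    err ? 1 : 0);
}

A consumer would point qp->pfault_handler at such a function after creating the QP, mirroring how the event callback is installed today.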
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index ea4f1c46f761..4e5bd813bb9a 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -120,6 +120,15 @@ enum {
120}; 120};
121 121
122enum { 122enum {
123 MLX5_MKEY_INBOX_PG_ACCESS = 1 << 31
124};
125
126enum {
127 MLX5_PFAULT_SUBTYPE_WQE = 0,
128 MLX5_PFAULT_SUBTYPE_RDMA = 1,
129};
130
131enum {
123 MLX5_PERM_LOCAL_READ = 1 << 2, 132 MLX5_PERM_LOCAL_READ = 1 << 2,
124 MLX5_PERM_LOCAL_WRITE = 1 << 3, 133 MLX5_PERM_LOCAL_WRITE = 1 << 3,
125 MLX5_PERM_REMOTE_READ = 1 << 4, 134 MLX5_PERM_REMOTE_READ = 1 << 4,
@@ -180,6 +189,19 @@ enum {
180 MLX5_MKEY_MASK_FREE = 1ull << 29, 189 MLX5_MKEY_MASK_FREE = 1ull << 29,
181}; 190};
182 191
192enum {
193 MLX5_UMR_TRANSLATION_OFFSET_EN = (1 << 4),
194
195 MLX5_UMR_CHECK_NOT_FREE = (1 << 5),
196 MLX5_UMR_CHECK_FREE = (2 << 5),
197
198 MLX5_UMR_INLINE = (1 << 7),
199};
200
201#define MLX5_UMR_MTT_ALIGNMENT 0x40
202#define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1)
203#define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT
204
183enum mlx5_event { 205enum mlx5_event {
184 MLX5_EVENT_TYPE_COMP = 0x0, 206 MLX5_EVENT_TYPE_COMP = 0x0,
185 207
@@ -206,6 +228,8 @@ enum mlx5_event {
206 228
207 MLX5_EVENT_TYPE_CMD = 0x0a, 229 MLX5_EVENT_TYPE_CMD = 0x0a,
208 MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, 230 MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb,
231
232 MLX5_EVENT_TYPE_PAGE_FAULT = 0xc,
209}; 233};
210 234
211enum { 235enum {
@@ -225,6 +249,7 @@ enum {
225 MLX5_DEV_CAP_FLAG_APM = 1LL << 17, 249 MLX5_DEV_CAP_FLAG_APM = 1LL << 17,
226 MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18, 250 MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18,
227 MLX5_DEV_CAP_FLAG_BLOCK_MCAST = 1LL << 23, 251 MLX5_DEV_CAP_FLAG_BLOCK_MCAST = 1LL << 23,
252 MLX5_DEV_CAP_FLAG_ON_DMND_PG = 1LL << 24,
228 MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, 253 MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29,
229 MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, 254 MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30,
230 MLX5_DEV_CAP_FLAG_DCT = 1LL << 37, 255 MLX5_DEV_CAP_FLAG_DCT = 1LL << 37,
@@ -290,6 +315,8 @@ enum {
290enum { 315enum {
291 HCA_CAP_OPMOD_GET_MAX = 0, 316 HCA_CAP_OPMOD_GET_MAX = 0,
292 HCA_CAP_OPMOD_GET_CUR = 1, 317 HCA_CAP_OPMOD_GET_CUR = 1,
318 HCA_CAP_OPMOD_GET_ODP_MAX = 4,
319 HCA_CAP_OPMOD_GET_ODP_CUR = 5
293}; 320};
294 321
295struct mlx5_inbox_hdr { 322struct mlx5_inbox_hdr {
@@ -319,6 +346,23 @@ struct mlx5_cmd_query_adapter_mbox_out {
319 u8 vsd_psid[16]; 346 u8 vsd_psid[16];
320}; 347};
321 348
349enum mlx5_odp_transport_cap_bits {
350 MLX5_ODP_SUPPORT_SEND = 1 << 31,
351 MLX5_ODP_SUPPORT_RECV = 1 << 30,
352 MLX5_ODP_SUPPORT_WRITE = 1 << 29,
353 MLX5_ODP_SUPPORT_READ = 1 << 28,
354};
355
356struct mlx5_odp_caps {
357 char reserved[0x10];
358 struct {
359 __be32 rc_odp_caps;
360 __be32 uc_odp_caps;
361 __be32 ud_odp_caps;
362 } per_transport_caps;
363 char reserved2[0xe4];
364};
365
322struct mlx5_cmd_init_hca_mbox_in { 366struct mlx5_cmd_init_hca_mbox_in {
323 struct mlx5_inbox_hdr hdr; 367 struct mlx5_inbox_hdr hdr;
324 u8 rsvd0[2]; 368 u8 rsvd0[2];
@@ -439,6 +483,27 @@ struct mlx5_eqe_page_req {
439 __be32 rsvd1[5]; 483 __be32 rsvd1[5];
440}; 484};
441 485
486struct mlx5_eqe_page_fault {
487 __be32 bytes_committed;
488 union {
489 struct {
490 u16 reserved1;
491 __be16 wqe_index;
492 u16 reserved2;
493 __be16 packet_length;
494 u8 reserved3[12];
495 } __packed wqe;
496 struct {
497 __be32 r_key;
498 u16 reserved1;
499 __be16 packet_length;
500 __be32 rdma_op_len;
501 __be64 rdma_va;
502 } __packed rdma;
503 } __packed;
504 __be32 flags_qpn;
505} __packed;
506
442union ev_data { 507union ev_data {
443 __be32 raw[7]; 508 __be32 raw[7];
444 struct mlx5_eqe_cmd cmd; 509 struct mlx5_eqe_cmd cmd;
@@ -450,6 +515,7 @@ union ev_data {
450 struct mlx5_eqe_congestion cong; 515 struct mlx5_eqe_congestion cong;
451 struct mlx5_eqe_stall_vl stall_vl; 516 struct mlx5_eqe_stall_vl stall_vl;
452 struct mlx5_eqe_page_req req_pages; 517 struct mlx5_eqe_page_req req_pages;
518 struct mlx5_eqe_page_fault page_fault;
453} __packed; 519} __packed;
454 520
455struct mlx5_eqe { 521struct mlx5_eqe {
@@ -776,6 +842,10 @@ struct mlx5_query_eq_mbox_out {
776 struct mlx5_eq_context ctx; 842 struct mlx5_eq_context ctx;
777}; 843};
778 844
845enum {
846 MLX5_MKEY_STATUS_FREE = 1 << 6,
847};
848
779struct mlx5_mkey_seg { 849struct mlx5_mkey_seg {
780 /* This is a two bit field occupying bits 31-30. 850 /* This is a two bit field occupying bits 31-30.
781 * bit 31 is always 0, 851 * bit 31 is always 0,
@@ -812,7 +882,7 @@ struct mlx5_query_special_ctxs_mbox_out {
812struct mlx5_create_mkey_mbox_in { 882struct mlx5_create_mkey_mbox_in {
813 struct mlx5_inbox_hdr hdr; 883 struct mlx5_inbox_hdr hdr;
814 __be32 input_mkey_index; 884 __be32 input_mkey_index;
815 u8 rsvd0[4]; 885 __be32 flags;
816 struct mlx5_mkey_seg seg; 886 struct mlx5_mkey_seg seg;
817 u8 rsvd1[16]; 887 u8 rsvd1[16];
818 __be32 xlat_oct_act_size; 888 __be32 xlat_oct_act_size;
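The flags_qpn word in the new struct mlx5_eqe_page_fault packs the QP number into its low 24 bits and the fault flags above them, exactly as mlx5_eq_pagefault() decodes it earlier in this patch. A stand-alone decode sketch (the helper name is illustrative):

#include <linux/mlx5/qp.h>	/* MLX5_QPN_BITS/MASK, MLX5_PFAULT_* */

static void example_decode_flags_qpn(__be32 flags_qpn, u32 *qpn, u8 *flags)
{
	u32 v = be32_to_cpu(flags_qpn);

	*qpn   = v & MLX5_QPN_MASK;		/* low 24 bits */
	*flags = (v >> MLX5_QPN_BITS) &
		 (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE | MLX5_PFAULT_RDMA);
}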
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index b1bf41556b32..166d9315fe4b 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -113,6 +113,13 @@ enum {
113 MLX5_REG_HOST_ENDIANNESS = 0x7004, 113 MLX5_REG_HOST_ENDIANNESS = 0x7004,
114}; 114};
115 115
116enum mlx5_page_fault_resume_flags {
117 MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0,
118 MLX5_PAGE_FAULT_RESUME_WRITE = 1 << 1,
119 MLX5_PAGE_FAULT_RESUME_RDMA = 1 << 2,
120 MLX5_PAGE_FAULT_RESUME_ERROR = 1 << 7,
121};
122
116enum dbg_rsc_type { 123enum dbg_rsc_type {
117 MLX5_DBG_RSC_QP, 124 MLX5_DBG_RSC_QP,
118 MLX5_DBG_RSC_EQ, 125 MLX5_DBG_RSC_EQ,
@@ -467,7 +474,7 @@ struct mlx5_priv {
467 struct workqueue_struct *pg_wq; 474 struct workqueue_struct *pg_wq;
468 struct rb_root page_root; 475 struct rb_root page_root;
469 int fw_pages; 476 int fw_pages;
470 int reg_pages; 477 atomic_t reg_pages;
471 struct list_head free_list; 478 struct list_head free_list;
472 479
473 struct mlx5_core_health health; 480 struct mlx5_core_health health;
@@ -703,6 +710,9 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev);
703void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); 710void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas);
704void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); 711void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn);
705void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); 712void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type);
713#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
714void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe);
715#endif
706void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); 716void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type);
707struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); 717struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn);
708void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector); 718void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector);
@@ -740,6 +750,8 @@ int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn,
740 int npsvs, u32 *sig_index); 750 int npsvs, u32 *sig_index);
741int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); 751int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num);
742void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); 752void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common);
753int mlx5_query_odp_caps(struct mlx5_core_dev *dev,
754 struct mlx5_odp_caps *odp_caps);
743 755
744static inline u32 mlx5_mkey_to_idx(u32 mkey) 756static inline u32 mlx5_mkey_to_idx(u32 mkey)
745{ 757{
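Converting priv->reg_pages from int to atomic_t lets the MR registration and release paths account pages without serializing on a lock; the mlx5_ib changes in this series update the counter from several contexts. Illustrative accounting helpers (the helper names are hypothetical, only the field change comes from this patch):

#include <linux/atomic.h>
#include <linux/mlx5/driver.h>

static void example_account_reg_pages(struct mlx5_core_dev *dev, int npages)
{
	atomic_add(npages, &dev->priv.reg_pages);
}

static void example_release_reg_pages(struct mlx5_core_dev *dev, int npages)
{
	atomic_sub(npages, &dev->priv.reg_pages);
}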
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
index 3fa075daeb1d..61f7a342d1bf 100644
--- a/include/linux/mlx5/qp.h
+++ b/include/linux/mlx5/qp.h
@@ -50,6 +50,9 @@
50#define MLX5_BSF_APPTAG_ESCAPE 0x1 50#define MLX5_BSF_APPTAG_ESCAPE 0x1
51#define MLX5_BSF_APPREF_ESCAPE 0x2 51#define MLX5_BSF_APPREF_ESCAPE 0x2
52 52
53#define MLX5_QPN_BITS 24
54#define MLX5_QPN_MASK ((1 << MLX5_QPN_BITS) - 1)
55
53enum mlx5_qp_optpar { 56enum mlx5_qp_optpar {
54 MLX5_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, 57 MLX5_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0,
55 MLX5_QP_OPTPAR_RRE = 1 << 1, 58 MLX5_QP_OPTPAR_RRE = 1 << 1,
@@ -189,6 +192,14 @@ struct mlx5_wqe_ctrl_seg {
189 __be32 imm; 192 __be32 imm;
190}; 193};
191 194
195#define MLX5_WQE_CTRL_DS_MASK 0x3f
196#define MLX5_WQE_CTRL_QPN_MASK 0xffffff00
197#define MLX5_WQE_CTRL_QPN_SHIFT 8
198#define MLX5_WQE_DS_UNITS 16
199#define MLX5_WQE_CTRL_OPCODE_MASK 0xff
200#define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00
201#define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8
202
192struct mlx5_wqe_xrc_seg { 203struct mlx5_wqe_xrc_seg {
193 __be32 xrc_srqn; 204 __be32 xrc_srqn;
194 u8 rsvd[12]; 205 u8 rsvd[12];
@@ -292,6 +303,8 @@ struct mlx5_wqe_signature_seg {
292 u8 rsvd1[11]; 303 u8 rsvd1[11];
293}; 304};
294 305
306#define MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK 0x3ff
307
295struct mlx5_wqe_inline_seg { 308struct mlx5_wqe_inline_seg {
296 __be32 byte_count; 309 __be32 byte_count;
297}; 310};
@@ -360,9 +373,46 @@ struct mlx5_stride_block_ctrl_seg {
360 __be16 num_entries; 373 __be16 num_entries;
361}; 374};
362 375
376enum mlx5_pagefault_flags {
377 MLX5_PFAULT_REQUESTOR = 1 << 0,
378 MLX5_PFAULT_WRITE = 1 << 1,
379 MLX5_PFAULT_RDMA = 1 << 2,
380};
381
382/* Contains the details of a pagefault. */
383struct mlx5_pagefault {
384 u32 bytes_committed;
385 u8 event_subtype;
386 enum mlx5_pagefault_flags flags;
387 union {
388 /* Initiator or send message responder pagefault details. */
389 struct {
390 /* Received packet size, only valid for responders. */
391 u32 packet_size;
392 /*
393 * WQE index. Refers to either the send queue or
394 * receive queue, according to event_subtype.
395 */
396 u16 wqe_index;
397 } wqe;
398 /* RDMA responder pagefault details */
399 struct {
400 u32 r_key;
401 /*
 402 * Received packet size; resolving at least this
 403 * size is required for forward progress.
404 */
405 u32 packet_size;
406 u32 rdma_op_len;
407 u64 rdma_va;
408 } rdma;
409 };
410};
411
363struct mlx5_core_qp { 412struct mlx5_core_qp {
364 struct mlx5_core_rsc_common common; /* must be first */ 413 struct mlx5_core_rsc_common common; /* must be first */
365 void (*event) (struct mlx5_core_qp *, int); 414 void (*event) (struct mlx5_core_qp *, int);
415 void (*pfault_handler)(struct mlx5_core_qp *, struct mlx5_pagefault *);
366 int qpn; 416 int qpn;
367 struct mlx5_rsc_debug *dbg; 417 struct mlx5_rsc_debug *dbg;
368 int pid; 418 int pid;
@@ -530,6 +580,17 @@ static inline struct mlx5_core_mr *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u
530 return radix_tree_lookup(&dev->priv.mr_table.tree, key); 580 return radix_tree_lookup(&dev->priv.mr_table.tree, key);
531} 581}
532 582
583struct mlx5_page_fault_resume_mbox_in {
584 struct mlx5_inbox_hdr hdr;
585 __be32 flags_qpn;
586 u8 reserved[4];
587};
588
589struct mlx5_page_fault_resume_mbox_out {
590 struct mlx5_outbox_hdr hdr;
591 u8 rsvd[8];
592};
593
533int mlx5_core_create_qp(struct mlx5_core_dev *dev, 594int mlx5_core_create_qp(struct mlx5_core_dev *dev,
534 struct mlx5_core_qp *qp, 595 struct mlx5_core_qp *qp,
535 struct mlx5_create_qp_mbox_in *in, 596 struct mlx5_create_qp_mbox_in *in,
@@ -549,6 +610,10 @@ void mlx5_init_qp_table(struct mlx5_core_dev *dev);
549void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev); 610void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev);
550int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); 611int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
551void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); 612void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
613#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
614int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn,
615 u8 context, int error);
616#endif
552 617
553static inline const char *mlx5_qp_type_str(int type) 618static inline const char *mlx5_qp_type_str(int type)
554{ 619{
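The new MLX5_WQE_CTRL_* masks describe the layout of the WQE control segment so a page-fault handler can recover the opcode and WQE index of the faulting request from the queue buffer. A decode sketch; the helpers are illustrative, but they use the same masks the mlx5_ib WQE parsing in this series relies on:

#include <linux/mlx5/qp.h>

static u16 example_ctrl_wqe_index(const struct mlx5_wqe_ctrl_seg *ctrl)
{
	u32 val = be32_to_cpu(ctrl->opmod_idx_opcode);

	return (val & MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
	       MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
}

static u8 example_ctrl_opcode(const struct mlx5_wqe_ctrl_seg *ctrl)
{
	return be32_to_cpu(ctrl->opmod_idx_opcode) & MLX5_WQE_CTRL_OPCODE_MASK;
}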
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index a2bf41e0bde9..2d83cfd7e6ce 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -38,11 +38,12 @@
38#include <linux/workqueue.h> 38#include <linux/workqueue.h>
39 39
40struct ib_ucontext; 40struct ib_ucontext;
41struct ib_umem_odp;
41 42
42struct ib_umem { 43struct ib_umem {
43 struct ib_ucontext *context; 44 struct ib_ucontext *context;
44 size_t length; 45 size_t length;
45 int offset; 46 unsigned long address;
46 int page_size; 47 int page_size;
47 int writable; 48 int writable;
48 int hugetlb; 49 int hugetlb;
@@ -50,17 +51,43 @@ struct ib_umem {
50 struct pid *pid; 51 struct pid *pid;
51 struct mm_struct *mm; 52 struct mm_struct *mm;
52 unsigned long diff; 53 unsigned long diff;
54 struct ib_umem_odp *odp_data;
53 struct sg_table sg_head; 55 struct sg_table sg_head;
54 int nmap; 56 int nmap;
55 int npages; 57 int npages;
56}; 58};
57 59
60/* Returns the offset of the umem start relative to the first page. */
61static inline int ib_umem_offset(struct ib_umem *umem)
62{
63 return umem->address & ((unsigned long)umem->page_size - 1);
64}
65
66/* Returns the first page of an ODP umem. */
67static inline unsigned long ib_umem_start(struct ib_umem *umem)
68{
69 return umem->address - ib_umem_offset(umem);
70}
71
72/* Returns the address of the page after the last one of an ODP umem. */
73static inline unsigned long ib_umem_end(struct ib_umem *umem)
74{
75 return PAGE_ALIGN(umem->address + umem->length);
76}
77
78static inline size_t ib_umem_num_pages(struct ib_umem *umem)
79{
80 return (ib_umem_end(umem) - ib_umem_start(umem)) >> PAGE_SHIFT;
81}
82
58#ifdef CONFIG_INFINIBAND_USER_MEM 83#ifdef CONFIG_INFINIBAND_USER_MEM
59 84
60struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, 85struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
61 size_t size, int access, int dmasync); 86 size_t size, int access, int dmasync);
62void ib_umem_release(struct ib_umem *umem); 87void ib_umem_release(struct ib_umem *umem);
63int ib_umem_page_count(struct ib_umem *umem); 88int ib_umem_page_count(struct ib_umem *umem);
89int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
90 size_t length);
64 91
65#else /* CONFIG_INFINIBAND_USER_MEM */ 92#else /* CONFIG_INFINIBAND_USER_MEM */
66 93
@@ -73,7 +100,10 @@ static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context,
73} 100}
74static inline void ib_umem_release(struct ib_umem *umem) { } 101static inline void ib_umem_release(struct ib_umem *umem) { }
75static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } 102static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; }
76 103static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
104 size_t length) {
105 return -EINVAL;
106}
77#endif /* CONFIG_INFINIBAND_USER_MEM */ 107#endif /* CONFIG_INFINIBAND_USER_MEM */
78 108
79#endif /* IB_UMEM_H */ 109#endif /* IB_UMEM_H */
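Taken together, the new inline helpers express the page-aligned extent of a umem. As a worked example, assuming a 4 KiB page size, a umem registered at address 0x1003 with length 0x2000 (so its last byte is 0x3002) gives:

	ib_umem_offset(umem)    == 0x003   /* start within its first page  */
	ib_umem_start(umem)     == 0x1000  /* first page of the mapping    */
	ib_umem_end(umem)       == 0x4000  /* page after the last byte     */
	ib_umem_num_pages(umem) == 3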
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
new file mode 100644
index 000000000000..3da0b167041b
--- /dev/null
+++ b/include/rdma/ib_umem_odp.h
@@ -0,0 +1,160 @@
1/*
2 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33#ifndef IB_UMEM_ODP_H
34#define IB_UMEM_ODP_H
35
36#include <rdma/ib_umem.h>
37#include <rdma/ib_verbs.h>
38#include <linux/interval_tree.h>
39
40struct umem_odp_node {
41 u64 __subtree_last;
42 struct rb_node rb;
43};
44
45struct ib_umem_odp {
46 /*
47 * An array of the pages included in the on-demand paging umem.
 48 * Entries for pages that are currently not mapped into the device
 49 * are NULL.
50 */
51 struct page **page_list;
52 /*
53 * An array of the same size as page_list, with DMA addresses mapped
 54 * for the pages in page_list. The lower two bits designate
55 * access permissions. See ODP_READ_ALLOWED_BIT and
56 * ODP_WRITE_ALLOWED_BIT.
57 */
58 dma_addr_t *dma_list;
59 /*
60 * The umem_mutex protects the page_list and dma_list fields of an ODP
61 * umem, allowing only a single thread to map/unmap pages. The mutex
62 * also protects access to the mmu notifier counters.
63 */
64 struct mutex umem_mutex;
65 void *private; /* for the HW driver to use. */
66
67 /* When false, use the notifier counter in the ucontext struct. */
68 bool mn_counters_active;
69 int notifiers_seq;
70 int notifiers_count;
71
72 /* A linked list of umems that don't have private mmu notifier
73 * counters yet. */
74 struct list_head no_private_counters;
75 struct ib_umem *umem;
76
77 /* Tree tracking */
78 struct umem_odp_node interval_tree;
79
80 struct completion notifier_completion;
81 int dying;
82};
83
84#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
85
86int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem);
87
88void ib_umem_odp_release(struct ib_umem *umem);
89
90/*
91 * The lower 2 bits of the DMA address signal the R/W permissions for
92 * the entry. To upgrade the permissions, provide the appropriate
93 * bitmask to the map_dma_pages function.
94 *
95 * Be aware that upgrading a mapped address might result in change of
96 * the DMA address for the page.
97 */
98#define ODP_READ_ALLOWED_BIT (1<<0ULL)
99#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
100
101#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
102
103int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt,
104 u64 access_mask, unsigned long current_seq);
105
106void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset,
107 u64 bound);
108
109void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root);
110void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root);
111typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end,
112 void *cookie);
113/*
114 * Call the callback on each ib_umem in the range. Returns the logical or of
115 * the return values of the functions called.
116 */
117int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end,
118 umem_call_back cb, void *cookie);
119
120struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root,
121 u64 start, u64 last);
122struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node,
123 u64 start, u64 last);
124
125static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item,
126 unsigned long mmu_seq)
127{
128 /*
129 * This code is strongly based on the KVM code from
130 * mmu_notifier_retry. Should be called with
131 * the relevant locks taken (item->odp_data->umem_mutex
 132 * and the ucontext umem_rwsem locked for read).
133 */
134
135 /* Do not allow page faults while the new ib_umem hasn't seen a state
136 * with zero notifiers yet, and doesn't have its own valid set of
137 * private counters. */
138 if (!item->odp_data->mn_counters_active)
139 return 1;
140
141 if (unlikely(item->odp_data->notifiers_count))
142 return 1;
143 if (item->odp_data->notifiers_seq != mmu_seq)
144 return 1;
145 return 0;
146}
147
148#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
149
150static inline int ib_umem_odp_get(struct ib_ucontext *context,
151 struct ib_umem *umem)
152{
153 return -EINVAL;
154}
155
156static inline void ib_umem_odp_release(struct ib_umem *umem) {}
157
158#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
159
160#endif /* IB_UMEM_ODP_H */
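ib_umem_mmu_notifier_retry() is meant to be used in the same snapshot/recheck pattern as KVM's mmu_notifier_retry: sample the sequence count before mapping pages, then recheck under umem_mutex before handing the result to the device. A sketch of that pattern on top of the API above; example_update_device_mtts() is hypothetical, but the mlx5 fault handler in this series follows the same shape.

#include <rdma/ib_umem_odp.h>

/* Hypothetical device-specific step supplied by the driver. */
int example_update_device_mtts(struct ib_umem *umem, u64 io_virt, int npages);

static int example_fault_range(struct ib_umem *umem, u64 io_virt, u64 bcnt,
			       u64 access_mask)
{
	unsigned long current_seq;
	int npages, ret = 0;

	/* Snapshot the notifier sequence before touching page state. */
	current_seq = ACCESS_ONCE(umem->odp_data->notifiers_seq);
	smp_rmb();

	npages = ib_umem_odp_map_dma_pages(umem, io_virt, bcnt, access_mask,
					   current_seq);
	if (npages < 0)
		return npages;

	mutex_lock(&umem->odp_data->umem_mutex);
	if (!ib_umem_mmu_notifier_retry(umem, current_seq))
		ret = example_update_device_mtts(umem, io_virt, npages);
	else
		ret = -EAGAIN;	/* an invalidation raced with us; retry */
	mutex_unlock(&umem->odp_data->umem_mutex);

	return ret;
}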
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 470a011d6fa4..0d74f1de99aa 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -51,6 +51,7 @@
51#include <uapi/linux/if_ether.h> 51#include <uapi/linux/if_ether.h>
52 52
53#include <linux/atomic.h> 53#include <linux/atomic.h>
54#include <linux/mmu_notifier.h>
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55 56
56extern struct workqueue_struct *ib_wq; 57extern struct workqueue_struct *ib_wq;
@@ -123,7 +124,8 @@ enum ib_device_cap_flags {
123 IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23), 124 IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23),
124 IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24), 125 IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24),
125 IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), 126 IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29),
126 IB_DEVICE_SIGNATURE_HANDOVER = (1<<30) 127 IB_DEVICE_SIGNATURE_HANDOVER = (1<<30),
128 IB_DEVICE_ON_DEMAND_PAGING = (1<<31),
127}; 129};
128 130
129enum ib_signature_prot_cap { 131enum ib_signature_prot_cap {
@@ -143,6 +145,27 @@ enum ib_atomic_cap {
143 IB_ATOMIC_GLOB 145 IB_ATOMIC_GLOB
144}; 146};
145 147
148enum ib_odp_general_cap_bits {
149 IB_ODP_SUPPORT = 1 << 0,
150};
151
152enum ib_odp_transport_cap_bits {
153 IB_ODP_SUPPORT_SEND = 1 << 0,
154 IB_ODP_SUPPORT_RECV = 1 << 1,
155 IB_ODP_SUPPORT_WRITE = 1 << 2,
156 IB_ODP_SUPPORT_READ = 1 << 3,
157 IB_ODP_SUPPORT_ATOMIC = 1 << 4,
158};
159
160struct ib_odp_caps {
161 uint64_t general_caps;
162 struct {
163 uint32_t rc_odp_caps;
164 uint32_t uc_odp_caps;
165 uint32_t ud_odp_caps;
166 } per_transport_caps;
167};
168
146struct ib_device_attr { 169struct ib_device_attr {
147 u64 fw_ver; 170 u64 fw_ver;
148 __be64 sys_image_guid; 171 __be64 sys_image_guid;
@@ -186,6 +209,7 @@ struct ib_device_attr {
186 u8 local_ca_ack_delay; 209 u8 local_ca_ack_delay;
187 int sig_prot_cap; 210 int sig_prot_cap;
188 int sig_guard_cap; 211 int sig_guard_cap;
212 struct ib_odp_caps odp_caps;
189}; 213};
190 214
191enum ib_mtu { 215enum ib_mtu {
@@ -1073,7 +1097,8 @@ enum ib_access_flags {
1073 IB_ACCESS_REMOTE_READ = (1<<2), 1097 IB_ACCESS_REMOTE_READ = (1<<2),
1074 IB_ACCESS_REMOTE_ATOMIC = (1<<3), 1098 IB_ACCESS_REMOTE_ATOMIC = (1<<3),
1075 IB_ACCESS_MW_BIND = (1<<4), 1099 IB_ACCESS_MW_BIND = (1<<4),
1076 IB_ZERO_BASED = (1<<5) 1100 IB_ZERO_BASED = (1<<5),
1101 IB_ACCESS_ON_DEMAND = (1<<6),
1077}; 1102};
1078 1103
1079struct ib_phys_buf { 1104struct ib_phys_buf {
@@ -1115,6 +1140,8 @@ struct ib_fmr_attr {
1115 u8 page_shift; 1140 u8 page_shift;
1116}; 1141};
1117 1142
1143struct ib_umem;
1144
1118struct ib_ucontext { 1145struct ib_ucontext {
1119 struct ib_device *device; 1146 struct ib_device *device;
1120 struct list_head pd_list; 1147 struct list_head pd_list;
@@ -1127,6 +1154,24 @@ struct ib_ucontext {
1127 struct list_head xrcd_list; 1154 struct list_head xrcd_list;
1128 struct list_head rule_list; 1155 struct list_head rule_list;
1129 int closing; 1156 int closing;
1157
1158 struct pid *tgid;
1159#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1160 struct rb_root umem_tree;
1161 /*
 1162 * Protects umem_tree, as well as odp_mrs_count and
1163 * mmu notifiers registration.
1164 */
1165 struct rw_semaphore umem_rwsem;
1166 void (*invalidate_range)(struct ib_umem *umem,
1167 unsigned long start, unsigned long end);
1168
1169 struct mmu_notifier mn;
1170 atomic_t notifier_count;
1171 /* A list of umems that don't have private mmu notifier counters yet. */
1172 struct list_head no_private_counters;
1173 int odp_mrs_count;
1174#endif
1130}; 1175};
1131 1176
1132struct ib_uobject { 1177struct ib_uobject {
@@ -1662,7 +1707,10 @@ static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t
1662 1707
1663static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) 1708static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
1664{ 1709{
1665 return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; 1710 size_t copy_sz;
1711
1712 copy_sz = min_t(size_t, len, udata->outlen);
1713 return copy_to_user(udata->outbuf, src, copy_sz) ? -EFAULT : 0;
1666} 1714}
1667 1715
1668/** 1716/**
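On the verbs consumer side, the new capability plumbing lets a ULP test for ODP support before asking for IB_ACCESS_ON_DEMAND registrations. A kernel-side sketch using only the fields added above; the wrapper is illustrative, and ib_query_device() is the existing query call of this era.

#include <rdma/ib_verbs.h>

static bool example_rc_odp_write_supported(struct ib_device *device)
{
	struct ib_device_attr attr;

	if (ib_query_device(device, &attr))
		return false;

	return (attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING) &&
	       (attr.odp_caps.general_caps & IB_ODP_SUPPORT) &&
	       (attr.odp_caps.per_transport_caps.rc_odp_caps &
		IB_ODP_SUPPORT_WRITE);
}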
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 26daf55ff76e..4275b961bf60 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -90,8 +90,9 @@ enum {
90}; 90};
91 91
92enum { 92enum {
93 IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE,
93 IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, 94 IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD,
94 IB_USER_VERBS_EX_CMD_DESTROY_FLOW 95 IB_USER_VERBS_EX_CMD_DESTROY_FLOW,
95}; 96};
96 97
97/* 98/*
@@ -201,6 +202,32 @@ struct ib_uverbs_query_device_resp {
201 __u8 reserved[4]; 202 __u8 reserved[4];
202}; 203};
203 204
205enum {
206 IB_USER_VERBS_EX_QUERY_DEVICE_ODP = 1ULL << 0,
207};
208
209struct ib_uverbs_ex_query_device {
210 __u32 comp_mask;
211 __u32 reserved;
212};
213
214struct ib_uverbs_odp_caps {
215 __u64 general_caps;
216 struct {
217 __u32 rc_odp_caps;
218 __u32 uc_odp_caps;
219 __u32 ud_odp_caps;
220 } per_transport_caps;
221 __u32 reserved;
222};
223
224struct ib_uverbs_ex_query_device_resp {
225 struct ib_uverbs_query_device_resp base;
226 __u32 comp_mask;
227 __u32 reserved;
228 struct ib_uverbs_odp_caps odp_caps;
229};
230
204struct ib_uverbs_query_port { 231struct ib_uverbs_query_port {
205 __u64 response; 232 __u64 response;
206 __u8 port_num; 233 __u8 port_num;
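The extended query-device response exposes the same ODP capability words to userspace. Assuming the IB_USER_VERBS_EX_QUERY_DEVICE_ODP bit in the response comp_mask marks the odp_caps field as valid, as the enum above suggests, a consumer built directly on the uapi structures might look like this hypothetical sketch:

#include <rdma/ib_user_verbs.h>

/* Hypothetical helper on top of the raw extended response. */
static int example_get_odp_caps(const struct ib_uverbs_ex_query_device_resp *resp,
				struct ib_uverbs_odp_caps *caps)
{
	if (!(resp->comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP))
		return -1;	/* kernel did not report ODP capabilities */

	*caps = resp->odp_caps;
	return 0;
}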