50 files changed, 3499 insertions, 451 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 77089399359b..b899531498eb 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -38,6 +38,17 @@ config INFINIBAND_USER_MEM | |||
38 | depends on INFINIBAND_USER_ACCESS != n | 38 | depends on INFINIBAND_USER_ACCESS != n |
39 | default y | 39 | default y |
40 | 40 | ||
41 | config INFINIBAND_ON_DEMAND_PAGING | ||
42 | bool "InfiniBand on-demand paging support" | ||
43 | depends on INFINIBAND_USER_MEM | ||
44 | select MMU_NOTIFIER | ||
45 | default y | ||
46 | ---help--- | ||
47 | On demand paging support for the InfiniBand subsystem. | ||
48 | Together with driver support this allows registration of | ||
49 | memory regions without pinning their pages, fetching the | ||
50 | pages on demand instead. | ||
51 | |||
41 | config INFINIBAND_ADDR_TRANS | 52 | config INFINIBAND_ADDR_TRANS |
42 | bool | 53 | bool |
43 | depends on INFINIBAND | 54 | depends on INFINIBAND |
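The help text above is the whole userspace-visible story: with a capable driver, a memory region can be registered without pinning its pages, and the HCA faults them in on access. A minimal sketch of how an application might request such a region through libibverbs follows; it assumes a libibverbs that exposes the IBV_ACCESS_ON_DEMAND flag and an already-opened protection domain, and is illustrative rather than part of this patch.

#include <stdio.h>
#include <infiniband/verbs.h>

/* Sketch: register an ODP memory region from userspace. Assumes the
 * installed libibverbs defines IBV_ACCESS_ON_DEMAND and that the device
 * advertises ODP support; otherwise ibv_reg_mr() is expected to fail. */
static struct ibv_mr *reg_odp_mr(struct ibv_pd *pd, void *buf, size_t len)
{
	struct ibv_mr *mr;

	/* No pages are pinned here; faults are served on first access. */
	mr = ibv_reg_mr(pd, buf, len,
			IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
	if (!mr)
		perror("ibv_reg_mr(ODP)");
	return mr;
}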
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index ffd0af6734af..acf736764445 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ | |||
11 | ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ | 11 | ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ |
12 | device.o fmr_pool.o cache.o netlink.o | 12 | device.o fmr_pool.o cache.o netlink.o |
13 | ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o | 13 | ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o |
14 | ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o | ||
14 | 15 | ||
15 | ib_mad-y := mad.o smi.o agent.o mad_rmpp.o | 16 | ib_mad-y := mad.o smi.o agent.o mad_rmpp.o |
16 | 17 | ||
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index df0c4f605a21..aec7a6aa2951 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/hugetlb.h> | 39 | #include <linux/hugetlb.h> |
40 | #include <linux/dma-attrs.h> | 40 | #include <linux/dma-attrs.h> |
41 | #include <linux/slab.h> | 41 | #include <linux/slab.h> |
42 | #include <rdma/ib_umem_odp.h> | ||
42 | 43 | ||
43 | #include "uverbs.h" | 44 | #include "uverbs.h" |
44 | 45 | ||
@@ -69,6 +70,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d | |||
69 | 70 | ||
70 | /** | 71 | /** |
71 | * ib_umem_get - Pin and DMA map userspace memory. | 72 | * ib_umem_get - Pin and DMA map userspace memory. |
73 | * | ||
74 | * If access flags indicate ODP memory, avoid pinning. Instead, store | ||
75 | * the mm for future page fault handling in conjunction with MMU notifiers. | ||
76 | * | ||
72 | * @context: userspace context to pin memory for | 77 | * @context: userspace context to pin memory for |
73 | * @addr: userspace virtual address to start at | 78 | * @addr: userspace virtual address to start at |
74 | * @size: length of region to pin | 79 | * @size: length of region to pin |
@@ -103,17 +108,30 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, | |||
103 | 108 | ||
104 | umem->context = context; | 109 | umem->context = context; |
105 | umem->length = size; | 110 | umem->length = size; |
106 | umem->offset = addr & ~PAGE_MASK; | 111 | umem->address = addr; |
107 | umem->page_size = PAGE_SIZE; | 112 | umem->page_size = PAGE_SIZE; |
108 | umem->pid = get_task_pid(current, PIDTYPE_PID); | 113 | umem->pid = get_task_pid(current, PIDTYPE_PID); |
109 | /* | 114 | /* |
110 | * We ask for writable memory if any access flags other than | 115 | * We ask for writable memory if any of the following |
111 | * "remote read" are set. "Local write" and "remote write" | 116 | * access flags are set. "Local write" and "remote write" |
112 | * obviously require write access. "Remote atomic" can do | 117 | * obviously require write access. "Remote atomic" can do |
113 | * things like fetch and add, which will modify memory, and | 118 | * things like fetch and add, which will modify memory, and |
114 | * "MW bind" can change permissions by binding a window. | 119 | * "MW bind" can change permissions by binding a window. |
115 | */ | 120 | */ |
116 | umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); | 121 | umem->writable = !!(access & |
122 | (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | | ||
123 | IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND)); | ||
124 | |||
125 | if (access & IB_ACCESS_ON_DEMAND) { | ||
126 | ret = ib_umem_odp_get(context, umem); | ||
127 | if (ret) { | ||
128 | kfree(umem); | ||
129 | return ERR_PTR(ret); | ||
130 | } | ||
131 | return umem; | ||
132 | } | ||
133 | |||
134 | umem->odp_data = NULL; | ||
117 | 135 | ||
118 | /* We assume the memory is from hugetlb until proved otherwise */ | 136 | /* We assume the memory is from hugetlb until proved otherwise */ |
119 | umem->hugetlb = 1; | 137 | umem->hugetlb = 1; |
@@ -132,7 +150,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, | |||
132 | if (!vma_list) | 150 | if (!vma_list) |
133 | umem->hugetlb = 0; | 151 | umem->hugetlb = 0; |
134 | 152 | ||
135 | npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT; | 153 | npages = ib_umem_num_pages(umem); |
136 | 154 | ||
137 | down_write(¤t->mm->mmap_sem); | 155 | down_write(¤t->mm->mmap_sem); |
138 | 156 | ||
@@ -235,6 +253,11 @@ void ib_umem_release(struct ib_umem *umem) | |||
235 | struct task_struct *task; | 253 | struct task_struct *task; |
236 | unsigned long diff; | 254 | unsigned long diff; |
237 | 255 | ||
256 | if (umem->odp_data) { | ||
257 | ib_umem_odp_release(umem); | ||
258 | return; | ||
259 | } | ||
260 | |||
238 | __ib_umem_release(umem->context->device, umem, 1); | 261 | __ib_umem_release(umem->context->device, umem, 1); |
239 | 262 | ||
240 | task = get_pid_task(umem->pid, PIDTYPE_PID); | 263 | task = get_pid_task(umem->pid, PIDTYPE_PID); |
@@ -246,7 +269,7 @@ void ib_umem_release(struct ib_umem *umem) | |||
246 | if (!mm) | 269 | if (!mm) |
247 | goto out; | 270 | goto out; |
248 | 271 | ||
249 | diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT; | 272 | diff = ib_umem_num_pages(umem); |
250 | 273 | ||
251 | /* | 274 | /* |
252 | * We may be called with the mm's mmap_sem already held. This | 275 | * We may be called with the mm's mmap_sem already held. This |
@@ -283,6 +306,9 @@ int ib_umem_page_count(struct ib_umem *umem) | |||
283 | int n; | 306 | int n; |
284 | struct scatterlist *sg; | 307 | struct scatterlist *sg; |
285 | 308 | ||
309 | if (umem->odp_data) | ||
310 | return ib_umem_num_pages(umem); | ||
311 | |||
286 | shift = ilog2(umem->page_size); | 312 | shift = ilog2(umem->page_size); |
287 | 313 | ||
288 | n = 0; | 314 | n = 0; |
@@ -292,3 +318,37 @@ int ib_umem_page_count(struct ib_umem *umem) | |||
292 | return n; | 318 | return n; |
293 | } | 319 | } |
294 | EXPORT_SYMBOL(ib_umem_page_count); | 320 | EXPORT_SYMBOL(ib_umem_page_count); |
321 | |||
322 | /* | ||
323 | * Copy from the given ib_umem's pages to the given buffer. | ||
324 | * | ||
325 | * umem - the umem to copy from | ||
326 | * offset - offset to start copying from | ||
327 | * dst - destination buffer | ||
328 | * length - buffer length | ||
329 | * | ||
330 | * Returns 0 on success, or an error code. | ||
331 | */ | ||
332 | int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, | ||
333 | size_t length) | ||
334 | { | ||
335 | size_t end = offset + length; | ||
336 | int ret; | ||
337 | |||
338 | if (offset > umem->length || length > umem->length - offset) { | ||
339 | pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n", | ||
340 | offset, umem->length, end); | ||
341 | return -EINVAL; | ||
342 | } | ||
343 | |||
344 | ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length, | ||
345 | offset + ib_umem_offset(umem)); | ||
346 | |||
347 | if (ret < 0) | ||
348 | return ret; | ||
349 | else if (ret != length) | ||
350 | return -EINVAL; | ||
351 | else | ||
352 | return 0; | ||
353 | } | ||
354 | EXPORT_SYMBOL(ib_umem_copy_from); | ||
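The umem.c changes above replace the stored page offset with the full starting address and move the page-count arithmetic into helpers (ib_umem_offset(), ib_umem_num_pages(), and the ib_umem_start()/ib_umem_end() bounds used later by the ODP code). Their authoritative definitions live in include/rdma/ib_umem.h, which this excerpt does not show; the sketch below is a plausible shape for them, inferred from how they are used here.

/* Plausible shape of the ib_umem helpers used above (assumed to be
 * static inlines in include/rdma/ib_umem.h, not shown in this diff). */
static inline int ib_umem_offset(struct ib_umem *umem)
{
	/* Offset of the start address within its first page. */
	return umem->address & ((unsigned long)umem->page_size - 1);
}

static inline unsigned long ib_umem_start(struct ib_umem *umem)
{
	return umem->address - ib_umem_offset(umem);
}

static inline unsigned long ib_umem_end(struct ib_umem *umem)
{
	return PAGE_ALIGN(umem->address + umem->length);
}

static inline size_t ib_umem_num_pages(struct ib_umem *umem)
{
	return (ib_umem_end(umem) - ib_umem_start(umem)) >> PAGE_SHIFT;
}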
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
new file mode 100644
index 000000000000..6095872549e7
--- /dev/null
+++ b/drivers/infiniband/core/umem_odp.c
@@ -0,0 +1,668 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2014 Mellanox Technologies. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | */ | ||
32 | |||
33 | #include <linux/types.h> | ||
34 | #include <linux/sched.h> | ||
35 | #include <linux/pid.h> | ||
36 | #include <linux/slab.h> | ||
37 | #include <linux/export.h> | ||
38 | #include <linux/vmalloc.h> | ||
39 | |||
40 | #include <rdma/ib_verbs.h> | ||
41 | #include <rdma/ib_umem.h> | ||
42 | #include <rdma/ib_umem_odp.h> | ||
43 | |||
44 | static void ib_umem_notifier_start_account(struct ib_umem *item) | ||
45 | { | ||
46 | mutex_lock(&item->odp_data->umem_mutex); | ||
47 | |||
48 | /* Only update private counters for this umem if it has them. | ||
49 | * Otherwise skip it. All page faults will be delayed for this umem. */ | ||
50 | if (item->odp_data->mn_counters_active) { | ||
51 | int notifiers_count = item->odp_data->notifiers_count++; | ||
52 | |||
53 | if (notifiers_count == 0) | ||
54 | /* Initialize the completion object for waiting on | ||
55 | * notifiers. Since notifier_count is zero, no one | ||
56 | * should be waiting right now. */ | ||
57 | reinit_completion(&item->odp_data->notifier_completion); | ||
58 | } | ||
59 | mutex_unlock(&item->odp_data->umem_mutex); | ||
60 | } | ||
61 | |||
62 | static void ib_umem_notifier_end_account(struct ib_umem *item) | ||
63 | { | ||
64 | mutex_lock(&item->odp_data->umem_mutex); | ||
65 | |||
66 | /* Only update private counters for this umem if it has them. | ||
67 | * Otherwise skip it. All page faults will be delayed for this umem. */ | ||
68 | if (item->odp_data->mn_counters_active) { | ||
69 | /* | ||
70 | * This sequence increase will notify the QP page fault handler | ||
71 | * that the page that is about to be mapped could have | ||
72 | * been freed. | ||
73 | */ | ||
74 | ++item->odp_data->notifiers_seq; | ||
75 | if (--item->odp_data->notifiers_count == 0) | ||
76 | complete_all(&item->odp_data->notifier_completion); | ||
77 | } | ||
78 | mutex_unlock(&item->odp_data->umem_mutex); | ||
79 | } | ||
80 | |||
81 | /* Account for a new mmu notifier in an ib_ucontext. */ | ||
82 | static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) | ||
83 | { | ||
84 | atomic_inc(&context->notifier_count); | ||
85 | } | ||
86 | |||
87 | /* Account for a terminating mmu notifier in an ib_ucontext. | ||
88 | * | ||
89 | * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since | ||
90 | * the function takes the semaphore itself. */ | ||
91 | static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) | ||
92 | { | ||
93 | int zero_notifiers = atomic_dec_and_test(&context->notifier_count); | ||
94 | |||
95 | if (zero_notifiers && | ||
96 | !list_empty(&context->no_private_counters)) { | ||
97 | /* No currently running mmu notifiers. Now is the chance to | ||
98 | * add private accounting to all previously added umems. */ | ||
99 | struct ib_umem_odp *odp_data, *next; | ||
100 | |||
101 | /* Prevent concurrent mmu notifiers from working on the | ||
102 | * no_private_counters list. */ | ||
103 | down_write(&context->umem_rwsem); | ||
104 | |||
105 | /* Read the notifier_count again, with the umem_rwsem | ||
106 | * semaphore taken for write. */ | ||
107 | if (!atomic_read(&context->notifier_count)) { | ||
108 | list_for_each_entry_safe(odp_data, next, | ||
109 | &context->no_private_counters, | ||
110 | no_private_counters) { | ||
111 | mutex_lock(&odp_data->umem_mutex); | ||
112 | odp_data->mn_counters_active = true; | ||
113 | list_del(&odp_data->no_private_counters); | ||
114 | complete_all(&odp_data->notifier_completion); | ||
115 | mutex_unlock(&odp_data->umem_mutex); | ||
116 | } | ||
117 | } | ||
118 | |||
119 | up_write(&context->umem_rwsem); | ||
120 | } | ||
121 | } | ||
122 | |||
123 | static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, | ||
124 | u64 end, void *cookie) { | ||
125 | /* | ||
126 | * Increase the number of notifiers running, to | ||
127 | * prevent any further fault handling on this MR. | ||
128 | */ | ||
129 | ib_umem_notifier_start_account(item); | ||
130 | item->odp_data->dying = 1; | ||
131 | /* Make sure that the fact that the umem is dying is visible before we | ||
132 | * release all pending page faults. */ | ||
133 | smp_wmb(); | ||
134 | complete_all(&item->odp_data->notifier_completion); | ||
135 | item->context->invalidate_range(item, ib_umem_start(item), | ||
136 | ib_umem_end(item)); | ||
137 | return 0; | ||
138 | } | ||
139 | |||
140 | static void ib_umem_notifier_release(struct mmu_notifier *mn, | ||
141 | struct mm_struct *mm) | ||
142 | { | ||
143 | struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); | ||
144 | |||
145 | if (!context->invalidate_range) | ||
146 | return; | ||
147 | |||
148 | ib_ucontext_notifier_start_account(context); | ||
149 | down_read(&context->umem_rwsem); | ||
150 | rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, | ||
151 | ULLONG_MAX, | ||
152 | ib_umem_notifier_release_trampoline, | ||
153 | NULL); | ||
154 | up_read(&context->umem_rwsem); | ||
155 | } | ||
156 | |||
157 | static int invalidate_page_trampoline(struct ib_umem *item, u64 start, | ||
158 | u64 end, void *cookie) | ||
159 | { | ||
160 | ib_umem_notifier_start_account(item); | ||
161 | item->context->invalidate_range(item, start, start + PAGE_SIZE); | ||
162 | ib_umem_notifier_end_account(item); | ||
163 | return 0; | ||
164 | } | ||
165 | |||
166 | static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn, | ||
167 | struct mm_struct *mm, | ||
168 | unsigned long address) | ||
169 | { | ||
170 | struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); | ||
171 | |||
172 | if (!context->invalidate_range) | ||
173 | return; | ||
174 | |||
175 | ib_ucontext_notifier_start_account(context); | ||
176 | down_read(&context->umem_rwsem); | ||
177 | rbt_ib_umem_for_each_in_range(&context->umem_tree, address, | ||
178 | address + PAGE_SIZE, | ||
179 | invalidate_page_trampoline, NULL); | ||
180 | up_read(&context->umem_rwsem); | ||
181 | ib_ucontext_notifier_end_account(context); | ||
182 | } | ||
183 | |||
184 | static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, | ||
185 | u64 end, void *cookie) | ||
186 | { | ||
187 | ib_umem_notifier_start_account(item); | ||
188 | item->context->invalidate_range(item, start, end); | ||
189 | return 0; | ||
190 | } | ||
191 | |||
192 | static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, | ||
193 | struct mm_struct *mm, | ||
194 | unsigned long start, | ||
195 | unsigned long end) | ||
196 | { | ||
197 | struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); | ||
198 | |||
199 | if (!context->invalidate_range) | ||
200 | return; | ||
201 | |||
202 | ib_ucontext_notifier_start_account(context); | ||
203 | down_read(&context->umem_rwsem); | ||
204 | rbt_ib_umem_for_each_in_range(&context->umem_tree, start, | ||
205 | end, | ||
206 | invalidate_range_start_trampoline, NULL); | ||
207 | up_read(&context->umem_rwsem); | ||
208 | } | ||
209 | |||
210 | static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, | ||
211 | u64 end, void *cookie) | ||
212 | { | ||
213 | ib_umem_notifier_end_account(item); | ||
214 | return 0; | ||
215 | } | ||
216 | |||
217 | static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, | ||
218 | struct mm_struct *mm, | ||
219 | unsigned long start, | ||
220 | unsigned long end) | ||
221 | { | ||
222 | struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); | ||
223 | |||
224 | if (!context->invalidate_range) | ||
225 | return; | ||
226 | |||
227 | down_read(&context->umem_rwsem); | ||
228 | rbt_ib_umem_for_each_in_range(&context->umem_tree, start, | ||
229 | end, | ||
230 | invalidate_range_end_trampoline, NULL); | ||
231 | up_read(&context->umem_rwsem); | ||
232 | ib_ucontext_notifier_end_account(context); | ||
233 | } | ||
234 | |||
235 | static struct mmu_notifier_ops ib_umem_notifiers = { | ||
236 | .release = ib_umem_notifier_release, | ||
237 | .invalidate_page = ib_umem_notifier_invalidate_page, | ||
238 | .invalidate_range_start = ib_umem_notifier_invalidate_range_start, | ||
239 | .invalidate_range_end = ib_umem_notifier_invalidate_range_end, | ||
240 | }; | ||
241 | |||
242 | int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) | ||
243 | { | ||
244 | int ret_val; | ||
245 | struct pid *our_pid; | ||
246 | struct mm_struct *mm = get_task_mm(current); | ||
247 | |||
248 | if (!mm) | ||
249 | return -EINVAL; | ||
250 | |||
251 | /* Prevent creating ODP MRs in child processes */ | ||
252 | rcu_read_lock(); | ||
253 | our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); | ||
254 | rcu_read_unlock(); | ||
255 | put_pid(our_pid); | ||
256 | if (context->tgid != our_pid) { | ||
257 | ret_val = -EINVAL; | ||
258 | goto out_mm; | ||
259 | } | ||
260 | |||
261 | umem->hugetlb = 0; | ||
262 | umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); | ||
263 | if (!umem->odp_data) { | ||
264 | ret_val = -ENOMEM; | ||
265 | goto out_mm; | ||
266 | } | ||
267 | umem->odp_data->umem = umem; | ||
268 | |||
269 | mutex_init(&umem->odp_data->umem_mutex); | ||
270 | |||
271 | init_completion(&umem->odp_data->notifier_completion); | ||
272 | |||
273 | umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) * | ||
274 | sizeof(*umem->odp_data->page_list)); | ||
275 | if (!umem->odp_data->page_list) { | ||
276 | ret_val = -ENOMEM; | ||
277 | goto out_odp_data; | ||
278 | } | ||
279 | |||
280 | umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) * | ||
281 | sizeof(*umem->odp_data->dma_list)); | ||
282 | if (!umem->odp_data->dma_list) { | ||
283 | ret_val = -ENOMEM; | ||
284 | goto out_page_list; | ||
285 | } | ||
286 | |||
287 | /* | ||
288 | * When using MMU notifiers, we will get a | ||
289 | * notification before the "current" task (and MM) is | ||
290 | * destroyed. We use the umem_rwsem semaphore to synchronize. | ||
291 | */ | ||
292 | down_write(&context->umem_rwsem); | ||
293 | context->odp_mrs_count++; | ||
294 | if (likely(ib_umem_start(umem) != ib_umem_end(umem))) | ||
295 | rbt_ib_umem_insert(&umem->odp_data->interval_tree, | ||
296 | &context->umem_tree); | ||
297 | if (likely(!atomic_read(&context->notifier_count))) | ||
298 | umem->odp_data->mn_counters_active = true; | ||
299 | else | ||
300 | list_add(&umem->odp_data->no_private_counters, | ||
301 | &context->no_private_counters); | ||
302 | downgrade_write(&context->umem_rwsem); | ||
303 | |||
304 | if (context->odp_mrs_count == 1) { | ||
305 | /* | ||
306 | * Note that at this point, no MMU notifier is running | ||
307 | * for this context! | ||
308 | */ | ||
309 | atomic_set(&context->notifier_count, 0); | ||
310 | INIT_HLIST_NODE(&context->mn.hlist); | ||
311 | context->mn.ops = &ib_umem_notifiers; | ||
312 | /* | ||
313 | * Lock-dep detects a false positive for mmap_sem vs. | ||
314 | * umem_rwsem, due to not grasping downgrade_write correctly. | ||
315 | */ | ||
316 | lockdep_off(); | ||
317 | ret_val = mmu_notifier_register(&context->mn, mm); | ||
318 | lockdep_on(); | ||
319 | if (ret_val) { | ||
320 | pr_err("Failed to register mmu_notifier %d\n", ret_val); | ||
321 | ret_val = -EBUSY; | ||
322 | goto out_mutex; | ||
323 | } | ||
324 | } | ||
325 | |||
326 | up_read(&context->umem_rwsem); | ||
327 | |||
328 | /* | ||
329 | * Note that doing an mmput can cause a notifier for the relevant mm. | ||
330 | * If the notifier is called while we hold the umem_rwsem, this will | ||
331 | * cause a deadlock. Therefore, we release the reference only after we | ||
332 | * released the semaphore. | ||
333 | */ | ||
334 | mmput(mm); | ||
335 | return 0; | ||
336 | |||
337 | out_mutex: | ||
338 | up_read(&context->umem_rwsem); | ||
339 | vfree(umem->odp_data->dma_list); | ||
340 | out_page_list: | ||
341 | vfree(umem->odp_data->page_list); | ||
342 | out_odp_data: | ||
343 | kfree(umem->odp_data); | ||
344 | out_mm: | ||
345 | mmput(mm); | ||
346 | return ret_val; | ||
347 | } | ||
348 | |||
349 | void ib_umem_odp_release(struct ib_umem *umem) | ||
350 | { | ||
351 | struct ib_ucontext *context = umem->context; | ||
352 | |||
353 | /* | ||
354 | * Ensure that no more pages are mapped in the umem. | ||
355 | * | ||
356 | * It is the driver's responsibility to ensure, before calling us, | ||
357 | * that the hardware will not attempt to access the MR any more. | ||
358 | */ | ||
359 | ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), | ||
360 | ib_umem_end(umem)); | ||
361 | |||
362 | down_write(&context->umem_rwsem); | ||
363 | if (likely(ib_umem_start(umem) != ib_umem_end(umem))) | ||
364 | rbt_ib_umem_remove(&umem->odp_data->interval_tree, | ||
365 | &context->umem_tree); | ||
366 | context->odp_mrs_count--; | ||
367 | if (!umem->odp_data->mn_counters_active) { | ||
368 | list_del(&umem->odp_data->no_private_counters); | ||
369 | complete_all(&umem->odp_data->notifier_completion); | ||
370 | } | ||
371 | |||
372 | /* | ||
373 | * Downgrade the lock to a read lock. This ensures that the notifiers | ||
374 | * (who lock the mutex for reading) will be able to finish, and we | ||
375 | * will be able to eventually obtain the mmu notifiers SRCU. Note | ||
376 | * that since we are doing it atomically, no other user could register | ||
377 | * and unregister while we do the check. | ||
378 | */ | ||
379 | downgrade_write(&context->umem_rwsem); | ||
380 | if (!context->odp_mrs_count) { | ||
381 | struct task_struct *owning_process = NULL; | ||
382 | struct mm_struct *owning_mm = NULL; | ||
383 | |||
384 | owning_process = get_pid_task(context->tgid, | ||
385 | PIDTYPE_PID); | ||
386 | if (owning_process == NULL) | ||
387 | /* | ||
388 | * The process is already dead; its notifiers were | ||
389 | * already removed. | ||
390 | */ | ||
391 | goto out; | ||
392 | |||
393 | owning_mm = get_task_mm(owning_process); | ||
394 | if (owning_mm == NULL) | ||
395 | /* | ||
396 | * The process' mm is already dead; its notifiers were | ||
397 | * already removed. | ||
398 | */ | ||
399 | goto out_put_task; | ||
400 | mmu_notifier_unregister(&context->mn, owning_mm); | ||
401 | |||
402 | mmput(owning_mm); | ||
403 | |||
404 | out_put_task: | ||
405 | put_task_struct(owning_process); | ||
406 | } | ||
407 | out: | ||
408 | up_read(&context->umem_rwsem); | ||
409 | |||
410 | vfree(umem->odp_data->dma_list); | ||
411 | vfree(umem->odp_data->page_list); | ||
412 | kfree(umem->odp_data); | ||
413 | kfree(umem); | ||
414 | } | ||
415 | |||
416 | /* | ||
417 | * Map for DMA and insert a single page into the on-demand paging page tables. | ||
418 | * | ||
419 | * @umem: the umem to insert the page to. | ||
420 | * @page_index: index in the umem to add the page to. | ||
421 | * @page: the page struct to map and add. | ||
422 | * @access_mask: access permissions needed for this page. | ||
423 | * @current_seq: sequence number for synchronization with invalidations. | ||
424 | * the sequence number is taken from | ||
425 | * umem->odp_data->notifiers_seq. | ||
426 | * | ||
427 | * The function returns -EFAULT if the DMA mapping operation fails. It returns | ||
428 | * -EAGAIN if a concurrent invalidation prevents us from updating the page. | ||
429 | * | ||
430 | * The page is released via put_page even if the operation failed. For | ||
431 | * on-demand pinning, the page is released whenever it isn't stored in the | ||
432 | * umem. | ||
433 | */ | ||
434 | static int ib_umem_odp_map_dma_single_page( | ||
435 | struct ib_umem *umem, | ||
436 | int page_index, | ||
437 | u64 base_virt_addr, | ||
438 | struct page *page, | ||
439 | u64 access_mask, | ||
440 | unsigned long current_seq) | ||
441 | { | ||
442 | struct ib_device *dev = umem->context->device; | ||
443 | dma_addr_t dma_addr; | ||
444 | int stored_page = 0; | ||
445 | int remove_existing_mapping = 0; | ||
446 | int ret = 0; | ||
447 | |||
448 | mutex_lock(&umem->odp_data->umem_mutex); | ||
449 | /* | ||
450 | * Note: we avoid writing if seq is different from the initial seq, to | ||
451 | * handle the case of a racing notifier. This check also allows us to bail | ||
452 | * early if we have a notifier running in parallel with us. | ||
453 | */ | ||
454 | if (ib_umem_mmu_notifier_retry(umem, current_seq)) { | ||
455 | ret = -EAGAIN; | ||
456 | goto out; | ||
457 | } | ||
458 | if (!(umem->odp_data->dma_list[page_index])) { | ||
459 | dma_addr = ib_dma_map_page(dev, | ||
460 | page, | ||
461 | 0, PAGE_SIZE, | ||
462 | DMA_BIDIRECTIONAL); | ||
463 | if (ib_dma_mapping_error(dev, dma_addr)) { | ||
464 | ret = -EFAULT; | ||
465 | goto out; | ||
466 | } | ||
467 | umem->odp_data->dma_list[page_index] = dma_addr | access_mask; | ||
468 | umem->odp_data->page_list[page_index] = page; | ||
469 | stored_page = 1; | ||
470 | } else if (umem->odp_data->page_list[page_index] == page) { | ||
471 | umem->odp_data->dma_list[page_index] |= access_mask; | ||
472 | } else { | ||
473 | pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", | ||
474 | umem->odp_data->page_list[page_index], page); | ||
475 | /* Better remove the mapping now, to prevent any further | ||
476 | * damage. */ | ||
477 | remove_existing_mapping = 1; | ||
478 | } | ||
479 | |||
480 | out: | ||
481 | mutex_unlock(&umem->odp_data->umem_mutex); | ||
482 | |||
483 | /* On Demand Paging - avoid pinning the page */ | ||
484 | if (umem->context->invalidate_range || !stored_page) | ||
485 | put_page(page); | ||
486 | |||
487 | if (remove_existing_mapping && umem->context->invalidate_range) { | ||
488 | invalidate_page_trampoline( | ||
489 | umem, | ||
490 | base_virt_addr + (page_index * PAGE_SIZE), | ||
491 | base_virt_addr + ((page_index+1)*PAGE_SIZE), | ||
492 | NULL); | ||
493 | ret = -EAGAIN; | ||
494 | } | ||
495 | |||
496 | return ret; | ||
497 | } | ||
498 | |||
499 | /** | ||
500 | * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR. | ||
501 | * | ||
502 | * Pins the range of pages passed in the argument, and maps them to | ||
503 | * DMA addresses. The DMA addresses of the mapped pages are updated in | ||
504 | * umem->odp_data->dma_list. | ||
505 | * | ||
506 | * Returns the number of pages mapped on success, or a negative error | ||
507 | * code on failure. | ||
508 | * An -EAGAIN error code is returned when a concurrent mmu notifier prevents | ||
509 | * the function from completing its task. | ||
510 | * | ||
511 | * @umem: the umem to map and pin | ||
512 | * @user_virt: the address from which we need to map. | ||
513 | * @bcnt: the minimal number of bytes to pin and map. The mapping might be | ||
514 | * bigger due to alignment, and may also be smaller in case of an error | ||
515 | * pinning or mapping a page. The actual number of pages mapped is | ||
516 | * given by the return value. | ||
517 | * @access_mask: bit mask of the requested access permissions for the given | ||
518 | * range. | ||
519 | * @current_seq: the MMU notifiers sequence value for synchronization with | ||
520 | * invalidations. The sequence number is read from | ||
521 | * umem->odp_data->notifiers_seq before calling this function. | ||
522 | */ | ||
523 | int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, | ||
524 | u64 access_mask, unsigned long current_seq) | ||
525 | { | ||
526 | struct task_struct *owning_process = NULL; | ||
527 | struct mm_struct *owning_mm = NULL; | ||
528 | struct page **local_page_list = NULL; | ||
529 | u64 off; | ||
530 | int j, k, ret = 0, start_idx, npages = 0; | ||
531 | u64 base_virt_addr; | ||
532 | |||
533 | if (access_mask == 0) | ||
534 | return -EINVAL; | ||
535 | |||
536 | if (user_virt < ib_umem_start(umem) || | ||
537 | user_virt + bcnt > ib_umem_end(umem)) | ||
538 | return -EFAULT; | ||
539 | |||
540 | local_page_list = (struct page **)__get_free_page(GFP_KERNEL); | ||
541 | if (!local_page_list) | ||
542 | return -ENOMEM; | ||
543 | |||
544 | off = user_virt & (~PAGE_MASK); | ||
545 | user_virt = user_virt & PAGE_MASK; | ||
546 | base_virt_addr = user_virt; | ||
547 | bcnt += off; /* Charge for the first page offset as well. */ | ||
548 | |||
549 | owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); | ||
550 | if (owning_process == NULL) { | ||
551 | ret = -EINVAL; | ||
552 | goto out_no_task; | ||
553 | } | ||
554 | |||
555 | owning_mm = get_task_mm(owning_process); | ||
556 | if (owning_mm == NULL) { | ||
557 | ret = -EINVAL; | ||
558 | goto out_put_task; | ||
559 | } | ||
560 | |||
561 | start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT; | ||
562 | k = start_idx; | ||
563 | |||
564 | while (bcnt > 0) { | ||
565 | const size_t gup_num_pages = | ||
566 | min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE, | ||
567 | PAGE_SIZE / sizeof(struct page *)); | ||
568 | |||
569 | down_read(&owning_mm->mmap_sem); | ||
570 | /* | ||
571 | * Note: this might result in redundant page getting. We can | ||
572 | * avoid this by checking dma_list to be 0 before calling | ||
573 | * get_user_pages. However, this makes the code much more | ||
574 | * complex (and doesn't gain us much performance in most use | ||
575 | * cases). | ||
576 | */ | ||
577 | npages = get_user_pages(owning_process, owning_mm, user_virt, | ||
578 | gup_num_pages, | ||
579 | access_mask & ODP_WRITE_ALLOWED_BIT, 0, | ||
580 | local_page_list, NULL); | ||
581 | up_read(&owning_mm->mmap_sem); | ||
582 | |||
583 | if (npages < 0) | ||
584 | break; | ||
585 | |||
586 | bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); | ||
587 | user_virt += npages << PAGE_SHIFT; | ||
588 | for (j = 0; j < npages; ++j) { | ||
589 | ret = ib_umem_odp_map_dma_single_page( | ||
590 | umem, k, base_virt_addr, local_page_list[j], | ||
591 | access_mask, current_seq); | ||
592 | if (ret < 0) | ||
593 | break; | ||
594 | k++; | ||
595 | } | ||
596 | |||
597 | if (ret < 0) { | ||
598 | /* Release left over pages when handling errors. */ | ||
599 | for (++j; j < npages; ++j) | ||
600 | put_page(local_page_list[j]); | ||
601 | break; | ||
602 | } | ||
603 | } | ||
604 | |||
605 | if (ret >= 0) { | ||
606 | if (npages < 0 && k == start_idx) | ||
607 | ret = npages; | ||
608 | else | ||
609 | ret = k - start_idx; | ||
610 | } | ||
611 | |||
612 | mmput(owning_mm); | ||
613 | out_put_task: | ||
614 | put_task_struct(owning_process); | ||
615 | out_no_task: | ||
616 | free_page((unsigned long)local_page_list); | ||
617 | return ret; | ||
618 | } | ||
619 | EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); | ||
620 | |||
621 | void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, | ||
622 | u64 bound) | ||
623 | { | ||
624 | int idx; | ||
625 | u64 addr; | ||
626 | struct ib_device *dev = umem->context->device; | ||
627 | |||
628 | virt = max_t(u64, virt, ib_umem_start(umem)); | ||
629 | bound = min_t(u64, bound, ib_umem_end(umem)); | ||
630 | /* Note that during the run of this function, the | ||
631 | * notifiers_count of the MR is > 0, preventing any racing | ||
632 | * faults from completing. We might be racing with other | ||
633 | * invalidations, so we must make sure we free each page only | ||
634 | * once. */ | ||
635 | for (addr = virt; addr < bound; addr += (u64)umem->page_size) { | ||
636 | idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; | ||
637 | mutex_lock(&umem->odp_data->umem_mutex); | ||
638 | if (umem->odp_data->page_list[idx]) { | ||
639 | struct page *page = umem->odp_data->page_list[idx]; | ||
640 | struct page *head_page = compound_head(page); | ||
641 | dma_addr_t dma = umem->odp_data->dma_list[idx]; | ||
642 | dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; | ||
643 | |||
644 | WARN_ON(!dma_addr); | ||
645 | |||
646 | ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE, | ||
647 | DMA_BIDIRECTIONAL); | ||
648 | if (dma & ODP_WRITE_ALLOWED_BIT) | ||
649 | /* | ||
650 | * set_page_dirty prefers being called with | ||
651 | * the page lock. However, MMU notifiers are | ||
652 | * called sometimes with and sometimes without | ||
653 | * the lock. We rely on the umem_mutex instead | ||
654 | * to prevent other mmu notifiers from | ||
655 | * continuing and allowing the page mapping to | ||
656 | * be removed. | ||
657 | */ | ||
658 | set_page_dirty(head_page); | ||
659 | /* on demand pinning support */ | ||
660 | if (!umem->context->invalidate_range) | ||
661 | put_page(page); | ||
662 | umem->odp_data->page_list[idx] = NULL; | ||
663 | umem->odp_data->dma_list[idx] = 0; | ||
664 | } | ||
665 | mutex_unlock(&umem->odp_data->umem_mutex); | ||
666 | } | ||
667 | } | ||
668 | EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); | ||
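The doc comments above spell out the retry protocol a driver's page-fault handler is expected to follow: sample notifiers_seq before faulting, call ib_umem_odp_map_dma_pages(), and retry if a concurrent invalidation makes it return -EAGAIN, with ib_umem_mmu_notifier_retry() doing the final check under umem_mutex when the translations are pushed to hardware. The sketch below is an illustrative driver-side loop under those assumptions; the function name and surrounding driver context are invented and not part of this patch.

#include <linux/kernel.h>
#include <rdma/ib_umem_odp.h>

/* Hypothetical driver fault handler built on the ODP umem API above. */
static int fault_odp_range(struct ib_umem *umem, u64 va, u64 len, u64 access)
{
	int npages;

	do {
		unsigned long seq =
			ACCESS_ONCE(umem->odp_data->notifiers_seq);

		/* Order the sequence read before faulting the pages. */
		smp_rmb();

		npages = ib_umem_odp_map_dma_pages(umem, va, len,
						   access, seq);
		/* -EAGAIN: an invalidation raced with us; fault again. */
	} while (npages == -EAGAIN);

	if (npages < 0)
		return npages;

	/* A real driver would now update its hardware page tables, calling
	 * ib_umem_mmu_notifier_retry() again under umem_mutex first. */
	return npages;
}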
diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c
new file mode 100644
index 000000000000..727d788448f5
--- /dev/null
+++ b/drivers/infiniband/core/umem_rbtree.c
@@ -0,0 +1,94 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2014 Mellanox Technologies. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | */ | ||
32 | |||
33 | #include <linux/kernel.h> | ||
34 | #include <linux/module.h> | ||
35 | #include <linux/interval_tree_generic.h> | ||
36 | #include <linux/sched.h> | ||
37 | #include <linux/gfp.h> | ||
38 | #include <rdma/ib_umem_odp.h> | ||
39 | |||
40 | /* | ||
41 | * The ib_umem list keeps track of memory regions for which the HW | ||
42 | * device requested to receive notifications when the related memory | ||
43 | * mapping is changed. | ||
44 | * | ||
45 | * ib_umem_lock protects the list. | ||
46 | */ | ||
47 | |||
48 | static inline u64 node_start(struct umem_odp_node *n) | ||
49 | { | ||
50 | struct ib_umem_odp *umem_odp = | ||
51 | container_of(n, struct ib_umem_odp, interval_tree); | ||
52 | |||
53 | return ib_umem_start(umem_odp->umem); | ||
54 | } | ||
55 | |||
56 | /* Note that the representation of the intervals in the interval tree | ||
57 | * considers the ending point as contained in the interval, while the | ||
58 | * function ib_umem_end returns the first address which is not contained | ||
59 | * in the umem. | ||
60 | */ | ||
61 | static inline u64 node_last(struct umem_odp_node *n) | ||
62 | { | ||
63 | struct ib_umem_odp *umem_odp = | ||
64 | container_of(n, struct ib_umem_odp, interval_tree); | ||
65 | |||
66 | return ib_umem_end(umem_odp->umem) - 1; | ||
67 | } | ||
68 | |||
69 | INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, | ||
70 | node_start, node_last, , rbt_ib_umem) | ||
71 | |||
72 | /* @last is not a part of the interval. See comment for function | ||
73 | * node_last. | ||
74 | */ | ||
75 | int rbt_ib_umem_for_each_in_range(struct rb_root *root, | ||
76 | u64 start, u64 last, | ||
77 | umem_call_back cb, | ||
78 | void *cookie) | ||
79 | { | ||
80 | int ret_val = 0; | ||
81 | struct umem_odp_node *node; | ||
82 | struct ib_umem_odp *umem; | ||
83 | |||
84 | if (unlikely(start == last)) | ||
85 | return ret_val; | ||
86 | |||
87 | for (node = rbt_ib_umem_iter_first(root, start, last - 1); node; | ||
88 | node = rbt_ib_umem_iter_next(node, start, last - 1)) { | ||
89 | umem = container_of(node, struct ib_umem_odp, interval_tree); | ||
90 | ret_val = cb(umem->umem, start, last, cookie) || ret_val; | ||
91 | } | ||
92 | |||
93 | return ret_val; | ||
94 | } | ||
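The interval-tree glue above stores each umem as the closed interval [ib_umem_start, ib_umem_end - 1], while callers pass half-open [start, last) ranges and the walker subtracts one before querying, so an invalidation that begins exactly at ib_umem_end does not touch the region. A small caller illustrating the convention is sketched below; the callback and counting cookie are invented for the example, but the locking and signatures follow the code above.

#include <rdma/ib_umem_odp.h>

/* Hypothetical callback matching umem_call_back. */
static int count_one(struct ib_umem *umem, u64 start, u64 end, void *cookie)
{
	(*(int *)cookie)++;
	return 0;
}

/* Count the ODP umems overlapping the half-open range [start, end). */
static int count_umems_in_range(struct ib_ucontext *ctx, u64 start, u64 end)
{
	int n = 0;

	down_read(&ctx->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&ctx->umem_tree, start, end,
				      count_one, &n);
	up_read(&ctx->umem_rwsem);
	return n;
}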
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 643c08a025a5..b716b0815644 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -258,5 +258,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd); | |||
258 | 258 | ||
259 | IB_UVERBS_DECLARE_EX_CMD(create_flow); | 259 | IB_UVERBS_DECLARE_EX_CMD(create_flow); |
260 | IB_UVERBS_DECLARE_EX_CMD(destroy_flow); | 260 | IB_UVERBS_DECLARE_EX_CMD(destroy_flow); |
261 | IB_UVERBS_DECLARE_EX_CMD(query_device); | ||
261 | 262 | ||
262 | #endif /* UVERBS_H */ | 263 | #endif /* UVERBS_H */ |
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 5ba2a86aab6a..532d8eba8b02 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/file.h> | 36 | #include <linux/file.h> |
37 | #include <linux/fs.h> | 37 | #include <linux/fs.h> |
38 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
39 | #include <linux/sched.h> | ||
39 | 40 | ||
40 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
41 | 42 | ||
@@ -288,6 +289,9 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, | |||
288 | struct ib_uverbs_get_context_resp resp; | 289 | struct ib_uverbs_get_context_resp resp; |
289 | struct ib_udata udata; | 290 | struct ib_udata udata; |
290 | struct ib_device *ibdev = file->device->ib_dev; | 291 | struct ib_device *ibdev = file->device->ib_dev; |
292 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
293 | struct ib_device_attr dev_attr; | ||
294 | #endif | ||
291 | struct ib_ucontext *ucontext; | 295 | struct ib_ucontext *ucontext; |
292 | struct file *filp; | 296 | struct file *filp; |
293 | int ret; | 297 | int ret; |
@@ -325,8 +329,25 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, | |||
325 | INIT_LIST_HEAD(&ucontext->ah_list); | 329 | INIT_LIST_HEAD(&ucontext->ah_list); |
326 | INIT_LIST_HEAD(&ucontext->xrcd_list); | 330 | INIT_LIST_HEAD(&ucontext->xrcd_list); |
327 | INIT_LIST_HEAD(&ucontext->rule_list); | 331 | INIT_LIST_HEAD(&ucontext->rule_list); |
332 | rcu_read_lock(); | ||
333 | ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); | ||
334 | rcu_read_unlock(); | ||
328 | ucontext->closing = 0; | 335 | ucontext->closing = 0; |
329 | 336 | ||
337 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
338 | ucontext->umem_tree = RB_ROOT; | ||
339 | init_rwsem(&ucontext->umem_rwsem); | ||
340 | ucontext->odp_mrs_count = 0; | ||
341 | INIT_LIST_HEAD(&ucontext->no_private_counters); | ||
342 | |||
343 | ret = ib_query_device(ibdev, &dev_attr); | ||
344 | if (ret) | ||
345 | goto err_free; | ||
346 | if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) | ||
347 | ucontext->invalidate_range = NULL; | ||
348 | |||
349 | #endif | ||
350 | |||
330 | resp.num_comp_vectors = file->device->num_comp_vectors; | 351 | resp.num_comp_vectors = file->device->num_comp_vectors; |
331 | 352 | ||
332 | ret = get_unused_fd_flags(O_CLOEXEC); | 353 | ret = get_unused_fd_flags(O_CLOEXEC); |
@@ -371,6 +392,7 @@ err_fd: | |||
371 | put_unused_fd(resp.async_fd); | 392 | put_unused_fd(resp.async_fd); |
372 | 393 | ||
373 | err_free: | 394 | err_free: |
395 | put_pid(ucontext->tgid); | ||
374 | ibdev->dealloc_ucontext(ucontext); | 396 | ibdev->dealloc_ucontext(ucontext); |
375 | 397 | ||
376 | err: | 398 | err: |
@@ -378,6 +400,52 @@ err: | |||
378 | return ret; | 400 | return ret; |
379 | } | 401 | } |
380 | 402 | ||
403 | static void copy_query_dev_fields(struct ib_uverbs_file *file, | ||
404 | struct ib_uverbs_query_device_resp *resp, | ||
405 | struct ib_device_attr *attr) | ||
406 | { | ||
407 | resp->fw_ver = attr->fw_ver; | ||
408 | resp->node_guid = file->device->ib_dev->node_guid; | ||
409 | resp->sys_image_guid = attr->sys_image_guid; | ||
410 | resp->max_mr_size = attr->max_mr_size; | ||
411 | resp->page_size_cap = attr->page_size_cap; | ||
412 | resp->vendor_id = attr->vendor_id; | ||
413 | resp->vendor_part_id = attr->vendor_part_id; | ||
414 | resp->hw_ver = attr->hw_ver; | ||
415 | resp->max_qp = attr->max_qp; | ||
416 | resp->max_qp_wr = attr->max_qp_wr; | ||
417 | resp->device_cap_flags = attr->device_cap_flags; | ||
418 | resp->max_sge = attr->max_sge; | ||
419 | resp->max_sge_rd = attr->max_sge_rd; | ||
420 | resp->max_cq = attr->max_cq; | ||
421 | resp->max_cqe = attr->max_cqe; | ||
422 | resp->max_mr = attr->max_mr; | ||
423 | resp->max_pd = attr->max_pd; | ||
424 | resp->max_qp_rd_atom = attr->max_qp_rd_atom; | ||
425 | resp->max_ee_rd_atom = attr->max_ee_rd_atom; | ||
426 | resp->max_res_rd_atom = attr->max_res_rd_atom; | ||
427 | resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; | ||
428 | resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; | ||
429 | resp->atomic_cap = attr->atomic_cap; | ||
430 | resp->max_ee = attr->max_ee; | ||
431 | resp->max_rdd = attr->max_rdd; | ||
432 | resp->max_mw = attr->max_mw; | ||
433 | resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; | ||
434 | resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; | ||
435 | resp->max_mcast_grp = attr->max_mcast_grp; | ||
436 | resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; | ||
437 | resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; | ||
438 | resp->max_ah = attr->max_ah; | ||
439 | resp->max_fmr = attr->max_fmr; | ||
440 | resp->max_map_per_fmr = attr->max_map_per_fmr; | ||
441 | resp->max_srq = attr->max_srq; | ||
442 | resp->max_srq_wr = attr->max_srq_wr; | ||
443 | resp->max_srq_sge = attr->max_srq_sge; | ||
444 | resp->max_pkeys = attr->max_pkeys; | ||
445 | resp->local_ca_ack_delay = attr->local_ca_ack_delay; | ||
446 | resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; | ||
447 | } | ||
448 | |||
381 | ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, | 449 | ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, |
382 | const char __user *buf, | 450 | const char __user *buf, |
383 | int in_len, int out_len) | 451 | int in_len, int out_len) |
@@ -398,47 +466,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, | |||
398 | return ret; | 466 | return ret; |
399 | 467 | ||
400 | memset(&resp, 0, sizeof resp); | 468 | memset(&resp, 0, sizeof resp); |
401 | 469 | copy_query_dev_fields(file, &resp, &attr); | |
402 | resp.fw_ver = attr.fw_ver; | ||
403 | resp.node_guid = file->device->ib_dev->node_guid; | ||
404 | resp.sys_image_guid = attr.sys_image_guid; | ||
405 | resp.max_mr_size = attr.max_mr_size; | ||
406 | resp.page_size_cap = attr.page_size_cap; | ||
407 | resp.vendor_id = attr.vendor_id; | ||
408 | resp.vendor_part_id = attr.vendor_part_id; | ||
409 | resp.hw_ver = attr.hw_ver; | ||
410 | resp.max_qp = attr.max_qp; | ||
411 | resp.max_qp_wr = attr.max_qp_wr; | ||
412 | resp.device_cap_flags = attr.device_cap_flags; | ||
413 | resp.max_sge = attr.max_sge; | ||
414 | resp.max_sge_rd = attr.max_sge_rd; | ||
415 | resp.max_cq = attr.max_cq; | ||
416 | resp.max_cqe = attr.max_cqe; | ||
417 | resp.max_mr = attr.max_mr; | ||
418 | resp.max_pd = attr.max_pd; | ||
419 | resp.max_qp_rd_atom = attr.max_qp_rd_atom; | ||
420 | resp.max_ee_rd_atom = attr.max_ee_rd_atom; | ||
421 | resp.max_res_rd_atom = attr.max_res_rd_atom; | ||
422 | resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom; | ||
423 | resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom; | ||
424 | resp.atomic_cap = attr.atomic_cap; | ||
425 | resp.max_ee = attr.max_ee; | ||
426 | resp.max_rdd = attr.max_rdd; | ||
427 | resp.max_mw = attr.max_mw; | ||
428 | resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp; | ||
429 | resp.max_raw_ethy_qp = attr.max_raw_ethy_qp; | ||
430 | resp.max_mcast_grp = attr.max_mcast_grp; | ||
431 | resp.max_mcast_qp_attach = attr.max_mcast_qp_attach; | ||
432 | resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach; | ||
433 | resp.max_ah = attr.max_ah; | ||
434 | resp.max_fmr = attr.max_fmr; | ||
435 | resp.max_map_per_fmr = attr.max_map_per_fmr; | ||
436 | resp.max_srq = attr.max_srq; | ||
437 | resp.max_srq_wr = attr.max_srq_wr; | ||
438 | resp.max_srq_sge = attr.max_srq_sge; | ||
439 | resp.max_pkeys = attr.max_pkeys; | ||
440 | resp.local_ca_ack_delay = attr.local_ca_ack_delay; | ||
441 | resp.phys_port_cnt = file->device->ib_dev->phys_port_cnt; | ||
442 | 470 | ||
443 | if (copy_to_user((void __user *) (unsigned long) cmd.response, | 471 | if (copy_to_user((void __user *) (unsigned long) cmd.response, |
444 | &resp, sizeof resp)) | 472 | &resp, sizeof resp)) |
@@ -947,6 +975,18 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, | |||
947 | goto err_free; | 975 | goto err_free; |
948 | } | 976 | } |
949 | 977 | ||
978 | if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { | ||
979 | struct ib_device_attr attr; | ||
980 | |||
981 | ret = ib_query_device(pd->device, &attr); | ||
982 | if (ret || !(attr.device_cap_flags & | ||
983 | IB_DEVICE_ON_DEMAND_PAGING)) { | ||
984 | pr_debug("ODP support not available\n"); | ||
985 | ret = -EINVAL; | ||
986 | goto err_put; | ||
987 | } | ||
988 | } | ||
989 | |||
950 | mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, | 990 | mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, |
951 | cmd.access_flags, &udata); | 991 | cmd.access_flags, &udata); |
952 | if (IS_ERR(mr)) { | 992 | if (IS_ERR(mr)) { |
@@ -3253,3 +3293,52 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, | |||
3253 | 3293 | ||
3254 | return ret ? ret : in_len; | 3294 | return ret ? ret : in_len; |
3255 | } | 3295 | } |
3296 | |||
3297 | int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, | ||
3298 | struct ib_udata *ucore, | ||
3299 | struct ib_udata *uhw) | ||
3300 | { | ||
3301 | struct ib_uverbs_ex_query_device_resp resp; | ||
3302 | struct ib_uverbs_ex_query_device cmd; | ||
3303 | struct ib_device_attr attr; | ||
3304 | struct ib_device *device; | ||
3305 | int err; | ||
3306 | |||
3307 | device = file->device->ib_dev; | ||
3308 | if (ucore->inlen < sizeof(cmd)) | ||
3309 | return -EINVAL; | ||
3310 | |||
3311 | err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); | ||
3312 | if (err) | ||
3313 | return err; | ||
3314 | |||
3315 | if (cmd.reserved) | ||
3316 | return -EINVAL; | ||
3317 | |||
3318 | err = device->query_device(device, &attr); | ||
3319 | if (err) | ||
3320 | return err; | ||
3321 | |||
3322 | memset(&resp, 0, sizeof(resp)); | ||
3323 | copy_query_dev_fields(file, &resp.base, &attr); | ||
3324 | resp.comp_mask = 0; | ||
3325 | |||
3326 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
3327 | if (cmd.comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP) { | ||
3328 | resp.odp_caps.general_caps = attr.odp_caps.general_caps; | ||
3329 | resp.odp_caps.per_transport_caps.rc_odp_caps = | ||
3330 | attr.odp_caps.per_transport_caps.rc_odp_caps; | ||
3331 | resp.odp_caps.per_transport_caps.uc_odp_caps = | ||
3332 | attr.odp_caps.per_transport_caps.uc_odp_caps; | ||
3333 | resp.odp_caps.per_transport_caps.ud_odp_caps = | ||
3334 | attr.odp_caps.per_transport_caps.ud_odp_caps; | ||
3335 | resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP; | ||
3336 | } | ||
3337 | #endif | ||
3338 | |||
3339 | err = ib_copy_to_udata(ucore, &resp, sizeof(resp)); | ||
3340 | if (err) | ||
3341 | return err; | ||
3342 | |||
3343 | return 0; | ||
3344 | } | ||
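ib_uverbs_ex_query_device() is what lets userspace discover whether a device supports ODP before trying to register an unpinned region. A hedged userspace counterpart is sketched below; it assumes a libibverbs new enough to provide ibv_query_device_ex() and the odp_caps field, which is how this extended verb is commonly surfaced, and is not code from this patch.

#include <stdio.h>
#include <infiniband/verbs.h>

/* Sketch: probe ODP capabilities from userspace. Assumes the installed
 * libibverbs exposes ibv_query_device_ex() and struct ibv_device_attr_ex
 * with an odp_caps member. */
static int print_odp_caps(struct ibv_context *ctx)
{
	struct ibv_device_attr_ex attr = {};

	if (ibv_query_device_ex(ctx, NULL, &attr))
		return -1;

	if (!(attr.odp_caps.general_caps & IBV_ODP_SUPPORT)) {
		printf("ODP not supported\n");
		return 0;
	}
	printf("RC ODP caps: 0x%x, UD ODP caps: 0x%x\n",
	       attr.odp_caps.per_transport_caps.rc_odp_caps,
	       attr.odp_caps.per_transport_caps.ud_odp_caps);
	return 0;
}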
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 71ab83fde472..e6c23b9eab33 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -122,7 +122,8 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, | |||
122 | struct ib_udata *ucore, | 122 | struct ib_udata *ucore, |
123 | struct ib_udata *uhw) = { | 123 | struct ib_udata *uhw) = { |
124 | [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, | 124 | [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, |
125 | [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow | 125 | [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow, |
126 | [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device | ||
126 | }; | 127 | }; |
127 | 128 | ||
128 | static void ib_uverbs_add_one(struct ib_device *device); | 129 | static void ib_uverbs_add_one(struct ib_device *device); |
@@ -296,6 +297,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, | |||
296 | kfree(uobj); | 297 | kfree(uobj); |
297 | } | 298 | } |
298 | 299 | ||
300 | put_pid(context->tgid); | ||
301 | |||
299 | return context->device->dealloc_ucontext(context); | 302 | return context->device->dealloc_ucontext(context); |
300 | } | 303 | } |
301 | 304 | ||
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c
index 2d5cbf4363e4..bdf3507810cb 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -476,7 +476,7 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
476 | c2mr->umem->page_size, | 476 | c2mr->umem->page_size, |
477 | i, | 477 | i, |
478 | length, | 478 | length, |
479 | c2mr->umem->offset, | 479 | ib_umem_offset(c2mr->umem), |
480 | &kva, | 480 | &kva, |
481 | c2_convert_access(acc), | 481 | c2_convert_access(acc), |
482 | c2mr); | 482 | c2mr); |
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 4b8c6116c058..9edc200b311d 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -1640,7 +1640,8 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) | |||
1640 | __state_set(&ep->com, MPA_REQ_RCVD); | 1640 | __state_set(&ep->com, MPA_REQ_RCVD); |
1641 | 1641 | ||
1642 | /* drive upcall */ | 1642 | /* drive upcall */ |
1643 | mutex_lock(&ep->parent_ep->com.mutex); | 1643 | mutex_lock_nested(&ep->parent_ep->com.mutex, |
1644 | SINGLE_DEPTH_NESTING); | ||
1644 | if (ep->parent_ep->com.state != DEAD) { | 1645 | if (ep->parent_ep->com.state != DEAD) { |
1645 | if (connect_request_upcall(ep)) | 1646 | if (connect_request_upcall(ep)) |
1646 | abort_connection(ep, skb, GFP_KERNEL); | 1647 | abort_connection(ep, skb, GFP_KERNEL); |
@@ -3126,6 +3127,8 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) | |||
3126 | err = c4iw_wait_for_reply(&ep->com.dev->rdev, | 3127 | err = c4iw_wait_for_reply(&ep->com.dev->rdev, |
3127 | &ep->com.wr_wait, | 3128 | &ep->com.wr_wait, |
3128 | 0, 0, __func__); | 3129 | 0, 0, __func__); |
3130 | else if (err > 0) | ||
3131 | err = net_xmit_errno(err); | ||
3129 | if (err) | 3132 | if (err) |
3130 | pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n", | 3133 | pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n", |
3131 | err, ep->stid, | 3134 | err, ep->stid, |
@@ -3159,6 +3162,8 @@ static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) | |||
3159 | err = c4iw_wait_for_reply(&ep->com.dev->rdev, | 3162 | err = c4iw_wait_for_reply(&ep->com.dev->rdev, |
3160 | &ep->com.wr_wait, | 3163 | &ep->com.wr_wait, |
3161 | 0, 0, __func__); | 3164 | 0, 0, __func__); |
3165 | else if (err > 0) | ||
3166 | err = net_xmit_errno(err); | ||
3162 | } | 3167 | } |
3163 | if (err) | 3168 | if (err) |
3164 | pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n" | 3169 | pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n" |
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 72f1f052e88c..eb5df4e62703 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -670,7 +670,7 @@ static int ep_open(struct inode *inode, struct file *file) | |||
670 | idr_for_each(&epd->devp->stid_idr, count_idrs, &count); | 670 | idr_for_each(&epd->devp->stid_idr, count_idrs, &count); |
671 | spin_unlock_irq(&epd->devp->lock); | 671 | spin_unlock_irq(&epd->devp->lock); |
672 | 672 | ||
673 | epd->bufsize = count * 160; | 673 | epd->bufsize = count * 240; |
674 | epd->buf = vmalloc(epd->bufsize); | 674 | epd->buf = vmalloc(epd->bufsize); |
675 | if (!epd->buf) { | 675 | if (!epd->buf) { |
676 | ret = -ENOMEM; | 676 | ret = -ENOMEM; |
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index 0744455cd88b..cb43c2299ac0 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -50,6 +50,13 @@ static int inline_threshold = C4IW_INLINE_THRESHOLD; | |||
50 | module_param(inline_threshold, int, 0644); | 50 | module_param(inline_threshold, int, 0644); |
51 | MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)"); | 51 | MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)"); |
52 | 52 | ||
53 | static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length) | ||
54 | { | ||
55 | return (is_t4(dev->rdev.lldi.adapter_type) || | ||
56 | is_t5(dev->rdev.lldi.adapter_type)) && | ||
57 | length >= 8*1024*1024*1024ULL; | ||
58 | } | ||
59 | |||
53 | static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, | 60 | static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, |
54 | u32 len, dma_addr_t data, int wait) | 61 | u32 len, dma_addr_t data, int wait) |
55 | { | 62 | { |
@@ -369,9 +376,11 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php, | |||
369 | int ret; | 376 | int ret; |
370 | 377 | ||
371 | ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid, | 378 | ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid, |
372 | FW_RI_STAG_NSMR, mhp->attr.perms, | 379 | FW_RI_STAG_NSMR, mhp->attr.len ? |
380 | mhp->attr.perms : 0, | ||
373 | mhp->attr.mw_bind_enable, mhp->attr.zbva, | 381 | mhp->attr.mw_bind_enable, mhp->attr.zbva, |
374 | mhp->attr.va_fbo, mhp->attr.len, shift - 12, | 382 | mhp->attr.va_fbo, mhp->attr.len ? |
383 | mhp->attr.len : -1, shift - 12, | ||
375 | mhp->attr.pbl_size, mhp->attr.pbl_addr); | 384 | mhp->attr.pbl_size, mhp->attr.pbl_addr); |
376 | if (ret) | 385 | if (ret) |
377 | return ret; | 386 | return ret; |
@@ -536,6 +545,11 @@ int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask, | |||
536 | return ret; | 545 | return ret; |
537 | } | 546 | } |
538 | 547 | ||
548 | if (mr_exceeds_hw_limits(rhp, total_size)) { | ||
549 | kfree(page_list); | ||
550 | return -EINVAL; | ||
551 | } | ||
552 | |||
539 | ret = reregister_mem(rhp, php, &mh, shift, npages); | 553 | ret = reregister_mem(rhp, php, &mh, shift, npages); |
540 | kfree(page_list); | 554 | kfree(page_list); |
541 | if (ret) | 555 | if (ret) |
@@ -596,6 +610,12 @@ struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd, | |||
596 | if (ret) | 610 | if (ret) |
597 | goto err; | 611 | goto err; |
598 | 612 | ||
613 | if (mr_exceeds_hw_limits(rhp, total_size)) { | ||
614 | kfree(page_list); | ||
615 | ret = -EINVAL; | ||
616 | goto err; | ||
617 | } | ||
618 | |||
599 | ret = alloc_pbl(mhp, npages); | 619 | ret = alloc_pbl(mhp, npages); |
600 | if (ret) { | 620 | if (ret) { |
601 | kfree(page_list); | 621 | kfree(page_list); |
@@ -699,6 +719,10 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
699 | 719 | ||
700 | php = to_c4iw_pd(pd); | 720 | php = to_c4iw_pd(pd); |
701 | rhp = php->rhp; | 721 | rhp = php->rhp; |
722 | |||
723 | if (mr_exceeds_hw_limits(rhp, length)) | ||
724 | return ERR_PTR(-EINVAL); | ||
725 | |||
702 | mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); | 726 | mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); |
703 | if (!mhp) | 727 | if (!mhp) |
704 | return ERR_PTR(-ENOMEM); | 728 | return ERR_PTR(-ENOMEM); |
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 2ed3ece2b2ee..bb85d479e66e 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c | |||
@@ -1538,9 +1538,9 @@ err: | |||
1538 | set_state(qhp, C4IW_QP_STATE_ERROR); | 1538 | set_state(qhp, C4IW_QP_STATE_ERROR); |
1539 | free = 1; | 1539 | free = 1; |
1540 | abort = 1; | 1540 | abort = 1; |
1541 | wake_up(&qhp->wait); | ||
1542 | BUG_ON(!ep); | 1541 | BUG_ON(!ep); |
1543 | flush_qp(qhp); | 1542 | flush_qp(qhp); |
1543 | wake_up(&qhp->wait); | ||
1544 | out: | 1544 | out: |
1545 | mutex_unlock(&qhp->mutex); | 1545 | mutex_unlock(&qhp->mutex); |
1546 | 1546 | ||
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index 3488e8c9fcb4..f914b30999f8 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c | |||
@@ -399,7 +399,7 @@ reg_user_mr_fallback: | |||
399 | pginfo.num_kpages = num_kpages; | 399 | pginfo.num_kpages = num_kpages; |
400 | pginfo.num_hwpages = num_hwpages; | 400 | pginfo.num_hwpages = num_hwpages; |
401 | pginfo.u.usr.region = e_mr->umem; | 401 | pginfo.u.usr.region = e_mr->umem; |
402 | pginfo.next_hwpage = e_mr->umem->offset / hwpage_size; | 402 | pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size; |
403 | pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl; | 403 | pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl; |
404 | ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags, | 404 | ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags, |
405 | e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, | 405 | e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, |
diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c index 5e61e9bff697..c7278f6a8217 100644 --- a/drivers/infiniband/hw/ipath/ipath_mr.c +++ b/drivers/infiniband/hw/ipath/ipath_mr.c | |||
@@ -214,7 +214,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
214 | mr->mr.user_base = start; | 214 | mr->mr.user_base = start; |
215 | mr->mr.iova = virt_addr; | 215 | mr->mr.iova = virt_addr; |
216 | mr->mr.length = length; | 216 | mr->mr.length = length; |
217 | mr->mr.offset = umem->offset; | 217 | mr->mr.offset = ib_umem_offset(umem); |
218 | mr->mr.access_flags = mr_access_flags; | 218 | mr->mr.access_flags = mr_access_flags; |
219 | mr->mr.max_segs = n; | 219 | mr->mr.max_segs = n; |
220 | mr->umem = umem; | 220 | mr->umem = umem; |
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 8f9325cfc85d..c36ccbd9a644 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c | |||
@@ -223,7 +223,6 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, | |||
223 | 223 | ||
224 | if (flags & IB_MR_REREG_TRANS) { | 224 | if (flags & IB_MR_REREG_TRANS) { |
225 | int shift; | 225 | int shift; |
226 | int err; | ||
227 | int n; | 226 | int n; |
228 | 227 | ||
229 | mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); | 228 | mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); |
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 4ea0135af484..27a70159e2ea 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile | |||
@@ -1,3 +1,4 @@ | |||
1 | obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o | 1 | obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o |
2 | 2 | ||
3 | mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o | 3 | mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o |
4 | mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o | ||
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 1ba6c42e4df8..8a87404e9c76 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c | |||
@@ -244,6 +244,12 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, | |||
244 | props->max_mcast_grp; | 244 | props->max_mcast_grp; |
245 | props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ | 245 | props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ |
246 | 246 | ||
247 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
248 | if (dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG) | ||
249 | props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; | ||
250 | props->odp_caps = dev->odp_caps; | ||
251 | #endif | ||
252 | |||
247 | out: | 253 | out: |
248 | kfree(in_mad); | 254 | kfree(in_mad); |
249 | kfree(out_mad); | 255 | kfree(out_mad); |
@@ -568,6 +574,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, | |||
568 | goto out_count; | 574 | goto out_count; |
569 | } | 575 | } |
570 | 576 | ||
577 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
578 | context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; | ||
579 | #endif | ||
580 | |||
571 | INIT_LIST_HEAD(&context->db_page_list); | 581 | INIT_LIST_HEAD(&context->db_page_list); |
572 | mutex_init(&context->db_page_mutex); | 582 | mutex_init(&context->db_page_mutex); |
573 | 583 | ||
@@ -858,7 +868,7 @@ static ssize_t show_reg_pages(struct device *device, | |||
858 | struct mlx5_ib_dev *dev = | 868 | struct mlx5_ib_dev *dev = |
859 | container_of(device, struct mlx5_ib_dev, ib_dev.dev); | 869 | container_of(device, struct mlx5_ib_dev, ib_dev.dev); |
860 | 870 | ||
861 | return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages); | 871 | return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); |
862 | } | 872 | } |
863 | 873 | ||
864 | static ssize_t show_hca(struct device *device, struct device_attribute *attr, | 874 | static ssize_t show_hca(struct device *device, struct device_attribute *attr, |
@@ -1321,6 +1331,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) | |||
1321 | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | | 1331 | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | |
1322 | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | | 1332 | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | |
1323 | (1ull << IB_USER_VERBS_CMD_OPEN_QP); | 1333 | (1ull << IB_USER_VERBS_CMD_OPEN_QP); |
1334 | dev->ib_dev.uverbs_ex_cmd_mask = | ||
1335 | (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE); | ||
1324 | 1336 | ||
1325 | dev->ib_dev.query_device = mlx5_ib_query_device; | 1337 | dev->ib_dev.query_device = mlx5_ib_query_device; |
1326 | dev->ib_dev.query_port = mlx5_ib_query_port; | 1338 | dev->ib_dev.query_port = mlx5_ib_query_port; |
@@ -1366,6 +1378,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) | |||
1366 | dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; | 1378 | dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; |
1367 | dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; | 1379 | dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; |
1368 | 1380 | ||
1381 | mlx5_ib_internal_query_odp_caps(dev); | ||
1382 | |||
1369 | if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) { | 1383 | if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) { |
1370 | dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; | 1384 | dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; |
1371 | dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; | 1385 | dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; |
@@ -1379,16 +1393,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) | |||
1379 | goto err_eqs; | 1393 | goto err_eqs; |
1380 | 1394 | ||
1381 | mutex_init(&dev->cap_mask_mutex); | 1395 | mutex_init(&dev->cap_mask_mutex); |
1382 | spin_lock_init(&dev->mr_lock); | ||
1383 | 1396 | ||
1384 | err = create_dev_resources(&dev->devr); | 1397 | err = create_dev_resources(&dev->devr); |
1385 | if (err) | 1398 | if (err) |
1386 | goto err_eqs; | 1399 | goto err_eqs; |
1387 | 1400 | ||
1388 | err = ib_register_device(&dev->ib_dev, NULL); | 1401 | err = mlx5_ib_odp_init_one(dev); |
1389 | if (err) | 1402 | if (err) |
1390 | goto err_rsrc; | 1403 | goto err_rsrc; |
1391 | 1404 | ||
1405 | err = ib_register_device(&dev->ib_dev, NULL); | ||
1406 | if (err) | ||
1407 | goto err_odp; | ||
1408 | |||
1392 | err = create_umr_res(dev); | 1409 | err = create_umr_res(dev); |
1393 | if (err) | 1410 | if (err) |
1394 | goto err_dev; | 1411 | goto err_dev; |
@@ -1410,6 +1427,9 @@ err_umrc: | |||
1410 | err_dev: | 1427 | err_dev: |
1411 | ib_unregister_device(&dev->ib_dev); | 1428 | ib_unregister_device(&dev->ib_dev); |
1412 | 1429 | ||
1430 | err_odp: | ||
1431 | mlx5_ib_odp_remove_one(dev); | ||
1432 | |||
1413 | err_rsrc: | 1433 | err_rsrc: |
1414 | destroy_dev_resources(&dev->devr); | 1434 | destroy_dev_resources(&dev->devr); |
1415 | 1435 | ||
@@ -1425,8 +1445,10 @@ err_dealloc: | |||
1425 | static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) | 1445 | static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) |
1426 | { | 1446 | { |
1427 | struct mlx5_ib_dev *dev = context; | 1447 | struct mlx5_ib_dev *dev = context; |
1448 | |||
1428 | ib_unregister_device(&dev->ib_dev); | 1449 | ib_unregister_device(&dev->ib_dev); |
1429 | destroy_umrc_res(dev); | 1450 | destroy_umrc_res(dev); |
1451 | mlx5_ib_odp_remove_one(dev); | ||
1430 | destroy_dev_resources(&dev->devr); | 1452 | destroy_dev_resources(&dev->devr); |
1431 | free_comp_eqs(dev); | 1453 | free_comp_eqs(dev); |
1432 | ib_dealloc_device(&dev->ib_dev); | 1454 | ib_dealloc_device(&dev->ib_dev); |
@@ -1440,15 +1462,30 @@ static struct mlx5_interface mlx5_ib_interface = { | |||
1440 | 1462 | ||
1441 | static int __init mlx5_ib_init(void) | 1463 | static int __init mlx5_ib_init(void) |
1442 | { | 1464 | { |
1465 | int err; | ||
1466 | |||
1443 | if (deprecated_prof_sel != 2) | 1467 | if (deprecated_prof_sel != 2) |
1444 | pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); | 1468 | pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); |
1445 | 1469 | ||
1446 | return mlx5_register_interface(&mlx5_ib_interface); | 1470 | err = mlx5_ib_odp_init(); |
1471 | if (err) | ||
1472 | return err; | ||
1473 | |||
1474 | err = mlx5_register_interface(&mlx5_ib_interface); | ||
1475 | if (err) | ||
1476 | goto clean_odp; | ||
1477 | |||
1478 | return err; | ||
1479 | |||
1480 | clean_odp: | ||
1481 | mlx5_ib_odp_cleanup(); | ||
1482 | return err; | ||
1447 | } | 1483 | } |
1448 | 1484 | ||
1449 | static void __exit mlx5_ib_cleanup(void) | 1485 | static void __exit mlx5_ib_cleanup(void) |
1450 | { | 1486 | { |
1451 | mlx5_unregister_interface(&mlx5_ib_interface); | 1487 | mlx5_unregister_interface(&mlx5_ib_interface); |
1488 | mlx5_ib_odp_cleanup(); | ||
1452 | } | 1489 | } |
1453 | 1490 | ||
1454 | module_init(mlx5_ib_init); | 1491 | module_init(mlx5_ib_init); |
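Editor's note: the mlx5_ib_add()/mlx5_ib_init() changes above move ODP initialization ahead of device and interface registration and add matching err_odp/clean_odp unwind labels, presumably so the page-fault state (the mr_srcu and the page-fault workqueue) already exists by the time anything can register an ODP MR. A stripped-down, hypothetical sketch of that goto-unwind ordering, using placeholder functions rather than the driver's:

    #include <stdio.h>

    static int odp_init(void)     { puts("odp state ready");     return 0; }
    static void odp_cleanup(void) { puts("odp state torn down"); }
    static int register_dev(void) { puts("device visible");      return -1; /* simulate failure */ }

    static int probe(void)
    {
            int err;

            err = odp_init();          /* bring up fault-handling state first */
            if (err)
                    return err;

            err = register_dev();      /* ODP MRs may appear once this succeeds */
            if (err)
                    goto err_odp;

            return 0;

    err_odp:
            odp_cleanup();             /* unwind in reverse order of setup */
            return err;
    }

    int main(void)
    {
            return probe() ? 1 : 0;
    }

The remove path in the driver mirrors this: the device is unregistered first, then the ODP state is torn down, then the remaining resources are freed.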
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index dae07eae9507..b56e4c5593ee 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c | |||
@@ -32,6 +32,7 @@ | |||
32 | 32 | ||
33 | #include <linux/module.h> | 33 | #include <linux/module.h> |
34 | #include <rdma/ib_umem.h> | 34 | #include <rdma/ib_umem.h> |
35 | #include <rdma/ib_umem_odp.h> | ||
35 | #include "mlx5_ib.h" | 36 | #include "mlx5_ib.h" |
36 | 37 | ||
37 | /* @umem: umem object to scan | 38 | /* @umem: umem object to scan |
@@ -57,6 +58,17 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, | |||
57 | int entry; | 58 | int entry; |
58 | unsigned long page_shift = ilog2(umem->page_size); | 59 | unsigned long page_shift = ilog2(umem->page_size); |
59 | 60 | ||
61 | /* With ODP we must always match OS page size. */ | ||
62 | if (umem->odp_data) { | ||
63 | *count = ib_umem_page_count(umem); | ||
64 | *shift = PAGE_SHIFT; | ||
65 | *ncont = *count; | ||
66 | if (order) | ||
67 | *order = ilog2(roundup_pow_of_two(*count)); | ||
68 | |||
69 | return; | ||
70 | } | ||
71 | |||
60 | addr = addr >> page_shift; | 72 | addr = addr >> page_shift; |
61 | tmp = (unsigned long)addr; | 73 | tmp = (unsigned long)addr; |
62 | m = find_first_bit(&tmp, sizeof(tmp)); | 74 | m = find_first_bit(&tmp, sizeof(tmp)); |
@@ -108,8 +120,36 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, | |||
108 | *count = i; | 120 | *count = i; |
109 | } | 121 | } |
110 | 122 | ||
111 | void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | 123 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING |
112 | int page_shift, __be64 *pas, int umr) | 124 | static u64 umem_dma_to_mtt(dma_addr_t umem_dma) |
125 | { | ||
126 | u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; | ||
127 | |||
128 | if (umem_dma & ODP_READ_ALLOWED_BIT) | ||
129 | mtt_entry |= MLX5_IB_MTT_READ; | ||
130 | if (umem_dma & ODP_WRITE_ALLOWED_BIT) | ||
131 | mtt_entry |= MLX5_IB_MTT_WRITE; | ||
132 | |||
133 | return mtt_entry; | ||
134 | } | ||
135 | #endif | ||
136 | |||
137 | /* | ||
138 | * Populate the given array with bus addresses from the umem. | ||
139 | * | ||
140 | * dev - mlx5_ib device | ||
141 | * umem - umem to use to fill the pages | ||
142 | * page_shift - determines the page size used in the resulting array | ||
143 | * offset - offset into the umem to start from, | ||
144 | * only implemented for ODP umems | ||
145 | * num_pages - total number of pages to fill | ||
146 | * pas - bus addresses array to fill | ||
147 | * access_flags - access flags to set on all present pages. | ||
148 | use enum mlx5_ib_mtt_access_flags for this. | ||
149 | */ | ||
150 | void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | ||
151 | int page_shift, size_t offset, size_t num_pages, | ||
152 | __be64 *pas, int access_flags) | ||
113 | { | 153 | { |
114 | unsigned long umem_page_shift = ilog2(umem->page_size); | 154 | unsigned long umem_page_shift = ilog2(umem->page_size); |
115 | int shift = page_shift - umem_page_shift; | 155 | int shift = page_shift - umem_page_shift; |
@@ -120,6 +160,21 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | |||
120 | int len; | 160 | int len; |
121 | struct scatterlist *sg; | 161 | struct scatterlist *sg; |
122 | int entry; | 162 | int entry; |
163 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
164 | const bool odp = umem->odp_data != NULL; | ||
165 | |||
166 | if (odp) { | ||
167 | WARN_ON(shift != 0); | ||
168 | WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); | ||
169 | |||
170 | for (i = 0; i < num_pages; ++i) { | ||
171 | dma_addr_t pa = umem->odp_data->dma_list[offset + i]; | ||
172 | |||
173 | pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); | ||
174 | } | ||
175 | return; | ||
176 | } | ||
177 | #endif | ||
123 | 178 | ||
124 | i = 0; | 179 | i = 0; |
125 | for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { | 180 | for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { |
@@ -128,8 +183,7 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | |||
128 | for (k = 0; k < len; k++) { | 183 | for (k = 0; k < len; k++) { |
129 | if (!(i & mask)) { | 184 | if (!(i & mask)) { |
130 | cur = base + (k << umem_page_shift); | 185 | cur = base + (k << umem_page_shift); |
131 | if (umr) | 186 | cur |= access_flags; |
132 | cur |= 3; | ||
133 | 187 | ||
134 | pas[i >> shift] = cpu_to_be64(cur); | 188 | pas[i >> shift] = cpu_to_be64(cur); |
135 | mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", | 189 | mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", |
@@ -142,6 +196,13 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | |||
142 | } | 196 | } |
143 | } | 197 | } |
144 | 198 | ||
199 | void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | ||
200 | int page_shift, __be64 *pas, int access_flags) | ||
201 | { | ||
202 | return __mlx5_ib_populate_pas(dev, umem, page_shift, 0, | ||
203 | ib_umem_num_pages(umem), pas, | ||
204 | access_flags); | ||
205 | } | ||
145 | int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) | 206 | int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) |
146 | { | 207 | { |
147 | u64 page_size; | 208 | u64 page_size; |
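Editor's note: for ODP umems, __mlx5_ib_populate_pas() above reads per-page DMA addresses from umem->odp_data->dma_list, where the low bits carry software permission flags, and umem_dma_to_mtt() turns those into the MLX5_IB_MTT_READ/WRITE bits written into the page list. A minimal sketch of that translation; the bit positions used for the ODP_* flags below are illustrative stand-ins (the real values live in ib_umem_odp.h), while MLX5_IB_MTT_READ/WRITE do sit in bits 0 and 1 as defined later in mlx5_ib.h:

    #include <stdint.h>
    #include <stdio.h>

    #define READ_ALLOWED   (1ULL << 0)   /* stand-in for ODP_READ_ALLOWED_BIT  */
    #define WRITE_ALLOWED  (1ULL << 1)   /* stand-in for ODP_WRITE_ALLOWED_BIT */
    #define DMA_ADDR_MASK  (~(READ_ALLOWED | WRITE_ALLOWED))
    #define MTT_READ       (1ULL << 0)   /* MLX5_IB_MTT_READ  */
    #define MTT_WRITE      (1ULL << 1)   /* MLX5_IB_MTT_WRITE */

    /* Mirrors the idea of umem_dma_to_mtt(): keep the page address, translate
     * the software permission bits into hardware MTT access flags. */
    static uint64_t dma_to_mtt(uint64_t umem_dma)
    {
            uint64_t mtt = umem_dma & DMA_ADDR_MASK;

            if (umem_dma & READ_ALLOWED)
                    mtt |= MTT_READ;
            if (umem_dma & WRITE_ALLOWED)
                    mtt |= MTT_WRITE;
            return mtt;
    }

    int main(void)
    {
            /* A present page at DMA address 0x1000, mapped read+write. */
            printf("0x%llx\n", (unsigned long long)
                   dma_to_mtt(0x1000 | READ_ALLOWED | WRITE_ALLOWED)); /* prints 0x1003 */
            return 0;
    }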
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 386780f0d1e1..83f22fe297c8 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h | |||
@@ -111,6 +111,8 @@ struct mlx5_ib_pd { | |||
111 | */ | 111 | */ |
112 | 112 | ||
113 | #define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START | 113 | #define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START |
114 | #define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) | ||
115 | #define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) | ||
114 | #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 | 116 | #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 |
115 | #define MLX5_IB_WR_UMR IB_WR_RESERVED1 | 117 | #define MLX5_IB_WR_UMR IB_WR_RESERVED1 |
116 | 118 | ||
@@ -147,6 +149,29 @@ enum { | |||
147 | MLX5_QP_EMPTY | 149 | MLX5_QP_EMPTY |
148 | }; | 150 | }; |
149 | 151 | ||
152 | /* | ||
153 | * Connect-IB can trigger up to four concurrent pagefaults | ||
154 | * per-QP. | ||
155 | */ | ||
156 | enum mlx5_ib_pagefault_context { | ||
157 | MLX5_IB_PAGEFAULT_RESPONDER_READ, | ||
158 | MLX5_IB_PAGEFAULT_REQUESTOR_READ, | ||
159 | MLX5_IB_PAGEFAULT_RESPONDER_WRITE, | ||
160 | MLX5_IB_PAGEFAULT_REQUESTOR_WRITE, | ||
161 | MLX5_IB_PAGEFAULT_CONTEXTS | ||
162 | }; | ||
163 | |||
164 | static inline enum mlx5_ib_pagefault_context | ||
165 | mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault) | ||
166 | { | ||
167 | return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE); | ||
168 | } | ||
169 | |||
170 | struct mlx5_ib_pfault { | ||
171 | struct work_struct work; | ||
172 | struct mlx5_pagefault mpfault; | ||
173 | }; | ||
174 | |||
150 | struct mlx5_ib_qp { | 175 | struct mlx5_ib_qp { |
151 | struct ib_qp ibqp; | 176 | struct ib_qp ibqp; |
152 | struct mlx5_core_qp mqp; | 177 | struct mlx5_core_qp mqp; |
@@ -192,6 +217,21 @@ struct mlx5_ib_qp { | |||
192 | 217 | ||
193 | /* Store signature errors */ | 218 | /* Store signature errors */ |
194 | bool signature_en; | 219 | bool signature_en; |
220 | |||
221 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
222 | /* | ||
223 | * A flag that is true for QP's that are in a state that doesn't | ||
224 | * allow page faults, and shouldn't schedule any more faults. | ||
225 | */ | ||
226 | int disable_page_faults; | ||
227 | /* | ||
228 | * The disable_page_faults_lock protects a QP's disable_page_faults | ||
229 | * field, allowing for a thread to atomically check whether the QP | ||
230 | * allows page faults, and if so schedule a page fault. | ||
231 | */ | ||
232 | spinlock_t disable_page_faults_lock; | ||
233 | struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS]; | ||
234 | #endif | ||
195 | }; | 235 | }; |
196 | 236 | ||
197 | struct mlx5_ib_cq_buf { | 237 | struct mlx5_ib_cq_buf { |
@@ -206,6 +246,19 @@ enum mlx5_ib_qp_flags { | |||
206 | MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, | 246 | MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, |
207 | }; | 247 | }; |
208 | 248 | ||
249 | struct mlx5_umr_wr { | ||
250 | union { | ||
251 | u64 virt_addr; | ||
252 | u64 offset; | ||
253 | } target; | ||
254 | struct ib_pd *pd; | ||
255 | unsigned int page_shift; | ||
256 | unsigned int npages; | ||
257 | u32 length; | ||
258 | int access_flags; | ||
259 | u32 mkey; | ||
260 | }; | ||
261 | |||
209 | struct mlx5_shared_mr_info { | 262 | struct mlx5_shared_mr_info { |
210 | int mr_id; | 263 | int mr_id; |
211 | struct ib_umem *umem; | 264 | struct ib_umem *umem; |
@@ -253,6 +306,13 @@ struct mlx5_ib_xrcd { | |||
253 | u32 xrcdn; | 306 | u32 xrcdn; |
254 | }; | 307 | }; |
255 | 308 | ||
309 | enum mlx5_ib_mtt_access_flags { | ||
310 | MLX5_IB_MTT_READ = (1 << 0), | ||
311 | MLX5_IB_MTT_WRITE = (1 << 1), | ||
312 | }; | ||
313 | |||
314 | #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) | ||
315 | |||
256 | struct mlx5_ib_mr { | 316 | struct mlx5_ib_mr { |
257 | struct ib_mr ibmr; | 317 | struct ib_mr ibmr; |
258 | struct mlx5_core_mr mmr; | 318 | struct mlx5_core_mr mmr; |
@@ -261,12 +321,11 @@ struct mlx5_ib_mr { | |||
261 | struct list_head list; | 321 | struct list_head list; |
262 | int order; | 322 | int order; |
263 | int umred; | 323 | int umred; |
264 | __be64 *pas; | ||
265 | dma_addr_t dma; | ||
266 | int npages; | 324 | int npages; |
267 | struct mlx5_ib_dev *dev; | 325 | struct mlx5_ib_dev *dev; |
268 | struct mlx5_create_mkey_mbox_out out; | 326 | struct mlx5_create_mkey_mbox_out out; |
269 | struct mlx5_core_sig_ctx *sig; | 327 | struct mlx5_core_sig_ctx *sig; |
328 | int live; | ||
270 | }; | 329 | }; |
271 | 330 | ||
272 | struct mlx5_ib_fast_reg_page_list { | 331 | struct mlx5_ib_fast_reg_page_list { |
@@ -372,11 +431,18 @@ struct mlx5_ib_dev { | |||
372 | struct umr_common umrc; | 431 | struct umr_common umrc; |
373 | /* sync used page count stats | 432 | /* sync used page count stats |
374 | */ | 433 | */ |
375 | spinlock_t mr_lock; | ||
376 | struct mlx5_ib_resources devr; | 434 | struct mlx5_ib_resources devr; |
377 | struct mlx5_mr_cache cache; | 435 | struct mlx5_mr_cache cache; |
378 | struct timer_list delay_timer; | 436 | struct timer_list delay_timer; |
379 | int fill_delay; | 437 | int fill_delay; |
438 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
439 | struct ib_odp_caps odp_caps; | ||
440 | /* | ||
441 | * Sleepable RCU that prevents destruction of MRs while they are still | ||
442 | * being used by a page fault handler. | ||
443 | */ | ||
444 | struct srcu_struct mr_srcu; | ||
445 | #endif | ||
380 | }; | 446 | }; |
381 | 447 | ||
382 | static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) | 448 | static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) |
@@ -490,6 +556,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, | |||
490 | int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, | 556 | int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, |
491 | struct ib_recv_wr **bad_wr); | 557 | struct ib_recv_wr **bad_wr); |
492 | void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); | 558 | void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); |
559 | int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, | ||
560 | void *buffer, u32 length); | ||
493 | struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, | 561 | struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, |
494 | int vector, struct ib_ucontext *context, | 562 | int vector, struct ib_ucontext *context, |
495 | struct ib_udata *udata); | 563 | struct ib_udata *udata); |
@@ -502,6 +570,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); | |||
502 | struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | 570 | struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, |
503 | u64 virt_addr, int access_flags, | 571 | u64 virt_addr, int access_flags, |
504 | struct ib_udata *udata); | 572 | struct ib_udata *udata); |
573 | int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, | ||
574 | int npages, int zap); | ||
505 | int mlx5_ib_dereg_mr(struct ib_mr *ibmr); | 575 | int mlx5_ib_dereg_mr(struct ib_mr *ibmr); |
506 | int mlx5_ib_destroy_mr(struct ib_mr *ibmr); | 576 | int mlx5_ib_destroy_mr(struct ib_mr *ibmr); |
507 | struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, | 577 | struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, |
@@ -533,8 +603,11 @@ int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); | |||
533 | void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); | 603 | void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); |
534 | void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, | 604 | void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, |
535 | int *ncont, int *order); | 605 | int *ncont, int *order); |
606 | void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | ||
607 | int page_shift, size_t offset, size_t num_pages, | ||
608 | __be64 *pas, int access_flags); | ||
536 | void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | 609 | void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, |
537 | int page_shift, __be64 *pas, int umr); | 610 | int page_shift, __be64 *pas, int access_flags); |
538 | void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); | 611 | void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); |
539 | int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); | 612 | int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); |
540 | int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); | 613 | int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); |
@@ -544,6 +617,38 @@ void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); | |||
544 | int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, | 617 | int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, |
545 | struct ib_mr_status *mr_status); | 618 | struct ib_mr_status *mr_status); |
546 | 619 | ||
620 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
621 | extern struct workqueue_struct *mlx5_ib_page_fault_wq; | ||
622 | |||
623 | int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev); | ||
624 | void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, | ||
625 | struct mlx5_ib_pfault *pfault); | ||
626 | void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp); | ||
627 | int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); | ||
628 | void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev); | ||
629 | int __init mlx5_ib_odp_init(void); | ||
630 | void mlx5_ib_odp_cleanup(void); | ||
631 | void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); | ||
632 | void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); | ||
633 | void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, | ||
634 | unsigned long end); | ||
635 | |||
636 | #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ | ||
637 | static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev) | ||
638 | { | ||
639 | return 0; | ||
640 | } | ||
641 | |||
642 | static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {} | ||
643 | static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } | ||
644 | static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} | ||
645 | static inline int mlx5_ib_odp_init(void) { return 0; } | ||
646 | static inline void mlx5_ib_odp_cleanup(void) {} | ||
647 | static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {} | ||
648 | static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} | ||
649 | |||
650 | #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ | ||
651 | |||
547 | static inline void init_query_mad(struct ib_smp *mad) | 652 | static inline void init_query_mad(struct ib_smp *mad) |
548 | { | 653 | { |
549 | mad->base_version = 1; | 654 | mad->base_version = 1; |
@@ -561,4 +666,7 @@ static inline u8 convert_access(int acc) | |||
561 | MLX5_PERM_LOCAL_READ; | 666 | MLX5_PERM_LOCAL_READ; |
562 | } | 667 | } |
563 | 668 | ||
669 | #define MLX5_MAX_UMR_SHIFT 16 | ||
670 | #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) | ||
671 | |||
564 | #endif /* MLX5_IB_H */ | 672 | #endif /* MLX5_IB_H */ |
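Editor's note: the four-entry pagefaults[] array added to struct mlx5_ib_qp is indexed by mlx5_ib_get_pagefault_context(), which simply masks the fault's flags with (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE); this only works because the enum above is ordered so that the requestor and write bits line up with the array index. A small sketch of that mapping, assuming the two flags are bits 0 and 1 (stand-ins; the real MLX5_PFAULT_* values are defined in the mlx5 core headers):

    #include <stdio.h>

    enum pf_ctx {                    /* mirrors enum mlx5_ib_pagefault_context */
            RESPONDER_READ,          /* 0: neither flag set   */
            REQUESTOR_READ,          /* 1: requestor bit only */
            RESPONDER_WRITE,         /* 2: write bit only     */
            REQUESTOR_WRITE,         /* 3: both bits set      */
            PF_CONTEXTS
    };

    #define PF_REQUESTOR (1 << 0)    /* assumed value of MLX5_PFAULT_REQUESTOR */
    #define PF_WRITE     (1 << 1)    /* assumed value of MLX5_PFAULT_WRITE     */

    static enum pf_ctx get_context(unsigned int flags)
    {
            /* Same masking trick as mlx5_ib_get_pagefault_context(). */
            return (enum pf_ctx)(flags & (PF_REQUESTOR | PF_WRITE));
    }

    int main(void)
    {
            printf("%d\n", get_context(0));                       /* RESPONDER_READ  */
            printf("%d\n", get_context(PF_REQUESTOR | PF_WRITE)); /* REQUESTOR_WRITE */
            return 0;
    }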
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 5a80dd993761..32a28bd50b20 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c | |||
@@ -37,21 +37,34 @@ | |||
37 | #include <linux/export.h> | 37 | #include <linux/export.h> |
38 | #include <linux/delay.h> | 38 | #include <linux/delay.h> |
39 | #include <rdma/ib_umem.h> | 39 | #include <rdma/ib_umem.h> |
40 | #include <rdma/ib_umem_odp.h> | ||
41 | #include <rdma/ib_verbs.h> | ||
40 | #include "mlx5_ib.h" | 42 | #include "mlx5_ib.h" |
41 | 43 | ||
42 | enum { | 44 | enum { |
43 | MAX_PENDING_REG_MR = 8, | 45 | MAX_PENDING_REG_MR = 8, |
44 | }; | 46 | }; |
45 | 47 | ||
46 | enum { | 48 | #define MLX5_UMR_ALIGN 2048 |
47 | MLX5_UMR_ALIGN = 2048 | 49 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING |
48 | }; | 50 | static __be64 mlx5_ib_update_mtt_emergency_buffer[ |
51 | MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)] | ||
52 | __aligned(MLX5_UMR_ALIGN); | ||
53 | static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex); | ||
54 | #endif | ||
55 | |||
56 | static int clean_mr(struct mlx5_ib_mr *mr); | ||
49 | 57 | ||
50 | static __be64 *mr_align(__be64 *ptr, int align) | 58 | static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) |
51 | { | 59 | { |
52 | unsigned long mask = align - 1; | 60 | int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); |
53 | 61 | ||
54 | return (__be64 *)(((unsigned long)ptr + mask) & ~mask); | 62 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING |
63 | /* Wait until all page fault handlers using the mr complete. */ | ||
64 | synchronize_srcu(&dev->mr_srcu); | ||
65 | #endif | ||
66 | |||
67 | return err; | ||
55 | } | 68 | } |
56 | 69 | ||
57 | static int order2idx(struct mlx5_ib_dev *dev, int order) | 70 | static int order2idx(struct mlx5_ib_dev *dev, int order) |
@@ -146,7 +159,7 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) | |||
146 | mr->order = ent->order; | 159 | mr->order = ent->order; |
147 | mr->umred = 1; | 160 | mr->umred = 1; |
148 | mr->dev = dev; | 161 | mr->dev = dev; |
149 | in->seg.status = 1 << 6; | 162 | in->seg.status = MLX5_MKEY_STATUS_FREE; |
150 | in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); | 163 | in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); |
151 | in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); | 164 | in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); |
152 | in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; | 165 | in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; |
@@ -191,7 +204,7 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) | |||
191 | ent->cur--; | 204 | ent->cur--; |
192 | ent->size--; | 205 | ent->size--; |
193 | spin_unlock_irq(&ent->lock); | 206 | spin_unlock_irq(&ent->lock); |
194 | err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); | 207 | err = destroy_mkey(dev, mr); |
195 | if (err) | 208 | if (err) |
196 | mlx5_ib_warn(dev, "failed destroy mkey\n"); | 209 | mlx5_ib_warn(dev, "failed destroy mkey\n"); |
197 | else | 210 | else |
@@ -482,7 +495,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c) | |||
482 | ent->cur--; | 495 | ent->cur--; |
483 | ent->size--; | 496 | ent->size--; |
484 | spin_unlock_irq(&ent->lock); | 497 | spin_unlock_irq(&ent->lock); |
485 | err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); | 498 | err = destroy_mkey(dev, mr); |
486 | if (err) | 499 | if (err) |
487 | mlx5_ib_warn(dev, "failed destroy mkey\n"); | 500 | mlx5_ib_warn(dev, "failed destroy mkey\n"); |
488 | else | 501 | else |
@@ -668,7 +681,7 @@ static int get_octo_len(u64 addr, u64 len, int page_size) | |||
668 | 681 | ||
669 | static int use_umr(int order) | 682 | static int use_umr(int order) |
670 | { | 683 | { |
671 | return order <= 17; | 684 | return order <= MLX5_MAX_UMR_SHIFT; |
672 | } | 685 | } |
673 | 686 | ||
674 | static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, | 687 | static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, |
@@ -678,6 +691,7 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, | |||
678 | { | 691 | { |
679 | struct mlx5_ib_dev *dev = to_mdev(pd->device); | 692 | struct mlx5_ib_dev *dev = to_mdev(pd->device); |
680 | struct ib_mr *mr = dev->umrc.mr; | 693 | struct ib_mr *mr = dev->umrc.mr; |
694 | struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; | ||
681 | 695 | ||
682 | sg->addr = dma; | 696 | sg->addr = dma; |
683 | sg->length = ALIGN(sizeof(u64) * n, 64); | 697 | sg->length = ALIGN(sizeof(u64) * n, 64); |
@@ -692,21 +706,24 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, | |||
692 | wr->num_sge = 0; | 706 | wr->num_sge = 0; |
693 | 707 | ||
694 | wr->opcode = MLX5_IB_WR_UMR; | 708 | wr->opcode = MLX5_IB_WR_UMR; |
695 | wr->wr.fast_reg.page_list_len = n; | 709 | |
696 | wr->wr.fast_reg.page_shift = page_shift; | 710 | umrwr->npages = n; |
697 | wr->wr.fast_reg.rkey = key; | 711 | umrwr->page_shift = page_shift; |
698 | wr->wr.fast_reg.iova_start = virt_addr; | 712 | umrwr->mkey = key; |
699 | wr->wr.fast_reg.length = len; | 713 | umrwr->target.virt_addr = virt_addr; |
700 | wr->wr.fast_reg.access_flags = access_flags; | 714 | umrwr->length = len; |
701 | wr->wr.fast_reg.page_list = (struct ib_fast_reg_page_list *)pd; | 715 | umrwr->access_flags = access_flags; |
716 | umrwr->pd = pd; | ||
702 | } | 717 | } |
703 | 718 | ||
704 | static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, | 719 | static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, |
705 | struct ib_send_wr *wr, u32 key) | 720 | struct ib_send_wr *wr, u32 key) |
706 | { | 721 | { |
707 | wr->send_flags = MLX5_IB_SEND_UMR_UNREG; | 722 | struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; |
723 | |||
724 | wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE; | ||
708 | wr->opcode = MLX5_IB_WR_UMR; | 725 | wr->opcode = MLX5_IB_WR_UMR; |
709 | wr->wr.fast_reg.rkey = key; | 726 | umrwr->mkey = key; |
710 | } | 727 | } |
711 | 728 | ||
712 | void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) | 729 | void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) |
@@ -742,7 +759,10 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, | |||
742 | struct ib_send_wr wr, *bad; | 759 | struct ib_send_wr wr, *bad; |
743 | struct mlx5_ib_mr *mr; | 760 | struct mlx5_ib_mr *mr; |
744 | struct ib_sge sg; | 761 | struct ib_sge sg; |
745 | int size = sizeof(u64) * npages; | 762 | int size; |
763 | __be64 *mr_pas; | ||
764 | __be64 *pas; | ||
765 | dma_addr_t dma; | ||
746 | int err = 0; | 766 | int err = 0; |
747 | int i; | 767 | int i; |
748 | 768 | ||
@@ -761,25 +781,31 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, | |||
761 | if (!mr) | 781 | if (!mr) |
762 | return ERR_PTR(-EAGAIN); | 782 | return ERR_PTR(-EAGAIN); |
763 | 783 | ||
764 | mr->pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); | 784 | /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. |
765 | if (!mr->pas) { | 785 | * To avoid copying garbage after the pas array, we allocate |
786 | * a little more. */ | ||
787 | size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); | ||
788 | mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); | ||
789 | if (!mr_pas) { | ||
766 | err = -ENOMEM; | 790 | err = -ENOMEM; |
767 | goto free_mr; | 791 | goto free_mr; |
768 | } | 792 | } |
769 | 793 | ||
770 | mlx5_ib_populate_pas(dev, umem, page_shift, | 794 | pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN); |
771 | mr_align(mr->pas, MLX5_UMR_ALIGN), 1); | 795 | mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); |
796 | /* Clear padding after the actual pages. */ | ||
797 | memset(pas + npages, 0, size - npages * sizeof(u64)); | ||
772 | 798 | ||
773 | mr->dma = dma_map_single(ddev, mr_align(mr->pas, MLX5_UMR_ALIGN), size, | 799 | dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); |
774 | DMA_TO_DEVICE); | 800 | if (dma_mapping_error(ddev, dma)) { |
775 | if (dma_mapping_error(ddev, mr->dma)) { | ||
776 | err = -ENOMEM; | 801 | err = -ENOMEM; |
777 | goto free_pas; | 802 | goto free_pas; |
778 | } | 803 | } |
779 | 804 | ||
780 | memset(&wr, 0, sizeof(wr)); | 805 | memset(&wr, 0, sizeof(wr)); |
781 | wr.wr_id = (u64)(unsigned long)&umr_context; | 806 | wr.wr_id = (u64)(unsigned long)&umr_context; |
782 | prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags); | 807 | prep_umr_reg_wqe(pd, &wr, &sg, dma, npages, mr->mmr.key, page_shift, |
808 | virt_addr, len, access_flags); | ||
783 | 809 | ||
784 | mlx5_ib_init_umr_context(&umr_context); | 810 | mlx5_ib_init_umr_context(&umr_context); |
785 | down(&umrc->sem); | 811 | down(&umrc->sem); |
@@ -799,12 +825,14 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, | |||
799 | mr->mmr.size = len; | 825 | mr->mmr.size = len; |
800 | mr->mmr.pd = to_mpd(pd)->pdn; | 826 | mr->mmr.pd = to_mpd(pd)->pdn; |
801 | 827 | ||
828 | mr->live = 1; | ||
829 | |||
802 | unmap_dma: | 830 | unmap_dma: |
803 | up(&umrc->sem); | 831 | up(&umrc->sem); |
804 | dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); | 832 | dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); |
805 | 833 | ||
806 | free_pas: | 834 | free_pas: |
807 | kfree(mr->pas); | 835 | kfree(mr_pas); |
808 | 836 | ||
809 | free_mr: | 837 | free_mr: |
810 | if (err) { | 838 | if (err) { |
@@ -815,6 +843,128 @@ free_mr: | |||
815 | return mr; | 843 | return mr; |
816 | } | 844 | } |
817 | 845 | ||
846 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
847 | int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, | ||
848 | int zap) | ||
849 | { | ||
850 | struct mlx5_ib_dev *dev = mr->dev; | ||
851 | struct device *ddev = dev->ib_dev.dma_device; | ||
852 | struct umr_common *umrc = &dev->umrc; | ||
853 | struct mlx5_ib_umr_context umr_context; | ||
854 | struct ib_umem *umem = mr->umem; | ||
855 | int size; | ||
856 | __be64 *pas; | ||
857 | dma_addr_t dma; | ||
858 | struct ib_send_wr wr, *bad; | ||
859 | struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr.wr.fast_reg; | ||
860 | struct ib_sge sg; | ||
861 | int err = 0; | ||
862 | const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64); | ||
863 | const int page_index_mask = page_index_alignment - 1; | ||
864 | size_t pages_mapped = 0; | ||
865 | size_t pages_to_map = 0; | ||
866 | size_t pages_iter = 0; | ||
867 | int use_emergency_buf = 0; | ||
868 | |||
869 | /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, | ||
870 | * so we need to align the offset and length accordingly */ | ||
871 | if (start_page_index & page_index_mask) { | ||
872 | npages += start_page_index & page_index_mask; | ||
873 | start_page_index &= ~page_index_mask; | ||
874 | } | ||
875 | |||
876 | pages_to_map = ALIGN(npages, page_index_alignment); | ||
877 | |||
878 | if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES) | ||
879 | return -EINVAL; | ||
880 | |||
881 | size = sizeof(u64) * pages_to_map; | ||
882 | size = min_t(int, PAGE_SIZE, size); | ||
883 | /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim | ||
884 | * code, when we are called from an invalidation. The pas buffer must | ||
885 | * be 2k-aligned for Connect-IB. */ | ||
886 | pas = (__be64 *)get_zeroed_page(GFP_ATOMIC); | ||
887 | if (!pas) { | ||
888 | mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n"); | ||
889 | pas = mlx5_ib_update_mtt_emergency_buffer; | ||
890 | size = MLX5_UMR_MTT_MIN_CHUNK_SIZE; | ||
891 | use_emergency_buf = 1; | ||
892 | mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex); | ||
893 | memset(pas, 0, size); | ||
894 | } | ||
895 | pages_iter = size / sizeof(u64); | ||
896 | dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); | ||
897 | if (dma_mapping_error(ddev, dma)) { | ||
898 | mlx5_ib_err(dev, "unable to map DMA during MTT update.\n"); | ||
899 | err = -ENOMEM; | ||
900 | goto free_pas; | ||
901 | } | ||
902 | |||
903 | for (pages_mapped = 0; | ||
904 | pages_mapped < pages_to_map && !err; | ||
905 | pages_mapped += pages_iter, start_page_index += pages_iter) { | ||
906 | dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); | ||
907 | |||
908 | npages = min_t(size_t, | ||
909 | pages_iter, | ||
910 | ib_umem_num_pages(umem) - start_page_index); | ||
911 | |||
912 | if (!zap) { | ||
913 | __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT, | ||
914 | start_page_index, npages, pas, | ||
915 | MLX5_IB_MTT_PRESENT); | ||
916 | /* Clear padding after the pages brought from the | ||
917 | * umem. */ | ||
918 | memset(pas + npages, 0, size - npages * sizeof(u64)); | ||
919 | } | ||
920 | |||
921 | dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); | ||
922 | |||
923 | memset(&wr, 0, sizeof(wr)); | ||
924 | wr.wr_id = (u64)(unsigned long)&umr_context; | ||
925 | |||
926 | sg.addr = dma; | ||
927 | sg.length = ALIGN(npages * sizeof(u64), | ||
928 | MLX5_UMR_MTT_ALIGNMENT); | ||
929 | sg.lkey = dev->umrc.mr->lkey; | ||
930 | |||
931 | wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | | ||
932 | MLX5_IB_SEND_UMR_UPDATE_MTT; | ||
933 | wr.sg_list = &sg; | ||
934 | wr.num_sge = 1; | ||
935 | wr.opcode = MLX5_IB_WR_UMR; | ||
936 | umrwr->npages = sg.length / sizeof(u64); | ||
937 | umrwr->page_shift = PAGE_SHIFT; | ||
938 | umrwr->mkey = mr->mmr.key; | ||
939 | umrwr->target.offset = start_page_index; | ||
940 | |||
941 | mlx5_ib_init_umr_context(&umr_context); | ||
942 | down(&umrc->sem); | ||
943 | err = ib_post_send(umrc->qp, &wr, &bad); | ||
944 | if (err) { | ||
945 | mlx5_ib_err(dev, "UMR post send failed, err %d\n", err); | ||
946 | } else { | ||
947 | wait_for_completion(&umr_context.done); | ||
948 | if (umr_context.status != IB_WC_SUCCESS) { | ||
949 | mlx5_ib_err(dev, "UMR completion failed, code %d\n", | ||
950 | umr_context.status); | ||
951 | err = -EFAULT; | ||
952 | } | ||
953 | } | ||
954 | up(&umrc->sem); | ||
955 | } | ||
956 | dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); | ||
957 | |||
958 | free_pas: | ||
959 | if (!use_emergency_buf) | ||
960 | free_page((unsigned long)pas); | ||
961 | else | ||
962 | mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex); | ||
963 | |||
964 | return err; | ||
965 | } | ||
966 | #endif | ||
967 | |||
818 | static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, | 968 | static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, |
819 | u64 length, struct ib_umem *umem, | 969 | u64 length, struct ib_umem *umem, |
820 | int npages, int page_shift, | 970 | int npages, int page_shift, |
@@ -825,6 +975,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, | |||
825 | struct mlx5_ib_mr *mr; | 975 | struct mlx5_ib_mr *mr; |
826 | int inlen; | 976 | int inlen; |
827 | int err; | 977 | int err; |
978 | bool pg_cap = !!(dev->mdev->caps.gen.flags & | ||
979 | MLX5_DEV_CAP_FLAG_ON_DMND_PG); | ||
828 | 980 | ||
829 | mr = kzalloc(sizeof(*mr), GFP_KERNEL); | 981 | mr = kzalloc(sizeof(*mr), GFP_KERNEL); |
830 | if (!mr) | 982 | if (!mr) |
@@ -836,8 +988,12 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, | |||
836 | err = -ENOMEM; | 988 | err = -ENOMEM; |
837 | goto err_1; | 989 | goto err_1; |
838 | } | 990 | } |
839 | mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, 0); | 991 | mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, |
992 | pg_cap ? MLX5_IB_MTT_PRESENT : 0); | ||
840 | 993 | ||
994 | /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags | ||
995 | * in the page list submitted with the command. */ | ||
996 | in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0; | ||
841 | in->seg.flags = convert_access(access_flags) | | 997 | in->seg.flags = convert_access(access_flags) | |
842 | MLX5_ACCESS_MODE_MTT; | 998 | MLX5_ACCESS_MODE_MTT; |
843 | in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); | 999 | in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); |
@@ -856,6 +1012,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, | |||
856 | goto err_2; | 1012 | goto err_2; |
857 | } | 1013 | } |
858 | mr->umem = umem; | 1014 | mr->umem = umem; |
1015 | mr->live = 1; | ||
859 | kvfree(in); | 1016 | kvfree(in); |
860 | 1017 | ||
861 | mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); | 1018 | mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); |
@@ -910,6 +1067,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
910 | mlx5_ib_dbg(dev, "cache empty for order %d", order); | 1067 | mlx5_ib_dbg(dev, "cache empty for order %d", order); |
911 | mr = NULL; | 1068 | mr = NULL; |
912 | } | 1069 | } |
1070 | } else if (access_flags & IB_ACCESS_ON_DEMAND) { | ||
1071 | err = -EINVAL; | ||
1072 | pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB\n"); | ||
1073 | goto error; | ||
913 | } | 1074 | } |
914 | 1075 | ||
915 | if (!mr) | 1076 | if (!mr) |
@@ -925,16 +1086,51 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
925 | 1086 | ||
926 | mr->umem = umem; | 1087 | mr->umem = umem; |
927 | mr->npages = npages; | 1088 | mr->npages = npages; |
928 | spin_lock(&dev->mr_lock); | 1089 | atomic_add(npages, &dev->mdev->priv.reg_pages); |
929 | dev->mdev->priv.reg_pages += npages; | ||
930 | spin_unlock(&dev->mr_lock); | ||
931 | mr->ibmr.lkey = mr->mmr.key; | 1090 | mr->ibmr.lkey = mr->mmr.key; |
932 | mr->ibmr.rkey = mr->mmr.key; | 1091 | mr->ibmr.rkey = mr->mmr.key; |
933 | 1092 | ||
1093 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
1094 | if (umem->odp_data) { | ||
1095 | /* | ||
1096 | * This barrier prevents the compiler from moving the | ||
1097 | * setting of umem->odp_data->private to point to our | ||
1098 | * MR, before reg_umr finished, to ensure that the MR | ||
1099 | * initialization have finished before starting to | ||
1100 | * handle invalidations. | ||
1101 | */ | ||
1102 | smp_wmb(); | ||
1103 | mr->umem->odp_data->private = mr; | ||
1104 | /* | ||
1105 | * Make sure we will see the new | ||
1106 | * umem->odp_data->private value in the invalidation | ||
1107 | * routines, before we can get page faults on the | ||
1108 | * MR. Page faults can happen once we put the MR in | ||
1109 | * the tree, below this line. Without the barrier, | ||
1110 | * there can be a fault handling and an invalidation | ||
1111 | * before umem->odp_data->private == mr is visible to | ||
1112 | * the invalidation handler. | ||
1113 | */ | ||
1114 | smp_wmb(); | ||
1115 | } | ||
1116 | #endif | ||
1117 | |||
934 | return &mr->ibmr; | 1118 | return &mr->ibmr; |
935 | 1119 | ||
936 | error: | 1120 | error: |
1121 | /* | ||
1122 | * Destroy the umem *before* destroying the MR, to ensure we | ||
1123 | * will not have any in-flight notifiers when destroying the | ||
1124 | * MR. | ||
1125 | * | ||
1126 | * As the MR is completely invalid to begin with, and this | ||
1127 | * error path is only taken if we can't push the mr entry into | ||
1128 | * the pagefault tree, this is safe. | ||
1129 | */ | ||
1130 | |||
937 | ib_umem_release(umem); | 1131 | ib_umem_release(umem); |
1132 | /* Kill the MR, and return an error code. */ | ||
1133 | clean_mr(mr); | ||
938 | return ERR_PTR(err); | 1134 | return ERR_PTR(err); |
939 | } | 1135 | } |
940 | 1136 | ||
@@ -971,17 +1167,14 @@ error: | |||
971 | return err; | 1167 | return err; |
972 | } | 1168 | } |
973 | 1169 | ||
974 | int mlx5_ib_dereg_mr(struct ib_mr *ibmr) | 1170 | static int clean_mr(struct mlx5_ib_mr *mr) |
975 | { | 1171 | { |
976 | struct mlx5_ib_dev *dev = to_mdev(ibmr->device); | 1172 | struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); |
977 | struct mlx5_ib_mr *mr = to_mmr(ibmr); | ||
978 | struct ib_umem *umem = mr->umem; | ||
979 | int npages = mr->npages; | ||
980 | int umred = mr->umred; | 1173 | int umred = mr->umred; |
981 | int err; | 1174 | int err; |
982 | 1175 | ||
983 | if (!umred) { | 1176 | if (!umred) { |
984 | err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); | 1177 | err = destroy_mkey(dev, mr); |
985 | if (err) { | 1178 | if (err) { |
986 | mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", | 1179 | mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", |
987 | mr->mmr.key, err); | 1180 | mr->mmr.key, err); |
@@ -996,15 +1189,47 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) | |||
996 | free_cached_mr(dev, mr); | 1189 | free_cached_mr(dev, mr); |
997 | } | 1190 | } |
998 | 1191 | ||
999 | if (umem) { | 1192 | if (!umred) |
1193 | kfree(mr); | ||
1194 | |||
1195 | return 0; | ||
1196 | } | ||
1197 | |||
1198 | int mlx5_ib_dereg_mr(struct ib_mr *ibmr) | ||
1199 | { | ||
1200 | struct mlx5_ib_dev *dev = to_mdev(ibmr->device); | ||
1201 | struct mlx5_ib_mr *mr = to_mmr(ibmr); | ||
1202 | int npages = mr->npages; | ||
1203 | struct ib_umem *umem = mr->umem; | ||
1204 | |||
1205 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
1206 | if (umem && umem->odp_data) { | ||
1207 | /* Prevent new page faults from succeeding */ | ||
1208 | mr->live = 0; | ||
1209 | /* Wait for all running page-fault handlers to finish. */ | ||
1210 | synchronize_srcu(&dev->mr_srcu); | ||
1211 | /* Destroy all page mappings */ | ||
1212 | mlx5_ib_invalidate_range(umem, ib_umem_start(umem), | ||
1213 | ib_umem_end(umem)); | ||
1214 | /* | ||
1215 | * We kill the umem before the MR for ODP, | ||
1216 | * so that there will not be any invalidations in | ||
1217 | * flight, looking at the *mr struct. | ||
1218 | */ | ||
1000 | ib_umem_release(umem); | 1219 | ib_umem_release(umem); |
1001 | spin_lock(&dev->mr_lock); | 1220 | atomic_sub(npages, &dev->mdev->priv.reg_pages); |
1002 | dev->mdev->priv.reg_pages -= npages; | 1221 | |
1003 | spin_unlock(&dev->mr_lock); | 1222 | /* Avoid double-freeing the umem. */ |
1223 | umem = NULL; | ||
1004 | } | 1224 | } |
1225 | #endif | ||
1005 | 1226 | ||
1006 | if (!umred) | 1227 | clean_mr(mr); |
1007 | kfree(mr); | 1228 | |
1229 | if (umem) { | ||
1230 | ib_umem_release(umem); | ||
1231 | atomic_sub(npages, &dev->mdev->priv.reg_pages); | ||
1232 | } | ||
1008 | 1233 | ||
1009 | return 0; | 1234 | return 0; |
1010 | } | 1235 | } |
@@ -1028,7 +1253,7 @@ struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, | |||
1028 | goto err_free; | 1253 | goto err_free; |
1029 | } | 1254 | } |
1030 | 1255 | ||
1031 | in->seg.status = 1 << 6; /* free */ | 1256 | in->seg.status = MLX5_MKEY_STATUS_FREE; |
1032 | in->seg.xlt_oct_size = cpu_to_be32(ndescs); | 1257 | in->seg.xlt_oct_size = cpu_to_be32(ndescs); |
1033 | in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); | 1258 | in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); |
1034 | in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); | 1259 | in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); |
@@ -1113,7 +1338,7 @@ int mlx5_ib_destroy_mr(struct ib_mr *ibmr) | |||
1113 | kfree(mr->sig); | 1338 | kfree(mr->sig); |
1114 | } | 1339 | } |
1115 | 1340 | ||
1116 | err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); | 1341 | err = destroy_mkey(dev, mr); |
1117 | if (err) { | 1342 | if (err) { |
1118 | mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", | 1343 | mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", |
1119 | mr->mmr.key, err); | 1344 | mr->mmr.key, err); |
@@ -1143,7 +1368,7 @@ struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, | |||
1143 | goto err_free; | 1368 | goto err_free; |
1144 | } | 1369 | } |
1145 | 1370 | ||
1146 | in->seg.status = 1 << 6; /* free */ | 1371 | in->seg.status = MLX5_MKEY_STATUS_FREE; |
1147 | in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2); | 1372 | in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2); |
1148 | in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); | 1373 | in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); |
1149 | in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; | 1374 | in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; |
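Editor's note: two patterns in the mr.c changes above are worth calling out. First, destroy_mkey() and mlx5_ib_dereg_mr() pair the mr->live flag with synchronize_srcu() so an mkey is never torn down while a page-fault handler may still be dereferencing the MR. Second, mlx5_ib_update_mtt() must not sleep in reclaim when called from an invalidation, so it tries get_zeroed_page(GFP_ATOMIC) first and falls back to a static, mutex-serialized emergency buffer. A stripped-down userspace sketch of that second, fallback pattern (pthread mutex and calloc stand in for the kernel mutex and page allocator; all names are hypothetical):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define CHUNK_SIZE 4096

    static char emergency_buf[CHUNK_SIZE];
    static pthread_mutex_t emergency_lock = PTHREAD_MUTEX_INITIALIZER;

    static void update_chunked(size_t total_bytes, int force_fallback)
    {
            char *buf = force_fallback ? NULL : calloc(1, CHUNK_SIZE);
            int use_emergency = 0;

            if (!buf) {
                    /* Allocation failed (or was skipped): fall back to the one
                     * preallocated buffer and serialize all of its users. */
                    pthread_mutex_lock(&emergency_lock);
                    buf = emergency_buf;
                    memset(buf, 0, CHUNK_SIZE);
                    use_emergency = 1;
            }

            for (size_t done = 0; done < total_bytes; done += CHUNK_SIZE) {
                    /* fill buf with the next chunk of entries and post it ... */
            }

            printf("updated %zu bytes using the %s buffer\n", total_bytes,
                   use_emergency ? "emergency" : "private");

            if (use_emergency)
                    pthread_mutex_unlock(&emergency_lock);
            else
                    free(buf);
    }

    int main(void)
    {
            update_chunked(3 * CHUNK_SIZE, 0);
            update_chunked(3 * CHUNK_SIZE, 1);
            return 0;
    }

Serializing on the shared buffer trades throughput for guaranteed forward progress when memory is tight, which matches the driver's warning about falling back to a slower chunked mechanism.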
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c new file mode 100644 index 000000000000..a2c541c4809a --- /dev/null +++ b/drivers/infiniband/hw/mlx5/odp.c | |||
@@ -0,0 +1,798 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2014 Mellanox Technologies. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | */ | ||
32 | |||
33 | #include <rdma/ib_umem.h> | ||
34 | #include <rdma/ib_umem_odp.h> | ||
35 | |||
36 | #include "mlx5_ib.h" | ||
37 | |||
38 | #define MAX_PREFETCH_LEN (4*1024*1024U) | ||
39 | |||
40 | /* Timeout in ms to wait for an active mmu notifier to complete when handling | ||
41 | * a pagefault. */ | ||
42 | #define MMU_NOTIFIER_TIMEOUT 1000 | ||
43 | |||
44 | struct workqueue_struct *mlx5_ib_page_fault_wq; | ||
45 | |||
46 | void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, | ||
47 | unsigned long end) | ||
48 | { | ||
49 | struct mlx5_ib_mr *mr; | ||
50 | const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1; | ||
51 | u64 idx = 0, blk_start_idx = 0; | ||
52 | int in_block = 0; | ||
53 | u64 addr; | ||
54 | |||
55 | if (!umem || !umem->odp_data) { | ||
56 | pr_err("invalidation called on NULL umem or non-ODP umem\n"); | ||
57 | return; | ||
58 | } | ||
59 | |||
60 | mr = umem->odp_data->private; | ||
61 | |||
62 | if (!mr || !mr->ibmr.pd) | ||
63 | return; | ||
64 | |||
65 | start = max_t(u64, ib_umem_start(umem), start); | ||
66 | end = min_t(u64, ib_umem_end(umem), end); | ||
67 | |||
68 | /* | ||
69 | * Iteration one - zap the HW's MTTs. The notifiers_count ensures that | ||
70 | * while we are doing the invalidation, no page fault will attempt to | ||
71 | * overwrite the same MTTs. Concurrent invalidations might race us, | ||
72 | * but they will write 0s as well, so no difference in the end result. | ||
73 | */ | ||
74 | |||
75 | for (addr = start; addr < end; addr += (u64)umem->page_size) { | ||
76 | idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; | ||
77 | /* | ||
78 | * Strive to write the MTTs in chunks, but avoid overwriting | ||
79 | * non-existing MTTs. The heuristic here can be improved to | ||
80 | * estimate the cost of another UMR vs. the cost of bigger | ||
81 | * UMR. | ||
82 | */ | ||
83 | if (umem->odp_data->dma_list[idx] & | ||
84 | (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { | ||
85 | if (!in_block) { | ||
86 | blk_start_idx = idx; | ||
87 | in_block = 1; | ||
88 | } | ||
89 | } else { | ||
90 | u64 umr_offset = idx & umr_block_mask; | ||
91 | |||
92 | if (in_block && umr_offset == 0) { | ||
93 | mlx5_ib_update_mtt(mr, blk_start_idx, | ||
94 | idx - blk_start_idx, 1); | ||
95 | in_block = 0; | ||
96 | } | ||
97 | } | ||
98 | } | ||
99 | if (in_block) | ||
100 | mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1, | ||
101 | 1); | ||
102 | |||
103 | /* | ||
104 | * We are now sure that the device will not access the | ||
105 | * memory. We can safely unmap it, and mark it as dirty if | ||
106 | * needed. | ||
107 | */ | ||
108 | |||
109 | ib_umem_odp_unmap_dma_pages(umem, start, end); | ||
110 | } | ||
111 | |||
112 | #define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do { \ | ||
113 | if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name) \ | ||
114 | ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name; \ | ||
115 | } while (0) | ||
116 | |||
117 | int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev) | ||
118 | { | ||
119 | int err; | ||
120 | struct mlx5_odp_caps hw_caps; | ||
121 | struct ib_odp_caps *caps = &dev->odp_caps; | ||
122 | |||
123 | memset(caps, 0, sizeof(*caps)); | ||
124 | |||
125 | if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)) | ||
126 | return 0; | ||
127 | |||
128 | err = mlx5_query_odp_caps(dev->mdev, &hw_caps); | ||
129 | if (err) | ||
130 | goto out; | ||
131 | |||
132 | caps->general_caps = IB_ODP_SUPPORT; | ||
133 | COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps, | ||
134 | SEND); | ||
135 | COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, | ||
136 | SEND); | ||
137 | COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, | ||
138 | RECV); | ||
139 | COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, | ||
140 | WRITE); | ||
141 | COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, | ||
142 | READ); | ||
143 | |||
144 | out: | ||
145 | return err; | ||
146 | } | ||
147 | |||
148 | static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, | ||
149 | u32 key) | ||
150 | { | ||
151 | u32 base_key = mlx5_base_mkey(key); | ||
152 | struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key); | ||
153 | struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr); | ||
154 | |||
155 | if (!mmr || mmr->key != key || !mr->live) | ||
156 | return NULL; | ||
157 | |||
158 | return container_of(mmr, struct mlx5_ib_mr, mmr); | ||
159 | } | ||
160 | |||
161 | static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, | ||
162 | struct mlx5_ib_pfault *pfault, | ||
163 | int error) { | ||
164 | struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); | ||
165 | int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn, | ||
166 | pfault->mpfault.flags, | ||
167 | error); | ||
168 | if (ret) | ||
169 | pr_err("Failed to resolve the page fault on QP 0x%x\n", | ||
170 | qp->mqp.qpn); | ||
171 | } | ||
172 | |||
173 | /* | ||
174 | * Handle a single data segment in a page-fault WQE. | ||
175 | * | ||
176 | * Returns number of pages retrieved on success. The caller will continue to | ||
177 | * the next data segment. | ||
178 | * Can return the following error codes: | ||
179 | * -EAGAIN to designate a temporary error. The caller will abort handling the | ||
180 | * page fault and resolve it. | ||
181 | * -EFAULT when there's an error mapping the requested pages. The caller will | ||
182 | * abort the page fault handling and possibly move the QP to an error state. | ||
183 | * On other errors the QP should also be closed with an error. | ||
184 | */ | ||
185 | static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, | ||
186 | struct mlx5_ib_pfault *pfault, | ||
187 | u32 key, u64 io_virt, size_t bcnt, | ||
188 | u32 *bytes_mapped) | ||
189 | { | ||
190 | struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device); | ||
191 | int srcu_key; | ||
192 | unsigned int current_seq; | ||
193 | u64 start_idx; | ||
194 | int npages = 0, ret = 0; | ||
195 | struct mlx5_ib_mr *mr; | ||
196 | u64 access_mask = ODP_READ_ALLOWED_BIT; | ||
197 | |||
198 | srcu_key = srcu_read_lock(&mib_dev->mr_srcu); | ||
199 | mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key); | ||
200 | /* | ||
201 | * If we didn't find the MR, it means the MR was closed while we were | ||
202 | * handling the ODP event. In this case we return -EFAULT so that the | ||
203 | * QP will be closed. | ||
204 | */ | ||
205 | if (!mr || !mr->ibmr.pd) { | ||
206 | pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n", | ||
207 | key); | ||
208 | ret = -EFAULT; | ||
209 | goto srcu_unlock; | ||
210 | } | ||
211 | if (!mr->umem->odp_data) { | ||
212 | pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", | ||
213 | key); | ||
214 | if (bytes_mapped) | ||
215 | *bytes_mapped += | ||
216 | (bcnt - pfault->mpfault.bytes_committed); | ||
217 | goto srcu_unlock; | ||
218 | } | ||
219 | if (mr->ibmr.pd != qp->ibqp.pd) { | ||
220 | pr_err("Page-fault with different PDs for QP and MR.\n"); | ||
221 | ret = -EFAULT; | ||
222 | goto srcu_unlock; | ||
223 | } | ||
224 | |||
225 | current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq); | ||
226 | /* | ||
227 | * Ensure the sequence number is valid for some time before we call | ||
228 | * gup. | ||
229 | */ | ||
230 | smp_rmb(); | ||
231 | |||
232 | /* | ||
233 | * Avoid branches - this code will perform correctly | ||
234 | * in all iterations (in iteration 2 and above, | ||
235 | * bytes_committed == 0). | ||
236 | */ | ||
237 | io_virt += pfault->mpfault.bytes_committed; | ||
238 | bcnt -= pfault->mpfault.bytes_committed; | ||
239 | |||
240 | start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT; | ||
241 | |||
242 | if (mr->umem->writable) | ||
243 | access_mask |= ODP_WRITE_ALLOWED_BIT; | ||
244 | npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt, | ||
245 | access_mask, current_seq); | ||
246 | if (npages < 0) { | ||
247 | ret = npages; | ||
248 | goto srcu_unlock; | ||
249 | } | ||
250 | |||
251 | if (npages > 0) { | ||
252 | mutex_lock(&mr->umem->odp_data->umem_mutex); | ||
253 | if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { | ||
254 | /* | ||
255 | * No need to check whether the MTTs really belong to | ||
256 | * this MR, since ib_umem_odp_map_dma_pages already | ||
257 | * checks this. | ||
258 | */ | ||
259 | ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0); | ||
260 | } else { | ||
261 | ret = -EAGAIN; | ||
262 | } | ||
263 | mutex_unlock(&mr->umem->odp_data->umem_mutex); | ||
264 | if (ret < 0) { | ||
265 | if (ret != -EAGAIN) | ||
266 | pr_err("Failed to update mkey page tables\n"); | ||
267 | goto srcu_unlock; | ||
268 | } | ||
269 | |||
270 | if (bytes_mapped) { | ||
271 | u32 new_mappings = npages * PAGE_SIZE - | ||
272 | (io_virt - round_down(io_virt, PAGE_SIZE)); | ||
273 | *bytes_mapped += min_t(u32, new_mappings, bcnt); | ||
274 | } | ||
275 | } | ||
276 | |||
277 | srcu_unlock: | ||
278 | if (ret == -EAGAIN) { | ||
279 | if (!mr->umem->odp_data->dying) { | ||
280 | struct ib_umem_odp *odp_data = mr->umem->odp_data; | ||
281 | unsigned long timeout = | ||
282 | msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); | ||
283 | |||
284 | if (!wait_for_completion_timeout( | ||
285 | &odp_data->notifier_completion, | ||
286 | timeout)) { | ||
287 | pr_warn("timeout waiting for mmu notifier completion\n"); | ||
288 | } | ||
289 | } else { | ||
290 | /* The MR is being killed, kill the QP as well. */ | ||
291 | ret = -EFAULT; | ||
292 | } | ||
293 | } | ||
294 | srcu_read_unlock(&mib_dev->mr_srcu, srcu_key); | ||
295 | pfault->mpfault.bytes_committed = 0; | ||
296 | return ret ? ret : npages; | ||
297 | } | ||
298 | |||
299 | /** | ||
300 | * Parse a series of data segments for page fault handling. | ||
301 | * | ||
302 | * @qp: the QP on which the fault occurred. | ||
303 | * @pfault: contains page fault information. | ||
304 | * @wqe: points at the first data segment in the WQE. | ||
305 | * @wqe_end: points after the end of the WQE. | ||
306 | * @bytes_mapped: receives the number of bytes that the function was able to | ||
307 | * map. This allows the caller to decide intelligently whether | ||
308 | * enough memory was mapped to resolve the page fault | ||
309 | * successfully (e.g. enough for the next MTU, or the entire | ||
310 | * WQE). | ||
311 | * @total_wqe_bytes: receives the total data size of this WQE in bytes (minus | ||
312 | * the committed bytes). | ||
313 | * | ||
314 | * Returns the number of pages loaded if positive, zero for an empty WQE, or a | ||
315 | * negative error code. | ||
316 | */ | ||
317 | static int pagefault_data_segments(struct mlx5_ib_qp *qp, | ||
318 | struct mlx5_ib_pfault *pfault, void *wqe, | ||
319 | void *wqe_end, u32 *bytes_mapped, | ||
320 | u32 *total_wqe_bytes, int receive_queue) | ||
321 | { | ||
322 | int ret = 0, npages = 0; | ||
323 | u64 io_virt; | ||
324 | u32 key; | ||
325 | u32 byte_count; | ||
326 | size_t bcnt; | ||
327 | int inline_segment; | ||
328 | |||
329 | /* Skip SRQ next-WQE segment. */ | ||
330 | if (receive_queue && qp->ibqp.srq) | ||
331 | wqe += sizeof(struct mlx5_wqe_srq_next_seg); | ||
332 | |||
333 | if (bytes_mapped) | ||
334 | *bytes_mapped = 0; | ||
335 | if (total_wqe_bytes) | ||
336 | *total_wqe_bytes = 0; | ||
337 | |||
338 | while (wqe < wqe_end) { | ||
339 | struct mlx5_wqe_data_seg *dseg = wqe; | ||
340 | |||
341 | io_virt = be64_to_cpu(dseg->addr); | ||
342 | key = be32_to_cpu(dseg->lkey); | ||
343 | byte_count = be32_to_cpu(dseg->byte_count); | ||
344 | inline_segment = !!(byte_count & MLX5_INLINE_SEG); | ||
345 | bcnt = byte_count & ~MLX5_INLINE_SEG; | ||
346 | |||
347 | if (inline_segment) { | ||
348 | bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK; | ||
349 | wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt, | ||
350 | 16); | ||
351 | } else { | ||
352 | wqe += sizeof(*dseg); | ||
353 | } | ||
354 | |||
355 | /* receive WQE end of sg list. */ | ||
356 | if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && | ||
357 | io_virt == 0) | ||
358 | break; | ||
359 | |||
360 | if (!inline_segment && total_wqe_bytes) { | ||
361 | *total_wqe_bytes += bcnt - min_t(size_t, bcnt, | ||
362 | pfault->mpfault.bytes_committed); | ||
363 | } | ||
364 | |||
365 | /* A zero length data segment designates a length of 2GB. */ | ||
366 | if (bcnt == 0) | ||
367 | bcnt = 1U << 31; | ||
368 | |||
369 | if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) { | ||
370 | pfault->mpfault.bytes_committed -= | ||
371 | min_t(size_t, bcnt, | ||
372 | pfault->mpfault.bytes_committed); | ||
373 | continue; | ||
374 | } | ||
375 | |||
376 | ret = pagefault_single_data_segment(qp, pfault, key, io_virt, | ||
377 | bcnt, bytes_mapped); | ||
378 | if (ret < 0) | ||
379 | break; | ||
380 | npages += ret; | ||
381 | } | ||
382 | |||
383 | return ret < 0 ? ret : npages; | ||
384 | } | ||
385 | |||
386 | /* | ||
387 | * Parse initiator WQE. Advances the wqe pointer to point at the | ||
388 | * scatter-gather list, and sets wqe_end to the end of the WQE. | ||
389 | */ | ||
390 | static int mlx5_ib_mr_initiator_pfault_handler( | ||
391 | struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, | ||
392 | void **wqe, void **wqe_end, int wqe_length) | ||
393 | { | ||
394 | struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); | ||
395 | struct mlx5_wqe_ctrl_seg *ctrl = *wqe; | ||
396 | u16 wqe_index = pfault->mpfault.wqe.wqe_index; | ||
397 | unsigned ds, opcode; | ||
398 | #if defined(DEBUG) | ||
399 | u32 ctrl_wqe_index, ctrl_qpn; | ||
400 | #endif | ||
401 | |||
402 | ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; | ||
403 | if (ds * MLX5_WQE_DS_UNITS > wqe_length) { | ||
404 | mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n", | ||
405 | ds, wqe_length); | ||
406 | return -EFAULT; | ||
407 | } | ||
408 | |||
409 | if (ds == 0) { | ||
410 | mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n", | ||
411 | wqe_index, qp->mqp.qpn); | ||
412 | return -EFAULT; | ||
413 | } | ||
414 | |||
415 | #if defined(DEBUG) | ||
416 | ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) & | ||
417 | MLX5_WQE_CTRL_WQE_INDEX_MASK) >> | ||
418 | MLX5_WQE_CTRL_WQE_INDEX_SHIFT; | ||
419 | if (wqe_index != ctrl_wqe_index) { | ||
420 | mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n", | ||
421 | wqe_index, qp->mqp.qpn, | ||
422 | ctrl_wqe_index); | ||
423 | return -EFAULT; | ||
424 | } | ||
425 | |||
426 | ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >> | ||
427 | MLX5_WQE_CTRL_QPN_SHIFT; | ||
428 | if (qp->mqp.qpn != ctrl_qpn) { | ||
429 | mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n", | ||
430 | wqe_index, qp->mqp.qpn, | ||
431 | ctrl_qpn); | ||
432 | return -EFAULT; | ||
433 | } | ||
434 | #endif /* DEBUG */ | ||
435 | |||
436 | *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; | ||
437 | *wqe += sizeof(*ctrl); | ||
438 | |||
439 | opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & | ||
440 | MLX5_WQE_CTRL_OPCODE_MASK; | ||
441 | switch (qp->ibqp.qp_type) { | ||
442 | case IB_QPT_RC: | ||
443 | switch (opcode) { | ||
444 | case MLX5_OPCODE_SEND: | ||
445 | case MLX5_OPCODE_SEND_IMM: | ||
446 | case MLX5_OPCODE_SEND_INVAL: | ||
447 | if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & | ||
448 | IB_ODP_SUPPORT_SEND)) | ||
449 | goto invalid_transport_or_opcode; | ||
450 | break; | ||
451 | case MLX5_OPCODE_RDMA_WRITE: | ||
452 | case MLX5_OPCODE_RDMA_WRITE_IMM: | ||
453 | if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & | ||
454 | IB_ODP_SUPPORT_WRITE)) | ||
455 | goto invalid_transport_or_opcode; | ||
456 | *wqe += sizeof(struct mlx5_wqe_raddr_seg); | ||
457 | break; | ||
458 | case MLX5_OPCODE_RDMA_READ: | ||
459 | if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & | ||
460 | IB_ODP_SUPPORT_READ)) | ||
461 | goto invalid_transport_or_opcode; | ||
462 | *wqe += sizeof(struct mlx5_wqe_raddr_seg); | ||
463 | break; | ||
464 | default: | ||
465 | goto invalid_transport_or_opcode; | ||
466 | } | ||
467 | break; | ||
468 | case IB_QPT_UD: | ||
469 | switch (opcode) { | ||
470 | case MLX5_OPCODE_SEND: | ||
471 | case MLX5_OPCODE_SEND_IMM: | ||
472 | if (!(dev->odp_caps.per_transport_caps.ud_odp_caps & | ||
473 | IB_ODP_SUPPORT_SEND)) | ||
474 | goto invalid_transport_or_opcode; | ||
475 | *wqe += sizeof(struct mlx5_wqe_datagram_seg); | ||
476 | break; | ||
477 | default: | ||
478 | goto invalid_transport_or_opcode; | ||
479 | } | ||
480 | break; | ||
481 | default: | ||
482 | invalid_transport_or_opcode: | ||
483 | mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n", | ||
484 | qp->ibqp.qp_type, opcode); | ||
485 | return -EFAULT; | ||
486 | } | ||
487 | |||
488 | return 0; | ||
489 | } | ||
490 | |||
491 | /* | ||
492 | * Parse responder WQE. Advances the wqe pointer to point at the | ||
493 | * scatter-gather list, and sets wqe_end to the end of the WQE. | ||
494 | */ | ||
495 | static int mlx5_ib_mr_responder_pfault_handler( | ||
496 | struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, | ||
497 | void **wqe, void **wqe_end, int wqe_length) | ||
498 | { | ||
499 | struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); | ||
500 | struct mlx5_ib_wq *wq = &qp->rq; | ||
501 | int wqe_size = 1 << wq->wqe_shift; | ||
502 | |||
503 | if (qp->ibqp.srq) { | ||
504 | mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n"); | ||
505 | return -EFAULT; | ||
506 | } | ||
507 | |||
508 | if (qp->wq_sig) { | ||
509 | mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n"); | ||
510 | return -EFAULT; | ||
511 | } | ||
512 | |||
513 | if (wqe_size > wqe_length) { | ||
514 | mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); | ||
515 | return -EFAULT; | ||
516 | } | ||
517 | |||
518 | switch (qp->ibqp.qp_type) { | ||
519 | case IB_QPT_RC: | ||
520 | if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & | ||
521 | IB_ODP_SUPPORT_RECV)) | ||
522 | goto invalid_transport_or_opcode; | ||
523 | break; | ||
524 | default: | ||
525 | invalid_transport_or_opcode: | ||
526 | mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n", | ||
527 | qp->ibqp.qp_type); | ||
528 | return -EFAULT; | ||
529 | } | ||
530 | |||
531 | *wqe_end = *wqe + wqe_size; | ||
532 | |||
533 | return 0; | ||
534 | } | ||
535 | |||
536 | static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, | ||
537 | struct mlx5_ib_pfault *pfault) | ||
538 | { | ||
539 | struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); | ||
540 | int ret; | ||
541 | void *wqe, *wqe_end; | ||
542 | u32 bytes_mapped, total_wqe_bytes; | ||
543 | char *buffer = NULL; | ||
544 | int resume_with_error = 0; | ||
545 | u16 wqe_index = pfault->mpfault.wqe.wqe_index; | ||
546 | int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR; | ||
547 | |||
548 | buffer = (char *)__get_free_page(GFP_KERNEL); | ||
549 | if (!buffer) { | ||
550 | mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); | ||
551 | resume_with_error = 1; | ||
552 | goto resolve_page_fault; | ||
553 | } | ||
554 | |||
555 | ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, | ||
556 | PAGE_SIZE); | ||
557 | if (ret < 0) { | ||
558 | mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n", | ||
559 | -ret, wqe_index, qp->mqp.qpn); | ||
560 | resume_with_error = 1; | ||
561 | goto resolve_page_fault; | ||
562 | } | ||
563 | |||
564 | wqe = buffer; | ||
565 | if (requestor) | ||
566 | ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe, | ||
567 | &wqe_end, ret); | ||
568 | else | ||
569 | ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe, | ||
570 | &wqe_end, ret); | ||
571 | if (ret < 0) { | ||
572 | resume_with_error = 1; | ||
573 | goto resolve_page_fault; | ||
574 | } | ||
575 | |||
576 | if (wqe >= wqe_end) { | ||
577 | mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); | ||
578 | resume_with_error = 1; | ||
579 | goto resolve_page_fault; | ||
580 | } | ||
581 | |||
582 | ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped, | ||
583 | &total_wqe_bytes, !requestor); | ||
584 | if (ret == -EAGAIN) { | ||
585 | goto resolve_page_fault; | ||
586 | } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { | ||
587 | mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n", | ||
588 | -ret); | ||
589 | resume_with_error = 1; | ||
590 | goto resolve_page_fault; | ||
591 | } | ||
592 | |||
593 | resolve_page_fault: | ||
594 | mlx5_ib_page_fault_resume(qp, pfault, resume_with_error); | ||
595 | mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n", | ||
596 | qp->mqp.qpn, resume_with_error, pfault->mpfault.flags); | ||
597 | |||
598 | free_page((unsigned long)buffer); | ||
599 | } | ||
600 | |||
601 | static int pages_in_range(u64 address, u32 length) | ||
602 | { | ||
603 | return (ALIGN(address + length, PAGE_SIZE) - | ||
604 | (address & PAGE_MASK)) >> PAGE_SHIFT; | ||
605 | } | ||
606 | |||
607 | static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp, | ||
608 | struct mlx5_ib_pfault *pfault) | ||
609 | { | ||
610 | struct mlx5_pagefault *mpfault = &pfault->mpfault; | ||
611 | u64 address; | ||
612 | u32 length; | ||
613 | u32 prefetch_len = mpfault->bytes_committed; | ||
614 | int prefetch_activated = 0; | ||
615 | u32 rkey = mpfault->rdma.r_key; | ||
616 | int ret; | ||
617 | |||
618 | /* The RDMA responder handler handles the page fault in two parts. | ||
619 | * First it brings the necessary pages for the current packet | ||
620 | * (and uses the pfault context), and then (after resuming the QP) | ||
621 | * prefetches more pages. The second operation cannot use the pfault | ||
622 | * context and therefore uses the dummy_pfault context allocated on | ||
623 | * the stack */ | ||
624 | struct mlx5_ib_pfault dummy_pfault = {}; | ||
625 | |||
626 | dummy_pfault.mpfault.bytes_committed = 0; | ||
627 | |||
628 | mpfault->rdma.rdma_va += mpfault->bytes_committed; | ||
629 | mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed, | ||
630 | mpfault->rdma.rdma_op_len); | ||
631 | mpfault->bytes_committed = 0; | ||
632 | |||
633 | address = mpfault->rdma.rdma_va; | ||
634 | length = mpfault->rdma.rdma_op_len; | ||
635 | |||
636 | /* For some operations, the hardware cannot tell the exact message | ||
637 | * length, and in those cases it reports zero. Use prefetch | ||
638 | * logic. */ | ||
639 | if (length == 0) { | ||
640 | prefetch_activated = 1; | ||
641 | length = mpfault->rdma.packet_size; | ||
642 | prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); | ||
643 | } | ||
644 | |||
645 | ret = pagefault_single_data_segment(qp, pfault, rkey, address, length, | ||
646 | NULL); | ||
647 | if (ret == -EAGAIN) { | ||
648 | /* We're racing with an invalidation, don't prefetch */ | ||
649 | prefetch_activated = 0; | ||
650 | } else if (ret < 0 || pages_in_range(address, length) > ret) { | ||
651 | mlx5_ib_page_fault_resume(qp, pfault, 1); | ||
652 | return; | ||
653 | } | ||
654 | |||
655 | mlx5_ib_page_fault_resume(qp, pfault, 0); | ||
656 | |||
657 | /* At this point, there might be a new pagefault already arriving in | ||
658 | * the eq, switch to the dummy pagefault for the rest of the | ||
659 | * processing. We're still OK with the objects being alive as the | ||
660 | * work-queue is being fenced. */ | ||
661 | |||
662 | if (prefetch_activated) { | ||
663 | ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey, | ||
664 | address, | ||
665 | prefetch_len, | ||
666 | NULL); | ||
667 | if (ret < 0) { | ||
668 | pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n", | ||
669 | ret, prefetch_activated, | ||
670 | qp->ibqp.qp_num, address, prefetch_len); | ||
671 | } | ||
672 | } | ||
673 | } | ||
674 | |||
675 | void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, | ||
676 | struct mlx5_ib_pfault *pfault) | ||
677 | { | ||
678 | u8 event_subtype = pfault->mpfault.event_subtype; | ||
679 | |||
680 | switch (event_subtype) { | ||
681 | case MLX5_PFAULT_SUBTYPE_WQE: | ||
682 | mlx5_ib_mr_wqe_pfault_handler(qp, pfault); | ||
683 | break; | ||
684 | case MLX5_PFAULT_SUBTYPE_RDMA: | ||
685 | mlx5_ib_mr_rdma_pfault_handler(qp, pfault); | ||
686 | break; | ||
687 | default: | ||
688 | pr_warn("Invalid page fault event subtype: 0x%x\n", | ||
689 | event_subtype); | ||
690 | mlx5_ib_page_fault_resume(qp, pfault, 1); | ||
691 | break; | ||
692 | } | ||
693 | } | ||
694 | |||
695 | static void mlx5_ib_qp_pfault_action(struct work_struct *work) | ||
696 | { | ||
697 | struct mlx5_ib_pfault *pfault = container_of(work, | ||
698 | struct mlx5_ib_pfault, | ||
699 | work); | ||
700 | enum mlx5_ib_pagefault_context context = | ||
701 | mlx5_ib_get_pagefault_context(&pfault->mpfault); | ||
702 | struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp, | ||
703 | pagefaults[context]); | ||
704 | mlx5_ib_mr_pfault_handler(qp, pfault); | ||
705 | } | ||
706 | |||
707 | void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) | ||
708 | { | ||
709 | unsigned long flags; | ||
710 | |||
711 | spin_lock_irqsave(&qp->disable_page_faults_lock, flags); | ||
712 | qp->disable_page_faults = 1; | ||
713 | spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); | ||
714 | |||
715 | /* | ||
716 | * Note that at this point, we are guaranteed that no more | ||
717 | * work queue elements will be posted to the work queue for | ||
718 | * the QP we are closing. | ||
719 | */ | ||
720 | flush_workqueue(mlx5_ib_page_fault_wq); | ||
721 | } | ||
722 | |||
723 | void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) | ||
724 | { | ||
725 | unsigned long flags; | ||
726 | |||
727 | spin_lock_irqsave(&qp->disable_page_faults_lock, flags); | ||
728 | qp->disable_page_faults = 0; | ||
729 | spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); | ||
730 | } | ||
731 | |||
732 | static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp, | ||
733 | struct mlx5_pagefault *pfault) | ||
734 | { | ||
735 | /* | ||
736 | * Note that we will only get one fault event per QP per context | ||
737 | * (responder/initiator, read/write), until we resolve the page fault | ||
738 | * with the mlx5_ib_page_fault_resume command. Since this function is | ||
739 | * called from within the work element, there is no risk of missing | ||
740 | * events. | ||
741 | */ | ||
742 | struct mlx5_ib_qp *mibqp = to_mibqp(qp); | ||
743 | enum mlx5_ib_pagefault_context context = | ||
744 | mlx5_ib_get_pagefault_context(pfault); | ||
745 | struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context]; | ||
746 | |||
747 | qp_pfault->mpfault = *pfault; | ||
748 | |||
749 | /* No need to disable interrupts here since we are in interrupt context */ | ||
750 | spin_lock(&mibqp->disable_page_faults_lock); | ||
751 | if (!mibqp->disable_page_faults) | ||
752 | queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work); | ||
753 | spin_unlock(&mibqp->disable_page_faults_lock); | ||
754 | } | ||
755 | |||
756 | void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) | ||
757 | { | ||
758 | int i; | ||
759 | |||
760 | qp->disable_page_faults = 1; | ||
761 | spin_lock_init(&qp->disable_page_faults_lock); | ||
762 | |||
763 | qp->mqp.pfault_handler = mlx5_ib_pfault_handler; | ||
764 | |||
765 | for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i) | ||
766 | INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action); | ||
767 | } | ||
768 | |||
769 | int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) | ||
770 | { | ||
771 | int ret; | ||
772 | |||
773 | ret = init_srcu_struct(&ibdev->mr_srcu); | ||
774 | if (ret) | ||
775 | return ret; | ||
776 | |||
777 | return 0; | ||
778 | } | ||
779 | |||
780 | void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) | ||
781 | { | ||
782 | cleanup_srcu_struct(&ibdev->mr_srcu); | ||
783 | } | ||
784 | |||
785 | int __init mlx5_ib_odp_init(void) | ||
786 | { | ||
787 | mlx5_ib_page_fault_wq = | ||
788 | create_singlethread_workqueue("mlx5_ib_page_faults"); | ||
789 | if (!mlx5_ib_page_fault_wq) | ||
790 | return -ENOMEM; | ||
791 | |||
792 | return 0; | ||
793 | } | ||
794 | |||
795 | void mlx5_ib_odp_cleanup(void) | ||
796 | { | ||
797 | destroy_workqueue(mlx5_ib_page_fault_wq); | ||
798 | } | ||
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 1cae1c7132b4..be0cd358b080 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c | |||
@@ -70,15 +70,6 @@ static const u32 mlx5_ib_opcode[] = { | |||
70 | [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, | 70 | [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, |
71 | }; | 71 | }; |
72 | 72 | ||
73 | struct umr_wr { | ||
74 | u64 virt_addr; | ||
75 | struct ib_pd *pd; | ||
76 | unsigned int page_shift; | ||
77 | unsigned int npages; | ||
78 | u32 length; | ||
79 | int access_flags; | ||
80 | u32 mkey; | ||
81 | }; | ||
82 | 73 | ||
83 | static int is_qp0(enum ib_qp_type qp_type) | 74 | static int is_qp0(enum ib_qp_type qp_type) |
84 | { | 75 | { |
@@ -110,6 +101,77 @@ void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) | |||
110 | return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); | 101 | return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); |
111 | } | 102 | } |
112 | 103 | ||
104 | /** | ||
105 | * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space. | ||
106 | * | ||
107 | * @qp: QP to copy from. | ||
108 | * @send: copy from the send queue when non-zero, use the receive queue | ||
109 | * otherwise. | ||
110 | * @wqe_index: index to start copying from. For send work queues, the | ||
111 | * wqe_index is in units of MLX5_SEND_WQE_BB. | ||
112 | * For receive work queues, it is the index of the work | ||
113 | * queue element in the queue. | ||
114 | * @buffer: destination buffer. | ||
115 | * @length: maximum number of bytes to copy. | ||
116 | * | ||
117 | * Copies at least a single WQE, but may copy more data. | ||
118 | * | ||
119 | * Return: the number of bytes copied, or an error code. | ||
120 | */ | ||
121 | int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, | ||
122 | void *buffer, u32 length) | ||
123 | { | ||
124 | struct ib_device *ibdev = qp->ibqp.device; | ||
125 | struct mlx5_ib_dev *dev = to_mdev(ibdev); | ||
126 | struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq; | ||
127 | size_t offset; | ||
128 | size_t wq_end; | ||
129 | struct ib_umem *umem = qp->umem; | ||
130 | u32 first_copy_length; | ||
131 | int wqe_length; | ||
132 | int ret; | ||
133 | |||
134 | if (wq->wqe_cnt == 0) { | ||
135 | mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n", | ||
136 | qp->ibqp.qp_type); | ||
137 | return -EINVAL; | ||
138 | } | ||
139 | |||
140 | offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift); | ||
141 | wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift); | ||
142 | |||
143 | if (send && length < sizeof(struct mlx5_wqe_ctrl_seg)) | ||
144 | return -EINVAL; | ||
145 | |||
146 | if (offset > umem->length || | ||
147 | (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length)) | ||
148 | return -EINVAL; | ||
149 | |||
150 | first_copy_length = min_t(u32, offset + length, wq_end) - offset; | ||
151 | ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length); | ||
152 | if (ret) | ||
153 | return ret; | ||
154 | |||
155 | if (send) { | ||
156 | struct mlx5_wqe_ctrl_seg *ctrl = buffer; | ||
157 | int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; | ||
158 | |||
159 | wqe_length = ds * MLX5_WQE_DS_UNITS; | ||
160 | } else { | ||
161 | wqe_length = 1 << wq->wqe_shift; | ||
162 | } | ||
163 | |||
164 | if (wqe_length <= first_copy_length) | ||
165 | return first_copy_length; | ||
166 | |||
167 | ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset, | ||
168 | wqe_length - first_copy_length); | ||
169 | if (ret) | ||
170 | return ret; | ||
171 | |||
172 | return wqe_length; | ||
173 | } | ||
174 | |||
113 | static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) | 175 | static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) |
114 | { | 176 | { |
115 | struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; | 177 | struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; |
@@ -814,6 +876,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, | |||
814 | int inlen = sizeof(*in); | 876 | int inlen = sizeof(*in); |
815 | int err; | 877 | int err; |
816 | 878 | ||
879 | mlx5_ib_odp_create_qp(qp); | ||
880 | |||
817 | gen = &dev->mdev->caps.gen; | 881 | gen = &dev->mdev->caps.gen; |
818 | mutex_init(&qp->mutex); | 882 | mutex_init(&qp->mutex); |
819 | spin_lock_init(&qp->sq.lock); | 883 | spin_lock_init(&qp->sq.lock); |
@@ -1098,11 +1162,13 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) | |||
1098 | in = kzalloc(sizeof(*in), GFP_KERNEL); | 1162 | in = kzalloc(sizeof(*in), GFP_KERNEL); |
1099 | if (!in) | 1163 | if (!in) |
1100 | return; | 1164 | return; |
1101 | if (qp->state != IB_QPS_RESET) | 1165 | if (qp->state != IB_QPS_RESET) { |
1166 | mlx5_ib_qp_disable_pagefaults(qp); | ||
1102 | if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state), | 1167 | if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state), |
1103 | MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp)) | 1168 | MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp)) |
1104 | mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", | 1169 | mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", |
1105 | qp->mqp.qpn); | 1170 | qp->mqp.qpn); |
1171 | } | ||
1106 | 1172 | ||
1107 | get_cqs(qp, &send_cq, &recv_cq); | 1173 | get_cqs(qp, &send_cq, &recv_cq); |
1108 | 1174 | ||
@@ -1650,6 +1716,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, | |||
1650 | if (mlx5_st < 0) | 1716 | if (mlx5_st < 0) |
1651 | goto out; | 1717 | goto out; |
1652 | 1718 | ||
1719 | /* If moving to a reset or error state, we must disable page faults on | ||
1720 | * this QP and flush all current page faults. Otherwise a stale page | ||
1721 | * fault may attempt to work on this QP after it is reset and moved | ||
1722 | * again to RTS, and may cause the driver and the device to get out of | ||
1723 | * sync. */ | ||
1724 | if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && | ||
1725 | (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) | ||
1726 | mlx5_ib_qp_disable_pagefaults(qp); | ||
1727 | |||
1653 | optpar = ib_mask_to_mlx5_opt(attr_mask); | 1728 | optpar = ib_mask_to_mlx5_opt(attr_mask); |
1654 | optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; | 1729 | optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; |
1655 | in->optparam = cpu_to_be32(optpar); | 1730 | in->optparam = cpu_to_be32(optpar); |
@@ -1659,6 +1734,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, | |||
1659 | if (err) | 1734 | if (err) |
1660 | goto out; | 1735 | goto out; |
1661 | 1736 | ||
1737 | if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) | ||
1738 | mlx5_ib_qp_enable_pagefaults(qp); | ||
1739 | |||
1662 | qp->state = new_state; | 1740 | qp->state = new_state; |
1663 | 1741 | ||
1664 | if (attr_mask & IB_QP_ACCESS_FLAGS) | 1742 | if (attr_mask & IB_QP_ACCESS_FLAGS) |
@@ -1848,37 +1926,70 @@ static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, | |||
1848 | umr->mkey_mask = frwr_mkey_mask(); | 1926 | umr->mkey_mask = frwr_mkey_mask(); |
1849 | } | 1927 | } |
1850 | 1928 | ||
1929 | static __be64 get_umr_reg_mr_mask(void) | ||
1930 | { | ||
1931 | u64 result; | ||
1932 | |||
1933 | result = MLX5_MKEY_MASK_LEN | | ||
1934 | MLX5_MKEY_MASK_PAGE_SIZE | | ||
1935 | MLX5_MKEY_MASK_START_ADDR | | ||
1936 | MLX5_MKEY_MASK_PD | | ||
1937 | MLX5_MKEY_MASK_LR | | ||
1938 | MLX5_MKEY_MASK_LW | | ||
1939 | MLX5_MKEY_MASK_KEY | | ||
1940 | MLX5_MKEY_MASK_RR | | ||
1941 | MLX5_MKEY_MASK_RW | | ||
1942 | MLX5_MKEY_MASK_A | | ||
1943 | MLX5_MKEY_MASK_FREE; | ||
1944 | |||
1945 | return cpu_to_be64(result); | ||
1946 | } | ||
1947 | |||
1948 | static __be64 get_umr_unreg_mr_mask(void) | ||
1949 | { | ||
1950 | u64 result; | ||
1951 | |||
1952 | result = MLX5_MKEY_MASK_FREE; | ||
1953 | |||
1954 | return cpu_to_be64(result); | ||
1955 | } | ||
1956 | |||
1957 | static __be64 get_umr_update_mtt_mask(void) | ||
1958 | { | ||
1959 | u64 result; | ||
1960 | |||
1961 | result = MLX5_MKEY_MASK_FREE; | ||
1962 | |||
1963 | return cpu_to_be64(result); | ||
1964 | } | ||
1965 | |||
1851 | static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, | 1966 | static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, |
1852 | struct ib_send_wr *wr) | 1967 | struct ib_send_wr *wr) |
1853 | { | 1968 | { |
1854 | struct umr_wr *umrwr = (struct umr_wr *)&wr->wr.fast_reg; | 1969 | struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; |
1855 | u64 mask; | ||
1856 | 1970 | ||
1857 | memset(umr, 0, sizeof(*umr)); | 1971 | memset(umr, 0, sizeof(*umr)); |
1858 | 1972 | ||
1973 | if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) | ||
1974 | umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */ | ||
1975 | else | ||
1976 | umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */ | ||
1977 | |||
1859 | if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { | 1978 | if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { |
1860 | umr->flags = 1 << 5; /* fail if not free */ | ||
1861 | umr->klm_octowords = get_klm_octo(umrwr->npages); | 1979 | umr->klm_octowords = get_klm_octo(umrwr->npages); |
1862 | mask = MLX5_MKEY_MASK_LEN | | 1980 | if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) { |
1863 | MLX5_MKEY_MASK_PAGE_SIZE | | 1981 | umr->mkey_mask = get_umr_update_mtt_mask(); |
1864 | MLX5_MKEY_MASK_START_ADDR | | 1982 | umr->bsf_octowords = get_klm_octo(umrwr->target.offset); |
1865 | MLX5_MKEY_MASK_PD | | 1983 | umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; |
1866 | MLX5_MKEY_MASK_LR | | 1984 | } else { |
1867 | MLX5_MKEY_MASK_LW | | 1985 | umr->mkey_mask = get_umr_reg_mr_mask(); |
1868 | MLX5_MKEY_MASK_KEY | | 1986 | } |
1869 | MLX5_MKEY_MASK_RR | | ||
1870 | MLX5_MKEY_MASK_RW | | ||
1871 | MLX5_MKEY_MASK_A | | ||
1872 | MLX5_MKEY_MASK_FREE; | ||
1873 | umr->mkey_mask = cpu_to_be64(mask); | ||
1874 | } else { | 1987 | } else { |
1875 | umr->flags = 2 << 5; /* fail if free */ | 1988 | umr->mkey_mask = get_umr_unreg_mr_mask(); |
1876 | mask = MLX5_MKEY_MASK_FREE; | ||
1877 | umr->mkey_mask = cpu_to_be64(mask); | ||
1878 | } | 1989 | } |
1879 | 1990 | ||
1880 | if (!wr->num_sge) | 1991 | if (!wr->num_sge) |
1881 | umr->flags |= (1 << 7); /* inline */ | 1992 | umr->flags |= MLX5_UMR_INLINE; |
1882 | } | 1993 | } |
1883 | 1994 | ||
1884 | static u8 get_umr_flags(int acc) | 1995 | static u8 get_umr_flags(int acc) |
@@ -1895,7 +2006,7 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, | |||
1895 | { | 2006 | { |
1896 | memset(seg, 0, sizeof(*seg)); | 2007 | memset(seg, 0, sizeof(*seg)); |
1897 | if (li) { | 2008 | if (li) { |
1898 | seg->status = 1 << 6; | 2009 | seg->status = MLX5_MKEY_STATUS_FREE; |
1899 | return; | 2010 | return; |
1900 | } | 2011 | } |
1901 | 2012 | ||
@@ -1912,19 +2023,23 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, | |||
1912 | 2023 | ||
1913 | static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) | 2024 | static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) |
1914 | { | 2025 | { |
2026 | struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; | ||
2027 | |||
1915 | memset(seg, 0, sizeof(*seg)); | 2028 | memset(seg, 0, sizeof(*seg)); |
1916 | if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { | 2029 | if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { |
1917 | seg->status = 1 << 6; | 2030 | seg->status = MLX5_MKEY_STATUS_FREE; |
1918 | return; | 2031 | return; |
1919 | } | 2032 | } |
1920 | 2033 | ||
1921 | seg->flags = convert_access(wr->wr.fast_reg.access_flags); | 2034 | seg->flags = convert_access(umrwr->access_flags); |
1922 | seg->flags_pd = cpu_to_be32(to_mpd((struct ib_pd *)wr->wr.fast_reg.page_list)->pdn); | 2035 | if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) { |
1923 | seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); | 2036 | seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); |
1924 | seg->len = cpu_to_be64(wr->wr.fast_reg.length); | 2037 | seg->start_addr = cpu_to_be64(umrwr->target.virt_addr); |
1925 | seg->log2_page_size = wr->wr.fast_reg.page_shift; | 2038 | } |
2039 | seg->len = cpu_to_be64(umrwr->length); | ||
2040 | seg->log2_page_size = umrwr->page_shift; | ||
1926 | seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | | 2041 | seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | |
1927 | mlx5_mkey_variant(wr->wr.fast_reg.rkey)); | 2042 | mlx5_mkey_variant(umrwr->mkey)); |
1928 | } | 2043 | } |
1929 | 2044 | ||
1930 | static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, | 2045 | static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, |
@@ -2927,6 +3042,14 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr | |||
2927 | int mlx5_state; | 3042 | int mlx5_state; |
2928 | int err = 0; | 3043 | int err = 0; |
2929 | 3044 | ||
3045 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
3046 | /* | ||
3047 | * Wait for any outstanding page faults, in case the user frees memory | ||
3048 | * based upon this query's result. | ||
3049 | */ | ||
3050 | flush_workqueue(mlx5_ib_page_fault_wq); | ||
3051 | #endif | ||
3052 | |||
2930 | mutex_lock(&qp->mutex); | 3053 | mutex_lock(&qp->mutex); |
2931 | outb = kzalloc(sizeof(*outb), GFP_KERNEL); | 3054 | outb = kzalloc(sizeof(*outb), GFP_KERNEL); |
2932 | if (!outb) { | 3055 | if (!outb) { |
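[Editorial note, not part of the patch] The mlx5_ib_read_user_wqe() helper added in the qp.c hunk above copies a WQE out of a circular user work queue in at most two pieces: first from the WQE's offset up to the end of the queue buffer, then, if the WQE wraps, the remainder from the start of the queue. The stand-alone sketch below shows just that wrap-around copy; it is a simplification that takes the record length as a parameter instead of deriving it from the control segment, and every name in it is hypothetical.

/*
 * Stand-alone illustration of a wrap-around read from a circular
 * work-queue buffer. Not kernel code; all names are invented.
 */
#include <stdio.h>
#include <string.h>

#define WQ_SIZE 16			/* total bytes in the circular queue */

/*
 * Copy 'len' bytes starting at 'offset' (offset < WQ_SIZE) into 'out',
 * wrapping back to the start of 'wq' if the record crosses the end of
 * the buffer. Returns the number of bytes copied.
 */
static size_t read_wrapped(const unsigned char *wq, size_t offset,
			   size_t len, unsigned char *out)
{
	size_t first = WQ_SIZE - offset;	/* bytes until the end of the queue */

	if (first >= len)
		first = len;			/* record does not wrap */

	memcpy(out, wq + offset, first);	/* first (possibly only) chunk */
	if (len > first)
		memcpy(out + first, wq, len - first);	/* wrapped remainder */

	return len;
}

int main(void)
{
	unsigned char wq[WQ_SIZE];
	unsigned char rec[8];
	size_t i;

	for (i = 0; i < WQ_SIZE; i++)
		wq[i] = (unsigned char)i;

	/* A 6-byte record starting at offset 13 wraps: 13, 14, 15, 0, 1, 2. */
	read_wrapped(wq, 13, 6, rec);
	for (i = 0; i < 6; i++)
		printf("%u ", (unsigned int)rec[i]);
	printf("\n");
	return 0;
}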
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index fef067c959fc..c0d0296e7a00 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c | |||
@@ -2341,9 +2341,9 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
2341 | nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u," | 2341 | nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u," |
2342 | " offset = %u, page size = %u.\n", | 2342 | " offset = %u, page size = %u.\n", |
2343 | (unsigned long int)start, (unsigned long int)virt, (u32)length, | 2343 | (unsigned long int)start, (unsigned long int)virt, (u32)length, |
2344 | region->offset, region->page_size); | 2344 | ib_umem_offset(region), region->page_size); |
2345 | 2345 | ||
2346 | skip_pages = ((u32)region->offset) >> 12; | 2346 | skip_pages = ((u32)ib_umem_offset(region)) >> 12; |
2347 | 2347 | ||
2348 | if (ib_copy_from_udata(&req, udata, sizeof(req))) { | 2348 | if (ib_copy_from_udata(&req, udata, sizeof(req))) { |
2349 | ib_umem_release(region); | 2349 | ib_umem_release(region); |
@@ -2408,7 +2408,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
2408 | region_length -= skip_pages << 12; | 2408 | region_length -= skip_pages << 12; |
2409 | for (page_index = skip_pages; page_index < chunk_pages; page_index++) { | 2409 | for (page_index = skip_pages; page_index < chunk_pages; page_index++) { |
2410 | skip_pages = 0; | 2410 | skip_pages = 0; |
2411 | if ((page_count != 0) && (page_count<<12)-(region->offset&(4096-1)) >= region->length) | 2411 | if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length) |
2412 | goto enough_pages; | 2412 | goto enough_pages; |
2413 | if ((page_count&0x01FF) == 0) { | 2413 | if ((page_count&0x01FF) == 0) { |
2414 | if (page_count >= 1024 * 512) { | 2414 | if (page_count >= 1024 * 512) { |
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index ac02ce4e8040..f3cc8c9e65ae 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c | |||
@@ -96,7 +96,6 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) | |||
96 | struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); | 96 | struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); |
97 | struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); | 97 | struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); |
98 | union ib_gid sgid; | 98 | union ib_gid sgid; |
99 | u8 zmac[ETH_ALEN]; | ||
100 | 99 | ||
101 | if (!(attr->ah_flags & IB_AH_GRH)) | 100 | if (!(attr->ah_flags & IB_AH_GRH)) |
102 | return ERR_PTR(-EINVAL); | 101 | return ERR_PTR(-EINVAL); |
@@ -118,9 +117,7 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) | |||
118 | goto av_conf_err; | 117 | goto av_conf_err; |
119 | } | 118 | } |
120 | 119 | ||
121 | memset(&zmac, 0, ETH_ALEN); | 120 | if (pd->uctx) { |
122 | if (pd->uctx && | ||
123 | memcmp(attr->dmac, &zmac, ETH_ALEN)) { | ||
124 | status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid, | 121 | status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid, |
125 | attr->dmac, &attr->vlan_id); | 122 | attr->dmac, &attr->vlan_id); |
126 | if (status) { | 123 | if (status) { |
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 4c68305ee781..fb8d8c4dfbb9 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | |||
@@ -805,7 +805,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, | |||
805 | goto umem_err; | 805 | goto umem_err; |
806 | 806 | ||
807 | mr->hwmr.pbe_size = mr->umem->page_size; | 807 | mr->hwmr.pbe_size = mr->umem->page_size; |
808 | mr->hwmr.fbo = mr->umem->offset; | 808 | mr->hwmr.fbo = ib_umem_offset(mr->umem); |
809 | mr->hwmr.va = usr_addr; | 809 | mr->hwmr.va = usr_addr; |
810 | mr->hwmr.len = len; | 810 | mr->hwmr.len = len; |
811 | mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; | 811 | mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; |
@@ -1410,6 +1410,8 @@ int ocrdma_query_qp(struct ib_qp *ibqp, | |||
1410 | mutex_unlock(&dev->dev_lock); | 1410 | mutex_unlock(&dev->dev_lock); |
1411 | if (status) | 1411 | if (status) |
1412 | goto mbx_err; | 1412 | goto mbx_err; |
1413 | if (qp->qp_type == IB_QPT_UD) | ||
1414 | qp_attr->qkey = params.qkey; | ||
1413 | qp_attr->qp_state = get_ibqp_state(IB_QPS_INIT); | 1415 | qp_attr->qp_state = get_ibqp_state(IB_QPS_INIT); |
1414 | qp_attr->cur_qp_state = get_ibqp_state(IB_QPS_INIT); | 1416 | qp_attr->cur_qp_state = get_ibqp_state(IB_QPS_INIT); |
1415 | qp_attr->path_mtu = | 1417 | qp_attr->path_mtu = |
diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c index 9bbb55347cc1..a77fb4fb14e4 100644 --- a/drivers/infiniband/hw/qib/qib_mr.c +++ b/drivers/infiniband/hw/qib/qib_mr.c | |||
@@ -258,7 +258,7 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
258 | mr->mr.user_base = start; | 258 | mr->mr.user_base = start; |
259 | mr->mr.iova = virt_addr; | 259 | mr->mr.iova = virt_addr; |
260 | mr->mr.length = length; | 260 | mr->mr.length = length; |
261 | mr->mr.offset = umem->offset; | 261 | mr->mr.offset = ib_umem_offset(umem); |
262 | mr->mr.access_flags = mr_access_flags; | 262 | mr->mr.access_flags = mr_access_flags; |
263 | mr->umem = umem; | 263 | mr->umem = umem; |
264 | 264 | ||
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index d7562beb5423..8ba80a6d3a46 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h | |||
@@ -98,9 +98,15 @@ enum { | |||
98 | 98 | ||
99 | IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ | 99 | IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ |
100 | IPOIB_MCAST_FLAG_SENDONLY = 1, | 100 | IPOIB_MCAST_FLAG_SENDONLY = 1, |
101 | IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ | 101 | /* |
102 | * For IPOIB_MCAST_FLAG_BUSY | ||
103 | * When set, a join is in flight and mcast->mc is unreliable | ||
104 | * When clear and mcast->mc is IS_ERR_OR_NULL, the join needs to | ||
105 | * be restarted or hasn't started yet | ||
106 | * When clear and mcast->mc is a valid pointer, the join was successful | ||
107 | */ | ||
108 | IPOIB_MCAST_FLAG_BUSY = 2, | ||
102 | IPOIB_MCAST_FLAG_ATTACHED = 3, | 109 | IPOIB_MCAST_FLAG_ATTACHED = 3, |
103 | IPOIB_MCAST_JOIN_STARTED = 4, | ||
104 | 110 | ||
105 | MAX_SEND_CQE = 16, | 111 | MAX_SEND_CQE = 16, |
106 | IPOIB_CM_COPYBREAK = 256, | 112 | IPOIB_CM_COPYBREAK = 256, |
@@ -317,6 +323,7 @@ struct ipoib_dev_priv { | |||
317 | struct list_head multicast_list; | 323 | struct list_head multicast_list; |
318 | struct rb_root multicast_tree; | 324 | struct rb_root multicast_tree; |
319 | 325 | ||
326 | struct workqueue_struct *wq; | ||
320 | struct delayed_work mcast_task; | 327 | struct delayed_work mcast_task; |
321 | struct work_struct carrier_on_task; | 328 | struct work_struct carrier_on_task; |
322 | struct work_struct flush_light; | 329 | struct work_struct flush_light; |
@@ -477,10 +484,10 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work); | |||
477 | void ipoib_pkey_event(struct work_struct *work); | 484 | void ipoib_pkey_event(struct work_struct *work); |
478 | void ipoib_ib_dev_cleanup(struct net_device *dev); | 485 | void ipoib_ib_dev_cleanup(struct net_device *dev); |
479 | 486 | ||
480 | int ipoib_ib_dev_open(struct net_device *dev, int flush); | 487 | int ipoib_ib_dev_open(struct net_device *dev); |
481 | int ipoib_ib_dev_up(struct net_device *dev); | 488 | int ipoib_ib_dev_up(struct net_device *dev); |
482 | int ipoib_ib_dev_down(struct net_device *dev, int flush); | 489 | int ipoib_ib_dev_down(struct net_device *dev); |
483 | int ipoib_ib_dev_stop(struct net_device *dev, int flush); | 490 | int ipoib_ib_dev_stop(struct net_device *dev); |
484 | void ipoib_pkey_dev_check_presence(struct net_device *dev); | 491 | void ipoib_pkey_dev_check_presence(struct net_device *dev); |
485 | 492 | ||
486 | int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); | 493 | int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); |
@@ -492,7 +499,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); | |||
492 | 499 | ||
493 | void ipoib_mcast_restart_task(struct work_struct *work); | 500 | void ipoib_mcast_restart_task(struct work_struct *work); |
494 | int ipoib_mcast_start_thread(struct net_device *dev); | 501 | int ipoib_mcast_start_thread(struct net_device *dev); |
495 | int ipoib_mcast_stop_thread(struct net_device *dev, int flush); | 502 | int ipoib_mcast_stop_thread(struct net_device *dev); |
496 | 503 | ||
497 | void ipoib_mcast_dev_down(struct net_device *dev); | 504 | void ipoib_mcast_dev_down(struct net_device *dev); |
498 | void ipoib_mcast_dev_flush(struct net_device *dev); | 505 | void ipoib_mcast_dev_flush(struct net_device *dev); |
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 933efcea0d03..56959adb6c7d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c | |||
@@ -474,7 +474,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even | |||
474 | } | 474 | } |
475 | 475 | ||
476 | spin_lock_irq(&priv->lock); | 476 | spin_lock_irq(&priv->lock); |
477 | queue_delayed_work(ipoib_workqueue, | 477 | queue_delayed_work(priv->wq, |
478 | &priv->cm.stale_task, IPOIB_CM_RX_DELAY); | 478 | &priv->cm.stale_task, IPOIB_CM_RX_DELAY); |
479 | /* Add this entry to passive ids list head, but do not re-add it | 479 | /* Add this entry to passive ids list head, but do not re-add it |
480 | * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ | 480 | * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ |
@@ -576,7 +576,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) | |||
576 | spin_lock_irqsave(&priv->lock, flags); | 576 | spin_lock_irqsave(&priv->lock, flags); |
577 | list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); | 577 | list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); |
578 | ipoib_cm_start_rx_drain(priv); | 578 | ipoib_cm_start_rx_drain(priv); |
579 | queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); | 579 | queue_work(priv->wq, &priv->cm.rx_reap_task); |
580 | spin_unlock_irqrestore(&priv->lock, flags); | 580 | spin_unlock_irqrestore(&priv->lock, flags); |
581 | } else | 581 | } else |
582 | ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", | 582 | ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", |
@@ -603,7 +603,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) | |||
603 | spin_lock_irqsave(&priv->lock, flags); | 603 | spin_lock_irqsave(&priv->lock, flags); |
604 | list_move(&p->list, &priv->cm.rx_reap_list); | 604 | list_move(&p->list, &priv->cm.rx_reap_list); |
605 | spin_unlock_irqrestore(&priv->lock, flags); | 605 | spin_unlock_irqrestore(&priv->lock, flags); |
606 | queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); | 606 | queue_work(priv->wq, &priv->cm.rx_reap_task); |
607 | } | 607 | } |
608 | return; | 608 | return; |
609 | } | 609 | } |
@@ -827,7 +827,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) | |||
827 | 827 | ||
828 | if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { | 828 | if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { |
829 | list_move(&tx->list, &priv->cm.reap_list); | 829 | list_move(&tx->list, &priv->cm.reap_list); |
830 | queue_work(ipoib_workqueue, &priv->cm.reap_task); | 830 | queue_work(priv->wq, &priv->cm.reap_task); |
831 | } | 831 | } |
832 | 832 | ||
833 | clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); | 833 | clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); |
@@ -1255,7 +1255,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, | |||
1255 | 1255 | ||
1256 | if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { | 1256 | if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { |
1257 | list_move(&tx->list, &priv->cm.reap_list); | 1257 | list_move(&tx->list, &priv->cm.reap_list); |
1258 | queue_work(ipoib_workqueue, &priv->cm.reap_task); | 1258 | queue_work(priv->wq, &priv->cm.reap_task); |
1259 | } | 1259 | } |
1260 | 1260 | ||
1261 | spin_unlock_irqrestore(&priv->lock, flags); | 1261 | spin_unlock_irqrestore(&priv->lock, flags); |
@@ -1284,7 +1284,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path | |||
1284 | tx->dev = dev; | 1284 | tx->dev = dev; |
1285 | list_add(&tx->list, &priv->cm.start_list); | 1285 | list_add(&tx->list, &priv->cm.start_list); |
1286 | set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); | 1286 | set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); |
1287 | queue_work(ipoib_workqueue, &priv->cm.start_task); | 1287 | queue_work(priv->wq, &priv->cm.start_task); |
1288 | return tx; | 1288 | return tx; |
1289 | } | 1289 | } |
1290 | 1290 | ||
@@ -1295,7 +1295,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) | |||
1295 | if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { | 1295 | if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { |
1296 | spin_lock_irqsave(&priv->lock, flags); | 1296 | spin_lock_irqsave(&priv->lock, flags); |
1297 | list_move(&tx->list, &priv->cm.reap_list); | 1297 | list_move(&tx->list, &priv->cm.reap_list); |
1298 | queue_work(ipoib_workqueue, &priv->cm.reap_task); | 1298 | queue_work(priv->wq, &priv->cm.reap_task); |
1299 | ipoib_dbg(priv, "Reap connection for gid %pI6\n", | 1299 | ipoib_dbg(priv, "Reap connection for gid %pI6\n", |
1300 | tx->neigh->daddr + 4); | 1300 | tx->neigh->daddr + 4); |
1301 | tx->neigh = NULL; | 1301 | tx->neigh = NULL; |
@@ -1417,7 +1417,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, | |||
1417 | 1417 | ||
1418 | skb_queue_tail(&priv->cm.skb_queue, skb); | 1418 | skb_queue_tail(&priv->cm.skb_queue, skb); |
1419 | if (e) | 1419 | if (e) |
1420 | queue_work(ipoib_workqueue, &priv->cm.skb_task); | 1420 | queue_work(priv->wq, &priv->cm.skb_task); |
1421 | } | 1421 | } |
1422 | 1422 | ||
1423 | static void ipoib_cm_rx_reap(struct work_struct *work) | 1423 | static void ipoib_cm_rx_reap(struct work_struct *work) |
@@ -1450,7 +1450,7 @@ static void ipoib_cm_stale_task(struct work_struct *work) | |||
1450 | } | 1450 | } |
1451 | 1451 | ||
1452 | if (!list_empty(&priv->cm.passive_ids)) | 1452 | if (!list_empty(&priv->cm.passive_ids)) |
1453 | queue_delayed_work(ipoib_workqueue, | 1453 | queue_delayed_work(priv->wq, |
1454 | &priv->cm.stale_task, IPOIB_CM_RX_DELAY); | 1454 | &priv->cm.stale_task, IPOIB_CM_RX_DELAY); |
1455 | spin_unlock_irq(&priv->lock); | 1455 | spin_unlock_irq(&priv->lock); |
1456 | } | 1456 | } |
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 72626c348174..fe65abb5150c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c | |||
@@ -655,7 +655,7 @@ void ipoib_reap_ah(struct work_struct *work) | |||
655 | __ipoib_reap_ah(dev); | 655 | __ipoib_reap_ah(dev); |
656 | 656 | ||
657 | if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) | 657 | if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) |
658 | queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, | 658 | queue_delayed_work(priv->wq, &priv->ah_reap_task, |
659 | round_jiffies_relative(HZ)); | 659 | round_jiffies_relative(HZ)); |
660 | } | 660 | } |
661 | 661 | ||
@@ -664,7 +664,7 @@ static void ipoib_ib_tx_timer_func(unsigned long ctx) | |||
664 | drain_tx_cq((struct net_device *)ctx); | 664 | drain_tx_cq((struct net_device *)ctx); |
665 | } | 665 | } |
666 | 666 | ||
667 | int ipoib_ib_dev_open(struct net_device *dev, int flush) | 667 | int ipoib_ib_dev_open(struct net_device *dev) |
668 | { | 668 | { |
669 | struct ipoib_dev_priv *priv = netdev_priv(dev); | 669 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
670 | int ret; | 670 | int ret; |
@@ -696,7 +696,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush) | |||
696 | } | 696 | } |
697 | 697 | ||
698 | clear_bit(IPOIB_STOP_REAPER, &priv->flags); | 698 | clear_bit(IPOIB_STOP_REAPER, &priv->flags); |
699 | queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, | 699 | queue_delayed_work(priv->wq, &priv->ah_reap_task, |
700 | round_jiffies_relative(HZ)); | 700 | round_jiffies_relative(HZ)); |
701 | 701 | ||
702 | if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) | 702 | if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) |
@@ -706,7 +706,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush) | |||
706 | dev_stop: | 706 | dev_stop: |
707 | if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) | 707 | if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) |
708 | napi_enable(&priv->napi); | 708 | napi_enable(&priv->napi); |
709 | ipoib_ib_dev_stop(dev, flush); | 709 | ipoib_ib_dev_stop(dev); |
710 | return -1; | 710 | return -1; |
711 | } | 711 | } |
712 | 712 | ||
@@ -738,7 +738,7 @@ int ipoib_ib_dev_up(struct net_device *dev) | |||
738 | return ipoib_mcast_start_thread(dev); | 738 | return ipoib_mcast_start_thread(dev); |
739 | } | 739 | } |
740 | 740 | ||
741 | int ipoib_ib_dev_down(struct net_device *dev, int flush) | 741 | int ipoib_ib_dev_down(struct net_device *dev) |
742 | { | 742 | { |
743 | struct ipoib_dev_priv *priv = netdev_priv(dev); | 743 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
744 | 744 | ||
@@ -747,7 +747,7 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) | |||
747 | clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); | 747 | clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); |
748 | netif_carrier_off(dev); | 748 | netif_carrier_off(dev); |
749 | 749 | ||
750 | ipoib_mcast_stop_thread(dev, flush); | 750 | ipoib_mcast_stop_thread(dev); |
751 | ipoib_mcast_dev_flush(dev); | 751 | ipoib_mcast_dev_flush(dev); |
752 | 752 | ||
753 | ipoib_flush_paths(dev); | 753 | ipoib_flush_paths(dev); |
@@ -807,7 +807,7 @@ void ipoib_drain_cq(struct net_device *dev) | |||
807 | local_bh_enable(); | 807 | local_bh_enable(); |
808 | } | 808 | } |
809 | 809 | ||
810 | int ipoib_ib_dev_stop(struct net_device *dev, int flush) | 810 | int ipoib_ib_dev_stop(struct net_device *dev) |
811 | { | 811 | { |
812 | struct ipoib_dev_priv *priv = netdev_priv(dev); | 812 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
813 | struct ib_qp_attr qp_attr; | 813 | struct ib_qp_attr qp_attr; |
@@ -880,8 +880,7 @@ timeout: | |||
880 | /* Wait for all AHs to be reaped */ | 880 | /* Wait for all AHs to be reaped */ |
881 | set_bit(IPOIB_STOP_REAPER, &priv->flags); | 881 | set_bit(IPOIB_STOP_REAPER, &priv->flags); |
882 | cancel_delayed_work(&priv->ah_reap_task); | 882 | cancel_delayed_work(&priv->ah_reap_task); |
883 | if (flush) | 883 | flush_workqueue(priv->wq); |
884 | flush_workqueue(ipoib_workqueue); | ||
885 | 884 | ||
886 | begin = jiffies; | 885 | begin = jiffies; |
887 | 886 | ||
@@ -918,7 +917,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) | |||
918 | (unsigned long) dev); | 917 | (unsigned long) dev); |
919 | 918 | ||
920 | if (dev->flags & IFF_UP) { | 919 | if (dev->flags & IFF_UP) { |
921 | if (ipoib_ib_dev_open(dev, 1)) { | 920 | if (ipoib_ib_dev_open(dev)) { |
922 | ipoib_transport_dev_cleanup(dev); | 921 | ipoib_transport_dev_cleanup(dev); |
923 | return -ENODEV; | 922 | return -ENODEV; |
924 | } | 923 | } |
@@ -1040,12 +1039,12 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, | |||
1040 | } | 1039 | } |
1041 | 1040 | ||
1042 | if (level >= IPOIB_FLUSH_NORMAL) | 1041 | if (level >= IPOIB_FLUSH_NORMAL) |
1043 | ipoib_ib_dev_down(dev, 0); | 1042 | ipoib_ib_dev_down(dev); |
1044 | 1043 | ||
1045 | if (level == IPOIB_FLUSH_HEAVY) { | 1044 | if (level == IPOIB_FLUSH_HEAVY) { |
1046 | if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) | 1045 | if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) |
1047 | ipoib_ib_dev_stop(dev, 0); | 1046 | ipoib_ib_dev_stop(dev); |
1048 | if (ipoib_ib_dev_open(dev, 0) != 0) | 1047 | if (ipoib_ib_dev_open(dev) != 0) |
1049 | return; | 1048 | return; |
1050 | if (netif_queue_stopped(dev)) | 1049 | if (netif_queue_stopped(dev)) |
1051 | netif_start_queue(dev); | 1050 | netif_start_queue(dev); |
@@ -1097,7 +1096,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) | |||
1097 | */ | 1096 | */ |
1098 | ipoib_flush_paths(dev); | 1097 | ipoib_flush_paths(dev); |
1099 | 1098 | ||
1100 | ipoib_mcast_stop_thread(dev, 1); | 1099 | ipoib_mcast_stop_thread(dev); |
1101 | ipoib_mcast_dev_flush(dev); | 1100 | ipoib_mcast_dev_flush(dev); |
1102 | 1101 | ||
1103 | ipoib_transport_dev_cleanup(dev); | 1102 | ipoib_transport_dev_cleanup(dev); |
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 58b5aa3b6f2d..6bad17d4d588 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c | |||
@@ -108,7 +108,7 @@ int ipoib_open(struct net_device *dev) | |||
108 | 108 | ||
109 | set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); | 109 | set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); |
110 | 110 | ||
111 | if (ipoib_ib_dev_open(dev, 1)) { | 111 | if (ipoib_ib_dev_open(dev)) { |
112 | if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) | 112 | if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) |
113 | return 0; | 113 | return 0; |
114 | goto err_disable; | 114 | goto err_disable; |
@@ -139,7 +139,7 @@ int ipoib_open(struct net_device *dev) | |||
139 | return 0; | 139 | return 0; |
140 | 140 | ||
141 | err_stop: | 141 | err_stop: |
142 | ipoib_ib_dev_stop(dev, 1); | 142 | ipoib_ib_dev_stop(dev); |
143 | 143 | ||
144 | err_disable: | 144 | err_disable: |
145 | clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); | 145 | clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); |
@@ -157,8 +157,8 @@ static int ipoib_stop(struct net_device *dev) | |||
157 | 157 | ||
158 | netif_stop_queue(dev); | 158 | netif_stop_queue(dev); |
159 | 159 | ||
160 | ipoib_ib_dev_down(dev, 1); | 160 | ipoib_ib_dev_down(dev); |
161 | ipoib_ib_dev_stop(dev, 0); | 161 | ipoib_ib_dev_stop(dev); |
162 | 162 | ||
163 | if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { | 163 | if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { |
164 | struct ipoib_dev_priv *cpriv; | 164 | struct ipoib_dev_priv *cpriv; |
@@ -839,7 +839,7 @@ static void ipoib_set_mcast_list(struct net_device *dev) | |||
839 | return; | 839 | return; |
840 | } | 840 | } |
841 | 841 | ||
842 | queue_work(ipoib_workqueue, &priv->restart_task); | 842 | queue_work(priv->wq, &priv->restart_task); |
843 | } | 843 | } |
844 | 844 | ||
845 | static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) | 845 | static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) |
@@ -954,7 +954,7 @@ static void ipoib_reap_neigh(struct work_struct *work) | |||
954 | __ipoib_reap_neigh(priv); | 954 | __ipoib_reap_neigh(priv); |
955 | 955 | ||
956 | if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) | 956 | if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) |
957 | queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, | 957 | queue_delayed_work(priv->wq, &priv->neigh_reap_task, |
958 | arp_tbl.gc_interval); | 958 | arp_tbl.gc_interval); |
959 | } | 959 | } |
960 | 960 | ||
@@ -1133,7 +1133,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) | |||
1133 | 1133 | ||
1134 | /* start garbage collection */ | 1134 | /* start garbage collection */ |
1135 | clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); | 1135 | clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); |
1136 | queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, | 1136 | queue_delayed_work(priv->wq, &priv->neigh_reap_task, |
1137 | arp_tbl.gc_interval); | 1137 | arp_tbl.gc_interval); |
1138 | 1138 | ||
1139 | return 0; | 1139 | return 0; |
@@ -1262,15 +1262,13 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) | |||
1262 | { | 1262 | { |
1263 | struct ipoib_dev_priv *priv = netdev_priv(dev); | 1263 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
1264 | 1264 | ||
1265 | if (ipoib_neigh_hash_init(priv) < 0) | ||
1266 | goto out; | ||
1267 | /* Allocate RX/TX "rings" to hold queued skbs */ | 1265 | /* Allocate RX/TX "rings" to hold queued skbs */ |
1268 | priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, | 1266 | priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, |
1269 | GFP_KERNEL); | 1267 | GFP_KERNEL); |
1270 | if (!priv->rx_ring) { | 1268 | if (!priv->rx_ring) { |
1271 | printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", | 1269 | printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", |
1272 | ca->name, ipoib_recvq_size); | 1270 | ca->name, ipoib_recvq_size); |
1273 | goto out_neigh_hash_cleanup; | 1271 | goto out; |
1274 | } | 1272 | } |
1275 | 1273 | ||
1276 | priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); | 1274 | priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); |
@@ -1285,16 +1283,24 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) | |||
1285 | if (ipoib_ib_dev_init(dev, ca, port)) | 1283 | if (ipoib_ib_dev_init(dev, ca, port)) |
1286 | goto out_tx_ring_cleanup; | 1284 | goto out_tx_ring_cleanup; |
1287 | 1285 | ||
1286 | /* | ||
1287 | * Must be after ipoib_ib_dev_init so we can allocate a per | ||
1288 | * device wq there and use it here | ||
1289 | */ | ||
1290 | if (ipoib_neigh_hash_init(priv) < 0) | ||
1291 | goto out_dev_uninit; | ||
1292 | |||
1288 | return 0; | 1293 | return 0; |
1289 | 1294 | ||
1295 | out_dev_uninit: | ||
1296 | ipoib_ib_dev_cleanup(dev); | ||
1297 | |||
1290 | out_tx_ring_cleanup: | 1298 | out_tx_ring_cleanup: |
1291 | vfree(priv->tx_ring); | 1299 | vfree(priv->tx_ring); |
1292 | 1300 | ||
1293 | out_rx_ring_cleanup: | 1301 | out_rx_ring_cleanup: |
1294 | kfree(priv->rx_ring); | 1302 | kfree(priv->rx_ring); |
1295 | 1303 | ||
1296 | out_neigh_hash_cleanup: | ||
1297 | ipoib_neigh_hash_uninit(dev); | ||
1298 | out: | 1304 | out: |
1299 | return -ENOMEM; | 1305 | return -ENOMEM; |
1300 | } | 1306 | } |
@@ -1317,6 +1323,12 @@ void ipoib_dev_cleanup(struct net_device *dev) | |||
1317 | } | 1323 | } |
1318 | unregister_netdevice_many(&head); | 1324 | unregister_netdevice_many(&head); |
1319 | 1325 | ||
1326 | /* | ||
1327 | * Must be before ipoib_ib_dev_cleanup or we delete an in use | ||
1328 | * work queue | ||
1329 | */ | ||
1330 | ipoib_neigh_hash_uninit(dev); | ||
1331 | |||
1320 | ipoib_ib_dev_cleanup(dev); | 1332 | ipoib_ib_dev_cleanup(dev); |
1321 | 1333 | ||
1322 | kfree(priv->rx_ring); | 1334 | kfree(priv->rx_ring); |
@@ -1324,8 +1336,6 @@ void ipoib_dev_cleanup(struct net_device *dev) | |||
1324 | 1336 | ||
1325 | priv->rx_ring = NULL; | 1337 | priv->rx_ring = NULL; |
1326 | priv->tx_ring = NULL; | 1338 | priv->tx_ring = NULL; |
1327 | |||
1328 | ipoib_neigh_hash_uninit(dev); | ||
1329 | } | 1339 | } |
1330 | 1340 | ||
1331 | static const struct header_ops ipoib_header_ops = { | 1341 | static const struct header_ops ipoib_header_ops = { |
@@ -1636,7 +1646,7 @@ register_failed: | |||
1636 | /* Stop GC if started before flush */ | 1646 | /* Stop GC if started before flush */ |
1637 | set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); | 1647 | set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); |
1638 | cancel_delayed_work(&priv->neigh_reap_task); | 1648 | cancel_delayed_work(&priv->neigh_reap_task); |
1639 | flush_workqueue(ipoib_workqueue); | 1649 | flush_workqueue(priv->wq); |
1640 | 1650 | ||
1641 | event_failed: | 1651 | event_failed: |
1642 | ipoib_dev_cleanup(priv->dev); | 1652 | ipoib_dev_cleanup(priv->dev); |
@@ -1707,7 +1717,7 @@ static void ipoib_remove_one(struct ib_device *device) | |||
1707 | /* Stop GC */ | 1717 | /* Stop GC */ |
1708 | set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); | 1718 | set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); |
1709 | cancel_delayed_work(&priv->neigh_reap_task); | 1719 | cancel_delayed_work(&priv->neigh_reap_task); |
1710 | flush_workqueue(ipoib_workqueue); | 1720 | flush_workqueue(priv->wq); |
1711 | 1721 | ||
1712 | unregister_netdev(priv->dev); | 1722 | unregister_netdev(priv->dev); |
1713 | free_netdev(priv->dev); | 1723 | free_netdev(priv->dev); |
@@ -1748,8 +1758,13 @@ static int __init ipoib_init_module(void) | |||
1748 | * unregister_netdev() and linkwatch_event take the rtnl lock, | 1758 | * unregister_netdev() and linkwatch_event take the rtnl lock, |
1749 | * so flush_scheduled_work() can deadlock during device | 1759 | * so flush_scheduled_work() can deadlock during device |
1750 | * removal. | 1760 | * removal. |
1761 | * | ||
1762 | * In addition, bringing one device up and another down at the | ||
1763 | * same time can deadlock a single workqueue, so we have this | ||
1764 | * global fallback workqueue, but we also attempt to open a | ||
1765 | * per device workqueue each time we bring an interface up | ||
1751 | */ | 1766 | */ |
1752 | ipoib_workqueue = create_singlethread_workqueue("ipoib"); | 1767 | ipoib_workqueue = create_singlethread_workqueue("ipoib_flush"); |
1753 | if (!ipoib_workqueue) { | 1768 | if (!ipoib_workqueue) { |
1754 | ret = -ENOMEM; | 1769 | ret = -ENOMEM; |
1755 | goto err_fs; | 1770 | goto err_fs; |
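The comment added above is the heart of this series for IPoIB: the old global workqueue survives only as an "ipoib_flush" fallback for flush-time work, while each interface now runs its normal work on its own single-threaded priv->wq. A minimal sketch of that per-device lifecycle follows; struct my_priv and the function names are hypothetical stand-ins, not the driver's own code.

/* Hedged sketch: per-device, single-threaded workqueue lifecycle. */
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/errno.h>

struct my_priv {                                /* hypothetical device private data */
        struct workqueue_struct *wq;            /* per-device queue for normal work */
        struct delayed_work      reap_task;
};

static void my_reap(struct work_struct *work)
{
        /* periodic housekeeping would go here */
}

static int my_dev_wq_init(struct my_priv *priv)
{
        /* single-threaded, so queued items never run concurrently with each other */
        priv->wq = create_singlethread_workqueue("my_dev_wq");
        if (!priv->wq)
                return -ENOMEM;

        INIT_DELAYED_WORK(&priv->reap_task, my_reap);
        queue_delayed_work(priv->wq, &priv->reap_task,
                           round_jiffies_relative(HZ));
        return 0;
}

static void my_dev_wq_cleanup(struct my_priv *priv)
{
        cancel_delayed_work_sync(&priv->reap_task);
        flush_workqueue(priv->wq);              /* drain before destroying          */
        destroy_workqueue(priv->wq);
        priv->wq = NULL;                        /* keeps cleanup idempotent         */
}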
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index ffb83b5f7e80..bc50dd0d0e4d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c | |||
@@ -190,12 +190,6 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, | |||
190 | spin_unlock_irq(&priv->lock); | 190 | spin_unlock_irq(&priv->lock); |
191 | priv->tx_wr.wr.ud.remote_qkey = priv->qkey; | 191 | priv->tx_wr.wr.ud.remote_qkey = priv->qkey; |
192 | set_qkey = 1; | 192 | set_qkey = 1; |
193 | |||
194 | if (!ipoib_cm_admin_enabled(dev)) { | ||
195 | rtnl_lock(); | ||
196 | dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); | ||
197 | rtnl_unlock(); | ||
198 | } | ||
199 | } | 193 | } |
200 | 194 | ||
201 | if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { | 195 | if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { |
@@ -277,16 +271,27 @@ ipoib_mcast_sendonly_join_complete(int status, | |||
277 | struct ipoib_mcast *mcast = multicast->context; | 271 | struct ipoib_mcast *mcast = multicast->context; |
278 | struct net_device *dev = mcast->dev; | 272 | struct net_device *dev = mcast->dev; |
279 | 273 | ||
274 | /* | ||
275 | * We have to take the mutex to force mcast_sendonly_join to | ||
276 | * return from ib_sa_multicast_join and set mcast->mc to a | ||
277 | * valid value. Otherwise we were racing with ourselves in | ||
278 | * that we might fail here, but get a valid return from | ||
279 | * ib_sa_multicast_join after we had cleared mcast->mc here, | ||
280 | * resulting in mis-matched joins and leaves and a deadlock | ||
281 | */ | ||
282 | mutex_lock(&mcast_mutex); | ||
283 | |||
280 | /* We trap for port events ourselves. */ | 284 | /* We trap for port events ourselves. */ |
281 | if (status == -ENETRESET) | 285 | if (status == -ENETRESET) |
282 | return 0; | 286 | goto out; |
283 | 287 | ||
284 | if (!status) | 288 | if (!status) |
285 | status = ipoib_mcast_join_finish(mcast, &multicast->rec); | 289 | status = ipoib_mcast_join_finish(mcast, &multicast->rec); |
286 | 290 | ||
287 | if (status) { | 291 | if (status) { |
288 | if (mcast->logcount++ < 20) | 292 | if (mcast->logcount++ < 20) |
289 | ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n", | 293 | ipoib_dbg_mcast(netdev_priv(dev), "sendonly multicast " |
294 | "join failed for %pI6, status %d\n", | ||
290 | mcast->mcmember.mgid.raw, status); | 295 | mcast->mcmember.mgid.raw, status); |
291 | 296 | ||
292 | /* Flush out any queued packets */ | 297 | /* Flush out any queued packets */ |
@@ -296,11 +301,15 @@ ipoib_mcast_sendonly_join_complete(int status, | |||
296 | dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); | 301 | dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); |
297 | } | 302 | } |
298 | netif_tx_unlock_bh(dev); | 303 | netif_tx_unlock_bh(dev); |
299 | |||
300 | /* Clear the busy flag so we try again */ | ||
301 | status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, | ||
302 | &mcast->flags); | ||
303 | } | 304 | } |
305 | out: | ||
306 | clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); | ||
307 | if (status) | ||
308 | mcast->mc = NULL; | ||
309 | complete(&mcast->done); | ||
310 | if (status == -ENETRESET) | ||
311 | status = 0; | ||
312 | mutex_unlock(&mcast_mutex); | ||
304 | return status; | 313 | return status; |
305 | } | 314 | } |
306 | 315 | ||
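The completion handler above encodes the race fix described in its comment: the joiner holds mcast_mutex across ib_sa_join_multicast so mcast->mc is assigned before this callback can clear it, and every exit path clears BUSY, NULLs mc on failure, signals the completion that the flush and leave paths wait on, and folds -ENETRESET back to success. A condensed sketch of that ordering, using hypothetical my_* names in place of the ipoib types:

#include <linux/completion.h>
#include <linux/mutex.h>
#include <linux/bitops.h>
#include <linux/errno.h>

#define MY_MCAST_BUSY 0                 /* stands in for IPOIB_MCAST_FLAG_BUSY      */

struct my_mcast {                       /* hypothetical, mirrors struct ipoib_mcast */
        unsigned long      flags;
        void              *mc;          /* opaque SA join handle                    */
        struct completion  done;
};

static DEFINE_MUTEX(join_mutex);

static int my_join_finish(struct my_mcast *mcast)
{
        return 0;                       /* stand-in for ipoib_mcast_join_finish()   */
}

static int my_join_complete(int status, struct my_mcast *mcast)
{
        mutex_lock(&join_mutex);        /* blocks until the joiner has set ->mc     */

        if (status == -ENETRESET)       /* port event, handled elsewhere            */
                goto out;
        if (!status)
                status = my_join_finish(mcast);
out:
        clear_bit(MY_MCAST_BUSY, &mcast->flags);
        if (status)
                mcast->mc = NULL;       /* failed join: nothing left to free later  */
        complete(&mcast->done);         /* flush/leave paths wait on this           */
        if (status == -ENETRESET)
                status = 0;             /* a reset is not reported as an error      */
        mutex_unlock(&join_mutex);
        return status;
}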
@@ -318,12 +327,14 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) | |||
318 | int ret = 0; | 327 | int ret = 0; |
319 | 328 | ||
320 | if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { | 329 | if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { |
321 | ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); | 330 | ipoib_dbg_mcast(priv, "device shutting down, no sendonly " |
331 | "multicast joins\n"); | ||
322 | return -ENODEV; | 332 | return -ENODEV; |
323 | } | 333 | } |
324 | 334 | ||
325 | if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { | 335 | if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { |
326 | ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n"); | 336 | ipoib_dbg_mcast(priv, "multicast entry busy, skipping " |
337 | "sendonly join\n"); | ||
327 | return -EBUSY; | 338 | return -EBUSY; |
328 | } | 339 | } |
329 | 340 | ||
@@ -331,6 +342,9 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) | |||
331 | rec.port_gid = priv->local_gid; | 342 | rec.port_gid = priv->local_gid; |
332 | rec.pkey = cpu_to_be16(priv->pkey); | 343 | rec.pkey = cpu_to_be16(priv->pkey); |
333 | 344 | ||
345 | mutex_lock(&mcast_mutex); | ||
346 | init_completion(&mcast->done); | ||
347 | set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); | ||
334 | mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, | 348 | mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, |
335 | priv->port, &rec, | 349 | priv->port, &rec, |
336 | IB_SA_MCMEMBER_REC_MGID | | 350 | IB_SA_MCMEMBER_REC_MGID | |
@@ -343,12 +357,14 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) | |||
343 | if (IS_ERR(mcast->mc)) { | 357 | if (IS_ERR(mcast->mc)) { |
344 | ret = PTR_ERR(mcast->mc); | 358 | ret = PTR_ERR(mcast->mc); |
345 | clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); | 359 | clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); |
346 | ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", | 360 | complete(&mcast->done); |
347 | ret); | 361 | ipoib_warn(priv, "ib_sa_join_multicast for sendonly join " |
362 | "failed (ret = %d)\n", ret); | ||
348 | } else { | 363 | } else { |
349 | ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n", | 364 | ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting " |
350 | mcast->mcmember.mgid.raw); | 365 | "sendonly join\n", mcast->mcmember.mgid.raw); |
351 | } | 366 | } |
367 | mutex_unlock(&mcast_mutex); | ||
352 | 368 | ||
353 | return ret; | 369 | return ret; |
354 | } | 370 | } |
@@ -359,18 +375,29 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work) | |||
359 | carrier_on_task); | 375 | carrier_on_task); |
360 | struct ib_port_attr attr; | 376 | struct ib_port_attr attr; |
361 | 377 | ||
362 | /* | ||
363 | * Take rtnl_lock to avoid racing with ipoib_stop() and | ||
364 | * turning the carrier back on while a device is being | ||
365 | * removed. | ||
366 | */ | ||
367 | if (ib_query_port(priv->ca, priv->port, &attr) || | 378 | if (ib_query_port(priv->ca, priv->port, &attr) || |
368 | attr.state != IB_PORT_ACTIVE) { | 379 | attr.state != IB_PORT_ACTIVE) { |
369 | ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); | 380 | ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); |
370 | return; | 381 | return; |
371 | } | 382 | } |
372 | 383 | ||
373 | rtnl_lock(); | 384 | /* |
385 | * Take rtnl_lock to avoid racing with ipoib_stop() and | ||
386 | * turning the carrier back on while a device is being | ||
387 | * removed. However, ipoib_stop() will attempt to flush | ||
388 | * the workqueue while holding the rtnl lock, so loop | ||
389 | * on trylock until either we get the lock or we see | ||
390 | * FLAG_ADMIN_UP go away as that signals that we are bailing | ||
391 | * and can safely ignore the carrier on work. | ||
392 | */ | ||
393 | while (!rtnl_trylock()) { | ||
394 | if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) | ||
395 | return; | ||
396 | else | ||
397 | msleep(20); | ||
398 | } | ||
399 | if (!ipoib_cm_admin_enabled(priv->dev)) | ||
400 | dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu)); | ||
374 | netif_carrier_on(priv->dev); | 401 | netif_carrier_on(priv->dev); |
375 | rtnl_unlock(); | 402 | rtnl_unlock(); |
376 | } | 403 | } |
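The trylock loop above exists because ipoib_stop() takes the rtnl lock and then flushes priv->wq, while this work item runs on priv->wq and needs the rtnl lock: a plain rtnl_lock() here could deadlock. A small sketch of the pattern in isolation; the flag-bit argument is a hypothetical stand-in for IPOIB_FLAG_ADMIN_UP.

#include <linux/rtnetlink.h>
#include <linux/delay.h>
#include <linux/bitops.h>
#include <linux/types.h>

/*
 * Take the rtnl lock from a work item without deadlocking against a
 * path that already holds rtnl and is flushing our workqueue.
 * Returns true with rtnl held, or false if the device went down
 * (its admin-up bit cleared) and the work should simply be dropped.
 */
static bool rtnl_lock_or_bail(const unsigned long *flags, int admin_up_bit)
{
        while (!rtnl_trylock()) {
                if (!test_bit(admin_up_bit, flags))
                        return false;   /* device is stopping; skip the work */
                msleep(20);             /* back off, then retry              */
        }
        return true;                    /* caller must rtnl_unlock()         */
}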
@@ -385,60 +412,63 @@ static int ipoib_mcast_join_complete(int status, | |||
385 | ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", | 412 | ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", |
386 | mcast->mcmember.mgid.raw, status); | 413 | mcast->mcmember.mgid.raw, status); |
387 | 414 | ||
415 | /* | ||
416 | * We have to take the mutex to force mcast_join to | ||
417 | * return from ib_sa_multicast_join and set mcast->mc to a | ||
418 | * valid value. Otherwise we were racing with ourselves in | ||
419 | * that we might fail here, but get a valid return from | ||
420 | * ib_sa_multicast_join after we had cleared mcast->mc here, | ||
421 | * resulting in mis-matched joins and leaves and a deadlock | ||
422 | */ | ||
423 | mutex_lock(&mcast_mutex); | ||
424 | |||
388 | /* We trap for port events ourselves. */ | 425 | /* We trap for port events ourselves. */ |
389 | if (status == -ENETRESET) { | 426 | if (status == -ENETRESET) |
390 | status = 0; | ||
391 | goto out; | 427 | goto out; |
392 | } | ||
393 | 428 | ||
394 | if (!status) | 429 | if (!status) |
395 | status = ipoib_mcast_join_finish(mcast, &multicast->rec); | 430 | status = ipoib_mcast_join_finish(mcast, &multicast->rec); |
396 | 431 | ||
397 | if (!status) { | 432 | if (!status) { |
398 | mcast->backoff = 1; | 433 | mcast->backoff = 1; |
399 | mutex_lock(&mcast_mutex); | ||
400 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) | 434 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) |
401 | queue_delayed_work(ipoib_workqueue, | 435 | queue_delayed_work(priv->wq, &priv->mcast_task, 0); |
402 | &priv->mcast_task, 0); | ||
403 | mutex_unlock(&mcast_mutex); | ||
404 | 436 | ||
405 | /* | 437 | /* |
406 | * Defer carrier on work to ipoib_workqueue to avoid a | 438 | * Defer carrier on work to priv->wq to avoid a |
407 | * deadlock on rtnl_lock here. | 439 | * deadlock on rtnl_lock here. |
408 | */ | 440 | */ |
409 | if (mcast == priv->broadcast) | 441 | if (mcast == priv->broadcast) |
410 | queue_work(ipoib_workqueue, &priv->carrier_on_task); | 442 | queue_work(priv->wq, &priv->carrier_on_task); |
411 | 443 | } else { | |
412 | status = 0; | 444 | if (mcast->logcount++ < 20) { |
413 | goto out; | 445 | if (status == -ETIMEDOUT || status == -EAGAIN) { |
414 | } | 446 | ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", |
415 | 447 | mcast->mcmember.mgid.raw, status); | |
416 | if (mcast->logcount++ < 20) { | 448 | } else { |
417 | if (status == -ETIMEDOUT || status == -EAGAIN) { | 449 | ipoib_warn(priv, "multicast join failed for %pI6, status %d\n", |
418 | ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", | 450 | mcast->mcmember.mgid.raw, status); |
419 | mcast->mcmember.mgid.raw, status); | 451 | } |
420 | } else { | ||
421 | ipoib_warn(priv, "multicast join failed for %pI6, status %d\n", | ||
422 | mcast->mcmember.mgid.raw, status); | ||
423 | } | 452 | } |
424 | } | ||
425 | |||
426 | mcast->backoff *= 2; | ||
427 | if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) | ||
428 | mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; | ||
429 | 453 | ||
430 | /* Clear the busy flag so we try again */ | 454 | mcast->backoff *= 2; |
431 | status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); | 455 | if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) |
432 | 456 | mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; | |
433 | mutex_lock(&mcast_mutex); | 457 | } |
458 | out: | ||
434 | spin_lock_irq(&priv->lock); | 459 | spin_lock_irq(&priv->lock); |
435 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) | 460 | clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); |
436 | queue_delayed_work(ipoib_workqueue, &priv->mcast_task, | 461 | if (status) |
462 | mcast->mc = NULL; | ||
463 | complete(&mcast->done); | ||
464 | if (status == -ENETRESET) | ||
465 | status = 0; | ||
466 | if (status && test_bit(IPOIB_MCAST_RUN, &priv->flags)) | ||
467 | queue_delayed_work(priv->wq, &priv->mcast_task, | ||
437 | mcast->backoff * HZ); | 468 | mcast->backoff * HZ); |
438 | spin_unlock_irq(&priv->lock); | 469 | spin_unlock_irq(&priv->lock); |
439 | mutex_unlock(&mcast_mutex); | 470 | mutex_unlock(&mcast_mutex); |
440 | out: | 471 | |
441 | complete(&mcast->done); | ||
442 | return status; | 472 | return status; |
443 | } | 473 | } |
444 | 474 | ||
@@ -487,10 +517,9 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, | |||
487 | rec.hop_limit = priv->broadcast->mcmember.hop_limit; | 517 | rec.hop_limit = priv->broadcast->mcmember.hop_limit; |
488 | } | 518 | } |
489 | 519 | ||
490 | set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); | 520 | mutex_lock(&mcast_mutex); |
491 | init_completion(&mcast->done); | 521 | init_completion(&mcast->done); |
492 | set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); | 522 | set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); |
493 | |||
494 | mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, | 523 | mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, |
495 | &rec, comp_mask, GFP_KERNEL, | 524 | &rec, comp_mask, GFP_KERNEL, |
496 | ipoib_mcast_join_complete, mcast); | 525 | ipoib_mcast_join_complete, mcast); |
@@ -504,13 +533,11 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, | |||
504 | if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) | 533 | if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) |
505 | mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; | 534 | mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; |
506 | 535 | ||
507 | mutex_lock(&mcast_mutex); | ||
508 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) | 536 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) |
509 | queue_delayed_work(ipoib_workqueue, | 537 | queue_delayed_work(priv->wq, &priv->mcast_task, |
510 | &priv->mcast_task, | ||
511 | mcast->backoff * HZ); | 538 | mcast->backoff * HZ); |
512 | mutex_unlock(&mcast_mutex); | ||
513 | } | 539 | } |
540 | mutex_unlock(&mcast_mutex); | ||
514 | } | 541 | } |
515 | 542 | ||
516 | void ipoib_mcast_join_task(struct work_struct *work) | 543 | void ipoib_mcast_join_task(struct work_struct *work) |
@@ -547,8 +574,8 @@ void ipoib_mcast_join_task(struct work_struct *work) | |||
547 | ipoib_warn(priv, "failed to allocate broadcast group\n"); | 574 | ipoib_warn(priv, "failed to allocate broadcast group\n"); |
548 | mutex_lock(&mcast_mutex); | 575 | mutex_lock(&mcast_mutex); |
549 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) | 576 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) |
550 | queue_delayed_work(ipoib_workqueue, | 577 | queue_delayed_work(priv->wq, &priv->mcast_task, |
551 | &priv->mcast_task, HZ); | 578 | HZ); |
552 | mutex_unlock(&mcast_mutex); | 579 | mutex_unlock(&mcast_mutex); |
553 | return; | 580 | return; |
554 | } | 581 | } |
@@ -563,7 +590,8 @@ void ipoib_mcast_join_task(struct work_struct *work) | |||
563 | } | 590 | } |
564 | 591 | ||
565 | if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { | 592 | if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { |
566 | if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) | 593 | if (IS_ERR_OR_NULL(priv->broadcast->mc) && |
594 | !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) | ||
567 | ipoib_mcast_join(dev, priv->broadcast, 0); | 595 | ipoib_mcast_join(dev, priv->broadcast, 0); |
568 | return; | 596 | return; |
569 | } | 597 | } |
@@ -571,23 +599,33 @@ void ipoib_mcast_join_task(struct work_struct *work) | |||
571 | while (1) { | 599 | while (1) { |
572 | struct ipoib_mcast *mcast = NULL; | 600 | struct ipoib_mcast *mcast = NULL; |
573 | 601 | ||
602 | /* | ||
603 | * Need the mutex so our flags are consistent, need the | ||
604 | * priv->lock so we don't race with list removals in either | ||
605 | * mcast_dev_flush or mcast_restart_task | ||
606 | */ | ||
607 | mutex_lock(&mcast_mutex); | ||
574 | spin_lock_irq(&priv->lock); | 608 | spin_lock_irq(&priv->lock); |
575 | list_for_each_entry(mcast, &priv->multicast_list, list) { | 609 | list_for_each_entry(mcast, &priv->multicast_list, list) { |
576 | if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) | 610 | if (IS_ERR_OR_NULL(mcast->mc) && |
577 | && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) | 611 | !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) && |
578 | && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { | 612 | !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { |
579 | /* Found the next unjoined group */ | 613 | /* Found the next unjoined group */ |
580 | break; | 614 | break; |
581 | } | 615 | } |
582 | } | 616 | } |
583 | spin_unlock_irq(&priv->lock); | 617 | spin_unlock_irq(&priv->lock); |
618 | mutex_unlock(&mcast_mutex); | ||
584 | 619 | ||
585 | if (&mcast->list == &priv->multicast_list) { | 620 | if (&mcast->list == &priv->multicast_list) { |
586 | /* All done */ | 621 | /* All done */ |
587 | break; | 622 | break; |
588 | } | 623 | } |
589 | 624 | ||
590 | ipoib_mcast_join(dev, mcast, 1); | 625 | if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) |
626 | ipoib_mcast_sendonly_join(mcast); | ||
627 | else | ||
628 | ipoib_mcast_join(dev, mcast, 1); | ||
591 | return; | 629 | return; |
592 | } | 630 | } |
593 | 631 | ||
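The list scan above takes mcast_mutex first (so the BUSY/mc state it tests stays consistent with the join callbacks) and priv->lock second (so the list cannot change underneath it), then drops both before issuing a join that may sleep. A hedged sketch of that "scan under locks, act without them" shape, with hypothetical names:

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/bitops.h>
#include <linux/err.h>

struct my_group {                       /* hypothetical multicast list entry */
        struct list_head  list;
        unsigned long     flags;        /* bit 0 ~ the BUSY flag             */
        void             *mc;           /* SA join handle, NULL or ERR_PTR   */
};

/* Pick the next unjoined entry under both locks; the caller joins it unlocked. */
static struct my_group *pick_next_unjoined(struct mutex *flag_mutex,
                                           spinlock_t *list_lock,
                                           struct list_head *head)
{
        struct my_group *g, *found = NULL;

        mutex_lock(flag_mutex);         /* flags/mc consistent with callbacks */
        spin_lock_irq(list_lock);       /* no racing list removals            */
        list_for_each_entry(g, head, list) {
                if (IS_ERR_OR_NULL(g->mc) && !test_bit(0, &g->flags)) {
                        found = g;
                        break;
                }
        }
        spin_unlock_irq(list_lock);
        mutex_unlock(flag_mutex);

        return found;                   /* may be NULL: all groups joined     */
}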
@@ -604,13 +642,13 @@ int ipoib_mcast_start_thread(struct net_device *dev) | |||
604 | 642 | ||
605 | mutex_lock(&mcast_mutex); | 643 | mutex_lock(&mcast_mutex); |
606 | if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) | 644 | if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) |
607 | queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); | 645 | queue_delayed_work(priv->wq, &priv->mcast_task, 0); |
608 | mutex_unlock(&mcast_mutex); | 646 | mutex_unlock(&mcast_mutex); |
609 | 647 | ||
610 | return 0; | 648 | return 0; |
611 | } | 649 | } |
612 | 650 | ||
613 | int ipoib_mcast_stop_thread(struct net_device *dev, int flush) | 651 | int ipoib_mcast_stop_thread(struct net_device *dev) |
614 | { | 652 | { |
615 | struct ipoib_dev_priv *priv = netdev_priv(dev); | 653 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
616 | 654 | ||
@@ -621,8 +659,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush) | |||
621 | cancel_delayed_work(&priv->mcast_task); | 659 | cancel_delayed_work(&priv->mcast_task); |
622 | mutex_unlock(&mcast_mutex); | 660 | mutex_unlock(&mcast_mutex); |
623 | 661 | ||
624 | if (flush) | 662 | flush_workqueue(priv->wq); |
625 | flush_workqueue(ipoib_workqueue); | ||
626 | 663 | ||
627 | return 0; | 664 | return 0; |
628 | } | 665 | } |
@@ -633,6 +670,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) | |||
633 | int ret = 0; | 670 | int ret = 0; |
634 | 671 | ||
635 | if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) | 672 | if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) |
673 | ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n"); | ||
674 | |||
675 | if (!IS_ERR_OR_NULL(mcast->mc)) | ||
636 | ib_sa_free_multicast(mcast->mc); | 676 | ib_sa_free_multicast(mcast->mc); |
637 | 677 | ||
638 | if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { | 678 | if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { |
@@ -685,6 +725,8 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) | |||
685 | memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); | 725 | memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); |
686 | __ipoib_mcast_add(dev, mcast); | 726 | __ipoib_mcast_add(dev, mcast); |
687 | list_add_tail(&mcast->list, &priv->multicast_list); | 727 | list_add_tail(&mcast->list, &priv->multicast_list); |
728 | if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) | ||
729 | queue_delayed_work(priv->wq, &priv->mcast_task, 0); | ||
688 | } | 730 | } |
689 | 731 | ||
690 | if (!mcast->ah) { | 732 | if (!mcast->ah) { |
@@ -698,8 +740,6 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) | |||
698 | if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) | 740 | if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) |
699 | ipoib_dbg_mcast(priv, "no address vector, " | 741 | ipoib_dbg_mcast(priv, "no address vector, " |
700 | "but multicast join already started\n"); | 742 | "but multicast join already started\n"); |
701 | else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) | ||
702 | ipoib_mcast_sendonly_join(mcast); | ||
703 | 743 | ||
704 | /* | 744 | /* |
705 | * If lookup completes between here and out:, don't | 745 | * If lookup completes between here and out:, don't |
@@ -759,9 +799,12 @@ void ipoib_mcast_dev_flush(struct net_device *dev) | |||
759 | 799 | ||
760 | spin_unlock_irqrestore(&priv->lock, flags); | 800 | spin_unlock_irqrestore(&priv->lock, flags); |
761 | 801 | ||
762 | /* seperate between the wait to the leave*/ | 802 | /* |
803 | * make sure the in-flight joins have finished before we attempt | ||
804 | * to leave | ||
805 | */ | ||
763 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) | 806 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) |
764 | if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) | 807 | if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) |
765 | wait_for_completion(&mcast->done); | 808 | wait_for_completion(&mcast->done); |
766 | 809 | ||
767 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { | 810 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { |
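With JOIN_STARTED gone, the flush path above keys its wait on FLAG_BUSY itself: any entry still marked busy has a join callback that will signal its completion, so waiting first guarantees the subsequent leave never races that callback. A compact sketch of the drain-then-teardown order, again with hypothetical my_* names:

#include <linux/list.h>
#include <linux/completion.h>
#include <linux/bitops.h>
#include <linux/slab.h>

struct my_mc_entry {                    /* hypothetical, trimmed to what's needed */
        struct list_head   list;
        unsigned long      flags;       /* bit 0 ~ the BUSY flag                  */
        struct completion  done;
};

static void my_mc_leave(struct my_mc_entry *e)
{
        /* stand-in for ib_sa_free_multicast() plus detach */
}

/* Wait out in-flight joins, then leave and free with no callback racing us. */
static void drain_and_remove(struct list_head *remove_list)
{
        struct my_mc_entry *e, *tmp;

        list_for_each_entry(e, remove_list, list)
                if (test_bit(0, &e->flags))
                        wait_for_completion(&e->done);  /* callback will complete() */

        list_for_each_entry_safe(e, tmp, remove_list, list) {
                list_del(&e->list);
                my_mc_leave(e);
                kfree(e);
        }
}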
@@ -794,8 +837,6 @@ void ipoib_mcast_restart_task(struct work_struct *work) | |||
794 | 837 | ||
795 | ipoib_dbg_mcast(priv, "restarting multicast task\n"); | 838 | ipoib_dbg_mcast(priv, "restarting multicast task\n"); |
796 | 839 | ||
797 | ipoib_mcast_stop_thread(dev, 0); | ||
798 | |||
799 | local_irq_save(flags); | 840 | local_irq_save(flags); |
800 | netif_addr_lock(dev); | 841 | netif_addr_lock(dev); |
801 | spin_lock(&priv->lock); | 842 | spin_lock(&priv->lock); |
@@ -880,14 +921,38 @@ void ipoib_mcast_restart_task(struct work_struct *work) | |||
880 | netif_addr_unlock(dev); | 921 | netif_addr_unlock(dev); |
881 | local_irq_restore(flags); | 922 | local_irq_restore(flags); |
882 | 923 | ||
883 | /* We have to cancel outside of the spinlock */ | 924 | /* |
925 | * make sure the in-flight joins have finished before we attempt | ||
926 | * to leave | ||
927 | */ | ||
928 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) | ||
929 | if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) | ||
930 | wait_for_completion(&mcast->done); | ||
931 | |||
932 | /* | ||
933 | * We have to cancel outside of the spinlock, but we have to | ||
934 | * take the rtnl lock or else we race with the removal of | ||
935 | * entries from the remove list in mcast_dev_flush as part | ||
936 | * of ipoib_stop(). We detect the drop of the ADMIN_UP flag | ||
937 | * to signal that we have hit this particular race, and we | ||
938 | * return since we know we don't need to do anything else | ||
939 | * anyway. | ||
940 | */ | ||
941 | while (!rtnl_trylock()) { | ||
942 | if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) | ||
943 | return; | ||
944 | else | ||
945 | msleep(20); | ||
946 | } | ||
884 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { | 947 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { |
885 | ipoib_mcast_leave(mcast->dev, mcast); | 948 | ipoib_mcast_leave(mcast->dev, mcast); |
886 | ipoib_mcast_free(mcast); | 949 | ipoib_mcast_free(mcast); |
887 | } | 950 | } |
888 | 951 | /* | |
889 | if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) | 952 | * Restart our join task if needed |
890 | ipoib_mcast_start_thread(dev); | 953 | */ |
954 | ipoib_mcast_start_thread(dev); | ||
955 | rtnl_unlock(); | ||
891 | } | 956 | } |
892 | 957 | ||
893 | #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG | 958 | #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG |
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index c56d5d44c53b..b72a753eb41d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c | |||
@@ -145,10 +145,20 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) | |||
145 | int ret, size; | 145 | int ret, size; |
146 | int i; | 146 | int i; |
147 | 147 | ||
148 | /* | ||
149 | * the various IPoIB tasks assume they will never race against | ||
150 | * themselves, so always use a single thread workqueue | ||
151 | */ | ||
152 | priv->wq = create_singlethread_workqueue("ipoib_wq"); | ||
153 | if (!priv->wq) { | ||
154 | printk(KERN_WARNING "ipoib: failed to allocate device WQ\n"); | ||
155 | return -ENODEV; | ||
156 | } | ||
157 | |||
148 | priv->pd = ib_alloc_pd(priv->ca); | 158 | priv->pd = ib_alloc_pd(priv->ca); |
149 | if (IS_ERR(priv->pd)) { | 159 | if (IS_ERR(priv->pd)) { |
150 | printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); | 160 | printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); |
151 | return -ENODEV; | 161 | goto out_free_wq; |
152 | } | 162 | } |
153 | 163 | ||
154 | priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); | 164 | priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); |
@@ -242,6 +252,10 @@ out_free_mr: | |||
242 | 252 | ||
243 | out_free_pd: | 253 | out_free_pd: |
244 | ib_dealloc_pd(priv->pd); | 254 | ib_dealloc_pd(priv->pd); |
255 | |||
256 | out_free_wq: | ||
257 | destroy_workqueue(priv->wq); | ||
258 | priv->wq = NULL; | ||
245 | return -ENODEV; | 259 | return -ENODEV; |
246 | } | 260 | } |
247 | 261 | ||
@@ -270,6 +284,12 @@ void ipoib_transport_dev_cleanup(struct net_device *dev) | |||
270 | 284 | ||
271 | if (ib_dealloc_pd(priv->pd)) | 285 | if (ib_dealloc_pd(priv->pd)) |
272 | ipoib_warn(priv, "ib_dealloc_pd failed\n"); | 286 | ipoib_warn(priv, "ib_dealloc_pd failed\n"); |
287 | |||
288 | if (priv->wq) { | ||
289 | flush_workqueue(priv->wq); | ||
290 | destroy_workqueue(priv->wq); | ||
291 | priv->wq = NULL; | ||
292 | } | ||
273 | } | 293 | } |
274 | 294 | ||
275 | void ipoib_event(struct ib_event_handler *handler, | 295 | void ipoib_event(struct ib_event_handler *handler, |
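Because ipoib_transport_dev_init now creates priv->wq before anything else, its error unwinding gains one more rung, and ipoib_transport_dev_cleanup tears the queue down last, after flushing it. A short sketch of that symmetric unwinding, with a generic second resource standing in for the PD/MR/CQ/QP chain:

#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/errno.h>

struct my_transport {                   /* hypothetical */
        struct workqueue_struct *wq;
        void                    *res;   /* stands in for pd/mr/cq/qp */
};

static int my_transport_init(struct my_transport *t)
{
        t->wq = create_singlethread_workqueue("my_wq");
        if (!t->wq)
                return -ENODEV;

        t->res = kzalloc(64, GFP_KERNEL);
        if (!t->res)
                goto out_free_wq;

        return 0;

out_free_wq:
        destroy_workqueue(t->wq);       /* unwind in reverse allocation order */
        t->wq = NULL;
        return -ENODEV;
}

static void my_transport_cleanup(struct my_transport *t)
{
        kfree(t->res);
        if (t->wq) {
                flush_workqueue(t->wq); /* nothing may still be queued */
                destroy_workqueue(t->wq);
                t->wq = NULL;
        }
}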
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 20ca6a619476..6a594aac2290 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c | |||
@@ -97,7 +97,7 @@ module_param_named(pi_enable, iser_pi_enable, bool, 0644); | |||
97 | MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); | 97 | MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); |
98 | 98 | ||
99 | module_param_named(pi_guard, iser_pi_guard, int, 0644); | 99 | module_param_named(pi_guard, iser_pi_guard, int, 0644); |
100 | MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:IP_CSUM)"); | 100 | MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]"); |
101 | 101 | ||
102 | static struct workqueue_struct *release_wq; | 102 | static struct workqueue_struct *release_wq; |
103 | struct iser_global ig; | 103 | struct iser_global ig; |
@@ -164,18 +164,42 @@ iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode) | |||
164 | return 0; | 164 | return 0; |
165 | } | 165 | } |
166 | 166 | ||
167 | int iser_initialize_task_headers(struct iscsi_task *task, | 167 | /** |
168 | struct iser_tx_desc *tx_desc) | 168 | * iser_initialize_task_headers() - Initialize task headers |
169 | * @task: iscsi task | ||
170 | * @tx_desc: iser tx descriptor | ||
171 | * | ||
172 | * Notes: | ||
173 | * This routine may race with iser teardown flow for scsi | ||
174 | * error handling TMFs. So for TMF we should acquire the | ||
175 | * state mutex to avoid dereferencing the IB device which | ||
176 | * may have already been terminated. | ||
177 | */ | ||
178 | int | ||
179 | iser_initialize_task_headers(struct iscsi_task *task, | ||
180 | struct iser_tx_desc *tx_desc) | ||
169 | { | 181 | { |
170 | struct iser_conn *iser_conn = task->conn->dd_data; | 182 | struct iser_conn *iser_conn = task->conn->dd_data; |
171 | struct iser_device *device = iser_conn->ib_conn.device; | 183 | struct iser_device *device = iser_conn->ib_conn.device; |
172 | struct iscsi_iser_task *iser_task = task->dd_data; | 184 | struct iscsi_iser_task *iser_task = task->dd_data; |
173 | u64 dma_addr; | 185 | u64 dma_addr; |
186 | const bool mgmt_task = !task->sc && !in_interrupt(); | ||
187 | int ret = 0; | ||
188 | |||
189 | if (unlikely(mgmt_task)) | ||
190 | mutex_lock(&iser_conn->state_mutex); | ||
191 | |||
192 | if (unlikely(iser_conn->state != ISER_CONN_UP)) { | ||
193 | ret = -ENODEV; | ||
194 | goto out; | ||
195 | } | ||
174 | 196 | ||
175 | dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc, | 197 | dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc, |
176 | ISER_HEADERS_LEN, DMA_TO_DEVICE); | 198 | ISER_HEADERS_LEN, DMA_TO_DEVICE); |
177 | if (ib_dma_mapping_error(device->ib_device, dma_addr)) | 199 | if (ib_dma_mapping_error(device->ib_device, dma_addr)) { |
178 | return -ENOMEM; | 200 | ret = -ENOMEM; |
201 | goto out; | ||
202 | } | ||
179 | 203 | ||
180 | tx_desc->dma_addr = dma_addr; | 204 | tx_desc->dma_addr = dma_addr; |
181 | tx_desc->tx_sg[0].addr = tx_desc->dma_addr; | 205 | tx_desc->tx_sg[0].addr = tx_desc->dma_addr; |
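The kernel-doc added above explains why only management (TMF) tasks take state_mutex here: they arrive in process context while the connection may already be tearing down, whereas regular SCSI tasks cannot sleep on this path. A hedged sketch of that conditional locking, with my_conn standing in for struct iser_conn:

#include <linux/mutex.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/types.h>

enum my_conn_state { MY_CONN_UP, MY_CONN_TERMINATING };

struct my_conn {                        /* hypothetical, mirrors iser_conn */
        struct mutex       state_mutex;
        enum my_conn_state state;
};

static int my_init_task(struct my_conn *conn, bool has_scsi_cmd)
{
        /* TMF/mgmt tasks carry no SCSI command and run in process context */
        const bool mgmt_task = !has_scsi_cmd && !in_interrupt();
        int ret = 0;

        if (unlikely(mgmt_task))
                mutex_lock(&conn->state_mutex);

        if (unlikely(conn->state != MY_CONN_UP)) {
                ret = -ENODEV;          /* the IB device may already be gone */
                goto out;
        }

        /* ...safe to touch the connection's IB device here... */
out:
        if (unlikely(mgmt_task))
                mutex_unlock(&conn->state_mutex);
        return ret;
}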
@@ -183,7 +207,11 @@ int iser_initialize_task_headers(struct iscsi_task *task, | |||
183 | tx_desc->tx_sg[0].lkey = device->mr->lkey; | 207 | tx_desc->tx_sg[0].lkey = device->mr->lkey; |
184 | 208 | ||
185 | iser_task->iser_conn = iser_conn; | 209 | iser_task->iser_conn = iser_conn; |
186 | return 0; | 210 | out: |
211 | if (unlikely(mgmt_task)) | ||
212 | mutex_unlock(&iser_conn->state_mutex); | ||
213 | |||
214 | return ret; | ||
187 | } | 215 | } |
188 | 216 | ||
189 | /** | 217 | /** |
@@ -199,9 +227,14 @@ static int | |||
199 | iscsi_iser_task_init(struct iscsi_task *task) | 227 | iscsi_iser_task_init(struct iscsi_task *task) |
200 | { | 228 | { |
201 | struct iscsi_iser_task *iser_task = task->dd_data; | 229 | struct iscsi_iser_task *iser_task = task->dd_data; |
230 | int ret; | ||
202 | 231 | ||
203 | if (iser_initialize_task_headers(task, &iser_task->desc)) | 232 | ret = iser_initialize_task_headers(task, &iser_task->desc); |
204 | return -ENOMEM; | 233 | if (ret) { |
234 | iser_err("Failed to init task %p, err = %d\n", | ||
235 | iser_task, ret); | ||
236 | return ret; | ||
237 | } | ||
205 | 238 | ||
206 | /* mgmt task */ | 239 | /* mgmt task */ |
207 | if (!task->sc) | 240 | if (!task->sc) |
@@ -508,8 +541,8 @@ iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) | |||
508 | */ | 541 | */ |
509 | if (iser_conn) { | 542 | if (iser_conn) { |
510 | mutex_lock(&iser_conn->state_mutex); | 543 | mutex_lock(&iser_conn->state_mutex); |
511 | iscsi_conn_stop(cls_conn, flag); | ||
512 | iser_conn_terminate(iser_conn); | 544 | iser_conn_terminate(iser_conn); |
545 | iscsi_conn_stop(cls_conn, flag); | ||
513 | 546 | ||
514 | /* unbind */ | 547 | /* unbind */ |
515 | iser_conn->iscsi_conn = NULL; | 548 | iser_conn->iscsi_conn = NULL; |
@@ -541,12 +574,13 @@ iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) | |||
541 | static inline unsigned int | 574 | static inline unsigned int |
542 | iser_dif_prot_caps(int prot_caps) | 575 | iser_dif_prot_caps(int prot_caps) |
543 | { | 576 | { |
544 | return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? SHOST_DIF_TYPE1_PROTECTION | | 577 | return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? |
545 | SHOST_DIX_TYPE1_PROTECTION : 0) | | 578 | SHOST_DIF_TYPE1_PROTECTION | SHOST_DIX_TYPE0_PROTECTION | |
546 | ((prot_caps & IB_PROT_T10DIF_TYPE_2) ? SHOST_DIF_TYPE2_PROTECTION | | 579 | SHOST_DIX_TYPE1_PROTECTION : 0) | |
547 | SHOST_DIX_TYPE2_PROTECTION : 0) | | 580 | ((prot_caps & IB_PROT_T10DIF_TYPE_2) ? |
548 | ((prot_caps & IB_PROT_T10DIF_TYPE_3) ? SHOST_DIF_TYPE3_PROTECTION | | 581 | SHOST_DIF_TYPE2_PROTECTION | SHOST_DIX_TYPE2_PROTECTION : 0) | |
549 | SHOST_DIX_TYPE3_PROTECTION : 0); | 582 | ((prot_caps & IB_PROT_T10DIF_TYPE_3) ? |
583 | SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE3_PROTECTION : 0); | ||
550 | } | 584 | } |
551 | 585 | ||
552 | /** | 586 | /** |
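The rewritten iser_dif_prot_caps now reports the DIX capability alongside DIF for every supported type, plus DIX type 0 whenever type 1 is supported, and the guard mask passed a few lines below is always IP checksum plus CRC rather than following the deprecated pi_guard parameter. As a hedged worked example, assuming a device that advertises only types 1 and 3:

/*
 * Assumed capability mask, not taken from any real device:
 *     sig_caps = IB_PROT_T10DIF_TYPE_1 | IB_PROT_T10DIF_TYPE_3
 *
 * iser_dif_prot_caps(sig_caps) then evaluates to
 *     SHOST_DIF_TYPE1_PROTECTION | SHOST_DIX_TYPE0_PROTECTION |
 *     SHOST_DIX_TYPE1_PROTECTION |
 *     SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE3_PROTECTION
 *
 * which is the mask handed to scsi_host_set_prot(), while
 * scsi_host_set_guard() now always receives
 *     SHOST_DIX_GUARD_IP | SHOST_DIX_GUARD_CRC.
 */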
@@ -569,6 +603,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, | |||
569 | struct Scsi_Host *shost; | 603 | struct Scsi_Host *shost; |
570 | struct iser_conn *iser_conn = NULL; | 604 | struct iser_conn *iser_conn = NULL; |
571 | struct ib_conn *ib_conn; | 605 | struct ib_conn *ib_conn; |
606 | u16 max_cmds; | ||
572 | 607 | ||
573 | shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); | 608 | shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); |
574 | if (!shost) | 609 | if (!shost) |
@@ -586,26 +621,41 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, | |||
586 | */ | 621 | */ |
587 | if (ep) { | 622 | if (ep) { |
588 | iser_conn = ep->dd_data; | 623 | iser_conn = ep->dd_data; |
624 | max_cmds = iser_conn->max_cmds; | ||
625 | |||
626 | mutex_lock(&iser_conn->state_mutex); | ||
627 | if (iser_conn->state != ISER_CONN_UP) { | ||
628 | iser_err("iser conn %p already started teardown\n", | ||
629 | iser_conn); | ||
630 | mutex_unlock(&iser_conn->state_mutex); | ||
631 | goto free_host; | ||
632 | } | ||
633 | |||
589 | ib_conn = &iser_conn->ib_conn; | 634 | ib_conn = &iser_conn->ib_conn; |
590 | if (ib_conn->pi_support) { | 635 | if (ib_conn->pi_support) { |
591 | u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap; | 636 | u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap; |
592 | 637 | ||
593 | scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); | 638 | scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); |
594 | if (iser_pi_guard) | 639 | scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP | |
595 | scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP); | 640 | SHOST_DIX_GUARD_CRC); |
596 | else | ||
597 | scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC); | ||
598 | } | 641 | } |
599 | } | ||
600 | 642 | ||
601 | if (iscsi_host_add(shost, ep ? | 643 | if (iscsi_host_add(shost, |
602 | ib_conn->device->ib_device->dma_device : NULL)) | 644 | ib_conn->device->ib_device->dma_device)) { |
603 | goto free_host; | 645 | mutex_unlock(&iser_conn->state_mutex); |
646 | goto free_host; | ||
647 | } | ||
648 | mutex_unlock(&iser_conn->state_mutex); | ||
649 | } else { | ||
650 | max_cmds = ISER_DEF_XMIT_CMDS_MAX; | ||
651 | if (iscsi_host_add(shost, NULL)) | ||
652 | goto free_host; | ||
653 | } | ||
604 | 654 | ||
605 | if (cmds_max > ISER_DEF_XMIT_CMDS_MAX) { | 655 | if (cmds_max > max_cmds) { |
606 | iser_info("cmds_max changed from %u to %u\n", | 656 | iser_info("cmds_max changed from %u to %u\n", |
607 | cmds_max, ISER_DEF_XMIT_CMDS_MAX); | 657 | cmds_max, max_cmds); |
608 | cmds_max = ISER_DEF_XMIT_CMDS_MAX; | 658 | cmds_max = max_cmds; |
609 | } | 659 | } |
610 | 660 | ||
611 | cls_session = iscsi_session_setup(&iscsi_iser_transport, shost, | 661 | cls_session = iscsi_session_setup(&iscsi_iser_transport, shost, |
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index cd4174ca9a76..5ce26817e7e1 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h | |||
@@ -69,34 +69,31 @@ | |||
69 | 69 | ||
70 | #define DRV_NAME "iser" | 70 | #define DRV_NAME "iser" |
71 | #define PFX DRV_NAME ": " | 71 | #define PFX DRV_NAME ": " |
72 | #define DRV_VER "1.4.8" | 72 | #define DRV_VER "1.5" |
73 | 73 | ||
74 | #define iser_dbg(fmt, arg...) \ | 74 | #define iser_dbg(fmt, arg...) \ |
75 | do { \ | 75 | do { \ |
76 | if (iser_debug_level > 2) \ | 76 | if (unlikely(iser_debug_level > 2)) \ |
77 | printk(KERN_DEBUG PFX "%s: " fmt,\ | 77 | printk(KERN_DEBUG PFX "%s: " fmt,\ |
78 | __func__ , ## arg); \ | 78 | __func__ , ## arg); \ |
79 | } while (0) | 79 | } while (0) |
80 | 80 | ||
81 | #define iser_warn(fmt, arg...) \ | 81 | #define iser_warn(fmt, arg...) \ |
82 | do { \ | 82 | do { \ |
83 | if (iser_debug_level > 0) \ | 83 | if (unlikely(iser_debug_level > 0)) \ |
84 | pr_warn(PFX "%s: " fmt, \ | 84 | pr_warn(PFX "%s: " fmt, \ |
85 | __func__ , ## arg); \ | 85 | __func__ , ## arg); \ |
86 | } while (0) | 86 | } while (0) |
87 | 87 | ||
88 | #define iser_info(fmt, arg...) \ | 88 | #define iser_info(fmt, arg...) \ |
89 | do { \ | 89 | do { \ |
90 | if (iser_debug_level > 1) \ | 90 | if (unlikely(iser_debug_level > 1)) \ |
91 | pr_info(PFX "%s: " fmt, \ | 91 | pr_info(PFX "%s: " fmt, \ |
92 | __func__ , ## arg); \ | 92 | __func__ , ## arg); \ |
93 | } while (0) | 93 | } while (0) |
94 | 94 | ||
95 | #define iser_err(fmt, arg...) \ | 95 | #define iser_err(fmt, arg...) \ |
96 | do { \ | 96 | pr_err(PFX "%s: " fmt, __func__ , ## arg) |
97 | printk(KERN_ERR PFX "%s: " fmt, \ | ||
98 | __func__ , ## arg); \ | ||
99 | } while (0) | ||
100 | 97 | ||
101 | #define SHIFT_4K 12 | 98 | #define SHIFT_4K 12 |
102 | #define SIZE_4K (1ULL << SHIFT_4K) | 99 | #define SIZE_4K (1ULL << SHIFT_4K) |
@@ -144,6 +141,11 @@ | |||
144 | ISER_MAX_TX_MISC_PDUS + \ | 141 | ISER_MAX_TX_MISC_PDUS + \ |
145 | ISER_MAX_RX_MISC_PDUS) | 142 | ISER_MAX_RX_MISC_PDUS) |
146 | 143 | ||
144 | #define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr \ | ||
145 | - ISER_MAX_TX_MISC_PDUS \ | ||
146 | - ISER_MAX_RX_MISC_PDUS) / \ | ||
147 | (1 + ISER_INFLIGHT_DATAOUTS)) | ||
148 | |||
147 | #define ISER_WC_BATCH_COUNT 16 | 149 | #define ISER_WC_BATCH_COUNT 16 |
148 | #define ISER_SIGNAL_CMD_COUNT 32 | 150 | #define ISER_SIGNAL_CMD_COUNT 32 |
149 | 151 | ||
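The new ISER_GET_MAX_XMIT_CMDS macro turns the device's send work request budget into a per-connection command limit by reserving the miscellaneous TX/RX PDUs and dividing what remains by one command plus its possible in-flight data-outs. A hedged worked example with illustrative numbers (assumed here, not read from the header):

/*
 * ISER_GET_MAX_XMIT_CMDS(send_wr) =
 *     (send_wr - ISER_MAX_TX_MISC_PDUS - ISER_MAX_RX_MISC_PDUS)
 *         / (1 + ISER_INFLIGHT_DATAOUTS)
 *
 * Assuming, for illustration only, a device with max_qp_wr = 16384,
 * a combined TX/RX misc reservation of 10, and 8 in-flight data-outs:
 *     (16384 - 10) / (1 + 8) = 1819
 * so iser_conn->max_cmds would be 1819, and iscsi_iser_session_create()
 * clamps any larger cmds_max request down to that value.
 */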
@@ -247,7 +249,6 @@ struct iscsi_endpoint; | |||
247 | * @va: MR start address (buffer va) | 249 | * @va: MR start address (buffer va) |
248 | * @len: MR length | 250 | * @len: MR length |
249 | * @mem_h: pointer to registration context (FMR/Fastreg) | 251 | * @mem_h: pointer to registration context (FMR/Fastreg) |
250 | * @is_mr: indicates weather we registered the buffer | ||
251 | */ | 252 | */ |
252 | struct iser_mem_reg { | 253 | struct iser_mem_reg { |
253 | u32 lkey; | 254 | u32 lkey; |
@@ -255,7 +256,6 @@ struct iser_mem_reg { | |||
255 | u64 va; | 256 | u64 va; |
256 | u64 len; | 257 | u64 len; |
257 | void *mem_h; | 258 | void *mem_h; |
258 | int is_mr; | ||
259 | }; | 259 | }; |
260 | 260 | ||
261 | /** | 261 | /** |
@@ -323,8 +323,6 @@ struct iser_rx_desc { | |||
323 | char pad[ISER_RX_PAD_SIZE]; | 323 | char pad[ISER_RX_PAD_SIZE]; |
324 | } __attribute__((packed)); | 324 | } __attribute__((packed)); |
325 | 325 | ||
326 | #define ISER_MAX_CQ 4 | ||
327 | |||
328 | struct iser_conn; | 326 | struct iser_conn; |
329 | struct ib_conn; | 327 | struct ib_conn; |
330 | struct iscsi_iser_task; | 328 | struct iscsi_iser_task; |
@@ -375,7 +373,7 @@ struct iser_device { | |||
375 | struct list_head ig_list; | 373 | struct list_head ig_list; |
376 | int refcount; | 374 | int refcount; |
377 | int comps_used; | 375 | int comps_used; |
378 | struct iser_comp comps[ISER_MAX_CQ]; | 376 | struct iser_comp *comps; |
379 | int (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn, | 377 | int (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn, |
380 | unsigned cmds_max); | 378 | unsigned cmds_max); |
381 | void (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn); | 379 | void (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn); |
@@ -432,6 +430,7 @@ struct fast_reg_descriptor { | |||
432 | * @cma_id: rdma_cm connection maneger handle | 430 | * @cma_id: rdma_cm connection maneger handle |
433 | * @qp: Connection Queue-pair | 431 | * @qp: Connection Queue-pair |
434 | * @post_recv_buf_count: post receive counter | 432 | * @post_recv_buf_count: post receive counter |
433 | * @sig_count: send work request signal count | ||
435 | * @rx_wr: receive work request for batch posts | 434 | * @rx_wr: receive work request for batch posts |
436 | * @device: reference to iser device | 435 | * @device: reference to iser device |
437 | * @comp: iser completion context | 436 | * @comp: iser completion context |
@@ -452,6 +451,7 @@ struct ib_conn { | |||
452 | struct rdma_cm_id *cma_id; | 451 | struct rdma_cm_id *cma_id; |
453 | struct ib_qp *qp; | 452 | struct ib_qp *qp; |
454 | int post_recv_buf_count; | 453 | int post_recv_buf_count; |
454 | u8 sig_count; | ||
455 | struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; | 455 | struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; |
456 | struct iser_device *device; | 456 | struct iser_device *device; |
457 | struct iser_comp *comp; | 457 | struct iser_comp *comp; |
@@ -482,6 +482,7 @@ struct ib_conn { | |||
482 | * to max number of post recvs | 482 | * to max number of post recvs |
483 | * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1) | 483 | * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1) |
484 | * @min_posted_rx: (qp_max_recv_dtos >> 2) | 484 | * @min_posted_rx: (qp_max_recv_dtos >> 2) |
485 | * @max_cmds: maximum cmds allowed for this connection | ||
485 | * @name: connection peer portal | 486 | * @name: connection peer portal |
486 | * @release_work: deffered work for release job | 487 | * @release_work: deffered work for release job |
487 | * @state_mutex: protects iser onnection state | 488 | * @state_mutex: protects iser onnection state |
@@ -507,6 +508,7 @@ struct iser_conn { | |||
507 | unsigned qp_max_recv_dtos; | 508 | unsigned qp_max_recv_dtos; |
508 | unsigned qp_max_recv_dtos_mask; | 509 | unsigned qp_max_recv_dtos_mask; |
509 | unsigned min_posted_rx; | 510 | unsigned min_posted_rx; |
511 | u16 max_cmds; | ||
510 | char name[ISER_OBJECT_NAME_SIZE]; | 512 | char name[ISER_OBJECT_NAME_SIZE]; |
511 | struct work_struct release_work; | 513 | struct work_struct release_work; |
512 | struct mutex state_mutex; | 514 | struct mutex state_mutex; |
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 5a489ea63732..3821633f1065 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c | |||
@@ -369,7 +369,7 @@ static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) | |||
369 | return 0; | 369 | return 0; |
370 | } | 370 | } |
371 | 371 | ||
372 | static inline bool iser_signal_comp(int sig_count) | 372 | static inline bool iser_signal_comp(u8 sig_count) |
373 | { | 373 | { |
374 | return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0); | 374 | return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0); |
375 | } | 375 | } |
@@ -388,7 +388,7 @@ int iser_send_command(struct iscsi_conn *conn, | |||
388 | struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; | 388 | struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; |
389 | struct scsi_cmnd *sc = task->sc; | 389 | struct scsi_cmnd *sc = task->sc; |
390 | struct iser_tx_desc *tx_desc = &iser_task->desc; | 390 | struct iser_tx_desc *tx_desc = &iser_task->desc; |
391 | static unsigned sig_count; | 391 | u8 sig_count = ++iser_conn->ib_conn.sig_count; |
392 | 392 | ||
393 | edtl = ntohl(hdr->data_length); | 393 | edtl = ntohl(hdr->data_length); |
394 | 394 | ||
@@ -435,7 +435,7 @@ int iser_send_command(struct iscsi_conn *conn, | |||
435 | iser_task->status = ISER_TASK_STATUS_STARTED; | 435 | iser_task->status = ISER_TASK_STATUS_STARTED; |
436 | 436 | ||
437 | err = iser_post_send(&iser_conn->ib_conn, tx_desc, | 437 | err = iser_post_send(&iser_conn->ib_conn, tx_desc, |
438 | iser_signal_comp(++sig_count)); | 438 | iser_signal_comp(sig_count)); |
439 | if (!err) | 439 | if (!err) |
440 | return 0; | 440 | return 0; |
441 | 441 | ||
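Making sig_count a per-connection field replaces the shared static counter, so each connection independently requests a completion on every ISER_SIGNAL_CMD_COUNT-th send, keeping completion-processing overhead down while still reaping the send queue at a steady rate. A hedged sketch of selective signaling on a send path; the function name and batch size are stand-ins:

#include <linux/string.h>
#include <rdma/ib_verbs.h>

#define MY_SIGNAL_BATCH 32              /* stands in for ISER_SIGNAL_CMD_COUNT */

/* Post a send, requesting a completion only every MY_SIGNAL_BATCH-th time. */
static int my_post_send(struct ib_qp *qp, struct ib_sge *sge, u8 *sig_count)
{
        struct ib_send_wr wr, *bad_wr;

        memset(&wr, 0, sizeof(wr));
        wr.opcode     = IB_WR_SEND;
        wr.sg_list    = sge;
        wr.num_sge    = 1;
        wr.send_flags = (++(*sig_count) % MY_SIGNAL_BATCH) == 0 ?
                        IB_SEND_SIGNALED : 0;

        return ib_post_send(qp, &wr, &bad_wr);
}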
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 6c5ce357fba6..abce9339333f 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c | |||
@@ -73,7 +73,6 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, | |||
73 | 73 | ||
74 | if (cmd_dir == ISER_DIR_OUT) { | 74 | if (cmd_dir == ISER_DIR_OUT) { |
75 | /* copy the unaligned sg the buffer which is used for RDMA */ | 75 | /* copy the unaligned sg the buffer which is used for RDMA */ |
76 | int i; | ||
77 | char *p, *from; | 76 | char *p, *from; |
78 | 77 | ||
79 | sgl = (struct scatterlist *)data->buf; | 78 | sgl = (struct scatterlist *)data->buf; |
@@ -409,7 +408,6 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, | |||
409 | regd_buf->reg.rkey = device->mr->rkey; | 408 | regd_buf->reg.rkey = device->mr->rkey; |
410 | regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); | 409 | regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); |
411 | regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); | 410 | regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); |
412 | regd_buf->reg.is_mr = 0; | ||
413 | 411 | ||
414 | iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " | 412 | iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " |
415 | "va: 0x%08lX sz: %ld]\n", | 413 | "va: 0x%08lX sz: %ld]\n", |
@@ -440,13 +438,13 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, | |||
440 | return 0; | 438 | return 0; |
441 | } | 439 | } |
442 | 440 | ||
443 | static inline void | 441 | static void |
444 | iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, | 442 | iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, |
445 | struct ib_sig_domain *domain) | 443 | struct ib_sig_domain *domain) |
446 | { | 444 | { |
447 | domain->sig_type = IB_SIG_TYPE_T10_DIF; | 445 | domain->sig_type = IB_SIG_TYPE_T10_DIF; |
448 | domain->sig.dif.pi_interval = sc->device->sector_size; | 446 | domain->sig.dif.pi_interval = scsi_prot_interval(sc); |
449 | domain->sig.dif.ref_tag = scsi_get_lba(sc) & 0xffffffff; | 447 | domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc); |
450 | /* | 448 | /* |
451 | * At the moment we hard code those, but in the future | 449 | * At the moment we hard code those, but in the future |
452 | * we will take them from sc. | 450 | * we will take them from sc. |
@@ -454,8 +452,7 @@ iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, | |||
454 | domain->sig.dif.apptag_check_mask = 0xffff; | 452 | domain->sig.dif.apptag_check_mask = 0xffff; |
455 | domain->sig.dif.app_escape = true; | 453 | domain->sig.dif.app_escape = true; |
456 | domain->sig.dif.ref_escape = true; | 454 | domain->sig.dif.ref_escape = true; |
457 | if (scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE1 || | 455 | if (sc->prot_flags & SCSI_PROT_REF_INCREMENT) |
458 | scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE2) | ||
459 | domain->sig.dif.ref_remap = true; | 456 | domain->sig.dif.ref_remap = true; |
460 | }; | 457 | }; |
461 | 458 | ||
@@ -473,26 +470,16 @@ iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs) | |||
473 | case SCSI_PROT_WRITE_STRIP: | 470 | case SCSI_PROT_WRITE_STRIP: |
474 | sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; | 471 | sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; |
475 | iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); | 472 | iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); |
476 | /* | 473 | sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ? |
477 | * At the moment we use this modparam to tell what is | 474 | IB_T10DIF_CSUM : IB_T10DIF_CRC; |
478 | * the memory bg_type, in the future we will take it | ||
479 | * from sc. | ||
480 | */ | ||
481 | sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM : | ||
482 | IB_T10DIF_CRC; | ||
483 | break; | 475 | break; |
484 | case SCSI_PROT_READ_PASS: | 476 | case SCSI_PROT_READ_PASS: |
485 | case SCSI_PROT_WRITE_PASS: | 477 | case SCSI_PROT_WRITE_PASS: |
486 | iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire); | 478 | iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire); |
487 | sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; | 479 | sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; |
488 | iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); | 480 | iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); |
489 | /* | 481 | sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ? |
490 | * At the moment we use this modparam to tell what is | 482 | IB_T10DIF_CSUM : IB_T10DIF_CRC; |
491 | * the memory bg_type, in the future we will take it | ||
492 | * from sc. | ||
493 | */ | ||
494 | sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM : | ||
495 | IB_T10DIF_CRC; | ||
496 | break; | 483 | break; |
497 | default: | 484 | default: |
498 | iser_err("Unsupported PI operation %d\n", | 485 | iser_err("Unsupported PI operation %d\n", |
@@ -503,26 +490,28 @@ iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs) | |||
503 | return 0; | 490 | return 0; |
504 | } | 491 | } |
505 | 492 | ||
506 | static int | 493 | static inline void |
507 | iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) | 494 | iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) |
508 | { | 495 | { |
509 | switch (scsi_get_prot_type(sc)) { | 496 | *mask = 0; |
510 | case SCSI_PROT_DIF_TYPE0: | 497 | if (sc->prot_flags & SCSI_PROT_REF_CHECK) |
511 | break; | 498 | *mask |= ISER_CHECK_REFTAG; |
512 | case SCSI_PROT_DIF_TYPE1: | 499 | if (sc->prot_flags & SCSI_PROT_GUARD_CHECK) |
513 | case SCSI_PROT_DIF_TYPE2: | 500 | *mask |= ISER_CHECK_GUARD; |
514 | *mask = ISER_CHECK_GUARD | ISER_CHECK_REFTAG; | 501 | } |
515 | break; | ||
516 | case SCSI_PROT_DIF_TYPE3: | ||
517 | *mask = ISER_CHECK_GUARD; | ||
518 | break; | ||
519 | default: | ||
520 | iser_err("Unsupported protection type %d\n", | ||
521 | scsi_get_prot_type(sc)); | ||
522 | return -EINVAL; | ||
523 | } | ||
524 | 502 | ||
525 | return 0; | 503 | static void |
504 | iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) | ||
505 | { | ||
506 | u32 rkey; | ||
507 | |||
508 | memset(inv_wr, 0, sizeof(*inv_wr)); | ||
509 | inv_wr->opcode = IB_WR_LOCAL_INV; | ||
510 | inv_wr->wr_id = ISER_FASTREG_LI_WRID; | ||
511 | inv_wr->ex.invalidate_rkey = mr->rkey; | ||
512 | |||
513 | rkey = ib_inc_rkey(mr->rkey); | ||
514 | ib_update_fast_reg_key(mr, rkey); | ||
526 | } | 515 | } |
527 | 516 | ||
528 | static int | 517 | static int |
@@ -536,26 +525,17 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, | |||
536 | struct ib_send_wr *bad_wr, *wr = NULL; | 525 | struct ib_send_wr *bad_wr, *wr = NULL; |
537 | struct ib_sig_attrs sig_attrs; | 526 | struct ib_sig_attrs sig_attrs; |
538 | int ret; | 527 | int ret; |
539 | u32 key; | ||
540 | 528 | ||
541 | memset(&sig_attrs, 0, sizeof(sig_attrs)); | 529 | memset(&sig_attrs, 0, sizeof(sig_attrs)); |
542 | ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs); | 530 | ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs); |
543 | if (ret) | 531 | if (ret) |
544 | goto err; | 532 | goto err; |
545 | 533 | ||
546 | ret = iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask); | 534 | iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask); |
547 | if (ret) | ||
548 | goto err; | ||
549 | 535 | ||
550 | if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) { | 536 | if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) { |
551 | memset(&inv_wr, 0, sizeof(inv_wr)); | 537 | iser_inv_rkey(&inv_wr, pi_ctx->sig_mr); |
552 | inv_wr.opcode = IB_WR_LOCAL_INV; | ||
553 | inv_wr.wr_id = ISER_FASTREG_LI_WRID; | ||
554 | inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey; | ||
555 | wr = &inv_wr; | 538 | wr = &inv_wr; |
556 | /* Bump the key */ | ||
557 | key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF); | ||
558 | ib_update_fast_reg_key(pi_ctx->sig_mr, ++key); | ||
559 | } | 539 | } |
560 | 540 | ||
561 | memset(&sig_wr, 0, sizeof(sig_wr)); | 541 | memset(&sig_wr, 0, sizeof(sig_wr)); |
@@ -585,12 +565,7 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, | |||
585 | 565 | ||
586 | sig_sge->lkey = pi_ctx->sig_mr->lkey; | 566 | sig_sge->lkey = pi_ctx->sig_mr->lkey; |
587 | sig_sge->addr = 0; | 567 | sig_sge->addr = 0; |
588 | sig_sge->length = data_sge->length + prot_sge->length; | 568 | sig_sge->length = scsi_transfer_length(iser_task->sc); |
589 | if (scsi_get_prot_op(iser_task->sc) == SCSI_PROT_WRITE_INSERT || | ||
590 | scsi_get_prot_op(iser_task->sc) == SCSI_PROT_READ_STRIP) { | ||
591 | sig_sge->length += (data_sge->length / | ||
592 | iser_task->sc->device->sector_size) * 8; | ||
593 | } | ||
594 | 569 | ||
595 | iser_dbg("sig_sge: addr: 0x%llx length: %u lkey: 0x%x\n", | 570 | iser_dbg("sig_sge: addr: 0x%llx length: %u lkey: 0x%x\n", |
596 | sig_sge->addr, sig_sge->length, | 571 | sig_sge->addr, sig_sge->length, |
@@ -613,7 +588,6 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, | |||
613 | struct ib_fast_reg_page_list *frpl; | 588 | struct ib_fast_reg_page_list *frpl; |
614 | struct ib_send_wr fastreg_wr, inv_wr; | 589 | struct ib_send_wr fastreg_wr, inv_wr; |
615 | struct ib_send_wr *bad_wr, *wr = NULL; | 590 | struct ib_send_wr *bad_wr, *wr = NULL; |
616 | u8 key; | ||
617 | int ret, offset, size, plen; | 591 | int ret, offset, size, plen; |
618 | 592 | ||
619 | /* if there a single dma entry, dma mr suffices */ | 593 | /* if there a single dma entry, dma mr suffices */ |
@@ -645,14 +619,8 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, | |||
645 | } | 619 | } |
646 | 620 | ||
647 | if (!(desc->reg_indicators & ind)) { | 621 | if (!(desc->reg_indicators & ind)) { |
648 | memset(&inv_wr, 0, sizeof(inv_wr)); | 622 | iser_inv_rkey(&inv_wr, mr); |
649 | inv_wr.wr_id = ISER_FASTREG_LI_WRID; | ||
650 | inv_wr.opcode = IB_WR_LOCAL_INV; | ||
651 | inv_wr.ex.invalidate_rkey = mr->rkey; | ||
652 | wr = &inv_wr; | 623 | wr = &inv_wr; |
653 | /* Bump the key */ | ||
654 | key = (u8)(mr->rkey & 0x000000FF); | ||
655 | ib_update_fast_reg_key(mr, ++key); | ||
656 | } | 624 | } |
657 | 625 | ||
658 | /* Prepare FASTREG WR */ | 626 | /* Prepare FASTREG WR */ |
@@ -770,15 +738,11 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task, | |||
770 | regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey; | 738 | regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey; |
771 | regd_buf->reg.va = sig_sge.addr; | 739 | regd_buf->reg.va = sig_sge.addr; |
772 | regd_buf->reg.len = sig_sge.length; | 740 | regd_buf->reg.len = sig_sge.length; |
773 | regd_buf->reg.is_mr = 1; | ||
774 | } else { | 741 | } else { |
775 | if (desc) { | 742 | if (desc) |
776 | regd_buf->reg.rkey = desc->data_mr->rkey; | 743 | regd_buf->reg.rkey = desc->data_mr->rkey; |
777 | regd_buf->reg.is_mr = 1; | 744 | else |
778 | } else { | ||
779 | regd_buf->reg.rkey = device->mr->rkey; | 745 | regd_buf->reg.rkey = device->mr->rkey; |
780 | regd_buf->reg.is_mr = 0; | ||
781 | } | ||
782 | 746 | ||
783 | regd_buf->reg.lkey = data_sge.lkey; | 747 | regd_buf->reg.lkey = data_sge.lkey; |
784 | regd_buf->reg.va = data_sge.addr; | 748 | regd_buf->reg.va = data_sge.addr; |
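
The iser_inv_rkey() helper introduced above folds the two open-coded local-invalidate setups into one place and swaps the manual key bump for ib_inc_rkey()/ib_update_fast_reg_key(). The removed lines show the intended arithmetic: only the consumer-owned low byte of the rkey advances while the MR index bits are preserved. A standalone restatement of that bump, for illustration only (demo_inc_rkey is not a kernel API):

#include <linux/types.h>

/* Advance only the low 8 "key" bits of an rkey, keeping the MR index bits. */
static inline u32 demo_inc_rkey(u32 rkey)
{
        const u32 mask = 0x000000ff;

        return ((rkey + 1) & mask) | (rkey & ~mask);
}

/* e.g. demo_inc_rkey(0x12ab00ff) == 0x12ab0000 -- the key byte wraps, the index stays. */
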
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 67225bb82bb5..695a2704bd43 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c | |||
@@ -76,7 +76,7 @@ static void iser_event_handler(struct ib_event_handler *handler, | |||
76 | static int iser_create_device_ib_res(struct iser_device *device) | 76 | static int iser_create_device_ib_res(struct iser_device *device) |
77 | { | 77 | { |
78 | struct ib_device_attr *dev_attr = &device->dev_attr; | 78 | struct ib_device_attr *dev_attr = &device->dev_attr; |
79 | int ret, i; | 79 | int ret, i, max_cqe; |
80 | 80 | ||
81 | ret = ib_query_device(device->ib_device, dev_attr); | 81 | ret = ib_query_device(device->ib_device, dev_attr); |
82 | if (ret) { | 82 | if (ret) { |
@@ -104,11 +104,19 @@ static int iser_create_device_ib_res(struct iser_device *device) | |||
104 | return -1; | 104 | return -1; |
105 | } | 105 | } |
106 | 106 | ||
107 | device->comps_used = min(ISER_MAX_CQ, | 107 | device->comps_used = min_t(int, num_online_cpus(), |
108 | device->ib_device->num_comp_vectors); | 108 | device->ib_device->num_comp_vectors); |
109 | iser_info("using %d CQs, device %s supports %d vectors\n", | 109 | |
110 | device->comps = kcalloc(device->comps_used, sizeof(*device->comps), | ||
111 | GFP_KERNEL); | ||
112 | if (!device->comps) | ||
113 | goto comps_err; | ||
114 | |||
115 | max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe); | ||
116 | |||
117 | iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n", | ||
110 | device->comps_used, device->ib_device->name, | 118 | device->comps_used, device->ib_device->name, |
111 | device->ib_device->num_comp_vectors); | 119 | device->ib_device->num_comp_vectors, max_cqe); |
112 | 120 | ||
113 | device->pd = ib_alloc_pd(device->ib_device); | 121 | device->pd = ib_alloc_pd(device->ib_device); |
114 | if (IS_ERR(device->pd)) | 122 | if (IS_ERR(device->pd)) |
@@ -122,7 +130,7 @@ static int iser_create_device_ib_res(struct iser_device *device) | |||
122 | iser_cq_callback, | 130 | iser_cq_callback, |
123 | iser_cq_event_callback, | 131 | iser_cq_event_callback, |
124 | (void *)comp, | 132 | (void *)comp, |
125 | ISER_MAX_CQ_LEN, i); | 133 | max_cqe, i); |
126 | if (IS_ERR(comp->cq)) { | 134 | if (IS_ERR(comp->cq)) { |
127 | comp->cq = NULL; | 135 | comp->cq = NULL; |
128 | goto cq_err; | 136 | goto cq_err; |
@@ -162,6 +170,8 @@ cq_err: | |||
162 | } | 170 | } |
163 | ib_dealloc_pd(device->pd); | 171 | ib_dealloc_pd(device->pd); |
164 | pd_err: | 172 | pd_err: |
173 | kfree(device->comps); | ||
174 | comps_err: | ||
165 | iser_err("failed to allocate an IB resource\n"); | 175 | iser_err("failed to allocate an IB resource\n"); |
166 | return -1; | 176 | return -1; |
167 | } | 177 | } |
@@ -187,6 +197,9 @@ static void iser_free_device_ib_res(struct iser_device *device) | |||
187 | (void)ib_dereg_mr(device->mr); | 197 | (void)ib_dereg_mr(device->mr); |
188 | (void)ib_dealloc_pd(device->pd); | 198 | (void)ib_dealloc_pd(device->pd); |
189 | 199 | ||
200 | kfree(device->comps); | ||
201 | device->comps = NULL; | ||
202 | |||
190 | device->mr = NULL; | 203 | device->mr = NULL; |
191 | device->pd = NULL; | 204 | device->pd = NULL; |
192 | } | 205 | } |
@@ -425,7 +438,10 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn) | |||
425 | */ | 438 | */ |
426 | static int iser_create_ib_conn_res(struct ib_conn *ib_conn) | 439 | static int iser_create_ib_conn_res(struct ib_conn *ib_conn) |
427 | { | 440 | { |
441 | struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, | ||
442 | ib_conn); | ||
428 | struct iser_device *device; | 443 | struct iser_device *device; |
444 | struct ib_device_attr *dev_attr; | ||
429 | struct ib_qp_init_attr init_attr; | 445 | struct ib_qp_init_attr init_attr; |
430 | int ret = -ENOMEM; | 446 | int ret = -ENOMEM; |
431 | int index, min_index = 0; | 447 | int index, min_index = 0; |
@@ -433,6 +449,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) | |||
433 | BUG_ON(ib_conn->device == NULL); | 449 | BUG_ON(ib_conn->device == NULL); |
434 | 450 | ||
435 | device = ib_conn->device; | 451 | device = ib_conn->device; |
452 | dev_attr = &device->dev_attr; | ||
436 | 453 | ||
437 | memset(&init_attr, 0, sizeof init_attr); | 454 | memset(&init_attr, 0, sizeof init_attr); |
438 | 455 | ||
@@ -460,8 +477,20 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) | |||
460 | if (ib_conn->pi_support) { | 477 | if (ib_conn->pi_support) { |
461 | init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1; | 478 | init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1; |
462 | init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; | 479 | init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; |
480 | iser_conn->max_cmds = | ||
481 | ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS); | ||
463 | } else { | 482 | } else { |
464 | init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1; | 483 | if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) { |
484 | init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1; | ||
485 | iser_conn->max_cmds = | ||
486 | ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS); | ||
487 | } else { | ||
488 | init_attr.cap.max_send_wr = dev_attr->max_qp_wr; | ||
489 | iser_conn->max_cmds = | ||
490 | ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr); | ||
491 | iser_dbg("device %s supports max_send_wr %d\n", | ||
492 | device->ib_device->name, dev_attr->max_qp_wr); | ||
493 | } | ||
465 | } | 494 | } |
466 | 495 | ||
467 | ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); | 496 | ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); |
@@ -475,7 +504,11 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) | |||
475 | return ret; | 504 | return ret; |
476 | 505 | ||
477 | out_err: | 506 | out_err: |
507 | mutex_lock(&ig.connlist_mutex); | ||
508 | ib_conn->comp->active_qps--; | ||
509 | mutex_unlock(&ig.connlist_mutex); | ||
478 | iser_err("unable to alloc mem or create resource, err %d\n", ret); | 510 | iser_err("unable to alloc mem or create resource, err %d\n", ret); |
511 | |||
479 | return ret; | 512 | return ret; |
480 | } | 513 | } |
481 | 514 | ||
@@ -610,9 +643,11 @@ void iser_conn_release(struct iser_conn *iser_conn) | |||
610 | mutex_unlock(&ig.connlist_mutex); | 643 | mutex_unlock(&ig.connlist_mutex); |
611 | 644 | ||
612 | mutex_lock(&iser_conn->state_mutex); | 645 | mutex_lock(&iser_conn->state_mutex); |
613 | if (iser_conn->state != ISER_CONN_DOWN) | 646 | if (iser_conn->state != ISER_CONN_DOWN) { |
614 | iser_warn("iser conn %p state %d, expected state down.\n", | 647 | iser_warn("iser conn %p state %d, expected state down.\n", |
615 | iser_conn, iser_conn->state); | 648 | iser_conn, iser_conn->state); |
649 | iser_conn->state = ISER_CONN_DOWN; | ||
650 | } | ||
616 | /* | 651 | /* |
617 | * In case we never got to bind stage, we still need to | 652 | * In case we never got to bind stage, we still need to |
618 | * release IB resources (which is safe to call more than once). | 653 | * release IB resources (which is safe to call more than once). |
@@ -662,8 +697,10 @@ int iser_conn_terminate(struct iser_conn *iser_conn) | |||
662 | 697 | ||
663 | /* post an indication that all flush errors were consumed */ | 698 | /* post an indication that all flush errors were consumed */ |
664 | err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr); | 699 | err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr); |
665 | if (err) | 700 | if (err) { |
666 | iser_err("conn %p failed to post beacon", ib_conn); | 701 | iser_err("conn %p failed to post beacon", ib_conn); |
702 | return 1; | ||
703 | } | ||
667 | 704 | ||
668 | wait_for_completion(&ib_conn->flush_comp); | 705 | wait_for_completion(&ib_conn->flush_comp); |
669 | } | 706 | } |
@@ -846,20 +883,21 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve | |||
846 | break; | 883 | break; |
847 | case RDMA_CM_EVENT_DISCONNECTED: | 884 | case RDMA_CM_EVENT_DISCONNECTED: |
848 | case RDMA_CM_EVENT_ADDR_CHANGE: | 885 | case RDMA_CM_EVENT_ADDR_CHANGE: |
849 | iser_disconnected_handler(cma_id); | 886 | case RDMA_CM_EVENT_TIMEWAIT_EXIT: |
887 | iser_cleanup_handler(cma_id, false); | ||
850 | break; | 888 | break; |
851 | case RDMA_CM_EVENT_DEVICE_REMOVAL: | 889 | case RDMA_CM_EVENT_DEVICE_REMOVAL: |
852 | /* | 890 | /* |
853 | * we *must* destroy the device as we cannot rely | 891 | * we *must* destroy the device as we cannot rely |
854 | * on iscsid to be around to initiate error handling. | 892 | * on iscsid to be around to initiate error handling. |
855 | * also implicitly destroy the cma_id. | 893 | * also if we are not in state DOWN implicitly destroy |
894 | * the cma_id. | ||
856 | */ | 895 | */ |
857 | iser_cleanup_handler(cma_id, true); | 896 | iser_cleanup_handler(cma_id, true); |
858 | iser_conn->ib_conn.cma_id = NULL; | 897 | if (iser_conn->state != ISER_CONN_DOWN) { |
859 | ret = 1; | 898 | iser_conn->ib_conn.cma_id = NULL; |
860 | break; | 899 | ret = 1; |
861 | case RDMA_CM_EVENT_TIMEWAIT_EXIT: | 900 | } |
862 | iser_cleanup_handler(cma_id, false); | ||
863 | break; | 901 | break; |
864 | default: | 902 | default: |
865 | iser_err("Unexpected RDMA CM event (%d)\n", event->event); | 903 | iser_err("Unexpected RDMA CM event (%d)\n", event->event); |
@@ -981,7 +1019,6 @@ int iser_reg_page_vec(struct ib_conn *ib_conn, | |||
981 | mem_reg->rkey = mem->fmr->rkey; | 1019 | mem_reg->rkey = mem->fmr->rkey; |
982 | mem_reg->len = page_vec->length * SIZE_4K; | 1020 | mem_reg->len = page_vec->length * SIZE_4K; |
983 | mem_reg->va = io_addr; | 1021 | mem_reg->va = io_addr; |
984 | mem_reg->is_mr = 1; | ||
985 | mem_reg->mem_h = (void *)mem; | 1022 | mem_reg->mem_h = (void *)mem; |
986 | 1023 | ||
987 | mem_reg->va += page_vec->offset; | 1024 | mem_reg->va += page_vec->offset; |
@@ -1008,7 +1045,7 @@ void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, | |||
1008 | struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; | 1045 | struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; |
1009 | int ret; | 1046 | int ret; |
1010 | 1047 | ||
1011 | if (!reg->is_mr) | 1048 | if (!reg->mem_h) |
1012 | return; | 1049 | return; |
1013 | 1050 | ||
1014 | iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); | 1051 | iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); |
@@ -1028,11 +1065,10 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, | |||
1028 | struct ib_conn *ib_conn = &iser_conn->ib_conn; | 1065 | struct ib_conn *ib_conn = &iser_conn->ib_conn; |
1029 | struct fast_reg_descriptor *desc = reg->mem_h; | 1066 | struct fast_reg_descriptor *desc = reg->mem_h; |
1030 | 1067 | ||
1031 | if (!reg->is_mr) | 1068 | if (!desc) |
1032 | return; | 1069 | return; |
1033 | 1070 | ||
1034 | reg->mem_h = NULL; | 1071 | reg->mem_h = NULL; |
1035 | reg->is_mr = 0; | ||
1036 | spin_lock_bh(&ib_conn->lock); | 1072 | spin_lock_bh(&ib_conn->lock); |
1037 | list_add_tail(&desc->list, &ib_conn->fastreg.pool); | 1073 | list_add_tail(&desc->list, &ib_conn->fastreg.pool); |
1038 | spin_unlock_bh(&ib_conn->lock); | 1074 | spin_unlock_bh(&ib_conn->lock); |
@@ -1049,7 +1085,7 @@ int iser_post_recvl(struct iser_conn *iser_conn) | |||
1049 | sge.length = ISER_RX_LOGIN_SIZE; | 1085 | sge.length = ISER_RX_LOGIN_SIZE; |
1050 | sge.lkey = ib_conn->device->mr->lkey; | 1086 | sge.lkey = ib_conn->device->mr->lkey; |
1051 | 1087 | ||
1052 | rx_wr.wr_id = (unsigned long)iser_conn->login_resp_buf; | 1088 | rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf; |
1053 | rx_wr.sg_list = &sge; | 1089 | rx_wr.sg_list = &sge; |
1054 | rx_wr.num_sge = 1; | 1090 | rx_wr.num_sge = 1; |
1055 | rx_wr.next = NULL; | 1091 | rx_wr.next = NULL; |
@@ -1073,7 +1109,7 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count) | |||
1073 | 1109 | ||
1074 | for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { | 1110 | for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { |
1075 | rx_desc = &iser_conn->rx_descs[my_rx_head]; | 1111 | rx_desc = &iser_conn->rx_descs[my_rx_head]; |
1076 | rx_wr->wr_id = (unsigned long)rx_desc; | 1112 | rx_wr->wr_id = (uintptr_t)rx_desc; |
1077 | rx_wr->sg_list = &rx_desc->rx_sg; | 1113 | rx_wr->sg_list = &rx_desc->rx_sg; |
1078 | rx_wr->num_sge = 1; | 1114 | rx_wr->num_sge = 1; |
1079 | rx_wr->next = rx_wr + 1; | 1115 | rx_wr->next = rx_wr + 1; |
@@ -1110,7 +1146,7 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, | |||
1110 | DMA_TO_DEVICE); | 1146 | DMA_TO_DEVICE); |
1111 | 1147 | ||
1112 | send_wr.next = NULL; | 1148 | send_wr.next = NULL; |
1113 | send_wr.wr_id = (unsigned long)tx_desc; | 1149 | send_wr.wr_id = (uintptr_t)tx_desc; |
1114 | send_wr.sg_list = tx_desc->tx_sg; | 1150 | send_wr.sg_list = tx_desc->tx_sg; |
1115 | send_wr.num_sge = tx_desc->num_sge; | 1151 | send_wr.num_sge = tx_desc->num_sge; |
1116 | send_wr.opcode = IB_WR_SEND; | 1152 | send_wr.opcode = IB_WR_SEND; |
@@ -1160,6 +1196,7 @@ static void | |||
1160 | iser_handle_comp_error(struct ib_conn *ib_conn, | 1196 | iser_handle_comp_error(struct ib_conn *ib_conn, |
1161 | struct ib_wc *wc) | 1197 | struct ib_wc *wc) |
1162 | { | 1198 | { |
1199 | void *wr_id = (void *)(uintptr_t)wc->wr_id; | ||
1163 | struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, | 1200 | struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, |
1164 | ib_conn); | 1201 | ib_conn); |
1165 | 1202 | ||
@@ -1168,8 +1205,8 @@ iser_handle_comp_error(struct ib_conn *ib_conn, | |||
1168 | iscsi_conn_failure(iser_conn->iscsi_conn, | 1205 | iscsi_conn_failure(iser_conn->iscsi_conn, |
1169 | ISCSI_ERR_CONN_FAILED); | 1206 | ISCSI_ERR_CONN_FAILED); |
1170 | 1207 | ||
1171 | if (is_iser_tx_desc(iser_conn, (void *)wc->wr_id)) { | 1208 | if (is_iser_tx_desc(iser_conn, wr_id)) { |
1172 | struct iser_tx_desc *desc = (struct iser_tx_desc *)wc->wr_id; | 1209 | struct iser_tx_desc *desc = wr_id; |
1173 | 1210 | ||
1174 | if (desc->type == ISCSI_TX_DATAOUT) | 1211 | if (desc->type == ISCSI_TX_DATAOUT) |
1175 | kmem_cache_free(ig.desc_cache, desc); | 1212 | kmem_cache_free(ig.desc_cache, desc); |
@@ -1193,14 +1230,14 @@ static void iser_handle_wc(struct ib_wc *wc) | |||
1193 | struct iser_rx_desc *rx_desc; | 1230 | struct iser_rx_desc *rx_desc; |
1194 | 1231 | ||
1195 | ib_conn = wc->qp->qp_context; | 1232 | ib_conn = wc->qp->qp_context; |
1196 | if (wc->status == IB_WC_SUCCESS) { | 1233 | if (likely(wc->status == IB_WC_SUCCESS)) { |
1197 | if (wc->opcode == IB_WC_RECV) { | 1234 | if (wc->opcode == IB_WC_RECV) { |
1198 | rx_desc = (struct iser_rx_desc *)wc->wr_id; | 1235 | rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id; |
1199 | iser_rcv_completion(rx_desc, wc->byte_len, | 1236 | iser_rcv_completion(rx_desc, wc->byte_len, |
1200 | ib_conn); | 1237 | ib_conn); |
1201 | } else | 1238 | } else |
1202 | if (wc->opcode == IB_WC_SEND) { | 1239 | if (wc->opcode == IB_WC_SEND) { |
1203 | tx_desc = (struct iser_tx_desc *)wc->wr_id; | 1240 | tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id; |
1204 | iser_snd_completion(tx_desc, ib_conn); | 1241 | iser_snd_completion(tx_desc, ib_conn); |
1205 | } else { | 1242 | } else { |
1206 | iser_err("Unknown wc opcode %d\n", wc->opcode); | 1243 | iser_err("Unknown wc opcode %d\n", wc->opcode); |
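
Several hunks above switch the wr_id round-trip from casts through unsigned long to uintptr_t. wr_id is a u64 in the verbs work-request and completion structures, and uintptr_t is the integer type sized to hold a pointer, so the pair of casts below is the portable way to stash a descriptor pointer on post and recover it in the completion handler. A minimal sketch with a hypothetical demo_desc type:

#include <linux/types.h>

struct demo_desc {
        int type;
};

/* Post side: store the descriptor pointer in the 64-bit wr_id. */
static inline u64 demo_ptr_to_wrid(struct demo_desc *desc)
{
        return (uintptr_t)desc;
}

/* Completion side: recover the descriptor from wc->wr_id. */
static inline struct demo_desc *demo_wrid_to_ptr(u64 wr_id)
{
        return (struct demo_desc *)(uintptr_t)wr_id;
}
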
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 5461924c9f10..db3c8c851af1 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c | |||
@@ -2929,7 +2929,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) | |||
2929 | return -ENOMEM; | 2929 | return -ENOMEM; |
2930 | 2930 | ||
2931 | sep_opt = options; | 2931 | sep_opt = options; |
2932 | while ((p = strsep(&sep_opt, ",")) != NULL) { | 2932 | while ((p = strsep(&sep_opt, ",\n")) != NULL) { |
2933 | if (!*p) | 2933 | if (!*p) |
2934 | continue; | 2934 | continue; |
2935 | 2935 | ||
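
The ib_srp hunk widens the strsep() delimiter set from "," to ",\n", so the newline that a shell's echo appends when writing target parameters (for example into the add_target interface) is treated as a separator instead of sticking to the last option value. A hedged sketch of the resulting tokenization, with a hypothetical demo_parse() operating on a writable copy of the input:

#include <linux/string.h>

/* "a=1,b=2\n" yields tokens "a=1" and "b=2"; the empty token left after the
 * final '\n' is skipped, so no value carries a trailing newline.
 */
static void demo_parse(char *buf)
{
        char *p;

        while ((p = strsep(&buf, ",\n")) != NULL) {
                if (!*p)        /* empty token, e.g. after the trailing newline */
                        continue;
                /* the real parser would run match_token() on p here */
        }
}
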
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index e25436b24ce7..629f9f1435a5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c | |||
@@ -171,9 +171,9 @@ int mlx4_check_port_params(struct mlx4_dev *dev, | |||
171 | { | 171 | { |
172 | int i; | 172 | int i; |
173 | 173 | ||
174 | for (i = 0; i < dev->caps.num_ports - 1; i++) { | 174 | if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { |
175 | if (port_type[i] != port_type[i + 1]) { | 175 | for (i = 0; i < dev->caps.num_ports - 1; i++) { |
176 | if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { | 176 | if (port_type[i] != port_type[i + 1]) { |
177 | mlx4_err(dev, "Only same port types supported on this HCA, aborting\n"); | 177 | mlx4_err(dev, "Only same port types supported on this HCA, aborting\n"); |
178 | return -EINVAL; | 178 | return -EINVAL; |
179 | } | 179 | } |
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index ab684463780b..da82991239a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c | |||
@@ -157,6 +157,8 @@ static const char *eqe_type_str(u8 type) | |||
157 | return "MLX5_EVENT_TYPE_CMD"; | 157 | return "MLX5_EVENT_TYPE_CMD"; |
158 | case MLX5_EVENT_TYPE_PAGE_REQUEST: | 158 | case MLX5_EVENT_TYPE_PAGE_REQUEST: |
159 | return "MLX5_EVENT_TYPE_PAGE_REQUEST"; | 159 | return "MLX5_EVENT_TYPE_PAGE_REQUEST"; |
160 | case MLX5_EVENT_TYPE_PAGE_FAULT: | ||
161 | return "MLX5_EVENT_TYPE_PAGE_FAULT"; | ||
160 | default: | 162 | default: |
161 | return "Unrecognized event"; | 163 | return "Unrecognized event"; |
162 | } | 164 | } |
@@ -279,6 +281,11 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) | |||
279 | } | 281 | } |
280 | break; | 282 | break; |
281 | 283 | ||
284 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
285 | case MLX5_EVENT_TYPE_PAGE_FAULT: | ||
286 | mlx5_eq_pagefault(dev, eqe); | ||
287 | break; | ||
288 | #endif | ||
282 | 289 | ||
283 | default: | 290 | default: |
284 | mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", | 291 | mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", |
@@ -446,8 +453,12 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev) | |||
446 | int mlx5_start_eqs(struct mlx5_core_dev *dev) | 453 | int mlx5_start_eqs(struct mlx5_core_dev *dev) |
447 | { | 454 | { |
448 | struct mlx5_eq_table *table = &dev->priv.eq_table; | 455 | struct mlx5_eq_table *table = &dev->priv.eq_table; |
456 | u32 async_event_mask = MLX5_ASYNC_EVENT_MASK; | ||
449 | int err; | 457 | int err; |
450 | 458 | ||
459 | if (dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG) | ||
460 | async_event_mask |= (1ull << MLX5_EVENT_TYPE_PAGE_FAULT); | ||
461 | |||
451 | err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, | 462 | err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, |
452 | MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, | 463 | MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, |
453 | "mlx5_cmd_eq", &dev->priv.uuari.uars[0]); | 464 | "mlx5_cmd_eq", &dev->priv.uuari.uars[0]); |
@@ -459,7 +470,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) | |||
459 | mlx5_cmd_use_events(dev); | 470 | mlx5_cmd_use_events(dev); |
460 | 471 | ||
461 | err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC, | 472 | err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC, |
462 | MLX5_NUM_ASYNC_EQE, MLX5_ASYNC_EVENT_MASK, | 473 | MLX5_NUM_ASYNC_EQE, async_event_mask, |
463 | "mlx5_async_eq", &dev->priv.uuari.uars[0]); | 474 | "mlx5_async_eq", &dev->priv.uuari.uars[0]); |
464 | if (err) { | 475 | if (err) { |
465 | mlx5_core_warn(dev, "failed to create async EQ %d\n", err); | 476 | mlx5_core_warn(dev, "failed to create async EQ %d\n", err); |
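
With this change the async EQ subscribes to MLX5_EVENT_TYPE_PAGE_FAULT only when the HCA advertises MLX5_DEV_CAP_FLAG_ON_DMND_PG, and the EQ handler routes that event to mlx5_eq_pagefault() under CONFIG_INFINIBAND_ON_DEMAND_PAGING. The event mask is a 64-bit bitmap indexed by event type, so opting in is a single conditional bit, as in this sketch (demo_async_mask is illustrative, not a driver function):

#include <linux/mlx5/device.h>

/* Build the async EQ event mask: the base mask plus the page-fault bit when
 * the on-demand-paging capability is present. base_mask stands in for
 * MLX5_ASYNC_EVENT_MASK.
 */
static u64 demo_async_mask(u64 cap_flags, u64 base_mask)
{
        u64 mask = base_mask;

        if (cap_flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)
                mask |= 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;     /* event type 0xc */

        return mask;
}
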
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 087c4c797deb..06f9036acd83 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c | |||
@@ -69,6 +69,46 @@ int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, struct mlx5_caps *caps) | |||
69 | return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR); | 69 | return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR); |
70 | } | 70 | } |
71 | 71 | ||
72 | int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *caps) | ||
73 | { | ||
74 | u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)]; | ||
75 | int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); | ||
76 | void *out; | ||
77 | int err; | ||
78 | |||
79 | if (!(dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)) | ||
80 | return -ENOTSUPP; | ||
81 | |||
82 | memset(in, 0, sizeof(in)); | ||
83 | out = kzalloc(out_sz, GFP_KERNEL); | ||
84 | if (!out) | ||
85 | return -ENOMEM; | ||
86 | MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); | ||
87 | MLX5_SET(query_hca_cap_in, in, op_mod, HCA_CAP_OPMOD_GET_ODP_CUR); | ||
88 | err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); | ||
89 | if (err) | ||
90 | goto out; | ||
91 | |||
92 | err = mlx5_cmd_status_to_err_v2(out); | ||
93 | if (err) { | ||
94 | mlx5_core_warn(dev, "query cur hca ODP caps failed, %d\n", err); | ||
95 | goto out; | ||
96 | } | ||
97 | |||
98 | memcpy(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability_struct), | ||
99 | sizeof(*caps)); | ||
100 | |||
101 | mlx5_core_dbg(dev, "on-demand paging capabilities:\nrc: %08x\nuc: %08x\nud: %08x\n", | ||
102 | be32_to_cpu(caps->per_transport_caps.rc_odp_caps), | ||
103 | be32_to_cpu(caps->per_transport_caps.uc_odp_caps), | ||
104 | be32_to_cpu(caps->per_transport_caps.ud_odp_caps)); | ||
105 | |||
106 | out: | ||
107 | kfree(out); | ||
108 | return err; | ||
109 | } | ||
110 | EXPORT_SYMBOL(mlx5_query_odp_caps); | ||
111 | |||
72 | int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) | 112 | int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) |
73 | { | 113 | { |
74 | struct mlx5_cmd_init_hca_mbox_in in; | 114 | struct mlx5_cmd_init_hca_mbox_in in; |
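
mlx5_query_odp_caps() above issues QUERY_HCA_CAP with the ODP op_mod and copies the per-transport capability words out of the mailbox. A minimal sketch of a possible caller, relying on the MLX5_ODP_SUPPORT_* bits and struct mlx5_odp_caps from the include/linux/mlx5/device.h hunk further down; demo_ud_odp_send_supported() is illustrative only:

#include <asm/byteorder.h>
#include <linux/kernel.h>
#include <linux/mlx5/driver.h>

/* Query the current ODP caps and test whether ODP-backed UD sends are allowed. */
static bool demo_ud_odp_send_supported(struct mlx5_core_dev *dev)
{
        struct mlx5_odp_caps caps;

        if (mlx5_query_odp_caps(dev, &caps))
                return false;   /* ODP not supported or the query failed */

        return be32_to_cpu(caps.per_transport_caps.ud_odp_caps) &
               MLX5_ODP_SUPPORT_SEND;
}
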
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 5261a2b0da43..575d853dbe05 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c | |||
@@ -88,6 +88,95 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) | |||
88 | mlx5_core_put_rsc(common); | 88 | mlx5_core_put_rsc(common); |
89 | } | 89 | } |
90 | 90 | ||
91 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
92 | void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe) | ||
93 | { | ||
94 | struct mlx5_eqe_page_fault *pf_eqe = &eqe->data.page_fault; | ||
95 | int qpn = be32_to_cpu(pf_eqe->flags_qpn) & MLX5_QPN_MASK; | ||
96 | struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, qpn); | ||
97 | struct mlx5_core_qp *qp = | ||
98 | container_of(common, struct mlx5_core_qp, common); | ||
99 | struct mlx5_pagefault pfault; | ||
100 | |||
101 | if (!qp) { | ||
102 | mlx5_core_warn(dev, "ODP event for non-existent QP %06x\n", | ||
103 | qpn); | ||
104 | return; | ||
105 | } | ||
106 | |||
107 | pfault.event_subtype = eqe->sub_type; | ||
108 | pfault.flags = (be32_to_cpu(pf_eqe->flags_qpn) >> MLX5_QPN_BITS) & | ||
109 | (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE | MLX5_PFAULT_RDMA); | ||
110 | pfault.bytes_committed = be32_to_cpu( | ||
111 | pf_eqe->bytes_committed); | ||
112 | |||
113 | mlx5_core_dbg(dev, | ||
114 | "PAGE_FAULT: subtype: 0x%02x, flags: 0x%02x,\n", | ||
115 | eqe->sub_type, pfault.flags); | ||
116 | |||
117 | switch (eqe->sub_type) { | ||
118 | case MLX5_PFAULT_SUBTYPE_RDMA: | ||
119 | /* RDMA based event */ | ||
120 | pfault.rdma.r_key = | ||
121 | be32_to_cpu(pf_eqe->rdma.r_key); | ||
122 | pfault.rdma.packet_size = | ||
123 | be16_to_cpu(pf_eqe->rdma.packet_length); | ||
124 | pfault.rdma.rdma_op_len = | ||
125 | be32_to_cpu(pf_eqe->rdma.rdma_op_len); | ||
126 | pfault.rdma.rdma_va = | ||
127 | be64_to_cpu(pf_eqe->rdma.rdma_va); | ||
128 | mlx5_core_dbg(dev, | ||
129 | "PAGE_FAULT: qpn: 0x%06x, r_key: 0x%08x,\n", | ||
130 | qpn, pfault.rdma.r_key); | ||
131 | mlx5_core_dbg(dev, | ||
132 | "PAGE_FAULT: rdma_op_len: 0x%08x,\n", | ||
133 | pfault.rdma.rdma_op_len); | ||
134 | mlx5_core_dbg(dev, | ||
135 | "PAGE_FAULT: rdma_va: 0x%016llx,\n", | ||
136 | pfault.rdma.rdma_va); | ||
137 | mlx5_core_dbg(dev, | ||
138 | "PAGE_FAULT: bytes_committed: 0x%06x\n", | ||
139 | pfault.bytes_committed); | ||
140 | break; | ||
141 | |||
142 | case MLX5_PFAULT_SUBTYPE_WQE: | ||
143 | /* WQE based event */ | ||
144 | pfault.wqe.wqe_index = | ||
145 | be16_to_cpu(pf_eqe->wqe.wqe_index); | ||
146 | pfault.wqe.packet_size = | ||
147 | be16_to_cpu(pf_eqe->wqe.packet_length); | ||
148 | mlx5_core_dbg(dev, | ||
149 | "PAGE_FAULT: qpn: 0x%06x, wqe_index: 0x%04x,\n", | ||
150 | qpn, pfault.wqe.wqe_index); | ||
151 | mlx5_core_dbg(dev, | ||
152 | "PAGE_FAULT: bytes_committed: 0x%06x\n", | ||
153 | pfault.bytes_committed); | ||
154 | break; | ||
155 | |||
156 | default: | ||
157 | mlx5_core_warn(dev, | ||
158 | "Unsupported page fault event sub-type: 0x%02hhx, QP %06x\n", | ||
159 | eqe->sub_type, qpn); | ||
160 | /* Unsupported page faults should still be resolved by the | ||
161 | * page fault handler | ||
162 | */ | ||
163 | } | ||
164 | |||
165 | if (qp->pfault_handler) { | ||
166 | qp->pfault_handler(qp, &pfault); | ||
167 | } else { | ||
168 | mlx5_core_err(dev, | ||
169 | "ODP event for QP %08x, without a fault handler in QP\n", | ||
170 | qpn); | ||
171 | /* Page fault will remain unresolved. QP will hang until it is | ||
172 | * destroyed | ||
173 | */ | ||
174 | } | ||
175 | |||
176 | mlx5_core_put_rsc(common); | ||
177 | } | ||
178 | #endif | ||
179 | |||
91 | int mlx5_core_create_qp(struct mlx5_core_dev *dev, | 180 | int mlx5_core_create_qp(struct mlx5_core_dev *dev, |
92 | struct mlx5_core_qp *qp, | 181 | struct mlx5_core_qp *qp, |
93 | struct mlx5_create_qp_mbox_in *in, | 182 | struct mlx5_create_qp_mbox_in *in, |
@@ -322,3 +411,33 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn) | |||
322 | return err; | 411 | return err; |
323 | } | 412 | } |
324 | EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); | 413 | EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); |
414 | |||
415 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
416 | int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, | ||
417 | u8 flags, int error) | ||
418 | { | ||
419 | struct mlx5_page_fault_resume_mbox_in in; | ||
420 | struct mlx5_page_fault_resume_mbox_out out; | ||
421 | int err; | ||
422 | |||
423 | memset(&in, 0, sizeof(in)); | ||
424 | memset(&out, 0, sizeof(out)); | ||
425 | in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_PAGE_FAULT_RESUME); | ||
426 | in.hdr.opmod = 0; | ||
427 | flags &= (MLX5_PAGE_FAULT_RESUME_REQUESTOR | | ||
428 | MLX5_PAGE_FAULT_RESUME_WRITE | | ||
429 | MLX5_PAGE_FAULT_RESUME_RDMA); | ||
430 | flags |= (error ? MLX5_PAGE_FAULT_RESUME_ERROR : 0); | ||
431 | in.flags_qpn = cpu_to_be32((qpn & MLX5_QPN_MASK) | | ||
432 | (flags << MLX5_QPN_BITS)); | ||
433 | err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); | ||
434 | if (err) | ||
435 | return err; | ||
436 | |||
437 | if (out.hdr.status) | ||
438 | err = mlx5_cmd_status_to_err(&out.hdr); | ||
439 | |||
440 | return err; | ||
441 | } | ||
442 | EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume); | ||
443 | #endif | ||
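
mlx5_eq_pagefault() above decodes the page-fault EQE into a struct mlx5_pagefault and hands it to the QP's pfault_handler; without a registered handler the fault stays unresolved and the QP hangs, and mlx5_core_page_fault_resume() is the command a handler uses to tell the firmware to retry or fail the faulting operation. A hedged sketch of the simplest possible consumer, a handler that cannot resolve the fault and resumes with the error flag set; struct demo_qp is a hypothetical wrapper used only to reach the mlx5_core_dev:

#include <linux/kernel.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/qp.h>

struct demo_qp {
        struct mlx5_core_qp mqp;        /* embedded so container_of() works */
        struct mlx5_core_dev *dev;
};

/* Unresolvable fault: resume the QP in error so the faulting operation
 * completes with an error instead of leaving the QP stuck. The MLX5_PFAULT_*
 * flags use the same bit positions as MLX5_PAGE_FAULT_RESUME_*, so they can
 * be passed through directly.
 */
static void demo_pfault_handler(struct mlx5_core_qp *qp,
                                struct mlx5_pagefault *pfault)
{
        struct demo_qp *dqp = container_of(qp, struct demo_qp, mqp);
        int err;

        err = mlx5_core_page_fault_resume(dqp->dev, qp->qpn, pfault->flags, 1);
        if (err)
                pr_warn("demo: page fault resume on QP %d failed: %d\n",
                        qp->qpn, err);
}

A driver would set qp->pfault_handler = demo_pfault_handler before the QP can take ODP faults.
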
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ea4f1c46f761..4e5bd813bb9a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h | |||
@@ -120,6 +120,15 @@ enum { | |||
120 | }; | 120 | }; |
121 | 121 | ||
122 | enum { | 122 | enum { |
123 | MLX5_MKEY_INBOX_PG_ACCESS = 1 << 31 | ||
124 | }; | ||
125 | |||
126 | enum { | ||
127 | MLX5_PFAULT_SUBTYPE_WQE = 0, | ||
128 | MLX5_PFAULT_SUBTYPE_RDMA = 1, | ||
129 | }; | ||
130 | |||
131 | enum { | ||
123 | MLX5_PERM_LOCAL_READ = 1 << 2, | 132 | MLX5_PERM_LOCAL_READ = 1 << 2, |
124 | MLX5_PERM_LOCAL_WRITE = 1 << 3, | 133 | MLX5_PERM_LOCAL_WRITE = 1 << 3, |
125 | MLX5_PERM_REMOTE_READ = 1 << 4, | 134 | MLX5_PERM_REMOTE_READ = 1 << 4, |
@@ -180,6 +189,19 @@ enum { | |||
180 | MLX5_MKEY_MASK_FREE = 1ull << 29, | 189 | MLX5_MKEY_MASK_FREE = 1ull << 29, |
181 | }; | 190 | }; |
182 | 191 | ||
192 | enum { | ||
193 | MLX5_UMR_TRANSLATION_OFFSET_EN = (1 << 4), | ||
194 | |||
195 | MLX5_UMR_CHECK_NOT_FREE = (1 << 5), | ||
196 | MLX5_UMR_CHECK_FREE = (2 << 5), | ||
197 | |||
198 | MLX5_UMR_INLINE = (1 << 7), | ||
199 | }; | ||
200 | |||
201 | #define MLX5_UMR_MTT_ALIGNMENT 0x40 | ||
202 | #define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) | ||
203 | #define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT | ||
204 | |||
183 | enum mlx5_event { | 205 | enum mlx5_event { |
184 | MLX5_EVENT_TYPE_COMP = 0x0, | 206 | MLX5_EVENT_TYPE_COMP = 0x0, |
185 | 207 | ||
@@ -206,6 +228,8 @@ enum mlx5_event { | |||
206 | 228 | ||
207 | MLX5_EVENT_TYPE_CMD = 0x0a, | 229 | MLX5_EVENT_TYPE_CMD = 0x0a, |
208 | MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, | 230 | MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, |
231 | |||
232 | MLX5_EVENT_TYPE_PAGE_FAULT = 0xc, | ||
209 | }; | 233 | }; |
210 | 234 | ||
211 | enum { | 235 | enum { |
@@ -225,6 +249,7 @@ enum { | |||
225 | MLX5_DEV_CAP_FLAG_APM = 1LL << 17, | 249 | MLX5_DEV_CAP_FLAG_APM = 1LL << 17, |
226 | MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18, | 250 | MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18, |
227 | MLX5_DEV_CAP_FLAG_BLOCK_MCAST = 1LL << 23, | 251 | MLX5_DEV_CAP_FLAG_BLOCK_MCAST = 1LL << 23, |
252 | MLX5_DEV_CAP_FLAG_ON_DMND_PG = 1LL << 24, | ||
228 | MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, | 253 | MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, |
229 | MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, | 254 | MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, |
230 | MLX5_DEV_CAP_FLAG_DCT = 1LL << 37, | 255 | MLX5_DEV_CAP_FLAG_DCT = 1LL << 37, |
@@ -290,6 +315,8 @@ enum { | |||
290 | enum { | 315 | enum { |
291 | HCA_CAP_OPMOD_GET_MAX = 0, | 316 | HCA_CAP_OPMOD_GET_MAX = 0, |
292 | HCA_CAP_OPMOD_GET_CUR = 1, | 317 | HCA_CAP_OPMOD_GET_CUR = 1, |
318 | HCA_CAP_OPMOD_GET_ODP_MAX = 4, | ||
319 | HCA_CAP_OPMOD_GET_ODP_CUR = 5 | ||
293 | }; | 320 | }; |
294 | 321 | ||
295 | struct mlx5_inbox_hdr { | 322 | struct mlx5_inbox_hdr { |
@@ -319,6 +346,23 @@ struct mlx5_cmd_query_adapter_mbox_out { | |||
319 | u8 vsd_psid[16]; | 346 | u8 vsd_psid[16]; |
320 | }; | 347 | }; |
321 | 348 | ||
349 | enum mlx5_odp_transport_cap_bits { | ||
350 | MLX5_ODP_SUPPORT_SEND = 1 << 31, | ||
351 | MLX5_ODP_SUPPORT_RECV = 1 << 30, | ||
352 | MLX5_ODP_SUPPORT_WRITE = 1 << 29, | ||
353 | MLX5_ODP_SUPPORT_READ = 1 << 28, | ||
354 | }; | ||
355 | |||
356 | struct mlx5_odp_caps { | ||
357 | char reserved[0x10]; | ||
358 | struct { | ||
359 | __be32 rc_odp_caps; | ||
360 | __be32 uc_odp_caps; | ||
361 | __be32 ud_odp_caps; | ||
362 | } per_transport_caps; | ||
363 | char reserved2[0xe4]; | ||
364 | }; | ||
365 | |||
322 | struct mlx5_cmd_init_hca_mbox_in { | 366 | struct mlx5_cmd_init_hca_mbox_in { |
323 | struct mlx5_inbox_hdr hdr; | 367 | struct mlx5_inbox_hdr hdr; |
324 | u8 rsvd0[2]; | 368 | u8 rsvd0[2]; |
@@ -439,6 +483,27 @@ struct mlx5_eqe_page_req { | |||
439 | __be32 rsvd1[5]; | 483 | __be32 rsvd1[5]; |
440 | }; | 484 | }; |
441 | 485 | ||
486 | struct mlx5_eqe_page_fault { | ||
487 | __be32 bytes_committed; | ||
488 | union { | ||
489 | struct { | ||
490 | u16 reserved1; | ||
491 | __be16 wqe_index; | ||
492 | u16 reserved2; | ||
493 | __be16 packet_length; | ||
494 | u8 reserved3[12]; | ||
495 | } __packed wqe; | ||
496 | struct { | ||
497 | __be32 r_key; | ||
498 | u16 reserved1; | ||
499 | __be16 packet_length; | ||
500 | __be32 rdma_op_len; | ||
501 | __be64 rdma_va; | ||
502 | } __packed rdma; | ||
503 | } __packed; | ||
504 | __be32 flags_qpn; | ||
505 | } __packed; | ||
506 | |||
442 | union ev_data { | 507 | union ev_data { |
443 | __be32 raw[7]; | 508 | __be32 raw[7]; |
444 | struct mlx5_eqe_cmd cmd; | 509 | struct mlx5_eqe_cmd cmd; |
@@ -450,6 +515,7 @@ union ev_data { | |||
450 | struct mlx5_eqe_congestion cong; | 515 | struct mlx5_eqe_congestion cong; |
451 | struct mlx5_eqe_stall_vl stall_vl; | 516 | struct mlx5_eqe_stall_vl stall_vl; |
452 | struct mlx5_eqe_page_req req_pages; | 517 | struct mlx5_eqe_page_req req_pages; |
518 | struct mlx5_eqe_page_fault page_fault; | ||
453 | } __packed; | 519 | } __packed; |
454 | 520 | ||
455 | struct mlx5_eqe { | 521 | struct mlx5_eqe { |
@@ -776,6 +842,10 @@ struct mlx5_query_eq_mbox_out { | |||
776 | struct mlx5_eq_context ctx; | 842 | struct mlx5_eq_context ctx; |
777 | }; | 843 | }; |
778 | 844 | ||
845 | enum { | ||
846 | MLX5_MKEY_STATUS_FREE = 1 << 6, | ||
847 | }; | ||
848 | |||
779 | struct mlx5_mkey_seg { | 849 | struct mlx5_mkey_seg { |
780 | /* This is a two bit field occupying bits 31-30. | 850 | /* This is a two bit field occupying bits 31-30. |
781 | * bit 31 is always 0, | 851 | * bit 31 is always 0, |
@@ -812,7 +882,7 @@ struct mlx5_query_special_ctxs_mbox_out { | |||
812 | struct mlx5_create_mkey_mbox_in { | 882 | struct mlx5_create_mkey_mbox_in { |
813 | struct mlx5_inbox_hdr hdr; | 883 | struct mlx5_inbox_hdr hdr; |
814 | __be32 input_mkey_index; | 884 | __be32 input_mkey_index; |
815 | u8 rsvd0[4]; | 885 | __be32 flags; |
816 | struct mlx5_mkey_seg seg; | 886 | struct mlx5_mkey_seg seg; |
817 | u8 rsvd1[16]; | 887 | u8 rsvd1[16]; |
818 | __be32 xlat_oct_act_size; | 888 | __be32 xlat_oct_act_size; |
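
The new struct mlx5_eqe_page_fault above carries a flags_qpn word that packs the 24-bit QP number in its low bits and the fault flags above them; the MLX5_QPN_BITS/MLX5_QPN_MASK macros added by the mlx5/qp.h hunk below are what mlx5_eq_pagefault() uses to take it apart. A small sketch of that unpacking (demo_unpack_flags_qpn is illustrative only):

#include <asm/byteorder.h>
#include <linux/mlx5/qp.h>

/* Split the big-endian flags_qpn word into the 24-bit QPN and the
 * requestor/write/RDMA flag bits above it, as the EQ handler does.
 */
static void demo_unpack_flags_qpn(__be32 flags_qpn, u32 *qpn, u8 *flags)
{
        u32 v = be32_to_cpu(flags_qpn);

        *qpn   = v & MLX5_QPN_MASK;             /* low 24 bits */
        *flags = (v >> MLX5_QPN_BITS) &
                 (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE | MLX5_PFAULT_RDMA);
}
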
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b1bf41556b32..166d9315fe4b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h | |||
@@ -113,6 +113,13 @@ enum { | |||
113 | MLX5_REG_HOST_ENDIANNESS = 0x7004, | 113 | MLX5_REG_HOST_ENDIANNESS = 0x7004, |
114 | }; | 114 | }; |
115 | 115 | ||
116 | enum mlx5_page_fault_resume_flags { | ||
117 | MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0, | ||
118 | MLX5_PAGE_FAULT_RESUME_WRITE = 1 << 1, | ||
119 | MLX5_PAGE_FAULT_RESUME_RDMA = 1 << 2, | ||
120 | MLX5_PAGE_FAULT_RESUME_ERROR = 1 << 7, | ||
121 | }; | ||
122 | |||
116 | enum dbg_rsc_type { | 123 | enum dbg_rsc_type { |
117 | MLX5_DBG_RSC_QP, | 124 | MLX5_DBG_RSC_QP, |
118 | MLX5_DBG_RSC_EQ, | 125 | MLX5_DBG_RSC_EQ, |
@@ -467,7 +474,7 @@ struct mlx5_priv { | |||
467 | struct workqueue_struct *pg_wq; | 474 | struct workqueue_struct *pg_wq; |
468 | struct rb_root page_root; | 475 | struct rb_root page_root; |
469 | int fw_pages; | 476 | int fw_pages; |
470 | int reg_pages; | 477 | atomic_t reg_pages; |
471 | struct list_head free_list; | 478 | struct list_head free_list; |
472 | 479 | ||
473 | struct mlx5_core_health health; | 480 | struct mlx5_core_health health; |
@@ -703,6 +710,9 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev); | |||
703 | void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); | 710 | void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); |
704 | void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); | 711 | void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); |
705 | void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); | 712 | void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); |
713 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
714 | void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe); | ||
715 | #endif | ||
706 | void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); | 716 | void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); |
707 | struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); | 717 | struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); |
708 | void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector); | 718 | void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector); |
@@ -740,6 +750,8 @@ int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, | |||
740 | int npsvs, u32 *sig_index); | 750 | int npsvs, u32 *sig_index); |
741 | int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); | 751 | int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); |
742 | void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); | 752 | void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); |
753 | int mlx5_query_odp_caps(struct mlx5_core_dev *dev, | ||
754 | struct mlx5_odp_caps *odp_caps); | ||
743 | 755 | ||
744 | static inline u32 mlx5_mkey_to_idx(u32 mkey) | 756 | static inline u32 mlx5_mkey_to_idx(u32 mkey) |
745 | { | 757 | { |
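
Two driver.h changes above matter for ODP: reg_pages becomes an atomic_t, so paths that do not serialize on a common lock (such as ODP page-fault handling and MR teardown) can update the registered-pages accounting safely, and the MLX5_PAGE_FAULT_RESUME_* flags define the interface of mlx5_core_page_fault_resume(). A hedged sketch of what the atomic counter permits (demo_account_pages is illustrative):

#include <linux/atomic.h>
#include <linux/mlx5/driver.h>

/* Concurrent paths can adjust the registered-pages count without a shared
 * lock; a reader, e.g. for reporting, would use atomic_read(&priv->reg_pages).
 */
static void demo_account_pages(struct mlx5_priv *priv, int npages, bool add)
{
        if (add)
                atomic_add(npages, &priv->reg_pages);
        else
                atomic_sub(npages, &priv->reg_pages);
}
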
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 3fa075daeb1d..61f7a342d1bf 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h | |||
@@ -50,6 +50,9 @@ | |||
50 | #define MLX5_BSF_APPTAG_ESCAPE 0x1 | 50 | #define MLX5_BSF_APPTAG_ESCAPE 0x1 |
51 | #define MLX5_BSF_APPREF_ESCAPE 0x2 | 51 | #define MLX5_BSF_APPREF_ESCAPE 0x2 |
52 | 52 | ||
53 | #define MLX5_QPN_BITS 24 | ||
54 | #define MLX5_QPN_MASK ((1 << MLX5_QPN_BITS) - 1) | ||
55 | |||
53 | enum mlx5_qp_optpar { | 56 | enum mlx5_qp_optpar { |
54 | MLX5_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, | 57 | MLX5_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, |
55 | MLX5_QP_OPTPAR_RRE = 1 << 1, | 58 | MLX5_QP_OPTPAR_RRE = 1 << 1, |
@@ -189,6 +192,14 @@ struct mlx5_wqe_ctrl_seg { | |||
189 | __be32 imm; | 192 | __be32 imm; |
190 | }; | 193 | }; |
191 | 194 | ||
195 | #define MLX5_WQE_CTRL_DS_MASK 0x3f | ||
196 | #define MLX5_WQE_CTRL_QPN_MASK 0xffffff00 | ||
197 | #define MLX5_WQE_CTRL_QPN_SHIFT 8 | ||
198 | #define MLX5_WQE_DS_UNITS 16 | ||
199 | #define MLX5_WQE_CTRL_OPCODE_MASK 0xff | ||
200 | #define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00 | ||
201 | #define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8 | ||
202 | |||
192 | struct mlx5_wqe_xrc_seg { | 203 | struct mlx5_wqe_xrc_seg { |
193 | __be32 xrc_srqn; | 204 | __be32 xrc_srqn; |
194 | u8 rsvd[12]; | 205 | u8 rsvd[12]; |
@@ -292,6 +303,8 @@ struct mlx5_wqe_signature_seg { | |||
292 | u8 rsvd1[11]; | 303 | u8 rsvd1[11]; |
293 | }; | 304 | }; |
294 | 305 | ||
306 | #define MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK 0x3ff | ||
307 | |||
295 | struct mlx5_wqe_inline_seg { | 308 | struct mlx5_wqe_inline_seg { |
296 | __be32 byte_count; | 309 | __be32 byte_count; |
297 | }; | 310 | }; |
@@ -360,9 +373,46 @@ struct mlx5_stride_block_ctrl_seg { | |||
360 | __be16 num_entries; | 373 | __be16 num_entries; |
361 | }; | 374 | }; |
362 | 375 | ||
376 | enum mlx5_pagefault_flags { | ||
377 | MLX5_PFAULT_REQUESTOR = 1 << 0, | ||
378 | MLX5_PFAULT_WRITE = 1 << 1, | ||
379 | MLX5_PFAULT_RDMA = 1 << 2, | ||
380 | }; | ||
381 | |||
382 | /* Contains the details of a pagefault. */ | ||
383 | struct mlx5_pagefault { | ||
384 | u32 bytes_committed; | ||
385 | u8 event_subtype; | ||
386 | enum mlx5_pagefault_flags flags; | ||
387 | union { | ||
388 | /* Initiator or send message responder pagefault details. */ | ||
389 | struct { | ||
390 | /* Received packet size, only valid for responders. */ | ||
391 | u32 packet_size; | ||
392 | /* | ||
393 | * WQE index. Refers to either the send queue or | ||
394 | * receive queue, according to event_subtype. | ||
395 | */ | ||
396 | u16 wqe_index; | ||
397 | } wqe; | ||
398 | /* RDMA responder pagefault details */ | ||
399 | struct { | ||
400 | u32 r_key; | ||
401 | /* | ||
402 | * Received packet size, minimal size page fault | ||
403 | * resolution required for forward progress. | ||
404 | */ | ||
405 | u32 packet_size; | ||
406 | u32 rdma_op_len; | ||
407 | u64 rdma_va; | ||
408 | } rdma; | ||
409 | }; | ||
410 | }; | ||
411 | |||
363 | struct mlx5_core_qp { | 412 | struct mlx5_core_qp { |
364 | struct mlx5_core_rsc_common common; /* must be first */ | 413 | struct mlx5_core_rsc_common common; /* must be first */ |
365 | void (*event) (struct mlx5_core_qp *, int); | 414 | void (*event) (struct mlx5_core_qp *, int); |
415 | void (*pfault_handler)(struct mlx5_core_qp *, struct mlx5_pagefault *); | ||
366 | int qpn; | 416 | int qpn; |
367 | struct mlx5_rsc_debug *dbg; | 417 | struct mlx5_rsc_debug *dbg; |
368 | int pid; | 418 | int pid; |
@@ -530,6 +580,17 @@ static inline struct mlx5_core_mr *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u | |||
530 | return radix_tree_lookup(&dev->priv.mr_table.tree, key); | 580 | return radix_tree_lookup(&dev->priv.mr_table.tree, key); |
531 | } | 581 | } |
532 | 582 | ||
583 | struct mlx5_page_fault_resume_mbox_in { | ||
584 | struct mlx5_inbox_hdr hdr; | ||
585 | __be32 flags_qpn; | ||
586 | u8 reserved[4]; | ||
587 | }; | ||
588 | |||
589 | struct mlx5_page_fault_resume_mbox_out { | ||
590 | struct mlx5_outbox_hdr hdr; | ||
591 | u8 rsvd[8]; | ||
592 | }; | ||
593 | |||
533 | int mlx5_core_create_qp(struct mlx5_core_dev *dev, | 594 | int mlx5_core_create_qp(struct mlx5_core_dev *dev, |
534 | struct mlx5_core_qp *qp, | 595 | struct mlx5_core_qp *qp, |
535 | struct mlx5_create_qp_mbox_in *in, | 596 | struct mlx5_create_qp_mbox_in *in, |
@@ -549,6 +610,10 @@ void mlx5_init_qp_table(struct mlx5_core_dev *dev); | |||
549 | void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev); | 610 | void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev); |
550 | int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); | 611 | int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); |
551 | void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); | 612 | void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); |
613 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
614 | int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, | ||
615 | u8 context, int error); | ||
616 | #endif | ||
552 | 617 | ||
553 | static inline const char *mlx5_qp_type_str(int type) | 618 | static inline const char *mlx5_qp_type_str(int type) |
554 | { | 619 | { |
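
The qp.h hunk adds struct mlx5_pagefault plus a set of WQE control-segment masks; a WQE-based fault handler needs those masks to walk the send queue and locate the WQE named by wqe.wqe_index. A hedged sketch of the decode step, assuming the leading dwords of struct mlx5_wqe_ctrl_seg are opmod_idx_opcode and qpn_ds as in include/linux/mlx5/qp.h (demo_decode_ctrl is illustrative only):

#include <asm/byteorder.h>
#include <linux/mlx5/qp.h>

/* Pull the WQE index, opcode and total WQE size (in bytes) out of a
 * send-queue control segment using the masks defined above.
 */
static void demo_decode_ctrl(struct mlx5_wqe_ctrl_seg *ctrl,
                             u16 *wqe_index, u8 *opcode, int *wqe_bytes)
{
        u32 idx_opcode = be32_to_cpu(ctrl->opmod_idx_opcode);
        u32 qpn_ds     = be32_to_cpu(ctrl->qpn_ds);
        int ds;         /* WQE size in 16-byte units */

        *wqe_index = (idx_opcode & MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
                     MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
        *opcode    = idx_opcode & MLX5_WQE_CTRL_OPCODE_MASK;
        ds         = qpn_ds & MLX5_WQE_CTRL_DS_MASK;
        *wqe_bytes = ds * MLX5_WQE_DS_UNITS;
}
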
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index a2bf41e0bde9..2d83cfd7e6ce 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h | |||
@@ -38,11 +38,12 @@ | |||
38 | #include <linux/workqueue.h> | 38 | #include <linux/workqueue.h> |
39 | 39 | ||
40 | struct ib_ucontext; | 40 | struct ib_ucontext; |
41 | struct ib_umem_odp; | ||
41 | 42 | ||
42 | struct ib_umem { | 43 | struct ib_umem { |
43 | struct ib_ucontext *context; | 44 | struct ib_ucontext *context; |
44 | size_t length; | 45 | size_t length; |
45 | int offset; | 46 | unsigned long address; |
46 | int page_size; | 47 | int page_size; |
47 | int writable; | 48 | int writable; |
48 | int hugetlb; | 49 | int hugetlb; |
@@ -50,17 +51,43 @@ struct ib_umem { | |||
50 | struct pid *pid; | 51 | struct pid *pid; |
51 | struct mm_struct *mm; | 52 | struct mm_struct *mm; |
52 | unsigned long diff; | 53 | unsigned long diff; |
54 | struct ib_umem_odp *odp_data; | ||
53 | struct sg_table sg_head; | 55 | struct sg_table sg_head; |
54 | int nmap; | 56 | int nmap; |
55 | int npages; | 57 | int npages; |
56 | }; | 58 | }; |
57 | 59 | ||
60 | /* Returns the offset of the umem start relative to the first page. */ | ||
61 | static inline int ib_umem_offset(struct ib_umem *umem) | ||
62 | { | ||
63 | return umem->address & ((unsigned long)umem->page_size - 1); | ||
64 | } | ||
65 | |||
66 | /* Returns the first page of an ODP umem. */ | ||
67 | static inline unsigned long ib_umem_start(struct ib_umem *umem) | ||
68 | { | ||
69 | return umem->address - ib_umem_offset(umem); | ||
70 | } | ||
71 | |||
72 | /* Returns the address of the page after the last one of an ODP umem. */ | ||
73 | static inline unsigned long ib_umem_end(struct ib_umem *umem) | ||
74 | { | ||
75 | return PAGE_ALIGN(umem->address + umem->length); | ||
76 | } | ||
77 | |||
78 | static inline size_t ib_umem_num_pages(struct ib_umem *umem) | ||
79 | { | ||
80 | return (ib_umem_end(umem) - ib_umem_start(umem)) >> PAGE_SHIFT; | ||
81 | } | ||
82 | |||
58 | #ifdef CONFIG_INFINIBAND_USER_MEM | 83 | #ifdef CONFIG_INFINIBAND_USER_MEM |
59 | 84 | ||
60 | struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, | 85 | struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, |
61 | size_t size, int access, int dmasync); | 86 | size_t size, int access, int dmasync); |
62 | void ib_umem_release(struct ib_umem *umem); | 87 | void ib_umem_release(struct ib_umem *umem); |
63 | int ib_umem_page_count(struct ib_umem *umem); | 88 | int ib_umem_page_count(struct ib_umem *umem); |
89 | int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, | ||
90 | size_t length); | ||
64 | 91 | ||
65 | #else /* CONFIG_INFINIBAND_USER_MEM */ | 92 | #else /* CONFIG_INFINIBAND_USER_MEM */ |
66 | 93 | ||
@@ -73,7 +100,10 @@ static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, | |||
73 | } | 100 | } |
74 | static inline void ib_umem_release(struct ib_umem *umem) { } | 101 | static inline void ib_umem_release(struct ib_umem *umem) { } |
75 | static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } | 102 | static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } |
76 | 103 | static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, | |
104 | size_t length) { | ||
105 | return -EINVAL; | ||
106 | } | ||
77 | #endif /* CONFIG_INFINIBAND_USER_MEM */ | 107 | #endif /* CONFIG_INFINIBAND_USER_MEM */ |
78 | 108 | ||
79 | #endif /* IB_UMEM_H */ | 109 | #endif /* IB_UMEM_H */ |
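
ib_umem now records the raw userspace address instead of a pre-computed page offset, and the new inline helpers derive the page-aligned geometry from it, which is what the ODP code needs to size its page_list and dma_list arrays. A short illustrative user plus a worked example, assuming 4 KiB pages (demo_umem_geometry is not part of the API):

#include <linux/kernel.h>
#include <rdma/ib_umem.h>

/* For address = 0x10234 and length = 0x3000 with 4 KiB pages:
 *   ib_umem_offset()    -> 0x234
 *   ib_umem_start()     -> 0x10000
 *   ib_umem_end()       -> PAGE_ALIGN(0x13234) = 0x14000
 *   ib_umem_num_pages() -> (0x14000 - 0x10000) >> 12 = 4
 */
static void demo_umem_geometry(struct ib_umem *umem)
{
        pr_info("umem %p: offset %d start 0x%lx end 0x%lx pages %zu\n",
                umem, ib_umem_offset(umem), ib_umem_start(umem),
                ib_umem_end(umem), ib_umem_num_pages(umem));
}
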
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h new file mode 100644 index 000000000000..3da0b167041b --- /dev/null +++ b/include/rdma/ib_umem_odp.h | |||
@@ -0,0 +1,160 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2014 Mellanox Technologies. All rights reserved. | ||
3 | * | ||
4 | * This software is available to you under a choice of one of two | ||
5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
6 | * General Public License (GPL) Version 2, available from the file | ||
7 | * COPYING in the main directory of this source tree, or the | ||
8 | * OpenIB.org BSD license below: | ||
9 | * | ||
10 | * Redistribution and use in source and binary forms, with or | ||
11 | * without modification, are permitted provided that the following | ||
12 | * conditions are met: | ||
13 | * | ||
14 | * - Redistributions of source code must retain the above | ||
15 | * copyright notice, this list of conditions and the following | ||
16 | * disclaimer. | ||
17 | * | ||
18 | * - Redistributions in binary form must reproduce the above | ||
19 | * copyright notice, this list of conditions and the following | ||
20 | * disclaimer in the documentation and/or other materials | ||
21 | * provided with the distribution. | ||
22 | * | ||
23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
30 | * SOFTWARE. | ||
31 | */ | ||
32 | |||
33 | #ifndef IB_UMEM_ODP_H | ||
34 | #define IB_UMEM_ODP_H | ||
35 | |||
36 | #include <rdma/ib_umem.h> | ||
37 | #include <rdma/ib_verbs.h> | ||
38 | #include <linux/interval_tree.h> | ||
39 | |||
40 | struct umem_odp_node { | ||
41 | u64 __subtree_last; | ||
42 | struct rb_node rb; | ||
43 | }; | ||
44 | |||
45 | struct ib_umem_odp { | ||
46 | /* | ||
47 | * An array of the pages included in the on-demand paging umem. | ||
48 | * Indices of pages that are currently not mapped into the device will | ||
49 | * contain NULL. | ||
50 | */ | ||
51 | struct page **page_list; | ||
52 | /* | ||
53 | * An array of the same size as page_list, with DMA addresses mapped | ||
54 | * for the pages in page_list. The lower two bits designate | ||
55 | * access permissions. See ODP_READ_ALLOWED_BIT and | ||
56 | * ODP_WRITE_ALLOWED_BIT. | ||
57 | */ | ||
58 | dma_addr_t *dma_list; | ||
59 | /* | ||
60 | * The umem_mutex protects the page_list and dma_list fields of an ODP | ||
61 | * umem, allowing only a single thread to map/unmap pages. The mutex | ||
62 | * also protects access to the mmu notifier counters. | ||
63 | */ | ||
64 | struct mutex umem_mutex; | ||
65 | void *private; /* for the HW driver to use. */ | ||
66 | |||
67 | /* When false, use the notifier counter in the ucontext struct. */ | ||
68 | bool mn_counters_active; | ||
69 | int notifiers_seq; | ||
70 | int notifiers_count; | ||
71 | |||
72 | /* A linked list of umems that don't have private mmu notifier | ||
73 | * counters yet. */ | ||
74 | struct list_head no_private_counters; | ||
75 | struct ib_umem *umem; | ||
76 | |||
77 | /* Tree tracking */ | ||
78 | struct umem_odp_node interval_tree; | ||
79 | |||
80 | struct completion notifier_completion; | ||
81 | int dying; | ||
82 | }; | ||
83 | |||
84 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
85 | |||
86 | int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem); | ||
87 | |||
88 | void ib_umem_odp_release(struct ib_umem *umem); | ||
89 | |||
90 | /* | ||
91 | * The lower 2 bits of the DMA address signal the R/W permissions for | ||
92 | * the entry. To upgrade the permissions, provide the appropriate | ||
93 | * bitmask to the map_dma_pages function. | ||
94 | * | ||
95 | * Be aware that upgrading a mapped address might result in a change of | ||
96 | * the DMA address for the page. | ||
97 | */ | ||
98 | #define ODP_READ_ALLOWED_BIT (1<<0ULL) | ||
99 | #define ODP_WRITE_ALLOWED_BIT (1<<1ULL) | ||
100 | |||
101 | #define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) | ||
102 | |||
103 | int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, | ||
104 | u64 access_mask, unsigned long current_seq); | ||
105 | |||
106 | void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, | ||
107 | u64 bound); | ||
108 | |||
109 | void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root); | ||
110 | void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root); | ||
111 | typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, | ||
112 | void *cookie); | ||
113 | /* | ||
114 | * Call the callback on each ib_umem in the range. Returns the logical or of | ||
115 | * the return values of the functions called. | ||
116 | */ | ||
117 | int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end, | ||
118 | umem_call_back cb, void *cookie); | ||
119 | |||
120 | struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root, | ||
121 | u64 start, u64 last); | ||
122 | struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node, | ||
123 | u64 start, u64 last); | ||
124 | |||
125 | static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, | ||
126 | unsigned long mmu_seq) | ||
127 | { | ||
128 | /* | ||
129 | * This code is strongly based on the KVM code from | ||
130 | * mmu_notifier_retry. Should be called with | ||
131 | * the relevant locks taken (item->odp_data->umem_mutex | ||
132 | * and the ucontext umem_rwsem locked for read). | ||
133 | */ | ||
134 | |||
135 | /* Do not allow page faults while the new ib_umem hasn't seen a state | ||
136 | * with zero notifiers yet, and doesn't have its own valid set of | ||
137 | * private counters. */ | ||
138 | if (!item->odp_data->mn_counters_active) | ||
139 | return 1; | ||
140 | |||
141 | if (unlikely(item->odp_data->notifiers_count)) | ||
142 | return 1; | ||
143 | if (item->odp_data->notifiers_seq != mmu_seq) | ||
144 | return 1; | ||
145 | return 0; | ||
146 | } | ||
147 | |||
148 | #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ | ||
149 | |||
150 | static inline int ib_umem_odp_get(struct ib_ucontext *context, | ||
151 | struct ib_umem *umem) | ||
152 | { | ||
153 | return -EINVAL; | ||
154 | } | ||
155 | |||
156 | static inline void ib_umem_odp_release(struct ib_umem *umem) {} | ||
157 | |||
158 | #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ | ||
159 | |||
160 | #endif /* IB_UMEM_ODP_H */ | ||
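Taken together, the map/unmap entry points and ib_umem_mmu_notifier_retry() are meant to be used in the same sequence-number pattern as KVM: sample the notifier counter, fault the pages in, then re-check under the umem mutex before touching device page tables. Below is a hedged sketch of such a driver fault path; the function name, the -EAGAIN convention, the assumption that ib_umem_odp_map_dma_pages() returns the number of pages mapped, and the elided ucontext umem_rwsem handling are all the editor's illustration, not part of the patch.

#include <linux/mutex.h>
#include <rdma/ib_umem_odp.h>

static int example_odp_fault(struct ib_umem *umem, u64 io_virt, u64 bcnt)
{
	u64 access = ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT;
	unsigned long current_seq;
	dma_addr_t ent, dma;
	bool writable;
	int npages;

	/* Sample the notifier sequence before faulting the pages in. */
	current_seq = ACCESS_ONCE(umem->odp_data->notifiers_seq);

	/* Assumed to return the number of pages mapped, or a negative errno. */
	npages = ib_umem_odp_map_dma_pages(umem, io_virt, bcnt,
					   access, current_seq);
	if (npages < 0)
		return npages;

	mutex_lock(&umem->odp_data->umem_mutex);
	if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
		/* An invalidation ran in parallel; let the fault be retried. */
		mutex_unlock(&umem->odp_data->umem_mutex);
		return -EAGAIN;
	}

	/*
	 * dma_list entries carry the DMA address in the upper bits and the
	 * R/W permissions in the lowest two; mask before programming the
	 * device (index computation relative to the umem start is elided).
	 */
	ent = umem->odp_data->dma_list[0];
	dma = ent & ODP_DMA_ADDR_MASK;
	writable = ent & ODP_WRITE_ALLOWED_BIT;
	/* ... update the HW page table with dma/writable here ... */
	(void)dma;
	(void)writable;

	mutex_unlock(&umem->odp_data->umem_mutex);
	return npages;
}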
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 470a011d6fa4..0d74f1de99aa 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <uapi/linux/if_ether.h> | 51 | #include <uapi/linux/if_ether.h> |
52 | 52 | ||
53 | #include <linux/atomic.h> | 53 | #include <linux/atomic.h> |
54 | #include <linux/mmu_notifier.h> | ||
54 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
55 | 56 | ||
56 | extern struct workqueue_struct *ib_wq; | 57 | extern struct workqueue_struct *ib_wq; |
@@ -123,7 +124,8 @@ enum ib_device_cap_flags { | |||
123 | IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23), | 124 | IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23), |
124 | IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24), | 125 | IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24), |
125 | IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), | 126 | IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), |
126 | IB_DEVICE_SIGNATURE_HANDOVER = (1<<30) | 127 | IB_DEVICE_SIGNATURE_HANDOVER = (1<<30), |
128 | IB_DEVICE_ON_DEMAND_PAGING = (1<<31), | ||
127 | }; | 129 | }; |
128 | 130 | ||
129 | enum ib_signature_prot_cap { | 131 | enum ib_signature_prot_cap { |
@@ -143,6 +145,27 @@ enum ib_atomic_cap { | |||
143 | IB_ATOMIC_GLOB | 145 | IB_ATOMIC_GLOB |
144 | }; | 146 | }; |
145 | 147 | ||
148 | enum ib_odp_general_cap_bits { | ||
149 | IB_ODP_SUPPORT = 1 << 0, | ||
150 | }; | ||
151 | |||
152 | enum ib_odp_transport_cap_bits { | ||
153 | IB_ODP_SUPPORT_SEND = 1 << 0, | ||
154 | IB_ODP_SUPPORT_RECV = 1 << 1, | ||
155 | IB_ODP_SUPPORT_WRITE = 1 << 2, | ||
156 | IB_ODP_SUPPORT_READ = 1 << 3, | ||
157 | IB_ODP_SUPPORT_ATOMIC = 1 << 4, | ||
158 | }; | ||
159 | |||
160 | struct ib_odp_caps { | ||
161 | uint64_t general_caps; | ||
162 | struct { | ||
163 | uint32_t rc_odp_caps; | ||
164 | uint32_t uc_odp_caps; | ||
165 | uint32_t ud_odp_caps; | ||
166 | } per_transport_caps; | ||
167 | }; | ||
168 | |||
146 | struct ib_device_attr { | 169 | struct ib_device_attr { |
147 | u64 fw_ver; | 170 | u64 fw_ver; |
148 | __be64 sys_image_guid; | 171 | __be64 sys_image_guid; |
@@ -186,6 +209,7 @@ struct ib_device_attr { | |||
186 | u8 local_ca_ack_delay; | 209 | u8 local_ca_ack_delay; |
187 | int sig_prot_cap; | 210 | int sig_prot_cap; |
188 | int sig_guard_cap; | 211 | int sig_guard_cap; |
212 | struct ib_odp_caps odp_caps; | ||
189 | }; | 213 | }; |
190 | 214 | ||
191 | enum ib_mtu { | 215 | enum ib_mtu { |
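A provider that implements ODP is expected to advertise it both through the IB_DEVICE_ON_DEMAND_PAGING capability bit and through the new odp_caps field of struct ib_device_attr. A hedged sketch of what that could look like in a driver's query_device path; the helper name and the particular mix of capabilities are illustrative only.

#include <rdma/ib_verbs.h>

/* Illustrative only: how a provider might advertise ODP support. */
static void example_fill_odp_caps(struct ib_device_attr *attr)
{
	attr->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;

	attr->odp_caps.general_caps = IB_ODP_SUPPORT;
	attr->odp_caps.per_transport_caps.rc_odp_caps =
		IB_ODP_SUPPORT_SEND | IB_ODP_SUPPORT_RECV |
		IB_ODP_SUPPORT_WRITE | IB_ODP_SUPPORT_READ;
	attr->odp_caps.per_transport_caps.ud_odp_caps = IB_ODP_SUPPORT_SEND;
	attr->odp_caps.per_transport_caps.uc_odp_caps = 0;
}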
@@ -1073,7 +1097,8 @@ enum ib_access_flags { | |||
1073 | IB_ACCESS_REMOTE_READ = (1<<2), | 1097 | IB_ACCESS_REMOTE_READ = (1<<2), |
1074 | IB_ACCESS_REMOTE_ATOMIC = (1<<3), | 1098 | IB_ACCESS_REMOTE_ATOMIC = (1<<3), |
1075 | IB_ACCESS_MW_BIND = (1<<4), | 1099 | IB_ACCESS_MW_BIND = (1<<4), |
1076 | IB_ZERO_BASED = (1<<5) | 1100 | IB_ZERO_BASED = (1<<5), |
1101 | IB_ACCESS_ON_DEMAND = (1<<6), | ||
1077 | }; | 1102 | }; |
1078 | 1103 | ||
1079 | struct ib_phys_buf { | 1104 | struct ib_phys_buf { |
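On the userspace side, requesting an unpinned registration comes down to passing the ODP access flag to ibv_reg_mr(), which the uverbs layer translates into IB_ACCESS_ON_DEMAND. A hedged sketch, assuming a libibverbs build that exposes the matching IBV_ACCESS_ON_DEMAND flag:

#include <stdio.h>
#include <infiniband/verbs.h>

/* Illustrative: register a region without pinning it, if the device and
 * library support ODP. */
static struct ibv_mr *example_reg_odp_mr(struct ibv_pd *pd,
					 void *addr, size_t len)
{
	struct ibv_mr *mr;

	mr = ibv_reg_mr(pd, addr, len,
			IBV_ACCESS_LOCAL_WRITE |
			IBV_ACCESS_REMOTE_WRITE |
			IBV_ACCESS_ON_DEMAND);
	if (!mr)
		perror("ibv_reg_mr(ODP)");
	return mr;
}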
@@ -1115,6 +1140,8 @@ struct ib_fmr_attr { | |||
1115 | u8 page_shift; | 1140 | u8 page_shift; |
1116 | }; | 1141 | }; |
1117 | 1142 | ||
1143 | struct ib_umem; | ||
1144 | |||
1118 | struct ib_ucontext { | 1145 | struct ib_ucontext { |
1119 | struct ib_device *device; | 1146 | struct ib_device *device; |
1120 | struct list_head pd_list; | 1147 | struct list_head pd_list; |
@@ -1127,6 +1154,24 @@ struct ib_ucontext { | |||
1127 | struct list_head xrcd_list; | 1154 | struct list_head xrcd_list; |
1128 | struct list_head rule_list; | 1155 | struct list_head rule_list; |
1129 | int closing; | 1156 | int closing; |
1157 | |||
1158 | struct pid *tgid; | ||
1159 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
1160 | struct rb_root umem_tree; | ||
1161 | /* | ||
1162 | * Protects the umem_tree, as well as odp_mrs_count and | ||
1163 | * mmu notifiers registration. | ||
1164 | */ | ||
1165 | struct rw_semaphore umem_rwsem; | ||
1166 | void (*invalidate_range)(struct ib_umem *umem, | ||
1167 | unsigned long start, unsigned long end); | ||
1168 | |||
1169 | struct mmu_notifier mn; | ||
1170 | atomic_t notifier_count; | ||
1171 | /* A list of umems that don't have private mmu notifier counters yet. */ | ||
1172 | struct list_head no_private_counters; | ||
1173 | int odp_mrs_count; | ||
1174 | #endif | ||
1130 | }; | 1175 | }; |
1131 | 1176 | ||
1132 | struct ib_uobject { | 1177 | struct ib_uobject { |
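These per-ucontext fields are what the MMU notifier side works against: invalidations take umem_rwsem for read and walk umem_tree so that only the umems overlapping the invalidated range are touched. A hedged sketch of such a walk; the callback body and both function names are the editor's illustration.

#include <linux/rwsem.h>
#include <rdma/ib_umem_odp.h>
#include <rdma/ib_verbs.h>

/* Illustrative callback: unmap the overlapping part of one umem. */
static int example_invalidate_one(struct ib_umem *umem, u64 start, u64 end,
				  void *cookie)
{
	ib_umem_odp_unmap_dma_pages(umem, start, end);
	return 0;
}

/* Illustrative range invalidation over the per-context interval tree. */
static void example_invalidate_range(struct ib_ucontext *context,
				     unsigned long start, unsigned long end)
{
	down_read(&context->umem_rwsem);
	rbt_ib_umem_for_each_in_range(&context->umem_tree, start, end,
				      example_invalidate_one, NULL);
	up_read(&context->umem_rwsem);
}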
@@ -1662,7 +1707,10 @@ static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t | |||
1662 | 1707 | ||
1663 | static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) | 1708 | static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) |
1664 | { | 1709 | { |
1665 | return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; | 1710 | size_t copy_sz; |
1711 | |||
1712 | copy_sz = min_t(size_t, len, udata->outlen); | ||
1713 | return copy_to_user(udata->outbuf, src, copy_sz) ? -EFAULT : 0; | ||
1666 | } | 1714 | } |
1667 | 1715 | ||
1668 | /** | 1716 | /** |
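Clamping the copy to udata->outlen is what allows a response structure to grow over time: an older userspace that allocated only the base response still gets a successful, truncated copy instead of having memory past its buffer overwritten. A hedged sketch of a driver reply that benefits from this; the structure and field names are invented for illustration.

#include <linux/types.h>
#include <rdma/ib_verbs.h>

/* Illustrative: a response that grew after the ABI was first shipped. */
struct example_create_cq_resp {
	__u32 cqn;		/* original field */
	__u32 comp_mask;	/* added later */
	__u64 new_feature;	/* added later */
};

static int example_reply(struct ib_udata *udata, __u32 cqn)
{
	struct example_create_cq_resp resp = {
		.cqn = cqn,
	};

	/* Copies min(sizeof(resp), udata->outlen) bytes to userspace. */
	return ib_copy_to_udata(udata, &resp, sizeof(resp));
}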
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 26daf55ff76e..4275b961bf60 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h | |||
@@ -90,8 +90,9 @@ enum { | |||
90 | }; | 90 | }; |
91 | 91 | ||
92 | enum { | 92 | enum { |
93 | IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE, | ||
93 | IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, | 94 | IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, |
94 | IB_USER_VERBS_EX_CMD_DESTROY_FLOW | 95 | IB_USER_VERBS_EX_CMD_DESTROY_FLOW, |
95 | }; | 96 | }; |
96 | 97 | ||
97 | /* | 98 | /* |
@@ -201,6 +202,32 @@ struct ib_uverbs_query_device_resp { | |||
201 | __u8 reserved[4]; | 202 | __u8 reserved[4]; |
202 | }; | 203 | }; |
203 | 204 | ||
205 | enum { | ||
206 | IB_USER_VERBS_EX_QUERY_DEVICE_ODP = 1ULL << 0, | ||
207 | }; | ||
208 | |||
209 | struct ib_uverbs_ex_query_device { | ||
210 | __u32 comp_mask; | ||
211 | __u32 reserved; | ||
212 | }; | ||
213 | |||
214 | struct ib_uverbs_odp_caps { | ||
215 | __u64 general_caps; | ||
216 | struct { | ||
217 | __u32 rc_odp_caps; | ||
218 | __u32 uc_odp_caps; | ||
219 | __u32 ud_odp_caps; | ||
220 | } per_transport_caps; | ||
221 | __u32 reserved; | ||
222 | }; | ||
223 | |||
224 | struct ib_uverbs_ex_query_device_resp { | ||
225 | struct ib_uverbs_query_device_resp base; | ||
226 | __u32 comp_mask; | ||
227 | __u32 reserved; | ||
228 | struct ib_uverbs_odp_caps odp_caps; | ||
229 | }; | ||
230 | |||
204 | struct ib_uverbs_query_port { | 231 | struct ib_uverbs_query_port { |
205 | __u64 response; | 232 | __u64 response; |
206 | __u8 port_num; | 233 | __u8 port_num; |
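The extended query-device response is self-describing: userspace should check comp_mask for IB_USER_VERBS_EX_QUERY_DEVICE_ODP before trusting odp_caps. A hedged consumer-side sketch, assuming the response structure has already been filled in by the extended command; the function name and the RC-only policy are illustrative.

#include <stdbool.h>
#include <rdma/ib_user_verbs.h>

/* Illustrative: decide whether RC send/recv can use ODP-registered memory. */
static bool example_rc_odp_usable(const struct ib_uverbs_ex_query_device_resp *resp)
{
	__u32 rc;

	if (!(resp->comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP))
		return false;	/* kernel did not report ODP capabilities */

	if (!(resp->odp_caps.general_caps & IB_ODP_SUPPORT))
		return false;

	rc = resp->odp_caps.per_transport_caps.rc_odp_caps;
	return (rc & IB_ODP_SUPPORT_SEND) && (rc & IB_ODP_SUPPORT_RECV);
}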