52 files changed, 3508 insertions, 453 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 77089399359b..b899531498eb 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
| @@ -38,6 +38,17 @@ config INFINIBAND_USER_MEM | |||
| 38 | depends on INFINIBAND_USER_ACCESS != n | 38 | depends on INFINIBAND_USER_ACCESS != n |
| 39 | default y | 39 | default y |
| 40 | 40 | ||
| 41 | config INFINIBAND_ON_DEMAND_PAGING | ||
| 42 | bool "InfiniBand on-demand paging support" | ||
| 43 | depends on INFINIBAND_USER_MEM | ||
| 44 | select MMU_NOTIFIER | ||
| 45 | default y | ||
| 46 | ---help--- | ||
| 47 | On demand paging support for the InfiniBand subsystem. | ||
| 48 | Together with driver support this allows registration of | ||
| 49 | memory regions without pinning their pages, fetching the | ||
| 50 | pages on demand instead. | ||
| 51 | |||
| 41 | config INFINIBAND_ADDR_TRANS | 52 | config INFINIBAND_ADDR_TRANS |
| 42 | bool | 53 | bool |
| 43 | depends on INFINIBAND | 54 | depends on INFINIBAND |
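For readers who want to see what the new option enables end to end, the sketch below shows a userspace registration of an on-demand-paging memory region. It is a minimal illustration, not part of this patch: it assumes a libibverbs new enough to expose IBV_ACCESS_ON_DEMAND (the userspace counterpart of the IB_ACCESS_ON_DEMAND flag added later in this series), and most error handling is trimmed.

#include <stdio.h>
#include <stdlib.h>
#include <infiniband/verbs.h>

int main(void)
{
        struct ibv_device **list = ibv_get_device_list(NULL);
        struct ibv_context *ctx;
        struct ibv_pd *pd;
        struct ibv_mr *mr;
        size_t len = 1UL << 20;
        void *buf = malloc(len);

        if (!list || !list[0] || !buf)
                return 1;
        ctx = ibv_open_device(list[0]);
        if (!ctx)
                return 1;
        pd = ibv_alloc_pd(ctx);
        if (!pd)
                return 1;

        /* With IBV_ACCESS_ON_DEMAND the pages backing buf are not pinned
         * at registration time; the HCA driver faults them in on access. */
        mr = ibv_reg_mr(pd, buf, len,
                        IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ON_DEMAND);
        if (!mr)
                perror("ibv_reg_mr(ODP)");
        else
                ibv_dereg_mr(mr);

        ibv_dealloc_pd(pd);
        ibv_close_device(ctx);
        ibv_free_device_list(list);
        free(buf);
        return 0;
}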
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index ffd0af6734af..acf736764445 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
| @@ -11,6 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ | |||
| 11 | ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ | 11 | ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ |
| 12 | device.o fmr_pool.o cache.o netlink.o | 12 | device.o fmr_pool.o cache.o netlink.o |
| 13 | ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o | 13 | ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o |
| 14 | ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o | ||
| 14 | 15 | ||
| 15 | ib_mad-y := mad.o smi.o agent.o mad_rmpp.o | 16 | ib_mad-y := mad.o smi.o agent.o mad_rmpp.o |
| 16 | 17 | ||
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 8172d37f9add..f80da50d84a5 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
| @@ -176,8 +176,8 @@ static void set_timeout(unsigned long time) | |||
| 176 | unsigned long delay; | 176 | unsigned long delay; |
| 177 | 177 | ||
| 178 | delay = time - jiffies; | 178 | delay = time - jiffies; |
| 179 | if ((long)delay <= 0) | 179 | if ((long)delay < 0) |
| 180 | delay = 1; | 180 | delay = 0; |
| 181 | 181 | ||
| 182 | mod_delayed_work(addr_wq, &work, delay); | 182 | mod_delayed_work(addr_wq, &work, delay); |
| 183 | } | 183 | } |
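The set_timeout() tweak above relies on the usual jiffies idiom: the subtraction is done in unsigned arithmetic and the result is reinterpreted as signed, so a deadline that has already passed shows up as a negative delta and can now be scheduled with zero delay instead of a minimum of one jiffy. A small standalone illustration of that cast (plain C, no kernel dependencies, arbitrary numbers):

#include <stdio.h>

int main(void)
{
        unsigned long jiffies = 1000;              /* pretend current time */
        unsigned long past = 990, future = 1010;

        printf("%ld\n", (long)(future - jiffies)); /*  10: schedule later        */
        printf("%ld\n", (long)(past - jiffies));   /* -10: fire now (delay = 0)  */
        return 0;
}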
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index d2360a8ef0b2..fa17b552ff78 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
| @@ -525,17 +525,22 @@ static void join_handler(int status, struct ib_sa_mcmember_rec *rec, | |||
| 525 | if (status) | 525 | if (status) |
| 526 | process_join_error(group, status); | 526 | process_join_error(group, status); |
| 527 | else { | 527 | else { |
| 528 | int mgids_changed, is_mgid0; | ||
| 528 | ib_find_pkey(group->port->dev->device, group->port->port_num, | 529 | ib_find_pkey(group->port->dev->device, group->port->port_num, |
| 529 | be16_to_cpu(rec->pkey), &pkey_index); | 530 | be16_to_cpu(rec->pkey), &pkey_index); |
| 530 | 531 | ||
| 531 | spin_lock_irq(&group->port->lock); | 532 | spin_lock_irq(&group->port->lock); |
| 532 | group->rec = *rec; | ||
| 533 | if (group->state == MCAST_BUSY && | 533 | if (group->state == MCAST_BUSY && |
| 534 | group->pkey_index == MCAST_INVALID_PKEY_INDEX) | 534 | group->pkey_index == MCAST_INVALID_PKEY_INDEX) |
| 535 | group->pkey_index = pkey_index; | 535 | group->pkey_index = pkey_index; |
| 536 | if (!memcmp(&mgid0, &group->rec.mgid, sizeof mgid0)) { | 536 | mgids_changed = memcmp(&rec->mgid, &group->rec.mgid, |
| 537 | sizeof(group->rec.mgid)); | ||
| 538 | group->rec = *rec; | ||
| 539 | if (mgids_changed) { | ||
| 537 | rb_erase(&group->node, &group->port->table); | 540 | rb_erase(&group->node, &group->port->table); |
| 538 | mcast_insert(group->port, group, 1); | 541 | is_mgid0 = !memcmp(&mgid0, &group->rec.mgid, |
| 542 | sizeof(mgid0)); | ||
| 543 | mcast_insert(group->port, group, is_mgid0); | ||
| 539 | } | 544 | } |
| 540 | spin_unlock_irq(&group->port->lock); | 545 | spin_unlock_irq(&group->port->lock); |
| 541 | } | 546 | } |
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index df0c4f605a21..aec7a6aa2951 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
| @@ -39,6 +39,7 @@ | |||
| 39 | #include <linux/hugetlb.h> | 39 | #include <linux/hugetlb.h> |
| 40 | #include <linux/dma-attrs.h> | 40 | #include <linux/dma-attrs.h> |
| 41 | #include <linux/slab.h> | 41 | #include <linux/slab.h> |
| 42 | #include <rdma/ib_umem_odp.h> | ||
| 42 | 43 | ||
| 43 | #include "uverbs.h" | 44 | #include "uverbs.h" |
| 44 | 45 | ||
| @@ -69,6 +70,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d | |||
| 69 | 70 | ||
| 70 | /** | 71 | /** |
| 71 | * ib_umem_get - Pin and DMA map userspace memory. | 72 | * ib_umem_get - Pin and DMA map userspace memory. |
| 73 | * | ||
| 74 | * If access flags indicate ODP memory, avoid pinning. Instead, store | ||
| 75 | * the mm for future page fault handling in conjunction with MMU notifiers. | ||
| 76 | * | ||
| 72 | * @context: userspace context to pin memory for | 77 | * @context: userspace context to pin memory for |
| 73 | * @addr: userspace virtual address to start at | 78 | * @addr: userspace virtual address to start at |
| 74 | * @size: length of region to pin | 79 | * @size: length of region to pin |
| @@ -103,17 +108,30 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, | |||
| 103 | 108 | ||
| 104 | umem->context = context; | 109 | umem->context = context; |
| 105 | umem->length = size; | 110 | umem->length = size; |
| 106 | umem->offset = addr & ~PAGE_MASK; | 111 | umem->address = addr; |
| 107 | umem->page_size = PAGE_SIZE; | 112 | umem->page_size = PAGE_SIZE; |
| 108 | umem->pid = get_task_pid(current, PIDTYPE_PID); | 113 | umem->pid = get_task_pid(current, PIDTYPE_PID); |
| 109 | /* | 114 | /* |
| 110 | * We ask for writable memory if any access flags other than | 115 | * We ask for writable memory if any of the following |
| 111 | * "remote read" are set. "Local write" and "remote write" | 116 | * access flags are set. "Local write" and "remote write" |
| 112 | * obviously require write access. "Remote atomic" can do | 117 | * obviously require write access. "Remote atomic" can do |
| 113 | * things like fetch and add, which will modify memory, and | 118 | * things like fetch and add, which will modify memory, and |
| 114 | * "MW bind" can change permissions by binding a window. | 119 | * "MW bind" can change permissions by binding a window. |
| 115 | */ | 120 | */ |
| 116 | umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ); | 121 | umem->writable = !!(access & |
| 122 | (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | | ||
| 123 | IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND)); | ||
| 124 | |||
| 125 | if (access & IB_ACCESS_ON_DEMAND) { | ||
| 126 | ret = ib_umem_odp_get(context, umem); | ||
| 127 | if (ret) { | ||
| 128 | kfree(umem); | ||
| 129 | return ERR_PTR(ret); | ||
| 130 | } | ||
| 131 | return umem; | ||
| 132 | } | ||
| 133 | |||
| 134 | umem->odp_data = NULL; | ||
| 117 | 135 | ||
| 118 | /* We assume the memory is from hugetlb until proved otherwise */ | 136 | /* We assume the memory is from hugetlb until proved otherwise */ |
| 119 | umem->hugetlb = 1; | 137 | umem->hugetlb = 1; |
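The rewritten writable test above matters precisely because of the new flag: under the old rule, any access bit other than IB_ACCESS_REMOTE_READ (including IB_ACCESS_ON_DEMAND) would have forced a writable pin. A standalone userspace demonstration of the difference; the flag values mirror the kernel's enum ib_access_flags, with IB_ACCESS_ON_DEMAND assumed here to be (1 << 6):

#include <stdio.h>

enum {
        IB_ACCESS_LOCAL_WRITE   = 1 << 0,
        IB_ACCESS_REMOTE_WRITE  = 1 << 1,
        IB_ACCESS_REMOTE_READ   = 1 << 2,
        IB_ACCESS_REMOTE_ATOMIC = 1 << 3,
        IB_ACCESS_MW_BIND       = 1 << 4,
        IB_ACCESS_ON_DEMAND     = 1 << 6,
};

int main(void)
{
        int access = IB_ACCESS_REMOTE_READ | IB_ACCESS_ON_DEMAND;

        int old_writable = !!(access & ~IB_ACCESS_REMOTE_READ);
        int new_writable = !!(access & (IB_ACCESS_LOCAL_WRITE |
                                        IB_ACCESS_REMOTE_WRITE |
                                        IB_ACCESS_REMOTE_ATOMIC |
                                        IB_ACCESS_MW_BIND));

        printf("old: %d  new: %d\n", old_writable, new_writable); /* 1 0 */
        return 0;
}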
| @@ -132,7 +150,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, | |||
| 132 | if (!vma_list) | 150 | if (!vma_list) |
| 133 | umem->hugetlb = 0; | 151 | umem->hugetlb = 0; |
| 134 | 152 | ||
| 135 | npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT; | 153 | npages = ib_umem_num_pages(umem); |
| 136 | 154 | ||
| 137 | down_write(¤t->mm->mmap_sem); | 155 | down_write(¤t->mm->mmap_sem); |
| 138 | 156 | ||
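ib_umem_num_pages() and the related start/end/offset helpers introduced alongside this change (in ib_umem.h, outside the lines shown here) replace the old offset-based arithmetic. The standalone model below mirrors what that arithmetic is expected to compute; PAGE_SIZE is fixed at 4 KiB for the example and the helper names appear only in comments:

#include <stdio.h>

#define PAGE_SIZE     4096UL
#define PAGE_MASK     (~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
        unsigned long address = 0x1234, length = 0x3000;

        unsigned long offset = address & ~PAGE_MASK;          /* ib_umem_offset    */
        unsigned long start  = address & PAGE_MASK;           /* ib_umem_start     */
        unsigned long end    = PAGE_ALIGN(address + length);  /* ib_umem_end       */
        unsigned long npages = (end - start) / PAGE_SIZE;     /* ib_umem_num_pages */

        printf("offset=0x%lx start=0x%lx end=0x%lx npages=%lu\n",
               offset, start, end, npages);                   /* npages = 4 */
        return 0;
}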
| @@ -235,6 +253,11 @@ void ib_umem_release(struct ib_umem *umem) | |||
| 235 | struct task_struct *task; | 253 | struct task_struct *task; |
| 236 | unsigned long diff; | 254 | unsigned long diff; |
| 237 | 255 | ||
| 256 | if (umem->odp_data) { | ||
| 257 | ib_umem_odp_release(umem); | ||
| 258 | return; | ||
| 259 | } | ||
| 260 | |||
| 238 | __ib_umem_release(umem->context->device, umem, 1); | 261 | __ib_umem_release(umem->context->device, umem, 1); |
| 239 | 262 | ||
| 240 | task = get_pid_task(umem->pid, PIDTYPE_PID); | 263 | task = get_pid_task(umem->pid, PIDTYPE_PID); |
| @@ -246,7 +269,7 @@ void ib_umem_release(struct ib_umem *umem) | |||
| 246 | if (!mm) | 269 | if (!mm) |
| 247 | goto out; | 270 | goto out; |
| 248 | 271 | ||
| 249 | diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT; | 272 | diff = ib_umem_num_pages(umem); |
| 250 | 273 | ||
| 251 | /* | 274 | /* |
| 252 | * We may be called with the mm's mmap_sem already held. This | 275 | * We may be called with the mm's mmap_sem already held. This |
| @@ -283,6 +306,9 @@ int ib_umem_page_count(struct ib_umem *umem) | |||
| 283 | int n; | 306 | int n; |
| 284 | struct scatterlist *sg; | 307 | struct scatterlist *sg; |
| 285 | 308 | ||
| 309 | if (umem->odp_data) | ||
| 310 | return ib_umem_num_pages(umem); | ||
| 311 | |||
| 286 | shift = ilog2(umem->page_size); | 312 | shift = ilog2(umem->page_size); |
| 287 | 313 | ||
| 288 | n = 0; | 314 | n = 0; |
| @@ -292,3 +318,37 @@ int ib_umem_page_count(struct ib_umem *umem) | |||
| 292 | return n; | 318 | return n; |
| 293 | } | 319 | } |
| 294 | EXPORT_SYMBOL(ib_umem_page_count); | 320 | EXPORT_SYMBOL(ib_umem_page_count); |
| 321 | |||
| 322 | /* | ||
| 323 | * Copy from the given ib_umem's pages to the given buffer. | ||
| 324 | * | ||
| 325 | * umem - the umem to copy from | ||
| 326 | * offset - offset to start copying from | ||
| 327 | * dst - destination buffer | ||
| 328 | * length - buffer length | ||
| 329 | * | ||
| 330 | * Returns 0 on success, or an error code. | ||
| 331 | */ | ||
| 332 | int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, | ||
| 333 | size_t length) | ||
| 334 | { | ||
| 335 | size_t end = offset + length; | ||
| 336 | int ret; | ||
| 337 | |||
| 338 | if (offset > umem->length || length > umem->length - offset) { | ||
| 339 | pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n", | ||
| 340 | offset, umem->length, end); | ||
| 341 | return -EINVAL; | ||
| 342 | } | ||
| 343 | |||
| 344 | ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length, | ||
| 345 | offset + ib_umem_offset(umem)); | ||
| 346 | |||
| 347 | if (ret < 0) | ||
| 348 | return ret; | ||
| 349 | else if (ret != length) | ||
| 350 | return -EINVAL; | ||
| 351 | else | ||
| 352 | return 0; | ||
| 353 | } | ||
| 354 | EXPORT_SYMBOL(ib_umem_copy_from); | ||
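A hypothetical consumer of the new export, to show the intended calling convention: dst first, then the umem, an offset relative to the address passed to ib_umem_get(), and a length that must stay within the region. Drivers that need to read user memory through a umem (for example while resolving ODP page faults) are the expected users later in the series; the helper below is illustrative only and not part of the patch.

#include <linux/printk.h>
#include <rdma/ib_umem.h>

/* Illustrative only: dump the first bytes of a registered region. */
static int dump_mr_header(struct ib_umem *umem)
{
        u8 header[64];
        int ret;

        ret = ib_umem_copy_from(header, umem, 0, sizeof(header));
        if (ret)
                return ret;

        print_hex_dump_bytes("mr hdr: ", DUMP_PREFIX_OFFSET,
                             header, sizeof(header));
        return 0;
}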
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
new file mode 100644
index 000000000000..6095872549e7
--- /dev/null
+++ b/drivers/infiniband/core/umem_odp.c
| @@ -0,0 +1,668 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2014 Mellanox Technologies. All rights reserved. | ||
| 3 | * | ||
| 4 | * This software is available to you under a choice of one of two | ||
| 5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
| 6 | * General Public License (GPL) Version 2, available from the file | ||
| 7 | * COPYING in the main directory of this source tree, or the | ||
| 8 | * OpenIB.org BSD license below: | ||
| 9 | * | ||
| 10 | * Redistribution and use in source and binary forms, with or | ||
| 11 | * without modification, are permitted provided that the following | ||
| 12 | * conditions are met: | ||
| 13 | * | ||
| 14 | * - Redistributions of source code must retain the above | ||
| 15 | * copyright notice, this list of conditions and the following | ||
| 16 | * disclaimer. | ||
| 17 | * | ||
| 18 | * - Redistributions in binary form must reproduce the above | ||
| 19 | * copyright notice, this list of conditions and the following | ||
| 20 | * disclaimer in the documentation and/or other materials | ||
| 21 | * provided with the distribution. | ||
| 22 | * | ||
| 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
| 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
| 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
| 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
| 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 30 | * SOFTWARE. | ||
| 31 | */ | ||
| 32 | |||
| 33 | #include <linux/types.h> | ||
| 34 | #include <linux/sched.h> | ||
| 35 | #include <linux/pid.h> | ||
| 36 | #include <linux/slab.h> | ||
| 37 | #include <linux/export.h> | ||
| 38 | #include <linux/vmalloc.h> | ||
| 39 | |||
| 40 | #include <rdma/ib_verbs.h> | ||
| 41 | #include <rdma/ib_umem.h> | ||
| 42 | #include <rdma/ib_umem_odp.h> | ||
| 43 | |||
| 44 | static void ib_umem_notifier_start_account(struct ib_umem *item) | ||
| 45 | { | ||
| 46 | mutex_lock(&item->odp_data->umem_mutex); | ||
| 47 | |||
| 48 | /* Only update private counters for this umem if it has them. | ||
| 49 | * Otherwise skip it. All page faults will be delayed for this umem. */ | ||
| 50 | if (item->odp_data->mn_counters_active) { | ||
| 51 | int notifiers_count = item->odp_data->notifiers_count++; | ||
| 52 | |||
| 53 | if (notifiers_count == 0) | ||
| 54 | /* Initialize the completion object for waiting on | ||
| 55 | * notifiers. Since notifier_count is zero, no one | ||
| 56 | * should be waiting right now. */ | ||
| 57 | reinit_completion(&item->odp_data->notifier_completion); | ||
| 58 | } | ||
| 59 | mutex_unlock(&item->odp_data->umem_mutex); | ||
| 60 | } | ||
| 61 | |||
| 62 | static void ib_umem_notifier_end_account(struct ib_umem *item) | ||
| 63 | { | ||
| 64 | mutex_lock(&item->odp_data->umem_mutex); | ||
| 65 | |||
| 66 | /* Only update private counters for this umem if it has them. | ||
| 67 | * Otherwise skip it. All page faults will be delayed for this umem. */ | ||
| 68 | if (item->odp_data->mn_counters_active) { | ||
| 69 | /* | ||
| 70 | * This sequence increase will notify the QP page fault that | ||
| 71 | * the page that is going to be mapped in the spte could have | ||
| 72 | * been freed. | ||
| 73 | */ | ||
| 74 | ++item->odp_data->notifiers_seq; | ||
| 75 | if (--item->odp_data->notifiers_count == 0) | ||
| 76 | complete_all(&item->odp_data->notifier_completion); | ||
| 77 | } | ||
| 78 | mutex_unlock(&item->odp_data->umem_mutex); | ||
| 79 | } | ||
| 80 | |||
| 81 | /* Account for a new mmu notifier in an ib_ucontext. */ | ||
| 82 | static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) | ||
| 83 | { | ||
| 84 | atomic_inc(&context->notifier_count); | ||
| 85 | } | ||
| 86 | |||
| 87 | /* Account for a terminating mmu notifier in an ib_ucontext. | ||
| 88 | * | ||
| 89 | * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since | ||
| 90 | * the function takes the semaphore itself. */ | ||
| 91 | static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) | ||
| 92 | { | ||
| 93 | int zero_notifiers = atomic_dec_and_test(&context->notifier_count); | ||
| 94 | |||
| 95 | if (zero_notifiers && | ||
| 96 | !list_empty(&context->no_private_counters)) { | ||
| 97 | /* No currently running mmu notifiers. Now is the chance to | ||
| 98 | * add private accounting to all previously added umems. */ | ||
| 99 | struct ib_umem_odp *odp_data, *next; | ||
| 100 | |||
| 101 | /* Prevent concurrent mmu notifiers from working on the | ||
| 102 | * no_private_counters list. */ | ||
| 103 | down_write(&context->umem_rwsem); | ||
| 104 | |||
| 105 | /* Read the notifier_count again, with the umem_rwsem | ||
| 106 | * semaphore taken for write. */ | ||
| 107 | if (!atomic_read(&context->notifier_count)) { | ||
| 108 | list_for_each_entry_safe(odp_data, next, | ||
| 109 | &context->no_private_counters, | ||
| 110 | no_private_counters) { | ||
| 111 | mutex_lock(&odp_data->umem_mutex); | ||
| 112 | odp_data->mn_counters_active = true; | ||
| 113 | list_del(&odp_data->no_private_counters); | ||
| 114 | complete_all(&odp_data->notifier_completion); | ||
| 115 | mutex_unlock(&odp_data->umem_mutex); | ||
| 116 | } | ||
| 117 | } | ||
| 118 | |||
| 119 | up_write(&context->umem_rwsem); | ||
| 120 | } | ||
| 121 | } | ||
| 122 | |||
| 123 | static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, | ||
| 124 | u64 end, void *cookie) { | ||
| 125 | /* | ||
| 126 | * Increase the number of notifiers running, to | ||
| 127 | * prevent any further fault handling on this MR. | ||
| 128 | */ | ||
| 129 | ib_umem_notifier_start_account(item); | ||
| 130 | item->odp_data->dying = 1; | ||
| 131 | /* Make sure that the fact the umem is dying is out before we release | ||
| 132 | * all pending page faults. */ | ||
| 133 | smp_wmb(); | ||
| 134 | complete_all(&item->odp_data->notifier_completion); | ||
| 135 | item->context->invalidate_range(item, ib_umem_start(item), | ||
| 136 | ib_umem_end(item)); | ||
| 137 | return 0; | ||
| 138 | } | ||
| 139 | |||
| 140 | static void ib_umem_notifier_release(struct mmu_notifier *mn, | ||
| 141 | struct mm_struct *mm) | ||
| 142 | { | ||
| 143 | struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); | ||
| 144 | |||
| 145 | if (!context->invalidate_range) | ||
| 146 | return; | ||
| 147 | |||
| 148 | ib_ucontext_notifier_start_account(context); | ||
| 149 | down_read(&context->umem_rwsem); | ||
| 150 | rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, | ||
| 151 | ULLONG_MAX, | ||
| 152 | ib_umem_notifier_release_trampoline, | ||
| 153 | NULL); | ||
| 154 | up_read(&context->umem_rwsem); | ||
| 155 | } | ||
| 156 | |||
| 157 | static int invalidate_page_trampoline(struct ib_umem *item, u64 start, | ||
| 158 | u64 end, void *cookie) | ||
| 159 | { | ||
| 160 | ib_umem_notifier_start_account(item); | ||
| 161 | item->context->invalidate_range(item, start, start + PAGE_SIZE); | ||
| 162 | ib_umem_notifier_end_account(item); | ||
| 163 | return 0; | ||
| 164 | } | ||
| 165 | |||
| 166 | static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn, | ||
| 167 | struct mm_struct *mm, | ||
| 168 | unsigned long address) | ||
| 169 | { | ||
| 170 | struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); | ||
| 171 | |||
| 172 | if (!context->invalidate_range) | ||
| 173 | return; | ||
| 174 | |||
| 175 | ib_ucontext_notifier_start_account(context); | ||
| 176 | down_read(&context->umem_rwsem); | ||
| 177 | rbt_ib_umem_for_each_in_range(&context->umem_tree, address, | ||
| 178 | address + PAGE_SIZE, | ||
| 179 | invalidate_page_trampoline, NULL); | ||
| 180 | up_read(&context->umem_rwsem); | ||
| 181 | ib_ucontext_notifier_end_account(context); | ||
| 182 | } | ||
| 183 | |||
| 184 | static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, | ||
| 185 | u64 end, void *cookie) | ||
| 186 | { | ||
| 187 | ib_umem_notifier_start_account(item); | ||
| 188 | item->context->invalidate_range(item, start, end); | ||
| 189 | return 0; | ||
| 190 | } | ||
| 191 | |||
| 192 | static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, | ||
| 193 | struct mm_struct *mm, | ||
| 194 | unsigned long start, | ||
| 195 | unsigned long end) | ||
| 196 | { | ||
| 197 | struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); | ||
| 198 | |||
| 199 | if (!context->invalidate_range) | ||
| 200 | return; | ||
| 201 | |||
| 202 | ib_ucontext_notifier_start_account(context); | ||
| 203 | down_read(&context->umem_rwsem); | ||
| 204 | rbt_ib_umem_for_each_in_range(&context->umem_tree, start, | ||
| 205 | end, | ||
| 206 | invalidate_range_start_trampoline, NULL); | ||
| 207 | up_read(&context->umem_rwsem); | ||
| 208 | } | ||
| 209 | |||
| 210 | static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, | ||
| 211 | u64 end, void *cookie) | ||
| 212 | { | ||
| 213 | ib_umem_notifier_end_account(item); | ||
| 214 | return 0; | ||
| 215 | } | ||
| 216 | |||
| 217 | static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, | ||
| 218 | struct mm_struct *mm, | ||
| 219 | unsigned long start, | ||
| 220 | unsigned long end) | ||
| 221 | { | ||
| 222 | struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); | ||
| 223 | |||
| 224 | if (!context->invalidate_range) | ||
| 225 | return; | ||
| 226 | |||
| 227 | down_read(&context->umem_rwsem); | ||
| 228 | rbt_ib_umem_for_each_in_range(&context->umem_tree, start, | ||
| 229 | end, | ||
| 230 | invalidate_range_end_trampoline, NULL); | ||
| 231 | up_read(&context->umem_rwsem); | ||
| 232 | ib_ucontext_notifier_end_account(context); | ||
| 233 | } | ||
| 234 | |||
| 235 | static struct mmu_notifier_ops ib_umem_notifiers = { | ||
| 236 | .release = ib_umem_notifier_release, | ||
| 237 | .invalidate_page = ib_umem_notifier_invalidate_page, | ||
| 238 | .invalidate_range_start = ib_umem_notifier_invalidate_range_start, | ||
| 239 | .invalidate_range_end = ib_umem_notifier_invalidate_range_end, | ||
| 240 | }; | ||
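The start/end accounting above implements a sequence-plus-count retry protocol: a page-fault handler samples notifiers_seq before faulting and must not install a translation if an invalidation is still running (count > 0) or has completed since the sample (seq changed). The standalone miniature below, with illustrative names only, shows the check that ib_umem_mmu_notifier_retry() is expected to perform against these counters:

#include <stdio.h>

struct odp_counters {
        unsigned long notifiers_seq;
        unsigned long notifiers_count;
};

static int notifier_retry(const struct odp_counters *c, unsigned long captured)
{
        return c->notifiers_count || c->notifiers_seq != captured;
}

int main(void)
{
        struct odp_counters c = { .notifiers_seq = 7, .notifiers_count = 0 };
        unsigned long seq = c.notifiers_seq;    /* sampled before faulting  */

        printf("no invalidation:     %d\n", notifier_retry(&c, seq)); /* 0 */

        c.notifiers_count++;                    /* invalidate_range_start() */
        printf("invalidation active: %d\n", notifier_retry(&c, seq)); /* 1 */

        c.notifiers_seq++;                      /* pages were unmapped      */
        c.notifiers_count--;                    /* invalidate_range_end()   */
        printf("invalidation done:   %d\n", notifier_retry(&c, seq)); /* 1 */
        return 0;
}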
| 241 | |||
| 242 | int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) | ||
| 243 | { | ||
| 244 | int ret_val; | ||
| 245 | struct pid *our_pid; | ||
| 246 | struct mm_struct *mm = get_task_mm(current); | ||
| 247 | |||
| 248 | if (!mm) | ||
| 249 | return -EINVAL; | ||
| 250 | |||
| 251 | /* Prevent creating ODP MRs in child processes */ | ||
| 252 | rcu_read_lock(); | ||
| 253 | our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); | ||
| 254 | rcu_read_unlock(); | ||
| 255 | put_pid(our_pid); | ||
| 256 | if (context->tgid != our_pid) { | ||
| 257 | ret_val = -EINVAL; | ||
| 258 | goto out_mm; | ||
| 259 | } | ||
| 260 | |||
| 261 | umem->hugetlb = 0; | ||
| 262 | umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); | ||
| 263 | if (!umem->odp_data) { | ||
| 264 | ret_val = -ENOMEM; | ||
| 265 | goto out_mm; | ||
| 266 | } | ||
| 267 | umem->odp_data->umem = umem; | ||
| 268 | |||
| 269 | mutex_init(&umem->odp_data->umem_mutex); | ||
| 270 | |||
| 271 | init_completion(&umem->odp_data->notifier_completion); | ||
| 272 | |||
| 273 | umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) * | ||
| 274 | sizeof(*umem->odp_data->page_list)); | ||
| 275 | if (!umem->odp_data->page_list) { | ||
| 276 | ret_val = -ENOMEM; | ||
| 277 | goto out_odp_data; | ||
| 278 | } | ||
| 279 | |||
| 280 | umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) * | ||
| 281 | sizeof(*umem->odp_data->dma_list)); | ||
| 282 | if (!umem->odp_data->dma_list) { | ||
| 283 | ret_val = -ENOMEM; | ||
| 284 | goto out_page_list; | ||
| 285 | } | ||
| 286 | |||
| 287 | /* | ||
| 288 | * When using MMU notifiers, we will get a | ||
| 289 | * notification before the "current" task (and MM) is | ||
| 290 | * destroyed. We use the umem_rwsem semaphore to synchronize. | ||
| 291 | */ | ||
| 292 | down_write(&context->umem_rwsem); | ||
| 293 | context->odp_mrs_count++; | ||
| 294 | if (likely(ib_umem_start(umem) != ib_umem_end(umem))) | ||
| 295 | rbt_ib_umem_insert(&umem->odp_data->interval_tree, | ||
| 296 | &context->umem_tree); | ||
| 297 | if (likely(!atomic_read(&context->notifier_count))) | ||
| 298 | umem->odp_data->mn_counters_active = true; | ||
| 299 | else | ||
| 300 | list_add(&umem->odp_data->no_private_counters, | ||
| 301 | &context->no_private_counters); | ||
| 302 | downgrade_write(&context->umem_rwsem); | ||
| 303 | |||
| 304 | if (context->odp_mrs_count == 1) { | ||
| 305 | /* | ||
| 306 | * Note that at this point, no MMU notifier is running | ||
| 307 | * for this context! | ||
| 308 | */ | ||
| 309 | atomic_set(&context->notifier_count, 0); | ||
| 310 | INIT_HLIST_NODE(&context->mn.hlist); | ||
| 311 | context->mn.ops = &ib_umem_notifiers; | ||
| 312 | /* | ||
| 313 | * Lock-dep detects a false positive for mmap_sem vs. | ||
| 314 | * umem_rwsem, due to not grasping downgrade_write correctly. | ||
| 315 | */ | ||
| 316 | lockdep_off(); | ||
| 317 | ret_val = mmu_notifier_register(&context->mn, mm); | ||
| 318 | lockdep_on(); | ||
| 319 | if (ret_val) { | ||
| 320 | pr_err("Failed to register mmu_notifier %d\n", ret_val); | ||
| 321 | ret_val = -EBUSY; | ||
| 322 | goto out_mutex; | ||
| 323 | } | ||
| 324 | } | ||
| 325 | |||
| 326 | up_read(&context->umem_rwsem); | ||
| 327 | |||
| 328 | /* | ||
| 329 | * Note that doing an mmput can cause a notifier for the relevant mm. | ||
| 330 | * If the notifier is called while we hold the umem_rwsem, this will | ||
| 331 | * cause a deadlock. Therefore, we release the reference only after we | ||
| 332 | * released the semaphore. | ||
| 333 | */ | ||
| 334 | mmput(mm); | ||
| 335 | return 0; | ||
| 336 | |||
| 337 | out_mutex: | ||
| 338 | up_read(&context->umem_rwsem); | ||
| 339 | vfree(umem->odp_data->dma_list); | ||
| 340 | out_page_list: | ||
| 341 | vfree(umem->odp_data->page_list); | ||
| 342 | out_odp_data: | ||
| 343 | kfree(umem->odp_data); | ||
| 344 | out_mm: | ||
| 345 | mmput(mm); | ||
| 346 | return ret_val; | ||
| 347 | } | ||
| 348 | |||
| 349 | void ib_umem_odp_release(struct ib_umem *umem) | ||
| 350 | { | ||
| 351 | struct ib_ucontext *context = umem->context; | ||
| 352 | |||
| 353 | /* | ||
| 354 | * Ensure that no more pages are mapped in the umem. | ||
| 355 | * | ||
| 356 | * It is the driver's responsibility to ensure, before calling us, | ||
| 357 | * that the hardware will not attempt to access the MR any more. | ||
| 358 | */ | ||
| 359 | ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), | ||
| 360 | ib_umem_end(umem)); | ||
| 361 | |||
| 362 | down_write(&context->umem_rwsem); | ||
| 363 | if (likely(ib_umem_start(umem) != ib_umem_end(umem))) | ||
| 364 | rbt_ib_umem_remove(&umem->odp_data->interval_tree, | ||
| 365 | &context->umem_tree); | ||
| 366 | context->odp_mrs_count--; | ||
| 367 | if (!umem->odp_data->mn_counters_active) { | ||
| 368 | list_del(&umem->odp_data->no_private_counters); | ||
| 369 | complete_all(&umem->odp_data->notifier_completion); | ||
| 370 | } | ||
| 371 | |||
| 372 | /* | ||
| 373 | * Downgrade the lock to a read lock. This ensures that the notifiers | ||
| 374 | * (who lock the mutex for reading) will be able to finish, and we | ||
| 375 | * will be able to eventually obtain the mmu notifiers SRCU. Note | ||
| 376 | * that since we are doing it atomically, no other user could register | ||
| 377 | * and unregister while we do the check. | ||
| 378 | */ | ||
| 379 | downgrade_write(&context->umem_rwsem); | ||
| 380 | if (!context->odp_mrs_count) { | ||
| 381 | struct task_struct *owning_process = NULL; | ||
| 382 | struct mm_struct *owning_mm = NULL; | ||
| 383 | |||
| 384 | owning_process = get_pid_task(context->tgid, | ||
| 385 | PIDTYPE_PID); | ||
| 386 | if (owning_process == NULL) | ||
| 387 | /* | ||
| 388 | * The process is already dead, notifiers were removed | ||
| 389 | * already. | ||
| 390 | */ | ||
| 391 | goto out; | ||
| 392 | |||
| 393 | owning_mm = get_task_mm(owning_process); | ||
| 394 | if (owning_mm == NULL) | ||
| 395 | /* | ||
| 396 | * The process' mm is already dead, notifiers were | ||
| 397 | * removed already. | ||
| 398 | */ | ||
| 399 | goto out_put_task; | ||
| 400 | mmu_notifier_unregister(&context->mn, owning_mm); | ||
| 401 | |||
| 402 | mmput(owning_mm); | ||
| 403 | |||
| 404 | out_put_task: | ||
| 405 | put_task_struct(owning_process); | ||
| 406 | } | ||
| 407 | out: | ||
| 408 | up_read(&context->umem_rwsem); | ||
| 409 | |||
| 410 | vfree(umem->odp_data->dma_list); | ||
| 411 | vfree(umem->odp_data->page_list); | ||
| 412 | kfree(umem->odp_data); | ||
| 413 | kfree(umem); | ||
| 414 | } | ||
| 415 | |||
| 416 | /* | ||
| 417 | * Map for DMA and insert a single page into the on-demand paging page tables. | ||
| 418 | * | ||
| 419 | * @umem: the umem to insert the page to. | ||
| 420 | * @page_index: index in the umem to add the page to. | ||
| 421 | * @page: the page struct to map and add. | ||
| 422 | * @access_mask: access permissions needed for this page. | ||
| 423 | * @current_seq: sequence number for synchronization with invalidations. | ||
| 424 | * the sequence number is taken from | ||
| 425 | * umem->odp_data->notifiers_seq. | ||
| 426 | * | ||
| 427 | * The function returns -EFAULT if the DMA mapping operation fails. It returns | ||
| 428 | * -EAGAIN if a concurrent invalidation prevents us from updating the page. | ||
| 429 | * | ||
| 430 | * The page is released via put_page even if the operation failed. For | ||
| 431 | * on-demand pinning, the page is released whenever it isn't stored in the | ||
| 432 | * umem. | ||
| 433 | */ | ||
| 434 | static int ib_umem_odp_map_dma_single_page( | ||
| 435 | struct ib_umem *umem, | ||
| 436 | int page_index, | ||
| 437 | u64 base_virt_addr, | ||
| 438 | struct page *page, | ||
| 439 | u64 access_mask, | ||
| 440 | unsigned long current_seq) | ||
| 441 | { | ||
| 442 | struct ib_device *dev = umem->context->device; | ||
| 443 | dma_addr_t dma_addr; | ||
| 444 | int stored_page = 0; | ||
| 445 | int remove_existing_mapping = 0; | ||
| 446 | int ret = 0; | ||
| 447 | |||
| 448 | mutex_lock(&umem->odp_data->umem_mutex); | ||
| 449 | /* | ||
| 450 | * Note: we avoid writing if seq is different from the initial seq, to | ||
| 451 | * handle case of a racing notifier. This check also allows us to bail | ||
| 452 | * early if we have a notifier running in parallel with us. | ||
| 453 | */ | ||
| 454 | if (ib_umem_mmu_notifier_retry(umem, current_seq)) { | ||
| 455 | ret = -EAGAIN; | ||
| 456 | goto out; | ||
| 457 | } | ||
| 458 | if (!(umem->odp_data->dma_list[page_index])) { | ||
| 459 | dma_addr = ib_dma_map_page(dev, | ||
| 460 | page, | ||
| 461 | 0, PAGE_SIZE, | ||
| 462 | DMA_BIDIRECTIONAL); | ||
| 463 | if (ib_dma_mapping_error(dev, dma_addr)) { | ||
| 464 | ret = -EFAULT; | ||
| 465 | goto out; | ||
| 466 | } | ||
| 467 | umem->odp_data->dma_list[page_index] = dma_addr | access_mask; | ||
| 468 | umem->odp_data->page_list[page_index] = page; | ||
| 469 | stored_page = 1; | ||
| 470 | } else if (umem->odp_data->page_list[page_index] == page) { | ||
| 471 | umem->odp_data->dma_list[page_index] |= access_mask; | ||
| 472 | } else { | ||
| 473 | pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", | ||
| 474 | umem->odp_data->page_list[page_index], page); | ||
| 475 | /* Better remove the mapping now, to prevent any further | ||
| 476 | * damage. */ | ||
| 477 | remove_existing_mapping = 1; | ||
| 478 | } | ||
| 479 | |||
| 480 | out: | ||
| 481 | mutex_unlock(&umem->odp_data->umem_mutex); | ||
| 482 | |||
| 483 | /* On Demand Paging - avoid pinning the page */ | ||
| 484 | if (umem->context->invalidate_range || !stored_page) | ||
| 485 | put_page(page); | ||
| 486 | |||
| 487 | if (remove_existing_mapping && umem->context->invalidate_range) { | ||
| 488 | invalidate_page_trampoline( | ||
| 489 | umem, | ||
| 490 | base_virt_addr + (page_index * PAGE_SIZE), | ||
| 491 | base_virt_addr + ((page_index+1)*PAGE_SIZE), | ||
| 492 | NULL); | ||
| 493 | ret = -EAGAIN; | ||
| 494 | } | ||
| 495 | |||
| 496 | return ret; | ||
| 497 | } | ||
| 498 | |||
| 499 | /** | ||
| 500 | * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR. | ||
| 501 | * | ||
| 502 | * Pins the range of pages passed in the argument, and maps them to | ||
| 503 | * DMA addresses. The DMA addresses of the mapped pages are updated in | ||
| 504 | * umem->odp_data->dma_list. | ||
| 505 | * | ||
| 506 | * Returns the number of pages mapped in success, negative error code | ||
| 507 | * for failure. | ||
| 508 | * An -EAGAIN error code is returned when a concurrent mmu notifier prevents | ||
| 509 | * the function from completing its task. | ||
| 510 | * | ||
| 511 | * @umem: the umem to map and pin | ||
| 512 | * @user_virt: the address from which we need to map. | ||
| 513 | * @bcnt: the minimal number of bytes to pin and map. The mapping might be | ||
| 514 | * bigger due to alignment, and may also be smaller in case of an error | ||
| 515 | * pinning or mapping a page. The actual number of pages mapped is returned in | ||
| 516 | * the return value. | ||
| 517 | * @access_mask: bit mask of the requested access permissions for the given | ||
| 518 | * range. | ||
| 519 | * @current_seq: the MMU notifiers sequence value for synchronization with | ||
| 520 | * invalidations. The sequence number is read from | ||
| 521 | * umem->odp_data->notifiers_seq before calling this function | ||
| 522 | */ | ||
| 523 | int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, | ||
| 524 | u64 access_mask, unsigned long current_seq) | ||
| 525 | { | ||
| 526 | struct task_struct *owning_process = NULL; | ||
| 527 | struct mm_struct *owning_mm = NULL; | ||
| 528 | struct page **local_page_list = NULL; | ||
| 529 | u64 off; | ||
| 530 | int j, k, ret = 0, start_idx, npages = 0; | ||
| 531 | u64 base_virt_addr; | ||
| 532 | |||
| 533 | if (access_mask == 0) | ||
| 534 | return -EINVAL; | ||
| 535 | |||
| 536 | if (user_virt < ib_umem_start(umem) || | ||
| 537 | user_virt + bcnt > ib_umem_end(umem)) | ||
| 538 | return -EFAULT; | ||
| 539 | |||
| 540 | local_page_list = (struct page **)__get_free_page(GFP_KERNEL); | ||
| 541 | if (!local_page_list) | ||
| 542 | return -ENOMEM; | ||
| 543 | |||
| 544 | off = user_virt & (~PAGE_MASK); | ||
| 545 | user_virt = user_virt & PAGE_MASK; | ||
| 546 | base_virt_addr = user_virt; | ||
| 547 | bcnt += off; /* Charge for the first page offset as well. */ | ||
| 548 | |||
| 549 | owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); | ||
| 550 | if (owning_process == NULL) { | ||
| 551 | ret = -EINVAL; | ||
| 552 | goto out_no_task; | ||
| 553 | } | ||
| 554 | |||
| 555 | owning_mm = get_task_mm(owning_process); | ||
| 556 | if (owning_mm == NULL) { | ||
| 557 | ret = -EINVAL; | ||
| 558 | goto out_put_task; | ||
| 559 | } | ||
| 560 | |||
| 561 | start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT; | ||
| 562 | k = start_idx; | ||
| 563 | |||
| 564 | while (bcnt > 0) { | ||
| 565 | const size_t gup_num_pages = | ||
| 566 | min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE, | ||
| 567 | PAGE_SIZE / sizeof(struct page *)); | ||
| 568 | |||
| 569 | down_read(&owning_mm->mmap_sem); | ||
| 570 | /* | ||
| 571 | * Note: this might result in redundant page getting. We can | ||
| 572 | * avoid this by checking dma_list to be 0 before calling | ||
| 573 | * get_user_pages. However, this makes the code much more | ||
| 574 | * complex (and doesn't gain us much performance in most use | ||
| 575 | * cases). | ||
| 576 | */ | ||
| 577 | npages = get_user_pages(owning_process, owning_mm, user_virt, | ||
| 578 | gup_num_pages, | ||
| 579 | access_mask & ODP_WRITE_ALLOWED_BIT, 0, | ||
| 580 | local_page_list, NULL); | ||
| 581 | up_read(&owning_mm->mmap_sem); | ||
| 582 | |||
| 583 | if (npages < 0) | ||
| 584 | break; | ||
| 585 | |||
| 586 | bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); | ||
| 587 | user_virt += npages << PAGE_SHIFT; | ||
| 588 | for (j = 0; j < npages; ++j) { | ||
| 589 | ret = ib_umem_odp_map_dma_single_page( | ||
| 590 | umem, k, base_virt_addr, local_page_list[j], | ||
| 591 | access_mask, current_seq); | ||
| 592 | if (ret < 0) | ||
| 593 | break; | ||
| 594 | k++; | ||
| 595 | } | ||
| 596 | |||
| 597 | if (ret < 0) { | ||
| 598 | /* Release left over pages when handling errors. */ | ||
| 599 | for (++j; j < npages; ++j) | ||
| 600 | put_page(local_page_list[j]); | ||
| 601 | break; | ||
| 602 | } | ||
| 603 | } | ||
| 604 | |||
| 605 | if (ret >= 0) { | ||
| 606 | if (npages < 0 && k == start_idx) | ||
| 607 | ret = npages; | ||
| 608 | else | ||
| 609 | ret = k - start_idx; | ||
| 610 | } | ||
| 611 | |||
| 612 | mmput(owning_mm); | ||
| 613 | out_put_task: | ||
| 614 | put_task_struct(owning_process); | ||
| 615 | out_no_task: | ||
| 616 | free_page((unsigned long)local_page_list); | ||
| 617 | return ret; | ||
| 618 | } | ||
| 619 | EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); | ||
| 620 | |||
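To see how the exported entry point is meant to be driven, here is a sketch of a driver-side page-fault handler (hypothetical; the real consumer is the mlx5 driver later in the series). It samples notifiers_seq before calling ib_umem_odp_map_dma_pages() and treats -EAGAIN as "an invalidation raced us, let the fault be retried":

#include <rdma/ib_umem_odp.h>

static int hypothetical_fault_handler(struct ib_umem *umem, u64 io_virt,
                                      size_t bcnt, u64 access_mask)
{
        unsigned long current_seq;
        int npages;

        current_seq = ACCESS_ONCE(umem->odp_data->notifiers_seq);
        /* Pairs with the seq increment in ib_umem_notifier_end_account(). */
        smp_rmb();

        npages = ib_umem_odp_map_dma_pages(umem, io_virt, bcnt,
                                           access_mask, current_seq);
        if (npages == -EAGAIN)
                return 0;               /* the fault will simply be raised again */
        if (npages < 0)
                return npages;

        /* ... program the npages new translations into the HCA here ... */
        return npages;
}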
| 621 | void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, | ||
| 622 | u64 bound) | ||
| 623 | { | ||
| 624 | int idx; | ||
| 625 | u64 addr; | ||
| 626 | struct ib_device *dev = umem->context->device; | ||
| 627 | |||
| 628 | virt = max_t(u64, virt, ib_umem_start(umem)); | ||
| 629 | bound = min_t(u64, bound, ib_umem_end(umem)); | ||
| 630 | /* Note that during the run of this function, the | ||
| 631 | * notifiers_count of the MR is > 0, preventing any racing | ||
| 632 | * faults from completion. We might be racing with other | ||
| 633 | * invalidations, so we must make sure we free each page only | ||
| 634 | * once. */ | ||
| 635 | for (addr = virt; addr < bound; addr += (u64)umem->page_size) { | ||
| 636 | idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; | ||
| 637 | mutex_lock(&umem->odp_data->umem_mutex); | ||
| 638 | if (umem->odp_data->page_list[idx]) { | ||
| 639 | struct page *page = umem->odp_data->page_list[idx]; | ||
| 640 | struct page *head_page = compound_head(page); | ||
| 641 | dma_addr_t dma = umem->odp_data->dma_list[idx]; | ||
| 642 | dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; | ||
| 643 | |||
| 644 | WARN_ON(!dma_addr); | ||
| 645 | |||
| 646 | ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE, | ||
| 647 | DMA_BIDIRECTIONAL); | ||
| 648 | if (dma & ODP_WRITE_ALLOWED_BIT) | ||
| 649 | /* | ||
| 650 | * set_page_dirty prefers being called with | ||
| 651 | * the page lock. However, MMU notifiers are | ||
| 652 | * called sometimes with and sometimes without | ||
| 653 | * the lock. We rely on the umem_mutex instead | ||
| 654 | * to prevent other mmu notifiers from | ||
| 655 | * continuing and allowing the page mapping to | ||
| 656 | * be removed. | ||
| 657 | */ | ||
| 658 | set_page_dirty(head_page); | ||
| 659 | /* on demand pinning support */ | ||
| 660 | if (!umem->context->invalidate_range) | ||
| 661 | put_page(page); | ||
| 662 | umem->odp_data->page_list[idx] = NULL; | ||
| 663 | umem->odp_data->dma_list[idx] = 0; | ||
| 664 | } | ||
| 665 | mutex_unlock(&umem->odp_data->umem_mutex); | ||
| 666 | } | ||
| 667 | } | ||
| 668 | EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); | ||
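On the invalidation side, the core does not unmap pages by itself from the mmu notifier callbacks; it calls the per-context invalidate_range hook, and the driver that owns the HCA translations is expected to zap them and then hand the range back to ib_umem_odp_unmap_dma_pages(). A minimal, hypothetical sketch of such a hook, with its signature following the context->invalidate_range calls made by the trampolines above:

#include <rdma/ib_umem_odp.h>

/* Hypothetical driver hook; plugged into ib_ucontext->invalidate_range. */
static void hypothetical_invalidate_range(struct ib_umem *umem,
                                          unsigned long start,
                                          unsigned long end)
{
        /* 1. Zap the HCA's translations for [start, end) so the device
         *    cannot touch the pages once they are given back to the VM. */

        /* 2. Drop the core's DMA mappings and page references. */
        ib_umem_odp_unmap_dma_pages(umem, start, end);
}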
diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c
new file mode 100644
index 000000000000..727d788448f5
--- /dev/null
+++ b/drivers/infiniband/core/umem_rbtree.c
| @@ -0,0 +1,94 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2014 Mellanox Technologies. All rights reserved. | ||
| 3 | * | ||
| 4 | * This software is available to you under a choice of one of two | ||
| 5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
| 6 | * General Public License (GPL) Version 2, available from the file | ||
| 7 | * COPYING in the main directory of this source tree, or the | ||
| 8 | * OpenIB.org BSD license below: | ||
| 9 | * | ||
| 10 | * Redistribution and use in source and binary forms, with or | ||
| 11 | * without modification, are permitted provided that the following | ||
| 12 | * conditions are met: | ||
| 13 | * | ||
| 14 | * - Redistributions of source code must retain the above | ||
| 15 | * copyright notice, this list of conditions and the following | ||
| 16 | * disclaimer. | ||
| 17 | * | ||
| 18 | * - Redistributions in binary form must reproduce the above | ||
| 19 | * copyright notice, this list of conditions and the following | ||
| 20 | * disclaimer in the documentation and/or other materials | ||
| 21 | * provided with the distribution. | ||
| 22 | * | ||
| 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
| 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
| 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
| 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
| 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 30 | * SOFTWARE. | ||
| 31 | */ | ||
| 32 | |||
| 33 | #include <linux/kernel.h> | ||
| 34 | #include <linux/module.h> | ||
| 35 | #include <linux/interval_tree_generic.h> | ||
| 36 | #include <linux/sched.h> | ||
| 37 | #include <linux/gfp.h> | ||
| 38 | #include <rdma/ib_umem_odp.h> | ||
| 39 | |||
| 40 | /* | ||
| 41 | * The ib_umem list keeps track of memory regions for which the HW | ||
| 42 | * device requests to receive a notification when the related memory | ||
| 43 | * mapping is changed. | ||
| 44 | * | ||
| 45 | * ib_umem_lock protects the list. | ||
| 46 | */ | ||
| 47 | |||
| 48 | static inline u64 node_start(struct umem_odp_node *n) | ||
| 49 | { | ||
| 50 | struct ib_umem_odp *umem_odp = | ||
| 51 | container_of(n, struct ib_umem_odp, interval_tree); | ||
| 52 | |||
| 53 | return ib_umem_start(umem_odp->umem); | ||
| 54 | } | ||
| 55 | |||
| 56 | /* Note that the representation of the intervals in the interval tree | ||
| 57 | * considers the ending point as contained in the interval, while the | ||
| 58 | * function ib_umem_end returns the first address which is not contained | ||
| 59 | * in the umem. | ||
| 60 | */ | ||
| 61 | static inline u64 node_last(struct umem_odp_node *n) | ||
| 62 | { | ||
| 63 | struct ib_umem_odp *umem_odp = | ||
| 64 | container_of(n, struct ib_umem_odp, interval_tree); | ||
| 65 | |||
| 66 | return ib_umem_end(umem_odp->umem) - 1; | ||
| 67 | } | ||
| 68 | |||
| 69 | INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, | ||
| 70 | node_start, node_last, , rbt_ib_umem) | ||
| 71 | |||
| 72 | /* @last is not a part of the interval. See comment for function | ||
| 73 | * node_last. | ||
| 74 | */ | ||
| 75 | int rbt_ib_umem_for_each_in_range(struct rb_root *root, | ||
| 76 | u64 start, u64 last, | ||
| 77 | umem_call_back cb, | ||
| 78 | void *cookie) | ||
| 79 | { | ||
| 80 | int ret_val = 0; | ||
| 81 | struct umem_odp_node *node; | ||
| 82 | struct ib_umem_odp *umem; | ||
| 83 | |||
| 84 | if (unlikely(start == last)) | ||
| 85 | return ret_val; | ||
| 86 | |||
| 87 | for (node = rbt_ib_umem_iter_first(root, start, last - 1); node; | ||
| 88 | node = rbt_ib_umem_iter_next(node, start, last - 1)) { | ||
| 89 | umem = container_of(node, struct ib_umem_odp, interval_tree); | ||
| 90 | ret_val = cb(umem->umem, start, last, cookie) || ret_val; | ||
| 91 | } | ||
| 92 | |||
| 93 | return ret_val; | ||
| 94 | } | ||
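The off-by-one between the two conventions is easy to get wrong, so here is a standalone illustration of why the iteration above queries with last - 1: the tree stores closed intervals [start, last], while a umem covers the half-open range [start, end). Plain C, arbitrary page numbers:

#include <stdio.h>

int main(void)
{
        unsigned long page = 4096;
        /* A two-page umem: half-open [start, end), closed [start, last]. */
        unsigned long start = 10 * page, end = 12 * page, last = end - 1;

        /* Query for the page immediately after the umem. */
        unsigned long q_start = end, q_end = end + page;

        int closed = (start <= q_end - 1) && (q_start <= last);
        int sloppy = (start <= q_end)     && (q_start <= end);

        printf("closed-interval overlap: %d\n", closed); /* 0: correct     */
        printf("end-inclusive overlap:   %d\n", sloppy); /* 1: false match */
        return 0;
}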
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 643c08a025a5..b716b0815644 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
| @@ -258,5 +258,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd); | |||
| 258 | 258 | ||
| 259 | IB_UVERBS_DECLARE_EX_CMD(create_flow); | 259 | IB_UVERBS_DECLARE_EX_CMD(create_flow); |
| 260 | IB_UVERBS_DECLARE_EX_CMD(destroy_flow); | 260 | IB_UVERBS_DECLARE_EX_CMD(destroy_flow); |
| 261 | IB_UVERBS_DECLARE_EX_CMD(query_device); | ||
| 261 | 262 | ||
| 262 | #endif /* UVERBS_H */ | 263 | #endif /* UVERBS_H */ |
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 5ba2a86aab6a..532d8eba8b02 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
| @@ -36,6 +36,7 @@ | |||
| 36 | #include <linux/file.h> | 36 | #include <linux/file.h> |
| 37 | #include <linux/fs.h> | 37 | #include <linux/fs.h> |
| 38 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
| 39 | #include <linux/sched.h> | ||
| 39 | 40 | ||
| 40 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
| 41 | 42 | ||
| @@ -288,6 +289,9 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, | |||
| 288 | struct ib_uverbs_get_context_resp resp; | 289 | struct ib_uverbs_get_context_resp resp; |
| 289 | struct ib_udata udata; | 290 | struct ib_udata udata; |
| 290 | struct ib_device *ibdev = file->device->ib_dev; | 291 | struct ib_device *ibdev = file->device->ib_dev; |
| 292 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 293 | struct ib_device_attr dev_attr; | ||
| 294 | #endif | ||
| 291 | struct ib_ucontext *ucontext; | 295 | struct ib_ucontext *ucontext; |
| 292 | struct file *filp; | 296 | struct file *filp; |
| 293 | int ret; | 297 | int ret; |
| @@ -325,8 +329,25 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, | |||
| 325 | INIT_LIST_HEAD(&ucontext->ah_list); | 329 | INIT_LIST_HEAD(&ucontext->ah_list); |
| 326 | INIT_LIST_HEAD(&ucontext->xrcd_list); | 330 | INIT_LIST_HEAD(&ucontext->xrcd_list); |
| 327 | INIT_LIST_HEAD(&ucontext->rule_list); | 331 | INIT_LIST_HEAD(&ucontext->rule_list); |
| 332 | rcu_read_lock(); | ||
| 333 | ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); | ||
| 334 | rcu_read_unlock(); | ||
| 328 | ucontext->closing = 0; | 335 | ucontext->closing = 0; |
| 329 | 336 | ||
| 337 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 338 | ucontext->umem_tree = RB_ROOT; | ||
| 339 | init_rwsem(&ucontext->umem_rwsem); | ||
| 340 | ucontext->odp_mrs_count = 0; | ||
| 341 | INIT_LIST_HEAD(&ucontext->no_private_counters); | ||
| 342 | |||
| 343 | ret = ib_query_device(ibdev, &dev_attr); | ||
| 344 | if (ret) | ||
| 345 | goto err_free; | ||
| 346 | if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) | ||
| 347 | ucontext->invalidate_range = NULL; | ||
| 348 | |||
| 349 | #endif | ||
| 350 | |||
| 330 | resp.num_comp_vectors = file->device->num_comp_vectors; | 351 | resp.num_comp_vectors = file->device->num_comp_vectors; |
| 331 | 352 | ||
| 332 | ret = get_unused_fd_flags(O_CLOEXEC); | 353 | ret = get_unused_fd_flags(O_CLOEXEC); |
| @@ -371,6 +392,7 @@ err_fd: | |||
| 371 | put_unused_fd(resp.async_fd); | 392 | put_unused_fd(resp.async_fd); |
| 372 | 393 | ||
| 373 | err_free: | 394 | err_free: |
| 395 | put_pid(ucontext->tgid); | ||
| 374 | ibdev->dealloc_ucontext(ucontext); | 396 | ibdev->dealloc_ucontext(ucontext); |
| 375 | 397 | ||
| 376 | err: | 398 | err: |
| @@ -378,6 +400,52 @@ err: | |||
| 378 | return ret; | 400 | return ret; |
| 379 | } | 401 | } |
| 380 | 402 | ||
| 403 | static void copy_query_dev_fields(struct ib_uverbs_file *file, | ||
| 404 | struct ib_uverbs_query_device_resp *resp, | ||
| 405 | struct ib_device_attr *attr) | ||
| 406 | { | ||
| 407 | resp->fw_ver = attr->fw_ver; | ||
| 408 | resp->node_guid = file->device->ib_dev->node_guid; | ||
| 409 | resp->sys_image_guid = attr->sys_image_guid; | ||
| 410 | resp->max_mr_size = attr->max_mr_size; | ||
| 411 | resp->page_size_cap = attr->page_size_cap; | ||
| 412 | resp->vendor_id = attr->vendor_id; | ||
| 413 | resp->vendor_part_id = attr->vendor_part_id; | ||
| 414 | resp->hw_ver = attr->hw_ver; | ||
| 415 | resp->max_qp = attr->max_qp; | ||
| 416 | resp->max_qp_wr = attr->max_qp_wr; | ||
| 417 | resp->device_cap_flags = attr->device_cap_flags; | ||
| 418 | resp->max_sge = attr->max_sge; | ||
| 419 | resp->max_sge_rd = attr->max_sge_rd; | ||
| 420 | resp->max_cq = attr->max_cq; | ||
| 421 | resp->max_cqe = attr->max_cqe; | ||
| 422 | resp->max_mr = attr->max_mr; | ||
| 423 | resp->max_pd = attr->max_pd; | ||
| 424 | resp->max_qp_rd_atom = attr->max_qp_rd_atom; | ||
| 425 | resp->max_ee_rd_atom = attr->max_ee_rd_atom; | ||
| 426 | resp->max_res_rd_atom = attr->max_res_rd_atom; | ||
| 427 | resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom; | ||
| 428 | resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom; | ||
| 429 | resp->atomic_cap = attr->atomic_cap; | ||
| 430 | resp->max_ee = attr->max_ee; | ||
| 431 | resp->max_rdd = attr->max_rdd; | ||
| 432 | resp->max_mw = attr->max_mw; | ||
| 433 | resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp; | ||
| 434 | resp->max_raw_ethy_qp = attr->max_raw_ethy_qp; | ||
| 435 | resp->max_mcast_grp = attr->max_mcast_grp; | ||
| 436 | resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; | ||
| 437 | resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; | ||
| 438 | resp->max_ah = attr->max_ah; | ||
| 439 | resp->max_fmr = attr->max_fmr; | ||
| 440 | resp->max_map_per_fmr = attr->max_map_per_fmr; | ||
| 441 | resp->max_srq = attr->max_srq; | ||
| 442 | resp->max_srq_wr = attr->max_srq_wr; | ||
| 443 | resp->max_srq_sge = attr->max_srq_sge; | ||
| 444 | resp->max_pkeys = attr->max_pkeys; | ||
| 445 | resp->local_ca_ack_delay = attr->local_ca_ack_delay; | ||
| 446 | resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt; | ||
| 447 | } | ||
| 448 | |||
| 381 | ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, | 449 | ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, |
| 382 | const char __user *buf, | 450 | const char __user *buf, |
| 383 | int in_len, int out_len) | 451 | int in_len, int out_len) |
| @@ -398,47 +466,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, | |||
| 398 | return ret; | 466 | return ret; |
| 399 | 467 | ||
| 400 | memset(&resp, 0, sizeof resp); | 468 | memset(&resp, 0, sizeof resp); |
| 401 | 469 | copy_query_dev_fields(file, &resp, &attr); | |
| 402 | resp.fw_ver = attr.fw_ver; | ||
| 403 | resp.node_guid = file->device->ib_dev->node_guid; | ||
| 404 | resp.sys_image_guid = attr.sys_image_guid; | ||
| 405 | resp.max_mr_size = attr.max_mr_size; | ||
| 406 | resp.page_size_cap = attr.page_size_cap; | ||
| 407 | resp.vendor_id = attr.vendor_id; | ||
| 408 | resp.vendor_part_id = attr.vendor_part_id; | ||
| 409 | resp.hw_ver = attr.hw_ver; | ||
| 410 | resp.max_qp = attr.max_qp; | ||
| 411 | resp.max_qp_wr = attr.max_qp_wr; | ||
| 412 | resp.device_cap_flags = attr.device_cap_flags; | ||
| 413 | resp.max_sge = attr.max_sge; | ||
| 414 | resp.max_sge_rd = attr.max_sge_rd; | ||
| 415 | resp.max_cq = attr.max_cq; | ||
| 416 | resp.max_cqe = attr.max_cqe; | ||
| 417 | resp.max_mr = attr.max_mr; | ||
| 418 | resp.max_pd = attr.max_pd; | ||
| 419 | resp.max_qp_rd_atom = attr.max_qp_rd_atom; | ||
| 420 | resp.max_ee_rd_atom = attr.max_ee_rd_atom; | ||
| 421 | resp.max_res_rd_atom = attr.max_res_rd_atom; | ||
| 422 | resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom; | ||
| 423 | resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom; | ||
| 424 | resp.atomic_cap = attr.atomic_cap; | ||
| 425 | resp.max_ee = attr.max_ee; | ||
| 426 | resp.max_rdd = attr.max_rdd; | ||
| 427 | resp.max_mw = attr.max_mw; | ||
| 428 | resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp; | ||
| 429 | resp.max_raw_ethy_qp = attr.max_raw_ethy_qp; | ||
| 430 | resp.max_mcast_grp = attr.max_mcast_grp; | ||
| 431 | resp.max_mcast_qp_attach = attr.max_mcast_qp_attach; | ||
| 432 | resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach; | ||
| 433 | resp.max_ah = attr.max_ah; | ||
| 434 | resp.max_fmr = attr.max_fmr; | ||
| 435 | resp.max_map_per_fmr = attr.max_map_per_fmr; | ||
| 436 | resp.max_srq = attr.max_srq; | ||
| 437 | resp.max_srq_wr = attr.max_srq_wr; | ||
| 438 | resp.max_srq_sge = attr.max_srq_sge; | ||
| 439 | resp.max_pkeys = attr.max_pkeys; | ||
| 440 | resp.local_ca_ack_delay = attr.local_ca_ack_delay; | ||
| 441 | resp.phys_port_cnt = file->device->ib_dev->phys_port_cnt; | ||
| 442 | 470 | ||
| 443 | if (copy_to_user((void __user *) (unsigned long) cmd.response, | 471 | if (copy_to_user((void __user *) (unsigned long) cmd.response, |
| 444 | &resp, sizeof resp)) | 472 | &resp, sizeof resp)) |
| @@ -947,6 +975,18 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, | |||
| 947 | goto err_free; | 975 | goto err_free; |
| 948 | } | 976 | } |
| 949 | 977 | ||
| 978 | if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { | ||
| 979 | struct ib_device_attr attr; | ||
| 980 | |||
| 981 | ret = ib_query_device(pd->device, &attr); | ||
| 982 | if (ret || !(attr.device_cap_flags & | ||
| 983 | IB_DEVICE_ON_DEMAND_PAGING)) { | ||
| 984 | pr_debug("ODP support not available\n"); | ||
| 985 | ret = -EINVAL; | ||
| 986 | goto err_put; | ||
| 987 | } | ||
| 988 | } | ||
| 989 | |||
| 950 | mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, | 990 | mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, |
| 951 | cmd.access_flags, &udata); | 991 | cmd.access_flags, &udata); |
| 952 | if (IS_ERR(mr)) { | 992 | if (IS_ERR(mr)) { |
| @@ -3253,3 +3293,52 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, | |||
| 3253 | 3293 | ||
| 3254 | return ret ? ret : in_len; | 3294 | return ret ? ret : in_len; |
| 3255 | } | 3295 | } |
| 3296 | |||
| 3297 | int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, | ||
| 3298 | struct ib_udata *ucore, | ||
| 3299 | struct ib_udata *uhw) | ||
| 3300 | { | ||
| 3301 | struct ib_uverbs_ex_query_device_resp resp; | ||
| 3302 | struct ib_uverbs_ex_query_device cmd; | ||
| 3303 | struct ib_device_attr attr; | ||
| 3304 | struct ib_device *device; | ||
| 3305 | int err; | ||
| 3306 | |||
| 3307 | device = file->device->ib_dev; | ||
| 3308 | if (ucore->inlen < sizeof(cmd)) | ||
| 3309 | return -EINVAL; | ||
| 3310 | |||
| 3311 | err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); | ||
| 3312 | if (err) | ||
| 3313 | return err; | ||
| 3314 | |||
| 3315 | if (cmd.reserved) | ||
| 3316 | return -EINVAL; | ||
| 3317 | |||
| 3318 | err = device->query_device(device, &attr); | ||
| 3319 | if (err) | ||
| 3320 | return err; | ||
| 3321 | |||
| 3322 | memset(&resp, 0, sizeof(resp)); | ||
| 3323 | copy_query_dev_fields(file, &resp.base, &attr); | ||
| 3324 | resp.comp_mask = 0; | ||
| 3325 | |||
| 3326 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 3327 | if (cmd.comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP) { | ||
| 3328 | resp.odp_caps.general_caps = attr.odp_caps.general_caps; | ||
| 3329 | resp.odp_caps.per_transport_caps.rc_odp_caps = | ||
| 3330 | attr.odp_caps.per_transport_caps.rc_odp_caps; | ||
| 3331 | resp.odp_caps.per_transport_caps.uc_odp_caps = | ||
| 3332 | attr.odp_caps.per_transport_caps.uc_odp_caps; | ||
| 3333 | resp.odp_caps.per_transport_caps.ud_odp_caps = | ||
| 3334 | attr.odp_caps.per_transport_caps.ud_odp_caps; | ||
| 3335 | resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP; | ||
| 3336 | } | ||
| 3337 | #endif | ||
| 3338 | |||
| 3339 | err = ib_copy_to_udata(ucore, &resp, sizeof(resp)); | ||
| 3340 | if (err) | ||
| 3341 | return err; | ||
| 3342 | |||
| 3343 | return 0; | ||
| 3344 | } | ||
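From userspace, the extended query surfaces as the odp_caps field of the extended device attributes. A sketch of the corresponding consumer, assuming a libibverbs that provides ibv_query_device_ex(), struct ibv_odp_caps and IBV_ODP_SUPPORT (all outside this kernel patch):

#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
        struct ibv_device **list = ibv_get_device_list(NULL);
        struct ibv_context *ctx;
        struct ibv_device_attr_ex attr;

        if (!list || !list[0])
                return 1;
        ctx = ibv_open_device(list[0]);
        if (!ctx)
                return 1;

        if (!ibv_query_device_ex(ctx, NULL, &attr) &&
            (attr.odp_caps.general_caps & IBV_ODP_SUPPORT))
                printf("ODP supported, RC transport caps: 0x%x\n",
                       attr.odp_caps.per_transport_caps.rc_odp_caps);
        else
                printf("ODP not supported\n");

        ibv_close_device(ctx);
        ibv_free_device_list(list);
        return 0;
}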
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 71ab83fde472..e6c23b9eab33 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
| @@ -122,7 +122,8 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, | |||
| 122 | struct ib_udata *ucore, | 122 | struct ib_udata *ucore, |
| 123 | struct ib_udata *uhw) = { | 123 | struct ib_udata *uhw) = { |
| 124 | [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, | 124 | [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, |
| 125 | [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow | 125 | [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow, |
| 126 | [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device | ||
| 126 | }; | 127 | }; |
| 127 | 128 | ||
| 128 | static void ib_uverbs_add_one(struct ib_device *device); | 129 | static void ib_uverbs_add_one(struct ib_device *device); |
| @@ -296,6 +297,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, | |||
| 296 | kfree(uobj); | 297 | kfree(uobj); |
| 297 | } | 298 | } |
| 298 | 299 | ||
| 300 | put_pid(context->tgid); | ||
| 301 | |||
| 299 | return context->device->dealloc_ucontext(context); | 302 | return context->device->dealloc_ucontext(context); |
| 300 | } | 303 | } |
| 301 | 304 | ||
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index c2b89cc5dbca..f93eb8da7b5a 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
| @@ -879,7 +879,8 @@ int ib_resolve_eth_l2_attrs(struct ib_qp *qp, | |||
| 879 | if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) { | 879 | if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) { |
| 880 | rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac); | 880 | rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac); |
| 881 | rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac); | 881 | rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac); |
| 882 | qp_attr->vlan_id = rdma_get_vlan_id(&sgid); | 882 | if (!(*qp_attr_mask & IB_QP_VID)) |
| 883 | qp_attr->vlan_id = rdma_get_vlan_id(&sgid); | ||
| 883 | } else { | 884 | } else { |
| 884 | ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid, | 885 | ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid, |
| 885 | qp_attr->ah_attr.dmac, &qp_attr->vlan_id); | 886 | qp_attr->ah_attr.dmac, &qp_attr->vlan_id); |
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c index 2d5cbf4363e4..bdf3507810cb 100644 --- a/drivers/infiniband/hw/amso1100/c2_provider.c +++ b/drivers/infiniband/hw/amso1100/c2_provider.c | |||
| @@ -476,7 +476,7 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
| 476 | c2mr->umem->page_size, | 476 | c2mr->umem->page_size, |
| 477 | i, | 477 | i, |
| 478 | length, | 478 | length, |
| 479 | c2mr->umem->offset, | 479 | ib_umem_offset(c2mr->umem), |
| 480 | &kva, | 480 | &kva, |
| 481 | c2_convert_access(acc), | 481 | c2_convert_access(acc), |
| 482 | c2mr); | 482 | c2mr); |
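Several drivers in this series (amso1100 here, ehca and ipath further down) switch from reading umem->offset directly to the ib_umem_offset() helper, so the umem layout can change without touching every driver. A small model of that accessor style is below; the field names and the page-mask arithmetic are assumptions for illustration, not the exact ib_umem definition.

/* Model of an ib_umem_offset()-style accessor: callers ask a helper for
 * the start offset within the first page instead of touching a struct
 * field directly. Field names and the mask arithmetic are assumptions. */
#include <stdint.h>
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096ULL

struct model_umem {
	uint64_t address;   /* user virtual address of the registration */
	uint64_t length;    /* length of the registration in bytes */
};

/* Offset of the registration start inside its first page. */
static inline uint64_t model_umem_offset(const struct model_umem *umem)
{
	return umem->address & (MODEL_PAGE_SIZE - 1);
}

int main(void)
{
	struct model_umem umem = { .address = 0x7f12345678ab, .length = 1 << 20 };

	printf("offset in first page: %llu\n",
	       (unsigned long long)model_umem_offset(&umem));
	return 0;
}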
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 4b8c6116c058..9edc200b311d 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c | |||
| @@ -1640,7 +1640,8 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) | |||
| 1640 | __state_set(&ep->com, MPA_REQ_RCVD); | 1640 | __state_set(&ep->com, MPA_REQ_RCVD); |
| 1641 | 1641 | ||
| 1642 | /* drive upcall */ | 1642 | /* drive upcall */ |
| 1643 | mutex_lock(&ep->parent_ep->com.mutex); | 1643 | mutex_lock_nested(&ep->parent_ep->com.mutex, |
| 1644 | SINGLE_DEPTH_NESTING); | ||
| 1644 | if (ep->parent_ep->com.state != DEAD) { | 1645 | if (ep->parent_ep->com.state != DEAD) { |
| 1645 | if (connect_request_upcall(ep)) | 1646 | if (connect_request_upcall(ep)) |
| 1646 | abort_connection(ep, skb, GFP_KERNEL); | 1647 | abort_connection(ep, skb, GFP_KERNEL); |
| @@ -3126,6 +3127,8 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) | |||
| 3126 | err = c4iw_wait_for_reply(&ep->com.dev->rdev, | 3127 | err = c4iw_wait_for_reply(&ep->com.dev->rdev, |
| 3127 | &ep->com.wr_wait, | 3128 | &ep->com.wr_wait, |
| 3128 | 0, 0, __func__); | 3129 | 0, 0, __func__); |
| 3130 | else if (err > 0) | ||
| 3131 | err = net_xmit_errno(err); | ||
| 3129 | if (err) | 3132 | if (err) |
| 3130 | pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n", | 3133 | pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n", |
| 3131 | err, ep->stid, | 3134 | err, ep->stid, |
| @@ -3159,6 +3162,8 @@ static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) | |||
| 3159 | err = c4iw_wait_for_reply(&ep->com.dev->rdev, | 3162 | err = c4iw_wait_for_reply(&ep->com.dev->rdev, |
| 3160 | &ep->com.wr_wait, | 3163 | &ep->com.wr_wait, |
| 3161 | 0, 0, __func__); | 3164 | 0, 0, __func__); |
| 3165 | else if (err > 0) | ||
| 3166 | err = net_xmit_errno(err); | ||
| 3162 | } | 3167 | } |
| 3163 | if (err) | 3168 | if (err) |
| 3164 | pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n" | 3169 | pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n" |
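Two fixes land in cm.c above: the parent endpoint's mutex is taken with mutex_lock_nested(..., SINGLE_DEPTH_NESTING) so lockdep accepts holding two locks of the same class, and the create_server paths convert a positive transmit status into a negative errno before the generic "if (err)" check. A user-space model of that second normalization follows; the NET_XMIT-style constants and the mapping rule are assumptions for illustration, not quoted from the kernel headers.

/* Model of normalizing a transmit status into the usual "0 or negative
 * errno" convention before a generic `if (err)` check, as the
 * create_server hunks above do with net_xmit_errno(). The status values
 * and the mapping rule here are illustrative assumptions. */
#include <errno.h>
#include <stdio.h>

enum {
	XMIT_SUCCESS = 0,   /* packet queued */
	XMIT_CN      = 1,   /* congestion notification, not a hard failure */
	XMIT_DROP    = 2,   /* packet dropped */
};

/* Positive status -> 0 or -errno; negative errnos pass through. */
static int xmit_status_to_errno(int status)
{
	if (status < 0)
		return status;
	if (status == XMIT_SUCCESS || status == XMIT_CN)
		return 0;
	return -ENOBUFS;
}

int main(void)
{
	int codes[] = { XMIT_SUCCESS, XMIT_CN, XMIT_DROP, -EIO };

	for (unsigned i = 0; i < sizeof(codes) / sizeof(codes[0]); i++)
		printf("status %d -> err %d\n", codes[i],
		       xmit_status_to_errno(codes[i]));
	return 0;
}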
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 72f1f052e88c..eb5df4e62703 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c | |||
| @@ -670,7 +670,7 @@ static int ep_open(struct inode *inode, struct file *file) | |||
| 670 | idr_for_each(&epd->devp->stid_idr, count_idrs, &count); | 670 | idr_for_each(&epd->devp->stid_idr, count_idrs, &count); |
| 671 | spin_unlock_irq(&epd->devp->lock); | 671 | spin_unlock_irq(&epd->devp->lock); |
| 672 | 672 | ||
| 673 | epd->bufsize = count * 160; | 673 | epd->bufsize = count * 240; |
| 674 | epd->buf = vmalloc(epd->bufsize); | 674 | epd->buf = vmalloc(epd->bufsize); |
| 675 | if (!epd->buf) { | 675 | if (!epd->buf) { |
| 676 | ret = -ENOMEM; | 676 | ret = -ENOMEM; |
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 0744455cd88b..cb43c2299ac0 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c | |||
| @@ -50,6 +50,13 @@ static int inline_threshold = C4IW_INLINE_THRESHOLD; | |||
| 50 | module_param(inline_threshold, int, 0644); | 50 | module_param(inline_threshold, int, 0644); |
| 51 | MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)"); | 51 | MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)"); |
| 52 | 52 | ||
| 53 | static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length) | ||
| 54 | { | ||
| 55 | return (is_t4(dev->rdev.lldi.adapter_type) || | ||
| 56 | is_t5(dev->rdev.lldi.adapter_type)) && | ||
| 57 | length >= 8*1024*1024*1024ULL; | ||
| 58 | } | ||
| 59 | |||
| 53 | static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, | 60 | static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, |
| 54 | u32 len, dma_addr_t data, int wait) | 61 | u32 len, dma_addr_t data, int wait) |
| 55 | { | 62 | { |
| @@ -369,9 +376,11 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php, | |||
| 369 | int ret; | 376 | int ret; |
| 370 | 377 | ||
| 371 | ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid, | 378 | ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid, |
| 372 | FW_RI_STAG_NSMR, mhp->attr.perms, | 379 | FW_RI_STAG_NSMR, mhp->attr.len ? |
| 380 | mhp->attr.perms : 0, | ||
| 373 | mhp->attr.mw_bind_enable, mhp->attr.zbva, | 381 | mhp->attr.mw_bind_enable, mhp->attr.zbva, |
| 374 | mhp->attr.va_fbo, mhp->attr.len, shift - 12, | 382 | mhp->attr.va_fbo, mhp->attr.len ? |
| 383 | mhp->attr.len : -1, shift - 12, | ||
| 375 | mhp->attr.pbl_size, mhp->attr.pbl_addr); | 384 | mhp->attr.pbl_size, mhp->attr.pbl_addr); |
| 376 | if (ret) | 385 | if (ret) |
| 377 | return ret; | 386 | return ret; |
| @@ -536,6 +545,11 @@ int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask, | |||
| 536 | return ret; | 545 | return ret; |
| 537 | } | 546 | } |
| 538 | 547 | ||
| 548 | if (mr_exceeds_hw_limits(rhp, total_size)) { | ||
| 549 | kfree(page_list); | ||
| 550 | return -EINVAL; | ||
| 551 | } | ||
| 552 | |||
| 539 | ret = reregister_mem(rhp, php, &mh, shift, npages); | 553 | ret = reregister_mem(rhp, php, &mh, shift, npages); |
| 540 | kfree(page_list); | 554 | kfree(page_list); |
| 541 | if (ret) | 555 | if (ret) |
| @@ -596,6 +610,12 @@ struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd, | |||
| 596 | if (ret) | 610 | if (ret) |
| 597 | goto err; | 611 | goto err; |
| 598 | 612 | ||
| 613 | if (mr_exceeds_hw_limits(rhp, total_size)) { | ||
| 614 | kfree(page_list); | ||
| 615 | ret = -EINVAL; | ||
| 616 | goto err; | ||
| 617 | } | ||
| 618 | |||
| 599 | ret = alloc_pbl(mhp, npages); | 619 | ret = alloc_pbl(mhp, npages); |
| 600 | if (ret) { | 620 | if (ret) { |
| 601 | kfree(page_list); | 621 | kfree(page_list); |
| @@ -699,6 +719,10 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
| 699 | 719 | ||
| 700 | php = to_c4iw_pd(pd); | 720 | php = to_c4iw_pd(pd); |
| 701 | rhp = php->rhp; | 721 | rhp = php->rhp; |
| 722 | |||
| 723 | if (mr_exceeds_hw_limits(rhp, length)) | ||
| 724 | return ERR_PTR(-EINVAL); | ||
| 725 | |||
| 702 | mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); | 726 | mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); |
| 703 | if (!mhp) | 727 | if (!mhp) |
| 704 | return ERR_PTR(-ENOMEM); | 728 | return ERR_PTR(-ENOMEM); |
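mem.c gains mr_exceeds_hw_limits() and calls it at each registration entry point, so T4/T5 adapters reject memory regions of 8 GB or more before any PBL or stag resources are allocated. A stand-alone check with the same boundary arithmetic, with the adapter-type test stubbed out by a flag:

/* Stand-alone version of the 8 GB boundary test added as
 * mr_exceeds_hw_limits(); the adapter-type check is stubbed with a flag. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MR_HW_LIMIT (8ULL * 1024 * 1024 * 1024)   /* 8 GB */

static int mr_exceeds_hw_limits(int is_t4_or_t5, uint64_t length)
{
	return is_t4_or_t5 && length >= MR_HW_LIMIT;
}

int main(void)
{
	/* Just under the limit is accepted, the limit itself is rejected. */
	assert(!mr_exceeds_hw_limits(1, MR_HW_LIMIT - 1));
	assert(mr_exceeds_hw_limits(1, MR_HW_LIMIT));
	/* Other adapter generations are not limited by this check. */
	assert(!mr_exceeds_hw_limits(0, MR_HW_LIMIT * 2));
	puts("boundary checks hold");
	return 0;
}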
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 2ed3ece2b2ee..bb85d479e66e 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c | |||
| @@ -1538,9 +1538,9 @@ err: | |||
| 1538 | set_state(qhp, C4IW_QP_STATE_ERROR); | 1538 | set_state(qhp, C4IW_QP_STATE_ERROR); |
| 1539 | free = 1; | 1539 | free = 1; |
| 1540 | abort = 1; | 1540 | abort = 1; |
| 1541 | wake_up(&qhp->wait); | ||
| 1542 | BUG_ON(!ep); | 1541 | BUG_ON(!ep); |
| 1543 | flush_qp(qhp); | 1542 | flush_qp(qhp); |
| 1543 | wake_up(&qhp->wait); | ||
| 1544 | out: | 1544 | out: |
| 1545 | mutex_unlock(&qhp->mutex); | 1545 | mutex_unlock(&qhp->mutex); |
| 1546 | 1546 | ||
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index 3488e8c9fcb4..f914b30999f8 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c | |||
| @@ -399,7 +399,7 @@ reg_user_mr_fallback: | |||
| 399 | pginfo.num_kpages = num_kpages; | 399 | pginfo.num_kpages = num_kpages; |
| 400 | pginfo.num_hwpages = num_hwpages; | 400 | pginfo.num_hwpages = num_hwpages; |
| 401 | pginfo.u.usr.region = e_mr->umem; | 401 | pginfo.u.usr.region = e_mr->umem; |
| 402 | pginfo.next_hwpage = e_mr->umem->offset / hwpage_size; | 402 | pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size; |
| 403 | pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl; | 403 | pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl; |
| 404 | ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags, | 404 | ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags, |
| 405 | e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, | 405 | e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, |
diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c index 5e61e9bff697..c7278f6a8217 100644 --- a/drivers/infiniband/hw/ipath/ipath_mr.c +++ b/drivers/infiniband/hw/ipath/ipath_mr.c | |||
| @@ -214,7 +214,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
| 214 | mr->mr.user_base = start; | 214 | mr->mr.user_base = start; |
| 215 | mr->mr.iova = virt_addr; | 215 | mr->mr.iova = virt_addr; |
| 216 | mr->mr.length = length; | 216 | mr->mr.length = length; |
| 217 | mr->mr.offset = umem->offset; | 217 | mr->mr.offset = ib_umem_offset(umem); |
| 218 | mr->mr.access_flags = mr_access_flags; | 218 | mr->mr.access_flags = mr_access_flags; |
| 219 | mr->mr.max_segs = n; | 219 | mr->mr.max_segs = n; |
| 220 | mr->umem = umem; | 220 | mr->umem = umem; |
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 8f9325cfc85d..c36ccbd9a644 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c | |||
| @@ -223,7 +223,6 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, | |||
| 223 | 223 | ||
| 224 | if (flags & IB_MR_REREG_TRANS) { | 224 | if (flags & IB_MR_REREG_TRANS) { |
| 225 | int shift; | 225 | int shift; |
| 226 | int err; | ||
| 227 | int n; | 226 | int n; |
| 228 | 227 | ||
| 229 | mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); | 228 | mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); |
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 4ea0135af484..27a70159e2ea 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o | 1 | obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o |
| 2 | 2 | ||
| 3 | mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o | 3 | mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o |
| 4 | mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o | ||
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 1ba6c42e4df8..8a87404e9c76 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c | |||
| @@ -244,6 +244,12 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, | |||
| 244 | props->max_mcast_grp; | 244 | props->max_mcast_grp; |
| 245 | props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ | 245 | props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ |
| 246 | 246 | ||
| 247 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 248 | if (dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG) | ||
| 249 | props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; | ||
| 250 | props->odp_caps = dev->odp_caps; | ||
| 251 | #endif | ||
| 252 | |||
| 247 | out: | 253 | out: |
| 248 | kfree(in_mad); | 254 | kfree(in_mad); |
| 249 | kfree(out_mad); | 255 | kfree(out_mad); |
| @@ -568,6 +574,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, | |||
| 568 | goto out_count; | 574 | goto out_count; |
| 569 | } | 575 | } |
| 570 | 576 | ||
| 577 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 578 | context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; | ||
| 579 | #endif | ||
| 580 | |||
| 571 | INIT_LIST_HEAD(&context->db_page_list); | 581 | INIT_LIST_HEAD(&context->db_page_list); |
| 572 | mutex_init(&context->db_page_mutex); | 582 | mutex_init(&context->db_page_mutex); |
| 573 | 583 | ||
| @@ -858,7 +868,7 @@ static ssize_t show_reg_pages(struct device *device, | |||
| 858 | struct mlx5_ib_dev *dev = | 868 | struct mlx5_ib_dev *dev = |
| 859 | container_of(device, struct mlx5_ib_dev, ib_dev.dev); | 869 | container_of(device, struct mlx5_ib_dev, ib_dev.dev); |
| 860 | 870 | ||
| 861 | return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages); | 871 | return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); |
| 862 | } | 872 | } |
| 863 | 873 | ||
| 864 | static ssize_t show_hca(struct device *device, struct device_attribute *attr, | 874 | static ssize_t show_hca(struct device *device, struct device_attribute *attr, |
| @@ -1321,6 +1331,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) | |||
| 1321 | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | | 1331 | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | |
| 1322 | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | | 1332 | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | |
| 1323 | (1ull << IB_USER_VERBS_CMD_OPEN_QP); | 1333 | (1ull << IB_USER_VERBS_CMD_OPEN_QP); |
| 1334 | dev->ib_dev.uverbs_ex_cmd_mask = | ||
| 1335 | (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE); | ||
| 1324 | 1336 | ||
| 1325 | dev->ib_dev.query_device = mlx5_ib_query_device; | 1337 | dev->ib_dev.query_device = mlx5_ib_query_device; |
| 1326 | dev->ib_dev.query_port = mlx5_ib_query_port; | 1338 | dev->ib_dev.query_port = mlx5_ib_query_port; |
| @@ -1366,6 +1378,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) | |||
| 1366 | dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; | 1378 | dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list; |
| 1367 | dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; | 1379 | dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status; |
| 1368 | 1380 | ||
| 1381 | mlx5_ib_internal_query_odp_caps(dev); | ||
| 1382 | |||
| 1369 | if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) { | 1383 | if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) { |
| 1370 | dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; | 1384 | dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd; |
| 1371 | dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; | 1385 | dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd; |
| @@ -1379,16 +1393,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) | |||
| 1379 | goto err_eqs; | 1393 | goto err_eqs; |
| 1380 | 1394 | ||
| 1381 | mutex_init(&dev->cap_mask_mutex); | 1395 | mutex_init(&dev->cap_mask_mutex); |
| 1382 | spin_lock_init(&dev->mr_lock); | ||
| 1383 | 1396 | ||
| 1384 | err = create_dev_resources(&dev->devr); | 1397 | err = create_dev_resources(&dev->devr); |
| 1385 | if (err) | 1398 | if (err) |
| 1386 | goto err_eqs; | 1399 | goto err_eqs; |
| 1387 | 1400 | ||
| 1388 | err = ib_register_device(&dev->ib_dev, NULL); | 1401 | err = mlx5_ib_odp_init_one(dev); |
| 1389 | if (err) | 1402 | if (err) |
| 1390 | goto err_rsrc; | 1403 | goto err_rsrc; |
| 1391 | 1404 | ||
| 1405 | err = ib_register_device(&dev->ib_dev, NULL); | ||
| 1406 | if (err) | ||
| 1407 | goto err_odp; | ||
| 1408 | |||
| 1392 | err = create_umr_res(dev); | 1409 | err = create_umr_res(dev); |
| 1393 | if (err) | 1410 | if (err) |
| 1394 | goto err_dev; | 1411 | goto err_dev; |
| @@ -1410,6 +1427,9 @@ err_umrc: | |||
| 1410 | err_dev: | 1427 | err_dev: |
| 1411 | ib_unregister_device(&dev->ib_dev); | 1428 | ib_unregister_device(&dev->ib_dev); |
| 1412 | 1429 | ||
| 1430 | err_odp: | ||
| 1431 | mlx5_ib_odp_remove_one(dev); | ||
| 1432 | |||
| 1413 | err_rsrc: | 1433 | err_rsrc: |
| 1414 | destroy_dev_resources(&dev->devr); | 1434 | destroy_dev_resources(&dev->devr); |
| 1415 | 1435 | ||
| @@ -1425,8 +1445,10 @@ err_dealloc: | |||
| 1425 | static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) | 1445 | static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) |
| 1426 | { | 1446 | { |
| 1427 | struct mlx5_ib_dev *dev = context; | 1447 | struct mlx5_ib_dev *dev = context; |
| 1448 | |||
| 1428 | ib_unregister_device(&dev->ib_dev); | 1449 | ib_unregister_device(&dev->ib_dev); |
| 1429 | destroy_umrc_res(dev); | 1450 | destroy_umrc_res(dev); |
| 1451 | mlx5_ib_odp_remove_one(dev); | ||
| 1430 | destroy_dev_resources(&dev->devr); | 1452 | destroy_dev_resources(&dev->devr); |
| 1431 | free_comp_eqs(dev); | 1453 | free_comp_eqs(dev); |
| 1432 | ib_dealloc_device(&dev->ib_dev); | 1454 | ib_dealloc_device(&dev->ib_dev); |
| @@ -1440,15 +1462,30 @@ static struct mlx5_interface mlx5_ib_interface = { | |||
| 1440 | 1462 | ||
| 1441 | static int __init mlx5_ib_init(void) | 1463 | static int __init mlx5_ib_init(void) |
| 1442 | { | 1464 | { |
| 1465 | int err; | ||
| 1466 | |||
| 1443 | if (deprecated_prof_sel != 2) | 1467 | if (deprecated_prof_sel != 2) |
| 1444 | pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); | 1468 | pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); |
| 1445 | 1469 | ||
| 1446 | return mlx5_register_interface(&mlx5_ib_interface); | 1470 | err = mlx5_ib_odp_init(); |
| 1471 | if (err) | ||
| 1472 | return err; | ||
| 1473 | |||
| 1474 | err = mlx5_register_interface(&mlx5_ib_interface); | ||
| 1475 | if (err) | ||
| 1476 | goto clean_odp; | ||
| 1477 | |||
| 1478 | return err; | ||
| 1479 | |||
| 1480 | clean_odp: | ||
| 1481 | mlx5_ib_odp_cleanup(); | ||
| 1482 | return err; | ||
| 1447 | } | 1483 | } |
| 1448 | 1484 | ||
| 1449 | static void __exit mlx5_ib_cleanup(void) | 1485 | static void __exit mlx5_ib_cleanup(void) |
| 1450 | { | 1486 | { |
| 1451 | mlx5_unregister_interface(&mlx5_ib_interface); | 1487 | mlx5_unregister_interface(&mlx5_ib_interface); |
| 1488 | mlx5_ib_odp_cleanup(); | ||
| 1452 | } | 1489 | } |
| 1453 | 1490 | ||
| 1454 | module_init(mlx5_ib_init); | 1491 | module_init(mlx5_ib_init); |
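Both mlx5_ib_add() and mlx5_ib_init() above grow an extra setup stage (the ODP initialization) plus a matching error label, keeping teardown in strict reverse order of setup. A compact stand-alone sketch of that goto-unwind idiom, with made-up stage names and injected failure:

/* Sketch of the goto-based error-unwinding idiom extended in
 * mlx5_ib_add(): each setup stage gets a label, and a failure jumps to
 * the label that tears down everything already initialised, in reverse
 * order. Stage names and the failure injection are made up. */
#include <stdio.h>

static int fail_stage;   /* 0 = all stages succeed, N = stage N fails */

static int setup(int stage)
{
	printf("setup %d\n", stage);
	return stage == fail_stage ? -1 : 0;
}

static void teardown(int stage)
{
	printf("teardown %d\n", stage);
}

static int device_add(void)
{
	int err;

	err = setup(1);             /* e.g. create_dev_resources() */
	if (err)
		goto out;
	err = setup(2);             /* e.g. mlx5_ib_odp_init_one() */
	if (err)
		goto err_stage1;
	err = setup(3);             /* e.g. ib_register_device() */
	if (err)
		goto err_stage2;
	return 0;

err_stage2:
	teardown(2);
err_stage1:
	teardown(1);
out:
	return err;
}

int main(void)
{
	fail_stage = 3;   /* registration fails: stages 2 and 1 unwind */
	device_add();
	return 0;
}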
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index dae07eae9507..b56e4c5593ee 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c | |||
| @@ -32,6 +32,7 @@ | |||
| 32 | 32 | ||
| 33 | #include <linux/module.h> | 33 | #include <linux/module.h> |
| 34 | #include <rdma/ib_umem.h> | 34 | #include <rdma/ib_umem.h> |
| 35 | #include <rdma/ib_umem_odp.h> | ||
| 35 | #include "mlx5_ib.h" | 36 | #include "mlx5_ib.h" |
| 36 | 37 | ||
| 37 | /* @umem: umem object to scan | 38 | /* @umem: umem object to scan |
| @@ -57,6 +58,17 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, | |||
| 57 | int entry; | 58 | int entry; |
| 58 | unsigned long page_shift = ilog2(umem->page_size); | 59 | unsigned long page_shift = ilog2(umem->page_size); |
| 59 | 60 | ||
| 61 | /* With ODP we must always match OS page size. */ | ||
| 62 | if (umem->odp_data) { | ||
| 63 | *count = ib_umem_page_count(umem); | ||
| 64 | *shift = PAGE_SHIFT; | ||
| 65 | *ncont = *count; | ||
| 66 | if (order) | ||
| 67 | *order = ilog2(roundup_pow_of_two(*count)); | ||
| 68 | |||
| 69 | return; | ||
| 70 | } | ||
| 71 | |||
| 60 | addr = addr >> page_shift; | 72 | addr = addr >> page_shift; |
| 61 | tmp = (unsigned long)addr; | 73 | tmp = (unsigned long)addr; |
| 62 | m = find_first_bit(&tmp, sizeof(tmp)); | 74 | m = find_first_bit(&tmp, sizeof(tmp)); |
| @@ -108,8 +120,36 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, | |||
| 108 | *count = i; | 120 | *count = i; |
| 109 | } | 121 | } |
| 110 | 122 | ||
| 111 | void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | 123 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING |
| 112 | int page_shift, __be64 *pas, int umr) | 124 | static u64 umem_dma_to_mtt(dma_addr_t umem_dma) |
| 125 | { | ||
| 126 | u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; | ||
| 127 | |||
| 128 | if (umem_dma & ODP_READ_ALLOWED_BIT) | ||
| 129 | mtt_entry |= MLX5_IB_MTT_READ; | ||
| 130 | if (umem_dma & ODP_WRITE_ALLOWED_BIT) | ||
| 131 | mtt_entry |= MLX5_IB_MTT_WRITE; | ||
| 132 | |||
| 133 | return mtt_entry; | ||
| 134 | } | ||
| 135 | #endif | ||
| 136 | |||
| 137 | /* | ||
| 138 | * Populate the given array with bus addresses from the umem. | ||
| 139 | * | ||
| 140 | * dev - mlx5_ib device | ||
| 141 | * umem - umem to use to fill the pages | ||
| 142 | * page_shift - determines the page size used in the resulting array | ||
| 143 | * offset - offset into the umem to start from, | ||
| 144 | * only implemented for ODP umems | ||
| 145 | * num_pages - total number of pages to fill | ||
| 146 | * pas - bus addresses array to fill | ||
| 147 | * access_flags - access flags to set on all present pages. | ||
| 148 | use enum mlx5_ib_mtt_access_flags for this. | ||
| 149 | */ | ||
| 150 | void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | ||
| 151 | int page_shift, size_t offset, size_t num_pages, | ||
| 152 | __be64 *pas, int access_flags) | ||
| 113 | { | 153 | { |
| 114 | unsigned long umem_page_shift = ilog2(umem->page_size); | 154 | unsigned long umem_page_shift = ilog2(umem->page_size); |
| 115 | int shift = page_shift - umem_page_shift; | 155 | int shift = page_shift - umem_page_shift; |
| @@ -120,6 +160,21 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | |||
| 120 | int len; | 160 | int len; |
| 121 | struct scatterlist *sg; | 161 | struct scatterlist *sg; |
| 122 | int entry; | 162 | int entry; |
| 163 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 164 | const bool odp = umem->odp_data != NULL; | ||
| 165 | |||
| 166 | if (odp) { | ||
| 167 | WARN_ON(shift != 0); | ||
| 168 | WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); | ||
| 169 | |||
| 170 | for (i = 0; i < num_pages; ++i) { | ||
| 171 | dma_addr_t pa = umem->odp_data->dma_list[offset + i]; | ||
| 172 | |||
| 173 | pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); | ||
| 174 | } | ||
| 175 | return; | ||
| 176 | } | ||
| 177 | #endif | ||
| 123 | 178 | ||
| 124 | i = 0; | 179 | i = 0; |
| 125 | for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { | 180 | for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { |
| @@ -128,8 +183,7 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | |||
| 128 | for (k = 0; k < len; k++) { | 183 | for (k = 0; k < len; k++) { |
| 129 | if (!(i & mask)) { | 184 | if (!(i & mask)) { |
| 130 | cur = base + (k << umem_page_shift); | 185 | cur = base + (k << umem_page_shift); |
| 131 | if (umr) | 186 | cur |= access_flags; |
| 132 | cur |= 3; | ||
| 133 | 187 | ||
| 134 | pas[i >> shift] = cpu_to_be64(cur); | 188 | pas[i >> shift] = cpu_to_be64(cur); |
| 135 | mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", | 189 | mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", |
| @@ -142,6 +196,13 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | |||
| 142 | } | 196 | } |
| 143 | } | 197 | } |
| 144 | 198 | ||
| 199 | void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | ||
| 200 | int page_shift, __be64 *pas, int access_flags) | ||
| 201 | { | ||
| 202 | return __mlx5_ib_populate_pas(dev, umem, page_shift, 0, | ||
| 203 | ib_umem_num_pages(umem), pas, | ||
| 204 | access_flags); | ||
| 205 | } | ||
| 145 | int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) | 206 | int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset) |
| 146 | { | 207 | { |
| 147 | u64 page_size; | 208 | u64 page_size; |
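For ODP umems, __mlx5_ib_populate_pas() above converts each per-page DMA address into an MTT entry by masking off the software bookkeeping bits and re-encoding the read/write permission bits as the hardware's MTT flags. A user-space model of that translation follows; the bit positions are placeholders, not the real ODP_* or MLX5_IB_MTT_* values.

/* Model of umem_dma_to_mtt(): the low bits of each stored DMA address
 * carry software bookkeeping (read/write allowed), which is stripped and
 * re-encoded as hardware MTT permission flags. Bit positions here are
 * placeholders, not the real ODP_* / MLX5_IB_MTT_* definitions. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SW_READ_ALLOWED   (1ULL << 0)
#define SW_WRITE_ALLOWED  (1ULL << 1)
#define SW_FLAG_MASK      (SW_READ_ALLOWED | SW_WRITE_ALLOWED)
#define DMA_ADDR_MASK     (~SW_FLAG_MASK)

#define HW_MTT_READ       (1ULL << 0)
#define HW_MTT_WRITE      (1ULL << 1)

static uint64_t dma_to_mtt(uint64_t dma_with_flags)
{
	uint64_t mtt = dma_with_flags & DMA_ADDR_MASK;

	if (dma_with_flags & SW_READ_ALLOWED)
		mtt |= HW_MTT_READ;
	if (dma_with_flags & SW_WRITE_ALLOWED)
		mtt |= HW_MTT_WRITE;
	return mtt;
}

int main(void)
{
	/* A page mapped at 0x1000 that allows reads but not writes. */
	uint64_t entry = 0x1000ULL | SW_READ_ALLOWED;

	printf("mtt entry: 0x%" PRIx64 "\n", dma_to_mtt(entry));
	return 0;
}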
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 386780f0d1e1..83f22fe297c8 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h | |||
| @@ -111,6 +111,8 @@ struct mlx5_ib_pd { | |||
| 111 | */ | 111 | */ |
| 112 | 112 | ||
| 113 | #define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START | 113 | #define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START |
| 114 | #define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) | ||
| 115 | #define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) | ||
| 114 | #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 | 116 | #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 |
| 115 | #define MLX5_IB_WR_UMR IB_WR_RESERVED1 | 117 | #define MLX5_IB_WR_UMR IB_WR_RESERVED1 |
| 116 | 118 | ||
| @@ -147,6 +149,29 @@ enum { | |||
| 147 | MLX5_QP_EMPTY | 149 | MLX5_QP_EMPTY |
| 148 | }; | 150 | }; |
| 149 | 151 | ||
| 152 | /* | ||
| 153 | * Connect-IB can trigger up to four concurrent pagefaults | ||
| 154 | * per-QP. | ||
| 155 | */ | ||
| 156 | enum mlx5_ib_pagefault_context { | ||
| 157 | MLX5_IB_PAGEFAULT_RESPONDER_READ, | ||
| 158 | MLX5_IB_PAGEFAULT_REQUESTOR_READ, | ||
| 159 | MLX5_IB_PAGEFAULT_RESPONDER_WRITE, | ||
| 160 | MLX5_IB_PAGEFAULT_REQUESTOR_WRITE, | ||
| 161 | MLX5_IB_PAGEFAULT_CONTEXTS | ||
| 162 | }; | ||
| 163 | |||
| 164 | static inline enum mlx5_ib_pagefault_context | ||
| 165 | mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault) | ||
| 166 | { | ||
| 167 | return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE); | ||
| 168 | } | ||
| 169 | |||
| 170 | struct mlx5_ib_pfault { | ||
| 171 | struct work_struct work; | ||
| 172 | struct mlx5_pagefault mpfault; | ||
| 173 | }; | ||
| 174 | |||
| 150 | struct mlx5_ib_qp { | 175 | struct mlx5_ib_qp { |
| 151 | struct ib_qp ibqp; | 176 | struct ib_qp ibqp; |
| 152 | struct mlx5_core_qp mqp; | 177 | struct mlx5_core_qp mqp; |
| @@ -192,6 +217,21 @@ struct mlx5_ib_qp { | |||
| 192 | 217 | ||
| 193 | /* Store signature errors */ | 218 | /* Store signature errors */ |
| 194 | bool signature_en; | 219 | bool signature_en; |
| 220 | |||
| 221 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 222 | /* | ||
| 223 | * A flag that is true for QP's that are in a state that doesn't | ||
| 224 | * allow page faults, and shouldn't schedule any more faults. | ||
| 225 | */ | ||
| 226 | int disable_page_faults; | ||
| 227 | /* | ||
| 228 | * The disable_page_faults_lock protects a QP's disable_page_faults | ||
| 229 | * field, allowing for a thread to atomically check whether the QP | ||
| 230 | * allows page faults, and if so schedule a page fault. | ||
| 231 | */ | ||
| 232 | spinlock_t disable_page_faults_lock; | ||
| 233 | struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS]; | ||
| 234 | #endif | ||
| 195 | }; | 235 | }; |
| 196 | 236 | ||
| 197 | struct mlx5_ib_cq_buf { | 237 | struct mlx5_ib_cq_buf { |
| @@ -206,6 +246,19 @@ enum mlx5_ib_qp_flags { | |||
| 206 | MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, | 246 | MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, |
| 207 | }; | 247 | }; |
| 208 | 248 | ||
| 249 | struct mlx5_umr_wr { | ||
| 250 | union { | ||
| 251 | u64 virt_addr; | ||
| 252 | u64 offset; | ||
| 253 | } target; | ||
| 254 | struct ib_pd *pd; | ||
| 255 | unsigned int page_shift; | ||
| 256 | unsigned int npages; | ||
| 257 | u32 length; | ||
| 258 | int access_flags; | ||
| 259 | u32 mkey; | ||
| 260 | }; | ||
| 261 | |||
| 209 | struct mlx5_shared_mr_info { | 262 | struct mlx5_shared_mr_info { |
| 210 | int mr_id; | 263 | int mr_id; |
| 211 | struct ib_umem *umem; | 264 | struct ib_umem *umem; |
| @@ -253,6 +306,13 @@ struct mlx5_ib_xrcd { | |||
| 253 | u32 xrcdn; | 306 | u32 xrcdn; |
| 254 | }; | 307 | }; |
| 255 | 308 | ||
| 309 | enum mlx5_ib_mtt_access_flags { | ||
| 310 | MLX5_IB_MTT_READ = (1 << 0), | ||
| 311 | MLX5_IB_MTT_WRITE = (1 << 1), | ||
| 312 | }; | ||
| 313 | |||
| 314 | #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) | ||
| 315 | |||
| 256 | struct mlx5_ib_mr { | 316 | struct mlx5_ib_mr { |
| 257 | struct ib_mr ibmr; | 317 | struct ib_mr ibmr; |
| 258 | struct mlx5_core_mr mmr; | 318 | struct mlx5_core_mr mmr; |
| @@ -261,12 +321,11 @@ struct mlx5_ib_mr { | |||
| 261 | struct list_head list; | 321 | struct list_head list; |
| 262 | int order; | 322 | int order; |
| 263 | int umred; | 323 | int umred; |
| 264 | __be64 *pas; | ||
| 265 | dma_addr_t dma; | ||
| 266 | int npages; | 324 | int npages; |
| 267 | struct mlx5_ib_dev *dev; | 325 | struct mlx5_ib_dev *dev; |
| 268 | struct mlx5_create_mkey_mbox_out out; | 326 | struct mlx5_create_mkey_mbox_out out; |
| 269 | struct mlx5_core_sig_ctx *sig; | 327 | struct mlx5_core_sig_ctx *sig; |
| 328 | int live; | ||
| 270 | }; | 329 | }; |
| 271 | 330 | ||
| 272 | struct mlx5_ib_fast_reg_page_list { | 331 | struct mlx5_ib_fast_reg_page_list { |
| @@ -372,11 +431,18 @@ struct mlx5_ib_dev { | |||
| 372 | struct umr_common umrc; | 431 | struct umr_common umrc; |
| 373 | /* sync used page count stats | 432 | /* sync used page count stats |
| 374 | */ | 433 | */ |
| 375 | spinlock_t mr_lock; | ||
| 376 | struct mlx5_ib_resources devr; | 434 | struct mlx5_ib_resources devr; |
| 377 | struct mlx5_mr_cache cache; | 435 | struct mlx5_mr_cache cache; |
| 378 | struct timer_list delay_timer; | 436 | struct timer_list delay_timer; |
| 379 | int fill_delay; | 437 | int fill_delay; |
| 438 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 439 | struct ib_odp_caps odp_caps; | ||
| 440 | /* | ||
| 441 | * Sleepable RCU that prevents destruction of MRs while they are still | ||
| 442 | * being used by a page fault handler. | ||
| 443 | */ | ||
| 444 | struct srcu_struct mr_srcu; | ||
| 445 | #endif | ||
| 380 | }; | 446 | }; |
| 381 | 447 | ||
| 382 | static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) | 448 | static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) |
| @@ -490,6 +556,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, | |||
| 490 | int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, | 556 | int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, |
| 491 | struct ib_recv_wr **bad_wr); | 557 | struct ib_recv_wr **bad_wr); |
| 492 | void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); | 558 | void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); |
| 559 | int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, | ||
| 560 | void *buffer, u32 length); | ||
| 493 | struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, | 561 | struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, |
| 494 | int vector, struct ib_ucontext *context, | 562 | int vector, struct ib_ucontext *context, |
| 495 | struct ib_udata *udata); | 563 | struct ib_udata *udata); |
| @@ -502,6 +570,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); | |||
| 502 | struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | 570 | struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, |
| 503 | u64 virt_addr, int access_flags, | 571 | u64 virt_addr, int access_flags, |
| 504 | struct ib_udata *udata); | 572 | struct ib_udata *udata); |
| 573 | int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, | ||
| 574 | int npages, int zap); | ||
| 505 | int mlx5_ib_dereg_mr(struct ib_mr *ibmr); | 575 | int mlx5_ib_dereg_mr(struct ib_mr *ibmr); |
| 506 | int mlx5_ib_destroy_mr(struct ib_mr *ibmr); | 576 | int mlx5_ib_destroy_mr(struct ib_mr *ibmr); |
| 507 | struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, | 577 | struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, |
| @@ -533,8 +603,11 @@ int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); | |||
| 533 | void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); | 603 | void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); |
| 534 | void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, | 604 | void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift, |
| 535 | int *ncont, int *order); | 605 | int *ncont, int *order); |
| 606 | void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | ||
| 607 | int page_shift, size_t offset, size_t num_pages, | ||
| 608 | __be64 *pas, int access_flags); | ||
| 536 | void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, | 609 | void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, |
| 537 | int page_shift, __be64 *pas, int umr); | 610 | int page_shift, __be64 *pas, int access_flags); |
| 538 | void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); | 611 | void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); |
| 539 | int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); | 612 | int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); |
| 540 | int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); | 613 | int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); |
| @@ -544,6 +617,38 @@ void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context); | |||
| 544 | int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, | 617 | int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, |
| 545 | struct ib_mr_status *mr_status); | 618 | struct ib_mr_status *mr_status); |
| 546 | 619 | ||
| 620 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 621 | extern struct workqueue_struct *mlx5_ib_page_fault_wq; | ||
| 622 | |||
| 623 | int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev); | ||
| 624 | void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, | ||
| 625 | struct mlx5_ib_pfault *pfault); | ||
| 626 | void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp); | ||
| 627 | int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); | ||
| 628 | void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev); | ||
| 629 | int __init mlx5_ib_odp_init(void); | ||
| 630 | void mlx5_ib_odp_cleanup(void); | ||
| 631 | void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); | ||
| 632 | void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); | ||
| 633 | void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, | ||
| 634 | unsigned long end); | ||
| 635 | |||
| 636 | #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ | ||
| 637 | static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev) | ||
| 638 | { | ||
| 639 | return 0; | ||
| 640 | } | ||
| 641 | |||
| 642 | static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {} | ||
| 643 | static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } | ||
| 644 | static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} | ||
| 645 | static inline int mlx5_ib_odp_init(void) { return 0; } | ||
| 646 | static inline void mlx5_ib_odp_cleanup(void) {} | ||
| 647 | static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {} | ||
| 648 | static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} | ||
| 649 | |||
| 650 | #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ | ||
| 651 | |||
| 547 | static inline void init_query_mad(struct ib_smp *mad) | 652 | static inline void init_query_mad(struct ib_smp *mad) |
| 548 | { | 653 | { |
| 549 | mad->base_version = 1; | 654 | mad->base_version = 1; |
| @@ -561,4 +666,7 @@ static inline u8 convert_access(int acc) | |||
| 561 | MLX5_PERM_LOCAL_READ; | 666 | MLX5_PERM_LOCAL_READ; |
| 562 | } | 667 | } |
| 563 | 668 | ||
| 669 | #define MLX5_MAX_UMR_SHIFT 16 | ||
| 670 | #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) | ||
| 671 | |||
| 564 | #endif /* MLX5_IB_H */ | 672 | #endif /* MLX5_IB_H */ |
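The header above defines four per-QP page-fault contexts and derives the context index directly from the fault's requestor/write flag bits, so Connect-IB's up to four concurrent faults per QP each land in their own slot of the pagefaults[] array. A small model of that mapping; the flag values are assumptions chosen so the arithmetic lines up with the enum order.

/* Model of mlx5_ib_get_pagefault_context(): the fault's requestor/write
 * flag bits double as an index into the four per-QP fault contexts.
 * The flag values are assumptions chosen to match the enum order. */
#include <stdio.h>

#define PFAULT_REQUESTOR (1 << 0)
#define PFAULT_WRITE     (1 << 1)

enum pagefault_context {
	PAGEFAULT_RESPONDER_READ,    /* neither bit set  -> 0 */
	PAGEFAULT_REQUESTOR_READ,    /* requestor        -> 1 */
	PAGEFAULT_RESPONDER_WRITE,   /* write            -> 2 */
	PAGEFAULT_REQUESTOR_WRITE,   /* requestor|write  -> 3 */
	PAGEFAULT_CONTEXTS
};

static enum pagefault_context get_pagefault_context(int flags)
{
	return flags & (PFAULT_REQUESTOR | PFAULT_WRITE);
}

int main(void)
{
	static const char *names[PAGEFAULT_CONTEXTS] = {
		"responder read", "requestor read",
		"responder write", "requestor write",
	};
	int flags;

	for (flags = 0; flags < PAGEFAULT_CONTEXTS; flags++)
		printf("flags %d -> %s\n", flags,
		       names[get_pagefault_context(flags)]);
	return 0;
}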
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 5a80dd993761..32a28bd50b20 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c | |||
| @@ -37,21 +37,34 @@ | |||
| 37 | #include <linux/export.h> | 37 | #include <linux/export.h> |
| 38 | #include <linux/delay.h> | 38 | #include <linux/delay.h> |
| 39 | #include <rdma/ib_umem.h> | 39 | #include <rdma/ib_umem.h> |
| 40 | #include <rdma/ib_umem_odp.h> | ||
| 41 | #include <rdma/ib_verbs.h> | ||
| 40 | #include "mlx5_ib.h" | 42 | #include "mlx5_ib.h" |
| 41 | 43 | ||
| 42 | enum { | 44 | enum { |
| 43 | MAX_PENDING_REG_MR = 8, | 45 | MAX_PENDING_REG_MR = 8, |
| 44 | }; | 46 | }; |
| 45 | 47 | ||
| 46 | enum { | 48 | #define MLX5_UMR_ALIGN 2048 |
| 47 | MLX5_UMR_ALIGN = 2048 | 49 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING |
| 48 | }; | 50 | static __be64 mlx5_ib_update_mtt_emergency_buffer[ |
| 51 | MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)] | ||
| 52 | __aligned(MLX5_UMR_ALIGN); | ||
| 53 | static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex); | ||
| 54 | #endif | ||
| 55 | |||
| 56 | static int clean_mr(struct mlx5_ib_mr *mr); | ||
| 49 | 57 | ||
| 50 | static __be64 *mr_align(__be64 *ptr, int align) | 58 | static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) |
| 51 | { | 59 | { |
| 52 | unsigned long mask = align - 1; | 60 | int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); |
| 53 | 61 | ||
| 54 | return (__be64 *)(((unsigned long)ptr + mask) & ~mask); | 62 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING |
| 63 | /* Wait until all page fault handlers using the mr complete. */ | ||
| 64 | synchronize_srcu(&dev->mr_srcu); | ||
| 65 | #endif | ||
| 66 | |||
| 67 | return err; | ||
| 55 | } | 68 | } |
| 56 | 69 | ||
| 57 | static int order2idx(struct mlx5_ib_dev *dev, int order) | 70 | static int order2idx(struct mlx5_ib_dev *dev, int order) |
| @@ -146,7 +159,7 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) | |||
| 146 | mr->order = ent->order; | 159 | mr->order = ent->order; |
| 147 | mr->umred = 1; | 160 | mr->umred = 1; |
| 148 | mr->dev = dev; | 161 | mr->dev = dev; |
| 149 | in->seg.status = 1 << 6; | 162 | in->seg.status = MLX5_MKEY_STATUS_FREE; |
| 150 | in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); | 163 | in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); |
| 151 | in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); | 164 | in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); |
| 152 | in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; | 165 | in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; |
| @@ -191,7 +204,7 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) | |||
| 191 | ent->cur--; | 204 | ent->cur--; |
| 192 | ent->size--; | 205 | ent->size--; |
| 193 | spin_unlock_irq(&ent->lock); | 206 | spin_unlock_irq(&ent->lock); |
| 194 | err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); | 207 | err = destroy_mkey(dev, mr); |
| 195 | if (err) | 208 | if (err) |
| 196 | mlx5_ib_warn(dev, "failed destroy mkey\n"); | 209 | mlx5_ib_warn(dev, "failed destroy mkey\n"); |
| 197 | else | 210 | else |
| @@ -482,7 +495,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c) | |||
| 482 | ent->cur--; | 495 | ent->cur--; |
| 483 | ent->size--; | 496 | ent->size--; |
| 484 | spin_unlock_irq(&ent->lock); | 497 | spin_unlock_irq(&ent->lock); |
| 485 | err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); | 498 | err = destroy_mkey(dev, mr); |
| 486 | if (err) | 499 | if (err) |
| 487 | mlx5_ib_warn(dev, "failed destroy mkey\n"); | 500 | mlx5_ib_warn(dev, "failed destroy mkey\n"); |
| 488 | else | 501 | else |
| @@ -668,7 +681,7 @@ static int get_octo_len(u64 addr, u64 len, int page_size) | |||
| 668 | 681 | ||
| 669 | static int use_umr(int order) | 682 | static int use_umr(int order) |
| 670 | { | 683 | { |
| 671 | return order <= 17; | 684 | return order <= MLX5_MAX_UMR_SHIFT; |
| 672 | } | 685 | } |
| 673 | 686 | ||
| 674 | static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, | 687 | static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, |
| @@ -678,6 +691,7 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, | |||
| 678 | { | 691 | { |
| 679 | struct mlx5_ib_dev *dev = to_mdev(pd->device); | 692 | struct mlx5_ib_dev *dev = to_mdev(pd->device); |
| 680 | struct ib_mr *mr = dev->umrc.mr; | 693 | struct ib_mr *mr = dev->umrc.mr; |
| 694 | struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; | ||
| 681 | 695 | ||
| 682 | sg->addr = dma; | 696 | sg->addr = dma; |
| 683 | sg->length = ALIGN(sizeof(u64) * n, 64); | 697 | sg->length = ALIGN(sizeof(u64) * n, 64); |
| @@ -692,21 +706,24 @@ static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, | |||
| 692 | wr->num_sge = 0; | 706 | wr->num_sge = 0; |
| 693 | 707 | ||
| 694 | wr->opcode = MLX5_IB_WR_UMR; | 708 | wr->opcode = MLX5_IB_WR_UMR; |
| 695 | wr->wr.fast_reg.page_list_len = n; | 709 | |
| 696 | wr->wr.fast_reg.page_shift = page_shift; | 710 | umrwr->npages = n; |
| 697 | wr->wr.fast_reg.rkey = key; | 711 | umrwr->page_shift = page_shift; |
| 698 | wr->wr.fast_reg.iova_start = virt_addr; | 712 | umrwr->mkey = key; |
| 699 | wr->wr.fast_reg.length = len; | 713 | umrwr->target.virt_addr = virt_addr; |
| 700 | wr->wr.fast_reg.access_flags = access_flags; | 714 | umrwr->length = len; |
| 701 | wr->wr.fast_reg.page_list = (struct ib_fast_reg_page_list *)pd; | 715 | umrwr->access_flags = access_flags; |
| 716 | umrwr->pd = pd; | ||
| 702 | } | 717 | } |
| 703 | 718 | ||
| 704 | static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, | 719 | static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, |
| 705 | struct ib_send_wr *wr, u32 key) | 720 | struct ib_send_wr *wr, u32 key) |
| 706 | { | 721 | { |
| 707 | wr->send_flags = MLX5_IB_SEND_UMR_UNREG; | 722 | struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; |
| 723 | |||
| 724 | wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE; | ||
| 708 | wr->opcode = MLX5_IB_WR_UMR; | 725 | wr->opcode = MLX5_IB_WR_UMR; |
| 709 | wr->wr.fast_reg.rkey = key; | 726 | umrwr->mkey = key; |
| 710 | } | 727 | } |
| 711 | 728 | ||
| 712 | void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) | 729 | void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context) |
| @@ -742,7 +759,10 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, | |||
| 742 | struct ib_send_wr wr, *bad; | 759 | struct ib_send_wr wr, *bad; |
| 743 | struct mlx5_ib_mr *mr; | 760 | struct mlx5_ib_mr *mr; |
| 744 | struct ib_sge sg; | 761 | struct ib_sge sg; |
| 745 | int size = sizeof(u64) * npages; | 762 | int size; |
| 763 | __be64 *mr_pas; | ||
| 764 | __be64 *pas; | ||
| 765 | dma_addr_t dma; | ||
| 746 | int err = 0; | 766 | int err = 0; |
| 747 | int i; | 767 | int i; |
| 748 | 768 | ||
| @@ -761,25 +781,31 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, | |||
| 761 | if (!mr) | 781 | if (!mr) |
| 762 | return ERR_PTR(-EAGAIN); | 782 | return ERR_PTR(-EAGAIN); |
| 763 | 783 | ||
| 764 | mr->pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); | 784 | /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. |
| 765 | if (!mr->pas) { | 785 | * To avoid copying garbage after the pas array, we allocate |
| 786 | * a little more. */ | ||
| 787 | size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); | ||
| 788 | mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); | ||
| 789 | if (!mr_pas) { | ||
| 766 | err = -ENOMEM; | 790 | err = -ENOMEM; |
| 767 | goto free_mr; | 791 | goto free_mr; |
| 768 | } | 792 | } |
| 769 | 793 | ||
| 770 | mlx5_ib_populate_pas(dev, umem, page_shift, | 794 | pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN); |
| 771 | mr_align(mr->pas, MLX5_UMR_ALIGN), 1); | 795 | mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); |
| 796 | /* Clear padding after the actual pages. */ | ||
| 797 | memset(pas + npages, 0, size - npages * sizeof(u64)); | ||
| 772 | 798 | ||
| 773 | mr->dma = dma_map_single(ddev, mr_align(mr->pas, MLX5_UMR_ALIGN), size, | 799 | dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); |
| 774 | DMA_TO_DEVICE); | 800 | if (dma_mapping_error(ddev, dma)) { |
| 775 | if (dma_mapping_error(ddev, mr->dma)) { | ||
| 776 | err = -ENOMEM; | 801 | err = -ENOMEM; |
| 777 | goto free_pas; | 802 | goto free_pas; |
| 778 | } | 803 | } |
| 779 | 804 | ||
| 780 | memset(&wr, 0, sizeof(wr)); | 805 | memset(&wr, 0, sizeof(wr)); |
| 781 | wr.wr_id = (u64)(unsigned long)&umr_context; | 806 | wr.wr_id = (u64)(unsigned long)&umr_context; |
| 782 | prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags); | 807 | prep_umr_reg_wqe(pd, &wr, &sg, dma, npages, mr->mmr.key, page_shift, |
| 808 | virt_addr, len, access_flags); | ||
| 783 | 809 | ||
| 784 | mlx5_ib_init_umr_context(&umr_context); | 810 | mlx5_ib_init_umr_context(&umr_context); |
| 785 | down(&umrc->sem); | 811 | down(&umrc->sem); |
| @@ -799,12 +825,14 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, | |||
| 799 | mr->mmr.size = len; | 825 | mr->mmr.size = len; |
| 800 | mr->mmr.pd = to_mpd(pd)->pdn; | 826 | mr->mmr.pd = to_mpd(pd)->pdn; |
| 801 | 827 | ||
| 828 | mr->live = 1; | ||
| 829 | |||
| 802 | unmap_dma: | 830 | unmap_dma: |
| 803 | up(&umrc->sem); | 831 | up(&umrc->sem); |
| 804 | dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE); | 832 | dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); |
| 805 | 833 | ||
| 806 | free_pas: | 834 | free_pas: |
| 807 | kfree(mr->pas); | 835 | kfree(mr_pas); |
| 808 | 836 | ||
| 809 | free_mr: | 837 | free_mr: |
| 810 | if (err) { | 838 | if (err) { |
| @@ -815,6 +843,128 @@ free_mr: | |||
| 815 | return mr; | 843 | return mr; |
| 816 | } | 844 | } |
| 817 | 845 | ||
| 846 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 847 | int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, | ||
| 848 | int zap) | ||
| 849 | { | ||
| 850 | struct mlx5_ib_dev *dev = mr->dev; | ||
| 851 | struct device *ddev = dev->ib_dev.dma_device; | ||
| 852 | struct umr_common *umrc = &dev->umrc; | ||
| 853 | struct mlx5_ib_umr_context umr_context; | ||
| 854 | struct ib_umem *umem = mr->umem; | ||
| 855 | int size; | ||
| 856 | __be64 *pas; | ||
| 857 | dma_addr_t dma; | ||
| 858 | struct ib_send_wr wr, *bad; | ||
| 859 | struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr.wr.fast_reg; | ||
| 860 | struct ib_sge sg; | ||
| 861 | int err = 0; | ||
| 862 | const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64); | ||
| 863 | const int page_index_mask = page_index_alignment - 1; | ||
| 864 | size_t pages_mapped = 0; | ||
| 865 | size_t pages_to_map = 0; | ||
| 866 | size_t pages_iter = 0; | ||
| 867 | int use_emergency_buf = 0; | ||
| 868 | |||
| 869 | /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, | ||
| 870 | * so we need to align the offset and length accordingly */ | ||
| 871 | if (start_page_index & page_index_mask) { | ||
| 872 | npages += start_page_index & page_index_mask; | ||
| 873 | start_page_index &= ~page_index_mask; | ||
| 874 | } | ||
| 875 | |||
| 876 | pages_to_map = ALIGN(npages, page_index_alignment); | ||
| 877 | |||
| 878 | if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES) | ||
| 879 | return -EINVAL; | ||
| 880 | |||
| 881 | size = sizeof(u64) * pages_to_map; | ||
| 882 | size = min_t(int, PAGE_SIZE, size); | ||
| 883 | /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim | ||
| 884 | * code, when we are called from an invalidation. The pas buffer must | ||
| 885 | * be 2k-aligned for Connect-IB. */ | ||
| 886 | pas = (__be64 *)get_zeroed_page(GFP_ATOMIC); | ||
| 887 | if (!pas) { | ||
| 888 | mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n"); | ||
| 889 | pas = mlx5_ib_update_mtt_emergency_buffer; | ||
| 890 | size = MLX5_UMR_MTT_MIN_CHUNK_SIZE; | ||
| 891 | use_emergency_buf = 1; | ||
| 892 | mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex); | ||
| 893 | memset(pas, 0, size); | ||
| 894 | } | ||
| 895 | pages_iter = size / sizeof(u64); | ||
| 896 | dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); | ||
| 897 | if (dma_mapping_error(ddev, dma)) { | ||
| 898 | mlx5_ib_err(dev, "unable to map DMA during MTT update.\n"); | ||
| 899 | err = -ENOMEM; | ||
| 900 | goto free_pas; | ||
| 901 | } | ||
| 902 | |||
| 903 | for (pages_mapped = 0; | ||
| 904 | pages_mapped < pages_to_map && !err; | ||
| 905 | pages_mapped += pages_iter, start_page_index += pages_iter) { | ||
| 906 | dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); | ||
| 907 | |||
| 908 | npages = min_t(size_t, | ||
| 909 | pages_iter, | ||
| 910 | ib_umem_num_pages(umem) - start_page_index); | ||
| 911 | |||
| 912 | if (!zap) { | ||
| 913 | __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT, | ||
| 914 | start_page_index, npages, pas, | ||
| 915 | MLX5_IB_MTT_PRESENT); | ||
| 916 | /* Clear padding after the pages brought from the | ||
| 917 | * umem. */ | ||
| 918 | memset(pas + npages, 0, size - npages * sizeof(u64)); | ||
| 919 | } | ||
| 920 | |||
| 921 | dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); | ||
| 922 | |||
| 923 | memset(&wr, 0, sizeof(wr)); | ||
| 924 | wr.wr_id = (u64)(unsigned long)&umr_context; | ||
| 925 | |||
| 926 | sg.addr = dma; | ||
| 927 | sg.length = ALIGN(npages * sizeof(u64), | ||
| 928 | MLX5_UMR_MTT_ALIGNMENT); | ||
| 929 | sg.lkey = dev->umrc.mr->lkey; | ||
| 930 | |||
| 931 | wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | | ||
| 932 | MLX5_IB_SEND_UMR_UPDATE_MTT; | ||
| 933 | wr.sg_list = &sg; | ||
| 934 | wr.num_sge = 1; | ||
| 935 | wr.opcode = MLX5_IB_WR_UMR; | ||
| 936 | umrwr->npages = sg.length / sizeof(u64); | ||
| 937 | umrwr->page_shift = PAGE_SHIFT; | ||
| 938 | umrwr->mkey = mr->mmr.key; | ||
| 939 | umrwr->target.offset = start_page_index; | ||
| 940 | |||
| 941 | mlx5_ib_init_umr_context(&umr_context); | ||
| 942 | down(&umrc->sem); | ||
| 943 | err = ib_post_send(umrc->qp, &wr, &bad); | ||
| 944 | if (err) { | ||
| 945 | mlx5_ib_err(dev, "UMR post send failed, err %d\n", err); | ||
| 946 | } else { | ||
| 947 | wait_for_completion(&umr_context.done); | ||
| 948 | if (umr_context.status != IB_WC_SUCCESS) { | ||
| 949 | mlx5_ib_err(dev, "UMR completion failed, code %d\n", | ||
| 950 | umr_context.status); | ||
| 951 | err = -EFAULT; | ||
| 952 | } | ||
| 953 | } | ||
| 954 | up(&umrc->sem); | ||
| 955 | } | ||
| 956 | dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); | ||
| 957 | |||
| 958 | free_pas: | ||
| 959 | if (!use_emergency_buf) | ||
| 960 | free_page((unsigned long)pas); | ||
| 961 | else | ||
| 962 | mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex); | ||
| 963 | |||
| 964 | return err; | ||
| 965 | } | ||
| 966 | #endif | ||
| 967 | |||
| 818 | static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, | 968 | static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, |
| 819 | u64 length, struct ib_umem *umem, | 969 | u64 length, struct ib_umem *umem, |
| 820 | int npages, int page_shift, | 970 | int npages, int page_shift, |
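mlx5_ib_update_mtt() above has to post MTT updates in units of MLX5_UMR_MTT_ALIGNMENT bytes, so it first rounds the starting page index down and the page count up to that granularity, then walks the range in pages_iter-sized chunks. The alignment arithmetic in isolation, with the granularity reduced to a made-up 64 entries for the example:

/* The alignment step of mlx5_ib_update_mtt() in isolation: round the
 * start index down and the page count up to the MTT copy granularity,
 * then walk the range in fixed-size chunks. The 64-entry granularity is
 * an assumption for the example, not the real MLX5_UMR_MTT_ALIGNMENT. */
#include <stddef.h>
#include <stdio.h>

#define MTT_ENTRIES_PER_CHUNK 64   /* copy granularity in MTT entries */

#define ALIGN_UP(x, a)   (((x) + (a) - 1) / (a) * (a))

int main(void)
{
	size_t start = 70, npages = 10;   /* unaligned request */
	size_t mask = MTT_ENTRIES_PER_CHUNK - 1;

	/* Same adjustment as the driver: absorb the misalignment into the
	 * page count, then round the count up to whole chunks. */
	if (start & mask) {
		npages += start & mask;
		start &= ~mask;
	}
	npages = ALIGN_UP(npages, MTT_ENTRIES_PER_CHUNK);

	for (size_t done = 0; done < npages; done += MTT_ENTRIES_PER_CHUNK)
		printf("update pages [%zu, %zu)\n", start + done,
		       start + done + MTT_ENTRIES_PER_CHUNK);
	return 0;
}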
| @@ -825,6 +975,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, | |||
| 825 | struct mlx5_ib_mr *mr; | 975 | struct mlx5_ib_mr *mr; |
| 826 | int inlen; | 976 | int inlen; |
| 827 | int err; | 977 | int err; |
| 978 | bool pg_cap = !!(dev->mdev->caps.gen.flags & | ||
| 979 | MLX5_DEV_CAP_FLAG_ON_DMND_PG); | ||
| 828 | 980 | ||
| 829 | mr = kzalloc(sizeof(*mr), GFP_KERNEL); | 981 | mr = kzalloc(sizeof(*mr), GFP_KERNEL); |
| 830 | if (!mr) | 982 | if (!mr) |
| @@ -836,8 +988,12 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, | |||
| 836 | err = -ENOMEM; | 988 | err = -ENOMEM; |
| 837 | goto err_1; | 989 | goto err_1; |
| 838 | } | 990 | } |
| 839 | mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, 0); | 991 | mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, |
| 992 | pg_cap ? MLX5_IB_MTT_PRESENT : 0); | ||
| 840 | 993 | ||
| 994 | /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags | ||
| 995 | * in the page list submitted with the command. */ | ||
| 996 | in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0; | ||
| 841 | in->seg.flags = convert_access(access_flags) | | 997 | in->seg.flags = convert_access(access_flags) | |
| 842 | MLX5_ACCESS_MODE_MTT; | 998 | MLX5_ACCESS_MODE_MTT; |
| 843 | in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); | 999 | in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); |
| @@ -856,6 +1012,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, | |||
| 856 | goto err_2; | 1012 | goto err_2; |
| 857 | } | 1013 | } |
| 858 | mr->umem = umem; | 1014 | mr->umem = umem; |
| 1015 | mr->live = 1; | ||
| 859 | kvfree(in); | 1016 | kvfree(in); |
| 860 | 1017 | ||
| 861 | mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); | 1018 | mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key); |
| @@ -910,6 +1067,10 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
| 910 | mlx5_ib_dbg(dev, "cache empty for order %d", order); | 1067 | mlx5_ib_dbg(dev, "cache empty for order %d", order); |
| 911 | mr = NULL; | 1068 | mr = NULL; |
| 912 | } | 1069 | } |
| 1070 | } else if (access_flags & IB_ACCESS_ON_DEMAND) { | ||
| 1071 | err = -EINVAL; | ||
| 1072 | pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB"); | ||
| 1073 | goto error; | ||
| 913 | } | 1074 | } |
| 914 | 1075 | ||
| 915 | if (!mr) | 1076 | if (!mr) |
| @@ -925,16 +1086,51 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
| 925 | 1086 | ||
| 926 | mr->umem = umem; | 1087 | mr->umem = umem; |
| 927 | mr->npages = npages; | 1088 | mr->npages = npages; |
| 928 | spin_lock(&dev->mr_lock); | 1089 | atomic_add(npages, &dev->mdev->priv.reg_pages); |
| 929 | dev->mdev->priv.reg_pages += npages; | ||
| 930 | spin_unlock(&dev->mr_lock); | ||
| 931 | mr->ibmr.lkey = mr->mmr.key; | 1090 | mr->ibmr.lkey = mr->mmr.key; |
| 932 | mr->ibmr.rkey = mr->mmr.key; | 1091 | mr->ibmr.rkey = mr->mmr.key; |
| 933 | 1092 | ||
| 1093 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 1094 | if (umem->odp_data) { | ||
| 1095 | /* | ||
| 1096 | * This barrier prevents the compiler from moving the | ||
| 1097 | * setting of umem->odp_data->private to point to our | ||
| 1098 | * MR, before reg_umr finished, to ensure that the MR | ||
| 1099 | * initialization have finished before starting to | ||
| 1100 | * handle invalidations. | ||
| 1101 | */ | ||
| 1102 | smp_wmb(); | ||
| 1103 | mr->umem->odp_data->private = mr; | ||
| 1104 | /* | ||
| 1105 | * Make sure we will see the new | ||
| 1106 | * umem->odp_data->private value in the invalidation | ||
| 1107 | * routines, before we can get page faults on the | ||
| 1108 | * MR. Page faults can happen once we put the MR in | ||
| 1109 | * the tree, below this line. Without the barrier, | ||
| 1110 | * there can be a fault handling and an invalidation | ||
| 1111 | * before umem->odp_data->private == mr is visible to | ||
| 1112 | * the invalidation handler. | ||
| 1113 | */ | ||
| 1114 | smp_wmb(); | ||
| 1115 | } | ||
| 1116 | #endif | ||
| 1117 | |||
| 934 | return &mr->ibmr; | 1118 | return &mr->ibmr; |
| 935 | 1119 | ||
| 936 | error: | 1120 | error: |
| 1121 | /* | ||
| 1122 | * Destroy the umem *before* destroying the MR, to ensure we | ||
| 1123 | * will not have any in-flight notifiers when destroying the | ||
| 1124 | * MR. | ||
| 1125 | * | ||
| 1126 | * As the MR is completely invalid to begin with, and this | ||
| 1127 | * error path is only taken if we can't push the mr entry into | ||
| 1128 | * the pagefault tree, this is safe. | ||
| 1129 | */ | ||
| 1130 | |||
| 937 | ib_umem_release(umem); | 1131 | ib_umem_release(umem); |
| 1132 | /* Kill the MR, and return an error code. */ | ||
| 1133 | clean_mr(mr); | ||
| 938 | return ERR_PTR(err); | 1134 | return ERR_PTR(err); |
| 939 | } | 1135 | } |
| 940 | 1136 | ||
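The reg_user_mr() hunk above publishes the new MR to the ODP machinery in two ordered steps: finish initialising the MR, then set umem->odp_data->private, with write barriers so a page fault or invalidation running concurrently never observes a half-built MR. Below is a user-space analogue of that publish ordering using C11 release/acquire semantics; the kernel uses smp_wmb(), and the types and names here are stand-ins.

/* User-space analogue of the publish ordering in reg_user_mr(): fully
 * initialise an object, then publish the pointer with release semantics
 * so a concurrent reader that loads it with acquire semantics sees a
 * completely initialised object. Types and names are stand-ins. */
#include <stdatomic.h>
#include <stdio.h>

struct mr {
	int key;   /* pretend hardware key, set before publishing */
};

static struct mr the_mr;
static _Atomic(struct mr *) published;   /* plays the role of odp_data->private */

static void register_mr(void)
{
	the_mr.key = 0x1234;   /* initialise the MR first */
	atomic_store_explicit(&published, &the_mr,
			      memory_order_release);   /* then publish it */
}

static void fault_handler(void)
{
	struct mr *mr = atomic_load_explicit(&published, memory_order_acquire);

	if (!mr) {
		puts("no MR published yet, nothing to do");
		return;
	}
	/* Acquire pairs with the release above: key is guaranteed valid. */
	printf("handling fault against MR key 0x%x\n", mr->key);
}

int main(void)
{
	fault_handler();   /* before publication */
	register_mr();
	fault_handler();   /* after publication */
	return 0;
}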
| @@ -971,17 +1167,14 @@ error: | |||
| 971 | return err; | 1167 | return err; |
| 972 | } | 1168 | } |
| 973 | 1169 | ||
| 974 | int mlx5_ib_dereg_mr(struct ib_mr *ibmr) | 1170 | static int clean_mr(struct mlx5_ib_mr *mr) |
| 975 | { | 1171 | { |
| 976 | struct mlx5_ib_dev *dev = to_mdev(ibmr->device); | 1172 | struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); |
| 977 | struct mlx5_ib_mr *mr = to_mmr(ibmr); | ||
| 978 | struct ib_umem *umem = mr->umem; | ||
| 979 | int npages = mr->npages; | ||
| 980 | int umred = mr->umred; | 1173 | int umred = mr->umred; |
| 981 | int err; | 1174 | int err; |
| 982 | 1175 | ||
| 983 | if (!umred) { | 1176 | if (!umred) { |
| 984 | err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); | 1177 | err = destroy_mkey(dev, mr); |
| 985 | if (err) { | 1178 | if (err) { |
| 986 | mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", | 1179 | mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", |
| 987 | mr->mmr.key, err); | 1180 | mr->mmr.key, err); |
| @@ -996,15 +1189,47 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) | |||
| 996 | free_cached_mr(dev, mr); | 1189 | free_cached_mr(dev, mr); |
| 997 | } | 1190 | } |
| 998 | 1191 | ||
| 999 | if (umem) { | 1192 | if (!umred) |
| 1193 | kfree(mr); | ||
| 1194 | |||
| 1195 | return 0; | ||
| 1196 | } | ||
| 1197 | |||
| 1198 | int mlx5_ib_dereg_mr(struct ib_mr *ibmr) | ||
| 1199 | { | ||
| 1200 | struct mlx5_ib_dev *dev = to_mdev(ibmr->device); | ||
| 1201 | struct mlx5_ib_mr *mr = to_mmr(ibmr); | ||
| 1202 | int npages = mr->npages; | ||
| 1203 | struct ib_umem *umem = mr->umem; | ||
| 1204 | |||
| 1205 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 1206 | if (umem && umem->odp_data) { | ||
| 1207 | /* Prevent new page faults from succeeding */ | ||
| 1208 | mr->live = 0; | ||
| 1209 | /* Wait for all running page-fault handlers to finish. */ | ||
| 1210 | synchronize_srcu(&dev->mr_srcu); | ||
| 1211 | /* Destroy all page mappings */ | ||
| 1212 | mlx5_ib_invalidate_range(umem, ib_umem_start(umem), | ||
| 1213 | ib_umem_end(umem)); | ||
| 1214 | /* | ||
| 1215 | * We kill the umem before the MR for ODP, | ||
| 1216 | * so that there will not be any invalidations in | ||
| 1217 | * flight, looking at the *mr struct. | ||
| 1218 | */ | ||
| 1000 | ib_umem_release(umem); | 1219 | ib_umem_release(umem); |
| 1001 | spin_lock(&dev->mr_lock); | 1220 | atomic_sub(npages, &dev->mdev->priv.reg_pages); |
| 1002 | dev->mdev->priv.reg_pages -= npages; | 1221 | |
| 1003 | spin_unlock(&dev->mr_lock); | 1222 | /* Avoid double-freeing the umem. */ |
| 1223 | umem = NULL; | ||
| 1004 | } | 1224 | } |
| 1225 | #endif | ||
| 1005 | 1226 | ||
| 1006 | if (!umred) | 1227 | clean_mr(mr); |
| 1007 | kfree(mr); | 1228 | |
| 1229 | if (umem) { | ||
| 1230 | ib_umem_release(umem); | ||
| 1231 | atomic_sub(npages, &dev->mdev->priv.reg_pages); | ||
| 1232 | } | ||
| 1008 | 1233 | ||
| 1009 | return 0; | 1234 | return 0; |
| 1010 | } | 1235 | } |
| @@ -1028,7 +1253,7 @@ struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd, | |||
| 1028 | goto err_free; | 1253 | goto err_free; |
| 1029 | } | 1254 | } |
| 1030 | 1255 | ||
| 1031 | in->seg.status = 1 << 6; /* free */ | 1256 | in->seg.status = MLX5_MKEY_STATUS_FREE; |
| 1032 | in->seg.xlt_oct_size = cpu_to_be32(ndescs); | 1257 | in->seg.xlt_oct_size = cpu_to_be32(ndescs); |
| 1033 | in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); | 1258 | in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); |
| 1034 | in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); | 1259 | in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn); |
| @@ -1113,7 +1338,7 @@ int mlx5_ib_destroy_mr(struct ib_mr *ibmr) | |||
| 1113 | kfree(mr->sig); | 1338 | kfree(mr->sig); |
| 1114 | } | 1339 | } |
| 1115 | 1340 | ||
| 1116 | err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr); | 1341 | err = destroy_mkey(dev, mr); |
| 1117 | if (err) { | 1342 | if (err) { |
| 1118 | mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", | 1343 | mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", |
| 1119 | mr->mmr.key, err); | 1344 | mr->mmr.key, err); |
| @@ -1143,7 +1368,7 @@ struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, | |||
| 1143 | goto err_free; | 1368 | goto err_free; |
| 1144 | } | 1369 | } |
| 1145 | 1370 | ||
| 1146 | in->seg.status = 1 << 6; /* free */ | 1371 | in->seg.status = MLX5_MKEY_STATUS_FREE; |
| 1147 | in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2); | 1372 | in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2); |
| 1148 | in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); | 1373 | in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); |
| 1149 | in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; | 1374 | in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT; |
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c new file mode 100644 index 000000000000..a2c541c4809a --- /dev/null +++ b/drivers/infiniband/hw/mlx5/odp.c | |||
| @@ -0,0 +1,798 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2014 Mellanox Technologies. All rights reserved. | ||
| 3 | * | ||
| 4 | * This software is available to you under a choice of one of two | ||
| 5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
| 6 | * General Public License (GPL) Version 2, available from the file | ||
| 7 | * COPYING in the main directory of this source tree, or the | ||
| 8 | * OpenIB.org BSD license below: | ||
| 9 | * | ||
| 10 | * Redistribution and use in source and binary forms, with or | ||
| 11 | * without modification, are permitted provided that the following | ||
| 12 | * conditions are met: | ||
| 13 | * | ||
| 14 | * - Redistributions of source code must retain the above | ||
| 15 | * copyright notice, this list of conditions and the following | ||
| 16 | * disclaimer. | ||
| 17 | * | ||
| 18 | * - Redistributions in binary form must reproduce the above | ||
| 19 | * copyright notice, this list of conditions and the following | ||
| 20 | * disclaimer in the documentation and/or other materials | ||
| 21 | * provided with the distribution. | ||
| 22 | * | ||
| 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
| 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
| 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
| 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
| 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 30 | * SOFTWARE. | ||
| 31 | */ | ||
| 32 | |||
| 33 | #include <rdma/ib_umem.h> | ||
| 34 | #include <rdma/ib_umem_odp.h> | ||
| 35 | |||
| 36 | #include "mlx5_ib.h" | ||
| 37 | |||
| 38 | #define MAX_PREFETCH_LEN (4*1024*1024U) | ||
| 39 | |||
| 40 | /* Timeout in ms to wait for an active mmu notifier to complete when handling | ||
| 41 | * a pagefault. */ | ||
| 42 | #define MMU_NOTIFIER_TIMEOUT 1000 | ||
| 43 | |||
| 44 | struct workqueue_struct *mlx5_ib_page_fault_wq; | ||
| 45 | |||
| 46 | void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, | ||
| 47 | unsigned long end) | ||
| 48 | { | ||
| 49 | struct mlx5_ib_mr *mr; | ||
| 50 | const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1; | ||
| 51 | u64 idx = 0, blk_start_idx = 0; | ||
| 52 | int in_block = 0; | ||
| 53 | u64 addr; | ||
| 54 | |||
| 55 | if (!umem || !umem->odp_data) { | ||
| 56 | pr_err("invalidation called on NULL umem or non-ODP umem\n"); | ||
| 57 | return; | ||
| 58 | } | ||
| 59 | |||
| 60 | mr = umem->odp_data->private; | ||
| 61 | |||
| 62 | if (!mr || !mr->ibmr.pd) | ||
| 63 | return; | ||
| 64 | |||
| 65 | start = max_t(u64, ib_umem_start(umem), start); | ||
| 66 | end = min_t(u64, ib_umem_end(umem), end); | ||
| 67 | |||
| 68 | /* | ||
| 69 | * Iteration one - zap the HW's MTTs. The notifiers_count ensures that | ||
| 70 | * while we are doing the invalidation, no page fault will attempt to | ||
| 71 | * overwrite the same MTTs. Concurrent invalidations might race us, | ||
| 72 | * but they will write 0s as well, so no difference in the end result. | ||
| 73 | */ | ||
| 74 | |||
| 75 | for (addr = start; addr < end; addr += (u64)umem->page_size) { | ||
| 76 | idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; | ||
| 77 | /* | ||
| 78 | * Strive to write the MTTs in chunks, but avoid overwriting | ||
| 79 | * non-existing MTTs. The heuristic here can be improved to | ||
| 80 | * estimate the cost of another UMR vs. the cost of bigger | ||
| 81 | * UMR. | ||
| 82 | */ | ||
| 83 | if (umem->odp_data->dma_list[idx] & | ||
| 84 | (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { | ||
| 85 | if (!in_block) { | ||
| 86 | blk_start_idx = idx; | ||
| 87 | in_block = 1; | ||
| 88 | } | ||
| 89 | } else { | ||
| 90 | u64 umr_offset = idx & umr_block_mask; | ||
| 91 | |||
| 92 | if (in_block && umr_offset == 0) { | ||
| 93 | mlx5_ib_update_mtt(mr, blk_start_idx, | ||
| 94 | idx - blk_start_idx, 1); | ||
| 95 | in_block = 0; | ||
| 96 | } | ||
| 97 | } | ||
| 98 | } | ||
| 99 | if (in_block) | ||
| 100 | mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1, | ||
| 101 | 1); | ||
| 102 | |||
| 103 | /* | ||
| 104 | * We are now sure that the device will not access the | ||
| 105 | * memory. We can safely unmap it, and mark it as dirty if | ||
| 106 | * needed. | ||
| 107 | */ | ||
| 108 | |||
| 109 | ib_umem_odp_unmap_dma_pages(umem, start, end); | ||
| 110 | } | ||
| 111 | |||
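The invalidation loop above coalesces runs of present page-table entries and only ends a run when a hole lands on a UMR block boundary, accepting a few redundant zero writes in exchange for fewer UMR posts. Below is a stand-alone sketch of that coalescing heuristic; UMR_BLOCK and zap_range() are made-up stand-ins for the MLX5 alignment constant and mlx5_ib_update_mtt().

#include <stdio.h>

#define UMR_BLOCK 8     /* entries per chunk; stands in for
                         * MLX5_UMR_MTT_ALIGNMENT / sizeof(u64) */

/* Hypothetical stand-in for mlx5_ib_update_mtt(mr, start, npages, 1). */
static void zap_range(int start, int npages)
{
        printf("zap MTTs [%d, %d)\n", start, start + npages);
}

/* Coalesce runs of present entries and zap them in chunks, ending a run
 * only when a hole falls on a block boundary -- a few absent entries get
 * redundantly rewritten as zero, in exchange for fewer UMR posts. */
static void zap_present(const int *present, int n)
{
        int blk_start = 0, in_block = 0, idx;

        for (idx = 0; idx < n; idx++) {
                if (present[idx]) {
                        if (!in_block) {
                                blk_start = idx;
                                in_block = 1;
                        }
                } else if (in_block && (idx % UMR_BLOCK) == 0) {
                        zap_range(blk_start, idx - blk_start);
                        in_block = 0;
                }
        }
        if (in_block)   /* flush whatever run is still open */
                zap_range(blk_start, n - blk_start);
}

int main(void)
{
        int present[20] = { 0, 1, 1, 1, 0, 0, 0, 0,  /* hole ends on a block edge */
                            0, 1, 1, 0, 1, 1, 1, 1,  /* hole inside a block       */
                            1, 1, 0, 0 };

        zap_present(present, 20);       /* -> [1, 8) and [9, 20) */
        return 0;
}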
| 112 | #define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do { \ | ||
| 113 | if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name) \ | ||
| 114 | ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name; \ | ||
| 115 | } while (0) | ||
| 116 | |||
| 117 | int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev) | ||
| 118 | { | ||
| 119 | int err; | ||
| 120 | struct mlx5_odp_caps hw_caps; | ||
| 121 | struct ib_odp_caps *caps = &dev->odp_caps; | ||
| 122 | |||
| 123 | memset(caps, 0, sizeof(*caps)); | ||
| 124 | |||
| 125 | if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)) | ||
| 126 | return 0; | ||
| 127 | |||
| 128 | err = mlx5_query_odp_caps(dev->mdev, &hw_caps); | ||
| 129 | if (err) | ||
| 130 | goto out; | ||
| 131 | |||
| 132 | caps->general_caps = IB_ODP_SUPPORT; | ||
| 133 | COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps, | ||
| 134 | SEND); | ||
| 135 | COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, | ||
| 136 | SEND); | ||
| 137 | COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, | ||
| 138 | RECV); | ||
| 139 | COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, | ||
| 140 | WRITE); | ||
| 141 | COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps, | ||
| 142 | READ); | ||
| 143 | |||
| 144 | out: | ||
| 145 | return err; | ||
| 146 | } | ||
| 147 | |||
| 148 | static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, | ||
| 149 | u32 key) | ||
| 150 | { | ||
| 151 | u32 base_key = mlx5_base_mkey(key); | ||
| 152 | struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key); | ||
| 153 | struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr); | ||
| 154 | |||
| 155 | if (!mmr || mmr->key != key || !mr->live) | ||
| 156 | return NULL; | ||
| 157 | |||
| 158 | return container_of(mmr, struct mlx5_ib_mr, mmr); | ||
| 159 | } | ||
| 160 | |||
| 161 | static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, | ||
| 162 | struct mlx5_ib_pfault *pfault, | ||
| 163 | int error) { | ||
| 164 | struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); | ||
| 165 | int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn, | ||
| 166 | pfault->mpfault.flags, | ||
| 167 | error); | ||
| 168 | if (ret) | ||
| 169 | pr_err("Failed to resolve the page fault on QP 0x%x\n", | ||
| 170 | qp->mqp.qpn); | ||
| 171 | } | ||
| 172 | |||
| 173 | /* | ||
| 174 | * Handle a single data segment in a page-fault WQE. | ||
| 175 | * | ||
| 176 | * Returns number of pages retrieved on success. The caller will continue to | ||
| 177 | * the next data segment. | ||
| 178 | * Can return the following error codes: | ||
| 179 | * -EAGAIN to designate a temporary error. The caller will abort handling the | ||
| 180 | * page fault and resolve it. | ||
| 181 | * -EFAULT when there's an error mapping the requested pages. The caller will | ||
| 182 | * abort the page fault handling and possibly move the QP to an error state. | ||
| 183 | * On other errors the QP should also be closed with an error. | ||
| 184 | */ | ||
| 185 | static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, | ||
| 186 | struct mlx5_ib_pfault *pfault, | ||
| 187 | u32 key, u64 io_virt, size_t bcnt, | ||
| 188 | u32 *bytes_mapped) | ||
| 189 | { | ||
| 190 | struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device); | ||
| 191 | int srcu_key; | ||
| 192 | unsigned int current_seq; | ||
| 193 | u64 start_idx; | ||
| 194 | int npages = 0, ret = 0; | ||
| 195 | struct mlx5_ib_mr *mr; | ||
| 196 | u64 access_mask = ODP_READ_ALLOWED_BIT; | ||
| 197 | |||
| 198 | srcu_key = srcu_read_lock(&mib_dev->mr_srcu); | ||
| 199 | mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key); | ||
| 200 | /* | ||
| 201 | * If we didn't find the MR, it means the MR was closed while we were | ||
| 202 | * handling the ODP event. In this case we return -EFAULT so that the | ||
| 203 | * QP will be closed. | ||
| 204 | */ | ||
| 205 | if (!mr || !mr->ibmr.pd) { | ||
| 206 | pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n", | ||
| 207 | key); | ||
| 208 | ret = -EFAULT; | ||
| 209 | goto srcu_unlock; | ||
| 210 | } | ||
| 211 | if (!mr->umem->odp_data) { | ||
| 212 | pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", | ||
| 213 | key); | ||
| 214 | if (bytes_mapped) | ||
| 215 | *bytes_mapped += | ||
| 216 | (bcnt - pfault->mpfault.bytes_committed); | ||
| 217 | goto srcu_unlock; | ||
| 218 | } | ||
| 219 | if (mr->ibmr.pd != qp->ibqp.pd) { | ||
| 220 | pr_err("Page-fault with different PDs for QP and MR.\n"); | ||
| 221 | ret = -EFAULT; | ||
| 222 | goto srcu_unlock; | ||
| 223 | } | ||
| 224 | |||
| 225 | current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq); | ||
| 226 | /* | ||
| 227 | * Ensure the sequence number is valid for some time before we call | ||
| 228 | * gup. | ||
| 229 | */ | ||
| 230 | smp_rmb(); | ||
| 231 | |||
| 232 | /* | ||
| 233 | * Avoid branches - this code will perform correctly | ||
| 234 | * in all iterations (in iteration 2 and above, | ||
| 235 | * bytes_committed == 0). | ||
| 236 | */ | ||
| 237 | io_virt += pfault->mpfault.bytes_committed; | ||
| 238 | bcnt -= pfault->mpfault.bytes_committed; | ||
| 239 | |||
| 240 | start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT; | ||
| 241 | |||
| 242 | if (mr->umem->writable) | ||
| 243 | access_mask |= ODP_WRITE_ALLOWED_BIT; | ||
| 244 | npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt, | ||
| 245 | access_mask, current_seq); | ||
| 246 | if (npages < 0) { | ||
| 247 | ret = npages; | ||
| 248 | goto srcu_unlock; | ||
| 249 | } | ||
| 250 | |||
| 251 | if (npages > 0) { | ||
| 252 | mutex_lock(&mr->umem->odp_data->umem_mutex); | ||
| 253 | if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { | ||
| 254 | /* | ||
| 255 | * No need to check whether the MTTs really belong to | ||
| 256 | * this MR, since ib_umem_odp_map_dma_pages already | ||
| 257 | * checks this. | ||
| 258 | */ | ||
| 259 | ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0); | ||
| 260 | } else { | ||
| 261 | ret = -EAGAIN; | ||
| 262 | } | ||
| 263 | mutex_unlock(&mr->umem->odp_data->umem_mutex); | ||
| 264 | if (ret < 0) { | ||
| 265 | if (ret != -EAGAIN) | ||
| 266 | pr_err("Failed to update mkey page tables\n"); | ||
| 267 | goto srcu_unlock; | ||
| 268 | } | ||
| 269 | |||
| 270 | if (bytes_mapped) { | ||
| 271 | u32 new_mappings = npages * PAGE_SIZE - | ||
| 272 | (io_virt - round_down(io_virt, PAGE_SIZE)); | ||
| 273 | *bytes_mapped += min_t(u32, new_mappings, bcnt); | ||
| 274 | } | ||
| 275 | } | ||
| 276 | |||
| 277 | srcu_unlock: | ||
| 278 | if (ret == -EAGAIN) { | ||
| 279 | if (!mr->umem->odp_data->dying) { | ||
| 280 | struct ib_umem_odp *odp_data = mr->umem->odp_data; | ||
| 281 | unsigned long timeout = | ||
| 282 | msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); | ||
| 283 | |||
| 284 | if (!wait_for_completion_timeout( | ||
| 285 | &odp_data->notifier_completion, | ||
| 286 | timeout)) { | ||
| 287 | pr_warn("timeout waiting for mmu notifier completion\n"); | ||
| 288 | } | ||
| 289 | } else { | ||
| 290 | /* The MR is being killed, kill the QP as well. */ | ||
| 291 | ret = -EFAULT; | ||
| 292 | } | ||
| 293 | } | ||
| 294 | srcu_read_unlock(&mib_dev->mr_srcu, srcu_key); | ||
| 295 | pfault->mpfault.bytes_committed = 0; | ||
| 296 | return ret ? ret : npages; | ||
| 297 | } | ||
| 298 | |||
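The sequence-number dance above (sample notifiers_seq, smp_rmb(), map the pages, then re-check under the umem mutex with ib_umem_mmu_notifier_retry()) is the usual MMU-notifier race check: if an invalidation started or completed since the sample, the freshly built mapping may already be stale and the fault must be retried. Here is a rough user-space sketch of that check; the notifiers_seq/notifiers_count counters and handle_fault() are hypothetical stand-ins for the ODP umem fields and the handler above.

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

/* Hypothetical stand-ins for the ODP umem notifier bookkeeping. */
static atomic_uint notifiers_seq;       /* bumped when an invalidation ends */
static atomic_uint notifiers_count;     /* non-zero while one is running    */

/* Rough equivalent of the ib_umem_mmu_notifier_retry() check: the pages
 * just mapped are suspect if an invalidation is in flight, or if one
 * completed since the sequence number was sampled. */
static int mapping_is_stale(unsigned int sampled_seq)
{
        if (atomic_load(&notifiers_count))
                return 1;
        return atomic_load(&notifiers_seq) != sampled_seq;
}

static int handle_fault(int race_with_invalidation)
{
        unsigned int seq = atomic_load(&notifiers_seq);

        /* ... get_user_pages() and DMA mapping would happen here ... */
        if (race_with_invalidation)
                atomic_fetch_add(&notifiers_seq, 1);    /* simulated race */

        if (mapping_is_stale(seq))
                return -EAGAIN; /* caller waits for the notifier
                                 * completion and retries */

        /* ... only now is it safe to hand the new MTTs to the HCA ... */
        return 0;
}

int main(void)
{
        printf("no race:   %d\n", handle_fault(0));     /* 0       */
        printf("with race: %d\n", handle_fault(1));     /* -EAGAIN */
        return 0;
}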
| 299 | /** | ||
| 300 | * Parse a series of data segments for page fault handling. | ||
| 301 | * | ||
| 302 | * @qp the QP on which the fault occurred. | ||
| 303 | * @pfault contains page fault information. | ||
| 304 | * @wqe points at the first data segment in the WQE. | ||
| 305 | * @wqe_end points after the end of the WQE. | ||
| 306 | * @bytes_mapped receives the number of bytes that the function was able to | ||
| 307 | * map. This allows the caller to decide intelligently whether | ||
| 308 | * enough memory was mapped to resolve the page fault | ||
| 309 | * successfully (e.g. enough for the next MTU, or the entire | ||
| 310 | * WQE). | ||
| 311 | * @total_wqe_bytes receives the total data size of this WQE in bytes (minus | ||
| 312 | * the committed bytes). | ||
| 313 | * | ||
| 314 | * Returns the number of pages loaded if positive, zero for an empty WQE, or a | ||
| 315 | * negative error code. | ||
| 316 | */ | ||
| 317 | static int pagefault_data_segments(struct mlx5_ib_qp *qp, | ||
| 318 | struct mlx5_ib_pfault *pfault, void *wqe, | ||
| 319 | void *wqe_end, u32 *bytes_mapped, | ||
| 320 | u32 *total_wqe_bytes, int receive_queue) | ||
| 321 | { | ||
| 322 | int ret = 0, npages = 0; | ||
| 323 | u64 io_virt; | ||
| 324 | u32 key; | ||
| 325 | u32 byte_count; | ||
| 326 | size_t bcnt; | ||
| 327 | int inline_segment; | ||
| 328 | |||
| 329 | /* Skip SRQ next-WQE segment. */ | ||
| 330 | if (receive_queue && qp->ibqp.srq) | ||
| 331 | wqe += sizeof(struct mlx5_wqe_srq_next_seg); | ||
| 332 | |||
| 333 | if (bytes_mapped) | ||
| 334 | *bytes_mapped = 0; | ||
| 335 | if (total_wqe_bytes) | ||
| 336 | *total_wqe_bytes = 0; | ||
| 337 | |||
| 338 | while (wqe < wqe_end) { | ||
| 339 | struct mlx5_wqe_data_seg *dseg = wqe; | ||
| 340 | |||
| 341 | io_virt = be64_to_cpu(dseg->addr); | ||
| 342 | key = be32_to_cpu(dseg->lkey); | ||
| 343 | byte_count = be32_to_cpu(dseg->byte_count); | ||
| 344 | inline_segment = !!(byte_count & MLX5_INLINE_SEG); | ||
| 345 | bcnt = byte_count & ~MLX5_INLINE_SEG; | ||
| 346 | |||
| 347 | if (inline_segment) { | ||
| 348 | bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK; | ||
| 349 | wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt, | ||
| 350 | 16); | ||
| 351 | } else { | ||
| 352 | wqe += sizeof(*dseg); | ||
| 353 | } | ||
| 354 | |||
| 355 | /* receive WQE end of sg list. */ | ||
| 356 | if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && | ||
| 357 | io_virt == 0) | ||
| 358 | break; | ||
| 359 | |||
| 360 | if (!inline_segment && total_wqe_bytes) { | ||
| 361 | *total_wqe_bytes += bcnt - min_t(size_t, bcnt, | ||
| 362 | pfault->mpfault.bytes_committed); | ||
| 363 | } | ||
| 364 | |||
| 365 | /* A zero length data segment designates a length of 2GB. */ | ||
| 366 | if (bcnt == 0) | ||
| 367 | bcnt = 1U << 31; | ||
| 368 | |||
| 369 | if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) { | ||
| 370 | pfault->mpfault.bytes_committed -= | ||
| 371 | min_t(size_t, bcnt, | ||
| 372 | pfault->mpfault.bytes_committed); | ||
| 373 | continue; | ||
| 374 | } | ||
| 375 | |||
| 376 | ret = pagefault_single_data_segment(qp, pfault, key, io_virt, | ||
| 377 | bcnt, bytes_mapped); | ||
| 378 | if (ret < 0) | ||
| 379 | break; | ||
| 380 | npages += ret; | ||
| 381 | } | ||
| 382 | |||
| 383 | return ret < 0 ? ret : npages; | ||
| 384 | } | ||
| 385 | |||
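pagefault_data_segments() above walks the WQE one data segment at a time, distinguishing inline segments (data embedded in the WQE, flagged in byte_count) from pointer segments, and treating a zero-length pointer segment as 2GB per the note in the code. The sketch below isolates just that byte_count decoding; INLINE_SEG assumes the flag sits in the top bit and stands in for MLX5_INLINE_SEG.

#include <stdint.h>
#include <stdio.h>

#define INLINE_SEG      0x80000000u     /* stands in for MLX5_INLINE_SEG */

/* Decode one data-segment byte_count word the way the parser above does:
 * strip the inline flag and treat a zero-length pointer segment as 2GB. */
static void decode_byte_count(uint32_t byte_count)
{
        int inline_seg = !!(byte_count & INLINE_SEG);
        uint64_t bcnt = byte_count & ~INLINE_SEG;

        if (!inline_seg && bcnt == 0)
                bcnt = 1u << 31;        /* a zero length designates 2GB */

        printf("inline=%d bytes=%llu\n", inline_seg,
               (unsigned long long)bcnt);
}

int main(void)
{
        decode_byte_count(0x1000);              /* plain 4KB segment  */
        decode_byte_count(0);                   /* 2GB by convention  */
        decode_byte_count(INLINE_SEG | 64);     /* 64B of inline data */
        return 0;
}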
| 386 | /* | ||
| 387 | * Parse initiator WQE. Advances the wqe pointer to point at the | ||
| 388 | * scatter-gather list, and sets wqe_end to the end of the WQE. | ||
| 389 | */ | ||
| 390 | static int mlx5_ib_mr_initiator_pfault_handler( | ||
| 391 | struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, | ||
| 392 | void **wqe, void **wqe_end, int wqe_length) | ||
| 393 | { | ||
| 394 | struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); | ||
| 395 | struct mlx5_wqe_ctrl_seg *ctrl = *wqe; | ||
| 396 | u16 wqe_index = pfault->mpfault.wqe.wqe_index; | ||
| 397 | unsigned ds, opcode; | ||
| 398 | #if defined(DEBUG) | ||
| 399 | u32 ctrl_wqe_index, ctrl_qpn; | ||
| 400 | #endif | ||
| 401 | |||
| 402 | ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; | ||
| 403 | if (ds * MLX5_WQE_DS_UNITS > wqe_length) { | ||
| 404 | mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n", | ||
| 405 | ds, wqe_length); | ||
| 406 | return -EFAULT; | ||
| 407 | } | ||
| 408 | |||
| 409 | if (ds == 0) { | ||
| 410 | mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n", | ||
| 411 | wqe_index, qp->mqp.qpn); | ||
| 412 | return -EFAULT; | ||
| 413 | } | ||
| 414 | |||
| 415 | #if defined(DEBUG) | ||
| 416 | ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) & | ||
| 417 | MLX5_WQE_CTRL_WQE_INDEX_MASK) >> | ||
| 418 | MLX5_WQE_CTRL_WQE_INDEX_SHIFT; | ||
| 419 | if (wqe_index != ctrl_wqe_index) { | ||
| 420 | mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n", | ||
| 421 | wqe_index, qp->mqp.qpn, | ||
| 422 | ctrl_wqe_index); | ||
| 423 | return -EFAULT; | ||
| 424 | } | ||
| 425 | |||
| 426 | ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >> | ||
| 427 | MLX5_WQE_CTRL_QPN_SHIFT; | ||
| 428 | if (qp->mqp.qpn != ctrl_qpn) { | ||
| 429 | mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n", | ||
| 430 | wqe_index, qp->mqp.qpn, | ||
| 431 | ctrl_qpn); | ||
| 432 | return -EFAULT; | ||
| 433 | } | ||
| 434 | #endif /* DEBUG */ | ||
| 435 | |||
| 436 | *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS; | ||
| 437 | *wqe += sizeof(*ctrl); | ||
| 438 | |||
| 439 | opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & | ||
| 440 | MLX5_WQE_CTRL_OPCODE_MASK; | ||
| 441 | switch (qp->ibqp.qp_type) { | ||
| 442 | case IB_QPT_RC: | ||
| 443 | switch (opcode) { | ||
| 444 | case MLX5_OPCODE_SEND: | ||
| 445 | case MLX5_OPCODE_SEND_IMM: | ||
| 446 | case MLX5_OPCODE_SEND_INVAL: | ||
| 447 | if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & | ||
| 448 | IB_ODP_SUPPORT_SEND)) | ||
| 449 | goto invalid_transport_or_opcode; | ||
| 450 | break; | ||
| 451 | case MLX5_OPCODE_RDMA_WRITE: | ||
| 452 | case MLX5_OPCODE_RDMA_WRITE_IMM: | ||
| 453 | if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & | ||
| 454 | IB_ODP_SUPPORT_WRITE)) | ||
| 455 | goto invalid_transport_or_opcode; | ||
| 456 | *wqe += sizeof(struct mlx5_wqe_raddr_seg); | ||
| 457 | break; | ||
| 458 | case MLX5_OPCODE_RDMA_READ: | ||
| 459 | if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & | ||
| 460 | IB_ODP_SUPPORT_READ)) | ||
| 461 | goto invalid_transport_or_opcode; | ||
| 462 | *wqe += sizeof(struct mlx5_wqe_raddr_seg); | ||
| 463 | break; | ||
| 464 | default: | ||
| 465 | goto invalid_transport_or_opcode; | ||
| 466 | } | ||
| 467 | break; | ||
| 468 | case IB_QPT_UD: | ||
| 469 | switch (opcode) { | ||
| 470 | case MLX5_OPCODE_SEND: | ||
| 471 | case MLX5_OPCODE_SEND_IMM: | ||
| 472 | if (!(dev->odp_caps.per_transport_caps.ud_odp_caps & | ||
| 473 | IB_ODP_SUPPORT_SEND)) | ||
| 474 | goto invalid_transport_or_opcode; | ||
| 475 | *wqe += sizeof(struct mlx5_wqe_datagram_seg); | ||
| 476 | break; | ||
| 477 | default: | ||
| 478 | goto invalid_transport_or_opcode; | ||
| 479 | } | ||
| 480 | break; | ||
| 481 | default: | ||
| 482 | invalid_transport_or_opcode: | ||
| 483 | mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n", | ||
| 484 | qp->ibqp.qp_type, opcode); | ||
| 485 | return -EFAULT; | ||
| 486 | } | ||
| 487 | |||
| 488 | return 0; | ||
| 489 | } | ||
| 490 | |||
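The initiator handler above trusts only two fields of the control segment before walking the rest of the WQE: the DS count (WQE size in 16-byte units) and the opcode, and it rejects WQEs whose advertised size is zero or exceeds what was actually read. A simplified sketch of that sanity check follows; the mask values are assumptions standing in for the real MLX5_WQE_CTRL_* constants, and the inputs are taken as already byte-swapped to host order.

#include <stdint.h>
#include <stdio.h>

/* Assumed layout for illustration: low 6 bits of qpn_ds carry the WQE
 * size in 16-byte units (DS), low byte of opmod_idx_opcode carries the
 * opcode. See the real MLX5_WQE_CTRL_* masks for the authoritative bits. */
#define CTRL_DS_MASK            0x3f
#define CTRL_OPCODE_MASK        0xff
#define WQE_DS_UNITS            16

static void parse_ctrl(uint32_t qpn_ds, uint32_t opmod_idx_opcode,
                       size_t wqe_bytes_read)
{
        unsigned int ds = qpn_ds & CTRL_DS_MASK;
        unsigned int opcode = opmod_idx_opcode & CTRL_OPCODE_MASK;

        if (ds == 0 || ds * WQE_DS_UNITS > wqe_bytes_read) {
                printf("malformed or truncated WQE (ds=%u)\n", ds);
                return;
        }
        printf("opcode 0x%x, WQE length %u bytes\n", opcode,
               ds * WQE_DS_UNITS);
}

int main(void)
{
        parse_ctrl(0x12345604, 0x0000000a, 256);        /* 4 DS = 64 bytes    */
        parse_ctrl(0x12345600, 0x0000000a, 256);        /* zero DS is invalid */
        return 0;
}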
| 491 | /* | ||
| 492 | * Parse responder WQE. Advances the wqe pointer to point at the | ||
| 493 | * scatter-gather list, and sets wqe_end to the end of the WQE. | ||
| 494 | */ | ||
| 495 | static int mlx5_ib_mr_responder_pfault_handler( | ||
| 496 | struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, | ||
| 497 | void **wqe, void **wqe_end, int wqe_length) | ||
| 498 | { | ||
| 499 | struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); | ||
| 500 | struct mlx5_ib_wq *wq = &qp->rq; | ||
| 501 | int wqe_size = 1 << wq->wqe_shift; | ||
| 502 | |||
| 503 | if (qp->ibqp.srq) { | ||
| 504 | mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n"); | ||
| 505 | return -EFAULT; | ||
| 506 | } | ||
| 507 | |||
| 508 | if (qp->wq_sig) { | ||
| 509 | mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n"); | ||
| 510 | return -EFAULT; | ||
| 511 | } | ||
| 512 | |||
| 513 | if (wqe_size > wqe_length) { | ||
| 514 | mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n"); | ||
| 515 | return -EFAULT; | ||
| 516 | } | ||
| 517 | |||
| 518 | switch (qp->ibqp.qp_type) { | ||
| 519 | case IB_QPT_RC: | ||
| 520 | if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & | ||
| 521 | IB_ODP_SUPPORT_RECV)) | ||
| 522 | goto invalid_transport_or_opcode; | ||
| 523 | break; | ||
| 524 | default: | ||
| 525 | invalid_transport_or_opcode: | ||
| 526 | mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n", | ||
| 527 | qp->ibqp.qp_type); | ||
| 528 | return -EFAULT; | ||
| 529 | } | ||
| 530 | |||
| 531 | *wqe_end = *wqe + wqe_size; | ||
| 532 | |||
| 533 | return 0; | ||
| 534 | } | ||
| 535 | |||
| 536 | static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, | ||
| 537 | struct mlx5_ib_pfault *pfault) | ||
| 538 | { | ||
| 539 | struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); | ||
| 540 | int ret; | ||
| 541 | void *wqe, *wqe_end; | ||
| 542 | u32 bytes_mapped, total_wqe_bytes; | ||
| 543 | char *buffer = NULL; | ||
| 544 | int resume_with_error = 0; | ||
| 545 | u16 wqe_index = pfault->mpfault.wqe.wqe_index; | ||
| 546 | int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR; | ||
| 547 | |||
| 548 | buffer = (char *)__get_free_page(GFP_KERNEL); | ||
| 549 | if (!buffer) { | ||
| 550 | mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); | ||
| 551 | resume_with_error = 1; | ||
| 552 | goto resolve_page_fault; | ||
| 553 | } | ||
| 554 | |||
| 555 | ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, | ||
| 556 | PAGE_SIZE); | ||
| 557 | if (ret < 0) { | ||
| 558 | mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n", | ||
| 559 | -ret, wqe_index, qp->mqp.qpn); | ||
| 560 | resume_with_error = 1; | ||
| 561 | goto resolve_page_fault; | ||
| 562 | } | ||
| 563 | |||
| 564 | wqe = buffer; | ||
| 565 | if (requestor) | ||
| 566 | ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe, | ||
| 567 | &wqe_end, ret); | ||
| 568 | else | ||
| 569 | ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe, | ||
| 570 | &wqe_end, ret); | ||
| 571 | if (ret < 0) { | ||
| 572 | resume_with_error = 1; | ||
| 573 | goto resolve_page_fault; | ||
| 574 | } | ||
| 575 | |||
| 576 | if (wqe >= wqe_end) { | ||
| 577 | mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); | ||
| 578 | resume_with_error = 1; | ||
| 579 | goto resolve_page_fault; | ||
| 580 | } | ||
| 581 | |||
| 582 | ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped, | ||
| 583 | &total_wqe_bytes, !requestor); | ||
| 584 | if (ret == -EAGAIN) { | ||
| 585 | goto resolve_page_fault; | ||
| 586 | } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { | ||
| 587 | mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n", | ||
| 588 | -ret); | ||
| 589 | resume_with_error = 1; | ||
| 590 | goto resolve_page_fault; | ||
| 591 | } | ||
| 592 | |||
| 593 | resolve_page_fault: | ||
| 594 | mlx5_ib_page_fault_resume(qp, pfault, resume_with_error); | ||
| 595 | mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n", | ||
| 596 | qp->mqp.qpn, resume_with_error, pfault->mpfault.flags); | ||
| 597 | |||
| 598 | free_page((unsigned long)buffer); | ||
| 599 | } | ||
| 600 | |||
| 601 | static int pages_in_range(u64 address, u32 length) | ||
| 602 | { | ||
| 603 | return (ALIGN(address + length, PAGE_SIZE) - | ||
| 604 | (address & PAGE_MASK)) >> PAGE_SHIFT; | ||
| 605 | } | ||
| 606 | |||
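pages_in_range() rounds the end of the byte range up and the start down to page boundaries before counting pages, so even a short access that straddles a boundary is charged two pages. The same arithmetic in a tiny self-contained form, assuming 4KB pages:

#include <stdio.h>

#define PAGE_SIZE 4096ull
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define ALIGN_UP(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)

/* Same arithmetic as pages_in_range(): round the end of the range up and
 * the start down to page boundaries, then count whole pages between them. */
static unsigned long long pages_in_range(unsigned long long addr,
                                         unsigned long long len)
{
        return (ALIGN_UP(addr + len) - (addr & PAGE_MASK)) / PAGE_SIZE;
}

int main(void)
{
        /* 10 bytes that straddle a page boundary still need two pages. */
        printf("%llu\n", pages_in_range(0x1ffb, 10));   /* -> 2 */
        printf("%llu\n", pages_in_range(0x2000, 4096)); /* -> 1 */
        return 0;
}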
| 607 | static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp, | ||
| 608 | struct mlx5_ib_pfault *pfault) | ||
| 609 | { | ||
| 610 | struct mlx5_pagefault *mpfault = &pfault->mpfault; | ||
| 611 | u64 address; | ||
| 612 | u32 length; | ||
| 613 | u32 prefetch_len = mpfault->bytes_committed; | ||
| 614 | int prefetch_activated = 0; | ||
| 615 | u32 rkey = mpfault->rdma.r_key; | ||
| 616 | int ret; | ||
| 617 | |||
| 618 | /* The RDMA responder handler handles the page fault in two parts. | ||
| 619 | * First it brings the necessary pages for the current packet | ||
| 620 | * (and uses the pfault context), and then (after resuming the QP) | ||
| 621 | * prefetches more pages. The second operation cannot use the pfault | ||
| 622 | * context and therefore uses the dummy_pfault context allocated on | ||
| 623 | * the stack */ | ||
| 624 | struct mlx5_ib_pfault dummy_pfault = {}; | ||
| 625 | |||
| 626 | dummy_pfault.mpfault.bytes_committed = 0; | ||
| 627 | |||
| 628 | mpfault->rdma.rdma_va += mpfault->bytes_committed; | ||
| 629 | mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed, | ||
| 630 | mpfault->rdma.rdma_op_len); | ||
| 631 | mpfault->bytes_committed = 0; | ||
| 632 | |||
| 633 | address = mpfault->rdma.rdma_va; | ||
| 634 | length = mpfault->rdma.rdma_op_len; | ||
| 635 | |||
| 636 | /* For some operations, the hardware cannot tell the exact message | ||
| 637 | * length, and in those cases it reports zero. Use prefetch | ||
| 638 | * logic. */ | ||
| 639 | if (length == 0) { | ||
| 640 | prefetch_activated = 1; | ||
| 641 | length = mpfault->rdma.packet_size; | ||
| 642 | prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); | ||
| 643 | } | ||
| 644 | |||
| 645 | ret = pagefault_single_data_segment(qp, pfault, rkey, address, length, | ||
| 646 | NULL); | ||
| 647 | if (ret == -EAGAIN) { | ||
| 648 | /* We're racing with an invalidation, don't prefetch */ | ||
| 649 | prefetch_activated = 0; | ||
| 650 | } else if (ret < 0 || pages_in_range(address, length) > ret) { | ||
| 651 | mlx5_ib_page_fault_resume(qp, pfault, 1); | ||
| 652 | return; | ||
| 653 | } | ||
| 654 | |||
| 655 | mlx5_ib_page_fault_resume(qp, pfault, 0); | ||
| 656 | |||
| 657 | /* At this point, there might be a new pagefault already arriving in | ||
| 658 | * the eq, switch to the dummy pagefault for the rest of the | ||
| 659 | * processing. We're still OK with the objects being alive as the | ||
| 660 | * work-queue is being fenced. */ | ||
| 661 | |||
| 662 | if (prefetch_activated) { | ||
| 663 | ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey, | ||
| 664 | address, | ||
| 665 | prefetch_len, | ||
| 666 | NULL); | ||
| 667 | if (ret < 0) { | ||
| 668 | pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n", | ||
| 669 | ret, prefetch_activated, | ||
| 670 | qp->ibqp.qp_num, address, prefetch_len); | ||
| 671 | } | ||
| 672 | } | ||
| 673 | } | ||
| 674 | |||
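As the comments above explain, when the HCA reports a zero RDMA operation length it cannot tell the full message size, so the handler maps only the current packet before resuming the QP and then prefetches a capped amount afterwards. The length/prefetch selection in isolation looks roughly like the sketch below; MAX_PREFETCH_LEN mirrors the constant at the top of this file, and plan_fault() is purely illustrative.

#include <stdio.h>

#define MAX_PREFETCH_LEN (4 * 1024 * 1024u)

/* Mirror the length fix-up in the RDMA responder handler above: when the
 * HCA reports a zero op length it cannot tell the message size, so map
 * just the current packet now and cap the later prefetch. */
static void plan_fault(unsigned int op_len, unsigned int packet_size,
                       unsigned int committed)
{
        unsigned int map_now = op_len;
        unsigned int prefetch = committed;
        int do_prefetch = 0;

        if (map_now == 0) {
                do_prefetch = 1;
                map_now = packet_size;
                if (prefetch > MAX_PREFETCH_LEN)
                        prefetch = MAX_PREFETCH_LEN;
        }
        printf("map %u bytes now, prefetch %u bytes later\n",
               map_now, do_prefetch ? prefetch : 0);
}

int main(void)
{
        plan_fault(8192, 1024, 0);              /* known length: no prefetch */
        plan_fault(0, 1024, 16 * 1024 * 1024);  /* unknown: clamp prefetch   */
        return 0;
}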
| 675 | void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, | ||
| 676 | struct mlx5_ib_pfault *pfault) | ||
| 677 | { | ||
| 678 | u8 event_subtype = pfault->mpfault.event_subtype; | ||
| 679 | |||
| 680 | switch (event_subtype) { | ||
| 681 | case MLX5_PFAULT_SUBTYPE_WQE: | ||
| 682 | mlx5_ib_mr_wqe_pfault_handler(qp, pfault); | ||
| 683 | break; | ||
| 684 | case MLX5_PFAULT_SUBTYPE_RDMA: | ||
| 685 | mlx5_ib_mr_rdma_pfault_handler(qp, pfault); | ||
| 686 | break; | ||
| 687 | default: | ||
| 688 | pr_warn("Invalid page fault event subtype: 0x%x\n", | ||
| 689 | event_subtype); | ||
| 690 | mlx5_ib_page_fault_resume(qp, pfault, 1); | ||
| 691 | break; | ||
| 692 | } | ||
| 693 | } | ||
| 694 | |||
| 695 | static void mlx5_ib_qp_pfault_action(struct work_struct *work) | ||
| 696 | { | ||
| 697 | struct mlx5_ib_pfault *pfault = container_of(work, | ||
| 698 | struct mlx5_ib_pfault, | ||
| 699 | work); | ||
| 700 | enum mlx5_ib_pagefault_context context = | ||
| 701 | mlx5_ib_get_pagefault_context(&pfault->mpfault); | ||
| 702 | struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp, | ||
| 703 | pagefaults[context]); | ||
| 704 | mlx5_ib_mr_pfault_handler(qp, pfault); | ||
| 705 | } | ||
| 706 | |||
| 707 | void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) | ||
| 708 | { | ||
| 709 | unsigned long flags; | ||
| 710 | |||
| 711 | spin_lock_irqsave(&qp->disable_page_faults_lock, flags); | ||
| 712 | qp->disable_page_faults = 1; | ||
| 713 | spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); | ||
| 714 | |||
| 715 | /* | ||
| 716 | * Note that at this point, we are guaranteed that no more | ||
| 717 | * work queue elements will be posted to the work queue with | ||
| 718 | * the QP we are closing. | ||
| 719 | */ | ||
| 720 | flush_workqueue(mlx5_ib_page_fault_wq); | ||
| 721 | } | ||
| 722 | |||
| 723 | void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) | ||
| 724 | { | ||
| 725 | unsigned long flags; | ||
| 726 | |||
| 727 | spin_lock_irqsave(&qp->disable_page_faults_lock, flags); | ||
| 728 | qp->disable_page_faults = 0; | ||
| 729 | spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); | ||
| 730 | } | ||
| 731 | |||
| 732 | static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp, | ||
| 733 | struct mlx5_pagefault *pfault) | ||
| 734 | { | ||
| 735 | /* | ||
| 736 | * Note that we will only get one fault event per QP per context | ||
| 737 | * (responder/initiator, read/write), until we resolve the page fault | ||
| 738 | * with the mlx5_ib_page_fault_resume command. Since this function is | ||
| 739 | * called from within the work element, there is no risk of missing | ||
| 740 | * events. | ||
| 741 | */ | ||
| 742 | struct mlx5_ib_qp *mibqp = to_mibqp(qp); | ||
| 743 | enum mlx5_ib_pagefault_context context = | ||
| 744 | mlx5_ib_get_pagefault_context(pfault); | ||
| 745 | struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context]; | ||
| 746 | |||
| 747 | qp_pfault->mpfault = *pfault; | ||
| 748 | |||
| 749 | /* No need to stop interrupts here since we are in an interrupt */ | ||
| 750 | spin_lock(&mibqp->disable_page_faults_lock); | ||
| 751 | if (!mibqp->disable_page_faults) | ||
| 752 | queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work); | ||
| 753 | spin_unlock(&mibqp->disable_page_faults_lock); | ||
| 754 | } | ||
| 755 | |||
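Taken together, mlx5_ib_qp_disable_pagefaults(), the interrupt-context handler above and the flush_workqueue() call form a quiesce handshake: the disable flag is checked under the same lock that sets it, so once disable returns no new fault work can be queued for the QP and every previously queued item has already run. A user-space analogue of that handshake, with a pthread mutex and condition variable standing in for the spinlock and the workqueue flush:

#include <pthread.h>
#include <stdio.h>

/* disabled/pending are protected by lock, as disable_page_faults_lock
 * protects the flag in the driver. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int disabled;
static int pending;

static void queue_fault_work(void)
{
        pthread_mutex_lock(&lock);
        if (!disabled)
                pending++;              /* stands in for queue_work() */
        pthread_mutex_unlock(&lock);
}

static void complete_fault_work(void)
{
        pthread_mutex_lock(&lock);
        if (pending && --pending == 0)
                pthread_cond_broadcast(&drained);
        pthread_mutex_unlock(&lock);
}

static void disable_pagefaults(void)
{
        pthread_mutex_lock(&lock);
        disabled = 1;                   /* no new work from here on */
        while (pending)                 /* stands in for flush_workqueue() */
                pthread_cond_wait(&drained, &lock);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        queue_fault_work();
        complete_fault_work();
        disable_pagefaults();
        queue_fault_work();             /* ignored: faults are disabled */
        printf("pending after disable: %d\n", pending);
        return 0;
}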
| 756 | void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) | ||
| 757 | { | ||
| 758 | int i; | ||
| 759 | |||
| 760 | qp->disable_page_faults = 1; | ||
| 761 | spin_lock_init(&qp->disable_page_faults_lock); | ||
| 762 | |||
| 763 | qp->mqp.pfault_handler = mlx5_ib_pfault_handler; | ||
| 764 | |||
| 765 | for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i) | ||
| 766 | INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action); | ||
| 767 | } | ||
| 768 | |||
| 769 | int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) | ||
| 770 | { | ||
| 771 | int ret; | ||
| 772 | |||
| 773 | ret = init_srcu_struct(&ibdev->mr_srcu); | ||
| 774 | if (ret) | ||
| 775 | return ret; | ||
| 776 | |||
| 777 | return 0; | ||
| 778 | } | ||
| 779 | |||
| 780 | void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) | ||
| 781 | { | ||
| 782 | cleanup_srcu_struct(&ibdev->mr_srcu); | ||
| 783 | } | ||
| 784 | |||
| 785 | int __init mlx5_ib_odp_init(void) | ||
| 786 | { | ||
| 787 | mlx5_ib_page_fault_wq = | ||
| 788 | create_singlethread_workqueue("mlx5_ib_page_faults"); | ||
| 789 | if (!mlx5_ib_page_fault_wq) | ||
| 790 | return -ENOMEM; | ||
| 791 | |||
| 792 | return 0; | ||
| 793 | } | ||
| 794 | |||
| 795 | void mlx5_ib_odp_cleanup(void) | ||
| 796 | { | ||
| 797 | destroy_workqueue(mlx5_ib_page_fault_wq); | ||
| 798 | } | ||
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 1cae1c7132b4..be0cd358b080 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c | |||
| @@ -70,15 +70,6 @@ static const u32 mlx5_ib_opcode[] = { | |||
| 70 | [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, | 70 | [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, |
| 71 | }; | 71 | }; |
| 72 | 72 | ||
| 73 | struct umr_wr { | ||
| 74 | u64 virt_addr; | ||
| 75 | struct ib_pd *pd; | ||
| 76 | unsigned int page_shift; | ||
| 77 | unsigned int npages; | ||
| 78 | u32 length; | ||
| 79 | int access_flags; | ||
| 80 | u32 mkey; | ||
| 81 | }; | ||
| 82 | 73 | ||
| 83 | static int is_qp0(enum ib_qp_type qp_type) | 74 | static int is_qp0(enum ib_qp_type qp_type) |
| 84 | { | 75 | { |
| @@ -110,6 +101,77 @@ void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) | |||
| 110 | return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); | 101 | return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE)); |
| 111 | } | 102 | } |
| 112 | 103 | ||
| 104 | /** | ||
| 105 | * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space. | ||
| 106 | * | ||
| 107 | * @qp: QP to copy from. | ||
| 108 | * @send: copy from the send queue when non-zero, use the receive queue | ||
| 109 | * otherwise. | ||
| 110 | * @wqe_index: index to start copying from. For send work queues, the | ||
| 111 | * wqe_index is in units of MLX5_SEND_WQE_BB. | ||
| 112 | * For receive work queue, it is the number of work queue | ||
| 113 | * element in the queue. | ||
| 114 | * @buffer: destination buffer. | ||
| 115 | * @length: maximum number of bytes to copy. | ||
| 116 | * | ||
| 117 | * Copies at least a single WQE, but may copy more data. | ||
| 118 | * | ||
| 119 | * Return: the number of bytes copied, or an error code. | ||
| 120 | */ | ||
| 121 | int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, | ||
| 122 | void *buffer, u32 length) | ||
| 123 | { | ||
| 124 | struct ib_device *ibdev = qp->ibqp.device; | ||
| 125 | struct mlx5_ib_dev *dev = to_mdev(ibdev); | ||
| 126 | struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq; | ||
| 127 | size_t offset; | ||
| 128 | size_t wq_end; | ||
| 129 | struct ib_umem *umem = qp->umem; | ||
| 130 | u32 first_copy_length; | ||
| 131 | int wqe_length; | ||
| 132 | int ret; | ||
| 133 | |||
| 134 | if (wq->wqe_cnt == 0) { | ||
| 135 | mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n", | ||
| 136 | qp->ibqp.qp_type); | ||
| 137 | return -EINVAL; | ||
| 138 | } | ||
| 139 | |||
| 140 | offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift); | ||
| 141 | wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift); | ||
| 142 | |||
| 143 | if (send && length < sizeof(struct mlx5_wqe_ctrl_seg)) | ||
| 144 | return -EINVAL; | ||
| 145 | |||
| 146 | if (offset > umem->length || | ||
| 147 | (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length)) | ||
| 148 | return -EINVAL; | ||
| 149 | |||
| 150 | first_copy_length = min_t(u32, offset + length, wq_end) - offset; | ||
| 151 | ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length); | ||
| 152 | if (ret) | ||
| 153 | return ret; | ||
| 154 | |||
| 155 | if (send) { | ||
| 156 | struct mlx5_wqe_ctrl_seg *ctrl = buffer; | ||
| 157 | int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; | ||
| 158 | |||
| 159 | wqe_length = ds * MLX5_WQE_DS_UNITS; | ||
| 160 | } else { | ||
| 161 | wqe_length = 1 << wq->wqe_shift; | ||
| 162 | } | ||
| 163 | |||
| 164 | if (wqe_length <= first_copy_length) | ||
| 165 | return first_copy_length; | ||
| 166 | |||
| 167 | ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset, | ||
| 168 | wqe_length - first_copy_length); | ||
| 169 | if (ret) | ||
| 170 | return ret; | ||
| 171 | |||
| 172 | return wqe_length; | ||
| 173 | } | ||
| 174 | |||
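mlx5_ib_read_user_wqe() above copies a WQE out of a circular user-space queue in at most two pieces: from the WQE's offset up to the end of the queue, then, only if the WQE wraps, the remainder from the start of the queue. The wrap-around copy on its own, as a small sketch; read_wrapped() and the toy queue are illustrative, not the driver's API.

#include <stdio.h>
#include <string.h>

/* Copy one entry out of a circular queue buffer in at most two pieces:
 * first from the entry's offset up to the end of the queue, then (only
 * if the entry wraps) the remainder from the start of the queue. */
static size_t read_wrapped(char *dst, const char *q, size_t q_size,
                           size_t offset, size_t entry_len)
{
        size_t first = q_size - offset;

        if (first > entry_len)
                first = entry_len;
        memcpy(dst, q + offset, first);
        if (entry_len > first)
                memcpy(dst + first, q, entry_len - first);
        return entry_len;
}

int main(void)
{
        char queue[] = "ABCDEFGHIJKLMNOP";      /* a 16-byte toy queue */
        char out[9] = { 0 };

        /* An 8-byte entry starting at offset 12 wraps to the front. */
        read_wrapped(out, queue, sizeof(queue) - 1, 12, 8);
        printf("%s\n", out);                    /* prints MNOPABCD */
        return 0;
}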
| 113 | static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) | 175 | static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) |
| 114 | { | 176 | { |
| 115 | struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; | 177 | struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; |
| @@ -814,6 +876,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, | |||
| 814 | int inlen = sizeof(*in); | 876 | int inlen = sizeof(*in); |
| 815 | int err; | 877 | int err; |
| 816 | 878 | ||
| 879 | mlx5_ib_odp_create_qp(qp); | ||
| 880 | |||
| 817 | gen = &dev->mdev->caps.gen; | 881 | gen = &dev->mdev->caps.gen; |
| 818 | mutex_init(&qp->mutex); | 882 | mutex_init(&qp->mutex); |
| 819 | spin_lock_init(&qp->sq.lock); | 883 | spin_lock_init(&qp->sq.lock); |
| @@ -1098,11 +1162,13 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) | |||
| 1098 | in = kzalloc(sizeof(*in), GFP_KERNEL); | 1162 | in = kzalloc(sizeof(*in), GFP_KERNEL); |
| 1099 | if (!in) | 1163 | if (!in) |
| 1100 | return; | 1164 | return; |
| 1101 | if (qp->state != IB_QPS_RESET) | 1165 | if (qp->state != IB_QPS_RESET) { |
| 1166 | mlx5_ib_qp_disable_pagefaults(qp); | ||
| 1102 | if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state), | 1167 | if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state), |
| 1103 | MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp)) | 1168 | MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp)) |
| 1104 | mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", | 1169 | mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", |
| 1105 | qp->mqp.qpn); | 1170 | qp->mqp.qpn); |
| 1171 | } | ||
| 1106 | 1172 | ||
| 1107 | get_cqs(qp, &send_cq, &recv_cq); | 1173 | get_cqs(qp, &send_cq, &recv_cq); |
| 1108 | 1174 | ||
| @@ -1650,6 +1716,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, | |||
| 1650 | if (mlx5_st < 0) | 1716 | if (mlx5_st < 0) |
| 1651 | goto out; | 1717 | goto out; |
| 1652 | 1718 | ||
| 1719 | /* If moving to a reset or error state, we must disable page faults on | ||
| 1720 | * this QP and flush all current page faults. Otherwise a stale page | ||
| 1721 | * fault may attempt to work on this QP after it is reset and moved | ||
| 1722 | * again to RTS, and may cause the driver and the device to get out of | ||
| 1723 | * sync. */ | ||
| 1724 | if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && | ||
| 1725 | (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) | ||
| 1726 | mlx5_ib_qp_disable_pagefaults(qp); | ||
| 1727 | |||
| 1653 | optpar = ib_mask_to_mlx5_opt(attr_mask); | 1728 | optpar = ib_mask_to_mlx5_opt(attr_mask); |
| 1654 | optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; | 1729 | optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; |
| 1655 | in->optparam = cpu_to_be32(optpar); | 1730 | in->optparam = cpu_to_be32(optpar); |
| @@ -1659,6 +1734,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, | |||
| 1659 | if (err) | 1734 | if (err) |
| 1660 | goto out; | 1735 | goto out; |
| 1661 | 1736 | ||
| 1737 | if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) | ||
| 1738 | mlx5_ib_qp_enable_pagefaults(qp); | ||
| 1739 | |||
| 1662 | qp->state = new_state; | 1740 | qp->state = new_state; |
| 1663 | 1741 | ||
| 1664 | if (attr_mask & IB_QP_ACCESS_FLAGS) | 1742 | if (attr_mask & IB_QP_ACCESS_FLAGS) |
| @@ -1848,37 +1926,70 @@ static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, | |||
| 1848 | umr->mkey_mask = frwr_mkey_mask(); | 1926 | umr->mkey_mask = frwr_mkey_mask(); |
| 1849 | } | 1927 | } |
| 1850 | 1928 | ||
| 1929 | static __be64 get_umr_reg_mr_mask(void) | ||
| 1930 | { | ||
| 1931 | u64 result; | ||
| 1932 | |||
| 1933 | result = MLX5_MKEY_MASK_LEN | | ||
| 1934 | MLX5_MKEY_MASK_PAGE_SIZE | | ||
| 1935 | MLX5_MKEY_MASK_START_ADDR | | ||
| 1936 | MLX5_MKEY_MASK_PD | | ||
| 1937 | MLX5_MKEY_MASK_LR | | ||
| 1938 | MLX5_MKEY_MASK_LW | | ||
| 1939 | MLX5_MKEY_MASK_KEY | | ||
| 1940 | MLX5_MKEY_MASK_RR | | ||
| 1941 | MLX5_MKEY_MASK_RW | | ||
| 1942 | MLX5_MKEY_MASK_A | | ||
| 1943 | MLX5_MKEY_MASK_FREE; | ||
| 1944 | |||
| 1945 | return cpu_to_be64(result); | ||
| 1946 | } | ||
| 1947 | |||
| 1948 | static __be64 get_umr_unreg_mr_mask(void) | ||
| 1949 | { | ||
| 1950 | u64 result; | ||
| 1951 | |||
| 1952 | result = MLX5_MKEY_MASK_FREE; | ||
| 1953 | |||
| 1954 | return cpu_to_be64(result); | ||
| 1955 | } | ||
| 1956 | |||
| 1957 | static __be64 get_umr_update_mtt_mask(void) | ||
| 1958 | { | ||
| 1959 | u64 result; | ||
| 1960 | |||
| 1961 | result = MLX5_MKEY_MASK_FREE; | ||
| 1962 | |||
| 1963 | return cpu_to_be64(result); | ||
| 1964 | } | ||
| 1965 | |||
| 1851 | static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, | 1966 | static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, |
| 1852 | struct ib_send_wr *wr) | 1967 | struct ib_send_wr *wr) |
| 1853 | { | 1968 | { |
| 1854 | struct umr_wr *umrwr = (struct umr_wr *)&wr->wr.fast_reg; | 1969 | struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; |
| 1855 | u64 mask; | ||
| 1856 | 1970 | ||
| 1857 | memset(umr, 0, sizeof(*umr)); | 1971 | memset(umr, 0, sizeof(*umr)); |
| 1858 | 1972 | ||
| 1973 | if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) | ||
| 1974 | umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */ | ||
| 1975 | else | ||
| 1976 | umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */ | ||
| 1977 | |||
| 1859 | if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { | 1978 | if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { |
| 1860 | umr->flags = 1 << 5; /* fail if not free */ | ||
| 1861 | umr->klm_octowords = get_klm_octo(umrwr->npages); | 1979 | umr->klm_octowords = get_klm_octo(umrwr->npages); |
| 1862 | mask = MLX5_MKEY_MASK_LEN | | 1980 | if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) { |
| 1863 | MLX5_MKEY_MASK_PAGE_SIZE | | 1981 | umr->mkey_mask = get_umr_update_mtt_mask(); |
| 1864 | MLX5_MKEY_MASK_START_ADDR | | 1982 | umr->bsf_octowords = get_klm_octo(umrwr->target.offset); |
| 1865 | MLX5_MKEY_MASK_PD | | 1983 | umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; |
| 1866 | MLX5_MKEY_MASK_LR | | 1984 | } else { |
| 1867 | MLX5_MKEY_MASK_LW | | 1985 | umr->mkey_mask = get_umr_reg_mr_mask(); |
| 1868 | MLX5_MKEY_MASK_KEY | | 1986 | } |
| 1869 | MLX5_MKEY_MASK_RR | | ||
| 1870 | MLX5_MKEY_MASK_RW | | ||
| 1871 | MLX5_MKEY_MASK_A | | ||
| 1872 | MLX5_MKEY_MASK_FREE; | ||
| 1873 | umr->mkey_mask = cpu_to_be64(mask); | ||
| 1874 | } else { | 1987 | } else { |
| 1875 | umr->flags = 2 << 5; /* fail if free */ | 1988 | umr->mkey_mask = get_umr_unreg_mr_mask(); |
| 1876 | mask = MLX5_MKEY_MASK_FREE; | ||
| 1877 | umr->mkey_mask = cpu_to_be64(mask); | ||
| 1878 | } | 1989 | } |
| 1879 | 1990 | ||
| 1880 | if (!wr->num_sge) | 1991 | if (!wr->num_sge) |
| 1881 | umr->flags |= (1 << 7); /* inline */ | 1992 | umr->flags |= MLX5_UMR_INLINE; |
| 1882 | } | 1993 | } |
| 1883 | 1994 | ||
| 1884 | static u8 get_umr_flags(int acc) | 1995 | static u8 get_umr_flags(int acc) |
| @@ -1895,7 +2006,7 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, | |||
| 1895 | { | 2006 | { |
| 1896 | memset(seg, 0, sizeof(*seg)); | 2007 | memset(seg, 0, sizeof(*seg)); |
| 1897 | if (li) { | 2008 | if (li) { |
| 1898 | seg->status = 1 << 6; | 2009 | seg->status = MLX5_MKEY_STATUS_FREE; |
| 1899 | return; | 2010 | return; |
| 1900 | } | 2011 | } |
| 1901 | 2012 | ||
| @@ -1912,19 +2023,23 @@ static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr, | |||
| 1912 | 2023 | ||
| 1913 | static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) | 2024 | static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) |
| 1914 | { | 2025 | { |
| 2026 | struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg; | ||
| 2027 | |||
| 1915 | memset(seg, 0, sizeof(*seg)); | 2028 | memset(seg, 0, sizeof(*seg)); |
| 1916 | if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { | 2029 | if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { |
| 1917 | seg->status = 1 << 6; | 2030 | seg->status = MLX5_MKEY_STATUS_FREE; |
| 1918 | return; | 2031 | return; |
| 1919 | } | 2032 | } |
| 1920 | 2033 | ||
| 1921 | seg->flags = convert_access(wr->wr.fast_reg.access_flags); | 2034 | seg->flags = convert_access(umrwr->access_flags); |
| 1922 | seg->flags_pd = cpu_to_be32(to_mpd((struct ib_pd *)wr->wr.fast_reg.page_list)->pdn); | 2035 | if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) { |
| 1923 | seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); | 2036 | seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); |
| 1924 | seg->len = cpu_to_be64(wr->wr.fast_reg.length); | 2037 | seg->start_addr = cpu_to_be64(umrwr->target.virt_addr); |
| 1925 | seg->log2_page_size = wr->wr.fast_reg.page_shift; | 2038 | } |
| 2039 | seg->len = cpu_to_be64(umrwr->length); | ||
| 2040 | seg->log2_page_size = umrwr->page_shift; | ||
| 1926 | seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | | 2041 | seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | |
| 1927 | mlx5_mkey_variant(wr->wr.fast_reg.rkey)); | 2042 | mlx5_mkey_variant(umrwr->mkey)); |
| 1928 | } | 2043 | } |
| 1929 | 2044 | ||
| 1930 | static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, | 2045 | static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, |
| @@ -2927,6 +3042,14 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr | |||
| 2927 | int mlx5_state; | 3042 | int mlx5_state; |
| 2928 | int err = 0; | 3043 | int err = 0; |
| 2929 | 3044 | ||
| 3045 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 3046 | /* | ||
| 3047 | * Wait for any outstanding page faults, in case the user frees memory | ||
| 3048 | * based upon this query's result. | ||
| 3049 | */ | ||
| 3050 | flush_workqueue(mlx5_ib_page_fault_wq); | ||
| 3051 | #endif | ||
| 3052 | |||
| 2930 | mutex_lock(&qp->mutex); | 3053 | mutex_lock(&qp->mutex); |
| 2931 | outb = kzalloc(sizeof(*outb), GFP_KERNEL); | 3054 | outb = kzalloc(sizeof(*outb), GFP_KERNEL); |
| 2932 | if (!outb) { | 3055 | if (!outb) { |
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index fef067c959fc..c0d0296e7a00 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c | |||
| @@ -2341,9 +2341,9 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
| 2341 | nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u," | 2341 | nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u," |
| 2342 | " offset = %u, page size = %u.\n", | 2342 | " offset = %u, page size = %u.\n", |
| 2343 | (unsigned long int)start, (unsigned long int)virt, (u32)length, | 2343 | (unsigned long int)start, (unsigned long int)virt, (u32)length, |
| 2344 | region->offset, region->page_size); | 2344 | ib_umem_offset(region), region->page_size); |
| 2345 | 2345 | ||
| 2346 | skip_pages = ((u32)region->offset) >> 12; | 2346 | skip_pages = ((u32)ib_umem_offset(region)) >> 12; |
| 2347 | 2347 | ||
| 2348 | if (ib_copy_from_udata(&req, udata, sizeof(req))) { | 2348 | if (ib_copy_from_udata(&req, udata, sizeof(req))) { |
| 2349 | ib_umem_release(region); | 2349 | ib_umem_release(region); |
| @@ -2408,7 +2408,7 @@ static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
| 2408 | region_length -= skip_pages << 12; | 2408 | region_length -= skip_pages << 12; |
| 2409 | for (page_index = skip_pages; page_index < chunk_pages; page_index++) { | 2409 | for (page_index = skip_pages; page_index < chunk_pages; page_index++) { |
| 2410 | skip_pages = 0; | 2410 | skip_pages = 0; |
| 2411 | if ((page_count != 0) && (page_count<<12)-(region->offset&(4096-1)) >= region->length) | 2411 | if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length) |
| 2412 | goto enough_pages; | 2412 | goto enough_pages; |
| 2413 | if ((page_count&0x01FF) == 0) { | 2413 | if ((page_count&0x01FF) == 0) { |
| 2414 | if (page_count >= 1024 * 512) { | 2414 | if (page_count >= 1024 * 512) { |
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index b6201356d669..fb8d8c4dfbb9 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | |||
| @@ -805,7 +805,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, | |||
| 805 | goto umem_err; | 805 | goto umem_err; |
| 806 | 806 | ||
| 807 | mr->hwmr.pbe_size = mr->umem->page_size; | 807 | mr->hwmr.pbe_size = mr->umem->page_size; |
| 808 | mr->hwmr.fbo = mr->umem->offset; | 808 | mr->hwmr.fbo = ib_umem_offset(mr->umem); |
| 809 | mr->hwmr.va = usr_addr; | 809 | mr->hwmr.va = usr_addr; |
| 810 | mr->hwmr.len = len; | 810 | mr->hwmr.len = len; |
| 811 | mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; | 811 | mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; |
diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c index 9bbb55347cc1..a77fb4fb14e4 100644 --- a/drivers/infiniband/hw/qib/qib_mr.c +++ b/drivers/infiniband/hw/qib/qib_mr.c | |||
| @@ -258,7 +258,7 @@ struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, | |||
| 258 | mr->mr.user_base = start; | 258 | mr->mr.user_base = start; |
| 259 | mr->mr.iova = virt_addr; | 259 | mr->mr.iova = virt_addr; |
| 260 | mr->mr.length = length; | 260 | mr->mr.length = length; |
| 261 | mr->mr.offset = umem->offset; | 261 | mr->mr.offset = ib_umem_offset(umem); |
| 262 | mr->mr.access_flags = mr_access_flags; | 262 | mr->mr.access_flags = mr_access_flags; |
| 263 | mr->umem = umem; | 263 | mr->umem = umem; |
| 264 | 264 | ||
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index d7562beb5423..8ba80a6d3a46 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h | |||
| @@ -98,9 +98,15 @@ enum { | |||
| 98 | 98 | ||
| 99 | IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ | 99 | IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ |
| 100 | IPOIB_MCAST_FLAG_SENDONLY = 1, | 100 | IPOIB_MCAST_FLAG_SENDONLY = 1, |
| 101 | IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ | 101 | /* |
| 102 | * For IPOIB_MCAST_FLAG_BUSY | ||
| 103 | * When set, a join is in flight and mcast->mc is unreliable | ||
| 104 | * When clear and mcast->mc IS_ERR_OR_NULL, the join needs to be | ||
| 105 | * restarted or hasn't started yet | ||
| 106 | * When clear and mcast->mc is a valid pointer, the join succeeded | ||
| 107 | */ | ||
| 108 | IPOIB_MCAST_FLAG_BUSY = 2, | ||
| 102 | IPOIB_MCAST_FLAG_ATTACHED = 3, | 109 | IPOIB_MCAST_FLAG_ATTACHED = 3, |
| 103 | IPOIB_MCAST_JOIN_STARTED = 4, | ||
| 104 | 110 | ||
| 105 | MAX_SEND_CQE = 16, | 111 | MAX_SEND_CQE = 16, |
| 106 | IPOIB_CM_COPYBREAK = 256, | 112 | IPOIB_CM_COPYBREAK = 256, |
| @@ -317,6 +323,7 @@ struct ipoib_dev_priv { | |||
| 317 | struct list_head multicast_list; | 323 | struct list_head multicast_list; |
| 318 | struct rb_root multicast_tree; | 324 | struct rb_root multicast_tree; |
| 319 | 325 | ||
| 326 | struct workqueue_struct *wq; | ||
| 320 | struct delayed_work mcast_task; | 327 | struct delayed_work mcast_task; |
| 321 | struct work_struct carrier_on_task; | 328 | struct work_struct carrier_on_task; |
| 322 | struct work_struct flush_light; | 329 | struct work_struct flush_light; |
| @@ -477,10 +484,10 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work); | |||
| 477 | void ipoib_pkey_event(struct work_struct *work); | 484 | void ipoib_pkey_event(struct work_struct *work); |
| 478 | void ipoib_ib_dev_cleanup(struct net_device *dev); | 485 | void ipoib_ib_dev_cleanup(struct net_device *dev); |
| 479 | 486 | ||
| 480 | int ipoib_ib_dev_open(struct net_device *dev, int flush); | 487 | int ipoib_ib_dev_open(struct net_device *dev); |
| 481 | int ipoib_ib_dev_up(struct net_device *dev); | 488 | int ipoib_ib_dev_up(struct net_device *dev); |
| 482 | int ipoib_ib_dev_down(struct net_device *dev, int flush); | 489 | int ipoib_ib_dev_down(struct net_device *dev); |
| 483 | int ipoib_ib_dev_stop(struct net_device *dev, int flush); | 490 | int ipoib_ib_dev_stop(struct net_device *dev); |
| 484 | void ipoib_pkey_dev_check_presence(struct net_device *dev); | 491 | void ipoib_pkey_dev_check_presence(struct net_device *dev); |
| 485 | 492 | ||
| 486 | int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); | 493 | int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); |
| @@ -492,7 +499,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); | |||
| 492 | 499 | ||
| 493 | void ipoib_mcast_restart_task(struct work_struct *work); | 500 | void ipoib_mcast_restart_task(struct work_struct *work); |
| 494 | int ipoib_mcast_start_thread(struct net_device *dev); | 501 | int ipoib_mcast_start_thread(struct net_device *dev); |
| 495 | int ipoib_mcast_stop_thread(struct net_device *dev, int flush); | 502 | int ipoib_mcast_stop_thread(struct net_device *dev); |
| 496 | 503 | ||
| 497 | void ipoib_mcast_dev_down(struct net_device *dev); | 504 | void ipoib_mcast_dev_down(struct net_device *dev); |
| 498 | void ipoib_mcast_dev_flush(struct net_device *dev); | 505 | void ipoib_mcast_dev_flush(struct net_device *dev); |
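Annotation (not part of the patch): the ipoib.h hunk above turns IPOIB_MCAST_FLAG_BUSY plus mcast->mc into a small three-state protocol and drops the separate IPOIB_MCAST_JOIN_STARTED bit. A minimal sketch of that contract as a hypothetical helper, assuming the usual ipoib.h definitions of struct ipoib_mcast:

    #include <linux/bitops.h>
    #include <linux/err.h>

    enum mcast_join_state {
            MCAST_JOIN_IN_FLIGHT,   /* BUSY set: join outstanding, mc unreliable */
            MCAST_JOIN_NEEDED,      /* BUSY clear, mc error/NULL: (re)start join */
            MCAST_JOIN_DONE,        /* BUSY clear, mc valid: join succeeded */
    };

    static enum mcast_join_state mcast_join_state(struct ipoib_mcast *mcast)
    {
            if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
                    return MCAST_JOIN_IN_FLIGHT;
            if (IS_ERR_OR_NULL(mcast->mc))
                    return MCAST_JOIN_NEEDED;
            return MCAST_JOIN_DONE;
    }

ipoib_mcast_join_task() in the multicast hunks below uses exactly this test (IS_ERR_OR_NULL(mcast->mc) plus a clear BUSY bit) to pick the next unjoined group.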
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 933efcea0d03..56959adb6c7d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c | |||
| @@ -474,7 +474,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even | |||
| 474 | } | 474 | } |
| 475 | 475 | ||
| 476 | spin_lock_irq(&priv->lock); | 476 | spin_lock_irq(&priv->lock); |
| 477 | queue_delayed_work(ipoib_workqueue, | 477 | queue_delayed_work(priv->wq, |
| 478 | &priv->cm.stale_task, IPOIB_CM_RX_DELAY); | 478 | &priv->cm.stale_task, IPOIB_CM_RX_DELAY); |
| 479 | /* Add this entry to passive ids list head, but do not re-add it | 479 | /* Add this entry to passive ids list head, but do not re-add it |
| 480 | * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ | 480 | * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ |
| @@ -576,7 +576,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) | |||
| 576 | spin_lock_irqsave(&priv->lock, flags); | 576 | spin_lock_irqsave(&priv->lock, flags); |
| 577 | list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); | 577 | list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); |
| 578 | ipoib_cm_start_rx_drain(priv); | 578 | ipoib_cm_start_rx_drain(priv); |
| 579 | queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); | 579 | queue_work(priv->wq, &priv->cm.rx_reap_task); |
| 580 | spin_unlock_irqrestore(&priv->lock, flags); | 580 | spin_unlock_irqrestore(&priv->lock, flags); |
| 581 | } else | 581 | } else |
| 582 | ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", | 582 | ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", |
| @@ -603,7 +603,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) | |||
| 603 | spin_lock_irqsave(&priv->lock, flags); | 603 | spin_lock_irqsave(&priv->lock, flags); |
| 604 | list_move(&p->list, &priv->cm.rx_reap_list); | 604 | list_move(&p->list, &priv->cm.rx_reap_list); |
| 605 | spin_unlock_irqrestore(&priv->lock, flags); | 605 | spin_unlock_irqrestore(&priv->lock, flags); |
| 606 | queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); | 606 | queue_work(priv->wq, &priv->cm.rx_reap_task); |
| 607 | } | 607 | } |
| 608 | return; | 608 | return; |
| 609 | } | 609 | } |
| @@ -827,7 +827,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) | |||
| 827 | 827 | ||
| 828 | if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { | 828 | if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { |
| 829 | list_move(&tx->list, &priv->cm.reap_list); | 829 | list_move(&tx->list, &priv->cm.reap_list); |
| 830 | queue_work(ipoib_workqueue, &priv->cm.reap_task); | 830 | queue_work(priv->wq, &priv->cm.reap_task); |
| 831 | } | 831 | } |
| 832 | 832 | ||
| 833 | clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); | 833 | clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); |
| @@ -1255,7 +1255,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, | |||
| 1255 | 1255 | ||
| 1256 | if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { | 1256 | if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { |
| 1257 | list_move(&tx->list, &priv->cm.reap_list); | 1257 | list_move(&tx->list, &priv->cm.reap_list); |
| 1258 | queue_work(ipoib_workqueue, &priv->cm.reap_task); | 1258 | queue_work(priv->wq, &priv->cm.reap_task); |
| 1259 | } | 1259 | } |
| 1260 | 1260 | ||
| 1261 | spin_unlock_irqrestore(&priv->lock, flags); | 1261 | spin_unlock_irqrestore(&priv->lock, flags); |
| @@ -1284,7 +1284,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path | |||
| 1284 | tx->dev = dev; | 1284 | tx->dev = dev; |
| 1285 | list_add(&tx->list, &priv->cm.start_list); | 1285 | list_add(&tx->list, &priv->cm.start_list); |
| 1286 | set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); | 1286 | set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); |
| 1287 | queue_work(ipoib_workqueue, &priv->cm.start_task); | 1287 | queue_work(priv->wq, &priv->cm.start_task); |
| 1288 | return tx; | 1288 | return tx; |
| 1289 | } | 1289 | } |
| 1290 | 1290 | ||
| @@ -1295,7 +1295,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) | |||
| 1295 | if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { | 1295 | if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { |
| 1296 | spin_lock_irqsave(&priv->lock, flags); | 1296 | spin_lock_irqsave(&priv->lock, flags); |
| 1297 | list_move(&tx->list, &priv->cm.reap_list); | 1297 | list_move(&tx->list, &priv->cm.reap_list); |
| 1298 | queue_work(ipoib_workqueue, &priv->cm.reap_task); | 1298 | queue_work(priv->wq, &priv->cm.reap_task); |
| 1299 | ipoib_dbg(priv, "Reap connection for gid %pI6\n", | 1299 | ipoib_dbg(priv, "Reap connection for gid %pI6\n", |
| 1300 | tx->neigh->daddr + 4); | 1300 | tx->neigh->daddr + 4); |
| 1301 | tx->neigh = NULL; | 1301 | tx->neigh = NULL; |
| @@ -1417,7 +1417,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, | |||
| 1417 | 1417 | ||
| 1418 | skb_queue_tail(&priv->cm.skb_queue, skb); | 1418 | skb_queue_tail(&priv->cm.skb_queue, skb); |
| 1419 | if (e) | 1419 | if (e) |
| 1420 | queue_work(ipoib_workqueue, &priv->cm.skb_task); | 1420 | queue_work(priv->wq, &priv->cm.skb_task); |
| 1421 | } | 1421 | } |
| 1422 | 1422 | ||
| 1423 | static void ipoib_cm_rx_reap(struct work_struct *work) | 1423 | static void ipoib_cm_rx_reap(struct work_struct *work) |
| @@ -1450,7 +1450,7 @@ static void ipoib_cm_stale_task(struct work_struct *work) | |||
| 1450 | } | 1450 | } |
| 1451 | 1451 | ||
| 1452 | if (!list_empty(&priv->cm.passive_ids)) | 1452 | if (!list_empty(&priv->cm.passive_ids)) |
| 1453 | queue_delayed_work(ipoib_workqueue, | 1453 | queue_delayed_work(priv->wq, |
| 1454 | &priv->cm.stale_task, IPOIB_CM_RX_DELAY); | 1454 | &priv->cm.stale_task, IPOIB_CM_RX_DELAY); |
| 1455 | spin_unlock_irq(&priv->lock); | 1455 | spin_unlock_irq(&priv->lock); |
| 1456 | } | 1456 | } |
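Annotation: the ipoib_cm.c changes are purely mechanical — every connected-mode work item that used to go on the global ipoib_workqueue is now queued on the per-device priv->wq. The pattern, reduced to a hypothetical wrapper for illustration only:

    static inline void ipoib_cm_queue_reap(struct ipoib_dev_priv *priv)
    {
            /* before this patch: queue_work(ipoib_workqueue, ...) */
            queue_work(priv->wq, &priv->cm.reap_task);
    }

Keeping the CM reap/stale/start work on the device's own single-threaded queue means flushing one device's queue no longer has to wait behind another device's work items.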
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 72626c348174..fe65abb5150c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c | |||
| @@ -655,7 +655,7 @@ void ipoib_reap_ah(struct work_struct *work) | |||
| 655 | __ipoib_reap_ah(dev); | 655 | __ipoib_reap_ah(dev); |
| 656 | 656 | ||
| 657 | if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) | 657 | if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) |
| 658 | queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, | 658 | queue_delayed_work(priv->wq, &priv->ah_reap_task, |
| 659 | round_jiffies_relative(HZ)); | 659 | round_jiffies_relative(HZ)); |
| 660 | } | 660 | } |
| 661 | 661 | ||
| @@ -664,7 +664,7 @@ static void ipoib_ib_tx_timer_func(unsigned long ctx) | |||
| 664 | drain_tx_cq((struct net_device *)ctx); | 664 | drain_tx_cq((struct net_device *)ctx); |
| 665 | } | 665 | } |
| 666 | 666 | ||
| 667 | int ipoib_ib_dev_open(struct net_device *dev, int flush) | 667 | int ipoib_ib_dev_open(struct net_device *dev) |
| 668 | { | 668 | { |
| 669 | struct ipoib_dev_priv *priv = netdev_priv(dev); | 669 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
| 670 | int ret; | 670 | int ret; |
| @@ -696,7 +696,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush) | |||
| 696 | } | 696 | } |
| 697 | 697 | ||
| 698 | clear_bit(IPOIB_STOP_REAPER, &priv->flags); | 698 | clear_bit(IPOIB_STOP_REAPER, &priv->flags); |
| 699 | queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, | 699 | queue_delayed_work(priv->wq, &priv->ah_reap_task, |
| 700 | round_jiffies_relative(HZ)); | 700 | round_jiffies_relative(HZ)); |
| 701 | 701 | ||
| 702 | if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) | 702 | if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) |
| @@ -706,7 +706,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush) | |||
| 706 | dev_stop: | 706 | dev_stop: |
| 707 | if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) | 707 | if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) |
| 708 | napi_enable(&priv->napi); | 708 | napi_enable(&priv->napi); |
| 709 | ipoib_ib_dev_stop(dev, flush); | 709 | ipoib_ib_dev_stop(dev); |
| 710 | return -1; | 710 | return -1; |
| 711 | } | 711 | } |
| 712 | 712 | ||
| @@ -738,7 +738,7 @@ int ipoib_ib_dev_up(struct net_device *dev) | |||
| 738 | return ipoib_mcast_start_thread(dev); | 738 | return ipoib_mcast_start_thread(dev); |
| 739 | } | 739 | } |
| 740 | 740 | ||
| 741 | int ipoib_ib_dev_down(struct net_device *dev, int flush) | 741 | int ipoib_ib_dev_down(struct net_device *dev) |
| 742 | { | 742 | { |
| 743 | struct ipoib_dev_priv *priv = netdev_priv(dev); | 743 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
| 744 | 744 | ||
| @@ -747,7 +747,7 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) | |||
| 747 | clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); | 747 | clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); |
| 748 | netif_carrier_off(dev); | 748 | netif_carrier_off(dev); |
| 749 | 749 | ||
| 750 | ipoib_mcast_stop_thread(dev, flush); | 750 | ipoib_mcast_stop_thread(dev); |
| 751 | ipoib_mcast_dev_flush(dev); | 751 | ipoib_mcast_dev_flush(dev); |
| 752 | 752 | ||
| 753 | ipoib_flush_paths(dev); | 753 | ipoib_flush_paths(dev); |
| @@ -807,7 +807,7 @@ void ipoib_drain_cq(struct net_device *dev) | |||
| 807 | local_bh_enable(); | 807 | local_bh_enable(); |
| 808 | } | 808 | } |
| 809 | 809 | ||
| 810 | int ipoib_ib_dev_stop(struct net_device *dev, int flush) | 810 | int ipoib_ib_dev_stop(struct net_device *dev) |
| 811 | { | 811 | { |
| 812 | struct ipoib_dev_priv *priv = netdev_priv(dev); | 812 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
| 813 | struct ib_qp_attr qp_attr; | 813 | struct ib_qp_attr qp_attr; |
| @@ -880,8 +880,7 @@ timeout: | |||
| 880 | /* Wait for all AHs to be reaped */ | 880 | /* Wait for all AHs to be reaped */ |
| 881 | set_bit(IPOIB_STOP_REAPER, &priv->flags); | 881 | set_bit(IPOIB_STOP_REAPER, &priv->flags); |
| 882 | cancel_delayed_work(&priv->ah_reap_task); | 882 | cancel_delayed_work(&priv->ah_reap_task); |
| 883 | if (flush) | 883 | flush_workqueue(priv->wq); |
| 884 | flush_workqueue(ipoib_workqueue); | ||
| 885 | 884 | ||
| 886 | begin = jiffies; | 885 | begin = jiffies; |
| 887 | 886 | ||
| @@ -918,7 +917,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) | |||
| 918 | (unsigned long) dev); | 917 | (unsigned long) dev); |
| 919 | 918 | ||
| 920 | if (dev->flags & IFF_UP) { | 919 | if (dev->flags & IFF_UP) { |
| 921 | if (ipoib_ib_dev_open(dev, 1)) { | 920 | if (ipoib_ib_dev_open(dev)) { |
| 922 | ipoib_transport_dev_cleanup(dev); | 921 | ipoib_transport_dev_cleanup(dev); |
| 923 | return -ENODEV; | 922 | return -ENODEV; |
| 924 | } | 923 | } |
| @@ -1040,12 +1039,12 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, | |||
| 1040 | } | 1039 | } |
| 1041 | 1040 | ||
| 1042 | if (level >= IPOIB_FLUSH_NORMAL) | 1041 | if (level >= IPOIB_FLUSH_NORMAL) |
| 1043 | ipoib_ib_dev_down(dev, 0); | 1042 | ipoib_ib_dev_down(dev); |
| 1044 | 1043 | ||
| 1045 | if (level == IPOIB_FLUSH_HEAVY) { | 1044 | if (level == IPOIB_FLUSH_HEAVY) { |
| 1046 | if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) | 1045 | if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) |
| 1047 | ipoib_ib_dev_stop(dev, 0); | 1046 | ipoib_ib_dev_stop(dev); |
| 1048 | if (ipoib_ib_dev_open(dev, 0) != 0) | 1047 | if (ipoib_ib_dev_open(dev) != 0) |
| 1049 | return; | 1048 | return; |
| 1050 | if (netif_queue_stopped(dev)) | 1049 | if (netif_queue_stopped(dev)) |
| 1051 | netif_start_queue(dev); | 1050 | netif_start_queue(dev); |
| @@ -1097,7 +1096,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) | |||
| 1097 | */ | 1096 | */ |
| 1098 | ipoib_flush_paths(dev); | 1097 | ipoib_flush_paths(dev); |
| 1099 | 1098 | ||
| 1100 | ipoib_mcast_stop_thread(dev, 1); | 1099 | ipoib_mcast_stop_thread(dev); |
| 1101 | ipoib_mcast_dev_flush(dev); | 1100 | ipoib_mcast_dev_flush(dev); |
| 1102 | 1101 | ||
| 1103 | ipoib_transport_dev_cleanup(dev); | 1102 | ipoib_transport_dev_cleanup(dev); |
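Annotation: with a dedicated queue per device, ipoib_ib_dev_open/down/stop no longer need the `flush` flag that let callers skip flushing the shared queue, and the stop path can always flush. A sketch of the resulting AH-reaper shutdown sequence (field names as in ipoib_dev_priv, error handling omitted):

    static void ipoib_stop_ah_reaper_sketch(struct ipoib_dev_priv *priv)
    {
            set_bit(IPOIB_STOP_REAPER, &priv->flags);
            cancel_delayed_work(&priv->ah_reap_task);
            /*
             * Unconditional now: the heavy-flush path runs off the global
             * "ipoib_flush" queue, so flushing priv->wq here cannot be a
             * self-flush deadlock the way flushing ipoib_workqueue was.
             */
            flush_workqueue(priv->wq);
    }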
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 58b5aa3b6f2d..6bad17d4d588 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c | |||
| @@ -108,7 +108,7 @@ int ipoib_open(struct net_device *dev) | |||
| 108 | 108 | ||
| 109 | set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); | 109 | set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); |
| 110 | 110 | ||
| 111 | if (ipoib_ib_dev_open(dev, 1)) { | 111 | if (ipoib_ib_dev_open(dev)) { |
| 112 | if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) | 112 | if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) |
| 113 | return 0; | 113 | return 0; |
| 114 | goto err_disable; | 114 | goto err_disable; |
| @@ -139,7 +139,7 @@ int ipoib_open(struct net_device *dev) | |||
| 139 | return 0; | 139 | return 0; |
| 140 | 140 | ||
| 141 | err_stop: | 141 | err_stop: |
| 142 | ipoib_ib_dev_stop(dev, 1); | 142 | ipoib_ib_dev_stop(dev); |
| 143 | 143 | ||
| 144 | err_disable: | 144 | err_disable: |
| 145 | clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); | 145 | clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); |
| @@ -157,8 +157,8 @@ static int ipoib_stop(struct net_device *dev) | |||
| 157 | 157 | ||
| 158 | netif_stop_queue(dev); | 158 | netif_stop_queue(dev); |
| 159 | 159 | ||
| 160 | ipoib_ib_dev_down(dev, 1); | 160 | ipoib_ib_dev_down(dev); |
| 161 | ipoib_ib_dev_stop(dev, 0); | 161 | ipoib_ib_dev_stop(dev); |
| 162 | 162 | ||
| 163 | if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { | 163 | if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { |
| 164 | struct ipoib_dev_priv *cpriv; | 164 | struct ipoib_dev_priv *cpriv; |
| @@ -839,7 +839,7 @@ static void ipoib_set_mcast_list(struct net_device *dev) | |||
| 839 | return; | 839 | return; |
| 840 | } | 840 | } |
| 841 | 841 | ||
| 842 | queue_work(ipoib_workqueue, &priv->restart_task); | 842 | queue_work(priv->wq, &priv->restart_task); |
| 843 | } | 843 | } |
| 844 | 844 | ||
| 845 | static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) | 845 | static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) |
| @@ -954,7 +954,7 @@ static void ipoib_reap_neigh(struct work_struct *work) | |||
| 954 | __ipoib_reap_neigh(priv); | 954 | __ipoib_reap_neigh(priv); |
| 955 | 955 | ||
| 956 | if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) | 956 | if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) |
| 957 | queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, | 957 | queue_delayed_work(priv->wq, &priv->neigh_reap_task, |
| 958 | arp_tbl.gc_interval); | 958 | arp_tbl.gc_interval); |
| 959 | } | 959 | } |
| 960 | 960 | ||
| @@ -1133,7 +1133,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) | |||
| 1133 | 1133 | ||
| 1134 | /* start garbage collection */ | 1134 | /* start garbage collection */ |
| 1135 | clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); | 1135 | clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); |
| 1136 | queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, | 1136 | queue_delayed_work(priv->wq, &priv->neigh_reap_task, |
| 1137 | arp_tbl.gc_interval); | 1137 | arp_tbl.gc_interval); |
| 1138 | 1138 | ||
| 1139 | return 0; | 1139 | return 0; |
| @@ -1262,15 +1262,13 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) | |||
| 1262 | { | 1262 | { |
| 1263 | struct ipoib_dev_priv *priv = netdev_priv(dev); | 1263 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
| 1264 | 1264 | ||
| 1265 | if (ipoib_neigh_hash_init(priv) < 0) | ||
| 1266 | goto out; | ||
| 1267 | /* Allocate RX/TX "rings" to hold queued skbs */ | 1265 | /* Allocate RX/TX "rings" to hold queued skbs */ |
| 1268 | priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, | 1266 | priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, |
| 1269 | GFP_KERNEL); | 1267 | GFP_KERNEL); |
| 1270 | if (!priv->rx_ring) { | 1268 | if (!priv->rx_ring) { |
| 1271 | printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", | 1269 | printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", |
| 1272 | ca->name, ipoib_recvq_size); | 1270 | ca->name, ipoib_recvq_size); |
| 1273 | goto out_neigh_hash_cleanup; | 1271 | goto out; |
| 1274 | } | 1272 | } |
| 1275 | 1273 | ||
| 1276 | priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); | 1274 | priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); |
| @@ -1285,16 +1283,24 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) | |||
| 1285 | if (ipoib_ib_dev_init(dev, ca, port)) | 1283 | if (ipoib_ib_dev_init(dev, ca, port)) |
| 1286 | goto out_tx_ring_cleanup; | 1284 | goto out_tx_ring_cleanup; |
| 1287 | 1285 | ||
| 1286 | /* | ||
| 1287 | * Must be after ipoib_ib_dev_init so we can allocate a per | ||
| 1288 | * device wq there and use it here | ||
| 1289 | */ | ||
| 1290 | if (ipoib_neigh_hash_init(priv) < 0) | ||
| 1291 | goto out_dev_uninit; | ||
| 1292 | |||
| 1288 | return 0; | 1293 | return 0; |
| 1289 | 1294 | ||
| 1295 | out_dev_uninit: | ||
| 1296 | ipoib_ib_dev_cleanup(dev); | ||
| 1297 | |||
| 1290 | out_tx_ring_cleanup: | 1298 | out_tx_ring_cleanup: |
| 1291 | vfree(priv->tx_ring); | 1299 | vfree(priv->tx_ring); |
| 1292 | 1300 | ||
| 1293 | out_rx_ring_cleanup: | 1301 | out_rx_ring_cleanup: |
| 1294 | kfree(priv->rx_ring); | 1302 | kfree(priv->rx_ring); |
| 1295 | 1303 | ||
| 1296 | out_neigh_hash_cleanup: | ||
| 1297 | ipoib_neigh_hash_uninit(dev); | ||
| 1298 | out: | 1304 | out: |
| 1299 | return -ENOMEM; | 1305 | return -ENOMEM; |
| 1300 | } | 1306 | } |
| @@ -1317,6 +1323,12 @@ void ipoib_dev_cleanup(struct net_device *dev) | |||
| 1317 | } | 1323 | } |
| 1318 | unregister_netdevice_many(&head); | 1324 | unregister_netdevice_many(&head); |
| 1319 | 1325 | ||
| 1326 | /* | ||
| 1327 | * Must be before ipoib_ib_dev_cleanup or we delete an in use | ||
| 1328 | * work queue | ||
| 1329 | */ | ||
| 1330 | ipoib_neigh_hash_uninit(dev); | ||
| 1331 | |||
| 1320 | ipoib_ib_dev_cleanup(dev); | 1332 | ipoib_ib_dev_cleanup(dev); |
| 1321 | 1333 | ||
| 1322 | kfree(priv->rx_ring); | 1334 | kfree(priv->rx_ring); |
| @@ -1324,8 +1336,6 @@ void ipoib_dev_cleanup(struct net_device *dev) | |||
| 1324 | 1336 | ||
| 1325 | priv->rx_ring = NULL; | 1337 | priv->rx_ring = NULL; |
| 1326 | priv->tx_ring = NULL; | 1338 | priv->tx_ring = NULL; |
| 1327 | |||
| 1328 | ipoib_neigh_hash_uninit(dev); | ||
| 1329 | } | 1339 | } |
| 1330 | 1340 | ||
| 1331 | static const struct header_ops ipoib_header_ops = { | 1341 | static const struct header_ops ipoib_header_ops = { |
| @@ -1636,7 +1646,7 @@ register_failed: | |||
| 1636 | /* Stop GC if started before flush */ | 1646 | /* Stop GC if started before flush */ |
| 1637 | set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); | 1647 | set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); |
| 1638 | cancel_delayed_work(&priv->neigh_reap_task); | 1648 | cancel_delayed_work(&priv->neigh_reap_task); |
| 1639 | flush_workqueue(ipoib_workqueue); | 1649 | flush_workqueue(priv->wq); |
| 1640 | 1650 | ||
| 1641 | event_failed: | 1651 | event_failed: |
| 1642 | ipoib_dev_cleanup(priv->dev); | 1652 | ipoib_dev_cleanup(priv->dev); |
| @@ -1707,7 +1717,7 @@ static void ipoib_remove_one(struct ib_device *device) | |||
| 1707 | /* Stop GC */ | 1717 | /* Stop GC */ |
| 1708 | set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); | 1718 | set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); |
| 1709 | cancel_delayed_work(&priv->neigh_reap_task); | 1719 | cancel_delayed_work(&priv->neigh_reap_task); |
| 1710 | flush_workqueue(ipoib_workqueue); | 1720 | flush_workqueue(priv->wq); |
| 1711 | 1721 | ||
| 1712 | unregister_netdev(priv->dev); | 1722 | unregister_netdev(priv->dev); |
| 1713 | free_netdev(priv->dev); | 1723 | free_netdev(priv->dev); |
| @@ -1748,8 +1758,13 @@ static int __init ipoib_init_module(void) | |||
| 1748 | * unregister_netdev() and linkwatch_event take the rtnl lock, | 1758 | * unregister_netdev() and linkwatch_event take the rtnl lock, |
| 1749 | * so flush_scheduled_work() can deadlock during device | 1759 | * so flush_scheduled_work() can deadlock during device |
| 1750 | * removal. | 1760 | * removal. |
| 1761 | * | ||
| 1762 | * In addition, bringing one device up and another down at the | ||
| 1763 | * same time can deadlock a single workqueue, so we have this | ||
| 1764 | * global fallback workqueue, but we also attempt to open a | ||
| 1765 | * per device workqueue each time we bring an interface up | ||
| 1751 | */ | 1766 | */ |
| 1752 | ipoib_workqueue = create_singlethread_workqueue("ipoib"); | 1767 | ipoib_workqueue = create_singlethread_workqueue("ipoib_flush"); |
| 1753 | if (!ipoib_workqueue) { | 1768 | if (!ipoib_workqueue) { |
| 1754 | ret = -ENOMEM; | 1769 | ret = -ENOMEM; |
| 1755 | goto err_fs; | 1770 | goto err_fs; |
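Annotation: ipoib_main.c reorders device init so the neighbour hash, whose GC task queues onto priv->wq, is set up only after ipoib_ib_dev_init() has created that queue, and tears things down in the mirrored order. Roughly, with the ring allocations elided:

    static int ipoib_dev_init_sketch(struct net_device *dev,
                                     struct ib_device *ca, int port)
    {
            struct ipoib_dev_priv *priv = netdev_priv(dev);

            /* ... allocate rx_ring / tx_ring ... */

            if (ipoib_ib_dev_init(dev, ca, port))   /* creates priv->wq */
                    goto err_rings;

            if (ipoib_neigh_hash_init(priv) < 0) {  /* GC queued on priv->wq */
                    ipoib_ib_dev_cleanup(dev);
                    goto err_rings;
            }
            return 0;

    err_rings:
            /* ... free rings ... */
            return -ENOMEM;
    }

Cleanup runs the other way round — ipoib_neigh_hash_uninit() before ipoib_ib_dev_cleanup() — so the GC work is gone before its queue is destroyed. The global queue survives only as the renamed "ipoib_flush" fallback used by the flush tasks.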
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index ffb83b5f7e80..bc50dd0d0e4d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c | |||
| @@ -190,12 +190,6 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, | |||
| 190 | spin_unlock_irq(&priv->lock); | 190 | spin_unlock_irq(&priv->lock); |
| 191 | priv->tx_wr.wr.ud.remote_qkey = priv->qkey; | 191 | priv->tx_wr.wr.ud.remote_qkey = priv->qkey; |
| 192 | set_qkey = 1; | 192 | set_qkey = 1; |
| 193 | |||
| 194 | if (!ipoib_cm_admin_enabled(dev)) { | ||
| 195 | rtnl_lock(); | ||
| 196 | dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); | ||
| 197 | rtnl_unlock(); | ||
| 198 | } | ||
| 199 | } | 193 | } |
| 200 | 194 | ||
| 201 | if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { | 195 | if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { |
| @@ -277,16 +271,27 @@ ipoib_mcast_sendonly_join_complete(int status, | |||
| 277 | struct ipoib_mcast *mcast = multicast->context; | 271 | struct ipoib_mcast *mcast = multicast->context; |
| 278 | struct net_device *dev = mcast->dev; | 272 | struct net_device *dev = mcast->dev; |
| 279 | 273 | ||
| 274 | /* | ||
| 275 | * We have to take the mutex to force mcast_sendonly_join to | ||
| 276 | * return from ib_sa_multicast_join and set mcast->mc to a | ||
| 277 | * valid value. Otherwise we were racing with ourselves in | ||
| 278 | * that we might fail here, but get a valid return from | ||
| 279 | * ib_sa_multicast_join after we had cleared mcast->mc here, | ||
| 280 | * resulting in mis-matched joins and leaves and a deadlock | ||
| 281 | */ | ||
| 282 | mutex_lock(&mcast_mutex); | ||
| 283 | |||
| 280 | /* We trap for port events ourselves. */ | 284 | /* We trap for port events ourselves. */ |
| 281 | if (status == -ENETRESET) | 285 | if (status == -ENETRESET) |
| 282 | return 0; | 286 | goto out; |
| 283 | 287 | ||
| 284 | if (!status) | 288 | if (!status) |
| 285 | status = ipoib_mcast_join_finish(mcast, &multicast->rec); | 289 | status = ipoib_mcast_join_finish(mcast, &multicast->rec); |
| 286 | 290 | ||
| 287 | if (status) { | 291 | if (status) { |
| 288 | if (mcast->logcount++ < 20) | 292 | if (mcast->logcount++ < 20) |
| 289 | ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n", | 293 | ipoib_dbg_mcast(netdev_priv(dev), "sendonly multicast " |
| 294 | "join failed for %pI6, status %d\n", | ||
| 290 | mcast->mcmember.mgid.raw, status); | 295 | mcast->mcmember.mgid.raw, status); |
| 291 | 296 | ||
| 292 | /* Flush out any queued packets */ | 297 | /* Flush out any queued packets */ |
| @@ -296,11 +301,15 @@ ipoib_mcast_sendonly_join_complete(int status, | |||
| 296 | dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); | 301 | dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); |
| 297 | } | 302 | } |
| 298 | netif_tx_unlock_bh(dev); | 303 | netif_tx_unlock_bh(dev); |
| 299 | |||
| 300 | /* Clear the busy flag so we try again */ | ||
| 301 | status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, | ||
| 302 | &mcast->flags); | ||
| 303 | } | 304 | } |
| 305 | out: | ||
| 306 | clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); | ||
| 307 | if (status) | ||
| 308 | mcast->mc = NULL; | ||
| 309 | complete(&mcast->done); | ||
| 310 | if (status == -ENETRESET) | ||
| 311 | status = 0; | ||
| 312 | mutex_unlock(&mcast_mutex); | ||
| 304 | return status; | 313 | return status; |
| 305 | } | 314 | } |
| 306 | 315 | ||
| @@ -318,12 +327,14 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) | |||
| 318 | int ret = 0; | 327 | int ret = 0; |
| 319 | 328 | ||
| 320 | if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { | 329 | if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { |
| 321 | ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); | 330 | ipoib_dbg_mcast(priv, "device shutting down, no sendonly " |
| 331 | "multicast joins\n"); | ||
| 322 | return -ENODEV; | 332 | return -ENODEV; |
| 323 | } | 333 | } |
| 324 | 334 | ||
| 325 | if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { | 335 | if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { |
| 326 | ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n"); | 336 | ipoib_dbg_mcast(priv, "multicast entry busy, skipping " |
| 337 | "sendonly join\n"); | ||
| 327 | return -EBUSY; | 338 | return -EBUSY; |
| 328 | } | 339 | } |
| 329 | 340 | ||
| @@ -331,6 +342,9 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) | |||
| 331 | rec.port_gid = priv->local_gid; | 342 | rec.port_gid = priv->local_gid; |
| 332 | rec.pkey = cpu_to_be16(priv->pkey); | 343 | rec.pkey = cpu_to_be16(priv->pkey); |
| 333 | 344 | ||
| 345 | mutex_lock(&mcast_mutex); | ||
| 346 | init_completion(&mcast->done); | ||
| 347 | set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); | ||
| 334 | mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, | 348 | mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, |
| 335 | priv->port, &rec, | 349 | priv->port, &rec, |
| 336 | IB_SA_MCMEMBER_REC_MGID | | 350 | IB_SA_MCMEMBER_REC_MGID | |
| @@ -343,12 +357,14 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) | |||
| 343 | if (IS_ERR(mcast->mc)) { | 357 | if (IS_ERR(mcast->mc)) { |
| 344 | ret = PTR_ERR(mcast->mc); | 358 | ret = PTR_ERR(mcast->mc); |
| 345 | clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); | 359 | clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); |
| 346 | ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", | 360 | complete(&mcast->done); |
| 347 | ret); | 361 | ipoib_warn(priv, "ib_sa_join_multicast for sendonly join " |
| 362 | "failed (ret = %d)\n", ret); | ||
| 348 | } else { | 363 | } else { |
| 349 | ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n", | 364 | ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting " |
| 350 | mcast->mcmember.mgid.raw); | 365 | "sendonly join\n", mcast->mcmember.mgid.raw); |
| 351 | } | 366 | } |
| 367 | mutex_unlock(&mcast_mutex); | ||
| 352 | 368 | ||
| 353 | return ret; | 369 | return ret; |
| 354 | } | 370 | } |
| @@ -359,18 +375,29 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work) | |||
| 359 | carrier_on_task); | 375 | carrier_on_task); |
| 360 | struct ib_port_attr attr; | 376 | struct ib_port_attr attr; |
| 361 | 377 | ||
| 362 | /* | ||
| 363 | * Take rtnl_lock to avoid racing with ipoib_stop() and | ||
| 364 | * turning the carrier back on while a device is being | ||
| 365 | * removed. | ||
| 366 | */ | ||
| 367 | if (ib_query_port(priv->ca, priv->port, &attr) || | 378 | if (ib_query_port(priv->ca, priv->port, &attr) || |
| 368 | attr.state != IB_PORT_ACTIVE) { | 379 | attr.state != IB_PORT_ACTIVE) { |
| 369 | ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); | 380 | ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); |
| 370 | return; | 381 | return; |
| 371 | } | 382 | } |
| 372 | 383 | ||
| 373 | rtnl_lock(); | 384 | /* |
| 385 | * Take rtnl_lock to avoid racing with ipoib_stop() and | ||
| 386 | * turning the carrier back on while a device is being | ||
| 387 | * removed. However, ipoib_stop() will attempt to flush | ||
| 388 | * the workqueue while holding the rtnl lock, so loop | ||
| 389 | * on trylock until either we get the lock or we see | ||
| 390 | * FLAG_ADMIN_UP go away as that signals that we are bailing | ||
| 391 | * and can safely ignore the carrier on work. | ||
| 392 | */ | ||
| 393 | while (!rtnl_trylock()) { | ||
| 394 | if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) | ||
| 395 | return; | ||
| 396 | else | ||
| 397 | msleep(20); | ||
| 398 | } | ||
| 399 | if (!ipoib_cm_admin_enabled(priv->dev)) | ||
| 400 | dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu)); | ||
| 374 | netif_carrier_on(priv->dev); | 401 | netif_carrier_on(priv->dev); |
| 375 | rtnl_unlock(); | 402 | rtnl_unlock(); |
| 376 | } | 403 | } |
| @@ -385,60 +412,63 @@ static int ipoib_mcast_join_complete(int status, | |||
| 385 | ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", | 412 | ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", |
| 386 | mcast->mcmember.mgid.raw, status); | 413 | mcast->mcmember.mgid.raw, status); |
| 387 | 414 | ||
| 415 | /* | ||
| 416 | * We have to take the mutex to force mcast_join to | ||
| 417 | * return from ib_sa_multicast_join and set mcast->mc to a | ||
| 418 | * valid value. Otherwise we were racing with ourselves in | ||
| 419 | * that we might fail here, but get a valid return from | ||
| 420 | * ib_sa_multicast_join after we had cleared mcast->mc here, | ||
| 421 | * resulting in mis-matched joins and leaves and a deadlock | ||
| 422 | */ | ||
| 423 | mutex_lock(&mcast_mutex); | ||
| 424 | |||
| 388 | /* We trap for port events ourselves. */ | 425 | /* We trap for port events ourselves. */ |
| 389 | if (status == -ENETRESET) { | 426 | if (status == -ENETRESET) |
| 390 | status = 0; | ||
| 391 | goto out; | 427 | goto out; |
| 392 | } | ||
| 393 | 428 | ||
| 394 | if (!status) | 429 | if (!status) |
| 395 | status = ipoib_mcast_join_finish(mcast, &multicast->rec); | 430 | status = ipoib_mcast_join_finish(mcast, &multicast->rec); |
| 396 | 431 | ||
| 397 | if (!status) { | 432 | if (!status) { |
| 398 | mcast->backoff = 1; | 433 | mcast->backoff = 1; |
| 399 | mutex_lock(&mcast_mutex); | ||
| 400 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) | 434 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) |
| 401 | queue_delayed_work(ipoib_workqueue, | 435 | queue_delayed_work(priv->wq, &priv->mcast_task, 0); |
| 402 | &priv->mcast_task, 0); | ||
| 403 | mutex_unlock(&mcast_mutex); | ||
| 404 | 436 | ||
| 405 | /* | 437 | /* |
| 406 | * Defer carrier on work to ipoib_workqueue to avoid a | 438 | * Defer carrier on work to priv->wq to avoid a |
| 407 | * deadlock on rtnl_lock here. | 439 | * deadlock on rtnl_lock here. |
| 408 | */ | 440 | */ |
| 409 | if (mcast == priv->broadcast) | 441 | if (mcast == priv->broadcast) |
| 410 | queue_work(ipoib_workqueue, &priv->carrier_on_task); | 442 | queue_work(priv->wq, &priv->carrier_on_task); |
| 411 | 443 | } else { | |
| 412 | status = 0; | 444 | if (mcast->logcount++ < 20) { |
| 413 | goto out; | 445 | if (status == -ETIMEDOUT || status == -EAGAIN) { |
| 414 | } | 446 | ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", |
| 415 | 447 | mcast->mcmember.mgid.raw, status); | |
| 416 | if (mcast->logcount++ < 20) { | 448 | } else { |
| 417 | if (status == -ETIMEDOUT || status == -EAGAIN) { | 449 | ipoib_warn(priv, "multicast join failed for %pI6, status %d\n", |
| 418 | ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", | 450 | mcast->mcmember.mgid.raw, status); |
| 419 | mcast->mcmember.mgid.raw, status); | 451 | } |
| 420 | } else { | ||
| 421 | ipoib_warn(priv, "multicast join failed for %pI6, status %d\n", | ||
| 422 | mcast->mcmember.mgid.raw, status); | ||
| 423 | } | 452 | } |
| 424 | } | ||
| 425 | |||
| 426 | mcast->backoff *= 2; | ||
| 427 | if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) | ||
| 428 | mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; | ||
| 429 | 453 | ||
| 430 | /* Clear the busy flag so we try again */ | 454 | mcast->backoff *= 2; |
| 431 | status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); | 455 | if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) |
| 432 | 456 | mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; | |
| 433 | mutex_lock(&mcast_mutex); | 457 | } |
| 458 | out: | ||
| 434 | spin_lock_irq(&priv->lock); | 459 | spin_lock_irq(&priv->lock); |
| 435 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) | 460 | clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); |
| 436 | queue_delayed_work(ipoib_workqueue, &priv->mcast_task, | 461 | if (status) |
| 462 | mcast->mc = NULL; | ||
| 463 | complete(&mcast->done); | ||
| 464 | if (status == -ENETRESET) | ||
| 465 | status = 0; | ||
| 466 | if (status && test_bit(IPOIB_MCAST_RUN, &priv->flags)) | ||
| 467 | queue_delayed_work(priv->wq, &priv->mcast_task, | ||
| 437 | mcast->backoff * HZ); | 468 | mcast->backoff * HZ); |
| 438 | spin_unlock_irq(&priv->lock); | 469 | spin_unlock_irq(&priv->lock); |
| 439 | mutex_unlock(&mcast_mutex); | 470 | mutex_unlock(&mcast_mutex); |
| 440 | out: | 471 | |
| 441 | complete(&mcast->done); | ||
| 442 | return status; | 472 | return status; |
| 443 | } | 473 | } |
| 444 | 474 | ||
| @@ -487,10 +517,9 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, | |||
| 487 | rec.hop_limit = priv->broadcast->mcmember.hop_limit; | 517 | rec.hop_limit = priv->broadcast->mcmember.hop_limit; |
| 488 | } | 518 | } |
| 489 | 519 | ||
| 490 | set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); | 520 | mutex_lock(&mcast_mutex); |
| 491 | init_completion(&mcast->done); | 521 | init_completion(&mcast->done); |
| 492 | set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); | 522 | set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); |
| 493 | |||
| 494 | mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, | 523 | mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, |
| 495 | &rec, comp_mask, GFP_KERNEL, | 524 | &rec, comp_mask, GFP_KERNEL, |
| 496 | ipoib_mcast_join_complete, mcast); | 525 | ipoib_mcast_join_complete, mcast); |
| @@ -504,13 +533,11 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, | |||
| 504 | if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) | 533 | if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) |
| 505 | mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; | 534 | mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; |
| 506 | 535 | ||
| 507 | mutex_lock(&mcast_mutex); | ||
| 508 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) | 536 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) |
| 509 | queue_delayed_work(ipoib_workqueue, | 537 | queue_delayed_work(priv->wq, &priv->mcast_task, |
| 510 | &priv->mcast_task, | ||
| 511 | mcast->backoff * HZ); | 538 | mcast->backoff * HZ); |
| 512 | mutex_unlock(&mcast_mutex); | ||
| 513 | } | 539 | } |
| 540 | mutex_unlock(&mcast_mutex); | ||
| 514 | } | 541 | } |
| 515 | 542 | ||
| 516 | void ipoib_mcast_join_task(struct work_struct *work) | 543 | void ipoib_mcast_join_task(struct work_struct *work) |
| @@ -547,8 +574,8 @@ void ipoib_mcast_join_task(struct work_struct *work) | |||
| 547 | ipoib_warn(priv, "failed to allocate broadcast group\n"); | 574 | ipoib_warn(priv, "failed to allocate broadcast group\n"); |
| 548 | mutex_lock(&mcast_mutex); | 575 | mutex_lock(&mcast_mutex); |
| 549 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) | 576 | if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) |
| 550 | queue_delayed_work(ipoib_workqueue, | 577 | queue_delayed_work(priv->wq, &priv->mcast_task, |
| 551 | &priv->mcast_task, HZ); | 578 | HZ); |
| 552 | mutex_unlock(&mcast_mutex); | 579 | mutex_unlock(&mcast_mutex); |
| 553 | return; | 580 | return; |
| 554 | } | 581 | } |
| @@ -563,7 +590,8 @@ void ipoib_mcast_join_task(struct work_struct *work) | |||
| 563 | } | 590 | } |
| 564 | 591 | ||
| 565 | if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { | 592 | if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { |
| 566 | if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) | 593 | if (IS_ERR_OR_NULL(priv->broadcast->mc) && |
| 594 | !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) | ||
| 567 | ipoib_mcast_join(dev, priv->broadcast, 0); | 595 | ipoib_mcast_join(dev, priv->broadcast, 0); |
| 568 | return; | 596 | return; |
| 569 | } | 597 | } |
| @@ -571,23 +599,33 @@ void ipoib_mcast_join_task(struct work_struct *work) | |||
| 571 | while (1) { | 599 | while (1) { |
| 572 | struct ipoib_mcast *mcast = NULL; | 600 | struct ipoib_mcast *mcast = NULL; |
| 573 | 601 | ||
| 602 | /* | ||
| 603 | * Need the mutex so our flags are consistent, need the | ||
| 604 | * priv->lock so we don't race with list removals in either | ||
| 605 | * mcast_dev_flush or mcast_restart_task | ||
| 606 | */ | ||
| 607 | mutex_lock(&mcast_mutex); | ||
| 574 | spin_lock_irq(&priv->lock); | 608 | spin_lock_irq(&priv->lock); |
| 575 | list_for_each_entry(mcast, &priv->multicast_list, list) { | 609 | list_for_each_entry(mcast, &priv->multicast_list, list) { |
| 576 | if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) | 610 | if (IS_ERR_OR_NULL(mcast->mc) && |
| 577 | && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) | 611 | !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) && |
| 578 | && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { | 612 | !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { |
| 579 | /* Found the next unjoined group */ | 613 | /* Found the next unjoined group */ |
| 580 | break; | 614 | break; |
| 581 | } | 615 | } |
| 582 | } | 616 | } |
| 583 | spin_unlock_irq(&priv->lock); | 617 | spin_unlock_irq(&priv->lock); |
| 618 | mutex_unlock(&mcast_mutex); | ||
| 584 | 619 | ||
| 585 | if (&mcast->list == &priv->multicast_list) { | 620 | if (&mcast->list == &priv->multicast_list) { |
| 586 | /* All done */ | 621 | /* All done */ |
| 587 | break; | 622 | break; |
| 588 | } | 623 | } |
| 589 | 624 | ||
| 590 | ipoib_mcast_join(dev, mcast, 1); | 625 | if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) |
| 626 | ipoib_mcast_sendonly_join(mcast); | ||
| 627 | else | ||
| 628 | ipoib_mcast_join(dev, mcast, 1); | ||
| 591 | return; | 629 | return; |
| 592 | } | 630 | } |
| 593 | 631 | ||
| @@ -604,13 +642,13 @@ int ipoib_mcast_start_thread(struct net_device *dev) | |||
| 604 | 642 | ||
| 605 | mutex_lock(&mcast_mutex); | 643 | mutex_lock(&mcast_mutex); |
| 606 | if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) | 644 | if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) |
| 607 | queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); | 645 | queue_delayed_work(priv->wq, &priv->mcast_task, 0); |
| 608 | mutex_unlock(&mcast_mutex); | 646 | mutex_unlock(&mcast_mutex); |
| 609 | 647 | ||
| 610 | return 0; | 648 | return 0; |
| 611 | } | 649 | } |
| 612 | 650 | ||
| 613 | int ipoib_mcast_stop_thread(struct net_device *dev, int flush) | 651 | int ipoib_mcast_stop_thread(struct net_device *dev) |
| 614 | { | 652 | { |
| 615 | struct ipoib_dev_priv *priv = netdev_priv(dev); | 653 | struct ipoib_dev_priv *priv = netdev_priv(dev); |
| 616 | 654 | ||
| @@ -621,8 +659,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush) | |||
| 621 | cancel_delayed_work(&priv->mcast_task); | 659 | cancel_delayed_work(&priv->mcast_task); |
| 622 | mutex_unlock(&mcast_mutex); | 660 | mutex_unlock(&mcast_mutex); |
| 623 | 661 | ||
| 624 | if (flush) | 662 | flush_workqueue(priv->wq); |
| 625 | flush_workqueue(ipoib_workqueue); | ||
| 626 | 663 | ||
| 627 | return 0; | 664 | return 0; |
| 628 | } | 665 | } |
| @@ -633,6 +670,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) | |||
| 633 | int ret = 0; | 670 | int ret = 0; |
| 634 | 671 | ||
| 635 | if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) | 672 | if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) |
| 673 | ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n"); | ||
| 674 | |||
| 675 | if (!IS_ERR_OR_NULL(mcast->mc)) | ||
| 636 | ib_sa_free_multicast(mcast->mc); | 676 | ib_sa_free_multicast(mcast->mc); |
| 637 | 677 | ||
| 638 | if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { | 678 | if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { |
| @@ -685,6 +725,8 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) | |||
| 685 | memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); | 725 | memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); |
| 686 | __ipoib_mcast_add(dev, mcast); | 726 | __ipoib_mcast_add(dev, mcast); |
| 687 | list_add_tail(&mcast->list, &priv->multicast_list); | 727 | list_add_tail(&mcast->list, &priv->multicast_list); |
| 728 | if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) | ||
| 729 | queue_delayed_work(priv->wq, &priv->mcast_task, 0); | ||
| 688 | } | 730 | } |
| 689 | 731 | ||
| 690 | if (!mcast->ah) { | 732 | if (!mcast->ah) { |
| @@ -698,8 +740,6 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) | |||
| 698 | if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) | 740 | if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) |
| 699 | ipoib_dbg_mcast(priv, "no address vector, " | 741 | ipoib_dbg_mcast(priv, "no address vector, " |
| 700 | "but multicast join already started\n"); | 742 | "but multicast join already started\n"); |
| 701 | else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) | ||
| 702 | ipoib_mcast_sendonly_join(mcast); | ||
| 703 | 743 | ||
| 704 | /* | 744 | /* |
| 705 | * If lookup completes between here and out:, don't | 745 | * If lookup completes between here and out:, don't |
| @@ -759,9 +799,12 @@ void ipoib_mcast_dev_flush(struct net_device *dev) | |||
| 759 | 799 | ||
| 760 | spin_unlock_irqrestore(&priv->lock, flags); | 800 | spin_unlock_irqrestore(&priv->lock, flags); |
| 761 | 801 | ||
| 762 | /* seperate between the wait to the leave*/ | 802 | /* |
| 803 | * make sure the in-flight joins have finished before we attempt | ||
| 804 | * to leave | ||
| 805 | */ | ||
| 763 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) | 806 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) |
| 764 | if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) | 807 | if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) |
| 765 | wait_for_completion(&mcast->done); | 808 | wait_for_completion(&mcast->done); |
| 766 | 809 | ||
| 767 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { | 810 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { |
| @@ -794,8 +837,6 @@ void ipoib_mcast_restart_task(struct work_struct *work) | |||
| 794 | 837 | ||
| 795 | ipoib_dbg_mcast(priv, "restarting multicast task\n"); | 838 | ipoib_dbg_mcast(priv, "restarting multicast task\n"); |
| 796 | 839 | ||
| 797 | ipoib_mcast_stop_thread(dev, 0); | ||
| 798 | |||
| 799 | local_irq_save(flags); | 840 | local_irq_save(flags); |
| 800 | netif_addr_lock(dev); | 841 | netif_addr_lock(dev); |
| 801 | spin_lock(&priv->lock); | 842 | spin_lock(&priv->lock); |
| @@ -880,14 +921,38 @@ void ipoib_mcast_restart_task(struct work_struct *work) | |||
| 880 | netif_addr_unlock(dev); | 921 | netif_addr_unlock(dev); |
| 881 | local_irq_restore(flags); | 922 | local_irq_restore(flags); |
| 882 | 923 | ||
| 883 | /* We have to cancel outside of the spinlock */ | 924 | /* |
| 925 | * make sure the in-flight joins have finished before we attempt | ||
| 926 | * to leave | ||
| 927 | */ | ||
| 928 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) | ||
| 929 | if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) | ||
| 930 | wait_for_completion(&mcast->done); | ||
| 931 | |||
| 932 | /* | ||
| 933 | * We have to cancel outside of the spinlock, but we have to | ||
| 934 | * take the rtnl lock or else we race with the removal of | ||
| 935 | * entries from the remove list in mcast_dev_flush as part | ||
| 936 | * of ipoib_stop(). We detect the drop of the ADMIN_UP flag | ||
| 937 | * to signal that we have hit this particular race, and we | ||
| 938 | * return since we know we don't need to do anything else | ||
| 939 | * anyway. | ||
| 940 | */ | ||
| 941 | while (!rtnl_trylock()) { | ||
| 942 | if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) | ||
| 943 | return; | ||
| 944 | else | ||
| 945 | msleep(20); | ||
| 946 | } | ||
| 884 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { | 947 | list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { |
| 885 | ipoib_mcast_leave(mcast->dev, mcast); | 948 | ipoib_mcast_leave(mcast->dev, mcast); |
| 886 | ipoib_mcast_free(mcast); | 949 | ipoib_mcast_free(mcast); |
| 887 | } | 950 | } |
| 888 | 951 | /* | |
| 889 | if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) | 952 | * Restart our join task if needed |
| 890 | ipoib_mcast_start_thread(dev); | 953 | */ |
| 954 | ipoib_mcast_start_thread(dev); | ||
| 955 | rtnl_unlock(); | ||
| 891 | } | 956 | } |
| 892 | 957 | ||
| 893 | #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG | 958 | #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG |
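Annotation: two patterns recur through the ipoib_multicast.c rework. Join completions now run under mcast_mutex, clear BUSY, NULL out mcast->mc on failure, and signal mcast->done so flush/restart can wait for in-flight joins. And both carrier_on_task and restart_task need the rtnl lock while ipoib_stop() may be flushing the very queue they run on, so they spin on rtnl_trylock() and bail once ADMIN_UP drops. The second pattern, pulled out here as an illustrative helper (the driver open-codes it in both places):

    #include <linux/rtnetlink.h>
    #include <linux/delay.h>

    static bool ipoib_rtnl_lock_or_bail(struct ipoib_dev_priv *priv)
    {
            while (!rtnl_trylock()) {
                    /* ipoib_stop() cleared ADMIN_UP and is flushing us: give up */
                    if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
                            return false;
                    msleep(20);
            }
            return true;    /* caller must rtnl_unlock() */
    }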
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index c56d5d44c53b..b72a753eb41d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c | |||
| @@ -145,10 +145,20 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) | |||
| 145 | int ret, size; | 145 | int ret, size; |
| 146 | int i; | 146 | int i; |
| 147 | 147 | ||
| 148 | /* | ||
| 149 | * the various IPoIB tasks assume they will never race against | ||
| 150 | * themselves, so always use a single thread workqueue | ||
| 151 | */ | ||
| 152 | priv->wq = create_singlethread_workqueue("ipoib_wq"); | ||
| 153 | if (!priv->wq) { | ||
| 154 | printk(KERN_WARNING "ipoib: failed to allocate device WQ\n"); | ||
| 155 | return -ENODEV; | ||
| 156 | } | ||
| 157 | |||
| 148 | priv->pd = ib_alloc_pd(priv->ca); | 158 | priv->pd = ib_alloc_pd(priv->ca); |
| 149 | if (IS_ERR(priv->pd)) { | 159 | if (IS_ERR(priv->pd)) { |
| 150 | printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); | 160 | printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); |
| 151 | return -ENODEV; | 161 | goto out_free_wq; |
| 152 | } | 162 | } |
| 153 | 163 | ||
| 154 | priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); | 164 | priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE); |
| @@ -242,6 +252,10 @@ out_free_mr: | |||
| 242 | 252 | ||
| 243 | out_free_pd: | 253 | out_free_pd: |
| 244 | ib_dealloc_pd(priv->pd); | 254 | ib_dealloc_pd(priv->pd); |
| 255 | |||
| 256 | out_free_wq: | ||
| 257 | destroy_workqueue(priv->wq); | ||
| 258 | priv->wq = NULL; | ||
| 245 | return -ENODEV; | 259 | return -ENODEV; |
| 246 | } | 260 | } |
| 247 | 261 | ||
| @@ -270,6 +284,12 @@ void ipoib_transport_dev_cleanup(struct net_device *dev) | |||
| 270 | 284 | ||
| 271 | if (ib_dealloc_pd(priv->pd)) | 285 | if (ib_dealloc_pd(priv->pd)) |
| 272 | ipoib_warn(priv, "ib_dealloc_pd failed\n"); | 286 | ipoib_warn(priv, "ib_dealloc_pd failed\n"); |
| 287 | |||
| 288 | if (priv->wq) { | ||
| 289 | flush_workqueue(priv->wq); | ||
| 290 | destroy_workqueue(priv->wq); | ||
| 291 | priv->wq = NULL; | ||
| 292 | } | ||
| 273 | } | 293 | } |
| 274 | 294 | ||
| 275 | void ipoib_event(struct ib_event_handler *handler, | 295 | void ipoib_event(struct ib_event_handler *handler, |
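Annotation: ipoib_verbs.c owns the new queue's lifetime — created first in ipoib_transport_dev_init() (single-threaded, since the IPoIB tasks assume they never race against themselves) and flushed and destroyed last in ipoib_transport_dev_cleanup(). Condensed into two hypothetical helpers:

    #include <linux/workqueue.h>

    static int ipoib_wq_create(struct ipoib_dev_priv *priv)
    {
            /* single thread: IPoIB work items must not run concurrently */
            priv->wq = create_singlethread_workqueue("ipoib_wq");
            return priv->wq ? 0 : -ENODEV;
    }

    static void ipoib_wq_destroy(struct ipoib_dev_priv *priv)
    {
            if (priv->wq) {
                    flush_workqueue(priv->wq);
                    destroy_workqueue(priv->wq);
                    priv->wq = NULL;
            }
    }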
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 20ca6a619476..6a594aac2290 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c | |||
| @@ -97,7 +97,7 @@ module_param_named(pi_enable, iser_pi_enable, bool, 0644); | |||
| 97 | MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); | 97 | MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)"); |
| 98 | 98 | ||
| 99 | module_param_named(pi_guard, iser_pi_guard, int, 0644); | 99 | module_param_named(pi_guard, iser_pi_guard, int, 0644); |
| 100 | MODULE_PARM_DESC(pi_guard, "T10-PI guard_type, 0:CRC|1:IP_CSUM (default:IP_CSUM)"); | 100 | MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]"); |
| 101 | 101 | ||
| 102 | static struct workqueue_struct *release_wq; | 102 | static struct workqueue_struct *release_wq; |
| 103 | struct iser_global ig; | 103 | struct iser_global ig; |
| @@ -164,18 +164,42 @@ iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode) | |||
| 164 | return 0; | 164 | return 0; |
| 165 | } | 165 | } |
| 166 | 166 | ||
| 167 | int iser_initialize_task_headers(struct iscsi_task *task, | 167 | /** |
| 168 | struct iser_tx_desc *tx_desc) | 168 | * iser_initialize_task_headers() - Initialize task headers |
| 169 | * @task: iscsi task | ||
| 170 | * @tx_desc: iser tx descriptor | ||
| 171 | * | ||
| 172 | * Notes: | ||
| 173 | * This routine may race with iser teardown flow for scsi | ||
| 174 | * error handling TMFs. So for TMF we should acquire the | ||
| 175 | * state mutex to avoid dereferencing the IB device which | ||
| 176 | * may have already been terminated. | ||
| 177 | */ | ||
| 178 | int | ||
| 179 | iser_initialize_task_headers(struct iscsi_task *task, | ||
| 180 | struct iser_tx_desc *tx_desc) | ||
| 169 | { | 181 | { |
| 170 | struct iser_conn *iser_conn = task->conn->dd_data; | 182 | struct iser_conn *iser_conn = task->conn->dd_data; |
| 171 | struct iser_device *device = iser_conn->ib_conn.device; | 183 | struct iser_device *device = iser_conn->ib_conn.device; |
| 172 | struct iscsi_iser_task *iser_task = task->dd_data; | 184 | struct iscsi_iser_task *iser_task = task->dd_data; |
| 173 | u64 dma_addr; | 185 | u64 dma_addr; |
| 186 | const bool mgmt_task = !task->sc && !in_interrupt(); | ||
| 187 | int ret = 0; | ||
| 188 | |||
| 189 | if (unlikely(mgmt_task)) | ||
| 190 | mutex_lock(&iser_conn->state_mutex); | ||
| 191 | |||
| 192 | if (unlikely(iser_conn->state != ISER_CONN_UP)) { | ||
| 193 | ret = -ENODEV; | ||
| 194 | goto out; | ||
| 195 | } | ||
| 174 | 196 | ||
| 175 | dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc, | 197 | dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc, |
| 176 | ISER_HEADERS_LEN, DMA_TO_DEVICE); | 198 | ISER_HEADERS_LEN, DMA_TO_DEVICE); |
| 177 | if (ib_dma_mapping_error(device->ib_device, dma_addr)) | 199 | if (ib_dma_mapping_error(device->ib_device, dma_addr)) { |
| 178 | return -ENOMEM; | 200 | ret = -ENOMEM; |
| 201 | goto out; | ||
| 202 | } | ||
| 179 | 203 | ||
| 180 | tx_desc->dma_addr = dma_addr; | 204 | tx_desc->dma_addr = dma_addr; |
| 181 | tx_desc->tx_sg[0].addr = tx_desc->dma_addr; | 205 | tx_desc->tx_sg[0].addr = tx_desc->dma_addr; |
| @@ -183,7 +207,11 @@ int iser_initialize_task_headers(struct iscsi_task *task, | |||
| 183 | tx_desc->tx_sg[0].lkey = device->mr->lkey; | 207 | tx_desc->tx_sg[0].lkey = device->mr->lkey; |
| 184 | 208 | ||
| 185 | iser_task->iser_conn = iser_conn; | 209 | iser_task->iser_conn = iser_conn; |
| 186 | return 0; | 210 | out: |
| 211 | if (unlikely(mgmt_task)) | ||
| 212 | mutex_unlock(&iser_conn->state_mutex); | ||
| 213 | |||
| 214 | return ret; | ||
| 187 | } | 215 | } |
| 188 | 216 | ||
| 189 | /** | 217 | /** |
| @@ -199,9 +227,14 @@ static int | |||
| 199 | iscsi_iser_task_init(struct iscsi_task *task) | 227 | iscsi_iser_task_init(struct iscsi_task *task) |
| 200 | { | 228 | { |
| 201 | struct iscsi_iser_task *iser_task = task->dd_data; | 229 | struct iscsi_iser_task *iser_task = task->dd_data; |
| 230 | int ret; | ||
| 202 | 231 | ||
| 203 | if (iser_initialize_task_headers(task, &iser_task->desc)) | 232 | ret = iser_initialize_task_headers(task, &iser_task->desc); |
| 204 | return -ENOMEM; | 233 | if (ret) { |
| 234 | iser_err("Failed to init task %p, err = %d\n", | ||
| 235 | iser_task, ret); | ||
| 236 | return ret; | ||
| 237 | } | ||
| 205 | 238 | ||
| 206 | /* mgmt task */ | 239 | /* mgmt task */ |
| 207 | if (!task->sc) | 240 | if (!task->sc) |
| @@ -508,8 +541,8 @@ iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag) | |||
| 508 | */ | 541 | */ |
| 509 | if (iser_conn) { | 542 | if (iser_conn) { |
| 510 | mutex_lock(&iser_conn->state_mutex); | 543 | mutex_lock(&iser_conn->state_mutex); |
| 511 | iscsi_conn_stop(cls_conn, flag); | ||
| 512 | iser_conn_terminate(iser_conn); | 544 | iser_conn_terminate(iser_conn); |
| 545 | iscsi_conn_stop(cls_conn, flag); | ||
| 513 | 546 | ||
| 514 | /* unbind */ | 547 | /* unbind */ |
| 515 | iser_conn->iscsi_conn = NULL; | 548 | iser_conn->iscsi_conn = NULL; |
| @@ -541,12 +574,13 @@ iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session) | |||
| 541 | static inline unsigned int | 574 | static inline unsigned int |
| 542 | iser_dif_prot_caps(int prot_caps) | 575 | iser_dif_prot_caps(int prot_caps) |
| 543 | { | 576 | { |
| 544 | return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? SHOST_DIF_TYPE1_PROTECTION | | 577 | return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ? |
| 545 | SHOST_DIX_TYPE1_PROTECTION : 0) | | 578 | SHOST_DIF_TYPE1_PROTECTION | SHOST_DIX_TYPE0_PROTECTION | |
| 546 | ((prot_caps & IB_PROT_T10DIF_TYPE_2) ? SHOST_DIF_TYPE2_PROTECTION | | 579 | SHOST_DIX_TYPE1_PROTECTION : 0) | |
| 547 | SHOST_DIX_TYPE2_PROTECTION : 0) | | 580 | ((prot_caps & IB_PROT_T10DIF_TYPE_2) ? |
| 548 | ((prot_caps & IB_PROT_T10DIF_TYPE_3) ? SHOST_DIF_TYPE3_PROTECTION | | 581 | SHOST_DIF_TYPE2_PROTECTION | SHOST_DIX_TYPE2_PROTECTION : 0) | |
| 549 | SHOST_DIX_TYPE3_PROTECTION : 0); | 582 | ((prot_caps & IB_PROT_T10DIF_TYPE_3) ? |
| 583 | SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE3_PROTECTION : 0); | ||
| 550 | } | 584 | } |
| 551 | 585 | ||
| 552 | /** | 586 | /** |
| @@ -569,6 +603,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, | |||
| 569 | struct Scsi_Host *shost; | 603 | struct Scsi_Host *shost; |
| 570 | struct iser_conn *iser_conn = NULL; | 604 | struct iser_conn *iser_conn = NULL; |
| 571 | struct ib_conn *ib_conn; | 605 | struct ib_conn *ib_conn; |
| 606 | u16 max_cmds; | ||
| 572 | 607 | ||
| 573 | shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); | 608 | shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); |
| 574 | if (!shost) | 609 | if (!shost) |
| @@ -586,26 +621,41 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, | |||
| 586 | */ | 621 | */ |
| 587 | if (ep) { | 622 | if (ep) { |
| 588 | iser_conn = ep->dd_data; | 623 | iser_conn = ep->dd_data; |
| 624 | max_cmds = iser_conn->max_cmds; | ||
| 625 | |||
| 626 | mutex_lock(&iser_conn->state_mutex); | ||
| 627 | if (iser_conn->state != ISER_CONN_UP) { | ||
| 628 | iser_err("iser conn %p already started teardown\n", | ||
| 629 | iser_conn); | ||
| 630 | mutex_unlock(&iser_conn->state_mutex); | ||
| 631 | goto free_host; | ||
| 632 | } | ||
| 633 | |||
| 589 | ib_conn = &iser_conn->ib_conn; | 634 | ib_conn = &iser_conn->ib_conn; |
| 590 | if (ib_conn->pi_support) { | 635 | if (ib_conn->pi_support) { |
| 591 | u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap; | 636 | u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap; |
| 592 | 637 | ||
| 593 | scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); | 638 | scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); |
| 594 | if (iser_pi_guard) | 639 | scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP | |
| 595 | scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP); | 640 | SHOST_DIX_GUARD_CRC); |
| 596 | else | ||
| 597 | scsi_host_set_guard(shost, SHOST_DIX_GUARD_CRC); | ||
| 598 | } | 641 | } |
| 599 | } | ||
| 600 | 642 | ||
| 601 | if (iscsi_host_add(shost, ep ? | 643 | if (iscsi_host_add(shost, |
| 602 | ib_conn->device->ib_device->dma_device : NULL)) | 644 | ib_conn->device->ib_device->dma_device)) { |
| 603 | goto free_host; | 645 | mutex_unlock(&iser_conn->state_mutex); |
| 646 | goto free_host; | ||
| 647 | } | ||
| 648 | mutex_unlock(&iser_conn->state_mutex); | ||
| 649 | } else { | ||
| 650 | max_cmds = ISER_DEF_XMIT_CMDS_MAX; | ||
| 651 | if (iscsi_host_add(shost, NULL)) | ||
| 652 | goto free_host; | ||
| 653 | } | ||
| 604 | 654 | ||
| 605 | if (cmds_max > ISER_DEF_XMIT_CMDS_MAX) { | 655 | if (cmds_max > max_cmds) { |
| 606 | iser_info("cmds_max changed from %u to %u\n", | 656 | iser_info("cmds_max changed from %u to %u\n", |
| 607 | cmds_max, ISER_DEF_XMIT_CMDS_MAX); | 657 | cmds_max, max_cmds); |
| 608 | cmds_max = ISER_DEF_XMIT_CMDS_MAX; | 658 | cmds_max = max_cmds; |
| 609 | } | 659 | } |
| 610 | 660 | ||
| 611 | cls_session = iscsi_session_setup(&iscsi_iser_transport, shost, | 661 | cls_session = iscsi_session_setup(&iscsi_iser_transport, shost, |
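Annotation: on the iSER side, session creation now takes iser_conn->state_mutex so it cannot race connection teardown, advertises both CRC and IP-checksum DIX guard types unconditionally, and clamps the requested cmds_max to the connection's negotiated limit instead of the compile-time default. The clamp, written as a hypothetical helper:

    #include <linux/kernel.h>

    static u16 iser_clamp_cmds_max(struct iser_conn *iser_conn, u16 cmds_max)
    {
            /* unbound (ep == NULL) sessions fall back to the default */
            u16 max_cmds = iser_conn ? iser_conn->max_cmds
                                     : ISER_DEF_XMIT_CMDS_MAX;

            return min(cmds_max, max_cmds);
    }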
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index cd4174ca9a76..5ce26817e7e1 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h | |||
| @@ -69,34 +69,31 @@ | |||
| 69 | 69 | ||
| 70 | #define DRV_NAME "iser" | 70 | #define DRV_NAME "iser" |
| 71 | #define PFX DRV_NAME ": " | 71 | #define PFX DRV_NAME ": " |
| 72 | #define DRV_VER "1.4.8" | 72 | #define DRV_VER "1.5" |
| 73 | 73 | ||
| 74 | #define iser_dbg(fmt, arg...) \ | 74 | #define iser_dbg(fmt, arg...) \ |
| 75 | do { \ | 75 | do { \ |
| 76 | if (iser_debug_level > 2) \ | 76 | if (unlikely(iser_debug_level > 2)) \ |
| 77 | printk(KERN_DEBUG PFX "%s: " fmt,\ | 77 | printk(KERN_DEBUG PFX "%s: " fmt,\ |
| 78 | __func__ , ## arg); \ | 78 | __func__ , ## arg); \ |
| 79 | } while (0) | 79 | } while (0) |
| 80 | 80 | ||
| 81 | #define iser_warn(fmt, arg...) \ | 81 | #define iser_warn(fmt, arg...) \ |
| 82 | do { \ | 82 | do { \ |
| 83 | if (iser_debug_level > 0) \ | 83 | if (unlikely(iser_debug_level > 0)) \ |
| 84 | pr_warn(PFX "%s: " fmt, \ | 84 | pr_warn(PFX "%s: " fmt, \ |
| 85 | __func__ , ## arg); \ | 85 | __func__ , ## arg); \ |
| 86 | } while (0) | 86 | } while (0) |
| 87 | 87 | ||
| 88 | #define iser_info(fmt, arg...) \ | 88 | #define iser_info(fmt, arg...) \ |
| 89 | do { \ | 89 | do { \ |
| 90 | if (iser_debug_level > 1) \ | 90 | if (unlikely(iser_debug_level > 1)) \ |
| 91 | pr_info(PFX "%s: " fmt, \ | 91 | pr_info(PFX "%s: " fmt, \ |
| 92 | __func__ , ## arg); \ | 92 | __func__ , ## arg); \ |
| 93 | } while (0) | 93 | } while (0) |
| 94 | 94 | ||
| 95 | #define iser_err(fmt, arg...) \ | 95 | #define iser_err(fmt, arg...) \ |
| 96 | do { \ | 96 | pr_err(PFX "%s: " fmt, __func__ , ## arg) |
| 97 | printk(KERN_ERR PFX "%s: " fmt, \ | ||
| 98 | __func__ , ## arg); \ | ||
| 99 | } while (0) | ||
| 100 | 97 | ||
| 101 | #define SHIFT_4K 12 | 98 | #define SHIFT_4K 12 |
| 102 | #define SIZE_4K (1ULL << SHIFT_4K) | 99 | #define SIZE_4K (1ULL << SHIFT_4K) |
| @@ -144,6 +141,11 @@ | |||
| 144 | ISER_MAX_TX_MISC_PDUS + \ | 141 | ISER_MAX_TX_MISC_PDUS + \ |
| 145 | ISER_MAX_RX_MISC_PDUS) | 142 | ISER_MAX_RX_MISC_PDUS) |
| 146 | 143 | ||
| 144 | #define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr \ | ||
| 145 | - ISER_MAX_TX_MISC_PDUS \ | ||
| 146 | - ISER_MAX_RX_MISC_PDUS) / \ | ||
| 147 | (1 + ISER_INFLIGHT_DATAOUTS)) | ||
| 148 | |||
| 147 | #define ISER_WC_BATCH_COUNT 16 | 149 | #define ISER_WC_BATCH_COUNT 16 |
| 148 | #define ISER_SIGNAL_CMD_COUNT 32 | 150 | #define ISER_SIGNAL_CMD_COUNT 32 |
| 149 | 151 | ||
| @@ -247,7 +249,6 @@ struct iscsi_endpoint; | |||
| 247 | * @va: MR start address (buffer va) | 249 | * @va: MR start address (buffer va) |
| 248 | * @len: MR length | 250 | * @len: MR length |
| 249 | * @mem_h: pointer to registration context (FMR/Fastreg) | 251 | * @mem_h: pointer to registration context (FMR/Fastreg) |
| 250 | * @is_mr: indicates whether we registered the buffer | ||
| 251 | */ | 252 | */ |
| 252 | struct iser_mem_reg { | 253 | struct iser_mem_reg { |
| 253 | u32 lkey; | 254 | u32 lkey; |
| @@ -255,7 +256,6 @@ struct iser_mem_reg { | |||
| 255 | u64 va; | 256 | u64 va; |
| 256 | u64 len; | 257 | u64 len; |
| 257 | void *mem_h; | 258 | void *mem_h; |
| 258 | int is_mr; | ||
| 259 | }; | 259 | }; |
| 260 | 260 | ||
| 261 | /** | 261 | /** |
| @@ -323,8 +323,6 @@ struct iser_rx_desc { | |||
| 323 | char pad[ISER_RX_PAD_SIZE]; | 323 | char pad[ISER_RX_PAD_SIZE]; |
| 324 | } __attribute__((packed)); | 324 | } __attribute__((packed)); |
| 325 | 325 | ||
| 326 | #define ISER_MAX_CQ 4 | ||
| 327 | |||
| 328 | struct iser_conn; | 326 | struct iser_conn; |
| 329 | struct ib_conn; | 327 | struct ib_conn; |
| 330 | struct iscsi_iser_task; | 328 | struct iscsi_iser_task; |
| @@ -375,7 +373,7 @@ struct iser_device { | |||
| 375 | struct list_head ig_list; | 373 | struct list_head ig_list; |
| 376 | int refcount; | 374 | int refcount; |
| 377 | int comps_used; | 375 | int comps_used; |
| 378 | struct iser_comp comps[ISER_MAX_CQ]; | 376 | struct iser_comp *comps; |
| 379 | int (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn, | 377 | int (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn, |
| 380 | unsigned cmds_max); | 378 | unsigned cmds_max); |
| 381 | void (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn); | 379 | void (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn); |
| @@ -432,6 +430,7 @@ struct fast_reg_descriptor { | |||
| 432 | * @cma_id: rdma_cm connection manager handle | 430 | * @cma_id: rdma_cm connection manager handle |
| 433 | * @qp: Connection Queue-pair | 431 | * @qp: Connection Queue-pair |
| 434 | * @post_recv_buf_count: post receive counter | 432 | * @post_recv_buf_count: post receive counter |
| 433 | * @sig_count: send work request signal count | ||
| 435 | * @rx_wr: receive work request for batch posts | 434 | * @rx_wr: receive work request for batch posts |
| 436 | * @device: reference to iser device | 435 | * @device: reference to iser device |
| 437 | * @comp: iser completion context | 436 | * @comp: iser completion context |
| @@ -452,6 +451,7 @@ struct ib_conn { | |||
| 452 | struct rdma_cm_id *cma_id; | 451 | struct rdma_cm_id *cma_id; |
| 453 | struct ib_qp *qp; | 452 | struct ib_qp *qp; |
| 454 | int post_recv_buf_count; | 453 | int post_recv_buf_count; |
| 454 | u8 sig_count; | ||
| 455 | struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; | 455 | struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; |
| 456 | struct iser_device *device; | 456 | struct iser_device *device; |
| 457 | struct iser_comp *comp; | 457 | struct iser_comp *comp; |
| @@ -482,6 +482,7 @@ struct ib_conn { | |||
| 482 | * to max number of post recvs | 482 | * to max number of post recvs |
| 483 | * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1) | 483 | * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1) |
| 484 | * @min_posted_rx: (qp_max_recv_dtos >> 2) | 484 | * @min_posted_rx: (qp_max_recv_dtos >> 2) |
| 485 | * @max_cmds: maximum cmds allowed for this connection | ||
| 485 | * @name: connection peer portal | 486 | * @name: connection peer portal |
| 486 | * @release_work: deferred work for release job | 487 | * @release_work: deferred work for release job |
| 487 | * @state_mutex: protects iser connection state | 488 | * @state_mutex: protects iser connection state |
| @@ -507,6 +508,7 @@ struct iser_conn { | |||
| 507 | unsigned qp_max_recv_dtos; | 508 | unsigned qp_max_recv_dtos; |
| 508 | unsigned qp_max_recv_dtos_mask; | 509 | unsigned qp_max_recv_dtos_mask; |
| 509 | unsigned min_posted_rx; | 510 | unsigned min_posted_rx; |
| 511 | u16 max_cmds; | ||
| 510 | char name[ISER_OBJECT_NAME_SIZE]; | 512 | char name[ISER_OBJECT_NAME_SIZE]; |
| 511 | struct work_struct release_work; | 513 | struct work_struct release_work; |
| 512 | struct mutex state_mutex; | 514 | struct mutex state_mutex; |
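
The arithmetic behind the new ISER_GET_MAX_XMIT_CMDS macro can be seen in this standalone sketch; the constant values are stand-ins, not necessarily the kernel's. The available send work requests, minus the miscellaneous TX/RX PDUs, are divided by the work requests one command may consume (one plus the in-flight DataOuts).

#include <stdio.h>

#define ISER_MAX_TX_MISC_PDUS	15	/* stand-in value */
#define ISER_MAX_RX_MISC_PDUS	15	/* stand-in value */
#define ISER_INFLIGHT_DATAOUTS	8	/* stand-in value */

#define ISER_GET_MAX_XMIT_CMDS(send_wr) \
	(((send_wr) - ISER_MAX_TX_MISC_PDUS - ISER_MAX_RX_MISC_PDUS) / \
	 (1 + ISER_INFLIGHT_DATAOUTS))

int main(void)
{
	int max_qp_wr = 16 * 1024;	/* e.g. a device limited to 16K send WRs */

	printf("max xmit cmds = %d\n", ISER_GET_MAX_XMIT_CMDS(max_qp_wr));
	return 0;
}
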
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 5a489ea63732..3821633f1065 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c | |||
| @@ -369,7 +369,7 @@ static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) | |||
| 369 | return 0; | 369 | return 0; |
| 370 | } | 370 | } |
| 371 | 371 | ||
| 372 | static inline bool iser_signal_comp(int sig_count) | 372 | static inline bool iser_signal_comp(u8 sig_count) |
| 373 | { | 373 | { |
| 374 | return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0); | 374 | return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0); |
| 375 | } | 375 | } |
| @@ -388,7 +388,7 @@ int iser_send_command(struct iscsi_conn *conn, | |||
| 388 | struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; | 388 | struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr; |
| 389 | struct scsi_cmnd *sc = task->sc; | 389 | struct scsi_cmnd *sc = task->sc; |
| 390 | struct iser_tx_desc *tx_desc = &iser_task->desc; | 390 | struct iser_tx_desc *tx_desc = &iser_task->desc; |
| 391 | static unsigned sig_count; | 391 | u8 sig_count = ++iser_conn->ib_conn.sig_count; |
| 392 | 392 | ||
| 393 | edtl = ntohl(hdr->data_length); | 393 | edtl = ntohl(hdr->data_length); |
| 394 | 394 | ||
| @@ -435,7 +435,7 @@ int iser_send_command(struct iscsi_conn *conn, | |||
| 435 | iser_task->status = ISER_TASK_STATUS_STARTED; | 435 | iser_task->status = ISER_TASK_STATUS_STARTED; |
| 436 | 436 | ||
| 437 | err = iser_post_send(&iser_conn->ib_conn, tx_desc, | 437 | err = iser_post_send(&iser_conn->ib_conn, tx_desc, |
| 438 | iser_signal_comp(++sig_count)); | 438 | iser_signal_comp(sig_count)); |
| 439 | if (!err) | 439 | if (!err) |
| 440 | return 0; | 440 | return 0; |
| 441 | 441 | ||
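
A minimal sketch of the completion-signaling policy touched above, assuming only that sends are signaled once every ISER_SIGNAL_CMD_COUNT posts: moving the counter into the connection as a u8 makes the policy per-connection, and because 256 is a multiple of 32 the modulo stays consistent across wrap-around.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ISER_SIGNAL_CMD_COUNT 32

struct conn {
	uint8_t sig_count;	/* wraps at 256, a multiple of ISER_SIGNAL_CMD_COUNT */
};

static bool iser_signal_comp(uint8_t sig_count)
{
	return (sig_count % ISER_SIGNAL_CMD_COUNT) == 0;
}

int main(void)
{
	struct conn c = { 0 };
	int signaled = 0;

	for (int i = 0; i < 1000; i++)
		if (iser_signal_comp(++c.sig_count))
			signaled++;

	printf("signaled %d of 1000 sends\n", signaled);
	return 0;
}
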
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 6c5ce357fba6..abce9339333f 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c | |||
| @@ -73,7 +73,6 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, | |||
| 73 | 73 | ||
| 74 | if (cmd_dir == ISER_DIR_OUT) { | 74 | if (cmd_dir == ISER_DIR_OUT) { |
| 75 | /* copy the unaligned sg to the buffer which is used for RDMA */ | 75 | /* copy the unaligned sg to the buffer which is used for RDMA */ |
| 76 | int i; | ||
| 77 | char *p, *from; | 76 | char *p, *from; |
| 78 | 77 | ||
| 79 | sgl = (struct scatterlist *)data->buf; | 78 | sgl = (struct scatterlist *)data->buf; |
| @@ -409,7 +408,6 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, | |||
| 409 | regd_buf->reg.rkey = device->mr->rkey; | 408 | regd_buf->reg.rkey = device->mr->rkey; |
| 410 | regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); | 409 | regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); |
| 411 | regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); | 410 | regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); |
| 412 | regd_buf->reg.is_mr = 0; | ||
| 413 | 411 | ||
| 414 | iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " | 412 | iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " |
| 415 | "va: 0x%08lX sz: %ld]\n", | 413 | "va: 0x%08lX sz: %ld]\n", |
| @@ -440,13 +438,13 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, | |||
| 440 | return 0; | 438 | return 0; |
| 441 | } | 439 | } |
| 442 | 440 | ||
| 443 | static inline void | 441 | static void |
| 444 | iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, | 442 | iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, |
| 445 | struct ib_sig_domain *domain) | 443 | struct ib_sig_domain *domain) |
| 446 | { | 444 | { |
| 447 | domain->sig_type = IB_SIG_TYPE_T10_DIF; | 445 | domain->sig_type = IB_SIG_TYPE_T10_DIF; |
| 448 | domain->sig.dif.pi_interval = sc->device->sector_size; | 446 | domain->sig.dif.pi_interval = scsi_prot_interval(sc); |
| 449 | domain->sig.dif.ref_tag = scsi_get_lba(sc) & 0xffffffff; | 447 | domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc); |
| 450 | /* | 448 | /* |
| 451 | * At the moment we hard code those, but in the future | 449 | * At the moment we hard code those, but in the future |
| 452 | * we will take them from sc. | 450 | * we will take them from sc. |
| @@ -454,8 +452,7 @@ iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs, | |||
| 454 | domain->sig.dif.apptag_check_mask = 0xffff; | 452 | domain->sig.dif.apptag_check_mask = 0xffff; |
| 455 | domain->sig.dif.app_escape = true; | 453 | domain->sig.dif.app_escape = true; |
| 456 | domain->sig.dif.ref_escape = true; | 454 | domain->sig.dif.ref_escape = true; |
| 457 | if (scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE1 || | 455 | if (sc->prot_flags & SCSI_PROT_REF_INCREMENT) |
| 458 | scsi_get_prot_type(sc) == SCSI_PROT_DIF_TYPE2) | ||
| 459 | domain->sig.dif.ref_remap = true; | 456 | domain->sig.dif.ref_remap = true; |
| 460 | }; | 457 | }; |
| 461 | 458 | ||
| @@ -473,26 +470,16 @@ iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs) | |||
| 473 | case SCSI_PROT_WRITE_STRIP: | 470 | case SCSI_PROT_WRITE_STRIP: |
| 474 | sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; | 471 | sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE; |
| 475 | iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); | 472 | iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); |
| 476 | /* | 473 | sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ? |
| 477 | * At the moment we use this modparam to tell what is | 474 | IB_T10DIF_CSUM : IB_T10DIF_CRC; |
| 478 | * the memory bg_type, in the future we will take it | ||
| 479 | * from sc. | ||
| 480 | */ | ||
| 481 | sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM : | ||
| 482 | IB_T10DIF_CRC; | ||
| 483 | break; | 475 | break; |
| 484 | case SCSI_PROT_READ_PASS: | 476 | case SCSI_PROT_READ_PASS: |
| 485 | case SCSI_PROT_WRITE_PASS: | 477 | case SCSI_PROT_WRITE_PASS: |
| 486 | iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire); | 478 | iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire); |
| 487 | sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; | 479 | sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC; |
| 488 | iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); | 480 | iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem); |
| 489 | /* | 481 | sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ? |
| 490 | * At the moment we use this modparam to tell what is | 482 | IB_T10DIF_CSUM : IB_T10DIF_CRC; |
| 491 | * the memory bg_type, in the future we will take it | ||
| 492 | * from sc. | ||
| 493 | */ | ||
| 494 | sig_attrs->mem.sig.dif.bg_type = iser_pi_guard ? IB_T10DIF_CSUM : | ||
| 495 | IB_T10DIF_CRC; | ||
| 496 | break; | 483 | break; |
| 497 | default: | 484 | default: |
| 498 | iser_err("Unsupported PI operation %d\n", | 485 | iser_err("Unsupported PI operation %d\n", |
| @@ -503,26 +490,28 @@ iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs) | |||
| 503 | return 0; | 490 | return 0; |
| 504 | } | 491 | } |
| 505 | 492 | ||
| 506 | static int | 493 | static inline void |
| 507 | iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) | 494 | iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) |
| 508 | { | 495 | { |
| 509 | switch (scsi_get_prot_type(sc)) { | 496 | *mask = 0; |
| 510 | case SCSI_PROT_DIF_TYPE0: | 497 | if (sc->prot_flags & SCSI_PROT_REF_CHECK) |
| 511 | break; | 498 | *mask |= ISER_CHECK_REFTAG; |
| 512 | case SCSI_PROT_DIF_TYPE1: | 499 | if (sc->prot_flags & SCSI_PROT_GUARD_CHECK) |
| 513 | case SCSI_PROT_DIF_TYPE2: | 500 | *mask |= ISER_CHECK_GUARD; |
| 514 | *mask = ISER_CHECK_GUARD | ISER_CHECK_REFTAG; | 501 | } |
| 515 | break; | ||
| 516 | case SCSI_PROT_DIF_TYPE3: | ||
| 517 | *mask = ISER_CHECK_GUARD; | ||
| 518 | break; | ||
| 519 | default: | ||
| 520 | iser_err("Unsupported protection type %d\n", | ||
| 521 | scsi_get_prot_type(sc)); | ||
| 522 | return -EINVAL; | ||
| 523 | } | ||
| 524 | 502 | ||
| 525 | return 0; | 503 | static void |
| 504 | iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) | ||
| 505 | { | ||
| 506 | u32 rkey; | ||
| 507 | |||
| 508 | memset(inv_wr, 0, sizeof(*inv_wr)); | ||
| 509 | inv_wr->opcode = IB_WR_LOCAL_INV; | ||
| 510 | inv_wr->wr_id = ISER_FASTREG_LI_WRID; | ||
| 511 | inv_wr->ex.invalidate_rkey = mr->rkey; | ||
| 512 | |||
| 513 | rkey = ib_inc_rkey(mr->rkey); | ||
| 514 | ib_update_fast_reg_key(mr, rkey); | ||
| 526 | } | 515 | } |
| 527 | 516 | ||
| 528 | static int | 517 | static int |
| @@ -536,26 +525,17 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, | |||
| 536 | struct ib_send_wr *bad_wr, *wr = NULL; | 525 | struct ib_send_wr *bad_wr, *wr = NULL; |
| 537 | struct ib_sig_attrs sig_attrs; | 526 | struct ib_sig_attrs sig_attrs; |
| 538 | int ret; | 527 | int ret; |
| 539 | u32 key; | ||
| 540 | 528 | ||
| 541 | memset(&sig_attrs, 0, sizeof(sig_attrs)); | 529 | memset(&sig_attrs, 0, sizeof(sig_attrs)); |
| 542 | ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs); | 530 | ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs); |
| 543 | if (ret) | 531 | if (ret) |
| 544 | goto err; | 532 | goto err; |
| 545 | 533 | ||
| 546 | ret = iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask); | 534 | iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask); |
| 547 | if (ret) | ||
| 548 | goto err; | ||
| 549 | 535 | ||
| 550 | if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) { | 536 | if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) { |
| 551 | memset(&inv_wr, 0, sizeof(inv_wr)); | 537 | iser_inv_rkey(&inv_wr, pi_ctx->sig_mr); |
| 552 | inv_wr.opcode = IB_WR_LOCAL_INV; | ||
| 553 | inv_wr.wr_id = ISER_FASTREG_LI_WRID; | ||
| 554 | inv_wr.ex.invalidate_rkey = pi_ctx->sig_mr->rkey; | ||
| 555 | wr = &inv_wr; | 538 | wr = &inv_wr; |
| 556 | /* Bump the key */ | ||
| 557 | key = (u8)(pi_ctx->sig_mr->rkey & 0x000000FF); | ||
| 558 | ib_update_fast_reg_key(pi_ctx->sig_mr, ++key); | ||
| 559 | } | 539 | } |
| 560 | 540 | ||
| 561 | memset(&sig_wr, 0, sizeof(sig_wr)); | 541 | memset(&sig_wr, 0, sizeof(sig_wr)); |
| @@ -585,12 +565,7 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, | |||
| 585 | 565 | ||
| 586 | sig_sge->lkey = pi_ctx->sig_mr->lkey; | 566 | sig_sge->lkey = pi_ctx->sig_mr->lkey; |
| 587 | sig_sge->addr = 0; | 567 | sig_sge->addr = 0; |
| 588 | sig_sge->length = data_sge->length + prot_sge->length; | 568 | sig_sge->length = scsi_transfer_length(iser_task->sc); |
| 589 | if (scsi_get_prot_op(iser_task->sc) == SCSI_PROT_WRITE_INSERT || | ||
| 590 | scsi_get_prot_op(iser_task->sc) == SCSI_PROT_READ_STRIP) { | ||
| 591 | sig_sge->length += (data_sge->length / | ||
| 592 | iser_task->sc->device->sector_size) * 8; | ||
| 593 | } | ||
| 594 | 569 | ||
| 595 | iser_dbg("sig_sge: addr: 0x%llx length: %u lkey: 0x%x\n", | 570 | iser_dbg("sig_sge: addr: 0x%llx length: %u lkey: 0x%x\n", |
| 596 | sig_sge->addr, sig_sge->length, | 571 | sig_sge->addr, sig_sge->length, |
| @@ -613,7 +588,6 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, | |||
| 613 | struct ib_fast_reg_page_list *frpl; | 588 | struct ib_fast_reg_page_list *frpl; |
| 614 | struct ib_send_wr fastreg_wr, inv_wr; | 589 | struct ib_send_wr fastreg_wr, inv_wr; |
| 615 | struct ib_send_wr *bad_wr, *wr = NULL; | 590 | struct ib_send_wr *bad_wr, *wr = NULL; |
| 616 | u8 key; | ||
| 617 | int ret, offset, size, plen; | 591 | int ret, offset, size, plen; |
| 618 | 592 | ||
| 619 | /* if there is a single dma entry, dma mr suffices */ | 593 | /* if there is a single dma entry, dma mr suffices */ |
| @@ -645,14 +619,8 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, | |||
| 645 | } | 619 | } |
| 646 | 620 | ||
| 647 | if (!(desc->reg_indicators & ind)) { | 621 | if (!(desc->reg_indicators & ind)) { |
| 648 | memset(&inv_wr, 0, sizeof(inv_wr)); | 622 | iser_inv_rkey(&inv_wr, mr); |
| 649 | inv_wr.wr_id = ISER_FASTREG_LI_WRID; | ||
| 650 | inv_wr.opcode = IB_WR_LOCAL_INV; | ||
| 651 | inv_wr.ex.invalidate_rkey = mr->rkey; | ||
| 652 | wr = &inv_wr; | 623 | wr = &inv_wr; |
| 653 | /* Bump the key */ | ||
| 654 | key = (u8)(mr->rkey & 0x000000FF); | ||
| 655 | ib_update_fast_reg_key(mr, ++key); | ||
| 656 | } | 624 | } |
| 657 | 625 | ||
| 658 | /* Prepare FASTREG WR */ | 626 | /* Prepare FASTREG WR */ |
| @@ -770,15 +738,11 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task, | |||
| 770 | regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey; | 738 | regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey; |
| 771 | regd_buf->reg.va = sig_sge.addr; | 739 | regd_buf->reg.va = sig_sge.addr; |
| 772 | regd_buf->reg.len = sig_sge.length; | 740 | regd_buf->reg.len = sig_sge.length; |
| 773 | regd_buf->reg.is_mr = 1; | ||
| 774 | } else { | 741 | } else { |
| 775 | if (desc) { | 742 | if (desc) |
| 776 | regd_buf->reg.rkey = desc->data_mr->rkey; | 743 | regd_buf->reg.rkey = desc->data_mr->rkey; |
| 777 | regd_buf->reg.is_mr = 1; | 744 | else |
| 778 | } else { | ||
| 779 | regd_buf->reg.rkey = device->mr->rkey; | 745 | regd_buf->reg.rkey = device->mr->rkey; |
| 780 | regd_buf->reg.is_mr = 0; | ||
| 781 | } | ||
| 782 | 746 | ||
| 783 | regd_buf->reg.lkey = data_sge.lkey; | 747 | regd_buf->reg.lkey = data_sge.lkey; |
| 784 | regd_buf->reg.va = data_sge.addr; | 748 | regd_buf->reg.va = data_sge.addr; |
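
The key "bump" consolidated into iser_inv_rkey() above touches only the low byte of the rkey; the helper below is a standalone userspace stand-in that mirrors that behaviour, keeping the upper bits (which index the MR) unchanged.

#include <stdint.h>
#include <stdio.h>

/* Increment only the 8-bit key portion of an rkey, preserving the MR index. */
static uint32_t inc_rkey(uint32_t rkey)
{
	const uint32_t mask = 0x000000ff;

	return ((rkey + 1) & mask) | (rkey & ~mask);
}

int main(void)
{
	uint32_t rkey = 0x12345678;	/* made-up starting key */

	for (int i = 0; i < 3; i++) {
		rkey = inc_rkey(rkey);
		printf("rkey -> 0x%08x\n", rkey);
	}
	return 0;
}
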
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 67225bb82bb5..695a2704bd43 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c | |||
| @@ -76,7 +76,7 @@ static void iser_event_handler(struct ib_event_handler *handler, | |||
| 76 | static int iser_create_device_ib_res(struct iser_device *device) | 76 | static int iser_create_device_ib_res(struct iser_device *device) |
| 77 | { | 77 | { |
| 78 | struct ib_device_attr *dev_attr = &device->dev_attr; | 78 | struct ib_device_attr *dev_attr = &device->dev_attr; |
| 79 | int ret, i; | 79 | int ret, i, max_cqe; |
| 80 | 80 | ||
| 81 | ret = ib_query_device(device->ib_device, dev_attr); | 81 | ret = ib_query_device(device->ib_device, dev_attr); |
| 82 | if (ret) { | 82 | if (ret) { |
| @@ -104,11 +104,19 @@ static int iser_create_device_ib_res(struct iser_device *device) | |||
| 104 | return -1; | 104 | return -1; |
| 105 | } | 105 | } |
| 106 | 106 | ||
| 107 | device->comps_used = min(ISER_MAX_CQ, | 107 | device->comps_used = min_t(int, num_online_cpus(), |
| 108 | device->ib_device->num_comp_vectors); | 108 | device->ib_device->num_comp_vectors); |
| 109 | iser_info("using %d CQs, device %s supports %d vectors\n", | 109 | |
| 110 | device->comps = kcalloc(device->comps_used, sizeof(*device->comps), | ||
| 111 | GFP_KERNEL); | ||
| 112 | if (!device->comps) | ||
| 113 | goto comps_err; | ||
| 114 | |||
| 115 | max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe); | ||
| 116 | |||
| 117 | iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n", | ||
| 110 | device->comps_used, device->ib_device->name, | 118 | device->comps_used, device->ib_device->name, |
| 111 | device->ib_device->num_comp_vectors); | 119 | device->ib_device->num_comp_vectors, max_cqe); |
| 112 | 120 | ||
| 113 | device->pd = ib_alloc_pd(device->ib_device); | 121 | device->pd = ib_alloc_pd(device->ib_device); |
| 114 | if (IS_ERR(device->pd)) | 122 | if (IS_ERR(device->pd)) |
| @@ -122,7 +130,7 @@ static int iser_create_device_ib_res(struct iser_device *device) | |||
| 122 | iser_cq_callback, | 130 | iser_cq_callback, |
| 123 | iser_cq_event_callback, | 131 | iser_cq_event_callback, |
| 124 | (void *)comp, | 132 | (void *)comp, |
| 125 | ISER_MAX_CQ_LEN, i); | 133 | max_cqe, i); |
| 126 | if (IS_ERR(comp->cq)) { | 134 | if (IS_ERR(comp->cq)) { |
| 127 | comp->cq = NULL; | 135 | comp->cq = NULL; |
| 128 | goto cq_err; | 136 | goto cq_err; |
| @@ -162,6 +170,8 @@ cq_err: | |||
| 162 | } | 170 | } |
| 163 | ib_dealloc_pd(device->pd); | 171 | ib_dealloc_pd(device->pd); |
| 164 | pd_err: | 172 | pd_err: |
| 173 | kfree(device->comps); | ||
| 174 | comps_err: | ||
| 165 | iser_err("failed to allocate an IB resource\n"); | 175 | iser_err("failed to allocate an IB resource\n"); |
| 166 | return -1; | 176 | return -1; |
| 167 | } | 177 | } |
| @@ -187,6 +197,9 @@ static void iser_free_device_ib_res(struct iser_device *device) | |||
| 187 | (void)ib_dereg_mr(device->mr); | 197 | (void)ib_dereg_mr(device->mr); |
| 188 | (void)ib_dealloc_pd(device->pd); | 198 | (void)ib_dealloc_pd(device->pd); |
| 189 | 199 | ||
| 200 | kfree(device->comps); | ||
| 201 | device->comps = NULL; | ||
| 202 | |||
| 190 | device->mr = NULL; | 203 | device->mr = NULL; |
| 191 | device->pd = NULL; | 204 | device->pd = NULL; |
| 192 | } | 205 | } |
| @@ -425,7 +438,10 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn) | |||
| 425 | */ | 438 | */ |
| 426 | static int iser_create_ib_conn_res(struct ib_conn *ib_conn) | 439 | static int iser_create_ib_conn_res(struct ib_conn *ib_conn) |
| 427 | { | 440 | { |
| 441 | struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, | ||
| 442 | ib_conn); | ||
| 428 | struct iser_device *device; | 443 | struct iser_device *device; |
| 444 | struct ib_device_attr *dev_attr; | ||
| 429 | struct ib_qp_init_attr init_attr; | 445 | struct ib_qp_init_attr init_attr; |
| 430 | int ret = -ENOMEM; | 446 | int ret = -ENOMEM; |
| 431 | int index, min_index = 0; | 447 | int index, min_index = 0; |
| @@ -433,6 +449,7 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) | |||
| 433 | BUG_ON(ib_conn->device == NULL); | 449 | BUG_ON(ib_conn->device == NULL); |
| 434 | 450 | ||
| 435 | device = ib_conn->device; | 451 | device = ib_conn->device; |
| 452 | dev_attr = &device->dev_attr; | ||
| 436 | 453 | ||
| 437 | memset(&init_attr, 0, sizeof init_attr); | 454 | memset(&init_attr, 0, sizeof init_attr); |
| 438 | 455 | ||
| @@ -460,8 +477,20 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) | |||
| 460 | if (ib_conn->pi_support) { | 477 | if (ib_conn->pi_support) { |
| 461 | init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1; | 478 | init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1; |
| 462 | init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; | 479 | init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN; |
| 480 | iser_conn->max_cmds = | ||
| 481 | ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS); | ||
| 463 | } else { | 482 | } else { |
| 464 | init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1; | 483 | if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) { |
| 484 | init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1; | ||
| 485 | iser_conn->max_cmds = | ||
| 486 | ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS); | ||
| 487 | } else { | ||
| 488 | init_attr.cap.max_send_wr = dev_attr->max_qp_wr; | ||
| 489 | iser_conn->max_cmds = | ||
| 490 | ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr); | ||
| 491 | iser_dbg("device %s supports max_send_wr %d\n", | ||
| 492 | device->ib_device->name, dev_attr->max_qp_wr); | ||
| 493 | } | ||
| 465 | } | 494 | } |
| 466 | 495 | ||
| 467 | ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); | 496 | ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr); |
| @@ -475,7 +504,11 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) | |||
| 475 | return ret; | 504 | return ret; |
| 476 | 505 | ||
| 477 | out_err: | 506 | out_err: |
| 507 | mutex_lock(&ig.connlist_mutex); | ||
| 508 | ib_conn->comp->active_qps--; | ||
| 509 | mutex_unlock(&ig.connlist_mutex); | ||
| 478 | iser_err("unable to alloc mem or create resource, err %d\n", ret); | 510 | iser_err("unable to alloc mem or create resource, err %d\n", ret); |
| 511 | |||
| 479 | return ret; | 512 | return ret; |
| 480 | } | 513 | } |
| 481 | 514 | ||
| @@ -610,9 +643,11 @@ void iser_conn_release(struct iser_conn *iser_conn) | |||
| 610 | mutex_unlock(&ig.connlist_mutex); | 643 | mutex_unlock(&ig.connlist_mutex); |
| 611 | 644 | ||
| 612 | mutex_lock(&iser_conn->state_mutex); | 645 | mutex_lock(&iser_conn->state_mutex); |
| 613 | if (iser_conn->state != ISER_CONN_DOWN) | 646 | if (iser_conn->state != ISER_CONN_DOWN) { |
| 614 | iser_warn("iser conn %p state %d, expected state down.\n", | 647 | iser_warn("iser conn %p state %d, expected state down.\n", |
| 615 | iser_conn, iser_conn->state); | 648 | iser_conn, iser_conn->state); |
| 649 | iser_conn->state = ISER_CONN_DOWN; | ||
| 650 | } | ||
| 616 | /* | 651 | /* |
| 617 | * In case we never got to bind stage, we still need to | 652 | * In case we never got to bind stage, we still need to |
| 618 | * release IB resources (which is safe to call more than once). | 653 | * release IB resources (which is safe to call more than once). |
| @@ -662,8 +697,10 @@ int iser_conn_terminate(struct iser_conn *iser_conn) | |||
| 662 | 697 | ||
| 663 | /* post an indication that all flush errors were consumed */ | 698 | /* post an indication that all flush errors were consumed */ |
| 664 | err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr); | 699 | err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr); |
| 665 | if (err) | 700 | if (err) { |
| 666 | iser_err("conn %p failed to post beacon", ib_conn); | 701 | iser_err("conn %p failed to post beacon", ib_conn); |
| 702 | return 1; | ||
| 703 | } | ||
| 667 | 704 | ||
| 668 | wait_for_completion(&ib_conn->flush_comp); | 705 | wait_for_completion(&ib_conn->flush_comp); |
| 669 | } | 706 | } |
| @@ -846,20 +883,21 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve | |||
| 846 | break; | 883 | break; |
| 847 | case RDMA_CM_EVENT_DISCONNECTED: | 884 | case RDMA_CM_EVENT_DISCONNECTED: |
| 848 | case RDMA_CM_EVENT_ADDR_CHANGE: | 885 | case RDMA_CM_EVENT_ADDR_CHANGE: |
| 849 | iser_disconnected_handler(cma_id); | 886 | case RDMA_CM_EVENT_TIMEWAIT_EXIT: |
| 887 | iser_cleanup_handler(cma_id, false); | ||
| 850 | break; | 888 | break; |
| 851 | case RDMA_CM_EVENT_DEVICE_REMOVAL: | 889 | case RDMA_CM_EVENT_DEVICE_REMOVAL: |
| 852 | /* | 890 | /* |
| 853 | * we *must* destroy the device as we cannot rely | 891 | * we *must* destroy the device as we cannot rely |
| 854 | * on iscsid to be around to initiate error handling. | 892 | * on iscsid to be around to initiate error handling. |
| 855 | * also implicitly destroy the cma_id. | 893 | * also if we are not in state DOWN implicitly destroy |
| 894 | * the cma_id. | ||
| 856 | */ | 895 | */ |
| 857 | iser_cleanup_handler(cma_id, true); | 896 | iser_cleanup_handler(cma_id, true); |
| 858 | iser_conn->ib_conn.cma_id = NULL; | 897 | if (iser_conn->state != ISER_CONN_DOWN) { |
| 859 | ret = 1; | 898 | iser_conn->ib_conn.cma_id = NULL; |
| 860 | break; | 899 | ret = 1; |
| 861 | case RDMA_CM_EVENT_TIMEWAIT_EXIT: | 900 | } |
| 862 | iser_cleanup_handler(cma_id, false); | ||
| 863 | break; | 901 | break; |
| 864 | default: | 902 | default: |
| 865 | iser_err("Unexpected RDMA CM event (%d)\n", event->event); | 903 | iser_err("Unexpected RDMA CM event (%d)\n", event->event); |
| @@ -981,7 +1019,6 @@ int iser_reg_page_vec(struct ib_conn *ib_conn, | |||
| 981 | mem_reg->rkey = mem->fmr->rkey; | 1019 | mem_reg->rkey = mem->fmr->rkey; |
| 982 | mem_reg->len = page_vec->length * SIZE_4K; | 1020 | mem_reg->len = page_vec->length * SIZE_4K; |
| 983 | mem_reg->va = io_addr; | 1021 | mem_reg->va = io_addr; |
| 984 | mem_reg->is_mr = 1; | ||
| 985 | mem_reg->mem_h = (void *)mem; | 1022 | mem_reg->mem_h = (void *)mem; |
| 986 | 1023 | ||
| 987 | mem_reg->va += page_vec->offset; | 1024 | mem_reg->va += page_vec->offset; |
| @@ -1008,7 +1045,7 @@ void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, | |||
| 1008 | struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; | 1045 | struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; |
| 1009 | int ret; | 1046 | int ret; |
| 1010 | 1047 | ||
| 1011 | if (!reg->is_mr) | 1048 | if (!reg->mem_h) |
| 1012 | return; | 1049 | return; |
| 1013 | 1050 | ||
| 1014 | iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); | 1051 | iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); |
| @@ -1028,11 +1065,10 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, | |||
| 1028 | struct ib_conn *ib_conn = &iser_conn->ib_conn; | 1065 | struct ib_conn *ib_conn = &iser_conn->ib_conn; |
| 1029 | struct fast_reg_descriptor *desc = reg->mem_h; | 1066 | struct fast_reg_descriptor *desc = reg->mem_h; |
| 1030 | 1067 | ||
| 1031 | if (!reg->is_mr) | 1068 | if (!desc) |
| 1032 | return; | 1069 | return; |
| 1033 | 1070 | ||
| 1034 | reg->mem_h = NULL; | 1071 | reg->mem_h = NULL; |
| 1035 | reg->is_mr = 0; | ||
| 1036 | spin_lock_bh(&ib_conn->lock); | 1072 | spin_lock_bh(&ib_conn->lock); |
| 1037 | list_add_tail(&desc->list, &ib_conn->fastreg.pool); | 1073 | list_add_tail(&desc->list, &ib_conn->fastreg.pool); |
| 1038 | spin_unlock_bh(&ib_conn->lock); | 1074 | spin_unlock_bh(&ib_conn->lock); |
| @@ -1049,7 +1085,7 @@ int iser_post_recvl(struct iser_conn *iser_conn) | |||
| 1049 | sge.length = ISER_RX_LOGIN_SIZE; | 1085 | sge.length = ISER_RX_LOGIN_SIZE; |
| 1050 | sge.lkey = ib_conn->device->mr->lkey; | 1086 | sge.lkey = ib_conn->device->mr->lkey; |
| 1051 | 1087 | ||
| 1052 | rx_wr.wr_id = (unsigned long)iser_conn->login_resp_buf; | 1088 | rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf; |
| 1053 | rx_wr.sg_list = &sge; | 1089 | rx_wr.sg_list = &sge; |
| 1054 | rx_wr.num_sge = 1; | 1090 | rx_wr.num_sge = 1; |
| 1055 | rx_wr.next = NULL; | 1091 | rx_wr.next = NULL; |
| @@ -1073,7 +1109,7 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count) | |||
| 1073 | 1109 | ||
| 1074 | for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { | 1110 | for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { |
| 1075 | rx_desc = &iser_conn->rx_descs[my_rx_head]; | 1111 | rx_desc = &iser_conn->rx_descs[my_rx_head]; |
| 1076 | rx_wr->wr_id = (unsigned long)rx_desc; | 1112 | rx_wr->wr_id = (uintptr_t)rx_desc; |
| 1077 | rx_wr->sg_list = &rx_desc->rx_sg; | 1113 | rx_wr->sg_list = &rx_desc->rx_sg; |
| 1078 | rx_wr->num_sge = 1; | 1114 | rx_wr->num_sge = 1; |
| 1079 | rx_wr->next = rx_wr + 1; | 1115 | rx_wr->next = rx_wr + 1; |
| @@ -1110,7 +1146,7 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, | |||
| 1110 | DMA_TO_DEVICE); | 1146 | DMA_TO_DEVICE); |
| 1111 | 1147 | ||
| 1112 | send_wr.next = NULL; | 1148 | send_wr.next = NULL; |
| 1113 | send_wr.wr_id = (unsigned long)tx_desc; | 1149 | send_wr.wr_id = (uintptr_t)tx_desc; |
| 1114 | send_wr.sg_list = tx_desc->tx_sg; | 1150 | send_wr.sg_list = tx_desc->tx_sg; |
| 1115 | send_wr.num_sge = tx_desc->num_sge; | 1151 | send_wr.num_sge = tx_desc->num_sge; |
| 1116 | send_wr.opcode = IB_WR_SEND; | 1152 | send_wr.opcode = IB_WR_SEND; |
| @@ -1160,6 +1196,7 @@ static void | |||
| 1160 | iser_handle_comp_error(struct ib_conn *ib_conn, | 1196 | iser_handle_comp_error(struct ib_conn *ib_conn, |
| 1161 | struct ib_wc *wc) | 1197 | struct ib_wc *wc) |
| 1162 | { | 1198 | { |
| 1199 | void *wr_id = (void *)(uintptr_t)wc->wr_id; | ||
| 1163 | struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, | 1200 | struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, |
| 1164 | ib_conn); | 1201 | ib_conn); |
| 1165 | 1202 | ||
| @@ -1168,8 +1205,8 @@ iser_handle_comp_error(struct ib_conn *ib_conn, | |||
| 1168 | iscsi_conn_failure(iser_conn->iscsi_conn, | 1205 | iscsi_conn_failure(iser_conn->iscsi_conn, |
| 1169 | ISCSI_ERR_CONN_FAILED); | 1206 | ISCSI_ERR_CONN_FAILED); |
| 1170 | 1207 | ||
| 1171 | if (is_iser_tx_desc(iser_conn, (void *)wc->wr_id)) { | 1208 | if (is_iser_tx_desc(iser_conn, wr_id)) { |
| 1172 | struct iser_tx_desc *desc = (struct iser_tx_desc *)wc->wr_id; | 1209 | struct iser_tx_desc *desc = wr_id; |
| 1173 | 1210 | ||
| 1174 | if (desc->type == ISCSI_TX_DATAOUT) | 1211 | if (desc->type == ISCSI_TX_DATAOUT) |
| 1175 | kmem_cache_free(ig.desc_cache, desc); | 1212 | kmem_cache_free(ig.desc_cache, desc); |
| @@ -1193,14 +1230,14 @@ static void iser_handle_wc(struct ib_wc *wc) | |||
| 1193 | struct iser_rx_desc *rx_desc; | 1230 | struct iser_rx_desc *rx_desc; |
| 1194 | 1231 | ||
| 1195 | ib_conn = wc->qp->qp_context; | 1232 | ib_conn = wc->qp->qp_context; |
| 1196 | if (wc->status == IB_WC_SUCCESS) { | 1233 | if (likely(wc->status == IB_WC_SUCCESS)) { |
| 1197 | if (wc->opcode == IB_WC_RECV) { | 1234 | if (wc->opcode == IB_WC_RECV) { |
| 1198 | rx_desc = (struct iser_rx_desc *)wc->wr_id; | 1235 | rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id; |
| 1199 | iser_rcv_completion(rx_desc, wc->byte_len, | 1236 | iser_rcv_completion(rx_desc, wc->byte_len, |
| 1200 | ib_conn); | 1237 | ib_conn); |
| 1201 | } else | 1238 | } else |
| 1202 | if (wc->opcode == IB_WC_SEND) { | 1239 | if (wc->opcode == IB_WC_SEND) { |
| 1203 | tx_desc = (struct iser_tx_desc *)wc->wr_id; | 1240 | tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id; |
| 1204 | iser_snd_completion(tx_desc, ib_conn); | 1241 | iser_snd_completion(tx_desc, ib_conn); |
| 1205 | } else { | 1242 | } else { |
| 1206 | iser_err("Unknown wc opcode %d\n", wc->opcode); | 1243 | iser_err("Unknown wc opcode %d\n", wc->opcode); |
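
One recurring change above is storing descriptor pointers in the 64-bit wr_id through uintptr_t rather than unsigned long. A standalone sketch of the round trip, with illustrative types:

#include <stdint.h>
#include <stdio.h>

struct tx_desc {
	int type;
};

int main(void)
{
	struct tx_desc desc = { .type = 1 };

	/* post side: pointer -> 64-bit work request id */
	uint64_t wr_id = (uintptr_t)&desc;

	/* completion side: work request id -> pointer */
	struct tx_desc *back = (struct tx_desc *)(uintptr_t)wr_id;

	printf("round trip ok: %d\n", back == &desc);
	return 0;
}
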
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 5461924c9f10..db3c8c851af1 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c | |||
| @@ -2929,7 +2929,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) | |||
| 2929 | return -ENOMEM; | 2929 | return -ENOMEM; |
| 2930 | 2930 | ||
| 2931 | sep_opt = options; | 2931 | sep_opt = options; |
| 2932 | while ((p = strsep(&sep_opt, ",")) != NULL) { | 2932 | while ((p = strsep(&sep_opt, ",\n")) != NULL) { |
| 2933 | if (!*p) | 2933 | if (!*p) |
| 2934 | continue; | 2934 | continue; |
| 2935 | 2935 | ||
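
The one-character change to srp_parse_options() makes strsep() split on newlines as well as commas, so a trailing newline written through sysfs no longer ends up glued to the last option value. A standalone illustration with a made-up option string:

#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[] = "id_ext=0x1,ioc_guid=0x2\n";	/* made-up option string */
	char *sep_opt = buf, *p;

	while ((p = strsep(&sep_opt, ",\n")) != NULL) {
		if (!*p)
			continue;	/* skip the empty token after '\n' */
		printf("option: '%s'\n", p);
	}
	return 0;
}
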
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index e25436b24ce7..629f9f1435a5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c | |||
| @@ -171,9 +171,9 @@ int mlx4_check_port_params(struct mlx4_dev *dev, | |||
| 171 | { | 171 | { |
| 172 | int i; | 172 | int i; |
| 173 | 173 | ||
| 174 | for (i = 0; i < dev->caps.num_ports - 1; i++) { | 174 | if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { |
| 175 | if (port_type[i] != port_type[i + 1]) { | 175 | for (i = 0; i < dev->caps.num_ports - 1; i++) { |
| 176 | if (!(dev->caps.flags & MLX4_DEV_CAP_FLAG_DPDP)) { | 176 | if (port_type[i] != port_type[i + 1]) { |
| 177 | mlx4_err(dev, "Only same port types supported on this HCA, aborting\n"); | 177 | mlx4_err(dev, "Only same port types supported on this HCA, aborting\n"); |
| 178 | return -EINVAL; | 178 | return -EINVAL; |
| 179 | } | 179 | } |
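
The mlx4 hunk above only reorders the check; a standalone sketch of the resulting logic, with illustrative types: the per-port comparison is skipped entirely when the HCA advertises DPDP (different port types) support.

#include <stdbool.h>
#include <stdio.h>

static int check_port_params(bool has_dpdp, const int *port_type, int num_ports)
{
	if (!has_dpdp) {
		for (int i = 0; i < num_ports - 1; i++) {
			if (port_type[i] != port_type[i + 1]) {
				printf("Only same port types supported on this HCA\n");
				return -1;
			}
		}
	}
	return 0;
}

int main(void)
{
	int ports[] = { 1, 2 };		/* e.g. one IB port, one Ethernet port */

	printf("without DPDP: %d\n", check_port_params(false, ports, 2));
	printf("with DPDP:    %d\n", check_port_params(true, ports, 2));
	return 0;
}
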
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index ab684463780b..da82991239a8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c | |||
| @@ -157,6 +157,8 @@ static const char *eqe_type_str(u8 type) | |||
| 157 | return "MLX5_EVENT_TYPE_CMD"; | 157 | return "MLX5_EVENT_TYPE_CMD"; |
| 158 | case MLX5_EVENT_TYPE_PAGE_REQUEST: | 158 | case MLX5_EVENT_TYPE_PAGE_REQUEST: |
| 159 | return "MLX5_EVENT_TYPE_PAGE_REQUEST"; | 159 | return "MLX5_EVENT_TYPE_PAGE_REQUEST"; |
| 160 | case MLX5_EVENT_TYPE_PAGE_FAULT: | ||
| 161 | return "MLX5_EVENT_TYPE_PAGE_FAULT"; | ||
| 160 | default: | 162 | default: |
| 161 | return "Unrecognized event"; | 163 | return "Unrecognized event"; |
| 162 | } | 164 | } |
| @@ -279,6 +281,11 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) | |||
| 279 | } | 281 | } |
| 280 | break; | 282 | break; |
| 281 | 283 | ||
| 284 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 285 | case MLX5_EVENT_TYPE_PAGE_FAULT: | ||
| 286 | mlx5_eq_pagefault(dev, eqe); | ||
| 287 | break; | ||
| 288 | #endif | ||
| 282 | 289 | ||
| 283 | default: | 290 | default: |
| 284 | mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", | 291 | mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", |
| @@ -446,8 +453,12 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev) | |||
| 446 | int mlx5_start_eqs(struct mlx5_core_dev *dev) | 453 | int mlx5_start_eqs(struct mlx5_core_dev *dev) |
| 447 | { | 454 | { |
| 448 | struct mlx5_eq_table *table = &dev->priv.eq_table; | 455 | struct mlx5_eq_table *table = &dev->priv.eq_table; |
| 456 | u32 async_event_mask = MLX5_ASYNC_EVENT_MASK; | ||
| 449 | int err; | 457 | int err; |
| 450 | 458 | ||
| 459 | if (dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG) | ||
| 460 | async_event_mask |= (1ull << MLX5_EVENT_TYPE_PAGE_FAULT); | ||
| 461 | |||
| 451 | err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, | 462 | err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, |
| 452 | MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, | 463 | MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, |
| 453 | "mlx5_cmd_eq", &dev->priv.uuari.uars[0]); | 464 | "mlx5_cmd_eq", &dev->priv.uuari.uars[0]); |
| @@ -459,7 +470,7 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) | |||
| 459 | mlx5_cmd_use_events(dev); | 470 | mlx5_cmd_use_events(dev); |
| 460 | 471 | ||
| 461 | err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC, | 472 | err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC, |
| 462 | MLX5_NUM_ASYNC_EQE, MLX5_ASYNC_EVENT_MASK, | 473 | MLX5_NUM_ASYNC_EQE, async_event_mask, |
| 463 | "mlx5_async_eq", &dev->priv.uuari.uars[0]); | 474 | "mlx5_async_eq", &dev->priv.uuari.uars[0]); |
| 464 | if (err) { | 475 | if (err) { |
| 465 | mlx5_core_warn(dev, "failed to create async EQ %d\n", err); | 476 | mlx5_core_warn(dev, "failed to create async EQ %d\n", err); |
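
A sketch of how the async event mask is extended above: the PAGE_FAULT event bit is subscribed to only when the device reports the on-demand-paging capability. The event type (0xc) and capability bit (24) follow the definitions added in this series; the base mask value is a stand-in.

#include <stdint.h>
#include <stdio.h>

#define MLX5_EVENT_TYPE_PAGE_FAULT	0x0c
#define MLX5_DEV_CAP_FLAG_ON_DMND_PG	(1ULL << 24)
#define ASYNC_EVENT_MASK		0x1ULL	/* stand-in base mask */

int main(void)
{
	uint64_t dev_flags = MLX5_DEV_CAP_FLAG_ON_DMND_PG;	/* pretend ODP is supported */
	uint64_t mask = ASYNC_EVENT_MASK;

	if (dev_flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)
		mask |= 1ULL << MLX5_EVENT_TYPE_PAGE_FAULT;

	printf("async event mask = 0x%llx\n", (unsigned long long)mask);
	return 0;
}
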
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 087c4c797deb..06f9036acd83 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c | |||
| @@ -69,6 +69,46 @@ int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev, struct mlx5_caps *caps) | |||
| 69 | return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR); | 69 | return mlx5_core_get_caps(dev, caps, HCA_CAP_OPMOD_GET_CUR); |
| 70 | } | 70 | } |
| 71 | 71 | ||
| 72 | int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *caps) | ||
| 73 | { | ||
| 74 | u8 in[MLX5_ST_SZ_BYTES(query_hca_cap_in)]; | ||
| 75 | int out_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); | ||
| 76 | void *out; | ||
| 77 | int err; | ||
| 78 | |||
| 79 | if (!(dev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)) | ||
| 80 | return -ENOTSUPP; | ||
| 81 | |||
| 82 | memset(in, 0, sizeof(in)); | ||
| 83 | out = kzalloc(out_sz, GFP_KERNEL); | ||
| 84 | if (!out) | ||
| 85 | return -ENOMEM; | ||
| 86 | MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); | ||
| 87 | MLX5_SET(query_hca_cap_in, in, op_mod, HCA_CAP_OPMOD_GET_ODP_CUR); | ||
| 88 | err = mlx5_cmd_exec(dev, in, sizeof(in), out, out_sz); | ||
| 89 | if (err) | ||
| 90 | goto out; | ||
| 91 | |||
| 92 | err = mlx5_cmd_status_to_err_v2(out); | ||
| 93 | if (err) { | ||
| 94 | mlx5_core_warn(dev, "query cur hca ODP caps failed, %d\n", err); | ||
| 95 | goto out; | ||
| 96 | } | ||
| 97 | |||
| 98 | memcpy(caps, MLX5_ADDR_OF(query_hca_cap_out, out, capability_struct), | ||
| 99 | sizeof(*caps)); | ||
| 100 | |||
| 101 | mlx5_core_dbg(dev, "on-demand paging capabilities:\nrc: %08x\nuc: %08x\nud: %08x\n", | ||
| 102 | be32_to_cpu(caps->per_transport_caps.rc_odp_caps), | ||
| 103 | be32_to_cpu(caps->per_transport_caps.uc_odp_caps), | ||
| 104 | be32_to_cpu(caps->per_transport_caps.ud_odp_caps)); | ||
| 105 | |||
| 106 | out: | ||
| 107 | kfree(out); | ||
| 108 | return err; | ||
| 109 | } | ||
| 110 | EXPORT_SYMBOL(mlx5_query_odp_caps); | ||
| 111 | |||
| 72 | int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) | 112 | int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) |
| 73 | { | 113 | { |
| 74 | struct mlx5_cmd_init_hca_mbox_in in; | 114 | struct mlx5_cmd_init_hca_mbox_in in; |
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 5261a2b0da43..575d853dbe05 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c | |||
| @@ -88,6 +88,95 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) | |||
| 88 | mlx5_core_put_rsc(common); | 88 | mlx5_core_put_rsc(common); |
| 89 | } | 89 | } |
| 90 | 90 | ||
| 91 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 92 | void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe) | ||
| 93 | { | ||
| 94 | struct mlx5_eqe_page_fault *pf_eqe = &eqe->data.page_fault; | ||
| 95 | int qpn = be32_to_cpu(pf_eqe->flags_qpn) & MLX5_QPN_MASK; | ||
| 96 | struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, qpn); | ||
| 97 | struct mlx5_core_qp *qp = | ||
| 98 | container_of(common, struct mlx5_core_qp, common); | ||
| 99 | struct mlx5_pagefault pfault; | ||
| 100 | |||
| 101 | if (!qp) { | ||
| 102 | mlx5_core_warn(dev, "ODP event for non-existent QP %06x\n", | ||
| 103 | qpn); | ||
| 104 | return; | ||
| 105 | } | ||
| 106 | |||
| 107 | pfault.event_subtype = eqe->sub_type; | ||
| 108 | pfault.flags = (be32_to_cpu(pf_eqe->flags_qpn) >> MLX5_QPN_BITS) & | ||
| 109 | (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE | MLX5_PFAULT_RDMA); | ||
| 110 | pfault.bytes_committed = be32_to_cpu( | ||
| 111 | pf_eqe->bytes_committed); | ||
| 112 | |||
| 113 | mlx5_core_dbg(dev, | ||
| 114 | "PAGE_FAULT: subtype: 0x%02x, flags: 0x%02x,\n", | ||
| 115 | eqe->sub_type, pfault.flags); | ||
| 116 | |||
| 117 | switch (eqe->sub_type) { | ||
| 118 | case MLX5_PFAULT_SUBTYPE_RDMA: | ||
| 119 | /* RDMA based event */ | ||
| 120 | pfault.rdma.r_key = | ||
| 121 | be32_to_cpu(pf_eqe->rdma.r_key); | ||
| 122 | pfault.rdma.packet_size = | ||
| 123 | be16_to_cpu(pf_eqe->rdma.packet_length); | ||
| 124 | pfault.rdma.rdma_op_len = | ||
| 125 | be32_to_cpu(pf_eqe->rdma.rdma_op_len); | ||
| 126 | pfault.rdma.rdma_va = | ||
| 127 | be64_to_cpu(pf_eqe->rdma.rdma_va); | ||
| 128 | mlx5_core_dbg(dev, | ||
| 129 | "PAGE_FAULT: qpn: 0x%06x, r_key: 0x%08x,\n", | ||
| 130 | qpn, pfault.rdma.r_key); | ||
| 131 | mlx5_core_dbg(dev, | ||
| 132 | "PAGE_FAULT: rdma_op_len: 0x%08x,\n", | ||
| 133 | pfault.rdma.rdma_op_len); | ||
| 134 | mlx5_core_dbg(dev, | ||
| 135 | "PAGE_FAULT: rdma_va: 0x%016llx,\n", | ||
| 136 | pfault.rdma.rdma_va); | ||
| 137 | mlx5_core_dbg(dev, | ||
| 138 | "PAGE_FAULT: bytes_committed: 0x%06x\n", | ||
| 139 | pfault.bytes_committed); | ||
| 140 | break; | ||
| 141 | |||
| 142 | case MLX5_PFAULT_SUBTYPE_WQE: | ||
| 143 | /* WQE based event */ | ||
| 144 | pfault.wqe.wqe_index = | ||
| 145 | be16_to_cpu(pf_eqe->wqe.wqe_index); | ||
| 146 | pfault.wqe.packet_size = | ||
| 147 | be16_to_cpu(pf_eqe->wqe.packet_length); | ||
| 148 | mlx5_core_dbg(dev, | ||
| 149 | "PAGE_FAULT: qpn: 0x%06x, wqe_index: 0x%04x,\n", | ||
| 150 | qpn, pfault.wqe.wqe_index); | ||
| 151 | mlx5_core_dbg(dev, | ||
| 152 | "PAGE_FAULT: bytes_committed: 0x%06x\n", | ||
| 153 | pfault.bytes_committed); | ||
| 154 | break; | ||
| 155 | |||
| 156 | default: | ||
| 157 | mlx5_core_warn(dev, | ||
| 158 | "Unsupported page fault event sub-type: 0x%02hhx, QP %06x\n", | ||
| 159 | eqe->sub_type, qpn); | ||
| 160 | /* Unsupported page faults should still be resolved by the | ||
| 161 | * page fault handler | ||
| 162 | */ | ||
| 163 | } | ||
| 164 | |||
| 165 | if (qp->pfault_handler) { | ||
| 166 | qp->pfault_handler(qp, &pfault); | ||
| 167 | } else { | ||
| 168 | mlx5_core_err(dev, | ||
| 169 | "ODP event for QP %08x, without a fault handler in QP\n", | ||
| 170 | qpn); | ||
| 171 | /* Page fault will remain unresolved. QP will hang until it is | ||
| 172 | * destroyed | ||
| 173 | */ | ||
| 174 | } | ||
| 175 | |||
| 176 | mlx5_core_put_rsc(common); | ||
| 177 | } | ||
| 178 | #endif | ||
| 179 | |||
| 91 | int mlx5_core_create_qp(struct mlx5_core_dev *dev, | 180 | int mlx5_core_create_qp(struct mlx5_core_dev *dev, |
| 92 | struct mlx5_core_qp *qp, | 181 | struct mlx5_core_qp *qp, |
| 93 | struct mlx5_create_qp_mbox_in *in, | 182 | struct mlx5_create_qp_mbox_in *in, |
| @@ -322,3 +411,33 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn) | |||
| 322 | return err; | 411 | return err; |
| 323 | } | 412 | } |
| 324 | EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); | 413 | EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc); |
| 414 | |||
| 415 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 416 | int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, | ||
| 417 | u8 flags, int error) | ||
| 418 | { | ||
| 419 | struct mlx5_page_fault_resume_mbox_in in; | ||
| 420 | struct mlx5_page_fault_resume_mbox_out out; | ||
| 421 | int err; | ||
| 422 | |||
| 423 | memset(&in, 0, sizeof(in)); | ||
| 424 | memset(&out, 0, sizeof(out)); | ||
| 425 | in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_PAGE_FAULT_RESUME); | ||
| 426 | in.hdr.opmod = 0; | ||
| 427 | flags &= (MLX5_PAGE_FAULT_RESUME_REQUESTOR | | ||
| 428 | MLX5_PAGE_FAULT_RESUME_WRITE | | ||
| 429 | MLX5_PAGE_FAULT_RESUME_RDMA); | ||
| 430 | flags |= (error ? MLX5_PAGE_FAULT_RESUME_ERROR : 0); | ||
| 431 | in.flags_qpn = cpu_to_be32((qpn & MLX5_QPN_MASK) | | ||
| 432 | (flags << MLX5_QPN_BITS)); | ||
| 433 | err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out)); | ||
| 434 | if (err) | ||
| 435 | return err; | ||
| 436 | |||
| 437 | if (out.hdr.status) | ||
| 438 | err = mlx5_cmd_status_to_err(&out.hdr); | ||
| 439 | |||
| 440 | return err; | ||
| 441 | } | ||
| 442 | EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume); | ||
| 443 | #endif | ||
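
Both mlx5_eq_pagefault() and mlx5_core_page_fault_resume() above pack a QP number and a small flags field into one 32-bit word. A standalone sketch of that encoding and decoding, using the MLX5_QPN_BITS/MLX5_QPN_MASK and page-fault flag values defined later in this patch:

#include <stdint.h>
#include <stdio.h>

#define MLX5_QPN_BITS		24
#define MLX5_QPN_MASK		((1u << MLX5_QPN_BITS) - 1)

#define MLX5_PFAULT_REQUESTOR	(1u << 0)
#define MLX5_PFAULT_WRITE	(1u << 1)
#define MLX5_PFAULT_RDMA	(1u << 2)

int main(void)
{
	uint32_t qpn = 0x000123;	/* made-up QP number */
	uint32_t flags = MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE;

	/* encode (resume command) ... */
	uint32_t flags_qpn = (qpn & MLX5_QPN_MASK) | (flags << MLX5_QPN_BITS);

	/* ... and decode (event handler) */
	printf("qpn = 0x%06x, flags = 0x%02x\n",
	       flags_qpn & MLX5_QPN_MASK,
	       (flags_qpn >> MLX5_QPN_BITS) & 0xff);
	return 0;
}
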
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ea4f1c46f761..4e5bd813bb9a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h | |||
| @@ -120,6 +120,15 @@ enum { | |||
| 120 | }; | 120 | }; |
| 121 | 121 | ||
| 122 | enum { | 122 | enum { |
| 123 | MLX5_MKEY_INBOX_PG_ACCESS = 1 << 31 | ||
| 124 | }; | ||
| 125 | |||
| 126 | enum { | ||
| 127 | MLX5_PFAULT_SUBTYPE_WQE = 0, | ||
| 128 | MLX5_PFAULT_SUBTYPE_RDMA = 1, | ||
| 129 | }; | ||
| 130 | |||
| 131 | enum { | ||
| 123 | MLX5_PERM_LOCAL_READ = 1 << 2, | 132 | MLX5_PERM_LOCAL_READ = 1 << 2, |
| 124 | MLX5_PERM_LOCAL_WRITE = 1 << 3, | 133 | MLX5_PERM_LOCAL_WRITE = 1 << 3, |
| 125 | MLX5_PERM_REMOTE_READ = 1 << 4, | 134 | MLX5_PERM_REMOTE_READ = 1 << 4, |
| @@ -180,6 +189,19 @@ enum { | |||
| 180 | MLX5_MKEY_MASK_FREE = 1ull << 29, | 189 | MLX5_MKEY_MASK_FREE = 1ull << 29, |
| 181 | }; | 190 | }; |
| 182 | 191 | ||
| 192 | enum { | ||
| 193 | MLX5_UMR_TRANSLATION_OFFSET_EN = (1 << 4), | ||
| 194 | |||
| 195 | MLX5_UMR_CHECK_NOT_FREE = (1 << 5), | ||
| 196 | MLX5_UMR_CHECK_FREE = (2 << 5), | ||
| 197 | |||
| 198 | MLX5_UMR_INLINE = (1 << 7), | ||
| 199 | }; | ||
| 200 | |||
| 201 | #define MLX5_UMR_MTT_ALIGNMENT 0x40 | ||
| 202 | #define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) | ||
| 203 | #define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT | ||
| 204 | |||
| 183 | enum mlx5_event { | 205 | enum mlx5_event { |
| 184 | MLX5_EVENT_TYPE_COMP = 0x0, | 206 | MLX5_EVENT_TYPE_COMP = 0x0, |
| 185 | 207 | ||
| @@ -206,6 +228,8 @@ enum mlx5_event { | |||
| 206 | 228 | ||
| 207 | MLX5_EVENT_TYPE_CMD = 0x0a, | 229 | MLX5_EVENT_TYPE_CMD = 0x0a, |
| 208 | MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, | 230 | MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, |
| 231 | |||
| 232 | MLX5_EVENT_TYPE_PAGE_FAULT = 0xc, | ||
| 209 | }; | 233 | }; |
| 210 | 234 | ||
| 211 | enum { | 235 | enum { |
| @@ -225,6 +249,7 @@ enum { | |||
| 225 | MLX5_DEV_CAP_FLAG_APM = 1LL << 17, | 249 | MLX5_DEV_CAP_FLAG_APM = 1LL << 17, |
| 226 | MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18, | 250 | MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18, |
| 227 | MLX5_DEV_CAP_FLAG_BLOCK_MCAST = 1LL << 23, | 251 | MLX5_DEV_CAP_FLAG_BLOCK_MCAST = 1LL << 23, |
| 252 | MLX5_DEV_CAP_FLAG_ON_DMND_PG = 1LL << 24, | ||
| 228 | MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, | 253 | MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, |
| 229 | MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, | 254 | MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, |
| 230 | MLX5_DEV_CAP_FLAG_DCT = 1LL << 37, | 255 | MLX5_DEV_CAP_FLAG_DCT = 1LL << 37, |
| @@ -290,6 +315,8 @@ enum { | |||
| 290 | enum { | 315 | enum { |
| 291 | HCA_CAP_OPMOD_GET_MAX = 0, | 316 | HCA_CAP_OPMOD_GET_MAX = 0, |
| 292 | HCA_CAP_OPMOD_GET_CUR = 1, | 317 | HCA_CAP_OPMOD_GET_CUR = 1, |
| 318 | HCA_CAP_OPMOD_GET_ODP_MAX = 4, | ||
| 319 | HCA_CAP_OPMOD_GET_ODP_CUR = 5 | ||
| 293 | }; | 320 | }; |
| 294 | 321 | ||
| 295 | struct mlx5_inbox_hdr { | 322 | struct mlx5_inbox_hdr { |
| @@ -319,6 +346,23 @@ struct mlx5_cmd_query_adapter_mbox_out { | |||
| 319 | u8 vsd_psid[16]; | 346 | u8 vsd_psid[16]; |
| 320 | }; | 347 | }; |
| 321 | 348 | ||
| 349 | enum mlx5_odp_transport_cap_bits { | ||
| 350 | MLX5_ODP_SUPPORT_SEND = 1 << 31, | ||
| 351 | MLX5_ODP_SUPPORT_RECV = 1 << 30, | ||
| 352 | MLX5_ODP_SUPPORT_WRITE = 1 << 29, | ||
| 353 | MLX5_ODP_SUPPORT_READ = 1 << 28, | ||
| 354 | }; | ||
| 355 | |||
| 356 | struct mlx5_odp_caps { | ||
| 357 | char reserved[0x10]; | ||
| 358 | struct { | ||
| 359 | __be32 rc_odp_caps; | ||
| 360 | __be32 uc_odp_caps; | ||
| 361 | __be32 ud_odp_caps; | ||
| 362 | } per_transport_caps; | ||
| 363 | char reserved2[0xe4]; | ||
| 364 | }; | ||
| 365 | |||
| 322 | struct mlx5_cmd_init_hca_mbox_in { | 366 | struct mlx5_cmd_init_hca_mbox_in { |
| 323 | struct mlx5_inbox_hdr hdr; | 367 | struct mlx5_inbox_hdr hdr; |
| 324 | u8 rsvd0[2]; | 368 | u8 rsvd0[2]; |
| @@ -439,6 +483,27 @@ struct mlx5_eqe_page_req { | |||
| 439 | __be32 rsvd1[5]; | 483 | __be32 rsvd1[5]; |
| 440 | }; | 484 | }; |
| 441 | 485 | ||
| 486 | struct mlx5_eqe_page_fault { | ||
| 487 | __be32 bytes_committed; | ||
| 488 | union { | ||
| 489 | struct { | ||
| 490 | u16 reserved1; | ||
| 491 | __be16 wqe_index; | ||
| 492 | u16 reserved2; | ||
| 493 | __be16 packet_length; | ||
| 494 | u8 reserved3[12]; | ||
| 495 | } __packed wqe; | ||
| 496 | struct { | ||
| 497 | __be32 r_key; | ||
| 498 | u16 reserved1; | ||
| 499 | __be16 packet_length; | ||
| 500 | __be32 rdma_op_len; | ||
| 501 | __be64 rdma_va; | ||
| 502 | } __packed rdma; | ||
| 503 | } __packed; | ||
| 504 | __be32 flags_qpn; | ||
| 505 | } __packed; | ||
| 506 | |||
| 442 | union ev_data { | 507 | union ev_data { |
| 443 | __be32 raw[7]; | 508 | __be32 raw[7]; |
| 444 | struct mlx5_eqe_cmd cmd; | 509 | struct mlx5_eqe_cmd cmd; |
| @@ -450,6 +515,7 @@ union ev_data { | |||
| 450 | struct mlx5_eqe_congestion cong; | 515 | struct mlx5_eqe_congestion cong; |
| 451 | struct mlx5_eqe_stall_vl stall_vl; | 516 | struct mlx5_eqe_stall_vl stall_vl; |
| 452 | struct mlx5_eqe_page_req req_pages; | 517 | struct mlx5_eqe_page_req req_pages; |
| 518 | struct mlx5_eqe_page_fault page_fault; | ||
| 453 | } __packed; | 519 | } __packed; |
| 454 | 520 | ||
| 455 | struct mlx5_eqe { | 521 | struct mlx5_eqe { |
| @@ -776,6 +842,10 @@ struct mlx5_query_eq_mbox_out { | |||
| 776 | struct mlx5_eq_context ctx; | 842 | struct mlx5_eq_context ctx; |
| 777 | }; | 843 | }; |
| 778 | 844 | ||
| 845 | enum { | ||
| 846 | MLX5_MKEY_STATUS_FREE = 1 << 6, | ||
| 847 | }; | ||
| 848 | |||
| 779 | struct mlx5_mkey_seg { | 849 | struct mlx5_mkey_seg { |
| 780 | /* This is a two bit field occupying bits 31-30. | 850 | /* This is a two bit field occupying bits 31-30. |
| 781 | * bit 31 is always 0, | 851 | * bit 31 is always 0, |
| @@ -812,7 +882,7 @@ struct mlx5_query_special_ctxs_mbox_out { | |||
| 812 | struct mlx5_create_mkey_mbox_in { | 882 | struct mlx5_create_mkey_mbox_in { |
| 813 | struct mlx5_inbox_hdr hdr; | 883 | struct mlx5_inbox_hdr hdr; |
| 814 | __be32 input_mkey_index; | 884 | __be32 input_mkey_index; |
| 815 | u8 rsvd0[4]; | 885 | __be32 flags; |
| 816 | struct mlx5_mkey_seg seg; | 886 | struct mlx5_mkey_seg seg; |
| 817 | u8 rsvd1[16]; | 887 | u8 rsvd1[16]; |
| 818 | __be32 xlat_oct_act_size; | 888 | __be32 xlat_oct_act_size; |
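
The per-transport words in struct mlx5_odp_caps are big-endian on the wire (the fw.c hunk converts them with be32_to_cpu before printing); once converted, they are tested against the bits of enum mlx5_odp_transport_cap_bits. A standalone sketch with a made-up capability value:

#include <stdint.h>
#include <stdio.h>

#define MLX5_ODP_SUPPORT_SEND	(1u << 31)
#define MLX5_ODP_SUPPORT_RECV	(1u << 30)
#define MLX5_ODP_SUPPORT_WRITE	(1u << 29)
#define MLX5_ODP_SUPPORT_READ	(1u << 28)

int main(void)
{
	/* hypothetical RC caps, already converted from big endian */
	uint32_t rc_odp_caps = MLX5_ODP_SUPPORT_SEND |
			       MLX5_ODP_SUPPORT_WRITE |
			       MLX5_ODP_SUPPORT_READ;

	printf("RC ODP: send=%d recv=%d write=%d read=%d\n",
	       !!(rc_odp_caps & MLX5_ODP_SUPPORT_SEND),
	       !!(rc_odp_caps & MLX5_ODP_SUPPORT_RECV),
	       !!(rc_odp_caps & MLX5_ODP_SUPPORT_WRITE),
	       !!(rc_odp_caps & MLX5_ODP_SUPPORT_READ));
	return 0;
}
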
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b1bf41556b32..166d9315fe4b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h | |||
| @@ -113,6 +113,13 @@ enum { | |||
| 113 | MLX5_REG_HOST_ENDIANNESS = 0x7004, | 113 | MLX5_REG_HOST_ENDIANNESS = 0x7004, |
| 114 | }; | 114 | }; |
| 115 | 115 | ||
| 116 | enum mlx5_page_fault_resume_flags { | ||
| 117 | MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0, | ||
| 118 | MLX5_PAGE_FAULT_RESUME_WRITE = 1 << 1, | ||
| 119 | MLX5_PAGE_FAULT_RESUME_RDMA = 1 << 2, | ||
| 120 | MLX5_PAGE_FAULT_RESUME_ERROR = 1 << 7, | ||
| 121 | }; | ||
| 122 | |||
| 116 | enum dbg_rsc_type { | 123 | enum dbg_rsc_type { |
| 117 | MLX5_DBG_RSC_QP, | 124 | MLX5_DBG_RSC_QP, |
| 118 | MLX5_DBG_RSC_EQ, | 125 | MLX5_DBG_RSC_EQ, |
| @@ -467,7 +474,7 @@ struct mlx5_priv { | |||
| 467 | struct workqueue_struct *pg_wq; | 474 | struct workqueue_struct *pg_wq; |
| 468 | struct rb_root page_root; | 475 | struct rb_root page_root; |
| 469 | int fw_pages; | 476 | int fw_pages; |
| 470 | int reg_pages; | 477 | atomic_t reg_pages; |
| 471 | struct list_head free_list; | 478 | struct list_head free_list; |
| 472 | 479 | ||
| 473 | struct mlx5_core_health health; | 480 | struct mlx5_core_health health; |
| @@ -703,6 +710,9 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev); | |||
| 703 | void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); | 710 | void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); |
| 704 | void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); | 711 | void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); |
| 705 | void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); | 712 | void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); |
| 713 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 714 | void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe); | ||
| 715 | #endif | ||
| 706 | void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); | 716 | void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); |
| 707 | struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); | 717 | struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); |
| 708 | void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector); | 718 | void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector); |
| @@ -740,6 +750,8 @@ int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, | |||
| 740 | int npsvs, u32 *sig_index); | 750 | int npsvs, u32 *sig_index); |
| 741 | int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); | 751 | int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); |
| 742 | void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); | 752 | void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); |
| 753 | int mlx5_query_odp_caps(struct mlx5_core_dev *dev, | ||
| 754 | struct mlx5_odp_caps *odp_caps); | ||
| 743 | 755 | ||
| 744 | static inline u32 mlx5_mkey_to_idx(u32 mkey) | 756 | static inline u32 mlx5_mkey_to_idx(u32 mkey) |
| 745 | { | 757 | { |
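reg_pages becomes an atomic_t because, with on-demand paging, registered-page accounting is updated from the page-fault and invalidation paths as well as from the ordinary MR registration path, so a plain int no longer sits under a single serializing lock. A sketch of the resulting lock-free accounting, with illustrative helper names:

#include <linux/mlx5/driver.h>	/* struct mlx5_priv, atomic_t reg_pages */

/* Illustrative helpers: account pages against the device-wide counter. */
static void account_mr_pages(struct mlx5_priv *priv, int npages)
{
	atomic_add(npages, &priv->reg_pages);
}

static void unaccount_mr_pages(struct mlx5_priv *priv, int npages)
{
	atomic_sub(npages, &priv->reg_pages);
}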
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 3fa075daeb1d..61f7a342d1bf 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h | |||
| @@ -50,6 +50,9 @@ | |||
| 50 | #define MLX5_BSF_APPTAG_ESCAPE 0x1 | 50 | #define MLX5_BSF_APPTAG_ESCAPE 0x1 |
| 51 | #define MLX5_BSF_APPREF_ESCAPE 0x2 | 51 | #define MLX5_BSF_APPREF_ESCAPE 0x2 |
| 52 | 52 | ||
| 53 | #define MLX5_QPN_BITS 24 | ||
| 54 | #define MLX5_QPN_MASK ((1 << MLX5_QPN_BITS) - 1) | ||
| 55 | |||
| 53 | enum mlx5_qp_optpar { | 56 | enum mlx5_qp_optpar { |
| 54 | MLX5_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, | 57 | MLX5_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, |
| 55 | MLX5_QP_OPTPAR_RRE = 1 << 1, | 58 | MLX5_QP_OPTPAR_RRE = 1 << 1, |
| @@ -189,6 +192,14 @@ struct mlx5_wqe_ctrl_seg { | |||
| 189 | __be32 imm; | 192 | __be32 imm; |
| 190 | }; | 193 | }; |
| 191 | 194 | ||
| 195 | #define MLX5_WQE_CTRL_DS_MASK 0x3f | ||
| 196 | #define MLX5_WQE_CTRL_QPN_MASK 0xffffff00 | ||
| 197 | #define MLX5_WQE_CTRL_QPN_SHIFT 8 | ||
| 198 | #define MLX5_WQE_DS_UNITS 16 | ||
| 199 | #define MLX5_WQE_CTRL_OPCODE_MASK 0xff | ||
| 200 | #define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00 | ||
| 201 | #define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8 | ||
| 202 | |||
| 192 | struct mlx5_wqe_xrc_seg { | 203 | struct mlx5_wqe_xrc_seg { |
| 193 | __be32 xrc_srqn; | 204 | __be32 xrc_srqn; |
| 194 | u8 rsvd[12]; | 205 | u8 rsvd[12]; |
| @@ -292,6 +303,8 @@ struct mlx5_wqe_signature_seg { | |||
| 292 | u8 rsvd1[11]; | 303 | u8 rsvd1[11]; |
| 293 | }; | 304 | }; |
| 294 | 305 | ||
| 306 | #define MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK 0x3ff | ||
| 307 | |||
| 295 | struct mlx5_wqe_inline_seg { | 308 | struct mlx5_wqe_inline_seg { |
| 296 | __be32 byte_count; | 309 | __be32 byte_count; |
| 297 | }; | 310 | }; |
| @@ -360,9 +373,46 @@ struct mlx5_stride_block_ctrl_seg { | |||
| 360 | __be16 num_entries; | 373 | __be16 num_entries; |
| 361 | }; | 374 | }; |
| 362 | 375 | ||
| 376 | enum mlx5_pagefault_flags { | ||
| 377 | MLX5_PFAULT_REQUESTOR = 1 << 0, | ||
| 378 | MLX5_PFAULT_WRITE = 1 << 1, | ||
| 379 | MLX5_PFAULT_RDMA = 1 << 2, | ||
| 380 | }; | ||
| 381 | |||
| 382 | /* Contains the details of a pagefault. */ | ||
| 383 | struct mlx5_pagefault { | ||
| 384 | u32 bytes_committed; | ||
| 385 | u8 event_subtype; | ||
| 386 | enum mlx5_pagefault_flags flags; | ||
| 387 | union { | ||
| 388 | /* Initiator or send message responder pagefault details. */ | ||
| 389 | struct { | ||
| 390 | /* Received packet size, only valid for responders. */ | ||
| 391 | u32 packet_size; | ||
| 392 | /* | ||
| 393 | * WQE index. Refers to either the send queue or | ||
| 394 | * receive queue, according to event_subtype. | ||
| 395 | */ | ||
| 396 | u16 wqe_index; | ||
| 397 | } wqe; | ||
| 398 | /* RDMA responder pagefault details */ | ||
| 399 | struct { | ||
| 400 | u32 r_key; | ||
| 401 | /* | ||
| 402 | * Received packet size; the minimal range whose page | ||
| 403 | * fault must be resolved for forward progress. | ||
| 404 | */ | ||
| 405 | u32 packet_size; | ||
| 406 | u32 rdma_op_len; | ||
| 407 | u64 rdma_va; | ||
| 408 | } rdma; | ||
| 409 | }; | ||
| 410 | }; | ||
| 411 | |||
| 363 | struct mlx5_core_qp { | 412 | struct mlx5_core_qp { |
| 364 | struct mlx5_core_rsc_common common; /* must be first */ | 413 | struct mlx5_core_rsc_common common; /* must be first */ |
| 365 | void (*event) (struct mlx5_core_qp *, int); | 414 | void (*event) (struct mlx5_core_qp *, int); |
| 415 | void (*pfault_handler)(struct mlx5_core_qp *, struct mlx5_pagefault *); | ||
| 366 | int qpn; | 416 | int qpn; |
| 367 | struct mlx5_rsc_debug *dbg; | 417 | struct mlx5_rsc_debug *dbg; |
| 368 | int pid; | 418 | int pid; |
| @@ -530,6 +580,17 @@ static inline struct mlx5_core_mr *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u | |||
| 530 | return radix_tree_lookup(&dev->priv.mr_table.tree, key); | 580 | return radix_tree_lookup(&dev->priv.mr_table.tree, key); |
| 531 | } | 581 | } |
| 532 | 582 | ||
| 583 | struct mlx5_page_fault_resume_mbox_in { | ||
| 584 | struct mlx5_inbox_hdr hdr; | ||
| 585 | __be32 flags_qpn; | ||
| 586 | u8 reserved[4]; | ||
| 587 | }; | ||
| 588 | |||
| 589 | struct mlx5_page_fault_resume_mbox_out { | ||
| 590 | struct mlx5_outbox_hdr hdr; | ||
| 591 | u8 rsvd[8]; | ||
| 592 | }; | ||
| 593 | |||
| 533 | int mlx5_core_create_qp(struct mlx5_core_dev *dev, | 594 | int mlx5_core_create_qp(struct mlx5_core_dev *dev, |
| 534 | struct mlx5_core_qp *qp, | 595 | struct mlx5_core_qp *qp, |
| 535 | struct mlx5_create_qp_mbox_in *in, | 596 | struct mlx5_create_qp_mbox_in *in, |
| @@ -549,6 +610,10 @@ void mlx5_init_qp_table(struct mlx5_core_dev *dev); | |||
| 549 | void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev); | 610 | void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev); |
| 550 | int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); | 611 | int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); |
| 551 | void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); | 612 | void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); |
| 613 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 614 | int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, | ||
| 615 | u8 context, int error); | ||
| 616 | #endif | ||
| 552 | 617 | ||
| 553 | static inline const char *mlx5_qp_type_str(int type) | 618 | static inline const char *mlx5_qp_type_str(int type) |
| 554 | { | 619 | { |
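The new control-segment masks let a page-fault handler re-parse the WQE that triggered a fault. The sketch below assumes the first two big-endian dwords of mlx5_wqe_ctrl_seg are opmod_idx_opcode and qpn_ds (only the tail of the structure is visible in this hunk), so those field names and the parse_ctrl_seg() wrapper are illustrative.

#include <linux/mlx5/qp.h>

static void parse_ctrl_seg(const struct mlx5_wqe_ctrl_seg *ctrl)
{
	/* Assumed field names; only the tail of the struct is shown above. */
	u32 dw0 = be32_to_cpu(ctrl->opmod_idx_opcode);
	u32 dw1 = be32_to_cpu(ctrl->qpn_ds);
	u8  opcode    = dw0 & MLX5_WQE_CTRL_OPCODE_MASK;
	u16 wqe_index = (dw0 & MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
			MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
	u32 qpn       = (dw1 & MLX5_WQE_CTRL_QPN_MASK) >> MLX5_WQE_CTRL_QPN_SHIFT;
	u32 ds        = dw1 & MLX5_WQE_CTRL_DS_MASK;	/* WQE size in 16-byte units */

	pr_debug("opcode 0x%x wqe_index %u qpn 0x%x wqe size %u bytes\n",
		 opcode, wqe_index, qpn, ds * MLX5_WQE_DS_UNITS);
}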
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index a2bf41e0bde9..2d83cfd7e6ce 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h | |||
| @@ -38,11 +38,12 @@ | |||
| 38 | #include <linux/workqueue.h> | 38 | #include <linux/workqueue.h> |
| 39 | 39 | ||
| 40 | struct ib_ucontext; | 40 | struct ib_ucontext; |
| 41 | struct ib_umem_odp; | ||
| 41 | 42 | ||
| 42 | struct ib_umem { | 43 | struct ib_umem { |
| 43 | struct ib_ucontext *context; | 44 | struct ib_ucontext *context; |
| 44 | size_t length; | 45 | size_t length; |
| 45 | int offset; | 46 | unsigned long address; |
| 46 | int page_size; | 47 | int page_size; |
| 47 | int writable; | 48 | int writable; |
| 48 | int hugetlb; | 49 | int hugetlb; |
| @@ -50,17 +51,43 @@ struct ib_umem { | |||
| 50 | struct pid *pid; | 51 | struct pid *pid; |
| 51 | struct mm_struct *mm; | 52 | struct mm_struct *mm; |
| 52 | unsigned long diff; | 53 | unsigned long diff; |
| 54 | struct ib_umem_odp *odp_data; | ||
| 53 | struct sg_table sg_head; | 55 | struct sg_table sg_head; |
| 54 | int nmap; | 56 | int nmap; |
| 55 | int npages; | 57 | int npages; |
| 56 | }; | 58 | }; |
| 57 | 59 | ||
| 60 | /* Returns the offset of the umem start relative to the first page. */ | ||
| 61 | static inline int ib_umem_offset(struct ib_umem *umem) | ||
| 62 | { | ||
| 63 | return umem->address & ((unsigned long)umem->page_size - 1); | ||
| 64 | } | ||
| 65 | |||
| 66 | /* Returns the address of the first page of an ODP umem. */ | ||
| 67 | static inline unsigned long ib_umem_start(struct ib_umem *umem) | ||
| 68 | { | ||
| 69 | return umem->address - ib_umem_offset(umem); | ||
| 70 | } | ||
| 71 | |||
| 72 | /* Returns the address of the page after the last one of an ODP umem. */ | ||
| 73 | static inline unsigned long ib_umem_end(struct ib_umem *umem) | ||
| 74 | { | ||
| 75 | return PAGE_ALIGN(umem->address + umem->length); | ||
| 76 | } | ||
| 77 | |||
| 78 | static inline size_t ib_umem_num_pages(struct ib_umem *umem) | ||
| 79 | { | ||
| 80 | return (ib_umem_end(umem) - ib_umem_start(umem)) >> PAGE_SHIFT; | ||
| 81 | } | ||
| 82 | |||
| 58 | #ifdef CONFIG_INFINIBAND_USER_MEM | 83 | #ifdef CONFIG_INFINIBAND_USER_MEM |
| 59 | 84 | ||
| 60 | struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, | 85 | struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, |
| 61 | size_t size, int access, int dmasync); | 86 | size_t size, int access, int dmasync); |
| 62 | void ib_umem_release(struct ib_umem *umem); | 87 | void ib_umem_release(struct ib_umem *umem); |
| 63 | int ib_umem_page_count(struct ib_umem *umem); | 88 | int ib_umem_page_count(struct ib_umem *umem); |
| 89 | int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, | ||
| 90 | size_t length); | ||
| 64 | 91 | ||
| 65 | #else /* CONFIG_INFINIBAND_USER_MEM */ | 92 | #else /* CONFIG_INFINIBAND_USER_MEM */ |
| 66 | 93 | ||
| @@ -73,7 +100,10 @@ static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, | |||
| 73 | } | 100 | } |
| 74 | static inline void ib_umem_release(struct ib_umem *umem) { } | 101 | static inline void ib_umem_release(struct ib_umem *umem) { } |
| 75 | static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } | 102 | static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } |
| 76 | 103 | static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, | |
| 104 | size_t length) { | ||
| 105 | return -EINVAL; | ||
| 106 | } | ||
| 77 | #endif /* CONFIG_INFINIBAND_USER_MEM */ | 107 | #endif /* CONFIG_INFINIBAND_USER_MEM */ |
| 78 | 108 | ||
| 79 | #endif /* IB_UMEM_H */ | 109 | #endif /* IB_UMEM_H */ |
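As a worked example of the new helpers, assume 4 KiB pages and a umem registered at address 0x10234 with length 0x2000: ib_umem_offset() is 0x234, ib_umem_start() is 0x10000, ib_umem_end() is PAGE_ALIGN(0x12234) = 0x13000, and ib_umem_num_pages() is therefore (0x13000 - 0x10000) >> 12 = 3 pages.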
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h new file mode 100644 index 000000000000..3da0b167041b --- /dev/null +++ b/include/rdma/ib_umem_odp.h | |||
| @@ -0,0 +1,160 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (c) 2014 Mellanox Technologies. All rights reserved. | ||
| 3 | * | ||
| 4 | * This software is available to you under a choice of one of two | ||
| 5 | * licenses. You may choose to be licensed under the terms of the GNU | ||
| 6 | * General Public License (GPL) Version 2, available from the file | ||
| 7 | * COPYING in the main directory of this source tree, or the | ||
| 8 | * OpenIB.org BSD license below: | ||
| 9 | * | ||
| 10 | * Redistribution and use in source and binary forms, with or | ||
| 11 | * without modification, are permitted provided that the following | ||
| 12 | * conditions are met: | ||
| 13 | * | ||
| 14 | * - Redistributions of source code must retain the above | ||
| 15 | * copyright notice, this list of conditions and the following | ||
| 16 | * disclaimer. | ||
| 17 | * | ||
| 18 | * - Redistributions in binary form must reproduce the above | ||
| 19 | * copyright notice, this list of conditions and the following | ||
| 20 | * disclaimer in the documentation and/or other materials | ||
| 21 | * provided with the distribution. | ||
| 22 | * | ||
| 23 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
| 24 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
| 25 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
| 26 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
| 27 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
| 28 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
| 29 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| 30 | * SOFTWARE. | ||
| 31 | */ | ||
| 32 | |||
| 33 | #ifndef IB_UMEM_ODP_H | ||
| 34 | #define IB_UMEM_ODP_H | ||
| 35 | |||
| 36 | #include <rdma/ib_umem.h> | ||
| 37 | #include <rdma/ib_verbs.h> | ||
| 38 | #include <linux/interval_tree.h> | ||
| 39 | |||
| 40 | struct umem_odp_node { | ||
| 41 | u64 __subtree_last; | ||
| 42 | struct rb_node rb; | ||
| 43 | }; | ||
| 44 | |||
| 45 | struct ib_umem_odp { | ||
| 46 | /* | ||
| 47 | * An array of the pages included in the on-demand paging umem. | ||
| 48 | * Indices of pages that are currently not mapped into the device will | ||
| 49 | * contain NULL. | ||
| 50 | */ | ||
| 51 | struct page **page_list; | ||
| 52 | /* | ||
| 53 | * An array of the same size as page_list, with DMA addresses mapped | ||
| 54 | * for pages the pages in page_list. The lower two bits designate | ||
| 55 | * access permissions. See ODP_READ_ALLOWED_BIT and | ||
| 56 | * ODP_WRITE_ALLOWED_BIT. | ||
| 57 | */ | ||
| 58 | dma_addr_t *dma_list; | ||
| 59 | /* | ||
| 60 | * The umem_mutex protects the page_list and dma_list fields of an ODP | ||
| 61 | * umem, allowing only a single thread to map/unmap pages. The mutex | ||
| 62 | * also protects access to the mmu notifier counters. | ||
| 63 | */ | ||
| 64 | struct mutex umem_mutex; | ||
| 65 | void *private; /* for the HW driver to use. */ | ||
| 66 | |||
| 67 | /* When false, use the notifier counter in the ucontext struct. */ | ||
| 68 | bool mn_counters_active; | ||
| 69 | int notifiers_seq; | ||
| 70 | int notifiers_count; | ||
| 71 | |||
| 72 | /* A linked list of umems that don't have private mmu notifier | ||
| 73 | * counters yet. */ | ||
| 74 | struct list_head no_private_counters; | ||
| 75 | struct ib_umem *umem; | ||
| 76 | |||
| 77 | /* Tree tracking */ | ||
| 78 | struct umem_odp_node interval_tree; | ||
| 79 | |||
| 80 | struct completion notifier_completion; | ||
| 81 | int dying; | ||
| 82 | }; | ||
| 83 | |||
| 84 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 85 | |||
| 86 | int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem); | ||
| 87 | |||
| 88 | void ib_umem_odp_release(struct ib_umem *umem); | ||
| 89 | |||
| 90 | /* | ||
| 91 | * The lower 2 bits of the DMA address signal the R/W permissions for | ||
| 92 | * the entry. To upgrade the permissions, provide the appropriate | ||
| 93 | * bitmask to the map_dma_pages function. | ||
| 94 | * | ||
| 95 | * Be aware that upgrading the permissions of a mapped address | ||
| 96 | * may change the DMA address of the page. | ||
| 97 | */ | ||
| 98 | #define ODP_READ_ALLOWED_BIT (1<<0ULL) | ||
| 99 | #define ODP_WRITE_ALLOWED_BIT (1<<1ULL) | ||
| 100 | |||
| 101 | #define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) | ||
| 102 | |||
| 103 | int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, | ||
| 104 | u64 access_mask, unsigned long current_seq); | ||
| 105 | |||
| 106 | void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, | ||
| 107 | u64 bound); | ||
| 108 | |||
| 109 | void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root); | ||
| 110 | void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root); | ||
| 111 | typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, | ||
| 112 | void *cookie); | ||
| 113 | /* | ||
| 114 | * Call the callback on each ib_umem in the range. Returns the logical or of | ||
| 115 | * the return values of the functions called. | ||
| 116 | */ | ||
| 117 | int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end, | ||
| 118 | umem_call_back cb, void *cookie); | ||
| 119 | |||
| 120 | struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root, | ||
| 121 | u64 start, u64 last); | ||
| 122 | struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node, | ||
| 123 | u64 start, u64 last); | ||
| 124 | |||
| 125 | static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, | ||
| 126 | unsigned long mmu_seq) | ||
| 127 | { | ||
| 128 | /* | ||
| 129 | * This code is strongly based on the KVM code from | ||
| 130 | * mmu_notifier_retry. Should be called with | ||
| 131 | * the relevant locks taken (item->odp_data->umem_mutex | ||
| 132 | * and the ucontext umem_mutex semaphore locked for read). | ||
| 133 | */ | ||
| 134 | |||
| 135 | /* Do not allow page faults while the new ib_umem hasn't seen a state | ||
| 136 | * with zero notifiers yet, and doesn't have its own valid set of | ||
| 137 | * private counters. */ | ||
| 138 | if (!item->odp_data->mn_counters_active) | ||
| 139 | return 1; | ||
| 140 | |||
| 141 | if (unlikely(item->odp_data->notifiers_count)) | ||
| 142 | return 1; | ||
| 143 | if (item->odp_data->notifiers_seq != mmu_seq) | ||
| 144 | return 1; | ||
| 145 | return 0; | ||
| 146 | } | ||
| 147 | |||
| 148 | #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ | ||
| 149 | |||
| 150 | static inline int ib_umem_odp_get(struct ib_ucontext *context, | ||
| 151 | struct ib_umem *umem) | ||
| 152 | { | ||
| 153 | return -EINVAL; | ||
| 154 | } | ||
| 155 | |||
| 156 | static inline void ib_umem_odp_release(struct ib_umem *umem) {} | ||
| 157 | |||
| 158 | #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ | ||
| 159 | |||
| 160 | #endif /* IB_UMEM_ODP_H */ | ||
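ib_umem_mmu_notifier_retry() is meant to be used in the same fault-and-retry shape as the KVM helper it is modelled on: capture the sequence number, map the pages, then re-check under the umem mutex before handing anything to the hardware. The sketch below shows that shape only; the handle_fault() wrapper is illustrative, the negative-return convention of ib_umem_odp_map_dma_pages() is assumed, and the locking is reduced to umem_mutex for brevity (a real caller also holds the ucontext umem_rwsem for read, as the comment above requires).

#include <rdma/ib_umem_odp.h>

static int handle_fault(struct ib_umem *umem, u64 start, u64 bcnt, u64 access)
{
	unsigned long current_seq = umem->odp_data->notifiers_seq;
	int npages;

	/* Fault in and DMA-map the requested range. */
	npages = ib_umem_odp_map_dma_pages(umem, start, bcnt, access, current_seq);
	if (npages < 0)
		return npages;

	mutex_lock(&umem->odp_data->umem_mutex);
	if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
		/* An invalidation raced with the mapping; the caller must retry. */
		mutex_unlock(&umem->odp_data->umem_mutex);
		return -EAGAIN;
	}
	/*
	 * Safe to hand the pages to the device here, e.g.
	 *   dma_addr_t dma = umem->odp_data->dma_list[i] & ODP_DMA_ADDR_MASK;
	 * with ODP_READ_ALLOWED_BIT/ODP_WRITE_ALLOWED_BIT giving the
	 * permitted access for each entry.
	 */
	mutex_unlock(&umem->odp_data->umem_mutex);
	return npages;
}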
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 470a011d6fa4..0d74f1de99aa 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h | |||
| @@ -51,6 +51,7 @@ | |||
| 51 | #include <uapi/linux/if_ether.h> | 51 | #include <uapi/linux/if_ether.h> |
| 52 | 52 | ||
| 53 | #include <linux/atomic.h> | 53 | #include <linux/atomic.h> |
| 54 | #include <linux/mmu_notifier.h> | ||
| 54 | #include <asm/uaccess.h> | 55 | #include <asm/uaccess.h> |
| 55 | 56 | ||
| 56 | extern struct workqueue_struct *ib_wq; | 57 | extern struct workqueue_struct *ib_wq; |
| @@ -123,7 +124,8 @@ enum ib_device_cap_flags { | |||
| 123 | IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23), | 124 | IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23), |
| 124 | IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24), | 125 | IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24), |
| 125 | IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), | 126 | IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), |
| 126 | IB_DEVICE_SIGNATURE_HANDOVER = (1<<30) | 127 | IB_DEVICE_SIGNATURE_HANDOVER = (1<<30), |
| 128 | IB_DEVICE_ON_DEMAND_PAGING = (1<<31), | ||
| 127 | }; | 129 | }; |
| 128 | 130 | ||
| 129 | enum ib_signature_prot_cap { | 131 | enum ib_signature_prot_cap { |
| @@ -143,6 +145,27 @@ enum ib_atomic_cap { | |||
| 143 | IB_ATOMIC_GLOB | 145 | IB_ATOMIC_GLOB |
| 144 | }; | 146 | }; |
| 145 | 147 | ||
| 148 | enum ib_odp_general_cap_bits { | ||
| 149 | IB_ODP_SUPPORT = 1 << 0, | ||
| 150 | }; | ||
| 151 | |||
| 152 | enum ib_odp_transport_cap_bits { | ||
| 153 | IB_ODP_SUPPORT_SEND = 1 << 0, | ||
| 154 | IB_ODP_SUPPORT_RECV = 1 << 1, | ||
| 155 | IB_ODP_SUPPORT_WRITE = 1 << 2, | ||
| 156 | IB_ODP_SUPPORT_READ = 1 << 3, | ||
| 157 | IB_ODP_SUPPORT_ATOMIC = 1 << 4, | ||
| 158 | }; | ||
| 159 | |||
| 160 | struct ib_odp_caps { | ||
| 161 | uint64_t general_caps; | ||
| 162 | struct { | ||
| 163 | uint32_t rc_odp_caps; | ||
| 164 | uint32_t uc_odp_caps; | ||
| 165 | uint32_t ud_odp_caps; | ||
| 166 | } per_transport_caps; | ||
| 167 | }; | ||
| 168 | |||
| 146 | struct ib_device_attr { | 169 | struct ib_device_attr { |
| 147 | u64 fw_ver; | 170 | u64 fw_ver; |
| 148 | __be64 sys_image_guid; | 171 | __be64 sys_image_guid; |
| @@ -186,6 +209,7 @@ struct ib_device_attr { | |||
| 186 | u8 local_ca_ack_delay; | 209 | u8 local_ca_ack_delay; |
| 187 | int sig_prot_cap; | 210 | int sig_prot_cap; |
| 188 | int sig_guard_cap; | 211 | int sig_guard_cap; |
| 212 | struct ib_odp_caps odp_caps; | ||
| 189 | }; | 213 | }; |
| 190 | 214 | ||
| 191 | enum ib_mtu { | 215 | enum ib_mtu { |
| @@ -1073,7 +1097,8 @@ enum ib_access_flags { | |||
| 1073 | IB_ACCESS_REMOTE_READ = (1<<2), | 1097 | IB_ACCESS_REMOTE_READ = (1<<2), |
| 1074 | IB_ACCESS_REMOTE_ATOMIC = (1<<3), | 1098 | IB_ACCESS_REMOTE_ATOMIC = (1<<3), |
| 1075 | IB_ACCESS_MW_BIND = (1<<4), | 1099 | IB_ACCESS_MW_BIND = (1<<4), |
| 1076 | IB_ZERO_BASED = (1<<5) | 1100 | IB_ZERO_BASED = (1<<5), |
| 1101 | IB_ACCESS_ON_DEMAND = (1<<6), | ||
| 1077 | }; | 1102 | }; |
| 1078 | 1103 | ||
| 1079 | struct ib_phys_buf { | 1104 | struct ib_phys_buf { |
| @@ -1115,6 +1140,8 @@ struct ib_fmr_attr { | |||
| 1115 | u8 page_shift; | 1140 | u8 page_shift; |
| 1116 | }; | 1141 | }; |
| 1117 | 1142 | ||
| 1143 | struct ib_umem; | ||
| 1144 | |||
| 1118 | struct ib_ucontext { | 1145 | struct ib_ucontext { |
| 1119 | struct ib_device *device; | 1146 | struct ib_device *device; |
| 1120 | struct list_head pd_list; | 1147 | struct list_head pd_list; |
| @@ -1127,6 +1154,24 @@ struct ib_ucontext { | |||
| 1127 | struct list_head xrcd_list; | 1154 | struct list_head xrcd_list; |
| 1128 | struct list_head rule_list; | 1155 | struct list_head rule_list; |
| 1129 | int closing; | 1156 | int closing; |
| 1157 | |||
| 1158 | struct pid *tgid; | ||
| 1159 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | ||
| 1160 | struct rb_root umem_tree; | ||
| 1161 | /* | ||
| 1162 | * Protects the umem_tree, as well as odp_mrs_count and | ||
| 1163 | * mmu notifiers registration. | ||
| 1164 | */ | ||
| 1165 | struct rw_semaphore umem_rwsem; | ||
| 1166 | void (*invalidate_range)(struct ib_umem *umem, | ||
| 1167 | unsigned long start, unsigned long end); | ||
| 1168 | |||
| 1169 | struct mmu_notifier mn; | ||
| 1170 | atomic_t notifier_count; | ||
| 1171 | /* A list of umems that don't have private mmu notifier counters yet. */ | ||
| 1172 | struct list_head no_private_counters; | ||
| 1173 | int odp_mrs_count; | ||
| 1174 | #endif | ||
| 1130 | }; | 1175 | }; |
| 1131 | 1176 | ||
| 1132 | struct ib_uobject { | 1177 | struct ib_uobject { |
| @@ -1662,7 +1707,10 @@ static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t | |||
| 1662 | 1707 | ||
| 1663 | static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) | 1708 | static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) |
| 1664 | { | 1709 | { |
| 1665 | return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; | 1710 | size_t copy_sz; |
| 1711 | |||
| 1712 | copy_sz = min_t(size_t, len, udata->outlen); | ||
| 1713 | return copy_to_user(udata->outbuf, src, copy_sz) ? -EFAULT : 0; | ||
| 1666 | } | 1714 | } |
| 1667 | 1715 | ||
| 1668 | /** | 1716 | /** |
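The ib_copy_to_udata() change clamps the copy to the caller-supplied buffer length, which is what lets a response structure grow (as the extended query-device response does in ib_user_verbs.h below) without overrunning older userspace that only allocated the base layout. A sketch of a handler relying on this, with an illustrative function name:

#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>

static int query_device_example(struct ib_udata *udata,
				struct ib_uverbs_ex_query_device_resp *resp)
{
	/* Copies min(sizeof(*resp), udata->outlen) bytes since this change. */
	return ib_copy_to_udata(udata, resp, sizeof(*resp));
}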
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 26daf55ff76e..4275b961bf60 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h | |||
| @@ -90,8 +90,9 @@ enum { | |||
| 90 | }; | 90 | }; |
| 91 | 91 | ||
| 92 | enum { | 92 | enum { |
| 93 | IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE, | ||
| 93 | IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, | 94 | IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, |
| 94 | IB_USER_VERBS_EX_CMD_DESTROY_FLOW | 95 | IB_USER_VERBS_EX_CMD_DESTROY_FLOW, |
| 95 | }; | 96 | }; |
| 96 | 97 | ||
| 97 | /* | 98 | /* |
| @@ -201,6 +202,32 @@ struct ib_uverbs_query_device_resp { | |||
| 201 | __u8 reserved[4]; | 202 | __u8 reserved[4]; |
| 202 | }; | 203 | }; |
| 203 | 204 | ||
| 205 | enum { | ||
| 206 | IB_USER_VERBS_EX_QUERY_DEVICE_ODP = 1ULL << 0, | ||
| 207 | }; | ||
| 208 | |||
| 209 | struct ib_uverbs_ex_query_device { | ||
| 210 | __u32 comp_mask; | ||
| 211 | __u32 reserved; | ||
| 212 | }; | ||
| 213 | |||
| 214 | struct ib_uverbs_odp_caps { | ||
| 215 | __u64 general_caps; | ||
| 216 | struct { | ||
| 217 | __u32 rc_odp_caps; | ||
| 218 | __u32 uc_odp_caps; | ||
| 219 | __u32 ud_odp_caps; | ||
| 220 | } per_transport_caps; | ||
| 221 | __u32 reserved; | ||
| 222 | }; | ||
| 223 | |||
| 224 | struct ib_uverbs_ex_query_device_resp { | ||
| 225 | struct ib_uverbs_query_device_resp base; | ||
| 226 | __u32 comp_mask; | ||
| 227 | __u32 reserved; | ||
| 228 | struct ib_uverbs_odp_caps odp_caps; | ||
| 229 | }; | ||
| 230 | |||
| 204 | struct ib_uverbs_query_port { | 231 | struct ib_uverbs_query_port { |
| 205 | __u64 response; | 232 | __u64 response; |
| 206 | __u8 port_num; | 233 | __u8 port_num; |
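On the userspace side, a consumer of the extended query-device response would first test the ODP bit in comp_mask before trusting odp_caps. The sketch below is illustrative only: the IB_ODP_* bit names are taken from the kernel header earlier in this patch, and a real userspace library would carry its own copies of these values.

#include <rdma/ib_verbs.h>	/* IB_ODP_* bit definitions (kernel side) */
#include <rdma/ib_user_verbs.h>	/* extended query-device response layout */

static int supports_rc_odp_write(const struct ib_uverbs_ex_query_device_resp *resp)
{
	if (!(resp->comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP))
		return 0;	/* kernel did not report ODP capabilities */
	if (!(resp->odp_caps.general_caps & IB_ODP_SUPPORT))
		return 0;
	return !!(resp->odp_caps.per_transport_caps.rc_odp_caps &
		  IB_ODP_SUPPORT_WRITE);
}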
