aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
author Haggai Eran <haggaie@mellanox.com> 2014-12-11 10:04:18 -0500
committer Roland Dreier <roland@purestorage.com> 2014-12-15 21:13:36 -0500
commit 882214e2b12860bff1ccff15a3ec2bbb29d58c02 (patch)
tree a3609ca71cec22f0c80b4f1b3d5bebf8024051bb /drivers
parent 8ada2c1c0c1d75a60723cd2ca7d49c594a146af6 (diff)
IB/core: Implement support for MMU notifiers regarding on demand paging regions
* Add an interval tree implementation for ODP umems. Create an interval tree for each ucontext (including a count of the number of ODP MRs in this context, semaphore, etc.), and register ODP umems in the interval tree.
* Add MMU notifiers handling functions, using the interval tree to notify only the relevant umems and underlying MRs.
* Register to receive MMU notifier events from the MM subsystem upon ODP MR registration (and unregister accordingly).
* Add a completion object to synchronize the destruction of ODP umems.
* Add mechanism to abort page faults when there's a concurrent invalidation.

The way we synchronize between concurrent invalidations and page faults is by keeping a counter of currently running invalidations, and a sequence number that is incremented whenever an invalidation is caught. The page fault code checks the counter and also verifies that the sequence number hasn't progressed before it updates the umem's page tables. This is similar to what the kvm module does.

In order to prevent the case where we register a umem in the middle of an ongoing notifier, we also keep a per ucontext counter of the total number of active mmu notifiers. We only enable new umems when all the running notifiers complete.

Signed-off-by: Sagi Grimberg <sagig@mellanox.com>
Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Signed-off-by: Haggai Eran <haggaie@mellanox.com>
Signed-off-by: Yuval Dagan <yuvalda@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/infiniband/Kconfig1
-rw-r--r--drivers/infiniband/core/Makefile2
-rw-r--r--drivers/infiniband/core/umem.c2
-rw-r--r--drivers/infiniband/core/umem_odp.c379
-rw-r--r--drivers/infiniband/core/umem_rbtree.c94
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c17
6 files changed, 483 insertions, 12 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 089a2c2af329..b899531498eb 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -41,6 +41,7 @@ config INFINIBAND_USER_MEM
41config INFINIBAND_ON_DEMAND_PAGING 41config INFINIBAND_ON_DEMAND_PAGING
42 bool "InfiniBand on-demand paging support" 42 bool "InfiniBand on-demand paging support"
43 depends on INFINIBAND_USER_MEM 43 depends on INFINIBAND_USER_MEM
44 select MMU_NOTIFIER
44 default y 45 default y
45 ---help--- 46 ---help---
46 On demand paging support for the InfiniBand subsystem. 47 On demand paging support for the InfiniBand subsystem.
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index c58f7913c560..acf736764445 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
11ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ 11ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
12 device.o fmr_pool.o cache.o netlink.o 12 device.o fmr_pool.o cache.o netlink.o
13ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o 13ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
14ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o 14ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
15 15
16ib_mad-y := mad.o smi.o agent.o mad_rmpp.o 16ib_mad-y := mad.o smi.o agent.o mad_rmpp.o
17 17
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 5baceb79f21b..aec7a6aa2951 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -72,7 +72,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
72 * ib_umem_get - Pin and DMA map userspace memory. 72 * ib_umem_get - Pin and DMA map userspace memory.
73 * 73 *
74 * If access flags indicate ODP memory, avoid pinning. Instead, stores 74 * If access flags indicate ODP memory, avoid pinning. Instead, stores
75 * the mm for future page fault handling. 75 * the mm for future page fault handling in conjunction with MMU notifiers.
76 * 76 *
77 * @context: userspace context to pin memory for 77 * @context: userspace context to pin memory for
78 * @addr: userspace virtual address to start at 78 * @addr: userspace virtual address to start at
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index f889e8d793bd..6095872549e7 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -41,26 +41,235 @@
41#include <rdma/ib_umem.h> 41#include <rdma/ib_umem.h>
42#include <rdma/ib_umem_odp.h> 42#include <rdma/ib_umem_odp.h>
43 43
44static void ib_umem_notifier_start_account(struct ib_umem *item)
45{
46 mutex_lock(&item->odp_data->umem_mutex);
47
48 /* Only update private counters for this umem if it has them.
49 * Otherwise skip it. All page faults will be delayed for this umem. */
50 if (item->odp_data->mn_counters_active) {
51 int notifiers_count = item->odp_data->notifiers_count++;
52
53 if (notifiers_count == 0)
54 /* Initialize the completion object for waiting on
55 * notifiers. Since notifier_count is zero, no one
56 * should be waiting right now. */
57 reinit_completion(&item->odp_data->notifier_completion);
58 }
59 mutex_unlock(&item->odp_data->umem_mutex);
60}
61
62static void ib_umem_notifier_end_account(struct ib_umem *item)
63{
64 mutex_lock(&item->odp_data->umem_mutex);
65
66 /* Only update private counters for this umem if it has them.
67 * Otherwise skip it. All page faults will be delayed for this umem. */
68 if (item->odp_data->mn_counters_active) {
69 /*
70 * This sequence increase will notify the QP page fault that
71 * the page that is going to be mapped in the spte could have
72 * been freed.
73 */
74 ++item->odp_data->notifiers_seq;
75 if (--item->odp_data->notifiers_count == 0)
76 complete_all(&item->odp_data->notifier_completion);
77 }
78 mutex_unlock(&item->odp_data->umem_mutex);
79}
80
81/* Account for a new mmu notifier in an ib_ucontext. */
82static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
83{
84 atomic_inc(&context->notifier_count);
85}
86
87/* Account for a terminating mmu notifier in an ib_ucontext.
88 *
89 * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
90 * the function takes the semaphore itself. */
91static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
92{
93 int zero_notifiers = atomic_dec_and_test(&context->notifier_count);
94
95 if (zero_notifiers &&
96 !list_empty(&context->no_private_counters)) {
97 /* No currently running mmu notifiers. Now is the chance to
98 * add private accounting to all previously added umems. */
99 struct ib_umem_odp *odp_data, *next;
100
101 /* Prevent concurrent mmu notifiers from working on the
102 * no_private_counters list. */
103 down_write(&context->umem_rwsem);
104
105 /* Read the notifier_count again, with the umem_rwsem
106 * semaphore taken for write. */
107 if (!atomic_read(&context->notifier_count)) {
108 list_for_each_entry_safe(odp_data, next,
109 &context->no_private_counters,
110 no_private_counters) {
111 mutex_lock(&odp_data->umem_mutex);
112 odp_data->mn_counters_active = true;
113 list_del(&odp_data->no_private_counters);
114 complete_all(&odp_data->notifier_completion);
115 mutex_unlock(&odp_data->umem_mutex);
116 }
117 }
118
119 up_write(&context->umem_rwsem);
120 }
121}
122
123static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
124 u64 end, void *cookie) {
125 /*
126 * Increase the number of notifiers running, to
127 * prevent any further fault handling on this MR.
128 */
129 ib_umem_notifier_start_account(item);
130 item->odp_data->dying = 1;
131 /* Make sure that the fact the umem is dying is out before we release
132 * all pending page faults. */
133 smp_wmb();
134 complete_all(&item->odp_data->notifier_completion);
135 item->context->invalidate_range(item, ib_umem_start(item),
136 ib_umem_end(item));
137 return 0;
138}
139
140static void ib_umem_notifier_release(struct mmu_notifier *mn,
141 struct mm_struct *mm)
142{
143 struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
144
145 if (!context->invalidate_range)
146 return;
147
148 ib_ucontext_notifier_start_account(context);
149 down_read(&context->umem_rwsem);
150 rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
151 ULLONG_MAX,
152 ib_umem_notifier_release_trampoline,
153 NULL);
154 up_read(&context->umem_rwsem);
155}
156
157static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
158 u64 end, void *cookie)
159{
160 ib_umem_notifier_start_account(item);
161 item->context->invalidate_range(item, start, start + PAGE_SIZE);
162 ib_umem_notifier_end_account(item);
163 return 0;
164}
165
166static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
167 struct mm_struct *mm,
168 unsigned long address)
169{
170 struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
171
172 if (!context->invalidate_range)
173 return;
174
175 ib_ucontext_notifier_start_account(context);
176 down_read(&context->umem_rwsem);
177 rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
178 address + PAGE_SIZE,
179 invalidate_page_trampoline, NULL);
180 up_read(&context->umem_rwsem);
181 ib_ucontext_notifier_end_account(context);
182}
183
184static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
185 u64 end, void *cookie)
186{
187 ib_umem_notifier_start_account(item);
188 item->context->invalidate_range(item, start, end);
189 return 0;
190}
191
192static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
193 struct mm_struct *mm,
194 unsigned long start,
195 unsigned long end)
196{
197 struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
198
199 if (!context->invalidate_range)
200 return;
201
202 ib_ucontext_notifier_start_account(context);
203 down_read(&context->umem_rwsem);
204 rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
205 end,
206 invalidate_range_start_trampoline, NULL);
207 up_read(&context->umem_rwsem);
208}
209
210static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
211 u64 end, void *cookie)
212{
213 ib_umem_notifier_end_account(item);
214 return 0;
215}
216
217static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
218 struct mm_struct *mm,
219 unsigned long start,
220 unsigned long end)
221{
222 struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
223
224 if (!context->invalidate_range)
225 return;
226
227 down_read(&context->umem_rwsem);
228 rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
229 end,
230 invalidate_range_end_trampoline, NULL);
231 up_read(&context->umem_rwsem);
232 ib_ucontext_notifier_end_account(context);
233}
234
235static struct mmu_notifier_ops ib_umem_notifiers = {
236 .release = ib_umem_notifier_release,
237 .invalidate_page = ib_umem_notifier_invalidate_page,
238 .invalidate_range_start = ib_umem_notifier_invalidate_range_start,
239 .invalidate_range_end = ib_umem_notifier_invalidate_range_end,
240};
241
44int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem) 242int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
45{ 243{
46 int ret_val; 244 int ret_val;
47 struct pid *our_pid; 245 struct pid *our_pid;
246 struct mm_struct *mm = get_task_mm(current);
247
248 if (!mm)
249 return -EINVAL;
48 250
49 /* Prevent creating ODP MRs in child processes */ 251 /* Prevent creating ODP MRs in child processes */
50 rcu_read_lock(); 252 rcu_read_lock();
51 our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); 253 our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
52 rcu_read_unlock(); 254 rcu_read_unlock();
53 put_pid(our_pid); 255 put_pid(our_pid);
54 if (context->tgid != our_pid) 256 if (context->tgid != our_pid) {
55 return -EINVAL; 257 ret_val = -EINVAL;
258 goto out_mm;
259 }
56 260
57 umem->hugetlb = 0; 261 umem->hugetlb = 0;
58 umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); 262 umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
59 if (!umem->odp_data) 263 if (!umem->odp_data) {
60 return -ENOMEM; 264 ret_val = -ENOMEM;
265 goto out_mm;
266 }
267 umem->odp_data->umem = umem;
61 268
62 mutex_init(&umem->odp_data->umem_mutex); 269 mutex_init(&umem->odp_data->umem_mutex);
63 270
271 init_completion(&umem->odp_data->notifier_completion);
272
64 umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) * 273 umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
65 sizeof(*umem->odp_data->page_list)); 274 sizeof(*umem->odp_data->page_list));
66 if (!umem->odp_data->page_list) { 275 if (!umem->odp_data->page_list) {
@@ -75,17 +284,72 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
75 goto out_page_list; 284 goto out_page_list;
76 } 285 }
77 286
287 /*
288 * When using MMU notifiers, we will get a
289 * notification before the "current" task (and MM) is
290 * destroyed. We use the umem_rwsem semaphore to synchronize.
291 */
292 down_write(&context->umem_rwsem);
293 context->odp_mrs_count++;
294 if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
295 rbt_ib_umem_insert(&umem->odp_data->interval_tree,
296 &context->umem_tree);
297 if (likely(!atomic_read(&context->notifier_count)))
298 umem->odp_data->mn_counters_active = true;
299 else
300 list_add(&umem->odp_data->no_private_counters,
301 &context->no_private_counters);
302 downgrade_write(&context->umem_rwsem);
303
304 if (context->odp_mrs_count == 1) {
305 /*
306 * Note that at this point, no MMU notifier is running
307 * for this context!
308 */
309 atomic_set(&context->notifier_count, 0);
310 INIT_HLIST_NODE(&context->mn.hlist);
311 context->mn.ops = &ib_umem_notifiers;
312 /*
313 * Lock-dep detects a false positive for mmap_sem vs.
314 * umem_rwsem, due to not grasping downgrade_write correctly.
315 */
316 lockdep_off();
317 ret_val = mmu_notifier_register(&context->mn, mm);
318 lockdep_on();
319 if (ret_val) {
320 pr_err("Failed to register mmu_notifier %d\n", ret_val);
321 ret_val = -EBUSY;
322 goto out_mutex;
323 }
324 }
325
326 up_read(&context->umem_rwsem);
327
328 /*
329 * Note that doing an mmput can cause a notifier for the relevant mm.
330 * If the notifier is called while we hold the umem_rwsem, this will
331 * cause a deadlock. Therefore, we release the reference only after we
332 * released the semaphore.
333 */
334 mmput(mm);
78 return 0; 335 return 0;
79 336
337out_mutex:
338 up_read(&context->umem_rwsem);
339 vfree(umem->odp_data->dma_list);
80out_page_list: 340out_page_list:
81 vfree(umem->odp_data->page_list); 341 vfree(umem->odp_data->page_list);
82out_odp_data: 342out_odp_data:
83 kfree(umem->odp_data); 343 kfree(umem->odp_data);
344out_mm:
345 mmput(mm);
84 return ret_val; 346 return ret_val;
85} 347}
86 348
87void ib_umem_odp_release(struct ib_umem *umem) 349void ib_umem_odp_release(struct ib_umem *umem)
88{ 350{
351 struct ib_ucontext *context = umem->context;
352
89 /* 353 /*
90 * Ensure that no more pages are mapped in the umem. 354 * Ensure that no more pages are mapped in the umem.
91 * 355 *
@@ -95,6 +359,54 @@ void ib_umem_odp_release(struct ib_umem *umem)
95 ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), 359 ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
96 ib_umem_end(umem)); 360 ib_umem_end(umem));
97 361
362 down_write(&context->umem_rwsem);
363 if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
364 rbt_ib_umem_remove(&umem->odp_data->interval_tree,
365 &context->umem_tree);
366 context->odp_mrs_count--;
367 if (!umem->odp_data->mn_counters_active) {
368 list_del(&umem->odp_data->no_private_counters);
369 complete_all(&umem->odp_data->notifier_completion);
370 }
371
372 /*
373 * Downgrade the lock to a read lock. This ensures that the notifiers
374 * (who lock the mutex for reading) will be able to finish, and we
375 * will be able to eventually obtain the mmu notifiers SRCU. Note
376 * that since we are doing it atomically, no other user could register
377 * and unregister while we do the check.
378 */
379 downgrade_write(&context->umem_rwsem);
380 if (!context->odp_mrs_count) {
381 struct task_struct *owning_process = NULL;
382 struct mm_struct *owning_mm = NULL;
383
384 owning_process = get_pid_task(context->tgid,
385 PIDTYPE_PID);
386 if (owning_process == NULL)
387 /*
388 * The process is already dead, notifier were removed
389 * already.
390 */
391 goto out;
392
393 owning_mm = get_task_mm(owning_process);
394 if (owning_mm == NULL)
395 /*
396 * The process' mm is already dead, notifier were
397 * removed already.
398 */
399 goto out_put_task;
400 mmu_notifier_unregister(&context->mn, owning_mm);
401
402 mmput(owning_mm);
403
404out_put_task:
405 put_task_struct(owning_process);
406 }
407out:
408 up_read(&context->umem_rwsem);
409
98 vfree(umem->odp_data->dma_list); 410 vfree(umem->odp_data->dma_list);
99 vfree(umem->odp_data->page_list); 411 vfree(umem->odp_data->page_list);
100 kfree(umem->odp_data); 412 kfree(umem->odp_data);
@@ -112,7 +424,8 @@ void ib_umem_odp_release(struct ib_umem *umem)
112 * the sequence number is taken from 424 * the sequence number is taken from
113 * umem->odp_data->notifiers_seq. 425 * umem->odp_data->notifiers_seq.
114 * 426 *
115 * The function returns -EFAULT if the DMA mapping operation fails. 427 * The function returns -EFAULT if the DMA mapping operation fails. It returns
428 * -EAGAIN if a concurrent invalidation prevents us from updating the page.
116 * 429 *
117 * The page is released via put_page even if the operation failed. For 430 * The page is released via put_page even if the operation failed. For
118 * on-demand pinning, the page is released whenever it isn't stored in the 431 * on-demand pinning, the page is released whenever it isn't stored in the
@@ -121,6 +434,7 @@ void ib_umem_odp_release(struct ib_umem *umem)
121static int ib_umem_odp_map_dma_single_page( 434static int ib_umem_odp_map_dma_single_page(
122 struct ib_umem *umem, 435 struct ib_umem *umem,
123 int page_index, 436 int page_index,
437 u64 base_virt_addr,
124 struct page *page, 438 struct page *page,
125 u64 access_mask, 439 u64 access_mask,
126 unsigned long current_seq) 440 unsigned long current_seq)
@@ -128,9 +442,19 @@ static int ib_umem_odp_map_dma_single_page(
128 struct ib_device *dev = umem->context->device; 442 struct ib_device *dev = umem->context->device;
129 dma_addr_t dma_addr; 443 dma_addr_t dma_addr;
130 int stored_page = 0; 444 int stored_page = 0;
445 int remove_existing_mapping = 0;
131 int ret = 0; 446 int ret = 0;
132 447
133 mutex_lock(&umem->odp_data->umem_mutex); 448 mutex_lock(&umem->odp_data->umem_mutex);
449 /*
450 * Note: we avoid writing if seq is different from the initial seq, to
451 * handle case of a racing notifier. This check also allows us to bail
452 * early if we have a notifier running in parallel with us.
453 */
454 if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
455 ret = -EAGAIN;
456 goto out;
457 }
134 if (!(umem->odp_data->dma_list[page_index])) { 458 if (!(umem->odp_data->dma_list[page_index])) {
135 dma_addr = ib_dma_map_page(dev, 459 dma_addr = ib_dma_map_page(dev,
136 page, 460 page,
@@ -148,14 +472,27 @@ static int ib_umem_odp_map_dma_single_page(
148 } else { 472 } else {
149 pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", 473 pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
150 umem->odp_data->page_list[page_index], page); 474 umem->odp_data->page_list[page_index], page);
475 /* Better remove the mapping now, to prevent any further
476 * damage. */
477 remove_existing_mapping = 1;
151 } 478 }
152 479
153out: 480out:
154 mutex_unlock(&umem->odp_data->umem_mutex); 481 mutex_unlock(&umem->odp_data->umem_mutex);
155 482
156 if (!stored_page) 483 /* On Demand Paging - avoid pinning the page */
484 if (umem->context->invalidate_range || !stored_page)
157 put_page(page); 485 put_page(page);
158 486
487 if (remove_existing_mapping && umem->context->invalidate_range) {
488 invalidate_page_trampoline(
489 umem,
490 base_virt_addr + (page_index * PAGE_SIZE),
491 base_virt_addr + ((page_index+1)*PAGE_SIZE),
492 NULL);
493 ret = -EAGAIN;
494 }
495
159 return ret; 496 return ret;
160} 497}
161 498
@@ -168,6 +505,8 @@ out:
168 * 505 *
169 * Returns the number of pages mapped in success, negative error code 506 * Returns the number of pages mapped in success, negative error code
170 * for failure. 507 * for failure.
508 * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
509 * the function from completing its task.
171 * 510 *
172 * @umem: the umem to map and pin 511 * @umem: the umem to map and pin
173 * @user_virt: the address from which we need to map. 512 * @user_virt: the address from which we need to map.
@@ -189,6 +528,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
189 struct page **local_page_list = NULL; 528 struct page **local_page_list = NULL;
190 u64 off; 529 u64 off;
191 int j, k, ret = 0, start_idx, npages = 0; 530 int j, k, ret = 0, start_idx, npages = 0;
531 u64 base_virt_addr;
192 532
193 if (access_mask == 0) 533 if (access_mask == 0)
194 return -EINVAL; 534 return -EINVAL;
@@ -203,6 +543,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
203 543
204 off = user_virt & (~PAGE_MASK); 544 off = user_virt & (~PAGE_MASK);
205 user_virt = user_virt & PAGE_MASK; 545 user_virt = user_virt & PAGE_MASK;
546 base_virt_addr = user_virt;
206 bcnt += off; /* Charge for the first page offset as well. */ 547 bcnt += off; /* Charge for the first page offset as well. */
207 548
208 owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); 549 owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
@@ -246,8 +587,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
246 user_virt += npages << PAGE_SHIFT; 587 user_virt += npages << PAGE_SHIFT;
247 for (j = 0; j < npages; ++j) { 588 for (j = 0; j < npages; ++j) {
248 ret = ib_umem_odp_map_dma_single_page( 589 ret = ib_umem_odp_map_dma_single_page(
249 umem, k, local_page_list[j], access_mask, 590 umem, k, base_virt_addr, local_page_list[j],
250 current_seq); 591 access_mask, current_seq);
251 if (ret < 0) 592 if (ret < 0)
252 break; 593 break;
253 k++; 594 k++;
@@ -286,6 +627,11 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
286 627
287 virt = max_t(u64, virt, ib_umem_start(umem)); 628 virt = max_t(u64, virt, ib_umem_start(umem));
288 bound = min_t(u64, bound, ib_umem_end(umem)); 629 bound = min_t(u64, bound, ib_umem_end(umem));
630 /* Note that during the run of this function, the
631 * notifiers_count of the MR is > 0, preventing any racing
632 * faults from completion. We might be racing with other
633 * invalidations, so we must make sure we free each page only
634 * once. */
289 for (addr = virt; addr < bound; addr += (u64)umem->page_size) { 635 for (addr = virt; addr < bound; addr += (u64)umem->page_size) {
290 idx = (addr - ib_umem_start(umem)) / PAGE_SIZE; 636 idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
291 mutex_lock(&umem->odp_data->umem_mutex); 637 mutex_lock(&umem->odp_data->umem_mutex);
@@ -300,8 +646,21 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
300 ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE, 646 ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
301 DMA_BIDIRECTIONAL); 647 DMA_BIDIRECTIONAL);
302 if (dma & ODP_WRITE_ALLOWED_BIT) 648 if (dma & ODP_WRITE_ALLOWED_BIT)
303 set_page_dirty_lock(head_page); 649 /*
304 put_page(page); 650 * set_page_dirty prefers being called with
651 * the page lock. However, MMU notifiers are
652 * called sometimes with and sometimes without
653 * the lock. We rely on the umem_mutex instead
654 * to prevent other mmu notifiers from
655 * continuing and allowing the page mapping to
656 * be removed.
657 */
658 set_page_dirty(head_page);
659 /* on demand pinning support */
660 if (!umem->context->invalidate_range)
661 put_page(page);
662 umem->odp_data->page_list[idx] = NULL;
663 umem->odp_data->dma_list[idx] = 0;
305 } 664 }
306 mutex_unlock(&umem->odp_data->umem_mutex); 665 mutex_unlock(&umem->odp_data->umem_mutex);
307 } 666 }
diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c
new file mode 100644
index 000000000000..727d788448f5
--- /dev/null
+++ b/drivers/infiniband/core/umem_rbtree.c
@@ -0,0 +1,94 @@
1/*
2 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33#include <linux/kernel.h>
34#include <linux/module.h>
35#include <linux/interval_tree_generic.h>
36#include <linux/sched.h>
37#include <linux/gfp.h>
38#include <rdma/ib_umem_odp.h>
39
40/*
41 * The ib_umem interval tree keeps track of memory regions for which the
42 * HW device requests to receive notification when the related memory
43 * mapping is changed.
44 *
45 * The owning ib_ucontext's umem_rwsem protects the tree.
46 */
47
48static inline u64 node_start(struct umem_odp_node *n)
49{
50 struct ib_umem_odp *umem_odp =
51 container_of(n, struct ib_umem_odp, interval_tree);
52
53 return ib_umem_start(umem_odp->umem);
54}
55
56/* Note that the representation of the intervals in the interval tree
57 * considers the ending point as contained in the interval, while the
58 * function ib_umem_end returns the first address which is not contained
59 * in the umem.
60 */
61static inline u64 node_last(struct umem_odp_node *n)
62{
63 struct ib_umem_odp *umem_odp =
64 container_of(n, struct ib_umem_odp, interval_tree);
65
66 return ib_umem_end(umem_odp->umem) - 1;
67}
68
69INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
70 node_start, node_last, , rbt_ib_umem)
71
72/* @last is not a part of the interval. See comment for function
73 * node_last.
74 */
75int rbt_ib_umem_for_each_in_range(struct rb_root *root,
76 u64 start, u64 last,
77 umem_call_back cb,
78 void *cookie)
79{
80 int ret_val = 0;
81 struct umem_odp_node *node;
82 struct ib_umem_odp *umem;
83
84 if (unlikely(start == last))
85 return ret_val;
86
87 for (node = rbt_ib_umem_iter_first(root, start, last - 1); node;
88 node = rbt_ib_umem_iter_next(node, start, last - 1)) {
89 umem = container_of(node, struct ib_umem_odp, interval_tree);
90 ret_val = cb(umem->umem, start, last, cookie) || ret_val;
91 }
92
93 return ret_val;
94}
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 70b697d8fbb3..532d8eba8b02 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -289,6 +289,9 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
289 struct ib_uverbs_get_context_resp resp; 289 struct ib_uverbs_get_context_resp resp;
290 struct ib_udata udata; 290 struct ib_udata udata;
291 struct ib_device *ibdev = file->device->ib_dev; 291 struct ib_device *ibdev = file->device->ib_dev;
292#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
293 struct ib_device_attr dev_attr;
294#endif
292 struct ib_ucontext *ucontext; 295 struct ib_ucontext *ucontext;
293 struct file *filp; 296 struct file *filp;
294 int ret; 297 int ret;
@@ -331,6 +334,20 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
331 rcu_read_unlock(); 334 rcu_read_unlock();
332 ucontext->closing = 0; 335 ucontext->closing = 0;
333 336
337#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
338 ucontext->umem_tree = RB_ROOT;
339 init_rwsem(&ucontext->umem_rwsem);
340 ucontext->odp_mrs_count = 0;
341 INIT_LIST_HEAD(&ucontext->no_private_counters);
342
343 ret = ib_query_device(ibdev, &dev_attr);
344 if (ret)
345 goto err_free;
346 if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
347 ucontext->invalidate_range = NULL;
348
349#endif
350
334 resp.num_comp_vectors = file->device->num_comp_vectors; 351 resp.num_comp_vectors = file->device->num_comp_vectors;
335 352
336 ret = get_unused_fd_flags(O_CLOEXEC); 353 ret = get_unused_fd_flags(O_CLOEXEC);