author     Linus Torvalds <torvalds@linux-foundation.org>  2019-09-21 13:07:42 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-09-21 13:07:42 -0400
commit     84da111de0b4be15bd500deff773f5116f39f7be (patch)
tree       76b5796f8258397bf7a3926b742a89166a8501ef
parent     227c3e9eb5cf3552c2cc83225df6d14adb05f8e8 (diff)
parent     62974fc389b364d8af70e044836362222bd3ae53 (diff)
Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull hmm updates from Jason Gunthorpe:
 "This is more cleanup and consolidation of the hmm APIs and the very
  strongly related mmu_notifier interfaces. Many places across the tree
  using these interfaces are touched in the process. Beyond that a
  cleanup to the page walker API and a few memremap related changes
  round out the series:

   - General improvement of hmm_range_fault() and related APIs, more
     documentation, bug fixes from testing, API simplification &
     consolidation, and unused API removal

   - Simplify the hmm related kconfigs to HMM_MIRROR and
     DEVICE_PRIVATE, and make them internal kconfig selects

   - Hoist a lot of code related to mmu notifier attachment out of
     drivers by using a refcount get/put attachment idiom and remove
     the convoluted mmu_notifier_unregister_no_release() and related
     APIs.

   - General API improvement for the migrate_vma API and revision of
     its only user in nouveau

   - Annotate mmu_notifiers with lockdep and sleeping region debugging

  Two series unrelated to HMM or mmu_notifiers came along due to
  dependencies:

   - Allow pagemap's memremap_pages family of APIs to work without
     providing a struct device

   - Make walk_page_range() and related use a constant structure for
     function pointers"

* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (75 commits)
  libnvdimm: Enable unit test infrastructure compile checks
  mm, notifier: Catch sleeping/blocking for !blockable
  kernel.h: Add non_block_start/end()
  drm/radeon: guard against calling an unpaired radeon_mn_unregister()
  csky: add missing brackets in a macro for tlb.h
  pagewalk: use lockdep_assert_held for locking validation
  pagewalk: separate function pointers from iterator data
  mm: split out a new pagewalk.h header from mm.h
  mm/mmu_notifiers: annotate with might_sleep()
  mm/mmu_notifiers: prime lockdep
  mm/mmu_notifiers: add a lockdep map for invalidate_range_start/end
  mm/mmu_notifiers: remove the __mmu_notifier_invalidate_range_start/end exports
  mm/hmm: hmm_range_fault() infinite loop
  mm/hmm: hmm_range_fault() NULL pointer bug
  mm/hmm: fix hmm_range_fault()'s handling of swapped out pages
  mm/mmu_notifiers: remove unregister_no_release
  RDMA/odp: remove ib_ucontext from ib_umem
  RDMA/odp: use mmu_notifier_get/put for 'struct ib_ucontext_per_mm'
  RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr
  RDMA/mlx5: Use ib_umem_start instead of umem.address
  ...
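One recurring change in this series, called out in the last bullet of the message above, is the pagewalk rework: the callback table becomes a shared const structure and the mm plus a private pointer move to the walk_page_range() call itself (see the openrisc, powerpc and s390 hunks below). A minimal, non-authoritative sketch of the new convention; demo_pte_entry, demo_walk_ops and demo_walk are illustrative names, not part of the series:

#include <linux/pagewalk.h>

/* Illustrative callback; invoked for each PTE in the walked range. */
static int demo_pte_entry(pte_t *pte, unsigned long addr,
                          unsigned long next, struct mm_walk *walk)
{
        /* inspect or modify *pte; walk->private carries caller data */
        return 0;
}

/* Callbacks now live in a shared, const table instead of on the stack. */
static const struct mm_walk_ops demo_walk_ops = {
        .pte_entry = demo_pte_entry,
};

/* Caller holds mm->mmap_sem, as the new lockdep annotation enforces. */
static int demo_walk(struct mm_struct *mm, unsigned long start,
                     unsigned long end)
{
        /* mm and a private pointer are now arguments of the walk itself */
        return walk_page_range(mm, start, end, &demo_walk_ops, NULL);
}

The old on-stack struct mm_walk with an embedded .mm field is gone; separating the function pointers from the iterator data is what lets the ops tables become static const, as the converted call sites below show.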
-rw-r--r--  Documentation/vm/hmm.rst  73
-rw-r--r--  arch/csky/include/asm/tlb.h  8
-rw-r--r--  arch/openrisc/kernel/dma.c  23
-rw-r--r--  arch/powerpc/mm/book3s64/subpage_prot.c  12
-rw-r--r--  arch/s390/mm/gmap.c  35
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/Kconfig  4
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  2
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c  15
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c  31
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_priv.h  3
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_process.c  88
-rw-r--r--  drivers/gpu/drm/nouveau/Kconfig  5
-rw-r--r--  drivers/gpu/drm/nouveau/nouveau_dmem.c  456
-rw-r--r--  drivers/gpu/drm/nouveau/nouveau_dmem.h  11
-rw-r--r--  drivers/gpu/drm/nouveau/nouveau_drm.c  3
-rw-r--r--  drivers/gpu/drm/nouveau/nouveau_svm.c  23
-rw-r--r--  drivers/gpu/drm/radeon/radeon.h  3
-rw-r--r--  drivers/gpu/drm/radeon/radeon_device.c  2
-rw-r--r--  drivers/gpu/drm/radeon/radeon_drv.c  2
-rw-r--r--  drivers/gpu/drm/radeon/radeon_mn.c  156
-rw-r--r--  drivers/infiniband/Kconfig  1
-rw-r--r--  drivers/infiniband/core/device.c  1
-rw-r--r--  drivers/infiniband/core/umem.c  54
-rw-r--r--  drivers/infiniband/core/umem_odp.c  524
-rw-r--r--  drivers/infiniband/core/uverbs_cmd.c  5
-rw-r--r--  drivers/infiniband/core/uverbs_main.c  1
-rw-r--r--  drivers/infiniband/hw/mlx5/main.c  9
-rw-r--r--  drivers/infiniband/hw/mlx5/mem.c  13
-rw-r--r--  drivers/infiniband/hw/mlx5/mr.c  38
-rw-r--r--  drivers/infiniband/hw/mlx5/odp.c  88
-rw-r--r--  drivers/misc/sgi-gru/grufile.c  1
-rw-r--r--  drivers/misc/sgi-gru/grutables.h  2
-rw-r--r--  drivers/misc/sgi-gru/grutlbpurge.c  84
-rw-r--r--  drivers/nvdimm/Kconfig  12
-rw-r--r--  drivers/nvdimm/Makefile  4
-rw-r--r--  fs/proc/task_mmu.c  80
-rw-r--r--  include/linux/hmm.h  125
-rw-r--r--  include/linux/ioport.h  2
-rw-r--r--  include/linux/kernel.h  23
-rw-r--r--  include/linux/memremap.h  3
-rw-r--r--  include/linux/migrate.h  120
-rw-r--r--  include/linux/mm.h  46
-rw-r--r--  include/linux/mm_types.h  6
-rw-r--r--  include/linux/mmu_notifier.h  59
-rw-r--r--  include/linux/pagewalk.h  66
-rw-r--r--  include/linux/sched.h  4
-rw-r--r--  include/rdma/ib_umem.h  2
-rw-r--r--  include/rdma/ib_umem_odp.h  58
-rw-r--r--  include/rdma/ib_verbs.h  7
-rw-r--r--  kernel/fork.c  1
-rw-r--r--  kernel/resource.c  45
-rw-r--r--  kernel/sched/core.c  19
-rw-r--r--  mm/Kconfig  20
-rw-r--r--  mm/hmm.c  490
-rw-r--r--  mm/madvise.c  42
-rw-r--r--  mm/memcontrol.c  25
-rw-r--r--  mm/mempolicy.c  17
-rw-r--r--  mm/memremap.c  105
-rw-r--r--  mm/migrate.c  276
-rw-r--r--  mm/mincore.c  17
-rw-r--r--  mm/mmu_notifier.c  263
-rw-r--r--  mm/mprotect.c  26
-rw-r--r--  mm/page_alloc.c  2
-rw-r--r--  mm/pagewalk.c  126
-rw-r--r--  tools/testing/nvdimm/test/iomap.c  1
65 files changed, 1684 insertions, 2184 deletions
diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 710ce1c701bf..0a5960beccf7 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -192,15 +192,14 @@ read only, or fully unmap, etc.). The device must complete the update before
192the driver callback returns. 192the driver callback returns.
193 193
194When the device driver wants to populate a range of virtual addresses, it can 194When the device driver wants to populate a range of virtual addresses, it can
195use either:: 195use::
196 196
197 long hmm_range_snapshot(struct hmm_range *range); 197 long hmm_range_fault(struct hmm_range *range, unsigned int flags);
198 long hmm_range_fault(struct hmm_range *range, bool block);
199 198
200The first one (hmm_range_snapshot()) will only fetch present CPU page table 199With the HMM_RANGE_SNAPSHOT flag, it will only fetch present CPU page table
201entries and will not trigger a page fault on missing or non-present entries. 200entries and will not trigger a page fault on missing or non-present entries.
202The second one does trigger a page fault on missing or read-only entries if 201Without that flag, it does trigger a page fault on missing or read-only entries
203write access is requested (see below). Page faults use the generic mm page 202if write access is requested (see below). Page faults use the generic mm page
204fault code path just like a CPU page fault. 203fault code path just like a CPU page fault.
205 204
206Both functions copy CPU page table entries into their pfns array argument. Each 205Both functions copy CPU page table entries into their pfns array argument. Each
@@ -223,24 +222,24 @@ The usage pattern is::
223 range.flags = ...; 222 range.flags = ...;
224 range.values = ...; 223 range.values = ...;
225 range.pfn_shift = ...; 224 range.pfn_shift = ...;
226 hmm_range_register(&range); 225 hmm_range_register(&range, mirror);
227 226
228 /* 227 /*
229 * Just wait for range to be valid, safe to ignore return value as we 228 * Just wait for range to be valid, safe to ignore return value as we
230 * will use the return value of hmm_range_snapshot() below under the 229 * will use the return value of hmm_range_fault() below under the
231 * mmap_sem to ascertain the validity of the range. 230 * mmap_sem to ascertain the validity of the range.
232 */ 231 */
233 hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); 232 hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
234 233
235 again: 234 again:
236 down_read(&mm->mmap_sem); 235 down_read(&mm->mmap_sem);
237 ret = hmm_range_snapshot(&range); 236 ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT);
238 if (ret) { 237 if (ret) {
239 up_read(&mm->mmap_sem); 238 up_read(&mm->mmap_sem);
240 if (ret == -EBUSY) { 239 if (ret == -EBUSY) {
241 /* 240 /*
242 * No need to check hmm_range_wait_until_valid() return value 241 * No need to check hmm_range_wait_until_valid() return value
243 * on retry we will get proper error with hmm_range_snapshot() 242 * on retry we will get proper error with hmm_range_fault()
244 */ 243 */
245 hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); 244 hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC);
246 goto again; 245 goto again;
@@ -340,58 +339,8 @@ Migration to and from device memory
340=================================== 339===================================
341 340
342Because the CPU cannot access device memory, migration must use the device DMA 341Because the CPU cannot access device memory, migration must use the device DMA
343engine to perform copy from and to device memory. For this we need a new 342engine to perform copy from and to device memory. For this we need to use
344migration helper:: 343migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize() helpers.
345
346 int migrate_vma(const struct migrate_vma_ops *ops,
347 struct vm_area_struct *vma,
348 unsigned long mentries,
349 unsigned long start,
350 unsigned long end,
351 unsigned long *src,
352 unsigned long *dst,
353 void *private);
354
355Unlike other migration functions it works on a range of virtual address, there
356are two reasons for that. First, device DMA copy has a high setup overhead cost
357and thus batching multiple pages is needed as otherwise the migration overhead
358makes the whole exercise pointless. The second reason is because the
359migration might be for a range of addresses the device is actively accessing.
360
361The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy())
362controls destination memory allocation and copy operation. Second one is there
363to allow the device driver to perform cleanup operations after migration::
364
365 struct migrate_vma_ops {
366 void (*alloc_and_copy)(struct vm_area_struct *vma,
367 const unsigned long *src,
368 unsigned long *dst,
369 unsigned long start,
370 unsigned long end,
371 void *private);
372 void (*finalize_and_map)(struct vm_area_struct *vma,
373 const unsigned long *src,
374 const unsigned long *dst,
375 unsigned long start,
376 unsigned long end,
377 void *private);
378 };
379
380It is important to stress that these migration helpers allow for holes in the
381virtual address range. Some pages in the range might not be migrated for all
382the usual reasons (page is pinned, page is locked, ...). This helper does not
383fail but just skips over those pages.
384
385The alloc_and_copy() might decide to not migrate all pages in the
386range (for reasons under the callback control). For those, the callback just
387has to leave the corresponding dst entry empty.
388
389Finally, the migration of the struct page might fail (for file backed page) for
390various reasons (failure to freeze reference, or update page cache, ...). If
391that happens, then the finalize_and_map() can catch any pages that were not
392migrated. Note those pages were still copied to a new page and thus we wasted
393bandwidth but this is considered as a rare event and a price that we are
394willing to pay to keep all the code simpler.
395 344
396 345
397Memory cgroup (memcg) and rss accounting 346Memory cgroup (memcg) and rss accounting
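The migrate_vma_setup()/migrate_vma_pages()/migrate_vma_finalize() helpers named in the hmm.rst hunk above replace the old migrate_vma() callback interface; the nouveau hunks later in this diff use them exactly this way. A condensed, hedged sketch of the flow for a single page, with the driver copy left as a placeholder (demo_migrate_to_ram is an illustrative name):

#include <linux/mm.h>
#include <linux/migrate.h>

/* Sketch only: migrate one device page back to system memory on CPU fault. */
static vm_fault_t demo_migrate_to_ram(struct vm_fault *vmf)
{
        unsigned long src = 0, dst = 0;
        struct migrate_vma args = {
                .vma   = vmf->vma,
                .start = vmf->address,
                .end   = vmf->address + PAGE_SIZE,
                .src   = &src,
                .dst   = &dst,
        };

        if (migrate_vma_setup(&args) < 0)       /* isolate pages, fill args.src */
                return VM_FAULT_SIGBUS;
        if (!args.cpages)                       /* nothing collected, not an error */
                return 0;

        /*
         * Allocate the destination page, do the device->host copy and fill
         * args.dst[0] here (driver specific, e.g. via a DMA engine).
         */

        migrate_vma_pages(&args);               /* install the new page(s) */
        migrate_vma_finalize(&args);            /* unlock/put pages, restore CPU ptes */
        return 0;
}

migrate_vma_setup() reports how many pages it actually isolated in args.cpages; entries the driver leaves empty in args.dst are simply skipped, matching the hole-tolerant behaviour the old documentation described.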
diff --git a/arch/csky/include/asm/tlb.h b/arch/csky/include/asm/tlb.h
index 8c7cc097666f..fdff9b8d70c8 100644
--- a/arch/csky/include/asm/tlb.h
+++ b/arch/csky/include/asm/tlb.h
@@ -8,14 +8,14 @@
8 8
9#define tlb_start_vma(tlb, vma) \ 9#define tlb_start_vma(tlb, vma) \
10 do { \ 10 do { \
11 if (!tlb->fullmm) \ 11 if (!(tlb)->fullmm) \
12 flush_cache_range(vma, vma->vm_start, vma->vm_end); \ 12 flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \
13 } while (0) 13 } while (0)
14 14
15#define tlb_end_vma(tlb, vma) \ 15#define tlb_end_vma(tlb, vma) \
16 do { \ 16 do { \
17 if (!tlb->fullmm) \ 17 if (!(tlb)->fullmm) \
18 flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ 18 flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \
19 } while (0) 19 } while (0)
20 20
21#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) 21#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index b41a79fcdbd9..4d5b8bd1d795 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -16,6 +16,7 @@
16 */ 16 */
17 17
18#include <linux/dma-noncoherent.h> 18#include <linux/dma-noncoherent.h>
19#include <linux/pagewalk.h>
19 20
20#include <asm/cpuinfo.h> 21#include <asm/cpuinfo.h>
21#include <asm/spr_defs.h> 22#include <asm/spr_defs.h>
@@ -43,6 +44,10 @@ page_set_nocache(pte_t *pte, unsigned long addr,
43 return 0; 44 return 0;
44} 45}
45 46
47static const struct mm_walk_ops set_nocache_walk_ops = {
48 .pte_entry = page_set_nocache,
49};
50
46static int 51static int
47page_clear_nocache(pte_t *pte, unsigned long addr, 52page_clear_nocache(pte_t *pte, unsigned long addr,
48 unsigned long next, struct mm_walk *walk) 53 unsigned long next, struct mm_walk *walk)
@@ -58,6 +63,10 @@ page_clear_nocache(pte_t *pte, unsigned long addr,
58 return 0; 63 return 0;
59} 64}
60 65
66static const struct mm_walk_ops clear_nocache_walk_ops = {
67 .pte_entry = page_clear_nocache,
68};
69
61/* 70/*
62 * Alloc "coherent" memory, which for OpenRISC means simply uncached. 71 * Alloc "coherent" memory, which for OpenRISC means simply uncached.
63 * 72 *
@@ -80,10 +89,6 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
80{ 89{
81 unsigned long va; 90 unsigned long va;
82 void *page; 91 void *page;
83 struct mm_walk walk = {
84 .pte_entry = page_set_nocache,
85 .mm = &init_mm
86 };
87 92
88 page = alloc_pages_exact(size, gfp | __GFP_ZERO); 93 page = alloc_pages_exact(size, gfp | __GFP_ZERO);
89 if (!page) 94 if (!page)
@@ -98,7 +103,8 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
98 * We need to iterate through the pages, clearing the dcache for 103 * We need to iterate through the pages, clearing the dcache for
99 * them and setting the cache-inhibit bit. 104 * them and setting the cache-inhibit bit.
100 */ 105 */
101 if (walk_page_range(va, va + size, &walk)) { 106 if (walk_page_range(&init_mm, va, va + size, &set_nocache_walk_ops,
107 NULL)) {
102 free_pages_exact(page, size); 108 free_pages_exact(page, size);
103 return NULL; 109 return NULL;
104 } 110 }
@@ -111,13 +117,10 @@ arch_dma_free(struct device *dev, size_t size, void *vaddr,
111 dma_addr_t dma_handle, unsigned long attrs) 117 dma_addr_t dma_handle, unsigned long attrs)
112{ 118{
113 unsigned long va = (unsigned long)vaddr; 119 unsigned long va = (unsigned long)vaddr;
114 struct mm_walk walk = {
115 .pte_entry = page_clear_nocache,
116 .mm = &init_mm
117 };
118 120
119 /* walk_page_range shouldn't be able to fail here */ 121 /* walk_page_range shouldn't be able to fail here */
120 WARN_ON(walk_page_range(va, va + size, &walk)); 122 WARN_ON(walk_page_range(&init_mm, va, va + size,
123 &clear_nocache_walk_ops, NULL));
121 124
122 free_pages_exact(vaddr, size); 125 free_pages_exact(vaddr, size);
123} 126}
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
index 9ba07e55c489..2ef24a53f4c9 100644
--- a/arch/powerpc/mm/book3s64/subpage_prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -7,7 +7,7 @@
7#include <linux/kernel.h> 7#include <linux/kernel.h>
8#include <linux/gfp.h> 8#include <linux/gfp.h>
9#include <linux/types.h> 9#include <linux/types.h>
10#include <linux/mm.h> 10#include <linux/pagewalk.h>
11#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
12#include <linux/syscalls.h> 12#include <linux/syscalls.h>
13 13
@@ -139,14 +139,14 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
139 return 0; 139 return 0;
140} 140}
141 141
142static const struct mm_walk_ops subpage_walk_ops = {
143 .pmd_entry = subpage_walk_pmd_entry,
144};
145
142static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, 146static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
143 unsigned long len) 147 unsigned long len)
144{ 148{
145 struct vm_area_struct *vma; 149 struct vm_area_struct *vma;
146 struct mm_walk subpage_proto_walk = {
147 .mm = mm,
148 .pmd_entry = subpage_walk_pmd_entry,
149 };
150 150
151 /* 151 /*
152 * We don't try too hard, we just mark all the vma in that range 152 * We don't try too hard, we just mark all the vma in that range
@@ -163,7 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
163 if (vma->vm_start >= (addr + len)) 163 if (vma->vm_start >= (addr + len))
164 break; 164 break;
165 vma->vm_flags |= VM_NOHUGEPAGE; 165 vma->vm_flags |= VM_NOHUGEPAGE;
166 walk_page_vma(vma, &subpage_proto_walk); 166 walk_page_vma(vma, &subpage_walk_ops, NULL);
167 vma = vma->vm_next; 167 vma = vma->vm_next;
168 } 168 }
169} 169}
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index cd8e03f04d6d..edcdca97e85e 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -9,7 +9,7 @@
9 */ 9 */
10 10
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/mm.h> 12#include <linux/pagewalk.h>
13#include <linux/swap.h> 13#include <linux/swap.h>
14#include <linux/smp.h> 14#include <linux/smp.h>
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
@@ -2521,13 +2521,9 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
2521 return 0; 2521 return 0;
2522} 2522}
2523 2523
2524static inline void zap_zero_pages(struct mm_struct *mm) 2524static const struct mm_walk_ops zap_zero_walk_ops = {
2525{ 2525 .pmd_entry = __zap_zero_pages,
2526 struct mm_walk walk = { .pmd_entry = __zap_zero_pages }; 2526};
2527
2528 walk.mm = mm;
2529 walk_page_range(0, TASK_SIZE, &walk);
2530}
2531 2527
2532/* 2528/*
2533 * switch on pgstes for its userspace process (for kvm) 2529 * switch on pgstes for its userspace process (for kvm)
@@ -2546,7 +2542,7 @@ int s390_enable_sie(void)
2546 mm->context.has_pgste = 1; 2542 mm->context.has_pgste = 1;
2547 /* split thp mappings and disable thp for future mappings */ 2543 /* split thp mappings and disable thp for future mappings */
2548 thp_split_mm(mm); 2544 thp_split_mm(mm);
2549 zap_zero_pages(mm); 2545 walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
2550 up_write(&mm->mmap_sem); 2546 up_write(&mm->mmap_sem);
2551 return 0; 2547 return 0;
2552} 2548}
@@ -2589,12 +2585,13 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
2589 return 0; 2585 return 0;
2590} 2586}
2591 2587
2588static const struct mm_walk_ops enable_skey_walk_ops = {
2589 .hugetlb_entry = __s390_enable_skey_hugetlb,
2590 .pte_entry = __s390_enable_skey_pte,
2591};
2592
2592int s390_enable_skey(void) 2593int s390_enable_skey(void)
2593{ 2594{
2594 struct mm_walk walk = {
2595 .hugetlb_entry = __s390_enable_skey_hugetlb,
2596 .pte_entry = __s390_enable_skey_pte,
2597 };
2598 struct mm_struct *mm = current->mm; 2595 struct mm_struct *mm = current->mm;
2599 struct vm_area_struct *vma; 2596 struct vm_area_struct *vma;
2600 int rc = 0; 2597 int rc = 0;
@@ -2614,8 +2611,7 @@ int s390_enable_skey(void)
2614 } 2611 }
2615 mm->def_flags &= ~VM_MERGEABLE; 2612 mm->def_flags &= ~VM_MERGEABLE;
2616 2613
2617 walk.mm = mm; 2614 walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
2618 walk_page_range(0, TASK_SIZE, &walk);
2619 2615
2620out_up: 2616out_up:
2621 up_write(&mm->mmap_sem); 2617 up_write(&mm->mmap_sem);
@@ -2633,13 +2629,14 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
2633 return 0; 2629 return 0;
2634} 2630}
2635 2631
2632static const struct mm_walk_ops reset_cmma_walk_ops = {
2633 .pte_entry = __s390_reset_cmma,
2634};
2635
2636void s390_reset_cmma(struct mm_struct *mm) 2636void s390_reset_cmma(struct mm_struct *mm)
2637{ 2637{
2638 struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
2639
2640 down_write(&mm->mmap_sem); 2638 down_write(&mm->mmap_sem);
2641 walk.mm = mm; 2639 walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
2642 walk_page_range(0, TASK_SIZE, &walk);
2643 up_write(&mm->mmap_sem); 2640 up_write(&mm->mmap_sem);
2644} 2641}
2645EXPORT_SYMBOL_GPL(s390_reset_cmma); 2642EXPORT_SYMBOL_GPL(s390_reset_cmma);
diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig b/drivers/gpu/drm/amd/amdgpu/Kconfig
index f6e5c0282fc1..2e98c016cb47 100644
--- a/drivers/gpu/drm/amd/amdgpu/Kconfig
+++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
@@ -27,7 +27,9 @@ config DRM_AMDGPU_CIK
27config DRM_AMDGPU_USERPTR 27config DRM_AMDGPU_USERPTR
28 bool "Always enable userptr write support" 28 bool "Always enable userptr write support"
29 depends on DRM_AMDGPU 29 depends on DRM_AMDGPU
30 depends on HMM_MIRROR 30 depends on MMU
31 select HMM_MIRROR
32 select MMU_NOTIFIER
31 help 33 help
32 This option selects CONFIG_HMM and CONFIG_HMM_MIRROR if it 34 This option selects CONFIG_HMM and CONFIG_HMM_MIRROR if it
33 isn't already selected to enabled full userptr support. 35 isn't already selected to enabled full userptr support.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 48a2070e72f2..bdf849da32e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -35,6 +35,7 @@
35#include <linux/pm_runtime.h> 35#include <linux/pm_runtime.h>
36#include <linux/vga_switcheroo.h> 36#include <linux/vga_switcheroo.h>
37#include <drm/drm_probe_helper.h> 37#include <drm/drm_probe_helper.h>
38#include <linux/mmu_notifier.h>
38 39
39#include "amdgpu.h" 40#include "amdgpu.h"
40#include "amdgpu_irq.h" 41#include "amdgpu_irq.h"
@@ -1469,6 +1470,7 @@ static void __exit amdgpu_exit(void)
1469 amdgpu_unregister_atpx_handler(); 1470 amdgpu_unregister_atpx_handler();
1470 amdgpu_sync_fini(); 1471 amdgpu_sync_fini();
1471 amdgpu_fence_slab_fini(); 1472 amdgpu_fence_slab_fini();
1473 mmu_notifier_synchronize();
1472} 1474}
1473 1475
1474module_init(amdgpu_init); 1476module_init(amdgpu_init);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index f1f8cdd695d3..31d4deb5d294 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -195,13 +195,14 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node,
195 * Block for operations on BOs to finish and mark pages as accessed and 195 * Block for operations on BOs to finish and mark pages as accessed and
196 * potentially dirty. 196 * potentially dirty.
197 */ 197 */
198static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, 198static int
199 const struct hmm_update *update) 199amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
200 const struct mmu_notifier_range *update)
200{ 201{
201 struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); 202 struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
202 unsigned long start = update->start; 203 unsigned long start = update->start;
203 unsigned long end = update->end; 204 unsigned long end = update->end;
204 bool blockable = update->blockable; 205 bool blockable = mmu_notifier_range_blockable(update);
205 struct interval_tree_node *it; 206 struct interval_tree_node *it;
206 207
207 /* notification is exclusive, but interval is inclusive */ 208 /* notification is exclusive, but interval is inclusive */
@@ -243,13 +244,14 @@ static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
243 * necessitates evicting all user-mode queues of the process. The BOs 244 * necessitates evicting all user-mode queues of the process. The BOs
244 * are restorted in amdgpu_mn_invalidate_range_end_hsa. 245 * are restorted in amdgpu_mn_invalidate_range_end_hsa.
245 */ 246 */
246static int amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror, 247static int
247 const struct hmm_update *update) 248amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
249 const struct mmu_notifier_range *update)
248{ 250{
249 struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); 251 struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
250 unsigned long start = update->start; 252 unsigned long start = update->start;
251 unsigned long end = update->end; 253 unsigned long end = update->end;
252 bool blockable = update->blockable; 254 bool blockable = mmu_notifier_range_blockable(update);
253 struct interval_tree_node *it; 255 struct interval_tree_node *it;
254 256
255 /* notification is exclusive, but interval is inclusive */ 257 /* notification is exclusive, but interval is inclusive */
@@ -482,6 +484,5 @@ void amdgpu_hmm_init_range(struct hmm_range *range)
482 range->flags = hmm_range_flags; 484 range->flags = hmm_range_flags;
483 range->values = hmm_range_values; 485 range->values = hmm_range_values;
484 range->pfn_shift = PAGE_SHIFT; 486 range->pfn_shift = PAGE_SHIFT;
485 INIT_LIST_HEAD(&range->list);
486 } 487 }
487} 488}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 13b144c8f67d..dff41d0a85fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -794,7 +794,6 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
794 struct hmm_range *range; 794 struct hmm_range *range;
795 unsigned long i; 795 unsigned long i;
796 uint64_t *pfns; 796 uint64_t *pfns;
797 int retry = 0;
798 int r = 0; 797 int r = 0;
799 798
800 if (!mm) /* Happens during process shutdown */ 799 if (!mm) /* Happens during process shutdown */
@@ -835,10 +834,11 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages)
835 0 : range->flags[HMM_PFN_WRITE]; 834 0 : range->flags[HMM_PFN_WRITE];
836 range->pfn_flags_mask = 0; 835 range->pfn_flags_mask = 0;
837 range->pfns = pfns; 836 range->pfns = pfns;
838 hmm_range_register(range, mirror, start, 837 range->start = start;
839 start + ttm->num_pages * PAGE_SIZE, PAGE_SHIFT); 838 range->end = start + ttm->num_pages * PAGE_SIZE;
839
840 hmm_range_register(range, mirror);
840 841
841retry:
842 /* 842 /*
843 * Just wait for range to be valid, safe to ignore return value as we 843 * Just wait for range to be valid, safe to ignore return value as we
844 * will use the return value of hmm_range_fault() below under the 844 * will use the return value of hmm_range_fault() below under the
@@ -847,24 +847,12 @@ retry:
847 hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT); 847 hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT);
848 848
849 down_read(&mm->mmap_sem); 849 down_read(&mm->mmap_sem);
850 850 r = hmm_range_fault(range, 0);
851 r = hmm_range_fault(range, true);
852 if (unlikely(r < 0)) {
853 if (likely(r == -EAGAIN)) {
854 /*
855 * return -EAGAIN, mmap_sem is dropped
856 */
857 if (retry++ < MAX_RETRY_HMM_RANGE_FAULT)
858 goto retry;
859 else
860 pr_err("Retry hmm fault too many times\n");
861 }
862
863 goto out_up_read;
864 }
865
866 up_read(&mm->mmap_sem); 851 up_read(&mm->mmap_sem);
867 852
853 if (unlikely(r < 0))
854 goto out_free_pfns;
855
868 for (i = 0; i < ttm->num_pages; i++) { 856 for (i = 0; i < ttm->num_pages; i++) {
869 pages[i] = hmm_device_entry_to_page(range, pfns[i]); 857 pages[i] = hmm_device_entry_to_page(range, pfns[i]);
870 if (unlikely(!pages[i])) { 858 if (unlikely(!pages[i])) {
@@ -880,9 +868,6 @@ retry:
880 868
881 return 0; 869 return 0;
882 870
883out_up_read:
884 if (likely(r != -EAGAIN))
885 up_read(&mm->mmap_sem);
886out_free_pfns: 871out_free_pfns:
887 hmm_range_unregister(range); 872 hmm_range_unregister(range);
888 kvfree(pfns); 873 kvfree(pfns);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 3bb75d11a662..c89326125d71 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -687,9 +687,6 @@ struct kfd_process {
687 /* We want to receive a notification when the mm_struct is destroyed */ 687 /* We want to receive a notification when the mm_struct is destroyed */
688 struct mmu_notifier mmu_notifier; 688 struct mmu_notifier mmu_notifier;
689 689
690 /* Use for delayed freeing of kfd_process structure */
691 struct rcu_head rcu;
692
693 unsigned int pasid; 690 unsigned int pasid;
694 unsigned int doorbell_index; 691 unsigned int doorbell_index;
695 692
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 0c6ac043ae3c..40e3fc0c6942 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -62,8 +62,8 @@ static struct workqueue_struct *kfd_restore_wq;
62 62
63static struct kfd_process *find_process(const struct task_struct *thread); 63static struct kfd_process *find_process(const struct task_struct *thread);
64static void kfd_process_ref_release(struct kref *ref); 64static void kfd_process_ref_release(struct kref *ref);
65static struct kfd_process *create_process(const struct task_struct *thread, 65static struct kfd_process *create_process(const struct task_struct *thread);
66 struct file *filep); 66static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep);
67 67
68static void evict_process_worker(struct work_struct *work); 68static void evict_process_worker(struct work_struct *work);
69static void restore_process_worker(struct work_struct *work); 69static void restore_process_worker(struct work_struct *work);
@@ -289,7 +289,15 @@ struct kfd_process *kfd_create_process(struct file *filep)
289 if (process) { 289 if (process) {
290 pr_debug("Process already found\n"); 290 pr_debug("Process already found\n");
291 } else { 291 } else {
292 process = create_process(thread, filep); 292 process = create_process(thread);
293 if (IS_ERR(process))
294 goto out;
295
296 ret = kfd_process_init_cwsr_apu(process, filep);
297 if (ret) {
298 process = ERR_PTR(ret);
299 goto out;
300 }
293 301
294 if (!procfs.kobj) 302 if (!procfs.kobj)
295 goto out; 303 goto out;
@@ -478,11 +486,9 @@ static void kfd_process_ref_release(struct kref *ref)
478 queue_work(kfd_process_wq, &p->release_work); 486 queue_work(kfd_process_wq, &p->release_work);
479} 487}
480 488
481static void kfd_process_destroy_delayed(struct rcu_head *rcu) 489static void kfd_process_free_notifier(struct mmu_notifier *mn)
482{ 490{
483 struct kfd_process *p = container_of(rcu, struct kfd_process, rcu); 491 kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier));
484
485 kfd_unref_process(p);
486} 492}
487 493
488static void kfd_process_notifier_release(struct mmu_notifier *mn, 494static void kfd_process_notifier_release(struct mmu_notifier *mn,
@@ -534,12 +540,12 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
534 540
535 mutex_unlock(&p->mutex); 541 mutex_unlock(&p->mutex);
536 542
537 mmu_notifier_unregister_no_release(&p->mmu_notifier, mm); 543 mmu_notifier_put(&p->mmu_notifier);
538 mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed);
539} 544}
540 545
541static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { 546static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
542 .release = kfd_process_notifier_release, 547 .release = kfd_process_notifier_release,
548 .free_notifier = kfd_process_free_notifier,
543}; 549};
544 550
545static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep) 551static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
@@ -609,81 +615,69 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd)
609 return 0; 615 return 0;
610} 616}
611 617
612static struct kfd_process *create_process(const struct task_struct *thread, 618/*
613 struct file *filep) 619 * On return the kfd_process is fully operational and will be freed when the
620 * mm is released
621 */
622static struct kfd_process *create_process(const struct task_struct *thread)
614{ 623{
615 struct kfd_process *process; 624 struct kfd_process *process;
616 int err = -ENOMEM; 625 int err = -ENOMEM;
617 626
618 process = kzalloc(sizeof(*process), GFP_KERNEL); 627 process = kzalloc(sizeof(*process), GFP_KERNEL);
619
620 if (!process) 628 if (!process)
621 goto err_alloc_process; 629 goto err_alloc_process;
622 630
623 process->pasid = kfd_pasid_alloc();
624 if (process->pasid == 0)
625 goto err_alloc_pasid;
626
627 if (kfd_alloc_process_doorbells(process) < 0)
628 goto err_alloc_doorbells;
629
630 kref_init(&process->ref); 631 kref_init(&process->ref);
631
632 mutex_init(&process->mutex); 632 mutex_init(&process->mutex);
633
634 process->mm = thread->mm; 633 process->mm = thread->mm;
635
636 /* register notifier */
637 process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
638 err = mmu_notifier_register(&process->mmu_notifier, process->mm);
639 if (err)
640 goto err_mmu_notifier;
641
642 hash_add_rcu(kfd_processes_table, &process->kfd_processes,
643 (uintptr_t)process->mm);
644
645 process->lead_thread = thread->group_leader; 634 process->lead_thread = thread->group_leader;
646 get_task_struct(process->lead_thread);
647
648 INIT_LIST_HEAD(&process->per_device_data); 635 INIT_LIST_HEAD(&process->per_device_data);
649 636 INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
637 INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
638 process->last_restore_timestamp = get_jiffies_64();
650 kfd_event_init_process(process); 639 kfd_event_init_process(process);
640 process->is_32bit_user_mode = in_compat_syscall();
641
642 process->pasid = kfd_pasid_alloc();
643 if (process->pasid == 0)
644 goto err_alloc_pasid;
645
646 if (kfd_alloc_process_doorbells(process) < 0)
647 goto err_alloc_doorbells;
651 648
652 err = pqm_init(&process->pqm, process); 649 err = pqm_init(&process->pqm, process);
653 if (err != 0) 650 if (err != 0)
654 goto err_process_pqm_init; 651 goto err_process_pqm_init;
655 652
656 /* init process apertures*/ 653 /* init process apertures*/
657 process->is_32bit_user_mode = in_compat_syscall();
658 err = kfd_init_apertures(process); 654 err = kfd_init_apertures(process);
659 if (err != 0) 655 if (err != 0)
660 goto err_init_apertures; 656 goto err_init_apertures;
661 657
662 INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker); 658 /* Must be last, have to use release destruction after this */
663 INIT_DELAYED_WORK(&process->restore_work, restore_process_worker); 659 process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops;
664 process->last_restore_timestamp = get_jiffies_64(); 660 err = mmu_notifier_register(&process->mmu_notifier, process->mm);
665
666 err = kfd_process_init_cwsr_apu(process, filep);
667 if (err) 661 if (err)
668 goto err_init_cwsr; 662 goto err_register_notifier;
663
664 get_task_struct(process->lead_thread);
665 hash_add_rcu(kfd_processes_table, &process->kfd_processes,
666 (uintptr_t)process->mm);
669 667
670 return process; 668 return process;
671 669
672err_init_cwsr: 670err_register_notifier:
673 kfd_process_free_outstanding_kfd_bos(process); 671 kfd_process_free_outstanding_kfd_bos(process);
674 kfd_process_destroy_pdds(process); 672 kfd_process_destroy_pdds(process);
675err_init_apertures: 673err_init_apertures:
676 pqm_uninit(&process->pqm); 674 pqm_uninit(&process->pqm);
677err_process_pqm_init: 675err_process_pqm_init:
678 hash_del_rcu(&process->kfd_processes);
679 synchronize_rcu();
680 mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm);
681err_mmu_notifier:
682 mutex_destroy(&process->mutex);
683 kfd_free_process_doorbells(process); 676 kfd_free_process_doorbells(process);
684err_alloc_doorbells: 677err_alloc_doorbells:
685 kfd_pasid_free(process->pasid); 678 kfd_pasid_free(process->pasid);
686err_alloc_pasid: 679err_alloc_pasid:
680 mutex_destroy(&process->mutex);
687 kfree(process); 681 kfree(process);
688err_alloc_process: 682err_alloc_process:
689 return ERR_PTR(err); 683 return ERR_PTR(err);
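The kfd_process changes above are one instance of the mmu_notifier get/put attachment idiom mentioned in the pull message: the private rcu_head and mmu_notifier_unregister_no_release() are replaced by a free_notifier() callback and mmu_notifier_put(). A rough sketch of the pattern, assuming a hypothetical demo_ctx wrapper that is not part of the series:

#include <linux/err.h>
#include <linux/mmu_notifier.h>
#include <linux/slab.h>

struct demo_ctx {
        struct mmu_notifier notifier;
        /* driver private state ... */
};

/* Runs once the last reference is dropped, after the SRCU grace period. */
static void demo_free_notifier(struct mmu_notifier *mn)
{
        kfree(container_of(mn, struct demo_ctx, notifier));
}

static void demo_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
        /* quiesce the device here; freeing is deferred to free_notifier() */
}

static const struct mmu_notifier_ops demo_notifier_ops = {
        .release       = demo_release,
        .free_notifier = demo_free_notifier,
};

static struct demo_ctx *demo_attach(struct mm_struct *mm)
{
        struct demo_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        int ret;

        if (!ctx)
                return ERR_PTR(-ENOMEM);
        ctx->notifier.ops = &demo_notifier_ops;
        ret = mmu_notifier_register(&ctx->notifier, mm);
        if (ret) {
                kfree(ctx);
                return ERR_PTR(ret);
        }
        return ctx;
}

static void demo_detach(struct demo_ctx *ctx)
{
        /*
         * Drop the registration reference; this replaces the old
         * mmu_notifier_unregister_no_release() + mmu_notifier_call_srcu() pair.
         */
        mmu_notifier_put(&ctx->notifier);
}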
diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig
index 96b9814e6d06..3558df043592 100644
--- a/drivers/gpu/drm/nouveau/Kconfig
+++ b/drivers/gpu/drm/nouveau/Kconfig
@@ -86,9 +86,10 @@ config DRM_NOUVEAU_SVM
86 bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support" 86 bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support"
87 depends on DEVICE_PRIVATE 87 depends on DEVICE_PRIVATE
88 depends on DRM_NOUVEAU 88 depends on DRM_NOUVEAU
89 depends on HMM_MIRROR 89 depends on MMU
90 depends on STAGING 90 depends on STAGING
91 select MIGRATE_VMA_HELPER 91 select HMM_MIRROR
92 select MMU_NOTIFIER
92 default n 93 default n
93 help 94 help
94 Say Y here if you want to enable experimental support for 95 Say Y here if you want to enable experimental support for
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 1333220787a1..fa1439941596 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -44,8 +44,6 @@
44#define DMEM_CHUNK_SIZE (2UL << 20) 44#define DMEM_CHUNK_SIZE (2UL << 20)
45#define DMEM_CHUNK_NPAGES (DMEM_CHUNK_SIZE >> PAGE_SHIFT) 45#define DMEM_CHUNK_NPAGES (DMEM_CHUNK_SIZE >> PAGE_SHIFT)
46 46
47struct nouveau_migrate;
48
49enum nouveau_aper { 47enum nouveau_aper {
50 NOUVEAU_APER_VIRT, 48 NOUVEAU_APER_VIRT,
51 NOUVEAU_APER_VRAM, 49 NOUVEAU_APER_VRAM,
@@ -86,21 +84,13 @@ static inline struct nouveau_dmem *page_to_dmem(struct page *page)
86 return container_of(page->pgmap, struct nouveau_dmem, pagemap); 84 return container_of(page->pgmap, struct nouveau_dmem, pagemap);
87} 85}
88 86
89struct nouveau_dmem_fault { 87static unsigned long nouveau_dmem_page_addr(struct page *page)
90 struct nouveau_drm *drm; 88{
91 struct nouveau_fence *fence; 89 struct nouveau_dmem_chunk *chunk = page->zone_device_data;
92 dma_addr_t *dma; 90 unsigned long idx = page_to_pfn(page) - chunk->pfn_first;
93 unsigned long npages;
94};
95 91
96struct nouveau_migrate { 92 return (idx << PAGE_SHIFT) + chunk->bo->bo.offset;
97 struct vm_area_struct *vma; 93}
98 struct nouveau_drm *drm;
99 struct nouveau_fence *fence;
100 unsigned long npages;
101 dma_addr_t *dma;
102 unsigned long dma_nr;
103};
104 94
105static void nouveau_dmem_page_free(struct page *page) 95static void nouveau_dmem_page_free(struct page *page)
106{ 96{
@@ -125,165 +115,90 @@ static void nouveau_dmem_page_free(struct page *page)
125 spin_unlock(&chunk->lock); 115 spin_unlock(&chunk->lock);
126} 116}
127 117
128static void 118static void nouveau_dmem_fence_done(struct nouveau_fence **fence)
129nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma,
130 const unsigned long *src_pfns,
131 unsigned long *dst_pfns,
132 unsigned long start,
133 unsigned long end,
134 void *private)
135{ 119{
136 struct nouveau_dmem_fault *fault = private; 120 if (fence) {
137 struct nouveau_drm *drm = fault->drm; 121 nouveau_fence_wait(*fence, true, false);
138 struct device *dev = drm->dev->dev; 122 nouveau_fence_unref(fence);
139 unsigned long addr, i, npages = 0; 123 } else {
140 nouveau_migrate_copy_t copy; 124 /*
141 int ret; 125 * FIXME wait for channel to be IDLE before calling finalizing
142 126 * the hmem object.
143 127 */
144 /* First allocate new memory */
145 for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
146 struct page *dpage, *spage;
147
148 dst_pfns[i] = 0;
149 spage = migrate_pfn_to_page(src_pfns[i]);
150 if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
151 continue;
152
153 dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr);
154 if (!dpage) {
155 dst_pfns[i] = MIGRATE_PFN_ERROR;
156 continue;
157 }
158 lock_page(dpage);
159
160 dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) |
161 MIGRATE_PFN_LOCKED;
162 npages++;
163 }
164
165 /* Allocate storage for DMA addresses, so we can unmap later. */
166 fault->dma = kmalloc(sizeof(*fault->dma) * npages, GFP_KERNEL);
167 if (!fault->dma)
168 goto error;
169
170 /* Copy things over */
171 copy = drm->dmem->migrate.copy_func;
172 for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
173 struct nouveau_dmem_chunk *chunk;
174 struct page *spage, *dpage;
175 u64 src_addr, dst_addr;
176
177 dpage = migrate_pfn_to_page(dst_pfns[i]);
178 if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR)
179 continue;
180
181 spage = migrate_pfn_to_page(src_pfns[i]);
182 if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) {
183 dst_pfns[i] = MIGRATE_PFN_ERROR;
184 __free_page(dpage);
185 continue;
186 }
187
188 fault->dma[fault->npages] =
189 dma_map_page_attrs(dev, dpage, 0, PAGE_SIZE,
190 PCI_DMA_BIDIRECTIONAL,
191 DMA_ATTR_SKIP_CPU_SYNC);
192 if (dma_mapping_error(dev, fault->dma[fault->npages])) {
193 dst_pfns[i] = MIGRATE_PFN_ERROR;
194 __free_page(dpage);
195 continue;
196 }
197
198 dst_addr = fault->dma[fault->npages++];
199
200 chunk = spage->zone_device_data;
201 src_addr = page_to_pfn(spage) - chunk->pfn_first;
202 src_addr = (src_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
203
204 ret = copy(drm, 1, NOUVEAU_APER_HOST, dst_addr,
205 NOUVEAU_APER_VRAM, src_addr);
206 if (ret) {
207 dst_pfns[i] = MIGRATE_PFN_ERROR;
208 __free_page(dpage);
209 continue;
210 }
211 } 128 }
129}
212 130
213 nouveau_fence_new(drm->dmem->migrate.chan, false, &fault->fence); 131static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm,
214 132 struct vm_fault *vmf, struct migrate_vma *args,
215 return; 133 dma_addr_t *dma_addr)
216 134{
217error: 135 struct device *dev = drm->dev->dev;
218 for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, ++i) { 136 struct page *dpage, *spage;
219 struct page *page;
220 137
221 if (!dst_pfns[i] || dst_pfns[i] == MIGRATE_PFN_ERROR) 138 spage = migrate_pfn_to_page(args->src[0]);
222 continue; 139 if (!spage || !(args->src[0] & MIGRATE_PFN_MIGRATE))
140 return 0;
223 141
224 page = migrate_pfn_to_page(dst_pfns[i]); 142 dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma, vmf->address);
225 dst_pfns[i] = MIGRATE_PFN_ERROR; 143 if (!dpage)
226 if (page == NULL) 144 return VM_FAULT_SIGBUS;
227 continue; 145 lock_page(dpage);
228 146
229 __free_page(page); 147 *dma_addr = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
230 } 148 if (dma_mapping_error(dev, *dma_addr))
231} 149 goto error_free_page;
232 150
233void nouveau_dmem_fault_finalize_and_map(struct vm_area_struct *vma, 151 if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_HOST, *dma_addr,
234 const unsigned long *src_pfns, 152 NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage)))
235 const unsigned long *dst_pfns, 153 goto error_dma_unmap;
236 unsigned long start,
237 unsigned long end,
238 void *private)
239{
240 struct nouveau_dmem_fault *fault = private;
241 struct nouveau_drm *drm = fault->drm;
242 154
243 if (fault->fence) { 155 args->dst[0] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
244 nouveau_fence_wait(fault->fence, true, false); 156 return 0;
245 nouveau_fence_unref(&fault->fence);
246 } else {
247 /*
248 * FIXME wait for channel to be IDLE before calling finalizing
249 * the hmem object below (nouveau_migrate_hmem_fini()).
250 */
251 }
252 157
253 while (fault->npages--) { 158error_dma_unmap:
254 dma_unmap_page(drm->dev->dev, fault->dma[fault->npages], 159 dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
255 PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); 160error_free_page:
256 } 161 __free_page(dpage);
257 kfree(fault->dma); 162 return VM_FAULT_SIGBUS;
258} 163}
259 164
260static const struct migrate_vma_ops nouveau_dmem_fault_migrate_ops = {
261 .alloc_and_copy = nouveau_dmem_fault_alloc_and_copy,
262 .finalize_and_map = nouveau_dmem_fault_finalize_and_map,
263};
264
265static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) 165static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
266{ 166{
267 struct nouveau_dmem *dmem = page_to_dmem(vmf->page); 167 struct nouveau_dmem *dmem = page_to_dmem(vmf->page);
268 unsigned long src[1] = {0}, dst[1] = {0}; 168 struct nouveau_drm *drm = dmem->drm;
269 struct nouveau_dmem_fault fault = { .drm = dmem->drm }; 169 struct nouveau_fence *fence;
270 int ret; 170 unsigned long src = 0, dst = 0;
171 dma_addr_t dma_addr = 0;
172 vm_fault_t ret;
173 struct migrate_vma args = {
174 .vma = vmf->vma,
175 .start = vmf->address,
176 .end = vmf->address + PAGE_SIZE,
177 .src = &src,
178 .dst = &dst,
179 };
271 180
272 /* 181 /*
273 * FIXME what we really want is to find some heuristic to migrate more 182 * FIXME what we really want is to find some heuristic to migrate more
274 * than just one page on CPU fault. When such fault happens it is very 183 * than just one page on CPU fault. When such fault happens it is very
275 * likely that more surrounding page will CPU fault too. 184 * likely that more surrounding page will CPU fault too.
276 */ 185 */
277 ret = migrate_vma(&nouveau_dmem_fault_migrate_ops, vmf->vma, 186 if (migrate_vma_setup(&args) < 0)
278 vmf->address, vmf->address + PAGE_SIZE,
279 src, dst, &fault);
280 if (ret)
281 return VM_FAULT_SIGBUS; 187 return VM_FAULT_SIGBUS;
188 if (!args.cpages)
189 return 0;
282 190
283 if (dst[0] == MIGRATE_PFN_ERROR) 191 ret = nouveau_dmem_fault_copy_one(drm, vmf, &args, &dma_addr);
284 return VM_FAULT_SIGBUS; 192 if (ret || dst == 0)
193 goto done;
285 194
286 return 0; 195 nouveau_fence_new(dmem->migrate.chan, false, &fence);
196 migrate_vma_pages(&args);
197 nouveau_dmem_fence_done(&fence);
198 dma_unmap_page(drm->dev->dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
199done:
200 migrate_vma_finalize(&args);
201 return ret;
287} 202}
288 203
289static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = { 204static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = {
@@ -642,188 +557,115 @@ out_free:
642 drm->dmem = NULL; 557 drm->dmem = NULL;
643} 558}
644 559
645static void 560static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm,
646nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma, 561 unsigned long src, dma_addr_t *dma_addr)
647 const unsigned long *src_pfns,
648 unsigned long *dst_pfns,
649 unsigned long start,
650 unsigned long end,
651 void *private)
652{ 562{
653 struct nouveau_migrate *migrate = private;
654 struct nouveau_drm *drm = migrate->drm;
655 struct device *dev = drm->dev->dev; 563 struct device *dev = drm->dev->dev;
656 unsigned long addr, i, npages = 0; 564 struct page *dpage, *spage;
657 nouveau_migrate_copy_t copy;
658 int ret;
659
660 /* First allocate new memory */
661 for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
662 struct page *dpage, *spage;
663
664 dst_pfns[i] = 0;
665 spage = migrate_pfn_to_page(src_pfns[i]);
666 if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
667 continue;
668
669 dpage = nouveau_dmem_page_alloc_locked(drm);
670 if (!dpage)
671 continue;
672
673 dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) |
674 MIGRATE_PFN_LOCKED |
675 MIGRATE_PFN_DEVICE;
676 npages++;
677 }
678
679 if (!npages)
680 return;
681
682 /* Allocate storage for DMA addresses, so we can unmap later. */
683 migrate->dma = kmalloc(sizeof(*migrate->dma) * npages, GFP_KERNEL);
684 if (!migrate->dma)
685 goto error;
686
687 /* Copy things over */
688 copy = drm->dmem->migrate.copy_func;
689 for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
690 struct nouveau_dmem_chunk *chunk;
691 struct page *spage, *dpage;
692 u64 src_addr, dst_addr;
693
694 dpage = migrate_pfn_to_page(dst_pfns[i]);
695 if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR)
696 continue;
697
698 chunk = dpage->zone_device_data;
699 dst_addr = page_to_pfn(dpage) - chunk->pfn_first;
700 dst_addr = (dst_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
701
702 spage = migrate_pfn_to_page(src_pfns[i]);
703 if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) {
704 nouveau_dmem_page_free_locked(drm, dpage);
705 dst_pfns[i] = 0;
706 continue;
707 }
708
709 migrate->dma[migrate->dma_nr] =
710 dma_map_page_attrs(dev, spage, 0, PAGE_SIZE,
711 PCI_DMA_BIDIRECTIONAL,
712 DMA_ATTR_SKIP_CPU_SYNC);
713 if (dma_mapping_error(dev, migrate->dma[migrate->dma_nr])) {
714 nouveau_dmem_page_free_locked(drm, dpage);
715 dst_pfns[i] = 0;
716 continue;
717 }
718
719 src_addr = migrate->dma[migrate->dma_nr++];
720 565
721 ret = copy(drm, 1, NOUVEAU_APER_VRAM, dst_addr, 566 spage = migrate_pfn_to_page(src);
722 NOUVEAU_APER_HOST, src_addr); 567 if (!spage || !(src & MIGRATE_PFN_MIGRATE))
723 if (ret) { 568 goto out;
724 nouveau_dmem_page_free_locked(drm, dpage);
725 dst_pfns[i] = 0;
726 continue;
727 }
728 }
729
730 nouveau_fence_new(drm->dmem->migrate.chan, false, &migrate->fence);
731 569
732 return; 570 dpage = nouveau_dmem_page_alloc_locked(drm);
571 if (!dpage)
572 return 0;
733 573
734error: 574 *dma_addr = dma_map_page(dev, spage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
735 for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, ++i) { 575 if (dma_mapping_error(dev, *dma_addr))
736 struct page *page; 576 goto out_free_page;
737 577
738 if (!dst_pfns[i] || dst_pfns[i] == MIGRATE_PFN_ERROR) 578 if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_VRAM,
739 continue; 579 nouveau_dmem_page_addr(dpage), NOUVEAU_APER_HOST,
580 *dma_addr))
581 goto out_dma_unmap;
740 582
741 page = migrate_pfn_to_page(dst_pfns[i]); 583 return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
742 dst_pfns[i] = MIGRATE_PFN_ERROR;
743 if (page == NULL)
744 continue;
745 584
746 __free_page(page); 585out_dma_unmap:
747 } 586 dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
587out_free_page:
588 nouveau_dmem_page_free_locked(drm, dpage);
589out:
590 return 0;
748} 591}
749 592
750void nouveau_dmem_migrate_finalize_and_map(struct vm_area_struct *vma, 593static void nouveau_dmem_migrate_chunk(struct nouveau_drm *drm,
751 const unsigned long *src_pfns, 594 struct migrate_vma *args, dma_addr_t *dma_addrs)
752 const unsigned long *dst_pfns,
753 unsigned long start,
754 unsigned long end,
755 void *private)
756{ 595{
757 struct nouveau_migrate *migrate = private; 596 struct nouveau_fence *fence;
758 struct nouveau_drm *drm = migrate->drm; 597 unsigned long addr = args->start, nr_dma = 0, i;
759 598
760 if (migrate->fence) { 599 for (i = 0; addr < args->end; i++) {
761 nouveau_fence_wait(migrate->fence, true, false); 600 args->dst[i] = nouveau_dmem_migrate_copy_one(drm, args->src[i],
762 nouveau_fence_unref(&migrate->fence); 601 dma_addrs + nr_dma);
763 } else { 602 if (args->dst[i])
764 /* 603 nr_dma++;
765 * FIXME wait for channel to be IDLE before finalizing 604 addr += PAGE_SIZE;
766 * the hmem object below (nouveau_migrate_hmem_fini()) ?
767 */
768 } 605 }
769 606
770 while (migrate->dma_nr--) { 607 nouveau_fence_new(drm->dmem->migrate.chan, false, &fence);
771 dma_unmap_page(drm->dev->dev, migrate->dma[migrate->dma_nr], 608 migrate_vma_pages(args);
772 PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); 609 nouveau_dmem_fence_done(&fence);
773 }
774 kfree(migrate->dma);
775 610
611 while (nr_dma--) {
612 dma_unmap_page(drm->dev->dev, dma_addrs[nr_dma], PAGE_SIZE,
613 DMA_BIDIRECTIONAL);
614 }
776 /* 615 /*
777 * FIXME optimization: update GPU page table to point to newly 616 * FIXME optimization: update GPU page table to point to newly migrated
778 * migrated memory. 617 * memory.
779 */ 618 */
619 migrate_vma_finalize(args);
780} 620}
781 621
782static const struct migrate_vma_ops nouveau_dmem_migrate_ops = {
783 .alloc_and_copy = nouveau_dmem_migrate_alloc_and_copy,
784 .finalize_and_map = nouveau_dmem_migrate_finalize_and_map,
785};
786
787int 622int
788nouveau_dmem_migrate_vma(struct nouveau_drm *drm, 623nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
789 struct vm_area_struct *vma, 624 struct vm_area_struct *vma,
790 unsigned long start, 625 unsigned long start,
791 unsigned long end) 626 unsigned long end)
792{ 627{
793 unsigned long *src_pfns, *dst_pfns, npages; 628 unsigned long npages = (end - start) >> PAGE_SHIFT;
794 struct nouveau_migrate migrate = {0}; 629 unsigned long max = min(SG_MAX_SINGLE_ALLOC, npages);
795 unsigned long i, c, max; 630 dma_addr_t *dma_addrs;
796 int ret = 0; 631 struct migrate_vma args = {
797 632 .vma = vma,
798 npages = (end - start) >> PAGE_SHIFT; 633 .start = start,
799 max = min(SG_MAX_SINGLE_ALLOC, npages); 634 };
800 src_pfns = kzalloc(sizeof(long) * max, GFP_KERNEL); 635 unsigned long c, i;
801 if (src_pfns == NULL) 636 int ret = -ENOMEM;
802 return -ENOMEM; 637
803 dst_pfns = kzalloc(sizeof(long) * max, GFP_KERNEL); 638 args.src = kcalloc(max, sizeof(args.src), GFP_KERNEL);
804 if (dst_pfns == NULL) { 639 if (!args.src)
805 kfree(src_pfns); 640 goto out;
806 return -ENOMEM; 641 args.dst = kcalloc(max, sizeof(args.dst), GFP_KERNEL);
807 } 642 if (!args.dst)
643 goto out_free_src;
808 644
809 migrate.drm = drm; 645 dma_addrs = kmalloc_array(max, sizeof(*dma_addrs), GFP_KERNEL);
810 migrate.vma = vma; 646 if (!dma_addrs)
811 migrate.npages = npages; 647 goto out_free_dst;
812 for (i = 0; i < npages; i += c) {
813 unsigned long next;
814 648
649 for (i = 0; i < npages; i += c) {
815 c = min(SG_MAX_SINGLE_ALLOC, npages); 650 c = min(SG_MAX_SINGLE_ALLOC, npages);
816 next = start + (c << PAGE_SHIFT); 651 args.end = start + (c << PAGE_SHIFT);
817 ret = migrate_vma(&nouveau_dmem_migrate_ops, vma, start, 652 ret = migrate_vma_setup(&args);
818 next, src_pfns, dst_pfns, &migrate);
819 if (ret) 653 if (ret)
820 goto out; 654 goto out_free_dma;
821 start = next; 655
656 if (args.cpages)
657 nouveau_dmem_migrate_chunk(drm, &args, dma_addrs);
658 args.start = args.end;
822 } 659 }
823 660
661 ret = 0;
662out_free_dma:
663 kfree(dma_addrs);
664out_free_dst:
665 kfree(args.dst);
666out_free_src:
667 kfree(args.src);
824out: 668out:
825 kfree(dst_pfns);
826 kfree(src_pfns);
827 return ret; 669 return ret;
828} 670}
829 671
@@ -841,11 +683,10 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
841 683
842 npages = (range->end - range->start) >> PAGE_SHIFT; 684 npages = (range->end - range->start) >> PAGE_SHIFT;
843 for (i = 0; i < npages; ++i) { 685 for (i = 0; i < npages; ++i) {
844 struct nouveau_dmem_chunk *chunk;
845 struct page *page; 686 struct page *page;
846 uint64_t addr; 687 uint64_t addr;
847 688
848 page = hmm_pfn_to_page(range, range->pfns[i]); 689 page = hmm_device_entry_to_page(range, range->pfns[i]);
849 if (page == NULL) 690 if (page == NULL)
850 continue; 691 continue;
851 692
@@ -859,10 +700,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
859 continue; 700 continue;
860 } 701 }
861 702
862 chunk = page->zone_device_data; 703 addr = nouveau_dmem_page_addr(page);
863 addr = page_to_pfn(page) - chunk->pfn_first;
864 addr = (addr + chunk->bo->bo.mem.start) << PAGE_SHIFT;
865
866 range->pfns[i] &= ((1UL << range->pfn_shift) - 1); 704 range->pfns[i] &= ((1UL << range->pfn_shift) - 1);
867 range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift; 705 range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift;
868 } 706 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.h b/drivers/gpu/drm/nouveau/nouveau_dmem.h
index 9d97d756fb7d..92394be5d649 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.h
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.h
@@ -45,16 +45,5 @@ static inline void nouveau_dmem_init(struct nouveau_drm *drm) {}
45static inline void nouveau_dmem_fini(struct nouveau_drm *drm) {} 45static inline void nouveau_dmem_fini(struct nouveau_drm *drm) {}
46static inline void nouveau_dmem_suspend(struct nouveau_drm *drm) {} 46static inline void nouveau_dmem_suspend(struct nouveau_drm *drm) {}
47static inline void nouveau_dmem_resume(struct nouveau_drm *drm) {} 47static inline void nouveau_dmem_resume(struct nouveau_drm *drm) {}
48
49static inline int nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
50 struct vm_area_struct *vma,
51 unsigned long start,
52 unsigned long end)
53{
54 return 0;
55}
56
57static inline void nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
58 struct hmm_range *range) {}
59#endif /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */ 48#endif /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */
60#endif 49#endif
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index bdc948352467..2cd83849600f 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -28,6 +28,7 @@
28#include <linux/pci.h> 28#include <linux/pci.h>
29#include <linux/pm_runtime.h> 29#include <linux/pm_runtime.h>
30#include <linux/vga_switcheroo.h> 30#include <linux/vga_switcheroo.h>
31#include <linux/mmu_notifier.h>
31 32
32#include <drm/drm_crtc_helper.h> 33#include <drm/drm_crtc_helper.h>
33#include <drm/drm_ioctl.h> 34#include <drm/drm_ioctl.h>
@@ -1290,6 +1291,8 @@ nouveau_drm_exit(void)
1290#ifdef CONFIG_NOUVEAU_PLATFORM_DRIVER 1291#ifdef CONFIG_NOUVEAU_PLATFORM_DRIVER
1291 platform_driver_unregister(&nouveau_platform_driver); 1292 platform_driver_unregister(&nouveau_platform_driver);
1292#endif 1293#endif
1294 if (IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM))
1295 mmu_notifier_synchronize();
1293} 1296}
1294 1297
1295module_init(nouveau_drm_init); 1298module_init(nouveau_drm_init);
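
Because notifier memory is now freed from an SRCU callback after the final mmu_notifier_put(), modules using the get/put idiom must drain those callbacks before their text is unloaded; that is what the mmu_notifier_synchronize() call added above does. A minimal sketch of the exit-path pattern for a hypothetical driver (exit function only, names illustrative):

#include <linux/module.h>
#include <linux/mmu_notifier.h>

static void __exit my_driver_exit(void)
{
	/* normal driver teardown (unregister devices, etc.) goes first */

	/* wait for any SRCU-deferred ->free_notifier() calls to finish */
	mmu_notifier_synchronize();
}
module_exit(my_driver_exit);
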
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index a835cebb6d90..668d4bd0c118 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -252,13 +252,13 @@ nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit)
252 252
253static int 253static int
254nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror, 254nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
255 const struct hmm_update *update) 255 const struct mmu_notifier_range *update)
256{ 256{
257 struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror); 257 struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror);
258 unsigned long start = update->start; 258 unsigned long start = update->start;
259 unsigned long limit = update->end; 259 unsigned long limit = update->end;
260 260
261 if (!update->blockable) 261 if (!mmu_notifier_range_blockable(update))
262 return -EAGAIN; 262 return -EAGAIN;
263 263
264 SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit); 264 SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit);
@@ -485,31 +485,29 @@ nouveau_range_done(struct hmm_range *range)
485} 485}
486 486
487static int 487static int
488nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) 488nouveau_range_fault(struct nouveau_svmm *svmm, struct hmm_range *range)
489{ 489{
490 long ret; 490 long ret;
491 491
492 range->default_flags = 0; 492 range->default_flags = 0;
493 range->pfn_flags_mask = -1UL; 493 range->pfn_flags_mask = -1UL;
494 494
495 ret = hmm_range_register(range, mirror, 495 ret = hmm_range_register(range, &svmm->mirror);
496 range->start, range->end,
497 PAGE_SHIFT);
498 if (ret) { 496 if (ret) {
499 up_read(&range->vma->vm_mm->mmap_sem); 497 up_read(&svmm->mm->mmap_sem);
500 return (int)ret; 498 return (int)ret;
501 } 499 }
502 500
503 if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { 501 if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
504 up_read(&range->vma->vm_mm->mmap_sem); 502 up_read(&svmm->mm->mmap_sem);
505 return -EAGAIN; 503 return -EBUSY;
506 } 504 }
507 505
508 ret = hmm_range_fault(range, true); 506 ret = hmm_range_fault(range, 0);
509 if (ret <= 0) { 507 if (ret <= 0) {
510 if (ret == 0) 508 if (ret == 0)
511 ret = -EBUSY; 509 ret = -EBUSY;
512 up_read(&range->vma->vm_mm->mmap_sem); 510 up_read(&svmm->mm->mmap_sem);
513 hmm_range_unregister(range); 511 hmm_range_unregister(range);
514 return ret; 512 return ret;
515 } 513 }
@@ -682,7 +680,6 @@ nouveau_svm_fault(struct nvif_notify *notify)
682 args.i.p.addr + args.i.p.size, fn - fi); 680 args.i.p.addr + args.i.p.size, fn - fi);
683 681
684 /* Have HMM fault pages within the fault window to the GPU. */ 682 /* Have HMM fault pages within the fault window to the GPU. */
685 range.vma = vma;
686 range.start = args.i.p.addr; 683 range.start = args.i.p.addr;
687 range.end = args.i.p.addr + args.i.p.size; 684 range.end = args.i.p.addr + args.i.p.size;
688 range.pfns = args.phys; 685 range.pfns = args.phys;
@@ -690,7 +687,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
690 range.values = nouveau_svm_pfn_values; 687 range.values = nouveau_svm_pfn_values;
691 range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; 688 range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
692again: 689again:
693 ret = nouveau_range_fault(&svmm->mirror, &range); 690 ret = nouveau_range_fault(svmm, &range);
694 if (ret == 0) { 691 if (ret == 0) {
695 mutex_lock(&svmm->mutex); 692 mutex_lock(&svmm->mutex);
696 if (!nouveau_range_done(&range)) { 693 if (!nouveau_range_done(&range)) {
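
The reworked nouveau_range_fault() above is a compact example of the mirror fault sequence under this series' API: register the range against the mirror, wait for it to become valid, call hmm_range_fault(), and let the caller retry on -EBUSY. The sketch below strips the nouveau-specific details; my_range_fault() and the explicit mm argument are illustrative, the helpers are the ones used above, and, as in the code above, the function is entered with mmap_sem held for read and drops it on every failure path.

#include <linux/hmm.h>

static int my_range_fault(struct hmm_mirror *mirror, struct mm_struct *mm,
			  struct hmm_range *range)
{
	long ret;

	/* fault with the default flags; no per-pfn overrides */
	range->default_flags = 0;
	range->pfn_flags_mask = -1UL;

	ret = hmm_range_register(range, mirror);
	if (ret) {
		up_read(&mm->mmap_sem);
		return (int)ret;
	}

	/* wait out any concurrent invalidation */
	if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
		up_read(&mm->mmap_sem);
		return -EBUSY;	/* caller re-takes mmap_sem and retries */
	}

	ret = hmm_range_fault(range, 0);
	if (ret <= 0) {
		if (ret == 0)
			ret = -EBUSY;
		up_read(&mm->mmap_sem);
		hmm_range_unregister(range);
		return (int)ret;
	}
	return 0;
}
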
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 05b88491ccb9..d59b004f6695 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -2449,9 +2449,6 @@ struct radeon_device {
2449 /* tracking pinned memory */ 2449 /* tracking pinned memory */
2450 u64 vram_pin_size; 2450 u64 vram_pin_size;
2451 u64 gart_pin_size; 2451 u64 gart_pin_size;
2452
2453 struct mutex mn_lock;
2454 DECLARE_HASHTABLE(mn_hash, 7);
2455}; 2452};
2456 2453
2457bool radeon_is_px(struct drm_device *dev); 2454bool radeon_is_px(struct drm_device *dev);
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 88eb7cb522bb..5d017f0aec66 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -1325,8 +1325,6 @@ int radeon_device_init(struct radeon_device *rdev,
1325 init_rwsem(&rdev->pm.mclk_lock); 1325 init_rwsem(&rdev->pm.mclk_lock);
1326 init_rwsem(&rdev->exclusive_lock); 1326 init_rwsem(&rdev->exclusive_lock);
1327 init_waitqueue_head(&rdev->irq.vblank_queue); 1327 init_waitqueue_head(&rdev->irq.vblank_queue);
1328 mutex_init(&rdev->mn_lock);
1329 hash_init(rdev->mn_hash);
1330 r = radeon_gem_init(rdev); 1328 r = radeon_gem_init(rdev);
1331 if (r) 1329 if (r)
1332 return r; 1330 return r;
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c
index 5838162f687f..431e6b64b77d 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -35,6 +35,7 @@
35#include <linux/module.h> 35#include <linux/module.h>
36#include <linux/pm_runtime.h> 36#include <linux/pm_runtime.h>
37#include <linux/vga_switcheroo.h> 37#include <linux/vga_switcheroo.h>
38#include <linux/mmu_notifier.h>
38 39
39#include <drm/drm_crtc_helper.h> 40#include <drm/drm_crtc_helper.h>
40#include <drm/drm_drv.h> 41#include <drm/drm_drv.h>
@@ -623,6 +624,7 @@ static void __exit radeon_exit(void)
623{ 624{
624 pci_unregister_driver(pdriver); 625 pci_unregister_driver(pdriver);
625 radeon_unregister_atpx_handler(); 626 radeon_unregister_atpx_handler();
627 mmu_notifier_synchronize();
626} 628}
627 629
628module_init(radeon_init); 630module_init(radeon_init);
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c
index 6902f998ede9..dbab9a3a969b 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -37,17 +37,8 @@
37#include "radeon.h" 37#include "radeon.h"
38 38
39struct radeon_mn { 39struct radeon_mn {
40 /* constant after initialisation */
41 struct radeon_device *rdev;
42 struct mm_struct *mm;
43 struct mmu_notifier mn; 40 struct mmu_notifier mn;
44 41
45 /* only used on destruction */
46 struct work_struct work;
47
48 /* protected by rdev->mn_lock */
49 struct hlist_node node;
50
51 /* objects protected by lock */ 42 /* objects protected by lock */
52 struct mutex lock; 43 struct mutex lock;
53 struct rb_root_cached objects; 44 struct rb_root_cached objects;
@@ -59,55 +50,6 @@ struct radeon_mn_node {
59}; 50};
60 51
61/** 52/**
62 * radeon_mn_destroy - destroy the rmn
63 *
 64 * @work: previously scheduled work item
 65 *
 66 * Lazily destroys the notifier from a work item
67 */
68static void radeon_mn_destroy(struct work_struct *work)
69{
70 struct radeon_mn *rmn = container_of(work, struct radeon_mn, work);
71 struct radeon_device *rdev = rmn->rdev;
72 struct radeon_mn_node *node, *next_node;
73 struct radeon_bo *bo, *next_bo;
74
75 mutex_lock(&rdev->mn_lock);
76 mutex_lock(&rmn->lock);
77 hash_del(&rmn->node);
78 rbtree_postorder_for_each_entry_safe(node, next_node,
79 &rmn->objects.rb_root, it.rb) {
80
81 interval_tree_remove(&node->it, &rmn->objects);
82 list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) {
83 bo->mn = NULL;
84 list_del_init(&bo->mn_list);
85 }
86 kfree(node);
87 }
88 mutex_unlock(&rmn->lock);
89 mutex_unlock(&rdev->mn_lock);
90 mmu_notifier_unregister(&rmn->mn, rmn->mm);
91 kfree(rmn);
92}
93
94/**
95 * radeon_mn_release - callback to notify about mm destruction
96 *
97 * @mn: our notifier
 98 * @mm: the mm this callback is about
 99 *
100 * Schedule a work item to lazily destroy our notifier.
101 */
102static void radeon_mn_release(struct mmu_notifier *mn,
103 struct mm_struct *mm)
104{
105 struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn);
106 INIT_WORK(&rmn->work, radeon_mn_destroy);
107 schedule_work(&rmn->work);
108}
109
110/**
111 * radeon_mn_invalidate_range_start - callback to notify about mm change 53 * radeon_mn_invalidate_range_start - callback to notify about mm change
112 * 54 *
113 * @mn: our notifier 55 * @mn: our notifier
@@ -183,65 +125,44 @@ out_unlock:
183 return ret; 125 return ret;
184} 126}
185 127
186static const struct mmu_notifier_ops radeon_mn_ops = { 128static void radeon_mn_release(struct mmu_notifier *mn, struct mm_struct *mm)
187 .release = radeon_mn_release, 129{
188 .invalidate_range_start = radeon_mn_invalidate_range_start, 130 struct mmu_notifier_range range = {
189}; 131 .mm = mm,
132 .start = 0,
133 .end = ULONG_MAX,
134 .flags = 0,
135 .event = MMU_NOTIFY_UNMAP,
136 };
137
138 radeon_mn_invalidate_range_start(mn, &range);
139}
190 140
191/** 141static struct mmu_notifier *radeon_mn_alloc_notifier(struct mm_struct *mm)
192 * radeon_mn_get - create notifier context
193 *
194 * @rdev: radeon device pointer
195 *
196 * Creates a notifier context for current->mm.
197 */
198static struct radeon_mn *radeon_mn_get(struct radeon_device *rdev)
199{ 142{
200 struct mm_struct *mm = current->mm;
201 struct radeon_mn *rmn; 143 struct radeon_mn *rmn;
202 int r;
203
204 if (down_write_killable(&mm->mmap_sem))
205 return ERR_PTR(-EINTR);
206
207 mutex_lock(&rdev->mn_lock);
208
209 hash_for_each_possible(rdev->mn_hash, rmn, node, (unsigned long)mm)
210 if (rmn->mm == mm)
211 goto release_locks;
212 144
213 rmn = kzalloc(sizeof(*rmn), GFP_KERNEL); 145 rmn = kzalloc(sizeof(*rmn), GFP_KERNEL);
214 if (!rmn) { 146 if (!rmn)
215 rmn = ERR_PTR(-ENOMEM); 147 return ERR_PTR(-ENOMEM);
216 goto release_locks;
217 }
218 148
219 rmn->rdev = rdev;
220 rmn->mm = mm;
221 rmn->mn.ops = &radeon_mn_ops;
222 mutex_init(&rmn->lock); 149 mutex_init(&rmn->lock);
223 rmn->objects = RB_ROOT_CACHED; 150 rmn->objects = RB_ROOT_CACHED;
224 151 return &rmn->mn;
225 r = __mmu_notifier_register(&rmn->mn, mm); 152}
226 if (r)
227 goto free_rmn;
228
229 hash_add(rdev->mn_hash, &rmn->node, (unsigned long)mm);
230
231release_locks:
232 mutex_unlock(&rdev->mn_lock);
233 up_write(&mm->mmap_sem);
234
235 return rmn;
236
237free_rmn:
238 mutex_unlock(&rdev->mn_lock);
239 up_write(&mm->mmap_sem);
240 kfree(rmn);
241 153
242 return ERR_PTR(r); 154static void radeon_mn_free_notifier(struct mmu_notifier *mn)
155{
156 kfree(container_of(mn, struct radeon_mn, mn));
243} 157}
244 158
159static const struct mmu_notifier_ops radeon_mn_ops = {
160 .release = radeon_mn_release,
161 .invalidate_range_start = radeon_mn_invalidate_range_start,
162 .alloc_notifier = radeon_mn_alloc_notifier,
163 .free_notifier = radeon_mn_free_notifier,
164};
165
245/** 166/**
246 * radeon_mn_register - register a BO for notifier updates 167 * radeon_mn_register - register a BO for notifier updates
247 * 168 *
@@ -254,15 +175,16 @@ free_rmn:
254int radeon_mn_register(struct radeon_bo *bo, unsigned long addr) 175int radeon_mn_register(struct radeon_bo *bo, unsigned long addr)
255{ 176{
256 unsigned long end = addr + radeon_bo_size(bo) - 1; 177 unsigned long end = addr + radeon_bo_size(bo) - 1;
257 struct radeon_device *rdev = bo->rdev; 178 struct mmu_notifier *mn;
258 struct radeon_mn *rmn; 179 struct radeon_mn *rmn;
259 struct radeon_mn_node *node = NULL; 180 struct radeon_mn_node *node = NULL;
260 struct list_head bos; 181 struct list_head bos;
261 struct interval_tree_node *it; 182 struct interval_tree_node *it;
262 183
263 rmn = radeon_mn_get(rdev); 184 mn = mmu_notifier_get(&radeon_mn_ops, current->mm);
264 if (IS_ERR(rmn)) 185 if (IS_ERR(mn))
265 return PTR_ERR(rmn); 186 return PTR_ERR(mn);
187 rmn = container_of(mn, struct radeon_mn, mn);
266 188
267 INIT_LIST_HEAD(&bos); 189 INIT_LIST_HEAD(&bos);
268 190
@@ -309,22 +231,16 @@ int radeon_mn_register(struct radeon_bo *bo, unsigned long addr)
309 */ 231 */
310void radeon_mn_unregister(struct radeon_bo *bo) 232void radeon_mn_unregister(struct radeon_bo *bo)
311{ 233{
312 struct radeon_device *rdev = bo->rdev; 234 struct radeon_mn *rmn = bo->mn;
313 struct radeon_mn *rmn;
314 struct list_head *head; 235 struct list_head *head;
315 236
316 mutex_lock(&rdev->mn_lock); 237 if (!rmn)
317 rmn = bo->mn;
318 if (rmn == NULL) {
319 mutex_unlock(&rdev->mn_lock);
320 return; 238 return;
321 }
322 239
323 mutex_lock(&rmn->lock); 240 mutex_lock(&rmn->lock);
324 /* save the next list entry for later */ 241 /* save the next list entry for later */
325 head = bo->mn_list.next; 242 head = bo->mn_list.next;
326 243
327 bo->mn = NULL;
328 list_del(&bo->mn_list); 244 list_del(&bo->mn_list);
329 245
330 if (list_empty(head)) { 246 if (list_empty(head)) {
@@ -335,5 +251,7 @@ void radeon_mn_unregister(struct radeon_bo *bo)
335 } 251 }
336 252
337 mutex_unlock(&rmn->lock); 253 mutex_unlock(&rmn->lock);
338 mutex_unlock(&rdev->mn_lock); 254
255 mmu_notifier_put(&rmn->mn);
256 bo->mn = NULL;
339} 257}
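
The radeon conversion above illustrates the new attachment idiom: mmu_notifier_get() returns the notifier already registered for current->mm or calls ->alloc_notifier() to create one, and the final mmu_notifier_put() arranges for ->free_notifier() to run after an SRCU grace period. A stripped-down sketch of the idiom for a hypothetical wrapper struct; my_mn and my_mn_get() are illustrative names, and the invalidation callbacks a real user would also provide are omitted.

#include <linux/err.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <linux/slab.h>

struct my_mn {
	struct mmu_notifier mn;
	/* per-mm driver state (locks, interval tree, ...) goes here */
};

static struct mmu_notifier *my_mn_alloc_notifier(struct mm_struct *mm)
{
	struct my_mn *p = kzalloc(sizeof(*p), GFP_KERNEL);

	return p ? &p->mn : ERR_PTR(-ENOMEM);
}

static void my_mn_free_notifier(struct mmu_notifier *mn)
{
	kfree(container_of(mn, struct my_mn, mn));
}

static const struct mmu_notifier_ops my_mn_ops = {
	.alloc_notifier = my_mn_alloc_notifier,
	.free_notifier = my_mn_free_notifier,
};

static struct my_mn *my_mn_get(void)
{
	struct mmu_notifier *mn = mmu_notifier_get(&my_mn_ops, current->mm);

	if (IS_ERR(mn))
		return ERR_CAST(mn);
	return container_of(mn, struct my_mn, mn);	/* shared per mm */
}

/* drop with mmu_notifier_put(&p->mn); ->free_notifier() runs after SRCU */
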
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 85e103b147cc..b44b1c322ec8 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -55,6 +55,7 @@ config INFINIBAND_ON_DEMAND_PAGING
55 bool "InfiniBand on-demand paging support" 55 bool "InfiniBand on-demand paging support"
56 depends on INFINIBAND_USER_MEM 56 depends on INFINIBAND_USER_MEM
57 select MMU_NOTIFIER 57 select MMU_NOTIFIER
58 select INTERVAL_TREE
58 default y 59 default y
59 ---help--- 60 ---help---
60 On demand paging support for the InfiniBand subsystem. 61 On demand paging support for the InfiniBand subsystem.
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index ea8661a00651..b5631b8a0397 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -2562,6 +2562,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
2562 SET_DEVICE_OP(dev_ops, get_vf_config); 2562 SET_DEVICE_OP(dev_ops, get_vf_config);
2563 SET_DEVICE_OP(dev_ops, get_vf_stats); 2563 SET_DEVICE_OP(dev_ops, get_vf_stats);
2564 SET_DEVICE_OP(dev_ops, init_port); 2564 SET_DEVICE_OP(dev_ops, init_port);
2565 SET_DEVICE_OP(dev_ops, invalidate_range);
2565 SET_DEVICE_OP(dev_ops, iw_accept); 2566 SET_DEVICE_OP(dev_ops, iw_accept);
2566 SET_DEVICE_OP(dev_ops, iw_add_ref); 2567 SET_DEVICE_OP(dev_ops, iw_add_ref);
2567 SET_DEVICE_OP(dev_ops, iw_connect); 2568 SET_DEVICE_OP(dev_ops, iw_connect);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 56553668256f..41f9e268e3fb 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -184,9 +184,6 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz);
184/** 184/**
185 * ib_umem_get - Pin and DMA map userspace memory. 185 * ib_umem_get - Pin and DMA map userspace memory.
186 * 186 *
187 * If access flags indicate ODP memory, avoid pinning. Instead, stores
188 * the mm for future page fault handling in conjunction with MMU notifiers.
189 *
190 * @udata: userspace context to pin memory for 187 * @udata: userspace context to pin memory for
191 * @addr: userspace virtual address to start at 188 * @addr: userspace virtual address to start at
192 * @size: length of region to pin 189 * @size: length of region to pin
@@ -231,36 +228,19 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
231 if (!can_do_mlock()) 228 if (!can_do_mlock())
232 return ERR_PTR(-EPERM); 229 return ERR_PTR(-EPERM);
233 230
234 if (access & IB_ACCESS_ON_DEMAND) { 231 if (access & IB_ACCESS_ON_DEMAND)
235 umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); 232 return ERR_PTR(-EOPNOTSUPP);
236 if (!umem)
237 return ERR_PTR(-ENOMEM);
238 umem->is_odp = 1;
239 } else {
240 umem = kzalloc(sizeof(*umem), GFP_KERNEL);
241 if (!umem)
242 return ERR_PTR(-ENOMEM);
243 }
244 233
245 umem->context = context; 234 umem = kzalloc(sizeof(*umem), GFP_KERNEL);
235 if (!umem)
236 return ERR_PTR(-ENOMEM);
237 umem->ibdev = context->device;
246 umem->length = size; 238 umem->length = size;
247 umem->address = addr; 239 umem->address = addr;
248 umem->writable = ib_access_writable(access); 240 umem->writable = ib_access_writable(access);
249 umem->owning_mm = mm = current->mm; 241 umem->owning_mm = mm = current->mm;
250 mmgrab(mm); 242 mmgrab(mm);
251 243
252 if (access & IB_ACCESS_ON_DEMAND) {
253 if (WARN_ON_ONCE(!context->invalidate_range)) {
254 ret = -EINVAL;
255 goto umem_kfree;
256 }
257
258 ret = ib_umem_odp_get(to_ib_umem_odp(umem), access);
259 if (ret)
260 goto umem_kfree;
261 return umem;
262 }
263
264 page_list = (struct page **) __get_free_page(GFP_KERNEL); 244 page_list = (struct page **) __get_free_page(GFP_KERNEL);
265 if (!page_list) { 245 if (!page_list) {
266 ret = -ENOMEM; 246 ret = -ENOMEM;
@@ -346,15 +326,6 @@ umem_kfree:
346} 326}
347EXPORT_SYMBOL(ib_umem_get); 327EXPORT_SYMBOL(ib_umem_get);
348 328
349static void __ib_umem_release_tail(struct ib_umem *umem)
350{
351 mmdrop(umem->owning_mm);
352 if (umem->is_odp)
353 kfree(to_ib_umem_odp(umem));
354 else
355 kfree(umem);
356}
357
358/** 329/**
359 * ib_umem_release - release memory pinned with ib_umem_get 330 * ib_umem_release - release memory pinned with ib_umem_get
360 * @umem: umem struct to release 331 * @umem: umem struct to release
@@ -363,17 +334,14 @@ void ib_umem_release(struct ib_umem *umem)
363{ 334{
364 if (!umem) 335 if (!umem)
365 return; 336 return;
337 if (umem->is_odp)
338 return ib_umem_odp_release(to_ib_umem_odp(umem));
366 339
367 if (umem->is_odp) { 340 __ib_umem_release(umem->ibdev, umem, 1);
368 ib_umem_odp_release(to_ib_umem_odp(umem));
369 __ib_umem_release_tail(umem);
370 return;
371 }
372
373 __ib_umem_release(umem->context->device, umem, 1);
374 341
375 atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); 342 atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
376 __ib_umem_release_tail(umem); 343 mmdrop(umem->owning_mm);
344 kfree(umem);
377} 345}
378EXPORT_SYMBOL(ib_umem_release); 346EXPORT_SYMBOL(ib_umem_release);
379 347
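
After this change ib_umem_get() refuses IB_ACCESS_ON_DEMAND with -EOPNOTSUPP and drivers obtain ODP umems through ib_umem_odp_get() themselves (see the mlx5 mr.c hunk further down), while ib_umem_release() dispatches on is_odp so a single release call still covers both kinds. A small sketch of that pairing for a hypothetical driver helper, using only the calls shown in this series:

#include <linux/err.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>

static struct ib_umem *my_drv_get_umem(struct ib_udata *udata,
				       unsigned long addr, size_t len,
				       int access)
{
	if (access & IB_ACCESS_ON_DEMAND) {
		struct ib_umem_odp *odp =
			ib_umem_odp_get(udata, addr, len, access);

		return IS_ERR(odp) ? ERR_CAST(odp) : &odp->umem;
	}
	return ib_umem_get(udata, addr, len, access, 0);
}

/*
 * Release side: ib_umem_release() checks is_odp and forwards to
 * ib_umem_odp_release(), so the same call works for either kind.
 */
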
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index c0e15db34680..9aebe9ce8b07 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -39,44 +39,14 @@
39#include <linux/export.h> 39#include <linux/export.h>
40#include <linux/vmalloc.h> 40#include <linux/vmalloc.h>
41#include <linux/hugetlb.h> 41#include <linux/hugetlb.h>
42#include <linux/interval_tree_generic.h> 42#include <linux/interval_tree.h>
43#include <linux/pagemap.h> 43#include <linux/pagemap.h>
44 44
45#include <rdma/ib_verbs.h> 45#include <rdma/ib_verbs.h>
46#include <rdma/ib_umem.h> 46#include <rdma/ib_umem.h>
47#include <rdma/ib_umem_odp.h> 47#include <rdma/ib_umem_odp.h>
48 48
49/* 49#include "uverbs.h"
50 * The ib_umem list keeps track of memory regions for which the HW
51 * device request to receive notification when the related memory
52 * mapping is changed.
53 *
54 * ib_umem_lock protects the list.
55 */
56
57static u64 node_start(struct umem_odp_node *n)
58{
59 struct ib_umem_odp *umem_odp =
60 container_of(n, struct ib_umem_odp, interval_tree);
61
62 return ib_umem_start(umem_odp);
63}
64
65/* Note that the representation of the intervals in the interval tree
66 * considers the ending point as contained in the interval, while the
67 * function ib_umem_end returns the first address which is not contained
68 * in the umem.
69 */
70static u64 node_last(struct umem_odp_node *n)
71{
72 struct ib_umem_odp *umem_odp =
73 container_of(n, struct ib_umem_odp, interval_tree);
74
75 return ib_umem_end(umem_odp) - 1;
76}
77
78INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
79 node_start, node_last, static, rbt_ib_umem)
80 50
81static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) 51static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp)
82{ 52{
@@ -104,31 +74,34 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp)
104 mutex_unlock(&umem_odp->umem_mutex); 74 mutex_unlock(&umem_odp->umem_mutex);
105} 75}
106 76
107static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp,
108 u64 start, u64 end, void *cookie)
109{
110 /*
111 * Increase the number of notifiers running, to
112 * prevent any further fault handling on this MR.
113 */
114 ib_umem_notifier_start_account(umem_odp);
115 complete_all(&umem_odp->notifier_completion);
116 umem_odp->umem.context->invalidate_range(
117 umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp));
118 return 0;
119}
120
121static void ib_umem_notifier_release(struct mmu_notifier *mn, 77static void ib_umem_notifier_release(struct mmu_notifier *mn,
122 struct mm_struct *mm) 78 struct mm_struct *mm)
123{ 79{
124 struct ib_ucontext_per_mm *per_mm = 80 struct ib_ucontext_per_mm *per_mm =
125 container_of(mn, struct ib_ucontext_per_mm, mn); 81 container_of(mn, struct ib_ucontext_per_mm, mn);
82 struct rb_node *node;
126 83
127 down_read(&per_mm->umem_rwsem); 84 down_read(&per_mm->umem_rwsem);
128 if (per_mm->active) 85 if (!per_mm->mn.users)
129 rbt_ib_umem_for_each_in_range( 86 goto out;
130 &per_mm->umem_tree, 0, ULLONG_MAX, 87
131 ib_umem_notifier_release_trampoline, true, NULL); 88 for (node = rb_first_cached(&per_mm->umem_tree); node;
89 node = rb_next(node)) {
90 struct ib_umem_odp *umem_odp =
91 rb_entry(node, struct ib_umem_odp, interval_tree.rb);
92
93 /*
94 * Increase the number of notifiers running, to prevent any
95 * further fault handling on this MR.
96 */
97 ib_umem_notifier_start_account(umem_odp);
98 complete_all(&umem_odp->notifier_completion);
99 umem_odp->umem.ibdev->ops.invalidate_range(
100 umem_odp, ib_umem_start(umem_odp),
101 ib_umem_end(umem_odp));
102 }
103
104out:
132 up_read(&per_mm->umem_rwsem); 105 up_read(&per_mm->umem_rwsem);
133} 106}
134 107
@@ -136,7 +109,7 @@ static int invalidate_range_start_trampoline(struct ib_umem_odp *item,
136 u64 start, u64 end, void *cookie) 109 u64 start, u64 end, void *cookie)
137{ 110{
138 ib_umem_notifier_start_account(item); 111 ib_umem_notifier_start_account(item);
139 item->umem.context->invalidate_range(item, start, end); 112 item->umem.ibdev->ops.invalidate_range(item, start, end);
140 return 0; 113 return 0;
141} 114}
142 115
@@ -152,10 +125,10 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
152 else if (!down_read_trylock(&per_mm->umem_rwsem)) 125 else if (!down_read_trylock(&per_mm->umem_rwsem))
153 return -EAGAIN; 126 return -EAGAIN;
154 127
155 if (!per_mm->active) { 128 if (!per_mm->mn.users) {
156 up_read(&per_mm->umem_rwsem); 129 up_read(&per_mm->umem_rwsem);
157 /* 130 /*
158 * At this point active is permanently set and visible to this 131 * At this point users is permanently zero and visible to this
159 * CPU without a lock, that fact is relied on to skip the unlock 132 * CPU without a lock, that fact is relied on to skip the unlock
160 * in range_end. 133 * in range_end.
161 */ 134 */
@@ -185,7 +158,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
185 struct ib_ucontext_per_mm *per_mm = 158 struct ib_ucontext_per_mm *per_mm =
186 container_of(mn, struct ib_ucontext_per_mm, mn); 159 container_of(mn, struct ib_ucontext_per_mm, mn);
187 160
188 if (unlikely(!per_mm->active)) 161 if (unlikely(!per_mm->mn.users))
189 return; 162 return;
190 163
191 rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, 164 rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start,
@@ -194,212 +167,250 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
194 up_read(&per_mm->umem_rwsem); 167 up_read(&per_mm->umem_rwsem);
195} 168}
196 169
197static const struct mmu_notifier_ops ib_umem_notifiers = { 170static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm)
198 .release = ib_umem_notifier_release,
199 .invalidate_range_start = ib_umem_notifier_invalidate_range_start,
200 .invalidate_range_end = ib_umem_notifier_invalidate_range_end,
201};
202
203static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp)
204{
205 struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
206
207 down_write(&per_mm->umem_rwsem);
208 if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp)))
209 rbt_ib_umem_insert(&umem_odp->interval_tree,
210 &per_mm->umem_tree);
211 up_write(&per_mm->umem_rwsem);
212}
213
214static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp)
215{
216 struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
217
218 down_write(&per_mm->umem_rwsem);
219 if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp)))
220 rbt_ib_umem_remove(&umem_odp->interval_tree,
221 &per_mm->umem_tree);
222 complete_all(&umem_odp->notifier_completion);
223
224 up_write(&per_mm->umem_rwsem);
225}
226
227static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,
228 struct mm_struct *mm)
229{ 171{
230 struct ib_ucontext_per_mm *per_mm; 172 struct ib_ucontext_per_mm *per_mm;
231 int ret;
232 173
233 per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); 174 per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL);
234 if (!per_mm) 175 if (!per_mm)
235 return ERR_PTR(-ENOMEM); 176 return ERR_PTR(-ENOMEM);
236 177
237 per_mm->context = ctx;
238 per_mm->mm = mm;
239 per_mm->umem_tree = RB_ROOT_CACHED; 178 per_mm->umem_tree = RB_ROOT_CACHED;
240 init_rwsem(&per_mm->umem_rwsem); 179 init_rwsem(&per_mm->umem_rwsem);
241 per_mm->active = true;
242 180
181 WARN_ON(mm != current->mm);
243 rcu_read_lock(); 182 rcu_read_lock();
244 per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); 183 per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
245 rcu_read_unlock(); 184 rcu_read_unlock();
185 return &per_mm->mn;
186}
246 187
247 WARN_ON(mm != current->mm); 188static void ib_umem_free_notifier(struct mmu_notifier *mn)
248 189{
249 per_mm->mn.ops = &ib_umem_notifiers; 190 struct ib_ucontext_per_mm *per_mm =
250 ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); 191 container_of(mn, struct ib_ucontext_per_mm, mn);
251 if (ret) {
252 dev_err(&ctx->device->dev,
253 "Failed to register mmu_notifier %d\n", ret);
254 goto out_pid;
255 }
256 192
257 list_add(&per_mm->ucontext_list, &ctx->per_mm_list); 193 WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root));
258 return per_mm;
259 194
260out_pid:
261 put_pid(per_mm->tgid); 195 put_pid(per_mm->tgid);
262 kfree(per_mm); 196 kfree(per_mm);
263 return ERR_PTR(ret);
264} 197}
265 198
266static int get_per_mm(struct ib_umem_odp *umem_odp) 199static const struct mmu_notifier_ops ib_umem_notifiers = {
200 .release = ib_umem_notifier_release,
201 .invalidate_range_start = ib_umem_notifier_invalidate_range_start,
202 .invalidate_range_end = ib_umem_notifier_invalidate_range_end,
203 .alloc_notifier = ib_umem_alloc_notifier,
204 .free_notifier = ib_umem_free_notifier,
205};
206
207static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp)
267{ 208{
268 struct ib_ucontext *ctx = umem_odp->umem.context;
269 struct ib_ucontext_per_mm *per_mm; 209 struct ib_ucontext_per_mm *per_mm;
210 struct mmu_notifier *mn;
211 int ret;
270 212
271 /* 213 umem_odp->umem.is_odp = 1;
272 * Generally speaking we expect only one or two per_mm in this list, 214 if (!umem_odp->is_implicit_odp) {
273 * so no reason to optimize this search today. 215 size_t page_size = 1UL << umem_odp->page_shift;
274 */ 216 size_t pages;
275 mutex_lock(&ctx->per_mm_list_lock); 217
276 list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { 218 umem_odp->interval_tree.start =
277 if (per_mm->mm == umem_odp->umem.owning_mm) 219 ALIGN_DOWN(umem_odp->umem.address, page_size);
278 goto found; 220 if (check_add_overflow(umem_odp->umem.address,
221 umem_odp->umem.length,
222 &umem_odp->interval_tree.last))
223 return -EOVERFLOW;
224 umem_odp->interval_tree.last =
225 ALIGN(umem_odp->interval_tree.last, page_size);
226 if (unlikely(umem_odp->interval_tree.last < page_size))
227 return -EOVERFLOW;
228
229 pages = (umem_odp->interval_tree.last -
230 umem_odp->interval_tree.start) >>
231 umem_odp->page_shift;
232 if (!pages)
233 return -EINVAL;
234
235 /*
236 * Note that the representation of the intervals in the
237 * interval tree considers the ending point as contained in
238 * the interval.
239 */
240 umem_odp->interval_tree.last--;
241
242 umem_odp->page_list = kvcalloc(
243 pages, sizeof(*umem_odp->page_list), GFP_KERNEL);
244 if (!umem_odp->page_list)
245 return -ENOMEM;
246
247 umem_odp->dma_list = kvcalloc(
248 pages, sizeof(*umem_odp->dma_list), GFP_KERNEL);
249 if (!umem_odp->dma_list) {
250 ret = -ENOMEM;
251 goto out_page_list;
252 }
279 } 253 }
280 254
281 per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm); 255 mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm);
282 if (IS_ERR(per_mm)) { 256 if (IS_ERR(mn)) {
283 mutex_unlock(&ctx->per_mm_list_lock); 257 ret = PTR_ERR(mn);
284 return PTR_ERR(per_mm); 258 goto out_dma_list;
285 } 259 }
260 umem_odp->per_mm = per_mm =
261 container_of(mn, struct ib_ucontext_per_mm, mn);
286 262
287found: 263 mutex_init(&umem_odp->umem_mutex);
288 umem_odp->per_mm = per_mm; 264 init_completion(&umem_odp->notifier_completion);
289 per_mm->odp_mrs_count++; 265
290 mutex_unlock(&ctx->per_mm_list_lock); 266 if (!umem_odp->is_implicit_odp) {
267 down_write(&per_mm->umem_rwsem);
268 interval_tree_insert(&umem_odp->interval_tree,
269 &per_mm->umem_tree);
270 up_write(&per_mm->umem_rwsem);
271 }
272 mmgrab(umem_odp->umem.owning_mm);
291 273
292 return 0; 274 return 0;
293}
294 275
295static void free_per_mm(struct rcu_head *rcu) 276out_dma_list:
296{ 277 kvfree(umem_odp->dma_list);
297 kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); 278out_page_list:
279 kvfree(umem_odp->page_list);
280 return ret;
298} 281}
299 282
300static void put_per_mm(struct ib_umem_odp *umem_odp) 283/**
284 * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
285 *
286 * Implicit ODP umems do not have a VA range and do not have any page lists.
287 * They exist only to hold the per_mm reference to help the driver create
288 * children umems.
289 *
290 * @udata: udata from the syscall being used to create the umem
291 * @access: ib_reg_mr access flags
292 */
293struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
294 int access)
301{ 295{
302 struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; 296 struct ib_ucontext *context =
303 struct ib_ucontext *ctx = umem_odp->umem.context; 297 container_of(udata, struct uverbs_attr_bundle, driver_udata)
304 bool need_free; 298 ->context;
305 299 struct ib_umem *umem;
306 mutex_lock(&ctx->per_mm_list_lock); 300 struct ib_umem_odp *umem_odp;
307 umem_odp->per_mm = NULL; 301 int ret;
308 per_mm->odp_mrs_count--;
309 need_free = per_mm->odp_mrs_count == 0;
310 if (need_free)
311 list_del(&per_mm->ucontext_list);
312 mutex_unlock(&ctx->per_mm_list_lock);
313
314 if (!need_free)
315 return;
316 302
317 /* 303 if (access & IB_ACCESS_HUGETLB)
318 * NOTE! mmu_notifier_unregister() can happen between a start/end 304 return ERR_PTR(-EINVAL);
319 * callback, resulting in an start/end, and thus an unbalanced
320 * lock. This doesn't really matter to us since we are about to kfree
321 * the memory that holds the lock, however LOCKDEP doesn't like this.
322 */
323 down_write(&per_mm->umem_rwsem);
324 per_mm->active = false;
325 up_write(&per_mm->umem_rwsem);
326 305
327 WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); 306 if (!context)
328 mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); 307 return ERR_PTR(-EIO);
329 put_pid(per_mm->tgid); 308 if (WARN_ON_ONCE(!context->device->ops.invalidate_range))
330 mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); 309 return ERR_PTR(-EINVAL);
310
311 umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
312 if (!umem_odp)
313 return ERR_PTR(-ENOMEM);
314 umem = &umem_odp->umem;
315 umem->ibdev = context->device;
316 umem->writable = ib_access_writable(access);
317 umem->owning_mm = current->mm;
318 umem_odp->is_implicit_odp = 1;
319 umem_odp->page_shift = PAGE_SHIFT;
320
321 ret = ib_init_umem_odp(umem_odp);
322 if (ret) {
323 kfree(umem_odp);
324 return ERR_PTR(ret);
325 }
326 return umem_odp;
331} 327}
328EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
332 329
333struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, 330/**
334 unsigned long addr, size_t size) 331 * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
332 * parent ODP umem
333 *
334 * @root: The parent umem enclosing the child. This must be allocated using
335 * ib_umem_odp_alloc_implicit()
336 * @addr: The starting userspace VA
337 * @size: The length of the userspace VA
338 */
339struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root,
340 unsigned long addr, size_t size)
335{ 341{
336 struct ib_ucontext_per_mm *per_mm = root->per_mm; 342 /*
337 struct ib_ucontext *ctx = per_mm->context; 343 * Caller must ensure that root cannot be freed during the call to
344 * ib_umem_odp_alloc_child().
345 */
338 struct ib_umem_odp *odp_data; 346 struct ib_umem_odp *odp_data;
339 struct ib_umem *umem; 347 struct ib_umem *umem;
340 int pages = size >> PAGE_SHIFT;
341 int ret; 348 int ret;
342 349
350 if (WARN_ON(!root->is_implicit_odp))
351 return ERR_PTR(-EINVAL);
352
343 odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); 353 odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
344 if (!odp_data) 354 if (!odp_data)
345 return ERR_PTR(-ENOMEM); 355 return ERR_PTR(-ENOMEM);
346 umem = &odp_data->umem; 356 umem = &odp_data->umem;
347 umem->context = ctx; 357 umem->ibdev = root->umem.ibdev;
348 umem->length = size; 358 umem->length = size;
349 umem->address = addr; 359 umem->address = addr;
350 odp_data->page_shift = PAGE_SHIFT;
351 umem->writable = root->umem.writable; 360 umem->writable = root->umem.writable;
352 umem->is_odp = 1; 361 umem->owning_mm = root->umem.owning_mm;
353 odp_data->per_mm = per_mm; 362 odp_data->page_shift = PAGE_SHIFT;
354 umem->owning_mm = per_mm->mm;
355 mmgrab(umem->owning_mm);
356
357 mutex_init(&odp_data->umem_mutex);
358 init_completion(&odp_data->notifier_completion);
359
360 odp_data->page_list =
361 vzalloc(array_size(pages, sizeof(*odp_data->page_list)));
362 if (!odp_data->page_list) {
363 ret = -ENOMEM;
364 goto out_odp_data;
365 }
366 363
367 odp_data->dma_list = 364 ret = ib_init_umem_odp(odp_data);
368 vzalloc(array_size(pages, sizeof(*odp_data->dma_list))); 365 if (ret) {
369 if (!odp_data->dma_list) { 366 kfree(odp_data);
370 ret = -ENOMEM; 367 return ERR_PTR(ret);
371 goto out_page_list;
372 } 368 }
373
374 /*
375 * Caller must ensure that the umem_odp that the per_mm came from
376 * cannot be freed during the call to ib_alloc_odp_umem.
377 */
378 mutex_lock(&ctx->per_mm_list_lock);
379 per_mm->odp_mrs_count++;
380 mutex_unlock(&ctx->per_mm_list_lock);
381 add_umem_to_per_mm(odp_data);
382
383 return odp_data; 369 return odp_data;
384
385out_page_list:
386 vfree(odp_data->page_list);
387out_odp_data:
388 mmdrop(umem->owning_mm);
389 kfree(odp_data);
390 return ERR_PTR(ret);
391} 370}
392EXPORT_SYMBOL(ib_alloc_odp_umem); 371EXPORT_SYMBOL(ib_umem_odp_alloc_child);
393 372
394int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) 373/**
374 * ib_umem_odp_get - Create a umem_odp for a userspace va
375 *
376 * @udata: userspace context to pin memory for
377 * @addr: userspace virtual address to start at
378 * @size: length of region to pin
379 * @access: IB_ACCESS_xxx flags for memory being pinned
380 *
381 * The driver should use this when the access flags indicate ODP memory. It
382 * avoids pinning; instead, it stores the mm for future page fault handling
383 * in conjunction with MMU notifiers.
384 */
385struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
386 size_t size, int access)
395{ 387{
396 struct ib_umem *umem = &umem_odp->umem; 388 struct ib_umem_odp *umem_odp;
397 /* 389 struct ib_ucontext *context;
398 * NOTE: This must called in a process context where umem->owning_mm 390 struct mm_struct *mm;
399 * == current->mm 391 int ret;
400 */ 392
401 struct mm_struct *mm = umem->owning_mm; 393 if (!udata)
402 int ret_val; 394 return ERR_PTR(-EIO);
395
396 context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
397 ->context;
398 if (!context)
399 return ERR_PTR(-EIO);
400
401 if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) ||
402 WARN_ON_ONCE(!context->device->ops.invalidate_range))
403 return ERR_PTR(-EINVAL);
404
405 umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
406 if (!umem_odp)
407 return ERR_PTR(-ENOMEM);
408
409 umem_odp->umem.ibdev = context->device;
410 umem_odp->umem.length = size;
411 umem_odp->umem.address = addr;
412 umem_odp->umem.writable = ib_access_writable(access);
413 umem_odp->umem.owning_mm = mm = current->mm;
403 414
404 umem_odp->page_shift = PAGE_SHIFT; 415 umem_odp->page_shift = PAGE_SHIFT;
405 if (access & IB_ACCESS_HUGETLB) { 416 if (access & IB_ACCESS_HUGETLB) {
@@ -410,63 +421,63 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
410 vma = find_vma(mm, ib_umem_start(umem_odp)); 421 vma = find_vma(mm, ib_umem_start(umem_odp));
411 if (!vma || !is_vm_hugetlb_page(vma)) { 422 if (!vma || !is_vm_hugetlb_page(vma)) {
412 up_read(&mm->mmap_sem); 423 up_read(&mm->mmap_sem);
413 return -EINVAL; 424 ret = -EINVAL;
425 goto err_free;
414 } 426 }
415 h = hstate_vma(vma); 427 h = hstate_vma(vma);
416 umem_odp->page_shift = huge_page_shift(h); 428 umem_odp->page_shift = huge_page_shift(h);
417 up_read(&mm->mmap_sem); 429 up_read(&mm->mmap_sem);
418 } 430 }
419 431
420 mutex_init(&umem_odp->umem_mutex); 432 ret = ib_init_umem_odp(umem_odp);
421 433 if (ret)
422 init_completion(&umem_odp->notifier_completion); 434 goto err_free;
423 435 return umem_odp;
424 if (ib_umem_odp_num_pages(umem_odp)) {
425 umem_odp->page_list =
426 vzalloc(array_size(sizeof(*umem_odp->page_list),
427 ib_umem_odp_num_pages(umem_odp)));
428 if (!umem_odp->page_list)
429 return -ENOMEM;
430
431 umem_odp->dma_list =
432 vzalloc(array_size(sizeof(*umem_odp->dma_list),
433 ib_umem_odp_num_pages(umem_odp)));
434 if (!umem_odp->dma_list) {
435 ret_val = -ENOMEM;
436 goto out_page_list;
437 }
438 }
439
440 ret_val = get_per_mm(umem_odp);
441 if (ret_val)
442 goto out_dma_list;
443 add_umem_to_per_mm(umem_odp);
444
445 return 0;
446 436
447out_dma_list: 437err_free:
448 vfree(umem_odp->dma_list); 438 kfree(umem_odp);
449out_page_list: 439 return ERR_PTR(ret);
450 vfree(umem_odp->page_list);
451 return ret_val;
452} 440}
441EXPORT_SYMBOL(ib_umem_odp_get);
453 442
454void ib_umem_odp_release(struct ib_umem_odp *umem_odp) 443void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
455{ 444{
445 struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm;
446
456 /* 447 /*
457 * Ensure that no more pages are mapped in the umem. 448 * Ensure that no more pages are mapped in the umem.
458 * 449 *
459 * It is the driver's responsibility to ensure, before calling us, 450 * It is the driver's responsibility to ensure, before calling us,
460 * that the hardware will not attempt to access the MR any more. 451 * that the hardware will not attempt to access the MR any more.
461 */ 452 */
462 ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), 453 if (!umem_odp->is_implicit_odp) {
463 ib_umem_end(umem_odp)); 454 ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
455 ib_umem_end(umem_odp));
456 kvfree(umem_odp->dma_list);
457 kvfree(umem_odp->page_list);
458 }
464 459
465 remove_umem_from_per_mm(umem_odp); 460 down_write(&per_mm->umem_rwsem);
466 put_per_mm(umem_odp); 461 if (!umem_odp->is_implicit_odp) {
467 vfree(umem_odp->dma_list); 462 interval_tree_remove(&umem_odp->interval_tree,
468 vfree(umem_odp->page_list); 463 &per_mm->umem_tree);
464 complete_all(&umem_odp->notifier_completion);
465 }
466 /*
467 * NOTE! mmu_notifier_unregister() can happen between a start/end
468 * callback, resulting in a missing end, and thus an unbalanced
469 * lock. This doesn't really matter to us since we are about to kfree
470 * the memory that holds the lock, however LOCKDEP doesn't like this.
471 * Thus we call the mmu_notifier_put under the rwsem and test the
472 * internal users count to reliably see if we are past this point.
473 */
474 mmu_notifier_put(&per_mm->mn);
475 up_write(&per_mm->umem_rwsem);
476
477 mmdrop(umem_odp->umem.owning_mm);
478 kfree(umem_odp);
469} 479}
480EXPORT_SYMBOL(ib_umem_odp_release);
470 481
471/* 482/*
472 * Map for DMA and insert a single page into the on-demand paging page tables. 483 * Map for DMA and insert a single page into the on-demand paging page tables.
@@ -493,8 +504,7 @@ static int ib_umem_odp_map_dma_single_page(
493 u64 access_mask, 504 u64 access_mask,
494 unsigned long current_seq) 505 unsigned long current_seq)
495{ 506{
496 struct ib_ucontext *context = umem_odp->umem.context; 507 struct ib_device *dev = umem_odp->umem.ibdev;
497 struct ib_device *dev = context->device;
498 dma_addr_t dma_addr; 508 dma_addr_t dma_addr;
499 int remove_existing_mapping = 0; 509 int remove_existing_mapping = 0;
500 int ret = 0; 510 int ret = 0;
@@ -534,7 +544,7 @@ out:
534 544
535 if (remove_existing_mapping) { 545 if (remove_existing_mapping) {
536 ib_umem_notifier_start_account(umem_odp); 546 ib_umem_notifier_start_account(umem_odp);
537 context->invalidate_range( 547 dev->ops.invalidate_range(
538 umem_odp, 548 umem_odp,
539 ib_umem_start(umem_odp) + 549 ib_umem_start(umem_odp) +
540 (page_index << umem_odp->page_shift), 550 (page_index << umem_odp->page_shift),
@@ -707,7 +717,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
707{ 717{
708 int idx; 718 int idx;
709 u64 addr; 719 u64 addr;
710 struct ib_device *dev = umem_odp->umem.context->device; 720 struct ib_device *dev = umem_odp->umem.ibdev;
711 721
712 virt = max_t(u64, virt, ib_umem_start(umem_odp)); 722 virt = max_t(u64, virt, ib_umem_start(umem_odp));
713 bound = min_t(u64, bound, ib_umem_end(umem_odp)); 723 bound = min_t(u64, bound, ib_umem_end(umem_odp));
@@ -761,35 +771,21 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
761 void *cookie) 771 void *cookie)
762{ 772{
763 int ret_val = 0; 773 int ret_val = 0;
764 struct umem_odp_node *node, *next; 774 struct interval_tree_node *node, *next;
765 struct ib_umem_odp *umem; 775 struct ib_umem_odp *umem;
766 776
767 if (unlikely(start == last)) 777 if (unlikely(start == last))
768 return ret_val; 778 return ret_val;
769 779
770 for (node = rbt_ib_umem_iter_first(root, start, last - 1); 780 for (node = interval_tree_iter_first(root, start, last - 1);
771 node; node = next) { 781 node; node = next) {
772 /* TODO move the blockable decision up to the callback */ 782 /* TODO move the blockable decision up to the callback */
773 if (!blockable) 783 if (!blockable)
774 return -EAGAIN; 784 return -EAGAIN;
775 next = rbt_ib_umem_iter_next(node, start, last - 1); 785 next = interval_tree_iter_next(node, start, last - 1);
776 umem = container_of(node, struct ib_umem_odp, interval_tree); 786 umem = container_of(node, struct ib_umem_odp, interval_tree);
777 ret_val = cb(umem, start, last, cookie) || ret_val; 787 ret_val = cb(umem, start, last, cookie) || ret_val;
778 } 788 }
779 789
780 return ret_val; 790 return ret_val;
781} 791}
782EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range);
783
784struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root,
785 u64 addr, u64 length)
786{
787 struct umem_odp_node *node;
788
789 node = rbt_ib_umem_iter_first(root, addr, addr + length - 1);
790 if (node)
791 return container_of(node, struct ib_umem_odp, interval_tree);
792 return NULL;
793
794}
795EXPORT_SYMBOL(rbt_ib_umem_lookup);
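
With the private INTERVAL_TREE_DEFINE instantiation gone, ib_umem_odp embeds a plain struct interval_tree_node and the generic helpers do the work, with the stored 'last' being inclusive. A minimal sketch of a range query over a per-mm tree, in the same shape as rbt_ib_umem_for_each_in_range() above; my_count_overlapping_umems() is an illustrative name and the loop body is a placeholder for real invalidation or fault handling.

#include <linux/interval_tree.h>
#include <linux/printk.h>
#include <rdma/ib_umem_odp.h>

static int my_count_overlapping_umems(struct rb_root_cached *root,
				      unsigned long start, unsigned long last)
{
	struct interval_tree_node *node;
	int n = 0;

	/* 'last' is inclusive, matching how the tree entries are stored */
	for (node = interval_tree_iter_first(root, start, last); node;
	     node = interval_tree_iter_next(node, start, last)) {
		struct ib_umem_odp *umem_odp =
			container_of(node, struct ib_umem_odp, interval_tree);

		pr_debug("overlap: %lx-%lx\n", ib_umem_start(umem_odp),
			 ib_umem_end(umem_odp));
		n++;
	}
	return n;
}
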
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 7ddd0e5bc6b3..7c10dfe417a4 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -252,9 +252,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
252 ucontext->closing = false; 252 ucontext->closing = false;
253 ucontext->cleanup_retryable = false; 253 ucontext->cleanup_retryable = false;
254 254
255 mutex_init(&ucontext->per_mm_list_lock);
256 INIT_LIST_HEAD(&ucontext->per_mm_list);
257
258 ret = get_unused_fd_flags(O_CLOEXEC); 255 ret = get_unused_fd_flags(O_CLOEXEC);
259 if (ret < 0) 256 if (ret < 0)
260 goto err_free; 257 goto err_free;
@@ -275,8 +272,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
275 ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata); 272 ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata);
276 if (ret) 273 if (ret)
277 goto err_file; 274 goto err_file;
278 if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
279 ucontext->invalidate_range = NULL;
280 275
281 rdma_restrack_uadd(&ucontext->res); 276 rdma_restrack_uadd(&ucontext->res);
282 277
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 11c13c1381cf..e369ac0d6f51 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -1487,6 +1487,7 @@ static void __exit ib_uverbs_cleanup(void)
1487 IB_UVERBS_NUM_FIXED_MINOR); 1487 IB_UVERBS_NUM_FIXED_MINOR);
1488 unregister_chrdev_region(dynamic_uverbs_dev, 1488 unregister_chrdev_region(dynamic_uverbs_dev,
1489 IB_UVERBS_NUM_DYNAMIC_MINOR); 1489 IB_UVERBS_NUM_DYNAMIC_MINOR);
1490 mmu_notifier_synchronize();
1490} 1491}
1491 1492
1492module_init(ib_uverbs_init); 1493module_init(ib_uverbs_init);
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 4e9f1507ffd9..bface798ee59 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1867,10 +1867,6 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
1867 if (err) 1867 if (err)
1868 goto out_sys_pages; 1868 goto out_sys_pages;
1869 1869
1870 if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)
1871 context->ibucontext.invalidate_range =
1872 &mlx5_ib_invalidate_range;
1873
1874 if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { 1870 if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
1875 err = mlx5_ib_devx_create(dev, true); 1871 err = mlx5_ib_devx_create(dev, true);
1876 if (err < 0) 1872 if (err < 0)
@@ -1999,11 +1995,6 @@ static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
1999 struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); 1995 struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
2000 struct mlx5_bfreg_info *bfregi; 1996 struct mlx5_bfreg_info *bfregi;
2001 1997
2002 /* All umem's must be destroyed before destroying the ucontext. */
2003 mutex_lock(&ibcontext->per_mm_list_lock);
2004 WARN_ON(!list_empty(&ibcontext->per_mm_list));
2005 mutex_unlock(&ibcontext->per_mm_list_lock);
2006
2007 bfregi = &context->bfregi; 1998 bfregi = &context->bfregi;
2008 mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); 1999 mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
2009 2000
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index a40e0abf2338..b5aece786b36 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -56,19 +56,6 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
56 struct scatterlist *sg; 56 struct scatterlist *sg;
57 int entry; 57 int entry;
58 58
59 if (umem->is_odp) {
60 struct ib_umem_odp *odp = to_ib_umem_odp(umem);
61 unsigned int page_shift = odp->page_shift;
62
63 *ncont = ib_umem_odp_num_pages(odp);
64 *count = *ncont << (page_shift - PAGE_SHIFT);
65 *shift = page_shift;
66 if (order)
67 *order = ilog2(roundup_pow_of_two(*ncont));
68
69 return;
70 }
71
72 addr = addr >> PAGE_SHIFT; 59 addr = addr >> PAGE_SHIFT;
73 tmp = (unsigned long)addr; 60 tmp = (unsigned long)addr;
74 m = find_first_bit(&tmp, BITS_PER_LONG); 61 m = find_first_bit(&tmp, BITS_PER_LONG);
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 3401f5f6792e..1eff031ef048 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -784,19 +784,37 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
784 int *ncont, int *order) 784 int *ncont, int *order)
785{ 785{
786 struct ib_umem *u; 786 struct ib_umem *u;
787 int err;
788 787
789 *umem = NULL; 788 *umem = NULL;
790 789
791 u = ib_umem_get(udata, start, length, access_flags, 0); 790 if (access_flags & IB_ACCESS_ON_DEMAND) {
792 err = PTR_ERR_OR_ZERO(u); 791 struct ib_umem_odp *odp;
793 if (err) { 792
794 mlx5_ib_dbg(dev, "umem get failed (%d)\n", err); 793 odp = ib_umem_odp_get(udata, start, length, access_flags);
795 return err; 794 if (IS_ERR(odp)) {
795 mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
796 PTR_ERR(odp));
797 return PTR_ERR(odp);
798 }
799
800 u = &odp->umem;
801
802 *page_shift = odp->page_shift;
803 *ncont = ib_umem_odp_num_pages(odp);
804 *npages = *ncont << (*page_shift - PAGE_SHIFT);
805 if (order)
806 *order = ilog2(roundup_pow_of_two(*ncont));
807 } else {
808 u = ib_umem_get(udata, start, length, access_flags, 0);
809 if (IS_ERR(u)) {
810 mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
811 return PTR_ERR(u);
812 }
813
814 mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
815 page_shift, ncont, order);
796 } 816 }
797 817
798 mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages,
799 page_shift, ncont, order);
800 if (!*npages) { 818 if (!*npages) {
801 mlx5_ib_warn(dev, "avoid zero region\n"); 819 mlx5_ib_warn(dev, "avoid zero region\n");
802 ib_umem_release(u); 820 ib_umem_release(u);
@@ -1599,7 +1617,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1599 /* Wait for all running page-fault handlers to finish. */ 1617 /* Wait for all running page-fault handlers to finish. */
1600 synchronize_srcu(&dev->mr_srcu); 1618 synchronize_srcu(&dev->mr_srcu);
1601 /* Destroy all page mappings */ 1619 /* Destroy all page mappings */
1602 if (umem_odp->page_list) 1620 if (!umem_odp->is_implicit_odp)
1603 mlx5_ib_invalidate_range(umem_odp, 1621 mlx5_ib_invalidate_range(umem_odp,
1604 ib_umem_start(umem_odp), 1622 ib_umem_start(umem_odp),
1605 ib_umem_end(umem_odp)); 1623 ib_umem_end(umem_odp));
@@ -1610,7 +1628,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1610 * so that there will not be any invalidations in 1628 * so that there will not be any invalidations in
1611 * flight, looking at the *mr struct. 1629 * flight, looking at the *mr struct.
1612 */ 1630 */
1613 ib_umem_release(umem); 1631 ib_umem_odp_release(umem_odp);
1614 atomic_sub(npages, &dev->mdev->priv.reg_pages); 1632 atomic_sub(npages, &dev->mdev->priv.reg_pages);
1615 1633
1616 /* Avoid double-freeing the umem. */ 1634 /* Avoid double-freeing the umem. */
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 0a59912a4cef..dd26e7acb37e 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -184,7 +184,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
184 for (i = 0; i < nentries; i++, pklm++) { 184 for (i = 0; i < nentries; i++, pklm++) {
185 pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); 185 pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
186 va = (offset + i) * MLX5_IMR_MTT_SIZE; 186 va = (offset + i) * MLX5_IMR_MTT_SIZE;
187 if (odp && odp->umem.address == va) { 187 if (odp && ib_umem_start(odp) == va) {
188 struct mlx5_ib_mr *mtt = odp->private; 188 struct mlx5_ib_mr *mtt = odp->private;
189 189
190 pklm->key = cpu_to_be32(mtt->ibmr.lkey); 190 pklm->key = cpu_to_be32(mtt->ibmr.lkey);
@@ -206,7 +206,7 @@ static void mr_leaf_free_action(struct work_struct *work)
206 mr->parent = NULL; 206 mr->parent = NULL;
207 synchronize_srcu(&mr->dev->mr_srcu); 207 synchronize_srcu(&mr->dev->mr_srcu);
208 208
209 ib_umem_release(&odp->umem); 209 ib_umem_odp_release(odp);
210 if (imr->live) 210 if (imr->live)
211 mlx5_ib_update_xlt(imr, idx, 1, 0, 211 mlx5_ib_update_xlt(imr, idx, 1, 0,
212 MLX5_IB_UPD_XLT_INDIRECT | 212 MLX5_IB_UPD_XLT_INDIRECT |
@@ -386,7 +386,7 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
386} 386}
387 387
388static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, 388static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
389 struct ib_umem *umem, 389 struct ib_umem_odp *umem_odp,
390 bool ksm, int access_flags) 390 bool ksm, int access_flags)
391{ 391{
392 struct mlx5_ib_dev *dev = to_mdev(pd->device); 392 struct mlx5_ib_dev *dev = to_mdev(pd->device);
@@ -404,7 +404,7 @@ static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
404 mr->dev = dev; 404 mr->dev = dev;
405 mr->access_flags = access_flags; 405 mr->access_flags = access_flags;
406 mr->mmkey.iova = 0; 406 mr->mmkey.iova = 0;
407 mr->umem = umem; 407 mr->umem = &umem_odp->umem;
408 408
409 if (ksm) { 409 if (ksm) {
410 err = mlx5_ib_update_xlt(mr, 0, 410 err = mlx5_ib_update_xlt(mr, 0,
@@ -464,18 +464,17 @@ next_mr:
464 if (nentries) 464 if (nentries)
465 nentries++; 465 nentries++;
466 } else { 466 } else {
467 odp = ib_alloc_odp_umem(odp_mr, addr, 467 odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE);
468 MLX5_IMR_MTT_SIZE);
469 if (IS_ERR(odp)) { 468 if (IS_ERR(odp)) {
470 mutex_unlock(&odp_mr->umem_mutex); 469 mutex_unlock(&odp_mr->umem_mutex);
471 return ERR_CAST(odp); 470 return ERR_CAST(odp);
472 } 471 }
473 472
474 mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0, 473 mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0,
475 mr->access_flags); 474 mr->access_flags);
476 if (IS_ERR(mtt)) { 475 if (IS_ERR(mtt)) {
477 mutex_unlock(&odp_mr->umem_mutex); 476 mutex_unlock(&odp_mr->umem_mutex);
478 ib_umem_release(&odp->umem); 477 ib_umem_odp_release(odp);
479 return ERR_CAST(mtt); 478 return ERR_CAST(mtt);
480 } 479 }
481 480
@@ -497,7 +496,7 @@ next_mr:
497 addr += MLX5_IMR_MTT_SIZE; 496 addr += MLX5_IMR_MTT_SIZE;
498 if (unlikely(addr < io_virt + bcnt)) { 497 if (unlikely(addr < io_virt + bcnt)) {
499 odp = odp_next(odp); 498 odp = odp_next(odp);
500 if (odp && odp->umem.address != addr) 499 if (odp && ib_umem_start(odp) != addr)
501 odp = NULL; 500 odp = NULL;
502 goto next_mr; 501 goto next_mr;
503 } 502 }
@@ -521,19 +520,19 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
521 int access_flags) 520 int access_flags)
522{ 521{
523 struct mlx5_ib_mr *imr; 522 struct mlx5_ib_mr *imr;
524 struct ib_umem *umem; 523 struct ib_umem_odp *umem_odp;
525 524
526 umem = ib_umem_get(udata, 0, 0, access_flags, 0); 525 umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags);
527 if (IS_ERR(umem)) 526 if (IS_ERR(umem_odp))
528 return ERR_CAST(umem); 527 return ERR_CAST(umem_odp);
529 528
530 imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags); 529 imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags);
531 if (IS_ERR(imr)) { 530 if (IS_ERR(imr)) {
532 ib_umem_release(umem); 531 ib_umem_odp_release(umem_odp);
533 return ERR_CAST(imr); 532 return ERR_CAST(imr);
534 } 533 }
535 534
536 imr->umem = umem; 535 imr->umem = &umem_odp->umem;
537 init_waitqueue_head(&imr->q_leaf_free); 536 init_waitqueue_head(&imr->q_leaf_free);
538 atomic_set(&imr->num_leaf_free, 0); 537 atomic_set(&imr->num_leaf_free, 0);
539 atomic_set(&imr->num_pending_prefetch, 0); 538 atomic_set(&imr->num_pending_prefetch, 0);
@@ -541,34 +540,31 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
541 return imr; 540 return imr;
542} 541}
543 542
544static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, 543void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
545 void *cookie)
546{ 544{
547 struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; 545 struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
548 546 struct rb_node *node;
549 if (mr->parent != imr)
550 return 0;
551
552 ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
553 ib_umem_end(umem_odp));
554 547
555 if (umem_odp->dying) 548 down_read(&per_mm->umem_rwsem);
556 return 0; 549 for (node = rb_first_cached(&per_mm->umem_tree); node;
550 node = rb_next(node)) {
551 struct ib_umem_odp *umem_odp =
552 rb_entry(node, struct ib_umem_odp, interval_tree.rb);
553 struct mlx5_ib_mr *mr = umem_odp->private;
557 554
558 WRITE_ONCE(umem_odp->dying, 1); 555 if (mr->parent != imr)
559 atomic_inc(&imr->num_leaf_free); 556 continue;
560 schedule_work(&umem_odp->work);
561 557
562 return 0; 558 ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
563} 559 ib_umem_end(umem_odp));
564 560
565void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) 561 if (umem_odp->dying)
566{ 562 continue;
567 struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
568 563
569 down_read(&per_mm->umem_rwsem); 564 WRITE_ONCE(umem_odp->dying, 1);
570 rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, 565 atomic_inc(&imr->num_leaf_free);
571 mr_leaf_free, true, imr); 566 schedule_work(&umem_odp->work);
567 }
572 up_read(&per_mm->umem_rwsem); 568 up_read(&per_mm->umem_rwsem);
573 569
574 wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); 570 wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
@@ -589,7 +585,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
589 struct ib_umem_odp *odp; 585 struct ib_umem_odp *odp;
590 size_t size; 586 size_t size;
591 587
592 if (!odp_mr->page_list) { 588 if (odp_mr->is_implicit_odp) {
593 odp = implicit_mr_get_data(mr, io_virt, bcnt); 589 odp = implicit_mr_get_data(mr, io_virt, bcnt);
594 590
595 if (IS_ERR(odp)) 591 if (IS_ERR(odp))
@@ -607,7 +603,7 @@ next_mr:
607 start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; 603 start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
608 access_mask = ODP_READ_ALLOWED_BIT; 604 access_mask = ODP_READ_ALLOWED_BIT;
609 605
610 if (prefetch && !downgrade && !mr->umem->writable) { 606 if (prefetch && !downgrade && !odp->umem.writable) {
611 /* prefetch with write-access must 607 /* prefetch with write-access must
612 * be supported by the MR 608 * be supported by the MR
613 */ 609 */
@@ -615,7 +611,7 @@ next_mr:
615 goto out; 611 goto out;
616 } 612 }
617 613
618 if (mr->umem->writable && !downgrade) 614 if (odp->umem.writable && !downgrade)
619 access_mask |= ODP_WRITE_ALLOWED_BIT; 615 access_mask |= ODP_WRITE_ALLOWED_BIT;
620 616
621 current_seq = READ_ONCE(odp->notifiers_seq); 617 current_seq = READ_ONCE(odp->notifiers_seq);
@@ -625,8 +621,8 @@ next_mr:
625 */ 621 */
626 smp_rmb(); 622 smp_rmb();
627 623
628 ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size, 624 ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask,
629 access_mask, current_seq); 625 current_seq);
630 626
631 if (ret < 0) 627 if (ret < 0)
632 goto out; 628 goto out;
@@ -634,8 +630,7 @@ next_mr:
634 np = ret; 630 np = ret;
635 631
636 mutex_lock(&odp->umem_mutex); 632 mutex_lock(&odp->umem_mutex);
637 if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem), 633 if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
638 current_seq)) {
639 /* 634 /*
640 * No need to check whether the MTTs really belong to 635 * No need to check whether the MTTs really belong to
641 * this MR, since ib_umem_odp_map_dma_pages already 636 * this MR, since ib_umem_odp_map_dma_pages already
@@ -668,7 +663,7 @@ next_mr:
668 663
669 io_virt += size; 664 io_virt += size;
670 next = odp_next(odp); 665 next = odp_next(odp);
671 if (unlikely(!next || next->umem.address != io_virt)) { 666 if (unlikely(!next || ib_umem_start(next) != io_virt)) {
672 mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", 667 mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
673 io_virt, next); 668 io_virt, next);
674 return -EAGAIN; 669 return -EAGAIN;
@@ -1618,6 +1613,7 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
1618 1613
1619static const struct ib_device_ops mlx5_ib_dev_odp_ops = { 1614static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
1620 .advise_mr = mlx5_ib_advise_mr, 1615 .advise_mr = mlx5_ib_advise_mr,
1616 .invalidate_range = mlx5_ib_invalidate_range,
1621}; 1617};
1622 1618
1623int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) 1619int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
index a2a142ae087b..9d042310214f 100644
--- a/drivers/misc/sgi-gru/grufile.c
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -573,6 +573,7 @@ static void __exit gru_exit(void)
573 gru_free_tables(); 573 gru_free_tables();
574 misc_deregister(&gru_miscdev); 574 misc_deregister(&gru_miscdev);
575 gru_proc_exit(); 575 gru_proc_exit();
576 mmu_notifier_synchronize();
576} 577}
577 578
578static const struct file_operations gru_fops = { 579static const struct file_operations gru_fops = {
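
The mmu_notifier_synchronize() call added to gru_exit() above follows the new lifetime rules: mmu_notifier_put() only queues free_notifier() behind SRCU, so a module has to flush those callbacks before its code and ops structure can go away. A minimal sketch of that pattern; every name other than mmu_notifier_put() and mmu_notifier_synchronize() is a hypothetical placeholder, not code from this series:

	#include <linux/module.h>
	#include <linux/mmu_notifier.h>

	static void example_put_all_notifiers(void);	/* placeholder: drops the driver's refs */

	static void __exit example_exit(void)
	{
		/* Hypothetical teardown: every attached notifier gets mmu_notifier_put(). */
		example_put_all_notifiers();

		/*
		 * mmu_notifier_put() only schedules free_notifier() from an SRCU
		 * callback, so wait here before the module text is unloaded.
		 */
		mmu_notifier_synchronize();
	}
	module_exit(example_exit);
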
diff --git a/drivers/misc/sgi-gru/grutables.h b/drivers/misc/sgi-gru/grutables.h
index 438191c22057..a7e44b2eb413 100644
--- a/drivers/misc/sgi-gru/grutables.h
+++ b/drivers/misc/sgi-gru/grutables.h
@@ -307,10 +307,8 @@ struct gru_mm_tracker { /* pack to reduce size */
307 307
308struct gru_mm_struct { 308struct gru_mm_struct {
309 struct mmu_notifier ms_notifier; 309 struct mmu_notifier ms_notifier;
310 atomic_t ms_refcnt;
311 spinlock_t ms_asid_lock; /* protects ASID assignment */ 310 spinlock_t ms_asid_lock; /* protects ASID assignment */
312 atomic_t ms_range_active;/* num range_invals active */ 311 atomic_t ms_range_active;/* num range_invals active */
313 char ms_released;
314 wait_queue_head_t ms_wait_queue; 312 wait_queue_head_t ms_wait_queue;
315 DECLARE_BITMAP(ms_asidmap, GRU_MAX_GRUS); 313 DECLARE_BITMAP(ms_asidmap, GRU_MAX_GRUS);
316 struct gru_mm_tracker ms_asids[GRU_MAX_GRUS]; 314 struct gru_mm_tracker ms_asids[GRU_MAX_GRUS];
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c
index 59ba0adf23ce..10921cd2608d 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -235,83 +235,47 @@ static void gru_invalidate_range_end(struct mmu_notifier *mn,
235 gms, range->start, range->end); 235 gms, range->start, range->end);
236} 236}
237 237
238static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) 238static struct mmu_notifier *gru_alloc_notifier(struct mm_struct *mm)
239{ 239{
240 struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct, 240 struct gru_mm_struct *gms;
241 ms_notifier); 241
242 gms = kzalloc(sizeof(*gms), GFP_KERNEL);
243 if (!gms)
244 return ERR_PTR(-ENOMEM);
245 STAT(gms_alloc);
246 spin_lock_init(&gms->ms_asid_lock);
247 init_waitqueue_head(&gms->ms_wait_queue);
242 248
243 gms->ms_released = 1; 249 return &gms->ms_notifier;
244 gru_dbg(grudev, "gms %p\n", gms);
245} 250}
246 251
252static void gru_free_notifier(struct mmu_notifier *mn)
253{
254 kfree(container_of(mn, struct gru_mm_struct, ms_notifier));
255 STAT(gms_free);
256}
247 257
248static const struct mmu_notifier_ops gru_mmuops = { 258static const struct mmu_notifier_ops gru_mmuops = {
249 .invalidate_range_start = gru_invalidate_range_start, 259 .invalidate_range_start = gru_invalidate_range_start,
250 .invalidate_range_end = gru_invalidate_range_end, 260 .invalidate_range_end = gru_invalidate_range_end,
251 .release = gru_release, 261 .alloc_notifier = gru_alloc_notifier,
262 .free_notifier = gru_free_notifier,
252}; 263};
253 264
254/* Move this to the basic mmu_notifier file. But for now... */
255static struct mmu_notifier *mmu_find_ops(struct mm_struct *mm,
256 const struct mmu_notifier_ops *ops)
257{
258 struct mmu_notifier *mn, *gru_mn = NULL;
259
260 if (mm->mmu_notifier_mm) {
261 rcu_read_lock();
262 hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list,
263 hlist)
264 if (mn->ops == ops) {
265 gru_mn = mn;
266 break;
267 }
268 rcu_read_unlock();
269 }
270 return gru_mn;
271}
272
273struct gru_mm_struct *gru_register_mmu_notifier(void) 265struct gru_mm_struct *gru_register_mmu_notifier(void)
274{ 266{
275 struct gru_mm_struct *gms;
276 struct mmu_notifier *mn; 267 struct mmu_notifier *mn;
277 int err; 268
278 269 mn = mmu_notifier_get_locked(&gru_mmuops, current->mm);
279 mn = mmu_find_ops(current->mm, &gru_mmuops); 270 if (IS_ERR(mn))
280 if (mn) { 271 return ERR_CAST(mn);
281 gms = container_of(mn, struct gru_mm_struct, ms_notifier); 272
282 atomic_inc(&gms->ms_refcnt); 273 return container_of(mn, struct gru_mm_struct, ms_notifier);
283 } else {
284 gms = kzalloc(sizeof(*gms), GFP_KERNEL);
285 if (!gms)
286 return ERR_PTR(-ENOMEM);
287 STAT(gms_alloc);
288 spin_lock_init(&gms->ms_asid_lock);
289 gms->ms_notifier.ops = &gru_mmuops;
290 atomic_set(&gms->ms_refcnt, 1);
291 init_waitqueue_head(&gms->ms_wait_queue);
292 err = __mmu_notifier_register(&gms->ms_notifier, current->mm);
293 if (err)
294 goto error;
295 }
296 if (gms)
297 gru_dbg(grudev, "gms %p, refcnt %d\n", gms,
298 atomic_read(&gms->ms_refcnt));
299 return gms;
300error:
301 kfree(gms);
302 return ERR_PTR(err);
303} 274}
304 275
305void gru_drop_mmu_notifier(struct gru_mm_struct *gms) 276void gru_drop_mmu_notifier(struct gru_mm_struct *gms)
306{ 277{
307 gru_dbg(grudev, "gms %p, refcnt %d, released %d\n", gms, 278 mmu_notifier_put(&gms->ms_notifier);
308 atomic_read(&gms->ms_refcnt), gms->ms_released);
309 if (atomic_dec_return(&gms->ms_refcnt) == 0) {
310 if (!gms->ms_released)
311 mmu_notifier_unregister(&gms->ms_notifier, current->mm);
312 kfree(gms);
313 STAT(gms_free);
314 }
315} 279}
316 280
317/* 281/*
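
The grutlbpurge.c conversion above shows the general shape of the new attach/detach idiom: drivers no longer search for an existing notifier or hand-roll refcounts; they supply alloc_notifier()/free_notifier() callbacks and let mmu_notifier_get()/mmu_notifier_put() manage the per-mm lifetime. A condensed sketch under hypothetical names (the "example_*" identifiers are illustrative, not from any driver in this series):

	#include <linux/mmu_notifier.h>
	#include <linux/sched.h>
	#include <linux/slab.h>
	#include <linux/err.h>

	/* Hypothetical per-mm state; only the embedded mmu_notifier is required. */
	struct example_mm_state {
		struct mmu_notifier ms_notifier;
		/* driver data hangs off here */
	};

	static struct mmu_notifier *example_alloc_notifier(struct mm_struct *mm)
	{
		struct example_mm_state *s = kzalloc(sizeof(*s), GFP_KERNEL);

		if (!s)
			return ERR_PTR(-ENOMEM);
		return &s->ms_notifier;	/* core fills ->ops/->mm and registers it */
	}

	static void example_free_notifier(struct mmu_notifier *mn)
	{
		/* Runs from an SRCU callback once the last reference is put; must not sleep. */
		kfree(container_of(mn, struct example_mm_state, ms_notifier));
	}

	static const struct mmu_notifier_ops example_mmu_ops = {
		.alloc_notifier	= example_alloc_notifier,
		.free_notifier	= example_free_notifier,
	};

	/* Attach to (or take another reference on) the notifier for current->mm. */
	static struct example_mm_state *example_attach(void)
	{
		struct mmu_notifier *mn = mmu_notifier_get(&example_mmu_ops, current->mm);

		if (IS_ERR(mn))
			return ERR_CAST(mn);
		return container_of(mn, struct example_mm_state, ms_notifier);
	}

	/* Detach: drop the reference; free_notifier() runs when it reaches zero. */
	static void example_detach(struct example_mm_state *s)
	{
		mmu_notifier_put(&s->ms_notifier);
	}
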
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index a5fde15e91d3..36af7af6b7cf 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -118,4 +118,16 @@ config NVDIMM_KEYS
118 depends on ENCRYPTED_KEYS 118 depends on ENCRYPTED_KEYS
119 depends on (LIBNVDIMM=ENCRYPTED_KEYS) || LIBNVDIMM=m 119 depends on (LIBNVDIMM=ENCRYPTED_KEYS) || LIBNVDIMM=m
120 120
121config NVDIMM_TEST_BUILD
122 tristate "Build the unit test core"
123 depends on m
124 depends on COMPILE_TEST && X86_64
125 default m if COMPILE_TEST
126 help
127 Build the core of the unit test infrastructure. The result of
128 this build is non-functional for unit test execution, but it
129 otherwise helps catch build errors induced by changes to the
130 core devm_memremap_pages() implementation and other
131 infrastructure.
132
121endif 133endif
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index cefe233e0b52..29203f3d3069 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -29,3 +29,7 @@ libnvdimm-$(CONFIG_BTT) += btt_devs.o
29libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o 29libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o
30libnvdimm-$(CONFIG_NVDIMM_DAX) += dax_devs.o 30libnvdimm-$(CONFIG_NVDIMM_DAX) += dax_devs.o
31libnvdimm-$(CONFIG_NVDIMM_KEYS) += security.o 31libnvdimm-$(CONFIG_NVDIMM_KEYS) += security.o
32
33TOOLS := ../../tools
34TEST_SRC := $(TOOLS)/testing/nvdimm/test
35obj-$(CONFIG_NVDIMM_TEST_BUILD) += $(TEST_SRC)/iomap.o
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 731642e0f5a0..bf43d1d60059 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,5 +1,5 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/mm.h> 2#include <linux/pagewalk.h>
3#include <linux/vmacache.h> 3#include <linux/vmacache.h>
4#include <linux/hugetlb.h> 4#include <linux/hugetlb.h>
5#include <linux/huge_mm.h> 5#include <linux/huge_mm.h>
@@ -513,7 +513,9 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end,
513 513
514 return 0; 514 return 0;
515} 515}
516#endif 516#else
517#define smaps_pte_hole NULL
518#endif /* CONFIG_SHMEM */
517 519
518static void smaps_pte_entry(pte_t *pte, unsigned long addr, 520static void smaps_pte_entry(pte_t *pte, unsigned long addr,
519 struct mm_walk *walk) 521 struct mm_walk *walk)
@@ -729,21 +731,24 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
729 } 731 }
730 return 0; 732 return 0;
731} 733}
734#else
735#define smaps_hugetlb_range NULL
732#endif /* HUGETLB_PAGE */ 736#endif /* HUGETLB_PAGE */
733 737
738static const struct mm_walk_ops smaps_walk_ops = {
739 .pmd_entry = smaps_pte_range,
740 .hugetlb_entry = smaps_hugetlb_range,
741};
742
743static const struct mm_walk_ops smaps_shmem_walk_ops = {
744 .pmd_entry = smaps_pte_range,
745 .hugetlb_entry = smaps_hugetlb_range,
746 .pte_hole = smaps_pte_hole,
747};
748
734static void smap_gather_stats(struct vm_area_struct *vma, 749static void smap_gather_stats(struct vm_area_struct *vma,
735 struct mem_size_stats *mss) 750 struct mem_size_stats *mss)
736{ 751{
737 struct mm_walk smaps_walk = {
738 .pmd_entry = smaps_pte_range,
739#ifdef CONFIG_HUGETLB_PAGE
740 .hugetlb_entry = smaps_hugetlb_range,
741#endif
742 .mm = vma->vm_mm,
743 };
744
745 smaps_walk.private = mss;
746
747#ifdef CONFIG_SHMEM 752#ifdef CONFIG_SHMEM
748 /* In case of smaps_rollup, reset the value from previous vma */ 753 /* In case of smaps_rollup, reset the value from previous vma */
749 mss->check_shmem_swap = false; 754 mss->check_shmem_swap = false;
@@ -765,12 +770,13 @@ static void smap_gather_stats(struct vm_area_struct *vma,
765 mss->swap += shmem_swapped; 770 mss->swap += shmem_swapped;
766 } else { 771 } else {
767 mss->check_shmem_swap = true; 772 mss->check_shmem_swap = true;
768 smaps_walk.pte_hole = smaps_pte_hole; 773 walk_page_vma(vma, &smaps_shmem_walk_ops, mss);
774 return;
769 } 775 }
770 } 776 }
771#endif 777#endif
772 /* mmap_sem is held in m_start */ 778 /* mmap_sem is held in m_start */
773 walk_page_vma(vma, &smaps_walk); 779 walk_page_vma(vma, &smaps_walk_ops, mss);
774} 780}
775 781
776#define SEQ_PUT_DEC(str, val) \ 782#define SEQ_PUT_DEC(str, val) \
@@ -1118,6 +1124,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
1118 return 0; 1124 return 0;
1119} 1125}
1120 1126
1127static const struct mm_walk_ops clear_refs_walk_ops = {
1128 .pmd_entry = clear_refs_pte_range,
1129 .test_walk = clear_refs_test_walk,
1130};
1131
1121static ssize_t clear_refs_write(struct file *file, const char __user *buf, 1132static ssize_t clear_refs_write(struct file *file, const char __user *buf,
1122 size_t count, loff_t *ppos) 1133 size_t count, loff_t *ppos)
1123{ 1134{
@@ -1151,12 +1162,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
1151 struct clear_refs_private cp = { 1162 struct clear_refs_private cp = {
1152 .type = type, 1163 .type = type,
1153 }; 1164 };
1154 struct mm_walk clear_refs_walk = {
1155 .pmd_entry = clear_refs_pte_range,
1156 .test_walk = clear_refs_test_walk,
1157 .mm = mm,
1158 .private = &cp,
1159 };
1160 1165
1161 if (type == CLEAR_REFS_MM_HIWATER_RSS) { 1166 if (type == CLEAR_REFS_MM_HIWATER_RSS) {
1162 if (down_write_killable(&mm->mmap_sem)) { 1167 if (down_write_killable(&mm->mmap_sem)) {
@@ -1217,7 +1222,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
1217 0, NULL, mm, 0, -1UL); 1222 0, NULL, mm, 0, -1UL);
1218 mmu_notifier_invalidate_range_start(&range); 1223 mmu_notifier_invalidate_range_start(&range);
1219 } 1224 }
1220 walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); 1225 walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
1226 &cp);
1221 if (type == CLEAR_REFS_SOFT_DIRTY) 1227 if (type == CLEAR_REFS_SOFT_DIRTY)
1222 mmu_notifier_invalidate_range_end(&range); 1228 mmu_notifier_invalidate_range_end(&range);
1223 tlb_finish_mmu(&tlb, 0, -1); 1229 tlb_finish_mmu(&tlb, 0, -1);
@@ -1489,8 +1495,16 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
1489 1495
1490 return err; 1496 return err;
1491} 1497}
1498#else
1499#define pagemap_hugetlb_range NULL
1492#endif /* HUGETLB_PAGE */ 1500#endif /* HUGETLB_PAGE */
1493 1501
1502static const struct mm_walk_ops pagemap_ops = {
1503 .pmd_entry = pagemap_pmd_range,
1504 .pte_hole = pagemap_pte_hole,
1505 .hugetlb_entry = pagemap_hugetlb_range,
1506};
1507
1494/* 1508/*
1495 * /proc/pid/pagemap - an array mapping virtual pages to pfns 1509 * /proc/pid/pagemap - an array mapping virtual pages to pfns
1496 * 1510 *
@@ -1522,7 +1536,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
1522{ 1536{
1523 struct mm_struct *mm = file->private_data; 1537 struct mm_struct *mm = file->private_data;
1524 struct pagemapread pm; 1538 struct pagemapread pm;
1525 struct mm_walk pagemap_walk = {};
1526 unsigned long src; 1539 unsigned long src;
1527 unsigned long svpfn; 1540 unsigned long svpfn;
1528 unsigned long start_vaddr; 1541 unsigned long start_vaddr;
@@ -1550,14 +1563,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
1550 if (!pm.buffer) 1563 if (!pm.buffer)
1551 goto out_mm; 1564 goto out_mm;
1552 1565
1553 pagemap_walk.pmd_entry = pagemap_pmd_range;
1554 pagemap_walk.pte_hole = pagemap_pte_hole;
1555#ifdef CONFIG_HUGETLB_PAGE
1556 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
1557#endif
1558 pagemap_walk.mm = mm;
1559 pagemap_walk.private = &pm;
1560
1561 src = *ppos; 1566 src = *ppos;
1562 svpfn = src / PM_ENTRY_BYTES; 1567 svpfn = src / PM_ENTRY_BYTES;
1563 start_vaddr = svpfn << PAGE_SHIFT; 1568 start_vaddr = svpfn << PAGE_SHIFT;
@@ -1586,7 +1591,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
1586 ret = down_read_killable(&mm->mmap_sem); 1591 ret = down_read_killable(&mm->mmap_sem);
1587 if (ret) 1592 if (ret)
1588 goto out_free; 1593 goto out_free;
1589 ret = walk_page_range(start_vaddr, end, &pagemap_walk); 1594 ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
1590 up_read(&mm->mmap_sem); 1595 up_read(&mm->mmap_sem);
1591 start_vaddr = end; 1596 start_vaddr = end;
1592 1597
@@ -1798,6 +1803,11 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1798} 1803}
1799#endif 1804#endif
1800 1805
1806static const struct mm_walk_ops show_numa_ops = {
1807 .hugetlb_entry = gather_hugetlb_stats,
1808 .pmd_entry = gather_pte_stats,
1809};
1810
1801/* 1811/*
1802 * Display pages allocated per node and memory policy via /proc. 1812 * Display pages allocated per node and memory policy via /proc.
1803 */ 1813 */
@@ -1809,12 +1819,6 @@ static int show_numa_map(struct seq_file *m, void *v)
1809 struct numa_maps *md = &numa_priv->md; 1819 struct numa_maps *md = &numa_priv->md;
1810 struct file *file = vma->vm_file; 1820 struct file *file = vma->vm_file;
1811 struct mm_struct *mm = vma->vm_mm; 1821 struct mm_struct *mm = vma->vm_mm;
1812 struct mm_walk walk = {
1813 .hugetlb_entry = gather_hugetlb_stats,
1814 .pmd_entry = gather_pte_stats,
1815 .private = md,
1816 .mm = mm,
1817 };
1818 struct mempolicy *pol; 1822 struct mempolicy *pol;
1819 char buffer[64]; 1823 char buffer[64];
1820 int nid; 1824 int nid;
@@ -1848,7 +1852,7 @@ static int show_numa_map(struct seq_file *m, void *v)
1848 seq_puts(m, " huge"); 1852 seq_puts(m, " huge");
1849 1853
1850 /* mmap_sem is held by m_start */ 1854 /* mmap_sem is held by m_start */
1851 walk_page_vma(vma, &walk); 1855 walk_page_vma(vma, &show_numa_ops, md);
1852 1856
1853 if (!md->pages) 1857 if (!md->pages)
1854 goto out; 1858 goto out;
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 7ef56dc18050..3fec513b9c00 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -84,15 +84,12 @@
84 * @notifiers: count of active mmu notifiers 84 * @notifiers: count of active mmu notifiers
85 */ 85 */
86struct hmm { 86struct hmm {
87 struct mm_struct *mm; 87 struct mmu_notifier mmu_notifier;
88 struct kref kref;
89 spinlock_t ranges_lock; 88 spinlock_t ranges_lock;
90 struct list_head ranges; 89 struct list_head ranges;
91 struct list_head mirrors; 90 struct list_head mirrors;
92 struct mmu_notifier mmu_notifier;
93 struct rw_semaphore mirrors_sem; 91 struct rw_semaphore mirrors_sem;
94 wait_queue_head_t wq; 92 wait_queue_head_t wq;
95 struct rcu_head rcu;
96 long notifiers; 93 long notifiers;
97}; 94};
98 95
@@ -158,13 +155,11 @@ enum hmm_pfn_value_e {
158 * @values: pfn value for some special case (none, special, error, ...) 155 * @values: pfn value for some special case (none, special, error, ...)
159 * @default_flags: default flags for the range (write, read, ... see hmm doc) 156 * @default_flags: default flags for the range (write, read, ... see hmm doc)
160 * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter 157 * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter
161 * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT)
162 * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) 158 * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT)
163 * @valid: pfns array did not change since it has been fill by an HMM function 159 * @valid: pfns array did not change since it has been fill by an HMM function
164 */ 160 */
165struct hmm_range { 161struct hmm_range {
166 struct hmm *hmm; 162 struct hmm *hmm;
167 struct vm_area_struct *vma;
168 struct list_head list; 163 struct list_head list;
169 unsigned long start; 164 unsigned long start;
170 unsigned long end; 165 unsigned long end;
@@ -173,32 +168,11 @@ struct hmm_range {
173 const uint64_t *values; 168 const uint64_t *values;
174 uint64_t default_flags; 169 uint64_t default_flags;
175 uint64_t pfn_flags_mask; 170 uint64_t pfn_flags_mask;
176 uint8_t page_shift;
177 uint8_t pfn_shift; 171 uint8_t pfn_shift;
178 bool valid; 172 bool valid;
179}; 173};
180 174
181/* 175/*
182 * hmm_range_page_shift() - return the page shift for the range
183 * @range: range being queried
184 * Return: page shift (page size = 1 << page shift) for the range
185 */
186static inline unsigned hmm_range_page_shift(const struct hmm_range *range)
187{
188 return range->page_shift;
189}
190
191/*
192 * hmm_range_page_size() - return the page size for the range
193 * @range: range being queried
194 * Return: page size for the range in bytes
195 */
196static inline unsigned long hmm_range_page_size(const struct hmm_range *range)
197{
198 return 1UL << hmm_range_page_shift(range);
199}
200
201/*
202 * hmm_range_wait_until_valid() - wait for range to be valid 176 * hmm_range_wait_until_valid() - wait for range to be valid
203 * @range: range affected by invalidation to wait on 177 * @range: range affected by invalidation to wait on
204 * @timeout: time out for wait in ms (ie abort wait after that period of time) 178 * @timeout: time out for wait in ms (ie abort wait after that period of time)
@@ -291,40 +265,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
291} 265}
292 266
293/* 267/*
294 * Old API:
295 * hmm_pfn_to_page()
296 * hmm_pfn_to_pfn()
297 * hmm_pfn_from_page()
298 * hmm_pfn_from_pfn()
299 *
300 * This are the OLD API please use new API, it is here to avoid cross-tree
301 * merge painfullness ie we convert things to new API in stages.
302 */
303static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
304 uint64_t pfn)
305{
306 return hmm_device_entry_to_page(range, pfn);
307}
308
309static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
310 uint64_t pfn)
311{
312 return hmm_device_entry_to_pfn(range, pfn);
313}
314
315static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
316 struct page *page)
317{
318 return hmm_device_entry_from_page(range, page);
319}
320
321static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
322 unsigned long pfn)
323{
324 return hmm_device_entry_from_pfn(range, pfn);
325}
326
327/*
328 * Mirroring: how to synchronize device page table with CPU page table. 268 * Mirroring: how to synchronize device page table with CPU page table.
329 * 269 *
330 * A device driver that is participating in HMM mirroring must always 270 * A device driver that is participating in HMM mirroring must always
@@ -375,29 +315,6 @@ static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
375struct hmm_mirror; 315struct hmm_mirror;
376 316
377/* 317/*
378 * enum hmm_update_event - type of update
379 * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
380 */
381enum hmm_update_event {
382 HMM_UPDATE_INVALIDATE,
383};
384
385/*
386 * struct hmm_update - HMM update information for callback
387 *
388 * @start: virtual start address of the range to update
389 * @end: virtual end address of the range to update
390 * @event: event triggering the update (what is happening)
391 * @blockable: can the callback block/sleep ?
392 */
393struct hmm_update {
394 unsigned long start;
395 unsigned long end;
396 enum hmm_update_event event;
397 bool blockable;
398};
399
400/*
401 * struct hmm_mirror_ops - HMM mirror device operations callback 318 * struct hmm_mirror_ops - HMM mirror device operations callback
402 * 319 *
403 * @update: callback to update range on a device 320 * @update: callback to update range on a device
@@ -417,9 +334,9 @@ struct hmm_mirror_ops {
417 /* sync_cpu_device_pagetables() - synchronize page tables 334 /* sync_cpu_device_pagetables() - synchronize page tables
418 * 335 *
419 * @mirror: pointer to struct hmm_mirror 336 * @mirror: pointer to struct hmm_mirror
420 * @update: update information (see struct hmm_update) 337 * @update: update information (see struct mmu_notifier_range)
421 * Return: -EAGAIN if update.blockable false and callback need to 338 * Return: -EAGAIN if mmu_notifier_range_blockable(update) is false
422 * block, 0 otherwise. 339 * and callback needs to block, 0 otherwise.
423 * 340 *
424 * This callback ultimately originates from mmu_notifiers when the CPU 341 * This callback ultimately originates from mmu_notifiers when the CPU
425 * page table is updated. The device driver must update its page table 342 * page table is updated. The device driver must update its page table
@@ -430,8 +347,9 @@ struct hmm_mirror_ops {
430 * page tables are completely updated (TLBs flushed, etc); this is a 347 * page tables are completely updated (TLBs flushed, etc); this is a
431 * synchronous call. 348 * synchronous call.
432 */ 349 */
433 int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, 350 int (*sync_cpu_device_pagetables)(
434 const struct hmm_update *update); 351 struct hmm_mirror *mirror,
352 const struct mmu_notifier_range *update);
435}; 353};
436 354
437/* 355/*
@@ -457,20 +375,24 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror);
457/* 375/*
458 * Please see Documentation/vm/hmm.rst for how to use the range API. 376 * Please see Documentation/vm/hmm.rst for how to use the range API.
459 */ 377 */
460int hmm_range_register(struct hmm_range *range, 378int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror);
461 struct hmm_mirror *mirror,
462 unsigned long start,
463 unsigned long end,
464 unsigned page_shift);
465void hmm_range_unregister(struct hmm_range *range); 379void hmm_range_unregister(struct hmm_range *range);
466long hmm_range_snapshot(struct hmm_range *range); 380
467long hmm_range_fault(struct hmm_range *range, bool block); 381/*
382 * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case.
383 */
384#define HMM_FAULT_ALLOW_RETRY (1 << 0)
385
386/* Don't fault in missing PTEs, just snapshot the current state. */
387#define HMM_FAULT_SNAPSHOT (1 << 1)
388
389long hmm_range_fault(struct hmm_range *range, unsigned int flags);
390
468long hmm_range_dma_map(struct hmm_range *range, 391long hmm_range_dma_map(struct hmm_range *range,
469 struct device *device, 392 struct device *device,
470 dma_addr_t *daddrs, 393 dma_addr_t *daddrs,
471 bool block); 394 unsigned int flags);
472long hmm_range_dma_unmap(struct hmm_range *range, 395long hmm_range_dma_unmap(struct hmm_range *range,
473 struct vm_area_struct *vma,
474 struct device *device, 396 struct device *device,
475 dma_addr_t *daddrs, 397 dma_addr_t *daddrs,
476 bool dirty); 398 bool dirty);
@@ -484,13 +406,6 @@ long hmm_range_dma_unmap(struct hmm_range *range,
484 */ 406 */
485#define HMM_RANGE_DEFAULT_TIMEOUT 1000 407#define HMM_RANGE_DEFAULT_TIMEOUT 1000
486 408
487/* Below are for HMM internal use only! Not to be used by device driver! */
488static inline void hmm_mm_init(struct mm_struct *mm)
489{
490 mm->hmm = NULL;
491}
492#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */
493static inline void hmm_mm_init(struct mm_struct *mm) {}
494#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ 409#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
495 410
496#endif /* LINUX_HMM_H */ 411#endif /* LINUX_HMM_H */
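
With page_shift, the vma argument, and hmm_range_snapshot() gone, a mirror driver registers a range against its hmm_mirror and then selects snapshot versus fault behaviour through the new flags argument. A rough sketch, assuming the caller holds mmap_sem and has already filled range->start, range->end and the pfns array; example_fault_range() itself is illustrative:

	#include <linux/hmm.h>

	static long example_fault_range(struct hmm_mirror *mirror,
					struct hmm_range *range, bool fault_pages)
	{
		long ret;
		int err;

		err = hmm_range_register(range, mirror);
		if (err)
			return err;

		if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
			hmm_range_unregister(range);
			return -EBUSY;
		}

		/* HMM_FAULT_SNAPSHOT replaces the separate hmm_range_snapshot() call. */
		ret = hmm_range_fault(range, fault_pages ? 0 : HMM_FAULT_SNAPSHOT);

		/* A real driver would consume range->pfns before unregistering. */
		hmm_range_unregister(range);
		return ret;
	}
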
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 5b6a7121c9f0..7bddddfc76d6 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -297,6 +297,8 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
297 297
298struct resource *devm_request_free_mem_region(struct device *dev, 298struct resource *devm_request_free_mem_region(struct device *dev,
299 struct resource *base, unsigned long size); 299 struct resource *base, unsigned long size);
300struct resource *request_free_mem_region(struct resource *base,
301 unsigned long size, const char *name);
300 302
301#endif /* __ASSEMBLY__ */ 303#endif /* __ASSEMBLY__ */
302#endif /* _LINUX_IOPORT_H */ 304#endif /* _LINUX_IOPORT_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 4fa360a13c1e..d83d403dac2e 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -217,7 +217,9 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
217 * might_sleep - annotation for functions that can sleep 217 * might_sleep - annotation for functions that can sleep
218 * 218 *
219 * this macro will print a stack trace if it is executed in an atomic 219 * this macro will print a stack trace if it is executed in an atomic
220 * context (spinlock, irq-handler, ...). 220 * context (spinlock, irq-handler, ...). Additional sections where blocking is
221 * not allowed can be annotated with non_block_start() and non_block_end()
222 * pairs.
221 * 223 *
222 * This is a useful debugging help to be able to catch problems early and not 224 * This is a useful debugging help to be able to catch problems early and not
223 * be bitten later when the calling function happens to sleep when it is not 225 * be bitten later when the calling function happens to sleep when it is not
@@ -233,6 +235,23 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
233# define cant_sleep() \ 235# define cant_sleep() \
234 do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) 236 do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
235# define sched_annotate_sleep() (current->task_state_change = 0) 237# define sched_annotate_sleep() (current->task_state_change = 0)
238/**
239 * non_block_start - annotate the start of section where sleeping is prohibited
240 *
241 * This is on behalf of the oom reaper, specifically when it is calling the mmu
242 * notifiers. The problem is that if the notifier were to block on, for example,
243 * mutex_lock() and if the process which holds that mutex were to perform a
244 * sleeping memory allocation, the oom reaper is now blocked on completion of
245 * that memory allocation. Other blocking calls like wait_event() pose similar
246 * issues.
247 */
248# define non_block_start() (current->non_block_count++)
249/**
250 * non_block_end - annotate the end of section where sleeping is prohibited
251 *
252 * Closes a section opened by non_block_start().
253 */
254# define non_block_end() WARN_ON(current->non_block_count-- == 0)
236#else 255#else
237 static inline void ___might_sleep(const char *file, int line, 256 static inline void ___might_sleep(const char *file, int line,
238 int preempt_offset) { } 257 int preempt_offset) { }
@@ -241,6 +260,8 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
241# define might_sleep() do { might_resched(); } while (0) 260# define might_sleep() do { might_resched(); } while (0)
242# define cant_sleep() do { } while (0) 261# define cant_sleep() do { } while (0)
243# define sched_annotate_sleep() do { } while (0) 262# define sched_annotate_sleep() do { } while (0)
263# define non_block_start() do { } while (0)
264# define non_block_end() do { } while (0)
244#endif 265#endif
245 266
246#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) 267#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
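
The new annotations are for regions that stay preemptible but still must not sleep, such as the oom reaper calling mmu notifiers. A minimal usage sketch; do_notifier_work() is a placeholder, not a kernel function:

	#include <linux/kernel.h>
	#include <linux/mmu_notifier.h>

	static void do_notifier_work(struct mmu_notifier_range *range);	/* placeholder */

	static void example_nonblocking_call(struct mmu_notifier_range *range)
	{
		non_block_start();
		/*
		 * Anything reached from here that calls might_sleep() will warn
		 * under CONFIG_DEBUG_ATOMIC_SLEEP, even though preemption and
		 * interrupts remain enabled.
		 */
		do_notifier_work(range);
		non_block_end();
	}
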
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index f8a5b2a19945..fb2a0bd826b9 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -109,7 +109,6 @@ struct dev_pagemap {
109 struct percpu_ref *ref; 109 struct percpu_ref *ref;
110 struct percpu_ref internal_ref; 110 struct percpu_ref internal_ref;
111 struct completion done; 111 struct completion done;
112 struct device *dev;
113 enum memory_type type; 112 enum memory_type type;
114 unsigned int flags; 113 unsigned int flags;
115 u64 pci_p2pdma_bus_offset; 114 u64 pci_p2pdma_bus_offset;
@@ -124,6 +123,8 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
124} 123}
125 124
126#ifdef CONFIG_ZONE_DEVICE 125#ifdef CONFIG_ZONE_DEVICE
126void *memremap_pages(struct dev_pagemap *pgmap, int nid);
127void memunmap_pages(struct dev_pagemap *pgmap);
127void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); 128void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
128void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); 129void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
129struct dev_pagemap *get_dev_pagemap(unsigned long pfn, 130struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
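
memremap_pages()/memunmap_pages() make the device-managed wrappers optional. A rough sketch of the device-less path, combined with request_free_mem_region() from the ioport.h hunk above; the example_* names, the stub dev_pagemap_ops, and the DEVICE_PRIVATE details are assumptions layered on top of the declarations in this diff, not code from the series:

	#include <linux/memremap.h>
	#include <linux/ioport.h>
	#include <linux/mm.h>
	#include <linux/err.h>

	static void example_page_free(struct page *page)
	{
		/* return the backing device page to the driver's allocator */
	}

	static vm_fault_t example_migrate_to_ram(struct vm_fault *vmf)
	{
		return VM_FAULT_SIGBUS;	/* placeholder for the real fault path */
	}

	static const struct dev_pagemap_ops example_pgmap_ops = {
		.page_free	= example_page_free,
		.migrate_to_ram	= example_migrate_to_ram,
	};

	static struct dev_pagemap example_pgmap;

	static int example_map_device_memory(unsigned long size)
	{
		struct resource *res;
		void *addr;

		/* Carve out unused physical address space for device pages. */
		res = request_free_mem_region(&iomem_resource, size, "example");
		if (IS_ERR(res))
			return PTR_ERR(res);

		example_pgmap.type = MEMORY_DEVICE_PRIVATE;
		example_pgmap.res = *res;
		example_pgmap.ops = &example_pgmap_ops;

		/* No struct device required any more; NUMA_NO_NODE lets the core pick. */
		addr = memremap_pages(&example_pgmap, NUMA_NO_NODE);
		if (IS_ERR(addr))
			return PTR_ERR(addr);
		return 0;
	}
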
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7f04754c7f2b..72120061b7d4 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -166,8 +166,6 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
166#define MIGRATE_PFN_MIGRATE (1UL << 1) 166#define MIGRATE_PFN_MIGRATE (1UL << 1)
167#define MIGRATE_PFN_LOCKED (1UL << 2) 167#define MIGRATE_PFN_LOCKED (1UL << 2)
168#define MIGRATE_PFN_WRITE (1UL << 3) 168#define MIGRATE_PFN_WRITE (1UL << 3)
169#define MIGRATE_PFN_DEVICE (1UL << 4)
170#define MIGRATE_PFN_ERROR (1UL << 5)
171#define MIGRATE_PFN_SHIFT 6 169#define MIGRATE_PFN_SHIFT 6
172 170
173static inline struct page *migrate_pfn_to_page(unsigned long mpfn) 171static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
@@ -182,107 +180,27 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
182 return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; 180 return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
183} 181}
184 182
185/* 183struct migrate_vma {
186 * struct migrate_vma_ops - migrate operation callback 184 struct vm_area_struct *vma;
187 * 185 /*
188 * @alloc_and_copy: alloc destination memory and copy source memory to it 186 * Both src and dst array must be big enough for
189 * @finalize_and_map: allow caller to map the successfully migrated pages 187 * (end - start) >> PAGE_SHIFT entries.
190 * 188 *
191 * 189 * The src array must not be modified by the caller after
192 * The alloc_and_copy() callback happens once all source pages have been locked, 190 * migrate_vma_setup(), and must not change the dst array after
193 * unmapped and checked (checked whether pinned or not). All pages that can be 191 * migrate_vma_pages() returns.
194 * migrated will have an entry in the src array set with the pfn value of the 192 */
195 * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set (other 193 unsigned long *dst;
196 * flags might be set but should be ignored by the callback). 194 unsigned long *src;
197 * 195 unsigned long cpages;
198 * The alloc_and_copy() callback can then allocate destination memory and copy 196 unsigned long npages;
199 * source memory to it for all those entries (ie with MIGRATE_PFN_VALID and 197 unsigned long start;
200 * MIGRATE_PFN_MIGRATE flag set). Once these are allocated and copied, the 198 unsigned long end;
201 * callback must update each corresponding entry in the dst array with the pfn
202 * value of the destination page and with the MIGRATE_PFN_VALID and
203 * MIGRATE_PFN_LOCKED flags set (destination pages must have their struct pages
204 * locked, via lock_page()).
205 *
206 * At this point the alloc_and_copy() callback is done and returns.
207 *
208 * Note that the callback does not have to migrate all the pages that are
209 * marked with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration
210 * from device memory to system memory (ie the MIGRATE_PFN_DEVICE flag is also
211 * set in the src array entry). If the device driver cannot migrate a device
212 * page back to system memory, then it must set the corresponding dst array
213 * entry to MIGRATE_PFN_ERROR. This will trigger a SIGBUS if CPU tries to
214 * access any of the virtual addresses originally backed by this page. Because
215 * a SIGBUS is such a severe result for the userspace process, the device
216 * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an
217 * unrecoverable state.
218 *
219 * For empty entry inside CPU page table (pte_none() or pmd_none() is true) we
220 * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
221 * allowing device driver to allocate device memory for those unback virtual
222 * address. For this the device driver simply have to allocate device memory
223 * and properly set the destination entry like for regular migration. Note that
224 * this can still fails and thus inside the device driver must check if the
225 * migration was successful for those entry inside the finalize_and_map()
226 * callback just like for regular migration.
227 *
228 * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES
229 * OR BAD THINGS WILL HAPPEN !
230 *
231 *
232 * The finalize_and_map() callback happens after struct page migration from
233 * source to destination (destination struct pages are the struct pages for the
234 * memory allocated by the alloc_and_copy() callback). Migration can fail, and
235 * thus the finalize_and_map() allows the driver to inspect which pages were
236 * successfully migrated, and which were not. Successfully migrated pages will
237 * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
238 *
239 * It is safe to update device page table from within the finalize_and_map()
240 * callback because both destination and source page are still locked, and the
241 * mmap_sem is held in read mode (hence no one can unmap the range being
242 * migrated).
243 *
244 * Once callback is done cleaning up things and updating its page table (if it
245 * chose to do so, this is not an obligation) then it returns. At this point,
246 * the HMM core will finish up the final steps, and the migration is complete.
247 *
248 * THE finalize_and_map() CALLBACK MUST NOT CHANGE ANY OF THE SRC OR DST ARRAY
249 * ENTRIES OR BAD THINGS WILL HAPPEN !
250 */
251struct migrate_vma_ops {
252 void (*alloc_and_copy)(struct vm_area_struct *vma,
253 const unsigned long *src,
254 unsigned long *dst,
255 unsigned long start,
256 unsigned long end,
257 void *private);
258 void (*finalize_and_map)(struct vm_area_struct *vma,
259 const unsigned long *src,
260 const unsigned long *dst,
261 unsigned long start,
262 unsigned long end,
263 void *private);
264}; 199};
265 200
266#if defined(CONFIG_MIGRATE_VMA_HELPER) 201int migrate_vma_setup(struct migrate_vma *args);
267int migrate_vma(const struct migrate_vma_ops *ops, 202void migrate_vma_pages(struct migrate_vma *migrate);
268 struct vm_area_struct *vma, 203void migrate_vma_finalize(struct migrate_vma *migrate);
269 unsigned long start,
270 unsigned long end,
271 unsigned long *src,
272 unsigned long *dst,
273 void *private);
274#else
275static inline int migrate_vma(const struct migrate_vma_ops *ops,
276 struct vm_area_struct *vma,
277 unsigned long start,
278 unsigned long end,
279 unsigned long *src,
280 unsigned long *dst,
281 void *private)
282{
283 return -EINVAL;
284}
285#endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */
286 204
287#endif /* CONFIG_MIGRATION */ 205#endif /* CONFIG_MIGRATION */
288 206
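
In place of migrate_vma() and its ops struct, the caller now drives the migration in three explicit steps around its own allocate-and-copy logic. A sketch of that flow, where example_alloc_and_copy() stands in for the driver's copy step and is not a kernel API:

	#include <linux/migrate.h>
	#include <linux/mm.h>

	static void example_alloc_and_copy(struct migrate_vma *args);	/* driver copy step (placeholder) */

	static int example_migrate(struct vm_area_struct *vma,
				   unsigned long start, unsigned long end,
				   unsigned long *src, unsigned long *dst)
	{
		struct migrate_vma args = {
			.vma	= vma,
			.start	= start,
			.end	= end,
			.src	= src,
			.dst	= dst,
		};
		int ret;

		ret = migrate_vma_setup(&args);	/* collect and isolate source pages */
		if (ret)
			return ret;

		if (args.cpages)
			example_alloc_and_copy(&args);	/* fill args.dst for pages to move */

		migrate_vma_pages(&args);	/* perform the struct page migration */
		migrate_vma_finalize(&args);	/* restore and unlock whatever did not move */
		return 0;
	}
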
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0334ca97c584..7cf955feb823 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1430,54 +1430,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address,
1430void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 1430void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
1431 unsigned long start, unsigned long end); 1431 unsigned long start, unsigned long end);
1432 1432
1433/**
1434 * mm_walk - callbacks for walk_page_range
1435 * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
1436 * this handler should only handle pud_trans_huge() puds.
1437 * the pmd_entry or pte_entry callbacks will be used for
1438 * regular PUDs.
1439 * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
1440 * this handler is required to be able to handle
1441 * pmd_trans_huge() pmds. They may simply choose to
1442 * split_huge_page() instead of handling it explicitly.
1443 * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
1444 * @pte_hole: if set, called for each hole at all levels
1445 * @hugetlb_entry: if set, called for each hugetlb entry
1446 * @test_walk: caller specific callback function to determine whether
1447 * we walk over the current vma or not. Returning 0
1448 * value means "do page table walk over the current vma,"
1449 * and a negative one means "abort current page table walk
1450 * right now." 1 means "skip the current vma."
1451 * @mm: mm_struct representing the target process of page table walk
1452 * @vma: vma currently walked (NULL if walking outside vmas)
1453 * @private: private data for callbacks' usage
1454 *
1455 * (see the comment on walk_page_range() for more details)
1456 */
1457struct mm_walk {
1458 int (*pud_entry)(pud_t *pud, unsigned long addr,
1459 unsigned long next, struct mm_walk *walk);
1460 int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
1461 unsigned long next, struct mm_walk *walk);
1462 int (*pte_entry)(pte_t *pte, unsigned long addr,
1463 unsigned long next, struct mm_walk *walk);
1464 int (*pte_hole)(unsigned long addr, unsigned long next,
1465 struct mm_walk *walk);
1466 int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
1467 unsigned long addr, unsigned long next,
1468 struct mm_walk *walk);
1469 int (*test_walk)(unsigned long addr, unsigned long next,
1470 struct mm_walk *walk);
1471 struct mm_struct *mm;
1472 struct vm_area_struct *vma;
1473 void *private;
1474};
1475
1476struct mmu_notifier_range; 1433struct mmu_notifier_range;
1477 1434
1478int walk_page_range(unsigned long addr, unsigned long end,
1479 struct mm_walk *walk);
1480int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk);
1481void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, 1435void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
1482 unsigned long end, unsigned long floor, unsigned long ceiling); 1436 unsigned long end, unsigned long floor, unsigned long ceiling);
1483int copy_page_range(struct mm_struct *dst, struct mm_struct *src, 1437int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6a7a1083b6fb..0b739f360cec 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -25,7 +25,6 @@
25 25
26struct address_space; 26struct address_space;
27struct mem_cgroup; 27struct mem_cgroup;
28struct hmm;
29 28
30/* 29/*
31 * Each physical page in the system has a struct page associated with 30 * Each physical page in the system has a struct page associated with
@@ -511,11 +510,6 @@ struct mm_struct {
511 atomic_long_t hugetlb_usage; 510 atomic_long_t hugetlb_usage;
512#endif 511#endif
513 struct work_struct async_put_work; 512 struct work_struct async_put_work;
514
515#ifdef CONFIG_HMM_MIRROR
516 /* HMM needs to track a few things per mm */
517 struct hmm *hmm;
518#endif
519 } __randomize_layout; 513 } __randomize_layout;
520 514
521 /* 515 /*
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index b6c004bd9f6a..1bd8e6a09a3c 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -42,6 +42,10 @@ enum mmu_notifier_event {
42 42
43#ifdef CONFIG_MMU_NOTIFIER 43#ifdef CONFIG_MMU_NOTIFIER
44 44
45#ifdef CONFIG_LOCKDEP
46extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
47#endif
48
45/* 49/*
46 * The mmu notifier_mm structure is allocated and installed in 50 * The mmu notifier_mm structure is allocated and installed in
47 * mm->mmu_notifier_mm inside the mm_take_all_locks() protected 51 * mm->mmu_notifier_mm inside the mm_take_all_locks() protected
@@ -211,6 +215,19 @@ struct mmu_notifier_ops {
211 */ 215 */
212 void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, 216 void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
213 unsigned long start, unsigned long end); 217 unsigned long start, unsigned long end);
218
219 /*
220 * These callbacks are used with the get/put interface to manage the
221 * lifetime of the mmu_notifier memory. alloc_notifier() returns a new
222 * notifier for use with the mm.
223 *
224 * free_notifier() is only called after the mmu_notifier has been
225 * fully put, calls to any ops callback are prevented and no ops
226 * callbacks are currently running. It is called from a SRCU callback
227 * and cannot sleep.
228 */
229 struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
230 void (*free_notifier)(struct mmu_notifier *mn);
214}; 231};
215 232
216/* 233/*
@@ -227,6 +244,9 @@ struct mmu_notifier_ops {
227struct mmu_notifier { 244struct mmu_notifier {
228 struct hlist_node hlist; 245 struct hlist_node hlist;
229 const struct mmu_notifier_ops *ops; 246 const struct mmu_notifier_ops *ops;
247 struct mm_struct *mm;
248 struct rcu_head rcu;
249 unsigned int users;
230}; 250};
231 251
232static inline int mm_has_notifiers(struct mm_struct *mm) 252static inline int mm_has_notifiers(struct mm_struct *mm)
@@ -234,14 +254,27 @@ static inline int mm_has_notifiers(struct mm_struct *mm)
234 return unlikely(mm->mmu_notifier_mm); 254 return unlikely(mm->mmu_notifier_mm);
235} 255}
236 256
257struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
258 struct mm_struct *mm);
259static inline struct mmu_notifier *
260mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
261{
262 struct mmu_notifier *ret;
263
264 down_write(&mm->mmap_sem);
265 ret = mmu_notifier_get_locked(ops, mm);
266 up_write(&mm->mmap_sem);
267 return ret;
268}
269void mmu_notifier_put(struct mmu_notifier *mn);
270void mmu_notifier_synchronize(void);
271
237extern int mmu_notifier_register(struct mmu_notifier *mn, 272extern int mmu_notifier_register(struct mmu_notifier *mn,
238 struct mm_struct *mm); 273 struct mm_struct *mm);
239extern int __mmu_notifier_register(struct mmu_notifier *mn, 274extern int __mmu_notifier_register(struct mmu_notifier *mn,
240 struct mm_struct *mm); 275 struct mm_struct *mm);
241extern void mmu_notifier_unregister(struct mmu_notifier *mn, 276extern void mmu_notifier_unregister(struct mmu_notifier *mn,
242 struct mm_struct *mm); 277 struct mm_struct *mm);
243extern void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
244 struct mm_struct *mm);
245extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); 278extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
246extern void __mmu_notifier_release(struct mm_struct *mm); 279extern void __mmu_notifier_release(struct mm_struct *mm);
247extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, 280extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
@@ -310,25 +343,36 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm,
310static inline void 343static inline void
311mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) 344mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
312{ 345{
346 might_sleep();
347
348 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
313 if (mm_has_notifiers(range->mm)) { 349 if (mm_has_notifiers(range->mm)) {
314 range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; 350 range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
315 __mmu_notifier_invalidate_range_start(range); 351 __mmu_notifier_invalidate_range_start(range);
316 } 352 }
353 lock_map_release(&__mmu_notifier_invalidate_range_start_map);
317} 354}
318 355
319static inline int 356static inline int
320mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) 357mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
321{ 358{
359 int ret = 0;
360
361 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
322 if (mm_has_notifiers(range->mm)) { 362 if (mm_has_notifiers(range->mm)) {
323 range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; 363 range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
324 return __mmu_notifier_invalidate_range_start(range); 364 ret = __mmu_notifier_invalidate_range_start(range);
325 } 365 }
326 return 0; 366 lock_map_release(&__mmu_notifier_invalidate_range_start_map);
367 return ret;
327} 368}
328 369
329static inline void 370static inline void
330mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) 371mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
331{ 372{
373 if (mmu_notifier_range_blockable(range))
374 might_sleep();
375
332 if (mm_has_notifiers(range->mm)) 376 if (mm_has_notifiers(range->mm))
333 __mmu_notifier_invalidate_range_end(range, false); 377 __mmu_notifier_invalidate_range_end(range, false);
334} 378}
@@ -482,9 +526,6 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
482 set_pte_at(___mm, ___address, __ptep, ___pte); \ 526 set_pte_at(___mm, ___address, __ptep, ___pte); \
483}) 527})
484 528
485extern void mmu_notifier_call_srcu(struct rcu_head *rcu,
486 void (*func)(struct rcu_head *rcu));
487
488#else /* CONFIG_MMU_NOTIFIER */ 529#else /* CONFIG_MMU_NOTIFIER */
489 530
490struct mmu_notifier_range { 531struct mmu_notifier_range {
@@ -581,6 +622,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
581#define pudp_huge_clear_flush_notify pudp_huge_clear_flush 622#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
582#define set_pte_at_notify set_pte_at 623#define set_pte_at_notify set_pte_at
583 624
625static inline void mmu_notifier_synchronize(void)
626{
627}
628
584#endif /* CONFIG_MMU_NOTIFIER */ 629#endif /* CONFIG_MMU_NOTIFIER */
585 630
586#endif /* _LINUX_MMU_NOTIFIER_H */ 631#endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
new file mode 100644
index 000000000000..bddd9759bab9
--- /dev/null
+++ b/include/linux/pagewalk.h
@@ -0,0 +1,66 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_PAGEWALK_H
3#define _LINUX_PAGEWALK_H
4
5#include <linux/mm.h>
6
7struct mm_walk;
8
9/**
10 * mm_walk_ops - callbacks for walk_page_range
11 * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
12 * this handler should only handle pud_trans_huge() puds.
13 * the pmd_entry or pte_entry callbacks will be used for
14 * regular PUDs.
15 * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
16 * this handler is required to be able to handle
17 * pmd_trans_huge() pmds. They may simply choose to
18 * split_huge_page() instead of handling it explicitly.
19 * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
20 * @pte_hole: if set, called for each hole at all levels
21 * @hugetlb_entry: if set, called for each hugetlb entry
22 * @test_walk: caller specific callback function to determine whether
23 * we walk over the current vma or not. Returning 0 means
24 * "do page table walk over the current vma", returning
25 * a negative value means "abort current page table walk
26 * right now" and returning 1 means "skip the current vma"
27 */
28struct mm_walk_ops {
29 int (*pud_entry)(pud_t *pud, unsigned long addr,
30 unsigned long next, struct mm_walk *walk);
31 int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
32 unsigned long next, struct mm_walk *walk);
33 int (*pte_entry)(pte_t *pte, unsigned long addr,
34 unsigned long next, struct mm_walk *walk);
35 int (*pte_hole)(unsigned long addr, unsigned long next,
36 struct mm_walk *walk);
37 int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
38 unsigned long addr, unsigned long next,
39 struct mm_walk *walk);
40 int (*test_walk)(unsigned long addr, unsigned long next,
41 struct mm_walk *walk);
42};
43
44/**
45 * mm_walk - walk_page_range data
46 * @ops: operation to call during the walk
47 * @mm: mm_struct representing the target process of page table walk
48 * @vma: vma currently walked (NULL if walking outside vmas)
49 * @private: private data for callbacks' usage
50 *
51 * (see the comment on walk_page_range() for more details)
52 */
53struct mm_walk {
54 const struct mm_walk_ops *ops;
55 struct mm_struct *mm;
56 struct vm_area_struct *vma;
57 void *private;
58};
59
60int walk_page_range(struct mm_struct *mm, unsigned long start,
61 unsigned long end, const struct mm_walk_ops *ops,
62 void *private);
63int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
64 void *private);
65
66#endif /* _LINUX_PAGEWALK_H */
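
Putting the new header to use means declaring a const mm_walk_ops and passing the mm and private pointer at call time instead of stuffing them into struct mm_walk, just as the fs/proc/task_mmu.c conversion above does. A minimal sketch; the present-PTE counter and its caller are illustrative only:

	#include <linux/pagewalk.h>
	#include <linux/mm.h>

	static int example_pte_entry(pte_t *pte, unsigned long addr,
				     unsigned long next, struct mm_walk *walk)
	{
		unsigned long *count = walk->private;

		if (pte_present(*pte))
			(*count)++;
		return 0;
	}

	static const struct mm_walk_ops example_walk_ops = {
		.pte_entry = example_pte_entry,
	};

	static unsigned long example_count_present(struct mm_struct *mm,
						   unsigned long start,
						   unsigned long end)
	{
		unsigned long count = 0;

		down_read(&mm->mmap_sem);	/* walk_page_range() expects mmap_sem held */
		walk_page_range(mm, start, end, &example_walk_ops, &count);
		up_read(&mm->mmap_sem);
		return count;
	}
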
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b75b28287005..70db597d6fd4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -958,6 +958,10 @@ struct task_struct {
958 struct mutex_waiter *blocked_on; 958 struct mutex_waiter *blocked_on;
959#endif 959#endif
960 960
961#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
962 int non_block_count;
963#endif
964
961#ifdef CONFIG_TRACE_IRQFLAGS 965#ifdef CONFIG_TRACE_IRQFLAGS
962 unsigned int irq_events; 966 unsigned int irq_events;
963 unsigned long hardirq_enable_ip; 967 unsigned long hardirq_enable_ip;
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 1052d0d62be7..a91b2af64ec4 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -42,7 +42,7 @@ struct ib_ucontext;
42struct ib_umem_odp; 42struct ib_umem_odp;
43 43
44struct ib_umem { 44struct ib_umem {
45 struct ib_ucontext *context; 45 struct ib_device *ibdev;
46 struct mm_struct *owning_mm; 46 struct mm_struct *owning_mm;
47 size_t length; 47 size_t length;
48 unsigned long address; 48 unsigned long address;
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 479db5c98ff6..253df1a1fa54 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -37,11 +37,6 @@
37#include <rdma/ib_verbs.h> 37#include <rdma/ib_verbs.h>
38#include <linux/interval_tree.h> 38#include <linux/interval_tree.h>
39 39
40struct umem_odp_node {
41 u64 __subtree_last;
42 struct rb_node rb;
43};
44
45struct ib_umem_odp { 40struct ib_umem_odp {
46 struct ib_umem umem; 41 struct ib_umem umem;
47 struct ib_ucontext_per_mm *per_mm; 42 struct ib_ucontext_per_mm *per_mm;
@@ -72,7 +67,15 @@ struct ib_umem_odp {
72 int npages; 67 int npages;
73 68
74 /* Tree tracking */ 69 /* Tree tracking */
75 struct umem_odp_node interval_tree; 70 struct interval_tree_node interval_tree;
71
72 /*
73 * An implicit odp umem cannot be DMA mapped, has 0 length, and serves
74 * only as an anchor for the driver to hold onto the per_mm. FIXME:
75 * This should be removed and drivers should work with the per_mm
76 * directly.
77 */
78 bool is_implicit_odp;
76 79
77 struct completion notifier_completion; 80 struct completion notifier_completion;
78 int dying; 81 int dying;
@@ -88,14 +91,13 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
88/* Returns the first page of an ODP umem. */ 91/* Returns the first page of an ODP umem. */
89static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp) 92static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp)
90{ 93{
91 return ALIGN_DOWN(umem_odp->umem.address, 1UL << umem_odp->page_shift); 94 return umem_odp->interval_tree.start;
92} 95}
93 96
94/* Returns the address of the page after the last one of an ODP umem. */ 97/* Returns the address of the page after the last one of an ODP umem. */
95static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp) 98static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp)
96{ 99{
97 return ALIGN(umem_odp->umem.address + umem_odp->umem.length, 100 return umem_odp->interval_tree.last + 1;
98 1UL << umem_odp->page_shift);
99} 101}
100 102
101static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) 103static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
@@ -120,25 +122,20 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
120#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING 122#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
121 123
122struct ib_ucontext_per_mm { 124struct ib_ucontext_per_mm {
123 struct ib_ucontext *context; 125 struct mmu_notifier mn;
124 struct mm_struct *mm;
125 struct pid *tgid; 126 struct pid *tgid;
126 bool active;
127 127
128 struct rb_root_cached umem_tree; 128 struct rb_root_cached umem_tree;
129 /* Protects umem_tree */ 129 /* Protects umem_tree */
130 struct rw_semaphore umem_rwsem; 130 struct rw_semaphore umem_rwsem;
131
132 struct mmu_notifier mn;
133 unsigned int odp_mrs_count;
134
135 struct list_head ucontext_list;
136 struct rcu_head rcu;
137}; 131};
138 132
139int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); 133struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
140struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root_umem, 134 size_t size, int access);
141 unsigned long addr, size_t size); 135struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
136 int access);
137struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem,
138 unsigned long addr, size_t size);
142void ib_umem_odp_release(struct ib_umem_odp *umem_odp); 139void ib_umem_odp_release(struct ib_umem_odp *umem_odp);
143 140
144int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, 141int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
@@ -163,8 +160,17 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
163 * Find first region intersecting with address range. 160 * Find first region intersecting with address range.
164 * Return NULL if not found 161 * Return NULL if not found
165 */ 162 */
166struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, 163static inline struct ib_umem_odp *
167 u64 addr, u64 length); 164rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length)
165{
166 struct interval_tree_node *node;
167
168 node = interval_tree_iter_first(root, addr, addr + length - 1);
169 if (!node)
170 return NULL;
171 return container_of(node, struct ib_umem_odp, interval_tree);
172
173}
168 174
169static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, 175static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp,
170 unsigned long mmu_seq) 176 unsigned long mmu_seq)
@@ -185,9 +191,11 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp,
185 191
186#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ 192#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
187 193
188static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) 194static inline struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata,
195 unsigned long addr,
196 size_t size, int access)
189{ 197{
190 return -EINVAL; 198 return ERR_PTR(-EINVAL);
191} 199}
192 200
193static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {} 201static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {}
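
For orientation, a minimal sketch of how a driver calls the reworked ODP API above; demo_get_odp() is a hypothetical helper, not code from this series, and error handling beyond the ERR_PTR check is elided:

#include <linux/err.h>
#include <rdma/ib_umem_odp.h>

/* Hypothetical helper: ib_umem_odp_get() now returns the umem (or an
 * ERR_PTR) instead of filling in a caller-allocated structure, and the
 * start/end helpers read the interval tree node embedded in the umem. */
static struct ib_umem_odp *demo_get_odp(struct ib_udata *udata,
					unsigned long addr, size_t size,
					int access)
{
	struct ib_umem_odp *odp;

	odp = ib_umem_odp_get(udata, addr, size, access);
	if (IS_ERR(odp))
		return odp;

	pr_debug("odp umem %#lx-%#lx, %zu pages\n",
		 ib_umem_start(odp), ib_umem_end(odp),
		 ib_umem_odp_num_pages(odp));
	return odp;
}
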
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 4f225175cb91..f659f4a02aa9 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1417,11 +1417,6 @@ struct ib_ucontext {
1417 1417
1418 bool cleanup_retryable; 1418 bool cleanup_retryable;
1419 1419
1420 void (*invalidate_range)(struct ib_umem_odp *umem_odp,
1421 unsigned long start, unsigned long end);
1422 struct mutex per_mm_list_lock;
1423 struct list_head per_mm_list;
1424
1425 struct ib_rdmacg_object cg_obj; 1420 struct ib_rdmacg_object cg_obj;
1426 /* 1421 /*
1427 * Implementation details of the RDMA core, don't use in drivers: 1422 * Implementation details of the RDMA core, don't use in drivers:
@@ -2378,6 +2373,8 @@ struct ib_device_ops {
2378 u64 iova); 2373 u64 iova);
2379 int (*unmap_fmr)(struct list_head *fmr_list); 2374 int (*unmap_fmr)(struct list_head *fmr_list);
2380 int (*dealloc_fmr)(struct ib_fmr *fmr); 2375 int (*dealloc_fmr)(struct ib_fmr *fmr);
2376 void (*invalidate_range)(struct ib_umem_odp *umem_odp,
2377 unsigned long start, unsigned long end);
2381 int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); 2378 int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
2382 int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); 2379 int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
2383 struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device, 2380 struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device,
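
The ib_verbs.h hunk above moves the ODP invalidate_range callback from the per-user ib_ucontext into ib_device_ops, so it is supplied once per driver. A sketch of the driver side, assuming hypothetical names (demo_invalidate_range, demo_dev_ops); the table would be attached with ib_set_device_ops() like the rest of the verbs callbacks:

#include <rdma/ib_verbs.h>
#include <rdma/ib_umem_odp.h>

/* Hypothetical callback: a real driver tears down device mappings that
 * overlap [start, end) for this ODP umem. */
static void demo_invalidate_range(struct ib_umem_odp *umem_odp,
				  unsigned long start, unsigned long end)
{
}

static const struct ib_device_ops demo_dev_ops = {
	/* ... other verbs callbacks ... */
	.invalidate_range = demo_invalidate_range,
};
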
diff --git a/kernel/fork.c b/kernel/fork.c
index 53e780748fe3..5a0fd518e04e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1009,7 +1009,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
1009 mm_init_owner(mm, p); 1009 mm_init_owner(mm, p);
1010 RCU_INIT_POINTER(mm->exe_file, NULL); 1010 RCU_INIT_POINTER(mm->exe_file, NULL);
1011 mmu_notifier_mm_init(mm); 1011 mmu_notifier_mm_init(mm);
1012 hmm_mm_init(mm);
1013 init_tlb_flush_pending(mm); 1012 init_tlb_flush_pending(mm);
1014#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 1013#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
1015 mm->pmd_huge_pte = NULL; 1014 mm->pmd_huge_pte = NULL;
diff --git a/kernel/resource.c b/kernel/resource.c
index 7ea4306503c5..74877e9d90ca 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1644,19 +1644,8 @@ void resource_list_free(struct list_head *head)
1644EXPORT_SYMBOL(resource_list_free); 1644EXPORT_SYMBOL(resource_list_free);
1645 1645
1646#ifdef CONFIG_DEVICE_PRIVATE 1646#ifdef CONFIG_DEVICE_PRIVATE
1647/** 1647static struct resource *__request_free_mem_region(struct device *dev,
1648 * devm_request_free_mem_region - find free region for device private memory 1648 struct resource *base, unsigned long size, const char *name)
1649 *
1650 * @dev: device struct to bind the resource to
1651 * @size: size in bytes of the device memory to add
1652 * @base: resource tree to look in
1653 *
1654 * This function tries to find an empty range of physical address big enough to
1655 * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
1656 * memory, which in turn allocates struct pages.
1657 */
1658struct resource *devm_request_free_mem_region(struct device *dev,
1659 struct resource *base, unsigned long size)
1660{ 1649{
1661 resource_size_t end, addr; 1650 resource_size_t end, addr;
1662 struct resource *res; 1651 struct resource *res;
@@ -1670,7 +1659,10 @@ struct resource *devm_request_free_mem_region(struct device *dev,
1670 REGION_DISJOINT) 1659 REGION_DISJOINT)
1671 continue; 1660 continue;
1672 1661
1673 res = devm_request_mem_region(dev, addr, size, dev_name(dev)); 1662 if (dev)
1663 res = devm_request_mem_region(dev, addr, size, name);
1664 else
1665 res = request_mem_region(addr, size, name);
1674 if (!res) 1666 if (!res)
1675 return ERR_PTR(-ENOMEM); 1667 return ERR_PTR(-ENOMEM);
1676 res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; 1668 res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
@@ -1679,7 +1671,32 @@ struct resource *devm_request_free_mem_region(struct device *dev,
1679 1671
1680 return ERR_PTR(-ERANGE); 1672 return ERR_PTR(-ERANGE);
1681} 1673}
1674
1675/**
1676 * devm_request_free_mem_region - find free region for device private memory
1677 *
1678 * @dev: device struct to bind the resource to
1679 * @size: size in bytes of the device memory to add
1680 * @base: resource tree to look in
1681 *
1682 * This function tries to find an empty range of physical address big enough to
1683 * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
1684 * memory, which in turn allocates struct pages.
1685 */
1686struct resource *devm_request_free_mem_region(struct device *dev,
1687 struct resource *base, unsigned long size)
1688{
1689 return __request_free_mem_region(dev, base, size, dev_name(dev));
1690}
1682EXPORT_SYMBOL_GPL(devm_request_free_mem_region); 1691EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
1692
1693struct resource *request_free_mem_region(struct resource *base,
1694 unsigned long size, const char *name)
1695{
1696 return __request_free_mem_region(NULL, base, size, name);
1697}
1698EXPORT_SYMBOL_GPL(request_free_mem_region);
1699
1683#endif /* CONFIG_DEVICE_PRIVATE */ 1700#endif /* CONFIG_DEVICE_PRIVATE */
1684 1701
1685static int __init strict_iomem(char *str) 1702static int __init strict_iomem(char *str)
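
The resource.c refactor above exists so that memremap users without a struct device can still reserve a free physical range. A short sketch of the two call styles; demo_reserve() is hypothetical and release of the non-devm region is left to the caller:

#include <linux/ioport.h>

static struct resource *demo_reserve(struct device *dev, unsigned long size)
{
	/* devm-managed: released automatically when the device goes away */
	if (dev)
		return devm_request_free_mem_region(dev, &iomem_resource, size);

	/* device-less variant added above: caller owns and must release it */
	return request_free_mem_region(&iomem_resource, size, "demo-private");
}
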
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5e8387bdd09c..f9a1346a5fa9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3871,13 +3871,22 @@ static noinline void __schedule_bug(struct task_struct *prev)
3871/* 3871/*
3872 * Various schedule()-time debugging checks and statistics: 3872 * Various schedule()-time debugging checks and statistics:
3873 */ 3873 */
3874static inline void schedule_debug(struct task_struct *prev) 3874static inline void schedule_debug(struct task_struct *prev, bool preempt)
3875{ 3875{
3876#ifdef CONFIG_SCHED_STACK_END_CHECK 3876#ifdef CONFIG_SCHED_STACK_END_CHECK
3877 if (task_stack_end_corrupted(prev)) 3877 if (task_stack_end_corrupted(prev))
3878 panic("corrupted stack end detected inside scheduler\n"); 3878 panic("corrupted stack end detected inside scheduler\n");
3879#endif 3879#endif
3880 3880
3881#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
3882 if (!preempt && prev->state && prev->non_block_count) {
3883 printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
3884 prev->comm, prev->pid, prev->non_block_count);
3885 dump_stack();
3886 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3887 }
3888#endif
3889
3881 if (unlikely(in_atomic_preempt_off())) { 3890 if (unlikely(in_atomic_preempt_off())) {
3882 __schedule_bug(prev); 3891 __schedule_bug(prev);
3883 preempt_count_set(PREEMPT_DISABLED); 3892 preempt_count_set(PREEMPT_DISABLED);
@@ -3989,7 +3998,7 @@ static void __sched notrace __schedule(bool preempt)
3989 rq = cpu_rq(cpu); 3998 rq = cpu_rq(cpu);
3990 prev = rq->curr; 3999 prev = rq->curr;
3991 4000
3992 schedule_debug(prev); 4001 schedule_debug(prev, preempt);
3993 4002
3994 if (sched_feat(HRTICK)) 4003 if (sched_feat(HRTICK))
3995 hrtick_clear(rq); 4004 hrtick_clear(rq);
@@ -6763,7 +6772,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
6763 rcu_sleep_check(); 6772 rcu_sleep_check();
6764 6773
6765 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 6774 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6766 !is_idle_task(current)) || 6775 !is_idle_task(current) && !current->non_block_count) ||
6767 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || 6776 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
6768 oops_in_progress) 6777 oops_in_progress)
6769 return; 6778 return;
@@ -6779,8 +6788,8 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
6779 "BUG: sleeping function called from invalid context at %s:%d\n", 6788 "BUG: sleeping function called from invalid context at %s:%d\n",
6780 file, line); 6789 file, line);
6781 printk(KERN_ERR 6790 printk(KERN_ERR
6782 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 6791 "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
6783 in_atomic(), irqs_disabled(), 6792 in_atomic(), irqs_disabled(), current->non_block_count,
6784 current->pid, current->comm); 6793 current->pid, current->comm);
6785 6794
6786 if (task_stack_end_corrupted(current)) 6795 if (task_stack_end_corrupted(current))
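
The scheduler hunks above key off current->non_block_count, which is raised and dropped by non_block_start()/non_block_end() added elsewhere in this series. A sketch of the intended annotation; demo_notifier_cb() is hypothetical, and the check only fires with CONFIG_DEBUG_ATOMIC_SLEEP:

#include <linux/kernel.h>

static void demo_notifier_cb(void)
{
	non_block_start();
	/*
	 * Code here may run with preemption enabled but must not call
	 * schedule(); the check added to schedule_debug() above prints the
	 * "scheduling in a non-blocking section" splat if it does.
	 */
	non_block_end();
}
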
diff --git a/mm/Kconfig b/mm/Kconfig
index 56cec636a1fc..2fe4902ad755 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -669,23 +669,17 @@ config ZONE_DEVICE
669 669
670 If FS_DAX is enabled, then say Y. 670 If FS_DAX is enabled, then say Y.
671 671
672config MIGRATE_VMA_HELPER
673 bool
674
675config DEV_PAGEMAP_OPS 672config DEV_PAGEMAP_OPS
676 bool 673 bool
677 674
675#
676# Helpers to mirror range of the CPU page tables of a process into device page
677# tables.
678#
678config HMM_MIRROR 679config HMM_MIRROR
679 bool "HMM mirror CPU page table into a device page table" 680 bool
680 depends on (X86_64 || PPC64) 681 depends on MMU
681 depends on MMU && 64BIT 682 depends on MMU_NOTIFIER
682 select MMU_NOTIFIER
683 help
684 Select HMM_MIRROR if you want to mirror range of the CPU page table of a
685 process into a device page table. Here, mirror means "keep synchronized".
686 Prerequisites: the device must provide the ability to write-protect its
687 page tables (at PAGE_SIZE granularity), and must be able to recover from
688 the resulting potential page faults.
689 683
690config DEVICE_PRIVATE 684config DEVICE_PRIVATE
691 bool "Unaddressable device memory (GPU memory, ...)" 685 bool "Unaddressable device memory (GPU memory, ...)"
diff --git a/mm/hmm.c b/mm/hmm.c
index 16b6731a34db..902f5fa6bf93 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -8,7 +8,7 @@
8 * Refer to include/linux/hmm.h for information about heterogeneous memory 8 * Refer to include/linux/hmm.h for information about heterogeneous memory
9 * management or HMM for short. 9 * management or HMM for short.
10 */ 10 */
11#include <linux/mm.h> 11#include <linux/pagewalk.h>
12#include <linux/hmm.h> 12#include <linux/hmm.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/rmap.h> 14#include <linux/rmap.h>
@@ -26,101 +26,37 @@
26#include <linux/mmu_notifier.h> 26#include <linux/mmu_notifier.h>
27#include <linux/memory_hotplug.h> 27#include <linux/memory_hotplug.h>
28 28
29static const struct mmu_notifier_ops hmm_mmu_notifier_ops; 29static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm)
30
31/**
32 * hmm_get_or_create - register HMM against an mm (HMM internal)
33 *
34 * @mm: mm struct to attach to
35 * Returns: returns an HMM object, either by referencing the existing
36 * (per-process) object, or by creating a new one.
37 *
38 * This is not intended to be used directly by device drivers. If mm already
39 * has an HMM struct then it get a reference on it and returns it. Otherwise
40 * it allocates an HMM struct, initializes it, associate it with the mm and
41 * returns it.
42 */
43static struct hmm *hmm_get_or_create(struct mm_struct *mm)
44{ 30{
45 struct hmm *hmm; 31 struct hmm *hmm;
46 32
47 lockdep_assert_held_write(&mm->mmap_sem); 33 hmm = kzalloc(sizeof(*hmm), GFP_KERNEL);
48
49 /* Abuse the page_table_lock to also protect mm->hmm. */
50 spin_lock(&mm->page_table_lock);
51 hmm = mm->hmm;
52 if (mm->hmm && kref_get_unless_zero(&mm->hmm->kref))
53 goto out_unlock;
54 spin_unlock(&mm->page_table_lock);
55
56 hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
57 if (!hmm) 34 if (!hmm)
58 return NULL; 35 return ERR_PTR(-ENOMEM);
36
59 init_waitqueue_head(&hmm->wq); 37 init_waitqueue_head(&hmm->wq);
60 INIT_LIST_HEAD(&hmm->mirrors); 38 INIT_LIST_HEAD(&hmm->mirrors);
61 init_rwsem(&hmm->mirrors_sem); 39 init_rwsem(&hmm->mirrors_sem);
62 hmm->mmu_notifier.ops = NULL;
63 INIT_LIST_HEAD(&hmm->ranges); 40 INIT_LIST_HEAD(&hmm->ranges);
64 spin_lock_init(&hmm->ranges_lock); 41 spin_lock_init(&hmm->ranges_lock);
65 kref_init(&hmm->kref);
66 hmm->notifiers = 0; 42 hmm->notifiers = 0;
67 hmm->mm = mm; 43 return &hmm->mmu_notifier;
68
69 hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
70 if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) {
71 kfree(hmm);
72 return NULL;
73 }
74
75 mmgrab(hmm->mm);
76
77 /*
78 * We hold the exclusive mmap_sem here so we know that mm->hmm is
79 * still NULL or 0 kref, and is safe to update.
80 */
81 spin_lock(&mm->page_table_lock);
82 mm->hmm = hmm;
83
84out_unlock:
85 spin_unlock(&mm->page_table_lock);
86 return hmm;
87} 44}
88 45
89static void hmm_free_rcu(struct rcu_head *rcu) 46static void hmm_free_notifier(struct mmu_notifier *mn)
90{ 47{
91 struct hmm *hmm = container_of(rcu, struct hmm, rcu); 48 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
92 49
93 mmdrop(hmm->mm); 50 WARN_ON(!list_empty(&hmm->ranges));
51 WARN_ON(!list_empty(&hmm->mirrors));
94 kfree(hmm); 52 kfree(hmm);
95} 53}
96 54
97static void hmm_free(struct kref *kref)
98{
99 struct hmm *hmm = container_of(kref, struct hmm, kref);
100
101 spin_lock(&hmm->mm->page_table_lock);
102 if (hmm->mm->hmm == hmm)
103 hmm->mm->hmm = NULL;
104 spin_unlock(&hmm->mm->page_table_lock);
105
106 mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm);
107 mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu);
108}
109
110static inline void hmm_put(struct hmm *hmm)
111{
112 kref_put(&hmm->kref, hmm_free);
113}
114
115static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) 55static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
116{ 56{
117 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); 57 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
118 struct hmm_mirror *mirror; 58 struct hmm_mirror *mirror;
119 59
120 /* Bail out if hmm is in the process of being freed */
121 if (!kref_get_unless_zero(&hmm->kref))
122 return;
123
124 /* 60 /*
125 * Since hmm_range_register() holds the mmget() lock hmm_release() is 61 * Since hmm_range_register() holds the mmget() lock hmm_release() is
126 * prevented as long as a range exists. 62 * prevented as long as a range exists.
@@ -137,8 +73,6 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
137 mirror->ops->release(mirror); 73 mirror->ops->release(mirror);
138 } 74 }
139 up_read(&hmm->mirrors_sem); 75 up_read(&hmm->mirrors_sem);
140
141 hmm_put(hmm);
142} 76}
143 77
144static void notifiers_decrement(struct hmm *hmm) 78static void notifiers_decrement(struct hmm *hmm)
@@ -165,23 +99,14 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
165{ 99{
166 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); 100 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
167 struct hmm_mirror *mirror; 101 struct hmm_mirror *mirror;
168 struct hmm_update update;
169 struct hmm_range *range; 102 struct hmm_range *range;
170 unsigned long flags; 103 unsigned long flags;
171 int ret = 0; 104 int ret = 0;
172 105
173 if (!kref_get_unless_zero(&hmm->kref))
174 return 0;
175
176 update.start = nrange->start;
177 update.end = nrange->end;
178 update.event = HMM_UPDATE_INVALIDATE;
179 update.blockable = mmu_notifier_range_blockable(nrange);
180
181 spin_lock_irqsave(&hmm->ranges_lock, flags); 106 spin_lock_irqsave(&hmm->ranges_lock, flags);
182 hmm->notifiers++; 107 hmm->notifiers++;
183 list_for_each_entry(range, &hmm->ranges, list) { 108 list_for_each_entry(range, &hmm->ranges, list) {
184 if (update.end < range->start || update.start >= range->end) 109 if (nrange->end < range->start || nrange->start >= range->end)
185 continue; 110 continue;
186 111
187 range->valid = false; 112 range->valid = false;
@@ -198,9 +123,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
198 list_for_each_entry(mirror, &hmm->mirrors, list) { 123 list_for_each_entry(mirror, &hmm->mirrors, list) {
199 int rc; 124 int rc;
200 125
201 rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update); 126 rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange);
202 if (rc) { 127 if (rc) {
203 if (WARN_ON(update.blockable || rc != -EAGAIN)) 128 if (WARN_ON(mmu_notifier_range_blockable(nrange) ||
129 rc != -EAGAIN))
204 continue; 130 continue;
205 ret = -EAGAIN; 131 ret = -EAGAIN;
206 break; 132 break;
@@ -211,7 +137,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn,
211out: 137out:
212 if (ret) 138 if (ret)
213 notifiers_decrement(hmm); 139 notifiers_decrement(hmm);
214 hmm_put(hmm);
215 return ret; 140 return ret;
216} 141}
217 142
@@ -220,17 +145,15 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn,
220{ 145{
221 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); 146 struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
222 147
223 if (!kref_get_unless_zero(&hmm->kref))
224 return;
225
226 notifiers_decrement(hmm); 148 notifiers_decrement(hmm);
227 hmm_put(hmm);
228} 149}
229 150
230static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { 151static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
231 .release = hmm_release, 152 .release = hmm_release,
232 .invalidate_range_start = hmm_invalidate_range_start, 153 .invalidate_range_start = hmm_invalidate_range_start,
233 .invalidate_range_end = hmm_invalidate_range_end, 154 .invalidate_range_end = hmm_invalidate_range_end,
155 .alloc_notifier = hmm_alloc_notifier,
156 .free_notifier = hmm_free_notifier,
234}; 157};
235 158
236/* 159/*
@@ -242,18 +165,27 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
242 * 165 *
243 * To start mirroring a process address space, the device driver must register 166 * To start mirroring a process address space, the device driver must register
244 * an HMM mirror struct. 167 * an HMM mirror struct.
168 *
169 * The caller cannot unregister the hmm_mirror while any ranges are
170 * registered.
171 *
172 * Callers using this function must put a call to mmu_notifier_synchronize()
173 * in their module exit functions.
245 */ 174 */
246int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) 175int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
247{ 176{
177 struct mmu_notifier *mn;
178
248 lockdep_assert_held_write(&mm->mmap_sem); 179 lockdep_assert_held_write(&mm->mmap_sem);
249 180
250 /* Sanity check */ 181 /* Sanity check */
251 if (!mm || !mirror || !mirror->ops) 182 if (!mm || !mirror || !mirror->ops)
252 return -EINVAL; 183 return -EINVAL;
253 184
254 mirror->hmm = hmm_get_or_create(mm); 185 mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm);
255 if (!mirror->hmm) 186 if (IS_ERR(mn))
256 return -ENOMEM; 187 return PTR_ERR(mn);
188 mirror->hmm = container_of(mn, struct hmm, mmu_notifier);
257 189
258 down_write(&mirror->hmm->mirrors_sem); 190 down_write(&mirror->hmm->mirrors_sem);
259 list_add(&mirror->list, &mirror->hmm->mirrors); 191 list_add(&mirror->list, &mirror->hmm->mirrors);
@@ -277,7 +209,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror)
277 down_write(&hmm->mirrors_sem); 209 down_write(&hmm->mirrors_sem);
278 list_del(&mirror->list); 210 list_del(&mirror->list);
279 up_write(&hmm->mirrors_sem); 211 up_write(&hmm->mirrors_sem);
280 hmm_put(hmm); 212 mmu_notifier_put(&hmm->mmu_notifier);
281} 213}
282EXPORT_SYMBOL(hmm_mirror_unregister); 214EXPORT_SYMBOL(hmm_mirror_unregister);
283 215
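
Per the comment added to hmm_mirror_register() above, mirrors now ride on the mmu_notifier get/put idiom and modules must call mmu_notifier_synchronize() on exit. A sketch of the caller-side pattern under those assumptions; all demo_* names are hypothetical and the optional release callback is omitted:

#include <linux/hmm.h>
#include <linux/mmu_notifier.h>
#include <linux/module.h>

static int demo_sync(struct hmm_mirror *mirror,
		     const struct mmu_notifier_range *nrange)
{
	/* invalidate device mappings covering nrange->start..nrange->end */
	return 0;
}

static const struct hmm_mirror_ops demo_mirror_ops = {
	.sync_cpu_device_pagetables = demo_sync,
};

static struct hmm_mirror demo_mirror = {
	.ops = &demo_mirror_ops,
};

static int demo_attach(struct mm_struct *mm)
{
	int ret;

	/* hmm_mirror_register() asserts mmap_sem is held for write */
	down_write(&mm->mmap_sem);
	ret = hmm_mirror_register(&demo_mirror, mm);
	up_write(&mm->mmap_sem);
	return ret;
}

static void __exit demo_exit(void)
{
	hmm_mirror_unregister(&demo_mirror);
	/* required: wait for the notifier's free_notifier() to finish */
	mmu_notifier_synchronize();
}
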
@@ -285,8 +217,7 @@ struct hmm_vma_walk {
285 struct hmm_range *range; 217 struct hmm_range *range;
286 struct dev_pagemap *pgmap; 218 struct dev_pagemap *pgmap;
287 unsigned long last; 219 unsigned long last;
288 bool fault; 220 unsigned int flags;
289 bool block;
290}; 221};
291 222
292static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, 223static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
@@ -298,17 +229,27 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
298 struct vm_area_struct *vma = walk->vma; 229 struct vm_area_struct *vma = walk->vma;
299 vm_fault_t ret; 230 vm_fault_t ret;
300 231
301 flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; 232 if (!vma)
302 flags |= write_fault ? FAULT_FLAG_WRITE : 0; 233 goto err;
234
235 if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY)
236 flags |= FAULT_FLAG_ALLOW_RETRY;
237 if (write_fault)
238 flags |= FAULT_FLAG_WRITE;
239
303 ret = handle_mm_fault(vma, addr, flags); 240 ret = handle_mm_fault(vma, addr, flags);
304 if (ret & VM_FAULT_RETRY) 241 if (ret & VM_FAULT_RETRY) {
242 /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */
305 return -EAGAIN; 243 return -EAGAIN;
306 if (ret & VM_FAULT_ERROR) {
307 *pfn = range->values[HMM_PFN_ERROR];
308 return -EFAULT;
309 } 244 }
245 if (ret & VM_FAULT_ERROR)
246 goto err;
310 247
311 return -EBUSY; 248 return -EBUSY;
249
250err:
251 *pfn = range->values[HMM_PFN_ERROR];
252 return -EFAULT;
312} 253}
313 254
314static int hmm_pfns_bad(unsigned long addr, 255static int hmm_pfns_bad(unsigned long addr,
@@ -328,8 +269,8 @@ static int hmm_pfns_bad(unsigned long addr,
328} 269}
329 270
330/* 271/*
331 * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) 272 * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
332 * @start: range virtual start address (inclusive) 273 * @addr: range virtual start address (inclusive)
333 * @end: range virtual end address (exclusive) 274 * @end: range virtual end address (exclusive)
334 * @fault: should we fault or not ? 275 * @fault: should we fault or not ?
335 * @write_fault: write fault ? 276 * @write_fault: write fault ?
@@ -346,13 +287,15 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
346 struct hmm_vma_walk *hmm_vma_walk = walk->private; 287 struct hmm_vma_walk *hmm_vma_walk = walk->private;
347 struct hmm_range *range = hmm_vma_walk->range; 288 struct hmm_range *range = hmm_vma_walk->range;
348 uint64_t *pfns = range->pfns; 289 uint64_t *pfns = range->pfns;
349 unsigned long i, page_size; 290 unsigned long i;
350 291
351 hmm_vma_walk->last = addr; 292 hmm_vma_walk->last = addr;
352 page_size = hmm_range_page_size(range); 293 i = (addr - range->start) >> PAGE_SHIFT;
353 i = (addr - range->start) >> range->page_shift; 294
295 if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
296 return -EPERM;
354 297
355 for (; addr < end; addr += page_size, i++) { 298 for (; addr < end; addr += PAGE_SIZE, i++) {
356 pfns[i] = range->values[HMM_PFN_NONE]; 299 pfns[i] = range->values[HMM_PFN_NONE];
357 if (fault || write_fault) { 300 if (fault || write_fault) {
358 int ret; 301 int ret;
@@ -373,15 +316,15 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
373{ 316{
374 struct hmm_range *range = hmm_vma_walk->range; 317 struct hmm_range *range = hmm_vma_walk->range;
375 318
376 if (!hmm_vma_walk->fault) 319 if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
377 return; 320 return;
378 321
379 /* 322 /*
380 * So we not only consider the individual per page request we also 323 * So we not only consider the individual per page request we also
381 * consider the default flags requested for the range. The API can 324 * consider the default flags requested for the range. The API can
382 * be use in 2 fashions. The first one where the HMM user coalesce 325 * be used 2 ways. The first one where the HMM user coalesces
383 * multiple page fault into one request and set flags per pfns for 326 * multiple page faults into one request and sets flags per pfn for
384 * of those faults. The second one where the HMM user want to pre- 327 * those faults. The second one where the HMM user wants to pre-
385 * fault a range with specific flags. For the latter one it is a 328 * fault a range with specific flags. For the latter one it is a
386 * waste to have the user pre-fill the pfn arrays with a default 329 * waste to have the user pre-fill the pfn arrays with a default
387 * flags value. 330 * flags value.
@@ -391,7 +334,7 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
391 /* We aren't ask to do anything ... */ 334 /* We aren't ask to do anything ... */
392 if (!(pfns & range->flags[HMM_PFN_VALID])) 335 if (!(pfns & range->flags[HMM_PFN_VALID]))
393 return; 336 return;
394 /* If this is device memory than only fault if explicitly requested */ 337 /* If this is device memory then only fault if explicitly requested */
395 if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { 338 if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
396 /* Do we fault on device memory ? */ 339 /* Do we fault on device memory ? */
397 if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { 340 if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
@@ -418,7 +361,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
418{ 361{
419 unsigned long i; 362 unsigned long i;
420 363
421 if (!hmm_vma_walk->fault) { 364 if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
422 *fault = *write_fault = false; 365 *fault = *write_fault = false;
423 return; 366 return;
424 } 367 }
@@ -458,22 +401,10 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
458 range->flags[HMM_PFN_VALID]; 401 range->flags[HMM_PFN_VALID];
459} 402}
460 403
461static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
462{
463 if (!pud_present(pud))
464 return 0;
465 return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
466 range->flags[HMM_PFN_WRITE] :
467 range->flags[HMM_PFN_VALID];
468}
469
470static int hmm_vma_handle_pmd(struct mm_walk *walk,
471 unsigned long addr,
472 unsigned long end,
473 uint64_t *pfns,
474 pmd_t pmd)
475{
476#ifdef CONFIG_TRANSPARENT_HUGEPAGE 404#ifdef CONFIG_TRANSPARENT_HUGEPAGE
405static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
406 unsigned long end, uint64_t *pfns, pmd_t pmd)
407{
477 struct hmm_vma_walk *hmm_vma_walk = walk->private; 408 struct hmm_vma_walk *hmm_vma_walk = walk->private;
478 struct hmm_range *range = hmm_vma_walk->range; 409 struct hmm_range *range = hmm_vma_walk->range;
479 unsigned long pfn, npages, i; 410 unsigned long pfn, npages, i;
@@ -488,7 +419,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
488 if (pmd_protnone(pmd) || fault || write_fault) 419 if (pmd_protnone(pmd) || fault || write_fault)
489 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 420 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
490 421
491 pfn = pmd_pfn(pmd) + pte_index(addr); 422 pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
492 for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { 423 for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
493 if (pmd_devmap(pmd)) { 424 if (pmd_devmap(pmd)) {
494 hmm_vma_walk->pgmap = get_dev_pagemap(pfn, 425 hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
@@ -504,11 +435,12 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
504 } 435 }
505 hmm_vma_walk->last = end; 436 hmm_vma_walk->last = end;
506 return 0; 437 return 0;
507#else
508 /* If THP is not enabled then we should never reach that code ! */
509 return -EINVAL;
510#endif
511} 438}
439#else /* CONFIG_TRANSPARENT_HUGEPAGE */
440/* stub to allow the code below to compile */
441int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
442 unsigned long end, uint64_t *pfns, pmd_t pmd);
443#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
512 444
513static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) 445static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
514{ 446{
@@ -525,7 +457,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
525{ 457{
526 struct hmm_vma_walk *hmm_vma_walk = walk->private; 458 struct hmm_vma_walk *hmm_vma_walk = walk->private;
527 struct hmm_range *range = hmm_vma_walk->range; 459 struct hmm_range *range = hmm_vma_walk->range;
528 struct vm_area_struct *vma = walk->vma;
529 bool fault, write_fault; 460 bool fault, write_fault;
530 uint64_t cpu_flags; 461 uint64_t cpu_flags;
531 pte_t pte = *ptep; 462 pte_t pte = *ptep;
@@ -546,6 +477,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
546 swp_entry_t entry = pte_to_swp_entry(pte); 477 swp_entry_t entry = pte_to_swp_entry(pte);
547 478
548 if (!non_swap_entry(entry)) { 479 if (!non_swap_entry(entry)) {
480 cpu_flags = pte_to_hmm_pfn_flags(range, pte);
481 hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
482 &fault, &write_fault);
549 if (fault || write_fault) 483 if (fault || write_fault)
550 goto fault; 484 goto fault;
551 return 0; 485 return 0;
@@ -574,8 +508,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
574 if (fault || write_fault) { 508 if (fault || write_fault) {
575 pte_unmap(ptep); 509 pte_unmap(ptep);
576 hmm_vma_walk->last = addr; 510 hmm_vma_walk->last = addr;
577 migration_entry_wait(vma->vm_mm, 511 migration_entry_wait(walk->mm, pmdp, addr);
578 pmdp, addr);
579 return -EBUSY; 512 return -EBUSY;
580 } 513 }
581 return 0; 514 return 0;
@@ -623,21 +556,16 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
623{ 556{
624 struct hmm_vma_walk *hmm_vma_walk = walk->private; 557 struct hmm_vma_walk *hmm_vma_walk = walk->private;
625 struct hmm_range *range = hmm_vma_walk->range; 558 struct hmm_range *range = hmm_vma_walk->range;
626 struct vm_area_struct *vma = walk->vma;
627 uint64_t *pfns = range->pfns; 559 uint64_t *pfns = range->pfns;
628 unsigned long addr = start, i; 560 unsigned long addr = start, i;
629 pte_t *ptep; 561 pte_t *ptep;
630 pmd_t pmd; 562 pmd_t pmd;
631 563
632
633again: 564again:
634 pmd = READ_ONCE(*pmdp); 565 pmd = READ_ONCE(*pmdp);
635 if (pmd_none(pmd)) 566 if (pmd_none(pmd))
636 return hmm_vma_walk_hole(start, end, walk); 567 return hmm_vma_walk_hole(start, end, walk);
637 568
638 if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
639 return hmm_pfns_bad(start, end, walk);
640
641 if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { 569 if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
642 bool fault, write_fault; 570 bool fault, write_fault;
643 unsigned long npages; 571 unsigned long npages;
@@ -651,7 +579,7 @@ again:
651 0, &fault, &write_fault); 579 0, &fault, &write_fault);
652 if (fault || write_fault) { 580 if (fault || write_fault) {
653 hmm_vma_walk->last = addr; 581 hmm_vma_walk->last = addr;
654 pmd_migration_entry_wait(vma->vm_mm, pmdp); 582 pmd_migration_entry_wait(walk->mm, pmdp);
655 return -EBUSY; 583 return -EBUSY;
656 } 584 }
657 return 0; 585 return 0;
@@ -660,11 +588,11 @@ again:
660 588
661 if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { 589 if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
662 /* 590 /*
663 * No need to take pmd_lock here, even if some other threads 591 * No need to take pmd_lock here, even if some other thread
664 * is splitting the huge pmd we will get that event through 592 * is splitting the huge pmd we will get that event through
665 * mmu_notifier callback. 593 * mmu_notifier callback.
666 * 594 *
667 * So just read pmd value and check again its a transparent 595 * So just read pmd value and check again it's a transparent
668 * huge or device mapping one and compute corresponding pfn 596 * huge or device mapping one and compute corresponding pfn
669 * values. 597 * values.
670 */ 598 */
@@ -678,7 +606,7 @@ again:
678 } 606 }
679 607
680 /* 608 /*
681 * We have handled all the valid case above ie either none, migration, 609 * We have handled all the valid cases above ie either none, migration,
682 * huge or transparent huge. At this point either it is a valid pmd 610 * huge or transparent huge. At this point either it is a valid pmd
683 * entry pointing to pte directory or it is a bad pmd that will not 611 * entry pointing to pte directory or it is a bad pmd that will not
684 * recover. 612 * recover.
@@ -714,10 +642,19 @@ again:
714 return 0; 642 return 0;
715} 643}
716 644
717static int hmm_vma_walk_pud(pud_t *pudp, 645#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
718 unsigned long start, 646 defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
719 unsigned long end, 647static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
720 struct mm_walk *walk) 648{
649 if (!pud_present(pud))
650 return 0;
651 return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
652 range->flags[HMM_PFN_WRITE] :
653 range->flags[HMM_PFN_VALID];
654}
655
656static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
657 struct mm_walk *walk)
721{ 658{
722 struct hmm_vma_walk *hmm_vma_walk = walk->private; 659 struct hmm_vma_walk *hmm_vma_walk = walk->private;
723 struct hmm_range *range = hmm_vma_walk->range; 660 struct hmm_range *range = hmm_vma_walk->range;
@@ -781,42 +718,29 @@ again:
781 718
782 return 0; 719 return 0;
783} 720}
721#else
722#define hmm_vma_walk_pud NULL
723#endif
784 724
725#ifdef CONFIG_HUGETLB_PAGE
785static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, 726static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
786 unsigned long start, unsigned long end, 727 unsigned long start, unsigned long end,
787 struct mm_walk *walk) 728 struct mm_walk *walk)
788{ 729{
789#ifdef CONFIG_HUGETLB_PAGE 730 unsigned long addr = start, i, pfn;
790 unsigned long addr = start, i, pfn, mask, size, pfn_inc;
791 struct hmm_vma_walk *hmm_vma_walk = walk->private; 731 struct hmm_vma_walk *hmm_vma_walk = walk->private;
792 struct hmm_range *range = hmm_vma_walk->range; 732 struct hmm_range *range = hmm_vma_walk->range;
793 struct vm_area_struct *vma = walk->vma; 733 struct vm_area_struct *vma = walk->vma;
794 struct hstate *h = hstate_vma(vma);
795 uint64_t orig_pfn, cpu_flags; 734 uint64_t orig_pfn, cpu_flags;
796 bool fault, write_fault; 735 bool fault, write_fault;
797 spinlock_t *ptl; 736 spinlock_t *ptl;
798 pte_t entry; 737 pte_t entry;
799 int ret = 0; 738 int ret = 0;
800 739
801 size = 1UL << huge_page_shift(h); 740 ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
802 mask = size - 1;
803 if (range->page_shift != PAGE_SHIFT) {
804 /* Make sure we are looking at full page. */
805 if (start & mask)
806 return -EINVAL;
807 if (end < (start + size))
808 return -EINVAL;
809 pfn_inc = size >> PAGE_SHIFT;
810 } else {
811 pfn_inc = 1;
812 size = PAGE_SIZE;
813 }
814
815
816 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
817 entry = huge_ptep_get(pte); 741 entry = huge_ptep_get(pte);
818 742
819 i = (start - range->start) >> range->page_shift; 743 i = (start - range->start) >> PAGE_SHIFT;
820 orig_pfn = range->pfns[i]; 744 orig_pfn = range->pfns[i];
821 range->pfns[i] = range->values[HMM_PFN_NONE]; 745 range->pfns[i] = range->values[HMM_PFN_NONE];
822 cpu_flags = pte_to_hmm_pfn_flags(range, entry); 746 cpu_flags = pte_to_hmm_pfn_flags(range, entry);
@@ -828,8 +752,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
828 goto unlock; 752 goto unlock;
829 } 753 }
830 754
831 pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift); 755 pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
832 for (; addr < end; addr += size, i++, pfn += pfn_inc) 756 for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
833 range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | 757 range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
834 cpu_flags; 758 cpu_flags;
835 hmm_vma_walk->last = end; 759 hmm_vma_walk->last = end;
@@ -841,10 +765,10 @@ unlock:
841 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 765 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
842 766
843 return ret; 767 return ret;
844#else /* CONFIG_HUGETLB_PAGE */
845 return -EINVAL;
846#endif
847} 768}
769#else
770#define hmm_vma_walk_hugetlb_entry NULL
771#endif /* CONFIG_HUGETLB_PAGE */
848 772
849static void hmm_pfns_clear(struct hmm_range *range, 773static void hmm_pfns_clear(struct hmm_range *range,
850 uint64_t *pfns, 774 uint64_t *pfns,
@@ -859,44 +783,32 @@ static void hmm_pfns_clear(struct hmm_range *range,
859 * hmm_range_register() - start tracking change to CPU page table over a range 783 * hmm_range_register() - start tracking change to CPU page table over a range
860 * @range: range 784 * @range: range
861 * @mm: the mm struct for the range of virtual address 785 * @mm: the mm struct for the range of virtual address
862 * @start: start virtual address (inclusive) 786 *
863 * @end: end virtual address (exclusive) 787 * Return: 0 on success, -EFAULT if the address space is no longer valid
864 * @page_shift: expect page shift for the range
865 * Returns 0 on success, -EFAULT if the address space is no longer valid
866 * 788 *
867 * Track updates to the CPU page table see include/linux/hmm.h 789 * Track updates to the CPU page table see include/linux/hmm.h
868 */ 790 */
869int hmm_range_register(struct hmm_range *range, 791int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror)
870 struct hmm_mirror *mirror,
871 unsigned long start,
872 unsigned long end,
873 unsigned page_shift)
874{ 792{
875 unsigned long mask = ((1UL << page_shift) - 1UL);
876 struct hmm *hmm = mirror->hmm; 793 struct hmm *hmm = mirror->hmm;
877 unsigned long flags; 794 unsigned long flags;
878 795
879 range->valid = false; 796 range->valid = false;
880 range->hmm = NULL; 797 range->hmm = NULL;
881 798
882 if ((start & mask) || (end & mask)) 799 if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1)))
883 return -EINVAL; 800 return -EINVAL;
884 if (start >= end) 801 if (range->start >= range->end)
885 return -EINVAL; 802 return -EINVAL;
886 803
887 range->page_shift = page_shift;
888 range->start = start;
889 range->end = end;
890
891 /* Prevent hmm_release() from running while the range is valid */ 804 /* Prevent hmm_release() from running while the range is valid */
892 if (!mmget_not_zero(hmm->mm)) 805 if (!mmget_not_zero(hmm->mmu_notifier.mm))
893 return -EFAULT; 806 return -EFAULT;
894 807
895 /* Initialize range to track CPU page table updates. */ 808 /* Initialize range to track CPU page table updates. */
896 spin_lock_irqsave(&hmm->ranges_lock, flags); 809 spin_lock_irqsave(&hmm->ranges_lock, flags);
897 810
898 range->hmm = hmm; 811 range->hmm = hmm;
899 kref_get(&hmm->kref);
900 list_add(&range->list, &hmm->ranges); 812 list_add(&range->list, &hmm->ranges);
901 813
902 /* 814 /*
@@ -928,8 +840,7 @@ void hmm_range_unregister(struct hmm_range *range)
928 spin_unlock_irqrestore(&hmm->ranges_lock, flags); 840 spin_unlock_irqrestore(&hmm->ranges_lock, flags);
929 841
930 /* Drop reference taken by hmm_range_register() */ 842 /* Drop reference taken by hmm_range_register() */
931 mmput(hmm->mm); 843 mmput(hmm->mmu_notifier.mm);
932 hmm_put(hmm);
933 844
934 /* 845 /*
935 * The range is now invalid and the ref on the hmm is dropped, so 846 * The range is now invalid and the ref on the hmm is dropped, so
@@ -941,105 +852,33 @@ void hmm_range_unregister(struct hmm_range *range)
941} 852}
942EXPORT_SYMBOL(hmm_range_unregister); 853EXPORT_SYMBOL(hmm_range_unregister);
943 854
944/* 855static const struct mm_walk_ops hmm_walk_ops = {
945 * hmm_range_snapshot() - snapshot CPU page table for a range 856 .pud_entry = hmm_vma_walk_pud,
946 * @range: range 857 .pmd_entry = hmm_vma_walk_pmd,
947 * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid 858 .pte_hole = hmm_vma_walk_hole,
948 * permission (for instance asking for write and range is read only), 859 .hugetlb_entry = hmm_vma_walk_hugetlb_entry,
949 * -EBUSY if you need to retry, -EFAULT invalid (ie either no valid 860};
950 * vma or it is illegal to access that range), number of valid pages
951 * in range->pfns[] (from range start address).
952 *
953 * This snapshots the CPU page table for a range of virtual addresses. Snapshot
954 * validity is tracked by range struct. See in include/linux/hmm.h for example
955 * on how to use.
956 */
957long hmm_range_snapshot(struct hmm_range *range)
958{
959 const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
960 unsigned long start = range->start, end;
961 struct hmm_vma_walk hmm_vma_walk;
962 struct hmm *hmm = range->hmm;
963 struct vm_area_struct *vma;
964 struct mm_walk mm_walk;
965
966 lockdep_assert_held(&hmm->mm->mmap_sem);
967 do {
968 /* If range is no longer valid force retry. */
969 if (!range->valid)
970 return -EBUSY;
971
972 vma = find_vma(hmm->mm, start);
973 if (vma == NULL || (vma->vm_flags & device_vma))
974 return -EFAULT;
975
976 if (is_vm_hugetlb_page(vma)) {
977 if (huge_page_shift(hstate_vma(vma)) !=
978 range->page_shift &&
979 range->page_shift != PAGE_SHIFT)
980 return -EINVAL;
981 } else {
982 if (range->page_shift != PAGE_SHIFT)
983 return -EINVAL;
984 }
985
986 if (!(vma->vm_flags & VM_READ)) {
987 /*
988 * If vma do not allow read access, then assume that it
989 * does not allow write access, either. HMM does not
990 * support architecture that allow write without read.
991 */
992 hmm_pfns_clear(range, range->pfns,
993 range->start, range->end);
994 return -EPERM;
995 }
996
997 range->vma = vma;
998 hmm_vma_walk.pgmap = NULL;
999 hmm_vma_walk.last = start;
1000 hmm_vma_walk.fault = false;
1001 hmm_vma_walk.range = range;
1002 mm_walk.private = &hmm_vma_walk;
1003 end = min(range->end, vma->vm_end);
1004
1005 mm_walk.vma = vma;
1006 mm_walk.mm = vma->vm_mm;
1007 mm_walk.pte_entry = NULL;
1008 mm_walk.test_walk = NULL;
1009 mm_walk.hugetlb_entry = NULL;
1010 mm_walk.pud_entry = hmm_vma_walk_pud;
1011 mm_walk.pmd_entry = hmm_vma_walk_pmd;
1012 mm_walk.pte_hole = hmm_vma_walk_hole;
1013 mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
1014
1015 walk_page_range(start, end, &mm_walk);
1016 start = end;
1017 } while (start < range->end);
1018
1019 return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
1020}
1021EXPORT_SYMBOL(hmm_range_snapshot);
1022 861
1023/* 862/**
1024 * hmm_range_fault() - try to fault some address in a virtual address range 863 * hmm_range_fault - try to fault some address in a virtual address range
1025 * @range: range being faulted 864 * @range: range being faulted
1026 * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) 865 * @flags: HMM_FAULT_* flags
1027 * Return: number of valid pages in range->pfns[] (from range start
1028 * address). This may be zero. If the return value is negative,
1029 * then one of the following values may be returned:
1030 * 866 *
1031 * -EINVAL invalid arguments or mm or virtual address are in an 867 * Return: the number of valid pages in range->pfns[] (from range start
1032 * invalid vma (for instance device file vma). 868 * address), which may be zero. On error one of the following status codes
1033 * -ENOMEM: Out of memory. 869 * can be returned:
1034 * -EPERM: Invalid permission (for instance asking for write and 870 *
1035 * range is read only). 871 * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma
1036 * -EAGAIN: If you need to retry and mmap_sem was drop. This can only 872 * (e.g., device file vma).
1037 * happens if block argument is false. 873 * -ENOMEM: Out of memory.
1038 * -EBUSY: If the the range is being invalidated and you should wait 874 * -EPERM: Invalid permission (e.g., asking for write and range is read
1039 * for invalidation to finish. 875 * only).
1040 * -EFAULT: Invalid (ie either no valid vma or it is illegal to access 876 * -EAGAIN: A page fault needs to be retried and mmap_sem was dropped.
1041 * that range), number of valid pages in range->pfns[] (from 877 * -EBUSY: The range has been invalidated and the caller needs to wait for
1042 * range start address). 878 * the invalidation to finish.
879 * -EFAULT: Invalid (i.e., either no valid vma or it is illegal to access
880 * that range) number of valid pages in range->pfns[] (from
881 * range start address).
1043 * 882 *
1044 * This is similar to a regular CPU page fault except that it will not trigger 883 * This is similar to a regular CPU page fault except that it will not trigger
1045 * any memory migration if the memory being faulted is not accessible by CPUs 884 * any memory migration if the memory being faulted is not accessible by CPUs
@@ -1048,37 +887,26 @@ EXPORT_SYMBOL(hmm_range_snapshot);
1048 * On error, for one virtual address in the range, the function will mark the 887 * On error, for one virtual address in the range, the function will mark the
1049 * corresponding HMM pfn entry with an error flag. 888 * corresponding HMM pfn entry with an error flag.
1050 */ 889 */
1051long hmm_range_fault(struct hmm_range *range, bool block) 890long hmm_range_fault(struct hmm_range *range, unsigned int flags)
1052{ 891{
1053 const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; 892 const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
1054 unsigned long start = range->start, end; 893 unsigned long start = range->start, end;
1055 struct hmm_vma_walk hmm_vma_walk; 894 struct hmm_vma_walk hmm_vma_walk;
1056 struct hmm *hmm = range->hmm; 895 struct hmm *hmm = range->hmm;
1057 struct vm_area_struct *vma; 896 struct vm_area_struct *vma;
1058 struct mm_walk mm_walk;
1059 int ret; 897 int ret;
1060 898
1061 lockdep_assert_held(&hmm->mm->mmap_sem); 899 lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem);
1062 900
1063 do { 901 do {
1064 /* If range is no longer valid force retry. */ 902 /* If range is no longer valid force retry. */
1065 if (!range->valid) 903 if (!range->valid)
1066 return -EBUSY; 904 return -EBUSY;
1067 905
1068 vma = find_vma(hmm->mm, start); 906 vma = find_vma(hmm->mmu_notifier.mm, start);
1069 if (vma == NULL || (vma->vm_flags & device_vma)) 907 if (vma == NULL || (vma->vm_flags & device_vma))
1070 return -EFAULT; 908 return -EFAULT;
1071 909
1072 if (is_vm_hugetlb_page(vma)) {
1073 if (huge_page_shift(hstate_vma(vma)) !=
1074 range->page_shift &&
1075 range->page_shift != PAGE_SHIFT)
1076 return -EINVAL;
1077 } else {
1078 if (range->page_shift != PAGE_SHIFT)
1079 return -EINVAL;
1080 }
1081
1082 if (!(vma->vm_flags & VM_READ)) { 910 if (!(vma->vm_flags & VM_READ)) {
1083 /* 911 /*
1084 * If vma do not allow read access, then assume that it 912 * If vma do not allow read access, then assume that it
@@ -1090,27 +918,18 @@ long hmm_range_fault(struct hmm_range *range, bool block)
1090 return -EPERM; 918 return -EPERM;
1091 } 919 }
1092 920
1093 range->vma = vma;
1094 hmm_vma_walk.pgmap = NULL; 921 hmm_vma_walk.pgmap = NULL;
1095 hmm_vma_walk.last = start; 922 hmm_vma_walk.last = start;
1096 hmm_vma_walk.fault = true; 923 hmm_vma_walk.flags = flags;
1097 hmm_vma_walk.block = block;
1098 hmm_vma_walk.range = range; 924 hmm_vma_walk.range = range;
1099 mm_walk.private = &hmm_vma_walk;
1100 end = min(range->end, vma->vm_end); 925 end = min(range->end, vma->vm_end);
1101 926
1102 mm_walk.vma = vma; 927 walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops,
1103 mm_walk.mm = vma->vm_mm; 928 &hmm_vma_walk);
1104 mm_walk.pte_entry = NULL;
1105 mm_walk.test_walk = NULL;
1106 mm_walk.hugetlb_entry = NULL;
1107 mm_walk.pud_entry = hmm_vma_walk_pud;
1108 mm_walk.pmd_entry = hmm_vma_walk_pmd;
1109 mm_walk.pte_hole = hmm_vma_walk_hole;
1110 mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
1111 929
1112 do { 930 do {
1113 ret = walk_page_range(start, end, &mm_walk); 931 ret = walk_page_range(vma->vm_mm, start, end,
932 &hmm_walk_ops, &hmm_vma_walk);
1114 start = hmm_vma_walk.last; 933 start = hmm_vma_walk.last;
1115 934
1116 /* Keep trying while the range is valid. */ 935 /* Keep trying while the range is valid. */
@@ -1133,25 +952,22 @@ long hmm_range_fault(struct hmm_range *range, bool block)
1133EXPORT_SYMBOL(hmm_range_fault); 952EXPORT_SYMBOL(hmm_range_fault);
1134 953
1135/** 954/**
1136 * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one. 955 * hmm_range_dma_map - hmm_range_fault() and dma map page all in one.
1137 * @range: range being faulted 956 * @range: range being faulted
1138 * @device: device against to dma map page to 957 * @device: device to map page to
1139 * @daddrs: dma address of mapped pages 958 * @daddrs: array of dma addresses for the mapped pages
1140 * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) 959 * @flags: HMM_FAULT_*
1141 * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been
1142 * drop and you need to try again, some other error value otherwise
1143 * 960 *
1144 * Note same usage pattern as hmm_range_fault(). 961 * Return: the number of pages mapped on success (including zero), or any
962 * status return from hmm_range_fault() otherwise.
1145 */ 963 */
1146long hmm_range_dma_map(struct hmm_range *range, 964long hmm_range_dma_map(struct hmm_range *range, struct device *device,
1147 struct device *device, 965 dma_addr_t *daddrs, unsigned int flags)
1148 dma_addr_t *daddrs,
1149 bool block)
1150{ 966{
1151 unsigned long i, npages, mapped; 967 unsigned long i, npages, mapped;
1152 long ret; 968 long ret;
1153 969
1154 ret = hmm_range_fault(range, block); 970 ret = hmm_range_fault(range, flags);
1155 if (ret <= 0) 971 if (ret <= 0)
1156 return ret ? ret : -EBUSY; 972 return ret ? ret : -EBUSY;
1157 973
@@ -1222,7 +1038,6 @@ EXPORT_SYMBOL(hmm_range_dma_map);
1222/** 1038/**
1223 * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map() 1039 * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map()
1224 * @range: range being unmapped 1040 * @range: range being unmapped
1225 * @vma: the vma against which the range (optional)
1226 * @device: device against which dma map was done 1041 * @device: device against which dma map was done
1227 * @daddrs: dma address of mapped pages 1042 * @daddrs: dma address of mapped pages
1228 * @dirty: dirty page if it had the write flag set 1043 * @dirty: dirty page if it had the write flag set
@@ -1234,7 +1049,6 @@ EXPORT_SYMBOL(hmm_range_dma_map);
1234 * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. 1049 * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress.
1235 */ 1050 */
1236long hmm_range_dma_unmap(struct hmm_range *range, 1051long hmm_range_dma_unmap(struct hmm_range *range,
1237 struct vm_area_struct *vma,
1238 struct device *device, 1052 struct device *device,
1239 dma_addr_t *daddrs, 1053 dma_addr_t *daddrs,
1240 bool dirty) 1054 bool dirty)
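
Taken together, the mm/hmm.c changes above reduce the caller flow to: fill in range->start/end (plus the pfns/flags/values arrays, elided here), register the range against a mirror, then call hmm_range_fault() with flags. A sketch of that flow with HMM_FAULT_SNAPSHOT standing in for the removed hmm_range_snapshot(); demo_snapshot() is hypothetical and -EBUSY retry handling is left out:

#include <linux/hmm.h>

static long demo_snapshot(struct hmm_mirror *mirror, struct mm_struct *mm,
			  struct hmm_range *range,
			  unsigned long start, unsigned long end)
{
	long ret;

	range->start = start;	/* hmm_range_register() no longer takes these */
	range->end = end;

	ret = hmm_range_register(range, mirror);
	if (ret)
		return ret;

	down_read(&mm->mmap_sem);
	/* snapshot only: HMM_FAULT_SNAPSHOT suppresses faulting pages in */
	ret = hmm_range_fault(range, HMM_FAULT_SNAPSHOT);
	up_read(&mm->mmap_sem);

	hmm_range_unregister(range);
	return ret;	/* >= 0: number of valid entries in range->pfns[] */
}
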
diff --git a/mm/madvise.c b/mm/madvise.c
index bac973b9f2cc..88babcc384b9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -21,6 +21,7 @@
21#include <linux/file.h> 21#include <linux/file.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include <linux/pagewalk.h>
24#include <linux/swap.h> 25#include <linux/swap.h>
25#include <linux/swapops.h> 26#include <linux/swapops.h>
26#include <linux/shmem_fs.h> 27#include <linux/shmem_fs.h>
@@ -226,19 +227,9 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
226 return 0; 227 return 0;
227} 228}
228 229
229static void force_swapin_readahead(struct vm_area_struct *vma, 230static const struct mm_walk_ops swapin_walk_ops = {
230 unsigned long start, unsigned long end) 231 .pmd_entry = swapin_walk_pmd_entry,
231{ 232};
232 struct mm_walk walk = {
233 .mm = vma->vm_mm,
234 .pmd_entry = swapin_walk_pmd_entry,
235 .private = vma,
236 };
237
238 walk_page_range(start, end, &walk);
239
240 lru_add_drain(); /* Push any new pages onto the LRU now */
241}
242 233
243static void force_shm_swapin_readahead(struct vm_area_struct *vma, 234static void force_shm_swapin_readahead(struct vm_area_struct *vma,
244 unsigned long start, unsigned long end, 235 unsigned long start, unsigned long end,
@@ -281,7 +272,8 @@ static long madvise_willneed(struct vm_area_struct *vma,
281 *prev = vma; 272 *prev = vma;
282#ifdef CONFIG_SWAP 273#ifdef CONFIG_SWAP
283 if (!file) { 274 if (!file) {
284 force_swapin_readahead(vma, start, end); 275 walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
276 lru_add_drain(); /* Push any new pages onto the LRU now */
285 return 0; 277 return 0;
286 } 278 }
287 279
@@ -450,20 +442,9 @@ next:
450 return 0; 442 return 0;
451} 443}
452 444
453static void madvise_free_page_range(struct mmu_gather *tlb, 445static const struct mm_walk_ops madvise_free_walk_ops = {
454 struct vm_area_struct *vma, 446 .pmd_entry = madvise_free_pte_range,
455 unsigned long addr, unsigned long end) 447};
456{
457 struct mm_walk free_walk = {
458 .pmd_entry = madvise_free_pte_range,
459 .mm = vma->vm_mm,
460 .private = tlb,
461 };
462
463 tlb_start_vma(tlb, vma);
464 walk_page_range(addr, end, &free_walk);
465 tlb_end_vma(tlb, vma);
466}
467 448
468static int madvise_free_single_vma(struct vm_area_struct *vma, 449static int madvise_free_single_vma(struct vm_area_struct *vma,
469 unsigned long start_addr, unsigned long end_addr) 450 unsigned long start_addr, unsigned long end_addr)
@@ -490,7 +471,10 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
490 update_hiwater_rss(mm); 471 update_hiwater_rss(mm);
491 472
492 mmu_notifier_invalidate_range_start(&range); 473 mmu_notifier_invalidate_range_start(&range);
493 madvise_free_page_range(&tlb, vma, range.start, range.end); 474 tlb_start_vma(&tlb, vma);
475 walk_page_range(vma->vm_mm, range.start, range.end,
476 &madvise_free_walk_ops, &tlb);
477 tlb_end_vma(&tlb, vma);
494 mmu_notifier_invalidate_range_end(&range); 478 mmu_notifier_invalidate_range_end(&range);
495 tlb_finish_mmu(&tlb, range.start, range.end); 479 tlb_finish_mmu(&tlb, range.start, range.end);
496 480
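
The madvise, memcontrol and mempolicy conversions in this patch all follow one pattern: callbacks move into a static const struct mm_walk_ops, and walk_page_range() takes the mm and the private pointer explicitly instead of a stack struct mm_walk. A minimal sketch of the new calling convention with hypothetical names (demo_pte_entry, demo_count_present):

#include <linux/pagewalk.h>

static int demo_pte_entry(pte_t *pte, unsigned long addr,
			  unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;	/* private is per call now */

	if (pte_present(*pte))
		(*count)++;
	return 0;
}

static const struct mm_walk_ops demo_walk_ops = {
	.pte_entry = demo_pte_entry,
};

static unsigned long demo_count_present(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	unsigned long count = 0;

	down_read(&mm->mmap_sem);
	walk_page_range(mm, start, end, &demo_walk_ops, &count);
	up_read(&mm->mmap_sem);
	return count;
}
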
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 597d58101872..f3c15bb07cce 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,7 +25,7 @@
25#include <linux/page_counter.h> 25#include <linux/page_counter.h>
26#include <linux/memcontrol.h> 26#include <linux/memcontrol.h>
27#include <linux/cgroup.h> 27#include <linux/cgroup.h>
28#include <linux/mm.h> 28#include <linux/pagewalk.h>
29#include <linux/sched/mm.h> 29#include <linux/sched/mm.h>
30#include <linux/shmem_fs.h> 30#include <linux/shmem_fs.h>
31#include <linux/hugetlb.h> 31#include <linux/hugetlb.h>
@@ -5499,17 +5499,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5499 return 0; 5499 return 0;
5500} 5500}
5501 5501
5502static const struct mm_walk_ops precharge_walk_ops = {
5503 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5504};
5505
5502static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5506static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5503{ 5507{
5504 unsigned long precharge; 5508 unsigned long precharge;
5505 5509
5506 struct mm_walk mem_cgroup_count_precharge_walk = {
5507 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5508 .mm = mm,
5509 };
5510 down_read(&mm->mmap_sem); 5510 down_read(&mm->mmap_sem);
5511 walk_page_range(0, mm->highest_vm_end, 5511 walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5512 &mem_cgroup_count_precharge_walk);
5513 up_read(&mm->mmap_sem); 5512 up_read(&mm->mmap_sem);
5514 5513
5515 precharge = mc.precharge; 5514 precharge = mc.precharge;
@@ -5778,13 +5777,12 @@ put: /* get_mctgt_type() gets the page */
5778 return ret; 5777 return ret;
5779} 5778}
5780 5779
5780static const struct mm_walk_ops charge_walk_ops = {
5781 .pmd_entry = mem_cgroup_move_charge_pte_range,
5782};
5783
5781static void mem_cgroup_move_charge(void) 5784static void mem_cgroup_move_charge(void)
5782{ 5785{
5783 struct mm_walk mem_cgroup_move_charge_walk = {
5784 .pmd_entry = mem_cgroup_move_charge_pte_range,
5785 .mm = mc.mm,
5786 };
5787
5788 lru_add_drain_all(); 5786 lru_add_drain_all();
5789 /* 5787 /*
5790 * Signal lock_page_memcg() to take the memcg's move_lock 5788 * Signal lock_page_memcg() to take the memcg's move_lock
@@ -5810,7 +5808,8 @@ retry:
5810 * When we have consumed all precharges and failed in doing 5808 * When we have consumed all precharges and failed in doing
5811 * additional charge, the page walk just aborts. 5809 * additional charge, the page walk just aborts.
5812 */ 5810 */
5813 walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk); 5811 walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
5812 NULL);
5814 5813
5815 up_read(&mc.mm->mmap_sem); 5814 up_read(&mc.mm->mmap_sem);
5816 atomic_dec(&mc.from->moving_account); 5815 atomic_dec(&mc.from->moving_account);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 65e0874fce17..f000771558d8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -68,7 +68,7 @@
68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69 69
70#include <linux/mempolicy.h> 70#include <linux/mempolicy.h>
71#include <linux/mm.h> 71#include <linux/pagewalk.h>
72#include <linux/highmem.h> 72#include <linux/highmem.h>
73#include <linux/hugetlb.h> 73#include <linux/hugetlb.h>
74#include <linux/kernel.h> 74#include <linux/kernel.h>
@@ -655,6 +655,12 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
655 return 1; 655 return 1;
656} 656}
657 657
658static const struct mm_walk_ops queue_pages_walk_ops = {
659 .hugetlb_entry = queue_pages_hugetlb,
660 .pmd_entry = queue_pages_pte_range,
661 .test_walk = queue_pages_test_walk,
662};
663
658/* 664/*
659 * Walk through page tables and collect pages to be migrated. 665 * Walk through page tables and collect pages to be migrated.
660 * 666 *
@@ -679,15 +685,8 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
679 .nmask = nodes, 685 .nmask = nodes,
680 .prev = NULL, 686 .prev = NULL,
681 }; 687 };
682 struct mm_walk queue_pages_walk = {
683 .hugetlb_entry = queue_pages_hugetlb,
684 .pmd_entry = queue_pages_pte_range,
685 .test_walk = queue_pages_test_walk,
686 .mm = mm,
687 .private = &qp,
688 };
689 688
690 return walk_page_range(start, end, &queue_pages_walk); 689 return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
691} 690}
692 691
693/* 692/*
diff --git a/mm/memremap.c b/mm/memremap.c
index ed70c4e8e52a..32c79b51af86 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -21,13 +21,13 @@ DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
21EXPORT_SYMBOL(devmap_managed_key); 21EXPORT_SYMBOL(devmap_managed_key);
22static atomic_t devmap_managed_enable; 22static atomic_t devmap_managed_enable;
23 23
24static void devmap_managed_enable_put(void *data) 24static void devmap_managed_enable_put(void)
25{ 25{
26 if (atomic_dec_and_test(&devmap_managed_enable)) 26 if (atomic_dec_and_test(&devmap_managed_enable))
27 static_branch_disable(&devmap_managed_key); 27 static_branch_disable(&devmap_managed_key);
28} 28}
29 29
30static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) 30static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
31{ 31{
32 if (!pgmap->ops || !pgmap->ops->page_free) { 32 if (!pgmap->ops || !pgmap->ops->page_free) {
33 WARN(1, "Missing page_free method\n"); 33 WARN(1, "Missing page_free method\n");
@@ -36,13 +36,16 @@ static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgm
36 36
37 if (atomic_inc_return(&devmap_managed_enable) == 1) 37 if (atomic_inc_return(&devmap_managed_enable) == 1)
38 static_branch_enable(&devmap_managed_key); 38 static_branch_enable(&devmap_managed_key);
39 return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL); 39 return 0;
40} 40}
41#else 41#else
42static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) 42static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
43{ 43{
44 return -EINVAL; 44 return -EINVAL;
45} 45}
46static void devmap_managed_enable_put(void)
47{
48}
46#endif /* CONFIG_DEV_PAGEMAP_OPS */ 49#endif /* CONFIG_DEV_PAGEMAP_OPS */
47 50
48static void pgmap_array_delete(struct resource *res) 51static void pgmap_array_delete(struct resource *res)
@@ -99,10 +102,8 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
99 pgmap->ref = NULL; 102 pgmap->ref = NULL;
100} 103}
101 104
102static void devm_memremap_pages_release(void *data) 105void memunmap_pages(struct dev_pagemap *pgmap)
103{ 106{
104 struct dev_pagemap *pgmap = data;
105 struct device *dev = pgmap->dev;
106 struct resource *res = &pgmap->res; 107 struct resource *res = &pgmap->res;
107 unsigned long pfn; 108 unsigned long pfn;
108 int nid; 109 int nid;
@@ -129,8 +130,14 @@ static void devm_memremap_pages_release(void *data)
129 130
130 untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); 131 untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
131 pgmap_array_delete(res); 132 pgmap_array_delete(res);
132 dev_WARN_ONCE(dev, pgmap->altmap.alloc, 133 WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
133 "%s: failed to free all reserved pages\n", __func__); 134 devmap_managed_enable_put();
135}
136EXPORT_SYMBOL_GPL(memunmap_pages);
137
138static void devm_memremap_pages_release(void *data)
139{
140 memunmap_pages(data);
134} 141}
135 142
136static void dev_pagemap_percpu_release(struct percpu_ref *ref) 143static void dev_pagemap_percpu_release(struct percpu_ref *ref)
@@ -141,27 +148,12 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
141 complete(&pgmap->done); 148 complete(&pgmap->done);
142} 149}
143 150
144/** 151/*
145 * devm_memremap_pages - remap and provide memmap backing for the given resource 152 * Not device managed version of devm_memremap_pages, undone by
146 * @dev: hosting device for @res 153 * memunmap_pages(). Please use devm_memremap_pages if you have a struct
147 * @pgmap: pointer to a struct dev_pagemap 154 * device available.
148 *
149 * Notes:
150 * 1/ At a minimum the res and type members of @pgmap must be initialized
151 * by the caller before passing it to this function
152 *
153 * 2/ The altmap field may optionally be initialized, in which case
154 * PGMAP_ALTMAP_VALID must be set in pgmap->flags.
155 *
156 * 3/ The ref field may optionally be provided, in which pgmap->ref must be
157 * 'live' on entry and will be killed and reaped at
158 * devm_memremap_pages_release() time, or if this routine fails.
159 *
160 * 4/ res is expected to be a host memory range that could feasibly be
161 * treated as a "System RAM" range, i.e. not a device mmio range, but
162 * this is not enforced.
163 */ 155 */
164void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) 156void *memremap_pages(struct dev_pagemap *pgmap, int nid)
165{ 157{
166 struct resource *res = &pgmap->res; 158 struct resource *res = &pgmap->res;
167 struct dev_pagemap *conflict_pgmap; 159 struct dev_pagemap *conflict_pgmap;
@@ -172,7 +164,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
172 .altmap = pgmap_altmap(pgmap), 164 .altmap = pgmap_altmap(pgmap),
173 }; 165 };
174 pgprot_t pgprot = PAGE_KERNEL; 166 pgprot_t pgprot = PAGE_KERNEL;
175 int error, nid, is_ram; 167 int error, is_ram;
176 bool need_devmap_managed = true; 168 bool need_devmap_managed = true;
177 169
178 switch (pgmap->type) { 170 switch (pgmap->type) {
@@ -220,14 +212,14 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
220 } 212 }
221 213
222 if (need_devmap_managed) { 214 if (need_devmap_managed) {
223 error = devmap_managed_enable_get(dev, pgmap); 215 error = devmap_managed_enable_get(pgmap);
224 if (error) 216 if (error)
225 return ERR_PTR(error); 217 return ERR_PTR(error);
226 } 218 }
227 219
228 conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL); 220 conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL);
229 if (conflict_pgmap) { 221 if (conflict_pgmap) {
230 dev_WARN(dev, "Conflicting mapping in same section\n"); 222 WARN(1, "Conflicting mapping in same section\n");
231 put_dev_pagemap(conflict_pgmap); 223 put_dev_pagemap(conflict_pgmap);
232 error = -ENOMEM; 224 error = -ENOMEM;
233 goto err_array; 225 goto err_array;
@@ -235,7 +227,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
235 227
236 conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL); 228 conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL);
237 if (conflict_pgmap) { 229 if (conflict_pgmap) {
238 dev_WARN(dev, "Conflicting mapping in same section\n"); 230 WARN(1, "Conflicting mapping in same section\n");
239 put_dev_pagemap(conflict_pgmap); 231 put_dev_pagemap(conflict_pgmap);
240 error = -ENOMEM; 232 error = -ENOMEM;
241 goto err_array; 233 goto err_array;
@@ -251,14 +243,11 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
251 goto err_array; 243 goto err_array;
252 } 244 }
253 245
254 pgmap->dev = dev;
255
256 error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), 246 error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
257 PHYS_PFN(res->end), pgmap, GFP_KERNEL)); 247 PHYS_PFN(res->end), pgmap, GFP_KERNEL));
258 if (error) 248 if (error)
259 goto err_array; 249 goto err_array;
260 250
261 nid = dev_to_node(dev);
262 if (nid < 0) 251 if (nid < 0)
263 nid = numa_mem_id(); 252 nid = numa_mem_id();
264 253
@@ -314,12 +303,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
314 PHYS_PFN(res->start), 303 PHYS_PFN(res->start),
315 PHYS_PFN(resource_size(res)), pgmap); 304 PHYS_PFN(resource_size(res)), pgmap);
316 percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); 305 percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
317
318 error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
319 pgmap);
320 if (error)
321 return ERR_PTR(error);
322
323 return __va(res->start); 306 return __va(res->start);
324 307
325 err_add_memory: 308 err_add_memory:
@@ -331,8 +314,46 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
331 err_array: 314 err_array:
332 dev_pagemap_kill(pgmap); 315 dev_pagemap_kill(pgmap);
333 dev_pagemap_cleanup(pgmap); 316 dev_pagemap_cleanup(pgmap);
317 devmap_managed_enable_put();
334 return ERR_PTR(error); 318 return ERR_PTR(error);
335} 319}
320EXPORT_SYMBOL_GPL(memremap_pages);
321
322/**
323 * devm_memremap_pages - remap and provide memmap backing for the given resource
324 * @dev: hosting device for @res
325 * @pgmap: pointer to a struct dev_pagemap
326 *
327 * Notes:
328 * 1/ At a minimum the res and type members of @pgmap must be initialized
329 * by the caller before passing it to this function
330 *
331 * 2/ The altmap field may optionally be initialized, in which case
332 * PGMAP_ALTMAP_VALID must be set in pgmap->flags.
333 *
334 * 3/ The ref field may optionally be provided, in which pgmap->ref must be
335 * 'live' on entry and will be killed and reaped at
336 * devm_memremap_pages_release() time, or if this routine fails.
337 *
338 * 4/ res is expected to be a host memory range that could feasibly be
339 * treated as a "System RAM" range, i.e. not a device mmio range, but
340 * this is not enforced.
341 */
342void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
343{
344 int error;
345 void *ret;
346
347 ret = memremap_pages(pgmap, dev_to_node(dev));
348 if (IS_ERR(ret))
349 return ret;
350
351 error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
352 pgmap);
353 if (error)
354 return ERR_PTR(error);
355 return ret;
356}
336EXPORT_SYMBOL_GPL(devm_memremap_pages); 357EXPORT_SYMBOL_GPL(devm_memremap_pages);
337 358
338void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) 359void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
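
The memremap.c changes split the device-managed wrapper in two: memremap_pages()/memunmap_pages() carry the real work and take an explicit node id, while devm_memremap_pages() becomes a thin wrapper that registers the release action. A hedged sketch of the non-devm pairing, assuming the caller has initialised pgmap->res, pgmap->type and pgmap->ops exactly as it would for devm_memremap_pages(); the example_* names are illustrative:

#include <linux/memremap.h>
#include <linux/numa.h>

/* Illustrative: pgmap->res, ->type and ->ops must already be set up. */
static void *example_map_pages(struct dev_pagemap *pgmap)
{
        /* A negative nid falls back to numa_mem_id(), as in the hunk above. */
        return memremap_pages(pgmap, NUMA_NO_NODE);
}

static void example_unmap_pages(struct dev_pagemap *pgmap)
{
        /* Undoes memremap_pages(); no devm release action is involved. */
        memunmap_pages(pgmap);
}
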
diff --git a/mm/migrate.c b/mm/migrate.c
index a42858d8e00b..9f4ed4e985c1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -38,6 +38,7 @@
38#include <linux/hugetlb.h> 38#include <linux/hugetlb.h>
39#include <linux/hugetlb_cgroup.h> 39#include <linux/hugetlb_cgroup.h>
40#include <linux/gfp.h> 40#include <linux/gfp.h>
41#include <linux/pagewalk.h>
41#include <linux/pfn_t.h> 42#include <linux/pfn_t.h>
42#include <linux/memremap.h> 43#include <linux/memremap.h>
43#include <linux/userfaultfd_k.h> 44#include <linux/userfaultfd_k.h>
@@ -2119,17 +2120,7 @@ out_unlock:
2119 2120
2120#endif /* CONFIG_NUMA */ 2121#endif /* CONFIG_NUMA */
2121 2122
2122#if defined(CONFIG_MIGRATE_VMA_HELPER) 2123#ifdef CONFIG_DEVICE_PRIVATE
2123struct migrate_vma {
2124 struct vm_area_struct *vma;
2125 unsigned long *dst;
2126 unsigned long *src;
2127 unsigned long cpages;
2128 unsigned long npages;
2129 unsigned long start;
2130 unsigned long end;
2131};
2132
2133static int migrate_vma_collect_hole(unsigned long start, 2124static int migrate_vma_collect_hole(unsigned long start,
2134 unsigned long end, 2125 unsigned long end,
2135 struct mm_walk *walk) 2126 struct mm_walk *walk)
@@ -2249,8 +2240,8 @@ again:
2249 goto next; 2240 goto next;
2250 2241
2251 page = device_private_entry_to_page(entry); 2242 page = device_private_entry_to_page(entry);
2252 mpfn = migrate_pfn(page_to_pfn(page))| 2243 mpfn = migrate_pfn(page_to_pfn(page)) |
2253 MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; 2244 MIGRATE_PFN_MIGRATE;
2254 if (is_write_device_private_entry(entry)) 2245 if (is_write_device_private_entry(entry))
2255 mpfn |= MIGRATE_PFN_WRITE; 2246 mpfn |= MIGRATE_PFN_WRITE;
2256 } else { 2247 } else {
@@ -2329,6 +2320,11 @@ next:
2329 return 0; 2320 return 0;
2330} 2321}
2331 2322
2323static const struct mm_walk_ops migrate_vma_walk_ops = {
2324 .pmd_entry = migrate_vma_collect_pmd,
2325 .pte_hole = migrate_vma_collect_hole,
2326};
2327
2332/* 2328/*
2333 * migrate_vma_collect() - collect pages over a range of virtual addresses 2329 * migrate_vma_collect() - collect pages over a range of virtual addresses
2334 * @migrate: migrate struct containing all migration information 2330 * @migrate: migrate struct containing all migration information
@@ -2340,21 +2336,15 @@ next:
2340static void migrate_vma_collect(struct migrate_vma *migrate) 2336static void migrate_vma_collect(struct migrate_vma *migrate)
2341{ 2337{
2342 struct mmu_notifier_range range; 2338 struct mmu_notifier_range range;
2343 struct mm_walk mm_walk = {
2344 .pmd_entry = migrate_vma_collect_pmd,
2345 .pte_hole = migrate_vma_collect_hole,
2346 .vma = migrate->vma,
2347 .mm = migrate->vma->vm_mm,
2348 .private = migrate,
2349 };
2350 2339
2351 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm, 2340 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL,
2352 migrate->start, 2341 migrate->vma->vm_mm, migrate->start, migrate->end);
2353 migrate->end);
2354 mmu_notifier_invalidate_range_start(&range); 2342 mmu_notifier_invalidate_range_start(&range);
2355 walk_page_range(migrate->start, migrate->end, &mm_walk);
2356 mmu_notifier_invalidate_range_end(&range);
2357 2343
2344 walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
2345 &migrate_vma_walk_ops, migrate);
2346
2347 mmu_notifier_invalidate_range_end(&range);
2358 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); 2348 migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
2359} 2349}
2360 2350
@@ -2577,6 +2567,110 @@ restore:
2577 } 2567 }
2578} 2568}
2579 2569
2570/**
2571 * migrate_vma_setup() - prepare to migrate a range of memory
2572 * @args: contains the vma, start, and pfns arrays for the migration
2573 *
2574 * Returns: negative errno on failures, 0 when 0 or more pages were migrated
2575 * without an error.
2576 *
2577 * Prepare to migrate a virtual address range by collecting all
2578 * the pages backing each virtual address in the range, saving them inside the
2579 * src array. Then lock those pages and unmap them. Once the pages are locked
2580 * and unmapped, check whether each page is pinned or not. Pages that aren't
2581 * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
2582 * corresponding src array entry. It then restores any pages that are pinned, by
2583 * remapping and unlocking those pages.
2584 *
2585 * The caller should then allocate destination memory and copy source memory to
2586 * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
2587 * flag set). Once these are allocated and copied, the caller must update each
2588 * corresponding entry in the dst array with the pfn value of the destination
2589 * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
2590 * (destination pages must have their struct pages locked, via lock_page()).
2591 *
2592 * Note that the caller does not have to migrate all the pages that are marked
2593 * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
2594 * device memory to system memory. If the caller cannot migrate a device page
2595 * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
2596 * consequences for the userspace process, so it must be avoided if at all
2597 * possible.
2598 *
2599 * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we
2600 * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
2601 * allowing the caller to allocate device memory for those unback virtual
2602 * address. For this the caller simply has to allocate device memory and
2603 * properly set the destination entry like for regular migration. Note that
2604 * this can still fails and thus inside the device driver must check if the
2605 * migration was successful for those entries after calling migrate_vma_pages()
2606 * just like for regular migration.
2607 *
2608 * After that, the callers must call migrate_vma_pages() to go over each entry
2609 * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2610 * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
2611 * then migrate_vma_pages() migrates struct page information from the source
2612 * struct page to the destination struct page. If it fails to migrate the
2613 * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the
2614 * src array.
2615 *
2616 * At this point all successfully migrated pages have an entry in the src
2617 * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2618 * array entry with MIGRATE_PFN_VALID flag set.
2619 *
2620 * Once migrate_vma_pages() returns the caller may inspect which pages were
2621 * successfully migrated, and which were not. Successfully migrated pages will
2622 * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
2623 *
2624 * It is safe to update device page table after migrate_vma_pages() because
2625 * both destination and source page are still locked, and the mmap_sem is held
2626 * in read mode (hence no one can unmap the range being migrated).
2627 *
2628 * Once the caller is done cleaning up things and updating its page table (if it
2629 * chose to do so, this is not an obligation) it finally calls
2630 * migrate_vma_finalize() to update the CPU page table to point to new pages
2631 * for successfully migrated pages or otherwise restore the CPU page table to
2632 * point to the original source pages.
2633 */
2634int migrate_vma_setup(struct migrate_vma *args)
2635{
2636 long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
2637
2638 args->start &= PAGE_MASK;
2639 args->end &= PAGE_MASK;
2640 if (!args->vma || is_vm_hugetlb_page(args->vma) ||
2641 (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
2642 return -EINVAL;
2643 if (nr_pages <= 0)
2644 return -EINVAL;
2645 if (args->start < args->vma->vm_start ||
2646 args->start >= args->vma->vm_end)
2647 return -EINVAL;
2648 if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
2649 return -EINVAL;
2650 if (!args->src || !args->dst)
2651 return -EINVAL;
2652
2653 memset(args->src, 0, sizeof(*args->src) * nr_pages);
2654 args->cpages = 0;
2655 args->npages = 0;
2656
2657 migrate_vma_collect(args);
2658
2659 if (args->cpages)
2660 migrate_vma_prepare(args);
2661 if (args->cpages)
2662 migrate_vma_unmap(args);
2663
2664 /*
2665 * At this point pages are locked and unmapped, and thus they have
2666 * stable content and can safely be copied to destination memory that
2667 * is allocated by the drivers.
2668 */
2669 return 0;
2670
2671}
2672EXPORT_SYMBOL(migrate_vma_setup);
2673
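
The migrate_vma_setup()/migrate_vma_pages()/migrate_vma_finalize() triplet added here replaces the callback-driven migrate_vma() that is deleted further down. A driver-side sketch of the intended call sequence, assuming mmap_sem is held for read around the whole sequence; example_migrate_range() and example_alloc_and_copy() are hypothetical helpers, not part of the patch:

#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/slab.h>

/* Hypothetical: allocate+copy destination pages for migratable src entries. */
static void example_alloc_and_copy(struct migrate_vma *args);

static int example_migrate_range(struct vm_area_struct *vma,
                                 unsigned long start, unsigned long end)
{
        unsigned long npages = (end - start) >> PAGE_SHIFT;
        struct migrate_vma args = {};
        unsigned long *src, *dst;
        int ret = -ENOMEM;

        src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
        dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
        if (!src || !dst)
                goto out;

        args.vma = vma;
        args.start = start;
        args.end = end;
        args.src = src;
        args.dst = dst;

        ret = migrate_vma_setup(&args);  /* collect, lock and unmap sources */
        if (ret)
                goto out;

        if (args.cpages) {
                /*
                 * Hypothetical driver step: for every src entry with
                 * MIGRATE_PFN_MIGRATE set, allocate a destination page, copy
                 * the data and store migrate_pfn(new_pfn) | MIGRATE_PFN_VALID
                 * | MIGRATE_PFN_LOCKED in the matching dst entry.
                 */
                example_alloc_and_copy(&args);

                migrate_vma_pages(&args);    /* move struct page metadata */
                /* device page tables may be updated here; pages stay locked */
                migrate_vma_finalize(&args); /* remap CPU PTEs and unlock */
        }
out:
        kfree(src);
        kfree(dst);
        return ret;
}
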
2580static void migrate_vma_insert_page(struct migrate_vma *migrate, 2674static void migrate_vma_insert_page(struct migrate_vma *migrate,
2581 unsigned long addr, 2675 unsigned long addr,
2582 struct page *page, 2676 struct page *page,
@@ -2708,7 +2802,7 @@ abort:
2708 *src &= ~MIGRATE_PFN_MIGRATE; 2802 *src &= ~MIGRATE_PFN_MIGRATE;
2709} 2803}
2710 2804
2711/* 2805/**
2712 * migrate_vma_pages() - migrate meta-data from src page to dst page 2806 * migrate_vma_pages() - migrate meta-data from src page to dst page
2713 * @migrate: migrate struct containing all migration information 2807 * @migrate: migrate struct containing all migration information
2714 * 2808 *
@@ -2716,7 +2810,7 @@ abort:
2716 * struct page. This effectively finishes the migration from source page to the 2810 * struct page. This effectively finishes the migration from source page to the
2717 * destination page. 2811 * destination page.
2718 */ 2812 */
2719static void migrate_vma_pages(struct migrate_vma *migrate) 2813void migrate_vma_pages(struct migrate_vma *migrate)
2720{ 2814{
2721 const unsigned long npages = migrate->npages; 2815 const unsigned long npages = migrate->npages;
2722 const unsigned long start = migrate->start; 2816 const unsigned long start = migrate->start;
@@ -2790,8 +2884,9 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
2790 if (notified) 2884 if (notified)
2791 mmu_notifier_invalidate_range_only_end(&range); 2885 mmu_notifier_invalidate_range_only_end(&range);
2792} 2886}
2887EXPORT_SYMBOL(migrate_vma_pages);
2793 2888
2794/* 2889/**
2795 * migrate_vma_finalize() - restore CPU page table entry 2890 * migrate_vma_finalize() - restore CPU page table entry
2796 * @migrate: migrate struct containing all migration information 2891 * @migrate: migrate struct containing all migration information
2797 * 2892 *
@@ -2802,7 +2897,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
2802 * This also unlocks the pages and puts them back on the lru, or drops the extra 2897 * This also unlocks the pages and puts them back on the lru, or drops the extra
2803 * refcount, for device pages. 2898 * refcount, for device pages.
2804 */ 2899 */
2805static void migrate_vma_finalize(struct migrate_vma *migrate) 2900void migrate_vma_finalize(struct migrate_vma *migrate)
2806{ 2901{
2807 const unsigned long npages = migrate->npages; 2902 const unsigned long npages = migrate->npages;
2808 unsigned long i; 2903 unsigned long i;
@@ -2845,124 +2940,5 @@ static void migrate_vma_finalize(struct migrate_vma *migrate)
2845 } 2940 }
2846 } 2941 }
2847} 2942}
2848 2943EXPORT_SYMBOL(migrate_vma_finalize);
2849/* 2944#endif /* CONFIG_DEVICE_PRIVATE */
2850 * migrate_vma() - migrate a range of memory inside vma
2851 *
2852 * @ops: migration callback for allocating destination memory and copying
2853 * @vma: virtual memory area containing the range to be migrated
2854 * @start: start address of the range to migrate (inclusive)
2855 * @end: end address of the range to migrate (exclusive)
2856 * @src: array of hmm_pfn_t containing source pfns
2857 * @dst: array of hmm_pfn_t containing destination pfns
2858 * @private: pointer passed back to each of the callback
2859 * Returns: 0 on success, error code otherwise
2860 *
2861 * This function tries to migrate a range of memory virtual address range, using
2862 * callbacks to allocate and copy memory from source to destination. First it
2863 * collects all the pages backing each virtual address in the range, saving this
2864 * inside the src array. Then it locks those pages and unmaps them. Once the pages
2865 * are locked and unmapped, it checks whether each page is pinned or not. Pages
2866 * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
2867 * in the corresponding src array entry. It then restores any pages that are
2868 * pinned, by remapping and unlocking those pages.
2869 *
2870 * At this point it calls the alloc_and_copy() callback. For documentation on
2871 * what is expected from that callback, see struct migrate_vma_ops comments in
2872 * include/linux/migrate.h
2873 *
2874 * After the alloc_and_copy() callback, this function goes over each entry in
2875 * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
2876 * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
2877 * then the function tries to migrate struct page information from the source
2878 * struct page to the destination struct page. If it fails to migrate the struct
2879 * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
2880 * array.
2881 *
2882 * At this point all successfully migrated pages have an entry in the src
2883 * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
2884 * array entry with MIGRATE_PFN_VALID flag set.
2885 *
2886 * It then calls the finalize_and_map() callback. See comments for "struct
2887 * migrate_vma_ops", in include/linux/migrate.h for details about
2888 * finalize_and_map() behavior.
2889 *
2890 * After the finalize_and_map() callback, for successfully migrated pages, this
2891 * function updates the CPU page table to point to new pages, otherwise it
2892 * restores the CPU page table to point to the original source pages.
2893 *
2894 * Function returns 0 after the above steps, even if no pages were migrated
2895 * (The function only returns an error if any of the arguments are invalid.)
2896 *
2897 * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
2898 * unsigned long entries.
2899 */
2900int migrate_vma(const struct migrate_vma_ops *ops,
2901 struct vm_area_struct *vma,
2902 unsigned long start,
2903 unsigned long end,
2904 unsigned long *src,
2905 unsigned long *dst,
2906 void *private)
2907{
2908 struct migrate_vma migrate;
2909
2910 /* Sanity check the arguments */
2911 start &= PAGE_MASK;
2912 end &= PAGE_MASK;
2913 if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
2914 vma_is_dax(vma))
2915 return -EINVAL;
2916 if (start < vma->vm_start || start >= vma->vm_end)
2917 return -EINVAL;
2918 if (end <= vma->vm_start || end > vma->vm_end)
2919 return -EINVAL;
2920 if (!ops || !src || !dst || start >= end)
2921 return -EINVAL;
2922
2923 memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
2924 migrate.src = src;
2925 migrate.dst = dst;
2926 migrate.start = start;
2927 migrate.npages = 0;
2928 migrate.cpages = 0;
2929 migrate.end = end;
2930 migrate.vma = vma;
2931
2932 /* Collect, and try to unmap source pages */
2933 migrate_vma_collect(&migrate);
2934 if (!migrate.cpages)
2935 return 0;
2936
2937 /* Lock and isolate page */
2938 migrate_vma_prepare(&migrate);
2939 if (!migrate.cpages)
2940 return 0;
2941
2942 /* Unmap pages */
2943 migrate_vma_unmap(&migrate);
2944 if (!migrate.cpages)
2945 return 0;
2946
2947 /*
2948 * At this point pages are locked and unmapped, and thus they have
2949 * stable content and can safely be copied to destination memory that
2950 * is allocated by the callback.
2951 *
2952 * Note that migration can fail in migrate_vma_struct_page() for each
2953 * individual page.
2954 */
2955 ops->alloc_and_copy(vma, src, dst, start, end, private);
2956
2957 /* This does the real migration of struct page */
2958 migrate_vma_pages(&migrate);
2959
2960 ops->finalize_and_map(vma, src, dst, start, end, private);
2961
2962 /* Unlock and remap pages */
2963 migrate_vma_finalize(&migrate);
2964
2965 return 0;
2966}
2967EXPORT_SYMBOL(migrate_vma);
2968#endif /* defined(MIGRATE_VMA_HELPER) */
diff --git a/mm/mincore.c b/mm/mincore.c
index 4fe91d497436..f9a9dbe8cd33 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -10,7 +10,7 @@
10 */ 10 */
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/mm.h> 13#include <linux/pagewalk.h>
14#include <linux/mman.h> 14#include <linux/mman.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/swap.h> 16#include <linux/swap.h>
@@ -193,6 +193,12 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
193 inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; 193 inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
194} 194}
195 195
196static const struct mm_walk_ops mincore_walk_ops = {
197 .pmd_entry = mincore_pte_range,
198 .pte_hole = mincore_unmapped_range,
199 .hugetlb_entry = mincore_hugetlb,
200};
201
196/* 202/*
197 * Do a chunk of "sys_mincore()". We've already checked 203 * Do a chunk of "sys_mincore()". We've already checked
198 * all the arguments, we hold the mmap semaphore: we should 204 * all the arguments, we hold the mmap semaphore: we should
@@ -203,12 +209,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
203 struct vm_area_struct *vma; 209 struct vm_area_struct *vma;
204 unsigned long end; 210 unsigned long end;
205 int err; 211 int err;
206 struct mm_walk mincore_walk = {
207 .pmd_entry = mincore_pte_range,
208 .pte_hole = mincore_unmapped_range,
209 .hugetlb_entry = mincore_hugetlb,
210 .private = vec,
211 };
212 212
213 vma = find_vma(current->mm, addr); 213 vma = find_vma(current->mm, addr);
214 if (!vma || addr < vma->vm_start) 214 if (!vma || addr < vma->vm_start)
@@ -219,8 +219,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
219 memset(vec, 1, pages); 219 memset(vec, 1, pages);
220 return pages; 220 return pages;
221 } 221 }
222 mincore_walk.mm = vma->vm_mm; 222 err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec);
223 err = walk_page_range(addr, end, &mincore_walk);
224 if (err < 0) 223 if (err < 0)
225 return err; 224 return err;
226 return (end - addr) >> PAGE_SHIFT; 225 return (end - addr) >> PAGE_SHIFT;
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index b5670620aea0..7fde88695f35 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -21,17 +21,11 @@
21/* global SRCU for all MMs */ 21/* global SRCU for all MMs */
22DEFINE_STATIC_SRCU(srcu); 22DEFINE_STATIC_SRCU(srcu);
23 23
24/* 24#ifdef CONFIG_LOCKDEP
25 * This function allows mmu_notifier::release callback to delay a call to 25struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
26 * a function that will free appropriate resources. The function must be 26 .name = "mmu_notifier_invalidate_range_start"
27 * quick and must not block. 27};
28 */ 28#endif
29void mmu_notifier_call_srcu(struct rcu_head *rcu,
30 void (*func)(struct rcu_head *rcu))
31{
32 call_srcu(&srcu, rcu, func);
33}
34EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu);
35 29
36/* 30/*
37 * This function can't run concurrently against mmu_notifier_register 31 * This function can't run concurrently against mmu_notifier_register
@@ -174,11 +168,19 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
174 id = srcu_read_lock(&srcu); 168 id = srcu_read_lock(&srcu);
175 hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { 169 hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
176 if (mn->ops->invalidate_range_start) { 170 if (mn->ops->invalidate_range_start) {
177 int _ret = mn->ops->invalidate_range_start(mn, range); 171 int _ret;
172
173 if (!mmu_notifier_range_blockable(range))
174 non_block_start();
175 _ret = mn->ops->invalidate_range_start(mn, range);
176 if (!mmu_notifier_range_blockable(range))
177 non_block_end();
178 if (_ret) { 178 if (_ret) {
179 pr_info("%pS callback failed with %d in %sblockable context.\n", 179 pr_info("%pS callback failed with %d in %sblockable context.\n",
180 mn->ops->invalidate_range_start, _ret, 180 mn->ops->invalidate_range_start, _ret,
181 !mmu_notifier_range_blockable(range) ? "non-" : ""); 181 !mmu_notifier_range_blockable(range) ? "non-" : "");
182 WARN_ON(mmu_notifier_range_blockable(range) ||
183 ret != -EAGAIN);
182 ret = _ret; 184 ret = _ret;
183 } 185 }
184 } 186 }
@@ -187,7 +189,6 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
187 189
188 return ret; 190 return ret;
189} 191}
190EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
191 192
192void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, 193void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
193 bool only_end) 194 bool only_end)
@@ -195,6 +196,7 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
195 struct mmu_notifier *mn; 196 struct mmu_notifier *mn;
196 int id; 197 int id;
197 198
199 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
198 id = srcu_read_lock(&srcu); 200 id = srcu_read_lock(&srcu);
199 hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { 201 hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
200 /* 202 /*
@@ -214,12 +216,17 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
214 mn->ops->invalidate_range(mn, range->mm, 216 mn->ops->invalidate_range(mn, range->mm,
215 range->start, 217 range->start,
216 range->end); 218 range->end);
217 if (mn->ops->invalidate_range_end) 219 if (mn->ops->invalidate_range_end) {
220 if (!mmu_notifier_range_blockable(range))
221 non_block_start();
218 mn->ops->invalidate_range_end(mn, range); 222 mn->ops->invalidate_range_end(mn, range);
223 if (!mmu_notifier_range_blockable(range))
224 non_block_end();
225 }
219 } 226 }
220 srcu_read_unlock(&srcu, id); 227 srcu_read_unlock(&srcu, id);
228 lock_map_release(&__mmu_notifier_invalidate_range_start_map);
221} 229}
222EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
223 230
224void __mmu_notifier_invalidate_range(struct mm_struct *mm, 231void __mmu_notifier_invalidate_range(struct mm_struct *mm,
225 unsigned long start, unsigned long end) 232 unsigned long start, unsigned long end)
@@ -234,35 +241,49 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
234 } 241 }
235 srcu_read_unlock(&srcu, id); 242 srcu_read_unlock(&srcu, id);
236} 243}
237EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
238 244
239static int do_mmu_notifier_register(struct mmu_notifier *mn, 245/*
240 struct mm_struct *mm, 246 * Same as mmu_notifier_register but here the caller must hold the
241 int take_mmap_sem) 247 * mmap_sem in write mode.
248 */
249int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
242{ 250{
243 struct mmu_notifier_mm *mmu_notifier_mm; 251 struct mmu_notifier_mm *mmu_notifier_mm = NULL;
244 int ret; 252 int ret;
245 253
254 lockdep_assert_held_write(&mm->mmap_sem);
246 BUG_ON(atomic_read(&mm->mm_users) <= 0); 255 BUG_ON(atomic_read(&mm->mm_users) <= 0);
247 256
248 ret = -ENOMEM; 257 if (IS_ENABLED(CONFIG_LOCKDEP)) {
249 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); 258 fs_reclaim_acquire(GFP_KERNEL);
250 if (unlikely(!mmu_notifier_mm)) 259 lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
251 goto out; 260 lock_map_release(&__mmu_notifier_invalidate_range_start_map);
261 fs_reclaim_release(GFP_KERNEL);
262 }
252 263
253 if (take_mmap_sem) 264 mn->mm = mm;
254 down_write(&mm->mmap_sem); 265 mn->users = 1;
255 ret = mm_take_all_locks(mm); 266
256 if (unlikely(ret)) 267 if (!mm->mmu_notifier_mm) {
257 goto out_clean; 268 /*
269 * kmalloc cannot be called under mm_take_all_locks(), but we
270 * know that mm->mmu_notifier_mm can't change while we hold
271 * the write side of the mmap_sem.
272 */
273 mmu_notifier_mm =
274 kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
275 if (!mmu_notifier_mm)
276 return -ENOMEM;
258 277
259 if (!mm_has_notifiers(mm)) {
260 INIT_HLIST_HEAD(&mmu_notifier_mm->list); 278 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
261 spin_lock_init(&mmu_notifier_mm->lock); 279 spin_lock_init(&mmu_notifier_mm->lock);
262
263 mm->mmu_notifier_mm = mmu_notifier_mm;
264 mmu_notifier_mm = NULL;
265 } 280 }
281
282 ret = mm_take_all_locks(mm);
283 if (unlikely(ret))
284 goto out_clean;
285
286 /* Pairs with the mmdrop in mmu_notifier_unregister_* */
266 mmgrab(mm); 287 mmgrab(mm);
267 288
268 /* 289 /*
@@ -273,48 +294,118 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
273 * We can't race against any other mmu notifier method either 294 * We can't race against any other mmu notifier method either
274 * thanks to mm_take_all_locks(). 295 * thanks to mm_take_all_locks().
275 */ 296 */
297 if (mmu_notifier_mm)
298 mm->mmu_notifier_mm = mmu_notifier_mm;
299
276 spin_lock(&mm->mmu_notifier_mm->lock); 300 spin_lock(&mm->mmu_notifier_mm->lock);
277 hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); 301 hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
278 spin_unlock(&mm->mmu_notifier_mm->lock); 302 spin_unlock(&mm->mmu_notifier_mm->lock);
279 303
280 mm_drop_all_locks(mm); 304 mm_drop_all_locks(mm);
305 BUG_ON(atomic_read(&mm->mm_users) <= 0);
306 return 0;
307
281out_clean: 308out_clean:
282 if (take_mmap_sem)
283 up_write(&mm->mmap_sem);
284 kfree(mmu_notifier_mm); 309 kfree(mmu_notifier_mm);
285out:
286 BUG_ON(atomic_read(&mm->mm_users) <= 0);
287 return ret; 310 return ret;
288} 311}
312EXPORT_SYMBOL_GPL(__mmu_notifier_register);
289 313
290/* 314/**
315 * mmu_notifier_register - Register a notifier on a mm
316 * @mn: The notifier to attach
317 * @mm: The mm to attach the notifier to
318 *
291 * Must not hold mmap_sem nor any other VM related lock when calling 319 * Must not hold mmap_sem nor any other VM related lock when calling
292 * this registration function. Must also ensure mm_users can't go down 320 * this registration function. Must also ensure mm_users can't go down
293 * to zero while this runs to avoid races with mmu_notifier_release, 321 * to zero while this runs to avoid races with mmu_notifier_release,
294 * so mm has to be current->mm or the mm should be pinned safely such 322 * so mm has to be current->mm or the mm should be pinned safely such
295 * as with get_task_mm(). If the mm is not current->mm, the mm_users 323 * as with get_task_mm(). If the mm is not current->mm, the mm_users
296 * pin should be released by calling mmput after mmu_notifier_register 324 * pin should be released by calling mmput after mmu_notifier_register
297 * returns. mmu_notifier_unregister must be always called to 325 * returns.
298 * unregister the notifier. mm_count is automatically pinned to allow 326 *
299 * mmu_notifier_unregister to safely run at any time later, before or 327 * mmu_notifier_unregister() or mmu_notifier_put() must be always called to
300 * after exit_mmap. ->release will always be called before exit_mmap 328 * unregister the notifier.
301 * frees the pages. 329 *
330 * While the caller has a mmu_notifier get, the mn->mm pointer will remain
331 * valid, and can be converted to an active mm pointer via mmget_not_zero().
302 */ 332 */
303int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) 333int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
304{ 334{
305 return do_mmu_notifier_register(mn, mm, 1); 335 int ret;
336
337 down_write(&mm->mmap_sem);
338 ret = __mmu_notifier_register(mn, mm);
339 up_write(&mm->mmap_sem);
340 return ret;
306} 341}
307EXPORT_SYMBOL_GPL(mmu_notifier_register); 342EXPORT_SYMBOL_GPL(mmu_notifier_register);
308 343
309/* 344static struct mmu_notifier *
310 * Same as mmu_notifier_register but here the caller must hold the 345find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
311 * mmap_sem in write mode. 346{
347 struct mmu_notifier *mn;
348
349 spin_lock(&mm->mmu_notifier_mm->lock);
350 hlist_for_each_entry_rcu (mn, &mm->mmu_notifier_mm->list, hlist) {
351 if (mn->ops != ops)
352 continue;
353
354 if (likely(mn->users != UINT_MAX))
355 mn->users++;
356 else
357 mn = ERR_PTR(-EOVERFLOW);
358 spin_unlock(&mm->mmu_notifier_mm->lock);
359 return mn;
360 }
361 spin_unlock(&mm->mmu_notifier_mm->lock);
362 return NULL;
363}
364
365/**
366 * mmu_notifier_get_locked - Return the single struct mmu_notifier for
367 * the mm & ops
368 * @ops: The operations struct being subscribed with
369 * @mm: The mm to attach notifiers to
370 *
371 * This function either allocates a new mmu_notifier via
372 * ops->alloc_notifier(), or returns an already existing notifier on the
373 * list. The value of the ops pointer is used to determine when two notifiers
374 * are the same.
375 *
376 * Each call to mmu_notifier_get() must be paired with a call to
377 * mmu_notifier_put(). The caller must hold the write side of mm->mmap_sem.
378 *
379 * While the caller has a mmu_notifier get, the mm pointer will remain valid,
380 * and can be converted to an active mm pointer via mmget_not_zero().
312 */ 381 */
313int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) 382struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
383 struct mm_struct *mm)
314{ 384{
315 return do_mmu_notifier_register(mn, mm, 0); 385 struct mmu_notifier *mn;
386 int ret;
387
388 lockdep_assert_held_write(&mm->mmap_sem);
389
390 if (mm->mmu_notifier_mm) {
391 mn = find_get_mmu_notifier(mm, ops);
392 if (mn)
393 return mn;
394 }
395
396 mn = ops->alloc_notifier(mm);
397 if (IS_ERR(mn))
398 return mn;
399 mn->ops = ops;
400 ret = __mmu_notifier_register(mn, mm);
401 if (ret)
402 goto out_free;
403 return mn;
404out_free:
405 mn->ops->free_notifier(mn);
406 return ERR_PTR(ret);
316} 407}
317EXPORT_SYMBOL_GPL(__mmu_notifier_register); 408EXPORT_SYMBOL_GPL(mmu_notifier_get_locked);
318 409
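
mmu_notifier_get_locked() embodies the get/put attachment idiom this series hoists out of the drivers: a single notifier per (ops, mm) pair, allocated through ops->alloc_notifier() on the first get and freed via ops->free_notifier() after the last mmu_notifier_put(). A hedged sketch of a driver built on it; struct example_mirror and the example_* functions are illustrative, and only the callbacks needed by the get/put flow are shown:

#include <linux/err.h>
#include <linux/mmu_notifier.h>
#include <linux/slab.h>

/* Hypothetical per-mm driver state embedding the notifier. */
struct example_mirror {
        struct mmu_notifier mn;
        /* driver-specific fields ... */
};

static struct mmu_notifier *example_alloc_notifier(struct mm_struct *mm)
{
        struct example_mirror *em = kzalloc(sizeof(*em), GFP_KERNEL);

        if (!em)
                return ERR_PTR(-ENOMEM);
        return &em->mn;
}

static void example_free_notifier(struct mmu_notifier *mn)
{
        kfree(container_of(mn, struct example_mirror, mn));
}

static const struct mmu_notifier_ops example_mn_ops = {
        .alloc_notifier = example_alloc_notifier,
        .free_notifier = example_free_notifier,
        /* .invalidate_range_start etc. as the driver requires */
};

/* Caller must hold a mm_users reference (e.g. current->mm or get_task_mm()). */
static struct example_mirror *example_mirror_get(struct mm_struct *mm)
{
        struct mmu_notifier *mn;

        down_write(&mm->mmap_sem);
        mn = mmu_notifier_get_locked(&example_mn_ops, mm);
        up_write(&mm->mmap_sem);
        if (IS_ERR(mn))
                return ERR_CAST(mn);
        return container_of(mn, struct example_mirror, mn);
}

static void example_mirror_put(struct example_mirror *em)
{
        /* free_notifier() runs via SRCU once the last reference drops */
        mmu_notifier_put(&em->mn);
}
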
319/* this is called after the last mmu_notifier_unregister() returned */ 410/* this is called after the last mmu_notifier_unregister() returned */
320void __mmu_notifier_mm_destroy(struct mm_struct *mm) 411void __mmu_notifier_mm_destroy(struct mm_struct *mm)
@@ -375,24 +466,74 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
375} 466}
376EXPORT_SYMBOL_GPL(mmu_notifier_unregister); 467EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
377 468
378/* 469static void mmu_notifier_free_rcu(struct rcu_head *rcu)
379 * Same as mmu_notifier_unregister but no callback and no srcu synchronization. 470{
471 struct mmu_notifier *mn = container_of(rcu, struct mmu_notifier, rcu);
472 struct mm_struct *mm = mn->mm;
473
474 mn->ops->free_notifier(mn);
475 /* Pairs with the get in __mmu_notifier_register() */
476 mmdrop(mm);
477}
478
479/**
480 * mmu_notifier_put - Release the reference on the notifier
481 * @mn: The notifier to act on
482 *
483 * This function must be paired with each mmu_notifier_get(), it releases the
484 * reference obtained by the get. If this is the last reference then the process
485 * to free the notifier will be run asynchronously.
486 *
487 * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release
488 * when the mm_struct is destroyed. Instead free_notifier is always called to
489 * release any resources held by the user.
490 *
491 * As ops->release is not guaranteed to be called, the user must ensure that
492 * all sptes are dropped, and no new sptes can be established before
493 * mmu_notifier_put() is called.
494 *
495 * This function can be called from the ops->release callback, however the
496 * caller must still ensure it is called pairwise with mmu_notifier_get().
497 *
498 * Modules calling this function must call mmu_notifier_synchronize() in
499 * their __exit functions to ensure the async work is completed.
380 */ 500 */
381void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, 501void mmu_notifier_put(struct mmu_notifier *mn)
382 struct mm_struct *mm)
383{ 502{
503 struct mm_struct *mm = mn->mm;
504
384 spin_lock(&mm->mmu_notifier_mm->lock); 505 spin_lock(&mm->mmu_notifier_mm->lock);
385 /* 506 if (WARN_ON(!mn->users) || --mn->users)
386 * Can not use list_del_rcu() since __mmu_notifier_release 507 goto out_unlock;
387 * can delete it before we hold the lock.
388 */
389 hlist_del_init_rcu(&mn->hlist); 508 hlist_del_init_rcu(&mn->hlist);
390 spin_unlock(&mm->mmu_notifier_mm->lock); 509 spin_unlock(&mm->mmu_notifier_mm->lock);
391 510
392 BUG_ON(atomic_read(&mm->mm_count) <= 0); 511 call_srcu(&srcu, &mn->rcu, mmu_notifier_free_rcu);
393 mmdrop(mm); 512 return;
513
514out_unlock:
515 spin_unlock(&mm->mmu_notifier_mm->lock);
516}
517EXPORT_SYMBOL_GPL(mmu_notifier_put);
518
519/**
520 * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed
521 *
522 * This function ensures that all outstanding async SRCU work from
523 * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops
524 * associated with an unused mmu_notifier will no longer be called.
525 *
526 * Before calling this, the caller must ensure that all of its mmu_notifiers have been
527 * fully released via mmu_notifier_put().
528 *
529 * Modules using the mmu_notifier_put() API should call this in their __exit
530 * function to avoid module unloading races.
531 */
532void mmu_notifier_synchronize(void)
533{
534 synchronize_srcu(&srcu);
394} 535}
395EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); 536EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
396 537
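
As the kerneldoc above requires, modules relying on mmu_notifier_put() must flush the deferred SRCU frees before unloading; a minimal sketch of the module exit hook (example_driver_exit() is hypothetical):

#include <linux/mmu_notifier.h>
#include <linux/module.h>

static void __exit example_driver_exit(void)
{
        /* all notifiers were already released via mmu_notifier_put() */
        mmu_notifier_synchronize();  /* wait for pending free_notifier() work */
}
module_exit(example_driver_exit);
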
397bool 538bool
398mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) 539mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bf38dfbbb4b4..675e5d34a507 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -9,7 +9,7 @@
9 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved 9 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
10 */ 10 */
11 11
12#include <linux/mm.h> 12#include <linux/pagewalk.h>
13#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
14#include <linux/shm.h> 14#include <linux/shm.h>
15#include <linux/mman.h> 15#include <linux/mman.h>
@@ -329,20 +329,11 @@ static int prot_none_test(unsigned long addr, unsigned long next,
329 return 0; 329 return 0;
330} 330}
331 331
332static int prot_none_walk(struct vm_area_struct *vma, unsigned long start, 332static const struct mm_walk_ops prot_none_walk_ops = {
333 unsigned long end, unsigned long newflags) 333 .pte_entry = prot_none_pte_entry,
334{ 334 .hugetlb_entry = prot_none_hugetlb_entry,
335 pgprot_t new_pgprot = vm_get_page_prot(newflags); 335 .test_walk = prot_none_test,
336 struct mm_walk prot_none_walk = { 336};
337 .pte_entry = prot_none_pte_entry,
338 .hugetlb_entry = prot_none_hugetlb_entry,
339 .test_walk = prot_none_test,
340 .mm = current->mm,
341 .private = &new_pgprot,
342 };
343
344 return walk_page_range(start, end, &prot_none_walk);
345}
346 337
347int 338int
348mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, 339mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
@@ -369,7 +360,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
369 if (arch_has_pfn_modify_check() && 360 if (arch_has_pfn_modify_check() &&
370 (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && 361 (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
371 (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) { 362 (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
372 error = prot_none_walk(vma, start, end, newflags); 363 pgprot_t new_pgprot = vm_get_page_prot(newflags);
364
365 error = walk_page_range(current->mm, start, end,
366 &prot_none_walk_ops, &new_pgprot);
373 if (error) 367 if (error)
374 return error; 368 return error;
375 } 369 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6991ccec9c32..ff5484fdbdf9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5971,7 +5971,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
5971 } 5971 }
5972 } 5972 }
5973 5973
5974 pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev), 5974 pr_info("%s initialised %lu pages in %ums\n", __func__,
5975 size, jiffies_to_msecs(jiffies - start)); 5975 size, jiffies_to_msecs(jiffies - start));
5976} 5976}
5977 5977
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index c3084ff2569d..d48c2a986ea3 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -1,5 +1,5 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/mm.h> 2#include <linux/pagewalk.h>
3#include <linux/highmem.h> 3#include <linux/highmem.h>
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/hugetlb.h> 5#include <linux/hugetlb.h>
@@ -9,10 +9,11 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
9{ 9{
10 pte_t *pte; 10 pte_t *pte;
11 int err = 0; 11 int err = 0;
12 const struct mm_walk_ops *ops = walk->ops;
12 13
13 pte = pte_offset_map(pmd, addr); 14 pte = pte_offset_map(pmd, addr);
14 for (;;) { 15 for (;;) {
15 err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk); 16 err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
16 if (err) 17 if (err)
17 break; 18 break;
18 addr += PAGE_SIZE; 19 addr += PAGE_SIZE;
@@ -30,6 +31,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
30{ 31{
31 pmd_t *pmd; 32 pmd_t *pmd;
32 unsigned long next; 33 unsigned long next;
34 const struct mm_walk_ops *ops = walk->ops;
33 int err = 0; 35 int err = 0;
34 36
35 pmd = pmd_offset(pud, addr); 37 pmd = pmd_offset(pud, addr);
@@ -37,8 +39,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
37again: 39again:
38 next = pmd_addr_end(addr, end); 40 next = pmd_addr_end(addr, end);
39 if (pmd_none(*pmd) || !walk->vma) { 41 if (pmd_none(*pmd) || !walk->vma) {
40 if (walk->pte_hole) 42 if (ops->pte_hole)
41 err = walk->pte_hole(addr, next, walk); 43 err = ops->pte_hole(addr, next, walk);
42 if (err) 44 if (err)
43 break; 45 break;
44 continue; 46 continue;
@@ -47,8 +49,8 @@ again:
47 * This implies that each ->pmd_entry() handler 49 * This implies that each ->pmd_entry() handler
48 * needs to know about pmd_trans_huge() pmds 50 * needs to know about pmd_trans_huge() pmds
49 */ 51 */
50 if (walk->pmd_entry) 52 if (ops->pmd_entry)
51 err = walk->pmd_entry(pmd, addr, next, walk); 53 err = ops->pmd_entry(pmd, addr, next, walk);
52 if (err) 54 if (err)
53 break; 55 break;
54 56
@@ -56,7 +58,7 @@ again:
56 * Check this here so we only break down trans_huge 58 * Check this here so we only break down trans_huge
57 * pages when we _need_ to 59 * pages when we _need_ to
58 */ 60 */
59 if (!walk->pte_entry) 61 if (!ops->pte_entry)
60 continue; 62 continue;
61 63
62 split_huge_pmd(walk->vma, pmd, addr); 64 split_huge_pmd(walk->vma, pmd, addr);
@@ -75,6 +77,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
75{ 77{
76 pud_t *pud; 78 pud_t *pud;
77 unsigned long next; 79 unsigned long next;
80 const struct mm_walk_ops *ops = walk->ops;
78 int err = 0; 81 int err = 0;
79 82
80 pud = pud_offset(p4d, addr); 83 pud = pud_offset(p4d, addr);
@@ -82,18 +85,18 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
82 again: 85 again:
83 next = pud_addr_end(addr, end); 86 next = pud_addr_end(addr, end);
84 if (pud_none(*pud) || !walk->vma) { 87 if (pud_none(*pud) || !walk->vma) {
85 if (walk->pte_hole) 88 if (ops->pte_hole)
86 err = walk->pte_hole(addr, next, walk); 89 err = ops->pte_hole(addr, next, walk);
87 if (err) 90 if (err)
88 break; 91 break;
89 continue; 92 continue;
90 } 93 }
91 94
92 if (walk->pud_entry) { 95 if (ops->pud_entry) {
93 spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); 96 spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
94 97
95 if (ptl) { 98 if (ptl) {
96 err = walk->pud_entry(pud, addr, next, walk); 99 err = ops->pud_entry(pud, addr, next, walk);
97 spin_unlock(ptl); 100 spin_unlock(ptl);
98 if (err) 101 if (err)
99 break; 102 break;
@@ -105,7 +108,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
105 if (pud_none(*pud)) 108 if (pud_none(*pud))
106 goto again; 109 goto again;
107 110
108 if (walk->pmd_entry || walk->pte_entry) 111 if (ops->pmd_entry || ops->pte_entry)
109 err = walk_pmd_range(pud, addr, next, walk); 112 err = walk_pmd_range(pud, addr, next, walk);
110 if (err) 113 if (err)
111 break; 114 break;
@@ -119,19 +122,20 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
119{ 122{
120 p4d_t *p4d; 123 p4d_t *p4d;
121 unsigned long next; 124 unsigned long next;
125 const struct mm_walk_ops *ops = walk->ops;
122 int err = 0; 126 int err = 0;
123 127
124 p4d = p4d_offset(pgd, addr); 128 p4d = p4d_offset(pgd, addr);
125 do { 129 do {
126 next = p4d_addr_end(addr, end); 130 next = p4d_addr_end(addr, end);
127 if (p4d_none_or_clear_bad(p4d)) { 131 if (p4d_none_or_clear_bad(p4d)) {
128 if (walk->pte_hole) 132 if (ops->pte_hole)
129 err = walk->pte_hole(addr, next, walk); 133 err = ops->pte_hole(addr, next, walk);
130 if (err) 134 if (err)
131 break; 135 break;
132 continue; 136 continue;
133 } 137 }
134 if (walk->pmd_entry || walk->pte_entry) 138 if (ops->pmd_entry || ops->pte_entry)
135 err = walk_pud_range(p4d, addr, next, walk); 139 err = walk_pud_range(p4d, addr, next, walk);
136 if (err) 140 if (err)
137 break; 141 break;
@@ -145,19 +149,20 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
145{ 149{
146 pgd_t *pgd; 150 pgd_t *pgd;
147 unsigned long next; 151 unsigned long next;
152 const struct mm_walk_ops *ops = walk->ops;
148 int err = 0; 153 int err = 0;
149 154
150 pgd = pgd_offset(walk->mm, addr); 155 pgd = pgd_offset(walk->mm, addr);
151 do { 156 do {
152 next = pgd_addr_end(addr, end); 157 next = pgd_addr_end(addr, end);
153 if (pgd_none_or_clear_bad(pgd)) { 158 if (pgd_none_or_clear_bad(pgd)) {
154 if (walk->pte_hole) 159 if (ops->pte_hole)
155 err = walk->pte_hole(addr, next, walk); 160 err = ops->pte_hole(addr, next, walk);
156 if (err) 161 if (err)
157 break; 162 break;
158 continue; 163 continue;
159 } 164 }
160 if (walk->pmd_entry || walk->pte_entry) 165 if (ops->pmd_entry || ops->pte_entry)
161 err = walk_p4d_range(pgd, addr, next, walk); 166 err = walk_p4d_range(pgd, addr, next, walk);
162 if (err) 167 if (err)
163 break; 168 break;
@@ -183,6 +188,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
183 unsigned long hmask = huge_page_mask(h); 188 unsigned long hmask = huge_page_mask(h);
184 unsigned long sz = huge_page_size(h); 189 unsigned long sz = huge_page_size(h);
185 pte_t *pte; 190 pte_t *pte;
191 const struct mm_walk_ops *ops = walk->ops;
186 int err = 0; 192 int err = 0;
187 193
188 do { 194 do {
@@ -190,9 +196,9 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
190 pte = huge_pte_offset(walk->mm, addr & hmask, sz); 196 pte = huge_pte_offset(walk->mm, addr & hmask, sz);
191 197
192 if (pte) 198 if (pte)
193 err = walk->hugetlb_entry(pte, hmask, addr, next, walk); 199 err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
194 else if (walk->pte_hole) 200 else if (ops->pte_hole)
195 err = walk->pte_hole(addr, next, walk); 201 err = ops->pte_hole(addr, next, walk);
196 202
197 if (err) 203 if (err)
198 break; 204 break;
@@ -220,9 +226,10 @@ static int walk_page_test(unsigned long start, unsigned long end,
220 struct mm_walk *walk) 226 struct mm_walk *walk)
221{ 227{
222 struct vm_area_struct *vma = walk->vma; 228 struct vm_area_struct *vma = walk->vma;
229 const struct mm_walk_ops *ops = walk->ops;
223 230
224 if (walk->test_walk) 231 if (ops->test_walk)
225 return walk->test_walk(start, end, walk); 232 return ops->test_walk(start, end, walk);
226 233
227 /* 234 /*
228 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP 235 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
@@ -234,8 +241,8 @@ static int walk_page_test(unsigned long start, unsigned long end,
 	 */
 	if (vma->vm_flags & VM_PFNMAP) {
 		int err = 1;
-		if (walk->pte_hole)
-			err = walk->pte_hole(start, end, walk);
+		if (ops->pte_hole)
+			err = ops->pte_hole(start, end, walk);
 		return err ? err : 1;
 	}
 	return 0;
@@ -248,7 +255,7 @@ static int __walk_page_range(unsigned long start, unsigned long end,
 	struct vm_area_struct *vma = walk->vma;
 
 	if (vma && is_vm_hugetlb_page(vma)) {
-		if (walk->hugetlb_entry)
+		if (walk->ops->hugetlb_entry)
 			err = walk_hugetlb_range(start, end, walk);
 	} else
 		err = walk_pgd_range(start, end, walk);
@@ -258,11 +265,13 @@ static int __walk_page_range(unsigned long start, unsigned long end,
 
 /**
  * walk_page_range - walk page table with caller specific callbacks
- * @start: start address of the virtual address range
- * @end: end address of the virtual address range
- * @walk: mm_walk structure defining the callbacks and the target address space
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @private: private data for callbacks' usage
  *
- * Recursively walk the page table tree of the process represented by @walk->mm
+ * Recursively walk the page table tree of the process represented by @mm
  * within the virtual address range [@start, @end). During walking, we can do
  * some caller-specific works for each entry, by setting up pmd_entry(),
  * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
@@ -278,47 +287,52 @@ static int __walk_page_range(unsigned long start, unsigned long end,
  *
  * Before starting to walk page table, some callers want to check whether
  * they really want to walk over the current vma, typically by checking
- * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
+ * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
  * purpose.
  *
  * struct mm_walk keeps current values of some common data like vma and pmd,
  * which are useful for the access from callbacks. If you want to pass some
- * caller-specific data to callbacks, @walk->private should be helpful.
+ * caller-specific data to callbacks, @private should be helpful.
  *
  * Locking:
- *   Callers of walk_page_range() and walk_page_vma() should hold
- *   @walk->mm->mmap_sem, because these function traverse vma list and/or
- *   access to vma's data.
+ *   Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem,
+ *   because these function traverse vma list and/or access to vma's data.
  */
-int walk_page_range(unsigned long start, unsigned long end,
-		    struct mm_walk *walk)
+int walk_page_range(struct mm_struct *mm, unsigned long start,
+		unsigned long end, const struct mm_walk_ops *ops,
+		void *private)
 {
 	int err = 0;
 	unsigned long next;
 	struct vm_area_struct *vma;
+	struct mm_walk walk = {
+		.ops = ops,
+		.mm = mm,
+		.private = private,
+	};
 
 	if (start >= end)
 		return -EINVAL;
 
-	if (!walk->mm)
+	if (!walk.mm)
 		return -EINVAL;
 
-	VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
+	lockdep_assert_held(&walk.mm->mmap_sem);
 
-	vma = find_vma(walk->mm, start);
+	vma = find_vma(walk.mm, start);
 	do {
 		if (!vma) { /* after the last vma */
-			walk->vma = NULL;
+			walk.vma = NULL;
 			next = end;
 		} else if (start < vma->vm_start) { /* outside vma */
-			walk->vma = NULL;
+			walk.vma = NULL;
 			next = min(end, vma->vm_start);
 		} else { /* inside vma */
-			walk->vma = vma;
+			walk.vma = vma;
 			next = min(end, vma->vm_end);
 			vma = vma->vm_next;
 
-			err = walk_page_test(start, next, walk);
+			err = walk_page_test(start, next, &walk);
 			if (err > 0) {
 				/*
 				 * positive return values are purely for
@@ -331,28 +345,34 @@ int walk_page_range(unsigned long start, unsigned long end,
 			if (err < 0)
 				break;
 		}
-		if (walk->vma || walk->pte_hole)
-			err = __walk_page_range(start, next, walk);
+		if (walk.vma || walk.ops->pte_hole)
+			err = __walk_page_range(start, next, &walk);
 		if (err)
 			break;
 	} while (start = next, start < end);
 	return err;
 }
 
-int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
+int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
+		void *private)
 {
+	struct mm_walk walk = {
+		.ops = ops,
+		.mm = vma->vm_mm,
+		.vma = vma,
+		.private = private,
+	};
 	int err;
 
-	if (!walk->mm)
+	if (!walk.mm)
 		return -EINVAL;
 
-	VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
-	VM_BUG_ON(!vma);
-	walk->vma = vma;
-	err = walk_page_test(vma->vm_start, vma->vm_end, walk);
+	lockdep_assert_held(&walk.mm->mmap_sem);
+
+	err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
 	if (err > 0)
 		return 0;
 	if (err < 0)
 		return err;
-	return __walk_page_range(vma->vm_start, vma->vm_end, walk);
+	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
 }
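
The mm/pagewalk.c hunks above change the calling convention: the callbacks move out of struct mm_walk into a shared, const struct mm_walk_ops, and walk_page_range()/walk_page_vma() now take the mm, the ops table and the private pointer directly, asserting mmap_sem via lockdep instead of VM_BUG_ON. A minimal caller-side sketch of the reworked interface follows; count_pte_entry(), count_walk_ops and count_present_ptes() are illustrative names invented for this note, not code from the tree:

#include <linux/mm.h>
#include <linux/pagewalk.h>

/* Illustrative callback: count present PTEs in the walked range.
 * Per-walk state travels through walk->private, not through the ops table. */
static int count_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr_present = walk->private;

	if (pte_present(*pte))
		(*nr_present)++;
	return 0;
}

/* Callbacks now live in a const, file-scope ops table that can be shared. */
static const struct mm_walk_ops count_walk_ops = {
	.pte_entry = count_pte_entry,
};

static unsigned long count_present_ptes(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	unsigned long nr_present = 0;

	/* walk_page_range() asserts mmap_sem is held, so take it for read. */
	down_read(&mm->mmap_sem);
	walk_page_range(mm, start, end, &count_walk_ops, &nr_present);
	up_read(&mm->mmap_sem);

	return nr_present;
}

Keeping the function pointers in a const table mirrors the stated intent of the series: the per-call iterator state stays in struct mm_walk on the caller's stack, while the ops can live in read-only data and be reused by every walk.
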
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c
index cd040b5abffe..3f55f2f99112 100644
--- a/tools/testing/nvdimm/test/iomap.c
+++ b/tools/testing/nvdimm/test/iomap.c
@@ -132,7 +132,6 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	if (!nfit_res)
 		return devm_memremap_pages(dev, pgmap);
 
-	pgmap->dev = dev;
 	if (!pgmap->ref) {
 		if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
 			return ERR_PTR(-EINVAL);