author     Linus Torvalds <torvalds@linux-foundation.org>  2019-09-21 13:07:42 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-09-21 13:07:42 -0400
commit     84da111de0b4be15bd500deff773f5116f39f7be (patch)
tree       76b5796f8258397bf7a3926b742a89166a8501ef
parent     227c3e9eb5cf3552c2cc83225df6d14adb05f8e8 (diff)
parent     62974fc389b364d8af70e044836362222bd3ae53 (diff)
Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull hmm updates from Jason Gunthorpe:
"This is more cleanup and consolidation of the hmm APIs and the very
strongly related mmu_notifier interfaces. Many places across the tree
using these interfaces are touched in the process. Beyond that a
cleanup to the page walker API and a few memremap related changes
round out the series:
- General improvement of hmm_range_fault() and related APIs, more
documentation, bug fixes from testing, API simplification &
consolidation, and unused API removal
- Simplify the hmm related kconfigs to HMM_MIRROR and DEVICE_PRIVATE,
and make them internal kconfig selects
- Hoist a lot of code related to mmu notifier attachment out of
drivers by using a refcount get/put attachment idiom and remove the
convoluted mmu_notifier_unregister_no_release() and related APIs.
- General API improvement for the migrate_vma API and revision of its
only user in nouveau
- Annotate mmu_notifiers with lockdep and sleeping region debugging
Two series unrelated to HMM or mmu_notifiers came along due to
dependencies:
- Allow pagemap's memremap_pages family of APIs to work without
providing a struct device
- Make walk_page_range() and related use a constant structure for
function pointers"
* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (75 commits)
libnvdimm: Enable unit test infrastructure compile checks
mm, notifier: Catch sleeping/blocking for !blockable
kernel.h: Add non_block_start/end()
drm/radeon: guard against calling an unpaired radeon_mn_unregister()
csky: add missing brackets in a macro for tlb.h
pagewalk: use lockdep_assert_held for locking validation
pagewalk: separate function pointers from iterator data
mm: split out a new pagewalk.h header from mm.h
mm/mmu_notifiers: annotate with might_sleep()
mm/mmu_notifiers: prime lockdep
mm/mmu_notifiers: add a lockdep map for invalidate_range_start/end
mm/mmu_notifiers: remove the __mmu_notifier_invalidate_range_start/end exports
mm/hmm: hmm_range_fault() infinite loop
mm/hmm: hmm_range_fault() NULL pointer bug
mm/hmm: fix hmm_range_fault()'s handling of swapped out pages
mm/mmu_notifiers: remove unregister_no_release
RDMA/odp: remove ib_ucontext from ib_umem
RDMA/odp: use mmu_notifier_get/put for 'struct ib_ucontext_per_mm'
RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr
RDMA/mlx5: Use ib_umem_start instead of umem.address
...
65 files changed, 1684 insertions, 2184 deletions
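
Before the per-file hunks, it helps to spell out the new pagewalk calling convention that the openrisc, powerpc and s390 conversions below all follow: the callbacks move into a file-scope const struct mm_walk_ops, and the mm plus an opaque private pointer are passed to walk_page_range() on each call instead of living in an on-stack struct mm_walk. A minimal sketch of that shape (my_pte_entry/my_walk are invented names, not part of the series):

    #include <linux/pagewalk.h>

    /* Invented callback: invoked for every pte in the walked range. */
    static int my_pte_entry(pte_t *pte, unsigned long addr,
                            unsigned long next, struct mm_walk *walk)
    {
            /* walk->private carries the pointer passed to walk_page_range() */
            return 0;
    }

    /* Callbacks now live in a shared, read-only ops table ... */
    static const struct mm_walk_ops my_walk_ops = {
            .pte_entry = my_pte_entry,
    };

    /* ... and the mm is passed explicitly on each call. */
    static void my_walk(struct mm_struct *mm, unsigned long start,
                        unsigned long end)
    {
            down_read(&mm->mmap_sem);  /* the reworked walker validates this is held */
            walk_page_range(mm, start, end, &my_walk_ops, NULL);
            up_read(&mm->mmap_sem);
    }

The same ops table can back walk_page_vma(vma, &my_walk_ops, NULL) as well, which is what the powerpc subpage_prot conversion below uses.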
diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 710ce1c701bf..0a5960beccf7 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -192,15 +192,14 @@ read only, or fully unmap, etc.). The device must complete the update before | |||
192 | the driver callback returns. | 192 | the driver callback returns. |
193 | 193 | ||
194 | When the device driver wants to populate a range of virtual addresses, it can | 194 | When the device driver wants to populate a range of virtual addresses, it can |
195 | use either:: | 195 | use:: |
196 | 196 | ||
197 | long hmm_range_snapshot(struct hmm_range *range); | 197 | long hmm_range_fault(struct hmm_range *range, unsigned int flags); |
198 | long hmm_range_fault(struct hmm_range *range, bool block); | ||
199 | 198 | ||
200 | The first one (hmm_range_snapshot()) will only fetch present CPU page table | 199 | With the HMM_RANGE_SNAPSHOT flag, it will only fetch present CPU page table |
201 | entries and will not trigger a page fault on missing or non-present entries. | 200 | entries and will not trigger a page fault on missing or non-present entries. |
202 | The second one does trigger a page fault on missing or read-only entries if | 201 | Without that flag, it does trigger a page fault on missing or read-only entries |
203 | write access is requested (see below). Page faults use the generic mm page | 202 | if write access is requested (see below). Page faults use the generic mm page |
204 | fault code path just like a CPU page fault. | 203 | fault code path just like a CPU page fault. |
205 | 204 | ||
206 | Both functions copy CPU page table entries into their pfns array argument. Each | 205 | Both functions copy CPU page table entries into their pfns array argument. Each |
@@ -223,24 +222,24 @@ The usage pattern is:: | |||
223 | range.flags = ...; | 222 | range.flags = ...; |
224 | range.values = ...; | 223 | range.values = ...; |
225 | range.pfn_shift = ...; | 224 | range.pfn_shift = ...; |
226 | hmm_range_register(&range); | 225 | hmm_range_register(&range, mirror); |
227 | 226 | ||
228 | /* | 227 | /* |
229 | * Just wait for range to be valid, safe to ignore return value as we | 228 | * Just wait for range to be valid, safe to ignore return value as we |
230 | * will use the return value of hmm_range_snapshot() below under the | 229 | * will use the return value of hmm_range_fault() below under the |
231 | * mmap_sem to ascertain the validity of the range. | 230 | * mmap_sem to ascertain the validity of the range. |
232 | */ | 231 | */ |
233 | hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); | 232 | hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); |
234 | 233 | ||
235 | again: | 234 | again: |
236 | down_read(&mm->mmap_sem); | 235 | down_read(&mm->mmap_sem); |
237 | ret = hmm_range_snapshot(&range); | 236 | ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT); |
238 | if (ret) { | 237 | if (ret) { |
239 | up_read(&mm->mmap_sem); | 238 | up_read(&mm->mmap_sem); |
240 | if (ret == -EBUSY) { | 239 | if (ret == -EBUSY) { |
241 | /* | 240 | /* |
242 | * No need to check hmm_range_wait_until_valid() return value | 241 | * No need to check hmm_range_wait_until_valid() return value |
243 | * on retry we will get proper error with hmm_range_snapshot() | 242 | * on retry we will get proper error with hmm_range_fault() |
244 | */ | 243 | */ |
245 | hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); | 244 | hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); |
246 | goto again; | 245 | goto again; |
@@ -340,58 +339,8 @@ Migration to and from device memory | |||
340 | =================================== | 339 | =================================== |
341 | 340 | ||
342 | Because the CPU cannot access device memory, migration must use the device DMA | 341 | Because the CPU cannot access device memory, migration must use the device DMA |
343 | engine to perform copy from and to device memory. For this we need a new | 342 | engine to perform copy from and to device memory. For this we need to use |
344 | migration helper:: | 343 | migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize() helpers. |
345 | |||
346 | int migrate_vma(const struct migrate_vma_ops *ops, | ||
347 | struct vm_area_struct *vma, | ||
348 | unsigned long mentries, | ||
349 | unsigned long start, | ||
350 | unsigned long end, | ||
351 | unsigned long *src, | ||
352 | unsigned long *dst, | ||
353 | void *private); | ||
354 | |||
355 | Unlike other migration functions it works on a range of virtual address, there | ||
356 | are two reasons for that. First, device DMA copy has a high setup overhead cost | ||
357 | and thus batching multiple pages is needed as otherwise the migration overhead | ||
358 | makes the whole exercise pointless. The second reason is because the | ||
359 | migration might be for a range of addresses the device is actively accessing. | ||
360 | |||
361 | The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy()) | ||
362 | controls destination memory allocation and copy operation. Second one is there | ||
363 | to allow the device driver to perform cleanup operations after migration:: | ||
364 | |||
365 | struct migrate_vma_ops { | ||
366 | void (*alloc_and_copy)(struct vm_area_struct *vma, | ||
367 | const unsigned long *src, | ||
368 | unsigned long *dst, | ||
369 | unsigned long start, | ||
370 | unsigned long end, | ||
371 | void *private); | ||
372 | void (*finalize_and_map)(struct vm_area_struct *vma, | ||
373 | const unsigned long *src, | ||
374 | const unsigned long *dst, | ||
375 | unsigned long start, | ||
376 | unsigned long end, | ||
377 | void *private); | ||
378 | }; | ||
379 | |||
380 | It is important to stress that these migration helpers allow for holes in the | ||
381 | virtual address range. Some pages in the range might not be migrated for all | ||
382 | the usual reasons (page is pinned, page is locked, ...). This helper does not | ||
383 | fail but just skips over those pages. | ||
384 | |||
385 | The alloc_and_copy() might decide to not migrate all pages in the | ||
386 | range (for reasons under the callback control). For those, the callback just | ||
387 | has to leave the corresponding dst entry empty. | ||
388 | |||
389 | Finally, the migration of the struct page might fail (for file backed page) for | ||
390 | various reasons (failure to freeze reference, or update page cache, ...). If | ||
391 | that happens, then the finalize_and_map() can catch any pages that were not | ||
392 | migrated. Note those pages were still copied to a new page and thus we wasted | ||
393 | bandwidth but this is considered as a rare event and a price that we are | ||
394 | willing to pay to keep all the code simpler. | ||
395 | 344 | ||
396 | 345 | ||
397 | Memory cgroup (memcg) and rss accounting | 346 | Memory cgroup (memcg) and rss accounting |
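
The three helpers named in the hunk above replace the callback-driven migrate_vma(); the nouveau rework later in this diff is their first user. A rough outline of the new flow for a single page, with the device copy left as a placeholder (my_migrate_one_page is an invented name, not API from the patch):

    #include <linux/migrate.h>
    #include <linux/mm.h>

    static int my_migrate_one_page(struct vm_area_struct *vma, unsigned long addr)
    {
            unsigned long src = 0, dst = 0;
            struct migrate_vma args = {
                    .vma    = vma,
                    .start  = addr,
                    .end    = addr + PAGE_SIZE,
                    .src    = &src,
                    .dst    = &dst,
            };
            int ret;

            ret = migrate_vma_setup(&args);         /* collect and isolate sources */
            if (ret)
                    return ret;
            if (!args.cpages)                       /* nothing collected, nothing to do */
                    return 0;

            /*
             * Driver-specific step: allocate destination memory, kick off the
             * device DMA copy, and set args.dst[0] to
             * migrate_pfn(page_to_pfn(new_page)) | MIGRATE_PFN_LOCKED for the
             * page that was copied. Entries left at 0 are simply skipped.
             */

            migrate_vma_pages(&args);               /* install the new page */
            /* wait for the device copy/fence to complete here */
            migrate_vma_finalize(&args);            /* release everything else */
            return 0;
    }

Holes and unmigratable pages are still tolerated, as with the old helper: setup leaves MIGRATE_PFN_MIGRATE clear in args.src[] for those entries, and finalize restores them untouched.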
diff --git a/arch/csky/include/asm/tlb.h b/arch/csky/include/asm/tlb.h
index 8c7cc097666f..fdff9b8d70c8 100644
--- a/arch/csky/include/asm/tlb.h
+++ b/arch/csky/include/asm/tlb.h
@@ -8,14 +8,14 @@ | |||
8 | 8 | ||
9 | #define tlb_start_vma(tlb, vma) \ | 9 | #define tlb_start_vma(tlb, vma) \ |
10 | do { \ | 10 | do { \ |
11 | if (!tlb->fullmm) \ | 11 | if (!(tlb)->fullmm) \ |
12 | flush_cache_range(vma, vma->vm_start, vma->vm_end); \ | 12 | flush_cache_range(vma, (vma)->vm_start, (vma)->vm_end); \ |
13 | } while (0) | 13 | } while (0) |
14 | 14 | ||
15 | #define tlb_end_vma(tlb, vma) \ | 15 | #define tlb_end_vma(tlb, vma) \ |
16 | do { \ | 16 | do { \ |
17 | if (!tlb->fullmm) \ | 17 | if (!(tlb)->fullmm) \ |
18 | flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ | 18 | flush_tlb_range(vma, (vma)->vm_start, (vma)->vm_end); \ |
19 | } while (0) | 19 | } while (0) |
20 | 20 | ||
21 | #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) | 21 | #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) |
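
The one-line csky fix is pure macro hygiene: without parentheses, any argument more complex than a plain identifier splices badly into the expansion. A contrived stand-alone illustration (invented names, ordinary userspace C rather than the kernel macros):

    #include <stdio.h>

    struct mmu_gather { int fullmm; };

    /* Before the fix: the parameter is pasted in unparenthesized. */
    #define NEED_FLUSH_BAD(tlb)     (!tlb->fullmm)
    /* After the fix: every use of the parameter is parenthesized. */
    #define NEED_FLUSH_GOOD(tlb)    (!(tlb)->fullmm)

    int main(void)
    {
            struct mmu_gather gathers[2] = { { .fullmm = 1 }, { .fullmm = 0 } };

            /*
             * NEED_FLUSH_BAD(gathers + 1) would expand to
             * (!gathers + 1->fullmm), which does not even parse because the
             * "->" binds to the literal 1, so the unfixed macro quietly
             * limits callers to plain identifiers.
             */
            printf("%d\n", NEED_FLUSH_GOOD(gathers + 1));   /* prints 1 */
            return 0;
    }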
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index b41a79fcdbd9..4d5b8bd1d795 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -16,6 +16,7 @@ | |||
16 | */ | 16 | */ |
17 | 17 | ||
18 | #include <linux/dma-noncoherent.h> | 18 | #include <linux/dma-noncoherent.h> |
19 | #include <linux/pagewalk.h> | ||
19 | 20 | ||
20 | #include <asm/cpuinfo.h> | 21 | #include <asm/cpuinfo.h> |
21 | #include <asm/spr_defs.h> | 22 | #include <asm/spr_defs.h> |
@@ -43,6 +44,10 @@ page_set_nocache(pte_t *pte, unsigned long addr, | |||
43 | return 0; | 44 | return 0; |
44 | } | 45 | } |
45 | 46 | ||
47 | static const struct mm_walk_ops set_nocache_walk_ops = { | ||
48 | .pte_entry = page_set_nocache, | ||
49 | }; | ||
50 | |||
46 | static int | 51 | static int |
47 | page_clear_nocache(pte_t *pte, unsigned long addr, | 52 | page_clear_nocache(pte_t *pte, unsigned long addr, |
48 | unsigned long next, struct mm_walk *walk) | 53 | unsigned long next, struct mm_walk *walk) |
@@ -58,6 +63,10 @@ page_clear_nocache(pte_t *pte, unsigned long addr, | |||
58 | return 0; | 63 | return 0; |
59 | } | 64 | } |
60 | 65 | ||
66 | static const struct mm_walk_ops clear_nocache_walk_ops = { | ||
67 | .pte_entry = page_clear_nocache, | ||
68 | }; | ||
69 | |||
61 | /* | 70 | /* |
62 | * Alloc "coherent" memory, which for OpenRISC means simply uncached. | 71 | * Alloc "coherent" memory, which for OpenRISC means simply uncached. |
63 | * | 72 | * |
@@ -80,10 +89,6 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, | |||
80 | { | 89 | { |
81 | unsigned long va; | 90 | unsigned long va; |
82 | void *page; | 91 | void *page; |
83 | struct mm_walk walk = { | ||
84 | .pte_entry = page_set_nocache, | ||
85 | .mm = &init_mm | ||
86 | }; | ||
87 | 92 | ||
88 | page = alloc_pages_exact(size, gfp | __GFP_ZERO); | 93 | page = alloc_pages_exact(size, gfp | __GFP_ZERO); |
89 | if (!page) | 94 | if (!page) |
@@ -98,7 +103,8 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, | |||
98 | * We need to iterate through the pages, clearing the dcache for | 103 | * We need to iterate through the pages, clearing the dcache for |
99 | * them and setting the cache-inhibit bit. | 104 | * them and setting the cache-inhibit bit. |
100 | */ | 105 | */ |
101 | if (walk_page_range(va, va + size, &walk)) { | 106 | if (walk_page_range(&init_mm, va, va + size, &set_nocache_walk_ops, |
107 | NULL)) { | ||
102 | free_pages_exact(page, size); | 108 | free_pages_exact(page, size); |
103 | return NULL; | 109 | return NULL; |
104 | } | 110 | } |
@@ -111,13 +117,10 @@ arch_dma_free(struct device *dev, size_t size, void *vaddr, | |||
111 | dma_addr_t dma_handle, unsigned long attrs) | 117 | dma_addr_t dma_handle, unsigned long attrs) |
112 | { | 118 | { |
113 | unsigned long va = (unsigned long)vaddr; | 119 | unsigned long va = (unsigned long)vaddr; |
114 | struct mm_walk walk = { | ||
115 | .pte_entry = page_clear_nocache, | ||
116 | .mm = &init_mm | ||
117 | }; | ||
118 | 120 | ||
119 | /* walk_page_range shouldn't be able to fail here */ | 121 | /* walk_page_range shouldn't be able to fail here */ |
120 | WARN_ON(walk_page_range(va, va + size, &walk)); | 122 | WARN_ON(walk_page_range(&init_mm, va, va + size, |
123 | &clear_nocache_walk_ops, NULL)); | ||
121 | 124 | ||
122 | free_pages_exact(vaddr, size); | 125 | free_pages_exact(vaddr, size); |
123 | } | 126 | } |
diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c
index 9ba07e55c489..2ef24a53f4c9 100644
--- a/arch/powerpc/mm/book3s64/subpage_prot.c
+++ b/arch/powerpc/mm/book3s64/subpage_prot.c
@@ -7,7 +7,7 @@ | |||
7 | #include <linux/kernel.h> | 7 | #include <linux/kernel.h> |
8 | #include <linux/gfp.h> | 8 | #include <linux/gfp.h> |
9 | #include <linux/types.h> | 9 | #include <linux/types.h> |
10 | #include <linux/mm.h> | 10 | #include <linux/pagewalk.h> |
11 | #include <linux/hugetlb.h> | 11 | #include <linux/hugetlb.h> |
12 | #include <linux/syscalls.h> | 12 | #include <linux/syscalls.h> |
13 | 13 | ||
@@ -139,14 +139,14 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, | |||
139 | return 0; | 139 | return 0; |
140 | } | 140 | } |
141 | 141 | ||
142 | static const struct mm_walk_ops subpage_walk_ops = { | ||
143 | .pmd_entry = subpage_walk_pmd_entry, | ||
144 | }; | ||
145 | |||
142 | static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, | 146 | static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, |
143 | unsigned long len) | 147 | unsigned long len) |
144 | { | 148 | { |
145 | struct vm_area_struct *vma; | 149 | struct vm_area_struct *vma; |
146 | struct mm_walk subpage_proto_walk = { | ||
147 | .mm = mm, | ||
148 | .pmd_entry = subpage_walk_pmd_entry, | ||
149 | }; | ||
150 | 150 | ||
151 | /* | 151 | /* |
152 | * We don't try too hard, we just mark all the vma in that range | 152 | * We don't try too hard, we just mark all the vma in that range |
@@ -163,7 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, | |||
163 | if (vma->vm_start >= (addr + len)) | 163 | if (vma->vm_start >= (addr + len)) |
164 | break; | 164 | break; |
165 | vma->vm_flags |= VM_NOHUGEPAGE; | 165 | vma->vm_flags |= VM_NOHUGEPAGE; |
166 | walk_page_vma(vma, &subpage_proto_walk); | 166 | walk_page_vma(vma, &subpage_walk_ops, NULL); |
167 | vma = vma->vm_next; | 167 | vma = vma->vm_next; |
168 | } | 168 | } |
169 | } | 169 | } |
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index cd8e03f04d6d..edcdca97e85e 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -9,7 +9,7 @@ | |||
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
12 | #include <linux/mm.h> | 12 | #include <linux/pagewalk.h> |
13 | #include <linux/swap.h> | 13 | #include <linux/swap.h> |
14 | #include <linux/smp.h> | 14 | #include <linux/smp.h> |
15 | #include <linux/spinlock.h> | 15 | #include <linux/spinlock.h> |
@@ -2521,13 +2521,9 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, | |||
2521 | return 0; | 2521 | return 0; |
2522 | } | 2522 | } |
2523 | 2523 | ||
2524 | static inline void zap_zero_pages(struct mm_struct *mm) | 2524 | static const struct mm_walk_ops zap_zero_walk_ops = { |
2525 | { | 2525 | .pmd_entry = __zap_zero_pages, |
2526 | struct mm_walk walk = { .pmd_entry = __zap_zero_pages }; | 2526 | }; |
2527 | |||
2528 | walk.mm = mm; | ||
2529 | walk_page_range(0, TASK_SIZE, &walk); | ||
2530 | } | ||
2531 | 2527 | ||
2532 | /* | 2528 | /* |
2533 | * switch on pgstes for its userspace process (for kvm) | 2529 | * switch on pgstes for its userspace process (for kvm) |
@@ -2546,7 +2542,7 @@ int s390_enable_sie(void) | |||
2546 | mm->context.has_pgste = 1; | 2542 | mm->context.has_pgste = 1; |
2547 | /* split thp mappings and disable thp for future mappings */ | 2543 | /* split thp mappings and disable thp for future mappings */ |
2548 | thp_split_mm(mm); | 2544 | thp_split_mm(mm); |
2549 | zap_zero_pages(mm); | 2545 | walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); |
2550 | up_write(&mm->mmap_sem); | 2546 | up_write(&mm->mmap_sem); |
2551 | return 0; | 2547 | return 0; |
2552 | } | 2548 | } |
@@ -2589,12 +2585,13 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, | |||
2589 | return 0; | 2585 | return 0; |
2590 | } | 2586 | } |
2591 | 2587 | ||
2588 | static const struct mm_walk_ops enable_skey_walk_ops = { | ||
2589 | .hugetlb_entry = __s390_enable_skey_hugetlb, | ||
2590 | .pte_entry = __s390_enable_skey_pte, | ||
2591 | }; | ||
2592 | |||
2592 | int s390_enable_skey(void) | 2593 | int s390_enable_skey(void) |
2593 | { | 2594 | { |
2594 | struct mm_walk walk = { | ||
2595 | .hugetlb_entry = __s390_enable_skey_hugetlb, | ||
2596 | .pte_entry = __s390_enable_skey_pte, | ||
2597 | }; | ||
2598 | struct mm_struct *mm = current->mm; | 2595 | struct mm_struct *mm = current->mm; |
2599 | struct vm_area_struct *vma; | 2596 | struct vm_area_struct *vma; |
2600 | int rc = 0; | 2597 | int rc = 0; |
@@ -2614,8 +2611,7 @@ int s390_enable_skey(void) | |||
2614 | } | 2611 | } |
2615 | mm->def_flags &= ~VM_MERGEABLE; | 2612 | mm->def_flags &= ~VM_MERGEABLE; |
2616 | 2613 | ||
2617 | walk.mm = mm; | 2614 | walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); |
2618 | walk_page_range(0, TASK_SIZE, &walk); | ||
2619 | 2615 | ||
2620 | out_up: | 2616 | out_up: |
2621 | up_write(&mm->mmap_sem); | 2617 | up_write(&mm->mmap_sem); |
@@ -2633,13 +2629,14 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr, | |||
2633 | return 0; | 2629 | return 0; |
2634 | } | 2630 | } |
2635 | 2631 | ||
2632 | static const struct mm_walk_ops reset_cmma_walk_ops = { | ||
2633 | .pte_entry = __s390_reset_cmma, | ||
2634 | }; | ||
2635 | |||
2636 | void s390_reset_cmma(struct mm_struct *mm) | 2636 | void s390_reset_cmma(struct mm_struct *mm) |
2637 | { | 2637 | { |
2638 | struct mm_walk walk = { .pte_entry = __s390_reset_cmma }; | ||
2639 | |||
2640 | down_write(&mm->mmap_sem); | 2638 | down_write(&mm->mmap_sem); |
2641 | walk.mm = mm; | 2639 | walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); |
2642 | walk_page_range(0, TASK_SIZE, &walk); | ||
2643 | up_write(&mm->mmap_sem); | 2640 | up_write(&mm->mmap_sem); |
2644 | } | 2641 | } |
2645 | EXPORT_SYMBOL_GPL(s390_reset_cmma); | 2642 | EXPORT_SYMBOL_GPL(s390_reset_cmma); |
diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig b/drivers/gpu/drm/amd/amdgpu/Kconfig
index f6e5c0282fc1..2e98c016cb47 100644
--- a/drivers/gpu/drm/amd/amdgpu/Kconfig
+++ b/drivers/gpu/drm/amd/amdgpu/Kconfig
@@ -27,7 +27,9 @@ config DRM_AMDGPU_CIK | |||
27 | config DRM_AMDGPU_USERPTR | 27 | config DRM_AMDGPU_USERPTR |
28 | bool "Always enable userptr write support" | 28 | bool "Always enable userptr write support" |
29 | depends on DRM_AMDGPU | 29 | depends on DRM_AMDGPU |
30 | depends on HMM_MIRROR | 30 | depends on MMU |
31 | select HMM_MIRROR | ||
32 | select MMU_NOTIFIER | ||
31 | help | 33 | help |
32 | This option selects CONFIG_HMM and CONFIG_HMM_MIRROR if it | 34 | This option selects CONFIG_HMM and CONFIG_HMM_MIRROR if it |
33 | isn't already selected to enabled full userptr support. | 35 | isn't already selected to enabled full userptr support. |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 48a2070e72f2..bdf849da32e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/pm_runtime.h> | 35 | #include <linux/pm_runtime.h> |
36 | #include <linux/vga_switcheroo.h> | 36 | #include <linux/vga_switcheroo.h> |
37 | #include <drm/drm_probe_helper.h> | 37 | #include <drm/drm_probe_helper.h> |
38 | #include <linux/mmu_notifier.h> | ||
38 | 39 | ||
39 | #include "amdgpu.h" | 40 | #include "amdgpu.h" |
40 | #include "amdgpu_irq.h" | 41 | #include "amdgpu_irq.h" |
@@ -1469,6 +1470,7 @@ static void __exit amdgpu_exit(void) | |||
1469 | amdgpu_unregister_atpx_handler(); | 1470 | amdgpu_unregister_atpx_handler(); |
1470 | amdgpu_sync_fini(); | 1471 | amdgpu_sync_fini(); |
1471 | amdgpu_fence_slab_fini(); | 1472 | amdgpu_fence_slab_fini(); |
1473 | mmu_notifier_synchronize(); | ||
1472 | } | 1474 | } |
1473 | 1475 | ||
1474 | module_init(amdgpu_init); | 1476 | module_init(amdgpu_init); |
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index f1f8cdd695d3..31d4deb5d294 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -195,13 +195,14 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, | |||
195 | * Block for operations on BOs to finish and mark pages as accessed and | 195 | * Block for operations on BOs to finish and mark pages as accessed and |
196 | * potentially dirty. | 196 | * potentially dirty. |
197 | */ | 197 | */ |
198 | static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, | 198 | static int |
199 | const struct hmm_update *update) | 199 | amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, |
200 | const struct mmu_notifier_range *update) | ||
200 | { | 201 | { |
201 | struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); | 202 | struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); |
202 | unsigned long start = update->start; | 203 | unsigned long start = update->start; |
203 | unsigned long end = update->end; | 204 | unsigned long end = update->end; |
204 | bool blockable = update->blockable; | 205 | bool blockable = mmu_notifier_range_blockable(update); |
205 | struct interval_tree_node *it; | 206 | struct interval_tree_node *it; |
206 | 207 | ||
207 | /* notification is exclusive, but interval is inclusive */ | 208 | /* notification is exclusive, but interval is inclusive */ |
@@ -243,13 +244,14 @@ static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, | |||
243 | * necessitates evicting all user-mode queues of the process. The BOs | 244 | * necessitates evicting all user-mode queues of the process. The BOs |
244 | * are restorted in amdgpu_mn_invalidate_range_end_hsa. | 245 | * are restorted in amdgpu_mn_invalidate_range_end_hsa. |
245 | */ | 246 | */ |
246 | static int amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror, | 247 | static int |
247 | const struct hmm_update *update) | 248 | amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror, |
249 | const struct mmu_notifier_range *update) | ||
248 | { | 250 | { |
249 | struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); | 251 | struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); |
250 | unsigned long start = update->start; | 252 | unsigned long start = update->start; |
251 | unsigned long end = update->end; | 253 | unsigned long end = update->end; |
252 | bool blockable = update->blockable; | 254 | bool blockable = mmu_notifier_range_blockable(update); |
253 | struct interval_tree_node *it; | 255 | struct interval_tree_node *it; |
254 | 256 | ||
255 | /* notification is exclusive, but interval is inclusive */ | 257 | /* notification is exclusive, but interval is inclusive */ |
@@ -482,6 +484,5 @@ void amdgpu_hmm_init_range(struct hmm_range *range) | |||
482 | range->flags = hmm_range_flags; | 484 | range->flags = hmm_range_flags; |
483 | range->values = hmm_range_values; | 485 | range->values = hmm_range_values; |
484 | range->pfn_shift = PAGE_SHIFT; | 486 | range->pfn_shift = PAGE_SHIFT; |
485 | INIT_LIST_HEAD(&range->list); | ||
486 | } | 487 | } |
487 | } | 488 | } |
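
With struct hmm_update gone, mirror callbacks receive the mmu_notifier_range directly and ask mmu_notifier_range_blockable() whether they may sleep. A skeleton of the resulting callback shape, with the actual invalidation elided (my_mirror, my_invalidate_mappings and its lock are invented for illustration):

    #include <linux/hmm.h>
    #include <linux/mmu_notifier.h>
    #include <linux/mutex.h>

    struct my_mirror {
            struct hmm_mirror mirror;
            struct mutex lock;              /* protects device page tables */
    };

    /* Device-specific teardown of mappings in [start, end); not shown. */
    static void my_invalidate_mappings(struct my_mirror *mine,
                                       unsigned long start, unsigned long end);

    static int my_sync_pagetables(struct hmm_mirror *mirror,
                                  const struct mmu_notifier_range *update)
    {
            struct my_mirror *mine = container_of(mirror, struct my_mirror, mirror);

            /*
             * In non-blockable contexts the callback must not sleep; back
             * off with -EAGAIN and let the core retry instead.
             */
            if (!mmu_notifier_range_blockable(update)) {
                    if (!mutex_trylock(&mine->lock))
                            return -EAGAIN;
            } else {
                    mutex_lock(&mine->lock);
            }

            my_invalidate_mappings(mine, update->start, update->end);
            mutex_unlock(&mine->lock);
            return 0;
    }

Returning -EAGAIN rather than sleeping is in line with what the new "Catch sleeping/blocking for !blockable" debugging in this series is meant to catch.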
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 13b144c8f67d..dff41d0a85fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -794,7 +794,6 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) | |||
794 | struct hmm_range *range; | 794 | struct hmm_range *range; |
795 | unsigned long i; | 795 | unsigned long i; |
796 | uint64_t *pfns; | 796 | uint64_t *pfns; |
797 | int retry = 0; | ||
798 | int r = 0; | 797 | int r = 0; |
799 | 798 | ||
800 | if (!mm) /* Happens during process shutdown */ | 799 | if (!mm) /* Happens during process shutdown */ |
@@ -835,10 +834,11 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) | |||
835 | 0 : range->flags[HMM_PFN_WRITE]; | 834 | 0 : range->flags[HMM_PFN_WRITE]; |
836 | range->pfn_flags_mask = 0; | 835 | range->pfn_flags_mask = 0; |
837 | range->pfns = pfns; | 836 | range->pfns = pfns; |
838 | hmm_range_register(range, mirror, start, | 837 | range->start = start; |
839 | start + ttm->num_pages * PAGE_SIZE, PAGE_SHIFT); | 838 | range->end = start + ttm->num_pages * PAGE_SIZE; |
839 | |||
840 | hmm_range_register(range, mirror); | ||
840 | 841 | ||
841 | retry: | ||
842 | /* | 842 | /* |
843 | * Just wait for range to be valid, safe to ignore return value as we | 843 | * Just wait for range to be valid, safe to ignore return value as we |
844 | * will use the return value of hmm_range_fault() below under the | 844 | * will use the return value of hmm_range_fault() below under the |
@@ -847,24 +847,12 @@ retry: | |||
847 | hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT); | 847 | hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT); |
848 | 848 | ||
849 | down_read(&mm->mmap_sem); | 849 | down_read(&mm->mmap_sem); |
850 | 850 | r = hmm_range_fault(range, 0); | |
851 | r = hmm_range_fault(range, true); | ||
852 | if (unlikely(r < 0)) { | ||
853 | if (likely(r == -EAGAIN)) { | ||
854 | /* | ||
855 | * return -EAGAIN, mmap_sem is dropped | ||
856 | */ | ||
857 | if (retry++ < MAX_RETRY_HMM_RANGE_FAULT) | ||
858 | goto retry; | ||
859 | else | ||
860 | pr_err("Retry hmm fault too many times\n"); | ||
861 | } | ||
862 | |||
863 | goto out_up_read; | ||
864 | } | ||
865 | |||
866 | up_read(&mm->mmap_sem); | 851 | up_read(&mm->mmap_sem); |
867 | 852 | ||
853 | if (unlikely(r < 0)) | ||
854 | goto out_free_pfns; | ||
855 | |||
868 | for (i = 0; i < ttm->num_pages; i++) { | 856 | for (i = 0; i < ttm->num_pages; i++) { |
869 | pages[i] = hmm_device_entry_to_page(range, pfns[i]); | 857 | pages[i] = hmm_device_entry_to_page(range, pfns[i]); |
870 | if (unlikely(!pages[i])) { | 858 | if (unlikely(!pages[i])) { |
@@ -880,9 +868,6 @@ retry: | |||
880 | 868 | ||
881 | return 0; | 869 | return 0; |
882 | 870 | ||
883 | out_up_read: | ||
884 | if (likely(r != -EAGAIN)) | ||
885 | up_read(&mm->mmap_sem); | ||
886 | out_free_pfns: | 871 | out_free_pfns: |
887 | hmm_range_unregister(range); | 872 | hmm_range_unregister(range); |
888 | kvfree(pfns); | 873 | kvfree(pfns); |
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 3bb75d11a662..c89326125d71 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -687,9 +687,6 @@ struct kfd_process { | |||
687 | /* We want to receive a notification when the mm_struct is destroyed */ | 687 | /* We want to receive a notification when the mm_struct is destroyed */ |
688 | struct mmu_notifier mmu_notifier; | 688 | struct mmu_notifier mmu_notifier; |
689 | 689 | ||
690 | /* Use for delayed freeing of kfd_process structure */ | ||
691 | struct rcu_head rcu; | ||
692 | |||
693 | unsigned int pasid; | 690 | unsigned int pasid; |
694 | unsigned int doorbell_index; | 691 | unsigned int doorbell_index; |
695 | 692 | ||
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 0c6ac043ae3c..40e3fc0c6942 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -62,8 +62,8 @@ static struct workqueue_struct *kfd_restore_wq; | |||
62 | 62 | ||
63 | static struct kfd_process *find_process(const struct task_struct *thread); | 63 | static struct kfd_process *find_process(const struct task_struct *thread); |
64 | static void kfd_process_ref_release(struct kref *ref); | 64 | static void kfd_process_ref_release(struct kref *ref); |
65 | static struct kfd_process *create_process(const struct task_struct *thread, | 65 | static struct kfd_process *create_process(const struct task_struct *thread); |
66 | struct file *filep); | 66 | static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep); |
67 | 67 | ||
68 | static void evict_process_worker(struct work_struct *work); | 68 | static void evict_process_worker(struct work_struct *work); |
69 | static void restore_process_worker(struct work_struct *work); | 69 | static void restore_process_worker(struct work_struct *work); |
@@ -289,7 +289,15 @@ struct kfd_process *kfd_create_process(struct file *filep) | |||
289 | if (process) { | 289 | if (process) { |
290 | pr_debug("Process already found\n"); | 290 | pr_debug("Process already found\n"); |
291 | } else { | 291 | } else { |
292 | process = create_process(thread, filep); | 292 | process = create_process(thread); |
293 | if (IS_ERR(process)) | ||
294 | goto out; | ||
295 | |||
296 | ret = kfd_process_init_cwsr_apu(process, filep); | ||
297 | if (ret) { | ||
298 | process = ERR_PTR(ret); | ||
299 | goto out; | ||
300 | } | ||
293 | 301 | ||
294 | if (!procfs.kobj) | 302 | if (!procfs.kobj) |
295 | goto out; | 303 | goto out; |
@@ -478,11 +486,9 @@ static void kfd_process_ref_release(struct kref *ref) | |||
478 | queue_work(kfd_process_wq, &p->release_work); | 486 | queue_work(kfd_process_wq, &p->release_work); |
479 | } | 487 | } |
480 | 488 | ||
481 | static void kfd_process_destroy_delayed(struct rcu_head *rcu) | 489 | static void kfd_process_free_notifier(struct mmu_notifier *mn) |
482 | { | 490 | { |
483 | struct kfd_process *p = container_of(rcu, struct kfd_process, rcu); | 491 | kfd_unref_process(container_of(mn, struct kfd_process, mmu_notifier)); |
484 | |||
485 | kfd_unref_process(p); | ||
486 | } | 492 | } |
487 | 493 | ||
488 | static void kfd_process_notifier_release(struct mmu_notifier *mn, | 494 | static void kfd_process_notifier_release(struct mmu_notifier *mn, |
@@ -534,12 +540,12 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn, | |||
534 | 540 | ||
535 | mutex_unlock(&p->mutex); | 541 | mutex_unlock(&p->mutex); |
536 | 542 | ||
537 | mmu_notifier_unregister_no_release(&p->mmu_notifier, mm); | 543 | mmu_notifier_put(&p->mmu_notifier); |
538 | mmu_notifier_call_srcu(&p->rcu, &kfd_process_destroy_delayed); | ||
539 | } | 544 | } |
540 | 545 | ||
541 | static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { | 546 | static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = { |
542 | .release = kfd_process_notifier_release, | 547 | .release = kfd_process_notifier_release, |
548 | .free_notifier = kfd_process_free_notifier, | ||
543 | }; | 549 | }; |
544 | 550 | ||
545 | static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep) | 551 | static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep) |
@@ -609,81 +615,69 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd) | |||
609 | return 0; | 615 | return 0; |
610 | } | 616 | } |
611 | 617 | ||
612 | static struct kfd_process *create_process(const struct task_struct *thread, | 618 | /* |
613 | struct file *filep) | 619 | * On return the kfd_process is fully operational and will be freed when the |
620 | * mm is released | ||
621 | */ | ||
622 | static struct kfd_process *create_process(const struct task_struct *thread) | ||
614 | { | 623 | { |
615 | struct kfd_process *process; | 624 | struct kfd_process *process; |
616 | int err = -ENOMEM; | 625 | int err = -ENOMEM; |
617 | 626 | ||
618 | process = kzalloc(sizeof(*process), GFP_KERNEL); | 627 | process = kzalloc(sizeof(*process), GFP_KERNEL); |
619 | |||
620 | if (!process) | 628 | if (!process) |
621 | goto err_alloc_process; | 629 | goto err_alloc_process; |
622 | 630 | ||
623 | process->pasid = kfd_pasid_alloc(); | ||
624 | if (process->pasid == 0) | ||
625 | goto err_alloc_pasid; | ||
626 | |||
627 | if (kfd_alloc_process_doorbells(process) < 0) | ||
628 | goto err_alloc_doorbells; | ||
629 | |||
630 | kref_init(&process->ref); | 631 | kref_init(&process->ref); |
631 | |||
632 | mutex_init(&process->mutex); | 632 | mutex_init(&process->mutex); |
633 | |||
634 | process->mm = thread->mm; | 633 | process->mm = thread->mm; |
635 | |||
636 | /* register notifier */ | ||
637 | process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; | ||
638 | err = mmu_notifier_register(&process->mmu_notifier, process->mm); | ||
639 | if (err) | ||
640 | goto err_mmu_notifier; | ||
641 | |||
642 | hash_add_rcu(kfd_processes_table, &process->kfd_processes, | ||
643 | (uintptr_t)process->mm); | ||
644 | |||
645 | process->lead_thread = thread->group_leader; | 634 | process->lead_thread = thread->group_leader; |
646 | get_task_struct(process->lead_thread); | ||
647 | |||
648 | INIT_LIST_HEAD(&process->per_device_data); | 635 | INIT_LIST_HEAD(&process->per_device_data); |
649 | 636 | INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker); | |
637 | INIT_DELAYED_WORK(&process->restore_work, restore_process_worker); | ||
638 | process->last_restore_timestamp = get_jiffies_64(); | ||
650 | kfd_event_init_process(process); | 639 | kfd_event_init_process(process); |
640 | process->is_32bit_user_mode = in_compat_syscall(); | ||
641 | |||
642 | process->pasid = kfd_pasid_alloc(); | ||
643 | if (process->pasid == 0) | ||
644 | goto err_alloc_pasid; | ||
645 | |||
646 | if (kfd_alloc_process_doorbells(process) < 0) | ||
647 | goto err_alloc_doorbells; | ||
651 | 648 | ||
652 | err = pqm_init(&process->pqm, process); | 649 | err = pqm_init(&process->pqm, process); |
653 | if (err != 0) | 650 | if (err != 0) |
654 | goto err_process_pqm_init; | 651 | goto err_process_pqm_init; |
655 | 652 | ||
656 | /* init process apertures*/ | 653 | /* init process apertures*/ |
657 | process->is_32bit_user_mode = in_compat_syscall(); | ||
658 | err = kfd_init_apertures(process); | 654 | err = kfd_init_apertures(process); |
659 | if (err != 0) | 655 | if (err != 0) |
660 | goto err_init_apertures; | 656 | goto err_init_apertures; |
661 | 657 | ||
662 | INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker); | 658 | /* Must be last, have to use release destruction after this */ |
663 | INIT_DELAYED_WORK(&process->restore_work, restore_process_worker); | 659 | process->mmu_notifier.ops = &kfd_process_mmu_notifier_ops; |
664 | process->last_restore_timestamp = get_jiffies_64(); | 660 | err = mmu_notifier_register(&process->mmu_notifier, process->mm); |
665 | |||
666 | err = kfd_process_init_cwsr_apu(process, filep); | ||
667 | if (err) | 661 | if (err) |
668 | goto err_init_cwsr; | 662 | goto err_register_notifier; |
663 | |||
664 | get_task_struct(process->lead_thread); | ||
665 | hash_add_rcu(kfd_processes_table, &process->kfd_processes, | ||
666 | (uintptr_t)process->mm); | ||
669 | 667 | ||
670 | return process; | 668 | return process; |
671 | 669 | ||
672 | err_init_cwsr: | 670 | err_register_notifier: |
673 | kfd_process_free_outstanding_kfd_bos(process); | 671 | kfd_process_free_outstanding_kfd_bos(process); |
674 | kfd_process_destroy_pdds(process); | 672 | kfd_process_destroy_pdds(process); |
675 | err_init_apertures: | 673 | err_init_apertures: |
676 | pqm_uninit(&process->pqm); | 674 | pqm_uninit(&process->pqm); |
677 | err_process_pqm_init: | 675 | err_process_pqm_init: |
678 | hash_del_rcu(&process->kfd_processes); | ||
679 | synchronize_rcu(); | ||
680 | mmu_notifier_unregister_no_release(&process->mmu_notifier, process->mm); | ||
681 | err_mmu_notifier: | ||
682 | mutex_destroy(&process->mutex); | ||
683 | kfd_free_process_doorbells(process); | 676 | kfd_free_process_doorbells(process); |
684 | err_alloc_doorbells: | 677 | err_alloc_doorbells: |
685 | kfd_pasid_free(process->pasid); | 678 | kfd_pasid_free(process->pasid); |
686 | err_alloc_pasid: | 679 | err_alloc_pasid: |
680 | mutex_destroy(&process->mutex); | ||
687 | kfree(process); | 681 | kfree(process); |
688 | err_alloc_process: | 682 | err_alloc_process: |
689 | return ERR_PTR(err); | 683 | return ERR_PTR(err); |
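
The kfd rework above is the general recipe for retiring mmu_notifier_unregister_no_release(): hold a kref on the object, register the notifier only as the very last construction step, drop it with mmu_notifier_put() on teardown, and free the object from the new free_notifier() hook once the notifier core's SRCU grace period has passed. A condensed sketch of that idiom around an invented my_process object:

    #include <linux/err.h>
    #include <linux/kref.h>
    #include <linux/mmu_notifier.h>
    #include <linux/slab.h>

    struct my_process {
            struct kref ref;
            struct mmu_notifier notifier;
    };

    static void my_process_destroy(struct kref *ref)
    {
            kfree(container_of(ref, struct my_process, ref));
    }

    /* Called by the mmu_notifier core after its SRCU grace period. */
    static void my_process_free_notifier(struct mmu_notifier *mn)
    {
            struct my_process *p = container_of(mn, struct my_process, notifier);

            kref_put(&p->ref, my_process_destroy);
    }

    static void my_process_release(struct mmu_notifier *mn, struct mm_struct *mm)
    {
            /* tear down anything that depends on the mm ... */
            mmu_notifier_put(mn);   /* ... then drop the notifier reference */
    }

    static const struct mmu_notifier_ops my_process_notifier_ops = {
            .release        = my_process_release,
            .free_notifier  = my_process_free_notifier,
    };

    static struct my_process *my_process_create(struct mm_struct *mm)
    {
            struct my_process *p = kzalloc(sizeof(*p), GFP_KERNEL);
            int err;

            if (!p)
                    return ERR_PTR(-ENOMEM);
            kref_init(&p->ref);

            /* Register last: from here on the object dies via release/put. */
            p->notifier.ops = &my_process_notifier_ops;
            err = mmu_notifier_register(&p->notifier, mm);
            if (err) {
                    kfree(p);
                    return ERR_PTR(err);
            }
            return p;
    }

Unloadable modules additionally call mmu_notifier_synchronize() on exit, as the amdgpu_exit() hunk above now does, so no free_notifier() callback can still be running when the module text goes away.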
diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig
index 96b9814e6d06..3558df043592 100644
--- a/drivers/gpu/drm/nouveau/Kconfig
+++ b/drivers/gpu/drm/nouveau/Kconfig
@@ -86,9 +86,10 @@ config DRM_NOUVEAU_SVM | |||
86 | bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support" | 86 | bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support" |
87 | depends on DEVICE_PRIVATE | 87 | depends on DEVICE_PRIVATE |
88 | depends on DRM_NOUVEAU | 88 | depends on DRM_NOUVEAU |
89 | depends on HMM_MIRROR | 89 | depends on MMU |
90 | depends on STAGING | 90 | depends on STAGING |
91 | select MIGRATE_VMA_HELPER | 91 | select HMM_MIRROR |
92 | select MMU_NOTIFIER | ||
92 | default n | 93 | default n |
93 | help | 94 | help |
94 | Say Y here if you want to enable experimental support for | 95 | Say Y here if you want to enable experimental support for |
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 1333220787a1..fa1439941596 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -44,8 +44,6 @@ | |||
44 | #define DMEM_CHUNK_SIZE (2UL << 20) | 44 | #define DMEM_CHUNK_SIZE (2UL << 20) |
45 | #define DMEM_CHUNK_NPAGES (DMEM_CHUNK_SIZE >> PAGE_SHIFT) | 45 | #define DMEM_CHUNK_NPAGES (DMEM_CHUNK_SIZE >> PAGE_SHIFT) |
46 | 46 | ||
47 | struct nouveau_migrate; | ||
48 | |||
49 | enum nouveau_aper { | 47 | enum nouveau_aper { |
50 | NOUVEAU_APER_VIRT, | 48 | NOUVEAU_APER_VIRT, |
51 | NOUVEAU_APER_VRAM, | 49 | NOUVEAU_APER_VRAM, |
@@ -86,21 +84,13 @@ static inline struct nouveau_dmem *page_to_dmem(struct page *page) | |||
86 | return container_of(page->pgmap, struct nouveau_dmem, pagemap); | 84 | return container_of(page->pgmap, struct nouveau_dmem, pagemap); |
87 | } | 85 | } |
88 | 86 | ||
89 | struct nouveau_dmem_fault { | 87 | static unsigned long nouveau_dmem_page_addr(struct page *page) |
90 | struct nouveau_drm *drm; | 88 | { |
91 | struct nouveau_fence *fence; | 89 | struct nouveau_dmem_chunk *chunk = page->zone_device_data; |
92 | dma_addr_t *dma; | 90 | unsigned long idx = page_to_pfn(page) - chunk->pfn_first; |
93 | unsigned long npages; | ||
94 | }; | ||
95 | 91 | ||
96 | struct nouveau_migrate { | 92 | return (idx << PAGE_SHIFT) + chunk->bo->bo.offset; |
97 | struct vm_area_struct *vma; | 93 | } |
98 | struct nouveau_drm *drm; | ||
99 | struct nouveau_fence *fence; | ||
100 | unsigned long npages; | ||
101 | dma_addr_t *dma; | ||
102 | unsigned long dma_nr; | ||
103 | }; | ||
104 | 94 | ||
105 | static void nouveau_dmem_page_free(struct page *page) | 95 | static void nouveau_dmem_page_free(struct page *page) |
106 | { | 96 | { |
@@ -125,165 +115,90 @@ static void nouveau_dmem_page_free(struct page *page) | |||
125 | spin_unlock(&chunk->lock); | 115 | spin_unlock(&chunk->lock); |
126 | } | 116 | } |
127 | 117 | ||
128 | static void | 118 | static void nouveau_dmem_fence_done(struct nouveau_fence **fence) |
129 | nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, | ||
130 | const unsigned long *src_pfns, | ||
131 | unsigned long *dst_pfns, | ||
132 | unsigned long start, | ||
133 | unsigned long end, | ||
134 | void *private) | ||
135 | { | 119 | { |
136 | struct nouveau_dmem_fault *fault = private; | 120 | if (fence) { |
137 | struct nouveau_drm *drm = fault->drm; | 121 | nouveau_fence_wait(*fence, true, false); |
138 | struct device *dev = drm->dev->dev; | 122 | nouveau_fence_unref(fence); |
139 | unsigned long addr, i, npages = 0; | 123 | } else { |
140 | nouveau_migrate_copy_t copy; | 124 | /* |
141 | int ret; | 125 | * FIXME wait for channel to be IDLE before calling finalizing |
142 | 126 | * the hmem object. | |
143 | 127 | */ | |
144 | /* First allocate new memory */ | ||
145 | for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { | ||
146 | struct page *dpage, *spage; | ||
147 | |||
148 | dst_pfns[i] = 0; | ||
149 | spage = migrate_pfn_to_page(src_pfns[i]); | ||
150 | if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) | ||
151 | continue; | ||
152 | |||
153 | dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr); | ||
154 | if (!dpage) { | ||
155 | dst_pfns[i] = MIGRATE_PFN_ERROR; | ||
156 | continue; | ||
157 | } | ||
158 | lock_page(dpage); | ||
159 | |||
160 | dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) | | ||
161 | MIGRATE_PFN_LOCKED; | ||
162 | npages++; | ||
163 | } | ||
164 | |||
165 | /* Allocate storage for DMA addresses, so we can unmap later. */ | ||
166 | fault->dma = kmalloc(sizeof(*fault->dma) * npages, GFP_KERNEL); | ||
167 | if (!fault->dma) | ||
168 | goto error; | ||
169 | |||
170 | /* Copy things over */ | ||
171 | copy = drm->dmem->migrate.copy_func; | ||
172 | for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { | ||
173 | struct nouveau_dmem_chunk *chunk; | ||
174 | struct page *spage, *dpage; | ||
175 | u64 src_addr, dst_addr; | ||
176 | |||
177 | dpage = migrate_pfn_to_page(dst_pfns[i]); | ||
178 | if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR) | ||
179 | continue; | ||
180 | |||
181 | spage = migrate_pfn_to_page(src_pfns[i]); | ||
182 | if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) { | ||
183 | dst_pfns[i] = MIGRATE_PFN_ERROR; | ||
184 | __free_page(dpage); | ||
185 | continue; | ||
186 | } | ||
187 | |||
188 | fault->dma[fault->npages] = | ||
189 | dma_map_page_attrs(dev, dpage, 0, PAGE_SIZE, | ||
190 | PCI_DMA_BIDIRECTIONAL, | ||
191 | DMA_ATTR_SKIP_CPU_SYNC); | ||
192 | if (dma_mapping_error(dev, fault->dma[fault->npages])) { | ||
193 | dst_pfns[i] = MIGRATE_PFN_ERROR; | ||
194 | __free_page(dpage); | ||
195 | continue; | ||
196 | } | ||
197 | |||
198 | dst_addr = fault->dma[fault->npages++]; | ||
199 | |||
200 | chunk = spage->zone_device_data; | ||
201 | src_addr = page_to_pfn(spage) - chunk->pfn_first; | ||
202 | src_addr = (src_addr << PAGE_SHIFT) + chunk->bo->bo.offset; | ||
203 | |||
204 | ret = copy(drm, 1, NOUVEAU_APER_HOST, dst_addr, | ||
205 | NOUVEAU_APER_VRAM, src_addr); | ||
206 | if (ret) { | ||
207 | dst_pfns[i] = MIGRATE_PFN_ERROR; | ||
208 | __free_page(dpage); | ||
209 | continue; | ||
210 | } | ||
211 | } | 128 | } |
129 | } | ||
212 | 130 | ||
213 | nouveau_fence_new(drm->dmem->migrate.chan, false, &fault->fence); | 131 | static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm, |
214 | 132 | struct vm_fault *vmf, struct migrate_vma *args, | |
215 | return; | 133 | dma_addr_t *dma_addr) |
216 | 134 | { | |
217 | error: | 135 | struct device *dev = drm->dev->dev; |
218 | for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, ++i) { | 136 | struct page *dpage, *spage; |
219 | struct page *page; | ||
220 | 137 | ||
221 | if (!dst_pfns[i] || dst_pfns[i] == MIGRATE_PFN_ERROR) | 138 | spage = migrate_pfn_to_page(args->src[0]); |
222 | continue; | 139 | if (!spage || !(args->src[0] & MIGRATE_PFN_MIGRATE)) |
140 | return 0; | ||
223 | 141 | ||
224 | page = migrate_pfn_to_page(dst_pfns[i]); | 142 | dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma, vmf->address); |
225 | dst_pfns[i] = MIGRATE_PFN_ERROR; | 143 | if (!dpage) |
226 | if (page == NULL) | 144 | return VM_FAULT_SIGBUS; |
227 | continue; | 145 | lock_page(dpage); |
228 | 146 | ||
229 | __free_page(page); | 147 | *dma_addr = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); |
230 | } | 148 | if (dma_mapping_error(dev, *dma_addr)) |
231 | } | 149 | goto error_free_page; |
232 | 150 | ||
233 | void nouveau_dmem_fault_finalize_and_map(struct vm_area_struct *vma, | 151 | if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_HOST, *dma_addr, |
234 | const unsigned long *src_pfns, | 152 | NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage))) |
235 | const unsigned long *dst_pfns, | 153 | goto error_dma_unmap; |
236 | unsigned long start, | ||
237 | unsigned long end, | ||
238 | void *private) | ||
239 | { | ||
240 | struct nouveau_dmem_fault *fault = private; | ||
241 | struct nouveau_drm *drm = fault->drm; | ||
242 | 154 | ||
243 | if (fault->fence) { | 155 | args->dst[0] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; |
244 | nouveau_fence_wait(fault->fence, true, false); | 156 | return 0; |
245 | nouveau_fence_unref(&fault->fence); | ||
246 | } else { | ||
247 | /* | ||
248 | * FIXME wait for channel to be IDLE before calling finalizing | ||
249 | * the hmem object below (nouveau_migrate_hmem_fini()). | ||
250 | */ | ||
251 | } | ||
252 | 157 | ||
253 | while (fault->npages--) { | 158 | error_dma_unmap: |
254 | dma_unmap_page(drm->dev->dev, fault->dma[fault->npages], | 159 | dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); |
255 | PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); | 160 | error_free_page: |
256 | } | 161 | __free_page(dpage); |
257 | kfree(fault->dma); | 162 | return VM_FAULT_SIGBUS; |
258 | } | 163 | } |
259 | 164 | ||
260 | static const struct migrate_vma_ops nouveau_dmem_fault_migrate_ops = { | ||
261 | .alloc_and_copy = nouveau_dmem_fault_alloc_and_copy, | ||
262 | .finalize_and_map = nouveau_dmem_fault_finalize_and_map, | ||
263 | }; | ||
264 | |||
265 | static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) | 165 | static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) |
266 | { | 166 | { |
267 | struct nouveau_dmem *dmem = page_to_dmem(vmf->page); | 167 | struct nouveau_dmem *dmem = page_to_dmem(vmf->page); |
268 | unsigned long src[1] = {0}, dst[1] = {0}; | 168 | struct nouveau_drm *drm = dmem->drm; |
269 | struct nouveau_dmem_fault fault = { .drm = dmem->drm }; | 169 | struct nouveau_fence *fence; |
270 | int ret; | 170 | unsigned long src = 0, dst = 0; |
171 | dma_addr_t dma_addr = 0; | ||
172 | vm_fault_t ret; | ||
173 | struct migrate_vma args = { | ||
174 | .vma = vmf->vma, | ||
175 | .start = vmf->address, | ||
176 | .end = vmf->address + PAGE_SIZE, | ||
177 | .src = &src, | ||
178 | .dst = &dst, | ||
179 | }; | ||
271 | 180 | ||
272 | /* | 181 | /* |
273 | * FIXME what we really want is to find some heuristic to migrate more | 182 | * FIXME what we really want is to find some heuristic to migrate more |
274 | * than just one page on CPU fault. When such fault happens it is very | 183 | * than just one page on CPU fault. When such fault happens it is very |
275 | * likely that more surrounding page will CPU fault too. | 184 | * likely that more surrounding page will CPU fault too. |
276 | */ | 185 | */ |
277 | ret = migrate_vma(&nouveau_dmem_fault_migrate_ops, vmf->vma, | 186 | if (migrate_vma_setup(&args) < 0) |
278 | vmf->address, vmf->address + PAGE_SIZE, | ||
279 | src, dst, &fault); | ||
280 | if (ret) | ||
281 | return VM_FAULT_SIGBUS; | 187 | return VM_FAULT_SIGBUS; |
188 | if (!args.cpages) | ||
189 | return 0; | ||
282 | 190 | ||
283 | if (dst[0] == MIGRATE_PFN_ERROR) | 191 | ret = nouveau_dmem_fault_copy_one(drm, vmf, &args, &dma_addr); |
284 | return VM_FAULT_SIGBUS; | 192 | if (ret || dst == 0) |
193 | goto done; | ||
285 | 194 | ||
286 | return 0; | 195 | nouveau_fence_new(dmem->migrate.chan, false, &fence); |
196 | migrate_vma_pages(&args); | ||
197 | nouveau_dmem_fence_done(&fence); | ||
198 | dma_unmap_page(drm->dev->dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); | ||
199 | done: | ||
200 | migrate_vma_finalize(&args); | ||
201 | return ret; | ||
287 | } | 202 | } |
288 | 203 | ||
289 | static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = { | 204 | static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = { |
@@ -642,188 +557,115 @@ out_free: | |||
642 | drm->dmem = NULL; | 557 | drm->dmem = NULL; |
643 | } | 558 | } |
644 | 559 | ||
645 | static void | 560 | static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, |
646 | nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma, | 561 | unsigned long src, dma_addr_t *dma_addr) |
647 | const unsigned long *src_pfns, | ||
648 | unsigned long *dst_pfns, | ||
649 | unsigned long start, | ||
650 | unsigned long end, | ||
651 | void *private) | ||
652 | { | 562 | { |
653 | struct nouveau_migrate *migrate = private; | ||
654 | struct nouveau_drm *drm = migrate->drm; | ||
655 | struct device *dev = drm->dev->dev; | 563 | struct device *dev = drm->dev->dev; |
656 | unsigned long addr, i, npages = 0; | 564 | struct page *dpage, *spage; |
657 | nouveau_migrate_copy_t copy; | ||
658 | int ret; | ||
659 | |||
660 | /* First allocate new memory */ | ||
661 | for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { | ||
662 | struct page *dpage, *spage; | ||
663 | |||
664 | dst_pfns[i] = 0; | ||
665 | spage = migrate_pfn_to_page(src_pfns[i]); | ||
666 | if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) | ||
667 | continue; | ||
668 | |||
669 | dpage = nouveau_dmem_page_alloc_locked(drm); | ||
670 | if (!dpage) | ||
671 | continue; | ||
672 | |||
673 | dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) | | ||
674 | MIGRATE_PFN_LOCKED | | ||
675 | MIGRATE_PFN_DEVICE; | ||
676 | npages++; | ||
677 | } | ||
678 | |||
679 | if (!npages) | ||
680 | return; | ||
681 | |||
682 | /* Allocate storage for DMA addresses, so we can unmap later. */ | ||
683 | migrate->dma = kmalloc(sizeof(*migrate->dma) * npages, GFP_KERNEL); | ||
684 | if (!migrate->dma) | ||
685 | goto error; | ||
686 | |||
687 | /* Copy things over */ | ||
688 | copy = drm->dmem->migrate.copy_func; | ||
689 | for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { | ||
690 | struct nouveau_dmem_chunk *chunk; | ||
691 | struct page *spage, *dpage; | ||
692 | u64 src_addr, dst_addr; | ||
693 | |||
694 | dpage = migrate_pfn_to_page(dst_pfns[i]); | ||
695 | if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR) | ||
696 | continue; | ||
697 | |||
698 | chunk = dpage->zone_device_data; | ||
699 | dst_addr = page_to_pfn(dpage) - chunk->pfn_first; | ||
700 | dst_addr = (dst_addr << PAGE_SHIFT) + chunk->bo->bo.offset; | ||
701 | |||
702 | spage = migrate_pfn_to_page(src_pfns[i]); | ||
703 | if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) { | ||
704 | nouveau_dmem_page_free_locked(drm, dpage); | ||
705 | dst_pfns[i] = 0; | ||
706 | continue; | ||
707 | } | ||
708 | |||
709 | migrate->dma[migrate->dma_nr] = | ||
710 | dma_map_page_attrs(dev, spage, 0, PAGE_SIZE, | ||
711 | PCI_DMA_BIDIRECTIONAL, | ||
712 | DMA_ATTR_SKIP_CPU_SYNC); | ||
713 | if (dma_mapping_error(dev, migrate->dma[migrate->dma_nr])) { | ||
714 | nouveau_dmem_page_free_locked(drm, dpage); | ||
715 | dst_pfns[i] = 0; | ||
716 | continue; | ||
717 | } | ||
718 | |||
719 | src_addr = migrate->dma[migrate->dma_nr++]; | ||
720 | 565 | ||
721 | ret = copy(drm, 1, NOUVEAU_APER_VRAM, dst_addr, | 566 | spage = migrate_pfn_to_page(src); |
722 | NOUVEAU_APER_HOST, src_addr); | 567 | if (!spage || !(src & MIGRATE_PFN_MIGRATE)) |
723 | if (ret) { | 568 | goto out; |
724 | nouveau_dmem_page_free_locked(drm, dpage); | ||
725 | dst_pfns[i] = 0; | ||
726 | continue; | ||
727 | } | ||
728 | } | ||
729 | |||
730 | nouveau_fence_new(drm->dmem->migrate.chan, false, &migrate->fence); | ||
731 | 569 | ||
732 | return; | 570 | dpage = nouveau_dmem_page_alloc_locked(drm); |
571 | if (!dpage) | ||
572 | return 0; | ||
733 | 573 | ||
734 | error: | 574 | *dma_addr = dma_map_page(dev, spage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); |
735 | for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, ++i) { | 575 | if (dma_mapping_error(dev, *dma_addr)) |
736 | struct page *page; | 576 | goto out_free_page; |
737 | 577 | ||
738 | if (!dst_pfns[i] || dst_pfns[i] == MIGRATE_PFN_ERROR) | 578 | if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_VRAM, |
739 | continue; | 579 | nouveau_dmem_page_addr(dpage), NOUVEAU_APER_HOST, |
580 | *dma_addr)) | ||
581 | goto out_dma_unmap; | ||
740 | 582 | ||
741 | page = migrate_pfn_to_page(dst_pfns[i]); | 583 | return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; |
742 | dst_pfns[i] = MIGRATE_PFN_ERROR; | ||
743 | if (page == NULL) | ||
744 | continue; | ||
745 | 584 | ||
746 | __free_page(page); | 585 | out_dma_unmap: |
747 | } | 586 | dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); |
587 | out_free_page: | ||
588 | nouveau_dmem_page_free_locked(drm, dpage); | ||
589 | out: | ||
590 | return 0; | ||
748 | } | 591 | } |
749 | 592 | ||
750 | void nouveau_dmem_migrate_finalize_and_map(struct vm_area_struct *vma, | 593 | static void nouveau_dmem_migrate_chunk(struct nouveau_drm *drm, |
751 | const unsigned long *src_pfns, | 594 | struct migrate_vma *args, dma_addr_t *dma_addrs) |
752 | const unsigned long *dst_pfns, | ||
753 | unsigned long start, | ||
754 | unsigned long end, | ||
755 | void *private) | ||
756 | { | 595 | { |
757 | struct nouveau_migrate *migrate = private; | 596 | struct nouveau_fence *fence; |
758 | struct nouveau_drm *drm = migrate->drm; | 597 | unsigned long addr = args->start, nr_dma = 0, i; |
759 | 598 | ||
760 | if (migrate->fence) { | 599 | for (i = 0; addr < args->end; i++) { |
761 | nouveau_fence_wait(migrate->fence, true, false); | 600 | args->dst[i] = nouveau_dmem_migrate_copy_one(drm, args->src[i], |
762 | nouveau_fence_unref(&migrate->fence); | 601 | dma_addrs + nr_dma); |
763 | } else { | 602 | if (args->dst[i]) |
764 | /* | 603 | nr_dma++; |
765 | * FIXME wait for channel to be IDLE before finalizing | 604 | addr += PAGE_SIZE; |
766 | * the hmem object below (nouveau_migrate_hmem_fini()) ? | ||
767 | */ | ||
768 | } | 605 | } |
769 | 606 | ||
770 | while (migrate->dma_nr--) { | 607 | nouveau_fence_new(drm->dmem->migrate.chan, false, &fence); |
771 | dma_unmap_page(drm->dev->dev, migrate->dma[migrate->dma_nr], | 608 | migrate_vma_pages(args); |
772 | PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); | 609 | nouveau_dmem_fence_done(&fence); |
773 | } | ||
774 | kfree(migrate->dma); | ||
775 | 610 | ||
611 | while (nr_dma--) { | ||
612 | dma_unmap_page(drm->dev->dev, dma_addrs[nr_dma], PAGE_SIZE, | ||
613 | DMA_BIDIRECTIONAL); | ||
614 | } | ||
776 | /* | 615 | /* |
777 | * FIXME optimization: update GPU page table to point to newly | 616 | * FIXME optimization: update GPU page table to point to newly migrated |
778 | * migrated memory. | 617 | * memory. |
779 | */ | 618 | */ |
619 | migrate_vma_finalize(args); | ||
780 | } | 620 | } |
781 | 621 | ||
782 | static const struct migrate_vma_ops nouveau_dmem_migrate_ops = { | ||
783 | .alloc_and_copy = nouveau_dmem_migrate_alloc_and_copy, | ||
784 | .finalize_and_map = nouveau_dmem_migrate_finalize_and_map, | ||
785 | }; | ||
786 | |||
787 | int | 622 | int |
788 | nouveau_dmem_migrate_vma(struct nouveau_drm *drm, | 623 | nouveau_dmem_migrate_vma(struct nouveau_drm *drm, |
789 | struct vm_area_struct *vma, | 624 | struct vm_area_struct *vma, |
790 | unsigned long start, | 625 | unsigned long start, |
791 | unsigned long end) | 626 | unsigned long end) |
792 | { | 627 | { |
793 | unsigned long *src_pfns, *dst_pfns, npages; | 628 | unsigned long npages = (end - start) >> PAGE_SHIFT; |
794 | struct nouveau_migrate migrate = {0}; | 629 | unsigned long max = min(SG_MAX_SINGLE_ALLOC, npages); |
795 | unsigned long i, c, max; | 630 | dma_addr_t *dma_addrs; |
796 | int ret = 0; | 631 | struct migrate_vma args = { |
797 | 632 | .vma = vma, | |
798 | npages = (end - start) >> PAGE_SHIFT; | 633 | .start = start, |
799 | max = min(SG_MAX_SINGLE_ALLOC, npages); | 634 | }; |
800 | src_pfns = kzalloc(sizeof(long) * max, GFP_KERNEL); | 635 | unsigned long c, i; |
801 | if (src_pfns == NULL) | 636 | int ret = -ENOMEM; |
802 | return -ENOMEM; | 637 | |
803 | dst_pfns = kzalloc(sizeof(long) * max, GFP_KERNEL); | 638 | args.src = kcalloc(max, sizeof(args.src), GFP_KERNEL); |
804 | if (dst_pfns == NULL) { | 639 | if (!args.src) |
805 | kfree(src_pfns); | 640 | goto out; |
806 | return -ENOMEM; | 641 | args.dst = kcalloc(max, sizeof(args.dst), GFP_KERNEL); |
807 | } | 642 | if (!args.dst) |
643 | goto out_free_src; | ||
808 | 644 | ||
809 | migrate.drm = drm; | 645 | dma_addrs = kmalloc_array(max, sizeof(*dma_addrs), GFP_KERNEL); |
810 | migrate.vma = vma; | 646 | if (!dma_addrs) |
811 | migrate.npages = npages; | 647 | goto out_free_dst; |
812 | for (i = 0; i < npages; i += c) { | ||
813 | unsigned long next; | ||
814 | 648 | ||
649 | for (i = 0; i < npages; i += c) { | ||
815 | c = min(SG_MAX_SINGLE_ALLOC, npages); | 650 | c = min(SG_MAX_SINGLE_ALLOC, npages); |
816 | next = start + (c << PAGE_SHIFT); | 651 | args.end = start + (c << PAGE_SHIFT); |
817 | ret = migrate_vma(&nouveau_dmem_migrate_ops, vma, start, | 652 | ret = migrate_vma_setup(&args); |
818 | next, src_pfns, dst_pfns, &migrate); | ||
819 | if (ret) | 653 | if (ret) |
820 | goto out; | 654 | goto out_free_dma; |
821 | start = next; | 655 | |
656 | if (args.cpages) | ||
657 | nouveau_dmem_migrate_chunk(drm, &args, dma_addrs); | ||
658 | args.start = args.end; | ||
822 | } | 659 | } |
823 | 660 | ||
661 | ret = 0; | ||
662 | out_free_dma: | ||
663 | kfree(dma_addrs); | ||
664 | out_free_dst: | ||
665 | kfree(args.dst); | ||
666 | out_free_src: | ||
667 | kfree(args.src); | ||
824 | out: | 668 | out: |
825 | kfree(dst_pfns); | ||
826 | kfree(src_pfns); | ||
827 | return ret; | 669 | return ret; |
828 | } | 670 | } |
829 | 671 | ||
@@ -841,11 +683,10 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm, | |||
841 | 683 | ||
842 | npages = (range->end - range->start) >> PAGE_SHIFT; | 684 | npages = (range->end - range->start) >> PAGE_SHIFT; |
843 | for (i = 0; i < npages; ++i) { | 685 | for (i = 0; i < npages; ++i) { |
844 | struct nouveau_dmem_chunk *chunk; | ||
845 | struct page *page; | 686 | struct page *page; |
846 | uint64_t addr; | 687 | uint64_t addr; |
847 | 688 | ||
848 | page = hmm_pfn_to_page(range, range->pfns[i]); | 689 | page = hmm_device_entry_to_page(range, range->pfns[i]); |
849 | if (page == NULL) | 690 | if (page == NULL) |
850 | continue; | 691 | continue; |
851 | 692 | ||
@@ -859,10 +700,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm, | |||
859 | continue; | 700 | continue; |
860 | } | 701 | } |
861 | 702 | ||
862 | chunk = page->zone_device_data; | 703 | addr = nouveau_dmem_page_addr(page); |
863 | addr = page_to_pfn(page) - chunk->pfn_first; | ||
864 | addr = (addr + chunk->bo->bo.mem.start) << PAGE_SHIFT; | ||
865 | |||
866 | range->pfns[i] &= ((1UL << range->pfn_shift) - 1); | 704 | range->pfns[i] &= ((1UL << range->pfn_shift) - 1); |
867 | range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift; | 705 | range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift; |
868 | } | 706 | } |
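The nouveau_dmem.c hunks above replace the callback-driven migrate_vma() (the removed alloc_and_copy/finalize_and_map ops) with the caller-driven migrate_vma_setup()/migrate_vma_pages()/migrate_vma_finalize() sequence. Below is a minimal sketch of that calling pattern; the helper name and the placeholder copy step are hypothetical, only the migrate_vma fields and calls are taken from the diff.

    #include <linux/migrate.h>
    #include <linux/mm.h>

    /* Hypothetical helper showing the three-step flow used above. */
    static int demo_migrate_chunk(struct vm_area_struct *vma,
                                  unsigned long start, unsigned long end,
                                  unsigned long *src, unsigned long *dst)
    {
            struct migrate_vma args = {
                    .vma   = vma,
                    .start = start,
                    .end   = end,
                    .src   = src,   /* caller-allocated, one slot per page */
                    .dst   = dst,
            };
            int ret;

            /* Isolate and collect the source pages; fills args.src[]. */
            ret = migrate_vma_setup(&args);
            if (ret)
                    return ret;

            if (args.cpages) {
                    /*
                     * The driver allocates destination pages, copies the
                     * data and fills args.dst[] here, as
                     * nouveau_dmem_migrate_chunk() does above.
                     */
                    migrate_vma_pages(&args);
            }

            /* Restore or remap the CPU page tables and drop references. */
            migrate_vma_finalize(&args);
            return 0;
    }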
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.h b/drivers/gpu/drm/nouveau/nouveau_dmem.h index 9d97d756fb7d..92394be5d649 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.h +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.h | |||
@@ -45,16 +45,5 @@ static inline void nouveau_dmem_init(struct nouveau_drm *drm) {} | |||
45 | static inline void nouveau_dmem_fini(struct nouveau_drm *drm) {} | 45 | static inline void nouveau_dmem_fini(struct nouveau_drm *drm) {} |
46 | static inline void nouveau_dmem_suspend(struct nouveau_drm *drm) {} | 46 | static inline void nouveau_dmem_suspend(struct nouveau_drm *drm) {} |
47 | static inline void nouveau_dmem_resume(struct nouveau_drm *drm) {} | 47 | static inline void nouveau_dmem_resume(struct nouveau_drm *drm) {} |
48 | |||
49 | static inline int nouveau_dmem_migrate_vma(struct nouveau_drm *drm, | ||
50 | struct vm_area_struct *vma, | ||
51 | unsigned long start, | ||
52 | unsigned long end) | ||
53 | { | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | static inline void nouveau_dmem_convert_pfn(struct nouveau_drm *drm, | ||
58 | struct hmm_range *range) {} | ||
59 | #endif /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */ | 48 | #endif /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */ |
60 | #endif | 49 | #endif |
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index bdc948352467..2cd83849600f 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/pci.h> | 28 | #include <linux/pci.h> |
29 | #include <linux/pm_runtime.h> | 29 | #include <linux/pm_runtime.h> |
30 | #include <linux/vga_switcheroo.h> | 30 | #include <linux/vga_switcheroo.h> |
31 | #include <linux/mmu_notifier.h> | ||
31 | 32 | ||
32 | #include <drm/drm_crtc_helper.h> | 33 | #include <drm/drm_crtc_helper.h> |
33 | #include <drm/drm_ioctl.h> | 34 | #include <drm/drm_ioctl.h> |
@@ -1290,6 +1291,8 @@ nouveau_drm_exit(void) | |||
1290 | #ifdef CONFIG_NOUVEAU_PLATFORM_DRIVER | 1291 | #ifdef CONFIG_NOUVEAU_PLATFORM_DRIVER |
1291 | platform_driver_unregister(&nouveau_platform_driver); | 1292 | platform_driver_unregister(&nouveau_platform_driver); |
1292 | #endif | 1293 | #endif |
1294 | if (IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM)) | ||
1295 | mmu_notifier_synchronize(); | ||
1293 | } | 1296 | } |
1294 | 1297 | ||
1295 | module_init(nouveau_drm_init); | 1298 | module_init(nouveau_drm_init); |
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index a835cebb6d90..668d4bd0c118 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c | |||
@@ -252,13 +252,13 @@ nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit) | |||
252 | 252 | ||
253 | static int | 253 | static int |
254 | nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror, | 254 | nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror, |
255 | const struct hmm_update *update) | 255 | const struct mmu_notifier_range *update) |
256 | { | 256 | { |
257 | struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror); | 257 | struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror); |
258 | unsigned long start = update->start; | 258 | unsigned long start = update->start; |
259 | unsigned long limit = update->end; | 259 | unsigned long limit = update->end; |
260 | 260 | ||
261 | if (!update->blockable) | 261 | if (!mmu_notifier_range_blockable(update)) |
262 | return -EAGAIN; | 262 | return -EAGAIN; |
263 | 263 | ||
264 | SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit); | 264 | SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit); |
@@ -485,31 +485,29 @@ nouveau_range_done(struct hmm_range *range) | |||
485 | } | 485 | } |
486 | 486 | ||
487 | static int | 487 | static int |
488 | nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) | 488 | nouveau_range_fault(struct nouveau_svmm *svmm, struct hmm_range *range) |
489 | { | 489 | { |
490 | long ret; | 490 | long ret; |
491 | 491 | ||
492 | range->default_flags = 0; | 492 | range->default_flags = 0; |
493 | range->pfn_flags_mask = -1UL; | 493 | range->pfn_flags_mask = -1UL; |
494 | 494 | ||
495 | ret = hmm_range_register(range, mirror, | 495 | ret = hmm_range_register(range, &svmm->mirror); |
496 | range->start, range->end, | ||
497 | PAGE_SHIFT); | ||
498 | if (ret) { | 496 | if (ret) { |
499 | up_read(&range->vma->vm_mm->mmap_sem); | 497 | up_read(&svmm->mm->mmap_sem); |
500 | return (int)ret; | 498 | return (int)ret; |
501 | } | 499 | } |
502 | 500 | ||
503 | if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { | 501 | if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { |
504 | up_read(&range->vma->vm_mm->mmap_sem); | 502 | up_read(&svmm->mm->mmap_sem); |
505 | return -EAGAIN; | 503 | return -EBUSY; |
506 | } | 504 | } |
507 | 505 | ||
508 | ret = hmm_range_fault(range, true); | 506 | ret = hmm_range_fault(range, 0); |
509 | if (ret <= 0) { | 507 | if (ret <= 0) { |
510 | if (ret == 0) | 508 | if (ret == 0) |
511 | ret = -EBUSY; | 509 | ret = -EBUSY; |
512 | up_read(&range->vma->vm_mm->mmap_sem); | 510 | up_read(&svmm->mm->mmap_sem); |
513 | hmm_range_unregister(range); | 511 | hmm_range_unregister(range); |
514 | return ret; | 512 | return ret; |
515 | } | 513 | } |
@@ -682,7 +680,6 @@ nouveau_svm_fault(struct nvif_notify *notify) | |||
682 | args.i.p.addr + args.i.p.size, fn - fi); | 680 | args.i.p.addr + args.i.p.size, fn - fi); |
683 | 681 | ||
684 | /* Have HMM fault pages within the fault window to the GPU. */ | 682 | /* Have HMM fault pages within the fault window to the GPU. */ |
685 | range.vma = vma; | ||
686 | range.start = args.i.p.addr; | 683 | range.start = args.i.p.addr; |
687 | range.end = args.i.p.addr + args.i.p.size; | 684 | range.end = args.i.p.addr + args.i.p.size; |
688 | range.pfns = args.phys; | 685 | range.pfns = args.phys; |
@@ -690,7 +687,7 @@ nouveau_svm_fault(struct nvif_notify *notify) | |||
690 | range.values = nouveau_svm_pfn_values; | 687 | range.values = nouveau_svm_pfn_values; |
691 | range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; | 688 | range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; |
692 | again: | 689 | again: |
693 | ret = nouveau_range_fault(&svmm->mirror, &range); | 690 | ret = nouveau_range_fault(svmm, &range); |
694 | if (ret == 0) { | 691 | if (ret == 0) { |
695 | mutex_lock(&svmm->mutex); | 692 | mutex_lock(&svmm->mutex); |
696 | if (!nouveau_range_done(&range)) { | 693 | if (!nouveau_range_done(&range)) { |
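In nouveau_svm.c the fault path now passes the svmm instead of a per-range vma, registers with the two-argument hmm_range_register(), calls hmm_range_fault(range, 0), and reports -EBUSY when the range is not yet valid so the caller retries. A rough sketch of that register/wait/fault shape follows, assuming mmap_sem is only required around the fault itself; the helper and its mm argument are hypothetical.

    #include <linux/hmm.h>
    #include <linux/mm.h>

    /* Hypothetical caller showing the register/wait/fault shape used above. */
    static int demo_range_fault(struct hmm_mirror *mirror, struct mm_struct *mm,
                                struct hmm_range *range)
    {
            long ret;

            range->default_flags = 0;
            range->pfn_flags_mask = -1UL;

            ret = hmm_range_register(range, mirror);
            if (ret)
                    return (int)ret;

            /* The range is only usable once no invalidation is in flight. */
            if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
                    hmm_range_unregister(range);
                    return -EBUSY;          /* caller retries, as in the diff */
            }

            down_read(&mm->mmap_sem);
            ret = hmm_range_fault(range, 0);
            up_read(&mm->mmap_sem);
            hmm_range_unregister(range);

            if (ret <= 0)
                    return ret ? (int)ret : -EBUSY;
            return 0;
    }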
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 05b88491ccb9..d59b004f6695 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h | |||
@@ -2449,9 +2449,6 @@ struct radeon_device { | |||
2449 | /* tracking pinned memory */ | 2449 | /* tracking pinned memory */ |
2450 | u64 vram_pin_size; | 2450 | u64 vram_pin_size; |
2451 | u64 gart_pin_size; | 2451 | u64 gart_pin_size; |
2452 | |||
2453 | struct mutex mn_lock; | ||
2454 | DECLARE_HASHTABLE(mn_hash, 7); | ||
2455 | }; | 2452 | }; |
2456 | 2453 | ||
2457 | bool radeon_is_px(struct drm_device *dev); | 2454 | bool radeon_is_px(struct drm_device *dev); |
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 88eb7cb522bb..5d017f0aec66 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c | |||
@@ -1325,8 +1325,6 @@ int radeon_device_init(struct radeon_device *rdev, | |||
1325 | init_rwsem(&rdev->pm.mclk_lock); | 1325 | init_rwsem(&rdev->pm.mclk_lock); |
1326 | init_rwsem(&rdev->exclusive_lock); | 1326 | init_rwsem(&rdev->exclusive_lock); |
1327 | init_waitqueue_head(&rdev->irq.vblank_queue); | 1327 | init_waitqueue_head(&rdev->irq.vblank_queue); |
1328 | mutex_init(&rdev->mn_lock); | ||
1329 | hash_init(rdev->mn_hash); | ||
1330 | r = radeon_gem_init(rdev); | 1328 | r = radeon_gem_init(rdev); |
1331 | if (r) | 1329 | if (r) |
1332 | return r; | 1330 | return r; |
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index 5838162f687f..431e6b64b77d 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/module.h> | 35 | #include <linux/module.h> |
36 | #include <linux/pm_runtime.h> | 36 | #include <linux/pm_runtime.h> |
37 | #include <linux/vga_switcheroo.h> | 37 | #include <linux/vga_switcheroo.h> |
38 | #include <linux/mmu_notifier.h> | ||
38 | 39 | ||
39 | #include <drm/drm_crtc_helper.h> | 40 | #include <drm/drm_crtc_helper.h> |
40 | #include <drm/drm_drv.h> | 41 | #include <drm/drm_drv.h> |
@@ -623,6 +624,7 @@ static void __exit radeon_exit(void) | |||
623 | { | 624 | { |
624 | pci_unregister_driver(pdriver); | 625 | pci_unregister_driver(pdriver); |
625 | radeon_unregister_atpx_handler(); | 626 | radeon_unregister_atpx_handler(); |
627 | mmu_notifier_synchronize(); | ||
626 | } | 628 | } |
627 | 629 | ||
628 | module_init(radeon_init); | 630 | module_init(radeon_init); |
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index 6902f998ede9..dbab9a3a969b 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c | |||
@@ -37,17 +37,8 @@ | |||
37 | #include "radeon.h" | 37 | #include "radeon.h" |
38 | 38 | ||
39 | struct radeon_mn { | 39 | struct radeon_mn { |
40 | /* constant after initialisation */ | ||
41 | struct radeon_device *rdev; | ||
42 | struct mm_struct *mm; | ||
43 | struct mmu_notifier mn; | 40 | struct mmu_notifier mn; |
44 | 41 | ||
45 | /* only used on destruction */ | ||
46 | struct work_struct work; | ||
47 | |||
48 | /* protected by rdev->mn_lock */ | ||
49 | struct hlist_node node; | ||
50 | |||
51 | /* objects protected by lock */ | 42 | /* objects protected by lock */ |
52 | struct mutex lock; | 43 | struct mutex lock; |
53 | struct rb_root_cached objects; | 44 | struct rb_root_cached objects; |
@@ -59,55 +50,6 @@ struct radeon_mn_node { | |||
59 | }; | 50 | }; |
60 | 51 | ||
61 | /** | 52 | /** |
62 | * radeon_mn_destroy - destroy the rmn | ||
63 | * | ||
64 | * @work: previously sheduled work item | ||
65 | * | ||
66 | * Lazy destroys the notifier from a work item | ||
67 | */ | ||
68 | static void radeon_mn_destroy(struct work_struct *work) | ||
69 | { | ||
70 | struct radeon_mn *rmn = container_of(work, struct radeon_mn, work); | ||
71 | struct radeon_device *rdev = rmn->rdev; | ||
72 | struct radeon_mn_node *node, *next_node; | ||
73 | struct radeon_bo *bo, *next_bo; | ||
74 | |||
75 | mutex_lock(&rdev->mn_lock); | ||
76 | mutex_lock(&rmn->lock); | ||
77 | hash_del(&rmn->node); | ||
78 | rbtree_postorder_for_each_entry_safe(node, next_node, | ||
79 | &rmn->objects.rb_root, it.rb) { | ||
80 | |||
81 | interval_tree_remove(&node->it, &rmn->objects); | ||
82 | list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) { | ||
83 | bo->mn = NULL; | ||
84 | list_del_init(&bo->mn_list); | ||
85 | } | ||
86 | kfree(node); | ||
87 | } | ||
88 | mutex_unlock(&rmn->lock); | ||
89 | mutex_unlock(&rdev->mn_lock); | ||
90 | mmu_notifier_unregister(&rmn->mn, rmn->mm); | ||
91 | kfree(rmn); | ||
92 | } | ||
93 | |||
94 | /** | ||
95 | * radeon_mn_release - callback to notify about mm destruction | ||
96 | * | ||
97 | * @mn: our notifier | ||
98 | * @mn: the mm this callback is about | ||
99 | * | ||
100 | * Shedule a work item to lazy destroy our notifier. | ||
101 | */ | ||
102 | static void radeon_mn_release(struct mmu_notifier *mn, | ||
103 | struct mm_struct *mm) | ||
104 | { | ||
105 | struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn); | ||
106 | INIT_WORK(&rmn->work, radeon_mn_destroy); | ||
107 | schedule_work(&rmn->work); | ||
108 | } | ||
109 | |||
110 | /** | ||
111 | * radeon_mn_invalidate_range_start - callback to notify about mm change | 53 | * radeon_mn_invalidate_range_start - callback to notify about mm change |
112 | * | 54 | * |
113 | * @mn: our notifier | 55 | * @mn: our notifier |
@@ -183,65 +125,44 @@ out_unlock: | |||
183 | return ret; | 125 | return ret; |
184 | } | 126 | } |
185 | 127 | ||
186 | static const struct mmu_notifier_ops radeon_mn_ops = { | 128 | static void radeon_mn_release(struct mmu_notifier *mn, struct mm_struct *mm) |
187 | .release = radeon_mn_release, | 129 | { |
188 | .invalidate_range_start = radeon_mn_invalidate_range_start, | 130 | struct mmu_notifier_range range = { |
189 | }; | 131 | .mm = mm, |
132 | .start = 0, | ||
133 | .end = ULONG_MAX, | ||
134 | .flags = 0, | ||
135 | .event = MMU_NOTIFY_UNMAP, | ||
136 | }; | ||
137 | |||
138 | radeon_mn_invalidate_range_start(mn, &range); | ||
139 | } | ||
190 | 140 | ||
191 | /** | 141 | static struct mmu_notifier *radeon_mn_alloc_notifier(struct mm_struct *mm) |
192 | * radeon_mn_get - create notifier context | ||
193 | * | ||
194 | * @rdev: radeon device pointer | ||
195 | * | ||
196 | * Creates a notifier context for current->mm. | ||
197 | */ | ||
198 | static struct radeon_mn *radeon_mn_get(struct radeon_device *rdev) | ||
199 | { | 142 | { |
200 | struct mm_struct *mm = current->mm; | ||
201 | struct radeon_mn *rmn; | 143 | struct radeon_mn *rmn; |
202 | int r; | ||
203 | |||
204 | if (down_write_killable(&mm->mmap_sem)) | ||
205 | return ERR_PTR(-EINTR); | ||
206 | |||
207 | mutex_lock(&rdev->mn_lock); | ||
208 | |||
209 | hash_for_each_possible(rdev->mn_hash, rmn, node, (unsigned long)mm) | ||
210 | if (rmn->mm == mm) | ||
211 | goto release_locks; | ||
212 | 144 | ||
213 | rmn = kzalloc(sizeof(*rmn), GFP_KERNEL); | 145 | rmn = kzalloc(sizeof(*rmn), GFP_KERNEL); |
214 | if (!rmn) { | 146 | if (!rmn) |
215 | rmn = ERR_PTR(-ENOMEM); | 147 | return ERR_PTR(-ENOMEM); |
216 | goto release_locks; | ||
217 | } | ||
218 | 148 | ||
219 | rmn->rdev = rdev; | ||
220 | rmn->mm = mm; | ||
221 | rmn->mn.ops = &radeon_mn_ops; | ||
222 | mutex_init(&rmn->lock); | 149 | mutex_init(&rmn->lock); |
223 | rmn->objects = RB_ROOT_CACHED; | 150 | rmn->objects = RB_ROOT_CACHED; |
224 | 151 | return &rmn->mn; | |
225 | r = __mmu_notifier_register(&rmn->mn, mm); | 152 | } |
226 | if (r) | ||
227 | goto free_rmn; | ||
228 | |||
229 | hash_add(rdev->mn_hash, &rmn->node, (unsigned long)mm); | ||
230 | |||
231 | release_locks: | ||
232 | mutex_unlock(&rdev->mn_lock); | ||
233 | up_write(&mm->mmap_sem); | ||
234 | |||
235 | return rmn; | ||
236 | |||
237 | free_rmn: | ||
238 | mutex_unlock(&rdev->mn_lock); | ||
239 | up_write(&mm->mmap_sem); | ||
240 | kfree(rmn); | ||
241 | 153 | ||
242 | return ERR_PTR(r); | 154 | static void radeon_mn_free_notifier(struct mmu_notifier *mn) |
155 | { | ||
156 | kfree(container_of(mn, struct radeon_mn, mn)); | ||
243 | } | 157 | } |
244 | 158 | ||
159 | static const struct mmu_notifier_ops radeon_mn_ops = { | ||
160 | .release = radeon_mn_release, | ||
161 | .invalidate_range_start = radeon_mn_invalidate_range_start, | ||
162 | .alloc_notifier = radeon_mn_alloc_notifier, | ||
163 | .free_notifier = radeon_mn_free_notifier, | ||
164 | }; | ||
165 | |||
245 | /** | 166 | /** |
246 | * radeon_mn_register - register a BO for notifier updates | 167 | * radeon_mn_register - register a BO for notifier updates |
247 | * | 168 | * |
@@ -254,15 +175,16 @@ free_rmn: | |||
254 | int radeon_mn_register(struct radeon_bo *bo, unsigned long addr) | 175 | int radeon_mn_register(struct radeon_bo *bo, unsigned long addr) |
255 | { | 176 | { |
256 | unsigned long end = addr + radeon_bo_size(bo) - 1; | 177 | unsigned long end = addr + radeon_bo_size(bo) - 1; |
257 | struct radeon_device *rdev = bo->rdev; | 178 | struct mmu_notifier *mn; |
258 | struct radeon_mn *rmn; | 179 | struct radeon_mn *rmn; |
259 | struct radeon_mn_node *node = NULL; | 180 | struct radeon_mn_node *node = NULL; |
260 | struct list_head bos; | 181 | struct list_head bos; |
261 | struct interval_tree_node *it; | 182 | struct interval_tree_node *it; |
262 | 183 | ||
263 | rmn = radeon_mn_get(rdev); | 184 | mn = mmu_notifier_get(&radeon_mn_ops, current->mm); |
264 | if (IS_ERR(rmn)) | 185 | if (IS_ERR(mn)) |
265 | return PTR_ERR(rmn); | 186 | return PTR_ERR(mn); |
187 | rmn = container_of(mn, struct radeon_mn, mn); | ||
266 | 188 | ||
267 | INIT_LIST_HEAD(&bos); | 189 | INIT_LIST_HEAD(&bos); |
268 | 190 | ||
@@ -309,22 +231,16 @@ int radeon_mn_register(struct radeon_bo *bo, unsigned long addr) | |||
309 | */ | 231 | */ |
310 | void radeon_mn_unregister(struct radeon_bo *bo) | 232 | void radeon_mn_unregister(struct radeon_bo *bo) |
311 | { | 233 | { |
312 | struct radeon_device *rdev = bo->rdev; | 234 | struct radeon_mn *rmn = bo->mn; |
313 | struct radeon_mn *rmn; | ||
314 | struct list_head *head; | 235 | struct list_head *head; |
315 | 236 | ||
316 | mutex_lock(&rdev->mn_lock); | 237 | if (!rmn) |
317 | rmn = bo->mn; | ||
318 | if (rmn == NULL) { | ||
319 | mutex_unlock(&rdev->mn_lock); | ||
320 | return; | 238 | return; |
321 | } | ||
322 | 239 | ||
323 | mutex_lock(&rmn->lock); | 240 | mutex_lock(&rmn->lock); |
324 | /* save the next list entry for later */ | 241 | /* save the next list entry for later */ |
325 | head = bo->mn_list.next; | 242 | head = bo->mn_list.next; |
326 | 243 | ||
327 | bo->mn = NULL; | ||
328 | list_del(&bo->mn_list); | 244 | list_del(&bo->mn_list); |
329 | 245 | ||
330 | if (list_empty(head)) { | 246 | if (list_empty(head)) { |
@@ -335,5 +251,7 @@ void radeon_mn_unregister(struct radeon_bo *bo) | |||
335 | } | 251 | } |
336 | 252 | ||
337 | mutex_unlock(&rmn->lock); | 253 | mutex_unlock(&rmn->lock); |
338 | mutex_unlock(&rdev->mn_lock); | 254 | |
255 | mmu_notifier_put(&rmn->mn); | ||
256 | bo->mn = NULL; | ||
339 | } | 257 | } |
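radeon_mn.c above is converted to the refcounted attach idiom: the per-device mm hash and the lazy-destroy work item go away, the driver supplies .alloc_notifier/.free_notifier, and mmu_notifier_get()/mmu_notifier_put() share one notifier per (ops, mm). The module exit additionally calls mmu_notifier_synchronize() so pending free_notifier() callbacks finish before unload. A stripped-down sketch of the idiom, with the demo struct and helpers hypothetical:

    #include <linux/mmu_notifier.h>
    #include <linux/slab.h>
    #include <linux/err.h>
    #include <linux/sched.h>

    struct demo_mn {
            struct mmu_notifier mn; /* container_of() maps back to demo_mn */
            /* driver state, protected by the driver's own locks */
    };

    static struct mmu_notifier *demo_alloc_notifier(struct mm_struct *mm)
    {
            struct demo_mn *dmn = kzalloc(sizeof(*dmn), GFP_KERNEL);

            if (!dmn)
                    return ERR_PTR(-ENOMEM);
            return &dmn->mn;
    }

    static void demo_free_notifier(struct mmu_notifier *mn)
    {
            /* runs via SRCU once the last reference has been dropped */
            kfree(container_of(mn, struct demo_mn, mn));
    }

    static const struct mmu_notifier_ops demo_mn_ops = {
            .alloc_notifier = demo_alloc_notifier,
            .free_notifier  = demo_free_notifier,
    };

    static struct demo_mn *demo_mn_get(void)
    {
            /* one notifier per (ops, mm); repeated calls return the same one */
            struct mmu_notifier *mn = mmu_notifier_get(&demo_mn_ops, current->mm);

            if (IS_ERR(mn))
                    return ERR_CAST(mn);
            return container_of(mn, struct demo_mn, mn);
    }

    static void demo_mn_put(struct demo_mn *dmn)
    {
            mmu_notifier_put(&dmn->mn);
    }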
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 85e103b147cc..b44b1c322ec8 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig | |||
@@ -55,6 +55,7 @@ config INFINIBAND_ON_DEMAND_PAGING | |||
55 | bool "InfiniBand on-demand paging support" | 55 | bool "InfiniBand on-demand paging support" |
56 | depends on INFINIBAND_USER_MEM | 56 | depends on INFINIBAND_USER_MEM |
57 | select MMU_NOTIFIER | 57 | select MMU_NOTIFIER |
58 | select INTERVAL_TREE | ||
58 | default y | 59 | default y |
59 | ---help--- | 60 | ---help--- |
60 | On demand paging support for the InfiniBand subsystem. | 61 | On demand paging support for the InfiniBand subsystem. |
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index ea8661a00651..b5631b8a0397 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c | |||
@@ -2562,6 +2562,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) | |||
2562 | SET_DEVICE_OP(dev_ops, get_vf_config); | 2562 | SET_DEVICE_OP(dev_ops, get_vf_config); |
2563 | SET_DEVICE_OP(dev_ops, get_vf_stats); | 2563 | SET_DEVICE_OP(dev_ops, get_vf_stats); |
2564 | SET_DEVICE_OP(dev_ops, init_port); | 2564 | SET_DEVICE_OP(dev_ops, init_port); |
2565 | SET_DEVICE_OP(dev_ops, invalidate_range); | ||
2565 | SET_DEVICE_OP(dev_ops, iw_accept); | 2566 | SET_DEVICE_OP(dev_ops, iw_accept); |
2566 | SET_DEVICE_OP(dev_ops, iw_add_ref); | 2567 | SET_DEVICE_OP(dev_ops, iw_add_ref); |
2567 | SET_DEVICE_OP(dev_ops, iw_connect); | 2568 | SET_DEVICE_OP(dev_ops, iw_connect); |
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 56553668256f..41f9e268e3fb 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c | |||
@@ -184,9 +184,6 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz); | |||
184 | /** | 184 | /** |
185 | * ib_umem_get - Pin and DMA map userspace memory. | 185 | * ib_umem_get - Pin and DMA map userspace memory. |
186 | * | 186 | * |
187 | * If access flags indicate ODP memory, avoid pinning. Instead, stores | ||
188 | * the mm for future page fault handling in conjunction with MMU notifiers. | ||
189 | * | ||
190 | * @udata: userspace context to pin memory for | 187 | * @udata: userspace context to pin memory for |
191 | * @addr: userspace virtual address to start at | 188 | * @addr: userspace virtual address to start at |
192 | * @size: length of region to pin | 189 | * @size: length of region to pin |
@@ -231,36 +228,19 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, | |||
231 | if (!can_do_mlock()) | 228 | if (!can_do_mlock()) |
232 | return ERR_PTR(-EPERM); | 229 | return ERR_PTR(-EPERM); |
233 | 230 | ||
234 | if (access & IB_ACCESS_ON_DEMAND) { | 231 | if (access & IB_ACCESS_ON_DEMAND) |
235 | umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); | 232 | return ERR_PTR(-EOPNOTSUPP); |
236 | if (!umem) | ||
237 | return ERR_PTR(-ENOMEM); | ||
238 | umem->is_odp = 1; | ||
239 | } else { | ||
240 | umem = kzalloc(sizeof(*umem), GFP_KERNEL); | ||
241 | if (!umem) | ||
242 | return ERR_PTR(-ENOMEM); | ||
243 | } | ||
244 | 233 | ||
245 | umem->context = context; | 234 | umem = kzalloc(sizeof(*umem), GFP_KERNEL); |
235 | if (!umem) | ||
236 | return ERR_PTR(-ENOMEM); | ||
237 | umem->ibdev = context->device; | ||
246 | umem->length = size; | 238 | umem->length = size; |
247 | umem->address = addr; | 239 | umem->address = addr; |
248 | umem->writable = ib_access_writable(access); | 240 | umem->writable = ib_access_writable(access); |
249 | umem->owning_mm = mm = current->mm; | 241 | umem->owning_mm = mm = current->mm; |
250 | mmgrab(mm); | 242 | mmgrab(mm); |
251 | 243 | ||
252 | if (access & IB_ACCESS_ON_DEMAND) { | ||
253 | if (WARN_ON_ONCE(!context->invalidate_range)) { | ||
254 | ret = -EINVAL; | ||
255 | goto umem_kfree; | ||
256 | } | ||
257 | |||
258 | ret = ib_umem_odp_get(to_ib_umem_odp(umem), access); | ||
259 | if (ret) | ||
260 | goto umem_kfree; | ||
261 | return umem; | ||
262 | } | ||
263 | |||
264 | page_list = (struct page **) __get_free_page(GFP_KERNEL); | 244 | page_list = (struct page **) __get_free_page(GFP_KERNEL); |
265 | if (!page_list) { | 245 | if (!page_list) { |
266 | ret = -ENOMEM; | 246 | ret = -ENOMEM; |
@@ -346,15 +326,6 @@ umem_kfree: | |||
346 | } | 326 | } |
347 | EXPORT_SYMBOL(ib_umem_get); | 327 | EXPORT_SYMBOL(ib_umem_get); |
348 | 328 | ||
349 | static void __ib_umem_release_tail(struct ib_umem *umem) | ||
350 | { | ||
351 | mmdrop(umem->owning_mm); | ||
352 | if (umem->is_odp) | ||
353 | kfree(to_ib_umem_odp(umem)); | ||
354 | else | ||
355 | kfree(umem); | ||
356 | } | ||
357 | |||
358 | /** | 329 | /** |
359 | * ib_umem_release - release memory pinned with ib_umem_get | 330 | * ib_umem_release - release memory pinned with ib_umem_get |
360 | * @umem: umem struct to release | 331 | * @umem: umem struct to release |
@@ -363,17 +334,14 @@ void ib_umem_release(struct ib_umem *umem) | |||
363 | { | 334 | { |
364 | if (!umem) | 335 | if (!umem) |
365 | return; | 336 | return; |
337 | if (umem->is_odp) | ||
338 | return ib_umem_odp_release(to_ib_umem_odp(umem)); | ||
366 | 339 | ||
367 | if (umem->is_odp) { | 340 | __ib_umem_release(umem->ibdev, umem, 1); |
368 | ib_umem_odp_release(to_ib_umem_odp(umem)); | ||
369 | __ib_umem_release_tail(umem); | ||
370 | return; | ||
371 | } | ||
372 | |||
373 | __ib_umem_release(umem->context->device, umem, 1); | ||
374 | 341 | ||
375 | atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); | 342 | atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); |
376 | __ib_umem_release_tail(umem); | 343 | mmdrop(umem->owning_mm); |
344 | kfree(umem); | ||
377 | } | 345 | } |
378 | EXPORT_SYMBOL(ib_umem_release); | 346 | EXPORT_SYMBOL(ib_umem_release); |
379 | 347 | ||
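After the umem.c change, ib_umem_get() rejects IB_ACCESS_ON_DEMAND with -EOPNOTSUPP and drivers are expected to call ib_umem_odp_get() directly (mlx5 is updated accordingly later in the series). A hypothetical driver-side snippet, assuming only the ib_umem_odp_get() signature shown in the umem_odp.c hunks below:

    #include <linux/err.h>
    #include <rdma/ib_verbs.h>
    #include <rdma/ib_umem_odp.h>

    /* Hypothetical MR-registration helper: ODP umems are now created directly. */
    static struct ib_umem_odp *demo_get_odp_umem(struct ib_udata *udata,
                                                 unsigned long addr, size_t size,
                                                 int access)
    {
            if (!(access & IB_ACCESS_ON_DEMAND))
                    return ERR_PTR(-EINVAL);        /* pinned path: ib_umem_get() */

            /* The device must provide ops.invalidate_range, checked inside. */
            return ib_umem_odp_get(udata, addr, size, access);
    }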
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index c0e15db34680..9aebe9ce8b07 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c | |||
@@ -39,44 +39,14 @@ | |||
39 | #include <linux/export.h> | 39 | #include <linux/export.h> |
40 | #include <linux/vmalloc.h> | 40 | #include <linux/vmalloc.h> |
41 | #include <linux/hugetlb.h> | 41 | #include <linux/hugetlb.h> |
42 | #include <linux/interval_tree_generic.h> | 42 | #include <linux/interval_tree.h> |
43 | #include <linux/pagemap.h> | 43 | #include <linux/pagemap.h> |
44 | 44 | ||
45 | #include <rdma/ib_verbs.h> | 45 | #include <rdma/ib_verbs.h> |
46 | #include <rdma/ib_umem.h> | 46 | #include <rdma/ib_umem.h> |
47 | #include <rdma/ib_umem_odp.h> | 47 | #include <rdma/ib_umem_odp.h> |
48 | 48 | ||
49 | /* | 49 | #include "uverbs.h" |
50 | * The ib_umem list keeps track of memory regions for which the HW | ||
51 | * device request to receive notification when the related memory | ||
52 | * mapping is changed. | ||
53 | * | ||
54 | * ib_umem_lock protects the list. | ||
55 | */ | ||
56 | |||
57 | static u64 node_start(struct umem_odp_node *n) | ||
58 | { | ||
59 | struct ib_umem_odp *umem_odp = | ||
60 | container_of(n, struct ib_umem_odp, interval_tree); | ||
61 | |||
62 | return ib_umem_start(umem_odp); | ||
63 | } | ||
64 | |||
65 | /* Note that the representation of the intervals in the interval tree | ||
66 | * considers the ending point as contained in the interval, while the | ||
67 | * function ib_umem_end returns the first address which is not contained | ||
68 | * in the umem. | ||
69 | */ | ||
70 | static u64 node_last(struct umem_odp_node *n) | ||
71 | { | ||
72 | struct ib_umem_odp *umem_odp = | ||
73 | container_of(n, struct ib_umem_odp, interval_tree); | ||
74 | |||
75 | return ib_umem_end(umem_odp) - 1; | ||
76 | } | ||
77 | |||
78 | INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, | ||
79 | node_start, node_last, static, rbt_ib_umem) | ||
80 | 50 | ||
81 | static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) | 51 | static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) |
82 | { | 52 | { |
@@ -104,31 +74,34 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) | |||
104 | mutex_unlock(&umem_odp->umem_mutex); | 74 | mutex_unlock(&umem_odp->umem_mutex); |
105 | } | 75 | } |
106 | 76 | ||
107 | static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, | ||
108 | u64 start, u64 end, void *cookie) | ||
109 | { | ||
110 | /* | ||
111 | * Increase the number of notifiers running, to | ||
112 | * prevent any further fault handling on this MR. | ||
113 | */ | ||
114 | ib_umem_notifier_start_account(umem_odp); | ||
115 | complete_all(&umem_odp->notifier_completion); | ||
116 | umem_odp->umem.context->invalidate_range( | ||
117 | umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); | ||
118 | return 0; | ||
119 | } | ||
120 | |||
121 | static void ib_umem_notifier_release(struct mmu_notifier *mn, | 77 | static void ib_umem_notifier_release(struct mmu_notifier *mn, |
122 | struct mm_struct *mm) | 78 | struct mm_struct *mm) |
123 | { | 79 | { |
124 | struct ib_ucontext_per_mm *per_mm = | 80 | struct ib_ucontext_per_mm *per_mm = |
125 | container_of(mn, struct ib_ucontext_per_mm, mn); | 81 | container_of(mn, struct ib_ucontext_per_mm, mn); |
82 | struct rb_node *node; | ||
126 | 83 | ||
127 | down_read(&per_mm->umem_rwsem); | 84 | down_read(&per_mm->umem_rwsem); |
128 | if (per_mm->active) | 85 | if (!per_mm->mn.users) |
129 | rbt_ib_umem_for_each_in_range( | 86 | goto out; |
130 | &per_mm->umem_tree, 0, ULLONG_MAX, | 87 | |
131 | ib_umem_notifier_release_trampoline, true, NULL); | 88 | for (node = rb_first_cached(&per_mm->umem_tree); node; |
89 | node = rb_next(node)) { | ||
90 | struct ib_umem_odp *umem_odp = | ||
91 | rb_entry(node, struct ib_umem_odp, interval_tree.rb); | ||
92 | |||
93 | /* | ||
94 | * Increase the number of notifiers running, to prevent any | ||
95 | * further fault handling on this MR. | ||
96 | */ | ||
97 | ib_umem_notifier_start_account(umem_odp); | ||
98 | complete_all(&umem_odp->notifier_completion); | ||
99 | umem_odp->umem.ibdev->ops.invalidate_range( | ||
100 | umem_odp, ib_umem_start(umem_odp), | ||
101 | ib_umem_end(umem_odp)); | ||
102 | } | ||
103 | |||
104 | out: | ||
132 | up_read(&per_mm->umem_rwsem); | 105 | up_read(&per_mm->umem_rwsem); |
133 | } | 106 | } |
134 | 107 | ||
@@ -136,7 +109,7 @@ static int invalidate_range_start_trampoline(struct ib_umem_odp *item, | |||
136 | u64 start, u64 end, void *cookie) | 109 | u64 start, u64 end, void *cookie) |
137 | { | 110 | { |
138 | ib_umem_notifier_start_account(item); | 111 | ib_umem_notifier_start_account(item); |
139 | item->umem.context->invalidate_range(item, start, end); | 112 | item->umem.ibdev->ops.invalidate_range(item, start, end); |
140 | return 0; | 113 | return 0; |
141 | } | 114 | } |
142 | 115 | ||
@@ -152,10 +125,10 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, | |||
152 | else if (!down_read_trylock(&per_mm->umem_rwsem)) | 125 | else if (!down_read_trylock(&per_mm->umem_rwsem)) |
153 | return -EAGAIN; | 126 | return -EAGAIN; |
154 | 127 | ||
155 | if (!per_mm->active) { | 128 | if (!per_mm->mn.users) { |
156 | up_read(&per_mm->umem_rwsem); | 129 | up_read(&per_mm->umem_rwsem); |
157 | /* | 130 | /* |
158 | * At this point active is permanently set and visible to this | 131 | * At this point users is permanently zero and visible to this |
159 | * CPU without a lock, that fact is relied on to skip the unlock | 132 | * CPU without a lock, that fact is relied on to skip the unlock |
160 | * in range_end. | 133 | * in range_end. |
161 | */ | 134 | */ |
@@ -185,7 +158,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, | |||
185 | struct ib_ucontext_per_mm *per_mm = | 158 | struct ib_ucontext_per_mm *per_mm = |
186 | container_of(mn, struct ib_ucontext_per_mm, mn); | 159 | container_of(mn, struct ib_ucontext_per_mm, mn); |
187 | 160 | ||
188 | if (unlikely(!per_mm->active)) | 161 | if (unlikely(!per_mm->mn.users)) |
189 | return; | 162 | return; |
190 | 163 | ||
191 | rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, | 164 | rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, |
@@ -194,212 +167,250 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, | |||
194 | up_read(&per_mm->umem_rwsem); | 167 | up_read(&per_mm->umem_rwsem); |
195 | } | 168 | } |
196 | 169 | ||
197 | static const struct mmu_notifier_ops ib_umem_notifiers = { | 170 | static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm) |
198 | .release = ib_umem_notifier_release, | ||
199 | .invalidate_range_start = ib_umem_notifier_invalidate_range_start, | ||
200 | .invalidate_range_end = ib_umem_notifier_invalidate_range_end, | ||
201 | }; | ||
202 | |||
203 | static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) | ||
204 | { | ||
205 | struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; | ||
206 | |||
207 | down_write(&per_mm->umem_rwsem); | ||
208 | if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) | ||
209 | rbt_ib_umem_insert(&umem_odp->interval_tree, | ||
210 | &per_mm->umem_tree); | ||
211 | up_write(&per_mm->umem_rwsem); | ||
212 | } | ||
213 | |||
214 | static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) | ||
215 | { | ||
216 | struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; | ||
217 | |||
218 | down_write(&per_mm->umem_rwsem); | ||
219 | if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) | ||
220 | rbt_ib_umem_remove(&umem_odp->interval_tree, | ||
221 | &per_mm->umem_tree); | ||
222 | complete_all(&umem_odp->notifier_completion); | ||
223 | |||
224 | up_write(&per_mm->umem_rwsem); | ||
225 | } | ||
226 | |||
227 | static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, | ||
228 | struct mm_struct *mm) | ||
229 | { | 171 | { |
230 | struct ib_ucontext_per_mm *per_mm; | 172 | struct ib_ucontext_per_mm *per_mm; |
231 | int ret; | ||
232 | 173 | ||
233 | per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); | 174 | per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); |
234 | if (!per_mm) | 175 | if (!per_mm) |
235 | return ERR_PTR(-ENOMEM); | 176 | return ERR_PTR(-ENOMEM); |
236 | 177 | ||
237 | per_mm->context = ctx; | ||
238 | per_mm->mm = mm; | ||
239 | per_mm->umem_tree = RB_ROOT_CACHED; | 178 | per_mm->umem_tree = RB_ROOT_CACHED; |
240 | init_rwsem(&per_mm->umem_rwsem); | 179 | init_rwsem(&per_mm->umem_rwsem); |
241 | per_mm->active = true; | ||
242 | 180 | ||
181 | WARN_ON(mm != current->mm); | ||
243 | rcu_read_lock(); | 182 | rcu_read_lock(); |
244 | per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); | 183 | per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); |
245 | rcu_read_unlock(); | 184 | rcu_read_unlock(); |
185 | return &per_mm->mn; | ||
186 | } | ||
246 | 187 | ||
247 | WARN_ON(mm != current->mm); | 188 | static void ib_umem_free_notifier(struct mmu_notifier *mn) |
248 | 189 | { | |
249 | per_mm->mn.ops = &ib_umem_notifiers; | 190 | struct ib_ucontext_per_mm *per_mm = |
250 | ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); | 191 | container_of(mn, struct ib_ucontext_per_mm, mn); |
251 | if (ret) { | ||
252 | dev_err(&ctx->device->dev, | ||
253 | "Failed to register mmu_notifier %d\n", ret); | ||
254 | goto out_pid; | ||
255 | } | ||
256 | 192 | ||
257 | list_add(&per_mm->ucontext_list, &ctx->per_mm_list); | 193 | WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); |
258 | return per_mm; | ||
259 | 194 | ||
260 | out_pid: | ||
261 | put_pid(per_mm->tgid); | 195 | put_pid(per_mm->tgid); |
262 | kfree(per_mm); | 196 | kfree(per_mm); |
263 | return ERR_PTR(ret); | ||
264 | } | 197 | } |
265 | 198 | ||
266 | static int get_per_mm(struct ib_umem_odp *umem_odp) | 199 | static const struct mmu_notifier_ops ib_umem_notifiers = { |
200 | .release = ib_umem_notifier_release, | ||
201 | .invalidate_range_start = ib_umem_notifier_invalidate_range_start, | ||
202 | .invalidate_range_end = ib_umem_notifier_invalidate_range_end, | ||
203 | .alloc_notifier = ib_umem_alloc_notifier, | ||
204 | .free_notifier = ib_umem_free_notifier, | ||
205 | }; | ||
206 | |||
207 | static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp) | ||
267 | { | 208 | { |
268 | struct ib_ucontext *ctx = umem_odp->umem.context; | ||
269 | struct ib_ucontext_per_mm *per_mm; | 209 | struct ib_ucontext_per_mm *per_mm; |
210 | struct mmu_notifier *mn; | ||
211 | int ret; | ||
270 | 212 | ||
271 | /* | 213 | umem_odp->umem.is_odp = 1; |
272 | * Generally speaking we expect only one or two per_mm in this list, | 214 | if (!umem_odp->is_implicit_odp) { |
273 | * so no reason to optimize this search today. | 215 | size_t page_size = 1UL << umem_odp->page_shift; |
274 | */ | 216 | size_t pages; |
275 | mutex_lock(&ctx->per_mm_list_lock); | 217 | |
276 | list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { | 218 | umem_odp->interval_tree.start = |
277 | if (per_mm->mm == umem_odp->umem.owning_mm) | 219 | ALIGN_DOWN(umem_odp->umem.address, page_size); |
278 | goto found; | 220 | if (check_add_overflow(umem_odp->umem.address, |
221 | umem_odp->umem.length, | ||
222 | &umem_odp->interval_tree.last)) | ||
223 | return -EOVERFLOW; | ||
224 | umem_odp->interval_tree.last = | ||
225 | ALIGN(umem_odp->interval_tree.last, page_size); | ||
226 | if (unlikely(umem_odp->interval_tree.last < page_size)) | ||
227 | return -EOVERFLOW; | ||
228 | |||
229 | pages = (umem_odp->interval_tree.last - | ||
230 | umem_odp->interval_tree.start) >> | ||
231 | umem_odp->page_shift; | ||
232 | if (!pages) | ||
233 | return -EINVAL; | ||
234 | |||
235 | /* | ||
236 | * Note that the representation of the intervals in the | ||
237 | * interval tree considers the ending point as contained in | ||
238 | * the interval. | ||
239 | */ | ||
240 | umem_odp->interval_tree.last--; | ||
241 | |||
242 | umem_odp->page_list = kvcalloc( | ||
243 | pages, sizeof(*umem_odp->page_list), GFP_KERNEL); | ||
244 | if (!umem_odp->page_list) | ||
245 | return -ENOMEM; | ||
246 | |||
247 | umem_odp->dma_list = kvcalloc( | ||
248 | pages, sizeof(*umem_odp->dma_list), GFP_KERNEL); | ||
249 | if (!umem_odp->dma_list) { | ||
250 | ret = -ENOMEM; | ||
251 | goto out_page_list; | ||
252 | } | ||
279 | } | 253 | } |
280 | 254 | ||
281 | per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm); | 255 | mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm); |
282 | if (IS_ERR(per_mm)) { | 256 | if (IS_ERR(mn)) { |
283 | mutex_unlock(&ctx->per_mm_list_lock); | 257 | ret = PTR_ERR(mn); |
284 | return PTR_ERR(per_mm); | 258 | goto out_dma_list; |
285 | } | 259 | } |
260 | umem_odp->per_mm = per_mm = | ||
261 | container_of(mn, struct ib_ucontext_per_mm, mn); | ||
286 | 262 | ||
287 | found: | 263 | mutex_init(&umem_odp->umem_mutex); |
288 | umem_odp->per_mm = per_mm; | 264 | init_completion(&umem_odp->notifier_completion); |
289 | per_mm->odp_mrs_count++; | 265 | |
290 | mutex_unlock(&ctx->per_mm_list_lock); | 266 | if (!umem_odp->is_implicit_odp) { |
267 | down_write(&per_mm->umem_rwsem); | ||
268 | interval_tree_insert(&umem_odp->interval_tree, | ||
269 | &per_mm->umem_tree); | ||
270 | up_write(&per_mm->umem_rwsem); | ||
271 | } | ||
272 | mmgrab(umem_odp->umem.owning_mm); | ||
291 | 273 | ||
292 | return 0; | 274 | return 0; |
293 | } | ||
294 | 275 | ||
295 | static void free_per_mm(struct rcu_head *rcu) | 276 | out_dma_list: |
296 | { | 277 | kvfree(umem_odp->dma_list); |
297 | kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); | 278 | out_page_list: |
279 | kvfree(umem_odp->page_list); | ||
280 | return ret; | ||
298 | } | 281 | } |
299 | 282 | ||
300 | static void put_per_mm(struct ib_umem_odp *umem_odp) | 283 | /** |
284 | * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem | ||
285 | * | ||
286 | * Implicit ODP umems do not have a VA range and do not have any page lists. | ||
287 | * They exist only to hold the per_mm reference to help the driver create | ||
288 | * children umems. | ||
289 | * | ||
290 | * @udata: udata from the syscall being used to create the umem | ||
291 | * @access: ib_reg_mr access flags | ||
292 | */ | ||
293 | struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, | ||
294 | int access) | ||
301 | { | 295 | { |
302 | struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; | 296 | struct ib_ucontext *context = |
303 | struct ib_ucontext *ctx = umem_odp->umem.context; | 297 | container_of(udata, struct uverbs_attr_bundle, driver_udata) |
304 | bool need_free; | 298 | ->context; |
305 | 299 | struct ib_umem *umem; | |
306 | mutex_lock(&ctx->per_mm_list_lock); | 300 | struct ib_umem_odp *umem_odp; |
307 | umem_odp->per_mm = NULL; | 301 | int ret; |
308 | per_mm->odp_mrs_count--; | ||
309 | need_free = per_mm->odp_mrs_count == 0; | ||
310 | if (need_free) | ||
311 | list_del(&per_mm->ucontext_list); | ||
312 | mutex_unlock(&ctx->per_mm_list_lock); | ||
313 | |||
314 | if (!need_free) | ||
315 | return; | ||
316 | 302 | ||
317 | /* | 303 | if (access & IB_ACCESS_HUGETLB) |
318 | * NOTE! mmu_notifier_unregister() can happen between a start/end | 304 | return ERR_PTR(-EINVAL); |
319 | * callback, resulting in an start/end, and thus an unbalanced | ||
320 | * lock. This doesn't really matter to us since we are about to kfree | ||
321 | * the memory that holds the lock, however LOCKDEP doesn't like this. | ||
322 | */ | ||
323 | down_write(&per_mm->umem_rwsem); | ||
324 | per_mm->active = false; | ||
325 | up_write(&per_mm->umem_rwsem); | ||
326 | 305 | ||
327 | WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); | 306 | if (!context) |
328 | mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); | 307 | return ERR_PTR(-EIO); |
329 | put_pid(per_mm->tgid); | 308 | if (WARN_ON_ONCE(!context->device->ops.invalidate_range)) |
330 | mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); | 309 | return ERR_PTR(-EINVAL); |
310 | |||
311 | umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL); | ||
312 | if (!umem_odp) | ||
313 | return ERR_PTR(-ENOMEM); | ||
314 | umem = &umem_odp->umem; | ||
315 | umem->ibdev = context->device; | ||
316 | umem->writable = ib_access_writable(access); | ||
317 | umem->owning_mm = current->mm; | ||
318 | umem_odp->is_implicit_odp = 1; | ||
319 | umem_odp->page_shift = PAGE_SHIFT; | ||
320 | |||
321 | ret = ib_init_umem_odp(umem_odp); | ||
322 | if (ret) { | ||
323 | kfree(umem_odp); | ||
324 | return ERR_PTR(ret); | ||
325 | } | ||
326 | return umem_odp; | ||
331 | } | 327 | } |
328 | EXPORT_SYMBOL(ib_umem_odp_alloc_implicit); | ||
332 | 329 | ||
333 | struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, | 330 | /** |
334 | unsigned long addr, size_t size) | 331 | * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit |
332 | * parent ODP umem | ||
333 | * | ||
334 | * @root: The parent umem enclosing the child. This must be allocated using | ||
335 | * ib_alloc_implicit_odp_umem() | ||
336 | * @addr: The starting userspace VA | ||
337 | * @size: The length of the userspace VA | ||
338 | */ | ||
339 | struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root, | ||
340 | unsigned long addr, size_t size) | ||
335 | { | 341 | { |
336 | struct ib_ucontext_per_mm *per_mm = root->per_mm; | 342 | /* |
337 | struct ib_ucontext *ctx = per_mm->context; | 343 | * Caller must ensure that root cannot be freed during the call to |
344 | * ib_alloc_odp_umem. | ||
345 | */ | ||
338 | struct ib_umem_odp *odp_data; | 346 | struct ib_umem_odp *odp_data; |
339 | struct ib_umem *umem; | 347 | struct ib_umem *umem; |
340 | int pages = size >> PAGE_SHIFT; | ||
341 | int ret; | 348 | int ret; |
342 | 349 | ||
350 | if (WARN_ON(!root->is_implicit_odp)) | ||
351 | return ERR_PTR(-EINVAL); | ||
352 | |||
343 | odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); | 353 | odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); |
344 | if (!odp_data) | 354 | if (!odp_data) |
345 | return ERR_PTR(-ENOMEM); | 355 | return ERR_PTR(-ENOMEM); |
346 | umem = &odp_data->umem; | 356 | umem = &odp_data->umem; |
347 | umem->context = ctx; | 357 | umem->ibdev = root->umem.ibdev; |
348 | umem->length = size; | 358 | umem->length = size; |
349 | umem->address = addr; | 359 | umem->address = addr; |
350 | odp_data->page_shift = PAGE_SHIFT; | ||
351 | umem->writable = root->umem.writable; | 360 | umem->writable = root->umem.writable; |
352 | umem->is_odp = 1; | 361 | umem->owning_mm = root->umem.owning_mm; |
353 | odp_data->per_mm = per_mm; | 362 | odp_data->page_shift = PAGE_SHIFT; |
354 | umem->owning_mm = per_mm->mm; | ||
355 | mmgrab(umem->owning_mm); | ||
356 | |||
357 | mutex_init(&odp_data->umem_mutex); | ||
358 | init_completion(&odp_data->notifier_completion); | ||
359 | |||
360 | odp_data->page_list = | ||
361 | vzalloc(array_size(pages, sizeof(*odp_data->page_list))); | ||
362 | if (!odp_data->page_list) { | ||
363 | ret = -ENOMEM; | ||
364 | goto out_odp_data; | ||
365 | } | ||
366 | 363 | ||
367 | odp_data->dma_list = | 364 | ret = ib_init_umem_odp(odp_data); |
368 | vzalloc(array_size(pages, sizeof(*odp_data->dma_list))); | 365 | if (ret) { |
369 | if (!odp_data->dma_list) { | 366 | kfree(odp_data); |
370 | ret = -ENOMEM; | 367 | return ERR_PTR(ret); |
371 | goto out_page_list; | ||
372 | } | 368 | } |
373 | |||
374 | /* | ||
375 | * Caller must ensure that the umem_odp that the per_mm came from | ||
376 | * cannot be freed during the call to ib_alloc_odp_umem. | ||
377 | */ | ||
378 | mutex_lock(&ctx->per_mm_list_lock); | ||
379 | per_mm->odp_mrs_count++; | ||
380 | mutex_unlock(&ctx->per_mm_list_lock); | ||
381 | add_umem_to_per_mm(odp_data); | ||
382 | |||
383 | return odp_data; | 369 | return odp_data; |
384 | |||
385 | out_page_list: | ||
386 | vfree(odp_data->page_list); | ||
387 | out_odp_data: | ||
388 | mmdrop(umem->owning_mm); | ||
389 | kfree(odp_data); | ||
390 | return ERR_PTR(ret); | ||
391 | } | 370 | } |
392 | EXPORT_SYMBOL(ib_alloc_odp_umem); | 371 | EXPORT_SYMBOL(ib_umem_odp_alloc_child); |
393 | 372 | ||
394 | int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) | 373 | /** |
374 | * ib_umem_odp_get - Create a umem_odp for a userspace va | ||
375 | * | ||
376 | * @udata: userspace context to pin memory for | ||
377 | * @addr: userspace virtual address to start at | ||
378 | * @size: length of region to pin | ||
379 | * @access: IB_ACCESS_xxx flags for memory being pinned | ||
380 | * | ||
381 | * The driver should use when the access flags indicate ODP memory. It avoids | ||
382 | * pinning, instead, stores the mm for future page fault handling in | ||
383 | * conjunction with MMU notifiers. | ||
384 | */ | ||
385 | struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, | ||
386 | size_t size, int access) | ||
395 | { | 387 | { |
396 | struct ib_umem *umem = &umem_odp->umem; | 388 | struct ib_umem_odp *umem_odp; |
397 | /* | 389 | struct ib_ucontext *context; |
398 | * NOTE: This must called in a process context where umem->owning_mm | 390 | struct mm_struct *mm; |
399 | * == current->mm | 391 | int ret; |
400 | */ | 392 | |
401 | struct mm_struct *mm = umem->owning_mm; | 393 | if (!udata) |
402 | int ret_val; | 394 | return ERR_PTR(-EIO); |
395 | |||
396 | context = container_of(udata, struct uverbs_attr_bundle, driver_udata) | ||
397 | ->context; | ||
398 | if (!context) | ||
399 | return ERR_PTR(-EIO); | ||
400 | |||
401 | if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) || | ||
402 | WARN_ON_ONCE(!context->device->ops.invalidate_range)) | ||
403 | return ERR_PTR(-EINVAL); | ||
404 | |||
405 | umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); | ||
406 | if (!umem_odp) | ||
407 | return ERR_PTR(-ENOMEM); | ||
408 | |||
409 | umem_odp->umem.ibdev = context->device; | ||
410 | umem_odp->umem.length = size; | ||
411 | umem_odp->umem.address = addr; | ||
412 | umem_odp->umem.writable = ib_access_writable(access); | ||
413 | umem_odp->umem.owning_mm = mm = current->mm; | ||
403 | 414 | ||
404 | umem_odp->page_shift = PAGE_SHIFT; | 415 | umem_odp->page_shift = PAGE_SHIFT; |
405 | if (access & IB_ACCESS_HUGETLB) { | 416 | if (access & IB_ACCESS_HUGETLB) { |
@@ -410,63 +421,63 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) | |||
410 | vma = find_vma(mm, ib_umem_start(umem_odp)); | 421 | vma = find_vma(mm, ib_umem_start(umem_odp)); |
411 | if (!vma || !is_vm_hugetlb_page(vma)) { | 422 | if (!vma || !is_vm_hugetlb_page(vma)) { |
412 | up_read(&mm->mmap_sem); | 423 | up_read(&mm->mmap_sem); |
413 | return -EINVAL; | 424 | ret = -EINVAL; |
425 | goto err_free; | ||
414 | } | 426 | } |
415 | h = hstate_vma(vma); | 427 | h = hstate_vma(vma); |
416 | umem_odp->page_shift = huge_page_shift(h); | 428 | umem_odp->page_shift = huge_page_shift(h); |
417 | up_read(&mm->mmap_sem); | 429 | up_read(&mm->mmap_sem); |
418 | } | 430 | } |
419 | 431 | ||
420 | mutex_init(&umem_odp->umem_mutex); | 432 | ret = ib_init_umem_odp(umem_odp); |
421 | 433 | if (ret) | |
422 | init_completion(&umem_odp->notifier_completion); | 434 | goto err_free; |
423 | 435 | return umem_odp; | |
424 | if (ib_umem_odp_num_pages(umem_odp)) { | ||
425 | umem_odp->page_list = | ||
426 | vzalloc(array_size(sizeof(*umem_odp->page_list), | ||
427 | ib_umem_odp_num_pages(umem_odp))); | ||
428 | if (!umem_odp->page_list) | ||
429 | return -ENOMEM; | ||
430 | |||
431 | umem_odp->dma_list = | ||
432 | vzalloc(array_size(sizeof(*umem_odp->dma_list), | ||
433 | ib_umem_odp_num_pages(umem_odp))); | ||
434 | if (!umem_odp->dma_list) { | ||
435 | ret_val = -ENOMEM; | ||
436 | goto out_page_list; | ||
437 | } | ||
438 | } | ||
439 | |||
440 | ret_val = get_per_mm(umem_odp); | ||
441 | if (ret_val) | ||
442 | goto out_dma_list; | ||
443 | add_umem_to_per_mm(umem_odp); | ||
444 | |||
445 | return 0; | ||
446 | 436 | ||
447 | out_dma_list: | 437 | err_free: |
448 | vfree(umem_odp->dma_list); | 438 | kfree(umem_odp); |
449 | out_page_list: | 439 | return ERR_PTR(ret); |
450 | vfree(umem_odp->page_list); | ||
451 | return ret_val; | ||
452 | } | 440 | } |
441 | EXPORT_SYMBOL(ib_umem_odp_get); | ||
453 | 442 | ||
454 | void ib_umem_odp_release(struct ib_umem_odp *umem_odp) | 443 | void ib_umem_odp_release(struct ib_umem_odp *umem_odp) |
455 | { | 444 | { |
445 | struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; | ||
446 | |||
456 | /* | 447 | /* |
457 | * Ensure that no more pages are mapped in the umem. | 448 | * Ensure that no more pages are mapped in the umem. |
458 | * | 449 | * |
459 | * It is the driver's responsibility to ensure, before calling us, | 450 | * It is the driver's responsibility to ensure, before calling us, |
460 | * that the hardware will not attempt to access the MR any more. | 451 | * that the hardware will not attempt to access the MR any more. |
461 | */ | 452 | */ |
462 | ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), | 453 | if (!umem_odp->is_implicit_odp) { |
463 | ib_umem_end(umem_odp)); | 454 | ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), |
455 | ib_umem_end(umem_odp)); | ||
456 | kvfree(umem_odp->dma_list); | ||
457 | kvfree(umem_odp->page_list); | ||
458 | } | ||
464 | 459 | ||
465 | remove_umem_from_per_mm(umem_odp); | 460 | down_write(&per_mm->umem_rwsem); |
466 | put_per_mm(umem_odp); | 461 | if (!umem_odp->is_implicit_odp) { |
467 | vfree(umem_odp->dma_list); | 462 | interval_tree_remove(&umem_odp->interval_tree, |
468 | vfree(umem_odp->page_list); | 463 | &per_mm->umem_tree); |
464 | complete_all(&umem_odp->notifier_completion); | ||
465 | } | ||
466 | /* | ||
467 | * NOTE! mmu_notifier_unregister() can happen between a start/end | ||
468 | * callback, resulting in a missing end, and thus an unbalanced | ||
469 | * lock. This doesn't really matter to us since we are about to kfree | ||
470 | * the memory that holds the lock, however LOCKDEP doesn't like this. | ||
471 | * Thus we call the mmu_notifier_put under the rwsem and test the | ||
472 | * internal users count to reliably see if we are past this point. | ||
473 | */ | ||
474 | mmu_notifier_put(&per_mm->mn); | ||
475 | up_write(&per_mm->umem_rwsem); | ||
476 | |||
477 | mmdrop(umem_odp->umem.owning_mm); | ||
478 | kfree(umem_odp); | ||
469 | } | 479 | } |
480 | EXPORT_SYMBOL(ib_umem_odp_release); | ||
470 | 481 | ||
471 | /* | 482 | /* |
472 | * Map for DMA and insert a single page into the on-demand paging page tables. | 483 | * Map for DMA and insert a single page into the on-demand paging page tables. |
@@ -493,8 +504,7 @@ static int ib_umem_odp_map_dma_single_page( | |||
493 | u64 access_mask, | 504 | u64 access_mask, |
494 | unsigned long current_seq) | 505 | unsigned long current_seq) |
495 | { | 506 | { |
496 | struct ib_ucontext *context = umem_odp->umem.context; | 507 | struct ib_device *dev = umem_odp->umem.ibdev; |
497 | struct ib_device *dev = context->device; | ||
498 | dma_addr_t dma_addr; | 508 | dma_addr_t dma_addr; |
499 | int remove_existing_mapping = 0; | 509 | int remove_existing_mapping = 0; |
500 | int ret = 0; | 510 | int ret = 0; |
@@ -534,7 +544,7 @@ out: | |||
534 | 544 | ||
535 | if (remove_existing_mapping) { | 545 | if (remove_existing_mapping) { |
536 | ib_umem_notifier_start_account(umem_odp); | 546 | ib_umem_notifier_start_account(umem_odp); |
537 | context->invalidate_range( | 547 | dev->ops.invalidate_range( |
538 | umem_odp, | 548 | umem_odp, |
539 | ib_umem_start(umem_odp) + | 549 | ib_umem_start(umem_odp) + |
540 | (page_index << umem_odp->page_shift), | 550 | (page_index << umem_odp->page_shift), |
@@ -707,7 +717,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, | |||
707 | { | 717 | { |
708 | int idx; | 718 | int idx; |
709 | u64 addr; | 719 | u64 addr; |
710 | struct ib_device *dev = umem_odp->umem.context->device; | 720 | struct ib_device *dev = umem_odp->umem.ibdev; |
711 | 721 | ||
712 | virt = max_t(u64, virt, ib_umem_start(umem_odp)); | 722 | virt = max_t(u64, virt, ib_umem_start(umem_odp)); |
713 | bound = min_t(u64, bound, ib_umem_end(umem_odp)); | 723 | bound = min_t(u64, bound, ib_umem_end(umem_odp)); |
@@ -761,35 +771,21 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, | |||
761 | void *cookie) | 771 | void *cookie) |
762 | { | 772 | { |
763 | int ret_val = 0; | 773 | int ret_val = 0; |
764 | struct umem_odp_node *node, *next; | 774 | struct interval_tree_node *node, *next; |
765 | struct ib_umem_odp *umem; | 775 | struct ib_umem_odp *umem; |
766 | 776 | ||
767 | if (unlikely(start == last)) | 777 | if (unlikely(start == last)) |
768 | return ret_val; | 778 | return ret_val; |
769 | 779 | ||
770 | for (node = rbt_ib_umem_iter_first(root, start, last - 1); | 780 | for (node = interval_tree_iter_first(root, start, last - 1); |
771 | node; node = next) { | 781 | node; node = next) { |
772 | /* TODO move the blockable decision up to the callback */ | 782 | /* TODO move the blockable decision up to the callback */ |
773 | if (!blockable) | 783 | if (!blockable) |
774 | return -EAGAIN; | 784 | return -EAGAIN; |
775 | next = rbt_ib_umem_iter_next(node, start, last - 1); | 785 | next = interval_tree_iter_next(node, start, last - 1); |
776 | umem = container_of(node, struct ib_umem_odp, interval_tree); | 786 | umem = container_of(node, struct ib_umem_odp, interval_tree); |
777 | ret_val = cb(umem, start, last, cookie) || ret_val; | 787 | ret_val = cb(umem, start, last, cookie) || ret_val; |
778 | } | 788 | } |
779 | 789 | ||
780 | return ret_val; | 790 | return ret_val; |
781 | } | 791 | } |
782 | EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); | ||
783 | |||
784 | struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, | ||
785 | u64 addr, u64 length) | ||
786 | { | ||
787 | struct umem_odp_node *node; | ||
788 | |||
789 | node = rbt_ib_umem_iter_first(root, addr, addr + length - 1); | ||
790 | if (node) | ||
791 | return container_of(node, struct ib_umem_odp, interval_tree); | ||
792 | return NULL; | ||
793 | |||
794 | } | ||
795 | EXPORT_SYMBOL(rbt_ib_umem_lookup); | ||
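
The hunk above drops the private rbt_ib_umem_* helpers in favor of the generic interval tree API. A minimal sketch of that API, using a hypothetical node-carrying structure (all names here are illustrative, not from this series):

    #include <linux/kernel.h>
    #include <linux/interval_tree.h>

    struct my_region {
            struct interval_tree_node node;    /* ->start/->last are inclusive */
    };

    static void demo(struct rb_root_cached *root, struct my_region *r,
                     unsigned long start, unsigned long last)
    {
            struct interval_tree_node *it;

            r->node.start = start;             /* interval is [start, last] */
            r->node.last = last;
            interval_tree_insert(&r->node, root);

            /* visit every node overlapping [start, last] */
            for (it = interval_tree_iter_first(root, start, last);
                 it; it = interval_tree_iter_next(it, start, last))
                    pr_info("overlap: %p\n",
                            container_of(it, struct my_region, node));
    }

The converted rbt_ib_umem_for_each_in_range() above is exactly this iteration, with container_of() recovering the enclosing ib_umem_odp.
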
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 7ddd0e5bc6b3..7c10dfe417a4 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c | |||
@@ -252,9 +252,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) | |||
252 | ucontext->closing = false; | 252 | ucontext->closing = false; |
253 | ucontext->cleanup_retryable = false; | 253 | ucontext->cleanup_retryable = false; |
254 | 254 | ||
255 | mutex_init(&ucontext->per_mm_list_lock); | ||
256 | INIT_LIST_HEAD(&ucontext->per_mm_list); | ||
257 | |||
258 | ret = get_unused_fd_flags(O_CLOEXEC); | 255 | ret = get_unused_fd_flags(O_CLOEXEC); |
259 | if (ret < 0) | 256 | if (ret < 0) |
260 | goto err_free; | 257 | goto err_free; |
@@ -275,8 +272,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) | |||
275 | ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata); | 272 | ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata); |
276 | if (ret) | 273 | if (ret) |
277 | goto err_file; | 274 | goto err_file; |
278 | if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) | ||
279 | ucontext->invalidate_range = NULL; | ||
280 | 275 | ||
281 | rdma_restrack_uadd(&ucontext->res); | 276 | rdma_restrack_uadd(&ucontext->res); |
282 | 277 | ||
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 11c13c1381cf..e369ac0d6f51 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c | |||
@@ -1487,6 +1487,7 @@ static void __exit ib_uverbs_cleanup(void) | |||
1487 | IB_UVERBS_NUM_FIXED_MINOR); | 1487 | IB_UVERBS_NUM_FIXED_MINOR); |
1488 | unregister_chrdev_region(dynamic_uverbs_dev, | 1488 | unregister_chrdev_region(dynamic_uverbs_dev, |
1489 | IB_UVERBS_NUM_DYNAMIC_MINOR); | 1489 | IB_UVERBS_NUM_DYNAMIC_MINOR); |
1490 | mmu_notifier_synchronize(); | ||
1490 | } | 1491 | } |
1491 | 1492 | ||
1492 | module_init(ib_uverbs_init); | 1493 | module_init(ib_uverbs_init); |
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 4e9f1507ffd9..bface798ee59 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c | |||
@@ -1867,10 +1867,6 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, | |||
1867 | if (err) | 1867 | if (err) |
1868 | goto out_sys_pages; | 1868 | goto out_sys_pages; |
1869 | 1869 | ||
1870 | if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING) | ||
1871 | context->ibucontext.invalidate_range = | ||
1872 | &mlx5_ib_invalidate_range; | ||
1873 | |||
1874 | if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { | 1870 | if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { |
1875 | err = mlx5_ib_devx_create(dev, true); | 1871 | err = mlx5_ib_devx_create(dev, true); |
1876 | if (err < 0) | 1872 | if (err < 0) |
@@ -1999,11 +1995,6 @@ static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) | |||
1999 | struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); | 1995 | struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); |
2000 | struct mlx5_bfreg_info *bfregi; | 1996 | struct mlx5_bfreg_info *bfregi; |
2001 | 1997 | ||
2002 | /* All umem's must be destroyed before destroying the ucontext. */ | ||
2003 | mutex_lock(&ibcontext->per_mm_list_lock); | ||
2004 | WARN_ON(!list_empty(&ibcontext->per_mm_list)); | ||
2005 | mutex_unlock(&ibcontext->per_mm_list_lock); | ||
2006 | |||
2007 | bfregi = &context->bfregi; | 1998 | bfregi = &context->bfregi; |
2008 | mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); | 1999 | mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); |
2009 | 2000 | ||
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index a40e0abf2338..b5aece786b36 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c | |||
@@ -56,19 +56,6 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, | |||
56 | struct scatterlist *sg; | 56 | struct scatterlist *sg; |
57 | int entry; | 57 | int entry; |
58 | 58 | ||
59 | if (umem->is_odp) { | ||
60 | struct ib_umem_odp *odp = to_ib_umem_odp(umem); | ||
61 | unsigned int page_shift = odp->page_shift; | ||
62 | |||
63 | *ncont = ib_umem_odp_num_pages(odp); | ||
64 | *count = *ncont << (page_shift - PAGE_SHIFT); | ||
65 | *shift = page_shift; | ||
66 | if (order) | ||
67 | *order = ilog2(roundup_pow_of_two(*ncont)); | ||
68 | |||
69 | return; | ||
70 | } | ||
71 | |||
72 | addr = addr >> PAGE_SHIFT; | 59 | addr = addr >> PAGE_SHIFT; |
73 | tmp = (unsigned long)addr; | 60 | tmp = (unsigned long)addr; |
74 | m = find_first_bit(&tmp, BITS_PER_LONG); | 61 | m = find_first_bit(&tmp, BITS_PER_LONG); |
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 3401f5f6792e..1eff031ef048 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c | |||
@@ -784,19 +784,37 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, | |||
784 | int *ncont, int *order) | 784 | int *ncont, int *order) |
785 | { | 785 | { |
786 | struct ib_umem *u; | 786 | struct ib_umem *u; |
787 | int err; | ||
788 | 787 | ||
789 | *umem = NULL; | 788 | *umem = NULL; |
790 | 789 | ||
791 | u = ib_umem_get(udata, start, length, access_flags, 0); | 790 | if (access_flags & IB_ACCESS_ON_DEMAND) { |
792 | err = PTR_ERR_OR_ZERO(u); | 791 | struct ib_umem_odp *odp; |
793 | if (err) { | 792 | |
794 | mlx5_ib_dbg(dev, "umem get failed (%d)\n", err); | 793 | odp = ib_umem_odp_get(udata, start, length, access_flags); |
795 | return err; | 794 | if (IS_ERR(odp)) { |
795 | mlx5_ib_dbg(dev, "umem get failed (%ld)\n", | ||
796 | PTR_ERR(odp)); | ||
797 | return PTR_ERR(odp); | ||
798 | } | ||
799 | |||
800 | u = &odp->umem; | ||
801 | |||
802 | *page_shift = odp->page_shift; | ||
803 | *ncont = ib_umem_odp_num_pages(odp); | ||
804 | *npages = *ncont << (*page_shift - PAGE_SHIFT); | ||
805 | if (order) | ||
806 | *order = ilog2(roundup_pow_of_two(*ncont)); | ||
807 | } else { | ||
808 | u = ib_umem_get(udata, start, length, access_flags, 0); | ||
809 | if (IS_ERR(u)) { | ||
810 | mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u)); | ||
811 | return PTR_ERR(u); | ||
812 | } | ||
813 | |||
814 | mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages, | ||
815 | page_shift, ncont, order); | ||
796 | } | 816 | } |
797 | 817 | ||
798 | mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages, | ||
799 | page_shift, ncont, order); | ||
800 | if (!*npages) { | 818 | if (!*npages) { |
801 | mlx5_ib_warn(dev, "avoid zero region\n"); | 819 | mlx5_ib_warn(dev, "avoid zero region\n"); |
802 | ib_umem_release(u); | 820 | ib_umem_release(u); |
@@ -1599,7 +1617,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) | |||
1599 | /* Wait for all running page-fault handlers to finish. */ | 1617 | /* Wait for all running page-fault handlers to finish. */ |
1600 | synchronize_srcu(&dev->mr_srcu); | 1618 | synchronize_srcu(&dev->mr_srcu); |
1601 | /* Destroy all page mappings */ | 1619 | /* Destroy all page mappings */ |
1602 | if (umem_odp->page_list) | 1620 | if (!umem_odp->is_implicit_odp) |
1603 | mlx5_ib_invalidate_range(umem_odp, | 1621 | mlx5_ib_invalidate_range(umem_odp, |
1604 | ib_umem_start(umem_odp), | 1622 | ib_umem_start(umem_odp), |
1605 | ib_umem_end(umem_odp)); | 1623 | ib_umem_end(umem_odp)); |
@@ -1610,7 +1628,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) | |||
1610 | * so that there will not be any invalidations in | 1628 | * so that there will not be any invalidations in |
1611 | * flight, looking at the *mr struct. | 1629 | * flight, looking at the *mr struct. |
1612 | */ | 1630 | */ |
1613 | ib_umem_release(umem); | 1631 | ib_umem_odp_release(umem_odp); |
1614 | atomic_sub(npages, &dev->mdev->priv.reg_pages); | 1632 | atomic_sub(npages, &dev->mdev->priv.reg_pages); |
1615 | 1633 | ||
1616 | /* Avoid double-freeing the umem. */ | 1634 | /* Avoid double-freeing the umem. */ |
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 0a59912a4cef..dd26e7acb37e 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c | |||
@@ -184,7 +184,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, | |||
184 | for (i = 0; i < nentries; i++, pklm++) { | 184 | for (i = 0; i < nentries; i++, pklm++) { |
185 | pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); | 185 | pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); |
186 | va = (offset + i) * MLX5_IMR_MTT_SIZE; | 186 | va = (offset + i) * MLX5_IMR_MTT_SIZE; |
187 | if (odp && odp->umem.address == va) { | 187 | if (odp && ib_umem_start(odp) == va) { |
188 | struct mlx5_ib_mr *mtt = odp->private; | 188 | struct mlx5_ib_mr *mtt = odp->private; |
189 | 189 | ||
190 | pklm->key = cpu_to_be32(mtt->ibmr.lkey); | 190 | pklm->key = cpu_to_be32(mtt->ibmr.lkey); |
@@ -206,7 +206,7 @@ static void mr_leaf_free_action(struct work_struct *work) | |||
206 | mr->parent = NULL; | 206 | mr->parent = NULL; |
207 | synchronize_srcu(&mr->dev->mr_srcu); | 207 | synchronize_srcu(&mr->dev->mr_srcu); |
208 | 208 | ||
209 | ib_umem_release(&odp->umem); | 209 | ib_umem_odp_release(odp); |
210 | if (imr->live) | 210 | if (imr->live) |
211 | mlx5_ib_update_xlt(imr, idx, 1, 0, | 211 | mlx5_ib_update_xlt(imr, idx, 1, 0, |
212 | MLX5_IB_UPD_XLT_INDIRECT | | 212 | MLX5_IB_UPD_XLT_INDIRECT | |
@@ -386,7 +386,7 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, | |||
386 | } | 386 | } |
387 | 387 | ||
388 | static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, | 388 | static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, |
389 | struct ib_umem *umem, | 389 | struct ib_umem_odp *umem_odp, |
390 | bool ksm, int access_flags) | 390 | bool ksm, int access_flags) |
391 | { | 391 | { |
392 | struct mlx5_ib_dev *dev = to_mdev(pd->device); | 392 | struct mlx5_ib_dev *dev = to_mdev(pd->device); |
@@ -404,7 +404,7 @@ static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, | |||
404 | mr->dev = dev; | 404 | mr->dev = dev; |
405 | mr->access_flags = access_flags; | 405 | mr->access_flags = access_flags; |
406 | mr->mmkey.iova = 0; | 406 | mr->mmkey.iova = 0; |
407 | mr->umem = umem; | 407 | mr->umem = &umem_odp->umem; |
408 | 408 | ||
409 | if (ksm) { | 409 | if (ksm) { |
410 | err = mlx5_ib_update_xlt(mr, 0, | 410 | err = mlx5_ib_update_xlt(mr, 0, |
@@ -464,18 +464,17 @@ next_mr: | |||
464 | if (nentries) | 464 | if (nentries) |
465 | nentries++; | 465 | nentries++; |
466 | } else { | 466 | } else { |
467 | odp = ib_alloc_odp_umem(odp_mr, addr, | 467 | odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE); |
468 | MLX5_IMR_MTT_SIZE); | ||
469 | if (IS_ERR(odp)) { | 468 | if (IS_ERR(odp)) { |
470 | mutex_unlock(&odp_mr->umem_mutex); | 469 | mutex_unlock(&odp_mr->umem_mutex); |
471 | return ERR_CAST(odp); | 470 | return ERR_CAST(odp); |
472 | } | 471 | } |
473 | 472 | ||
474 | mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0, | 473 | mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0, |
475 | mr->access_flags); | 474 | mr->access_flags); |
476 | if (IS_ERR(mtt)) { | 475 | if (IS_ERR(mtt)) { |
477 | mutex_unlock(&odp_mr->umem_mutex); | 476 | mutex_unlock(&odp_mr->umem_mutex); |
478 | ib_umem_release(&odp->umem); | 477 | ib_umem_odp_release(odp); |
479 | return ERR_CAST(mtt); | 478 | return ERR_CAST(mtt); |
480 | } | 479 | } |
481 | 480 | ||
@@ -497,7 +496,7 @@ next_mr: | |||
497 | addr += MLX5_IMR_MTT_SIZE; | 496 | addr += MLX5_IMR_MTT_SIZE; |
498 | if (unlikely(addr < io_virt + bcnt)) { | 497 | if (unlikely(addr < io_virt + bcnt)) { |
499 | odp = odp_next(odp); | 498 | odp = odp_next(odp); |
500 | if (odp && odp->umem.address != addr) | 499 | if (odp && ib_umem_start(odp) != addr) |
501 | odp = NULL; | 500 | odp = NULL; |
502 | goto next_mr; | 501 | goto next_mr; |
503 | } | 502 | } |
@@ -521,19 +520,19 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, | |||
521 | int access_flags) | 520 | int access_flags) |
522 | { | 521 | { |
523 | struct mlx5_ib_mr *imr; | 522 | struct mlx5_ib_mr *imr; |
524 | struct ib_umem *umem; | 523 | struct ib_umem_odp *umem_odp; |
525 | 524 | ||
526 | umem = ib_umem_get(udata, 0, 0, access_flags, 0); | 525 | umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags); |
527 | if (IS_ERR(umem)) | 526 | if (IS_ERR(umem_odp)) |
528 | return ERR_CAST(umem); | 527 | return ERR_CAST(umem_odp); |
529 | 528 | ||
530 | imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags); | 529 | imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags); |
531 | if (IS_ERR(imr)) { | 530 | if (IS_ERR(imr)) { |
532 | ib_umem_release(umem); | 531 | ib_umem_odp_release(umem_odp); |
533 | return ERR_CAST(imr); | 532 | return ERR_CAST(imr); |
534 | } | 533 | } |
535 | 534 | ||
536 | imr->umem = umem; | 535 | imr->umem = &umem_odp->umem; |
537 | init_waitqueue_head(&imr->q_leaf_free); | 536 | init_waitqueue_head(&imr->q_leaf_free); |
538 | atomic_set(&imr->num_leaf_free, 0); | 537 | atomic_set(&imr->num_leaf_free, 0); |
539 | atomic_set(&imr->num_pending_prefetch, 0); | 538 | atomic_set(&imr->num_pending_prefetch, 0); |
@@ -541,34 +540,31 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, | |||
541 | return imr; | 540 | return imr; |
542 | } | 541 | } |
543 | 542 | ||
544 | static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, | 543 | void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) |
545 | void *cookie) | ||
546 | { | 544 | { |
547 | struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; | 545 | struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); |
548 | 546 | struct rb_node *node; | |
549 | if (mr->parent != imr) | ||
550 | return 0; | ||
551 | |||
552 | ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), | ||
553 | ib_umem_end(umem_odp)); | ||
554 | 547 | ||
555 | if (umem_odp->dying) | 548 | down_read(&per_mm->umem_rwsem); |
556 | return 0; | 549 | for (node = rb_first_cached(&per_mm->umem_tree); node; |
550 | node = rb_next(node)) { | ||
551 | struct ib_umem_odp *umem_odp = | ||
552 | rb_entry(node, struct ib_umem_odp, interval_tree.rb); | ||
553 | struct mlx5_ib_mr *mr = umem_odp->private; | ||
557 | 554 | ||
558 | WRITE_ONCE(umem_odp->dying, 1); | 555 | if (mr->parent != imr) |
559 | atomic_inc(&imr->num_leaf_free); | 556 | continue; |
560 | schedule_work(&umem_odp->work); | ||
561 | 557 | ||
562 | return 0; | 558 | ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), |
563 | } | 559 | ib_umem_end(umem_odp)); |
564 | 560 | ||
565 | void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) | 561 | if (umem_odp->dying) |
566 | { | 562 | continue; |
567 | struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); | ||
568 | 563 | ||
569 | down_read(&per_mm->umem_rwsem); | 564 | WRITE_ONCE(umem_odp->dying, 1); |
570 | rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, | 565 | atomic_inc(&imr->num_leaf_free); |
571 | mr_leaf_free, true, imr); | 566 | schedule_work(&umem_odp->work); |
567 | } | ||
572 | up_read(&per_mm->umem_rwsem); | 568 | up_read(&per_mm->umem_rwsem); |
573 | 569 | ||
574 | wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); | 570 | wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); |
@@ -589,7 +585,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, | |||
589 | struct ib_umem_odp *odp; | 585 | struct ib_umem_odp *odp; |
590 | size_t size; | 586 | size_t size; |
591 | 587 | ||
592 | if (!odp_mr->page_list) { | 588 | if (odp_mr->is_implicit_odp) { |
593 | odp = implicit_mr_get_data(mr, io_virt, bcnt); | 589 | odp = implicit_mr_get_data(mr, io_virt, bcnt); |
594 | 590 | ||
595 | if (IS_ERR(odp)) | 591 | if (IS_ERR(odp)) |
@@ -607,7 +603,7 @@ next_mr: | |||
607 | start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; | 603 | start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; |
608 | access_mask = ODP_READ_ALLOWED_BIT; | 604 | access_mask = ODP_READ_ALLOWED_BIT; |
609 | 605 | ||
610 | if (prefetch && !downgrade && !mr->umem->writable) { | 606 | if (prefetch && !downgrade && !odp->umem.writable) { |
611 | /* prefetch with write-access must | 607 | /* prefetch with write-access must |
612 | * be supported by the MR | 608 | * be supported by the MR |
613 | */ | 609 | */ |
@@ -615,7 +611,7 @@ next_mr: | |||
615 | goto out; | 611 | goto out; |
616 | } | 612 | } |
617 | 613 | ||
618 | if (mr->umem->writable && !downgrade) | 614 | if (odp->umem.writable && !downgrade) |
619 | access_mask |= ODP_WRITE_ALLOWED_BIT; | 615 | access_mask |= ODP_WRITE_ALLOWED_BIT; |
620 | 616 | ||
621 | current_seq = READ_ONCE(odp->notifiers_seq); | 617 | current_seq = READ_ONCE(odp->notifiers_seq); |
@@ -625,8 +621,8 @@ next_mr: | |||
625 | */ | 621 | */ |
626 | smp_rmb(); | 622 | smp_rmb(); |
627 | 623 | ||
628 | ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size, | 624 | ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask, |
629 | access_mask, current_seq); | 625 | current_seq); |
630 | 626 | ||
631 | if (ret < 0) | 627 | if (ret < 0) |
632 | goto out; | 628 | goto out; |
@@ -634,8 +630,7 @@ next_mr: | |||
634 | np = ret; | 630 | np = ret; |
635 | 631 | ||
636 | mutex_lock(&odp->umem_mutex); | 632 | mutex_lock(&odp->umem_mutex); |
637 | if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem), | 633 | if (!ib_umem_mmu_notifier_retry(odp, current_seq)) { |
638 | current_seq)) { | ||
639 | /* | 634 | /* |
640 | * No need to check whether the MTTs really belong to | 635 | * No need to check whether the MTTs really belong to |
641 | * this MR, since ib_umem_odp_map_dma_pages already | 636 | * this MR, since ib_umem_odp_map_dma_pages already |
@@ -668,7 +663,7 @@ next_mr: | |||
668 | 663 | ||
669 | io_virt += size; | 664 | io_virt += size; |
670 | next = odp_next(odp); | 665 | next = odp_next(odp); |
671 | if (unlikely(!next || next->umem.address != io_virt)) { | 666 | if (unlikely(!next || ib_umem_start(next) != io_virt)) { |
672 | mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", | 667 | mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", |
673 | io_virt, next); | 668 | io_virt, next); |
674 | return -EAGAIN; | 669 | return -EAGAIN; |
@@ -1618,6 +1613,7 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) | |||
1618 | 1613 | ||
1619 | static const struct ib_device_ops mlx5_ib_dev_odp_ops = { | 1614 | static const struct ib_device_ops mlx5_ib_dev_odp_ops = { |
1620 | .advise_mr = mlx5_ib_advise_mr, | 1615 | .advise_mr = mlx5_ib_advise_mr, |
1616 | .invalidate_range = mlx5_ib_invalidate_range, | ||
1621 | }; | 1617 | }; |
1622 | 1618 | ||
1623 | int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) | 1619 | int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) |
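
mlx5_ib_free_implicit_mr() now walks the whole per-mm tree directly with the stock rbtree iterators instead of going through the removed rbt_ib_umem_for_each_in_range() helper. The underlying pattern, shown with an illustrative structure:

    #include <linux/rbtree.h>

    struct my_item {
            struct rb_node rb;
            /* payload ... */
    };

    static void visit_all(struct rb_root_cached *tree)
    {
            struct rb_node *node;

            /* rb_first_cached() is O(1); rb_next() walks in sorted order */
            for (node = rb_first_cached(tree); node; node = rb_next(node)) {
                    struct my_item *item = rb_entry(node, struct my_item, rb);

                    (void)item;    /* ... operate on item ... */
            }
    }
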
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index a2a142ae087b..9d042310214f 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c | |||
@@ -573,6 +573,7 @@ static void __exit gru_exit(void) | |||
573 | gru_free_tables(); | 573 | gru_free_tables(); |
574 | misc_deregister(&gru_miscdev); | 574 | misc_deregister(&gru_miscdev); |
575 | gru_proc_exit(); | 575 | gru_proc_exit(); |
576 | mmu_notifier_synchronize(); | ||
576 | } | 577 | } |
577 | 578 | ||
578 | static const struct file_operations gru_fops = { | 579 | static const struct file_operations gru_fops = { |
diff --git a/drivers/misc/sgi-gru/grutables.h b/drivers/misc/sgi-gru/grutables.h index 438191c22057..a7e44b2eb413 100644 --- a/drivers/misc/sgi-gru/grutables.h +++ b/drivers/misc/sgi-gru/grutables.h | |||
@@ -307,10 +307,8 @@ struct gru_mm_tracker { /* pack to reduce size */ | |||
307 | 307 | ||
308 | struct gru_mm_struct { | 308 | struct gru_mm_struct { |
309 | struct mmu_notifier ms_notifier; | 309 | struct mmu_notifier ms_notifier; |
310 | atomic_t ms_refcnt; | ||
311 | spinlock_t ms_asid_lock; /* protects ASID assignment */ | 310 | spinlock_t ms_asid_lock; /* protects ASID assignment */ |
312 | atomic_t ms_range_active;/* num range_invals active */ | 311 | atomic_t ms_range_active;/* num range_invals active */ |
313 | char ms_released; | ||
314 | wait_queue_head_t ms_wait_queue; | 312 | wait_queue_head_t ms_wait_queue; |
315 | DECLARE_BITMAP(ms_asidmap, GRU_MAX_GRUS); | 313 | DECLARE_BITMAP(ms_asidmap, GRU_MAX_GRUS); |
316 | struct gru_mm_tracker ms_asids[GRU_MAX_GRUS]; | 314 | struct gru_mm_tracker ms_asids[GRU_MAX_GRUS]; |
diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c index 59ba0adf23ce..10921cd2608d 100644 --- a/drivers/misc/sgi-gru/grutlbpurge.c +++ b/drivers/misc/sgi-gru/grutlbpurge.c | |||
@@ -235,83 +235,47 @@ static void gru_invalidate_range_end(struct mmu_notifier *mn, | |||
235 | gms, range->start, range->end); | 235 | gms, range->start, range->end); |
236 | } | 236 | } |
237 | 237 | ||
238 | static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm) | 238 | static struct mmu_notifier *gru_alloc_notifier(struct mm_struct *mm) |
239 | { | 239 | { |
240 | struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct, | 240 | struct gru_mm_struct *gms; |
241 | ms_notifier); | 241 | |
242 | gms = kzalloc(sizeof(*gms), GFP_KERNEL); | ||
243 | if (!gms) | ||
244 | return ERR_PTR(-ENOMEM); | ||
245 | STAT(gms_alloc); | ||
246 | spin_lock_init(&gms->ms_asid_lock); | ||
247 | init_waitqueue_head(&gms->ms_wait_queue); | ||
242 | 248 | ||
243 | gms->ms_released = 1; | 249 | return &gms->ms_notifier; |
244 | gru_dbg(grudev, "gms %p\n", gms); | ||
245 | } | 250 | } |
246 | 251 | ||
252 | static void gru_free_notifier(struct mmu_notifier *mn) | ||
253 | { | ||
254 | kfree(container_of(mn, struct gru_mm_struct, ms_notifier)); | ||
255 | STAT(gms_free); | ||
256 | } | ||
247 | 257 | ||
248 | static const struct mmu_notifier_ops gru_mmuops = { | 258 | static const struct mmu_notifier_ops gru_mmuops = { |
249 | .invalidate_range_start = gru_invalidate_range_start, | 259 | .invalidate_range_start = gru_invalidate_range_start, |
250 | .invalidate_range_end = gru_invalidate_range_end, | 260 | .invalidate_range_end = gru_invalidate_range_end, |
251 | .release = gru_release, | 261 | .alloc_notifier = gru_alloc_notifier, |
262 | .free_notifier = gru_free_notifier, | ||
252 | }; | 263 | }; |
253 | 264 | ||
254 | /* Move this to the basic mmu_notifier file. But for now... */ | ||
255 | static struct mmu_notifier *mmu_find_ops(struct mm_struct *mm, | ||
256 | const struct mmu_notifier_ops *ops) | ||
257 | { | ||
258 | struct mmu_notifier *mn, *gru_mn = NULL; | ||
259 | |||
260 | if (mm->mmu_notifier_mm) { | ||
261 | rcu_read_lock(); | ||
262 | hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, | ||
263 | hlist) | ||
264 | if (mn->ops == ops) { | ||
265 | gru_mn = mn; | ||
266 | break; | ||
267 | } | ||
268 | rcu_read_unlock(); | ||
269 | } | ||
270 | return gru_mn; | ||
271 | } | ||
272 | |||
273 | struct gru_mm_struct *gru_register_mmu_notifier(void) | 265 | struct gru_mm_struct *gru_register_mmu_notifier(void) |
274 | { | 266 | { |
275 | struct gru_mm_struct *gms; | ||
276 | struct mmu_notifier *mn; | 267 | struct mmu_notifier *mn; |
277 | int err; | 268 | |
278 | 269 | mn = mmu_notifier_get_locked(&gru_mmuops, current->mm); | |
279 | mn = mmu_find_ops(current->mm, &gru_mmuops); | 270 | if (IS_ERR(mn)) |
280 | if (mn) { | 271 | return ERR_CAST(mn); |
281 | gms = container_of(mn, struct gru_mm_struct, ms_notifier); | 272 | |
282 | atomic_inc(&gms->ms_refcnt); | 273 | return container_of(mn, struct gru_mm_struct, ms_notifier); |
283 | } else { | ||
284 | gms = kzalloc(sizeof(*gms), GFP_KERNEL); | ||
285 | if (!gms) | ||
286 | return ERR_PTR(-ENOMEM); | ||
287 | STAT(gms_alloc); | ||
288 | spin_lock_init(&gms->ms_asid_lock); | ||
289 | gms->ms_notifier.ops = &gru_mmuops; | ||
290 | atomic_set(&gms->ms_refcnt, 1); | ||
291 | init_waitqueue_head(&gms->ms_wait_queue); | ||
292 | err = __mmu_notifier_register(&gms->ms_notifier, current->mm); | ||
293 | if (err) | ||
294 | goto error; | ||
295 | } | ||
296 | if (gms) | ||
297 | gru_dbg(grudev, "gms %p, refcnt %d\n", gms, | ||
298 | atomic_read(&gms->ms_refcnt)); | ||
299 | return gms; | ||
300 | error: | ||
301 | kfree(gms); | ||
302 | return ERR_PTR(err); | ||
303 | } | 274 | } |
304 | 275 | ||
305 | void gru_drop_mmu_notifier(struct gru_mm_struct *gms) | 276 | void gru_drop_mmu_notifier(struct gru_mm_struct *gms) |
306 | { | 277 | { |
307 | gru_dbg(grudev, "gms %p, refcnt %d, released %d\n", gms, | 278 | mmu_notifier_put(&gms->ms_notifier); |
308 | atomic_read(&gms->ms_refcnt), gms->ms_released); | ||
309 | if (atomic_dec_return(&gms->ms_refcnt) == 0) { | ||
310 | if (!gms->ms_released) | ||
311 | mmu_notifier_unregister(&gms->ms_notifier, current->mm); | ||
312 | kfree(gms); | ||
313 | STAT(gms_free); | ||
314 | } | ||
315 | } | 279 | } |
316 | 280 | ||
317 | /* | 281 | /* |
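
The grutlbpurge.c conversion above is the new refcounted attachment idiom: the mmu notifier core finds or allocates (via ->alloc_notifier) a single notifier per mm, and mmu_notifier_put() drops the reference, calling ->free_notifier from an SRCU callback once it is safe. A minimal sketch for a hypothetical driver; note that mmu_notifier_get() takes mmap_sem internally, while the _locked variant used by GRU expects the caller to already hold it for write:

    #include <linux/mmu_notifier.h>
    #include <linux/sched/mm.h>
    #include <linux/slab.h>
    #include <linux/err.h>

    struct my_notifier {
            struct mmu_notifier mn;
            /* per-mm driver state ... */
    };

    static struct mmu_notifier *my_alloc_notifier(struct mm_struct *mm)
    {
            struct my_notifier *p = kzalloc(sizeof(*p), GFP_KERNEL);

            return p ? &p->mn : ERR_PTR(-ENOMEM);
    }

    static void my_free_notifier(struct mmu_notifier *mn)
    {
            /* runs via SRCU once the last reference is dropped */
            kfree(container_of(mn, struct my_notifier, mn));
    }

    static const struct mmu_notifier_ops my_mmuops = {
            .alloc_notifier = my_alloc_notifier,
            .free_notifier  = my_free_notifier,
    };

    static struct my_notifier *my_attach(void)
    {
            /* returns the existing notifier for current->mm or allocates one */
            struct mmu_notifier *mn = mmu_notifier_get(&my_mmuops, current->mm);

            if (IS_ERR(mn))
                    return ERR_CAST(mn);
            return container_of(mn, struct my_notifier, mn);
    }

    static void my_detach(struct my_notifier *p)
    {
            mmu_notifier_put(&p->mn);
    }

Because ->free_notifier runs asynchronously, a module using this idiom has to call mmu_notifier_synchronize() on exit, which is what the grufile.c and uverbs_main.c hunks add.
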
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index a5fde15e91d3..36af7af6b7cf 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig | |||
@@ -118,4 +118,16 @@ config NVDIMM_KEYS | |||
118 | depends on ENCRYPTED_KEYS | 118 | depends on ENCRYPTED_KEYS |
119 | depends on (LIBNVDIMM=ENCRYPTED_KEYS) || LIBNVDIMM=m | 119 | depends on (LIBNVDIMM=ENCRYPTED_KEYS) || LIBNVDIMM=m |
120 | 120 | ||
121 | config NVDIMM_TEST_BUILD | ||
122 | tristate "Build the unit test core" | ||
123 | depends on m | ||
124 | depends on COMPILE_TEST && X86_64 | ||
125 | default m if COMPILE_TEST | ||
126 | help | ||
127 | Build the core of the unit test infrastructure. The result of | ||
128 | this build is non-functional for unit test execution, but it | ||
129 | otherwise helps catch build errors induced by changes to the | ||
130 | core devm_memremap_pages() implementation and other | ||
131 | infrastructure. | ||
132 | |||
121 | endif | 133 | endif |
diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index cefe233e0b52..29203f3d3069 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile | |||
@@ -29,3 +29,7 @@ libnvdimm-$(CONFIG_BTT) += btt_devs.o | |||
29 | libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o | 29 | libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o |
30 | libnvdimm-$(CONFIG_NVDIMM_DAX) += dax_devs.o | 30 | libnvdimm-$(CONFIG_NVDIMM_DAX) += dax_devs.o |
31 | libnvdimm-$(CONFIG_NVDIMM_KEYS) += security.o | 31 | libnvdimm-$(CONFIG_NVDIMM_KEYS) += security.o |
32 | |||
33 | TOOLS := ../../tools | ||
34 | TEST_SRC := $(TOOLS)/testing/nvdimm/test | ||
35 | obj-$(CONFIG_NVDIMM_TEST_BUILD) += $(TEST_SRC)/iomap.o | ||
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 731642e0f5a0..bf43d1d60059 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -1,5 +1,5 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/mm.h> | 2 | #include <linux/pagewalk.h> |
3 | #include <linux/vmacache.h> | 3 | #include <linux/vmacache.h> |
4 | #include <linux/hugetlb.h> | 4 | #include <linux/hugetlb.h> |
5 | #include <linux/huge_mm.h> | 5 | #include <linux/huge_mm.h> |
@@ -513,7 +513,9 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end, | |||
513 | 513 | ||
514 | return 0; | 514 | return 0; |
515 | } | 515 | } |
516 | #endif | 516 | #else |
517 | #define smaps_pte_hole NULL | ||
518 | #endif /* CONFIG_SHMEM */ | ||
517 | 519 | ||
518 | static void smaps_pte_entry(pte_t *pte, unsigned long addr, | 520 | static void smaps_pte_entry(pte_t *pte, unsigned long addr, |
519 | struct mm_walk *walk) | 521 | struct mm_walk *walk) |
@@ -729,21 +731,24 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, | |||
729 | } | 731 | } |
730 | return 0; | 732 | return 0; |
731 | } | 733 | } |
734 | #else | ||
735 | #define smaps_hugetlb_range NULL | ||
732 | #endif /* HUGETLB_PAGE */ | 736 | #endif /* HUGETLB_PAGE */ |
733 | 737 | ||
738 | static const struct mm_walk_ops smaps_walk_ops = { | ||
739 | .pmd_entry = smaps_pte_range, | ||
740 | .hugetlb_entry = smaps_hugetlb_range, | ||
741 | }; | ||
742 | |||
743 | static const struct mm_walk_ops smaps_shmem_walk_ops = { | ||
744 | .pmd_entry = smaps_pte_range, | ||
745 | .hugetlb_entry = smaps_hugetlb_range, | ||
746 | .pte_hole = smaps_pte_hole, | ||
747 | }; | ||
748 | |||
734 | static void smap_gather_stats(struct vm_area_struct *vma, | 749 | static void smap_gather_stats(struct vm_area_struct *vma, |
735 | struct mem_size_stats *mss) | 750 | struct mem_size_stats *mss) |
736 | { | 751 | { |
737 | struct mm_walk smaps_walk = { | ||
738 | .pmd_entry = smaps_pte_range, | ||
739 | #ifdef CONFIG_HUGETLB_PAGE | ||
740 | .hugetlb_entry = smaps_hugetlb_range, | ||
741 | #endif | ||
742 | .mm = vma->vm_mm, | ||
743 | }; | ||
744 | |||
745 | smaps_walk.private = mss; | ||
746 | |||
747 | #ifdef CONFIG_SHMEM | 752 | #ifdef CONFIG_SHMEM |
748 | /* In case of smaps_rollup, reset the value from previous vma */ | 753 | /* In case of smaps_rollup, reset the value from previous vma */ |
749 | mss->check_shmem_swap = false; | 754 | mss->check_shmem_swap = false; |
@@ -765,12 +770,13 @@ static void smap_gather_stats(struct vm_area_struct *vma, | |||
765 | mss->swap += shmem_swapped; | 770 | mss->swap += shmem_swapped; |
766 | } else { | 771 | } else { |
767 | mss->check_shmem_swap = true; | 772 | mss->check_shmem_swap = true; |
768 | smaps_walk.pte_hole = smaps_pte_hole; | 773 | walk_page_vma(vma, &smaps_shmem_walk_ops, mss); |
774 | return; | ||
769 | } | 775 | } |
770 | } | 776 | } |
771 | #endif | 777 | #endif |
772 | /* mmap_sem is held in m_start */ | 778 | /* mmap_sem is held in m_start */ |
773 | walk_page_vma(vma, &smaps_walk); | 779 | walk_page_vma(vma, &smaps_walk_ops, mss); |
774 | } | 780 | } |
775 | 781 | ||
776 | #define SEQ_PUT_DEC(str, val) \ | 782 | #define SEQ_PUT_DEC(str, val) \ |
@@ -1118,6 +1124,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, | |||
1118 | return 0; | 1124 | return 0; |
1119 | } | 1125 | } |
1120 | 1126 | ||
1127 | static const struct mm_walk_ops clear_refs_walk_ops = { | ||
1128 | .pmd_entry = clear_refs_pte_range, | ||
1129 | .test_walk = clear_refs_test_walk, | ||
1130 | }; | ||
1131 | |||
1121 | static ssize_t clear_refs_write(struct file *file, const char __user *buf, | 1132 | static ssize_t clear_refs_write(struct file *file, const char __user *buf, |
1122 | size_t count, loff_t *ppos) | 1133 | size_t count, loff_t *ppos) |
1123 | { | 1134 | { |
@@ -1151,12 +1162,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, | |||
1151 | struct clear_refs_private cp = { | 1162 | struct clear_refs_private cp = { |
1152 | .type = type, | 1163 | .type = type, |
1153 | }; | 1164 | }; |
1154 | struct mm_walk clear_refs_walk = { | ||
1155 | .pmd_entry = clear_refs_pte_range, | ||
1156 | .test_walk = clear_refs_test_walk, | ||
1157 | .mm = mm, | ||
1158 | .private = &cp, | ||
1159 | }; | ||
1160 | 1165 | ||
1161 | if (type == CLEAR_REFS_MM_HIWATER_RSS) { | 1166 | if (type == CLEAR_REFS_MM_HIWATER_RSS) { |
1162 | if (down_write_killable(&mm->mmap_sem)) { | 1167 | if (down_write_killable(&mm->mmap_sem)) { |
@@ -1217,7 +1222,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, | |||
1217 | 0, NULL, mm, 0, -1UL); | 1222 | 0, NULL, mm, 0, -1UL); |
1218 | mmu_notifier_invalidate_range_start(&range); | 1223 | mmu_notifier_invalidate_range_start(&range); |
1219 | } | 1224 | } |
1220 | walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); | 1225 | walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops, |
1226 | &cp); | ||
1221 | if (type == CLEAR_REFS_SOFT_DIRTY) | 1227 | if (type == CLEAR_REFS_SOFT_DIRTY) |
1222 | mmu_notifier_invalidate_range_end(&range); | 1228 | mmu_notifier_invalidate_range_end(&range); |
1223 | tlb_finish_mmu(&tlb, 0, -1); | 1229 | tlb_finish_mmu(&tlb, 0, -1); |
@@ -1489,8 +1495,16 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, | |||
1489 | 1495 | ||
1490 | return err; | 1496 | return err; |
1491 | } | 1497 | } |
1498 | #else | ||
1499 | #define pagemap_hugetlb_range NULL | ||
1492 | #endif /* HUGETLB_PAGE */ | 1500 | #endif /* HUGETLB_PAGE */ |
1493 | 1501 | ||
1502 | static const struct mm_walk_ops pagemap_ops = { | ||
1503 | .pmd_entry = pagemap_pmd_range, | ||
1504 | .pte_hole = pagemap_pte_hole, | ||
1505 | .hugetlb_entry = pagemap_hugetlb_range, | ||
1506 | }; | ||
1507 | |||
1494 | /* | 1508 | /* |
1495 | * /proc/pid/pagemap - an array mapping virtual pages to pfns | 1509 | * /proc/pid/pagemap - an array mapping virtual pages to pfns |
1496 | * | 1510 | * |
@@ -1522,7 +1536,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, | |||
1522 | { | 1536 | { |
1523 | struct mm_struct *mm = file->private_data; | 1537 | struct mm_struct *mm = file->private_data; |
1524 | struct pagemapread pm; | 1538 | struct pagemapread pm; |
1525 | struct mm_walk pagemap_walk = {}; | ||
1526 | unsigned long src; | 1539 | unsigned long src; |
1527 | unsigned long svpfn; | 1540 | unsigned long svpfn; |
1528 | unsigned long start_vaddr; | 1541 | unsigned long start_vaddr; |
@@ -1550,14 +1563,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, | |||
1550 | if (!pm.buffer) | 1563 | if (!pm.buffer) |
1551 | goto out_mm; | 1564 | goto out_mm; |
1552 | 1565 | ||
1553 | pagemap_walk.pmd_entry = pagemap_pmd_range; | ||
1554 | pagemap_walk.pte_hole = pagemap_pte_hole; | ||
1555 | #ifdef CONFIG_HUGETLB_PAGE | ||
1556 | pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; | ||
1557 | #endif | ||
1558 | pagemap_walk.mm = mm; | ||
1559 | pagemap_walk.private = &pm; | ||
1560 | |||
1561 | src = *ppos; | 1566 | src = *ppos; |
1562 | svpfn = src / PM_ENTRY_BYTES; | 1567 | svpfn = src / PM_ENTRY_BYTES; |
1563 | start_vaddr = svpfn << PAGE_SHIFT; | 1568 | start_vaddr = svpfn << PAGE_SHIFT; |
@@ -1586,7 +1591,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, | |||
1586 | ret = down_read_killable(&mm->mmap_sem); | 1591 | ret = down_read_killable(&mm->mmap_sem); |
1587 | if (ret) | 1592 | if (ret) |
1588 | goto out_free; | 1593 | goto out_free; |
1589 | ret = walk_page_range(start_vaddr, end, &pagemap_walk); | 1594 | ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); |
1590 | up_read(&mm->mmap_sem); | 1595 | up_read(&mm->mmap_sem); |
1591 | start_vaddr = end; | 1596 | start_vaddr = end; |
1592 | 1597 | ||
@@ -1798,6 +1803,11 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, | |||
1798 | } | 1803 | } |
1799 | #endif | 1804 | #endif |
1800 | 1805 | ||
1806 | static const struct mm_walk_ops show_numa_ops = { | ||
1807 | .hugetlb_entry = gather_hugetlb_stats, | ||
1808 | .pmd_entry = gather_pte_stats, | ||
1809 | }; | ||
1810 | |||
1801 | /* | 1811 | /* |
1802 | * Display pages allocated per node and memory policy via /proc. | 1812 | * Display pages allocated per node and memory policy via /proc. |
1803 | */ | 1813 | */ |
@@ -1809,12 +1819,6 @@ static int show_numa_map(struct seq_file *m, void *v) | |||
1809 | struct numa_maps *md = &numa_priv->md; | 1819 | struct numa_maps *md = &numa_priv->md; |
1810 | struct file *file = vma->vm_file; | 1820 | struct file *file = vma->vm_file; |
1811 | struct mm_struct *mm = vma->vm_mm; | 1821 | struct mm_struct *mm = vma->vm_mm; |
1812 | struct mm_walk walk = { | ||
1813 | .hugetlb_entry = gather_hugetlb_stats, | ||
1814 | .pmd_entry = gather_pte_stats, | ||
1815 | .private = md, | ||
1816 | .mm = mm, | ||
1817 | }; | ||
1818 | struct mempolicy *pol; | 1822 | struct mempolicy *pol; |
1819 | char buffer[64]; | 1823 | char buffer[64]; |
1820 | int nid; | 1824 | int nid; |
@@ -1848,7 +1852,7 @@ static int show_numa_map(struct seq_file *m, void *v) | |||
1848 | seq_puts(m, " huge"); | 1852 | seq_puts(m, " huge"); |
1849 | 1853 | ||
1850 | /* mmap_sem is held by m_start */ | 1854 | /* mmap_sem is held by m_start */ |
1851 | walk_page_vma(vma, &walk); | 1855 | walk_page_vma(vma, &show_numa_ops, md); |
1852 | 1856 | ||
1853 | if (!md->pages) | 1857 | if (!md->pages) |
1854 | goto out; | 1858 | goto out; |
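
The task_mmu.c changes illustrate the reworked page walker: callbacks now live in a static const struct mm_walk_ops, unused hooks are simply NULL, and the mm plus the private pointer are passed straight to walk_page_range()/walk_page_vma(). A small sketch of a walker counting present PTEs (names are illustrative, not from this series):

    #include <linux/pagewalk.h>
    #include <linux/mm.h>

    static int count_pte(pte_t *pte, unsigned long addr, unsigned long next,
                         struct mm_walk *walk)
    {
            unsigned long *count = walk->private;

            if (pte_present(*pte))
                    (*count)++;
            return 0;
    }

    static const struct mm_walk_ops count_ops = {
            .pte_entry = count_pte,
    };

    static unsigned long count_present(struct mm_struct *mm,
                                       unsigned long start, unsigned long end)
    {
            unsigned long count = 0;

            down_read(&mm->mmap_sem);   /* the walker now lockdep-asserts this */
            walk_page_range(mm, start, end, &count_ops, &count);
            up_read(&mm->mmap_sem);
            return count;
    }
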
diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 7ef56dc18050..3fec513b9c00 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h | |||
@@ -84,15 +84,12 @@ | |||
84 | * @notifiers: count of active mmu notifiers | 84 | * @notifiers: count of active mmu notifiers |
85 | */ | 85 | */ |
86 | struct hmm { | 86 | struct hmm { |
87 | struct mm_struct *mm; | 87 | struct mmu_notifier mmu_notifier; |
88 | struct kref kref; | ||
89 | spinlock_t ranges_lock; | 88 | spinlock_t ranges_lock; |
90 | struct list_head ranges; | 89 | struct list_head ranges; |
91 | struct list_head mirrors; | 90 | struct list_head mirrors; |
92 | struct mmu_notifier mmu_notifier; | ||
93 | struct rw_semaphore mirrors_sem; | 91 | struct rw_semaphore mirrors_sem; |
94 | wait_queue_head_t wq; | 92 | wait_queue_head_t wq; |
95 | struct rcu_head rcu; | ||
96 | long notifiers; | 93 | long notifiers; |
97 | }; | 94 | }; |
98 | 95 | ||
@@ -158,13 +155,11 @@ enum hmm_pfn_value_e { | |||
158 | * @values: pfn value for some special case (none, special, error, ...) | 155 | * @values: pfn value for some special case (none, special, error, ...) |
159 | * @default_flags: default flags for the range (write, read, ... see hmm doc) | 156 | * @default_flags: default flags for the range (write, read, ... see hmm doc) |
160 | * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter | 157 | * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter |
161 | * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT) | ||
162 | * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) | 158 | * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) |
163 | * @valid: pfns array did not change since it has been fill by an HMM function | 159 | * @valid: pfns array did not change since it has been fill by an HMM function |
164 | */ | 160 | */ |
165 | struct hmm_range { | 161 | struct hmm_range { |
166 | struct hmm *hmm; | 162 | struct hmm *hmm; |
167 | struct vm_area_struct *vma; | ||
168 | struct list_head list; | 163 | struct list_head list; |
169 | unsigned long start; | 164 | unsigned long start; |
170 | unsigned long end; | 165 | unsigned long end; |
@@ -173,32 +168,11 @@ struct hmm_range { | |||
173 | const uint64_t *values; | 168 | const uint64_t *values; |
174 | uint64_t default_flags; | 169 | uint64_t default_flags; |
175 | uint64_t pfn_flags_mask; | 170 | uint64_t pfn_flags_mask; |
176 | uint8_t page_shift; | ||
177 | uint8_t pfn_shift; | 171 | uint8_t pfn_shift; |
178 | bool valid; | 172 | bool valid; |
179 | }; | 173 | }; |
180 | 174 | ||
181 | /* | 175 | /* |
182 | * hmm_range_page_shift() - return the page shift for the range | ||
183 | * @range: range being queried | ||
184 | * Return: page shift (page size = 1 << page shift) for the range | ||
185 | */ | ||
186 | static inline unsigned hmm_range_page_shift(const struct hmm_range *range) | ||
187 | { | ||
188 | return range->page_shift; | ||
189 | } | ||
190 | |||
191 | /* | ||
192 | * hmm_range_page_size() - return the page size for the range | ||
193 | * @range: range being queried | ||
194 | * Return: page size for the range in bytes | ||
195 | */ | ||
196 | static inline unsigned long hmm_range_page_size(const struct hmm_range *range) | ||
197 | { | ||
198 | return 1UL << hmm_range_page_shift(range); | ||
199 | } | ||
200 | |||
201 | /* | ||
202 | * hmm_range_wait_until_valid() - wait for range to be valid | 176 | * hmm_range_wait_until_valid() - wait for range to be valid |
203 | * @range: range affected by invalidation to wait on | 177 | * @range: range affected by invalidation to wait on |
204 | * @timeout: time out for wait in ms (ie abort wait after that period of time) | 178 | * @timeout: time out for wait in ms (ie abort wait after that period of time) |
@@ -291,40 +265,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, | |||
291 | } | 265 | } |
292 | 266 | ||
293 | /* | 267 | /* |
294 | * Old API: | ||
295 | * hmm_pfn_to_page() | ||
296 | * hmm_pfn_to_pfn() | ||
297 | * hmm_pfn_from_page() | ||
298 | * hmm_pfn_from_pfn() | ||
299 | * | ||
300 | * This are the OLD API please use new API, it is here to avoid cross-tree | ||
301 | * merge painfullness ie we convert things to new API in stages. | ||
302 | */ | ||
303 | static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, | ||
304 | uint64_t pfn) | ||
305 | { | ||
306 | return hmm_device_entry_to_page(range, pfn); | ||
307 | } | ||
308 | |||
309 | static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, | ||
310 | uint64_t pfn) | ||
311 | { | ||
312 | return hmm_device_entry_to_pfn(range, pfn); | ||
313 | } | ||
314 | |||
315 | static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, | ||
316 | struct page *page) | ||
317 | { | ||
318 | return hmm_device_entry_from_page(range, page); | ||
319 | } | ||
320 | |||
321 | static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, | ||
322 | unsigned long pfn) | ||
323 | { | ||
324 | return hmm_device_entry_from_pfn(range, pfn); | ||
325 | } | ||
326 | |||
327 | /* | ||
328 | * Mirroring: how to synchronize device page table with CPU page table. | 268 | * Mirroring: how to synchronize device page table with CPU page table. |
329 | * | 269 | * |
330 | * A device driver that is participating in HMM mirroring must always | 270 | * A device driver that is participating in HMM mirroring must always |
@@ -375,29 +315,6 @@ static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, | |||
375 | struct hmm_mirror; | 315 | struct hmm_mirror; |
376 | 316 | ||
377 | /* | 317 | /* |
378 | * enum hmm_update_event - type of update | ||
379 | * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why) | ||
380 | */ | ||
381 | enum hmm_update_event { | ||
382 | HMM_UPDATE_INVALIDATE, | ||
383 | }; | ||
384 | |||
385 | /* | ||
386 | * struct hmm_update - HMM update information for callback | ||
387 | * | ||
388 | * @start: virtual start address of the range to update | ||
389 | * @end: virtual end address of the range to update | ||
390 | * @event: event triggering the update (what is happening) | ||
391 | * @blockable: can the callback block/sleep ? | ||
392 | */ | ||
393 | struct hmm_update { | ||
394 | unsigned long start; | ||
395 | unsigned long end; | ||
396 | enum hmm_update_event event; | ||
397 | bool blockable; | ||
398 | }; | ||
399 | |||
400 | /* | ||
401 | * struct hmm_mirror_ops - HMM mirror device operations callback | 318 | * struct hmm_mirror_ops - HMM mirror device operations callback |
402 | * | 319 | * |
403 | * @update: callback to update range on a device | 320 | * @update: callback to update range on a device |
@@ -417,9 +334,9 @@ struct hmm_mirror_ops { | |||
417 | /* sync_cpu_device_pagetables() - synchronize page tables | 334 | /* sync_cpu_device_pagetables() - synchronize page tables |
418 | * | 335 | * |
419 | * @mirror: pointer to struct hmm_mirror | 336 | * @mirror: pointer to struct hmm_mirror |
420 | * @update: update information (see struct hmm_update) | 337 | * @update: update information (see struct mmu_notifier_range) |
421 | * Return: -EAGAIN if update.blockable false and callback need to | 338 | * Return: -EAGAIN if mmu_notifier_range_blockable(update) is false |
422 | * block, 0 otherwise. | 339 | * and callback needs to block, 0 otherwise. |
423 | * | 340 | * |
424 | * This callback ultimately originates from mmu_notifiers when the CPU | 341 | * This callback ultimately originates from mmu_notifiers when the CPU |
425 | * page table is updated. The device driver must update its page table | 342 | * page table is updated. The device driver must update its page table |
@@ -430,8 +347,9 @@ struct hmm_mirror_ops { | |||
430 | * page tables are completely updated (TLBs flushed, etc); this is a | 347 | * page tables are completely updated (TLBs flushed, etc); this is a |
431 | * synchronous call. | 348 | * synchronous call. |
432 | */ | 349 | */ |
433 | int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, | 350 | int (*sync_cpu_device_pagetables)( |
434 | const struct hmm_update *update); | 351 | struct hmm_mirror *mirror, |
352 | const struct mmu_notifier_range *update); | ||
435 | }; | 353 | }; |
436 | 354 | ||
437 | /* | 355 | /* |
@@ -457,20 +375,24 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); | |||
457 | /* | 375 | /* |
458 | * Please see Documentation/vm/hmm.rst for how to use the range API. | 376 | * Please see Documentation/vm/hmm.rst for how to use the range API. |
459 | */ | 377 | */ |
460 | int hmm_range_register(struct hmm_range *range, | 378 | int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror); |
461 | struct hmm_mirror *mirror, | ||
462 | unsigned long start, | ||
463 | unsigned long end, | ||
464 | unsigned page_shift); | ||
465 | void hmm_range_unregister(struct hmm_range *range); | 379 | void hmm_range_unregister(struct hmm_range *range); |
466 | long hmm_range_snapshot(struct hmm_range *range); | 380 | |
467 | long hmm_range_fault(struct hmm_range *range, bool block); | 381 | /* |
382 | * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case. | ||
383 | */ | ||
384 | #define HMM_FAULT_ALLOW_RETRY (1 << 0) | ||
385 | |||
386 | /* Don't fault in missing PTEs, just snapshot the current state. */ | ||
387 | #define HMM_FAULT_SNAPSHOT (1 << 1) | ||
388 | |||
389 | long hmm_range_fault(struct hmm_range *range, unsigned int flags); | ||
390 | |||
468 | long hmm_range_dma_map(struct hmm_range *range, | 391 | long hmm_range_dma_map(struct hmm_range *range, |
469 | struct device *device, | 392 | struct device *device, |
470 | dma_addr_t *daddrs, | 393 | dma_addr_t *daddrs, |
471 | bool block); | 394 | unsigned int flags); |
472 | long hmm_range_dma_unmap(struct hmm_range *range, | 395 | long hmm_range_dma_unmap(struct hmm_range *range, |
473 | struct vm_area_struct *vma, | ||
474 | struct device *device, | 396 | struct device *device, |
475 | dma_addr_t *daddrs, | 397 | dma_addr_t *daddrs, |
476 | bool dirty); | 398 | bool dirty); |
@@ -484,13 +406,6 @@ long hmm_range_dma_unmap(struct hmm_range *range, | |||
484 | */ | 406 | */ |
485 | #define HMM_RANGE_DEFAULT_TIMEOUT 1000 | 407 | #define HMM_RANGE_DEFAULT_TIMEOUT 1000 |
486 | 408 | ||
487 | /* Below are for HMM internal use only! Not to be used by device driver! */ | ||
488 | static inline void hmm_mm_init(struct mm_struct *mm) | ||
489 | { | ||
490 | mm->hmm = NULL; | ||
491 | } | ||
492 | #else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ | ||
493 | static inline void hmm_mm_init(struct mm_struct *mm) {} | ||
494 | #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ | 409 | #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ |
495 | 410 | ||
496 | #endif /* LINUX_HMM_H */ | 411 | #endif /* LINUX_HMM_H */ |
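
With hmm_range_snapshot() gone, a mirror driver selects faulting or snapshotting behaviour through the flags argument of hmm_range_fault(), and hmm_range_register() only takes the range and the mirror since start/end now live in struct hmm_range. A rough sketch of the call pattern, with the driver's pfn encoding tables passed in and retry/error handling trimmed (identifiers other than the HMM API itself are illustrative):

    #include <linux/hmm.h>
    #include <linux/sched/mm.h>

    static long my_snapshot(struct hmm_mirror *mirror, struct mm_struct *mm,
                            unsigned long start, unsigned long end,
                            uint64_t *pfns, const uint64_t *my_flags,
                            const uint64_t *my_values)
    {
            struct hmm_range range = {
                    .start  = start,
                    .end    = end,
                    .pfns   = pfns,
                    .flags  = my_flags,    /* driver's HMM_PFN_* encoding */
                    .values = my_values,
            };
            long ret;

            ret = hmm_range_register(&range, mirror);
            if (ret)
                    return ret;

            if (!hmm_range_wait_until_valid(&range, HMM_RANGE_DEFAULT_TIMEOUT)) {
                    hmm_range_unregister(&range);
                    return -EBUSY;
            }

            down_read(&mm->mmap_sem);
            /* HMM_FAULT_SNAPSHOT: report the current state, fault nothing in */
            ret = hmm_range_fault(&range, HMM_FAULT_SNAPSHOT);
            up_read(&mm->mmap_sem);

            hmm_range_unregister(&range);
            return ret;
    }
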
diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5b6a7121c9f0..7bddddfc76d6 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h | |||
@@ -297,6 +297,8 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) | |||
297 | 297 | ||
298 | struct resource *devm_request_free_mem_region(struct device *dev, | 298 | struct resource *devm_request_free_mem_region(struct device *dev, |
299 | struct resource *base, unsigned long size); | 299 | struct resource *base, unsigned long size); |
300 | struct resource *request_free_mem_region(struct resource *base, | ||
301 | unsigned long size, const char *name); | ||
300 | 302 | ||
301 | #endif /* __ASSEMBLY__ */ | 303 | #endif /* __ASSEMBLY__ */ |
302 | #endif /* _LINUX_IOPORT_H */ | 304 | #endif /* _LINUX_IOPORT_H */ |
diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4fa360a13c1e..d83d403dac2e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h | |||
@@ -217,7 +217,9 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); | |||
217 | * might_sleep - annotation for functions that can sleep | 217 | * might_sleep - annotation for functions that can sleep |
218 | * | 218 | * |
219 | * this macro will print a stack trace if it is executed in an atomic | 219 | * this macro will print a stack trace if it is executed in an atomic |
220 | * context (spinlock, irq-handler, ...). | 220 | * context (spinlock, irq-handler, ...). Additional sections where blocking is |
221 | * not allowed can be annotated with non_block_start() and non_block_end() | ||
222 | * pairs. | ||
221 | * | 223 | * |
222 | * This is a useful debugging help to be able to catch problems early and not | 224 | * This is a useful debugging help to be able to catch problems early and not |
223 | * be bitten later when the calling function happens to sleep when it is not | 225 | * be bitten later when the calling function happens to sleep when it is not |
@@ -233,6 +235,23 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); | |||
233 | # define cant_sleep() \ | 235 | # define cant_sleep() \ |
234 | do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) | 236 | do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) |
235 | # define sched_annotate_sleep() (current->task_state_change = 0) | 237 | # define sched_annotate_sleep() (current->task_state_change = 0) |
238 | /** | ||
239 | * non_block_start - annotate the start of section where sleeping is prohibited | ||
240 | * | ||
241 | * This is on behalf of the oom reaper, specifically when it is calling the mmu | ||
242 | * notifiers. The problem is that if the notifier were to block on, for example, | ||
243 | * mutex_lock() and if the process which holds that mutex were to perform a | ||
244 | * sleeping memory allocation, the oom reaper is now blocked on completion of | ||
245 | * that memory allocation. Other blocking calls like wait_event() pose similar | ||
246 | * issues. | ||
247 | */ | ||
248 | # define non_block_start() (current->non_block_count++) | ||
249 | /** | ||
250 | * non_block_end - annotate the end of section where sleeping is prohibited | ||
251 | * | ||
252 | * Closes a section opened by non_block_start(). | ||
253 | */ | ||
254 | # define non_block_end() WARN_ON(current->non_block_count-- == 0) | ||
236 | #else | 255 | #else |
237 | static inline void ___might_sleep(const char *file, int line, | 256 | static inline void ___might_sleep(const char *file, int line, |
238 | int preempt_offset) { } | 257 | int preempt_offset) { } |
@@ -241,6 +260,8 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); | |||
241 | # define might_sleep() do { might_resched(); } while (0) | 260 | # define might_sleep() do { might_resched(); } while (0) |
242 | # define cant_sleep() do { } while (0) | 261 | # define cant_sleep() do { } while (0) |
243 | # define sched_annotate_sleep() do { } while (0) | 262 | # define sched_annotate_sleep() do { } while (0) |
263 | # define non_block_start() do { } while (0) | ||
264 | # define non_block_end() do { } while (0) | ||
244 | #endif | 265 | #endif |
245 | 266 | ||
246 | #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) | 267 | #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) |
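
non_block_start()/non_block_end() extend the might_sleep() machinery to regions that must not block even though preemption stays enabled, such as the !blockable mmu notifier invalidations issued on behalf of the oom reaper. Trivial usage sketch:

    #include <linux/kernel.h>

    static void my_nonblocking_section(void)
    {
            non_block_start();
            /*
             * Anything in here that reaches might_sleep() (mutex_lock(),
             * wait_event(), a GFP_KERNEL allocation, ...) now triggers
             * the same splat as sleeping in atomic context.
             */
            non_block_end();
    }
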
diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f8a5b2a19945..fb2a0bd826b9 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h | |||
@@ -109,7 +109,6 @@ struct dev_pagemap { | |||
109 | struct percpu_ref *ref; | 109 | struct percpu_ref *ref; |
110 | struct percpu_ref internal_ref; | 110 | struct percpu_ref internal_ref; |
111 | struct completion done; | 111 | struct completion done; |
112 | struct device *dev; | ||
113 | enum memory_type type; | 112 | enum memory_type type; |
114 | unsigned int flags; | 113 | unsigned int flags; |
115 | u64 pci_p2pdma_bus_offset; | 114 | u64 pci_p2pdma_bus_offset; |
@@ -124,6 +123,8 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) | |||
124 | } | 123 | } |
125 | 124 | ||
126 | #ifdef CONFIG_ZONE_DEVICE | 125 | #ifdef CONFIG_ZONE_DEVICE |
126 | void *memremap_pages(struct dev_pagemap *pgmap, int nid); | ||
127 | void memunmap_pages(struct dev_pagemap *pgmap); | ||
127 | void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); | 128 | void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); |
128 | void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); | 129 | void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); |
129 | struct dev_pagemap *get_dev_pagemap(unsigned long pfn, | 130 | struct dev_pagemap *get_dev_pagemap(unsigned long pfn, |
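
memremap_pages()/memunmap_pages() are the device-less counterparts of the devm_ variants, and pair naturally with the non-devm request_free_mem_region() added in the ioport.h hunk above. A rough sketch of creating a ZONE_DEVICE private region without any struct device, assuming the 5.4-era dev_pagemap layout with an embedded struct resource (all local names are illustrative, and the two ops are stubs a real driver must fill in):

    #include <linux/memremap.h>
    #include <linux/ioport.h>
    #include <linux/mm.h>
    #include <linux/numa.h>
    #include <linux/err.h>

    static vm_fault_t my_migrate_to_ram(struct vm_fault *vmf)
    {
            /* migrate the faulting device-private page back to system RAM */
            return VM_FAULT_SIGBUS;            /* placeholder */
    }

    static void my_page_free(struct page *page)
    {
            /* hand the backing device memory back to the driver allocator */
    }

    static const struct dev_pagemap_ops my_pgmap_ops = {
            .page_free      = my_page_free,
            .migrate_to_ram = my_migrate_to_ram,
    };

    static struct dev_pagemap my_pgmap;

    static int my_create_devmem(unsigned long size)
    {
            struct resource *res;
            void *addr;

            /* non-devm variant exported by this series */
            res = request_free_mem_region(&iomem_resource, size, "my-devmem");
            if (IS_ERR(res))
                    return PTR_ERR(res);

            my_pgmap.type = MEMORY_DEVICE_PRIVATE;
            my_pgmap.res  = *res;
            my_pgmap.ops  = &my_pgmap_ops;

            /*
             * No struct device needed any more; leaving ->ref NULL makes
             * memremap_pages() use the built-in internal_ref, torn down
             * later by memunmap_pages(&my_pgmap).
             */
            addr = memremap_pages(&my_pgmap, NUMA_NO_NODE);
            return PTR_ERR_OR_ZERO(addr);
    }
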
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 7f04754c7f2b..72120061b7d4 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
@@ -166,8 +166,6 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
166 | #define MIGRATE_PFN_MIGRATE (1UL << 1) | 166 | #define MIGRATE_PFN_MIGRATE (1UL << 1) |
167 | #define MIGRATE_PFN_LOCKED (1UL << 2) | 167 | #define MIGRATE_PFN_LOCKED (1UL << 2) |
168 | #define MIGRATE_PFN_WRITE (1UL << 3) | 168 | #define MIGRATE_PFN_WRITE (1UL << 3) |
169 | #define MIGRATE_PFN_DEVICE (1UL << 4) | ||
170 | #define MIGRATE_PFN_ERROR (1UL << 5) | ||
171 | #define MIGRATE_PFN_SHIFT 6 | 169 | #define MIGRATE_PFN_SHIFT 6 |
172 | 170 | ||
173 | static inline struct page *migrate_pfn_to_page(unsigned long mpfn) | 171 | static inline struct page *migrate_pfn_to_page(unsigned long mpfn) |
@@ -182,107 +180,27 @@ static inline unsigned long migrate_pfn(unsigned long pfn) | |||
182 | return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; | 180 | return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; |
183 | } | 181 | } |
184 | 182 | ||
185 | /* | 183 | struct migrate_vma { |
186 | * struct migrate_vma_ops - migrate operation callback | 184 | struct vm_area_struct *vma; |
187 | * | 185 | /* |
188 | * @alloc_and_copy: alloc destination memory and copy source memory to it | 186 | * Both src and dst array must be big enough for |
189 | * @finalize_and_map: allow caller to map the successfully migrated pages | 187 | * (end - start) >> PAGE_SHIFT entries. |
190 | * | 188 | * |
191 | * | 189 | * The src array must not be modified by the caller after |
192 | * The alloc_and_copy() callback happens once all source pages have been locked, | 190 | * migrate_vma_setup(), and must not change the dst array after |
193 | * unmapped and checked (checked whether pinned or not). All pages that can be | 191 | * migrate_vma_pages() returns. |
194 | * migrated will have an entry in the src array set with the pfn value of the | 192 | */ |
195 | * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set (other | 193 | unsigned long *dst; |
196 | * flags might be set but should be ignored by the callback). | 194 | unsigned long *src; |
197 | * | 195 | unsigned long cpages; |
198 | * The alloc_and_copy() callback can then allocate destination memory and copy | 196 | unsigned long npages; |
199 | * source memory to it for all those entries (ie with MIGRATE_PFN_VALID and | 197 | unsigned long start; |
200 | * MIGRATE_PFN_MIGRATE flag set). Once these are allocated and copied, the | 198 | unsigned long end; |
201 | * callback must update each corresponding entry in the dst array with the pfn | ||
202 | * value of the destination page and with the MIGRATE_PFN_VALID and | ||
203 | * MIGRATE_PFN_LOCKED flags set (destination pages must have their struct pages | ||
204 | * locked, via lock_page()). | ||
205 | * | ||
206 | * At this point the alloc_and_copy() callback is done and returns. | ||
207 | * | ||
208 | * Note that the callback does not have to migrate all the pages that are | ||
209 | * marked with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration | ||
210 | * from device memory to system memory (ie the MIGRATE_PFN_DEVICE flag is also | ||
211 | * set in the src array entry). If the device driver cannot migrate a device | ||
212 | * page back to system memory, then it must set the corresponding dst array | ||
213 | * entry to MIGRATE_PFN_ERROR. This will trigger a SIGBUS if CPU tries to | ||
214 | * access any of the virtual addresses originally backed by this page. Because | ||
215 | * a SIGBUS is such a severe result for the userspace process, the device | ||
216 | * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an | ||
217 | * unrecoverable state. | ||
218 | * | ||
219 | * For empty entry inside CPU page table (pte_none() or pmd_none() is true) we | ||
220 | * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus | ||
221 | * allowing device driver to allocate device memory for those unback virtual | ||
222 | * address. For this the device driver simply have to allocate device memory | ||
223 | * and properly set the destination entry like for regular migration. Note that | ||
224 | * this can still fails and thus inside the device driver must check if the | ||
225 | * migration was successful for those entry inside the finalize_and_map() | ||
226 | * callback just like for regular migration. | ||
227 | * | ||
228 | * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES | ||
229 | * OR BAD THINGS WILL HAPPEN ! | ||
230 | * | ||
231 | * | ||
232 | * The finalize_and_map() callback happens after struct page migration from | ||
233 | * source to destination (destination struct pages are the struct pages for the | ||
234 | * memory allocated by the alloc_and_copy() callback). Migration can fail, and | ||
235 | * thus the finalize_and_map() allows the driver to inspect which pages were | ||
236 | * successfully migrated, and which were not. Successfully migrated pages will | ||
237 | * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. | ||
238 | * | ||
239 | * It is safe to update device page table from within the finalize_and_map() | ||
240 | * callback because both destination and source page are still locked, and the | ||
241 | * mmap_sem is held in read mode (hence no one can unmap the range being | ||
242 | * migrated). | ||
243 | * | ||
244 | * Once callback is done cleaning up things and updating its page table (if it | ||
245 | * chose to do so, this is not an obligation) then it returns. At this point, | ||
246 | * the HMM core will finish up the final steps, and the migration is complete. | ||
247 | * | ||
248 | * THE finalize_and_map() CALLBACK MUST NOT CHANGE ANY OF THE SRC OR DST ARRAY | ||
249 | * ENTRIES OR BAD THINGS WILL HAPPEN ! | ||
250 | */ | ||
251 | struct migrate_vma_ops { | ||
252 | void (*alloc_and_copy)(struct vm_area_struct *vma, | ||
253 | const unsigned long *src, | ||
254 | unsigned long *dst, | ||
255 | unsigned long start, | ||
256 | unsigned long end, | ||
257 | void *private); | ||
258 | void (*finalize_and_map)(struct vm_area_struct *vma, | ||
259 | const unsigned long *src, | ||
260 | const unsigned long *dst, | ||
261 | unsigned long start, | ||
262 | unsigned long end, | ||
263 | void *private); | ||
264 | }; | 199 | }; |
265 | 200 | ||
266 | #if defined(CONFIG_MIGRATE_VMA_HELPER) | 201 | int migrate_vma_setup(struct migrate_vma *args); |
267 | int migrate_vma(const struct migrate_vma_ops *ops, | 202 | void migrate_vma_pages(struct migrate_vma *migrate); |
268 | struct vm_area_struct *vma, | 203 | void migrate_vma_finalize(struct migrate_vma *migrate); |
269 | unsigned long start, | ||
270 | unsigned long end, | ||
271 | unsigned long *src, | ||
272 | unsigned long *dst, | ||
273 | void *private); | ||
274 | #else | ||
275 | static inline int migrate_vma(const struct migrate_vma_ops *ops, | ||
276 | struct vm_area_struct *vma, | ||
277 | unsigned long start, | ||
278 | unsigned long end, | ||
279 | unsigned long *src, | ||
280 | unsigned long *dst, | ||
281 | void *private) | ||
282 | { | ||
283 | return -EINVAL; | ||
284 | } | ||
285 | #endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */ | ||
286 | 204 | ||
287 | #endif /* CONFIG_MIGRATION */ | 205 | #endif /* CONFIG_MIGRATION */ |
288 | 206 | ||
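To illustrate the replacement API above: a minimal sketch of the new three-step driver flow (setup, copy into destinations, pages, finalize). example_migrate_range() and alloc_dst_page() are hypothetical driver helpers, not part of this patch.

#include <linux/migrate.h>

static int example_migrate_range(struct vm_area_struct *vma,
				 unsigned long start, unsigned long end)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	struct migrate_vma args = {};
	unsigned long *src, *dst;
	unsigned long i;
	int ret = -ENOMEM;

	src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
	dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
	if (!src || !dst)
		goto out;

	args.vma = vma;
	args.start = start;
	args.end = end;
	args.src = src;
	args.dst = dst;

	/* step 1: collect, isolate and unmap the source pages */
	ret = migrate_vma_setup(&args);
	if (ret)
		goto out;

	for (i = 0; i < npages; i++) {
		struct page *dpage;

		if (!(src[i] & MIGRATE_PFN_MIGRATE))
			continue;
		dpage = alloc_dst_page();	/* hypothetical helper */
		if (!dpage)
			continue;	/* dst[i] stays 0: this page is skipped */
		lock_page(dpage);
		dst[i] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
		/* the driver would copy the source data into dpage here */
	}

	/* step 2: switch the struct pages over */
	migrate_vma_pages(&args);
	/* a driver updates its device page tables at this point */
	/* step 3: restore CPU access and drop the extra references */
	migrate_vma_finalize(&args);
out:
	kfree(src);
	kfree(dst);
	return ret;
}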
diff --git a/include/linux/mm.h b/include/linux/mm.h index 0334ca97c584..7cf955feb823 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -1430,54 +1430,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address, | |||
1430 | void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 1430 | void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
1431 | unsigned long start, unsigned long end); | 1431 | unsigned long start, unsigned long end); |
1432 | 1432 | ||
1433 | /** | ||
1434 | * mm_walk - callbacks for walk_page_range | ||
1435 | * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry | ||
1436 | * this handler should only handle pud_trans_huge() puds. | ||
1437 | * the pmd_entry or pte_entry callbacks will be used for | ||
1438 | * regular PUDs. | ||
1439 | * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry | ||
1440 | * this handler is required to be able to handle | ||
1441 | * pmd_trans_huge() pmds. They may simply choose to | ||
1442 | * split_huge_page() instead of handling it explicitly. | ||
1443 | * @pte_entry: if set, called for each non-empty PTE (4th-level) entry | ||
1444 | * @pte_hole: if set, called for each hole at all levels | ||
1445 | * @hugetlb_entry: if set, called for each hugetlb entry | ||
1446 | * @test_walk: caller specific callback function to determine whether | ||
1447 | * we walk over the current vma or not. Returning 0 | ||
1448 | * value means "do page table walk over the current vma," | ||
1449 | * and a negative one means "abort current page table walk | ||
1450 | * right now." 1 means "skip the current vma." | ||
1451 | * @mm: mm_struct representing the target process of page table walk | ||
1452 | * @vma: vma currently walked (NULL if walking outside vmas) | ||
1453 | * @private: private data for callbacks' usage | ||
1454 | * | ||
1455 | * (see the comment on walk_page_range() for more details) | ||
1456 | */ | ||
1457 | struct mm_walk { | ||
1458 | int (*pud_entry)(pud_t *pud, unsigned long addr, | ||
1459 | unsigned long next, struct mm_walk *walk); | ||
1460 | int (*pmd_entry)(pmd_t *pmd, unsigned long addr, | ||
1461 | unsigned long next, struct mm_walk *walk); | ||
1462 | int (*pte_entry)(pte_t *pte, unsigned long addr, | ||
1463 | unsigned long next, struct mm_walk *walk); | ||
1464 | int (*pte_hole)(unsigned long addr, unsigned long next, | ||
1465 | struct mm_walk *walk); | ||
1466 | int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, | ||
1467 | unsigned long addr, unsigned long next, | ||
1468 | struct mm_walk *walk); | ||
1469 | int (*test_walk)(unsigned long addr, unsigned long next, | ||
1470 | struct mm_walk *walk); | ||
1471 | struct mm_struct *mm; | ||
1472 | struct vm_area_struct *vma; | ||
1473 | void *private; | ||
1474 | }; | ||
1475 | |||
1476 | struct mmu_notifier_range; | 1433 | struct mmu_notifier_range; |
1477 | 1434 | ||
1478 | int walk_page_range(unsigned long addr, unsigned long end, | ||
1479 | struct mm_walk *walk); | ||
1480 | int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); | ||
1481 | void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, | 1435 | void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, |
1482 | unsigned long end, unsigned long floor, unsigned long ceiling); | 1436 | unsigned long end, unsigned long floor, unsigned long ceiling); |
1483 | int copy_page_range(struct mm_struct *dst, struct mm_struct *src, | 1437 | int copy_page_range(struct mm_struct *dst, struct mm_struct *src, |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a7a1083b6fb..0b739f360cec 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -25,7 +25,6 @@ | |||
25 | 25 | ||
26 | struct address_space; | 26 | struct address_space; |
27 | struct mem_cgroup; | 27 | struct mem_cgroup; |
28 | struct hmm; | ||
29 | 28 | ||
30 | /* | 29 | /* |
31 | * Each physical page in the system has a struct page associated with | 30 | * Each physical page in the system has a struct page associated with |
@@ -511,11 +510,6 @@ struct mm_struct { | |||
511 | atomic_long_t hugetlb_usage; | 510 | atomic_long_t hugetlb_usage; |
512 | #endif | 511 | #endif |
513 | struct work_struct async_put_work; | 512 | struct work_struct async_put_work; |
514 | |||
515 | #ifdef CONFIG_HMM_MIRROR | ||
516 | /* HMM needs to track a few things per mm */ | ||
517 | struct hmm *hmm; | ||
518 | #endif | ||
519 | } __randomize_layout; | 513 | } __randomize_layout; |
520 | 514 | ||
521 | /* | 515 | /* |
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b6c004bd9f6a..1bd8e6a09a3c 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h | |||
@@ -42,6 +42,10 @@ enum mmu_notifier_event { | |||
42 | 42 | ||
43 | #ifdef CONFIG_MMU_NOTIFIER | 43 | #ifdef CONFIG_MMU_NOTIFIER |
44 | 44 | ||
45 | #ifdef CONFIG_LOCKDEP | ||
46 | extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; | ||
47 | #endif | ||
48 | |||
45 | /* | 49 | /* |
46 | * The mmu notifier_mm structure is allocated and installed in | 50 | * The mmu notifier_mm structure is allocated and installed in |
47 | * mm->mmu_notifier_mm inside the mm_take_all_locks() protected | 51 | * mm->mmu_notifier_mm inside the mm_take_all_locks() protected |
@@ -211,6 +215,19 @@ struct mmu_notifier_ops { | |||
211 | */ | 215 | */ |
212 | void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, | 216 | void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, |
213 | unsigned long start, unsigned long end); | 217 | unsigned long start, unsigned long end); |
218 | |||
219 | /* | ||
220 | * These callbacks are used with the get/put interface to manage the | ||
221 | * lifetime of the mmu_notifier memory. alloc_notifier() returns a new | ||
222 | * notifier for use with the mm. | ||
223 | * | ||
224 | * free_notifier() is only called after the mmu_notifier has been | ||
225 | * fully put, calls to any ops callback are prevented and no ops | ||
226 | * callbacks are currently running. It is called from a SRCU callback | ||
227 | * and cannot sleep. | ||
228 | */ | ||
229 | struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm); | ||
230 | void (*free_notifier)(struct mmu_notifier *mn); | ||
214 | }; | 231 | }; |
215 | 232 | ||
216 | /* | 233 | /* |
@@ -227,6 +244,9 @@ struct mmu_notifier_ops { | |||
227 | struct mmu_notifier { | 244 | struct mmu_notifier { |
228 | struct hlist_node hlist; | 245 | struct hlist_node hlist; |
229 | const struct mmu_notifier_ops *ops; | 246 | const struct mmu_notifier_ops *ops; |
247 | struct mm_struct *mm; | ||
248 | struct rcu_head rcu; | ||
249 | unsigned int users; | ||
230 | }; | 250 | }; |
231 | 251 | ||
232 | static inline int mm_has_notifiers(struct mm_struct *mm) | 252 | static inline int mm_has_notifiers(struct mm_struct *mm) |
@@ -234,14 +254,27 @@ static inline int mm_has_notifiers(struct mm_struct *mm) | |||
234 | return unlikely(mm->mmu_notifier_mm); | 254 | return unlikely(mm->mmu_notifier_mm); |
235 | } | 255 | } |
236 | 256 | ||
257 | struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, | ||
258 | struct mm_struct *mm); | ||
259 | static inline struct mmu_notifier * | ||
260 | mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm) | ||
261 | { | ||
262 | struct mmu_notifier *ret; | ||
263 | |||
264 | down_write(&mm->mmap_sem); | ||
265 | ret = mmu_notifier_get_locked(ops, mm); | ||
266 | up_write(&mm->mmap_sem); | ||
267 | return ret; | ||
268 | } | ||
269 | void mmu_notifier_put(struct mmu_notifier *mn); | ||
270 | void mmu_notifier_synchronize(void); | ||
271 | |||
237 | extern int mmu_notifier_register(struct mmu_notifier *mn, | 272 | extern int mmu_notifier_register(struct mmu_notifier *mn, |
238 | struct mm_struct *mm); | 273 | struct mm_struct *mm); |
239 | extern int __mmu_notifier_register(struct mmu_notifier *mn, | 274 | extern int __mmu_notifier_register(struct mmu_notifier *mn, |
240 | struct mm_struct *mm); | 275 | struct mm_struct *mm); |
241 | extern void mmu_notifier_unregister(struct mmu_notifier *mn, | 276 | extern void mmu_notifier_unregister(struct mmu_notifier *mn, |
242 | struct mm_struct *mm); | 277 | struct mm_struct *mm); |
243 | extern void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, | ||
244 | struct mm_struct *mm); | ||
245 | extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); | 278 | extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); |
246 | extern void __mmu_notifier_release(struct mm_struct *mm); | 279 | extern void __mmu_notifier_release(struct mm_struct *mm); |
247 | extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | 280 | extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, |
@@ -310,25 +343,36 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm, | |||
310 | static inline void | 343 | static inline void |
311 | mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) | 344 | mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) |
312 | { | 345 | { |
346 | might_sleep(); | ||
347 | |||
348 | lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); | ||
313 | if (mm_has_notifiers(range->mm)) { | 349 | if (mm_has_notifiers(range->mm)) { |
314 | range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; | 350 | range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; |
315 | __mmu_notifier_invalidate_range_start(range); | 351 | __mmu_notifier_invalidate_range_start(range); |
316 | } | 352 | } |
353 | lock_map_release(&__mmu_notifier_invalidate_range_start_map); | ||
317 | } | 354 | } |
318 | 355 | ||
319 | static inline int | 356 | static inline int |
320 | mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) | 357 | mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) |
321 | { | 358 | { |
359 | int ret = 0; | ||
360 | |||
361 | lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); | ||
322 | if (mm_has_notifiers(range->mm)) { | 362 | if (mm_has_notifiers(range->mm)) { |
323 | range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; | 363 | range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; |
324 | return __mmu_notifier_invalidate_range_start(range); | 364 | ret = __mmu_notifier_invalidate_range_start(range); |
325 | } | 365 | } |
326 | return 0; | 366 | lock_map_release(&__mmu_notifier_invalidate_range_start_map); |
367 | return ret; | ||
327 | } | 368 | } |
328 | 369 | ||
329 | static inline void | 370 | static inline void |
330 | mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) | 371 | mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) |
331 | { | 372 | { |
373 | if (mmu_notifier_range_blockable(range)) | ||
374 | might_sleep(); | ||
375 | |||
332 | if (mm_has_notifiers(range->mm)) | 376 | if (mm_has_notifiers(range->mm)) |
333 | __mmu_notifier_invalidate_range_end(range, false); | 377 | __mmu_notifier_invalidate_range_end(range, false); |
334 | } | 378 | } |
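A rough sketch of what these annotations are meant to catch: a driver's invalidate_range_start callback has to honor the blockable flag, otherwise the new lockdep map and might_sleep() calls flag it even when no notifier is registered. my_invalidate_range_start() and my_device_lock are hypothetical driver names.

#include <linux/mmu_notifier.h>

static DEFINE_MUTEX(my_device_lock);	/* hypothetical driver lock */

static int my_invalidate_range_start(struct mmu_notifier *mn,
				     const struct mmu_notifier_range *range)
{
	if (!mmu_notifier_range_blockable(range)) {
		/* non-blocking caller (e.g. the OOM reaper): only trylock */
		if (!mutex_trylock(&my_device_lock))
			return -EAGAIN;
	} else {
		mutex_lock(&my_device_lock);
	}
	/* ...tear down device mappings covering range->start..range->end... */
	mutex_unlock(&my_device_lock);
	return 0;
}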
@@ -482,9 +526,6 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, | |||
482 | set_pte_at(___mm, ___address, __ptep, ___pte); \ | 526 | set_pte_at(___mm, ___address, __ptep, ___pte); \ |
483 | }) | 527 | }) |
484 | 528 | ||
485 | extern void mmu_notifier_call_srcu(struct rcu_head *rcu, | ||
486 | void (*func)(struct rcu_head *rcu)); | ||
487 | |||
488 | #else /* CONFIG_MMU_NOTIFIER */ | 529 | #else /* CONFIG_MMU_NOTIFIER */ |
489 | 530 | ||
490 | struct mmu_notifier_range { | 531 | struct mmu_notifier_range { |
@@ -581,6 +622,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) | |||
581 | #define pudp_huge_clear_flush_notify pudp_huge_clear_flush | 622 | #define pudp_huge_clear_flush_notify pudp_huge_clear_flush |
582 | #define set_pte_at_notify set_pte_at | 623 | #define set_pte_at_notify set_pte_at |
583 | 624 | ||
625 | static inline void mmu_notifier_synchronize(void) | ||
626 | { | ||
627 | } | ||
628 | |||
584 | #endif /* CONFIG_MMU_NOTIFIER */ | 629 | #endif /* CONFIG_MMU_NOTIFIER */ |
585 | 630 | ||
586 | #endif /* _LINUX_MMU_NOTIFIER_H */ | 631 | #endif /* _LINUX_MMU_NOTIFIER_H */ |
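A hedged sketch of the get/put attachment idiom these declarations enable; struct example_notifier and the example_* helpers are hypothetical driver code, only the ops and functions shown in this patch are assumed.

#include <linux/mmu_notifier.h>
#include <linux/slab.h>

struct example_notifier {
	struct mmu_notifier mn;
	/* per-mm driver state would live here */
};

static struct mmu_notifier *example_alloc_notifier(struct mm_struct *mm)
{
	struct example_notifier *en = kzalloc(sizeof(*en), GFP_KERNEL);

	if (!en)
		return ERR_PTR(-ENOMEM);
	return &en->mn;
}

static void example_free_notifier(struct mmu_notifier *mn)
{
	/* runs from an SRCU callback after the final put; must not sleep */
	kfree(container_of(mn, struct example_notifier, mn));
}

static const struct mmu_notifier_ops example_ops = {
	/* .release / .invalidate_range_start / ... as the driver needs */
	.alloc_notifier = example_alloc_notifier,
	.free_notifier = example_free_notifier,
};

static struct example_notifier *example_attach(struct mm_struct *mm)
{
	struct mmu_notifier *mn;

	/* returns the existing (example_ops, mm) registration or allocates one */
	mn = mmu_notifier_get(&example_ops, mm);
	if (IS_ERR(mn))
		return ERR_CAST(mn);
	return container_of(mn, struct example_notifier, mn);
}

/*
 * Each successful example_attach() is balanced by mmu_notifier_put(&en->mn),
 * and the module exit path calls mmu_notifier_synchronize() so that
 * example_free_notifier() has finished before the module text goes away.
 */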
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h new file mode 100644 index 000000000000..bddd9759bab9 --- /dev/null +++ b/include/linux/pagewalk.h | |||
@@ -0,0 +1,66 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | #ifndef _LINUX_PAGEWALK_H | ||
3 | #define _LINUX_PAGEWALK_H | ||
4 | |||
5 | #include <linux/mm.h> | ||
6 | |||
7 | struct mm_walk; | ||
8 | |||
9 | /** | ||
10 | * mm_walk_ops - callbacks for walk_page_range | ||
11 | * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry | ||
12 | * this handler should only handle pud_trans_huge() puds. | ||
13 | * the pmd_entry or pte_entry callbacks will be used for | ||
14 | * regular PUDs. | ||
15 | * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry | ||
16 | * this handler is required to be able to handle | ||
17 | * pmd_trans_huge() pmds. They may simply choose to | ||
18 | * split_huge_page() instead of handling it explicitly. | ||
19 | * @pte_entry: if set, called for each non-empty PTE (4th-level) entry | ||
20 | * @pte_hole: if set, called for each hole at all levels | ||
21 | * @hugetlb_entry: if set, called for each hugetlb entry | ||
22 | * @test_walk: caller specific callback function to determine whether | ||
23 | * we walk over the current vma or not. Returning 0 means | ||
24 | * "do page table walk over the current vma", returning | ||
25 | * a negative value means "abort current page table walk | ||
26 | * right now" and returning 1 means "skip the current vma" | ||
27 | */ | ||
28 | struct mm_walk_ops { | ||
29 | int (*pud_entry)(pud_t *pud, unsigned long addr, | ||
30 | unsigned long next, struct mm_walk *walk); | ||
31 | int (*pmd_entry)(pmd_t *pmd, unsigned long addr, | ||
32 | unsigned long next, struct mm_walk *walk); | ||
33 | int (*pte_entry)(pte_t *pte, unsigned long addr, | ||
34 | unsigned long next, struct mm_walk *walk); | ||
35 | int (*pte_hole)(unsigned long addr, unsigned long next, | ||
36 | struct mm_walk *walk); | ||
37 | int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, | ||
38 | unsigned long addr, unsigned long next, | ||
39 | struct mm_walk *walk); | ||
40 | int (*test_walk)(unsigned long addr, unsigned long next, | ||
41 | struct mm_walk *walk); | ||
42 | }; | ||
43 | |||
44 | /** | ||
45 | * mm_walk - walk_page_range data | ||
46 | * @ops: operation to call during the walk | ||
47 | * @mm: mm_struct representing the target process of page table walk | ||
48 | * @vma: vma currently walked (NULL if walking outside vmas) | ||
49 | * @private: private data for callbacks' usage | ||
50 | * | ||
51 | * (see the comment on walk_page_range() for more details) | ||
52 | */ | ||
53 | struct mm_walk { | ||
54 | const struct mm_walk_ops *ops; | ||
55 | struct mm_struct *mm; | ||
56 | struct vm_area_struct *vma; | ||
57 | void *private; | ||
58 | }; | ||
59 | |||
60 | int walk_page_range(struct mm_struct *mm, unsigned long start, | ||
61 | unsigned long end, const struct mm_walk_ops *ops, | ||
62 | void *private); | ||
63 | int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, | ||
64 | void *private); | ||
65 | |||
66 | #endif /* _LINUX_PAGEWALK_H */ | ||
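A minimal sketch of a caller of the split API declared above, counting present PTEs in a range; the count_* names are hypothetical.

#include <linux/pagewalk.h>

struct present_count {
	unsigned long pages;
};

static int count_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	struct present_count *pc = walk->private;

	if (pte_present(*pte))
		pc->pages++;
	return 0;	/* a non-zero return would abort the walk */
}

static const struct mm_walk_ops count_walk_ops = {
	.pte_entry = count_pte_entry,
};

static unsigned long count_present_pages(struct mm_struct *mm,
					 unsigned long start, unsigned long end)
{
	struct present_count pc = { 0 };

	down_read(&mm->mmap_sem);	/* the walk now asserts mmap_sem is held */
	walk_page_range(mm, start, end, &count_walk_ops, &pc);
	up_read(&mm->mmap_sem);
	return pc.pages;
}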
diff --git a/include/linux/sched.h b/include/linux/sched.h index b75b28287005..70db597d6fd4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -958,6 +958,10 @@ struct task_struct { | |||
958 | struct mutex_waiter *blocked_on; | 958 | struct mutex_waiter *blocked_on; |
959 | #endif | 959 | #endif |
960 | 960 | ||
961 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP | ||
962 | int non_block_count; | ||
963 | #endif | ||
964 | |||
961 | #ifdef CONFIG_TRACE_IRQFLAGS | 965 | #ifdef CONFIG_TRACE_IRQFLAGS |
962 | unsigned int irq_events; | 966 | unsigned int irq_events; |
963 | unsigned long hardirq_enable_ip; | 967 | unsigned long hardirq_enable_ip; |
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 1052d0d62be7..a91b2af64ec4 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h | |||
@@ -42,7 +42,7 @@ struct ib_ucontext; | |||
42 | struct ib_umem_odp; | 42 | struct ib_umem_odp; |
43 | 43 | ||
44 | struct ib_umem { | 44 | struct ib_umem { |
45 | struct ib_ucontext *context; | 45 | struct ib_device *ibdev; |
46 | struct mm_struct *owning_mm; | 46 | struct mm_struct *owning_mm; |
47 | size_t length; | 47 | size_t length; |
48 | unsigned long address; | 48 | unsigned long address; |
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 479db5c98ff6..253df1a1fa54 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h | |||
@@ -37,11 +37,6 @@ | |||
37 | #include <rdma/ib_verbs.h> | 37 | #include <rdma/ib_verbs.h> |
38 | #include <linux/interval_tree.h> | 38 | #include <linux/interval_tree.h> |
39 | 39 | ||
40 | struct umem_odp_node { | ||
41 | u64 __subtree_last; | ||
42 | struct rb_node rb; | ||
43 | }; | ||
44 | |||
45 | struct ib_umem_odp { | 40 | struct ib_umem_odp { |
46 | struct ib_umem umem; | 41 | struct ib_umem umem; |
47 | struct ib_ucontext_per_mm *per_mm; | 42 | struct ib_ucontext_per_mm *per_mm; |
@@ -72,7 +67,15 @@ struct ib_umem_odp { | |||
72 | int npages; | 67 | int npages; |
73 | 68 | ||
74 | /* Tree tracking */ | 69 | /* Tree tracking */ |
75 | struct umem_odp_node interval_tree; | 70 | struct interval_tree_node interval_tree; |
71 | |||
72 | /* | ||
73 | * An implicit odp umem cannot be DMA mapped, has 0 length, and serves | ||
74 | * only as an anchor for the driver to hold onto the per_mm. FIXME: | ||
75 | * This should be removed and drivers should work with the per_mm | ||
76 | * directly. | ||
77 | */ | ||
78 | bool is_implicit_odp; | ||
76 | 79 | ||
77 | struct completion notifier_completion; | 80 | struct completion notifier_completion; |
78 | int dying; | 81 | int dying; |
@@ -88,14 +91,13 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) | |||
88 | /* Returns the first page of an ODP umem. */ | 91 | /* Returns the first page of an ODP umem. */ |
89 | static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp) | 92 | static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp) |
90 | { | 93 | { |
91 | return ALIGN_DOWN(umem_odp->umem.address, 1UL << umem_odp->page_shift); | 94 | return umem_odp->interval_tree.start; |
92 | } | 95 | } |
93 | 96 | ||
94 | /* Returns the address of the page after the last one of an ODP umem. */ | 97 | /* Returns the address of the page after the last one of an ODP umem. */ |
95 | static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp) | 98 | static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp) |
96 | { | 99 | { |
97 | return ALIGN(umem_odp->umem.address + umem_odp->umem.length, | 100 | return umem_odp->interval_tree.last + 1; |
98 | 1UL << umem_odp->page_shift); | ||
99 | } | 101 | } |
100 | 102 | ||
101 | static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) | 103 | static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) |
@@ -120,25 +122,20 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) | |||
120 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING | 122 | #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING |
121 | 123 | ||
122 | struct ib_ucontext_per_mm { | 124 | struct ib_ucontext_per_mm { |
123 | struct ib_ucontext *context; | 125 | struct mmu_notifier mn; |
124 | struct mm_struct *mm; | ||
125 | struct pid *tgid; | 126 | struct pid *tgid; |
126 | bool active; | ||
127 | 127 | ||
128 | struct rb_root_cached umem_tree; | 128 | struct rb_root_cached umem_tree; |
129 | /* Protects umem_tree */ | 129 | /* Protects umem_tree */ |
130 | struct rw_semaphore umem_rwsem; | 130 | struct rw_semaphore umem_rwsem; |
131 | |||
132 | struct mmu_notifier mn; | ||
133 | unsigned int odp_mrs_count; | ||
134 | |||
135 | struct list_head ucontext_list; | ||
136 | struct rcu_head rcu; | ||
137 | }; | 131 | }; |
138 | 132 | ||
139 | int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); | 133 | struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, |
140 | struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root_umem, | 134 | size_t size, int access); |
141 | unsigned long addr, size_t size); | 135 | struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, |
136 | int access); | ||
137 | struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, | ||
138 | unsigned long addr, size_t size); | ||
142 | void ib_umem_odp_release(struct ib_umem_odp *umem_odp); | 139 | void ib_umem_odp_release(struct ib_umem_odp *umem_odp); |
143 | 140 | ||
144 | int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, | 141 | int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, |
@@ -163,8 +160,17 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, | |||
163 | * Find first region intersecting with address range. | 160 | * Find first region intersecting with address range. |
164 | * Return NULL if not found | 161 | * Return NULL if not found |
165 | */ | 162 | */ |
166 | struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, | 163 | static inline struct ib_umem_odp * |
167 | u64 addr, u64 length); | 164 | rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length) |
165 | { | ||
166 | struct interval_tree_node *node; | ||
167 | |||
168 | node = interval_tree_iter_first(root, addr, addr + length - 1); | ||
169 | if (!node) | ||
170 | return NULL; | ||
171 | return container_of(node, struct ib_umem_odp, interval_tree); | ||
172 | |||
173 | } | ||
168 | 174 | ||
169 | static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, | 175 | static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, |
170 | unsigned long mmu_seq) | 176 | unsigned long mmu_seq) |
@@ -185,9 +191,11 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, | |||
185 | 191 | ||
186 | #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ | 192 | #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ |
187 | 193 | ||
188 | static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) | 194 | static inline struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, |
195 | unsigned long addr, | ||
196 | size_t size, int access) | ||
189 | { | 197 | { |
190 | return -EINVAL; | 198 | return ERR_PTR(-EINVAL); |
191 | } | 199 | } |
192 | 200 | ||
193 | static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {} | 201 | static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {} |
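A small sketch of the now-inline interval-tree lookup in use; range_overlaps_odp() is a hypothetical helper and assumes the locking model where umem_rwsem protects umem_tree.

#include <rdma/ib_umem_odp.h>

static bool range_overlaps_odp(struct ib_ucontext_per_mm *per_mm,
			       u64 addr, u64 length)
{
	bool found;

	down_read(&per_mm->umem_rwsem);	/* umem_rwsem protects umem_tree */
	found = rbt_ib_umem_lookup(&per_mm->umem_tree, addr, length) != NULL;
	up_read(&per_mm->umem_rwsem);
	return found;
}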
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4f225175cb91..f659f4a02aa9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h | |||
@@ -1417,11 +1417,6 @@ struct ib_ucontext { | |||
1417 | 1417 | ||
1418 | bool cleanup_retryable; | 1418 | bool cleanup_retryable; |
1419 | 1419 | ||
1420 | void (*invalidate_range)(struct ib_umem_odp *umem_odp, | ||
1421 | unsigned long start, unsigned long end); | ||
1422 | struct mutex per_mm_list_lock; | ||
1423 | struct list_head per_mm_list; | ||
1424 | |||
1425 | struct ib_rdmacg_object cg_obj; | 1420 | struct ib_rdmacg_object cg_obj; |
1426 | /* | 1421 | /* |
1427 | * Implementation details of the RDMA core, don't use in drivers: | 1422 | * Implementation details of the RDMA core, don't use in drivers: |
@@ -2378,6 +2373,8 @@ struct ib_device_ops { | |||
2378 | u64 iova); | 2373 | u64 iova); |
2379 | int (*unmap_fmr)(struct list_head *fmr_list); | 2374 | int (*unmap_fmr)(struct list_head *fmr_list); |
2380 | int (*dealloc_fmr)(struct ib_fmr *fmr); | 2375 | int (*dealloc_fmr)(struct ib_fmr *fmr); |
2376 | void (*invalidate_range)(struct ib_umem_odp *umem_odp, | ||
2377 | unsigned long start, unsigned long end); | ||
2381 | int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); | 2378 | int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); |
2382 | int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); | 2379 | int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); |
2383 | struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device, | 2380 | struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device, |
diff --git a/kernel/fork.c b/kernel/fork.c index 53e780748fe3..5a0fd518e04e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -1009,7 +1009,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, | |||
1009 | mm_init_owner(mm, p); | 1009 | mm_init_owner(mm, p); |
1010 | RCU_INIT_POINTER(mm->exe_file, NULL); | 1010 | RCU_INIT_POINTER(mm->exe_file, NULL); |
1011 | mmu_notifier_mm_init(mm); | 1011 | mmu_notifier_mm_init(mm); |
1012 | hmm_mm_init(mm); | ||
1013 | init_tlb_flush_pending(mm); | 1012 | init_tlb_flush_pending(mm); |
1014 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS | 1013 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
1015 | mm->pmd_huge_pte = NULL; | 1014 | mm->pmd_huge_pte = NULL; |
diff --git a/kernel/resource.c b/kernel/resource.c index 7ea4306503c5..74877e9d90ca 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -1644,19 +1644,8 @@ void resource_list_free(struct list_head *head) | |||
1644 | EXPORT_SYMBOL(resource_list_free); | 1644 | EXPORT_SYMBOL(resource_list_free); |
1645 | 1645 | ||
1646 | #ifdef CONFIG_DEVICE_PRIVATE | 1646 | #ifdef CONFIG_DEVICE_PRIVATE |
1647 | /** | 1647 | static struct resource *__request_free_mem_region(struct device *dev, |
1648 | * devm_request_free_mem_region - find free region for device private memory | 1648 | struct resource *base, unsigned long size, const char *name) |
1649 | * | ||
1650 | * @dev: device struct to bind the resource to | ||
1651 | * @size: size in bytes of the device memory to add | ||
1652 | * @base: resource tree to look in | ||
1653 | * | ||
1654 | * This function tries to find an empty range of physical address big enough to | ||
1655 | * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE | ||
1656 | * memory, which in turn allocates struct pages. | ||
1657 | */ | ||
1658 | struct resource *devm_request_free_mem_region(struct device *dev, | ||
1659 | struct resource *base, unsigned long size) | ||
1660 | { | 1649 | { |
1661 | resource_size_t end, addr; | 1650 | resource_size_t end, addr; |
1662 | struct resource *res; | 1651 | struct resource *res; |
@@ -1670,7 +1659,10 @@ struct resource *devm_request_free_mem_region(struct device *dev, | |||
1670 | REGION_DISJOINT) | 1659 | REGION_DISJOINT) |
1671 | continue; | 1660 | continue; |
1672 | 1661 | ||
1673 | res = devm_request_mem_region(dev, addr, size, dev_name(dev)); | 1662 | if (dev) |
1663 | res = devm_request_mem_region(dev, addr, size, name); | ||
1664 | else | ||
1665 | res = request_mem_region(addr, size, name); | ||
1674 | if (!res) | 1666 | if (!res) |
1675 | return ERR_PTR(-ENOMEM); | 1667 | return ERR_PTR(-ENOMEM); |
1676 | res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; | 1668 | res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; |
@@ -1679,7 +1671,32 @@ struct resource *devm_request_free_mem_region(struct device *dev, | |||
1679 | 1671 | ||
1680 | return ERR_PTR(-ERANGE); | 1672 | return ERR_PTR(-ERANGE); |
1681 | } | 1673 | } |
1674 | |||
1675 | /** | ||
1676 | * devm_request_free_mem_region - find free region for device private memory | ||
1677 | * | ||
1678 | * @dev: device struct to bind the resource to | ||
1679 | * @size: size in bytes of the device memory to add | ||
1680 | * @base: resource tree to look in | ||
1681 | * | ||
1682 | * This function tries to find an empty range of physical address big enough to | ||
1683 | * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE | ||
1684 | * memory, which in turn allocates struct pages. | ||
1685 | */ | ||
1686 | struct resource *devm_request_free_mem_region(struct device *dev, | ||
1687 | struct resource *base, unsigned long size) | ||
1688 | { | ||
1689 | return __request_free_mem_region(dev, base, size, dev_name(dev)); | ||
1690 | } | ||
1682 | EXPORT_SYMBOL_GPL(devm_request_free_mem_region); | 1691 | EXPORT_SYMBOL_GPL(devm_request_free_mem_region); |
1692 | |||
1693 | struct resource *request_free_mem_region(struct resource *base, | ||
1694 | unsigned long size, const char *name) | ||
1695 | { | ||
1696 | return __request_free_mem_region(NULL, base, size, name); | ||
1697 | } | ||
1698 | EXPORT_SYMBOL_GPL(request_free_mem_region); | ||
1699 | |||
1683 | #endif /* CONFIG_DEVICE_PRIVATE */ | 1700 | #endif /* CONFIG_DEVICE_PRIVATE */ |
1684 | 1701 | ||
1685 | static int __init strict_iomem(char *str) | 1702 | static int __init strict_iomem(char *str) |
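A short sketch of the new device-less variant; example_reserve() and the resource name are hypothetical.

#include <linux/ioport.h>
#include <linux/err.h>

static int example_reserve(unsigned long size, struct resource **out)
{
	struct resource *res;

	/* no struct device is needed with the new variant */
	res = request_free_mem_region(&iomem_resource, size, "example");
	if (IS_ERR(res))
		return PTR_ERR(res);
	/* the helper marks the range IORES_DESC_DEVICE_PRIVATE_MEMORY */
	*out = res;
	return 0;
}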
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e8387bdd09c..f9a1346a5fa9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -3871,13 +3871,22 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3871 | /* | 3871 | /* |
3872 | * Various schedule()-time debugging checks and statistics: | 3872 | * Various schedule()-time debugging checks and statistics: |
3873 | */ | 3873 | */ |
3874 | static inline void schedule_debug(struct task_struct *prev) | 3874 | static inline void schedule_debug(struct task_struct *prev, bool preempt) |
3875 | { | 3875 | { |
3876 | #ifdef CONFIG_SCHED_STACK_END_CHECK | 3876 | #ifdef CONFIG_SCHED_STACK_END_CHECK |
3877 | if (task_stack_end_corrupted(prev)) | 3877 | if (task_stack_end_corrupted(prev)) |
3878 | panic("corrupted stack end detected inside scheduler\n"); | 3878 | panic("corrupted stack end detected inside scheduler\n"); |
3879 | #endif | 3879 | #endif |
3880 | 3880 | ||
3881 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP | ||
3882 | if (!preempt && prev->state && prev->non_block_count) { | ||
3883 | printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", | ||
3884 | prev->comm, prev->pid, prev->non_block_count); | ||
3885 | dump_stack(); | ||
3886 | add_taint(TAINT_WARN, LOCKDEP_STILL_OK); | ||
3887 | } | ||
3888 | #endif | ||
3889 | |||
3881 | if (unlikely(in_atomic_preempt_off())) { | 3890 | if (unlikely(in_atomic_preempt_off())) { |
3882 | __schedule_bug(prev); | 3891 | __schedule_bug(prev); |
3883 | preempt_count_set(PREEMPT_DISABLED); | 3892 | preempt_count_set(PREEMPT_DISABLED); |
@@ -3989,7 +3998,7 @@ static void __sched notrace __schedule(bool preempt) | |||
3989 | rq = cpu_rq(cpu); | 3998 | rq = cpu_rq(cpu); |
3990 | prev = rq->curr; | 3999 | prev = rq->curr; |
3991 | 4000 | ||
3992 | schedule_debug(prev); | 4001 | schedule_debug(prev, preempt); |
3993 | 4002 | ||
3994 | if (sched_feat(HRTICK)) | 4003 | if (sched_feat(HRTICK)) |
3995 | hrtick_clear(rq); | 4004 | hrtick_clear(rq); |
@@ -6763,7 +6772,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
6763 | rcu_sleep_check(); | 6772 | rcu_sleep_check(); |
6764 | 6773 | ||
6765 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && | 6774 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && |
6766 | !is_idle_task(current)) || | 6775 | !is_idle_task(current) && !current->non_block_count) || |
6767 | system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || | 6776 | system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || |
6768 | oops_in_progress) | 6777 | oops_in_progress) |
6769 | return; | 6778 | return; |
@@ -6779,8 +6788,8 @@ void ___might_sleep(const char *file, int line, int preempt_offset) | |||
6779 | "BUG: sleeping function called from invalid context at %s:%d\n", | 6788 | "BUG: sleeping function called from invalid context at %s:%d\n", |
6780 | file, line); | 6789 | file, line); |
6781 | printk(KERN_ERR | 6790 | printk(KERN_ERR |
6782 | "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", | 6791 | "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", |
6783 | in_atomic(), irqs_disabled(), | 6792 | in_atomic(), irqs_disabled(), current->non_block_count, |
6784 | current->pid, current->comm); | 6793 | current->pid, current->comm); |
6785 | 6794 | ||
6786 | if (task_stack_end_corrupted(current)) | 6795 | if (task_stack_end_corrupted(current)) |
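A hedged sketch of the debugging primitive this check serves: non_block_start()/end() are added to kernel.h elsewhere in this series and bump current->non_block_count; my_atomic_like_section() is hypothetical.

static void my_atomic_like_section(void)
{
	non_block_start();	/* increments current->non_block_count */
	/*
	 * Any path that reaches schedule() with a sleeping task state here is
	 * now reported as "BUG: scheduling in a non-blocking section", and
	 * ___might_sleep() no longer exempts it.
	 */
	non_block_end();
}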
diff --git a/mm/Kconfig b/mm/Kconfig index 56cec636a1fc..2fe4902ad755 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -669,23 +669,17 @@ config ZONE_DEVICE | |||
669 | 669 | ||
670 | If FS_DAX is enabled, then say Y. | 670 | If FS_DAX is enabled, then say Y. |
671 | 671 | ||
672 | config MIGRATE_VMA_HELPER | ||
673 | bool | ||
674 | |||
675 | config DEV_PAGEMAP_OPS | 672 | config DEV_PAGEMAP_OPS |
676 | bool | 673 | bool |
677 | 674 | ||
675 | # | ||
676 | # Helpers to mirror range of the CPU page tables of a process into device page | ||
677 | # tables. | ||
678 | # | ||
678 | config HMM_MIRROR | 679 | config HMM_MIRROR |
679 | bool "HMM mirror CPU page table into a device page table" | 680 | bool |
680 | depends on (X86_64 || PPC64) | 681 | depends on MMU |
681 | depends on MMU && 64BIT | 682 | depends on MMU_NOTIFIER |
682 | select MMU_NOTIFIER | ||
683 | help | ||
684 | Select HMM_MIRROR if you want to mirror range of the CPU page table of a | ||
685 | process into a device page table. Here, mirror means "keep synchronized". | ||
686 | Prerequisites: the device must provide the ability to write-protect its | ||
687 | page tables (at PAGE_SIZE granularity), and must be able to recover from | ||
688 | the resulting potential page faults. | ||
689 | 683 | ||
690 | config DEVICE_PRIVATE | 684 | config DEVICE_PRIVATE |
691 | bool "Unaddressable device memory (GPU memory, ...)" | 685 | bool "Unaddressable device memory (GPU memory, ...)" |
diff --git a/mm/hmm.c b/mm/hmm.c --- a/mm/hmm.c +++ b/mm/hmm.c | |||
@@ -8,7 +8,7 @@ | |||
8 | * Refer to include/linux/hmm.h for information about heterogeneous memory | 8 | * Refer to include/linux/hmm.h for information about heterogeneous memory |
9 | * management or HMM for short. | 9 | * management or HMM for short. |
10 | */ | 10 | */ |
11 | #include <linux/mm.h> | 11 | #include <linux/pagewalk.h> |
12 | #include <linux/hmm.h> | 12 | #include <linux/hmm.h> |
13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
14 | #include <linux/rmap.h> | 14 | #include <linux/rmap.h> |
@@ -26,101 +26,37 @@ | |||
26 | #include <linux/mmu_notifier.h> | 26 | #include <linux/mmu_notifier.h> |
27 | #include <linux/memory_hotplug.h> | 27 | #include <linux/memory_hotplug.h> |
28 | 28 | ||
29 | static const struct mmu_notifier_ops hmm_mmu_notifier_ops; | 29 | static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm) |
30 | |||
31 | /** | ||
32 | * hmm_get_or_create - register HMM against an mm (HMM internal) | ||
33 | * | ||
34 | * @mm: mm struct to attach to | ||
35 | * Returns: returns an HMM object, either by referencing the existing | ||
36 | * (per-process) object, or by creating a new one. | ||
37 | * | ||
38 | * This is not intended to be used directly by device drivers. If mm already | ||
39 | * has an HMM struct then it get a reference on it and returns it. Otherwise | ||
40 | * it allocates an HMM struct, initializes it, associate it with the mm and | ||
41 | * returns it. | ||
42 | */ | ||
43 | static struct hmm *hmm_get_or_create(struct mm_struct *mm) | ||
44 | { | 30 | { |
45 | struct hmm *hmm; | 31 | struct hmm *hmm; |
46 | 32 | ||
47 | lockdep_assert_held_write(&mm->mmap_sem); | 33 | hmm = kzalloc(sizeof(*hmm), GFP_KERNEL); |
48 | |||
49 | /* Abuse the page_table_lock to also protect mm->hmm. */ | ||
50 | spin_lock(&mm->page_table_lock); | ||
51 | hmm = mm->hmm; | ||
52 | if (mm->hmm && kref_get_unless_zero(&mm->hmm->kref)) | ||
53 | goto out_unlock; | ||
54 | spin_unlock(&mm->page_table_lock); | ||
55 | |||
56 | hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); | ||
57 | if (!hmm) | 34 | if (!hmm) |
58 | return NULL; | 35 | return ERR_PTR(-ENOMEM); |
36 | |||
59 | init_waitqueue_head(&hmm->wq); | 37 | init_waitqueue_head(&hmm->wq); |
60 | INIT_LIST_HEAD(&hmm->mirrors); | 38 | INIT_LIST_HEAD(&hmm->mirrors); |
61 | init_rwsem(&hmm->mirrors_sem); | 39 | init_rwsem(&hmm->mirrors_sem); |
62 | hmm->mmu_notifier.ops = NULL; | ||
63 | INIT_LIST_HEAD(&hmm->ranges); | 40 | INIT_LIST_HEAD(&hmm->ranges); |
64 | spin_lock_init(&hmm->ranges_lock); | 41 | spin_lock_init(&hmm->ranges_lock); |
65 | kref_init(&hmm->kref); | ||
66 | hmm->notifiers = 0; | 42 | hmm->notifiers = 0; |
67 | hmm->mm = mm; | 43 | return &hmm->mmu_notifier; |
68 | |||
69 | hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; | ||
70 | if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { | ||
71 | kfree(hmm); | ||
72 | return NULL; | ||
73 | } | ||
74 | |||
75 | mmgrab(hmm->mm); | ||
76 | |||
77 | /* | ||
78 | * We hold the exclusive mmap_sem here so we know that mm->hmm is | ||
79 | * still NULL or 0 kref, and is safe to update. | ||
80 | */ | ||
81 | spin_lock(&mm->page_table_lock); | ||
82 | mm->hmm = hmm; | ||
83 | |||
84 | out_unlock: | ||
85 | spin_unlock(&mm->page_table_lock); | ||
86 | return hmm; | ||
87 | } | 44 | } |
88 | 45 | ||
89 | static void hmm_free_rcu(struct rcu_head *rcu) | 46 | static void hmm_free_notifier(struct mmu_notifier *mn) |
90 | { | 47 | { |
91 | struct hmm *hmm = container_of(rcu, struct hmm, rcu); | 48 | struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); |
92 | 49 | ||
93 | mmdrop(hmm->mm); | 50 | WARN_ON(!list_empty(&hmm->ranges)); |
51 | WARN_ON(!list_empty(&hmm->mirrors)); | ||
94 | kfree(hmm); | 52 | kfree(hmm); |
95 | } | 53 | } |
96 | 54 | ||
97 | static void hmm_free(struct kref *kref) | ||
98 | { | ||
99 | struct hmm *hmm = container_of(kref, struct hmm, kref); | ||
100 | |||
101 | spin_lock(&hmm->mm->page_table_lock); | ||
102 | if (hmm->mm->hmm == hmm) | ||
103 | hmm->mm->hmm = NULL; | ||
104 | spin_unlock(&hmm->mm->page_table_lock); | ||
105 | |||
106 | mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm); | ||
107 | mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu); | ||
108 | } | ||
109 | |||
110 | static inline void hmm_put(struct hmm *hmm) | ||
111 | { | ||
112 | kref_put(&hmm->kref, hmm_free); | ||
113 | } | ||
114 | |||
115 | static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) | 55 | static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) |
116 | { | 56 | { |
117 | struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); | 57 | struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); |
118 | struct hmm_mirror *mirror; | 58 | struct hmm_mirror *mirror; |
119 | 59 | ||
120 | /* Bail out if hmm is in the process of being freed */ | ||
121 | if (!kref_get_unless_zero(&hmm->kref)) | ||
122 | return; | ||
123 | |||
124 | /* | 60 | /* |
125 | * Since hmm_range_register() holds the mmget() lock hmm_release() is | 61 | * Since hmm_range_register() holds the mmget() lock hmm_release() is |
126 | * prevented as long as a range exists. | 62 | * prevented as long as a range exists. |
@@ -137,8 +73,6 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) | |||
137 | mirror->ops->release(mirror); | 73 | mirror->ops->release(mirror); |
138 | } | 74 | } |
139 | up_read(&hmm->mirrors_sem); | 75 | up_read(&hmm->mirrors_sem); |
140 | |||
141 | hmm_put(hmm); | ||
142 | } | 76 | } |
143 | 77 | ||
144 | static void notifiers_decrement(struct hmm *hmm) | 78 | static void notifiers_decrement(struct hmm *hmm) |
@@ -165,23 +99,14 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, | |||
165 | { | 99 | { |
166 | struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); | 100 | struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); |
167 | struct hmm_mirror *mirror; | 101 | struct hmm_mirror *mirror; |
168 | struct hmm_update update; | ||
169 | struct hmm_range *range; | 102 | struct hmm_range *range; |
170 | unsigned long flags; | 103 | unsigned long flags; |
171 | int ret = 0; | 104 | int ret = 0; |
172 | 105 | ||
173 | if (!kref_get_unless_zero(&hmm->kref)) | ||
174 | return 0; | ||
175 | |||
176 | update.start = nrange->start; | ||
177 | update.end = nrange->end; | ||
178 | update.event = HMM_UPDATE_INVALIDATE; | ||
179 | update.blockable = mmu_notifier_range_blockable(nrange); | ||
180 | |||
181 | spin_lock_irqsave(&hmm->ranges_lock, flags); | 106 | spin_lock_irqsave(&hmm->ranges_lock, flags); |
182 | hmm->notifiers++; | 107 | hmm->notifiers++; |
183 | list_for_each_entry(range, &hmm->ranges, list) { | 108 | list_for_each_entry(range, &hmm->ranges, list) { |
184 | if (update.end < range->start || update.start >= range->end) | 109 | if (nrange->end < range->start || nrange->start >= range->end) |
185 | continue; | 110 | continue; |
186 | 111 | ||
187 | range->valid = false; | 112 | range->valid = false; |
@@ -198,9 +123,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, | |||
198 | list_for_each_entry(mirror, &hmm->mirrors, list) { | 123 | list_for_each_entry(mirror, &hmm->mirrors, list) { |
199 | int rc; | 124 | int rc; |
200 | 125 | ||
201 | rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update); | 126 | rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange); |
202 | if (rc) { | 127 | if (rc) { |
203 | if (WARN_ON(update.blockable || rc != -EAGAIN)) | 128 | if (WARN_ON(mmu_notifier_range_blockable(nrange) || |
129 | rc != -EAGAIN)) | ||
204 | continue; | 130 | continue; |
205 | ret = -EAGAIN; | 131 | ret = -EAGAIN; |
206 | break; | 132 | break; |
@@ -211,7 +137,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, | |||
211 | out: | 137 | out: |
212 | if (ret) | 138 | if (ret) |
213 | notifiers_decrement(hmm); | 139 | notifiers_decrement(hmm); |
214 | hmm_put(hmm); | ||
215 | return ret; | 140 | return ret; |
216 | } | 141 | } |
217 | 142 | ||
@@ -220,17 +145,15 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn, | |||
220 | { | 145 | { |
221 | struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); | 146 | struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); |
222 | 147 | ||
223 | if (!kref_get_unless_zero(&hmm->kref)) | ||
224 | return; | ||
225 | |||
226 | notifiers_decrement(hmm); | 148 | notifiers_decrement(hmm); |
227 | hmm_put(hmm); | ||
228 | } | 149 | } |
229 | 150 | ||
230 | static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { | 151 | static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { |
231 | .release = hmm_release, | 152 | .release = hmm_release, |
232 | .invalidate_range_start = hmm_invalidate_range_start, | 153 | .invalidate_range_start = hmm_invalidate_range_start, |
233 | .invalidate_range_end = hmm_invalidate_range_end, | 154 | .invalidate_range_end = hmm_invalidate_range_end, |
155 | .alloc_notifier = hmm_alloc_notifier, | ||
156 | .free_notifier = hmm_free_notifier, | ||
234 | }; | 157 | }; |
235 | 158 | ||
236 | /* | 159 | /* |
@@ -242,18 +165,27 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { | |||
242 | * | 165 | * |
243 | * To start mirroring a process address space, the device driver must register | 166 | * To start mirroring a process address space, the device driver must register |
244 | * an HMM mirror struct. | 167 | * an HMM mirror struct. |
168 | * | ||
169 | * The caller cannot unregister the hmm_mirror while any ranges are | ||
170 | * registered. | ||
171 | * | ||
172 | * Callers using this function must put a call to mmu_notifier_synchronize() | ||
173 | * in their module exit functions. | ||
245 | */ | 174 | */ |
246 | int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) | 175 | int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) |
247 | { | 176 | { |
177 | struct mmu_notifier *mn; | ||
178 | |||
248 | lockdep_assert_held_write(&mm->mmap_sem); | 179 | lockdep_assert_held_write(&mm->mmap_sem); |
249 | 180 | ||
250 | /* Sanity check */ | 181 | /* Sanity check */ |
251 | if (!mm || !mirror || !mirror->ops) | 182 | if (!mm || !mirror || !mirror->ops) |
252 | return -EINVAL; | 183 | return -EINVAL; |
253 | 184 | ||
254 | mirror->hmm = hmm_get_or_create(mm); | 185 | mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm); |
255 | if (!mirror->hmm) | 186 | if (IS_ERR(mn)) |
256 | return -ENOMEM; | 187 | return PTR_ERR(mn); |
188 | mirror->hmm = container_of(mn, struct hmm, mmu_notifier); | ||
257 | 189 | ||
258 | down_write(&mirror->hmm->mirrors_sem); | 190 | down_write(&mirror->hmm->mirrors_sem); |
259 | list_add(&mirror->list, &mirror->hmm->mirrors); | 191 | list_add(&mirror->list, &mirror->hmm->mirrors); |
@@ -277,7 +209,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror) | |||
277 | down_write(&hmm->mirrors_sem); | 209 | down_write(&hmm->mirrors_sem); |
278 | list_del(&mirror->list); | 210 | list_del(&mirror->list); |
279 | up_write(&hmm->mirrors_sem); | 211 | up_write(&hmm->mirrors_sem); |
280 | hmm_put(hmm); | 212 | mmu_notifier_put(&hmm->mmu_notifier); |
281 | } | 213 | } |
282 | EXPORT_SYMBOL(hmm_mirror_unregister); | 214 | EXPORT_SYMBOL(hmm_mirror_unregister); |
283 | 215 | ||
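A minimal sketch of driver-side registration against the reworked HMM; the my_* names are hypothetical, and the callback bodies are placeholders.

#include <linux/hmm.h>

static void my_mirror_release(struct hmm_mirror *mirror)
{
	/* ...stop using the mirrored address space... */
}

static int my_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
					 const struct mmu_notifier_range *range)
{
	/* ...invalidate device mappings for range->start..range->end... */
	return 0;
}

static const struct hmm_mirror_ops my_mirror_ops = {
	.release = my_mirror_release,
	.sync_cpu_device_pagetables = my_sync_cpu_device_pagetables,
};

static int my_mirror_attach(struct hmm_mirror *mirror, struct mm_struct *mm)
{
	int ret;

	mirror->ops = &my_mirror_ops;

	down_write(&mm->mmap_sem);	/* hmm_mirror_register() asserts this */
	ret = hmm_mirror_register(mirror, mm);
	up_write(&mm->mmap_sem);
	return ret;
}

static void __exit my_module_exit(void)
{
	/* required so hmm_free_notifier() has run before the module is gone */
	mmu_notifier_synchronize();
}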
@@ -285,8 +217,7 @@ struct hmm_vma_walk { | |||
285 | struct hmm_range *range; | 217 | struct hmm_range *range; |
286 | struct dev_pagemap *pgmap; | 218 | struct dev_pagemap *pgmap; |
287 | unsigned long last; | 219 | unsigned long last; |
288 | bool fault; | 220 | unsigned int flags; |
289 | bool block; | ||
290 | }; | 221 | }; |
291 | 222 | ||
292 | static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, | 223 | static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, |
@@ -298,17 +229,27 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, | |||
298 | struct vm_area_struct *vma = walk->vma; | 229 | struct vm_area_struct *vma = walk->vma; |
299 | vm_fault_t ret; | 230 | vm_fault_t ret; |
300 | 231 | ||
301 | flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; | 232 | if (!vma) |
302 | flags |= write_fault ? FAULT_FLAG_WRITE : 0; | 233 | goto err; |
234 | |||
235 | if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY) | ||
236 | flags |= FAULT_FLAG_ALLOW_RETRY; | ||
237 | if (write_fault) | ||
238 | flags |= FAULT_FLAG_WRITE; | ||
239 | |||
303 | ret = handle_mm_fault(vma, addr, flags); | 240 | ret = handle_mm_fault(vma, addr, flags); |
304 | if (ret & VM_FAULT_RETRY) | 241 | if (ret & VM_FAULT_RETRY) { |
242 | /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */ | ||
305 | return -EAGAIN; | 243 | return -EAGAIN; |
306 | if (ret & VM_FAULT_ERROR) { | ||
307 | *pfn = range->values[HMM_PFN_ERROR]; | ||
308 | return -EFAULT; | ||
309 | } | 244 | } |
245 | if (ret & VM_FAULT_ERROR) | ||
246 | goto err; | ||
310 | 247 | ||
311 | return -EBUSY; | 248 | return -EBUSY; |
249 | |||
250 | err: | ||
251 | *pfn = range->values[HMM_PFN_ERROR]; | ||
252 | return -EFAULT; | ||
312 | } | 253 | } |
313 | 254 | ||
314 | static int hmm_pfns_bad(unsigned long addr, | 255 | static int hmm_pfns_bad(unsigned long addr, |
@@ -328,8 +269,8 @@ static int hmm_pfns_bad(unsigned long addr, | |||
328 | } | 269 | } |
329 | 270 | ||
330 | /* | 271 | /* |
331 | * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) | 272 | * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s) |
332 | * @start: range virtual start address (inclusive) | 273 | * @addr: range virtual start address (inclusive) |
333 | * @end: range virtual end address (exclusive) | 274 | * @end: range virtual end address (exclusive) |
334 | * @fault: should we fault or not ? | 275 | * @fault: should we fault or not ? |
335 | * @write_fault: write fault ? | 276 | * @write_fault: write fault ? |
@@ -346,13 +287,15 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, | |||
346 | struct hmm_vma_walk *hmm_vma_walk = walk->private; | 287 | struct hmm_vma_walk *hmm_vma_walk = walk->private; |
347 | struct hmm_range *range = hmm_vma_walk->range; | 288 | struct hmm_range *range = hmm_vma_walk->range; |
348 | uint64_t *pfns = range->pfns; | 289 | uint64_t *pfns = range->pfns; |
349 | unsigned long i, page_size; | 290 | unsigned long i; |
350 | 291 | ||
351 | hmm_vma_walk->last = addr; | 292 | hmm_vma_walk->last = addr; |
352 | page_size = hmm_range_page_size(range); | 293 | i = (addr - range->start) >> PAGE_SHIFT; |
353 | i = (addr - range->start) >> range->page_shift; | 294 | |
295 | if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE)) | ||
296 | return -EPERM; | ||
354 | 297 | ||
355 | for (; addr < end; addr += page_size, i++) { | 298 | for (; addr < end; addr += PAGE_SIZE, i++) { |
356 | pfns[i] = range->values[HMM_PFN_NONE]; | 299 | pfns[i] = range->values[HMM_PFN_NONE]; |
357 | if (fault || write_fault) { | 300 | if (fault || write_fault) { |
358 | int ret; | 301 | int ret; |
@@ -373,15 +316,15 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, | |||
373 | { | 316 | { |
374 | struct hmm_range *range = hmm_vma_walk->range; | 317 | struct hmm_range *range = hmm_vma_walk->range; |
375 | 318 | ||
376 | if (!hmm_vma_walk->fault) | 319 | if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) |
377 | return; | 320 | return; |
378 | 321 | ||
379 | /* | 322 | /* |
380 | * So we not only consider the individual per page request we also | 323 | * So we not only consider the individual per page request we also |
381 | * consider the default flags requested for the range. The API can | 324 | * consider the default flags requested for the range. The API can |
382 | * be use in 2 fashions. The first one where the HMM user coalesce | 325 | * be used 2 ways. The first one where the HMM user coalesces |
383 | * multiple page fault into one request and set flags per pfns for | 326 | * multiple page faults into one request and sets flags per pfn for |
384 | * of those faults. The second one where the HMM user want to pre- | 327 | * those faults. The second one where the HMM user wants to pre- |
385 | * fault a range with specific flags. For the latter one it is a | 328 | * fault a range with specific flags. For the latter one it is a |
386 | * waste to have the user pre-fill the pfn arrays with a default | 329 | * waste to have the user pre-fill the pfn arrays with a default |
387 | * flags value. | 330 | * flags value. |
@@ -391,7 +334,7 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, | |||
391 | /* We aren't ask to do anything ... */ | 334 | /* We aren't ask to do anything ... */ |
392 | if (!(pfns & range->flags[HMM_PFN_VALID])) | 335 | if (!(pfns & range->flags[HMM_PFN_VALID])) |
393 | return; | 336 | return; |
394 | /* If this is device memory than only fault if explicitly requested */ | 337 | /* If this is device memory then only fault if explicitly requested */ |
395 | if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { | 338 | if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { |
396 | /* Do we fault on device memory ? */ | 339 | /* Do we fault on device memory ? */ |
397 | if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { | 340 | if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { |
@@ -418,7 +361,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, | |||
418 | { | 361 | { |
419 | unsigned long i; | 362 | unsigned long i; |
420 | 363 | ||
421 | if (!hmm_vma_walk->fault) { | 364 | if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) { |
422 | *fault = *write_fault = false; | 365 | *fault = *write_fault = false; |
423 | return; | 366 | return; |
424 | } | 367 | } |
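A rough illustration of the bools-to-flags change: callers of hmm_range_fault() now express intent through these flag bits. A sketch, assuming a struct hmm_range already set up by the driver.

#include <linux/hmm.h>

/* read-only snapshot: the walk never calls handle_mm_fault() */
static long snapshot_range(struct hmm_range *range)
{
	return hmm_range_fault(range, HMM_FAULT_SNAPSHOT);
}

/* faulting walk that may drop mmap_sem and ask the caller to retry */
static long fault_range(struct hmm_range *range)
{
	return hmm_range_fault(range, HMM_FAULT_ALLOW_RETRY);
}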
@@ -458,22 +401,10 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) | |||
458 | range->flags[HMM_PFN_VALID]; | 401 | range->flags[HMM_PFN_VALID]; |
459 | } | 402 | } |
460 | 403 | ||
461 | static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) | ||
462 | { | ||
463 | if (!pud_present(pud)) | ||
464 | return 0; | ||
465 | return pud_write(pud) ? range->flags[HMM_PFN_VALID] | | ||
466 | range->flags[HMM_PFN_WRITE] : | ||
467 | range->flags[HMM_PFN_VALID]; | ||
468 | } | ||
469 | |||
470 | static int hmm_vma_handle_pmd(struct mm_walk *walk, | ||
471 | unsigned long addr, | ||
472 | unsigned long end, | ||
473 | uint64_t *pfns, | ||
474 | pmd_t pmd) | ||
475 | { | ||
476 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 404 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
405 | static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, | ||
406 | unsigned long end, uint64_t *pfns, pmd_t pmd) | ||
407 | { | ||
477 | struct hmm_vma_walk *hmm_vma_walk = walk->private; | 408 | struct hmm_vma_walk *hmm_vma_walk = walk->private; |
478 | struct hmm_range *range = hmm_vma_walk->range; | 409 | struct hmm_range *range = hmm_vma_walk->range; |
479 | unsigned long pfn, npages, i; | 410 | unsigned long pfn, npages, i; |
@@ -488,7 +419,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, | |||
488 | if (pmd_protnone(pmd) || fault || write_fault) | 419 | if (pmd_protnone(pmd) || fault || write_fault) |
489 | return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); | 420 | return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); |
490 | 421 | ||
491 | pfn = pmd_pfn(pmd) + pte_index(addr); | 422 | pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); |
492 | for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { | 423 | for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { |
493 | if (pmd_devmap(pmd)) { | 424 | if (pmd_devmap(pmd)) { |
494 | hmm_vma_walk->pgmap = get_dev_pagemap(pfn, | 425 | hmm_vma_walk->pgmap = get_dev_pagemap(pfn, |
@@ -504,11 +435,12 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, | |||
504 | } | 435 | } |
505 | hmm_vma_walk->last = end; | 436 | hmm_vma_walk->last = end; |
506 | return 0; | 437 | return 0; |
507 | #else | ||
508 | /* If THP is not enabled then we should never reach that code ! */ | ||
509 | return -EINVAL; | ||
510 | #endif | ||
511 | } | 438 | } |
439 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
440 | /* stub to allow the code below to compile */ | ||
441 | int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, | ||
442 | unsigned long end, uint64_t *pfns, pmd_t pmd); | ||
443 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
512 | 444 | ||
513 | static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) | 445 | static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) |
514 | { | 446 | { |
@@ -525,7 +457,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, | |||
525 | { | 457 | { |
526 | struct hmm_vma_walk *hmm_vma_walk = walk->private; | 458 | struct hmm_vma_walk *hmm_vma_walk = walk->private; |
527 | struct hmm_range *range = hmm_vma_walk->range; | 459 | struct hmm_range *range = hmm_vma_walk->range; |
528 | struct vm_area_struct *vma = walk->vma; | ||
529 | bool fault, write_fault; | 460 | bool fault, write_fault; |
530 | uint64_t cpu_flags; | 461 | uint64_t cpu_flags; |
531 | pte_t pte = *ptep; | 462 | pte_t pte = *ptep; |
@@ -546,6 +477,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, | |||
546 | swp_entry_t entry = pte_to_swp_entry(pte); | 477 | swp_entry_t entry = pte_to_swp_entry(pte); |
547 | 478 | ||
548 | if (!non_swap_entry(entry)) { | 479 | if (!non_swap_entry(entry)) { |
480 | cpu_flags = pte_to_hmm_pfn_flags(range, pte); | ||
481 | hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, | ||
482 | &fault, &write_fault); | ||
549 | if (fault || write_fault) | 483 | if (fault || write_fault) |
550 | goto fault; | 484 | goto fault; |
551 | return 0; | 485 | return 0; |
@@ -574,8 +508,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, | |||
574 | if (fault || write_fault) { | 508 | if (fault || write_fault) { |
575 | pte_unmap(ptep); | 509 | pte_unmap(ptep); |
576 | hmm_vma_walk->last = addr; | 510 | hmm_vma_walk->last = addr; |
577 | migration_entry_wait(vma->vm_mm, | 511 | migration_entry_wait(walk->mm, pmdp, addr); |
578 | pmdp, addr); | ||
579 | return -EBUSY; | 512 | return -EBUSY; |
580 | } | 513 | } |
581 | return 0; | 514 | return 0; |
@@ -623,21 +556,16 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, | |||
623 | { | 556 | { |
624 | struct hmm_vma_walk *hmm_vma_walk = walk->private; | 557 | struct hmm_vma_walk *hmm_vma_walk = walk->private; |
625 | struct hmm_range *range = hmm_vma_walk->range; | 558 | struct hmm_range *range = hmm_vma_walk->range; |
626 | struct vm_area_struct *vma = walk->vma; | ||
627 | uint64_t *pfns = range->pfns; | 559 | uint64_t *pfns = range->pfns; |
628 | unsigned long addr = start, i; | 560 | unsigned long addr = start, i; |
629 | pte_t *ptep; | 561 | pte_t *ptep; |
630 | pmd_t pmd; | 562 | pmd_t pmd; |
631 | 563 | ||
632 | |||
633 | again: | 564 | again: |
634 | pmd = READ_ONCE(*pmdp); | 565 | pmd = READ_ONCE(*pmdp); |
635 | if (pmd_none(pmd)) | 566 | if (pmd_none(pmd)) |
636 | return hmm_vma_walk_hole(start, end, walk); | 567 | return hmm_vma_walk_hole(start, end, walk); |
637 | 568 | ||
638 | if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB)) | ||
639 | return hmm_pfns_bad(start, end, walk); | ||
640 | |||
641 | if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { | 569 | if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { |
642 | bool fault, write_fault; | 570 | bool fault, write_fault; |
643 | unsigned long npages; | 571 | unsigned long npages; |
@@ -651,7 +579,7 @@ again: | |||
651 | 0, &fault, &write_fault); | 579 | 0, &fault, &write_fault); |
652 | if (fault || write_fault) { | 580 | if (fault || write_fault) { |
653 | hmm_vma_walk->last = addr; | 581 | hmm_vma_walk->last = addr; |
654 | pmd_migration_entry_wait(vma->vm_mm, pmdp); | 582 | pmd_migration_entry_wait(walk->mm, pmdp); |
655 | return -EBUSY; | 583 | return -EBUSY; |
656 | } | 584 | } |
657 | return 0; | 585 | return 0; |
@@ -660,11 +588,11 @@ again: | |||
660 | 588 | ||
661 | if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { | 589 | if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { |
662 | /* | 590 | /* |
663 | * No need to take pmd_lock here, even if some other threads | 591 | * No need to take pmd_lock here, even if some other thread |
664 | * is splitting the huge pmd we will get that event through | 592 | * is splitting the huge pmd we will get that event through |
665 | * mmu_notifier callback. | 593 | * mmu_notifier callback. |
666 | * | 594 | * |
667 | * So just read pmd value and check again its a transparent | 595 | * So just read pmd value and check again it's a transparent |
668 | * huge or device mapping one and compute corresponding pfn | 596 | * huge or device mapping one and compute corresponding pfn |
669 | * values. | 597 | * values. |
670 | */ | 598 | */ |
@@ -678,7 +606,7 @@ again: | |||
678 | } | 606 | } |
679 | 607 | ||
680 | /* | 608 | /* |
681 | * We have handled all the valid case above ie either none, migration, | 609 | * We have handled all the valid cases above ie either none, migration, |
682 | * huge or transparent huge. At this point either it is a valid pmd | 610 | * huge or transparent huge. At this point either it is a valid pmd |
683 | * entry pointing to pte directory or it is a bad pmd that will not | 611 | * entry pointing to pte directory or it is a bad pmd that will not |
684 | * recover. | 612 | * recover. |
@@ -714,10 +642,19 @@ again: | |||
714 | return 0; | 642 | return 0; |
715 | } | 643 | } |
716 | 644 | ||
717 | static int hmm_vma_walk_pud(pud_t *pudp, | 645 | #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ |
718 | unsigned long start, | 646 | defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) |
719 | unsigned long end, | 647 | static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) |
720 | struct mm_walk *walk) | 648 | { |
649 | if (!pud_present(pud)) | ||
650 | return 0; | ||
651 | return pud_write(pud) ? range->flags[HMM_PFN_VALID] | | ||
652 | range->flags[HMM_PFN_WRITE] : | ||
653 | range->flags[HMM_PFN_VALID]; | ||
654 | } | ||
655 | |||
656 | static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, | ||
657 | struct mm_walk *walk) | ||
721 | { | 658 | { |
722 | struct hmm_vma_walk *hmm_vma_walk = walk->private; | 659 | struct hmm_vma_walk *hmm_vma_walk = walk->private; |
723 | struct hmm_range *range = hmm_vma_walk->range; | 660 | struct hmm_range *range = hmm_vma_walk->range; |
@@ -781,42 +718,29 @@ again: | |||
781 | 718 | ||
782 | return 0; | 719 | return 0; |
783 | } | 720 | } |
721 | #else | ||
722 | #define hmm_vma_walk_pud NULL | ||
723 | #endif | ||
784 | 724 | ||
725 | #ifdef CONFIG_HUGETLB_PAGE | ||
785 | static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, | 726 | static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, |
786 | unsigned long start, unsigned long end, | 727 | unsigned long start, unsigned long end, |
787 | struct mm_walk *walk) | 728 | struct mm_walk *walk) |
788 | { | 729 | { |
789 | #ifdef CONFIG_HUGETLB_PAGE | 730 | unsigned long addr = start, i, pfn; |
790 | unsigned long addr = start, i, pfn, mask, size, pfn_inc; | ||
791 | struct hmm_vma_walk *hmm_vma_walk = walk->private; | 731 | struct hmm_vma_walk *hmm_vma_walk = walk->private; |
792 | struct hmm_range *range = hmm_vma_walk->range; | 732 | struct hmm_range *range = hmm_vma_walk->range; |
793 | struct vm_area_struct *vma = walk->vma; | 733 | struct vm_area_struct *vma = walk->vma; |
794 | struct hstate *h = hstate_vma(vma); | ||
795 | uint64_t orig_pfn, cpu_flags; | 734 | uint64_t orig_pfn, cpu_flags; |
796 | bool fault, write_fault; | 735 | bool fault, write_fault; |
797 | spinlock_t *ptl; | 736 | spinlock_t *ptl; |
798 | pte_t entry; | 737 | pte_t entry; |
799 | int ret = 0; | 738 | int ret = 0; |
800 | 739 | ||
801 | size = 1UL << huge_page_shift(h); | 740 | ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); |
802 | mask = size - 1; | ||
803 | if (range->page_shift != PAGE_SHIFT) { | ||
804 | /* Make sure we are looking at full page. */ | ||
805 | if (start & mask) | ||
806 | return -EINVAL; | ||
807 | if (end < (start + size)) | ||
808 | return -EINVAL; | ||
809 | pfn_inc = size >> PAGE_SHIFT; | ||
810 | } else { | ||
811 | pfn_inc = 1; | ||
812 | size = PAGE_SIZE; | ||
813 | } | ||
814 | |||
815 | |||
816 | ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); | ||
817 | entry = huge_ptep_get(pte); | 741 | entry = huge_ptep_get(pte); |
818 | 742 | ||
819 | i = (start - range->start) >> range->page_shift; | 743 | i = (start - range->start) >> PAGE_SHIFT; |
820 | orig_pfn = range->pfns[i]; | 744 | orig_pfn = range->pfns[i]; |
821 | range->pfns[i] = range->values[HMM_PFN_NONE]; | 745 | range->pfns[i] = range->values[HMM_PFN_NONE]; |
822 | cpu_flags = pte_to_hmm_pfn_flags(range, entry); | 746 | cpu_flags = pte_to_hmm_pfn_flags(range, entry); |
@@ -828,8 +752,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, | |||
828 | goto unlock; | 752 | goto unlock; |
829 | } | 753 | } |
830 | 754 | ||
831 | pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift); | 755 | pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); |
832 | for (; addr < end; addr += size, i++, pfn += pfn_inc) | 756 | for (; addr < end; addr += PAGE_SIZE, i++, pfn++) |
833 | range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | | 757 | range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | |
834 | cpu_flags; | 758 | cpu_flags; |
835 | hmm_vma_walk->last = end; | 759 | hmm_vma_walk->last = end; |
@@ -841,10 +765,10 @@ unlock: | |||
841 | return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); | 765 | return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); |
842 | 766 | ||
843 | return ret; | 767 | return ret; |
844 | #else /* CONFIG_HUGETLB_PAGE */ | ||
845 | return -EINVAL; | ||
846 | #endif | ||
847 | } | 768 | } |
769 | #else | ||
770 | #define hmm_vma_walk_hugetlb_entry NULL | ||
771 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
848 | 772 | ||
849 | static void hmm_pfns_clear(struct hmm_range *range, | 773 | static void hmm_pfns_clear(struct hmm_range *range, |
850 | uint64_t *pfns, | 774 | uint64_t *pfns, |
@@ -859,44 +783,32 @@ static void hmm_pfns_clear(struct hmm_range *range, | |||
859 | * hmm_range_register() - start tracking change to CPU page table over a range | 783 | * hmm_range_register() - start tracking change to CPU page table over a range |
860 | * @range: range | 784 | * @range: range |
861 | * @mm: the mm struct for the range of virtual address | 785 | * @mm: the mm struct for the range of virtual address |
862 | * @start: start virtual address (inclusive) | 786 | * |
863 | * @end: end virtual address (exclusive) | 787 | * Return: 0 on success, -EFAULT if the address space is no longer valid |
864 | * @page_shift: expect page shift for the range | ||
865 | * Returns 0 on success, -EFAULT if the address space is no longer valid | ||
866 | * | 788 | * |
867 | * Track updates to the CPU page table see include/linux/hmm.h | 789 | * Track updates to the CPU page table see include/linux/hmm.h |
868 | */ | 790 | */ |
869 | int hmm_range_register(struct hmm_range *range, | 791 | int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror) |
870 | struct hmm_mirror *mirror, | ||
871 | unsigned long start, | ||
872 | unsigned long end, | ||
873 | unsigned page_shift) | ||
874 | { | 792 | { |
875 | unsigned long mask = ((1UL << page_shift) - 1UL); | ||
876 | struct hmm *hmm = mirror->hmm; | 793 | struct hmm *hmm = mirror->hmm; |
877 | unsigned long flags; | 794 | unsigned long flags; |
878 | 795 | ||
879 | range->valid = false; | 796 | range->valid = false; |
880 | range->hmm = NULL; | 797 | range->hmm = NULL; |
881 | 798 | ||
882 | if ((start & mask) || (end & mask)) | 799 | if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1))) |
883 | return -EINVAL; | 800 | return -EINVAL; |
884 | if (start >= end) | 801 | if (range->start >= range->end) |
885 | return -EINVAL; | 802 | return -EINVAL; |
886 | 803 | ||
887 | range->page_shift = page_shift; | ||
888 | range->start = start; | ||
889 | range->end = end; | ||
890 | |||
891 | /* Prevent hmm_release() from running while the range is valid */ | 804 | /* Prevent hmm_release() from running while the range is valid */ |
892 | if (!mmget_not_zero(hmm->mm)) | 805 | if (!mmget_not_zero(hmm->mmu_notifier.mm)) |
893 | return -EFAULT; | 806 | return -EFAULT; |
894 | 807 | ||
895 | /* Initialize range to track CPU page table updates. */ | 808 | /* Initialize range to track CPU page table updates. */ |
896 | spin_lock_irqsave(&hmm->ranges_lock, flags); | 809 | spin_lock_irqsave(&hmm->ranges_lock, flags); |
897 | 810 | ||
898 | range->hmm = hmm; | 811 | range->hmm = hmm; |
899 | kref_get(&hmm->kref); | ||
900 | list_add(&range->list, &hmm->ranges); | 812 | list_add(&range->list, &hmm->ranges); |
901 | 813 | ||
902 | /* | 814 | /* |
@@ -928,8 +840,7 @@ void hmm_range_unregister(struct hmm_range *range) | |||
928 | spin_unlock_irqrestore(&hmm->ranges_lock, flags); | 840 | spin_unlock_irqrestore(&hmm->ranges_lock, flags); |
929 | 841 | ||
930 | /* Drop reference taken by hmm_range_register() */ | 842 | /* Drop reference taken by hmm_range_register() */ |
931 | mmput(hmm->mm); | 843 | mmput(hmm->mmu_notifier.mm); |
932 | hmm_put(hmm); | ||
933 | 844 | ||
934 | /* | 845 | /* |
935 | * The range is now invalid and the ref on the hmm is dropped, so | 846 | * The range is now invalid and the ref on the hmm is dropped, so |
@@ -941,105 +852,33 @@ void hmm_range_unregister(struct hmm_range *range) | |||
941 | } | 852 | } |
942 | EXPORT_SYMBOL(hmm_range_unregister); | 853 | EXPORT_SYMBOL(hmm_range_unregister); |
943 | 854 | ||
944 | /* | 855 | static const struct mm_walk_ops hmm_walk_ops = { |
945 | * hmm_range_snapshot() - snapshot CPU page table for a range | 856 | .pud_entry = hmm_vma_walk_pud, |
946 | * @range: range | 857 | .pmd_entry = hmm_vma_walk_pmd, |
947 | * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid | 858 | .pte_hole = hmm_vma_walk_hole, |
948 | * permission (for instance asking for write and range is read only), | 859 | .hugetlb_entry = hmm_vma_walk_hugetlb_entry, |
949 | * -EBUSY if you need to retry, -EFAULT invalid (ie either no valid | 860 | }; |
950 | * vma or it is illegal to access that range), number of valid pages | ||
951 | * in range->pfns[] (from range start address). | ||
952 | * | ||
953 | * This snapshots the CPU page table for a range of virtual addresses. Snapshot | ||
954 | * validity is tracked by range struct. See in include/linux/hmm.h for example | ||
955 | * on how to use. | ||
956 | */ | ||
957 | long hmm_range_snapshot(struct hmm_range *range) | ||
958 | { | ||
959 | const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; | ||
960 | unsigned long start = range->start, end; | ||
961 | struct hmm_vma_walk hmm_vma_walk; | ||
962 | struct hmm *hmm = range->hmm; | ||
963 | struct vm_area_struct *vma; | ||
964 | struct mm_walk mm_walk; | ||
965 | |||
966 | lockdep_assert_held(&hmm->mm->mmap_sem); | ||
967 | do { | ||
968 | /* If range is no longer valid force retry. */ | ||
969 | if (!range->valid) | ||
970 | return -EBUSY; | ||
971 | |||
972 | vma = find_vma(hmm->mm, start); | ||
973 | if (vma == NULL || (vma->vm_flags & device_vma)) | ||
974 | return -EFAULT; | ||
975 | |||
976 | if (is_vm_hugetlb_page(vma)) { | ||
977 | if (huge_page_shift(hstate_vma(vma)) != | ||
978 | range->page_shift && | ||
979 | range->page_shift != PAGE_SHIFT) | ||
980 | return -EINVAL; | ||
981 | } else { | ||
982 | if (range->page_shift != PAGE_SHIFT) | ||
983 | return -EINVAL; | ||
984 | } | ||
985 | |||
986 | if (!(vma->vm_flags & VM_READ)) { | ||
987 | /* | ||
988 | * If vma do not allow read access, then assume that it | ||
989 | * does not allow write access, either. HMM does not | ||
990 | * support architecture that allow write without read. | ||
991 | */ | ||
992 | hmm_pfns_clear(range, range->pfns, | ||
993 | range->start, range->end); | ||
994 | return -EPERM; | ||
995 | } | ||
996 | |||
997 | range->vma = vma; | ||
998 | hmm_vma_walk.pgmap = NULL; | ||
999 | hmm_vma_walk.last = start; | ||
1000 | hmm_vma_walk.fault = false; | ||
1001 | hmm_vma_walk.range = range; | ||
1002 | mm_walk.private = &hmm_vma_walk; | ||
1003 | end = min(range->end, vma->vm_end); | ||
1004 | |||
1005 | mm_walk.vma = vma; | ||
1006 | mm_walk.mm = vma->vm_mm; | ||
1007 | mm_walk.pte_entry = NULL; | ||
1008 | mm_walk.test_walk = NULL; | ||
1009 | mm_walk.hugetlb_entry = NULL; | ||
1010 | mm_walk.pud_entry = hmm_vma_walk_pud; | ||
1011 | mm_walk.pmd_entry = hmm_vma_walk_pmd; | ||
1012 | mm_walk.pte_hole = hmm_vma_walk_hole; | ||
1013 | mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; | ||
1014 | |||
1015 | walk_page_range(start, end, &mm_walk); | ||
1016 | start = end; | ||
1017 | } while (start < range->end); | ||
1018 | |||
1019 | return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; | ||
1020 | } | ||
1021 | EXPORT_SYMBOL(hmm_range_snapshot); | ||
1022 | 861 | ||
1023 | /* | 862 | /** |
1024 | * hmm_range_fault() - try to fault some address in a virtual address range | 863 | * hmm_range_fault - try to fault some address in a virtual address range |
1025 | * @range: range being faulted | 864 | * @range: range being faulted |
1026 | * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) | 865 | * @flags: HMM_FAULT_* flags |
1027 | * Return: number of valid pages in range->pfns[] (from range start | ||
1028 | * address). This may be zero. If the return value is negative, | ||
1029 | * then one of the following values may be returned: | ||
1030 | * | 866 | * |
1031 | * -EINVAL invalid arguments or mm or virtual address are in an | 867 | * Return: the number of valid pages in range->pfns[] (from range start |
1032 | * invalid vma (for instance device file vma). | 868 | * address), which may be zero. On error one of the following status codes |
1033 | * -ENOMEM: Out of memory. | 869 | * can be returned: |
1034 | * -EPERM: Invalid permission (for instance asking for write and | 870 | * |
1035 | * range is read only). | 871 | * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma |
1036 | * -EAGAIN: If you need to retry and mmap_sem was drop. This can only | 872 | * (e.g., device file vma). |
1037 | * happens if block argument is false. | 873 | * -ENOMEM: Out of memory. |
1038 | * -EBUSY: If the the range is being invalidated and you should wait | 874 | * -EPERM: Invalid permission (e.g., asking for write and range is read |
1039 | * for invalidation to finish. | 875 | * only). |
1040 | * -EFAULT: Invalid (ie either no valid vma or it is illegal to access | 876 | * -EAGAIN: A page fault needs to be retried and mmap_sem was dropped. |
1041 | * that range), number of valid pages in range->pfns[] (from | 877 | * -EBUSY: The range has been invalidated and the caller needs to wait for |
1042 | * range start address). | 878 | * the invalidation to finish. |
879 | * -EFAULT: Invalid (i.e., either no valid vma or it is illegal to access | ||
880 | * that range) number of valid pages in range->pfns[] (from | ||
881 | * range start address). | ||
1043 | * | 882 | * |
1044 | * This is similar to a regular CPU page fault except that it will not trigger | 883 | * This is similar to a regular CPU page fault except that it will not trigger |
1045 | * any memory migration if the memory being faulted is not accessible by CPUs | 884 | * any memory migration if the memory being faulted is not accessible by CPUs |
@@ -1048,37 +887,26 @@ EXPORT_SYMBOL(hmm_range_snapshot); | |||
1048 | * On error, for one virtual address in the range, the function will mark the | 887 | * On error, for one virtual address in the range, the function will mark the |
1049 | * corresponding HMM pfn entry with an error flag. | 888 | * corresponding HMM pfn entry with an error flag. |
1050 | */ | 889 | */ |
1051 | long hmm_range_fault(struct hmm_range *range, bool block) | 890 | long hmm_range_fault(struct hmm_range *range, unsigned int flags) |
1052 | { | 891 | { |
1053 | const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; | 892 | const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; |
1054 | unsigned long start = range->start, end; | 893 | unsigned long start = range->start, end; |
1055 | struct hmm_vma_walk hmm_vma_walk; | 894 | struct hmm_vma_walk hmm_vma_walk; |
1056 | struct hmm *hmm = range->hmm; | 895 | struct hmm *hmm = range->hmm; |
1057 | struct vm_area_struct *vma; | 896 | struct vm_area_struct *vma; |
1058 | struct mm_walk mm_walk; | ||
1059 | int ret; | 897 | int ret; |
1060 | 898 | ||
1061 | lockdep_assert_held(&hmm->mm->mmap_sem); | 899 | lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem); |
1062 | 900 | ||
1063 | do { | 901 | do { |
1064 | /* If range is no longer valid force retry. */ | 902 | /* If range is no longer valid force retry. */ |
1065 | if (!range->valid) | 903 | if (!range->valid) |
1066 | return -EBUSY; | 904 | return -EBUSY; |
1067 | 905 | ||
1068 | vma = find_vma(hmm->mm, start); | 906 | vma = find_vma(hmm->mmu_notifier.mm, start); |
1069 | if (vma == NULL || (vma->vm_flags & device_vma)) | 907 | if (vma == NULL || (vma->vm_flags & device_vma)) |
1070 | return -EFAULT; | 908 | return -EFAULT; |
1071 | 909 | ||
1072 | if (is_vm_hugetlb_page(vma)) { | ||
1073 | if (huge_page_shift(hstate_vma(vma)) != | ||
1074 | range->page_shift && | ||
1075 | range->page_shift != PAGE_SHIFT) | ||
1076 | return -EINVAL; | ||
1077 | } else { | ||
1078 | if (range->page_shift != PAGE_SHIFT) | ||
1079 | return -EINVAL; | ||
1080 | } | ||
1081 | |||
1082 | if (!(vma->vm_flags & VM_READ)) { | 910 | if (!(vma->vm_flags & VM_READ)) { |
1083 | /* | 911 | /* |
1084 | * If vma do not allow read access, then assume that it | 912 | * If vma do not allow read access, then assume that it |
@@ -1090,27 +918,18 @@ long hmm_range_fault(struct hmm_range *range, bool block) | |||
1090 | return -EPERM; | 918 | return -EPERM; |
1091 | } | 919 | } |
1092 | 920 | ||
1093 | range->vma = vma; | ||
1094 | hmm_vma_walk.pgmap = NULL; | 921 | hmm_vma_walk.pgmap = NULL; |
1095 | hmm_vma_walk.last = start; | 922 | hmm_vma_walk.last = start; |
1096 | hmm_vma_walk.fault = true; | 923 | hmm_vma_walk.flags = flags; |
1097 | hmm_vma_walk.block = block; | ||
1098 | hmm_vma_walk.range = range; | 924 | hmm_vma_walk.range = range; |
1099 | mm_walk.private = &hmm_vma_walk; | ||
1100 | end = min(range->end, vma->vm_end); | 925 | end = min(range->end, vma->vm_end); |
1101 | 926 | ||
1102 | mm_walk.vma = vma; | 927 | walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops, |
1103 | mm_walk.mm = vma->vm_mm; | 928 | &hmm_vma_walk); |
1104 | mm_walk.pte_entry = NULL; | ||
1105 | mm_walk.test_walk = NULL; | ||
1106 | mm_walk.hugetlb_entry = NULL; | ||
1107 | mm_walk.pud_entry = hmm_vma_walk_pud; | ||
1108 | mm_walk.pmd_entry = hmm_vma_walk_pmd; | ||
1109 | mm_walk.pte_hole = hmm_vma_walk_hole; | ||
1110 | mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; | ||
1111 | 929 | ||
1112 | do { | 930 | do { |
1113 | ret = walk_page_range(start, end, &mm_walk); | 931 | ret = walk_page_range(vma->vm_mm, start, end, |
932 | &hmm_walk_ops, &hmm_vma_walk); | ||
1114 | start = hmm_vma_walk.last; | 933 | start = hmm_vma_walk.last; |
1115 | 934 | ||
1116 | /* Keep trying while the range is valid. */ | 935 | /* Keep trying while the range is valid. */ |
@@ -1133,25 +952,22 @@ long hmm_range_fault(struct hmm_range *range, bool block) | |||
1133 | EXPORT_SYMBOL(hmm_range_fault); | 952 | EXPORT_SYMBOL(hmm_range_fault); |
1134 | 953 | ||
1135 | /** | 954 | /** |
1136 | * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one. | 955 | * hmm_range_dma_map - hmm_range_fault() and dma map page all in one. |
1137 | * @range: range being faulted | 956 | * @range: range being faulted |
1138 | * @device: device against to dma map page to | 957 | * @device: device to map page to |
1139 | * @daddrs: dma address of mapped pages | 958 | * @daddrs: array of dma addresses for the mapped pages |
1140 | * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) | 959 | * @flags: HMM_FAULT_* |
1141 | * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been | ||
1142 | * drop and you need to try again, some other error value otherwise | ||
1143 | * | 960 | * |
1144 | * Note same usage pattern as hmm_range_fault(). | 961 | * Return: the number of pages mapped on success (including zero), or any |
962 | * status return from hmm_range_fault() otherwise. | ||
1145 | */ | 963 | */ |
1146 | long hmm_range_dma_map(struct hmm_range *range, | 964 | long hmm_range_dma_map(struct hmm_range *range, struct device *device, |
1147 | struct device *device, | 965 | dma_addr_t *daddrs, unsigned int flags) |
1148 | dma_addr_t *daddrs, | ||
1149 | bool block) | ||
1150 | { | 966 | { |
1151 | unsigned long i, npages, mapped; | 967 | unsigned long i, npages, mapped; |
1152 | long ret; | 968 | long ret; |
1153 | 969 | ||
1154 | ret = hmm_range_fault(range, block); | 970 | ret = hmm_range_fault(range, flags); |
1155 | if (ret <= 0) | 971 | if (ret <= 0) |
1156 | return ret ? ret : -EBUSY; | 972 | return ret ? ret : -EBUSY; |
1157 | 973 | ||
@@ -1222,7 +1038,6 @@ EXPORT_SYMBOL(hmm_range_dma_map); | |||
1222 | /** | 1038 | /** |
1223 | * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map() | 1039 | * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
1224 | * @range: range being unmapped | 1040 | * @range: range being unmapped |
1225 | * @vma: the vma against which the range (optional) | ||
1226 | * @device: device against which dma map was done | 1041 | * @device: device against which dma map was done |
1227 | * @daddrs: dma address of mapped pages | 1042 | * @daddrs: dma address of mapped pages |
1228 | * @dirty: dirty page if it had the write flag set | 1043 | * @dirty: dirty page if it had the write flag set |
@@ -1234,7 +1049,6 @@ EXPORT_SYMBOL(hmm_range_dma_map); | |||
1234 | * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. | 1049 | * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. |
1235 | */ | 1050 | */ |
1236 | long hmm_range_dma_unmap(struct hmm_range *range, | 1051 | long hmm_range_dma_unmap(struct hmm_range *range, |
1237 | struct vm_area_struct *vma, | ||
1238 | struct device *device, | 1052 | struct device *device, |
1239 | dma_addr_t *daddrs, | 1053 | dma_addr_t *daddrs, |
1240 | bool dirty) | 1054 | bool dirty) |
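Taken together, the mm/hmm.c hunks above change the driver-facing calling convention: hmm_range_snapshot() is gone, a snapshot is now requested by passing HMM_FAULT_SNAPSHOT to hmm_range_fault(), and hmm_range_register() takes only the range and the mirror, with start/end (PAGE_SIZE aligned) coming from the hmm_range itself. A minimal sketch of the resulting call pattern follows; the example_* names, the flag/value tables and the pfn encoding are illustrative placeholders, not part of this series:

    #include <linux/hmm.h>
    #include <linux/mm.h>

    /*
     * Placeholder pfn encoding: with pfn_shift == PAGE_SHIFT the low 12 bits
     * are free for driver flags/values.  Real drivers pick their own layout.
     */
    static const uint64_t example_flags[HMM_PFN_FLAG_MAX] = {
            [HMM_PFN_VALID]          = 1UL << 0,
            [HMM_PFN_WRITE]          = 1UL << 1,
            [HMM_PFN_DEVICE_PRIVATE] = 1UL << 2,
    };
    static const uint64_t example_values[HMM_PFN_VALUE_MAX] = {
            [HMM_PFN_ERROR]   = 1UL << 3,
            [HMM_PFN_NONE]    = 0,
            [HMM_PFN_SPECIAL] = 1UL << 4,
    };

    /* Snapshot the CPU page table for [start, end) without faulting anything in. */
    static long example_snapshot(struct hmm_mirror *mirror, struct mm_struct *mm,
                                 unsigned long start, unsigned long end,
                                 uint64_t *pfns)
    {
            struct hmm_range range = {
                    .start     = start,        /* must now be PAGE_SIZE aligned */
                    .end       = end,
                    .pfns      = pfns,
                    .flags     = example_flags,
                    .values    = example_values,
                    .pfn_shift = PAGE_SHIFT,
            };
            long ret;

            ret = hmm_range_register(&range, mirror);
            if (ret)
                    return ret;

            down_read(&mm->mmap_sem);
            /* HMM_FAULT_SNAPSHOT replaces the old hmm_range_snapshot() call. */
            ret = hmm_range_fault(&range, HMM_FAULT_SNAPSHOT);
            up_read(&mm->mmap_sem);

            hmm_range_unregister(&range);
            return ret;     /* >= 0: valid pages, -EBUSY: range invalidated, retry */
    }

The same skeleton with flags == 0 (and suitable default_flags) is the faulting path that the converted callers in this series use.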
diff --git a/mm/madvise.c b/mm/madvise.c index bac973b9f2cc..88babcc384b9 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/file.h> | 21 | #include <linux/file.h> |
22 | #include <linux/blkdev.h> | 22 | #include <linux/blkdev.h> |
23 | #include <linux/backing-dev.h> | 23 | #include <linux/backing-dev.h> |
24 | #include <linux/pagewalk.h> | ||
24 | #include <linux/swap.h> | 25 | #include <linux/swap.h> |
25 | #include <linux/swapops.h> | 26 | #include <linux/swapops.h> |
26 | #include <linux/shmem_fs.h> | 27 | #include <linux/shmem_fs.h> |
@@ -226,19 +227,9 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, | |||
226 | return 0; | 227 | return 0; |
227 | } | 228 | } |
228 | 229 | ||
229 | static void force_swapin_readahead(struct vm_area_struct *vma, | 230 | static const struct mm_walk_ops swapin_walk_ops = { |
230 | unsigned long start, unsigned long end) | 231 | .pmd_entry = swapin_walk_pmd_entry, |
231 | { | 232 | }; |
232 | struct mm_walk walk = { | ||
233 | .mm = vma->vm_mm, | ||
234 | .pmd_entry = swapin_walk_pmd_entry, | ||
235 | .private = vma, | ||
236 | }; | ||
237 | |||
238 | walk_page_range(start, end, &walk); | ||
239 | |||
240 | lru_add_drain(); /* Push any new pages onto the LRU now */ | ||
241 | } | ||
242 | 233 | ||
243 | static void force_shm_swapin_readahead(struct vm_area_struct *vma, | 234 | static void force_shm_swapin_readahead(struct vm_area_struct *vma, |
244 | unsigned long start, unsigned long end, | 235 | unsigned long start, unsigned long end, |
@@ -281,7 +272,8 @@ static long madvise_willneed(struct vm_area_struct *vma, | |||
281 | *prev = vma; | 272 | *prev = vma; |
282 | #ifdef CONFIG_SWAP | 273 | #ifdef CONFIG_SWAP |
283 | if (!file) { | 274 | if (!file) { |
284 | force_swapin_readahead(vma, start, end); | 275 | walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); |
276 | lru_add_drain(); /* Push any new pages onto the LRU now */ | ||
285 | return 0; | 277 | return 0; |
286 | } | 278 | } |
287 | 279 | ||
@@ -450,20 +442,9 @@ next: | |||
450 | return 0; | 442 | return 0; |
451 | } | 443 | } |
452 | 444 | ||
453 | static void madvise_free_page_range(struct mmu_gather *tlb, | 445 | static const struct mm_walk_ops madvise_free_walk_ops = { |
454 | struct vm_area_struct *vma, | 446 | .pmd_entry = madvise_free_pte_range, |
455 | unsigned long addr, unsigned long end) | 447 | }; |
456 | { | ||
457 | struct mm_walk free_walk = { | ||
458 | .pmd_entry = madvise_free_pte_range, | ||
459 | .mm = vma->vm_mm, | ||
460 | .private = tlb, | ||
461 | }; | ||
462 | |||
463 | tlb_start_vma(tlb, vma); | ||
464 | walk_page_range(addr, end, &free_walk); | ||
465 | tlb_end_vma(tlb, vma); | ||
466 | } | ||
467 | 448 | ||
468 | static int madvise_free_single_vma(struct vm_area_struct *vma, | 449 | static int madvise_free_single_vma(struct vm_area_struct *vma, |
469 | unsigned long start_addr, unsigned long end_addr) | 450 | unsigned long start_addr, unsigned long end_addr) |
@@ -490,7 +471,10 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, | |||
490 | update_hiwater_rss(mm); | 471 | update_hiwater_rss(mm); |
491 | 472 | ||
492 | mmu_notifier_invalidate_range_start(&range); | 473 | mmu_notifier_invalidate_range_start(&range); |
493 | madvise_free_page_range(&tlb, vma, range.start, range.end); | 474 | tlb_start_vma(&tlb, vma); |
475 | walk_page_range(vma->vm_mm, range.start, range.end, | ||
476 | &madvise_free_walk_ops, &tlb); | ||
477 | tlb_end_vma(&tlb, vma); | ||
494 | mmu_notifier_invalidate_range_end(&range); | 478 | mmu_notifier_invalidate_range_end(&range); |
495 | tlb_finish_mmu(&tlb, range.start, range.end); | 479 | tlb_finish_mmu(&tlb, range.start, range.end); |
496 | 480 | ||
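The madvise conversion above shows the new pagewalk calling convention from <linux/pagewalk.h>: callbacks move into a const struct mm_walk_ops, and walk_page_range() now takes the mm, the range, the ops table and a private pointer explicitly (mmap_sem must be held for read). A small self-contained sketch of the same idiom; the example_* names are made up for illustration:

    #include <linux/pagewalk.h>

    /* Count populated PTEs under each pmd; walk->private carries the counter. */
    static int example_count_pte(pmd_t *pmd, unsigned long addr,
                                 unsigned long end, struct mm_walk *walk)
    {
            unsigned long *count = walk->private;
            pte_t *pte, *orig_pte;
            spinlock_t *ptl;

            if (pmd_trans_unstable(pmd))
                    return 0;

            orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
            for (; addr < end; pte++, addr += PAGE_SIZE)
                    if (!pte_none(*pte))
                            (*count)++;
            pte_unmap_unlock(orig_pte, ptl);
            return 0;
    }

    /* The ops table is const and shareable; no more on-stack struct mm_walk. */
    static const struct mm_walk_ops example_walk_ops = {
            .pmd_entry = example_count_pte,
    };

    static unsigned long example_count_range(struct mm_struct *mm,
                                             unsigned long start,
                                             unsigned long end)
    {
            unsigned long count = 0;

            down_read(&mm->mmap_sem);
            walk_page_range(mm, start, end, &example_walk_ops, &count);
            up_read(&mm->mmap_sem);
            return count;
    }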
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 597d58101872..f3c15bb07cce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -25,7 +25,7 @@ | |||
25 | #include <linux/page_counter.h> | 25 | #include <linux/page_counter.h> |
26 | #include <linux/memcontrol.h> | 26 | #include <linux/memcontrol.h> |
27 | #include <linux/cgroup.h> | 27 | #include <linux/cgroup.h> |
28 | #include <linux/mm.h> | 28 | #include <linux/pagewalk.h> |
29 | #include <linux/sched/mm.h> | 29 | #include <linux/sched/mm.h> |
30 | #include <linux/shmem_fs.h> | 30 | #include <linux/shmem_fs.h> |
31 | #include <linux/hugetlb.h> | 31 | #include <linux/hugetlb.h> |
@@ -5499,17 +5499,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
5499 | return 0; | 5499 | return 0; |
5500 | } | 5500 | } |
5501 | 5501 | ||
5502 | static const struct mm_walk_ops precharge_walk_ops = { | ||
5503 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
5504 | }; | ||
5505 | |||
5502 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | 5506 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) |
5503 | { | 5507 | { |
5504 | unsigned long precharge; | 5508 | unsigned long precharge; |
5505 | 5509 | ||
5506 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
5507 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
5508 | .mm = mm, | ||
5509 | }; | ||
5510 | down_read(&mm->mmap_sem); | 5510 | down_read(&mm->mmap_sem); |
5511 | walk_page_range(0, mm->highest_vm_end, | 5511 | walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); |
5512 | &mem_cgroup_count_precharge_walk); | ||
5513 | up_read(&mm->mmap_sem); | 5512 | up_read(&mm->mmap_sem); |
5514 | 5513 | ||
5515 | precharge = mc.precharge; | 5514 | precharge = mc.precharge; |
@@ -5778,13 +5777,12 @@ put: /* get_mctgt_type() gets the page */ | |||
5778 | return ret; | 5777 | return ret; |
5779 | } | 5778 | } |
5780 | 5779 | ||
5780 | static const struct mm_walk_ops charge_walk_ops = { | ||
5781 | .pmd_entry = mem_cgroup_move_charge_pte_range, | ||
5782 | }; | ||
5783 | |||
5781 | static void mem_cgroup_move_charge(void) | 5784 | static void mem_cgroup_move_charge(void) |
5782 | { | 5785 | { |
5783 | struct mm_walk mem_cgroup_move_charge_walk = { | ||
5784 | .pmd_entry = mem_cgroup_move_charge_pte_range, | ||
5785 | .mm = mc.mm, | ||
5786 | }; | ||
5787 | |||
5788 | lru_add_drain_all(); | 5786 | lru_add_drain_all(); |
5789 | /* | 5787 | /* |
5790 | * Signal lock_page_memcg() to take the memcg's move_lock | 5788 | * Signal lock_page_memcg() to take the memcg's move_lock |
@@ -5810,7 +5808,8 @@ retry: | |||
5810 | * When we have consumed all precharges and failed in doing | 5808 | * When we have consumed all precharges and failed in doing |
5811 | * additional charge, the page walk just aborts. | 5809 | * additional charge, the page walk just aborts. |
5812 | */ | 5810 | */ |
5813 | walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk); | 5811 | walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, |
5812 | NULL); | ||
5814 | 5813 | ||
5815 | up_read(&mc.mm->mmap_sem); | 5814 | up_read(&mc.mm->mmap_sem); |
5816 | atomic_dec(&mc.from->moving_account); | 5815 | atomic_dec(&mc.from->moving_account); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 65e0874fce17..f000771558d8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -68,7 +68,7 @@ | |||
68 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 68 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
69 | 69 | ||
70 | #include <linux/mempolicy.h> | 70 | #include <linux/mempolicy.h> |
71 | #include <linux/mm.h> | 71 | #include <linux/pagewalk.h> |
72 | #include <linux/highmem.h> | 72 | #include <linux/highmem.h> |
73 | #include <linux/hugetlb.h> | 73 | #include <linux/hugetlb.h> |
74 | #include <linux/kernel.h> | 74 | #include <linux/kernel.h> |
@@ -655,6 +655,12 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, | |||
655 | return 1; | 655 | return 1; |
656 | } | 656 | } |
657 | 657 | ||
658 | static const struct mm_walk_ops queue_pages_walk_ops = { | ||
659 | .hugetlb_entry = queue_pages_hugetlb, | ||
660 | .pmd_entry = queue_pages_pte_range, | ||
661 | .test_walk = queue_pages_test_walk, | ||
662 | }; | ||
663 | |||
658 | /* | 664 | /* |
659 | * Walk through page tables and collect pages to be migrated. | 665 | * Walk through page tables and collect pages to be migrated. |
660 | * | 666 | * |
@@ -679,15 +685,8 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
679 | .nmask = nodes, | 685 | .nmask = nodes, |
680 | .prev = NULL, | 686 | .prev = NULL, |
681 | }; | 687 | }; |
682 | struct mm_walk queue_pages_walk = { | ||
683 | .hugetlb_entry = queue_pages_hugetlb, | ||
684 | .pmd_entry = queue_pages_pte_range, | ||
685 | .test_walk = queue_pages_test_walk, | ||
686 | .mm = mm, | ||
687 | .private = &qp, | ||
688 | }; | ||
689 | 688 | ||
690 | return walk_page_range(start, end, &queue_pages_walk); | 689 | return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); |
691 | } | 690 | } |
692 | 691 | ||
693 | /* | 692 | /* |
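The queue_pages conversion above is the same pattern, and also shows that VMA-level filtering moves into the ops table: .test_walk is consulted once per VMA and .hugetlb_entry handles hugetlb ranges. A short sketch of a walker that skips file-backed VMAs (names are illustrative; returning 1 from test_walk skips the VMA, a negative value aborts the walk):

    #include <linux/pagewalk.h>

    /* Only walk anonymous memory; skip anything backed by a file. */
    static int example_test_walk(unsigned long start, unsigned long end,
                                 struct mm_walk *walk)
    {
            return walk->vma->vm_file ? 1 : 0;
    }

    static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
                                 unsigned long end, struct mm_walk *walk)
    {
            /* per-pmd work for anonymous VMAs would go here */
            return 0;
    }

    static const struct mm_walk_ops example_anon_walk_ops = {
            .pmd_entry = example_pmd_entry,
            .test_walk = example_test_walk,
    };

It is invoked exactly like the madvise-style sketch earlier, via walk_page_range(mm, start, end, &example_anon_walk_ops, NULL) with mmap_sem held for read.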
diff --git a/mm/memremap.c b/mm/memremap.c index ed70c4e8e52a..32c79b51af86 100644 --- a/mm/memremap.c +++ b/mm/memremap.c | |||
@@ -21,13 +21,13 @@ DEFINE_STATIC_KEY_FALSE(devmap_managed_key); | |||
21 | EXPORT_SYMBOL(devmap_managed_key); | 21 | EXPORT_SYMBOL(devmap_managed_key); |
22 | static atomic_t devmap_managed_enable; | 22 | static atomic_t devmap_managed_enable; |
23 | 23 | ||
24 | static void devmap_managed_enable_put(void *data) | 24 | static void devmap_managed_enable_put(void) |
25 | { | 25 | { |
26 | if (atomic_dec_and_test(&devmap_managed_enable)) | 26 | if (atomic_dec_and_test(&devmap_managed_enable)) |
27 | static_branch_disable(&devmap_managed_key); | 27 | static_branch_disable(&devmap_managed_key); |
28 | } | 28 | } |
29 | 29 | ||
30 | static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) | 30 | static int devmap_managed_enable_get(struct dev_pagemap *pgmap) |
31 | { | 31 | { |
32 | if (!pgmap->ops || !pgmap->ops->page_free) { | 32 | if (!pgmap->ops || !pgmap->ops->page_free) { |
33 | WARN(1, "Missing page_free method\n"); | 33 | WARN(1, "Missing page_free method\n"); |
@@ -36,13 +36,16 @@ static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgm | |||
36 | 36 | ||
37 | if (atomic_inc_return(&devmap_managed_enable) == 1) | 37 | if (atomic_inc_return(&devmap_managed_enable) == 1) |
38 | static_branch_enable(&devmap_managed_key); | 38 | static_branch_enable(&devmap_managed_key); |
39 | return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL); | 39 | return 0; |
40 | } | 40 | } |
41 | #else | 41 | #else |
42 | static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap) | 42 | static int devmap_managed_enable_get(struct dev_pagemap *pgmap) |
43 | { | 43 | { |
44 | return -EINVAL; | 44 | return -EINVAL; |
45 | } | 45 | } |
46 | static void devmap_managed_enable_put(void) | ||
47 | { | ||
48 | } | ||
46 | #endif /* CONFIG_DEV_PAGEMAP_OPS */ | 49 | #endif /* CONFIG_DEV_PAGEMAP_OPS */ |
47 | 50 | ||
48 | static void pgmap_array_delete(struct resource *res) | 51 | static void pgmap_array_delete(struct resource *res) |
@@ -99,10 +102,8 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) | |||
99 | pgmap->ref = NULL; | 102 | pgmap->ref = NULL; |
100 | } | 103 | } |
101 | 104 | ||
102 | static void devm_memremap_pages_release(void *data) | 105 | void memunmap_pages(struct dev_pagemap *pgmap) |
103 | { | 106 | { |
104 | struct dev_pagemap *pgmap = data; | ||
105 | struct device *dev = pgmap->dev; | ||
106 | struct resource *res = &pgmap->res; | 107 | struct resource *res = &pgmap->res; |
107 | unsigned long pfn; | 108 | unsigned long pfn; |
108 | int nid; | 109 | int nid; |
@@ -129,8 +130,14 @@ static void devm_memremap_pages_release(void *data) | |||
129 | 130 | ||
130 | untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); | 131 | untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); |
131 | pgmap_array_delete(res); | 132 | pgmap_array_delete(res); |
132 | dev_WARN_ONCE(dev, pgmap->altmap.alloc, | 133 | WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); |
133 | "%s: failed to free all reserved pages\n", __func__); | 134 | devmap_managed_enable_put(); |
135 | } | ||
136 | EXPORT_SYMBOL_GPL(memunmap_pages); | ||
137 | |||
138 | static void devm_memremap_pages_release(void *data) | ||
139 | { | ||
140 | memunmap_pages(data); | ||
134 | } | 141 | } |
135 | 142 | ||
136 | static void dev_pagemap_percpu_release(struct percpu_ref *ref) | 143 | static void dev_pagemap_percpu_release(struct percpu_ref *ref) |
@@ -141,27 +148,12 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref) | |||
141 | complete(&pgmap->done); | 148 | complete(&pgmap->done); |
142 | } | 149 | } |
143 | 150 | ||
144 | /** | 151 | /* |
145 | * devm_memremap_pages - remap and provide memmap backing for the given resource | 152 | * Not device managed version of devm_memremap_pages, undone by
146 | * @dev: hosting device for @res | 153 | * memunmap_pages(). Please use devm_memremap_pages if you have a struct
147 | * @pgmap: pointer to a struct dev_pagemap | 154 | * device available. |
148 | * | ||
149 | * Notes: | ||
150 | * 1/ At a minimum the res and type members of @pgmap must be initialized | ||
151 | * by the caller before passing it to this function | ||
152 | * | ||
153 | * 2/ The altmap field may optionally be initialized, in which case | ||
154 | * PGMAP_ALTMAP_VALID must be set in pgmap->flags. | ||
155 | * | ||
156 | * 3/ The ref field may optionally be provided, in which pgmap->ref must be | ||
157 | * 'live' on entry and will be killed and reaped at | ||
158 | * devm_memremap_pages_release() time, or if this routine fails. | ||
159 | * | ||
160 | * 4/ res is expected to be a host memory range that could feasibly be | ||
161 | * treated as a "System RAM" range, i.e. not a device mmio range, but | ||
162 | * this is not enforced. | ||
163 | */ | 155 | */ |
164 | void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | 156 | void *memremap_pages(struct dev_pagemap *pgmap, int nid) |
165 | { | 157 | { |
166 | struct resource *res = &pgmap->res; | 158 | struct resource *res = &pgmap->res; |
167 | struct dev_pagemap *conflict_pgmap; | 159 | struct dev_pagemap *conflict_pgmap; |
@@ -172,7 +164,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | |||
172 | .altmap = pgmap_altmap(pgmap), | 164 | .altmap = pgmap_altmap(pgmap), |
173 | }; | 165 | }; |
174 | pgprot_t pgprot = PAGE_KERNEL; | 166 | pgprot_t pgprot = PAGE_KERNEL; |
175 | int error, nid, is_ram; | 167 | int error, is_ram; |
176 | bool need_devmap_managed = true; | 168 | bool need_devmap_managed = true; |
177 | 169 | ||
178 | switch (pgmap->type) { | 170 | switch (pgmap->type) { |
@@ -220,14 +212,14 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | |||
220 | } | 212 | } |
221 | 213 | ||
222 | if (need_devmap_managed) { | 214 | if (need_devmap_managed) { |
223 | error = devmap_managed_enable_get(dev, pgmap); | 215 | error = devmap_managed_enable_get(pgmap); |
224 | if (error) | 216 | if (error) |
225 | return ERR_PTR(error); | 217 | return ERR_PTR(error); |
226 | } | 218 | } |
227 | 219 | ||
228 | conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL); | 220 | conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL); |
229 | if (conflict_pgmap) { | 221 | if (conflict_pgmap) { |
230 | dev_WARN(dev, "Conflicting mapping in same section\n"); | 222 | WARN(1, "Conflicting mapping in same section\n"); |
231 | put_dev_pagemap(conflict_pgmap); | 223 | put_dev_pagemap(conflict_pgmap); |
232 | error = -ENOMEM; | 224 | error = -ENOMEM; |
233 | goto err_array; | 225 | goto err_array; |
@@ -235,7 +227,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | |||
235 | 227 | ||
236 | conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL); | 228 | conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL); |
237 | if (conflict_pgmap) { | 229 | if (conflict_pgmap) { |
238 | dev_WARN(dev, "Conflicting mapping in same section\n"); | 230 | WARN(1, "Conflicting mapping in same section\n"); |
239 | put_dev_pagemap(conflict_pgmap); | 231 | put_dev_pagemap(conflict_pgmap); |
240 | error = -ENOMEM; | 232 | error = -ENOMEM; |
241 | goto err_array; | 233 | goto err_array; |
@@ -251,14 +243,11 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | |||
251 | goto err_array; | 243 | goto err_array; |
252 | } | 244 | } |
253 | 245 | ||
254 | pgmap->dev = dev; | ||
255 | |||
256 | error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), | 246 | error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), |
257 | PHYS_PFN(res->end), pgmap, GFP_KERNEL)); | 247 | PHYS_PFN(res->end), pgmap, GFP_KERNEL)); |
258 | if (error) | 248 | if (error) |
259 | goto err_array; | 249 | goto err_array; |
260 | 250 | ||
261 | nid = dev_to_node(dev); | ||
262 | if (nid < 0) | 251 | if (nid < 0) |
263 | nid = numa_mem_id(); | 252 | nid = numa_mem_id(); |
264 | 253 | ||
@@ -314,12 +303,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | |||
314 | PHYS_PFN(res->start), | 303 | PHYS_PFN(res->start), |
315 | PHYS_PFN(resource_size(res)), pgmap); | 304 | PHYS_PFN(resource_size(res)), pgmap); |
316 | percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); | 305 | percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap)); |
317 | |||
318 | error = devm_add_action_or_reset(dev, devm_memremap_pages_release, | ||
319 | pgmap); | ||
320 | if (error) | ||
321 | return ERR_PTR(error); | ||
322 | |||
323 | return __va(res->start); | 306 | return __va(res->start); |
324 | 307 | ||
325 | err_add_memory: | 308 | err_add_memory: |
@@ -331,8 +314,46 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | |||
331 | err_array: | 314 | err_array: |
332 | dev_pagemap_kill(pgmap); | 315 | dev_pagemap_kill(pgmap); |
333 | dev_pagemap_cleanup(pgmap); | 316 | dev_pagemap_cleanup(pgmap); |
317 | devmap_managed_enable_put(); | ||
334 | return ERR_PTR(error); | 318 | return ERR_PTR(error); |
335 | } | 319 | } |
320 | EXPORT_SYMBOL_GPL(memremap_pages); | ||
321 | |||
322 | /** | ||
323 | * devm_memremap_pages - remap and provide memmap backing for the given resource | ||
324 | * @dev: hosting device for @res | ||
325 | * @pgmap: pointer to a struct dev_pagemap | ||
326 | * | ||
327 | * Notes: | ||
328 | * 1/ At a minimum the res and type members of @pgmap must be initialized | ||
329 | * by the caller before passing it to this function | ||
330 | * | ||
331 | * 2/ The altmap field may optionally be initialized, in which case | ||
332 | * PGMAP_ALTMAP_VALID must be set in pgmap->flags. | ||
333 | * | ||
335 | * 3/ The ref field may optionally be provided, in which case pgmap->ref must be | ||
335 | * 'live' on entry and will be killed and reaped at | ||
336 | * devm_memremap_pages_release() time, or if this routine fails. | ||
337 | * | ||
338 | * 4/ res is expected to be a host memory range that could feasibly be | ||
339 | * treated as a "System RAM" range, i.e. not a device mmio range, but | ||
340 | * this is not enforced. | ||
341 | */ | ||
342 | void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | ||
343 | { | ||
344 | int error; | ||
345 | void *ret; | ||
346 | |||
347 | ret = memremap_pages(pgmap, dev_to_node(dev)); | ||
348 | if (IS_ERR(ret)) | ||
349 | return ret; | ||
350 | |||
351 | error = devm_add_action_or_reset(dev, devm_memremap_pages_release, | ||
352 | pgmap); | ||
353 | if (error) | ||
354 | return ERR_PTR(error); | ||
355 | return ret; | ||
356 | } | ||
336 | EXPORT_SYMBOL_GPL(devm_memremap_pages); | 357 | EXPORT_SYMBOL_GPL(devm_memremap_pages); |
337 | 358 | ||
338 | void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) | 359 | void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) |
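The memremap.c hunks split the devm wrapper from a new memremap_pages()/memunmap_pages() pair that works without a struct device, with teardown now explicit instead of devm-managed. A hedged sketch of the device-less path for MEMORY_DEVICE_PRIVATE; the example_* names are illustrative, and the ops shown are the minimum the warnings above check for (page_free, plus migrate_to_ram for device-private memory):

    #include <linux/memremap.h>
    #include <linux/numa.h>
    #include <linux/mm.h>

    static void example_page_free(struct page *page)
    {
            /* driver bookkeeping for a freed device-private page */
    }

    static vm_fault_t example_migrate_to_ram(struct vm_fault *vmf)
    {
            /* a real driver migrates the page back to system memory here */
            return VM_FAULT_SIGBUS;
    }

    static const struct dev_pagemap_ops example_pagemap_ops = {
            .page_free      = example_page_free,
            .migrate_to_ram = example_migrate_to_ram,
    };

    /* Device-less setup: no devm action, so the caller owns the teardown. */
    static void *example_add_device_memory(struct dev_pagemap *pgmap,
                                           struct resource *res)
    {
            pgmap->type = MEMORY_DEVICE_PRIVATE;
            pgmap->res  = *res;
            pgmap->ops  = &example_pagemap_ops;

            /* nid < 0 falls back to numa_mem_id(), as in the hunk above */
            return memremap_pages(pgmap, NUMA_NO_NODE);
    }

    static void example_remove_device_memory(struct dev_pagemap *pgmap)
    {
            memunmap_pages(pgmap);  /* replaces the devm release action */
    }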
diff --git a/mm/migrate.c b/mm/migrate.c index a42858d8e00b..9f4ed4e985c1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include <linux/hugetlb.h> | 38 | #include <linux/hugetlb.h> |
39 | #include <linux/hugetlb_cgroup.h> | 39 | #include <linux/hugetlb_cgroup.h> |
40 | #include <linux/gfp.h> | 40 | #include <linux/gfp.h> |
41 | #include <linux/pagewalk.h> | ||
41 | #include <linux/pfn_t.h> | 42 | #include <linux/pfn_t.h> |
42 | #include <linux/memremap.h> | 43 | #include <linux/memremap.h> |
43 | #include <linux/userfaultfd_k.h> | 44 | #include <linux/userfaultfd_k.h> |
@@ -2119,17 +2120,7 @@ out_unlock: | |||
2119 | 2120 | ||
2120 | #endif /* CONFIG_NUMA */ | 2121 | #endif /* CONFIG_NUMA */ |
2121 | 2122 | ||
2122 | #if defined(CONFIG_MIGRATE_VMA_HELPER) | 2123 | #ifdef CONFIG_DEVICE_PRIVATE |
2123 | struct migrate_vma { | ||
2124 | struct vm_area_struct *vma; | ||
2125 | unsigned long *dst; | ||
2126 | unsigned long *src; | ||
2127 | unsigned long cpages; | ||
2128 | unsigned long npages; | ||
2129 | unsigned long start; | ||
2130 | unsigned long end; | ||
2131 | }; | ||
2132 | |||
2133 | static int migrate_vma_collect_hole(unsigned long start, | 2124 | static int migrate_vma_collect_hole(unsigned long start, |
2134 | unsigned long end, | 2125 | unsigned long end, |
2135 | struct mm_walk *walk) | 2126 | struct mm_walk *walk) |
@@ -2249,8 +2240,8 @@ again: | |||
2249 | goto next; | 2240 | goto next; |
2250 | 2241 | ||
2251 | page = device_private_entry_to_page(entry); | 2242 | page = device_private_entry_to_page(entry); |
2252 | mpfn = migrate_pfn(page_to_pfn(page))| | 2243 | mpfn = migrate_pfn(page_to_pfn(page)) | |
2253 | MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; | 2244 | MIGRATE_PFN_MIGRATE; |
2254 | if (is_write_device_private_entry(entry)) | 2245 | if (is_write_device_private_entry(entry)) |
2255 | mpfn |= MIGRATE_PFN_WRITE; | 2246 | mpfn |= MIGRATE_PFN_WRITE; |
2256 | } else { | 2247 | } else { |
@@ -2329,6 +2320,11 @@ next: | |||
2329 | return 0; | 2320 | return 0; |
2330 | } | 2321 | } |
2331 | 2322 | ||
2323 | static const struct mm_walk_ops migrate_vma_walk_ops = { | ||
2324 | .pmd_entry = migrate_vma_collect_pmd, | ||
2325 | .pte_hole = migrate_vma_collect_hole, | ||
2326 | }; | ||
2327 | |||
2332 | /* | 2328 | /* |
2333 | * migrate_vma_collect() - collect pages over a range of virtual addresses | 2329 | * migrate_vma_collect() - collect pages over a range of virtual addresses |
2334 | * @migrate: migrate struct containing all migration information | 2330 | * @migrate: migrate struct containing all migration information |
@@ -2340,21 +2336,15 @@ next: | |||
2340 | static void migrate_vma_collect(struct migrate_vma *migrate) | 2336 | static void migrate_vma_collect(struct migrate_vma *migrate) |
2341 | { | 2337 | { |
2342 | struct mmu_notifier_range range; | 2338 | struct mmu_notifier_range range; |
2343 | struct mm_walk mm_walk = { | ||
2344 | .pmd_entry = migrate_vma_collect_pmd, | ||
2345 | .pte_hole = migrate_vma_collect_hole, | ||
2346 | .vma = migrate->vma, | ||
2347 | .mm = migrate->vma->vm_mm, | ||
2348 | .private = migrate, | ||
2349 | }; | ||
2350 | 2339 | ||
2351 | mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm, | 2340 | mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, |
2352 | migrate->start, | 2341 | migrate->vma->vm_mm, migrate->start, migrate->end); |
2353 | migrate->end); | ||
2354 | mmu_notifier_invalidate_range_start(&range); | 2342 | mmu_notifier_invalidate_range_start(&range); |
2355 | walk_page_range(migrate->start, migrate->end, &mm_walk); | ||
2356 | mmu_notifier_invalidate_range_end(&range); | ||
2357 | 2343 | ||
2344 | walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, | ||
2345 | &migrate_vma_walk_ops, migrate); | ||
2346 | |||
2347 | mmu_notifier_invalidate_range_end(&range); | ||
2358 | migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); | 2348 | migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); |
2359 | } | 2349 | } |
2360 | 2350 | ||
@@ -2577,6 +2567,110 @@ restore: | |||
2577 | } | 2567 | } |
2578 | } | 2568 | } |
2579 | 2569 | ||
2570 | /** | ||
2571 | * migrate_vma_setup() - prepare to migrate a range of memory | ||
2572 | * @args: contains the vma, start, and pfns arrays for the migration | ||
2573 | * | ||
2574 | * Returns: negative errno on failures, 0 when 0 or more pages were migrated | ||
2575 | * without an error. | ||
2576 | * | ||
2577 | * Prepare to migrate a range of virtual addresses by collecting all | ||
2578 | * the pages backing each virtual address in the range, saving them inside the | ||
2579 | * src array. Then lock those pages and unmap them. Once the pages are locked | ||
2580 | * and unmapped, check whether each page is pinned or not. Pages that aren't | ||
2581 | * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the | ||
2582 | * corresponding src array entry. Then restores any pages that are pinned, by | ||
2583 | * remapping and unlocking those pages. | ||
2584 | * | ||
2585 | * The caller should then allocate destination memory and copy source memory to | ||
2586 | * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE | ||
2587 | * flag set). Once these are allocated and copied, the caller must update each | ||
2588 | * corresponding entry in the dst array with the pfn value of the destination | ||
2589 | * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set | ||
2590 | * (destination pages must have their struct pages locked, via lock_page()). | ||
2591 | * | ||
2592 | * Note that the caller does not have to migrate all the pages that are marked | ||
2593 | * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from | ||
2594 | * device memory to system memory. If the caller cannot migrate a device page | ||
2595 | * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe | ||
2596 | * consequences for the userspace process, so it must be avoided if at all | ||
2597 | * possible. | ||
2598 | * | ||
2599 | * For empty entries inside the CPU page table (pte_none() or pmd_none() is true) | ||
2600 | * we set the MIGRATE_PFN_MIGRATE flag inside the corresponding source array, thus | ||
2601 | * allowing the caller to allocate device memory for those unbacked virtual | ||
2602 | * addresses. For this the caller simply has to allocate device memory and | ||
2603 | * properly set the destination entry like for regular migration. Note that | ||
2604 | * this can still fail and thus the device driver must check if the | ||
2605 | * migration was successful for those entries after calling migrate_vma_pages() | ||
2606 | * just like for regular migration. | ||
2607 | * | ||
2608 | * After that, the callers must call migrate_vma_pages() to go over each entry | ||
2609 | * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag | ||
2610 | * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, | ||
2611 | * then migrate_vma_pages() to migrate struct page information from the source | ||
2612 | * struct page to the destination struct page. If it fails to migrate the | ||
2613 | * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the | ||
2614 | * src array. | ||
2615 | * | ||
2616 | * At this point all successfully migrated pages have an entry in the src | ||
2617 | * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst | ||
2618 | * array entry with MIGRATE_PFN_VALID flag set. | ||
2619 | * | ||
2620 | * Once migrate_vma_pages() returns the caller may inspect which pages were | ||
2621 | * successfully migrated, and which were not. Successfully migrated pages will | ||
2622 | * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. | ||
2623 | * | ||
2624 | * It is safe to update device page table after migrate_vma_pages() because | ||
2625 | * both destination and source page are still locked, and the mmap_sem is held | ||
2626 | * in read mode (hence no one can unmap the range being migrated). | ||
2627 | * | ||
2628 | * Once the caller is done cleaning up things and updating its page table (if it | ||
2629 | * chose to do so, this is not an obligation) it finally calls | ||
2630 | * migrate_vma_finalize() to update the CPU page table to point to new pages | ||
2631 | * for successfully migrated pages or otherwise restore the CPU page table to | ||
2632 | * point to the original source pages. | ||
2633 | */ | ||
2634 | int migrate_vma_setup(struct migrate_vma *args) | ||
2635 | { | ||
2636 | long nr_pages = (args->end - args->start) >> PAGE_SHIFT; | ||
2637 | |||
2638 | args->start &= PAGE_MASK; | ||
2639 | args->end &= PAGE_MASK; | ||
2640 | if (!args->vma || is_vm_hugetlb_page(args->vma) || | ||
2641 | (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) | ||
2642 | return -EINVAL; | ||
2643 | if (nr_pages <= 0) | ||
2644 | return -EINVAL; | ||
2645 | if (args->start < args->vma->vm_start || | ||
2646 | args->start >= args->vma->vm_end) | ||
2647 | return -EINVAL; | ||
2648 | if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) | ||
2649 | return -EINVAL; | ||
2650 | if (!args->src || !args->dst) | ||
2651 | return -EINVAL; | ||
2652 | |||
2653 | memset(args->src, 0, sizeof(*args->src) * nr_pages); | ||
2654 | args->cpages = 0; | ||
2655 | args->npages = 0; | ||
2656 | |||
2657 | migrate_vma_collect(args); | ||
2658 | |||
2659 | if (args->cpages) | ||
2660 | migrate_vma_prepare(args); | ||
2661 | if (args->cpages) | ||
2662 | migrate_vma_unmap(args); | ||
2663 | |||
2664 | /* | ||
2665 | * At this point pages are locked and unmapped, and thus they have | ||
2666 | * stable content and can safely be copied to destination memory that | ||
2667 | * is allocated by the drivers. | ||
2668 | */ | ||
2669 | return 0; | ||
2670 | |||
2671 | } | ||
2672 | EXPORT_SYMBOL(migrate_vma_setup); | ||
2673 | |||
2580 | static void migrate_vma_insert_page(struct migrate_vma *migrate, | 2674 | static void migrate_vma_insert_page(struct migrate_vma *migrate, |
2581 | unsigned long addr, | 2675 | unsigned long addr, |
2582 | struct page *page, | 2676 | struct page *page, |
@@ -2708,7 +2802,7 @@ abort: | |||
2708 | *src &= ~MIGRATE_PFN_MIGRATE; | 2802 | *src &= ~MIGRATE_PFN_MIGRATE; |
2709 | } | 2803 | } |
2710 | 2804 | ||
2711 | /* | 2805 | /** |
2712 | * migrate_vma_pages() - migrate meta-data from src page to dst page | 2806 | * migrate_vma_pages() - migrate meta-data from src page to dst page |
2713 | * @migrate: migrate struct containing all migration information | 2807 | * @migrate: migrate struct containing all migration information |
2714 | * | 2808 | * |
@@ -2716,7 +2810,7 @@ abort: | |||
2716 | * struct page. This effectively finishes the migration from source page to the | 2810 | * struct page. This effectively finishes the migration from source page to the |
2717 | * destination page. | 2811 | * destination page. |
2718 | */ | 2812 | */ |
2719 | static void migrate_vma_pages(struct migrate_vma *migrate) | 2813 | void migrate_vma_pages(struct migrate_vma *migrate) |
2720 | { | 2814 | { |
2721 | const unsigned long npages = migrate->npages; | 2815 | const unsigned long npages = migrate->npages; |
2722 | const unsigned long start = migrate->start; | 2816 | const unsigned long start = migrate->start; |
@@ -2790,8 +2884,9 @@ static void migrate_vma_pages(struct migrate_vma *migrate) | |||
2790 | if (notified) | 2884 | if (notified) |
2791 | mmu_notifier_invalidate_range_only_end(&range); | 2885 | mmu_notifier_invalidate_range_only_end(&range); |
2792 | } | 2886 | } |
2887 | EXPORT_SYMBOL(migrate_vma_pages); | ||
2793 | 2888 | ||
2794 | /* | 2889 | /** |
2795 | * migrate_vma_finalize() - restore CPU page table entry | 2890 | * migrate_vma_finalize() - restore CPU page table entry |
2796 | * @migrate: migrate struct containing all migration information | 2891 | * @migrate: migrate struct containing all migration information |
2797 | * | 2892 | * |
@@ -2802,7 +2897,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) | |||
2802 | * This also unlocks the pages and puts them back on the lru, or drops the extra | 2897 | * This also unlocks the pages and puts them back on the lru, or drops the extra |
2803 | * refcount, for device pages. | 2898 | * refcount, for device pages. |
2804 | */ | 2899 | */ |
2805 | static void migrate_vma_finalize(struct migrate_vma *migrate) | 2900 | void migrate_vma_finalize(struct migrate_vma *migrate) |
2806 | { | 2901 | { |
2807 | const unsigned long npages = migrate->npages; | 2902 | const unsigned long npages = migrate->npages; |
2808 | unsigned long i; | 2903 | unsigned long i; |
@@ -2845,124 +2940,5 @@ static void migrate_vma_finalize(struct migrate_vma *migrate) | |||
2845 | } | 2940 | } |
2846 | } | 2941 | } |
2847 | } | 2942 | } |
2848 | 2943 | EXPORT_SYMBOL(migrate_vma_finalize); | |
2849 | /* | 2944 | #endif /* CONFIG_DEVICE_PRIVATE */ |
2850 | * migrate_vma() - migrate a range of memory inside vma | ||
2851 | * | ||
2852 | * @ops: migration callback for allocating destination memory and copying | ||
2853 | * @vma: virtual memory area containing the range to be migrated | ||
2854 | * @start: start address of the range to migrate (inclusive) | ||
2855 | * @end: end address of the range to migrate (exclusive) | ||
2856 | * @src: array of hmm_pfn_t containing source pfns | ||
2857 | * @dst: array of hmm_pfn_t containing destination pfns | ||
2858 | * @private: pointer passed back to each of the callback | ||
2859 | * Returns: 0 on success, error code otherwise | ||
2860 | * | ||
2861 | * This function tries to migrate a range of memory virtual address range, using | ||
2862 | * callbacks to allocate and copy memory from source to destination. First it | ||
2863 | * collects all the pages backing each virtual address in the range, saving this | ||
2864 | * inside the src array. Then it locks those pages and unmaps them. Once the pages | ||
2865 | * are locked and unmapped, it checks whether each page is pinned or not. Pages | ||
2866 | * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) | ||
2867 | * in the corresponding src array entry. It then restores any pages that are | ||
2868 | * pinned, by remapping and unlocking those pages. | ||
2869 | * | ||
2870 | * At this point it calls the alloc_and_copy() callback. For documentation on | ||
2871 | * what is expected from that callback, see struct migrate_vma_ops comments in | ||
2872 | * include/linux/migrate.h | ||
2873 | * | ||
2874 | * After the alloc_and_copy() callback, this function goes over each entry in | ||
2875 | * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag | ||
2876 | * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, | ||
2877 | * then the function tries to migrate struct page information from the source | ||
2878 | * struct page to the destination struct page. If it fails to migrate the struct | ||
2879 | * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src | ||
2880 | * array. | ||
2881 | * | ||
2882 | * At this point all successfully migrated pages have an entry in the src | ||
2883 | * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst | ||
2884 | * array entry with MIGRATE_PFN_VALID flag set. | ||
2885 | * | ||
2886 | * It then calls the finalize_and_map() callback. See comments for "struct | ||
2887 | * migrate_vma_ops", in include/linux/migrate.h for details about | ||
2888 | * finalize_and_map() behavior. | ||
2889 | * | ||
2890 | * After the finalize_and_map() callback, for successfully migrated pages, this | ||
2891 | * function updates the CPU page table to point to new pages, otherwise it | ||
2892 | * restores the CPU page table to point to the original source pages. | ||
2893 | * | ||
2894 | * Function returns 0 after the above steps, even if no pages were migrated | ||
2895 | * (The function only returns an error if any of the arguments are invalid.) | ||
2896 | * | ||
2897 | * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT | ||
2898 | * unsigned long entries. | ||
2899 | */ | ||
2900 | int migrate_vma(const struct migrate_vma_ops *ops, | ||
2901 | struct vm_area_struct *vma, | ||
2902 | unsigned long start, | ||
2903 | unsigned long end, | ||
2904 | unsigned long *src, | ||
2905 | unsigned long *dst, | ||
2906 | void *private) | ||
2907 | { | ||
2908 | struct migrate_vma migrate; | ||
2909 | |||
2910 | /* Sanity check the arguments */ | ||
2911 | start &= PAGE_MASK; | ||
2912 | end &= PAGE_MASK; | ||
2913 | if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || | ||
2914 | vma_is_dax(vma)) | ||
2915 | return -EINVAL; | ||
2916 | if (start < vma->vm_start || start >= vma->vm_end) | ||
2917 | return -EINVAL; | ||
2918 | if (end <= vma->vm_start || end > vma->vm_end) | ||
2919 | return -EINVAL; | ||
2920 | if (!ops || !src || !dst || start >= end) | ||
2921 | return -EINVAL; | ||
2922 | |||
2923 | memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT)); | ||
2924 | migrate.src = src; | ||
2925 | migrate.dst = dst; | ||
2926 | migrate.start = start; | ||
2927 | migrate.npages = 0; | ||
2928 | migrate.cpages = 0; | ||
2929 | migrate.end = end; | ||
2930 | migrate.vma = vma; | ||
2931 | |||
2932 | /* Collect, and try to unmap source pages */ | ||
2933 | migrate_vma_collect(&migrate); | ||
2934 | if (!migrate.cpages) | ||
2935 | return 0; | ||
2936 | |||
2937 | /* Lock and isolate page */ | ||
2938 | migrate_vma_prepare(&migrate); | ||
2939 | if (!migrate.cpages) | ||
2940 | return 0; | ||
2941 | |||
2942 | /* Unmap pages */ | ||
2943 | migrate_vma_unmap(&migrate); | ||
2944 | if (!migrate.cpages) | ||
2945 | return 0; | ||
2946 | |||
2947 | /* | ||
2948 | * At this point pages are locked and unmapped, and thus they have | ||
2949 | * stable content and can safely be copied to destination memory that | ||
2950 | * is allocated by the callback. | ||
2951 | * | ||
2952 | * Note that migration can fail in migrate_vma_struct_page() for each | ||
2953 | * individual page. | ||
2954 | */ | ||
2955 | ops->alloc_and_copy(vma, src, dst, start, end, private); | ||
2956 | |||
2957 | /* This does the real migration of struct page */ | ||
2958 | migrate_vma_pages(&migrate); | ||
2959 | |||
2960 | ops->finalize_and_map(vma, src, dst, start, end, private); | ||
2961 | |||
2962 | /* Unlock and remap pages */ | ||
2963 | migrate_vma_finalize(&migrate); | ||
2964 | |||
2965 | return 0; | ||
2966 | } | ||
2967 | EXPORT_SYMBOL(migrate_vma); | ||
2968 | #endif /* defined(MIGRATE_VMA_HELPER) */ | ||
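The new kernel-doc block earlier in this file describes a three-step flow (migrate_vma_setup(), migrate_vma_pages(), migrate_vma_finalize()) replacing the callback-based migrate_vma() removed above. Below is a minimal sketch of how a caller might drive that flow; my_drv_migrate() and its simplistic system-memory destination allocation are illustrative assumptions, not the actual nouveau conversion from this series.

#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/slab.h>

/* Hypothetical caller of the new three-step API; not taken from this series. */
static int my_drv_migrate(struct vm_area_struct *vma,
			  unsigned long start, unsigned long end)
{
	unsigned long npages = (end - start) >> PAGE_SHIFT;
	struct migrate_vma args = {
		.vma	= vma,
		.start	= start,
		.end	= end,
	};
	unsigned long i;
	int ret = -ENOMEM;

	args.src = kcalloc(npages, sizeof(*args.src), GFP_KERNEL);
	args.dst = kcalloc(npages, sizeof(*args.dst), GFP_KERNEL);
	if (!args.src || !args.dst)
		goto out;

	/* Step 1: collect, lock and unmap the source pages. */
	ret = migrate_vma_setup(&args);
	if (ret)
		goto out;

	for (i = 0; i < npages; i++) {
		struct page *dpage;

		if (!(args.src[i] & MIGRATE_PFN_MIGRATE))
			continue;	/* this entry cannot be migrated */

		dpage = alloc_page_vma(GFP_HIGHUSER, vma,
				       start + (i << PAGE_SHIFT));
		if (!dpage)
			continue;	/* skipped entries are simply restored */
		lock_page(dpage);

		/* The data copy from the (stable) source page would go here. */
		args.dst[i] = migrate_pfn(page_to_pfn(dpage)) |
			      MIGRATE_PFN_LOCKED;
	}

	/* Step 2: move struct page metadata; failures clear MIGRATE_PFN_MIGRATE. */
	migrate_vma_pages(&args);

	/* Device page tables may safely be updated here: pages are still locked. */

	/* Step 3: remap the CPU page tables and unlock/put the pages. */
	migrate_vma_finalize(&args);
	ret = 0;
out:
	kfree(args.src);
	kfree(args.dst);
	return ret;
}

The key difference from the removed migrate_vma() above is that destination allocation and copying now happen inline between the calls, instead of through struct migrate_vma_ops callbacks.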
diff --git a/mm/mincore.c b/mm/mincore.c index 4fe91d497436..f9a9dbe8cd33 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -10,7 +10,7 @@ | |||
10 | */ | 10 | */ |
11 | #include <linux/pagemap.h> | 11 | #include <linux/pagemap.h> |
12 | #include <linux/gfp.h> | 12 | #include <linux/gfp.h> |
13 | #include <linux/mm.h> | 13 | #include <linux/pagewalk.h> |
14 | #include <linux/mman.h> | 14 | #include <linux/mman.h> |
15 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
16 | #include <linux/swap.h> | 16 | #include <linux/swap.h> |
@@ -193,6 +193,12 @@ static inline bool can_do_mincore(struct vm_area_struct *vma) | |||
193 | inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; | 193 | inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; |
194 | } | 194 | } |
195 | 195 | ||
196 | static const struct mm_walk_ops mincore_walk_ops = { | ||
197 | .pmd_entry = mincore_pte_range, | ||
198 | .pte_hole = mincore_unmapped_range, | ||
199 | .hugetlb_entry = mincore_hugetlb, | ||
200 | }; | ||
201 | |||
196 | /* | 202 | /* |
197 | * Do a chunk of "sys_mincore()". We've already checked | 203 | * Do a chunk of "sys_mincore()". We've already checked |
198 | * all the arguments, we hold the mmap semaphore: we should | 204 | * all the arguments, we hold the mmap semaphore: we should |
@@ -203,12 +209,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v | |||
203 | struct vm_area_struct *vma; | 209 | struct vm_area_struct *vma; |
204 | unsigned long end; | 210 | unsigned long end; |
205 | int err; | 211 | int err; |
206 | struct mm_walk mincore_walk = { | ||
207 | .pmd_entry = mincore_pte_range, | ||
208 | .pte_hole = mincore_unmapped_range, | ||
209 | .hugetlb_entry = mincore_hugetlb, | ||
210 | .private = vec, | ||
211 | }; | ||
212 | 212 | ||
213 | vma = find_vma(current->mm, addr); | 213 | vma = find_vma(current->mm, addr); |
214 | if (!vma || addr < vma->vm_start) | 214 | if (!vma || addr < vma->vm_start) |
@@ -219,8 +219,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v | |||
219 | memset(vec, 1, pages); | 219 | memset(vec, 1, pages); |
220 | return pages; | 220 | return pages; |
221 | } | 221 | } |
222 | mincore_walk.mm = vma->vm_mm; | 222 | err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec); |
223 | err = walk_page_range(addr, end, &mincore_walk); | ||
224 | if (err < 0) | 223 | if (err < 0) |
225 | return err; | 224 | return err; |
226 | return (end - addr) >> PAGE_SHIFT; | 225 | return (end - addr) >> PAGE_SHIFT; |
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index b5670620aea0..7fde88695f35 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -21,17 +21,11 @@ | |||
21 | /* global SRCU for all MMs */ | 21 | /* global SRCU for all MMs */ |
22 | DEFINE_STATIC_SRCU(srcu); | 22 | DEFINE_STATIC_SRCU(srcu); |
23 | 23 | ||
24 | /* | 24 | #ifdef CONFIG_LOCKDEP |
25 | * This function allows mmu_notifier::release callback to delay a call to | 25 | struct lockdep_map __mmu_notifier_invalidate_range_start_map = { |
26 | * a function that will free appropriate resources. The function must be | 26 | .name = "mmu_notifier_invalidate_range_start" |
27 | * quick and must not block. | 27 | }; |
28 | */ | 28 | #endif |
29 | void mmu_notifier_call_srcu(struct rcu_head *rcu, | ||
30 | void (*func)(struct rcu_head *rcu)) | ||
31 | { | ||
32 | call_srcu(&srcu, rcu, func); | ||
33 | } | ||
34 | EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); | ||
35 | 29 | ||
36 | /* | 30 | /* |
37 | * This function can't run concurrently against mmu_notifier_register | 31 | * This function can't run concurrently against mmu_notifier_register |
@@ -174,11 +168,19 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) | |||
174 | id = srcu_read_lock(&srcu); | 168 | id = srcu_read_lock(&srcu); |
175 | hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { | 169 | hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { |
176 | if (mn->ops->invalidate_range_start) { | 170 | if (mn->ops->invalidate_range_start) { |
177 | int _ret = mn->ops->invalidate_range_start(mn, range); | 171 | int _ret; |
172 | |||
173 | if (!mmu_notifier_range_blockable(range)) | ||
174 | non_block_start(); | ||
175 | _ret = mn->ops->invalidate_range_start(mn, range); | ||
176 | if (!mmu_notifier_range_blockable(range)) | ||
177 | non_block_end(); | ||
178 | if (_ret) { | 178 | if (_ret) { |
179 | pr_info("%pS callback failed with %d in %sblockable context.\n", | 179 | pr_info("%pS callback failed with %d in %sblockable context.\n", |
180 | mn->ops->invalidate_range_start, _ret, | 180 | mn->ops->invalidate_range_start, _ret, |
181 | !mmu_notifier_range_blockable(range) ? "non-" : ""); | 181 | !mmu_notifier_range_blockable(range) ? "non-" : ""); |
182 | WARN_ON(mmu_notifier_range_blockable(range) || | ||
183 | ret != -EAGAIN); | ||
182 | ret = _ret; | 184 | ret = _ret; |
183 | } | 185 | } |
184 | } | 186 | } |
@@ -187,7 +189,6 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) | |||
187 | 189 | ||
188 | return ret; | 190 | return ret; |
189 | } | 191 | } |
190 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); | ||
191 | 192 | ||
192 | void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, | 193 | void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, |
193 | bool only_end) | 194 | bool only_end) |
@@ -195,6 +196,7 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, | |||
195 | struct mmu_notifier *mn; | 196 | struct mmu_notifier *mn; |
196 | int id; | 197 | int id; |
197 | 198 | ||
199 | lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); | ||
198 | id = srcu_read_lock(&srcu); | 200 | id = srcu_read_lock(&srcu); |
199 | hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { | 201 | hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { |
200 | /* | 202 | /* |
@@ -214,12 +216,17 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, | |||
214 | mn->ops->invalidate_range(mn, range->mm, | 216 | mn->ops->invalidate_range(mn, range->mm, |
215 | range->start, | 217 | range->start, |
216 | range->end); | 218 | range->end); |
217 | if (mn->ops->invalidate_range_end) | 219 | if (mn->ops->invalidate_range_end) { |
220 | if (!mmu_notifier_range_blockable(range)) | ||
221 | non_block_start(); | ||
218 | mn->ops->invalidate_range_end(mn, range); | 222 | mn->ops->invalidate_range_end(mn, range); |
223 | if (!mmu_notifier_range_blockable(range)) | ||
224 | non_block_end(); | ||
225 | } | ||
219 | } | 226 | } |
220 | srcu_read_unlock(&srcu, id); | 227 | srcu_read_unlock(&srcu, id); |
228 | lock_map_release(&__mmu_notifier_invalidate_range_start_map); | ||
221 | } | 229 | } |
222 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end); | ||
223 | 230 | ||
224 | void __mmu_notifier_invalidate_range(struct mm_struct *mm, | 231 | void __mmu_notifier_invalidate_range(struct mm_struct *mm, |
225 | unsigned long start, unsigned long end) | 232 | unsigned long start, unsigned long end) |
@@ -234,35 +241,49 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, | |||
234 | } | 241 | } |
235 | srcu_read_unlock(&srcu, id); | 242 | srcu_read_unlock(&srcu, id); |
236 | } | 243 | } |
237 | EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); | ||
238 | 244 | ||
239 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | 245 | /* |
240 | struct mm_struct *mm, | 246 | * Same as mmu_notifier_register but here the caller must hold the |
241 | int take_mmap_sem) | 247 | * mmap_sem in write mode. |
248 | */ | ||
249 | int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) | ||
242 | { | 250 | { |
243 | struct mmu_notifier_mm *mmu_notifier_mm; | 251 | struct mmu_notifier_mm *mmu_notifier_mm = NULL; |
244 | int ret; | 252 | int ret; |
245 | 253 | ||
254 | lockdep_assert_held_write(&mm->mmap_sem); | ||
246 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | 255 | BUG_ON(atomic_read(&mm->mm_users) <= 0); |
247 | 256 | ||
248 | ret = -ENOMEM; | 257 | if (IS_ENABLED(CONFIG_LOCKDEP)) { |
249 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); | 258 | fs_reclaim_acquire(GFP_KERNEL); |
250 | if (unlikely(!mmu_notifier_mm)) | 259 | lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); |
251 | goto out; | 260 | lock_map_release(&__mmu_notifier_invalidate_range_start_map); |
261 | fs_reclaim_release(GFP_KERNEL); | ||
262 | } | ||
252 | 263 | ||
253 | if (take_mmap_sem) | 264 | mn->mm = mm; |
254 | down_write(&mm->mmap_sem); | 265 | mn->users = 1; |
255 | ret = mm_take_all_locks(mm); | 266 | |
256 | if (unlikely(ret)) | 267 | if (!mm->mmu_notifier_mm) { |
257 | goto out_clean; | 268 | /* |
269 | * kmalloc cannot be called under mm_take_all_locks(), but we | ||
270 | * know that mm->mmu_notifier_mm can't change while we hold | ||
271 | * the write side of the mmap_sem. | ||
272 | */ | ||
273 | mmu_notifier_mm = | ||
274 | kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); | ||
275 | if (!mmu_notifier_mm) | ||
276 | return -ENOMEM; | ||
258 | 277 | ||
259 | if (!mm_has_notifiers(mm)) { | ||
260 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); | 278 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); |
261 | spin_lock_init(&mmu_notifier_mm->lock); | 279 | spin_lock_init(&mmu_notifier_mm->lock); |
262 | |||
263 | mm->mmu_notifier_mm = mmu_notifier_mm; | ||
264 | mmu_notifier_mm = NULL; | ||
265 | } | 280 | } |
281 | |||
282 | ret = mm_take_all_locks(mm); | ||
283 | if (unlikely(ret)) | ||
284 | goto out_clean; | ||
285 | |||
286 | /* Pairs with the mmdrop in mmu_notifier_unregister_* */ | ||
266 | mmgrab(mm); | 287 | mmgrab(mm); |
267 | 288 | ||
268 | /* | 289 | /* |
@@ -273,48 +294,118 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, | |||
273 | * We can't race against any other mmu notifier method either | 294 | * We can't race against any other mmu notifier method either |
274 | * thanks to mm_take_all_locks(). | 295 | * thanks to mm_take_all_locks(). |
275 | */ | 296 | */ |
297 | if (mmu_notifier_mm) | ||
298 | mm->mmu_notifier_mm = mmu_notifier_mm; | ||
299 | |||
276 | spin_lock(&mm->mmu_notifier_mm->lock); | 300 | spin_lock(&mm->mmu_notifier_mm->lock); |
277 | hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); | 301 | hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); |
278 | spin_unlock(&mm->mmu_notifier_mm->lock); | 302 | spin_unlock(&mm->mmu_notifier_mm->lock); |
279 | 303 | ||
280 | mm_drop_all_locks(mm); | 304 | mm_drop_all_locks(mm); |
305 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | ||
306 | return 0; | ||
307 | |||
281 | out_clean: | 308 | out_clean: |
282 | if (take_mmap_sem) | ||
283 | up_write(&mm->mmap_sem); | ||
284 | kfree(mmu_notifier_mm); | 309 | kfree(mmu_notifier_mm); |
285 | out: | ||
286 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | ||
287 | return ret; | 310 | return ret; |
288 | } | 311 | } |
312 | EXPORT_SYMBOL_GPL(__mmu_notifier_register); | ||
289 | 313 | ||
290 | /* | 314 | /** |
315 | * mmu_notifier_register - Register a notifier on a mm | ||
316 | * @mn: The notifier to attach | ||
317 | * @mm: The mm to attach the notifier to | ||
318 | * | ||
291 | * Must not hold mmap_sem nor any other VM related lock when calling | 319 | * Must not hold mmap_sem nor any other VM related lock when calling |
292 | * this registration function. Must also ensure mm_users can't go down | 320 | * this registration function. Must also ensure mm_users can't go down |
293 | * to zero while this runs to avoid races with mmu_notifier_release, | 321 | * to zero while this runs to avoid races with mmu_notifier_release, |
294 | * so mm has to be current->mm or the mm should be pinned safely such | 322 | * so mm has to be current->mm or the mm should be pinned safely such |
295 | * as with get_task_mm(). If the mm is not current->mm, the mm_users | 323 | * as with get_task_mm(). If the mm is not current->mm, the mm_users |
296 | * pin should be released by calling mmput after mmu_notifier_register | 324 | * pin should be released by calling mmput after mmu_notifier_register |
297 | * returns. mmu_notifier_unregister must be always called to | 325 | * returns. |
298 | * unregister the notifier. mm_count is automatically pinned to allow | 326 | * |
299 | * mmu_notifier_unregister to safely run at any time later, before or | 327 | * mmu_notifier_unregister() or mmu_notifier_put() must always be called to |
300 | * after exit_mmap. ->release will always be called before exit_mmap | 328 | * unregister the notifier. |
301 | * frees the pages. | 329 | * |
330 | * While the caller has a mmu_notifier get, the mn->mm pointer will remain | ||
331 | * valid, and can be converted to an active mm pointer via mmget_not_zero(). | ||
302 | */ | 332 | */ |
303 | int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) | 333 | int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) |
304 | { | 334 | { |
305 | return do_mmu_notifier_register(mn, mm, 1); | 335 | int ret; |
336 | |||
337 | down_write(&mm->mmap_sem); | ||
338 | ret = __mmu_notifier_register(mn, mm); | ||
339 | up_write(&mm->mmap_sem); | ||
340 | return ret; | ||
306 | } | 341 | } |
307 | EXPORT_SYMBOL_GPL(mmu_notifier_register); | 342 | EXPORT_SYMBOL_GPL(mmu_notifier_register); |
308 | 343 | ||
309 | /* | 344 | static struct mmu_notifier * |
310 | * Same as mmu_notifier_register but here the caller must hold the | 345 | find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops) |
311 | * mmap_sem in write mode. | 346 | { |
347 | struct mmu_notifier *mn; | ||
348 | |||
349 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
350 | hlist_for_each_entry_rcu (mn, &mm->mmu_notifier_mm->list, hlist) { | ||
351 | if (mn->ops != ops) | ||
352 | continue; | ||
353 | |||
354 | if (likely(mn->users != UINT_MAX)) | ||
355 | mn->users++; | ||
356 | else | ||
357 | mn = ERR_PTR(-EOVERFLOW); | ||
358 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
359 | return mn; | ||
360 | } | ||
361 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
362 | return NULL; | ||
363 | } | ||
364 | |||
365 | /** | ||
366 | * mmu_notifier_get_locked - Return the single struct mmu_notifier for | ||
367 | * the mm & ops | ||
368 | * @ops: The operations struct being subscribed with | ||
369 | * @mm: The mm to attach notifiers to | ||
370 | * | ||
371 | * This function either allocates a new mmu_notifier via | ||
372 | * ops->alloc_notifier(), or returns an already existing notifier on the | ||
373 | * list. The value of the ops pointer is used to determine when two notifiers | ||
374 | * are the same. | ||
375 | * | ||
376 | * Each call to mmu_notifier_get() must be paired with a call to | ||
377 | * mmu_notifier_put(). The caller must hold the write side of mm->mmap_sem. | ||
378 | * | ||
379 | * While the caller has a mmu_notifier get, the mm pointer will remain valid, | ||
380 | * and can be converted to an active mm pointer via mmget_not_zero(). | ||
312 | */ | 381 | */ |
313 | int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) | 382 | struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, |
383 | struct mm_struct *mm) | ||
314 | { | 384 | { |
315 | return do_mmu_notifier_register(mn, mm, 0); | 385 | struct mmu_notifier *mn; |
386 | int ret; | ||
387 | |||
388 | lockdep_assert_held_write(&mm->mmap_sem); | ||
389 | |||
390 | if (mm->mmu_notifier_mm) { | ||
391 | mn = find_get_mmu_notifier(mm, ops); | ||
392 | if (mn) | ||
393 | return mn; | ||
394 | } | ||
395 | |||
396 | mn = ops->alloc_notifier(mm); | ||
397 | if (IS_ERR(mn)) | ||
398 | return mn; | ||
399 | mn->ops = ops; | ||
400 | ret = __mmu_notifier_register(mn, mm); | ||
401 | if (ret) | ||
402 | goto out_free; | ||
403 | return mn; | ||
404 | out_free: | ||
405 | mn->ops->free_notifier(mn); | ||
406 | return ERR_PTR(ret); | ||
316 | } | 407 | } |
317 | EXPORT_SYMBOL_GPL(__mmu_notifier_register); | 408 | EXPORT_SYMBOL_GPL(mmu_notifier_get_locked); |
318 | 409 | ||
319 | /* this is called after the last mmu_notifier_unregister() returned */ | 410 | /* this is called after the last mmu_notifier_unregister() returned */ |
320 | void __mmu_notifier_mm_destroy(struct mm_struct *mm) | 411 | void __mmu_notifier_mm_destroy(struct mm_struct *mm) |
@@ -375,24 +466,74 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
375 | } | 466 | } |
376 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); | 467 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); |
377 | 468 | ||
378 | /* | 469 | static void mmu_notifier_free_rcu(struct rcu_head *rcu) |
379 | * Same as mmu_notifier_unregister but no callback and no srcu synchronization. | 470 | { |
471 | struct mmu_notifier *mn = container_of(rcu, struct mmu_notifier, rcu); | ||
472 | struct mm_struct *mm = mn->mm; | ||
473 | |||
474 | mn->ops->free_notifier(mn); | ||
475 | /* Pairs with the get in __mmu_notifier_register() */ | ||
476 | mmdrop(mm); | ||
477 | } | ||
478 | |||
479 | /** | ||
480 | * mmu_notifier_put - Release the reference on the notifier | ||
481 | * @mn: The notifier to act on | ||
482 | * | ||
483 | * This function must be paired with each mmu_notifier_get(), it releases the | ||
484 | * reference obtained by the get. If this is the last reference then the | ||
485 | * process to free the notifier will be run asynchronously. | ||
486 | * | ||
487 | * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release | ||
488 | * when the mm_struct is destroyed. Instead free_notifier is always called to | ||
489 | * release any resources held by the user. | ||
490 | * | ||
491 | * As ops->release is not guaranteed to be called, the user must ensure that | ||
492 | * all sptes are dropped, and no new sptes can be established before | ||
493 | * mmu_notifier_put() is called. | ||
494 | * | ||
495 | * This function can be called from the ops->release callback, however the | ||
496 | * caller must still ensure it is called pairwise with mmu_notifier_get(). | ||
497 | * | ||
498 | * Modules calling this function must call mmu_notifier_synchronize() in | ||
499 | * their __exit functions to ensure the async work is completed. | ||
380 | */ | 500 | */ |
381 | void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, | 501 | void mmu_notifier_put(struct mmu_notifier *mn) |
382 | struct mm_struct *mm) | ||
383 | { | 502 | { |
503 | struct mm_struct *mm = mn->mm; | ||
504 | |||
384 | spin_lock(&mm->mmu_notifier_mm->lock); | 505 | spin_lock(&mm->mmu_notifier_mm->lock); |
385 | /* | 506 | if (WARN_ON(!mn->users) || --mn->users) |
386 | * Can not use list_del_rcu() since __mmu_notifier_release | 507 | goto out_unlock; |
387 | * can delete it before we hold the lock. | ||
388 | */ | ||
389 | hlist_del_init_rcu(&mn->hlist); | 508 | hlist_del_init_rcu(&mn->hlist); |
390 | spin_unlock(&mm->mmu_notifier_mm->lock); | 509 | spin_unlock(&mm->mmu_notifier_mm->lock); |
391 | 510 | ||
392 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 511 | call_srcu(&srcu, &mn->rcu, mmu_notifier_free_rcu); |
393 | mmdrop(mm); | 512 | return; |
513 | |||
514 | out_unlock: | ||
515 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
516 | } | ||
517 | EXPORT_SYMBOL_GPL(mmu_notifier_put); | ||
518 | |||
519 | /** | ||
520 | * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed | ||
521 | * | ||
522 | * This function ensures that all outstanding async SRCU work from | ||
523 | * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops | ||
524 | * associated with an unused mmu_notifier will no longer be called. | ||
525 | * | ||
526 | * Before using the caller must ensure that all of its mmu_notifiers have been | ||
527 | * fully released via mmu_notifier_put(). | ||
528 | * | ||
529 | * Modules using the mmu_notifier_put() API should call this in their __exit | ||
530 | * function to avoid module unloading races. | ||
531 | */ | ||
532 | void mmu_notifier_synchronize(void) | ||
533 | { | ||
534 | synchronize_srcu(&srcu); | ||
394 | } | 535 | } |
395 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); | 536 | EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); |
396 | 537 | ||
397 | bool | 538 | bool |
398 | mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) | 539 | mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) |
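To illustrate the refcounted get/put attachment idiom introduced in this file, here is a minimal sketch of a user; struct my_mn_ctx, the helper names and the omitted invalidate callbacks are assumptions for illustration only, not code from this series.

#include <linux/mmu_notifier.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/err.h>

/* Hypothetical per-mm context embedding the notifier. */
struct my_mn_ctx {
	struct mmu_notifier mn;
	/* driver-private state would follow */
};

static struct mmu_notifier *my_mn_alloc_notifier(struct mm_struct *mm)
{
	struct my_mn_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return ERR_PTR(-ENOMEM);
	return &ctx->mn;
}

static void my_mn_free_notifier(struct mmu_notifier *mn)
{
	kfree(container_of(mn, struct my_mn_ctx, mn));
}

static const struct mmu_notifier_ops my_mn_ops = {
	/* invalidate callbacks omitted for brevity */
	.alloc_notifier	= my_mn_alloc_notifier,
	.free_notifier	= my_mn_free_notifier,
};

/* Returns the single per-(mm, ops) context, creating it on first use. */
static struct my_mn_ctx *my_mn_get(struct mm_struct *mm)
{
	struct mmu_notifier *mn;

	down_write(&mm->mmap_sem);
	mn = mmu_notifier_get_locked(&my_mn_ops, mm);
	up_write(&mm->mmap_sem);
	if (IS_ERR(mn))
		return ERR_CAST(mn);
	return container_of(mn, struct my_mn_ctx, mn);
}

static void my_mn_put(struct my_mn_ctx *ctx)
{
	/* Drops the reference; free_notifier runs asynchronously via SRCU. */
	mmu_notifier_put(&ctx->mn);
}

/* A module using this idiom must call mmu_notifier_synchronize() in __exit. */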
diff --git a/mm/mprotect.c b/mm/mprotect.c index bf38dfbbb4b4..675e5d34a507 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * (C) Copyright 2002 Red Hat Inc, All Rights Reserved | 9 | * (C) Copyright 2002 Red Hat Inc, All Rights Reserved |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/mm.h> | 12 | #include <linux/pagewalk.h> |
13 | #include <linux/hugetlb.h> | 13 | #include <linux/hugetlb.h> |
14 | #include <linux/shm.h> | 14 | #include <linux/shm.h> |
15 | #include <linux/mman.h> | 15 | #include <linux/mman.h> |
@@ -329,20 +329,11 @@ static int prot_none_test(unsigned long addr, unsigned long next, | |||
329 | return 0; | 329 | return 0; |
330 | } | 330 | } |
331 | 331 | ||
332 | static int prot_none_walk(struct vm_area_struct *vma, unsigned long start, | 332 | static const struct mm_walk_ops prot_none_walk_ops = { |
333 | unsigned long end, unsigned long newflags) | 333 | .pte_entry = prot_none_pte_entry, |
334 | { | 334 | .hugetlb_entry = prot_none_hugetlb_entry, |
335 | pgprot_t new_pgprot = vm_get_page_prot(newflags); | 335 | .test_walk = prot_none_test, |
336 | struct mm_walk prot_none_walk = { | 336 | }; |
337 | .pte_entry = prot_none_pte_entry, | ||
338 | .hugetlb_entry = prot_none_hugetlb_entry, | ||
339 | .test_walk = prot_none_test, | ||
340 | .mm = current->mm, | ||
341 | .private = &new_pgprot, | ||
342 | }; | ||
343 | |||
344 | return walk_page_range(start, end, &prot_none_walk); | ||
345 | } | ||
346 | 337 | ||
347 | int | 338 | int |
348 | mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | 339 | mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, |
@@ -369,7 +360,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
369 | if (arch_has_pfn_modify_check() && | 360 | if (arch_has_pfn_modify_check() && |
370 | (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && | 361 | (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && |
371 | (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) { | 362 | (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) { |
372 | error = prot_none_walk(vma, start, end, newflags); | 363 | pgprot_t new_pgprot = vm_get_page_prot(newflags); |
364 | |||
365 | error = walk_page_range(current->mm, start, end, | ||
366 | &prot_none_walk_ops, &new_pgprot); | ||
373 | if (error) | 367 | if (error) |
374 | return error; | 368 | return error; |
375 | } | 369 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6991ccec9c32..ff5484fdbdf9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -5971,7 +5971,7 @@ void __ref memmap_init_zone_device(struct zone *zone, | |||
5971 | } | 5971 | } |
5972 | } | 5972 | } |
5973 | 5973 | ||
5974 | pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev), | 5974 | pr_info("%s initialised %lu pages in %ums\n", __func__, |
5975 | size, jiffies_to_msecs(jiffies - start)); | 5975 | size, jiffies_to_msecs(jiffies - start)); |
5976 | } | 5976 | } |
5977 | 5977 | ||
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c3084ff2569d..d48c2a986ea3 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -1,5 +1,5 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/mm.h> | 2 | #include <linux/pagewalk.h> |
3 | #include <linux/highmem.h> | 3 | #include <linux/highmem.h> |
4 | #include <linux/sched.h> | 4 | #include <linux/sched.h> |
5 | #include <linux/hugetlb.h> | 5 | #include <linux/hugetlb.h> |
@@ -9,10 +9,11 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
9 | { | 9 | { |
10 | pte_t *pte; | 10 | pte_t *pte; |
11 | int err = 0; | 11 | int err = 0; |
12 | const struct mm_walk_ops *ops = walk->ops; | ||
12 | 13 | ||
13 | pte = pte_offset_map(pmd, addr); | 14 | pte = pte_offset_map(pmd, addr); |
14 | for (;;) { | 15 | for (;;) { |
15 | err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk); | 16 | err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); |
16 | if (err) | 17 | if (err) |
17 | break; | 18 | break; |
18 | addr += PAGE_SIZE; | 19 | addr += PAGE_SIZE; |
@@ -30,6 +31,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, | |||
30 | { | 31 | { |
31 | pmd_t *pmd; | 32 | pmd_t *pmd; |
32 | unsigned long next; | 33 | unsigned long next; |
34 | const struct mm_walk_ops *ops = walk->ops; | ||
33 | int err = 0; | 35 | int err = 0; |
34 | 36 | ||
35 | pmd = pmd_offset(pud, addr); | 37 | pmd = pmd_offset(pud, addr); |
@@ -37,8 +39,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, | |||
37 | again: | 39 | again: |
38 | next = pmd_addr_end(addr, end); | 40 | next = pmd_addr_end(addr, end); |
39 | if (pmd_none(*pmd) || !walk->vma) { | 41 | if (pmd_none(*pmd) || !walk->vma) { |
40 | if (walk->pte_hole) | 42 | if (ops->pte_hole) |
41 | err = walk->pte_hole(addr, next, walk); | 43 | err = ops->pte_hole(addr, next, walk); |
42 | if (err) | 44 | if (err) |
43 | break; | 45 | break; |
44 | continue; | 46 | continue; |
@@ -47,8 +49,8 @@ again: | |||
47 | * This implies that each ->pmd_entry() handler | 49 | * This implies that each ->pmd_entry() handler |
48 | * needs to know about pmd_trans_huge() pmds | 50 | * needs to know about pmd_trans_huge() pmds |
49 | */ | 51 | */ |
50 | if (walk->pmd_entry) | 52 | if (ops->pmd_entry) |
51 | err = walk->pmd_entry(pmd, addr, next, walk); | 53 | err = ops->pmd_entry(pmd, addr, next, walk); |
52 | if (err) | 54 | if (err) |
53 | break; | 55 | break; |
54 | 56 | ||
@@ -56,7 +58,7 @@ again: | |||
56 | * Check this here so we only break down trans_huge | 58 | * Check this here so we only break down trans_huge |
57 | * pages when we _need_ to | 59 | * pages when we _need_ to |
58 | */ | 60 | */ |
59 | if (!walk->pte_entry) | 61 | if (!ops->pte_entry) |
60 | continue; | 62 | continue; |
61 | 63 | ||
62 | split_huge_pmd(walk->vma, pmd, addr); | 64 | split_huge_pmd(walk->vma, pmd, addr); |
@@ -75,6 +77,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, | |||
75 | { | 77 | { |
76 | pud_t *pud; | 78 | pud_t *pud; |
77 | unsigned long next; | 79 | unsigned long next; |
80 | const struct mm_walk_ops *ops = walk->ops; | ||
78 | int err = 0; | 81 | int err = 0; |
79 | 82 | ||
80 | pud = pud_offset(p4d, addr); | 83 | pud = pud_offset(p4d, addr); |
@@ -82,18 +85,18 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, | |||
82 | again: | 85 | again: |
83 | next = pud_addr_end(addr, end); | 86 | next = pud_addr_end(addr, end); |
84 | if (pud_none(*pud) || !walk->vma) { | 87 | if (pud_none(*pud) || !walk->vma) { |
85 | if (walk->pte_hole) | 88 | if (ops->pte_hole) |
86 | err = walk->pte_hole(addr, next, walk); | 89 | err = ops->pte_hole(addr, next, walk); |
87 | if (err) | 90 | if (err) |
88 | break; | 91 | break; |
89 | continue; | 92 | continue; |
90 | } | 93 | } |
91 | 94 | ||
92 | if (walk->pud_entry) { | 95 | if (ops->pud_entry) { |
93 | spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); | 96 | spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); |
94 | 97 | ||
95 | if (ptl) { | 98 | if (ptl) { |
96 | err = walk->pud_entry(pud, addr, next, walk); | 99 | err = ops->pud_entry(pud, addr, next, walk); |
97 | spin_unlock(ptl); | 100 | spin_unlock(ptl); |
98 | if (err) | 101 | if (err) |
99 | break; | 102 | break; |
@@ -105,7 +108,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, | |||
105 | if (pud_none(*pud)) | 108 | if (pud_none(*pud)) |
106 | goto again; | 109 | goto again; |
107 | 110 | ||
108 | if (walk->pmd_entry || walk->pte_entry) | 111 | if (ops->pmd_entry || ops->pte_entry) |
109 | err = walk_pmd_range(pud, addr, next, walk); | 112 | err = walk_pmd_range(pud, addr, next, walk); |
110 | if (err) | 113 | if (err) |
111 | break; | 114 | break; |
@@ -119,19 +122,20 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
119 | { | 122 | { |
120 | p4d_t *p4d; | 123 | p4d_t *p4d; |
121 | unsigned long next; | 124 | unsigned long next; |
125 | const struct mm_walk_ops *ops = walk->ops; | ||
122 | int err = 0; | 126 | int err = 0; |
123 | 127 | ||
124 | p4d = p4d_offset(pgd, addr); | 128 | p4d = p4d_offset(pgd, addr); |
125 | do { | 129 | do { |
126 | next = p4d_addr_end(addr, end); | 130 | next = p4d_addr_end(addr, end); |
127 | if (p4d_none_or_clear_bad(p4d)) { | 131 | if (p4d_none_or_clear_bad(p4d)) { |
128 | if (walk->pte_hole) | 132 | if (ops->pte_hole) |
129 | err = walk->pte_hole(addr, next, walk); | 133 | err = ops->pte_hole(addr, next, walk); |
130 | if (err) | 134 | if (err) |
131 | break; | 135 | break; |
132 | continue; | 136 | continue; |
133 | } | 137 | } |
134 | if (walk->pmd_entry || walk->pte_entry) | 138 | if (ops->pmd_entry || ops->pte_entry) |
135 | err = walk_pud_range(p4d, addr, next, walk); | 139 | err = walk_pud_range(p4d, addr, next, walk); |
136 | if (err) | 140 | if (err) |
137 | break; | 141 | break; |
@@ -145,19 +149,20 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, | |||
145 | { | 149 | { |
146 | pgd_t *pgd; | 150 | pgd_t *pgd; |
147 | unsigned long next; | 151 | unsigned long next; |
152 | const struct mm_walk_ops *ops = walk->ops; | ||
148 | int err = 0; | 153 | int err = 0; |
149 | 154 | ||
150 | pgd = pgd_offset(walk->mm, addr); | 155 | pgd = pgd_offset(walk->mm, addr); |
151 | do { | 156 | do { |
152 | next = pgd_addr_end(addr, end); | 157 | next = pgd_addr_end(addr, end); |
153 | if (pgd_none_or_clear_bad(pgd)) { | 158 | if (pgd_none_or_clear_bad(pgd)) { |
154 | if (walk->pte_hole) | 159 | if (ops->pte_hole) |
155 | err = walk->pte_hole(addr, next, walk); | 160 | err = ops->pte_hole(addr, next, walk); |
156 | if (err) | 161 | if (err) |
157 | break; | 162 | break; |
158 | continue; | 163 | continue; |
159 | } | 164 | } |
160 | if (walk->pmd_entry || walk->pte_entry) | 165 | if (ops->pmd_entry || ops->pte_entry) |
161 | err = walk_p4d_range(pgd, addr, next, walk); | 166 | err = walk_p4d_range(pgd, addr, next, walk); |
162 | if (err) | 167 | if (err) |
163 | break; | 168 | break; |
@@ -183,6 +188,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, | |||
183 | unsigned long hmask = huge_page_mask(h); | 188 | unsigned long hmask = huge_page_mask(h); |
184 | unsigned long sz = huge_page_size(h); | 189 | unsigned long sz = huge_page_size(h); |
185 | pte_t *pte; | 190 | pte_t *pte; |
191 | const struct mm_walk_ops *ops = walk->ops; | ||
186 | int err = 0; | 192 | int err = 0; |
187 | 193 | ||
188 | do { | 194 | do { |
@@ -190,9 +196,9 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, | |||
190 | pte = huge_pte_offset(walk->mm, addr & hmask, sz); | 196 | pte = huge_pte_offset(walk->mm, addr & hmask, sz); |
191 | 197 | ||
192 | if (pte) | 198 | if (pte) |
193 | err = walk->hugetlb_entry(pte, hmask, addr, next, walk); | 199 | err = ops->hugetlb_entry(pte, hmask, addr, next, walk); |
194 | else if (walk->pte_hole) | 200 | else if (ops->pte_hole) |
195 | err = walk->pte_hole(addr, next, walk); | 201 | err = ops->pte_hole(addr, next, walk); |
196 | 202 | ||
197 | if (err) | 203 | if (err) |
198 | break; | 204 | break; |
@@ -220,9 +226,10 @@ static int walk_page_test(unsigned long start, unsigned long end, | |||
220 | struct mm_walk *walk) | 226 | struct mm_walk *walk) |
221 | { | 227 | { |
222 | struct vm_area_struct *vma = walk->vma; | 228 | struct vm_area_struct *vma = walk->vma; |
229 | const struct mm_walk_ops *ops = walk->ops; | ||
223 | 230 | ||
224 | if (walk->test_walk) | 231 | if (ops->test_walk) |
225 | return walk->test_walk(start, end, walk); | 232 | return ops->test_walk(start, end, walk); |
226 | 233 | ||
227 | /* | 234 | /* |
228 | * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP | 235 | * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP |
@@ -234,8 +241,8 @@ static int walk_page_test(unsigned long start, unsigned long end, | |||
234 | */ | 241 | */ |
235 | if (vma->vm_flags & VM_PFNMAP) { | 242 | if (vma->vm_flags & VM_PFNMAP) { |
236 | int err = 1; | 243 | int err = 1; |
237 | if (walk->pte_hole) | 244 | if (ops->pte_hole) |
238 | err = walk->pte_hole(start, end, walk); | 245 | err = ops->pte_hole(start, end, walk); |
239 | return err ? err : 1; | 246 | return err ? err : 1; |
240 | } | 247 | } |
241 | return 0; | 248 | return 0; |
@@ -248,7 +255,7 @@ static int __walk_page_range(unsigned long start, unsigned long end, | |||
248 | struct vm_area_struct *vma = walk->vma; | 255 | struct vm_area_struct *vma = walk->vma; |
249 | 256 | ||
250 | if (vma && is_vm_hugetlb_page(vma)) { | 257 | if (vma && is_vm_hugetlb_page(vma)) { |
251 | if (walk->hugetlb_entry) | 258 | if (walk->ops->hugetlb_entry) |
252 | err = walk_hugetlb_range(start, end, walk); | 259 | err = walk_hugetlb_range(start, end, walk); |
253 | } else | 260 | } else |
254 | err = walk_pgd_range(start, end, walk); | 261 | err = walk_pgd_range(start, end, walk); |
@@ -258,11 +265,13 @@ static int __walk_page_range(unsigned long start, unsigned long end, | |||
258 | 265 | ||
259 | /** | 266 | /** |
260 | * walk_page_range - walk page table with caller specific callbacks | 267 | * walk_page_range - walk page table with caller specific callbacks |
261 | * @start: start address of the virtual address range | 268 | * @mm: mm_struct representing the target process of page table walk |
262 | * @end: end address of the virtual address range | 269 | * @start: start address of the virtual address range |
263 | * @walk: mm_walk structure defining the callbacks and the target address space | 270 | * @end: end address of the virtual address range |
271 | * @ops: operation to call during the walk | ||
272 | * @private: private data for callbacks' usage | ||
264 | * | 273 | * |
265 | * Recursively walk the page table tree of the process represented by @walk->mm | 274 | * Recursively walk the page table tree of the process represented by @mm |
266 | * within the virtual address range [@start, @end). During walking, we can do | 275 | * within the virtual address range [@start, @end). During walking, we can do |
267 | * some caller-specific works for each entry, by setting up pmd_entry(), | 276 | * some caller-specific works for each entry, by setting up pmd_entry(), |
268 | * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these | 277 | * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these |
@@ -278,47 +287,52 @@ static int __walk_page_range(unsigned long start, unsigned long end, | |||
278 | * | 287 | * |
279 | * Before starting to walk page table, some callers want to check whether | 288 | * Before starting to walk page table, some callers want to check whether |
280 | * they really want to walk over the current vma, typically by checking | 289 | * they really want to walk over the current vma, typically by checking |
281 | * its vm_flags. walk_page_test() and @walk->test_walk() are used for this | 290 | * its vm_flags. walk_page_test() and @ops->test_walk() are used for this |
282 | * purpose. | 291 | * purpose. |
283 | * | 292 | * |
284 | * struct mm_walk keeps current values of some common data like vma and pmd, | 293 | * struct mm_walk keeps current values of some common data like vma and pmd, |
285 | * which are useful for the access from callbacks. If you want to pass some | 294 | * which are useful for the access from callbacks. If you want to pass some |
286 | * caller-specific data to callbacks, @walk->private should be helpful. | 295 | * caller-specific data to callbacks, @private should be helpful. |
287 | * | 296 | * |
288 | * Locking: | 297 | * Locking: |
289 | * Callers of walk_page_range() and walk_page_vma() should hold | 298 | * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem, |
290 | * @walk->mm->mmap_sem, because these function traverse vma list and/or | 299 | * because these functions traverse the vma list and/or access vma data. |
291 | * access to vma's data. | ||
292 | */ | 300 | */ |
293 | int walk_page_range(unsigned long start, unsigned long end, | 301 | int walk_page_range(struct mm_struct *mm, unsigned long start, |
294 | struct mm_walk *walk) | 302 | unsigned long end, const struct mm_walk_ops *ops, |
303 | void *private) | ||
295 | { | 304 | { |
296 | int err = 0; | 305 | int err = 0; |
297 | unsigned long next; | 306 | unsigned long next; |
298 | struct vm_area_struct *vma; | 307 | struct vm_area_struct *vma; |
308 | struct mm_walk walk = { | ||
309 | .ops = ops, | ||
310 | .mm = mm, | ||
311 | .private = private, | ||
312 | }; | ||
299 | 313 | ||
300 | if (start >= end) | 314 | if (start >= end) |
301 | return -EINVAL; | 315 | return -EINVAL; |
302 | 316 | ||
303 | if (!walk->mm) | 317 | if (!walk.mm) |
304 | return -EINVAL; | 318 | return -EINVAL; |
305 | 319 | ||
306 | VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); | 320 | lockdep_assert_held(&walk.mm->mmap_sem); |
307 | 321 | ||
308 | vma = find_vma(walk->mm, start); | 322 | vma = find_vma(walk.mm, start); |
309 | do { | 323 | do { |
310 | if (!vma) { /* after the last vma */ | 324 | if (!vma) { /* after the last vma */ |
311 | walk->vma = NULL; | 325 | walk.vma = NULL; |
312 | next = end; | 326 | next = end; |
313 | } else if (start < vma->vm_start) { /* outside vma */ | 327 | } else if (start < vma->vm_start) { /* outside vma */ |
314 | walk->vma = NULL; | 328 | walk.vma = NULL; |
315 | next = min(end, vma->vm_start); | 329 | next = min(end, vma->vm_start); |
316 | } else { /* inside vma */ | 330 | } else { /* inside vma */ |
317 | walk->vma = vma; | 331 | walk.vma = vma; |
318 | next = min(end, vma->vm_end); | 332 | next = min(end, vma->vm_end); |
319 | vma = vma->vm_next; | 333 | vma = vma->vm_next; |
320 | 334 | ||
321 | err = walk_page_test(start, next, walk); | 335 | err = walk_page_test(start, next, &walk); |
322 | if (err > 0) { | 336 | if (err > 0) { |
323 | /* | 337 | /* |
324 | * positive return values are purely for | 338 | * positive return values are purely for |
@@ -331,28 +345,34 @@ int walk_page_range(unsigned long start, unsigned long end, | |||
331 | if (err < 0) | 345 | if (err < 0) |
332 | break; | 346 | break; |
333 | } | 347 | } |
334 | if (walk->vma || walk->pte_hole) | 348 | if (walk.vma || walk.ops->pte_hole) |
335 | err = __walk_page_range(start, next, walk); | 349 | err = __walk_page_range(start, next, &walk); |
336 | if (err) | 350 | if (err) |
337 | break; | 351 | break; |
338 | } while (start = next, start < end); | 352 | } while (start = next, start < end); |
339 | return err; | 353 | return err; |
340 | } | 354 | } |
341 | 355 | ||
342 | int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) | 356 | int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, |
357 | void *private) | ||
343 | { | 358 | { |
359 | struct mm_walk walk = { | ||
360 | .ops = ops, | ||
361 | .mm = vma->vm_mm, | ||
362 | .vma = vma, | ||
363 | .private = private, | ||
364 | }; | ||
344 | int err; | 365 | int err; |
345 | 366 | ||
346 | if (!walk->mm) | 367 | if (!walk.mm) |
347 | return -EINVAL; | 368 | return -EINVAL; |
348 | 369 | ||
349 | VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); | 370 | lockdep_assert_held(&walk.mm->mmap_sem); |
350 | VM_BUG_ON(!vma); | 371 | |
351 | walk->vma = vma; | 372 | err = walk_page_test(vma->vm_start, vma->vm_end, &walk); |
352 | err = walk_page_test(vma->vm_start, vma->vm_end, walk); | ||
353 | if (err > 0) | 373 | if (err > 0) |
354 | return 0; | 374 | return 0; |
355 | if (err < 0) | 375 | if (err < 0) |
356 | return err; | 376 | return err; |
357 | return __walk_page_range(vma->vm_start, vma->vm_end, walk); | 377 | return __walk_page_range(vma->vm_start, vma->vm_end, &walk); |
358 | } | 378 | } |
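As a usage sketch of the constified pagewalk API above (the mm is now passed explicitly, the callbacks live in a const struct mm_walk_ops, and private data is a separate argument), the snippet below counts present PTEs in a range; the my_count_* names are illustrative assumptions, not code from this series.

#include <linux/pagewalk.h>
#include <linux/sched/mm.h>

static int my_count_pte(pte_t *pte, unsigned long addr,
			unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;	/* the 'private' argument */

	if (pte_present(*pte))
		(*count)++;
	return 0;
}

/* Callbacks are now a const, statically allocated operations table. */
static const struct mm_walk_ops my_count_ops = {
	.pte_entry = my_count_pte,
};

static long my_count_present(struct mm_struct *mm, unsigned long start,
			     unsigned long end)
{
	unsigned long count = 0;
	int err;

	/* walk_page_range() now asserts with lockdep that mmap_sem is held. */
	down_read(&mm->mmap_sem);
	err = walk_page_range(mm, start, end, &my_count_ops, &count);
	up_read(&mm->mmap_sem);

	return err ? err : count;
}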
diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index cd040b5abffe..3f55f2f99112 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c | |||
@@ -132,7 +132,6 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | |||
132 | if (!nfit_res) | 132 | if (!nfit_res) |
133 | return devm_memremap_pages(dev, pgmap); | 133 | return devm_memremap_pages(dev, pgmap); |
134 | 134 | ||
135 | pgmap->dev = dev; | ||
136 | if (!pgmap->ref) { | 135 | if (!pgmap->ref) { |
137 | if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) | 136 | if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) |
138 | return ERR_PTR(-EINVAL); | 137 | return ERR_PTR(-EINVAL); |