author     Kirill A. Shutemov <kirill.shutemov@linux.intel.com>   2016-07-26 18:25:20 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>         2016-07-26 19:19:19 -0400
commit     bae473a423f65e480db83c85b5e92254f6dfcb28 (patch)
tree       9e09cd8cbcafdcc1a27298700f69f8f86f929392
parent     dcddffd41d3f1d3bdcc1dce3f1cd142779b6d4c1 (diff)
mm: introduce fault_env
The idea is borrowed from Peter's patch in the patchset on speculative page
faults [1]:
Instead of passing around the endless list of function arguments,
replace the lot with a single structure so we can change context without
endless function signature changes.
The changes are mostly mechanical, with the exception of the faultaround code:
filemap_map_pages() got reworked a bit.
This patch is preparation for the next one.
[1] http://lkml.kernel.org/r/20141020222841.302891540@infradead.org
Link: http://lkml.kernel.org/r/1466021202-61880-9-git-send-email-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
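
To make the new calling convention concrete, here is a simplified sketch of the caller side (an illustration based on the reworked __collapse_huge_page_swapin() in this patch, with surrounding declarations and error handling trimmed): the caller fills in a fault_env on the stack and passes a pointer to it instead of handing each value to do_swap_page() as a separate argument.

  struct fault_env fe = {
          .vma = vma,                        /* target VMA */
          .address = address,                /* faulting virtual address */
          .flags = FAULT_FLAG_ALLOW_RETRY,   /* FAULT_FLAG_xxx bits */
          .pmd = pmd,                        /* pmd covering 'address' */
  };
  int ret;

  fe.pte = pte_offset_map(pmd, address);     /* pte entry for 'address', once mapped */
  ret = do_swap_page(&fe, orig_pte);         /* was: do_swap_page(mm, vma, address,
                                              *      pte, pmd, flags, orig_pte) */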
-rw-r--r-- | Documentation/filesystems/Locking | 10
-rw-r--r-- | fs/userfaultfd.c | 22
-rw-r--r-- | include/linux/huge_mm.h | 20
-rw-r--r-- | include/linux/mm.h | 34
-rw-r--r-- | include/linux/userfaultfd_k.h | 8
-rw-r--r-- | mm/filemap.c | 28
-rw-r--r-- | mm/huge_memory.c | 280
-rw-r--r-- | mm/internal.h | 4
-rw-r--r-- | mm/memory.c | 582
-rw-r--r-- | mm/nommu.c | 3
10 files changed, 475 insertions, 516 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index dda6e3f8e203..5a7386e38e2d 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -548,13 +548,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page | |||
548 | locked. The VM will unlock the page. | 548 | locked. The VM will unlock the page. |
549 | 549 | ||
550 | ->map_pages() is called when VM asks to map easy accessible pages. | 550 | ->map_pages() is called when VM asks to map easy accessible pages. |
551 | Filesystem should find and map pages associated with offsets from "pgoff" | 551 | Filesystem should find and map pages associated with offsets from "start_pgoff" |
552 | till "max_pgoff". ->map_pages() is called with page table locked and must | 552 | till "end_pgoff". ->map_pages() is called with page table locked and must |
553 | not block. If it's not possible to reach a page without blocking, | 553 | not block. If it's not possible to reach a page without blocking, |
554 | filesystem should skip it. Filesystem should use do_set_pte() to setup | 554 | filesystem should skip it. Filesystem should use do_set_pte() to setup |
555 | page table entry. Pointer to entry associated with offset "pgoff" is | 555 | page table entry. Pointer to entry associated with the page is passed in |
556 | passed in "pte" field in vm_fault structure. Pointers to entries for other | 556 | "pte" field in fault_env structure. Pointers to entries for other offsets |
557 | offsets should be calculated relative to "pte". | 557 | should be calculated relative to "pte". |
558 | 558 | ||
559 | ->page_mkwrite() is called when a previously read-only pte is | 559 | ->page_mkwrite() is called when a previously read-only pte is |
560 | about to become writeable. The filesystem again must ensure that there are | 560 | about to become writeable. The filesystem again must ensure that there are |
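
As a rough sketch of the contract documented above (an illustration, not part of the patch; myfs_find_ready_page() is a hypothetical non-blocking lookup helper): a filesystem's ->map_pages() walks offsets from start_pgoff to end_pgoff, steps fe->pte and fe->address forward relative to the offset it is about to map, skips anything that would require blocking, and installs ready pages with do_set_pte(). The reworked filemap_map_pages() further down follows the same shape.

  static void myfs_map_pages(struct fault_env *fe,
                             pgoff_t start_pgoff, pgoff_t end_pgoff)
  {
          struct address_space *mapping = fe->vma->vm_file->f_mapping;
          pgoff_t last_pgoff = start_pgoff, pgoff;
          struct page *page;

          for (pgoff = start_pgoff; pgoff <= end_pgoff; pgoff++) {
                  /* fe->pte arrives pointing at the entry for start_pgoff;
                   * step it (and fe->address) to the entry for 'pgoff'. */
                  fe->pte += pgoff - last_pgoff;
                  fe->address += (pgoff - last_pgoff) << PAGE_SHIFT;
                  last_pgoff = pgoff;

                  if (!pte_none(*fe->pte))
                          continue;        /* slot already populated */

                  /* hypothetical helper: returns a locked, uptodate page with
                   * a reference held, or NULL if that would require blocking */
                  page = myfs_find_ready_page(mapping, pgoff);
                  if (!page)
                          continue;        /* must not block: skip this offset */

                  do_set_pte(fe, page);    /* lookup reference now backs the mapping */
                  unlock_page(page);
          }
  }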
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 2d97952e341a..85959d8324df 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -257,10 +257,9 @@ out: | |||
257 | * fatal_signal_pending()s, and the mmap_sem must be released before | 257 | * fatal_signal_pending()s, and the mmap_sem must be released before |
258 | * returning it. | 258 | * returning it. |
259 | */ | 259 | */ |
260 | int handle_userfault(struct vm_area_struct *vma, unsigned long address, | 260 | int handle_userfault(struct fault_env *fe, unsigned long reason) |
261 | unsigned int flags, unsigned long reason) | ||
262 | { | 261 | { |
263 | struct mm_struct *mm = vma->vm_mm; | 262 | struct mm_struct *mm = fe->vma->vm_mm; |
264 | struct userfaultfd_ctx *ctx; | 263 | struct userfaultfd_ctx *ctx; |
265 | struct userfaultfd_wait_queue uwq; | 264 | struct userfaultfd_wait_queue uwq; |
266 | int ret; | 265 | int ret; |
@@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, | |||
269 | BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); | 268 | BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); |
270 | 269 | ||
271 | ret = VM_FAULT_SIGBUS; | 270 | ret = VM_FAULT_SIGBUS; |
272 | ctx = vma->vm_userfaultfd_ctx.ctx; | 271 | ctx = fe->vma->vm_userfaultfd_ctx.ctx; |
273 | if (!ctx) | 272 | if (!ctx) |
274 | goto out; | 273 | goto out; |
275 | 274 | ||
@@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, | |||
302 | * without first stopping userland access to the memory. For | 301 | * without first stopping userland access to the memory. For |
303 | * VM_UFFD_MISSING userfaults this is enough for now. | 302 | * VM_UFFD_MISSING userfaults this is enough for now. |
304 | */ | 303 | */ |
305 | if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { | 304 | if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { |
306 | /* | 305 | /* |
307 | * Validate the invariant that nowait must allow retry | 306 | * Validate the invariant that nowait must allow retry |
308 | * to be sure not to return SIGBUS erroneously on | 307 | * to be sure not to return SIGBUS erroneously on |
309 | * nowait invocations. | 308 | * nowait invocations. |
310 | */ | 309 | */ |
311 | BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); | 310 | BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); |
312 | #ifdef CONFIG_DEBUG_VM | 311 | #ifdef CONFIG_DEBUG_VM |
313 | if (printk_ratelimit()) { | 312 | if (printk_ratelimit()) { |
314 | printk(KERN_WARNING | 313 | printk(KERN_WARNING |
315 | "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); | 314 | "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); |
316 | dump_stack(); | 315 | dump_stack(); |
317 | } | 316 | } |
318 | #endif | 317 | #endif |
@@ -324,7 +323,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, | |||
324 | * and wait. | 323 | * and wait. |
325 | */ | 324 | */ |
326 | ret = VM_FAULT_RETRY; | 325 | ret = VM_FAULT_RETRY; |
327 | if (flags & FAULT_FLAG_RETRY_NOWAIT) | 326 | if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) |
328 | goto out; | 327 | goto out; |
329 | 328 | ||
330 | /* take the reference before dropping the mmap_sem */ | 329 | /* take the reference before dropping the mmap_sem */ |
@@ -332,10 +331,11 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, | |||
332 | 331 | ||
333 | init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); | 332 | init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); |
334 | uwq.wq.private = current; | 333 | uwq.wq.private = current; |
335 | uwq.msg = userfault_msg(address, flags, reason); | 334 | uwq.msg = userfault_msg(fe->address, fe->flags, reason); |
336 | uwq.ctx = ctx; | 335 | uwq.ctx = ctx; |
337 | 336 | ||
338 | return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == | 337 | return_to_userland = |
338 | (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == | ||
339 | (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); | 339 | (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); |
340 | 340 | ||
341 | spin_lock(&ctx->fault_pending_wqh.lock); | 341 | spin_lock(&ctx->fault_pending_wqh.lock); |
@@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, | |||
353 | TASK_KILLABLE); | 353 | TASK_KILLABLE); |
354 | spin_unlock(&ctx->fault_pending_wqh.lock); | 354 | spin_unlock(&ctx->fault_pending_wqh.lock); |
355 | 355 | ||
356 | must_wait = userfaultfd_must_wait(ctx, address, flags, reason); | 356 | must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); |
357 | up_read(&mm->mmap_sem); | 357 | up_read(&mm->mmap_sem); |
358 | 358 | ||
359 | if (likely(must_wait && !ACCESS_ONCE(ctx->released) && | 359 | if (likely(must_wait && !ACCESS_ONCE(ctx->released) && |
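
The payoff shows up at the call sites: handle_userfault() now takes only the fault_env plus the reason, so callers such as the mm/huge_memory.c paths further down shrink from (illustrative excerpt based on the hunks below):

  ret = handle_userfault(vma, address, flags, VM_UFFD_MISSING);

to:

  ret = handle_userfault(fe, VM_UFFD_MISSING);   /* vma, address and flags travel in fe */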
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f0a7a0320300..9bed9249156f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -1,20 +1,12 @@ | |||
1 | #ifndef _LINUX_HUGE_MM_H | 1 | #ifndef _LINUX_HUGE_MM_H |
2 | #define _LINUX_HUGE_MM_H | 2 | #define _LINUX_HUGE_MM_H |
3 | 3 | ||
4 | extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, | 4 | extern int do_huge_pmd_anonymous_page(struct fault_env *fe); |
5 | struct vm_area_struct *vma, | ||
6 | unsigned long address, pmd_t *pmd, | ||
7 | unsigned int flags); | ||
8 | extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 5 | extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
9 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, | 6 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, |
10 | struct vm_area_struct *vma); | 7 | struct vm_area_struct *vma); |
11 | extern void huge_pmd_set_accessed(struct mm_struct *mm, | 8 | extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd); |
12 | struct vm_area_struct *vma, | 9 | extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd); |
13 | unsigned long address, pmd_t *pmd, | ||
14 | pmd_t orig_pmd, int dirty); | ||
15 | extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
16 | unsigned long address, pmd_t *pmd, | ||
17 | pmd_t orig_pmd); | ||
18 | extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | 10 | extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, |
19 | unsigned long addr, | 11 | unsigned long addr, |
20 | pmd_t *pmd, | 12 | pmd_t *pmd, |
@@ -134,8 +126,7 @@ static inline int hpage_nr_pages(struct page *page) | |||
134 | return 1; | 126 | return 1; |
135 | } | 127 | } |
136 | 128 | ||
137 | extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 129 | extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd); |
138 | unsigned long addr, pmd_t pmd, pmd_t *pmdp); | ||
139 | 130 | ||
140 | extern struct page *huge_zero_page; | 131 | extern struct page *huge_zero_page; |
141 | 132 | ||
@@ -196,8 +187,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, | |||
196 | return NULL; | 187 | return NULL; |
197 | } | 188 | } |
198 | 189 | ||
199 | static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 190 | static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd) |
200 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | ||
201 | { | 191 | { |
202 | return 0; | 192 | return 0; |
203 | } | 193 | } |
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 646bc36b4d1b..8bd74558c0e4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -309,10 +309,27 @@ struct vm_fault { | |||
309 | * VM_FAULT_DAX_LOCKED and fill in | 309 | * VM_FAULT_DAX_LOCKED and fill in |
310 | * entry here. | 310 | * entry here. |
311 | */ | 311 | */ |
312 | /* for ->map_pages() only */ | 312 | }; |
313 | pgoff_t max_pgoff; /* map pages for offset from pgoff till | 313 | |
314 | * max_pgoff inclusive */ | 314 | /* |
315 | pte_t *pte; /* pte entry associated with ->pgoff */ | 315 | * Page fault context: passes though page fault handler instead of endless list |
316 | * of function arguments. | ||
317 | */ | ||
318 | struct fault_env { | ||
319 | struct vm_area_struct *vma; /* Target VMA */ | ||
320 | unsigned long address; /* Faulting virtual address */ | ||
321 | unsigned int flags; /* FAULT_FLAG_xxx flags */ | ||
322 | pmd_t *pmd; /* Pointer to pmd entry matching | ||
323 | * the 'address' | ||
324 | */ | ||
325 | pte_t *pte; /* Pointer to pte entry matching | ||
326 | * the 'address'. NULL if the page | ||
327 | * table hasn't been allocated. | ||
328 | */ | ||
329 | spinlock_t *ptl; /* Page table lock. | ||
330 | * Protects pte page table if 'pte' | ||
331 | * is not NULL, otherwise pmd. | ||
332 | */ | ||
316 | }; | 333 | }; |
317 | 334 | ||
318 | /* | 335 | /* |
@@ -327,7 +344,8 @@ struct vm_operations_struct { | |||
327 | int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); | 344 | int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); |
328 | int (*pmd_fault)(struct vm_area_struct *, unsigned long address, | 345 | int (*pmd_fault)(struct vm_area_struct *, unsigned long address, |
329 | pmd_t *, unsigned int flags); | 346 | pmd_t *, unsigned int flags); |
330 | void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); | 347 | void (*map_pages)(struct fault_env *fe, |
348 | pgoff_t start_pgoff, pgoff_t end_pgoff); | ||
331 | 349 | ||
332 | /* notification that a previously read-only page is about to become | 350 | /* notification that a previously read-only page is about to become |
333 | * writable, if an error is returned it will cause a SIGBUS */ | 351 | * writable, if an error is returned it will cause a SIGBUS */ |
@@ -600,8 +618,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
600 | return pte; | 618 | return pte; |
601 | } | 619 | } |
602 | 620 | ||
603 | void do_set_pte(struct vm_area_struct *vma, unsigned long address, | 621 | void do_set_pte(struct fault_env *fe, struct page *page); |
604 | struct page *page, pte_t *pte, bool write, bool anon); | ||
605 | #endif | 622 | #endif |
606 | 623 | ||
607 | /* | 624 | /* |
@@ -2062,7 +2079,8 @@ extern void truncate_inode_pages_final(struct address_space *); | |||
2062 | 2079 | ||
2063 | /* generic vm_area_ops exported for stackable file systems */ | 2080 | /* generic vm_area_ops exported for stackable file systems */ |
2064 | extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); | 2081 | extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); |
2065 | extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf); | 2082 | extern void filemap_map_pages(struct fault_env *fe, |
2083 | pgoff_t start_pgoff, pgoff_t end_pgoff); | ||
2066 | extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); | 2084 | extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); |
2067 | 2085 | ||
2068 | /* mm/page-writeback.c */ | 2086 | /* mm/page-writeback.c */ |
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 587480ad41b7..dd66a952e8cd 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -27,8 +27,7 @@ | |||
27 | #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) | 27 | #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) |
28 | #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) | 28 | #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) |
29 | 29 | ||
30 | extern int handle_userfault(struct vm_area_struct *vma, unsigned long address, | 30 | extern int handle_userfault(struct fault_env *fe, unsigned long reason); |
31 | unsigned int flags, unsigned long reason); | ||
32 | 31 | ||
33 | extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, | 32 | extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, |
34 | unsigned long src_start, unsigned long len); | 33 | unsigned long src_start, unsigned long len); |
@@ -56,10 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) | |||
56 | #else /* CONFIG_USERFAULTFD */ | 55 | #else /* CONFIG_USERFAULTFD */ |
57 | 56 | ||
58 | /* mm helpers */ | 57 | /* mm helpers */ |
59 | static inline int handle_userfault(struct vm_area_struct *vma, | 58 | static inline int handle_userfault(struct fault_env *fe, unsigned long reason) |
60 | unsigned long address, | ||
61 | unsigned int flags, | ||
62 | unsigned long reason) | ||
63 | { | 59 | { |
64 | return VM_FAULT_SIGBUS; | 60 | return VM_FAULT_SIGBUS; |
65 | } | 61 | } |
diff --git a/mm/filemap.c b/mm/filemap.c
index 20f3b1f33f0e..54d5318f8d3f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2128,22 +2128,27 @@ page_not_uptodate: | |||
2128 | } | 2128 | } |
2129 | EXPORT_SYMBOL(filemap_fault); | 2129 | EXPORT_SYMBOL(filemap_fault); |
2130 | 2130 | ||
2131 | void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) | 2131 | void filemap_map_pages(struct fault_env *fe, |
2132 | pgoff_t start_pgoff, pgoff_t end_pgoff) | ||
2132 | { | 2133 | { |
2133 | struct radix_tree_iter iter; | 2134 | struct radix_tree_iter iter; |
2134 | void **slot; | 2135 | void **slot; |
2135 | struct file *file = vma->vm_file; | 2136 | struct file *file = fe->vma->vm_file; |
2136 | struct address_space *mapping = file->f_mapping; | 2137 | struct address_space *mapping = file->f_mapping; |
2138 | pgoff_t last_pgoff = start_pgoff; | ||
2137 | loff_t size; | 2139 | loff_t size; |
2138 | struct page *page; | 2140 | struct page *page; |
2139 | unsigned long address = (unsigned long) vmf->virtual_address; | ||
2140 | unsigned long addr; | ||
2141 | pte_t *pte; | ||
2142 | 2141 | ||
2143 | rcu_read_lock(); | 2142 | rcu_read_lock(); |
2144 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { | 2143 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, |
2145 | if (iter.index > vmf->max_pgoff) | 2144 | start_pgoff) { |
2145 | if (iter.index > end_pgoff) | ||
2146 | break; | 2146 | break; |
2147 | fe->pte += iter.index - last_pgoff; | ||
2148 | fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; | ||
2149 | last_pgoff = iter.index; | ||
2150 | if (!pte_none(*fe->pte)) | ||
2151 | goto next; | ||
2147 | repeat: | 2152 | repeat: |
2148 | page = radix_tree_deref_slot(slot); | 2153 | page = radix_tree_deref_slot(slot); |
2149 | if (unlikely(!page)) | 2154 | if (unlikely(!page)) |
@@ -2179,14 +2184,9 @@ repeat: | |||
2179 | if (page->index >= size >> PAGE_SHIFT) | 2184 | if (page->index >= size >> PAGE_SHIFT) |
2180 | goto unlock; | 2185 | goto unlock; |
2181 | 2186 | ||
2182 | pte = vmf->pte + page->index - vmf->pgoff; | ||
2183 | if (!pte_none(*pte)) | ||
2184 | goto unlock; | ||
2185 | |||
2186 | if (file->f_ra.mmap_miss > 0) | 2187 | if (file->f_ra.mmap_miss > 0) |
2187 | file->f_ra.mmap_miss--; | 2188 | file->f_ra.mmap_miss--; |
2188 | addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; | 2189 | do_set_pte(fe, page); |
2189 | do_set_pte(vma, addr, page, pte, false, false); | ||
2190 | unlock_page(page); | 2190 | unlock_page(page); |
2191 | goto next; | 2191 | goto next; |
2192 | unlock: | 2192 | unlock: |
@@ -2194,7 +2194,7 @@ unlock: | |||
2194 | skip: | 2194 | skip: |
2195 | put_page(page); | 2195 | put_page(page); |
2196 | next: | 2196 | next: |
2197 | if (iter.index == vmf->max_pgoff) | 2197 | if (iter.index == end_pgoff) |
2198 | break; | 2198 | break; |
2199 | } | 2199 | } |
2200 | rcu_read_unlock(); | 2200 | rcu_read_unlock(); |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1a90f55d930f..bc5abcbe376e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -821,26 +821,23 @@ void prep_transhuge_page(struct page *page) | |||
821 | set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); | 821 | set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); |
822 | } | 822 | } |
823 | 823 | ||
824 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | 824 | static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, |
825 | struct vm_area_struct *vma, | 825 | gfp_t gfp) |
826 | unsigned long address, pmd_t *pmd, | ||
827 | struct page *page, gfp_t gfp, | ||
828 | unsigned int flags) | ||
829 | { | 826 | { |
827 | struct vm_area_struct *vma = fe->vma; | ||
830 | struct mem_cgroup *memcg; | 828 | struct mem_cgroup *memcg; |
831 | pgtable_t pgtable; | 829 | pgtable_t pgtable; |
832 | spinlock_t *ptl; | 830 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; |
833 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
834 | 831 | ||
835 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 832 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
836 | 833 | ||
837 | if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) { | 834 | if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { |
838 | put_page(page); | 835 | put_page(page); |
839 | count_vm_event(THP_FAULT_FALLBACK); | 836 | count_vm_event(THP_FAULT_FALLBACK); |
840 | return VM_FAULT_FALLBACK; | 837 | return VM_FAULT_FALLBACK; |
841 | } | 838 | } |
842 | 839 | ||
843 | pgtable = pte_alloc_one(mm, haddr); | 840 | pgtable = pte_alloc_one(vma->vm_mm, haddr); |
844 | if (unlikely(!pgtable)) { | 841 | if (unlikely(!pgtable)) { |
845 | mem_cgroup_cancel_charge(page, memcg, true); | 842 | mem_cgroup_cancel_charge(page, memcg, true); |
846 | put_page(page); | 843 | put_page(page); |
@@ -855,12 +852,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
855 | */ | 852 | */ |
856 | __SetPageUptodate(page); | 853 | __SetPageUptodate(page); |
857 | 854 | ||
858 | ptl = pmd_lock(mm, pmd); | 855 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); |
859 | if (unlikely(!pmd_none(*pmd))) { | 856 | if (unlikely(!pmd_none(*fe->pmd))) { |
860 | spin_unlock(ptl); | 857 | spin_unlock(fe->ptl); |
861 | mem_cgroup_cancel_charge(page, memcg, true); | 858 | mem_cgroup_cancel_charge(page, memcg, true); |
862 | put_page(page); | 859 | put_page(page); |
863 | pte_free(mm, pgtable); | 860 | pte_free(vma->vm_mm, pgtable); |
864 | } else { | 861 | } else { |
865 | pmd_t entry; | 862 | pmd_t entry; |
866 | 863 | ||
@@ -868,12 +865,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
868 | if (userfaultfd_missing(vma)) { | 865 | if (userfaultfd_missing(vma)) { |
869 | int ret; | 866 | int ret; |
870 | 867 | ||
871 | spin_unlock(ptl); | 868 | spin_unlock(fe->ptl); |
872 | mem_cgroup_cancel_charge(page, memcg, true); | 869 | mem_cgroup_cancel_charge(page, memcg, true); |
873 | put_page(page); | 870 | put_page(page); |
874 | pte_free(mm, pgtable); | 871 | pte_free(vma->vm_mm, pgtable); |
875 | ret = handle_userfault(vma, address, flags, | 872 | ret = handle_userfault(fe, VM_UFFD_MISSING); |
876 | VM_UFFD_MISSING); | ||
877 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); | 873 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); |
878 | return ret; | 874 | return ret; |
879 | } | 875 | } |
@@ -883,11 +879,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
883 | page_add_new_anon_rmap(page, vma, haddr, true); | 879 | page_add_new_anon_rmap(page, vma, haddr, true); |
884 | mem_cgroup_commit_charge(page, memcg, false, true); | 880 | mem_cgroup_commit_charge(page, memcg, false, true); |
885 | lru_cache_add_active_or_unevictable(page, vma); | 881 | lru_cache_add_active_or_unevictable(page, vma); |
886 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 882 | pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); |
887 | set_pmd_at(mm, haddr, pmd, entry); | 883 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); |
888 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 884 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
889 | atomic_long_inc(&mm->nr_ptes); | 885 | atomic_long_inc(&vma->vm_mm->nr_ptes); |
890 | spin_unlock(ptl); | 886 | spin_unlock(fe->ptl); |
891 | count_vm_event(THP_FAULT_ALLOC); | 887 | count_vm_event(THP_FAULT_ALLOC); |
892 | } | 888 | } |
893 | 889 | ||
@@ -937,13 +933,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | |||
937 | return true; | 933 | return true; |
938 | } | 934 | } |
939 | 935 | ||
940 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 936 | int do_huge_pmd_anonymous_page(struct fault_env *fe) |
941 | unsigned long address, pmd_t *pmd, | ||
942 | unsigned int flags) | ||
943 | { | 937 | { |
938 | struct vm_area_struct *vma = fe->vma; | ||
944 | gfp_t gfp; | 939 | gfp_t gfp; |
945 | struct page *page; | 940 | struct page *page; |
946 | unsigned long haddr = address & HPAGE_PMD_MASK; | 941 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; |
947 | 942 | ||
948 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) | 943 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) |
949 | return VM_FAULT_FALLBACK; | 944 | return VM_FAULT_FALLBACK; |
@@ -951,42 +946,40 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
951 | return VM_FAULT_OOM; | 946 | return VM_FAULT_OOM; |
952 | if (unlikely(khugepaged_enter(vma, vma->vm_flags))) | 947 | if (unlikely(khugepaged_enter(vma, vma->vm_flags))) |
953 | return VM_FAULT_OOM; | 948 | return VM_FAULT_OOM; |
954 | if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) && | 949 | if (!(fe->flags & FAULT_FLAG_WRITE) && |
950 | !mm_forbids_zeropage(vma->vm_mm) && | ||
955 | transparent_hugepage_use_zero_page()) { | 951 | transparent_hugepage_use_zero_page()) { |
956 | spinlock_t *ptl; | ||
957 | pgtable_t pgtable; | 952 | pgtable_t pgtable; |
958 | struct page *zero_page; | 953 | struct page *zero_page; |
959 | bool set; | 954 | bool set; |
960 | int ret; | 955 | int ret; |
961 | pgtable = pte_alloc_one(mm, haddr); | 956 | pgtable = pte_alloc_one(vma->vm_mm, haddr); |
962 | if (unlikely(!pgtable)) | 957 | if (unlikely(!pgtable)) |
963 | return VM_FAULT_OOM; | 958 | return VM_FAULT_OOM; |
964 | zero_page = get_huge_zero_page(); | 959 | zero_page = get_huge_zero_page(); |
965 | if (unlikely(!zero_page)) { | 960 | if (unlikely(!zero_page)) { |
966 | pte_free(mm, pgtable); | 961 | pte_free(vma->vm_mm, pgtable); |
967 | count_vm_event(THP_FAULT_FALLBACK); | 962 | count_vm_event(THP_FAULT_FALLBACK); |
968 | return VM_FAULT_FALLBACK; | 963 | return VM_FAULT_FALLBACK; |
969 | } | 964 | } |
970 | ptl = pmd_lock(mm, pmd); | 965 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); |
971 | ret = 0; | 966 | ret = 0; |
972 | set = false; | 967 | set = false; |
973 | if (pmd_none(*pmd)) { | 968 | if (pmd_none(*fe->pmd)) { |
974 | if (userfaultfd_missing(vma)) { | 969 | if (userfaultfd_missing(vma)) { |
975 | spin_unlock(ptl); | 970 | spin_unlock(fe->ptl); |
976 | ret = handle_userfault(vma, address, flags, | 971 | ret = handle_userfault(fe, VM_UFFD_MISSING); |
977 | VM_UFFD_MISSING); | ||
978 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); | 972 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); |
979 | } else { | 973 | } else { |
980 | set_huge_zero_page(pgtable, mm, vma, | 974 | set_huge_zero_page(pgtable, vma->vm_mm, vma, |
981 | haddr, pmd, | 975 | haddr, fe->pmd, zero_page); |
982 | zero_page); | 976 | spin_unlock(fe->ptl); |
983 | spin_unlock(ptl); | ||
984 | set = true; | 977 | set = true; |
985 | } | 978 | } |
986 | } else | 979 | } else |
987 | spin_unlock(ptl); | 980 | spin_unlock(fe->ptl); |
988 | if (!set) { | 981 | if (!set) { |
989 | pte_free(mm, pgtable); | 982 | pte_free(vma->vm_mm, pgtable); |
990 | put_huge_zero_page(); | 983 | put_huge_zero_page(); |
991 | } | 984 | } |
992 | return ret; | 985 | return ret; |
@@ -998,8 +991,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
998 | return VM_FAULT_FALLBACK; | 991 | return VM_FAULT_FALLBACK; |
999 | } | 992 | } |
1000 | prep_transhuge_page(page); | 993 | prep_transhuge_page(page); |
1001 | return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, | 994 | return __do_huge_pmd_anonymous_page(fe, page, gfp); |
1002 | flags); | ||
1003 | } | 995 | } |
1004 | 996 | ||
1005 | static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, | 997 | static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, |
@@ -1172,38 +1164,31 @@ out: | |||
1172 | return ret; | 1164 | return ret; |
1173 | } | 1165 | } |
1174 | 1166 | ||
1175 | void huge_pmd_set_accessed(struct mm_struct *mm, | 1167 | void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) |
1176 | struct vm_area_struct *vma, | ||
1177 | unsigned long address, | ||
1178 | pmd_t *pmd, pmd_t orig_pmd, | ||
1179 | int dirty) | ||
1180 | { | 1168 | { |
1181 | spinlock_t *ptl; | ||
1182 | pmd_t entry; | 1169 | pmd_t entry; |
1183 | unsigned long haddr; | 1170 | unsigned long haddr; |
1184 | 1171 | ||
1185 | ptl = pmd_lock(mm, pmd); | 1172 | fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); |
1186 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 1173 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) |
1187 | goto unlock; | 1174 | goto unlock; |
1188 | 1175 | ||
1189 | entry = pmd_mkyoung(orig_pmd); | 1176 | entry = pmd_mkyoung(orig_pmd); |
1190 | haddr = address & HPAGE_PMD_MASK; | 1177 | haddr = fe->address & HPAGE_PMD_MASK; |
1191 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) | 1178 | if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, |
1192 | update_mmu_cache_pmd(vma, address, pmd); | 1179 | fe->flags & FAULT_FLAG_WRITE)) |
1180 | update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); | ||
1193 | 1181 | ||
1194 | unlock: | 1182 | unlock: |
1195 | spin_unlock(ptl); | 1183 | spin_unlock(fe->ptl); |
1196 | } | 1184 | } |
1197 | 1185 | ||
1198 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 1186 | static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, |
1199 | struct vm_area_struct *vma, | 1187 | struct page *page) |
1200 | unsigned long address, | ||
1201 | pmd_t *pmd, pmd_t orig_pmd, | ||
1202 | struct page *page, | ||
1203 | unsigned long haddr) | ||
1204 | { | 1188 | { |
1189 | struct vm_area_struct *vma = fe->vma; | ||
1190 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | ||
1205 | struct mem_cgroup *memcg; | 1191 | struct mem_cgroup *memcg; |
1206 | spinlock_t *ptl; | ||
1207 | pgtable_t pgtable; | 1192 | pgtable_t pgtable; |
1208 | pmd_t _pmd; | 1193 | pmd_t _pmd; |
1209 | int ret = 0, i; | 1194 | int ret = 0, i; |
@@ -1220,11 +1205,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
1220 | 1205 | ||
1221 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 1206 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
1222 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | | 1207 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | |
1223 | __GFP_OTHER_NODE, | 1208 | __GFP_OTHER_NODE, vma, |
1224 | vma, address, page_to_nid(page)); | 1209 | fe->address, page_to_nid(page)); |
1225 | if (unlikely(!pages[i] || | 1210 | if (unlikely(!pages[i] || |
1226 | mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, | 1211 | mem_cgroup_try_charge(pages[i], vma->vm_mm, |
1227 | &memcg, false))) { | 1212 | GFP_KERNEL, &memcg, false))) { |
1228 | if (pages[i]) | 1213 | if (pages[i]) |
1229 | put_page(pages[i]); | 1214 | put_page(pages[i]); |
1230 | while (--i >= 0) { | 1215 | while (--i >= 0) { |
@@ -1250,41 +1235,41 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
1250 | 1235 | ||
1251 | mmun_start = haddr; | 1236 | mmun_start = haddr; |
1252 | mmun_end = haddr + HPAGE_PMD_SIZE; | 1237 | mmun_end = haddr + HPAGE_PMD_SIZE; |
1253 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1238 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); |
1254 | 1239 | ||
1255 | ptl = pmd_lock(mm, pmd); | 1240 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); |
1256 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 1241 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) |
1257 | goto out_free_pages; | 1242 | goto out_free_pages; |
1258 | VM_BUG_ON_PAGE(!PageHead(page), page); | 1243 | VM_BUG_ON_PAGE(!PageHead(page), page); |
1259 | 1244 | ||
1260 | pmdp_huge_clear_flush_notify(vma, haddr, pmd); | 1245 | pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); |
1261 | /* leave pmd empty until pte is filled */ | 1246 | /* leave pmd empty until pte is filled */ |
1262 | 1247 | ||
1263 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | 1248 | pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); |
1264 | pmd_populate(mm, &_pmd, pgtable); | 1249 | pmd_populate(vma->vm_mm, &_pmd, pgtable); |
1265 | 1250 | ||
1266 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 1251 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
1267 | pte_t *pte, entry; | 1252 | pte_t entry; |
1268 | entry = mk_pte(pages[i], vma->vm_page_prot); | 1253 | entry = mk_pte(pages[i], vma->vm_page_prot); |
1269 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1254 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1270 | memcg = (void *)page_private(pages[i]); | 1255 | memcg = (void *)page_private(pages[i]); |
1271 | set_page_private(pages[i], 0); | 1256 | set_page_private(pages[i], 0); |
1272 | page_add_new_anon_rmap(pages[i], vma, haddr, false); | 1257 | page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); |
1273 | mem_cgroup_commit_charge(pages[i], memcg, false, false); | 1258 | mem_cgroup_commit_charge(pages[i], memcg, false, false); |
1274 | lru_cache_add_active_or_unevictable(pages[i], vma); | 1259 | lru_cache_add_active_or_unevictable(pages[i], vma); |
1275 | pte = pte_offset_map(&_pmd, haddr); | 1260 | fe->pte = pte_offset_map(&_pmd, haddr); |
1276 | VM_BUG_ON(!pte_none(*pte)); | 1261 | VM_BUG_ON(!pte_none(*fe->pte)); |
1277 | set_pte_at(mm, haddr, pte, entry); | 1262 | set_pte_at(vma->vm_mm, haddr, fe->pte, entry); |
1278 | pte_unmap(pte); | 1263 | pte_unmap(fe->pte); |
1279 | } | 1264 | } |
1280 | kfree(pages); | 1265 | kfree(pages); |
1281 | 1266 | ||
1282 | smp_wmb(); /* make pte visible before pmd */ | 1267 | smp_wmb(); /* make pte visible before pmd */ |
1283 | pmd_populate(mm, pmd, pgtable); | 1268 | pmd_populate(vma->vm_mm, fe->pmd, pgtable); |
1284 | page_remove_rmap(page, true); | 1269 | page_remove_rmap(page, true); |
1285 | spin_unlock(ptl); | 1270 | spin_unlock(fe->ptl); |
1286 | 1271 | ||
1287 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1272 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
1288 | 1273 | ||
1289 | ret |= VM_FAULT_WRITE; | 1274 | ret |= VM_FAULT_WRITE; |
1290 | put_page(page); | 1275 | put_page(page); |
@@ -1293,8 +1278,8 @@ out: | |||
1293 | return ret; | 1278 | return ret; |
1294 | 1279 | ||
1295 | out_free_pages: | 1280 | out_free_pages: |
1296 | spin_unlock(ptl); | 1281 | spin_unlock(fe->ptl); |
1297 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1282 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
1298 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 1283 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
1299 | memcg = (void *)page_private(pages[i]); | 1284 | memcg = (void *)page_private(pages[i]); |
1300 | set_page_private(pages[i], 0); | 1285 | set_page_private(pages[i], 0); |
@@ -1305,25 +1290,23 @@ out_free_pages: | |||
1305 | goto out; | 1290 | goto out; |
1306 | } | 1291 | } |
1307 | 1292 | ||
1308 | int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1293 | int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) |
1309 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) | ||
1310 | { | 1294 | { |
1311 | spinlock_t *ptl; | 1295 | struct vm_area_struct *vma = fe->vma; |
1312 | int ret = 0; | ||
1313 | struct page *page = NULL, *new_page; | 1296 | struct page *page = NULL, *new_page; |
1314 | struct mem_cgroup *memcg; | 1297 | struct mem_cgroup *memcg; |
1315 | unsigned long haddr; | 1298 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; |
1316 | unsigned long mmun_start; /* For mmu_notifiers */ | 1299 | unsigned long mmun_start; /* For mmu_notifiers */ |
1317 | unsigned long mmun_end; /* For mmu_notifiers */ | 1300 | unsigned long mmun_end; /* For mmu_notifiers */ |
1318 | gfp_t huge_gfp; /* for allocation and charge */ | 1301 | gfp_t huge_gfp; /* for allocation and charge */ |
1302 | int ret = 0; | ||
1319 | 1303 | ||
1320 | ptl = pmd_lockptr(mm, pmd); | 1304 | fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); |
1321 | VM_BUG_ON_VMA(!vma->anon_vma, vma); | 1305 | VM_BUG_ON_VMA(!vma->anon_vma, vma); |
1322 | haddr = address & HPAGE_PMD_MASK; | ||
1323 | if (is_huge_zero_pmd(orig_pmd)) | 1306 | if (is_huge_zero_pmd(orig_pmd)) |
1324 | goto alloc; | 1307 | goto alloc; |
1325 | spin_lock(ptl); | 1308 | spin_lock(fe->ptl); |
1326 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 1309 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) |
1327 | goto out_unlock; | 1310 | goto out_unlock; |
1328 | 1311 | ||
1329 | page = pmd_page(orig_pmd); | 1312 | page = pmd_page(orig_pmd); |
@@ -1336,13 +1319,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1336 | pmd_t entry; | 1319 | pmd_t entry; |
1337 | entry = pmd_mkyoung(orig_pmd); | 1320 | entry = pmd_mkyoung(orig_pmd); |
1338 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1321 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1339 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) | 1322 | if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) |
1340 | update_mmu_cache_pmd(vma, address, pmd); | 1323 | update_mmu_cache_pmd(vma, fe->address, fe->pmd); |
1341 | ret |= VM_FAULT_WRITE; | 1324 | ret |= VM_FAULT_WRITE; |
1342 | goto out_unlock; | 1325 | goto out_unlock; |
1343 | } | 1326 | } |
1344 | get_page(page); | 1327 | get_page(page); |
1345 | spin_unlock(ptl); | 1328 | spin_unlock(fe->ptl); |
1346 | alloc: | 1329 | alloc: |
1347 | if (transparent_hugepage_enabled(vma) && | 1330 | if (transparent_hugepage_enabled(vma) && |
1348 | !transparent_hugepage_debug_cow()) { | 1331 | !transparent_hugepage_debug_cow()) { |
@@ -1355,13 +1338,12 @@ alloc: | |||
1355 | prep_transhuge_page(new_page); | 1338 | prep_transhuge_page(new_page); |
1356 | } else { | 1339 | } else { |
1357 | if (!page) { | 1340 | if (!page) { |
1358 | split_huge_pmd(vma, pmd, address); | 1341 | split_huge_pmd(vma, fe->pmd, fe->address); |
1359 | ret |= VM_FAULT_FALLBACK; | 1342 | ret |= VM_FAULT_FALLBACK; |
1360 | } else { | 1343 | } else { |
1361 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | 1344 | ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); |
1362 | pmd, orig_pmd, page, haddr); | ||
1363 | if (ret & VM_FAULT_OOM) { | 1345 | if (ret & VM_FAULT_OOM) { |
1364 | split_huge_pmd(vma, pmd, address); | 1346 | split_huge_pmd(vma, fe->pmd, fe->address); |
1365 | ret |= VM_FAULT_FALLBACK; | 1347 | ret |= VM_FAULT_FALLBACK; |
1366 | } | 1348 | } |
1367 | put_page(page); | 1349 | put_page(page); |
@@ -1370,14 +1352,12 @@ alloc: | |||
1370 | goto out; | 1352 | goto out; |
1371 | } | 1353 | } |
1372 | 1354 | ||
1373 | if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg, | 1355 | if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, |
1374 | true))) { | 1356 | huge_gfp, &memcg, true))) { |
1375 | put_page(new_page); | 1357 | put_page(new_page); |
1376 | if (page) { | 1358 | split_huge_pmd(vma, fe->pmd, fe->address); |
1377 | split_huge_pmd(vma, pmd, address); | 1359 | if (page) |
1378 | put_page(page); | 1360 | put_page(page); |
1379 | } else | ||
1380 | split_huge_pmd(vma, pmd, address); | ||
1381 | ret |= VM_FAULT_FALLBACK; | 1361 | ret |= VM_FAULT_FALLBACK; |
1382 | count_vm_event(THP_FAULT_FALLBACK); | 1362 | count_vm_event(THP_FAULT_FALLBACK); |
1383 | goto out; | 1363 | goto out; |
@@ -1393,13 +1373,13 @@ alloc: | |||
1393 | 1373 | ||
1394 | mmun_start = haddr; | 1374 | mmun_start = haddr; |
1395 | mmun_end = haddr + HPAGE_PMD_SIZE; | 1375 | mmun_end = haddr + HPAGE_PMD_SIZE; |
1396 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1376 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); |
1397 | 1377 | ||
1398 | spin_lock(ptl); | 1378 | spin_lock(fe->ptl); |
1399 | if (page) | 1379 | if (page) |
1400 | put_page(page); | 1380 | put_page(page); |
1401 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 1381 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { |
1402 | spin_unlock(ptl); | 1382 | spin_unlock(fe->ptl); |
1403 | mem_cgroup_cancel_charge(new_page, memcg, true); | 1383 | mem_cgroup_cancel_charge(new_page, memcg, true); |
1404 | put_page(new_page); | 1384 | put_page(new_page); |
1405 | goto out_mn; | 1385 | goto out_mn; |
@@ -1407,14 +1387,14 @@ alloc: | |||
1407 | pmd_t entry; | 1387 | pmd_t entry; |
1408 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); | 1388 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); |
1409 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1389 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1410 | pmdp_huge_clear_flush_notify(vma, haddr, pmd); | 1390 | pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); |
1411 | page_add_new_anon_rmap(new_page, vma, haddr, true); | 1391 | page_add_new_anon_rmap(new_page, vma, haddr, true); |
1412 | mem_cgroup_commit_charge(new_page, memcg, false, true); | 1392 | mem_cgroup_commit_charge(new_page, memcg, false, true); |
1413 | lru_cache_add_active_or_unevictable(new_page, vma); | 1393 | lru_cache_add_active_or_unevictable(new_page, vma); |
1414 | set_pmd_at(mm, haddr, pmd, entry); | 1394 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); |
1415 | update_mmu_cache_pmd(vma, address, pmd); | 1395 | update_mmu_cache_pmd(vma, fe->address, fe->pmd); |
1416 | if (!page) { | 1396 | if (!page) { |
1417 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 1397 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
1418 | put_huge_zero_page(); | 1398 | put_huge_zero_page(); |
1419 | } else { | 1399 | } else { |
1420 | VM_BUG_ON_PAGE(!PageHead(page), page); | 1400 | VM_BUG_ON_PAGE(!PageHead(page), page); |
@@ -1423,13 +1403,13 @@ alloc: | |||
1423 | } | 1403 | } |
1424 | ret |= VM_FAULT_WRITE; | 1404 | ret |= VM_FAULT_WRITE; |
1425 | } | 1405 | } |
1426 | spin_unlock(ptl); | 1406 | spin_unlock(fe->ptl); |
1427 | out_mn: | 1407 | out_mn: |
1428 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1408 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
1429 | out: | 1409 | out: |
1430 | return ret; | 1410 | return ret; |
1431 | out_unlock: | 1411 | out_unlock: |
1432 | spin_unlock(ptl); | 1412 | spin_unlock(fe->ptl); |
1433 | return ret; | 1413 | return ret; |
1434 | } | 1414 | } |
1435 | 1415 | ||
@@ -1489,13 +1469,12 @@ out: | |||
1489 | } | 1469 | } |
1490 | 1470 | ||
1491 | /* NUMA hinting page fault entry point for trans huge pmds */ | 1471 | /* NUMA hinting page fault entry point for trans huge pmds */ |
1492 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1472 | int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) |
1493 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | ||
1494 | { | 1473 | { |
1495 | spinlock_t *ptl; | 1474 | struct vm_area_struct *vma = fe->vma; |
1496 | struct anon_vma *anon_vma = NULL; | 1475 | struct anon_vma *anon_vma = NULL; |
1497 | struct page *page; | 1476 | struct page *page; |
1498 | unsigned long haddr = addr & HPAGE_PMD_MASK; | 1477 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; |
1499 | int page_nid = -1, this_nid = numa_node_id(); | 1478 | int page_nid = -1, this_nid = numa_node_id(); |
1500 | int target_nid, last_cpupid = -1; | 1479 | int target_nid, last_cpupid = -1; |
1501 | bool page_locked; | 1480 | bool page_locked; |
@@ -1506,8 +1485,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1506 | /* A PROT_NONE fault should not end up here */ | 1485 | /* A PROT_NONE fault should not end up here */ |
1507 | BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); | 1486 | BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); |
1508 | 1487 | ||
1509 | ptl = pmd_lock(mm, pmdp); | 1488 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); |
1510 | if (unlikely(!pmd_same(pmd, *pmdp))) | 1489 | if (unlikely(!pmd_same(pmd, *fe->pmd))) |
1511 | goto out_unlock; | 1490 | goto out_unlock; |
1512 | 1491 | ||
1513 | /* | 1492 | /* |
@@ -1515,9 +1494,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1515 | * without disrupting NUMA hinting information. Do not relock and | 1494 | * without disrupting NUMA hinting information. Do not relock and |
1516 | * check_same as the page may no longer be mapped. | 1495 | * check_same as the page may no longer be mapped. |
1517 | */ | 1496 | */ |
1518 | if (unlikely(pmd_trans_migrating(*pmdp))) { | 1497 | if (unlikely(pmd_trans_migrating(*fe->pmd))) { |
1519 | page = pmd_page(*pmdp); | 1498 | page = pmd_page(*fe->pmd); |
1520 | spin_unlock(ptl); | 1499 | spin_unlock(fe->ptl); |
1521 | wait_on_page_locked(page); | 1500 | wait_on_page_locked(page); |
1522 | goto out; | 1501 | goto out; |
1523 | } | 1502 | } |
@@ -1550,7 +1529,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1550 | 1529 | ||
1551 | /* Migration could have started since the pmd_trans_migrating check */ | 1530 | /* Migration could have started since the pmd_trans_migrating check */ |
1552 | if (!page_locked) { | 1531 | if (!page_locked) { |
1553 | spin_unlock(ptl); | 1532 | spin_unlock(fe->ptl); |
1554 | wait_on_page_locked(page); | 1533 | wait_on_page_locked(page); |
1555 | page_nid = -1; | 1534 | page_nid = -1; |
1556 | goto out; | 1535 | goto out; |
@@ -1561,12 +1540,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1561 | * to serialises splits | 1540 | * to serialises splits |
1562 | */ | 1541 | */ |
1563 | get_page(page); | 1542 | get_page(page); |
1564 | spin_unlock(ptl); | 1543 | spin_unlock(fe->ptl); |
1565 | anon_vma = page_lock_anon_vma_read(page); | 1544 | anon_vma = page_lock_anon_vma_read(page); |
1566 | 1545 | ||
1567 | /* Confirm the PMD did not change while page_table_lock was released */ | 1546 | /* Confirm the PMD did not change while page_table_lock was released */ |
1568 | spin_lock(ptl); | 1547 | spin_lock(fe->ptl); |
1569 | if (unlikely(!pmd_same(pmd, *pmdp))) { | 1548 | if (unlikely(!pmd_same(pmd, *fe->pmd))) { |
1570 | unlock_page(page); | 1549 | unlock_page(page); |
1571 | put_page(page); | 1550 | put_page(page); |
1572 | page_nid = -1; | 1551 | page_nid = -1; |
@@ -1584,9 +1563,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1584 | * Migrate the THP to the requested node, returns with page unlocked | 1563 | * Migrate the THP to the requested node, returns with page unlocked |
1585 | * and access rights restored. | 1564 | * and access rights restored. |
1586 | */ | 1565 | */ |
1587 | spin_unlock(ptl); | 1566 | spin_unlock(fe->ptl); |
1588 | migrated = migrate_misplaced_transhuge_page(mm, vma, | 1567 | migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, |
1589 | pmdp, pmd, addr, page, target_nid); | 1568 | fe->pmd, pmd, fe->address, page, target_nid); |
1590 | if (migrated) { | 1569 | if (migrated) { |
1591 | flags |= TNF_MIGRATED; | 1570 | flags |= TNF_MIGRATED; |
1592 | page_nid = target_nid; | 1571 | page_nid = target_nid; |
@@ -1601,18 +1580,18 @@ clear_pmdnuma: | |||
1601 | pmd = pmd_mkyoung(pmd); | 1580 | pmd = pmd_mkyoung(pmd); |
1602 | if (was_writable) | 1581 | if (was_writable) |
1603 | pmd = pmd_mkwrite(pmd); | 1582 | pmd = pmd_mkwrite(pmd); |
1604 | set_pmd_at(mm, haddr, pmdp, pmd); | 1583 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); |
1605 | update_mmu_cache_pmd(vma, addr, pmdp); | 1584 | update_mmu_cache_pmd(vma, fe->address, fe->pmd); |
1606 | unlock_page(page); | 1585 | unlock_page(page); |
1607 | out_unlock: | 1586 | out_unlock: |
1608 | spin_unlock(ptl); | 1587 | spin_unlock(fe->ptl); |
1609 | 1588 | ||
1610 | out: | 1589 | out: |
1611 | if (anon_vma) | 1590 | if (anon_vma) |
1612 | page_unlock_anon_vma_read(anon_vma); | 1591 | page_unlock_anon_vma_read(anon_vma); |
1613 | 1592 | ||
1614 | if (page_nid != -1) | 1593 | if (page_nid != -1) |
1615 | task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); | 1594 | task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); |
1616 | 1595 | ||
1617 | return 0; | 1596 | return 0; |
1618 | } | 1597 | } |
@@ -2413,20 +2392,23 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, | |||
2413 | struct vm_area_struct *vma, | 2392 | struct vm_area_struct *vma, |
2414 | unsigned long address, pmd_t *pmd) | 2393 | unsigned long address, pmd_t *pmd) |
2415 | { | 2394 | { |
2416 | unsigned long _address; | 2395 | pte_t pteval; |
2417 | pte_t *pte, pteval; | ||
2418 | int swapped_in = 0, ret = 0; | 2396 | int swapped_in = 0, ret = 0; |
2419 | 2397 | struct fault_env fe = { | |
2420 | pte = pte_offset_map(pmd, address); | 2398 | .vma = vma, |
2421 | for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE; | 2399 | .address = address, |
2422 | pte++, _address += PAGE_SIZE) { | 2400 | .flags = FAULT_FLAG_ALLOW_RETRY, |
2423 | pteval = *pte; | 2401 | .pmd = pmd, |
2402 | }; | ||
2403 | |||
2404 | fe.pte = pte_offset_map(pmd, address); | ||
2405 | for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; | ||
2406 | fe.pte++, fe.address += PAGE_SIZE) { | ||
2407 | pteval = *fe.pte; | ||
2424 | if (!is_swap_pte(pteval)) | 2408 | if (!is_swap_pte(pteval)) |
2425 | continue; | 2409 | continue; |
2426 | swapped_in++; | 2410 | swapped_in++; |
2427 | ret = do_swap_page(mm, vma, _address, pte, pmd, | 2411 | ret = do_swap_page(&fe, pteval); |
2428 | FAULT_FLAG_ALLOW_RETRY, | ||
2429 | pteval); | ||
2430 | /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ | 2412 | /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ |
2431 | if (ret & VM_FAULT_RETRY) { | 2413 | if (ret & VM_FAULT_RETRY) { |
2432 | down_read(&mm->mmap_sem); | 2414 | down_read(&mm->mmap_sem); |
@@ -2442,10 +2424,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, | |||
2442 | return false; | 2424 | return false; |
2443 | } | 2425 | } |
2444 | /* pte is unmapped now, we need to map it */ | 2426 | /* pte is unmapped now, we need to map it */ |
2445 | pte = pte_offset_map(pmd, _address); | 2427 | fe.pte = pte_offset_map(pmd, fe.address); |
2446 | } | 2428 | } |
2447 | pte--; | 2429 | fe.pte--; |
2448 | pte_unmap(pte); | 2430 | pte_unmap(fe.pte); |
2449 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1); | 2431 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1); |
2450 | return true; | 2432 | return true; |
2451 | } | 2433 | } |
diff --git a/mm/internal.h b/mm/internal.h
index e1531758122b..9b6a6c43ac39 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,9 +36,7 @@ | |||
36 | /* Do not use these with a slab allocator */ | 36 | /* Do not use these with a slab allocator */ |
37 | #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) | 37 | #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) |
38 | 38 | ||
39 | extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | 39 | int do_swap_page(struct fault_env *fe, pte_t orig_pte); |
40 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
41 | unsigned int flags, pte_t orig_pte); | ||
42 | 40 | ||
43 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 41 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
44 | unsigned long floor, unsigned long ceiling); | 42 | unsigned long floor, unsigned long ceiling); |
diff --git a/mm/memory.c b/mm/memory.c
index 6bf2b8564376..72b520897339 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2070,13 +2070,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | |||
2070 | * case, all we need to do here is to mark the page as writable and update | 2070 | * case, all we need to do here is to mark the page as writable and update |
2071 | * any related book-keeping. | 2071 | * any related book-keeping. |
2072 | */ | 2072 | */ |
2073 | static inline int wp_page_reuse(struct mm_struct *mm, | 2073 | static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, |
2074 | struct vm_area_struct *vma, unsigned long address, | 2074 | struct page *page, int page_mkwrite, int dirty_shared) |
2075 | pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, | 2075 | __releases(fe->ptl) |
2076 | struct page *page, int page_mkwrite, | ||
2077 | int dirty_shared) | ||
2078 | __releases(ptl) | ||
2079 | { | 2076 | { |
2077 | struct vm_area_struct *vma = fe->vma; | ||
2080 | pte_t entry; | 2078 | pte_t entry; |
2081 | /* | 2079 | /* |
2082 | * Clear the pages cpupid information as the existing | 2080 | * Clear the pages cpupid information as the existing |
@@ -2086,12 +2084,12 @@ static inline int wp_page_reuse(struct mm_struct *mm, | |||
2086 | if (page) | 2084 | if (page) |
2087 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); | 2085 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); |
2088 | 2086 | ||
2089 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2087 | flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); |
2090 | entry = pte_mkyoung(orig_pte); | 2088 | entry = pte_mkyoung(orig_pte); |
2091 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2089 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2092 | if (ptep_set_access_flags(vma, address, page_table, entry, 1)) | 2090 | if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) |
2093 | update_mmu_cache(vma, address, page_table); | 2091 | update_mmu_cache(vma, fe->address, fe->pte); |
2094 | pte_unmap_unlock(page_table, ptl); | 2092 | pte_unmap_unlock(fe->pte, fe->ptl); |
2095 | 2093 | ||
2096 | if (dirty_shared) { | 2094 | if (dirty_shared) { |
2097 | struct address_space *mapping; | 2095 | struct address_space *mapping; |
@@ -2137,30 +2135,31 @@ static inline int wp_page_reuse(struct mm_struct *mm, | |||
2137 | * held to the old page, as well as updating the rmap. | 2135 | * held to the old page, as well as updating the rmap. |
2138 | * - In any case, unlock the PTL and drop the reference we took to the old page. | 2136 | * - In any case, unlock the PTL and drop the reference we took to the old page. |
2139 | */ | 2137 | */ |
2140 | static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | 2138 | static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, |
2141 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2139 | struct page *old_page) |
2142 | pte_t orig_pte, struct page *old_page) | ||
2143 | { | 2140 | { |
2141 | struct vm_area_struct *vma = fe->vma; | ||
2142 | struct mm_struct *mm = vma->vm_mm; | ||
2144 | struct page *new_page = NULL; | 2143 | struct page *new_page = NULL; |
2145 | spinlock_t *ptl = NULL; | ||
2146 | pte_t entry; | 2144 | pte_t entry; |
2147 | int page_copied = 0; | 2145 | int page_copied = 0; |
2148 | const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */ | 2146 | const unsigned long mmun_start = fe->address & PAGE_MASK; |
2149 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */ | 2147 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; |
2150 | struct mem_cgroup *memcg; | 2148 | struct mem_cgroup *memcg; |
2151 | 2149 | ||
2152 | if (unlikely(anon_vma_prepare(vma))) | 2150 | if (unlikely(anon_vma_prepare(vma))) |
2153 | goto oom; | 2151 | goto oom; |
2154 | 2152 | ||
2155 | if (is_zero_pfn(pte_pfn(orig_pte))) { | 2153 | if (is_zero_pfn(pte_pfn(orig_pte))) { |
2156 | new_page = alloc_zeroed_user_highpage_movable(vma, address); | 2154 | new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); |
2157 | if (!new_page) | 2155 | if (!new_page) |
2158 | goto oom; | 2156 | goto oom; |
2159 | } else { | 2157 | } else { |
2160 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 2158 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, |
2159 | fe->address); | ||
2161 | if (!new_page) | 2160 | if (!new_page) |
2162 | goto oom; | 2161 | goto oom; |
2163 | cow_user_page(new_page, old_page, address, vma); | 2162 | cow_user_page(new_page, old_page, fe->address, vma); |
2164 | } | 2163 | } |
2165 | 2164 | ||
2166 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) | 2165 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) |
@@ -2173,8 +2172,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2173 | /* | 2172 | /* |
2174 | * Re-check the pte - we dropped the lock | 2173 | * Re-check the pte - we dropped the lock |
2175 | */ | 2174 | */ |
2176 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2175 | fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); |
2177 | if (likely(pte_same(*page_table, orig_pte))) { | 2176 | if (likely(pte_same(*fe->pte, orig_pte))) { |
2178 | if (old_page) { | 2177 | if (old_page) { |
2179 | if (!PageAnon(old_page)) { | 2178 | if (!PageAnon(old_page)) { |
2180 | dec_mm_counter_fast(mm, | 2179 | dec_mm_counter_fast(mm, |
@@ -2184,7 +2183,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2184 | } else { | 2183 | } else { |
2185 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2184 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2186 | } | 2185 | } |
2187 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2186 | flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); |
2188 | entry = mk_pte(new_page, vma->vm_page_prot); | 2187 | entry = mk_pte(new_page, vma->vm_page_prot); |
2189 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2188 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2190 | /* | 2189 | /* |
@@ -2193,8 +2192,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2193 | * seen in the presence of one thread doing SMC and another | 2192 | * seen in the presence of one thread doing SMC and another |
2194 | * thread doing COW. | 2193 | * thread doing COW. |
2195 | */ | 2194 | */ |
2196 | ptep_clear_flush_notify(vma, address, page_table); | 2195 | ptep_clear_flush_notify(vma, fe->address, fe->pte); |
2197 | page_add_new_anon_rmap(new_page, vma, address, false); | 2196 | page_add_new_anon_rmap(new_page, vma, fe->address, false); |
2198 | mem_cgroup_commit_charge(new_page, memcg, false, false); | 2197 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
2199 | lru_cache_add_active_or_unevictable(new_page, vma); | 2198 | lru_cache_add_active_or_unevictable(new_page, vma); |
2200 | /* | 2199 | /* |
@@ -2202,8 +2201,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2202 | * mmu page tables (such as kvm shadow page tables), we want the | 2201 | * mmu page tables (such as kvm shadow page tables), we want the |
2203 | * new page to be mapped directly into the secondary page table. | 2202 | * new page to be mapped directly into the secondary page table. |
2204 | */ | 2203 | */ |
2205 | set_pte_at_notify(mm, address, page_table, entry); | 2204 | set_pte_at_notify(mm, fe->address, fe->pte, entry); |
2206 | update_mmu_cache(vma, address, page_table); | 2205 | update_mmu_cache(vma, fe->address, fe->pte); |
2207 | if (old_page) { | 2206 | if (old_page) { |
2208 | /* | 2207 | /* |
2209 | * Only after switching the pte to the new page may | 2208 | * Only after switching the pte to the new page may |
@@ -2240,7 +2239,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2240 | if (new_page) | 2239 | if (new_page) |
2241 | put_page(new_page); | 2240 | put_page(new_page); |
2242 | 2241 | ||
2243 | pte_unmap_unlock(page_table, ptl); | 2242 | pte_unmap_unlock(fe->pte, fe->ptl); |
2244 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2243 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2245 | if (old_page) { | 2244 | if (old_page) { |
2246 | /* | 2245 | /* |
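The fields of the new fault_env referenced throughout these hunks (fe->vma, fe->address, fe->flags, fe->pmd, fe->pte, fe->ptl) are defined in the include/linux/mm.h part of this patch, which is outside this excerpt. A minimal sketch reconstructed from those accesses, not the authoritative definition:

/* Sketch only -- reconstructed from the fe->... accesses in mm/memory.c below;
 * see include/linux/mm.h in this patch for the real definition. */
struct fault_env {
	struct vm_area_struct *vma;	/* target VMA */
	unsigned long address;		/* faulting virtual address */
	unsigned int flags;		/* FAULT_FLAG_xxx flags */
	pmd_t *pmd;			/* pmd entry covering 'address' */
	pte_t *pte;			/* pte mapped via pte_offset_map_lock() */
	spinlock_t *ptl;		/* page table lock protecting 'pte' */
};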
@@ -2268,44 +2267,43 @@ oom: | |||
2268 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED | 2267 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED |
2269 | * mapping | 2268 | * mapping |
2270 | */ | 2269 | */ |
2271 | static int wp_pfn_shared(struct mm_struct *mm, | 2270 | static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) |
2272 | struct vm_area_struct *vma, unsigned long address, | ||
2273 | pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, | ||
2274 | pmd_t *pmd) | ||
2275 | { | 2271 | { |
2272 | struct vm_area_struct *vma = fe->vma; | ||
2273 | |||
2276 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { | 2274 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { |
2277 | struct vm_fault vmf = { | 2275 | struct vm_fault vmf = { |
2278 | .page = NULL, | 2276 | .page = NULL, |
2279 | .pgoff = linear_page_index(vma, address), | 2277 | .pgoff = linear_page_index(vma, fe->address), |
2280 | .virtual_address = (void __user *)(address & PAGE_MASK), | 2278 | .virtual_address = |
2279 | (void __user *)(fe->address & PAGE_MASK), | ||
2281 | .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, | 2280 | .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, |
2282 | }; | 2281 | }; |
2283 | int ret; | 2282 | int ret; |
2284 | 2283 | ||
2285 | pte_unmap_unlock(page_table, ptl); | 2284 | pte_unmap_unlock(fe->pte, fe->ptl); |
2286 | ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); | 2285 | ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); |
2287 | if (ret & VM_FAULT_ERROR) | 2286 | if (ret & VM_FAULT_ERROR) |
2288 | return ret; | 2287 | return ret; |
2289 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2288 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2289 | &fe->ptl); | ||
2290 | /* | 2290 | /* |
2291 | * We might have raced with another page fault while we | 2291 | * We might have raced with another page fault while we |
2292 | * released the pte_offset_map_lock. | 2292 | * released the pte_offset_map_lock. |
2293 | */ | 2293 | */ |
2294 | if (!pte_same(*page_table, orig_pte)) { | 2294 | if (!pte_same(*fe->pte, orig_pte)) { |
2295 | pte_unmap_unlock(page_table, ptl); | 2295 | pte_unmap_unlock(fe->pte, fe->ptl); |
2296 | return 0; | 2296 | return 0; |
2297 | } | 2297 | } |
2298 | } | 2298 | } |
2299 | return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, | 2299 | return wp_page_reuse(fe, orig_pte, NULL, 0, 0); |
2300 | NULL, 0, 0); | ||
2301 | } | 2300 | } |
2302 | 2301 | ||
2303 | static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | 2302 | static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, |
2304 | unsigned long address, pte_t *page_table, | 2303 | struct page *old_page) |
2305 | pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, | 2304 | __releases(fe->ptl) |
2306 | struct page *old_page) | ||
2307 | __releases(ptl) | ||
2308 | { | 2305 | { |
2306 | struct vm_area_struct *vma = fe->vma; | ||
2309 | int page_mkwrite = 0; | 2307 | int page_mkwrite = 0; |
2310 | 2308 | ||
2311 | get_page(old_page); | 2309 | get_page(old_page); |
@@ -2313,8 +2311,8 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2313 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 2311 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
2314 | int tmp; | 2312 | int tmp; |
2315 | 2313 | ||
2316 | pte_unmap_unlock(page_table, ptl); | 2314 | pte_unmap_unlock(fe->pte, fe->ptl); |
2317 | tmp = do_page_mkwrite(vma, old_page, address); | 2315 | tmp = do_page_mkwrite(vma, old_page, fe->address); |
2318 | if (unlikely(!tmp || (tmp & | 2316 | if (unlikely(!tmp || (tmp & |
2319 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 2317 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
2320 | put_page(old_page); | 2318 | put_page(old_page); |
@@ -2326,19 +2324,18 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2326 | * they did, we just return, as we can count on the | 2324 | * they did, we just return, as we can count on the |
2327 | * MMU to tell us if they didn't also make it writable. | 2325 | * MMU to tell us if they didn't also make it writable. |
2328 | */ | 2326 | */ |
2329 | page_table = pte_offset_map_lock(mm, pmd, address, | 2327 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2330 | &ptl); | 2328 | &fe->ptl); |
2331 | if (!pte_same(*page_table, orig_pte)) { | 2329 | if (!pte_same(*fe->pte, orig_pte)) { |
2332 | unlock_page(old_page); | 2330 | unlock_page(old_page); |
2333 | pte_unmap_unlock(page_table, ptl); | 2331 | pte_unmap_unlock(fe->pte, fe->ptl); |
2334 | put_page(old_page); | 2332 | put_page(old_page); |
2335 | return 0; | 2333 | return 0; |
2336 | } | 2334 | } |
2337 | page_mkwrite = 1; | 2335 | page_mkwrite = 1; |
2338 | } | 2336 | } |
2339 | 2337 | ||
2340 | return wp_page_reuse(mm, vma, address, page_table, ptl, | 2338 | return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); |
2341 | orig_pte, old_page, page_mkwrite, 1); | ||
2342 | } | 2339 | } |
2343 | 2340 | ||
2344 | /* | 2341 | /* |
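The wp_page_reuse() helper these sites now reach shrinks in the same mechanical way. Only the first line of its old prototype appears (in the hunk header near the top of this excerpt); the comparison below is inferred from the call sites, with guessed parameter names, and is not the exact declaration:

/* before (inferred beyond the first line):
 *   static inline int wp_page_reuse(struct mm_struct *mm, struct vm_area_struct *vma,
 *                     unsigned long address, pte_t *page_table, spinlock_t *ptl,
 *                     pte_t orig_pte, struct page *page, int page_mkwrite,
 *                     int dirty_shared);
 * after (matching wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1) above):
 *   static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
 *                     struct page *page, int page_mkwrite, int dirty_shared);
 */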
@@ -2359,14 +2356,13 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2359 | * but allow concurrent faults), with pte both mapped and locked. | 2356 | * but allow concurrent faults), with pte both mapped and locked. |
2360 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2357 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2361 | */ | 2358 | */ |
2362 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2359 | static int do_wp_page(struct fault_env *fe, pte_t orig_pte) |
2363 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2360 | __releases(fe->ptl) |
2364 | spinlock_t *ptl, pte_t orig_pte) | ||
2365 | __releases(ptl) | ||
2366 | { | 2361 | { |
2362 | struct vm_area_struct *vma = fe->vma; | ||
2367 | struct page *old_page; | 2363 | struct page *old_page; |
2368 | 2364 | ||
2369 | old_page = vm_normal_page(vma, address, orig_pte); | 2365 | old_page = vm_normal_page(vma, fe->address, orig_pte); |
2370 | if (!old_page) { | 2366 | if (!old_page) { |
2371 | /* | 2367 | /* |
2372 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a | 2368 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a |
@@ -2377,12 +2373,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2377 | */ | 2373 | */ |
2378 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2374 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2379 | (VM_WRITE|VM_SHARED)) | 2375 | (VM_WRITE|VM_SHARED)) |
2380 | return wp_pfn_shared(mm, vma, address, page_table, ptl, | 2376 | return wp_pfn_shared(fe, orig_pte); |
2381 | orig_pte, pmd); | ||
2382 | 2377 | ||
2383 | pte_unmap_unlock(page_table, ptl); | 2378 | pte_unmap_unlock(fe->pte, fe->ptl); |
2384 | return wp_page_copy(mm, vma, address, page_table, pmd, | 2379 | return wp_page_copy(fe, orig_pte, old_page); |
2385 | orig_pte, old_page); | ||
2386 | } | 2380 | } |
2387 | 2381 | ||
2388 | /* | 2382 | /* |
@@ -2393,13 +2387,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2393 | int total_mapcount; | 2387 | int total_mapcount; |
2394 | if (!trylock_page(old_page)) { | 2388 | if (!trylock_page(old_page)) { |
2395 | get_page(old_page); | 2389 | get_page(old_page); |
2396 | pte_unmap_unlock(page_table, ptl); | 2390 | pte_unmap_unlock(fe->pte, fe->ptl); |
2397 | lock_page(old_page); | 2391 | lock_page(old_page); |
2398 | page_table = pte_offset_map_lock(mm, pmd, address, | 2392 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, |
2399 | &ptl); | 2393 | fe->address, &fe->ptl); |
2400 | if (!pte_same(*page_table, orig_pte)) { | 2394 | if (!pte_same(*fe->pte, orig_pte)) { |
2401 | unlock_page(old_page); | 2395 | unlock_page(old_page); |
2402 | pte_unmap_unlock(page_table, ptl); | 2396 | pte_unmap_unlock(fe->pte, fe->ptl); |
2403 | put_page(old_page); | 2397 | put_page(old_page); |
2404 | return 0; | 2398 | return 0; |
2405 | } | 2399 | } |
@@ -2417,14 +2411,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2417 | page_move_anon_rmap(old_page, vma); | 2411 | page_move_anon_rmap(old_page, vma); |
2418 | } | 2412 | } |
2419 | unlock_page(old_page); | 2413 | unlock_page(old_page); |
2420 | return wp_page_reuse(mm, vma, address, page_table, ptl, | 2414 | return wp_page_reuse(fe, orig_pte, old_page, 0, 0); |
2421 | orig_pte, old_page, 0, 0); | ||
2422 | } | 2415 | } |
2423 | unlock_page(old_page); | 2416 | unlock_page(old_page); |
2424 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2417 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2425 | (VM_WRITE|VM_SHARED))) { | 2418 | (VM_WRITE|VM_SHARED))) { |
2426 | return wp_page_shared(mm, vma, address, page_table, pmd, | 2419 | return wp_page_shared(fe, orig_pte, old_page); |
2427 | ptl, orig_pte, old_page); | ||
2428 | } | 2420 | } |
2429 | 2421 | ||
2430 | /* | 2422 | /* |
@@ -2432,9 +2424,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2432 | */ | 2424 | */ |
2433 | get_page(old_page); | 2425 | get_page(old_page); |
2434 | 2426 | ||
2435 | pte_unmap_unlock(page_table, ptl); | 2427 | pte_unmap_unlock(fe->pte, fe->ptl); |
2436 | return wp_page_copy(mm, vma, address, page_table, pmd, | 2428 | return wp_page_copy(fe, orig_pte, old_page); |
2437 | orig_pte, old_page); | ||
2438 | } | 2429 | } |
2439 | 2430 | ||
2440 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, | 2431 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, |
@@ -2522,11 +2513,9 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
2522 | * We return with the mmap_sem locked or unlocked in the same cases | 2513 | * We return with the mmap_sem locked or unlocked in the same cases |
2523 | * as does filemap_fault(). | 2514 | * as does filemap_fault(). |
2524 | */ | 2515 | */ |
2525 | int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2516 | int do_swap_page(struct fault_env *fe, pte_t orig_pte) |
2526 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
2527 | unsigned int flags, pte_t orig_pte) | ||
2528 | { | 2517 | { |
2529 | spinlock_t *ptl; | 2518 | struct vm_area_struct *vma = fe->vma; |
2530 | struct page *page, *swapcache; | 2519 | struct page *page, *swapcache; |
2531 | struct mem_cgroup *memcg; | 2520 | struct mem_cgroup *memcg; |
2532 | swp_entry_t entry; | 2521 | swp_entry_t entry; |
@@ -2535,17 +2524,17 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2535 | int exclusive = 0; | 2524 | int exclusive = 0; |
2536 | int ret = 0; | 2525 | int ret = 0; |
2537 | 2526 | ||
2538 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2527 | if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) |
2539 | goto out; | 2528 | goto out; |
2540 | 2529 | ||
2541 | entry = pte_to_swp_entry(orig_pte); | 2530 | entry = pte_to_swp_entry(orig_pte); |
2542 | if (unlikely(non_swap_entry(entry))) { | 2531 | if (unlikely(non_swap_entry(entry))) { |
2543 | if (is_migration_entry(entry)) { | 2532 | if (is_migration_entry(entry)) { |
2544 | migration_entry_wait(mm, pmd, address); | 2533 | migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); |
2545 | } else if (is_hwpoison_entry(entry)) { | 2534 | } else if (is_hwpoison_entry(entry)) { |
2546 | ret = VM_FAULT_HWPOISON; | 2535 | ret = VM_FAULT_HWPOISON; |
2547 | } else { | 2536 | } else { |
2548 | print_bad_pte(vma, address, orig_pte, NULL); | 2537 | print_bad_pte(vma, fe->address, orig_pte, NULL); |
2549 | ret = VM_FAULT_SIGBUS; | 2538 | ret = VM_FAULT_SIGBUS; |
2550 | } | 2539 | } |
2551 | goto out; | 2540 | goto out; |
@@ -2554,14 +2543,15 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2554 | page = lookup_swap_cache(entry); | 2543 | page = lookup_swap_cache(entry); |
2555 | if (!page) { | 2544 | if (!page) { |
2556 | page = swapin_readahead(entry, | 2545 | page = swapin_readahead(entry, |
2557 | GFP_HIGHUSER_MOVABLE, vma, address); | 2546 | GFP_HIGHUSER_MOVABLE, vma, fe->address); |
2558 | if (!page) { | 2547 | if (!page) { |
2559 | /* | 2548 | /* |
2560 | * Back out if somebody else faulted in this pte | 2549 | * Back out if somebody else faulted in this pte |
2561 | * while we released the pte lock. | 2550 | * while we released the pte lock. |
2562 | */ | 2551 | */ |
2563 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2552 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, |
2564 | if (likely(pte_same(*page_table, orig_pte))) | 2553 | fe->address, &fe->ptl); |
2554 | if (likely(pte_same(*fe->pte, orig_pte))) | ||
2565 | ret = VM_FAULT_OOM; | 2555 | ret = VM_FAULT_OOM; |
2566 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2556 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2567 | goto unlock; | 2557 | goto unlock; |
@@ -2570,7 +2560,7 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2570 | /* Had to read the page from swap area: Major fault */ | 2560 | /* Had to read the page from swap area: Major fault */ |
2571 | ret = VM_FAULT_MAJOR; | 2561 | ret = VM_FAULT_MAJOR; |
2572 | count_vm_event(PGMAJFAULT); | 2562 | count_vm_event(PGMAJFAULT); |
2573 | mem_cgroup_count_vm_event(mm, PGMAJFAULT); | 2563 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
2574 | } else if (PageHWPoison(page)) { | 2564 | } else if (PageHWPoison(page)) { |
2575 | /* | 2565 | /* |
2576 | * hwpoisoned dirty swapcache pages are kept for killing | 2566 | * hwpoisoned dirty swapcache pages are kept for killing |
@@ -2583,7 +2573,7 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2583 | } | 2573 | } |
2584 | 2574 | ||
2585 | swapcache = page; | 2575 | swapcache = page; |
2586 | locked = lock_page_or_retry(page, mm, flags); | 2576 | locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); |
2587 | 2577 | ||
2588 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2578 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2589 | if (!locked) { | 2579 | if (!locked) { |
@@ -2600,14 +2590,15 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2600 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) | 2590 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) |
2601 | goto out_page; | 2591 | goto out_page; |
2602 | 2592 | ||
2603 | page = ksm_might_need_to_copy(page, vma, address); | 2593 | page = ksm_might_need_to_copy(page, vma, fe->address); |
2604 | if (unlikely(!page)) { | 2594 | if (unlikely(!page)) { |
2605 | ret = VM_FAULT_OOM; | 2595 | ret = VM_FAULT_OOM; |
2606 | page = swapcache; | 2596 | page = swapcache; |
2607 | goto out_page; | 2597 | goto out_page; |
2608 | } | 2598 | } |
2609 | 2599 | ||
2610 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) { | 2600 | if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, |
2601 | &memcg, false)) { | ||
2611 | ret = VM_FAULT_OOM; | 2602 | ret = VM_FAULT_OOM; |
2612 | goto out_page; | 2603 | goto out_page; |
2613 | } | 2604 | } |
@@ -2615,8 +2606,9 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2615 | /* | 2606 | /* |
2616 | * Back out if somebody else already faulted in this pte. | 2607 | * Back out if somebody else already faulted in this pte. |
2617 | */ | 2608 | */ |
2618 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2609 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2619 | if (unlikely(!pte_same(*page_table, orig_pte))) | 2610 | &fe->ptl); |
2611 | if (unlikely(!pte_same(*fe->pte, orig_pte))) | ||
2620 | goto out_nomap; | 2612 | goto out_nomap; |
2621 | 2613 | ||
2622 | if (unlikely(!PageUptodate(page))) { | 2614 | if (unlikely(!PageUptodate(page))) { |
@@ -2634,24 +2626,24 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2634 | * must be called after the swap_free(), or it will never succeed. | 2626 | * must be called after the swap_free(), or it will never succeed. |
2635 | */ | 2627 | */ |
2636 | 2628 | ||
2637 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2629 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2638 | dec_mm_counter_fast(mm, MM_SWAPENTS); | 2630 | dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); |
2639 | pte = mk_pte(page, vma->vm_page_prot); | 2631 | pte = mk_pte(page, vma->vm_page_prot); |
2640 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { | 2632 | if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { |
2641 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2633 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
2642 | flags &= ~FAULT_FLAG_WRITE; | 2634 | fe->flags &= ~FAULT_FLAG_WRITE; |
2643 | ret |= VM_FAULT_WRITE; | 2635 | ret |= VM_FAULT_WRITE; |
2644 | exclusive = RMAP_EXCLUSIVE; | 2636 | exclusive = RMAP_EXCLUSIVE; |
2645 | } | 2637 | } |
2646 | flush_icache_page(vma, page); | 2638 | flush_icache_page(vma, page); |
2647 | if (pte_swp_soft_dirty(orig_pte)) | 2639 | if (pte_swp_soft_dirty(orig_pte)) |
2648 | pte = pte_mksoft_dirty(pte); | 2640 | pte = pte_mksoft_dirty(pte); |
2649 | set_pte_at(mm, address, page_table, pte); | 2641 | set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); |
2650 | if (page == swapcache) { | 2642 | if (page == swapcache) { |
2651 | do_page_add_anon_rmap(page, vma, address, exclusive); | 2643 | do_page_add_anon_rmap(page, vma, fe->address, exclusive); |
2652 | mem_cgroup_commit_charge(page, memcg, true, false); | 2644 | mem_cgroup_commit_charge(page, memcg, true, false); |
2653 | } else { /* ksm created a completely new copy */ | 2645 | } else { /* ksm created a completely new copy */ |
2654 | page_add_new_anon_rmap(page, vma, address, false); | 2646 | page_add_new_anon_rmap(page, vma, fe->address, false); |
2655 | mem_cgroup_commit_charge(page, memcg, false, false); | 2647 | mem_cgroup_commit_charge(page, memcg, false, false); |
2656 | lru_cache_add_active_or_unevictable(page, vma); | 2648 | lru_cache_add_active_or_unevictable(page, vma); |
2657 | } | 2649 | } |
@@ -2674,22 +2666,22 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2674 | put_page(swapcache); | 2666 | put_page(swapcache); |
2675 | } | 2667 | } |
2676 | 2668 | ||
2677 | if (flags & FAULT_FLAG_WRITE) { | 2669 | if (fe->flags & FAULT_FLAG_WRITE) { |
2678 | ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); | 2670 | ret |= do_wp_page(fe, pte); |
2679 | if (ret & VM_FAULT_ERROR) | 2671 | if (ret & VM_FAULT_ERROR) |
2680 | ret &= VM_FAULT_ERROR; | 2672 | ret &= VM_FAULT_ERROR; |
2681 | goto out; | 2673 | goto out; |
2682 | } | 2674 | } |
2683 | 2675 | ||
2684 | /* No need to invalidate - it was non-present before */ | 2676 | /* No need to invalidate - it was non-present before */ |
2685 | update_mmu_cache(vma, address, page_table); | 2677 | update_mmu_cache(vma, fe->address, fe->pte); |
2686 | unlock: | 2678 | unlock: |
2687 | pte_unmap_unlock(page_table, ptl); | 2679 | pte_unmap_unlock(fe->pte, fe->ptl); |
2688 | out: | 2680 | out: |
2689 | return ret; | 2681 | return ret; |
2690 | out_nomap: | 2682 | out_nomap: |
2691 | mem_cgroup_cancel_charge(page, memcg, false); | 2683 | mem_cgroup_cancel_charge(page, memcg, false); |
2692 | pte_unmap_unlock(page_table, ptl); | 2684 | pte_unmap_unlock(fe->pte, fe->ptl); |
2693 | out_page: | 2685 | out_page: |
2694 | unlock_page(page); | 2686 | unlock_page(page); |
2695 | out_release: | 2687 | out_release: |
@@ -2740,37 +2732,36 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo | |||
2740 | * but allow concurrent faults), and pte mapped but not yet locked. | 2732 | * but allow concurrent faults), and pte mapped but not yet locked. |
2741 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2733 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2742 | */ | 2734 | */ |
2743 | static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2735 | static int do_anonymous_page(struct fault_env *fe) |
2744 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
2745 | unsigned int flags) | ||
2746 | { | 2736 | { |
2737 | struct vm_area_struct *vma = fe->vma; | ||
2747 | struct mem_cgroup *memcg; | 2738 | struct mem_cgroup *memcg; |
2748 | struct page *page; | 2739 | struct page *page; |
2749 | spinlock_t *ptl; | ||
2750 | pte_t entry; | 2740 | pte_t entry; |
2751 | 2741 | ||
2752 | pte_unmap(page_table); | 2742 | pte_unmap(fe->pte); |
2753 | 2743 | ||
2754 | /* File mapping without ->vm_ops ? */ | 2744 | /* File mapping without ->vm_ops ? */ |
2755 | if (vma->vm_flags & VM_SHARED) | 2745 | if (vma->vm_flags & VM_SHARED) |
2756 | return VM_FAULT_SIGBUS; | 2746 | return VM_FAULT_SIGBUS; |
2757 | 2747 | ||
2758 | /* Check if we need to add a guard page to the stack */ | 2748 | /* Check if we need to add a guard page to the stack */ |
2759 | if (check_stack_guard_page(vma, address) < 0) | 2749 | if (check_stack_guard_page(vma, fe->address) < 0) |
2760 | return VM_FAULT_SIGSEGV; | 2750 | return VM_FAULT_SIGSEGV; |
2761 | 2751 | ||
2762 | /* Use the zero-page for reads */ | 2752 | /* Use the zero-page for reads */ |
2763 | if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) { | 2753 | if (!(fe->flags & FAULT_FLAG_WRITE) && |
2764 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), | 2754 | !mm_forbids_zeropage(vma->vm_mm)) { |
2755 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), | ||
2765 | vma->vm_page_prot)); | 2756 | vma->vm_page_prot)); |
2766 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2757 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2767 | if (!pte_none(*page_table)) | 2758 | &fe->ptl); |
2759 | if (!pte_none(*fe->pte)) | ||
2768 | goto unlock; | 2760 | goto unlock; |
2769 | /* Deliver the page fault to userland, check inside PT lock */ | 2761 | /* Deliver the page fault to userland, check inside PT lock */ |
2770 | if (userfaultfd_missing(vma)) { | 2762 | if (userfaultfd_missing(vma)) { |
2771 | pte_unmap_unlock(page_table, ptl); | 2763 | pte_unmap_unlock(fe->pte, fe->ptl); |
2772 | return handle_userfault(vma, address, flags, | 2764 | return handle_userfault(fe, VM_UFFD_MISSING); |
2773 | VM_UFFD_MISSING); | ||
2774 | } | 2765 | } |
2775 | goto setpte; | 2766 | goto setpte; |
2776 | } | 2767 | } |
@@ -2778,11 +2769,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2778 | /* Allocate our own private page. */ | 2769 | /* Allocate our own private page. */ |
2779 | if (unlikely(anon_vma_prepare(vma))) | 2770 | if (unlikely(anon_vma_prepare(vma))) |
2780 | goto oom; | 2771 | goto oom; |
2781 | page = alloc_zeroed_user_highpage_movable(vma, address); | 2772 | page = alloc_zeroed_user_highpage_movable(vma, fe->address); |
2782 | if (!page) | 2773 | if (!page) |
2783 | goto oom; | 2774 | goto oom; |
2784 | 2775 | ||
2785 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) | 2776 | if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) |
2786 | goto oom_free_page; | 2777 | goto oom_free_page; |
2787 | 2778 | ||
2788 | /* | 2779 | /* |
@@ -2796,30 +2787,30 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2796 | if (vma->vm_flags & VM_WRITE) | 2787 | if (vma->vm_flags & VM_WRITE) |
2797 | entry = pte_mkwrite(pte_mkdirty(entry)); | 2788 | entry = pte_mkwrite(pte_mkdirty(entry)); |
2798 | 2789 | ||
2799 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2790 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2800 | if (!pte_none(*page_table)) | 2791 | &fe->ptl); |
2792 | if (!pte_none(*fe->pte)) | ||
2801 | goto release; | 2793 | goto release; |
2802 | 2794 | ||
2803 | /* Deliver the page fault to userland, check inside PT lock */ | 2795 | /* Deliver the page fault to userland, check inside PT lock */ |
2804 | if (userfaultfd_missing(vma)) { | 2796 | if (userfaultfd_missing(vma)) { |
2805 | pte_unmap_unlock(page_table, ptl); | 2797 | pte_unmap_unlock(fe->pte, fe->ptl); |
2806 | mem_cgroup_cancel_charge(page, memcg, false); | 2798 | mem_cgroup_cancel_charge(page, memcg, false); |
2807 | put_page(page); | 2799 | put_page(page); |
2808 | return handle_userfault(vma, address, flags, | 2800 | return handle_userfault(fe, VM_UFFD_MISSING); |
2809 | VM_UFFD_MISSING); | ||
2810 | } | 2801 | } |
2811 | 2802 | ||
2812 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2803 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2813 | page_add_new_anon_rmap(page, vma, address, false); | 2804 | page_add_new_anon_rmap(page, vma, fe->address, false); |
2814 | mem_cgroup_commit_charge(page, memcg, false, false); | 2805 | mem_cgroup_commit_charge(page, memcg, false, false); |
2815 | lru_cache_add_active_or_unevictable(page, vma); | 2806 | lru_cache_add_active_or_unevictable(page, vma); |
2816 | setpte: | 2807 | setpte: |
2817 | set_pte_at(mm, address, page_table, entry); | 2808 | set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); |
2818 | 2809 | ||
2819 | /* No need to invalidate - it was non-present before */ | 2810 | /* No need to invalidate - it was non-present before */ |
2820 | update_mmu_cache(vma, address, page_table); | 2811 | update_mmu_cache(vma, fe->address, fe->pte); |
2821 | unlock: | 2812 | unlock: |
2822 | pte_unmap_unlock(page_table, ptl); | 2813 | pte_unmap_unlock(fe->pte, fe->ptl); |
2823 | return 0; | 2814 | return 0; |
2824 | release: | 2815 | release: |
2825 | mem_cgroup_cancel_charge(page, memcg, false); | 2816 | mem_cgroup_cancel_charge(page, memcg, false); |
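handle_userfault() is converted in the fs/userfaultfd.c and include/linux/userfaultfd_k.h parts of the patch, outside this excerpt. The before/after call sites in do_anonymous_page() above imply roughly the following prototype change (an inference, not the declaration itself):

/* before: int handle_userfault(struct vm_area_struct *vma, unsigned long address,
 *                              unsigned int flags, unsigned long reason);
 * after:  int handle_userfault(struct fault_env *fe, unsigned long reason);
 */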
@@ -2836,17 +2827,16 @@ oom: | |||
2836 | * released depending on flags and vma->vm_ops->fault() return value. | 2827 | * released depending on flags and vma->vm_ops->fault() return value. |
2837 | * See filemap_fault() and __lock_page_retry(). | 2828 | * See filemap_fault() and __lock_page_retry(). |
2838 | */ | 2829 | */ |
2839 | static int __do_fault(struct vm_area_struct *vma, unsigned long address, | 2830 | static int __do_fault(struct fault_env *fe, pgoff_t pgoff, |
2840 | pgoff_t pgoff, unsigned int flags, | 2831 | struct page *cow_page, struct page **page, void **entry) |
2841 | struct page *cow_page, struct page **page, | ||
2842 | void **entry) | ||
2843 | { | 2832 | { |
2833 | struct vm_area_struct *vma = fe->vma; | ||
2844 | struct vm_fault vmf; | 2834 | struct vm_fault vmf; |
2845 | int ret; | 2835 | int ret; |
2846 | 2836 | ||
2847 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); | 2837 | vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); |
2848 | vmf.pgoff = pgoff; | 2838 | vmf.pgoff = pgoff; |
2849 | vmf.flags = flags; | 2839 | vmf.flags = fe->flags; |
2850 | vmf.page = NULL; | 2840 | vmf.page = NULL; |
2851 | vmf.gfp_mask = __get_fault_gfp_mask(vma); | 2841 | vmf.gfp_mask = __get_fault_gfp_mask(vma); |
2852 | vmf.cow_page = cow_page; | 2842 | vmf.cow_page = cow_page; |
@@ -2878,38 +2868,36 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, | |||
2878 | /** | 2868 | /** |
2879 | * do_set_pte - setup new PTE entry for given page and add reverse page mapping. | 2869 | * do_set_pte - setup new PTE entry for given page and add reverse page mapping. |
2880 | * | 2870 | * |
2881 | * @vma: virtual memory area | 2871 | * @fe: fault environment |
2882 | * @address: user virtual address | ||
2883 | * @page: page to map | 2872 | * @page: page to map |
2884 | * @pte: pointer to target page table entry | ||
2885 | * @write: true, if new entry is writable | ||
2886 | * @anon: true, if it's anonymous page | ||
2887 | * | 2873 | * |
2888 | * Caller must hold page table lock relevant for @pte. | 2874 | * Caller must hold page table lock relevant for @fe->pte. |
2889 | * | 2875 | * |
2890 | * Target users are page handler itself and implementations of | 2876 | * Target users are page handler itself and implementations of |
2891 | * vm_ops->map_pages. | 2877 | * vm_ops->map_pages. |
2892 | */ | 2878 | */ |
2893 | void do_set_pte(struct vm_area_struct *vma, unsigned long address, | 2879 | void do_set_pte(struct fault_env *fe, struct page *page) |
2894 | struct page *page, pte_t *pte, bool write, bool anon) | ||
2895 | { | 2880 | { |
2881 | struct vm_area_struct *vma = fe->vma; | ||
2882 | bool write = fe->flags & FAULT_FLAG_WRITE; | ||
2896 | pte_t entry; | 2883 | pte_t entry; |
2897 | 2884 | ||
2898 | flush_icache_page(vma, page); | 2885 | flush_icache_page(vma, page); |
2899 | entry = mk_pte(page, vma->vm_page_prot); | 2886 | entry = mk_pte(page, vma->vm_page_prot); |
2900 | if (write) | 2887 | if (write) |
2901 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2888 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2902 | if (anon) { | 2889 | /* copy-on-write page */ |
2890 | if (write && !(vma->vm_flags & VM_SHARED)) { | ||
2903 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2891 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2904 | page_add_new_anon_rmap(page, vma, address, false); | 2892 | page_add_new_anon_rmap(page, vma, fe->address, false); |
2905 | } else { | 2893 | } else { |
2906 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); | 2894 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); |
2907 | page_add_file_rmap(page); | 2895 | page_add_file_rmap(page); |
2908 | } | 2896 | } |
2909 | set_pte_at(vma->vm_mm, address, pte, entry); | 2897 | set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); |
2910 | 2898 | ||
2911 | /* no need to invalidate: a not-present page won't be cached */ | 2899 | /* no need to invalidate: a not-present page won't be cached */ |
2912 | update_mmu_cache(vma, address, pte); | 2900 | update_mmu_cache(vma, fe->address, fe->pte); |
2913 | } | 2901 | } |
2914 | 2902 | ||
2915 | static unsigned long fault_around_bytes __read_mostly = | 2903 | static unsigned long fault_around_bytes __read_mostly = |
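do_set_pte() drops its 'write' and 'anon' arguments and derives them from the fault itself; checking the derivation against the three converted call sites further down shows it reproduces the old values exactly:

/* write = fe->flags & FAULT_FLAG_WRITE, anon = write && !(vma->vm_flags & VM_SHARED)
 *
 *   do_read_fault()   -- old (write=false, anon=false): only reached without
 *                        FAULT_FLAG_WRITE, so write is false -> file rmap as before.
 *   do_cow_fault()    -- old (write=true, anon=true): write fault on a private
 *                        mapping, so write && !VM_SHARED -> anon rmap as before.
 *   do_shared_fault() -- old (write=true, anon=false): write fault on a shared
 *                        mapping, VM_SHARED keeps anon false -> file rmap as before.
 */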
@@ -2976,57 +2964,53 @@ late_initcall(fault_around_debugfs); | |||
2976 | * fault_around_pages() value (and therefore to page order). This way it's | 2964 | * fault_around_pages() value (and therefore to page order). This way it's |
2977 | * easier to guarantee that we don't cross page table boundaries. | 2965 | * easier to guarantee that we don't cross page table boundaries. |
2978 | */ | 2966 | */ |
2979 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | 2967 | static void do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) |
2980 | pte_t *pte, pgoff_t pgoff, unsigned int flags) | ||
2981 | { | 2968 | { |
2982 | unsigned long start_addr, nr_pages, mask; | 2969 | unsigned long address = fe->address, start_addr, nr_pages, mask; |
2983 | pgoff_t max_pgoff; | 2970 | pte_t *pte = fe->pte; |
2984 | struct vm_fault vmf; | 2971 | pgoff_t end_pgoff; |
2985 | int off; | 2972 | int off; |
2986 | 2973 | ||
2987 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; | 2974 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; |
2988 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; | 2975 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; |
2989 | 2976 | ||
2990 | start_addr = max(address & mask, vma->vm_start); | 2977 | start_addr = max(fe->address & mask, fe->vma->vm_start); |
2991 | off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); | 2978 | off = ((fe->address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); |
2992 | pte -= off; | 2979 | fe->pte -= off; |
2993 | pgoff -= off; | 2980 | start_pgoff -= off; |
2994 | 2981 | ||
2995 | /* | 2982 | /* |
2996 | * max_pgoff is either end of page table or end of vma | 2983 | * end_pgoff is either end of page table or end of vma |
2997 | * or fault_around_pages() from pgoff, depending what is nearest. | 2984 | * or fault_around_pages() from start_pgoff, depending what is nearest. |
2998 | */ | 2985 | */ |
2999 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | 2986 | end_pgoff = start_pgoff - |
2987 | ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | ||
3000 | PTRS_PER_PTE - 1; | 2988 | PTRS_PER_PTE - 1; |
3001 | max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, | 2989 | end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, |
3002 | pgoff + nr_pages - 1); | 2990 | start_pgoff + nr_pages - 1); |
3003 | 2991 | ||
3004 | /* Check if it makes any sense to call ->map_pages */ | 2992 | /* Check if it makes any sense to call ->map_pages */ |
3005 | while (!pte_none(*pte)) { | 2993 | fe->address = start_addr; |
3006 | if (++pgoff > max_pgoff) | 2994 | while (!pte_none(*fe->pte)) { |
3007 | return; | 2995 | if (++start_pgoff > end_pgoff) |
3008 | start_addr += PAGE_SIZE; | 2996 | goto out; |
3009 | if (start_addr >= vma->vm_end) | 2997 | fe->address += PAGE_SIZE; |
3010 | return; | 2998 | if (fe->address >= fe->vma->vm_end) |
3011 | pte++; | 2999 | goto out; |
3000 | fe->pte++; | ||
3012 | } | 3001 | } |
3013 | 3002 | ||
3014 | vmf.virtual_address = (void __user *) start_addr; | 3003 | fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); |
3015 | vmf.pte = pte; | 3004 | out: |
3016 | vmf.pgoff = pgoff; | 3005 | /* restore fault_env */ |
3017 | vmf.max_pgoff = max_pgoff; | 3006 | fe->pte = pte; |
3018 | vmf.flags = flags; | 3007 | fe->address = address; |
3019 | vmf.gfp_mask = __get_fault_gfp_mask(vma); | ||
3020 | vma->vm_ops->map_pages(vma, &vmf); | ||
3021 | } | 3008 | } |
3022 | 3009 | ||
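Instead of packing a one-off struct vm_fault, do_fault_around() now temporarily points fe->address and fe->pte at the start of the range, hands fe to ->map_pages() together with the pgoff range, and restores fe on the way out. The callback change lives in include/linux/mm.h and mm/filemap.c, outside this excerpt; the two call sites imply roughly:

/* before: void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
 * after:  void (*map_pages)(struct fault_env *fe,
 *                           pgoff_t start_pgoff, pgoff_t end_pgoff);
 */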
3023 | static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3010 | static int do_read_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte) |
3024 | unsigned long address, pmd_t *pmd, | ||
3025 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | ||
3026 | { | 3011 | { |
3012 | struct vm_area_struct *vma = fe->vma; | ||
3027 | struct page *fault_page; | 3013 | struct page *fault_page; |
3028 | spinlock_t *ptl; | ||
3029 | pte_t *pte; | ||
3030 | int ret = 0; | 3014 | int ret = 0; |
3031 | 3015 | ||
3032 | /* | 3016 | /* |
@@ -3035,66 +3019,68 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3035 | * something). | 3019 | * something). |
3036 | */ | 3020 | */ |
3037 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { | 3021 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { |
3038 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 3022 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
3039 | do_fault_around(vma, address, pte, pgoff, flags); | 3023 | &fe->ptl); |
3040 | if (!pte_same(*pte, orig_pte)) | 3024 | if (!pte_same(*fe->pte, orig_pte)) |
3025 | goto unlock_out; | ||
3026 | do_fault_around(fe, pgoff); | ||
3027 | /* Check if the fault is handled by faultaround */ | ||
3028 | if (!pte_same(*fe->pte, orig_pte)) | ||
3041 | goto unlock_out; | 3029 | goto unlock_out; |
3042 | pte_unmap_unlock(pte, ptl); | 3030 | pte_unmap_unlock(fe->pte, fe->ptl); |
3043 | } | 3031 | } |
3044 | 3032 | ||
3045 | ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); | 3033 | ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); |
3046 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3034 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3047 | return ret; | 3035 | return ret; |
3048 | 3036 | ||
3049 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 3037 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, &fe->ptl); |
3050 | if (unlikely(!pte_same(*pte, orig_pte))) { | 3038 | if (unlikely(!pte_same(*fe->pte, orig_pte))) { |
3051 | pte_unmap_unlock(pte, ptl); | 3039 | pte_unmap_unlock(fe->pte, fe->ptl); |
3052 | unlock_page(fault_page); | 3040 | unlock_page(fault_page); |
3053 | put_page(fault_page); | 3041 | put_page(fault_page); |
3054 | return ret; | 3042 | return ret; |
3055 | } | 3043 | } |
3056 | do_set_pte(vma, address, fault_page, pte, false, false); | 3044 | do_set_pte(fe, fault_page); |
3057 | unlock_page(fault_page); | 3045 | unlock_page(fault_page); |
3058 | unlock_out: | 3046 | unlock_out: |
3059 | pte_unmap_unlock(pte, ptl); | 3047 | pte_unmap_unlock(fe->pte, fe->ptl); |
3060 | return ret; | 3048 | return ret; |
3061 | } | 3049 | } |
3062 | 3050 | ||
3063 | static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3051 | static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte) |
3064 | unsigned long address, pmd_t *pmd, | ||
3065 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | ||
3066 | { | 3052 | { |
3053 | struct vm_area_struct *vma = fe->vma; | ||
3067 | struct page *fault_page, *new_page; | 3054 | struct page *fault_page, *new_page; |
3068 | void *fault_entry; | 3055 | void *fault_entry; |
3069 | struct mem_cgroup *memcg; | 3056 | struct mem_cgroup *memcg; |
3070 | spinlock_t *ptl; | ||
3071 | pte_t *pte; | ||
3072 | int ret; | 3057 | int ret; |
3073 | 3058 | ||
3074 | if (unlikely(anon_vma_prepare(vma))) | 3059 | if (unlikely(anon_vma_prepare(vma))) |
3075 | return VM_FAULT_OOM; | 3060 | return VM_FAULT_OOM; |
3076 | 3061 | ||
3077 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 3062 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); |
3078 | if (!new_page) | 3063 | if (!new_page) |
3079 | return VM_FAULT_OOM; | 3064 | return VM_FAULT_OOM; |
3080 | 3065 | ||
3081 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { | 3066 | if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, |
3067 | &memcg, false)) { | ||
3082 | put_page(new_page); | 3068 | put_page(new_page); |
3083 | return VM_FAULT_OOM; | 3069 | return VM_FAULT_OOM; |
3084 | } | 3070 | } |
3085 | 3071 | ||
3086 | ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page, | 3072 | ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); |
3087 | &fault_entry); | ||
3088 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3073 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3089 | goto uncharge_out; | 3074 | goto uncharge_out; |
3090 | 3075 | ||
3091 | if (!(ret & VM_FAULT_DAX_LOCKED)) | 3076 | if (!(ret & VM_FAULT_DAX_LOCKED)) |
3092 | copy_user_highpage(new_page, fault_page, address, vma); | 3077 | copy_user_highpage(new_page, fault_page, fe->address, vma); |
3093 | __SetPageUptodate(new_page); | 3078 | __SetPageUptodate(new_page); |
3094 | 3079 | ||
3095 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 3080 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
3096 | if (unlikely(!pte_same(*pte, orig_pte))) { | 3081 | &fe->ptl); |
3097 | pte_unmap_unlock(pte, ptl); | 3082 | if (unlikely(!pte_same(*fe->pte, orig_pte))) { |
3083 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
3098 | if (!(ret & VM_FAULT_DAX_LOCKED)) { | 3084 | if (!(ret & VM_FAULT_DAX_LOCKED)) { |
3099 | unlock_page(fault_page); | 3085 | unlock_page(fault_page); |
3100 | put_page(fault_page); | 3086 | put_page(fault_page); |
@@ -3104,10 +3090,10 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3104 | } | 3090 | } |
3105 | goto uncharge_out; | 3091 | goto uncharge_out; |
3106 | } | 3092 | } |
3107 | do_set_pte(vma, address, new_page, pte, true, true); | 3093 | do_set_pte(fe, new_page); |
3108 | mem_cgroup_commit_charge(new_page, memcg, false, false); | 3094 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
3109 | lru_cache_add_active_or_unevictable(new_page, vma); | 3095 | lru_cache_add_active_or_unevictable(new_page, vma); |
3110 | pte_unmap_unlock(pte, ptl); | 3096 | pte_unmap_unlock(fe->pte, fe->ptl); |
3111 | if (!(ret & VM_FAULT_DAX_LOCKED)) { | 3097 | if (!(ret & VM_FAULT_DAX_LOCKED)) { |
3112 | unlock_page(fault_page); | 3098 | unlock_page(fault_page); |
3113 | put_page(fault_page); | 3099 | put_page(fault_page); |
@@ -3121,18 +3107,15 @@ uncharge_out: | |||
3121 | return ret; | 3107 | return ret; |
3122 | } | 3108 | } |
3123 | 3109 | ||
3124 | static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3110 | static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte) |
3125 | unsigned long address, pmd_t *pmd, | ||
3126 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | ||
3127 | { | 3111 | { |
3112 | struct vm_area_struct *vma = fe->vma; | ||
3128 | struct page *fault_page; | 3113 | struct page *fault_page; |
3129 | struct address_space *mapping; | 3114 | struct address_space *mapping; |
3130 | spinlock_t *ptl; | ||
3131 | pte_t *pte; | ||
3132 | int dirtied = 0; | 3115 | int dirtied = 0; |
3133 | int ret, tmp; | 3116 | int ret, tmp; |
3134 | 3117 | ||
3135 | ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); | 3118 | ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); |
3136 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3119 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3137 | return ret; | 3120 | return ret; |
3138 | 3121 | ||
@@ -3142,7 +3125,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3142 | */ | 3125 | */ |
3143 | if (vma->vm_ops->page_mkwrite) { | 3126 | if (vma->vm_ops->page_mkwrite) { |
3144 | unlock_page(fault_page); | 3127 | unlock_page(fault_page); |
3145 | tmp = do_page_mkwrite(vma, fault_page, address); | 3128 | tmp = do_page_mkwrite(vma, fault_page, fe->address); |
3146 | if (unlikely(!tmp || | 3129 | if (unlikely(!tmp || |
3147 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 3130 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
3148 | put_page(fault_page); | 3131 | put_page(fault_page); |
@@ -3150,15 +3133,16 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3150 | } | 3133 | } |
3151 | } | 3134 | } |
3152 | 3135 | ||
3153 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 3136 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
3154 | if (unlikely(!pte_same(*pte, orig_pte))) { | 3137 | &fe->ptl); |
3155 | pte_unmap_unlock(pte, ptl); | 3138 | if (unlikely(!pte_same(*fe->pte, orig_pte))) { |
3139 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
3156 | unlock_page(fault_page); | 3140 | unlock_page(fault_page); |
3157 | put_page(fault_page); | 3141 | put_page(fault_page); |
3158 | return ret; | 3142 | return ret; |
3159 | } | 3143 | } |
3160 | do_set_pte(vma, address, fault_page, pte, true, false); | 3144 | do_set_pte(fe, fault_page); |
3161 | pte_unmap_unlock(pte, ptl); | 3145 | pte_unmap_unlock(fe->pte, fe->ptl); |
3162 | 3146 | ||
3163 | if (set_page_dirty(fault_page)) | 3147 | if (set_page_dirty(fault_page)) |
3164 | dirtied = 1; | 3148 | dirtied = 1; |
@@ -3190,23 +3174,20 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3190 | * The mmap_sem may have been released depending on flags and our | 3174 | * The mmap_sem may have been released depending on flags and our |
3191 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3175 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3192 | */ | 3176 | */ |
3193 | static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3177 | static int do_fault(struct fault_env *fe, pte_t orig_pte) |
3194 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
3195 | unsigned int flags, pte_t orig_pte) | ||
3196 | { | 3178 | { |
3197 | pgoff_t pgoff = linear_page_index(vma, address); | 3179 | struct vm_area_struct *vma = fe->vma; |
3180 | pgoff_t pgoff = linear_page_index(vma, fe->address); | ||
3198 | 3181 | ||
3199 | pte_unmap(page_table); | 3182 | pte_unmap(fe->pte); |
3200 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ | 3183 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ |
3201 | if (!vma->vm_ops->fault) | 3184 | if (!vma->vm_ops->fault) |
3202 | return VM_FAULT_SIGBUS; | 3185 | return VM_FAULT_SIGBUS; |
3203 | if (!(flags & FAULT_FLAG_WRITE)) | 3186 | if (!(fe->flags & FAULT_FLAG_WRITE)) |
3204 | return do_read_fault(mm, vma, address, pmd, pgoff, flags, | 3187 | return do_read_fault(fe, pgoff, orig_pte); |
3205 | orig_pte); | ||
3206 | if (!(vma->vm_flags & VM_SHARED)) | 3188 | if (!(vma->vm_flags & VM_SHARED)) |
3207 | return do_cow_fault(mm, vma, address, pmd, pgoff, flags, | 3189 | return do_cow_fault(fe, pgoff, orig_pte); |
3208 | orig_pte); | 3190 | return do_shared_fault(fe, pgoff, orig_pte); |
3209 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | ||
3210 | } | 3191 | } |
3211 | 3192 | ||
3212 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | 3193 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |
@@ -3224,11 +3205,10 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | |||
3224 | return mpol_misplaced(page, vma, addr); | 3205 | return mpol_misplaced(page, vma, addr); |
3225 | } | 3206 | } |
3226 | 3207 | ||
3227 | static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 3208 | static int do_numa_page(struct fault_env *fe, pte_t pte) |
3228 | unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) | ||
3229 | { | 3209 | { |
3210 | struct vm_area_struct *vma = fe->vma; | ||
3230 | struct page *page = NULL; | 3211 | struct page *page = NULL; |
3231 | spinlock_t *ptl; | ||
3232 | int page_nid = -1; | 3212 | int page_nid = -1; |
3233 | int last_cpupid; | 3213 | int last_cpupid; |
3234 | int target_nid; | 3214 | int target_nid; |
@@ -3248,10 +3228,10 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3248 | * page table entry is not accessible, so there would be no | 3228 | * page table entry is not accessible, so there would be no |
3249 | * concurrent hardware modifications to the PTE. | 3229 | * concurrent hardware modifications to the PTE. |
3250 | */ | 3230 | */ |
3251 | ptl = pte_lockptr(mm, pmd); | 3231 | fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); |
3252 | spin_lock(ptl); | 3232 | spin_lock(fe->ptl); |
3253 | if (unlikely(!pte_same(*ptep, pte))) { | 3233 | if (unlikely(!pte_same(*fe->pte, pte))) { |
3254 | pte_unmap_unlock(ptep, ptl); | 3234 | pte_unmap_unlock(fe->pte, fe->ptl); |
3255 | goto out; | 3235 | goto out; |
3256 | } | 3236 | } |
3257 | 3237 | ||
@@ -3260,18 +3240,18 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3260 | pte = pte_mkyoung(pte); | 3240 | pte = pte_mkyoung(pte); |
3261 | if (was_writable) | 3241 | if (was_writable) |
3262 | pte = pte_mkwrite(pte); | 3242 | pte = pte_mkwrite(pte); |
3263 | set_pte_at(mm, addr, ptep, pte); | 3243 | set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); |
3264 | update_mmu_cache(vma, addr, ptep); | 3244 | update_mmu_cache(vma, fe->address, fe->pte); |
3265 | 3245 | ||
3266 | page = vm_normal_page(vma, addr, pte); | 3246 | page = vm_normal_page(vma, fe->address, pte); |
3267 | if (!page) { | 3247 | if (!page) { |
3268 | pte_unmap_unlock(ptep, ptl); | 3248 | pte_unmap_unlock(fe->pte, fe->ptl); |
3269 | return 0; | 3249 | return 0; |
3270 | } | 3250 | } |
3271 | 3251 | ||
3272 | /* TODO: handle PTE-mapped THP */ | 3252 | /* TODO: handle PTE-mapped THP */ |
3273 | if (PageCompound(page)) { | 3253 | if (PageCompound(page)) { |
3274 | pte_unmap_unlock(ptep, ptl); | 3254 | pte_unmap_unlock(fe->pte, fe->ptl); |
3275 | return 0; | 3255 | return 0; |
3276 | } | 3256 | } |
3277 | 3257 | ||
@@ -3295,8 +3275,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3295 | 3275 | ||
3296 | last_cpupid = page_cpupid_last(page); | 3276 | last_cpupid = page_cpupid_last(page); |
3297 | page_nid = page_to_nid(page); | 3277 | page_nid = page_to_nid(page); |
3298 | target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); | 3278 | target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, |
3299 | pte_unmap_unlock(ptep, ptl); | 3279 | &flags); |
3280 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
3300 | if (target_nid == -1) { | 3281 | if (target_nid == -1) { |
3301 | put_page(page); | 3282 | put_page(page); |
3302 | goto out; | 3283 | goto out; |
@@ -3316,24 +3297,24 @@ out: | |||
3316 | return 0; | 3297 | return 0; |
3317 | } | 3298 | } |
3318 | 3299 | ||
3319 | static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | 3300 | static int create_huge_pmd(struct fault_env *fe) |
3320 | unsigned long address, pmd_t *pmd, unsigned int flags) | ||
3321 | { | 3301 | { |
3302 | struct vm_area_struct *vma = fe->vma; | ||
3322 | if (vma_is_anonymous(vma)) | 3303 | if (vma_is_anonymous(vma)) |
3323 | return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); | 3304 | return do_huge_pmd_anonymous_page(fe); |
3324 | if (vma->vm_ops->pmd_fault) | 3305 | if (vma->vm_ops->pmd_fault) |
3325 | return vma->vm_ops->pmd_fault(vma, address, pmd, flags); | 3306 | return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, |
3307 | fe->flags); | ||
3326 | return VM_FAULT_FALLBACK; | 3308 | return VM_FAULT_FALLBACK; |
3327 | } | 3309 | } |
3328 | 3310 | ||
3329 | static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | 3311 | static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) |
3330 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd, | ||
3331 | unsigned int flags) | ||
3332 | { | 3312 | { |
3333 | if (vma_is_anonymous(vma)) | 3313 | if (vma_is_anonymous(fe->vma)) |
3334 | return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); | 3314 | return do_huge_pmd_wp_page(fe, orig_pmd); |
3335 | if (vma->vm_ops->pmd_fault) | 3315 | if (fe->vma->vm_ops->pmd_fault) |
3336 | return vma->vm_ops->pmd_fault(vma, address, pmd, flags); | 3316 | return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, |
3317 | fe->flags); | ||
3337 | return VM_FAULT_FALLBACK; | 3318 | return VM_FAULT_FALLBACK; |
3338 | } | 3319 | } |
3339 | 3320 | ||
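Note that the driver-visible ->pmd_fault() callback is left unconverted here: create_huge_pmd() and wp_huge_pmd() unpack fe at that boundary, while the anonymous-THP helpers (do_huge_pmd_anonymous_page(), do_huge_pmd_wp_page()) switch to taking fe directly. The callback shape implied by the calls above:

/* unchanged in this patch (inferred from the call sites):
 *   int (*pmd_fault)(struct vm_area_struct *vma, unsigned long address,
 *                    pmd_t *pmd, unsigned int flags);
 */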
@@ -3353,12 +3334,9 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3353 | * The mmap_sem may have been released depending on flags and our | 3334 | * The mmap_sem may have been released depending on flags and our |
3354 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3335 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3355 | */ | 3336 | */ |
3356 | static int handle_pte_fault(struct mm_struct *mm, | 3337 | static int handle_pte_fault(struct fault_env *fe) |
3357 | struct vm_area_struct *vma, unsigned long address, | ||
3358 | pte_t *pte, pmd_t *pmd, unsigned int flags) | ||
3359 | { | 3338 | { |
3360 | pte_t entry; | 3339 | pte_t entry; |
3361 | spinlock_t *ptl; | ||
3362 | 3340 | ||
3363 | /* | 3341 | /* |
3364 | * some architectures can have larger ptes than wordsize, | 3342 | * some architectures can have larger ptes than wordsize, |
@@ -3368,37 +3346,34 @@ static int handle_pte_fault(struct mm_struct *mm, | |||
3368 | * we later double check anyway with the ptl lock held. So here | 3346 | * we later double check anyway with the ptl lock held. So here |
3369 | * a barrier will do. | 3347 | * a barrier will do. |
3370 | */ | 3348 | */ |
3371 | entry = *pte; | 3349 | entry = *fe->pte; |
3372 | barrier(); | 3350 | barrier(); |
3373 | if (!pte_present(entry)) { | 3351 | if (!pte_present(entry)) { |
3374 | if (pte_none(entry)) { | 3352 | if (pte_none(entry)) { |
3375 | if (vma_is_anonymous(vma)) | 3353 | if (vma_is_anonymous(fe->vma)) |
3376 | return do_anonymous_page(mm, vma, address, | 3354 | return do_anonymous_page(fe); |
3377 | pte, pmd, flags); | ||
3378 | else | 3355 | else |
3379 | return do_fault(mm, vma, address, pte, pmd, | 3356 | return do_fault(fe, entry); |
3380 | flags, entry); | ||
3381 | } | 3357 | } |
3382 | return do_swap_page(mm, vma, address, | 3358 | return do_swap_page(fe, entry); |
3383 | pte, pmd, flags, entry); | ||
3384 | } | 3359 | } |
3385 | 3360 | ||
3386 | if (pte_protnone(entry)) | 3361 | if (pte_protnone(entry)) |
3387 | return do_numa_page(mm, vma, address, entry, pte, pmd); | 3362 | return do_numa_page(fe, entry); |
3388 | 3363 | ||
3389 | ptl = pte_lockptr(mm, pmd); | 3364 | fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); |
3390 | spin_lock(ptl); | 3365 | spin_lock(fe->ptl); |
3391 | if (unlikely(!pte_same(*pte, entry))) | 3366 | if (unlikely(!pte_same(*fe->pte, entry))) |
3392 | goto unlock; | 3367 | goto unlock; |
3393 | if (flags & FAULT_FLAG_WRITE) { | 3368 | if (fe->flags & FAULT_FLAG_WRITE) { |
3394 | if (!pte_write(entry)) | 3369 | if (!pte_write(entry)) |
3395 | return do_wp_page(mm, vma, address, | 3370 | return do_wp_page(fe, entry); |
3396 | pte, pmd, ptl, entry); | ||
3397 | entry = pte_mkdirty(entry); | 3371 | entry = pte_mkdirty(entry); |
3398 | } | 3372 | } |
3399 | entry = pte_mkyoung(entry); | 3373 | entry = pte_mkyoung(entry); |
3400 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { | 3374 | if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, |
3401 | update_mmu_cache(vma, address, pte); | 3375 | fe->flags & FAULT_FLAG_WRITE)) { |
3376 | update_mmu_cache(fe->vma, fe->address, fe->pte); | ||
3402 | } else { | 3377 | } else { |
3403 | /* | 3378 | /* |
3404 | * This is needed only for protection faults but the arch code | 3379 | * This is needed only for protection faults but the arch code |
@@ -3406,11 +3381,11 @@ static int handle_pte_fault(struct mm_struct *mm, | |||
3406 | * This still avoids useless tlb flushes for .text page faults | 3381 | * This still avoids useless tlb flushes for .text page faults |
3407 | * with threads. | 3382 | * with threads. |
3408 | */ | 3383 | */ |
3409 | if (flags & FAULT_FLAG_WRITE) | 3384 | if (fe->flags & FAULT_FLAG_WRITE) |
3410 | flush_tlb_fix_spurious_fault(vma, address); | 3385 | flush_tlb_fix_spurious_fault(fe->vma, fe->address); |
3411 | } | 3386 | } |
3412 | unlock: | 3387 | unlock: |
3413 | pte_unmap_unlock(pte, ptl); | 3388 | pte_unmap_unlock(fe->pte, fe->ptl); |
3414 | return 0; | 3389 | return 0; |
3415 | } | 3390 | } |
3416 | 3391 | ||
@@ -3423,51 +3398,42 @@ unlock: | |||
3423 | static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | 3398 | static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, |
3424 | unsigned int flags) | 3399 | unsigned int flags) |
3425 | { | 3400 | { |
3401 | struct fault_env fe = { | ||
3402 | .vma = vma, | ||
3403 | .address = address, | ||
3404 | .flags = flags, | ||
3405 | }; | ||
3426 | struct mm_struct *mm = vma->vm_mm; | 3406 | struct mm_struct *mm = vma->vm_mm; |
3427 | pgd_t *pgd; | 3407 | pgd_t *pgd; |
3428 | pud_t *pud; | 3408 | pud_t *pud; |
3429 | pmd_t *pmd; | ||
3430 | pte_t *pte; | ||
3431 | |||
3432 | if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, | ||
3433 | flags & FAULT_FLAG_INSTRUCTION, | ||
3434 | flags & FAULT_FLAG_REMOTE)) | ||
3435 | return VM_FAULT_SIGSEGV; | ||
3436 | |||
3437 | if (unlikely(is_vm_hugetlb_page(vma))) | ||
3438 | return hugetlb_fault(mm, vma, address, flags); | ||
3439 | 3409 | ||
3440 | pgd = pgd_offset(mm, address); | 3410 | pgd = pgd_offset(mm, address); |
3441 | pud = pud_alloc(mm, pgd, address); | 3411 | pud = pud_alloc(mm, pgd, address); |
3442 | if (!pud) | 3412 | if (!pud) |
3443 | return VM_FAULT_OOM; | 3413 | return VM_FAULT_OOM; |
3444 | pmd = pmd_alloc(mm, pud, address); | 3414 | fe.pmd = pmd_alloc(mm, pud, address); |
3445 | if (!pmd) | 3415 | if (!fe.pmd) |
3446 | return VM_FAULT_OOM; | 3416 | return VM_FAULT_OOM; |
3447 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { | 3417 | if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { |
3448 | int ret = create_huge_pmd(mm, vma, address, pmd, flags); | 3418 | int ret = create_huge_pmd(&fe); |
3449 | if (!(ret & VM_FAULT_FALLBACK)) | 3419 | if (!(ret & VM_FAULT_FALLBACK)) |
3450 | return ret; | 3420 | return ret; |
3451 | } else { | 3421 | } else { |
3452 | pmd_t orig_pmd = *pmd; | 3422 | pmd_t orig_pmd = *fe.pmd; |
3453 | int ret; | 3423 | int ret; |
3454 | 3424 | ||
3455 | barrier(); | 3425 | barrier(); |
3456 | if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { | 3426 | if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { |
3457 | unsigned int dirty = flags & FAULT_FLAG_WRITE; | ||
3458 | |||
3459 | if (pmd_protnone(orig_pmd)) | 3427 | if (pmd_protnone(orig_pmd)) |
3460 | return do_huge_pmd_numa_page(mm, vma, address, | 3428 | return do_huge_pmd_numa_page(&fe, orig_pmd); |
3461 | orig_pmd, pmd); | ||
3462 | 3429 | ||
3463 | if (dirty && !pmd_write(orig_pmd)) { | 3430 | if ((fe.flags & FAULT_FLAG_WRITE) && |
3464 | ret = wp_huge_pmd(mm, vma, address, pmd, | 3431 | !pmd_write(orig_pmd)) { |
3465 | orig_pmd, flags); | 3432 | ret = wp_huge_pmd(&fe, orig_pmd); |
3466 | if (!(ret & VM_FAULT_FALLBACK)) | 3433 | if (!(ret & VM_FAULT_FALLBACK)) |
3467 | return ret; | 3434 | return ret; |
3468 | } else { | 3435 | } else { |
3469 | huge_pmd_set_accessed(mm, vma, address, pmd, | 3436 | huge_pmd_set_accessed(&fe, orig_pmd); |
3470 | orig_pmd, dirty); | ||
3471 | return 0; | 3437 | return 0; |
3472 | } | 3438 | } |
3473 | } | 3439 | } |
@@ -3478,7 +3444,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |||
3478 | * run pte_offset_map on the pmd, if an huge pmd could | 3444 | * run pte_offset_map on the pmd, if an huge pmd could |
3479 | * materialize from under us from a different thread. | 3445 | * materialize from under us from a different thread. |
3480 | */ | 3446 | */ |
3481 | if (unlikely(pte_alloc(mm, pmd, address))) | 3447 | if (unlikely(pte_alloc(fe.vma->vm_mm, fe.pmd, fe.address))) |
3482 | return VM_FAULT_OOM; | 3448 | return VM_FAULT_OOM; |
3483 | /* | 3449 | /* |
3484 | * If a huge pmd materialized under us just retry later. Use | 3450 | * If a huge pmd materialized under us just retry later. Use |
@@ -3491,7 +3457,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |||
3491 | * through an atomic read in C, which is what pmd_trans_unstable() | 3457 | * through an atomic read in C, which is what pmd_trans_unstable() |
3492 | * provides. | 3458 | * provides. |
3493 | */ | 3459 | */ |
3494 | if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd))) | 3460 | if (unlikely(pmd_trans_unstable(fe.pmd) || pmd_devmap(*fe.pmd))) |
3495 | return 0; | 3461 | return 0; |
3496 | /* | 3462 | /* |
3497 | * A regular pmd is established and it can't morph into a huge pmd | 3463 | * A regular pmd is established and it can't morph into a huge pmd |
@@ -3499,9 +3465,9 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |||
3499 | * read mode and khugepaged takes it in write mode. So now it's | 3465 | * read mode and khugepaged takes it in write mode. So now it's |
3500 | * safe to run pte_offset_map(). | 3466 | * safe to run pte_offset_map(). |
3501 | */ | 3467 | */ |
3502 | pte = pte_offset_map(pmd, address); | 3468 | fe.pte = pte_offset_map(fe.pmd, fe.address); |
3503 | 3469 | ||
3504 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 3470 | return handle_pte_fault(&fe); |
3505 | } | 3471 | } |
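
Seen end to end, the __handle_mm_fault() hunk shows the new calling convention: the per-fault state is built once on the stack and the page-table pointers are filled in as the walk proceeds, so the pte-level helpers need only the structure (plus the snapshotted entry where races matter). A hedged before/after sketch of the pte-level entry point, with the old prototype taken from the left column:

	/* Before (left column): every helper carried the full argument list. */
	static int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *pte, pmd_t *pmd,
			unsigned int flags);

	/* After (right column): the same state travels in one structure. */
	static int handle_pte_fault(struct fault_env *fe);

	/* Caller side, as in __handle_mm_fault() above: */
	struct fault_env fe = {
		.vma = vma,
		.address = address,
		.flags = flags,
	};
	/* fe.pmd is set after pmd_alloc(), fe.pte after pte_offset_map(). */
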
3506 | 3472 | ||
3507 | /* | 3473 | /* |
@@ -3530,7 +3496,15 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |||
3530 | if (flags & FAULT_FLAG_USER) | 3496 | if (flags & FAULT_FLAG_USER) |
3531 | mem_cgroup_oom_enable(); | 3497 | mem_cgroup_oom_enable(); |
3532 | 3498 | ||
3533 | ret = __handle_mm_fault(vma, address, flags); | 3499 | if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, |
3500 | flags & FAULT_FLAG_INSTRUCTION, | ||
3501 | flags & FAULT_FLAG_REMOTE)) | ||
3502 | return VM_FAULT_SIGSEGV; | ||
3503 | |||
3504 | if (unlikely(is_vm_hugetlb_page(vma))) | ||
3505 | ret = hugetlb_fault(vma->vm_mm, vma, address, flags); | ||
3506 | else | ||
3507 | ret = __handle_mm_fault(vma, address, flags); | ||
3534 | 3508 | ||
3535 | if (flags & FAULT_FLAG_USER) { | 3509 | if (flags & FAULT_FLAG_USER) { |
3536 | mem_cgroup_oom_disable(); | 3510 | mem_cgroup_oom_disable(); |
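
Note also what moved out of __handle_mm_fault(): the arch permission check and the hugetlb dispatch now sit in handle_mm_fault(), so the fault_env-based walk only ever sees regular VMAs, while hugetlb_fault() keeps its original argument list. The exported entry point is untouched, so architecture fault handlers are unaffected; below is an illustrative caller, not code from this patch, with mmap_sem locking and error handling elided and all local names assumed:

	/* Illustrative sketch only: names and surrounding logic are assumptions. */
	static int example_arch_do_fault(unsigned long address, unsigned int flags)
	{
		struct vm_area_struct *vma = find_vma(current->mm, address);

		if (!vma || vma->vm_start > address)
			return VM_FAULT_SIGSEGV;	/* real handlers raise SIGSEGV */

		/* Same call before and after this patch. */
		return handle_mm_fault(vma, address, flags);
	}
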
diff --git a/mm/nommu.c b/mm/nommu.c index c2e58880207f..95daf81a4855 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1809,7 +1809,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1809 | } | 1809 | } |
1810 | EXPORT_SYMBOL(filemap_fault); | 1810 | EXPORT_SYMBOL(filemap_fault); |
1811 | 1811 | ||
1812 | void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) | 1812 | void filemap_map_pages(struct fault_env *fe, |
1813 | pgoff_t start_pgoff, pgoff_t end_pgoff) | ||
1813 | { | 1814 | { |
1814 | BUG(); | 1815 | BUG(); |
1815 | } | 1816 | } |
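
Finally, the nommu stub records the interface change that the rest of the patch implements: ->map_pages() now receives the fault_env plus an explicit page-offset range instead of a vm_fault. A sketch of how a filesystem reusing the generic helper sees the change; the vm_ops table and the page_mkwrite helper below are illustrative, not part of this patch:

	/* Old prototype (left column): */
	void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf);

	/* New prototype (right column): */
	void filemap_map_pages(struct fault_env *fe,
			pgoff_t start_pgoff, pgoff_t end_pgoff);

	/* The hookup in a filesystem's vm_operations_struct is unchanged; only
	 * the callback's signature moves. 'example_page_mkwrite' is hypothetical. */
	static const struct vm_operations_struct example_file_vm_ops = {
		.fault		= filemap_fault,
		.map_pages	= filemap_map_pages,
		.page_mkwrite	= example_page_mkwrite,
	};
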