author     Kirill A. Shutemov <kirill.shutemov@linux.intel.com>  2016-07-26 18:25:20 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>        2016-07-26 19:19:19 -0400
commit     bae473a423f65e480db83c85b5e92254f6dfcb28 (patch)
tree       9e09cd8cbcafdcc1a27298700f69f8f86f929392
parent     dcddffd41d3f1d3bdcc1dce3f1cd142779b6d4c1 (diff)
mm: introduce fault_env
The idea is borrowed from Peter's patch from the patchset on speculative
page faults [1]: instead of passing around an endless list of function
arguments, replace the lot with a single structure so we can change
context without endless function signature changes.

The changes are mostly mechanical, with the exception of the faultaround
code: filemap_map_pages() got reworked a bit.

This patch is preparation for the next one.

[1] http://lkml.kernel.org/r/20141020222841.302891540@infradead.org

Link: http://lkml.kernel.org/r/1466021202-61880-9-git-send-email-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
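To make the shape of the change concrete, here is a minimal before/after
sketch of the calling convention (illustrative only; do_example_fault() is
an invented name, not a function touched by this patch):

/* Before: shared fault state threaded through every helper's signature. */
static int do_example_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *pte, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte);

/* After: the shared state lives in one struct fault_env, passed by pointer. */
static int do_example_fault(struct fault_env *fe, pte_t orig_pte);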
-rw-r--r--  Documentation/filesystems/Locking    10
-rw-r--r--  fs/userfaultfd.c                     22
-rw-r--r--  include/linux/huge_mm.h              20
-rw-r--r--  include/linux/mm.h                   34
-rw-r--r--  include/linux/userfaultfd_k.h         8
-rw-r--r--  mm/filemap.c                         28
-rw-r--r--  mm/huge_memory.c                    280
-rw-r--r--  mm/internal.h                         4
-rw-r--r--  mm/memory.c                         582
-rw-r--r--  mm/nommu.c                            3
10 files changed, 475 insertions(+), 516 deletions(-)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index dda6e3f8e203..5a7386e38e2d 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -548,13 +548,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
548locked. The VM will unlock the page. 548locked. The VM will unlock the page.
549 549
550 ->map_pages() is called when VM asks to map easy accessible pages. 550 ->map_pages() is called when VM asks to map easy accessible pages.
551Filesystem should find and map pages associated with offsets from "pgoff" 551Filesystem should find and map pages associated with offsets from "start_pgoff"
552till "max_pgoff". ->map_pages() is called with page table locked and must 552till "end_pgoff". ->map_pages() is called with page table locked and must
553not block. If it's not possible to reach a page without blocking, 553not block. If it's not possible to reach a page without blocking,
554filesystem should skip it. Filesystem should use do_set_pte() to setup 554filesystem should skip it. Filesystem should use do_set_pte() to setup
555page table entry. Pointer to entry associated with offset "pgoff" is 555page table entry. Pointer to entry associated with the page is passed in
556passed in "pte" field in vm_fault structure. Pointers to entries for other 556"pte" field in fault_env structure. Pointers to entries for other offsets
557offsets should be calculated relative to "pte". 557should be calculated relative to "pte".
558 558
559 ->page_mkwrite() is called when a previously read-only pte is 559 ->page_mkwrite() is called when a previously read-only pte is
560about to become writeable. The filesystem again must ensure that there are 560about to become writeable. The filesystem again must ensure that there are
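As a rough illustration of the ->map_pages() contract documented above
(a sketch only, not code from this patch; page lookup, locking and
reference counting are deliberately omitted, and
example_find_uptodate_page() is a hypothetical non-blocking lookup):

static void example_map_pages(struct fault_env *fe,
			      pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	pgoff_t last_pgoff = start_pgoff;
	pgoff_t pgoff;

	for (pgoff = start_pgoff; pgoff <= end_pgoff; pgoff++) {
		struct page *page;

		page = example_find_uptodate_page(fe->vma->vm_file, pgoff);
		if (!page)		/* must not block: skip this offset */
			continue;

		/* Pointers for other offsets are relative to fe->pte. */
		fe->pte += pgoff - last_pgoff;
		fe->address += (pgoff - last_pgoff) << PAGE_SHIFT;
		last_pgoff = pgoff;

		if (pte_none(*fe->pte))
			do_set_pte(fe, page);
	}
}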
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 2d97952e341a..85959d8324df 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -257,10 +257,9 @@ out:
257 * fatal_signal_pending()s, and the mmap_sem must be released before 257 * fatal_signal_pending()s, and the mmap_sem must be released before
258 * returning it. 258 * returning it.
259 */ 259 */
260int handle_userfault(struct vm_area_struct *vma, unsigned long address, 260int handle_userfault(struct fault_env *fe, unsigned long reason)
261 unsigned int flags, unsigned long reason)
262{ 261{
263 struct mm_struct *mm = vma->vm_mm; 262 struct mm_struct *mm = fe->vma->vm_mm;
264 struct userfaultfd_ctx *ctx; 263 struct userfaultfd_ctx *ctx;
265 struct userfaultfd_wait_queue uwq; 264 struct userfaultfd_wait_queue uwq;
266 int ret; 265 int ret;
@@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
269 BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 268 BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
270 269
271 ret = VM_FAULT_SIGBUS; 270 ret = VM_FAULT_SIGBUS;
272 ctx = vma->vm_userfaultfd_ctx.ctx; 271 ctx = fe->vma->vm_userfaultfd_ctx.ctx;
273 if (!ctx) 272 if (!ctx)
274 goto out; 273 goto out;
275 274
@@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
302 * without first stopping userland access to the memory. For 301 * without first stopping userland access to the memory. For
303 * VM_UFFD_MISSING userfaults this is enough for now. 302 * VM_UFFD_MISSING userfaults this is enough for now.
304 */ 303 */
305 if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { 304 if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) {
306 /* 305 /*
307 * Validate the invariant that nowait must allow retry 306 * Validate the invariant that nowait must allow retry
308 * to be sure not to return SIGBUS erroneously on 307 * to be sure not to return SIGBUS erroneously on
309 * nowait invocations. 308 * nowait invocations.
310 */ 309 */
311 BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); 310 BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT);
312#ifdef CONFIG_DEBUG_VM 311#ifdef CONFIG_DEBUG_VM
313 if (printk_ratelimit()) { 312 if (printk_ratelimit()) {
314 printk(KERN_WARNING 313 printk(KERN_WARNING
315 "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); 314 "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags);
316 dump_stack(); 315 dump_stack();
317 } 316 }
318#endif 317#endif
@@ -324,7 +323,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
324 * and wait. 323 * and wait.
325 */ 324 */
326 ret = VM_FAULT_RETRY; 325 ret = VM_FAULT_RETRY;
327 if (flags & FAULT_FLAG_RETRY_NOWAIT) 326 if (fe->flags & FAULT_FLAG_RETRY_NOWAIT)
328 goto out; 327 goto out;
329 328
330 /* take the reference before dropping the mmap_sem */ 329 /* take the reference before dropping the mmap_sem */
@@ -332,10 +331,11 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
332 331
333 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); 332 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
334 uwq.wq.private = current; 333 uwq.wq.private = current;
335 uwq.msg = userfault_msg(address, flags, reason); 334 uwq.msg = userfault_msg(fe->address, fe->flags, reason);
336 uwq.ctx = ctx; 335 uwq.ctx = ctx;
337 336
338 return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == 337 return_to_userland =
338 (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
339 (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); 339 (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
340 340
341 spin_lock(&ctx->fault_pending_wqh.lock); 341 spin_lock(&ctx->fault_pending_wqh.lock);
@@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
353 TASK_KILLABLE); 353 TASK_KILLABLE);
354 spin_unlock(&ctx->fault_pending_wqh.lock); 354 spin_unlock(&ctx->fault_pending_wqh.lock);
355 355
356 must_wait = userfaultfd_must_wait(ctx, address, flags, reason); 356 must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason);
357 up_read(&mm->mmap_sem); 357 up_read(&mm->mmap_sem);
358 358
359 if (likely(must_wait && !ACCESS_ONCE(ctx->released) && 359 if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f0a7a0320300..9bed9249156f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -1,20 +1,12 @@
1#ifndef _LINUX_HUGE_MM_H 1#ifndef _LINUX_HUGE_MM_H
2#define _LINUX_HUGE_MM_H 2#define _LINUX_HUGE_MM_H
3 3
4extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, 4extern int do_huge_pmd_anonymous_page(struct fault_env *fe);
5 struct vm_area_struct *vma,
6 unsigned long address, pmd_t *pmd,
7 unsigned int flags);
8extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 5extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
9 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 6 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
10 struct vm_area_struct *vma); 7 struct vm_area_struct *vma);
11extern void huge_pmd_set_accessed(struct mm_struct *mm, 8extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd);
12 struct vm_area_struct *vma, 9extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd);
13 unsigned long address, pmd_t *pmd,
14 pmd_t orig_pmd, int dirty);
15extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
16 unsigned long address, pmd_t *pmd,
17 pmd_t orig_pmd);
18extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 10extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
19 unsigned long addr, 11 unsigned long addr,
20 pmd_t *pmd, 12 pmd_t *pmd,
@@ -134,8 +126,7 @@ static inline int hpage_nr_pages(struct page *page)
134 return 1; 126 return 1;
135} 127}
136 128
137extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 129extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd);
138 unsigned long addr, pmd_t pmd, pmd_t *pmdp);
139 130
140extern struct page *huge_zero_page; 131extern struct page *huge_zero_page;
141 132
@@ -196,8 +187,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
196 return NULL; 187 return NULL;
197} 188}
198 189
199static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 190static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd)
200 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
201{ 191{
202 return 0; 192 return 0;
203} 193}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 646bc36b4d1b..8bd74558c0e4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -309,10 +309,27 @@ struct vm_fault {
309 * VM_FAULT_DAX_LOCKED and fill in 309 * VM_FAULT_DAX_LOCKED and fill in
310 * entry here. 310 * entry here.
311 */ 311 */
312 /* for ->map_pages() only */ 312};
313 pgoff_t max_pgoff; /* map pages for offset from pgoff till 313
314 * max_pgoff inclusive */ 314/*
315 pte_t *pte; /* pte entry associated with ->pgoff */ 315 * Page fault context: passes though page fault handler instead of endless list
316 * of function arguments.
317 */
318struct fault_env {
319 struct vm_area_struct *vma; /* Target VMA */
320 unsigned long address; /* Faulting virtual address */
321 unsigned int flags; /* FAULT_FLAG_xxx flags */
322 pmd_t *pmd; /* Pointer to pmd entry matching
323 * the 'address'
324 */
325 pte_t *pte; /* Pointer to pte entry matching
326 * the 'address'. NULL if the page
327 * table hasn't been allocated.
328 */
329 spinlock_t *ptl; /* Page table lock.
330 * Protects pte page table if 'pte'
331 * is not NULL, otherwise pmd.
332 */
316}; 333};
317 334
318/* 335/*
@@ -327,7 +344,8 @@ struct vm_operations_struct {
327 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); 344 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
328 int (*pmd_fault)(struct vm_area_struct *, unsigned long address, 345 int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
329 pmd_t *, unsigned int flags); 346 pmd_t *, unsigned int flags);
330 void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); 347 void (*map_pages)(struct fault_env *fe,
348 pgoff_t start_pgoff, pgoff_t end_pgoff);
331 349
332 /* notification that a previously read-only page is about to become 350 /* notification that a previously read-only page is about to become
333 * writable, if an error is returned it will cause a SIGBUS */ 351 * writable, if an error is returned it will cause a SIGBUS */
@@ -600,8 +618,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
600 return pte; 618 return pte;
601} 619}
602 620
603void do_set_pte(struct vm_area_struct *vma, unsigned long address, 621void do_set_pte(struct fault_env *fe, struct page *page);
604 struct page *page, pte_t *pte, bool write, bool anon);
605#endif 622#endif
606 623
607/* 624/*
@@ -2062,7 +2079,8 @@ extern void truncate_inode_pages_final(struct address_space *);
2062 2079
2063/* generic vm_area_ops exported for stackable file systems */ 2080/* generic vm_area_ops exported for stackable file systems */
2064extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); 2081extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
2065extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf); 2082extern void filemap_map_pages(struct fault_env *fe,
2083 pgoff_t start_pgoff, pgoff_t end_pgoff);
2066extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2084extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2067 2085
2068/* mm/page-writeback.c */ 2086/* mm/page-writeback.c */
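A hedged usage sketch of the new struct fault_env and the fault_env-based
entry points declared above (illustrative only; example_create_huge_pmd()
is an invented name, loosely mirroring how the fault path hands the
context around elsewhere in this patch):

static int example_create_huge_pmd(struct vm_area_struct *vma,
				   unsigned long address, unsigned int flags,
				   pmd_t *pmd)
{
	struct fault_env fe = {
		.vma = vma,
		.address = address,
		.flags = flags,
		.pmd = pmd,
		/* .pte and .ptl are filled in by whoever maps/locks the pte */
	};

	if (vma_is_anonymous(vma))
		return do_huge_pmd_anonymous_page(&fe);
	return VM_FAULT_FALLBACK;
}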
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 587480ad41b7..dd66a952e8cd 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -27,8 +27,7 @@
27#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) 27#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
28#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) 28#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
29 29
30extern int handle_userfault(struct vm_area_struct *vma, unsigned long address, 30extern int handle_userfault(struct fault_env *fe, unsigned long reason);
31 unsigned int flags, unsigned long reason);
32 31
33extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, 32extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
34 unsigned long src_start, unsigned long len); 33 unsigned long src_start, unsigned long len);
@@ -56,10 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
56#else /* CONFIG_USERFAULTFD */ 55#else /* CONFIG_USERFAULTFD */
57 56
58/* mm helpers */ 57/* mm helpers */
59static inline int handle_userfault(struct vm_area_struct *vma, 58static inline int handle_userfault(struct fault_env *fe, unsigned long reason)
60 unsigned long address,
61 unsigned int flags,
62 unsigned long reason)
63{ 59{
64 return VM_FAULT_SIGBUS; 60 return VM_FAULT_SIGBUS;
65} 61}
diff --git a/mm/filemap.c b/mm/filemap.c
index 20f3b1f33f0e..54d5318f8d3f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2128,22 +2128,27 @@ page_not_uptodate:
2128} 2128}
2129EXPORT_SYMBOL(filemap_fault); 2129EXPORT_SYMBOL(filemap_fault);
2130 2130
2131void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) 2131void filemap_map_pages(struct fault_env *fe,
2132 pgoff_t start_pgoff, pgoff_t end_pgoff)
2132{ 2133{
2133 struct radix_tree_iter iter; 2134 struct radix_tree_iter iter;
2134 void **slot; 2135 void **slot;
2135 struct file *file = vma->vm_file; 2136 struct file *file = fe->vma->vm_file;
2136 struct address_space *mapping = file->f_mapping; 2137 struct address_space *mapping = file->f_mapping;
2138 pgoff_t last_pgoff = start_pgoff;
2137 loff_t size; 2139 loff_t size;
2138 struct page *page; 2140 struct page *page;
2139 unsigned long address = (unsigned long) vmf->virtual_address;
2140 unsigned long addr;
2141 pte_t *pte;
2142 2141
2143 rcu_read_lock(); 2142 rcu_read_lock();
2144 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { 2143 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
2145 if (iter.index > vmf->max_pgoff) 2144 start_pgoff) {
2145 if (iter.index > end_pgoff)
2146 break; 2146 break;
2147 fe->pte += iter.index - last_pgoff;
2148 fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
2149 last_pgoff = iter.index;
2150 if (!pte_none(*fe->pte))
2151 goto next;
2147repeat: 2152repeat:
2148 page = radix_tree_deref_slot(slot); 2153 page = radix_tree_deref_slot(slot);
2149 if (unlikely(!page)) 2154 if (unlikely(!page))
@@ -2179,14 +2184,9 @@ repeat:
2179 if (page->index >= size >> PAGE_SHIFT) 2184 if (page->index >= size >> PAGE_SHIFT)
2180 goto unlock; 2185 goto unlock;
2181 2186
2182 pte = vmf->pte + page->index - vmf->pgoff;
2183 if (!pte_none(*pte))
2184 goto unlock;
2185
2186 if (file->f_ra.mmap_miss > 0) 2187 if (file->f_ra.mmap_miss > 0)
2187 file->f_ra.mmap_miss--; 2188 file->f_ra.mmap_miss--;
2188 addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; 2189 do_set_pte(fe, page);
2189 do_set_pte(vma, addr, page, pte, false, false);
2190 unlock_page(page); 2190 unlock_page(page);
2191 goto next; 2191 goto next;
2192unlock: 2192unlock:
@@ -2194,7 +2194,7 @@ unlock:
2194skip: 2194skip:
2195 put_page(page); 2195 put_page(page);
2196next: 2196next:
2197 if (iter.index == vmf->max_pgoff) 2197 if (iter.index == end_pgoff)
2198 break; 2198 break;
2199 } 2199 }
2200 rcu_read_unlock(); 2200 rcu_read_unlock();
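For users of the generic helpers, only the .map_pages signature changes;
the wiring itself stays the same. A sketch modelled on the generic file
vm_ops, shown purely for illustration:

static const struct vm_operations_struct example_file_vm_ops = {
	.fault		= filemap_fault,
	/* new prototype: void (*map_pages)(struct fault_env *, pgoff_t, pgoff_t) */
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};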
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1a90f55d930f..bc5abcbe376e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -821,26 +821,23 @@ void prep_transhuge_page(struct page *page)
821 set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); 821 set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
822} 822}
823 823
824static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 824static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
825 struct vm_area_struct *vma, 825 gfp_t gfp)
826 unsigned long address, pmd_t *pmd,
827 struct page *page, gfp_t gfp,
828 unsigned int flags)
829{ 826{
827 struct vm_area_struct *vma = fe->vma;
830 struct mem_cgroup *memcg; 828 struct mem_cgroup *memcg;
831 pgtable_t pgtable; 829 pgtable_t pgtable;
832 spinlock_t *ptl; 830 unsigned long haddr = fe->address & HPAGE_PMD_MASK;
833 unsigned long haddr = address & HPAGE_PMD_MASK;
834 831
835 VM_BUG_ON_PAGE(!PageCompound(page), page); 832 VM_BUG_ON_PAGE(!PageCompound(page), page);
836 833
837 if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) { 834 if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
838 put_page(page); 835 put_page(page);
839 count_vm_event(THP_FAULT_FALLBACK); 836 count_vm_event(THP_FAULT_FALLBACK);
840 return VM_FAULT_FALLBACK; 837 return VM_FAULT_FALLBACK;
841 } 838 }
842 839
843 pgtable = pte_alloc_one(mm, haddr); 840 pgtable = pte_alloc_one(vma->vm_mm, haddr);
844 if (unlikely(!pgtable)) { 841 if (unlikely(!pgtable)) {
845 mem_cgroup_cancel_charge(page, memcg, true); 842 mem_cgroup_cancel_charge(page, memcg, true);
846 put_page(page); 843 put_page(page);
@@ -855,12 +852,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
855 */ 852 */
856 __SetPageUptodate(page); 853 __SetPageUptodate(page);
857 854
858 ptl = pmd_lock(mm, pmd); 855 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
859 if (unlikely(!pmd_none(*pmd))) { 856 if (unlikely(!pmd_none(*fe->pmd))) {
860 spin_unlock(ptl); 857 spin_unlock(fe->ptl);
861 mem_cgroup_cancel_charge(page, memcg, true); 858 mem_cgroup_cancel_charge(page, memcg, true);
862 put_page(page); 859 put_page(page);
863 pte_free(mm, pgtable); 860 pte_free(vma->vm_mm, pgtable);
864 } else { 861 } else {
865 pmd_t entry; 862 pmd_t entry;
866 863
@@ -868,12 +865,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
868 if (userfaultfd_missing(vma)) { 865 if (userfaultfd_missing(vma)) {
869 int ret; 866 int ret;
870 867
871 spin_unlock(ptl); 868 spin_unlock(fe->ptl);
872 mem_cgroup_cancel_charge(page, memcg, true); 869 mem_cgroup_cancel_charge(page, memcg, true);
873 put_page(page); 870 put_page(page);
874 pte_free(mm, pgtable); 871 pte_free(vma->vm_mm, pgtable);
875 ret = handle_userfault(vma, address, flags, 872 ret = handle_userfault(fe, VM_UFFD_MISSING);
876 VM_UFFD_MISSING);
877 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 873 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
878 return ret; 874 return ret;
879 } 875 }
@@ -883,11 +879,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
883 page_add_new_anon_rmap(page, vma, haddr, true); 879 page_add_new_anon_rmap(page, vma, haddr, true);
884 mem_cgroup_commit_charge(page, memcg, false, true); 880 mem_cgroup_commit_charge(page, memcg, false, true);
885 lru_cache_add_active_or_unevictable(page, vma); 881 lru_cache_add_active_or_unevictable(page, vma);
886 pgtable_trans_huge_deposit(mm, pmd, pgtable); 882 pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable);
887 set_pmd_at(mm, haddr, pmd, entry); 883 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
888 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 884 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
889 atomic_long_inc(&mm->nr_ptes); 885 atomic_long_inc(&vma->vm_mm->nr_ptes);
890 spin_unlock(ptl); 886 spin_unlock(fe->ptl);
891 count_vm_event(THP_FAULT_ALLOC); 887 count_vm_event(THP_FAULT_ALLOC);
892 } 888 }
893 889
@@ -937,13 +933,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
937 return true; 933 return true;
938} 934}
939 935
940int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 936int do_huge_pmd_anonymous_page(struct fault_env *fe)
941 unsigned long address, pmd_t *pmd,
942 unsigned int flags)
943{ 937{
938 struct vm_area_struct *vma = fe->vma;
944 gfp_t gfp; 939 gfp_t gfp;
945 struct page *page; 940 struct page *page;
946 unsigned long haddr = address & HPAGE_PMD_MASK; 941 unsigned long haddr = fe->address & HPAGE_PMD_MASK;
947 942
948 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) 943 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
949 return VM_FAULT_FALLBACK; 944 return VM_FAULT_FALLBACK;
@@ -951,42 +946,40 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
951 return VM_FAULT_OOM; 946 return VM_FAULT_OOM;
952 if (unlikely(khugepaged_enter(vma, vma->vm_flags))) 947 if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
953 return VM_FAULT_OOM; 948 return VM_FAULT_OOM;
954 if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) && 949 if (!(fe->flags & FAULT_FLAG_WRITE) &&
950 !mm_forbids_zeropage(vma->vm_mm) &&
955 transparent_hugepage_use_zero_page()) { 951 transparent_hugepage_use_zero_page()) {
956 spinlock_t *ptl;
957 pgtable_t pgtable; 952 pgtable_t pgtable;
958 struct page *zero_page; 953 struct page *zero_page;
959 bool set; 954 bool set;
960 int ret; 955 int ret;
961 pgtable = pte_alloc_one(mm, haddr); 956 pgtable = pte_alloc_one(vma->vm_mm, haddr);
962 if (unlikely(!pgtable)) 957 if (unlikely(!pgtable))
963 return VM_FAULT_OOM; 958 return VM_FAULT_OOM;
964 zero_page = get_huge_zero_page(); 959 zero_page = get_huge_zero_page();
965 if (unlikely(!zero_page)) { 960 if (unlikely(!zero_page)) {
966 pte_free(mm, pgtable); 961 pte_free(vma->vm_mm, pgtable);
967 count_vm_event(THP_FAULT_FALLBACK); 962 count_vm_event(THP_FAULT_FALLBACK);
968 return VM_FAULT_FALLBACK; 963 return VM_FAULT_FALLBACK;
969 } 964 }
970 ptl = pmd_lock(mm, pmd); 965 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
971 ret = 0; 966 ret = 0;
972 set = false; 967 set = false;
973 if (pmd_none(*pmd)) { 968 if (pmd_none(*fe->pmd)) {
974 if (userfaultfd_missing(vma)) { 969 if (userfaultfd_missing(vma)) {
975 spin_unlock(ptl); 970 spin_unlock(fe->ptl);
976 ret = handle_userfault(vma, address, flags, 971 ret = handle_userfault(fe, VM_UFFD_MISSING);
977 VM_UFFD_MISSING);
978 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 972 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
979 } else { 973 } else {
980 set_huge_zero_page(pgtable, mm, vma, 974 set_huge_zero_page(pgtable, vma->vm_mm, vma,
981 haddr, pmd, 975 haddr, fe->pmd, zero_page);
982 zero_page); 976 spin_unlock(fe->ptl);
983 spin_unlock(ptl);
984 set = true; 977 set = true;
985 } 978 }
986 } else 979 } else
987 spin_unlock(ptl); 980 spin_unlock(fe->ptl);
988 if (!set) { 981 if (!set) {
989 pte_free(mm, pgtable); 982 pte_free(vma->vm_mm, pgtable);
990 put_huge_zero_page(); 983 put_huge_zero_page();
991 } 984 }
992 return ret; 985 return ret;
@@ -998,8 +991,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
998 return VM_FAULT_FALLBACK; 991 return VM_FAULT_FALLBACK;
999 } 992 }
1000 prep_transhuge_page(page); 993 prep_transhuge_page(page);
1001 return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, 994 return __do_huge_pmd_anonymous_page(fe, page, gfp);
1002 flags);
1003} 995}
1004 996
1005static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 997static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1172,38 +1164,31 @@ out:
1172 return ret; 1164 return ret;
1173} 1165}
1174 1166
1175void huge_pmd_set_accessed(struct mm_struct *mm, 1167void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd)
1176 struct vm_area_struct *vma,
1177 unsigned long address,
1178 pmd_t *pmd, pmd_t orig_pmd,
1179 int dirty)
1180{ 1168{
1181 spinlock_t *ptl;
1182 pmd_t entry; 1169 pmd_t entry;
1183 unsigned long haddr; 1170 unsigned long haddr;
1184 1171
1185 ptl = pmd_lock(mm, pmd); 1172 fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd);
1186 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1173 if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
1187 goto unlock; 1174 goto unlock;
1188 1175
1189 entry = pmd_mkyoung(orig_pmd); 1176 entry = pmd_mkyoung(orig_pmd);
1190 haddr = address & HPAGE_PMD_MASK; 1177 haddr = fe->address & HPAGE_PMD_MASK;
1191 if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) 1178 if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry,
1192 update_mmu_cache_pmd(vma, address, pmd); 1179 fe->flags & FAULT_FLAG_WRITE))
1180 update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd);
1193 1181
1194unlock: 1182unlock:
1195 spin_unlock(ptl); 1183 spin_unlock(fe->ptl);
1196} 1184}
1197 1185
1198static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 1186static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
1199 struct vm_area_struct *vma, 1187 struct page *page)
1200 unsigned long address,
1201 pmd_t *pmd, pmd_t orig_pmd,
1202 struct page *page,
1203 unsigned long haddr)
1204{ 1188{
1189 struct vm_area_struct *vma = fe->vma;
1190 unsigned long haddr = fe->address & HPAGE_PMD_MASK;
1205 struct mem_cgroup *memcg; 1191 struct mem_cgroup *memcg;
1206 spinlock_t *ptl;
1207 pgtable_t pgtable; 1192 pgtable_t pgtable;
1208 pmd_t _pmd; 1193 pmd_t _pmd;
1209 int ret = 0, i; 1194 int ret = 0, i;
@@ -1220,11 +1205,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1220 1205
1221 for (i = 0; i < HPAGE_PMD_NR; i++) { 1206 for (i = 0; i < HPAGE_PMD_NR; i++) {
1222 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | 1207 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
1223 __GFP_OTHER_NODE, 1208 __GFP_OTHER_NODE, vma,
1224 vma, address, page_to_nid(page)); 1209 fe->address, page_to_nid(page));
1225 if (unlikely(!pages[i] || 1210 if (unlikely(!pages[i] ||
1226 mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, 1211 mem_cgroup_try_charge(pages[i], vma->vm_mm,
1227 &memcg, false))) { 1212 GFP_KERNEL, &memcg, false))) {
1228 if (pages[i]) 1213 if (pages[i])
1229 put_page(pages[i]); 1214 put_page(pages[i]);
1230 while (--i >= 0) { 1215 while (--i >= 0) {
@@ -1250,41 +1235,41 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1250 1235
1251 mmun_start = haddr; 1236 mmun_start = haddr;
1252 mmun_end = haddr + HPAGE_PMD_SIZE; 1237 mmun_end = haddr + HPAGE_PMD_SIZE;
1253 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1238 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
1254 1239
1255 ptl = pmd_lock(mm, pmd); 1240 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
1256 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1241 if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
1257 goto out_free_pages; 1242 goto out_free_pages;
1258 VM_BUG_ON_PAGE(!PageHead(page), page); 1243 VM_BUG_ON_PAGE(!PageHead(page), page);
1259 1244
1260 pmdp_huge_clear_flush_notify(vma, haddr, pmd); 1245 pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
1261 /* leave pmd empty until pte is filled */ 1246 /* leave pmd empty until pte is filled */
1262 1247
1263 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 1248 pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);
1264 pmd_populate(mm, &_pmd, pgtable); 1249 pmd_populate(vma->vm_mm, &_pmd, pgtable);
1265 1250
1266 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 1251 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1267 pte_t *pte, entry; 1252 pte_t entry;
1268 entry = mk_pte(pages[i], vma->vm_page_prot); 1253 entry = mk_pte(pages[i], vma->vm_page_prot);
1269 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1254 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1270 memcg = (void *)page_private(pages[i]); 1255 memcg = (void *)page_private(pages[i]);
1271 set_page_private(pages[i], 0); 1256 set_page_private(pages[i], 0);
1272 page_add_new_anon_rmap(pages[i], vma, haddr, false); 1257 page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
1273 mem_cgroup_commit_charge(pages[i], memcg, false, false); 1258 mem_cgroup_commit_charge(pages[i], memcg, false, false);
1274 lru_cache_add_active_or_unevictable(pages[i], vma); 1259 lru_cache_add_active_or_unevictable(pages[i], vma);
1275 pte = pte_offset_map(&_pmd, haddr); 1260 fe->pte = pte_offset_map(&_pmd, haddr);
1276 VM_BUG_ON(!pte_none(*pte)); 1261 VM_BUG_ON(!pte_none(*fe->pte));
1277 set_pte_at(mm, haddr, pte, entry); 1262 set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
1278 pte_unmap(pte); 1263 pte_unmap(fe->pte);
1279 } 1264 }
1280 kfree(pages); 1265 kfree(pages);
1281 1266
1282 smp_wmb(); /* make pte visible before pmd */ 1267 smp_wmb(); /* make pte visible before pmd */
1283 pmd_populate(mm, pmd, pgtable); 1268 pmd_populate(vma->vm_mm, fe->pmd, pgtable);
1284 page_remove_rmap(page, true); 1269 page_remove_rmap(page, true);
1285 spin_unlock(ptl); 1270 spin_unlock(fe->ptl);
1286 1271
1287 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1272 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
1288 1273
1289 ret |= VM_FAULT_WRITE; 1274 ret |= VM_FAULT_WRITE;
1290 put_page(page); 1275 put_page(page);
@@ -1293,8 +1278,8 @@ out:
1293 return ret; 1278 return ret;
1294 1279
1295out_free_pages: 1280out_free_pages:
1296 spin_unlock(ptl); 1281 spin_unlock(fe->ptl);
1297 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1282 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
1298 for (i = 0; i < HPAGE_PMD_NR; i++) { 1283 for (i = 0; i < HPAGE_PMD_NR; i++) {
1299 memcg = (void *)page_private(pages[i]); 1284 memcg = (void *)page_private(pages[i]);
1300 set_page_private(pages[i], 0); 1285 set_page_private(pages[i], 0);
@@ -1305,25 +1290,23 @@ out_free_pages:
1305 goto out; 1290 goto out;
1306} 1291}
1307 1292
1308int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 1293int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
1309 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
1310{ 1294{
1311 spinlock_t *ptl; 1295 struct vm_area_struct *vma = fe->vma;
1312 int ret = 0;
1313 struct page *page = NULL, *new_page; 1296 struct page *page = NULL, *new_page;
1314 struct mem_cgroup *memcg; 1297 struct mem_cgroup *memcg;
1315 unsigned long haddr; 1298 unsigned long haddr = fe->address & HPAGE_PMD_MASK;
1316 unsigned long mmun_start; /* For mmu_notifiers */ 1299 unsigned long mmun_start; /* For mmu_notifiers */
1317 unsigned long mmun_end; /* For mmu_notifiers */ 1300 unsigned long mmun_end; /* For mmu_notifiers */
1318 gfp_t huge_gfp; /* for allocation and charge */ 1301 gfp_t huge_gfp; /* for allocation and charge */
1302 int ret = 0;
1319 1303
1320 ptl = pmd_lockptr(mm, pmd); 1304 fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
1321 VM_BUG_ON_VMA(!vma->anon_vma, vma); 1305 VM_BUG_ON_VMA(!vma->anon_vma, vma);
1322 haddr = address & HPAGE_PMD_MASK;
1323 if (is_huge_zero_pmd(orig_pmd)) 1306 if (is_huge_zero_pmd(orig_pmd))
1324 goto alloc; 1307 goto alloc;
1325 spin_lock(ptl); 1308 spin_lock(fe->ptl);
1326 if (unlikely(!pmd_same(*pmd, orig_pmd))) 1309 if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
1327 goto out_unlock; 1310 goto out_unlock;
1328 1311
1329 page = pmd_page(orig_pmd); 1312 page = pmd_page(orig_pmd);
@@ -1336,13 +1319,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1336 pmd_t entry; 1319 pmd_t entry;
1337 entry = pmd_mkyoung(orig_pmd); 1320 entry = pmd_mkyoung(orig_pmd);
1338 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1321 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1339 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) 1322 if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1))
1340 update_mmu_cache_pmd(vma, address, pmd); 1323 update_mmu_cache_pmd(vma, fe->address, fe->pmd);
1341 ret |= VM_FAULT_WRITE; 1324 ret |= VM_FAULT_WRITE;
1342 goto out_unlock; 1325 goto out_unlock;
1343 } 1326 }
1344 get_page(page); 1327 get_page(page);
1345 spin_unlock(ptl); 1328 spin_unlock(fe->ptl);
1346alloc: 1329alloc:
1347 if (transparent_hugepage_enabled(vma) && 1330 if (transparent_hugepage_enabled(vma) &&
1348 !transparent_hugepage_debug_cow()) { 1331 !transparent_hugepage_debug_cow()) {
@@ -1355,13 +1338,12 @@ alloc:
1355 prep_transhuge_page(new_page); 1338 prep_transhuge_page(new_page);
1356 } else { 1339 } else {
1357 if (!page) { 1340 if (!page) {
1358 split_huge_pmd(vma, pmd, address); 1341 split_huge_pmd(vma, fe->pmd, fe->address);
1359 ret |= VM_FAULT_FALLBACK; 1342 ret |= VM_FAULT_FALLBACK;
1360 } else { 1343 } else {
1361 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 1344 ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page);
1362 pmd, orig_pmd, page, haddr);
1363 if (ret & VM_FAULT_OOM) { 1345 if (ret & VM_FAULT_OOM) {
1364 split_huge_pmd(vma, pmd, address); 1346 split_huge_pmd(vma, fe->pmd, fe->address);
1365 ret |= VM_FAULT_FALLBACK; 1347 ret |= VM_FAULT_FALLBACK;
1366 } 1348 }
1367 put_page(page); 1349 put_page(page);
@@ -1370,14 +1352,12 @@ alloc:
1370 goto out; 1352 goto out;
1371 } 1353 }
1372 1354
1373 if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg, 1355 if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
1374 true))) { 1356 huge_gfp, &memcg, true))) {
1375 put_page(new_page); 1357 put_page(new_page);
1376 if (page) { 1358 split_huge_pmd(vma, fe->pmd, fe->address);
1377 split_huge_pmd(vma, pmd, address); 1359 if (page)
1378 put_page(page); 1360 put_page(page);
1379 } else
1380 split_huge_pmd(vma, pmd, address);
1381 ret |= VM_FAULT_FALLBACK; 1361 ret |= VM_FAULT_FALLBACK;
1382 count_vm_event(THP_FAULT_FALLBACK); 1362 count_vm_event(THP_FAULT_FALLBACK);
1383 goto out; 1363 goto out;
@@ -1393,13 +1373,13 @@ alloc:
1393 1373
1394 mmun_start = haddr; 1374 mmun_start = haddr;
1395 mmun_end = haddr + HPAGE_PMD_SIZE; 1375 mmun_end = haddr + HPAGE_PMD_SIZE;
1396 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1376 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
1397 1377
1398 spin_lock(ptl); 1378 spin_lock(fe->ptl);
1399 if (page) 1379 if (page)
1400 put_page(page); 1380 put_page(page);
1401 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1381 if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) {
1402 spin_unlock(ptl); 1382 spin_unlock(fe->ptl);
1403 mem_cgroup_cancel_charge(new_page, memcg, true); 1383 mem_cgroup_cancel_charge(new_page, memcg, true);
1404 put_page(new_page); 1384 put_page(new_page);
1405 goto out_mn; 1385 goto out_mn;
@@ -1407,14 +1387,14 @@ alloc:
1407 pmd_t entry; 1387 pmd_t entry;
1408 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 1388 entry = mk_huge_pmd(new_page, vma->vm_page_prot);
1409 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1389 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1410 pmdp_huge_clear_flush_notify(vma, haddr, pmd); 1390 pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
1411 page_add_new_anon_rmap(new_page, vma, haddr, true); 1391 page_add_new_anon_rmap(new_page, vma, haddr, true);
1412 mem_cgroup_commit_charge(new_page, memcg, false, true); 1392 mem_cgroup_commit_charge(new_page, memcg, false, true);
1413 lru_cache_add_active_or_unevictable(new_page, vma); 1393 lru_cache_add_active_or_unevictable(new_page, vma);
1414 set_pmd_at(mm, haddr, pmd, entry); 1394 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
1415 update_mmu_cache_pmd(vma, address, pmd); 1395 update_mmu_cache_pmd(vma, fe->address, fe->pmd);
1416 if (!page) { 1396 if (!page) {
1417 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 1397 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1418 put_huge_zero_page(); 1398 put_huge_zero_page();
1419 } else { 1399 } else {
1420 VM_BUG_ON_PAGE(!PageHead(page), page); 1400 VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -1423,13 +1403,13 @@ alloc:
1423 } 1403 }
1424 ret |= VM_FAULT_WRITE; 1404 ret |= VM_FAULT_WRITE;
1425 } 1405 }
1426 spin_unlock(ptl); 1406 spin_unlock(fe->ptl);
1427out_mn: 1407out_mn:
1428 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1408 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
1429out: 1409out:
1430 return ret; 1410 return ret;
1431out_unlock: 1411out_unlock:
1432 spin_unlock(ptl); 1412 spin_unlock(fe->ptl);
1433 return ret; 1413 return ret;
1434} 1414}
1435 1415
@@ -1489,13 +1469,12 @@ out:
1489} 1469}
1490 1470
1491/* NUMA hinting page fault entry point for trans huge pmds */ 1471/* NUMA hinting page fault entry point for trans huge pmds */
1492int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 1472int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
1493 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
1494{ 1473{
1495 spinlock_t *ptl; 1474 struct vm_area_struct *vma = fe->vma;
1496 struct anon_vma *anon_vma = NULL; 1475 struct anon_vma *anon_vma = NULL;
1497 struct page *page; 1476 struct page *page;
1498 unsigned long haddr = addr & HPAGE_PMD_MASK; 1477 unsigned long haddr = fe->address & HPAGE_PMD_MASK;
1499 int page_nid = -1, this_nid = numa_node_id(); 1478 int page_nid = -1, this_nid = numa_node_id();
1500 int target_nid, last_cpupid = -1; 1479 int target_nid, last_cpupid = -1;
1501 bool page_locked; 1480 bool page_locked;
@@ -1506,8 +1485,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1506 /* A PROT_NONE fault should not end up here */ 1485 /* A PROT_NONE fault should not end up here */
1507 BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); 1486 BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
1508 1487
1509 ptl = pmd_lock(mm, pmdp); 1488 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
1510 if (unlikely(!pmd_same(pmd, *pmdp))) 1489 if (unlikely(!pmd_same(pmd, *fe->pmd)))
1511 goto out_unlock; 1490 goto out_unlock;
1512 1491
1513 /* 1492 /*
@@ -1515,9 +1494,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1515 * without disrupting NUMA hinting information. Do not relock and 1494 * without disrupting NUMA hinting information. Do not relock and
1516 * check_same as the page may no longer be mapped. 1495 * check_same as the page may no longer be mapped.
1517 */ 1496 */
1518 if (unlikely(pmd_trans_migrating(*pmdp))) { 1497 if (unlikely(pmd_trans_migrating(*fe->pmd))) {
1519 page = pmd_page(*pmdp); 1498 page = pmd_page(*fe->pmd);
1520 spin_unlock(ptl); 1499 spin_unlock(fe->ptl);
1521 wait_on_page_locked(page); 1500 wait_on_page_locked(page);
1522 goto out; 1501 goto out;
1523 } 1502 }
@@ -1550,7 +1529,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1550 1529
1551 /* Migration could have started since the pmd_trans_migrating check */ 1530 /* Migration could have started since the pmd_trans_migrating check */
1552 if (!page_locked) { 1531 if (!page_locked) {
1553 spin_unlock(ptl); 1532 spin_unlock(fe->ptl);
1554 wait_on_page_locked(page); 1533 wait_on_page_locked(page);
1555 page_nid = -1; 1534 page_nid = -1;
1556 goto out; 1535 goto out;
@@ -1561,12 +1540,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1561 * to serialises splits 1540 * to serialises splits
1562 */ 1541 */
1563 get_page(page); 1542 get_page(page);
1564 spin_unlock(ptl); 1543 spin_unlock(fe->ptl);
1565 anon_vma = page_lock_anon_vma_read(page); 1544 anon_vma = page_lock_anon_vma_read(page);
1566 1545
1567 /* Confirm the PMD did not change while page_table_lock was released */ 1546 /* Confirm the PMD did not change while page_table_lock was released */
1568 spin_lock(ptl); 1547 spin_lock(fe->ptl);
1569 if (unlikely(!pmd_same(pmd, *pmdp))) { 1548 if (unlikely(!pmd_same(pmd, *fe->pmd))) {
1570 unlock_page(page); 1549 unlock_page(page);
1571 put_page(page); 1550 put_page(page);
1572 page_nid = -1; 1551 page_nid = -1;
@@ -1584,9 +1563,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1584 * Migrate the THP to the requested node, returns with page unlocked 1563 * Migrate the THP to the requested node, returns with page unlocked
1585 * and access rights restored. 1564 * and access rights restored.
1586 */ 1565 */
1587 spin_unlock(ptl); 1566 spin_unlock(fe->ptl);
1588 migrated = migrate_misplaced_transhuge_page(mm, vma, 1567 migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
1589 pmdp, pmd, addr, page, target_nid); 1568 fe->pmd, pmd, fe->address, page, target_nid);
1590 if (migrated) { 1569 if (migrated) {
1591 flags |= TNF_MIGRATED; 1570 flags |= TNF_MIGRATED;
1592 page_nid = target_nid; 1571 page_nid = target_nid;
@@ -1601,18 +1580,18 @@ clear_pmdnuma:
1601 pmd = pmd_mkyoung(pmd); 1580 pmd = pmd_mkyoung(pmd);
1602 if (was_writable) 1581 if (was_writable)
1603 pmd = pmd_mkwrite(pmd); 1582 pmd = pmd_mkwrite(pmd);
1604 set_pmd_at(mm, haddr, pmdp, pmd); 1583 set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd);
1605 update_mmu_cache_pmd(vma, addr, pmdp); 1584 update_mmu_cache_pmd(vma, fe->address, fe->pmd);
1606 unlock_page(page); 1585 unlock_page(page);
1607out_unlock: 1586out_unlock:
1608 spin_unlock(ptl); 1587 spin_unlock(fe->ptl);
1609 1588
1610out: 1589out:
1611 if (anon_vma) 1590 if (anon_vma)
1612 page_unlock_anon_vma_read(anon_vma); 1591 page_unlock_anon_vma_read(anon_vma);
1613 1592
1614 if (page_nid != -1) 1593 if (page_nid != -1)
1615 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); 1594 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags);
1616 1595
1617 return 0; 1596 return 0;
1618} 1597}
@@ -2413,20 +2392,23 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
2413 struct vm_area_struct *vma, 2392 struct vm_area_struct *vma,
2414 unsigned long address, pmd_t *pmd) 2393 unsigned long address, pmd_t *pmd)
2415{ 2394{
2416 unsigned long _address; 2395 pte_t pteval;
2417 pte_t *pte, pteval;
2418 int swapped_in = 0, ret = 0; 2396 int swapped_in = 0, ret = 0;
2419 2397 struct fault_env fe = {
2420 pte = pte_offset_map(pmd, address); 2398 .vma = vma,
2421 for (_address = address; _address < address + HPAGE_PMD_NR*PAGE_SIZE; 2399 .address = address,
2422 pte++, _address += PAGE_SIZE) { 2400 .flags = FAULT_FLAG_ALLOW_RETRY,
2423 pteval = *pte; 2401 .pmd = pmd,
2402 };
2403
2404 fe.pte = pte_offset_map(pmd, address);
2405 for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
2406 fe.pte++, fe.address += PAGE_SIZE) {
2407 pteval = *fe.pte;
2424 if (!is_swap_pte(pteval)) 2408 if (!is_swap_pte(pteval))
2425 continue; 2409 continue;
2426 swapped_in++; 2410 swapped_in++;
2427 ret = do_swap_page(mm, vma, _address, pte, pmd, 2411 ret = do_swap_page(&fe, pteval);
2428 FAULT_FLAG_ALLOW_RETRY,
2429 pteval);
2430 /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ 2412 /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
2431 if (ret & VM_FAULT_RETRY) { 2413 if (ret & VM_FAULT_RETRY) {
2432 down_read(&mm->mmap_sem); 2414 down_read(&mm->mmap_sem);
@@ -2442,10 +2424,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
2442 return false; 2424 return false;
2443 } 2425 }
2444 /* pte is unmapped now, we need to map it */ 2426 /* pte is unmapped now, we need to map it */
2445 pte = pte_offset_map(pmd, _address); 2427 fe.pte = pte_offset_map(pmd, fe.address);
2446 } 2428 }
2447 pte--; 2429 fe.pte--;
2448 pte_unmap(pte); 2430 pte_unmap(fe.pte);
2449 trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1); 2431 trace_mm_collapse_huge_page_swapin(mm, swapped_in, 1);
2450 return true; 2432 return true;
2451} 2433}
diff --git a/mm/internal.h b/mm/internal.h
index e1531758122b..9b6a6c43ac39 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,9 +36,7 @@
36/* Do not use these with a slab allocator */ 36/* Do not use these with a slab allocator */
37#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) 37#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
38 38
39extern int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 39int do_swap_page(struct fault_env *fe, pte_t orig_pte);
40 unsigned long address, pte_t *page_table, pmd_t *pmd,
41 unsigned int flags, pte_t orig_pte);
42 40
43void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 41void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
44 unsigned long floor, unsigned long ceiling); 42 unsigned long floor, unsigned long ceiling);
diff --git a/mm/memory.c b/mm/memory.c
index 6bf2b8564376..72b520897339 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2070,13 +2070,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2070 * case, all we need to do here is to mark the page as writable and update 2070 * case, all we need to do here is to mark the page as writable and update
2071 * any related book-keeping. 2071 * any related book-keeping.
2072 */ 2072 */
2073static inline int wp_page_reuse(struct mm_struct *mm, 2073static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
2074 struct vm_area_struct *vma, unsigned long address, 2074 struct page *page, int page_mkwrite, int dirty_shared)
2075 pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, 2075 __releases(fe->ptl)
2076 struct page *page, int page_mkwrite,
2077 int dirty_shared)
2078 __releases(ptl)
2079{ 2076{
2077 struct vm_area_struct *vma = fe->vma;
2080 pte_t entry; 2078 pte_t entry;
2081 /* 2079 /*
2082 * Clear the pages cpupid information as the existing 2080 * Clear the pages cpupid information as the existing
@@ -2086,12 +2084,12 @@ static inline int wp_page_reuse(struct mm_struct *mm,
2086 if (page) 2084 if (page)
2087 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); 2085 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2088 2086
2089 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2087 flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
2090 entry = pte_mkyoung(orig_pte); 2088 entry = pte_mkyoung(orig_pte);
2091 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2089 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2092 if (ptep_set_access_flags(vma, address, page_table, entry, 1)) 2090 if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
2093 update_mmu_cache(vma, address, page_table); 2091 update_mmu_cache(vma, fe->address, fe->pte);
2094 pte_unmap_unlock(page_table, ptl); 2092 pte_unmap_unlock(fe->pte, fe->ptl);
2095 2093
2096 if (dirty_shared) { 2094 if (dirty_shared) {
2097 struct address_space *mapping; 2095 struct address_space *mapping;
@@ -2137,30 +2135,31 @@ static inline int wp_page_reuse(struct mm_struct *mm,
2137 * held to the old page, as well as updating the rmap. 2135 * held to the old page, as well as updating the rmap.
2138 * - In any case, unlock the PTL and drop the reference we took to the old page. 2136 * - In any case, unlock the PTL and drop the reference we took to the old page.
2139 */ 2137 */
2140static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, 2138static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2141 unsigned long address, pte_t *page_table, pmd_t *pmd, 2139 struct page *old_page)
2142 pte_t orig_pte, struct page *old_page)
2143{ 2140{
2141 struct vm_area_struct *vma = fe->vma;
2142 struct mm_struct *mm = vma->vm_mm;
2144 struct page *new_page = NULL; 2143 struct page *new_page = NULL;
2145 spinlock_t *ptl = NULL;
2146 pte_t entry; 2144 pte_t entry;
2147 int page_copied = 0; 2145 int page_copied = 0;
2148 const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */ 2146 const unsigned long mmun_start = fe->address & PAGE_MASK;
2149 const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */ 2147 const unsigned long mmun_end = mmun_start + PAGE_SIZE;
2150 struct mem_cgroup *memcg; 2148 struct mem_cgroup *memcg;
2151 2149
2152 if (unlikely(anon_vma_prepare(vma))) 2150 if (unlikely(anon_vma_prepare(vma)))
2153 goto oom; 2151 goto oom;
2154 2152
2155 if (is_zero_pfn(pte_pfn(orig_pte))) { 2153 if (is_zero_pfn(pte_pfn(orig_pte))) {
2156 new_page = alloc_zeroed_user_highpage_movable(vma, address); 2154 new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
2157 if (!new_page) 2155 if (!new_page)
2158 goto oom; 2156 goto oom;
2159 } else { 2157 } else {
2160 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2158 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2159 fe->address);
2161 if (!new_page) 2160 if (!new_page)
2162 goto oom; 2161 goto oom;
2163 cow_user_page(new_page, old_page, address, vma); 2162 cow_user_page(new_page, old_page, fe->address, vma);
2164 } 2163 }
2165 2164
2166 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) 2165 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2173,8 +2172,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2173 /* 2172 /*
2174 * Re-check the pte - we dropped the lock 2173 * Re-check the pte - we dropped the lock
2175 */ 2174 */
2176 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2175 fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
2177 if (likely(pte_same(*page_table, orig_pte))) { 2176 if (likely(pte_same(*fe->pte, orig_pte))) {
2178 if (old_page) { 2177 if (old_page) {
2179 if (!PageAnon(old_page)) { 2178 if (!PageAnon(old_page)) {
2180 dec_mm_counter_fast(mm, 2179 dec_mm_counter_fast(mm,
@@ -2184,7 +2183,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2184 } else { 2183 } else {
2185 inc_mm_counter_fast(mm, MM_ANONPAGES); 2184 inc_mm_counter_fast(mm, MM_ANONPAGES);
2186 } 2185 }
2187 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2186 flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
2188 entry = mk_pte(new_page, vma->vm_page_prot); 2187 entry = mk_pte(new_page, vma->vm_page_prot);
2189 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2188 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2190 /* 2189 /*
@@ -2193,8 +2192,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2193 * seen in the presence of one thread doing SMC and another 2192 * seen in the presence of one thread doing SMC and another
2194 * thread doing COW. 2193 * thread doing COW.
2195 */ 2194 */
2196 ptep_clear_flush_notify(vma, address, page_table); 2195 ptep_clear_flush_notify(vma, fe->address, fe->pte);
2197 page_add_new_anon_rmap(new_page, vma, address, false); 2196 page_add_new_anon_rmap(new_page, vma, fe->address, false);
2198 mem_cgroup_commit_charge(new_page, memcg, false, false); 2197 mem_cgroup_commit_charge(new_page, memcg, false, false);
2199 lru_cache_add_active_or_unevictable(new_page, vma); 2198 lru_cache_add_active_or_unevictable(new_page, vma);
2200 /* 2199 /*
@@ -2202,8 +2201,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2202 * mmu page tables (such as kvm shadow page tables), we want the 2201 * mmu page tables (such as kvm shadow page tables), we want the
2203 * new page to be mapped directly into the secondary page table. 2202 * new page to be mapped directly into the secondary page table.
2204 */ 2203 */
2205 set_pte_at_notify(mm, address, page_table, entry); 2204 set_pte_at_notify(mm, fe->address, fe->pte, entry);
2206 update_mmu_cache(vma, address, page_table); 2205 update_mmu_cache(vma, fe->address, fe->pte);
2207 if (old_page) { 2206 if (old_page) {
2208 /* 2207 /*
2209 * Only after switching the pte to the new page may 2208 * Only after switching the pte to the new page may
@@ -2240,7 +2239,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2240 if (new_page) 2239 if (new_page)
2241 put_page(new_page); 2240 put_page(new_page);
2242 2241
2243 pte_unmap_unlock(page_table, ptl); 2242 pte_unmap_unlock(fe->pte, fe->ptl);
2244 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2243 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2245 if (old_page) { 2244 if (old_page) {
2246 /* 2245 /*
@@ -2268,44 +2267,43 @@ oom:
2268 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED 2267 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
2269 * mapping 2268 * mapping
2270 */ 2269 */
2271static int wp_pfn_shared(struct mm_struct *mm, 2270static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte)
2272 struct vm_area_struct *vma, unsigned long address,
2273 pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
2274 pmd_t *pmd)
2275{ 2271{
2272 struct vm_area_struct *vma = fe->vma;
2273
2276 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { 2274 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2277 struct vm_fault vmf = { 2275 struct vm_fault vmf = {
2278 .page = NULL, 2276 .page = NULL,
2279 .pgoff = linear_page_index(vma, address), 2277 .pgoff = linear_page_index(vma, fe->address),
2280 .virtual_address = (void __user *)(address & PAGE_MASK), 2278 .virtual_address =
2279 (void __user *)(fe->address & PAGE_MASK),
2281 .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, 2280 .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
2282 }; 2281 };
2283 int ret; 2282 int ret;
2284 2283
2285 pte_unmap_unlock(page_table, ptl); 2284 pte_unmap_unlock(fe->pte, fe->ptl);
2286 ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); 2285 ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
2287 if (ret & VM_FAULT_ERROR) 2286 if (ret & VM_FAULT_ERROR)
2288 return ret; 2287 return ret;
2289 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2288 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2289 &fe->ptl);
2290 /* 2290 /*
2291 * We might have raced with another page fault while we 2291 * We might have raced with another page fault while we
2292 * released the pte_offset_map_lock. 2292 * released the pte_offset_map_lock.
2293 */ 2293 */
2294 if (!pte_same(*page_table, orig_pte)) { 2294 if (!pte_same(*fe->pte, orig_pte)) {
2295 pte_unmap_unlock(page_table, ptl); 2295 pte_unmap_unlock(fe->pte, fe->ptl);
2296 return 0; 2296 return 0;
2297 } 2297 }
2298 } 2298 }
2299 return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, 2299 return wp_page_reuse(fe, orig_pte, NULL, 0, 0);
2300 NULL, 0, 0);
2301} 2300}
2302 2301
2303static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, 2302static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
2304 unsigned long address, pte_t *page_table, 2303 struct page *old_page)
2305 pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, 2304 __releases(fe->ptl)
2306 struct page *old_page)
2307 __releases(ptl)
2308{ 2305{
2306 struct vm_area_struct *vma = fe->vma;
2309 int page_mkwrite = 0; 2307 int page_mkwrite = 0;
2310 2308
2311 get_page(old_page); 2309 get_page(old_page);
@@ -2313,8 +2311,8 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
2313 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 2311 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2314 int tmp; 2312 int tmp;
2315 2313
2316 pte_unmap_unlock(page_table, ptl); 2314 pte_unmap_unlock(fe->pte, fe->ptl);
2317 tmp = do_page_mkwrite(vma, old_page, address); 2315 tmp = do_page_mkwrite(vma, old_page, fe->address);
2318 if (unlikely(!tmp || (tmp & 2316 if (unlikely(!tmp || (tmp &
2319 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 2317 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2320 put_page(old_page); 2318 put_page(old_page);
@@ -2326,19 +2324,18 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
2326 * they did, we just return, as we can count on the 2324 * they did, we just return, as we can count on the
2327 * MMU to tell us if they didn't also make it writable. 2325 * MMU to tell us if they didn't also make it writable.
2328 */ 2326 */
2329 page_table = pte_offset_map_lock(mm, pmd, address, 2327 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2330 &ptl); 2328 &fe->ptl);
2331 if (!pte_same(*page_table, orig_pte)) { 2329 if (!pte_same(*fe->pte, orig_pte)) {
2332 unlock_page(old_page); 2330 unlock_page(old_page);
2333 pte_unmap_unlock(page_table, ptl); 2331 pte_unmap_unlock(fe->pte, fe->ptl);
2334 put_page(old_page); 2332 put_page(old_page);
2335 return 0; 2333 return 0;
2336 } 2334 }
2337 page_mkwrite = 1; 2335 page_mkwrite = 1;
2338 } 2336 }
2339 2337
2340 return wp_page_reuse(mm, vma, address, page_table, ptl, 2338 return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
2341 orig_pte, old_page, page_mkwrite, 1);
2342} 2339}
2343 2340
2344/* 2341/*
@@ -2359,14 +2356,13 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
2359 * but allow concurrent faults), with pte both mapped and locked. 2356 * but allow concurrent faults), with pte both mapped and locked.
2360 * We return with mmap_sem still held, but pte unmapped and unlocked. 2357 * We return with mmap_sem still held, but pte unmapped and unlocked.
2361 */ 2358 */
2362static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 2359static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
2363 unsigned long address, pte_t *page_table, pmd_t *pmd, 2360 __releases(fe->ptl)
2364 spinlock_t *ptl, pte_t orig_pte)
2365 __releases(ptl)
2366{ 2361{
2362 struct vm_area_struct *vma = fe->vma;
2367 struct page *old_page; 2363 struct page *old_page;
2368 2364
2369 old_page = vm_normal_page(vma, address, orig_pte); 2365 old_page = vm_normal_page(vma, fe->address, orig_pte);
2370 if (!old_page) { 2366 if (!old_page) {
2371 /* 2367 /*
2372 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a 2368 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
@@ -2377,12 +2373,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2377 */ 2373 */
2378 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2374 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2379 (VM_WRITE|VM_SHARED)) 2375 (VM_WRITE|VM_SHARED))
2380 return wp_pfn_shared(mm, vma, address, page_table, ptl, 2376 return wp_pfn_shared(fe, orig_pte);
2381 orig_pte, pmd);
2382 2377
2383 pte_unmap_unlock(page_table, ptl); 2378 pte_unmap_unlock(fe->pte, fe->ptl);
2384 return wp_page_copy(mm, vma, address, page_table, pmd, 2379 return wp_page_copy(fe, orig_pte, old_page);
2385 orig_pte, old_page);
2386 } 2380 }
2387 2381
2388 /* 2382 /*
@@ -2393,13 +2387,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2393 int total_mapcount; 2387 int total_mapcount;
2394 if (!trylock_page(old_page)) { 2388 if (!trylock_page(old_page)) {
2395 get_page(old_page); 2389 get_page(old_page);
2396 pte_unmap_unlock(page_table, ptl); 2390 pte_unmap_unlock(fe->pte, fe->ptl);
2397 lock_page(old_page); 2391 lock_page(old_page);
2398 page_table = pte_offset_map_lock(mm, pmd, address, 2392 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
2399 &ptl); 2393 fe->address, &fe->ptl);
2400 if (!pte_same(*page_table, orig_pte)) { 2394 if (!pte_same(*fe->pte, orig_pte)) {
2401 unlock_page(old_page); 2395 unlock_page(old_page);
2402 pte_unmap_unlock(page_table, ptl); 2396 pte_unmap_unlock(fe->pte, fe->ptl);
2403 put_page(old_page); 2397 put_page(old_page);
2404 return 0; 2398 return 0;
2405 } 2399 }
@@ -2417,14 +2411,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2417 page_move_anon_rmap(old_page, vma); 2411 page_move_anon_rmap(old_page, vma);
2418 } 2412 }
2419 unlock_page(old_page); 2413 unlock_page(old_page);
2420 return wp_page_reuse(mm, vma, address, page_table, ptl, 2414 return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
2421 orig_pte, old_page, 0, 0);
2422 } 2415 }
2423 unlock_page(old_page); 2416 unlock_page(old_page);
2424 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2417 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2425 (VM_WRITE|VM_SHARED))) { 2418 (VM_WRITE|VM_SHARED))) {
2426 return wp_page_shared(mm, vma, address, page_table, pmd, 2419 return wp_page_shared(fe, orig_pte, old_page);
2427 ptl, orig_pte, old_page);
2428 } 2420 }
2429 2421
2430 /* 2422 /*
@@ -2432,9 +2424,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2432 */ 2424 */
2433 get_page(old_page); 2425 get_page(old_page);
2434 2426
2435 pte_unmap_unlock(page_table, ptl); 2427 pte_unmap_unlock(fe->pte, fe->ptl);
2436 return wp_page_copy(mm, vma, address, page_table, pmd, 2428 return wp_page_copy(fe, orig_pte, old_page);
2437 orig_pte, old_page);
2438} 2429}
2439 2430
2440static void unmap_mapping_range_vma(struct vm_area_struct *vma, 2431static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2522,11 +2513,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
2522 * We return with the mmap_sem locked or unlocked in the same cases 2513 * We return with the mmap_sem locked or unlocked in the same cases
2523 * as does filemap_fault(). 2514 * as does filemap_fault().
2524 */ 2515 */
2525int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 2516int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2526 unsigned long address, pte_t *page_table, pmd_t *pmd,
2527 unsigned int flags, pte_t orig_pte)
2528{ 2517{
2529 spinlock_t *ptl; 2518 struct vm_area_struct *vma = fe->vma;
2530 struct page *page, *swapcache; 2519 struct page *page, *swapcache;
2531 struct mem_cgroup *memcg; 2520 struct mem_cgroup *memcg;
2532 swp_entry_t entry; 2521 swp_entry_t entry;
@@ -2535,17 +2524,17 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2535 int exclusive = 0; 2524 int exclusive = 0;
2536 int ret = 0; 2525 int ret = 0;
2537 2526
2538 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2527 if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
2539 goto out; 2528 goto out;
2540 2529
2541 entry = pte_to_swp_entry(orig_pte); 2530 entry = pte_to_swp_entry(orig_pte);
2542 if (unlikely(non_swap_entry(entry))) { 2531 if (unlikely(non_swap_entry(entry))) {
2543 if (is_migration_entry(entry)) { 2532 if (is_migration_entry(entry)) {
2544 migration_entry_wait(mm, pmd, address); 2533 migration_entry_wait(vma->vm_mm, fe->pmd, fe->address);
2545 } else if (is_hwpoison_entry(entry)) { 2534 } else if (is_hwpoison_entry(entry)) {
2546 ret = VM_FAULT_HWPOISON; 2535 ret = VM_FAULT_HWPOISON;
2547 } else { 2536 } else {
2548 print_bad_pte(vma, address, orig_pte, NULL); 2537 print_bad_pte(vma, fe->address, orig_pte, NULL);
2549 ret = VM_FAULT_SIGBUS; 2538 ret = VM_FAULT_SIGBUS;
2550 } 2539 }
2551 goto out; 2540 goto out;
@@ -2554,14 +2543,15 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2554 page = lookup_swap_cache(entry); 2543 page = lookup_swap_cache(entry);
2555 if (!page) { 2544 if (!page) {
2556 page = swapin_readahead(entry, 2545 page = swapin_readahead(entry,
2557 GFP_HIGHUSER_MOVABLE, vma, address); 2546 GFP_HIGHUSER_MOVABLE, vma, fe->address);
2558 if (!page) { 2547 if (!page) {
2559 /* 2548 /*
2560 * Back out if somebody else faulted in this pte 2549 * Back out if somebody else faulted in this pte
2561 * while we released the pte lock. 2550 * while we released the pte lock.
2562 */ 2551 */
2563 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2552 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
2564 if (likely(pte_same(*page_table, orig_pte))) 2553 fe->address, &fe->ptl);
2554 if (likely(pte_same(*fe->pte, orig_pte)))
2565 ret = VM_FAULT_OOM; 2555 ret = VM_FAULT_OOM;
2566 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2556 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2567 goto unlock; 2557 goto unlock;
@@ -2570,7 +2560,7 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2570 /* Had to read the page from swap area: Major fault */ 2560 /* Had to read the page from swap area: Major fault */
2571 ret = VM_FAULT_MAJOR; 2561 ret = VM_FAULT_MAJOR;
2572 count_vm_event(PGMAJFAULT); 2562 count_vm_event(PGMAJFAULT);
2573 mem_cgroup_count_vm_event(mm, PGMAJFAULT); 2563 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
2574 } else if (PageHWPoison(page)) { 2564 } else if (PageHWPoison(page)) {
2575 /* 2565 /*
2576 * hwpoisoned dirty swapcache pages are kept for killing 2566 * hwpoisoned dirty swapcache pages are kept for killing
@@ -2583,7 +2573,7 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2583 } 2573 }
2584 2574
2585 swapcache = page; 2575 swapcache = page;
2586 locked = lock_page_or_retry(page, mm, flags); 2576 locked = lock_page_or_retry(page, vma->vm_mm, fe->flags);
2587 2577
2588 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2578 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2589 if (!locked) { 2579 if (!locked) {
@@ -2600,14 +2590,15 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2600 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 2590 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2601 goto out_page; 2591 goto out_page;
2602 2592
2603 page = ksm_might_need_to_copy(page, vma, address); 2593 page = ksm_might_need_to_copy(page, vma, fe->address);
2604 if (unlikely(!page)) { 2594 if (unlikely(!page)) {
2605 ret = VM_FAULT_OOM; 2595 ret = VM_FAULT_OOM;
2606 page = swapcache; 2596 page = swapcache;
2607 goto out_page; 2597 goto out_page;
2608 } 2598 }
2609 2599
2610 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) { 2600 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
2601 &memcg, false)) {
2611 ret = VM_FAULT_OOM; 2602 ret = VM_FAULT_OOM;
2612 goto out_page; 2603 goto out_page;
2613 } 2604 }
@@ -2615,8 +2606,9 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2615 /* 2606 /*
2616 * Back out if somebody else already faulted in this pte. 2607 * Back out if somebody else already faulted in this pte.
2617 */ 2608 */
2618 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2609 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2619 if (unlikely(!pte_same(*page_table, orig_pte))) 2610 &fe->ptl);
2611 if (unlikely(!pte_same(*fe->pte, orig_pte)))
2620 goto out_nomap; 2612 goto out_nomap;
2621 2613
2622 if (unlikely(!PageUptodate(page))) { 2614 if (unlikely(!PageUptodate(page))) {
@@ -2634,24 +2626,24 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2634 * must be called after the swap_free(), or it will never succeed. 2626 * must be called after the swap_free(), or it will never succeed.
2635 */ 2627 */
2636 2628
2637 inc_mm_counter_fast(mm, MM_ANONPAGES); 2629 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2638 dec_mm_counter_fast(mm, MM_SWAPENTS); 2630 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
2639 pte = mk_pte(page, vma->vm_page_prot); 2631 pte = mk_pte(page, vma->vm_page_prot);
2640 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { 2632 if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
2641 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2633 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2642 flags &= ~FAULT_FLAG_WRITE; 2634 fe->flags &= ~FAULT_FLAG_WRITE;
2643 ret |= VM_FAULT_WRITE; 2635 ret |= VM_FAULT_WRITE;
2644 exclusive = RMAP_EXCLUSIVE; 2636 exclusive = RMAP_EXCLUSIVE;
2645 } 2637 }
2646 flush_icache_page(vma, page); 2638 flush_icache_page(vma, page);
2647 if (pte_swp_soft_dirty(orig_pte)) 2639 if (pte_swp_soft_dirty(orig_pte))
2648 pte = pte_mksoft_dirty(pte); 2640 pte = pte_mksoft_dirty(pte);
2649 set_pte_at(mm, address, page_table, pte); 2641 set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
2650 if (page == swapcache) { 2642 if (page == swapcache) {
2651 do_page_add_anon_rmap(page, vma, address, exclusive); 2643 do_page_add_anon_rmap(page, vma, fe->address, exclusive);
2652 mem_cgroup_commit_charge(page, memcg, true, false); 2644 mem_cgroup_commit_charge(page, memcg, true, false);
2653 } else { /* ksm created a completely new copy */ 2645 } else { /* ksm created a completely new copy */
2654 page_add_new_anon_rmap(page, vma, address, false); 2646 page_add_new_anon_rmap(page, vma, fe->address, false);
2655 mem_cgroup_commit_charge(page, memcg, false, false); 2647 mem_cgroup_commit_charge(page, memcg, false, false);
2656 lru_cache_add_active_or_unevictable(page, vma); 2648 lru_cache_add_active_or_unevictable(page, vma);
2657 } 2649 }
@@ -2674,22 +2666,22 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2674 put_page(swapcache); 2666 put_page(swapcache);
2675 } 2667 }
2676 2668
2677 if (flags & FAULT_FLAG_WRITE) { 2669 if (fe->flags & FAULT_FLAG_WRITE) {
2678 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); 2670 ret |= do_wp_page(fe, pte);
2679 if (ret & VM_FAULT_ERROR) 2671 if (ret & VM_FAULT_ERROR)
2680 ret &= VM_FAULT_ERROR; 2672 ret &= VM_FAULT_ERROR;
2681 goto out; 2673 goto out;
2682 } 2674 }
2683 2675
2684 /* No need to invalidate - it was non-present before */ 2676 /* No need to invalidate - it was non-present before */
2685 update_mmu_cache(vma, address, page_table); 2677 update_mmu_cache(vma, fe->address, fe->pte);
2686unlock: 2678unlock:
2687 pte_unmap_unlock(page_table, ptl); 2679 pte_unmap_unlock(fe->pte, fe->ptl);
2688out: 2680out:
2689 return ret; 2681 return ret;
2690out_nomap: 2682out_nomap:
2691 mem_cgroup_cancel_charge(page, memcg, false); 2683 mem_cgroup_cancel_charge(page, memcg, false);
2692 pte_unmap_unlock(page_table, ptl); 2684 pte_unmap_unlock(fe->pte, fe->ptl);
2693out_page: 2685out_page:
2694 unlock_page(page); 2686 unlock_page(page);
2695out_release: 2687out_release:
@@ -2740,37 +2732,36 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
2740 * but allow concurrent faults), and pte mapped but not yet locked. 2732 * but allow concurrent faults), and pte mapped but not yet locked.
2741 * We return with mmap_sem still held, but pte unmapped and unlocked. 2733 * We return with mmap_sem still held, but pte unmapped and unlocked.
2742 */ 2734 */
2743static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 2735static int do_anonymous_page(struct fault_env *fe)
2744 unsigned long address, pte_t *page_table, pmd_t *pmd,
2745 unsigned int flags)
2746{ 2736{
2737 struct vm_area_struct *vma = fe->vma;
2747 struct mem_cgroup *memcg; 2738 struct mem_cgroup *memcg;
2748 struct page *page; 2739 struct page *page;
2749 spinlock_t *ptl;
2750 pte_t entry; 2740 pte_t entry;
2751 2741
2752 pte_unmap(page_table); 2742 pte_unmap(fe->pte);
2753 2743
2754 /* File mapping without ->vm_ops ? */ 2744 /* File mapping without ->vm_ops ? */
2755 if (vma->vm_flags & VM_SHARED) 2745 if (vma->vm_flags & VM_SHARED)
2756 return VM_FAULT_SIGBUS; 2746 return VM_FAULT_SIGBUS;
2757 2747
2758 /* Check if we need to add a guard page to the stack */ 2748 /* Check if we need to add a guard page to the stack */
2759 if (check_stack_guard_page(vma, address) < 0) 2749 if (check_stack_guard_page(vma, fe->address) < 0)
2760 return VM_FAULT_SIGSEGV; 2750 return VM_FAULT_SIGSEGV;
2761 2751
2762 /* Use the zero-page for reads */ 2752 /* Use the zero-page for reads */
2763 if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) { 2753 if (!(fe->flags & FAULT_FLAG_WRITE) &&
2764 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), 2754 !mm_forbids_zeropage(vma->vm_mm)) {
2755 entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
2765 vma->vm_page_prot)); 2756 vma->vm_page_prot));
2766 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2757 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2767 if (!pte_none(*page_table)) 2758 &fe->ptl);
2759 if (!pte_none(*fe->pte))
2768 goto unlock; 2760 goto unlock;
2769 /* Deliver the page fault to userland, check inside PT lock */ 2761 /* Deliver the page fault to userland, check inside PT lock */
2770 if (userfaultfd_missing(vma)) { 2762 if (userfaultfd_missing(vma)) {
2771 pte_unmap_unlock(page_table, ptl); 2763 pte_unmap_unlock(fe->pte, fe->ptl);
2772 return handle_userfault(vma, address, flags, 2764 return handle_userfault(fe, VM_UFFD_MISSING);
2773 VM_UFFD_MISSING);
2774 } 2765 }
2775 goto setpte; 2766 goto setpte;
2776 } 2767 }
@@ -2778,11 +2769,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2778 /* Allocate our own private page. */ 2769 /* Allocate our own private page. */
2779 if (unlikely(anon_vma_prepare(vma))) 2770 if (unlikely(anon_vma_prepare(vma)))
2780 goto oom; 2771 goto oom;
2781 page = alloc_zeroed_user_highpage_movable(vma, address); 2772 page = alloc_zeroed_user_highpage_movable(vma, fe->address);
2782 if (!page) 2773 if (!page)
2783 goto oom; 2774 goto oom;
2784 2775
2785 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) 2776 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
2786 goto oom_free_page; 2777 goto oom_free_page;
2787 2778
2788 /* 2779 /*
@@ -2796,30 +2787,30 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2796 if (vma->vm_flags & VM_WRITE) 2787 if (vma->vm_flags & VM_WRITE)
2797 entry = pte_mkwrite(pte_mkdirty(entry)); 2788 entry = pte_mkwrite(pte_mkdirty(entry));
2798 2789
2799 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2790 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2800 if (!pte_none(*page_table)) 2791 &fe->ptl);
2792 if (!pte_none(*fe->pte))
2801 goto release; 2793 goto release;
2802 2794
2803 /* Deliver the page fault to userland, check inside PT lock */ 2795 /* Deliver the page fault to userland, check inside PT lock */
2804 if (userfaultfd_missing(vma)) { 2796 if (userfaultfd_missing(vma)) {
2805 pte_unmap_unlock(page_table, ptl); 2797 pte_unmap_unlock(fe->pte, fe->ptl);
2806 mem_cgroup_cancel_charge(page, memcg, false); 2798 mem_cgroup_cancel_charge(page, memcg, false);
2807 put_page(page); 2799 put_page(page);
2808 return handle_userfault(vma, address, flags, 2800 return handle_userfault(fe, VM_UFFD_MISSING);
2809 VM_UFFD_MISSING);
2810 } 2801 }
2811 2802
2812 inc_mm_counter_fast(mm, MM_ANONPAGES); 2803 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2813 page_add_new_anon_rmap(page, vma, address, false); 2804 page_add_new_anon_rmap(page, vma, fe->address, false);
2814 mem_cgroup_commit_charge(page, memcg, false, false); 2805 mem_cgroup_commit_charge(page, memcg, false, false);
2815 lru_cache_add_active_or_unevictable(page, vma); 2806 lru_cache_add_active_or_unevictable(page, vma);
2816setpte: 2807setpte:
2817 set_pte_at(mm, address, page_table, entry); 2808 set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
2818 2809
2819 /* No need to invalidate - it was non-present before */ 2810 /* No need to invalidate - it was non-present before */
2820 update_mmu_cache(vma, address, page_table); 2811 update_mmu_cache(vma, fe->address, fe->pte);
2821unlock: 2812unlock:
2822 pte_unmap_unlock(page_table, ptl); 2813 pte_unmap_unlock(fe->pte, fe->ptl);
2823 return 0; 2814 return 0;
2824release: 2815release:
2825 mem_cgroup_cancel_charge(page, memcg, false); 2816 mem_cgroup_cancel_charge(page, memcg, false);
@@ -2836,17 +2827,16 @@ oom:
2836 * released depending on flags and vma->vm_ops->fault() return value. 2827 * released depending on flags and vma->vm_ops->fault() return value.
2837 * See filemap_fault() and __lock_page_retry(). 2828 * See filemap_fault() and __lock_page_retry().
2838 */ 2829 */
2839static int __do_fault(struct vm_area_struct *vma, unsigned long address, 2830static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
2840 pgoff_t pgoff, unsigned int flags, 2831 struct page *cow_page, struct page **page, void **entry)
2841 struct page *cow_page, struct page **page,
2842 void **entry)
2843{ 2832{
2833 struct vm_area_struct *vma = fe->vma;
2844 struct vm_fault vmf; 2834 struct vm_fault vmf;
2845 int ret; 2835 int ret;
2846 2836
2847 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 2837 vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
2848 vmf.pgoff = pgoff; 2838 vmf.pgoff = pgoff;
2849 vmf.flags = flags; 2839 vmf.flags = fe->flags;
2850 vmf.page = NULL; 2840 vmf.page = NULL;
2851 vmf.gfp_mask = __get_fault_gfp_mask(vma); 2841 vmf.gfp_mask = __get_fault_gfp_mask(vma);
2852 vmf.cow_page = cow_page; 2842 vmf.cow_page = cow_page;
@@ -2878,38 +2868,36 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
2878/** 2868/**
2879 * do_set_pte - setup new PTE entry for given page and add reverse page mapping. 2869 * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
2880 * 2870 *
2881 * @vma: virtual memory area 2871 * @fe: fault environment
2882 * @address: user virtual address
2883 * @page: page to map 2872 * @page: page to map
2884 * @pte: pointer to target page table entry
2885 * @write: true, if new entry is writable
2886 * @anon: true, if it's anonymous page
2887 * 2873 *
2888 * Caller must hold page table lock relevant for @pte. 2874 * Caller must hold page table lock relevant for @fe->pte.
2889 * 2875 *
2890 * Target users are page handler itself and implementations of 2876 * Target users are page handler itself and implementations of
2891 * vm_ops->map_pages. 2877 * vm_ops->map_pages.
2892 */ 2878 */
2893void do_set_pte(struct vm_area_struct *vma, unsigned long address, 2879void do_set_pte(struct fault_env *fe, struct page *page)
2894 struct page *page, pte_t *pte, bool write, bool anon)
2895{ 2880{
2881 struct vm_area_struct *vma = fe->vma;
2882 bool write = fe->flags & FAULT_FLAG_WRITE;
2896 pte_t entry; 2883 pte_t entry;
2897 2884
2898 flush_icache_page(vma, page); 2885 flush_icache_page(vma, page);
2899 entry = mk_pte(page, vma->vm_page_prot); 2886 entry = mk_pte(page, vma->vm_page_prot);
2900 if (write) 2887 if (write)
2901 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2888 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2902 if (anon) { 2889 /* copy-on-write page */
2890 if (write && !(vma->vm_flags & VM_SHARED)) {
2903 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2891 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2904 page_add_new_anon_rmap(page, vma, address, false); 2892 page_add_new_anon_rmap(page, vma, fe->address, false);
2905 } else { 2893 } else {
2906 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); 2894 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
2907 page_add_file_rmap(page); 2895 page_add_file_rmap(page);
2908 } 2896 }
2909 set_pte_at(vma->vm_mm, address, pte, entry); 2897 set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
2910 2898
2911 /* no need to invalidate: a not-present page won't be cached */ 2899 /* no need to invalidate: a not-present page won't be cached */
2912 update_mmu_cache(vma, address, pte); 2900 update_mmu_cache(vma, fe->address, fe->pte);
2913} 2901}
2914 2902
2915static unsigned long fault_around_bytes __read_mostly = 2903static unsigned long fault_around_bytes __read_mostly =
@@ -2976,57 +2964,53 @@ late_initcall(fault_around_debugfs);
2976 * fault_around_pages() value (and therefore to page order). This way it's 2964 * fault_around_pages() value (and therefore to page order). This way it's
2977 * easier to guarantee that we don't cross page table boundaries. 2965 * easier to guarantee that we don't cross page table boundaries.
2978 */ 2966 */
2979static void do_fault_around(struct vm_area_struct *vma, unsigned long address, 2967static void do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
2980 pte_t *pte, pgoff_t pgoff, unsigned int flags)
2981{ 2968{
2982 unsigned long start_addr, nr_pages, mask; 2969 unsigned long address = fe->address, start_addr, nr_pages, mask;
2983 pgoff_t max_pgoff; 2970 pte_t *pte = fe->pte;
2984 struct vm_fault vmf; 2971 pgoff_t end_pgoff;
2985 int off; 2972 int off;
2986 2973
2987 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; 2974 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
2988 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; 2975 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
2989 2976
2990 start_addr = max(address & mask, vma->vm_start); 2977 start_addr = max(fe->address & mask, fe->vma->vm_start);
2991 off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); 2978 off = ((fe->address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
2992 pte -= off; 2979 fe->pte -= off;
2993 pgoff -= off; 2980 start_pgoff -= off;
2994 2981
2995 /* 2982 /*
2996 * max_pgoff is either end of page table or end of vma 2983 * end_pgoff is either end of page table or end of vma
2997 * or fault_around_pages() from pgoff, depending what is nearest. 2984 * or fault_around_pages() from start_pgoff, depending what is nearest.
2998 */ 2985 */
2999 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + 2986 end_pgoff = start_pgoff -
2987 ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3000 PTRS_PER_PTE - 1; 2988 PTRS_PER_PTE - 1;
3001 max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, 2989 end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
3002 pgoff + nr_pages - 1); 2990 start_pgoff + nr_pages - 1);
3003 2991
3004 /* Check if it makes any sense to call ->map_pages */ 2992 /* Check if it makes any sense to call ->map_pages */
3005 while (!pte_none(*pte)) { 2993 fe->address = start_addr;
3006 if (++pgoff > max_pgoff) 2994 while (!pte_none(*fe->pte)) {
3007 return; 2995 if (++start_pgoff > end_pgoff)
3008 start_addr += PAGE_SIZE; 2996 goto out;
3009 if (start_addr >= vma->vm_end) 2997 fe->address += PAGE_SIZE;
3010 return; 2998 if (fe->address >= fe->vma->vm_end)
3011 pte++; 2999 goto out;
3000 fe->pte++;
3012 } 3001 }
3013 3002
3014 vmf.virtual_address = (void __user *) start_addr; 3003 fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
3015 vmf.pte = pte; 3004out:
3016 vmf.pgoff = pgoff; 3005 /* restore fault_env */
3017 vmf.max_pgoff = max_pgoff; 3006 fe->pte = pte;
3018 vmf.flags = flags; 3007 fe->address = address;
3019 vmf.gfp_mask = __get_fault_gfp_mask(vma);
3020 vma->vm_ops->map_pages(vma, &vmf);
3021} 3008}
3022 3009
3023static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3010static int do_read_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
3024 unsigned long address, pmd_t *pmd,
3025 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3026{ 3011{
3012 struct vm_area_struct *vma = fe->vma;
3027 struct page *fault_page; 3013 struct page *fault_page;
3028 spinlock_t *ptl;
3029 pte_t *pte;
3030 int ret = 0; 3014 int ret = 0;
3031 3015
3032 /* 3016 /*
@@ -3035,66 +3019,68 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3035 * something). 3019 * something).
3036 */ 3020 */
3037 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { 3021 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3038 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 3022 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
3039 do_fault_around(vma, address, pte, pgoff, flags); 3023 &fe->ptl);
3040 if (!pte_same(*pte, orig_pte)) 3024 if (!pte_same(*fe->pte, orig_pte))
3025 goto unlock_out;
3026 do_fault_around(fe, pgoff);
3027 /* Check if the fault is handled by faultaround */
3028 if (!pte_same(*fe->pte, orig_pte))
3041 goto unlock_out; 3029 goto unlock_out;
3042 pte_unmap_unlock(pte, ptl); 3030 pte_unmap_unlock(fe->pte, fe->ptl);
3043 } 3031 }
3044 3032
3045 ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); 3033 ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
3046 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3034 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3047 return ret; 3035 return ret;
3048 3036
3049 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 3037 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, &fe->ptl);
3050 if (unlikely(!pte_same(*pte, orig_pte))) { 3038 if (unlikely(!pte_same(*fe->pte, orig_pte))) {
3051 pte_unmap_unlock(pte, ptl); 3039 pte_unmap_unlock(fe->pte, fe->ptl);
3052 unlock_page(fault_page); 3040 unlock_page(fault_page);
3053 put_page(fault_page); 3041 put_page(fault_page);
3054 return ret; 3042 return ret;
3055 } 3043 }
3056 do_set_pte(vma, address, fault_page, pte, false, false); 3044 do_set_pte(fe, fault_page);
3057 unlock_page(fault_page); 3045 unlock_page(fault_page);
3058unlock_out: 3046unlock_out:
3059 pte_unmap_unlock(pte, ptl); 3047 pte_unmap_unlock(fe->pte, fe->ptl);
3060 return ret; 3048 return ret;
3061} 3049}
3062 3050
3063static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3051static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
3064 unsigned long address, pmd_t *pmd,
3065 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3066{ 3052{
3053 struct vm_area_struct *vma = fe->vma;
3067 struct page *fault_page, *new_page; 3054 struct page *fault_page, *new_page;
3068 void *fault_entry; 3055 void *fault_entry;
3069 struct mem_cgroup *memcg; 3056 struct mem_cgroup *memcg;
3070 spinlock_t *ptl;
3071 pte_t *pte;
3072 int ret; 3057 int ret;
3073 3058
3074 if (unlikely(anon_vma_prepare(vma))) 3059 if (unlikely(anon_vma_prepare(vma)))
3075 return VM_FAULT_OOM; 3060 return VM_FAULT_OOM;
3076 3061
3077 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 3062 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address);
3078 if (!new_page) 3063 if (!new_page)
3079 return VM_FAULT_OOM; 3064 return VM_FAULT_OOM;
3080 3065
3081 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { 3066 if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
3067 &memcg, false)) {
3082 put_page(new_page); 3068 put_page(new_page);
3083 return VM_FAULT_OOM; 3069 return VM_FAULT_OOM;
3084 } 3070 }
3085 3071
3086 ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page, 3072 ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry);
3087 &fault_entry);
3088 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3073 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3089 goto uncharge_out; 3074 goto uncharge_out;
3090 3075
3091 if (!(ret & VM_FAULT_DAX_LOCKED)) 3076 if (!(ret & VM_FAULT_DAX_LOCKED))
3092 copy_user_highpage(new_page, fault_page, address, vma); 3077 copy_user_highpage(new_page, fault_page, fe->address, vma);
3093 __SetPageUptodate(new_page); 3078 __SetPageUptodate(new_page);
3094 3079
3095 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 3080 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
3096 if (unlikely(!pte_same(*pte, orig_pte))) { 3081 &fe->ptl);
3097 pte_unmap_unlock(pte, ptl); 3082 if (unlikely(!pte_same(*fe->pte, orig_pte))) {
3083 pte_unmap_unlock(fe->pte, fe->ptl);
3098 if (!(ret & VM_FAULT_DAX_LOCKED)) { 3084 if (!(ret & VM_FAULT_DAX_LOCKED)) {
3099 unlock_page(fault_page); 3085 unlock_page(fault_page);
3100 put_page(fault_page); 3086 put_page(fault_page);
@@ -3104,10 +3090,10 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3104 } 3090 }
3105 goto uncharge_out; 3091 goto uncharge_out;
3106 } 3092 }
3107 do_set_pte(vma, address, new_page, pte, true, true); 3093 do_set_pte(fe, new_page);
3108 mem_cgroup_commit_charge(new_page, memcg, false, false); 3094 mem_cgroup_commit_charge(new_page, memcg, false, false);
3109 lru_cache_add_active_or_unevictable(new_page, vma); 3095 lru_cache_add_active_or_unevictable(new_page, vma);
3110 pte_unmap_unlock(pte, ptl); 3096 pte_unmap_unlock(fe->pte, fe->ptl);
3111 if (!(ret & VM_FAULT_DAX_LOCKED)) { 3097 if (!(ret & VM_FAULT_DAX_LOCKED)) {
3112 unlock_page(fault_page); 3098 unlock_page(fault_page);
3113 put_page(fault_page); 3099 put_page(fault_page);
@@ -3121,18 +3107,15 @@ uncharge_out:
3121 return ret; 3107 return ret;
3122} 3108}
3123 3109
3124static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3110static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
3125 unsigned long address, pmd_t *pmd,
3126 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3127{ 3111{
3112 struct vm_area_struct *vma = fe->vma;
3128 struct page *fault_page; 3113 struct page *fault_page;
3129 struct address_space *mapping; 3114 struct address_space *mapping;
3130 spinlock_t *ptl;
3131 pte_t *pte;
3132 int dirtied = 0; 3115 int dirtied = 0;
3133 int ret, tmp; 3116 int ret, tmp;
3134 3117
3135 ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); 3118 ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
3136 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3119 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3137 return ret; 3120 return ret;
3138 3121
@@ -3142,7 +3125,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3142 */ 3125 */
3143 if (vma->vm_ops->page_mkwrite) { 3126 if (vma->vm_ops->page_mkwrite) {
3144 unlock_page(fault_page); 3127 unlock_page(fault_page);
3145 tmp = do_page_mkwrite(vma, fault_page, address); 3128 tmp = do_page_mkwrite(vma, fault_page, fe->address);
3146 if (unlikely(!tmp || 3129 if (unlikely(!tmp ||
3147 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 3130 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3148 put_page(fault_page); 3131 put_page(fault_page);
@@ -3150,15 +3133,16 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3150 } 3133 }
3151 } 3134 }
3152 3135
3153 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 3136 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
3154 if (unlikely(!pte_same(*pte, orig_pte))) { 3137 &fe->ptl);
3155 pte_unmap_unlock(pte, ptl); 3138 if (unlikely(!pte_same(*fe->pte, orig_pte))) {
3139 pte_unmap_unlock(fe->pte, fe->ptl);
3156 unlock_page(fault_page); 3140 unlock_page(fault_page);
3157 put_page(fault_page); 3141 put_page(fault_page);
3158 return ret; 3142 return ret;
3159 } 3143 }
3160 do_set_pte(vma, address, fault_page, pte, true, false); 3144 do_set_pte(fe, fault_page);
3161 pte_unmap_unlock(pte, ptl); 3145 pte_unmap_unlock(fe->pte, fe->ptl);
3162 3146
3163 if (set_page_dirty(fault_page)) 3147 if (set_page_dirty(fault_page))
3164 dirtied = 1; 3148 dirtied = 1;
@@ -3190,23 +3174,20 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3190 * The mmap_sem may have been released depending on flags and our 3174 * The mmap_sem may have been released depending on flags and our
3191 * return value. See filemap_fault() and __lock_page_or_retry(). 3175 * return value. See filemap_fault() and __lock_page_or_retry().
3192 */ 3176 */
3193static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3177static int do_fault(struct fault_env *fe, pte_t orig_pte)
3194 unsigned long address, pte_t *page_table, pmd_t *pmd,
3195 unsigned int flags, pte_t orig_pte)
3196{ 3178{
3197 pgoff_t pgoff = linear_page_index(vma, address); 3179 struct vm_area_struct *vma = fe->vma;
3180 pgoff_t pgoff = linear_page_index(vma, fe->address);
3198 3181
3199 pte_unmap(page_table); 3182 pte_unmap(fe->pte);
3200 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ 3183 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
3201 if (!vma->vm_ops->fault) 3184 if (!vma->vm_ops->fault)
3202 return VM_FAULT_SIGBUS; 3185 return VM_FAULT_SIGBUS;
3203 if (!(flags & FAULT_FLAG_WRITE)) 3186 if (!(fe->flags & FAULT_FLAG_WRITE))
3204 return do_read_fault(mm, vma, address, pmd, pgoff, flags, 3187 return do_read_fault(fe, pgoff, orig_pte);
3205 orig_pte);
3206 if (!(vma->vm_flags & VM_SHARED)) 3188 if (!(vma->vm_flags & VM_SHARED))
3207 return do_cow_fault(mm, vma, address, pmd, pgoff, flags, 3189 return do_cow_fault(fe, pgoff, orig_pte);
3208 orig_pte); 3190 return do_shared_fault(fe, pgoff, orig_pte);
3209 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3210} 3191}
3211 3192
3212static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3193static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3224,11 +3205,10 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3224 return mpol_misplaced(page, vma, addr); 3205 return mpol_misplaced(page, vma, addr);
3225} 3206}
3226 3207
3227static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 3208static int do_numa_page(struct fault_env *fe, pte_t pte)
3228 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3229{ 3209{
3210 struct vm_area_struct *vma = fe->vma;
3230 struct page *page = NULL; 3211 struct page *page = NULL;
3231 spinlock_t *ptl;
3232 int page_nid = -1; 3212 int page_nid = -1;
3233 int last_cpupid; 3213 int last_cpupid;
3234 int target_nid; 3214 int target_nid;
@@ -3248,10 +3228,10 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3248 * page table entry is not accessible, so there would be no 3228 * page table entry is not accessible, so there would be no
3249 * concurrent hardware modifications to the PTE. 3229 * concurrent hardware modifications to the PTE.
3250 */ 3230 */
3251 ptl = pte_lockptr(mm, pmd); 3231 fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
3252 spin_lock(ptl); 3232 spin_lock(fe->ptl);
3253 if (unlikely(!pte_same(*ptep, pte))) { 3233 if (unlikely(!pte_same(*fe->pte, pte))) {
3254 pte_unmap_unlock(ptep, ptl); 3234 pte_unmap_unlock(fe->pte, fe->ptl);
3255 goto out; 3235 goto out;
3256 } 3236 }
3257 3237
@@ -3260,18 +3240,18 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3260 pte = pte_mkyoung(pte); 3240 pte = pte_mkyoung(pte);
3261 if (was_writable) 3241 if (was_writable)
3262 pte = pte_mkwrite(pte); 3242 pte = pte_mkwrite(pte);
3263 set_pte_at(mm, addr, ptep, pte); 3243 set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
3264 update_mmu_cache(vma, addr, ptep); 3244 update_mmu_cache(vma, fe->address, fe->pte);
3265 3245
3266 page = vm_normal_page(vma, addr, pte); 3246 page = vm_normal_page(vma, fe->address, pte);
3267 if (!page) { 3247 if (!page) {
3268 pte_unmap_unlock(ptep, ptl); 3248 pte_unmap_unlock(fe->pte, fe->ptl);
3269 return 0; 3249 return 0;
3270 } 3250 }
3271 3251
3272 /* TODO: handle PTE-mapped THP */ 3252 /* TODO: handle PTE-mapped THP */
3273 if (PageCompound(page)) { 3253 if (PageCompound(page)) {
3274 pte_unmap_unlock(ptep, ptl); 3254 pte_unmap_unlock(fe->pte, fe->ptl);
3275 return 0; 3255 return 0;
3276 } 3256 }
3277 3257
@@ -3295,8 +3275,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3295 3275
3296 last_cpupid = page_cpupid_last(page); 3276 last_cpupid = page_cpupid_last(page);
3297 page_nid = page_to_nid(page); 3277 page_nid = page_to_nid(page);
3298 target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); 3278 target_nid = numa_migrate_prep(page, vma, fe->address, page_nid,
3299 pte_unmap_unlock(ptep, ptl); 3279 &flags);
3280 pte_unmap_unlock(fe->pte, fe->ptl);
3300 if (target_nid == -1) { 3281 if (target_nid == -1) {
3301 put_page(page); 3282 put_page(page);
3302 goto out; 3283 goto out;
@@ -3316,24 +3297,24 @@ out:
3316 return 0; 3297 return 0;
3317} 3298}
3318 3299
3319static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, 3300static int create_huge_pmd(struct fault_env *fe)
3320 unsigned long address, pmd_t *pmd, unsigned int flags)
3321{ 3301{
3302 struct vm_area_struct *vma = fe->vma;
3322 if (vma_is_anonymous(vma)) 3303 if (vma_is_anonymous(vma))
3323 return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); 3304 return do_huge_pmd_anonymous_page(fe);
3324 if (vma->vm_ops->pmd_fault) 3305 if (vma->vm_ops->pmd_fault)
3325 return vma->vm_ops->pmd_fault(vma, address, pmd, flags); 3306 return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd,
3307 fe->flags);
3326 return VM_FAULT_FALLBACK; 3308 return VM_FAULT_FALLBACK;
3327} 3309}
3328 3310
3329static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, 3311static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
3330 unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
3331 unsigned int flags)
3332{ 3312{
3333 if (vma_is_anonymous(vma)) 3313 if (vma_is_anonymous(fe->vma))
3334 return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); 3314 return do_huge_pmd_wp_page(fe, orig_pmd);
3335 if (vma->vm_ops->pmd_fault) 3315 if (fe->vma->vm_ops->pmd_fault)
3336 return vma->vm_ops->pmd_fault(vma, address, pmd, flags); 3316 return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd,
3317 fe->flags);
3337 return VM_FAULT_FALLBACK; 3318 return VM_FAULT_FALLBACK;
3338} 3319}
3339 3320
@@ -3353,12 +3334,9 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
3353 * The mmap_sem may have been released depending on flags and our 3334 * The mmap_sem may have been released depending on flags and our
3354 * return value. See filemap_fault() and __lock_page_or_retry(). 3335 * return value. See filemap_fault() and __lock_page_or_retry().
3355 */ 3336 */
3356static int handle_pte_fault(struct mm_struct *mm, 3337static int handle_pte_fault(struct fault_env *fe)
3357 struct vm_area_struct *vma, unsigned long address,
3358 pte_t *pte, pmd_t *pmd, unsigned int flags)
3359{ 3338{
3360 pte_t entry; 3339 pte_t entry;
3361 spinlock_t *ptl;
3362 3340
3363 /* 3341 /*
3364 * some architectures can have larger ptes than wordsize, 3342 * some architectures can have larger ptes than wordsize,
@@ -3368,37 +3346,34 @@ static int handle_pte_fault(struct mm_struct *mm,
3368 * we later double check anyway with the ptl lock held. So here 3346 * we later double check anyway with the ptl lock held. So here
3369 * a barrier will do. 3347 * a barrier will do.
3370 */ 3348 */
3371 entry = *pte; 3349 entry = *fe->pte;
3372 barrier(); 3350 barrier();
3373 if (!pte_present(entry)) { 3351 if (!pte_present(entry)) {
3374 if (pte_none(entry)) { 3352 if (pte_none(entry)) {
3375 if (vma_is_anonymous(vma)) 3353 if (vma_is_anonymous(fe->vma))
3376 return do_anonymous_page(mm, vma, address, 3354 return do_anonymous_page(fe);
3377 pte, pmd, flags);
3378 else 3355 else
3379 return do_fault(mm, vma, address, pte, pmd, 3356 return do_fault(fe, entry);
3380 flags, entry);
3381 } 3357 }
3382 return do_swap_page(mm, vma, address, 3358 return do_swap_page(fe, entry);
3383 pte, pmd, flags, entry);
3384 } 3359 }
3385 3360
3386 if (pte_protnone(entry)) 3361 if (pte_protnone(entry))
3387 return do_numa_page(mm, vma, address, entry, pte, pmd); 3362 return do_numa_page(fe, entry);
3388 3363
3389 ptl = pte_lockptr(mm, pmd); 3364 fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
3390 spin_lock(ptl); 3365 spin_lock(fe->ptl);
3391 if (unlikely(!pte_same(*pte, entry))) 3366 if (unlikely(!pte_same(*fe->pte, entry)))
3392 goto unlock; 3367 goto unlock;
3393 if (flags & FAULT_FLAG_WRITE) { 3368 if (fe->flags & FAULT_FLAG_WRITE) {
3394 if (!pte_write(entry)) 3369 if (!pte_write(entry))
3395 return do_wp_page(mm, vma, address, 3370 return do_wp_page(fe, entry);
3396 pte, pmd, ptl, entry);
3397 entry = pte_mkdirty(entry); 3371 entry = pte_mkdirty(entry);
3398 } 3372 }
3399 entry = pte_mkyoung(entry); 3373 entry = pte_mkyoung(entry);
3400 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { 3374 if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry,
3401 update_mmu_cache(vma, address, pte); 3375 fe->flags & FAULT_FLAG_WRITE)) {
3376 update_mmu_cache(fe->vma, fe->address, fe->pte);
3402 } else { 3377 } else {
3403 /* 3378 /*
3404 * This is needed only for protection faults but the arch code 3379 * This is needed only for protection faults but the arch code
@@ -3406,11 +3381,11 @@ static int handle_pte_fault(struct mm_struct *mm,
3406 * This still avoids useless tlb flushes for .text page faults 3381 * This still avoids useless tlb flushes for .text page faults
3407 * with threads. 3382 * with threads.
3408 */ 3383 */
3409 if (flags & FAULT_FLAG_WRITE) 3384 if (fe->flags & FAULT_FLAG_WRITE)
3410 flush_tlb_fix_spurious_fault(vma, address); 3385 flush_tlb_fix_spurious_fault(fe->vma, fe->address);
3411 } 3386 }
3412unlock: 3387unlock:
3413 pte_unmap_unlock(pte, ptl); 3388 pte_unmap_unlock(fe->pte, fe->ptl);
3414 return 0; 3389 return 0;
3415} 3390}
3416 3391
@@ -3423,51 +3398,42 @@ unlock:
3423static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, 3398static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3424 unsigned int flags) 3399 unsigned int flags)
3425{ 3400{
3401 struct fault_env fe = {
3402 .vma = vma,
3403 .address = address,
3404 .flags = flags,
3405 };
3426 struct mm_struct *mm = vma->vm_mm; 3406 struct mm_struct *mm = vma->vm_mm;
3427 pgd_t *pgd; 3407 pgd_t *pgd;
3428 pud_t *pud; 3408 pud_t *pud;
3429 pmd_t *pmd;
3430 pte_t *pte;
3431
3432 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
3433 flags & FAULT_FLAG_INSTRUCTION,
3434 flags & FAULT_FLAG_REMOTE))
3435 return VM_FAULT_SIGSEGV;
3436
3437 if (unlikely(is_vm_hugetlb_page(vma)))
3438 return hugetlb_fault(mm, vma, address, flags);
3439 3409
3440 pgd = pgd_offset(mm, address); 3410 pgd = pgd_offset(mm, address);
3441 pud = pud_alloc(mm, pgd, address); 3411 pud = pud_alloc(mm, pgd, address);
3442 if (!pud) 3412 if (!pud)
3443 return VM_FAULT_OOM; 3413 return VM_FAULT_OOM;
3444 pmd = pmd_alloc(mm, pud, address); 3414 fe.pmd = pmd_alloc(mm, pud, address);
3445 if (!pmd) 3415 if (!fe.pmd)
3446 return VM_FAULT_OOM; 3416 return VM_FAULT_OOM;
3447 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { 3417 if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
3448 int ret = create_huge_pmd(mm, vma, address, pmd, flags); 3418 int ret = create_huge_pmd(&fe);
3449 if (!(ret & VM_FAULT_FALLBACK)) 3419 if (!(ret & VM_FAULT_FALLBACK))
3450 return ret; 3420 return ret;
3451 } else { 3421 } else {
3452 pmd_t orig_pmd = *pmd; 3422 pmd_t orig_pmd = *fe.pmd;
3453 int ret; 3423 int ret;
3454 3424
3455 barrier(); 3425 barrier();
3456 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { 3426 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
3457 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3458
3459 if (pmd_protnone(orig_pmd)) 3427 if (pmd_protnone(orig_pmd))
3460 return do_huge_pmd_numa_page(mm, vma, address, 3428 return do_huge_pmd_numa_page(&fe, orig_pmd);
3461 orig_pmd, pmd);
3462 3429
3463 if (dirty && !pmd_write(orig_pmd)) { 3430 if ((fe.flags & FAULT_FLAG_WRITE) &&
3464 ret = wp_huge_pmd(mm, vma, address, pmd, 3431 !pmd_write(orig_pmd)) {
3465 orig_pmd, flags); 3432 ret = wp_huge_pmd(&fe, orig_pmd);
3466 if (!(ret & VM_FAULT_FALLBACK)) 3433 if (!(ret & VM_FAULT_FALLBACK))
3467 return ret; 3434 return ret;
3468 } else { 3435 } else {
3469 huge_pmd_set_accessed(mm, vma, address, pmd, 3436 huge_pmd_set_accessed(&fe, orig_pmd);
3470 orig_pmd, dirty);
3471 return 0; 3437 return 0;
3472 } 3438 }
3473 } 3439 }
@@ -3478,7 +3444,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3478 * run pte_offset_map on the pmd, if an huge pmd could 3444 * run pte_offset_map on the pmd, if an huge pmd could
3479 * materialize from under us from a different thread. 3445 * materialize from under us from a different thread.
3480 */ 3446 */
3481 if (unlikely(pte_alloc(mm, pmd, address))) 3447 if (unlikely(pte_alloc(fe.vma->vm_mm, fe.pmd, fe.address)))
3482 return VM_FAULT_OOM; 3448 return VM_FAULT_OOM;
3483 /* 3449 /*
3484 * If a huge pmd materialized under us just retry later. Use 3450 * If a huge pmd materialized under us just retry later. Use
@@ -3491,7 +3457,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3491 * through an atomic read in C, which is what pmd_trans_unstable() 3457 * through an atomic read in C, which is what pmd_trans_unstable()
3492 * provides. 3458 * provides.
3493 */ 3459 */
3494 if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd))) 3460 if (unlikely(pmd_trans_unstable(fe.pmd) || pmd_devmap(*fe.pmd)))
3495 return 0; 3461 return 0;
3496 /* 3462 /*
3497 * A regular pmd is established and it can't morph into a huge pmd 3463 * A regular pmd is established and it can't morph into a huge pmd
@@ -3499,9 +3465,9 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3499 * read mode and khugepaged takes it in write mode. So now it's 3465 * read mode and khugepaged takes it in write mode. So now it's
3500 * safe to run pte_offset_map(). 3466 * safe to run pte_offset_map().
3501 */ 3467 */
3502 pte = pte_offset_map(pmd, address); 3468 fe.pte = pte_offset_map(fe.pmd, fe.address);
3503 3469
3504 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3470 return handle_pte_fault(&fe);
3505} 3471}
3506 3472
3507/* 3473/*
@@ -3530,7 +3496,15 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3530 if (flags & FAULT_FLAG_USER) 3496 if (flags & FAULT_FLAG_USER)
3531 mem_cgroup_oom_enable(); 3497 mem_cgroup_oom_enable();
3532 3498
3533 ret = __handle_mm_fault(vma, address, flags); 3499 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
3500 flags & FAULT_FLAG_INSTRUCTION,
3501 flags & FAULT_FLAG_REMOTE))
3502 return VM_FAULT_SIGSEGV;
3503
3504 if (unlikely(is_vm_hugetlb_page(vma)))
3505 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
3506 else
3507 ret = __handle_mm_fault(vma, address, flags);
3534 3508
3535 if (flags & FAULT_FLAG_USER) { 3509 if (flags & FAULT_FLAG_USER) {
3536 mem_cgroup_oom_disable(); 3510 mem_cgroup_oom_disable();
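
Reading aid for the mm/memory.c hunks above: the old (mm, vma, address, pte, pmd, ptl, flags) argument lists are replaced by a single fault_env pointer. The sketch below is reconstructed only from the fe-> accesses visible in these hunks; the authoritative definition is in the include/linux/mm.h part of this patch (not shown in this section) and may carry extra comments or fields.

/* Minimal sketch, inferred from the fe-> accesses above; not quoted from mm.h. */
struct fault_env {
	struct vm_area_struct *vma;	/* faulting VMA; replaces the old vma argument */
	unsigned long address;		/* faulting virtual address */
	unsigned int flags;		/* FAULT_FLAG_xxx flags */
	pmd_t *pmd;			/* pmd entry covering 'address' */
	pte_t *pte;			/* pte entry for 'address', once mapped */
	spinlock_t *ptl;		/* page table lock guarding 'pte' */
};

Helpers such as wp_page_reuse(), do_set_pte(), do_fault_around() and the do_*_fault() family in the hunks above now take this single pointer, and __handle_mm_fault() fills it in before calling handle_pte_fault().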
diff --git a/mm/nommu.c b/mm/nommu.c
index c2e58880207f..95daf81a4855 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1809,7 +1809,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1809} 1809}
1810EXPORT_SYMBOL(filemap_fault); 1810EXPORT_SYMBOL(filemap_fault);
1811 1811
1812void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) 1812void filemap_map_pages(struct fault_env *fe,
1813 pgoff_t start_pgoff, pgoff_t end_pgoff)
1813{ 1814{
1814 BUG(); 1815 BUG();
1815} 1816}
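
The mm/nommu.c stub above also records the interface change for the ->map_pages() hook: instead of a struct vm_fault carrying the offset range, implementations now receive the fault_env plus explicit start/end offsets, matching the do_fault_around() call site in the mm/memory.c hunk. A before/after comparison of the prototype, as far as it can be read from the hunks in this patch:

/* old prototype (removed lines in the mm/nommu.c hunk) */
void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf);
	/* range arrived via vmf->pgoff .. vmf->max_pgoff, pte via vmf->pte */

/* new prototype (added lines in the mm/nommu.c hunk) */
void filemap_map_pages(struct fault_env *fe,
		pgoff_t start_pgoff, pgoff_t end_pgoff);
	/* range is passed explicitly; address, pte and flags travel in *fe */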