Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  859
1 file changed, 434 insertions, 425 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 08d8da39de28..455c3e628d52 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2034 * 2034 *
2035 * We do this without the lock held, so that it can sleep if it needs to. 2035 * We do this without the lock held, so that it can sleep if it needs to.
2036 */ 2036 */
2037static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, 2037static int do_page_mkwrite(struct vm_fault *vmf)
2038 unsigned long address)
2039{ 2038{
2040 struct vm_fault vmf;
2041 int ret; 2039 int ret;
2040 struct page *page = vmf->page;
2041 unsigned int old_flags = vmf->flags;
2042 2042
2043 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 2043 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2044 vmf.pgoff = page->index;
2045 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2046 vmf.gfp_mask = __get_fault_gfp_mask(vma);
2047 vmf.page = page;
2048 vmf.cow_page = NULL;
2049 2044
2050 ret = vma->vm_ops->page_mkwrite(vma, &vmf); 2045 ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf);
2046 /* Restore original flags so that caller is not surprised */
2047 vmf->flags = old_flags;
2051 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2048 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2052 return ret; 2049 return ret;
2053 if (unlikely(!(ret & VM_FAULT_LOCKED))) { 2050 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
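The hunk above is the heart of the conversion for do_page_mkwrite(): instead of assembling a temporary struct vm_fault on the stack, it passes the caller's vmf straight to the driver, saving and restoring vmf->flags around the call. For orientation, a hedged sketch of what a minimal ->page_mkwrite() handler looks like under the (vma, vmf) calling convention visible above; the handler name and body are illustrative, not part of this patch, and roughly follow what filemap-based handlers do:

static int example_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;	/* page handed in by do_page_mkwrite() */

	lock_page(page);
	if (page->mapping != vma->vm_file->f_mapping) {
		/* raced with truncate while waiting for the page lock */
		unlock_page(page);
		return VM_FAULT_NOPAGE;
	}
	/* ... make the backing storage writable for this page ... */
	return VM_FAULT_LOCKED;	/* page stays locked, matching the check above */
}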
@@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2063} 2060}
2064 2061
2065/* 2062/*
2063 * Handle dirtying of a page in shared file mapping on a write fault.
2064 *
2065 * The function expects the page to be locked and unlocks it.
2066 */
2067static void fault_dirty_shared_page(struct vm_area_struct *vma,
2068 struct page *page)
2069{
2070 struct address_space *mapping;
2071 bool dirtied;
2072 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2073
2074 dirtied = set_page_dirty(page);
2075 VM_BUG_ON_PAGE(PageAnon(page), page);
2076 /*
2077 * Take a local copy of the address_space - page.mapping may be zeroed
2078 * by truncate after unlock_page(). The address_space itself remains
2079 * pinned by vma->vm_file's reference. We rely on unlock_page()'s
2080 * release semantics to prevent the compiler from undoing this copying.
2081 */
2082 mapping = page_rmapping(page);
2083 unlock_page(page);
2084
2085 if ((dirtied || page_mkwrite) && mapping) {
2086 /*
2087 * Some device drivers do not set page.mapping
2088 * but still dirty their pages
2089 */
2090 balance_dirty_pages_ratelimited(mapping);
2091 }
2092
2093 if (!page_mkwrite)
2094 file_update_time(vma->vm_file);
2095}
2096
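fault_dirty_shared_page() is new here and centralizes the dirtying, writeback throttling and file time update that the old wp_page_reuse() and do_shared_fault() each open-coded. Its contract is easy to get wrong, so a hedged reminder of the caller side, modeled on wp_page_shared() further down; the wrapper function is purely illustrative:

static void example_finish_shared_write(struct vm_area_struct *vma,
					struct page *page)
{
	/*
	 * Illustrative only: the helper expects the page locked and unlocks
	 * it itself; the caller keeps (and later drops) its own reference.
	 */
	lock_page(page);
	fault_dirty_shared_page(vma, page);
	put_page(page);
}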
2097/*
2066 * Handle write page faults for pages that can be reused in the current vma 2098 * Handle write page faults for pages that can be reused in the current vma
2067 * 2099 *
2068 * This can happen either due to the mapping being with the VM_SHARED flag, 2100 * This can happen either due to the mapping being with the VM_SHARED flag,
@@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2070 * case, all we need to do here is to mark the page as writable and update 2102 * case, all we need to do here is to mark the page as writable and update
2071 * any related book-keeping. 2103 * any related book-keeping.
2072 */ 2104 */
2073static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, 2105static inline void wp_page_reuse(struct vm_fault *vmf)
2074 struct page *page, int page_mkwrite, int dirty_shared) 2106 __releases(vmf->ptl)
2075 __releases(fe->ptl)
2076{ 2107{
2077 struct vm_area_struct *vma = fe->vma; 2108 struct vm_area_struct *vma = vmf->vma;
2109 struct page *page = vmf->page;
2078 pte_t entry; 2110 pte_t entry;
2079 /* 2111 /*
2080 * Clear the pages cpupid information as the existing 2112 * Clear the pages cpupid information as the existing
@@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
2084 if (page) 2116 if (page)
2085 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); 2117 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2086 2118
2087 flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); 2119 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2088 entry = pte_mkyoung(orig_pte); 2120 entry = pte_mkyoung(vmf->orig_pte);
2089 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2121 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2090 if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) 2122 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2091 update_mmu_cache(vma, fe->address, fe->pte); 2123 update_mmu_cache(vma, vmf->address, vmf->pte);
2092 pte_unmap_unlock(fe->pte, fe->ptl); 2124 pte_unmap_unlock(vmf->pte, vmf->ptl);
2093
2094 if (dirty_shared) {
2095 struct address_space *mapping;
2096 int dirtied;
2097
2098 if (!page_mkwrite)
2099 lock_page(page);
2100
2101 dirtied = set_page_dirty(page);
2102 VM_BUG_ON_PAGE(PageAnon(page), page);
2103 mapping = page->mapping;
2104 unlock_page(page);
2105 put_page(page);
2106
2107 if ((dirtied || page_mkwrite) && mapping) {
2108 /*
2109 * Some device drivers do not set page.mapping
2110 * but still dirty their pages
2111 */
2112 balance_dirty_pages_ratelimited(mapping);
2113 }
2114
2115 if (!page_mkwrite)
2116 file_update_time(vma->vm_file);
2117 }
2118
2119 return VM_FAULT_WRITE;
2120} 2125}
2121 2126
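Note the changed contract: wp_page_reuse() is now void and only fixes up the PTE; the dirty accounting moved into fault_dirty_shared_page() above and the VM_FAULT_WRITE return value is supplied by the callers. A hedged sketch of the shape a reuse path now takes (compare wp_pfn_shared() and do_wp_page() below; the function name is made up):

static int example_reuse_path(struct vm_fault *vmf)
{
	/* PTE is mapped and locked on entry; wp_page_reuse() drops the lock */
	wp_page_reuse(vmf);
	return VM_FAULT_WRITE;
}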
2122/* 2127/*
@@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
2135 * held to the old page, as well as updating the rmap. 2140 * held to the old page, as well as updating the rmap.
2136 * - In any case, unlock the PTL and drop the reference we took to the old page. 2141 * - In any case, unlock the PTL and drop the reference we took to the old page.
2137 */ 2142 */
2138static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, 2143static int wp_page_copy(struct vm_fault *vmf)
2139 struct page *old_page)
2140{ 2144{
2141 struct vm_area_struct *vma = fe->vma; 2145 struct vm_area_struct *vma = vmf->vma;
2142 struct mm_struct *mm = vma->vm_mm; 2146 struct mm_struct *mm = vma->vm_mm;
2147 struct page *old_page = vmf->page;
2143 struct page *new_page = NULL; 2148 struct page *new_page = NULL;
2144 pte_t entry; 2149 pte_t entry;
2145 int page_copied = 0; 2150 int page_copied = 0;
2146 const unsigned long mmun_start = fe->address & PAGE_MASK; 2151 const unsigned long mmun_start = vmf->address & PAGE_MASK;
2147 const unsigned long mmun_end = mmun_start + PAGE_SIZE; 2152 const unsigned long mmun_end = mmun_start + PAGE_SIZE;
2148 struct mem_cgroup *memcg; 2153 struct mem_cgroup *memcg;
2149 2154
2150 if (unlikely(anon_vma_prepare(vma))) 2155 if (unlikely(anon_vma_prepare(vma)))
2151 goto oom; 2156 goto oom;
2152 2157
2153 if (is_zero_pfn(pte_pfn(orig_pte))) { 2158 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2154 new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); 2159 new_page = alloc_zeroed_user_highpage_movable(vma,
2160 vmf->address);
2155 if (!new_page) 2161 if (!new_page)
2156 goto oom; 2162 goto oom;
2157 } else { 2163 } else {
2158 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, 2164 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2159 fe->address); 2165 vmf->address);
2160 if (!new_page) 2166 if (!new_page)
2161 goto oom; 2167 goto oom;
2162 cow_user_page(new_page, old_page, fe->address, vma); 2168 cow_user_page(new_page, old_page, vmf->address, vma);
2163 } 2169 }
2164 2170
2165 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) 2171 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2172 /* 2178 /*
2173 * Re-check the pte - we dropped the lock 2179 * Re-check the pte - we dropped the lock
2174 */ 2180 */
2175 fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); 2181 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
2176 if (likely(pte_same(*fe->pte, orig_pte))) { 2182 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2177 if (old_page) { 2183 if (old_page) {
2178 if (!PageAnon(old_page)) { 2184 if (!PageAnon(old_page)) {
2179 dec_mm_counter_fast(mm, 2185 dec_mm_counter_fast(mm,
@@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2183 } else { 2189 } else {
2184 inc_mm_counter_fast(mm, MM_ANONPAGES); 2190 inc_mm_counter_fast(mm, MM_ANONPAGES);
2185 } 2191 }
2186 flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); 2192 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2187 entry = mk_pte(new_page, vma->vm_page_prot); 2193 entry = mk_pte(new_page, vma->vm_page_prot);
2188 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2194 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2189 /* 2195 /*
@@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2192 * seen in the presence of one thread doing SMC and another 2198 * seen in the presence of one thread doing SMC and another
2193 * thread doing COW. 2199 * thread doing COW.
2194 */ 2200 */
2195 ptep_clear_flush_notify(vma, fe->address, fe->pte); 2201 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
2196 page_add_new_anon_rmap(new_page, vma, fe->address, false); 2202 page_add_new_anon_rmap(new_page, vma, vmf->address, false);
2197 mem_cgroup_commit_charge(new_page, memcg, false, false); 2203 mem_cgroup_commit_charge(new_page, memcg, false, false);
2198 lru_cache_add_active_or_unevictable(new_page, vma); 2204 lru_cache_add_active_or_unevictable(new_page, vma);
2199 /* 2205 /*
@@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2201 * mmu page tables (such as kvm shadow page tables), we want the 2207 * mmu page tables (such as kvm shadow page tables), we want the
2202 * new page to be mapped directly into the secondary page table. 2208 * new page to be mapped directly into the secondary page table.
2203 */ 2209 */
2204 set_pte_at_notify(mm, fe->address, fe->pte, entry); 2210 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
2205 update_mmu_cache(vma, fe->address, fe->pte); 2211 update_mmu_cache(vma, vmf->address, vmf->pte);
2206 if (old_page) { 2212 if (old_page) {
2207 /* 2213 /*
2208 * Only after switching the pte to the new page may 2214 * Only after switching the pte to the new page may
@@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2239 if (new_page) 2245 if (new_page)
2240 put_page(new_page); 2246 put_page(new_page);
2241 2247
2242 pte_unmap_unlock(fe->pte, fe->ptl); 2248 pte_unmap_unlock(vmf->pte, vmf->ptl);
2243 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2249 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2244 if (old_page) { 2250 if (old_page) {
2245 /* 2251 /*
@@ -2263,79 +2269,91 @@ oom:
2263 return VM_FAULT_OOM; 2269 return VM_FAULT_OOM;
2264} 2270}
2265 2271
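A pattern that recurs throughout the rest of the patch: the original PTE value now travels in vmf->orig_pte instead of being passed around as a separate argument, so every racy path re-takes the PTE lock and compares against that field. A hedged distillation of the idiom as it appears in wp_page_copy() above and in finish_mkwrite_fault() below; the wrapper function is illustrative only:

static bool example_revalidate_pte(struct vm_fault *vmf)
{
	vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
				       vmf->address, &vmf->ptl);
	if (!pte_same(*vmf->pte, vmf->orig_pte)) {
		/* raced with another fault while the lock was dropped */
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return false;
	}
	return true;	/* PTE unchanged; caller may modify it under vmf->ptl */
}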
2272/**
2273 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
2274 * writeable once the page is prepared
2275 *
2276 * @vmf: structure describing the fault
2277 *
2278 * This function handles all that is needed to finish a write page fault in a
2279 * shared mapping due to PTE being read-only once the mapped page is prepared.
2280 * It handles locking of PTE and modifying it. The function returns
2281 * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
2282 * lock.
2283 *
2284 * The function expects the page to be locked or other protection against
2285 * concurrent faults / writeback (such as DAX radix tree locks).
2286 */
2287int finish_mkwrite_fault(struct vm_fault *vmf)
2288{
2289 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
2290 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
2291 &vmf->ptl);
2292 /*
2293 * We might have raced with another page fault while we released the
2294 * pte_offset_map_lock.
2295 */
2296 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2297 pte_unmap_unlock(vmf->pte, vmf->ptl);
2298 return VM_FAULT_NOPAGE;
2299 }
2300 wp_page_reuse(vmf);
2301 return 0;
2302}
2303
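finish_mkwrite_fault() packages exactly that re-validation plus wp_page_reuse() for write faults on shared mappings whose backing store has already been made writable (DAX being the motivating case per the comment). Its preconditions are implicit in the code, so here is a hedged restatement in code form; everything in the sketch is inferred from the function body above and its two callers below:

static int example_prepared_mkwrite(struct vm_fault *vmf)
{
	/*
	 * Inferred preconditions (illustrative, not from the patch text):
	 *  - vmf->orig_pte holds the PTE value seen before the lock was dropped,
	 *  - the PTE lock is currently dropped (finish_mkwrite_fault re-takes it),
	 *  - the page is locked, or a DAX radix tree lock provides equivalent
	 *    protection against concurrent faults / writeback.
	 */
	return finish_mkwrite_fault(vmf);	/* 0, or VM_FAULT_NOPAGE on a race */
}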
2266/* 2304/*
2267 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED 2305 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
2268 * mapping 2306 * mapping
2269 */ 2307 */
2270static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) 2308static int wp_pfn_shared(struct vm_fault *vmf)
2271{ 2309{
2272 struct vm_area_struct *vma = fe->vma; 2310 struct vm_area_struct *vma = vmf->vma;
2273 2311
2274 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { 2312 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2275 struct vm_fault vmf = {
2276 .page = NULL,
2277 .pgoff = linear_page_index(vma, fe->address),
2278 .virtual_address =
2279 (void __user *)(fe->address & PAGE_MASK),
2280 .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
2281 };
2282 int ret; 2313 int ret;
2283 2314
2284 pte_unmap_unlock(fe->pte, fe->ptl); 2315 pte_unmap_unlock(vmf->pte, vmf->ptl);
2285 ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); 2316 vmf->flags |= FAULT_FLAG_MKWRITE;
2286 if (ret & VM_FAULT_ERROR) 2317 ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
2318 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
2287 return ret; 2319 return ret;
2288 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2320 return finish_mkwrite_fault(vmf);
2289 &fe->ptl);
2290 /*
2291 * We might have raced with another page fault while we
2292 * released the pte_offset_map_lock.
2293 */
2294 if (!pte_same(*fe->pte, orig_pte)) {
2295 pte_unmap_unlock(fe->pte, fe->ptl);
2296 return 0;
2297 }
2298 } 2321 }
2299 return wp_page_reuse(fe, orig_pte, NULL, 0, 0); 2322 wp_page_reuse(vmf);
2323 return VM_FAULT_WRITE;
2300} 2324}
2301 2325
2302static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, 2326static int wp_page_shared(struct vm_fault *vmf)
2303 struct page *old_page) 2327 __releases(vmf->ptl)
2304 __releases(fe->ptl)
2305{ 2328{
2306 struct vm_area_struct *vma = fe->vma; 2329 struct vm_area_struct *vma = vmf->vma;
2307 int page_mkwrite = 0;
2308 2330
2309 get_page(old_page); 2331 get_page(vmf->page);
2310 2332
2311 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 2333 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2312 int tmp; 2334 int tmp;
2313 2335
2314 pte_unmap_unlock(fe->pte, fe->ptl); 2336 pte_unmap_unlock(vmf->pte, vmf->ptl);
2315 tmp = do_page_mkwrite(vma, old_page, fe->address); 2337 tmp = do_page_mkwrite(vmf);
2316 if (unlikely(!tmp || (tmp & 2338 if (unlikely(!tmp || (tmp &
2317 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 2339 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2318 put_page(old_page); 2340 put_page(vmf->page);
2319 return tmp; 2341 return tmp;
2320 } 2342 }
2321 /* 2343 tmp = finish_mkwrite_fault(vmf);
2322 * Since we dropped the lock we need to revalidate 2344 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2323 * the PTE as someone else may have changed it. If 2345 unlock_page(vmf->page);
2324 * they did, we just return, as we can count on the 2346 put_page(vmf->page);
2325 * MMU to tell us if they didn't also make it writable. 2347 return tmp;
2326 */
2327 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2328 &fe->ptl);
2329 if (!pte_same(*fe->pte, orig_pte)) {
2330 unlock_page(old_page);
2331 pte_unmap_unlock(fe->pte, fe->ptl);
2332 put_page(old_page);
2333 return 0;
2334 } 2348 }
2335 page_mkwrite = 1; 2349 } else {
2350 wp_page_reuse(vmf);
2351 lock_page(vmf->page);
2336 } 2352 }
2353 fault_dirty_shared_page(vma, vmf->page);
2354 put_page(vmf->page);
2337 2355
2338 return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); 2356 return VM_FAULT_WRITE;
2339} 2357}
2340 2358
2341/* 2359/*
@@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
2356 * but allow concurrent faults), with pte both mapped and locked. 2374 * but allow concurrent faults), with pte both mapped and locked.
2357 * We return with mmap_sem still held, but pte unmapped and unlocked. 2375 * We return with mmap_sem still held, but pte unmapped and unlocked.
2358 */ 2376 */
2359static int do_wp_page(struct fault_env *fe, pte_t orig_pte) 2377static int do_wp_page(struct vm_fault *vmf)
2360 __releases(fe->ptl) 2378 __releases(vmf->ptl)
2361{ 2379{
2362 struct vm_area_struct *vma = fe->vma; 2380 struct vm_area_struct *vma = vmf->vma;
2363 struct page *old_page;
2364 2381
2365 old_page = vm_normal_page(vma, fe->address, orig_pte); 2382 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
2366 if (!old_page) { 2383 if (!vmf->page) {
2367 /* 2384 /*
2368 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a 2385 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
2369 * VM_PFNMAP VMA. 2386 * VM_PFNMAP VMA.
@@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
2373 */ 2390 */
2374 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2391 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2375 (VM_WRITE|VM_SHARED)) 2392 (VM_WRITE|VM_SHARED))
2376 return wp_pfn_shared(fe, orig_pte); 2393 return wp_pfn_shared(vmf);
2377 2394
2378 pte_unmap_unlock(fe->pte, fe->ptl); 2395 pte_unmap_unlock(vmf->pte, vmf->ptl);
2379 return wp_page_copy(fe, orig_pte, old_page); 2396 return wp_page_copy(vmf);
2380 } 2397 }
2381 2398
2382 /* 2399 /*
2383 * Take out anonymous pages first, anonymous shared vmas are 2400 * Take out anonymous pages first, anonymous shared vmas are
2384 * not dirty accountable. 2401 * not dirty accountable.
2385 */ 2402 */
2386 if (PageAnon(old_page) && !PageKsm(old_page)) { 2403 if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
2387 int total_mapcount; 2404 int total_mapcount;
2388 if (!trylock_page(old_page)) { 2405 if (!trylock_page(vmf->page)) {
2389 get_page(old_page); 2406 get_page(vmf->page);
2390 pte_unmap_unlock(fe->pte, fe->ptl); 2407 pte_unmap_unlock(vmf->pte, vmf->ptl);
2391 lock_page(old_page); 2408 lock_page(vmf->page);
2392 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, 2409 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2393 fe->address, &fe->ptl); 2410 vmf->address, &vmf->ptl);
2394 if (!pte_same(*fe->pte, orig_pte)) { 2411 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2395 unlock_page(old_page); 2412 unlock_page(vmf->page);
2396 pte_unmap_unlock(fe->pte, fe->ptl); 2413 pte_unmap_unlock(vmf->pte, vmf->ptl);
2397 put_page(old_page); 2414 put_page(vmf->page);
2398 return 0; 2415 return 0;
2399 } 2416 }
2400 put_page(old_page); 2417 put_page(vmf->page);
2401 } 2418 }
2402 if (reuse_swap_page(old_page, &total_mapcount)) { 2419 if (reuse_swap_page(vmf->page, &total_mapcount)) {
2403 if (total_mapcount == 1) { 2420 if (total_mapcount == 1) {
2404 /* 2421 /*
2405 * The page is all ours. Move it to 2422 * The page is all ours. Move it to
@@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
2408 * Protected against the rmap code by 2425 * Protected against the rmap code by
2409 * the page lock. 2426 * the page lock.
2410 */ 2427 */
2411 page_move_anon_rmap(old_page, vma); 2428 page_move_anon_rmap(vmf->page, vma);
2412 } 2429 }
2413 unlock_page(old_page); 2430 unlock_page(vmf->page);
2414 return wp_page_reuse(fe, orig_pte, old_page, 0, 0); 2431 wp_page_reuse(vmf);
2432 return VM_FAULT_WRITE;
2415 } 2433 }
2416 unlock_page(old_page); 2434 unlock_page(vmf->page);
2417 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2435 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2418 (VM_WRITE|VM_SHARED))) { 2436 (VM_WRITE|VM_SHARED))) {
2419 return wp_page_shared(fe, orig_pte, old_page); 2437 return wp_page_shared(vmf);
2420 } 2438 }
2421 2439
2422 /* 2440 /*
2423 * Ok, we need to copy. Oh, well.. 2441 * Ok, we need to copy. Oh, well..
2424 */ 2442 */
2425 get_page(old_page); 2443 get_page(vmf->page);
2426 2444
2427 pte_unmap_unlock(fe->pte, fe->ptl); 2445 pte_unmap_unlock(vmf->pte, vmf->ptl);
2428 return wp_page_copy(fe, orig_pte, old_page); 2446 return wp_page_copy(vmf);
2429} 2447}
2430 2448
2431static void unmap_mapping_range_vma(struct vm_area_struct *vma, 2449static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
2513 * We return with the mmap_sem locked or unlocked in the same cases 2531 * We return with the mmap_sem locked or unlocked in the same cases
2514 * as does filemap_fault(). 2532 * as does filemap_fault().
2515 */ 2533 */
2516int do_swap_page(struct fault_env *fe, pte_t orig_pte) 2534int do_swap_page(struct vm_fault *vmf)
2517{ 2535{
2518 struct vm_area_struct *vma = fe->vma; 2536 struct vm_area_struct *vma = vmf->vma;
2519 struct page *page, *swapcache; 2537 struct page *page, *swapcache;
2520 struct mem_cgroup *memcg; 2538 struct mem_cgroup *memcg;
2521 swp_entry_t entry; 2539 swp_entry_t entry;
@@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2524 int exclusive = 0; 2542 int exclusive = 0;
2525 int ret = 0; 2543 int ret = 0;
2526 2544
2527 if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) 2545 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
2528 goto out; 2546 goto out;
2529 2547
2530 entry = pte_to_swp_entry(orig_pte); 2548 entry = pte_to_swp_entry(vmf->orig_pte);
2531 if (unlikely(non_swap_entry(entry))) { 2549 if (unlikely(non_swap_entry(entry))) {
2532 if (is_migration_entry(entry)) { 2550 if (is_migration_entry(entry)) {
2533 migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); 2551 migration_entry_wait(vma->vm_mm, vmf->pmd,
2552 vmf->address);
2534 } else if (is_hwpoison_entry(entry)) { 2553 } else if (is_hwpoison_entry(entry)) {
2535 ret = VM_FAULT_HWPOISON; 2554 ret = VM_FAULT_HWPOISON;
2536 } else { 2555 } else {
2537 print_bad_pte(vma, fe->address, orig_pte, NULL); 2556 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
2538 ret = VM_FAULT_SIGBUS; 2557 ret = VM_FAULT_SIGBUS;
2539 } 2558 }
2540 goto out; 2559 goto out;
@@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2542 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2561 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2543 page = lookup_swap_cache(entry); 2562 page = lookup_swap_cache(entry);
2544 if (!page) { 2563 if (!page) {
2545 page = swapin_readahead(entry, 2564 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
2546 GFP_HIGHUSER_MOVABLE, vma, fe->address); 2565 vmf->address);
2547 if (!page) { 2566 if (!page) {
2548 /* 2567 /*
2549 * Back out if somebody else faulted in this pte 2568 * Back out if somebody else faulted in this pte
2550 * while we released the pte lock. 2569 * while we released the pte lock.
2551 */ 2570 */
2552 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, 2571 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2553 fe->address, &fe->ptl); 2572 vmf->address, &vmf->ptl);
2554 if (likely(pte_same(*fe->pte, orig_pte))) 2573 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
2555 ret = VM_FAULT_OOM; 2574 ret = VM_FAULT_OOM;
2556 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2575 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2557 goto unlock; 2576 goto unlock;
@@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2573 } 2592 }
2574 2593
2575 swapcache = page; 2594 swapcache = page;
2576 locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); 2595 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
2577 2596
2578 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2597 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2579 if (!locked) { 2598 if (!locked) {
@@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2590 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 2609 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2591 goto out_page; 2610 goto out_page;
2592 2611
2593 page = ksm_might_need_to_copy(page, vma, fe->address); 2612 page = ksm_might_need_to_copy(page, vma, vmf->address);
2594 if (unlikely(!page)) { 2613 if (unlikely(!page)) {
2595 ret = VM_FAULT_OOM; 2614 ret = VM_FAULT_OOM;
2596 page = swapcache; 2615 page = swapcache;
@@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2606 /* 2625 /*
2607 * Back out if somebody else already faulted in this pte. 2626 * Back out if somebody else already faulted in this pte.
2608 */ 2627 */
2609 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2628 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2610 &fe->ptl); 2629 &vmf->ptl);
2611 if (unlikely(!pte_same(*fe->pte, orig_pte))) 2630 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
2612 goto out_nomap; 2631 goto out_nomap;
2613 2632
2614 if (unlikely(!PageUptodate(page))) { 2633 if (unlikely(!PageUptodate(page))) {
@@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2629 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2648 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2630 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); 2649 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
2631 pte = mk_pte(page, vma->vm_page_prot); 2650 pte = mk_pte(page, vma->vm_page_prot);
2632 if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { 2651 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
2633 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2652 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2634 fe->flags &= ~FAULT_FLAG_WRITE; 2653 vmf->flags &= ~FAULT_FLAG_WRITE;
2635 ret |= VM_FAULT_WRITE; 2654 ret |= VM_FAULT_WRITE;
2636 exclusive = RMAP_EXCLUSIVE; 2655 exclusive = RMAP_EXCLUSIVE;
2637 } 2656 }
2638 flush_icache_page(vma, page); 2657 flush_icache_page(vma, page);
2639 if (pte_swp_soft_dirty(orig_pte)) 2658 if (pte_swp_soft_dirty(vmf->orig_pte))
2640 pte = pte_mksoft_dirty(pte); 2659 pte = pte_mksoft_dirty(pte);
2641 set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); 2660 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
2661 vmf->orig_pte = pte;
2642 if (page == swapcache) { 2662 if (page == swapcache) {
2643 do_page_add_anon_rmap(page, vma, fe->address, exclusive); 2663 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
2644 mem_cgroup_commit_charge(page, memcg, true, false); 2664 mem_cgroup_commit_charge(page, memcg, true, false);
2645 activate_page(page); 2665 activate_page(page);
2646 } else { /* ksm created a completely new copy */ 2666 } else { /* ksm created a completely new copy */
2647 page_add_new_anon_rmap(page, vma, fe->address, false); 2667 page_add_new_anon_rmap(page, vma, vmf->address, false);
2648 mem_cgroup_commit_charge(page, memcg, false, false); 2668 mem_cgroup_commit_charge(page, memcg, false, false);
2649 lru_cache_add_active_or_unevictable(page, vma); 2669 lru_cache_add_active_or_unevictable(page, vma);
2650 } 2670 }
@@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2667 put_page(swapcache); 2687 put_page(swapcache);
2668 } 2688 }
2669 2689
2670 if (fe->flags & FAULT_FLAG_WRITE) { 2690 if (vmf->flags & FAULT_FLAG_WRITE) {
2671 ret |= do_wp_page(fe, pte); 2691 ret |= do_wp_page(vmf);
2672 if (ret & VM_FAULT_ERROR) 2692 if (ret & VM_FAULT_ERROR)
2673 ret &= VM_FAULT_ERROR; 2693 ret &= VM_FAULT_ERROR;
2674 goto out; 2694 goto out;
2675 } 2695 }
2676 2696
2677 /* No need to invalidate - it was non-present before */ 2697 /* No need to invalidate - it was non-present before */
2678 update_mmu_cache(vma, fe->address, fe->pte); 2698 update_mmu_cache(vma, vmf->address, vmf->pte);
2679unlock: 2699unlock:
2680 pte_unmap_unlock(fe->pte, fe->ptl); 2700 pte_unmap_unlock(vmf->pte, vmf->ptl);
2681out: 2701out:
2682 return ret; 2702 return ret;
2683out_nomap: 2703out_nomap:
2684 mem_cgroup_cancel_charge(page, memcg, false); 2704 mem_cgroup_cancel_charge(page, memcg, false);
2685 pte_unmap_unlock(fe->pte, fe->ptl); 2705 pte_unmap_unlock(vmf->pte, vmf->ptl);
2686out_page: 2706out_page:
2687 unlock_page(page); 2707 unlock_page(page);
2688out_release: 2708out_release:
@@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
2733 * but allow concurrent faults), and pte mapped but not yet locked. 2753 * but allow concurrent faults), and pte mapped but not yet locked.
2734 * We return with mmap_sem still held, but pte unmapped and unlocked. 2754 * We return with mmap_sem still held, but pte unmapped and unlocked.
2735 */ 2755 */
2736static int do_anonymous_page(struct fault_env *fe) 2756static int do_anonymous_page(struct vm_fault *vmf)
2737{ 2757{
2738 struct vm_area_struct *vma = fe->vma; 2758 struct vm_area_struct *vma = vmf->vma;
2739 struct mem_cgroup *memcg; 2759 struct mem_cgroup *memcg;
2740 struct page *page; 2760 struct page *page;
2741 pte_t entry; 2761 pte_t entry;
@@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe)
2745 return VM_FAULT_SIGBUS; 2765 return VM_FAULT_SIGBUS;
2746 2766
2747 /* Check if we need to add a guard page to the stack */ 2767 /* Check if we need to add a guard page to the stack */
2748 if (check_stack_guard_page(vma, fe->address) < 0) 2768 if (check_stack_guard_page(vma, vmf->address) < 0)
2749 return VM_FAULT_SIGSEGV; 2769 return VM_FAULT_SIGSEGV;
2750 2770
2751 /* 2771 /*
@@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe)
2758 * 2778 *
2759 * Here we only have down_read(mmap_sem). 2779 * Here we only have down_read(mmap_sem).
2760 */ 2780 */
2761 if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) 2781 if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
2762 return VM_FAULT_OOM; 2782 return VM_FAULT_OOM;
2763 2783
2764 /* See the comment in pte_alloc_one_map() */ 2784 /* See the comment in pte_alloc_one_map() */
2765 if (unlikely(pmd_trans_unstable(fe->pmd))) 2785 if (unlikely(pmd_trans_unstable(vmf->pmd)))
2766 return 0; 2786 return 0;
2767 2787
2768 /* Use the zero-page for reads */ 2788 /* Use the zero-page for reads */
2769 if (!(fe->flags & FAULT_FLAG_WRITE) && 2789 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
2770 !mm_forbids_zeropage(vma->vm_mm)) { 2790 !mm_forbids_zeropage(vma->vm_mm)) {
2771 entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), 2791 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
2772 vma->vm_page_prot)); 2792 vma->vm_page_prot));
2773 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2793 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2774 &fe->ptl); 2794 vmf->address, &vmf->ptl);
2775 if (!pte_none(*fe->pte)) 2795 if (!pte_none(*vmf->pte))
2776 goto unlock; 2796 goto unlock;
2777 /* Deliver the page fault to userland, check inside PT lock */ 2797 /* Deliver the page fault to userland, check inside PT lock */
2778 if (userfaultfd_missing(vma)) { 2798 if (userfaultfd_missing(vma)) {
2779 pte_unmap_unlock(fe->pte, fe->ptl); 2799 pte_unmap_unlock(vmf->pte, vmf->ptl);
2780 return handle_userfault(fe, VM_UFFD_MISSING); 2800 return handle_userfault(vmf, VM_UFFD_MISSING);
2781 } 2801 }
2782 goto setpte; 2802 goto setpte;
2783 } 2803 }
@@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe)
2785 /* Allocate our own private page. */ 2805 /* Allocate our own private page. */
2786 if (unlikely(anon_vma_prepare(vma))) 2806 if (unlikely(anon_vma_prepare(vma)))
2787 goto oom; 2807 goto oom;
2788 page = alloc_zeroed_user_highpage_movable(vma, fe->address); 2808 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
2789 if (!page) 2809 if (!page)
2790 goto oom; 2810 goto oom;
2791 2811
@@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe)
2803 if (vma->vm_flags & VM_WRITE) 2823 if (vma->vm_flags & VM_WRITE)
2804 entry = pte_mkwrite(pte_mkdirty(entry)); 2824 entry = pte_mkwrite(pte_mkdirty(entry));
2805 2825
2806 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2826 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2807 &fe->ptl); 2827 &vmf->ptl);
2808 if (!pte_none(*fe->pte)) 2828 if (!pte_none(*vmf->pte))
2809 goto release; 2829 goto release;
2810 2830
2811 /* Deliver the page fault to userland, check inside PT lock */ 2831 /* Deliver the page fault to userland, check inside PT lock */
2812 if (userfaultfd_missing(vma)) { 2832 if (userfaultfd_missing(vma)) {
2813 pte_unmap_unlock(fe->pte, fe->ptl); 2833 pte_unmap_unlock(vmf->pte, vmf->ptl);
2814 mem_cgroup_cancel_charge(page, memcg, false); 2834 mem_cgroup_cancel_charge(page, memcg, false);
2815 put_page(page); 2835 put_page(page);
2816 return handle_userfault(fe, VM_UFFD_MISSING); 2836 return handle_userfault(vmf, VM_UFFD_MISSING);
2817 } 2837 }
2818 2838
2819 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2839 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2820 page_add_new_anon_rmap(page, vma, fe->address, false); 2840 page_add_new_anon_rmap(page, vma, vmf->address, false);
2821 mem_cgroup_commit_charge(page, memcg, false, false); 2841 mem_cgroup_commit_charge(page, memcg, false, false);
2822 lru_cache_add_active_or_unevictable(page, vma); 2842 lru_cache_add_active_or_unevictable(page, vma);
2823setpte: 2843setpte:
2824 set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); 2844 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
2825 2845
2826 /* No need to invalidate - it was non-present before */ 2846 /* No need to invalidate - it was non-present before */
2827 update_mmu_cache(vma, fe->address, fe->pte); 2847 update_mmu_cache(vma, vmf->address, vmf->pte);
2828unlock: 2848unlock:
2829 pte_unmap_unlock(fe->pte, fe->ptl); 2849 pte_unmap_unlock(vmf->pte, vmf->ptl);
2830 return 0; 2850 return 0;
2831release: 2851release:
2832 mem_cgroup_cancel_charge(page, memcg, false); 2852 mem_cgroup_cancel_charge(page, memcg, false);
@@ -2843,62 +2863,50 @@ oom:
2843 * released depending on flags and vma->vm_ops->fault() return value. 2863 * released depending on flags and vma->vm_ops->fault() return value.
2844 * See filemap_fault() and __lock_page_retry(). 2864 * See filemap_fault() and __lock_page_retry().
2845 */ 2865 */
2846static int __do_fault(struct fault_env *fe, pgoff_t pgoff, 2866static int __do_fault(struct vm_fault *vmf)
2847 struct page *cow_page, struct page **page, void **entry)
2848{ 2867{
2849 struct vm_area_struct *vma = fe->vma; 2868 struct vm_area_struct *vma = vmf->vma;
2850 struct vm_fault vmf;
2851 int ret; 2869 int ret;
2852 2870
2853 vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); 2871 ret = vma->vm_ops->fault(vma, vmf);
2854 vmf.pgoff = pgoff; 2872 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
2855 vmf.flags = fe->flags; 2873 VM_FAULT_DONE_COW)))
2856 vmf.page = NULL;
2857 vmf.gfp_mask = __get_fault_gfp_mask(vma);
2858 vmf.cow_page = cow_page;
2859
2860 ret = vma->vm_ops->fault(vma, &vmf);
2861 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2862 return ret;
2863 if (ret & VM_FAULT_DAX_LOCKED) {
2864 *entry = vmf.entry;
2865 return ret; 2874 return ret;
2866 }
2867 2875
2868 if (unlikely(PageHWPoison(vmf.page))) { 2876 if (unlikely(PageHWPoison(vmf->page))) {
2869 if (ret & VM_FAULT_LOCKED) 2877 if (ret & VM_FAULT_LOCKED)
2870 unlock_page(vmf.page); 2878 unlock_page(vmf->page);
2871 put_page(vmf.page); 2879 put_page(vmf->page);
2880 vmf->page = NULL;
2872 return VM_FAULT_HWPOISON; 2881 return VM_FAULT_HWPOISON;
2873 } 2882 }
2874 2883
2875 if (unlikely(!(ret & VM_FAULT_LOCKED))) 2884 if (unlikely(!(ret & VM_FAULT_LOCKED)))
2876 lock_page(vmf.page); 2885 lock_page(vmf->page);
2877 else 2886 else
2878 VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); 2887 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
2879 2888
2880 *page = vmf.page;
2881 return ret; 2889 return ret;
2882} 2890}
2883 2891
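__do_fault() likewise stops building a throwaway vm_fault and calls ->fault() with the one it was given; the VM_FAULT_DAX_LOCKED plumbing is gone and a VM_FAULT_DONE_COW result is passed straight through. For orientation, a hedged sketch of a minimal ->fault() handler under the (vma, vmf) convention used above; error handling is pared down and the lookup is only a stand-in for a real page cache fill:

static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page;

	/* look up the page at the offset the core put in vmf->pgoff */
	page = find_get_page(vma->vm_file->f_mapping, vmf->pgoff);
	if (!page)
		return VM_FAULT_SIGBUS;

	lock_page(page);
	vmf->page = page;		/* core consumes this reference */
	return VM_FAULT_LOCKED;		/* matches the PageLocked() check above */
}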
2884static int pte_alloc_one_map(struct fault_env *fe) 2892static int pte_alloc_one_map(struct vm_fault *vmf)
2885{ 2893{
2886 struct vm_area_struct *vma = fe->vma; 2894 struct vm_area_struct *vma = vmf->vma;
2887 2895
2888 if (!pmd_none(*fe->pmd)) 2896 if (!pmd_none(*vmf->pmd))
2889 goto map_pte; 2897 goto map_pte;
2890 if (fe->prealloc_pte) { 2898 if (vmf->prealloc_pte) {
2891 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 2899 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
2892 if (unlikely(!pmd_none(*fe->pmd))) { 2900 if (unlikely(!pmd_none(*vmf->pmd))) {
2893 spin_unlock(fe->ptl); 2901 spin_unlock(vmf->ptl);
2894 goto map_pte; 2902 goto map_pte;
2895 } 2903 }
2896 2904
2897 atomic_long_inc(&vma->vm_mm->nr_ptes); 2905 atomic_long_inc(&vma->vm_mm->nr_ptes);
2898 pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); 2906 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
2899 spin_unlock(fe->ptl); 2907 spin_unlock(vmf->ptl);
2900 fe->prealloc_pte = 0; 2908 vmf->prealloc_pte = 0;
2901 } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { 2909 } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
2902 return VM_FAULT_OOM; 2910 return VM_FAULT_OOM;
2903 } 2911 }
2904map_pte: 2912map_pte:
@@ -2913,11 +2921,11 @@ map_pte:
2913 * through an atomic read in C, which is what pmd_trans_unstable() 2921 * through an atomic read in C, which is what pmd_trans_unstable()
2914 * provides. 2922 * provides.
2915 */ 2923 */
2916 if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) 2924 if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
2917 return VM_FAULT_NOPAGE; 2925 return VM_FAULT_NOPAGE;
2918 2926
2919 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2927 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2920 &fe->ptl); 2928 &vmf->ptl);
2921 return 0; 2929 return 0;
2922} 2930}
2923 2931
@@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
2935 return true; 2943 return true;
2936} 2944}
2937 2945
2938static void deposit_prealloc_pte(struct fault_env *fe) 2946static void deposit_prealloc_pte(struct vm_fault *vmf)
2939{ 2947{
2940 struct vm_area_struct *vma = fe->vma; 2948 struct vm_area_struct *vma = vmf->vma;
2941 2949
2942 pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); 2950 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
2943 /* 2951 /*
2944 * We are going to consume the prealloc table, 2952 * We are going to consume the prealloc table,
2945 * count that as nr_ptes. 2953 * count that as nr_ptes.
2946 */ 2954 */
2947 atomic_long_inc(&vma->vm_mm->nr_ptes); 2955 atomic_long_inc(&vma->vm_mm->nr_ptes);
2948 fe->prealloc_pte = 0; 2956 vmf->prealloc_pte = 0;
2949} 2957}
2950 2958
2951static int do_set_pmd(struct fault_env *fe, struct page *page) 2959static int do_set_pmd(struct vm_fault *vmf, struct page *page)
2952{ 2960{
2953 struct vm_area_struct *vma = fe->vma; 2961 struct vm_area_struct *vma = vmf->vma;
2954 bool write = fe->flags & FAULT_FLAG_WRITE; 2962 bool write = vmf->flags & FAULT_FLAG_WRITE;
2955 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 2963 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
2956 pmd_t entry; 2964 pmd_t entry;
2957 int i, ret; 2965 int i, ret;
2958 2966
@@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
2966 * Archs like ppc64 need additional space to store information 2974 * Archs like ppc64 need additional space to store information
2967 * related to pte entry. Use the preallocated table for that. 2975 * related to pte entry. Use the preallocated table for that.
2968 */ 2976 */
2969 if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { 2977 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
2970 fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); 2978 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
2971 if (!fe->prealloc_pte) 2979 if (!vmf->prealloc_pte)
2972 return VM_FAULT_OOM; 2980 return VM_FAULT_OOM;
2973 smp_wmb(); /* See comment in __pte_alloc() */ 2981 smp_wmb(); /* See comment in __pte_alloc() */
2974 } 2982 }
2975 2983
2976 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 2984 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
2977 if (unlikely(!pmd_none(*fe->pmd))) 2985 if (unlikely(!pmd_none(*vmf->pmd)))
2978 goto out; 2986 goto out;
2979 2987
2980 for (i = 0; i < HPAGE_PMD_NR; i++) 2988 for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -2990,11 +2998,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
2990 * deposit and withdraw with pmd lock held 2998 * deposit and withdraw with pmd lock held
2991 */ 2999 */
2992 if (arch_needs_pgtable_deposit()) 3000 if (arch_needs_pgtable_deposit())
2993 deposit_prealloc_pte(fe); 3001 deposit_prealloc_pte(vmf);
2994 3002
2995 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); 3003 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
2996 3004
2997 update_mmu_cache_pmd(vma, haddr, fe->pmd); 3005 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
2998 3006
2999 /* fault is handled */ 3007 /* fault is handled */
3000 ret = 0; 3008 ret = 0;
@@ -3005,13 +3013,13 @@ out:
3005 * withdraw with pmd lock held. 3013 * withdraw with pmd lock held.
3006 */ 3014 */
3007 if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) 3015 if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
3008 fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, 3016 vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
3009 fe->pmd); 3017 vmf->pmd);
3010 spin_unlock(fe->ptl); 3018 spin_unlock(vmf->ptl);
3011 return ret; 3019 return ret;
3012} 3020}
3013#else 3021#else
3014static int do_set_pmd(struct fault_env *fe, struct page *page) 3022static int do_set_pmd(struct vm_fault *vmf, struct page *page)
3015{ 3023{
3016 BUILD_BUG(); 3024 BUILD_BUG();
3017 return 0; 3025 return 0;
@@ -3022,41 +3030,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
3022 * alloc_set_pte - setup new PTE entry for given page and add reverse page 3030 * alloc_set_pte - setup new PTE entry for given page and add reverse page
3023 * mapping. If needed, the function allocates page table or use pre-allocated. 3031 * mapping. If needed, the function allocates page table or use pre-allocated.
3024 * 3032 *
3025 * @fe: fault environment 3033 * @vmf: fault environment
3026 * @memcg: memcg to charge page (only for private mappings) 3034 * @memcg: memcg to charge page (only for private mappings)
3027 * @page: page to map 3035 * @page: page to map
3028 * 3036 *
3029 * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. 3037 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
3038 * return.
3030 * 3039 *
3031 * Target users are page handler itself and implementations of 3040 * Target users are page handler itself and implementations of
3032 * vm_ops->map_pages. 3041 * vm_ops->map_pages.
3033 */ 3042 */
3034int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, 3043int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3035 struct page *page) 3044 struct page *page)
3036{ 3045{
3037 struct vm_area_struct *vma = fe->vma; 3046 struct vm_area_struct *vma = vmf->vma;
3038 bool write = fe->flags & FAULT_FLAG_WRITE; 3047 bool write = vmf->flags & FAULT_FLAG_WRITE;
3039 pte_t entry; 3048 pte_t entry;
3040 int ret; 3049 int ret;
3041 3050
3042 if (pmd_none(*fe->pmd) && PageTransCompound(page) && 3051 if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
3043 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { 3052 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
3044 /* THP on COW? */ 3053 /* THP on COW? */
3045 VM_BUG_ON_PAGE(memcg, page); 3054 VM_BUG_ON_PAGE(memcg, page);
3046 3055
3047 ret = do_set_pmd(fe, page); 3056 ret = do_set_pmd(vmf, page);
3048 if (ret != VM_FAULT_FALLBACK) 3057 if (ret != VM_FAULT_FALLBACK)
3049 goto fault_handled; 3058 goto fault_handled;
3050 } 3059 }
3051 3060
3052 if (!fe->pte) { 3061 if (!vmf->pte) {
3053 ret = pte_alloc_one_map(fe); 3062 ret = pte_alloc_one_map(vmf);
3054 if (ret) 3063 if (ret)
3055 goto fault_handled; 3064 goto fault_handled;
3056 } 3065 }
3057 3066
3058 /* Re-check under ptl */ 3067 /* Re-check under ptl */
3059 if (unlikely(!pte_none(*fe->pte))) { 3068 if (unlikely(!pte_none(*vmf->pte))) {
3060 ret = VM_FAULT_NOPAGE; 3069 ret = VM_FAULT_NOPAGE;
3061 goto fault_handled; 3070 goto fault_handled;
3062 } 3071 }
@@ -3068,28 +3077,60 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
3068 /* copy-on-write page */ 3077 /* copy-on-write page */
3069 if (write && !(vma->vm_flags & VM_SHARED)) { 3078 if (write && !(vma->vm_flags & VM_SHARED)) {
3070 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 3079 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3071 page_add_new_anon_rmap(page, vma, fe->address, false); 3080 page_add_new_anon_rmap(page, vma, vmf->address, false);
3072 mem_cgroup_commit_charge(page, memcg, false, false); 3081 mem_cgroup_commit_charge(page, memcg, false, false);
3073 lru_cache_add_active_or_unevictable(page, vma); 3082 lru_cache_add_active_or_unevictable(page, vma);
3074 } else { 3083 } else {
3075 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); 3084 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3076 page_add_file_rmap(page, false); 3085 page_add_file_rmap(page, false);
3077 } 3086 }
3078 set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); 3087 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3079 3088
3080 /* no need to invalidate: a not-present page won't be cached */ 3089 /* no need to invalidate: a not-present page won't be cached */
3081 update_mmu_cache(vma, fe->address, fe->pte); 3090 update_mmu_cache(vma, vmf->address, vmf->pte);
3082 ret = 0; 3091 ret = 0;
3083 3092
3084fault_handled: 3093fault_handled:
3085 /* preallocated pagetable is unused: free it */ 3094 /* preallocated pagetable is unused: free it */
3086 if (fe->prealloc_pte) { 3095 if (vmf->prealloc_pte) {
3087 pte_free(fe->vma->vm_mm, fe->prealloc_pte); 3096 pte_free(vmf->vma->vm_mm, vmf->prealloc_pte);
3088 fe->prealloc_pte = 0; 3097 vmf->prealloc_pte = 0;
3089 } 3098 }
3090 return ret; 3099 return ret;
3091} 3100}
3092 3101
3102
3103/**
3104 * finish_fault - finish page fault once we have prepared the page to fault
3105 *
3106 * @vmf: structure describing the fault
3107 *
3108 * This function handles all that is needed to finish a page fault once the
3109 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
3110 * given page, adds reverse page mapping, handles memcg charges and LRU
3111 * addition. The function returns 0 on success, VM_FAULT_ code in case of
3112 * error.
3113 *
3114 * The function expects the page to be locked and on success it consumes a
3115 * reference of a page being mapped (for the PTE which maps it).
3116 */
3117int finish_fault(struct vm_fault *vmf)
3118{
3119 struct page *page;
3120 int ret;
3121
3122 /* Did we COW the page? */
3123 if ((vmf->flags & FAULT_FLAG_WRITE) &&
3124 !(vmf->vma->vm_flags & VM_SHARED))
3125 page = vmf->cow_page;
3126 else
3127 page = vmf->page;
3128 ret = alloc_set_pte(vmf, vmf->memcg, page);
3129 if (vmf->pte)
3130 pte_unmap_unlock(vmf->pte, vmf->ptl);
3131 return ret;
3132}
3133
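finish_fault() is the file-fault counterpart of finish_mkwrite_fault(): it picks vmf->cow_page for write faults on private mappings and vmf->page otherwise, feeds it to alloc_set_pte() together with the memcg stashed in the vm_fault, and leaves the PTE unlocked. A hedged sketch of the caller pattern this enables, essentially do_read_fault() below without the fault-around step; the function name is illustrative:

static int example_finish_read_fault(struct vm_fault *vmf)
{
	int ret;

	ret = __do_fault(vmf);			/* ->fault() fills vmf->page, locked */
	if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))
		return ret;

	ret |= finish_fault(vmf);		/* map vmf->page, drop PTE lock */
	unlock_page(vmf->page);
	if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))
		put_page(vmf->page);
	return ret;
}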
3093static unsigned long fault_around_bytes __read_mostly = 3134static unsigned long fault_around_bytes __read_mostly =
3094 rounddown_pow_of_two(65536); 3135 rounddown_pow_of_two(65536);
3095 3136
@@ -3154,17 +3195,18 @@ late_initcall(fault_around_debugfs);
3154 * fault_around_pages() value (and therefore to page order). This way it's 3195 * fault_around_pages() value (and therefore to page order). This way it's
3155 * easier to guarantee that we don't cross page table boundaries. 3196 * easier to guarantee that we don't cross page table boundaries.
3156 */ 3197 */
3157static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) 3198static int do_fault_around(struct vm_fault *vmf)
3158{ 3199{
3159 unsigned long address = fe->address, nr_pages, mask; 3200 unsigned long address = vmf->address, nr_pages, mask;
3201 pgoff_t start_pgoff = vmf->pgoff;
3160 pgoff_t end_pgoff; 3202 pgoff_t end_pgoff;
3161 int off, ret = 0; 3203 int off, ret = 0;
3162 3204
3163 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; 3205 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3164 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; 3206 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3165 3207
3166 fe->address = max(address & mask, fe->vma->vm_start); 3208 vmf->address = max(address & mask, vmf->vma->vm_start);
3167 off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); 3209 off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3168 start_pgoff -= off; 3210 start_pgoff -= off;
3169 3211
3170 /* 3212 /*
@@ -3172,45 +3214,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
3172 * or fault_around_pages() from start_pgoff, depending what is nearest. 3214 * or fault_around_pages() from start_pgoff, depending what is nearest.
3173 */ 3215 */
3174 end_pgoff = start_pgoff - 3216 end_pgoff = start_pgoff -
3175 ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + 3217 ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3176 PTRS_PER_PTE - 1; 3218 PTRS_PER_PTE - 1;
3177 end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, 3219 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3178 start_pgoff + nr_pages - 1); 3220 start_pgoff + nr_pages - 1);
3179 3221
3180 if (pmd_none(*fe->pmd)) { 3222 if (pmd_none(*vmf->pmd)) {
3181 fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); 3223 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
3182 if (!fe->prealloc_pte) 3224 vmf->address);
3225 if (!vmf->prealloc_pte)
3183 goto out; 3226 goto out;
3184 smp_wmb(); /* See comment in __pte_alloc() */ 3227 smp_wmb(); /* See comment in __pte_alloc() */
3185 } 3228 }
3186 3229
3187 fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); 3230 vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3188 3231
3189 /* Huge page is mapped? Page fault is solved */ 3232 /* Huge page is mapped? Page fault is solved */
3190 if (pmd_trans_huge(*fe->pmd)) { 3233 if (pmd_trans_huge(*vmf->pmd)) {
3191 ret = VM_FAULT_NOPAGE; 3234 ret = VM_FAULT_NOPAGE;
3192 goto out; 3235 goto out;
3193 } 3236 }
3194 3237
3195 /* ->map_pages() haven't done anything useful. Cold page cache? */ 3238 /* ->map_pages() haven't done anything useful. Cold page cache? */
3196 if (!fe->pte) 3239 if (!vmf->pte)
3197 goto out; 3240 goto out;
3198 3241
3199 /* check if the page fault is solved */ 3242 /* check if the page fault is solved */
3200 fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); 3243 vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3201 if (!pte_none(*fe->pte)) 3244 if (!pte_none(*vmf->pte))
3202 ret = VM_FAULT_NOPAGE; 3245 ret = VM_FAULT_NOPAGE;
3203 pte_unmap_unlock(fe->pte, fe->ptl); 3246 pte_unmap_unlock(vmf->pte, vmf->ptl);
3204out: 3247out:
3205 fe->address = address; 3248 vmf->address = address;
3206 fe->pte = NULL; 3249 vmf->pte = NULL;
3207 return ret; 3250 return ret;
3208} 3251}
3209 3252
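The window arithmetic in do_fault_around() is easier to check with concrete numbers. A worked example, assuming 4 KiB pages, PTRS_PER_PTE == 512, and the default fault_around_bytes of 65536 defined a few hunks up; the fault address is purely illustrative:

/*
 * Worked example (illustrative): PAGE_SIZE == 4096, fault_around_bytes == 65536.
 *
 *   nr_pages      = 65536 >> PAGE_SHIFT           = 16
 *   mask          = ~(16 * 4096 - 1) & PAGE_MASK  = ~0xffff
 *   vmf->address  = max(0x7f0000012345 & ~0xffff, vma->vm_start)
 *                 = 0x7f0000010000 (if the VMA starts below that)
 *   off           = ((0x12345 - 0x10000) >> 12) & (PTRS_PER_PTE - 1) = 2
 *   start_pgoff  -= 2, so the window begins two pages before the faulting one,
 *   and end_pgoff is clamped by min3() so it never crosses the page table,
 *   the end of the VMA, or start_pgoff + 15.
 */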
3210static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) 3253static int do_read_fault(struct vm_fault *vmf)
3211{ 3254{
3212 struct vm_area_struct *vma = fe->vma; 3255 struct vm_area_struct *vma = vmf->vma;
3213 struct page *fault_page;
3214 int ret = 0; 3256 int ret = 0;
3215 3257
3216 /* 3258 /*
@@ -3219,80 +3261,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
3219 * something). 3261 * something).
3220 */ 3262 */
3221 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { 3263 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3222 ret = do_fault_around(fe, pgoff); 3264 ret = do_fault_around(vmf);
3223 if (ret) 3265 if (ret)
3224 return ret; 3266 return ret;
3225 } 3267 }
3226 3268
3227 ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); 3269 ret = __do_fault(vmf);
3228 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3270 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3229 return ret; 3271 return ret;
3230 3272
3231 ret |= alloc_set_pte(fe, NULL, fault_page); 3273 ret |= finish_fault(vmf);
3232 if (fe->pte) 3274 unlock_page(vmf->page);
3233 pte_unmap_unlock(fe->pte, fe->ptl);
3234 unlock_page(fault_page);
3235 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3275 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3236 put_page(fault_page); 3276 put_page(vmf->page);
3237 return ret; 3277 return ret;
3238} 3278}
3239 3279
3240static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) 3280static int do_cow_fault(struct vm_fault *vmf)
3241{ 3281{
3242 struct vm_area_struct *vma = fe->vma; 3282 struct vm_area_struct *vma = vmf->vma;
3243 struct page *fault_page, *new_page;
3244 void *fault_entry;
3245 struct mem_cgroup *memcg;
3246 int ret; 3283 int ret;
3247 3284
3248 if (unlikely(anon_vma_prepare(vma))) 3285 if (unlikely(anon_vma_prepare(vma)))
3249 return VM_FAULT_OOM; 3286 return VM_FAULT_OOM;
3250 3287
3251 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); 3288 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
3252 if (!new_page) 3289 if (!vmf->cow_page)
3253 return VM_FAULT_OOM; 3290 return VM_FAULT_OOM;
3254 3291
3255 if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, 3292 if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
3256 &memcg, false)) { 3293 &vmf->memcg, false)) {
3257 put_page(new_page); 3294 put_page(vmf->cow_page);
3258 return VM_FAULT_OOM; 3295 return VM_FAULT_OOM;
3259 } 3296 }
3260 3297
3261 ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); 3298 ret = __do_fault(vmf);
3262 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3299 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3263 goto uncharge_out; 3300 goto uncharge_out;
3301 if (ret & VM_FAULT_DONE_COW)
3302 return ret;
3264 3303
3265 if (!(ret & VM_FAULT_DAX_LOCKED)) 3304 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
3266 copy_user_highpage(new_page, fault_page, fe->address, vma); 3305 __SetPageUptodate(vmf->cow_page);
3267 __SetPageUptodate(new_page);
3268 3306
3269 ret |= alloc_set_pte(fe, memcg, new_page); 3307 ret |= finish_fault(vmf);
3270 if (fe->pte) 3308 unlock_page(vmf->page);
3271 pte_unmap_unlock(fe->pte, fe->ptl); 3309 put_page(vmf->page);
3272 if (!(ret & VM_FAULT_DAX_LOCKED)) {
3273 unlock_page(fault_page);
3274 put_page(fault_page);
3275 } else {
3276 dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
3277 }
3278 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3310 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3279 goto uncharge_out; 3311 goto uncharge_out;
3280 return ret; 3312 return ret;
3281uncharge_out: 3313uncharge_out:
3282 mem_cgroup_cancel_charge(new_page, memcg, false); 3314 mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
3283 put_page(new_page); 3315 put_page(vmf->cow_page);
3284 return ret; 3316 return ret;
3285} 3317}
3286 3318
3287static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) 3319static int do_shared_fault(struct vm_fault *vmf)
3288{ 3320{
3289 struct vm_area_struct *vma = fe->vma; 3321 struct vm_area_struct *vma = vmf->vma;
3290 struct page *fault_page;
3291 struct address_space *mapping;
3292 int dirtied = 0;
3293 int ret, tmp; 3322 int ret, tmp;
3294 3323
3295 ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); 3324 ret = __do_fault(vmf);
3296 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3325 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3297 return ret; 3326 return ret;
3298 3327
@@ -3301,46 +3330,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
3301 * about to become writable 3330 * about to become writable
3302 */ 3331 */
3303 if (vma->vm_ops->page_mkwrite) { 3332 if (vma->vm_ops->page_mkwrite) {
3304 unlock_page(fault_page); 3333 unlock_page(vmf->page);
3305 tmp = do_page_mkwrite(vma, fault_page, fe->address); 3334 tmp = do_page_mkwrite(vmf);
3306 if (unlikely(!tmp || 3335 if (unlikely(!tmp ||
3307 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 3336 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3308 put_page(fault_page); 3337 put_page(vmf->page);
3309 return tmp; 3338 return tmp;
3310 } 3339 }
3311 } 3340 }
3312 3341
3313 ret |= alloc_set_pte(fe, NULL, fault_page); 3342 ret |= finish_fault(vmf);
3314 if (fe->pte)
3315 pte_unmap_unlock(fe->pte, fe->ptl);
3316 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | 3343 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3317 VM_FAULT_RETRY))) { 3344 VM_FAULT_RETRY))) {
3318 unlock_page(fault_page); 3345 unlock_page(vmf->page);
3319 put_page(fault_page); 3346 put_page(vmf->page);
3320 return ret; 3347 return ret;
3321 } 3348 }
3322 3349
3323 if (set_page_dirty(fault_page)) 3350 fault_dirty_shared_page(vma, vmf->page);
3324 dirtied = 1;
3325 /*
3326 * Take a local copy of the address_space - page.mapping may be zeroed
3327 * by truncate after unlock_page(). The address_space itself remains
3328 * pinned by vma->vm_file's reference. We rely on unlock_page()'s
3329 * release semantics to prevent the compiler from undoing this copying.
3330 */
3331 mapping = page_rmapping(fault_page);
3332 unlock_page(fault_page);
3333 if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
3334 /*
3335 * Some device drivers do not set page.mapping but still
3336 * dirty their pages
3337 */
3338 balance_dirty_pages_ratelimited(mapping);
3339 }
3340
3341 if (!vma->vm_ops->page_mkwrite)
3342 file_update_time(vma->vm_file);
3343
3344 return ret; 3351 return ret;
3345} 3352}
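
An illustrative sketch, not part of this diff: roughly what a ->page_mkwrite handler looks like under the two-argument prototype that do_page_mkwrite() invokes from the shared-fault path above. Only vmf->page and the VM_FAULT_* return conventions are taken from this file; the truncation check and the storage-reservation step are hedged placeholders, not any particular filesystem's implementation.

static int example_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;

	lock_page(page);
	/* the fault ran unlocked, so the page may have been truncated meanwhile */
	if (page->mapping != file_inode(vma->vm_file)->i_mapping) {
		unlock_page(page);
		return VM_FAULT_NOPAGE;
	}
	/* ...reserve or allocate backing storage for the write here... */
	return VM_FAULT_LOCKED;	/* the page must stay locked on success */
}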
3346 3353
@@ -3350,19 +3357,18 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
3350 * The mmap_sem may have been released depending on flags and our 3357 * The mmap_sem may have been released depending on flags and our
3351 * return value. See filemap_fault() and __lock_page_or_retry(). 3358 * return value. See filemap_fault() and __lock_page_or_retry().
3352 */ 3359 */
3353static int do_fault(struct fault_env *fe) 3360static int do_fault(struct vm_fault *vmf)
3354{ 3361{
3355 struct vm_area_struct *vma = fe->vma; 3362 struct vm_area_struct *vma = vmf->vma;
3356 pgoff_t pgoff = linear_page_index(vma, fe->address);
3357 3363
3358 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ 3364 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
3359 if (!vma->vm_ops->fault) 3365 if (!vma->vm_ops->fault)
3360 return VM_FAULT_SIGBUS; 3366 return VM_FAULT_SIGBUS;
3361 if (!(fe->flags & FAULT_FLAG_WRITE)) 3367 if (!(vmf->flags & FAULT_FLAG_WRITE))
3362 return do_read_fault(fe, pgoff); 3368 return do_read_fault(vmf);
3363 if (!(vma->vm_flags & VM_SHARED)) 3369 if (!(vma->vm_flags & VM_SHARED))
3364 return do_cow_fault(fe, pgoff); 3370 return do_cow_fault(vmf);
3365 return do_shared_fault(fe, pgoff); 3371 return do_shared_fault(vmf);
3366} 3372}
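
For orientation, an illustrative sketch (not from this patch) of the ->fault handler that the three paths dispatched above ultimately reach through __do_fault(): its only obligation is to hand back a referenced, up-to-date page via vmf->page. The body below is a simplified assumption, not filemap_fault().

static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct page *page;

	/* find_get_page() returns the page with a reference the core code drops */
	page = find_get_page(mapping, vmf->pgoff);
	if (!page || !PageUptodate(page)) {
		/* a real handler would read the page in and retry instead */
		if (page)
			put_page(page);
		return VM_FAULT_SIGBUS;
	}

	vmf->page = page;
	return 0;	/* __do_fault() locks the page before it is mapped */
}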
3367 3373
3368static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3374static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3380,14 +3386,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3380 return mpol_misplaced(page, vma, addr); 3386 return mpol_misplaced(page, vma, addr);
3381} 3387}
3382 3388
3383static int do_numa_page(struct fault_env *fe, pte_t pte) 3389static int do_numa_page(struct vm_fault *vmf)
3384{ 3390{
3385 struct vm_area_struct *vma = fe->vma; 3391 struct vm_area_struct *vma = vmf->vma;
3386 struct page *page = NULL; 3392 struct page *page = NULL;
3387 int page_nid = -1; 3393 int page_nid = -1;
3388 int last_cpupid; 3394 int last_cpupid;
3389 int target_nid; 3395 int target_nid;
3390 bool migrated = false; 3396 bool migrated = false;
3397 pte_t pte = vmf->orig_pte;
3391 bool was_writable = pte_write(pte); 3398 bool was_writable = pte_write(pte);
3392 int flags = 0; 3399 int flags = 0;
3393 3400
@@ -3400,10 +3407,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
3400 * page table entry is not accessible, so there would be no 3407 * page table entry is not accessible, so there would be no
3401 * concurrent hardware modifications to the PTE. 3408 * concurrent hardware modifications to the PTE.
3402 */ 3409 */
3403 fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); 3410 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
3404 spin_lock(fe->ptl); 3411 spin_lock(vmf->ptl);
3405 if (unlikely(!pte_same(*fe->pte, pte))) { 3412 if (unlikely(!pte_same(*vmf->pte, pte))) {
3406 pte_unmap_unlock(fe->pte, fe->ptl); 3413 pte_unmap_unlock(vmf->pte, vmf->ptl);
3407 goto out; 3414 goto out;
3408 } 3415 }
3409 3416
@@ -3412,18 +3419,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
3412 pte = pte_mkyoung(pte); 3419 pte = pte_mkyoung(pte);
3413 if (was_writable) 3420 if (was_writable)
3414 pte = pte_mkwrite(pte); 3421 pte = pte_mkwrite(pte);
3415 set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); 3422 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3416 update_mmu_cache(vma, fe->address, fe->pte); 3423 update_mmu_cache(vma, vmf->address, vmf->pte);
3417 3424
3418 page = vm_normal_page(vma, fe->address, pte); 3425 page = vm_normal_page(vma, vmf->address, pte);
3419 if (!page) { 3426 if (!page) {
3420 pte_unmap_unlock(fe->pte, fe->ptl); 3427 pte_unmap_unlock(vmf->pte, vmf->ptl);
3421 return 0; 3428 return 0;
3422 } 3429 }
3423 3430
3424 /* TODO: handle PTE-mapped THP */ 3431 /* TODO: handle PTE-mapped THP */
3425 if (PageCompound(page)) { 3432 if (PageCompound(page)) {
3426 pte_unmap_unlock(fe->pte, fe->ptl); 3433 pte_unmap_unlock(vmf->pte, vmf->ptl);
3427 return 0; 3434 return 0;
3428 } 3435 }
3429 3436
@@ -3447,9 +3454,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
3447 3454
3448 last_cpupid = page_cpupid_last(page); 3455 last_cpupid = page_cpupid_last(page);
3449 page_nid = page_to_nid(page); 3456 page_nid = page_to_nid(page);
3450 target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, 3457 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
3451 &flags); 3458 &flags);
3452 pte_unmap_unlock(fe->pte, fe->ptl); 3459 pte_unmap_unlock(vmf->pte, vmf->ptl);
3453 if (target_nid == -1) { 3460 if (target_nid == -1) {
3454 put_page(page); 3461 put_page(page);
3455 goto out; 3462 goto out;
@@ -3469,28 +3476,28 @@ out:
3469 return 0; 3476 return 0;
3470} 3477}
3471 3478
3472static int create_huge_pmd(struct fault_env *fe) 3479static int create_huge_pmd(struct vm_fault *vmf)
3473{ 3480{
3474 struct vm_area_struct *vma = fe->vma; 3481 struct vm_area_struct *vma = vmf->vma;
3475 if (vma_is_anonymous(vma)) 3482 if (vma_is_anonymous(vma))
3476 return do_huge_pmd_anonymous_page(fe); 3483 return do_huge_pmd_anonymous_page(vmf);
3477 if (vma->vm_ops->pmd_fault) 3484 if (vma->vm_ops->pmd_fault)
3478 return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, 3485 return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
3479 fe->flags); 3486 vmf->flags);
3480 return VM_FAULT_FALLBACK; 3487 return VM_FAULT_FALLBACK;
3481} 3488}
3482 3489
3483static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) 3490static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
3484{ 3491{
3485 if (vma_is_anonymous(fe->vma)) 3492 if (vma_is_anonymous(vmf->vma))
3486 return do_huge_pmd_wp_page(fe, orig_pmd); 3493 return do_huge_pmd_wp_page(vmf, orig_pmd);
3487 if (fe->vma->vm_ops->pmd_fault) 3494 if (vmf->vma->vm_ops->pmd_fault)
3488 return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, 3495 return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
3489 fe->flags); 3496 vmf->pmd, vmf->flags);
3490 3497
3491 /* COW handled on pte level: split pmd */ 3498 /* COW handled on pte level: split pmd */
3492 VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); 3499 VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
3493 __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); 3500 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
3494 3501
3495 return VM_FAULT_FALLBACK; 3502 return VM_FAULT_FALLBACK;
3496} 3503}
@@ -3515,21 +3522,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
3515 * The mmap_sem may have been released depending on flags and our return value. 3522 * The mmap_sem may have been released depending on flags and our return value.
3516 * See filemap_fault() and __lock_page_or_retry(). 3523 * See filemap_fault() and __lock_page_or_retry().
3517 */ 3524 */
3518static int handle_pte_fault(struct fault_env *fe) 3525static int handle_pte_fault(struct vm_fault *vmf)
3519{ 3526{
3520 pte_t entry; 3527 pte_t entry;
3521 3528
3522 if (unlikely(pmd_none(*fe->pmd))) { 3529 if (unlikely(pmd_none(*vmf->pmd))) {
3523 /* 3530 /*
3524 * Leave __pte_alloc() until later: because vm_ops->fault may 3531 * Leave __pte_alloc() until later: because vm_ops->fault may
3525 * want to allocate huge page, and if we expose page table 3532 * want to allocate huge page, and if we expose page table
3526 * for an instant, it will be difficult to retract from 3533 * for an instant, it will be difficult to retract from
3527 * concurrent faults and from rmap lookups. 3534 * concurrent faults and from rmap lookups.
3528 */ 3535 */
3529 fe->pte = NULL; 3536 vmf->pte = NULL;
3530 } else { 3537 } else {
3531 /* See comment in pte_alloc_one_map() */ 3538 /* See comment in pte_alloc_one_map() */
3532 if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) 3539 if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
3533 return 0; 3540 return 0;
3534 /* 3541 /*
3535 * A regular pmd is established and it can't morph into a huge 3542 * A regular pmd is established and it can't morph into a huge
@@ -3537,9 +3544,8 @@ static int handle_pte_fault(struct fault_env *fe)
3537 * mmap_sem read mode and khugepaged takes it in write mode. 3544 * mmap_sem read mode and khugepaged takes it in write mode.
3538 * So now it's safe to run pte_offset_map(). 3545 * So now it's safe to run pte_offset_map().
3539 */ 3546 */
3540 fe->pte = pte_offset_map(fe->pmd, fe->address); 3547 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
3541 3548 vmf->orig_pte = *vmf->pte;
3542 entry = *fe->pte;
3543 3549
3544 /* 3550 /*
3545 * some architectures can have larger ptes than wordsize, 3551 * some architectures can have larger ptes than wordsize,
@@ -3550,38 +3556,39 @@ static int handle_pte_fault(struct fault_env *fe)
3550 * ptl lock held. So here a barrier will do. 3556 * ptl lock held. So here a barrier will do.
3551 */ 3557 */
3552 barrier(); 3558 barrier();
3553 if (pte_none(entry)) { 3559 if (pte_none(vmf->orig_pte)) {
3554 pte_unmap(fe->pte); 3560 pte_unmap(vmf->pte);
3555 fe->pte = NULL; 3561 vmf->pte = NULL;
3556 } 3562 }
3557 } 3563 }
3558 3564
3559 if (!fe->pte) { 3565 if (!vmf->pte) {
3560 if (vma_is_anonymous(fe->vma)) 3566 if (vma_is_anonymous(vmf->vma))
3561 return do_anonymous_page(fe); 3567 return do_anonymous_page(vmf);
3562 else 3568 else
3563 return do_fault(fe); 3569 return do_fault(vmf);
3564 } 3570 }
3565 3571
3566 if (!pte_present(entry)) 3572 if (!pte_present(vmf->orig_pte))
3567 return do_swap_page(fe, entry); 3573 return do_swap_page(vmf);
3568 3574
3569 if (pte_protnone(entry) && vma_is_accessible(fe->vma)) 3575 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
3570 return do_numa_page(fe, entry); 3576 return do_numa_page(vmf);
3571 3577
3572 fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); 3578 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
3573 spin_lock(fe->ptl); 3579 spin_lock(vmf->ptl);
3574 if (unlikely(!pte_same(*fe->pte, entry))) 3580 entry = vmf->orig_pte;
3581 if (unlikely(!pte_same(*vmf->pte, entry)))
3575 goto unlock; 3582 goto unlock;
3576 if (fe->flags & FAULT_FLAG_WRITE) { 3583 if (vmf->flags & FAULT_FLAG_WRITE) {
3577 if (!pte_write(entry)) 3584 if (!pte_write(entry))
3578 return do_wp_page(fe, entry); 3585 return do_wp_page(vmf);
3579 entry = pte_mkdirty(entry); 3586 entry = pte_mkdirty(entry);
3580 } 3587 }
3581 entry = pte_mkyoung(entry); 3588 entry = pte_mkyoung(entry);
3582 if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, 3589 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
3583 fe->flags & FAULT_FLAG_WRITE)) { 3590 vmf->flags & FAULT_FLAG_WRITE)) {
3584 update_mmu_cache(fe->vma, fe->address, fe->pte); 3591 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
3585 } else { 3592 } else {
3586 /* 3593 /*
3587 * This is needed only for protection faults but the arch code 3594 * This is needed only for protection faults but the arch code
@@ -3589,11 +3596,11 @@ static int handle_pte_fault(struct fault_env *fe)
3589 * This still avoids useless tlb flushes for .text page faults 3596 * This still avoids useless tlb flushes for .text page faults
3590 * with threads. 3597 * with threads.
3591 */ 3598 */
3592 if (fe->flags & FAULT_FLAG_WRITE) 3599 if (vmf->flags & FAULT_FLAG_WRITE)
3593 flush_tlb_fix_spurious_fault(fe->vma, fe->address); 3600 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
3594 } 3601 }
3595unlock: 3602unlock:
3596 pte_unmap_unlock(fe->pte, fe->ptl); 3603 pte_unmap_unlock(vmf->pte, vmf->ptl);
3597 return 0; 3604 return 0;
3598} 3605}
3599 3606
@@ -3606,10 +3613,12 @@ unlock:
3606static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, 3613static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3607 unsigned int flags) 3614 unsigned int flags)
3608{ 3615{
3609 struct fault_env fe = { 3616 struct vm_fault vmf = {
3610 .vma = vma, 3617 .vma = vma,
3611 .address = address, 3618 .address = address & PAGE_MASK,
3612 .flags = flags, 3619 .flags = flags,
3620 .pgoff = linear_page_index(vma, address),
3621 .gfp_mask = __get_fault_gfp_mask(vma),
3613 }; 3622 };
3614 struct mm_struct *mm = vma->vm_mm; 3623 struct mm_struct *mm = vma->vm_mm;
3615 pgd_t *pgd; 3624 pgd_t *pgd;
@@ -3619,35 +3628,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3619 pud = pud_alloc(mm, pgd, address); 3628 pud = pud_alloc(mm, pgd, address);
3620 if (!pud) 3629 if (!pud)
3621 return VM_FAULT_OOM; 3630 return VM_FAULT_OOM;
3622 fe.pmd = pmd_alloc(mm, pud, address); 3631 vmf.pmd = pmd_alloc(mm, pud, address);
3623 if (!fe.pmd) 3632 if (!vmf.pmd)
3624 return VM_FAULT_OOM; 3633 return VM_FAULT_OOM;
3625 if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { 3634 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
3626 int ret = create_huge_pmd(&fe); 3635 int ret = create_huge_pmd(&vmf);
3627 if (!(ret & VM_FAULT_FALLBACK)) 3636 if (!(ret & VM_FAULT_FALLBACK))
3628 return ret; 3637 return ret;
3629 } else { 3638 } else {
3630 pmd_t orig_pmd = *fe.pmd; 3639 pmd_t orig_pmd = *vmf.pmd;
3631 int ret; 3640 int ret;
3632 3641
3633 barrier(); 3642 barrier();
3634 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { 3643 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
3635 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) 3644 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
3636 return do_huge_pmd_numa_page(&fe, orig_pmd); 3645 return do_huge_pmd_numa_page(&vmf, orig_pmd);
3637 3646
3638 if ((fe.flags & FAULT_FLAG_WRITE) && 3647 if ((vmf.flags & FAULT_FLAG_WRITE) &&
3639 !pmd_write(orig_pmd)) { 3648 !pmd_write(orig_pmd)) {
3640 ret = wp_huge_pmd(&fe, orig_pmd); 3649 ret = wp_huge_pmd(&vmf, orig_pmd);
3641 if (!(ret & VM_FAULT_FALLBACK)) 3650 if (!(ret & VM_FAULT_FALLBACK))
3642 return ret; 3651 return ret;
3643 } else { 3652 } else {
3644 huge_pmd_set_accessed(&fe, orig_pmd); 3653 huge_pmd_set_accessed(&vmf, orig_pmd);
3645 return 0; 3654 return 0;
3646 } 3655 }
3647 } 3656 }
3648 } 3657 }
3649 3658
3650 return handle_pte_fault(&fe); 3659 return handle_pte_fault(&vmf);
3651} 3660}
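
An illustrative sketch, not taken from this patch: the usual way an architecture's fault handler reaches __handle_mm_fault() above, via handle_mm_fault(). The flag set and the error handling are simplified assumptions; real arch code raises the signals itself and handles stack growth, VM_FAULT_MAJOR accounting, and retry loops.

static int example_arch_fault(struct mm_struct *mm, unsigned long address, bool write)
{
	struct vm_area_struct *vma;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
	int fault;

	if (write)
		flags |= FAULT_FLAG_WRITE;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (!vma || vma->vm_start > address) {
		up_read(&mm->mmap_sem);
		return VM_FAULT_SIGSEGV;	/* simplified: no VM_GROWSDOWN handling */
	}

	fault = handle_mm_fault(vma, address, flags);
	/* on VM_FAULT_RETRY with FAULT_FLAG_ALLOW_RETRY the core dropped mmap_sem */
	if (!(fault & VM_FAULT_RETRY))
		up_read(&mm->mmap_sem);
	return fault;
}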
3652 3661
3653/* 3662/*
@@ -3808,8 +3817,8 @@ out:
3808 return -EINVAL; 3817 return -EINVAL;
3809} 3818}
3810 3819
3811static inline int follow_pte(struct mm_struct *mm, unsigned long address, 3820int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
3812 pte_t **ptepp, spinlock_t **ptlp) 3821 spinlock_t **ptlp)
3813{ 3822{
3814 int res; 3823 int res;
3815 3824
@@ -3919,7 +3928,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3919 struct page *page = NULL; 3928 struct page *page = NULL;
3920 3929
3921 ret = get_user_pages_remote(tsk, mm, addr, 1, 3930 ret = get_user_pages_remote(tsk, mm, addr, 1,
3922 gup_flags, &page, &vma); 3931 gup_flags, &page, &vma, NULL);
3923 if (ret <= 0) { 3932 if (ret <= 0) {
3924#ifndef CONFIG_HAVE_IOREMAP_PROT 3933#ifndef CONFIG_HAVE_IOREMAP_PROT
3925 break; 3934 break;