Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  859
1 file changed, 434 insertions, 425 deletions
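
For orientation: the hunks below convert mm/memory.c from the old struct fault_env to a single, extended struct vm_fault that is also handed to ->fault and ->page_mkwrite handlers. The sketch below lists only the fields these hunks actually reference, with paraphrased comments; it is an editorial aid, not the authoritative definition in include/linux/mm.h.

	struct vm_fault {
		struct vm_area_struct *vma;	/* target VMA */
		unsigned int flags;		/* FAULT_FLAG_xxx flags */
		gfp_t gfp_mask;			/* gfp mask for allocations */
		pgoff_t pgoff;			/* logical page offset in the VMA */
		unsigned long address;		/* faulting virtual address */
		pmd_t *pmd;			/* pmd entry covering 'address' */
		pte_t orig_pte;			/* PTE value at fault time */
		struct page *cow_page;		/* page prepared for a COW fault */
		struct mem_cgroup *memcg;	/* memcg the cow_page is charged to */
		struct page *page;		/* page returned by ->fault */
		pte_t *pte;			/* pte entry, valid only under ptl */
		spinlock_t *ptl;		/* page table lock */
		pgtable_t prealloc_pte;		/* pre-allocated page table */
	};
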
diff --git a/mm/memory.c b/mm/memory.c
index 08d8da39de28..455c3e628d52 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) | |||
2034 | * | 2034 | * |
2035 | * We do this without the lock held, so that it can sleep if it needs to. | 2035 | * We do this without the lock held, so that it can sleep if it needs to. |
2036 | */ | 2036 | */ |
2037 | static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | 2037 | static int do_page_mkwrite(struct vm_fault *vmf) |
2038 | unsigned long address) | ||
2039 | { | 2038 | { |
2040 | struct vm_fault vmf; | ||
2041 | int ret; | 2039 | int ret; |
2040 | struct page *page = vmf->page; | ||
2041 | unsigned int old_flags = vmf->flags; | ||
2042 | 2042 | ||
2043 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); | 2043 | vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; |
2044 | vmf.pgoff = page->index; | ||
2045 | vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; | ||
2046 | vmf.gfp_mask = __get_fault_gfp_mask(vma); | ||
2047 | vmf.page = page; | ||
2048 | vmf.cow_page = NULL; | ||
2049 | 2044 | ||
2050 | ret = vma->vm_ops->page_mkwrite(vma, &vmf); | 2045 | ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf); |
2046 | /* Restore original flags so that caller is not surprised */ | ||
2047 | vmf->flags = old_flags; | ||
2051 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) | 2048 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) |
2052 | return ret; | 2049 | return ret; |
2053 | if (unlikely(!(ret & VM_FAULT_LOCKED))) { | 2050 | if (unlikely(!(ret & VM_FAULT_LOCKED))) { |
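
With this change do_page_mkwrite() hands the core fault path's own struct vm_fault to the driver, temporarily switching vmf->flags to FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE and restoring them afterwards. A minimal sketch of a ->page_mkwrite handler against this interim two-argument signature; example_page_mkwrite and the reservation step are hypothetical, only the return-code convention follows the code above:

	/* Hypothetical ->page_mkwrite handler, for illustration only. */
	static int example_page_mkwrite(struct vm_area_struct *vma,
					struct vm_fault *vmf)
	{
		struct page *page = vmf->page;	/* filled in by the core fault path */

		lock_page(page);
		if (page->mapping != file_inode(vma->vm_file)->i_mapping) {
			/* Raced with truncate; let the caller retry the fault. */
			unlock_page(page);
			return VM_FAULT_NOPAGE;
		}
		/* ... reserve blocks / mark the page writable in the fs here ... */
		return VM_FAULT_LOCKED;	/* page stays locked; do_page_mkwrite() checks this bit */
	}
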
@@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | |||
2063 | } | 2060 | } |
2064 | 2061 | ||
2065 | /* | 2062 | /* |
2063 | * Handle dirtying of a page in shared file mapping on a write fault. | ||
2064 | * | ||
2065 | * The function expects the page to be locked and unlocks it. | ||
2066 | */ | ||
2067 | static void fault_dirty_shared_page(struct vm_area_struct *vma, | ||
2068 | struct page *page) | ||
2069 | { | ||
2070 | struct address_space *mapping; | ||
2071 | bool dirtied; | ||
2072 | bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; | ||
2073 | |||
2074 | dirtied = set_page_dirty(page); | ||
2075 | VM_BUG_ON_PAGE(PageAnon(page), page); | ||
2076 | /* | ||
2077 | * Take a local copy of the address_space - page.mapping may be zeroed | ||
2078 | * by truncate after unlock_page(). The address_space itself remains | ||
2079 | * pinned by vma->vm_file's reference. We rely on unlock_page()'s | ||
2080 | * release semantics to prevent the compiler from undoing this copying. | ||
2081 | */ | ||
2082 | mapping = page_rmapping(page); | ||
2083 | unlock_page(page); | ||
2084 | |||
2085 | if ((dirtied || page_mkwrite) && mapping) { | ||
2086 | /* | ||
2087 | * Some device drivers do not set page.mapping | ||
2088 | * but still dirty their pages | ||
2089 | */ | ||
2090 | balance_dirty_pages_ratelimited(mapping); | ||
2091 | } | ||
2092 | |||
2093 | if (!page_mkwrite) | ||
2094 | file_update_time(vma->vm_file); | ||
2095 | } | ||
2096 | |||
2097 | /* | ||
2066 | * Handle write page faults for pages that can be reused in the current vma | 2098 | * Handle write page faults for pages that can be reused in the current vma |
2067 | * | 2099 | * |
2068 | * This can happen either due to the mapping being with the VM_SHARED flag, | 2100 | * This can happen either due to the mapping being with the VM_SHARED flag, |
@@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | |||
2070 | * case, all we need to do here is to mark the page as writable and update | 2102 | * case, all we need to do here is to mark the page as writable and update |
2071 | * any related book-keeping. | 2103 | * any related book-keeping. |
2072 | */ | 2104 | */ |
2073 | static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, | 2105 | static inline void wp_page_reuse(struct vm_fault *vmf) |
2074 | struct page *page, int page_mkwrite, int dirty_shared) | 2106 | __releases(vmf->ptl) |
2075 | __releases(fe->ptl) | ||
2076 | { | 2107 | { |
2077 | struct vm_area_struct *vma = fe->vma; | 2108 | struct vm_area_struct *vma = vmf->vma; |
2109 | struct page *page = vmf->page; | ||
2078 | pte_t entry; | 2110 | pte_t entry; |
2079 | /* | 2111 | /* |
2080 | * Clear the pages cpupid information as the existing | 2112 | * Clear the pages cpupid information as the existing |
@@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, | |||
2084 | if (page) | 2116 | if (page) |
2085 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); | 2117 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); |
2086 | 2118 | ||
2087 | flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); | 2119 | flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); |
2088 | entry = pte_mkyoung(orig_pte); | 2120 | entry = pte_mkyoung(vmf->orig_pte); |
2089 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2121 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2090 | if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) | 2122 | if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) |
2091 | update_mmu_cache(vma, fe->address, fe->pte); | 2123 | update_mmu_cache(vma, vmf->address, vmf->pte); |
2092 | pte_unmap_unlock(fe->pte, fe->ptl); | 2124 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2093 | |||
2094 | if (dirty_shared) { | ||
2095 | struct address_space *mapping; | ||
2096 | int dirtied; | ||
2097 | |||
2098 | if (!page_mkwrite) | ||
2099 | lock_page(page); | ||
2100 | |||
2101 | dirtied = set_page_dirty(page); | ||
2102 | VM_BUG_ON_PAGE(PageAnon(page), page); | ||
2103 | mapping = page->mapping; | ||
2104 | unlock_page(page); | ||
2105 | put_page(page); | ||
2106 | |||
2107 | if ((dirtied || page_mkwrite) && mapping) { | ||
2108 | /* | ||
2109 | * Some device drivers do not set page.mapping | ||
2110 | * but still dirty their pages | ||
2111 | */ | ||
2112 | balance_dirty_pages_ratelimited(mapping); | ||
2113 | } | ||
2114 | |||
2115 | if (!page_mkwrite) | ||
2116 | file_update_time(vma->vm_file); | ||
2117 | } | ||
2118 | |||
2119 | return VM_FAULT_WRITE; | ||
2120 | } | 2125 | } |
2121 | 2126 | ||
2122 | /* | 2127 | /* |
@@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, | |||
2135 | * held to the old page, as well as updating the rmap. | 2140 | * held to the old page, as well as updating the rmap. |
2136 | * - In any case, unlock the PTL and drop the reference we took to the old page. | 2141 | * - In any case, unlock the PTL and drop the reference we took to the old page. |
2137 | */ | 2142 | */ |
2138 | static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | 2143 | static int wp_page_copy(struct vm_fault *vmf) |
2139 | struct page *old_page) | ||
2140 | { | 2144 | { |
2141 | struct vm_area_struct *vma = fe->vma; | 2145 | struct vm_area_struct *vma = vmf->vma; |
2142 | struct mm_struct *mm = vma->vm_mm; | 2146 | struct mm_struct *mm = vma->vm_mm; |
2147 | struct page *old_page = vmf->page; | ||
2143 | struct page *new_page = NULL; | 2148 | struct page *new_page = NULL; |
2144 | pte_t entry; | 2149 | pte_t entry; |
2145 | int page_copied = 0; | 2150 | int page_copied = 0; |
2146 | const unsigned long mmun_start = fe->address & PAGE_MASK; | 2151 | const unsigned long mmun_start = vmf->address & PAGE_MASK; |
2147 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; | 2152 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; |
2148 | struct mem_cgroup *memcg; | 2153 | struct mem_cgroup *memcg; |
2149 | 2154 | ||
2150 | if (unlikely(anon_vma_prepare(vma))) | 2155 | if (unlikely(anon_vma_prepare(vma))) |
2151 | goto oom; | 2156 | goto oom; |
2152 | 2157 | ||
2153 | if (is_zero_pfn(pte_pfn(orig_pte))) { | 2158 | if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { |
2154 | new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); | 2159 | new_page = alloc_zeroed_user_highpage_movable(vma, |
2160 | vmf->address); | ||
2155 | if (!new_page) | 2161 | if (!new_page) |
2156 | goto oom; | 2162 | goto oom; |
2157 | } else { | 2163 | } else { |
2158 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, | 2164 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, |
2159 | fe->address); | 2165 | vmf->address); |
2160 | if (!new_page) | 2166 | if (!new_page) |
2161 | goto oom; | 2167 | goto oom; |
2162 | cow_user_page(new_page, old_page, fe->address, vma); | 2168 | cow_user_page(new_page, old_page, vmf->address, vma); |
2163 | } | 2169 | } |
2164 | 2170 | ||
2165 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) | 2171 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) |
@@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
2172 | /* | 2178 | /* |
2173 | * Re-check the pte - we dropped the lock | 2179 | * Re-check the pte - we dropped the lock |
2174 | */ | 2180 | */ |
2175 | fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); | 2181 | vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); |
2176 | if (likely(pte_same(*fe->pte, orig_pte))) { | 2182 | if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { |
2177 | if (old_page) { | 2183 | if (old_page) { |
2178 | if (!PageAnon(old_page)) { | 2184 | if (!PageAnon(old_page)) { |
2179 | dec_mm_counter_fast(mm, | 2185 | dec_mm_counter_fast(mm, |
@@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
2183 | } else { | 2189 | } else { |
2184 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2190 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2185 | } | 2191 | } |
2186 | flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); | 2192 | flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); |
2187 | entry = mk_pte(new_page, vma->vm_page_prot); | 2193 | entry = mk_pte(new_page, vma->vm_page_prot); |
2188 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2194 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2189 | /* | 2195 | /* |
@@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
2192 | * seen in the presence of one thread doing SMC and another | 2198 | * seen in the presence of one thread doing SMC and another |
2193 | * thread doing COW. | 2199 | * thread doing COW. |
2194 | */ | 2200 | */ |
2195 | ptep_clear_flush_notify(vma, fe->address, fe->pte); | 2201 | ptep_clear_flush_notify(vma, vmf->address, vmf->pte); |
2196 | page_add_new_anon_rmap(new_page, vma, fe->address, false); | 2202 | page_add_new_anon_rmap(new_page, vma, vmf->address, false); |
2197 | mem_cgroup_commit_charge(new_page, memcg, false, false); | 2203 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
2198 | lru_cache_add_active_or_unevictable(new_page, vma); | 2204 | lru_cache_add_active_or_unevictable(new_page, vma); |
2199 | /* | 2205 | /* |
@@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
2201 | * mmu page tables (such as kvm shadow page tables), we want the | 2207 | * mmu page tables (such as kvm shadow page tables), we want the |
2202 | * new page to be mapped directly into the secondary page table. | 2208 | * new page to be mapped directly into the secondary page table. |
2203 | */ | 2209 | */ |
2204 | set_pte_at_notify(mm, fe->address, fe->pte, entry); | 2210 | set_pte_at_notify(mm, vmf->address, vmf->pte, entry); |
2205 | update_mmu_cache(vma, fe->address, fe->pte); | 2211 | update_mmu_cache(vma, vmf->address, vmf->pte); |
2206 | if (old_page) { | 2212 | if (old_page) { |
2207 | /* | 2213 | /* |
2208 | * Only after switching the pte to the new page may | 2214 | * Only after switching the pte to the new page may |
@@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
2239 | if (new_page) | 2245 | if (new_page) |
2240 | put_page(new_page); | 2246 | put_page(new_page); |
2241 | 2247 | ||
2242 | pte_unmap_unlock(fe->pte, fe->ptl); | 2248 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2243 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2249 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2244 | if (old_page) { | 2250 | if (old_page) { |
2245 | /* | 2251 | /* |
@@ -2263,79 +2269,91 @@ oom: | |||
2263 | return VM_FAULT_OOM; | 2269 | return VM_FAULT_OOM; |
2264 | } | 2270 | } |
2265 | 2271 | ||
2272 | /** | ||
2273 | * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE | ||
2274 | * writeable once the page is prepared | ||
2275 | * | ||
2276 | * @vmf: structure describing the fault | ||
2277 | * | ||
2278 | * This function handles all that is needed to finish a write page fault in a | ||
2279 | * shared mapping due to PTE being read-only once the mapped page is prepared. | ||
2280 | * It handles locking of PTE and modifying it. The function returns | ||
2281 | * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE | ||
2282 | * lock. | ||
2283 | * | ||
2284 | * The function expects the page to be locked or other protection against | ||
2285 | * concurrent faults / writeback (such as DAX radix tree locks). | ||
2286 | */ | ||
2287 | int finish_mkwrite_fault(struct vm_fault *vmf) | ||
2288 | { | ||
2289 | WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); | ||
2290 | vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, | ||
2291 | &vmf->ptl); | ||
2292 | /* | ||
2293 | * We might have raced with another page fault while we released the | ||
2294 | * pte_offset_map_lock. | ||
2295 | */ | ||
2296 | if (!pte_same(*vmf->pte, vmf->orig_pte)) { | ||
2297 | pte_unmap_unlock(vmf->pte, vmf->ptl); | ||
2298 | return VM_FAULT_NOPAGE; | ||
2299 | } | ||
2300 | wp_page_reuse(vmf); | ||
2301 | return 0; | ||
2302 | } | ||
2303 | |||
2266 | /* | 2304 | /* |
2267 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED | 2305 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED |
2268 | * mapping | 2306 | * mapping |
2269 | */ | 2307 | */ |
2270 | static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) | 2308 | static int wp_pfn_shared(struct vm_fault *vmf) |
2271 | { | 2309 | { |
2272 | struct vm_area_struct *vma = fe->vma; | 2310 | struct vm_area_struct *vma = vmf->vma; |
2273 | 2311 | ||
2274 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { | 2312 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { |
2275 | struct vm_fault vmf = { | ||
2276 | .page = NULL, | ||
2277 | .pgoff = linear_page_index(vma, fe->address), | ||
2278 | .virtual_address = | ||
2279 | (void __user *)(fe->address & PAGE_MASK), | ||
2280 | .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, | ||
2281 | }; | ||
2282 | int ret; | 2313 | int ret; |
2283 | 2314 | ||
2284 | pte_unmap_unlock(fe->pte, fe->ptl); | 2315 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2285 | ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); | 2316 | vmf->flags |= FAULT_FLAG_MKWRITE; |
2286 | if (ret & VM_FAULT_ERROR) | 2317 | ret = vma->vm_ops->pfn_mkwrite(vma, vmf); |
2318 | if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) | ||
2287 | return ret; | 2319 | return ret; |
2288 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2320 | return finish_mkwrite_fault(vmf); |
2289 | &fe->ptl); | ||
2290 | /* | ||
2291 | * We might have raced with another page fault while we | ||
2292 | * released the pte_offset_map_lock. | ||
2293 | */ | ||
2294 | if (!pte_same(*fe->pte, orig_pte)) { | ||
2295 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
2296 | return 0; | ||
2297 | } | ||
2298 | } | 2321 | } |
2299 | return wp_page_reuse(fe, orig_pte, NULL, 0, 0); | 2322 | wp_page_reuse(vmf); |
2323 | return VM_FAULT_WRITE; | ||
2300 | } | 2324 | } |
2301 | 2325 | ||
2302 | static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, | 2326 | static int wp_page_shared(struct vm_fault *vmf) |
2303 | struct page *old_page) | 2327 | __releases(vmf->ptl) |
2304 | __releases(fe->ptl) | ||
2305 | { | 2328 | { |
2306 | struct vm_area_struct *vma = fe->vma; | 2329 | struct vm_area_struct *vma = vmf->vma; |
2307 | int page_mkwrite = 0; | ||
2308 | 2330 | ||
2309 | get_page(old_page); | 2331 | get_page(vmf->page); |
2310 | 2332 | ||
2311 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 2333 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
2312 | int tmp; | 2334 | int tmp; |
2313 | 2335 | ||
2314 | pte_unmap_unlock(fe->pte, fe->ptl); | 2336 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2315 | tmp = do_page_mkwrite(vma, old_page, fe->address); | 2337 | tmp = do_page_mkwrite(vmf); |
2316 | if (unlikely(!tmp || (tmp & | 2338 | if (unlikely(!tmp || (tmp & |
2317 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 2339 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
2318 | put_page(old_page); | 2340 | put_page(vmf->page); |
2319 | return tmp; | 2341 | return tmp; |
2320 | } | 2342 | } |
2321 | /* | 2343 | tmp = finish_mkwrite_fault(vmf); |
2322 | * Since we dropped the lock we need to revalidate | 2344 | if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { |
2323 | * the PTE as someone else may have changed it. If | 2345 | unlock_page(vmf->page); |
2324 | * they did, we just return, as we can count on the | 2346 | put_page(vmf->page); |
2325 | * MMU to tell us if they didn't also make it writable. | 2347 | return tmp; |
2326 | */ | ||
2327 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | ||
2328 | &fe->ptl); | ||
2329 | if (!pte_same(*fe->pte, orig_pte)) { | ||
2330 | unlock_page(old_page); | ||
2331 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
2332 | put_page(old_page); | ||
2333 | return 0; | ||
2334 | } | 2348 | } |
2335 | page_mkwrite = 1; | 2349 | } else { |
2350 | wp_page_reuse(vmf); | ||
2351 | lock_page(vmf->page); | ||
2336 | } | 2352 | } |
2353 | fault_dirty_shared_page(vma, vmf->page); | ||
2354 | put_page(vmf->page); | ||
2337 | 2355 | ||
2338 | return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); | 2356 | return VM_FAULT_WRITE; |
2339 | } | 2357 | } |
2340 | 2358 | ||
2341 | /* | 2359 | /* |
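
The rewritten wp_pfn_shared() and wp_page_shared() above no longer open-code the PTE revalidation after dropping the PTE lock; both funnel through the new finish_mkwrite_fault(). From a driver's point of view the contract is: prepare the backing store, return 0 (or VM_FAULT_NOPAGE / an error), and let the core re-take the PTE lock. A hedged sketch of a ->pfn_mkwrite handler for a shared VM_PFNMAP/VM_MIXEDMAP mapping; example_pfn_mkwrite is an invented name, not part of the patch:

	/* Hypothetical ->pfn_mkwrite handler, for illustration only. */
	static int example_pfn_mkwrite(struct vm_area_struct *vma,
				       struct vm_fault *vmf)
	{
		/*
		 * wp_pfn_shared() has already dropped the PTE lock and set
		 * FAULT_FLAG_MKWRITE in vmf->flags before calling us.
		 */

		/*
		 * Make the pfn's backing store writable here; on failure
		 * return VM_FAULT_SIGBUS instead.
		 *
		 * Returning 0 makes the core call finish_mkwrite_fault(),
		 * which re-locks the page table, re-checks vmf->orig_pte and
		 * marks the PTE writable via wp_page_reuse().
		 */
		return 0;
	}
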
@@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, | |||
2356 | * but allow concurrent faults), with pte both mapped and locked. | 2374 | * but allow concurrent faults), with pte both mapped and locked. |
2357 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2375 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2358 | */ | 2376 | */ |
2359 | static int do_wp_page(struct fault_env *fe, pte_t orig_pte) | 2377 | static int do_wp_page(struct vm_fault *vmf) |
2360 | __releases(fe->ptl) | 2378 | __releases(vmf->ptl) |
2361 | { | 2379 | { |
2362 | struct vm_area_struct *vma = fe->vma; | 2380 | struct vm_area_struct *vma = vmf->vma; |
2363 | struct page *old_page; | ||
2364 | 2381 | ||
2365 | old_page = vm_normal_page(vma, fe->address, orig_pte); | 2382 | vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); |
2366 | if (!old_page) { | 2383 | if (!vmf->page) { |
2367 | /* | 2384 | /* |
2368 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a | 2385 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a |
2369 | * VM_PFNMAP VMA. | 2386 | * VM_PFNMAP VMA. |
@@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) | |||
2373 | */ | 2390 | */ |
2374 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2391 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2375 | (VM_WRITE|VM_SHARED)) | 2392 | (VM_WRITE|VM_SHARED)) |
2376 | return wp_pfn_shared(fe, orig_pte); | 2393 | return wp_pfn_shared(vmf); |
2377 | 2394 | ||
2378 | pte_unmap_unlock(fe->pte, fe->ptl); | 2395 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2379 | return wp_page_copy(fe, orig_pte, old_page); | 2396 | return wp_page_copy(vmf); |
2380 | } | 2397 | } |
2381 | 2398 | ||
2382 | /* | 2399 | /* |
2383 | * Take out anonymous pages first, anonymous shared vmas are | 2400 | * Take out anonymous pages first, anonymous shared vmas are |
2384 | * not dirty accountable. | 2401 | * not dirty accountable. |
2385 | */ | 2402 | */ |
2386 | if (PageAnon(old_page) && !PageKsm(old_page)) { | 2403 | if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { |
2387 | int total_mapcount; | 2404 | int total_mapcount; |
2388 | if (!trylock_page(old_page)) { | 2405 | if (!trylock_page(vmf->page)) { |
2389 | get_page(old_page); | 2406 | get_page(vmf->page); |
2390 | pte_unmap_unlock(fe->pte, fe->ptl); | 2407 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2391 | lock_page(old_page); | 2408 | lock_page(vmf->page); |
2392 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, | 2409 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, |
2393 | fe->address, &fe->ptl); | 2410 | vmf->address, &vmf->ptl); |
2394 | if (!pte_same(*fe->pte, orig_pte)) { | 2411 | if (!pte_same(*vmf->pte, vmf->orig_pte)) { |
2395 | unlock_page(old_page); | 2412 | unlock_page(vmf->page); |
2396 | pte_unmap_unlock(fe->pte, fe->ptl); | 2413 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2397 | put_page(old_page); | 2414 | put_page(vmf->page); |
2398 | return 0; | 2415 | return 0; |
2399 | } | 2416 | } |
2400 | put_page(old_page); | 2417 | put_page(vmf->page); |
2401 | } | 2418 | } |
2402 | if (reuse_swap_page(old_page, &total_mapcount)) { | 2419 | if (reuse_swap_page(vmf->page, &total_mapcount)) { |
2403 | if (total_mapcount == 1) { | 2420 | if (total_mapcount == 1) { |
2404 | /* | 2421 | /* |
2405 | * The page is all ours. Move it to | 2422 | * The page is all ours. Move it to |
@@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) | |||
2408 | * Protected against the rmap code by | 2425 | * Protected against the rmap code by |
2409 | * the page lock. | 2426 | * the page lock. |
2410 | */ | 2427 | */ |
2411 | page_move_anon_rmap(old_page, vma); | 2428 | page_move_anon_rmap(vmf->page, vma); |
2412 | } | 2429 | } |
2413 | unlock_page(old_page); | 2430 | unlock_page(vmf->page); |
2414 | return wp_page_reuse(fe, orig_pte, old_page, 0, 0); | 2431 | wp_page_reuse(vmf); |
2432 | return VM_FAULT_WRITE; | ||
2415 | } | 2433 | } |
2416 | unlock_page(old_page); | 2434 | unlock_page(vmf->page); |
2417 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2435 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2418 | (VM_WRITE|VM_SHARED))) { | 2436 | (VM_WRITE|VM_SHARED))) { |
2419 | return wp_page_shared(fe, orig_pte, old_page); | 2437 | return wp_page_shared(vmf); |
2420 | } | 2438 | } |
2421 | 2439 | ||
2422 | /* | 2440 | /* |
2423 | * Ok, we need to copy. Oh, well.. | 2441 | * Ok, we need to copy. Oh, well.. |
2424 | */ | 2442 | */ |
2425 | get_page(old_page); | 2443 | get_page(vmf->page); |
2426 | 2444 | ||
2427 | pte_unmap_unlock(fe->pte, fe->ptl); | 2445 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2428 | return wp_page_copy(fe, orig_pte, old_page); | 2446 | return wp_page_copy(vmf); |
2429 | } | 2447 | } |
2430 | 2448 | ||
2431 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, | 2449 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, |
@@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
2513 | * We return with the mmap_sem locked or unlocked in the same cases | 2531 | * We return with the mmap_sem locked or unlocked in the same cases |
2514 | * as does filemap_fault(). | 2532 | * as does filemap_fault(). |
2515 | */ | 2533 | */ |
2516 | int do_swap_page(struct fault_env *fe, pte_t orig_pte) | 2534 | int do_swap_page(struct vm_fault *vmf) |
2517 | { | 2535 | { |
2518 | struct vm_area_struct *vma = fe->vma; | 2536 | struct vm_area_struct *vma = vmf->vma; |
2519 | struct page *page, *swapcache; | 2537 | struct page *page, *swapcache; |
2520 | struct mem_cgroup *memcg; | 2538 | struct mem_cgroup *memcg; |
2521 | swp_entry_t entry; | 2539 | swp_entry_t entry; |
@@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2524 | int exclusive = 0; | 2542 | int exclusive = 0; |
2525 | int ret = 0; | 2543 | int ret = 0; |
2526 | 2544 | ||
2527 | if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) | 2545 | if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) |
2528 | goto out; | 2546 | goto out; |
2529 | 2547 | ||
2530 | entry = pte_to_swp_entry(orig_pte); | 2548 | entry = pte_to_swp_entry(vmf->orig_pte); |
2531 | if (unlikely(non_swap_entry(entry))) { | 2549 | if (unlikely(non_swap_entry(entry))) { |
2532 | if (is_migration_entry(entry)) { | 2550 | if (is_migration_entry(entry)) { |
2533 | migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); | 2551 | migration_entry_wait(vma->vm_mm, vmf->pmd, |
2552 | vmf->address); | ||
2534 | } else if (is_hwpoison_entry(entry)) { | 2553 | } else if (is_hwpoison_entry(entry)) { |
2535 | ret = VM_FAULT_HWPOISON; | 2554 | ret = VM_FAULT_HWPOISON; |
2536 | } else { | 2555 | } else { |
2537 | print_bad_pte(vma, fe->address, orig_pte, NULL); | 2556 | print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); |
2538 | ret = VM_FAULT_SIGBUS; | 2557 | ret = VM_FAULT_SIGBUS; |
2539 | } | 2558 | } |
2540 | goto out; | 2559 | goto out; |
@@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2542 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2561 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
2543 | page = lookup_swap_cache(entry); | 2562 | page = lookup_swap_cache(entry); |
2544 | if (!page) { | 2563 | if (!page) { |
2545 | page = swapin_readahead(entry, | 2564 | page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, |
2546 | GFP_HIGHUSER_MOVABLE, vma, fe->address); | 2565 | vmf->address); |
2547 | if (!page) { | 2566 | if (!page) { |
2548 | /* | 2567 | /* |
2549 | * Back out if somebody else faulted in this pte | 2568 | * Back out if somebody else faulted in this pte |
2550 | * while we released the pte lock. | 2569 | * while we released the pte lock. |
2551 | */ | 2570 | */ |
2552 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, | 2571 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, |
2553 | fe->address, &fe->ptl); | 2572 | vmf->address, &vmf->ptl); |
2554 | if (likely(pte_same(*fe->pte, orig_pte))) | 2573 | if (likely(pte_same(*vmf->pte, vmf->orig_pte))) |
2555 | ret = VM_FAULT_OOM; | 2574 | ret = VM_FAULT_OOM; |
2556 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2575 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2557 | goto unlock; | 2576 | goto unlock; |
@@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2573 | } | 2592 | } |
2574 | 2593 | ||
2575 | swapcache = page; | 2594 | swapcache = page; |
2576 | locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); | 2595 | locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); |
2577 | 2596 | ||
2578 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2597 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2579 | if (!locked) { | 2598 | if (!locked) { |
@@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2590 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) | 2609 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) |
2591 | goto out_page; | 2610 | goto out_page; |
2592 | 2611 | ||
2593 | page = ksm_might_need_to_copy(page, vma, fe->address); | 2612 | page = ksm_might_need_to_copy(page, vma, vmf->address); |
2594 | if (unlikely(!page)) { | 2613 | if (unlikely(!page)) { |
2595 | ret = VM_FAULT_OOM; | 2614 | ret = VM_FAULT_OOM; |
2596 | page = swapcache; | 2615 | page = swapcache; |
@@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2606 | /* | 2625 | /* |
2607 | * Back out if somebody else already faulted in this pte. | 2626 | * Back out if somebody else already faulted in this pte. |
2608 | */ | 2627 | */ |
2609 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2628 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, |
2610 | &fe->ptl); | 2629 | &vmf->ptl); |
2611 | if (unlikely(!pte_same(*fe->pte, orig_pte))) | 2630 | if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) |
2612 | goto out_nomap; | 2631 | goto out_nomap; |
2613 | 2632 | ||
2614 | if (unlikely(!PageUptodate(page))) { | 2633 | if (unlikely(!PageUptodate(page))) { |
@@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2629 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2648 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2630 | dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); | 2649 | dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); |
2631 | pte = mk_pte(page, vma->vm_page_prot); | 2650 | pte = mk_pte(page, vma->vm_page_prot); |
2632 | if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { | 2651 | if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { |
2633 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2652 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
2634 | fe->flags &= ~FAULT_FLAG_WRITE; | 2653 | vmf->flags &= ~FAULT_FLAG_WRITE; |
2635 | ret |= VM_FAULT_WRITE; | 2654 | ret |= VM_FAULT_WRITE; |
2636 | exclusive = RMAP_EXCLUSIVE; | 2655 | exclusive = RMAP_EXCLUSIVE; |
2637 | } | 2656 | } |
2638 | flush_icache_page(vma, page); | 2657 | flush_icache_page(vma, page); |
2639 | if (pte_swp_soft_dirty(orig_pte)) | 2658 | if (pte_swp_soft_dirty(vmf->orig_pte)) |
2640 | pte = pte_mksoft_dirty(pte); | 2659 | pte = pte_mksoft_dirty(pte); |
2641 | set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); | 2660 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); |
2661 | vmf->orig_pte = pte; | ||
2642 | if (page == swapcache) { | 2662 | if (page == swapcache) { |
2643 | do_page_add_anon_rmap(page, vma, fe->address, exclusive); | 2663 | do_page_add_anon_rmap(page, vma, vmf->address, exclusive); |
2644 | mem_cgroup_commit_charge(page, memcg, true, false); | 2664 | mem_cgroup_commit_charge(page, memcg, true, false); |
2645 | activate_page(page); | 2665 | activate_page(page); |
2646 | } else { /* ksm created a completely new copy */ | 2666 | } else { /* ksm created a completely new copy */ |
2647 | page_add_new_anon_rmap(page, vma, fe->address, false); | 2667 | page_add_new_anon_rmap(page, vma, vmf->address, false); |
2648 | mem_cgroup_commit_charge(page, memcg, false, false); | 2668 | mem_cgroup_commit_charge(page, memcg, false, false); |
2649 | lru_cache_add_active_or_unevictable(page, vma); | 2669 | lru_cache_add_active_or_unevictable(page, vma); |
2650 | } | 2670 | } |
@@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2667 | put_page(swapcache); | 2687 | put_page(swapcache); |
2668 | } | 2688 | } |
2669 | 2689 | ||
2670 | if (fe->flags & FAULT_FLAG_WRITE) { | 2690 | if (vmf->flags & FAULT_FLAG_WRITE) { |
2671 | ret |= do_wp_page(fe, pte); | 2691 | ret |= do_wp_page(vmf); |
2672 | if (ret & VM_FAULT_ERROR) | 2692 | if (ret & VM_FAULT_ERROR) |
2673 | ret &= VM_FAULT_ERROR; | 2693 | ret &= VM_FAULT_ERROR; |
2674 | goto out; | 2694 | goto out; |
2675 | } | 2695 | } |
2676 | 2696 | ||
2677 | /* No need to invalidate - it was non-present before */ | 2697 | /* No need to invalidate - it was non-present before */ |
2678 | update_mmu_cache(vma, fe->address, fe->pte); | 2698 | update_mmu_cache(vma, vmf->address, vmf->pte); |
2679 | unlock: | 2699 | unlock: |
2680 | pte_unmap_unlock(fe->pte, fe->ptl); | 2700 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2681 | out: | 2701 | out: |
2682 | return ret; | 2702 | return ret; |
2683 | out_nomap: | 2703 | out_nomap: |
2684 | mem_cgroup_cancel_charge(page, memcg, false); | 2704 | mem_cgroup_cancel_charge(page, memcg, false); |
2685 | pte_unmap_unlock(fe->pte, fe->ptl); | 2705 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2686 | out_page: | 2706 | out_page: |
2687 | unlock_page(page); | 2707 | unlock_page(page); |
2688 | out_release: | 2708 | out_release: |
@@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo | |||
2733 | * but allow concurrent faults), and pte mapped but not yet locked. | 2753 | * but allow concurrent faults), and pte mapped but not yet locked. |
2734 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2754 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2735 | */ | 2755 | */ |
2736 | static int do_anonymous_page(struct fault_env *fe) | 2756 | static int do_anonymous_page(struct vm_fault *vmf) |
2737 | { | 2757 | { |
2738 | struct vm_area_struct *vma = fe->vma; | 2758 | struct vm_area_struct *vma = vmf->vma; |
2739 | struct mem_cgroup *memcg; | 2759 | struct mem_cgroup *memcg; |
2740 | struct page *page; | 2760 | struct page *page; |
2741 | pte_t entry; | 2761 | pte_t entry; |
@@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe) | |||
2745 | return VM_FAULT_SIGBUS; | 2765 | return VM_FAULT_SIGBUS; |
2746 | 2766 | ||
2747 | /* Check if we need to add a guard page to the stack */ | 2767 | /* Check if we need to add a guard page to the stack */ |
2748 | if (check_stack_guard_page(vma, fe->address) < 0) | 2768 | if (check_stack_guard_page(vma, vmf->address) < 0) |
2749 | return VM_FAULT_SIGSEGV; | 2769 | return VM_FAULT_SIGSEGV; |
2750 | 2770 | ||
2751 | /* | 2771 | /* |
@@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe) | |||
2758 | * | 2778 | * |
2759 | * Here we only have down_read(mmap_sem). | 2779 | * Here we only have down_read(mmap_sem). |
2760 | */ | 2780 | */ |
2761 | if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) | 2781 | if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) |
2762 | return VM_FAULT_OOM; | 2782 | return VM_FAULT_OOM; |
2763 | 2783 | ||
2764 | /* See the comment in pte_alloc_one_map() */ | 2784 | /* See the comment in pte_alloc_one_map() */ |
2765 | if (unlikely(pmd_trans_unstable(fe->pmd))) | 2785 | if (unlikely(pmd_trans_unstable(vmf->pmd))) |
2766 | return 0; | 2786 | return 0; |
2767 | 2787 | ||
2768 | /* Use the zero-page for reads */ | 2788 | /* Use the zero-page for reads */ |
2769 | if (!(fe->flags & FAULT_FLAG_WRITE) && | 2789 | if (!(vmf->flags & FAULT_FLAG_WRITE) && |
2770 | !mm_forbids_zeropage(vma->vm_mm)) { | 2790 | !mm_forbids_zeropage(vma->vm_mm)) { |
2771 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), | 2791 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), |
2772 | vma->vm_page_prot)); | 2792 | vma->vm_page_prot)); |
2773 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2793 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, |
2774 | &fe->ptl); | 2794 | vmf->address, &vmf->ptl); |
2775 | if (!pte_none(*fe->pte)) | 2795 | if (!pte_none(*vmf->pte)) |
2776 | goto unlock; | 2796 | goto unlock; |
2777 | /* Deliver the page fault to userland, check inside PT lock */ | 2797 | /* Deliver the page fault to userland, check inside PT lock */ |
2778 | if (userfaultfd_missing(vma)) { | 2798 | if (userfaultfd_missing(vma)) { |
2779 | pte_unmap_unlock(fe->pte, fe->ptl); | 2799 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2780 | return handle_userfault(fe, VM_UFFD_MISSING); | 2800 | return handle_userfault(vmf, VM_UFFD_MISSING); |
2781 | } | 2801 | } |
2782 | goto setpte; | 2802 | goto setpte; |
2783 | } | 2803 | } |
@@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe) | |||
2785 | /* Allocate our own private page. */ | 2805 | /* Allocate our own private page. */ |
2786 | if (unlikely(anon_vma_prepare(vma))) | 2806 | if (unlikely(anon_vma_prepare(vma))) |
2787 | goto oom; | 2807 | goto oom; |
2788 | page = alloc_zeroed_user_highpage_movable(vma, fe->address); | 2808 | page = alloc_zeroed_user_highpage_movable(vma, vmf->address); |
2789 | if (!page) | 2809 | if (!page) |
2790 | goto oom; | 2810 | goto oom; |
2791 | 2811 | ||
@@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe) | |||
2803 | if (vma->vm_flags & VM_WRITE) | 2823 | if (vma->vm_flags & VM_WRITE) |
2804 | entry = pte_mkwrite(pte_mkdirty(entry)); | 2824 | entry = pte_mkwrite(pte_mkdirty(entry)); |
2805 | 2825 | ||
2806 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2826 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, |
2807 | &fe->ptl); | 2827 | &vmf->ptl); |
2808 | if (!pte_none(*fe->pte)) | 2828 | if (!pte_none(*vmf->pte)) |
2809 | goto release; | 2829 | goto release; |
2810 | 2830 | ||
2811 | /* Deliver the page fault to userland, check inside PT lock */ | 2831 | /* Deliver the page fault to userland, check inside PT lock */ |
2812 | if (userfaultfd_missing(vma)) { | 2832 | if (userfaultfd_missing(vma)) { |
2813 | pte_unmap_unlock(fe->pte, fe->ptl); | 2833 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2814 | mem_cgroup_cancel_charge(page, memcg, false); | 2834 | mem_cgroup_cancel_charge(page, memcg, false); |
2815 | put_page(page); | 2835 | put_page(page); |
2816 | return handle_userfault(fe, VM_UFFD_MISSING); | 2836 | return handle_userfault(vmf, VM_UFFD_MISSING); |
2817 | } | 2837 | } |
2818 | 2838 | ||
2819 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2839 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2820 | page_add_new_anon_rmap(page, vma, fe->address, false); | 2840 | page_add_new_anon_rmap(page, vma, vmf->address, false); |
2821 | mem_cgroup_commit_charge(page, memcg, false, false); | 2841 | mem_cgroup_commit_charge(page, memcg, false, false); |
2822 | lru_cache_add_active_or_unevictable(page, vma); | 2842 | lru_cache_add_active_or_unevictable(page, vma); |
2823 | setpte: | 2843 | setpte: |
2824 | set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); | 2844 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); |
2825 | 2845 | ||
2826 | /* No need to invalidate - it was non-present before */ | 2846 | /* No need to invalidate - it was non-present before */ |
2827 | update_mmu_cache(vma, fe->address, fe->pte); | 2847 | update_mmu_cache(vma, vmf->address, vmf->pte); |
2828 | unlock: | 2848 | unlock: |
2829 | pte_unmap_unlock(fe->pte, fe->ptl); | 2849 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2830 | return 0; | 2850 | return 0; |
2831 | release: | 2851 | release: |
2832 | mem_cgroup_cancel_charge(page, memcg, false); | 2852 | mem_cgroup_cancel_charge(page, memcg, false); |
@@ -2843,62 +2863,50 @@ oom: | |||
2843 | * released depending on flags and vma->vm_ops->fault() return value. | 2863 | * released depending on flags and vma->vm_ops->fault() return value. |
2844 | * See filemap_fault() and __lock_page_retry(). | 2864 | * See filemap_fault() and __lock_page_retry(). |
2845 | */ | 2865 | */ |
2846 | static int __do_fault(struct fault_env *fe, pgoff_t pgoff, | 2866 | static int __do_fault(struct vm_fault *vmf) |
2847 | struct page *cow_page, struct page **page, void **entry) | ||
2848 | { | 2867 | { |
2849 | struct vm_area_struct *vma = fe->vma; | 2868 | struct vm_area_struct *vma = vmf->vma; |
2850 | struct vm_fault vmf; | ||
2851 | int ret; | 2869 | int ret; |
2852 | 2870 | ||
2853 | vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); | 2871 | ret = vma->vm_ops->fault(vma, vmf); |
2854 | vmf.pgoff = pgoff; | 2872 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | |
2855 | vmf.flags = fe->flags; | 2873 | VM_FAULT_DONE_COW))) |
2856 | vmf.page = NULL; | ||
2857 | vmf.gfp_mask = __get_fault_gfp_mask(vma); | ||
2858 | vmf.cow_page = cow_page; | ||
2859 | |||
2860 | ret = vma->vm_ops->fault(vma, &vmf); | ||
2861 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | ||
2862 | return ret; | ||
2863 | if (ret & VM_FAULT_DAX_LOCKED) { | ||
2864 | *entry = vmf.entry; | ||
2865 | return ret; | 2874 | return ret; |
2866 | } | ||
2867 | 2875 | ||
2868 | if (unlikely(PageHWPoison(vmf.page))) { | 2876 | if (unlikely(PageHWPoison(vmf->page))) { |
2869 | if (ret & VM_FAULT_LOCKED) | 2877 | if (ret & VM_FAULT_LOCKED) |
2870 | unlock_page(vmf.page); | 2878 | unlock_page(vmf->page); |
2871 | put_page(vmf.page); | 2879 | put_page(vmf->page); |
2880 | vmf->page = NULL; | ||
2872 | return VM_FAULT_HWPOISON; | 2881 | return VM_FAULT_HWPOISON; |
2873 | } | 2882 | } |
2874 | 2883 | ||
2875 | if (unlikely(!(ret & VM_FAULT_LOCKED))) | 2884 | if (unlikely(!(ret & VM_FAULT_LOCKED))) |
2876 | lock_page(vmf.page); | 2885 | lock_page(vmf->page); |
2877 | else | 2886 | else |
2878 | VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); | 2887 | VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); |
2879 | 2888 | ||
2880 | *page = vmf.page; | ||
2881 | return ret; | 2889 | return ret; |
2882 | } | 2890 | } |
2883 | 2891 | ||
2884 | static int pte_alloc_one_map(struct fault_env *fe) | 2892 | static int pte_alloc_one_map(struct vm_fault *vmf) |
2885 | { | 2893 | { |
2886 | struct vm_area_struct *vma = fe->vma; | 2894 | struct vm_area_struct *vma = vmf->vma; |
2887 | 2895 | ||
2888 | if (!pmd_none(*fe->pmd)) | 2896 | if (!pmd_none(*vmf->pmd)) |
2889 | goto map_pte; | 2897 | goto map_pte; |
2890 | if (fe->prealloc_pte) { | 2898 | if (vmf->prealloc_pte) { |
2891 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 2899 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
2892 | if (unlikely(!pmd_none(*fe->pmd))) { | 2900 | if (unlikely(!pmd_none(*vmf->pmd))) { |
2893 | spin_unlock(fe->ptl); | 2901 | spin_unlock(vmf->ptl); |
2894 | goto map_pte; | 2902 | goto map_pte; |
2895 | } | 2903 | } |
2896 | 2904 | ||
2897 | atomic_long_inc(&vma->vm_mm->nr_ptes); | 2905 | atomic_long_inc(&vma->vm_mm->nr_ptes); |
2898 | pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); | 2906 | pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); |
2899 | spin_unlock(fe->ptl); | 2907 | spin_unlock(vmf->ptl); |
2900 | fe->prealloc_pte = 0; | 2908 | vmf->prealloc_pte = 0; |
2901 | } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { | 2909 | } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { |
2902 | return VM_FAULT_OOM; | 2910 | return VM_FAULT_OOM; |
2903 | } | 2911 | } |
2904 | map_pte: | 2912 | map_pte: |
@@ -2913,11 +2921,11 @@ map_pte: | |||
2913 | * through an atomic read in C, which is what pmd_trans_unstable() | 2921 | * through an atomic read in C, which is what pmd_trans_unstable() |
2914 | * provides. | 2922 | * provides. |
2915 | */ | 2923 | */ |
2916 | if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) | 2924 | if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) |
2917 | return VM_FAULT_NOPAGE; | 2925 | return VM_FAULT_NOPAGE; |
2918 | 2926 | ||
2919 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2927 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, |
2920 | &fe->ptl); | 2928 | &vmf->ptl); |
2921 | return 0; | 2929 | return 0; |
2922 | } | 2930 | } |
2923 | 2931 | ||
@@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, | |||
2935 | return true; | 2943 | return true; |
2936 | } | 2944 | } |
2937 | 2945 | ||
2938 | static void deposit_prealloc_pte(struct fault_env *fe) | 2946 | static void deposit_prealloc_pte(struct vm_fault *vmf) |
2939 | { | 2947 | { |
2940 | struct vm_area_struct *vma = fe->vma; | 2948 | struct vm_area_struct *vma = vmf->vma; |
2941 | 2949 | ||
2942 | pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); | 2950 | pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); |
2943 | /* | 2951 | /* |
2944 | * We are going to consume the prealloc table, | 2952 | * We are going to consume the prealloc table, |
2945 | * count that as nr_ptes. | 2953 | * count that as nr_ptes. |
2946 | */ | 2954 | */ |
2947 | atomic_long_inc(&vma->vm_mm->nr_ptes); | 2955 | atomic_long_inc(&vma->vm_mm->nr_ptes); |
2948 | fe->prealloc_pte = 0; | 2956 | vmf->prealloc_pte = 0; |
2949 | } | 2957 | } |
2950 | 2958 | ||
2951 | static int do_set_pmd(struct fault_env *fe, struct page *page) | 2959 | static int do_set_pmd(struct vm_fault *vmf, struct page *page) |
2952 | { | 2960 | { |
2953 | struct vm_area_struct *vma = fe->vma; | 2961 | struct vm_area_struct *vma = vmf->vma; |
2954 | bool write = fe->flags & FAULT_FLAG_WRITE; | 2962 | bool write = vmf->flags & FAULT_FLAG_WRITE; |
2955 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | 2963 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
2956 | pmd_t entry; | 2964 | pmd_t entry; |
2957 | int i, ret; | 2965 | int i, ret; |
2958 | 2966 | ||
@@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) | |||
2966 | * Archs like ppc64 need additonal space to store information | 2974 | * Archs like ppc64 need additonal space to store information |
2967 | * related to pte entry. Use the preallocated table for that. | 2975 | * related to pte entry. Use the preallocated table for that. |
2968 | */ | 2976 | */ |
2969 | if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { | 2977 | if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { |
2970 | fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); | 2978 | vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); |
2971 | if (!fe->prealloc_pte) | 2979 | if (!vmf->prealloc_pte) |
2972 | return VM_FAULT_OOM; | 2980 | return VM_FAULT_OOM; |
2973 | smp_wmb(); /* See comment in __pte_alloc() */ | 2981 | smp_wmb(); /* See comment in __pte_alloc() */ |
2974 | } | 2982 | } |
2975 | 2983 | ||
2976 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 2984 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
2977 | if (unlikely(!pmd_none(*fe->pmd))) | 2985 | if (unlikely(!pmd_none(*vmf->pmd))) |
2978 | goto out; | 2986 | goto out; |
2979 | 2987 | ||
2980 | for (i = 0; i < HPAGE_PMD_NR; i++) | 2988 | for (i = 0; i < HPAGE_PMD_NR; i++) |
@@ -2990,11 +2998,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) | |||
2990 | * deposit and withdraw with pmd lock held | 2998 | * deposit and withdraw with pmd lock held |
2991 | */ | 2999 | */ |
2992 | if (arch_needs_pgtable_deposit()) | 3000 | if (arch_needs_pgtable_deposit()) |
2993 | deposit_prealloc_pte(fe); | 3001 | deposit_prealloc_pte(vmf); |
2994 | 3002 | ||
2995 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); | 3003 | set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); |
2996 | 3004 | ||
2997 | update_mmu_cache_pmd(vma, haddr, fe->pmd); | 3005 | update_mmu_cache_pmd(vma, haddr, vmf->pmd); |
2998 | 3006 | ||
2999 | /* fault is handled */ | 3007 | /* fault is handled */ |
3000 | ret = 0; | 3008 | ret = 0; |
@@ -3005,13 +3013,13 @@ out: | |||
3005 | * withdraw with pmd lock held. | 3013 | * withdraw with pmd lock held. |
3006 | */ | 3014 | */ |
3007 | if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) | 3015 | if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) |
3008 | fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, | 3016 | vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, |
3009 | fe->pmd); | 3017 | vmf->pmd); |
3010 | spin_unlock(fe->ptl); | 3018 | spin_unlock(vmf->ptl); |
3011 | return ret; | 3019 | return ret; |
3012 | } | 3020 | } |
3013 | #else | 3021 | #else |
3014 | static int do_set_pmd(struct fault_env *fe, struct page *page) | 3022 | static int do_set_pmd(struct vm_fault *vmf, struct page *page) |
3015 | { | 3023 | { |
3016 | BUILD_BUG(); | 3024 | BUILD_BUG(); |
3017 | return 0; | 3025 | return 0; |
@@ -3022,41 +3030,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) | |||
3022 | * alloc_set_pte - setup new PTE entry for given page and add reverse page | 3030 | * alloc_set_pte - setup new PTE entry for given page and add reverse page |
3023 | * mapping. If needed, the fucntion allocates page table or use pre-allocated. | 3031 | * mapping. If needed, the fucntion allocates page table or use pre-allocated. |
3024 | * | 3032 | * |
3025 | * @fe: fault environment | 3033 | * @vmf: fault environment |
3026 | * @memcg: memcg to charge page (only for private mappings) | 3034 | * @memcg: memcg to charge page (only for private mappings) |
3027 | * @page: page to map | 3035 | * @page: page to map |
3028 | * | 3036 | * |
3029 | * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. | 3037 | * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on |
3038 | * return. | ||
3030 | * | 3039 | * |
3031 | * Target users are page handler itself and implementations of | 3040 | * Target users are page handler itself and implementations of |
3032 | * vm_ops->map_pages. | 3041 | * vm_ops->map_pages. |
3033 | */ | 3042 | */ |
3034 | int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, | 3043 | int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, |
3035 | struct page *page) | 3044 | struct page *page) |
3036 | { | 3045 | { |
3037 | struct vm_area_struct *vma = fe->vma; | 3046 | struct vm_area_struct *vma = vmf->vma; |
3038 | bool write = fe->flags & FAULT_FLAG_WRITE; | 3047 | bool write = vmf->flags & FAULT_FLAG_WRITE; |
3039 | pte_t entry; | 3048 | pte_t entry; |
3040 | int ret; | 3049 | int ret; |
3041 | 3050 | ||
3042 | if (pmd_none(*fe->pmd) && PageTransCompound(page) && | 3051 | if (pmd_none(*vmf->pmd) && PageTransCompound(page) && |
3043 | IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { | 3052 | IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { |
3044 | /* THP on COW? */ | 3053 | /* THP on COW? */ |
3045 | VM_BUG_ON_PAGE(memcg, page); | 3054 | VM_BUG_ON_PAGE(memcg, page); |
3046 | 3055 | ||
3047 | ret = do_set_pmd(fe, page); | 3056 | ret = do_set_pmd(vmf, page); |
3048 | if (ret != VM_FAULT_FALLBACK) | 3057 | if (ret != VM_FAULT_FALLBACK) |
3049 | goto fault_handled; | 3058 | goto fault_handled; |
3050 | } | 3059 | } |
3051 | 3060 | ||
3052 | if (!fe->pte) { | 3061 | if (!vmf->pte) { |
3053 | ret = pte_alloc_one_map(fe); | 3062 | ret = pte_alloc_one_map(vmf); |
3054 | if (ret) | 3063 | if (ret) |
3055 | goto fault_handled; | 3064 | goto fault_handled; |
3056 | } | 3065 | } |
3057 | 3066 | ||
3058 | /* Re-check under ptl */ | 3067 | /* Re-check under ptl */ |
3059 | if (unlikely(!pte_none(*fe->pte))) { | 3068 | if (unlikely(!pte_none(*vmf->pte))) { |
3060 | ret = VM_FAULT_NOPAGE; | 3069 | ret = VM_FAULT_NOPAGE; |
3061 | goto fault_handled; | 3070 | goto fault_handled; |
3062 | } | 3071 | } |
@@ -3068,28 +3077,60 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, | |||
3068 | /* copy-on-write page */ | 3077 | /* copy-on-write page */ |
3069 | if (write && !(vma->vm_flags & VM_SHARED)) { | 3078 | if (write && !(vma->vm_flags & VM_SHARED)) { |
3070 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 3079 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
3071 | page_add_new_anon_rmap(page, vma, fe->address, false); | 3080 | page_add_new_anon_rmap(page, vma, vmf->address, false); |
3072 | mem_cgroup_commit_charge(page, memcg, false, false); | 3081 | mem_cgroup_commit_charge(page, memcg, false, false); |
3073 | lru_cache_add_active_or_unevictable(page, vma); | 3082 | lru_cache_add_active_or_unevictable(page, vma); |
3074 | } else { | 3083 | } else { |
3075 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); | 3084 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); |
3076 | page_add_file_rmap(page, false); | 3085 | page_add_file_rmap(page, false); |
3077 | } | 3086 | } |
3078 | set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); | 3087 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); |
3079 | 3088 | ||
3080 | /* no need to invalidate: a not-present page won't be cached */ | 3089 | /* no need to invalidate: a not-present page won't be cached */ |
3081 | update_mmu_cache(vma, fe->address, fe->pte); | 3090 | update_mmu_cache(vma, vmf->address, vmf->pte); |
3082 | ret = 0; | 3091 | ret = 0; |
3083 | 3092 | ||
3084 | fault_handled: | 3093 | fault_handled: |
3085 | /* preallocated pagetable is unused: free it */ | 3094 | /* preallocated pagetable is unused: free it */ |
3086 | if (fe->prealloc_pte) { | 3095 | if (vmf->prealloc_pte) { |
3087 | pte_free(fe->vma->vm_mm, fe->prealloc_pte); | 3096 | pte_free(vmf->vma->vm_mm, vmf->prealloc_pte); |
3088 | fe->prealloc_pte = 0; | 3097 | vmf->prealloc_pte = 0; |
3089 | } | 3098 | } |
3090 | return ret; | 3099 | return ret; |
3091 | } | 3100 | } |
3092 | 3101 | ||
3102 | |||
3103 | /** | ||
3104 | * finish_fault - finish page fault once we have prepared the page to fault | ||
3105 | * | ||
3106 | * @vmf: structure describing the fault | ||
3107 | * | ||
3108 | * This function handles all that is needed to finish a page fault once the | ||
3109 | * page to fault in is prepared. It handles locking of PTEs, inserts PTE for | ||
3110 | * given page, adds reverse page mapping, handles memcg charges and LRU | ||
3111 | * addition. The function returns 0 on success, VM_FAULT_ code in case of | ||
3112 | * error. | ||
3113 | * | ||
3114 | * The function expects the page to be locked and on success it consumes a | ||
3115 | * reference of a page being mapped (for the PTE which maps it). | ||
3116 | */ | ||
3117 | int finish_fault(struct vm_fault *vmf) | ||
3118 | { | ||
3119 | struct page *page; | ||
3120 | int ret; | ||
3121 | |||
3122 | /* Did we COW the page? */ | ||
3123 | if ((vmf->flags & FAULT_FLAG_WRITE) && | ||
3124 | !(vmf->vma->vm_flags & VM_SHARED)) | ||
3125 | page = vmf->cow_page; | ||
3126 | else | ||
3127 | page = vmf->page; | ||
3128 | ret = alloc_set_pte(vmf, vmf->memcg, page); | ||
3129 | if (vmf->pte) | ||
3130 | pte_unmap_unlock(vmf->pte, vmf->ptl); | ||
3131 | return ret; | ||
3132 | } | ||
3133 | |||
3093 | static unsigned long fault_around_bytes __read_mostly = | 3134 | static unsigned long fault_around_bytes __read_mostly = |
3094 | rounddown_pow_of_two(65536); | 3135 | rounddown_pow_of_two(65536); |
3095 | 3136 | ||
@@ -3154,17 +3195,18 @@ late_initcall(fault_around_debugfs); | |||
3154 | * fault_around_pages() value (and therefore to page order). This way it's | 3195 | * fault_around_pages() value (and therefore to page order). This way it's |
3155 | * easier to guarantee that we don't cross page table boundaries. | 3196 | * easier to guarantee that we don't cross page table boundaries. |
3156 | */ | 3197 | */ |
3157 | static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) | 3198 | static int do_fault_around(struct vm_fault *vmf) |
3158 | { | 3199 | { |
3159 | unsigned long address = fe->address, nr_pages, mask; | 3200 | unsigned long address = vmf->address, nr_pages, mask; |
3201 | pgoff_t start_pgoff = vmf->pgoff; | ||
3160 | pgoff_t end_pgoff; | 3202 | pgoff_t end_pgoff; |
3161 | int off, ret = 0; | 3203 | int off, ret = 0; |
3162 | 3204 | ||
3163 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; | 3205 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; |
3164 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; | 3206 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; |
3165 | 3207 | ||
3166 | fe->address = max(address & mask, fe->vma->vm_start); | 3208 | vmf->address = max(address & mask, vmf->vma->vm_start); |
3167 | off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); | 3209 | off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); |
3168 | start_pgoff -= off; | 3210 | start_pgoff -= off; |
3169 | 3211 | ||
3170 | /* | 3212 | /* |
@@ -3172,45 +3214,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) | |||
3172 | * or fault_around_pages() from start_pgoff, depending what is nearest. | 3214 | * or fault_around_pages() from start_pgoff, depending what is nearest. |
3173 | */ | 3215 | */ |
3174 | end_pgoff = start_pgoff - | 3216 | end_pgoff = start_pgoff - |
3175 | ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | 3217 | ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + |
3176 | PTRS_PER_PTE - 1; | 3218 | PTRS_PER_PTE - 1; |
3177 | end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, | 3219 | end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, |
3178 | start_pgoff + nr_pages - 1); | 3220 | start_pgoff + nr_pages - 1); |
3179 | 3221 | ||
3180 | if (pmd_none(*fe->pmd)) { | 3222 | if (pmd_none(*vmf->pmd)) { |
3181 | fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); | 3223 | vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, |
3182 | if (!fe->prealloc_pte) | 3224 | vmf->address); |
3225 | if (!vmf->prealloc_pte) | ||
3183 | goto out; | 3226 | goto out; |
3184 | smp_wmb(); /* See comment in __pte_alloc() */ | 3227 | smp_wmb(); /* See comment in __pte_alloc() */ |
3185 | } | 3228 | } |
3186 | 3229 | ||
3187 | fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); | 3230 | vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); |
3188 | 3231 | ||
3189 | /* Huge page is mapped? Page fault is solved */ | 3232 | /* Huge page is mapped? Page fault is solved */ |
3190 | if (pmd_trans_huge(*fe->pmd)) { | 3233 | if (pmd_trans_huge(*vmf->pmd)) { |
3191 | ret = VM_FAULT_NOPAGE; | 3234 | ret = VM_FAULT_NOPAGE; |
3192 | goto out; | 3235 | goto out; |
3193 | } | 3236 | } |
3194 | 3237 | ||
3195 | /* ->map_pages() haven't done anything useful. Cold page cache? */ | 3238 | /* ->map_pages() haven't done anything useful. Cold page cache? */ |
3196 | if (!fe->pte) | 3239 | if (!vmf->pte) |
3197 | goto out; | 3240 | goto out; |
3198 | 3241 | ||
3199 | /* check if the page fault is solved */ | 3242 | /* check if the page fault is solved */ |
3200 | fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); | 3243 | vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); |
3201 | if (!pte_none(*fe->pte)) | 3244 | if (!pte_none(*vmf->pte)) |
3202 | ret = VM_FAULT_NOPAGE; | 3245 | ret = VM_FAULT_NOPAGE; |
3203 | pte_unmap_unlock(fe->pte, fe->ptl); | 3246 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
3204 | out: | 3247 | out: |
3205 | fe->address = address; | 3248 | vmf->address = address; |
3206 | fe->pte = NULL; | 3249 | vmf->pte = NULL; |
3207 | return ret; | 3250 | return ret; |
3208 | } | 3251 | } |
3209 | 3252 | ||
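
The window arithmetic in do_fault_around() is easy to misread, so here is a small standalone model of it (ordinary userspace C, not kernel code). It assumes x86-64-style constants (4 KiB pages, 512 PTEs per page table); the inputs in main() are made-up, min3u() stands in for the kernel's min3(), and start_pgoff is seeded the way linear_page_index() would compute it.

#include <stdio.h>

#define PAGE_SHIFT    12UL
#define PAGE_SIZE     (1UL << PAGE_SHIFT)
#define PAGE_MASK     (~(PAGE_SIZE - 1))
#define PTRS_PER_PTE  512UL

static unsigned long min3u(unsigned long a, unsigned long b, unsigned long c)
{
	unsigned long m = a < b ? a : b;
	return m < c ? m : c;
}

int main(void)
{
	unsigned long fault_around_bytes = 65536;	/* default window */
	unsigned long address  = 0x7f0000012345UL;	/* faulting address */
	unsigned long vm_start = 0x7f0000000000UL;	/* start of the VMA */
	unsigned long vm_pgoff = 0;			/* file offset of vm_start */
	unsigned long vma_pgs  = 1024;			/* vma_pages(vma) */
	unsigned long start_pgoff =
		vm_pgoff + ((address - vm_start) >> PAGE_SHIFT);

	unsigned long nr_pages = fault_around_bytes >> PAGE_SHIFT;
	unsigned long mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

	/* Round the virtual address down to the window, clamped to the VMA. */
	unsigned long waddr = address & mask;
	if (waddr < vm_start)
		waddr = vm_start;

	unsigned long off = ((address - waddr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
	start_pgoff -= off;

	/* Don't cross the page-table page, the VMA, or nr_pages. */
	unsigned long end_pgoff = start_pgoff -
		((waddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
		PTRS_PER_PTE - 1;
	end_pgoff = min3u(end_pgoff, vma_pgs + vm_pgoff - 1,
			  start_pgoff + nr_pages - 1);

	printf("map-around window: addr %#lx, pgoff %lu..%lu (%lu pages)\n",
	       waddr, start_pgoff, end_pgoff, end_pgoff - start_pgoff + 1);
	return 0;
}

For the sample inputs this prints a 16-page window starting at 0x7f0000010000 covering file pages 16..31, i.e. the fault at page 18 is surrounded by the whole 64 KiB-aligned block, clamped so it never leaves the page-table page or the VMA.
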
3210 | static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) | 3253 | static int do_read_fault(struct vm_fault *vmf) |
3211 | { | 3254 | { |
3212 | struct vm_area_struct *vma = fe->vma; | 3255 | struct vm_area_struct *vma = vmf->vma; |
3213 | struct page *fault_page; | ||
3214 | int ret = 0; | 3256 | int ret = 0; |
3215 | 3257 | ||
3216 | /* | 3258 | /* |
@@ -3219,80 +3261,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) | |||
3219 | * something). | 3261 | * something). |
3220 | */ | 3262 | */ |
3221 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { | 3263 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { |
3222 | ret = do_fault_around(fe, pgoff); | 3264 | ret = do_fault_around(vmf); |
3223 | if (ret) | 3265 | if (ret) |
3224 | return ret; | 3266 | return ret; |
3225 | } | 3267 | } |
3226 | 3268 | ||
3227 | ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); | 3269 | ret = __do_fault(vmf); |
3228 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3270 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3229 | return ret; | 3271 | return ret; |
3230 | 3272 | ||
3231 | ret |= alloc_set_pte(fe, NULL, fault_page); | 3273 | ret |= finish_fault(vmf); |
3232 | if (fe->pte) | 3274 | unlock_page(vmf->page); |
3233 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
3234 | unlock_page(fault_page); | ||
3235 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3275 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3236 | put_page(fault_page); | 3276 | put_page(vmf->page); |
3237 | return ret; | 3277 | return ret; |
3238 | } | 3278 | } |
3239 | 3279 | ||
3240 | static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) | 3280 | static int do_cow_fault(struct vm_fault *vmf) |
3241 | { | 3281 | { |
3242 | struct vm_area_struct *vma = fe->vma; | 3282 | struct vm_area_struct *vma = vmf->vma; |
3243 | struct page *fault_page, *new_page; | ||
3244 | void *fault_entry; | ||
3245 | struct mem_cgroup *memcg; | ||
3246 | int ret; | 3283 | int ret; |
3247 | 3284 | ||
3248 | if (unlikely(anon_vma_prepare(vma))) | 3285 | if (unlikely(anon_vma_prepare(vma))) |
3249 | return VM_FAULT_OOM; | 3286 | return VM_FAULT_OOM; |
3250 | 3287 | ||
3251 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); | 3288 | vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); |
3252 | if (!new_page) | 3289 | if (!vmf->cow_page) |
3253 | return VM_FAULT_OOM; | 3290 | return VM_FAULT_OOM; |
3254 | 3291 | ||
3255 | if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, | 3292 | if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, |
3256 | &memcg, false)) { | 3293 | &vmf->memcg, false)) { |
3257 | put_page(new_page); | 3294 | put_page(vmf->cow_page); |
3258 | return VM_FAULT_OOM; | 3295 | return VM_FAULT_OOM; |
3259 | } | 3296 | } |
3260 | 3297 | ||
3261 | ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); | 3298 | ret = __do_fault(vmf); |
3262 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3299 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3263 | goto uncharge_out; | 3300 | goto uncharge_out; |
3301 | if (ret & VM_FAULT_DONE_COW) | ||
3302 | return ret; | ||
3264 | 3303 | ||
3265 | if (!(ret & VM_FAULT_DAX_LOCKED)) | 3304 | copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); |
3266 | copy_user_highpage(new_page, fault_page, fe->address, vma); | 3305 | __SetPageUptodate(vmf->cow_page); |
3267 | __SetPageUptodate(new_page); | ||
3268 | 3306 | ||
3269 | ret |= alloc_set_pte(fe, memcg, new_page); | 3307 | ret |= finish_fault(vmf); |
3270 | if (fe->pte) | 3308 | unlock_page(vmf->page); |
3271 | pte_unmap_unlock(fe->pte, fe->ptl); | 3309 | put_page(vmf->page); |
3272 | if (!(ret & VM_FAULT_DAX_LOCKED)) { | ||
3273 | unlock_page(fault_page); | ||
3274 | put_page(fault_page); | ||
3275 | } else { | ||
3276 | dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); | ||
3277 | } | ||
3278 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3310 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3279 | goto uncharge_out; | 3311 | goto uncharge_out; |
3280 | return ret; | 3312 | return ret; |
3281 | uncharge_out: | 3313 | uncharge_out: |
3282 | mem_cgroup_cancel_charge(new_page, memcg, false); | 3314 | mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); |
3283 | put_page(new_page); | 3315 | put_page(vmf->cow_page); |
3284 | return ret; | 3316 | return ret; |
3285 | } | 3317 | } |
3286 | 3318 | ||
3287 | static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) | 3319 | static int do_shared_fault(struct vm_fault *vmf) |
3288 | { | 3320 | { |
3289 | struct vm_area_struct *vma = fe->vma; | 3321 | struct vm_area_struct *vma = vmf->vma; |
3290 | struct page *fault_page; | ||
3291 | struct address_space *mapping; | ||
3292 | int dirtied = 0; | ||
3293 | int ret, tmp; | 3322 | int ret, tmp; |
3294 | 3323 | ||
3295 | ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); | 3324 | ret = __do_fault(vmf); |
3296 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3325 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3297 | return ret; | 3326 | return ret; |
3298 | 3327 | ||
@@ -3301,46 +3330,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) | |||
3301 | * about to become writable | 3330 | * about to become writable |
3302 | */ | 3331 | */ |
3303 | if (vma->vm_ops->page_mkwrite) { | 3332 | if (vma->vm_ops->page_mkwrite) { |
3304 | unlock_page(fault_page); | 3333 | unlock_page(vmf->page); |
3305 | tmp = do_page_mkwrite(vma, fault_page, fe->address); | 3334 | tmp = do_page_mkwrite(vmf); |
3306 | if (unlikely(!tmp || | 3335 | if (unlikely(!tmp || |
3307 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 3336 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
3308 | put_page(fault_page); | 3337 | put_page(vmf->page); |
3309 | return tmp; | 3338 | return tmp; |
3310 | } | 3339 | } |
3311 | } | 3340 | } |
3312 | 3341 | ||
3313 | ret |= alloc_set_pte(fe, NULL, fault_page); | 3342 | ret |= finish_fault(vmf); |
3314 | if (fe->pte) | ||
3315 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
3316 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | | 3343 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | |
3317 | VM_FAULT_RETRY))) { | 3344 | VM_FAULT_RETRY))) { |
3318 | unlock_page(fault_page); | 3345 | unlock_page(vmf->page); |
3319 | put_page(fault_page); | 3346 | put_page(vmf->page); |
3320 | return ret; | 3347 | return ret; |
3321 | } | 3348 | } |
3322 | 3349 | ||
3323 | if (set_page_dirty(fault_page)) | 3350 | fault_dirty_shared_page(vma, vmf->page); |
3324 | dirtied = 1; | ||
3325 | /* | ||
3326 | * Take a local copy of the address_space - page.mapping may be zeroed | ||
3327 | * by truncate after unlock_page(). The address_space itself remains | ||
3328 | * pinned by vma->vm_file's reference. We rely on unlock_page()'s | ||
3329 | * release semantics to prevent the compiler from undoing this copying. | ||
3330 | */ | ||
3331 | mapping = page_rmapping(fault_page); | ||
3332 | unlock_page(fault_page); | ||
3333 | if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { | ||
3334 | /* | ||
3335 | * Some device drivers do not set page.mapping but still | ||
3336 | * dirty their pages | ||
3337 | */ | ||
3338 | balance_dirty_pages_ratelimited(mapping); | ||
3339 | } | ||
3340 | |||
3341 | if (!vma->vm_ops->page_mkwrite) | ||
3342 | file_update_time(vma->vm_file); | ||
3343 | |||
3344 | return ret; | 3351 | return ret; |
3345 | } | 3352 | } |
3346 | 3353 | ||
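
For context on what do_page_mkwrite() is calling into here, the sketch below shows a hypothetical, minimal ->page_mkwrite handler, assuming the two-argument (vma, vmf) signature the fault path still uses at this point in the series. example_page_mkwrite and its truncation check are illustrative only and not taken from any real filesystem; the helpers it uses (sb_start_pagefault(), file_update_time(), lock_page(), wait_for_stable_page()) are standard kernel APIs. It is not self-contained userspace code.

/*
 * Hypothetical, minimal ->page_mkwrite (sketch only).  Lock the page,
 * bail out if it was truncated while we slept, and return
 * VM_FAULT_LOCKED so the caller sees the page still locked.
 */
static int example_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vma->vm_file);
	int ret = VM_FAULT_LOCKED;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vma->vm_file);

	lock_page(page);
	if (page->mapping != inode->i_mapping) {
		/* Truncated under us: drop the lock and retry the fault. */
		unlock_page(page);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}
	wait_for_stable_page(page);
out:
	sb_end_pagefault(inode->i_s);
	return ret;
}
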
@@ -3350,19 +3357,18 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) | |||
3350 | * The mmap_sem may have been released depending on flags and our | 3357 | * The mmap_sem may have been released depending on flags and our |
3351 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3358 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3352 | */ | 3359 | */ |
3353 | static int do_fault(struct fault_env *fe) | 3360 | static int do_fault(struct vm_fault *vmf) |
3354 | { | 3361 | { |
3355 | struct vm_area_struct *vma = fe->vma; | 3362 | struct vm_area_struct *vma = vmf->vma; |
3356 | pgoff_t pgoff = linear_page_index(vma, fe->address); | ||
3357 | 3363 | ||
3358 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ | 3364 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ |
3359 | if (!vma->vm_ops->fault) | 3365 | if (!vma->vm_ops->fault) |
3360 | return VM_FAULT_SIGBUS; | 3366 | return VM_FAULT_SIGBUS; |
3361 | if (!(fe->flags & FAULT_FLAG_WRITE)) | 3367 | if (!(vmf->flags & FAULT_FLAG_WRITE)) |
3362 | return do_read_fault(fe, pgoff); | 3368 | return do_read_fault(vmf); |
3363 | if (!(vma->vm_flags & VM_SHARED)) | 3369 | if (!(vma->vm_flags & VM_SHARED)) |
3364 | return do_cow_fault(fe, pgoff); | 3370 | return do_cow_fault(vmf); |
3365 | return do_shared_fault(fe, pgoff); | 3371 | return do_shared_fault(vmf); |
3366 | } | 3372 | } |
3367 | 3373 | ||
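
For readability, here is a tiny standalone model of the three-way dispatch in do_fault() above (userspace C; the flag values are chosen to mirror the usual kernel definitions, but only the logic matters for the model):

#include <stdio.h>

#define FAULT_FLAG_WRITE  0x1u	/* stand-in for the kernel flag */
#define VM_SHARED         0x8u	/* stand-in for the kernel flag */

/* Mirrors the split in do_fault(). */
static const char *file_fault_handler(unsigned int fault_flags,
				      unsigned long vm_flags)
{
	if (!(fault_flags & FAULT_FLAG_WRITE))
		return "do_read_fault";		/* read: maybe fault-around */
	if (!(vm_flags & VM_SHARED))
		return "do_cow_fault";		/* private write: COW copy */
	return "do_shared_fault";		/* shared write: ->page_mkwrite */
}

int main(void)
{
	printf("%s\n", file_fault_handler(0, VM_SHARED));			/* read */
	printf("%s\n", file_fault_handler(FAULT_FLAG_WRITE, 0));		/* cow */
	printf("%s\n", file_fault_handler(FAULT_FLAG_WRITE, VM_SHARED));	/* shared */
	return 0;
}
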
3368 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | 3374 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |
@@ -3380,14 +3386,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | |||
3380 | return mpol_misplaced(page, vma, addr); | 3386 | return mpol_misplaced(page, vma, addr); |
3381 | } | 3387 | } |
3382 | 3388 | ||
3383 | static int do_numa_page(struct fault_env *fe, pte_t pte) | 3389 | static int do_numa_page(struct vm_fault *vmf) |
3384 | { | 3390 | { |
3385 | struct vm_area_struct *vma = fe->vma; | 3391 | struct vm_area_struct *vma = vmf->vma; |
3386 | struct page *page = NULL; | 3392 | struct page *page = NULL; |
3387 | int page_nid = -1; | 3393 | int page_nid = -1; |
3388 | int last_cpupid; | 3394 | int last_cpupid; |
3389 | int target_nid; | 3395 | int target_nid; |
3390 | bool migrated = false; | 3396 | bool migrated = false; |
3397 | pte_t pte = vmf->orig_pte; | ||
3391 | bool was_writable = pte_write(pte); | 3398 | bool was_writable = pte_write(pte); |
3392 | int flags = 0; | 3399 | int flags = 0; |
3393 | 3400 | ||
@@ -3400,10 +3407,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) | |||
3400 | * page table entry is not accessible, so there would be no | 3407 | * page table entry is not accessible, so there would be no |
3401 | * concurrent hardware modifications to the PTE. | 3408 | * concurrent hardware modifications to the PTE. |
3402 | */ | 3409 | */ |
3403 | fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); | 3410 | vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); |
3404 | spin_lock(fe->ptl); | 3411 | spin_lock(vmf->ptl); |
3405 | if (unlikely(!pte_same(*fe->pte, pte))) { | 3412 | if (unlikely(!pte_same(*vmf->pte, pte))) { |
3406 | pte_unmap_unlock(fe->pte, fe->ptl); | 3413 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
3407 | goto out; | 3414 | goto out; |
3408 | } | 3415 | } |
3409 | 3416 | ||
@@ -3412,18 +3419,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) | |||
3412 | pte = pte_mkyoung(pte); | 3419 | pte = pte_mkyoung(pte); |
3413 | if (was_writable) | 3420 | if (was_writable) |
3414 | pte = pte_mkwrite(pte); | 3421 | pte = pte_mkwrite(pte); |
3415 | set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); | 3422 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); |
3416 | update_mmu_cache(vma, fe->address, fe->pte); | 3423 | update_mmu_cache(vma, vmf->address, vmf->pte); |
3417 | 3424 | ||
3418 | page = vm_normal_page(vma, fe->address, pte); | 3425 | page = vm_normal_page(vma, vmf->address, pte); |
3419 | if (!page) { | 3426 | if (!page) { |
3420 | pte_unmap_unlock(fe->pte, fe->ptl); | 3427 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
3421 | return 0; | 3428 | return 0; |
3422 | } | 3429 | } |
3423 | 3430 | ||
3424 | /* TODO: handle PTE-mapped THP */ | 3431 | /* TODO: handle PTE-mapped THP */ |
3425 | if (PageCompound(page)) { | 3432 | if (PageCompound(page)) { |
3426 | pte_unmap_unlock(fe->pte, fe->ptl); | 3433 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
3427 | return 0; | 3434 | return 0; |
3428 | } | 3435 | } |
3429 | 3436 | ||
@@ -3447,9 +3454,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) | |||
3447 | 3454 | ||
3448 | last_cpupid = page_cpupid_last(page); | 3455 | last_cpupid = page_cpupid_last(page); |
3449 | page_nid = page_to_nid(page); | 3456 | page_nid = page_to_nid(page); |
3450 | target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, | 3457 | target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, |
3451 | &flags); | 3458 | &flags); |
3452 | pte_unmap_unlock(fe->pte, fe->ptl); | 3459 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
3453 | if (target_nid == -1) { | 3460 | if (target_nid == -1) { |
3454 | put_page(page); | 3461 | put_page(page); |
3455 | goto out; | 3462 | goto out; |
@@ -3469,28 +3476,28 @@ out: | |||
3469 | return 0; | 3476 | return 0; |
3470 | } | 3477 | } |
3471 | 3478 | ||
3472 | static int create_huge_pmd(struct fault_env *fe) | 3479 | static int create_huge_pmd(struct vm_fault *vmf) |
3473 | { | 3480 | { |
3474 | struct vm_area_struct *vma = fe->vma; | 3481 | struct vm_area_struct *vma = vmf->vma; |
3475 | if (vma_is_anonymous(vma)) | 3482 | if (vma_is_anonymous(vma)) |
3476 | return do_huge_pmd_anonymous_page(fe); | 3483 | return do_huge_pmd_anonymous_page(vmf); |
3477 | if (vma->vm_ops->pmd_fault) | 3484 | if (vma->vm_ops->pmd_fault) |
3478 | return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, | 3485 | return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, |
3479 | fe->flags); | 3486 | vmf->flags); |
3480 | return VM_FAULT_FALLBACK; | 3487 | return VM_FAULT_FALLBACK; |
3481 | } | 3488 | } |
3482 | 3489 | ||
3483 | static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) | 3490 | static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) |
3484 | { | 3491 | { |
3485 | if (vma_is_anonymous(fe->vma)) | 3492 | if (vma_is_anonymous(vmf->vma)) |
3486 | return do_huge_pmd_wp_page(fe, orig_pmd); | 3493 | return do_huge_pmd_wp_page(vmf, orig_pmd); |
3487 | if (fe->vma->vm_ops->pmd_fault) | 3494 | if (vmf->vma->vm_ops->pmd_fault) |
3488 | return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, | 3495 | return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, |
3489 | fe->flags); | 3496 | vmf->pmd, vmf->flags); |
3490 | 3497 | ||
3491 | /* COW handled on pte level: split pmd */ | 3498 | /* COW handled on pte level: split pmd */ |
3492 | VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); | 3499 | VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); |
3493 | __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); | 3500 | __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); |
3494 | 3501 | ||
3495 | return VM_FAULT_FALLBACK; | 3502 | return VM_FAULT_FALLBACK; |
3496 | } | 3503 | } |
@@ -3515,21 +3522,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) | |||
3515 | * The mmap_sem may have been released depending on flags and our return value. | 3522 | * The mmap_sem may have been released depending on flags and our return value. |
3516 | * See filemap_fault() and __lock_page_or_retry(). | 3523 | * See filemap_fault() and __lock_page_or_retry(). |
3517 | */ | 3524 | */ |
3518 | static int handle_pte_fault(struct fault_env *fe) | 3525 | static int handle_pte_fault(struct vm_fault *vmf) |
3519 | { | 3526 | { |
3520 | pte_t entry; | 3527 | pte_t entry; |
3521 | 3528 | ||
3522 | if (unlikely(pmd_none(*fe->pmd))) { | 3529 | if (unlikely(pmd_none(*vmf->pmd))) { |
3523 | /* | 3530 | /* |
3524 | * Leave __pte_alloc() until later: because vm_ops->fault may | 3531 | * Leave __pte_alloc() until later: because vm_ops->fault may |
3525 | * want to allocate a huge page, and if we expose the page table | 3532 | * want to allocate a huge page, and if we expose the page table |
3526 | * for an instant, it will be difficult to retract from | 3533 | * for an instant, it will be difficult to retract from |
3527 | * concurrent faults and from rmap lookups. | 3534 | * concurrent faults and from rmap lookups. |
3528 | */ | 3535 | */ |
3529 | fe->pte = NULL; | 3536 | vmf->pte = NULL; |
3530 | } else { | 3537 | } else { |
3531 | /* See comment in pte_alloc_one_map() */ | 3538 | /* See comment in pte_alloc_one_map() */ |
3532 | if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) | 3539 | if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) |
3533 | return 0; | 3540 | return 0; |
3534 | /* | 3541 | /* |
3535 | * A regular pmd is established and it can't morph into a huge | 3542 | * A regular pmd is established and it can't morph into a huge |
@@ -3537,9 +3544,8 @@ static int handle_pte_fault(struct fault_env *fe) | |||
3537 | * mmap_sem read mode and khugepaged takes it in write mode. | 3544 | * mmap_sem read mode and khugepaged takes it in write mode. |
3538 | * So now it's safe to run pte_offset_map(). | 3545 | * So now it's safe to run pte_offset_map(). |
3539 | */ | 3546 | */ |
3540 | fe->pte = pte_offset_map(fe->pmd, fe->address); | 3547 | vmf->pte = pte_offset_map(vmf->pmd, vmf->address); |
3541 | 3548 | vmf->orig_pte = *vmf->pte; | |
3542 | entry = *fe->pte; | ||
3543 | 3549 | ||
3544 | /* | 3550 | /* |
3545 | * some architectures can have larger ptes than wordsize, | 3551 | * some architectures can have larger ptes than wordsize, |
@@ -3550,38 +3556,39 @@ static int handle_pte_fault(struct fault_env *fe) | |||
3550 | * ptl lock held. So here a barrier will do. | 3556 | * ptl lock held. So here a barrier will do. |
3551 | */ | 3557 | */ |
3552 | barrier(); | 3558 | barrier(); |
3553 | if (pte_none(entry)) { | 3559 | if (pte_none(vmf->orig_pte)) { |
3554 | pte_unmap(fe->pte); | 3560 | pte_unmap(vmf->pte); |
3555 | fe->pte = NULL; | 3561 | vmf->pte = NULL; |
3556 | } | 3562 | } |
3557 | } | 3563 | } |
3558 | 3564 | ||
3559 | if (!fe->pte) { | 3565 | if (!vmf->pte) { |
3560 | if (vma_is_anonymous(fe->vma)) | 3566 | if (vma_is_anonymous(vmf->vma)) |
3561 | return do_anonymous_page(fe); | 3567 | return do_anonymous_page(vmf); |
3562 | else | 3568 | else |
3563 | return do_fault(fe); | 3569 | return do_fault(vmf); |
3564 | } | 3570 | } |
3565 | 3571 | ||
3566 | if (!pte_present(entry)) | 3572 | if (!pte_present(vmf->orig_pte)) |
3567 | return do_swap_page(fe, entry); | 3573 | return do_swap_page(vmf); |
3568 | 3574 | ||
3569 | if (pte_protnone(entry) && vma_is_accessible(fe->vma)) | 3575 | if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) |
3570 | return do_numa_page(fe, entry); | 3576 | return do_numa_page(vmf); |
3571 | 3577 | ||
3572 | fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); | 3578 | vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); |
3573 | spin_lock(fe->ptl); | 3579 | spin_lock(vmf->ptl); |
3574 | if (unlikely(!pte_same(*fe->pte, entry))) | 3580 | entry = vmf->orig_pte; |
3581 | if (unlikely(!pte_same(*vmf->pte, entry))) | ||
3575 | goto unlock; | 3582 | goto unlock; |
3576 | if (fe->flags & FAULT_FLAG_WRITE) { | 3583 | if (vmf->flags & FAULT_FLAG_WRITE) { |
3577 | if (!pte_write(entry)) | 3584 | if (!pte_write(entry)) |
3578 | return do_wp_page(fe, entry); | 3585 | return do_wp_page(vmf); |
3579 | entry = pte_mkdirty(entry); | 3586 | entry = pte_mkdirty(entry); |
3580 | } | 3587 | } |
3581 | entry = pte_mkyoung(entry); | 3588 | entry = pte_mkyoung(entry); |
3582 | if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, | 3589 | if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, |
3583 | fe->flags & FAULT_FLAG_WRITE)) { | 3590 | vmf->flags & FAULT_FLAG_WRITE)) { |
3584 | update_mmu_cache(fe->vma, fe->address, fe->pte); | 3591 | update_mmu_cache(vmf->vma, vmf->address, vmf->pte); |
3585 | } else { | 3592 | } else { |
3586 | /* | 3593 | /* |
3587 | * This is needed only for protection faults but the arch code | 3594 | * This is needed only for protection faults but the arch code |
@@ -3589,11 +3596,11 @@ static int handle_pte_fault(struct fault_env *fe) | |||
3589 | * This still avoids useless tlb flushes for .text page faults | 3596 | * This still avoids useless tlb flushes for .text page faults |
3590 | * with threads. | 3597 | * with threads. |
3591 | */ | 3598 | */ |
3592 | if (fe->flags & FAULT_FLAG_WRITE) | 3599 | if (vmf->flags & FAULT_FLAG_WRITE) |
3593 | flush_tlb_fix_spurious_fault(fe->vma, fe->address); | 3600 | flush_tlb_fix_spurious_fault(vmf->vma, vmf->address); |
3594 | } | 3601 | } |
3595 | unlock: | 3602 | unlock: |
3596 | pte_unmap_unlock(fe->pte, fe->ptl); | 3603 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
3597 | return 0; | 3604 | return 0; |
3598 | } | 3605 | } |
3599 | 3606 | ||
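
Since handle_pte_fault() is the central dispatcher, the following standalone model restates just its branch order (userspace C). It is purely a decision-path model: locking, the PMD checks, and the pte_same() recheck are deliberately ignored.

#include <stdio.h>
#include <stdbool.h>

/* A modeled PTE: just enough state to walk handle_pte_fault()'s branches. */
struct model_pte {
	bool none;	/* pte_none()     */
	bool present;	/* pte_present()  */
	bool protnone;	/* pte_protnone() */
	bool writable;	/* pte_write()    */
};

static const char *pte_fault_path(struct model_pte pte, bool vma_anonymous,
				  bool vma_accessible, bool write_fault)
{
	if (pte.none)
		return vma_anonymous ? "do_anonymous_page" : "do_fault";
	if (!pte.present)
		return "do_swap_page";
	if (pte.protnone && vma_accessible)
		return "do_numa_page";
	if (write_fault && !pte.writable)
		return "do_wp_page";
	return "set access/dirty bits and return";
}

int main(void)
{
	struct model_pte none    = { .none = true };
	struct model_pte swapped = { .none = false, .present = false };
	struct model_pte numa    = { .present = true, .protnone = true };
	struct model_pte rdonly  = { .present = true, .writable = false };

	printf("%s\n", pte_fault_path(none, false, true, false));	/* do_fault */
	printf("%s\n", pte_fault_path(swapped, false, true, false));	/* do_swap_page */
	printf("%s\n", pte_fault_path(numa, false, true, false));	/* do_numa_page */
	printf("%s\n", pte_fault_path(rdonly, false, true, true));	/* do_wp_page */
	return 0;
}
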
@@ -3606,10 +3613,12 @@ unlock: | |||
3606 | static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | 3613 | static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, |
3607 | unsigned int flags) | 3614 | unsigned int flags) |
3608 | { | 3615 | { |
3609 | struct fault_env fe = { | 3616 | struct vm_fault vmf = { |
3610 | .vma = vma, | 3617 | .vma = vma, |
3611 | .address = address, | 3618 | .address = address & PAGE_MASK, |
3612 | .flags = flags, | 3619 | .flags = flags, |
3620 | .pgoff = linear_page_index(vma, address), | ||
3621 | .gfp_mask = __get_fault_gfp_mask(vma), | ||
3613 | }; | 3622 | }; |
3614 | struct mm_struct *mm = vma->vm_mm; | 3623 | struct mm_struct *mm = vma->vm_mm; |
3615 | pgd_t *pgd; | 3624 | pgd_t *pgd; |
@@ -3619,35 +3628,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |||
3619 | pud = pud_alloc(mm, pgd, address); | 3628 | pud = pud_alloc(mm, pgd, address); |
3620 | if (!pud) | 3629 | if (!pud) |
3621 | return VM_FAULT_OOM; | 3630 | return VM_FAULT_OOM; |
3622 | fe.pmd = pmd_alloc(mm, pud, address); | 3631 | vmf.pmd = pmd_alloc(mm, pud, address); |
3623 | if (!fe.pmd) | 3632 | if (!vmf.pmd) |
3624 | return VM_FAULT_OOM; | 3633 | return VM_FAULT_OOM; |
3625 | if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { | 3634 | if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { |
3626 | int ret = create_huge_pmd(&fe); | 3635 | int ret = create_huge_pmd(&vmf); |
3627 | if (!(ret & VM_FAULT_FALLBACK)) | 3636 | if (!(ret & VM_FAULT_FALLBACK)) |
3628 | return ret; | 3637 | return ret; |
3629 | } else { | 3638 | } else { |
3630 | pmd_t orig_pmd = *fe.pmd; | 3639 | pmd_t orig_pmd = *vmf.pmd; |
3631 | int ret; | 3640 | int ret; |
3632 | 3641 | ||
3633 | barrier(); | 3642 | barrier(); |
3634 | if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { | 3643 | if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { |
3635 | if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) | 3644 | if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) |
3636 | return do_huge_pmd_numa_page(&fe, orig_pmd); | 3645 | return do_huge_pmd_numa_page(&vmf, orig_pmd); |
3637 | 3646 | ||
3638 | if ((fe.flags & FAULT_FLAG_WRITE) && | 3647 | if ((vmf.flags & FAULT_FLAG_WRITE) && |
3639 | !pmd_write(orig_pmd)) { | 3648 | !pmd_write(orig_pmd)) { |
3640 | ret = wp_huge_pmd(&fe, orig_pmd); | 3649 | ret = wp_huge_pmd(&vmf, orig_pmd); |
3641 | if (!(ret & VM_FAULT_FALLBACK)) | 3650 | if (!(ret & VM_FAULT_FALLBACK)) |
3642 | return ret; | 3651 | return ret; |
3643 | } else { | 3652 | } else { |
3644 | huge_pmd_set_accessed(&fe, orig_pmd); | 3653 | huge_pmd_set_accessed(&vmf, orig_pmd); |
3645 | return 0; | 3654 | return 0; |
3646 | } | 3655 | } |
3647 | } | 3656 | } |
3648 | } | 3657 | } |
3649 | 3658 | ||
3650 | return handle_pte_fault(&fe); | 3659 | return handle_pte_fault(&vmf); |
3651 | } | 3660 | } |
3652 | 3661 | ||
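
The vm_fault set up at the top of __handle_mm_fault() above is now seeded with a page-aligned address and the file page offset. A quick standalone check of what those two initializers compute (userspace C; model_linear_page_index() follows the usual definition of linear_page_index(), and the sample numbers are arbitrary):

#include <stdio.h>

#define PAGE_SHIFT 12UL
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

/* Model of linear_page_index(): file page offset backing a virtual address. */
static unsigned long model_linear_page_index(unsigned long vm_start,
					     unsigned long vm_pgoff,
					     unsigned long address)
{
	return ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;
}

int main(void)
{
	unsigned long vm_start = 0x400000UL;	/* VMA start */
	unsigned long vm_pgoff = 16;		/* VMA begins 16 pages into the file */
	unsigned long address  = 0x403abcUL;	/* faulting address */

	printf("vmf.address = %#lx\n", address & PAGE_MASK);	/* 0x403000 */
	printf("vmf.pgoff   = %lu\n",
	       model_linear_page_index(vm_start, vm_pgoff, address));	/* 19 */
	return 0;
}
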
3653 | /* | 3662 | /* |
@@ -3808,8 +3817,8 @@ out: | |||
3808 | return -EINVAL; | 3817 | return -EINVAL; |
3809 | } | 3818 | } |
3810 | 3819 | ||
3811 | static inline int follow_pte(struct mm_struct *mm, unsigned long address, | 3820 | int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, |
3812 | pte_t **ptepp, spinlock_t **ptlp) | 3821 | spinlock_t **ptlp) |
3813 | { | 3822 | { |
3814 | int res; | 3823 | int res; |
3815 | 3824 | ||
@@ -3919,7 +3928,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | |||
3919 | struct page *page = NULL; | 3928 | struct page *page = NULL; |
3920 | 3929 | ||
3921 | ret = get_user_pages_remote(tsk, mm, addr, 1, | 3930 | ret = get_user_pages_remote(tsk, mm, addr, 1, |
3922 | gup_flags, &page, &vma); | 3931 | gup_flags, &page, &vma, NULL); |
3923 | if (ret <= 0) { | 3932 | if (ret <= 0) { |
3924 | #ifndef CONFIG_HAVE_IOREMAP_PROT | 3933 | #ifndef CONFIG_HAVE_IOREMAP_PROT |
3925 | break; | 3934 | break; |