Diffstat (limited to 'mm/memory.c')

 -rw-r--r--   mm/memory.c | 582
 1 file changed, 278 insertions(+), 304 deletions(-)
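The hunks below fold the long (mm, vma, address, pte, pmd, flags) argument lists of the fault handlers into a single struct fault_env that is threaded through the call chain. The structure itself is defined outside mm/memory.c; the following is only a sketch reconstructed from the fields this diff dereferences (fe->vma, fe->address, fe->flags, fe->pmd, fe->pte, fe->ptl), not the verbatim definition.

/* Sketch only: field set inferred from usage in the hunks below. */
struct fault_env {
	struct vm_area_struct *vma;	/* target VMA */
	unsigned long address;		/* faulting virtual address */
	unsigned int flags;		/* FAULT_FLAG_xxx flags */
	pmd_t *pmd;			/* PMD entry covering the address */
	pte_t *pte;			/* PTE, mapped (and possibly locked) */
	spinlock_t *ptl;		/* page table lock protecting *pte */
};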
diff --git a/mm/memory.c b/mm/memory.c index 6bf2b8564376..72b520897339 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -2070,13 +2070,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | |||
2070 | * case, all we need to do here is to mark the page as writable and update | 2070 | * case, all we need to do here is to mark the page as writable and update |
2071 | * any related book-keeping. | 2071 | * any related book-keeping. |
2072 | */ | 2072 | */ |
2073 | static inline int wp_page_reuse(struct mm_struct *mm, | 2073 | static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, |
2074 | struct vm_area_struct *vma, unsigned long address, | 2074 | struct page *page, int page_mkwrite, int dirty_shared) |
2075 | pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, | 2075 | __releases(fe->ptl) |
2076 | struct page *page, int page_mkwrite, | ||
2077 | int dirty_shared) | ||
2078 | __releases(ptl) | ||
2079 | { | 2076 | { |
2077 | struct vm_area_struct *vma = fe->vma; | ||
2080 | pte_t entry; | 2078 | pte_t entry; |
2081 | /* | 2079 | /* |
2082 | * Clear the pages cpupid information as the existing | 2080 | * Clear the pages cpupid information as the existing |
@@ -2086,12 +2084,12 @@ static inline int wp_page_reuse(struct mm_struct *mm, | |||
2086 | if (page) | 2084 | if (page) |
2087 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); | 2085 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); |
2088 | 2086 | ||
2089 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2087 | flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); |
2090 | entry = pte_mkyoung(orig_pte); | 2088 | entry = pte_mkyoung(orig_pte); |
2091 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2089 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2092 | if (ptep_set_access_flags(vma, address, page_table, entry, 1)) | 2090 | if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) |
2093 | update_mmu_cache(vma, address, page_table); | 2091 | update_mmu_cache(vma, fe->address, fe->pte); |
2094 | pte_unmap_unlock(page_table, ptl); | 2092 | pte_unmap_unlock(fe->pte, fe->ptl); |
2095 | 2093 | ||
2096 | if (dirty_shared) { | 2094 | if (dirty_shared) { |
2097 | struct address_space *mapping; | 2095 | struct address_space *mapping; |
@@ -2137,30 +2135,31 @@ static inline int wp_page_reuse(struct mm_struct *mm, | |||
2137 | * held to the old page, as well as updating the rmap. | 2135 | * held to the old page, as well as updating the rmap. |
2138 | * - In any case, unlock the PTL and drop the reference we took to the old page. | 2136 | * - In any case, unlock the PTL and drop the reference we took to the old page. |
2139 | */ | 2137 | */ |
2140 | static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | 2138 | static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, |
2141 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2139 | struct page *old_page) |
2142 | pte_t orig_pte, struct page *old_page) | ||
2143 | { | 2140 | { |
2141 | struct vm_area_struct *vma = fe->vma; | ||
2142 | struct mm_struct *mm = vma->vm_mm; | ||
2144 | struct page *new_page = NULL; | 2143 | struct page *new_page = NULL; |
2145 | spinlock_t *ptl = NULL; | ||
2146 | pte_t entry; | 2144 | pte_t entry; |
2147 | int page_copied = 0; | 2145 | int page_copied = 0; |
2148 | const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */ | 2146 | const unsigned long mmun_start = fe->address & PAGE_MASK; |
2149 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */ | 2147 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; |
2150 | struct mem_cgroup *memcg; | 2148 | struct mem_cgroup *memcg; |
2151 | 2149 | ||
2152 | if (unlikely(anon_vma_prepare(vma))) | 2150 | if (unlikely(anon_vma_prepare(vma))) |
2153 | goto oom; | 2151 | goto oom; |
2154 | 2152 | ||
2155 | if (is_zero_pfn(pte_pfn(orig_pte))) { | 2153 | if (is_zero_pfn(pte_pfn(orig_pte))) { |
2156 | new_page = alloc_zeroed_user_highpage_movable(vma, address); | 2154 | new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); |
2157 | if (!new_page) | 2155 | if (!new_page) |
2158 | goto oom; | 2156 | goto oom; |
2159 | } else { | 2157 | } else { |
2160 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 2158 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, |
2159 | fe->address); | ||
2161 | if (!new_page) | 2160 | if (!new_page) |
2162 | goto oom; | 2161 | goto oom; |
2163 | cow_user_page(new_page, old_page, address, vma); | 2162 | cow_user_page(new_page, old_page, fe->address, vma); |
2164 | } | 2163 | } |
2165 | 2164 | ||
2166 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) | 2165 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) |
@@ -2173,8 +2172,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2173 | /* | 2172 | /* |
2174 | * Re-check the pte - we dropped the lock | 2173 | * Re-check the pte - we dropped the lock |
2175 | */ | 2174 | */ |
2176 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2175 | fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); |
2177 | if (likely(pte_same(*page_table, orig_pte))) { | 2176 | if (likely(pte_same(*fe->pte, orig_pte))) { |
2178 | if (old_page) { | 2177 | if (old_page) { |
2179 | if (!PageAnon(old_page)) { | 2178 | if (!PageAnon(old_page)) { |
2180 | dec_mm_counter_fast(mm, | 2179 | dec_mm_counter_fast(mm, |
@@ -2184,7 +2183,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2184 | } else { | 2183 | } else { |
2185 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2184 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2186 | } | 2185 | } |
2187 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2186 | flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); |
2188 | entry = mk_pte(new_page, vma->vm_page_prot); | 2187 | entry = mk_pte(new_page, vma->vm_page_prot); |
2189 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2188 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2190 | /* | 2189 | /* |
@@ -2193,8 +2192,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2193 | * seen in the presence of one thread doing SMC and another | 2192 | * seen in the presence of one thread doing SMC and another |
2194 | * thread doing COW. | 2193 | * thread doing COW. |
2195 | */ | 2194 | */ |
2196 | ptep_clear_flush_notify(vma, address, page_table); | 2195 | ptep_clear_flush_notify(vma, fe->address, fe->pte); |
2197 | page_add_new_anon_rmap(new_page, vma, address, false); | 2196 | page_add_new_anon_rmap(new_page, vma, fe->address, false); |
2198 | mem_cgroup_commit_charge(new_page, memcg, false, false); | 2197 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
2199 | lru_cache_add_active_or_unevictable(new_page, vma); | 2198 | lru_cache_add_active_or_unevictable(new_page, vma); |
2200 | /* | 2199 | /* |
@@ -2202,8 +2201,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2202 | * mmu page tables (such as kvm shadow page tables), we want the | 2201 | * mmu page tables (such as kvm shadow page tables), we want the |
2203 | * new page to be mapped directly into the secondary page table. | 2202 | * new page to be mapped directly into the secondary page table. |
2204 | */ | 2203 | */ |
2205 | set_pte_at_notify(mm, address, page_table, entry); | 2204 | set_pte_at_notify(mm, fe->address, fe->pte, entry); |
2206 | update_mmu_cache(vma, address, page_table); | 2205 | update_mmu_cache(vma, fe->address, fe->pte); |
2207 | if (old_page) { | 2206 | if (old_page) { |
2208 | /* | 2207 | /* |
2209 | * Only after switching the pte to the new page may | 2208 | * Only after switching the pte to the new page may |
@@ -2240,7 +2239,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2240 | if (new_page) | 2239 | if (new_page) |
2241 | put_page(new_page); | 2240 | put_page(new_page); |
2242 | 2241 | ||
2243 | pte_unmap_unlock(page_table, ptl); | 2242 | pte_unmap_unlock(fe->pte, fe->ptl); |
2244 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2243 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2245 | if (old_page) { | 2244 | if (old_page) { |
2246 | /* | 2245 | /* |
@@ -2268,44 +2267,43 @@ oom: | |||
2268 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED | 2267 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED |
2269 | * mapping | 2268 | * mapping |
2270 | */ | 2269 | */ |
2271 | static int wp_pfn_shared(struct mm_struct *mm, | 2270 | static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) |
2272 | struct vm_area_struct *vma, unsigned long address, | ||
2273 | pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, | ||
2274 | pmd_t *pmd) | ||
2275 | { | 2271 | { |
2272 | struct vm_area_struct *vma = fe->vma; | ||
2273 | |||
2276 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { | 2274 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { |
2277 | struct vm_fault vmf = { | 2275 | struct vm_fault vmf = { |
2278 | .page = NULL, | 2276 | .page = NULL, |
2279 | .pgoff = linear_page_index(vma, address), | 2277 | .pgoff = linear_page_index(vma, fe->address), |
2280 | .virtual_address = (void __user *)(address & PAGE_MASK), | 2278 | .virtual_address = |
2279 | (void __user *)(fe->address & PAGE_MASK), | ||
2281 | .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, | 2280 | .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, |
2282 | }; | 2281 | }; |
2283 | int ret; | 2282 | int ret; |
2284 | 2283 | ||
2285 | pte_unmap_unlock(page_table, ptl); | 2284 | pte_unmap_unlock(fe->pte, fe->ptl); |
2286 | ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); | 2285 | ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); |
2287 | if (ret & VM_FAULT_ERROR) | 2286 | if (ret & VM_FAULT_ERROR) |
2288 | return ret; | 2287 | return ret; |
2289 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2288 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2289 | &fe->ptl); | ||
2290 | /* | 2290 | /* |
2291 | * We might have raced with another page fault while we | 2291 | * We might have raced with another page fault while we |
2292 | * released the pte_offset_map_lock. | 2292 | * released the pte_offset_map_lock. |
2293 | */ | 2293 | */ |
2294 | if (!pte_same(*page_table, orig_pte)) { | 2294 | if (!pte_same(*fe->pte, orig_pte)) { |
2295 | pte_unmap_unlock(page_table, ptl); | 2295 | pte_unmap_unlock(fe->pte, fe->ptl); |
2296 | return 0; | 2296 | return 0; |
2297 | } | 2297 | } |
2298 | } | 2298 | } |
2299 | return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, | 2299 | return wp_page_reuse(fe, orig_pte, NULL, 0, 0); |
2300 | NULL, 0, 0); | ||
2301 | } | 2300 | } |
2302 | 2301 | ||
2303 | static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | 2302 | static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, |
2304 | unsigned long address, pte_t *page_table, | 2303 | struct page *old_page) |
2305 | pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, | 2304 | __releases(fe->ptl) |
2306 | struct page *old_page) | ||
2307 | __releases(ptl) | ||
2308 | { | 2305 | { |
2306 | struct vm_area_struct *vma = fe->vma; | ||
2309 | int page_mkwrite = 0; | 2307 | int page_mkwrite = 0; |
2310 | 2308 | ||
2311 | get_page(old_page); | 2309 | get_page(old_page); |
@@ -2313,8 +2311,8 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2313 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 2311 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
2314 | int tmp; | 2312 | int tmp; |
2315 | 2313 | ||
2316 | pte_unmap_unlock(page_table, ptl); | 2314 | pte_unmap_unlock(fe->pte, fe->ptl); |
2317 | tmp = do_page_mkwrite(vma, old_page, address); | 2315 | tmp = do_page_mkwrite(vma, old_page, fe->address); |
2318 | if (unlikely(!tmp || (tmp & | 2316 | if (unlikely(!tmp || (tmp & |
2319 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 2317 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
2320 | put_page(old_page); | 2318 | put_page(old_page); |
@@ -2326,19 +2324,18 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2326 | * they did, we just return, as we can count on the | 2324 | * they did, we just return, as we can count on the |
2327 | * MMU to tell us if they didn't also make it writable. | 2325 | * MMU to tell us if they didn't also make it writable. |
2328 | */ | 2326 | */ |
2329 | page_table = pte_offset_map_lock(mm, pmd, address, | 2327 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2330 | &ptl); | 2328 | &fe->ptl); |
2331 | if (!pte_same(*page_table, orig_pte)) { | 2329 | if (!pte_same(*fe->pte, orig_pte)) { |
2332 | unlock_page(old_page); | 2330 | unlock_page(old_page); |
2333 | pte_unmap_unlock(page_table, ptl); | 2331 | pte_unmap_unlock(fe->pte, fe->ptl); |
2334 | put_page(old_page); | 2332 | put_page(old_page); |
2335 | return 0; | 2333 | return 0; |
2336 | } | 2334 | } |
2337 | page_mkwrite = 1; | 2335 | page_mkwrite = 1; |
2338 | } | 2336 | } |
2339 | 2337 | ||
2340 | return wp_page_reuse(mm, vma, address, page_table, ptl, | 2338 | return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); |
2341 | orig_pte, old_page, page_mkwrite, 1); | ||
2342 | } | 2339 | } |
2343 | 2340 | ||
2344 | /* | 2341 | /* |
@@ -2359,14 +2356,13 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2359 | * but allow concurrent faults), with pte both mapped and locked. | 2356 | * but allow concurrent faults), with pte both mapped and locked. |
2360 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2357 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2361 | */ | 2358 | */ |
2362 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2359 | static int do_wp_page(struct fault_env *fe, pte_t orig_pte) |
2363 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2360 | __releases(fe->ptl) |
2364 | spinlock_t *ptl, pte_t orig_pte) | ||
2365 | __releases(ptl) | ||
2366 | { | 2361 | { |
2362 | struct vm_area_struct *vma = fe->vma; | ||
2367 | struct page *old_page; | 2363 | struct page *old_page; |
2368 | 2364 | ||
2369 | old_page = vm_normal_page(vma, address, orig_pte); | 2365 | old_page = vm_normal_page(vma, fe->address, orig_pte); |
2370 | if (!old_page) { | 2366 | if (!old_page) { |
2371 | /* | 2367 | /* |
2372 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a | 2368 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a |
@@ -2377,12 +2373,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2377 | */ | 2373 | */ |
2378 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2374 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2379 | (VM_WRITE|VM_SHARED)) | 2375 | (VM_WRITE|VM_SHARED)) |
2380 | return wp_pfn_shared(mm, vma, address, page_table, ptl, | 2376 | return wp_pfn_shared(fe, orig_pte); |
2381 | orig_pte, pmd); | ||
2382 | 2377 | ||
2383 | pte_unmap_unlock(page_table, ptl); | 2378 | pte_unmap_unlock(fe->pte, fe->ptl); |
2384 | return wp_page_copy(mm, vma, address, page_table, pmd, | 2379 | return wp_page_copy(fe, orig_pte, old_page); |
2385 | orig_pte, old_page); | ||
2386 | } | 2380 | } |
2387 | 2381 | ||
2388 | /* | 2382 | /* |
@@ -2393,13 +2387,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2393 | int total_mapcount; | 2387 | int total_mapcount; |
2394 | if (!trylock_page(old_page)) { | 2388 | if (!trylock_page(old_page)) { |
2395 | get_page(old_page); | 2389 | get_page(old_page); |
2396 | pte_unmap_unlock(page_table, ptl); | 2390 | pte_unmap_unlock(fe->pte, fe->ptl); |
2397 | lock_page(old_page); | 2391 | lock_page(old_page); |
2398 | page_table = pte_offset_map_lock(mm, pmd, address, | 2392 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, |
2399 | &ptl); | 2393 | fe->address, &fe->ptl); |
2400 | if (!pte_same(*page_table, orig_pte)) { | 2394 | if (!pte_same(*fe->pte, orig_pte)) { |
2401 | unlock_page(old_page); | 2395 | unlock_page(old_page); |
2402 | pte_unmap_unlock(page_table, ptl); | 2396 | pte_unmap_unlock(fe->pte, fe->ptl); |
2403 | put_page(old_page); | 2397 | put_page(old_page); |
2404 | return 0; | 2398 | return 0; |
2405 | } | 2399 | } |
@@ -2417,14 +2411,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2417 | page_move_anon_rmap(old_page, vma); | 2411 | page_move_anon_rmap(old_page, vma); |
2418 | } | 2412 | } |
2419 | unlock_page(old_page); | 2413 | unlock_page(old_page); |
2420 | return wp_page_reuse(mm, vma, address, page_table, ptl, | 2414 | return wp_page_reuse(fe, orig_pte, old_page, 0, 0); |
2421 | orig_pte, old_page, 0, 0); | ||
2422 | } | 2415 | } |
2423 | unlock_page(old_page); | 2416 | unlock_page(old_page); |
2424 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2417 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2425 | (VM_WRITE|VM_SHARED))) { | 2418 | (VM_WRITE|VM_SHARED))) { |
2426 | return wp_page_shared(mm, vma, address, page_table, pmd, | 2419 | return wp_page_shared(fe, orig_pte, old_page); |
2427 | ptl, orig_pte, old_page); | ||
2428 | } | 2420 | } |
2429 | 2421 | ||
2430 | /* | 2422 | /* |
@@ -2432,9 +2424,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2432 | */ | 2424 | */ |
2433 | get_page(old_page); | 2425 | get_page(old_page); |
2434 | 2426 | ||
2435 | pte_unmap_unlock(page_table, ptl); | 2427 | pte_unmap_unlock(fe->pte, fe->ptl); |
2436 | return wp_page_copy(mm, vma, address, page_table, pmd, | 2428 | return wp_page_copy(fe, orig_pte, old_page); |
2437 | orig_pte, old_page); | ||
2438 | } | 2429 | } |
2439 | 2430 | ||
2440 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, | 2431 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, |
@@ -2522,11 +2513,9 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
2522 | * We return with the mmap_sem locked or unlocked in the same cases | 2513 | * We return with the mmap_sem locked or unlocked in the same cases |
2523 | * as does filemap_fault(). | 2514 | * as does filemap_fault(). |
2524 | */ | 2515 | */ |
2525 | int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2516 | int do_swap_page(struct fault_env *fe, pte_t orig_pte) |
2526 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
2527 | unsigned int flags, pte_t orig_pte) | ||
2528 | { | 2517 | { |
2529 | spinlock_t *ptl; | 2518 | struct vm_area_struct *vma = fe->vma; |
2530 | struct page *page, *swapcache; | 2519 | struct page *page, *swapcache; |
2531 | struct mem_cgroup *memcg; | 2520 | struct mem_cgroup *memcg; |
2532 | swp_entry_t entry; | 2521 | swp_entry_t entry; |
@@ -2535,17 +2524,17 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2535 | int exclusive = 0; | 2524 | int exclusive = 0; |
2536 | int ret = 0; | 2525 | int ret = 0; |
2537 | 2526 | ||
2538 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2527 | if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) |
2539 | goto out; | 2528 | goto out; |
2540 | 2529 | ||
2541 | entry = pte_to_swp_entry(orig_pte); | 2530 | entry = pte_to_swp_entry(orig_pte); |
2542 | if (unlikely(non_swap_entry(entry))) { | 2531 | if (unlikely(non_swap_entry(entry))) { |
2543 | if (is_migration_entry(entry)) { | 2532 | if (is_migration_entry(entry)) { |
2544 | migration_entry_wait(mm, pmd, address); | 2533 | migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); |
2545 | } else if (is_hwpoison_entry(entry)) { | 2534 | } else if (is_hwpoison_entry(entry)) { |
2546 | ret = VM_FAULT_HWPOISON; | 2535 | ret = VM_FAULT_HWPOISON; |
2547 | } else { | 2536 | } else { |
2548 | print_bad_pte(vma, address, orig_pte, NULL); | 2537 | print_bad_pte(vma, fe->address, orig_pte, NULL); |
2549 | ret = VM_FAULT_SIGBUS; | 2538 | ret = VM_FAULT_SIGBUS; |
2550 | } | 2539 | } |
2551 | goto out; | 2540 | goto out; |
@@ -2554,14 +2543,15 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2554 | page = lookup_swap_cache(entry); | 2543 | page = lookup_swap_cache(entry); |
2555 | if (!page) { | 2544 | if (!page) { |
2556 | page = swapin_readahead(entry, | 2545 | page = swapin_readahead(entry, |
2557 | GFP_HIGHUSER_MOVABLE, vma, address); | 2546 | GFP_HIGHUSER_MOVABLE, vma, fe->address); |
2558 | if (!page) { | 2547 | if (!page) { |
2559 | /* | 2548 | /* |
2560 | * Back out if somebody else faulted in this pte | 2549 | * Back out if somebody else faulted in this pte |
2561 | * while we released the pte lock. | 2550 | * while we released the pte lock. |
2562 | */ | 2551 | */ |
2563 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2552 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, |
2564 | if (likely(pte_same(*page_table, orig_pte))) | 2553 | fe->address, &fe->ptl); |
2554 | if (likely(pte_same(*fe->pte, orig_pte))) | ||
2565 | ret = VM_FAULT_OOM; | 2555 | ret = VM_FAULT_OOM; |
2566 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2556 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2567 | goto unlock; | 2557 | goto unlock; |
@@ -2570,7 +2560,7 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2570 | /* Had to read the page from swap area: Major fault */ | 2560 | /* Had to read the page from swap area: Major fault */ |
2571 | ret = VM_FAULT_MAJOR; | 2561 | ret = VM_FAULT_MAJOR; |
2572 | count_vm_event(PGMAJFAULT); | 2562 | count_vm_event(PGMAJFAULT); |
2573 | mem_cgroup_count_vm_event(mm, PGMAJFAULT); | 2563 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
2574 | } else if (PageHWPoison(page)) { | 2564 | } else if (PageHWPoison(page)) { |
2575 | /* | 2565 | /* |
2576 | * hwpoisoned dirty swapcache pages are kept for killing | 2566 | * hwpoisoned dirty swapcache pages are kept for killing |
@@ -2583,7 +2573,7 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2583 | } | 2573 | } |
2584 | 2574 | ||
2585 | swapcache = page; | 2575 | swapcache = page; |
2586 | locked = lock_page_or_retry(page, mm, flags); | 2576 | locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); |
2587 | 2577 | ||
2588 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2578 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2589 | if (!locked) { | 2579 | if (!locked) { |
@@ -2600,14 +2590,15 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2600 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) | 2590 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) |
2601 | goto out_page; | 2591 | goto out_page; |
2602 | 2592 | ||
2603 | page = ksm_might_need_to_copy(page, vma, address); | 2593 | page = ksm_might_need_to_copy(page, vma, fe->address); |
2604 | if (unlikely(!page)) { | 2594 | if (unlikely(!page)) { |
2605 | ret = VM_FAULT_OOM; | 2595 | ret = VM_FAULT_OOM; |
2606 | page = swapcache; | 2596 | page = swapcache; |
2607 | goto out_page; | 2597 | goto out_page; |
2608 | } | 2598 | } |
2609 | 2599 | ||
2610 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) { | 2600 | if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, |
2601 | &memcg, false)) { | ||
2611 | ret = VM_FAULT_OOM; | 2602 | ret = VM_FAULT_OOM; |
2612 | goto out_page; | 2603 | goto out_page; |
2613 | } | 2604 | } |
@@ -2615,8 +2606,9 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2615 | /* | 2606 | /* |
2616 | * Back out if somebody else already faulted in this pte. | 2607 | * Back out if somebody else already faulted in this pte. |
2617 | */ | 2608 | */ |
2618 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2609 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2619 | if (unlikely(!pte_same(*page_table, orig_pte))) | 2610 | &fe->ptl); |
2611 | if (unlikely(!pte_same(*fe->pte, orig_pte))) | ||
2620 | goto out_nomap; | 2612 | goto out_nomap; |
2621 | 2613 | ||
2622 | if (unlikely(!PageUptodate(page))) { | 2614 | if (unlikely(!PageUptodate(page))) { |
@@ -2634,24 +2626,24 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2634 | * must be called after the swap_free(), or it will never succeed. | 2626 | * must be called after the swap_free(), or it will never succeed. |
2635 | */ | 2627 | */ |
2636 | 2628 | ||
2637 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2629 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2638 | dec_mm_counter_fast(mm, MM_SWAPENTS); | 2630 | dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); |
2639 | pte = mk_pte(page, vma->vm_page_prot); | 2631 | pte = mk_pte(page, vma->vm_page_prot); |
2640 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { | 2632 | if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { |
2641 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2633 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
2642 | flags &= ~FAULT_FLAG_WRITE; | 2634 | fe->flags &= ~FAULT_FLAG_WRITE; |
2643 | ret |= VM_FAULT_WRITE; | 2635 | ret |= VM_FAULT_WRITE; |
2644 | exclusive = RMAP_EXCLUSIVE; | 2636 | exclusive = RMAP_EXCLUSIVE; |
2645 | } | 2637 | } |
2646 | flush_icache_page(vma, page); | 2638 | flush_icache_page(vma, page); |
2647 | if (pte_swp_soft_dirty(orig_pte)) | 2639 | if (pte_swp_soft_dirty(orig_pte)) |
2648 | pte = pte_mksoft_dirty(pte); | 2640 | pte = pte_mksoft_dirty(pte); |
2649 | set_pte_at(mm, address, page_table, pte); | 2641 | set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); |
2650 | if (page == swapcache) { | 2642 | if (page == swapcache) { |
2651 | do_page_add_anon_rmap(page, vma, address, exclusive); | 2643 | do_page_add_anon_rmap(page, vma, fe->address, exclusive); |
2652 | mem_cgroup_commit_charge(page, memcg, true, false); | 2644 | mem_cgroup_commit_charge(page, memcg, true, false); |
2653 | } else { /* ksm created a completely new copy */ | 2645 | } else { /* ksm created a completely new copy */ |
2654 | page_add_new_anon_rmap(page, vma, address, false); | 2646 | page_add_new_anon_rmap(page, vma, fe->address, false); |
2655 | mem_cgroup_commit_charge(page, memcg, false, false); | 2647 | mem_cgroup_commit_charge(page, memcg, false, false); |
2656 | lru_cache_add_active_or_unevictable(page, vma); | 2648 | lru_cache_add_active_or_unevictable(page, vma); |
2657 | } | 2649 | } |
@@ -2674,22 +2666,22 @@ int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2674 | put_page(swapcache); | 2666 | put_page(swapcache); |
2675 | } | 2667 | } |
2676 | 2668 | ||
2677 | if (flags & FAULT_FLAG_WRITE) { | 2669 | if (fe->flags & FAULT_FLAG_WRITE) { |
2678 | ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); | 2670 | ret |= do_wp_page(fe, pte); |
2679 | if (ret & VM_FAULT_ERROR) | 2671 | if (ret & VM_FAULT_ERROR) |
2680 | ret &= VM_FAULT_ERROR; | 2672 | ret &= VM_FAULT_ERROR; |
2681 | goto out; | 2673 | goto out; |
2682 | } | 2674 | } |
2683 | 2675 | ||
2684 | /* No need to invalidate - it was non-present before */ | 2676 | /* No need to invalidate - it was non-present before */ |
2685 | update_mmu_cache(vma, address, page_table); | 2677 | update_mmu_cache(vma, fe->address, fe->pte); |
2686 | unlock: | 2678 | unlock: |
2687 | pte_unmap_unlock(page_table, ptl); | 2679 | pte_unmap_unlock(fe->pte, fe->ptl); |
2688 | out: | 2680 | out: |
2689 | return ret; | 2681 | return ret; |
2690 | out_nomap: | 2682 | out_nomap: |
2691 | mem_cgroup_cancel_charge(page, memcg, false); | 2683 | mem_cgroup_cancel_charge(page, memcg, false); |
2692 | pte_unmap_unlock(page_table, ptl); | 2684 | pte_unmap_unlock(fe->pte, fe->ptl); |
2693 | out_page: | 2685 | out_page: |
2694 | unlock_page(page); | 2686 | unlock_page(page); |
2695 | out_release: | 2687 | out_release: |
@@ -2740,37 +2732,36 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo | |||
2740 | * but allow concurrent faults), and pte mapped but not yet locked. | 2732 | * but allow concurrent faults), and pte mapped but not yet locked. |
2741 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2733 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2742 | */ | 2734 | */ |
2743 | static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2735 | static int do_anonymous_page(struct fault_env *fe) |
2744 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
2745 | unsigned int flags) | ||
2746 | { | 2736 | { |
2737 | struct vm_area_struct *vma = fe->vma; | ||
2747 | struct mem_cgroup *memcg; | 2738 | struct mem_cgroup *memcg; |
2748 | struct page *page; | 2739 | struct page *page; |
2749 | spinlock_t *ptl; | ||
2750 | pte_t entry; | 2740 | pte_t entry; |
2751 | 2741 | ||
2752 | pte_unmap(page_table); | 2742 | pte_unmap(fe->pte); |
2753 | 2743 | ||
2754 | /* File mapping without ->vm_ops ? */ | 2744 | /* File mapping without ->vm_ops ? */ |
2755 | if (vma->vm_flags & VM_SHARED) | 2745 | if (vma->vm_flags & VM_SHARED) |
2756 | return VM_FAULT_SIGBUS; | 2746 | return VM_FAULT_SIGBUS; |
2757 | 2747 | ||
2758 | /* Check if we need to add a guard page to the stack */ | 2748 | /* Check if we need to add a guard page to the stack */ |
2759 | if (check_stack_guard_page(vma, address) < 0) | 2749 | if (check_stack_guard_page(vma, fe->address) < 0) |
2760 | return VM_FAULT_SIGSEGV; | 2750 | return VM_FAULT_SIGSEGV; |
2761 | 2751 | ||
2762 | /* Use the zero-page for reads */ | 2752 | /* Use the zero-page for reads */ |
2763 | if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) { | 2753 | if (!(fe->flags & FAULT_FLAG_WRITE) && |
2764 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), | 2754 | !mm_forbids_zeropage(vma->vm_mm)) { |
2755 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), | ||
2765 | vma->vm_page_prot)); | 2756 | vma->vm_page_prot)); |
2766 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2757 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2767 | if (!pte_none(*page_table)) | 2758 | &fe->ptl); |
2759 | if (!pte_none(*fe->pte)) | ||
2768 | goto unlock; | 2760 | goto unlock; |
2769 | /* Deliver the page fault to userland, check inside PT lock */ | 2761 | /* Deliver the page fault to userland, check inside PT lock */ |
2770 | if (userfaultfd_missing(vma)) { | 2762 | if (userfaultfd_missing(vma)) { |
2771 | pte_unmap_unlock(page_table, ptl); | 2763 | pte_unmap_unlock(fe->pte, fe->ptl); |
2772 | return handle_userfault(vma, address, flags, | 2764 | return handle_userfault(fe, VM_UFFD_MISSING); |
2773 | VM_UFFD_MISSING); | ||
2774 | } | 2765 | } |
2775 | goto setpte; | 2766 | goto setpte; |
2776 | } | 2767 | } |
@@ -2778,11 +2769,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2778 | /* Allocate our own private page. */ | 2769 | /* Allocate our own private page. */ |
2779 | if (unlikely(anon_vma_prepare(vma))) | 2770 | if (unlikely(anon_vma_prepare(vma))) |
2780 | goto oom; | 2771 | goto oom; |
2781 | page = alloc_zeroed_user_highpage_movable(vma, address); | 2772 | page = alloc_zeroed_user_highpage_movable(vma, fe->address); |
2782 | if (!page) | 2773 | if (!page) |
2783 | goto oom; | 2774 | goto oom; |
2784 | 2775 | ||
2785 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) | 2776 | if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) |
2786 | goto oom_free_page; | 2777 | goto oom_free_page; |
2787 | 2778 | ||
2788 | /* | 2779 | /* |
@@ -2796,30 +2787,30 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2796 | if (vma->vm_flags & VM_WRITE) | 2787 | if (vma->vm_flags & VM_WRITE) |
2797 | entry = pte_mkwrite(pte_mkdirty(entry)); | 2788 | entry = pte_mkwrite(pte_mkdirty(entry)); |
2798 | 2789 | ||
2799 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2790 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2800 | if (!pte_none(*page_table)) | 2791 | &fe->ptl); |
2792 | if (!pte_none(*fe->pte)) | ||
2801 | goto release; | 2793 | goto release; |
2802 | 2794 | ||
2803 | /* Deliver the page fault to userland, check inside PT lock */ | 2795 | /* Deliver the page fault to userland, check inside PT lock */ |
2804 | if (userfaultfd_missing(vma)) { | 2796 | if (userfaultfd_missing(vma)) { |
2805 | pte_unmap_unlock(page_table, ptl); | 2797 | pte_unmap_unlock(fe->pte, fe->ptl); |
2806 | mem_cgroup_cancel_charge(page, memcg, false); | 2798 | mem_cgroup_cancel_charge(page, memcg, false); |
2807 | put_page(page); | 2799 | put_page(page); |
2808 | return handle_userfault(vma, address, flags, | 2800 | return handle_userfault(fe, VM_UFFD_MISSING); |
2809 | VM_UFFD_MISSING); | ||
2810 | } | 2801 | } |
2811 | 2802 | ||
2812 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2803 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2813 | page_add_new_anon_rmap(page, vma, address, false); | 2804 | page_add_new_anon_rmap(page, vma, fe->address, false); |
2814 | mem_cgroup_commit_charge(page, memcg, false, false); | 2805 | mem_cgroup_commit_charge(page, memcg, false, false); |
2815 | lru_cache_add_active_or_unevictable(page, vma); | 2806 | lru_cache_add_active_or_unevictable(page, vma); |
2816 | setpte: | 2807 | setpte: |
2817 | set_pte_at(mm, address, page_table, entry); | 2808 | set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); |
2818 | 2809 | ||
2819 | /* No need to invalidate - it was non-present before */ | 2810 | /* No need to invalidate - it was non-present before */ |
2820 | update_mmu_cache(vma, address, page_table); | 2811 | update_mmu_cache(vma, fe->address, fe->pte); |
2821 | unlock: | 2812 | unlock: |
2822 | pte_unmap_unlock(page_table, ptl); | 2813 | pte_unmap_unlock(fe->pte, fe->ptl); |
2823 | return 0; | 2814 | return 0; |
2824 | release: | 2815 | release: |
2825 | mem_cgroup_cancel_charge(page, memcg, false); | 2816 | mem_cgroup_cancel_charge(page, memcg, false); |
@@ -2836,17 +2827,16 @@ oom: | |||
2836 | * released depending on flags and vma->vm_ops->fault() return value. | 2827 | * released depending on flags and vma->vm_ops->fault() return value. |
2837 | * See filemap_fault() and __lock_page_retry(). | 2828 | * See filemap_fault() and __lock_page_retry(). |
2838 | */ | 2829 | */ |
2839 | static int __do_fault(struct vm_area_struct *vma, unsigned long address, | 2830 | static int __do_fault(struct fault_env *fe, pgoff_t pgoff, |
2840 | pgoff_t pgoff, unsigned int flags, | 2831 | struct page *cow_page, struct page **page, void **entry) |
2841 | struct page *cow_page, struct page **page, | ||
2842 | void **entry) | ||
2843 | { | 2832 | { |
2833 | struct vm_area_struct *vma = fe->vma; | ||
2844 | struct vm_fault vmf; | 2834 | struct vm_fault vmf; |
2845 | int ret; | 2835 | int ret; |
2846 | 2836 | ||
2847 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); | 2837 | vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); |
2848 | vmf.pgoff = pgoff; | 2838 | vmf.pgoff = pgoff; |
2849 | vmf.flags = flags; | 2839 | vmf.flags = fe->flags; |
2850 | vmf.page = NULL; | 2840 | vmf.page = NULL; |
2851 | vmf.gfp_mask = __get_fault_gfp_mask(vma); | 2841 | vmf.gfp_mask = __get_fault_gfp_mask(vma); |
2852 | vmf.cow_page = cow_page; | 2842 | vmf.cow_page = cow_page; |
@@ -2878,38 +2868,36 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, | |||
2878 | /** | 2868 | /** |
2879 | * do_set_pte - setup new PTE entry for given page and add reverse page mapping. | 2869 | * do_set_pte - setup new PTE entry for given page and add reverse page mapping. |
2880 | * | 2870 | * |
2881 | * @vma: virtual memory area | 2871 | * @fe: fault environment |
2882 | * @address: user virtual address | ||
2883 | * @page: page to map | 2872 | * @page: page to map |
2884 | * @pte: pointer to target page table entry | ||
2885 | * @write: true, if new entry is writable | ||
2886 | * @anon: true, if it's anonymous page | ||
2887 | * | 2873 | * |
2888 | * Caller must hold page table lock relevant for @pte. | 2874 | * Caller must hold page table lock relevant for @fe->pte. |
2889 | * | 2875 | * |
2890 | * Target users are page handler itself and implementations of | 2876 | * Target users are page handler itself and implementations of |
2891 | * vm_ops->map_pages. | 2877 | * vm_ops->map_pages. |
2892 | */ | 2878 | */ |
2893 | void do_set_pte(struct vm_area_struct *vma, unsigned long address, | 2879 | void do_set_pte(struct fault_env *fe, struct page *page) |
2894 | struct page *page, pte_t *pte, bool write, bool anon) | ||
2895 | { | 2880 | { |
2881 | struct vm_area_struct *vma = fe->vma; | ||
2882 | bool write = fe->flags & FAULT_FLAG_WRITE; | ||
2896 | pte_t entry; | 2883 | pte_t entry; |
2897 | 2884 | ||
2898 | flush_icache_page(vma, page); | 2885 | flush_icache_page(vma, page); |
2899 | entry = mk_pte(page, vma->vm_page_prot); | 2886 | entry = mk_pte(page, vma->vm_page_prot); |
2900 | if (write) | 2887 | if (write) |
2901 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2888 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2902 | if (anon) { | 2889 | /* copy-on-write page */ |
2890 | if (write && !(vma->vm_flags & VM_SHARED)) { | ||
2903 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2891 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2904 | page_add_new_anon_rmap(page, vma, address, false); | 2892 | page_add_new_anon_rmap(page, vma, fe->address, false); |
2905 | } else { | 2893 | } else { |
2906 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); | 2894 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); |
2907 | page_add_file_rmap(page); | 2895 | page_add_file_rmap(page); |
2908 | } | 2896 | } |
2909 | set_pte_at(vma->vm_mm, address, pte, entry); | 2897 | set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); |
2910 | 2898 | ||
2911 | /* no need to invalidate: a not-present page won't be cached */ | 2899 | /* no need to invalidate: a not-present page won't be cached */ |
2912 | update_mmu_cache(vma, address, pte); | 2900 | update_mmu_cache(vma, fe->address, fe->pte); |
2913 | } | 2901 | } |
2914 | 2902 | ||
2915 | static unsigned long fault_around_bytes __read_mostly = | 2903 | static unsigned long fault_around_bytes __read_mostly = |
@@ -2976,57 +2964,53 @@ late_initcall(fault_around_debugfs); | |||
2976 | * fault_around_pages() value (and therefore to page order). This way it's | 2964 | * fault_around_pages() value (and therefore to page order). This way it's |
2977 | * easier to guarantee that we don't cross page table boundaries. | 2965 | * easier to guarantee that we don't cross page table boundaries. |
2978 | */ | 2966 | */ |
2979 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | 2967 | static void do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) |
2980 | pte_t *pte, pgoff_t pgoff, unsigned int flags) | ||
2981 | { | 2968 | { |
2982 | unsigned long start_addr, nr_pages, mask; | 2969 | unsigned long address = fe->address, start_addr, nr_pages, mask; |
2983 | pgoff_t max_pgoff; | 2970 | pte_t *pte = fe->pte; |
2984 | struct vm_fault vmf; | 2971 | pgoff_t end_pgoff; |
2985 | int off; | 2972 | int off; |
2986 | 2973 | ||
2987 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; | 2974 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; |
2988 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; | 2975 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; |
2989 | 2976 | ||
2990 | start_addr = max(address & mask, vma->vm_start); | 2977 | start_addr = max(fe->address & mask, fe->vma->vm_start); |
2991 | off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); | 2978 | off = ((fe->address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); |
2992 | pte -= off; | 2979 | fe->pte -= off; |
2993 | pgoff -= off; | 2980 | start_pgoff -= off; |
2994 | 2981 | ||
2995 | /* | 2982 | /* |
2996 | * max_pgoff is either end of page table or end of vma | 2983 | * end_pgoff is either end of page table or end of vma |
2997 | * or fault_around_pages() from pgoff, depending what is nearest. | 2984 | * or fault_around_pages() from start_pgoff, depending what is nearest. |
2998 | */ | 2985 | */ |
2999 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | 2986 | end_pgoff = start_pgoff - |
2987 | ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | ||
3000 | PTRS_PER_PTE - 1; | 2988 | PTRS_PER_PTE - 1; |
3001 | max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, | 2989 | end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, |
3002 | pgoff + nr_pages - 1); | 2990 | start_pgoff + nr_pages - 1); |
3003 | 2991 | ||
3004 | /* Check if it makes any sense to call ->map_pages */ | 2992 | /* Check if it makes any sense to call ->map_pages */ |
3005 | while (!pte_none(*pte)) { | 2993 | fe->address = start_addr; |
3006 | if (++pgoff > max_pgoff) | 2994 | while (!pte_none(*fe->pte)) { |
3007 | return; | 2995 | if (++start_pgoff > end_pgoff) |
3008 | start_addr += PAGE_SIZE; | 2996 | goto out; |
3009 | if (start_addr >= vma->vm_end) | 2997 | fe->address += PAGE_SIZE; |
3010 | return; | 2998 | if (fe->address >= fe->vma->vm_end) |
3011 | pte++; | 2999 | goto out; |
3000 | fe->pte++; | ||
3012 | } | 3001 | } |
3013 | 3002 | ||
3014 | vmf.virtual_address = (void __user *) start_addr; | 3003 | fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); |
3015 | vmf.pte = pte; | 3004 | out: |
3016 | vmf.pgoff = pgoff; | 3005 | /* restore fault_env */ |
3017 | vmf.max_pgoff = max_pgoff; | 3006 | fe->pte = pte; |
3018 | vmf.flags = flags; | 3007 | fe->address = address; |
3019 | vmf.gfp_mask = __get_fault_gfp_mask(vma); | ||
3020 | vma->vm_ops->map_pages(vma, &vmf); | ||
3021 | } | 3008 | } |
3022 | 3009 | ||
3023 | static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3010 | static int do_read_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte) |
3024 | unsigned long address, pmd_t *pmd, | ||
3025 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | ||
3026 | { | 3011 | { |
3012 | struct vm_area_struct *vma = fe->vma; | ||
3027 | struct page *fault_page; | 3013 | struct page *fault_page; |
3028 | spinlock_t *ptl; | ||
3029 | pte_t *pte; | ||
3030 | int ret = 0; | 3014 | int ret = 0; |
3031 | 3015 | ||
3032 | /* | 3016 | /* |
@@ -3035,66 +3019,68 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3035 | * something). | 3019 | * something). |
3036 | */ | 3020 | */ |
3037 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { | 3021 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { |
3038 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 3022 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
3039 | do_fault_around(vma, address, pte, pgoff, flags); | 3023 | &fe->ptl); |
3040 | if (!pte_same(*pte, orig_pte)) | 3024 | if (!pte_same(*fe->pte, orig_pte)) |
3025 | goto unlock_out; | ||
3026 | do_fault_around(fe, pgoff); | ||
3027 | /* Check if the fault is handled by faultaround */ | ||
3028 | if (!pte_same(*fe->pte, orig_pte)) | ||
3041 | goto unlock_out; | 3029 | goto unlock_out; |
3042 | pte_unmap_unlock(pte, ptl); | 3030 | pte_unmap_unlock(fe->pte, fe->ptl); |
3043 | } | 3031 | } |
3044 | 3032 | ||
3045 | ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); | 3033 | ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); |
3046 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3034 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3047 | return ret; | 3035 | return ret; |
3048 | 3036 | ||
3049 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 3037 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, &fe->ptl); |
3050 | if (unlikely(!pte_same(*pte, orig_pte))) { | 3038 | if (unlikely(!pte_same(*fe->pte, orig_pte))) { |
3051 | pte_unmap_unlock(pte, ptl); | 3039 | pte_unmap_unlock(fe->pte, fe->ptl); |
3052 | unlock_page(fault_page); | 3040 | unlock_page(fault_page); |
3053 | put_page(fault_page); | 3041 | put_page(fault_page); |
3054 | return ret; | 3042 | return ret; |
3055 | } | 3043 | } |
3056 | do_set_pte(vma, address, fault_page, pte, false, false); | 3044 | do_set_pte(fe, fault_page); |
3057 | unlock_page(fault_page); | 3045 | unlock_page(fault_page); |
3058 | unlock_out: | 3046 | unlock_out: |
3059 | pte_unmap_unlock(pte, ptl); | 3047 | pte_unmap_unlock(fe->pte, fe->ptl); |
3060 | return ret; | 3048 | return ret; |
3061 | } | 3049 | } |
3062 | 3050 | ||
3063 | static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3051 | static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte) |
3064 | unsigned long address, pmd_t *pmd, | ||
3065 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | ||
3066 | { | 3052 | { |
3053 | struct vm_area_struct *vma = fe->vma; | ||
3067 | struct page *fault_page, *new_page; | 3054 | struct page *fault_page, *new_page; |
3068 | void *fault_entry; | 3055 | void *fault_entry; |
3069 | struct mem_cgroup *memcg; | 3056 | struct mem_cgroup *memcg; |
3070 | spinlock_t *ptl; | ||
3071 | pte_t *pte; | ||
3072 | int ret; | 3057 | int ret; |
3073 | 3058 | ||
3074 | if (unlikely(anon_vma_prepare(vma))) | 3059 | if (unlikely(anon_vma_prepare(vma))) |
3075 | return VM_FAULT_OOM; | 3060 | return VM_FAULT_OOM; |
3076 | 3061 | ||
3077 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 3062 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); |
3078 | if (!new_page) | 3063 | if (!new_page) |
3079 | return VM_FAULT_OOM; | 3064 | return VM_FAULT_OOM; |
3080 | 3065 | ||
3081 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { | 3066 | if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, |
3067 | &memcg, false)) { | ||
3082 | put_page(new_page); | 3068 | put_page(new_page); |
3083 | return VM_FAULT_OOM; | 3069 | return VM_FAULT_OOM; |
3084 | } | 3070 | } |
3085 | 3071 | ||
3086 | ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page, | 3072 | ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); |
3087 | &fault_entry); | ||
3088 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3073 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3089 | goto uncharge_out; | 3074 | goto uncharge_out; |
3090 | 3075 | ||
3091 | if (!(ret & VM_FAULT_DAX_LOCKED)) | 3076 | if (!(ret & VM_FAULT_DAX_LOCKED)) |
3092 | copy_user_highpage(new_page, fault_page, address, vma); | 3077 | copy_user_highpage(new_page, fault_page, fe->address, vma); |
3093 | __SetPageUptodate(new_page); | 3078 | __SetPageUptodate(new_page); |
3094 | 3079 | ||
3095 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 3080 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
3096 | if (unlikely(!pte_same(*pte, orig_pte))) { | 3081 | &fe->ptl); |
3097 | pte_unmap_unlock(pte, ptl); | 3082 | if (unlikely(!pte_same(*fe->pte, orig_pte))) { |
3083 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
3098 | if (!(ret & VM_FAULT_DAX_LOCKED)) { | 3084 | if (!(ret & VM_FAULT_DAX_LOCKED)) { |
3099 | unlock_page(fault_page); | 3085 | unlock_page(fault_page); |
3100 | put_page(fault_page); | 3086 | put_page(fault_page); |
@@ -3104,10 +3090,10 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3104 | } | 3090 | } |
3105 | goto uncharge_out; | 3091 | goto uncharge_out; |
3106 | } | 3092 | } |
3107 | do_set_pte(vma, address, new_page, pte, true, true); | 3093 | do_set_pte(fe, new_page); |
3108 | mem_cgroup_commit_charge(new_page, memcg, false, false); | 3094 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
3109 | lru_cache_add_active_or_unevictable(new_page, vma); | 3095 | lru_cache_add_active_or_unevictable(new_page, vma); |
3110 | pte_unmap_unlock(pte, ptl); | 3096 | pte_unmap_unlock(fe->pte, fe->ptl); |
3111 | if (!(ret & VM_FAULT_DAX_LOCKED)) { | 3097 | if (!(ret & VM_FAULT_DAX_LOCKED)) { |
3112 | unlock_page(fault_page); | 3098 | unlock_page(fault_page); |
3113 | put_page(fault_page); | 3099 | put_page(fault_page); |
@@ -3121,18 +3107,15 @@ uncharge_out: | |||
3121 | return ret; | 3107 | return ret; |
3122 | } | 3108 | } |
3123 | 3109 | ||
3124 | static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3110 | static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte) |
3125 | unsigned long address, pmd_t *pmd, | ||
3126 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | ||
3127 | { | 3111 | { |
3112 | struct vm_area_struct *vma = fe->vma; | ||
3128 | struct page *fault_page; | 3113 | struct page *fault_page; |
3129 | struct address_space *mapping; | 3114 | struct address_space *mapping; |
3130 | spinlock_t *ptl; | ||
3131 | pte_t *pte; | ||
3132 | int dirtied = 0; | 3115 | int dirtied = 0; |
3133 | int ret, tmp; | 3116 | int ret, tmp; |
3134 | 3117 | ||
3135 | ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); | 3118 | ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); |
3136 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3119 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3137 | return ret; | 3120 | return ret; |
3138 | 3121 | ||
@@ -3142,7 +3125,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3142 | */ | 3125 | */ |
3143 | if (vma->vm_ops->page_mkwrite) { | 3126 | if (vma->vm_ops->page_mkwrite) { |
3144 | unlock_page(fault_page); | 3127 | unlock_page(fault_page); |
3145 | tmp = do_page_mkwrite(vma, fault_page, address); | 3128 | tmp = do_page_mkwrite(vma, fault_page, fe->address); |
3146 | if (unlikely(!tmp || | 3129 | if (unlikely(!tmp || |
3147 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 3130 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
3148 | put_page(fault_page); | 3131 | put_page(fault_page); |
@@ -3150,15 +3133,16 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3150 | } | 3133 | } |
3151 | } | 3134 | } |
3152 | 3135 | ||
3153 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 3136 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
3154 | if (unlikely(!pte_same(*pte, orig_pte))) { | 3137 | &fe->ptl); |
3155 | pte_unmap_unlock(pte, ptl); | 3138 | if (unlikely(!pte_same(*fe->pte, orig_pte))) { |
3139 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
3156 | unlock_page(fault_page); | 3140 | unlock_page(fault_page); |
3157 | put_page(fault_page); | 3141 | put_page(fault_page); |
3158 | return ret; | 3142 | return ret; |
3159 | } | 3143 | } |
3160 | do_set_pte(vma, address, fault_page, pte, true, false); | 3144 | do_set_pte(fe, fault_page); |
3161 | pte_unmap_unlock(pte, ptl); | 3145 | pte_unmap_unlock(fe->pte, fe->ptl); |
3162 | 3146 | ||
3163 | if (set_page_dirty(fault_page)) | 3147 | if (set_page_dirty(fault_page)) |
3164 | dirtied = 1; | 3148 | dirtied = 1; |
@@ -3190,23 +3174,20 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3190 | * The mmap_sem may have been released depending on flags and our | 3174 | * The mmap_sem may have been released depending on flags and our |
3191 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3175 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3192 | */ | 3176 | */ |
3193 | static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3177 | static int do_fault(struct fault_env *fe, pte_t orig_pte) |
3194 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
3195 | unsigned int flags, pte_t orig_pte) | ||
3196 | { | 3178 | { |
3197 | pgoff_t pgoff = linear_page_index(vma, address); | 3179 | struct vm_area_struct *vma = fe->vma; |
3180 | pgoff_t pgoff = linear_page_index(vma, fe->address); | ||
3198 | 3181 | ||
3199 | pte_unmap(page_table); | 3182 | pte_unmap(fe->pte); |
3200 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ | 3183 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ |
3201 | if (!vma->vm_ops->fault) | 3184 | if (!vma->vm_ops->fault) |
3202 | return VM_FAULT_SIGBUS; | 3185 | return VM_FAULT_SIGBUS; |
3203 | if (!(flags & FAULT_FLAG_WRITE)) | 3186 | if (!(fe->flags & FAULT_FLAG_WRITE)) |
3204 | return do_read_fault(mm, vma, address, pmd, pgoff, flags, | 3187 | return do_read_fault(fe, pgoff, orig_pte); |
3205 | orig_pte); | ||
3206 | if (!(vma->vm_flags & VM_SHARED)) | 3188 | if (!(vma->vm_flags & VM_SHARED)) |
3207 | return do_cow_fault(mm, vma, address, pmd, pgoff, flags, | 3189 | return do_cow_fault(fe, pgoff, orig_pte); |
3208 | orig_pte); | 3190 | return do_shared_fault(fe, pgoff, orig_pte); |
3209 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | ||
3210 | } | 3191 | } |
3211 | 3192 | ||
3212 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | 3193 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |
@@ -3224,11 +3205,10 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | |||
3224 | return mpol_misplaced(page, vma, addr); | 3205 | return mpol_misplaced(page, vma, addr); |
3225 | } | 3206 | } |
3226 | 3207 | ||
3227 | static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 3208 | static int do_numa_page(struct fault_env *fe, pte_t pte) |
3228 | unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) | ||
3229 | { | 3209 | { |
3210 | struct vm_area_struct *vma = fe->vma; | ||
3230 | struct page *page = NULL; | 3211 | struct page *page = NULL; |
3231 | spinlock_t *ptl; | ||
3232 | int page_nid = -1; | 3212 | int page_nid = -1; |
3233 | int last_cpupid; | 3213 | int last_cpupid; |
3234 | int target_nid; | 3214 | int target_nid; |
@@ -3248,10 +3228,10 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3248 | * page table entry is not accessible, so there would be no | 3228 | * page table entry is not accessible, so there would be no |
3249 | * concurrent hardware modifications to the PTE. | 3229 | * concurrent hardware modifications to the PTE. |
3250 | */ | 3230 | */ |
3251 | ptl = pte_lockptr(mm, pmd); | 3231 | fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); |
3252 | spin_lock(ptl); | 3232 | spin_lock(fe->ptl); |
3253 | if (unlikely(!pte_same(*ptep, pte))) { | 3233 | if (unlikely(!pte_same(*fe->pte, pte))) { |
3254 | pte_unmap_unlock(ptep, ptl); | 3234 | pte_unmap_unlock(fe->pte, fe->ptl); |
3255 | goto out; | 3235 | goto out; |
3256 | } | 3236 | } |
3257 | 3237 | ||
@@ -3260,18 +3240,18 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3260 | pte = pte_mkyoung(pte); | 3240 | pte = pte_mkyoung(pte); |
3261 | if (was_writable) | 3241 | if (was_writable) |
3262 | pte = pte_mkwrite(pte); | 3242 | pte = pte_mkwrite(pte); |
3263 | set_pte_at(mm, addr, ptep, pte); | 3243 | set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); |
3264 | update_mmu_cache(vma, addr, ptep); | 3244 | update_mmu_cache(vma, fe->address, fe->pte); |
3265 | 3245 | ||
3266 | page = vm_normal_page(vma, addr, pte); | 3246 | page = vm_normal_page(vma, fe->address, pte); |
3267 | if (!page) { | 3247 | if (!page) { |
3268 | pte_unmap_unlock(ptep, ptl); | 3248 | pte_unmap_unlock(fe->pte, fe->ptl); |
3269 | return 0; | 3249 | return 0; |
3270 | } | 3250 | } |
3271 | 3251 | ||
3272 | /* TODO: handle PTE-mapped THP */ | 3252 | /* TODO: handle PTE-mapped THP */ |
3273 | if (PageCompound(page)) { | 3253 | if (PageCompound(page)) { |
3274 | pte_unmap_unlock(ptep, ptl); | 3254 | pte_unmap_unlock(fe->pte, fe->ptl); |
3275 | return 0; | 3255 | return 0; |
3276 | } | 3256 | } |
3277 | 3257 | ||
@@ -3295,8 +3275,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3295 | 3275 | ||
3296 | last_cpupid = page_cpupid_last(page); | 3276 | last_cpupid = page_cpupid_last(page); |
3297 | page_nid = page_to_nid(page); | 3277 | page_nid = page_to_nid(page); |
3298 | target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); | 3278 | target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, |
3299 | pte_unmap_unlock(ptep, ptl); | 3279 | &flags); |
3280 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
3300 | if (target_nid == -1) { | 3281 | if (target_nid == -1) { |
3301 | put_page(page); | 3282 | put_page(page); |
3302 | goto out; | 3283 | goto out; |
@@ -3316,24 +3297,24 @@ out: | |||
3316 | return 0; | 3297 | return 0; |
3317 | } | 3298 | } |
3318 | 3299 | ||
3319 | static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | 3300 | static int create_huge_pmd(struct fault_env *fe) |
3320 | unsigned long address, pmd_t *pmd, unsigned int flags) | ||
3321 | { | 3301 | { |
3302 | struct vm_area_struct *vma = fe->vma; | ||
3322 | if (vma_is_anonymous(vma)) | 3303 | if (vma_is_anonymous(vma)) |
3323 | return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); | 3304 | return do_huge_pmd_anonymous_page(fe); |
3324 | if (vma->vm_ops->pmd_fault) | 3305 | if (vma->vm_ops->pmd_fault) |
3325 | return vma->vm_ops->pmd_fault(vma, address, pmd, flags); | 3306 | return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, |
3307 | fe->flags); | ||
3326 | return VM_FAULT_FALLBACK; | 3308 | return VM_FAULT_FALLBACK; |
3327 | } | 3309 | } |
3328 | 3310 | ||
3329 | static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | 3311 | static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) |
3330 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd, | ||
3331 | unsigned int flags) | ||
3332 | { | 3312 | { |
3333 | if (vma_is_anonymous(vma)) | 3313 | if (vma_is_anonymous(fe->vma)) |
3334 | return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); | 3314 | return do_huge_pmd_wp_page(fe, orig_pmd); |
3335 | if (vma->vm_ops->pmd_fault) | 3315 | if (fe->vma->vm_ops->pmd_fault) |
3336 | return vma->vm_ops->pmd_fault(vma, address, pmd, flags); | 3316 | return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, |
3317 | fe->flags); | ||
3337 | return VM_FAULT_FALLBACK; | 3318 | return VM_FAULT_FALLBACK; |
3338 | } | 3319 | } |
3339 | 3320 | ||
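Note that the ->pmd_fault callback keeps its unpacked argument list; create_huge_pmd() and wp_huge_pmd() unpack the fault_env at that boundary. Below is a hedged sketch of a driver-side stub matching those call sites (hypothetical code; the signature is inferred from the calls above, not quoted from a header). Returning VM_FAULT_FALLBACK tells the core to retry the fault at pte granularity.

/*
 * Hypothetical ->pmd_fault implementation matching the call sites above.
 * A mapping with no huge-page support simply asks for the pte-sized path.
 */
static int demo_pmd_fault(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmd, unsigned int flags)
{
	return VM_FAULT_FALLBACK;
}

static const struct vm_operations_struct demo_vm_ops = {
	.pmd_fault	= demo_pmd_fault,
};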
@@ -3353,12 +3334,9 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3353 | * The mmap_sem may have been released depending on flags and our | 3334 | * The mmap_sem may have been released depending on flags and our |
3354 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3335 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3355 | */ | 3336 | */ |
3356 | static int handle_pte_fault(struct mm_struct *mm, | 3337 | static int handle_pte_fault(struct fault_env *fe) |
3357 | struct vm_area_struct *vma, unsigned long address, | ||
3358 | pte_t *pte, pmd_t *pmd, unsigned int flags) | ||
3359 | { | 3338 | { |
3360 | pte_t entry; | 3339 | pte_t entry; |
3361 | spinlock_t *ptl; | ||
3362 | 3340 | ||
3363 | /* | 3341 | /* |
3364 | * some architectures can have larger ptes than wordsize, | 3342 | * some architectures can have larger ptes than wordsize, |
@@ -3368,37 +3346,34 @@ static int handle_pte_fault(struct mm_struct *mm, | |||
3368 | * we later double check anyway with the ptl lock held. So here | 3346 | * we later double check anyway with the ptl lock held. So here |
3369 | * a barrier will do. | 3347 | * a barrier will do. |
3370 | */ | 3348 | */ |
3371 | entry = *pte; | 3349 | entry = *fe->pte; |
3372 | barrier(); | 3350 | barrier(); |
3373 | if (!pte_present(entry)) { | 3351 | if (!pte_present(entry)) { |
3374 | if (pte_none(entry)) { | 3352 | if (pte_none(entry)) { |
3375 | if (vma_is_anonymous(vma)) | 3353 | if (vma_is_anonymous(fe->vma)) |
3376 | return do_anonymous_page(mm, vma, address, | 3354 | return do_anonymous_page(fe); |
3377 | pte, pmd, flags); | ||
3378 | else | 3355 | else |
3379 | return do_fault(mm, vma, address, pte, pmd, | 3356 | return do_fault(fe, entry); |
3380 | flags, entry); | ||
3381 | } | 3357 | } |
3382 | return do_swap_page(mm, vma, address, | 3358 | return do_swap_page(fe, entry); |
3383 | pte, pmd, flags, entry); | ||
3384 | } | 3359 | } |
3385 | 3360 | ||
3386 | if (pte_protnone(entry)) | 3361 | if (pte_protnone(entry)) |
3387 | return do_numa_page(mm, vma, address, entry, pte, pmd); | 3362 | return do_numa_page(fe, entry); |
3388 | 3363 | ||
3389 | ptl = pte_lockptr(mm, pmd); | 3364 | fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); |
3390 | spin_lock(ptl); | 3365 | spin_lock(fe->ptl); |
3391 | if (unlikely(!pte_same(*pte, entry))) | 3366 | if (unlikely(!pte_same(*fe->pte, entry))) |
3392 | goto unlock; | 3367 | goto unlock; |
3393 | if (flags & FAULT_FLAG_WRITE) { | 3368 | if (fe->flags & FAULT_FLAG_WRITE) { |
3394 | if (!pte_write(entry)) | 3369 | if (!pte_write(entry)) |
3395 | return do_wp_page(mm, vma, address, | 3370 | return do_wp_page(fe, entry); |
3396 | pte, pmd, ptl, entry); | ||
3397 | entry = pte_mkdirty(entry); | 3371 | entry = pte_mkdirty(entry); |
3398 | } | 3372 | } |
3399 | entry = pte_mkyoung(entry); | 3373 | entry = pte_mkyoung(entry); |
3400 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { | 3374 | if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, |
3401 | update_mmu_cache(vma, address, pte); | 3375 | fe->flags & FAULT_FLAG_WRITE)) { |
3376 | update_mmu_cache(fe->vma, fe->address, fe->pte); | ||
3402 | } else { | 3377 | } else { |
3403 | /* | 3378 | /* |
3404 | * This is needed only for protection faults but the arch code | 3379 | * This is needed only for protection faults but the arch code |
@@ -3406,11 +3381,11 @@ static int handle_pte_fault(struct mm_struct *mm, | |||
3406 | * This still avoids useless tlb flushes for .text page faults | 3381 | * This still avoids useless tlb flushes for .text page faults |
3407 | * with threads. | 3382 | * with threads. |
3408 | */ | 3383 | */ |
3409 | if (flags & FAULT_FLAG_WRITE) | 3384 | if (fe->flags & FAULT_FLAG_WRITE) |
3410 | flush_tlb_fix_spurious_fault(vma, address); | 3385 | flush_tlb_fix_spurious_fault(fe->vma, fe->address); |
3411 | } | 3386 | } |
3412 | unlock: | 3387 | unlock: |
3413 | pte_unmap_unlock(pte, ptl); | 3388 | pte_unmap_unlock(fe->pte, fe->ptl); |
3414 | return 0; | 3389 | return 0; |
3415 | } | 3390 | } |
3416 | 3391 | ||
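handle_pte_fault() above reads the pte without holding the page-table lock (hence the barrier()), and every path that goes on to modify it first re-validates that snapshot under fe->ptl. A condensed sketch of the idiom, using a hypothetical helper name and assuming fe->pte is already mapped:

/*
 * Hypothetical helper condensing the lock-and-revalidate idiom used by
 * do_numa_page() and handle_pte_fault() above: take the pte lock, then
 * check that the lockless snapshot still matches the page table.
 */
static bool pte_snapshot_still_valid(struct fault_env *fe, pte_t snapshot)
{
	fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
	spin_lock(fe->ptl);
	if (unlikely(!pte_same(*fe->pte, snapshot))) {
		/* Raced with another fault or with reclaim; caller bails out. */
		pte_unmap_unlock(fe->pte, fe->ptl);
		return false;
	}
	return true;	/* pte unchanged and fe->ptl still held */
}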
@@ -3423,51 +3398,42 @@ unlock: | |||
3423 | static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | 3398 | static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, |
3424 | unsigned int flags) | 3399 | unsigned int flags) |
3425 | { | 3400 | { |
3401 | struct fault_env fe = { | ||
3402 | .vma = vma, | ||
3403 | .address = address, | ||
3404 | .flags = flags, | ||
3405 | }; | ||
3426 | struct mm_struct *mm = vma->vm_mm; | 3406 | struct mm_struct *mm = vma->vm_mm; |
3427 | pgd_t *pgd; | 3407 | pgd_t *pgd; |
3428 | pud_t *pud; | 3408 | pud_t *pud; |
3429 | pmd_t *pmd; | ||
3430 | pte_t *pte; | ||
3431 | |||
3432 | if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, | ||
3433 | flags & FAULT_FLAG_INSTRUCTION, | ||
3434 | flags & FAULT_FLAG_REMOTE)) | ||
3435 | return VM_FAULT_SIGSEGV; | ||
3436 | |||
3437 | if (unlikely(is_vm_hugetlb_page(vma))) | ||
3438 | return hugetlb_fault(mm, vma, address, flags); | ||
3439 | 3409 | ||
3440 | pgd = pgd_offset(mm, address); | 3410 | pgd = pgd_offset(mm, address); |
3441 | pud = pud_alloc(mm, pgd, address); | 3411 | pud = pud_alloc(mm, pgd, address); |
3442 | if (!pud) | 3412 | if (!pud) |
3443 | return VM_FAULT_OOM; | 3413 | return VM_FAULT_OOM; |
3444 | pmd = pmd_alloc(mm, pud, address); | 3414 | fe.pmd = pmd_alloc(mm, pud, address); |
3445 | if (!pmd) | 3415 | if (!fe.pmd) |
3446 | return VM_FAULT_OOM; | 3416 | return VM_FAULT_OOM; |
3447 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { | 3417 | if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { |
3448 | int ret = create_huge_pmd(mm, vma, address, pmd, flags); | 3418 | int ret = create_huge_pmd(&fe); |
3449 | if (!(ret & VM_FAULT_FALLBACK)) | 3419 | if (!(ret & VM_FAULT_FALLBACK)) |
3450 | return ret; | 3420 | return ret; |
3451 | } else { | 3421 | } else { |
3452 | pmd_t orig_pmd = *pmd; | 3422 | pmd_t orig_pmd = *fe.pmd; |
3453 | int ret; | 3423 | int ret; |
3454 | 3424 | ||
3455 | barrier(); | 3425 | barrier(); |
3456 | if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { | 3426 | if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { |
3457 | unsigned int dirty = flags & FAULT_FLAG_WRITE; | ||
3458 | |||
3459 | if (pmd_protnone(orig_pmd)) | 3427 | if (pmd_protnone(orig_pmd)) |
3460 | return do_huge_pmd_numa_page(mm, vma, address, | 3428 | return do_huge_pmd_numa_page(&fe, orig_pmd); |
3461 | orig_pmd, pmd); | ||
3462 | 3429 | ||
3463 | if (dirty && !pmd_write(orig_pmd)) { | 3430 | if ((fe.flags & FAULT_FLAG_WRITE) && |
3464 | ret = wp_huge_pmd(mm, vma, address, pmd, | 3431 | !pmd_write(orig_pmd)) { |
3465 | orig_pmd, flags); | 3432 | ret = wp_huge_pmd(&fe, orig_pmd); |
3466 | if (!(ret & VM_FAULT_FALLBACK)) | 3433 | if (!(ret & VM_FAULT_FALLBACK)) |
3467 | return ret; | 3434 | return ret; |
3468 | } else { | 3435 | } else { |
3469 | huge_pmd_set_accessed(mm, vma, address, pmd, | 3436 | huge_pmd_set_accessed(&fe, orig_pmd); |
3470 | orig_pmd, dirty); | ||
3471 | return 0; | 3437 | return 0; |
3472 | } | 3438 | } |
3473 | } | 3439 | } |
@@ -3478,7 +3444,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |||
3478 | * run pte_offset_map on the pmd, if an huge pmd could | 3444 | * run pte_offset_map on the pmd, if an huge pmd could |
3479 | * materialize from under us from a different thread. | 3445 | * materialize from under us from a different thread. |
3480 | */ | 3446 | */ |
3481 | if (unlikely(pte_alloc(mm, pmd, address))) | 3447 | if (unlikely(pte_alloc(fe.vma->vm_mm, fe.pmd, fe.address))) |
3482 | return VM_FAULT_OOM; | 3448 | return VM_FAULT_OOM; |
3483 | /* | 3449 | /* |
3484 | * If a huge pmd materialized under us just retry later. Use | 3450 | * If a huge pmd materialized under us just retry later. Use |
@@ -3491,7 +3457,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |||
3491 | * through an atomic read in C, which is what pmd_trans_unstable() | 3457 | * through an atomic read in C, which is what pmd_trans_unstable() |
3492 | * provides. | 3458 | * provides. |
3493 | */ | 3459 | */ |
3494 | if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd))) | 3460 | if (unlikely(pmd_trans_unstable(fe.pmd) || pmd_devmap(*fe.pmd))) |
3495 | return 0; | 3461 | return 0; |
3496 | /* | 3462 | /* |
3497 | * A regular pmd is established and it can't morph into a huge pmd | 3463 | * A regular pmd is established and it can't morph into a huge pmd |
@@ -3499,9 +3465,9 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |||
3499 | * read mode and khugepaged takes it in write mode. So now it's | 3465 | * read mode and khugepaged takes it in write mode. So now it's |
3500 | * safe to run pte_offset_map(). | 3466 | * safe to run pte_offset_map(). |
3501 | */ | 3467 | */ |
3502 | pte = pte_offset_map(pmd, address); | 3468 | fe.pte = pte_offset_map(fe.pmd, fe.address); |
3503 | 3469 | ||
3504 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | 3470 | return handle_pte_fault(&fe); |
3505 | } | 3471 | } |
3506 | 3472 | ||
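The on-stack fault_env built at the top of __handle_mm_fault() never leaves mm/memory.c; architecture fault handlers keep entering through handle_mm_fault(vma, address, flags), whose hunk follows. A simplified, hypothetical sketch of such a caller, with VM_FAULT_RETRY and error handling condensed:

/*
 * Hypothetical arch-side caller, heavily simplified: look up the VMA
 * under mmap_sem and hand the fault to the core with the vma-first
 * signature used below. VM_FAULT_RETRY handling is omitted.
 */
static int demo_arch_fault(struct mm_struct *mm, unsigned long address,
			   unsigned int flags)
{
	struct vm_area_struct *vma;
	int fault;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	if (!vma || vma->vm_start > address) {
		up_read(&mm->mmap_sem);
		return VM_FAULT_SIGSEGV;
	}
	fault = handle_mm_fault(vma, address, flags);
	up_read(&mm->mmap_sem);
	return fault;
}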
3507 | /* | 3473 | /* |
@@ -3530,7 +3496,15 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |||
3530 | if (flags & FAULT_FLAG_USER) | 3496 | if (flags & FAULT_FLAG_USER) |
3531 | mem_cgroup_oom_enable(); | 3497 | mem_cgroup_oom_enable(); |
3532 | 3498 | ||
3533 | ret = __handle_mm_fault(vma, address, flags); | 3499 | if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, |
3500 | flags & FAULT_FLAG_INSTRUCTION, | ||
3501 | flags & FAULT_FLAG_REMOTE)) | ||
3502 | return VM_FAULT_SIGSEGV; | ||
3503 | |||
3504 | if (unlikely(is_vm_hugetlb_page(vma))) | ||
3505 | ret = hugetlb_fault(vma->vm_mm, vma, address, flags); | ||
3506 | else | ||
3507 | ret = __handle_mm_fault(vma, address, flags); | ||
3534 | 3508 | ||
3535 | if (flags & FAULT_FLAG_USER) { | 3509 | if (flags & FAULT_FLAG_USER) { |
3536 | mem_cgroup_oom_disable(); | 3510 | mem_cgroup_oom_disable(); |