Diffstat (limited to 'mm/memory.c')
 -rw-r--r--  mm/memory.c | 371
 1 file changed, 216 insertions, 155 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 97839f5c8c30..ac20b2a6a0c3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1983,167 +1983,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
 }
 
 /*
- * This routine handles present pages, when users try to write
- * to a shared page. It is done by copying the page to a new address
- * and decrementing the shared-page counter for the old page.
- *
- * Note that this routine assumes that the protection checks have been
- * done by the caller (the low-level page fault routine in most cases).
- * Thus we can safely just mark it writable once we've done any necessary
- * COW.
+ * Handle write page faults for pages that can be reused in the current vma
  *
- * We also mark the page dirty at this point even though the page will
- * change only once the write actually happens. This avoids a few races,
- * and potentially makes it more efficient.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * This can happen either due to the mapping being with the VM_SHARED flag,
+ * or due to us being the last reference standing to the page. In either
+ * case, all we need to do here is to mark the page as writable and update
+ * any related book-keeping.
  */
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		spinlock_t *ptl, pte_t orig_pte)
+static inline int wp_page_reuse(struct mm_struct *mm,
+			struct vm_area_struct *vma, unsigned long address,
+			pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+			struct page *page, int page_mkwrite,
+			int dirty_shared)
 	__releases(ptl)
 {
-	struct page *old_page, *new_page = NULL;
 	pte_t entry;
-	int ret = 0;
-	int page_mkwrite = 0;
-	bool dirty_shared = false;
-	unsigned long mmun_start = 0;	/* For mmu_notifiers */
-	unsigned long mmun_end = 0;	/* For mmu_notifiers */
-	struct mem_cgroup *memcg;
-
-	old_page = vm_normal_page(vma, address, orig_pte);
-	if (!old_page) {
-		/*
-		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
-		 * VM_PFNMAP VMA.
-		 *
-		 * We should not cow pages in a shared writeable mapping.
-		 * Just mark the pages writable as we can't do any dirty
-		 * accounting on raw pfn maps.
-		 */
-		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-				     (VM_WRITE|VM_SHARED))
-			goto reuse;
-		goto gotten;
-	}
-
 	/*
-	 * Take out anonymous pages first, anonymous shared vmas are
-	 * not dirty accountable.
+	 * Clear the pages cpupid information as the existing
+	 * information potentially belongs to a now completely
+	 * unrelated process.
 	 */
-	if (PageAnon(old_page) && !PageKsm(old_page)) {
-		if (!trylock_page(old_page)) {
-			page_cache_get(old_page);
-			pte_unmap_unlock(page_table, ptl);
-			lock_page(old_page);
-			page_table = pte_offset_map_lock(mm, pmd, address,
-							 &ptl);
-			if (!pte_same(*page_table, orig_pte)) {
-				unlock_page(old_page);
-				goto unlock;
-			}
-			page_cache_release(old_page);
-		}
-		if (reuse_swap_page(old_page)) {
-			/*
-			 * The page is all ours. Move it to our anon_vma so
-			 * the rmap code will not search our parent or siblings.
-			 * Protected against the rmap code by the page lock.
-			 */
-			page_move_anon_rmap(old_page, vma, address);
-			unlock_page(old_page);
-			goto reuse;
-		}
-		unlock_page(old_page);
-	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-					(VM_WRITE|VM_SHARED))) {
-		page_cache_get(old_page);
-		/*
-		 * Only catch write-faults on shared writable pages,
-		 * read-only shared pages can get COWed by
-		 * get_user_pages(.write=1, .force=1).
-		 */
-		if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
-			int tmp;
-
-			pte_unmap_unlock(page_table, ptl);
-			tmp = do_page_mkwrite(vma, old_page, address);
-			if (unlikely(!tmp || (tmp &
-					(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
-				page_cache_release(old_page);
-				return tmp;
-			}
-			/*
-			 * Since we dropped the lock we need to revalidate
-			 * the PTE as someone else may have changed it. If
-			 * they did, we just return, as we can count on the
-			 * MMU to tell us if they didn't also make it writable.
-			 */
-			page_table = pte_offset_map_lock(mm, pmd, address,
-							 &ptl);
-			if (!pte_same(*page_table, orig_pte)) {
-				unlock_page(old_page);
-				goto unlock;
-			}
-			page_mkwrite = 1;
-		}
-
-		dirty_shared = true;
-
-reuse:
-		/*
-		 * Clear the pages cpupid information as the existing
-		 * information potentially belongs to a now completely
-		 * unrelated process.
-		 */
-		if (old_page)
-			page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
-
-		flush_cache_page(vma, address, pte_pfn(orig_pte));
-		entry = pte_mkyoung(orig_pte);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		if (ptep_set_access_flags(vma, address, page_table, entry,1))
-			update_mmu_cache(vma, address, page_table);
-		pte_unmap_unlock(page_table, ptl);
-		ret |= VM_FAULT_WRITE;
+	if (page)
+		page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
 
-		if (dirty_shared) {
-			struct address_space *mapping;
-			int dirtied;
+	flush_cache_page(vma, address, pte_pfn(orig_pte));
+	entry = pte_mkyoung(orig_pte);
+	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	if (ptep_set_access_flags(vma, address, page_table, entry, 1))
+		update_mmu_cache(vma, address, page_table);
+	pte_unmap_unlock(page_table, ptl);
 
-			if (!page_mkwrite)
-				lock_page(old_page);
+	if (dirty_shared) {
+		struct address_space *mapping;
+		int dirtied;
 
-			dirtied = set_page_dirty(old_page);
-			VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
-			mapping = old_page->mapping;
-			unlock_page(old_page);
-			page_cache_release(old_page);
+		if (!page_mkwrite)
+			lock_page(page);
 
-			if ((dirtied || page_mkwrite) && mapping) {
-				/*
-				 * Some device drivers do not set page.mapping
-				 * but still dirty their pages
-				 */
-				balance_dirty_pages_ratelimited(mapping);
-			}
+		dirtied = set_page_dirty(page);
+		VM_BUG_ON_PAGE(PageAnon(page), page);
+		mapping = page->mapping;
+		unlock_page(page);
+		page_cache_release(page);
 
-			if (!page_mkwrite)
-				file_update_time(vma->vm_file);
+		if ((dirtied || page_mkwrite) && mapping) {
+			/*
+			 * Some device drivers do not set page.mapping
+			 * but still dirty their pages
+			 */
+			balance_dirty_pages_ratelimited(mapping);
 		}
 
-		return ret;
+		if (!page_mkwrite)
+			file_update_time(vma->vm_file);
 	}
 
-	/*
-	 * Ok, we need to copy. Oh, well..
-	 */
-	page_cache_get(old_page);
-gotten:
-	pte_unmap_unlock(page_table, ptl);
+	return VM_FAULT_WRITE;
+}
+
+/*
+ * Handle the case of a page which we actually need to copy to a new page.
+ *
+ * Called with mmap_sem locked and the old page referenced, but
+ * without the ptl held.
+ *
+ * High level logic flow:
+ *
+ * - Allocate a page, copy the content of the old page to the new one.
+ * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
+ * - Take the PTL. If the pte changed, bail out and release the allocated page
+ * - If the pte is still the way we remember it, update the page table and all
+ *   relevant references. This includes dropping the reference the page-table
+ *   held to the old page, as well as updating the rmap.
+ * - In any case, unlock the PTL and drop the reference we took to the old page.
+ */
+static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		pte_t orig_pte, struct page *old_page)
+{
+	struct page *new_page = NULL;
+	spinlock_t *ptl = NULL;
+	pte_t entry;
+	int page_copied = 0;
+	const unsigned long mmun_start = address & PAGE_MASK;	/* For mmu_notifiers */
+	const unsigned long mmun_end = mmun_start + PAGE_SIZE;	/* For mmu_notifiers */
+	struct mem_cgroup *memcg;
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
@@ -2163,8 +2087,6 @@ gotten:
 	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
 		goto oom_free_new;
 
-	mmun_start = address & PAGE_MASK;
-	mmun_end = mmun_start + PAGE_SIZE;
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
 	/*
@@ -2177,8 +2099,9 @@ gotten:
 			dec_mm_counter_fast(mm, MM_FILEPAGES);
 			inc_mm_counter_fast(mm, MM_ANONPAGES);
 		}
-	} else
+	} else {
 		inc_mm_counter_fast(mm, MM_ANONPAGES);
+	}
 	flush_cache_page(vma, address, pte_pfn(orig_pte));
 	entry = mk_pte(new_page, vma->vm_page_prot);
 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2227,29 +2150,29 @@ gotten:
 
 		/* Free the old page.. */
 		new_page = old_page;
-		ret |= VM_FAULT_WRITE;
-	} else
+		page_copied = 1;
+	} else {
 		mem_cgroup_cancel_charge(new_page, memcg);
+	}
 
 	if (new_page)
 		page_cache_release(new_page);
-unlock:
+
 	pte_unmap_unlock(page_table, ptl);
-	if (mmun_end > mmun_start)
-		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	if (old_page) {
 		/*
 		 * Don't let another task, with possibly unlocked vma,
 		 * keep the mlocked page.
 		 */
-		if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
 			lock_page(old_page);	/* LRU manipulation */
 			munlock_vma_page(old_page);
 			unlock_page(old_page);
 		}
 		page_cache_release(old_page);
 	}
-	return ret;
+	return page_copied ? VM_FAULT_WRITE : 0;
 oom_free_new:
 	page_cache_release(new_page);
 oom:
@@ -2258,6 +2181,144 @@ oom:
 	return VM_FAULT_OOM;
 }
 
+static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *page_table,
+			pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
+			struct page *old_page)
+	__releases(ptl)
+{
+	int page_mkwrite = 0;
+
+	page_cache_get(old_page);
+
+	/*
+	 * Only catch write-faults on shared writable pages,
+	 * read-only shared pages can get COWed by
+	 * get_user_pages(.write=1, .force=1).
+	 */
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+		int tmp;
+
+		pte_unmap_unlock(page_table, ptl);
+		tmp = do_page_mkwrite(vma, old_page, address);
+		if (unlikely(!tmp || (tmp &
+				(VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+			page_cache_release(old_page);
+			return tmp;
+		}
+		/*
+		 * Since we dropped the lock we need to revalidate
+		 * the PTE as someone else may have changed it. If
+		 * they did, we just return, as we can count on the
+		 * MMU to tell us if they didn't also make it writable.
+		 */
+		page_table = pte_offset_map_lock(mm, pmd, address,
+						 &ptl);
+		if (!pte_same(*page_table, orig_pte)) {
+			unlock_page(old_page);
+			pte_unmap_unlock(page_table, ptl);
+			page_cache_release(old_page);
+			return 0;
+		}
+		page_mkwrite = 1;
+	}
+
+	return wp_page_reuse(mm, vma, address, page_table, ptl,
+			     orig_pte, old_page, page_mkwrite, 1);
+}
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Note that this routine assumes that the protection checks have been
+ * done by the caller (the low-level page fault routine in most cases).
+ * Thus we can safely just mark it writable once we've done any necessary
+ * COW.
+ *
+ * We also mark the page dirty at this point even though the page will
+ * change only once the write actually happens. This avoids a few races,
+ * and potentially makes it more efficient.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+		unsigned long address, pte_t *page_table, pmd_t *pmd,
+		spinlock_t *ptl, pte_t orig_pte)
+	__releases(ptl)
+{
+	struct page *old_page;
+
+	old_page = vm_normal_page(vma, address, orig_pte);
+	if (!old_page) {
+		/*
+		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
+		 * VM_PFNMAP VMA.
+		 *
+		 * We should not cow pages in a shared writeable mapping.
+		 * Just mark the pages writable as we can't do any dirty
+		 * accounting on raw pfn maps.
+		 */
+		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+				     (VM_WRITE|VM_SHARED))
+			return wp_page_reuse(mm, vma, address, page_table, ptl,
+					     orig_pte, old_page, 0, 0);
+
+		pte_unmap_unlock(page_table, ptl);
+		return wp_page_copy(mm, vma, address, page_table, pmd,
+				    orig_pte, old_page);
+	}
+
+	/*
+	 * Take out anonymous pages first, anonymous shared vmas are
+	 * not dirty accountable.
+	 */
+	if (PageAnon(old_page) && !PageKsm(old_page)) {
+		if (!trylock_page(old_page)) {
+			page_cache_get(old_page);
+			pte_unmap_unlock(page_table, ptl);
+			lock_page(old_page);
+			page_table = pte_offset_map_lock(mm, pmd, address,
+							 &ptl);
+			if (!pte_same(*page_table, orig_pte)) {
+				unlock_page(old_page);
+				pte_unmap_unlock(page_table, ptl);
+				page_cache_release(old_page);
+				return 0;
+			}
+			page_cache_release(old_page);
+		}
+		if (reuse_swap_page(old_page)) {
+			/*
+			 * The page is all ours. Move it to our anon_vma so
+			 * the rmap code will not search our parent or siblings.
+			 * Protected against the rmap code by the page lock.
+			 */
+			page_move_anon_rmap(old_page, vma, address);
+			unlock_page(old_page);
+			return wp_page_reuse(mm, vma, address, page_table, ptl,
+					     orig_pte, old_page, 0, 0);
+		}
+		unlock_page(old_page);
+	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+					(VM_WRITE|VM_SHARED))) {
+		return wp_page_shared(mm, vma, address, page_table, pmd,
+				      ptl, orig_pte, old_page);
+	}
+
+	/*
+	 * Ok, we need to copy. Oh, well..
+	 */
+	page_cache_get(old_page);
+
+	pte_unmap_unlock(page_table, ptl);
+	return wp_page_copy(mm, vma, address, page_table, pmd,
+			    orig_pte, old_page);
+}
+
 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
 		unsigned long start_addr, unsigned long end_addr,
 		struct zap_details *details)
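
For readers skimming the hunks above: after the refactor, do_wp_page() is a three-way dispatch between reusing the existing page (wp_page_reuse), reusing it with shared-mapping dirty accounting (wp_page_shared), and falling back to copy-on-write (wp_page_copy). Below is a minimal, userspace-only sketch of that decision structure, not kernel code; the struct, enum, and function names are invented for illustration and only the branching mirrors the patch.

/* Hypothetical model of the dispatch performed by the new do_wp_page(). */
#include <stdbool.h>
#include <stdio.h>

#define VM_WRITE  0x1UL
#define VM_SHARED 0x2UL

enum wp_action { WP_REUSE, WP_REUSE_DIRTY_SHARED, WP_COPY };

struct wp_fault {
	bool has_normal_page;	/* vm_normal_page() found a struct page */
	bool page_is_anon;	/* PageAnon() && !PageKsm() */
	bool sole_owner;	/* reuse_swap_page() says the page is only ours */
	unsigned long vm_flags;	/* VM_WRITE / VM_SHARED bits of the vma */
};

static enum wp_action classify_wp_fault(const struct wp_fault *f)
{
	bool shared_writable = (f->vm_flags & (VM_WRITE | VM_SHARED)) ==
			       (VM_WRITE | VM_SHARED);

	if (!f->has_normal_page)	/* raw pfn mapping: nothing to copy */
		return shared_writable ? WP_REUSE : WP_COPY;

	if (f->page_is_anon)		/* anon page: reuse only if we own it */
		return f->sole_owner ? WP_REUSE : WP_COPY;

	if (shared_writable)		/* shared file page -> wp_page_shared() */
		return WP_REUSE_DIRTY_SHARED;

	return WP_COPY;			/* private file page: COW */
}

int main(void)
{
	struct wp_fault f = {
		.has_normal_page = true,
		.page_is_anon = true,
		.sole_owner = true,
		.vm_flags = VM_WRITE,
	};

	/* Prints 0 (WP_REUSE): a private anon page we solely own is reused. */
	printf("%d\n", classify_wp_fault(&f));
	return 0;
}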