Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c  371
1 file changed, 216 insertions(+), 155 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 97839f5c8c30..ac20b2a6a0c3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1983,167 +1983,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
 }
 
 /*
- * This routine handles present pages, when users try to write
- * to a shared page. It is done by copying the page to a new address
- * and decrementing the shared-page counter for the old page.
- *
- * Note that this routine assumes that the protection checks have been
- * done by the caller (the low-level page fault routine in most cases).
- * Thus we can safely just mark it writable once we've done any necessary
- * COW.
+ * Handle write page faults for pages that can be reused in the current vma
  *
- * We also mark the page dirty at this point even though the page will
- * change only once the write actually happens. This avoids a few races,
- * and potentially makes it more efficient.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * This can happen either due to the mapping being with the VM_SHARED flag,
+ * or due to us being the last reference standing to the page. In either
+ * case, all we need to do here is to mark the page as writable and update
+ * any related book-keeping.
  */
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                unsigned long address, pte_t *page_table, pmd_t *pmd,
-                spinlock_t *ptl, pte_t orig_pte)
+static inline int wp_page_reuse(struct mm_struct *mm,
+                        struct vm_area_struct *vma, unsigned long address,
+                        pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+                        struct page *page, int page_mkwrite,
+                        int dirty_shared)
         __releases(ptl)
 {
-        struct page *old_page, *new_page = NULL;
         pte_t entry;
-        int ret = 0;
-        int page_mkwrite = 0;
-        bool dirty_shared = false;
-        unsigned long mmun_start = 0;        /* For mmu_notifiers */
-        unsigned long mmun_end = 0;        /* For mmu_notifiers */
-        struct mem_cgroup *memcg;
-
-        old_page = vm_normal_page(vma, address, orig_pte);
-        if (!old_page) {
-                /*
-                 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
-                 * VM_PFNMAP VMA.
-                 *
-                 * We should not cow pages in a shared writeable mapping.
-                 * Just mark the pages writable as we can't do any dirty
-                 * accounting on raw pfn maps.
-                 */
-                if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-                                     (VM_WRITE|VM_SHARED))
-                        goto reuse;
-                goto gotten;
-        }
-
         /*
-         * Take out anonymous pages first, anonymous shared vmas are
-         * not dirty accountable.
+         * Clear the pages cpupid information as the existing
+         * information potentially belongs to a now completely
+         * unrelated process.
          */
-        if (PageAnon(old_page) && !PageKsm(old_page)) {
-                if (!trylock_page(old_page)) {
-                        page_cache_get(old_page);
-                        pte_unmap_unlock(page_table, ptl);
-                        lock_page(old_page);
-                        page_table = pte_offset_map_lock(mm, pmd, address,
-                                                         &ptl);
-                        if (!pte_same(*page_table, orig_pte)) {
-                                unlock_page(old_page);
-                                goto unlock;
-                        }
-                        page_cache_release(old_page);
-                }
-                if (reuse_swap_page(old_page)) {
-                        /*
-                         * The page is all ours. Move it to our anon_vma so
-                         * the rmap code will not search our parent or siblings.
-                         * Protected against the rmap code by the page lock.
-                         */
-                        page_move_anon_rmap(old_page, vma, address);
-                        unlock_page(old_page);
-                        goto reuse;
-                }
-                unlock_page(old_page);
-        } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-                                        (VM_WRITE|VM_SHARED))) {
-                page_cache_get(old_page);
-                /*
-                 * Only catch write-faults on shared writable pages,
-                 * read-only shared pages can get COWed by
-                 * get_user_pages(.write=1, .force=1).
-                 */
-                if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
-                        int tmp;
-
-                        pte_unmap_unlock(page_table, ptl);
-                        tmp = do_page_mkwrite(vma, old_page, address);
-                        if (unlikely(!tmp || (tmp &
-                                        (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
-                                page_cache_release(old_page);
-                                return tmp;
-                        }
-                        /*
-                         * Since we dropped the lock we need to revalidate
-                         * the PTE as someone else may have changed it. If
-                         * they did, we just return, as we can count on the
-                         * MMU to tell us if they didn't also make it writable.
-                         */
-                        page_table = pte_offset_map_lock(mm, pmd, address,
-                                                         &ptl);
-                        if (!pte_same(*page_table, orig_pte)) {
-                                unlock_page(old_page);
-                                goto unlock;
-                        }
-                        page_mkwrite = 1;
-                }
-
-                dirty_shared = true;
-
-reuse:
-                /*
-                 * Clear the pages cpupid information as the existing
-                 * information potentially belongs to a now completely
-                 * unrelated process.
-                 */
-                if (old_page)
-                        page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
-
-                flush_cache_page(vma, address, pte_pfn(orig_pte));
-                entry = pte_mkyoung(orig_pte);
-                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-                if (ptep_set_access_flags(vma, address, page_table, entry,1))
-                        update_mmu_cache(vma, address, page_table);
-                pte_unmap_unlock(page_table, ptl);
-                ret |= VM_FAULT_WRITE;
+        if (page)
+                page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
 
-                if (dirty_shared) {
-                        struct address_space *mapping;
-                        int dirtied;
+        flush_cache_page(vma, address, pte_pfn(orig_pte));
+        entry = pte_mkyoung(orig_pte);
+        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+        if (ptep_set_access_flags(vma, address, page_table, entry, 1))
+                update_mmu_cache(vma, address, page_table);
+        pte_unmap_unlock(page_table, ptl);
 
-                        if (!page_mkwrite)
-                                lock_page(old_page);
+        if (dirty_shared) {
+                struct address_space *mapping;
+                int dirtied;
 
-                        dirtied = set_page_dirty(old_page);
-                        VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
-                        mapping = old_page->mapping;
-                        unlock_page(old_page);
-                        page_cache_release(old_page);
+                if (!page_mkwrite)
+                        lock_page(page);
 
-                        if ((dirtied || page_mkwrite) && mapping) {
-                                /*
-                                 * Some device drivers do not set page.mapping
-                                 * but still dirty their pages
-                                 */
-                                balance_dirty_pages_ratelimited(mapping);
-                        }
+                dirtied = set_page_dirty(page);
+                VM_BUG_ON_PAGE(PageAnon(page), page);
+                mapping = page->mapping;
+                unlock_page(page);
+                page_cache_release(page);
 
-                        if (!page_mkwrite)
-                                file_update_time(vma->vm_file);
+                if ((dirtied || page_mkwrite) && mapping) {
+                        /*
+                         * Some device drivers do not set page.mapping
+                         * but still dirty their pages
+                         */
+                        balance_dirty_pages_ratelimited(mapping);
                 }
 
-                return ret;
+                if (!page_mkwrite)
+                        file_update_time(vma->vm_file);
         }
 
-        /*
-         * Ok, we need to copy. Oh, well..
-         */
-        page_cache_get(old_page);
-gotten:
-        pte_unmap_unlock(page_table, ptl);
+        return VM_FAULT_WRITE;
+}
+
+/*
+ * Handle the case of a page which we actually need to copy to a new page.
+ *
+ * Called with mmap_sem locked and the old page referenced, but
+ * without the ptl held.
+ *
+ * High level logic flow:
+ *
+ * - Allocate a page, copy the content of the old page to the new one.
+ * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
+ * - Take the PTL. If the pte changed, bail out and release the allocated page
+ * - If the pte is still the way we remember it, update the page table and all
+ *   relevant references. This includes dropping the reference the page-table
+ *   held to the old page, as well as updating the rmap.
+ * - In any case, unlock the PTL and drop the reference we took to the old page.
+ */
+static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
+                unsigned long address, pte_t *page_table, pmd_t *pmd,
+                pte_t orig_pte, struct page *old_page)
+{
+        struct page *new_page = NULL;
+        spinlock_t *ptl = NULL;
+        pte_t entry;
+        int page_copied = 0;
+        const unsigned long mmun_start = address & PAGE_MASK;        /* For mmu_notifiers */
+        const unsigned long mmun_end = mmun_start + PAGE_SIZE;        /* For mmu_notifiers */
+        struct mem_cgroup *memcg;
 
         if (unlikely(anon_vma_prepare(vma)))
                 goto oom;
@@ -2163,8 +2087,6 @@ gotten:
         if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
                 goto oom_free_new;
 
-        mmun_start  = address & PAGE_MASK;
-        mmun_end    = mmun_start + PAGE_SIZE;
         mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
         /*
@@ -2177,8 +2099,9 @@ gotten:
                         dec_mm_counter_fast(mm, MM_FILEPAGES);
                         inc_mm_counter_fast(mm, MM_ANONPAGES);
                 }
-        } else
+        } else {
                 inc_mm_counter_fast(mm, MM_ANONPAGES);
+        }
         flush_cache_page(vma, address, pte_pfn(orig_pte));
         entry = mk_pte(new_page, vma->vm_page_prot);
         entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2227,29 +2150,29 @@ gotten:
 
                 /* Free the old page.. */
                 new_page = old_page;
-                ret |= VM_FAULT_WRITE;
-        } else
+                page_copied = 1;
+        } else {
                 mem_cgroup_cancel_charge(new_page, memcg);
+        }
 
         if (new_page)
                 page_cache_release(new_page);
-unlock:
+
         pte_unmap_unlock(page_table, ptl);
-        if (mmun_end > mmun_start)
-                mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
         if (old_page) {
                 /*
                  * Don't let another task, with possibly unlocked vma,
                  * keep the mlocked page.
                  */
-                if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+                if (page_copied && (vma->vm_flags & VM_LOCKED)) {
                         lock_page(old_page);        /* LRU manipulation */
                         munlock_vma_page(old_page);
                         unlock_page(old_page);
                 }
                 page_cache_release(old_page);
         }
-        return ret;
+        return page_copied ? VM_FAULT_WRITE : 0;
 oom_free_new:
         page_cache_release(new_page);
 oom:
@@ -2258,6 +2181,144 @@ oom:
         return VM_FAULT_OOM;
 }
 
+static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
+                          unsigned long address, pte_t *page_table,
+                          pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
+                          struct page *old_page)
+        __releases(ptl)
+{
+        int page_mkwrite = 0;
+
+        page_cache_get(old_page);
+
+        /*
+         * Only catch write-faults on shared writable pages,
+         * read-only shared pages can get COWed by
+         * get_user_pages(.write=1, .force=1).
+         */
+        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+                int tmp;
+
+                pte_unmap_unlock(page_table, ptl);
+                tmp = do_page_mkwrite(vma, old_page, address);
+                if (unlikely(!tmp || (tmp &
+                                      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+                        page_cache_release(old_page);
+                        return tmp;
+                }
+                /*
+                 * Since we dropped the lock we need to revalidate
+                 * the PTE as someone else may have changed it. If
+                 * they did, we just return, as we can count on the
+                 * MMU to tell us if they didn't also make it writable.
+                 */
+                page_table = pte_offset_map_lock(mm, pmd, address,
+                                                 &ptl);
+                if (!pte_same(*page_table, orig_pte)) {
+                        unlock_page(old_page);
+                        pte_unmap_unlock(page_table, ptl);
+                        page_cache_release(old_page);
+                        return 0;
+                }
+                page_mkwrite = 1;
+        }
+
+        return wp_page_reuse(mm, vma, address, page_table, ptl,
+                             orig_pte, old_page, page_mkwrite, 1);
+}
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Note that this routine assumes that the protection checks have been
+ * done by the caller (the low-level page fault routine in most cases).
+ * Thus we can safely just mark it writable once we've done any necessary
+ * COW.
+ *
+ * We also mark the page dirty at this point even though the page will
+ * change only once the write actually happens. This avoids a few races,
+ * and potentially makes it more efficient.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                unsigned long address, pte_t *page_table, pmd_t *pmd,
+                spinlock_t *ptl, pte_t orig_pte)
+        __releases(ptl)
+{
+        struct page *old_page;
+
+        old_page = vm_normal_page(vma, address, orig_pte);
+        if (!old_page) {
+                /*
+                 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
+                 * VM_PFNMAP VMA.
+                 *
+                 * We should not cow pages in a shared writeable mapping.
+                 * Just mark the pages writable as we can't do any dirty
+                 * accounting on raw pfn maps.
+                 */
+                if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+                                     (VM_WRITE|VM_SHARED))
+                        return wp_page_reuse(mm, vma, address, page_table, ptl,
+                                             orig_pte, old_page, 0, 0);
+
+                pte_unmap_unlock(page_table, ptl);
+                return wp_page_copy(mm, vma, address, page_table, pmd,
+                                    orig_pte, old_page);
+        }
+
+        /*
+         * Take out anonymous pages first, anonymous shared vmas are
+         * not dirty accountable.
+         */
+        if (PageAnon(old_page) && !PageKsm(old_page)) {
+                if (!trylock_page(old_page)) {
+                        page_cache_get(old_page);
+                        pte_unmap_unlock(page_table, ptl);
+                        lock_page(old_page);
+                        page_table = pte_offset_map_lock(mm, pmd, address,
+                                                         &ptl);
+                        if (!pte_same(*page_table, orig_pte)) {
+                                unlock_page(old_page);
+                                pte_unmap_unlock(page_table, ptl);
+                                page_cache_release(old_page);
+                                return 0;
+                        }
+                        page_cache_release(old_page);
+                }
+                if (reuse_swap_page(old_page)) {
+                        /*
+                         * The page is all ours. Move it to our anon_vma so
+                         * the rmap code will not search our parent or siblings.
+                         * Protected against the rmap code by the page lock.
+                         */
+                        page_move_anon_rmap(old_page, vma, address);
+                        unlock_page(old_page);
+                        return wp_page_reuse(mm, vma, address, page_table, ptl,
+                                             orig_pte, old_page, 0, 0);
+                }
+                unlock_page(old_page);
+        } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+                                        (VM_WRITE|VM_SHARED))) {
+                return wp_page_shared(mm, vma, address, page_table, pmd,
+                                      ptl, orig_pte, old_page);
+        }
+
+        /*
+         * Ok, we need to copy. Oh, well..
+         */
+        page_cache_get(old_page);
+
+        pte_unmap_unlock(page_table, ptl);
+        return wp_page_copy(mm, vma, address, page_table, pmd,
+                            orig_pte, old_page);
+}
+
 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
                 unsigned long start_addr, unsigned long end_addr,
                 struct zap_details *details)