author	Shachar Raindel <raindel@mellanox.com>	2015-04-14 18:46:32 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-04-14 19:49:03 -0400
commit	2f38ab2c3c7fef04dca0313fd89d91f142ca9281 (patch)
tree	0f749c9d45d2177903ac5bcc29457e35961f64b2 /mm/memory.c
parent	28766805275c12c2298883cece3f98505ac764b4 (diff)
mm: refactor do_wp_page, extract the page copy flow
In some cases, do_wp_page had to copy the page suffering a write fault to a new location. If the function's logic decided to do this, it was done by jumping with a "goto" operation to the relevant code block. This made the code really hard to understand. It is also against the kernel coding style guidelines.

This patch extracts the page copy and page table update logic to a separate function. It also cleans up the naming, from "gotten" to "wp_page_copy", and adds a few comments.

Signed-off-by: Shachar Raindel <raindel@mellanox.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Haggai Eran <haggaie@mellanox.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Feiner <pfeiner@google.com>
Cc: Michel Lespinasse <walken@google.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
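The shape of the control-flow change described above can be sketched, in deliberately simplified form, with the standalone C program below. The types and helpers (fault_ctx, reuse_page, copy_to_new_page, handle_write_fault_*) are hypothetical stand-ins for illustration only, not kernel APIs; only the structure mirrors what the patch does with wp_page_copy().

/* Simplified sketch of the refactoring; hypothetical stand-ins, not kernel code. */
#include <stdio.h>

struct fault_ctx {
	int page_is_normal;	/* stand-in for "old_page != NULL" */
	int shared_writable;	/* stand-in for the VM_WRITE|VM_SHARED reuse case */
};

static int reuse_page(struct fault_ctx *ctx)       { (void)ctx; return 0; }
static int copy_to_new_page(struct fault_ctx *ctx) { (void)ctx; return 1; }

/* Before: the copy path is a labelled block reached via "goto gotten". */
static int handle_write_fault_old(struct fault_ctx *ctx)
{
	if (!ctx->page_is_normal)
		goto gotten;		/* early path jumps into the copy block */
	if (ctx->shared_writable)
		return reuse_page(ctx);
	/* fall through to the copy block */
gotten:
	return copy_to_new_page(ctx);	/* copy + page-table update lived inline here */
}

/* After: the copy flow is an ordinary helper call, as wp_page_copy() is. */
static int handle_write_fault_new(struct fault_ctx *ctx)
{
	if (!ctx->page_is_normal)
		return copy_to_new_page(ctx);
	if (ctx->shared_writable)
		return reuse_page(ctx);
	return copy_to_new_page(ctx);
}

int main(void)
{
	struct fault_ctx ctx = { .page_is_normal = 1, .shared_writable = 0 };

	printf("old: %d, new: %d\n",
	       handle_write_fault_old(&ctx), handle_write_fault_new(&ctx));
	return 0;
}

Both versions handle the same cases; the extracted-helper form simply makes the copy path a function that can be read, called, and reasoned about on its own.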
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 265
1 file changed, 147 insertions(+), 118 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 0e28fddafdaf..cfd3c78f00fe 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2042,6 +2042,146 @@ static inline int wp_page_reuse(struct mm_struct *mm,
 }
 
 /*
+ * Handle the case of a page which we actually need to copy to a new page.
+ *
+ * Called with mmap_sem locked and the old page referenced, but
+ * without the ptl held.
+ *
+ * High level logic flow:
+ *
+ * - Allocate a page, copy the content of the old page to the new one.
+ * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
+ * - Take the PTL. If the pte changed, bail out and release the allocated page
+ * - If the pte is still the way we remember it, update the page table and all
+ *   relevant references. This includes dropping the reference the page-table
+ *   held to the old page, as well as updating the rmap.
+ * - In any case, unlock the PTL and drop the reference we took to the old page.
+ */
+static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pte_t *page_table, pmd_t *pmd,
+			pte_t orig_pte, struct page *old_page)
+{
+	struct page *new_page = NULL;
+	spinlock_t *ptl = NULL;
+	pte_t entry;
+	int page_copied = 0;
+	const unsigned long mmun_start = address & PAGE_MASK;	/* For mmu_notifiers */
+	const unsigned long mmun_end = mmun_start + PAGE_SIZE;	/* For mmu_notifiers */
+	struct mem_cgroup *memcg;
+
+	if (unlikely(anon_vma_prepare(vma)))
+		goto oom;
+
+	if (is_zero_pfn(pte_pfn(orig_pte))) {
+		new_page = alloc_zeroed_user_highpage_movable(vma, address);
+		if (!new_page)
+			goto oom;
+	} else {
+		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+		if (!new_page)
+			goto oom;
+		cow_user_page(new_page, old_page, address, vma);
+	}
+	__SetPageUptodate(new_page);
+
+	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
+		goto oom_free_new;
+
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+	/*
+	 * Re-check the pte - we dropped the lock
+	 */
+	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (likely(pte_same(*page_table, orig_pte))) {
+		if (old_page) {
+			if (!PageAnon(old_page)) {
+				dec_mm_counter_fast(mm, MM_FILEPAGES);
+				inc_mm_counter_fast(mm, MM_ANONPAGES);
+			}
+		} else {
+			inc_mm_counter_fast(mm, MM_ANONPAGES);
+		}
+		flush_cache_page(vma, address, pte_pfn(orig_pte));
+		entry = mk_pte(new_page, vma->vm_page_prot);
+		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		/*
+		 * Clear the pte entry and flush it first, before updating the
+		 * pte with the new entry. This will avoid a race condition
+		 * seen in the presence of one thread doing SMC and another
+		 * thread doing COW.
+		 */
+		ptep_clear_flush_notify(vma, address, page_table);
+		page_add_new_anon_rmap(new_page, vma, address);
+		mem_cgroup_commit_charge(new_page, memcg, false);
+		lru_cache_add_active_or_unevictable(new_page, vma);
+		/*
+		 * We call the notify macro here because, when using secondary
+		 * mmu page tables (such as kvm shadow page tables), we want the
+		 * new page to be mapped directly into the secondary page table.
+		 */
+		set_pte_at_notify(mm, address, page_table, entry);
+		update_mmu_cache(vma, address, page_table);
+		if (old_page) {
+			/*
+			 * Only after switching the pte to the new page may
+			 * we remove the mapcount here. Otherwise another
+			 * process may come and find the rmap count decremented
+			 * before the pte is switched to the new page, and
+			 * "reuse" the old page writing into it while our pte
+			 * here still points into it and can be read by other
+			 * threads.
+			 *
+			 * The critical issue is to order this
+			 * page_remove_rmap with the ptp_clear_flush above.
+			 * Those stores are ordered by (if nothing else,)
+			 * the barrier present in the atomic_add_negative
+			 * in page_remove_rmap.
+			 *
+			 * Then the TLB flush in ptep_clear_flush ensures that
+			 * no process can access the old page before the
+			 * decremented mapcount is visible. And the old page
+			 * cannot be reused until after the decremented
+			 * mapcount is visible. So transitively, TLBs to
+			 * old page will be flushed before it can be reused.
+			 */
+			page_remove_rmap(old_page);
+		}
+
+		/* Free the old page.. */
+		new_page = old_page;
+		page_copied = 1;
+	} else {
+		mem_cgroup_cancel_charge(new_page, memcg);
+	}
+
+	if (new_page)
+		page_cache_release(new_page);
+
+	pte_unmap_unlock(page_table, ptl);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	if (old_page) {
+		/*
+		 * Don't let another task, with possibly unlocked vma,
+		 * keep the mlocked page.
+		 */
+		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
+			lock_page(old_page);	/* LRU manipulation */
+			munlock_vma_page(old_page);
+			unlock_page(old_page);
+		}
+		page_cache_release(old_page);
+	}
+	return page_copied ? VM_FAULT_WRITE : 0;
+oom_free_new:
+	page_cache_release(new_page);
+oom:
+	if (old_page)
+		page_cache_release(old_page);
+	return VM_FAULT_OOM;
+}
+
+/*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
  * and decrementing the shared-page counter for the old page.
@@ -2064,12 +2204,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		spinlock_t *ptl, pte_t orig_pte)
 	__releases(ptl)
 {
-	struct page *old_page, *new_page = NULL;
-	pte_t entry;
-	int page_copied = 0;
-	unsigned long mmun_start = 0;	/* For mmu_notifiers */
-	unsigned long mmun_end = 0;	/* For mmu_notifiers */
-	struct mem_cgroup *memcg;
+	struct page *old_page;
 
 	old_page = vm_normal_page(vma, address, orig_pte);
 	if (!old_page) {
@@ -2085,7 +2220,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 				     (VM_WRITE|VM_SHARED))
 			return wp_page_reuse(mm, vma, address, page_table, ptl,
 					     orig_pte, old_page, 0, 0);
-		goto gotten;
+
+		pte_unmap_unlock(page_table, ptl);
+		return wp_page_copy(mm, vma, address, page_table, pmd,
+				    orig_pte, old_page);
 	}
 
 	/*
@@ -2165,119 +2303,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Ok, we need to copy. Oh, well..
 	 */
 	page_cache_get(old_page);
-gotten:
-	pte_unmap_unlock(page_table, ptl);
-
-	if (unlikely(anon_vma_prepare(vma)))
-		goto oom;
-
-	if (is_zero_pfn(pte_pfn(orig_pte))) {
-		new_page = alloc_zeroed_user_highpage_movable(vma, address);
-		if (!new_page)
-			goto oom;
-	} else {
-		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
-		if (!new_page)
-			goto oom;
-		cow_user_page(new_page, old_page, address, vma);
-	}
-	__SetPageUptodate(new_page);
-
-	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
-		goto oom_free_new;
-
-	mmun_start = address & PAGE_MASK;
-	mmun_end = mmun_start + PAGE_SIZE;
-	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-
-	/*
-	 * Re-check the pte - we dropped the lock
-	 */
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-	if (likely(pte_same(*page_table, orig_pte))) {
-		if (old_page) {
-			if (!PageAnon(old_page)) {
-				dec_mm_counter_fast(mm, MM_FILEPAGES);
-				inc_mm_counter_fast(mm, MM_ANONPAGES);
-			}
-		} else
-			inc_mm_counter_fast(mm, MM_ANONPAGES);
-		flush_cache_page(vma, address, pte_pfn(orig_pte));
-		entry = mk_pte(new_page, vma->vm_page_prot);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		/*
-		 * Clear the pte entry and flush it first, before updating the
-		 * pte with the new entry. This will avoid a race condition
-		 * seen in the presence of one thread doing SMC and another
-		 * thread doing COW.
-		 */
-		ptep_clear_flush_notify(vma, address, page_table);
-		page_add_new_anon_rmap(new_page, vma, address);
-		mem_cgroup_commit_charge(new_page, memcg, false);
-		lru_cache_add_active_or_unevictable(new_page, vma);
-		/*
-		 * We call the notify macro here because, when using secondary
-		 * mmu page tables (such as kvm shadow page tables), we want the
-		 * new page to be mapped directly into the secondary page table.
-		 */
-		set_pte_at_notify(mm, address, page_table, entry);
-		update_mmu_cache(vma, address, page_table);
-		if (old_page) {
-			/*
-			 * Only after switching the pte to the new page may
-			 * we remove the mapcount here. Otherwise another
-			 * process may come and find the rmap count decremented
-			 * before the pte is switched to the new page, and
-			 * "reuse" the old page writing into it while our pte
-			 * here still points into it and can be read by other
-			 * threads.
-			 *
-			 * The critical issue is to order this
-			 * page_remove_rmap with the ptp_clear_flush above.
-			 * Those stores are ordered by (if nothing else,)
-			 * the barrier present in the atomic_add_negative
-			 * in page_remove_rmap.
-			 *
-			 * Then the TLB flush in ptep_clear_flush ensures that
-			 * no process can access the old page before the
-			 * decremented mapcount is visible. And the old page
-			 * cannot be reused until after the decremented
-			 * mapcount is visible. So transitively, TLBs to
-			 * old page will be flushed before it can be reused.
-			 */
-			page_remove_rmap(old_page);
-		}
-
-		/* Free the old page.. */
-		new_page = old_page;
-		page_copied = 1;
-	} else
-		mem_cgroup_cancel_charge(new_page, memcg);
-
-	if (new_page)
-		page_cache_release(new_page);
 
 	pte_unmap_unlock(page_table, ptl);
-	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-	if (old_page) {
-		/*
-		 * Don't let another task, with possibly unlocked vma,
-		 * keep the mlocked page.
-		 */
-		if (page_copied && (vma->vm_flags & VM_LOCKED)) {
-			lock_page(old_page);	/* LRU manipulation */
-			munlock_vma_page(old_page);
-			unlock_page(old_page);
-		}
-		page_cache_release(old_page);
-	}
-	return page_copied ? VM_FAULT_WRITE : 0;
-oom_free_new:
-	page_cache_release(new_page);
-oom:
-	if (old_page)
-		page_cache_release(old_page);
-	return VM_FAULT_OOM;
+	return wp_page_copy(mm, vma, address, page_table, pmd,
+			    orig_pte, old_page);
 }
 
 static void unmap_mapping_range_vma(struct vm_area_struct *vma,
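The central correctness idea spelled out in the comment above wp_page_copy(), namely doing the allocation and copy without the page-table lock and then retaking the PTL and committing only if the pte still matches the value observed at fault time, can be modelled in isolation by the minimal sketch below. pte_lock, current_pte and copy_and_commit are hypothetical stand-ins (a pthread mutex and a plain variable), not kernel APIs.

/* Minimal sketch of the "re-check the pte under the lock" pattern; stand-ins only. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pte_lock = PTHREAD_MUTEX_INITIALIZER;	/* stand-in for the PTL */
static unsigned long current_pte = 0x1000;			/* stand-in for a pte slot */

static int copy_and_commit(unsigned long orig_pte)
{
	int committed = 0;

	/* The expensive part (allocate + copy) would happen here, without the lock held. */

	pthread_mutex_lock(&pte_lock);		/* analogue of pte_offset_map_lock() */
	if (current_pte == orig_pte) {		/* analogue of the pte_same() re-check */
		current_pte = 0x2000;		/* analogue of installing the new pte */
		committed = 1;
	}
	pthread_mutex_unlock(&pte_lock);

	/* If !committed, the caller drops the new page and the fault is simply retried. */
	return committed;
}

int main(void)
{
	printf("first attempt committed: %d\n", copy_and_commit(0x1000));
	printf("stale attempt committed: %d\n", copy_and_commit(0x1000));
	return 0;
}

The second call sees a pte that no longer matches its stale snapshot and bails out: the speculative work may be thrown away, but a stale mapping is never installed.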