Diffstat (limited to 'mm')

 -rw-r--r--  mm/Kconfig.debug     |  9
 -rw-r--r--  mm/memcontrol.c      | 17
 -rw-r--r--  mm/memory.c          | 39
 -rw-r--r--  mm/mmap.c            | 13
 -rw-r--r--  mm/page-writeback.c  | 43
 -rw-r--r--  mm/rmap.c            | 42
 -rw-r--r--  mm/vmscan.c          | 24

 7 files changed, 102 insertions(+), 85 deletions(-)
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 56badfc4810a..957d3da53ddd 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -14,7 +14,6 @@ config DEBUG_PAGEALLOC
         depends on !KMEMCHECK
         select PAGE_EXTENSION
         select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
-        select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
         ---help---
           Unmap pages from the kernel linear mapping after free_pages().
           This results in a large slowdown, but helps to find certain types
@@ -27,13 +26,5 @@ config DEBUG_PAGEALLOC
           that would result in incorrect warnings of memory corruption after
           a resume because free pages are not saved to the suspend image.
 
-config WANT_PAGE_DEBUG_FLAGS
-        bool
-
 config PAGE_POISONING
         bool
-        select WANT_PAGE_DEBUG_FLAGS
-
-config PAGE_GUARD
-        bool
-        select WANT_PAGE_DEBUG_FLAGS
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ef91e856c7e4..851924fa5170 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3043,18 +3043,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
                 mem_cgroup_swap_statistics(from, false);
                 mem_cgroup_swap_statistics(to, true);
-                /*
-                 * This function is only called from task migration context now.
-                 * It postpones page_counter and refcount handling till the end
-                 * of task migration(mem_cgroup_clear_mc()) for performance
-                 * improvement. But we cannot postpone css_get(to) because if
-                 * the process that has been moved to @to does swap-in, the
-                 * refcount of @to might be decreased to 0.
-                 *
-                 * We are in attach() phase, so the cgroup is guaranteed to be
-                 * alive, so we can just call css_get().
-                 */
-                css_get(&to->css);
                 return 0;
         }
         return -EINVAL;
@@ -4679,6 +4667,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
         if (parent_css == NULL) {
                 root_mem_cgroup = memcg;
                 page_counter_init(&memcg->memory, NULL);
+                memcg->soft_limit = PAGE_COUNTER_MAX;
                 page_counter_init(&memcg->memsw, NULL);
                 page_counter_init(&memcg->kmem, NULL);
         }
@@ -4724,6 +4713,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
 
         if (parent->use_hierarchy) {
                 page_counter_init(&memcg->memory, &parent->memory);
+                memcg->soft_limit = PAGE_COUNTER_MAX;
                 page_counter_init(&memcg->memsw, &parent->memsw);
                 page_counter_init(&memcg->kmem, &parent->kmem);
 
@@ -4733,6 +4723,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
          */
         } else {
                 page_counter_init(&memcg->memory, NULL);
+                memcg->soft_limit = PAGE_COUNTER_MAX;
                 page_counter_init(&memcg->memsw, NULL);
                 page_counter_init(&memcg->kmem, NULL);
                 /*
@@ -4807,7 +4798,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
         mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
         mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
         memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
-        memcg->soft_limit = 0;
+        memcg->soft_limit = PAGE_COUNTER_MAX;
 }
 
 #ifdef CONFIG_MMU
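The memcontrol hunks fix the soft limit default: a soft_limit of 0 makes every cgroup look permanently over its soft limit, while PAGE_COUNTER_MAX is the "no limit" sentinel, so the init paths and css_reset() now both use it. A toy userspace model of why the sentinel matters (names here are illustrative, not the kernel API):

#include <stdbool.h>
#include <stdio.h>
#include <limits.h>

/* Toy model: ULONG_MAX plays the role of PAGE_COUNTER_MAX. */
#define COUNTER_MAX ULONG_MAX

struct counter {
        unsigned long usage;
        unsigned long soft_limit;       /* COUNTER_MAX == "no soft limit" */
};

static bool over_soft_limit(const struct counter *c)
{
        return c->usage > c->soft_limit;
}

int main(void)
{
        struct counter c = { .usage = 42, .soft_limit = 0 };

        /* With 0 as the default, any usage at all looks over-limit... */
        printf("soft_limit=0:   over=%d\n", over_soft_limit(&c));

        /* ...so init and reset paths must restore the sentinel instead. */
        c.soft_limit = COUNTER_MAX;
        printf("soft_limit=max: over=%d\n", over_soft_limit(&c));
        return 0;
}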
diff --git a/mm/memory.c b/mm/memory.c
index ca920d1fd314..54f3a9b00956 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -235,6 +235,9 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
 
 static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
+        if (!tlb->end)
+                return;
+
         tlb_flush(tlb);
         mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
@@ -247,7 +250,7 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
 {
         struct mmu_gather_batch *batch;
 
-        for (batch = &tlb->local; batch; batch = batch->next) {
+        for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
                 free_pages_and_swap_cache(batch->pages, batch->nr);
                 batch->nr = 0;
         }
@@ -256,9 +259,6 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
 
 void tlb_flush_mmu(struct mmu_gather *tlb)
 {
-        if (!tlb->end)
-                return;
-
         tlb_flush_mmu_tlbonly(tlb);
         tlb_flush_mmu_free(tlb);
 }
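These first memory.c hunks move the `if (!tlb->end)` guard out of tlb_flush_mmu() and into tlb_flush_mmu_tlbonly(), so any path that reaches the leaf flush helper skips the hardware flush when nothing was batched. A toy sketch of that "guard in the leaf helper" pattern (illustrative names, not kernel code):

#include <stdio.h>

/* Toy model of the mmu_gather change: the "nothing to flush" check
 * lives in the leaf helper, so no caller can forget it. */
struct gather {
        unsigned long start, end;       /* end == 0 means nothing batched */
};

static void flush_tlb_only(struct gather *g)
{
        if (!g->end)                    /* guard lives here now */
                return;
        printf("flush %#lx-%#lx\n", g->start, g->end);
        g->start = g->end = 0;
}

static void flush_all(struct gather *g)
{
        flush_tlb_only(g);              /* no caller-side guard needed */
        /* ... the real code frees batched pages here ... */
}

int main(void)
{
        struct gather g = { 0x1000, 0x2000 };

        flush_all(&g);  /* flushes once */
        flush_all(&g);  /* empty: the leaf guard makes this a no-op */
        return 0;
}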
@@ -2137,17 +2137,24 @@ reuse:
         if (!dirty_page)
                 return ret;
 
-        /*
-         * Yes, Virginia, this is actually required to prevent a race
-         * with clear_page_dirty_for_io() from clearing the page dirty
-         * bit after it clear all dirty ptes, but before a racing
-         * do_wp_page installs a dirty pte.
-         *
-         * do_shared_fault is protected similarly.
-         */
         if (!page_mkwrite) {
-                wait_on_page_locked(dirty_page);
-                set_page_dirty_balance(dirty_page);
+                struct address_space *mapping;
+                int dirtied;
+
+                lock_page(dirty_page);
+                dirtied = set_page_dirty(dirty_page);
+                VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page);
+                mapping = dirty_page->mapping;
+                unlock_page(dirty_page);
+
+                if (dirtied && mapping) {
+                        /*
+                         * Some device drivers do not set page.mapping
+                         * but still dirty their pages
+                         */
+                        balance_dirty_pages_ratelimited(mapping);
+                }
+
                 /* file_update_time outside page_lock */
                 if (vma->vm_file)
                         file_update_time(vma->vm_file);
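The do_wp_page() hunk stops relying on wait_on_page_locked() + set_page_dirty_balance() and instead dirties the page while actually holding the page lock, the same lock clear_page_dirty_for_io() takes, then throttles outside the lock. A rough pthread model of that exclusion (toy names, not the kernel API):

#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

/* Toy page: one lock that both the fault path and the writeback
 * path take, modelling the page lock the patch relies on. */
struct toy_page {
        pthread_mutex_t lock;
        bool dirty;
};

/* Fault side (new do_wp_page shape): dirty under the lock, read the
 * mapping under the lock, throttle afterwards outside it. */
static void fault_dirty(struct toy_page *p)
{
        bool dirtied;

        pthread_mutex_lock(&p->lock);
        dirtied = !p->dirty;
        p->dirty = true;
        pthread_mutex_unlock(&p->lock);

        if (dirtied)
                puts("would balance_dirty_pages_ratelimited() here");
}

/* Writeback side (clear_page_dirty_for_io shape): same lock, so the
 * test-and-clear cannot interleave with fault_dirty(). */
static bool clear_dirty_for_io(struct toy_page *p)
{
        bool was_dirty;

        pthread_mutex_lock(&p->lock);
        was_dirty = p->dirty;
        p->dirty = false;
        pthread_mutex_unlock(&p->lock);
        return was_dirty;
}

int main(void)
{
        struct toy_page p = { PTHREAD_MUTEX_INITIALIZER, false };

        fault_dirty(&p);
        printf("needs writeback: %d\n", clear_dirty_for_io(&p));
        return 0;
}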
@@ -2593,7 +2600,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
                 if (prev && prev->vm_end == address)
                         return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
 
-                expand_downwards(vma, address - PAGE_SIZE);
+                return expand_downwards(vma, address - PAGE_SIZE);
         }
         if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
                 struct vm_area_struct *next = vma->vm_next;
@@ -2602,7 +2609,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
                 if (next && next->vm_start == address + PAGE_SIZE)
                         return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
 
-                expand_upwards(vma, address + PAGE_SIZE);
+                return expand_upwards(vma, address + PAGE_SIZE);
         }
         return 0;
 }
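The check_stack_guard_page() hunks fix a swallowed error: expand_downwards()/expand_upwards() can fail with -ENOMEM, but their result used to be discarded, so a failed stack expansion was reported as success. The bug class in miniature (hypothetical helpers):

#include <errno.h>
#include <stdio.h>

static int expand(int ok)
{
        return ok ? 0 : -ENOMEM;
}

/* Old shape: the callee's result is computed and silently dropped. */
static int check_buggy(int ok)
{
        expand(ok);             /* error discarded */
        return 0;               /* always reports success */
}

/* New shape: propagate the callee's result to the caller. */
static int check_fixed(int ok)
{
        return expand(ok);
}

int main(void)
{
        printf("buggy: %d, fixed: %d\n", check_buggy(0), check_fixed(0));
        return 0;
}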
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -778,10 +778,12 @@ again:                 remove_next = 1 + (end > next->vm_end);
                 if (exporter && exporter->anon_vma && !importer->anon_vma) {
                         int error;
 
+                        importer->anon_vma = exporter->anon_vma;
                         error = anon_vma_clone(importer, exporter);
-                        if (error)
+                        if (error) {
+                                importer->anon_vma = NULL;
                                 return error;
-                        importer->anon_vma = exporter->anon_vma;
+                        }
                 }
         }
 
@@ -2099,14 +2101,17 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 {
         struct mm_struct *mm = vma->vm_mm;
         struct rlimit *rlim = current->signal->rlim;
-        unsigned long new_start;
+        unsigned long new_start, actual_size;
 
         /* address space limit tests */
         if (!may_expand_vm(mm, grow))
                 return -ENOMEM;
 
         /* Stack limit test */
-        if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+        actual_size = size;
+        if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
+                actual_size -= PAGE_SIZE;
+        if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
                 return -ENOMEM;
 
         /* mlock limit tests */
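The acct_stack_growth() hunk stops counting the stack guard page against RLIMIT_STACK, so a stack that has grown right up to its rlimit can still grow to the full permitted size. A standalone sketch of the adjusted test (illustrative names):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Sketch of the new stack-limit test: when the vma can grow, one
 * page of its prospective size is the guard page, so it is
 * subtracted before comparing against RLIMIT_STACK. */
static bool stack_growth_ok(unsigned long size, bool growable,
                            unsigned long rlim_stack)
{
        unsigned long actual_size = size;

        if (size && growable)
                actual_size -= PAGE_SIZE;
        return actual_size <= rlim_stack;
}

int main(void)
{
        unsigned long rlim = 8UL << 20;         /* 8 MiB stack rlimit */
        unsigned long size = rlim + PAGE_SIZE;  /* stack plus guard page */

        /* Old test (size > rlim) refused this growth: */
        printf("old: %d\n", size <= rlim);
        /* New test permits it, since the guard page no longer counts: */
        printf("new: %d\n", stack_growth_ok(size, true, rlim));
        return 0;
}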
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d5d81f5384d1..6f4335238e33 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1541,16 +1541,6 @@ pause:
                 bdi_start_background_writeback(bdi);
 }
 
-void set_page_dirty_balance(struct page *page)
-{
-        if (set_page_dirty(page)) {
-                struct address_space *mapping = page_mapping(page);
-
-                if (mapping)
-                        balance_dirty_pages_ratelimited(mapping);
-        }
-}
-
 static DEFINE_PER_CPU(int, bdp_ratelimits);
 
 /*
@@ -2123,32 +2113,25 @@ EXPORT_SYMBOL(account_page_dirtied);
  * page dirty in that case, but not all the buffers. This is a "bottom-up"
  * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
  *
- * Most callers have locked the page, which pins the address_space in memory.
- * But zap_pte_range() does not lock the page, however in that case the
- * mapping is pinned by the vma's ->vm_file reference.
- *
- * We take care to handle the case where the page was truncated from the
- * mapping by re-checking page_mapping() inside tree_lock.
+ * The caller must ensure this doesn't race with truncation. Most will simply
+ * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
+ * the pte lock held, which also locks out truncation.
  */
 int __set_page_dirty_nobuffers(struct page *page)
 {
         if (!TestSetPageDirty(page)) {
                 struct address_space *mapping = page_mapping(page);
-                struct address_space *mapping2;
                 unsigned long flags;
 
                 if (!mapping)
                         return 1;
 
                 spin_lock_irqsave(&mapping->tree_lock, flags);
-                mapping2 = page_mapping(page);
-                if (mapping2) { /* Race with truncate? */
-                        BUG_ON(mapping2 != mapping);
-                        WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-                        account_page_dirtied(page, mapping);
-                        radix_tree_tag_set(&mapping->page_tree,
-                                page_index(page), PAGECACHE_TAG_DIRTY);
-                }
+                BUG_ON(page_mapping(page) != mapping);
+                WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
+                account_page_dirtied(page, mapping);
+                radix_tree_tag_set(&mapping->page_tree, page_index(page),
+                                   PAGECACHE_TAG_DIRTY);
                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
                 if (mapping->host) {
                         /* !PageAnon && !swapper_space */
@@ -2305,12 +2288,10 @@ int clear_page_dirty_for_io(struct page *page)
                 /*
                  * We carefully synchronise fault handlers against
                  * installing a dirty pte and marking the page dirty
                  * at this point. We do this by having them hold the
-                 * page lock at some point after installing their
-                 * pte, but before marking the page dirty.
-                 * Pages are always locked coming in here, so we get
-                 * the desired exclusion. See mm/memory.c:do_wp_page()
-                 * for more comments.
+                 * page lock while dirtying the page, and pages are
+                 * always locked coming in here, so we get the desired
+                 * exclusion.
                  */
                 if (TestClearPageDirty(page)) {
                         dec_zone_page_state(page, NR_FILE_DIRTY);
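With every __set_page_dirty_nobuffers() caller now required to hold off truncation (via the page lock or the pte lock), the defensive page_mapping() re-check under tree_lock collapses into a BUG_ON() assertion. The shape of that refactor, modelled outside the kernel with assert() (toy names):

#include <assert.h>
#include <stddef.h>

struct mapping { int dummy; };

struct toy_page {
        struct mapping *mapping;        /* NULL once truncated */
};

/* Old shape: tolerate a concurrent truncation by re-checking. */
static void tag_dirty_defensive(struct toy_page *pg, struct mapping *m)
{
        if (pg->mapping) {              /* race with truncate? */
                assert(pg->mapping == m);
                /* ... account and tag the page dirty ... */
        }
}

/* New shape: callers guarantee truncation cannot run, so the
 * re-check becomes an assertion on an invariant (BUG_ON in the kernel). */
static void tag_dirty_asserting(struct toy_page *pg, struct mapping *m)
{
        assert(pg->mapping == m);
        /* ... account and tag the page dirty ... */
}

int main(void)
{
        struct mapping m = { 0 };
        struct toy_page pg = { &m };

        tag_dirty_defensive(&pg, &m);
        tag_dirty_asserting(&pg, &m);
        return 0;
}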
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -72,6 +72,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
         anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
         if (anon_vma) {
                 atomic_set(&anon_vma->refcount, 1);
+                anon_vma->degree = 1;   /* Reference for first vma */
+                anon_vma->parent = anon_vma;
                 /*
                  * Initialise the anon_vma root to point to itself. If called
                  * from fork, the root will be reset to the parents anon_vma.
@@ -188,6 +190,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
                 if (likely(!vma->anon_vma)) {
                         vma->anon_vma = anon_vma;
                         anon_vma_chain_link(vma, avc, anon_vma);
+                        /* vma reference or self-parent link for new root */
+                        anon_vma->degree++;
                         allocated = NULL;
                         avc = NULL;
                 }
@@ -236,6 +240,14 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
 /*
  * Attach the anon_vmas from src to dst.
  * Returns 0 on success, -ENOMEM on failure.
+ *
+ * If dst->anon_vma is NULL this function tries to find and reuse existing
+ * anon_vma which has no vmas and only one child anon_vma. This prevents
+ * degradation of anon_vma hierarchy to endless linear chain in case of
+ * constantly forking task. On the other hand, an anon_vma with more than one
+ * child isn't reused even if there was no alive vma, thus rmap walker has a
+ * good chance of avoiding scanning the whole hierarchy when it searches where
+ * page is mapped.
  */
 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
@@ -256,7 +268,21 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
                 anon_vma = pavc->anon_vma;
                 root = lock_anon_vma_root(root, anon_vma);
                 anon_vma_chain_link(dst, avc, anon_vma);
+
+                /*
+                 * Reuse existing anon_vma if its degree lower than two,
+                 * that means it has no vma and only one anon_vma child.
+                 *
+                 * Do not chose parent anon_vma, otherwise first child
+                 * will always reuse it. Root anon_vma is never reused:
+                 * it has self-parent reference and at least one child.
+                 */
+                if (!dst->anon_vma && anon_vma != src->anon_vma &&
+                                anon_vma->degree < 2)
+                        dst->anon_vma = anon_vma;
         }
+        if (dst->anon_vma)
+                dst->anon_vma->degree++;
         unlock_anon_vma_root(root);
         return 0;
 
@@ -280,6 +306,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
         if (!pvma->anon_vma)
                 return 0;
 
+        /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
+        vma->anon_vma = NULL;
+
         /*
          * First, attach the new VMA to the parent VMA's anon_vmas,
          * so rmap can find non-COWed pages in child processes.
@@ -288,6 +317,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
         if (error)
                 return error;
 
+        /* An existing anon_vma has been reused, all done then. */
+        if (vma->anon_vma)
+                return 0;
+
         /* Then add our own anon_vma. */
         anon_vma = anon_vma_alloc();
         if (!anon_vma)
@@ -301,6 +334,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
          * lock any of the anon_vmas in this anon_vma tree.
          */
         anon_vma->root = pvma->anon_vma->root;
+        anon_vma->parent = pvma->anon_vma;
         /*
          * With refcounts, an anon_vma can stay around longer than the
          * process it belongs to. The root anon_vma needs to be pinned until
@@ -311,6 +345,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
         vma->anon_vma = anon_vma;
         anon_vma_lock_write(anon_vma);
         anon_vma_chain_link(vma, avc, anon_vma);
+        anon_vma->parent->degree++;
         anon_vma_unlock_write(anon_vma);
 
         return 0;
@@ -341,12 +376,16 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
                  * Leave empty anon_vmas on the list - we'll need
                  * to free them outside the lock.
                  */
-                if (RB_EMPTY_ROOT(&anon_vma->rb_root))
+                if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
+                        anon_vma->parent->degree--;
                         continue;
+                }
 
                 list_del(&avc->same_vma);
                 anon_vma_chain_free(avc);
         }
+        if (vma->anon_vma)
+                vma->anon_vma->degree--;
         unlock_anon_vma_root(root);
 
         /*
@@ -357,6 +396,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
         list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                 struct anon_vma *anon_vma = avc->anon_vma;
 
+                BUG_ON(anon_vma->degree);
                 put_anon_vma(anon_vma);
 
                 list_del(&avc->same_vma);
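The rmap hunks add degree/parent bookkeeping so anon_vma_clone() can let a forked child adopt an empty leaf anon_vma (degree < 2: no live vma and at most one child) instead of growing an unbounded chain of anon_vmas across forks; unlink_anon_vmas() decrements the counts and BUG_ON()s if they ever underflow. A toy model of the reuse test (illustrative, not kernel code):

#include <stdio.h>

/* Toy anon_vma: degree counts children plus one if a live vma still
 * uses it; the kernel keeps this in anon_vma->degree. */
struct toy_anon_vma {
        struct toy_anon_vma *parent;
        unsigned int degree;
};

/* The reuse test from anon_vma_clone(): an anon_vma with degree < 2
 * has no live vma and at most one child, so a forked child can adopt
 * it instead of allocating a fresh one and lengthening the chain. */
static int reusable(const struct toy_anon_vma *av,
                    const struct toy_anon_vma *src_av)
{
        return av != src_av && av->degree < 2;
}

int main(void)
{
        struct toy_anon_vma parent = { .degree = 2 };   /* vma + one child */
        struct toy_anon_vma leaf = { .parent = &parent, .degree = 1 };

        printf("reuse parent: %d\n", reusable(&parent, &parent));
        printf("reuse leaf:   %d\n", reusable(&leaf, &parent));
        return 0;
}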
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd9a72bc4a1b..ab2505c3ef54 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2921,18 +2921,20 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
                 return false;
 
         /*
-         * There is a potential race between when kswapd checks its watermarks
-         * and a process gets throttled. There is also a potential race if
-         * processes get throttled, kswapd wakes, a large process exits therby
-         * balancing the zones that causes kswapd to miss a wakeup. If kswapd
-         * is going to sleep, no process should be sleeping on pfmemalloc_wait
-         * so wake them now if necessary. If necessary, processes will wake
-         * kswapd and get throttled again
+         * The throttled processes are normally woken up in balance_pgdat() as
+         * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+         * race between when kswapd checks the watermarks and a process gets
+         * throttled. There is also a potential race if processes get
+         * throttled, kswapd wakes, a large process exits thereby balancing the
+         * zones, which causes kswapd to exit balance_pgdat() before reaching
+         * the wake up checks. If kswapd is going to sleep, no process should
+         * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
+         * the wake up is premature, processes will wake kswapd and get
+         * throttled again. The difference from wake ups in balance_pgdat() is
+         * that here we are under prepare_to_wait().
          */
-        if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
-                wake_up(&pgdat->pfmemalloc_wait);
-                return false;
-        }
+        if (waitqueue_active(&pgdat->pfmemalloc_wait))
+                wake_up_all(&pgdat->pfmemalloc_wait);
 
         return pgdat_balanced(pgdat, order, classzone_idx);
 }
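The vmscan hunk switches wake_up() to wake_up_all() because several direct reclaimers can be throttled on pfmemalloc_wait at once; waking only one before kswapd sleeps could leave the rest blocked with nobody left to wake them, and kswapd no longer aborts its own sleep check merely because waiters exist. The same single-wakeup versus broadcast distinction, modelled in pthreads:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pfmemalloc_wait = PTHREAD_COND_INITIALIZER;
static int watermark_ok;

/* Throttled direct reclaimer: sleeps until the watermark recovers. */
static void *throttled_task(void *arg)
{
        pthread_mutex_lock(&lock);
        while (!watermark_ok)
                pthread_cond_wait(&pfmemalloc_wait, &lock);
        pthread_mutex_unlock(&lock);
        return arg;
}

/* kswapd about to sleep: broadcast (wake_up_all) releases every
 * waiter; a single signal (wake_up) would free only one, leaving
 * the rest asleep with no one left to wake them. */
static void kswapd_pre_sleep(void)
{
        pthread_mutex_lock(&lock);
        watermark_ok = 1;
        pthread_cond_broadcast(&pfmemalloc_wait);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t t[3];
        int i;

        for (i = 0; i < 3; i++)
                pthread_create(&t[i], NULL, throttled_task, NULL);
        kswapd_pre_sleep();
        for (i = 0; i < 3; i++)
                pthread_join(t[i], NULL);
        return 0;
}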