Diffstat (limited to 'mm')

 -rw-r--r--  mm/Kconfig.debug     |  9
 -rw-r--r--  mm/memcontrol.c      | 17
 -rw-r--r--  mm/memory.c          | 39
 -rw-r--r--  mm/mmap.c            | 13
 -rw-r--r--  mm/page-writeback.c  | 43
 -rw-r--r--  mm/rmap.c            | 42
 -rw-r--r--  mm/vmscan.c          | 24

 7 files changed, 102 insertions(+), 85 deletions(-)
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 56badfc4810a..957d3da53ddd 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -14,7 +14,6 @@ config DEBUG_PAGEALLOC
         depends on !KMEMCHECK
         select PAGE_EXTENSION
         select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
-        select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
         ---help---
           Unmap pages from the kernel linear mapping after free_pages().
           This results in a large slowdown, but helps to find certain types
@@ -27,13 +26,5 @@ config DEBUG_PAGEALLOC
           that would result in incorrect warnings of memory corruption after
           a resume because free pages are not saved to the suspend image.
 
-config WANT_PAGE_DEBUG_FLAGS
-        bool
-
 config PAGE_POISONING
         bool
-        select WANT_PAGE_DEBUG_FLAGS
-
-config PAGE_GUARD
-        bool
-        select WANT_PAGE_DEBUG_FLAGS
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ef91e856c7e4..851924fa5170 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3043,18 +3043,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
                 mem_cgroup_swap_statistics(from, false);
                 mem_cgroup_swap_statistics(to, true);
-                /*
-                 * This function is only called from task migration context now.
-                 * It postpones page_counter and refcount handling till the end
-                 * of task migration(mem_cgroup_clear_mc()) for performance
-                 * improvement. But we cannot postpone css_get(to) because if
-                 * the process that has been moved to @to does swap-in, the
-                 * refcount of @to might be decreased to 0.
-                 *
-                 * We are in attach() phase, so the cgroup is guaranteed to be
-                 * alive, so we can just call css_get().
-                 */
-                css_get(&to->css);
                 return 0;
         }
         return -EINVAL;
@@ -4679,6 +4667,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
         if (parent_css == NULL) {
                 root_mem_cgroup = memcg;
                 page_counter_init(&memcg->memory, NULL);
+                memcg->soft_limit = PAGE_COUNTER_MAX;
                 page_counter_init(&memcg->memsw, NULL);
                 page_counter_init(&memcg->kmem, NULL);
         }
@@ -4724,6 +4713,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
 
         if (parent->use_hierarchy) {
                 page_counter_init(&memcg->memory, &parent->memory);
+                memcg->soft_limit = PAGE_COUNTER_MAX;
                 page_counter_init(&memcg->memsw, &parent->memsw);
                 page_counter_init(&memcg->kmem, &parent->kmem);
 
@@ -4733,6 +4723,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
          */
         } else {
                 page_counter_init(&memcg->memory, NULL);
+                memcg->soft_limit = PAGE_COUNTER_MAX;
                 page_counter_init(&memcg->memsw, NULL);
                 page_counter_init(&memcg->kmem, NULL);
                 /*
@@ -4807,7 +4798,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
         mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
         mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
         memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
-        memcg->soft_limit = 0;
+        memcg->soft_limit = PAGE_COUNTER_MAX;
 }
 
 #ifdef CONFIG_MMU
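The memcontrol hunks fix the soft limit default: a soft_limit of 0 makes every cgroup look permanently over its soft limit, while PAGE_COUNTER_MAX is the "no limit" sentinel, so the init paths and css_reset() now both use it. A toy userspace model of why the sentinel matters (names here are illustrative, not the kernel API):

#include <stdbool.h>
#include <stdio.h>
#include <limits.h>

/* Toy model: ULONG_MAX plays the role of PAGE_COUNTER_MAX. */
#define COUNTER_MAX ULONG_MAX

struct counter {
        unsigned long usage;
        unsigned long soft_limit;       /* COUNTER_MAX == "no soft limit" */
};

static bool over_soft_limit(const struct counter *c)
{
        return c->usage > c->soft_limit;
}

int main(void)
{
        struct counter c = { .usage = 42, .soft_limit = 0 };

        /* With 0 as the default, any usage at all looks over-limit... */
        printf("soft_limit=0:   over=%d\n", over_soft_limit(&c));

        /* ...so init and reset paths must restore the sentinel instead. */
        c.soft_limit = COUNTER_MAX;
        printf("soft_limit=max: over=%d\n", over_soft_limit(&c));
        return 0;
}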
diff --git a/mm/memory.c b/mm/memory.c
index ca920d1fd314..54f3a9b00956 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -235,6 +235,9 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
 
 static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
+        if (!tlb->end)
+                return;
+
         tlb_flush(tlb);
         mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
@@ -247,7 +250,7 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
 {
         struct mmu_gather_batch *batch;
 
-        for (batch = &tlb->local; batch; batch = batch->next) {
+        for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
                 free_pages_and_swap_cache(batch->pages, batch->nr);
                 batch->nr = 0;
         }
@@ -256,9 +259,6 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
 
 void tlb_flush_mmu(struct mmu_gather *tlb)
 {
-        if (!tlb->end)
-                return;
-
         tlb_flush_mmu_tlbonly(tlb);
         tlb_flush_mmu_free(tlb);
 }
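These first memory.c hunks move the `if (!tlb->end)` guard out of tlb_flush_mmu() and into tlb_flush_mmu_tlbonly(), so any path that reaches the leaf flush helper skips the hardware flush when nothing was batched. A toy sketch of that "guard in the leaf helper" pattern (illustrative names, not kernel code):

#include <stdio.h>

/* Toy model of the mmu_gather change: the "nothing to flush" check
 * lives in the leaf helper, so no caller can forget it. */
struct gather {
        unsigned long start, end;       /* end == 0 means nothing batched */
};

static void flush_tlb_only(struct gather *g)
{
        if (!g->end)                    /* guard lives here now */
                return;
        printf("flush %#lx-%#lx\n", g->start, g->end);
        g->start = g->end = 0;
}

static void flush_all(struct gather *g)
{
        flush_tlb_only(g);              /* no caller-side guard needed */
        /* ... the real code frees batched pages here ... */
}

int main(void)
{
        struct gather g = { 0x1000, 0x2000 };

        flush_all(&g);  /* flushes once */
        flush_all(&g);  /* empty: the leaf guard makes this a no-op */
        return 0;
}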
@@ -2137,17 +2137,24 @@ reuse:
         if (!dirty_page)
                 return ret;
 
-        /*
-         * Yes, Virginia, this is actually required to prevent a race
-         * with clear_page_dirty_for_io() from clearing the page dirty
-         * bit after it clear all dirty ptes, but before a racing
-         * do_wp_page installs a dirty pte.
-         *
-         * do_shared_fault is protected similarly.
-         */
         if (!page_mkwrite) {
-                wait_on_page_locked(dirty_page);
-                set_page_dirty_balance(dirty_page);
+                struct address_space *mapping;
+                int dirtied;
+
+                lock_page(dirty_page);
+                dirtied = set_page_dirty(dirty_page);
+                VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page);
+                mapping = dirty_page->mapping;
+                unlock_page(dirty_page);
+
+                if (dirtied && mapping) {
+                        /*
+                         * Some device drivers do not set page.mapping
+                         * but still dirty their pages
+                         */
+                        balance_dirty_pages_ratelimited(mapping);
+                }
+
                 /* file_update_time outside page_lock */
                 if (vma->vm_file)
                         file_update_time(vma->vm_file);
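The do_wp_page() hunk stops relying on wait_on_page_locked() + set_page_dirty_balance() and instead dirties the page while actually holding the page lock, the same lock clear_page_dirty_for_io() takes, then throttles outside the lock. A rough pthread model of that exclusion (toy names, not the kernel API):

#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

/* Toy page: one lock that both the fault path and the writeback
 * path take, modelling the page lock the patch relies on. */
struct toy_page {
        pthread_mutex_t lock;
        bool dirty;
};

/* Fault side (new do_wp_page shape): dirty under the lock, read the
 * mapping under the lock, throttle afterwards outside it. */
static void fault_dirty(struct toy_page *p)
{
        bool dirtied;

        pthread_mutex_lock(&p->lock);
        dirtied = !p->dirty;
        p->dirty = true;
        pthread_mutex_unlock(&p->lock);

        if (dirtied)
                puts("would balance_dirty_pages_ratelimited() here");
}

/* Writeback side (clear_page_dirty_for_io shape): same lock, so the
 * test-and-clear cannot interleave with fault_dirty(). */
static bool clear_dirty_for_io(struct toy_page *p)
{
        bool was_dirty;

        pthread_mutex_lock(&p->lock);
        was_dirty = p->dirty;
        p->dirty = false;
        pthread_mutex_unlock(&p->lock);
        return was_dirty;
}

int main(void)
{
        struct toy_page p = { PTHREAD_MUTEX_INITIALIZER, false };

        fault_dirty(&p);
        printf("needs writeback: %d\n", clear_dirty_for_io(&p));
        return 0;
}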
@@ -2593,7 +2600,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
                 if (prev && prev->vm_end == address)
                         return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
 
-                expand_downwards(vma, address - PAGE_SIZE);
+                return expand_downwards(vma, address - PAGE_SIZE);
         }
         if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
                 struct vm_area_struct *next = vma->vm_next;
@@ -2602,7 +2609,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
                 if (next && next->vm_start == address + PAGE_SIZE)
                         return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
 
-                expand_upwards(vma, address + PAGE_SIZE);
+                return expand_upwards(vma, address + PAGE_SIZE);
         }
         return 0;
 }
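The check_stack_guard_page() hunks fix a swallowed error: expand_downwards()/expand_upwards() can fail with -ENOMEM, but their result used to be discarded, so a failed stack expansion was reported as success. The bug class in miniature (hypothetical helpers):

#include <errno.h>
#include <stdio.h>

static int expand(int ok)
{
        return ok ? 0 : -ENOMEM;
}

/* Old shape: the callee's result is computed and silently dropped. */
static int check_buggy(int ok)
{
        expand(ok);             /* error discarded */
        return 0;               /* always reports success */
}

/* New shape: propagate the callee's result to the caller. */
static int check_fixed(int ok)
{
        return expand(ok);
}

int main(void)
{
        printf("buggy: %d, fixed: %d\n", check_buggy(0), check_fixed(0));
        return 0;
}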
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -778,10 +778,12 @@ again:                 remove_next = 1 + (end > next->vm_end);
                 if (exporter && exporter->anon_vma && !importer->anon_vma) {
                         int error;
 
+                        importer->anon_vma = exporter->anon_vma;
                         error = anon_vma_clone(importer, exporter);
-                        if (error)
+                        if (error) {
+                                importer->anon_vma = NULL;
                                 return error;
-                        importer->anon_vma = exporter->anon_vma;
+                        }
                 }
         }
 
@@ -2099,14 +2101,17 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 {
         struct mm_struct *mm = vma->vm_mm;
         struct rlimit *rlim = current->signal->rlim;
-        unsigned long new_start;
+        unsigned long new_start, actual_size;
 
         /* address space limit tests */
         if (!may_expand_vm(mm, grow))
                 return -ENOMEM;
 
         /* Stack limit test */
-        if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+        actual_size = size;
+        if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
+                actual_size -= PAGE_SIZE;
+        if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
                 return -ENOMEM;
 
         /* mlock limit tests */
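The acct_stack_growth() hunk stops counting the stack guard page against RLIMIT_STACK, so a stack that has grown right up to its rlimit can still grow to the full permitted size. A standalone sketch of the adjusted test (illustrative names):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Sketch of the new stack-limit test: when the vma can grow, one
 * page of its prospective size is the guard page, so it is
 * subtracted before comparing against RLIMIT_STACK. */
static bool stack_growth_ok(unsigned long size, bool growable,
                            unsigned long rlim_stack)
{
        unsigned long actual_size = size;

        if (size && growable)
                actual_size -= PAGE_SIZE;
        return actual_size <= rlim_stack;
}

int main(void)
{
        unsigned long rlim = 8UL << 20;         /* 8 MiB stack rlimit */
        unsigned long size = rlim + PAGE_SIZE;  /* stack plus guard page */

        /* Old test (size > rlim) refused this growth: */
        printf("old: %d\n", size <= rlim);
        /* New test permits it, since the guard page no longer counts: */
        printf("new: %d\n", stack_growth_ok(size, true, rlim));
        return 0;
}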
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d5d81f5384d1..6f4335238e33 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1541,16 +1541,6 @@ pause:
                 bdi_start_background_writeback(bdi);
 }
 
-void set_page_dirty_balance(struct page *page)
-{
-        if (set_page_dirty(page)) {
-                struct address_space *mapping = page_mapping(page);
-
-                if (mapping)
-                        balance_dirty_pages_ratelimited(mapping);
-        }
-}
-
 static DEFINE_PER_CPU(int, bdp_ratelimits);
 
 /*
@@ -2123,32 +2113,25 @@ EXPORT_SYMBOL(account_page_dirtied);
  * page dirty in that case, but not all the buffers. This is a "bottom-up"
  * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
  *
- * Most callers have locked the page, which pins the address_space in memory.
- * But zap_pte_range() does not lock the page, however in that case the
- * mapping is pinned by the vma's ->vm_file reference.
- *
- * We take care to handle the case where the page was truncated from the
- * mapping by re-checking page_mapping() inside tree_lock.
+ * The caller must ensure this doesn't race with truncation. Most will simply
+ * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
+ * the pte lock held, which also locks out truncation.
  */
 int __set_page_dirty_nobuffers(struct page *page)
 {
         if (!TestSetPageDirty(page)) {
                 struct address_space *mapping = page_mapping(page);
-                struct address_space *mapping2;
                 unsigned long flags;
 
                 if (!mapping)
                         return 1;
 
                 spin_lock_irqsave(&mapping->tree_lock, flags);
-                mapping2 = page_mapping(page);
-                if (mapping2) { /* Race with truncate? */
-                        BUG_ON(mapping2 != mapping);
-                        WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-                        account_page_dirtied(page, mapping);
-                        radix_tree_tag_set(&mapping->page_tree,
-                                page_index(page), PAGECACHE_TAG_DIRTY);
-                }
+                BUG_ON(page_mapping(page) != mapping);
+                WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
+                account_page_dirtied(page, mapping);
+                radix_tree_tag_set(&mapping->page_tree, page_index(page),
+                                   PAGECACHE_TAG_DIRTY);
                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
                 if (mapping->host) {
                         /* !PageAnon && !swapper_space */
@@ -2305,12 +2288,10 @@ int clear_page_dirty_for_io(struct page *page)
                 /*
                  * We carefully synchronise fault handlers against
                  * installing a dirty pte and marking the page dirty
                  * at this point. We do this by having them hold the
-                 * page lock at some point after installing their
-                 * pte, but before marking the page dirty.
-                 * Pages are always locked coming in here, so we get
-                 * the desired exclusion. See mm/memory.c:do_wp_page()
-                 * for more comments.
+                 * page lock while dirtying the page, and pages are
+                 * always locked coming in here, so we get the desired
+                 * exclusion.
                  */
                 if (TestClearPageDirty(page)) {
                         dec_zone_page_state(page, NR_FILE_DIRTY);
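With every __set_page_dirty_nobuffers() caller now required to hold off truncation (via the page lock or the pte lock), the defensive page_mapping() re-check under tree_lock collapses into a BUG_ON() assertion. The shape of that refactor, modelled outside the kernel with assert() (toy names):

#include <assert.h>
#include <stddef.h>

struct mapping { int dummy; };

struct toy_page {
        struct mapping *mapping;        /* NULL once truncated */
};

/* Old shape: tolerate a concurrent truncation by re-checking. */
static void tag_dirty_defensive(struct toy_page *pg, struct mapping *m)
{
        if (pg->mapping) {              /* race with truncate? */
                assert(pg->mapping == m);
                /* ... account and tag the page dirty ... */
        }
}

/* New shape: callers guarantee truncation cannot run, so the
 * re-check becomes an assertion on an invariant (BUG_ON in the kernel). */
static void tag_dirty_asserting(struct toy_page *pg, struct mapping *m)
{
        assert(pg->mapping == m);
        /* ... account and tag the page dirty ... */
}

int main(void)
{
        struct mapping m = { 0 };
        struct toy_page pg = { &m };

        tag_dirty_defensive(&pg, &m);
        tag_dirty_asserting(&pg, &m);
        return 0;
}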
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -72,6 +72,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
         anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
         if (anon_vma) {
                 atomic_set(&anon_vma->refcount, 1);
+                anon_vma->degree = 1;   /* Reference for first vma */
+                anon_vma->parent = anon_vma;
                 /*
                  * Initialise the anon_vma root to point to itself. If called
                  * from fork, the root will be reset to the parents anon_vma.
@@ -188,6 +190,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
                 if (likely(!vma->anon_vma)) {
                         vma->anon_vma = anon_vma;
                         anon_vma_chain_link(vma, avc, anon_vma);
+                        /* vma reference or self-parent link for new root */
+                        anon_vma->degree++;
                         allocated = NULL;
                         avc = NULL;
                 }
@@ -236,6 +240,14 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
 /*
  * Attach the anon_vmas from src to dst.
  * Returns 0 on success, -ENOMEM on failure.
+ *
+ * If dst->anon_vma is NULL this function tries to find and reuse existing
+ * anon_vma which has no vmas and only one child anon_vma. This prevents
+ * degradation of anon_vma hierarchy to endless linear chain in case of
+ * constantly forking task. On the other hand, an anon_vma with more than one
+ * child isn't reused even if there was no alive vma, thus rmap walker has a
+ * good chance of avoiding scanning the whole hierarchy when it searches where
+ * page is mapped.
  */
 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
@@ -256,7 +268,21 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
                 anon_vma = pavc->anon_vma;
                 root = lock_anon_vma_root(root, anon_vma);
                 anon_vma_chain_link(dst, avc, anon_vma);
+
+                /*
+                 * Reuse existing anon_vma if its degree lower than two,
+                 * that means it has no vma and only one anon_vma child.
+                 *
+                 * Do not chose parent anon_vma, otherwise first child
+                 * will always reuse it. Root anon_vma is never reused:
+                 * it has self-parent reference and at least one child.
+                 */
+                if (!dst->anon_vma && anon_vma != src->anon_vma &&
+                                anon_vma->degree < 2)
+                        dst->anon_vma = anon_vma;
         }
+        if (dst->anon_vma)
+                dst->anon_vma->degree++;
         unlock_anon_vma_root(root);
         return 0;
 
@@ -280,6 +306,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
         if (!pvma->anon_vma)
                 return 0;
 
+        /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
+        vma->anon_vma = NULL;
+
         /*
          * First, attach the new VMA to the parent VMA's anon_vmas,
          * so rmap can find non-COWed pages in child processes.
@@ -288,6 +317,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
         if (error)
                 return error;
 
+        /* An existing anon_vma has been reused, all done then. */
+        if (vma->anon_vma)
+                return 0;
+
         /* Then add our own anon_vma. */
         anon_vma = anon_vma_alloc();
         if (!anon_vma)
@@ -301,6 +334,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
          * lock any of the anon_vmas in this anon_vma tree.
          */
         anon_vma->root = pvma->anon_vma->root;
+        anon_vma->parent = pvma->anon_vma;
         /*
          * With refcounts, an anon_vma can stay around longer than the
          * process it belongs to. The root anon_vma needs to be pinned until
@@ -311,6 +345,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
         vma->anon_vma = anon_vma;
         anon_vma_lock_write(anon_vma);
         anon_vma_chain_link(vma, avc, anon_vma);
+        anon_vma->parent->degree++;
         anon_vma_unlock_write(anon_vma);
 
         return 0;
@@ -341,12 +376,16 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
                  * Leave empty anon_vmas on the list - we'll need
                  * to free them outside the lock.
                  */
-                if (RB_EMPTY_ROOT(&anon_vma->rb_root))
+                if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
+                        anon_vma->parent->degree--;
                         continue;
+                }
 
                 list_del(&avc->same_vma);
                 anon_vma_chain_free(avc);
         }
+        if (vma->anon_vma)
+                vma->anon_vma->degree--;
         unlock_anon_vma_root(root);
 
         /*
@@ -357,6 +396,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
         list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                 struct anon_vma *anon_vma = avc->anon_vma;
 
+                BUG_ON(anon_vma->degree);
                 put_anon_vma(anon_vma);
 
                 list_del(&avc->same_vma);
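The rmap hunks add degree/parent bookkeeping so anon_vma_clone() can let a forked child adopt an empty leaf anon_vma (degree < 2: no live vma and at most one child) instead of growing an unbounded chain of anon_vmas across forks; unlink_anon_vmas() decrements the counts and BUG_ON()s if they ever underflow. A toy model of the reuse test (illustrative, not kernel code):

#include <stdio.h>

/* Toy anon_vma: degree counts children plus one if a live vma still
 * uses it; the kernel keeps this in anon_vma->degree. */
struct toy_anon_vma {
        struct toy_anon_vma *parent;
        unsigned int degree;
};

/* The reuse test from anon_vma_clone(): an anon_vma with degree < 2
 * has no live vma and at most one child, so a forked child can adopt
 * it instead of allocating a fresh one and lengthening the chain. */
static int reusable(const struct toy_anon_vma *av,
                    const struct toy_anon_vma *src_av)
{
        return av != src_av && av->degree < 2;
}

int main(void)
{
        struct toy_anon_vma parent = { .degree = 2 };   /* vma + one child */
        struct toy_anon_vma leaf = { .parent = &parent, .degree = 1 };

        printf("reuse parent: %d\n", reusable(&parent, &parent));
        printf("reuse leaf:   %d\n", reusable(&leaf, &parent));
        return 0;
}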
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd9a72bc4a1b..ab2505c3ef54 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2921,18 +2921,20 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
                 return false;
 
         /*
-         * There is a potential race between when kswapd checks its watermarks
-         * and a process gets throttled. There is also a potential race if
-         * processes get throttled, kswapd wakes, a large process exits therby
-         * balancing the zones that causes kswapd to miss a wakeup. If kswapd
-         * is going to sleep, no process should be sleeping on pfmemalloc_wait
-         * so wake them now if necessary. If necessary, processes will wake
-         * kswapd and get throttled again
+         * The throttled processes are normally woken up in balance_pgdat() as
+         * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+         * race between when kswapd checks the watermarks and a process gets
+         * throttled. There is also a potential race if processes get
+         * throttled, kswapd wakes, a large process exits thereby balancing the
+         * zones, which causes kswapd to exit balance_pgdat() before reaching
+         * the wake up checks. If kswapd is going to sleep, no process should
+         * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
+         * the wake up is premature, processes will wake kswapd and get
+         * throttled again. The difference from wake ups in balance_pgdat() is
+         * that here we are under prepare_to_wait().
          */
-        if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
-                wake_up(&pgdat->pfmemalloc_wait);
-                return false;
-        }
+        if (waitqueue_active(&pgdat->pfmemalloc_wait))
+                wake_up_all(&pgdat->pfmemalloc_wait);
 
         return pgdat_balanced(pgdat, order, classzone_idx);
 }
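The vmscan hunk switches wake_up() to wake_up_all() because several direct reclaimers can be throttled on pfmemalloc_wait at once; waking only one before kswapd sleeps could leave the rest blocked with nobody left to wake them, and kswapd no longer aborts its own sleep check merely because waiters exist. The same single-wakeup versus broadcast distinction, modelled in pthreads:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pfmemalloc_wait = PTHREAD_COND_INITIALIZER;
static int watermark_ok;

/* Throttled direct reclaimer: sleeps until the watermark recovers. */
static void *throttled_task(void *arg)
{
        pthread_mutex_lock(&lock);
        while (!watermark_ok)
                pthread_cond_wait(&pfmemalloc_wait, &lock);
        pthread_mutex_unlock(&lock);
        return arg;
}

/* kswapd about to sleep: broadcast (wake_up_all) releases every
 * waiter; a single signal (wake_up) would free only one, leaving
 * the rest asleep with no one left to wake them. */
static void kswapd_pre_sleep(void)
{
        pthread_mutex_lock(&lock);
        watermark_ok = 1;
        pthread_cond_broadcast(&pfmemalloc_wait);
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        pthread_t t[3];
        int i;

        for (i = 0; i < 3; i++)
                pthread_create(&t[i], NULL, throttled_task, NULL);
        kswapd_pre_sleep();
        for (i = 0; i < 3; i++)
                pthread_join(t[i], NULL);
        return 0;
}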