17 files changed, 155 insertions, 101 deletions
diff --git a/.mailmap b/.mailmap
index ada8ad696b2e..d357e1bd2a43 100644
--- a/.mailmap
+++ b/.mailmap
@@ -51,6 +51,7 @@ Greg Kroah-Hartman <gregkh@suse.de>
 Greg Kroah-Hartman <greg@kroah.com>
 Henk Vergonet <Henk.Vergonet@gmail.com>
 Henrik Kretzschmar <henne@nachtwindheim.de>
+Henrik Rydberg <rydberg@bitmath.org>
 Herbert Xu <herbert@gondor.apana.org.au>
 Jacob Shin <Jacob.Shin@amd.com>
 James Bottomley <jejb@mulgrave.(none)>
diff --git a/MAINTAINERS b/MAINTAINERS
index ddb9ac8d32b3..79b2e4ba78ee 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -724,15 +724,15 @@ F:	include/uapi/linux/apm_bios.h
 F:      drivers/char/apm-emulation.c
 APPLE BCM5974 MULTITOUCH DRIVER
-M:      Henrik Rydberg <rydberg@euromail.se>
+M:      Henrik Rydberg <rydberg@bitmath.org>
 L:      linux-input@vger.kernel.org
-S:      Maintained
+S:      Odd fixes
 F:      drivers/input/mouse/bcm5974.c
 APPLE SMC DRIVER
-M:      Henrik Rydberg <rydberg@euromail.se>
+M:      Henrik Rydberg <rydberg@bitmath.org>
 L:      lm-sensors@lm-sensors.org
-S:      Maintained
+S:      Odd fixes
 F:      drivers/hwmon/applesmc.c
 APPLETALK NETWORK LAYER
@@ -4940,10 +4940,10 @@ F:	include/uapi/linux/input.h
 F:      include/linux/input/
 INPUT MULTITOUCH (MT) PROTOCOL
-M:      Henrik Rydberg <rydberg@euromail.se>
+M:      Henrik Rydberg <rydberg@bitmath.org>
 L:      linux-input@vger.kernel.org
 T:      git git://git.kernel.org/pub/scm/linux/kernel/git/rydberg/input-mt.git
-S:      Maintained
+S:      Odd fixes
 F:      Documentation/input/multi-touch-protocol.txt
 F:      drivers/input/input-mt.c
 K:      \b(ABS|SYN)_MT_
diff --git a/arch/blackfin/mach-bf533/boards/stamp.c b/arch/blackfin/mach-bf533/boards/stamp.c
index 6f4bac969bf7..23eada79439c 100644
--- a/arch/blackfin/mach-bf533/boards/stamp.c
+++ b/arch/blackfin/mach-bf533/boards/stamp.c
@@ -7,6 +7,7 @@
 */
 #include <linux/device.h>
+#include <linux/delay.h>
 #include <linux/platform_device.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 99d440a4a6ba..ee85cd4e136a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -740,14 +740,15 @@ static int __init fcntl_init(void)
         * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
         * is defined as O_NONBLOCK on some platforms and not on others.
         */
-        BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+        BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
                O_RDONLY        | O_WRONLY      | O_RDWR        |
                O_CREAT         | O_EXCL        | O_NOCTTY      |
                O_TRUNC         | O_APPEND      | /* O_NONBLOCK | */
                __O_SYNC        | O_DSYNC       | FASYNC        |
                O_DIRECT        | O_LARGEFILE   | O_DIRECTORY   |
                O_NOFOLLOW      | O_NOATIME     | O_CLOEXEC     |
-                __FMODE_EXEC    | O_PATH        | __O_TMPFILE
+                __FMODE_EXEC    | O_PATH        | __O_TMPFILE   |
+                __FMODE_NONOTIFY
                ));
        fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 79b5af5e6a7b..cecd875653e4 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2023,11 +2023,8 @@ leave:
        dlm_lockres_drop_inflight_ref(dlm, res);
        spin_unlock(&res->spinlock);
-        if (ret < 0) {
+        if (ret < 0)
                mlog_errno(ret);
-                if (newlock)
-                        dlm_lock_put(newlock);
-        }
        return ret;
 }
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b931e04e3388..914c121ec890 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -94,6 +94,14 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
                                     struct inode *inode,
                                     const char *symname);
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+                             struct buffer_head **bh1,
+                             struct inode *inode1,
+                             struct buffer_head **bh2,
+                             struct inode *inode2,
+                             int rename);
+static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
 /* An orphan dir name is an 8 byte value, printed as a hex string */
 #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
@@ -678,8 +686,10 @@ static int ocfs2_link(struct dentry *old_dentry,
 {
        handle_t *handle;
        struct inode *inode = old_dentry->d_inode;
+        struct inode *old_dir = old_dentry->d_parent->d_inode;
        int err;
        struct buffer_head *fe_bh = NULL;
+        struct buffer_head *old_dir_bh = NULL;
        struct buffer_head *parent_fe_bh = NULL;
        struct ocfs2_dinode *fe = NULL;
        struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -696,19 +706,33 @@ static int ocfs2_link(struct dentry *old_dentry,
        dquot_initialize(dir);
-        err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
+        err = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
+                        &parent_fe_bh, dir, 0);
        if (err < 0) {
                if (err != -ENOENT)
                        mlog_errno(err);
                return err;
        }
+        /* make sure both dirs have bhs
+         * get an extra ref on old_dir_bh if old==new */
+        if (!parent_fe_bh) {
+                if (old_dir_bh) {
+                        parent_fe_bh = old_dir_bh;
+                        get_bh(parent_fe_bh);
+                } else {
+                        mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str);
+                        err = -EIO;
+                        goto out;
+                }
+        }
        if (!dir->i_nlink) {
                err = -ENOENT;
                goto out;
        }
-        err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
+        err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
                        old_dentry->d_name.len, &old_de_ino);
        if (err) {
                err = -ENOENT;
@@ -801,10 +825,11 @@ out_unlock_inode:
        ocfs2_inode_unlock(inode, 1);
 out:
-        ocfs2_inode_unlock(dir, 1);
+        ocfs2_double_unlock(old_dir, dir);
        brelse(fe_bh);
        brelse(parent_fe_bh);
+        brelse(old_dir_bh);
        ocfs2_free_dir_lookup_result(&lookup);
@@ -1072,14 +1097,15 @@ static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
 }
 /*
- * The only place this should be used is rename!
+ * The only place this should be used is rename and link!
 * if they have the same id, then the 1st one is the only one locked.
 */
 static int ocfs2_double_lock(struct ocfs2_super *osb,
                             struct buffer_head **bh1,
                             struct inode *inode1,
                             struct buffer_head **bh2,
-                             struct inode *inode2)
+                             struct inode *inode2,
+                             int rename)
 {
        int status;
        int inode1_is_ancestor, inode2_is_ancestor;
@@ -1127,7 +1153,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
                }
                /* lock id2 */
                status = ocfs2_inode_lock_nested(inode2, bh2, 1,
-                                                 OI_LS_RENAME1);
+                                rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT);
                if (status < 0) {
                        if (status != -ENOENT)
                                mlog_errno(status);
@@ -1136,7 +1162,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
        }
        /* lock id1 */
-        status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
+        status = ocfs2_inode_lock_nested(inode1, bh1, 1,
+                        rename == 1 ?  OI_LS_RENAME2 : OI_LS_PARENT);
        if (status < 0) {
                /*
                 * An error return must mean that no cluster locks
@@ -1252,7 +1279,7 @@ static int ocfs2_rename(struct inode *old_dir,
        /* if old and new are the same, this'll just do one lock. */
        status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
-                                   &new_dir_bh, new_dir);
+                                   &new_dir_bh, new_dir, 1);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f90c0282c114..42efe13077b6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -135,7 +135,7 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 #define FMODE_CAN_WRITE         ((__force fmode_t)0x40000)
 /* File was opened by fanotify and shouldn't generate fanotify events */
-#define FMODE_NONOTIFY          ((__force fmode_t)0x1000000)
+#define FMODE_NONOTIFY          ((__force fmode_t)0x4000000)
 /*
 * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c0c2bce6b0b7..d9d7e7e56352 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -37,6 +37,16 @@ struct anon_vma {
        atomic_t refcount;
        /*
+         * Count of child anon_vmas and VMAs which point to this anon_vma.
+         *
+         * This counter is used for making a decision about reusing anon_vma
+         * instead of forking new one. See comments in function anon_vma_clone.
+         */
+        unsigned degree;
+        struct anon_vma *parent;        /* Parent of this anon_vma */
+        /*
         * NOTE: the LSB of the rb_root.rb_node is set by
         * mm_take_all_locks() _after_ taking the above lock. So the
         * rb_root must only be read/written after taking the above lock
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a219be961c0a..00048339c23e 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -177,7 +177,6 @@ int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
                      void *data);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
-void set_page_dirty_balance(struct page *page);
 void writeback_set_ratelimit(void);
 void tag_pages_for_writeback(struct address_space *mapping,
                             pgoff_t start, pgoff_t end);
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 7543b3e51331..e063effe0cc1 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -5,7 +5,7 @@
 /*
 * FMODE_EXEC is 0x20
- * FMODE_NONOTIFY is 0x1000000
+ * FMODE_NONOTIFY is 0x4000000
 * These cannot be used by userspace O_* until internal and external open
 * flags are split.
 * -Eric Paris
diff --git a/kernel/exit.c b/kernel/exit.c
index 1ea4369890a3..6806c55475ee 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1287,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 static int wait_consider_task(struct wait_opts *wo, int ptrace,
                                struct task_struct *p)
 {
+        /*
+         * We can race with wait_task_zombie() from another thread.
+         * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
+         * can't confuse the checks below.
+         */
+        int exit_state = ACCESS_ONCE(p->exit_state);
        int ret;
-        if (unlikely(p->exit_state == EXIT_DEAD))
+        if (unlikely(exit_state == EXIT_DEAD))
                return 0;
        ret = eligible_child(wo, p);
@@ -1310,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
                return 0;
        }
-        if (unlikely(p->exit_state == EXIT_TRACE)) {
+        if (unlikely(exit_state == EXIT_TRACE)) {
                /*
                 * ptrace == 0 means we are the natural parent. In this case
                 * we should clear notask_error, debugger will notify us.
@@ -1337,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
        }
        /* slay zombie? */
-        if (p->exit_state == EXIT_ZOMBIE) {
+        if (exit_state == EXIT_ZOMBIE) {
                /* we don't reap group leaders with subthreads */
                if (!delay_group_leader(p)) {
                        /*
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 56badfc4810a..957d3da53ddd 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -14,7 +14,6 @@ config DEBUG_PAGEALLOC
        depends on !KMEMCHECK
        select PAGE_EXTENSION
        select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
-        select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
        ---help---
          Unmap pages from the kernel linear mapping after free_pages().
          This results in a large slowdown, but helps to find certain types
@@ -27,13 +26,5 @@ config DEBUG_PAGEALLOC
          that would result in incorrect warnings of memory corruption after
          a resume because free pages are not saved to the suspend image.
-config WANT_PAGE_DEBUG_FLAGS
-        bool
 config PAGE_POISONING
        bool
-        select WANT_PAGE_DEBUG_FLAGS
-config PAGE_GUARD
-        bool
-        select WANT_PAGE_DEBUG_FLAGS
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ef91e856c7e4..851924fa5170 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3043,18 +3043,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
        if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
                mem_cgroup_swap_statistics(from, false);
                mem_cgroup_swap_statistics(to, true);
-                /*
-                 * This function is only called from task migration context now.
-                 * It postpones page_counter and refcount handling till the end
-                 * of task migration(mem_cgroup_clear_mc()) for performance
-                 * improvement. But we cannot postpone css_get(to)  because if
-                 * the process that has been moved to @to does swap-in, the
-                 * refcount of @to might be decreased to 0.
-                 *
-                 * We are in attach() phase, so the cgroup is guaranteed to be
-                 * alive, so we can just call css_get().
-                 */
-                css_get(&to->css);
                return 0;
        }
        return -EINVAL;
@@ -4679,6 +4667,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        if (parent_css == NULL) {
                root_mem_cgroup = memcg;
                page_counter_init(&memcg->memory, NULL);
+                memcg->soft_limit = PAGE_COUNTER_MAX;
                page_counter_init(&memcg->memsw, NULL);
                page_counter_init(&memcg->kmem, NULL);
        }
@@ -4724,6 +4713,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
        if (parent->use_hierarchy) {
                page_counter_init(&memcg->memory, &parent->memory);
+                memcg->soft_limit = PAGE_COUNTER_MAX;
                page_counter_init(&memcg->memsw, &parent->memsw);
                page_counter_init(&memcg->kmem, &parent->kmem);
@@ -4733,6 +4723,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
                 */
        } else {
                page_counter_init(&memcg->memory, NULL);
+                memcg->soft_limit = PAGE_COUNTER_MAX;
                page_counter_init(&memcg->memsw, NULL);
                page_counter_init(&memcg->kmem, NULL);
                /*
@@ -4807,7 +4798,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
        mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
        mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
        memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
-        memcg->soft_limit = 0;
+        memcg->soft_limit = PAGE_COUNTER_MAX;
 }
 #ifdef CONFIG_MMU
diff --git a/mm/memory.c b/mm/memory.c
index d7e497e98f46..c6565f00fb38 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2137,17 +2137,24 @@ reuse:
                if (!dirty_page)
                        return ret;
-                /*
-                 * Yes, Virginia, this is actually required to prevent a race
-                 * with clear_page_dirty_for_io() from clearing the page dirty
-                 * bit after it clear all dirty ptes, but before a racing
-                 * do_wp_page installs a dirty pte.
-                 *
-                 * do_shared_fault is protected similarly.
-                 */
                if (!page_mkwrite) {
-                        wait_on_page_locked(dirty_page);
+                        struct address_space *mapping;
-                        set_page_dirty_balance(dirty_page);
+                        int dirtied;
+                        lock_page(dirty_page);
+                        dirtied = set_page_dirty(dirty_page);
+                        VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page);
+                        mapping = dirty_page->mapping;
+                        unlock_page(dirty_page);
+                        if (dirtied && mapping) {
+                                /*
+                                 * Some device drivers do not set page.mapping
+                                 * but still dirty their pages
+                                 */
+                                balance_dirty_pages_ratelimited(mapping);
+                        }
                        /* file_update_time outside page_lock */
                        if (vma->vm_file)
                                file_update_time(vma->vm_file);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d5d81f5384d1..6f4335238e33 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1541,16 +1541,6 @@ pause:
                bdi_start_background_writeback(bdi);
 }
-void set_page_dirty_balance(struct page *page)
-{
-        if (set_page_dirty(page)) {
-                struct address_space *mapping = page_mapping(page);
-                if (mapping)
-                        balance_dirty_pages_ratelimited(mapping);
-        }
-}
 static DEFINE_PER_CPU(int, bdp_ratelimits);
 /*
@@ -2123,32 +2113,25 @@ EXPORT_SYMBOL(account_page_dirtied);
 * page dirty in that case, but not all the buffers.  This is a "bottom-up"
 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
 *
- * Most callers have locked the page, which pins the address_space in memory.
+ * The caller must ensure this doesn't race with truncation.  Most will simply
- * But zap_pte_range() does not lock the page, however in that case the
+ * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
- * mapping is pinned by the vma's ->vm_file reference.
+ * the pte lock held, which also locks out truncation.
- *
- * We take care to handle the case where the page was truncated from the
- * mapping by re-checking page_mapping() inside tree_lock.
 */
 int __set_page_dirty_nobuffers(struct page *page)
 {
        if (!TestSetPageDirty(page)) {
                struct address_space *mapping = page_mapping(page);
-                struct address_space *mapping2;
                unsigned long flags;
                if (!mapping)
                        return 1;
                spin_lock_irqsave(&mapping->tree_lock, flags);
-                mapping2 = page_mapping(page);
+                BUG_ON(page_mapping(page) != mapping);
-                if (mapping2) { /* Race with truncate? */
+                WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-                        BUG_ON(mapping2 != mapping);
+                account_page_dirtied(page, mapping);
-                        WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
+                radix_tree_tag_set(&mapping->page_tree, page_index(page),
-                        account_page_dirtied(page, mapping);
+                                   PAGECACHE_TAG_DIRTY);
-                        radix_tree_tag_set(&mapping->page_tree,
-                                page_index(page), PAGECACHE_TAG_DIRTY);
-                }
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
                if (mapping->host) {
                        /* !PageAnon && !swapper_space */
@@ -2305,12 +2288,10 @@ int clear_page_dirty_for_io(struct page *page)
                /*
                 * We carefully synchronise fault handlers against
                 * installing a dirty pte and marking the page dirty
-                 * at this point. We do this by having them hold the
+                 * at this point.  We do this by having them hold the
-                 * page lock at some point after installing their
+                 * page lock while dirtying the page, and pages are
-                 * pte, but before marking the page dirty.
+                 * always locked coming in here, so we get the desired
-                 * Pages are always locked coming in here, so we get
+                 * exclusion.
-                 * the desired exclusion. See mm/memory.c:do_wp_page()
-                 * for more comments.
                 */
                if (TestClearPageDirty(page)) {
                        dec_zone_page_state(page, NR_FILE_DIRTY);
diff --git a/mm/rmap.c b/mm/rmap.c
index c5bc241127b2..71cd5bd0c17d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -72,6 +72,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
        anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
        if (anon_vma) {
                atomic_set(&anon_vma->refcount, 1);
+                anon_vma->degree = 1;   /* Reference for first vma */
+                anon_vma->parent = anon_vma;
                /*
                 * Initialise the anon_vma root to point to itself. If called
                 * from fork, the root will be reset to the parents anon_vma.
@@ -188,6 +190,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
                if (likely(!vma->anon_vma)) {
                        vma->anon_vma = anon_vma;
                        anon_vma_chain_link(vma, avc, anon_vma);
+                        /* vma reference or self-parent link for new root */
+                        anon_vma->degree++;
                        allocated = NULL;
                        avc = NULL;
                }
@@ -236,6 +240,14 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
 /*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
+ *
+ * If dst->anon_vma is NULL this function tries to find and reuse existing
+ * anon_vma which has no vmas and only one child anon_vma. This prevents
+ * degradation of anon_vma hierarchy to endless linear chain in case of
+ * constantly forking task. On the other hand, an anon_vma with more than one
+ * child isn't reused even if there was no alive vma, thus rmap walker has a
+ * good chance of avoiding scanning the whole hierarchy when it searches where
+ * page is mapped.
 */
 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
@@ -256,7 +268,21 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
                anon_vma = pavc->anon_vma;
                root = lock_anon_vma_root(root, anon_vma);
                anon_vma_chain_link(dst, avc, anon_vma);
+                /*
+                 * Reuse existing anon_vma if its degree is lower than two,
+                 * that means it has no vma and only one anon_vma child.
+                 *
+                 * Do not choose parent anon_vma, otherwise first child
+                 * will always reuse it. Root anon_vma is never reused:
+                 * it has self-parent reference and at least one child.
+                 */
+                if (!dst->anon_vma && anon_vma != src->anon_vma &&
+                                anon_vma->degree < 2)
+                        dst->anon_vma = anon_vma;
        }
+        if (dst->anon_vma)
+                dst->anon_vma->degree++;
        unlock_anon_vma_root(root);
        return 0;
@@ -280,6 +306,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
        if (!pvma->anon_vma)
                return 0;
+        /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
+        vma->anon_vma = NULL;
        /*
         * First, attach the new VMA to the parent VMA's anon_vmas,
         * so rmap can find non-COWed pages in child processes.
@@ -288,6 +317,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
        if (error)
                return error;
+        /* An existing anon_vma has been reused, all done then. */
+        if (vma->anon_vma)
+                return 0;
        /* Then add our own anon_vma. */
        anon_vma = anon_vma_alloc();
        if (!anon_vma)
@@ -301,6 +334,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
         * lock any of the anon_vmas in this anon_vma tree.
         */
        anon_vma->root = pvma->anon_vma->root;
+        anon_vma->parent = pvma->anon_vma;
        /*
         * With refcounts, an anon_vma can stay around longer than the
         * process it belongs to. The root anon_vma needs to be pinned until
@@ -311,6 +345,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
        vma->anon_vma = anon_vma;
        anon_vma_lock_write(anon_vma);
        anon_vma_chain_link(vma, avc, anon_vma);
+        anon_vma->parent->degree++;
        anon_vma_unlock_write(anon_vma);
        return 0;
@@ -341,12 +376,16 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
                 * Leave empty anon_vmas on the list - we'll need
                 * to free them outside the lock.
                 */
-                if (RB_EMPTY_ROOT(&anon_vma->rb_root))
+                if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
+                        anon_vma->parent->degree--;
                        continue;
+                }
                list_del(&avc->same_vma);
                anon_vma_chain_free(avc);
        }
+        if (vma->anon_vma)
+                vma->anon_vma->degree--;
        unlock_anon_vma_root(root);
        /*
@@ -357,6 +396,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
        list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma = avc->anon_vma;
+                BUG_ON(anon_vma->degree);
                put_anon_vma(anon_vma);
                list_del(&avc->same_vma);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd9a72bc4a1b..ab2505c3ef54 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2921,18 +2921,20 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
                return false;
        /*
-         * There is a potential race between when kswapd checks its watermarks
+         * The throttled processes are normally woken up in balance_pgdat() as
-         * and a process gets throttled. There is also a potential race if
+         * soon as pfmemalloc_watermark_ok() is true. But there is a potential
-         * processes get throttled, kswapd wakes, a large process exits therby
+         * race between when kswapd checks the watermarks and a process gets
-         * balancing the zones that causes kswapd to miss a wakeup. If kswapd
+         * throttled. There is also a potential race if processes get
-         * is going to sleep, no process should be sleeping on pfmemalloc_wait
+         * throttled, kswapd wakes, a large process exits thereby balancing the
-         * so wake them now if necessary. If necessary, processes will wake
+         * zones, which causes kswapd to exit balance_pgdat() before reaching
-         * kswapd and get throttled again
+         * the wake up checks. If kswapd is going to sleep, no process should
+         * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
+         * the wake up is premature, processes will wake kswapd and get
+         * throttled again. The difference from wake ups in balance_pgdat() is
+         * that here we are under prepare_to_wait().
         */
-        if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
+        if (waitqueue_active(&pgdat->pfmemalloc_wait))
-                wake_up(&pgdat->pfmemalloc_wait);
+                wake_up_all(&pgdat->pfmemalloc_wait);
-                return false;
-        }
        return pgdat_balanced(pgdat, order, classzone_idx);
 }