Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c           2
-rw-r--r--  mm/hugetlb.c           6
-rw-r--r--  mm/init-mm.c           6
-rw-r--r--  mm/ksm.c              71
-rw-r--r--  mm/memcontrol.c       55
-rw-r--r--  mm/memory.c           12
-rw-r--r--  mm/mempolicy.c        82
-rw-r--r--  mm/migrate.c          10
-rw-r--r--  mm/mmap.c             44
-rw-r--r--  mm/oom_kill.c        683
-rw-r--r--  mm/page-writeback.c   70
-rw-r--r--  mm/page_alloc.c       33
-rw-r--r--  mm/rmap.c            127
-rw-r--r--  mm/shmem.c           110
-rw-r--r--  mm/slab.c              2
-rw-r--r--  mm/swapfile.c        100
-rw-r--r--  mm/util.c             11
-rw-r--r--  mm/vmalloc.c           7
-rw-r--r--  mm/vmscan.c          533
-rw-r--r--  mm/vmstat.c            8
20 files changed, 1165 insertions, 807 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 20e5642e9f9f..3d4df44e4221 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2238,14 +2238,12 @@ static ssize_t generic_perform_write(struct file *file,
2238 2238
2239 do { 2239 do {
2240 struct page *page; 2240 struct page *page;
2241 pgoff_t index; /* Pagecache index for current page */
2242 unsigned long offset; /* Offset into pagecache page */ 2241 unsigned long offset; /* Offset into pagecache page */
2243 unsigned long bytes; /* Bytes to write to page */ 2242 unsigned long bytes; /* Bytes to write to page */
2244 size_t copied; /* Bytes copied from user */ 2243 size_t copied; /* Bytes copied from user */
2245 void *fsdata; 2244 void *fsdata;
2246 2245
2247 offset = (pos & (PAGE_CACHE_SIZE - 1)); 2246 offset = (pos & (PAGE_CACHE_SIZE - 1));
2248 index = pos >> PAGE_CACHE_SHIFT;
2249 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, 2247 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2250 iov_iter_count(i)); 2248 iov_iter_count(i));
2251 2249
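
The filemap.c hunk only drops a pagecache index that nothing used anymore; the arithmetic that survives is what actually splits a write into per-page chunks. A small standalone sketch of that chunking, in plain userspace C with assumed values (a 4 KB page standing in for PAGE_CACHE_SIZE, a 10000-byte write starting at offset 1000):

	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	int main(void)
	{
		unsigned long pos = 1000;	/* file position the write starts at */
		unsigned long count = 10000;	/* bytes left in the iov_iter */

		while (count) {
			unsigned long offset = pos & (PAGE_SIZE - 1);	/* offset into this page */
			unsigned long bytes = PAGE_SIZE - offset;	/* room left in the page */
			if (bytes > count)
				bytes = count;				/* the kernel uses min_t() */
			printf("page %lu: copy %lu bytes at offset %lu\n",
			       pos / PAGE_SIZE, bytes, offset);
			pos += bytes;
			count -= bytes;
		}
		return 0;
	}
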
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54d42b009dbe..b61d2db9f34e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2349,11 +2349,17 @@ retry_avoidcopy:
2349 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2349 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2350 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2350 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2351 /* Break COW */ 2351 /* Break COW */
2352 mmu_notifier_invalidate_range_start(mm,
2353 address & huge_page_mask(h),
2354 (address & huge_page_mask(h)) + huge_page_size(h));
2352 huge_ptep_clear_flush(vma, address, ptep); 2355 huge_ptep_clear_flush(vma, address, ptep);
2353 set_huge_pte_at(mm, address, ptep, 2356 set_huge_pte_at(mm, address, ptep,
2354 make_huge_pte(vma, new_page, 1)); 2357 make_huge_pte(vma, new_page, 1));
2355 /* Make the old page be freed below */ 2358 /* Make the old page be freed below */
2356 new_page = old_page; 2359 new_page = old_page;
2360 mmu_notifier_invalidate_range_end(mm,
2361 address & huge_page_mask(h),
2362 (address & huge_page_mask(h)) + huge_page_size(h));
2357 } 2363 }
2358 page_cache_release(new_page); 2364 page_cache_release(new_page);
2359 page_cache_release(old_page); 2365 page_cache_release(old_page);
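
The hugetlb COW path now tells secondary MMUs (via mmu_notifiers) that the huge PTE is about to change and when the change is done. A minimal sketch of the bracketing the hunk adds, with the repeated mask arithmetic hoisted into locals; this only restates the pattern, using the same start/end-based notifier calls the hunk itself uses:

	/* Sketch only: the invalidation window must cover the whole huge page. */
	unsigned long start = address & huge_page_mask(h);
	unsigned long end = start + huge_page_size(h);

	mmu_notifier_invalidate_range_start(mm, start, end);
	huge_ptep_clear_flush(vma, address, ptep);		/* old mapping is gone */
	set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1));
	mmu_notifier_invalidate_range_end(mm, start, end);	/* secondary MMUs may refault now */
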
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 57aba0da9668..1d29cdfe8ebb 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -7,6 +7,11 @@
7 7
8#include <asm/atomic.h> 8#include <asm/atomic.h>
9#include <asm/pgtable.h> 9#include <asm/pgtable.h>
10#include <asm/mmu.h>
11
12#ifndef INIT_MM_CONTEXT
13#define INIT_MM_CONTEXT(name)
14#endif
10 15
11struct mm_struct init_mm = { 16struct mm_struct init_mm = {
12 .mm_rb = RB_ROOT, 17 .mm_rb = RB_ROOT,
@@ -17,4 +22,5 @@ struct mm_struct init_mm = {
17 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), 22 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
18 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 23 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
19 .cpu_vm_mask = CPU_MASK_ALL, 24 .cpu_vm_mask = CPU_MASK_ALL,
25 INIT_MM_CONTEXT(init_mm)
20}; 26};
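
INIT_MM_CONTEXT() lets an architecture splice extra initializers into init_mm at build time; the default added above expands to nothing. A purely hypothetical override, to show the shape only (the field and lock names are illustrative, not any real architecture's mm_context_t):

	/* Hypothetical arch header, e.g. arch/foo/include/asm/mmu.h */
	#define INIT_MM_CONTEXT(name)						\
		.context = {							\
			.lock = __SPIN_LOCK_UNLOCKED(name.context.lock),	\
		},
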
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c3e99b4ae7c..e2ae00458320 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,6 +33,7 @@
33#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h>
36 37
37#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
38#include "internal.h" 39#include "internal.h"
@@ -153,8 +154,9 @@ struct rmap_item {
153static struct rb_root root_stable_tree = RB_ROOT; 154static struct rb_root root_stable_tree = RB_ROOT;
154static struct rb_root root_unstable_tree = RB_ROOT; 155static struct rb_root root_unstable_tree = RB_ROOT;
155 156
156#define MM_SLOTS_HASH_HEADS 1024 157#define MM_SLOTS_HASH_SHIFT 10
157static struct hlist_head *mm_slots_hash; 158#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
159static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
158 160
159static struct mm_slot ksm_mm_head = { 161static struct mm_slot ksm_mm_head = {
160 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), 162 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -269,28 +271,13 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
269 kmem_cache_free(mm_slot_cache, mm_slot); 271 kmem_cache_free(mm_slot_cache, mm_slot);
270} 272}
271 273
272static int __init mm_slots_hash_init(void)
273{
274 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
275 GFP_KERNEL);
276 if (!mm_slots_hash)
277 return -ENOMEM;
278 return 0;
279}
280
281static void __init mm_slots_hash_free(void)
282{
283 kfree(mm_slots_hash);
284}
285
286static struct mm_slot *get_mm_slot(struct mm_struct *mm) 274static struct mm_slot *get_mm_slot(struct mm_struct *mm)
287{ 275{
288 struct mm_slot *mm_slot; 276 struct mm_slot *mm_slot;
289 struct hlist_head *bucket; 277 struct hlist_head *bucket;
290 struct hlist_node *node; 278 struct hlist_node *node;
291 279
292 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 280 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
293 % MM_SLOTS_HASH_HEADS];
294 hlist_for_each_entry(mm_slot, node, bucket, link) { 281 hlist_for_each_entry(mm_slot, node, bucket, link) {
295 if (mm == mm_slot->mm) 282 if (mm == mm_slot->mm)
296 return mm_slot; 283 return mm_slot;
@@ -303,8 +290,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
303{ 290{
304 struct hlist_head *bucket; 291 struct hlist_head *bucket;
305 292
306 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 293 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
307 % MM_SLOTS_HASH_HEADS];
308 mm_slot->mm = mm; 294 mm_slot->mm = mm;
309 hlist_add_head(&mm_slot->link, bucket); 295 hlist_add_head(&mm_slot->link, bucket);
310} 296}
@@ -318,19 +304,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item,
318 struct anon_vma *anon_vma) 304 struct anon_vma *anon_vma)
319{ 305{
320 rmap_item->anon_vma = anon_vma; 306 rmap_item->anon_vma = anon_vma;
321 atomic_inc(&anon_vma->external_refcount); 307 get_anon_vma(anon_vma);
322} 308}
323 309
324static void drop_anon_vma(struct rmap_item *rmap_item) 310static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
325{ 311{
326 struct anon_vma *anon_vma = rmap_item->anon_vma; 312 struct anon_vma *anon_vma = rmap_item->anon_vma;
327 313
328 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { 314 drop_anon_vma(anon_vma);
329 int empty = list_empty(&anon_vma->head);
330 spin_unlock(&anon_vma->lock);
331 if (empty)
332 anon_vma_free(anon_vma);
333 }
334} 315}
335 316
336/* 317/*
@@ -415,7 +396,7 @@ static void break_cow(struct rmap_item *rmap_item)
415 * It is not an accident that whenever we want to break COW 396 * It is not an accident that whenever we want to break COW
416 * to undo, we also need to drop a reference to the anon_vma. 397 * to undo, we also need to drop a reference to the anon_vma.
417 */ 398 */
418 drop_anon_vma(rmap_item); 399 ksm_drop_anon_vma(rmap_item);
419 400
420 down_read(&mm->mmap_sem); 401 down_read(&mm->mmap_sem);
421 if (ksm_test_exit(mm)) 402 if (ksm_test_exit(mm))
@@ -470,7 +451,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
470 ksm_pages_sharing--; 451 ksm_pages_sharing--;
471 else 452 else
472 ksm_pages_shared--; 453 ksm_pages_shared--;
473 drop_anon_vma(rmap_item); 454 ksm_drop_anon_vma(rmap_item);
474 rmap_item->address &= PAGE_MASK; 455 rmap_item->address &= PAGE_MASK;
475 cond_resched(); 456 cond_resched();
476 } 457 }
@@ -558,7 +539,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
558 else 539 else
559 ksm_pages_shared--; 540 ksm_pages_shared--;
560 541
561 drop_anon_vma(rmap_item); 542 ksm_drop_anon_vma(rmap_item);
562 rmap_item->address &= PAGE_MASK; 543 rmap_item->address &= PAGE_MASK;
563 544
564 } else if (rmap_item->address & UNSTABLE_FLAG) { 545 } else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -1566,7 +1547,7 @@ again:
1566 struct anon_vma_chain *vmac; 1547 struct anon_vma_chain *vmac;
1567 struct vm_area_struct *vma; 1548 struct vm_area_struct *vma;
1568 1549
1569 spin_lock(&anon_vma->lock); 1550 anon_vma_lock(anon_vma);
1570 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1551 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1571 vma = vmac->vma; 1552 vma = vmac->vma;
1572 if (rmap_item->address < vma->vm_start || 1553 if (rmap_item->address < vma->vm_start ||
@@ -1589,7 +1570,7 @@ again:
1589 if (!search_new_forks || !mapcount) 1570 if (!search_new_forks || !mapcount)
1590 break; 1571 break;
1591 } 1572 }
1592 spin_unlock(&anon_vma->lock); 1573 anon_vma_unlock(anon_vma);
1593 if (!mapcount) 1574 if (!mapcount)
1594 goto out; 1575 goto out;
1595 } 1576 }
@@ -1619,7 +1600,7 @@ again:
1619 struct anon_vma_chain *vmac; 1600 struct anon_vma_chain *vmac;
1620 struct vm_area_struct *vma; 1601 struct vm_area_struct *vma;
1621 1602
1622 spin_lock(&anon_vma->lock); 1603 anon_vma_lock(anon_vma);
1623 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1604 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1624 vma = vmac->vma; 1605 vma = vmac->vma;
1625 if (rmap_item->address < vma->vm_start || 1606 if (rmap_item->address < vma->vm_start ||
@@ -1637,11 +1618,11 @@ again:
1637 ret = try_to_unmap_one(page, vma, 1618 ret = try_to_unmap_one(page, vma,
1638 rmap_item->address, flags); 1619 rmap_item->address, flags);
1639 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1620 if (ret != SWAP_AGAIN || !page_mapped(page)) {
1640 spin_unlock(&anon_vma->lock); 1621 anon_vma_unlock(anon_vma);
1641 goto out; 1622 goto out;
1642 } 1623 }
1643 } 1624 }
1644 spin_unlock(&anon_vma->lock); 1625 anon_vma_unlock(anon_vma);
1645 } 1626 }
1646 if (!search_new_forks++) 1627 if (!search_new_forks++)
1647 goto again; 1628 goto again;
@@ -1671,7 +1652,7 @@ again:
1671 struct anon_vma_chain *vmac; 1652 struct anon_vma_chain *vmac;
1672 struct vm_area_struct *vma; 1653 struct vm_area_struct *vma;
1673 1654
1674 spin_lock(&anon_vma->lock); 1655 anon_vma_lock(anon_vma);
1675 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1656 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1676 vma = vmac->vma; 1657 vma = vmac->vma;
1677 if (rmap_item->address < vma->vm_start || 1658 if (rmap_item->address < vma->vm_start ||
@@ -1688,11 +1669,11 @@ again:
1688 1669
1689 ret = rmap_one(page, vma, rmap_item->address, arg); 1670 ret = rmap_one(page, vma, rmap_item->address, arg);
1690 if (ret != SWAP_AGAIN) { 1671 if (ret != SWAP_AGAIN) {
1691 spin_unlock(&anon_vma->lock); 1672 anon_vma_unlock(anon_vma);
1692 goto out; 1673 goto out;
1693 } 1674 }
1694 } 1675 }
1695 spin_unlock(&anon_vma->lock); 1676 anon_vma_unlock(anon_vma);
1696 } 1677 }
1697 if (!search_new_forks++) 1678 if (!search_new_forks++)
1698 goto again; 1679 goto again;
@@ -1943,15 +1924,11 @@ static int __init ksm_init(void)
1943 if (err) 1924 if (err)
1944 goto out; 1925 goto out;
1945 1926
1946 err = mm_slots_hash_init();
1947 if (err)
1948 goto out_free1;
1949
1950 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); 1927 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
1951 if (IS_ERR(ksm_thread)) { 1928 if (IS_ERR(ksm_thread)) {
1952 printk(KERN_ERR "ksm: creating kthread failed\n"); 1929 printk(KERN_ERR "ksm: creating kthread failed\n");
1953 err = PTR_ERR(ksm_thread); 1930 err = PTR_ERR(ksm_thread);
1954 goto out_free2; 1931 goto out_free;
1955 } 1932 }
1956 1933
1957#ifdef CONFIG_SYSFS 1934#ifdef CONFIG_SYSFS
@@ -1959,7 +1936,7 @@ static int __init ksm_init(void)
1959 if (err) { 1936 if (err) {
1960 printk(KERN_ERR "ksm: register sysfs failed\n"); 1937 printk(KERN_ERR "ksm: register sysfs failed\n");
1961 kthread_stop(ksm_thread); 1938 kthread_stop(ksm_thread);
1962 goto out_free2; 1939 goto out_free;
1963 } 1940 }
1964#else 1941#else
1965 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ 1942 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
@@ -1975,9 +1952,7 @@ static int __init ksm_init(void)
1975#endif 1952#endif
1976 return 0; 1953 return 0;
1977 1954
1978out_free2: 1955out_free:
1979 mm_slots_hash_free();
1980out_free1:
1981 ksm_slab_free(); 1956 ksm_slab_free();
1982out: 1957out:
1983 return err; 1958 return err;
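
The ksm.c hunks replace a kzalloc'd mm_slots hash, indexed by a hand-rolled divide-and-modulo of the mm pointer, with a static array indexed through hash_ptr() from linux/hash.h, which is why the init/free helpers and the extra error label disappear. A minimal sketch of the resulting pattern, with generic names:

	#include <linux/hash.h>
	#include <linux/list.h>

	#define SLOTS_HASH_SHIFT 10				/* 1024 buckets */
	static struct hlist_head slots_hash[1 << SLOTS_HASH_SHIFT];

	/* hash_ptr() mixes all the pointer bits; dividing by
	 * sizeof(struct mm_struct) and taking a modulo, as the old code did,
	 * clusters slab-allocated mm_structs into a few buckets. */
	static struct hlist_head *slot_bucket(struct mm_struct *mm)
	{
		return &slots_hash[hash_ptr(mm, SLOTS_HASH_SHIFT)];
	}
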
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a8193a7af8..0576e9e64586 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,6 +51,8 @@
51 51
52#include <asm/uaccess.h> 52#include <asm/uaccess.h>
53 53
54#include <trace/events/vmscan.h>
55
54struct cgroup_subsys mem_cgroup_subsys __read_mostly; 56struct cgroup_subsys mem_cgroup_subsys __read_mostly;
55#define MEM_CGROUP_RECLAIM_RETRIES 5 57#define MEM_CGROUP_RECLAIM_RETRIES 5
56struct mem_cgroup *root_mem_cgroup __read_mostly; 58struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -211,8 +213,6 @@ struct mem_cgroup {
211 */ 213 */
212 spinlock_t reclaim_param_lock; 214 spinlock_t reclaim_param_lock;
213 215
214 int prev_priority; /* for recording reclaim priority */
215
216 /* 216 /*
217 * While reclaiming in a hierarchy, we cache the last child we 217 * While reclaiming in a hierarchy, we cache the last child we
218 * reclaimed from. 218 * reclaimed from.
@@ -858,35 +858,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
858 return ret; 858 return ret;
859} 859}
860 860
861/*
862 * prev_priority control...this will be used in memory reclaim path.
863 */
864int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
865{
866 int prev_priority;
867
868 spin_lock(&mem->reclaim_param_lock);
869 prev_priority = mem->prev_priority;
870 spin_unlock(&mem->reclaim_param_lock);
871
872 return prev_priority;
873}
874
875void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
876{
877 spin_lock(&mem->reclaim_param_lock);
878 if (priority < mem->prev_priority)
879 mem->prev_priority = priority;
880 spin_unlock(&mem->reclaim_param_lock);
881}
882
883void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
884{
885 spin_lock(&mem->reclaim_param_lock);
886 mem->prev_priority = priority;
887 spin_unlock(&mem->reclaim_param_lock);
888}
889
890static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 861static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
891{ 862{
892 unsigned long active; 863 unsigned long active;
@@ -1038,6 +1009,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1038 } 1009 }
1039 1010
1040 *scanned = scan; 1011 *scanned = scan;
1012
1013 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1014 0, 0, 0, mode);
1015
1041 return nr_taken; 1016 return nr_taken;
1042} 1017}
1043 1018
@@ -1158,6 +1133,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem)
1158} 1133}
1159 1134
1160/* 1135/*
1136 * Return the memory (and swap, if configured) limit for a memcg.
1137 */
1138u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1139{
1140 u64 limit;
1141 u64 memsw;
1142
1143 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
1144 total_swap_pages;
1145 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1146 /*
1147 * If memsw is finite and limits the amount of swap space available
1148 * to this memcg, return that limit.
1149 */
1150 return min(limit, memsw);
1151}
1152
1153/*
1161 * Visit the first child (need not be the first child as per the ordering 1154 * Visit the first child (need not be the first child as per the ordering
1162 * of the cgroup list, since we track last_scanned_child) of @mem and use 1155 * of the cgroup list, since we track last_scanned_child) of @mem and use
1163 * that to reclaim free pages from. 1156 * that to reclaim free pages from.
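
The new mem_cgroup_get_limit() gives the OOM killer a "total allowed" figure: the memcg's memory limit plus swap, capped by the mem+swap limit when one is configured. A rough worked example in plain userspace C, with assumed limits so the min() rule can be checked by hand:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t mem_limit   = 512ULL << 20;	/* assumed memory limit: 512 MB */
		uint64_t total_swap  = 2ULL << 30;	/* assumed swap on the host: 2 GB */
		uint64_t memsw_limit = 1ULL << 30;	/* assumed mem+swap limit: 1 GB */

		uint64_t limit = mem_limit + total_swap;		/* 2.5 GB */
		uint64_t total = limit < memsw_limit ? limit : memsw_limit;

		printf("OOM badness denominator: %llu MB\n",
		       (unsigned long long)(total >> 20));		/* 1024 MB */
		return 0;
	}
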
diff --git a/mm/memory.c b/mm/memory.c
index bde42c6d3633..858829d06a92 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -307,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb,
307{ 307{
308 pgd_t *pgd; 308 pgd_t *pgd;
309 unsigned long next; 309 unsigned long next;
310 unsigned long start;
311 310
312 /* 311 /*
313 * The next few lines have given us lots of grief... 312 * The next few lines have given us lots of grief...
@@ -351,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb,
351 if (addr > end - 1) 350 if (addr > end - 1)
352 return; 351 return;
353 352
354 start = addr;
355 pgd = pgd_offset(tlb->mm, addr); 353 pgd = pgd_offset(tlb->mm, addr);
356 do { 354 do {
357 next = pgd_addr_end(addr, end); 355 next = pgd_addr_end(addr, end);
@@ -2008,11 +2006,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2008{ 2006{
2009 pgd_t *pgd; 2007 pgd_t *pgd;
2010 unsigned long next; 2008 unsigned long next;
2011 unsigned long start = addr, end = addr + size; 2009 unsigned long end = addr + size;
2012 int err; 2010 int err;
2013 2011
2014 BUG_ON(addr >= end); 2012 BUG_ON(addr >= end);
2015 mmu_notifier_invalidate_range_start(mm, start, end);
2016 pgd = pgd_offset(mm, addr); 2013 pgd = pgd_offset(mm, addr);
2017 do { 2014 do {
2018 next = pgd_addr_end(addr, end); 2015 next = pgd_addr_end(addr, end);
@@ -2020,7 +2017,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2020 if (err) 2017 if (err)
2021 break; 2018 break;
2022 } while (pgd++, addr = next, addr != end); 2019 } while (pgd++, addr = next, addr != end);
2023 mmu_notifier_invalidate_range_end(mm, start, end); 2020
2024 return err; 2021 return err;
2025} 2022}
2026EXPORT_SYMBOL_GPL(apply_to_page_range); 2023EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -2630,6 +2627,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2630 swp_entry_t entry; 2627 swp_entry_t entry;
2631 pte_t pte; 2628 pte_t pte;
2632 struct mem_cgroup *ptr = NULL; 2629 struct mem_cgroup *ptr = NULL;
2630 int exclusive = 0;
2633 int ret = 0; 2631 int ret = 0;
2634 2632
2635 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2633 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2724,10 +2722,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2724 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 2722 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2725 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2723 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2726 flags &= ~FAULT_FLAG_WRITE; 2724 flags &= ~FAULT_FLAG_WRITE;
2725 ret |= VM_FAULT_WRITE;
2726 exclusive = 1;
2727 } 2727 }
2728 flush_icache_page(vma, page); 2728 flush_icache_page(vma, page);
2729 set_pte_at(mm, address, page_table, pte); 2729 set_pte_at(mm, address, page_table, pte);
2730 page_add_anon_rmap(page, vma, address); 2730 do_page_add_anon_rmap(page, vma, address, exclusive);
2731 /* It's better to call commit-charge after rmap is established */ 2731 /* It's better to call commit-charge after rmap is established */
2732 mem_cgroup_commit_charge_swapin(page, ptr); 2732 mem_cgroup_commit_charge_swapin(page, ptr);
2733 2733
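
do_swap_page() can now tell the rmap layer when a swapped-in page is known to be exclusively owned (the write-fault case where reuse_swap_page() succeeded). A sketch of the wrapper relationship this presumably relies on in mm/rmap.c of the same series, where the old entry point keeps its behaviour and only swapin asserts exclusivity:

	/* Sketch: existing callers stay unchanged and never claim exclusivity. */
	void page_add_anon_rmap(struct page *page,
				struct vm_area_struct *vma, unsigned long address)
	{
		do_page_add_anon_rmap(page, vma, address, 0);
	}
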
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5bc0a96beb51..f969da5dd8a2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1275,33 +1275,42 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1275 const unsigned long __user *, new_nodes) 1275 const unsigned long __user *, new_nodes)
1276{ 1276{
1277 const struct cred *cred = current_cred(), *tcred; 1277 const struct cred *cred = current_cred(), *tcred;
1278 struct mm_struct *mm; 1278 struct mm_struct *mm = NULL;
1279 struct task_struct *task; 1279 struct task_struct *task;
1280 nodemask_t old;
1281 nodemask_t new;
1282 nodemask_t task_nodes; 1280 nodemask_t task_nodes;
1283 int err; 1281 int err;
1282 nodemask_t *old;
1283 nodemask_t *new;
1284 NODEMASK_SCRATCH(scratch);
1285
1286 if (!scratch)
1287 return -ENOMEM;
1288
1289 old = &scratch->mask1;
1290 new = &scratch->mask2;
1284 1291
1285 err = get_nodes(&old, old_nodes, maxnode); 1292 err = get_nodes(old, old_nodes, maxnode);
1286 if (err) 1293 if (err)
1287 return err; 1294 goto out;
1288 1295
1289 err = get_nodes(&new, new_nodes, maxnode); 1296 err = get_nodes(new, new_nodes, maxnode);
1290 if (err) 1297 if (err)
1291 return err; 1298 goto out;
1292 1299
1293 /* Find the mm_struct */ 1300 /* Find the mm_struct */
1294 read_lock(&tasklist_lock); 1301 read_lock(&tasklist_lock);
1295 task = pid ? find_task_by_vpid(pid) : current; 1302 task = pid ? find_task_by_vpid(pid) : current;
1296 if (!task) { 1303 if (!task) {
1297 read_unlock(&tasklist_lock); 1304 read_unlock(&tasklist_lock);
1298 return -ESRCH; 1305 err = -ESRCH;
1306 goto out;
1299 } 1307 }
1300 mm = get_task_mm(task); 1308 mm = get_task_mm(task);
1301 read_unlock(&tasklist_lock); 1309 read_unlock(&tasklist_lock);
1302 1310
1311 err = -EINVAL;
1303 if (!mm) 1312 if (!mm)
1304 return -EINVAL; 1313 goto out;
1305 1314
1306 /* 1315 /*
1307 * Check if this process has the right to modify the specified 1316 * Check if this process has the right to modify the specified
@@ -1322,12 +1331,12 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1322 1331
1323 task_nodes = cpuset_mems_allowed(task); 1332 task_nodes = cpuset_mems_allowed(task);
1324 /* Is the user allowed to access the target nodes? */ 1333 /* Is the user allowed to access the target nodes? */
1325 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { 1334 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1326 err = -EPERM; 1335 err = -EPERM;
1327 goto out; 1336 goto out;
1328 } 1337 }
1329 1338
1330 if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { 1339 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1331 err = -EINVAL; 1340 err = -EINVAL;
1332 goto out; 1341 goto out;
1333 } 1342 }
@@ -1336,10 +1345,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1336 if (err) 1345 if (err)
1337 goto out; 1346 goto out;
1338 1347
1339 err = do_migrate_pages(mm, &old, &new, 1348 err = do_migrate_pages(mm, old, new,
1340 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1349 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1341out: 1350out:
1342 mmput(mm); 1351 if (mm)
1352 mmput(mm);
1353 NODEMASK_SCRATCH_FREE(scratch);
1354
1343 return err; 1355 return err;
1344} 1356}
1345 1357
@@ -1712,6 +1724,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
1712} 1724}
1713#endif 1725#endif
1714 1726
1727/*
1728 * mempolicy_nodemask_intersects
1729 *
1730 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1731 * policy. Otherwise, check for intersection between mask and the policy
1732 * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local'
1733 * policy, always return true since it may allocate elsewhere on fallback.
1734 *
1735 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1736 */
1737bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1738 const nodemask_t *mask)
1739{
1740 struct mempolicy *mempolicy;
1741 bool ret = true;
1742
1743 if (!mask)
1744 return ret;
1745 task_lock(tsk);
1746 mempolicy = tsk->mempolicy;
1747 if (!mempolicy)
1748 goto out;
1749
1750 switch (mempolicy->mode) {
1751 case MPOL_PREFERRED:
1752 /*
1753 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1754 * allocate from, they may fallback to other nodes when oom.
1755 * Thus, it's possible for tsk to have allocated memory from
1756 * nodes in mask.
1757 */
1758 break;
1759 case MPOL_BIND:
1760 case MPOL_INTERLEAVE:
1761 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1762 break;
1763 default:
1764 BUG();
1765 }
1766out:
1767 task_unlock(tsk);
1768 return ret;
1769}
1770
1715/* Allocate a page in interleaved policy. 1771/* Allocate a page in interleaved policy.
1716 Own path because it needs to do special accounting. */ 1772 Own path because it needs to do special accounting. */
1717static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1773static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
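
Two things change in the migrate_pages() syscall hunk: the on-stack nodemask_t copies become heap-backed NODEMASK_SCRATCH masks (nodemask_t can be large on big-NUMA configs), and all error paths funnel through one out label that knows whether mm was taken. A minimal sketch of the scratch pattern with a hypothetical helper name and the syscall-specific logic elided:

	/* Sketch: mask1/mask2 come from struct nodemask_scratch. */
	static int frob_nodemask(const unsigned long __user *unodes,
				 unsigned long maxnode)
	{
		NODEMASK_SCRATCH(scratch);	/* kmalloc'd when nodemasks are big */
		nodemask_t *mask;
		int err;

		if (!scratch)
			return -ENOMEM;
		mask = &scratch->mask1;

		err = get_nodes(mask, unodes, maxnode);	/* fill from userspace */
		/* ... act on *mask ... */

		NODEMASK_SCRATCH_FREE(scratch);
		return err;
	}
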
diff --git a/mm/migrate.c b/mm/migrate.c
index 4205b1d6049e..38e7cad782f4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -639,7 +639,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
639 * exist when the page is remapped later 639 * exist when the page is remapped later
640 */ 640 */
641 anon_vma = page_anon_vma(page); 641 anon_vma = page_anon_vma(page);
642 atomic_inc(&anon_vma->external_refcount); 642 get_anon_vma(anon_vma);
643 } 643 }
644 } 644 }
645 645
@@ -682,12 +682,8 @@ skip_unmap:
682rcu_unlock: 682rcu_unlock:
683 683
684 /* Drop an anon_vma reference if we took one */ 684 /* Drop an anon_vma reference if we took one */
685 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { 685 if (anon_vma)
686 int empty = list_empty(&anon_vma->head); 686 drop_anon_vma(anon_vma);
687 spin_unlock(&anon_vma->lock);
688 if (empty)
689 anon_vma_free(anon_vma);
690 }
691 687
692 if (rcu_locked) 688 if (rcu_locked)
693 rcu_read_unlock(); 689 rcu_read_unlock();
diff --git a/mm/mmap.c b/mm/mmap.c
index e38e910cb756..31003338b978 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -452,12 +452,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
452 spin_lock(&mapping->i_mmap_lock); 452 spin_lock(&mapping->i_mmap_lock);
453 vma->vm_truncate_count = mapping->truncate_count; 453 vma->vm_truncate_count = mapping->truncate_count;
454 } 454 }
455 anon_vma_lock(vma);
456 455
457 __vma_link(mm, vma, prev, rb_link, rb_parent); 456 __vma_link(mm, vma, prev, rb_link, rb_parent);
458 __vma_link_file(vma); 457 __vma_link_file(vma);
459 458
460 anon_vma_unlock(vma);
461 if (mapping) 459 if (mapping)
462 spin_unlock(&mapping->i_mmap_lock); 460 spin_unlock(&mapping->i_mmap_lock);
463 461
@@ -506,6 +504,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
506 struct vm_area_struct *importer = NULL; 504 struct vm_area_struct *importer = NULL;
507 struct address_space *mapping = NULL; 505 struct address_space *mapping = NULL;
508 struct prio_tree_root *root = NULL; 506 struct prio_tree_root *root = NULL;
507 struct anon_vma *anon_vma = NULL;
509 struct file *file = vma->vm_file; 508 struct file *file = vma->vm_file;
510 long adjust_next = 0; 509 long adjust_next = 0;
511 int remove_next = 0; 510 int remove_next = 0;
@@ -578,6 +577,17 @@ again: remove_next = 1 + (end > next->vm_end);
578 } 577 }
579 } 578 }
580 579
580 /*
581 * When changing only vma->vm_end, we don't really need anon_vma
582 * lock. This is a fairly rare case by itself, but the anon_vma
583 * lock may be shared between many sibling processes. Skipping
584 * the lock for brk adjustments makes a difference sometimes.
585 */
586 if (vma->anon_vma && (insert || importer || start != vma->vm_start)) {
587 anon_vma = vma->anon_vma;
588 anon_vma_lock(anon_vma);
589 }
590
581 if (root) { 591 if (root) {
582 flush_dcache_mmap_lock(mapping); 592 flush_dcache_mmap_lock(mapping);
583 vma_prio_tree_remove(vma, root); 593 vma_prio_tree_remove(vma, root);
@@ -617,6 +627,8 @@ again: remove_next = 1 + (end > next->vm_end);
617 __insert_vm_struct(mm, insert); 627 __insert_vm_struct(mm, insert);
618 } 628 }
619 629
630 if (anon_vma)
631 anon_vma_unlock(anon_vma);
620 if (mapping) 632 if (mapping)
621 spin_unlock(&mapping->i_mmap_lock); 633 spin_unlock(&mapping->i_mmap_lock);
622 634
@@ -1710,7 +1722,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1710 */ 1722 */
1711 if (unlikely(anon_vma_prepare(vma))) 1723 if (unlikely(anon_vma_prepare(vma)))
1712 return -ENOMEM; 1724 return -ENOMEM;
1713 anon_vma_lock(vma); 1725 vma_lock_anon_vma(vma);
1714 1726
1715 /* 1727 /*
1716 * vma->vm_start/vm_end cannot change under us because the caller 1728 * vma->vm_start/vm_end cannot change under us because the caller
@@ -1721,7 +1733,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1721 if (address < PAGE_ALIGN(address+4)) 1733 if (address < PAGE_ALIGN(address+4))
1722 address = PAGE_ALIGN(address+4); 1734 address = PAGE_ALIGN(address+4);
1723 else { 1735 else {
1724 anon_vma_unlock(vma); 1736 vma_unlock_anon_vma(vma);
1725 return -ENOMEM; 1737 return -ENOMEM;
1726 } 1738 }
1727 error = 0; 1739 error = 0;
@@ -1739,7 +1751,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1739 perf_event_mmap(vma); 1751 perf_event_mmap(vma);
1740 } 1752 }
1741 } 1753 }
1742 anon_vma_unlock(vma); 1754 vma_unlock_anon_vma(vma);
1743 return error; 1755 return error;
1744} 1756}
1745#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1757#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1764,7 +1776,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1764 if (error) 1776 if (error)
1765 return error; 1777 return error;
1766 1778
1767 anon_vma_lock(vma); 1779 vma_lock_anon_vma(vma);
1768 1780
1769 /* 1781 /*
1770 * vma->vm_start/vm_end cannot change under us because the caller 1782 * vma->vm_start/vm_end cannot change under us because the caller
@@ -1786,7 +1798,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1786 perf_event_mmap(vma); 1798 perf_event_mmap(vma);
1787 } 1799 }
1788 } 1800 }
1789 anon_vma_unlock(vma); 1801 vma_unlock_anon_vma(vma);
1790 return error; 1802 return error;
1791} 1803}
1792 1804
@@ -2470,23 +2482,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
2470 2482
2471static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2483static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2472{ 2484{
2473 if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { 2485 if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
2474 /* 2486 /*
2475 * The LSB of head.next can't change from under us 2487 * The LSB of head.next can't change from under us
2476 * because we hold the mm_all_locks_mutex. 2488 * because we hold the mm_all_locks_mutex.
2477 */ 2489 */
2478 spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); 2490 spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
2479 /* 2491 /*
2480 * We can safely modify head.next after taking the 2492 * We can safely modify head.next after taking the
2481 * anon_vma->lock. If some other vma in this mm shares 2493 * anon_vma->root->lock. If some other vma in this mm shares
2482 * the same anon_vma we won't take it again. 2494 * the same anon_vma we won't take it again.
2483 * 2495 *
2484 * No need of atomic instructions here, head.next 2496 * No need of atomic instructions here, head.next
2485 * can't change from under us thanks to the 2497 * can't change from under us thanks to the
2486 * anon_vma->lock. 2498 * anon_vma->root->lock.
2487 */ 2499 */
2488 if (__test_and_set_bit(0, (unsigned long *) 2500 if (__test_and_set_bit(0, (unsigned long *)
2489 &anon_vma->head.next)) 2501 &anon_vma->root->head.next))
2490 BUG(); 2502 BUG();
2491 } 2503 }
2492} 2504}
@@ -2577,7 +2589,7 @@ out_unlock:
2577 2589
2578static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2590static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2579{ 2591{
2580 if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { 2592 if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
2581 /* 2593 /*
2582 * The LSB of head.next can't change to 0 from under 2594 * The LSB of head.next can't change to 0 from under
2583 * us because we hold the mm_all_locks_mutex. 2595 * us because we hold the mm_all_locks_mutex.
@@ -2588,12 +2600,12 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2588 * 2600 *
2589 * No need of atomic instructions here, head.next 2601 * No need of atomic instructions here, head.next
2590 * can't change from under us until we release the 2602 * can't change from under us until we release the
2591 * anon_vma->lock. 2603 * anon_vma->root->lock.
2592 */ 2604 */
2593 if (!__test_and_clear_bit(0, (unsigned long *) 2605 if (!__test_and_clear_bit(0, (unsigned long *)
2594 &anon_vma->head.next)) 2606 &anon_vma->root->head.next))
2595 BUG(); 2607 BUG();
2596 spin_unlock(&anon_vma->lock); 2608 anon_vma_unlock(anon_vma);
2597 } 2609 }
2598} 2610}
2599 2611
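
The lock helpers in mmap.c change meaning: vma_lock_anon_vma()/vma_unlock_anon_vma() take over the old per-vma names, and locking an anon_vma now goes through its root, so every anon_vma in a fork chain shares one spinlock (which is also why vm_lock_anon_vma() now pokes anon_vma->root->head.next). A sketch of the helpers this diff assumes, roughly as they read in include/linux/rmap.h of this series:

	static inline void anon_vma_lock(struct anon_vma *anon_vma)
	{
		spin_lock(&anon_vma->root->lock);
	}

	static inline void anon_vma_unlock(struct anon_vma *anon_vma)
	{
		spin_unlock(&anon_vma->root->lock);
	}

	static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
	{
		if (vma->anon_vma)
			spin_lock(&vma->anon_vma->root->lock);
	}
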
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 709aedfaa014..d3def05a33d9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -4,6 +4,8 @@
4 * Copyright (C) 1998,2000 Rik van Riel 4 * Copyright (C) 1998,2000 Rik van Riel
5 * Thanks go out to Claus Fischer for some serious inspiration and 5 * Thanks go out to Claus Fischer for some serious inspiration and
6 * for goading me into coding this file... 6 * for goading me into coding this file...
7 * Copyright (C) 2010 Google, Inc.
8 * Rewritten by David Rientjes
7 * 9 *
8 * The routines in this file are used to kill a process when 10 * The routines in this file are used to kill a process when
9 * we're seriously out of memory. This gets called from __alloc_pages() 11 * we're seriously out of memory. This gets called from __alloc_pages()
@@ -27,171 +29,188 @@
27#include <linux/module.h> 29#include <linux/module.h>
28#include <linux/notifier.h> 30#include <linux/notifier.h>
29#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/mempolicy.h>
30#include <linux/security.h> 33#include <linux/security.h>
31 34
32int sysctl_panic_on_oom; 35int sysctl_panic_on_oom;
33int sysctl_oom_kill_allocating_task; 36int sysctl_oom_kill_allocating_task;
34int sysctl_oom_dump_tasks; 37int sysctl_oom_dump_tasks = 1;
35static DEFINE_SPINLOCK(zone_scan_lock); 38static DEFINE_SPINLOCK(zone_scan_lock);
36/* #define DEBUG */ 39
40#ifdef CONFIG_NUMA
41/**
42 * has_intersects_mems_allowed() - check task eligiblity for kill
43 * @tsk: task struct of which task to consider
44 * @mask: nodemask passed to page allocator for mempolicy ooms
45 *
46 * Task eligibility is determined by whether or not a candidate task, @tsk,
47 * shares the same mempolicy nodes as current if it is bound by such a policy
48 * and whether or not it has the same set of allowed cpuset nodes.
49 */
50static bool has_intersects_mems_allowed(struct task_struct *tsk,
51 const nodemask_t *mask)
52{
53 struct task_struct *start = tsk;
54
55 do {
56 if (mask) {
57 /*
58 * If this is a mempolicy constrained oom, tsk's
59 * cpuset is irrelevant. Only return true if its
60 * mempolicy intersects current, otherwise it may be
61 * needlessly killed.
62 */
63 if (mempolicy_nodemask_intersects(tsk, mask))
64 return true;
65 } else {
66 /*
67 * This is not a mempolicy constrained oom, so only
68 * check the mems of tsk's cpuset.
69 */
70 if (cpuset_mems_allowed_intersects(current, tsk))
71 return true;
72 }
73 } while_each_thread(start, tsk);
74
75 return false;
76}
77#else
78static bool has_intersects_mems_allowed(struct task_struct *tsk,
79 const nodemask_t *mask)
80{
81 return true;
82}
83#endif /* CONFIG_NUMA */
37 84
38/* 85/*
39 * Is all threads of the target process nodes overlap ours? 86 * If this is a system OOM (not a memcg OOM) and the task selected to be
87 * killed is not already running at high (RT) priorities, speed up the
88 * recovery by boosting the dying task to the lowest FIFO priority.
89 * That helps with the recovery and avoids interfering with RT tasks.
40 */ 90 */
41static int has_intersects_mems_allowed(struct task_struct *tsk) 91static void boost_dying_task_prio(struct task_struct *p,
92 struct mem_cgroup *mem)
42{ 93{
43 struct task_struct *t; 94 struct sched_param param = { .sched_priority = 1 };
95
96 if (mem)
97 return;
98
99 if (!rt_task(p))
100 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
101}
102
103/*
104 * The process p may have detached its own ->mm while exiting or through
105 * use_mm(), but one or more of its subthreads may still have a valid
106 * pointer. Return p, or any of its subthreads with a valid ->mm, with
107 * task_lock() held.
108 */
109static struct task_struct *find_lock_task_mm(struct task_struct *p)
110{
111 struct task_struct *t = p;
44 112
45 t = tsk;
46 do { 113 do {
47 if (cpuset_mems_allowed_intersects(current, t)) 114 task_lock(t);
48 return 1; 115 if (likely(t->mm))
49 t = next_thread(t); 116 return t;
50 } while (t != tsk); 117 task_unlock(t);
118 } while_each_thread(p, t);
51 119
52 return 0; 120 return NULL;
121}
122
123/* return true if the task is not adequate as candidate victim task. */
124static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
125 const nodemask_t *nodemask)
126{
127 if (is_global_init(p))
128 return true;
129 if (p->flags & PF_KTHREAD)
130 return true;
131
132 /* When mem_cgroup_out_of_memory() and p is not member of the group */
133 if (mem && !task_in_mem_cgroup(p, mem))
134 return true;
135
136 /* p may not have freeable memory in nodemask */
137 if (!has_intersects_mems_allowed(p, nodemask))
138 return true;
139
140 return false;
53} 141}
54 142
55/** 143/**
56 * badness - calculate a numeric value for how bad this task has been 144 * oom_badness - heuristic function to determine which candidate task to kill
57 * @p: task struct of which task we should calculate 145 * @p: task struct of which task we should calculate
58 * @uptime: current uptime in seconds 146 * @totalpages: total present RAM allowed for page allocation
59 *
60 * The formula used is relatively simple and documented inline in the
61 * function. The main rationale is that we want to select a good task
62 * to kill when we run out of memory.
63 * 147 *
64 * Good in this context means that: 148 * The heuristic for determining which task to kill is made to be as simple and
65 * 1) we lose the minimum amount of work done 149 * predictable as possible. The goal is to return the highest value for the
66 * 2) we recover a large amount of memory 150 * task consuming the most memory to avoid subsequent oom failures.
67 * 3) we don't kill anything innocent of eating tons of memory
68 * 4) we want to kill the minimum amount of processes (one)
69 * 5) we try to kill the process the user expects us to kill, this
70 * algorithm has been meticulously tuned to meet the principle
71 * of least surprise ... (be careful when you change it)
72 */ 151 */
73 152unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
74unsigned long badness(struct task_struct *p, unsigned long uptime) 153 const nodemask_t *nodemask, unsigned long totalpages)
75{ 154{
76 unsigned long points, cpu_time, run_time; 155 int points;
77 struct mm_struct *mm;
78 struct task_struct *child;
79 int oom_adj = p->signal->oom_adj;
80 struct task_cputime task_time;
81 unsigned long utime;
82 unsigned long stime;
83 156
84 if (oom_adj == OOM_DISABLE) 157 if (oom_unkillable_task(p, mem, nodemask))
85 return 0; 158 return 0;
86 159
87 task_lock(p); 160 p = find_lock_task_mm(p);
88 mm = p->mm; 161 if (!p)
89 if (!mm) {
90 task_unlock(p);
91 return 0; 162 return 0;
92 }
93
94 /*
95 * The memory size of the process is the basis for the badness.
96 */
97 points = mm->total_vm;
98 163
99 /* 164 /*
100 * After this unlock we can no longer dereference local variable `mm' 165 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
166 * need to be executed for something that cannot be killed.
101 */ 167 */
102 task_unlock(p); 168 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
103 169 task_unlock(p);
104 /* 170 return 0;
105 * swapoff can easily use up all memory, so kill those first.
106 */
107 if (p->flags & PF_OOM_ORIGIN)
108 return ULONG_MAX;
109
110 /*
111 * Processes which fork a lot of child processes are likely
112 * a good choice. We add half the vmsize of the children if they
113 * have an own mm. This prevents forking servers to flood the
114 * machine with an endless amount of children. In case a single
115 * child is eating the vast majority of memory, adding only half
116 * to the parents will make the child our kill candidate of choice.
117 */
118 list_for_each_entry(child, &p->children, sibling) {
119 task_lock(child);
120 if (child->mm != mm && child->mm)
121 points += child->mm->total_vm/2 + 1;
122 task_unlock(child);
123 } 171 }
124 172
125 /* 173 /*
126 * CPU time is in tens of seconds and run time is in thousands 174 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
127 * of seconds. There is no particular reason for this other than 175 * priority for oom killing.
128 * that it turned out to work very well in practice.
129 */
130 thread_group_cputime(p, &task_time);
131 utime = cputime_to_jiffies(task_time.utime);
132 stime = cputime_to_jiffies(task_time.stime);
133 cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
134
135
136 if (uptime >= p->start_time.tv_sec)
137 run_time = (uptime - p->start_time.tv_sec) >> 10;
138 else
139 run_time = 0;
140
141 if (cpu_time)
142 points /= int_sqrt(cpu_time);
143 if (run_time)
144 points /= int_sqrt(int_sqrt(run_time));
145
146 /*
147 * Niced processes are most likely less important, so double
148 * their badness points.
149 */ 176 */
150 if (task_nice(p) > 0) 177 if (p->flags & PF_OOM_ORIGIN) {
151 points *= 2; 178 task_unlock(p);
179 return 1000;
180 }
152 181
153 /* 182 /*
154 * Superuser processes are usually more important, so we make it 183 * The memory controller may have a limit of 0 bytes, so avoid a divide
155 * less likely that we kill those. 184 * by zero, if necessary.
156 */ 185 */
157 if (has_capability_noaudit(p, CAP_SYS_ADMIN) || 186 if (!totalpages)
158 has_capability_noaudit(p, CAP_SYS_RESOURCE)) 187 totalpages = 1;
159 points /= 4;
160 188
161 /* 189 /*
162 * We don't want to kill a process with direct hardware access. 190 * The baseline for the badness score is the proportion of RAM that each
163 * Not only could that mess up the hardware, but usually users 191 * task's rss and swap space use.
164 * tend to only have this flag set on applications they think
165 * of as important.
166 */ 192 */
167 if (has_capability_noaudit(p, CAP_SYS_RAWIO)) 193 points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
168 points /= 4; 194 totalpages;
195 task_unlock(p);
169 196
170 /* 197 /*
171 * If p's nodes don't overlap ours, it may still help to kill p 198 * Root processes get 3% bonus, just like the __vm_enough_memory()
172 * because p may have allocated or otherwise mapped memory on 199 * implementation used by LSMs.
173 * this node before. However it will be less likely.
174 */ 200 */
175 if (!has_intersects_mems_allowed(p)) 201 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
176 points /= 8; 202 points -= 30;
177 203
178 /* 204 /*
179 * Adjust the score by oom_adj. 205 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
206 * either completely disable oom killing or always prefer a certain
207 * task.
180 */ 208 */
181 if (oom_adj) { 209 points += p->signal->oom_score_adj;
182 if (oom_adj > 0) {
183 if (!points)
184 points = 1;
185 points <<= oom_adj;
186 } else
187 points >>= -(oom_adj);
188 }
189 210
190#ifdef DEBUG 211 if (points < 0)
191 printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", 212 return 0;
192 p->pid, p->comm, points); 213 return (points < 1000) ? points : 1000;
193#endif
194 return points;
195} 214}
196 215
197/* 216/*
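
The rewritten oom_badness() drops the old vm-size/cpu-time/uptime heuristic for a 0-1000 score: rss plus swap entries as a proportion of totalpages, a 30-point (3%) discount for CAP_SYS_ADMIN tasks, then oom_score_adj added and the result clamped. A worked example with assumed numbers, in plain userspace C so the arithmetic is easy to follow:

	#include <stdio.h>

	int main(void)
	{
		unsigned long totalpages = 1024 * 1024;	/* 4 GB of allowed RAM+swap, in 4 KB pages */
		unsigned long rss        = 300 * 1024;	/* ~1.2 GB resident */
		unsigned long swapents   = 100 * 1024;	/* ~400 MB of swap entries */
		int oom_score_adj = -200;		/* assumed /proc/<pid>/oom_score_adj */
		long points;

		points = (long)((rss + swapents) * 1000 / totalpages);	/* 390 */
		points -= 30;		/* root-owned task: 3% of the scale, now 360 */
		points += oom_score_adj;			/* 160 */

		if (points < 0)
			points = 0;
		if (points > 1000)
			points = 1000;
		printf("badness = %ld / 1000\n", points);
		return 0;
	}
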
@@ -199,12 +218,20 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
199 */ 218 */
200#ifdef CONFIG_NUMA 219#ifdef CONFIG_NUMA
201static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 220static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
202 gfp_t gfp_mask, nodemask_t *nodemask) 221 gfp_t gfp_mask, nodemask_t *nodemask,
222 unsigned long *totalpages)
203{ 223{
204 struct zone *zone; 224 struct zone *zone;
205 struct zoneref *z; 225 struct zoneref *z;
206 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 226 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
227 bool cpuset_limited = false;
228 int nid;
229
230 /* Default to all available memory */
231 *totalpages = totalram_pages + total_swap_pages;
207 232
233 if (!zonelist)
234 return CONSTRAINT_NONE;
208 /* 235 /*
209 * Reach here only when __GFP_NOFAIL is used. So, we should avoid 236 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
210 * to kill current.We have to random task kill in this case. 237 * to kill current.We have to random task kill in this case.
@@ -214,26 +241,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
214 return CONSTRAINT_NONE; 241 return CONSTRAINT_NONE;
215 242
216 /* 243 /*
217 * The nodemask here is a nodemask passed to alloc_pages(). Now, 244 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
218 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy 245 * the page allocator means a mempolicy is in effect. Cpuset policy
219 * feature. mempolicy is an only user of nodemask here. 246 * is enforced in get_page_from_freelist().
220 * check mempolicy's nodemask contains all N_HIGH_MEMORY
221 */ 247 */
222 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) 248 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
249 *totalpages = total_swap_pages;
250 for_each_node_mask(nid, *nodemask)
251 *totalpages += node_spanned_pages(nid);
223 return CONSTRAINT_MEMORY_POLICY; 252 return CONSTRAINT_MEMORY_POLICY;
253 }
224 254
225 /* Check this allocation failure is caused by cpuset's wall function */ 255 /* Check this allocation failure is caused by cpuset's wall function */
226 for_each_zone_zonelist_nodemask(zone, z, zonelist, 256 for_each_zone_zonelist_nodemask(zone, z, zonelist,
227 high_zoneidx, nodemask) 257 high_zoneidx, nodemask)
228 if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) 258 if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
229 return CONSTRAINT_CPUSET; 259 cpuset_limited = true;
230 260
261 if (cpuset_limited) {
262 *totalpages = total_swap_pages;
263 for_each_node_mask(nid, cpuset_current_mems_allowed)
264 *totalpages += node_spanned_pages(nid);
265 return CONSTRAINT_CPUSET;
266 }
231 return CONSTRAINT_NONE; 267 return CONSTRAINT_NONE;
232} 268}
233#else 269#else
234static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 270static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
235 gfp_t gfp_mask, nodemask_t *nodemask) 271 gfp_t gfp_mask, nodemask_t *nodemask,
272 unsigned long *totalpages)
236{ 273{
274 *totalpages = totalram_pages + total_swap_pages;
237 return CONSTRAINT_NONE; 275 return CONSTRAINT_NONE;
238} 276}
239#endif 277#endif
@@ -244,28 +282,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
244 * 282 *
245 * (not docbooked, we don't want this one cluttering up the manual) 283 * (not docbooked, we don't want this one cluttering up the manual)
246 */ 284 */
247static struct task_struct *select_bad_process(unsigned long *ppoints, 285static struct task_struct *select_bad_process(unsigned int *ppoints,
248 struct mem_cgroup *mem) 286 unsigned long totalpages, struct mem_cgroup *mem,
287 const nodemask_t *nodemask)
249{ 288{
250 struct task_struct *p; 289 struct task_struct *p;
251 struct task_struct *chosen = NULL; 290 struct task_struct *chosen = NULL;
252 struct timespec uptime;
253 *ppoints = 0; 291 *ppoints = 0;
254 292
255 do_posix_clock_monotonic_gettime(&uptime);
256 for_each_process(p) { 293 for_each_process(p) {
257 unsigned long points; 294 unsigned int points;
258 295
259 /* 296 if (oom_unkillable_task(p, mem, nodemask))
260 * skip kernel threads and tasks which have already released
261 * their mm.
262 */
263 if (!p->mm)
264 continue;
265 /* skip the init task */
266 if (is_global_init(p))
267 continue;
268 if (mem && !task_in_mem_cgroup(p, mem))
269 continue; 297 continue;
270 298
271 /* 299 /*
@@ -290,19 +318,16 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
290 * the process of exiting and releasing its resources. 318 * the process of exiting and releasing its resources.
291 * Otherwise we could get an easy OOM deadlock. 319 * Otherwise we could get an easy OOM deadlock.
292 */ 320 */
293 if (p->flags & PF_EXITING) { 321 if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
294 if (p != current) 322 if (p != current)
295 return ERR_PTR(-1UL); 323 return ERR_PTR(-1UL);
296 324
297 chosen = p; 325 chosen = p;
298 *ppoints = ULONG_MAX; 326 *ppoints = 1000;
299 } 327 }
300 328
301 if (p->signal->oom_adj == OOM_DISABLE) 329 points = oom_badness(p, mem, nodemask, totalpages);
302 continue; 330 if (points > *ppoints) {
303
304 points = badness(p, uptime.tv_sec);
305 if (points > *ppoints || !chosen) {
306 chosen = p; 331 chosen = p;
307 *ppoints = points; 332 *ppoints = points;
308 } 333 }
@@ -313,11 +338,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
313 338
314/** 339/**
315 * dump_tasks - dump current memory state of all system tasks 340 * dump_tasks - dump current memory state of all system tasks
316 * @mem: target memory controller 341 * @mem: current's memory controller, if constrained
317 * 342 *
318 * Dumps the current memory state of all system tasks, excluding kernel threads. 343 * Dumps the current memory state of all system tasks, excluding kernel threads.
319 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 344 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
320 * score, and name. 345 * value, oom_score_adj value, and name.
321 * 346 *
322 * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are 347 * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
323 * shown. 348 * shown.
@@ -326,44 +351,43 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
326 */ 351 */
327static void dump_tasks(const struct mem_cgroup *mem) 352static void dump_tasks(const struct mem_cgroup *mem)
328{ 353{
329 struct task_struct *g, *p; 354 struct task_struct *p;
330 355 struct task_struct *task;
331 printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
332 "name\n");
333 do_each_thread(g, p) {
334 struct mm_struct *mm;
335 356
336 if (mem && !task_in_mem_cgroup(p, mem)) 357 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
358 for_each_process(p) {
359 if (p->flags & PF_KTHREAD)
337 continue; 360 continue;
338 if (!thread_group_leader(p)) 361 if (mem && !task_in_mem_cgroup(p, mem))
339 continue; 362 continue;
340 363
341 task_lock(p); 364 task = find_lock_task_mm(p);
342 mm = p->mm; 365 if (!task) {
343 if (!mm) {
344 /* 366 /*
345 * total_vm and rss sizes do not exist for tasks with no 367 * This is a kthread or all of p's threads have already
346 * mm so there's no need to report them; they can't be 368 * detached their mm's. There's no need to report
347 * oom killed anyway. 369 * them; they can't be oom killed anyway.
348 */ 370 */
349 task_unlock(p);
350 continue; 371 continue;
351 } 372 }
352 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 373
353 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 374 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
354 get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, 375 task->pid, __task_cred(task)->uid, task->tgid,
355 p->comm); 376 task->mm->total_vm, get_mm_rss(task->mm),
356 task_unlock(p); 377 task_cpu(task), task->signal->oom_adj,
357 } while_each_thread(g, p); 378 task->signal->oom_score_adj, task->comm);
379 task_unlock(task);
380 }
358} 381}
359 382
360static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 383static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
361 struct mem_cgroup *mem) 384 struct mem_cgroup *mem)
362{ 385{
363 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
364 "oom_adj=%d\n",
365 current->comm, gfp_mask, order, current->signal->oom_adj);
366 task_lock(current); 386 task_lock(current);
387 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
388 "oom_adj=%d, oom_score_adj=%d\n",
389 current->comm, gfp_mask, order, current->signal->oom_adj,
390 current->signal->oom_score_adj);
367 cpuset_print_task_mems_allowed(current); 391 cpuset_print_task_mems_allowed(current);
368 task_unlock(current); 392 task_unlock(current);
369 dump_stack(); 393 dump_stack();
@@ -374,72 +398,43 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
374} 398}
375 399
376#define K(x) ((x) << (PAGE_SHIFT-10)) 400#define K(x) ((x) << (PAGE_SHIFT-10))
377 401static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
378/*
379 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
380 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
381 * set.
382 */
383static void __oom_kill_task(struct task_struct *p, int verbose)
384{ 402{
385 if (is_global_init(p)) { 403 p = find_lock_task_mm(p);
386 WARN_ON(1); 404 if (!p) {
387 printk(KERN_WARNING "tried to kill init!\n");
388 return;
389 }
390
391 task_lock(p);
392 if (!p->mm) {
393 WARN_ON(1);
394 printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
395 task_pid_nr(p), p->comm);
396 task_unlock(p); 405 task_unlock(p);
397 return; 406 return 1;
398 } 407 }
399 408 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
400 if (verbose) 409 task_pid_nr(p), p->comm, K(p->mm->total_vm),
401 printk(KERN_ERR "Killed process %d (%s) " 410 K(get_mm_counter(p->mm, MM_ANONPAGES)),
402 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 411 K(get_mm_counter(p->mm, MM_FILEPAGES)));
403 task_pid_nr(p), p->comm,
404 K(p->mm->total_vm),
405 K(get_mm_counter(p->mm, MM_ANONPAGES)),
406 K(get_mm_counter(p->mm, MM_FILEPAGES)));
407 task_unlock(p); 412 task_unlock(p);
408 413
414
415 set_tsk_thread_flag(p, TIF_MEMDIE);
416 force_sig(SIGKILL, p);
417
409 /* 418 /*
410 * We give our sacrificial lamb high priority and access to 419 * We give our sacrificial lamb high priority and access to
411 * all the memory it needs. That way it should be able to 420 * all the memory it needs. That way it should be able to
412 * exit() and clear out its resources quickly... 421 * exit() and clear out its resources quickly...
413 */ 422 */
414 p->rt.time_slice = HZ; 423 boost_dying_task_prio(p, mem);
415 set_tsk_thread_flag(p, TIF_MEMDIE);
416
417 force_sig(SIGKILL, p);
418}
419
420static int oom_kill_task(struct task_struct *p)
421{
422 /* WARNING: mm may not be dereferenced since we did not obtain its
423 * value from get_task_mm(p). This is OK since all we need to do is
424 * compare mm to q->mm below.
425 *
426 * Furthermore, even if mm contains a non-NULL value, p->mm may
427 * change to NULL at any time since we do not hold task_lock(p).
428 * However, this is of no concern to us.
429 */
430 if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
431 return 1;
432
433 __oom_kill_task(p, 1);
434 424
435 return 0; 425 return 0;
436} 426}
427#undef K
437 428
438static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 429static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
439 unsigned long points, struct mem_cgroup *mem, 430 unsigned int points, unsigned long totalpages,
431 struct mem_cgroup *mem, nodemask_t *nodemask,
440 const char *message) 432 const char *message)
441{ 433{
442 struct task_struct *c; 434 struct task_struct *victim = p;
435 struct task_struct *child;
436 struct task_struct *t = p;
437 unsigned int victim_points = 0;
443 438
444 if (printk_ratelimit()) 439 if (printk_ratelimit())
445 dump_header(p, gfp_mask, order, mem); 440 dump_header(p, gfp_mask, order, mem);
@@ -449,40 +444,81 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
449 * its children or threads, just set TIF_MEMDIE so it can die quickly 444 * its children or threads, just set TIF_MEMDIE so it can die quickly
450 */ 445 */
451 if (p->flags & PF_EXITING) { 446 if (p->flags & PF_EXITING) {
452 __oom_kill_task(p, 0); 447 set_tsk_thread_flag(p, TIF_MEMDIE);
448 boost_dying_task_prio(p, mem);
453 return 0; 449 return 0;
454 } 450 }
455 451
456 printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", 452 task_lock(p);
457 message, task_pid_nr(p), p->comm, points); 453 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
454 message, task_pid_nr(p), p->comm, points);
455 task_unlock(p);
458 456
459 /* Try to kill a child first */ 457 /*
460 list_for_each_entry(c, &p->children, sibling) { 458 * If any of p's children has a different mm and is eligible for kill,
461 if (c->mm == p->mm) 459 * the one with the highest badness() score is sacrificed for its
462 continue; 460 * parent. This attempts to lose the minimal amount of work done while
463 if (mem && !task_in_mem_cgroup(c, mem)) 461 * still freeing memory.
464 continue; 462 */
465 if (!oom_kill_task(c)) 463 do {
466 return 0; 464 list_for_each_entry(child, &t->children, sibling) {
465 unsigned int child_points;
466
467 /*
468 * oom_badness() returns 0 if the thread is unkillable
469 */
470 child_points = oom_badness(child, mem, nodemask,
471 totalpages);
472 if (child_points > victim_points) {
473 victim = child;
474 victim_points = child_points;
475 }
476 }
477 } while_each_thread(p, t);
478
479 return oom_kill_task(victim, mem);
480}
481
482/*
483 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
484 */
485static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
486 int order)
487{
488 if (likely(!sysctl_panic_on_oom))
489 return;
490 if (sysctl_panic_on_oom != 2) {
491 /*
492 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
493 * does not panic for cpuset, mempolicy, or memcg allocation
494 * failures.
495 */
496 if (constraint != CONSTRAINT_NONE)
497 return;
467 } 498 }
468 return oom_kill_task(p); 499 read_lock(&tasklist_lock);
500 dump_header(NULL, gfp_mask, order, NULL);
501 read_unlock(&tasklist_lock);
502 panic("Out of memory: %s panic_on_oom is enabled\n",
503 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
469} 504}
470 505
471#ifdef CONFIG_CGROUP_MEM_RES_CTLR 506#ifdef CONFIG_CGROUP_MEM_RES_CTLR
472void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) 507void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
473{ 508{
474 unsigned long points = 0; 509 unsigned long limit;
510 unsigned int points = 0;
475 struct task_struct *p; 511 struct task_struct *p;
476 512
477 if (sysctl_panic_on_oom == 2) 513 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
478 panic("out of memory(memcg). panic_on_oom is selected.\n"); 514 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
479 read_lock(&tasklist_lock); 515 read_lock(&tasklist_lock);
480retry: 516retry:
481 p = select_bad_process(&points, mem); 517 p = select_bad_process(&points, limit, mem, NULL);
482 if (!p || PTR_ERR(p) == -1UL) 518 if (!p || PTR_ERR(p) == -1UL)
483 goto out; 519 goto out;
484 520
485 if (oom_kill_process(p, gfp_mask, 0, points, mem, 521 if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
486 "Memory cgroup out of memory")) 522 "Memory cgroup out of memory"))
487 goto retry; 523 goto retry;
488out: 524out:
@@ -509,7 +545,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
509 * if a parallel OOM killing is already taking place that includes a zone in 545 * if a parallel OOM killing is already taking place that includes a zone in
510 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. 546 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
511 */ 547 */
512int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) 548int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
513{ 549{
514 struct zoneref *z; 550 struct zoneref *z;
515 struct zone *zone; 551 struct zone *zone;
@@ -526,7 +562,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
526 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 562 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
527 /* 563 /*
528 * Lock each zone in the zonelist under zone_scan_lock so a 564 * Lock each zone in the zonelist under zone_scan_lock so a
529 * parallel invocation of try_set_zone_oom() doesn't succeed 565 * parallel invocation of try_set_zonelist_oom() doesn't succeed
530 * when it shouldn't. 566 * when it shouldn't.
531 */ 567 */
532 zone_set_flag(zone, ZONE_OOM_LOCKED); 568 zone_set_flag(zone, ZONE_OOM_LOCKED);
@@ -555,65 +591,40 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
555} 591}
556 592
557/* 593/*
558 * Must be called with tasklist_lock held for read. 594 * Try to acquire the oom killer lock for all system zones. Returns zero if a
595 * parallel oom killing is taking place, otherwise locks all zones and returns
596 * non-zero.
559 */ 597 */
560static void __out_of_memory(gfp_t gfp_mask, int order) 598static int try_set_system_oom(void)
561{ 599{
562 struct task_struct *p; 600 struct zone *zone;
563 unsigned long points; 601 int ret = 1;
564
565 if (sysctl_oom_kill_allocating_task)
566 if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
567 "Out of memory (oom_kill_allocating_task)"))
568 return;
569retry:
570 /*
571 * Rambo mode: Shoot down a process and hope it solves whatever
572 * issues we may have.
573 */
574 p = select_bad_process(&points, NULL);
575
576 if (PTR_ERR(p) == -1UL)
577 return;
578
579 /* Found nothing?!?! Either we hang forever, or we panic. */
580 if (!p) {
581 read_unlock(&tasklist_lock);
582 dump_header(NULL, gfp_mask, order, NULL);
583 panic("Out of memory and no killable processes...\n");
584 }
585 602
586 if (oom_kill_process(p, gfp_mask, order, points, NULL, 603 spin_lock(&zone_scan_lock);
587 "Out of memory")) 604 for_each_populated_zone(zone)
588 goto retry; 605 if (zone_is_oom_locked(zone)) {
606 ret = 0;
607 goto out;
608 }
609 for_each_populated_zone(zone)
610 zone_set_flag(zone, ZONE_OOM_LOCKED);
611out:
612 spin_unlock(&zone_scan_lock);
613 return ret;
589} 614}
590 615
591/* 616/*
592 * pagefault handler calls into here because it is out of memory but 617 * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
593 * doesn't know exactly how or why. 618 * attempts or page faults may now recall the oom killer, if necessary.
594 */ 619 */
595void pagefault_out_of_memory(void) 620static void clear_system_oom(void)
596{ 621{
597 unsigned long freed = 0; 622 struct zone *zone;
598
599 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
600 if (freed > 0)
601 /* Got some memory back in the last second. */
602 return;
603
604 if (sysctl_panic_on_oom)
605 panic("out of memory from page fault. panic_on_oom is selected.\n");
606
607 read_lock(&tasklist_lock);
608 __out_of_memory(0, 0); /* unknown gfp_mask and order */
609 read_unlock(&tasklist_lock);
610 623
611 /* 624 spin_lock(&zone_scan_lock);
612 * Give "p" a good chance of killing itself before we 625 for_each_populated_zone(zone)
613 * retry to allocate memory. 626 zone_clear_flag(zone, ZONE_OOM_LOCKED);
614 */ 627 spin_unlock(&zone_scan_lock);
615 if (!test_thread_flag(TIF_MEMDIE))
616 schedule_timeout_uninterruptible(1);
617} 628}
618 629
619/** 630/**
@@ -621,6 +632,7 @@ void pagefault_out_of_memory(void)
621 * @zonelist: zonelist pointer 632 * @zonelist: zonelist pointer
622 * @gfp_mask: memory allocation flags 633 * @gfp_mask: memory allocation flags
623 * @order: amount of memory being requested as a power of 2 634 * @order: amount of memory being requested as a power of 2
635 * @nodemask: nodemask passed to page allocator
624 * 636 *
625 * If we run out of memory, we have the choice between either 637 * If we run out of memory, we have the choice between either
626 * killing a random task (bad), letting the system crash (worse) 638 * killing a random task (bad), letting the system crash (worse)
@@ -630,43 +642,68 @@ void pagefault_out_of_memory(void)
630void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 642void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
631 int order, nodemask_t *nodemask) 643 int order, nodemask_t *nodemask)
632{ 644{
645 struct task_struct *p;
646 unsigned long totalpages;
633 unsigned long freed = 0; 647 unsigned long freed = 0;
634 enum oom_constraint constraint; 648 unsigned int points;
649 enum oom_constraint constraint = CONSTRAINT_NONE;
635 650
636 blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 651 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
637 if (freed > 0) 652 if (freed > 0)
638 /* Got some memory back in the last second. */ 653 /* Got some memory back in the last second. */
639 return; 654 return;
640 655
641 if (sysctl_panic_on_oom == 2) { 656 /*
642 dump_header(NULL, gfp_mask, order, NULL); 657 * If current has a pending SIGKILL, then automatically select it. The
643 panic("out of memory. Compulsory panic_on_oom is selected.\n"); 658 * goal is to allow it to allocate so that it may quickly exit and free
659 * its memory.
660 */
661 if (fatal_signal_pending(current)) {
662 set_thread_flag(TIF_MEMDIE);
663 boost_dying_task_prio(current, NULL);
664 return;
644 } 665 }
645 666
646 /* 667 /*
647 * Check if there were limitations on the allocation (only relevant for 668 * Check if there were limitations on the allocation (only relevant for
648 * NUMA) that may require different handling. 669 * NUMA) that may require different handling.
649 */ 670 */
650 constraint = constrained_alloc(zonelist, gfp_mask, nodemask); 671 constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
672 &totalpages);
673 check_panic_on_oom(constraint, gfp_mask, order);
674
651 read_lock(&tasklist_lock); 675 read_lock(&tasklist_lock);
676 if (sysctl_oom_kill_allocating_task &&
677 !oom_unkillable_task(current, NULL, nodemask) &&
678 (current->signal->oom_adj != OOM_DISABLE)) {
679 /*
680 * oom_kill_process() needs tasklist_lock held. If it returns
681 * non-zero, current could not be killed so we must fallback to
682 * the tasklist scan.
683 */
684 if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
685 NULL, nodemask,
686 "Out of memory (oom_kill_allocating_task)"))
687 return;
688 }
652 689
653 switch (constraint) { 690retry:
654 case CONSTRAINT_MEMORY_POLICY: 691 p = select_bad_process(&points, totalpages, NULL,
655 oom_kill_process(current, gfp_mask, order, 0, NULL, 692 constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
656 "No available memory (MPOL_BIND)"); 693 NULL);
657 break; 694 if (PTR_ERR(p) == -1UL)
695 return;
658 696
659 case CONSTRAINT_NONE: 697 /* Found nothing?!?! Either we hang forever, or we panic. */
660 if (sysctl_panic_on_oom) { 698 if (!p) {
661 dump_header(NULL, gfp_mask, order, NULL); 699 dump_header(NULL, gfp_mask, order, NULL);
662 panic("out of memory. panic_on_oom is selected\n"); 700 read_unlock(&tasklist_lock);
663 } 701 panic("Out of memory and no killable processes...\n");
664 /* Fall-through */
665 case CONSTRAINT_CPUSET:
666 __out_of_memory(gfp_mask, order);
667 break;
668 } 702 }
669 703
704 if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
705 nodemask, "Out of memory"))
706 goto retry;
670 read_unlock(&tasklist_lock); 707 read_unlock(&tasklist_lock);
671 708
672 /* 709 /*
@@ -676,3 +713,19 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
676 if (!test_thread_flag(TIF_MEMDIE)) 713 if (!test_thread_flag(TIF_MEMDIE))
677 schedule_timeout_uninterruptible(1); 714 schedule_timeout_uninterruptible(1);
678} 715}
716
717/*
718 * The pagefault handler calls here because it is out of memory, so kill a
719 * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel
720 * oom killing is already in progress so do nothing. If a task is found with
721 * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
722 */
723void pagefault_out_of_memory(void)
724{
725 if (try_set_system_oom()) {
726 out_of_memory(NULL, 0, 0, NULL);
727 clear_system_oom();
728 }
729 if (!test_thread_flag(TIF_MEMDIE))
730 schedule_timeout_uninterruptible(1);
731}
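
Editor's note: the reworked oom_kill_process() above no longer kills the first child with a different mm; it walks every thread's children and sacrifices the one with the highest oom_badness() score, falling back to the parent only when no child scores. A minimal userspace sketch of that selection rule follows (the task and score types are invented for illustration, not the kernel structures):

#include <stdio.h>

struct fake_task { const char *comm; unsigned int points; };

/* Pick the highest-scoring child; fall back to the parent when none scores. */
static const struct fake_task *pick_victim(const struct fake_task *parent,
                                           const struct fake_task *children,
                                           int nr_children)
{
        const struct fake_task *victim = parent;
        unsigned int victim_points = 0;

        for (int i = 0; i < nr_children; i++) {
                if (children[i].points > victim_points) {
                        victim = &children[i];
                        victim_points = children[i].points;
                }
        }
        return victim;
}

int main(void)
{
        struct fake_task parent = { "parent", 0 };
        struct fake_task kids[] = { { "worker", 40 }, { "hog", 900 }, { "idle", 0 } };

        printf("victim: %s\n", pick_victim(&parent, kids, 3)->comm);
        return 0;
}
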
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 37498ef61548..df8202ebc7b8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -805,6 +805,41 @@ void __init page_writeback_init(void)
805} 805}
806 806
807/** 807/**
808 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
809 * @mapping: address space structure to write
810 * @start: starting page index
811 * @end: ending page index (inclusive)
812 *
813 * This function scans the page range from @start to @end (inclusive) and tags
814 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
815 * that write_cache_pages (or whoever calls this function) will then use
816 * TOWRITE tag to identify pages eligible for writeback. This mechanism is
817 * used to avoid livelocking of writeback by a process steadily creating new
818 * dirty pages in the file (thus it is important for this function to be quick
819 * so that it can tag pages faster than a dirtying process can create them).
820 */
821/*
822 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
823 */
824#define WRITEBACK_TAG_BATCH 4096
825void tag_pages_for_writeback(struct address_space *mapping,
826 pgoff_t start, pgoff_t end)
827{
828 unsigned long tagged;
829
830 do {
831 spin_lock_irq(&mapping->tree_lock);
832 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
833 &start, end, WRITEBACK_TAG_BATCH,
834 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
835 spin_unlock_irq(&mapping->tree_lock);
836 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
837 cond_resched();
838 } while (tagged >= WRITEBACK_TAG_BATCH);
839}
840EXPORT_SYMBOL(tag_pages_for_writeback);
841
842/**
808 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 843 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
809 * @mapping: address space structure to write 844 * @mapping: address space structure to write
810 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 845 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
@@ -818,6 +853,13 @@ void __init page_writeback_init(void)
818 * the call was made get new I/O started against them. If wbc->sync_mode is 853 * the call was made get new I/O started against them. If wbc->sync_mode is
819 * WB_SYNC_ALL then we were called for data integrity and we must wait for 854 * WB_SYNC_ALL then we were called for data integrity and we must wait for
820 * existing IO to complete. 855 * existing IO to complete.
856 *
 857 * To avoid livelocks (when another process dirties new pages), we first tag
858 * pages which should be written back with TOWRITE tag and only then start
859 * writing them. For data-integrity sync we have to be careful so that we do
860 * not miss some pages (e.g., because some other process has cleared TOWRITE
861 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
862 * by the process clearing the DIRTY tag (and submitting the page for IO).
821 */ 863 */
822int write_cache_pages(struct address_space *mapping, 864int write_cache_pages(struct address_space *mapping,
823 struct writeback_control *wbc, writepage_t writepage, 865 struct writeback_control *wbc, writepage_t writepage,
@@ -833,6 +875,7 @@ int write_cache_pages(struct address_space *mapping,
833 pgoff_t done_index; 875 pgoff_t done_index;
834 int cycled; 876 int cycled;
835 int range_whole = 0; 877 int range_whole = 0;
878 int tag;
836 879
837 pagevec_init(&pvec, 0); 880 pagevec_init(&pvec, 0);
838 if (wbc->range_cyclic) { 881 if (wbc->range_cyclic) {
@@ -849,29 +892,19 @@ int write_cache_pages(struct address_space *mapping,
849 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 892 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
850 range_whole = 1; 893 range_whole = 1;
851 cycled = 1; /* ignore range_cyclic tests */ 894 cycled = 1; /* ignore range_cyclic tests */
852
853 /*
854 * If this is a data integrity sync, cap the writeback to the
855 * current end of file. Any extension to the file that occurs
856 * after this is a new write and we don't need to write those
857 * pages out to fulfil our data integrity requirements. If we
858 * try to write them out, we can get stuck in this scan until
859 * the concurrent writer stops adding dirty pages and extending
860 * EOF.
861 */
862 if (wbc->sync_mode == WB_SYNC_ALL &&
863 wbc->range_end == LLONG_MAX) {
864 end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
865 }
866 } 895 }
867 896 if (wbc->sync_mode == WB_SYNC_ALL)
897 tag = PAGECACHE_TAG_TOWRITE;
898 else
899 tag = PAGECACHE_TAG_DIRTY;
868retry: 900retry:
901 if (wbc->sync_mode == WB_SYNC_ALL)
902 tag_pages_for_writeback(mapping, index, end);
869 done_index = index; 903 done_index = index;
870 while (!done && (index <= end)) { 904 while (!done && (index <= end)) {
871 int i; 905 int i;
872 906
873 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 907 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
874 PAGECACHE_TAG_DIRTY,
875 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 908 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
876 if (nr_pages == 0) 909 if (nr_pages == 0)
877 break; 910 break;
@@ -1327,6 +1360,9 @@ int test_set_page_writeback(struct page *page)
1327 radix_tree_tag_clear(&mapping->page_tree, 1360 radix_tree_tag_clear(&mapping->page_tree,
1328 page_index(page), 1361 page_index(page),
1329 PAGECACHE_TAG_DIRTY); 1362 PAGECACHE_TAG_DIRTY);
1363 radix_tree_tag_clear(&mapping->page_tree,
1364 page_index(page),
1365 PAGECACHE_TAG_TOWRITE);
1330 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1366 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1331 } else { 1367 } else {
1332 ret = TestSetPageWriteback(page); 1368 ret = TestSetPageWriteback(page);
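
Editor's note: tag_pages_for_writeback() above marks the already-dirty pages once, in batches of WRITEBACK_TAG_BATCH, so a data-integrity sync works against a fixed snapshot and cannot be livelocked by a task that keeps dirtying new pages. A rough userspace sketch of the two-phase tag-then-write idea over a plain bitmap (the radix-tree tags and tree_lock are elided; array and constant names are invented):

#include <stdbool.h>
#include <stdio.h>

#define NR_PAGES   16
#define TAG_BATCH  4    /* stand-in for WRITEBACK_TAG_BATCH */

static bool dirty[NR_PAGES];
static bool towrite[NR_PAGES];

/* Phase 1: copy the DIRTY tag into TOWRITE in bounded batches. */
static void tag_for_writeback(void)
{
        int tagged, i = 0;

        do {
                tagged = 0;
                /* the kernel takes tree_lock here ... */
                for (; i < NR_PAGES && tagged < TAG_BATCH; i++) {
                        if (dirty[i]) {
                                towrite[i] = true;
                                tagged++;
                        }
                }
                /* ... and drops it, then cond_resched()s between batches */
        } while (tagged >= TAG_BATCH);
}

int main(void)
{
        dirty[1] = dirty[5] = dirty[9] = true;
        tag_for_writeback();

        /* Phase 2: write only what was tagged; pages dirtied later wait. */
        for (int i = 0; i < NR_PAGES; i++)
                if (towrite[i])
                        printf("writing page %d\n", i);
        return 0;
}
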
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9bd339eb04c6..a9649f4b261e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1738,7 +1738,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1738 struct page *page; 1738 struct page *page;
1739 1739
1740 /* Acquire the OOM killer lock for the zones in zonelist */ 1740 /* Acquire the OOM killer lock for the zones in zonelist */
1741 if (!try_set_zone_oom(zonelist, gfp_mask)) { 1741 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
1742 schedule_timeout_uninterruptible(1); 1742 schedule_timeout_uninterruptible(1);
1743 return NULL; 1743 return NULL;
1744 } 1744 }
@@ -1759,6 +1759,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1759 /* The OOM killer will not help higher order allocs */ 1759 /* The OOM killer will not help higher order allocs */
1760 if (order > PAGE_ALLOC_COSTLY_ORDER) 1760 if (order > PAGE_ALLOC_COSTLY_ORDER)
1761 goto out; 1761 goto out;
1762 /* The OOM killer does not needlessly kill tasks for lowmem */
1763 if (high_zoneidx < ZONE_NORMAL)
1764 goto out;
1762 /* 1765 /*
1763 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 1766 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
1764 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 1767 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
@@ -2052,15 +2055,23 @@ rebalance:
2052 if (page) 2055 if (page)
2053 goto got_pg; 2056 goto got_pg;
2054 2057
2055 /* 2058 if (!(gfp_mask & __GFP_NOFAIL)) {
2056 * The OOM killer does not trigger for high-order 2059 /*
2057 * ~__GFP_NOFAIL allocations so if no progress is being 2060 * The oom killer is not called for high-order
2058 * made, there are no other options and retrying is 2061 * allocations that may fail, so if no progress
2059 * unlikely to help. 2062 * is being made, there are no other options and
2060 */ 2063 * retrying is unlikely to help.
2061 if (order > PAGE_ALLOC_COSTLY_ORDER && 2064 */
2062 !(gfp_mask & __GFP_NOFAIL)) 2065 if (order > PAGE_ALLOC_COSTLY_ORDER)
2063 goto nopage; 2066 goto nopage;
2067 /*
2068 * The oom killer is not called for lowmem
2069 * allocations to prevent needlessly killing
2070 * innocent tasks.
2071 */
2072 if (high_zoneidx < ZONE_NORMAL)
2073 goto nopage;
2074 }
2064 2075
2065 goto restart; 2076 goto restart;
2066 } 2077 }
@@ -4089,8 +4100,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4089 zone_seqlock_init(zone); 4100 zone_seqlock_init(zone);
4090 zone->zone_pgdat = pgdat; 4101 zone->zone_pgdat = pgdat;
4091 4102
4092 zone->prev_priority = DEF_PRIORITY;
4093
4094 zone_pcp_init(zone); 4103 zone_pcp_init(zone);
4095 for_each_lru(l) { 4104 for_each_lru(l) {
4096 INIT_LIST_HEAD(&zone->lru[l].list); 4105 INIT_LIST_HEAD(&zone->lru[l].list);
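
Editor's note: the page_alloc.c hunks above keep the OOM killer away from costly high-order and lowmem (below ZONE_NORMAL) allocation failures, and in the retry path only give up on them when __GFP_NOFAIL is not set, since killing tasks rarely frees the kind of memory those requests need. A simplified sketch of that gate as a standalone predicate (the constants and names are illustrative, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

enum zone_idx { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM };

#define PAGE_ALLOC_COSTLY_ORDER 3
#define GFP_NOFAIL (1u << 0)    /* stand-in for __GFP_NOFAIL */

/* Should a failed allocation fall back to the OOM killer? */
static bool may_invoke_oom(unsigned int gfp, int order, enum zone_idx high_zoneidx)
{
        if (gfp & GFP_NOFAIL)
                return true;            /* must not fail: keep trying everything */
        if (order > PAGE_ALLOC_COSTLY_ORDER)
                return false;           /* killing tasks won't create contiguity */
        if (high_zoneidx < ZONE_NORMAL)
                return false;           /* don't kill tasks for lowmem */
        return true;
}

int main(void)
{
        printf("order-0 normal: %d\n", may_invoke_oom(0, 0, ZONE_NORMAL));
        printf("order-5:        %d\n", may_invoke_oom(0, 5, ZONE_NORMAL));
        printf("lowmem (DMA):   %d\n", may_invoke_oom(0, 0, ZONE_DMA));
        return 0;
}
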
diff --git a/mm/rmap.c b/mm/rmap.c
index 38a336e2eea1..a7d0f5482634 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -132,9 +132,14 @@ int anon_vma_prepare(struct vm_area_struct *vma)
132 if (unlikely(!anon_vma)) 132 if (unlikely(!anon_vma))
133 goto out_enomem_free_avc; 133 goto out_enomem_free_avc;
134 allocated = anon_vma; 134 allocated = anon_vma;
135 /*
136 * This VMA had no anon_vma yet. This anon_vma is
137 * the root of any anon_vma tree that might form.
138 */
139 anon_vma->root = anon_vma;
135 } 140 }
136 141
137 spin_lock(&anon_vma->lock); 142 anon_vma_lock(anon_vma);
138 /* page_table_lock to protect against threads */ 143 /* page_table_lock to protect against threads */
139 spin_lock(&mm->page_table_lock); 144 spin_lock(&mm->page_table_lock);
140 if (likely(!vma->anon_vma)) { 145 if (likely(!vma->anon_vma)) {
@@ -142,12 +147,12 @@ int anon_vma_prepare(struct vm_area_struct *vma)
142 avc->anon_vma = anon_vma; 147 avc->anon_vma = anon_vma;
143 avc->vma = vma; 148 avc->vma = vma;
144 list_add(&avc->same_vma, &vma->anon_vma_chain); 149 list_add(&avc->same_vma, &vma->anon_vma_chain);
145 list_add(&avc->same_anon_vma, &anon_vma->head); 150 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
146 allocated = NULL; 151 allocated = NULL;
147 avc = NULL; 152 avc = NULL;
148 } 153 }
149 spin_unlock(&mm->page_table_lock); 154 spin_unlock(&mm->page_table_lock);
150 spin_unlock(&anon_vma->lock); 155 anon_vma_unlock(anon_vma);
151 156
152 if (unlikely(allocated)) 157 if (unlikely(allocated))
153 anon_vma_free(allocated); 158 anon_vma_free(allocated);
@@ -170,9 +175,9 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
170 avc->anon_vma = anon_vma; 175 avc->anon_vma = anon_vma;
171 list_add(&avc->same_vma, &vma->anon_vma_chain); 176 list_add(&avc->same_vma, &vma->anon_vma_chain);
172 177
173 spin_lock(&anon_vma->lock); 178 anon_vma_lock(anon_vma);
174 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 179 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
175 spin_unlock(&anon_vma->lock); 180 anon_vma_unlock(anon_vma);
176} 181}
177 182
178/* 183/*
@@ -224,9 +229,21 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
224 avc = anon_vma_chain_alloc(); 229 avc = anon_vma_chain_alloc();
225 if (!avc) 230 if (!avc)
226 goto out_error_free_anon_vma; 231 goto out_error_free_anon_vma;
227 anon_vma_chain_link(vma, avc, anon_vma); 232
233 /*
234 * The root anon_vma's spinlock is the lock actually used when we
235 * lock any of the anon_vmas in this anon_vma tree.
236 */
237 anon_vma->root = pvma->anon_vma->root;
238 /*
239 * With KSM refcounts, an anon_vma can stay around longer than the
240 * process it belongs to. The root anon_vma needs to be pinned
241 * until this anon_vma is freed, because the lock lives in the root.
242 */
243 get_anon_vma(anon_vma->root);
228 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 244 /* Mark this anon_vma as the one where our new (COWed) pages go. */
229 vma->anon_vma = anon_vma; 245 vma->anon_vma = anon_vma;
246 anon_vma_chain_link(vma, avc, anon_vma);
230 247
231 return 0; 248 return 0;
232 249
@@ -246,22 +263,29 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
246 if (!anon_vma) 263 if (!anon_vma)
247 return; 264 return;
248 265
249 spin_lock(&anon_vma->lock); 266 anon_vma_lock(anon_vma);
250 list_del(&anon_vma_chain->same_anon_vma); 267 list_del(&anon_vma_chain->same_anon_vma);
251 268
252 /* We must garbage collect the anon_vma if it's empty */ 269 /* We must garbage collect the anon_vma if it's empty */
253 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); 270 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
254 spin_unlock(&anon_vma->lock); 271 anon_vma_unlock(anon_vma);
255 272
256 if (empty) 273 if (empty) {
274 /* We no longer need the root anon_vma */
275 if (anon_vma->root != anon_vma)
276 drop_anon_vma(anon_vma->root);
257 anon_vma_free(anon_vma); 277 anon_vma_free(anon_vma);
278 }
258} 279}
259 280
260void unlink_anon_vmas(struct vm_area_struct *vma) 281void unlink_anon_vmas(struct vm_area_struct *vma)
261{ 282{
262 struct anon_vma_chain *avc, *next; 283 struct anon_vma_chain *avc, *next;
263 284
264 /* Unlink each anon_vma chained to the VMA. */ 285 /*
286 * Unlink each anon_vma chained to the VMA. This list is ordered
287 * from newest to oldest, ensuring the root anon_vma gets freed last.
288 */
265 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 289 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
266 anon_vma_unlink(avc); 290 anon_vma_unlink(avc);
267 list_del(&avc->same_vma); 291 list_del(&avc->same_vma);
@@ -302,7 +326,7 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
302 goto out; 326 goto out;
303 327
304 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 328 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
305 spin_lock(&anon_vma->lock); 329 anon_vma_lock(anon_vma);
306 return anon_vma; 330 return anon_vma;
307out: 331out:
308 rcu_read_unlock(); 332 rcu_read_unlock();
@@ -311,7 +335,7 @@ out:
311 335
312void page_unlock_anon_vma(struct anon_vma *anon_vma) 336void page_unlock_anon_vma(struct anon_vma *anon_vma)
313{ 337{
314 spin_unlock(&anon_vma->lock); 338 anon_vma_unlock(anon_vma);
315 rcu_read_unlock(); 339 rcu_read_unlock();
316} 340}
317 341
@@ -340,9 +364,10 @@ vma_address(struct page *page, struct vm_area_struct *vma)
340 */ 364 */
341unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 365unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
342{ 366{
343 if (PageAnon(page)) 367 if (PageAnon(page)) {
344 ; 368 if (vma->anon_vma->root != page_anon_vma(page)->root)
345 else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 369 return -EFAULT;
370 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
346 if (!vma->vm_file || 371 if (!vma->vm_file ||
347 vma->vm_file->f_mapping != page->mapping) 372 vma->vm_file->f_mapping != page->mapping)
348 return -EFAULT; 373 return -EFAULT;
@@ -743,14 +768,20 @@ static void __page_set_anon_rmap(struct page *page,
743 * If the page isn't exclusively mapped into this vma, 768 * If the page isn't exclusively mapped into this vma,
744 * we must use the _oldest_ possible anon_vma for the 769 * we must use the _oldest_ possible anon_vma for the
745 * page mapping! 770 * page mapping!
746 *
747 * So take the last AVC chain entry in the vma, which is
748 * the deepest ancestor, and use the anon_vma from that.
749 */ 771 */
750 if (!exclusive) { 772 if (!exclusive) {
751 struct anon_vma_chain *avc; 773 if (PageAnon(page))
752 avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma); 774 return;
753 anon_vma = avc->anon_vma; 775 anon_vma = anon_vma->root;
776 } else {
777 /*
778 * In this case, swapped-out-but-not-discarded swap-cache
779 * is remapped. So, no need to update page->mapping here.
 780 * We can be sure the anon_vma pointed to by page->mapping is not
 781 * obsolete, because vma->anon_vma is required to be in its family.
782 */
783 if (PageAnon(page))
784 return;
754 } 785 }
755 786
756 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 787 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -780,6 +811,7 @@ static void __page_check_anon_rmap(struct page *page,
780 * are initially only visible via the pagetables, and the pte is locked 811 * are initially only visible via the pagetables, and the pte is locked
781 * over the call to page_add_new_anon_rmap. 812 * over the call to page_add_new_anon_rmap.
782 */ 813 */
814 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
783 BUG_ON(page->index != linear_page_index(vma, address)); 815 BUG_ON(page->index != linear_page_index(vma, address));
784#endif 816#endif
785} 817}
@@ -798,6 +830,17 @@ static void __page_check_anon_rmap(struct page *page,
798void page_add_anon_rmap(struct page *page, 830void page_add_anon_rmap(struct page *page,
799 struct vm_area_struct *vma, unsigned long address) 831 struct vm_area_struct *vma, unsigned long address)
800{ 832{
833 do_page_add_anon_rmap(page, vma, address, 0);
834}
835
836/*
837 * Special version of the above for do_swap_page, which often runs
838 * into pages that are exclusively owned by the current process.
839 * Everybody else should continue to use page_add_anon_rmap above.
840 */
841void do_page_add_anon_rmap(struct page *page,
842 struct vm_area_struct *vma, unsigned long address, int exclusive)
843{
801 int first = atomic_inc_and_test(&page->_mapcount); 844 int first = atomic_inc_and_test(&page->_mapcount);
802 if (first) 845 if (first)
803 __inc_zone_page_state(page, NR_ANON_PAGES); 846 __inc_zone_page_state(page, NR_ANON_PAGES);
@@ -807,7 +850,7 @@ void page_add_anon_rmap(struct page *page,
807 VM_BUG_ON(!PageLocked(page)); 850 VM_BUG_ON(!PageLocked(page));
808 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 851 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
809 if (first) 852 if (first)
810 __page_set_anon_rmap(page, vma, address, 0); 853 __page_set_anon_rmap(page, vma, address, exclusive);
811 else 854 else
812 __page_check_anon_rmap(page, vma, address); 855 __page_check_anon_rmap(page, vma, address);
813} 856}
@@ -1368,6 +1411,42 @@ int try_to_munlock(struct page *page)
1368 return try_to_unmap_file(page, TTU_MUNLOCK); 1411 return try_to_unmap_file(page, TTU_MUNLOCK);
1369} 1412}
1370 1413
1414#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
1415/*
1416 * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
1417 * if necessary. Be careful to do all the tests under the lock. Once
1418 * we know we are the last user, nobody else can get a reference and we
1419 * can do the freeing without the lock.
1420 */
1421void drop_anon_vma(struct anon_vma *anon_vma)
1422{
1423 BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0);
1424 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
1425 struct anon_vma *root = anon_vma->root;
1426 int empty = list_empty(&anon_vma->head);
1427 int last_root_user = 0;
1428 int root_empty = 0;
1429
1430 /*
1431 * The refcount on a non-root anon_vma got dropped. Drop
1432 * the refcount on the root and check if we need to free it.
1433 */
1434 if (empty && anon_vma != root) {
1435 BUG_ON(atomic_read(&root->external_refcount) <= 0);
1436 last_root_user = atomic_dec_and_test(&root->external_refcount);
1437 root_empty = list_empty(&root->head);
1438 }
1439 anon_vma_unlock(anon_vma);
1440
1441 if (empty) {
1442 anon_vma_free(anon_vma);
1443 if (root_empty && last_root_user)
1444 anon_vma_free(root);
1445 }
1446 }
1447}
1448#endif
1449
1371#ifdef CONFIG_MIGRATION 1450#ifdef CONFIG_MIGRATION
1372/* 1451/*
1373 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): 1452 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
@@ -1389,7 +1468,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1389 anon_vma = page_anon_vma(page); 1468 anon_vma = page_anon_vma(page);
1390 if (!anon_vma) 1469 if (!anon_vma)
1391 return ret; 1470 return ret;
1392 spin_lock(&anon_vma->lock); 1471 anon_vma_lock(anon_vma);
1393 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1472 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1394 struct vm_area_struct *vma = avc->vma; 1473 struct vm_area_struct *vma = avc->vma;
1395 unsigned long address = vma_address(page, vma); 1474 unsigned long address = vma_address(page, vma);
@@ -1399,7 +1478,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1399 if (ret != SWAP_AGAIN) 1478 if (ret != SWAP_AGAIN)
1400 break; 1479 break;
1401 } 1480 }
1402 spin_unlock(&anon_vma->lock); 1481 anon_vma_unlock(anon_vma);
1403 return ret; 1482 return ret;
1404} 1483}
1405 1484
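
Editor's note: the rmap.c changes above hang every anon_vma off a root and, per the added comment, use the root anon_vma's spinlock whenever any member of the tree is locked, which is what the anon_vma_lock()/anon_vma_unlock() helpers replacing the open-coded spin_lock calls express. A simplified pthread-based sketch of the lock-through-root idea, not the kernel's anon_vma implementation:

#include <pthread.h>
#include <stdio.h>

/* Illustrative only: every node in a tree shares its root's mutex. */
struct node {
        struct node *root;
        pthread_mutex_t lock;   /* only meaningful on the root */
};

static void node_lock(struct node *n)   { pthread_mutex_lock(&n->root->lock); }
static void node_unlock(struct node *n) { pthread_mutex_unlock(&n->root->lock); }

int main(void)
{
        struct node root = { .root = &root, .lock = PTHREAD_MUTEX_INITIALIZER };
        struct node child = { .root = &root };

        node_lock(&child);      /* actually takes root.lock */
        printf("child locked via its root\n");
        node_unlock(&child);
        return 0;
}
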
diff --git a/mm/shmem.c b/mm/shmem.c
index f65f84062db5..566f9a481e64 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
28#include <linux/file.h> 28#include <linux/file.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/percpu_counter.h>
31#include <linux/swap.h> 32#include <linux/swap.h>
32 33
33static struct vfsmount *shm_mnt; 34static struct vfsmount *shm_mnt;
@@ -233,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages)
233{ 234{
234 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 235 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
235 if (sbinfo->max_blocks) { 236 if (sbinfo->max_blocks) {
236 spin_lock(&sbinfo->stat_lock); 237 percpu_counter_add(&sbinfo->used_blocks, -pages);
237 sbinfo->free_blocks += pages; 238 spin_lock(&inode->i_lock);
238 inode->i_blocks -= pages*BLOCKS_PER_PAGE; 239 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
239 spin_unlock(&sbinfo->stat_lock); 240 spin_unlock(&inode->i_lock);
240 } 241 }
241} 242}
242 243
@@ -416,19 +417,17 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
416 if (sgp == SGP_READ) 417 if (sgp == SGP_READ)
417 return shmem_swp_map(ZERO_PAGE(0)); 418 return shmem_swp_map(ZERO_PAGE(0));
418 /* 419 /*
419 * Test free_blocks against 1 not 0, since we have 1 data 420 * Test used_blocks against 1 less max_blocks, since we have 1 data
420 * page (and perhaps indirect index pages) yet to allocate: 421 * page (and perhaps indirect index pages) yet to allocate:
421 * a waste to allocate index if we cannot allocate data. 422 * a waste to allocate index if we cannot allocate data.
422 */ 423 */
423 if (sbinfo->max_blocks) { 424 if (sbinfo->max_blocks) {
424 spin_lock(&sbinfo->stat_lock); 425 if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0)
425 if (sbinfo->free_blocks <= 1) {
426 spin_unlock(&sbinfo->stat_lock);
427 return ERR_PTR(-ENOSPC); 426 return ERR_PTR(-ENOSPC);
428 } 427 percpu_counter_inc(&sbinfo->used_blocks);
429 sbinfo->free_blocks--; 428 spin_lock(&inode->i_lock);
430 inode->i_blocks += BLOCKS_PER_PAGE; 429 inode->i_blocks += BLOCKS_PER_PAGE;
431 spin_unlock(&sbinfo->stat_lock); 430 spin_unlock(&inode->i_lock);
432 } 431 }
433 432
434 spin_unlock(&info->lock); 433 spin_unlock(&info->lock);
@@ -1223,6 +1222,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1223 struct shmem_sb_info *sbinfo; 1222 struct shmem_sb_info *sbinfo;
1224 struct page *filepage = *pagep; 1223 struct page *filepage = *pagep;
1225 struct page *swappage; 1224 struct page *swappage;
1225 struct page *prealloc_page = NULL;
1226 swp_entry_t *entry; 1226 swp_entry_t *entry;
1227 swp_entry_t swap; 1227 swp_entry_t swap;
1228 gfp_t gfp; 1228 gfp_t gfp;
@@ -1247,7 +1247,6 @@ repeat:
1247 filepage = find_lock_page(mapping, idx); 1247 filepage = find_lock_page(mapping, idx);
1248 if (filepage && PageUptodate(filepage)) 1248 if (filepage && PageUptodate(filepage))
1249 goto done; 1249 goto done;
1250 error = 0;
1251 gfp = mapping_gfp_mask(mapping); 1250 gfp = mapping_gfp_mask(mapping);
1252 if (!filepage) { 1251 if (!filepage) {
1253 /* 1252 /*
@@ -1258,7 +1257,19 @@ repeat:
1258 if (error) 1257 if (error)
1259 goto failed; 1258 goto failed;
1260 radix_tree_preload_end(); 1259 radix_tree_preload_end();
1260 if (sgp != SGP_READ && !prealloc_page) {
1261 /* We don't care if this fails */
1262 prealloc_page = shmem_alloc_page(gfp, info, idx);
1263 if (prealloc_page) {
1264 if (mem_cgroup_cache_charge(prealloc_page,
1265 current->mm, GFP_KERNEL)) {
1266 page_cache_release(prealloc_page);
1267 prealloc_page = NULL;
1268 }
1269 }
1270 }
1261 } 1271 }
1272 error = 0;
1262 1273
1263 spin_lock(&info->lock); 1274 spin_lock(&info->lock);
1264 shmem_recalc_inode(inode); 1275 shmem_recalc_inode(inode);
@@ -1387,17 +1398,16 @@ repeat:
1387 shmem_swp_unmap(entry); 1398 shmem_swp_unmap(entry);
1388 sbinfo = SHMEM_SB(inode->i_sb); 1399 sbinfo = SHMEM_SB(inode->i_sb);
1389 if (sbinfo->max_blocks) { 1400 if (sbinfo->max_blocks) {
1390 spin_lock(&sbinfo->stat_lock); 1401 if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) ||
1391 if (sbinfo->free_blocks == 0 ||
1392 shmem_acct_block(info->flags)) { 1402 shmem_acct_block(info->flags)) {
1393 spin_unlock(&sbinfo->stat_lock);
1394 spin_unlock(&info->lock); 1403 spin_unlock(&info->lock);
1395 error = -ENOSPC; 1404 error = -ENOSPC;
1396 goto failed; 1405 goto failed;
1397 } 1406 }
1398 sbinfo->free_blocks--; 1407 percpu_counter_inc(&sbinfo->used_blocks);
1408 spin_lock(&inode->i_lock);
1399 inode->i_blocks += BLOCKS_PER_PAGE; 1409 inode->i_blocks += BLOCKS_PER_PAGE;
1400 spin_unlock(&sbinfo->stat_lock); 1410 spin_unlock(&inode->i_lock);
1401 } else if (shmem_acct_block(info->flags)) { 1411 } else if (shmem_acct_block(info->flags)) {
1402 spin_unlock(&info->lock); 1412 spin_unlock(&info->lock);
1403 error = -ENOSPC; 1413 error = -ENOSPC;
@@ -1407,28 +1417,38 @@ repeat:
1407 if (!filepage) { 1417 if (!filepage) {
1408 int ret; 1418 int ret;
1409 1419
1410 spin_unlock(&info->lock); 1420 if (!prealloc_page) {
1411 filepage = shmem_alloc_page(gfp, info, idx); 1421 spin_unlock(&info->lock);
1412 if (!filepage) { 1422 filepage = shmem_alloc_page(gfp, info, idx);
1413 shmem_unacct_blocks(info->flags, 1); 1423 if (!filepage) {
1414 shmem_free_blocks(inode, 1); 1424 shmem_unacct_blocks(info->flags, 1);
1415 error = -ENOMEM; 1425 shmem_free_blocks(inode, 1);
1416 goto failed; 1426 error = -ENOMEM;
1417 } 1427 goto failed;
1418 SetPageSwapBacked(filepage); 1428 }
1429 SetPageSwapBacked(filepage);
1419 1430
1420 /* Precharge page while we can wait, compensate after */ 1431 /*
1421 error = mem_cgroup_cache_charge(filepage, current->mm, 1432 * Precharge page while we can wait, compensate
1422 GFP_KERNEL); 1433 * after
1423 if (error) { 1434 */
1424 page_cache_release(filepage); 1435 error = mem_cgroup_cache_charge(filepage,
1425 shmem_unacct_blocks(info->flags, 1); 1436 current->mm, GFP_KERNEL);
1426 shmem_free_blocks(inode, 1); 1437 if (error) {
1427 filepage = NULL; 1438 page_cache_release(filepage);
1428 goto failed; 1439 shmem_unacct_blocks(info->flags, 1);
1440 shmem_free_blocks(inode, 1);
1441 filepage = NULL;
1442 goto failed;
1443 }
1444
1445 spin_lock(&info->lock);
1446 } else {
1447 filepage = prealloc_page;
1448 prealloc_page = NULL;
1449 SetPageSwapBacked(filepage);
1429 } 1450 }
1430 1451
1431 spin_lock(&info->lock);
1432 entry = shmem_swp_alloc(info, idx, sgp); 1452 entry = shmem_swp_alloc(info, idx, sgp);
1433 if (IS_ERR(entry)) 1453 if (IS_ERR(entry))
1434 error = PTR_ERR(entry); 1454 error = PTR_ERR(entry);
@@ -1469,13 +1489,19 @@ repeat:
1469 } 1489 }
1470done: 1490done:
1471 *pagep = filepage; 1491 *pagep = filepage;
1472 return 0; 1492 error = 0;
1493 goto out;
1473 1494
1474failed: 1495failed:
1475 if (*pagep != filepage) { 1496 if (*pagep != filepage) {
1476 unlock_page(filepage); 1497 unlock_page(filepage);
1477 page_cache_release(filepage); 1498 page_cache_release(filepage);
1478 } 1499 }
1500out:
1501 if (prealloc_page) {
1502 mem_cgroup_uncharge_cache_page(prealloc_page);
1503 page_cache_release(prealloc_page);
1504 }
1479 return error; 1505 return error;
1480} 1506}
1481 1507
@@ -1791,17 +1817,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1791 buf->f_type = TMPFS_MAGIC; 1817 buf->f_type = TMPFS_MAGIC;
1792 buf->f_bsize = PAGE_CACHE_SIZE; 1818 buf->f_bsize = PAGE_CACHE_SIZE;
1793 buf->f_namelen = NAME_MAX; 1819 buf->f_namelen = NAME_MAX;
1794 spin_lock(&sbinfo->stat_lock);
1795 if (sbinfo->max_blocks) { 1820 if (sbinfo->max_blocks) {
1796 buf->f_blocks = sbinfo->max_blocks; 1821 buf->f_blocks = sbinfo->max_blocks;
1797 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; 1822 buf->f_bavail = buf->f_bfree =
1823 sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
1798 } 1824 }
1799 if (sbinfo->max_inodes) { 1825 if (sbinfo->max_inodes) {
1800 buf->f_files = sbinfo->max_inodes; 1826 buf->f_files = sbinfo->max_inodes;
1801 buf->f_ffree = sbinfo->free_inodes; 1827 buf->f_ffree = sbinfo->free_inodes;
1802 } 1828 }
1803 /* else leave those fields 0 like simple_statfs */ 1829 /* else leave those fields 0 like simple_statfs */
1804 spin_unlock(&sbinfo->stat_lock);
1805 return 0; 1830 return 0;
1806} 1831}
1807 1832
@@ -2242,7 +2267,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2242{ 2267{
2243 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2268 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2244 struct shmem_sb_info config = *sbinfo; 2269 struct shmem_sb_info config = *sbinfo;
2245 unsigned long blocks;
2246 unsigned long inodes; 2270 unsigned long inodes;
2247 int error = -EINVAL; 2271 int error = -EINVAL;
2248 2272
@@ -2250,9 +2274,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2250 return error; 2274 return error;
2251 2275
2252 spin_lock(&sbinfo->stat_lock); 2276 spin_lock(&sbinfo->stat_lock);
2253 blocks = sbinfo->max_blocks - sbinfo->free_blocks;
2254 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 2277 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2255 if (config.max_blocks < blocks) 2278 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
2256 goto out; 2279 goto out;
2257 if (config.max_inodes < inodes) 2280 if (config.max_inodes < inodes)
2258 goto out; 2281 goto out;
@@ -2269,7 +2292,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2269 2292
2270 error = 0; 2293 error = 0;
2271 sbinfo->max_blocks = config.max_blocks; 2294 sbinfo->max_blocks = config.max_blocks;
2272 sbinfo->free_blocks = config.max_blocks - blocks;
2273 sbinfo->max_inodes = config.max_inodes; 2295 sbinfo->max_inodes = config.max_inodes;
2274 sbinfo->free_inodes = config.max_inodes - inodes; 2296 sbinfo->free_inodes = config.max_inodes - inodes;
2275 2297
@@ -2344,7 +2366,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2344#endif 2366#endif
2345 2367
2346 spin_lock_init(&sbinfo->stat_lock); 2368 spin_lock_init(&sbinfo->stat_lock);
2347 sbinfo->free_blocks = sbinfo->max_blocks; 2369 percpu_counter_init(&sbinfo->used_blocks, 0);
2348 sbinfo->free_inodes = sbinfo->max_inodes; 2370 sbinfo->free_inodes = sbinfo->max_inodes;
2349 2371
2350 sb->s_maxbytes = SHMEM_MAX_BYTES; 2372 sb->s_maxbytes = SHMEM_MAX_BYTES;
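
Editor's note: the shmem.c changes above replace the free_blocks field guarded by stat_lock with a percpu_counter of used blocks, so the common allocation path avoids a global spinlock and the exact sum is only computed where it matters (statfs, remount). A toy single-process sketch of the per-CPU-delta idea (fixed CPU count, no batching or locking; names invented):

#include <stdio.h>

#define NR_CPUS 4

struct pcpu_counter {
        long count;             /* global part */
        long pcpu[NR_CPUS];     /* per-CPU deltas, folded in lazily */
};

static void counter_add(struct pcpu_counter *c, int cpu, long delta)
{
        c->pcpu[cpu] += delta;  /* cheap: no shared cacheline in the real thing */
}

static long counter_sum(struct pcpu_counter *c)
{
        long sum = c->count;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                sum += c->pcpu[cpu];
        return sum;             /* exact but expensive: for statfs-like paths */
}

int main(void)
{
        struct pcpu_counter used_blocks = { 0 };
        long max_blocks = 100;

        counter_add(&used_blocks, 0, 10);
        counter_add(&used_blocks, 2, 5);
        printf("used %ld of %ld blocks\n", counter_sum(&used_blocks), max_blocks);
        return 0;
}
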
diff --git a/mm/slab.c b/mm/slab.c
index 736e497733d6..88435fcc8387 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -394,7 +394,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
394#define STATS_DEC_ACTIVE(x) do { } while (0) 394#define STATS_DEC_ACTIVE(x) do { } while (0)
395#define STATS_INC_ALLOCED(x) do { } while (0) 395#define STATS_INC_ALLOCED(x) do { } while (0)
396#define STATS_INC_GROWN(x) do { } while (0) 396#define STATS_INC_GROWN(x) do { } while (0)
397#define STATS_ADD_REAPED(x,y) do { } while (0) 397#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
398#define STATS_SET_HIGH(x) do { } while (0) 398#define STATS_SET_HIGH(x) do { } while (0)
399#define STATS_INC_ERR(x) do { } while (0) 399#define STATS_INC_ERR(x) do { } while (0)
400#define STATS_INC_NODEALLOCS(x) do { } while (0) 400#define STATS_INC_NODEALLOCS(x) do { } while (0)
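
Editor's note: the one-line slab.c change makes the no-op STATS_ADD_REAPED() stub still evaluate its second argument as (void)(y), which avoids "set but not used" warnings when slab statistics are compiled out. The same idiom in isolation:

#include <stdio.h>

/* When stats are disabled, still consume the argument to keep -Wunused quiet. */
#define STATS_ADD_REAPED(x, y) do { (void)(y); } while (0)

int main(void)
{
        int freed = 3;                  /* only consumed by the stats macro */

        STATS_ADD_REAPED(NULL, freed);  /* expands to (void)(freed); no real work */
        printf("ok\n");
        return 0;
}
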
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 03aa2d55f1a2..1f3f9c59a73a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,6 +47,8 @@ long nr_swap_pages;
47long total_swap_pages; 47long total_swap_pages;
48static int least_priority; 48static int least_priority;
49 49
50static bool swap_for_hibernation;
51
50static const char Bad_file[] = "Bad swap file entry "; 52static const char Bad_file[] = "Bad swap file entry ";
51static const char Unused_file[] = "Unused swap file entry "; 53static const char Unused_file[] = "Unused swap file entry ";
52static const char Bad_offset[] = "Bad swap offset entry "; 54static const char Bad_offset[] = "Bad swap offset entry ";
@@ -318,8 +320,10 @@ checks:
318 if (offset > si->highest_bit) 320 if (offset > si->highest_bit)
319 scan_base = offset = si->lowest_bit; 321 scan_base = offset = si->lowest_bit;
320 322
321 /* reuse swap entry of cache-only swap if not busy. */ 323 /* reuse swap entry of cache-only swap if not hibernation. */
322 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 324 if (vm_swap_full()
325 && usage == SWAP_HAS_CACHE
326 && si->swap_map[offset] == SWAP_HAS_CACHE) {
323 int swap_was_freed; 327 int swap_was_freed;
324 spin_unlock(&swap_lock); 328 spin_unlock(&swap_lock);
325 swap_was_freed = __try_to_reclaim_swap(si, offset); 329 swap_was_freed = __try_to_reclaim_swap(si, offset);
@@ -449,6 +453,8 @@ swp_entry_t get_swap_page(void)
449 spin_lock(&swap_lock); 453 spin_lock(&swap_lock);
450 if (nr_swap_pages <= 0) 454 if (nr_swap_pages <= 0)
451 goto noswap; 455 goto noswap;
456 if (swap_for_hibernation)
457 goto noswap;
452 nr_swap_pages--; 458 nr_swap_pages--;
453 459
454 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 460 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
@@ -481,28 +487,6 @@ noswap:
481 return (swp_entry_t) {0}; 487 return (swp_entry_t) {0};
482} 488}
483 489
484/* The only caller of this function is now susupend routine */
485swp_entry_t get_swap_page_of_type(int type)
486{
487 struct swap_info_struct *si;
488 pgoff_t offset;
489
490 spin_lock(&swap_lock);
491 si = swap_info[type];
492 if (si && (si->flags & SWP_WRITEOK)) {
493 nr_swap_pages--;
494 /* This is called for allocating swap entry, not cache */
495 offset = scan_swap_map(si, 1);
496 if (offset) {
497 spin_unlock(&swap_lock);
498 return swp_entry(type, offset);
499 }
500 nr_swap_pages++;
501 }
502 spin_unlock(&swap_lock);
503 return (swp_entry_t) {0};
504}
505
506static struct swap_info_struct *swap_info_get(swp_entry_t entry) 490static struct swap_info_struct *swap_info_get(swp_entry_t entry)
507{ 491{
508 struct swap_info_struct *p; 492 struct swap_info_struct *p;
@@ -762,6 +746,74 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
762#endif 746#endif
763 747
764#ifdef CONFIG_HIBERNATION 748#ifdef CONFIG_HIBERNATION
749
750static pgoff_t hibernation_offset[MAX_SWAPFILES];
751/*
752 * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise,
 753 * the saved swap_map[] image on disk would be incomplete, because it
 754 * would keep changing without synchronization with the hibernation snapshot.
 755 * At resume, we just set swap_for_hibernation=false, so the entries used
 756 * in the meantime can be forgotten easily.
757 */
758void hibernation_freeze_swap(void)
759{
760 int i;
761
762 spin_lock(&swap_lock);
763
764 printk(KERN_INFO "PM: Freeze Swap\n");
765 swap_for_hibernation = true;
766 for (i = 0; i < MAX_SWAPFILES; i++)
767 hibernation_offset[i] = 1;
768 spin_unlock(&swap_lock);
769}
770
771void hibernation_thaw_swap(void)
772{
773 spin_lock(&swap_lock);
774 if (swap_for_hibernation) {
775 printk(KERN_INFO "PM: Thaw Swap\n");
776 swap_for_hibernation = false;
777 }
778 spin_unlock(&swap_lock);
779}
780
781/*
 782 * Because updating swap_map[] would change state that is not saved in
 783 * the snapshot, we use our own simple allocator.
 784 * Please see kernel/power/swap.c; used swap entries are recorded in
 785 * an RB-tree.
786 */
787swp_entry_t get_swap_for_hibernation(int type)
788{
789 pgoff_t off;
790 swp_entry_t val = {0};
791 struct swap_info_struct *si;
792
793 spin_lock(&swap_lock);
794
795 si = swap_info[type];
796 if (!si || !(si->flags & SWP_WRITEOK))
797 goto done;
798
799 for (off = hibernation_offset[type]; off < si->max; ++off) {
800 if (!si->swap_map[off])
801 break;
802 }
803 if (off < si->max) {
804 val = swp_entry(type, off);
805 hibernation_offset[type] = off + 1;
806 }
807done:
808 spin_unlock(&swap_lock);
809 return val;
810}
811
812void swap_free_for_hibernation(swp_entry_t ent)
813{
814 /* Nothing to do */
815}
816
765/* 817/*
766 * Find the swap type that corresponds to given device (if any). 818 * Find the swap type that corresponds to given device (if any).
767 * 819 *
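
Editor's note: get_swap_for_hibernation() above deliberately avoids touching swap_map[]: it scans forward from a remembered per-device offset (hibernation_offset[]) to the next unused slot, so the swap_map[] image written to disk stays consistent with the snapshot. A minimal sketch of that linear allocator over a plain array (no locking; names invented):

#include <stdio.h>

#define MAX_SLOTS 8

static unsigned char swap_map[MAX_SLOTS] = { 1, 1, 0, 1, 0, 0, 1, 0 };
static long next_offset = 1;    /* like hibernation_offset[]: start past slot 0 */

/* Return the next free slot without marking it used in swap_map[]. */
static long get_slot_for_hibernation(void)
{
        long off;

        for (off = next_offset; off < MAX_SLOTS; off++)
                if (!swap_map[off])
                        break;
        if (off >= MAX_SLOTS)
                return -1;      /* nothing free */
        next_offset = off + 1;  /* remember where to resume scanning */
        return off;
}

int main(void)
{
        for (int i = 0; i < 4; i++)
                printf("allocated slot %ld\n", get_slot_for_hibernation());
        return 0;
}
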
diff --git a/mm/util.c b/mm/util.c
index f5712e8964be..4735ea481816 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -225,15 +225,10 @@ char *strndup_user(const char __user *s, long n)
225 if (length > n) 225 if (length > n)
226 return ERR_PTR(-EINVAL); 226 return ERR_PTR(-EINVAL);
227 227
228 p = kmalloc(length, GFP_KERNEL); 228 p = memdup_user(s, length);
229 229
230 if (!p) 230 if (IS_ERR(p))
231 return ERR_PTR(-ENOMEM); 231 return p;
232
233 if (copy_from_user(p, s, length)) {
234 kfree(p);
235 return ERR_PTR(-EFAULT);
236 }
237 232
238 p[length - 1] = '\0'; 233 p[length - 1] = '\0';
239 234
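
Editor's note: strndup_user() above drops its open-coded kmalloc()/copy_from_user() pair in favour of memdup_user(), which already handles both the allocation-failure and partial-copy error paths. A userspace analogue of why the helper is preferable (plain malloc/memcpy stand in for the kernel primitives):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Rough stand-in for memdup_user(): allocate and copy, or return NULL. */
static void *memdup(const void *src, size_t len)
{
        void *p = malloc(len);

        if (p)
                memcpy(p, src, len);
        return p;
}

static char *strndup_simple(const char *s, size_t n)
{
        size_t length = strnlen(s, n) + 1;      /* include the terminator */
        char *p = memdup(s, length);

        if (!p)
                return NULL;
        p[length - 1] = '\0';                   /* force termination, as the kernel does */
        return p;
}

int main(void)
{
        char *copy = strndup_simple("hello world", 5);

        printf("%s\n", copy ? copy : "(alloc failed)");
        free(copy);
        return 0;
}
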
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b7e314b1009f..918c51335d64 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -732,7 +732,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
732 node, gfp_mask); 732 node, gfp_mask);
733 if (unlikely(IS_ERR(va))) { 733 if (unlikely(IS_ERR(va))) {
734 kfree(vb); 734 kfree(vb);
735 return ERR_PTR(PTR_ERR(va)); 735 return ERR_CAST(va);
736 } 736 }
737 737
738 err = radix_tree_preload(gfp_mask); 738 err = radix_tree_preload(gfp_mask);
@@ -2437,8 +2437,11 @@ static int vmalloc_open(struct inode *inode, struct file *file)
2437 unsigned int *ptr = NULL; 2437 unsigned int *ptr = NULL;
2438 int ret; 2438 int ret;
2439 2439
2440 if (NUMA_BUILD) 2440 if (NUMA_BUILD) {
2441 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2441 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
2442 if (ptr == NULL)
2443 return -ENOMEM;
2444 }
2442 ret = seq_open(file, &vmalloc_op); 2445 ret = seq_open(file, &vmalloc_op);
2443 if (!ret) { 2446 if (!ret) {
2444 struct seq_file *m = file->private_data; 2447 struct seq_file *m = file->private_data;
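
Editor's note: the vmalloc.c hunks replace ERR_PTR(PTR_ERR(va)) with ERR_CAST(va), which hands the encoded errno through without decoding and re-encoding it, and add the missing -ENOMEM check after kmalloc() in vmalloc_open(). A compact userspace sketch of the error-pointer encoding these macros rely on (simplified re-implementations, not the kernel headers):

#include <stdio.h>

/* Simplified helpers in the spirit of include/linux/err.h. */
#define MAX_ERRNO 4095
static inline void *ERR_PTR(long err)       { return (void *)err; }
static inline long  PTR_ERR(const void *p)  { return (long)p; }
static inline int   IS_ERR(const void *p)   { return (unsigned long)p >= (unsigned long)-MAX_ERRNO; }
static inline void *ERR_CAST(const void *p) { return (void *)p; }  /* just a cast */

struct vmap_area  { int dummy; };
struct vmap_block { int dummy; };

static struct vmap_block *new_block(struct vmap_area *va)
{
        if (IS_ERR(va))
                return ERR_CAST(va);    /* propagate the encoded errno unchanged */
        return NULL;                    /* real allocation elided */
}

int main(void)
{
        struct vmap_block *vb = new_block(ERR_PTR(-12 /* ENOMEM */));

        if (IS_ERR(vb))
                printf("propagated error %ld\n", PTR_ERR(vb));
        return 0;
}
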
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94fe1b3da43..ec5ddccbf82e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,9 @@
48 48
49#include "internal.h" 49#include "internal.h"
50 50
51#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h>
53
51struct scan_control { 54struct scan_control {
52 /* Incremented by the number of inactive pages that were scanned */ 55 /* Incremented by the number of inactive pages that were scanned */
53 unsigned long nr_scanned; 56 unsigned long nr_scanned;
@@ -398,6 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
398 /* synchronous write or broken a_ops? */ 401 /* synchronous write or broken a_ops? */
399 ClearPageReclaim(page); 402 ClearPageReclaim(page);
400 } 403 }
404 trace_mm_vmscan_writepage(page,
405 trace_reclaim_flags(page, sync_writeback));
401 inc_zone_page_state(page, NR_VMSCAN_WRITE); 406 inc_zone_page_state(page, NR_VMSCAN_WRITE);
402 return PAGE_SUCCESS; 407 return PAGE_SUCCESS;
403 } 408 }
@@ -617,6 +622,24 @@ static enum page_references page_check_references(struct page *page,
617 return PAGEREF_RECLAIM; 622 return PAGEREF_RECLAIM;
618} 623}
619 624
625static noinline_for_stack void free_page_list(struct list_head *free_pages)
626{
627 struct pagevec freed_pvec;
628 struct page *page, *tmp;
629
630 pagevec_init(&freed_pvec, 1);
631
632 list_for_each_entry_safe(page, tmp, free_pages, lru) {
633 list_del(&page->lru);
634 if (!pagevec_add(&freed_pvec, page)) {
635 __pagevec_free(&freed_pvec);
636 pagevec_reinit(&freed_pvec);
637 }
638 }
639
640 pagevec_free(&freed_pvec);
641}
642
620/* 643/*
621 * shrink_page_list() returns the number of reclaimed pages 644 * shrink_page_list() returns the number of reclaimed pages
622 */ 645 */
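
Editor's note: free_page_list() above lets shrink_page_list() collect reclaimed pages on a local list and release them through a pagevec in batches, instead of interleaving pagevec bookkeeping with the main reclaim loop. A small userspace sketch of the same drain-in-batches pattern (a fixed-size buffer stands in for the pagevec):

#include <stdio.h>

#define BATCH 4         /* stands in for PAGEVEC_SIZE */

static void flush(const int *buf, int n)
{
        /* In the kernel this would be __pagevec_free(). */
        for (int i = 0; i < n; i++)
                printf("freeing page %d\n", buf[i]);
}

static void free_page_list(const int *pages, int nr)
{
        int buf[BATCH];
        int used = 0;

        for (int i = 0; i < nr; i++) {
                buf[used++] = pages[i];
                if (used == BATCH) {    /* batch full: release and start over */
                        flush(buf, used);
                        used = 0;
                }
        }
        flush(buf, used);               /* release the remainder */
}

int main(void)
{
        int pages[] = { 10, 11, 12, 13, 14, 15 };

        free_page_list(pages, 6);
        return 0;
}
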
@@ -625,13 +648,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
625 enum pageout_io sync_writeback) 648 enum pageout_io sync_writeback)
626{ 649{
627 LIST_HEAD(ret_pages); 650 LIST_HEAD(ret_pages);
628 struct pagevec freed_pvec; 651 LIST_HEAD(free_pages);
629 int pgactivate = 0; 652 int pgactivate = 0;
630 unsigned long nr_reclaimed = 0; 653 unsigned long nr_reclaimed = 0;
631 654
632 cond_resched(); 655 cond_resched();
633 656
634 pagevec_init(&freed_pvec, 1);
635 while (!list_empty(page_list)) { 657 while (!list_empty(page_list)) {
636 enum page_references references; 658 enum page_references references;
637 struct address_space *mapping; 659 struct address_space *mapping;
@@ -806,10 +828,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
806 __clear_page_locked(page); 828 __clear_page_locked(page);
807free_it: 829free_it:
808 nr_reclaimed++; 830 nr_reclaimed++;
809 if (!pagevec_add(&freed_pvec, page)) { 831
810 __pagevec_free(&freed_pvec); 832 /*
811 pagevec_reinit(&freed_pvec); 833 * Is there need to periodically free_page_list? It would
812 } 834 * appear not as the counts should be low
835 */
836 list_add(&page->lru, &free_pages);
813 continue; 837 continue;
814 838
815cull_mlocked: 839cull_mlocked:
@@ -832,9 +856,10 @@ keep:
832 list_add(&page->lru, &ret_pages); 856 list_add(&page->lru, &ret_pages);
833 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 857 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
834 } 858 }
859
860 free_page_list(&free_pages);
861
835 list_splice(&ret_pages, page_list); 862 list_splice(&ret_pages, page_list);
836 if (pagevec_count(&freed_pvec))
837 __pagevec_free(&freed_pvec);
838 count_vm_events(PGACTIVATE, pgactivate); 863 count_vm_events(PGACTIVATE, pgactivate);
839 return nr_reclaimed; 864 return nr_reclaimed;
840} 865}
@@ -916,6 +941,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
916 unsigned long *scanned, int order, int mode, int file) 941 unsigned long *scanned, int order, int mode, int file)
917{ 942{
918 unsigned long nr_taken = 0; 943 unsigned long nr_taken = 0;
944 unsigned long nr_lumpy_taken = 0;
945 unsigned long nr_lumpy_dirty = 0;
946 unsigned long nr_lumpy_failed = 0;
919 unsigned long scan; 947 unsigned long scan;
920 948
921 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 949 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -993,12 +1021,25 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
993 list_move(&cursor_page->lru, dst); 1021 list_move(&cursor_page->lru, dst);
994 mem_cgroup_del_lru(cursor_page); 1022 mem_cgroup_del_lru(cursor_page);
995 nr_taken++; 1023 nr_taken++;
1024 nr_lumpy_taken++;
1025 if (PageDirty(cursor_page))
1026 nr_lumpy_dirty++;
996 scan++; 1027 scan++;
1028 } else {
1029 if (mode == ISOLATE_BOTH &&
1030 page_count(cursor_page))
1031 nr_lumpy_failed++;
997 } 1032 }
998 } 1033 }
999 } 1034 }
1000 1035
1001 *scanned = scan; 1036 *scanned = scan;
1037
1038 trace_mm_vmscan_lru_isolate(order,
1039 nr_to_scan, scan,
1040 nr_taken,
1041 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1042 mode);
1002 return nr_taken; 1043 return nr_taken;
1003} 1044}
1004 1045
@@ -1035,7 +1076,8 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1035 ClearPageActive(page); 1076 ClearPageActive(page);
1036 nr_active++; 1077 nr_active++;
1037 } 1078 }
1038 count[lru]++; 1079 if (count)
1080 count[lru]++;
1039 } 1081 }
1040 1082
1041 return nr_active; 1083 return nr_active;
@@ -1112,174 +1154,212 @@ static int too_many_isolated(struct zone *zone, int file,
1112} 1154}
1113 1155
1114/* 1156/*
1115 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1157 * TODO: Try merging with migrations version of putback_lru_pages
1116 * of reclaimed pages
1117 */ 1158 */
1118static unsigned long shrink_inactive_list(unsigned long max_scan, 1159static noinline_for_stack void
1119 struct zone *zone, struct scan_control *sc, 1160putback_lru_pages(struct zone *zone, struct scan_control *sc,
1120 int priority, int file) 1161 unsigned long nr_anon, unsigned long nr_file,
1162 struct list_head *page_list)
1121{ 1163{
1122 LIST_HEAD(page_list); 1164 struct page *page;
1123 struct pagevec pvec; 1165 struct pagevec pvec;
1124 unsigned long nr_scanned = 0;
1125 unsigned long nr_reclaimed = 0;
1126 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1166 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1127 1167
1128 while (unlikely(too_many_isolated(zone, file, sc))) { 1168 pagevec_init(&pvec, 1);
1129 congestion_wait(BLK_RW_ASYNC, HZ/10);
1130 1169
1131 /* We are about to die and free our memory. Return now. */ 1170 /*
1132 if (fatal_signal_pending(current)) 1171 * Put back any unfreeable pages.
1133 return SWAP_CLUSTER_MAX; 1172 */
1173 spin_lock(&zone->lru_lock);
1174 while (!list_empty(page_list)) {
1175 int lru;
1176 page = lru_to_page(page_list);
1177 VM_BUG_ON(PageLRU(page));
1178 list_del(&page->lru);
1179 if (unlikely(!page_evictable(page, NULL))) {
1180 spin_unlock_irq(&zone->lru_lock);
1181 putback_lru_page(page);
1182 spin_lock_irq(&zone->lru_lock);
1183 continue;
1184 }
1185 SetPageLRU(page);
1186 lru = page_lru(page);
1187 add_page_to_lru_list(zone, page, lru);
1188 if (is_active_lru(lru)) {
1189 int file = is_file_lru(lru);
1190 reclaim_stat->recent_rotated[file]++;
1191 }
1192 if (!pagevec_add(&pvec, page)) {
1193 spin_unlock_irq(&zone->lru_lock);
1194 __pagevec_release(&pvec);
1195 spin_lock_irq(&zone->lru_lock);
1196 }
1134 } 1197 }
1198 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1199 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1135 1200
1201 spin_unlock_irq(&zone->lru_lock);
1202 pagevec_release(&pvec);
1203}
1136 1204
1137 pagevec_init(&pvec, 1); 1205static noinline_for_stack void update_isolated_counts(struct zone *zone,
1206 struct scan_control *sc,
1207 unsigned long *nr_anon,
1208 unsigned long *nr_file,
1209 struct list_head *isolated_list)
1210{
1211 unsigned long nr_active;
1212 unsigned int count[NR_LRU_LISTS] = { 0, };
1213 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1138 1214
1139 lru_add_drain(); 1215 nr_active = clear_active_flags(isolated_list, count);
1140 spin_lock_irq(&zone->lru_lock); 1216 __count_vm_events(PGDEACTIVATE, nr_active);
1141 do {
1142 struct page *page;
1143 unsigned long nr_taken;
1144 unsigned long nr_scan;
1145 unsigned long nr_freed;
1146 unsigned long nr_active;
1147 unsigned int count[NR_LRU_LISTS] = { 0, };
1148 int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1149 unsigned long nr_anon;
1150 unsigned long nr_file;
1151 1217
1152 if (scanning_global_lru(sc)) { 1218 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1153 nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX, 1219 -count[LRU_ACTIVE_FILE]);
1154 &page_list, &nr_scan, 1220 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1155 sc->order, mode, 1221 -count[LRU_INACTIVE_FILE]);
1156 zone, 0, file); 1222 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1157 zone->pages_scanned += nr_scan; 1223 -count[LRU_ACTIVE_ANON]);
1158 if (current_is_kswapd()) 1224 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1159 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1225 -count[LRU_INACTIVE_ANON]);
1160 nr_scan);
1161 else
1162 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1163 nr_scan);
1164 } else {
1165 nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
1166 &page_list, &nr_scan,
1167 sc->order, mode,
1168 zone, sc->mem_cgroup,
1169 0, file);
1170 /*
1171 * mem_cgroup_isolate_pages() keeps track of
1172 * scanned pages on its own.
1173 */
1174 }
1175 1226
1176 if (nr_taken == 0) 1227 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1177 goto done; 1228 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1229 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1230 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1231
1232 reclaim_stat->recent_scanned[0] += *nr_anon;
1233 reclaim_stat->recent_scanned[1] += *nr_file;
1234}
1178 1235
1179 nr_active = clear_active_flags(&page_list, count); 1236/*
1180 __count_vm_events(PGDEACTIVATE, nr_active); 1237 * Returns true if the caller should wait to clean dirty/writeback pages.
1238 *
1239 * If we are direct reclaiming for contiguous pages and we do not reclaim
1240 * everything in the list, try again and wait for writeback IO to complete.
1241 * This will stall high-order allocations noticeably. Only do that when we
 1242 * really need to free pages under high memory pressure.
1243 */
1244static inline bool should_reclaim_stall(unsigned long nr_taken,
1245 unsigned long nr_freed,
1246 int priority,
1247 struct scan_control *sc)
1248{
1249 int lumpy_stall_priority;
1181 1250
1182 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1251 /* kswapd should not stall on sync IO */
1183 -count[LRU_ACTIVE_FILE]); 1252 if (current_is_kswapd())
1184 __mod_zone_page_state(zone, NR_INACTIVE_FILE, 1253 return false;
1185 -count[LRU_INACTIVE_FILE]);
1186 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1187 -count[LRU_ACTIVE_ANON]);
1188 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1189 -count[LRU_INACTIVE_ANON]);
1190 1254
1191 nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1255 /* Only stall on lumpy reclaim */
1192 nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1256 if (!sc->lumpy_reclaim_mode)
1193 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); 1257 return false;
1194 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1195 1258
1196 reclaim_stat->recent_scanned[0] += nr_anon; 1259 /* If we have reclaimed everything on the isolated list, no stall */
1197 reclaim_stat->recent_scanned[1] += nr_file; 1260 if (nr_freed == nr_taken)
1261 return false;
1198 1262
1199 spin_unlock_irq(&zone->lru_lock); 1263 /*
1264 * For high-order allocations, there are two stall thresholds.
1265 * High-cost allocations stall immediately whereas lower
1266 * order allocations such as stacks require the scanning
1267 * priority to be much higher before stalling.
1268 */
1269 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1270 lumpy_stall_priority = DEF_PRIORITY;
1271 else
1272 lumpy_stall_priority = DEF_PRIORITY / 3;
1273
1274 return priority <= lumpy_stall_priority;
1275}
1276
1277/*
1278 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1279 * of reclaimed pages
1280 */
1281static noinline_for_stack unsigned long
1282shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1283 struct scan_control *sc, int priority, int file)
1284{
1285 LIST_HEAD(page_list);
1286 unsigned long nr_scanned;
1287 unsigned long nr_reclaimed = 0;
1288 unsigned long nr_taken;
1289 unsigned long nr_active;
1290 unsigned long nr_anon;
1291 unsigned long nr_file;
1200 1292
1201 nr_scanned += nr_scan; 1293 while (unlikely(too_many_isolated(zone, file, sc))) {
1202 nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 1294 congestion_wait(BLK_RW_ASYNC, HZ/10);
1295
1296 /* We are about to die and free our memory. Return now. */
1297 if (fatal_signal_pending(current))
1298 return SWAP_CLUSTER_MAX;
1299 }
1203 1300
1301
1302 lru_add_drain();
1303 spin_lock_irq(&zone->lru_lock);
1304
1305 if (scanning_global_lru(sc)) {
1306 nr_taken = isolate_pages_global(nr_to_scan,
1307 &page_list, &nr_scanned, sc->order,
1308 sc->lumpy_reclaim_mode ?
1309 ISOLATE_BOTH : ISOLATE_INACTIVE,
1310 zone, 0, file);
1311 zone->pages_scanned += nr_scanned;
1312 if (current_is_kswapd())
1313 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1314 nr_scanned);
1315 else
1316 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1317 nr_scanned);
1318 } else {
1319 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1320 &page_list, &nr_scanned, sc->order,
1321 sc->lumpy_reclaim_mode ?
1322 ISOLATE_BOTH : ISOLATE_INACTIVE,
1323 zone, sc->mem_cgroup,
1324 0, file);
1204 /* 1325 /*
1205 * If we are direct reclaiming for contiguous pages and we do 1326 * mem_cgroup_isolate_pages() keeps track of
1206 * not reclaim everything in the list, try again and wait 1327 * scanned pages on its own.
1207 * for IO to complete. This will stall high-order allocations
1208 * but that should be acceptable to the caller
1209 */ 1328 */
1210 if (nr_freed < nr_taken && !current_is_kswapd() && 1329 }
1211 sc->lumpy_reclaim_mode) {
1212 congestion_wait(BLK_RW_ASYNC, HZ/10);
1213 1330
1214 /* 1331 if (nr_taken == 0) {
1215 * The attempt at page out may have made some 1332 spin_unlock_irq(&zone->lru_lock);
1216 * of the pages active, mark them inactive again. 1333 return 0;
1217 */ 1334 }
1218 nr_active = clear_active_flags(&page_list, count);
1219 count_vm_events(PGDEACTIVATE, nr_active);
1220 1335
1221 nr_freed += shrink_page_list(&page_list, sc, 1336 update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
1222 PAGEOUT_IO_SYNC);
1223 }
1224 1337
1225 nr_reclaimed += nr_freed; 1338 spin_unlock_irq(&zone->lru_lock);
1226 1339
1227 local_irq_disable(); 1340 nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
1228 if (current_is_kswapd()) 1341
1229 __count_vm_events(KSWAPD_STEAL, nr_freed); 1342 /* Check if we should synchronously wait for writeback */
1230 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1343 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1344 congestion_wait(BLK_RW_ASYNC, HZ/10);
1231 1345
1232 spin_lock(&zone->lru_lock);
1233 /* 1346 /*
1234 * Put back any unfreeable pages. 1347 * The attempt at page out may have made some
1348 * of the pages active, mark them inactive again.
1235 */ 1349 */
1236 while (!list_empty(&page_list)) { 1350 nr_active = clear_active_flags(&page_list, NULL);
1237 int lru; 1351 count_vm_events(PGDEACTIVATE, nr_active);
1238 page = lru_to_page(&page_list);
1239 VM_BUG_ON(PageLRU(page));
1240 list_del(&page->lru);
1241 if (unlikely(!page_evictable(page, NULL))) {
1242 spin_unlock_irq(&zone->lru_lock);
1243 putback_lru_page(page);
1244 spin_lock_irq(&zone->lru_lock);
1245 continue;
1246 }
1247 SetPageLRU(page);
1248 lru = page_lru(page);
1249 add_page_to_lru_list(zone, page, lru);
1250 if (is_active_lru(lru)) {
1251 int file = is_file_lru(lru);
1252 reclaim_stat->recent_rotated[file]++;
1253 }
1254 if (!pagevec_add(&pvec, page)) {
1255 spin_unlock_irq(&zone->lru_lock);
1256 __pagevec_release(&pvec);
1257 spin_lock_irq(&zone->lru_lock);
1258 }
1259 }
1260 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1261 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1262 1352
1263 } while (nr_scanned < max_scan); 1353 nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
1354 }
1264 1355
1265done: 1356 local_irq_disable();
1266 spin_unlock_irq(&zone->lru_lock); 1357 if (current_is_kswapd())
1267 pagevec_release(&pvec); 1358 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1268 return nr_reclaimed; 1359 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1269}
1270 1360
1271/* 1361 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1272 * We are about to scan this zone at a certain priority level. If that priority 1362 return nr_reclaimed;
1273 * level is smaller (ie: more urgent) than the previous priority, then note
1274 * that priority level within the zone. This is done so that when the next
1275 * process comes in to scan this zone, it will immediately start out at this
1276 * priority level rather than having to build up its own scanning priority.
1277 * Here, this priority affects only the reclaim-mapped threshold.
1278 */
1279static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1280{
1281 if (priority < zone->prev_priority)
1282 zone->prev_priority = priority;
1283} 1363}
1284 1364
1285/* 1365/*
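To make the new stall heuristic concrete, here is a minimal user-space sketch of the threshold arithmetic in should_reclaim_stall() above. DEF_PRIORITY == 12 and PAGE_ALLOC_COSTLY_ORDER == 3 are assumed to match this kernel; the program is illustrative only and not part of the patch.

    #include <stdio.h>

    #define DEF_PRIORITY            12      /* assumed value for this kernel */
    #define PAGE_ALLOC_COSTLY_ORDER 3       /* assumed value */

    /* Mirrors the threshold selection in should_reclaim_stall() above. */
    static int lumpy_stall_priority(int order)
    {
            return order > PAGE_ALLOC_COSTLY_ORDER ? DEF_PRIORITY : DEF_PRIORITY / 3;
    }

    int main(void)
    {
            /* A costly order-9 request may stall at any scanning priority. */
            printf("order 9: stall when priority <= %d\n", lumpy_stall_priority(9));
            /* A small order-2 request only stalls under real memory pressure. */
            printf("order 2: stall when priority <= %d\n", lumpy_stall_priority(2));
            return 0;
    }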
@@ -1583,6 +1663,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1583 } 1663 }
1584 1664
1585 /* 1665 /*
1666 * With swappiness at 100, anonymous and file have the same priority.
1667 * This scanning priority is essentially the inverse of IO cost.
1668 */
1669 anon_prio = sc->swappiness;
1670 file_prio = 200 - sc->swappiness;
1671
1672 /*
1586 * OK, so we have swap space and a fair amount of page cache 1673 * OK, so we have swap space and a fair amount of page cache
1587 * pages. We use the recently rotated / recently scanned 1674 * pages. We use the recently rotated / recently scanned
1588 * ratios to determine how valuable each cache is. 1675 * ratios to determine how valuable each cache is.
@@ -1593,28 +1680,18 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1593 * 1680 *
1594 * anon in [0], file in [1] 1681 * anon in [0], file in [1]
1595 */ 1682 */
1683 spin_lock_irq(&zone->lru_lock);
1596 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1684 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1597 spin_lock_irq(&zone->lru_lock);
1598 reclaim_stat->recent_scanned[0] /= 2; 1685 reclaim_stat->recent_scanned[0] /= 2;
1599 reclaim_stat->recent_rotated[0] /= 2; 1686 reclaim_stat->recent_rotated[0] /= 2;
1600 spin_unlock_irq(&zone->lru_lock);
1601 } 1687 }
1602 1688
1603 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { 1689 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1604 spin_lock_irq(&zone->lru_lock);
1605 reclaim_stat->recent_scanned[1] /= 2; 1690 reclaim_stat->recent_scanned[1] /= 2;
1606 reclaim_stat->recent_rotated[1] /= 2; 1691 reclaim_stat->recent_rotated[1] /= 2;
1607 spin_unlock_irq(&zone->lru_lock);
1608 } 1692 }
1609 1693
1610 /* 1694 /*
1611 * With swappiness at 100, anonymous and file have the same priority.
1612 * This scanning priority is essentially the inverse of IO cost.
1613 */
1614 anon_prio = sc->swappiness;
1615 file_prio = 200 - sc->swappiness;
1616
1617 /*
1618 * The amount of pressure on anon vs file pages is inversely 1695 * The amount of pressure on anon vs file pages is inversely
1619 * proportional to the fraction of recently scanned pages on 1696 * proportional to the fraction of recently scanned pages on
1620 * each list that were recently referenced and in active use. 1697 * each list that were recently referenced and in active use.
@@ -1624,6 +1701,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1624 1701
1625 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1702 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1626 fp /= reclaim_stat->recent_rotated[1] + 1; 1703 fp /= reclaim_stat->recent_rotated[1] + 1;
1704 spin_unlock_irq(&zone->lru_lock);
1627 1705
1628 fraction[0] = ap; 1706 fraction[0] = ap;
1629 fraction[1] = fp; 1707 fraction[1] = fp;
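The reworked locking above leaves the ap/fp calculation itself unchanged. A stand-alone sketch of that arithmetic, with a made-up swappiness value and invented recent_scanned/recent_rotated statistics (only the formula mirrors get_scan_count()):

    #include <stdio.h>

    /* Stand-alone sketch of the anon/file pressure ratio in get_scan_count().
     * The reclaim statistics below are made up for illustration. */
    int main(void)
    {
            unsigned int swappiness = 60;                   /* default vm.swappiness */
            unsigned long anon_prio = swappiness;           /* 60 */
            unsigned long file_prio = 200 - swappiness;     /* 140 */

            unsigned long recent_scanned[2] = { 4000, 12000 };  /* [0]=anon, [1]=file */
            unsigned long recent_rotated[2] = { 3000, 1000 };

            unsigned long ap = (anon_prio + 1) * (recent_scanned[0] + 1) /
                               (recent_rotated[0] + 1);
            unsigned long fp = (file_prio + 1) * (recent_scanned[1] + 1) /
                               (recent_rotated[1] + 1);

            /* Heavily rotated (recently referenced) anon pages get less pressure. */
            printf("fraction[0] (anon) = %lu, fraction[1] (file) = %lu\n", ap, fp);
            return 0;
    }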
@@ -1729,13 +1807,12 @@ static void shrink_zone(int priority, struct zone *zone,
1729static bool shrink_zones(int priority, struct zonelist *zonelist, 1807static bool shrink_zones(int priority, struct zonelist *zonelist,
1730 struct scan_control *sc) 1808 struct scan_control *sc)
1731{ 1809{
1732 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1733 struct zoneref *z; 1810 struct zoneref *z;
1734 struct zone *zone; 1811 struct zone *zone;
1735 bool all_unreclaimable = true; 1812 bool all_unreclaimable = true;
1736 1813
1737 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1814 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1738 sc->nodemask) { 1815 gfp_zone(sc->gfp_mask), sc->nodemask) {
1739 if (!populated_zone(zone)) 1816 if (!populated_zone(zone))
1740 continue; 1817 continue;
1741 /* 1818 /*
@@ -1745,17 +1822,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
1745 if (scanning_global_lru(sc)) { 1822 if (scanning_global_lru(sc)) {
1746 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1823 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1747 continue; 1824 continue;
1748 note_zone_scanning_priority(zone, priority);
1749
1750 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1825 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1751 continue; /* Let kswapd poll it */ 1826 continue; /* Let kswapd poll it */
1752 } else {
1753 /*
1754 * Ignore cpuset limitation here. We just want to reduce
1755 * # of used pages by us regardless of memory shortage.
1756 */
1757 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1758 priority);
1759 } 1827 }
1760 1828
1761 shrink_zone(priority, zone, sc); 1829 shrink_zone(priority, zone, sc);
@@ -1787,10 +1855,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1787 bool all_unreclaimable; 1855 bool all_unreclaimable;
1788 unsigned long total_scanned = 0; 1856 unsigned long total_scanned = 0;
1789 struct reclaim_state *reclaim_state = current->reclaim_state; 1857 struct reclaim_state *reclaim_state = current->reclaim_state;
1790 unsigned long lru_pages = 0;
1791 struct zoneref *z; 1858 struct zoneref *z;
1792 struct zone *zone; 1859 struct zone *zone;
1793 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1794 unsigned long writeback_threshold; 1860 unsigned long writeback_threshold;
1795 1861
1796 get_mems_allowed(); 1862 get_mems_allowed();
@@ -1798,18 +1864,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1798 1864
1799 if (scanning_global_lru(sc)) 1865 if (scanning_global_lru(sc))
1800 count_vm_event(ALLOCSTALL); 1866 count_vm_event(ALLOCSTALL);
1801 /*
1802 * mem_cgroup will not do shrink_slab.
1803 */
1804 if (scanning_global_lru(sc)) {
1805 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1806
1807 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1808 continue;
1809
1810 lru_pages += zone_reclaimable_pages(zone);
1811 }
1812 }
1813 1867
1814 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1868 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1815 sc->nr_scanned = 0; 1869 sc->nr_scanned = 0;
@@ -1821,6 +1875,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1821 * over limit cgroups 1875 * over limit cgroups
1822 */ 1876 */
1823 if (scanning_global_lru(sc)) { 1877 if (scanning_global_lru(sc)) {
1878 unsigned long lru_pages = 0;
1879 for_each_zone_zonelist(zone, z, zonelist,
1880 gfp_zone(sc->gfp_mask)) {
1881 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1882 continue;
1883
1884 lru_pages += zone_reclaimable_pages(zone);
1885 }
1886
1824 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 1887 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1825 if (reclaim_state) { 1888 if (reclaim_state) {
1826 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 1889 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1861,17 +1924,6 @@ out:
1861 if (priority < 0) 1924 if (priority < 0)
1862 priority = 0; 1925 priority = 0;
1863 1926
1864 if (scanning_global_lru(sc)) {
1865 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1866
1867 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1868 continue;
1869
1870 zone->prev_priority = priority;
1871 }
1872 } else
1873 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1874
1875 delayacct_freepages_end(); 1927 delayacct_freepages_end();
1876 put_mems_allowed(); 1928 put_mems_allowed();
1877 1929
@@ -1888,6 +1940,7 @@ out:
1888unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 1940unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1889 gfp_t gfp_mask, nodemask_t *nodemask) 1941 gfp_t gfp_mask, nodemask_t *nodemask)
1890{ 1942{
1943 unsigned long nr_reclaimed;
1891 struct scan_control sc = { 1944 struct scan_control sc = {
1892 .gfp_mask = gfp_mask, 1945 .gfp_mask = gfp_mask,
1893 .may_writepage = !laptop_mode, 1946 .may_writepage = !laptop_mode,
@@ -1900,7 +1953,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1900 .nodemask = nodemask, 1953 .nodemask = nodemask,
1901 }; 1954 };
1902 1955
1903 return do_try_to_free_pages(zonelist, &sc); 1956 trace_mm_vmscan_direct_reclaim_begin(order,
1957 sc.may_writepage,
1958 gfp_mask);
1959
1960 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
1961
1962 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
1963
1964 return nr_reclaimed;
1904} 1965}
1905 1966
1906#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1967#ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -1925,6 +1986,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1925 sc.nodemask = &nm; 1986 sc.nodemask = &nm;
1926 sc.nr_reclaimed = 0; 1987 sc.nr_reclaimed = 0;
1927 sc.nr_scanned = 0; 1988 sc.nr_scanned = 0;
1989
1990 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
1991 sc.may_writepage,
1992 sc.gfp_mask);
1993
1928 /* 1994 /*
1929 * NOTE: Although we can get the priority field, using it 1995 * NOTE: Although we can get the priority field, using it
1930 * here is not a good idea, since it limits the pages we can scan. 1996 * here is not a good idea, since it limits the pages we can scan.
@@ -1933,6 +1999,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1933 * the priority and make it zero. 1999 * the priority and make it zero.
1934 */ 2000 */
1935 shrink_zone(0, zone, &sc); 2001 shrink_zone(0, zone, &sc);
2002
2003 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2004
1936 return sc.nr_reclaimed; 2005 return sc.nr_reclaimed;
1937} 2006}
1938 2007
@@ -1942,6 +2011,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1942 unsigned int swappiness) 2011 unsigned int swappiness)
1943{ 2012{
1944 struct zonelist *zonelist; 2013 struct zonelist *zonelist;
2014 unsigned long nr_reclaimed;
1945 struct scan_control sc = { 2015 struct scan_control sc = {
1946 .may_writepage = !laptop_mode, 2016 .may_writepage = !laptop_mode,
1947 .may_unmap = 1, 2017 .may_unmap = 1,
@@ -1956,7 +2026,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1956 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2026 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1957 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2027 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1958 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 2028 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
1959 return do_try_to_free_pages(zonelist, &sc); 2029
2030 trace_mm_vmscan_memcg_reclaim_begin(0,
2031 sc.may_writepage,
2032 sc.gfp_mask);
2033
2034 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2035
2036 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2037
2038 return nr_reclaimed;
1960} 2039}
1961#endif 2040#endif
1962 2041
@@ -2028,22 +2107,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
2028 .order = order, 2107 .order = order,
2029 .mem_cgroup = NULL, 2108 .mem_cgroup = NULL,
2030 }; 2109 };
2031 /*
2032 * temp_priority is used to remember the scanning priority at which
2033 * this zone was successfully refilled to
2034 * free_pages == high_wmark_pages(zone).
2035 */
2036 int temp_priority[MAX_NR_ZONES];
2037
2038loop_again: 2110loop_again:
2039 total_scanned = 0; 2111 total_scanned = 0;
2040 sc.nr_reclaimed = 0; 2112 sc.nr_reclaimed = 0;
2041 sc.may_writepage = !laptop_mode; 2113 sc.may_writepage = !laptop_mode;
2042 count_vm_event(PAGEOUTRUN); 2114 count_vm_event(PAGEOUTRUN);
2043 2115
2044 for (i = 0; i < pgdat->nr_zones; i++)
2045 temp_priority[i] = DEF_PRIORITY;
2046
2047 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2116 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2048 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2117 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2049 unsigned long lru_pages = 0; 2118 unsigned long lru_pages = 0;
@@ -2111,9 +2180,7 @@ loop_again:
2111 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2180 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2112 continue; 2181 continue;
2113 2182
2114 temp_priority[i] = priority;
2115 sc.nr_scanned = 0; 2183 sc.nr_scanned = 0;
2116 note_zone_scanning_priority(zone, priority);
2117 2184
2118 nid = pgdat->node_id; 2185 nid = pgdat->node_id;
2119 zid = zone_idx(zone); 2186 zid = zone_idx(zone);
@@ -2186,16 +2253,6 @@ loop_again:
2186 break; 2253 break;
2187 } 2254 }
2188out: 2255out:
2189 /*
2190 * Note within each zone the priority level at which this zone was
2191 * brought into a happy state. So that the next thread which scans this
2192 * zone will start out at that priority level.
2193 */
2194 for (i = 0; i < pgdat->nr_zones; i++) {
2195 struct zone *zone = pgdat->node_zones + i;
2196
2197 zone->prev_priority = temp_priority[i];
2198 }
2199 if (!all_zones_ok) { 2256 if (!all_zones_ok) {
2200 cond_resched(); 2257 cond_resched();
2201 2258
@@ -2299,9 +2356,10 @@ static int kswapd(void *p)
2299 * premature sleep. If not, then go fully 2356 * premature sleep. If not, then go fully
2300 * to sleep until explicitly woken up 2357 * to sleep until explicitly woken up
2301 */ 2358 */
2302 if (!sleeping_prematurely(pgdat, order, remaining)) 2359 if (!sleeping_prematurely(pgdat, order, remaining)) {
2360 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2303 schedule(); 2361 schedule();
2304 else { 2362 } else {
2305 if (remaining) 2363 if (remaining)
2306 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); 2364 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2307 else 2365 else
@@ -2321,8 +2379,10 @@ static int kswapd(void *p)
2321 * We can speed up thawing tasks if we don't call balance_pgdat 2379 * We can speed up thawing tasks if we don't call balance_pgdat
2322 * after returning from the refrigerator 2380 * after returning from the refrigerator
2323 */ 2381 */
2324 if (!ret) 2382 if (!ret) {
2383 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2325 balance_pgdat(pgdat, order); 2384 balance_pgdat(pgdat, order);
2385 }
2326 } 2386 }
2327 return 0; 2387 return 0;
2328} 2388}
@@ -2342,6 +2402,7 @@ void wakeup_kswapd(struct zone *zone, int order)
2342 return; 2402 return;
2343 if (pgdat->kswapd_max_order < order) 2403 if (pgdat->kswapd_max_order < order)
2344 pgdat->kswapd_max_order = order; 2404 pgdat->kswapd_max_order = order;
2405 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2345 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2406 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2346 return; 2407 return;
2347 if (!waitqueue_active(&pgdat->kswapd_wait)) 2408 if (!waitqueue_active(&pgdat->kswapd_wait))
@@ -2590,9 +2651,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2590 .swappiness = vm_swappiness, 2651 .swappiness = vm_swappiness,
2591 .order = order, 2652 .order = order,
2592 }; 2653 };
2593 unsigned long slab_reclaimable; 2654 unsigned long nr_slab_pages0, nr_slab_pages1;
2594 2655
2595 disable_swap_token();
2596 cond_resched(); 2656 cond_resched();
2597 /* 2657 /*
2598 * We need to be able to allocate from the reserves for RECLAIM_SWAP 2658 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -2611,14 +2671,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2611 */ 2671 */
2612 priority = ZONE_RECLAIM_PRIORITY; 2672 priority = ZONE_RECLAIM_PRIORITY;
2613 do { 2673 do {
2614 note_zone_scanning_priority(zone, priority);
2615 shrink_zone(priority, zone, &sc); 2674 shrink_zone(priority, zone, &sc);
2616 priority--; 2675 priority--;
2617 } while (priority >= 0 && sc.nr_reclaimed < nr_pages); 2676 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2618 } 2677 }
2619 2678
2620 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2679 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2621 if (slab_reclaimable > zone->min_slab_pages) { 2680 if (nr_slab_pages0 > zone->min_slab_pages) {
2622 /* 2681 /*
2623 * shrink_slab() does not currently allow us to determine how 2682 * shrink_slab() does not currently allow us to determine how
2624 * many pages were freed in this zone. So we take the current 2683 * many pages were freed in this zone. So we take the current
@@ -2629,17 +2688,27 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2629 * Note that shrink_slab will free memory on all zones and may 2688 * Note that shrink_slab will free memory on all zones and may
2630 * take a long time. 2689 * take a long time.
2631 */ 2690 */
2632 while (shrink_slab(sc.nr_scanned, gfp_mask, order) && 2691 for (;;) {
2633 zone_page_state(zone, NR_SLAB_RECLAIMABLE) > 2692 unsigned long lru_pages = zone_reclaimable_pages(zone);
2634 slab_reclaimable - nr_pages) 2693
2635 ; 2694 /* No reclaimable slab or very low memory pressure */
2695 if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
2696 break;
2697
2698 /* Freed enough memory */
2699 nr_slab_pages1 = zone_page_state(zone,
2700 NR_SLAB_RECLAIMABLE);
2701 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
2702 break;
2703 }
2636 2704
2637 /* 2705 /*
2638 * Update nr_reclaimed by the number of slab pages we 2706 * Update nr_reclaimed by the number of slab pages we
2639 * reclaimed from this zone. 2707 * reclaimed from this zone.
2640 */ 2708 */
2641 sc.nr_reclaimed += slab_reclaimable - 2709 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2642 zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2710 if (nr_slab_pages1 < nr_slab_pages0)
2711 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
2643 } 2712 }
2644 2713
2645 p->reclaim_state = NULL; 2714 p->reclaim_state = NULL;
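The new shrink_slab() loop in __zone_reclaim() above has two exit conditions: no further progress, or at least nr_pages of reclaimable slab freed since the loop started. A purely illustrative model of that control flow, with an invented fake_shrink_slab() helper and made-up page counts:

    #include <stdio.h>

    /* Stands in for NR_SLAB_RECLAIMABLE in this toy model. */
    static unsigned long slab_pages = 1000;

    static unsigned long fake_shrink_slab(void)
    {
            unsigned long freed = slab_pages >= 100 ? 100 : 0;

            slab_pages -= freed;
            return freed;                           /* 0 means "no progress" */
    }

    int main(void)
    {
            const unsigned long nr_pages = 256;     /* size of this reclaim request */
            unsigned long nr_slab_pages0 = slab_pages;

            for (;;) {
                    if (!fake_shrink_slab())
                            break;                  /* nothing more to reclaim */
                    if (slab_pages + nr_pages <= nr_slab_pages0)
                            break;                  /* freed enough for this request */
            }
            printf("freed %lu slab pages\n", nr_slab_pages0 - slab_pages);
            return 0;
    }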
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7759941d4e77..f389168f9a83 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -22,14 +22,14 @@
22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
23EXPORT_PER_CPU_SYMBOL(vm_event_states); 23EXPORT_PER_CPU_SYMBOL(vm_event_states);
24 24
25static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) 25static void sum_vm_events(unsigned long *ret)
26{ 26{
27 int cpu; 27 int cpu;
28 int i; 28 int i;
29 29
30 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 30 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
31 31
32 for_each_cpu(cpu, cpumask) { 32 for_each_online_cpu(cpu) {
33 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 33 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
34 34
35 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 35 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -45,7 +45,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
45void all_vm_events(unsigned long *ret) 45void all_vm_events(unsigned long *ret)
46{ 46{
47 get_online_cpus(); 47 get_online_cpus();
48 sum_vm_events(ret, cpu_online_mask); 48 sum_vm_events(ret);
49 put_online_cpus(); 49 put_online_cpus();
50} 50}
51EXPORT_SYMBOL_GPL(all_vm_events); 51EXPORT_SYMBOL_GPL(all_vm_events);
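With the cpumask argument gone, callers of all_vm_events() are unchanged; it still fills an array indexed by the VM event items, now summed over the online CPUs only. A hypothetical in-kernel caller, shown purely as a sketch (PGDEACTIVATE is one of the events bumped by the vmscan changes earlier in this diff):

    #include <linux/vmstat.h>
    #include <linux/kernel.h>

    /* Hypothetical caller: snapshot the global VM event counters. */
    static void report_vm_events(void)
    {
            unsigned long events[NR_VM_EVENT_ITEMS];

            all_vm_events(events);          /* sums per-CPU counters of online CPUs */
            pr_info("pgdeactivate so far: %lu\n", events[PGDEACTIVATE]);
    }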
@@ -853,11 +853,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
853 } 853 }
854 seq_printf(m, 854 seq_printf(m,
855 "\n all_unreclaimable: %u" 855 "\n all_unreclaimable: %u"
856 "\n prev_priority: %i"
857 "\n start_pfn: %lu" 856 "\n start_pfn: %lu"
858 "\n inactive_ratio: %u", 857 "\n inactive_ratio: %u",
859 zone->all_unreclaimable, 858 zone->all_unreclaimable,
860 zone->prev_priority,
861 zone->zone_start_pfn, 859 zone->zone_start_pfn,
862 zone->inactive_ratio); 860 zone->inactive_ratio);
863 seq_putc(m, '\n'); 861 seq_putc(m, '\n');